--- /dev/null
+Index: linux-2.6.12-rc6/fs/ext3/extents.c
+===================================================================
+--- linux-2.6.12-rc6.orig/fs/ext3/extents.c 2005-06-14 16:31:25.756503133 +0200
++++ linux-2.6.12-rc6/fs/ext3/extents.c 2005-06-14 16:31:25.836581257 +0200
+@@ -0,0 +1,2347 @@
++/*
++ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com
++ * Written by Alex Tomas <alex@clusterfs.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 as
++ * published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++ */
++
++/*
++ * Extents support for EXT3
++ *
++ * TODO:
++ * - ext3_ext_walk_space() should not use ext3_ext_find_extent()
++ * - ext3_ext_calc_credits() could take 'mergable' into account
++ * - ext3*_error() should be used in some situations
++ * - find_goal() [to be tested and improved]
++ * - smart tree reduction
++ * - arch-independence
++ * common on-disk format for big/little-endian arch
++ */
++
++#include <linux/module.h>
++#include <linux/fs.h>
++#include <linux/time.h>
++#include <linux/ext3_jbd.h>
++#include <linux/jbd.h>
++#include <linux/smp_lock.h>
++#include <linux/highuid.h>
++#include <linux/pagemap.h>
++#include <linux/quotaops.h>
++#include <linux/string.h>
++#include <linux/slab.h>
++#include <linux/ext3_extents.h>
++#include <asm/uaccess.h>
++
++
++static inline int ext3_ext_check_header(struct ext3_extent_header *eh)
++{
++ if (eh->eh_magic != EXT3_EXT_MAGIC) {
++ printk(KERN_ERR "EXT3-fs: invalid magic = 0x%x\n",
++ (unsigned)eh->eh_magic);
++ return -EIO;
++ }
++ if (eh->eh_max == 0) {
++ printk(KERN_ERR "EXT3-fs: invalid eh_max = %u\n",
++ (unsigned)eh->eh_max);
++ return -EIO;
++ }
++ if (eh->eh_entries > eh->eh_max) {
++ printk(KERN_ERR "EXT3-fs: invalid eh_entries = %u\n",
++ (unsigned)eh->eh_entries);
++ return -EIO;
++ }
++ return 0;
++}
++
++static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed)
++{
++ int err;
++
++ if (handle->h_buffer_credits > needed)
++ return handle;
++ if (!ext3_journal_extend(handle, needed))
++ return handle;
++ err = ext3_journal_restart(handle, needed);
++ if (err)
++ return ERR_PTR(err);
++
++ return handle;
++}
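++
++/*
++ * A minimal usage sketch for the helper above (the pattern used by the
++ * removal paths later in this file, e.g. ext3_ext_rm_leaf()):
++ *
++ *   handle = ext3_ext_journal_restart(handle, credits);
++ *   if (IS_ERR(handle))
++ *           return PTR_ERR(handle);
++ */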
++
++static inline int
++ext3_ext_get_access_for_root(handle_t *h, struct ext3_extents_tree *tree)
++{
++ if (tree->ops->get_write_access)
++ return tree->ops->get_write_access(h, tree->buffer);
++ else
++ return 0;
++}
++
++static inline int
++ext3_ext_mark_root_dirty(handle_t *h, struct ext3_extents_tree *tree)
++{
++ if (tree->ops->mark_buffer_dirty)
++ return tree->ops->mark_buffer_dirty(h, tree->buffer);
++ else
++ return 0;
++}
++
++/*
++ * could return:
++ * - EROFS
++ * - ENOMEM
++ */
++static int ext3_ext_get_access(handle_t *handle,
++ struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ int err;
++
++ if (path->p_bh) {
++ /* path points to block */
++ err = ext3_journal_get_write_access(handle, path->p_bh);
++ } else {
++ /* path points to leaf/index in inode body */
++ err = ext3_ext_get_access_for_root(handle, tree);
++ }
++ return err;
++}
++
++/*
++ * could return:
++ * - EROFS
++ * - ENOMEM
++ * - EIO
++ */
++static int ext3_ext_dirty(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ int err;
++ if (path->p_bh) {
++ /* path points to block */
++ err = ext3_journal_dirty_metadata(handle, path->p_bh);
++ } else {
++ /* path points to leaf/index in inode body */
++ err = ext3_ext_mark_root_dirty(handle, tree);
++ }
++ return err;
++}
++
++static inline int
++ext3_ext_new_block(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path, struct ext3_extent *ex,
++ int *err)
++{
++ int goal, depth, newblock;
++ struct inode *inode;
++
++ EXT_ASSERT(tree);
++ if (tree->ops->new_block)
++ return tree->ops->new_block(handle, tree, path, ex, err);
++
++ inode = tree->inode;
++ depth = EXT_DEPTH(tree);
++ if (path && depth > 0) {
++ goal = path[depth-1].p_block;
++ } else {
++ struct ext3_inode_info *ei = EXT3_I(inode);
++ unsigned long bg_start;
++ unsigned long colour;
++
++ bg_start = (ei->i_block_group *
++ EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
++ colour = (current->pid % 16) *
++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
++ goal = bg_start + colour;
++ }
++
++ newblock = ext3_new_block(handle, inode, goal, err);
++ return newblock;
++}
++
++static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree)
++{
++ struct ext3_extent_header *neh;
++ neh = EXT_ROOT_HDR(tree);
++ neh->eh_generation++;
++}
++
++static inline int ext3_ext_space_block(struct ext3_extents_tree *tree)
++{
++ int size;
++
++ size = (tree->inode->i_sb->s_blocksize -
++ sizeof(struct ext3_extent_header)) /
++ sizeof(struct ext3_extent);
++#ifdef AGRESSIVE_TEST
++ size = 6;
++#endif
++ return size;
++}
++
++static inline int ext3_ext_space_block_idx(struct ext3_extents_tree *tree)
++{
++ int size;
++
++ size = (tree->inode->i_sb->s_blocksize -
++ sizeof(struct ext3_extent_header)) /
++ sizeof(struct ext3_extent_idx);
++#ifdef AGRESSIVE_TEST
++ size = 5;
++#endif
++ return size;
++}
++
++static inline int ext3_ext_space_root(struct ext3_extents_tree *tree)
++{
++ int size;
++
++ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) /
++ sizeof(struct ext3_extent);
++#ifdef AGRESSIVE_TEST
++ size = 3;
++#endif
++ return size;
++}
++
++static inline int ext3_ext_space_root_idx(struct ext3_extents_tree *tree)
++{
++ int size;
++
++ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) /
++ sizeof(struct ext3_extent_idx);
++#ifdef AGRESSIVE_TEST
++ size = 4;
++#endif
++ return size;
++}
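++
++/*
++ * Illustrative sizing for the helpers above (a worked example, not
++ * compiled code, and ignoring the AGRESSIVE_TEST overrides): assuming
++ * the on-disk layout from ext3_extents.h where both ext3_extent_header
++ * and ext3_extent are 12 bytes, a 4096-byte block gives
++ * ext3_ext_space_block() = (4096 - 12) / 12 = 340 extents per block,
++ * while the 60-byte i_data root gives ext3_ext_space_root() =
++ * (60 - 12) / 12 = 4 extents in the inode body.
++ */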
++
++static void ext3_ext_show_path(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++#ifdef EXT_DEBUG
++ int k, l = path->p_depth;
++
++ ext_debug(tree, "path:");
++ for (k = 0; k <= l; k++, path++) {
++ if (path->p_idx) {
++ ext_debug(tree, " %d->%d", path->p_idx->ei_block,
++ path->p_idx->ei_leaf);
++ } else if (path->p_ext) {
++ ext_debug(tree, " %d:%d:%d",
++ path->p_ext->ee_block,
++ path->p_ext->ee_len,
++ path->p_ext->ee_start);
++ } else
++ ext_debug(tree, " []");
++ }
++ ext_debug(tree, "\n");
++#endif
++}
++
++static void ext3_ext_show_leaf(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++#ifdef EXT_DEBUG
++ int depth = EXT_DEPTH(tree);
++ struct ext3_extent_header *eh;
++ struct ext3_extent *ex;
++ int i;
++
++ if (!path)
++ return;
++
++ eh = path[depth].p_hdr;
++ ex = EXT_FIRST_EXTENT(eh);
++
++ for (i = 0; i < eh->eh_entries; i++, ex++) {
++ ext_debug(tree, "%d:%d:%d ",
++ ex->ee_block, ex->ee_len, ex->ee_start);
++ }
++ ext_debug(tree, "\n");
++#endif
++}
++
++static void ext3_ext_drop_refs(struct ext3_ext_path *path)
++{
++ int depth = path->p_depth;
++ int i;
++
++ for (i = 0; i <= depth; i++, path++) {
++ if (path->p_bh) {
++ brelse(path->p_bh);
++ path->p_bh = NULL;
++ }
++ }
++}
++
++/*
++ * binary search for closest index by given block
++ */
++static inline void
++ext3_ext_binsearch_idx(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path, int block)
++{
++ struct ext3_extent_header *eh = path->p_hdr;
++ struct ext3_extent_idx *ix;
++ int l = 0, k, r;
++
++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
++ EXT_ASSERT(eh->eh_entries <= eh->eh_max);
++ EXT_ASSERT(eh->eh_entries > 0);
++
++ ext_debug(tree, "binsearch for %d(idx): ", block);
++
++ path->p_idx = ix = EXT_FIRST_INDEX(eh);
++
++ r = k = eh->eh_entries;
++ while (k > 1) {
++ k = (r - l) / 2;
++ if (block < ix[l + k].ei_block)
++ r -= k;
++ else
++ l += k;
++ ext_debug(tree, "%d:%d:%d ", k, l, r);
++ }
++
++ ix += l;
++ path->p_idx = ix;
++ ext_debug(tree," -> %d->%d ",path->p_idx->ei_block,path->p_idx->ei_leaf);
++
++ while (l++ < r) {
++ if (block < ix->ei_block)
++ break;
++ path->p_idx = ix++;
++ }
++ ext_debug(tree, " -> %d->%d\n", path->p_idx->ei_block,
++ path->p_idx->ei_leaf);
++
++#ifdef CHECK_BINSEARCH
++ {
++ struct ext3_extent_idx *chix;
++
++ chix = ix = EXT_FIRST_INDEX(eh);
++ for (k = 0; k < eh->eh_entries; k++, ix++) {
++ if (k != 0 && ix->ei_block <= ix[-1].ei_block) {
++ printk("k=%d, ix=0x%p, first=0x%p\n", k,
++ ix, EXT_FIRST_INDEX(eh));
++ printk("%u <= %u\n",
++ ix->ei_block, ix[-1].ei_block);
++ }
++ EXT_ASSERT(k == 0 || ix->ei_block > ix[-1].ei_block);
++ if (block < ix->ei_block)
++ break;
++ chix = ix;
++ }
++ EXT_ASSERT(chix == path->p_idx);
++ }
++#endif
++}
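++
++/*
++ * A worked trace of the search above (illustrative): with
++ * eh_entries = 6 and ei_block keys {10,20,30,40,50,60}, a lookup for
++ * block 35 shrinks [l;r) from [0;6) to [0;3) (35 < 40), then to
++ * [1;3) (35 >= 20); the final linear scan over ix[1]..ix[2] leaves
++ * p_idx on the key-30 entry, the closest index at or below block 35.
++ */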
++
++/*
++ * binary search for closest extent by given block
++ */
++static inline void
++ext3_ext_binsearch(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path, int block)
++{
++ struct ext3_extent_header *eh = path->p_hdr;
++ struct ext3_extent *ex;
++ int l = 0, k, r;
++
++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
++ EXT_ASSERT(eh->eh_entries <= eh->eh_max);
++
++ if (eh->eh_entries == 0) {
++ /*
++ * this leaf is still empty:
++ * we get such a leaf in the split/add case
++ */
++ return;
++ }
++
++ ext_debug(tree, "binsearch for %d: ", block);
++
++ path->p_ext = ex = EXT_FIRST_EXTENT(eh);
++
++ r = k = eh->eh_entries;
++ while (k > 1) {
++ k = (r - l) / 2;
++ if (block < ex[l + k].ee_block)
++ r -= k;
++ else
++ l += k;
++ ext_debug(tree, "%d:%d:%d ", k, l, r);
++ }
++
++ ex += l;
++ path->p_ext = ex;
++ ext_debug(tree, " -> %d:%d:%d ", path->p_ext->ee_block,
++ path->p_ext->ee_start, path->p_ext->ee_len);
++
++ while (l++ < r) {
++ if (block < ex->ee_block)
++ break;
++ path->p_ext = ex++;
++ }
++ ext_debug(tree, " -> %d:%d:%d\n", path->p_ext->ee_block,
++ path->p_ext->ee_start, path->p_ext->ee_len);
++
++#ifdef CHECK_BINSEARCH
++ {
++ struct ext3_extent *chex;
++
++ chex = ex = EXT_FIRST_EXTENT(eh);
++ for (k = 0; k < eh->eh_entries; k++, ex++) {
++ EXT_ASSERT(k == 0 || ex->ee_block > ex[-1].ee_block);
++ if (block < ex->ee_block)
++ break;
++ chex = ex;
++ }
++ EXT_ASSERT(chex == path->p_ext);
++ }
++#endif
++}
++
++int ext3_extent_tree_init(handle_t *handle, struct ext3_extents_tree *tree)
++{
++ struct ext3_extent_header *eh;
++
++ BUG_ON(tree->buffer_len == 0);
++ ext3_ext_get_access_for_root(handle, tree);
++ eh = EXT_ROOT_HDR(tree);
++ eh->eh_depth = 0;
++ eh->eh_entries = 0;
++ eh->eh_magic = EXT3_EXT_MAGIC;
++ eh->eh_max = ext3_ext_space_root(tree);
++ ext3_ext_mark_root_dirty(handle, tree);
++ ext3_ext_invalidate_cache(tree);
++ return 0;
++}
++
++struct ext3_ext_path *
++ext3_ext_find_extent(struct ext3_extents_tree *tree, int block,
++ struct ext3_ext_path *path)
++{
++ struct ext3_extent_header *eh;
++ struct buffer_head *bh;
++ int depth, i, ppos = 0;
++
++ EXT_ASSERT(tree);
++ EXT_ASSERT(tree->inode);
++ EXT_ASSERT(tree->root);
++
++ eh = EXT_ROOT_HDR(tree);
++ EXT_ASSERT(eh);
++ if (ext3_ext_check_header(eh))
++ goto err;
++
++ i = depth = EXT_DEPTH(tree);
++ EXT_ASSERT(eh->eh_max);
++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
++
++ /* account possible depth increase */
++ if (!path) {
++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2),
++ GFP_NOFS);
++ if (!path)
++ return ERR_PTR(-ENOMEM);
++ }
++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1));
++ path[0].p_hdr = eh;
++
++ /* walk through the tree */
++ while (i) {
++ ext_debug(tree, "depth %d: num %d, max %d\n",
++ ppos, eh->eh_entries, eh->eh_max);
++ ext3_ext_binsearch_idx(tree, path + ppos, block);
++ path[ppos].p_block = path[ppos].p_idx->ei_leaf;
++ path[ppos].p_depth = i;
++ path[ppos].p_ext = NULL;
++
++ bh = sb_bread(tree->inode->i_sb, path[ppos].p_block);
++ if (!bh)
++ goto err;
++
++ eh = EXT_BLOCK_HDR(bh);
++ ppos++;
++ EXT_ASSERT(ppos <= depth);
++ path[ppos].p_bh = bh;
++ path[ppos].p_hdr = eh;
++ i--;
++
++ if (ext3_ext_check_header(eh))
++ goto err;
++ }
++
++ path[ppos].p_depth = i;
++ path[ppos].p_hdr = eh;
++ path[ppos].p_ext = NULL;
++ path[ppos].p_idx = NULL;
++
++ if (ext3_ext_check_header(eh))
++ goto err;
++
++ /* find extent */
++ ext3_ext_binsearch(tree, path + ppos, block);
++
++ ext3_ext_show_path(tree, path);
++
++ return path;
++
++err:
++ printk(KERN_ERR "EXT3-fs: header is corrupted!\n");
++ ext3_ext_drop_refs(path);
++ kfree(path);
++ return ERR_PTR(-EIO);
++}
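++
++/*
++ * A minimal usage sketch for the lookup above, mirroring the callers
++ * in this file; the caller owns the returned path array and must drop
++ * the buffer references and free it when done:
++ *
++ *   path = ext3_ext_find_extent(tree, block, NULL);
++ *   if (IS_ERR(path))
++ *           return PTR_ERR(path);
++ *   ex = path[EXT_DEPTH(tree)].p_ext;  (closest extent, may be NULL)
++ *   ...
++ *   ext3_ext_drop_refs(path);
++ *   kfree(path);
++ */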
++
++/*
++ * insert new index [logical;ptr] into the block at curp;
++ * it checks where to insert: before curp or after curp
++ */
++static int ext3_ext_insert_index(handle_t *handle,
++ struct ext3_extents_tree *tree,
++ struct ext3_ext_path *curp,
++ int logical, int ptr)
++{
++ struct ext3_extent_idx *ix;
++ int len, err;
++
++ if ((err = ext3_ext_get_access(handle, tree, curp)))
++ return err;
++
++ EXT_ASSERT(logical != curp->p_idx->ei_block);
++ len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
++ if (logical > curp->p_idx->ei_block) {
++ /* insert after */
++ if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
++ len = (len - 1) * sizeof(struct ext3_extent_idx);
++ len = len < 0 ? 0 : len;
++ ext_debug(tree, "insert new index %d after: %d. "
++ "move %d from 0x%p to 0x%p\n",
++ logical, ptr, len,
++ (curp->p_idx + 1), (curp->p_idx + 2));
++ memmove(curp->p_idx + 2, curp->p_idx + 1, len);
++ }
++ ix = curp->p_idx + 1;
++ } else {
++ /* insert before */
++ len = len * sizeof(struct ext3_extent_idx);
++ len = len < 0 ? 0 : len;
++ ext_debug(tree, "insert new index %d before: %d. "
++ "move %d from 0x%p to 0x%p\n",
++ logical, ptr, len,
++ curp->p_idx, (curp->p_idx + 1));
++ memmove(curp->p_idx + 1, curp->p_idx, len);
++ ix = curp->p_idx;
++ }
++
++ ix->ei_block = logical;
++ ix->ei_leaf = ptr;
++ curp->p_hdr->eh_entries++;
++
++ EXT_ASSERT(curp->p_hdr->eh_entries <= curp->p_hdr->eh_max);
++ EXT_ASSERT(ix <= EXT_LAST_INDEX(curp->p_hdr));
++
++ err = ext3_ext_dirty(handle, tree, curp);
++ ext3_std_error(tree->inode->i_sb, err);
++
++ return err;
++}
++
++/*
++ * routine inserts a new subtree into the path, using the free index entry
++ * at depth 'at':
++ * - allocates all needed blocks (new leaf and all intermediate index blocks)
++ * - makes decision where to split
++ * - moves remaining extents and index entries (right to the split point)
++ * into the newly allocated blocks
++ * - initializes the subtree
++ */
++static int ext3_ext_split(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *newext, int at)
++{
++ struct buffer_head *bh = NULL;
++ int depth = EXT_DEPTH(tree);
++ struct ext3_extent_header *neh;
++ struct ext3_extent_idx *fidx;
++ struct ext3_extent *ex;
++ int i = at, k, m, a;
++ unsigned long newblock, oldblock, border;
++ int *ablocks = NULL; /* array of allocated blocks */
++ int err = 0;
++
++ /* make decision: where to split? */
++ /* FIXME: now the decision is the simplest: split at the current extent */
++
++ /* if the current leaf will be split, then we should use
++ * the border from the split point */
++ EXT_ASSERT(path[depth].p_ext <= EXT_MAX_EXTENT(path[depth].p_hdr));
++ if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
++ border = path[depth].p_ext[1].ee_block;
++ ext_debug(tree, "leaf will be splitted."
++ " next leaf starts at %d\n",
++ (int)border);
++ } else {
++ border = newext->ee_block;
++ ext_debug(tree, "leaf will be added."
++ " next leaf starts at %d\n",
++ (int)border);
++ }
++
++ /*
++ * if error occurs, then we break processing
++ * and turn filesystem read-only. so, index won't
++ * be inserted and tree will be in consistent
++ * state. next mount will repair buffers too
++ */
++
++ /*
++ * get an array to track all allocated blocks;
++ * we need this to handle errors and free the blocks
++ * on failure
++ */
++ ablocks = kmalloc(sizeof(unsigned long) * depth, GFP_NOFS);
++ if (!ablocks)
++ return -ENOMEM;
++ memset(ablocks, 0, sizeof(unsigned long) * depth);
++
++ /* allocate all needed blocks */
++ ext_debug(tree, "allocate %d blocks for indexes/leaf\n", depth - at);
++ for (a = 0; a < depth - at; a++) {
++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err);
++ if (newblock == 0)
++ goto cleanup;
++ ablocks[a] = newblock;
++ }
++
++ /* initialize new leaf */
++ newblock = ablocks[--a];
++ EXT_ASSERT(newblock);
++ bh = sb_getblk(tree->inode->i_sb, newblock);
++ if (!bh) {
++ err = -EIO;
++ goto cleanup;
++ }
++ lock_buffer(bh);
++
++ if ((err = ext3_journal_get_create_access(handle, bh)))
++ goto cleanup;
++
++ neh = EXT_BLOCK_HDR(bh);
++ neh->eh_entries = 0;
++ neh->eh_max = ext3_ext_space_block(tree);
++ neh->eh_magic = EXT3_EXT_MAGIC;
++ neh->eh_depth = 0;
++ ex = EXT_FIRST_EXTENT(neh);
++
++ /* move the remainder of path[depth] to the new leaf */
++ EXT_ASSERT(path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max);
++ /* start copy from next extent */
++ /* TODO: we could do it by single memmove */
++ m = 0;
++ path[depth].p_ext++;
++ while (path[depth].p_ext <=
++ EXT_MAX_EXTENT(path[depth].p_hdr)) {
++ ext_debug(tree, "move %d:%d:%d in new leaf %lu\n",
++ path[depth].p_ext->ee_block,
++ path[depth].p_ext->ee_start,
++ path[depth].p_ext->ee_len,
++ newblock);
++ memmove(ex++, path[depth].p_ext++, sizeof(struct ext3_extent));
++ neh->eh_entries++;
++ m++;
++ }
++ set_buffer_uptodate(bh);
++ unlock_buffer(bh);
++
++ if ((err = ext3_journal_dirty_metadata(handle, bh)))
++ goto cleanup;
++ brelse(bh);
++ bh = NULL;
++
++ /* correct old leaf */
++ if (m) {
++ if ((err = ext3_ext_get_access(handle, tree, path + depth)))
++ goto cleanup;
++ path[depth].p_hdr->eh_entries -= m;
++ if ((err = ext3_ext_dirty(handle, tree, path + depth)))
++ goto cleanup;
++ }
++
++ /* create intermediate indexes */
++ k = depth - at - 1;
++ EXT_ASSERT(k >= 0);
++ if (k)
++ ext_debug(tree, "create %d intermediate indices\n", k);
++ /* insert new index into current index block */
++ /* current depth stored in i var */
++ i = depth - 1;
++ while (k--) {
++ oldblock = newblock;
++ newblock = ablocks[--a];
++ bh = sb_getblk(tree->inode->i_sb, newblock);
++ if (!bh) {
++ err = -EIO;
++ goto cleanup;
++ }
++ lock_buffer(bh);
++
++ if ((err = ext3_journal_get_create_access(handle, bh)))
++ goto cleanup;
++
++ neh = EXT_BLOCK_HDR(bh);
++ neh->eh_entries = 1;
++ neh->eh_magic = EXT3_EXT_MAGIC;
++ neh->eh_max = ext3_ext_space_block_idx(tree);
++ neh->eh_depth = depth - i;
++ fidx = EXT_FIRST_INDEX(neh);
++ fidx->ei_block = border;
++ fidx->ei_leaf = oldblock;
++
++ ext_debug(tree, "int.index at %d (block %lu): %lu -> %lu\n",
++ i, newblock, border, oldblock);
++ /* copy indexes */
++ m = 0;
++ path[i].p_idx++;
++
++ ext_debug(tree, "cur 0x%p, last 0x%p\n", path[i].p_idx,
++ EXT_MAX_INDEX(path[i].p_hdr));
++ EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) ==
++ EXT_LAST_INDEX(path[i].p_hdr));
++ while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
++ ext_debug(tree, "%d: move %d:%d in new index %lu\n",
++ i, path[i].p_idx->ei_block,
++ path[i].p_idx->ei_leaf, newblock);
++ memmove(++fidx, path[i].p_idx++,
++ sizeof(struct ext3_extent_idx));
++ neh->eh_entries++;
++ EXT_ASSERT(neh->eh_entries <= neh->eh_max);
++ m++;
++ }
++ set_buffer_uptodate(bh);
++ unlock_buffer(bh);
++
++ if ((err = ext3_journal_dirty_metadata(handle, bh)))
++ goto cleanup;
++ brelse(bh);
++ bh = NULL;
++
++ /* correct old index */
++ if (m) {
++ err = ext3_ext_get_access(handle, tree, path + i);
++ if (err)
++ goto cleanup;
++ path[i].p_hdr->eh_entries -= m;
++ err = ext3_ext_dirty(handle, tree, path + i);
++ if (err)
++ goto cleanup;
++ }
++
++ i--;
++ }
++
++ /* insert new index */
++ if (!err)
++ err = ext3_ext_insert_index(handle, tree, path + at,
++ border, newblock);
++
++cleanup:
++ if (bh) {
++ if (buffer_locked(bh))
++ unlock_buffer(bh);
++ brelse(bh);
++ }
++
++ if (err) {
++ /* free all allocated blocks in error case */
++ for (i = 0; i < depth; i++) {
++ if (!ablocks[i])
++ continue;
++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1);
++ }
++ }
++ kfree(ablocks);
++
++ return err;
++}
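++
++/*
++ * A worked example of the split above (illustrative): for a depth-1
++ * tree whose only leaf is full and whose root index has a free slot
++ * (at == 0), ext3_ext_split() allocates depth - at = 1 new leaf block;
++ * with path[1].p_ext on the 3rd of 4 extents, the 4th extent moves to
++ * the new leaf, border becomes its ee_block, the old leaf shrinks to
++ * 3 entries, and ext3_ext_insert_index() links [border;newblock] into
++ * the root.
++ */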
++
++/*
++ * routine implements tree growing procedure:
++ * - allocates new block
++ * - moves top-level data (index block or leaf) into the new block
++ * - initializes new top-level, creating index that points to the
++ * just created block
++ */
++static int ext3_ext_grow_indepth(handle_t *handle,
++ struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *newext)
++{
++ struct ext3_ext_path *curp = path;
++ struct ext3_extent_header *neh;
++ struct ext3_extent_idx *fidx;
++ struct buffer_head *bh;
++ unsigned long newblock;
++ int err = 0;
++
++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err);
++ if (newblock == 0)
++ return err;
++
++ bh = sb_getblk(tree->inode->i_sb, newblock);
++ if (!bh) {
++ err = -EIO;
++ ext3_std_error(tree->inode->i_sb, err);
++ return err;
++ }
++ lock_buffer(bh);
++
++ if ((err = ext3_journal_get_create_access(handle, bh))) {
++ unlock_buffer(bh);
++ goto out;
++ }
++
++ /* move top-level index/leaf into new block */
++ memmove(bh->b_data, curp->p_hdr, tree->buffer_len);
++
++ /* set size of new block */
++ neh = EXT_BLOCK_HDR(bh);
++ /* old root could have indexes or leaves,
++ * so calculate eh_max the right way */
++ if (EXT_DEPTH(tree))
++ neh->eh_max = ext3_ext_space_block_idx(tree);
++ else
++ neh->eh_max = ext3_ext_space_block(tree);
++ neh->eh_magic = EXT3_EXT_MAGIC;
++ set_buffer_uptodate(bh);
++ unlock_buffer(bh);
++
++ if ((err = ext3_journal_dirty_metadata(handle, bh)))
++ goto out;
++
++ /* create index in new top-level index: num,max,pointer */
++ if ((err = ext3_ext_get_access(handle, tree, curp)))
++ goto out;
++
++ curp->p_hdr->eh_magic = EXT3_EXT_MAGIC;
++ curp->p_hdr->eh_max = ext3_ext_space_root_idx(tree);
++ curp->p_hdr->eh_entries = 1;
++ curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
++ /* FIXME: it works, but actually path[0] can be index */
++ curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
++ curp->p_idx->ei_leaf = newblock;
++
++ neh = EXT_ROOT_HDR(tree);
++ fidx = EXT_FIRST_INDEX(neh);
++ ext_debug(tree, "new root: num %d(%d), lblock %d, ptr %d\n",
++ neh->eh_entries, neh->eh_max, fidx->ei_block, fidx->ei_leaf);
++
++ neh->eh_depth = path->p_depth + 1;
++ err = ext3_ext_dirty(handle, tree, curp);
++out:
++ brelse(bh);
++
++ return err;
++}
++
++/*
++ * routine finds an empty index and adds a new leaf. if no free index
++ * is found, then it requests in-depth growing
++ */
++static int ext3_ext_create_new_leaf(handle_t *handle,
++ struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *newext)
++{
++ struct ext3_ext_path *curp;
++ int depth, i, err = 0;
++
++repeat:
++ i = depth = EXT_DEPTH(tree);
++
++ /* walk up the tree looking for a free index entry */
++ curp = path + depth;
++ while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
++ i--;
++ curp--;
++ }
++
++ /* we use the already allocated block for the index block,
++ * so subsequent data blocks should be contiguous */
++ if (EXT_HAS_FREE_INDEX(curp)) {
++ /* if we found index with free entry, then use that
++ * entry: create all needed subtree and add new leaf */
++ err = ext3_ext_split(handle, tree, path, newext, i);
++
++ /* refill path */
++ ext3_ext_drop_refs(path);
++ path = ext3_ext_find_extent(tree, newext->ee_block, path);
++ if (IS_ERR(path))
++ err = PTR_ERR(path);
++ } else {
++ /* tree is full, time to grow in depth */
++ err = ext3_ext_grow_indepth(handle, tree, path, newext);
++
++ /* refill path */
++ ext3_ext_drop_refs(path);
++ path = ext3_ext_find_extent(tree, newext->ee_block, path);
++ if (IS_ERR(path))
++ err = PTR_ERR(path);
++
++ /*
++ * only the first grow (depth 0 -> 1) produces free space;
++ * in all other cases we have to split the grown tree
++ */
++ depth = EXT_DEPTH(tree);
++ if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
++ /* now we need to split */
++ goto repeat;
++ }
++ }
++
++ return err;
++}
++
++/*
++ * returns allocated block in subsequent extent or EXT_MAX_BLOCK
++ * NOTE: it considers the block number from the index entry as an
++ * allocated block. thus, index entries have to be consistent
++ * with the leaves
++ */
++static unsigned long
++ext3_ext_next_allocated_block(struct ext3_ext_path *path)
++{
++ int depth;
++
++ EXT_ASSERT(path != NULL);
++ depth = path->p_depth;
++
++ if (depth == 0 && path->p_ext == NULL)
++ return EXT_MAX_BLOCK;
++
++ /* FIXME: what if index isn't full ?! */
++ while (depth >= 0) {
++ if (depth == path->p_depth) {
++ /* leaf */
++ if (path[depth].p_ext !=
++ EXT_LAST_EXTENT(path[depth].p_hdr))
++ return path[depth].p_ext[1].ee_block;
++ } else {
++ /* index */
++ if (path[depth].p_idx !=
++ EXT_LAST_INDEX(path[depth].p_hdr))
++ return path[depth].p_idx[1].ei_block;
++ }
++ depth--;
++ }
++
++ return EXT_MAX_BLOCK;
++}
++
++/*
++ * returns first allocated block from next leaf or EXT_MAX_BLOCK
++ */
++static unsigned ext3_ext_next_leaf_block(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ int depth;
++
++ EXT_ASSERT(path != NULL);
++ depth = path->p_depth;
++
++ /* a zero-depth tree has no leaf blocks at all */
++ if (depth == 0)
++ return EXT_MAX_BLOCK;
++
++ /* go to index block */
++ depth--;
++
++ while (depth >= 0) {
++ if (path[depth].p_idx !=
++ EXT_LAST_INDEX(path[depth].p_hdr))
++ return path[depth].p_idx[1].ei_block;
++ depth--;
++ }
++
++ return EXT_MAX_BLOCK;
++}
++
++/*
++ * if the leaf gets modified and the modified extent is first in the leaf,
++ * then we have to correct all the indexes above
++ * TODO: do we need to correct tree in all cases?
++ */
++int ext3_ext_correct_indexes(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ struct ext3_extent_header *eh;
++ int depth = EXT_DEPTH(tree);
++ struct ext3_extent *ex;
++ unsigned long border;
++ int k, err = 0;
++
++ eh = path[depth].p_hdr;
++ ex = path[depth].p_ext;
++ EXT_ASSERT(ex);
++ EXT_ASSERT(eh);
++
++ if (depth == 0) {
++ /* there is no tree at all */
++ return 0;
++ }
++
++ if (ex != EXT_FIRST_EXTENT(eh)) {
++ /* we correct the tree only if the first extent in the leaf was modified */
++ return 0;
++ }
++
++ /*
++ * TODO: we need correction if border is smaller than the current one
++ */
++ k = depth - 1;
++ border = path[depth].p_ext->ee_block;
++ if ((err = ext3_ext_get_access(handle, tree, path + k)))
++ return err;
++ path[k].p_idx->ei_block = border;
++ if ((err = ext3_ext_dirty(handle, tree, path + k)))
++ return err;
++
++ while (k--) {
++ /* change all left-side indexes */
++ if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
++ break;
++ if ((err = ext3_ext_get_access(handle, tree, path + k)))
++ break;
++ path[k].p_idx->ei_block = border;
++ if ((err = ext3_ext_dirty(handle, tree, path + k)))
++ break;
++ }
++
++ return err;
++}
++
++static inline int
++ext3_can_extents_be_merged(struct ext3_extents_tree *tree,
++ struct ext3_extent *ex1,
++ struct ext3_extent *ex2)
++{
++ if (ex1->ee_block + ex1->ee_len != ex2->ee_block)
++ return 0;
++
++#ifdef AGRESSIVE_TEST
++ if (ex1->ee_len >= 4)
++ return 0;
++#endif
++
++ if (!tree->ops->mergable)
++ return 1;
++
++ return tree->ops->mergable(ex1, ex2);
++}
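++
++/*
++ * A worked example (illustrative): ex1 = {ee_block 100, ee_len 5,
++ * ee_start 500} and ex2 = {ee_block 105, ee_len 3, ee_start 505} pass
++ * the logical adjacency test above (100 + 5 == 105); the blockmap
++ * helper ext3_ext_mergable() below accepts them too (500 + 5 == 505),
++ * so the pair may merge into {ee_block 100, ee_len 8, ee_start 500}.
++ */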
++
++/*
++ * this routine tries to merge the requested extent into the existing
++ * extent or inserts requested extent as new one into the tree,
++ * creating new leaf in no-space case
++ */
++int ext3_ext_insert_extent(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *newext)
++{
++ struct ext3_extent_header * eh;
++ struct ext3_extent *ex, *fex;
++ struct ext3_extent *nearex; /* nearest extent */
++ struct ext3_ext_path *npath = NULL;
++ int depth, len, err, next;
++
++ EXT_ASSERT(newext->ee_len > 0);
++ depth = EXT_DEPTH(tree);
++ ex = path[depth].p_ext;
++ EXT_ASSERT(path[depth].p_hdr);
++
++ /* try to insert block into found extent and return */
++ if (ex && ext3_can_extents_be_merged(tree, ex, newext)) {
++ ext_debug(tree, "append %d block to %d:%d (from %d)\n",
++ newext->ee_len, ex->ee_block, ex->ee_len,
++ ex->ee_start);
++ if ((err = ext3_ext_get_access(handle, tree, path + depth)))
++ return err;
++ ex->ee_len += newext->ee_len;
++ eh = path[depth].p_hdr;
++ nearex = ex;
++ goto merge;
++ }
++
++repeat:
++ depth = EXT_DEPTH(tree);
++ eh = path[depth].p_hdr;
++ if (eh->eh_entries < eh->eh_max)
++ goto has_space;
++
++ /* probably next leaf has space for us? */
++ fex = EXT_LAST_EXTENT(eh);
++ next = ext3_ext_next_leaf_block(tree, path);
++ if (newext->ee_block > fex->ee_block && next != EXT_MAX_BLOCK) {
++ ext_debug(tree, "next leaf block - %d\n", next);
++ EXT_ASSERT(!npath);
++ npath = ext3_ext_find_extent(tree, next, NULL);
++ if (IS_ERR(npath))
++ return PTR_ERR(npath);
++ EXT_ASSERT(npath->p_depth == path->p_depth);
++ eh = npath[depth].p_hdr;
++ if (eh->eh_entries < eh->eh_max) {
++ ext_debug(tree, "next leaf isnt full(%d)\n",
++ eh->eh_entries);
++ path = npath;
++ goto repeat;
++ }
++ ext_debug(tree, "next leaf hasno free space(%d,%d)\n",
++ eh->eh_entries, eh->eh_max);
++ }
++
++ /*
++ * there is no free space in found leaf
++ * we're gonna add new leaf in the tree
++ */
++ err = ext3_ext_create_new_leaf(handle, tree, path, newext);
++ if (err)
++ goto cleanup;
++ depth = EXT_DEPTH(tree);
++ eh = path[depth].p_hdr;
++
++has_space:
++ nearex = path[depth].p_ext;
++
++ if ((err = ext3_ext_get_access(handle, tree, path + depth)))
++ goto cleanup;
++
++ if (!nearex) {
++ /* there is no extent in this leaf, create first one */
++ ext_debug(tree, "first extent in the leaf: %d:%d:%d\n",
++ newext->ee_block, newext->ee_start,
++ newext->ee_len);
++ path[depth].p_ext = EXT_FIRST_EXTENT(eh);
++ } else if (newext->ee_block > nearex->ee_block) {
++ EXT_ASSERT(newext->ee_block != nearex->ee_block);
++ if (nearex != EXT_LAST_EXTENT(eh)) {
++ len = EXT_MAX_EXTENT(eh) - nearex;
++ len = (len - 1) * sizeof(struct ext3_extent);
++ len = len < 0 ? 0 : len;
++ ext_debug(tree, "insert %d:%d:%d after: nearest 0x%p, "
++ "move %d from 0x%p to 0x%p\n",
++ newext->ee_block, newext->ee_start,
++ newext->ee_len,
++ nearex, len, nearex + 1, nearex + 2);
++ memmove(nearex + 2, nearex + 1, len);
++ }
++ path[depth].p_ext = nearex + 1;
++ } else {
++ EXT_ASSERT(newext->ee_block != nearex->ee_block);
++ len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent);
++ len = len < 0 ? 0 : len;
++ ext_debug(tree, "insert %d:%d:%d before: nearest 0x%p, "
++ "move %d from 0x%p to 0x%p\n",
++ newext->ee_block, newext->ee_start, newext->ee_len,
++ nearex, len, nearex + 1, nearex + 2);
++ memmove(nearex + 1, nearex, len);
++ path[depth].p_ext = nearex;
++ }
++
++ eh->eh_entries++;
++ nearex = path[depth].p_ext;
++ nearex->ee_block = newext->ee_block;
++ nearex->ee_start = newext->ee_start;
++ nearex->ee_len = newext->ee_len;
++ /* FIXME: support for large fs */
++ nearex->ee_start_hi = 0;
++
++merge:
++ /* try to merge extents to the right */
++ while (nearex < EXT_LAST_EXTENT(eh)) {
++ if (!ext3_can_extents_be_merged(tree, nearex, nearex + 1))
++ break;
++ /* merge with next extent! */
++ nearex->ee_len += nearex[1].ee_len;
++ if (nearex + 1 < EXT_LAST_EXTENT(eh)) {
++ len = (EXT_LAST_EXTENT(eh) - nearex - 1) *
++ sizeof(struct ext3_extent);
++ memmove(nearex + 1, nearex + 2, len);
++ }
++ eh->eh_entries--;
++ EXT_ASSERT(eh->eh_entries > 0);
++ }
++
++ /* try to merge extents to the left */
++
++ /* time to correct all indexes above */
++ err = ext3_ext_correct_indexes(handle, tree, path);
++ if (err)
++ goto cleanup;
++
++ err = ext3_ext_dirty(handle, tree, path + depth);
++
++cleanup:
++ if (npath) {
++ ext3_ext_drop_refs(npath);
++ kfree(npath);
++ }
++ ext3_ext_tree_changed(tree);
++ ext3_ext_invalidate_cache(tree);
++ return err;
++}
++
++int ext3_ext_walk_space(struct ext3_extents_tree *tree, unsigned long block,
++ unsigned long num, ext_prepare_callback func)
++{
++ struct ext3_ext_path *path = NULL;
++ struct ext3_ext_cache cbex;
++ struct ext3_extent *ex;
++ unsigned long next, start = 0, end = 0;
++ unsigned long last = block + num;
++ int depth, exists, err = 0;
++
++ EXT_ASSERT(tree);
++ EXT_ASSERT(func);
++ EXT_ASSERT(tree->inode);
++ EXT_ASSERT(tree->root);
++
++ while (block < last && block != EXT_MAX_BLOCK) {
++ num = last - block;
++ /* find extent for this block */
++ path = ext3_ext_find_extent(tree, block, path);
++ if (IS_ERR(path)) {
++ err = PTR_ERR(path);
++ path = NULL;
++ break;
++ }
++
++ depth = EXT_DEPTH(tree);
++ EXT_ASSERT(path[depth].p_hdr);
++ ex = path[depth].p_ext;
++ next = ext3_ext_next_allocated_block(path);
++
++ exists = 0;
++ if (!ex) {
++ /* there is no extent yet, so try to allocate
++ * all requested space */
++ start = block;
++ end = block + num;
++ } else if (ex->ee_block > block) {
++ /* need to allocate space before found extent */
++ start = block;
++ end = ex->ee_block;
++ if (block + num < end)
++ end = block + num;
++ } else if (block >= ex->ee_block + ex->ee_len) {
++ /* need to allocate space after found extent */
++ start = block;
++ end = block + num;
++ if (end >= next)
++ end = next;
++ } else if (block >= ex->ee_block) {
++ /*
++ * some part of requested space is covered
++ * by found extent
++ */
++ start = block;
++ end = ex->ee_block + ex->ee_len;
++ if (block + num < end)
++ end = block + num;
++ exists = 1;
++ } else {
++ BUG();
++ }
++ EXT_ASSERT(end > start);
++
++ if (!exists) {
++ cbex.ec_block = start;
++ cbex.ec_len = end - start;
++ cbex.ec_start = 0;
++ cbex.ec_type = EXT3_EXT_CACHE_GAP;
++ } else {
++ cbex.ec_block = ex->ee_block;
++ cbex.ec_len = ex->ee_len;
++ cbex.ec_start = ex->ee_start;
++ cbex.ec_type = EXT3_EXT_CACHE_EXTENT;
++ }
++
++ EXT_ASSERT(cbex.ec_len > 0);
++ EXT_ASSERT(path[depth].p_hdr);
++ err = func(tree, path, &cbex);
++ ext3_ext_drop_refs(path);
++
++ if (err < 0)
++ break;
++ if (err == EXT_REPEAT)
++ continue;
++ else if (err == EXT_BREAK) {
++ err = 0;
++ break;
++ }
++
++ if (EXT_DEPTH(tree) != depth) {
++ /* depth was changed. we have to realloc path */
++ kfree(path);
++ path = NULL;
++ }
++
++ block = cbex.ec_block + cbex.ec_len;
++ }
++
++ if (path) {
++ ext3_ext_drop_refs(path);
++ kfree(path);
++ }
++
++ return err;
++}
++
++static inline void
++ext3_ext_put_in_cache(struct ext3_extents_tree *tree, __u32 block,
++ __u32 len, __u32 start, int type)
++{
++ EXT_ASSERT(len > 0);
++ if (tree->cex) {
++ tree->cex->ec_type = type;
++ tree->cex->ec_block = block;
++ tree->cex->ec_len = len;
++ tree->cex->ec_start = start;
++ }
++}
++
++/*
++ * this routine calculates the boundaries of the gap the requested
++ * block fits into, and caches this gap
++ */
++static inline void
++ext3_ext_put_gap_in_cache(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ unsigned long block)
++{
++ int depth = EXT_DEPTH(tree);
++ unsigned long lblock, len;
++ struct ext3_extent *ex;
++
++ if (!tree->cex)
++ return;
++
++ ex = path[depth].p_ext;
++ if (ex == NULL) {
++ /* there is no extent yet, so gap is [0;-] */
++ lblock = 0;
++ len = EXT_MAX_BLOCK;
++ ext_debug(tree, "cache gap(whole file):");
++ } else if (block < ex->ee_block) {
++ lblock = block;
++ len = ex->ee_block - block;
++ ext_debug(tree, "cache gap(before): %lu [%lu:%lu]",
++ (unsigned long) block,
++ (unsigned long) ex->ee_block,
++ (unsigned long) ex->ee_len);
++ } else if (block >= ex->ee_block + ex->ee_len) {
++ lblock = ex->ee_block + ex->ee_len;
++ len = ext3_ext_next_allocated_block(path);
++ ext_debug(tree, "cache gap(after): [%lu:%lu] %lu",
++ (unsigned long) ex->ee_block,
++ (unsigned long) ex->ee_len,
++ (unsigned long) block);
++ EXT_ASSERT(len > lblock);
++ len = len - lblock;
++ } else {
++ lblock = len = 0;
++ BUG();
++ }
++
++ ext_debug(tree, " -> %lu:%lu\n", (unsigned long) lblock, len);
++ ext3_ext_put_in_cache(tree, lblock, len, 0, EXT3_EXT_CACHE_GAP);
++}
++
++static inline int
++ext3_ext_in_cache(struct ext3_extents_tree *tree, unsigned long block,
++ struct ext3_extent *ex)
++{
++ struct ext3_ext_cache *cex = tree->cex;
++
++ /* is there cache storage at all? */
++ if (!cex)
++ return EXT3_EXT_CACHE_NO;
++
++ /* does the cache hold valid data? */
++ if (cex->ec_type == EXT3_EXT_CACHE_NO)
++ return EXT3_EXT_CACHE_NO;
++
++ EXT_ASSERT(cex->ec_type == EXT3_EXT_CACHE_GAP ||
++ cex->ec_type == EXT3_EXT_CACHE_EXTENT);
++ if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) {
++ ex->ee_block = cex->ec_block;
++ ex->ee_start = cex->ec_start;
++ ex->ee_len = cex->ec_len;
++ ext_debug(tree, "%lu cached by %lu:%lu:%lu\n",
++ (unsigned long) block,
++ (unsigned long) ex->ee_block,
++ (unsigned long) ex->ee_len,
++ (unsigned long) ex->ee_start);
++ return cex->ec_type;
++ }
++
++ /* not in cache */
++ return EXT3_EXT_CACHE_NO;
++}
++
++/*
++ * routine removes the index from the index block.
++ * it's used in the truncate case only, thus all requests are for
++ * the last index in the block only
++ */
++int ext3_ext_rm_idx(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ struct buffer_head *bh;
++ int err;
++
++ /* free index block */
++ path--;
++ EXT_ASSERT(path->p_hdr->eh_entries);
++ if ((err = ext3_ext_get_access(handle, tree, path)))
++ return err;
++ path->p_hdr->eh_entries--;
++ if ((err = ext3_ext_dirty(handle, tree, path)))
++ return err;
++ ext_debug(tree, "index is empty, remove it, free block %d\n",
++ path->p_idx->ei_leaf);
++ bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf);
++ ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf);
++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1);
++ return err;
++}
++
++int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ int depth = EXT_DEPTH(tree);
++ int needed;
++
++ if (path) {
++ /* probably there is space in leaf? */
++ if (path[depth].p_hdr->eh_entries < path[depth].p_hdr->eh_max)
++ return 1;
++ }
++
++ /*
++ * the worst case we're expecting is creation of the
++ * new root (growing in depth) with index splitting;
++ * for splitting we have to consider depth + 1 because
++ * previous growing could have increased it
++ */
++ depth = depth + 1;
++
++ /*
++ * growing in depth:
++ * block allocation + new root + old root
++ */
++ needed = EXT3_ALLOC_NEEDED + 2;
++
++ /* index split. we may need to:
++ * - allocate intermediate indexes and a new leaf,
++ * - change two blocks at each level, except the root,
++ * - modify the root block (inode)
++ */
++ needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1;
++
++ return needed;
++}
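++
++/*
++ * Worked example for the estimate above (illustrative, assuming
++ * EXT3_ALLOC_NEEDED == 3 for bitmap + group descriptor + superblock):
++ * for an on-disk depth of 1 with no room in the leaf, depth becomes 2
++ * and the estimate is (3 + 2) + (2 * 3 + 2 * 2 + 1) = 16 credits.
++ */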
++
++static int
++ext3_ext_split_for_rm(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path, unsigned long start,
++ unsigned long end)
++{
++ struct ext3_extent *ex, tex;
++ struct ext3_ext_path *npath;
++ int depth, creds, err;
++
++ depth = EXT_DEPTH(tree);
++ ex = path[depth].p_ext;
++ EXT_ASSERT(ex);
++ EXT_ASSERT(end < ex->ee_block + ex->ee_len - 1);
++ EXT_ASSERT(ex->ee_block < start);
++
++ /* calculate tail extent */
++ tex.ee_block = end + 1;
++ EXT_ASSERT(tex.ee_block < ex->ee_block + ex->ee_len);
++ tex.ee_len = ex->ee_block + ex->ee_len - tex.ee_block;
++
++ creds = ext3_ext_calc_credits_for_insert(tree, path);
++ handle = ext3_ext_journal_restart(handle, creds);
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++
++ /* calculate head extent. use primary extent */
++ err = ext3_ext_get_access(handle, tree, path + depth);
++ if (err)
++ return err;
++ ex->ee_len = start - ex->ee_block;
++ err = ext3_ext_dirty(handle, tree, path + depth);
++ if (err)
++ return err;
++
++ /* FIXME: some callback to free underlying resource
++ * and correct ee_start? */
++ ext_debug(tree, "split extent: head %u:%u, tail %u:%u\n",
++ ex->ee_block, ex->ee_len, tex.ee_block, tex.ee_len);
++
++ npath = ext3_ext_find_extent(tree, ex->ee_block, NULL);
++ if (IS_ERR(npath))
++ return PTR_ERR(npath);
++ depth = EXT_DEPTH(tree);
++ EXT_ASSERT(npath[depth].p_ext->ee_block == ex->ee_block);
++ EXT_ASSERT(npath[depth].p_ext->ee_len == ex->ee_len);
++
++ err = ext3_ext_insert_extent(handle, tree, npath, &tex);
++ ext3_ext_drop_refs(npath);
++ kfree(npath);
++
++ return err;
++}
++
++static int
++ext3_ext_rm_leaf(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path, unsigned long start,
++ unsigned long end)
++{
++ struct ext3_extent *ex, *fu = NULL, *lu, *le;
++ int err = 0, correct_index = 0;
++ int depth = EXT_DEPTH(tree), credits;
++ struct ext3_extent_header *eh;
++ unsigned a, b, block, num;
++
++ ext_debug(tree, "remove [%lu:%lu] in leaf\n", start, end);
++ if (!path[depth].p_hdr)
++ path[depth].p_hdr = EXT_BLOCK_HDR(path[depth].p_bh);
++ eh = path[depth].p_hdr;
++ EXT_ASSERT(eh);
++ EXT_ASSERT(eh->eh_entries <= eh->eh_max);
++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
++
++ /* find where to start removing */
++ le = ex = EXT_LAST_EXTENT(eh);
++ while (ex != EXT_FIRST_EXTENT(eh)) {
++ if (ex->ee_block <= end)
++ break;
++ ex--;
++ }
++
++ if (start > ex->ee_block && end < ex->ee_block + ex->ee_len - 1) {
++ /* removal of an internal part of the extent was requested;
++ * tail and head must be placed in different extents,
++ * so we have to insert one more extent */
++ path[depth].p_ext = ex;
++ return ext3_ext_split_for_rm(handle, tree, path, start, end);
++ }
++
++ lu = ex;
++ while (ex >= EXT_FIRST_EXTENT(eh) && ex->ee_block + ex->ee_len > start) {
++ ext_debug(tree, "remove ext %u:%u\n", ex->ee_block, ex->ee_len);
++ path[depth].p_ext = ex;
++
++ a = ex->ee_block > start ? ex->ee_block : start;
++ b = ex->ee_block + ex->ee_len - 1 < end ?
++ ex->ee_block + ex->ee_len - 1 : end;
++
++ ext_debug(tree, " border %u:%u\n", a, b);
++
++ if (a != ex->ee_block && b != ex->ee_block + ex->ee_len - 1) {
++ block = 0;
++ num = 0;
++ BUG();
++ } else if (a != ex->ee_block) {
++ /* remove tail of the extent */
++ block = ex->ee_block;
++ num = a - block;
++ } else if (b != ex->ee_block + ex->ee_len - 1) {
++ /* remove head of the extent */
++ block = a;
++ num = b - a;
++ } else {
++ /* remove whole extent: excellent! */
++ block = ex->ee_block;
++ num = 0;
++ EXT_ASSERT(a == ex->ee_block &&
++ b == ex->ee_block + ex->ee_len - 1);
++ }
++
++ if (ex == EXT_FIRST_EXTENT(eh))
++ correct_index = 1;
++
++ credits = 1;
++ if (correct_index)
++ credits += (EXT_DEPTH(tree) * EXT3_ALLOC_NEEDED) + 1;
++ if (tree->ops->remove_extent_credits)
++ credits += tree->ops->remove_extent_credits(tree, ex, a, b);
++
++ handle = ext3_ext_journal_restart(handle, credits);
++ if (IS_ERR(handle)) {
++ err = PTR_ERR(handle);
++ goto out;
++ }
++
++ err = ext3_ext_get_access(handle, tree, path + depth);
++ if (err)
++ goto out;
++
++ if (tree->ops->remove_extent)
++ err = tree->ops->remove_extent(tree, ex, a, b);
++ if (err)
++ goto out;
++
++ if (num == 0) {
++ /* this extent is removed entirely, mark the slot unused */
++ ex->ee_start = 0;
++ eh->eh_entries--;
++ fu = ex;
++ }
++
++ ex->ee_block = block;
++ ex->ee_len = num;
++
++ err = ext3_ext_dirty(handle, tree, path + depth);
++ if (err)
++ goto out;
++
++ ext_debug(tree, "new extent: %u:%u:%u\n",
++ ex->ee_block, ex->ee_len, ex->ee_start);
++ ex--;
++ }
++
++ if (fu) {
++ /* reuse unused slots */
++ while (lu < le) {
++ if (lu->ee_start) {
++ *fu = *lu;
++ lu->ee_start = 0;
++ fu++;
++ }
++ lu++;
++ }
++ }
++
++ if (correct_index && eh->eh_entries)
++ err = ext3_ext_correct_indexes(handle, tree, path);
++
++ /* if this leaf is free, then we should
++ * remove it from index block above */
++ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
++ err = ext3_ext_rm_idx(handle, tree, path + depth);
++
++out:
++ return err;
++}
++
++
++static struct ext3_extent_idx *
++ext3_ext_last_covered(struct ext3_extent_header *hdr, unsigned long block)
++{
++ struct ext3_extent_idx *ix;
++
++ ix = EXT_LAST_INDEX(hdr);
++ while (ix != EXT_FIRST_INDEX(hdr)) {
++ if (ix->ei_block <= block)
++ break;
++ ix--;
++ }
++ return ix;
++}
++
++/*
++ * returns 1 if the current index has to be freed (even if partially)
++ */
++static inline int
++ext3_ext_more_to_rm(struct ext3_ext_path *path)
++{
++ EXT_ASSERT(path->p_idx);
++
++ if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
++ return 0;
++
++ /*
++ * if truncation on a deeper level happened and it wasn't partial,
++ * then we have to consider the current index for truncation
++ */
++ if (path->p_hdr->eh_entries == path->p_block)
++ return 0;
++ return 1;
++}
++
++int ext3_ext_remove_space(struct ext3_extents_tree *tree,
++ unsigned long start, unsigned long end)
++{
++ struct inode *inode = tree->inode;
++ struct super_block *sb = inode->i_sb;
++ int depth = EXT_DEPTH(tree);
++ struct ext3_ext_path *path;
++ handle_t *handle;
++ int i = 0, err = 0;
++
++ ext_debug(tree, "space to be removed: %lu:%lu\n", start, end);
++
++ /* probably first extent we're gonna free will be last in block */
++ handle = ext3_journal_start(inode, depth + 1);
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++
++ ext3_ext_invalidate_cache(tree);
++
++ /*
++ * we start scanning from the right side, freeing all the blocks
++ * after i_size and walking into the depth
++ */
++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL);
++ if (path == NULL) {
++ ext3_error(sb, __FUNCTION__, "Can't allocate path array");
++ ext3_journal_stop(handle);
++ return -ENOMEM;
++ }
++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1));
++ path[i].p_hdr = EXT_ROOT_HDR(tree);
++
++ while (i >= 0 && err == 0) {
++ if (i == depth) {
++ /* this is leaf block */
++ err = ext3_ext_rm_leaf(handle, tree, path, start, end);
++ /* the root level has p_bh == NULL, brelse() handles this */
++ brelse(path[i].p_bh);
++ i--;
++ continue;
++ }
++
++ /* this is index block */
++ if (!path[i].p_hdr) {
++ ext_debug(tree, "initialize header\n");
++ path[i].p_hdr = EXT_BLOCK_HDR(path[i].p_bh);
++ }
++
++ EXT_ASSERT(path[i].p_hdr->eh_entries <= path[i].p_hdr->eh_max);
++ EXT_ASSERT(path[i].p_hdr->eh_magic == EXT3_EXT_MAGIC);
++
++ if (!path[i].p_idx) {
++ /* this level hasn't been touched yet */
++ path[i].p_idx =
++ ext3_ext_last_covered(path[i].p_hdr, end);
++ path[i].p_block = path[i].p_hdr->eh_entries + 1;
++ ext_debug(tree, "init index ptr: hdr 0x%p, num %d\n",
++ path[i].p_hdr, path[i].p_hdr->eh_entries);
++ } else {
++ /* we've been here already; look at the next index */
++ path[i].p_idx--;
++ }
++
++ ext_debug(tree, "level %d - index, first 0x%p, cur 0x%p\n",
++ i, EXT_FIRST_INDEX(path[i].p_hdr),
++ path[i].p_idx);
++ if (ext3_ext_more_to_rm(path + i)) {
++ /* go to the next level */
++ ext_debug(tree, "move to level %d (block %d)\n",
++ i + 1, path[i].p_idx->ei_leaf);
++ memset(path + i + 1, 0, sizeof(*path));
++ path[i+1].p_bh = sb_bread(sb, path[i].p_idx->ei_leaf);
++ if (!path[i+1].p_bh) {
++ /* should we reset i_size? */
++ err = -EIO;
++ break;
++ }
++ /* store the actual number of indexes so we can tell
++ * whether this number changed at the next iteration */
++ path[i].p_block = path[i].p_hdr->eh_entries;
++ i++;
++ } else {
++ /* we finish processing this index, go up */
++ if (path[i].p_hdr->eh_entries == 0 && i > 0) {
++ /* index is empty, remove it;
++ * the handle must already be prepared by the
++ * leaf removal in ext3_ext_rm_leaf() */
++ err = ext3_ext_rm_idx(handle, tree, path + i);
++ }
++ /* the root level has p_bh == NULL, brelse() handles this */
++ brelse(path[i].p_bh);
++ i--;
++ ext_debug(tree, "return to level %d\n", i);
++ }
++ }
++
++ /* TODO: flexible tree reduction should be here */
++ if (path->p_hdr->eh_entries == 0) {
++ /*
++ * truncate to zero freed the whole tree,
++ * so we need to correct eh_depth
++ */
++ err = ext3_ext_get_access(handle, tree, path);
++ if (err == 0) {
++ EXT_ROOT_HDR(tree)->eh_depth = 0;
++ EXT_ROOT_HDR(tree)->eh_max = ext3_ext_space_root(tree);
++ err = ext3_ext_dirty(handle, tree, path);
++ }
++ }
++ ext3_ext_tree_changed(tree);
++
++ kfree(path);
++ ext3_journal_stop(handle);
++
++ return err;
++}
++
++int ext3_ext_calc_metadata_amount(struct ext3_extents_tree *tree, int blocks)
++{
++ int lcap, icap, rcap, leafs, idxs, num;
++
++ rcap = ext3_ext_space_root(tree);
++ if (blocks <= rcap) {
++ /* all extents fit to the root */
++ return 0;
++ }
++
++ rcap = ext3_ext_space_root_idx(tree);
++ lcap = ext3_ext_space_block(tree);
++ icap = ext3_ext_space_block_idx(tree);
++
++ num = leafs = (blocks + lcap - 1) / lcap;
++ if (leafs <= rcap) {
++ /* all pointers to leafs fit to the root */
++ return leafs;
++ }
++
++ /* ok. we need separate index block(s) to link all leaf blocks */
++ idxs = (leafs + icap - 1) / icap;
++ do {
++ num += idxs;
++ idxs = (idxs + icap - 1) / icap;
++ } while (idxs > rcap);
++
++ return num;
++}
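++
++/*
++ * Worked example for the calculation above (illustrative, 4096-byte
++ * blocks and 12-byte entries as assumed earlier, so lcap = icap = 340
++ * and rcap = 4): for blocks = 1000, leafs = (1000 + 339) / 340 = 3 <= 4,
++ * so 3 metadata blocks suffice; for blocks = 10000, leafs = 30 > 4 and
++ * one index block links the leaves, giving num = 30 + 1 = 31.
++ */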
++
++/*
++ * called at mount time
++ */
++void ext3_ext_init(struct super_block *sb)
++{
++ /*
++ * possible initialization would be here
++ */
++
++ if (test_opt(sb, EXTENTS)) {
++ printk("EXT3-fs: file extents enabled");
++#ifdef AGRESSIVE_TEST
++ printk(", agressive tests");
++#endif
++#ifdef CHECK_BINSEARCH
++ printk(", check binsearch");
++#endif
++ printk("\n");
++ }
++}
++
++/*
++ * called at umount time
++ */
++void ext3_ext_release(struct super_block *sb)
++{
++}
++
++/************************************************************************
++ * VFS related routines
++ ************************************************************************/
++
++static int ext3_get_inode_write_access(handle_t *handle, void *buffer)
++{
++ /* we use in-core data, not bh */
++ return 0;
++}
++
++static int ext3_mark_buffer_dirty(handle_t *handle, void *buffer)
++{
++ struct inode *inode = buffer;
++ return ext3_mark_inode_dirty(handle, inode);
++}
++
++static int ext3_ext_mergable(struct ext3_extent *ex1,
++ struct ext3_extent *ex2)
++{
++ /* FIXME: support for large fs */
++ if (ex1->ee_start + ex1->ee_len == ex2->ee_start)
++ return 1;
++ return 0;
++}
++
++static int
++ext3_remove_blocks_credits(struct ext3_extents_tree *tree,
++ struct ext3_extent *ex,
++ unsigned long from, unsigned long to)
++{
++ int needed;
++
++ /* at present, an extent can't cross a block group boundary */
++ needed = 4; /* bitmap + group desc + sb + inode */
++
++#ifdef CONFIG_QUOTA
++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
++#endif
++ return needed;
++}
++
++static int
++ext3_remove_blocks(struct ext3_extents_tree *tree,
++ struct ext3_extent *ex,
++ unsigned long from, unsigned long to)
++{
++ int needed = ext3_remove_blocks_credits(tree, ex, from, to);
++ handle_t *handle = ext3_journal_start(tree->inode, needed);
++ struct buffer_head *bh;
++ int i;
++
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++ if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) {
++ /* tail removal */
++ unsigned long num, start;
++ num = ex->ee_block + ex->ee_len - from;
++ start = ex->ee_start + ex->ee_len - num;
++ ext_debug(tree, "free last %lu blocks starting %lu\n",
++ num, start);
++ for (i = 0; i < num; i++) {
++ bh = sb_find_get_block(tree->inode->i_sb, start + i);
++ ext3_forget(handle, 0, tree->inode, bh, start + i);
++ }
++ ext3_free_blocks(handle, tree->inode, start, num);
++ } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) {
++ printk("strange request: removal %lu-%lu from %u:%u\n",
++ from, to, ex->ee_block, ex->ee_len);
++ } else {
++ printk("strange request: removal(2) %lu-%lu from %u:%u\n",
++ from, to, ex->ee_block, ex->ee_len);
++ }
++ ext3_journal_stop(handle);
++ return 0;
++}
++
++static int ext3_ext_find_goal(struct inode *inode,
++ struct ext3_ext_path *path, unsigned long block)
++{
++ struct ext3_inode_info *ei = EXT3_I(inode);
++ unsigned long bg_start;
++ unsigned long colour;
++ int depth;
++
++ if (path) {
++ struct ext3_extent *ex;
++ depth = path->p_depth;
++
++ /* try to predict block placement */
++ if ((ex = path[depth].p_ext))
++ return ex->ee_start + (block - ex->ee_block);
++
++ /* it looks like the index is empty;
++ * try to find a goal starting from the index block itself */
++ if (path[depth].p_bh)
++ return path[depth].p_bh->b_blocknr;
++ }
++
++ /* OK. use inode's group */
++ bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
++ colour = (current->pid % 16) *
++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
++ return bg_start + colour + block;
++}
++
++static int ext3_new_block_cb(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *ex, int *err)
++{
++ struct inode *inode = tree->inode;
++ int newblock, goal;
++
++ EXT_ASSERT(path);
++ EXT_ASSERT(ex);
++ EXT_ASSERT(ex->ee_start);
++ EXT_ASSERT(ex->ee_len);
++
++ /* reuse block from the extent to order data/metadata */
++ newblock = ex->ee_start++;
++ ex->ee_len--;
++ if (ex->ee_len == 0) {
++ ex->ee_len = 1;
++ /* allocate new block for the extent */
++ goal = ext3_ext_find_goal(inode, path, ex->ee_block);
++ ex->ee_start = ext3_new_block(handle, inode, goal, err);
++ if (ex->ee_start == 0) {
++ /* error occurred: restore the old extent */
++ ex->ee_start = newblock;
++ return 0;
++ }
++ }
++ return newblock;
++}
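++
++/*
++ * Illustrative behaviour of the callback above (not compiled code):
++ * for ex = {ee_block 10, ee_len 3, ee_start 700}, three successive
++ * calls hand out blocks 700, 701 and 702 from the extent itself; on
++ * the third call ee_len would reach zero, so the extent is rewritten
++ * to length 1 at a freshly allocated block, keeping tree metadata
++ * close to the data it maps.
++ */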
++
++static struct ext3_extents_helpers ext3_blockmap_helpers = {
++ .get_write_access = ext3_get_inode_write_access,
++ .mark_buffer_dirty = ext3_mark_buffer_dirty,
++ .mergable = ext3_ext_mergable,
++ .new_block = ext3_new_block_cb,
++ .remove_extent = ext3_remove_blocks,
++ .remove_extent_credits = ext3_remove_blocks_credits,
++};
++
++void ext3_init_tree_desc(struct ext3_extents_tree *tree,
++ struct inode *inode)
++{
++ tree->inode = inode;
++ tree->root = (void *) EXT3_I(inode)->i_data;
++ tree->buffer = (void *) inode;
++ tree->buffer_len = sizeof(EXT3_I(inode)->i_data);
++ tree->cex = (struct ext3_ext_cache *) &EXT3_I(inode)->i_cached_extent;
++ tree->ops = &ext3_blockmap_helpers;
++}
++
++int ext3_ext_get_block(handle_t *handle, struct inode *inode,
++ long iblock, struct buffer_head *bh_result,
++ int create, int extend_disksize)
++{
++ struct ext3_ext_path *path = NULL;
++ struct ext3_extent newex;
++ struct ext3_extent *ex;
++ int goal, newblock, err = 0, depth;
++ struct ext3_extents_tree tree;
++
++ clear_buffer_new(bh_result);
++ ext3_init_tree_desc(&tree, inode);
++ ext_debug(&tree, "block %d requested for inode %u\n",
++ (int) iblock, (unsigned) inode->i_ino);
++ down(&EXT3_I(inode)->truncate_sem);
++
++ /* check in cache */
++ if ((goal = ext3_ext_in_cache(&tree, iblock, &newex))) {
++ if (goal == EXT3_EXT_CACHE_GAP) {
++ if (!create) {
++ /* the block isn't allocated yet and
++ * the user doesn't want to allocate it */
++ goto out2;
++ }
++ /* we should allocate requested block */
++ } else if (goal == EXT3_EXT_CACHE_EXTENT) {
++ /* block is already allocated */
++ newblock = iblock - newex.ee_block + newex.ee_start;
++ goto out;
++ } else {
++ EXT_ASSERT(0);
++ }
++ }
++
++ /* find extent for this block */
++ path = ext3_ext_find_extent(&tree, iblock, NULL);
++ if (IS_ERR(path)) {
++ err = PTR_ERR(path);
++ path = NULL;
++ goto out2;
++ }
++
++ depth = EXT_DEPTH(&tree);
++
++ /*
++ * a consistent leaf must not be empty;
++ * this situation is possible, though, _during_ tree modification;
++ * this is why the assert can't be put in ext3_ext_find_extent()
++ */
++ EXT_ASSERT(path[depth].p_ext != NULL || depth == 0);
++
++ if ((ex = path[depth].p_ext)) {
++ /* if the found extent covers the block, simply return it */
++ if (iblock >= ex->ee_block && iblock < ex->ee_block + ex->ee_len) {
++ newblock = iblock - ex->ee_block + ex->ee_start;
++ ext_debug(&tree, "%d fit into %d:%d -> %d\n",
++ (int) iblock, ex->ee_block, ex->ee_len,
++ newblock);
++ ext3_ext_put_in_cache(&tree, ex->ee_block,
++ ex->ee_len, ex->ee_start,
++ EXT3_EXT_CACHE_EXTENT);
++ goto out;
++ }
++ }
++
++ /*
++ * the requested block isn't allocated yet;
++ * we must not create the block if the create flag is zero
++ */
++ if (!create) {
++ /* put the just-found gap into the cache to speed up subsequent requests */
++ ext3_ext_put_gap_in_cache(&tree, path, iblock);
++ goto out2;
++ }
++
++ /* allocate new block */
++ goal = ext3_ext_find_goal(inode, path, iblock);
++ newblock = ext3_new_block(handle, inode, goal, &err);
++ if (!newblock)
++ goto out2;
++ ext_debug(&tree, "allocate new block: goal %d, found %d\n",
++ goal, newblock);
++
++ /* try to insert new extent into found leaf and return */
++ newex.ee_block = iblock;
++ newex.ee_start = newblock;
++ newex.ee_len = 1;
++ err = ext3_ext_insert_extent(handle, &tree, path, &newex);
++ if (err)
++ goto out2;
++
++ if (extend_disksize && inode->i_size > EXT3_I(inode)->i_disksize)
++ EXT3_I(inode)->i_disksize = inode->i_size;
++
++ /* previous routine could use block we allocated */
++ newblock = newex.ee_start;
++ set_buffer_new(bh_result);
++
++ ext3_ext_put_in_cache(&tree, newex.ee_block, newex.ee_len,
++ newex.ee_start, EXT3_EXT_CACHE_EXTENT);
++out:
++ ext3_ext_show_leaf(&tree, path);
++ map_bh(bh_result, inode->i_sb, newblock);
++out2:
++ if (path) {
++ ext3_ext_drop_refs(path);
++ kfree(path);
++ }
++ up(&EXT3_I(inode)->truncate_sem);
++
++ return err;
++}
++
++void ext3_ext_truncate(struct inode * inode, struct page *page)
++{
++ struct address_space *mapping = inode->i_mapping;
++ struct super_block *sb = inode->i_sb;
++ struct ext3_extents_tree tree;
++ unsigned long last_block;
++ handle_t *handle;
++ int err = 0;
++
++ ext3_init_tree_desc(&tree, inode);
++
++ /*
++ * the first extent we free will probably be the last one in its block
++ */
++ err = ext3_writepage_trans_blocks(inode) + 3;
++ handle = ext3_journal_start(inode, err);
++ if (IS_ERR(handle)) {
++ if (page) {
++ clear_highpage(page);
++ flush_dcache_page(page);
++ unlock_page(page);
++ page_cache_release(page);
++ }
++ return;
++ }
++
++ if (page)
++ ext3_block_truncate_page(handle, page, mapping, inode->i_size);
++
++ down(&EXT3_I(inode)->truncate_sem);
++ ext3_ext_invalidate_cache(&tree);
++
++ /*
++ * TODO: optimization is possible here
++ * we probably don't need any scanning at all,
++ * because page truncation is enough
++ */
++ if (ext3_orphan_add(handle, inode))
++ goto out_stop;
++
++ /* we have to know where to truncate from in case of a crash */
++ EXT3_I(inode)->i_disksize = inode->i_size;
++ ext3_mark_inode_dirty(handle, inode);
++
++ last_block = (inode->i_size + sb->s_blocksize - 1) >>
++ EXT3_BLOCK_SIZE_BITS(sb);
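++ /* e.g. 4096-byte blocks, i_size 10000: last_block = 14095 >> 12 = 3 */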
++ err = ext3_ext_remove_space(&tree, last_block, EXT_MAX_BLOCK);
++
++ /* In a multi-transaction truncate, we only make the final
++ * transaction synchronous */
++ if (IS_SYNC(inode))
++ handle->h_sync = 1;
++
++out_stop:
++ /*
++ * If this was a simple ftruncate(), and the file will remain alive
++ * then we need to clear up the orphan record which we created above.
++ * However, if this was a real unlink then we were called by
++ * ext3_delete_inode(), and we allow that function to clean up the
++ * orphan info for us.
++ */
++ if (inode->i_nlink)
++ ext3_orphan_del(handle, inode);
++
++ up(&EXT3_I(inode)->truncate_sem);
++ ext3_journal_stop(handle);
++}
++
++/*
++ * this routine calculates the max number of blocks we could modify
++ * in order to allocate a new block for an inode
++ */
++int ext3_ext_writepage_trans_blocks(struct inode *inode, int num)
++{
++ struct ext3_extents_tree tree;
++ int needed;
++
++ ext3_init_tree_desc(&tree, inode);
++
++ needed = ext3_ext_calc_credits_for_insert(&tree, NULL);
++
++ /* the caller wants to allocate 'num' blocks */
++ needed *= num;
++
++#ifdef CONFIG_QUOTA
++ /*
++ * FIXME: the real calculation should be here;
++ * it depends on the blockmap format of the quota file
++ */
++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
++#endif
++
++ return needed;
++}
++
++void ext3_extents_initialize_blockmap(handle_t *handle, struct inode *inode)
++{
++ struct ext3_extents_tree tree;
++
++ ext3_init_tree_desc(&tree, inode);
++ ext3_extent_tree_init(handle, &tree);
++}
++
++int ext3_ext_calc_blockmap_metadata(struct inode *inode, int blocks)
++{
++ struct ext3_extents_tree tree;
++
++ ext3_init_tree_desc(&tree, inode);
++ return ext3_ext_calc_metadata_amount(&tree, blocks);
++}
++
++static int
++ext3_ext_store_extent_cb(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_ext_cache *newex)
++{
++ struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private;
++
++ if (newex->ec_type != EXT3_EXT_CACHE_EXTENT)
++ return EXT_CONTINUE;
++
++ if (buf->err < 0)
++ return EXT_BREAK;
++ if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen)
++ return EXT_BREAK;
++
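++ /* copy_to_user() returns the number of bytes it could not copy;
++ * 0 means success */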
++ if (!copy_to_user(buf->cur, newex, sizeof(*newex))) {
++ buf->err++;
++ buf->cur += sizeof(*newex);
++ } else {
++ buf->err = -EFAULT;
++ return EXT_BREAK;
++ }
++ return EXT_CONTINUE;
++}
++
++static int
++ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_ext_cache *ex)
++{
++ struct ext3_extent_tree_stats *buf =
++ (struct ext3_extent_tree_stats *) tree->private;
++ int depth;
++
++ if (ex->ec_type != EXT3_EXT_CACHE_EXTENT)
++ return EXT_CONTINUE;
++
++ depth = EXT_DEPTH(tree);
++ buf->extents_num++;
++ if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr))
++ buf->leaf_num++;
++ return EXT_CONTINUE;
++}
++
++int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
++ unsigned long arg)
++{
++ int err = 0;
++
++ if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL))
++ return -EINVAL;
++
++ if (cmd == EXT3_IOC_GET_EXTENTS) {
++ struct ext3_extent_buf buf;
++ struct ext3_extents_tree tree;
++
++ if (copy_from_user(&buf, (void *) arg, sizeof(buf)))
++ return -EFAULT;
++
++ ext3_init_tree_desc(&tree, inode);
++ buf.cur = buf.buffer;
++ buf.err = 0;
++ tree.private = &buf;
++ down(&EXT3_I(inode)->truncate_sem);
++ err = ext3_ext_walk_space(&tree, buf.start, EXT_MAX_BLOCK,
++ ext3_ext_store_extent_cb);
++ up(&EXT3_I(inode)->truncate_sem);
++ if (err == 0)
++ err = buf.err;
++ } else if (cmd == EXT3_IOC_GET_TREE_STATS) {
++ struct ext3_extent_tree_stats buf;
++ struct ext3_extents_tree tree;
++
++ ext3_init_tree_desc(&tree, inode);
++ down(&EXT3_I(inode)->truncate_sem);
++ buf.depth = EXT_DEPTH(&tree);
++ buf.extents_num = 0;
++ buf.leaf_num = 0;
++ tree.private = &buf;
++ err = ext3_ext_walk_space(&tree, 0, EXT_MAX_BLOCK,
++ ext3_ext_collect_stats_cb);
++ up(&EXT3_I(inode)->truncate_sem);
++ if (!err && copy_to_user((void *) arg, &buf, sizeof(buf)))
++ err = -EFAULT;
++ } else if (cmd == EXT3_IOC_GET_TREE_DEPTH) {
++ struct ext3_extents_tree tree;
++ ext3_init_tree_desc(&tree, inode);
++ down(&EXT3_I(inode)->truncate_sem);
++ err = EXT_DEPTH(&tree);
++ up(&EXT3_I(inode)->truncate_sem);
++ }
++
++ return err;
++}
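++
++/*
++ * Illustrative userspace use of EXT3_IOC_GET_EXTENTS (a sketch, not part
++ * of this patch): the caller passes an ext3_extent_buf whose 'buffer'
++ * points at user memory; the ioctl returns the number of
++ * ext3_ext_cache records copied there:
++ *
++ *	struct ext3_ext_cache cex[128];
++ *	struct ext3_extent_buf buf = { .start = 0, .buffer = cex,
++ *				       .buflen = sizeof(cex) };
++ *	int n = ioctl(fd, EXT3_IOC_GET_EXTENTS, &buf);
++ */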
++
++EXPORT_SYMBOL(ext3_init_tree_desc);
++EXPORT_SYMBOL(ext3_mark_inode_dirty);
++EXPORT_SYMBOL(ext3_ext_invalidate_cache);
++EXPORT_SYMBOL(ext3_ext_insert_extent);
++EXPORT_SYMBOL(ext3_ext_walk_space);
++EXPORT_SYMBOL(ext3_ext_find_goal);
++EXPORT_SYMBOL(ext3_ext_calc_credits_for_insert);
+Index: linux-2.6.12-rc6/fs/ext3/ialloc.c
+===================================================================
+--- linux-2.6.12-rc6.orig/fs/ext3/ialloc.c 2005-06-14 16:31:08.634433030 +0200
++++ linux-2.6.12-rc6/fs/ext3/ialloc.c 2005-06-14 16:31:25.846346882 +0200
+@@ -598,7 +598,7 @@
+ ei->i_dir_start_lookup = 0;
+ ei->i_disksize = 0;
+
+- ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL;
++ ei->i_flags = EXT3_I(dir)->i_flags & ~(EXT3_INDEX_FL|EXT3_EXTENTS_FL);
+ if (S_ISLNK(mode))
+ ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL);
+ /* dirsync only applies to directories */
+@@ -639,6 +639,18 @@
+ DQUOT_FREE_INODE(inode);
+ goto fail2;
+ }
++ if (test_opt(sb, EXTENTS) && S_ISREG(inode->i_mode)) {
++ EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL;
++ ext3_extents_initialize_blockmap(handle, inode);
++ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS)) {
++ err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
++ if (err) goto fail;
++ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS);
++ BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "call ext3_journal_dirty_metadata");
++ err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
++ }
++ }
++
+ err = ext3_mark_inode_dirty(handle, inode);
+ if (err) {
+ ext3_std_error(sb, err);
+Index: linux-2.6.12-rc6/fs/ext3/inode.c
+===================================================================
+--- linux-2.6.12-rc6.orig/fs/ext3/inode.c 2005-06-14 16:31:09.701815830 +0200
++++ linux-2.6.12-rc6/fs/ext3/inode.c 2005-06-14 16:31:25.861971882 +0200
+@@ -40,7 +40,7 @@
+ #include "iopen.h"
+ #include "acl.h"
+
+-static int ext3_writepage_trans_blocks(struct inode *inode);
++int ext3_writepage_trans_blocks(struct inode *inode);
+
+ /*
+ * Test whether an inode is a fast symlink.
+@@ -784,6 +784,17 @@
+ return err;
+ }
+
++static inline int
++ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block,
++ struct buffer_head *bh, int create, int extend_disksize)
++{
++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
++ return ext3_ext_get_block(handle, inode, block, bh, create,
++ extend_disksize);
++ return ext3_get_block_handle(handle, inode, block, bh, create,
++ extend_disksize);
++}
++
+ static int ext3_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+ {
+@@ -794,8 +805,8 @@
+ handle = ext3_journal_current_handle();
+ J_ASSERT(handle != 0);
+ }
+- ret = ext3_get_block_handle(handle, inode, iblock,
+- bh_result, create, 1);
++ ret = ext3_get_block_wrap(handle, inode, iblock,
++ bh_result, create, 1);
+ return ret;
+ }
+
+@@ -839,7 +850,7 @@
+
+ get_block:
+ if (ret == 0)
+- ret = ext3_get_block_handle(handle, inode, iblock,
++ ret = ext3_get_block_wrap(handle, inode, iblock,
+ bh_result, create, 0);
+ bh_result->b_size = (1 << inode->i_blkbits);
+ return ret;
+@@ -859,7 +870,7 @@
+ dummy.b_state = 0;
+ dummy.b_blocknr = -1000;
+ buffer_trace_init(&dummy.b_history);
+- *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1);
++ *errp = ext3_get_block_wrap(handle, inode, block, &dummy, create, 1);
+ if (!*errp && buffer_mapped(&dummy)) {
+ struct buffer_head *bh;
+ bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
+@@ -1593,7 +1604,7 @@
+ * This required during truncate. We need to physically zero the tail end
+ * of that block so it doesn't yield old data if the file is later grown.
+ */
+-static int ext3_block_truncate_page(handle_t *handle, struct page *page,
++int ext3_block_truncate_page(handle_t *handle, struct page *page,
+ struct address_space *mapping, loff_t from)
+ {
+ unsigned long index = from >> PAGE_CACHE_SHIFT;
+@@ -2104,6 +2115,9 @@
+ return;
+ }
+
++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
++ return ext3_ext_truncate(inode, page);
++
+ handle = start_transaction(inode);
+ if (IS_ERR(handle)) {
+ if (page) {
+@@ -2850,12 +2864,15 @@
+ * block and work out the exact number of indirects which are touched. Pah.
+ */
+
+-static int ext3_writepage_trans_blocks(struct inode *inode)
++int ext3_writepage_trans_blocks(struct inode *inode)
+ {
+ int bpp = ext3_journal_blocks_per_page(inode);
+ int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
+ int ret;
+
++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
++ return ext3_ext_writepage_trans_blocks(inode, bpp);
++
+ if (ext3_should_journal_data(inode))
+ ret = 3 * (bpp + indirects) + 2;
+ else
+Index: linux-2.6.12-rc6/fs/ext3/Makefile
+===================================================================
+--- linux-2.6.12-rc6.orig/fs/ext3/Makefile 2005-06-14 16:31:09.179354899 +0200
++++ linux-2.6.12-rc6/fs/ext3/Makefile 2005-06-14 16:31:25.872714069 +0200
+@@ -5,7 +5,7 @@
+ obj-$(CONFIG_EXT3_FS) += ext3.o
+
+ ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o\
+- ioctl.o namei.o super.o symlink.o hash.o resize.o
++ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o
+
+ ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
+ ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
+Index: linux-2.6.12-rc6/fs/ext3/super.c
+===================================================================
+--- linux-2.6.12-rc6.orig/fs/ext3/super.c 2005-06-14 16:31:09.950839264 +0200
++++ linux-2.6.12-rc6/fs/ext3/super.c 2005-06-14 16:31:25.886385944 +0200
+@@ -387,6 +387,7 @@
+ struct ext3_super_block *es = sbi->s_es;
+ int i;
+
++ ext3_ext_release(sb);
+ ext3_xattr_put_super(sb);
+ journal_destroy(sbi->s_journal);
+ if (!(sb->s_flags & MS_RDONLY)) {
+@@ -451,6 +452,8 @@
+ #endif
+ ei->i_block_alloc_info = NULL;
+ ei->vfs_inode.i_version = 1;
++
++ memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent));
+ return &ei->vfs_inode;
+ }
+
+@@ -593,7 +596,7 @@
+ Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
+ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0,
+ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
+- Opt_ignore, Opt_barrier, Opt_err, Opt_resize,
++ Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_extents, Opt_extdebug,
+ };
+
+ static match_table_t tokens = {
+@@ -644,6 +647,8 @@
+ {Opt_iopen, "iopen"},
+ {Opt_noiopen, "noiopen"},
+ {Opt_iopen_nopriv, "iopen_nopriv"},
++ {Opt_extents, "extents"},
++ {Opt_extdebug, "extdebug"},
+ {Opt_barrier, "barrier=%u"},
+ {Opt_err, NULL},
+ {Opt_resize, "resize"},
+@@ -953,6 +958,12 @@
+ case Opt_nobh:
+ set_opt(sbi->s_mount_opt, NOBH);
+ break;
++ case Opt_extents:
++ set_opt (sbi->s_mount_opt, EXTENTS);
++ break;
++ case Opt_extdebug:
++ set_opt (sbi->s_mount_opt, EXTDEBUG);
++ break;
+ default:
+ printk (KERN_ERR
+ "EXT3-fs: Unrecognized mount option \"%s\" "
+@@ -1668,6 +1681,7 @@
+ percpu_counter_mod(&sbi->s_dirs_counter,
+ ext3_count_dirs(sb));
+
++ ext3_ext_init(sb);
+ lock_kernel();
+ return 0;
+
+Index: linux-2.6.12-rc6/fs/ext3/ioctl.c
+===================================================================
+--- linux-2.6.12-rc6.orig/fs/ext3/ioctl.c 2005-06-14 16:31:08.646151780 +0200
++++ linux-2.6.12-rc6/fs/ext3/ioctl.c 2005-06-14 16:31:25.897128131 +0200
+@@ -124,6 +124,10 @@
+ err = ext3_change_inode_journal_flag(inode, jflag);
+ return err;
+ }
++ case EXT3_IOC_GET_EXTENTS:
++ case EXT3_IOC_GET_TREE_STATS:
++ case EXT3_IOC_GET_TREE_DEPTH:
++ return ext3_ext_ioctl(inode, filp, cmd, arg);
+ case EXT3_IOC_GETVERSION:
+ case EXT3_IOC_GETVERSION_OLD:
+ return put_user(inode->i_generation, (int __user *) arg);
+Index: linux-2.6.12-rc6/include/linux/ext3_fs.h
+===================================================================
+--- linux-2.6.12-rc6.orig/include/linux/ext3_fs.h 2005-06-14 16:31:10.185214261 +0200
++++ linux-2.6.12-rc6/include/linux/ext3_fs.h 2005-06-14 16:31:52.859041864 +0200
+@@ -186,8 +186,9 @@
+ #define EXT3_NOTAIL_FL 0x00008000 /* don't merge file tail */
+ #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */
+ #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
++#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */
+ #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */
+
+-#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */
++#define EXT3_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
+ #define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */
+
+@@ -237,6 +238,9 @@
+ #endif
+ #define EXT3_IOC_GETRSVSZ _IOR('f', 5, long)
+ #define EXT3_IOC_SETRSVSZ _IOW('f', 6, long)
++#define EXT3_IOC_GET_EXTENTS _IOR('f', 7, long)
++#define EXT3_IOC_GET_TREE_DEPTH _IOR('f', 8, long)
++#define EXT3_IOC_GET_TREE_STATS _IOR('f', 9, long)
+
+ /*
+ * Structure of an inode on the disk
+@@ -360,6 +364,8 @@
+ #define EXT3_MOUNT_NOBH 0x40000 /* No bufferheads */
+ #define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */
+ #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000 /* Make iopen world-readable */
++#define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */
++#define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */
+
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef clear_opt
+@@ -548,11 +554,13 @@
+ #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */
+ #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */
+ #define EXT3_FEATURE_INCOMPAT_META_BG 0x0010
++#define EXT3_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */
+
+ #define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
+ #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \
+ EXT3_FEATURE_INCOMPAT_RECOVER| \
+- EXT3_FEATURE_INCOMPAT_META_BG)
++ EXT3_FEATURE_INCOMPAT_META_BG| \
++ EXT3_FEATURE_INCOMPAT_EXTENTS)
+ #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+ EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \
+ EXT3_FEATURE_RO_COMPAT_BTREE_DIR)
+@@ -759,6 +767,7 @@
+
+
+ /* inode.c */
++extern int ext3_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t);
+ extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int);
+ extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
+ extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
+@@ -828,6 +837,16 @@
+ extern struct inode_operations ext3_symlink_inode_operations;
+ extern struct inode_operations ext3_fast_symlink_inode_operations;
+
++/* extents.c */
++extern int ext3_ext_writepage_trans_blocks(struct inode *, int);
++extern int ext3_ext_get_block(handle_t *, struct inode *, long,
++ struct buffer_head *, int, int);
++extern void ext3_ext_truncate(struct inode *, struct page *);
++extern void ext3_ext_init(struct super_block *);
++extern void ext3_ext_release(struct super_block *);
++extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *);
++extern int ext3_ext_ioctl(struct inode *inode, struct file *filp,
++ unsigned int cmd, unsigned long arg);
+
+ #endif /* __KERNEL__ */
+
+Index: linux-2.6.12-rc6/include/linux/ext3_extents.h
+===================================================================
+--- linux-2.6.12-rc6.orig/include/linux/ext3_extents.h 2005-06-14 16:31:25.780917195 +0200
++++ linux-2.6.12-rc6/include/linux/ext3_extents.h 2005-06-14 16:31:25.932284381 +0200
+@@ -0,0 +1,264 @@
++/*
++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
++ * Written by Alex Tomas <alex@clusterfs.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 as
++ * published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public Licens
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
++ */
++
++#ifndef _LINUX_EXT3_EXTENTS
++#define _LINUX_EXT3_EXTENTS
++
++/*
++ * with AGRESSIVE_TEST defined, the capacity of index/leaf blocks
++ * becomes very small, so index splits, in-depth growth and
++ * other hard changes happen much more often;
++ * this is for debug purposes only
++ */
++#define AGRESSIVE_TEST_
++
++/*
++ * if CHECK_BINSEARCH is defined, the results of the binary search
++ * will be double-checked by a linear search
++ */
++#define CHECK_BINSEARCH_
++
++/*
++ * if EXT_DEBUG is defined, you can use the 'extdebug' mount option
++ * to get lots of info about what's going on
++ */
++#define EXT_DEBUG_
++#ifdef EXT_DEBUG
++#define ext_debug(tree,fmt,a...) \
++do { \
++ if (test_opt((tree)->inode->i_sb, EXTDEBUG)) \
++ printk(fmt, ##a); \
++} while (0)
++#else
++#define ext_debug(tree,fmt,a...)
++#endif
++
++/*
++ * if EXT_STATS is defined, then stats numbers are collected;
++ * these numbers will be displayed at umount time
++ */
++#define EXT_STATS_
++
++
++#define EXT3_ALLOC_NEEDED 3 /* block bitmap + group desc. + sb */
++
++/*
++ * ext3_inode has i_block array (total 60 bytes)
++ * first 4 bytes are used to store:
++ * - tree depth (0 means there is no tree yet; all extents are in the inode)
++ * - number of alive extents in the inode
++ */
++
++/*
++ * this is the extent on-disk structure;
++ * it's used at the bottom of the tree
++ */
++struct ext3_extent {
++ __u32 ee_block; /* first logical block extent covers */
++ __u16 ee_len; /* number of blocks covered by extent */
++ __u16 ee_start_hi; /* high 16 bits of physical block */
++ __u32 ee_start; /* low 32 bits of physical block */
++};
++
++/*
++ * this is index on-disk structure
++ * it's used at all the levels, but the bottom
++ */
++struct ext3_extent_idx {
++ __u32 ei_block; /* index covers logical blocks from 'block' */
++ __u32 ei_leaf; /* pointer to the physical block of the next *
++ * level; a leaf or the next index could be here */
++ __u16 ei_leaf_hi; /* high 16 bits of physical block */
++ __u16 ei_unused;
++};
++
++/*
++ * each block (leaves and indexes), even the inode-stored one, has a header
++ */
++struct ext3_extent_header {
++ __u16 eh_magic; /* probably will support different formats */
++ __u16 eh_entries; /* number of valid entries */
++ __u16 eh_max; /* capacity of the store, in entries */
++ __u16 eh_depth; /* does the tree have real underlying blocks? */
++ __u32 eh_generation; /* generation of the tree */
++};
++
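++/*
++ * the header, extent and index entries are 12 bytes each, so e.g. a
++ * 4096-byte tree block holds (4096 - 12) / 12 = 340 entries
++ */
++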
++#define EXT3_EXT_MAGIC 0xf30a
++
++/*
++ * an array of ext3_ext_path contains the path to some extent;
++ * creation/lookup routines use it for traversal/splitting/etc;
++ * truncate uses it to simulate recursive walking
++ */
++struct ext3_ext_path {
++ __u32 p_block;
++ __u16 p_depth;
++ struct ext3_extent *p_ext;
++ struct ext3_extent_idx *p_idx;
++ struct ext3_extent_header *p_hdr;
++ struct buffer_head *p_bh;
++};
++
++/*
++ * structure for external API
++ */
++
++/*
++ * storage for cached extent
++ */
++struct ext3_ext_cache {
++ __u32 ec_start;
++ __u32 ec_block;
++ __u32 ec_len;
++ __u32 ec_type;
++};
++
++#define EXT3_EXT_CACHE_NO 0
++#define EXT3_EXT_CACHE_GAP 1
++#define EXT3_EXT_CACHE_EXTENT 2
++
++/*
++ * ext3_extents_tree is used to pass initial information
++ * to top-level extents API
++ */
++struct ext3_extents_helpers;
++struct ext3_extents_tree {
++ struct inode *inode; /* the inode this tree belongs to */
++ void *root; /* ptr to the data where the top of the tree resides */
++ void *buffer; /* passed as the 'buffer' arg to the ops callbacks */
++ int buffer_len;
++ void *private;
++ struct ext3_ext_cache *cex;/* last found extent */
++ struct ext3_extents_helpers *ops;
++};
++
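++/*
++ * these callbacks abstract the tree's backing store, so the same code
++ * can manage a tree rooted in the inode's i_data or in a separate buffer
++ */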
++struct ext3_extents_helpers {
++ int (*get_write_access)(handle_t *h, void *buffer);
++ int (*mark_buffer_dirty)(handle_t *h, void *buffer);
++ int (*mergable)(struct ext3_extent *ex1, struct ext3_extent *ex2);
++ int (*remove_extent_credits)(struct ext3_extents_tree *,
++ struct ext3_extent *, unsigned long,
++ unsigned long);
++ int (*remove_extent)(struct ext3_extents_tree *,
++ struct ext3_extent *, unsigned long,
++ unsigned long);
++ int (*new_block)(handle_t *, struct ext3_extents_tree *,
++ struct ext3_ext_path *, struct ext3_extent *,
++ int *);
++};
++
++/*
++ * to be called by ext3_ext_walk_space()
++ * negative retcode - error
++ * positive retcode - signal for ext3_ext_walk_space(), see below
++ * callback must return valid extent (passed or newly created)
++ */
++typedef int (*ext_prepare_callback)(struct ext3_extents_tree *,
++ struct ext3_ext_path *,
++ struct ext3_ext_cache *);
++
++#define EXT_CONTINUE 0
++#define EXT_BREAK 1
++#define EXT_REPEAT 2
++
++
++#define EXT_MAX_BLOCK 0xffffffff
++
++
++#define EXT_FIRST_EXTENT(__hdr__) \
++ ((struct ext3_extent *) (((char *) (__hdr__)) + \
++ sizeof(struct ext3_extent_header)))
++#define EXT_FIRST_INDEX(__hdr__) \
++ ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \
++ sizeof(struct ext3_extent_header)))
++#define EXT_HAS_FREE_INDEX(__path__) \
++ ((__path__)->p_hdr->eh_entries < (__path__)->p_hdr->eh_max)
++#define EXT_LAST_EXTENT(__hdr__) \
++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_entries - 1)
++#define EXT_LAST_INDEX(__hdr__) \
++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_entries - 1)
++#define EXT_MAX_EXTENT(__hdr__) \
++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1)
++#define EXT_MAX_INDEX(__hdr__) \
++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1)
++
++#define EXT_ROOT_HDR(tree) \
++ ((struct ext3_extent_header *) (tree)->root)
++#define EXT_BLOCK_HDR(bh) \
++ ((struct ext3_extent_header *) (bh)->b_data)
++#define EXT_DEPTH(_t_) \
++ (((struct ext3_extent_header *)((_t_)->root))->eh_depth)
++#define EXT_GENERATION(_t_) \
++ (((struct ext3_extent_header *)((_t_)->root))->eh_generation)
++
++
++#define EXT_ASSERT(__x__) do { if (!(__x__)) BUG(); } while (0)
++
++#define EXT_CHECK_PATH(tree,path) \
++{ \
++ int depth = EXT_DEPTH(tree); \
++ BUG_ON((unsigned long) (path) < __PAGE_OFFSET); \
++ BUG_ON((unsigned long) (path)[depth].p_idx < \
++ __PAGE_OFFSET && (path)[depth].p_idx != NULL); \
++ BUG_ON((unsigned long) (path)[depth].p_ext < \
++ __PAGE_OFFSET && (path)[depth].p_ext != NULL); \
++ BUG_ON((unsigned long) (path)[depth].p_hdr < __PAGE_OFFSET); \
++ BUG_ON((unsigned long) (path)[depth].p_bh < __PAGE_OFFSET \
++ && depth != 0); \
++ BUG_ON((path)[0].p_depth != depth); \
++}
++
++
++/*
++ * this structure is used to gather extents from the tree via ioctl
++ */
++struct ext3_extent_buf {
++ unsigned long start;
++ int buflen;
++ void *buffer;
++ void *cur;
++ int err;
++};
++
++/*
++ * this structure is used to collect stats info about the tree
++ */
++struct ext3_extent_tree_stats {
++ int depth;
++ int extents_num;
++ int leaf_num;
++};
++
++extern void ext3_init_tree_desc(struct ext3_extents_tree *, struct inode *);
++extern int ext3_extent_tree_init(handle_t *, struct ext3_extents_tree *);
++extern int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *, struct ext3_ext_path *);
++extern int ext3_ext_insert_extent(handle_t *, struct ext3_extents_tree *, struct ext3_ext_path *, struct ext3_extent *);
++extern int ext3_ext_walk_space(struct ext3_extents_tree *, unsigned long, unsigned long, ext_prepare_callback);
++extern int ext3_ext_remove_space(struct ext3_extents_tree *, unsigned long, unsigned long);
++extern struct ext3_ext_path * ext3_ext_find_extent(struct ext3_extents_tree *, int, struct ext3_ext_path *);
++extern int ext3_ext_calc_blockmap_metadata(struct inode *, int);
++
++static inline void
++ext3_ext_invalidate_cache(struct ext3_extents_tree *tree)
++{
++ if (tree->cex)
++ tree->cex->ec_type = EXT3_EXT_CACHE_NO;
++}
++
++
++#endif /* _LINUX_EXT3_EXTENTS */
+Index: linux-2.6.12-rc6/include/linux/ext3_fs_i.h
+===================================================================
+--- linux-2.6.12-rc6.orig/include/linux/ext3_fs_i.h 2005-06-06 17:22:29.000000000 +0200
++++ linux-2.6.12-rc6/include/linux/ext3_fs_i.h 2005-06-14 16:31:25.941073443 +0200
+@@ -133,6 +133,8 @@
+ */
+ struct semaphore truncate_sem;
+ struct inode vfs_inode;
++
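++ /* cached extent: laid out as struct ext3_ext_cache (four __u32s) */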
++ __u32 i_cached_extent[4];
+ };
+
+ #endif /* _LINUX_EXT3_FS_I */
--- /dev/null
+Signed-off-by: Johann Lombardi <johann.lombardi@bull.net>
+
+--- linux-2.6.12.orig/fs/ext3/super.c 2005-06-17 21:48:29.000000000 +0200
++++ linux-2.6.12/fs/ext3/super.c 2005-11-07 13:37:30.000000000 +0100
+@@ -39,7 +39,8 @@
+ #include "xattr.h"
+ #include "acl.h"
+
+-static int ext3_load_journal(struct super_block *, struct ext3_super_block *);
++static int ext3_load_journal(struct super_block *, struct ext3_super_block *,
++ unsigned long journal_devnum);
+ static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
+ int);
+ static void ext3_commit_super (struct super_block * sb,
+@@ -586,7 +587,7 @@ enum {
+ Opt_nouid32, Opt_check, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov,
+ Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
+ Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh,
+- Opt_commit, Opt_journal_update, Opt_journal_inum,
++ Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
+ Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+ Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
+ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0,
+@@ -624,6 +625,7 @@ static match_table_t tokens = {
+ {Opt_commit, "commit=%u"},
+ {Opt_journal_update, "journal=update"},
+ {Opt_journal_inum, "journal=%u"},
++ {Opt_journal_dev, "journal_dev=%u"},
+ {Opt_abort, "abort"},
+ {Opt_data_journal, "data=journal"},
+ {Opt_data_ordered, "data=ordered"},
+@@ -663,8 +665,9 @@ static unsigned long get_sb_block(void *
+ return sb_block;
+ }
+
+-static int parse_options (char * options, struct super_block *sb,
+- unsigned long * inum, unsigned long *n_blocks_count, int is_remount)
++static int parse_options (char *options, struct super_block *sb,
++ unsigned long *inum, unsigned long *journal_devnum,
++ unsigned long *n_blocks_count, int is_remount)
+ {
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
+ char * p;
+@@ -805,6 +808,16 @@ static int parse_options (char * options
+ return 0;
+ *inum = option;
+ break;
++ case Opt_journal_dev:
++ if (is_remount) {
++ printk(KERN_ERR "EXT3-fs: cannot specify "
++ "journal on remount\n");
++ return 0;
++ }
++ if (match_int(&args[0], &option))
++ return 0;
++ *journal_devnum = option;
++ break;
+ case Opt_noload:
+ set_opt (sbi->s_mount_opt, NOLOAD);
+ break;
+@@ -1250,6 +1263,7 @@ static int ext3_fill_super (struct super
+ unsigned long logic_sb_block;
+ unsigned long offset = 0;
+ unsigned long journal_inum = 0;
++ unsigned long journal_devnum = 0;
+ unsigned long def_mount_opts;
+ struct inode *root;
+ int blocksize;
+@@ -1330,7 +1344,8 @@ static int ext3_fill_super (struct super
+
+ set_opt(sbi->s_mount_opt, RESERVATION);
+
+- if (!parse_options ((char *) data, sb, &journal_inum, NULL, 0))
++ if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
++ NULL, 0))
+ goto failed_mount;
+
+ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+@@ -1541,7 +1556,7 @@ static int ext3_fill_super (struct super
+ */
+ if (!test_opt(sb, NOLOAD) &&
+ EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
+- if (ext3_load_journal(sb, es))
++ if (ext3_load_journal(sb, es, journal_devnum))
+ goto failed_mount2;
+ } else if (journal_inum) {
+ if (ext3_create_journal(sb, es, journal_inum))
+@@ -1821,15 +1836,24 @@ out_bdev:
+ return NULL;
+ }
+
+-static int ext3_load_journal(struct super_block * sb,
+- struct ext3_super_block * es)
++static int ext3_load_journal(struct super_block *sb,
++ struct ext3_super_block *es,
++ unsigned long journal_devnum)
+ {
+ journal_t *journal;
+ int journal_inum = le32_to_cpu(es->s_journal_inum);
+- dev_t journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
++ dev_t journal_dev;
+ int err = 0;
+ int really_read_only;
+
++ if (journal_devnum &&
++ journal_devnum != le32_to_cpu(es->s_journal_dev)) {
++ printk(KERN_INFO "EXT3-fs: external journal device major/minor "
++ "numbers have changed\n");
++ journal_dev = new_decode_dev(journal_devnum);
++ } else
++ journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
++
+ really_read_only = bdev_read_only(sb->s_bdev);
+
+ /*
+@@ -1888,6 +1912,16 @@ static int ext3_load_journal(struct supe
+
+ EXT3_SB(sb)->s_journal = journal;
+ ext3_clear_journal_err(sb, es);
++
++ if (journal_devnum &&
++ journal_devnum != le32_to_cpu(es->s_journal_dev)) {
++ es->s_journal_dev = cpu_to_le32(journal_devnum);
++ sb->s_dirt = 1;
++
++ /* Make sure the new journal device number is flushed to disk. */
++ ext3_commit_super(sb, es, 1);
++ }
++
+ return 0;
+ }
+
+@@ -2093,13 +2127,13 @@ static int ext3_remount (struct super_bl
+ {
+ struct ext3_super_block * es;
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
+- unsigned long tmp;
++ unsigned long tmp1, tmp2;
+ unsigned long n_blocks_count = 0;
+
+ /*
+ * Allow the "check" option to be passed as a remount option.
+ */
+- if (!parse_options(data, sb, &tmp, &n_blocks_count, 1))
++ if (!parse_options(data, sb, &tmp1, &tmp2, &n_blocks_count, 1))
+ return -EINVAL;
+
+ if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
-Index: linux-2.6.5-7.201/include/linux/ext3_fs_sb.h
-===================================================================
---- linux-2.6.5-7.201.orig/include/linux/ext3_fs_sb.h 2005-10-14 08:59:35.000000000 +0400
-+++ linux-2.6.5-7.201/include/linux/ext3_fs_sb.h 2005-10-14 08:59:39.000000000 +0400
-@@ -23,10 +23,30 @@
- #define EXT_INCLUDE
- #include <linux/blockgroup_lock.h>
- #include <linux/percpu_counter.h>
-+#include <linux/list.h>
- #endif
- #endif
- #include <linux/rbtree.h>
-
-+#define EXT3_BB_MAX_BLOCKS 30
-+struct ext3_free_metadata {
-+ unsigned short group;
-+ unsigned short num;
-+ unsigned short blocks[EXT3_BB_MAX_BLOCKS];
-+ struct list_head list;
-+};
-+
-+struct ext3_buddy_group_blocks {
-+ __u32 bb_bitmap;
-+ __u32 bb_buddy;
-+ spinlock_t bb_lock;
-+ unsigned long bb_tid;
-+ struct ext3_free_metadata *bb_md_cur;
-+ unsigned short bb_first_free;
-+ unsigned short bb_free;
-+ unsigned bb_counters[];
-+};
-+
- /*
- * third extended-fs super-block data in memory
- */
-@@ -78,6 +98,27 @@
- struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
- wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */
- #endif
-+
-+ /* for buddy allocator */
-+ struct ext3_buddy_group_blocks **s_buddy_blocks;
-+ struct inode *s_buddy;
-+ long s_blocks_reserved;
-+ spinlock_t s_reserve_lock;
-+ struct list_head s_active_transaction;
-+ struct list_head s_closed_transaction;
-+ struct list_head s_committed_transaction;
-+ spinlock_t s_md_lock;
-+ tid_t s_last_transaction;
-+ int s_mb_factor;
-+
-+ /* stats for buddy allocator */
-+ spinlock_t s_bal_lock;
-+ unsigned long s_bal_reqs; /* number of reqs with len > 1 */
-+ unsigned long s_bal_success; /* we found long enough chunks */
-+ unsigned long s_bal_allocated; /* in blocks */
-+ unsigned long s_bal_ex_scanned; /* total extents scanned */
-+ unsigned long s_bal_goals; /* goal hits */
-+ unsigned long s_bal_breaks; /* too long searches */
- };
-
- #endif /* _LINUX_EXT3_FS_SB */
Index: linux-2.6.5-7.201/include/linux/ext3_fs.h
===================================================================
---- linux-2.6.5-7.201.orig/include/linux/ext3_fs.h 2005-10-14 08:59:38.000000000 +0400
-+++ linux-2.6.5-7.201/include/linux/ext3_fs.h 2005-10-14 09:02:36.000000000 +0400
-@@ -57,6 +57,14 @@
+--- linux-2.6.5-7.201.orig/include/linux/ext3_fs.h 2005-12-17 02:53:30.000000000 +0300
++++ linux-2.6.5-7.201/include/linux/ext3_fs.h 2005-12-17 03:13:38.000000000 +0300
+@@ -57,6 +57,14 @@ struct statfs;
#define ext3_debug(f, a...) do {} while (0)
#endif
/*
* Special inodes numbers
*/
-@@ -339,6 +347,7 @@
+@@ -339,6 +347,7 @@ struct ext3_inode {
#define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */
#define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */
#define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */
-+#define EXT3_MOUNT_MBALLOC 0x400000/* Buddy allocation support */
++#define EXT3_MOUNT_MBALLOC 0x800000/* Buddy allocation support */
/* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
#ifndef clear_opt
-@@ -700,7 +709,7 @@
+@@ -700,7 +709,9 @@ extern int ext3_bg_has_super(struct supe
extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *);
extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long,
- unsigned long);
+ unsigned long, int);
++extern void ext3_free_blocks_old (handle_t *, struct inode *, unsigned long,
++ unsigned long);
extern unsigned long ext3_count_free_blocks (struct super_block *);
extern void ext3_check_blocks_bitmap (struct super_block *);
extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
-@@ -822,6 +831,44 @@
+@@ -822,6 +833,17 @@ extern void ext3_extents_initialize_bloc
extern int ext3_ext_ioctl(struct inode *inode, struct file *filp,
unsigned int cmd, unsigned long arg);
+/* mballoc.c */
-+extern long ext3_mb_aggressive;
+extern long ext3_mb_stats;
+extern long ext3_mb_max_to_scan;
+extern int ext3_mb_init(struct super_block *, int);
+extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *);
+extern int ext3_mb_reserve_blocks(struct super_block *, int);
+extern void ext3_mb_release_blocks(struct super_block *, int);
-+
-+/* writeback.c */
-+extern int ext3_wb_writepages(struct address_space *, struct writeback_control *);
-+extern int ext3_wb_prepare_write(struct file *file, struct page *page,
-+ unsigned from, unsigned to);
-+extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned);
-+extern int ext3_wb_writepage(struct page *, struct writeback_control *);
-+extern int ext3_wb_invalidatepage(struct page *, unsigned long);
-+extern int ext3_wb_releasepage(struct page *, int);
-+extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t);
-+extern void ext3_wb_init(struct super_block *);
-+extern void ext3_wb_release(struct super_block *);
-+
-+/* writeback.c */
-+extern int ext3_wb_writepages(struct address_space *, struct writeback_control *);
-+extern int ext3_wb_prepare_write(struct file *file, struct page *page,
-+ unsigned from, unsigned to);
-+extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned);
-+extern int ext3_wb_writepage(struct page *, struct writeback_control *);
-+extern int ext3_wb_invalidatepage(struct page *, unsigned long);
-+extern int ext3_wb_releasepage(struct page *, int);
-+extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t);
-+extern void ext3_wb_init(struct super_block *);
-+extern void ext3_wb_release(struct super_block *);
-+
-+/* proc.c */
-+extern int init_ext3_proc(void);
-+extern void exit_ext3_proc(void);
++int __init init_ext3_proc(void);
++void exit_ext3_proc(void);
+
#endif /* __KERNEL__ */
#define EXT3_IOC_CREATE_INUM _IOW('f', 5, long)
-Index: linux-2.6.5-7.201/fs/ext3/balloc.c
+Index: linux-2.6.5-7.201/include/linux/ext3_fs_sb.h
===================================================================
---- linux-2.6.5-7.201.orig/fs/ext3/balloc.c 2005-10-11 00:12:45.000000000 +0400
-+++ linux-2.6.5-7.201/fs/ext3/balloc.c 2005-10-14 08:59:39.000000000 +0400
-@@ -78,7 +78,7 @@
- *
- * Return buffer_head on success or NULL in case of failure.
- */
--static struct buffer_head *
-+struct buffer_head *
- read_block_bitmap(struct super_block *sb, unsigned int block_group)
+--- linux-2.6.5-7.201.orig/include/linux/ext3_fs_sb.h 2005-12-17 02:53:25.000000000 +0300
++++ linux-2.6.5-7.201/include/linux/ext3_fs_sb.h 2005-12-17 03:10:23.000000000 +0300
+@@ -23,9 +23,15 @@
+ #define EXT_INCLUDE
+ #include <linux/blockgroup_lock.h>
+ #include <linux/percpu_counter.h>
++#include <linux/list.h>
+ #endif
+ #endif
+ #include <linux/rbtree.h>
++#include <linux/proc_fs.h>
++
++struct ext3_buddy_group_blocks;
++struct ext3_mb_history;
++#define EXT3_BB_MAX_BLOCKS
+
+ /*
+ * third extended-fs super-block data in memory
+@@ -78,6 +84,38 @@ struct ext3_sb_info {
+ struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
+ wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */
+ #endif
++
++ /* for buddy allocator */
++ struct ext3_group_info **s_group_info;
++ struct inode *s_buddy_cache;
++ long s_blocks_reserved;
++ spinlock_t s_reserve_lock;
++ struct list_head s_active_transaction;
++ struct list_head s_closed_transaction;
++ struct list_head s_committed_transaction;
++ spinlock_t s_md_lock;
++ tid_t s_last_transaction;
++ int s_mb_factor;
++ unsigned short *s_mb_offsets, *s_mb_maxs;
++
++ /* history to debug policy */
++ struct ext3_mb_history *s_mb_history;
++ int s_mb_history_cur;
++ int s_mb_history_max;
++ struct proc_dir_entry *s_mb_proc;
++ spinlock_t s_mb_history_lock;
++
++ /* stats for buddy allocator */
++ atomic_t s_bal_reqs; /* number of reqs with len > 1 */
++ atomic_t s_bal_success; /* we found long enough chunks */
++ atomic_t s_bal_allocated; /* in blocks */
++ atomic_t s_bal_ex_scanned; /* total extents scanned */
++ atomic_t s_bal_goals; /* goal hits */
++ atomic_t s_bal_breaks; /* too long searches */
++ atomic_t s_bal_2orders; /* 2^order hits */
++ spinlock_t s_bal_lock;
++ unsigned long s_mb_buddies_generated;
++ unsigned long long s_mb_generation_time;
+ };
+
+ #endif /* _LINUX_EXT3_FS_SB */
+Index: linux-2.6.5-7.201/fs/ext3/super.c
+===================================================================
+--- linux-2.6.5-7.201.orig/fs/ext3/super.c 2005-12-17 02:53:30.000000000 +0300
++++ linux-2.6.5-7.201/fs/ext3/super.c 2005-12-17 03:10:23.000000000 +0300
+@@ -389,6 +389,7 @@ void ext3_put_super (struct super_block
+ struct ext3_super_block *es = sbi->s_es;
+ int i;
+
++ ext3_mb_release(sb);
+ ext3_ext_release(sb);
+ ext3_xattr_put_super(sb);
+ journal_destroy(sbi->s_journal);
+@@ -543,7 +544,7 @@ enum {
+ Opt_commit, Opt_journal_update, Opt_journal_inum,
+ Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+ Opt_ignore, Opt_barrier, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
+- Opt_err, Opt_extents, Opt_extdebug
++ Opt_err, Opt_extents, Opt_extdebug, Opt_mballoc
+ };
+
+ static match_table_t tokens = {
+@@ -590,6 +591,7 @@ static match_table_t tokens = {
+ {Opt_iopen_nopriv, "iopen_nopriv"},
+ {Opt_extents, "extents"},
+ {Opt_extdebug, "extdebug"},
++ {Opt_mballoc, "mballoc"},
+ {Opt_err, NULL}
+ };
+
+@@ -811,6 +813,9 @@ static int parse_options (char * options
+ case Opt_extdebug:
+ set_opt (sbi->s_mount_opt, EXTDEBUG);
+ break;
++ case Opt_mballoc:
++ set_opt (sbi->s_mount_opt, MBALLOC);
++ break;
+ default:
+ printk (KERN_ERR
+ "EXT3-fs: Unrecognized mount option \"%s\" "
+@@ -1464,6 +1469,7 @@ static int ext3_fill_super (struct super
+ ext3_count_dirs(sb));
+
+ ext3_ext_init(sb);
++ ext3_mb_init(sb, needs_recovery);
+
+ return 0;
+
+@@ -2112,7 +2118,13 @@ static struct file_system_type ext3_fs_t
+
+ static int __init init_ext3_fs(void)
{
- struct ext3_group_desc * desc;
-@@ -274,7 +274,7 @@
+- int err = init_ext3_xattr();
++ int err;
++
++ err = init_ext3_proc();
++ if (err)
++ return err;
++
++ err = init_ext3_xattr();
+ if (err)
+ return err;
+ err = init_inodecache();
+@@ -2141,6 +2153,7 @@ static void __exit exit_ext3_fs(void)
+ unregister_filesystem(&ext3_fs_type);
+ destroy_inodecache();
+ exit_ext3_xattr();
++ exit_ext3_proc();
}
- /* Free given blocks, update quota and i_blocks field */
--void ext3_free_blocks(handle_t *handle, struct inode *inode,
-+void ext3_free_blocks_old(handle_t *handle, struct inode *inode,
- unsigned long block, unsigned long count)
- {
- struct buffer_head *bitmap_bh = NULL;
-@@ -1142,7 +1142,7 @@
- * bitmap, and then for any free bit if that fails.
- * This function also updates quota and i_blocks field.
- */
--int ext3_new_block(handle_t *handle, struct inode *inode,
-+int ext3_new_block_old(handle_t *handle, struct inode *inode,
- unsigned long goal, int *errp)
- {
- struct buffer_head *bitmap_bh = NULL;
+ int ext3_prep_san_write(struct inode *inode, long *blocks,
Index: linux-2.6.5-7.201/fs/ext3/extents.c
===================================================================
---- linux-2.6.5-7.201.orig/fs/ext3/extents.c 2005-10-14 08:59:38.000000000 +0400
-+++ linux-2.6.5-7.201/fs/ext3/extents.c 2005-10-14 08:59:39.000000000 +0400
-@@ -771,7 +771,7 @@
+--- linux-2.6.5-7.201.orig/fs/ext3/extents.c 2005-12-17 02:53:29.000000000 +0300
++++ linux-2.6.5-7.201/fs/ext3/extents.c 2005-12-17 03:10:23.000000000 +0300
+@@ -771,7 +771,7 @@ cleanup:
for (i = 0; i < depth; i++) {
if (!ablocks[i])
continue;
}
}
kfree(ablocks);
-@@ -1428,7 +1428,7 @@
+@@ -1428,7 +1428,7 @@ int ext3_ext_rm_idx(handle_t *handle, st
path->p_idx->ei_leaf);
bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf);
ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf);
return err;
}
-@@ -1913,10 +1913,12 @@
+@@ -1913,10 +1913,12 @@ ext3_remove_blocks(struct ext3_extents_t
int needed = ext3_remove_blocks_credits(tree, ex, from, to);
handle_t *handle = ext3_journal_start(tree->inode, needed);
struct buffer_head *bh;
if (IS_ERR(handle))
return PTR_ERR(handle);
-+ if (S_ISDIR(tree->inode->i_mode))
++ if (S_ISDIR(tree->inode->i_mode) || S_ISLNK(tree->inode->i_mode))
+ metadata = 1;
if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) {
/* tail removal */
unsigned long num, start;
-@@ -1928,7 +1930,7 @@
+@@ -1928,7 +1930,7 @@ ext3_remove_blocks(struct ext3_extents_t
bh = sb_find_get_block(tree->inode->i_sb, start + i);
ext3_forget(handle, 0, tree->inode, bh, start + i);
}
} else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) {
printk("strange request: removal %lu-%lu from %u:%u\n",
from, to, ex->ee_block, ex->ee_len);
-Index: linux-2.6.5-7.201/fs/ext3/namei.c
+Index: linux-2.6.5-7.201/fs/ext3/inode.c
+===================================================================
+--- linux-2.6.5-7.201.orig/fs/ext3/inode.c 2005-12-17 02:53:30.000000000 +0300
++++ linux-2.6.5-7.201/fs/ext3/inode.c 2005-12-17 03:10:23.000000000 +0300
+@@ -572,7 +572,7 @@ static int ext3_alloc_branch(handle_t *h
+ ext3_journal_forget(handle, branch[i].bh);
+ }
+ for (i = 0; i < keys; i++)
+- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
++ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1);
+ return err;
+ }
+
+@@ -673,7 +673,7 @@ err_out:
+ if (err == -EAGAIN)
+ for (i = 0; i < num; i++)
+ ext3_free_blocks(handle, inode,
+- le32_to_cpu(where[i].key), 1);
++ le32_to_cpu(where[i].key), 1, 1);
+ return err;
+ }
+
+@@ -1835,7 +1835,7 @@ ext3_clear_blocks(handle_t *handle, stru
+ }
+ }
+
+- ext3_free_blocks(handle, inode, block_to_free, count);
++ ext3_free_blocks(handle, inode, block_to_free, count, 1);
+ }
+
+ /**
+@@ -2006,7 +2006,7 @@ static void ext3_free_branches(handle_t
+ ext3_journal_test_restart(handle, inode);
+ }
+
+- ext3_free_blocks(handle, inode, nr, 1);
++ ext3_free_blocks(handle, inode, nr, 1, 1);
+
+ if (parent_bh) {
+ /*
+Index: linux-2.6.5-7.201/fs/ext3/balloc.c
===================================================================
---- linux-2.6.5-7.201.orig/fs/ext3/namei.c 2005-10-14 08:59:35.000000000 +0400
-+++ linux-2.6.5-7.201/fs/ext3/namei.c 2005-10-14 08:59:39.000000000 +0400
-@@ -1640,7 +1640,7 @@
- * If the create succeeds, we fill in the inode information
- * with d_instantiate().
+--- linux-2.6.5-7.201.orig/fs/ext3/balloc.c 2005-10-11 00:12:45.000000000 +0400
++++ linux-2.6.5-7.201/fs/ext3/balloc.c 2005-12-17 03:10:23.000000000 +0300
+@@ -78,7 +78,7 @@ struct ext3_group_desc * ext3_get_group_
+ *
+ * Return buffer_head on success or NULL in case of failure.
+ */
+-static struct buffer_head *
++struct buffer_head *
+ read_block_bitmap(struct super_block *sb, unsigned int block_group)
+ {
+ struct ext3_group_desc * desc;
+@@ -274,7 +274,7 @@ void ext3_discard_reservation(struct ino
+ }
+
+ /* Free given blocks, update quota and i_blocks field */
+-void ext3_free_blocks(handle_t *handle, struct inode *inode,
++void ext3_free_blocks_old(handle_t *handle, struct inode *inode,
+ unsigned long block, unsigned long count)
+ {
+ struct buffer_head *bitmap_bh = NULL;
+@@ -1142,7 +1142,7 @@ int ext3_should_retry_alloc(struct super
+ * bitmap, and then for any free bit if that fails.
+ * This function also updates quota and i_blocks field.
*/
--static int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
-+int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
- struct nameidata *nd)
+-int ext3_new_block(handle_t *handle, struct inode *inode,
++int ext3_new_block_old(handle_t *handle, struct inode *inode,
+ unsigned long goal, int *errp)
{
- handle_t *handle;
+ struct buffer_head *bitmap_bh = NULL;
Index: linux-2.6.5-7.201/fs/ext3/xattr.c
===================================================================
---- linux-2.6.5-7.201.orig/fs/ext3/xattr.c 2005-10-14 08:59:36.000000000 +0400
-+++ linux-2.6.5-7.201/fs/ext3/xattr.c 2005-10-14 08:59:39.000000000 +0400
-@@ -1371,7 +1371,7 @@
+--- linux-2.6.5-7.201.orig/fs/ext3/xattr.c 2005-12-17 02:53:26.000000000 +0300
++++ linux-2.6.5-7.201/fs/ext3/xattr.c 2005-12-17 03:10:41.000000000 +0300
+@@ -1371,7 +1371,7 @@ ext3_xattr_set_handle2(handle_t *handle,
new_bh = sb_getblk(sb, block);
if (!new_bh) {
getblk_failed:
error = -EIO;
goto cleanup;
}
-@@ -1411,7 +1411,7 @@
+@@ -1411,7 +1411,7 @@ getblk_failed:
if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) {
/* Free the old block. */
ea_bdebug(old_bh, "freeing");
/* ext3_forget() calls bforget() for us, but we
let our caller release old_bh, so we need to
-@@ -1519,7 +1519,7 @@
+@@ -1519,7 +1519,7 @@ ext3_xattr_delete_inode(handle_t *handle
mb_cache_entry_free(ce);
ce = NULL;
}
get_bh(bh);
ext3_forget(handle, 1, inode, bh, EXT3_I(inode)->i_file_acl);
} else {
-Index: linux-2.6.5-7.201/fs/ext3/Makefile
-===================================================================
---- linux-2.6.5-7.201.orig/fs/ext3/Makefile 2005-10-14 08:59:38.000000000 +0400
-+++ linux-2.6.5-7.201/fs/ext3/Makefile 2005-10-14 08:59:39.000000000 +0400
-@@ -5,7 +5,7 @@
- obj-$(CONFIG_EXT3_FS) += ext3.o
-
- ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
-- ioctl.o namei.o super.o symlink.o hash.o extents.o
-+ ioctl.o namei.o super.o symlink.o hash.o extents.o mballoc.o
-
- ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
- ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
Index: linux-2.6.5-7.201/fs/ext3/mballoc.c
===================================================================
---- linux-2.6.5-7.201.orig/fs/ext3/mballoc.c 2005-10-13 19:40:57.851699336 +0400
-+++ linux-2.6.5-7.201/fs/ext3/mballoc.c 2005-10-14 09:02:36.000000000 +0400
-@@ -0,0 +1,1868 @@
+--- linux-2.6.5-7.201.orig/fs/ext3/mballoc.c 2005-12-09 13:08:53.191437750 +0300
++++ linux-2.6.5-7.201/fs/ext3/mballoc.c 2005-12-17 03:15:04.000000000 +0300
+@@ -0,0 +1,2435 @@
+/*
-+ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com
++ * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+#include <linux/module.h>
++#include <linux/swap.h>
++#include <linux/proc_fs.h>
++#include <linux/pagemap.h>
++#include <linux/seq_file.h>
+
+/*
+ * TODO:
-+ * - bitmap/buddy read-ahead (proposed by Oleg Drokin aka green)
++ * - bitmap read-ahead (proposed by Oleg Drokin aka green)
+ * - track min/max extents in each group for better group selection
-+ * - is it worthwhile to use buddies directly if req is 2^N blocks?
+ * - mb_mark_used() may allocate chunk right after splitting buddy
+ * - special flag to advice allocator to look for requested + N blocks
+ * this may improve interaction between extents and mballoc
+ */
+
+/*
-+ * with 'ext3_mb_aggressive' set the allocator runs consistency checks over
++ * with AGGRESSIVE_CHECK defined, the allocator runs consistency checks over
+ * structures. these checks slow things down a lot
+ */
-+long ext3_mb_aggressive = 0;
-+
-+
-+/*
-+ * with 'ext3_mb_stats' allocator will collect stats that will be
-+ * shown at umount. The collecting costs though!
-+ */
-+long ext3_mb_stats = 1;
++#define AGGRESSIVE_CHECK__
+
+/*
+ */
+#endif
+
+/*
-+ * where to save buddies structures beetween umount/mount (clean case only)
++ * with EXT3_MB_HISTORY defined, mballoc stores the last N allocations in memory
++ * and you can monitor it in /proc/fs/ext3/<dev>/mb_history
+ */
-+#define EXT3_BUDDY_FILE ".buddy"
++#define EXT3_MB_HISTORY
+
+/*
+ * How long mballoc can look for a best extent (in found extents)
+ */
-+long ext3_mb_max_to_scan = 100;
++long ext3_mb_max_to_scan = 500;
+
+/*
-+ * This structure is on-disk description of a group for mballoc
++ * How long mballoc must look for a best extent
+ */
-+struct ext3_mb_group_descr {
-+ __u16 mgd_first_free; /* first free block in the group */
-+ __u16 mgd_free; /* number of free blocks in the group */
-+ __u16 mgd_counters[16]; /* number of free blocks by order */
-+};
++long ext3_mb_min_to_scan = 30;
+
+/*
-+ * This structure is header of mballoc's file
++ * with 'ext3_mb_stats' set, the allocator will collect stats that will be
++ * shown at umount. The collecting costs though!
+ */
-+struct ext3_mb_grp_header {
-+ __u32 mh_magic;
++
++long ext3_mb_stats = 1;
++
++#ifdef EXT3_BB_MAX_BLOCKS
++#undef EXT3_BB_MAX_BLOCKS
++#endif
++#define EXT3_BB_MAX_BLOCKS 30
++
++struct ext3_free_metadata {
++ unsigned short group;
++ unsigned short num;
++ unsigned short blocks[EXT3_BB_MAX_BLOCKS];
++ struct list_head list;
+};
+
-+#define EXT3_MB_MAGIC_V1 0xbabd16fd
++struct ext3_group_info {
++ unsigned long bb_state;
++ unsigned long bb_tid;
++ struct ext3_free_metadata *bb_md_cur;
++ unsigned short bb_first_free;
++ unsigned short bb_free;
++ unsigned short bb_fragments;
++ unsigned short bb_counters[];
++};
++
++
++#define EXT3_GROUP_INFO_NEED_INIT_BIT 0
++#define EXT3_GROUP_INFO_LOCKED_BIT 1
+
++#define EXT3_MB_GRP_NEED_INIT(grp) \
++ (test_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &(grp)->bb_state))
+
+struct ext3_free_extent {
+ __u16 fe_start;
+ unsigned long ac_ex_scanned;
+ __u16 ac_groups_scanned;
+ __u16 ac_found;
++ __u16 ac_tail;
++ __u16 ac_buddy;
+ __u8 ac_status;
+ __u8 ac_flags; /* allocation hints */
++ __u8 ac_criteria;
+ __u8 ac_repeats;
++ __u8 ac_2order; /* if request is to allocate 2^N blocks and
++ * N > 0, the field stores N, otherwise 0 */
+};
+
+#define AC_STATUS_CONTINUE 1
+#define AC_STATUS_FOUND 2
+#define AC_STATUS_BREAK 3
+
++struct ext3_mb_history {
++ struct ext3_free_extent goal; /* goal allocation */
++ struct ext3_free_extent result; /* result allocation */
++ __u16 found; /* how many extents have been found */
++ __u16 groups; /* how many groups have been scanned */
++ __u16 tail; /* what tail broke some buddy */
++ __u16 buddy; /* buddy the tail ^^^ broke */
++ __u8 cr; /* which phase the result extent was found at */
++ __u8 merged;
++};
++
+struct ext3_buddy {
-+ struct buffer_head *bd_bh;
-+ struct buffer_head *bd_bh2;
-+ struct ext3_buddy_group_blocks *bd_bd;
++ struct page *bd_buddy_page;
++ void *bd_buddy;
++ struct page *bd_bitmap_page;
++ void *bd_bitmap;
++ struct ext3_group_info *bd_info;
+ struct super_block *bd_sb;
+ __u16 bd_blkbits;
+ __u16 bd_group;
+};
-+#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bh->b_data)
-+#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_bh2->b_data)
++#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bitmap)
++#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy)
++
++#ifndef EXT3_MB_HISTORY
++#define ext3_mb_store_history(sb,ac)
++#else
++static void ext3_mb_store_history(struct super_block *,
++ struct ext3_allocation_context *ac);
++#endif
+
+#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
+
++static struct proc_dir_entry *proc_root_ext3;
++
+int ext3_create (struct inode *, struct dentry *, int, struct nameidata *);
+struct buffer_head * read_block_bitmap(struct super_block *, unsigned int);
+int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *);
+static inline int mb_test_bit(int bit, void *addr)
+{
+ mb_correct_addr_and_bit(bit,addr);
-+ return ext3_test_bit(bit, addr);
++ return ext2_test_bit(bit, addr);
+}
+
+static inline void mb_set_bit(int bit, void *addr)
+{
+ mb_correct_addr_and_bit(bit,addr);
-+ ext3_set_bit(bit, addr);
++ ext2_set_bit(bit, addr);
+}
+
+static inline void mb_set_bit_atomic(int bit, void *addr)
+{
+ mb_correct_addr_and_bit(bit,addr);
-+ ext3_set_bit_atomic(NULL, bit, addr);
++ ext2_set_bit_atomic(NULL, bit, addr);
+}
+
+static inline void mb_clear_bit(int bit, void *addr)
+{
+ mb_correct_addr_and_bit(bit,addr);
-+ ext3_clear_bit(bit, addr);
++ ext2_clear_bit(bit, addr);
+}
+
+static inline void mb_clear_bit_atomic(int bit, void *addr)
+{
+ mb_correct_addr_and_bit(bit,addr);
-+ ext3_clear_bit_atomic(NULL, bit, addr);
++ ext2_clear_bit_atomic(NULL, bit, addr);
+}
+
-+static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max)
++static inline int mb_find_next_zero_bit(void *addr, int max, int start)
+{
-+ int i = 1;
-+ char *bb;
-+
-+ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b));
-+ J_ASSERT(max != NULL);
-+
-+ if (order > e3b->bd_blkbits + 1) {
-+ *max = 0;
-+ return NULL;
-+ }
-+
-+ /* at order 0 we see each particular block */
-+ *max = 1 << (e3b->bd_blkbits + 3);
-+ if (order == 0)
-+ return EXT3_MB_BITMAP(e3b);
-+
-+ bb = EXT3_MB_BUDDY(e3b);
-+ *max = *max >> 1;
-+ while (i < order) {
-+ bb += 1 << (e3b->bd_blkbits - i);
-+ i++;
-+ *max = *max >> 1;
-+ }
-+ J_ASSERT((unsigned) (bb - (char *) EXT3_MB_BUDDY(e3b)) <
-+ e3b->bd_sb->s_blocksize);
-+ return bb;
++ int fix;
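++
++ /* ext2_find_next_zero_bit() expects a long-aligned address, so
++ * align addr down and shift both indices by 'fix' bits to compensate */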
++#if BITS_PER_LONG == 64
++ fix = ((unsigned long) addr & 7UL) << 3;
++ addr = (void *) ((unsigned long) addr & ~7UL);
++#elif BITS_PER_LONG == 32
++ fix = ((unsigned long) addr & 3UL) << 3;
++ addr = (void *) ((unsigned long) addr & ~3UL);
++#else
++#error "how many bits you are?!"
++#endif
++ max += fix;
++ start += fix;
++ return ext2_find_next_zero_bit(addr, max, start) - fix;
+}
-+
-+static int ext3_mb_load_buddy(struct super_block *sb, int group,
-+ struct ext3_buddy *e3b)
++
++static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max)
+{
-+ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ char *bb;
+
-+ J_ASSERT(sbi->s_buddy_blocks[group]->bb_bitmap);
-+ J_ASSERT(sbi->s_buddy_blocks[group]->bb_buddy);
++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b));
++ J_ASSERT(max != NULL);
+
-+ /* load bitmap */
-+ e3b->bd_bh = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_bitmap);
-+ if (e3b->bd_bh == NULL) {
-+ ext3_error(sb, "ext3_mb_load_buddy",
-+ "can't get block for buddy bitmap\n");
-+ goto out;
-+ }
-+ /* load buddy */
-+ e3b->bd_bh2 = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_buddy);
-+ if (e3b->bd_bh2 == NULL) {
-+ ext3_error(sb, "ext3_mb_load_buddy",
-+ "can't get block for buddy bitmap\n");
-+ goto out;
++ if (order > e3b->bd_blkbits + 1) {
++ *max = 0;
++ return NULL;
+ }
+
-+ if (!buffer_uptodate(e3b->bd_bh))
-+ ll_rw_block(READ, 1, &e3b->bd_bh);
-+ if (!buffer_uptodate(e3b->bd_bh2))
-+ ll_rw_block(READ, 1, &e3b->bd_bh2);
-+
-+ wait_on_buffer(e3b->bd_bh);
-+ J_ASSERT(buffer_uptodate(e3b->bd_bh));
-+ wait_on_buffer(e3b->bd_bh2);
-+ J_ASSERT(buffer_uptodate(e3b->bd_bh2));
-+
-+ e3b->bd_blkbits = sb->s_blocksize_bits;
-+ e3b->bd_bd = sbi->s_buddy_blocks[group];
-+ e3b->bd_sb = sb;
-+ e3b->bd_group = group;
++ /* at order 0 we see each particular block */
++ *max = 1 << (e3b->bd_blkbits + 3);
++ if (order == 0)
++ return EXT3_MB_BITMAP(e3b);
+
-+ return 0;
-+out:
-+ brelse(e3b->bd_bh);
-+ brelse(e3b->bd_bh2);
-+ e3b->bd_bh = NULL;
-+ e3b->bd_bh2 = NULL;
-+ return -EIO;
-+}
++ bb = EXT3_MB_BUDDY(e3b) + EXT3_SB(e3b->bd_sb)->s_mb_offsets[order];
++ *max = EXT3_SB(e3b->bd_sb)->s_mb_maxs[order];
+
-+static void ext3_mb_dirty_buddy(struct ext3_buddy *e3b)
-+{
-+ mark_buffer_dirty(e3b->bd_bh);
-+ mark_buffer_dirty(e3b->bd_bh2);
++ return bb;
+}
+
-+static void ext3_mb_release_desc(struct ext3_buddy *e3b)
-+{
-+ brelse(e3b->bd_bh);
-+ brelse(e3b->bd_bh2);
-+}
++#ifdef AGGRESSIVE_CHECK
+
+static void mb_check_buddy(struct ext3_buddy *e3b)
+{
+ int order = e3b->bd_blkbits + 1;
+ int max, max2, i, j, k, count;
++ int fragments = 0, fstart;
+ void *buddy, *buddy2;
+
-+ if (likely(!ext3_mb_aggressive))
-+ return;
-+
+ if (!test_opt(e3b->bd_sb, MBALLOC))
+ return;
+
++ {
++ static int mb_check_counter = 0;
++ if (mb_check_counter++ % 300 != 0)
++ return;
++ }
++
+ while (order > 1) {
+ buddy = mb_find_buddy(e3b, order, &max);
+ J_ASSERT(buddy);
+ }
+ count++;
+ }
-+ J_ASSERT(e3b->bd_bd->bb_counters[order] == count);
++ J_ASSERT(e3b->bd_info->bb_counters[order] == count);
+ order--;
+ }
+
++ fstart = -1;
+ buddy = mb_find_buddy(e3b, 0, &max);
+ for (i = 0; i < max; i++) {
-+ if (!mb_test_bit(i, buddy))
++ if (!mb_test_bit(i, buddy)) {
++ J_ASSERT(i >= e3b->bd_info->bb_first_free);
++ if (fstart == -1) {
++ fragments++;
++ fstart = i;
++ }
+ continue;
++ }
++ fstart = -1;
+ /* check used bits only */
+ for (j = 0; j < e3b->bd_blkbits + 1; j++) {
+ buddy2 = mb_find_buddy(e3b, j, &max2);
+ J_ASSERT(mb_test_bit(k, buddy2));
+ }
+ }
++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(e3b->bd_info));
++ J_ASSERT(e3b->bd_info->bb_fragments == fragments);
++}
++
++#else
++#define mb_check_buddy(e3b)
++#endif
++
++/* find most significant bit */
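++/* e.g. fmsb(1) == 0, fmsb(255) == 7, fmsb(256) == 8 */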
++static int inline fmsb(unsigned short word)
++{
++ int order;
++
++ if (word > 255) {
++ order = 7;
++ word >>= 8;
++ } else {
++ order = -1;
++ }
++
++ do {
++ order++;
++ word >>= 1;
++ } while (word != 0);
++
++ return order;
++}
++
++static void inline
++ext3_mb_mark_free_simple(struct super_block *sb, void *buddy, unsigned first,
++ int len, struct ext3_group_info *grp)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ unsigned short min, max, chunk, border;
++
++ mb_debug("mark %u/%u free\n", first, len);
++ J_ASSERT(len < EXT3_BLOCKS_PER_GROUP(sb));
++
++ border = 2 << sb->s_blocksize_bits;
++
++ while (len > 0) {
++ /* find how many blocks can be covered since this position */
++ max = ffs(first | border) - 1;
++
++ /* find how many blocks of power 2 we need to mark */
++ min = fmsb(len);
++
++ mb_debug(" %u/%u -> max %u, min %u\n",
++ first & ((2 << sb->s_blocksize_bits) - 1),
++ len, max, min);
++
++ if (max < min)
++ min = max;
++ chunk = 1 << min;
++
++ /* mark multiblock chunks only */
++ grp->bb_counters[min]++;
++ if (min > 0) {
++ mb_debug(" set %u at %u \n", first >> min,
++ sbi->s_mb_offsets[min]);
++ mb_clear_bit(first >> min, buddy + sbi->s_mb_offsets[min]);
++ }
++
++ len -= chunk;
++ first += chunk;
++ }
++}
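++
++/*
++ * Worked example for the loop above: first = 12, len = 7 is marked as
++ * the chunks 12-15 (order 2), 16-17 (order 1) and 18 (order 0); each
++ * step is limited both by the alignment of 'first' (ffs) and by the
++ * remaining length (fmsb).
++ */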
++
++static void
++ext3_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap,
++ struct ext3_group_info *grp)
++{
++ unsigned short max = EXT3_BLOCKS_PER_GROUP(sb);
++ unsigned short i = 0, first, len;
++ unsigned free = 0, fragments = 0;
++ unsigned long long period = get_cycles();
++
++ i = mb_find_next_zero_bit(bitmap, max, 0);
++ grp->bb_first_free = i;
++ while (i < max) {
++ fragments++;
++ first = i;
++ i = find_next_bit(bitmap, max, i);
++ len = i - first;
++ free += len;
++ if (len > 1)
++ ext3_mb_mark_free_simple(sb, buddy, first, len, grp);
++ else
++ grp->bb_counters[0]++;
++ if (i < max)
++ i = mb_find_next_zero_bit(bitmap, max, i);
++ }
++ grp->bb_fragments = fragments;
++
++ /* bb_state shouldn't be modified because everyone else
++ * waits for init completion on the page lock */
++ clear_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &grp->bb_state);
++ if (free != grp->bb_free) {
++ printk("EXT3-fs: %u blocks in bitmap, %u in group descriptor\n",
++ free, grp->bb_free);
++ grp->bb_free = free;
++ }
++
++ period = get_cycles() - period;
++ spin_lock(&EXT3_SB(sb)->s_bal_lock);
++ EXT3_SB(sb)->s_mb_buddies_generated++;
++ EXT3_SB(sb)->s_mb_generation_time += period;
++ spin_unlock(&EXT3_SB(sb)->s_bal_lock);
++}
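++
++/*
++ * The walk above alternates mb_find_next_zero_bit() (start of a free
++ * extent) and find_next_bit() (end of it), so every free extent in the
++ * on-disk bitmap is visited exactly once.
++ */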
++
++static int ext3_mb_init_cache(struct page *page)
++{
++ int blocksize, blocks_per_page, groups_per_page;
++ int err = 0, i, first_group, first_block;
++ struct super_block *sb;
++ struct buffer_head *bhs = NULL;
++ struct buffer_head **bh;
++ struct inode *inode;
++ char *data, *bitmap;
++
++ mb_debug("init page %lu\n", page->index);
++
++ inode = page->mapping->host;
++ sb = inode->i_sb;
++ blocksize = 1 << inode->i_blkbits;
++ blocks_per_page = PAGE_CACHE_SIZE / blocksize;
++
++ groups_per_page = blocks_per_page >> 1;
++ if (groups_per_page == 0)
++ groups_per_page = 1;
++
++ /* allocate buffer_heads to read bitmaps */
++ if (groups_per_page > 1) {
++ err = -ENOMEM;
++ i = sizeof(struct buffer_head *) * groups_per_page;
++ bh = kmalloc(i, GFP_NOFS);
++ if (bh == NULL)
++ goto out;
++ memset(bh, 0, i);
++ } else
++ bh = &bhs;
++
++ first_group = page->index * blocks_per_page / 2;
++
++ /* read all groups the page covers into the cache */
++ for (i = 0; i < groups_per_page; i++) {
++ struct ext3_group_desc * desc;
++
++ if (first_group + i >= EXT3_SB(sb)->s_groups_count)
++ break;
++
++ err = -EIO;
++ desc = ext3_get_group_desc(sb, first_group + i, NULL);
++ if (desc == NULL)
++ goto out;
++
++ err = -ENOMEM;
++ bh[i] = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap));
++ if (bh[i] == NULL)
++ goto out;
++
++ if (buffer_uptodate(bh[i]))
++ continue;
++
++ lock_buffer(bh[i]);
++ if (buffer_uptodate(bh[i])) {
++ unlock_buffer(bh[i]);
++ continue;
++ }
++
++ get_bh(bh[i]);
++ bh[i]->b_end_io = end_buffer_read_sync;
++ submit_bh(READ, bh[i]);
++ mb_debug("read bitmap for group %u\n", first_group + i);
++ }
++
++ /* wait for I/O completion */
++ for (i = 0; i < groups_per_page && bh[i]; i++)
++ wait_on_buffer(bh[i]);
++
++ /* XXX: I/O error handling here */
++
++ first_block = page->index * blocks_per_page;
++ for (i = 0; i < blocks_per_page; i++) {
++ int group;
++
++ group = (first_block + i) >> 1;
++ if (group >= EXT3_SB(sb)->s_groups_count)
++ break;
++
++ data = page_address(page) + (i * blocksize);
++ bitmap = bh[group - first_group]->b_data;
++
++ if ((first_block + i) & 1) {
++ /* this is block of buddy */
++ mb_debug("put buddy for group %u in page %lu/%x\n",
++ group, page->index, i * blocksize);
++ memset(data, 0xff, blocksize);
++ EXT3_SB(sb)->s_group_info[group]->bb_fragments = 0;
++ memset(EXT3_SB(sb)->s_group_info[group]->bb_counters, 0,
++ sizeof(unsigned short)*(sb->s_blocksize_bits+2));
++ ext3_mb_generate_buddy(sb, data, bitmap,
++ EXT3_SB(sb)->s_group_info[group]);
++ } else {
++ /* this is block of bitmap */
++ mb_debug("put bitmap for group %u in page %lu/%x\n",
++ group, page->index, i * blocksize);
++ memcpy(data, bitmap, blocksize);
++ }
++ }
++ err = 0;
++ SetPageUptodate(page);
++
++out:
++ for (i = 0; bh != NULL && i < groups_per_page && bh[i]; i++)
++ brelse(bh[i]);
++ if (bh && bh != &bhs)
++ kfree(bh);
++ return err;
++}
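++
++/*
++ * Layout of the buddy cache inode: logical block 2 * N holds a copy of
++ * the block bitmap of group N and block 2 * N + 1 holds the generated
++ * buddy data for that group; with 4k blocks on 4k pages each page
++ * caches one such block, with larger pages one page covers several
++ * groups.
++ */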
++
++static int ext3_mb_load_buddy(struct super_block *sb, int group,
++ struct ext3_buddy *e3b)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ struct inode *inode = sbi->s_buddy_cache;
++ int blocks_per_page, block, pnum, poff;
++ struct page *page;
++
++ mb_debug("load group %u\n", group);
++
++ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
++
++ e3b->bd_blkbits = sb->s_blocksize_bits;
++ e3b->bd_info = sbi->s_group_info[group];
++ e3b->bd_sb = sb;
++ e3b->bd_group = group;
++ e3b->bd_buddy_page = NULL;
++ e3b->bd_bitmap_page = NULL;
++
++ block = group * 2;
++ pnum = block / blocks_per_page;
++ poff = block % blocks_per_page;
++
++ page = find_get_page(inode->i_mapping, pnum);
++ if (page == NULL || !PageUptodate(page)) {
++ if (page)
++ page_cache_release(page);
++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
++ if (page) {
++ if (!PageUptodate(page))
++ ext3_mb_init_cache(page);
++ unlock_page(page);
++ }
++ }
++ if (page == NULL || !PageUptodate(page))
++ goto err;
++ e3b->bd_bitmap_page = page;
++ e3b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
++ mark_page_accessed(page);
++
++ block++;
++ pnum = block / blocks_per_page;
++ poff = block % blocks_per_page;
++
++ page = find_get_page(inode->i_mapping, pnum);
++ if (page == NULL || !PageUptodate(page)) {
++ if (page)
++ page_cache_release(page);
++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
++ if (page) {
++ if (!PageUptodate(page))
++ ext3_mb_init_cache(page);
++ unlock_page(page);
++ }
++ }
++ if (page == NULL || !PageUptodate(page))
++ goto err;
++ e3b->bd_buddy_page = page;
++ e3b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
++ mark_page_accessed(page);
++
++ J_ASSERT(e3b->bd_bitmap_page != NULL);
++ J_ASSERT(e3b->bd_buddy_page != NULL);
++
++ return 0;
++
++err:
++ if (e3b->bd_bitmap_page)
++ page_cache_release(e3b->bd_bitmap_page);
++ if (e3b->bd_buddy_page)
++ page_cache_release(e3b->bd_buddy_page);
++ e3b->bd_buddy = NULL;
++ e3b->bd_bitmap = NULL;
++ return -EIO;
++}
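++
++/*
++ * On success both the bitmap and the buddy page of the group are
++ * referenced and mapped into e3b; ext3_mb_release_desc() drops the
++ * two page references again.
++ */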
++
++static void ext3_mb_release_desc(struct ext3_buddy *e3b)
++{
++ if (e3b->bd_bitmap_page)
++ page_cache_release(e3b->bd_bitmap_page);
++ if (e3b->bd_buddy_page)
++ page_cache_release(e3b->bd_buddy_page);
+}
+
++
+static inline void
+ext3_lock_group(struct super_block *sb, int group)
+{
-+ spin_lock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock);
++ bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT,
++ &EXT3_SB(sb)->s_group_info[group]->bb_state);
+}
+
+static inline void
+ext3_unlock_group(struct super_block *sb, int group)
+{
-+ spin_unlock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock);
++ bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT,
++ &EXT3_SB(sb)->s_group_info[group]->bb_state);
+}
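++
++/*
++ * Note the group lock is a bit in bb_state, so no separate spinlock_t
++ * per group is needed.
++ */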
+
+static int mb_find_order_for_block(struct ext3_buddy *e3b, int block)
+
+static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count)
+{
-+ int block, max, order;
++ int block = 0, max = 0, order;
+ void *buddy, *buddy2;
+
+ mb_check_buddy(e3b);
+
-+ e3b->bd_bd->bb_free += count;
-+ if (first < e3b->bd_bd->bb_first_free)
-+ e3b->bd_bd->bb_first_free = first;
-+
++ e3b->bd_info->bb_free += count;
++ if (first < e3b->bd_info->bb_first_free)
++ e3b->bd_info->bb_first_free = first;
++
++ /* let's maintain fragments counter */
++ if (first != 0)
++ block = !mb_test_bit(first - 1, EXT3_MB_BITMAP(e3b));
++ if (first + count < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0])
++ max = !mb_test_bit(first + count, EXT3_MB_BITMAP(e3b));
++ if (block && max)
++ e3b->bd_info->bb_fragments--;
++ else if (!block && !max)
++ e3b->bd_info->bb_fragments++;
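++ /* e.g. freeing between two free neighbours merges the two adjacent
++ * fragments with the freed range, while freeing between two used
++ * neighbours creates a new fragment */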
++
++ /* let's maintain buddy itself */
+ while (count-- > 0) {
+ block = first++;
+ order = 0;
+
+ J_ASSERT(mb_test_bit(block, EXT3_MB_BITMAP(e3b)));
+ mb_clear_bit(block, EXT3_MB_BITMAP(e3b));
-+ e3b->bd_bd->bb_counters[order]++;
++ e3b->bd_info->bb_counters[order]++;
+
+ /* start of the buddy */
+ buddy = mb_find_buddy(e3b, order, &max);
+ mb_set_bit(block, buddy);
+ mb_set_bit(block + 1, buddy);
+ }
-+ e3b->bd_bd->bb_counters[order]--;
-+ e3b->bd_bd->bb_counters[order]--;
++ e3b->bd_info->bb_counters[order]--;
++ e3b->bd_info->bb_counters[order]--;
+
+ block = block >> 1;
+ order++;
-+ e3b->bd_bd->bb_counters[order]++;
++ e3b->bd_info->bb_counters[order]++;
+
+ mb_clear_bit(block, buddy2);
+ buddy = buddy2;
+}
+
+static int mb_find_extent(struct ext3_buddy *e3b, int order, int block,
-+ int needed, struct ext3_free_extent *ex)
++ int needed, struct ext3_free_extent *ex)
+{
+ int next, max, ord;
+ void *buddy;
+ return 0;
+ }
+
-+ if (order == 0) {
++ if (likely(order == 0)) {
+ /* find actual order */
+ order = mb_find_order_for_block(e3b, block);
+ block = block >> order;
+ ex->fe_start = block << order;
+ ex->fe_group = e3b->bd_group;
+
-+ while ((buddy = mb_find_buddy(e3b, order, &max))) {
++ while (needed > ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) {
+
+ if (block + 1 >= max)
+ break;
+
+static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex)
+{
++ int ord, mlen = 0, max = 0, cur;
+ int start = ex->fe_start;
+ int len = ex->fe_len;
-+ int ord, mlen, max, cur;
++ unsigned ret = 0;
+ int len0 = len;
+ void *buddy;
+
-+ e3b->bd_bd->bb_free -= len;
-+ if (e3b->bd_bd->bb_first_free == start)
-+ e3b->bd_bd->bb_first_free += len;
++ mb_check_buddy(e3b);
+
++ e3b->bd_info->bb_free -= len;
++ if (e3b->bd_info->bb_first_free == start)
++ e3b->bd_info->bb_first_free += len;
++
++ /* let's maintain fragments counter */
++ if (start != 0)
++ mlen = !mb_test_bit(start - 1, EXT3_MB_BITMAP(e3b));
++ if (start + len < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0])
++ max = !mb_test_bit(start + len, EXT3_MB_BITMAP(e3b));
++ if (mlen && max)
++ e3b->bd_info->bb_fragments++;
++ else if (!mlen && !max)
++ e3b->bd_info->bb_fragments--;
++
++ /* let's maintain buddy itself */
+ while (len) {
+ ord = mb_find_order_for_block(e3b, start);
+
+ buddy = mb_find_buddy(e3b, ord, &max);
+ J_ASSERT((start >> ord) < max);
+ mb_set_bit(start >> ord, buddy);
-+ e3b->bd_bd->bb_counters[ord]--;
++ e3b->bd_info->bb_counters[ord]--;
+ start += mlen;
+ len -= mlen;
+ J_ASSERT(len >= 0);
+ continue;
+ }
+
++ /* store for history: at the first buddy split remember the still
++ * unallocated tail length in the low 16 bits and the order being
++ * split in the high 16 bits */
++ if (ret == 0)
++ ret = len | (ord << 16);
++
+ /* we have to split large buddy */
+ J_ASSERT(ord > 0);
+ buddy = mb_find_buddy(e3b, ord, &max);
+ mb_set_bit(start >> ord, buddy);
-+ e3b->bd_bd->bb_counters[ord]--;
++ e3b->bd_info->bb_counters[ord]--;
+
+ ord--;
+ cur = (start >> ord) & ~1U;
+ buddy = mb_find_buddy(e3b, ord, &max);
+ mb_clear_bit(cur, buddy);
+ mb_clear_bit(cur + 1, buddy);
-+ e3b->bd_bd->bb_counters[ord]++;
-+ e3b->bd_bd->bb_counters[ord]++;
++ e3b->bd_info->bb_counters[ord]++;
++ e3b->bd_info->bb_counters[ord]++;
+ }
+
+ /* now drop all the bits in bitmap */
+
+ mb_check_buddy(e3b);
+
-+ return 0;
++ return ret;
+}
+
+/*
+static void ext3_mb_use_best_found(struct ext3_allocation_context *ac,
+ struct ext3_buddy *e3b)
+{
++ unsigned long ret;
++
+ ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
-+ mb_mark_used(e3b, &ac->ac_b_ex);
++ ret = mb_mark_used(e3b, &ac->ac_b_ex);
++
+ ac->ac_status = AC_STATUS_FOUND;
++ ac->ac_tail = ret & 0xffff;
++ ac->ac_buddy = ret >> 16;
+}
+
+/*
+ struct ext3_free_extent *ex,
+ struct ext3_buddy *e3b)
+{
-+ int factor = EXT3_SB(ac->ac_sb)->s_mb_factor;
+ struct ext3_free_extent *bex = &ac->ac_b_ex;
-+ int diff = ac->ac_g_ex.fe_len - ex->fe_len;
++ struct ext3_free_extent *gex = &ac->ac_g_ex;
+
+ J_ASSERT(ex->fe_len > 0);
+ J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8);
+ /*
+ * The special case - take what you catch first
+ */
-+ if (ac->ac_flags & EXT3_MB_HINT_FIRST) {
++ if (unlikely(ac->ac_flags & EXT3_MB_HINT_FIRST)) {
+ *bex = *ex;
+ ext3_mb_use_best_found(ac, e3b);
+ return;
+ /*
+ * Let's check whether the chunk is good enough
+ */
-+ if (ex->fe_len >= ac->ac_g_ex.fe_len) {
-+ *bex = *ex;
-+ ext3_mb_use_best_found(ac, e3b);
-+ return;
-+ }
-+
-+ /*
-+ * If the request is vey large, then it makes sense to use large
-+ * chunks for it. Even if they don't satisfy whole request.
-+ */
-+ if (ex->fe_len > 1000) {
-+ *bex = *ex;
-+ ext3_mb_use_best_found(ac, e3b);
-+ return;
-+ }
-+
-+ /*
-+ * Sometimes it's worty to take close chunk
-+ */
-+ if (factor && (ac->ac_g_ex.fe_len * 100) / (diff * 100) >= factor) {
++ if (ex->fe_len == gex->fe_len) {
+ *bex = *ex;
+ ext3_mb_use_best_found(ac, e3b);
+ return;
+
+ /*
+ * If new found extent is better, store it in the context
-+ * FIXME: possible the policy should be more complex?
+ */
-+ if (ex->fe_len > bex->fe_len) {
++ if (bex->fe_len < gex->fe_len) {
++ /* if the request isn't satisfied, any found extent
++ * larger than the previous best one is better */
++ if (ex->fe_len > bex->fe_len)
++ *bex = *ex;
++ } else if (ex->fe_len > gex->fe_len) {
++ /* if the request is satisfied, then we try to find
++ * an extent that still satisfies the request, but is
++ * smaller than the previous one */
++ if (ex->fe_len < bex->fe_len)
++ *bex = *ex;
+ }
+
+ /*
++ * Let's scan at least a few extents and not pick the first one
++ */
++ if (bex->fe_len > gex->fe_len && ac->ac_found > ext3_mb_min_to_scan)
++ ac->ac_status = AC_STATUS_BREAK;
++
++ /*
+ * We don't want to scan for a whole year
+ */
+ if (ac->ac_found > ext3_mb_max_to_scan)
+ ext3_lock_group(ac->ac_sb, group);
+ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex);
+
-+ if (max > 0)
++ if (max > 0) {
++ ac->ac_b_ex = ex;
+ ext3_mb_use_best_found(ac, e3b);
++ }
+
+ ext3_unlock_group(ac->ac_sb, group);
+
-+ if (ac->ac_status == AC_STATUS_FOUND)
-+ ext3_mb_dirty_buddy(e3b);
+ ext3_mb_release_desc(e3b);
+
+ return 0;
+ J_ASSERT(ex.fe_len > 0);
+ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group);
+ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start);
++ ac->ac_found++;
+ ac->ac_b_ex = ex;
+ ext3_mb_use_best_found(ac, e3b);
+ }
+ ext3_unlock_group(ac->ac_sb, group);
+
-+ if (ac->ac_status == AC_STATUS_FOUND)
-+ ext3_mb_dirty_buddy(e3b);
+ ext3_mb_release_desc(e3b);
+
+ return 0;
+}
++
++/*
++ * The routine scans buddy structures (not bitmap!) from given order
++ * to max order and tries to find big enough chunk to satisfy the req
++ */
++static void ext3_mb_simple_scan_group(struct ext3_allocation_context *ac,
++ struct ext3_buddy *e3b)
++{
++ struct super_block *sb = ac->ac_sb;
++ struct ext3_group_info *grp = e3b->bd_info;
++ void *buddy;
++ int i, k, max;
++
++ J_ASSERT(ac->ac_2order > 0);
++ for (i = ac->ac_2order; i < sb->s_blocksize_bits + 1; i++) {
++ if (grp->bb_counters[i] == 0)
++ continue;
++
++ buddy = mb_find_buddy(e3b, i, &max);
++ if (buddy == NULL) {
++ printk(KERN_ALERT "looking for wrong order?\n");
++ break;
++ }
++
++ k = mb_find_next_zero_bit(buddy, max, 0);
++ J_ASSERT(k < max);
++
++ ac->ac_found++;
++
++ ac->ac_b_ex.fe_len = 1 << i;
++ ac->ac_b_ex.fe_start = k << i;
++ ac->ac_b_ex.fe_group = e3b->bd_group;
++
++ ext3_mb_use_best_found(ac, e3b);
++ J_ASSERT(ac->ac_b_ex.fe_len == ac->ac_g_ex.fe_len);
++
++ if (unlikely(ext3_mb_stats))
++ atomic_inc(&EXT3_SB(sb)->s_bal_2orders);
++
++ break;
++ }
++}
++
+/*
+ * The routine scans the group and measures all found extents.
+ * In order to optimize scanning, caller must pass number of
-+ * free blocks in the group, so the routine can upper limit.
++ * free blocks in the group, so the routine can know the upper limit.
+ */
-+static void ext3_mb_scan_group(struct ext3_allocation_context *ac,
-+ struct ext3_buddy *e3b)
++static void ext3_mb_complex_scan_group(struct ext3_allocation_context *ac,
++ struct ext3_buddy *e3b)
+{
+ struct super_block *sb = ac->ac_sb;
+ void *bitmap = EXT3_MB_BITMAP(e3b);
+ struct ext3_free_extent ex;
+ int i, free;
+
-+ free = e3b->bd_bd->bb_free;
++ free = e3b->bd_info->bb_free;
+ J_ASSERT(free > 0);
+
-+ i = e3b->bd_bd->bb_first_free;
++ i = e3b->bd_info->bb_first_free;
+
-+ while (free && ac->ac_status != AC_STATUS_FOUND) {
-+ i = ext3_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i);
++ while (free && ac->ac_status == AC_STATUS_CONTINUE) {
++ i = mb_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i);
+ if (i >= sb->s_blocksize * 8) {
+ J_ASSERT(free == 0);
+ break;
+static int ext3_mb_good_group(struct ext3_allocation_context *ac,
+ int group, int cr)
+{
-+ int free;
++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb);
++ struct ext3_group_info *grp = sbi->s_group_info[group];
++ unsigned free, fragments, i, bits;
+
-+ J_ASSERT(cr >= 0 && cr < 3);
++ J_ASSERT(cr >= 0 && cr < 4);
++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(grp));
+
-+ free = EXT3_SB(ac->ac_sb)->s_buddy_blocks[group]->bb_free;
++ free = grp->bb_free;
++ fragments = grp->bb_fragments;
+ if (free == 0)
+ return 0;
++ if (fragments == 0)
++ return 0;
+
-+ if (cr == 0) {
-+ if (free >= ac->ac_g_ex.fe_len >> 1)
++ switch (cr) {
++ case 0:
++ J_ASSERT(ac->ac_2order != 0);
++ bits = ac->ac_sb->s_blocksize_bits + 1;
++ for (i = ac->ac_2order; i < bits; i++)
++ if (grp->bb_counters[i] > 0)
++ return 1;
++ break;
++ case 1:
++ if ((free / fragments) >= ac->ac_g_ex.fe_len)
++ return 1;
++ break;
++ case 2:
++ if (free >= ac->ac_g_ex.fe_len)
++ return 1;
++ break;
++ case 3:
++ return 1;
-+ } else if (cr == 1) {
-+ if (free >= ac->ac_g_ex.fe_len >> 2)
-+ return 1;
-+ } else if (cr == 2) {
-+ return 1;
++ default:
++ BUG();
+ }
++
+ return 0;
+}
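++
++/*
++ * Summary of the criteria: cr 0 accepts a group only if the buddy
++ * counters prove a 2^N chunk of the requested order exists, cr 1 wants
++ * the average fragment to cover the request, cr 2 is satisfied with
++ * enough free blocks and cr 3 takes any group with free space.
++ */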
+
+ ac.ac_g_ex.fe_start = block;
+ ac.ac_g_ex.fe_len = *len;
+ ac.ac_flags = flags;
++ ac.ac_2order = 0;
++ ac.ac_criteria = 0;
+
-+ /*
-+ * Sometimes, caller may want to merge even small number
-+ * of blocks to an existing extent
-+ */
++ /* if the request is an exact power of two of at least 2^7 blocks,
++ * remember its order so the buddy (2^N) scan can be used */
++ i = ffs(*len);
++ if (i >= 8) {
++ i--;
++ if ((*len & (~(1 << i))) == 0)
++ ac.ac_2order = i;
++ }
++
++ /* Sometimes, caller may want to merge even small
++ * number of blocks to an existing extent */
+ if (ac.ac_flags & EXT3_MB_HINT_MERGE) {
+ err = ext3_mb_find_by_goal(&ac, &e3b);
+ if (err)
+ goto found;
+ }
+
-+ /*
-+ * FIXME
-+ * If requested chunk is power of 2 length, we can try
-+ * to exploit buddy nature to speed allocation up
-+ */
-+
-+
-+ /*
-+ * Let's just scan groups to find more-less suitable blocks
-+ */
-+ cr = 0;
++ /* Let's just scan groups to find more or less suitable blocks */
++ cr = ac.ac_2order ? 0 : 1;
+repeat:
-+ for (; cr < 3 && ac.ac_status == AC_STATUS_CONTINUE; cr++) {
++ for (; cr < 4 && ac.ac_status == AC_STATUS_CONTINUE; cr++) {
++ ac.ac_criteria = cr;
+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) {
+ if (group == EXT3_SB(sb)->s_groups_count)
+ group = 0;
+
++ if (EXT3_MB_GRP_NEED_INIT(sbi->s_group_info[group])) {
++ /* we need full data about the group
++ * to make a good selection */
++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b);
++ if (err)
++ goto out_err;
++ ext3_mb_release_desc(&e3b);
++ }
++
+ /* check whether the group is good for our criteria */
+ if (!ext3_mb_good_group(&ac, group, cr))
+ continue;
+ continue;
+ }
+
-+ ext3_mb_scan_group(&ac, &e3b);
++ ac.ac_groups_scanned++;
++ if (cr == 0)
++ ext3_mb_simple_scan_group(&ac, &e3b);
++ else
++ ext3_mb_complex_scan_group(&ac, &e3b);
++
+ ext3_unlock_group(sb, group);
+
-+ if (ac.ac_status == AC_STATUS_FOUND)
-+ ext3_mb_dirty_buddy(&e3b);
+ ext3_mb_release_desc(&e3b);
+
+ if (err)
+ }
+ }
+
-+ if (ac.ac_status == AC_STATUS_BREAK &&
++ if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND &&
+ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) {
-+ /* We've been searching too long. Let's try to allocate
-+ * the best chunk we've found so far. */
-+ if (ac.ac_g_ex.fe_len >= 128 &&
-+ ac.ac_b_ex.fe_len < ac.ac_g_ex.fe_len / 4)
-+ ext3_warning(inode->i_sb, __FUNCTION__,
-+ "too long searching: got %d want %d\n",
-+ ac.ac_b_ex.fe_len, ac.ac_g_ex.fe_len);
++ /*
++ * We've been searching too long. Let's try to allocate
++ * the best chunk we've found so far
++ */
++
++ /*if (ac.ac_found > ext3_mb_max_to_scan)
++ printk(KERN_ERR "EXT3-fs: too long searching at "
++ "%u (%d/%d)\n", cr, ac.ac_b_ex.fe_len,
++ ac.ac_g_ex.fe_len);*/
+ ext3_mb_try_best_found(&ac, &e3b);
+ if (ac.ac_status != AC_STATUS_FOUND) {
+ /*
+ * The only thing we can do is just take first
+ * found block(s)
+ */
-+ mb_debug(KERN_ERR "EXT3-fs: and someone won our chunk\n");
++ printk(KERN_ERR "EXT3-fs: and someone won our chunk\n");
+ ac.ac_b_ex.fe_group = 0;
+ ac.ac_b_ex.fe_start = 0;
+ ac.ac_b_ex.fe_len = 0;
+ ac.ac_status = AC_STATUS_CONTINUE;
+ ac.ac_flags |= EXT3_MB_HINT_FIRST;
-+ cr = 2;
++ cr = 3;
+ goto repeat;
+ }
+ }
+ printk("EXT3-fs: groups: ");
+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++)
+ printk("%d: %d ", i,
-+ sbi->s_buddy_blocks[i]->bb_free);
++ sbi->s_group_info[i]->bb_free);
+ printk("\n");
+#endif
+ goto out;
+ ext3_error(sb, "ext3_new_block",
+ "Allocating block in system zone - "
+ "block = %u", block);
-+ if (unlikely(ext3_mb_aggressive)) {
-+ for (i = 0; i < ac.ac_b_ex.fe_len; i++)
-+ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i,
-+ bitmap_bh->b_data));
-+ }
-+
++#ifdef AGGRESSIVE_CHECK
++ for (i = 0; i < ac.ac_b_ex.fe_len; i++)
++ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data));
++#endif
+ mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len);
+
+ spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group));
+ ext3_mb_release_blocks(sb, 1);
+ }
+
-+ if ((ext3_mb_stats) && (ac.ac_g_ex.fe_len > 1)) {
-+ spin_lock(&sbi->s_bal_lock);
-+ sbi->s_bal_reqs++;
-+ sbi->s_bal_allocated += *len;
++ if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) {
++ atomic_inc(&sbi->s_bal_reqs);
++ atomic_add(*len, &sbi->s_bal_allocated);
+ if (*len >= ac.ac_g_ex.fe_len)
-+ sbi->s_bal_success++;
-+ sbi->s_bal_ex_scanned += ac.ac_found;
++ atomic_inc(&sbi->s_bal_success);
++ atomic_add(ac.ac_found, &sbi->s_bal_ex_scanned);
+ if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start &&
+ ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group)
-+ sbi->s_bal_goals++;
++ atomic_inc(&sbi->s_bal_goals);
+ if (ac.ac_found > ext3_mb_max_to_scan)
-+ sbi->s_bal_breaks++;
-+ spin_unlock(&sbi->s_bal_lock);
++ atomic_inc(&sbi->s_bal_breaks);
+ }
+
++ ext3_mb_store_history(sb, &ac);
++
+ return block;
+}
++EXPORT_SYMBOL(ext3_mb_new_blocks);
+
-+int ext3_mb_get_descr_loc(struct ext3_buddy *e3b, struct buffer_head **bh,
-+ struct ext3_mb_group_descr **grp)
-+{
-+ struct super_block *sb = e3b->bd_sb;
-+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ int descr_per_block, err, offset;
-+ struct ext3_mb_grp_header *hdr;
-+ unsigned long block;
-+
-+ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header))
-+ / sizeof(struct ext3_mb_group_descr);
-+ block = e3b->bd_group / descr_per_block;
-+ *bh = ext3_bread(NULL, sbi->s_buddy, block, 0, &err);
-+ if (*bh == NULL) {
-+ printk(KERN_ERR "EXT3-fs: cant getblk descr for group %d: %d\n",
-+ e3b->bd_group, err);
-+ return err;
-+ }
-+
-+ hdr = (struct ext3_mb_grp_header *) (*bh)->b_data;
-+ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) {
-+ printk(KERN_ERR "EXT3-fs: invalid magic in group %d!\n",
-+ e3b->bd_group);
-+ brelse(*bh);
-+ *bh = NULL;
-+ return -EIO;
-+ }
++#ifdef EXT3_MB_HISTORY
++struct ext3_mb_proc_session {
++ struct ext3_mb_history *history;
++ struct super_block *sb;
++ int start;
++ int max;
++};
+
-+ offset = e3b->bd_group % descr_per_block
-+ * sizeof(struct ext3_mb_group_descr)
-+ + sizeof(struct ext3_mb_grp_header);
-+ *grp = (struct ext3_mb_group_descr *) ((*bh)->b_data + offset);
++static void *ext3_mb_history_skip_empty(struct ext3_mb_proc_session *s,
++ struct ext3_mb_history *hs,
++ int first)
++{
++ if (hs == s->history + s->max)
++ hs = s->history;
++ if (!first && hs == s->history + s->start)
++ return NULL;
++ while (hs->goal.fe_len == 0) {
++ hs++;
++ if (hs == s->history + s->max)
++ hs = s->history;
++ if (hs == s->history + s->start)
++ return NULL;
++ }
++ return hs;
++}
+
-+ return 0;
++static void *ext3_mb_seq_history_start(struct seq_file *seq, loff_t *pos)
++{
++ struct ext3_mb_proc_session *s = seq->private;
++ struct ext3_mb_history *hs;
++ int l = *pos;
++
++ if (l == 0)
++ return SEQ_START_TOKEN;
++ hs = ext3_mb_history_skip_empty(s, s->history + s->start, 1);
++ if (!hs)
++ return NULL;
++ while (--l && (hs = ext3_mb_history_skip_empty(s, ++hs, 0)) != NULL);
++ return hs;
+}
+
-+int ext3_mb_load_descr(struct ext3_buddy *e3b)
++static void *ext3_mb_seq_history_next(struct seq_file *seq, void *v, loff_t *pos)
+{
-+ struct ext3_mb_group_descr *grp;
-+ struct ext3_group_desc *gdp;
-+ struct buffer_head *bh;
-+ int err, i;
++ struct ext3_mb_proc_session *s = seq->private;
++ struct ext3_mb_history *hs = v;
++
++ ++*pos;
++ if (v == SEQ_START_TOKEN)
++ return ext3_mb_history_skip_empty(s, s->history + s->start, 1);
++ else
++ return ext3_mb_history_skip_empty(s, ++hs, 0);
++}
+
-+ err = ext3_mb_get_descr_loc(e3b, &bh, &grp);
-+ if (err)
-+ return err;
-+
-+ e3b->bd_bd->bb_first_free = grp->mgd_first_free;
-+ e3b->bd_bd->bb_free = grp->mgd_free;
-+ for (i = 0; i <= e3b->bd_blkbits + 1; i++) {
-+ J_ASSERT(i < 16);
-+ e3b->bd_bd->bb_counters[i] = grp->mgd_counters[i];
-+ }
-+ brelse(bh);
++static int ext3_mb_seq_history_show(struct seq_file *seq, void *v)
++{
++ struct ext3_mb_history *hs = v;
++ char buf[20], buf2[20];
+
-+ /* additional checks against old group descriptor */
-+ gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL);
-+ if (!gdp)
-+ return -EIO;
-+ if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) {
-+ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n",
-+ e3b->bd_group, e3b->bd_bd->bb_free,
-+ le16_to_cpu(gdp->bg_free_blocks_count));
-+ return -ENODATA;
++ if (v == SEQ_START_TOKEN) {
++ seq_printf(seq, "%-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n",
++ "goal", "result", "found", "grps", "cr", "merge",
++ "tail", "broken");
++ return 0;
+ }
+
++ sprintf(buf, "%u/%u/%u", hs->goal.fe_group,
++ hs->goal.fe_start, hs->goal.fe_len);
++ sprintf(buf2, "%u/%u/%u", hs->result.fe_group,
++ hs->result.fe_start, hs->result.fe_len);
++ seq_printf(seq, "%-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", buf,
++ buf2, hs->found, hs->groups, hs->cr,
++ hs->merged ? "M" : "", hs->tail,
++ hs->buddy ? 1 << hs->buddy : 0);
+ return 0;
+}
+
++static void ext3_mb_seq_history_stop(struct seq_file *seq, void *v)
++{
++}
++
++static struct seq_operations ext3_mb_seq_history_ops = {
++ .start = ext3_mb_seq_history_start,
++ .next = ext3_mb_seq_history_next,
++ .stop = ext3_mb_seq_history_stop,
++ .show = ext3_mb_seq_history_show,
++};
+
-+int ext3_mb_update_descr(struct ext3_buddy *e3b)
++static int ext3_mb_seq_history_open(struct inode *inode, struct file *file)
+{
-+ struct ext3_mb_group_descr *grp;
-+ struct ext3_group_desc *gdp;
-+ struct buffer_head *bh;
-+ handle_t *handle;
-+ int err, i;
++ struct super_block *sb = PDE(inode)->data;
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ struct ext3_mb_proc_session *s;
++ int rc, size;
+
-+ /* additional checks against old group descriptor */
-+ gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL);
-+ if (!gdp)
++ s = kmalloc(sizeof(*s), GFP_KERNEL);
++ if (s == NULL)
++ return -EIO;
++ size = sizeof(struct ext3_mb_history) * sbi->s_mb_history_max;
++ s->history = kmalloc(size, GFP_KERNEL);
++ if (s->history == NULL) {
++ kfree(s);
+ return -EIO;
-+ if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) {
-+ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n",
-+ e3b->bd_group, e3b->bd_bd->bb_free,
-+ le16_to_cpu(gdp->bg_free_blocks_count));
-+ return -ENODATA;
+ }
+
-+ err = ext3_mb_get_descr_loc(e3b, &bh, &grp);
-+ if (err)
-+ return err;
++ spin_lock(&sbi->s_mb_history_lock);
++ memcpy(s->history, sbi->s_mb_history, size);
++ s->max = sbi->s_mb_history_max;
++ s->start = sbi->s_mb_history_cur % s->max;
++ spin_unlock(&sbi->s_mb_history_lock);
+
-+ handle = ext3_journal_start(EXT3_SB(e3b->bd_sb)->s_buddy, 1);
-+ if (IS_ERR(handle)) {
-+ err = PTR_ERR(handle);
-+ handle = NULL;
-+ goto out;
++ rc = seq_open(file, &ext3_mb_seq_history_ops);
++ if (rc == 0) {
++ struct seq_file *m = (struct seq_file *)file->private_data;
++ m->private = s;
++ } else {
++ kfree(s->history);
++ kfree(s);
+ }
++ return rc;
+
-+ err = ext3_journal_get_write_access(handle, bh);
-+ if (err)
-+ goto out;
-+ grp->mgd_first_free = e3b->bd_bd->bb_first_free;
-+ grp->mgd_free = e3b->bd_bd->bb_free;
-+ for (i = 0; i <= e3b->bd_blkbits + 1; i++) {
-+ J_ASSERT(i < 16);
-+ grp->mgd_counters[i] = e3b->bd_bd->bb_counters[i];
-+ }
-+ err = ext3_journal_dirty_metadata(handle, bh);
-+ if (err)
-+ goto out;
-+ err = 0;
-+out:
-+ brelse(bh);
-+ if (handle)
-+ ext3_journal_stop(handle);
-+ return err;
+}
+
-+int ext3_mb_generate_buddy(struct ext3_buddy *e3b)
++static int ext3_mb_seq_history_release(struct inode *inode, struct file *file)
+{
-+ struct super_block *sb = e3b->bd_sb;
-+ struct buffer_head *bh;
-+ int i, count = 0;
++ struct seq_file *seq = (struct seq_file *)file->private_data;
++ struct ext3_mb_proc_session *s = seq->private;
++ kfree(s->history);
++ kfree(s);
++ return seq_release(inode, file);
++}
++
++static struct file_operations ext3_mb_seq_history_fops = {
++ .owner = THIS_MODULE,
++ .open = ext3_mb_seq_history_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = ext3_mb_seq_history_release,
++};
+
-+ mb_debug("generate buddy for group %d\n", e3b->bd_group);
-+ memset(e3b->bd_bh->b_data, 0xff, sb->s_blocksize);
-+ memset(e3b->bd_bh2->b_data, 0xff, sb->s_blocksize);
++static void ext3_mb_history_release(struct super_block *sb)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ char name[64];
+
-+ bh = read_block_bitmap(sb, e3b->bd_group);
-+ if (bh == NULL)
-+ return -EIO;
++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name));
++ remove_proc_entry("mb_history", sbi->s_mb_proc);
++ remove_proc_entry(name, proc_root_ext3);
++
++ if (sbi->s_mb_history)
++ kfree(sbi->s_mb_history);
++}
+
-+ /* mb_free_blocks will set real free */
-+ e3b->bd_bd->bb_free = 0;
-+ e3b->bd_bd->bb_first_free = 1 << 15;
-+ /*
-+ * if change bb_counters size, don't forget about
-+ * ext3_mb_init_backend() -bzzz
-+ */
-+ memset(e3b->bd_bd->bb_counters, 0,
-+ sizeof(unsigned) * (sb->s_blocksize_bits + 2));
++static void ext3_mb_history_init(struct super_block *sb)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ char name[64];
++ int i;
+
-+ /* loop over the blocks, and create buddies for free ones */
-+ for (i = 0; i < sb->s_blocksize * 8; i++) {
-+ if (!mb_test_bit(i, (void *) bh->b_data)) {
-+ mb_free_blocks(e3b, i, 1);
-+ count++;
++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name));
++ sbi->s_mb_proc = proc_mkdir(name, proc_root_ext3);
++ if (sbi->s_mb_proc != NULL) {
++ struct proc_dir_entry *p;
++ p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc);
++ if (p) {
++ p->proc_fops = &ext3_mb_seq_history_fops;
++ p->data = sb;
+ }
+ }
-+ brelse(bh);
-+ mb_check_buddy(e3b);
-+ ext3_mb_dirty_buddy(e3b);
+
-+ return 0;
++ sbi->s_mb_history_max = 1000;
++ sbi->s_mb_history_cur = 0;
++ spin_lock_init(&sbi->s_mb_history_lock);
++ i = sbi->s_mb_history_max * sizeof(struct ext3_mb_history);
++ sbi->s_mb_history = kmalloc(i, GFP_KERNEL);
++ if (sbi->s_mb_history != NULL)
++ memset(sbi->s_mb_history, 0, i);
++ /* if we can't allocate history, then we simply won't use it */
+}
+
-+EXPORT_SYMBOL(ext3_mb_new_blocks);
++static void
++ext3_mb_store_history(struct super_block *sb, struct ext3_allocation_context *ac)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ struct ext3_mb_history h;
++
++ if (unlikely(sbi->s_mb_history == NULL))
++ return;
++
++ h.goal = ac->ac_g_ex;
++ h.result = ac->ac_b_ex;
++ h.found = ac->ac_found;
++ h.cr = ac->ac_criteria;
++ h.groups = ac->ac_groups_scanned;
++ h.tail = ac->ac_tail;
++ h.buddy = ac->ac_buddy;
++ h.merged = 0;
++ if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
++ ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
++ h.merged = 1;
++
++ spin_lock(&sbi->s_mb_history_lock);
++ memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h));
++ if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max)
++ sbi->s_mb_history_cur = 0;
++ spin_unlock(&sbi->s_mb_history_lock);
++}
+
-+#define MB_CREDITS \
-+ (EXT3_DATA_TRANS_BLOCKS + 3 + EXT3_INDEX_EXTRA_TRANS_BLOCKS + \
-+ 2 * EXT3_SINGLEDATA_TRANS_BLOCKS)
++#else
++#define ext3_mb_history_release(sb)
++#define ext3_mb_history_init(sb)
++#define ext3_mb_store_history(sb, ac)
++#endif
+
-+int ext3_mb_init_backend(struct super_block *sb, int *created)
++int ext3_mb_init_backend(struct super_block *sb)
+{
-+ int err, i, len, descr_per_block, buddy_offset, size;
-+ struct inode *root = sb->s_root->d_inode;
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ struct ext3_mb_grp_header *hdr;
-+ struct buffer_head *bh = NULL;
-+ unsigned long block;
-+ struct dentry *db;
-+ handle_t *handle;
-+ tid_t target;
-+
-+ *created = 0;
++ int i, len;
++
+ len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count;
-+ sbi->s_buddy_blocks = kmalloc(len, GFP_KERNEL);
-+ if (sbi->s_buddy_blocks == NULL) {
-+ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n");
++ sbi->s_group_info = kmalloc(len, GFP_KERNEL);
++ if (sbi->s_group_info == NULL) {
++ printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n");
+ return -ENOMEM;
+ }
-+ memset(sbi->s_buddy_blocks, 0, len);
-+ sbi->s_buddy = NULL;
-+
-+ down(&root->i_sem);
-+ len = strlen(EXT3_BUDDY_FILE);
-+ db = lookup_one_len(EXT3_BUDDY_FILE, sb->s_root, len);
-+ if (IS_ERR(db)) {
-+ err = PTR_ERR(db);
-+ printk(KERN_ERR "EXT3-fs: cant lookup buddy: %d\n", err);
-+ up(&root->i_sem);
-+ goto out;
-+ }
++ memset(sbi->s_group_info, 0, len);
+
-+ if (db->d_inode == NULL) {
-+ err = ext3_create(root, db, S_IFREG, NULL);
-+ if (err) {
-+ printk(KERN_ERR "EXT3-fs: cant create buddy: %d\n", err);
-+ up(&root->i_sem);
-+ goto out;
-+ }
-+ db->d_inode->i_flags |= S_IMMUTABLE | S_NOATIME;
-+ *created = 1;
-+ mb_debug("no buddy file, regenerate\n");
-+ }
-+ up(&root->i_sem);
-+ sbi->s_buddy = igrab(db->d_inode);
-+
-+ /* calculate needed size */
-+ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header))
-+ / sizeof(struct ext3_mb_group_descr);
-+ buddy_offset = (sbi->s_groups_count + descr_per_block - 1)
-+ / descr_per_block;
-+ len = sbi->s_groups_count * sb->s_blocksize * 2 +
-+ buddy_offset * sb->s_blocksize;
-+ if (len != i_size_read(sbi->s_buddy)) {
-+ if (*created == 0)
-+ printk("EXT3-fs: wrong i_size (%u != %u), regenerate\n",
-+ (unsigned) len,
-+ (unsigned) i_size_read(sbi->s_buddy));
-+ *created = 1;
-+ }
-+
-+ /* read/create mb group descriptors */
-+ for (i = 0; i < buddy_offset; i++) {
-+ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS);
-+ if (IS_ERR(handle)) {
-+ printk(KERN_ERR "EXT3-fs: cant start transaction\n");
-+ err = PTR_ERR(handle);
-+ goto err_out;
-+ }
-+
-+ bh = ext3_bread(handle, sbi->s_buddy, i, 1, &err);
-+ if (bh == NULL) {
-+ printk(KERN_ERR "EXT3-fs: cant getblk grp: %d\n", err);
-+ goto err_out;
-+ }
-+ hdr = (struct ext3_mb_grp_header *) bh->b_data;
-+ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) {
-+ err = ext3_journal_get_write_access(handle, bh);
-+ if (err)
-+ goto err_out;
-+ if (*created == 0)
-+ printk(KERN_ERR
-+ "EXT3-fs: invalid header 0x%x in %d,"
-+ "regenerate\n", hdr->mh_magic, i);
-+ *created = 1;
-+ hdr->mh_magic = EXT3_MB_MAGIC_V1;
-+ err = ext3_journal_dirty_metadata(handle, bh);
-+ if (err)
-+ goto err_out;
-+ }
-+ brelse(bh);
-+ ext3_journal_stop(handle);
++ sbi->s_buddy_cache = new_inode(sb);
++ if (sbi->s_buddy_cache == NULL) {
++ printk(KERN_ERR "EXT3-fs: can't get new inode\n");
++ kfree(sbi->s_group_info);
++ return -ENOMEM;
+ }
+
+ /*
-+ * if change bb_counters size, don't forget about ext3_mb_generate_buddy()
++ * calculate needed size. if change bb_counters size,
++ * don't forget about ext3_mb_generate_buddy()
+ */
-+ len = sizeof(struct ext3_buddy_group_blocks);
-+ len += sizeof(unsigned) * (sb->s_blocksize_bits + 2);
++ len = sizeof(struct ext3_group_info);
++ len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
+ for (i = 0; i < sbi->s_groups_count; i++) {
++ struct ext3_group_desc * desc;
+
-+ sbi->s_buddy_blocks[i] = kmalloc(len, GFP_KERNEL);
-+ if (sbi->s_buddy_blocks[i] == NULL) {
++ sbi->s_group_info[i] = kmalloc(len, GFP_KERNEL);
++ if (sbi->s_group_info[i] == NULL) {
+ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n");
-+ err = -ENOMEM;
-+ goto out2;
-+ }
-+ memset(sbi->s_buddy_blocks[i], 0, len);
-+
-+ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS);
-+ if (IS_ERR(handle)) {
-+ printk(KERN_ERR "EXT3-fs: cant start transaction\n");
-+ err = PTR_ERR(handle);
-+ goto out2;
-+ }
-+
-+ /* allocate block for bitmap */
-+ block = buddy_offset + i * 2;
-+ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err);
-+ if (bh == NULL) {
-+ printk(KERN_ERR "EXT3-fs: cant getblk bitmap: %d\n", err);
-+ goto out2;
-+ }
-+ sbi->s_buddy_blocks[i]->bb_bitmap = bh->b_blocknr;
-+ brelse(bh);
-+
-+ /* allocate block for buddy */
-+ block = buddy_offset + i * 2 + 1;
-+ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err);
-+ if (bh == NULL) {
-+ printk(KERN_ERR "EXT3-fs: cant getblk for buddy: %d\n", err);
-+ goto out2;
++ goto err_out;
+ }
-+ sbi->s_buddy_blocks[i]->bb_buddy = bh->b_blocknr;
-+ brelse(bh);
-+
-+ size = (block + 1) << sbi->s_buddy->i_blkbits;
-+ if (size > sbi->s_buddy->i_size) {
-+ *created = 1;
-+ EXT3_I(sbi->s_buddy)->i_disksize = size;
-+ i_size_write(sbi->s_buddy, size);
-+ mark_inode_dirty(sbi->s_buddy);
++ desc = ext3_get_group_desc(sb, i, NULL);
++ if (desc == NULL) {
++ printk(KERN_ERR "EXT3-fs: cant read descriptor %u\n", i);
++ goto err_out;
+ }
-+ ext3_journal_stop(handle);
-+
-+ spin_lock_init(&sbi->s_buddy_blocks[i]->bb_lock);
-+ sbi->s_buddy_blocks[i]->bb_md_cur = NULL;
-+ sbi->s_buddy_blocks[i]->bb_tid = 0;
++ memset(sbi->s_group_info[i], 0, len);
++ set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT,
++ &sbi->s_group_info[i]->bb_state);
++ sbi->s_group_info[i]->bb_free =
++ le16_to_cpu(desc->bg_free_blocks_count);
+ }
+
-+ if (journal_start_commit(sbi->s_journal, &target))
-+ log_wait_commit(sbi->s_journal, target);
-+
-+out2:
-+ dput(db);
-+out:
-+ return err;
++ return 0;
+
+err_out:
-+ return err;
++ while (--i >= 0)
++ kfree(sbi->s_group_info[i]);
++ kfree(sbi->s_group_info);
++ iput(sbi->s_buddy_cache);
++
++ return -ENOMEM;
+}
+
-+int ext3_mb_write_descriptors(struct super_block *sb)
++int ext3_mb_init(struct super_block *sb, int needs_recovery)
+{
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ struct ext3_buddy e3b;
-+ int ret = 0, i, err;
++ struct inode *root = sb->s_root->d_inode;
++ unsigned i, offset, max;
++ struct dentry *dentry;
+
-+ for (i = 0; i < sbi->s_groups_count; i++) {
-+ if (sbi->s_buddy_blocks[i] == NULL)
-+ continue;
++ if (!test_opt(sb, MBALLOC))
++ return 0;
+
-+ err = ext3_mb_load_buddy(sb, i, &e3b);
-+ if (err == 0) {
-+ ext3_mb_update_descr(&e3b);
-+ ext3_mb_release_desc(&e3b);
-+ } else
-+ ret = err;
++ i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);
++
++ sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
++ if (sbi->s_mb_offsets == NULL) {
++ clear_opt(sbi->s_mount_opt, MBALLOC);
++ return -ENOMEM;
+ }
-+ return ret;
++ sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
++ if (sbi->s_mb_maxs == NULL) {
++ clear_opt(sbi->s_mount_opt, MBALLOC);
++ kfree(sbi->s_mb_offsets);
++ return -ENOMEM;
++ }
++
++ /* order 0 is regular bitmap */
++ sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
++ sbi->s_mb_offsets[0] = 0;
++
++ i = 1;
++ offset = 0;
++ max = sb->s_blocksize << 2;
++ do {
++ sbi->s_mb_offsets[i] = offset;
++ sbi->s_mb_maxs[i] = max;
++ offset += 1 << (sb->s_blocksize_bits - i);
++ max = max >> 1;
++ i++;
++ } while (i <= sb->s_blocksize_bits + 1);
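++
++ /*
++ * Example for 4k blocks (s_blocksize_bits == 12): order 0 is the
++ * 32768-bit on-disk bitmap itself; in the buddy block order 1
++ * starts at byte 0 with 16384 bits, order 2 at byte 2048 with
++ * 8192 bits, and so on, halving each time.
++ */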
++
++
++ /* init file for buddy data */
++ if ((i = ext3_mb_init_backend(sb))) {
++ clear_opt(sbi->s_mount_opt, MBALLOC);
++ kfree(sbi->s_mb_offsets);
++ kfree(sbi->s_mb_maxs);
++ return i;
++ }
++
++ spin_lock_init(&sbi->s_reserve_lock);
++ spin_lock_init(&sbi->s_md_lock);
++ INIT_LIST_HEAD(&sbi->s_active_transaction);
++ INIT_LIST_HEAD(&sbi->s_closed_transaction);
++ INIT_LIST_HEAD(&sbi->s_committed_transaction);
++ spin_lock_init(&sbi->s_bal_lock);
++
++ /* remove old on-disk buddy file */
++ down(&root->i_sem);
++ dentry = lookup_one_len(".buddy", sb->s_root, strlen(".buddy"));
++ if (dentry->d_inode != NULL) {
++ i = vfs_unlink(root, dentry);
++ if (i != 0)
++ printk("EXT3-fs: can't remove .buddy file: %d\n", i);
++ }
++ dput(dentry);
++ up(&root->i_sem);
++
++ ext3_mb_history_init(sb);
++
++ printk("EXT3-fs: mballoc enabled\n");
++ return 0;
+}
+
+int ext3_mb_release(struct super_block *sb)
+ spin_unlock(&sbi->s_md_lock);
+ ext3_mb_free_committed_blocks(sb);
+
-+ if (sbi->s_buddy_blocks) {
-+ ext3_mb_write_descriptors(sb);
++ if (sbi->s_group_info) {
+ for (i = 0; i < sbi->s_groups_count; i++) {
-+ if (sbi->s_buddy_blocks[i] == NULL)
++ if (sbi->s_group_info[i] == NULL)
+ continue;
-+ kfree(sbi->s_buddy_blocks[i]);
++ kfree(sbi->s_group_info[i]);
+ }
-+ kfree(sbi->s_buddy_blocks);
-+ }
-+ if (sbi->s_buddy)
-+ iput(sbi->s_buddy);
++ kfree(sbi->s_group_info);
++ }
++ if (sbi->s_mb_offsets)
++ kfree(sbi->s_mb_offsets);
++ if (sbi->s_mb_maxs)
++ kfree(sbi->s_mb_maxs);
++ if (sbi->s_buddy_cache)
++ iput(sbi->s_buddy_cache);
+ if (sbi->s_blocks_reserved)
+ printk("ext3-fs: %ld blocks being reserved at umount!\n",
+ sbi->s_blocks_reserved);
+ if (ext3_mb_stats) {
-+ printk("EXT3-fs: mballoc: %lu blocks %lu reqs "
-+ "(%lu success)\n", sbi->s_bal_allocated,
-+ sbi->s_bal_reqs, sbi->s_bal_success);
-+ printk("EXT3-fs: mballoc: %lu extents scanned, "
-+ "%lu goal hits, %lu breaks\n", sbi->s_bal_ex_scanned,
-+ sbi->s_bal_goals, sbi->s_bal_breaks);
-+ }
-+
-+ return 0;
-+}
-+
-+int ext3_mb_init(struct super_block *sb, int needs_recovery)
-+{
-+ struct ext3_buddy e3b;
-+ int i, err, created;
-+
-+ if (!test_opt(sb, MBALLOC))
-+ return 0;
-+
-+ /* init file for buddy data */
-+ clear_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC);
-+ if ((err = ext3_mb_init_backend(sb, &created)))
-+ return err;
-+
-+repeat:
-+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
-+ err = ext3_mb_load_buddy(sb, i, &e3b);
-+ if (err) {
-+ /* FIXME: release backend */
-+ return err;
-+ }
-+ if (created || needs_recovery)
-+ ext3_mb_generate_buddy(&e3b);
-+ else
-+ err = ext3_mb_load_descr(&e3b);
-+ ext3_mb_release_desc(&e3b);
-+ if (err == -ENODATA) {
-+ created = 1;
-+ goto repeat;
-+ }
-+ }
-+ if (created || needs_recovery)
-+ printk(KERN_ERR "EXT3-fs: generated buddies for %lu groups\n",
-+ EXT3_SB(sb)->s_groups_count);
-+ spin_lock_init(&EXT3_SB(sb)->s_reserve_lock);
-+ spin_lock_init(&EXT3_SB(sb)->s_md_lock);
-+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_active_transaction);
-+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_closed_transaction);
-+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_committed_transaction);
-+ set_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC);
-+
-+ spin_lock_init(&EXT3_SB(sb)->s_bal_lock);
-+ if (ext3_mb_stats) {
-+ printk("EXT3-fs: mballoc enabled (stats)\n");
-+ } else {
-+ printk("EXT3-fs: mballoc enabled\n");
-+ }
++ printk("EXT3-fs: mballoc: %u blocks %u reqs (%u success)\n",
++ atomic_read(&sbi->s_bal_allocated),
++ atomic_read(&sbi->s_bal_reqs),
++ atomic_read(&sbi->s_bal_success));
++ printk("EXT3-fs: mballoc: %u extents scanned, %u goal hits, "
++ "%u 2^N hits, %u breaks\n",
++ atomic_read(&sbi->s_bal_ex_scanned),
++ atomic_read(&sbi->s_bal_goals),
++ atomic_read(&sbi->s_bal_2orders),
++ atomic_read(&sbi->s_bal_breaks));
++ printk("EXT3-fs: mballoc: %lu generated and it took %Lu\n",
++ sbi->s_mb_buddies_generated++,
++ sbi->s_mb_generation_time);
++ }
++
++ ext3_mb_history_release(sb);
+
+ return 0;
+}
+ mb_debug("\n");
+ ext3_unlock_group(sb, md->group);
+
++ /* balance refcounts from ext3_mb_free_metadata() */
++ page_cache_release(e3b.bd_buddy_page);
++ page_cache_release(e3b.bd_bitmap_page);
++
+ kfree(md);
-+ ext3_mb_dirty_buddy(&e3b);
+ ext3_mb_release_desc(&e3b);
+
+ } while (md);
+ /* new transaction! time to close last one and free blocks for
+ * committed transaction. we know that only one transaction can be
+ * active, so the previous transaction may still be being logged and we
-+ * know that transaction before previous is known to be alreade
++ * know that transaction before previous is known to be already
+ * logged. this means that now we may free blocks freed in all
+ * transactions before previous one. hope I'm clear enough ... */
+
+int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b,
+ int group, int block, int count)
+{
-+ struct ext3_buddy_group_blocks *db = e3b->bd_bd;
++ struct ext3_group_info *db = e3b->bd_info;
+ struct super_block *sb = e3b->bd_sb;
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
+ struct ext3_free_metadata *md;
+ int i;
+
++ J_ASSERT(e3b->bd_bitmap_page != NULL);
++ J_ASSERT(e3b->bd_buddy_page != NULL);
++
+ ext3_lock_group(sb, group);
+ for (i = 0; i < count; i++) {
+ md = db->bb_md_cur;
+ spin_lock(&sbi->s_md_lock);
+ list_add(&md->list, &sbi->s_active_transaction);
+ spin_unlock(&sbi->s_md_lock);
++ /* protect buddy cache from being freed,
++ * otherwise we'll refresh it from
++ * on-disk bitmap and lose not-yet-available
++ * blocks */
++ page_cache_get(e3b->bd_buddy_page);
++ page_cache_get(e3b->bd_bitmap_page);
+ db->bb_md_cur = md;
+ db->bb_tid = handle->h_transaction->t_tid;
+ mb_debug("new md 0x%p for group %u\n",
+ if (err)
+ goto error_return;
+
-+ if (unlikely(ext3_mb_aggressive)) {
++#ifdef AGGRESSIVE_CHECK
++ {
+ int i;
+ for (i = 0; i < count; i++)
+ J_ASSERT(mb_test_bit(bit + i, bitmap_bh->b_data));
+ }
-+
++#endif
+ mb_clear_bits(bitmap_bh->b_data, bit, count);
+
+ /* We dirtied the bitmap block */
+ spin_unlock(sb_bgl_lock(sbi, block_group));
+ percpu_counter_mod(&sbi->s_freeblocks_counter, count);
+
-+ ext3_mb_dirty_buddy(&e3b);
+ ext3_mb_release_desc(&e3b);
+
+ *freed = count;
+}
+
+
-+extern void ext3_free_blocks_old(handle_t *handle, struct inode *inode,
-+ unsigned long block, unsigned long count);
-+void ext3_free_blocks(handle_t *handle, struct inode *inode,
-+ unsigned long block, unsigned long count, int metadata)
++void ext3_free_blocks(handle_t *handle, struct inode * inode,
++ unsigned long block, unsigned long count, int metadata)
+{
++ struct super_block *sb;
+ int freed;
+
-+ if (!test_opt(inode->i_sb, MBALLOC) ||
-+ EXT3_SB(inode->i_sb)->s_buddy_blocks == NULL)
++ sb = inode->i_sb;
++ if (!test_opt(sb, MBALLOC))
+ ext3_free_blocks_old(handle, inode, block, count);
+ else {
-+ ext3_mb_free_blocks(handle, inode, block,count,metadata,&freed);
++ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed);
+ if (freed)
+ DQUOT_FREE_BLOCK(inode, freed);
+ }
+ return;
+}
-Index: linux-2.6.5-7.201/fs/ext3/proc.c
-===================================================================
---- linux-2.6.5-7.201.orig/fs/ext3/proc.c 2005-10-13 19:40:57.851699336 +0400
-+++ linux-2.6.5-7.201/fs/ext3/proc.c 2005-10-14 09:02:36.000000000 +0400
-@@ -0,0 +1,195 @@
-+#include <linux/config.h>
-+#include <linux/fs.h>
-+#include <linux/init.h>
-+#include <linux/module.h>
-+#include <linux/kernel.h>
-+#include <linux/jbd.h>
-+#include <linux/ext3_fs.h>
-+#include <linux/ext3_jbd.h>
-+#include <linux/proc_fs.h>
-+#include <linux/errno.h>
-+#include <asm/uaccess.h>
-+
+
+#define EXT3_ROOT "ext3"
-+#define EXT3_MB_AGGRESSIVE_NAME "mb_aggressive"
+#define EXT3_MB_STATS_NAME "mb_stats"
+#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan"
++#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan"
+
-+
-+static struct proc_dir_entry *proc_root_ext3;
-+
-+
-+static int ext3_mb_aggressive_read(char *page, char **start, off_t off,
-+ int count, int *eof, void *data)
++static int ext3_mb_stats_read(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
+{
+ int len;
+
+ if (off != 0)
+ return 0;
+
-+ len = sprintf(page, "%ld\n", ext3_mb_aggressive);
++ len = sprintf(page, "%ld\n", ext3_mb_stats);
+ *start = page;
+ return len;
+}
+
-+static int ext3_mb_aggressive_write(struct file *file, const char *buffer,
-+ unsigned long count, void *data)
++static int ext3_mb_stats_write(struct file *file, const char *buffer,
++ unsigned long count, void *data)
+{
+ char str[32];
+
+ if (count >= sizeof(str)) {
+ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n",
-+ EXT3_MB_AGGRESSIVE_NAME, sizeof(str));
++ EXT3_MB_STATS_NAME, (int)sizeof(str));
+ return -EOVERFLOW;
+ }
+
+ return -EFAULT;
+
+ /* Only set to 0 or 1 respectively; zero->0; non-zero->1 */
-+ ext3_mb_aggressive = (simple_strtol(str, NULL, 0) != 0);
++ ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0);
+ return count;
+}
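++
++/*
++ * These read/write handlers back the tunables created in
++ * init_ext3_proc() under proc_root_fs, typically /proc/fs/ext3/, e.g.:
++ * echo 200 > /proc/fs/ext3/mb_max_to_scan
++ */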
+
-+static int ext3_mb_stats_read(char *page, char **start, off_t off,
-+ int count, int *eof, void *data)
++static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
+{
+ int len;
+
+ if (off != 0)
+ return 0;
+
-+ len = sprintf(page, "%ld\n", ext3_mb_stats);
++ len = sprintf(page, "%ld\n", ext3_mb_max_to_scan);
+ *start = page;
+ return len;
+}
+
-+static int ext3_mb_stats_write(struct file *file, const char *buffer,
-+ unsigned long count, void *data)
++static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer,
++ unsigned long count, void *data)
+{
+ char str[32];
++ long value;
+
+ if (count >= sizeof(str)) {
+ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n",
-+ EXT3_MB_STATS_NAME, sizeof(str));
++ EXT3_MB_MAX_TO_SCAN_NAME, (int)sizeof(str));
+ return -EOVERFLOW;
+ }
+
+ return -EFAULT;
+
+ /* Accept only positive values */
-+ ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0);
++ value = simple_strtol(str, NULL, 0);
++ if (value <= 0)
++ return -ERANGE;
++
++ ext3_mb_max_to_scan = value;
++
+ return count;
+}
+
-+static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off,
-+ int count, int *eof, void *data)
++static int ext3_mb_min_to_scan_read(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
+{
+ int len;
+
+ if (off != 0)
+ return 0;
+
-+ len = sprintf(page, "%ld\n", ext3_mb_max_to_scan);
++ len = sprintf(page, "%ld\n", ext3_mb_min_to_scan);
+ *start = page;
+ return len;
+}
+
-+static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer,
-+ unsigned long count, void *data)
++static int ext3_mb_min_to_scan_write(struct file *file, const char *buffer,
++ unsigned long count, void *data)
+{
+ char str[32];
+ long value;
+
+ if (count >= sizeof(str)) {
+ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n",
-+ EXT3_MB_MAX_TO_SCAN_NAME, sizeof(str));
++ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str));
+ return -EOVERFLOW;
+ }
+
+
+ /* Accept only positive values */
+ value = simple_strtol(str, NULL, 0);
-+ if (value <= 0)
++ if (value <= 0)
+ return -ERANGE;
+
-+ ext3_mb_max_to_scan = value;
++ ext3_mb_min_to_scan = value;
+
+ return count;
+}
+
+int __init init_ext3_proc(void)
+{
-+ struct proc_dir_entry *proc_ext3_mb_aggressive;
+ struct proc_dir_entry *proc_ext3_mb_stats;
+ struct proc_dir_entry *proc_ext3_mb_max_to_scan;
++ struct proc_dir_entry *proc_ext3_mb_min_to_scan;
+
+ proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs);
+ if (proc_root_ext3 == NULL) {
-+ printk(KERN_ERR "EXT3: Unable to create %s\n", EXT3_ROOT);
-+ return -EIO;
-+ }
-+
-+ /* Initialize EXT3_MB_AGGRESSIVE_NAME */
-+ proc_ext3_mb_aggressive = create_proc_entry(EXT3_MB_AGGRESSIVE_NAME,
-+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
-+ if (proc_ext3_mb_aggressive == NULL) {
-+ printk(KERN_ERR "EXT3: Unable to create %s\n",
-+ EXT3_MB_AGGRESSIVE_NAME);
-+ remove_proc_entry(EXT3_ROOT, proc_root_fs);
++ printk(KERN_ERR "EXT3: Unable to create %s\n", EXT3_ROOT);
+ return -EIO;
+ }
+
-+ proc_ext3_mb_aggressive->data = NULL;
-+ proc_ext3_mb_aggressive->read_proc = ext3_mb_aggressive_read;
-+ proc_ext3_mb_aggressive->write_proc = ext3_mb_aggressive_write;
-+
+ /* Initialize EXT3_MB_STATS_NAME */
+ proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME,
-+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
+ if (proc_ext3_mb_stats == NULL) {
-+ printk(KERN_ERR "EXT3: Unable to create %s\n",
-+ EXT3_MB_STATS_NAME);
-+ remove_proc_entry(EXT3_MB_AGGRESSIVE_NAME, proc_root_ext3);
++ printk(KERN_ERR "EXT3: Unable to create %s\n",
++ EXT3_MB_STATS_NAME);
+ remove_proc_entry(EXT3_ROOT, proc_root_fs);
+ return -EIO;
+ }
+
+ /* Initialize EXT3_MAX_TO_SCAN_NAME */
+ proc_ext3_mb_max_to_scan = create_proc_entry(
-+ EXT3_MB_MAX_TO_SCAN_NAME,
-+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++ EXT3_MB_MAX_TO_SCAN_NAME,
++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
+ if (proc_ext3_mb_max_to_scan == NULL) {
-+ printk(KERN_ERR "EXT3: Unable to create %s\n",
-+ EXT3_MB_STATS_NAME);
++ printk(KERN_ERR "EXT3: Unable to create %s\n",
++ EXT3_MB_MAX_TO_SCAN_NAME);
+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
-+ remove_proc_entry(EXT3_MB_AGGRESSIVE_NAME, proc_root_ext3);
+ remove_proc_entry(EXT3_ROOT, proc_root_fs);
+ return -EIO;
+ }
+ proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read;
+ proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write;
+
++ /* Initialize EXT3_MIN_TO_SCAN_NAME */
++ proc_ext3_mb_min_to_scan = create_proc_entry(
++ EXT3_MB_MIN_TO_SCAN_NAME,
++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++ if (proc_ext3_mb_min_to_scan == NULL) {
++ printk(KERN_ERR "EXT3: Unable to create %s\n",
++ EXT3_MB_MIN_TO_SCAN_NAME);
++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_ROOT, proc_root_fs);
++ return -EIO;
++ }
++
++ proc_ext3_mb_min_to_scan->data = NULL;
++ proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read;
++ proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write;
++
+ return 0;
+}
+
+void exit_ext3_proc(void)
+{
-+ remove_proc_entry(EXT3_MB_AGGRESSIVE_NAME, proc_root_ext3);
+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
+ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3);
+ remove_proc_entry(EXT3_ROOT, proc_root_fs);
+}
-Index: linux-2.6.5-7.201/fs/ext3/inode.c
-===================================================================
---- linux-2.6.5-7.201.orig/fs/ext3/inode.c 2005-10-14 08:59:38.000000000 +0400
-+++ linux-2.6.5-7.201/fs/ext3/inode.c 2005-10-14 08:59:39.000000000 +0400
-@@ -572,7 +572,7 @@
- ext3_journal_forget(handle, branch[i].bh);
- }
- for (i = 0; i < keys; i++)
-- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
-+ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1);
- return err;
- }
-
-@@ -673,7 +673,7 @@
- if (err == -EAGAIN)
- for (i = 0; i < num; i++)
- ext3_free_blocks(handle, inode,
-- le32_to_cpu(where[i].key), 1);
-+ le32_to_cpu(where[i].key), 1, 1);
- return err;
- }
-
-@@ -1835,7 +1835,7 @@
- }
- }
-
-- ext3_free_blocks(handle, inode, block_to_free, count);
-+ ext3_free_blocks(handle, inode, block_to_free, count, 1);
- }
-
- /**
-@@ -2006,7 +2006,7 @@
- ext3_journal_test_restart(handle, inode);
- }
-
-- ext3_free_blocks(handle, inode, nr, 1);
-+ ext3_free_blocks(handle, inode, nr, 1, 1);
-
- if (parent_bh) {
- /*
-Index: linux-2.6.5-7.201/fs/ext3/super.c
++
+Index: linux-2.6.5-7.201/fs/ext3/Makefile
===================================================================
---- linux-2.6.5-7.201.orig/fs/ext3/super.c 2005-10-14 08:59:38.000000000 +0400
-+++ linux-2.6.5-7.201/fs/ext3/super.c 2005-10-14 09:02:36.000000000 +0400
-@@ -389,6 +389,7 @@
- struct ext3_super_block *es = sbi->s_es;
- int i;
-
-+ ext3_mb_release(sb);
- ext3_ext_release(sb);
- ext3_xattr_put_super(sb);
- journal_destroy(sbi->s_journal);
-@@ -543,6 +544,7 @@
- Opt_commit, Opt_journal_update, Opt_journal_inum,
- Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
- Opt_ignore, Opt_barrier, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
-+ Opt_mballoc, Opt_mbfactor,
- Opt_err, Opt_extents, Opt_extdebug
- };
-
-@@ -590,6 +592,8 @@
- {Opt_iopen_nopriv, "iopen_nopriv"},
- {Opt_extents, "extents"},
- {Opt_extdebug, "extdebug"},
-+ {Opt_mballoc, "mballoc"},
-+ {Opt_mbfactor, "mbfactor=%u"},
- {Opt_err, NULL}
- };
-
-@@ -811,6 +815,16 @@
- case Opt_extdebug:
- set_opt (sbi->s_mount_opt, EXTDEBUG);
- break;
-+ case Opt_mballoc:
-+ set_opt (sbi->s_mount_opt, MBALLOC);
-+ break;
-+ case Opt_mbfactor:
-+ if (match_int(&args[0], &option))
-+ return 0;
-+ if (option < 0)
-+ return 0;
-+ sbi->s_mb_factor = option;
-+ break;
- default:
- printk (KERN_ERR
- "EXT3-fs: Unrecognized mount option \"%s\" "
-@@ -1464,6 +1478,7 @@
- ext3_count_dirs(sb));
-
- ext3_ext_init(sb);
-+ ext3_mb_init(sb, needs_recovery);
-
- return 0;
-
-@@ -2112,7 +2127,13 @@
+--- linux-2.6.5-7.201.orig/fs/ext3/Makefile 2005-12-17 02:53:30.000000000 +0300
++++ linux-2.6.5-7.201/fs/ext3/Makefile 2005-12-17 03:10:23.000000000 +0300
+@@ -5,7 +5,8 @@
+ obj-$(CONFIG_EXT3_FS) += ext3.o
- static int __init init_ext3_fs(void)
- {
-- int err = init_ext3_xattr();
-+ int err;
-+
-+ err = init_ext3_proc();
-+ if (err)
-+ return err;
-+
-+ err = init_ext3_xattr();
- if (err)
- return err;
- err = init_inodecache();
-@@ -2141,6 +2162,7 @@
- unregister_filesystem(&ext3_fs_type);
- destroy_inodecache();
- exit_ext3_xattr();
-+ exit_ext3_proc();
- }
+ ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
+- ioctl.o namei.o super.o symlink.o hash.o extents.o
++ ioctl.o namei.o super.o symlink.o hash.o extents.o \
++ mballoc.o
- int ext3_prep_san_write(struct inode *inode, long *blocks,
+ ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
+ ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
--- /dev/null
+Index: linux-2.6.12.6/include/linux/ext3_fs.h
+===================================================================
+--- linux-2.6.12.6.orig/include/linux/ext3_fs.h 2005-12-17 02:17:16.000000000 +0300
++++ linux-2.6.12.6/include/linux/ext3_fs.h 2005-12-17 02:21:21.000000000 +0300
+@@ -57,6 +57,14 @@ struct statfs;
+ #define ext3_debug(f, a...) do {} while (0)
+ #endif
+
++#define EXT3_MULTIBLOCK_ALLOCATOR 1
++
++#define EXT3_MB_HINT_MERGE 1
++#define EXT3_MB_HINT_RESERVED 2
++#define EXT3_MB_HINT_METADATA 4
++#define EXT3_MB_HINT_FIRST 8
++#define EXT3_MB_HINT_BEST 16
++
+ /*
+ * Special inodes numbers
+ */
+@@ -366,6 +374,7 @@ struct ext3_inode {
+ #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000 /* Make iopen world-readable */
+ #define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */
+ #define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */
++#define EXT3_MOUNT_MBALLOC 0x800000/* Buddy allocation support */
+
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef clear_opt
+@@ -727,7 +736,7 @@ extern int ext3_bg_has_super(struct supe
+ extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
+ extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *);
+ extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long,
+- unsigned long);
++ unsigned long, int);
+ extern void ext3_free_blocks_sb (handle_t *, struct super_block *,
+ unsigned long, unsigned long, int *);
+ extern unsigned long ext3_count_free_blocks (struct super_block *);
+@@ -848,6 +857,17 @@ extern void ext3_extents_initialize_bloc
+ extern int ext3_ext_ioctl(struct inode *inode, struct file *filp,
+ unsigned int cmd, unsigned long arg);
+
++/* mballoc.c */
++extern long ext3_mb_stats;
++extern long ext3_mb_max_to_scan;
++extern int ext3_mb_init(struct super_block *, int);
++extern int ext3_mb_release(struct super_block *);
++extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *);
++extern int ext3_mb_reserve_blocks(struct super_block *, int);
++extern void ext3_mb_release_blocks(struct super_block *, int);
++int __init init_ext3_proc(void);
++void exit_ext3_proc(void);
++
+ #endif /* __KERNEL__ */
+
+ /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */
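
The declarations above fix the allocator's calling convention: ext3_mb_new_blocks() returns the first allocated block (0 on failure), reports the error through *errp, and shrinks *len to what was actually allocated. A hedged sketch of a call site, kernel context assumed, not compilable stand-alone:

/* sketch only: 'handle' and 'inode' come from the caller's transaction */
int err = 0;
int len = 16;			/* want up to 16 contiguous blocks */
unsigned long goal = 12345;	/* preferred start, e.g. end of the last extent */
int block;

block = ext3_mb_new_blocks(handle, inode, goal, &len,
			   EXT3_MB_HINT_MERGE, &err);
if (block == 0)
	return err;		/* -ENOSPC, -EDQUOT, -EIO, ... */
/* success: blocks [block, block + len) are ours; len never grows */
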
+Index: linux-2.6.12.6/include/linux/ext3_fs_sb.h
+===================================================================
+--- linux-2.6.12.6.orig/include/linux/ext3_fs_sb.h 2005-08-29 20:55:27.000000000 +0400
++++ linux-2.6.12.6/include/linux/ext3_fs_sb.h 2005-12-17 02:21:21.000000000 +0300
+@@ -21,8 +21,14 @@
+ #include <linux/wait.h>
+ #include <linux/blockgroup_lock.h>
+ #include <linux/percpu_counter.h>
++#include <linux/list.h>
+ #endif
+ #include <linux/rbtree.h>
++#include <linux/proc_fs.h>
++
++struct ext3_buddy_group_blocks;
++struct ext3_mb_history;
++#define EXT3_BB_MAX_BLOCKS
+
+ /*
+ * third extended-fs super-block data in memory
+@@ -78,6 +84,38 @@ struct ext3_sb_info {
+ char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
+ int s_jquota_fmt; /* Format of quota to use */
+ #endif
++
++ /* for buddy allocator */
++ struct ext3_group_info **s_group_info;
++ struct inode *s_buddy_cache;
++ long s_blocks_reserved;
++ spinlock_t s_reserve_lock;
++ struct list_head s_active_transaction;
++ struct list_head s_closed_transaction;
++ struct list_head s_committed_transaction;
++ spinlock_t s_md_lock;
++ tid_t s_last_transaction;
++ int s_mb_factor;
++ unsigned short *s_mb_offsets, *s_mb_maxs;
++
++ /* history to debug policy */
++ struct ext3_mb_history *s_mb_history;
++ int s_mb_history_cur;
++ int s_mb_history_max;
++ struct proc_dir_entry *s_mb_proc;
++ spinlock_t s_mb_history_lock;
++
++ /* stats for buddy allocator */
++ atomic_t s_bal_reqs; /* number of reqs with len > 1 */
++ atomic_t s_bal_success; /* we found long enough chunks */
++ atomic_t s_bal_allocated; /* in blocks */
++ atomic_t s_bal_ex_scanned; /* total extents scanned */
++ atomic_t s_bal_goals; /* goal hits */
++ atomic_t s_bal_breaks; /* too long searches */
++ atomic_t s_bal_2orders; /* 2^order hits */
++ spinlock_t s_bal_lock;
++ unsigned long s_mb_buddies_generated;
++ unsigned long long s_mb_generation_time;
+ };
+
+ #endif /* _LINUX_EXT3_FS_SB */
+Index: linux-2.6.12.6/fs/ext3/super.c
+===================================================================
+--- linux-2.6.12.6.orig/fs/ext3/super.c 2005-12-17 02:17:16.000000000 +0300
++++ linux-2.6.12.6/fs/ext3/super.c 2005-12-17 02:21:21.000000000 +0300
+@@ -387,6 +387,7 @@ static void ext3_put_super (struct super
+ struct ext3_super_block *es = sbi->s_es;
+ int i;
+
++ ext3_mb_release(sb);
+ ext3_ext_release(sb);
+ ext3_xattr_put_super(sb);
+ journal_destroy(sbi->s_journal);
+@@ -597,6 +598,7 @@ enum {
+ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0,
+ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
+ Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_extents, Opt_extdebug,
++ Opt_mballoc,
+ };
+
+ static match_table_t tokens = {
+@@ -649,6 +651,7 @@ static match_table_t tokens = {
+ {Opt_iopen_nopriv, "iopen_nopriv"},
+ {Opt_extents, "extents"},
+ {Opt_extdebug, "extdebug"},
++ {Opt_mballoc, "mballoc"},
+ {Opt_barrier, "barrier=%u"},
+ {Opt_err, NULL},
+ {Opt_resize, "resize"},
+@@ -964,6 +967,9 @@ clear_qf_name:
+ case Opt_extdebug:
+ set_opt (sbi->s_mount_opt, EXTDEBUG);
+ break;
++ case Opt_mballoc:
++ set_opt (sbi->s_mount_opt, MBALLOC);
++ break;
+ default:
+ printk (KERN_ERR
+ "EXT3-fs: Unrecognized mount option \"%s\" "
+@@ -1669,6 +1675,7 @@ static int ext3_fill_super (struct super
+ ext3_count_dirs(sb));
+
+ ext3_ext_init(sb);
++ ext3_mb_init(sb, needs_recovery);
+ lock_kernel();
+ return 0;
+
+@@ -2548,7 +2555,13 @@ static struct file_system_type ext3_fs_t
+
+ static int __init init_ext3_fs(void)
+ {
+- int err = init_ext3_xattr();
++ int err;
++
++ err = init_ext3_proc();
++ if (err)
++ return err;
++
++ err = init_ext3_xattr();
+ if (err)
+ return err;
+ err = init_inodecache();
+@@ -2570,6 +2583,7 @@ static void __exit exit_ext3_fs(void)
+ unregister_filesystem(&ext3_fs_type);
+ destroy_inodecache();
+ exit_ext3_xattr();
++ exit_ext3_proc();
+ }
+
+ int ext3_prep_san_write(struct inode *inode, long *blocks,
+Index: linux-2.6.12.6/fs/ext3/extents.c
+===================================================================
+--- linux-2.6.12.6.orig/fs/ext3/extents.c 2005-12-17 02:17:16.000000000 +0300
++++ linux-2.6.12.6/fs/ext3/extents.c 2005-12-17 02:21:21.000000000 +0300
+@@ -771,7 +771,7 @@ cleanup:
+ for (i = 0; i < depth; i++) {
+ if (!ablocks[i])
+ continue;
+- ext3_free_blocks(handle, tree->inode, ablocks[i], 1);
++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1);
+ }
+ }
+ kfree(ablocks);
+@@ -1428,7 +1428,7 @@ int ext3_ext_rm_idx(handle_t *handle, st
+ path->p_idx->ei_leaf);
+ bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf);
+ ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf);
+- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1);
++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1);
+ return err;
+ }
+
+@@ -1913,10 +1913,12 @@ ext3_remove_blocks(struct ext3_extents_t
+ int needed = ext3_remove_blocks_credits(tree, ex, from, to);
+ handle_t *handle = ext3_journal_start(tree->inode, needed);
+ struct buffer_head *bh;
+- int i;
++ int i, metadata = 0;
+
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
++ if (S_ISDIR(tree->inode->i_mode) || S_ISLNK(tree->inode->i_mode))
++ metadata = 1;
+ if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) {
+ /* tail removal */
+ unsigned long num, start;
+@@ -1928,7 +1930,7 @@ ext3_remove_blocks(struct ext3_extents_t
+ bh = sb_find_get_block(tree->inode->i_sb, start + i);
+ ext3_forget(handle, 0, tree->inode, bh, start + i);
+ }
+- ext3_free_blocks(handle, tree->inode, start, num);
++ ext3_free_blocks(handle, tree->inode, start, num, metadata);
+ } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) {
+ printk("strange request: removal %lu-%lu from %u:%u\n",
+ from, to, ex->ee_block, ex->ee_len);
+Index: linux-2.6.12.6/fs/ext3/inode.c
+===================================================================
+--- linux-2.6.12.6.orig/fs/ext3/inode.c 2005-12-17 02:17:16.000000000 +0300
++++ linux-2.6.12.6/fs/ext3/inode.c 2005-12-17 02:21:21.000000000 +0300
+@@ -564,7 +564,7 @@ static int ext3_alloc_branch(handle_t *h
+ ext3_journal_forget(handle, branch[i].bh);
+ }
+ for (i = 0; i < keys; i++)
+- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
++ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1);
+ return err;
+ }
+
+@@ -1850,7 +1850,7 @@ ext3_clear_blocks(handle_t *handle, stru
+ }
+ }
+
+- ext3_free_blocks(handle, inode, block_to_free, count);
++ ext3_free_blocks(handle, inode, block_to_free, count, 1);
+ }
+
+ /**
+@@ -2023,7 +2023,7 @@ static void ext3_free_branches(handle_t
+ ext3_journal_test_restart(handle, inode);
+ }
+
+- ext3_free_blocks(handle, inode, nr, 1);
++ ext3_free_blocks(handle, inode, nr, 1, 1);
+
+ if (parent_bh) {
+ /*
+Index: linux-2.6.12.6/fs/ext3/balloc.c
+===================================================================
+--- linux-2.6.12.6.orig/fs/ext3/balloc.c 2005-08-29 20:55:27.000000000 +0400
++++ linux-2.6.12.6/fs/ext3/balloc.c 2005-12-17 02:21:21.000000000 +0300
+@@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_
+ *
+ * Return buffer_head on success or NULL in case of failure.
+ */
+-static struct buffer_head *
++struct buffer_head *
+ read_block_bitmap(struct super_block *sb, unsigned int block_group)
+ {
+ struct ext3_group_desc * desc;
+@@ -490,24 +490,6 @@ error_return:
+ return;
+ }
+
+-/* Free given blocks, update quota and i_blocks field */
+-void ext3_free_blocks(handle_t *handle, struct inode *inode,
+- unsigned long block, unsigned long count)
+-{
+- struct super_block * sb;
+- int dquot_freed_blocks;
+-
+- sb = inode->i_sb;
+- if (!sb) {
+- printk ("ext3_free_blocks: nonexistent device");
+- return;
+- }
+- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
+- if (dquot_freed_blocks)
+- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
+- return;
+-}
+-
+ /*
+ * For ext3 allocations, we must not reuse any blocks which are
+ * allocated in the bitmap buffer's "last committed data" copy. This
+@@ -1162,7 +1144,7 @@ int ext3_should_retry_alloc(struct super
+ * bitmap, and then for any free bit if that fails.
+ * This function also updates quota and i_blocks field.
+ */
+-int ext3_new_block(handle_t *handle, struct inode *inode,
++int ext3_new_block_old(handle_t *handle, struct inode *inode,
+ unsigned long goal, int *errp)
+ {
+ struct buffer_head *bitmap_bh = NULL;
+Index: linux-2.6.12.6/fs/ext3/xattr.c
+===================================================================
+--- linux-2.6.12.6.orig/fs/ext3/xattr.c 2005-08-29 20:55:27.000000000 +0400
++++ linux-2.6.12.6/fs/ext3/xattr.c 2005-12-17 02:21:33.000000000 +0300
+@@ -484,7 +484,7 @@ ext3_xattr_release_block(handle_t *handl
+ ea_bdebug(bh, "refcount now=0; freeing");
+ if (ce)
+ mb_cache_entry_free(ce);
+- ext3_free_blocks(handle, inode, bh->b_blocknr, 1);
++ ext3_free_blocks(handle, inode, bh->b_blocknr, 1, 1);
+ get_bh(bh);
+ ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
+ } else {
+@@ -804,7 +804,7 @@ inserted:
+ new_bh = sb_getblk(sb, block);
+ if (!new_bh) {
+ getblk_failed:
+- ext3_free_blocks(handle, inode, block, 1);
++ ext3_free_blocks(handle, inode, block, 1, 1);
+ error = -EIO;
+ goto cleanup;
+ }
+Index: linux-2.6.12.6/fs/ext3/mballoc.c
+===================================================================
+--- linux-2.6.12.6.orig/fs/ext3/mballoc.c 2005-12-09 13:08:53.191437750 +0300
++++ linux-2.6.12.6/fs/ext3/mballoc.c 2005-12-17 02:21:21.000000000 +0300
+@@ -0,0 +1,2434 @@
++/*
++ * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com
++ * Written by Alex Tomas <alex@clusterfs.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 as
++ * published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
++ */
++
++
++/*
++ * mballoc.c contains the multiblocks allocation routines
++ */
++
++#include <linux/config.h>
++#include <linux/time.h>
++#include <linux/fs.h>
++#include <linux/namei.h>
++#include <linux/jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_jbd.h>
++#include <linux/quotaops.h>
++#include <linux/buffer_head.h>
++#include <linux/module.h>
++#include <linux/swap.h>
++#include <linux/proc_fs.h>
++#include <linux/pagemap.h>
++#include <linux/seq_file.h>
++
++/*
++ * TODO:
++ * - bitmap read-ahead (proposed by Oleg Drokin aka green)
++ * - track min/max extents in each group for better group selection
++ * - mb_mark_used() may allocate chunk right after splitting buddy
++ * - special flag to advise the allocator to look for requested + N blocks
++ * this may improve interaction between extents and mballoc
++ * - tree of groups sorted by number of free blocks
++ * - percpu reservation code (hotpath)
++ * - error handling
++ */
++
++/*
++ * with AGGRESSIVE_CHECK defined, the allocator runs consistency checks
++ * over its structures. These checks slow things down a lot
++ */
++#define AGGRESSIVE_CHECK__
++
++/*
++ * with MB_DEBUG defined, mballoc prints verbose debugging messages
++ */
++#define MB_DEBUG__
++#ifdef MB_DEBUG
++#define mb_debug(fmt,a...) printk(fmt, ##a)
++#else
++#define mb_debug(fmt,a...)
++#endif
++
++/*
++ * with EXT3_MB_HISTORY mballoc stores last N allocations in memory
++ * and you can monitor it in /proc/fs/ext3/<dev>/mb_history
++ */
++#define EXT3_MB_HISTORY
++
++/*
++ * How many found extents mballoc may examine, at most, while looking for the best one
++ */
++long ext3_mb_max_to_scan = 500;
++
++/*
++ * How many extents mballoc must examine before settling for a good-enough one
++ */
++long ext3_mb_min_to_scan = 30;
++
++/*
++ * with 'ext3_mb_stats' set, the allocator collects stats that are
++ * shown at umount time. Collecting them has a cost, though!
++ */
++
++long ext3_mb_stats = 1;
++
++#ifdef EXT3_BB_MAX_BLOCKS
++#undef EXT3_BB_MAX_BLOCKS
++#endif
++#define EXT3_BB_MAX_BLOCKS 30
++
++struct ext3_free_metadata {
++ unsigned short group;
++ unsigned short num;
++ unsigned short blocks[EXT3_BB_MAX_BLOCKS];
++ struct list_head list;
++};
++
++struct ext3_group_info {
++ unsigned long bb_state;
++ unsigned long bb_tid;
++ struct ext3_free_metadata *bb_md_cur;
++ unsigned short bb_first_free;
++ unsigned short bb_free;
++ unsigned short bb_fragments;
++ unsigned short bb_counters[];
++};
++
++
++#define EXT3_GROUP_INFO_NEED_INIT_BIT 0
++#define EXT3_GROUP_INFO_LOCKED_BIT 1
++
++#define EXT3_MB_GRP_NEED_INIT(grp) \
++ (test_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &(grp)->bb_state))
++
++struct ext3_free_extent {
++ __u16 fe_start;
++ __u16 fe_len;
++ __u16 fe_group;
++};
++
++struct ext3_allocation_context {
++ struct super_block *ac_sb;
++
++ /* search goals */
++ struct ext3_free_extent ac_g_ex;
++
++ /* the best found extent */
++ struct ext3_free_extent ac_b_ex;
++
++ /* number of iterations done. we have to track to limit searching */
++ unsigned long ac_ex_scanned;
++ __u16 ac_groups_scanned;
++ __u16 ac_found;
++ __u16 ac_tail;
++ __u16 ac_buddy;
++ __u8 ac_status;
++ __u8 ac_flags; /* allocation hints */
++ __u8 ac_criteria;
++ __u8 ac_repeats;
++ __u8 ac_2order; /* if request is to allocate 2^N blocks and
++ * N > 0, the field stores N, otherwise 0 */
++};
++
++#define AC_STATUS_CONTINUE 1
++#define AC_STATUS_FOUND 2
++#define AC_STATUS_BREAK 3
++
++struct ext3_mb_history {
++ struct ext3_free_extent goal; /* goal allocation */
++ struct ext3_free_extent result; /* result allocation */
++ __u16 found; /* how many extents have been found */
++ __u16 groups; /* how many groups have been scanned */
++ __u16 tail; /* what tail broke some buddy */
++ __u16 buddy; /* buddy the tail ^^^ broke */
++ __u8 cr; /* which phase the result extent was found at */
++ __u8 merged;
++};
++
++struct ext3_buddy {
++ struct page *bd_buddy_page;
++ void *bd_buddy;
++ struct page *bd_bitmap_page;
++ void *bd_bitmap;
++ struct ext3_group_info *bd_info;
++ struct super_block *bd_sb;
++ __u16 bd_blkbits;
++ __u16 bd_group;
++};
++#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bitmap)
++#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy)
++
++#ifndef EXT3_MB_HISTORY
++#define ext3_mb_store_history(sb,ac)
++#else
++static void ext3_mb_store_history(struct super_block *,
++ struct ext3_allocation_context *ac);
++#endif
++
++#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
++
++static struct proc_dir_entry *proc_root_ext3;
++
++int ext3_create (struct inode *, struct dentry *, int, struct nameidata *);
++struct buffer_head * read_block_bitmap(struct super_block *, unsigned int);
++int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *);
++int ext3_mb_reserve_blocks(struct super_block *, int);
++void ext3_mb_release_blocks(struct super_block *, int);
++void ext3_mb_poll_new_transaction(struct super_block *, handle_t *);
++void ext3_mb_free_committed_blocks(struct super_block *);
++
++#if BITS_PER_LONG == 64
++#define mb_correct_addr_and_bit(bit,addr) \
++{ \
++ bit += ((unsigned long) addr & 7UL) << 3; \
++ addr = (void *) ((unsigned long) addr & ~7UL); \
++}
++#elif BITS_PER_LONG == 32
++#define mb_correct_addr_and_bit(bit,addr) \
++{ \
++ bit += ((unsigned long) addr & 3UL) << 3; \
++ addr = (void *) ((unsigned long) addr & ~3UL); \
++}
++#else
++#error "how many bits you are?!"
++#endif
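
The macro folds the misaligned low bits of 'addr' into the bit index, so the ext2_*_bit() primitives always see a long-aligned base. A userspace re-statement of the 32-bit variant, with the arithmetic spot-checked:

#include <assert.h>
#include <stdio.h>

/* local names for this sketch only */
static void correct_addr_and_bit(int *bit, void **addr)
{
	*bit += ((unsigned long)*addr & 3UL) << 3;
	*addr = (void *)((unsigned long)*addr & ~3UL);
}

int main(void)
{
	char buf[8] __attribute__((aligned(4)));
	void *addr = buf + 1;		/* misaligned by one byte */
	int bit = 2;

	correct_addr_and_bit(&bit, &addr);
	assert(addr == (void *)buf);	/* base rounded down to the long */
	assert(bit == 10);		/* one byte == 8 bits folded into the index */
	printf("bit %d at %p\n", bit, addr);
	return 0;
}
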
++
++static inline int mb_test_bit(int bit, void *addr)
++{
++ mb_correct_addr_and_bit(bit,addr);
++ return ext2_test_bit(bit, addr);
++}
++
++static inline void mb_set_bit(int bit, void *addr)
++{
++ mb_correct_addr_and_bit(bit,addr);
++ ext2_set_bit(bit, addr);
++}
++
++static inline void mb_set_bit_atomic(int bit, void *addr)
++{
++ mb_correct_addr_and_bit(bit,addr);
++ ext2_set_bit_atomic(NULL, bit, addr);
++}
++
++static inline void mb_clear_bit(int bit, void *addr)
++{
++ mb_correct_addr_and_bit(bit,addr);
++ ext2_clear_bit(bit, addr);
++}
++
++static inline void mb_clear_bit_atomic(int bit, void *addr)
++{
++ mb_correct_addr_and_bit(bit,addr);
++ ext2_clear_bit_atomic(NULL, bit, addr);
++}
++
++static inline int mb_find_next_zero_bit(void *addr, int max, int start)
++{
++ int fix;
++#if BITS_PER_LONG == 64
++ fix = ((unsigned long) addr & 7UL) << 3;
++ addr = (void *) ((unsigned long) addr & ~7UL);
++#elif BITS_PER_LONG == 32
++ fix = ((unsigned long) addr & 3UL) << 3;
++ addr = (void *) ((unsigned long) addr & ~3UL);
++#else
++#error "how many bits you are?!"
++#endif
++ max += fix;
++ start += fix;
++ return ext2_find_next_zero_bit(addr, max, start) - fix;
++}
++
++static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max)
++{
++ char *bb;
++
++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b));
++ J_ASSERT(max != NULL);
++
++ if (order > e3b->bd_blkbits + 1) {
++ *max = 0;
++ return NULL;
++ }
++
++ /* at order 0 we see each particular block */
++ *max = 1 << (e3b->bd_blkbits + 3);
++ if (order == 0)
++ return EXT3_MB_BITMAP(e3b);
++
++ bb = EXT3_MB_BUDDY(e3b) + EXT3_SB(e3b->bd_sb)->s_mb_offsets[order];
++ *max = EXT3_SB(e3b->bd_sb)->s_mb_maxs[order];
++
++ return bb;
++}
++
++#ifdef AGGRESSIVE_CHECK
++
++static void mb_check_buddy(struct ext3_buddy *e3b)
++{
++ int order = e3b->bd_blkbits + 1;
++ int max, max2, i, j, k, count;
++ int fragments = 0, fstart;
++ void *buddy, *buddy2;
++
++ if (!test_opt(e3b->bd_sb, MBALLOC))
++ return;
++
++ {
++ static int mb_check_counter = 0;
++ if (mb_check_counter++ % 300 != 0)
++ return;
++ }
++
++ while (order > 1) {
++ buddy = mb_find_buddy(e3b, order, &max);
++ J_ASSERT(buddy);
++ buddy2 = mb_find_buddy(e3b, order - 1, &max2);
++ J_ASSERT(buddy2);
++ J_ASSERT(buddy != buddy2);
++ J_ASSERT(max * 2 == max2);
++
++ count = 0;
++ for (i = 0; i < max; i++) {
++
++ if (mb_test_bit(i, buddy)) {
++ /* only single bit in buddy2 may be 1 */
++ if (!mb_test_bit(i << 1, buddy2))
++ J_ASSERT(mb_test_bit((i<<1)+1, buddy2));
++ else if (!mb_test_bit((i << 1) + 1, buddy2))
++ J_ASSERT(mb_test_bit(i << 1, buddy2));
++ continue;
++ }
++
++ /* both bits in buddy2 must be 0 */
++ J_ASSERT(mb_test_bit(i << 1, buddy2));
++ J_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
++
++ for (j = 0; j < (1 << order); j++) {
++ k = (i * (1 << order)) + j;
++ J_ASSERT(!mb_test_bit(k, EXT3_MB_BITMAP(e3b)));
++ }
++ count++;
++ }
++ J_ASSERT(e3b->bd_info->bb_counters[order] == count);
++ order--;
++ }
++
++ fstart = -1;
++ buddy = mb_find_buddy(e3b, 0, &max);
++ for (i = 0; i < max; i++) {
++ if (!mb_test_bit(i, buddy)) {
++ J_ASSERT(i >= e3b->bd_info->bb_first_free);
++ if (fstart == -1) {
++ fragments++;
++ fstart = i;
++ }
++ continue;
++ }
++ fstart = -1;
++ /* check used bits only */
++ for (j = 0; j < e3b->bd_blkbits + 1; j++) {
++ buddy2 = mb_find_buddy(e3b, j, &max2);
++ k = i >> j;
++ J_ASSERT(k < max2);
++ J_ASSERT(mb_test_bit(k, buddy2));
++ }
++ }
++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(e3b->bd_info));
++ J_ASSERT(e3b->bd_info->bb_fragments == fragments);
++}
++
++#else
++#define mb_check_buddy(e3b)
++#endif
++
++/* find most significant bit */
++static int inline fmsb(unsigned short word)
++{
++ int order;
++
++ if (word > 255) {
++ order = 7;
++ word >>= 8;
++ } else {
++ order = -1;
++ }
++
++ do {
++ order++;
++ word >>= 1;
++ } while (word != 0);
++
++ return order;
++}
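
fmsb() computes floor(log2(word)) for word > 0, skipping a whole byte at a time when it can. A userspace copy of the function with a few spot checks:

#include <assert.h>

/* copied verbatim from the kernel code above */
static int fmsb(unsigned short word)
{
	int order;

	if (word > 255) {
		order = 7;
		word >>= 8;
	} else {
		order = -1;
	}
	do {
		order++;
		word >>= 1;
	} while (word != 0);
	return order;
}

int main(void)
{
	assert(fmsb(1) == 0);
	assert(fmsb(5) == 2);	/* 0b101: highest set bit is bit 2 */
	assert(fmsb(255) == 7);
	assert(fmsb(256) == 8);	/* takes the >255 fast path first */
	return 0;
}
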
++
++static void inline
++ext3_mb_mark_free_simple(struct super_block *sb, void *buddy, unsigned first,
++ int len, struct ext3_group_info *grp)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ unsigned short min, max, chunk, border;
++
++ mb_debug("mark %u/%u free\n", first, len);
++ J_ASSERT(len < EXT3_BLOCKS_PER_GROUP(sb));
++
++ border = 2 << sb->s_blocksize_bits;
++
++ while (len > 0) {
++ /* find how many blocks can be covered from this position */
++ max = ffs(first | border) - 1;
++
++ /* find how many blocks of power 2 we need to mark */
++ min = fmsb(len);
++
++ mb_debug(" %u/%u -> max %u, min %u\n",
++ first & ((2 << sb->s_blocksize_bits) - 1),
++ len, max, min);
++
++ if (max < min)
++ min = max;
++ chunk = 1 << min;
++
++ /* mark multiblock chunks only */
++ grp->bb_counters[min]++;
++ if (min > 0) {
++ mb_debug(" set %u at %u \n", first >> min,
++ sbi->s_mb_offsets[min]);
++ mb_clear_bit(first >> min, buddy + sbi->s_mb_offsets[min]);
++ }
++
++ len -= chunk;
++ first += chunk;
++ }
++}
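
The loop carves [first, first+len) greedily into naturally aligned power-of-two chunks: the order is capped by the alignment of 'first' (the ffs() term) and by the remaining length (the fmsb() term). A userspace trace of the same arithmetic for first=5, len=11; the 1u<<15 merely stands in for 'border', and the GCC builtins replace ffs()/fmsb():

#include <stdio.h>

int main(void)
{
	unsigned first = 5, len = 11;

	while (len > 0) {
		unsigned max = __builtin_ctz(first | (1u << 15));	/* alignment cap */
		unsigned min = 31 - __builtin_clz(len);			/* fmsb(len) */
		unsigned order = min < max ? min : max;

		printf("chunk of %u at block %u (order %u)\n",
		       1u << order, first, order);
		first += 1u << order;
		len -= 1u << order;
	}
	/* prints: 1 at 5, 2 at 6, 8 at 8 -- [5,16) split on buddy
	 * boundaries, matching the bb_counters updates above */
	return 0;
}
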
++
++static void
++ext3_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap,
++ struct ext3_group_info *grp)
++{
++ unsigned short max = EXT3_BLOCKS_PER_GROUP(sb);
++ unsigned short i = 0, first, len;
++ unsigned free = 0, fragments = 0;
++ unsigned long long period = get_cycles();
++
++ i = mb_find_next_zero_bit(bitmap, max, 0);
++ grp->bb_first_free = i;
++ while (i < max) {
++ fragments++;
++ first = i;
++ i = find_next_bit(bitmap, max, i);
++ len = i - first;
++ free += len;
++ if (len > 1)
++ ext3_mb_mark_free_simple(sb, buddy, first, len, grp);
++ else
++ grp->bb_counters[0]++;
++ if (i < max)
++ i = mb_find_next_zero_bit(bitmap, max, i);
++ }
++ grp->bb_fragments = fragments;
++
++ /* bb_state shouldn't be modified because everyone else
++ * waits for init completion on the page lock */
++ clear_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &grp->bb_state);
++ if (free != grp->bb_free) {
++ printk("EXT3-fs: %u blocks in bitmap, %u in group descriptor\n",
++ free, grp->bb_free);
++ grp->bb_free = free;
++ }
++
++ period = get_cycles() - period;
++ spin_lock(&EXT3_SB(sb)->s_bal_lock);
++ EXT3_SB(sb)->s_mb_buddies_generated++;
++ EXT3_SB(sb)->s_mb_generation_time += period;
++ spin_unlock(&EXT3_SB(sb)->s_bal_lock);
++}
++
++static int ext3_mb_init_cache(struct page *page)
++{
++ int blocksize, blocks_per_page, groups_per_page;
++ int err = 0, i, first_group, first_block;
++ struct super_block *sb;
++ struct buffer_head *bhs = NULL;
++ struct buffer_head **bh;
++ struct inode *inode;
++ char *data, *bitmap;
++
++ mb_debug("init page %lu\n", page->index);
++
++ inode = page->mapping->host;
++ sb = inode->i_sb;
++ blocksize = 1 << inode->i_blkbits;
++ blocks_per_page = PAGE_CACHE_SIZE / blocksize;
++
++ groups_per_page = blocks_per_page >> 1;
++ if (groups_per_page == 0)
++ groups_per_page = 1;
++
++ /* allocate buffer_heads to read bitmaps */
++ if (groups_per_page > 1) {
++ err = -ENOMEM;
++ i = sizeof(struct buffer_head *) * groups_per_page;
++ bh = kmalloc(i, GFP_NOFS);
++ if (bh == NULL)
++ goto out;
++ memset(bh, 0, i);
++ } else
++ bh = &bhs;
++
++ first_group = page->index * blocks_per_page / 2;
++
++ /* read all groups the page covers into the cache */
++ for (i = 0; i < groups_per_page; i++) {
++ struct ext3_group_desc * desc;
++
++ if (first_group + i >= EXT3_SB(sb)->s_groups_count)
++ break;
++
++ err = -EIO;
++ desc = ext3_get_group_desc(sb, first_group + i, NULL);
++ if (desc == NULL)
++ goto out;
++
++ err = -ENOMEM;
++ bh[i] = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap));
++ if (bh[i] == NULL)
++ goto out;
++
++ if (buffer_uptodate(bh[i]))
++ continue;
++
++ lock_buffer(bh[i]);
++ if (buffer_uptodate(bh[i])) {
++ unlock_buffer(bh[i]);
++ continue;
++ }
++
++ get_bh(bh[i]);
++ bh[i]->b_end_io = end_buffer_read_sync;
++ submit_bh(READ, bh[i]);
++ mb_debug("read bitmap for group %u\n", first_group + i);
++ }
++
++ /* wait for I/O completion */
++ for (i = 0; i < groups_per_page && bh[i]; i++)
++ wait_on_buffer(bh[i]);
++
++ /* XXX: I/O error handling here */
++
++ first_block = page->index * blocks_per_page;
++ for (i = 0; i < blocks_per_page; i++) {
++ int group;
++
++ group = (first_block + i) >> 1;
++ if (group >= EXT3_SB(sb)->s_groups_count)
++ break;
++
++ data = page_address(page) + (i * blocksize);
++ bitmap = bh[group - first_group]->b_data;
++
++ if ((first_block + i) & 1) {
++ /* this is block of buddy */
++ mb_debug("put buddy for group %u in page %lu/%x\n",
++ group, page->index, i * blocksize);
++ memset(data, 0xff, blocksize);
++ EXT3_SB(sb)->s_group_info[group]->bb_fragments = 0;
++ memset(EXT3_SB(sb)->s_group_info[group]->bb_counters, 0,
++ sizeof(unsigned short)*(sb->s_blocksize_bits+2));
++ ext3_mb_generate_buddy(sb, data, bitmap,
++ EXT3_SB(sb)->s_group_info[group]);
++ } else {
++ /* this is block of bitmap */
++ mb_debug("put bitmap for group %u in page %lu/%x\n",
++ group, page->index, i * blocksize);
++ memcpy(data, bitmap, blocksize);
++ }
++ }
++ SetPageUptodate(page);
++
++out:
++ if (bh) {
++ for (i = 0; i < groups_per_page && bh[i]; i++)
++ brelse(bh[i]);
++ if (bh != &bhs)
++ kfree(bh);
++ }
++ return err;
++}
++
++static int ext3_mb_load_buddy(struct super_block *sb, int group,
++ struct ext3_buddy *e3b)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ struct inode *inode = sbi->s_buddy_cache;
++ int blocks_per_page, block, pnum, poff;
++ struct page *page;
++
++ mb_debug("load group %u\n", group);
++
++ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
++
++ e3b->bd_blkbits = sb->s_blocksize_bits;
++ e3b->bd_info = sbi->s_group_info[group];
++ e3b->bd_sb = sb;
++ e3b->bd_group = group;
++ e3b->bd_buddy_page = NULL;
++ e3b->bd_bitmap_page = NULL;
++
++ block = group * 2;
++ pnum = block / blocks_per_page;
++ poff = block % blocks_per_page;
++
++ page = find_get_page(inode->i_mapping, pnum);
++ if (page == NULL || !PageUptodate(page)) {
++ if (page)
++ page_cache_release(page);
++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
++ if (page) {
++ if (!PageUptodate(page))
++ ext3_mb_init_cache(page);
++ unlock_page(page);
++ }
++ }
++ if (page == NULL || !PageUptodate(page))
++ goto err;
++ e3b->bd_bitmap_page = page;
++ e3b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
++ mark_page_accessed(page);
++
++ block++;
++ pnum = block / blocks_per_page;
++ poff = block % blocks_per_page;
++
++ page = find_get_page(inode->i_mapping, pnum);
++ if (page == NULL || !PageUptodate(page)) {
++ if (page)
++ page_cache_release(page);
++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
++ if (page) {
++ if (!PageUptodate(page))
++ ext3_mb_init_cache(page);
++ unlock_page(page);
++ }
++ }
++ if (page == NULL || !PageUptodate(page))
++ goto err;
++ e3b->bd_buddy_page = page;
++ e3b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
++ mark_page_accessed(page);
++
++ J_ASSERT(e3b->bd_bitmap_page != NULL);
++ J_ASSERT(e3b->bd_buddy_page != NULL);
++
++ return 0;
++
++err:
++ if (e3b->bd_bitmap_page)
++ page_cache_release(e3b->bd_bitmap_page);
++ if (e3b->bd_buddy_page)
++ page_cache_release(e3b->bd_buddy_page);
++ e3b->bd_buddy = NULL;
++ e3b->bd_bitmap = NULL;
++ return -EIO;
++}
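
Each group owns two consecutive blocks in the buddy-cache inode: block 2*g is the bitmap copy and block 2*g+1 the buddy array, and the function above converts that linear block number into a (page, offset) pair. The index math, traced for two block sizes under an assumed 4K page size:

#include <stdio.h>

static void show(int blocksize)
{
	int blocks_per_page = 4096 / blocksize;	/* PAGE_CACHE_SIZE assumed 4K */
	int group;

	for (group = 0; group < 3; group++) {
		int blk = group * 2;	/* bitmap block; the buddy is blk + 1 */

		printf("bs=%d group %d: bitmap page %d off %d, buddy page %d off %d\n",
		       blocksize, group,
		       blk / blocks_per_page, blk % blocks_per_page,
		       (blk + 1) / blocks_per_page, (blk + 1) % blocks_per_page);
	}
}

int main(void)
{
	show(4096);	/* one block per page: each group uses two pages */
	show(1024);	/* four blocks per page: two groups share a page */
	return 0;
}
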
++
++static void ext3_mb_release_desc(struct ext3_buddy *e3b)
++{
++ if (e3b->bd_bitmap_page)
++ page_cache_release(e3b->bd_bitmap_page);
++ if (e3b->bd_buddy_page)
++ page_cache_release(e3b->bd_buddy_page);
++}
++
++
++static inline void
++ext3_lock_group(struct super_block *sb, int group)
++{
++ bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT,
++ &EXT3_SB(sb)->s_group_info[group]->bb_state);
++}
++
++static inline void
++ext3_unlock_group(struct super_block *sb, int group)
++{
++ bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT,
++ &EXT3_SB(sb)->s_group_info[group]->bb_state);
++}
++
++static int mb_find_order_for_block(struct ext3_buddy *e3b, int block)
++{
++ int order = 1;
++ void *bb;
++
++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b));
++ J_ASSERT(block < (1 << (e3b->bd_blkbits + 3)));
++
++ bb = EXT3_MB_BUDDY(e3b);
++ while (order <= e3b->bd_blkbits + 1) {
++ block = block >> 1;
++ if (!mb_test_bit(block, bb)) {
++ /* this block is part of buddy of order 'order' */
++ return order;
++ }
++ bb += 1 << (e3b->bd_blkbits - order);
++ order++;
++ }
++ return 0;
++}
++
++static inline void mb_clear_bits(void *bm, int cur, int len)
++{
++ __u32 *addr;
++
++ len = cur + len;
++ while (cur < len) {
++ if ((cur & 31) == 0 && (len - cur) >= 32) {
++ /* fast path: clear whole word at once */
++ addr = bm + (cur >> 3);
++ *addr = 0;
++ cur += 32;
++ continue;
++ }
++ mb_clear_bit_atomic(cur, bm);
++ cur++;
++ }
++}
++
++static inline void mb_set_bits(void *bm, int cur, int len)
++{
++ __u32 *addr;
++
++ len = cur + len;
++ while (cur < len) {
++ if ((cur & 31) == 0 && (len - cur) >= 32) {
++ /* fast path: set whole word at once */
++ addr = bm + (cur >> 3);
++ *addr = 0xffffffff;
++ cur += 32;
++ continue;
++ }
++ mb_set_bit_atomic(cur, bm);
++ cur++;
++ }
++}
++
++static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count)
++{
++ int block = 0, max = 0, order;
++ void *buddy, *buddy2;
++
++ mb_check_buddy(e3b);
++
++ e3b->bd_info->bb_free += count;
++ if (first < e3b->bd_info->bb_first_free)
++ e3b->bd_info->bb_first_free = first;
++
++ /* let's maintain fragments counter */
++ if (first != 0)
++ block = !mb_test_bit(first - 1, EXT3_MB_BITMAP(e3b));
++ if (first + count < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0])
++ max = !mb_test_bit(first + count, EXT3_MB_BITMAP(e3b));
++ if (block && max)
++ e3b->bd_info->bb_fragments--;
++ else if (!block && !max)
++ e3b->bd_info->bb_fragments++;
++
++ /* let's maintain buddy itself */
++ while (count-- > 0) {
++ block = first++;
++ order = 0;
++
++ J_ASSERT(mb_test_bit(block, EXT3_MB_BITMAP(e3b)));
++ mb_clear_bit(block, EXT3_MB_BITMAP(e3b));
++ e3b->bd_info->bb_counters[order]++;
++
++ /* start of the buddy */
++ buddy = mb_find_buddy(e3b, order, &max);
++
++ do {
++ block &= ~1UL;
++ if (mb_test_bit(block, buddy) ||
++ mb_test_bit(block + 1, buddy))
++ break;
++
++ /* both the buddies are free, try to coalesce them */
++ buddy2 = mb_find_buddy(e3b, order + 1, &max);
++
++ if (!buddy2)
++ break;
++
++ if (order > 0) {
++ /* for special purposes, we don't set
++ * free bits in bitmap */
++ mb_set_bit(block, buddy);
++ mb_set_bit(block + 1, buddy);
++ }
++ e3b->bd_info->bb_counters[order]--;
++ e3b->bd_info->bb_counters[order]--;
++
++ block = block >> 1;
++ order++;
++ e3b->bd_info->bb_counters[order]++;
++
++ mb_clear_bit(block, buddy2);
++ buddy = buddy2;
++ } while (1);
++ }
++ mb_check_buddy(e3b);
++
++ return 0;
++}
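
The coalescing loop pairs an index with index^1 at each order ('block &= ~1UL' selects the even half) and climbs while both halves are free. A trace of that arithmetic for freeing block 5 when blocks 0..4, 6 and 7 are free already, so every buddy on the way up merges:

#include <stdio.h>

int main(void)
{
	int idx = 5, order = 0;

	while (order < 3) {
		printf("order %d: %d and buddy %d merge into %d at order %d\n",
		       order, idx, idx ^ 1, idx >> 1, order + 1);
		idx >>= 1;
		order++;
	}
	/* one used buddy on the way up would stop the walk, exactly
	 * like the 'break' in the do-while above */
	return 0;
}
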
++
++static int mb_find_extent(struct ext3_buddy *e3b, int order, int block,
++ int needed, struct ext3_free_extent *ex)
++{
++ int next, max, ord;
++ void *buddy;
++
++ J_ASSERT(ex != NULL);
++
++ buddy = mb_find_buddy(e3b, order, &max);
++ J_ASSERT(buddy);
++ J_ASSERT(block < max);
++ if (mb_test_bit(block, buddy)) {
++ ex->fe_len = 0;
++ ex->fe_start = 0;
++ ex->fe_group = 0;
++ return 0;
++ }
++
++ if (likely(order == 0)) {
++ /* find actual order */
++ order = mb_find_order_for_block(e3b, block);
++ block = block >> order;
++ }
++
++ ex->fe_len = 1 << order;
++ ex->fe_start = block << order;
++ ex->fe_group = e3b->bd_group;
++
++ while (needed > ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) {
++
++ if (block + 1 >= max)
++ break;
++
++ next = (block + 1) * (1 << order);
++ if (mb_test_bit(next, EXT3_MB_BITMAP(e3b)))
++ break;
++
++ ord = mb_find_order_for_block(e3b, next);
++
++ order = ord;
++ block = next >> order;
++ ex->fe_len += 1 << order;
++ }
++
++ J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3)));
++ return ex->fe_len;
++}
++
++static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex)
++{
++ int ord, mlen = 0, max = 0, cur;
++ int start = ex->fe_start;
++ int len = ex->fe_len;
++ unsigned ret = 0;
++ int len0 = len;
++ void *buddy;
++
++ mb_check_buddy(e3b);
++
++ e3b->bd_info->bb_free -= len;
++ if (e3b->bd_info->bb_first_free == start)
++ e3b->bd_info->bb_first_free += len;
++
++ /* let's maintain fragments counter */
++ if (start != 0)
++ mlen = !mb_test_bit(start - 1, EXT3_MB_BITMAP(e3b));
++ if (start + len < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0])
++ max = !mb_test_bit(start + len, EXT3_MB_BITMAP(e3b));
++ if (mlen && max)
++ e3b->bd_info->bb_fragments++;
++ else if (!mlen && !max)
++ e3b->bd_info->bb_fragments--;
++
++ /* let's maintain buddy itself */
++ while (len) {
++ ord = mb_find_order_for_block(e3b, start);
++
++ if (((start >> ord) << ord) == start && len >= (1 << ord)) {
++ /* the whole chunk may be allocated at once! */
++ mlen = 1 << ord;
++ buddy = mb_find_buddy(e3b, ord, &max);
++ J_ASSERT((start >> ord) < max);
++ mb_set_bit(start >> ord, buddy);
++ e3b->bd_info->bb_counters[ord]--;
++ start += mlen;
++ len -= mlen;
++ J_ASSERT(len >= 0);
++ continue;
++ }
++
++ /* store for history */
++ if (ret == 0)
++ ret = len | (ord << 16);
++
++ /* we have to split large buddy */
++ J_ASSERT(ord > 0);
++ buddy = mb_find_buddy(e3b, ord, &max);
++ mb_set_bit(start >> ord, buddy);
++ e3b->bd_info->bb_counters[ord]--;
++
++ ord--;
++ cur = (start >> ord) & ~1U;
++ buddy = mb_find_buddy(e3b, ord, &max);
++ mb_clear_bit(cur, buddy);
++ mb_clear_bit(cur + 1, buddy);
++ e3b->bd_info->bb_counters[ord]++;
++ e3b->bd_info->bb_counters[ord]++;
++ }
++
++ /* now drop all the bits in bitmap */
++ mb_set_bits(EXT3_MB_BITMAP(e3b), ex->fe_start, len0);
++
++ mb_check_buddy(e3b);
++
++ return ret;
++}
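
mb_mark_used() walks the extent left to right: whenever 'start' is aligned to the current buddy order and enough length remains, a whole chunk is consumed with one bit; otherwise the enclosing buddy is split one level down. The chunk sizes actually consumed follow this arithmetic, traced here for start=4, len=3:

#include <stdio.h>

int main(void)
{
	int start = 4, len = 3;

	while (len > 0) {
		int ord = 0;

		/* largest order at which 'start' is aligned and still fits */
		while ((start & ((2 << ord) - 1)) == 0 && (2 << ord) <= len)
			ord++;
		printf("take %d block(s) at %d (order %d)\n",
		       1 << ord, start, ord);
		start += 1 << ord;
		len -= 1 << ord;
	}
	/* prints: 2 at 4 (order 1), then 1 at 6 (order 0) -- higher
	 * orders only get split, exactly as the loop above does */
	return 0;
}
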
++
++/*
++ * Must be called under group lock!
++ */
++static void ext3_mb_use_best_found(struct ext3_allocation_context *ac,
++ struct ext3_buddy *e3b)
++{
++ unsigned long ret;
++
++ ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
++ ret = mb_mark_used(e3b, &ac->ac_b_ex);
++
++ ac->ac_status = AC_STATUS_FOUND;
++ ac->ac_tail = ret & 0xffff;
++ ac->ac_buddy = ret >> 16;
++}
++
++/*
++ * The routine checks whether found extent is good enough. If it is,
++ * then the extent gets marked used and flag is set to the context
++ * to stop scanning. Otherwise, the extent is compared with the
++ * previous found extent and if new one is better, then it's stored
++ * in the context. Later, the best found extent will be used, if
++ * mballoc can't find good enough extent.
++ *
++ * FIXME: real allocation policy is to be designed yet!
++ */
++static void ext3_mb_measure_extent(struct ext3_allocation_context *ac,
++ struct ext3_free_extent *ex,
++ struct ext3_buddy *e3b)
++{
++ struct ext3_free_extent *bex = &ac->ac_b_ex;
++ struct ext3_free_extent *gex = &ac->ac_g_ex;
++
++ J_ASSERT(ex->fe_len > 0);
++ J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8);
++ J_ASSERT(ex->fe_start < (1 << ac->ac_sb->s_blocksize_bits) * 8);
++
++ ac->ac_found++;
++
++ /*
++ * The special case - take what you catch first
++ */
++ if (unlikely(ac->ac_flags & EXT3_MB_HINT_FIRST)) {
++ *bex = *ex;
++ ext3_mb_use_best_found(ac, e3b);
++ return;
++ }
++
++ /*
++ * Let's check whether the chunk is good enough
++ */
++ if (ex->fe_len == gex->fe_len) {
++ *bex = *ex;
++ ext3_mb_use_best_found(ac, e3b);
++ return;
++ }
++
++ /*
++ * If this is first found extent, just store it in the context
++ */
++ if (bex->fe_len == 0) {
++ *bex = *ex;
++ return;
++ }
++
++ /*
++ * If new found extent is better, store it in the context
++ */
++ if (bex->fe_len < gex->fe_len) {
++ /* if the request isn't satisfied, any found extent
++ * larger than previous best one is better */
++ if (ex->fe_len > bex->fe_len)
++ *bex = *ex;
++ } else if (ex->fe_len > gex->fe_len) {
++ /* if the request is satisfied, then we try to find
++ * an extent that still satisfies the request, but is
++ * smaller than the previous one */
++ if (ex->fe_len < bex->fe_len)
++ *bex = *ex;
++ }
++
++ /*
++ * Let's scan at least a few extents and not pick the first one we find
++ */
++ if (bex->fe_len > gex->fe_len && ac->ac_found > ext3_mb_min_to_scan)
++ ac->ac_status = AC_STATUS_BREAK;
++
++ /*
++ * We don't want to scan for a whole year
++ */
++ if (ac->ac_found > ext3_mb_max_to_scan)
++ ac->ac_status = AC_STATUS_BREAK;
++}
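
Distilled, the preference rules above form a small predicate: a first find is always kept, below the goal length longer is better, and at or above it the tighter fit wins (an exact match short-circuits earlier in the function). A userspace re-statement with spot checks:

#include <assert.h>
#include <stdio.h>

/* should 'ex_len' replace the current best 'bex_len' for goal length 'g'? */
static int better(int ex_len, int bex_len, int g)
{
	if (bex_len == 0)
		return 1;			/* nothing stored yet */
	if (bex_len < g)
		return ex_len > bex_len;	/* goal unmet: prefer longer */
	return ex_len > g && ex_len < bex_len;	/* goal met: prefer tighter */
}

int main(void)
{
	assert(better(4, 0, 8));	/* a first find is always stored */
	assert(better(6, 4, 8));	/* both short of goal 8: 6 beats 4 */
	assert(better(9, 12, 8));	/* both over goal 8: 9 is tighter */
	assert(!better(16, 9, 8));	/* a looser fit is no improvement */
	printf("policy checks pass\n");
	return 0;
}
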
++
++static int ext3_mb_try_best_found(struct ext3_allocation_context *ac,
++ struct ext3_buddy *e3b)
++{
++ struct ext3_free_extent ex = ac->ac_b_ex;
++ int group = ex.fe_group, max, err;
++
++ J_ASSERT(ex.fe_len > 0);
++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b);
++ if (err)
++ return err;
++
++ ext3_lock_group(ac->ac_sb, group);
++ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex);
++
++ if (max > 0) {
++ ac->ac_b_ex = ex;
++ ext3_mb_use_best_found(ac, e3b);
++ }
++
++ ext3_unlock_group(ac->ac_sb, group);
++
++ ext3_mb_release_desc(e3b);
++
++ return 0;
++}
++
++static int ext3_mb_find_by_goal(struct ext3_allocation_context *ac,
++ struct ext3_buddy *e3b)
++{
++ int group = ac->ac_g_ex.fe_group, max, err;
++ struct ext3_free_extent ex;
++
++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b);
++ if (err)
++ return err;
++
++ ext3_lock_group(ac->ac_sb, group);
++ max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start,
++ ac->ac_g_ex.fe_len, &ex);
++
++ if (max > 0) {
++ J_ASSERT(ex.fe_len > 0);
++ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group);
++ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start);
++ ac->ac_found++;
++ ac->ac_b_ex = ex;
++ ext3_mb_use_best_found(ac, e3b);
++ }
++ ext3_unlock_group(ac->ac_sb, group);
++
++ ext3_mb_release_desc(e3b);
++
++ return 0;
++}
++
++/*
++ * The routine scans buddy structures (not bitmap!) from given order
++ * to the max order and tries to find a big enough chunk to satisfy the request
++ */
++static void ext3_mb_simple_scan_group(struct ext3_allocation_context *ac,
++ struct ext3_buddy *e3b)
++{
++ struct super_block *sb = ac->ac_sb;
++ struct ext3_group_info *grp = e3b->bd_info;
++ void *buddy;
++ int i, k, max;
++
++ J_ASSERT(ac->ac_2order > 0);
++ for (i = ac->ac_2order; i < sb->s_blocksize_bits + 1; i++) {
++ if (grp->bb_counters[i] == 0)
++ continue;
++
++ buddy = mb_find_buddy(e3b, i, &max);
++ if (buddy == NULL) {
++ printk(KERN_ALERT "looking for wrong order?\n");
++ break;
++ }
++
++ k = mb_find_next_zero_bit(buddy, max, 0);
++ J_ASSERT(k < max);
++
++ ac->ac_found++;
++
++ ac->ac_b_ex.fe_len = 1 << i;
++ ac->ac_b_ex.fe_start = k << i;
++ ac->ac_b_ex.fe_group = e3b->bd_group;
++
++ ext3_mb_use_best_found(ac, e3b);
++ J_ASSERT(ac->ac_b_ex.fe_len == ac->ac_g_ex.fe_len);
++
++ if (unlikely(ext3_mb_stats))
++ atomic_inc(&EXT3_SB(sb)->s_bal_2orders);
++
++ break;
++ }
++}
++
++/*
++ * The routine scans the group and measures all found extents.
++ * To optimize scanning, the routine uses the number of free
++ * blocks recorded in the group info as an upper limit.
++ */
++static void ext3_mb_complex_scan_group(struct ext3_allocation_context *ac,
++ struct ext3_buddy *e3b)
++{
++ struct super_block *sb = ac->ac_sb;
++ void *bitmap = EXT3_MB_BITMAP(e3b);
++ struct ext3_free_extent ex;
++ int i, free;
++
++ free = e3b->bd_info->bb_free;
++ J_ASSERT(free > 0);
++
++ i = e3b->bd_info->bb_first_free;
++
++ while (free && ac->ac_status == AC_STATUS_CONTINUE) {
++ i = mb_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i);
++ if (i >= sb->s_blocksize * 8) {
++ J_ASSERT(free == 0);
++ break;
++ }
++
++ mb_find_extent(e3b, 0, i, ac->ac_g_ex.fe_len, &ex);
++ J_ASSERT(ex.fe_len > 0);
++ J_ASSERT(free >= ex.fe_len);
++
++ ext3_mb_measure_extent(ac, &ex, e3b);
++
++ i += ex.fe_len;
++ free -= ex.fe_len;
++ }
++}
++
++static int ext3_mb_good_group(struct ext3_allocation_context *ac,
++ int group, int cr)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb);
++ struct ext3_group_info *grp = sbi->s_group_info[group];
++ unsigned free, fragments, i, bits;
++
++ J_ASSERT(cr >= 0 && cr < 4);
++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(grp));
++
++ free = grp->bb_free;
++ fragments = grp->bb_fragments;
++ if (free == 0)
++ return 0;
++ if (fragments == 0)
++ return 0;
++
++ switch (cr) {
++ case 0:
++ J_ASSERT(ac->ac_2order != 0);
++ bits = ac->ac_sb->s_blocksize_bits + 1;
++ for (i = ac->ac_2order; i < bits; i++)
++ if (grp->bb_counters[i] > 0)
++ return 1;
++ break;
++ case 1:
++ if ((free / fragments) >= ac->ac_g_ex.fe_len)
++ return 1;
++ break;
++ case 2:
++ if (free >= ac->ac_g_ex.fe_len)
++ return 1;
++ break;
++ case 3:
++ return 1;
++ default:
++ BUG();
++ }
++
++ return 0;
++}
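
The cr loop in ext3_mb_new_blocks() retries the group scan with progressively weaker criteria, one pass per case above. A flattened userspace sketch of the same decision, with the bb_counters walk reduced to a precomputed flag:

#include <stdio.h>

static int good_group(unsigned free, unsigned frags, unsigned goal_len,
		      int cr, int free_2order_chunk)
{
	if (free == 0 || frags == 0)
		return 0;
	switch (cr) {
	case 0: return free_2order_chunk;	  /* a 2^N buddy of the right size */
	case 1: return free / frags >= goal_len;  /* average fragment big enough */
	case 2: return free >= goal_len;	  /* enough free space in total */
	default: return 1;			  /* last resort: take anything */
	}
}

int main(void)
{
	/* 100 free blocks in 10 fragments, goal of 16, no 2^4 buddy free */
	int cr;

	for (cr = 0; cr < 4; cr++)
		printf("cr=%d -> %s\n", cr,
		       good_group(100, 10, 16, cr, 0) ? "good" : "skip");
	/* prints: skip, skip (average fragment 10 < 16), good, good */
	return 0;
}
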
++
++int ext3_mb_new_blocks(handle_t *handle, struct inode *inode,
++ unsigned long goal, int *len, int flags, int *errp)
++{
++ struct buffer_head *bitmap_bh = NULL;
++ struct ext3_allocation_context ac;
++ int i, group, block, cr, err = 0;
++ struct ext3_group_desc *gdp;
++ struct ext3_super_block *es;
++ struct buffer_head *gdp_bh;
++ struct ext3_sb_info *sbi;
++ struct super_block *sb;
++ struct ext3_buddy e3b;
++
++ J_ASSERT(len != NULL);
++ J_ASSERT(*len > 0);
++
++ sb = inode->i_sb;
++ if (!sb) {
++ printk("ext3_mb_new_nblocks: nonexistent device");
++ return 0;
++ }
++
++ if (!test_opt(sb, MBALLOC)) {
++ static int ext3_mballoc_warning = 0;
++ if (ext3_mballoc_warning == 0) {
++ printk(KERN_ERR "EXT3-fs: multiblock request with "
++ "mballoc disabled!\n");
++ ext3_mballoc_warning++;
++ }
++ *len = 1;
++ err = ext3_new_block_old(handle, inode, goal, errp);
++ return err;
++ }
++
++ ext3_mb_poll_new_transaction(sb, handle);
++
++ sbi = EXT3_SB(sb);
++ es = EXT3_SB(sb)->s_es;
++
++ /*
++ * We can't allocate > group size
++ */
++ if (*len >= EXT3_BLOCKS_PER_GROUP(sb) - 10)
++ *len = EXT3_BLOCKS_PER_GROUP(sb) - 10;
++
++ if (!(flags & EXT3_MB_HINT_RESERVED)) {
++ /* someone asks for non-reserved blocks */
++ BUG_ON(*len > 1);
++ err = ext3_mb_reserve_blocks(sb, 1);
++ if (err) {
++ *errp = err;
++ return 0;
++ }
++ }
++
++ /*
++ * Check quota for allocation of these blocks.
++ */
++ while (*len && DQUOT_ALLOC_BLOCK(inode, *len))
++ *len -= 1;
++ if (*len == 0) {
++ *errp = -EDQUOT;
++ block = 0;
++ goto out;
++ }
++
++ /* start searching from the goal */
++ if (goal < le32_to_cpu(es->s_first_data_block) ||
++ goal >= le32_to_cpu(es->s_blocks_count))
++ goal = le32_to_cpu(es->s_first_data_block);
++ group = (goal - le32_to_cpu(es->s_first_data_block)) /
++ EXT3_BLOCKS_PER_GROUP(sb);
++ block = ((goal - le32_to_cpu(es->s_first_data_block)) %
++ EXT3_BLOCKS_PER_GROUP(sb));
++
++ /* set up allocation goals */
++ ac.ac_b_ex.fe_group = 0;
++ ac.ac_b_ex.fe_start = 0;
++ ac.ac_b_ex.fe_len = 0;
++ ac.ac_status = AC_STATUS_CONTINUE;
++ ac.ac_groups_scanned = 0;
++ ac.ac_ex_scanned = 0;
++ ac.ac_found = 0;
++ ac.ac_sb = inode->i_sb;
++ ac.ac_g_ex.fe_group = group;
++ ac.ac_g_ex.fe_start = block;
++ ac.ac_g_ex.fe_len = *len;
++ ac.ac_flags = flags;
++ ac.ac_2order = 0;
++ ac.ac_criteria = 0;
++
++ /* if the request is an exact power of two of at least 2^7 blocks,
++ * remember the order for the buddy fast path */
++ i = ffs(*len);
++ if (i >= 8) {
++ i--;
++ if ((*len & (~(1 << i))) == 0)
++ ac.ac_2order = i;
++ }
++
++ /* Sometimes, the caller may want to merge even a small
++ * number of blocks into an existing extent */
++ if (ac.ac_flags & EXT3_MB_HINT_MERGE) {
++ err = ext3_mb_find_by_goal(&ac, &e3b);
++ if (err)
++ goto out_err;
++ if (ac.ac_status == AC_STATUS_FOUND)
++ goto found;
++ }
++
++ /* Let's just scan groups to find more or less suitable blocks */
++ cr = ac.ac_2order ? 0 : 1;
++repeat:
++ for (; cr < 4 && ac.ac_status == AC_STATUS_CONTINUE; cr++) {
++ ac.ac_criteria = cr;
++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) {
++ if (group == EXT3_SB(sb)->s_groups_count)
++ group = 0;
++
++ if (EXT3_MB_GRP_NEED_INIT(sbi->s_group_info[group])) {
++ /* we need full data about the group
++ * to make a good selection */
++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b);
++ if (err)
++ goto out_err;
++ ext3_mb_release_desc(&e3b);
++ }
++
++ /* check whether the group is good for our criteria */
++ if (!ext3_mb_good_group(&ac, group, cr))
++ continue;
++
++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b);
++ if (err)
++ goto out_err;
++
++ ext3_lock_group(sb, group);
++ if (!ext3_mb_good_group(&ac, group, cr)) {
++ /* someone did allocation from this group */
++ ext3_unlock_group(sb, group);
++ ext3_mb_release_desc(&e3b);
++ continue;
++ }
++
++ ac.ac_groups_scanned++;
++ if (cr == 0)
++ ext3_mb_simple_scan_group(&ac, &e3b);
++ else
++ ext3_mb_complex_scan_group(&ac, &e3b);
++
++ ext3_unlock_group(sb, group);
++
++ ext3_mb_release_desc(&e3b);
++
++ if (err)
++ goto out_err;
++ if (ac.ac_status != AC_STATUS_CONTINUE)
++ break;
++ }
++ }
++
++ if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND &&
++ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) {
++ /*
++ * We've been searching too long. Let's try to allocate
++ * the best chunk we've found so far
++ */
++
++ /*if (ac.ac_found > ext3_mb_max_to_scan)
++ printk(KERN_ERR "EXT3-fs: too long searching at "
++ "%u (%d/%d)\n", cr, ac.ac_b_ex.fe_len,
++ ac.ac_g_ex.fe_len);*/
++ ext3_mb_try_best_found(&ac, &e3b);
++ if (ac.ac_status != AC_STATUS_FOUND) {
++ /*
++ * Someone luckier has already allocated it.
++ * The only thing we can do is just take the
++ * first block(s) we find
++ */
++ printk(KERN_ERR "EXT3-fs: and someone won our chunk\n");
++ ac.ac_b_ex.fe_group = 0;
++ ac.ac_b_ex.fe_start = 0;
++ ac.ac_b_ex.fe_len = 0;
++ ac.ac_status = AC_STATUS_CONTINUE;
++ ac.ac_flags |= EXT3_MB_HINT_FIRST;
++ cr = 3;
++ goto repeat;
++ }
++ }
++
++ if (ac.ac_status != AC_STATUS_FOUND) {
++ /*
++ * We definitely were not lucky this time
++ */
++ DQUOT_FREE_BLOCK(inode, *len);
++ *errp = -ENOSPC;
++ block = 0;
++#if 1
++ printk(KERN_ERR "EXT3-fs: cant allocate: status %d, flags %d\n",
++ ac.ac_status, ac.ac_flags);
++ printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d, cr %d\n",
++ ac.ac_g_ex.fe_len, ac.ac_b_ex.fe_group,
++ ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len, cr);
++ printk(KERN_ERR "EXT3-fs: %lu block reserved, %d found\n",
++ sbi->s_blocks_reserved, ac.ac_found);
++ printk("EXT3-fs: groups: ");
++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++)
++ printk("%d: %d ", i,
++ sbi->s_group_info[i]->bb_free);
++ printk("\n");
++#endif
++ goto out;
++ }
++
++found:
++ J_ASSERT(ac.ac_b_ex.fe_len > 0);
++
++ /* good news - free block(s) have been found. now it's time
++ * to mark block(s) in good old journaled bitmap */
++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb)
++ + ac.ac_b_ex.fe_start
++ + le32_to_cpu(es->s_first_data_block);
++
++ /* we made a decision, now mark the found blocks in the good old
++ * bitmap to be journaled */
++
++ ext3_debug("using block group %d(%d)\n",
++ ac.ac_b_group.group, gdp->bg_free_blocks_count);
++
++ bitmap_bh = read_block_bitmap(sb, ac.ac_b_ex.fe_group);
++ if (!bitmap_bh) {
++ *errp = -EIO;
++ goto out_err;
++ }
++
++ err = ext3_journal_get_write_access(handle, bitmap_bh);
++ if (err) {
++ *errp = err;
++ goto out_err;
++ }
++
++ gdp = ext3_get_group_desc(sb, ac.ac_b_ex.fe_group, &gdp_bh);
++ if (!gdp) {
++ *errp = -EIO;
++ goto out_err;
++ }
++
++ err = ext3_journal_get_write_access(handle, gdp_bh);
++ if (err)
++ goto out_err;
++
++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb)
++ + ac.ac_b_ex.fe_start
++ + le32_to_cpu(es->s_first_data_block);
++
++ if (block == le32_to_cpu(gdp->bg_block_bitmap) ||
++ block == le32_to_cpu(gdp->bg_inode_bitmap) ||
++ in_range(block, le32_to_cpu(gdp->bg_inode_table),
++ EXT3_SB(sb)->s_itb_per_group))
++ ext3_error(sb, "ext3_new_block",
++ "Allocating block in system zone - "
++ "block = %u", block);
++#ifdef AGGRESSIVE_CHECK
++ for (i = 0; i < ac.ac_b_ex.fe_len; i++)
++ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data));
++#endif
++ mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len);
++
++ spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group));
++ gdp->bg_free_blocks_count =
++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)
++ - ac.ac_b_ex.fe_len);
++ spin_unlock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group));
++ percpu_counter_mod(&sbi->s_freeblocks_counter, - ac.ac_b_ex.fe_len);
++
++ err = ext3_journal_dirty_metadata(handle, bitmap_bh);
++ if (err)
++ goto out_err;
++ err = ext3_journal_dirty_metadata(handle, gdp_bh);
++ if (err)
++ goto out_err;
++
++ sb->s_dirt = 1;
++ *errp = 0;
++ brelse(bitmap_bh);
++
++ /* drop non-allocated, but dquote'd blocks */
++ J_ASSERT(*len >= ac.ac_b_ex.fe_len);
++ DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_ex.fe_len);
++
++ *len = ac.ac_b_ex.fe_len;
++ J_ASSERT(*len > 0);
++ J_ASSERT(block != 0);
++ goto out;
++
++out_err:
++ /* if we've already allocated something, roll it back */
++ if (ac.ac_status == AC_STATUS_FOUND) {
++ /* FIXME: free blocks here */
++ }
++
++ DQUOT_FREE_BLOCK(inode, *len);
++ brelse(bitmap_bh);
++ *errp = err;
++ block = 0;
++out:
++ if (!(flags & EXT3_MB_HINT_RESERVED)) {
++ /* the block wasn't reserved before and we reserved it
++ * at the beginning of the allocation. it doesn't matter
++ * whether we allocated anything or failed: it is time
++ * to release the reservation. NOTE: since multiblock
++ * requests are expected from the delayed allocation
++ * path only, a single block is always reserved here */
++ ext3_mb_release_blocks(sb, 1);
++ }
++
++ if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) {
++ atomic_inc(&sbi->s_bal_reqs);
++ atomic_add(*len, &sbi->s_bal_allocated);
++ if (*len >= ac.ac_g_ex.fe_len)
++ atomic_inc(&sbi->s_bal_success);
++ atomic_add(ac.ac_found, &sbi->s_bal_ex_scanned);
++ if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start &&
++ ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group)
++ atomic_inc(&sbi->s_bal_goals);
++ if (ac.ac_found > ext3_mb_max_to_scan)
++ atomic_inc(&sbi->s_bal_breaks);
++ }
++
++ ext3_mb_store_history(sb, &ac);
++
++ return block;
++}
++EXPORT_SYMBOL(ext3_mb_new_blocks);
++
++#ifdef EXT3_MB_HISTORY
++struct ext3_mb_proc_session {
++ struct ext3_mb_history *history;
++ struct super_block *sb;
++ int start;
++ int max;
++};
++
++static void *ext3_mb_history_skip_empty(struct ext3_mb_proc_session *s,
++ struct ext3_mb_history *hs,
++ int first)
++{
++ if (hs == s->history + s->max)
++ hs = s->history;
++ if (!first && hs == s->history + s->start)
++ return NULL;
++ while (hs->goal.fe_len == 0) {
++ hs++;
++ if (hs == s->history + s->max)
++ hs = s->history;
++ if (hs == s->history + s->start)
++ return NULL;
++ }
++ return hs;
++}
++
++static void *ext3_mb_seq_history_start(struct seq_file *seq, loff_t *pos)
++{
++ struct ext3_mb_proc_session *s = seq->private;
++ struct ext3_mb_history *hs;
++ int l = *pos;
++
++ if (l == 0)
++ return SEQ_START_TOKEN;
++ hs = ext3_mb_history_skip_empty(s, s->history + s->start, 1);
++ if (!hs)
++ return NULL;
++ while (--l && (hs = ext3_mb_history_skip_empty(s, ++hs, 0)) != NULL);
++ return hs;
++}
++
++static void *ext3_mb_seq_history_next(struct seq_file *seq, void *v, loff_t *pos)
++{
++ struct ext3_mb_proc_session *s = seq->private;
++ struct ext3_mb_history *hs = v;
++
++ ++*pos;
++ if (v == SEQ_START_TOKEN)
++ return ext3_mb_history_skip_empty(s, s->history + s->start, 1);
++ else
++ return ext3_mb_history_skip_empty(s, ++hs, 0);
++}
++
++static int ext3_mb_seq_history_show(struct seq_file *seq, void *v)
++{
++ struct ext3_mb_history *hs = v;
++ char buf[20], buf2[20];
++
++ if (v == SEQ_START_TOKEN) {
++ seq_printf(seq, "%-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n",
++ "goal", "result", "found", "grps", "cr", "merge",
++ "tail", "broken");
++ return 0;
++ }
++
++ sprintf(buf, "%u/%u/%u", hs->goal.fe_group,
++ hs->goal.fe_start, hs->goal.fe_len);
++ sprintf(buf2, "%u/%u/%u", hs->result.fe_group,
++ hs->result.fe_start, hs->result.fe_len);
++ seq_printf(seq, "%-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", buf,
++ buf2, hs->found, hs->groups, hs->cr,
++ hs->merged ? "M" : "", hs->tail,
++ hs->buddy ? 1 << hs->buddy : 0);
++ return 0;
++}
++
++static void ext3_mb_seq_history_stop(struct seq_file *seq, void *v)
++{
++}
++
++static struct seq_operations ext3_mb_seq_history_ops = {
++ .start = ext3_mb_seq_history_start,
++ .next = ext3_mb_seq_history_next,
++ .stop = ext3_mb_seq_history_stop,
++ .show = ext3_mb_seq_history_show,
++};
++
++static int ext3_mb_seq_history_open(struct inode *inode, struct file *file)
++{
++ struct super_block *sb = PDE(inode)->data;
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ struct ext3_mb_proc_session *s;
++ int rc, size;
++
++ s = kmalloc(sizeof(*s), GFP_KERNEL);
++ if (s == NULL)
++ return -ENOMEM;
++ size = sizeof(struct ext3_mb_history) * sbi->s_mb_history_max;
++ s->history = kmalloc(size, GFP_KERNEL);
++ if (s->history == NULL) {
++ kfree(s);
++ return -ENOMEM;
++ }
++
++ spin_lock(&sbi->s_mb_history_lock);
++ memcpy(s->history, sbi->s_mb_history, size);
++ s->max = sbi->s_mb_history_max;
++ s->start = sbi->s_mb_history_cur % s->max;
++ spin_unlock(&sbi->s_mb_history_lock);
++
++ rc = seq_open(file, &ext3_mb_seq_history_ops);
++ if (rc == 0) {
++ struct seq_file *m = (struct seq_file *)file->private_data;
++ m->private = s;
++ } else {
++ kfree(s->history);
++ kfree(s);
++ }
++ return rc;
++}
++
++static int ext3_mb_seq_history_release(struct inode *inode, struct file *file)
++{
++ struct seq_file *seq = (struct seq_file *)file->private_data;
++ struct ext3_mb_proc_session *s = seq->private;
++ kfree(s->history);
++ kfree(s);
++ return seq_release(inode, file);
++}
++
++static struct file_operations ext3_mb_seq_history_fops = {
++ .owner = THIS_MODULE,
++ .open = ext3_mb_seq_history_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = ext3_mb_seq_history_release,
++};
++
++static void ext3_mb_history_release(struct super_block *sb)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ char name[64];
++
++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name));
++ remove_proc_entry("mb_history", sbi->s_mb_proc);
++ remove_proc_entry(name, proc_root_ext3);
++
++ if (sbi->s_mb_history)
++ kfree(sbi->s_mb_history);
++}
++
++static void ext3_mb_history_init(struct super_block *sb)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ char name[64];
++ int i;
++
++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name));
++ sbi->s_mb_proc = proc_mkdir(name, proc_root_ext3);
++ if (sbi->s_mb_proc != NULL) {
++ struct proc_dir_entry *p;
++ p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc);
++ if (p) {
++ p->proc_fops = &ext3_mb_seq_history_fops;
++ p->data = sb;
++ }
++ }
++
++ sbi->s_mb_history_max = 1000;
++ sbi->s_mb_history_cur = 0;
++ spin_lock_init(&sbi->s_mb_history_lock);
++ i = sbi->s_mb_history_max * sizeof(struct ext3_mb_history);
++ sbi->s_mb_history = kmalloc(i, GFP_KERNEL);
++ if (sbi->s_mb_history != NULL)
++ memset(sbi->s_mb_history, 0, i);
++ /* if we can't allocate history, then we simply won't use it */
++}
++
++static void
++ext3_mb_store_history(struct super_block *sb, struct ext3_allocation_context *ac)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ struct ext3_mb_history h;
++
++ if (unlikely(sbi->s_mb_history == NULL))
++ return;
++
++ h.goal = ac->ac_g_ex;
++ h.result = ac->ac_b_ex;
++ h.found = ac->ac_found;
++ h.cr = ac->ac_criteria;
++ h.groups = ac->ac_groups_scanned;
++ h.tail = ac->ac_tail;
++ h.buddy = ac->ac_buddy;
++ h.merged = 0;
++ if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
++ ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
++ h.merged = 1;
++
++ spin_lock(&sbi->s_mb_history_lock);
++ memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h));
++ if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max)
++ sbi->s_mb_history_cur = 0;
++ spin_unlock(&sbi->s_mb_history_lock);
++}
++
++#else
++#define ext3_mb_history_release(sb)
++#define ext3_mb_history_init(sb)
++#endif
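
The history code above is a fixed-size ring: ext3_mb_store_history() writes
at s_mb_history_cur and wraps at s_mb_history_max, and the /proc reader
snapshots the array, then walks forward from cur % max, skipping slots whose
goal.fe_len is still zero. A compact userspace model of that wrap-around,
with MAX, ring and cur as illustrative stand-ins:

    #include <stdio.h>

    #define MAX 4                  /* stands in for sbi->s_mb_history_max */
    static int ring[MAX];          /* 0 marks an empty slot */
    static int cur;                /* stands in for sbi->s_mb_history_cur */

    static void store(int v)
    {
        ring[cur] = v;
        if (++cur >= MAX)          /* same wrap rule as the store above */
            cur = 0;
    }

    int main(void)
    {
        int i, p;

        for (i = 1; i <= 6; i++)
            store(i);
        /* the reader starts at the oldest slot, cur % MAX */
        for (i = 0, p = cur % MAX; i < MAX; i++, p = (p + 1) % MAX)
            if (ring[p])           /* skip empties, as *_skip_empty() does */
                printf("%d ", ring[p]);
        printf("\n");              /* prints: 3 4 5 6 */
        return 0;
    }
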
++
++int ext3_mb_init_backend(struct super_block *sb)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ int i, len;
++
++ len = sizeof(struct ext3_buddy_group_blocks *) * sbi->s_groups_count;
++ sbi->s_group_info = kmalloc(len, GFP_KERNEL);
++ if (sbi->s_group_info == NULL) {
++ printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n");
++ return -ENOMEM;
++ }
++ memset(sbi->s_group_info, 0, len);
++
++ sbi->s_buddy_cache = new_inode(sb);
++ if (sbi->s_buddy_cache == NULL) {
++ printk(KERN_ERR "EXT3-fs: can't get new inode\n");
++ kfree(sbi->s_group_info);
++ return -ENOMEM;
++ }
++
++ /*
++ * calculate needed size. if change bb_counters size,
++ * don't forget about ext3_mb_generate_buddy()
++ */
++ len = sizeof(struct ext3_group_info);
++ len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
++ for (i = 0; i < sbi->s_groups_count; i++) {
++ struct ext3_group_desc * desc;
++
++ sbi->s_group_info[i] = kmalloc(len, GFP_KERNEL);
++ if (sbi->s_group_info[i] == NULL) {
++ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n");
++ goto err_out;
++ }
++ desc = ext3_get_group_desc(sb, i, NULL);
++ if (desc == NULL) {
++ printk(KERN_ERR "EXT3-fs: cant read descriptor %u\n", i);
++ goto err_out;
++ }
++ memset(sbi->s_group_info[i], 0, len);
++ set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT,
++ &sbi->s_group_info[i]->bb_state);
++ sbi->s_group_info[i]->bb_free =
++ le16_to_cpu(desc->bg_free_blocks_count);
++ }
++
++ return 0;
++
++err_out:
++ while (--i >= 0)
++ kfree(sbi->s_group_info[i]);
++ kfree(sbi->s_group_info);
++ iput(sbi->s_buddy_cache);
++
++ return -ENOMEM;
++}
++
++int ext3_mb_init(struct super_block *sb, int needs_recovery)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ struct inode *root = sb->s_root->d_inode;
++ unsigned i, offset, max;
++ struct dentry *dentry;
++
++ if (!test_opt(sb, MBALLOC))
++ return 0;
++
++ i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);
++
++ sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
++ if (sbi->s_mb_offsets == NULL) {
++ clear_opt(sbi->s_mount_opt, MBALLOC);
++ return -ENOMEM;
++ }
++ sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
++ if (sbi->s_mb_maxs == NULL) {
++ clear_opt(sbi->s_mount_opt, MBALLOC);
++ kfree(sbi->s_mb_offsets);
++ return -ENOMEM;
++ }
++
++ /* order 0 is regular bitmap */
++ sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
++ sbi->s_mb_offsets[0] = 0;
++
++ i = 1;
++ offset = 0;
++ max = sb->s_blocksize << 2;
++ do {
++ sbi->s_mb_offsets[i] = offset;
++ sbi->s_mb_maxs[i] = max;
++ offset += 1 << (sb->s_blocksize_bits - i);
++ max = max >> 1;
++ i++;
++ } while (i <= sb->s_blocksize_bits + 1);
++
++ /* init file for buddy data */
++ if ((i = ext3_mb_init_backend(sb))) {
++ clear_opt(sbi->s_mount_opt, MBALLOC);
++ kfree(sbi->s_mb_offsets);
++ kfree(sbi->s_mb_maxs);
++ return i;
++ }
++
++ spin_lock_init(&sbi->s_reserve_lock);
++ spin_lock_init(&sbi->s_md_lock);
++ INIT_LIST_HEAD(&sbi->s_active_transaction);
++ INIT_LIST_HEAD(&sbi->s_closed_transaction);
++ INIT_LIST_HEAD(&sbi->s_committed_transaction);
++ spin_lock_init(&sbi->s_bal_lock);
++
++ /* remove old on-disk buddy file */
++ down(&root->i_sem);
++ dentry = lookup_one_len(".buddy", sb->s_root, strlen(".buddy"));
++ if (!IS_ERR(dentry)) {
++ if (dentry->d_inode != NULL) {
++ i = vfs_unlink(root, dentry);
++ if (i != 0)
++ printk("EXT3-fs: can't remove .buddy file: %d\n", i);
++ }
++ dput(dentry);
++ }
++ up(&root->i_sem);
++
++ ext3_mb_history_init(sb);
++
++ printk("EXT3-fs: mballoc enabled\n");
++ return 0;
++}
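
The do/while loop above packs one bitmap per buddy order into a single
block: order 0 is the plain block bitmap (blocksize * 8 bits), and each
higher order halves the bit count while the byte offset advances by
1 << (blocksize_bits - order). A standalone sketch that reproduces the
table for 4 KB blocks; the recurrence follows the loop above, except that
the final offset increment is skipped here to avoid a negative shift:

    #include <stdio.h>

    int main(void)
    {
        unsigned blkbits = 12;             /* 4096-byte blocks */
        unsigned offsets[14], maxs[14];    /* blkbits + 2 entries */
        unsigned i, o, offset = 0, max;

        maxs[0] = 1u << (blkbits + 3);     /* order 0: the bitmap itself */
        offsets[0] = 0;
        max = 1u << (blkbits + 2);         /* sb->s_blocksize << 2 bits */

        for (i = 1; i <= blkbits + 1; i++) {
            offsets[i] = offset;
            maxs[i] = max;
            if (i <= blkbits)              /* bytes consumed by this order */
                offset += 1u << (blkbits - i);
            max >>= 1;
        }

        for (o = 0; o <= blkbits + 1; o++)
            printf("order %2u: offset %4u bytes, %5u bits\n",
                   o, offsets[o], maxs[o]);
        return 0;
    }

All the orders 1..blkbits+1 fit in exactly one buddy block: 2048 + 1024 +
... + 1 bytes of per-order bitmaps plus one final byte, 4096 bytes in total
for 4 KB blocks.
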
++
++int ext3_mb_release(struct super_block *sb)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ int i;
++
++ if (!test_opt(sb, MBALLOC))
++ return 0;
++
++ /* release freed, non-committed blocks */
++ spin_lock(&sbi->s_md_lock);
++ list_splice_init(&sbi->s_closed_transaction,
++ &sbi->s_committed_transaction);
++ list_splice_init(&sbi->s_active_transaction,
++ &sbi->s_committed_transaction);
++ spin_unlock(&sbi->s_md_lock);
++ ext3_mb_free_committed_blocks(sb);
++
++ if (sbi->s_group_info) {
++ for (i = 0; i < sbi->s_groups_count; i++) {
++ if (sbi->s_group_info[i] == NULL)
++ continue;
++ kfree(sbi->s_group_info[i]);
++ }
++ kfree(sbi->s_group_info);
++ }
++ if (sbi->s_mb_offsets)
++ kfree(sbi->s_mb_offsets);
++ if (sbi->s_mb_maxs)
++ kfree(sbi->s_mb_maxs);
++ if (sbi->s_buddy_cache)
++ iput(sbi->s_buddy_cache);
++ if (sbi->s_blocks_reserved)
++ printk("ext3-fs: %ld blocks being reserved at umount!\n",
++ sbi->s_blocks_reserved);
++ if (ext3_mb_stats) {
++ printk("EXT3-fs: mballoc: %u blocks %u reqs (%u success)\n",
++ atomic_read(&sbi->s_bal_allocated),
++ atomic_read(&sbi->s_bal_reqs),
++ atomic_read(&sbi->s_bal_success));
++ printk("EXT3-fs: mballoc: %u extents scanned, %u goal hits, "
++ "%u 2^N hits, %u breaks\n",
++ atomic_read(&sbi->s_bal_ex_scanned),
++ atomic_read(&sbi->s_bal_goals),
++ atomic_read(&sbi->s_bal_2orders),
++ atomic_read(&sbi->s_bal_breaks));
++ printk("EXT3-fs: mballoc: %lu generated and it took %Lu\n",
++ sbi->s_mb_buddies_generated++,
++ sbi->s_mb_generation_time);
++ }
++
++ ext3_mb_history_release(sb);
++
++ return 0;
++}
++
++void ext3_mb_free_committed_blocks(struct super_block *sb)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ int err, i, count = 0, count2 = 0;
++ struct ext3_free_metadata *md;
++ struct ext3_buddy e3b;
++
++ if (list_empty(&sbi->s_committed_transaction))
++ return;
++
++ /* there are committed blocks still to be freed */
++ do {
++ /* get next array of blocks */
++ md = NULL;
++ spin_lock(&sbi->s_md_lock);
++ if (!list_empty(&sbi->s_committed_transaction)) {
++ md = list_entry(sbi->s_committed_transaction.next,
++ struct ext3_free_metadata, list);
++ list_del(&md->list);
++ }
++ spin_unlock(&sbi->s_md_lock);
++
++ if (md == NULL)
++ break;
++
++ mb_debug("gonna free %u blocks in group %u (0x%p):",
++ md->num, md->group, md);
++
++ err = ext3_mb_load_buddy(sb, md->group, &e3b);
++ BUG_ON(err != 0);
++
++ /* there are blocks to put in buddy to make them really free */
++ count += md->num;
++ count2++;
++ ext3_lock_group(sb, md->group);
++ for (i = 0; i < md->num; i++) {
++ mb_debug(" %u", md->blocks[i]);
++ mb_free_blocks(&e3b, md->blocks[i], 1);
++ }
++ mb_debug("\n");
++ ext3_unlock_group(sb, md->group);
++
++ /* balance refcounts from ext3_mb_free_metadata() */
++ page_cache_release(e3b.bd_buddy_page);
++ page_cache_release(e3b.bd_bitmap_page);
++
++ kfree(md);
++ ext3_mb_release_desc(&e3b);
++
++ } while (md);
++ mb_debug("freed %u blocks in %u structures\n", count, count2);
++}
++
++void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++
++ if (sbi->s_last_transaction == handle->h_transaction->t_tid)
++ return;
++
++ /* new transaction! time to close the last one and free blocks
++ * for the committed transaction. we know that only one
++ * transaction can be active at a time, so the previous
++ * transaction may still be being logged, while the transaction
++ * before the previous one is known to be already logged. this
++ * means we may now free blocks freed in all transactions
++ * before the previous one. hope I'm clear enough ... */
++
++ spin_lock(&sbi->s_md_lock);
++ if (sbi->s_last_transaction != handle->h_transaction->t_tid) {
++ mb_debug("new transaction %lu, old %lu\n",
++ (unsigned long) handle->h_transaction->t_tid,
++ (unsigned long) sbi->s_last_transaction);
++ list_splice_init(&sbi->s_closed_transaction,
++ &sbi->s_committed_transaction);
++ list_splice_init(&sbi->s_active_transaction,
++ &sbi->s_closed_transaction);
++ sbi->s_last_transaction = handle->h_transaction->t_tid;
++ }
++ spin_unlock(&sbi->s_md_lock);
++
++ ext3_mb_free_committed_blocks(sb);
++}
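
The splices above implement a two-stage aging pipeline: blocks freed under
the running transaction sit on s_active_transaction; when a new transaction
is noticed, active moves to closed (the previous transaction, possibly still
committing) and closed moves to committed, where
ext3_mb_free_committed_blocks() may finally return the blocks to the buddy.
A userspace model of the aging step, with illustrative string "lists":

    #include <stdio.h>
    #include <string.h>

    /* three stages, oldest last, as in ext3_sb_info */
    static char active[64], closed[64], committed[64];

    /* models the two list_splice_init() calls above */
    static void new_transaction(void)
    {
        strcat(committed, closed);   /* closed -> committed: safe to reuse */
        strcpy(closed, active);      /* active -> closed: wait one commit */
        active[0] = '\0';
    }

    int main(void)
    {
        strcpy(active, "A ");        /* freed under transaction 1 */
        new_transaction();           /* transaction 2 starts */
        strcpy(active, "B ");
        new_transaction();           /* transaction 3 starts */
        printf("committed: %s\n", committed);   /* "A ": reusable now */
        printf("closed:    %s\n", closed);      /* "B ": one more commit */
        return 0;
    }
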
++
++int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b,
++ int group, int block, int count)
++{
++ struct ext3_group_info *db = e3b->bd_info;
++ struct super_block *sb = e3b->bd_sb;
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ struct ext3_free_metadata *md;
++ int i;
++
++ J_ASSERT(e3b->bd_bitmap_page != NULL);
++ J_ASSERT(e3b->bd_buddy_page != NULL);
++
++ ext3_lock_group(sb, group);
++ for (i = 0; i < count; i++) {
++ md = db->bb_md_cur;
++ if (md && db->bb_tid != handle->h_transaction->t_tid) {
++ db->bb_md_cur = NULL;
++ md = NULL;
++ }
++
++ if (md == NULL) {
++ ext3_unlock_group(sb, group);
++ md = kmalloc(sizeof(*md), GFP_KERNEL);
++ if (md == NULL)
++ return -ENOMEM;
++ md->num = 0;
++ md->group = group;
++
++ ext3_lock_group(sb, group);
++ if (db->bb_md_cur == NULL) {
++ spin_lock(&sbi->s_md_lock);
++ list_add(&md->list, &sbi->s_active_transaction);
++ spin_unlock(&sbi->s_md_lock);
++ /* protect buddy cache from being freed,
++ * otherwise we'll refresh it from
++ * on-disk bitmap and lose not-yet-available
++ * blocks */
++ page_cache_get(e3b->bd_buddy_page);
++ page_cache_get(e3b->bd_bitmap_page);
++ db->bb_md_cur = md;
++ db->bb_tid = handle->h_transaction->t_tid;
++ mb_debug("new md 0x%p for group %u\n",
++ md, md->group);
++ } else {
++ kfree(md);
++ md = db->bb_md_cur;
++ }
++ }
++
++ BUG_ON(md->num >= EXT3_BB_MAX_BLOCKS);
++ md->blocks[md->num] = block + i;
++ md->num++;
++ if (md->num == EXT3_BB_MAX_BLOCKS) {
++ /* no more space, put full container on a sb's list */
++ db->bb_md_cur = NULL;
++ }
++ }
++ ext3_unlock_group(sb, group);
++ return 0;
++}
++
++void ext3_mb_free_blocks(handle_t *handle, struct inode *inode,
++ unsigned long block, unsigned long count,
++ int metadata, int *freed)
++{
++ struct buffer_head *bitmap_bh = NULL;
++ struct ext3_group_desc *gdp;
++ struct ext3_super_block *es;
++ unsigned long bit, overflow;
++ struct buffer_head *gd_bh;
++ unsigned long block_group;
++ struct ext3_sb_info *sbi;
++ struct super_block *sb;
++ struct ext3_buddy e3b;
++ int err = 0, ret;
++
++ *freed = 0;
++ sb = inode->i_sb;
++ if (!sb) {
++ printk ("ext3_free_blocks: nonexistent device");
++ return;
++ }
++
++ ext3_mb_poll_new_transaction(sb, handle);
++
++ sbi = EXT3_SB(sb);
++ es = EXT3_SB(sb)->s_es;
++ if (block < le32_to_cpu(es->s_first_data_block) ||
++ block + count < block ||
++ block + count > le32_to_cpu(es->s_blocks_count)) {
++ ext3_error (sb, "ext3_free_blocks",
++ "Freeing blocks not in datazone - "
++ "block = %lu, count = %lu", block, count);
++ goto error_return;
++ }
++
++ ext3_debug("freeing block %lu\n", block);
++
++do_more:
++ overflow = 0;
++ block_group = (block - le32_to_cpu(es->s_first_data_block)) /
++ EXT3_BLOCKS_PER_GROUP(sb);
++ bit = (block - le32_to_cpu(es->s_first_data_block)) %
++ EXT3_BLOCKS_PER_GROUP(sb);
++ /*
++ * Check to see if we are freeing blocks across a group
++ * boundary.
++ */
++ if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) {
++ overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb);
++ count -= overflow;
++ }
++ brelse(bitmap_bh);
++ bitmap_bh = read_block_bitmap(sb, block_group);
++ if (!bitmap_bh)
++ goto error_return;
++ gdp = ext3_get_group_desc (sb, block_group, &gd_bh);
++ if (!gdp)
++ goto error_return;
++
++ if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) ||
++ in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) ||
++ in_range (block, le32_to_cpu(gdp->bg_inode_table),
++ EXT3_SB(sb)->s_itb_per_group) ||
++ in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table),
++ EXT3_SB(sb)->s_itb_per_group))
++ ext3_error (sb, "ext3_free_blocks",
++ "Freeing blocks in system zones - "
++ "Block = %lu, count = %lu",
++ block, count);
++
++ BUFFER_TRACE(bitmap_bh, "getting write access");
++ err = ext3_journal_get_write_access(handle, bitmap_bh);
++ if (err)
++ goto error_return;
++
++ /*
++ * We are about to modify some metadata. Call the journal APIs
++ * to unshare ->b_data if a currently-committing transaction is
++ * using it
++ */
++ BUFFER_TRACE(gd_bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, gd_bh);
++ if (err)
++ goto error_return;
++
++ err = ext3_mb_load_buddy(sb, block_group, &e3b);
++ if (err)
++ goto error_return;
++
++#ifdef AGGRESSIVE_CHECK
++ {
++ int i;
++ for (i = 0; i < count; i++)
++ J_ASSERT(mb_test_bit(bit + i, bitmap_bh->b_data));
++ }
++#endif
++ mb_clear_bits(bitmap_bh->b_data, bit, count);
++
++ /* We dirtied the bitmap block */
++ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
++ err = ext3_journal_dirty_metadata(handle, bitmap_bh);
++
++ if (metadata) {
++ /* blocks being freed are metadata. these blocks shouldn't
++ * be used until this transaction is committed */
++ ext3_mb_free_metadata(handle, &e3b, block_group, bit, count);
++ } else {
++ ext3_lock_group(sb, block_group);
++ mb_free_blocks(&e3b, bit, count);
++ ext3_unlock_group(sb, block_group);
++ }
++
++ spin_lock(sb_bgl_lock(sbi, block_group));
++ gdp->bg_free_blocks_count =
++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count);
++ spin_unlock(sb_bgl_lock(sbi, block_group));
++ percpu_counter_mod(&sbi->s_freeblocks_counter, count);
++
++ ext3_mb_release_desc(&e3b);
++
++ *freed = count;
++
++ /* And the group descriptor block */
++ BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
++ ret = ext3_journal_dirty_metadata(handle, gd_bh);
++ if (!err) err = ret;
++
++ if (overflow && !err) {
++ block += count;
++ count = overflow;
++ goto do_more;
++ }
++ sb->s_dirt = 1;
++error_return:
++ brelse(bitmap_bh);
++ ext3_std_error(sb, err);
++ return;
++}
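
The do_more loop above converts an absolute block number into a
(group, bit-in-group) pair and clips a request that crosses a group
boundary, looping once more for the overflow. A tiny model of that
arithmetic, assuming made-up geometry (first data block 1, 8192 blocks per
group):

    #include <stdio.h>

    int main(void)
    {
        unsigned long first_data_block = 1;     /* es->s_first_data_block */
        unsigned long per_group = 8192;         /* EXT3_BLOCKS_PER_GROUP() */
        unsigned long block = 8000, count = 400;

        while (count) {
            unsigned long group = (block - first_data_block) / per_group;
            unsigned long bit = (block - first_data_block) % per_group;
            unsigned long overflow = 0;

            if (bit + count > per_group) {      /* crosses a group boundary */
                overflow = bit + count - per_group;
                count -= overflow;
            }
            printf("free %lu blocks in group %lu at bit %lu\n",
                   count, group, bit);
            block += count;                     /* as the do_more loop does */
            count = overflow;
        }
        return 0;
    }
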
++
++int ext3_mb_reserve_blocks(struct super_block *sb, int blocks)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ int free, ret = -ENOSPC;
++
++ BUG_ON(blocks < 0);
++ spin_lock(&sbi->s_reserve_lock);
++ free = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
++ if (blocks <= free - sbi->s_blocks_reserved) {
++ sbi->s_blocks_reserved += blocks;
++ ret = 0;
++ }
++ spin_unlock(&sbi->s_reserve_lock);
++ return ret;
++}
++
++void ext3_mb_release_blocks(struct super_block *sb, int blocks)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++
++ BUG_ON(blocks < 0);
++ spin_lock(&sbi->s_reserve_lock);
++ sbi->s_blocks_reserved -= blocks;
++ WARN_ON(sbi->s_blocks_reserved < 0);
++ if (sbi->s_blocks_reserved < 0)
++ sbi->s_blocks_reserved = 0;
++ spin_unlock(&sbi->s_reserve_lock);
++}
++
++int ext3_new_block(handle_t *handle, struct inode *inode,
++ unsigned long goal, int *errp)
++{
++ int ret, len;
++
++ if (!test_opt(inode->i_sb, MBALLOC)) {
++ ret = ext3_new_block_old(handle, inode, goal, errp);
++ goto out;
++ }
++ len = 1;
++ ret = ext3_mb_new_blocks(handle, inode, goal, &len, 0, errp);
++out:
++ return ret;
++}
++
++
++void ext3_free_blocks(handle_t *handle, struct inode * inode,
++ unsigned long block, unsigned long count, int metadata)
++{
++ struct super_block *sb;
++ int freed;
++
++ sb = inode->i_sb;
++ if (!test_opt(sb, MBALLOC))
++ ext3_free_blocks_sb(handle, sb, block, count, &freed);
++ else
++ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed);
++ if (freed)
++ DQUOT_FREE_BLOCK(inode, freed);
++ return;
++}
++
++#define EXT3_ROOT "ext3"
++#define EXT3_MB_STATS_NAME "mb_stats"
++#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan"
++#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan"
++
++static int ext3_mb_stats_read(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
++{
++ int len;
++
++ *eof = 1;
++ if (off != 0)
++ return 0;
++
++ len = sprintf(page, "%ld\n", ext3_mb_stats);
++ *start = page;
++ return len;
++}
++
++static int ext3_mb_stats_write(struct file *file, const char *buffer,
++ unsigned long count, void *data)
++{
++ char str[32];
++
++ if (count >= sizeof(str)) {
++ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n",
++ EXT3_MB_STATS_NAME, (int)sizeof(str));
++ return -EOVERFLOW;
++ }
++
++ if (copy_from_user(str, buffer, count))
++ return -EFAULT;
++
++ /* Coerce the input to 0 or 1: zero stays 0, non-zero becomes 1 */
++ ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0);
++ return count;
++}
++
++static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
++{
++ int len;
++
++ *eof = 1;
++ if (off != 0)
++ return 0;
++
++ len = sprintf(page, "%ld\n", ext3_mb_max_to_scan);
++ *start = page;
++ return len;
++}
++
++static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer,
++ unsigned long count, void *data)
++{
++ char str[32];
++ long value;
++
++ if (count >= sizeof(str)) {
++ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n",
++ EXT3_MB_MAX_TO_SCAN_NAME, (int)sizeof(str));
++ return -EOVERFLOW;
++ }
++
++ if (copy_from_user(str, buffer, count))
++ return -EFAULT;
++
++ /* Reject zero and negative values */
++ value = simple_strtol(str, NULL, 0);
++ if (value <= 0)
++ return -ERANGE;
++
++ ext3_mb_max_to_scan = value;
++
++ return count;
++}
++
++static int ext3_mb_min_to_scan_read(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
++{
++ int len;
++
++ *eof = 1;
++ if (off != 0)
++ return 0;
++
++ len = sprintf(page, "%ld\n", ext3_mb_min_to_scan);
++ *start = page;
++ return len;
++}
++
++static int ext3_mb_min_to_scan_write(struct file *file, const char *buffer,
++ unsigned long count, void *data)
++{
++ char str[32];
++ long value;
++
++ if (count >= sizeof(str)) {
++ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n",
++ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str));
++ return -EOVERFLOW;
++ }
++
++ if (copy_from_user(str, buffer, count))
++ return -EFAULT;
++
++ /* Reject zero and negative values */
++ value = simple_strtol(str, NULL, 0);
++ if (value <= 0)
++ return -ERANGE;
++
++ ext3_mb_min_to_scan = value;
++
++ return count;
++}
++
++int __init init_ext3_proc(void)
++{
++ struct proc_dir_entry *proc_ext3_mb_stats;
++ struct proc_dir_entry *proc_ext3_mb_max_to_scan;
++ struct proc_dir_entry *proc_ext3_mb_min_to_scan;
++
++ proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs);
++ if (proc_root_ext3 == NULL) {
++ printk(KERN_ERR "EXT3: Unable to create %s\n", EXT3_ROOT);
++ return -EIO;
++ }
++
++ /* Initialize EXT3_MB_STATS_NAME */
++ proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME,
++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++ if (proc_ext3_mb_stats == NULL) {
++ printk(KERN_ERR "EXT3: Unable to create %s\n",
++ EXT3_MB_STATS_NAME);
++ remove_proc_entry(EXT3_ROOT, proc_root_fs);
++ return -EIO;
++ }
++
++ proc_ext3_mb_stats->data = NULL;
++ proc_ext3_mb_stats->read_proc = ext3_mb_stats_read;
++ proc_ext3_mb_stats->write_proc = ext3_mb_stats_write;
++
++ /* Initialize EXT3_MAX_TO_SCAN_NAME */
++ proc_ext3_mb_max_to_scan = create_proc_entry(
++ EXT3_MB_MAX_TO_SCAN_NAME,
++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++ if (proc_ext3_mb_max_to_scan == NULL) {
++ printk(KERN_ERR "EXT3: Unable to create %s\n",
++ EXT3_MB_MAX_TO_SCAN_NAME);
++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_ROOT, proc_root_fs);
++ return -EIO;
++ }
++
++ proc_ext3_mb_max_to_scan->data = NULL;
++ proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read;
++ proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write;
++
++ /* Initialize EXT3_MIN_TO_SCAN_NAME */
++ proc_ext3_mb_min_to_scan = create_proc_entry(
++ EXT3_MB_MIN_TO_SCAN_NAME,
++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++ if (proc_ext3_mb_min_to_scan == NULL) {
++ printk(KERN_ERR "EXT3: Unable to create %s\n",
++ EXT3_MB_MIN_TO_SCAN_NAME);
++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_ROOT, proc_root_fs);
++ return -EIO;
++ }
++
++ proc_ext3_mb_min_to_scan->data = NULL;
++ proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read;
++ proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write;
++
++ return 0;
++}
++
++void exit_ext3_proc(void)
++{
++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_ROOT, proc_root_fs);
++}
++
+Index: linux-2.6.12.6/fs/ext3/Makefile
+===================================================================
+--- linux-2.6.12.6.orig/fs/ext3/Makefile 2005-12-17 02:17:16.000000000 +0300
++++ linux-2.6.12.6/fs/ext3/Makefile 2005-12-17 02:21:21.000000000 +0300
+@@ -5,7 +5,8 @@
+ obj-$(CONFIG_EXT3_FS) += ext3.o
+
+ ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o\
+- ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o
++ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
++ mballoc.o
+
+ ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
+ ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
-Index: linux-2.6.9/include/linux/ext3_fs_sb.h
+Index: linux-2.6.9-full/include/linux/ext3_fs.h
===================================================================
---- linux-2.6.9.orig/include/linux/ext3_fs_sb.h 2005-10-14 09:10:05.000000000 +0400
-+++ linux-2.6.9/include/linux/ext3_fs_sb.h 2005-10-14 09:10:13.000000000 +0400
-@@ -23,10 +23,30 @@
- #define EXT_INCLUDE
- #include <linux/blockgroup_lock.h>
- #include <linux/percpu_counter.h>
-+#include <linux/list.h>
- #endif
- #endif
- #include <linux/rbtree.h>
-
-+#define EXT3_BB_MAX_BLOCKS 30
-+struct ext3_free_metadata {
-+ unsigned short group;
-+ unsigned short num;
-+ unsigned short blocks[EXT3_BB_MAX_BLOCKS];
-+ struct list_head list;
-+};
-+
-+struct ext3_buddy_group_blocks {
-+ __u32 bb_bitmap;
-+ __u32 bb_buddy;
-+ spinlock_t bb_lock;
-+ unsigned long bb_tid;
-+ struct ext3_free_metadata *bb_md_cur;
-+ unsigned short bb_first_free;
-+ unsigned short bb_free;
-+ unsigned bb_counters[];
-+};
-+
- /*
- * third extended-fs super-block data in memory
- */
-@@ -81,6 +101,27 @@
- char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
- int s_jquota_fmt; /* Format of quota to use */
- #endif
-+
-+ /* for buddy allocator */
-+ struct ext3_buddy_group_blocks **s_buddy_blocks;
-+ struct inode *s_buddy;
-+ long s_blocks_reserved;
-+ spinlock_t s_reserve_lock;
-+ struct list_head s_active_transaction;
-+ struct list_head s_closed_transaction;
-+ struct list_head s_committed_transaction;
-+ spinlock_t s_md_lock;
-+ tid_t s_last_transaction;
-+ int s_mb_factor;
-+
-+ /* stats for buddy allocator */
-+ spinlock_t s_bal_lock;
-+ unsigned long s_bal_reqs; /* number of reqs with len > 1 */
-+ unsigned long s_bal_success; /* we found long enough chunks */
-+ unsigned long s_bal_allocated; /* in blocks */
-+ unsigned long s_bal_ex_scanned; /* total extents scanned */
-+ unsigned long s_bal_goals; /* goal hits */
-+ unsigned long s_bal_breaks; /* too long searches */
- };
-
- #endif /* _LINUX_EXT3_FS_SB */
-Index: linux-2.6.9/include/linux/ext3_fs.h
-===================================================================
---- linux-2.6.9.orig/include/linux/ext3_fs.h 2005-10-14 09:10:12.000000000 +0400
-+++ linux-2.6.9/include/linux/ext3_fs.h 2005-10-14 09:10:31.000000000 +0400
-@@ -57,6 +57,14 @@
+--- linux-2.6.9-full.orig/include/linux/ext3_fs.h 2005-12-16 23:16:41.000000000 +0300
++++ linux-2.6.9-full/include/linux/ext3_fs.h 2005-12-16 23:16:42.000000000 +0300
+@@ -57,6 +57,14 @@ struct statfs;
#define ext3_debug(f, a...) do {} while (0)
#endif
/*
* Special inodes numbers
*/
-@@ -365,6 +373,7 @@
+@@ -365,6 +373,7 @@ struct ext3_inode {
#define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */
#define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */
#define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */
-+#define EXT3_MOUNT_MBALLOC 0x400000/* Buddy allocation support */
++#define EXT3_MOUNT_MBALLOC 0x800000/* Buddy allocation support */
/* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
#ifndef clear_opt
-@@ -726,7 +735,7 @@
+@@ -726,7 +735,7 @@ extern int ext3_bg_has_super(struct supe
extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *);
extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long,
extern void ext3_free_blocks_sb (handle_t *, struct super_block *,
unsigned long, unsigned long, int *);
extern unsigned long ext3_count_free_blocks (struct super_block *);
-@@ -857,6 +866,44 @@
+@@ -857,6 +866,17 @@ extern void ext3_extents_initialize_bloc
extern int ext3_ext_ioctl(struct inode *inode, struct file *filp,
unsigned int cmd, unsigned long arg);
+/* mballoc.c */
-+extern long ext3_mb_aggressive;
+extern long ext3_mb_stats;
+extern long ext3_mb_max_to_scan;
+extern int ext3_mb_init(struct super_block *, int);
+extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *);
+extern int ext3_mb_reserve_blocks(struct super_block *, int);
+extern void ext3_mb_release_blocks(struct super_block *, int);
-+
-+/* writeback.c */
-+extern int ext3_wb_writepages(struct address_space *, struct writeback_control *);
-+extern int ext3_wb_prepare_write(struct file *file, struct page *page,
-+ unsigned from, unsigned to);
-+extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned);
-+extern int ext3_wb_writepage(struct page *, struct writeback_control *);
-+extern int ext3_wb_invalidatepage(struct page *, unsigned long);
-+extern int ext3_wb_releasepage(struct page *, int);
-+extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t);
-+extern void ext3_wb_init(struct super_block *);
-+extern void ext3_wb_release(struct super_block *);
-+
-+/* writeback.c */
-+extern int ext3_wb_writepages(struct address_space *, struct writeback_control *);
-+extern int ext3_wb_prepare_write(struct file *file, struct page *page,
-+ unsigned from, unsigned to);
-+extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned);
-+extern int ext3_wb_writepage(struct page *, struct writeback_control *);
-+extern int ext3_wb_invalidatepage(struct page *, unsigned long);
-+extern int ext3_wb_releasepage(struct page *, int);
-+extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t);
-+extern void ext3_wb_init(struct super_block *);
-+extern void ext3_wb_release(struct super_block *);
-+
-+/* proc.c */
-+extern int init_ext3_proc(void);
-+extern void exit_ext3_proc(void);
++int __init init_ext3_proc(void);
++void exit_ext3_proc(void);
+
#endif /* __KERNEL__ */
/* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */
-Index: linux-2.6.9/fs/ext3/balloc.c
+Index: linux-2.6.9-full/include/linux/ext3_fs_sb.h
===================================================================
---- linux-2.6.9.orig/fs/ext3/balloc.c 2005-05-13 21:39:03.000000000 +0400
-+++ linux-2.6.9/fs/ext3/balloc.c 2005-10-14 09:10:13.000000000 +0400
-@@ -79,7 +79,7 @@
- *
- * Return buffer_head on success or NULL in case of failure.
- */
--static struct buffer_head *
-+struct buffer_head *
- read_block_bitmap(struct super_block *sb, unsigned int block_group)
- {
- struct ext3_group_desc * desc;
-@@ -450,24 +450,6 @@
- return;
- }
+--- linux-2.6.9-full.orig/include/linux/ext3_fs_sb.h 2005-12-16 23:16:39.000000000 +0300
++++ linux-2.6.9-full/include/linux/ext3_fs_sb.h 2005-12-16 23:16:42.000000000 +0300
+@@ -23,9 +23,15 @@
+ #define EXT_INCLUDE
+ #include <linux/blockgroup_lock.h>
+ #include <linux/percpu_counter.h>
++#include <linux/list.h>
+ #endif
+ #endif
+ #include <linux/rbtree.h>
++#include <linux/proc_fs.h>
++
++struct ext3_buddy_group_blocks;
++struct ext3_mb_history;
++#define EXT3_BB_MAX_BLOCKS
--/* Free given blocks, update quota and i_blocks field */
--void ext3_free_blocks(handle_t *handle, struct inode *inode,
-- unsigned long block, unsigned long count)
--{
-- struct super_block * sb;
-- int dquot_freed_blocks;
--
-- sb = inode->i_sb;
-- if (!sb) {
-- printk ("ext3_free_blocks: nonexistent device");
-- return;
-- }
-- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
-- if (dquot_freed_blocks)
-- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
-- return;
--}
--
/*
- * For ext3 allocations, we must not reuse any blocks which are
- * allocated in the bitmap buffer's "last committed data" copy. This
-@@ -1140,7 +1122,7 @@
- * bitmap, and then for any free bit if that fails.
- * This function also updates quota and i_blocks field.
- */
--int ext3_new_block(handle_t *handle, struct inode *inode,
-+int ext3_new_block_old(handle_t *handle, struct inode *inode,
- unsigned long goal, int *errp)
+ * third extended-fs super-block data in memory
+@@ -81,6 +87,38 @@ struct ext3_sb_info {
+ char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
+ int s_jquota_fmt; /* Format of quota to use */
+ #endif
++
++ /* for buddy allocator */
++ struct ext3_group_info **s_group_info;
++ struct inode *s_buddy_cache;
++ long s_blocks_reserved;
++ spinlock_t s_reserve_lock;
++ struct list_head s_active_transaction;
++ struct list_head s_closed_transaction;
++ struct list_head s_committed_transaction;
++ spinlock_t s_md_lock;
++ tid_t s_last_transaction;
++ int s_mb_factor;
++ unsigned short *s_mb_offsets, *s_mb_maxs;
++
++ /* history to debug policy */
++ struct ext3_mb_history *s_mb_history;
++ int s_mb_history_cur;
++ int s_mb_history_max;
++ struct proc_dir_entry *s_mb_proc;
++ spinlock_t s_mb_history_lock;
++
++ /* stats for buddy allocator */
++ atomic_t s_bal_reqs; /* number of reqs with len > 1 */
++ atomic_t s_bal_success; /* we found long enough chunks */
++ atomic_t s_bal_allocated; /* in blocks */
++ atomic_t s_bal_ex_scanned; /* total extents scanned */
++ atomic_t s_bal_goals; /* goal hits */
++ atomic_t s_bal_breaks; /* too long searches */
++ atomic_t s_bal_2orders; /* 2^order hits */
++ spinlock_t s_bal_lock;
++ unsigned long s_mb_buddies_generated;
++ unsigned long long s_mb_generation_time;
+ };
+
+ #endif /* _LINUX_EXT3_FS_SB */
+Index: linux-2.6.9-full/fs/ext3/super.c
+===================================================================
+--- linux-2.6.9-full.orig/fs/ext3/super.c 2005-12-16 23:16:41.000000000 +0300
++++ linux-2.6.9-full/fs/ext3/super.c 2005-12-16 23:16:42.000000000 +0300
+@@ -394,6 +394,7 @@ void ext3_put_super (struct super_block
+ struct ext3_super_block *es = sbi->s_es;
+ int i;
+
++ ext3_mb_release(sb);
+ ext3_ext_release(sb);
+ ext3_xattr_put_super(sb);
+ journal_destroy(sbi->s_journal);
+@@ -596,6 +597,7 @@ enum {
+ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0,
+ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
+ Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_extents, Opt_extdebug,
++ Opt_mballoc,
+ };
+
+ static match_table_t tokens = {
+@@ -647,6 +649,7 @@ static match_table_t tokens = {
+ {Opt_iopen_nopriv, "iopen_nopriv"},
+ {Opt_extents, "extents"},
+ {Opt_extdebug, "extdebug"},
++ {Opt_mballoc, "mballoc"},
+ {Opt_barrier, "barrier=%u"},
+ {Opt_err, NULL},
+ {Opt_resize, "resize"},
+@@ -957,6 +960,9 @@ clear_qf_name:
+ case Opt_extdebug:
+ set_opt (sbi->s_mount_opt, EXTDEBUG);
+ break;
++ case Opt_mballoc:
++ set_opt (sbi->s_mount_opt, MBALLOC);
++ break;
+ default:
+ printk (KERN_ERR
+ "EXT3-fs: Unrecognized mount option \"%s\" "
+@@ -1646,6 +1652,7 @@ static int ext3_fill_super (struct super
+ ext3_count_dirs(sb));
+
+ ext3_ext_init(sb);
++ ext3_mb_init(sb, needs_recovery);
+
+ return 0;
+
+@@ -2428,7 +2435,13 @@ static struct file_system_type ext3_fs_t
+
+ static int __init init_ext3_fs(void)
{
- struct buffer_head *bitmap_bh = NULL;
-Index: linux-2.6.9/fs/ext3/extents.c
+- int err = init_ext3_xattr();
++ int err;
++
++ err = init_ext3_proc();
++ if (err)
++ return err;
++
++ err = init_ext3_xattr();
+ if (err)
+ return err;
+ err = init_inodecache();
+@@ -2450,6 +2463,7 @@ static void __exit exit_ext3_fs(void)
+ unregister_filesystem(&ext3_fs_type);
+ destroy_inodecache();
+ exit_ext3_xattr();
++ exit_ext3_proc();
+ }
+
+ int ext3_prep_san_write(struct inode *inode, long *blocks,
+Index: linux-2.6.9-full/fs/ext3/extents.c
===================================================================
---- linux-2.6.9.orig/fs/ext3/extents.c 2005-10-14 09:10:12.000000000 +0400
-+++ linux-2.6.9/fs/ext3/extents.c 2005-10-14 09:10:13.000000000 +0400
-@@ -771,7 +771,7 @@
+--- linux-2.6.9-full.orig/fs/ext3/extents.c 2005-12-16 23:16:41.000000000 +0300
++++ linux-2.6.9-full/fs/ext3/extents.c 2005-12-16 23:16:42.000000000 +0300
+@@ -771,7 +771,7 @@ cleanup:
for (i = 0; i < depth; i++) {
if (!ablocks[i])
continue;
}
}
kfree(ablocks);
-@@ -1428,7 +1428,7 @@
+@@ -1428,7 +1428,7 @@ int ext3_ext_rm_idx(handle_t *handle, st
path->p_idx->ei_leaf);
bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf);
ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf);
return err;
}
-@@ -1913,10 +1913,12 @@
+@@ -1913,10 +1913,12 @@ ext3_remove_blocks(struct ext3_extents_t
int needed = ext3_remove_blocks_credits(tree, ex, from, to);
handle_t *handle = ext3_journal_start(tree->inode, needed);
struct buffer_head *bh;
if (IS_ERR(handle))
return PTR_ERR(handle);
-+ if (S_ISDIR(tree->inode->i_mode))
++ if (S_ISDIR(tree->inode->i_mode) || S_ISLNK(tree->inode->i_mode))
+ metadata = 1;
if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) {
/* tail removal */
unsigned long num, start;
-@@ -1928,7 +1930,7 @@
+@@ -1928,7 +1930,7 @@ ext3_remove_blocks(struct ext3_extents_t
bh = sb_find_get_block(tree->inode->i_sb, start + i);
ext3_forget(handle, 0, tree->inode, bh, start + i);
}
} else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) {
printk("strange request: removal %lu-%lu from %u:%u\n",
from, to, ex->ee_block, ex->ee_len);
-Index: linux-2.6.9/fs/ext3/namei.c
+Index: linux-2.6.9-full/fs/ext3/inode.c
+===================================================================
+--- linux-2.6.9-full.orig/fs/ext3/inode.c 2005-12-16 23:16:41.000000000 +0300
++++ linux-2.6.9-full/fs/ext3/inode.c 2005-12-16 23:16:42.000000000 +0300
+@@ -572,7 +572,7 @@ static int ext3_alloc_branch(handle_t *h
+ ext3_journal_forget(handle, branch[i].bh);
+ }
+ for (i = 0; i < keys; i++)
+- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
++ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1);
+ return err;
+ }
+
+@@ -673,7 +673,7 @@ err_out:
+ if (err == -EAGAIN)
+ for (i = 0; i < num; i++)
+ ext3_free_blocks(handle, inode,
+- le32_to_cpu(where[i].key), 1);
++ le32_to_cpu(where[i].key), 1, 1);
+ return err;
+ }
+
+@@ -1831,7 +1831,7 @@ ext3_clear_blocks(handle_t *handle, stru
+ }
+ }
+
+- ext3_free_blocks(handle, inode, block_to_free, count);
++ ext3_free_blocks(handle, inode, block_to_free, count, 1);
+ }
+
+ /**
+@@ -2004,7 +2004,7 @@ static void ext3_free_branches(handle_t
+ ext3_journal_test_restart(handle, inode);
+ }
+
+- ext3_free_blocks(handle, inode, nr, 1);
++ ext3_free_blocks(handle, inode, nr, 1, 1);
+
+ if (parent_bh) {
+ /*
+Index: linux-2.6.9-full/fs/ext3/balloc.c
===================================================================
---- linux-2.6.9.orig/fs/ext3/namei.c 2005-10-14 09:10:04.000000000 +0400
-+++ linux-2.6.9/fs/ext3/namei.c 2005-10-14 09:10:13.000000000 +0400
-@@ -1639,7 +1639,7 @@
- * If the create succeeds, we fill in the inode information
- * with d_instantiate().
+--- linux-2.6.9-full.orig/fs/ext3/balloc.c 2005-10-27 21:44:24.000000000 +0400
++++ linux-2.6.9-full/fs/ext3/balloc.c 2005-12-16 23:16:42.000000000 +0300
+@@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_
+ *
+ * Return buffer_head on success or NULL in case of failure.
+ */
+-static struct buffer_head *
++struct buffer_head *
+ read_block_bitmap(struct super_block *sb, unsigned int block_group)
+ {
+ struct ext3_group_desc * desc;
+@@ -450,24 +450,6 @@ error_return:
+ return;
+ }
+
+-/* Free given blocks, update quota and i_blocks field */
+-void ext3_free_blocks(handle_t *handle, struct inode *inode,
+- unsigned long block, unsigned long count)
+-{
+- struct super_block * sb;
+- int dquot_freed_blocks;
+-
+- sb = inode->i_sb;
+- if (!sb) {
+- printk ("ext3_free_blocks: nonexistent device");
+- return;
+- }
+- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
+- if (dquot_freed_blocks)
+- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
+- return;
+-}
+-
+ /*
+ * For ext3 allocations, we must not reuse any blocks which are
+ * allocated in the bitmap buffer's "last committed data" copy. This
+@@ -1140,7 +1122,7 @@ int ext3_should_retry_alloc(struct super
+ * bitmap, and then for any free bit if that fails.
+ * This function also updates quota and i_blocks field.
*/
--static int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
-+int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
- struct nameidata *nd)
+-int ext3_new_block(handle_t *handle, struct inode *inode,
++int ext3_new_block_old(handle_t *handle, struct inode *inode,
+ unsigned long goal, int *errp)
{
- handle_t *handle;
-Index: linux-2.6.9/fs/ext3/xattr.c
+ struct buffer_head *bitmap_bh = NULL;
+Index: linux-2.6.9-full/fs/ext3/xattr.c
===================================================================
---- linux-2.6.9.orig/fs/ext3/xattr.c 2005-10-14 09:10:08.000000000 +0400
-+++ linux-2.6.9/fs/ext3/xattr.c 2005-10-14 09:10:13.000000000 +0400
-@@ -1281,7 +1281,7 @@
+--- linux-2.6.9-full.orig/fs/ext3/xattr.c 2005-12-16 23:16:40.000000000 +0300
++++ linux-2.6.9-full/fs/ext3/xattr.c 2005-12-16 23:16:42.000000000 +0300
+@@ -1281,7 +1281,7 @@ ext3_xattr_set_handle2(handle_t *handle,
new_bh = sb_getblk(sb, block);
if (!new_bh) {
getblk_failed:
error = -EIO;
goto cleanup;
}
-@@ -1328,7 +1328,7 @@
+@@ -1328,7 +1328,7 @@ getblk_failed:
if (ce)
mb_cache_entry_free(ce);
ea_bdebug(old_bh, "freeing");
/* ext3_forget() calls bforget() for us, but we
let our caller release old_bh, so we need to
-@@ -1427,7 +1427,7 @@
+@@ -1427,7 +1427,7 @@ ext3_xattr_delete_inode(handle_t *handle
if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
if (ce)
mb_cache_entry_free(ce);
get_bh(bh);
ext3_forget(handle, 1, inode, bh, EXT3_I(inode)->i_file_acl);
} else {
-Index: linux-2.6.9/fs/ext3/Makefile
-===================================================================
---- linux-2.6.9.orig/fs/ext3/Makefile 2005-10-14 09:10:12.000000000 +0400
-+++ linux-2.6.9/fs/ext3/Makefile 2005-10-14 09:10:13.000000000 +0400
-@@ -5,7 +5,8 @@
- obj-$(CONFIG_EXT3_FS) += ext3.o
-
- ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o\
-- ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o
-+ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
-+ mballoc.o
-
- ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
- ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
-Index: linux-2.6.9/fs/ext3/mballoc.c
+Index: linux-2.6.9-full/fs/ext3/mballoc.c
===================================================================
---- linux-2.6.9.orig/fs/ext3/mballoc.c 2005-10-13 19:40:57.851699336 +0400
-+++ linux-2.6.9/fs/ext3/mballoc.c 2005-10-14 09:10:31.000000000 +0400
-@@ -0,0 +1,1865 @@
+--- linux-2.6.9-full.orig/fs/ext3/mballoc.c 2005-12-16 17:46:19.148560250 +0300
++++ linux-2.6.9-full/fs/ext3/mballoc.c 2005-12-17 00:10:15.000000000 +0300
+@@ -0,0 +1,2434 @@
+/*
-+ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com
++ * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+#include <linux/module.h>
++#include <linux/swap.h>
++#include <linux/proc_fs.h>
++#include <linux/pagemap.h>
++#include <linux/seq_file.h>
+
+/*
+ * TODO:
-+ * - bitmap/buddy read-ahead (proposed by Oleg Drokin aka green)
++ * - bitmap read-ahead (proposed by Oleg Drokin aka green)
+ * - track min/max extents in each group for better group selection
-+ * - is it worthwhile to use buddies directly if req is 2^N blocks?
+ * - mb_mark_used() may allocate chunk right after splitting buddy
+ * - special flag to advice allocator to look for requested + N blocks
+ * this may improve interaction between extents and mballoc
+ */
+
+/*
-+ * with 'ext3_mb_aggressive' set the allocator runs consistency checks over
++ * with AGGRESSIVE_CHECK the allocator runs consistency checks over
+ * structures. these checks slow things down a lot
+ */
-+long ext3_mb_aggressive = 0;
-+
-+
-+/*
-+ * with 'ext3_mb_stats' allocator will collect stats that will be
-+ * shown at umount. The collecting costs though!
-+ */
-+long ext3_mb_stats = 1;
++#define AGGRESSIVE_CHECK__
+
+/*
+ */
+#endif
+
+/*
-+ * where to save buddies structures beetween umount/mount (clean case only)
++ * with EXT3_MB_HISTORY mballoc stores last N allocations in memory
++ * and you can monitor it in /proc/fs/ext3/<dev>/mb_history
+ */
-+#define EXT3_BUDDY_FILE ".buddy"
++#define EXT3_MB_HISTORY
+
+/*
+ * How long mballoc can look for a best extent (in found extents)
+ */
-+long ext3_mb_max_to_scan = 100;
++long ext3_mb_max_to_scan = 500;
+
+/*
-+ * This structure is on-disk description of a group for mballoc
++ * How long mballoc must look for a best extent
+ */
-+struct ext3_mb_group_descr {
-+ __u16 mgd_first_free; /* first free block in the group */
-+ __u16 mgd_free; /* number of free blocks in the group */
-+ __u16 mgd_counters[16]; /* number of free blocks by order */
-+};
++long ext3_mb_min_to_scan = 30;
+
+/*
-+ * This structure is header of mballoc's file
++ * with 'ext3_mb_stats' allocator will collect stats that will be
++ * shown at umount. The collecting costs though!
+ */
-+struct ext3_mb_grp_header {
-+ __u32 mh_magic;
++
++long ext3_mb_stats = 1;
++
++#ifdef EXT3_BB_MAX_BLOCKS
++#undef EXT3_BB_MAX_BLOCKS
++#endif
++#define EXT3_BB_MAX_BLOCKS 30
++
++struct ext3_free_metadata {
++ unsigned short group;
++ unsigned short num;
++ unsigned short blocks[EXT3_BB_MAX_BLOCKS];
++ struct list_head list;
++};
++
++struct ext3_group_info {
++ unsigned long bb_state;
++ unsigned long bb_tid;
++ struct ext3_free_metadata *bb_md_cur;
++ unsigned short bb_first_free;
++ unsigned short bb_free;
++ unsigned short bb_fragments;
++ unsigned short bb_counters[];
+};
+
-+#define EXT3_MB_MAGIC_V1 0xbabd16fd
+
++#define EXT3_GROUP_INFO_NEED_INIT_BIT 0
++#define EXT3_GROUP_INFO_LOCKED_BIT 1
++
++#define EXT3_MB_GRP_NEED_INIT(grp) \
++ (test_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &(grp)->bb_state))
+
+struct ext3_free_extent {
+ __u16 fe_start;
+ unsigned long ac_ex_scanned;
+ __u16 ac_groups_scanned;
+ __u16 ac_found;
++ __u16 ac_tail;
++ __u16 ac_buddy;
+ __u8 ac_status;
+ __u8 ac_flags; /* allocation hints */
++ __u8 ac_criteria;
+ __u8 ac_repeats;
++ __u8 ac_2order; /* if request is to allocate 2^N blocks and
++ * N > 0, the field stores N, otherwise 0 */
+};
+
+#define AC_STATUS_CONTINUE 1
+#define AC_STATUS_FOUND 2
+#define AC_STATUS_BREAK 3
+
++struct ext3_mb_history {
++ struct ext3_free_extent goal; /* goal allocation */
++ struct ext3_free_extent result; /* result allocation */
++ __u16 found; /* how many extents have been found */
++ __u16 groups; /* how many groups have been scanned */
++ __u16 tail; /* what tail broke some buddy */
++ __u16 buddy; /* buddy the tail ^^^ broke */
++ __u8 cr; /* which phase the result extent was found at */
++ __u8 merged;
++};
++
+struct ext3_buddy {
-+ struct buffer_head *bd_bh;
-+ struct buffer_head *bd_bh2;
-+ struct ext3_buddy_group_blocks *bd_bd;
++ struct page *bd_buddy_page;
++ void *bd_buddy;
++ struct page *bd_bitmap_page;
++ void *bd_bitmap;
++ struct ext3_group_info *bd_info;
+ struct super_block *bd_sb;
+ __u16 bd_blkbits;
+ __u16 bd_group;
+};
-+#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bh->b_data)
-+#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_bh2->b_data)
++#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bitmap)
++#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy)
++
++#ifndef EXT3_MB_HISTORY
++#define ext3_mb_store_history(sb,ac)
++#else
++static void ext3_mb_store_history(struct super_block *,
++ struct ext3_allocation_context *ac);
++#endif
+
+#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
+
++static struct proc_dir_entry *proc_root_ext3;
++
+int ext3_create (struct inode *, struct dentry *, int, struct nameidata *);
+struct buffer_head * read_block_bitmap(struct super_block *, unsigned int);
+int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *);
+static inline int mb_test_bit(int bit, void *addr)
+{
+ mb_correct_addr_and_bit(bit,addr);
-+ return ext3_test_bit(bit, addr);
++ return ext2_test_bit(bit, addr);
+}
+
+static inline void mb_set_bit(int bit, void *addr)
+{
+ mb_correct_addr_and_bit(bit,addr);
-+ ext3_set_bit(bit, addr);
++ ext2_set_bit(bit, addr);
+}
+
+static inline void mb_set_bit_atomic(int bit, void *addr)
+{
+ mb_correct_addr_and_bit(bit,addr);
-+ ext3_set_bit_atomic(NULL, bit, addr);
++ ext2_set_bit_atomic(NULL, bit, addr);
+}
+
+static inline void mb_clear_bit(int bit, void *addr)
+{
+ mb_correct_addr_and_bit(bit,addr);
-+ ext3_clear_bit(bit, addr);
++ ext2_clear_bit(bit, addr);
+}
+
+static inline void mb_clear_bit_atomic(int bit, void *addr)
+{
+ mb_correct_addr_and_bit(bit,addr);
-+ ext3_clear_bit_atomic(NULL, bit, addr);
++ ext2_clear_bit_atomic(NULL, bit, addr);
+}
+
-+static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max)
++static inline int mb_find_next_zero_bit(void *addr, int max, int start)
+{
-+ int i = 1;
-+ char *bb;
-+
-+ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b));
-+ J_ASSERT(max != NULL);
-+
-+ if (order > e3b->bd_blkbits + 1) {
-+ *max = 0;
-+ return NULL;
-+ }
-+
-+ /* at order 0 we see each particular block */
-+ *max = 1 << (e3b->bd_blkbits + 3);
-+ if (order == 0)
-+ return EXT3_MB_BITMAP(e3b);
-+
-+ bb = EXT3_MB_BUDDY(e3b);
-+ *max = *max >> 1;
-+ while (i < order) {
-+ bb += 1 << (e3b->bd_blkbits - i);
-+ i++;
-+ *max = *max >> 1;
-+ }
-+ J_ASSERT((unsigned) (bb - (char *) EXT3_MB_BUDDY(e3b)) <
-+ e3b->bd_sb->s_blocksize);
-+ return bb;
++ int fix;
++#if BITS_PER_LONG == 64
++ fix = ((unsigned long) addr & 7UL) << 3;
++ addr = (void *) ((unsigned long) addr & ~7UL);
++#elif BITS_PER_LONG == 32
++ fix = ((unsigned long) addr & 3UL) << 3;
++ addr = (void *) ((unsigned long) addr & ~3UL);
++#else
++#error "unsupported BITS_PER_LONG"
++#endif
++ max += fix;
++ start += fix;
++ return ext2_find_next_zero_bit(addr, max, start) - fix;
+}
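
ext2_find_next_zero_bit() expects a long-aligned base address, so the helper
above rounds the pointer down to a long boundary and compensates by biasing
both the limit and the starting bit with the number of bits skipped (fix),
then subtracting it from the result. A userspace sketch of the same
compensation; find_zero() is a naive stand-in for the kernel primitive, not
its real implementation:

    #include <stdio.h>
    #include <stdint.h>

    /* naive little-endian bit search, scanning [start, max) */
    static int find_zero(const unsigned char *addr, int max, int start)
    {
        int i;
        for (i = start; i < max; i++)
            if (!(addr[i >> 3] & (1u << (i & 7))))
                return i;
        return max;
    }

    /* same fix-up as mb_find_next_zero_bit() above */
    static int find_zero_unaligned(const void *addr, int max, int start)
    {
        int fix = (int)(((uintptr_t) addr & (sizeof(long) - 1)) << 3);
        const unsigned char *base = (const unsigned char *)
            ((uintptr_t) addr & ~(uintptr_t)(sizeof(long) - 1));
        return find_zero(base, max + fix, start + fix) - fix;
    }

    int main(void)
    {
        union { long align; unsigned char b[16]; } u;  /* aligned buffer */
        unsigned char *bitmap = u.b + 3;      /* deliberately misaligned */
        int i;

        for (i = 0; i < 16; i++)
            u.b[i] = 0xff;
        bitmap[1] &= (unsigned char) ~(1u << 2);       /* clear bit 10 */

        printf("first zero bit: %d\n", find_zero_unaligned(bitmap, 64, 0));
        return 0;
    }
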
+
-+static int ext3_mb_load_buddy(struct super_block *sb, int group,
-+ struct ext3_buddy *e3b)
++static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max)
+{
-+ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ char *bb;
+
-+ J_ASSERT(sbi->s_buddy_blocks[group]->bb_bitmap);
-+ J_ASSERT(sbi->s_buddy_blocks[group]->bb_buddy);
++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b));
++ J_ASSERT(max != NULL);
+
-+ /* load bitmap */
-+ e3b->bd_bh = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_bitmap);
-+ if (e3b->bd_bh == NULL) {
-+ ext3_error(sb, "ext3_mb_load_buddy",
-+ "can't get block for buddy bitmap\n");
-+ goto out;
-+ }
-+ /* load buddy */
-+ e3b->bd_bh2 = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_buddy);
-+ if (e3b->bd_bh2 == NULL) {
-+ ext3_error(sb, "ext3_mb_load_buddy",
-+ "can't get block for buddy bitmap\n");
-+ goto out;
++ if (order > e3b->bd_blkbits + 1) {
++ *max = 0;
++ return NULL;
+ }
+
-+ if (!buffer_uptodate(e3b->bd_bh))
-+ ll_rw_block(READ, 1, &e3b->bd_bh);
-+ if (!buffer_uptodate(e3b->bd_bh2))
-+ ll_rw_block(READ, 1, &e3b->bd_bh2);
-+
-+ wait_on_buffer(e3b->bd_bh);
-+ J_ASSERT(buffer_uptodate(e3b->bd_bh));
-+ wait_on_buffer(e3b->bd_bh2);
-+ J_ASSERT(buffer_uptodate(e3b->bd_bh2));
-+
-+ e3b->bd_blkbits = sb->s_blocksize_bits;
-+ e3b->bd_bd = sbi->s_buddy_blocks[group];
-+ e3b->bd_sb = sb;
-+ e3b->bd_group = group;
++ /* at order 0 we see each particular block */
++ *max = 1 << (e3b->bd_blkbits + 3);
++ if (order == 0)
++ return EXT3_MB_BITMAP(e3b);
+
-+ return 0;
-+out:
-+ brelse(e3b->bd_bh);
-+ brelse(e3b->bd_bh2);
-+ e3b->bd_bh = NULL;
-+ e3b->bd_bh2 = NULL;
-+ return -EIO;
-+}
++ bb = EXT3_MB_BUDDY(e3b) + EXT3_SB(e3b->bd_sb)->s_mb_offsets[order];
++ *max = EXT3_SB(e3b->bd_sb)->s_mb_maxs[order];
+
-+static void ext3_mb_dirty_buddy(struct ext3_buddy *e3b)
-+{
-+ mark_buffer_dirty(e3b->bd_bh);
-+ mark_buffer_dirty(e3b->bd_bh2);
++ return bb;
+}
+
-+static void ext3_mb_release_desc(struct ext3_buddy *e3b)
-+{
-+ brelse(e3b->bd_bh);
-+ brelse(e3b->bd_bh2);
-+}
++#ifdef AGGRESSIVE_CHECK
+
+static void mb_check_buddy(struct ext3_buddy *e3b)
+{
+ int order = e3b->bd_blkbits + 1;
+ int max, max2, i, j, k, count;
++ int fragments = 0, fstart;
+ void *buddy, *buddy2;
+
-+ if (likely(!ext3_mb_aggressive))
-+ return;
-+
+ if (!test_opt(e3b->bd_sb, MBALLOC))
+ return;
+
++ {
++ static int mb_check_counter = 0;
++ if (mb_check_counter++ % 300 != 0)
++ return;
++ }
++
+ while (order > 1) {
+ buddy = mb_find_buddy(e3b, order, &max);
+ J_ASSERT(buddy);
+ }
+ count++;
+ }
-+ J_ASSERT(e3b->bd_bd->bb_counters[order] == count);
++ J_ASSERT(e3b->bd_info->bb_counters[order] == count);
+ order--;
+ }
+
++ fstart = -1;
+ buddy = mb_find_buddy(e3b, 0, &max);
+ for (i = 0; i < max; i++) {
-+ if (!mb_test_bit(i, buddy))
++ if (!mb_test_bit(i, buddy)) {
++ J_ASSERT(i >= e3b->bd_info->bb_first_free);
++ if (fstart == -1) {
++ fragments++;
++ fstart = i;
++ }
+ continue;
++ }
++ fstart = -1;
+ /* check used bits only */
+ for (j = 0; j < e3b->bd_blkbits + 1; j++) {
+ buddy2 = mb_find_buddy(e3b, j, &max2);
+ J_ASSERT(mb_test_bit(k, buddy2));
+ }
+ }
++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(e3b->bd_info));
++ J_ASSERT(e3b->bd_info->bb_fragments == fragments);
++}
++
++#else
++#define mb_check_buddy(e3b)
++#endif
++
++/* find most significant bit */
++static int inline fmsb(unsigned short word)
++{
++ int order;
++
++ if (word > 255) {
++ order = 7;
++ word >>= 8;
++ } else {
++ order = -1;
++ }
++
++ do {
++ order++;
++ word >>= 1;
++ } while (word != 0);
++
++ return order;
++}
++
++static void inline
++ext3_mb_mark_free_simple(struct super_block *sb, void *buddy, unsigned first,
++ int len, struct ext3_group_info *grp)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ unsigned short min, max, chunk, border;
++
++ mb_debug("mark %u/%u free\n", first, len);
++ J_ASSERT(len < EXT3_BLOCKS_PER_GROUP(sb));
++
++ border = 2 << sb->s_blocksize_bits;
++
++ while (len > 0) {
++ /* find how many blocks can be covered since this position */
++ max = ffs(first | border) - 1;
++
++ /* find how many blocks of power 2 we need to mark */
++ min = fmsb(len);
++
++ mb_debug(" %u/%u -> max %u, min %u\n",
++ first & ((2 << sb->s_blocksize_bits) - 1),
++ len, max, min);
++
++ if (max < min)
++ min = max;
++ chunk = 1 << min;
++
++ /* mark multiblock chunks only */
++ grp->bb_counters[min]++;
++ if (min > 0) {
++ mb_debug(" set %u at %u \n", first >> min,
++ sbi->s_mb_offsets[min]);
++ mb_clear_bit(first >> min, buddy + sbi->s_mb_offsets[min]);
++ }
++
++ len -= chunk;
++ first += chunk;
++ }
++}
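
The function above decomposes a free run into maximal aligned power-of-two
chunks: each pass takes the smaller of the alignment limit
(ffs(first | border) - 1) and the length limit (fmsb(len)), counts the
chunk, and clears the matching buddy bit for orders above zero. A standalone
trace of the decomposition; this fmsb() is a simplified version of the one
above, without the byte-skipping shortcut:

    #include <stdio.h>
    #include <strings.h>    /* ffs() */

    /* floor(log2(word)) for word > 0 */
    static int fmsb(unsigned short word)
    {
        int order = -1;
        do { order++; word >>= 1; } while (word);
        return order;
    }

    int main(void)
    {
        unsigned first = 13, len = 22;    /* free run [13, 35) */
        unsigned border = 1u << 15;       /* stands in for 2 << blkbits */

        while (len > 0) {
            unsigned max = ffs(first | border) - 1;  /* alignment limit */
            unsigned min = fmsb(len);                /* length limit */

            if (max < min)
                min = max;
            printf("order %u chunk (%u blocks) at %u\n",
                   min, 1u << min, first);
            first += 1u << min;
            len -= 1u << min;
        }
        return 0;
    }

It prints chunks of 1, 2, 16, 2 and 1 blocks: the run is split so that every
chunk is naturally aligned to its own size, which is exactly what lets the
per-order buddy bitmaps describe it.
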
++
++static void
++ext3_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap,
++ struct ext3_group_info *grp)
++{
++ unsigned short max = EXT3_BLOCKS_PER_GROUP(sb);
++ unsigned short i = 0, first, len;
++ unsigned free = 0, fragments = 0;
++ unsigned long long period = get_cycles();
++
++ i = mb_find_next_zero_bit(bitmap, max, 0);
++ grp->bb_first_free = i;
++ while (i < max) {
++ fragments++;
++ first = i;
++ i = find_next_bit(bitmap, max, i);
++ len = i - first;
++ free += len;
++ if (len > 1)
++ ext3_mb_mark_free_simple(sb, buddy, first, len, grp);
++ else
++ grp->bb_counters[0]++;
++ if (i < max)
++ i = mb_find_next_zero_bit(bitmap, max, i);
++ }
++ grp->bb_fragments = fragments;
++
++ /* bb_state shouldn't being modified because all
++ * others waits for init completion on page lock */
++ clear_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &grp->bb_state);
++ if (free != grp->bb_free) {
++ printk("EXT3-fs: %u blocks in bitmap, %u in group descriptor\n",
++ free, grp->bb_free);
++ grp->bb_free = free;
++ }
++
++ period = get_cycles() - period;
++ spin_lock(&EXT3_SB(sb)->s_bal_lock);
++ EXT3_SB(sb)->s_mb_buddies_generated++;
++ EXT3_SB(sb)->s_mb_generation_time += period;
++ spin_unlock(&EXT3_SB(sb)->s_bal_lock);
++}
++
++static int ext3_mb_init_cache(struct page *page)
++{
++ int blocksize, blocks_per_page, groups_per_page;
++ int err = 0, i, first_group, first_block;
++ struct super_block *sb;
++ struct buffer_head *bhs = NULL;
++ struct buffer_head **bh;
++ struct inode *inode;
++ char *data, *bitmap;
++
++ mb_debug("init page %lu\n", page->index);
++
++ inode = page->mapping->host;
++ sb = inode->i_sb;
++ blocksize = 1 << inode->i_blkbits;
++ blocks_per_page = PAGE_CACHE_SIZE / blocksize;
++
++ groups_per_page = blocks_per_page >> 1;
++ if (groups_per_page == 0)
++ groups_per_page = 1;
++
++ /* allocate buffer_heads to read bitmaps */
++ if (groups_per_page > 1) {
++ err = -ENOMEM;
++ i = sizeof(struct buffer_head *) * groups_per_page;
++ bh = kmalloc(i, GFP_NOFS);
++ if (bh == NULL)
++ goto out;
++ memset(bh, 0, i);
++ } else
++ bh = &bhs;
++
++ first_group = page->index * blocks_per_page / 2;
++
++ /* read all groups the page covers into the cache */
++ for (i = 0; i < groups_per_page; i++) {
++ struct ext3_group_desc * desc;
++
++ if (first_group + i >= EXT3_SB(sb)->s_groups_count)
++ break;
++
++ err = -EIO;
++ desc = ext3_get_group_desc(sb, first_group + i, NULL);
++ if (desc == NULL)
++ goto out;
++
++ err = -ENOMEM;
++ bh[i] = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap));
++ if (bh[i] == NULL)
++ goto out;
++
++ if (buffer_uptodate(bh[i]))
++ continue;
++
++ lock_buffer(bh[i]);
++ if (buffer_uptodate(bh[i])) {
++ unlock_buffer(bh[i]);
++ continue;
++ }
++
++ get_bh(bh[i]);
++ bh[i]->b_end_io = end_buffer_read_sync;
++ submit_bh(READ, bh[i]);
++ mb_debug("read bitmap for group %u\n", first_group + i);
++ }
++
++ /* wait for I/O completion */
++ for (i = 0; i < groups_per_page && bh[i]; i++)
++ wait_on_buffer(bh[i]);
++
++ /* XXX: I/O error handling here */
++
++ first_block = page->index * blocks_per_page;
++ for (i = 0; i < blocks_per_page; i++) {
++ int group;
++
++ group = (first_block + i) >> 1;
++ if (group >= EXT3_SB(sb)->s_groups_count)
++ break;
++
++ data = page_address(page) + (i * blocksize);
++ bitmap = bh[group - first_group]->b_data;
++
++ if ((first_block + i) & 1) {
++ /* this is block of buddy */
++ mb_debug("put buddy for group %u in page %lu/%x\n",
++ group, page->index, i * blocksize);
++ memset(data, 0xff, blocksize);
++ EXT3_SB(sb)->s_group_info[group]->bb_fragments = 0;
++ memset(EXT3_SB(sb)->s_group_info[group]->bb_counters, 0,
++ sizeof(unsigned short)*(sb->s_blocksize_bits+2));
++ ext3_mb_generate_buddy(sb, data, bitmap,
++ EXT3_SB(sb)->s_group_info[group]);
++ } else {
++ /* this is block of bitmap */
++ mb_debug("put bitmap for group %u in page %lu/%x\n",
++ group, page->index, i * blocksize);
++ memcpy(data, bitmap, blocksize);
++ }
++ }
++ SetPageUptodate(page);
++
++out:
++ if (bh) {
++ for (i = 0; i < groups_per_page && bh[i]; i++)
++ brelse(bh[i]);
++ if (bh != &bhs)
++ kfree(bh);
++ }
++ return err;
++}
++
++static int ext3_mb_load_buddy(struct super_block *sb, int group,
++ struct ext3_buddy *e3b)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ struct inode *inode = sbi->s_buddy_cache;
++ int blocks_per_page, block, pnum, poff;
++ struct page *page;
++
++ mb_debug("load group %u\n", group);
++
++ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
++
++ e3b->bd_blkbits = sb->s_blocksize_bits;
++ e3b->bd_info = sbi->s_group_info[group];
++ e3b->bd_sb = sb;
++ e3b->bd_group = group;
++ e3b->bd_buddy_page = NULL;
++ e3b->bd_bitmap_page = NULL;
++
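++ /* group N's bitmap copy lives in cache block 2N and its buddy in
++ * cache block 2N+1; turn the block number into a page and an offset */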
++ block = group * 2;
++ pnum = block / blocks_per_page;
++ poff = block % blocks_per_page;
++
++ page = find_get_page(inode->i_mapping, pnum);
++ if (page == NULL || !PageUptodate(page)) {
++ if (page)
++ page_cache_release(page);
++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
++ if (page) {
++ if (!PageUptodate(page))
++ ext3_mb_init_cache(page);
++ unlock_page(page);
++ }
++ }
++ if (page == NULL || !PageUptodate(page))
++ goto err;
++ e3b->bd_bitmap_page = page;
++ e3b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
++ mark_page_accessed(page);
++
++ block++;
++ pnum = block / blocks_per_page;
++ poff = block % blocks_per_page;
++
++ page = find_get_page(inode->i_mapping, pnum);
++ if (page == NULL || !PageUptodate(page)) {
++ if (page)
++ page_cache_release(page);
++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
++ if (page) {
++ if (!PageUptodate(page))
++ ext3_mb_init_cache(page);
++ unlock_page(page);
++ }
++ }
++ if (page == NULL || !PageUptodate(page))
++ goto err;
++ e3b->bd_buddy_page = page;
++ e3b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
++ mark_page_accessed(page);
++
++ J_ASSERT(e3b->bd_bitmap_page != NULL);
++ J_ASSERT(e3b->bd_buddy_page != NULL);
++
++ return 0;
++
++err:
++ if (e3b->bd_bitmap_page)
++ page_cache_release(e3b->bd_bitmap_page);
++ if (e3b->bd_buddy_page)
++ page_cache_release(e3b->bd_buddy_page);
++ e3b->bd_buddy = NULL;
++ e3b->bd_bitmap = NULL;
++ return -EIO;
+}
+
++static void ext3_mb_release_desc(struct ext3_buddy *e3b)
++{
++ if (e3b->bd_bitmap_page)
++ page_cache_release(e3b->bd_bitmap_page);
++ if (e3b->bd_buddy_page)
++ page_cache_release(e3b->bd_buddy_page);
++}
++
++
+static inline void
+ext3_lock_group(struct super_block *sb, int group)
+{
-+ spin_lock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock);
++ bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT,
++ &EXT3_SB(sb)->s_group_info[group]->bb_state);
+}
+
+static inline void
+ext3_unlock_group(struct super_block *sb, int group)
+{
-+ spin_unlock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock);
++ bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT,
++ &EXT3_SB(sb)->s_group_info[group]->bb_state);
+}
+
+static int mb_find_order_for_block(struct ext3_buddy *e3b, int block)
+
+static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count)
+{
-+ int block, max, order;
++ int block = 0, max = 0, order;
+ void *buddy, *buddy2;
+
+ mb_check_buddy(e3b);
+
-+ e3b->bd_bd->bb_free += count;
-+ if (first < e3b->bd_bd->bb_first_free)
-+ e3b->bd_bd->bb_first_free = first;
-+
++ e3b->bd_info->bb_free += count;
++ if (first < e3b->bd_info->bb_first_free)
++ e3b->bd_info->bb_first_free = first;
++
++ /* let's maintain fragments counter */
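++ /* `block`/`max` are set when the left/right neighbour of the freed
++ * range is free: joining free neighbours on both sides merges two
++ * fragments into one, while a range isolated between used blocks
++ * becomes a new fragment */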
++ if (first != 0)
++ block = !mb_test_bit(first - 1, EXT3_MB_BITMAP(e3b));
++ if (first + count < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0])
++ max = !mb_test_bit(first + count, EXT3_MB_BITMAP(e3b));
++ if (block && max)
++ e3b->bd_info->bb_fragments--;
++ else if (!block && !max)
++ e3b->bd_info->bb_fragments++;
++
++ /* let's maintain buddy itself */
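++ /* clear each block in the order-0 bitmap, then walk up the orders:
++ * whenever the buddy of the current chunk is free as well, the two
++ * coalesce into a single order+1 chunk */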
+ while (count-- > 0) {
+ block = first++;
+ order = 0;
+
+ J_ASSERT(mb_test_bit(block, EXT3_MB_BITMAP(e3b)));
+ mb_clear_bit(block, EXT3_MB_BITMAP(e3b));
-+ e3b->bd_bd->bb_counters[order]++;
++ e3b->bd_info->bb_counters[order]++;
+
+ /* start of the buddy */
+ buddy = mb_find_buddy(e3b, order, &max);
+ mb_set_bit(block, buddy);
+ mb_set_bit(block + 1, buddy);
+ }
-+ e3b->bd_bd->bb_counters[order]--;
-+ e3b->bd_bd->bb_counters[order]--;
++ e3b->bd_info->bb_counters[order]--;
++ e3b->bd_info->bb_counters[order]--;
+
+ block = block >> 1;
+ order++;
-+ e3b->bd_bd->bb_counters[order]++;
++ e3b->bd_info->bb_counters[order]++;
+
+ mb_clear_bit(block, buddy2);
+ buddy = buddy2;
+}
+
+static int mb_find_extent(struct ext3_buddy *e3b, int order, int block,
-+ int needed, struct ext3_free_extent *ex)
++ int needed, struct ext3_free_extent *ex)
+{
+ int next, max, ord;
+ void *buddy;
+ return 0;
+ }
+
-+ if (order == 0) {
++ if (likely(order == 0)) {
+ /* find actual order */
+ order = mb_find_order_for_block(e3b, block);
+ block = block >> order;
+ ex->fe_start = block << order;
+ ex->fe_group = e3b->bd_group;
+
-+ while ((buddy = mb_find_buddy(e3b, order, &max))) {
++ while (needed > ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) {
+
+ if (block + 1 >= max)
+ break;
+
+static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex)
+{
++ int ord, mlen = 0, max = 0, cur;
+ int start = ex->fe_start;
+ int len = ex->fe_len;
-+ int ord, mlen, max, cur;
++ unsigned ret = 0;
+ int len0 = len;
+ void *buddy;
+
-+ e3b->bd_bd->bb_free -= len;
-+ if (e3b->bd_bd->bb_first_free == start)
-+ e3b->bd_bd->bb_first_free += len;
++ mb_check_buddy(e3b);
+
++ e3b->bd_info->bb_free -= len;
++ if (e3b->bd_info->bb_first_free == start)
++ e3b->bd_info->bb_first_free += len;
++
++ /* let's maintain fragments counter */
++ if (start != 0)
++ mlen = !mb_test_bit(start - 1, EXT3_MB_BITMAP(e3b));
++ if (start + len < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0])
++ max = !mb_test_bit(start + len, EXT3_MB_BITMAP(e3b));
++ if (mlen && max)
++ e3b->bd_info->bb_fragments++;
++ else if (!mlen && !max)
++ e3b->bd_info->bb_fragments--;
++
++ /* let's maintain buddy itself */
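++ /* if `start` is aligned to an order-`ord` buddy and at least 2^ord
++ * blocks remain, consume the whole buddy; otherwise split it into
++ * two order-(ord-1) halves and retry at the lower order */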
+ while (len) {
+ ord = mb_find_order_for_block(e3b, start);
+
+ buddy = mb_find_buddy(e3b, ord, &max);
+ J_ASSERT((start >> ord) < max);
+ mb_set_bit(start >> ord, buddy);
-+ e3b->bd_bd->bb_counters[ord]--;
++ e3b->bd_info->bb_counters[ord]--;
+ start += mlen;
+ len -= mlen;
+ J_ASSERT(len >= 0);
+ continue;
+ }
+
++ /* store for history */
++ if (ret == 0)
++ ret = len | (ord << 16);
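++ /* record, once, the unsatisfied tail length and the order of the
++ * first buddy we had to split; reported via mb_history */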
++
+ /* we have to split large buddy */
+ J_ASSERT(ord > 0);
+ buddy = mb_find_buddy(e3b, ord, &max);
+ mb_set_bit(start >> ord, buddy);
-+ e3b->bd_bd->bb_counters[ord]--;
++ e3b->bd_info->bb_counters[ord]--;
+
+ ord--;
+ cur = (start >> ord) & ~1U;
+ buddy = mb_find_buddy(e3b, ord, &max);
+ mb_clear_bit(cur, buddy);
+ mb_clear_bit(cur + 1, buddy);
-+ e3b->bd_bd->bb_counters[ord]++;
-+ e3b->bd_bd->bb_counters[ord]++;
++ e3b->bd_info->bb_counters[ord]++;
++ e3b->bd_info->bb_counters[ord]++;
+ }
+
+ /* now drop all the bits in bitmap */
+
+ mb_check_buddy(e3b);
+
-+ return 0;
++ return ret;
+}
+
+/*
+static void ext3_mb_use_best_found(struct ext3_allocation_context *ac,
+ struct ext3_buddy *e3b)
+{
++ unsigned long ret;
++
+ ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
-+ mb_mark_used(e3b, &ac->ac_b_ex);
++ ret = mb_mark_used(e3b, &ac->ac_b_ex);
++
+ ac->ac_status = AC_STATUS_FOUND;
++ ac->ac_tail = ret & 0xffff;
++ ac->ac_buddy = ret >> 16;
+}
+
+/*
+ struct ext3_free_extent *ex,
+ struct ext3_buddy *e3b)
+{
-+ int factor = EXT3_SB(ac->ac_sb)->s_mb_factor;
+ struct ext3_free_extent *bex = &ac->ac_b_ex;
-+ int diff = ac->ac_g_ex.fe_len - ex->fe_len;
++ struct ext3_free_extent *gex = &ac->ac_g_ex;
+
+ J_ASSERT(ex->fe_len > 0);
+ J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8);
+ /*
+ * The special case - take what you catch first
+ */
-+ if (ac->ac_flags & EXT3_MB_HINT_FIRST) {
++ if (unlikely(ac->ac_flags & EXT3_MB_HINT_FIRST)) {
+ *bex = *ex;
+ ext3_mb_use_best_found(ac, e3b);
+ return;
+ /*
+ * Let's check whether the chunk is good enough
+ */
-+ if (ex->fe_len >= ac->ac_g_ex.fe_len) {
-+ *bex = *ex;
-+ ext3_mb_use_best_found(ac, e3b);
-+ return;
-+ }
-+
-+ /*
-+ * If the request is vey large, then it makes sense to use large
-+ * chunks for it. Even if they don't satisfy whole request.
-+ */
-+ if (ex->fe_len > 1000) {
-+ *bex = *ex;
-+ ext3_mb_use_best_found(ac, e3b);
-+ return;
-+ }
-+
-+ /*
-+ * Sometimes it's worty to take close chunk
-+ */
-+ if (factor && (ac->ac_g_ex.fe_len * 100) / (diff * 100) >= factor) {
++ if (ex->fe_len == gex->fe_len) {
+ *bex = *ex;
+ ext3_mb_use_best_found(ac, e3b);
+ return;
+
+ /*
+ * If new found extent is better, store it in the context
-+ * FIXME: possible the policy should be more complex?
+ */
-+ if (ex->fe_len > bex->fe_len) {
++ if (bex->fe_len < gex->fe_len) {
++ /* if the request isn't satisfied, any found extent
++ * larger than the previous best one is better */
++ if (ex->fe_len > bex->fe_len)
++ *bex = *ex;
++ } else if (ex->fe_len > gex->fe_len) {
++ /* if the request is satisfied, then we try to find
++ * an extent that still satisfies the request, but is
++ * smaller than the previous one */
++ if (ex->fe_len < bex->fe_len)
++ *bex = *ex;
+ }
+
+ /*
++ * Let's scan at least a few extents and not pick the first one found
++ */
++ if (bex->fe_len > gex->fe_len && ac->ac_found > ext3_mb_min_to_scan)
++ ac->ac_status = AC_STATUS_BREAK;
++
++ /*
+ * We don't want to scan for a whole year
+ */
+ if (ac->ac_found > ext3_mb_max_to_scan)
+ ext3_lock_group(ac->ac_sb, group);
+ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex);
+
-+ if (max > 0)
++ if (max > 0) {
++ ac->ac_b_ex = ex;
+ ext3_mb_use_best_found(ac, e3b);
++ }
+
+ ext3_unlock_group(ac->ac_sb, group);
+
-+ if (ac->ac_status == AC_STATUS_FOUND)
-+ ext3_mb_dirty_buddy(e3b);
+ ext3_mb_release_desc(e3b);
+
+ return 0;
+ J_ASSERT(ex.fe_len > 0);
+ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group);
+ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start);
++ ac->ac_found++;
+ ac->ac_b_ex = ex;
+ ext3_mb_use_best_found(ac, e3b);
+ }
+ ext3_unlock_group(ac->ac_sb, group);
+
-+ if (ac->ac_status == AC_STATUS_FOUND)
-+ ext3_mb_dirty_buddy(e3b);
+ ext3_mb_release_desc(e3b);
+
+ return 0;
+}
++
++/*
++ * The routine scans buddy structures (not the bitmap!) from the given
++ * order up to the max order and tries to find a big enough chunk to
++ * satisfy the request
++ */
++static void ext3_mb_simple_scan_group(struct ext3_allocation_context *ac,
++ struct ext3_buddy *e3b)
++{
++ struct super_block *sb = ac->ac_sb;
++ struct ext3_group_info *grp = e3b->bd_info;
++ void *buddy;
++ int i, k, max;
++
++ J_ASSERT(ac->ac_2order > 0);
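++ /* a zero bit at order i marks a free 2^i chunk; the first free chunk
++ * at or above ac_2order is an exact fit (or is trimmed to one by
++ * ext3_mb_use_best_found()) */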
++ for (i = ac->ac_2order; i < sb->s_blocksize_bits + 1; i++) {
++ if (grp->bb_counters[i] == 0)
++ continue;
++
++ buddy = mb_find_buddy(e3b, i, &max);
++ if (buddy == NULL) {
++ printk(KERN_ALERT "looking for wrong order?\n");
++ break;
++ }
++
++ k = mb_find_next_zero_bit(buddy, max, 0);
++ J_ASSERT(k < max);
++
++ ac->ac_found++;
++
++ ac->ac_b_ex.fe_len = 1 << i;
++ ac->ac_b_ex.fe_start = k << i;
++ ac->ac_b_ex.fe_group = e3b->bd_group;
++
++ ext3_mb_use_best_found(ac, e3b);
++ J_ASSERT(ac->ac_b_ex.fe_len == ac->ac_g_ex.fe_len);
++
++ if (unlikely(ext3_mb_stats))
++ atomic_inc(&EXT3_SB(sb)->s_bal_2orders);
++
++ break;
++ }
++}
++
+/*
+ * The routine scans the group and measures all found extents.
+ * In order to optimize scanning, caller must pass number of
-+ * free blocks in the group, so the routine can upper limit.
++ * free blocks in the group, so the routine knows the upper limit.
+ */
-+static void ext3_mb_scan_group(struct ext3_allocation_context *ac,
-+ struct ext3_buddy *e3b)
++static void ext3_mb_complex_scan_group(struct ext3_allocation_context *ac,
++ struct ext3_buddy *e3b)
+{
+ struct super_block *sb = ac->ac_sb;
+ void *bitmap = EXT3_MB_BITMAP(e3b);
+ struct ext3_free_extent ex;
+ int i, free;
+
-+ free = e3b->bd_bd->bb_free;
++ free = e3b->bd_info->bb_free;
+ J_ASSERT(free > 0);
+
-+ i = e3b->bd_bd->bb_first_free;
++ i = e3b->bd_info->bb_first_free;
+
-+ while (free && ac->ac_status != AC_STATUS_FOUND) {
-+ i = ext3_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i);
++ while (free && ac->ac_status == AC_STATUS_CONTINUE) {
++ i = mb_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i);
+ if (i >= sb->s_blocksize * 8) {
+ J_ASSERT(free == 0);
+ break;
+static int ext3_mb_good_group(struct ext3_allocation_context *ac,
+ int group, int cr)
+{
-+ int free;
++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb);
++ struct ext3_group_info *grp = sbi->s_group_info[group];
++ unsigned free, fragments, i, bits;
+
-+ J_ASSERT(cr >= 0 && cr < 3);
++ J_ASSERT(cr >= 0 && cr < 4);
++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(grp));
+
-+ free = EXT3_SB(ac->ac_sb)->s_buddy_blocks[group]->bb_free;
++ free = grp->bb_free;
++ fragments = grp->bb_fragments;
+ if (free == 0)
+ return 0;
++ if (fragments == 0)
++ return 0;
+
-+ if (cr == 0) {
-+ if (free >= ac->ac_g_ex.fe_len >> 1)
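++ /* cr 0: a free 2^ac_2order buddy chunk exists; cr 1: the average free
++ * extent is large enough; cr 2: enough free blocks in total; cr 3: any
++ * group with free blocks. the cases fall through, so a group passing
++ * a weaker criterion is accepted as well */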
++ switch (cr) {
++ case 0:
++ J_ASSERT(ac->ac_2order != 0);
++ bits = ac->ac_sb->s_blocksize_bits + 1;
++ for (i = ac->ac_2order; i < bits; i++)
++ if (grp->bb_counters[i] > 0)
++ return 1;
++ case 1:
++ if ((free / fragments) >= ac->ac_g_ex.fe_len)
++ return 1;
++ case 2:
++ if (free >= ac->ac_g_ex.fe_len)
++ return 1;
++ case 3:
+ return 1;
-+ } else if (cr == 1) {
-+ if (free >= ac->ac_g_ex.fe_len >> 2)
-+ return 1;
-+ } else if (cr == 2) {
-+ return 1;
++ default:
++ BUG();
+ }
++
+ return 0;
+}
+
+ ac.ac_g_ex.fe_start = block;
+ ac.ac_g_ex.fe_len = *len;
+ ac.ac_flags = flags;
++ ac.ac_2order = 0;
++ ac.ac_criteria = 0;
+
-+ /*
-+ * Sometimes, caller may want to merge even small number
-+ * of blocks to an existing extent
-+ */
++ /* if the request is an exact power of two of at least 2^7 blocks
++ * (e.g. 512KB/1MB/2MB with 4KB blocks), remember its order so the
++ * buddy structure can be searched directly */
++ i = ffs(*len);
++ if (i >= 8) {
++ i--;
++ if ((*len & (~(1 << i))) == 0)
++ ac.ac_2order = i;
++ }
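++ /* e.g. *len == 256: ffs() returns 9, i becomes 8 and
++ * 256 & ~(1 << 8) == 0, so ac_2order = 8 */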
++
++ /* Sometimes the caller may want to merge even a small
++ * number of blocks into an existing extent */
+ if (ac.ac_flags & EXT3_MB_HINT_MERGE) {
+ err = ext3_mb_find_by_goal(&ac, &e3b);
+ if (err)
+ goto found;
+ }
+
-+ /*
-+ * FIXME
-+ * If requested chunk is power of 2 length, we can try
-+ * to exploit buddy nature to speed allocation up
-+ */
-+
-+
-+ /*
-+ * Let's just scan groups to find more-less suitable blocks
-+ */
-+ cr = 0;
++ /* Let's just scan groups to find more or less suitable blocks */
++ cr = ac.ac_2order ? 0 : 1;
+repeat:
-+ for (; cr < 3 && ac.ac_status == AC_STATUS_CONTINUE; cr++) {
++ for (; cr < 4 && ac.ac_status == AC_STATUS_CONTINUE; cr++) {
++ ac.ac_criteria = cr;
+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) {
+ if (group == EXT3_SB(sb)->s_groups_count)
+ group = 0;
+
++ if (EXT3_MB_GRP_NEED_INIT(sbi->s_group_info[group])) {
++ /* we need full data about the group
++ * to make a good selection */
++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b);
++ if (err)
++ goto out_err;
++ ext3_mb_release_desc(&e3b);
++ }
++
+ /* check whether the group is good for our criteria */
+ if (!ext3_mb_good_group(&ac, group, cr))
+ continue;
+ continue;
+ }
+
-+ ext3_mb_scan_group(&ac, &e3b);
++ ac.ac_groups_scanned++;
++ if (cr == 0)
++ ext3_mb_simple_scan_group(&ac, &e3b);
++ else
++ ext3_mb_complex_scan_group(&ac, &e3b);
++
+ ext3_unlock_group(sb, group);
+
-+ if (ac.ac_status == AC_STATUS_FOUND)
-+ ext3_mb_dirty_buddy(&e3b);
+ ext3_mb_release_desc(&e3b);
+
+ if (err)
+ }
+ }
+
-+ if (ac.ac_status == AC_STATUS_BREAK &&
++ if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND &&
+ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) {
-+ /* We've been searching too long. Let's try to allocate
-+ * the best chunk we've found so far. */
-+ if (ac.ac_g_ex.fe_len >= 128 &&
-+ ac.ac_b_ex.fe_len < ac.ac_g_ex.fe_len / 4)
-+ ext3_warning(inode->i_sb, __FUNCTION__,
-+ "too long searching: got %d want %d\n",
-+ ac.ac_b_ex.fe_len, ac.ac_g_ex.fe_len);
++ /*
++ * We've been searching too long. Let's try to allocate
++ * the best chunk we've found so far
++ */
++
++ /*if (ac.ac_found > ext3_mb_max_to_scan)
++ printk(KERN_ERR "EXT3-fs: too long searching at "
++ "%u (%d/%d)\n", cr, ac.ac_b_ex.fe_len,
++ ac.ac_g_ex.fe_len);*/
+ ext3_mb_try_best_found(&ac, &e3b);
+ if (ac.ac_status != AC_STATUS_FOUND) {
+ /*
+ * The only thing we can do is just take first
+ * found block(s)
+ */
-+ mb_debug(KERN_ERR "EXT3-fs: and someone won our chunk\n");
++ printk(KERN_ERR "EXT3-fs: and someone won our chunk\n");
+ ac.ac_b_ex.fe_group = 0;
+ ac.ac_b_ex.fe_start = 0;
+ ac.ac_b_ex.fe_len = 0;
+ ac.ac_status = AC_STATUS_CONTINUE;
+ ac.ac_flags |= EXT3_MB_HINT_FIRST;
-+ cr = 2;
++ cr = 3;
+ goto repeat;
+ }
+ }
+ printk("EXT3-fs: groups: ");
+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++)
+ printk("%d: %d ", i,
-+ sbi->s_buddy_blocks[i]->bb_free);
++ sbi->s_group_info[i]->bb_free);
+ printk("\n");
+#endif
+ goto out;
+ ext3_error(sb, "ext3_new_block",
+ "Allocating block in system zone - "
+ "block = %u", block);
-+ if (unlikely(ext3_mb_aggressive)) {
-+ for (i = 0; i < ac.ac_b_ex.fe_len; i++)
-+ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i,
-+ bitmap_bh->b_data));
-+ }
-+
++#ifdef AGGRESSIVE_CHECK
++ for (i = 0; i < ac.ac_b_ex.fe_len; i++)
++ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data));
++#endif
+ mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len);
+
+ spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group));
+ * path only, here is single block always */
+ ext3_mb_release_blocks(sb, 1);
+ }
-+
-+ if ((ext3_mb_stats) && (ac.ac_g_ex.fe_len > 1)) {
-+ spin_lock(&sbi->s_bal_lock);
-+ sbi->s_bal_reqs++;
-+ sbi->s_bal_allocated += *len;
++
++ if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) {
++ atomic_inc(&sbi->s_bal_reqs);
++ atomic_add(*len, &sbi->s_bal_allocated);
+ if (*len >= ac.ac_g_ex.fe_len)
-+ sbi->s_bal_success++;
-+ sbi->s_bal_ex_scanned += ac.ac_found;
++ atomic_inc(&sbi->s_bal_success);
++ atomic_add(ac.ac_found, &sbi->s_bal_ex_scanned);
+ if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start &&
+ ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group)
-+ sbi->s_bal_goals++;
++ atomic_inc(&sbi->s_bal_goals);
+ if (ac.ac_found > ext3_mb_max_to_scan)
-+ sbi->s_bal_breaks++;
-+ spin_unlock(&sbi->s_bal_lock);
++ atomic_inc(&sbi->s_bal_breaks);
+ }
+
++ ext3_mb_store_history(sb, &ac);
++
+ return block;
+}
++EXPORT_SYMBOL(ext3_mb_new_blocks);
++
++#ifdef EXT3_MB_HISTORY
++struct ext3_mb_proc_session {
++ struct ext3_mb_history *history;
++ struct super_block *sb;
++ int start;
++ int max;
++};
+
-+int ext3_mb_get_descr_loc(struct ext3_buddy *e3b, struct buffer_head **bh,
-+ struct ext3_mb_group_descr **grp)
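++/* the mb_history buffer is a fixed-size ring: s_mb_history_cur wraps
++ * modulo s_mb_history_max, so `start` is the oldest live entry and
++ * slots with goal.fe_len == 0 are skipped while iterating */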
++static void *ext3_mb_history_skip_empty(struct ext3_mb_proc_session *s,
++ struct ext3_mb_history *hs,
++ int first)
+{
-+ struct super_block *sb = e3b->bd_sb;
-+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ int descr_per_block, err, offset;
-+ struct ext3_mb_grp_header *hdr;
-+ unsigned long block;
-+
-+ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header))
-+ / sizeof(struct ext3_mb_group_descr);
-+ block = e3b->bd_group / descr_per_block;
-+ *bh = ext3_bread(NULL, sbi->s_buddy, block, 0, &err);
-+ if (*bh == NULL) {
-+ printk(KERN_ERR "EXT3-fs: cant getblk descr for group %d: %d\n",
-+ e3b->bd_group, err);
-+ return err;
-+ }
++ if (hs == s->history + s->max)
++ hs = s->history;
++ if (!first && hs == s->history + s->start)
++ return NULL;
++ while (hs->goal.fe_len == 0) {
++ hs++;
++ if (hs == s->history + s->max)
++ hs = s->history;
++ if (hs == s->history + s->start)
++ return NULL;
++ }
++ return hs;
++}
+
-+ hdr = (struct ext3_mb_grp_header *) (*bh)->b_data;
-+ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) {
-+ printk(KERN_ERR "EXT3-fs: invalid magic in group %d!\n",
-+ e3b->bd_group);
-+ brelse(*bh);
-+ *bh = NULL;
-+ return -EIO;
-+ }
++static void *ext3_mb_seq_history_start(struct seq_file *seq, loff_t *pos)
++{
++ struct ext3_mb_proc_session *s = seq->private;
++ struct ext3_mb_history *hs;
++ int l = *pos;
++
++ if (l == 0)
++ return SEQ_START_TOKEN;
++ hs = ext3_mb_history_skip_empty(s, s->history + s->start, 1);
++ if (!hs)
++ return NULL;
++ while (--l && (hs = ext3_mb_history_skip_empty(s, ++hs, 0)) != NULL);
++ return hs;
++}
+
-+ offset = e3b->bd_group % descr_per_block
-+ * sizeof(struct ext3_mb_group_descr)
-+ + sizeof(struct ext3_mb_grp_header);
-+ *grp = (struct ext3_mb_group_descr *) ((*bh)->b_data + offset);
++static void *ext3_mb_seq_history_next(struct seq_file *seq, void *v, loff_t *pos)
++{
++ struct ext3_mb_proc_session *s = seq->private;
++ struct ext3_mb_history *hs = v;
+
-+ return 0;
++ ++*pos;
++ if (v == SEQ_START_TOKEN)
++ return ext3_mb_history_skip_empty(s, s->history + s->start, 1);
++ else
++ return ext3_mb_history_skip_empty(s, ++hs, 0);
+}
+
-+int ext3_mb_load_descr(struct ext3_buddy *e3b)
++static int ext3_mb_seq_history_show(struct seq_file *seq, void *v)
+{
-+ struct ext3_mb_group_descr *grp;
-+ struct ext3_group_desc *gdp;
-+ struct buffer_head *bh;
-+ int err, i;
-+
-+ err = ext3_mb_get_descr_loc(e3b, &bh, &grp);
-+ if (err)
-+ return err;
-+
-+ e3b->bd_bd->bb_first_free = grp->mgd_first_free;
-+ e3b->bd_bd->bb_free = grp->mgd_free;
-+ for (i = 0; i <= e3b->bd_blkbits + 1; i++) {
-+ J_ASSERT(i < 16);
-+ e3b->bd_bd->bb_counters[i] = grp->mgd_counters[i];
-+ }
-+ brelse(bh);
++ struct ext3_mb_history *hs = v;
++ char buf[20], buf2[20];
+
-+ /* additional checks against old group descriptor */
-+ gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL);
-+ if (!gdp)
-+ return -EIO;
-+ if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) {
-+ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n",
-+ e3b->bd_group, e3b->bd_bd->bb_free,
-+ le16_to_cpu(gdp->bg_free_blocks_count));
-+ return -ENODATA;
++ if (v == SEQ_START_TOKEN) {
++ seq_printf(seq, "%-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n",
++ "goal", "result", "found", "grps", "cr", "merge",
++ "tail", "broken");
++ return 0;
+ }
+
++ sprintf(buf, "%u/%u/%u", hs->goal.fe_group,
++ hs->goal.fe_start, hs->goal.fe_len);
++ sprintf(buf2, "%u/%u/%u", hs->result.fe_group,
++ hs->result.fe_start, hs->result.fe_len);
++ seq_printf(seq, "%-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", buf,
++ buf2, hs->found, hs->groups, hs->cr,
++ hs->merged ? "M" : "", hs->tail,
++ hs->buddy ? 1 << hs->buddy : 0);
+ return 0;
+}
+
++static void ext3_mb_seq_history_stop(struct seq_file *seq, void *v)
++{
++}
++
++static struct seq_operations ext3_mb_seq_history_ops = {
++ .start = ext3_mb_seq_history_start,
++ .next = ext3_mb_seq_history_next,
++ .stop = ext3_mb_seq_history_stop,
++ .show = ext3_mb_seq_history_show,
++};
+
-+int ext3_mb_update_descr(struct ext3_buddy *e3b)
++static int ext3_mb_seq_history_open(struct inode *inode, struct file *file)
+{
-+ struct ext3_mb_group_descr *grp;
-+ struct ext3_group_desc *gdp;
-+ struct buffer_head *bh;
-+ handle_t *handle;
-+ int err, i;
++ struct super_block *sb = PDE(inode)->data;
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ struct ext3_mb_proc_session *s;
++ int rc, size;
+
-+ /* additional checks against old group descriptor */
-+ gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL);
-+ if (!gdp)
++ s = kmalloc(sizeof(*s), GFP_KERNEL);
++ if (s == NULL)
++ return -EIO;
++ size = sizeof(struct ext3_mb_history) * sbi->s_mb_history_max;
++ s->history = kmalloc(size, GFP_KERNEL);
++ if (s->history == NULL) {
++ kfree(s);
+ return -EIO;
-+ if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) {
-+ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n",
-+ e3b->bd_group, e3b->bd_bd->bb_free,
-+ le16_to_cpu(gdp->bg_free_blocks_count));
-+ return -ENODATA;
+ }
+
-+ err = ext3_mb_get_descr_loc(e3b, &bh, &grp);
-+ if (err)
-+ return err;
++ spin_lock(&sbi->s_mb_history_lock);
++ memcpy(s->history, sbi->s_mb_history, size);
++ s->max = sbi->s_mb_history_max;
++ s->start = sbi->s_mb_history_cur % s->max;
++ spin_unlock(&sbi->s_mb_history_lock);
+
-+ handle = ext3_journal_start_sb(e3b->bd_sb, 1);
-+ if (IS_ERR(handle)) {
-+ err = PTR_ERR(handle);
-+ handle = NULL;
-+ goto out;
++ rc = seq_open(file, &ext3_mb_seq_history_ops);
++ if (rc == 0) {
++ struct seq_file *m = (struct seq_file *)file->private_data;
++ m->private = s;
++ } else {
++ kfree(s->history);
++ kfree(s);
+ }
++ return rc;
+
-+ err = ext3_journal_get_write_access(handle, bh);
-+ if (err)
-+ goto out;
-+ grp->mgd_first_free = e3b->bd_bd->bb_first_free;
-+ grp->mgd_free = e3b->bd_bd->bb_free;
-+ for (i = 0; i <= e3b->bd_blkbits + 1; i++) {
-+ J_ASSERT(i < 16);
-+ grp->mgd_counters[i] = e3b->bd_bd->bb_counters[i];
-+ }
-+ err = ext3_journal_dirty_metadata(handle, bh);
-+ if (err)
-+ goto out;
-+ err = 0;
-+out:
-+ brelse(bh);
-+ if (handle)
-+ ext3_journal_stop(handle);
-+ return err;
+}
+
-+int ext3_mb_generate_buddy(struct ext3_buddy *e3b)
++static int ext3_mb_seq_history_release(struct inode *inode, struct file *file)
+{
-+ struct super_block *sb = e3b->bd_sb;
-+ struct buffer_head *bh;
-+ int i, count = 0;
++ struct seq_file *seq = (struct seq_file *)file->private_data;
++ struct ext3_mb_proc_session *s = seq->private;
++ kfree(s->history);
++ kfree(s);
++ return seq_release(inode, file);
++}
++
++static struct file_operations ext3_mb_seq_history_fops = {
++ .owner = THIS_MODULE,
++ .open = ext3_mb_seq_history_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = ext3_mb_seq_history_release,
++};
+
-+ mb_debug("generate buddy for group %d\n", e3b->bd_group);
-+ memset(e3b->bd_bh->b_data, 0xff, sb->s_blocksize);
-+ memset(e3b->bd_bh2->b_data, 0xff, sb->s_blocksize);
++static void ext3_mb_history_release(struct super_block *sb)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ char name[64];
+
-+ bh = read_block_bitmap(sb, e3b->bd_group);
-+ if (bh == NULL)
-+ return -EIO;
++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name));
++ remove_proc_entry("mb_history", sbi->s_mb_proc);
++ remove_proc_entry(name, proc_root_ext3);
++
++ if (sbi->s_mb_history)
++ kfree(sbi->s_mb_history);
++}
+
-+ /* mb_free_blocks will set real free */
-+ e3b->bd_bd->bb_free = 0;
-+ e3b->bd_bd->bb_first_free = 1 << 15;
-+ /*
-+ * if change bb_counters size, don't forget about
-+ * ext3_mb_init_backend() -bzzz
-+ */
-+ memset(e3b->bd_bd->bb_counters, 0,
-+ sizeof(unsigned) * (sb->s_blocksize_bits + 2));
++static void ext3_mb_history_init(struct super_block *sb)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ char name[64];
++ int i;
+
-+ /* loop over the blocks, and create buddies for free ones */
-+ for (i = 0; i < sb->s_blocksize * 8; i++) {
-+ if (!mb_test_bit(i, (void *) bh->b_data)) {
-+ mb_free_blocks(e3b, i, 1);
-+ count++;
++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name));
++ sbi->s_mb_proc = proc_mkdir(name, proc_root_ext3);
++ if (sbi->s_mb_proc != NULL) {
++ struct proc_dir_entry *p;
++ p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc);
++ if (p) {
++ p->proc_fops = &ext3_mb_seq_history_fops;
++ p->data = sb;
+ }
+ }
-+ brelse(bh);
-+ mb_check_buddy(e3b);
-+ ext3_mb_dirty_buddy(e3b);
+
-+ return 0;
++ sbi->s_mb_history_max = 1000;
++ sbi->s_mb_history_cur = 0;
++ spin_lock_init(&sbi->s_mb_history_lock);
++ i = sbi->s_mb_history_max * sizeof(struct ext3_mb_history);
++ sbi->s_mb_history = kmalloc(i, GFP_KERNEL);
++ if (sbi->s_mb_history != NULL)
++ memset(sbi->s_mb_history, 0, i);
++ /* if we can't allocate history, then we simply won't use it */
+}
+
-+EXPORT_SYMBOL(ext3_mb_new_blocks);
++static void
++ext3_mb_store_history(struct super_block *sb, struct ext3_allocation_context *ac)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ struct ext3_mb_history h;
++
++ if (unlikely(sbi->s_mb_history == NULL))
++ return;
++
++ h.goal = ac->ac_g_ex;
++ h.result = ac->ac_b_ex;
++ h.found = ac->ac_found;
++ h.cr = ac->ac_criteria;
++ h.groups = ac->ac_groups_scanned;
++ h.tail = ac->ac_tail;
++ h.buddy = ac->ac_buddy;
++ h.merged = 0;
++ if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
++ ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
++ h.merged = 1;
++
++ spin_lock(&sbi->s_mb_history_lock);
++ memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h));
++ if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max)
++ sbi->s_mb_history_cur = 0;
++ spin_unlock(&sbi->s_mb_history_lock);
++}
+
-+#define MB_CREDITS \
-+ (EXT3_DATA_TRANS_BLOCKS + 3 + EXT3_INDEX_EXTRA_TRANS_BLOCKS + \
-+ 2 * EXT3_SINGLEDATA_TRANS_BLOCKS)
++#else
++#define ext3_mb_history_release(sb)
++#define ext3_mb_history_init(sb)
++#endif
+
-+int ext3_mb_init_backend(struct super_block *sb, int *created)
++int ext3_mb_init_backend(struct super_block *sb)
+{
-+ int err, i, len, descr_per_block, buddy_offset, size;
-+ struct inode *root = sb->s_root->d_inode;
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ struct ext3_mb_grp_header *hdr;
-+ struct buffer_head *bh = NULL;
-+ unsigned long block;
-+ struct dentry *db;
-+ handle_t *handle;
-+ tid_t target;
-+
-+ *created = 0;
++ int i, len;
++
+ len = sizeof(struct ext3_group_info *) * sbi->s_groups_count;
-+ sbi->s_buddy_blocks = kmalloc(len, GFP_KERNEL);
-+ if (sbi->s_buddy_blocks == NULL) {
-+ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n");
++ sbi->s_group_info = kmalloc(len, GFP_KERNEL);
++ if (sbi->s_group_info == NULL) {
++ printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n");
+ return -ENOMEM;
+ }
-+ memset(sbi->s_buddy_blocks, 0, len);
-+ sbi->s_buddy = NULL;
-+
-+ down(&root->i_sem);
-+ len = strlen(EXT3_BUDDY_FILE);
-+ db = lookup_one_len(EXT3_BUDDY_FILE, sb->s_root, len);
-+ if (IS_ERR(db)) {
-+ err = PTR_ERR(db);
-+ printk(KERN_ERR "EXT3-fs: cant lookup buddy: %d\n", err);
-+ up(&root->i_sem);
-+ goto out;
-+ }
++ memset(sbi->s_group_info, 0, len);
+
-+ if (db->d_inode == NULL) {
-+ err = ext3_create(root, db, S_IFREG, NULL);
-+ if (err) {
-+ printk(KERN_ERR "EXT3-fs: cant create buddy: %d\n", err);
-+ up(&root->i_sem);
-+ goto out;
-+ }
-+ db->d_inode->i_flags |= S_IMMUTABLE | S_NOATIME;
-+ *created = 1;
-+ mb_debug("no buddy file, regenerate\n");
-+ }
-+ up(&root->i_sem);
-+ sbi->s_buddy = igrab(db->d_inode);
-+
-+ /* calculate needed size */
-+ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header))
-+ / sizeof(struct ext3_mb_group_descr);
-+ buddy_offset = (sbi->s_groups_count + descr_per_block - 1)
-+ / descr_per_block;
-+ len = sbi->s_groups_count * sb->s_blocksize * 2 +
-+ buddy_offset * sb->s_blocksize;
-+ if (len != i_size_read(sbi->s_buddy)) {
-+ if (*created == 0)
-+ printk("EXT3-fs: wrong i_size (%u != %u), regenerate\n",
-+ (unsigned) len,
-+ (unsigned) i_size_read(sbi->s_buddy));
-+ *created = 1;
-+ }
-+
-+ /* read/create mb group descriptors */
-+ for (i = 0; i < buddy_offset; i++) {
-+ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS);
-+ if (IS_ERR(handle)) {
-+ printk(KERN_ERR "EXT3-fs: cant start transaction\n");
-+ err = PTR_ERR(handle);
-+ goto err_out;
-+ }
-+
-+ bh = ext3_bread(handle, sbi->s_buddy, i, 1, &err);
-+ if (bh == NULL) {
-+ printk(KERN_ERR "EXT3-fs: cant getblk grp: %d\n", err);
-+ goto err_out;
-+ }
-+ hdr = (struct ext3_mb_grp_header *) bh->b_data;
-+ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) {
-+ err = ext3_journal_get_write_access(handle, bh);
-+ if (err)
-+ goto err_out;
-+ if (*created == 0)
-+ printk(KERN_ERR
-+ "EXT3-fs: invalid header 0x%x in %d,"
-+ "regenerate\n", hdr->mh_magic, i);
-+ *created = 1;
-+ hdr->mh_magic = EXT3_MB_MAGIC_V1;
-+ err = ext3_journal_dirty_metadata(handle, bh);
-+ if (err)
-+ goto err_out;
-+ }
-+ brelse(bh);
-+ ext3_journal_stop(handle);
++ sbi->s_buddy_cache = new_inode(sb);
++ if (sbi->s_buddy_cache == NULL) {
++ printk(KERN_ERR "EXT3-fs: can't get new inode\n");
++ kfree(sbi->s_group_info);
++ return -ENOMEM;
+ }
+
+ /*
-+ * if change bb_counters size, don't forget about ext3_mb_generate_buddy()
++ * calculate needed size. if change bb_counters size,
++ * don't forget about ext3_mb_generate_buddy()
+ */
-+ len = sizeof(struct ext3_buddy_group_blocks);
-+ len += sizeof(unsigned) * (sb->s_blocksize_bits + 2);
++ len = sizeof(struct ext3_group_info);
++ len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
+ for (i = 0; i < sbi->s_groups_count; i++) {
++ struct ext3_group_desc * desc;
+
-+ sbi->s_buddy_blocks[i] = kmalloc(len, GFP_KERNEL);
-+ if (sbi->s_buddy_blocks[i] == NULL) {
++ sbi->s_group_info[i] = kmalloc(len, GFP_KERNEL);
++ if (sbi->s_group_info[i] == NULL) {
+ printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n");
-+ err = -ENOMEM;
-+ goto out2;
-+ }
-+ memset(sbi->s_buddy_blocks[i], 0, len);
-+
-+ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS);
-+ if (IS_ERR(handle)) {
-+ printk(KERN_ERR "EXT3-fs: cant start transaction\n");
-+ err = PTR_ERR(handle);
-+ goto out2;
-+ }
-+
-+ /* allocate block for bitmap */
-+ block = buddy_offset + i * 2;
-+ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err);
-+ if (bh == NULL) {
-+ printk(KERN_ERR "EXT3-fs: cant getblk bitmap: %d\n", err);
-+ goto out2;
-+ }
-+ sbi->s_buddy_blocks[i]->bb_bitmap = bh->b_blocknr;
-+ brelse(bh);
-+
-+ /* allocate block for buddy */
-+ block = buddy_offset + i * 2 + 1;
-+ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err);
-+ if (bh == NULL) {
-+ printk(KERN_ERR "EXT3-fs: cant getblk for buddy: %d\n", err);
-+ goto out2;
++ goto err_out;
+ }
-+ sbi->s_buddy_blocks[i]->bb_buddy = bh->b_blocknr;
-+ brelse(bh);
-+
-+ size = (block + 1) << sbi->s_buddy->i_blkbits;
-+ if (size > sbi->s_buddy->i_size) {
-+ *created = 1;
-+ EXT3_I(sbi->s_buddy)->i_disksize = size;
-+ i_size_write(sbi->s_buddy, size);
-+ mark_inode_dirty(sbi->s_buddy);
++ desc = ext3_get_group_desc(sb, i, NULL);
++ if (desc == NULL) {
++ printk(KERN_ERR "EXT3-fs: can't read descriptor %u\n", i);
++ goto err_out;
+ }
-+ ext3_journal_stop(handle);
-+
-+ spin_lock_init(&sbi->s_buddy_blocks[i]->bb_lock);
-+ sbi->s_buddy_blocks[i]->bb_md_cur = NULL;
-+ sbi->s_buddy_blocks[i]->bb_tid = 0;
++ memset(sbi->s_group_info[i], 0, len);
++ set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT,
++ &sbi->s_group_info[i]->bb_state);
++ sbi->s_group_info[i]->bb_free =
++ le16_to_cpu(desc->bg_free_blocks_count);
+ }
+
-+ if (journal_start_commit(sbi->s_journal, &target))
-+ log_wait_commit(sbi->s_journal, target);
-+
-+out2:
-+ dput(db);
-+out:
-+ return err;
++ return 0;
+
+err_out:
-+ return err;
++ while (--i >= 0)
++ kfree(sbi->s_group_info[i]);
++ kfree(sbi->s_group_info);
++ iput(sbi->s_buddy_cache);
++
++ return -ENOMEM;
+}
+
-+int ext3_mb_write_descriptors(struct super_block *sb)
++int ext3_mb_init(struct super_block *sb, int needs_recovery)
+{
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ struct ext3_buddy e3b;
-+ int ret = 0, i, err;
++ struct inode *root = sb->s_root->d_inode;
++ unsigned i, offset, max;
++ struct dentry *dentry;
+
-+ for (i = 0; i < sbi->s_groups_count; i++) {
-+ if (sbi->s_buddy_blocks[i] == NULL)
-+ continue;
++ if (!test_opt(sb, MBALLOC))
++ return 0;
+
-+ err = ext3_mb_load_buddy(sb, i, &e3b);
-+ if (err == 0) {
-+ ext3_mb_update_descr(&e3b);
-+ ext3_mb_release_desc(&e3b);
-+ } else
-+ ret = err;
++ i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);
++
++ sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
++ if (sbi->s_mb_offsets == NULL) {
++ clear_opt(sbi->s_mount_opt, MBALLOC);
++ return -ENOMEM;
+ }
-+ return ret;
++ sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
++ if (sbi->s_mb_maxs == NULL) {
++ clear_opt(sbi->s_mount_opt, MBALLOC);
++ kfree(sbi->s_mb_offsets);
++ return -ENOMEM;
++ }
++
++ /* order 0 is regular bitmap */
++ sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
++ sbi->s_mb_offsets[0] = 0;
++
++ i = 1;
++ offset = 0;
++ max = sb->s_blocksize << 2;
++ do {
++ sbi->s_mb_offsets[i] = offset;
++ sbi->s_mb_maxs[i] = max;
++ offset += 1 << (sb->s_blocksize_bits - i);
++ max = max >> 1;
++ i++;
++ } while (i <= sb->s_blocksize_bits + 1);
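++ /* e.g. with 4KB blocks: the order-1 map (16384 bits) starts at byte 0
++ * of the buddy block, order 2 (8192 bits) at byte 2048, order 3 at
++ * byte 3072, and so on; order 0 is served by the bitmap block itself */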
++
++ /* init file for buddy data */
++ if ((i = ext3_mb_init_backend(sb))) {
++ clear_opt(sbi->s_mount_opt, MBALLOC);
++ kfree(sbi->s_mb_offsets);
++ kfree(sbi->s_mb_maxs);
++ return i;
++ }
++
++ spin_lock_init(&sbi->s_reserve_lock);
++ spin_lock_init(&sbi->s_md_lock);
++ INIT_LIST_HEAD(&sbi->s_active_transaction);
++ INIT_LIST_HEAD(&sbi->s_closed_transaction);
++ INIT_LIST_HEAD(&sbi->s_committed_transaction);
++ spin_lock_init(&sbi->s_bal_lock);
++
++ /* remove old on-disk buddy file */
++ down(&root->i_sem);
++ dentry = lookup_one_len(".buddy", sb->s_root, strlen(".buddy"));
++ if (!IS_ERR(dentry)) {
++ if (dentry->d_inode != NULL) {
++ i = vfs_unlink(root, dentry);
++ if (i != 0)
++ printk("EXT3-fs: can't remove .buddy file: %d\n", i);
++ }
++ dput(dentry);
++ }
++ up(&root->i_sem);
++
++ ext3_mb_history_init(sb);
++
++ printk("EXT3-fs: mballoc enabled\n");
++ return 0;
+}
+
+int ext3_mb_release(struct super_block *sb)
+ spin_unlock(&sbi->s_md_lock);
+ ext3_mb_free_committed_blocks(sb);
+
-+ if (sbi->s_buddy_blocks) {
-+ ext3_mb_write_descriptors(sb);
++ if (sbi->s_group_info) {
+ for (i = 0; i < sbi->s_groups_count; i++) {
-+ if (sbi->s_buddy_blocks[i] == NULL)
++ if (sbi->s_group_info[i] == NULL)
+ continue;
-+ kfree(sbi->s_buddy_blocks[i]);
++ kfree(sbi->s_group_info[i]);
+ }
-+ kfree(sbi->s_buddy_blocks);
-+ }
-+ if (sbi->s_buddy)
-+ iput(sbi->s_buddy);
++ kfree(sbi->s_group_info);
++ }
++ if (sbi->s_mb_offsets)
++ kfree(sbi->s_mb_offsets);
++ if (sbi->s_mb_maxs)
++ kfree(sbi->s_mb_maxs);
++ if (sbi->s_buddy_cache)
++ iput(sbi->s_buddy_cache);
+ if (sbi->s_blocks_reserved)
+ printk("ext3-fs: %ld blocks still reserved at umount!\n",
+ sbi->s_blocks_reserved);
+ if (ext3_mb_stats) {
-+ printk("EXT3-fs: mballoc: %lu blocks %lu reqs "
-+ "(%lu success)\n", sbi->s_bal_allocated,
-+ sbi->s_bal_reqs, sbi->s_bal_success);
-+ printk("EXT3-fs: mballoc: %lu extents scanned, "
-+ "%lu goal hits, %lu breaks\n", sbi->s_bal_ex_scanned,
-+ sbi->s_bal_goals, sbi->s_bal_breaks);
-+ }
-+
-+ return 0;
-+}
-+
-+int ext3_mb_init(struct super_block *sb, int needs_recovery)
-+{
-+ struct ext3_buddy e3b;
-+ int i, err, created;
-+
-+ if (!test_opt(sb, MBALLOC))
-+ return 0;
-+
-+ /* init file for buddy data */
-+ clear_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC);
-+ if ((err = ext3_mb_init_backend(sb, &created)))
-+ return err;
-+
-+repeat:
-+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
-+ err = ext3_mb_load_buddy(sb, i, &e3b);
-+ if (err) {
-+ /* FIXME: release backend */
-+ return err;
-+ }
-+ if (created || needs_recovery)
-+ ext3_mb_generate_buddy(&e3b);
-+ else
-+ err = ext3_mb_load_descr(&e3b);
-+ ext3_mb_release_desc(&e3b);
-+ if (err == -ENODATA) {
-+ created = 1;
-+ goto repeat;
-+ }
-+ }
-+ if (created || needs_recovery)
-+ printk(KERN_ERR "EXT3-fs: generated buddies for %lu groups\n",
-+ EXT3_SB(sb)->s_groups_count);
-+ spin_lock_init(&EXT3_SB(sb)->s_reserve_lock);
-+ spin_lock_init(&EXT3_SB(sb)->s_md_lock);
-+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_active_transaction);
-+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_closed_transaction);
-+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_committed_transaction);
-+ set_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC);
-+
-+ spin_lock_init(&EXT3_SB(sb)->s_bal_lock);
-+ if (ext3_mb_stats) {
-+ printk("EXT3-fs: mballoc enabled (stats)\n");
-+ } else {
-+ printk("EXT3-fs: mballoc enabled\n");
-+ }
++ printk("EXT3-fs: mballoc: %u blocks %u reqs (%u success)\n",
++ atomic_read(&sbi->s_bal_allocated),
++ atomic_read(&sbi->s_bal_reqs),
++ atomic_read(&sbi->s_bal_success));
++ printk("EXT3-fs: mballoc: %u extents scanned, %u goal hits, "
++ "%u 2^N hits, %u breaks\n",
++ atomic_read(&sbi->s_bal_ex_scanned),
++ atomic_read(&sbi->s_bal_goals),
++ atomic_read(&sbi->s_bal_2orders),
++ atomic_read(&sbi->s_bal_breaks));
++ printk("EXT3-fs: mballoc: %lu buddies generated, %Lu cycles\n",
++ sbi->s_mb_buddies_generated,
++ sbi->s_mb_generation_time);
++ }
++
++ ext3_mb_history_release(sb);
+
+ return 0;
+}
+ mb_debug("\n");
+ ext3_unlock_group(sb, md->group);
+
++ /* balance refcounts from ext3_mb_free_metadata() */
++ page_cache_release(e3b.bd_buddy_page);
++ page_cache_release(e3b.bd_bitmap_page);
++
+ kfree(md);
-+ ext3_mb_dirty_buddy(&e3b);
+ ext3_mb_release_desc(&e3b);
+
+ } while (md);
+ /* new transaction! time to close last one and free blocks for
+ * committed transaction. we know that only one transaction can be
+ * active, so the previous transaction may still be being logged and we
-+ * know that transaction before previous is known to be alreade
++ * know that transaction before previous is known to be already
+ * logged. this means that now we may free blocks freed in all
+ * transactions before previous one. hope I'm clear enough ... */
+
+int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b,
+ int group, int block, int count)
+{
-+ struct ext3_buddy_group_blocks *db = e3b->bd_bd;
++ struct ext3_group_info *db = e3b->bd_info;
+ struct super_block *sb = e3b->bd_sb;
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
+ struct ext3_free_metadata *md;
+ int i;
+
++ J_ASSERT(e3b->bd_bitmap_page != NULL);
++ J_ASSERT(e3b->bd_buddy_page != NULL);
++
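++ /* freed blocks are parked on a per-transaction metadata list and only
++ * returned to the buddy once that transaction commits, so they cannot
++ * be reallocated and overwritten before the commit */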
+ ext3_lock_group(sb, group);
+ for (i = 0; i < count; i++) {
+ md = db->bb_md_cur;
+ spin_lock(&sbi->s_md_lock);
+ list_add(&md->list, &sbi->s_active_transaction);
+ spin_unlock(&sbi->s_md_lock);
++ /* protect buddy cache from being freed,
++ * otherwise we'll refresh it from
++ * on-disk bitmap and lose not-yet-available
++ * blocks */
++ page_cache_get(e3b->bd_buddy_page);
++ page_cache_get(e3b->bd_bitmap_page);
+ db->bb_md_cur = md;
+ db->bb_tid = handle->h_transaction->t_tid;
+ mb_debug("new md 0x%p for group %u\n",
+ if (err)
+ goto error_return;
+
-+ if (unlikely(ext3_mb_aggressive)) {
++#ifdef AGGRESSIVE_CHECK
++ {
+ int i;
+ for (i = 0; i < count; i++)
+ J_ASSERT(mb_test_bit(bit + i, bitmap_bh->b_data));
+ }
-+
++#endif
+ mb_clear_bits(bitmap_bh->b_data, bit, count);
+
+ /* We dirtied the bitmap block */
+ spin_unlock(sb_bgl_lock(sbi, block_group));
+ percpu_counter_mod(&sbi->s_freeblocks_counter, count);
+
-+ ext3_mb_dirty_buddy(&e3b);
+ ext3_mb_release_desc(&e3b);
+
+ *freed = count;
+ return ret;
+}
+
-+void ext3_free_blocks(handle_t *handle, struct inode *inode,
-+ unsigned long block, unsigned long count, int metadata)
++
++void ext3_free_blocks(handle_t *handle, struct inode * inode,
++ unsigned long block, unsigned long count, int metadata)
+{
++ struct super_block *sb;
+ int freed;
+
-+ if (!test_opt(inode->i_sb, MBALLOC) ||
-+ EXT3_SB(inode->i_sb)->s_buddy_blocks == NULL)
-+ ext3_free_blocks_sb(handle, inode->i_sb, block, count, &freed);
++ sb = inode->i_sb;
++ if (!test_opt(sb, MBALLOC))
++ ext3_free_blocks_sb(handle, sb, block, count, &freed);
+ else
-+ ext3_mb_free_blocks(handle, inode, block,count,metadata,&freed);
-+
++ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed);
+ if (freed)
+ DQUOT_FREE_BLOCK(inode, freed);
+ return;
+}
-Index: linux-2.6.5-7.201/fs/ext3/proc.c
-===================================================================
---- linux-2.6.5-7.201.orig/fs/ext3/proc.c 2005-10-13 19:40:57.851699336 +0400
-+++ linux-2.6.5-7.201/fs/ext3/proc.c 2005-10-14 09:02:36.000000000 +0400
-@@ -0,0 +1,195 @@
-+#include <linux/config.h>
-+#include <linux/fs.h>
-+#include <linux/init.h>
-+#include <linux/module.h>
-+#include <linux/kernel.h>
-+#include <linux/jbd.h>
-+#include <linux/ext3_fs.h>
-+#include <linux/ext3_jbd.h>
-+#include <linux/proc_fs.h>
-+#include <linux/errno.h>
-+#include <asm/uaccess.h>
-+
+
+#define EXT3_ROOT "ext3"
-+#define EXT3_MB_AGGRESSIVE_NAME "mb_aggressive"
+#define EXT3_MB_STATS_NAME "mb_stats"
+#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan"
++#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan"
+
-+
-+static struct proc_dir_entry *proc_root_ext3;
-+
-+
-+static int ext3_mb_aggressive_read(char *page, char **start, off_t off,
-+ int count, int *eof, void *data)
++static int ext3_mb_stats_read(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
+{
+ int len;
+
+ if (off != 0)
+ return 0;
+
-+ len = sprintf(page, "%ld\n", ext3_mb_aggressive);
++ len = sprintf(page, "%ld\n", ext3_mb_stats);
+ *start = page;
+ return len;
+}
+
-+static int ext3_mb_aggressive_write(struct file *file, const char *buffer,
-+ unsigned long count, void *data)
++static int ext3_mb_stats_write(struct file *file, const char *buffer,
++ unsigned long count, void *data)
+{
+ char str[32];
+
+ if (count >= sizeof(str)) {
+ printk(KERN_ERR "EXT3: %s string too long, max %u bytes\n",
-+ EXT3_MB_AGGRESSIVE_NAME, sizeof(str));
++ EXT3_MB_STATS_NAME, (int)sizeof(str));
+ return -EOVERFLOW;
+ }
+
+ return -EFAULT;
+
+ /* Coerce the value to a boolean: zero -> 0, non-zero -> 1 */
-+ ext3_mb_aggressive = (simple_strtol(str, NULL, 0) != 0);
++ ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0);
+ return count;
+}
+
-+static int ext3_mb_stats_read(char *page, char **start, off_t off,
-+ int count, int *eof, void *data)
++static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
+{
+ int len;
+
+ if (off != 0)
+ return 0;
+
-+ len = sprintf(page, "%ld\n", ext3_mb_stats);
++ len = sprintf(page, "%ld\n", ext3_mb_max_to_scan);
+ *start = page;
+ return len;
+}
+
-+static int ext3_mb_stats_write(struct file *file, const char *buffer,
-+ unsigned long count, void *data)
++static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer,
++ unsigned long count, void *data)
+{
+ char str[32];
++ long value;
+
+ if (count >= sizeof(str)) {
+ printk(KERN_ERR "EXT3: %s string too long, max %u bytes\n",
-+ EXT3_MB_STATS_NAME, sizeof(str));
++ EXT3_MB_MAX_TO_SCAN_NAME, (int)sizeof(str));
+ return -EOVERFLOW;
+ }
+
+ return -EFAULT;
+
+ /* Parse the value and reject non-positive settings */
-+ ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0);
++ value = simple_strtol(str, NULL, 0);
++ if (value <= 0)
++ return -ERANGE;
++
++ ext3_mb_max_to_scan = value;
++
+ return count;
+}
+
-+static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off,
-+ int count, int *eof, void *data)
++static int ext3_mb_min_to_scan_read(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
+{
+ int len;
+
+ if (off != 0)
+ return 0;
+
-+ len = sprintf(page, "%ld\n", ext3_mb_max_to_scan);
++ len = sprintf(page, "%ld\n", ext3_mb_min_to_scan);
+ *start = page;
+ return len;
+}
+
-+static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer,
-+ unsigned long count, void *data)
++static int ext3_mb_min_to_scan_write(struct file *file, const char *buffer,
++ unsigned long count, void *data)
+{
+ char str[32];
+ long value;
+
+ if (count >= sizeof(str)) {
+ printk(KERN_ERR "EXT3: %s string too long, max %u bytes\n",
-+ EXT3_MB_MAX_TO_SCAN_NAME, sizeof(str));
++ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str));
+ return -EOVERFLOW;
+ }
+
+
+ /* Parse the value and reject non-positive settings */
+ value = simple_strtol(str, NULL, 0);
-+ if (value <= 0)
++ if (value <= 0)
+ return -ERANGE;
+
-+ ext3_mb_max_to_scan = value;
++ ext3_mb_min_to_scan = value;
+
+ return count;
+}
+
+int __init init_ext3_proc(void)
+{
-+ struct proc_dir_entry *proc_ext3_mb_aggressive;
+ struct proc_dir_entry *proc_ext3_mb_stats;
+ struct proc_dir_entry *proc_ext3_mb_max_to_scan;
++ struct proc_dir_entry *proc_ext3_mb_min_to_scan;
+
+ proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs);
+ if (proc_root_ext3 == NULL) {
-+ printk(KERN_ERR "EXT3: Unable to create %s\n", EXT3_ROOT);
-+ return -EIO;
-+ }
-+
-+ /* Initialize EXT3_MB_AGGRESSIVE_NAME */
-+ proc_ext3_mb_aggressive = create_proc_entry(EXT3_MB_AGGRESSIVE_NAME,
-+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
-+ if (proc_ext3_mb_aggressive == NULL) {
-+ printk(KERN_ERR "EXT3: Unable to create %s\n",
-+ EXT3_MB_AGGRESSIVE_NAME);
-+ remove_proc_entry(EXT3_ROOT, proc_root_fs);
++ printk(KERN_ERR "EXT3: Unable to create %s\n", EXT3_ROOT);
+ return -EIO;
+ }
+
-+ proc_ext3_mb_aggressive->data = NULL;
-+ proc_ext3_mb_aggressive->read_proc = ext3_mb_aggressive_read;
-+ proc_ext3_mb_aggressive->write_proc = ext3_mb_aggressive_write;
-+
+ /* Initialize EXT3_MB_STATS_NAME */
+ proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME,
-+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
+ if (proc_ext3_mb_stats == NULL) {
-+ printk(KERN_ERR "EXT3: Unable to create %s\n",
-+ EXT3_MB_STATS_NAME);
-+ remove_proc_entry(EXT3_MB_AGGRESSIVE_NAME, proc_root_ext3);
++ printk(KERN_ERR "EXT3: Unable to create %s\n",
++ EXT3_MB_STATS_NAME);
+ remove_proc_entry(EXT3_ROOT, proc_root_fs);
+ return -EIO;
+ }
+
+ /* Initialize EXT3_MAX_TO_SCAN_NAME */
+ proc_ext3_mb_max_to_scan = create_proc_entry(
-+ EXT3_MB_MAX_TO_SCAN_NAME,
-+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++ EXT3_MB_MAX_TO_SCAN_NAME,
++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
+ if (proc_ext3_mb_max_to_scan == NULL) {
-+ printk(KERN_ERR "EXT3: Unable to create %s\n",
-+ EXT3_MB_STATS_NAME);
++ printk(KERN_ERR "EXT3: Unable to create %s\n",
++ EXT3_MB_MAX_TO_SCAN_NAME);
+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
-+ remove_proc_entry(EXT3_MB_AGGRESSIVE_NAME, proc_root_ext3);
+ remove_proc_entry(EXT3_ROOT, proc_root_fs);
+ return -EIO;
+ }
+ proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read;
+ proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write;
+
++ /* Initialize EXT3_MIN_TO_SCAN_NAME */
++ proc_ext3_mb_min_to_scan = create_proc_entry(
++ EXT3_MB_MIN_TO_SCAN_NAME,
++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++ if (proc_ext3_mb_min_to_scan == NULL) {
++ printk(KERN_ERR "EXT3: Unable to create %s\n",
++ EXT3_MB_MIN_TO_SCAN_NAME);
++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_ROOT, proc_root_fs);
++ return -EIO;
++ }
++
++ proc_ext3_mb_min_to_scan->data = NULL;
++ proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read;
++ proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write;
++
+ return 0;
+}
+
+void exit_ext3_proc(void)
+{
-+ remove_proc_entry(EXT3_MB_AGGRESSIVE_NAME, proc_root_ext3);
+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
+ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3);
+ remove_proc_entry(EXT3_ROOT, proc_root_fs);
+}
-Index: linux-2.6.9/fs/ext3/inode.c
-===================================================================
---- linux-2.6.9.orig/fs/ext3/inode.c 2005-10-14 09:10:12.000000000 +0400
-+++ linux-2.6.9/fs/ext3/inode.c 2005-10-14 09:10:13.000000000 +0400
-@@ -572,7 +572,7 @@
- ext3_journal_forget(handle, branch[i].bh);
- }
- for (i = 0; i < keys; i++)
-- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
-+ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1);
- return err;
- }
-
-@@ -673,7 +673,7 @@
- if (err == -EAGAIN)
- for (i = 0; i < num; i++)
- ext3_free_blocks(handle, inode,
-- le32_to_cpu(where[i].key), 1);
-+ le32_to_cpu(where[i].key), 1, 1);
- return err;
- }
-
-@@ -1831,7 +1831,7 @@
- }
- }
-
-- ext3_free_blocks(handle, inode, block_to_free, count);
-+ ext3_free_blocks(handle, inode, block_to_free, count, 1);
- }
-
- /**
-@@ -2004,7 +2004,7 @@
- ext3_journal_test_restart(handle, inode);
- }
-
-- ext3_free_blocks(handle, inode, nr, 1);
-+ ext3_free_blocks(handle, inode, nr, 1, 1);
-
- if (parent_bh) {
- /*
-Index: linux-2.6.9/fs/ext3/super.c
++
+Index: linux-2.6.9-full/fs/ext3/Makefile
===================================================================
---- linux-2.6.9.orig/fs/ext3/super.c 2005-10-14 09:10:12.000000000 +0400
-+++ linux-2.6.9/fs/ext3/super.c 2005-10-14 09:10:31.000000000 +0400
-@@ -394,6 +394,7 @@
- struct ext3_super_block *es = sbi->s_es;
- int i;
-
-+ ext3_mb_release(sb);
- ext3_ext_release(sb);
- ext3_xattr_put_super(sb);
- journal_destroy(sbi->s_journal);
-@@ -590,7 +591,7 @@
- Opt_commit, Opt_journal_update, Opt_journal_inum,
- Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
- Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
-- Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0,
-+ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_mballoc, Opt_mbfactor,
- Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
- Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_extents, Opt_extdebug,
- };
-@@ -644,6 +645,8 @@
- {Opt_iopen_nopriv, "iopen_nopriv"},
- {Opt_extents, "extents"},
- {Opt_extdebug, "extdebug"},
-+ {Opt_mballoc, "mballoc"},
-+ {Opt_mbfactor, "mbfactor=%u"},
- {Opt_barrier, "barrier=%u"},
- {Opt_err, NULL},
- {Opt_resize, "resize"},
-@@ -954,6 +957,16 @@
- case Opt_extdebug:
- set_opt (sbi->s_mount_opt, EXTDEBUG);
- break;
-+ case Opt_mballoc:
-+ set_opt (sbi->s_mount_opt, MBALLOC);
-+ break;
-+ case Opt_mbfactor:
-+ if (match_int(&args[0], &option))
-+ return 0;
-+ if (option < 0)
-+ return 0;
-+ sbi->s_mb_factor = option;
-+ break;
- default:
- printk (KERN_ERR
- "EXT3-fs: Unrecognized mount option \"%s\" "
-@@ -1637,6 +1650,7 @@
- ext3_count_dirs(sb));
-
- ext3_ext_init(sb);
-+ ext3_mb_init(sb, needs_recovery);
-
- return 0;
-
-@@ -2419,7 +2433,13 @@
+--- linux-2.6.9-full.orig/fs/ext3/Makefile 2005-12-16 23:16:41.000000000 +0300
++++ linux-2.6.9-full/fs/ext3/Makefile 2005-12-16 23:16:42.000000000 +0300
+@@ -5,7 +5,8 @@
+ obj-$(CONFIG_EXT3_FS) += ext3.o
- static int __init init_ext3_fs(void)
- {
-- int err = init_ext3_xattr();
-+ int err;
-+
-+ err = init_ext3_proc();
-+ if (err)
-+ return err;
-+
-+ err = init_ext3_xattr();
- if (err)
- return err;
- err = init_inodecache();
-@@ -2441,6 +2461,7 @@
- unregister_filesystem(&ext3_fs_type);
- destroy_inodecache();
- exit_ext3_xattr();
-+ exit_ext3_proc();
- }
+ ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o\
+- ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o
++ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
++ mballoc.o
- int ext3_prep_san_write(struct inode *inode, long *blocks,
+ ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
+ ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
--- /dev/null
+diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c
+--- orig/fs/ext3/namei.c 2005-10-12 13:58:19.000000000 -0700
++++ patch/fs/ext3/namei.c 2005-10-12 14:00:33.000000000 -0700
+@@ -1603,11 +1603,17 @@
+ static inline void ext3_inc_count(handle_t *handle, struct inode *inode)
+ {
+ inode->i_nlink++;
++ if (is_dx(inode) && inode->i_nlink > 1) {
++ /* limit is 16-bit i_links_count */
++ if (inode->i_nlink >= EXT3_LINK_MAX || inode->i_nlink == 2)
++ inode->i_nlink = 1;
++ }
+ }
+
+ static inline void ext3_dec_count(handle_t *handle, struct inode *inode)
+ {
+- inode->i_nlink--;
++ if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
++ inode->i_nlink--;
+ }
+
+ static int ext3_add_nondir(handle_t *handle,
+@@ -1706,7 +1712,7 @@
+ struct ext3_dir_entry_2 * de;
+ int err, retries = 0;
+
+- if (dir->i_nlink >= EXT3_LINK_MAX)
++ if (EXT3_DIR_LINK_MAXED(dir))
+ return -EMLINK;
+
+ retry:
+@@ -1729,7 +1735,7 @@
+ inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+ dir_block = ext3_bread (handle, inode, 0, 1, &err);
+ if (!dir_block) {
+- inode->i_nlink--; /* is this nlink == 0? */
++ ext3_dec_count(handle, inode); /* is this nlink == 0? */
+ ext3_mark_inode_dirty(handle, inode);
+ iput (inode);
+ goto out_stop;
+@@ -1761,7 +1767,7 @@
+ iput (inode);
+ goto out_stop;
+ }
+- dir->i_nlink++;
++ ext3_inc_count(handle, dir);
+ ext3_update_dx_flag(dir);
+ ext3_mark_inode_dirty(handle, dir);
+ d_instantiate(dentry, inode);
+@@ -2026,10 +2032,10 @@
+ retval = ext3_delete_entry(handle, dir, de, bh);
+ if (retval)
+ goto end_rmdir;
+- if (inode->i_nlink != 2)
+- ext3_warning (inode->i_sb, "ext3_rmdir",
+- "empty directory has nlink!=2 (%d)",
+- inode->i_nlink);
++ if (!EXT3_DIR_LINK_EMPTY(inode))
++ ext3_warning(inode->i_sb, "ext3_rmdir",
++ "empty directory has too many links (%d)",
++ inode->i_nlink);
+ inode->i_version++;
+ inode->i_nlink = 0;
+ /* There's no need to set i_disksize: the fact that i_nlink is
+@@ -2039,7 +2045,7 @@
+ ext3_orphan_add(handle, inode);
+ inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
+ ext3_mark_inode_dirty(handle, inode);
+- dir->i_nlink--;
++ ext3_dec_count(handle, dir);
+ ext3_update_dx_flag(dir);
+ ext3_mark_inode_dirty(handle, dir);
+
+@@ -2090,7 +2096,7 @@
+ dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
+ ext3_update_dx_flag(dir);
+ ext3_mark_inode_dirty(handle, dir);
+- inode->i_nlink--;
++ ext3_dec_count(handle, inode);
+ if (!inode->i_nlink)
+ ext3_orphan_add(handle, inode);
+ inode->i_ctime = dir->i_ctime;
+@@ -2165,7 +2171,7 @@
+ struct inode *inode = old_dentry->d_inode;
+ int err, retries = 0;
+
+- if (inode->i_nlink >= EXT3_LINK_MAX)
++ if (EXT3_DIR_LINK_MAXED(inode))
+ return -EMLINK;
+
+ retry:
+@@ -2252,8 +2258,8 @@
+ if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
+ goto end_rename;
+ retval = -EMLINK;
+- if (!new_inode && new_dir!=old_dir &&
+- new_dir->i_nlink >= EXT3_LINK_MAX)
++ if (!new_inode && new_dir != old_dir &&
++ EXT3_DIR_LINK_MAXED(new_dir))
+ goto end_rename;
+ }
+ if (!new_bh) {
+@@ -2310,7 +2316,7 @@
+ }
+
+ if (new_inode) {
+- new_inode->i_nlink--;
++ ext3_dec_count(handle, new_inode);
+ new_inode->i_ctime = CURRENT_TIME_SEC;
+ }
+ old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
+@@ -2321,11 +2327,13 @@
+ PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
+ BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
+ ext3_journal_dirty_metadata(handle, dir_bh);
+- old_dir->i_nlink--;
++ ext3_dec_count(handle, old_dir);
+ if (new_inode) {
+- new_inode->i_nlink--;
++ /* checked empty_dir above, can't have another parent,
++ * ext3_dec_count() won't work for many-linked dirs */
++ new_inode->i_nlink = 0;
+ } else {
+- new_dir->i_nlink++;
++ ext3_inc_count(handle, new_dir);
+ ext3_update_dx_flag(new_dir);
+ ext3_mark_inode_dirty(handle, new_dir);
+ }
+
+Index: linux-2.6.7/include/linux/ext3_fs.h
+===================================================================
+--- linux-2.6.7.orig/include/linux/ext3_fs.h 2004-06-15 23:19:36.000000000 -0600
++++ linux-2.6.7/include/linux/ext3_fs.h 2004-08-20 17:41:27.000000000 -0600
+@@ -79,7 +81,7 @@
+ /*
+ * Maximal count of links to a file
+ */
+-#define EXT3_LINK_MAX 32000
++#define EXT3_LINK_MAX 65000
+
+ /*
+ * Macro-instructions used to manage several block sizes
+@@ -595,14 +595,15 @@ struct ext3_dir_entry_2 {
+ */
+
+ #ifdef CONFIG_EXT3_INDEX
+- #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
+- EXT3_FEATURE_COMPAT_DIR_INDEX) && \
++#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
++ EXT3_FEATURE_COMPAT_DIR_INDEX) && \
+ (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
+-#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
+-#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
++#define EXT3_DIR_LINK_MAXED(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
++#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || \
++ (is_dx(dir) && (dir)->i_nlink == 1))
+ #else
+ #define is_dx(dir) 0
+-#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
++#define EXT3_DIR_LINK_MAXED(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
+ #define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2)
+ #endif
+
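Note: the nlinks patch above changes directory link accounting so that an
indexed (htree) directory whose subdirectory count would overflow the 16-bit
i_links_count is pinned at i_nlink == 1, meaning "count not tracked". The
sketch below is illustrative only; it is not part of the patch and merely
restates the macros defined above:

/* For an indexed (htree) directory, i_nlink == 1 means the subdirectory
 * count is no longer tracked; plain directories keep the classic
 * 2 + nsubdirs rule and the EXT3_LINK_MAX cap. */
static inline int dir_nlink_is_tracked(struct inode *dir)
{
	return !(is_dx(dir) && dir->i_nlink == 1);
}

static inline int dir_may_add_subdir(struct inode *dir)
{
	/* EXT3_DIR_LINK_MAXED() is false for untracked htree dirs,
	 * so mkdir/link/rename no longer fail with -EMLINK there. */
	return !EXT3_DIR_LINK_MAXED(dir);
}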
--- /dev/null
+Index: linux-stage/fs/ext3/ialloc.c
+===================================================================
+--- linux-stage.orig/fs/ext3/ialloc.c 2005-06-26 10:59:43.048185981 +0200
++++ linux-stage/fs/ext3/ialloc.c 2005-06-26 11:01:21.317716027 +0200
+@@ -775,7 +775,6 @@
+ if (!gdp)
+ continue;
+ desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+- cond_resched();
+ }
+ return desc_count;
+ #endif
+Index: linux-stage/fs/ext3/super.c
+===================================================================
+--- linux-stage.orig/fs/ext3/super.c 2005-06-26 10:59:43.205412542 +0200
++++ linux-stage/fs/ext3/super.c 2005-06-26 11:02:29.599941754 +0200
+@@ -2236,11 +2232,9 @@
+ * block group descriptors. If the sparse superblocks
+ * feature is turned on, then not all groups have this.
+ */
+- for (i = 0; i < ngroups; i++) {
++ for (i = 0; i < ngroups; i++)
+ overhead += ext3_bg_has_super(sb, i) +
+ ext3_bg_num_gdb(sb, i);
+- cond_resched();
+- }
+
+ /*
+ * Every block group has an inode bitmap, a block
===================================================================
--- linux-stage.orig/fs/ext3/iopen.c 2005-02-25 14:41:01.017787968 +0200
+++ linux-stage/fs/ext3/iopen.c 2005-02-25 14:41:01.045783712 +0200
-@@ -0,0 +1,277 @@
+@@ -0,0 +1,278 @@
+/*
+ * linux/fs/ext3/iopen.c
+ *
+ goto do_instantiate;
+
+ /* Move the goal to the de hash queue */
-+ goal->d_flags &= ~ DCACHE_DISCONNECTED;
++ goal->d_flags &= ~DCACHE_DISCONNECTED;
+ security_d_instantiate(goal, inode);
++ __d_drop(dentry);
+ __d_rehash(dentry, 0);
+ __d_move(goal, dentry);
+ spin_unlock(&dcache_lock);
===================================================================
--- linux-2.6.5-sles9.orig/fs/ext3/iopen.c 2003-01-30 13:24:37.000000000 +0300
+++ linux-2.6.5-sles9/fs/ext3/iopen.c 2004-11-09 02:18:27.611913312 +0300
-@@ -0,0 +1,275 @@
+@@ -0,0 +1,278 @@
+/*
+ * linux/fs/ext3/iopen.c
+ *
+ alternate = list_entry(inode->i_dentry.next,
+ struct dentry, d_alias);
+ dget_locked(alternate);
++ spin_lock(&alternate->d_lock);
+ alternate->d_vfs_flags |= DCACHE_REFERENCED;
++ spin_unlock(&alternate->d_lock);
+ iput(inode);
+ spin_unlock(&dcache_lock);
+ return alternate;
+ goto do_instantiate;
+
+ /* Move the goal to the de hash queue */
-+ goal->d_flags &= ~ DCACHE_DISCONNECTED;
++ goal->d_flags &= ~DCACHE_DISCONNECTED;
+ security_d_instantiate(goal, inode);
++ __d_drop(dentry);
+ __d_rehash(dentry, 0);
+ __d_move(goal, dentry);
+ spin_unlock(&dcache_lock);
- Documentation/filesystems/ext2.txt | 16 ++
- fs/ext3/Makefile | 2
- fs/ext3/inode.c | 4
- fs/ext3/iopen.c | 259 +++++++++++++++++++++++++++++++++++++
- fs/ext3/iopen.h | 13 +
- fs/ext3/namei.c | 13 +
- fs/ext3/super.c | 11 +
- include/linux/ext3_fs.h | 2
- 8 files changed, 318 insertions(+), 2 deletions(-)
-
-Index: kernel-2.4.212l35/Documentation/filesystems/ext2.txt
+Index: linux-2.6.12-rc6/fs/ext3/Makefile
===================================================================
---- kernel-2.4.212l35.orig/Documentation/filesystems/ext2.txt 2001-07-11 15:44:45.000000000 -0700
-+++ kernel-2.4.212l35/Documentation/filesystems/ext2.txt 2004-05-06 19:48:32.000000000 -0700
-@@ -35,6 +35,22 @@
+--- linux-2.6.12-rc6.orig/fs/ext3/Makefile 2005-06-14 16:00:45.206720992 +0200
++++ linux-2.6.12-rc6/fs/ext3/Makefile 2005-06-14 16:14:33.595382720 +0200
+@@ -4,7 +4,7 @@
- sb=n Use alternate superblock at this location.
+ obj-$(CONFIG_EXT3_FS) += ext3.o
-+iopen Makes an invisible pseudo-directory called
-+ __iopen__ available in the root directory
-+ of the filesystem. Allows open-by-inode-
-+ number. i.e., inode 3145 can be accessed
-+ via /mntpt/__iopen__/3145
-+
-+iopen_nopriv This option makes the iopen directory be
-+ world-readable. This may be safer since it
-+ allows daemons to run as an unprivileged user,
-+ however it significantly changes the security
-+ model of a Unix filesystem, since previously
-+ all files under a mode 700 directory were not
-+ generally avilable even if the
-+ permissions on the file itself is
-+ world-readable.
-+
- grpquota,noquota,quota,usrquota Quota options are silently ignored by ext2.
+-ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
++ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o\
+ ioctl.o namei.o super.o symlink.o hash.o resize.o
-
-Index: kernel-2.4.212l35/fs/ext3/Makefile
-===================================================================
---- kernel-2.4.212l35.orig/fs/ext3/Makefile 2004-05-06 19:46:22.000000000 -0700
-+++ kernel-2.4.212l35/fs/ext3/Makefile 2004-05-06 19:48:32.000000000 -0700
-@@ -11,7 +11,7 @@
-
- export-objs := ext3-exports.o
-
--obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
-+obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
- ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o
- obj-m := $(O_TARGET)
-
-Index: kernel-2.4.212l35/fs/ext3/inode.c
+ ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
+Index: linux-2.6.12-rc6/fs/ext3/inode.c
===================================================================
---- kernel-2.4.212l35.orig/fs/ext3/inode.c 2004-05-06 19:46:24.000000000 -0700
-+++ kernel-2.4.212l35/fs/ext3/inode.c 2004-05-06 19:48:32.000000000 -0700
-@@ -34,6 +34,7 @@
- #include <linux/highuid.h>
- #include <linux/quotaops.h>
- #include <linux/module.h>
+--- linux-2.6.12-rc6.orig/fs/ext3/inode.c 2005-06-14 16:01:16.272150299 +0200
++++ linux-2.6.12-rc6/fs/ext3/inode.c 2005-06-14 16:24:55.686195412 +0200
+@@ -37,6 +37,7 @@
+ #include <linux/mpage.h>
+ #include <linux/uio.h>
+ #include "xattr.h"
+#include "iopen.h"
+ #include "acl.h"
- /*
- * SEARCH_FROM_ZERO forces each block allocation to search from the start
-@@ -2252,6 +2253,9 @@
- struct buffer_head *bh;
- int block;
-
-+ if (ext3_iopen_get_inode(inode))
-+ return;
-+
- if(ext3_get_inode_loc(inode, &iloc))
+ static int ext3_writepage_trans_blocks(struct inode *inode);
+@@ -2437,6 +2438,8 @@
+ ei->i_default_acl = EXT3_ACL_NOT_CACHED;
+ #endif
+ ei->i_block_alloc_info = NULL;
++ if (ext3_iopen_get_inode(inode))
++ return;
+
+ if (__ext3_get_inode_loc(inode, &iloc, 0))
goto bad_inode;
- bh = iloc.bh;
-Index: kernel-2.4.212l35/fs/ext3/iopen.c
+Index: linux-2.6.12-rc6/fs/ext3/iopen.c
===================================================================
---- kernel-2.4.212l35.orig/fs/ext3/iopen.c 2003-03-27 11:16:05.000000000 -0800
-+++ kernel-2.4.212l35/fs/ext3/iopen.c 2004-05-06 19:48:41.000000000 -0700
-@@ -0,0 +1,285 @@
+--- linux-2.6.12-rc6.orig/fs/ext3/iopen.c 2005-06-14 16:14:33.530929595 +0200
++++ linux-2.6.12-rc6/fs/ext3/iopen.c 2005-06-14 16:14:33.626632719 +0200
+@@ -0,0 +1,278 @@
+/*
+ * linux/fs/ext3/iopen.c
+ *
+
+#include <linux/sched.h>
+#include <linux/fs.h>
-+#include <linux/locks.h>
+#include <linux/ext3_jbd.h>
+#include <linux/jbd.h>
+#include <linux/ext3_fs.h>
+#include <linux/smp_lock.h>
++#include <linux/dcache.h>
++#include <linux/security.h>
+#include "iopen.h"
+
+#ifndef assert
+/*
+ * This implements looking up an inode by number.
+ */
-+static struct dentry *iopen_lookup(struct inode *dir, struct dentry *dentry)
++static struct dentry *iopen_lookup(struct inode * dir, struct dentry *dentry,
++ struct nameidata *nd)
+{
+ struct inode *inode;
+ unsigned long ino;
+ //ino != EXT3_ACL_IDX_INO &&
+ //ino != EXT3_ACL_DATA_INO &&
+ ino < EXT3_FIRST_INO(dir->i_sb)) ||
-+ ino > le32_to_cpu(dir->i_sb->u.ext3_sb.s_es->s_inodes_count))
++ ino > le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count))
+ return ERR_PTR(-ENOENT);
+
+ inode = iget(dir->i_sb, ino);
+ }
+
+ assert(list_empty(&dentry->d_alias)); /* d_instantiate */
-+ assert(list_empty(&dentry->d_hash)); /* d_rehash */
++ assert(d_unhashed(dentry)); /* d_rehash */
+
+ /* preferably return a connected dentry */
+ spin_lock(&dcache_lock);
+ list_for_each(lp, &inode->i_dentry) {
+ alternate = list_entry(lp, struct dentry, d_alias);
-+ assert(!(alternate->d_flags & DCACHE_NFSD_DISCONNECTED));
++ assert(!(alternate->d_flags & DCACHE_DISCONNECTED));
+ }
+
+ if (!list_empty(&inode->i_dentry)) {
+ alternate = list_entry(inode->i_dentry.next,
+ struct dentry, d_alias);
+ dget_locked(alternate);
-+ alternate->d_vfs_flags |= DCACHE_REFERENCED;
++ spin_lock(&alternate->d_lock);
++ alternate->d_flags |= DCACHE_REFERENCED;
++ spin_unlock(&alternate->d_lock);
+ iput(inode);
+ spin_unlock(&dcache_lock);
+ return alternate;
+ }
-+ dentry->d_flags |= DCACHE_NFSD_DISCONNECTED;
++ dentry->d_flags |= DCACHE_DISCONNECTED;
+
+ /* d_add(), but don't drop dcache_lock before adding dentry to inode */
+ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */
+ dentry->d_inode = inode;
+
-+ __d_rehash(dentry, 0); /* d_rehash */
++ d_rehash_cond(dentry, 0); /* d_rehash */
+ spin_unlock(&dcache_lock);
+
+ return NULL;
+{
+ const unsigned char *old_name, *new_name;
+
-+ memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN);
++ memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN_MIN);
+ old_name = target->d_name.name;
+ new_name = dentry->d_name.name;
+ if (old_name == target->d_iname)
+ assert(dentry->d_inode == NULL);
+ assert(list_empty(&dentry->d_alias)); /* d_instantiate */
+ if (rehash)
-+ assert(list_empty(&dentry->d_hash)); /* d_rehash */
++ assert(d_unhashed(dentry)); /* d_rehash */
+ assert(list_empty(&dentry->d_subdirs));
+
+ spin_lock(&dcache_lock);
+ /* preferably return a connected dentry */
+ list_for_each(lp, &inode->i_dentry) {
+ tmp = list_entry(lp, struct dentry, d_alias);
-+ if (tmp->d_flags & DCACHE_NFSD_DISCONNECTED) {
++ if (tmp->d_flags & DCACHE_DISCONNECTED) {
+ assert(tmp->d_alias.next == &inode->i_dentry);
+ assert(tmp->d_alias.prev == &inode->i_dentry);
+ goal = tmp;
+ if (!goal)
+ goto do_instantiate;
+
-+ /* Move the goal to the de hash queue - like d_move() */
-+ goal->d_flags &= ~DCACHE_NFSD_DISCONNECTED;
-+ list_del_init(&goal->d_hash);
-+
-+ list_del(&goal->d_child);
-+ list_del(&dentry->d_child);
-+
-+ /* Switch the parents and the names.. */
-+ switch_names(goal, dentry);
-+ do_switch(goal->d_parent, dentry->d_parent);
-+ do_switch(goal->d_name.len, dentry->d_name.len);
-+ do_switch(goal->d_name.hash, dentry->d_name.hash);
-+
-+ /* And add them back to the (new) parent lists */
-+ list_add(&goal->d_child, &goal->d_parent->d_subdirs);
-+ list_add(&dentry->d_child, &dentry->d_parent->d_subdirs);
-+ __d_rehash(goal, 0);
++ /* Move the goal to the de hash queue */
++ goal->d_flags &= ~DCACHE_DISCONNECTED;
++ security_d_instantiate(goal, inode);
++ __d_drop(dentry);
++ d_rehash_cond(dentry, 0);
++ __d_move(goal, dentry);
+ spin_unlock(&dcache_lock);
+ iput(inode);
+
+ dentry->d_inode = inode;
+do_rehash:
+ if (rehash)
-+ __d_rehash(dentry, 0); /* d_rehash */
++ d_rehash_cond(dentry, 0); /* d_rehash */
+ spin_unlock(&dcache_lock);
+
+ return NULL;
+ inode->i_atime = CURRENT_TIME;
+ inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = CURRENT_TIME;
-+ inode->u.ext3_i.i_dtime = 0;
++ EXT3_I(inode)->i_dtime = 0;
+ inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size
+ * (for stat), not the fs block
+ * size */
+
+ return 1;
+}
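Note: for context, the userspace sketch below shows what the iopen patch
enables; it is illustrative only and assumes a filesystem mounted with the
iopen option at /mnt/lustre (the mount point and inode number are made up):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	char path[64];
	unsigned long ino = 3145;	/* illustrative inode number */
	int fd;

	/* Names inside the __iopen__ pseudo-directory are plain inode
	 * numbers, resolved by iopen_lookup() above. */
	snprintf(path, sizeof(path), "/mnt/lustre/__iopen__/%lu", ino);
	fd = open(path, O_RDONLY);
	if (fd < 0) {
		perror("open by inode");
		return 1;
	}
	close(fd);
	return 0;
}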
-Index: kernel-2.4.212l35/fs/ext3/iopen.h
+Index: linux-2.6.12-rc6/fs/ext3/iopen.h
===================================================================
---- kernel-2.4.212l35.orig/fs/ext3/iopen.h 2003-03-27 11:16:05.000000000 -0800
-+++ kernel-2.4.212l35/fs/ext3/iopen.h 2004-05-06 19:48:41.000000000 -0700
+--- linux-2.6.12-rc6.orig/fs/ext3/iopen.h 2005-06-14 16:14:33.534835845 +0200
++++ linux-2.6.12-rc6/fs/ext3/iopen.h 2005-06-14 16:14:33.633468657 +0200
@@ -0,0 +1,15 @@
+/*
+ * iopen.h
+extern int ext3_iopen_get_inode(struct inode *inode);
+extern struct dentry *iopen_connect_dentry(struct dentry *dentry,
+ struct inode *inode, int rehash);
-Index: kernel-2.4.212l35/fs/ext3/namei.c
+Index: linux-2.6.12-rc6/fs/ext3/namei.c
===================================================================
---- kernel-2.4.212l35.orig/fs/ext3/namei.c 2004-05-06 19:46:23.000000000 -0700
-+++ kernel-2.4.212l35/fs/ext3/namei.c 2004-05-06 19:51:48.000000000 -0700
-@@ -36,7 +36,7 @@
- #include <linux/string.h>
- #include <linux/locks.h>
- #include <linux/quotaops.h>
--
+--- linux-2.6.12-rc6.orig/fs/ext3/namei.c 2005-06-14 16:01:14.701837819 +0200
++++ linux-2.6.12-rc6/fs/ext3/namei.c 2005-06-14 16:14:33.644210844 +0200
+@@ -37,6 +37,7 @@
+ #include <linux/buffer_head.h>
+ #include <linux/smp_lock.h>
+ #include "xattr.h"
+#include "iopen.h"
+ #include "acl.h"
/*
- * define how far ahead to read directories while searching them.
-@@ -932,6 +932,9 @@
+@@ -985,6 +986,9 @@
if (dentry->d_name.len > EXT3_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
bh = ext3_find_entry(dentry, &de);
inode = NULL;
if (bh) {
-@@ -943,8 +946,8 @@
+@@ -995,10 +999,8 @@
+ if (!inode)
return ERR_PTR(-EACCES);
- }
}
+- if (inode)
+- return d_splice_alias(inode, dentry);
- d_add(dentry, inode);
- return NULL;
+
+ return iopen_connect_dentry(dentry, inode, 1);
}
- #define S_SHIFT 12
-@@ -1936,10 +1940,6 @@
+
+@@ -2042,10 +2044,6 @@
inode->i_nlink);
- inode->i_version = ++event;
+ inode->i_version++;
inode->i_nlink = 0;
- /* There's no need to set i_disksize: the fact that i_nlink is
- * zero will ensure that the right thing happens during any
- * recovery. */
- inode->i_size = 0;
ext3_orphan_add(handle, inode);
- dir->i_nlink--;
- inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
-@@ -2058,6 +2058,23 @@
+ inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
+ ext3_mark_inode_dirty(handle, inode);
+@@ -2168,6 +2166,23 @@
return err;
}
static int ext3_link (struct dentry * old_dentry,
struct inode * dir, struct dentry *dentry)
{
-@@ -2085,7 +2102,8 @@
+@@ -2191,7 +2206,8 @@
ext3_inc_count(handle, inode);
atomic_inc(&inode->i_count);
- err = ext3_add_nondir(handle, dentry, inode);
+ err = ext3_add_link(handle, dentry, inode);
-+ ext3_orphan_del(handle, inode);
- ext3_journal_stop(handle, dir);
- return err;
- }
-Index: kernel-2.4.212l35/fs/ext3/super.c
++ ext3_orphan_del(handle, inode);
+ ext3_journal_stop(handle);
+ if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
+ goto retry;
+Index: linux-2.6.12-rc6/fs/ext3/super.c
===================================================================
---- kernel-2.4.212l35.orig/fs/ext3/super.c 2004-05-06 19:46:23.000000000 -0700
-+++ kernel-2.4.212l35/fs/ext3/super.c 2004-05-06 19:48:32.000000000 -0700
-@@ -869,6 +869,18 @@
- || !strcmp (this_char, "quota")
- || !strcmp (this_char, "usrquota"))
- /* Don't do anything ;-) */ ;
-+ else if (!strcmp (this_char, "iopen")) {
+--- linux-2.6.12-rc6.orig/fs/ext3/super.c 2005-06-14 16:01:16.287775299 +0200
++++ linux-2.6.12-rc6/fs/ext3/super.c 2005-06-14 16:14:33.656906156 +0200
+@@ -590,6 +590,7 @@
+ Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+ Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
+ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0,
++ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
+ Opt_ignore, Opt_barrier, Opt_err, Opt_resize,
+ };
+
+@@ -638,6 +639,9 @@
+ {Opt_ignore, "noquota"},
+ {Opt_ignore, "quota"},
+ {Opt_ignore, "usrquota"},
++ {Opt_iopen, "iopen"},
++ {Opt_noiopen, "noiopen"},
++ {Opt_iopen_nopriv, "iopen_nopriv"},
+ {Opt_barrier, "barrier=%u"},
+ {Opt_err, NULL},
+ {Opt_resize, "resize"},
+@@ -921,6 +925,18 @@
+ else
+ clear_opt(sbi->s_mount_opt, BARRIER);
+ break;
++ case Opt_iopen:
+ set_opt (sbi->s_mount_opt, IOPEN);
+ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
-+ }
-+ else if (!strcmp (this_char, "noiopen")) {
++ break;
++ case Opt_noiopen:
+ clear_opt (sbi->s_mount_opt, IOPEN);
+ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
-+ }
-+ else if (!strcmp (this_char, "iopen_nopriv")) {
++ break;
++ case Opt_iopen_nopriv:
+ set_opt (sbi->s_mount_opt, IOPEN);
+ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
-+ }
- else if (!strcmp (this_char, "journal")) {
- /* @@@ FIXME */
- /* Eventually we will want to be able to create
-Index: kernel-2.4.212l35/include/linux/ext3_fs.h
++ break;
+ case Opt_ignore:
+ break;
+ case Opt_resize:
+Index: linux-2.6.12-rc6/include/linux/ext3_fs.h
===================================================================
---- kernel-2.4.212l35.orig/include/linux/ext3_fs.h 2004-05-06 19:46:24.000000000 -0700
-+++ kernel-2.4.212l35/include/linux/ext3_fs.h 2004-05-06 19:48:32.000000000 -0700
-@@ -324,6 +324,8 @@
- #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */
- #define EXT3_MOUNT_POSIX_ACL 0x8000 /* POSIX Access Control Lists */
- #define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */
-+#define EXT3_MOUNT_IOPEN 0x40000 /* Allow access via iopen */
-+#define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */
+--- linux-2.6.12-rc6.orig/include/linux/ext3_fs.h 2005-06-14 16:01:14.709650318 +0200
++++ linux-2.6.12-rc6/include/linux/ext3_fs.h 2005-06-14 16:28:38.452794245 +0200
+@@ -358,6 +358,8 @@
+ #define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */
+ #define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */
+ #define EXT3_MOUNT_NOBH 0x40000 /* No bufferheads */
++#define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */
++#define EXT3_MOUNT_IOPEN_NOPRIV 0x100000 /* Make iopen world-readable */
/* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
#ifndef _LINUX_EXT2_FS_H
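Note: the two mount flags above are set purely from mount options; a minimal
sketch of enabling them from userspace follows (device and mount point are
illustrative):

#include <sys/mount.h>

/* Equivalent to: mount -t ext3 -o iopen /dev/sda1 /mnt/lustre
 * Pass "iopen_nopriv" instead to make __iopen__ world-readable. */
int enable_iopen(void)
{
	return mount("/dev/sda1", "/mnt/lustre", "ext3", 0, "iopen");
}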
ext3-include-fixes-2.6-rhel4.patch
ext3-extents-2.6.9-rhel4.patch
ext3-mballoc2-2.6.9-rhel4.patch
-ext3-nlinks-2.6.7.patch
-ext3-htree-dot-2.6.patch
+ext3-nlinks-2.6.9.patch
ext3-ialloc-2.6.patch
--- /dev/null
+ext3-wantedi-2.6-rhel4.patch
+ext3-san-jdike-2.6-suse.patch
+iopen-2.6.12.patch
+ext3-map_inode_page-2.6-suse.patch
+export-ext3-2.6-rhel4.patch
+ext3-include-fixes-2.6-rhel4.patch
+ext3-extents-2.6.12.patch
+ext3-mballoc2-2.6.12.patch
+ext3-nlinks-2.6.9.patch
+ext3-ialloc-2.6.patch
+ext3-remove-cond_resched-calls-2.6.12.patch
+ext3-htree-dot-2.6.patch
+ext3-external-journal-2.6.12.patch
default: all
-MODULES := ldiskfs #quotafmt_test
+MODULES := ldiskfs
+
+@QUOTA_TRUE@MODULES += quotafmt_test
# copy makefile over to not break patches
ext3_extra := $(wildcard @LINUX@/fs/ext3/Makefile)
linux_headers := $(wildcard @LINUX@/include/linux/ext3*.h)
ext3_sources := $(filter-out %.mod.c,$(wildcard @LINUX@/fs/ext3/*.c))
-new_sources := iopen.c iopen.h extents.c mballoc.c proc.c
+new_sources := iopen.c iopen.h extents.c mballoc.c
new_headers := ext3_extents.h
-#quotafmt_sources := lustre_quota_fmt.c
-#quotafmt_headers := lustre_quota_fmt.h
ldiskfs_patched_sources := $(notdir $(ext3_sources) $(ext3_headers)) $(new_sources) $(new_headers)
-ldiskfs_sources := $(ldiskfs_patched_sources) #$(quotafmt_sources) $(quotafmt_headers)
+ldiskfs_sources := $(ldiskfs_patched_sources)
+
+quotafmt_sources := lustre_quota_fmt.c
+quotafmt_headers := lustre_quota_fmt.h
+@QUOTA_TRUE@ldiskfs_sources += $(quotafmt_sources) $(quotafmt_headers)
ldiskfs-objs := $(filter %.o,$(ldiskfs_sources:.c=.o))
-#quotafmt-objs := quotafmt_test.o
+
+@QUOTA_TRUE@quotafmt-objs := quotafmt_test.o
EXTRA_PRE_CFLAGS := -I@LINUX@/fs -I@LUSTRE@ -I@LUSTRE@/ldiskfs
-tbd Cluster File Systems, Inc. <info@clusterfs.com>
- * version 1.4.7
-
-Severity : minor
-Frequency : occasional (Cray XT3 only)
-Bugzilla : 7305
-Description: root not authorized to access files in CRAY_PORTALS environment
-Details : The client process capabilities were not honoured on the MDS in
- a CRAY_PORTALS/CRAY_XT3 environment. If the file had previously
- been accessed by an authorized user then root was able to access
- the file on the local client also. The root user capabilities
- are now allowed on the MDS, as this environment has secure UID.
-
-Severity : minor
-Frequency : occasional
-Bugzilla : 6449
-Description: ldiskfs "too long searching" message happens too often
-Details : A debugging message (otherwise harmless) prints too often on
- the OST console. This has been reduced to only happen when
- there are fragmentation problems on the filesystem.
-
-Severity : minor
-Frequency : rare
-Bugzilla : 9598
-Description: Division by zero in statfs when all OSCs are inactive
-Details : lov_get_stripecnt() returns zero due to incorrect order of checks,
- lov_statfs divides by value returned by lov_get_stripecnt().
-
-Severity : minor
-Frequency : common
-Bugzilla : 9489, 3273
-Description: First write from each client to each OST was only 4kB in size,
- to initialize client writeback cache, which caused sub-optimal
- RPCs and poor layout on disk for the first writen file.
-Details : Clients now request an initial cache grant at (re)connect time
- and so that they can start streaming writes to the cache right
- away and always do full-sized RPCs if there is enough data.
- If the OST is rebooted the client also re-establishes its grant
- so that client cached writes will be honoured under the grant.
-
-Severity : minor
-Frequency : common
-Bugzilla : 7198
-Description: Slow ls (and stat(2) syscall) on files residing on IO-loaded OSTs
-Details : Now I/O RPCs go to different portal number and (presumably) fast
- lock requests (and glimses) and other RPCs get their own service
- threads pool that should be able to service those RPCs
- immediatelly.
-
-Severity : enhancement
-Bugzilla : 7417
-Description: Ability to exchange lustre version between client and servers and
- issue warnings at client side if client is too old. Also for
- liblustre clients there is ability to refuse connection of too old
- clients.
-Details : New 'version' field is added to connect data structure that is
- filled with version info. That info is later checked by server and
- by client.
-
-Severity : minor
-Frequency : rare, liblustre only.
-Bugzilla : 7198
-Description: Two simultaneous writes from liblustre at offset within same page
- might proceed at the same time overwriting eachother with stale
- data.
-Details : I/O lock withing llu_file_prwv was released too early, before data
- actually was hitting the wire. Extended lock-holding time until
- server acknowledges receiving data.
-
-
-tbd Cluster File Systems, Inc. <info@clusterfs.com>
+01-31-2006 Cluster File Systems, Inc. <info@clusterfs.com>
* version 1.4.6
* WIRE PROTOCOL CHANGE. This version of Lustre networking WILL NOT
INTEROPERATE with older versions automatically. Please read the
user documentation before upgrading any part of a live system.
+ * WARNING: Lustre networking configuration changes are required with
+ this release. See https://bugzilla.clusterfs.com/show_bug.cgi?id=10052
+ for details.
* bug fixes
Severity : enhancement
Severity : enhancement
Bugzilla : 7982
-Description: Configuration change for the XT3
+Description: Configuration change for the XT3
The PTLLND is now used to run Lustre over Portals on the XT3
The configure option(s) --with-cray-portals are no longer used.
Rather --with-portals=<path-to-portals-includes> is used to
Details : Modify ldlm_extent_policy() to not expand local locks, acquired
by server: they are not cached anyway.
-Severity : medium
-Frequency : seldom, when mmap is used/files executed from lustre
+Severity : major
+Frequency : when mmap is used/binaries executed from Lustre
Bugzilla : 9482
Description: Unmmap pages before throwing them away from read cache.
Details : llap_shrink cache now attempts to unmap pages before discarding
Severity : enhancement
Frequency : if MDS is started with down OST
Bugzilla : 9439,5706
-Description: Allow startup/shutdown of an MDS without depending on the
+Description: Allow startup/shutdown of an MDS without depending on the
availability of the OSTs.
Details : Asynchronously call mds_lov_synchronize during MDS startup.
Add appropriate locking and lov-osc refcounts for safe
synchronize never started.
Severity : minor
+Frequency : occasional (Cray XT3 only)
+Bugzilla : 7305
+Description: root not authorized to access files in CRAY_PORTALS environment
+Details : The client process capabilities were not honoured on the MDS in
+ a CRAY_PORTALS/CRAY_XT3 environment. If the file had previously
+ been accessed by an authorized user then root was able to access
+ the file on the local client also. The root user capabilities
+ are now allowed on the MDS, as this environment has secure UID.
+
+Severity : minor
+Frequency : occasional
+Bugzilla : 6449
+Description: ldiskfs "too long searching" message happens too often
+Details : A debugging message (otherwise harmless) prints too often on
+ the OST console. This has been reduced to only happen when
+ there are fragmentation problems on the filesystem.
+
+Severity : minor
+Frequency : rare
+Bugzilla : 9598
+Description: Division by zero in statfs when all OSCs are inactive
+Details : lov_get_stripecnt() returns zero due to incorrect order of checks,
+ lov_statfs divides by value returned by lov_get_stripecnt().
+
+Severity : minor
+Frequency : common
+Bugzilla : 9489, 3273
+Description: First write from each client to each OST was only 4kB in size,
+ to initialize client writeback cache, which caused sub-optimal
+ RPCs and poor layout on disk for the first written file.
+Details : Clients now request an initial cache grant at (re)connect time
+ so that they can start streaming writes to the cache right
+ away and always do full-sized RPCs if there is enough data.
+ If the OST is rebooted the client also re-establishes its grant
+ so that client cached writes will be honoured under the grant.
+
+Severity : minor
+Frequency : common
+Bugzilla : 7198
+Description: Slow ls (and stat(2) syscall) on files residing on IO-loaded OSTs
+Details : Now I/O RPCs go to a different portal number, and (presumably)
+ fast lock requests (and glimpses) and other RPCs get their own
+ service thread pool that should be able to service those RPCs
+ immediately.
+
+Severity : enhancement
+Bugzilla : 7417
+Description: Ability to exchange the Lustre version between client and servers
+ and issue warnings at the client side if the client is too old.
+ Also, servers can refuse connections from liblustre clients that
+ are too old.
+Details : A new 'version' field is added to the connect data structure and
+ filled with version info, which is later checked by both server
+ and client.
+
+Severity : minor
+Frequency : rare, liblustre only.
+Bugzilla : 9296, 9581
+Description: Two simultaneous writes from liblustre at offsets within the
+ same page might proceed at the same time, overwriting each
+ other with stale data.
+Details : The I/O lock within llu_file_prwv was released too early, before
+ the data actually hit the wire. The lock-holding time is now
+ extended until the server acknowledges receiving the data.
+
+Severity : minor
Frequency : extremely rare. Never observed in practice.
Bugzilla : 9652
Description: avoid generating lustre_handle cookie of 0.
sections in this function into one.
Severity : enhancement
-Frequency : liblustre-only
Bugzilla : 9528
Description: allow liblustre clients to delegate truncate locking to OST
Details : To avoid overhead of locking, liblustre client instructs OST to
take extent lock in ost_punch() on client's behalf. New connection
flag is added to handle backward compatibility.
+Severity : enhancement
+Bugzilla : 4928, 7341, 9758
+Description: allow number of OST service threads to be specified
+Details : a module parameter allows the number of OST service threads
+ to be specified via "options ost ost_num_threads=X" in
+ /etc/modules.conf or /etc/modutils.conf.
+
+Severity : major
+Frequency : rare
+Bugzilla : 9635
+Description: servers crash with bad pointer in target_handle_connect()
+Details : In rare cases when a client is reconnecting it was possible that
+ the connection request was the last reference for that export.
+ We would temporarily drop the export reference and get a new
+ one, but this may have been the last reference and the export
+ was just destroyed. Get new reference before dropping old one.
+
+Severity : enhancement
+Frequency : if client is started with failover MDS
+Bugzilla : 9818
+Description: Allow multiple MDS hostnames in the mount command
+Details : Try to read the configuration from all specified MDS
+ hostnames during a client mount in case the "primary"
+ MDS is down.
+
+Severity : enhancement
+Bugzilla : 9297
+Description: Stop sending data to evicted clients as soon as possible.
+Details : Check if the client we are about to send or are sending data to
+ was evicted already. (Check is done every second of waiting,
+ for which the l_wait_event interface was extended to allow
+ checking of an exit condition at specified intervals).
+
+Severity : minor
+Frequency : rare, normally only when NFS exporting is done from client
+Bugzilla : 9301
+Description: 'bad disk LOV MAGIC: 0x00000000' error when chown'ing files
+ without objects
+Details : Make mds_get_md() recognise empty md case and set lmm size to 0.
+
+Severity : minor
+Frequency : always, if srand() is called before liblustre initialization
+Bugzilla : 9794
+Description: Liblustre used the system PRNG, disturbing its use by user
+ applications
+Details : Introduce a fast and high-quality PRNG internal to Lustre, and
+ make liblustre and some other places in generic Lustre code use
+ it.
+
+Severity : enhancement
+Bugzilla : 9477, 9557, 9870
+Description: Verify that the MDS configuration logs are updated when the
+ xml is changed
+Details : Check if the .xml configuration logs are newer than the config
+ logs stored on the MDS and report an error if this is the case.
+ Request --write-conf, or allow starting with --old_conf.
+
+Severity : enhancement
+Bugzilla : 6034
+Description: Handle symlinks in the path when checking if Lustre is mounted.
+Details : Resolve intermediate symlinks when checking if a client has
+ mounted a filesystem to avoid duplicate client mounts.
+
+Severity : minor
+Frequency : rare
+Bugzilla : 9309
+Description: lconf can hit an error exception but still return success.
+Details : The lconf command catches the Command error exception at the top
+ level script context and will exit with the associated exit
+ status, but doesn't ensure that this exit status is non-zero.
+
+Severity : minor
+Frequency : rare
+Bugzilla : 9493
+Description: failure of ptlrpc thread startup can cause oops
+Details : Starting a ptlrpc service thread can fail if there are a large
+ number of threads or the server memory is very fragmented.
+ Handle this without oopsing.
+
+Severity : minor
+Frequency : always, only if liblustre and non-default acceptor port was used
+Bugzilla : 9933
+Description: liblustre cannot connect to servers with non-default acceptor port
+Details : tcpnal_set_default_params() was not called, so the environment
+ variable TCPNAL_PORT, as well as other TCPNAL_ environment
+ variables, was ignored
+
+Severity : minor
+Frequency : rare
+Bugzilla : 9923
+Description: two objects could be created on the same OST for a single file
+Details : If an OST is down, in some cases it was possible to create two
+ objects on a single OST for a single file. No problems other
+ than potential performance impact and spurious error messages.
+
+Severity : minor
+Frequency : always
+Bugzilla : 9942
+Description: Inode refcounting problems in NFS export code
+Details : link_raw functions used to call d_instantiate without first
+ obtaining an extra inode reference.
+
+Severity : minor
+Frequency : rare
+Bugzilla : 9942, 9903
+Description: Referencing freed requests leading to crashes, memleaks with NFS.
+Details : We used to require that a call to ll_revalidate_it was always
+ followed by ll_lookup_it. Also, with revalidate_special() it is
+ possible to call ll_revalidate_it() twice for the same dentry
+ even if the first occurrence returned success. This fix changes
+ the semantics of the DISP_ENQ_COMPLETE disposition flag to mean
+ there is an extra reference on a request referred from the
+ intent. ll_intent_release() then releases such a request.
+
+Severity : minor
+Frequency : rare, normally benchmark loads only
+Bugzilla : 1443
+Description: unlinked inodes were kept in memory on the client
+Details : If a client is repeatedly creating and unlinking files it
+ can accumulate a lot of stale inodes in the inode slab cache.
+ If there is no other client load running this can cause the
+ client node to run out of memory. Instead flush old inodes
+ from client cache that have the same inode number as a new inode.
+
+Severity : minor
+Frequency : echo_client brw_test command
+Bugzilla : 9919
+Description: fix echo_client to work with OST preallocation code
+Details : OST preallocation code (5137) didn't take echo_client IO path
+ into account: echo_client calls filter methods outside of any
+ OST thread and, hence, there are no per-thread preallocated
+ pages and buffers to use. Solution: hijack pga pages for IO. As
+ a byproduct, this avoids unnecessary data copying.
+
+Severity : major
+Frequency : rare, unless heavy write-truncate concurrency is continuous
+Bugzilla : 4180, 6984, 7171, 9963, 9331
+Description: OST becomes very slow and/or deadlocked during object unlink
+Details : filter_destroy() was holding onto the parent directory lock
+ while truncating+unlinking objects. For very large objects this
+ may block other threads for a long time and slow overall OST
+ responsiveness. It may also be possible to get a lock ordering
+ deadlock in this case, or run out of journal credits because of
+ the combined truncate+unlink. Solution is to do object truncate
+ first in one transaction without parent lock, and then do the
+ final unlink in a new transaction with the parent lock. This
+ reduces the lock hold time dramatically.
+
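Note: the pseudocode below illustrates the two-transaction ordering described
in the entry above; every name in it is hypothetical, and the real change
lives in filter_destroy():

/* Slow truncate first, in its own transaction, without the parent
 * directory lock; then a second, short transaction takes the lock
 * only for the final cheap unlink. */
static void destroy_object(struct inode *parent, struct dentry *obj)
{
	begin_transaction();		/* txn 1: parent lock NOT held */
	truncate_object(obj, 0);	/* may be slow for large objects */
	commit_transaction();

	lock_parent(parent);		/* txn 2: short critical section */
	begin_transaction();
	unlink_object(parent, obj);
	commit_transaction();
	unlock_parent(parent);
}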
+Severity : major
+Frequency : rare, 2.4 kernels only
+Bugzilla : 9967
+Description: MDS or OST cleanup may trip kernel BUG when dropping kernel lock
+Details : mds_cleanup() and filter_cleanup() need to drop the kernel lock
+ before unmounting their filesystem in order to avoid deadlock.
+ The kernel_locked() function in 2.4 kernels only checks whether
+ the kernel lock is held, not whether it is this process that is
+ holding it as 2.6 kernels do.
+
------------------------------------------------------------------------------
08-26-2005 Cluster File Systems, Inc. <info@clusterfs.com>
* bug fixes
Severity : major
-Frequency : rare (only unsupported configurations with a node running as an
+Frequency : rare (only unsupported configurations with a node running as an
OST and a client)
Bugzilla : 6514, 5137
Description: Mounting a Lustre file system on a node running as an OST could
Severity : enhancement
Bugzilla : 1693
Description: Health checks are now provided for MDS and OSTs
-Details : Additional detailed health check information on MSD and OSTs
+Details : Additional detailed health check information on MDS and OSTs
is now provided through the procfs health_check value.
Severity : minor
Severity : minor
Bugzilla : 7241
-Frequency : filesystems with default stripe_count larger than 77
+Frequency : filesystems with default stripe_count larger than 77
Description: lconf+mke2fs fail when formatting filesystem with > 77 stripes
Details : lconf specifies an inode size of 4096 bytes when the default
stripe_count is larger than 77. This conflicts with the default
Bugzilla : 3262, 6359
Description: Attempts to reconnect to servers are now more aggressive.
Details : This builds on the enhanced upcall-less recovery that was added
- in 1.4.2. When trying to reconnect to servers, clients will
+ in 1.4.2. When trying to reconnect to servers, clients will
now try each server in the failover group every 10 seconds. By
default, clients would previously try one server every 25 seconds.
Description: After recovery, certain operations trigger a failed
assertion on a client.
Details : Failing over an mds, using lconf -d --failover, while a
- client was doing a readdir() call would cause the client to
+ client was doing a readdir() call would cause the client to
LBUG after recovery completed and the readdir() was resent.
Severity : enhancement
Bugzilla : 6296
Description: Default groups are now added by lconf
-Details : You can now run lconf --group <servicename> without having to
+Details : You can now run lconf --group <servicename> without having to
manually add groups with lmc.
Severity : major
Severity : minor
Frequency : occasional
-Description: While starting a server, the fsfilt_ext3 module could not be
+Description: While starting a server, the fsfilt_ext3 module could not be
loaded.
Details : CFS's improved ext3 filesystem is named ldiskfs for 2.6
kernels. Previously, lconf would still use the ext3 name
- lconf should create multiple TCP connections from a client (5201)
- init scripts are now turned off by default; run chkconfig --on
lustre and chkconfig --on lustrefs to use them
- - upcalls are no longer needed for clients to recover to failover
+ - upcalls are no longer needed for clients to recover to failover
servers (3262)
- add --abort-recovery option to lconf to abort recovery on device
startup (6017)
- - add support for an arbitrary number of OSTs (3026)
+ - add support for an arbitrary number of OSTs (3026)
- Quota support protocol changes.
- forward compatibility changes to wire structs (6007)
- rmmod NALs that might be loaded because of /etc/modules.conf (6133)
- fix dbench 2, extN refcount problem (170, 258, 356, 418)
- fix double-O_EXCL intent crash (424)
- avoid sending multiple lock CANCELs (352)
- * Features
+ * Features
- MDS can do multi-client recovery (modulo bugs in new code)
- * Documentation
+ * Documentation
- many updates, edits, cleanups
2002-11-18 Phil Schwan <phil@clusterfs.com>
* small changes in the DLM wire protocol
2002-07-25 Peter J. Braam <braam@clusterfs.com>
- * version 0_5_1 with some initial stability,
- * locking on MD and file I/O.
+ * version 0_5_1 with some initial stability,
+ * locking on MD and file I/O.
* documentation updates
* several bug fixes since 0.5.0
* small changes in wire protocol
* move forward to latest Lustre kernel
2002-06-25 Peter Braam <braam@clusterfs.com>
- * release version v0_4_1. Hopefully stable on single node use.
+ * release version v0_4_1. Hopefully stable on single node use.
@SERVER_TRUE@subdir-m += mds obdfilter ost mgs
@CLIENT_TRUE@subdir-m += mdc llite
+@QUOTA_TRUE@subdir-m += quota
@INCLUDE_RULES@
CLIENT_SUBDIRS := mdc llite
+QUOTA_SUBDIRS := quota
+
LIBLUSTRE_SUBDIRS := liblustre
SUBDIRS := $(ALWAYS_SUBDIRS)
SUBDIRS += $(CLIENT_SUBDIRS)
endif
+if QUOTA
+SUBDIRS += $(QUOTA_SUBDIRS)
+endif
+
# this needs to be after the client subdirs
if LIBLUSTRE
if !CLIENT
endif
DIST_SUBDIRS := $(ALWAYS_SUBDIRS) $(SERVER_SUBDIRS) $(CLIENT_SUBDIRS) \
- $(LIBLUSTRE_SUBDIRS)
+ $(LIBLUSTRE_SUBDIRS) $(QUOTA_SUBDIRS)
EXTRA_DIST = BUGS FDL kernel_patches
if test x$enable_ldiskfs = xyes ; then
BACKINGFS="ldiskfs"
+ AC_MSG_CHECKING([whether to enable quilt for making ldiskfs])
+ AC_ARG_ENABLE([quilt],
+ AC_HELP_STRING([--disable-quilt],[disable use of quilt for ldiskfs]),
+ [],[enable_quilt='yes'])
+ AC_MSG_RESULT([$enable_quilt])
+
AC_PATH_PROG(PATCH, patch, [no])
- AC_PATH_PROG(QUILT, quilt, [no])
+
+ if test x$enable_quilt = xno ; then
+ QUILT="no"
+ else
+ AC_PATH_PROG(QUILT, quilt, [no])
+ fi
if test x$enable_ldiskfs$PATCH$QUILT = xyesnono ; then
AC_MSG_ERROR([Quilt or patch are needed to build the ldiskfs module (for Linux 2.6)])
])
;;
ldiskfs)
- LC_FSHOOKS([
- LDISKFS_SERIES="2.6-suse.series"
- ],[
- LDISKFS_SERIES="2.6-rhel4.series"
- ])
+ AC_MSG_CHECKING([which ldiskfs series to use])
+ case $LINUXRELEASE in
+ 2.6.5*) LDISKFS_SERIES="2.6-suse.series" ;;
+ 2.6.9*) LDISKFS_SERIES="2.6-rhel4.series" ;;
+ 2.6.10*) LDISKFS_SERIES="2.6-rhel4.series" ;;
+ 2.6.12*) LDISKFS_SERIES="2.6.12-vanilla.series" ;;
+ *) AC_MSG_WARN([Unknown kernel version $LINUXRELEASE, fix lustre/autoconf/lustre-core.m4])
+ esac
+ AC_MSG_RESULT([$LDISKFS_SERIES])
AC_SUBST(LDISKFS_SERIES)
;;
esac # $BACKINGFS
LC_CONFIG_BACKINGFS
fi
LC_CONFIG_PINGER
+LC_CONFIG_QUOTA
LC_STRUCT_KIOBUF
LC_FUNC_COND_RESCHED
])
#
+# LC_CONFIG_QUOTA
+#
+# whether to enable quota support
+#
+AC_DEFUN([LC_CONFIG_QUOTA],
+[AC_MSG_CHECKING([whether to enable quota support])
+AC_ARG_ENABLE([quota],
+ AC_HELP_STRING([--enable-quota],
+ [enable quota support]),
+ [],[enable_quota='yes'])
+AC_MSG_RESULT([$enable_quota])
+if test x$linux25 != xyes; then
+ enable_quota='no'
+fi
+if test x$enable_quota != xno; then
+ AC_DEFINE(HAVE_QUOTA_SUPPORT, 1, [Enable quota support])
+fi
+])
+
+#
# LC_CONFIGURE
#
# other configure checks
# See note there re: __ASM_X86_64_PROCESSOR_H
AC_CHECK_HEADERS([linux/quota.h])
-AC_CHECK_TYPES([struct if_dqinfo],[],[],[
-#define __ASM_X86_64_PROCESSOR_H
-#include <linux/quota.h>
-])
-
-AC_CHECK_TYPES([struct if_dqblk],[],[],[
-#define __ASM_X86_64_PROCESSOR_H
-#include <linux/quota.h>
-])
-
# liblustre/llite_lib.h
AC_CHECK_HEADERS([xtio.h file.h])
AM_CONDITIONAL(MPITESTS, test x$enable_mpitests = xyes, Build MPI Tests)
AM_CONDITIONAL(CLIENT, test x$enable_client = xyes)
AM_CONDITIONAL(SERVER, test x$enable_server = xyes)
+AM_CONDITIONAL(QUOTA, test x$enable_quota = xyes)
])
#
lustre/mgs/autoMakefile
lustre/ptlrpc/Makefile
lustre/ptlrpc/autoMakefile
+lustre/quota/Makefile
+lustre/quota/autoMakefile
lustre/scripts/Makefile
lustre/scripts/version_tag.pl
lustre/tests/Makefile
m4_define([LUSTRE_MAJOR],[1])
m4_define([LUSTRE_MINOR],[4])
m4_define([LUSTRE_PATCH],[5])
-m4_define([LUSTRE_FIX],[93])
+m4_define([LUSTRE_FIX],[95])
dnl # 288 stands for 0.0.1.32 , next version with fixes is ok, but next after
dnl # next release candidate/beta would spill this warning already.
SUFFIXES = .lin .lyx .pdf .ps .sgml .html .txt .tex .fig .eps .dvi
if UTILS
-man_MANS = lfs.1 lmc.1 lwizard.1 lconf.8 lctl.8
+man_MANS = lfs.1 lmc.1 lconf.8 lctl.8
endif
LYXFILES= $(filter-out $(patsubst %.lin,%.lyx,$(wildcard *.lin)),\
CLEANFILES = *.aux *.tex *.log *.pdf
EXTRA_DIST = tex2pdf $(man_MANS) \
- $(LYXFILES) lfs.1 lmc.1 lwizard.1 lconf.8 lctl.8
+ $(LYXFILES) lfs.1 lmc.1 lconf.8 lctl.8
all:
.B setup
-lctl > setup /dev/loop0 extN
+lctl > setup /dev/loop0 ldiskfs
.br
lctl > quit
\size small
-lctl > setup /dev/loop0 extN
+lctl > setup /dev/loop0 ldiskfs
\newline
lctl > quit
\size default
.br
.B lfs setstripe <filename> <stripe-size> <start-ost> <stripe-cnt>
.br
+.B lfs quotachown [-i] <filesystem>
+.br
+.B lfs quotacheck [-ug] <filesystem>
+.br
+.B lfs quotaon [-ugf] <filesystem>
+.br
+.B lfs quotaoff [-ug] <filesystem>
+.br
+.B lfs setquota [-u|-g] <name> <block-softlimit> <block-hardlimit> <inode-softlimit> <inode-hardlimit> <filesystem>
+.br
+.B lfs quota [-o obd_uuid] [-u|-g] <name> <filesystem>
+.br
.B lfs check <mds| osts| servers>
.SH DESCRIPTION
.B lfs
.B getstripe
To list the striping pattern for given filename
.TP
+.B quotachown
+To change files' owner and group on OSTs of the specified filesystem
+.TP
+.B quotacheck
+To scan the specified filesystem for disk usage, and create or update quota files
+.TP
+.B quotaon
+To turn filesystem quotas on
+.TP
+.B quotaoff
+To turn filesystem quotas off
+.TP
+.B setquota
+To set filesystem quotas
+.TP
+.B quota
+To display disk usage and limits
+.TP
.B check
Display the status of MDS or OSTs (as specified in the command) or all the servers (MDS and OSTs)
.TP
.B $lfs find -r --obd OST2-UUID /mnt/lustre/
List all the files that have objects on a specific OST
.TP
+.B $lfs quotachown -i /mnt/lustre
+Change file owner and group
+.TP
+.B $lfs quotacheck -ug /mnt/lustre
+Quotacheck for user and group
+.TP
+.B $lfs quotaon -ug /mnt/lustre
+Turn quotas of user and group on
+.TP
+.B $lfs quotaoff -ug /mnt/lustre
+Turn quotas of user and group off
+.TP
+.B $lfs setquota -u bob 0 1000000 0 10000 /mnt/lustre
+Set quotas of user `bob': 1GB block quota and 10,000 file quota
+.TP
+.B $lfs quota -u bob /mnt/lustre
+List quotas of user `bob'
+.TP
.B $lfs check servers
Check the status of all servers(mds, osts)
.TP
NAME
\layout Description
-lfs Lustre utility to create a file with specific striping pattern
+lfs Lustre utility to create a file with specific striping pattern and manipulat
+e disk quotas
\layout Subsection
SYNOPSIS
\series bold
lfs\SpecialChar ~
check <mds| osts| servers>
+\layout Standard
+
+
+\series bold
+lfs\SpecialChar ~
+quotachown [-i] <filesystem>
+\layout Standard
+
+
+\series bold
+lfs\SpecialChar ~
+quotacheck [-ug] <filesystem>
+\layout Standard
+
+
+\series bold
+lfs\SpecialChar ~
+quotaon [-ugf] <filesystem>
+\layout Standard
+
+
+\series bold
+lfs\SpecialChar ~
+quotaoff [-ug] <filesystem>
+\layout Standard
+
+
+\series bold
+lfs\SpecialChar ~
+setquota [-u|-g] <name> <block-softlimit> <block-hardlimit> <inode-softlimit>
+ <inode-hardlimit> <filesystem>
+\layout Standard
+
+
+\series bold
+lfs\SpecialChar ~
+quota [-o obd_uuid] [-u|-g] <name> <filesystem>
\layout Subsection
DESCRIPTION
This utility can be used to create a new file with a specific striping pattern,
determine the default striping pattern, gather the extended attributes
- (object numbers and location) for a specific file.
+ (object numbers and location) for a specific file, and manipulate disk
+ quotas.
It can be invoked interactively without any arguments or in a non-interactive
mode with one of the arguments listed and explained below:
\layout List
\series bold
+quotachown
+\series default
+ Change files' owner and group on OSTs of the specified filesystem
+\layout List
+\labelwidthstring 00.00.0000
+
+
+\series bold
+quotacheck
+\series default
+ Scan the specified filesystem for disk usage, and create or update quota
+ files
+\layout List
+\labelwidthstring 00.00.0000
+
+
+\series bold
+quotaon
+\series default
+ Turn filesystem quotas on
+\layout List
+\labelwidthstring 00.00.0000
+
+
+\series bold
+quotaoff
+\series default
+ Turn filesystem quotas off
+\layout List
+\labelwidthstring 00.00.0000
+
+
+\series bold
+setquota
+\series default
+ Set filesystem quotas
+\layout List
+\labelwidthstring 00.00.0000
+
+
+\series bold
+quota
+\series default
+ Display disk usage and limits
+\layout List
+\labelwidthstring 00.00.0000
+
+
+\series bold
help
\series default
Provides brief help on the various arguments
\layout LyX-Code
$
+\layout Description
+
+Change\SpecialChar ~
+file\SpecialChar ~
+owner\SpecialChar ~
+and\SpecialChar ~
+group
+\layout LyX-Code
+
+ $lfs quotachown -i /mnt/lustre
+\layout Description
+
+Quotacheck\SpecialChar ~
+for\SpecialChar ~
+user\SpecialChar ~
+and\SpecialChar ~
+group
+\layout LyX-Code
+
+ $lfs quotacheck -ug /mnt/lustre
+\layout Description
+
+Turn\SpecialChar ~
+quotas\SpecialChar ~
+of\SpecialChar ~
+user\SpecialChar ~
+and\SpecialChar ~
+group\SpecialChar ~
+on
+\layout LyX-Code
+
+ $lfs quotaon -ug /mnt/lustre
+\layout Description
+
+Turn\SpecialChar ~
+quotas\SpecialChar ~
+of\SpecialChar ~
+user\SpecialChar ~
+and\SpecialChar ~
+group\SpecialChar ~
+off
+\layout LyX-Code
+
+ $lfs quotaoff -ug /mnt/lustre
+\layout Description
+
+Set\SpecialChar ~
+quotas\SpecialChar ~
+of\SpecialChar ~
+user\SpecialChar ~
+`bob':\SpecialChar ~
+1GB\SpecialChar ~
+block\SpecialChar ~
+quota\SpecialChar ~
+and\SpecialChar ~
+10,000\SpecialChar ~
+file\SpecialChar ~
+quota
+\layout LyX-Code
+
+ $lfs setquota -u bob 0 1000000 0 10000 /mnt/lustre
+\layout Description
+
+List\SpecialChar ~
+quotas\SpecialChar ~
+of\SpecialChar ~
+user\SpecialChar ~
+`bob'
+\layout LyX-Code
+
+ $lfs quota -u bob /mnt/lustre
\layout Subsection
BUGS
Specify the UUID of the OST device.
.TP
--fstype
-extN|ext3 Optional arguement used to specify the file system type. Default is ext3.
+ldiskfs|ext3 Optional argument used to specify the file system type. Default is ext3.
.TP
--inode_size <size>
Specify new inode size for underlying ext3 file system.
\layout Description
--fstype\SpecialChar ~
-extN|ext3 Optional arguement used to specify the file system type.
+ldiskfs|ext3 Optional argument used to specify the file system type.
Default is ext3.
\layout Description
+++ /dev/null
-.TH lwizard 1 "2003 Oct 29" Lustre "Configuration utilities"
-.SH NAME
-lwizard \- Lustre configuration wizard
-.SH SYNOPSIS
-.br
-.B lwizard
-.br
-.B lwizard [--help]
-.br
-.BR lwizard [-o|--file=CONFIG_FILE][--stripe_size=SIZE][--stripe_cnt=COUNT]
-.SH DESCRIPTION
-The configuration files for Lustre installation are generally created through a series of lmc commands, this generates an XML file which describes the complete cluster. The lwizard eliminates the need to learn lmc to generate configuration files, instead it achieves the same through asking some simple questions. The
-XML configuration file generated using lwizard will still have to be made accessible to all the cluster nodes either by storing it on an LDAP server, NFS or by copying it over to all the involved nodes and then running lconf on all nodes to start up the various Lustre services, device setups or mounting the filesystem.
-So, once invoked, lwizard asks a series of questions about the various pieces of the cluster :
-.TP
-.B MDS hostname
-If `hostname' has more than one network interfaces (not including lo) and you dicide to use as many interfaces as possible, you need to specify the interfaces' IP addresses separated by blank space. See below example for how to enter interfaces.
-.TP
-.B MDS device information
-.TP
-.B MDS failover information
-Failover is optional. if failover is enabled, failover hostname and device name are needed. The failover device MUST be the shared device of MDS device.
-.TP
-.B OST hostname
-This will be asked for every new OST added. You can also specify multiple network interfaces as mentioned above for MDS hostname.
-.TP
-.B OST device information
-This will be asked for every new OST added
-.TP
-.B OST failover information
-Failover is optional. if failover is enabled, failover hostname and device name are needed. The failover device MUST be the shared device of OST device.
-.TP
-.B Lustre mount-point
-This is the Lustre mount-point on the client (default - /mnt/lustre).
-.TP
-.B Lustre client
-By default, Lustre can be mounted on any node. However, by default, Lustre will use only one network interface to communicate with servers. If you want to mount Lustre filesystem on a multi-host node and use many netowork interfaces to communicate, you need to configure it specifically. This will tell Lustre client which interfaces it can use to communicate with servers. See example below for details.
-
-The wizard saves the XML file to the filename specified using the -o or --file option or the default file config.xml. It will also save the lmc commands used to create the XML file in a script config.sh or <specified-file-name>.sh.
-
-The lwizard tool currently assumes the following defaults:
-
-.TP
-.B Network type
-tcp
-.TP
-.B Filesystem type
-ext3
-.TP
-.B LMC path
-.I /usr/sbin/lmc
-
-.SH EXAMPLES
-The example below shows a sample session using lwizard.
-.PP
-[username@meghna utils]$ ./lwizard --stripe_size=64 --stripe_cnt=2
-.br
-This script will help you create a Lustre configuration file.
-.br
-Creating mds "mds1"...
-.br
-Please enter the HOSTNAME for mds1: meghna
-.br
-If meghna has more than one network INTERFACE, enter here, separating them
-by blank space. See lwizard man page for help.
-.br
-(hit enter if only one): 192.168.1.29/24 10.0.0.29/24
-.br
-Please enter the device name or loop file name for meghna: /dev/sda1
-.br
-Please enter the device SIZE or 0 to use entire device (in KB):
-.br
-Do you want to configure FAILOVER mds1? y
-.br
-Please enter the HOSTNAME for failover mds1: lester2
-.br
-Please enter the device for failover mds1 on lester2: /dev/sdb2
-.br
-Creating ost "ost1"...
-.br
-Please enter the HOSTNAME for ost1: meghna
-.br
-If meghna has more than one network INTERFACE, enter here, separating them
-by blank space. See lwizard man page for help.
-.br
-(hit enter if only one):
-.br
-Please enter the device name or loop file name for meghna: /tmp/ost1
-.br
-Please enter the device SIZE or 0 to use entire device (in KB): 10000
-.br
-Do you want to configure FAILOVER ost1?
-.br
-Creating ost "ost2"...
-.br
-Please enter the HOSTNAME for ost2, or just hit enter to finish:
-.br
-Please enter the clients' mountpoint (/mnt/lustre):
-.br
-Do you want to configure another client with multiple network interfaces? y
-.br
-Please enter the HOSTNAME: client2
-.br
-Please enter network interface address (separated by space): 192.168.1.30/24 10.0.0.30/24
-.br
-Do you want to configure another client with multiple network interfaces?
-.br
- mds1 lov1 ost1 client client
-.br
-The Lustre configuration has been written to lwizard.xml.
-.br
-.SH BUGS
-None are known.
#define GFP_HIGHUSER 1
#define GFP_ATOMIC 1
#define GFP_NOFS 1
-#define IS_ERR(a) (((a) && abs((long)(a)) < 500) ? 1 : 0)
+#define IS_ERR(a) ((unsigned long)(a) < 1000)
#define PTR_ERR(a) ((long)(a))
#define ERR_PTR(a) ((void*)((long)(a)))
#define SPIN_LOCK_UNLOCKED (spinlock_t) { }
#define LASSERT_SPIN_LOCKED(lock) do {} while(0)
+#define LASSERT_SEM_LOCKED(sem) do {} while(0)
static inline void spin_lock(spinlock_t *l) {return;}
static inline void spin_unlock(spinlock_t *l) {return;}
/* memory */
-/* FIXME */
-#define num_physpages (16 * 1024)
+/* memory size: used for some client tunables */
+#define num_physpages (256 * 1024) /* 1GB */
static inline int copy_from_user(void *a,void *b, int c)
{
result; \
})
+/* ACL */
+struct posix_acl_entry {
+ short e_tag;
+ unsigned short e_perm;
+ unsigned int e_id;
+};
+
+struct posix_acl {
+ atomic_t a_refcount;
+ unsigned int a_count;
+ struct posix_acl_entry a_entries[0];
+};
+
+typedef struct {
+ __u16 e_tag;
+ __u16 e_perm;
+ __u32 e_id;
+} xattr_acl_entry;
+
+typedef struct {
+ __u32 a_version;
+ xattr_acl_entry a_entries[0];
+} xattr_acl_header;
+
+static inline size_t xattr_acl_size(int count)
+{
+ return sizeof(xattr_acl_header) + count * sizeof(xattr_acl_entry);
+}
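/* Illustrative note (not part of the original patch): with the packed
 * layouts above, a 3-entry ACL xattr occupies
 *     sizeof(xattr_acl_header) + 3 * sizeof(xattr_acl_entry)
 *   = 4 + 3 * 8 = 28 bytes,
 * which is exactly what xattr_acl_size(3) returns. */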
+
+static inline
+struct posix_acl * posix_acl_from_xattr(const void *value, size_t size)
+{
+ return NULL;
+}
+
+static inline
+int posix_acl_valid(const struct posix_acl *acl)
+{
+ return 0;
+}
+
+static inline
+void posix_acl_release(struct posix_acl *acl)
+{
+}
+
#ifndef ENOTSUPP
#define ENOTSUPP ENOTSUP
#endif
endif
EXTRA_DIST = lprocfs_status.h lustre_debug.h lustre_ha.h lustre_lib.h \
- obd_cache.h obd_lov.h lustre_dlm.h lustre_handles.h \
+ obd_cache.h obd_lov.h lustre_dlm.h lustre_handles.h lustre_disk.h \
lustre_net.h obd_class.h obd_ost.h obd_support.h lustre_commit_confd.h \
lustre_export.h lustre_log.h obd_echo.h \
lustre_compat25.h lustre_fsfilt.h lustre_import.h lustre_mds.h obd.h \
int count, int *eof, void *data);
extern int lprocfs_rd_conn_uuid(char *page, char **start, off_t off,
int count, int *eof, void *data);
+extern int lprocfs_rd_connect_flags(char *page, char **start, off_t off,
+ int count, int *eof, void *data);
extern int lprocfs_rd_num_exports(char *page, char **start, off_t off,
int count, int *eof, void *data);
extern int lprocfs_rd_numrefs(char *page, char **start, off_t off,
static inline int lprocfs_rd_conn_uuid(char *page, char **start, off_t off,
int count, int *eof, void *data)
{ return 0; }
+static inline int lprocfs_rd_connect_flags(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{ return 0; }
+
static inline int lprocfs_rd_num_exports(char *page, char **start, off_t off,
int count, int *eof, void *data)
{ return 0; }
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)
+#define lock_dentry(___dentry) spin_lock(&(___dentry)->d_lock)
+#define unlock_dentry(___dentry) spin_unlock(&(___dentry)->d_lock)
+
#define lock_24kernel() do {} while (0)
#define unlock_24kernel() do {} while (0)
+#define ll_kernel_locked() kernel_locked()
/*
* OBD need working random driver, thus all our
#else /* 2.4.. */
+#define lock_dentry(___dentry)
+#define unlock_dentry(___dentry)
+
#define lock_24kernel() lock_kernel()
#define unlock_24kernel() unlock_kernel()
+#define ll_kernel_locked() (current->lock_depth >= 0)
#ifdef HAVE_MM_INLINE
#include <linux/mm_inline.h>
#define INIT_HLIST_HEAD INIT_LIST_HEAD
#define hlist_del_init list_del_init
#define hlist_add_head list_add
+#endif
+#ifndef INIT_HLIST_NODE
+#define INIT_HLIST_NODE(p) ((p)->next = NULL, (p)->prev = NULL)
+#endif
+#ifndef hlist_for_each
+#define hlist_for_each list_for_each
+#endif
+#ifndef hlist_for_each_safe
#define hlist_for_each_safe list_for_each_safe
#endif
#define KDEVT_INIT(val) (val)
*
* Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
* Author: Nathan Rutman <nathan@clusterfs.com>
- * Author: Lin Song Tao <lincent@clusterfs.com>
*
* This file is part of Lustre, http://www.lustre.org.
*
#define _LUSTRE_DISK_H
#include <linux/types.h>
-
#include <lnet/types.h>
/****************** mount command *********************/
+/* The lmd is only used internally by Lustre; mount simply passes
+ everything as string options */
+
+#define LMD_MAGIC 0xbdacbd03
+
/* gleaned from the mount command - no persistent info here */
struct lustre_mount_data {
__u32 lmd_magic;
_device_ mount options) */
};
-#define LMD_FLG_RECOVER 0x0001 /* Allow recovery */
-#define LMD_FLG_NOSVC 0x0002 /* Only start MGS/MGC for servers, no other services */
-#define LMD_FLG_MNTCNF 0x1000 /* MountConf compat */
-#define LMD_FLG_CLIENT 0x2000 /* Mounting a client only; no real device */
+#define LMD_FLG_MNTCNF 0x0001 /* Mountconf compat */
+#define LMD_FLG_CLIENT 0x0002 /* Mounting a client only */
+#define LMD_FLG_RECOVER 0x0004 /* Allow recovery */
+#define LMD_FLG_NOSVC 0x0008 /* Only start MGS/MGC for servers,
+ no other services */
/* 2nd half is for old clients */
#define lmd_is_client(x) \
/****************** last_rcvd file *********************/
#define LAST_RCVD "last_rcvd"
+#define LOV_OBJID "lov_objid"
#define LR_SERVER_SIZE 512
#define LR_CLIENT_START 8192
#define LR_CLIENT_SIZE 128
+#if LR_CLIENT_START < LR_SERVER_SIZE
+#error "Can't have LR_CLIENT_START < LR_SERVER_SIZE"
+#endif
+/* This limit is arbitrary (32k clients on x86), but it is convenient to use
+ * 2^n * PAGE_SIZE * 8 for the number of bits that fit an order-n allocation. */
+#define LR_MAX_CLIENTS (PAGE_SIZE * 8)
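/* Illustrative sketch (hypothetical helper, not part of the original patch):
 * with the layout above, the last_rcvd file offset of client slot 'idx' is
 * simply LR_CLIENT_START + idx * LR_CLIENT_SIZE: */
static inline loff_t lr_client_offset(int idx)
{
        /* server data occupies [0, LR_SERVER_SIZE); clients start at 8KB */
        return (loff_t)LR_CLIENT_START + (loff_t)idx * LR_CLIENT_SIZE;
}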
+
+#define OBD_COMPAT_OST 0x00000002 /* this is an OST (temporary) */
+#define OBD_COMPAT_MDT 0x00000004 /* this is an MDT (temporary) */
+#define OBD_COMPAT_COMMON_LR 0x00000008 /* common last_rcvd format */
+
+#define OBD_ROCOMPAT_LOVOBJID 0x00000001 /* MDS handles LOV_OBJID file */
+#define OBD_ROCOMPAT_CROW 0x00000002 /* OST will CROW create objects */
+
+#define OBD_INCOMPAT_GROUPS 0x00000001 /* OST handles group subdirs */
+#define OBD_INCOMPAT_OST 0x00000002 /* this is an OST (permanent) */
+#define OBD_INCOMPAT_MDT 0x00000004 /* this is an MDT (permanent) */
+
/* Data stored per server at the head of the last_rcvd file. In le32 order.
This should be common to filter_internal.h, lustre_mds.h */
struct lr_server_data {
__u8 lsd_uuid[40]; /* server UUID */
- __u64 lsd_unused; /* was lsd_last_objid - don't use for now */
+ __u64 lsd_unused; /* was fsd_last_objid - don't use for now */
__u64 lsd_last_transno; /* last completed transaction ID */
__u64 lsd_mount_count; /* incarnation number */
__u32 lsd_feature_compat; /* compatible feature flags */
__u64 lsd_catalog_oid; /* recovery catalog object id */
__u32 lsd_catalog_ogen; /* recovery catalog inode generation */
__u8 lsd_peeruuid[40]; /* UUID of MDS associated with this OST */
- __u32 lsd_index; /* target index (stripe index for ost)*/
- __u8 lsd_padding[LR_SERVER_SIZE - 144];
+ __u32 lsd_ost_index; /* index number of OST in LOV */
+ __u32 lsd_mds_index; /* index number of MDS in LMV */
+ __u8 lsd_padding[LR_SERVER_SIZE - 148];
+};
+
+/* Data stored per client in the last_rcvd file. In le32 order. */
+struct lsd_client_data {
+ __u8 lcd_uuid[40]; /* client UUID */
+ __u64 lcd_last_transno; /* last completed transaction ID */
+ __u64 lcd_last_xid; /* xid for the last transaction */
+ __u32 lcd_last_result; /* result from last RPC */
+ __u32 lcd_last_data; /* per-op data (disposition for open &c.) */
+ /* for MDS_CLOSE requests */
+ __u64 lcd_last_close_transno; /* last completed transaction ID */
+ __u64 lcd_last_close_xid; /* xid for the last transaction */
+ __u32 lcd_last_close_result; /* result from last RPC */
+ __u32 lcd_last_close_data; /* per-op data */
+ __u8 lcd_padding[LR_CLIENT_SIZE - 88];
};
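/* Illustrative note (not part of the original patch): the 88 subtracted in
 * lcd_padding is the sum of the fixed fields above
 * (40 + 8 + 8 + 4 + 4 + 8 + 8 + 4 + 4 = 88), so the struct stays exactly
 * LR_CLIENT_SIZE bytes. A hypothetical compile-time check: */
typedef char lcd_size_check_t[sizeof(struct lsd_client_data) == LR_CLIENT_SIZE ? 1 : -1];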
-#define LR_COMPAT_COMMON_LR 0x10000000 /* Common last_rvcd format (e.g. above) */
+/*
#define MDS_ROCOMPAT_LOVOBJID 0x00000001
#define MDS_ROCOMPAT_SUPP (MDS_ROCOMPAT_LOVOBJID)
#define MDS_INCOMPAT_SUPP (0)
+*/
#ifdef __KERNEL__
/****************** superblock additional info *********************/
/****************** prototypes *********************/
#ifdef __KERNEL__
+#include <linux/obd_class.h>
/* obd_mount.c */
void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb));
#define OBD_LDLM_DEVICENAME "ldlm"
-#define LDLM_DEFAULT_LRU_SIZE 100
+#define LDLM_DEFAULT_LRU_SIZE (100 * smp_num_cpus)
typedef enum {
ELDLM_OK = 0,
#define LCK_COMPAT_NL (LCK_COMPAT_CR | LCK_EX)
#define LCK_COMPAT_GROUP (LCK_GROUP | LCK_NL)
-static ldlm_mode_t lck_compat_array[] = {
- [LCK_EX] LCK_COMPAT_EX,
- [LCK_PW] LCK_COMPAT_PW,
- [LCK_PR] LCK_COMPAT_PR,
- [LCK_CW] LCK_COMPAT_CW,
- [LCK_CR] LCK_COMPAT_CR,
- [LCK_NL] LCK_COMPAT_NL,
- [LCK_GROUP] LCK_COMPAT_GROUP
-};
+extern ldlm_mode_t lck_compat_array[];
static inline void lockmode_verify(ldlm_mode_t mode)
{
lock->l_pid); \
break; \
} \
+ if (lock->l_resource->lr_type == LDLM_IBITS) { \
+ CDEBUG(level, "### " format \
+ " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s " \
+ "res: "LPU64"/"LPU64" bits "LPX64" rrc: %d type: %s " \
+ "flags: %x remote: "LPX64" expref: %d " \
+ "pid %u\n" , ## a, \
+ lock->l_resource->lr_namespace->ns_name, \
+ lock, lock->l_handle.h_cookie, \
+ atomic_read (&lock->l_refc), \
+ lock->l_readers, lock->l_writers, \
+ ldlm_lockname[lock->l_granted_mode], \
+ ldlm_lockname[lock->l_req_mode], \
+ lock->l_resource->lr_name.name[0], \
+ lock->l_resource->lr_name.name[1], \
+ lock->l_policy_data.l_inodebits.bits, \
+ atomic_read(&lock->l_resource->lr_refcount), \
+ ldlm_typename[lock->l_resource->lr_type], \
+ lock->l_flags, lock->l_remote_handle.cookie, \
+ lock->l_export ? \
+ atomic_read(&lock->l_export->exp_refcount) : -99, \
+ lock->l_pid); \
+ break; \
+ } \
{ \
CDEBUG(level, "### " format \
" ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s " \
void ldlm_lock_decref_and_cancel(struct lustre_handle *lockh, __u32 mode);
void ldlm_lock_allow_match(struct ldlm_lock *lock);
int ldlm_lock_match(struct ldlm_namespace *ns, int flags, struct ldlm_res_id *,
- __u32 type, ldlm_policy_data_t *, ldlm_mode_t mode,
+ ldlm_type_t type, ldlm_policy_data_t *, ldlm_mode_t mode,
struct lustre_handle *);
struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode,
int *flags);
/* resource.c - internal */
struct ldlm_resource *ldlm_resource_get(struct ldlm_namespace *ns,
struct ldlm_resource *parent,
- struct ldlm_res_id, __u32 type,
+ struct ldlm_res_id, ldlm_type_t type,
int create);
struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res);
int ldlm_resource_putref(struct ldlm_resource *res);
struct ptlrpc_request *req,
struct ldlm_namespace *ns,
struct ldlm_res_id,
- __u32 type,
+ ldlm_type_t type,
ldlm_policy_data_t *,
ldlm_mode_t mode,
int *flags,
struct list_head med_open_head;
spinlock_t med_open_lock; /* lock med_open_head, mfd_list*/
struct mds_client_data *med_mcd;
+ __u64 med_ibits_known;
loff_t med_lr_off;
int med_lr_idx;
};
#define XATTR_LUSTRE_MDS_LOV_EA "lov"
+struct lustre_dquot;
struct fsfilt_operations {
struct list_head fs_list;
struct module *fs_owner;
char *fs_type;
+ char *(* fs_label)(struct super_block *sb);
+ char *(* fs_uuid)(struct super_block *sb);
void *(* fs_start)(struct inode *inode, int op, void *desc_private,
int logs);
void *(* fs_brw_start)(int objcount, struct fsfilt_objinfo *fso,
struct obd_quotactl *oqctl);
int (* fs_quotainfo)(struct lustre_quota_info *lqi, int type,
int cmd);
+ int (* fs_qids)(struct file *file, struct inode *inode, int type,
+ struct list_head *list);
int (* fs_dquot)(struct lustre_dquot *dquot, int cmd);
};
extern struct fsfilt_operations *fsfilt_get_ops(const char *type);
extern void fsfilt_put_ops(struct fsfilt_operations *fs_ops);
+static inline char *fsfilt_label(struct obd_device *obd, struct super_block *sb)
+{
+ if (obd->obd_fsops->fs_label == NULL)
+ return NULL;
+ if (obd->obd_fsops->fs_label(sb)[0] == '\0')
+ return NULL;
+
+ return obd->obd_fsops->fs_label(sb);
+}
+
+static inline __u8 *fsfilt_uuid(struct obd_device *obd, struct super_block *sb)
+{
+ if (obd->obd_fsops->fs_uuid == NULL)
+ return NULL;
+
+ return obd->obd_fsops->fs_uuid(sb);
+}
+
#define FSFILT_OP_UNLINK 1
#define FSFILT_OP_RMDIR 2
#define FSFILT_OP_RENAME 3
#define FSFILT_OP_SETATTR 8
#define FSFILT_OP_LINK 9
#define FSFILT_OP_CANCEL_UNLINK 10
+#define FSFILT_OP_JOIN 11
+#define FSFILT_OP_NOOP 15
#define fsfilt_check_slow(start, timeout, msg) \
do { \
return -ENOTSUPP;
}
+static inline int fsfilt_qids(struct obd_device *obd, struct file *file,
+ struct inode *inode, int type,
+ struct list_head *list)
+{
+ if (obd->obd_fsops->fs_qids)
+ return obd->obd_fsops->fs_qids(file, inode, type, list);
+ return -ENOTSUPP;
+}
+
static inline int fsfilt_dquot(struct obd_device *obd,
struct lustre_dquot *dquot, int cmd)
{
#endif
#endif
+#include <lnet/types.h> /* for lnet_nid_t */
+
/* Defn's shared with user-space. */
#include <lustre/lustre_user.h>
#define LUSTRE_DLM_VERSION 0x00040000
#define LUSTRE_LOG_VERSION 0x00050000
#define LUSTRE_PBD_VERSION 0x00060000
+#define LUSTRE_MGS_VERSION 0x00070000
+
struct lustre_handle {
__u64 cookie;
return lh->cookie != 0ull;
}
+static inline int lustre_handle_equal(struct lustre_handle *lh1,
+ struct lustre_handle *lh2)
+{
+ return lh1->cookie == lh2->cookie;
+}
+
+static inline void lustre_handle_copy(struct lustre_handle *tgt,
+ struct lustre_handle *src)
+{
+ tgt->cookie = src->cookie;
+}
+
/* we depend on this structure to be 8-byte aligned */
/* this type is only endian-adjusted in lustre_unpack_msg() */
struct lustre_msg {
/* Connect flags */
-#define OBD_CONNECT_RDONLY 0x1ULL
+#define OBD_CONNECT_RDONLY 0x1ULL /* client allowed read-only access */
#define OBD_CONNECT_INDEX 0x2ULL /* connect to specific LOV idx */
#define OBD_CONNECT_GRANT 0x8ULL /* OSC acquires grant at connect */
#define OBD_CONNECT_SRVLOCK 0x10ULL /* server takes locks for client */
#define OBD_CONNECT_REQPORTAL 0x40ULL /* Separate portal for non-IO reqs */
#define OBD_CONNECT_ACL 0x80ULL /* client using access control lists */
#define OBD_CONNECT_XATTR 0x100ULL /* client using extended attributes*/
-#define OBD_CONNECT_CROW 0x200ULL /* MDS is expecting create-on-write */
+#define OBD_CONNECT_CROW 0x200ULL /* MDS+OST do object create-on-write */
+#define OBD_CONNECT_TRUNCLOCK 0x400ULL /* server gets locks for punch b=9528 */
+#define OBD_CONNECT_TRANSNO 0x800ULL /* replay is sending initial transno */
+#define OBD_CONNECT_IBITS 0x1000ULL /* support for inodebits locks */
+#define OBD_CONNECT_JOIN 0x2000ULL /* files can be concatenated */
#define OBD_CONNECT_EMPTY 0x80000000ULL /* fake: these are empty connect flags*/
-/*
- * set by servers supporting taking extent locks during obd_punch(). Currently
- * is requested by liblustre clients only. See bug 9528.
- */
-#define OBD_CONNECT_TRUNCLOCK 0x400ULL
+/* also update obd_connect_names[] for lprocfs_rd_connect_flags() */
-#define MDS_CONNECT_SUPPORTED (OBD_CONNECT_RDONLY|OBD_CONNECT_VERSION)
+#define MDS_CONNECT_SUPPORTED (OBD_CONNECT_RDONLY | OBD_CONNECT_VERSION | \
+ OBD_CONNECT_ACL | OBD_CONNECT_XATTR | \
+ OBD_CONNECT_IBITS | OBD_CONNECT_JOIN)
#define OST_CONNECT_SUPPORTED (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \
OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \
OBD_CONNECT_TRUNCLOCK)
#define ECHO_CONNECT_SUPPORTED (0)
-#define MGMT_CONNECT_SUPPORTED (0)
+#define MGMT_CONNECT_SUPPORTED (OBD_CONNECT_VERSION)
#define OBD_OCD_VERSION(major,minor,patch,fix) (((major)<<24) + ((minor)<<16) +\
((patch)<<8) + (fix))
* If we eventually have separate connect data for different types, which we
* almost certainly will, then perhaps we stick a union in here. */
struct obd_connect_data {
- __u64 ocd_connect_flags;
- __u32 ocd_version;
- __u32 ocd_grant;
- __u32 ocd_index;
+ __u64 ocd_connect_flags; /* OBD_CONNECT_* per above */
+ __u32 ocd_version; /* lustre release version number */
+ __u32 ocd_grant; /* initial cache grant amount (bytes) */
+ __u32 ocd_index; /* LOV index to connect to */
__u32 ocd_unused;
- __u64 padding1; /* also fix lustre_swab_connect */
- __u64 padding2; /* also fix lustre_swab_connect */
- __u64 padding3; /* also fix lustre_swab_connect */
- __u64 padding4; /* also fix lustre_swab_connect */
- __u64 padding5; /* also fix lustre_swab_connect */
- __u64 padding6; /* also fix lustre_swab_connect */
+ __u64 ocd_ibits_known; /* inode bits this client understands */
+ __u64 padding2; /* also fix lustre_swab_connect */
+ __u64 padding3; /* also fix lustre_swab_connect */
+ __u64 padding4; /* also fix lustre_swab_connect */
+ __u64 padding5; /* also fix lustre_swab_connect */
+ __u64 padding6; /* also fix lustre_swab_connect */
};
extern void lustre_swab_connect(struct obd_connect_data *ocd);
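/* Illustrative sketch (hypothetical helper, not part of the original patch):
 * after the connect handshake, either side can test a negotiated feature by
 * masking ocd_connect_flags, e.g. for inodebits lock support: */
static inline int ocd_supports_ibits(const struct obd_connect_data *ocd)
{
        return (ocd->ocd_connect_flags & OBD_CONNECT_IBITS) != 0;
}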
#define OBD_FL_DEBUG_CHECK (0x00000040) /* echo client/server debug check */
#define OBD_FL_NO_USRQUOTA (0x00000100) /* the object's owner is over quota */
#define OBD_FL_NO_GRPQUOTA (0x00000200) /* the object's group is over quota */
+#define OBD_FL_CREATE_CROW (0x00000400) /* object should be create on write */
+
/*
* set this to delegate DLM locking during obd_punch() to the OSTs. Only OSTs
* that declared OBD_CONNECT_TRUNCLOCK in their connect flags support this
*/
#define OBD_FL_TRUNCLOCK (0x00000800)
+/* This should be no smaller than sizeof(struct lustre_handle) + sizeof(struct
+ * llog_cookie) + sizeof(ll_fid). However, struct ll_fid is no longer stored
+ * in o_inline; we keep this just in case. */
#define OBD_INLINESZ 80
/* Note: 64-bit types are 64-bit aligned in structure */
obd_time o_ctime;
obd_blocks o_blocks; /* brw: cli sent cached bytes */
obd_size o_grant;
+
/* 32-bit fields start here: keep an even number of them via padding */
obd_blksize o_blksize; /* optimal IO blocksize */
obd_mode o_mode; /* brw: cli sent cache remain */
obd_count o_misc; /* brw: o_dropped */
__u32 o_easize; /* epoch in ost writes */
__u32 o_mds;
- __u32 o_padding_1; /* also fix lustre_swab_obdo */
- __u32 o_padding_2; /* also fix lustre_swab_obdo */
+ __u32 o_stripe_idx; /* holds stripe idx */
+ __u32 o_padding_1;
char o_inline[OBD_INLINESZ]; /* fid in ost writes */
};
extern void lustre_swab_obdo (struct obdo *o);
+
#define LOV_MAGIC_V1 0x0BD10BD0
#define LOV_MAGIC LOV_MAGIC_V1
+#define LOV_MAGIC_JOIN 0x0BD20BD0
#define LOV_PATTERN_RAID0 0x001 /* stripes are used round-robin */
#define LOV_PATTERN_RAID1 0x002 /* stripes are mirrors of each other */
struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */
};
+
#define OBD_MD_FLID (0x00000001ULL) /* object ID */
#define OBD_MD_FLATIME (0x00000002ULL) /* access time */
#define OBD_MD_FLMTIME (0x00000004ULL) /* data modification time */
#define OBD_MD_FLOSCOPQ (0x00400000ULL) /* osc opaque data */
#define OBD_MD_FLCOOKIE (0x00800000ULL) /* log cancellation cookie */
#define OBD_MD_FLGROUP (0x01000000ULL) /* group */
-#define OBD_MD_FLIFID (0x02000000ULL) /* ->ost write inline fid */
+#define OBD_MD_FLFID (0x02000000ULL) /* ->ost write inline fid */
#define OBD_MD_FLEPOCH (0x04000000ULL) /* ->ost write easize is epoch */
#define OBD_MD_FLGRANT (0x08000000ULL) /* ost preallocation space grant */
#define OBD_MD_FLDIREA (0x10000000ULL) /* dir's extended attribute data */
#define OBD_MD_FLUSRQUOTA (0x20000000ULL) /* over quota flags sent from ost */
#define OBD_MD_FLGRPQUOTA (0x40000000ULL) /* over quota flags sent from ost */
+#define OBD_MD_FLMODEASIZE (0x80000000ULL) /* EA size will be changed */
#define OBD_MD_MDS (0x0000000100000000ULL) /* where an inode lives on */
#define OBD_MD_REINT (0x0000000200000000ULL) /* reintegrate oa */
#define OBD_MD_FLXATTR (0x0000001000000000ULL) /* xattr */
#define OBD_MD_FLXATTRLS (0x0000002000000000ULL) /* xattr list */
#define OBD_MD_FLXATTRRM (0x0000004000000000ULL) /* xattr remove */
+#define OBD_MD_FLACL (0x0000008000000000ULL) /* ACL */
#define OBD_MD_FLGETATTR (OBD_MD_FLID | OBD_MD_FLATIME | OBD_MD_FLMTIME | \
OBD_MD_FLCTIME | OBD_MD_FLSIZE | OBD_MD_FLBLKSZ | \
__u32 os_bsize;
__u32 os_namelen;
__u64 os_maxbytes;
- __u32 os_spare[10];
+ __u32 os_state; /* positive error code on server */
+ __u32 os_spare1;
+ __u32 os_spare2;
+ __u32 os_spare3;
+ __u32 os_spare4;
+ __u32 os_spare5;
+ __u32 os_spare6;
+ __u32 os_spare7;
+ __u32 os_spare8;
+ __u32 os_spare9;
};
extern void lustre_swab_obd_statfs (struct obd_statfs *os);
* MDS REQ RECORDS
*/
+/* FIXME: this is different from HEAD, adjust it
+ * while merging GSS */
+#define MDS_REQ_REC_OFF 0
+
+#define MDS_REQ_INTENT_LOCKREQ_OFF 0
+#define MDS_REQ_INTENT_IT_OFF 1
+#define MDS_REQ_INTENT_REC_OFF 2
+
/* opcodes */
typedef enum {
MDS_GETATTR = 33,
#define DISP_OPEN_OPEN 0x20
#define DISP_ENQ_COMPLETE 0x40
+/* INODE LOCK PARTS */
+#define MDS_INODELOCK_LOOKUP 0x000001 /* dentry, mode, owner, group */
+#define MDS_INODELOCK_UPDATE 0x000002 /* size, links, timestamps */
+#define MDS_INODELOCK_OPEN 0x000004 /* For opened files */
+
+/* Do not forget to increase MDS_INODELOCK_MAXSHIFT when adding new bits */
+#define MDS_INODELOCK_MAXSHIFT 2
+/* This FULL lock is useful to take on unlink sort of operations */
+#define MDS_INODELOCK_FULL ((1<<(MDS_INODELOCK_MAXSHIFT+1))-1)
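/* Illustrative note (not part of the original patch): with
 * MDS_INODELOCK_MAXSHIFT == 2, MDS_INODELOCK_FULL == (1 << 3) - 1 == 0x7,
 * i.e. LOOKUP | UPDATE | OPEN. */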
+
struct ll_fid {
- __u64 id;
- __u32 generation;
- __u32 f_type;
+ __u64 id; /* holds object id */
+ __u32 generation; /* holds object generation */
+
+ __u32 f_type; /* holds object type or stripe idx when passing it to
+ * OST for saving into EA. */
};
extern void lustre_swab_ll_fid (struct ll_fid *fid);
__u64 ino;
__u32 fsuid;
__u32 fsgid;
- __u32 capability;
+ __u32 capability;
__u32 mode;
__u32 uid;
__u32 gid;
__u32 generation;
__u32 suppgid;
__u32 eadatasize;
- __u32 padding_1; /* also fix lustre_swab_mds_body */
- __u32 padding_2; /* also fix lustre_swab_mds_body */
- __u32 padding_3; /* also fix lustre_swab_mds_body */
+ __u32 aclsize;
+ __u32 max_mdsize;
+ __u32 max_cookiesize; /* also fix lustre_swab_mds_body */
__u32 padding_4; /* also fix lustre_swab_mds_body */
};
extern void lustre_swab_mds_body (struct mds_body *b);
-/* XXX: same as if_dqinfo struct in kernel */
-struct obd_dqinfo {
- __u64 dqi_bgrace;
- __u64 dqi_igrace;
- __u32 dqi_flags;
- __u32 dqi_valid;
-};
-
-/* XXX: same as if_dqblk struct in kernel, plus one padding */
-struct obd_dqblk {
- __u64 dqb_bhardlimit;
- __u64 dqb_bsoftlimit;
- __u64 dqb_curspace;
- __u64 dqb_ihardlimit;
- __u64 dqb_isoftlimit;
- __u64 dqb_curinodes;
- __u64 dqb_btime;
- __u64 dqb_itime;
- __u32 dqb_valid;
- __u32 padding; /* also fix lustre_swab_obd_quotactl */
-};
-
#define Q_QUOTACHECK 0x800100
#define Q_INITQUOTA 0x800101 /* init slave limits */
#define Q_GETOINFO 0x800102 /* get obd quota info */
#define MDS_OPEN_DELAY_CREATE 0100000000 /* delay initial object create */
#define MDS_OPEN_OWNEROVERRIDE 0200000000 /* NFSD rw-reopen ro file for owner */
+#define MDS_OPEN_JOIN_FILE 0400000000 /* open for join file */
#define MDS_OPEN_HAS_EA 010000000000 /* specify object create pattern */
#define MDS_OPEN_HAS_OBJS 020000000000 /* Just set the EA the obj exist */
extern void lustre_swab_mds_rec_create (struct mds_rec_create *cr);
+struct mds_rec_join {
+ struct ll_fid jr_fid;
+ __u64 jr_headsize;
+};
+
+extern void lustre_swab_mds_rec_join (struct mds_rec_join *jr);
+
struct mds_rec_link {
__u32 lk_opcode;
__u32 lk_fsuid;
LDLM_PLAIN = 10,
LDLM_EXTENT = 11,
LDLM_FLOCK = 12,
-// LDLM_IBITS = 13,
+ LDLM_IBITS = 13,
LDLM_MAX_TYPE
} ldlm_type_t;
__u64 gid;
};
+struct ldlm_inodebits {
+ __u64 bits;
+};
+
struct ldlm_flock {
__u64 start;
__u64 end;
typedef union {
struct ldlm_extent l_extent;
struct ldlm_flock l_flock;
+ struct ldlm_inodebits l_inodebits;
} ldlm_policy_data_t;
extern void lustre_swab_ldlm_policy_data (ldlm_policy_data_t *d);
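/* Illustrative sketch (hypothetical helper, not part of the original patch):
 * an inodebits enqueue covering both the LOOKUP and UPDATE parts of an
 * inode would fill the policy like this: */
static inline void ldlm_policy_set_ibits_sketch(ldlm_policy_data_t *policy)
{
        policy->l_inodebits.bits = MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE;
}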
struct mgmt_target_info {
char mti_fsname[MTI_NAME_MAXLEN];
char mti_svname[MTI_NAME_MAXLEN];
- __u64 mti_nids[MTI_NIDS_MAX]; /* lnet_nid_t host nids */
- __u64 mti_failnids[MTI_NIDS_MAX]; /* partner nids */
+ lnet_nid_t mti_nids[MTI_NIDS_MAX]; /* host nids */
+ lnet_nid_t mti_failnids[MTI_NIDS_MAX]; /* partner nids */
__u64 mti_stripe_size;
__u64 mti_stripe_offset;
__u32 mti_stripe_count; /* how many objects are used */
/* catalog of log objects */
-/* Identifier for a single log object */
-struct llog_logid {
- __u64 lgl_oid;
- __u64 lgl_ogr;
- __u32 lgl_ogen;
-} __attribute__((packed));
-
/* Records written to the CATALOGS list */
#define CATLIST "CATALOGS"
struct llog_catid {
__u32 lci_padding3;
} __attribute__((packed));
+
+/* join file lov mds md */
+struct lov_mds_md_join {
+ struct lov_mds_md lmmj_md;
+ /* join private info */
+ struct llog_logid lmmj_array_id; /* array object id */
+ __u32 lmmj_extent_count; /* array extent count */
+};
+
/* Log data record types - there is no specific reason that these need to
* be related to the RPC opcodes, but no reason not to (may be handy later?)
*/
OBD_CFG_REC = LLOG_OP_MAGIC | 0x20000,
PTL_CFG_REC = LLOG_OP_MAGIC | 0x30000, /* obsolete */
LLOG_GEN_REC = LLOG_OP_MAGIC | 0x40000,
+ LLOG_JOIN_REC = LLOG_OP_MAGIC | 0x50000,
LLOG_HDR_MAGIC = LLOG_OP_MAGIC | 0x45539,
LLOG_LOGID_MAGIC = LLOG_OP_MAGIC | 0x4553b,
} llog_op_type;
struct llog_rec_tail lid_tail;
} __attribute__((packed));
+/* MDS extent description
+ * Describes the extents of a joined file; each extent is recorded as
+ * (start, length, lmm).
+ */
+struct mds_extent_desc {
+ __u64 med_start; /* extent start */
+ __u64 med_len; /* extent length */
+ struct lov_mds_md med_lmm; /* extent's lmm */
+};
+/* Joined file array extent log record */
+struct llog_array_rec {
+ struct llog_rec_hdr lmr_hdr;
+ struct mds_extent_desc lmr_med;
+ struct llog_rec_tail lmr_tail;
+};
+
struct llog_create_rec {
struct llog_rec_hdr lcr_hdr;
struct ll_fid lcr_fid;
LLOG_ORIGIN_HANDLE_CLOSE = 505,
LLOG_ORIGIN_CONNECT = 506,
LLOG_CATINFO = 507, /* for lfs catinfo */
+ LLOG_ORIGIN_HANDLE_PREV_BLOCK = 508,
+ LLOG_ORIGIN_HANDLE_DESTROY = 509, /* for destroy llog object*/
};
struct llogd_body {
extern void lustre_swab_lov_user_md(struct lov_user_md *lum);
extern void lustre_swab_lov_user_md_objects(struct lov_user_md *lum);
+extern void lustre_swab_lov_user_md_join(struct lov_user_md_join *lumj);
/* llog_swab.c */
extern void lustre_swab_llogd_body (struct llogd_body *d);
struct lustre_cfg;
extern void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg);
-static inline struct ll_fid *obdo_fid(struct obdo *oa)
-{
- return (struct ll_fid *)(oa->o_inline + sizeof(struct lustre_handle) +
- sizeof(struct llog_cookie));
-}
-
/* qutoa */
struct qunit_data {
- __u32 qd_id;
- __u32 qd_type;
- __u32 qd_count;
- __u32 qd_isblk; /* indicating if it's block quota */
+ __u32 qd_id; /* ID this applies to (uid, gid) */
+ __u32 qd_type; /* Quota type (USRQUOTA, GRPQUOTA) */
+ __u32 qd_count; /* acquire/release count (bytes for block quota) */
+ __u32 qd_isblk; /* Block quota or file quota */
};
extern void lustre_swab_qdata(struct qunit_data *d);
QUOTA_DQREL = 602,
} quota_cmd_t;
+#define JOIN_FILE_ALIGN 4096
#endif
/* flags */
unsigned int imp_invalid:1, imp_replayable:1,
imp_dlm_fake:1, imp_server_timeout:1,
- imp_initial_recov:1, imp_force_verify:1,
- imp_pingable:1, imp_resend_replay:1,
- imp_deactive:1, imp_initial_recov_bk:1;
+ imp_initial_recov:1, imp_initial_recov_bk:1,
+ imp_force_verify:1, imp_pingable:1,
+ imp_resend_replay:1, imp_deactive:1;
__u32 imp_connect_op;
struct obd_connect_data imp_connect_data;
+ __u64 imp_connect_flags_orig;
};
typedef void (*obd_import_callback)(struct obd_import *imp, void *closure,
#endif
#endif
+/* prng.c */
+unsigned int ll_rand(void); /* returns a random 32-bit integer */
+void ll_srand(unsigned int, unsigned int); /* seed the generator */
+
/* target.c */
struct ptlrpc_request;
struct recovd_data;
#define POISON_BULK 0
-static inline int ll_insecure_random_int(void)
-{
- struct timeval t;
- do_gettimeofday(&t);
- return (int)(t.tv_usec);
-}
-
/*
* l_wait_event is a flexible sleeping function, permitting simple caller
* configuration of interrupt and timeout sensitivity along with actions to
* SIGNALS. The caller must therefore beware that if 'timeout' is zero, or if
* 'timeout_handler' is not NULL and returns FALSE, then the ONLY thing that
* can unblock the current process is 'condition' becoming TRUE.
+ *
+ * Another form of usage is:
+ * struct l_wait_info lwi = LWI_TIMEOUT_INTERVAL(timeout, interval,
+ *                                               timeout_handler, data);
+ * rc = l_wait_event(waitq, condition, &lwi);
+ * This is the same as the previous case, but the condition is re-checked
+ * once every 'interval' jiffies (if non-zero).
+ *
*/
#define LWI_ON_SIGNAL_NOOP ((void (*)(void *))(-1))
struct l_wait_info {
long lwi_timeout;
+ long lwi_interval;
int (*lwi_on_timeout)(void *);
void (*lwi_on_signal)(void *);
void *lwi_cb_data;
((struct l_wait_info) { \
.lwi_timeout = time, \
.lwi_on_timeout = cb, \
- .lwi_cb_data = data \
+ .lwi_cb_data = data, \
+ .lwi_interval = 0 \
+})
+
+#define LWI_TIMEOUT_INTERVAL(time, interval, cb, data) \
+((struct l_wait_info) { \
+ .lwi_timeout = time, \
+ .lwi_on_timeout = cb, \
+ .lwi_cb_data = data, \
+ .lwi_interval = interval \
})
+
#define LWI_TIMEOUT_INTR(time, time_cb, sig_cb, data) \
((struct l_wait_info) { \
.lwi_timeout = time, \
.lwi_on_timeout = time_cb, \
.lwi_on_signal = (sig_cb == NULL) ? LWI_ON_SIGNAL_NOOP : sig_cb, \
- .lwi_cb_data = data \
+ .lwi_cb_data = data, \
+ .lwi_interval = 0 \
})
#define LWI_INTR(cb, data) LWI_TIMEOUT_INTR(0, NULL, cb, data)
#define __l_wait_event(wq, condition, info, ret, excl) \
do { \
wait_queue_t __wait; \
- signed long __timeout = info->lwi_timeout; \
+ unsigned long __timeout = info->lwi_timeout; \
unsigned long __irqflags; \
sigset_t __blocked; \
\
if (__timeout == 0) { \
schedule(); \
} else { \
- __timeout = schedule_timeout(__timeout); \
+ unsigned long interval = info->lwi_interval? \
+ min_t(unsigned long, \
+ info->lwi_interval,__timeout):\
+ __timeout; \
+ __timeout -= interval - schedule_timeout(interval); \
if (__timeout == 0) { \
if (info->lwi_on_timeout == NULL || \
info->lwi_on_timeout(info->lwi_cb_data)) { \
__then = time(NULL); \
\
while (!(condition)) { \
- if (liblustre_wait_event(__timeout)) { \
+ if (liblustre_wait_event(info->lwi_interval?:__timeout) || \
+ (info->lwi_interval && info->lwi_interval < __timeout)) {\
if (__timeout != 0 && info->lwi_timeout != 0) { \
__now = time(NULL); \
__timeout -= __now - __then; \
__ret; \
})
-#define LMD_MAGIC 0xbdacbd03
-
#ifdef __KERNEL__
#define LIBLUSTRE_CLIENT (0)
#else
} u;
};
-struct llog_fill_rec_data {
- obd_id lfd_id; /* object id */
- obd_count lfd_ogen; /* object group */
-};
-
-
/* llog.c - general API */
typedef int (*llog_cb_t)(struct llog_handle *, struct llog_rec_hdr *, void *);
typedef int (*llog_fill_rec_cb_t)(struct llog_rec_hdr *rec, void *data);
extern void llog_free_handle(struct llog_handle *handle);
int llog_process(struct llog_handle *loghandle, llog_cb_t cb,
void *data, void *catdata);
+int llog_reverse_process(struct llog_handle *loghandle, llog_cb_t cb,
+ void *data, void *catdata);
extern int llog_cancel_rec(struct llog_handle *loghandle, int index);
extern int llog_close(struct llog_handle *cathandle);
extern int llog_get_size(struct llog_handle *loghandle);
int llog_cat_cancel_records(struct llog_handle *cathandle, int count,
struct llog_cookie *cookies);
int llog_cat_process(struct llog_handle *cat_llh, llog_cb_t cb, void *data);
+int llog_cat_reverse_process(struct llog_handle *cat_llh, llog_cb_t cb, void *data);
int llog_cat_set_first_idx(struct llog_handle *cathandle, int index);
/* llog_obd.c */
int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp);
int llog_add(struct llog_ctxt *ctxt, struct llog_rec_hdr *rec,
struct lov_stripe_md *lsm, struct llog_cookie *logcookies,
- int numcookies, llog_fill_rec_cb_t fill_cb);
+ int numcookies);
int llog_cancel(struct llog_ctxt *, struct lov_stripe_md *lsm,
int count, struct llog_cookie *cookies, int flags);
int llog_obd_origin_cleanup(struct llog_ctxt *ctxt);
int llog_obd_origin_add(struct llog_ctxt *ctxt,
struct llog_rec_hdr *rec, struct lov_stripe_md *lsm,
- struct llog_cookie *logcookies, int numcookies,
- llog_fill_rec_cb_t fill_cb);
+ struct llog_cookie *logcookies, int numcookies);
int llog_cat_initialize(struct obd_device *obd, int count);
int obd_llog_init(struct obd_device *obd, struct obd_device *disk_obd,
int (*lop_destroy)(struct llog_handle *handle);
int (*lop_next_block)(struct llog_handle *h, int *curr_idx,
int next_idx, __u64 *offset, void *buf, int len);
+ int (*lop_prev_block)(struct llog_handle *h,
+ int prev_idx, void *buf, int len);
int (*lop_create)(struct llog_ctxt *ctxt, struct llog_handle **,
struct llog_logid *logid, char *name);
int (*lop_close)(struct llog_handle *handle);
int (*lop_cleanup)(struct llog_ctxt *ctxt);
int (*lop_add)(struct llog_ctxt *ctxt, struct llog_rec_hdr *rec,
struct lov_stripe_md *lsm,
- struct llog_cookie *logcookies, int numcookies,
- llog_fill_rec_cb_t fill_cb);
+ struct llog_cookie *logcookies, int numcookies);
int (*lop_cancel)(struct llog_ctxt *ctxt, struct lov_stripe_md *lsm,
int count, struct llog_cookie *cookies, int flags);
int (*lop_connect)(struct llog_ctxt *ctxt, int count,
int loc_idx; /* my index the obd array of ctxt's */
struct llog_gen loc_gen;
struct obd_device *loc_obd; /* points back to the containing obd*/
- struct obd_export *loc_exp;
+ struct obd_export *loc_exp; /* parent "disk" export (e.g. MDS) */
struct obd_import *loc_imp; /* to use in RPC's: can be backward
pointing import */
struct llog_operations *loc_logops;
#define LLOG_GEN_INC(gen) ((gen).conn_cnt ++)
#define LLOG_PROC_BREAK 0x0001
+#define LLOG_DEL_RECORD 0x0002
static inline int llog_obd2ops(struct llog_ctxt *ctxt,
struct llog_operations **lop)
RETURN(rc);
}
+static inline int llog_prev_block(struct llog_handle *loghandle,
+ int prev_idx, void *buf, int len)
+{
+ struct llog_operations *lop;
+ int rc;
+ ENTRY;
+
+ rc = llog_handle2ops(loghandle, &lop);
+ if (rc)
+ RETURN(rc);
+ if (lop->lop_prev_block == NULL)
+ RETURN(-EOPNOTSUPP);
+
+ rc = lop->lop_prev_block(loghandle, prev_idx, buf, len);
+ RETURN(rc);
+}
+
static inline int llog_create(struct llog_ctxt *ctxt, struct llog_handle **res,
struct llog_logid *logid, char *name)
{
#ifdef __KERNEL__
# include <linux/fs.h>
# include <linux/dcache.h>
+# ifdef CONFIG_FS_POSIX_ACL
+# include <linux/xattr_acl.h>
+# endif
#endif
#include <linux/lustre_handles.h>
#include <libcfs/kp30.h>
struct ll_file_data;
struct lustre_md {
- struct mds_body *body;
- struct lov_stripe_md *lsm;
+ struct mds_body *body;
+ struct lov_stripe_md *lsm;
+#ifdef CONFIG_FS_POSIX_ACL
+ struct posix_acl *posix_acl;
+#endif
};
struct mdc_op_data {
struct lvfs_grp_hash_entry *ur_grp_entry;
};
-#define MDS_LR_SERVER_SIZE 512
-
-#define MDS_LR_CLIENT_START 8192
-#define MDS_LR_CLIENT_SIZE 128
-#if MDS_LR_CLIENT_START < MDS_LR_SERVER_SIZE
-#error "Can't have MDS_LR_CLIENT_START < MDS_LR_SERVER_SIZE"
-#endif
-
-#define MDS_CLIENT_SLOTS 17
-
-/* Data stored per client in the last_rcvd file. In le32 order. */
-struct mds_client_data {
- __u8 mcd_uuid[40]; /* client UUID */
- __u64 mcd_last_transno; /* last completed transaction ID */
- __u64 mcd_last_xid; /* xid for the last transaction */
- __u32 mcd_last_result; /* result from last RPC */
- __u32 mcd_last_data; /* per-op data (disposition for open &c.) */
- __u8 mcd_padding[MDS_LR_CLIENT_SIZE - 64];
-};
-
/* file data for open files on MDS */
struct mds_file_data {
struct portals_handle mfd_handle; /* must be first */
struct dentry *mfd_dentry;
};
+/* ACL */
+#ifdef CONFIG_FS_POSIX_ACL
+#define LUSTRE_POSIX_ACL_MAX_ENTRIES (32)
+#define LUSTRE_POSIX_ACL_MAX_SIZE \
+ (xattr_acl_size(LUSTRE_POSIX_ACL_MAX_ENTRIES))
+#else
+#define LUSTRE_POSIX_ACL_MAX_SIZE 0
+#endif
+
/* mds/mds_reint.c */
int mds_reint_rec(struct mds_update_record *r, int offset,
struct ptlrpc_request *req, struct lustre_handle *);
struct dentry *mds_fid2locked_dentry(struct obd_device *obd, struct ll_fid *fid,
struct vfsmount **mnt, int lock_mode,
struct lustre_handle *lockh,
- char *name, int namelen);
+ char *name, int namelen, __u64 lockpart);
struct dentry *mds_fid2dentry(struct mds_obd *mds, struct ll_fid *fid,
struct vfsmount **mnt);
int mds_update_server_data(struct obd_device *, int force_sync);
/* mdc/mdc_request.c */
int mdc_init_ea_size(struct obd_export *mdc_exp, struct obd_export *lov_exp);
-int mdc_req2lustre_md(struct ptlrpc_request *req, int offset,
+
+int mdc_req2lustre_md(struct ptlrpc_request *req, int offset,
struct obd_export *exp, struct lustre_md *md);
+void mdc_free_lustre_md(struct obd_export *exp, struct lustre_md *md);
+
int mdc_getstatus(struct obd_export *exp, struct ll_fid *rootfid);
int mdc_getattr(struct obd_export *exp, struct ll_fid *fid,
obd_valid valid, unsigned int ea_size,
struct ptlrpc_request **request);
int mdc_getattr_name(struct obd_export *exp, struct ll_fid *fid,
- char *filename, int namelen, unsigned long valid,
+ const char *filename, int namelen, unsigned long valid,
unsigned int ea_size, struct ptlrpc_request **request);
int mdc_setattr(struct obd_export *exp, struct mdc_op_data *data,
struct iattr *iattr, void *ea, int ealen, void *ea2, int ea2len,
/* MD flags we _always_ use */
#define PTLRPC_MD_OPTIONS 0
-/* Define maxima for bulk I/O
+/* Define maxima for bulk I/O
* CAVEAT EMPTOR, with multinet (i.e. routers forwarding between networks)
* these limits are system wide and not interface-local. */
#define PTLRPC_MAX_BRW_SIZE LNET_MTU
* considered full when less than ?_MAXREQSIZE is left in them.
*/
-#define LDLM_NUM_THREADS min((int)(smp_num_cpus * smp_num_cpus * 8), 64)
+#define LDLM_NUM_THREADS min((int)(smp_num_cpus * smp_num_cpus * 8), 64)
#define LDLM_NBUFS 64
#define LDLM_BUFSIZE (8 * 1024)
#define LDLM_MAXREQSIZE (5 * 1024)
#define LDLM_MAXREPSIZE (1024)
#define MDT_MAX_THREADS 32UL
-#define MDT_NUM_THREADS max(min_t(unsigned long, num_physpages / 8192, \
- MDT_MAX_THREADS), 2UL)
+#define MDT_NUM_THREADS max(min_t(unsigned long, MDT_MAX_THREADS, \
+ num_physpages >> (25 - PAGE_SHIFT)), 2UL)
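/* Illustrative note (not part of the original patch): with 4KB pages
 * (PAGE_SHIFT == 12) the shift is 25 - 12 = 13, i.e. one MDT thread per
 * 8192 pages (32MB) of RAM, clamped to the range [2, MDT_MAX_THREADS]. */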
#define MDS_NBUFS (64 * smp_num_cpus)
#define MDS_BUFSIZE (8 * 1024)
/* Assume file name length = FNAME_MAX = 256 (true for ext3).
*
* MDS_MAXREQSIZE ~= 4736 bytes =
* lustre_msg + ldlm_request + mds_body + mds_rec_create + FNAME_MAX + PATH_MAX
+ * MDS_MAXREPSIZE ~= 8300 bytes = lustre_msg + llog_header
+ * or, for mds_close() and mds_reint_unlink() on a many-OST filesystem:
+ * = 9210 bytes = lustre_msg + mds_body + 160 * (easize + cookiesize)
*
* Realistic size is about 512 bytes (20 character name + 128 char symlink),
* except in the open case where there are a large number of OSTs in a LOV.
*/
#define MDS_MAXREQSIZE (5 * 1024)
-#define MDS_MAXREPSIZE (9 * 1024)
+#define MDS_MAXREPSIZE max(9 * 1024, 280 + LOV_MAX_STRIPE_COUNT * 56)
+/* FIXME fix all constants here */
#define MGS_MAX_THREADS 32UL
#define MGS_NUM_THREADS max(min_t(unsigned long, num_physpages / 8192, \
MGS_MAX_THREADS), 2UL)
#define MGS_NBUFS (64 * smp_num_cpus)
#define MGS_BUFSIZE (8 * 1024)
-
#define MGS_MAXREQSIZE (5 * 1024)
#define MGS_MAXREPSIZE (9 * 1024)
-#define OST_MAX_THREADS 36UL
-#define OST_NUM_THREADS max(min_t(unsigned long, num_physpages / 8192, \
- OST_MAX_THREADS), 2UL)
+#define OST_MAX_THREADS 512UL
+#define OST_DEF_THREADS max_t(unsigned long, 2, \
+ (num_physpages >> (26-PAGE_SHIFT)) * smp_num_cpus)
#define OST_NBUFS (64 * smp_num_cpus)
#define OST_BUFSIZE (8 * 1024)
/* OST_MAXREQSIZE ~= 4768 bytes =
#define OST_MAXREQSIZE (5 * 1024)
#define OST_MAXREPSIZE (9 * 1024)
-#define PTLBD_NUM_THREADS 4
-#define PTLBD_NBUFS 64
-#define PTLBD_BUFSIZE (32 * 1024)
-#define PTLBD_MAXREQSIZE 1024
-
struct ptlrpc_connection {
struct list_head c_link;
lnet_nid_t c_self;
int rq_reqlen;
struct lustre_msg *rq_reqmsg;
- int rq_timeout; /* seconds */
+ int rq_timeout; /* time to wait for reply (seconds) */
int rq_replen;
struct lustre_msg *rq_repmsg;
__u64 rq_transno;
struct ptlrpc_reply_state *rq_reply_state; /* separated reply state */
struct ptlrpc_request_buffer_desc *rq_rqbd; /* incoming request buffer*/
#if CRAY_XT3
-# error "Need to get the uid from the event?"
__u32 rq_uid; /* peer uid, used in MDS only */
#endif
struct ptlrpc_cb_id bd_cbid; /* network callback info */
lnet_handle_md_t bd_md_h; /* associated MD */
-
+
#if defined(__KERNEL__)
lnet_kiov_t bd_iov[0];
#else
int srv_num_threads; /* # threads to start/started */
unsigned srv_cpu_affinity:1; /* bind threads to CPUs */
- __u32 srv_req_portal;
- __u32 srv_rep_portal;
+ __u32 srv_req_portal;
+ __u32 srv_rep_portal;
int srv_n_queued_reqs; /* # reqs waiting to be served */
struct list_head srv_request_queue; /* reqs waiting for service */
wait_queue_head_t srv_waitq; /* all threads sleep on this */
struct list_head srv_threads;
- struct obd_device *srv_obddev;
svc_handler_t srv_handler;
-
+
char *srv_name; /* only statically allocated strings here; we don't clean them */
spinlock_t srv_lock;
struct lprocfs_stats *srv_stats;
/* List of free reply_states */
- struct list_head srv_free_rs_list;
+ struct list_head srv_free_rs_list;
/* waitq to run, when adding stuff to srv_free_rs_list */
- wait_queue_head_t srv_free_rs_waitq;
-
+ wait_queue_head_t srv_free_rs_waitq;
+
/*
* if non-NULL called during thread creation (ptlrpc_start_thread())
* to initialize service specific per-thread state.
/* ptlrpc/events.c */
extern lnet_handle_eq_t ptlrpc_eq_h;
-extern int ptlrpc_uuid_to_peer(struct obd_uuid *uuid,
+extern int ptlrpc_uuid_to_peer(struct obd_uuid *uuid,
lnet_process_id_t *peer, lnet_nid_t *self);
extern void request_out_callback (lnet_event_t *ev);
extern void reply_in_callback(lnet_event_t *ev);
int ptlrpc_register_bulk(struct ptlrpc_request *req);
void ptlrpc_unregister_bulk (struct ptlrpc_request *req);
-static inline int ptlrpc_bulk_active (struct ptlrpc_bulk_desc *desc)
+static inline int ptlrpc_bulk_active (struct ptlrpc_bulk_desc *desc)
{
unsigned long flags;
int rc;
int ptlrpc_reply(struct ptlrpc_request *req);
int ptlrpc_error(struct ptlrpc_request *req);
void ptlrpc_resend_req(struct ptlrpc_request *request);
-int ptl_send_rpc(struct ptlrpc_request *request);
-int ptl_send_rpc_nowait(struct ptlrpc_request *request);
+int ptl_send_rpc(struct ptlrpc_request *request, int noreply);
int ptlrpc_register_rqbd (struct ptlrpc_request_buffer_desc *rqbd);
/* ptlrpc/client.c */
{
unsigned long flags;
int rc;
-
+
spin_lock_irqsave(&req->rq_lock, flags);
rc = req->rq_receiving_reply;
spin_unlock_irqrestore(&req->rq_lock, flags);
{
unsigned long flags;
int rc;
-
+
spin_lock_irqsave(&req->rq_lock, flags);
rc = req->rq_replied;
spin_unlock_irqrestore(&req->rq_lock, flags);
void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq);
struct ptlrpc_request_pool *ptlrpc_init_rq_pool(int, int,
void (*populate_pool)(struct ptlrpc_request_pool *, int));
-struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, int opcode,
+struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, __u32 version, int opcode,
int count, int *lengths, char **bufs);
-struct ptlrpc_request *ptlrpc_prep_req_pool(struct obd_import *imp, int opcode,
- int count, int *lengths,
- char **bufs,
+struct ptlrpc_request *ptlrpc_prep_req_pool(struct obd_import *imp, __u32 version, int opcode,
+ int count, int *lengths, char **bufs,
struct ptlrpc_request_pool *pool);
void ptlrpc_free_req(struct ptlrpc_request *request);
void ptlrpc_req_finished(struct ptlrpc_request *request);
__u64 ptlrpc_req_xid(struct ptlrpc_request *request);
/* ptlrpc/service.c */
-void ptlrpc_save_lock (struct ptlrpc_request *req,
+void ptlrpc_save_lock (struct ptlrpc_request *req,
struct lustre_handle *lock, int mode);
void ptlrpc_commit_replies (struct obd_device *obd);
void ptlrpc_schedule_difficult_reply (struct ptlrpc_reply_state *rs);
/* ptlrpc/pack_generic.c */
int lustre_msg_swabbed(struct lustre_msg *msg);
+int lustre_msg_check_version(struct lustre_msg *msg, __u32 version);
int lustre_pack_request(struct ptlrpc_request *, int count, int *lens,
char **bufs);
int lustre_pack_reply(struct ptlrpc_request *, int count, int *lens,
char **bufs);
+void lustre_shrink_reply(struct ptlrpc_request *req,
+ int segment, unsigned int newlen, int move_data);
void lustre_free_reply_state(struct ptlrpc_reply_state *rs);
int lustre_msg_size(int count, int *lengths);
int lustre_unpack_msg(struct lustre_msg *m, int len);
/* ptlrpc/llog_server.c */
int llog_origin_handle_create(struct ptlrpc_request *req);
+int llog_origin_handle_destroy(struct ptlrpc_request *req);
+int llog_origin_handle_prev_block(struct ptlrpc_request *req);
int llog_origin_handle_next_block(struct ptlrpc_request *req);
int llog_origin_handle_read_header(struct ptlrpc_request *req);
int llog_origin_handle_close(struct ptlrpc_request *req);
#define _LUSTRE_QUOTA_H
#ifdef __KERNEL__
-# include <linux/version.h>
+#include <linux/version.h>
+#include <linux/fs.h>
+#include <linux/quota.h>
+#include <linux/quotaops.h>
#endif
#include <linux/lustre_idl.h>
+#include <linux/lustre_net.h>
+#include <linux/lvfs.h>
+
+struct obd_device;
+struct client_obd;
+
+#ifndef NR_DQHASH
+#define NR_DQHASH 45
+#endif
#ifdef HAVE_QUOTA_SUPPORT
-#include <linux/lustre_realquota.h>
-#else
+#ifdef __KERNEL__
+
+/* structures to access admin quotafile */
struct lustre_mem_dqinfo {
+ unsigned int dqi_bgrace;
+ unsigned int dqi_igrace;
+ unsigned long dqi_flags;
+ unsigned int dqi_blocks;
+ unsigned int dqi_free_blk;
+ unsigned int dqi_free_entry;
};
struct lustre_quota_info {
+ struct file *qi_files[MAXQUOTAS];
+ struct lustre_mem_dqinfo qi_info[MAXQUOTAS];
};
+#define DQ_STATUS_AVAIL 0x0 /* Available dquot */
+#define DQ_STATUS_SET 0x01 /* Somebody is setting dquot */
+#define DQ_STATUS_RECOVERY 0x02 /* dquot is in recovery */
+
struct lustre_dquot {
+ /* Hash list in memory, protected by dquot_hash_lock */
+ struct list_head dq_hash;
+ /* Protect the data in lustre_dquot */
+ struct semaphore dq_sem;
+ /* Use count */
+ int dq_refcnt;
+ /* Pointer to the quota info it belongs to */
+ struct lustre_quota_info *dq_info;
+
+ loff_t dq_off; /* Offset of dquot on disk */
+ unsigned int dq_id; /* ID this applies to (uid, gid) */
+ int dq_type; /* Type of quota (USRQUOTA, GRPQUOTA) */
+ unsigned short dq_status; /* See DQ_STATUS_ */
+ unsigned long dq_flags; /* See DQ_ in quota.h */
+ struct mem_dqblk dq_dqb; /* Diskquota usage */
};
+struct dquot_id {
+ struct list_head di_link;
+ __u32 di_id;
+};
+
+#define QFILE_CHK 1
+#define QFILE_RD_INFO 2
+#define QFILE_WR_INFO 3
+#define QFILE_INIT_INFO 4
+#define QFILE_RD_DQUOT 5
+#define QFILE_WR_DQUOT 6
+
+/* admin quotafile operations */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)
+int lustre_check_quota_file(struct lustre_quota_info *lqi, int type);
+int lustre_read_quota_info(struct lustre_quota_info *lqi, int type);
+int lustre_write_quota_info(struct lustre_quota_info *lqi, int type);
+int lustre_read_dquot(struct lustre_dquot *dquot);
+int lustre_commit_dquot(struct lustre_dquot *dquot);
+int lustre_init_quota_info(struct lustre_quota_info *lqi, int type);
+int lustre_get_qids(struct file *file, struct inode *inode, int type,
+ struct list_head *list);
+#else
+
+#ifndef DQ_FAKE_B
+#define DQ_FAKE_B 6
+#endif
static inline int lustre_check_quota_file(struct lustre_quota_info *lqi,
int type)
{
return 0;
}
-#ifdef __KERNEL__
static inline int lustre_read_dquot(struct lustre_dquot *dquot)
{
return 0;
{
return 0;
}
-#endif
static inline int lustre_init_quota_info(struct lustre_quota_info *lqi,
int type)
{
return 0;
}
+#endif /* KERNEL_VERSION(2,5,0) */
-struct obd_device;
+#define LL_DQUOT_OFF(sb) DQUOT_OFF(sb)
typedef int (*dqacq_handler_t) (struct obd_device * obd, struct qunit_data * qd,
int opc);
+struct lustre_quota_ctxt {
+ struct super_block *lqc_sb; /* superblock this applies to */
+ struct obd_import *lqc_import; /* import used to send dqacq/dqrel RPC */
+ dqacq_handler_t lqc_handler; /* dqacq/dqrel RPC handler, only for quota master */
+ unsigned long lqc_recovery:1; /* Doing recovery */
+ unsigned long lqc_iunit_sz; /* Unit size of file quota */
+ unsigned long lqc_itune_sz; /* Trigger dqacq when the available file
+ * quota is less than this value; trigger
+ * dqrel when it exceeds this value + 1 iunit */
+ unsigned long lqc_bunit_sz; /* Unit size of block quota */
+ unsigned long lqc_btune_sz; /* See comment of lqc_itune_sz */
+};
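/* Illustrative sketch (hypothetical helper, not part of the original patch):
 * the tune/unit relationship above means a slave asks the master for
 * another block-quota unit once its locally available block quota drops
 * below lqc_btune_sz: */
static inline int lqc_need_block_dqacq(struct lustre_quota_ctxt *lqc,
                                       unsigned long avail_blocks)
{
        return avail_blocks < lqc->lqc_btune_sz;
}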
+
+#else
+
+struct lustre_quota_info {
+};
+
+struct lustre_quota_ctxt {
+};
+
+#endif /* !__KERNEL__ */
+
+#else
+
+#define LL_DQUOT_OFF(sb) do {} while(0)
+
+struct lustre_quota_info {
+};
struct lustre_quota_ctxt {
};
-struct lustre_qunit {
+#endif /* !HAVE_QUOTA_SUPPORT */
+
+/* If the (quota limit < qunit * slave count), a slave that can't
+ * acquire one qunit should set its local limit to MIN_QLIMIT */
+#define MIN_QLIMIT 1
+
+struct quotacheck_thread_args {
+ struct obd_export *qta_exp; /* obd export */
+ struct obd_quotactl qta_oqctl; /* obd_quotactl args */
+ struct super_block *qta_sb; /* obd super block */
+ atomic_t *qta_sem; /* obt_quotachecking */
};
-struct super_block;
-static inline int qctxt_init(struct lustre_quota_ctxt *qctxt,
- struct super_block *sb, dqacq_handler_t handler)
+typedef struct {
+ int (*quota_init) (void);
+ int (*quota_exit) (void);
+ int (*quota_setup) (struct obd_device *, struct lustre_cfg *);
+ int (*quota_cleanup) (struct obd_device *);
+ /* For quota master, close admin quota files */
+ int (*quota_fs_cleanup) (struct obd_device *);
+ int (*quota_ctl) (struct obd_export *, struct obd_quotactl *);
+ int (*quota_check) (struct obd_export *, struct obd_quotactl *);
+ int (*quota_recovery) (struct obd_device *);
+
+ /* For quota master/slave, adjust quota limit after fs operation */
+ int (*quota_adjust) (struct obd_device *, unsigned int[],
+ unsigned int[], int, int);
+
+ /* For quota slave, set import, trigger quota recovery */
+ int (*quota_setinfo) (struct obd_export *, struct obd_device *);
+
+ /* For quota slave, set proper thread resource capability */
+ int (*quota_enforce) (struct obd_device *, unsigned int);
+
+ /* For quota slave, check whether specified uid/gid is over quota */
+ int (*quota_getflag) (struct obd_device *, struct obdo *);
+
+ /* For quota slave, acquire/release quota from master if needed */
+ int (*quota_acquire) (struct obd_device *, unsigned int, unsigned int);
+
+ /* For quota client, poll if the quota check done */
+ int (*quota_poll_check) (struct obd_export *, struct if_quotacheck *);
+
+ /* For quota client, check whether specified uid/gid is over quota */
+ int (*quota_chkdq) (struct client_obd *, unsigned int, unsigned int);
+
+ /* For quota client, set the over-quota flag for the specified uid/gid */
+ int (*quota_setdq) (struct client_obd *, unsigned int, unsigned int,
+ obd_flag, obd_flag);
+} quota_interface_t;
+
+#define Q_COPY(out, in, member) (out)->member = (in)->member
+
+#define QUOTA_OP(interface, op) interface->quota_ ## op
+
+#define QUOTA_CHECK_OP(interface, op) \
+do { \
+ if (!interface) \
+ RETURN(0); \
+ if (!QUOTA_OP(interface, op)) { \
+ CERROR("no quota operation: " #op "\n"); \
+ RETURN(-EOPNOTSUPP); \
+ } \
+} while(0)
+
+static inline int lquota_init(quota_interface_t *interface)
{
- return 0;
+ int rc;
+ ENTRY;
+
+ QUOTA_CHECK_OP(interface, init);
+ rc = QUOTA_OP(interface, init)();
+ RETURN(rc);
}
-static inline void qctxt_cleanup(struct lustre_quota_ctxt *qctxt, int force)
+
+static inline int lquota_exit(quota_interface_t *interface)
{
- return;
+ int rc;
+ ENTRY;
+
+ QUOTA_CHECK_OP(interface, exit);
+ rc = QUOTA_OP(interface, exit)();
+ RETURN(rc);
}
-static inline int qctxt_adjust_qunit(struct obd_device *obd,
- struct lustre_quota_ctxt *qctxt,
- uid_t uid, gid_t gid, __u32 isblk)
+
+static inline int lquota_setup(quota_interface_t *interface,
+ struct obd_device *obd,
+ struct lustre_cfg *lcfg)
{
- return 0;
+ int rc;
+ ENTRY;
+
+ QUOTA_CHECK_OP(interface, setup);
+ rc = QUOTA_OP(interface, setup)(obd, lcfg);
+ RETURN(rc);
}
-static inline int qctxt_wait_on_dqacq(struct obd_device *obd,
- struct lustre_quota_ctxt *qctxt,
- uid_t uid, gid_t gid, __u32 isblk)
+
+static inline int lquota_cleanup(quota_interface_t *interface,
+ struct obd_device *obd)
{
- return 0;
+ int rc;
+ ENTRY;
+
+ QUOTA_CHECK_OP(interface, cleanup);
+ rc = QUOTA_OP(interface, cleanup)(obd);
+ RETURN(rc);
}
-struct quotacheck_info {
-};
+static inline int lquota_fs_cleanup(quota_interface_t *interface,
+ struct obd_device *obd)
+{
+ int rc;
+ ENTRY;
+
+ QUOTA_CHECK_OP(interface, fs_cleanup);
+ rc = QUOTA_OP(interface, fs_cleanup)(obd);
+ RETURN(rc);
+}
-#define LL_DQUOT_OFF(sb) do {} while(0)
+static inline int lquota_recovery(quota_interface_t *interface,
+ struct obd_device *obd)
+{
+ int rc;
+ ENTRY;
+
+ QUOTA_CHECK_OP(interface, recovery);
+ rc = QUOTA_OP(interface, recovery)(obd);
+ RETURN(rc);
+}
+
+static inline int lquota_adjust(quota_interface_t *interface,
+ struct obd_device *obd,
+ unsigned int qcids[],
+ unsigned int qpids[],
+ int rc, int opc)
+{
+ int ret;
+ ENTRY;
+
+ QUOTA_CHECK_OP(interface, adjust);
+ ret = QUOTA_OP(interface, adjust)(obd, qcids, qpids, rc, opc);
+ RETURN(ret);
+}
+
+static inline int lquota_chkdq(quota_interface_t *interface,
+ struct client_obd *cli,
+ unsigned int uid, unsigned int gid)
+{
+ int rc;
+ ENTRY;
+
+ QUOTA_CHECK_OP(interface, chkdq);
+ rc = QUOTA_OP(interface, chkdq)(cli, uid, gid);
+ RETURN(rc);
+}
+
+static inline int lquota_setdq(quota_interface_t *interface,
+ struct client_obd *cli,
+ unsigned int uid, unsigned int gid,
+ obd_flag valid, obd_flag flags)
+{
+ int rc;
+ ENTRY;
+
+ QUOTA_CHECK_OP(interface, setdq);
+ rc = QUOTA_OP(interface, setdq)(cli, uid, gid, valid, flags);
+ RETURN(rc);
+}
+
+static inline int lquota_poll_check(quota_interface_t *interface,
+ struct obd_export *exp,
+ struct if_quotacheck *qchk)
+{
+ int rc;
+ ENTRY;
+
+ QUOTA_CHECK_OP(interface, poll_check);
+ rc = QUOTA_OP(interface, poll_check)(exp, qchk);
+ RETURN(rc);
+}
+
+
+static inline int lquota_setinfo(quota_interface_t *interface,
+ struct obd_export *exp,
+ struct obd_device *obd)
+{
+ int rc;
+ ENTRY;
+
+ QUOTA_CHECK_OP(interface, setinfo);
+ rc = QUOTA_OP(interface, setinfo)(exp, obd);
+ RETURN(rc);
+}
+
+static inline int lquota_enforce(quota_interface_t *interface,
+ struct obd_device *obd,
+ unsigned int ignore)
+{
+ int rc;
+ ENTRY;
+
+ QUOTA_CHECK_OP(interface, enforce);
+ rc = QUOTA_OP(interface, enforce)(obd, ignore);
+ RETURN(rc);
+}
+
+static inline int lquota_getflag(quota_interface_t *interface,
+ struct obd_device *obd, struct obdo *oa)
+{
+ int rc;
+ ENTRY;
+
+ QUOTA_CHECK_OP(interface, getflag);
+ rc = QUOTA_OP(interface, getflag)(obd, oa);
+ RETURN(rc);
+}
+
+static inline int lquota_acquire(quota_interface_t *interface,
+ struct obd_device *obd,
+ unsigned int uid, unsigned int gid)
+{
+ int rc;
+ ENTRY;
+
+ QUOTA_CHECK_OP(interface, acquire);
+ rc = QUOTA_OP(interface, acquire)(obd, uid, gid);
+ RETURN(rc);
+}
+
+#ifndef __KERNEL__
+extern quota_interface_t osc_quota_interface;
+extern quota_interface_t mdc_quota_interface;
+extern quota_interface_t lov_quota_interface;
+#endif
-#endif /*!HAVE_QUOTA_SUPPORT */
#endif /* _LUSTRE_QUOTA_H */
INIT_LIST_HEAD(&loi->loi_write_item);
INIT_LIST_HEAD(&loi->loi_read_item);
}
+/* extent array item for describing the joined file extent info */
+struct lov_extent {
+ __u64 le_start; /* extent start */
+ __u64 le_len; /* extent length */
+ int le_loi_idx; /* extent #1 loi's index in lsm loi array */
+ int le_stripe_count; /* extent stripe count*/
+};
+
+/* LOV array info for describing joined file array EA info */
+struct lov_array_info {
+ struct llog_logid lai_array_id; /* MDS med llog object id */
+ unsigned lai_ext_count; /* number of extents */
+ struct lov_extent *lai_ext_array; /* extent desc array */
+};
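/* Illustrative sketch (hypothetical helper, not part of the original patch):
 * finding the extent that covers a given file offset in a joined file,
 * assuming lai_ext_array is sorted by le_start with no overlaps: */
static inline struct lov_extent *
lov_find_extent_sketch(struct lov_array_info *lai, __u64 offset)
{
        unsigned i;

        for (i = 0; i < lai->lai_ext_count; i++) {
                struct lov_extent *le = &lai->lai_ext_array[i];
                if (offset >= le->le_start &&
                    offset < le->le_start + le->le_len)
                        return le;
        }
        return NULL;
}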
struct lov_stripe_md {
spinlock_t lsm_lock;
void *lsm_lock_owner; /* debugging */
- /* Public members. */
- __u64 lsm_object_id; /* lov object id */
- __u64 lsm_object_gr; /* lov object id */
- __u64 lsm_maxbytes; /* maximum possible file size */
- unsigned long lsm_xfersize; /* optimal transfer size */
-
- /* LOV-private members start here -- only for use in lov/. */
- __u32 lsm_magic;
- __u32 lsm_stripe_size; /* size of the stripe */
- __u32 lsm_pattern; /* striping pattern (RAID0, RAID1) */
- unsigned lsm_stripe_count; /* number of objects being striped over */
+ struct {
+ /* Public members. */
+ __u64 lw_object_id; /* lov object id */
+ __u64 lw_object_gr; /* lov object id */
+ __u64 lw_maxbytes; /* maximum possible file size */
+ unsigned long lw_xfersize; /* optimal transfer size */
+
+ /* LOV-private members start here -- only for use in lov/. */
+ __u32 lw_magic;
+ __u32 lw_stripe_size; /* size of the stripe */
+ __u32 lw_pattern; /* striping pattern (RAID0, RAID1) */
+ unsigned lw_stripe_count; /* number of objects being striped over */
+ } lsm_wire;
+
+ struct lov_array_info *lsm_array; /* Only for joined file array info */
struct lov_oinfo lsm_oinfo[0];
};
-/* compare all fields except for semaphore */
+#define lsm_object_id lsm_wire.lw_object_id
+#define lsm_object_gr lsm_wire.lw_object_gr
+#define lsm_maxbytes lsm_wire.lw_maxbytes
+#define lsm_xfersize lsm_wire.lw_xfersize
+#define lsm_magic lsm_wire.lw_magic
+#define lsm_stripe_size lsm_wire.lw_stripe_size
+#define lsm_pattern lsm_wire.lw_pattern
+#define lsm_stripe_count lsm_wire.lw_stripe_count
+
+/* compare all relevant fields. */
static inline int lov_stripe_md_cmp(struct lov_stripe_md *m1,
struct lov_stripe_md *m2)
{
- return memcmp(&m1->lsm_object_id, &m2->lsm_object_id,
- (char *)&m2->lsm_oinfo[0] - (char *)&m2->lsm_object_id);
+ /*
+ * ->lsm_wire contains padding, but it should be zeroed out during
+ * allocation.
+ */
+ return memcmp(&m1->lsm_wire, &m2->lsm_wire, sizeof m1->lsm_wire);
}
void lov_stripe_lock(struct lov_stripe_md *md);
struct ost_server_data;
+/* hold common fields for "target" device */
+struct obd_device_target {
+ struct super_block *obt_sb;
+ atomic_t obt_quotachecking;
+ struct lustre_quota_ctxt obt_qctxt;
+};
+
+#define FILTER_GROUP_LLOG 1
+#define FILTER_GROUP_ECHO 2
+
+struct filter_ext {
+ __u64 fe_start;
+ __u64 fe_end;
+};
+
struct filter_obd {
+ /* NB this field MUST be first */
+ struct obd_device_target fo_obt;
const char *fo_fstype;
- struct super_block *fo_sb;
struct vfsmount *fo_vfsmnt;
struct dentry *fo_dentry_O;
struct dentry **fo_dentry_O_groups;
struct dentry **fo_dentry_O_sub;
- spinlock_t fo_objidlock; /* protect fo_lastobjid increment */
- spinlock_t fo_translock; /* protect fsd_last_rcvd increment */
+ spinlock_t fo_objidlock; /* protect fo_lastobjid
+ * increment */
+
+ spinlock_t fo_translock; /* protect fsd_last_rcvd
+ * increment */
+
struct file *fo_rcvd_filp;
struct file *fo_health_check_filp;
struct lr_server_data *fo_fsd;
unsigned long *fo_last_rcvd_slots;
__u64 fo_mount_count;
- unsigned int fo_destroy_in_progress:1;
+ int fo_destroy_in_progress;
struct semaphore fo_create_lock;
struct file_operations *fo_fop;
*
* Locking: none, each OST thread uses only one element, determined by
* its "ordinal number", ->t_id.
- *
- * This is (void *) array, because 2.4 and 2.6 use different iobuf
- * structures.
*/
- void **fo_iobuf_pool;
+ struct filter_iobuf **fo_iobuf_pool;
int fo_iobuf_count;
struct obd_histogram fo_r_pages;
};
#define OSC_MAX_RIF_DEFAULT 8
-#define OSC_MAX_RIF_MAX 64
-#define OSC_MAX_DIRTY_DEFAULT 32
-#define OSC_MAX_DIRTY_MB_MAX 512 /* totally arbitrary */
-
-enum {
- CL_QUOTACHECKING = 1,
- CL_NO_QUOTACHECK
-};
+#define OSC_MAX_RIF_MAX 256
+#define OSC_MAX_DIRTY_DEFAULT (OSC_MAX_RIF_DEFAULT * 4)
+#define OSC_MAX_DIRTY_MB_MAX 2048 /* totally arbitrary */
struct mdc_rpc_lock;
struct client_obd {
struct osc_async_rc cl_ar;
/* used by quotacheck */
- spinlock_t cl_qchk_lock;
int cl_qchk_stat; /* quotacheck stat of the peer */
struct ptlrpc_request_pool *cl_rq_pool; /* emergency pool of requests */
};
+#define CL_NOT_QUOTACHECKED 1 /* client->cl_qchk_stat init value */
+
struct mgs_obd {
struct ptlrpc_service *mgs_service;
struct vfsmount *mgs_vfsmnt;
};
struct mds_obd {
+ /* NB this field MUST be first */
+ struct obd_device_target mds_obt;
struct ptlrpc_service *mds_service;
struct ptlrpc_service *mds_setattr_service;
struct ptlrpc_service *mds_readpage_service;
- struct super_block *mds_sb;
struct vfsmount *mds_vfsmnt;
struct dentry *mds_fid_de;
int mds_max_mdsize;
unsigned long *mds_client_bitmap;
struct semaphore mds_orphan_recovery_sem;
struct upcall_cache *mds_group_hash;
+
struct lustre_quota_info mds_quota_info;
- struct lustre_quota_ctxt mds_quota_ctxt;
- atomic_t mds_quotachecking;
+ struct semaphore mds_qonoff_sem;
struct semaphore mds_health_sem;
+ unsigned long mds_lov_objids_valid:1,
+ mds_fl_user_xattr:1,
+ mds_fl_acl:1;
};
struct echo_obd {
int oti_numcookies;
/* initial thread handling transaction */
- struct ptlrpc_thread *oti_thread;
+ int oti_thread_id;
};
+static inline void oti_init(struct obd_trans_info *oti,
+ struct ptlrpc_request *req)
+{
+ if (oti == NULL)
+ return;
+ memset(oti, 0, sizeof *oti);
+
+ if (req == NULL)
+ return;
+
+ if (req->rq_repmsg != NULL && req->rq_reqmsg != NULL)
+ oti->oti_transno = req->rq_repmsg->transno;
+ oti->oti_thread_id = req->rq_svc_thread ? req->rq_svc_thread->t_id : -1;
+}
+
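Illustrative use of oti_init() (the handler name is hypothetical): a service thread stack-allocates the obd_trans_info and seeds it from the incoming request before calling into the obd methods:

	static int example_handler(struct ptlrpc_request *req)
	{
		struct obd_trans_info oti;

		oti_init(&oti, req); /* zeroes oti, copies transno/thread id */
		/* ... pass &oti to obd_create()/obd_destroy() ... */
		return 0;
	}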
static inline void oti_alloc_cookies(struct obd_trans_info *oti,int num_cookies)
{
if (!oti)
LLOG_RD1_REPL_CTXT = 9,
LLOG_TEST_ORIG_CTXT = 10,
LLOG_TEST_REPL_CTXT = 11,
+ LLOG_LOVEA_ORIG_CTXT = 12,
+ LLOG_LOVEA_REPL_CTXT = 13,
LLOG_MAX_CTXTS
};
__u64 obd_last_committed;
struct fsfilt_operations *obd_fsops;
spinlock_t obd_osfs_lock;
- struct obd_statfs obd_osfs;
- unsigned long obd_osfs_age; /* jiffies */
+ struct obd_statfs obd_osfs; /* locked by obd_osfs_lock */
+ unsigned long obd_osfs_age; /* jiffies */
struct lvfs_run_ctxt obd_lvfs_ctxt;
struct llog_ctxt *obd_llog_ctxt[LLOG_MAX_CTXTS];
struct obd_device *obd_observer;
time_t obd_recovery_end;
union {
+ struct obd_device_target obt;
struct filter_obd filter;
struct mds_obd mds;
struct client_obd cli;
int (*o_packmd)(struct obd_export *exp, struct lov_mds_md **disk_tgt,
struct lov_stripe_md *mem_src);
int (*o_unpackmd)(struct obd_export *exp,struct lov_stripe_md **mem_tgt,
- struct lov_mds_md *disk_src, int disk_len);
+ struct lov_mds_md *disk_src, int disk_len);
+ int (*o_checkmd)(struct obd_export *exp, struct obd_export *md_exp,
+ struct lov_stripe_md *mem_tgt);
int (*o_preallocate)(struct lustre_handle *, obd_count *req,
obd_id *ids);
int (*o_create)(struct obd_export *exp, struct obdo *oa,
struct lov_stripe_md **ea, struct obd_trans_info *oti);
int (*o_destroy)(struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *ea, struct obd_trans_info *oti);
+ struct lov_stripe_md *ea, struct obd_trans_info *oti,
+ struct obd_export *md_exp);
int (*o_setattr)(struct obd_export *exp, struct obdo *oa,
struct lov_stripe_md *ea, struct obd_trans_info *oti);
int (*o_setattr_async)(struct obd_export *exp, struct obdo *oa,
*/
};
+struct lsm_operations {
+ void (*lsm_free)(struct lov_stripe_md *);
+ int (*lsm_destroy)(struct lov_stripe_md *, struct obdo *oa,
+ struct obd_export *md_exp);
+ void (*lsm_stripe_by_index)(struct lov_stripe_md *, int *, obd_off *,
+ unsigned long *);
+ void (*lsm_stripe_by_offset)(struct lov_stripe_md *, int *, obd_off *,
+ unsigned long *);
+ obd_off (*lsm_stripe_offset_by_index)(struct lov_stripe_md *, int);
+ int (*lsm_stripe_index_by_offset)(struct lov_stripe_md *, obd_off);
+ int (*lsm_revalidate) (struct lov_stripe_md *, struct obd_device *obd);
+ int (*lsm_lmm_verify) (struct lov_mds_md *lmm, int lmm_bytes,
+ int *stripe_count);
+ int (*lsm_unpackmd) (struct lov_obd *lov, struct lov_stripe_md *lsm,
+ struct lov_mds_md *lmm);
+};
+
+extern struct lsm_operations lsm_plain_ops;
+extern struct lsm_operations lsm_join_ops;
+static inline struct lsm_operations *lsm_op_find(int magic)
+{
+ switch (magic) {
+ case LOV_MAGIC:
+ return &lsm_plain_ops;
+ case LOV_MAGIC_JOIN:
+ return &lsm_join_ops;
+ default:
+ CERROR("Cannot recognize lsm_magic %d\n", magic);
+ return NULL;
+ }
+}
+
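Illustrative only: callers dispatch layout-specific operations through lsm_op_find(), keyed on the stripe-md magic, e.g. to free an lsm of either layout:

	static void lsm_free_any(struct lov_stripe_md *lsm)
	{
		struct lsm_operations *op = lsm_op_find(lsm->lsm_magic);

		if (op != NULL)
			op->lsm_free(lsm);
	}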
int lvfs_check_io_health(struct obd_device *obd, struct file *file);
static inline void obd_transno_commit_cb(struct obd_device *obd, __u64 transno,
}
}
+static inline void init_obd_quota_ops(quota_interface_t *interface,
+ struct obd_ops *obd_ops)
+{
+ if (!interface)
+ return;
+
+ LASSERT(obd_ops);
+ obd_ops->o_quotacheck = QUOTA_OP(interface, check);
+ obd_ops->o_quotactl = QUOTA_OP(interface, ctl);
+}
+
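Illustrative registration call (the obd_ops variable is hypothetical; osc_quota_interface is declared earlier in this patch). Passing a NULL interface leaves o_quotacheck/o_quotactl unset:

	init_obd_quota_ops(&osc_quota_interface, &osc_obd_ops);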
/* get/set_info keys */
#define KEY_MDS_CONN "mds_conn"
#define KEY_NEXT_ID "next_id"
return obd_unpackmd(exp, mem_tgt, NULL, 0);
}
+static inline int obd_checkmd(struct obd_export *exp,
+ struct obd_export *md_exp,
+ struct lov_stripe_md *mem_tgt)
+{
+ int rc;
+ ENTRY;
+
+ EXP_CHECK_OP(exp, checkmd);
+ OBD_COUNTER_INCREMENT(exp->exp_obd, checkmd);
+
+ rc = OBP(exp->exp_obd, checkmd)(exp, md_exp, mem_tgt);
+ RETURN(rc);
+}
+
static inline int obd_create(struct obd_export *exp, struct obdo *obdo,
struct lov_stripe_md **ea,
struct obd_trans_info *oti)
static inline int obd_destroy(struct obd_export *exp, struct obdo *obdo,
struct lov_stripe_md *ea,
- struct obd_trans_info *oti)
+ struct obd_trans_info *oti,
+ struct obd_export *md_exp)
{
int rc;
ENTRY;
EXP_CHECK_OP(exp, destroy);
OBD_COUNTER_INCREMENT(exp->exp_obd, destroy);
- rc = OBP(exp->exp_obd, destroy)(exp, obdo, ea, oti);
+ rc = OBP(exp->exp_obd, destroy)(exp, obdo, ea, oti, md_exp);
RETURN(rc);
}
return oa;
}
-/* qunit hash stuff */
-extern kmem_cache_t *qunit_cachep;
-extern struct list_head qunit_hash[];
-extern spinlock_t qunit_hash_lock;
-
static inline void obdo_free(struct obdo *oa)
{
OBD_SLAB_FREE(oa, obdo_cachep, sizeof(*oa));
#define OBD_FAIL_OBD_LOG_CANCEL_NET 0x601
#define OBD_FAIL_OBD_LOGD_NET 0x602
#define OBD_FAIL_OBD_QC_CALLBACK_NET 0x603
+#define OBD_FAIL_OBD_DQACQ 0x604
#define OBD_FAIL_TGT_REPLY_NET 0x700
#define OBD_FAIL_TGT_CONN_RACE 0x701
#include <lustre/lustre_user.h>
+typedef void (*llapi_cb_t)(char *obd_type_name, char *obd_name,
+                           char *obd_uuid, void *args);
+
/* liblustreapi.c */
extern int llapi_file_create(char *name, long stripe_size, int stripe_offset,
int stripe_count, int stripe_pattern);
extern int llapi_catinfo(char *dir, char *keyword, char *node_name);
extern int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count);
extern int llapi_is_lustre_mnttype(char *type);
+extern int llapi_quotachown(char *path, int flag);
extern int llapi_quotacheck(char *mnt, int check_type);
extern int llapi_poll_quotacheck(char *mnt, struct if_quotacheck *qchk);
extern int llapi_quotactl(char *mnt, struct if_quotactl *qctl);
-extern int llapi_quotachog(char *path, int flag);
+extern int llapi_target_iterate(int type_num, char **obd_type, void *args,
+                                llapi_cb_t cb);
#endif
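A hedged userspace sketch built only on the declarations above (the check_type value mirrors UGQUOTA, defined later in this patch; error handling is minimal):

	int example_quotacheck(char *mnt)
	{
		struct if_quotacheck qchk;
		int rc;

		rc = llapi_quotacheck(mnt, 2 /* UGQUOTA: user + group */);
		if (rc)
			return rc;
		return llapi_poll_quotacheck(mnt, &qchk);
	}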
#define LL_IOC_QUOTACHECK _IOW ('f', 160, int)
#define LL_IOC_POLL_QUOTACHECK _IOR ('f', 161, struct if_quotacheck *)
#define LL_IOC_QUOTACTL _IOWR('f', 162, struct if_quotactl *)
+#define LL_IOC_JOIN _IOW ('f', 163, long)
+
#define IOC_MDC_TYPE 'i'
#define IOC_MDC_GETSTRIPE _IOWR(IOC_MDC_TYPE, 21, struct lov_mds_md *)
#define IOC_MDC_GETFILEINFO _IOWR(IOC_MDC_TYPE, 22, struct lov_mds_data *)
#define O_LOV_DELAY_CREATE 0100000000 /* hopefully this does not conflict */
+#define O_JOIN_FILE 0400000000 /* hopefully this does not conflict */
#define LL_FILE_IGNORE_LOCK 0x00000001
#define LL_FILE_GROUP_LOCKED 0x00000002
#define LOV_USER_MAGIC_V1 0x0BD10BD0
#define LOV_USER_MAGIC LOV_USER_MAGIC_V1
+#define LOV_USER_MAGIC_JOIN 0x0BD20BD0
+
#define LOV_PATTERN_RAID0 0x001
#define LOV_PATTERN_RAID1 0x002
#define LOV_PATTERN_FIRST 0x100
struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */
} __attribute__((packed));
-#if defined(__x86_64__) || defined(__ia64__) || defined(__ppc64__)
+#if defined(__x86_64__) || defined(__ia64__) || defined(__ppc64__) || \
+ defined(__craynv)
typedef struct stat lstat_t;
#define HAVE_LOV_USER_MDS_DATA
#elif defined(__USE_LARGEFILE64) || defined(__KERNEL__)
} __attribute__((packed));
#endif
+struct lov_user_ost_data_join { /* per-stripe data structure */
+ __u64 l_extent_start; /* extent start */
+ __u64 l_extent_end; /* extent end */
+ __u64 l_object_id; /* OST object ID */
+ __u64 l_object_gr; /* OST object group (creating MDS number) */
+ __u32 l_ost_gen; /* generation of this OST index */
+ __u32 l_ost_idx; /* OST index in LOV */
+} __attribute__((packed));
+
+/* Identifier for a single log object */
+struct llog_logid {
+ __u64 lgl_oid;
+ __u64 lgl_ogr;
+ __u32 lgl_ogen;
+} __attribute__((packed));
+
+struct lov_user_md_join { /* LOV EA user data (host-endian) */
+ __u32 lmm_magic; /* magic number = LOV_MAGIC_JOIN */
+ __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
+ __u64 lmm_object_id; /* LOV object ID */
+ __u64 lmm_object_gr; /* LOV object group */
+ __u32 lmm_stripe_size; /* size of stripe in bytes */
+ __u32 lmm_stripe_count; /* num stripes in use for this object */
+ __u32 lmm_extent_count; /* extent count of lmm */
+ __u64 lmm_tree_id; /* mds tree object id */
+ __u64 lmm_tree_gen; /* mds tree object gen */
+ struct llog_logid lmm_array_id; /* mds extent desc llog object id */
+ struct lov_user_ost_data_join lmm_objects[0]; /* per-stripe data */
+} __attribute__((packed));
+
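For illustration (plain C arithmetic over the structs above): the joined-file EA is variable length, so its buffer size follows from the stripe count:

	static inline size_t lov_user_md_join_size(__u32 stripe_count)
	{
		return sizeof(struct lov_user_md_join) +
		       stripe_count * sizeof(struct lov_user_ost_data_join);
	}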
+
struct ll_recreate_obj {
__u64 lrc_id;
__u32 lrc_ost_idx;
uuid->uuid[sizeof(*uuid) - 1] = '\0';
}
+#define LUSTRE_Q_QUOTAON 0x800002 /* turn quotas on */
+#define LUSTRE_Q_QUOTAOFF 0x800003 /* turn quotas off */
+#define LUSTRE_Q_GETINFO 0x800005 /* get information about quota files */
+#define LUSTRE_Q_SETINFO 0x800006 /* set information about quota files */
+#define LUSTRE_Q_GETQUOTA 0x800007 /* get user quota structure */
+#define LUSTRE_Q_SETQUOTA 0x800008 /* set user quota structure */
+
#define UGQUOTA 2 /* set both USRQUOTA and GRPQUOTA */
#define QFMT_LDISKFS 2 /* QFMT_VFS_V0(2), quota format for ldiskfs */
struct if_quotacheck {
- char obd_type[10];
+ __u8 obd_type[16];
struct obd_uuid obd_uuid;
- int stat;
};
#define MDS_GRP_DOWNCALL_MAGIC 0x6d6dd620
# endif
#endif
-#ifdef HAVE_QUOTA_SUPPORT
-
#ifdef NEED_QUOTA_DEFS
#ifndef QUOTABLOCK_BITS
#define QUOTABLOCK_BITS 10
#define toqb(x) (((x) + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS)
#endif
-/* XXX: these two structs should be in /usr/include/linux/quota.h */
-#ifndef HAVE_STRUCT_IF_DQINFO
-struct if_dqinfo {
+#ifndef QIF_BLIMITS
+#define QIF_BLIMITS 1
+#define QIF_SPACE 2
+#define QIF_ILIMITS 4
+#define QIF_INODES 8
+#define QIF_BTIME 16
+#define QIF_ITIME 32
+#define QIF_LIMITS (QIF_BLIMITS | QIF_ILIMITS)
+#define QIF_USAGE (QIF_SPACE | QIF_INODES)
+#define QIF_TIMES (QIF_BTIME | QIF_ITIME)
+#define QIF_ALL (QIF_LIMITS | QIF_USAGE | QIF_TIMES)
+#endif
+
+#endif /* !__KERNEL__ */
+
+/* XXX: same as if_dqinfo struct in kernel */
+struct obd_dqinfo {
__u64 dqi_bgrace;
__u64 dqi_igrace;
__u32 dqi_flags;
__u32 dqi_valid;
};
-#endif
-#ifndef HAVE_STRUCT_IF_DQBLK
-struct if_dqblk {
+/* XXX: same as if_dqblk struct in kernel, plus one padding field */
+struct obd_dqblk {
__u64 dqb_bhardlimit;
__u64 dqb_bsoftlimit;
__u64 dqb_curspace;
__u64 dqb_btime;
__u64 dqb_itime;
__u32 dqb_valid;
+ __u32 padding;
};
-#endif
-
-#ifndef QIF_BLIMITS
-#define QIF_BLIMITS 1
-#define QIF_SPACE 2
-#define QIF_ILIMITS 4
-#define QIF_INODES 8
-#define QIF_BTIME 16
-#define QIF_ITIME 32
-#define QIF_LIMITS (QIF_BLIMITS | QIF_ILIMITS)
-#define QIF_USAGE (QIF_SPACE | QIF_INODES)
-#define QIF_TIMES (QIF_BTIME | QIF_ITIME)
-#define QIF_ALL (QIF_LIMITS | QIF_USAGE | QIF_TIMES)
-#endif
-
-#endif /* NEED_QUOTA_DEFS */
struct if_quotactl {
- int qc_cmd;
- int qc_type;
- int qc_id;
- int qc_stat;
- struct if_dqinfo qc_dqinfo;
- struct if_dqblk qc_dqblk;
- char obd_type[10];
+ __u32 qc_cmd;
+ __u32 qc_type;
+ __u32 qc_id;
+ __u32 qc_stat;
+ struct obd_dqinfo qc_dqinfo;
+ struct obd_dqblk qc_dqblk;
+ __u8 obd_type[16];
struct obd_uuid obd_uuid;
};
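A hedged sketch of filling if_quotactl for LUSTRE_Q_SETQUOTA and handing it to llapi_quotactl() (declared earlier; type 0 means user quota, matching USRQUOTA):

	#include <string.h>

	int example_set_block_limit(char *mnt, unsigned int uid,
				    unsigned long long hardlimit)
	{
		struct if_quotactl qctl;

		memset(&qctl, 0, sizeof(qctl));
		qctl.qc_cmd = LUSTRE_Q_SETQUOTA;
		qctl.qc_type = 0; /* user quota */
		qctl.qc_id = uid;
		qctl.qc_dqblk.dqb_bhardlimit = hardlimit; /* quota blocks */
		qctl.qc_dqblk.dqb_valid = QIF_BLIMITS;
		return llapi_quotactl(mnt, &qctl);
	}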
-#else
-
-struct if_quotactl {
-};
-
-#endif /* HAVE_QUOTA_SUPPORT */
-
#ifndef LPU64
/* x86_64 defines __u64 as "long" in userspace, but "long long" in the kernel */
#if defined(__x86_64__) && defined(__KERNEL__)
# CONFIG_IA64_DEBUG_CMPXCHG is not set
# CONFIG_IA64_DEBUG_IRQ is not set
CONFIG_KALLSYMS=y
+CONFIG_IEEE1394=m
+CONFIG_IEEE1394_PCILYNX=m
+CONFIG_IEEE1394_OHCI1394=m
+CONFIG_IEEE1394_SBP2=m
+CONFIG_IEEE1394_SBP2_PHYS_DMA=y
# CONFIG_IA64_DEBUG_CMPXCHG is not set
# CONFIG_IA64_DEBUG_IRQ is not set
CONFIG_KALLSYMS=y
+CONFIG_IEEE1394=m
+CONFIG_IEEE1394_PCILYNX=m
+CONFIG_IEEE1394_OHCI1394=m
+CONFIG_IEEE1394_SBP2=m
+CONFIG_IEEE1394_SBP2_PHYS_DMA=y
# IEEE 1394 (FireWire) support (EXPERIMENTAL)
#
CONFIG_IEEE1394=m
-# CONFIG_IEEE1394_PCILYNX is not set
+CONFIG_IEEE1394_PCILYNX=m
CONFIG_IEEE1394_OHCI1394=m
CONFIG_IEEE1394_VIDEO1394=m
CONFIG_IEEE1394_SBP2=m
# IEEE 1394 (FireWire) support (EXPERIMENTAL)
#
CONFIG_IEEE1394=m
-# CONFIG_IEEE1394_PCILYNX is not set
+CONFIG_IEEE1394_PCILYNX=m
CONFIG_IEEE1394_OHCI1394=m
CONFIG_IEEE1394_VIDEO1394=m
CONFIG_IEEE1394_SBP2=m
CONFIG_BLK_DEV_INITRD=y
CONFIG_BLK_STATS=y
CONFIG_DISKDUMP=m
+CONFIG_BLOCKDUMP=m
#
# Multi-device support (RAID and LVM)
#
# Device Drivers
#
-# CONFIG_IEEE1394_PCILYNX is not set
+CONFIG_IEEE1394_PCILYNX=m
CONFIG_IEEE1394_OHCI1394=m
#
# CONFIG_QIC02_TAPE is not set
CONFIG_IPMI_HANDLER=m
# CONFIG_IPMI_PANIC_EVENT is not set
+# CONFIG_IPMI_PANIC_STRING is not set
CONFIG_IPMI_DEVICE_INTERFACE=m
+CONFIG_IPMI_SI=m
CONFIG_IPMI_KCS=m
CONFIG_IPMI_WATCHDOG=m
+CONFIG_IPMI_POWEROFF=m
#
# Watchdog Cards
CONFIG_BLK_DEV_INITRD=y
CONFIG_BLK_STATS=y
CONFIG_DISKDUMP=m
+CONFIG_BLOCKDUMP=m
#
# Multi-device support (RAID and LVM)
#
# Device Drivers
#
-# CONFIG_IEEE1394_PCILYNX is not set
+CONFIG_IEEE1394_PCILYNX=m
CONFIG_IEEE1394_OHCI1394=m
#
# CONFIG_QIC02_TAPE is not set
CONFIG_IPMI_HANDLER=m
# CONFIG_IPMI_PANIC_EVENT is not set
+# CONFIG_IPMI_PANIC_STRING is not set
CONFIG_IPMI_DEVICE_INTERFACE=m
+CONFIG_IPMI_SI=m
CONFIG_IPMI_KCS=m
CONFIG_IPMI_WATCHDOG=m
+CONFIG_IPMI_POWEROFF=m
#
# Watchdog Cards
CONFIG_BLK_DEV_INITRD=y
CONFIG_BLK_STATS=y
CONFIG_DISKDUMP=m
+CONFIG_BLOCKDUMP=m
#
# IEEE 1394 (FireWire) support (EXPERIMENTAL)
# CONFIG_QIC02_TAPE is not set
CONFIG_IPMI_HANDLER=m
# CONFIG_IPMI_PANIC_EVENT is not set
+# CONFIG_IPMI_PANIC_STRING is not set
CONFIG_IPMI_DEVICE_INTERFACE=m
+CONFIG_IPMI_SI=m
CONFIG_IPMI_KCS=m
CONFIG_IPMI_WATCHDOG=m
+CONFIG_IPMI_POWEROFF=m
#
# Watchdog Cards
# CONFIG_IA64_DEBUG_CMPXCHG is not set
# CONFIG_IA64_DEBUG_IRQ is not set
CONFIG_KALLSYMS=y
+CONFIG_IEEE1394_PCILYNX=m
CONFIG_BLK_DEV_INITRD=y
CONFIG_BLK_STATS=y
CONFIG_DISKDUMP=m
+CONFIG_BLOCKDUMP=m
#
# IEEE 1394 (FireWire) support (EXPERIMENTAL)
# CONFIG_QIC02_TAPE is not set
CONFIG_IPMI_HANDLER=m
# CONFIG_IPMI_PANIC_EVENT is not set
+# CONFIG_IPMI_PANIC_STRING is not set
CONFIG_IPMI_DEVICE_INTERFACE=m
+CONFIG_IPMI_SI=m
CONFIG_IPMI_KCS=m
CONFIG_IPMI_WATCHDOG=m
+CONFIG_IPMI_POWEROFF=m
#
# Watchdog Cards
# CONFIG_IA64_DEBUG_CMPXCHG is not set
# CONFIG_IA64_DEBUG_IRQ is not set
CONFIG_KALLSYMS=y
+CONFIG_IEEE1394_PCILYNX=m
CONFIG_BLK_DEV_INITRD=y
CONFIG_BLK_STATS=y
CONFIG_DISKDUMP=m
+CONFIG_BLOCKDUMP=m
#
# Multi-device support (RAID and LVM)
# CONFIG_QIC02_TAPE is not set
CONFIG_IPMI_HANDLER=m
# CONFIG_IPMI_PANIC_EVENT is not set
+# CONFIG_IPMI_PANIC_STRING is not set
CONFIG_IPMI_DEVICE_INTERFACE=m
+CONFIG_IPMI_SI=m
CONFIG_IPMI_KCS=m
CONFIG_IPMI_WATCHDOG=m
+CONFIG_IPMI_POWEROFF=m
#
# Watchdog Cards
CONFIG_BLK_DEV_INITRD=y
CONFIG_BLK_STATS=y
CONFIG_DISKDUMP=m
+CONFIG_BLOCKDUMP=m
#
# Multi-device support (RAID and LVM)
# CONFIG_QIC02_TAPE is not set
CONFIG_IPMI_HANDLER=m
# CONFIG_IPMI_PANIC_EVENT is not set
+# CONFIG_IPMI_PANIC_STRING is not set
CONFIG_IPMI_DEVICE_INTERFACE=m
+CONFIG_IPMI_SI=m
CONFIG_IPMI_KCS=m
CONFIG_IPMI_WATCHDOG=m
+CONFIG_IPMI_POWEROFF=m
#
# Watchdog Cards
CONFIG_ZLIB_INFLATE=y
CONFIG_ZLIB_DEFLATE=y
CONFIG_QSORT=y
+CONFIG_IEEE1394_PCILYNX=m
CONFIG_IEEE1394_OHCI1394=m
CONFIG_IEEE1394_VIDEO1394=m
CONFIG_IEEE1394_SBP2=m
-# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set
+CONFIG_IEEE1394_SBP2_PHYS_DMA=y
CONFIG_IEEE1394_ETH1394=m
CONFIG_IEEE1394_DV1394=m
CONFIG_IEEE1394_RAWIO=m
CONFIG_IEEE1394_OHCI1394=m
CONFIG_IEEE1394_VIDEO1394=m
CONFIG_IEEE1394_SBP2=m
-# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set
+CONFIG_IEEE1394_SBP2_PHYS_DMA=y
CONFIG_IEEE1394_ETH1394=m
CONFIG_IEEE1394_DV1394=m
CONFIG_IEEE1394_RAWIO=m
CONFIG_IEEE1394_OHCI1394=m
CONFIG_IEEE1394_VIDEO1394=m
CONFIG_IEEE1394_SBP2=m
-# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set
+CONFIG_IEEE1394_SBP2_PHYS_DMA=y
CONFIG_IEEE1394_ETH1394=m
CONFIG_IEEE1394_DV1394=m
CONFIG_IEEE1394_RAWIO=m
#
CONFIG_IEEE1394_VIDEO1394=m
CONFIG_IEEE1394_SBP2=m
-# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set
+CONFIG_IEEE1394_SBP2_PHYS_DMA=y
CONFIG_IEEE1394_ETH1394=m
CONFIG_IEEE1394_DV1394=m
CONFIG_IEEE1394_RAWIO=m
#
CONFIG_IEEE1394_VIDEO1394=m
CONFIG_IEEE1394_SBP2=m
-# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set
+CONFIG_IEEE1394_SBP2_PHYS_DMA=y
CONFIG_IEEE1394_ETH1394=m
CONFIG_IEEE1394_DV1394=m
CONFIG_IEEE1394_RAWIO=m
#
CONFIG_IEEE1394_VIDEO1394=m
CONFIG_IEEE1394_SBP2=m
-# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set
+CONFIG_IEEE1394_SBP2_PHYS_DMA=y
CONFIG_IEEE1394_ETH1394=m
CONFIG_IEEE1394_DV1394=m
CONFIG_IEEE1394_RAWIO=m
#
CONFIG_IEEE1394_VIDEO1394=m
CONFIG_IEEE1394_SBP2=m
-# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set
+CONFIG_IEEE1394_SBP2_PHYS_DMA=y
CONFIG_IEEE1394_ETH1394=m
CONFIG_IEEE1394_DV1394=m
CONFIG_IEEE1394_RAWIO=m
#
CONFIG_IEEE1394_VIDEO1394=m
CONFIG_IEEE1394_SBP2=m
-# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set
+CONFIG_IEEE1394_SBP2_PHYS_DMA=y
CONFIG_IEEE1394_ETH1394=m
CONFIG_IEEE1394_DV1394=m
CONFIG_IEEE1394_RAWIO=m
#
CONFIG_IEEE1394_VIDEO1394=m
CONFIG_IEEE1394_SBP2=m
-# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set
+CONFIG_IEEE1394_SBP2_PHYS_DMA=y
CONFIG_IEEE1394_ETH1394=m
CONFIG_IEEE1394_DV1394=m
CONFIG_IEEE1394_RAWIO=m
CONFIG_SUSE_KERNEL=y
CONFIG_CFGNAME="pseries64"
CONFIG_RELEASE="7.141"
+CONFIG_IEEE1394=m
+CONFIG_IEEE1394_PCILYNX=m
#
CONFIG_IEEE1394_VIDEO1394=m
CONFIG_IEEE1394_SBP2=m
-# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set
+CONFIG_IEEE1394_SBP2_PHYS_DMA=y
CONFIG_IEEE1394_ETH1394=m
CONFIG_IEEE1394_DV1394=m
CONFIG_IEEE1394_RAWIO=m
CONFIG_SUSE_KERNEL=y
CONFIG_CFGNAME="pseries64"
CONFIG_RELEASE="SLES9_SP1_BRANCH_91"
+CONFIG_IEEE1394=m
+CONFIG_IEEE1394_PCILYNX=m
#
CONFIG_IEEE1394_VIDEO1394=m
CONFIG_IEEE1394_SBP2=m
-# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set
+CONFIG_IEEE1394_SBP2_PHYS_DMA=y
CONFIG_IEEE1394_ETH1394=m
CONFIG_IEEE1394_DV1394=m
CONFIG_IEEE1394_RAWIO=m
#
CONFIG_IEEE1394_VIDEO1394=m
CONFIG_IEEE1394_SBP2=m
-# CONFIG_IEEE1394_SBP2_PHYS_DMA is not set
+CONFIG_IEEE1394_SBP2_PHYS_DMA=y
CONFIG_IEEE1394_ETH1394=m
CONFIG_IEEE1394_DV1394=m
CONFIG_IEEE1394_RAWIO=m
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.9-prep
-# Fri May 13 14:09:31 2005
+# Linux kernel version: 2.6.9-prep.qp2.2.5.11.3qsnet
+# Thu Oct 27 17:02:11 2005
#
CONFIG_X86=y
CONFIG_MMU=y
CONFIG_SYSCTL=y
CONFIG_AUDIT=y
CONFIG_AUDITSYSCALL=y
+# CONFIG_AUDITFILESYSTEM is not set
CONFIG_LOG_BUF_SHIFT=17
CONFIG_HOTPLUG=y
# CONFIG_IKCONFIG is not set
#
# Processor type and features
#
+CONFIG_MEM_MIRROR=y
# CONFIG_X86_PC is not set
# CONFIG_X86_ELAN is not set
# CONFIG_X86_VOYAGER is not set
CONFIG_MICROCODE=m
CONFIG_X86_MSR=m
CONFIG_X86_CPUID=m
-CONFIG_IOPROC=y
-CONFIG_PTRACK=y
-
-#
-# Quadrics QsNet
-#
-CONFIG_QSNET=m
-CONFIG_ELAN3=m
-CONFIG_ELAN4=m
-CONFIG_EP=m
-CONFIG_EIP=m
-CONFIG_RMS=m
-CONFIG_JTAG=m
-CONFIG_NET_FC=y
-CONFIG_SHAPER=m
-CONFIG_NETCONSOLE=m
-
#
# Firmware Drivers
# CONFIG_IRQBALANCE is not set
CONFIG_HAVE_DEC_LOCK=y
CONFIG_REGPARM=y
-CONFIG_KEXEC=y
+CONFIG_IOPROC=y
+CONFIG_PTRACK=y
#
# Power management options (ACPI, APM)
#
CONFIG_SCSI_SPI_ATTRS=m
CONFIG_SCSI_FC_ATTRS=m
+CONFIG_SCSI_ISCSI_ATTRS=m
#
# SCSI low-level drivers
CONFIG_SCSI_IPS=m
CONFIG_SCSI_INITIO=m
# CONFIG_SCSI_INIA100 is not set
+CONFIG_SCSI_ISCSI_SFNET=m
CONFIG_SCSI_PPA=m
CONFIG_SCSI_IMM=m
# CONFIG_SCSI_IZIP_EPP16 is not set
CONFIG_SCSI_QLA2300=m
CONFIG_SCSI_QLA2322=m
CONFIG_SCSI_QLA6312=m
-CONFIG_SCSI_QLA6322=m
+CONFIG_SCSI_QLA24XX=m
# CONFIG_SCSI_SYM53C416 is not set
# CONFIG_SCSI_DC395x is not set
# CONFIG_SCSI_DC390T is not set
CONFIG_DM_SNAPSHOT=m
CONFIG_DM_MIRROR=m
CONFIG_DM_ZERO=m
+CONFIG_DM_MULTIPATH=m
+CONFIG_DM_MULTIPATH_EMC=m
#
# Fusion MPT device support
#
# IEEE 1394 (FireWire) support
#
-# CONFIG_IEEE1394 is not set
+CONFIG_IEEE1394=m
+
+#
+# Subsystem Options
+#
+# CONFIG_IEEE1394_VERBOSEDEBUG is not set
+# CONFIG_IEEE1394_OUI_DB is not set
+CONFIG_IEEE1394_EXTRA_CONFIG_ROMS=y
+CONFIG_IEEE1394_CONFIG_ROM_IP1394=y
+
+#
+# Device Drivers
+#
+CONFIG_IEEE1394_PCILYNX=m
+CONFIG_IEEE1394_OHCI1394=m
+
+#
+# Protocol Drivers
+#
+CONFIG_IEEE1394_VIDEO1394=m
+CONFIG_IEEE1394_SBP2=m
+CONFIG_IEEE1394_SBP2_PHYS_DMA=y
+CONFIG_IEEE1394_ETH1394=m
+CONFIG_IEEE1394_DV1394=m
+CONFIG_IEEE1394_RAWIO=m
+CONFIG_IEEE1394_CMP=m
+CONFIG_IEEE1394_AMDTP=m
#
# I2O device support
CONFIG_SMCTR=m
#
+# Quadrics QsNet
+#
+CONFIG_QSNET=m
+CONFIG_ELAN3=m
+CONFIG_ELAN4=m
+CONFIG_EP=m
+CONFIG_EIP=m
+CONFIG_RMS=m
+CONFIG_JTAG=m
+
+#
# Wireless LAN (non-hamradio)
#
CONFIG_NET_RADIO=y
CONFIG_IEEE80211_CRYPT_CCMP=m
CONFIG_IEEE80211_CRYPT_TKIP=m
CONFIG_IPW2100=m
-# CONFIG_IPW_DEBUG is not set
CONFIG_IPW2100_PROMISC=y
-# CONFIG_IPW2100_LEGACY_FW_LOAD is not set
+# CONFIG_IPW_DEBUG is not set
CONFIG_IPW2200=m
CONFIG_AIRO=m
CONFIG_HERMES=m
CONFIG_SND_AU8820=m
CONFIG_SND_AU8830=m
CONFIG_SND_AZT3328=m
+CONFIG_SND_AZX=m
CONFIG_SND_BT87X=m
CONFIG_SND_CS46XX=m
CONFIG_SND_CS46XX_NEW_DSP=y
CONFIG_HUGETLBFS=y
CONFIG_HUGETLB_PAGE=y
CONFIG_RAMFS=y
+CONFIG_RELAYFS_FS=y
#
# Miscellaneous filesystems
# CONFIG_FRAME_POINTER is not set
CONFIG_EARLY_PRINTK=y
CONFIG_DEBUG_STACKOVERFLOW=y
-# CONFIG_KPROBES is not set
+CONFIG_KPROBES=y
CONFIG_DEBUG_STACK_USAGE=y
# CONFIG_DEBUG_PAGEALLOC is not set
+# CONFIG_4KSTACKS is not set
# CONFIG_SCHEDSTATS is not set
CONFIG_X86_FIND_SMP_CONFIG=y
CONFIG_X86_MPPARSE=y
#
# Security options
#
+CONFIG_KEYS=y
+CONFIG_KEYS_DEBUG_PROC_KEYS=y
CONFIG_SECURITY=y
CONFIG_SECURITY_NETWORK=y
CONFIG_SECURITY_CAPABILITIES=y
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.9-prep
-# Fri May 13 14:09:31 2005
+# Linux kernel version: 2.6.9-prep.qp2.2.5.11.3qsnet
+# Thu Oct 27 17:01:23 2005
#
CONFIG_X86=y
CONFIG_MMU=y
CONFIG_SYSCTL=y
CONFIG_AUDIT=y
CONFIG_AUDITSYSCALL=y
+# CONFIG_AUDITFILESYSTEM is not set
CONFIG_LOG_BUF_SHIFT=17
CONFIG_HOTPLUG=y
# CONFIG_IKCONFIG is not set
#
# Processor type and features
#
+CONFIG_MEM_MIRROR=y
# CONFIG_X86_PC is not set
# CONFIG_X86_ELAN is not set
# CONFIG_X86_VOYAGER is not set
CONFIG_MICROCODE=m
CONFIG_X86_MSR=m
CONFIG_X86_CPUID=m
-CONFIG_IOPROC=y
-CONFIG_PTRACK=y
-
-#
-# Quadrics QsNet
-#
-CONFIG_QSNET=m
-CONFIG_ELAN3=m
-CONFIG_ELAN4=m
-CONFIG_EP=m
-CONFIG_EIP=m
-CONFIG_RMS=m
-CONFIG_JTAG=m
-CONFIG_NET_FC=y
-CONFIG_SHAPER=m
-CONFIG_NETCONSOLE=m
-
#
# Firmware Drivers
# CONFIG_IRQBALANCE is not set
CONFIG_HAVE_DEC_LOCK=y
CONFIG_REGPARM=y
-CONFIG_KEXEC=y
+CONFIG_IOPROC=y
+CONFIG_PTRACK=y
#
# Power management options (ACPI, APM)
#
CONFIG_SCSI_SPI_ATTRS=m
CONFIG_SCSI_FC_ATTRS=m
+CONFIG_SCSI_ISCSI_ATTRS=m
#
# SCSI low-level drivers
CONFIG_SCSI_IPS=m
CONFIG_SCSI_INITIO=m
# CONFIG_SCSI_INIA100 is not set
+CONFIG_SCSI_ISCSI_SFNET=m
CONFIG_SCSI_PPA=m
CONFIG_SCSI_IMM=m
# CONFIG_SCSI_IZIP_EPP16 is not set
CONFIG_SCSI_QLA2300=m
CONFIG_SCSI_QLA2322=m
CONFIG_SCSI_QLA6312=m
-CONFIG_SCSI_QLA6322=m
+CONFIG_SCSI_QLA24XX=m
# CONFIG_SCSI_SYM53C416 is not set
# CONFIG_SCSI_DC395x is not set
# CONFIG_SCSI_DC390T is not set
CONFIG_DM_SNAPSHOT=m
CONFIG_DM_MIRROR=m
CONFIG_DM_ZERO=m
+CONFIG_DM_MULTIPATH=m
+CONFIG_DM_MULTIPATH_EMC=m
#
# Fusion MPT device support
#
# IEEE 1394 (FireWire) support
#
-# CONFIG_IEEE1394 is not set
+CONFIG_IEEE1394=m
+
+#
+# Subsystem Options
+#
+# CONFIG_IEEE1394_VERBOSEDEBUG is not set
+# CONFIG_IEEE1394_OUI_DB is not set
+# CONFIG_IEEE1394_EXTRA_CONFIG_ROMS is not set
+
+#
+# Device Drivers
+#
+CONFIG_IEEE1394_PCILYNX=m
+CONFIG_IEEE1394_OHCI1394=y
+
+#
+# Protocol Drivers
+#
+CONFIG_IEEE1394_VIDEO1394=m
+CONFIG_IEEE1394_SBP2=m
+CONFIG_IEEE1394_SBP2_PHYS_DMA=y
+CONFIG_IEEE1394_ETH1394=m
+CONFIG_IEEE1394_DV1394=m
+CONFIG_IEEE1394_RAWIO=m
+CONFIG_IEEE1394_CMP=m
+CONFIG_IEEE1394_AMDTP=m
#
# I2O device support
CONFIG_SMCTR=m
#
+# Quadrics QsNet
+#
+CONFIG_QSNET=m
+CONFIG_ELAN3=m
+CONFIG_ELAN4=m
+CONFIG_EP=m
+CONFIG_EIP=m
+CONFIG_RMS=m
+CONFIG_JTAG=m
+
+#
# Wireless LAN (non-hamradio)
#
CONFIG_NET_RADIO=y
CONFIG_IEEE80211_CRYPT_CCMP=m
CONFIG_IEEE80211_CRYPT_TKIP=m
CONFIG_IPW2100=m
-# CONFIG_IPW_DEBUG is not set
CONFIG_IPW2100_PROMISC=y
-# CONFIG_IPW2100_LEGACY_FW_LOAD is not set
+# CONFIG_IPW_DEBUG is not set
CONFIG_IPW2200=m
CONFIG_AIRO=m
CONFIG_HERMES=m
CONFIG_SND_AU8820=m
CONFIG_SND_AU8830=m
CONFIG_SND_AZT3328=m
+CONFIG_SND_AZX=m
CONFIG_SND_BT87X=m
CONFIG_SND_CS46XX=m
CONFIG_SND_CS46XX_NEW_DSP=y
CONFIG_HUGETLBFS=y
CONFIG_HUGETLB_PAGE=y
CONFIG_RAMFS=y
+CONFIG_RELAYFS_FS=y
#
# Miscellaneous filesystems
# CONFIG_FRAME_POINTER is not set
CONFIG_EARLY_PRINTK=y
CONFIG_DEBUG_STACKOVERFLOW=y
-# CONFIG_KPROBES is not set
+CONFIG_KPROBES=y
CONFIG_DEBUG_STACK_USAGE=y
# CONFIG_DEBUG_PAGEALLOC is not set
+# CONFIG_4KSTACKS is not set
# CONFIG_SCHEDSTATS is not set
CONFIG_X86_FIND_SMP_CONFIG=y
CONFIG_X86_MPPARSE=y
#
# Security options
#
+CONFIG_KEYS=y
+CONFIG_KEYS_DEBUG_PROC_KEYS=y
CONFIG_SECURITY=y
CONFIG_SECURITY_NETWORK=y
CONFIG_SECURITY_CAPABILITIES=y
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.9-5.0.3.EL_lustre-b1_4_rhel4.200503031449smp
-# Thu Mar 3 14:52:42 2005
+# Linux kernel version: 2.6.9-prep.qp2.2.5.11.3qsnet
+# Thu Oct 27 17:05:00 2005
#
#
CONFIG_SYSCTL=y
CONFIG_AUDIT=y
CONFIG_AUDITSYSCALL=y
+# CONFIG_AUDITFILESYSTEM is not set
CONFIG_LOG_BUF_SHIFT=17
CONFIG_HOTPLUG=y
# CONFIG_IKCONFIG is not set
CONFIG_PTRACK=y
#
-# Quadrics QsNet
-#
-CONFIG_QSNET=m
-CONFIG_ELAN3=m
-CONFIG_ELAN4=m
-CONFIG_EP=m
-CONFIG_EIP=m
-CONFIG_RMS=m
-CONFIG_JTAG=m
-CONFIG_NET_FC=y
-CONFIG_SHAPER=m
-CONFIG_NETCONSOLE=m
-
-
-#
# Firmware Drivers
#
CONFIG_EFI_VARS=y
#
CONFIG_SCSI_SPI_ATTRS=m
CONFIG_SCSI_FC_ATTRS=m
+CONFIG_SCSI_ISCSI_ATTRS=m
#
# SCSI low-level drivers
CONFIG_SCSI_IPS=m
CONFIG_SCSI_INITIO=m
# CONFIG_SCSI_INIA100 is not set
+CONFIG_SCSI_ISCSI_SFNET=m
CONFIG_SCSI_PPA=m
CONFIG_SCSI_IMM=m
# CONFIG_SCSI_IZIP_EPP16 is not set
CONFIG_SCSI_QLA2300=m
CONFIG_SCSI_QLA2322=m
CONFIG_SCSI_QLA6312=m
-CONFIG_SCSI_QLA6322=m
+CONFIG_SCSI_QLA24XX=m
# CONFIG_SCSI_DC395x is not set
# CONFIG_SCSI_DC390T is not set
# CONFIG_SCSI_DEBUG is not set
CONFIG_DM_SNAPSHOT=m
CONFIG_DM_MIRROR=m
CONFIG_DM_ZERO=m
+CONFIG_DM_MULTIPATH=m
+CONFIG_DM_MULTIPATH_EMC=m
#
# Fusion MPT device support
#
# IEEE 1394 (FireWire) support
#
-# CONFIG_IEEE1394 is not set
+CONFIG_IEEE1394=m
#
# I2O device support
CONFIG_ABYSS=m
#
+# Quadrics QsNet
+#
+CONFIG_QSNET=m
+CONFIG_ELAN3=m
+CONFIG_ELAN4=m
+CONFIG_EP=m
+CONFIG_EIP=m
+CONFIG_RMS=m
+CONFIG_JTAG=m
+
+#
# Wireless LAN (non-hamradio)
#
CONFIG_NET_RADIO=y
CONFIG_SND_AU8820=m
CONFIG_SND_AU8830=m
CONFIG_SND_AZT3328=m
+CONFIG_SND_AZX=m
CONFIG_SND_BT87X=m
CONFIG_SND_CS46XX=m
CONFIG_SND_CS46XX_NEW_DSP=y
CONFIG_HUGETLBFS=y
CONFIG_HUGETLB_PAGE=y
CONFIG_RAMFS=y
+CONFIG_RELAYFS_FS=y
#
# Miscellaneous filesystems
CONFIG_DEBUG_SPINLOCK=y
CONFIG_DEBUG_SPINLOCK_SLEEP=y
# CONFIG_DEBUG_INFO is not set
+CONFIG_KPROBES=y
CONFIG_IA64_GRANULE_16MB=y
# CONFIG_IA64_GRANULE_64MB is not set
# CONFIG_IA64_PRINT_HAZARDS is not set
#
# Security options
#
+CONFIG_KEYS=y
+CONFIG_KEYS_DEBUG_PROC_KEYS=y
CONFIG_SECURITY=y
CONFIG_SECURITY_NETWORK=y
CONFIG_SECURITY_CAPABILITIES=y
CONFIG_CRYPTO_SIGNATURE=y
CONFIG_CRYPTO_SIGNATURE_DSA=y
CONFIG_CRYPTO_MPILIB=y
+CONFIG_IEEE1394_PCILYNX=m
+CONFIG_IEEE1394_OHCI1394=y
+CONFIG_IEEE1394_SBP2=m
+CONFIG_IEEE1394_SBP2_PHYS_DMA=y
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.9-5.0.3.EL_lustre-b1_4_rhel4.200503031449smp
-# Thu Mar 3 14:52:42 2005
+# Linux kernel version: 2.6.9-prep.qp2.2.5.11.3qsnet
+# Thu Oct 27 17:04:10 2005
#
#
CONFIG_SYSCTL=y
CONFIG_AUDIT=y
CONFIG_AUDITSYSCALL=y
+# CONFIG_AUDITFILESYSTEM is not set
CONFIG_LOG_BUF_SHIFT=17
CONFIG_HOTPLUG=y
# CONFIG_IKCONFIG is not set
CONFIG_PTRACK=y
#
-# Quadrics QsNet
-#
-CONFIG_QSNET=m
-CONFIG_ELAN3=m
-CONFIG_ELAN4=m
-CONFIG_EP=m
-CONFIG_EIP=m
-CONFIG_RMS=m
-CONFIG_JTAG=m
-CONFIG_NET_FC=y
-CONFIG_SHAPER=m
-CONFIG_NETCONSOLE=m
-
-
-#
# Firmware Drivers
#
CONFIG_EFI_VARS=y
#
CONFIG_SCSI_SPI_ATTRS=m
CONFIG_SCSI_FC_ATTRS=m
+CONFIG_SCSI_ISCSI_ATTRS=m
#
# SCSI low-level drivers
CONFIG_SCSI_IPS=m
CONFIG_SCSI_INITIO=m
# CONFIG_SCSI_INIA100 is not set
+CONFIG_SCSI_ISCSI_SFNET=m
CONFIG_SCSI_PPA=m
CONFIG_SCSI_IMM=m
# CONFIG_SCSI_IZIP_EPP16 is not set
CONFIG_SCSI_QLA2300=m
CONFIG_SCSI_QLA2322=m
CONFIG_SCSI_QLA6312=m
-CONFIG_SCSI_QLA6322=m
+CONFIG_SCSI_QLA24XX=m
# CONFIG_SCSI_DC395x is not set
# CONFIG_SCSI_DC390T is not set
# CONFIG_SCSI_DEBUG is not set
CONFIG_DM_SNAPSHOT=m
CONFIG_DM_MIRROR=m
CONFIG_DM_ZERO=m
+CONFIG_DM_MULTIPATH=m
+CONFIG_DM_MULTIPATH_EMC=m
#
# Fusion MPT device support
#
# IEEE 1394 (FireWire) support
#
-# CONFIG_IEEE1394 is not set
+CONFIG_IEEE1394=m
#
# I2O device support
CONFIG_ABYSS=m
#
+# Quadrics QsNet
+#
+CONFIG_QSNET=m
+CONFIG_ELAN3=m
+CONFIG_ELAN4=m
+CONFIG_EP=m
+CONFIG_EIP=m
+CONFIG_RMS=m
+CONFIG_JTAG=m
+
+#
# Wireless LAN (non-hamradio)
#
CONFIG_NET_RADIO=y
CONFIG_SND_AU8820=m
CONFIG_SND_AU8830=m
CONFIG_SND_AZT3328=m
+CONFIG_SND_AZX=m
CONFIG_SND_BT87X=m
CONFIG_SND_CS46XX=m
CONFIG_SND_CS46XX_NEW_DSP=y
CONFIG_HUGETLBFS=y
CONFIG_HUGETLB_PAGE=y
CONFIG_RAMFS=y
+CONFIG_RELAYFS_FS=y
#
# Miscellaneous filesystems
CONFIG_DEBUG_SPINLOCK=y
CONFIG_DEBUG_SPINLOCK_SLEEP=y
# CONFIG_DEBUG_INFO is not set
+CONFIG_KPROBES=y
CONFIG_IA64_GRANULE_16MB=y
# CONFIG_IA64_GRANULE_64MB is not set
# CONFIG_IA64_PRINT_HAZARDS is not set
#
# Security options
#
+CONFIG_KEYS=y
+CONFIG_KEYS_DEBUG_PROC_KEYS=y
CONFIG_SECURITY=y
CONFIG_SECURITY_NETWORK=y
CONFIG_SECURITY_CAPABILITIES=y
CONFIG_CRYPTO_SIGNATURE=y
CONFIG_CRYPTO_SIGNATURE_DSA=y
CONFIG_CRYPTO_MPILIB=y
+CONFIG_IEEE1394_PCILYNX=m
+CONFIG_IEEE1394_OHCI1394=y
+CONFIG_IEEE1394_SBP2=m
+CONFIG_IEEE1394_SBP2_PHYS_DMA=y
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.9-prep
-# Tue Aug 2 15:46:19 2005
+# Linux kernel version: 2.6.9-prep.qp2.2.5.11.3qsnet
+# Thu Oct 27 17:06:20 2005
#
CONFIG_X86_64=y
CONFIG_64BIT=y
CONFIG_X86_CMPXCHG=y
CONFIG_EARLY_PRINTK=y
CONFIG_HPET_TIMER=y
+CONFIG_X86_PM_TIMER=y
CONFIG_HPET_EMULATE_RTC=y
CONFIG_GENERIC_ISA_DMA=y
CONFIG_GENERIC_IOMAP=y
CONFIG_SYSCTL=y
CONFIG_AUDIT=y
CONFIG_AUDITSYSCALL=y
+# CONFIG_AUDITFILESYSTEM is not set
CONFIG_LOG_BUF_SHIFT=17
CONFIG_HOTPLUG=y
# CONFIG_IKCONFIG is not set
CONFIG_PTRACK=y
#
-# Quadrics QsNet
-#
-CONFIG_QSNET=m
-CONFIG_ELAN3=m
-CONFIG_ELAN4=m
-CONFIG_EP=m
-CONFIG_EIP=m
-CONFIG_RMS=m
-CONFIG_JTAG=m
-CONFIG_NET_FC=y
-CONFIG_SHAPER=m
-CONFIG_NETCONSOLE=m
-
-
-#
# Power management options
#
CONFIG_PM=y
CONFIG_ACPI_FAN=y
CONFIG_ACPI_PROCESSOR=y
CONFIG_ACPI_THERMAL=y
+CONFIG_ACPI_NUMA=y
CONFIG_ACPI_ASUS=m
CONFIG_ACPI_TOSHIBA=m
CONFIG_ACPI_BLACKLIST_YEAR=2001
CONFIG_COMPAT=y
CONFIG_SYSVIPC_COMPAT=y
CONFIG_UID16=y
-CONFIG_KEXEC=y
#
# Device Drivers
#
CONFIG_SCSI_SPI_ATTRS=m
CONFIG_SCSI_FC_ATTRS=m
+CONFIG_SCSI_ISCSI_ATTRS=m
#
# SCSI low-level drivers
CONFIG_SCSI_IPS=m
CONFIG_SCSI_INITIO=m
# CONFIG_SCSI_INIA100 is not set
+CONFIG_SCSI_ISCSI_SFNET=m
CONFIG_SCSI_PPA=m
CONFIG_SCSI_IMM=m
# CONFIG_SCSI_IZIP_EPP16 is not set
CONFIG_SCSI_QLA2300=m
CONFIG_SCSI_QLA2322=m
CONFIG_SCSI_QLA6312=m
-CONFIG_SCSI_QLA6322=m
+CONFIG_SCSI_QLA24XX=m
# CONFIG_SCSI_DC395x is not set
# CONFIG_SCSI_DC390T is not set
# CONFIG_SCSI_DEBUG is not set
CONFIG_DM_SNAPSHOT=m
CONFIG_DM_MIRROR=m
CONFIG_DM_ZERO=m
+CONFIG_DM_MULTIPATH=m
+CONFIG_DM_MULTIPATH_EMC=m
#
# Fusion MPT device support
#
# IEEE 1394 (FireWire) support
#
-# CONFIG_IEEE1394 is not set
+CONFIG_IEEE1394=m
+
+#
+# Subsystem Options
+#
+# CONFIG_IEEE1394_VERBOSEDEBUG is not set
+# CONFIG_IEEE1394_OUI_DB is not set
+CONFIG_IEEE1394_EXTRA_CONFIG_ROMS=y
+CONFIG_IEEE1394_CONFIG_ROM_IP1394=y
+
+#
+# Device Drivers
+#
+CONFIG_IEEE1394_PCILYNX=m
+CONFIG_IEEE1394_OHCI1394=m
+
+#
+# Protocol Drivers
+#
+CONFIG_IEEE1394_VIDEO1394=m
+CONFIG_IEEE1394_SBP2=m
+CONFIG_IEEE1394_SBP2_PHYS_DMA=y
+CONFIG_IEEE1394_ETH1394=m
+CONFIG_IEEE1394_DV1394=m
+CONFIG_IEEE1394_RAWIO=m
+CONFIG_IEEE1394_CMP=m
+CONFIG_IEEE1394_AMDTP=m
#
# I2O device support
CONFIG_ABYSS=m
#
+# Quadrics QsNet
+#
+CONFIG_QSNET=m
+CONFIG_ELAN3=m
+CONFIG_ELAN4=m
+CONFIG_EP=m
+CONFIG_EIP=m
+CONFIG_RMS=m
+CONFIG_JTAG=m
+
+#
# Wireless LAN (non-hamradio)
#
CONFIG_NET_RADIO=y
CONFIG_IEEE80211_WPA=m
CONFIG_IEEE80211_CRYPT_TKIP=m
CONFIG_IPW2100=m
-# CONFIG_IPW_DEBUG is not set
CONFIG_IPW2100_PROMISC=y
-# CONFIG_IPW2100_LEGACY_FW_LOAD is not set
+# CONFIG_IPW_DEBUG is not set
CONFIG_IPW2200=m
CONFIG_HERMES=m
CONFIG_PLX_HERMES=m
CONFIG_SND_AU8820=m
CONFIG_SND_AU8830=m
CONFIG_SND_AZT3328=m
+CONFIG_SND_AZX=m
CONFIG_SND_BT87X=m
CONFIG_SND_CS46XX=m
CONFIG_SND_CS46XX_NEW_DSP=y
CONFIG_HUGETLBFS=y
CONFIG_HUGETLB_PAGE=y
CONFIG_RAMFS=y
+CONFIG_RELAYFS_FS=y
#
# Miscellaneous filesystems
CONFIG_INIT_DEBUG=y
# CONFIG_SCHEDSTATS is not set
# CONFIG_IOMMU_DEBUG is not set
+CONFIG_KPROBES=y
#
# Security options
#
+CONFIG_KEYS=y
+CONFIG_KEYS_DEBUG_PROC_KEYS=y
CONFIG_SECURITY=y
CONFIG_SECURITY_NETWORK=y
CONFIG_SECURITY_CAPABILITIES=y
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.9-prep
-# Tue Aug 2 15:46:19 2005
+# Linux kernel version: 2.6.9-prep.qp2.2.5.11.3qsnet
+# Thu Oct 27 17:05:31 2005
#
CONFIG_X86_64=y
CONFIG_64BIT=y
CONFIG_X86_CMPXCHG=y
CONFIG_EARLY_PRINTK=y
CONFIG_HPET_TIMER=y
+CONFIG_X86_PM_TIMER=y
CONFIG_HPET_EMULATE_RTC=y
CONFIG_GENERIC_ISA_DMA=y
CONFIG_GENERIC_IOMAP=y
CONFIG_SYSCTL=y
CONFIG_AUDIT=y
CONFIG_AUDITSYSCALL=y
+# CONFIG_AUDITFILESYSTEM is not set
CONFIG_LOG_BUF_SHIFT=17
CONFIG_HOTPLUG=y
# CONFIG_IKCONFIG is not set
CONFIG_PTRACK=y
#
-# Quadrics QsNet
-#
-CONFIG_QSNET=m
-CONFIG_ELAN3=m
-CONFIG_ELAN4=m
-CONFIG_EP=m
-CONFIG_EIP=m
-CONFIG_RMS=m
-CONFIG_JTAG=m
-CONFIG_NET_FC=y
-CONFIG_SHAPER=m
-CONFIG_NETCONSOLE=m
-
-
-#
# Power management options
#
CONFIG_PM=y
CONFIG_ACPI_FAN=y
CONFIG_ACPI_PROCESSOR=y
CONFIG_ACPI_THERMAL=y
+CONFIG_ACPI_NUMA=y
CONFIG_ACPI_ASUS=m
CONFIG_ACPI_TOSHIBA=m
CONFIG_ACPI_BLACKLIST_YEAR=2001
CONFIG_COMPAT=y
CONFIG_SYSVIPC_COMPAT=y
CONFIG_UID16=y
-CONFIG_KEXEC=y
#
# Device Drivers
#
CONFIG_SCSI_SPI_ATTRS=m
CONFIG_SCSI_FC_ATTRS=m
+CONFIG_SCSI_ISCSI_ATTRS=m
#
# SCSI low-level drivers
CONFIG_SCSI_IPS=m
CONFIG_SCSI_INITIO=m
# CONFIG_SCSI_INIA100 is not set
+CONFIG_SCSI_ISCSI_SFNET=m
CONFIG_SCSI_PPA=m
CONFIG_SCSI_IMM=m
# CONFIG_SCSI_IZIP_EPP16 is not set
CONFIG_SCSI_QLA2300=m
CONFIG_SCSI_QLA2322=m
CONFIG_SCSI_QLA6312=m
-CONFIG_SCSI_QLA6322=m
+CONFIG_SCSI_QLA24XX=m
# CONFIG_SCSI_DC395x is not set
# CONFIG_SCSI_DC390T is not set
# CONFIG_SCSI_DEBUG is not set
CONFIG_DM_SNAPSHOT=m
CONFIG_DM_MIRROR=m
CONFIG_DM_ZERO=m
+CONFIG_DM_MULTIPATH=m
+CONFIG_DM_MULTIPATH_EMC=m
#
# Fusion MPT device support
#
# IEEE 1394 (FireWire) support
#
-# CONFIG_IEEE1394 is not set
+CONFIG_IEEE1394=m
+
+#
+# Subsystem Options
+#
+# CONFIG_IEEE1394_VERBOSEDEBUG is not set
+# CONFIG_IEEE1394_OUI_DB is not set
+CONFIG_IEEE1394_EXTRA_CONFIG_ROMS=y
+CONFIG_IEEE1394_CONFIG_ROM_IP1394=y
+
+#
+# Device Drivers
+#
+CONFIG_IEEE1394_PCILYNX=m
+CONFIG_IEEE1394_OHCI1394=y
+
+#
+# Protocol Drivers
+#
+CONFIG_IEEE1394_VIDEO1394=m
+CONFIG_IEEE1394_SBP2=m
+CONFIG_IEEE1394_SBP2_PHYS_DMA=y
+CONFIG_IEEE1394_ETH1394=m
+CONFIG_IEEE1394_DV1394=m
+CONFIG_IEEE1394_RAWIO=m
+CONFIG_IEEE1394_CMP=m
+CONFIG_IEEE1394_AMDTP=m
#
# I2O device support
CONFIG_ABYSS=m
#
+# Quadrics QsNet
+#
+CONFIG_QSNET=m
+CONFIG_ELAN3=m
+CONFIG_ELAN4=m
+CONFIG_EP=m
+CONFIG_EIP=m
+CONFIG_RMS=m
+CONFIG_JTAG=m
+
+#
# Wireless LAN (non-hamradio)
#
CONFIG_NET_RADIO=y
CONFIG_IEEE80211_WPA=m
CONFIG_IEEE80211_CRYPT_TKIP=m
CONFIG_IPW2100=m
-# CONFIG_IPW_DEBUG is not set
CONFIG_IPW2100_PROMISC=y
-# CONFIG_IPW2100_LEGACY_FW_LOAD is not set
+# CONFIG_IPW_DEBUG is not set
CONFIG_IPW2200=m
CONFIG_HERMES=m
CONFIG_PLX_HERMES=m
CONFIG_SND_AU8820=m
CONFIG_SND_AU8830=m
CONFIG_SND_AZT3328=m
+CONFIG_SND_AZX=m
CONFIG_SND_BT87X=m
CONFIG_SND_CS46XX=m
CONFIG_SND_CS46XX_NEW_DSP=y
CONFIG_HUGETLBFS=y
CONFIG_HUGETLB_PAGE=y
CONFIG_RAMFS=y
+CONFIG_RELAYFS_FS=y
#
# Miscellaneous filesystems
CONFIG_INIT_DEBUG=y
# CONFIG_SCHEDSTATS is not set
# CONFIG_IOMMU_DEBUG is not set
+CONFIG_KPROBES=y
#
# Security options
#
+CONFIG_KEYS=y
+CONFIG_KEYS_DEBUG_PROC_KEYS=y
CONFIG_SECURITY=y
CONFIG_SECURITY_NETWORK=y
CONFIG_SECURITY_CAPABILITIES=y
# CONFIG_XFS_FS is not set
# CONFIG_MINIX_FS is not set
# CONFIG_ROMFS_FS is not set
-# CONFIG_QUOTA is not set
+CONFIG_QUOTA=y
+CONFIG_QFMT_V2=y
+CONFIG_QUOTACTL=y
CONFIG_DNOTIFY=y
# CONFIG_AUTOFS_FS is not set
CONFIG_AUTOFS4_FS=m
# CONFIG_HIGHMEM is not set
CONFIG_PROC_MM=y
CONFIG_KERNEL_STACK_ORDER=4
-CONFIG_UML_REAL_TIME_CLOCK=y
+# CONFIG_UML_REAL_TIME_CLOCK is not set
#
# Loadable module support
--- /dev/null
+Index: linux-2.6.9-5.0.3.EL/include/asm-i386/thread_info.h
+===================================================================
+--- linux-2.6.9-5.0.3.EL.orig/include/asm-i386/thread_info.h 2005-02-25 10:25:33.000000000 +0200
++++ linux-2.6.9-5.0.3.EL/include/asm-i386/thread_info.h 2005-02-25 20:19:11.676139032 +0200
+@@ -54,7 +54,7 @@
+
+ #define PREEMPT_ACTIVE 0x10000000
+ #ifdef CONFIG_4KSTACKS
+-#define THREAD_SIZE (4096)
++#define THREAD_SIZE (8192)
+ #else
+ #define THREAD_SIZE (8192)
+ #endif
--- /dev/null
+--- linux-2.6.9/arch/i386/kernel/apic.c.orig 2005-08-04 08:11:13.000000000 -0400
++++ linux-2.6.9/arch/i386/kernel/apic.c 2005-08-04 08:27:04.000000000 -0400
+@@ -1125,8 +1125,10 @@ asmlinkage void smp_local_timer_interrup
+
+ void smp_apic_timer_interrupt(struct pt_regs regs)
+ {
++#ifdef CONFIG_4KSTACKS
+ union irq_ctx *curctx;
+ union irq_ctx *irqctx;
++#endif
+ int cpu;
+ u32 *isp;
+
+@@ -1147,11 +1149,11 @@ void smp_apic_timer_interrupt(struct pt_
+ * interrupt lock, which is the WrongThing (tm) to do.
+ */
+ irq_enter();
++
++#ifdef CONFIG_4KSTACKS
+ curctx = (union irq_ctx *) current_thread_info();
+ irqctx = hardirq_ctx[cpu];
+- if (curctx == irqctx) {
+- smp_local_timer_interrupt(&regs);
+- } else {
++ if (curctx != irqctx) {
+ /* build the stack frame on the IRQ stack */
+ isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
+ irqctx->tinfo.task = curctx->tinfo.task;
+@@ -1167,7 +1169,10 @@ void smp_apic_timer_interrupt(struct pt_
+ : : "b"(isp)
+ : "memory", "cc", "edx", "ecx"
+ );
+- }
++ } else
++#endif
++ smp_local_timer_interrupt(&regs);
++
+ irq_exit();
+ }
+
+--- linux-2.6.9/include/asm-i386/crashdump.h.orig 2005-08-04 08:11:22.000000000 -0400
++++ linux-2.6.9/include/asm-i386/crashdump.h 2005-08-04 08:27:04.000000000 -0400
+@@ -48,12 +48,14 @@ extern unsigned long next_ram_page (unsi
+
+ static inline void platform_init_stack(void **stackptr)
+ {
++#ifdef CONFIG_4KSTACKS
+ *stackptr = (void *)kmalloc(sizeof(union irq_ctx), GFP_KERNEL);
+ if (*stackptr)
+ memset(*stackptr, 0, sizeof(union irq_ctx));
+ else
+ printk(KERN_WARNING
+ "crashdump: unable to allocate separate stack\n");
++#endif
+ }
+
+ typedef asmlinkage void (*crashdump_func_t)(struct pt_regs *, void *);
+@@ -62,6 +64,7 @@ static inline void platform_start_crashd
+ crashdump_func_t dumpfunc,
+ struct pt_regs *regs)
+ {
++#ifdef CONFIG_4KSTACKS
+ u32 *dsp;
+ union irq_ctx * curctx;
+ union irq_ctx * dumpctx;
+@@ -90,6 +93,10 @@ static inline void platform_start_crashd
+ : "memory", "cc", "edx", "ecx"
+ );
+ }
++#else
++ dumpfunc(regs, NULL);
++#endif
++
+ }
+
+ #define platform_cleanup_stack(stackptr) \
--- /dev/null
+Index: linux-2.6.5-7.201/include/linux/dcache.h
+===================================================================
+--- linux-2.6.5-7.201.orig/include/linux/dcache.h 2005-10-11 00:12:48.000000000 +0400
++++ linux-2.6.5-7.201/include/linux/dcache.h 2005-12-20 23:16:31.000000000 +0300
+@@ -38,7 +38,6 @@ struct qstr {
+ const unsigned char * name;
+ unsigned int len;
+ unsigned int hash;
+- char name_str[0];
+ };
+
+ #include <linux/namei.h>
+@@ -104,7 +103,6 @@ struct dentry {
+ struct rcu_head d_rcu;
+ struct dcookie_struct * d_cookie; /* cookie, if any */
+ unsigned long d_move_count; /* to indicated moved dentry while lockless lookup */
+- struct qstr * d_qstr; /* quick str ptr used in lockless lookup and concurrent d_move */
+ struct dentry * d_parent; /* parent directory */
+ struct qstr d_name;
+ struct hlist_node d_hash; /* lookup hash list */
+Index: linux-2.6.5-7.201/fs/dcache.c
+===================================================================
+--- linux-2.6.5-7.201.orig/fs/dcache.c 2005-10-11 00:12:45.000000000 +0400
++++ linux-2.6.5-7.201/fs/dcache.c 2005-12-20 23:16:31.000000000 +0300
+@@ -41,6 +41,8 @@ EXPORT_SYMBOL(dcache_lock);
+
+ static kmem_cache_t *dentry_cache;
+
++#define DNAME_INLINE_LEN (sizeof(struct dentry)-offsetof(struct dentry,d_iname))
++
+ /*
+ * This is the single most critical data structure when it comes
+ * to the dcache: the hashtable for lookups. Somebody should try
+@@ -67,7 +69,7 @@ static void d_callback(void *arg)
+ struct dentry * dentry = (struct dentry *)arg;
+
+ if (dname_external(dentry)) {
+- kfree(dentry->d_qstr);
++ kfree(dentry->d_name.name);
+ }
+ kmem_cache_free(dentry_cache, dentry);
+ }
+@@ -678,8 +680,6 @@ static int shrink_dcache_memory(int nr,
+ return dentry_stat.nr_unused;
+ }
+
+-#define NAME_ALLOC_LEN(len) ((len+16) & ~15)
+-
+ /**
+ * d_alloc - allocate a dcache entry
+ * @parent: parent of entry to allocate
+@@ -694,26 +694,18 @@ struct dentry * d_alloc(struct dentry *
+ {
+ char * str;
+ struct dentry *dentry;
+- struct qstr * qstr;
+
+ dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
+ if (!dentry)
+ return NULL;
+
+ if (name->len > DNAME_INLINE_LEN-1) {
+- qstr = kmalloc(sizeof(*qstr) + NAME_ALLOC_LEN(name->len),
+- GFP_KERNEL);
+- if (!qstr) {
++ str = kmalloc(name->len + 1, GFP_KERNEL);
++ if (!str) {
+ kmem_cache_free(dentry_cache, dentry);
+ return NULL;
+ }
+- qstr->name = qstr->name_str;
+- qstr->len = name->len;
+- qstr->hash = name->hash;
+- dentry->d_qstr = qstr;
+- str = qstr->name_str;
+ } else {
+- dentry->d_qstr = &dentry->d_name;
+ str = dentry->d_iname;
+ }
+
+@@ -1010,7 +1002,7 @@ struct dentry * __d_lookup(struct dentry
+ if (dentry->d_parent != parent)
+ continue;
+
+- qstr = dentry->d_qstr;
++ qstr = &dentry->d_name;
+ smp_read_barrier_depends();
+ if (parent->d_op && parent->d_op->d_compare) {
+ if (parent->d_op->d_compare(parent, qstr, name))
+@@ -1163,26 +1155,38 @@ void d_rehash(struct dentry * entry)
+ */
+ static inline void switch_names(struct dentry * dentry, struct dentry * target)
+ {
+- const unsigned char *old_name, *new_name;
+- struct qstr *old_qstr, *new_qstr;
+-
+- memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN);
+- old_qstr = target->d_qstr;
+- old_name = target->d_name.name;
+- new_qstr = dentry->d_qstr;
+- new_name = dentry->d_name.name;
+- if (old_name == target->d_iname) {
+- old_name = dentry->d_iname;
+- old_qstr = &dentry->d_name;
+- }
+- if (new_name == dentry->d_iname) {
+- new_name = target->d_iname;
+- new_qstr = &target->d_name;
+- }
+- target->d_name.name = new_name;
+- dentry->d_name.name = old_name;
+- target->d_qstr = new_qstr;
+- dentry->d_qstr = old_qstr;
++ if (dname_external(target)) {
++ if (dname_external(dentry)) {
++ /*
++ * Both external: swap the pointers
++ */
++ do_switch(target->d_name.name, dentry->d_name.name);
++ } else {
++ /*
++ * dentry:internal, target:external. Steal target's
++ * storage and make target internal.
++ */
++ dentry->d_name.name = target->d_name.name;
++ target->d_name.name = target->d_iname;
++ }
++ } else {
++ if (dname_external(dentry)) {
++ /*
++ * dentry:external, target:internal. Give dentry's
++ * storage to target and make dentry internal
++ */
++ memcpy(dentry->d_iname, target->d_name.name,
++ target->d_name.len + 1);
++ target->d_name.name = dentry->d_name.name;
++ dentry->d_name.name = dentry->d_iname;
++ } else {
++ /*
++ * Both are internal. Just copy target to dentry
++ */
++ memcpy(dentry->d_iname, target->d_name.name,
++ target->d_name.len + 1);
++ }
++ }
+ }
+
+ /*
===================================================================
--- linux-2.6.9-5.0.3.EL.orig/kernel/exit.c 2005-02-26 13:47:31.300655280 +0200
+++ linux-2.6.9-5.0.3.EL/kernel/exit.c 2005-02-26 13:53:13.805586616 +0200
+@@ -244,6 +244,8 @@
+ write_unlock_irq(&tasklist_lock);
+ }
+
++EXPORT_SYMBOL(reparent_to_init);
++
+ void __set_special_pids(pid_t session, pid_t pgrp)
+ {
+ struct task_struct *curr = current;
+@@ -428,6 +430,8 @@
+ __exit_files(tsk);
+ }
+
++EXPORT_SYMBOL(exit_files);
++
+ static inline void __put_fs_struct(struct fs_struct *fs)
+ {
+ /* No need to hold fs->lock if we are killing it */
@@ -516,6 +516,7 @@
{
__exit_mm(tsk);
/*
* second extended-fs super-block data in memory
*/
+Index: linux-2.6.5-12.1/kernel/exit.c
+===================================================================
+--- linux-2.6.5-12.1.orig/kernel/exit.c 2004-05-10 12:21:56.000000000 -0400
++++ linux-2.6.5-12.1/kernel/exit.c 2004-06-03 18:31:28.000000000 -0400
+@@ -260,6 +260,8 @@
+ write_unlock_irq(&tasklist_lock);
+ }
+
++EXPORT_SYMBOL(reparent_to_init);
++
+ void __set_special_pids(pid_t session, pid_t pgrp)
+ {
+ struct task_struct *curr = current;
+@@ -429,6 +431,8 @@
+ __exit_files(tsk);
+ }
+
++EXPORT_SYMBOL(exit_files);
++
+ static inline void __put_fs_struct(struct fs_struct *fs)
+ {
+ /* No need to hold fs->lock if we are killing it */
Index: linux-2.6.4-51.0/kernel/kallsyms.c
===================================================================
--- linux-2.6.4-51.0.orig/kernel/kallsyms.c 2004-04-05 12:42:08.000000000 -0400
--- /dev/null
+Index: linux-2.6.12-rc6/fs/filesystems.c
+===================================================================
+--- linux-2.6.12-rc6.orig/fs/filesystems.c 2005-06-06 17:22:29.000000000 +0200
++++ linux-2.6.12-rc6/fs/filesystems.c 2005-06-14 15:53:58.298522852 +0200
+@@ -28,7 +28,9 @@
+ */
+
+ static struct file_system_type *file_systems;
+-static DEFINE_RWLOCK(file_systems_lock);
++DEFINE_RWLOCK(file_systems_lock);
++
++EXPORT_SYMBOL(file_systems_lock);
+
+ /* WARNING: This can be used only if we _already_ own a reference */
+ void get_filesystem(struct file_system_type *fs)
+Index: linux-2.6.12-rc6/include/linux/fs.h
+===================================================================
+--- linux-2.6.12-rc6.orig/include/linux/fs.h 2005-06-14 15:53:18.356140529 +0200
++++ linux-2.6.12-rc6/include/linux/fs.h 2005-06-14 15:53:58.309265039 +0200
+@@ -1563,6 +1563,7 @@
+
+ extern struct file_operations generic_ro_fops;
+
++extern rwlock_t file_systems_lock;
+ #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))
+
+ extern int vfs_readlink(struct dentry *, char __user *, int, const char *);
+Index: linux-2.6.12-rc6/net/core/sock.c
+===================================================================
+--- linux-2.6.12-rc6.orig/net/core/sock.c 2005-06-06 17:22:29.000000000 +0200
++++ linux-2.6.12-rc6/net/core/sock.c 2005-06-14 15:53:58.349304101 +0200
+@@ -613,6 +613,7 @@
+ return -EFAULT;
+ return 0;
+ }
++EXPORT_SYMBOL(sock_getsockopt);
+
+ /**
+ * sk_alloc - All socket objects are allocated here
+Index: linux-2.6.12-rc6/fs/namespace.c
+===================================================================
+--- linux-2.6.12-rc6.orig/fs/namespace.c 2005-06-14 15:53:17.868835847 +0200
++++ linux-2.6.12-rc6/fs/namespace.c 2005-06-14 15:53:58.361022851 +0200
+@@ -1240,6 +1240,7 @@
+ mntput(old_pwdmnt);
+ }
+ }
++EXPORT_SYMBOL(set_fs_pwd);
+
+ static void chroot_fs_refs(struct nameidata *old_nd, struct nameidata *new_nd)
+ {
+Index: linux-2.6.12.5/kernel/exit.c
+===================================================================
+--- linux-2.6.12.5.orig/kernel/exit.c 2005-08-17 17:51:28.000000000 +0200
++++ linux-2.6.12.5/kernel/exit.c 2005-08-17 17:51:44.000000000 +0200
+@@ -250,6 +250,8 @@
+ switch_uid(INIT_USER);
+ }
+
++EXPORT_SYMBOL(reparent_to_init);
++
+ void __set_special_pids(pid_t session, pid_t pgrp)
+ {
+ struct task_struct *curr = current;
+@@ -432,6 +434,8 @@
+ __exit_files(tsk);
+ }
+
++EXPORT_SYMBOL(exit_files);
++
+ static inline void __put_fs_struct(struct fs_struct *fs)
+ {
+ /* No need to hold fs->lock if we are killing it */
+@@ -515,6 +515,7 @@
+ task_unlock(tsk);
+ mmput(mm);
+ }
++EXPORT_SYMBOL(exit_mm);
+
+ static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper)
+ {
+Index: linux-2.6.12-rc6/fs/dcache.c
+===================================================================
+--- linux-2.6.12-rc6.orig/fs/dcache.c 2005-06-14 15:53:19.812195198 +0200
++++ linux-2.6.12-rc6/fs/dcache.c 2005-06-14 15:53:58.385436913 +0200
+@@ -1581,6 +1581,7 @@
+
+ return result;
+ }
++EXPORT_SYMBOL(is_subdir);
+
+ void d_genocide(struct dentry *root)
+ {
+Index: linux-2.6.12-rc6/fs/file_table.c
+===================================================================
+--- linux-2.6.12-rc6.orig/fs/file_table.c 2005-06-06 17:22:29.000000000 +0200
++++ linux-2.6.12-rc6/fs/file_table.c 2005-06-14 15:53:58.396179101 +0200
+@@ -197,6 +197,7 @@
+ file_free(file);
+ }
+ }
++EXPORT_SYMBOL(put_filp);
+
+ void file_move(struct file *file, struct list_head *list)
+ {
--- /dev/null
+Index: linux-2.6.12-rc6/fs/ext3/extents.c
+===================================================================
+--- linux-2.6.12-rc6.orig/fs/ext3/extents.c 2005-06-14 16:31:25.756503133 +0200
++++ linux-2.6.12-rc6/fs/ext3/extents.c 2005-06-14 16:31:25.836581257 +0200
+@@ -0,0 +1,2347 @@
++/*
++ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com
++ * Written by Alex Tomas <alex@clusterfs.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 as
++ * published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
++ */
++
++/*
++ * Extents support for EXT3
++ *
++ * TODO:
++ * - ext3_ext_walk_space() should not use ext3_ext_find_extent()
++ * - ext3_ext_calc_credits() could take 'mergable' into account
++ * - ext3*_error() should be used in some situations
++ * - find_goal() [to be tested and improved]
++ * - smart tree reduction
++ * - arch-independence
++ * common on-disk format for big/little-endian arch
++ */
++
++#include <linux/module.h>
++#include <linux/fs.h>
++#include <linux/time.h>
++#include <linux/ext3_jbd.h>
++#include <linux/jbd.h>
++#include <linux/smp_lock.h>
++#include <linux/highuid.h>
++#include <linux/pagemap.h>
++#include <linux/quotaops.h>
++#include <linux/string.h>
++#include <linux/slab.h>
++#include <linux/ext3_extents.h>
++#include <asm/uaccess.h>
++
++
++static inline int ext3_ext_check_header(struct ext3_extent_header *eh)
++{
++ if (eh->eh_magic != EXT3_EXT_MAGIC) {
++ printk(KERN_ERR "EXT3-fs: invalid magic = 0x%x\n",
++ (unsigned)eh->eh_magic);
++ return -EIO;
++ }
++ if (eh->eh_max == 0) {
++ printk(KERN_ERR "EXT3-fs: invalid eh_max = %u\n",
++ (unsigned)eh->eh_max);
++ return -EIO;
++ }
++ if (eh->eh_entries > eh->eh_max) {
++ printk(KERN_ERR "EXT3-fs: invalid eh_entries = %u\n",
++ (unsigned)eh->eh_entries);
++ return -EIO;
++ }
++ return 0;
++}
++
++static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed)
++{
++ int err;
++
++ if (handle->h_buffer_credits > needed)
++ return handle;
++ if (!ext3_journal_extend(handle, needed))
++ return handle;
++ err = ext3_journal_restart(handle, needed);
++
++ return handle;
++}
++
++static int inline
++ext3_ext_get_access_for_root(handle_t *h, struct ext3_extents_tree *tree)
++{
++ if (tree->ops->get_write_access)
++ return tree->ops->get_write_access(h,tree->buffer);
++ else
++ return 0;
++}
++
++static int inline
++ext3_ext_mark_root_dirty(handle_t *h, struct ext3_extents_tree *tree)
++{
++ if (tree->ops->mark_buffer_dirty)
++ return tree->ops->mark_buffer_dirty(h,tree->buffer);
++ else
++ return 0;
++}
++
++/*
++ * could return:
++ * - EROFS
++ * - ENOMEM
++ */
++static int ext3_ext_get_access(handle_t *handle,
++ struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ int err;
++
++ if (path->p_bh) {
++ /* path points to block */
++ err = ext3_journal_get_write_access(handle, path->p_bh);
++ } else {
++ /* path points to leaf/index in inode body */
++ err = ext3_ext_get_access_for_root(handle, tree);
++ }
++ return err;
++}
++
++/*
++ * could return:
++ * - EROFS
++ * - ENOMEM
++ * - EIO
++ */
++static int ext3_ext_dirty(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ int err;
++ if (path->p_bh) {
++ /* path points to block */
++ err = ext3_journal_dirty_metadata(handle, path->p_bh);
++ } else {
++ /* path points to leaf/index in inode body */
++ err = ext3_ext_mark_root_dirty(handle, tree);
++ }
++ return err;
++}
++
++static int inline
++ext3_ext_new_block(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path, struct ext3_extent *ex,
++ int *err)
++{
++ int goal, depth, newblock;
++ struct inode *inode;
++
++ EXT_ASSERT(tree);
++ if (tree->ops->new_block)
++ return tree->ops->new_block(handle, tree, path, ex, err);
++
++ inode = tree->inode;
++ depth = EXT_DEPTH(tree);
++ if (path && depth > 0) {
++ goal = path[depth-1].p_block;
++ } else {
++ struct ext3_inode_info *ei = EXT3_I(inode);
++ unsigned long bg_start;
++ unsigned long colour;
++
++ bg_start = (ei->i_block_group *
++ EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
++ colour = (current->pid % 16) *
++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
++ goal = bg_start + colour;
++ }
++
++ newblock = ext3_new_block(handle, inode, goal, err);
++ return newblock;
++}
++
++static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree)
++{
++ struct ext3_extent_header *neh;
++ neh = EXT_ROOT_HDR(tree);
++ neh->eh_generation++;
++}
++
++static inline int ext3_ext_space_block(struct ext3_extents_tree *tree)
++{
++ int size;
++
++ size = (tree->inode->i_sb->s_blocksize -
++ sizeof(struct ext3_extent_header)) /
++ sizeof(struct ext3_extent);
++#ifdef AGRESSIVE_TEST
++ size = 6;
++#endif
++ return size;
++}
++
++static inline int ext3_ext_space_block_idx(struct ext3_extents_tree *tree)
++{
++ int size;
++
++ size = (tree->inode->i_sb->s_blocksize -
++ sizeof(struct ext3_extent_header)) /
++ sizeof(struct ext3_extent_idx);
++#ifdef AGRESSIVE_TEST
++ size = 5;
++#endif
++ return size;
++}
++
++static inline int ext3_ext_space_root(struct ext3_extents_tree *tree)
++{
++ int size;
++
++ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) /
++ sizeof(struct ext3_extent);
++#ifdef AGRESSIVE_TEST
++ size = 3;
++#endif
++ return size;
++}
++
++static inline int ext3_ext_space_root_idx(struct ext3_extents_tree *tree)
++{
++ int size;
++
++ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) /
++ sizeof(struct ext3_extent_idx);
++#ifdef AGRESSIVE_TEST
++ size = 4;
++#endif
++ return size;
++}
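++
++/*
++ * Worked example (illustrative): for the in-inode root, buffer_len is
++ * sizeof(EXT3_I(inode)->i_data) = 15 * 4 = 60 bytes, so with the 12-byte
++ * records assumed above the root holds (60 - 12) / 12 = 4 entries.
++ */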
++
++static void ext3_ext_show_path(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++#ifdef EXT_DEBUG
++ int k, l = path->p_depth;
++
++ ext_debug(tree, "path:");
++ for (k = 0; k <= l; k++, path++) {
++ if (path->p_idx) {
++ ext_debug(tree, " %d->%d", path->p_idx->ei_block,
++ path->p_idx->ei_leaf);
++ } else if (path->p_ext) {
++ ext_debug(tree, " %d:%d:%d",
++ path->p_ext->ee_block,
++ path->p_ext->ee_len,
++ path->p_ext->ee_start);
++ } else
++ ext_debug(tree, " []");
++ }
++ ext_debug(tree, "\n");
++#endif
++}
++
++static void ext3_ext_show_leaf(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++#ifdef EXT_DEBUG
++ int depth = EXT_DEPTH(tree);
++ struct ext3_extent_header *eh;
++ struct ext3_extent *ex;
++ int i;
++
++ if (!path)
++ return;
++
++ eh = path[depth].p_hdr;
++ ex = EXT_FIRST_EXTENT(eh);
++
++ for (i = 0; i < eh->eh_entries; i++, ex++) {
++ ext_debug(tree, "%d:%d:%d ",
++ ex->ee_block, ex->ee_len, ex->ee_start);
++ }
++ ext_debug(tree, "\n");
++#endif
++}
++
++static void ext3_ext_drop_refs(struct ext3_ext_path *path)
++{
++ int depth = path->p_depth;
++ int i;
++
++ for (i = 0; i <= depth; i++, path++) {
++ if (path->p_bh) {
++ brelse(path->p_bh);
++ path->p_bh = NULL;
++ }
++ }
++}
++
++/*
++ * binary search for closest index by given block
++ */
++static inline void
++ext3_ext_binsearch_idx(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path, int block)
++{
++ struct ext3_extent_header *eh = path->p_hdr;
++ struct ext3_extent_idx *ix;
++ int l = 0, k, r;
++
++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
++ EXT_ASSERT(eh->eh_entries <= eh->eh_max);
++ EXT_ASSERT(eh->eh_entries > 0);
++
++ ext_debug(tree, "binsearch for %d(idx): ", block);
++
++ path->p_idx = ix = EXT_FIRST_INDEX(eh);
++
++ r = k = eh->eh_entries;
++ while (k > 1) {
++ k = (r - l) / 2;
++ if (block < ix[l + k].ei_block)
++ r -= k;
++ else
++ l += k;
++ ext_debug(tree, "%d:%d:%d ", k, l, r);
++ }
++
++ ix += l;
++ path->p_idx = ix;
++ ext_debug(tree," -> %d->%d ",path->p_idx->ei_block,path->p_idx->ei_leaf);
++
++ while (l++ < r) {
++ if (block < ix->ei_block)
++ break;
++ path->p_idx = ix++;
++ }
++ ext_debug(tree, " -> %d->%d\n", path->p_idx->ei_block,
++ path->p_idx->ei_leaf);
++
++#ifdef CHECK_BINSEARCH
++ {
++ struct ext3_extent_idx *chix;
++
++ chix = ix = EXT_FIRST_INDEX(eh);
++ for (k = 0; k < eh->eh_entries; k++, ix++) {
++ if (k != 0 && ix->ei_block <= ix[-1].ei_block) {
++ printk("k=%d, ix=0x%p, first=0x%p\n", k,
++ ix, EXT_FIRST_INDEX(eh));
++ printk("%u <= %u\n",
++ ix->ei_block, ix[-1].ei_block);
++ }
++ EXT_ASSERT(k == 0 || ix->ei_block > ix[-1].ei_block);
++ if (block < ix->ei_block)
++ break;
++ chix = ix;
++ }
++ EXT_ASSERT(chix == path->p_idx);
++ }
++#endif
++}
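++
++/*
++ * Example (illustrative): with index entries at logical blocks
++ * {0, 100, 200, 300} and block == 150, the loops above converge on the
++ * entry for 100 -- the rightmost index with ei_block <= block, i.e. the
++ * subtree that covers block 150.
++ */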
++
++/*
++ * binary search for closest extent by given block
++ */
++static inline void
++ext3_ext_binsearch(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path, int block)
++{
++ struct ext3_extent_header *eh = path->p_hdr;
++ struct ext3_extent *ex;
++ int l = 0, k, r;
++
++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
++ EXT_ASSERT(eh->eh_entries <= eh->eh_max);
++
++ if (eh->eh_entries == 0) {
++ /*
++ * this leaf is still empty:
++ * we get such a leaf in the split/add case
++ */
++ return;
++ }
++
++ ext_debug(tree, "binsearch for %d: ", block);
++
++ path->p_ext = ex = EXT_FIRST_EXTENT(eh);
++
++ r = k = eh->eh_entries;
++ while (k > 1) {
++ k = (r - l) / 2;
++ if (block < ex[l + k].ee_block)
++ r -= k;
++ else
++ l += k;
++ ext_debug(tree, "%d:%d:%d ", k, l, r);
++ }
++
++ ex += l;
++ path->p_ext = ex;
++ ext_debug(tree, " -> %d:%d:%d ", path->p_ext->ee_block,
++ path->p_ext->ee_start, path->p_ext->ee_len);
++
++ while (l++ < r) {
++ if (block < ex->ee_block)
++ break;
++ path->p_ext = ex++;
++ }
++ ext_debug(tree, " -> %d:%d:%d\n", path->p_ext->ee_block,
++ path->p_ext->ee_start, path->p_ext->ee_len);
++
++#ifdef CHECK_BINSEARCH
++ {
++ struct ext3_extent *chex;
++
++ chex = ex = EXT_FIRST_EXTENT(eh);
++ for (k = 0; k < eh->eh_entries; k++, ex++) {
++ EXT_ASSERT(k == 0 || ex->ee_block > ex[-1].ee_block);
++ if (block < ex->ee_block)
++ break;
++ chex = ex;
++ }
++ EXT_ASSERT(chex == path->p_ext);
++ }
++#endif
++}
++
++int ext3_extent_tree_init(handle_t *handle, struct ext3_extents_tree *tree)
++{
++ struct ext3_extent_header *eh;
++
++ BUG_ON(tree->buffer_len == 0);
++ ext3_ext_get_access_for_root(handle, tree);
++ eh = EXT_ROOT_HDR(tree);
++ eh->eh_depth = 0;
++ eh->eh_entries = 0;
++ eh->eh_magic = EXT3_EXT_MAGIC;
++ eh->eh_max = ext3_ext_space_root(tree);
++ ext3_ext_mark_root_dirty(handle, tree);
++ ext3_ext_invalidate_cache(tree);
++ return 0;
++}
++
++struct ext3_ext_path *
++ext3_ext_find_extent(struct ext3_extents_tree *tree, int block,
++ struct ext3_ext_path *path)
++{
++ struct ext3_extent_header *eh;
++ struct buffer_head *bh;
++ int depth, i, ppos = 0;
++
++ EXT_ASSERT(tree);
++ EXT_ASSERT(tree->inode);
++ EXT_ASSERT(tree->root);
++
++ eh = EXT_ROOT_HDR(tree);
++ EXT_ASSERT(eh);
++ if (ext3_ext_check_header(eh))
++ goto err;
++
++ i = depth = EXT_DEPTH(tree);
++ EXT_ASSERT(eh->eh_max);
++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
++
++ /* account possible depth increase */
++ if (!path) {
++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2),
++ GFP_NOFS);
++ if (!path)
++ return ERR_PTR(-ENOMEM);
++ }
++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1));
++ path[0].p_hdr = eh;
++
++ /* walk through the tree */
++ while (i) {
++ ext_debug(tree, "depth %d: num %d, max %d\n",
++ ppos, eh->eh_entries, eh->eh_max);
++ ext3_ext_binsearch_idx(tree, path + ppos, block);
++ path[ppos].p_block = path[ppos].p_idx->ei_leaf;
++ path[ppos].p_depth = i;
++ path[ppos].p_ext = NULL;
++
++ bh = sb_bread(tree->inode->i_sb, path[ppos].p_block);
++ if (!bh)
++ goto err;
++
++ eh = EXT_BLOCK_HDR(bh);
++ ppos++;
++ EXT_ASSERT(ppos <= depth);
++ path[ppos].p_bh = bh;
++ path[ppos].p_hdr = eh;
++ i--;
++
++ if (ext3_ext_check_header(eh))
++ goto err;
++ }
++
++ path[ppos].p_depth = i;
++ path[ppos].p_hdr = eh;
++ path[ppos].p_ext = NULL;
++ path[ppos].p_idx = NULL;
++
++ if (ext3_ext_check_header(eh))
++ goto err;
++
++ /* find extent */
++ ext3_ext_binsearch(tree, path + ppos, block);
++
++ ext3_ext_show_path(tree, path);
++
++ return path;
++
++err:
++ printk(KERN_ERR "EXT3-fs: header is corrupted!\n");
++ ext3_ext_drop_refs(path);
++ kfree(path);
++ return ERR_PTR(-EIO);
++}
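++
++/*
++ * Example (illustrative): for a depth-2 tree, the returned array holds
++ * path[0] for the root (p_idx set), path[1] for the intermediate index
++ * block and path[2] for the leaf (p_ext set); p_depth counts down from
++ * 2 at the root to 0 at the leaf.
++ */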
++
++/*
++ * insert new index [logical;ptr] into the block at curp;
++ * it checks where to insert: before curp or after curp
++ */
++static int ext3_ext_insert_index(handle_t *handle,
++ struct ext3_extents_tree *tree,
++ struct ext3_ext_path *curp,
++ int logical, int ptr)
++{
++ struct ext3_extent_idx *ix;
++ int len, err;
++
++ if ((err = ext3_ext_get_access(handle, tree, curp)))
++ return err;
++
++ EXT_ASSERT(logical != curp->p_idx->ei_block);
++ len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
++ if (logical > curp->p_idx->ei_block) {
++ /* insert after */
++ if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
++ len = (len - 1) * sizeof(struct ext3_extent_idx);
++ len = len < 0 ? 0 : len;
++ ext_debug(tree, "insert new index %d after: %d. "
++ "move %d from 0x%p to 0x%p\n",
++ logical, ptr, len,
++ (curp->p_idx + 1), (curp->p_idx + 2));
++ memmove(curp->p_idx + 2, curp->p_idx + 1, len);
++ }
++ ix = curp->p_idx + 1;
++ } else {
++ /* insert before */
++ len = len * sizeof(struct ext3_extent_idx);
++ len = len < 0 ? 0 : len;
++ ext_debug(tree, "insert new index %d before: %d. "
++ "move %d from 0x%p to 0x%p\n",
++ logical, ptr, len,
++ curp->p_idx, (curp->p_idx + 1));
++ memmove(curp->p_idx + 1, curp->p_idx, len);
++ ix = curp->p_idx;
++ }
++
++ ix->ei_block = logical;
++ ix->ei_leaf = ptr;
++ curp->p_hdr->eh_entries++;
++
++ EXT_ASSERT(curp->p_hdr->eh_entries <= curp->p_hdr->eh_max);
++ EXT_ASSERT(ix <= EXT_LAST_INDEX(curp->p_hdr));
++
++ err = ext3_ext_dirty(handle, tree, curp);
++ ext3_std_error(tree->inode->i_sb, err);
++
++ return err;
++}
++
++/*
++ * routine inserts a new subtree into the path, using the free index entry
++ * at depth 'at':
++ * - allocates all needed blocks (new leaf and all intermediate index blocks)
++ * - makes a decision where to split
++ * - moves remaining extents and index entries (right of the split point)
++ * into the newly allocated blocks
++ * - initializes the subtree
++ */
++static int ext3_ext_split(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *newext, int at)
++{
++ struct buffer_head *bh = NULL;
++ int depth = EXT_DEPTH(tree);
++ struct ext3_extent_header *neh;
++ struct ext3_extent_idx *fidx;
++ struct ext3_extent *ex;
++ int i = at, k, m, a;
++ unsigned long newblock, oldblock, border;
++ int *ablocks = NULL; /* array of allocated blocks */
++ int err = 0;
++
++ /* make decision: where to split? */
++ /* FIXME: for now the decision is the simplest: split at the current extent */
++
++ /* if the current leaf will be split, then we should use the
++ * border from the split point */
++ EXT_ASSERT(path[depth].p_ext <= EXT_MAX_EXTENT(path[depth].p_hdr));
++ if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
++ border = path[depth].p_ext[1].ee_block;
++ ext_debug(tree, "leaf will be splitted."
++ " next leaf starts at %d\n",
++ (int)border);
++ } else {
++ border = newext->ee_block;
++ ext_debug(tree, "leaf will be added."
++ " next leaf starts at %d\n",
++ (int)border);
++ }
++
++ /*
++ * if an error occurs, then we break processing
++ * and turn the filesystem read-only. so, the index won't
++ * be inserted and the tree will stay in a consistent
++ * state. the next mount will repair the buffers too
++ */
++
++ /*
++ * get an array to track all allocated blocks;
++ * we need it to free those blocks again
++ * if an error occurs
++ */
++ ablocks = kmalloc(sizeof(unsigned long) * depth, GFP_NOFS);
++ if (!ablocks)
++ return -ENOMEM;
++ memset(ablocks, 0, sizeof(unsigned long) * depth);
++
++ /* allocate all needed blocks */
++ ext_debug(tree, "allocate %d blocks for indexes/leaf\n", depth - at);
++ for (a = 0; a < depth - at; a++) {
++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err);
++ if (newblock == 0)
++ goto cleanup;
++ ablocks[a] = newblock;
++ }
++
++ /* initialize new leaf */
++ newblock = ablocks[--a];
++ EXT_ASSERT(newblock);
++ bh = sb_getblk(tree->inode->i_sb, newblock);
++ if (!bh) {
++ err = -EIO;
++ goto cleanup;
++ }
++ lock_buffer(bh);
++
++ if ((err = ext3_journal_get_create_access(handle, bh)))
++ goto cleanup;
++
++ neh = EXT_BLOCK_HDR(bh);
++ neh->eh_entries = 0;
++ neh->eh_max = ext3_ext_space_block(tree);
++ neh->eh_magic = EXT3_EXT_MAGIC;
++ neh->eh_depth = 0;
++ ex = EXT_FIRST_EXTENT(neh);
++
++ /* move the remainder of path[depth] to the new leaf */
++ EXT_ASSERT(path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max);
++ /* start copy from next extent */
++ /* TODO: we could do it with a single memmove */
++ m = 0;
++ path[depth].p_ext++;
++ while (path[depth].p_ext <=
++ EXT_MAX_EXTENT(path[depth].p_hdr)) {
++ ext_debug(tree, "move %d:%d:%d in new leaf %lu\n",
++ path[depth].p_ext->ee_block,
++ path[depth].p_ext->ee_start,
++ path[depth].p_ext->ee_len,
++ newblock);
++ memmove(ex++, path[depth].p_ext++, sizeof(struct ext3_extent));
++ neh->eh_entries++;
++ m++;
++ }
++ set_buffer_uptodate(bh);
++ unlock_buffer(bh);
++
++ if ((err = ext3_journal_dirty_metadata(handle, bh)))
++ goto cleanup;
++ brelse(bh);
++ bh = NULL;
++
++ /* correct old leaf */
++ if (m) {
++ if ((err = ext3_ext_get_access(handle, tree, path + depth)))
++ goto cleanup;
++ path[depth].p_hdr->eh_entries -= m;
++ if ((err = ext3_ext_dirty(handle, tree, path + depth)))
++ goto cleanup;
++
++ }
++
++ /* create intermediate indexes */
++ k = depth - at - 1;
++ EXT_ASSERT(k >= 0);
++ if (k)
++ ext_debug(tree, "create %d intermediate indices\n", k);
++ /* insert new index into current index block */
++ /* current depth stored in i var */
++ i = depth - 1;
++ while (k--) {
++ oldblock = newblock;
++ newblock = ablocks[--a];
++ bh = sb_getblk(tree->inode->i_sb, newblock);
++ if (!bh) {
++ err = -EIO;
++ goto cleanup;
++ }
++ lock_buffer(bh);
++
++ if ((err = ext3_journal_get_create_access(handle, bh)))
++ goto cleanup;
++
++ neh = EXT_BLOCK_HDR(bh);
++ neh->eh_entries = 1;
++ neh->eh_magic = EXT3_EXT_MAGIC;
++ neh->eh_max = ext3_ext_space_block_idx(tree);
++ neh->eh_depth = depth - i;
++ fidx = EXT_FIRST_INDEX(neh);
++ fidx->ei_block = border;
++ fidx->ei_leaf = oldblock;
++
++ ext_debug(tree, "int.index at %d (block %lu): %lu -> %lu\n",
++ i, newblock, border, oldblock);
++ /* copy indexes */
++ m = 0;
++ path[i].p_idx++;
++
++ ext_debug(tree, "cur 0x%p, last 0x%p\n", path[i].p_idx,
++ EXT_MAX_INDEX(path[i].p_hdr));
++ EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) ==
++ EXT_LAST_INDEX(path[i].p_hdr));
++ while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
++ ext_debug(tree, "%d: move %d:%d in new index %lu\n",
++ i, path[i].p_idx->ei_block,
++ path[i].p_idx->ei_leaf, newblock);
++ memmove(++fidx, path[i].p_idx++,
++ sizeof(struct ext3_extent_idx));
++ neh->eh_entries++;
++ EXT_ASSERT(neh->eh_entries <= neh->eh_max);
++ m++;
++ }
++ set_buffer_uptodate(bh);
++ unlock_buffer(bh);
++
++ if ((err = ext3_journal_dirty_metadata(handle, bh)))
++ goto cleanup;
++ brelse(bh);
++ bh = NULL;
++
++ /* correct old index */
++ if (m) {
++ err = ext3_ext_get_access(handle, tree, path + i);
++ if (err)
++ goto cleanup;
++ path[i].p_hdr->eh_entries -= m;
++ err = ext3_ext_dirty(handle, tree, path + i);
++ if (err)
++ goto cleanup;
++ }
++
++ i--;
++ }
++
++ /* insert new index */
++ if (!err)
++ err = ext3_ext_insert_index(handle, tree, path + at,
++ border, newblock);
++
++cleanup:
++ if (bh) {
++ if (buffer_locked(bh))
++ unlock_buffer(bh);
++ brelse(bh);
++ }
++
++ if (err) {
++ /* free all allocated blocks in error case */
++ for (i = 0; i < depth; i++) {
++ if (!ablocks[i])
++ continue;
++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1);
++ }
++ }
++ kfree(ablocks);
++
++ return err;
++}
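++
++/*
++ * Example (illustrative): in a depth-1 tree with a full leaf
++ * [e1 e2 e3 e4] and a split point before e3, the extents e3 and e4 move
++ * into a freshly allocated leaf and an index entry [border -> new leaf]
++ * is inserted into the root, leaving [e1 e2] with room for the new
++ * extent.
++ */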
++
++/*
++ * routine implements tree growing procedure:
++ * - allocates new block
++ * - moves top-level data (index block or leaf) into the new block
++ * - initialize new top-level, creating index that points to the
++ * just created block
++ */
++static int ext3_ext_grow_indepth(handle_t *handle,
++ struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *newext)
++{
++ struct ext3_ext_path *curp = path;
++ struct ext3_extent_header *neh;
++ struct ext3_extent_idx *fidx;
++ struct buffer_head *bh;
++ unsigned long newblock;
++ int err = 0;
++
++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err);
++ if (newblock == 0)
++ return err;
++
++ bh = sb_getblk(tree->inode->i_sb, newblock);
++ if (!bh) {
++ err = -EIO;
++ ext3_std_error(tree->inode->i_sb, err);
++ return err;
++ }
++ lock_buffer(bh);
++
++ if ((err = ext3_journal_get_create_access(handle, bh))) {
++ unlock_buffer(bh);
++ goto out;
++ }
++
++ /* move top-level index/leaf into new block */
++ memmove(bh->b_data, curp->p_hdr, tree->buffer_len);
++
++ /* set size of new block */
++ neh = EXT_BLOCK_HDR(bh);
++ /* the old root could have indexes or leaves,
++ * so calculate eh_max the right way */
++ if (EXT_DEPTH(tree))
++ neh->eh_max = ext3_ext_space_block_idx(tree);
++ else
++ neh->eh_max = ext3_ext_space_block(tree);
++ neh->eh_magic = EXT3_EXT_MAGIC;
++ set_buffer_uptodate(bh);
++ unlock_buffer(bh);
++
++ if ((err = ext3_journal_dirty_metadata(handle, bh)))
++ goto out;
++
++ /* create index in new top-level index: num,max,pointer */
++ if ((err = ext3_ext_get_access(handle, tree, curp)))
++ goto out;
++
++ curp->p_hdr->eh_magic = EXT3_EXT_MAGIC;
++ curp->p_hdr->eh_max = ext3_ext_space_root_idx(tree);
++ curp->p_hdr->eh_entries = 1;
++ curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
++ /* FIXME: it works, but actually path[0] can be index */
++ curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
++ curp->p_idx->ei_leaf = newblock;
++
++ neh = EXT_ROOT_HDR(tree);
++ fidx = EXT_FIRST_INDEX(neh);
++ ext_debug(tree, "new root: num %d(%d), lblock %d, ptr %d\n",
++ neh->eh_entries, neh->eh_max, fidx->ei_block, fidx->ei_leaf);
++
++ neh->eh_depth = path->p_depth + 1;
++ err = ext3_ext_dirty(handle, tree, curp);
++out:
++ brelse(bh);
++
++ return err;
++}
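++
++/*
++ * Example (illustrative): growing a depth-0 tree copies the extents from
++ * the inode body into one new block; the root is then rewritten as a
++ * single index entry pointing at that block, and eh_depth becomes 1.
++ */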
++
++/*
++ * routine finds an empty index entry and adds a new leaf. if no free
++ * index entry is found, then it requests growing of the tree in depth
++ */
++static int ext3_ext_create_new_leaf(handle_t *handle,
++ struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *newext)
++{
++ struct ext3_ext_path *curp;
++ int depth, i, err = 0;
++
++repeat:
++ i = depth = EXT_DEPTH(tree);
++
++ /* walk up to the tree and look for free index entry */
++ curp = path + depth;
++ while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
++ i--;
++ curp--;
++ }
++
++ /* we use an already allocated block for the index block,
++ * so subsequent data blocks should be contiguous */
++ if (EXT_HAS_FREE_INDEX(curp)) {
++ /* if we found index with free entry, then use that
++ * entry: create all needed subtree and add new leaf */
++ err = ext3_ext_split(handle, tree, path, newext, i);
++
++ /* refill path */
++ ext3_ext_drop_refs(path);
++ path = ext3_ext_find_extent(tree, newext->ee_block, path);
++ if (IS_ERR(path))
++ err = PTR_ERR(path);
++ } else {
++ /* tree is full, time to grow in depth */
++ err = ext3_ext_grow_indepth(handle, tree, path, newext);
++
++ /* refill path */
++ ext3_ext_drop_refs(path);
++ path = ext3_ext_find_extent(tree, newext->ee_block, path);
++ if (IS_ERR(path))
++ err = PTR_ERR(path);
++
++ /*
++ * only the first grow (depth 0 -> 1) produces free space;
++ * in all other cases we have to split the grown tree
++ */
++ depth = EXT_DEPTH(tree);
++ if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
++ /* now we need split */
++ goto repeat;
++ }
++ }
++
++ if (err)
++ return err;
++
++ return 0;
++}
++
++/*
++ * returns the first allocated block in the subsequent extent, or EXT_MAX_BLOCK
++ * NOTE: it considers the block number from an index entry as an
++ * allocated block. thus, index entries have to be consistent
++ * with the leaves
++ */
++static unsigned long
++ext3_ext_next_allocated_block(struct ext3_ext_path *path)
++{
++ int depth;
++
++ EXT_ASSERT(path != NULL);
++ depth = path->p_depth;
++
++ if (depth == 0 && path->p_ext == NULL)
++ return EXT_MAX_BLOCK;
++
++ /* FIXME: what if index isn't full ?! */
++ while (depth >= 0) {
++ if (depth == path->p_depth) {
++ /* leaf */
++ if (path[depth].p_ext !=
++ EXT_LAST_EXTENT(path[depth].p_hdr))
++ return path[depth].p_ext[1].ee_block;
++ } else {
++ /* index */
++ if (path[depth].p_idx !=
++ EXT_LAST_INDEX(path[depth].p_hdr))
++ return path[depth].p_idx[1].ei_block;
++ }
++ depth--;
++ }
++
++ return EXT_MAX_BLOCK;
++}
++
++/*
++ * returns first allocated block from next leaf or EXT_MAX_BLOCK
++ */
++static unsigned ext3_ext_next_leaf_block(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ int depth;
++
++ EXT_ASSERT(path != NULL);
++ depth = path->p_depth;
++
++ /* a zero-depth tree has no separate leaf blocks at all */
++ if (depth == 0)
++ return EXT_MAX_BLOCK;
++
++ /* go to index block */
++ depth--;
++
++ while (depth >= 0) {
++ if (path[depth].p_idx !=
++ EXT_LAST_INDEX(path[depth].p_hdr))
++ return path[depth].p_idx[1].ei_block;
++ depth--;
++ }
++
++ return EXT_MAX_BLOCK;
++}
++
++/*
++ * if the leaf gets modified and the modified extent is first in the leaf,
++ * then we have to correct all the indexes above
++ * TODO: do we need to correct the tree in all cases?
++ */
++int ext3_ext_correct_indexes(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ struct ext3_extent_header *eh;
++ int depth = EXT_DEPTH(tree);
++ struct ext3_extent *ex;
++ unsigned long border;
++ int k, err = 0;
++
++ eh = path[depth].p_hdr;
++ ex = path[depth].p_ext;
++ EXT_ASSERT(ex);
++ EXT_ASSERT(eh);
++
++ if (depth == 0) {
++ /* there is no tree at all */
++ return 0;
++ }
++
++ if (ex != EXT_FIRST_EXTENT(eh)) {
++ /* we correct tree if first leaf got modified only */
++ return 0;
++ }
++
++ /*
++ * TODO: we need a correction if the border is smaller than the current one
++ */
++ k = depth - 1;
++ border = path[depth].p_ext->ee_block;
++ if ((err = ext3_ext_get_access(handle, tree, path + k)))
++ return err;
++ path[k].p_idx->ei_block = border;
++ if ((err = ext3_ext_dirty(handle, tree, path + k)))
++ return err;
++
++ while (k--) {
++ /* change all left-side indexes */
++ if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
++ break;
++ if ((err = ext3_ext_get_access(handle, tree, path + k)))
++ break;
++ path[k].p_idx->ei_block = border;
++ if ((err = ext3_ext_dirty(handle, tree, path + k)))
++ break;
++ }
++
++ return err;
++}
++
++static inline int
++ext3_can_extents_be_merged(struct ext3_extents_tree *tree,
++ struct ext3_extent *ex1,
++ struct ext3_extent *ex2)
++{
++ if (ex1->ee_block + ex1->ee_len != ex2->ee_block)
++ return 0;
++
++#ifdef AGRESSIVE_TEST
++ if (ex1->ee_len >= 4)
++ return 0;
++#endif
++
++ if (!tree->ops->mergable)
++ return 1;
++
++ return tree->ops->mergable(ex1, ex2);
++}
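++
++/*
++ * Example (illustrative): extents [ee_block 10, ee_len 5, ee_start 100]
++ * and [ee_block 15, ee_len 3, ee_start 105] are logically adjacent; if
++ * the tree's ->mergable callback finds them physically adjacent too,
++ * they can be merged into a single extent [10, 8, 100].
++ */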
++
++/*
++ * this routine tries to merge the requested extent into an existing
++ * extent or inserts the requested extent as a new one into the tree,
++ * creating a new leaf in the no-space case
++ */
++int ext3_ext_insert_extent(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *newext)
++{
++ struct ext3_extent_header * eh;
++ struct ext3_extent *ex, *fex;
++ struct ext3_extent *nearex; /* nearest extent */
++ struct ext3_ext_path *npath = NULL;
++ int depth, len, err, next;
++
++ EXT_ASSERT(newext->ee_len > 0);
++ depth = EXT_DEPTH(tree);
++ ex = path[depth].p_ext;
++ EXT_ASSERT(path[depth].p_hdr);
++
++ /* try to insert block into found extent and return */
++ if (ex && ext3_can_extents_be_merged(tree, ex, newext)) {
++ ext_debug(tree, "append %d block to %d:%d (from %d)\n",
++ newext->ee_len, ex->ee_block, ex->ee_len,
++ ex->ee_start);
++ if ((err = ext3_ext_get_access(handle, tree, path + depth)))
++ return err;
++ ex->ee_len += newext->ee_len;
++ eh = path[depth].p_hdr;
++ nearex = ex;
++ goto merge;
++ }
++
++repeat:
++ depth = EXT_DEPTH(tree);
++ eh = path[depth].p_hdr;
++ if (eh->eh_entries < eh->eh_max)
++ goto has_space;
++
++ /* probably next leaf has space for us? */
++ fex = EXT_LAST_EXTENT(eh);
++ next = ext3_ext_next_leaf_block(tree, path);
++ if (newext->ee_block > fex->ee_block && next != EXT_MAX_BLOCK) {
++ ext_debug(tree, "next leaf block - %d\n", next);
++ EXT_ASSERT(!npath);
++ npath = ext3_ext_find_extent(tree, next, NULL);
++ if (IS_ERR(npath))
++ return PTR_ERR(npath);
++ EXT_ASSERT(npath->p_depth == path->p_depth);
++ eh = npath[depth].p_hdr;
++ if (eh->eh_entries < eh->eh_max) {
++ ext_debug(tree, "next leaf isnt full(%d)\n",
++ eh->eh_entries);
++ path = npath;
++ goto repeat;
++ }
++ ext_debug(tree, "next leaf hasno free space(%d,%d)\n",
++ eh->eh_entries, eh->eh_max);
++ }
++
++ /*
++ * there is no free space in the found leaf;
++ * we're going to add a new leaf to the tree
++ */
++ err = ext3_ext_create_new_leaf(handle, tree, path, newext);
++ if (err)
++ goto cleanup;
++ depth = EXT_DEPTH(tree);
++ eh = path[depth].p_hdr;
++
++has_space:
++ nearex = path[depth].p_ext;
++
++ if ((err = ext3_ext_get_access(handle, tree, path + depth)))
++ goto cleanup;
++
++ if (!nearex) {
++ /* there is no extent in this leaf, create first one */
++ ext_debug(tree, "first extent in the leaf: %d:%d:%d\n",
++ newext->ee_block, newext->ee_start,
++ newext->ee_len);
++ path[depth].p_ext = EXT_FIRST_EXTENT(eh);
++ } else if (newext->ee_block > nearex->ee_block) {
++ EXT_ASSERT(newext->ee_block != nearex->ee_block);
++ if (nearex != EXT_LAST_EXTENT(eh)) {
++ len = EXT_MAX_EXTENT(eh) - nearex;
++ len = (len - 1) * sizeof(struct ext3_extent);
++ len = len < 0 ? 0 : len;
++ ext_debug(tree, "insert %d:%d:%d after: nearest 0x%p, "
++ "move %d from 0x%p to 0x%p\n",
++ newext->ee_block, newext->ee_start,
++ newext->ee_len,
++ nearex, len, nearex + 1, nearex + 2);
++ memmove(nearex + 2, nearex + 1, len);
++ }
++ path[depth].p_ext = nearex + 1;
++ } else {
++ EXT_ASSERT(newext->ee_block != nearex->ee_block);
++ len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent);
++ len = len < 0 ? 0 : len;
++ ext_debug(tree, "insert %d:%d:%d before: nearest 0x%p, "
++ "move %d from 0x%p to 0x%p\n",
++ newext->ee_block, newext->ee_start, newext->ee_len,
++ nearex, len, nearex + 1, nearex + 2);
++ memmove(nearex + 1, nearex, len);
++ path[depth].p_ext = nearex;
++ }
++
++ eh->eh_entries++;
++ nearex = path[depth].p_ext;
++ nearex->ee_block = newext->ee_block;
++ nearex->ee_start = newext->ee_start;
++ nearex->ee_len = newext->ee_len;
++ /* FIXME: support for large fs */
++ nearex->ee_start_hi = 0;
++
++merge:
++ /* try to merge extents to the right */
++ while (nearex < EXT_LAST_EXTENT(eh)) {
++ if (!ext3_can_extents_be_merged(tree, nearex, nearex + 1))
++ break;
++ /* merge with next extent! */
++ nearex->ee_len += nearex[1].ee_len;
++ if (nearex + 1 < EXT_LAST_EXTENT(eh)) {
++ len = (EXT_LAST_EXTENT(eh) - nearex - 1) *
++ sizeof(struct ext3_extent);
++ memmove(nearex + 1, nearex + 2, len);
++ }
++ eh->eh_entries--;
++ EXT_ASSERT(eh->eh_entries > 0);
++ }
++
++ /* try to merge extents to the left */
++
++ /* time to correct all indexes above */
++ err = ext3_ext_correct_indexes(handle, tree, path);
++ if (err)
++ goto cleanup;
++
++ err = ext3_ext_dirty(handle, tree, path + depth);
++
++cleanup:
++ if (npath) {
++ ext3_ext_drop_refs(npath);
++ kfree(npath);
++ }
++ ext3_ext_tree_changed(tree);
++ ext3_ext_invalidate_cache(tree);
++ return err;
++}
++
++int ext3_ext_walk_space(struct ext3_extents_tree *tree, unsigned long block,
++ unsigned long num, ext_prepare_callback func)
++{
++ struct ext3_ext_path *path = NULL;
++ struct ext3_ext_cache cbex;
++ struct ext3_extent *ex;
++ unsigned long next, start = 0, end = 0;
++ unsigned long last = block + num;
++ int depth, exists, err = 0;
++
++ EXT_ASSERT(tree);
++ EXT_ASSERT(func);
++ EXT_ASSERT(tree->inode);
++ EXT_ASSERT(tree->root);
++
++ while (block < last && block != EXT_MAX_BLOCK) {
++ num = last - block;
++ /* find extent for this block */
++ path = ext3_ext_find_extent(tree, block, path);
++ if (IS_ERR(path)) {
++ err = PTR_ERR(path);
++ path = NULL;
++ break;
++ }
++
++ depth = EXT_DEPTH(tree);
++ EXT_ASSERT(path[depth].p_hdr);
++ ex = path[depth].p_ext;
++ next = ext3_ext_next_allocated_block(path);
++
++ exists = 0;
++ if (!ex) {
++ /* there is no extent yet, so try to allocate
++ * all requested space */
++ start = block;
++ end = block + num;
++ } else if (ex->ee_block > block) {
++ /* need to allocate space before found extent */
++ start = block;
++ end = ex->ee_block;
++ if (block + num < end)
++ end = block + num;
++ } else if (block >= ex->ee_block + ex->ee_len) {
++ /* need to allocate space after found extent */
++ start = block;
++ end = block + num;
++ if (end >= next)
++ end = next;
++ } else if (block >= ex->ee_block) {
++ /*
++ * some part of requested space is covered
++ * by found extent
++ */
++ start = block;
++ end = ex->ee_block + ex->ee_len;
++ if (block + num < end)
++ end = block + num;
++ exists = 1;
++ } else {
++ BUG();
++ }
++ EXT_ASSERT(end > start);
++
++ if (!exists) {
++ cbex.ec_block = start;
++ cbex.ec_len = end - start;
++ cbex.ec_start = 0;
++ cbex.ec_type = EXT3_EXT_CACHE_GAP;
++ } else {
++ cbex.ec_block = ex->ee_block;
++ cbex.ec_len = ex->ee_len;
++ cbex.ec_start = ex->ee_start;
++ cbex.ec_type = EXT3_EXT_CACHE_EXTENT;
++ }
++
++ EXT_ASSERT(cbex.ec_len > 0);
++ EXT_ASSERT(path[depth].p_hdr);
++ err = func(tree, path, &cbex);
++ ext3_ext_drop_refs(path);
++
++ if (err < 0)
++ break;
++ if (err == EXT_REPEAT)
++ continue;
++ else if (err == EXT_BREAK) {
++ err = 0;
++ break;
++ }
++
++ if (EXT_DEPTH(tree) != depth) {
++ /* depth was changed. we have to realloc path */
++ kfree(path);
++ path = NULL;
++ }
++
++ block = cbex.ec_block + cbex.ec_len;
++ }
++
++ if (path) {
++ ext3_ext_drop_refs(path);
++ kfree(path);
++ }
++
++ return err;
++}
++
++static inline void
++ext3_ext_put_in_cache(struct ext3_extents_tree *tree, __u32 block,
++ __u32 len, __u32 start, int type)
++{
++ EXT_ASSERT(len > 0);
++ if (tree->cex) {
++ tree->cex->ec_type = type;
++ tree->cex->ec_block = block;
++ tree->cex->ec_len = len;
++ tree->cex->ec_start = start;
++ }
++}
++
++/*
++ * this routine calculates the boundaries of the gap the requested block
++ * fits into, and caches this gap
++ */
++static inline void
++ext3_ext_put_gap_in_cache(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ unsigned long block)
++{
++ int depth = EXT_DEPTH(tree);
++ unsigned long lblock, len;
++ struct ext3_extent *ex;
++
++ if (!tree->cex)
++ return;
++
++ ex = path[depth].p_ext;
++ if (ex == NULL) {
++ /* there is no extent yet, so gap is [0;-] */
++ lblock = 0;
++ len = EXT_MAX_BLOCK;
++ ext_debug(tree, "cache gap(whole file):");
++ } else if (block < ex->ee_block) {
++ lblock = block;
++ len = ex->ee_block - block;
++ ext_debug(tree, "cache gap(before): %lu [%lu:%lu]",
++ (unsigned long) block,
++ (unsigned long) ex->ee_block,
++ (unsigned long) ex->ee_len);
++ } else if (block >= ex->ee_block + ex->ee_len) {
++ lblock = ex->ee_block + ex->ee_len;
++ len = ext3_ext_next_allocated_block(path);
++ ext_debug(tree, "cache gap(after): [%lu:%lu] %lu",
++ (unsigned long) ex->ee_block,
++ (unsigned long) ex->ee_len,
++ (unsigned long) block);
++ EXT_ASSERT(len > lblock);
++ len = len - lblock;
++ } else {
++ lblock = len = 0;
++ BUG();
++ }
++
++ ext_debug(tree, " -> %lu:%lu\n", (unsigned long) lblock, len);
++ ext3_ext_put_in_cache(tree, lblock, len, 0, EXT3_EXT_CACHE_GAP);
++}
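++
++/*
++ * Example (illustrative): with extents covering blocks 0-9 and 50-59, a
++ * lookup for block 20 caches the gap [10; len 40], so subsequent lookups
++ * inside that hole are answered without another tree descent.
++ */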
++
++static inline int
++ext3_ext_in_cache(struct ext3_extents_tree *tree, unsigned long block,
++ struct ext3_extent *ex)
++{
++ struct ext3_ext_cache *cex = tree->cex;
++
++ /* is there cache storage at all? */
++ if (!cex)
++ return EXT3_EXT_CACHE_NO;
++
++ /* has cache valid data? */
++ if (cex->ec_type == EXT3_EXT_CACHE_NO)
++ return EXT3_EXT_CACHE_NO;
++
++ EXT_ASSERT(cex->ec_type == EXT3_EXT_CACHE_GAP ||
++ cex->ec_type == EXT3_EXT_CACHE_EXTENT);
++ if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) {
++ ex->ee_block = cex->ec_block;
++ ex->ee_start = cex->ec_start;
++ ex->ee_len = cex->ec_len;
++ ext_debug(tree, "%lu cached by %lu:%lu:%lu\n",
++ (unsigned long) block,
++ (unsigned long) ex->ee_block,
++ (unsigned long) ex->ee_len,
++ (unsigned long) ex->ee_start);
++ return cex->ec_type;
++ }
++
++ /* not in cache */
++ return EXT3_EXT_CACHE_NO;
++}
++
++/*
++ * routine removes the index from the index block.
++ * it's used in the truncate case only, thus all requests are for
++ * the last index in the block only
++ */
++int ext3_ext_rm_idx(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ struct buffer_head *bh;
++ int err;
++
++ /* free index block */
++ path--;
++ EXT_ASSERT(path->p_hdr->eh_entries);
++ if ((err = ext3_ext_get_access(handle, tree, path)))
++ return err;
++ path->p_hdr->eh_entries--;
++ if ((err = ext3_ext_dirty(handle, tree, path)))
++ return err;
++ ext_debug(tree, "index is empty, remove it, free block %d\n",
++ path->p_idx->ei_leaf);
++ bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf);
++ ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf);
++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1);
++ return err;
++}
++
++int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path)
++{
++ int depth = EXT_DEPTH(tree);
++ int needed;
++
++ if (path) {
++ /* maybe there is space in the leaf? */
++ if (path[depth].p_hdr->eh_entries < path[depth].p_hdr->eh_max)
++ return 1;
++ }
++
++ /*
++ * the worst case we're expecting is creation of a
++ * new root (growing in depth) with index splitting.
++ * for the splitting we have to consider depth + 1 because
++ * the previous grow could have increased it
++ */
++ depth = depth + 1;
++
++ /*
++ * growing in depth:
++ * block allocation + new root + old root
++ */
++ needed = EXT3_ALLOC_NEEDED + 2;
++
++ /* index split. we may need to:
++ * allocate intermediate indexes and a new leaf,
++ * change two blocks at each level but the root,
++ * and modify the root block (inode)
++ */
++ needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1;
++
++ return needed;
++}
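++
++/*
++ * Worked example (illustrative): for a depth-2 tree with no path hint,
++ * depth is bumped to 3 and the routine returns
++ * (EXT3_ALLOC_NEEDED + 2) + 3 * EXT3_ALLOC_NEEDED + 2 * 3 + 1,
++ * i.e. 4 * EXT3_ALLOC_NEEDED + 9 journal credits.
++ */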
++
++static int
++ext3_ext_split_for_rm(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path, unsigned long start,
++ unsigned long end)
++{
++ struct ext3_extent *ex, tex;
++ struct ext3_ext_path *npath;
++ int depth, creds, err;
++
++ depth = EXT_DEPTH(tree);
++ ex = path[depth].p_ext;
++ EXT_ASSERT(ex);
++ EXT_ASSERT(end < ex->ee_block + ex->ee_len - 1);
++ EXT_ASSERT(ex->ee_block < start);
++
++ /* calculate tail extent */
++ tex.ee_block = end + 1;
++ EXT_ASSERT(tex.ee_block < ex->ee_block + ex->ee_len);
++ tex.ee_len = ex->ee_block + ex->ee_len - tex.ee_block;
++
++ creds = ext3_ext_calc_credits_for_insert(tree, path);
++ handle = ext3_ext_journal_restart(handle, creds);
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++
++ /* calculate head extent. use primary extent */
++ err = ext3_ext_get_access(handle, tree, path + depth);
++ if (err)
++ return err;
++ ex->ee_len = start - ex->ee_block;
++ err = ext3_ext_dirty(handle, tree, path + depth);
++ if (err)
++ return err;
++
++ /* FIXME: some callback to free underlying resource
++ * and correct ee_start? */
++ ext_debug(tree, "split extent: head %u:%u, tail %u:%u\n",
++ ex->ee_block, ex->ee_len, tex.ee_block, tex.ee_len);
++
++ npath = ext3_ext_find_extent(tree, ex->ee_block, NULL);
++ if (IS_ERR(npath))
++ return PTR_ERR(npath);
++ depth = EXT_DEPTH(tree);
++ EXT_ASSERT(npath[depth].p_ext->ee_block == ex->ee_block);
++ EXT_ASSERT(npath[depth].p_ext->ee_len == ex->ee_len);
++
++ err = ext3_ext_insert_extent(handle, tree, npath, &tex);
++ ext3_ext_drop_refs(npath);
++ kfree(npath);
++
++ return err;
++}
++
++static int
++ext3_ext_rm_leaf(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path, unsigned long start,
++ unsigned long end)
++{
++ struct ext3_extent *ex, *fu = NULL, *lu, *le;
++ int err = 0, correct_index = 0;
++ int depth = EXT_DEPTH(tree), credits;
++ struct ext3_extent_header *eh;
++ unsigned a, b, block, num;
++
++ ext_debug(tree, "remove [%lu:%lu] in leaf\n", start, end);
++ if (!path[depth].p_hdr)
++ path[depth].p_hdr = EXT_BLOCK_HDR(path[depth].p_bh);
++ eh = path[depth].p_hdr;
++ EXT_ASSERT(eh);
++ EXT_ASSERT(eh->eh_entries <= eh->eh_max);
++ EXT_ASSERT(eh->eh_magic == EXT3_EXT_MAGIC);
++
++ /* find where to start removing */
++ le = ex = EXT_LAST_EXTENT(eh);
++ while (ex != EXT_FIRST_EXTENT(eh)) {
++ if (ex->ee_block <= end)
++ break;
++ ex--;
++ }
++
++ if (start > ex->ee_block && end < ex->ee_block + ex->ee_len - 1) {
++ /* removal of an internal part of the extent was requested;
++ * tail and head must be placed in different extents,
++ * so we have to insert one more extent */
++ path[depth].p_ext = ex;
++ return ext3_ext_split_for_rm(handle, tree, path, start, end);
++ }
++
++ lu = ex;
++ while (ex >= EXT_FIRST_EXTENT(eh) && ex->ee_block + ex->ee_len > start) {
++ ext_debug(tree, "remove ext %u:%u\n", ex->ee_block, ex->ee_len);
++ path[depth].p_ext = ex;
++
++ a = ex->ee_block > start ? ex->ee_block : start;
++ b = ex->ee_block + ex->ee_len - 1 < end ?
++ ex->ee_block + ex->ee_len - 1 : end;
++
++ ext_debug(tree, " border %u:%u\n", a, b);
++
++ if (a != ex->ee_block && b != ex->ee_block + ex->ee_len - 1) {
++ block = 0;
++ num = 0;
++ BUG();
++ } else if (a != ex->ee_block) {
++ /* remove tail of the extent */
++ block = ex->ee_block;
++ num = a - block;
++ } else if (b != ex->ee_block + ex->ee_len - 1) {
++ /* remove head of the extent */
++ block = a;
++ num = b - a;
++ } else {
++ /* remove the whole extent: excellent! */
++ block = ex->ee_block;
++ num = 0;
++ EXT_ASSERT(a == ex->ee_block &&
++ b == ex->ee_block + ex->ee_len - 1);
++ }
++
++ if (ex == EXT_FIRST_EXTENT(eh))
++ correct_index = 1;
++
++ credits = 1;
++ if (correct_index)
++ credits += (EXT_DEPTH(tree) * EXT3_ALLOC_NEEDED) + 1;
++ if (tree->ops->remove_extent_credits)
++ credits += tree->ops->remove_extent_credits(tree, ex, a, b);
++
++ handle = ext3_ext_journal_restart(handle, credits);
++ if (IS_ERR(handle)) {
++ err = PTR_ERR(handle);
++ goto out;
++ }
++
++ err = ext3_ext_get_access(handle, tree, path + depth);
++ if (err)
++ goto out;
++
++ if (tree->ops->remove_extent)
++ err = tree->ops->remove_extent(tree, ex, a, b);
++ if (err)
++ goto out;
++
++ if (num == 0) {
++ /* this extent is removed entirely; mark the slot unused */
++ ex->ee_start = 0;
++ eh->eh_entries--;
++ fu = ex;
++ }
++
++ ex->ee_block = block;
++ ex->ee_len = num;
++
++ err = ext3_ext_dirty(handle, tree, path + depth);
++ if (err)
++ goto out;
++
++ ext_debug(tree, "new extent: %u:%u:%u\n",
++ ex->ee_block, ex->ee_len, ex->ee_start);
++ ex--;
++ }
++
++ if (fu) {
++ /* reuse unused slots */
++ while (lu < le) {
++ if (lu->ee_start) {
++ *fu = *lu;
++ lu->ee_start = 0;
++ fu++;
++ }
++ lu++;
++ }
++ }
++
++ if (correct_index && eh->eh_entries)
++ err = ext3_ext_correct_indexes(handle, tree, path);
++
++ /* if this leaf is now free, then we should
++ * remove it from the index block above */
++ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
++ err = ext3_ext_rm_idx(handle, tree, path + depth);
++
++out:
++ return err;
++}
++
++
++static struct ext3_extent_idx *
++ext3_ext_last_covered(struct ext3_extent_header *hdr, unsigned long block)
++{
++ struct ext3_extent_idx *ix;
++
++ ix = EXT_LAST_INDEX(hdr);
++ while (ix != EXT_FIRST_INDEX(hdr)) {
++ if (ix->ei_block <= block)
++ break;
++ ix--;
++ }
++ return ix;
++}
++
++/*
++ * returns 1 if the current index has to be freed (even partially)
++ */
++static inline int
++ext3_ext_more_to_rm(struct ext3_ext_path *path)
++{
++ EXT_ASSERT(path->p_idx);
++
++ if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
++ return 0;
++
++ /*
++ * if truncation happened at a deeper level, it wasn't partial,
++ * so we have to consider the current index for truncation
++ */
++ if (path->p_hdr->eh_entries == path->p_block)
++ return 0;
++ return 1;
++}
++
++int ext3_ext_remove_space(struct ext3_extents_tree *tree,
++ unsigned long start, unsigned long end)
++{
++ struct inode *inode = tree->inode;
++ struct super_block *sb = inode->i_sb;
++ int depth = EXT_DEPTH(tree);
++ struct ext3_ext_path *path;
++ handle_t *handle;
++ int i = 0, err = 0;
++
++ ext_debug(tree, "space to be removed: %lu:%lu\n", start, end);
++
++ /* probably the first extent we're going to free will be the last in its block */
++ handle = ext3_journal_start(inode, depth + 1);
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++
++ ext3_ext_invalidate_cache(tree);
++
++ /*
++ * we start scanning from the right side, freeing all the blocks
++ * after i_size and walking down into the tree
++ */
++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL);
++ if (path == NULL) {
++ ext3_error(sb, __FUNCTION__, "Can't allocate path array");
++ ext3_journal_stop(handle);
++ return -ENOMEM;
++ }
++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1));
++ path[i].p_hdr = EXT_ROOT_HDR(tree);
++
++ while (i >= 0 && err == 0) {
++ if (i == depth) {
++ /* this is leaf block */
++ err = ext3_ext_rm_leaf(handle, tree, path, start, end);
++ /* the root level has p_bh == NULL, brelse() copes with that */
++ brelse(path[i].p_bh);
++ i--;
++ continue;
++ }
++
++ /* this is index block */
++ if (!path[i].p_hdr) {
++ ext_debug(tree, "initialize header\n");
++ path[i].p_hdr = EXT_BLOCK_HDR(path[i].p_bh);
++ }
++
++ EXT_ASSERT(path[i].p_hdr->eh_entries <= path[i].p_hdr->eh_max);
++ EXT_ASSERT(path[i].p_hdr->eh_magic == EXT3_EXT_MAGIC);
++
++ if (!path[i].p_idx) {
++ /* this level hasn't been touched yet */
++ path[i].p_idx =
++ ext3_ext_last_covered(path[i].p_hdr, end);
++ path[i].p_block = path[i].p_hdr->eh_entries + 1;
++ ext_debug(tree, "init index ptr: hdr 0x%p, num %d\n",
++ path[i].p_hdr, path[i].p_hdr->eh_entries);
++ } else {
++ /* we were here already, look at the next index */
++ path[i].p_idx--;
++ }
++
++ ext_debug(tree, "level %d - index, first 0x%p, cur 0x%p\n",
++ i, EXT_FIRST_INDEX(path[i].p_hdr),
++ path[i].p_idx);
++ if (ext3_ext_more_to_rm(path + i)) {
++ /* go to the next level */
++ ext_debug(tree, "move to level %d (block %d)\n",
++ i + 1, path[i].p_idx->ei_leaf);
++ memset(path + i + 1, 0, sizeof(*path));
++ path[i+1].p_bh = sb_bread(sb, path[i].p_idx->ei_leaf);
++ if (!path[i+1].p_bh) {
++ /* should we reset i_size? */
++ err = -EIO;
++ break;
++ }
++ /* record the actual number of indexes so we can tell
++ * whether it has changed at the next iteration */
++ path[i].p_block = path[i].p_hdr->eh_entries;
++ i++;
++ } else {
++ /* we finished processing this index, go up */
++ if (path[i].p_hdr->eh_entries == 0 && i > 0) {
++ /* index is empty, remove it;
++ * the handle must already be prepared by
++ * truncatei_leaf() */
++ err = ext3_ext_rm_idx(handle, tree, path + i);
++ }
++ /* the root level has p_bh == NULL, brelse() copes with that */
++ brelse(path[i].p_bh);
++ i--;
++ ext_debug(tree, "return to level %d\n", i);
++ }
++ }
++
++ /* TODO: flexible tree reduction should be here */
++ if (path->p_hdr->eh_entries == 0) {
++ /*
++ * truncation to zero freed the whole tree,
++ * so we need to correct eh_depth
++ */
++ err = ext3_ext_get_access(handle, tree, path);
++ if (err == 0) {
++ EXT_ROOT_HDR(tree)->eh_depth = 0;
++ EXT_ROOT_HDR(tree)->eh_max = ext3_ext_space_root(tree);
++ err = ext3_ext_dirty(handle, tree, path);
++ }
++ }
++ ext3_ext_tree_changed(tree);
++
++ kfree(path);
++ ext3_journal_stop(handle);
++
++ return err;
++}
++
++int ext3_ext_calc_metadata_amount(struct ext3_extents_tree *tree, int blocks)
++{
++ int lcap, icap, rcap, leafs, idxs, num;
++
++ rcap = ext3_ext_space_root(tree);
++ if (blocks <= rcap) {
++ /* all extents fit to the root */
++ return 0;
++ }
++
++ rcap = ext3_ext_space_root_idx(tree);
++ lcap = ext3_ext_space_block(tree);
++ icap = ext3_ext_space_block_idx(tree);
++
++ num = leafs = (blocks + lcap - 1) / lcap;
++ if (leafs <= rcap) {
++ /* all pointers to leafs fit to the root */
++ return leafs;
++ }
++
++ /* ok. we need separate index block(s) to link all leaf blocks */
++ idxs = (leafs + icap - 1) / icap;
++ do {
++ num += idxs;
++ idxs = (idxs + icap - 1) / icap;
++ } while (idxs > rcap);
++
++ return num;
++}
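++
++/*
++ * Worked example (illustrative, assuming 4KB blocks and the capacities
++ * computed above): 1000 extents need (1000 + 339) / 340 = 3 leaf blocks;
++ * 3 fits into the 4 root index slots, so the routine returns 3.
++ */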
++
++/*
++ * called at mount time
++ */
++void ext3_ext_init(struct super_block *sb)
++{
++ /*
++ * possible initialization would be here
++ */
++
++ if (test_opt(sb, EXTENTS)) {
++ printk("EXT3-fs: file extents enabled");
++#ifdef AGRESSIVE_TEST
++ printk(", agressive tests");
++#endif
++#ifdef CHECK_BINSEARCH
++ printk(", check binsearch");
++#endif
++ printk("\n");
++ }
++}
++
++/*
++ * called at umount time
++ */
++void ext3_ext_release(struct super_block *sb)
++{
++}
++
++/************************************************************************
++ * VFS related routines
++ ************************************************************************/
++
++static int ext3_get_inode_write_access(handle_t *handle, void *buffer)
++{
++ /* we use in-core data, not bh */
++ return 0;
++}
++
++static int ext3_mark_buffer_dirty(handle_t *handle, void *buffer)
++{
++ struct inode *inode = buffer;
++ return ext3_mark_inode_dirty(handle, inode);
++}
++
++static int ext3_ext_mergable(struct ext3_extent *ex1,
++ struct ext3_extent *ex2)
++{
++ /* FIXME: support for large fs */
++ if (ex1->ee_start + ex1->ee_len == ex2->ee_start)
++ return 1;
++ return 0;
++}
++
++static int
++ext3_remove_blocks_credits(struct ext3_extents_tree *tree,
++ struct ext3_extent *ex,
++ unsigned long from, unsigned long to)
++{
++ int needed;
++
++ /* at present, an extent can't cross a block group boundary */
++ needed = 4; /* bitmap + group desc + sb + inode */
++
++#ifdef CONFIG_QUOTA
++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
++#endif
++ return needed;
++}
++
++static int
++ext3_remove_blocks(struct ext3_extents_tree *tree,
++ struct ext3_extent *ex,
++ unsigned long from, unsigned long to)
++{
++ int needed = ext3_remove_blocks_credits(tree, ex, from, to);
++ handle_t *handle = ext3_journal_start(tree->inode, needed);
++ struct buffer_head *bh;
++ int i;
++
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++ if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) {
++ /* tail removal */
++ unsigned long num, start;
++ num = ex->ee_block + ex->ee_len - from;
++ start = ex->ee_start + ex->ee_len - num;
++ ext_debug(tree, "free last %lu blocks starting %lu\n",
++ num, start);
++ for (i = 0; i < num; i++) {
++ bh = sb_find_get_block(tree->inode->i_sb, start + i);
++ ext3_forget(handle, 0, tree->inode, bh, start + i);
++ }
++ ext3_free_blocks(handle, tree->inode, start, num);
++ } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) {
++ printk("strange request: removal %lu-%lu from %u:%u\n",
++ from, to, ex->ee_block, ex->ee_len);
++ } else {
++ printk("strange request: removal(2) %lu-%lu from %u:%u\n",
++ from, to, ex->ee_block, ex->ee_len);
++ }
++ ext3_journal_stop(handle);
++ return 0;
++}
++
++static int ext3_ext_find_goal(struct inode *inode,
++ struct ext3_ext_path *path, unsigned long block)
++{
++ struct ext3_inode_info *ei = EXT3_I(inode);
++ unsigned long bg_start;
++ unsigned long colour;
++ int depth;
++
++ if (path) {
++ struct ext3_extent *ex;
++ depth = path->p_depth;
++
++ /* try to predict block placement */
++ if ((ex = path[depth].p_ext))
++ return ex->ee_start + (block - ex->ee_block);
++
++ /* it looks like the index is empty;
++ * try to find a goal starting from the index block itself */
++ if (path[depth].p_bh)
++ return path[depth].p_bh->b_blocknr;
++ }
++
++ /* OK. use inode's group */
++ bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
++ colour = (current->pid % 16) *
++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
++ return bg_start + colour + block;
++}
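++
++/*
++ * Example (illustrative): with 32768 blocks per group, the PID-based
++ * colour picks one of 16 slots of 2048 blocks each, so concurrent
++ * writers tend to get distinct goal areas inside the inode's group.
++ */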
++
++static int ext3_new_block_cb(handle_t *handle, struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_extent *ex, int *err)
++{
++ struct inode *inode = tree->inode;
++ int newblock, goal;
++
++ EXT_ASSERT(path);
++ EXT_ASSERT(ex);
++ EXT_ASSERT(ex->ee_start);
++ EXT_ASSERT(ex->ee_len);
++
++ /* reuse block from the extent to order data/metadata */
++ newblock = ex->ee_start++;
++ ex->ee_len--;
++ if (ex->ee_len == 0) {
++ ex->ee_len = 1;
++ /* allocate new block for the extent */
++ goal = ext3_ext_find_goal(inode, path, ex->ee_block);
++ ex->ee_start = ext3_new_block(handle, inode, goal, err);
++ if (ex->ee_start == 0) {
++ /* error occurred: restore the old extent */
++ ex->ee_start = newblock;
++ return 0;
++ }
++ }
++ return newblock;
++}
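++
++/*
++ * Design note on the trick above: handing out the extent's own first
++ * block for metadata keeps index blocks physically close to the data
++ * they map; when the extent runs empty, it is refilled with one freshly
++ * allocated block near the original goal, so the caller still sees a
++ * usable extent.
++ */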
++
++static struct ext3_extents_helpers ext3_blockmap_helpers = {
++ .get_write_access = ext3_get_inode_write_access,
++ .mark_buffer_dirty = ext3_mark_buffer_dirty,
++ .mergable = ext3_ext_mergable,
++ .new_block = ext3_new_block_cb,
++ .remove_extent = ext3_remove_blocks,
++ .remove_extent_credits = ext3_remove_blocks_credits,
++};
++
++void ext3_init_tree_desc(struct ext3_extents_tree *tree,
++ struct inode *inode)
++{
++ tree->inode = inode;
++ tree->root = (void *) EXT3_I(inode)->i_data;
++ tree->buffer = (void *) inode;
++ tree->buffer_len = sizeof(EXT3_I(inode)->i_data);
++ tree->cex = (struct ext3_ext_cache *) &EXT3_I(inode)->i_cached_extent;
++ tree->ops = &ext3_blockmap_helpers;
++}
++
++int ext3_ext_get_block(handle_t *handle, struct inode *inode,
++ long iblock, struct buffer_head *bh_result,
++ int create, int extend_disksize)
++{
++ struct ext3_ext_path *path = NULL;
++ struct ext3_extent newex;
++ struct ext3_extent *ex;
++ int goal, newblock, err = 0, depth;
++ struct ext3_extents_tree tree;
++
++ clear_buffer_new(bh_result);
++ ext3_init_tree_desc(&tree, inode);
++ ext_debug(&tree, "block %d requested for inode %u\n",
++ (int) iblock, (unsigned) inode->i_ino);
++ down(&EXT3_I(inode)->truncate_sem);
++
++ /* check in cache */
++ if ((goal = ext3_ext_in_cache(&tree, iblock, &newex))) {
++ if (goal == EXT3_EXT_CACHE_GAP) {
++ if (!create) {
++ /* the block isn't allocated yet and
++ * the user doesn't want to allocate it */
++ goto out2;
++ }
++ /* we should allocate requested block */
++ } else if (goal == EXT3_EXT_CACHE_EXTENT) {
++ /* block is already allocated */
++ newblock = iblock - newex.ee_block + newex.ee_start;
++ goto out;
++ } else {
++ EXT_ASSERT(0);
++ }
++ }
++
++ /* find extent for this block */
++ path = ext3_ext_find_extent(&tree, iblock, NULL);
++ if (IS_ERR(path)) {
++ err = PTR_ERR(path);
++ path = NULL;
++ goto out2;
++ }
++
++ depth = EXT_DEPTH(&tree);
++
++ /*
++ * a consistent leaf must not be empty;
++ * this situation is possible, though, _during_ tree modification;
++ * this is why the assert can't be put in ext3_ext_find_extent()
++ */
++ EXT_ASSERT(path[depth].p_ext != NULL || depth == 0);
++
++ if ((ex = path[depth].p_ext)) {
++ /* if the found extent covers the block, simply return it */
++ if (iblock >= ex->ee_block && iblock < ex->ee_block + ex->ee_len) {
++ newblock = iblock - ex->ee_block + ex->ee_start;
++ ext_debug(&tree, "%d fit into %d:%d -> %d\n",
++ (int) iblock, ex->ee_block, ex->ee_len,
++ newblock);
++ ext3_ext_put_in_cache(&tree, ex->ee_block,
++ ex->ee_len, ex->ee_start,
++ EXT3_EXT_CACHE_EXTENT);
++ goto out;
++ }
++ }
++
++ /*
++ * the requested block isn't allocated yet;
++ * we must not try to create the block if the create flag is zero
++ */
++ if (!create) {
++ /* put the just-found gap into the cache to speed up subsequent requests */
++ ext3_ext_put_gap_in_cache(&tree, path, iblock);
++ goto out2;
++ }
++
++ /* allocate new block */
++ goal = ext3_ext_find_goal(inode, path, iblock);
++ newblock = ext3_new_block(handle, inode, goal, &err);
++ if (!newblock)
++ goto out2;
++ ext_debug(&tree, "allocate new block: goal %d, found %d\n",
++ goal, newblock);
++
++ /* try to insert new extent into found leaf and return */
++ newex.ee_block = iblock;
++ newex.ee_start = newblock;
++ newex.ee_len = 1;
++ err = ext3_ext_insert_extent(handle, &tree, path, &newex);
++ if (err)
++ goto out2;
++
++ if (extend_disksize && inode->i_size > EXT3_I(inode)->i_disksize)
++ EXT3_I(inode)->i_disksize = inode->i_size;
++
++ /* the previous routine could have used the block we allocated */
++ newblock = newex.ee_start;
++ set_buffer_new(bh_result);
++
++ ext3_ext_put_in_cache(&tree, newex.ee_block, newex.ee_len,
++ newex.ee_start, EXT3_EXT_CACHE_EXTENT);
++out:
++ ext3_ext_show_leaf(&tree, path);
++ map_bh(bh_result, inode->i_sb, newblock);
++out2:
++ if (path) {
++ ext3_ext_drop_refs(path);
++ kfree(path);
++ }
++ up(&EXT3_I(inode)->truncate_sem);
++
++ return err;
++}
++
++void ext3_ext_truncate(struct inode * inode, struct page *page)
++{
++ struct address_space *mapping = inode->i_mapping;
++ struct super_block *sb = inode->i_sb;
++ struct ext3_extents_tree tree;
++ unsigned long last_block;
++ handle_t *handle;
++ int err = 0;
++
++ ext3_init_tree_desc(&tree, inode);
++
++ /*
++ * probably the first extent we're going to free will be the last in its block
++ */
++ err = ext3_writepage_trans_blocks(inode) + 3;
++ handle = ext3_journal_start(inode, err);
++ if (IS_ERR(handle)) {
++ if (page) {
++ clear_highpage(page);
++ flush_dcache_page(page);
++ unlock_page(page);
++ page_cache_release(page);
++ }
++ return;
++ }
++
++ if (page)
++ ext3_block_truncate_page(handle, page, mapping, inode->i_size);
++
++ down(&EXT3_I(inode)->truncate_sem);
++ ext3_ext_invalidate_cache(&tree);
++
++ /*
++ * TODO: optimization is possible here;
++ * probably we need no scanning at all,
++ * because page truncation is enough
++ */
++ if (ext3_orphan_add(handle, inode))
++ goto out_stop;
++
++ /* we have to know where to truncate from in crash case */
++ EXT3_I(inode)->i_disksize = inode->i_size;
++ ext3_mark_inode_dirty(handle, inode);
++
++ last_block = (inode->i_size + sb->s_blocksize - 1) >>
++ EXT3_BLOCK_SIZE_BITS(sb);
++ err = ext3_ext_remove_space(&tree, last_block, EXT_MAX_BLOCK);
++
++ /* In a multi-transaction truncate, we only make the final
++ * transaction synchronous */
++ if (IS_SYNC(inode))
++ handle->h_sync = 1;
++
++out_stop:
++ /*
++ * If this was a simple ftruncate(), and the file will remain alive
++ * then we need to clear up the orphan record which we created above.
++ * However, if this was a real unlink then we were called by
++ * ext3_delete_inode(), and we allow that function to clean up the
++ * orphan info for us.
++ */
++ if (inode->i_nlink)
++ ext3_orphan_del(handle, inode);
++
++ up(&EXT3_I(inode)->truncate_sem);
++ ext3_journal_stop(handle);
++}
++
++/*
++ * this routine calculates the max number of blocks we could modify
++ * in order to allocate a new block for an inode
++ */
++int ext3_ext_writepage_trans_blocks(struct inode *inode, int num)
++{
++ struct ext3_extents_tree tree;
++ int needed;
++
++ ext3_init_tree_desc(&tree, inode);
++
++ needed = ext3_ext_calc_credits_for_insert(&tree, NULL);
++
++ /* the caller wants to allocate num blocks */
++ needed *= num;
++
++#ifdef CONFIG_QUOTA
++ /*
++ * FIXME: the real calculation should be here;
++ * it depends on the blockmap format of the quota file
++ */
++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
++#endif
++
++ return needed;
++}
++
++void ext3_extents_initialize_blockmap(handle_t *handle, struct inode *inode)
++{
++ struct ext3_extents_tree tree;
++
++ ext3_init_tree_desc(&tree, inode);
++ ext3_extent_tree_init(handle, &tree);
++}
++
++int ext3_ext_calc_blockmap_metadata(struct inode *inode, int blocks)
++{
++ struct ext3_extents_tree tree;
++
++ ext3_init_tree_desc(&tree, inode);
++ return ext3_ext_calc_metadata_amount(&tree, blocks);
++}
++
++static int
++ext3_ext_store_extent_cb(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_ext_cache *newex)
++{
++ struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private;
++
++ if (newex->ec_type != EXT3_EXT_CACHE_EXTENT)
++ return EXT_CONTINUE;
++
++ if (buf->err < 0)
++ return EXT_BREAK;
++ if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen)
++ return EXT_BREAK;
++
++ if (!copy_to_user(buf->cur, newex, sizeof(*newex))) {
++ buf->err++;
++ buf->cur += sizeof(*newex);
++ } else {
++ buf->err = -EFAULT;
++ return EXT_BREAK;
++ }
++ return EXT_CONTINUE;
++}
++
++static int
++ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree,
++ struct ext3_ext_path *path,
++ struct ext3_ext_cache *ex)
++{
++ struct ext3_extent_tree_stats *buf =
++ (struct ext3_extent_tree_stats *) tree->private;
++ int depth;
++
++ if (ex->ec_type != EXT3_EXT_CACHE_EXTENT)
++ return EXT_CONTINUE;
++
++ depth = EXT_DEPTH(tree);
++ buf->extents_num++;
++ if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr))
++ buf->leaf_num++;
++ return EXT_CONTINUE;
++}
++
++int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
++ unsigned long arg)
++{
++ int err = 0;
++
++ if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL))
++ return -EINVAL;
++
++ if (cmd == EXT3_IOC_GET_EXTENTS) {
++ struct ext3_extent_buf buf;
++ struct ext3_extents_tree tree;
++
++ if (copy_from_user(&buf, (void *) arg, sizeof(buf)))
++ return -EFAULT;
++
++ ext3_init_tree_desc(&tree, inode);
++ buf.cur = buf.buffer;
++ buf.err = 0;
++ tree.private = &buf;
++ down(&EXT3_I(inode)->truncate_sem);
++ err = ext3_ext_walk_space(&tree, buf.start, EXT_MAX_BLOCK,
++ ext3_ext_store_extent_cb);
++ up(&EXT3_I(inode)->truncate_sem);
++ if (err == 0)
++ err = buf.err;
++ } else if (cmd == EXT3_IOC_GET_TREE_STATS) {
++ struct ext3_extent_tree_stats buf;
++ struct ext3_extents_tree tree;
++
++ ext3_init_tree_desc(&tree, inode);
++ down(&EXT3_I(inode)->truncate_sem);
++ buf.depth = EXT_DEPTH(&tree);
++ buf.extents_num = 0;
++ buf.leaf_num = 0;
++ tree.private = &buf;
++ err = ext3_ext_walk_space(&tree, 0, EXT_MAX_BLOCK,
++ ext3_ext_collect_stats_cb);
++ up(&EXT3_I(inode)->truncate_sem);
++ if (!err && copy_to_user((void *) arg, &buf, sizeof(buf)))
++ err = -EFAULT;
++ } else if (cmd == EXT3_IOC_GET_TREE_DEPTH) {
++ struct ext3_extents_tree tree;
++ ext3_init_tree_desc(&tree, inode);
++ down(&EXT3_I(inode)->truncate_sem);
++ err = EXT_DEPTH(&tree);
++ up(&EXT3_I(inode)->truncate_sem);
++ }
++
++ return err;
++}
++
++EXPORT_SYMBOL(ext3_init_tree_desc);
++EXPORT_SYMBOL(ext3_mark_inode_dirty);
++EXPORT_SYMBOL(ext3_ext_invalidate_cache);
++EXPORT_SYMBOL(ext3_ext_insert_extent);
++EXPORT_SYMBOL(ext3_ext_walk_space);
++EXPORT_SYMBOL(ext3_ext_find_goal);
++EXPORT_SYMBOL(ext3_ext_calc_credits_for_insert);
+Index: linux-2.6.12-rc6/fs/ext3/ialloc.c
+===================================================================
+--- linux-2.6.12-rc6.orig/fs/ext3/ialloc.c 2005-06-14 16:31:08.634433030 +0200
++++ linux-2.6.12-rc6/fs/ext3/ialloc.c 2005-06-14 16:31:25.846346882 +0200
+@@ -598,7 +598,7 @@
+ ei->i_dir_start_lookup = 0;
+ ei->i_disksize = 0;
+
+- ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL;
++ ei->i_flags = EXT3_I(dir)->i_flags & ~(EXT3_INDEX_FL|EXT3_EXTENTS_FL);
+ if (S_ISLNK(mode))
+ ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL);
+ /* dirsync only applies to directories */
+@@ -639,6 +639,18 @@
+ DQUOT_FREE_INODE(inode);
+ goto fail2;
+ }
++ if (test_opt(sb, EXTENTS) && S_ISREG(inode->i_mode)) {
++ EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL;
++ ext3_extents_initialize_blockmap(handle, inode);
++ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS)) {
++ err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
++ if (err) goto fail;
++ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_EXTENTS);
++ BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "call ext3_journal_dirty_metadata");
++ err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
++ }
++ }
++
+ err = ext3_mark_inode_dirty(handle, inode);
+ if (err) {
+ ext3_std_error(sb, err);
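+
+Note the one-way superblock transition above: the first extents-mapped inode
+sets EXT3_FEATURE_INCOMPAT_EXTENTS, and a kernel without this patch will then
+refuse the mount, because the generic feature check rejects any incompat bit
+outside its supported mask. A minimal sketch of that gate (the real test is
+the EXT3_HAS_INCOMPAT_FEATURE() check in ext3_fill_super()):
+
+	/* sketch: why an unknown incompat bit makes old kernels bail out */
+	static int ext3_features_ok(struct ext3_super_block *es)
+	{
+		__u32 incompat = le32_to_cpu(es->s_feature_incompat);
+
+		/* pre-extents kernels lack EXT3_FEATURE_INCOMPAT_EXTENTS
+		 * in EXT3_FEATURE_INCOMPAT_SUPP, so this test fails for
+		 * them once the feature bit has been set */
+		return (incompat & ~EXT3_FEATURE_INCOMPAT_SUPP) == 0;
+	}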
+Index: linux-2.6.12-rc6/fs/ext3/inode.c
+===================================================================
+--- linux-2.6.12-rc6.orig/fs/ext3/inode.c 2005-06-14 16:31:09.701815830 +0200
++++ linux-2.6.12-rc6/fs/ext3/inode.c 2005-06-14 16:31:25.861971882 +0200
+@@ -40,7 +40,7 @@
+ #include "iopen.h"
+ #include "acl.h"
+
+-static int ext3_writepage_trans_blocks(struct inode *inode);
++int ext3_writepage_trans_blocks(struct inode *inode);
+
+ /*
+ * Test whether an inode is a fast symlink.
+@@ -784,6 +784,17 @@
+ return err;
+ }
+
++static inline int
++ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block,
++ struct buffer_head *bh, int create, int extend_disksize)
++{
++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
++ return ext3_ext_get_block(handle, inode, block, bh, create,
++ extend_disksize);
++ return ext3_get_block_handle(handle, inode, block, bh, create,
++ extend_disksize);
++}
++
+ static int ext3_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+ {
+@@ -794,8 +805,8 @@
+ handle = ext3_journal_current_handle();
+ J_ASSERT(handle != 0);
+ }
+- ret = ext3_get_block_handle(handle, inode, iblock,
+- bh_result, create, 1);
++ ret = ext3_get_block_wrap(handle, inode, iblock,
++ bh_result, create, 1);
+ return ret;
+ }
+
+@@ -839,7 +850,7 @@
+
+ get_block:
+ if (ret == 0)
+- ret = ext3_get_block_handle(handle, inode, iblock,
++ ret = ext3_get_block_wrap(handle, inode, iblock,
+ bh_result, create, 0);
+ bh_result->b_size = (1 << inode->i_blkbits);
+ return ret;
+@@ -859,7 +870,7 @@
+ dummy.b_state = 0;
+ dummy.b_blocknr = -1000;
+ buffer_trace_init(&dummy.b_history);
+- *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1);
++ *errp = ext3_get_block_wrap(handle, inode, block, &dummy, create, 1);
+ if (!*errp && buffer_mapped(&dummy)) {
+ struct buffer_head *bh;
+ bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
+@@ -1593,7 +1604,7 @@
+ * This is required during truncate. We need to physically zero the tail end
+ * of that block so it doesn't yield old data if the file is later grown.
+ */
+-static int ext3_block_truncate_page(handle_t *handle, struct page *page,
++int ext3_block_truncate_page(handle_t *handle, struct page *page,
+ struct address_space *mapping, loff_t from)
+ {
+ unsigned long index = from >> PAGE_CACHE_SHIFT;
+@@ -2104,6 +2115,9 @@
+ return;
+ }
+
++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
++ return ext3_ext_truncate(inode, page);
++
+ handle = start_transaction(inode);
+ if (IS_ERR(handle)) {
+ if (page) {
+@@ -2850,12 +2864,15 @@
+ * block and work out the exact number of indirects which are touched. Pah.
+ */
+
+-static int ext3_writepage_trans_blocks(struct inode *inode)
++int ext3_writepage_trans_blocks(struct inode *inode)
+ {
+ int bpp = ext3_journal_blocks_per_page(inode);
+ int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
+ int ret;
+
++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL)
++ return ext3_ext_writepage_trans_blocks(inode, bpp);
++
+ if (ext3_should_journal_data(inode))
+ ret = 3 * (bpp + indirects) + 2;
+ else
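+
+A worked example of the bound being bypassed here, assuming 4 KB blocks and
+4 KB pages: bpp = 1 journal block per page, indirects = (12 % 1) ? 5 : 3 = 3,
+so a data=journal writepage reserves 3 * (1 + 3) + 2 = 14 credits. For an
+extents inode the estimate comes from ext3_ext_writepage_trans_blocks()
+instead, which scales with the depth of the extent tree rather than with the
+indirect-block chain.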
+Index: linux-2.6.12-rc6/fs/ext3/Makefile
+===================================================================
+--- linux-2.6.12-rc6.orig/fs/ext3/Makefile 2005-06-14 16:31:09.179354899 +0200
++++ linux-2.6.12-rc6/fs/ext3/Makefile 2005-06-14 16:31:25.872714069 +0200
+@@ -5,7 +5,7 @@
+ obj-$(CONFIG_EXT3_FS) += ext3.o
+
+ ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o\
+- ioctl.o namei.o super.o symlink.o hash.o resize.o
++ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o
+
+ ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
+ ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
+Index: linux-2.6.12-rc6/fs/ext3/super.c
+===================================================================
+--- linux-2.6.12-rc6.orig/fs/ext3/super.c 2005-06-14 16:31:09.950839264 +0200
++++ linux-2.6.12-rc6/fs/ext3/super.c 2005-06-14 16:31:25.886385944 +0200
+@@ -387,6 +387,7 @@
+ struct ext3_super_block *es = sbi->s_es;
+ int i;
+
++ ext3_ext_release(sb);
+ ext3_xattr_put_super(sb);
+ journal_destroy(sbi->s_journal);
+ if (!(sb->s_flags & MS_RDONLY)) {
+@@ -451,6 +452,8 @@
+ #endif
+ ei->i_block_alloc_info = NULL;
+ ei->vfs_inode.i_version = 1;
++
++ memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent));
+ return &ei->vfs_inode;
+ }
+
+@@ -593,7 +596,7 @@
+ Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
+ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0,
+ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
+- Opt_ignore, Opt_barrier, Opt_err, Opt_resize,
++ Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_extents, Opt_extdebug,
+ };
+
+ static match_table_t tokens = {
+@@ -644,6 +647,8 @@
+ {Opt_iopen, "iopen"},
+ {Opt_noiopen, "noiopen"},
+ {Opt_iopen_nopriv, "iopen_nopriv"},
++ {Opt_extents, "extents"},
++ {Opt_extdebug, "extdebug"},
+ {Opt_barrier, "barrier=%u"},
+ {Opt_err, NULL},
+ {Opt_resize, "resize"},
+@@ -953,6 +958,12 @@
+ case Opt_nobh:
+ set_opt(sbi->s_mount_opt, NOBH);
+ break;
++ case Opt_extents:
++ set_opt (sbi->s_mount_opt, EXTENTS);
++ break;
++ case Opt_extdebug:
++ set_opt (sbi->s_mount_opt, EXTDEBUG);
++ break;
+ default:
+ printk (KERN_ERR
+ "EXT3-fs: Unrecognized mount option \"%s\" "
+@@ -1668,6 +1681,7 @@
+ percpu_counter_mod(&sbi->s_dirs_counter,
+ ext3_count_dirs(sb));
+
++ ext3_ext_init(sb);
+ lock_kernel();
+ return 0;
+
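+
+The two new cases plug into ext3's standard option machinery; for orientation,
+these are the existing ext3_fs.h helpers the code above relies on:
+
+	/* existing helpers from ext3_fs.h (quoted for reference) */
+	#define set_opt(o, opt)		o |= EXT3_MOUNT_##opt
+	#define test_opt(sb, opt)	(EXT3_SB(sb)->s_mount_opt & \
+					 EXT3_MOUNT_##opt)
+
+So set_opt(sbi->s_mount_opt, EXTENTS) ORs in EXT3_MOUNT_EXTENTS, and the
+extdebug flag is later consumed by the ext_debug() macro through
+test_opt((tree)->inode->i_sb, EXTDEBUG).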
+Index: linux-2.6.12-rc6/fs/ext3/ioctl.c
+===================================================================
+--- linux-2.6.12-rc6.orig/fs/ext3/ioctl.c 2005-06-14 16:31:08.646151780 +0200
++++ linux-2.6.12-rc6/fs/ext3/ioctl.c 2005-06-14 16:31:25.897128131 +0200
+@@ -124,6 +124,10 @@
+ err = ext3_change_inode_journal_flag(inode, jflag);
+ return err;
+ }
++ case EXT3_IOC_GET_EXTENTS:
++ case EXT3_IOC_GET_TREE_STATS:
++ case EXT3_IOC_GET_TREE_DEPTH:
++ return ext3_ext_ioctl(inode, filp, cmd, arg);
+ case EXT3_IOC_GETVERSION:
+ case EXT3_IOC_GETVERSION_OLD:
+ return put_user(inode->i_generation, (int __user *) arg);
+Index: linux-2.6.12-rc6/include/linux/ext3_fs.h
+===================================================================
+--- linux-2.6.12-rc6.orig/include/linux/ext3_fs.h 2005-06-14 16:31:10.185214261 +0200
++++ linux-2.6.12-rc6/include/linux/ext3_fs.h 2005-06-14 16:31:52.859041864 +0200
+@@ -186,8 +186,9 @@
+ #define EXT3_NOTAIL_FL 0x00008000 /* don't merge file tail */
+ #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */
+ #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
++#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */
+ #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */
+
+-#define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */
++#define EXT3_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
+ #define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */
+
+@@ -237,6 +238,9 @@
+ #endif
+ #define EXT3_IOC_GETRSVSZ _IOR('f', 5, long)
+ #define EXT3_IOC_SETRSVSZ _IOW('f', 6, long)
++#define EXT3_IOC_GET_EXTENTS _IOR('f', 7, long)
++#define EXT3_IOC_GET_TREE_DEPTH _IOR('f', 8, long)
++#define EXT3_IOC_GET_TREE_STATS _IOR('f', 9, long)
+
+ /*
+ * Structure of an inode on the disk
+@@ -360,6 +364,8 @@
+ #define EXT3_MOUNT_NOBH 0x40000 /* No bufferheads */
+ #define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */
+ #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000 /* Make iopen world-readable */
++#define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */
++#define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */
+
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef clear_opt
+@@ -548,11 +554,13 @@
+ #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */
+ #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */
+ #define EXT3_FEATURE_INCOMPAT_META_BG 0x0010
++#define EXT3_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */
+
+ #define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
+ #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \
+ EXT3_FEATURE_INCOMPAT_RECOVER| \
+- EXT3_FEATURE_INCOMPAT_META_BG)
++ EXT3_FEATURE_INCOMPAT_META_BG| \
++ EXT3_FEATURE_INCOMPAT_EXTENTS)
+ #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+ EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \
+ EXT3_FEATURE_RO_COMPAT_BTREE_DIR)
+@@ -759,6 +767,7 @@
+
+
+ /* inode.c */
++extern int ext3_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t);
+ extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int);
+ extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
+ extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
+@@ -828,6 +837,16 @@
+ extern struct inode_operations ext3_symlink_inode_operations;
+ extern struct inode_operations ext3_fast_symlink_inode_operations;
+
++/* extents.c */
++extern int ext3_ext_writepage_trans_blocks(struct inode *, int);
++extern int ext3_ext_get_block(handle_t *, struct inode *, long,
++ struct buffer_head *, int, int);
++extern void ext3_ext_truncate(struct inode *, struct page *);
++extern void ext3_ext_init(struct super_block *);
++extern void ext3_ext_release(struct super_block *);
++extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *);
++extern int ext3_ext_ioctl(struct inode *inode, struct file *filp,
++ unsigned int cmd, unsigned long arg);
+
+ #endif /* __KERNEL__ */
+
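+
+The EXT3_FL_USER_VISIBLE widening above is just the new bit OR-ed in:
+0x0003DFFF | 0x00080000 (EXT3_EXTENTS_FL) = 0x000BDFFF. The modifiable mask
+stays at 0x000380FF, so EXT3_IOC_GETFLAGS reports the flag but
+EXT3_IOC_SETFLAGS cannot set or clear it from userspace.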
+Index: linux-2.6.12-rc6/include/linux/ext3_extents.h
+===================================================================
+--- linux-2.6.12-rc6.orig/include/linux/ext3_extents.h 2005-06-14 16:31:25.780917195 +0200
++++ linux-2.6.12-rc6/include/linux/ext3_extents.h 2005-06-14 16:31:25.932284381 +0200
+@@ -0,0 +1,264 @@
++/*
++ * Copyright (c) 2003, Cluster File Systems, Inc, info@clusterfs.com
++ * Written by Alex Tomas <alex@clusterfs.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 as
++ * published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA
++ */
++
++#ifndef _LINUX_EXT3_EXTENTS
++#define _LINUX_EXT3_EXTENTS
++
++/*
++ * with AGRESSIVE_TEST defined, the capacity of index/leaf blocks
++ * becomes very small, so index splits, in-depth tree growth and
++ * other hard changes happen much more often;
++ * this is for debug purposes only
++ */
++#define AGRESSIVE_TEST_
++
++/*
++ * if CHECK_BINSEARCH is defined, then the results of the binary search
++ * will be checked by a linear search
++ */
++#define CHECK_BINSEARCH_
++
++/*
++ * if EXT_DEBUG is defined you can use 'extdebug' mount option
++ * to get lots of info about what's going on
++ */
++#define EXT_DEBUG_
++#ifdef EXT_DEBUG
++#define ext_debug(tree,fmt,a...) \
++do { \
++ if (test_opt((tree)->inode->i_sb, EXTDEBUG)) \
++ printk(fmt, ##a); \
++} while (0)
++#else
++#define ext_debug(tree,fmt,a...)
++#endif
++
++/*
++ * if EXT_STATS is defined, then stats numbers are collected;
++ * these numbers will be displayed at umount time
++ */
++#define EXT_STATS_
++
++
++#define EXT3_ALLOC_NEEDED 3 /* block bitmap + group desc. + sb */
++
++/*
++ * ext3_inode has the i_block array (60 bytes total);
++ * the first 4 bytes are used to store:
++ * - tree depth (0 means there is no tree yet; all extents are in the inode)
++ * - number of alive extents in the inode
++ */
++
++/*
++ * this is the extent on-disk structure;
++ * it's used at the bottom of the tree
++ */
++struct ext3_extent {
++ __u32 ee_block; /* first logical block extent covers */
++ __u16 ee_len; /* number of blocks covered by extent */
++ __u16 ee_start_hi; /* high 16 bits of physical block */
++ __u32 ee_start; /* low 32 bits of physical block */
++};
++
++/*
++ * this is the index on-disk structure;
++ * it's used at all the levels but the bottom
++ */
++struct ext3_extent_idx {
++ __u32 ei_block; /* index covers logical blocks from 'block' */
++ __u32 ei_leaf; /* pointer to the physical block of the next *
++ * level. leaf or next index could be here */
++ __u16 ei_leaf_hi; /* high 16 bits of physical block */
++ __u16 ei_unused;
++};
++
++/*
++ * each block (leaves and indexes), even the inode-stored one, has a header
++ */
++struct ext3_extent_header {
++ __u16 eh_magic; /* probably will support different formats */
++ __u16 eh_entries; /* number of valid entries */
++ __u16 eh_max; /* capacity of store in entries */
++ __u16 eh_depth; /* has tree real underlying blocks? */
++ __u32 eh_generation; /* generation of the tree */
++};
++
++#define EXT3_EXT_MAGIC 0xf30a
++
++/*
++ * an array of ext3_ext_path contains the path to some extent;
++ * creation/lookup routines use it for traversal/splitting/etc.;
++ * truncate uses it to simulate recursive walking
++ */
++struct ext3_ext_path {
++ __u32 p_block;
++ __u16 p_depth;
++ struct ext3_extent *p_ext;
++ struct ext3_extent_idx *p_idx;
++ struct ext3_extent_header *p_hdr;
++ struct buffer_head *p_bh;
++};
++
++/*
++ * structure for external API
++ */
++
++/*
++ * storage for cached extent
++ */
++struct ext3_ext_cache {
++ __u32 ec_start;
++ __u32 ec_block;
++ __u32 ec_len;
++ __u32 ec_type;
++};
++
++#define EXT3_EXT_CACHE_NO 0
++#define EXT3_EXT_CACHE_GAP 1
++#define EXT3_EXT_CACHE_EXTENT 2
++
++/*
++ * ext3_extents_tree is used to pass initial information
++ * to top-level extents API
++ */
++struct ext3_extents_helpers;
++struct ext3_extents_tree {
++ struct inode *inode; /* inode which the tree belongs to */
++ void *root; /* ptr to the data the top of the tree resides at */
++ void *buffer; /* will be passed as arg to ^^ routines */
++ int buffer_len;
++ void *private;
++ struct ext3_ext_cache *cex;/* last found extent */
++ struct ext3_extents_helpers *ops;
++};
++
++struct ext3_extents_helpers {
++ int (*get_write_access)(handle_t *h, void *buffer);
++ int (*mark_buffer_dirty)(handle_t *h, void *buffer);
++ int (*mergable)(struct ext3_extent *ex1, struct ext3_extent *ex2);
++ int (*remove_extent_credits)(struct ext3_extents_tree *,
++ struct ext3_extent *, unsigned long,
++ unsigned long);
++ int (*remove_extent)(struct ext3_extents_tree *,
++ struct ext3_extent *, unsigned long,
++ unsigned long);
++ int (*new_block)(handle_t *, struct ext3_extents_tree *,
++ struct ext3_ext_path *, struct ext3_extent *,
++ int *);
++};
++
++/*
++ * to be called by ext3_ext_walk_space()
++ * negative retcode - error
++ * positive retcode - signal for ext3_ext_walk_space(), see below
++ * callback must return valid extent (passed or newly created)
++ */
++typedef int (*ext_prepare_callback)(struct ext3_extents_tree *,
++ struct ext3_ext_path *,
++ struct ext3_ext_cache *);
++
++#define EXT_CONTINUE 0
++#define EXT_BREAK 1
++#define EXT_REPEAT 2
++
++
++#define EXT_MAX_BLOCK 0xffffffff
++
++
++#define EXT_FIRST_EXTENT(__hdr__) \
++ ((struct ext3_extent *) (((char *) (__hdr__)) + \
++ sizeof(struct ext3_extent_header)))
++#define EXT_FIRST_INDEX(__hdr__) \
++ ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \
++ sizeof(struct ext3_extent_header)))
++#define EXT_HAS_FREE_INDEX(__path__) \
++ ((__path__)->p_hdr->eh_entries < (__path__)->p_hdr->eh_max)
++#define EXT_LAST_EXTENT(__hdr__) \
++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_entries - 1)
++#define EXT_LAST_INDEX(__hdr__) \
++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_entries - 1)
++#define EXT_MAX_EXTENT(__hdr__) \
++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->eh_max - 1)
++#define EXT_MAX_INDEX(__hdr__) \
++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->eh_max - 1)
++
++#define EXT_ROOT_HDR(tree) \
++ ((struct ext3_extent_header *) (tree)->root)
++#define EXT_BLOCK_HDR(bh) \
++ ((struct ext3_extent_header *) (bh)->b_data)
++#define EXT_DEPTH(_t_) \
++ (((struct ext3_extent_header *)((_t_)->root))->eh_depth)
++#define EXT_GENERATION(_t_) \
++ (((struct ext3_extent_header *)((_t_)->root))->eh_generation)
++
++
++#define EXT_ASSERT(__x__) do { if (!(__x__)) BUG(); } while (0)
++
++#define EXT_CHECK_PATH(tree,path) \
++{ \
++ int depth = EXT_DEPTH(tree); \
++ BUG_ON((unsigned long) (path) < __PAGE_OFFSET); \
++ BUG_ON((unsigned long) (path)[depth].p_idx < \
++ __PAGE_OFFSET && (path)[depth].p_idx != NULL); \
++ BUG_ON((unsigned long) (path)[depth].p_ext < \
++ __PAGE_OFFSET && (path)[depth].p_ext != NULL); \
++ BUG_ON((unsigned long) (path)[depth].p_hdr < __PAGE_OFFSET); \
++ BUG_ON((unsigned long) (path)[depth].p_bh < __PAGE_OFFSET \
++ && depth != 0); \
++ BUG_ON((path)[0].p_depth != depth); \
++}
++
++
++/*
++ * this structure is used to gather extents from the tree via ioctl
++ */
++struct ext3_extent_buf {
++ unsigned long start;
++ int buflen;
++ void *buffer;
++ void *cur;
++ int err;
++};
++
++/*
++ * this structure is used to collect stats info about the tree
++ */
++struct ext3_extent_tree_stats {
++ int depth;
++ int extents_num;
++ int leaf_num;
++};
++
++extern void ext3_init_tree_desc(struct ext3_extents_tree *, struct inode *);
++extern int ext3_extent_tree_init(handle_t *, struct ext3_extents_tree *);
++extern int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *, struct ext3_ext_path *);
++extern int ext3_ext_insert_extent(handle_t *, struct ext3_extents_tree *, struct ext3_ext_path *, struct ext3_extent *);
++extern int ext3_ext_walk_space(struct ext3_extents_tree *, unsigned long, unsigned long, ext_prepare_callback);
++extern int ext3_ext_remove_space(struct ext3_extents_tree *, unsigned long, unsigned long);
++extern struct ext3_ext_path * ext3_ext_find_extent(struct ext3_extents_tree *, int, struct ext3_ext_path *);
++extern int ext3_ext_calc_blockmap_metadata(struct inode *, int);
++
++static inline void
++ext3_ext_invalidate_cache(struct ext3_extents_tree *tree)
++{
++ if (tree->cex)
++ tree->cex->ec_type = EXT3_EXT_CACHE_NO;
++}
++
++
++#endif /* _LINUX_EXT3_EXTENTS */
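+
+Two size facts are implicit in the structures above. All three on-disk
+records (header, extent, index) are 12 bytes, so the 60-byte i_block body
+holds one header plus (60 - 12) / 12 = 4 extents or indexes before the tree
+has to grow its first real block. And a physical block number travels as a
+32+16-bit split; a helper along these lines is not part of this patch (the
+code shown here reads ee_start directly), it only illustrates how the two
+fields combine:
+
+	/* illustrative only, not in the patch: 48-bit physical block */
+	static inline unsigned long long
+	ext3_ext_pblock(struct ext3_extent *ex)
+	{
+		return ex->ee_start |
+			((unsigned long long) ex->ee_start_hi << 32);
+	}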
+Index: linux-2.6.12-rc6/include/linux/ext3_fs_i.h
+===================================================================
+--- linux-2.6.12-rc6.orig/include/linux/ext3_fs_i.h 2005-06-06 17:22:29.000000000 +0200
++++ linux-2.6.12-rc6/include/linux/ext3_fs_i.h 2005-06-14 16:31:25.941073443 +0200
+@@ -133,6 +133,8 @@
+ */
+ struct semaphore truncate_sem;
+ struct inode vfs_inode;
++
++ __u32 i_cached_extent[4];
+ };
+
+ #endif /* _LINUX_EXT3_FS_I */
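+
+The four __u32 words added here are sized to hold one struct ext3_ext_cache
+(ec_start, ec_block, ec_len, ec_type), i.e. the "last found extent" that
+tree->cex points at; the memset() added to ext3_alloc_inode() in super.c
+starts it out as EXT3_EXT_CACHE_NO (0), and presumably ext3_init_tree_desc()
+aims tree->cex at this array so the cached mapping survives between
+get_block calls.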
--- /dev/null
+Signed-off-by: Johann Lombardi <johann.lombardi@bull.net>
+
+--- linux-2.6.12.orig/fs/ext3/super.c 2005-06-17 21:48:29.000000000 +0200
++++ linux-2.6.12/fs/ext3/super.c 2005-11-07 13:37:30.000000000 +0100
+@@ -39,7 +39,8 @@
+ #include "xattr.h"
+ #include "acl.h"
+
+-static int ext3_load_journal(struct super_block *, struct ext3_super_block *);
++static int ext3_load_journal(struct super_block *, struct ext3_super_block *,
++ unsigned long journal_devnum);
+ static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
+ int);
+ static void ext3_commit_super (struct super_block * sb,
+@@ -586,7 +587,7 @@ enum {
+ Opt_nouid32, Opt_check, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov,
+ Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
+ Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh,
+- Opt_commit, Opt_journal_update, Opt_journal_inum,
++ Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
+ Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+ Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
+ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0,
+@@ -624,6 +625,7 @@ static match_table_t tokens = {
+ {Opt_commit, "commit=%u"},
+ {Opt_journal_update, "journal=update"},
+ {Opt_journal_inum, "journal=%u"},
++ {Opt_journal_dev, "journal_dev=%u"},
+ {Opt_abort, "abort"},
+ {Opt_data_journal, "data=journal"},
+ {Opt_data_ordered, "data=ordered"},
+@@ -663,8 +665,9 @@ static unsigned long get_sb_block(void *
+ return sb_block;
+ }
+
+-static int parse_options (char * options, struct super_block *sb,
+- unsigned long * inum, unsigned long *n_blocks_count, int is_remount)
++static int parse_options (char *options, struct super_block *sb,
++ unsigned long *inum, unsigned long *journal_devnum,
++ unsigned long *n_blocks_count, int is_remount)
+ {
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
+ char * p;
+@@ -805,6 +808,16 @@ static int parse_options (char * options
+ return 0;
+ *inum = option;
+ break;
++ case Opt_journal_dev:
++ if (is_remount) {
++ printk(KERN_ERR "EXT3-fs: cannot specify "
++ "journal on remount\n");
++ return 0;
++ }
++ if (match_int(&args[0], &option))
++ return 0;
++ *journal_devnum = option;
++ break;
+ case Opt_noload:
+ set_opt (sbi->s_mount_opt, NOLOAD);
+ break;
+@@ -1250,6 +1263,7 @@ static int ext3_fill_super (struct super
+ unsigned long logic_sb_block;
+ unsigned long offset = 0;
+ unsigned long journal_inum = 0;
++ unsigned long journal_devnum = 0;
+ unsigned long def_mount_opts;
+ struct inode *root;
+ int blocksize;
+@@ -1330,7 +1344,8 @@ static int ext3_fill_super (struct super
+
+ set_opt(sbi->s_mount_opt, RESERVATION);
+
+- if (!parse_options ((char *) data, sb, &journal_inum, NULL, 0))
++ if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
++ NULL, 0))
+ goto failed_mount;
+
+ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+@@ -1541,7 +1556,7 @@ static int ext3_fill_super (struct super
+ */
+ if (!test_opt(sb, NOLOAD) &&
+ EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
+- if (ext3_load_journal(sb, es))
++ if (ext3_load_journal(sb, es, journal_devnum))
+ goto failed_mount2;
+ } else if (journal_inum) {
+ if (ext3_create_journal(sb, es, journal_inum))
+@@ -1821,15 +1836,24 @@ out_bdev:
+ return NULL;
+ }
+
+-static int ext3_load_journal(struct super_block * sb,
+- struct ext3_super_block * es)
++static int ext3_load_journal(struct super_block *sb,
++ struct ext3_super_block *es,
++ unsigned long journal_devnum)
+ {
+ journal_t *journal;
+ int journal_inum = le32_to_cpu(es->s_journal_inum);
+- dev_t journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
++ dev_t journal_dev;
+ int err = 0;
+ int really_read_only;
+
++ if (journal_devnum &&
++ journal_devnum != le32_to_cpu(es->s_journal_dev)) {
++ printk(KERN_INFO "EXT3-fs: external journal device major/minor "
++ "numbers have changed\n");
++ journal_dev = new_decode_dev(journal_devnum);
++ } else
++ journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
++
+ really_read_only = bdev_read_only(sb->s_bdev);
+
+ /*
+@@ -1888,6 +1912,16 @@ static int ext3_load_journal(struct supe
+
+ EXT3_SB(sb)->s_journal = journal;
+ ext3_clear_journal_err(sb, es);
++
++ if (journal_devnum &&
++ journal_devnum != le32_to_cpu(es->s_journal_dev)) {
++ es->s_journal_dev = cpu_to_le32(journal_devnum);
++ sb->s_dirt = 1;
++
++ /* Make sure we flush the recovery flag to disk. */
++ ext3_commit_super(sb, es, 1);
++ }
++
+ return 0;
+ }
+
+@@ -2093,13 +2127,13 @@ static int ext3_remount (struct super_bl
+ {
+ struct ext3_super_block * es;
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
+- unsigned long tmp;
++ unsigned long tmp1, tmp2;
+ unsigned long n_blocks_count = 0;
+
+ /*
+ * Allow the "check" option to be passed as a remount option.
+ */
+- if (!parse_options(data, sb, &tmp, &n_blocks_count, 1))
++ if (!parse_options(data, sb, &tmp1, &tmp2, &n_blocks_count, 1))
+ return -EINVAL;
+
+ if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
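+
+For orientation, journal_dev=%u takes the block device number in the kernel's
+"huge" encoding; new_decode_dev() from <linux/kdev_t.h> (quoted below, not
+part of this patch) unpacks it. For small numbers this degenerates to the
+classic (major << 8) | minor, so e.g. journal_dev=2051 (0x803) selects
+device 8:3.
+
+	/* <linux/kdev_t.h>, quoted for reference */
+	static inline dev_t new_decode_dev(u32 dev)
+	{
+		unsigned major = (dev & 0xfff00) >> 8;
+		unsigned minor = (dev & 0xff) | ((dev >> 12) & 0xfff00);
+		return MKDEV(major, minor);
+	}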
-Index: linux-2.6.5-7.201/include/linux/ext3_fs_sb.h
-===================================================================
---- linux-2.6.5-7.201.orig/include/linux/ext3_fs_sb.h 2005-10-14 08:59:35.000000000 +0400
-+++ linux-2.6.5-7.201/include/linux/ext3_fs_sb.h 2005-10-14 08:59:39.000000000 +0400
-@@ -23,10 +23,30 @@
- #define EXT_INCLUDE
- #include <linux/blockgroup_lock.h>
- #include <linux/percpu_counter.h>
-+#include <linux/list.h>
- #endif
- #endif
- #include <linux/rbtree.h>
-
-+#define EXT3_BB_MAX_BLOCKS 30
-+struct ext3_free_metadata {
-+ unsigned short group;
-+ unsigned short num;
-+ unsigned short blocks[EXT3_BB_MAX_BLOCKS];
-+ struct list_head list;
-+};
-+
-+struct ext3_buddy_group_blocks {
-+ __u32 bb_bitmap;
-+ __u32 bb_buddy;
-+ spinlock_t bb_lock;
-+ unsigned long bb_tid;
-+ struct ext3_free_metadata *bb_md_cur;
-+ unsigned short bb_first_free;
-+ unsigned short bb_free;
-+ unsigned bb_counters[];
-+};
-+
- /*
- * third extended-fs super-block data in memory
- */
-@@ -78,6 +98,27 @@
- struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
- wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */
- #endif
-+
-+ /* for buddy allocator */
-+ struct ext3_buddy_group_blocks **s_buddy_blocks;
-+ struct inode *s_buddy;
-+ long s_blocks_reserved;
-+ spinlock_t s_reserve_lock;
-+ struct list_head s_active_transaction;
-+ struct list_head s_closed_transaction;
-+ struct list_head s_committed_transaction;
-+ spinlock_t s_md_lock;
-+ tid_t s_last_transaction;
-+ int s_mb_factor;
-+
-+ /* stats for buddy allocator */
-+ spinlock_t s_bal_lock;
-+ unsigned long s_bal_reqs; /* number of reqs with len > 1 */
-+ unsigned long s_bal_success; /* we found long enough chunks */
-+ unsigned long s_bal_allocated; /* in blocks */
-+ unsigned long s_bal_ex_scanned; /* total extents scanned */
-+ unsigned long s_bal_goals; /* goal hits */
-+ unsigned long s_bal_breaks; /* too long searches */
- };
-
- #endif /* _LINUX_EXT3_FS_SB */
Index: linux-2.6.5-7.201/include/linux/ext3_fs.h
===================================================================
---- linux-2.6.5-7.201.orig/include/linux/ext3_fs.h 2005-10-14 08:59:38.000000000 +0400
-+++ linux-2.6.5-7.201/include/linux/ext3_fs.h 2005-10-14 09:02:36.000000000 +0400
-@@ -57,6 +57,14 @@
+--- linux-2.6.5-7.201.orig/include/linux/ext3_fs.h 2005-12-17 02:53:30.000000000 +0300
++++ linux-2.6.5-7.201/include/linux/ext3_fs.h 2005-12-17 03:13:38.000000000 +0300
+@@ -57,6 +57,14 @@ struct statfs;
#define ext3_debug(f, a...) do {} while (0)
#endif
/*
* Special inodes numbers
*/
-@@ -339,6 +347,7 @@
+@@ -339,6 +347,7 @@ struct ext3_inode {
#define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */
#define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */
#define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */
-+#define EXT3_MOUNT_MBALLOC 0x400000/* Buddy allocation support */
++#define EXT3_MOUNT_MBALLOC 0x800000/* Buddy allocation support */
/* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
#ifndef clear_opt
-@@ -700,7 +709,7 @@
+@@ -700,7 +709,9 @@ extern int ext3_bg_has_super(struct supe
extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *);
extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long,
- unsigned long);
+ unsigned long, int);
++extern void ext3_free_blocks_old (handle_t *, struct inode *, unsigned long,
++ unsigned long);
extern unsigned long ext3_count_free_blocks (struct super_block *);
extern void ext3_check_blocks_bitmap (struct super_block *);
extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
-@@ -822,6 +831,44 @@
+@@ -822,6 +833,17 @@ extern void ext3_extents_initialize_bloc
extern int ext3_ext_ioctl(struct inode *inode, struct file *filp,
unsigned int cmd, unsigned long arg);
+/* mballoc.c */
-+extern long ext3_mb_aggressive;
+extern long ext3_mb_stats;
+extern long ext3_mb_max_to_scan;
+extern int ext3_mb_init(struct super_block *, int);
+extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *);
+extern int ext3_mb_reserve_blocks(struct super_block *, int);
+extern void ext3_mb_release_blocks(struct super_block *, int);
-+
-+/* writeback.c */
-+extern int ext3_wb_writepages(struct address_space *, struct writeback_control *);
-+extern int ext3_wb_prepare_write(struct file *file, struct page *page,
-+ unsigned from, unsigned to);
-+extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned);
-+extern int ext3_wb_writepage(struct page *, struct writeback_control *);
-+extern int ext3_wb_invalidatepage(struct page *, unsigned long);
-+extern int ext3_wb_releasepage(struct page *, int);
-+extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t);
-+extern void ext3_wb_init(struct super_block *);
-+extern void ext3_wb_release(struct super_block *);
-+
-+/* writeback.c */
-+extern int ext3_wb_writepages(struct address_space *, struct writeback_control *);
-+extern int ext3_wb_prepare_write(struct file *file, struct page *page,
-+ unsigned from, unsigned to);
-+extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned);
-+extern int ext3_wb_writepage(struct page *, struct writeback_control *);
-+extern int ext3_wb_invalidatepage(struct page *, unsigned long);
-+extern int ext3_wb_releasepage(struct page *, int);
-+extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t);
-+extern void ext3_wb_init(struct super_block *);
-+extern void ext3_wb_release(struct super_block *);
-+
-+/* proc.c */
-+extern int init_ext3_proc(void);
-+extern void exit_ext3_proc(void);
++int __init init_ext3_proc(void);
++void exit_ext3_proc(void);
+
#endif /* __KERNEL__ */
#define EXT3_IOC_CREATE_INUM _IOW('f', 5, long)
-Index: linux-2.6.5-7.201/fs/ext3/balloc.c
+Index: linux-2.6.5-7.201/include/linux/ext3_fs_sb.h
===================================================================
---- linux-2.6.5-7.201.orig/fs/ext3/balloc.c 2005-10-11 00:12:45.000000000 +0400
-+++ linux-2.6.5-7.201/fs/ext3/balloc.c 2005-10-14 08:59:39.000000000 +0400
-@@ -78,7 +78,7 @@
- *
- * Return buffer_head on success or NULL in case of failure.
- */
--static struct buffer_head *
-+struct buffer_head *
- read_block_bitmap(struct super_block *sb, unsigned int block_group)
+--- linux-2.6.5-7.201.orig/include/linux/ext3_fs_sb.h 2005-12-17 02:53:25.000000000 +0300
++++ linux-2.6.5-7.201/include/linux/ext3_fs_sb.h 2005-12-17 03:10:23.000000000 +0300
+@@ -23,9 +23,15 @@
+ #define EXT_INCLUDE
+ #include <linux/blockgroup_lock.h>
+ #include <linux/percpu_counter.h>
++#include <linux/list.h>
+ #endif
+ #endif
+ #include <linux/rbtree.h>
++#include <linux/proc_fs.h>
++
++struct ext3_buddy_group_blocks;
++struct ext3_mb_history;
++#define EXT3_BB_MAX_BLOCKS
+
+ /*
+ * third extended-fs super-block data in memory
+@@ -78,6 +84,38 @@ struct ext3_sb_info {
+ struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
+ wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */
+ #endif
++
++ /* for buddy allocator */
++ struct ext3_group_info **s_group_info;
++ struct inode *s_buddy_cache;
++ long s_blocks_reserved;
++ spinlock_t s_reserve_lock;
++ struct list_head s_active_transaction;
++ struct list_head s_closed_transaction;
++ struct list_head s_committed_transaction;
++ spinlock_t s_md_lock;
++ tid_t s_last_transaction;
++ int s_mb_factor;
++ unsigned short *s_mb_offsets, *s_mb_maxs;
++
++ /* history to debug policy */
++ struct ext3_mb_history *s_mb_history;
++ int s_mb_history_cur;
++ int s_mb_history_max;
++ struct proc_dir_entry *s_mb_proc;
++ spinlock_t s_mb_history_lock;
++
++ /* stats for buddy allocator */
++ atomic_t s_bal_reqs; /* number of reqs with len > 1 */
++ atomic_t s_bal_success; /* we found long enough chunks */
++ atomic_t s_bal_allocated; /* in blocks */
++ atomic_t s_bal_ex_scanned; /* total extents scanned */
++ atomic_t s_bal_goals; /* goal hits */
++ atomic_t s_bal_breaks; /* too long searches */
++ atomic_t s_bal_2orders; /* 2^order hits */
++ spinlock_t s_bal_lock;
++ unsigned long s_mb_buddies_generated;
++ unsigned long long s_mb_generation_time;
+ };
+
+ #endif /* _LINUX_EXT3_FS_SB */
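+
+The s_buddy_cache inode added here backs a simple page-cache layout that
+ext3_mb_load_buddy() below depends on: logical block 2*g carries group g's
+bitmap copy and block 2*g+1 its buddy array. A sketch of the addressing,
+assuming block size <= page size (sb, group and page given by the caller):
+
+	/* sketch of the buddy-cache addressing (see ext3_mb_load_buddy()) */
+	int blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+	int block = group * 2;			/* 2 * group + 1 for buddy */
+	int pnum = block / blocks_per_page;	/* page index in the cache */
+	int poff = block % blocks_per_page;	/* block within that page */
+	void *data = page_address(page) + poff * sb->s_blocksize;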
+Index: linux-2.6.5-7.201/fs/ext3/super.c
+===================================================================
+--- linux-2.6.5-7.201.orig/fs/ext3/super.c 2005-12-17 02:53:30.000000000 +0300
++++ linux-2.6.5-7.201/fs/ext3/super.c 2005-12-17 03:10:23.000000000 +0300
+@@ -389,6 +389,7 @@ void ext3_put_super (struct super_block
+ struct ext3_super_block *es = sbi->s_es;
+ int i;
+
++ ext3_mb_release(sb);
+ ext3_ext_release(sb);
+ ext3_xattr_put_super(sb);
+ journal_destroy(sbi->s_journal);
+@@ -543,7 +544,7 @@ enum {
+ Opt_commit, Opt_journal_update, Opt_journal_inum,
+ Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+ Opt_ignore, Opt_barrier, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
+- Opt_err, Opt_extents, Opt_extdebug
++ Opt_err, Opt_extents, Opt_extdebug, Opt_mballoc
+ };
+
+ static match_table_t tokens = {
+@@ -590,6 +591,7 @@ static match_table_t tokens = {
+ {Opt_iopen_nopriv, "iopen_nopriv"},
+ {Opt_extents, "extents"},
+ {Opt_extdebug, "extdebug"},
++ {Opt_mballoc, "mballoc"},
+ {Opt_err, NULL}
+ };
+
+@@ -811,6 +813,9 @@ static int parse_options (char * options
+ case Opt_extdebug:
+ set_opt (sbi->s_mount_opt, EXTDEBUG);
+ break;
++ case Opt_mballoc:
++ set_opt (sbi->s_mount_opt, MBALLOC);
++ break;
+ default:
+ printk (KERN_ERR
+ "EXT3-fs: Unrecognized mount option \"%s\" "
+@@ -1464,6 +1469,7 @@ static int ext3_fill_super (struct super
+ ext3_count_dirs(sb));
+
+ ext3_ext_init(sb);
++ ext3_mb_init(sb, needs_recovery);
+
+ return 0;
+
+@@ -2112,7 +2118,13 @@ static struct file_system_type ext3_fs_t
+
+ static int __init init_ext3_fs(void)
{
- struct ext3_group_desc * desc;
-@@ -274,7 +274,7 @@
+- int err = init_ext3_xattr();
++ int err;
++
++ err = init_ext3_proc();
++ if (err)
++ return err;
++
++ err = init_ext3_xattr();
+ if (err)
+ return err;
+ err = init_inodecache();
+@@ -2141,6 +2153,7 @@ static void __exit exit_ext3_fs(void)
+ unregister_filesystem(&ext3_fs_type);
+ destroy_inodecache();
+ exit_ext3_xattr();
++ exit_ext3_proc();
}
- /* Free given blocks, update quota and i_blocks field */
--void ext3_free_blocks(handle_t *handle, struct inode *inode,
-+void ext3_free_blocks_old(handle_t *handle, struct inode *inode,
- unsigned long block, unsigned long count)
- {
- struct buffer_head *bitmap_bh = NULL;
-@@ -1142,7 +1142,7 @@
- * bitmap, and then for any free bit if that fails.
- * This function also updates quota and i_blocks field.
- */
--int ext3_new_block(handle_t *handle, struct inode *inode,
-+int ext3_new_block_old(handle_t *handle, struct inode *inode,
- unsigned long goal, int *errp)
- {
- struct buffer_head *bitmap_bh = NULL;
+ int ext3_prep_san_write(struct inode *inode, long *blocks,
Index: linux-2.6.5-7.201/fs/ext3/extents.c
===================================================================
---- linux-2.6.5-7.201.orig/fs/ext3/extents.c 2005-10-14 08:59:38.000000000 +0400
-+++ linux-2.6.5-7.201/fs/ext3/extents.c 2005-10-14 08:59:39.000000000 +0400
-@@ -771,7 +771,7 @@
+--- linux-2.6.5-7.201.orig/fs/ext3/extents.c 2005-12-17 02:53:29.000000000 +0300
++++ linux-2.6.5-7.201/fs/ext3/extents.c 2005-12-17 03:10:23.000000000 +0300
+@@ -771,7 +771,7 @@ cleanup:
for (i = 0; i < depth; i++) {
if (!ablocks[i])
continue;
}
}
kfree(ablocks);
-@@ -1428,7 +1428,7 @@
+@@ -1428,7 +1428,7 @@ int ext3_ext_rm_idx(handle_t *handle, st
path->p_idx->ei_leaf);
bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf);
ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf);
return err;
}
-@@ -1913,10 +1913,12 @@
+@@ -1913,10 +1913,12 @@ ext3_remove_blocks(struct ext3_extents_t
int needed = ext3_remove_blocks_credits(tree, ex, from, to);
handle_t *handle = ext3_journal_start(tree->inode, needed);
struct buffer_head *bh;
if (IS_ERR(handle))
return PTR_ERR(handle);
-+ if (S_ISDIR(tree->inode->i_mode))
++ if (S_ISDIR(tree->inode->i_mode) || S_ISLNK(tree->inode->i_mode))
+ metadata = 1;
if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) {
/* tail removal */
unsigned long num, start;
-@@ -1928,7 +1930,7 @@
+@@ -1928,7 +1930,7 @@ ext3_remove_blocks(struct ext3_extents_t
bh = sb_find_get_block(tree->inode->i_sb, start + i);
ext3_forget(handle, 0, tree->inode, bh, start + i);
}
} else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) {
printk("strange request: removal %lu-%lu from %u:%u\n",
from, to, ex->ee_block, ex->ee_len);
-Index: linux-2.6.5-7.201/fs/ext3/namei.c
+Index: linux-2.6.5-7.201/fs/ext3/inode.c
+===================================================================
+--- linux-2.6.5-7.201.orig/fs/ext3/inode.c 2005-12-17 02:53:30.000000000 +0300
++++ linux-2.6.5-7.201/fs/ext3/inode.c 2005-12-17 03:10:23.000000000 +0300
+@@ -572,7 +572,7 @@ static int ext3_alloc_branch(handle_t *h
+ ext3_journal_forget(handle, branch[i].bh);
+ }
+ for (i = 0; i < keys; i++)
+- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
++ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1);
+ return err;
+ }
+
+@@ -673,7 +673,7 @@ err_out:
+ if (err == -EAGAIN)
+ for (i = 0; i < num; i++)
+ ext3_free_blocks(handle, inode,
+- le32_to_cpu(where[i].key), 1);
++ le32_to_cpu(where[i].key), 1, 1);
+ return err;
+ }
+
+@@ -1835,7 +1835,7 @@ ext3_clear_blocks(handle_t *handle, stru
+ }
+ }
+
+- ext3_free_blocks(handle, inode, block_to_free, count);
++ ext3_free_blocks(handle, inode, block_to_free, count, 1);
+ }
+
+ /**
+@@ -2006,7 +2006,7 @@ static void ext3_free_branches(handle_t
+ ext3_journal_test_restart(handle, inode);
+ }
+
+- ext3_free_blocks(handle, inode, nr, 1);
++ ext3_free_blocks(handle, inode, nr, 1, 1);
+
+ if (parent_bh) {
+ /*
+Index: linux-2.6.5-7.201/fs/ext3/balloc.c
===================================================================
---- linux-2.6.5-7.201.orig/fs/ext3/namei.c 2005-10-14 08:59:35.000000000 +0400
-+++ linux-2.6.5-7.201/fs/ext3/namei.c 2005-10-14 08:59:39.000000000 +0400
-@@ -1640,7 +1640,7 @@
- * If the create succeeds, we fill in the inode information
- * with d_instantiate().
+--- linux-2.6.5-7.201.orig/fs/ext3/balloc.c 2005-10-11 00:12:45.000000000 +0400
++++ linux-2.6.5-7.201/fs/ext3/balloc.c 2005-12-17 03:10:23.000000000 +0300
+@@ -78,7 +78,7 @@ struct ext3_group_desc * ext3_get_group_
+ *
+ * Return buffer_head on success or NULL in case of failure.
+ */
+-static struct buffer_head *
++struct buffer_head *
+ read_block_bitmap(struct super_block *sb, unsigned int block_group)
+ {
+ struct ext3_group_desc * desc;
+@@ -274,7 +274,7 @@ void ext3_discard_reservation(struct ino
+ }
+
+ /* Free given blocks, update quota and i_blocks field */
+-void ext3_free_blocks(handle_t *handle, struct inode *inode,
++void ext3_free_blocks_old(handle_t *handle, struct inode *inode,
+ unsigned long block, unsigned long count)
+ {
+ struct buffer_head *bitmap_bh = NULL;
+@@ -1142,7 +1142,7 @@ int ext3_should_retry_alloc(struct super
+ * bitmap, and then for any free bit if that fails.
+ * This function also updates quota and i_blocks field.
*/
--static int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
-+int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
- struct nameidata *nd)
+-int ext3_new_block(handle_t *handle, struct inode *inode,
++int ext3_new_block_old(handle_t *handle, struct inode *inode,
+ unsigned long goal, int *errp)
{
- handle_t *handle;
+ struct buffer_head *bitmap_bh = NULL;
Index: linux-2.6.5-7.201/fs/ext3/xattr.c
===================================================================
---- linux-2.6.5-7.201.orig/fs/ext3/xattr.c 2005-10-14 08:59:36.000000000 +0400
-+++ linux-2.6.5-7.201/fs/ext3/xattr.c 2005-10-14 08:59:39.000000000 +0400
-@@ -1371,7 +1371,7 @@
+--- linux-2.6.5-7.201.orig/fs/ext3/xattr.c 2005-12-17 02:53:26.000000000 +0300
++++ linux-2.6.5-7.201/fs/ext3/xattr.c 2005-12-17 03:10:41.000000000 +0300
+@@ -1371,7 +1371,7 @@ ext3_xattr_set_handle2(handle_t *handle,
new_bh = sb_getblk(sb, block);
if (!new_bh) {
getblk_failed:
error = -EIO;
goto cleanup;
}
-@@ -1411,7 +1411,7 @@
+@@ -1411,7 +1411,7 @@ getblk_failed:
if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) {
/* Free the old block. */
ea_bdebug(old_bh, "freeing");
/* ext3_forget() calls bforget() for us, but we
let our caller release old_bh, so we need to
-@@ -1519,7 +1519,7 @@
+@@ -1519,7 +1519,7 @@ ext3_xattr_delete_inode(handle_t *handle
mb_cache_entry_free(ce);
ce = NULL;
}
get_bh(bh);
ext3_forget(handle, 1, inode, bh, EXT3_I(inode)->i_file_acl);
} else {
-Index: linux-2.6.5-7.201/fs/ext3/Makefile
-===================================================================
---- linux-2.6.5-7.201.orig/fs/ext3/Makefile 2005-10-14 08:59:38.000000000 +0400
-+++ linux-2.6.5-7.201/fs/ext3/Makefile 2005-10-14 08:59:39.000000000 +0400
-@@ -5,7 +5,7 @@
- obj-$(CONFIG_EXT3_FS) += ext3.o
-
- ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
-- ioctl.o namei.o super.o symlink.o hash.o extents.o
-+ ioctl.o namei.o super.o symlink.o hash.o extents.o mballoc.o
-
- ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
- ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
Index: linux-2.6.5-7.201/fs/ext3/mballoc.c
===================================================================
---- linux-2.6.5-7.201.orig/fs/ext3/mballoc.c 2005-10-13 19:40:57.851699336 +0400
-+++ linux-2.6.5-7.201/fs/ext3/mballoc.c 2005-10-14 09:02:36.000000000 +0400
-@@ -0,0 +1,1868 @@
+--- linux-2.6.5-7.201.orig/fs/ext3/mballoc.c 2005-12-09 13:08:53.191437750 +0300
++++ linux-2.6.5-7.201/fs/ext3/mballoc.c 2005-12-17 03:15:04.000000000 +0300
+@@ -0,0 +1,2435 @@
+/*
-+ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com
++ * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+#include <linux/module.h>
++#include <linux/swap.h>
++#include <linux/proc_fs.h>
++#include <linux/pagemap.h>
++#include <linux/seq_file.h>
+
+/*
+ * TODO:
-+ * - bitmap/buddy read-ahead (proposed by Oleg Drokin aka green)
++ * - bitmap read-ahead (proposed by Oleg Drokin aka green)
+ * - track min/max extents in each group for better group selection
-+ * - is it worthwhile to use buddies directly if req is 2^N blocks?
+ * - mb_mark_used() may allocate chunk right after splitting buddy
+ * - special flag to advice allocator to look for requested + N blocks
+ * this may improve interaction between extents and mballoc
+ */
+
+/*
-+ * with 'ext3_mb_aggressive' set the allocator runs consistency checks over
++ * with AGGRESSIVE_CHECK the allocator runs consistency checks over
+ * structures. these checks slow things down a lot
+ */
-+long ext3_mb_aggressive = 0;
-+
-+
-+/*
-+ * with 'ext3_mb_stats' allocator will collect stats that will be
-+ * shown at umount. The collecting costs though!
-+ */
-+long ext3_mb_stats = 1;
++#define AGGRESSIVE_CHECK__
+
+/*
+ */
+#endif
+
+/*
-+ * where to save buddies structures beetween umount/mount (clean case only)
++ * with EXT3_MB_HISTORY mballoc stores last N allocations in memory
++ * and you can monitor it in /proc/fs/ext3/<dev>/mb_history
+ */
-+#define EXT3_BUDDY_FILE ".buddy"
++#define EXT3_MB_HISTORY
+
+/*
+ * How long mballoc can look for a best extent (in found extents)
+ */
-+long ext3_mb_max_to_scan = 100;
++long ext3_mb_max_to_scan = 500;
+
+/*
-+ * This structure is on-disk description of a group for mballoc
++ * How long mballoc must look for a best extent
+ */
-+struct ext3_mb_group_descr {
-+ __u16 mgd_first_free; /* first free block in the group */
-+ __u16 mgd_free; /* number of free blocks in the group */
-+ __u16 mgd_counters[16]; /* number of free blocks by order */
-+};
++long ext3_mb_min_to_scan = 30;
+
+/*
-+ * This structure is header of mballoc's file
++ * with 'ext3_mb_stats' allocator will collect stats that will be
++ * shown at umount. The collecting costs though!
+ */
-+struct ext3_mb_grp_header {
-+ __u32 mh_magic;
++
++long ext3_mb_stats = 1;
++
++#ifdef EXT3_BB_MAX_BLOCKS
++#undef EXT3_BB_MAX_BLOCKS
++#endif
++#define EXT3_BB_MAX_BLOCKS 30
++
++struct ext3_free_metadata {
++ unsigned short group;
++ unsigned short num;
++ unsigned short blocks[EXT3_BB_MAX_BLOCKS];
++ struct list_head list;
+};
+
-+#define EXT3_MB_MAGIC_V1 0xbabd16fd
++struct ext3_group_info {
++ unsigned long bb_state;
++ unsigned long bb_tid;
++ struct ext3_free_metadata *bb_md_cur;
++ unsigned short bb_first_free;
++ unsigned short bb_free;
++ unsigned short bb_fragments;
++ unsigned short bb_counters[];
++};
++
++
++#define EXT3_GROUP_INFO_NEED_INIT_BIT 0
++#define EXT3_GROUP_INFO_LOCKED_BIT 1
+
++#define EXT3_MB_GRP_NEED_INIT(grp) \
++ (test_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &(grp)->bb_state))
+
+struct ext3_free_extent {
+ __u16 fe_start;
+ unsigned long ac_ex_scanned;
+ __u16 ac_groups_scanned;
+ __u16 ac_found;
++ __u16 ac_tail;
++ __u16 ac_buddy;
+ __u8 ac_status;
+ __u8 ac_flags; /* allocation hints */
++ __u8 ac_criteria;
+ __u8 ac_repeats;
++ __u8 ac_2order; /* if request is to allocate 2^N blocks and
++ * N > 0, the field stores N, otherwise 0 */
+};
+
+#define AC_STATUS_CONTINUE 1
+#define AC_STATUS_FOUND 2
+#define AC_STATUS_BREAK 3
+
++struct ext3_mb_history {
++ struct ext3_free_extent goal; /* goal allocation */
++ struct ext3_free_extent result; /* result allocation */
++ __u16 found; /* how many extents have been found */
++ __u16 groups; /* how many groups have been scanned */
++ __u16 tail; /* what tail broke some buddy */
++ __u16 buddy; /* buddy the tail ^^^ broke */
++ __u8 cr; /* which phase the result extent was found at */
++ __u8 merged;
++};
++
+struct ext3_buddy {
-+ struct buffer_head *bd_bh;
-+ struct buffer_head *bd_bh2;
-+ struct ext3_buddy_group_blocks *bd_bd;
++ struct page *bd_buddy_page;
++ void *bd_buddy;
++ struct page *bd_bitmap_page;
++ void *bd_bitmap;
++ struct ext3_group_info *bd_info;
+ struct super_block *bd_sb;
+ __u16 bd_blkbits;
+ __u16 bd_group;
+};
-+#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bh->b_data)
-+#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_bh2->b_data)
++#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bitmap)
++#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy)
++
++#ifndef EXT3_MB_HISTORY
++#define ext3_mb_store_history(sb,ac)
++#else
++static void ext3_mb_store_history(struct super_block *,
++ struct ext3_allocation_context *ac);
++#endif
+
+#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
+
++static struct proc_dir_entry *proc_root_ext3;
++
+int ext3_create (struct inode *, struct dentry *, int, struct nameidata *);
+struct buffer_head * read_block_bitmap(struct super_block *, unsigned int);
+int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *);
+static inline int mb_test_bit(int bit, void *addr)
+{
+ mb_correct_addr_and_bit(bit,addr);
-+ return ext3_test_bit(bit, addr);
++ return ext2_test_bit(bit, addr);
+}
+
+static inline void mb_set_bit(int bit, void *addr)
+{
+ mb_correct_addr_and_bit(bit,addr);
-+ ext3_set_bit(bit, addr);
++ ext2_set_bit(bit, addr);
+}
+
+static inline void mb_set_bit_atomic(int bit, void *addr)
+{
+ mb_correct_addr_and_bit(bit,addr);
-+ ext3_set_bit_atomic(NULL, bit, addr);
++ ext2_set_bit_atomic(NULL, bit, addr);
+}
+
+static inline void mb_clear_bit(int bit, void *addr)
+{
+ mb_correct_addr_and_bit(bit,addr);
-+ ext3_clear_bit(bit, addr);
++ ext2_clear_bit(bit, addr);
+}
+
+static inline void mb_clear_bit_atomic(int bit, void *addr)
+{
+ mb_correct_addr_and_bit(bit,addr);
-+ ext3_clear_bit_atomic(NULL, bit, addr);
++ ext2_clear_bit_atomic(NULL, bit, addr);
+}
+
-+static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max)
++static inline int mb_find_next_zero_bit(void *addr, int max, int start)
+{
-+ int i = 1;
-+ char *bb;
-+
-+ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b));
-+ J_ASSERT(max != NULL);
-+
-+ if (order > e3b->bd_blkbits + 1) {
-+ *max = 0;
-+ return NULL;
-+ }
-+
-+ /* at order 0 we see each particular block */
-+ *max = 1 << (e3b->bd_blkbits + 3);
-+ if (order == 0)
-+ return EXT3_MB_BITMAP(e3b);
-+
-+ bb = EXT3_MB_BUDDY(e3b);
-+ *max = *max >> 1;
-+ while (i < order) {
-+ bb += 1 << (e3b->bd_blkbits - i);
-+ i++;
-+ *max = *max >> 1;
-+ }
-+ J_ASSERT((unsigned) (bb - (char *) EXT3_MB_BUDDY(e3b)) <
-+ e3b->bd_sb->s_blocksize);
-+ return bb;
++ int fix;
++#if BITS_PER_LONG == 64
++ fix = ((unsigned long) addr & 7UL) << 3;
++ addr = (void *) ((unsigned long) addr & ~7UL);
++#elif BITS_PER_LONG == 32
++ fix = ((unsigned long) addr & 3UL) << 3;
++ addr = (void *) ((unsigned long) addr & ~3UL);
++#else
++#error "how many bits you are?!"
++#endif
++ max += fix;
++ start += fix;
++ return ext2_find_next_zero_bit(addr, max, start) - fix;
+}
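++/*
++ * note: ext2_find_next_zero_bit() wants a long-aligned address on some
++ * 64-bit machines, so the wrapper above rounds the address down to a
++ * long boundary and compensates by biasing 'start' and 'max' with the
++ * displaced bit count 'fix' (e.g. an address 6 bytes into a word gives
++ * fix = 48 on a 64-bit box).
++ */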
-+
-+static int ext3_mb_load_buddy(struct super_block *sb, int group,
-+ struct ext3_buddy *e3b)
++
++static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max)
+{
-+ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ char *bb;
+
-+ J_ASSERT(sbi->s_buddy_blocks[group]->bb_bitmap);
-+ J_ASSERT(sbi->s_buddy_blocks[group]->bb_buddy);
++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b));
++ J_ASSERT(max != NULL);
+
-+ /* load bitmap */
-+ e3b->bd_bh = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_bitmap);
-+ if (e3b->bd_bh == NULL) {
-+ ext3_error(sb, "ext3_mb_load_buddy",
-+ "can't get block for buddy bitmap\n");
-+ goto out;
-+ }
-+ /* load buddy */
-+ e3b->bd_bh2 = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_buddy);
-+ if (e3b->bd_bh2 == NULL) {
-+ ext3_error(sb, "ext3_mb_load_buddy",
-+ "can't get block for buddy bitmap\n");
-+ goto out;
++ if (order > e3b->bd_blkbits + 1) {
++ *max = 0;
++ return NULL;
+ }
+
-+ if (!buffer_uptodate(e3b->bd_bh))
-+ ll_rw_block(READ, 1, &e3b->bd_bh);
-+ if (!buffer_uptodate(e3b->bd_bh2))
-+ ll_rw_block(READ, 1, &e3b->bd_bh2);
-+
-+ wait_on_buffer(e3b->bd_bh);
-+ J_ASSERT(buffer_uptodate(e3b->bd_bh));
-+ wait_on_buffer(e3b->bd_bh2);
-+ J_ASSERT(buffer_uptodate(e3b->bd_bh2));
-+
-+ e3b->bd_blkbits = sb->s_blocksize_bits;
-+ e3b->bd_bd = sbi->s_buddy_blocks[group];
-+ e3b->bd_sb = sb;
-+ e3b->bd_group = group;
++ /* at order 0 we see each particular block */
++ *max = 1 << (e3b->bd_blkbits + 3);
++ if (order == 0)
++ return EXT3_MB_BITMAP(e3b);
+
-+ return 0;
-+out:
-+ brelse(e3b->bd_bh);
-+ brelse(e3b->bd_bh2);
-+ e3b->bd_bh = NULL;
-+ e3b->bd_bh2 = NULL;
-+ return -EIO;
-+}
++ bb = EXT3_MB_BUDDY(e3b) + EXT3_SB(e3b->bd_sb)->s_mb_offsets[order];
++ *max = EXT3_SB(e3b->bd_sb)->s_mb_maxs[order];
+
-+static void ext3_mb_dirty_buddy(struct ext3_buddy *e3b)
-+{
-+ mark_buffer_dirty(e3b->bd_bh);
-+ mark_buffer_dirty(e3b->bd_bh2);
++ return bb;
+}
+
-+static void ext3_mb_release_desc(struct ext3_buddy *e3b)
-+{
-+ brelse(e3b->bd_bh);
-+ brelse(e3b->bd_bh2);
-+}
++#ifdef AGGRESSIVE_CHECK
+
+static void mb_check_buddy(struct ext3_buddy *e3b)
+{
+ int order = e3b->bd_blkbits + 1;
+ int max, max2, i, j, k, count;
++ int fragments = 0, fstart;
+ void *buddy, *buddy2;
+
-+ if (likely(!ext3_mb_aggressive))
-+ return;
-+
+ if (!test_opt(e3b->bd_sb, MBALLOC))
+ return;
+
++ {
++ static int mb_check_counter = 0;
++ if (mb_check_counter++ % 300 != 0)
++ return;
++ }
++
+ while (order > 1) {
+ buddy = mb_find_buddy(e3b, order, &max);
+ J_ASSERT(buddy);
+ }
+ count++;
+ }
-+ J_ASSERT(e3b->bd_bd->bb_counters[order] == count);
++ J_ASSERT(e3b->bd_info->bb_counters[order] == count);
+ order--;
+ }
+
++ fstart = -1;
+ buddy = mb_find_buddy(e3b, 0, &max);
+ for (i = 0; i < max; i++) {
-+ if (!mb_test_bit(i, buddy))
++ if (!mb_test_bit(i, buddy)) {
++ J_ASSERT(i >= e3b->bd_info->bb_first_free);
++ if (fstart == -1) {
++ fragments++;
++ fstart = i;
++ }
+ continue;
++ }
++ fstart = -1;
+ /* check used bits only */
+ for (j = 0; j < e3b->bd_blkbits + 1; j++) {
+ buddy2 = mb_find_buddy(e3b, j, &max2);
+ J_ASSERT(mb_test_bit(k, buddy2));
+ }
+ }
++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(e3b->bd_info));
++ J_ASSERT(e3b->bd_info->bb_fragments == fragments);
++}
++
++#else
++#define mb_check_buddy(e3b)
++#endif
++
++/* find most significant bit */
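++/* e.g. fmsb(1) = 0, fmsb(6) = 2, fmsb(256) = 8, i.e. floor(log2(word)) */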
++static int inline fmsb(unsigned short word)
++{
++ int order;
++
++ if (word > 255) {
++ order = 7;
++ word >>= 8;
++ } else {
++ order = -1;
++ }
++
++ do {
++ order++;
++ word >>= 1;
++ } while (word != 0);
++
++ return order;
++}
++
++static void inline
++ext3_mb_mark_free_simple(struct super_block *sb, void *buddy, unsigned first,
++ int len, struct ext3_group_info *grp)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ unsigned short min, max, chunk, border;
++
++ mb_debug("mark %u/%u free\n", first, len);
++ J_ASSERT(len < EXT3_BLOCKS_PER_GROUP(sb));
++
++ border = 2 << sb->s_blocksize_bits;
++
++ while (len > 0) {
++ /* find how many blocks can be covered since this position */
++ max = ffs(first | border) - 1;
++
++ /* find how many blocks of power 2 we need to mark */
++ min = fmsb(len);
++
++ mb_debug(" %u/%u -> max %u, min %u\n",
++ first & ((2 << sb->s_blocksize_bits) - 1),
++ len, max, min);
++
++ if (max < min)
++ min = max;
++ chunk = 1 << min;
++
++ /* mark multiblock chunks only */
++ grp->bb_counters[min]++;
++ if (min > 0) {
++ mb_debug(" set %u at %u \n", first >> min,
++ sbi->s_mb_offsets[min]);
++ mb_clear_bit(first >> min, buddy + sbi->s_mb_offsets[min]);
++ }
++
++ len -= chunk;
++ first += chunk;
++ }
++}
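++/*
++ * worked example: first = 5, len = 6 gets split at power-of-two
++ * boundaries into 5 (order 0), 6-7 (order 1), 8-9 (order 1) and
++ * 10 (order 0); 'max' caps each chunk by the alignment of 'first',
++ * 'min' by the largest power of two not exceeding 'len'.
++ */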
++
++static void
++ext3_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap,
++ struct ext3_group_info *grp)
++{
++ unsigned short max = EXT3_BLOCKS_PER_GROUP(sb);
++ unsigned short i = 0, first, len;
++ unsigned free = 0, fragments = 0;
++ unsigned long long period = get_cycles();
++
++ i = mb_find_next_zero_bit(bitmap, max, 0);
++ grp->bb_first_free = i;
++ while (i < max) {
++ fragments++;
++ first = i;
++ i = find_next_bit(bitmap, max, i);
++ len = i - first;
++ free += len;
++ if (len > 1)
++ ext3_mb_mark_free_simple(sb, buddy, first, len, grp);
++ else
++ grp->bb_counters[0]++;
++ if (i < max)
++ i = mb_find_next_zero_bit(bitmap, max, i);
++ }
++ grp->bb_fragments = fragments;
++
++ /* bb_state shouldn't be modified because all
++ * others wait for init completion on the page lock */
++ clear_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &grp->bb_state);
++ if (free != grp->bb_free) {
++ printk("EXT3-fs: %u blocks in bitmap, %u in group descriptor\n",
++ free, grp->bb_free);
++ grp->bb_free = free;
++ }
++
++ period = get_cycles() - period;
++ spin_lock(&EXT3_SB(sb)->s_bal_lock);
++ EXT3_SB(sb)->s_mb_buddies_generated++;
++ EXT3_SB(sb)->s_mb_generation_time += period;
++ spin_unlock(&EXT3_SB(sb)->s_bal_lock);
++}
++
++static int ext3_mb_init_cache(struct page *page)
++{
++ int blocksize, blocks_per_page, groups_per_page;
++ int err = 0, i, first_group, first_block;
++ struct super_block *sb;
++ struct buffer_head *bhs;
++ struct buffer_head **bh;
++ struct inode *inode;
++ char *data, *bitmap;
++
++ mb_debug("init page %lu\n", page->index);
++
++ inode = page->mapping->host;
++ sb = inode->i_sb;
++ blocksize = 1 << inode->i_blkbits;
++ blocks_per_page = PAGE_CACHE_SIZE / blocksize;
++
++ groups_per_page = blocks_per_page >> 1;
++ if (groups_per_page == 0)
++ groups_per_page = 1;
++
++ /* allocate buffer_heads to read bitmaps */
++ if (groups_per_page > 1) {
++ err = -ENOMEM;
++ i = sizeof(struct buffer_head *) * groups_per_page;
++ bh = kmalloc(i, GFP_NOFS);
++ if (bh == NULL)
++ goto out;
++ memset(bh, 0, i);
++ } else
++ bh = &bhs;
++
++ first_group = page->index * blocks_per_page / 2;
++
++ /* read all groups the page covers into the cache */
++ for (i = 0; i < groups_per_page; i++) {
++ struct ext3_group_desc * desc;
++
++ if (first_group + i >= EXT3_SB(sb)->s_groups_count)
++ break;
++
++ err = -EIO;
++ desc = ext3_get_group_desc(sb, first_group + i, NULL);
++ if (desc == NULL)
++ goto out;
++
++ err = -ENOMEM;
++ bh[i] = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap));
++ if (bh[i] == NULL)
++ goto out;
++
++ if (buffer_uptodate(bh[i]))
++ continue;
++
++ lock_buffer(bh[i]);
++ if (buffer_uptodate(bh[i])) {
++ unlock_buffer(bh[i]);
++ continue;
++ }
++
++ get_bh(bh[i]);
++ bh[i]->b_end_io = end_buffer_read_sync;
++ submit_bh(READ, bh[i]);
++ mb_debug("read bitmap for group %u\n", first_group + i);
++ }
++
++ /* wait for I/O completion */
++ for (i = 0; i < groups_per_page && bh[i]; i++)
++ wait_on_buffer(bh[i]);
++
++ /* XXX: I/O error handling here */
++ err = 0;
++
++ first_block = page->index * blocks_per_page;
++ for (i = 0; i < blocks_per_page; i++) {
++ int group;
++
++ group = (first_block + i) >> 1;
++ if (group >= EXT3_SB(sb)->s_groups_count)
++ break;
++
++ data = page_address(page) + (i * blocksize);
++ bitmap = bh[group - first_group]->b_data;
++
++ if ((first_block + i) & 1) {
++ /* this is block of buddy */
++ mb_debug("put buddy for group %u in page %lu/%x\n",
++ group, page->index, i * blocksize);
++ memset(data, 0xff, blocksize);
++ EXT3_SB(sb)->s_group_info[group]->bb_fragments = 0;
++ memset(EXT3_SB(sb)->s_group_info[group]->bb_counters, 0,
++ sizeof(unsigned short)*(sb->s_blocksize_bits+2));
++ ext3_mb_generate_buddy(sb, data, bitmap,
++ EXT3_SB(sb)->s_group_info[group]);
++ } else {
++ /* this is block of bitmap */
++ mb_debug("put bitmap for group %u in page %lu/%x\n",
++ group, page->index, i * blocksize);
++ memcpy(data, bitmap, blocksize);
++ }
++ }
++ SetPageUptodate(page);
++
++out:
++ if (bh) {
++ for (i = 0; i < groups_per_page && bh[i]; i++)
++ brelse(bh[i]);
++ if (bh != &bhs)
++ kfree(bh);
++ }
++ return err;
++}
++
++static int ext3_mb_load_buddy(struct super_block *sb, int group,
++ struct ext3_buddy *e3b)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ struct inode *inode = sbi->s_buddy_cache;
++ int blocks_per_page, block, pnum, poff;
++ struct page *page;
++
++ mb_debug("load group %u\n", group);
++
++ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
++
++ e3b->bd_blkbits = sb->s_blocksize_bits;
++ e3b->bd_info = sbi->s_group_info[group];
++ e3b->bd_sb = sb;
++ e3b->bd_group = group;
++ e3b->bd_buddy_page = NULL;
++ e3b->bd_bitmap_page = NULL;
++
++ block = group * 2;
++ pnum = block / blocks_per_page;
++ poff = block % blocks_per_page;
++
++ page = find_get_page(inode->i_mapping, pnum);
++ if (page == NULL || !PageUptodate(page)) {
++ if (page)
++ page_cache_release(page);
++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
++ if (page) {
++ if (!PageUptodate(page))
++ ext3_mb_init_cache(page);
++ unlock_page(page);
++ }
++ }
++ if (page == NULL || !PageUptodate(page))
++ goto err;
++ e3b->bd_bitmap_page = page;
++ e3b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
++ mark_page_accessed(page);
++
++ block++;
++ pnum = block / blocks_per_page;
++ poff = block % blocks_per_page;
++
++ page = find_get_page(inode->i_mapping, pnum);
++ if (page == NULL || !PageUptodate(page)) {
++ if (page)
++ page_cache_release(page);
++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
++ if (page) {
++ if (!PageUptodate(page))
++ ext3_mb_init_cache(page);
++ unlock_page(page);
++ }
++ }
++ if (page == NULL || !PageUptodate(page))
++ goto err;
++ e3b->bd_buddy_page = page;
++ e3b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
++ mark_page_accessed(page);
++
++ J_ASSERT(e3b->bd_bitmap_page != NULL);
++ J_ASSERT(e3b->bd_buddy_page != NULL);
++
++ return 0;
++
++err:
++ if (e3b->bd_bitmap_page)
++ page_cache_release(e3b->bd_bitmap_page);
++ if (e3b->bd_buddy_page)
++ page_cache_release(e3b->bd_buddy_page);
++ e3b->bd_buddy = NULL;
++ e3b->bd_bitmap = NULL;
++ return -EIO;
++}
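
The page-cache layout behind the two lookups: every group owns two consecutive logical blocks in the buddy-cache inode, block 2g holding the copy of the on-disk bitmap and block 2g+1 the generated buddy, so locating either is the division/modulo pair computed above. A sketch of the mapping, assuming 4K pages and 1K filesystem blocks:

	#include <stdio.h>

	int main(void)
	{
		int blocksize = 1024, page_size = 4096;
		int blocks_per_page = page_size / blocksize;
		int group;

		for (group = 0; group < 4; group++) {
			int bitmap = group * 2;		/* even block: bitmap copy */
			int buddy = group * 2 + 1;	/* odd block: buddy data */

			printf("group %d: bitmap page %d+%d, buddy page %d+%d\n",
			       group,
			       bitmap / blocks_per_page,
			       (bitmap % blocks_per_page) * blocksize,
			       buddy / blocks_per_page,
			       (buddy % blocks_per_page) * blocksize);
		}
		return 0;
	}

With 1K blocks a page holds two groups; with 4K blocks (one block per page) a group's bitmap and buddy land on two separate pages, which is why ext3_mb_init_cache() clamps groups_per_page to at least one.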
++
++static void ext3_mb_release_desc(struct ext3_buddy *e3b)
++{
++ if (e3b->bd_bitmap_page)
++ page_cache_release(e3b->bd_bitmap_page);
++ if (e3b->bd_buddy_page)
++ page_cache_release(e3b->bd_buddy_page);
+}
+
++
+static inline void
+ext3_lock_group(struct super_block *sb, int group)
+{
-+ spin_lock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock);
++ bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT,
++ &EXT3_SB(sb)->s_group_info[group]->bb_state);
+}
+
+static inline void
+ext3_unlock_group(struct super_block *sb, int group)
+{
-+ spin_unlock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock);
++ bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT,
++ &EXT3_SB(sb)->s_group_info[group]->bb_state);
+}
+
+static int mb_find_order_for_block(struct ext3_buddy *e3b, int block)
+
+static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count)
+{
-+ int block, max, order;
++ int block = 0, max = 0, order;
+ void *buddy, *buddy2;
+
+ mb_check_buddy(e3b);
+
-+ e3b->bd_bd->bb_free += count;
-+ if (first < e3b->bd_bd->bb_first_free)
-+ e3b->bd_bd->bb_first_free = first;
-+
++ e3b->bd_info->bb_free += count;
++ if (first < e3b->bd_info->bb_first_free)
++ e3b->bd_info->bb_first_free = first;
++
++ /* let's maintain fragments counter */
++ if (first != 0)
++ block = !mb_test_bit(first - 1, EXT3_MB_BITMAP(e3b));
++ if (first + count < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0])
++ max = !mb_test_bit(first + count, EXT3_MB_BITMAP(e3b));
++ if (block && max)
++ e3b->bd_info->bb_fragments--;
++ else if (!block && !max)
++ e3b->bd_info->bb_fragments++;
++
++ /* let's maintain buddy itself */
+ while (count-- > 0) {
+ block = first++;
+ order = 0;
+
+ J_ASSERT(mb_test_bit(block, EXT3_MB_BITMAP(e3b)));
+ mb_clear_bit(block, EXT3_MB_BITMAP(e3b));
-+ e3b->bd_bd->bb_counters[order]++;
++ e3b->bd_info->bb_counters[order]++;
+
+ /* start of the buddy */
+ buddy = mb_find_buddy(e3b, order, &max);
+ mb_set_bit(block, buddy);
+ mb_set_bit(block + 1, buddy);
+ }
-+ e3b->bd_bd->bb_counters[order]--;
-+ e3b->bd_bd->bb_counters[order]--;
++ e3b->bd_info->bb_counters[order]--;
++ e3b->bd_info->bb_counters[order]--;
+
+ block = block >> 1;
+ order++;
-+ e3b->bd_bd->bb_counters[order]++;
++ e3b->bd_info->bb_counters[order]++;
+
+ mb_clear_bit(block, buddy2);
+ buddy = buddy2;
+}
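
The fragment bookkeeping needs only the two bits just outside the freed range: if both neighbours are already free the run bridges two fragments into one, if both are in use it becomes a fragment of its own, and otherwise it just extends an existing fragment. The decision table in isolation (hypothetical helper, not part of the patch):

	#include <stdio.h>

	/* left_free/right_free mirror the !mb_test_bit() neighbour tests */
	static int free_fragment_delta(int left_free, int right_free)
	{
		if (left_free && right_free)
			return -1;	/* run bridges two fragments */
		if (!left_free && !right_free)
			return +1;	/* run becomes a new fragment */
		return 0;		/* run extends an existing fragment */
	}

	int main(void)
	{
		printf("both free: %+d\n", free_fragment_delta(1, 1));
		printf("both used: %+d\n", free_fragment_delta(0, 0));
		printf("mixed:     %+d\n", free_fragment_delta(1, 0));
		return 0;
	}

mb_mark_used() applies the same table with the signs flipped, since allocating out of the middle of a free run splits it in two.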
+
+static int mb_find_extent(struct ext3_buddy *e3b, int order, int block,
-+ int needed, struct ext3_free_extent *ex)
++ int needed, struct ext3_free_extent *ex)
+{
+ int next, max, ord;
+ void *buddy;
+ return 0;
+ }
+
-+ if (order == 0) {
++ if (likely(order == 0)) {
+ /* find actual order */
+ order = mb_find_order_for_block(e3b, block);
+ block = block >> order;
+ ex->fe_start = block << order;
+ ex->fe_group = e3b->bd_group;
+
-+ while ((buddy = mb_find_buddy(e3b, order, &max))) {
++ while (needed > ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) {
+
+ if (block + 1 >= max)
+ break;
+
+static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex)
+{
++ int ord, mlen = 0, max = 0, cur;
+ int start = ex->fe_start;
+ int len = ex->fe_len;
-+ int ord, mlen, max, cur;
++ unsigned ret = 0;
+ int len0 = len;
+ void *buddy;
+
-+ e3b->bd_bd->bb_free -= len;
-+ if (e3b->bd_bd->bb_first_free == start)
-+ e3b->bd_bd->bb_first_free += len;
++ mb_check_buddy(e3b);
+
++ e3b->bd_info->bb_free -= len;
++ if (e3b->bd_info->bb_first_free == start)
++ e3b->bd_info->bb_first_free += len;
++
++ /* let's maintain fragments counter */
++ if (start != 0)
++ mlen = !mb_test_bit(start - 1, EXT3_MB_BITMAP(e3b));
++ if (start + len < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0])
++ max = !mb_test_bit(start + len, EXT3_MB_BITMAP(e3b));
++ if (mlen && max)
++ e3b->bd_info->bb_fragments++;
++ else if (!mlen && !max)
++ e3b->bd_info->bb_fragments--;
++
++ /* let's maintain buddy itself */
+ while (len) {
+ ord = mb_find_order_for_block(e3b, start);
+
+ buddy = mb_find_buddy(e3b, ord, &max);
+ J_ASSERT((start >> ord) < max);
+ mb_set_bit(start >> ord, buddy);
-+ e3b->bd_bd->bb_counters[ord]--;
++ e3b->bd_info->bb_counters[ord]--;
+ start += mlen;
+ len -= mlen;
+ J_ASSERT(len >= 0);
+ continue;
+ }
+
++ /* store for history */
++ if (ret == 0)
++ ret = len | (ord << 16);
++
+ /* we have to split large buddy */
+ J_ASSERT(ord > 0);
+ buddy = mb_find_buddy(e3b, ord, &max);
+ mb_set_bit(start >> ord, buddy);
-+ e3b->bd_bd->bb_counters[ord]--;
++ e3b->bd_info->bb_counters[ord]--;
+
+ ord--;
+ cur = (start >> ord) & ~1U;
+ buddy = mb_find_buddy(e3b, ord, &max);
+ mb_clear_bit(cur, buddy);
+ mb_clear_bit(cur + 1, buddy);
-+ e3b->bd_bd->bb_counters[ord]++;
-+ e3b->bd_bd->bb_counters[ord]++;
++ e3b->bd_info->bb_counters[ord]++;
++ e3b->bd_info->bb_counters[ord]++;
+ }
+
+ /* now drop all the bits in bitmap */
+
+ mb_check_buddy(e3b);
+
-+ return 0;
++ return ret;
+}
+
+/*
+static void ext3_mb_use_best_found(struct ext3_allocation_context *ac,
+ struct ext3_buddy *e3b)
+{
++ unsigned long ret;
++
+ ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
-+ mb_mark_used(e3b, &ac->ac_b_ex);
++ ret = mb_mark_used(e3b, &ac->ac_b_ex);
++
+ ac->ac_status = AC_STATUS_FOUND;
++ ac->ac_tail = ret & 0xffff;
++ ac->ac_buddy = ret >> 16;
+}
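
mb_mark_used() now returns a small history record packed into a single word: the low 16 bits carry the tail length at the moment of the first buddy split, the high bits the order that had to be broken, and ext3_mb_use_best_found() unpacks them into ac_tail and ac_buddy. The round trip in isolation, with made-up numbers:

	#include <stdio.h>

	int main(void)
	{
		unsigned len = 5, ord = 3;		/* hypothetical first split */
		unsigned ret = len | (ord << 16);	/* packed as in mb_mark_used() */

		printf("ac_tail  = %u\n", ret & 0xffff);	/* leftover length */
		printf("ac_buddy = 2^%u\n", ret >> 16);		/* broken order */
		return 0;
	}

Both fields end up in the mb_history record; they are where the "tail" and "broken" columns of the proc mb_history file come from.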
+
+/*
+ struct ext3_free_extent *ex,
+ struct ext3_buddy *e3b)
+{
-+ int factor = EXT3_SB(ac->ac_sb)->s_mb_factor;
+ struct ext3_free_extent *bex = &ac->ac_b_ex;
-+ int diff = ac->ac_g_ex.fe_len - ex->fe_len;
++ struct ext3_free_extent *gex = &ac->ac_g_ex;
+
+ J_ASSERT(ex->fe_len > 0);
+ J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8);
+ /*
+ * The special case - take what you catch first
+ */
-+ if (ac->ac_flags & EXT3_MB_HINT_FIRST) {
++ if (unlikely(ac->ac_flags & EXT3_MB_HINT_FIRST)) {
+ *bex = *ex;
+ ext3_mb_use_best_found(ac, e3b);
+ return;
+ /*
+ * Let's check whether the chunk is good enough
+ */
-+ if (ex->fe_len >= ac->ac_g_ex.fe_len) {
-+ *bex = *ex;
-+ ext3_mb_use_best_found(ac, e3b);
-+ return;
-+ }
-+
-+ /*
-+ * If the request is vey large, then it makes sense to use large
-+ * chunks for it. Even if they don't satisfy whole request.
-+ */
-+ if (ex->fe_len > 1000) {
-+ *bex = *ex;
-+ ext3_mb_use_best_found(ac, e3b);
-+ return;
-+ }
-+
-+ /*
-+ * Sometimes it's worty to take close chunk
-+ */
-+ if (factor && (ac->ac_g_ex.fe_len * 100) / (diff * 100) >= factor) {
++ if (ex->fe_len == gex->fe_len) {
+ *bex = *ex;
+ ext3_mb_use_best_found(ac, e3b);
+ return;
+
+ /*
+ * If new found extent is better, store it in the context
-+ * FIXME: possible the policy should be more complex?
+ */
-+ if (ex->fe_len > bex->fe_len) {
++ if (bex->fe_len < gex->fe_len) {
++ /* if the request isn't satisfied, any found extent
++ * larger than previous best one is better */
++ if (ex->fe_len > bex->fe_len)
++ *bex = *ex;
++ } else if (ex->fe_len > gex->fe_len) {
++ /* if the request is satisfied, then we try to find
++ * an extent that still satisfies the request, but is
++ * smaller than the previous one */
+ *bex = *ex;
+ }
+
+ /*
++ * Let's scan at least a few extents and not pick the first one
++ */
++ if (bex->fe_len > gex->fe_len && ac->ac_found > ext3_mb_min_to_scan)
++ ac->ac_status = AC_STATUS_BREAK;
++
++ /*
+ * We don't want to scan for a whole year
+ */
+ if (ac->ac_found > ext3_mb_max_to_scan)
+ ext3_lock_group(ac->ac_sb, group);
+ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex);
+
-+ if (max > 0)
++ if (max > 0) {
++ ac->ac_b_ex = ex;
+ ext3_mb_use_best_found(ac, e3b);
++ }
+
+ ext3_unlock_group(ac->ac_sb, group);
+
-+ if (ac->ac_status == AC_STATUS_FOUND)
-+ ext3_mb_dirty_buddy(e3b);
+ ext3_mb_release_desc(e3b);
+
+ return 0;
+ J_ASSERT(ex.fe_len > 0);
+ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group);
+ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start);
++ ac->ac_found++;
+ ac->ac_b_ex = ex;
+ ext3_mb_use_best_found(ac, e3b);
+ }
+ ext3_unlock_group(ac->ac_sb, group);
+
-+ if (ac->ac_status == AC_STATUS_FOUND)
-+ ext3_mb_dirty_buddy(e3b);
+ ext3_mb_release_desc(e3b);
+
+ return 0;
+}
++
++/*
++ * The routine scans buddy structures (not bitmap!) from given order
++ * to max order and tries to find big enough chunk to satisfy the req
++ */
++static void ext3_mb_simple_scan_group(struct ext3_allocation_context *ac,
++ struct ext3_buddy *e3b)
++{
++ struct super_block *sb = ac->ac_sb;
++ struct ext3_group_info *grp = e3b->bd_info;
++ void *buddy;
++ int i, k, max;
++
++ J_ASSERT(ac->ac_2order > 0);
++ for (i = ac->ac_2order; i < sb->s_blocksize_bits + 1; i++) {
++ if (grp->bb_counters[i] == 0)
++ continue;
++
++ buddy = mb_find_buddy(e3b, i, &max);
++ if (buddy == NULL) {
++ printk(KERN_ALERT "looking for wrong order?\n");
++ break;
++ }
++
++ k = mb_find_next_zero_bit(buddy, max, 0);
++ J_ASSERT(k < max);
++
++ ac->ac_found++;
++
++ ac->ac_b_ex.fe_len = 1 << i;
++ ac->ac_b_ex.fe_start = k << i;
++ ac->ac_b_ex.fe_group = e3b->bd_group;
++
++ ext3_mb_use_best_found(ac, e3b);
++ J_ASSERT(ac->ac_b_ex.fe_len == ac->ac_g_ex.fe_len);
++
++ if (unlikely(ext3_mb_stats))
++ atomic_inc(&EXT3_SB(sb)->s_bal_2orders);
++
++ break;
++ }
++}
++
+/*
+ * The routine scans the group and measures all found extents.
+ * In order to optimize scanning, caller must pass number of
-+ * free blocks in the group, so the routine can upper limit.
++ * free blocks in the group, so the routine knows the upper limit.
+ */
-+static void ext3_mb_scan_group(struct ext3_allocation_context *ac,
-+ struct ext3_buddy *e3b)
++static void ext3_mb_complex_scan_group(struct ext3_allocation_context *ac,
++ struct ext3_buddy *e3b)
+{
+ struct super_block *sb = ac->ac_sb;
+ void *bitmap = EXT3_MB_BITMAP(e3b);
+ struct ext3_free_extent ex;
+ int i, free;
+
-+ free = e3b->bd_bd->bb_free;
++ free = e3b->bd_info->bb_free;
+ J_ASSERT(free > 0);
+
-+ i = e3b->bd_bd->bb_first_free;
++ i = e3b->bd_info->bb_first_free;
+
-+ while (free && ac->ac_status != AC_STATUS_FOUND) {
-+ i = ext3_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i);
++ while (free && ac->ac_status == AC_STATUS_CONTINUE) {
++ i = mb_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i);
+ if (i >= sb->s_blocksize * 8) {
+ J_ASSERT(free == 0);
+ break;
+static int ext3_mb_good_group(struct ext3_allocation_context *ac,
+ int group, int cr)
+{
-+ int free;
++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb);
++ struct ext3_group_info *grp = sbi->s_group_info[group];
++ unsigned free, fragments, i, bits;
+
-+ J_ASSERT(cr >= 0 && cr < 3);
++ J_ASSERT(cr >= 0 && cr < 4);
++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(grp));
+
-+ free = EXT3_SB(ac->ac_sb)->s_buddy_blocks[group]->bb_free;
++ free = grp->bb_free;
++ fragments = grp->bb_fragments;
+ if (free == 0)
+ return 0;
++ if (fragments == 0)
++ return 0;
+
-+ if (cr == 0) {
-+ if (free >= ac->ac_g_ex.fe_len >> 1)
++ switch (cr) {
++ case 0:
++ J_ASSERT(ac->ac_2order != 0);
++ bits = ac->ac_sb->s_blocksize_bits + 1;
++ for (i = ac->ac_2order; i < bits; i++)
++ if (grp->bb_counters[i] > 0)
++ return 1;
++ break;
++ case 1:
++ if ((free / fragments) >= ac->ac_g_ex.fe_len)
++ return 1;
++ break;
++ case 2:
++ if (free >= ac->ac_g_ex.fe_len)
++ return 1;
++ break;
++ case 3:
++ return 1;
-+ } else if (cr == 1) {
-+ if (free >= ac->ac_g_ex.fe_len >> 2)
-+ return 1;
-+ } else if (cr == 2) {
-+ return 1;
++ default:
++ BUG();
+ }
++
+ return 0;
+}
+
+ ac.ac_g_ex.fe_start = block;
+ ac.ac_g_ex.fe_len = *len;
+ ac.ac_flags = flags;
++ ac.ac_2order = 0;
++ ac.ac_criteria = 0;
+
-+ /*
-+ * Sometimes, caller may want to merge even small number
-+ * of blocks to an existing extent
-+ */
++ /* probably, the request is for 2^8+ blocks (1/2/3/... MB) */
++ i = ffs(*len);
++ if (i >= 8) {
++ i--;
++ if ((*len & (~(1 << i))) == 0)
++ ac.ac_2order = i;
++ }
++
++ /* Sometimes, caller may want to merge even small
++ * number of blocks to an existing extent */
+ if (ac.ac_flags & EXT3_MB_HINT_MERGE) {
+ err = ext3_mb_find_by_goal(&ac, &e3b);
+ if (err)
+ goto found;
+ }
+
-+ /*
-+ * FIXME
-+ * If requested chunk is power of 2 length, we can try
-+ * to exploit buddy nature to speed allocation up
-+ */
-+
-+
-+ /*
-+ * Let's just scan groups to find more-less suitable blocks
-+ */
-+ cr = 0;
++ /* Let's just scan groups to find more or less suitable blocks */
++ cr = ac.ac_2order ? 0 : 1;
+repeat:
-+ for (; cr < 3 && ac.ac_status == AC_STATUS_CONTINUE; cr++) {
++ for (; cr < 4 && ac.ac_status == AC_STATUS_CONTINUE; cr++) {
++ ac.ac_criteria = cr;
+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) {
+ if (group == EXT3_SB(sb)->s_groups_count)
+ group = 0;
+
++ if (EXT3_MB_GRP_NEED_INIT(sbi->s_group_info[group])) {
++ /* we need full data about the group
++ * to make a good selection */
++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b);
++ if (err)
++ goto out_err;
++ ext3_mb_release_desc(&e3b);
++ }
++
+ /* check whether the group is good for our criteria */
+ if (!ext3_mb_good_group(&ac, group, cr))
+ continue;
+ continue;
+ }
+
-+ ext3_mb_scan_group(&ac, &e3b);
++ ac.ac_groups_scanned++;
++ if (cr == 0)
++ ext3_mb_simple_scan_group(&ac, &e3b);
++ else
++ ext3_mb_complex_scan_group(&ac, &e3b);
++
+ ext3_unlock_group(sb, group);
+
-+ if (ac.ac_status == AC_STATUS_FOUND)
-+ ext3_mb_dirty_buddy(&e3b);
+ ext3_mb_release_desc(&e3b);
+
+ if (err)
+ }
+ }
+
-+ if (ac.ac_status == AC_STATUS_BREAK &&
++ if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND &&
+ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) {
-+ /* We've been searching too long. Let's try to allocate
-+ * the best chunk we've found so far. */
-+ if (ac.ac_g_ex.fe_len >= 128 &&
-+ ac.ac_b_ex.fe_len < ac.ac_g_ex.fe_len / 4)
-+ ext3_warning(inode->i_sb, __FUNCTION__,
-+ "too long searching: got %d want %d\n",
-+ ac.ac_b_ex.fe_len, ac.ac_g_ex.fe_len);
++ /*
++ * We've been searching too long. Let's try to allocate
++ * the best chunk we've found so far
++ */
++
++ /*if (ac.ac_found > ext3_mb_max_to_scan)
++ printk(KERN_ERR "EXT3-fs: too long searching at "
++ "%u (%d/%d)\n", cr, ac.ac_b_ex.fe_len,
++ ac.ac_g_ex.fe_len);*/
+ ext3_mb_try_best_found(&ac, &e3b);
+ if (ac.ac_status != AC_STATUS_FOUND) {
+ /*
+ * The only thing we can do is just take first
+ * found block(s)
+ */
-+ mb_debug(KERN_ERR "EXT3-fs: and someone won our chunk\n");
++ printk(KERN_ERR "EXT3-fs: and someone won our chunk\n");
+ ac.ac_b_ex.fe_group = 0;
+ ac.ac_b_ex.fe_start = 0;
+ ac.ac_b_ex.fe_len = 0;
+ ac.ac_status = AC_STATUS_CONTINUE;
+ ac.ac_flags |= EXT3_MB_HINT_FIRST;
-+ cr = 2;
++ cr = 3;
+ goto repeat;
+ }
+ }
+ printk("EXT3-fs: groups: ");
+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++)
+ printk("%d: %d ", i,
-+ sbi->s_buddy_blocks[i]->bb_free);
++ sbi->s_group_info[i]->bb_free);
+ printk("\n");
+#endif
+ goto out;
+ ext3_error(sb, "ext3_new_block",
+ "Allocating block in system zone - "
+ "block = %u", block);
-+ if (unlikely(ext3_mb_aggressive)) {
-+ for (i = 0; i < ac.ac_b_ex.fe_len; i++)
-+ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i,
-+ bitmap_bh->b_data));
-+ }
-+
++#ifdef AGGRESSIVE_CHECK
++ for (i = 0; i < ac.ac_b_ex.fe_len; i++)
++ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data));
++#endif
+ mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len);
+
+ spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group));
+ ext3_mb_release_blocks(sb, 1);
+ }
+
-+ if ((ext3_mb_stats) && (ac.ac_g_ex.fe_len > 1)) {
-+ spin_lock(&sbi->s_bal_lock);
-+ sbi->s_bal_reqs++;
-+ sbi->s_bal_allocated += *len;
++ if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) {
++ atomic_inc(&sbi->s_bal_reqs);
++ atomic_add(*len, &sbi->s_bal_allocated);
+ if (*len >= ac.ac_g_ex.fe_len)
-+ sbi->s_bal_success++;
-+ sbi->s_bal_ex_scanned += ac.ac_found;
++ atomic_inc(&sbi->s_bal_success);
++ atomic_add(ac.ac_found, &sbi->s_bal_ex_scanned);
+ if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start &&
+ ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group)
-+ sbi->s_bal_goals++;
++ atomic_inc(&sbi->s_bal_goals);
+ if (ac.ac_found > ext3_mb_max_to_scan)
-+ sbi->s_bal_breaks++;
-+ spin_unlock(&sbi->s_bal_lock);
++ atomic_inc(&sbi->s_bal_breaks);
+ }
+
++ ext3_mb_store_history(sb, &ac);
++
+ return block;
+}
++EXPORT_SYMBOL(ext3_mb_new_blocks);
+
-+int ext3_mb_get_descr_loc(struct ext3_buddy *e3b, struct buffer_head **bh,
-+ struct ext3_mb_group_descr **grp)
-+{
-+ struct super_block *sb = e3b->bd_sb;
-+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ int descr_per_block, err, offset;
-+ struct ext3_mb_grp_header *hdr;
-+ unsigned long block;
-+
-+ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header))
-+ / sizeof(struct ext3_mb_group_descr);
-+ block = e3b->bd_group / descr_per_block;
-+ *bh = ext3_bread(NULL, sbi->s_buddy, block, 0, &err);
-+ if (*bh == NULL) {
-+ printk(KERN_ERR "EXT3-fs: cant getblk descr for group %d: %d\n",
-+ e3b->bd_group, err);
-+ return err;
-+ }
-+
-+ hdr = (struct ext3_mb_grp_header *) (*bh)->b_data;
-+ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) {
-+ printk(KERN_ERR "EXT3-fs: invalid magic in group %d!\n",
-+ e3b->bd_group);
-+ brelse(*bh);
-+ *bh = NULL;
-+ return -EIO;
-+ }
++#ifdef EXT3_MB_HISTORY
++struct ext3_mb_proc_session {
++ struct ext3_mb_history *history;
++ struct super_block *sb;
++ int start;
++ int max;
++};
+
-+ offset = e3b->bd_group % descr_per_block
-+ * sizeof(struct ext3_mb_group_descr)
-+ + sizeof(struct ext3_mb_grp_header);
-+ *grp = (struct ext3_mb_group_descr *) ((*bh)->b_data + offset);
++static void *ext3_mb_history_skip_empty(struct ext3_mb_proc_session *s,
++ struct ext3_mb_history *hs,
++ int first)
++{
++ if (hs == s->history + s->max)
++ hs = s->history;
++ if (!first && hs == s->history + s->start)
++ return NULL;
++ while (hs->goal.fe_len == 0) {
++ hs++;
++ if (hs == s->history + s->max)
++ hs = s->history;
++ if (hs == s->history + s->start)
++ return NULL;
++ }
++ return hs;
++}
+
-+ return 0;
++static void *ext3_mb_seq_history_start(struct seq_file *seq, loff_t *pos)
++{
++ struct ext3_mb_proc_session *s = seq->private;
++ struct ext3_mb_history *hs;
++ int l = *pos;
++
++ if (l == 0)
++ return SEQ_START_TOKEN;
++ hs = ext3_mb_history_skip_empty(s, s->history + s->start, 1);
++ if (!hs)
++ return NULL;
++ while (--l && (hs = ext3_mb_history_skip_empty(s, ++hs, 0)) != NULL);
++ return hs;
+}
+
-+int ext3_mb_load_descr(struct ext3_buddy *e3b)
++static void *ext3_mb_seq_history_next(struct seq_file *seq, void *v, loff_t *pos)
+{
-+ struct ext3_mb_group_descr *grp;
-+ struct ext3_group_desc *gdp;
-+ struct buffer_head *bh;
-+ int err, i;
++ struct ext3_mb_proc_session *s = seq->private;
++ struct ext3_mb_history *hs = v;
++
++ ++*pos;
++ if (v == SEQ_START_TOKEN)
++ return ext3_mb_history_skip_empty(s, s->history + s->start, 1);
++ else
++ return ext3_mb_history_skip_empty(s, ++hs, 0);
++}
+
-+ err = ext3_mb_get_descr_loc(e3b, &bh, &grp);
-+ if (err)
-+ return err;
-+
-+ e3b->bd_bd->bb_first_free = grp->mgd_first_free;
-+ e3b->bd_bd->bb_free = grp->mgd_free;
-+ for (i = 0; i <= e3b->bd_blkbits + 1; i++) {
-+ J_ASSERT(i < 16);
-+ e3b->bd_bd->bb_counters[i] = grp->mgd_counters[i];
-+ }
-+ brelse(bh);
++static int ext3_mb_seq_history_show(struct seq_file *seq, void *v)
++{
++ struct ext3_mb_history *hs = v;
++ char buf[20], buf2[20];
+
-+ /* additional checks against old group descriptor */
-+ gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL);
-+ if (!gdp)
-+ return -EIO;
-+ if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) {
-+ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n",
-+ e3b->bd_group, e3b->bd_bd->bb_free,
-+ le16_to_cpu(gdp->bg_free_blocks_count));
-+ return -ENODATA;
++ if (v == SEQ_START_TOKEN) {
++ seq_printf(seq, "%-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n",
++ "goal", "result", "found", "grps", "cr", "merge",
++ "tail", "broken");
++ return 0;
+ }
+
++ sprintf(buf, "%u/%u/%u", hs->goal.fe_group,
++ hs->goal.fe_start, hs->goal.fe_len);
++ sprintf(buf2, "%u/%u/%u", hs->result.fe_group,
++ hs->result.fe_start, hs->result.fe_len);
++ seq_printf(seq, "%-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", buf,
++ buf2, hs->found, hs->groups, hs->cr,
++ hs->merged ? "M" : "", hs->tail,
++ hs->buddy ? 1 << hs->buddy : 0);
+ return 0;
+}
+
++static void ext3_mb_seq_history_stop(struct seq_file *seq, void *v)
++{
++}
++
++static struct seq_operations ext3_mb_seq_history_ops = {
++ .start = ext3_mb_seq_history_start,
++ .next = ext3_mb_seq_history_next,
++ .stop = ext3_mb_seq_history_stop,
++ .show = ext3_mb_seq_history_show,
++};
+
-+int ext3_mb_update_descr(struct ext3_buddy *e3b)
++static int ext3_mb_seq_history_open(struct inode *inode, struct file *file)
+{
-+ struct ext3_mb_group_descr *grp;
-+ struct ext3_group_desc *gdp;
-+ struct buffer_head *bh;
-+ handle_t *handle;
-+ int err, i;
++ struct super_block *sb = PDE(inode)->data;
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ struct ext3_mb_proc_session *s;
++ int rc, size;
+
-+ /* additional checks against old group descriptor */
-+ gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL);
-+ if (!gdp)
++ s = kmalloc(sizeof(*s), GFP_KERNEL);
++ if (s == NULL)
++ return -ENOMEM;
++ size = sizeof(struct ext3_mb_history) * sbi->s_mb_history_max;
++ s->history = kmalloc(size, GFP_KERNEL);
++ if (s->history == NULL) {
++ kfree(s);
+ return -ENOMEM;
-+ if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) {
-+ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n",
-+ e3b->bd_group, e3b->bd_bd->bb_free,
-+ le16_to_cpu(gdp->bg_free_blocks_count));
-+ return -ENODATA;
+ }
+
-+ err = ext3_mb_get_descr_loc(e3b, &bh, &grp);
-+ if (err)
-+ return err;
++ spin_lock(&sbi->s_mb_history_lock);
++ memcpy(s->history, sbi->s_mb_history, size);
++ s->max = sbi->s_mb_history_max;
++ s->start = sbi->s_mb_history_cur % s->max;
++ spin_unlock(&sbi->s_mb_history_lock);
+
-+ handle = ext3_journal_start(EXT3_SB(e3b->bd_sb)->s_buddy, 1);
-+ if (IS_ERR(handle)) {
-+ err = PTR_ERR(handle);
-+ handle = NULL;
-+ goto out;
++ rc = seq_open(file, &ext3_mb_seq_history_ops);
++ if (rc == 0) {
++ struct seq_file *m = (struct seq_file *)file->private_data;
++ m->private = s;
++ } else {
++ kfree(s->history);
++ kfree(s);
+ }
++ return rc;
+
-+ err = ext3_journal_get_write_access(handle, bh);
-+ if (err)
-+ goto out;
-+ grp->mgd_first_free = e3b->bd_bd->bb_first_free;
-+ grp->mgd_free = e3b->bd_bd->bb_free;
-+ for (i = 0; i <= e3b->bd_blkbits + 1; i++) {
-+ J_ASSERT(i < 16);
-+ grp->mgd_counters[i] = e3b->bd_bd->bb_counters[i];
-+ }
-+ err = ext3_journal_dirty_metadata(handle, bh);
-+ if (err)
-+ goto out;
-+ err = 0;
-+out:
-+ brelse(bh);
-+ if (handle)
-+ ext3_journal_stop(handle);
-+ return err;
+}
+
-+int ext3_mb_generate_buddy(struct ext3_buddy *e3b)
++static int ext3_mb_seq_history_release(struct inode *inode, struct file *file)
+{
-+ struct super_block *sb = e3b->bd_sb;
-+ struct buffer_head *bh;
-+ int i, count = 0;
++ struct seq_file *seq = (struct seq_file *)file->private_data;
++ struct ext3_mb_proc_session *s = seq->private;
++ kfree(s->history);
++ kfree(s);
++ return seq_release(inode, file);
++}
++
++static struct file_operations ext3_mb_seq_history_fops = {
++ .owner = THIS_MODULE,
++ .open = ext3_mb_seq_history_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = ext3_mb_seq_history_release,
++};
+
-+ mb_debug("generate buddy for group %d\n", e3b->bd_group);
-+ memset(e3b->bd_bh->b_data, 0xff, sb->s_blocksize);
-+ memset(e3b->bd_bh2->b_data, 0xff, sb->s_blocksize);
++static void ext3_mb_history_release(struct super_block *sb)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ char name[64];
+
-+ bh = read_block_bitmap(sb, e3b->bd_group);
-+ if (bh == NULL)
-+ return -EIO;
++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name));
++ remove_proc_entry("mb_history", sbi->s_mb_proc);
++ remove_proc_entry(name, proc_root_ext3);
++
++ if (sbi->s_mb_history)
++ kfree(sbi->s_mb_history);
++}
+
-+ /* mb_free_blocks will set real free */
-+ e3b->bd_bd->bb_free = 0;
-+ e3b->bd_bd->bb_first_free = 1 << 15;
-+ /*
-+ * if change bb_counters size, don't forget about
-+ * ext3_mb_init_backend() -bzzz
-+ */
-+ memset(e3b->bd_bd->bb_counters, 0,
-+ sizeof(unsigned) * (sb->s_blocksize_bits + 2));
++static void ext3_mb_history_init(struct super_block *sb)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ char name[64];
++ int i;
+
-+ /* loop over the blocks, and create buddies for free ones */
-+ for (i = 0; i < sb->s_blocksize * 8; i++) {
-+ if (!mb_test_bit(i, (void *) bh->b_data)) {
-+ mb_free_blocks(e3b, i, 1);
-+ count++;
++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name));
++ sbi->s_mb_proc = proc_mkdir(name, proc_root_ext3);
++ if (sbi->s_mb_proc != NULL) {
++ struct proc_dir_entry *p;
++ p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc);
++ if (p) {
++ p->proc_fops = &ext3_mb_seq_history_fops;
++ p->data = sb;
+ }
+ }
-+ brelse(bh);
-+ mb_check_buddy(e3b);
-+ ext3_mb_dirty_buddy(e3b);
+
-+ return 0;
++ sbi->s_mb_history_max = 1000;
++ sbi->s_mb_history_cur = 0;
++ spin_lock_init(&sbi->s_mb_history_lock);
++ i = sbi->s_mb_history_max * sizeof(struct ext3_mb_history);
++ sbi->s_mb_history = kmalloc(i, GFP_KERNEL);
++ if (sbi->s_mb_history != NULL)
++ memset(sbi->s_mb_history, 0, i);
++ /* if we can't allocate history, then we simply won't use it */
+}
+
-+EXPORT_SYMBOL(ext3_mb_new_blocks);
++static void
++ext3_mb_store_history(struct super_block *sb, struct ext3_allocation_context *ac)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ struct ext3_mb_history h;
++
++ if (unlikely(sbi->s_mb_history == NULL))
++ return;
++
++ h.goal = ac->ac_g_ex;
++ h.result = ac->ac_b_ex;
++ h.found = ac->ac_found;
++ h.cr = ac->ac_criteria;
++ h.groups = ac->ac_groups_scanned;
++ h.tail = ac->ac_tail;
++ h.buddy = ac->ac_buddy;
++ h.merged = 0;
++ if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
++ ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
++ h.merged = 1;
++
++ spin_lock(&sbi->s_mb_history_lock);
++ memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h));
++ if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max)
++ sbi->s_mb_history_cur = 0;
++ spin_unlock(&sbi->s_mb_history_lock);
++}
+
-+#define MB_CREDITS \
-+ (EXT3_DATA_TRANS_BLOCKS + 3 + EXT3_INDEX_EXTRA_TRANS_BLOCKS + \
-+ 2 * EXT3_SINGLEDATA_TRANS_BLOCKS)
++#else
++#define ext3_mb_history_release(sb)
++#define ext3_mb_history_init(sb)
++#endif
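
The history table is a plain fixed-size ring: the writer copies a record into the current slot and advances the cursor modulo the table size under s_mb_history_lock, while the seq_file reader snapshots the whole array and then walks it from the cursor (the oldest slot), skipping empty records. A stripped-down sketch of the writer side with the locking elided:

	#include <string.h>

	#define HIST_MAX 1000		/* mirrors s_mb_history_max */

	struct hist { unsigned goal_len, result_len; };	/* toy record */

	static struct hist history[HIST_MAX];
	static int cur;			/* mirrors s_mb_history_cur */

	static void store_history(const struct hist *h)
	{
		/* the patch holds sbi->s_mb_history_lock across these steps */
		memcpy(&history[cur], h, sizeof(*h));
		if (++cur >= HIST_MAX)
			cur = 0;	/* wrap: oldest record is now at cur */
	}

	int main(void)
	{
		struct hist h = { 16, 8 };

		store_history(&h);
		store_history(&h);
		return 0;
	}

Snapshotting under the lock keeps the reader free of locking while it formats output, at the cost of one transient allocation per open of the proc file.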
+
-+int ext3_mb_init_backend(struct super_block *sb, int *created)
++int ext3_mb_init_backend(struct super_block *sb)
+{
-+ int err, i, len, descr_per_block, buddy_offset, size;
-+ struct inode *root = sb->s_root->d_inode;
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ struct ext3_mb_grp_header *hdr;
-+ struct buffer_head *bh = NULL;
-+ unsigned long block;
-+ struct dentry *db;
-+ handle_t *handle;
-+ tid_t target;
-+
-+ *created = 0;
++ int i, len;
++
+ len = sizeof(struct ext3_group_info *) * sbi->s_groups_count;
-+ sbi->s_buddy_blocks = kmalloc(len, GFP_KERNEL);
-+ if (sbi->s_buddy_blocks == NULL) {
-+ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n");
++ sbi->s_group_info = kmalloc(len, GFP_KERNEL);
++ if (sbi->s_group_info == NULL) {
++ printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n");
+ return -ENOMEM;
+ }
-+ memset(sbi->s_buddy_blocks, 0, len);
-+ sbi->s_buddy = NULL;
-+
-+ down(&root->i_sem);
-+ len = strlen(EXT3_BUDDY_FILE);
-+ db = lookup_one_len(EXT3_BUDDY_FILE, sb->s_root, len);
-+ if (IS_ERR(db)) {
-+ err = PTR_ERR(db);
-+ printk(KERN_ERR "EXT3-fs: cant lookup buddy: %d\n", err);
-+ up(&root->i_sem);
-+ goto out;
-+ }
++ memset(sbi->s_group_info, 0, len);
+
-+ if (db->d_inode == NULL) {
-+ err = ext3_create(root, db, S_IFREG, NULL);
-+ if (err) {
-+ printk(KERN_ERR "EXT3-fs: cant create buddy: %d\n", err);
-+ up(&root->i_sem);
-+ goto out;
-+ }
-+ db->d_inode->i_flags |= S_IMMUTABLE | S_NOATIME;
-+ *created = 1;
-+ mb_debug("no buddy file, regenerate\n");
-+ }
-+ up(&root->i_sem);
-+ sbi->s_buddy = igrab(db->d_inode);
-+
-+ /* calculate needed size */
-+ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header))
-+ / sizeof(struct ext3_mb_group_descr);
-+ buddy_offset = (sbi->s_groups_count + descr_per_block - 1)
-+ / descr_per_block;
-+ len = sbi->s_groups_count * sb->s_blocksize * 2 +
-+ buddy_offset * sb->s_blocksize;
-+ if (len != i_size_read(sbi->s_buddy)) {
-+ if (*created == 0)
-+ printk("EXT3-fs: wrong i_size (%u != %u), regenerate\n",
-+ (unsigned) len,
-+ (unsigned) i_size_read(sbi->s_buddy));
-+ *created = 1;
-+ }
-+
-+ /* read/create mb group descriptors */
-+ for (i = 0; i < buddy_offset; i++) {
-+ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS);
-+ if (IS_ERR(handle)) {
-+ printk(KERN_ERR "EXT3-fs: cant start transaction\n");
-+ err = PTR_ERR(handle);
-+ goto err_out;
-+ }
-+
-+ bh = ext3_bread(handle, sbi->s_buddy, i, 1, &err);
-+ if (bh == NULL) {
-+ printk(KERN_ERR "EXT3-fs: cant getblk grp: %d\n", err);
-+ goto err_out;
-+ }
-+ hdr = (struct ext3_mb_grp_header *) bh->b_data;
-+ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) {
-+ err = ext3_journal_get_write_access(handle, bh);
-+ if (err)
-+ goto err_out;
-+ if (*created == 0)
-+ printk(KERN_ERR
-+ "EXT3-fs: invalid header 0x%x in %d,"
-+ "regenerate\n", hdr->mh_magic, i);
-+ *created = 1;
-+ hdr->mh_magic = EXT3_MB_MAGIC_V1;
-+ err = ext3_journal_dirty_metadata(handle, bh);
-+ if (err)
-+ goto err_out;
-+ }
-+ brelse(bh);
-+ ext3_journal_stop(handle);
++ sbi->s_buddy_cache = new_inode(sb);
++ if (sbi->s_buddy_cache == NULL) {
++ printk(KERN_ERR "EXT3-fs: can't get new inode\n");
++ kfree(sbi->s_group_info);
++ return -ENOMEM;
+ }
+
+ /*
-+ * if change bb_counters size, don't forget about ext3_mb_generate_buddy()
++ * calculate needed size. if change bb_counters size,
++ * don't forget about ext3_mb_generate_buddy()
+ */
-+ len = sizeof(struct ext3_buddy_group_blocks);
-+ len += sizeof(unsigned) * (sb->s_blocksize_bits + 2);
++ len = sizeof(struct ext3_group_info);
++ len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
+ for (i = 0; i < sbi->s_groups_count; i++) {
++ struct ext3_group_desc * desc;
+
-+ sbi->s_buddy_blocks[i] = kmalloc(len, GFP_KERNEL);
-+ if (sbi->s_buddy_blocks[i] == NULL) {
++ sbi->s_group_info[i] = kmalloc(len, GFP_KERNEL);
++ if (sbi->s_group_info[i] == NULL) {
+ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n");
-+ err = -ENOMEM;
-+ goto out2;
-+ }
-+ memset(sbi->s_buddy_blocks[i], 0, len);
-+
-+ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS);
-+ if (IS_ERR(handle)) {
-+ printk(KERN_ERR "EXT3-fs: cant start transaction\n");
-+ err = PTR_ERR(handle);
-+ goto out2;
-+ }
-+
-+ /* allocate block for bitmap */
-+ block = buddy_offset + i * 2;
-+ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err);
-+ if (bh == NULL) {
-+ printk(KERN_ERR "EXT3-fs: cant getblk bitmap: %d\n", err);
-+ goto out2;
-+ }
-+ sbi->s_buddy_blocks[i]->bb_bitmap = bh->b_blocknr;
-+ brelse(bh);
-+
-+ /* allocate block for buddy */
-+ block = buddy_offset + i * 2 + 1;
-+ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err);
-+ if (bh == NULL) {
-+ printk(KERN_ERR "EXT3-fs: cant getblk for buddy: %d\n", err);
-+ goto out2;
++ goto err_out;
+ }
-+ sbi->s_buddy_blocks[i]->bb_buddy = bh->b_blocknr;
-+ brelse(bh);
-+
-+ size = (block + 1) << sbi->s_buddy->i_blkbits;
-+ if (size > sbi->s_buddy->i_size) {
-+ *created = 1;
-+ EXT3_I(sbi->s_buddy)->i_disksize = size;
-+ i_size_write(sbi->s_buddy, size);
-+ mark_inode_dirty(sbi->s_buddy);
++ desc = ext3_get_group_desc(sb, i, NULL);
++ if (desc == NULL) {
++ printk(KERN_ERR "EXT3-fs: cant read descriptor %u\n", i);
++ goto err_out;
+ }
-+ ext3_journal_stop(handle);
-+
-+ spin_lock_init(&sbi->s_buddy_blocks[i]->bb_lock);
-+ sbi->s_buddy_blocks[i]->bb_md_cur = NULL;
-+ sbi->s_buddy_blocks[i]->bb_tid = 0;
++ memset(sbi->s_group_info[i], 0, len);
++ set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT,
++ &sbi->s_group_info[i]->bb_state);
++ sbi->s_group_info[i]->bb_free =
++ le16_to_cpu(desc->bg_free_blocks_count);
+ }
+
-+ if (journal_start_commit(sbi->s_journal, &target))
-+ log_wait_commit(sbi->s_journal, target);
-+
-+out2:
-+ dput(db);
-+out:
-+ return err;
++ return 0;
+
+err_out:
-+ return err;
++ while (--i >= 0)
++ kfree(sbi->s_group_info[i]);
++ iput(sbi->s_buddy_cache);
++
++ return -ENOMEM;
+}
+
-+int ext3_mb_write_descriptors(struct super_block *sb)
++int ext3_mb_init(struct super_block *sb, int needs_recovery)
+{
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ struct ext3_buddy e3b;
-+ int ret = 0, i, err;
++ struct inode *root = sb->s_root->d_inode;
++ unsigned i, offset, max;
++ struct dentry *dentry;
+
-+ for (i = 0; i < sbi->s_groups_count; i++) {
-+ if (sbi->s_buddy_blocks[i] == NULL)
-+ continue;
++ if (!test_opt(sb, MBALLOC))
++ return 0;
+
-+ err = ext3_mb_load_buddy(sb, i, &e3b);
-+ if (err == 0) {
-+ ext3_mb_update_descr(&e3b);
-+ ext3_mb_release_desc(&e3b);
-+ } else
-+ ret = err;
++ i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);
++
++ sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
++ if (sbi->s_mb_offsets == NULL) {
++ clear_opt(sbi->s_mount_opt, MBALLOC);
++ return -ENOMEM;
+ }
-+ return ret;
++ sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
++ if (sbi->s_mb_maxs == NULL) {
++ clear_opt(sbi->s_mount_opt, MBALLOC);
++ kfree(sbi->s_mb_offsets);
++ return -ENOMEM;
++ }
++
++ /* order 0 is regular bitmap */
++ sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
++ sbi->s_mb_offsets[0] = 0;
++
++ i = 1;
++ offset = 0;
++ max = sb->s_blocksize << 2;
++ do {
++ sbi->s_mb_offsets[i] = offset;
++ sbi->s_mb_maxs[i] = max;
++ offset += 1 << (sb->s_blocksize_bits - i);
++ max = max >> 1;
++ i++;
++ } while (i <= sb->s_blocksize_bits + 1);
++
++ /* init file for buddy data */
++ if ((i = ext3_mb_init_backend(sb))) {
++ clear_opt(sbi->s_mount_opt, MBALLOC);
++ kfree(sbi->s_mb_offsets);
++ kfree(sbi->s_mb_maxs);
++ return i;
++ }
++
++ spin_lock_init(&sbi->s_reserve_lock);
++ spin_lock_init(&sbi->s_md_lock);
++ INIT_LIST_HEAD(&sbi->s_active_transaction);
++ INIT_LIST_HEAD(&sbi->s_closed_transaction);
++ INIT_LIST_HEAD(&sbi->s_committed_transaction);
++ spin_lock_init(&sbi->s_bal_lock);
++
++ /* remove old on-disk buddy file */
++ down(&root->i_sem);
++ dentry = lookup_one_len(".buddy", sb->s_root, strlen(".buddy"));
++ if (!IS_ERR(dentry)) {
++ if (dentry->d_inode != NULL) {
++ i = vfs_unlink(root, dentry);
++ if (i != 0)
++ printk("EXT3-fs: can't remove .buddy file: %d\n", i);
++ }
++ dput(dentry);
++ }
++ up(&root->i_sem);
++
++ ext3_mb_history_init(sb);
++
++ printk("EXT3-fs: mballoc enabled\n");
++ return 0;
+}
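
The do/while loop in ext3_mb_init() packs all higher-order buddy bitmaps into a single block: order 0 is the ordinary block bitmap, and each order i >= 1 starts at byte offset s_mb_offsets[i] inside the buddy block and covers s_mb_maxs[i] bits, half as many as the order below. A sketch printing the table it builds for 4K blocks (s_blocksize_bits = 12):

	#include <stdio.h>

	int main(void)
	{
		int bits = 12;			/* 4096-byte blocks */
		unsigned blocksize = 1 << bits;
		unsigned offset = 0, max = blocksize << 2;
		int i;

		printf("order 0: offset 0, %u bits\n", blocksize << 3);
		for (i = 1; i <= bits + 1; i++) {
			printf("order %d: offset %u, %u bits\n", i, offset, max);
			offset += 1 << (bits - i);	/* bytes used by order i */
			max >>= 1;
		}
		return 0;
	}

The byte offsets sum to just under one block (2048 + 1024 + 512 + ...), so a single buddy block per group is always enough.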
+
+int ext3_mb_release(struct super_block *sb)
+ spin_unlock(&sbi->s_md_lock);
+ ext3_mb_free_committed_blocks(sb);
+
-+ if (sbi->s_buddy_blocks) {
-+ ext3_mb_write_descriptors(sb);
++ if (sbi->s_group_info) {
+ for (i = 0; i < sbi->s_groups_count; i++) {
-+ if (sbi->s_buddy_blocks[i] == NULL)
++ if (sbi->s_group_info[i] == NULL)
+ continue;
-+ kfree(sbi->s_buddy_blocks[i]);
++ kfree(sbi->s_group_info[i]);
+ }
-+ kfree(sbi->s_buddy_blocks);
-+ }
-+ if (sbi->s_buddy)
-+ iput(sbi->s_buddy);
++ kfree(sbi->s_group_info);
++ }
++ if (sbi->s_mb_offsets)
++ kfree(sbi->s_mb_offsets);
++ if (sbi->s_mb_maxs)
++ kfree(sbi->s_mb_maxs);
++ if (sbi->s_buddy_cache)
++ iput(sbi->s_buddy_cache);
+ if (sbi->s_blocks_reserved)
+ printk("ext3-fs: %ld blocks being reserved at umount!\n",
+ sbi->s_blocks_reserved);
+ if (ext3_mb_stats) {
-+ printk("EXT3-fs: mballoc: %lu blocks %lu reqs "
-+ "(%lu success)\n", sbi->s_bal_allocated,
-+ sbi->s_bal_reqs, sbi->s_bal_success);
-+ printk("EXT3-fs: mballoc: %lu extents scanned, "
-+ "%lu goal hits, %lu breaks\n", sbi->s_bal_ex_scanned,
-+ sbi->s_bal_goals, sbi->s_bal_breaks);
-+ }
-+
-+ return 0;
-+}
-+
-+int ext3_mb_init(struct super_block *sb, int needs_recovery)
-+{
-+ struct ext3_buddy e3b;
-+ int i, err, created;
-+
-+ if (!test_opt(sb, MBALLOC))
-+ return 0;
-+
-+ /* init file for buddy data */
-+ clear_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC);
-+ if ((err = ext3_mb_init_backend(sb, &created)))
-+ return err;
-+
-+repeat:
-+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
-+ err = ext3_mb_load_buddy(sb, i, &e3b);
-+ if (err) {
-+ /* FIXME: release backend */
-+ return err;
-+ }
-+ if (created || needs_recovery)
-+ ext3_mb_generate_buddy(&e3b);
-+ else
-+ err = ext3_mb_load_descr(&e3b);
-+ ext3_mb_release_desc(&e3b);
-+ if (err == -ENODATA) {
-+ created = 1;
-+ goto repeat;
-+ }
-+ }
-+ if (created || needs_recovery)
-+ printk(KERN_ERR "EXT3-fs: generated buddies for %lu groups\n",
-+ EXT3_SB(sb)->s_groups_count);
-+ spin_lock_init(&EXT3_SB(sb)->s_reserve_lock);
-+ spin_lock_init(&EXT3_SB(sb)->s_md_lock);
-+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_active_transaction);
-+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_closed_transaction);
-+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_committed_transaction);
-+ set_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC);
-+
-+ spin_lock_init(&EXT3_SB(sb)->s_bal_lock);
-+ if (ext3_mb_stats) {
-+ printk("EXT3-fs: mballoc enabled (stats)\n");
-+ } else {
-+ printk("EXT3-fs: mballoc enabled\n");
-+ }
++ printk("EXT3-fs: mballoc: %u blocks %u reqs (%u success)\n",
++ atomic_read(&sbi->s_bal_allocated),
++ atomic_read(&sbi->s_bal_reqs),
++ atomic_read(&sbi->s_bal_success));
++ printk("EXT3-fs: mballoc: %u extents scanned, %u goal hits, "
++ "%u 2^N hits, %u breaks\n",
++ atomic_read(&sbi->s_bal_ex_scanned),
++ atomic_read(&sbi->s_bal_goals),
++ atomic_read(&sbi->s_bal_2orders),
++ atomic_read(&sbi->s_bal_breaks));
++ printk("EXT3-fs: mballoc: %lu generated and it took %Lu\n",
++ sbi->s_mb_buddies_generated++,
++ sbi->s_mb_generation_time);
++ }
++
++ ext3_mb_history_release(sb);
+
+ return 0;
+}
+ mb_debug("\n");
+ ext3_unlock_group(sb, md->group);
+
++ /* balance refcounts from ext3_mb_free_metadata() */
++ page_cache_release(e3b.bd_buddy_page);
++ page_cache_release(e3b.bd_bitmap_page);
++
+ kfree(md);
-+ ext3_mb_dirty_buddy(&e3b);
+ ext3_mb_release_desc(&e3b);
+
+ } while (md);
+ /* new transaction! time to close last one and free blocks for
+ * committed transaction. we know that only one transaction can be
+ * active, so the previous transaction may still be being logged and we
-+ * know that transaction before previous is known to be alreade
++ * know that transaction before previous is known to be already
+ * logged. this means that now we may free blocks freed in all
+ * transactions before previous one. hope I'm clear enough ... */
+
+int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b,
+ int group, int block, int count)
+{
-+ struct ext3_buddy_group_blocks *db = e3b->bd_bd;
++ struct ext3_group_info *db = e3b->bd_info;
+ struct super_block *sb = e3b->bd_sb;
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
+ struct ext3_free_metadata *md;
+ int i;
+
++ J_ASSERT(e3b->bd_bitmap_page != NULL);
++ J_ASSERT(e3b->bd_buddy_page != NULL);
++
+ ext3_lock_group(sb, group);
+ for (i = 0; i < count; i++) {
+ md = db->bb_md_cur;
+ spin_lock(&sbi->s_md_lock);
+ list_add(&md->list, &sbi->s_active_transaction);
+ spin_unlock(&sbi->s_md_lock);
++ /* protect buddy cache from being freed,
++ * otherwise we'll refresh it from
++ * on-disk bitmap and lose not-yet-available
++ * blocks */
++ page_cache_get(e3b->bd_buddy_page);
++ page_cache_get(e3b->bd_bitmap_page);
+ db->bb_md_cur = md;
+ db->bb_tid = handle->h_transaction->t_tid;
+ mb_debug("new md 0x%p for group %u\n",
+ if (err)
+ goto error_return;
+
-+ if (unlikely(ext3_mb_aggressive)) {
++#ifdef AGGRESSIVE_CHECK
++ {
+ int i;
+ for (i = 0; i < count; i++)
+ J_ASSERT(mb_test_bit(bit + i, bitmap_bh->b_data));
+ }
-+
++#endif
+ mb_clear_bits(bitmap_bh->b_data, bit, count);
+
+ /* We dirtied the bitmap block */
+ spin_unlock(sb_bgl_lock(sbi, block_group));
+ percpu_counter_mod(&sbi->s_freeblocks_counter, count);
+
-+ ext3_mb_dirty_buddy(&e3b);
+ ext3_mb_release_desc(&e3b);
+
+ *freed = count;
+}
+
+
-+extern void ext3_free_blocks_old(handle_t *handle, struct inode *inode,
-+ unsigned long block, unsigned long count);
-+void ext3_free_blocks(handle_t *handle, struct inode *inode,
-+ unsigned long block, unsigned long count, int metadata)
++void ext3_free_blocks(handle_t *handle, struct inode * inode,
++ unsigned long block, unsigned long count, int metadata)
+{
++ struct super_block *sb;
+ int freed;
+
-+ if (!test_opt(inode->i_sb, MBALLOC) ||
-+ EXT3_SB(inode->i_sb)->s_buddy_blocks == NULL)
++ sb = inode->i_sb;
++ if (!test_opt(sb, MBALLOC))
+ ext3_free_blocks_old(handle, inode, block, count);
+ else {
-+ ext3_mb_free_blocks(handle, inode, block,count,metadata,&freed);
++ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed);
+ if (freed)
+ DQUOT_FREE_BLOCK(inode, freed);
+ }
+ return;
+}
-Index: linux-2.6.5-7.201/fs/ext3/proc.c
-===================================================================
---- linux-2.6.5-7.201.orig/fs/ext3/proc.c 2005-10-13 19:40:57.851699336 +0400
-+++ linux-2.6.5-7.201/fs/ext3/proc.c 2005-10-14 09:02:36.000000000 +0400
-@@ -0,0 +1,195 @@
-+#include <linux/config.h>
-+#include <linux/fs.h>
-+#include <linux/init.h>
-+#include <linux/module.h>
-+#include <linux/kernel.h>
-+#include <linux/jbd.h>
-+#include <linux/ext3_fs.h>
-+#include <linux/ext3_jbd.h>
-+#include <linux/proc_fs.h>
-+#include <linux/errno.h>
-+#include <asm/uaccess.h>
-+
+
+#define EXT3_ROOT "ext3"
-+#define EXT3_MB_AGGRESSIVE_NAME "mb_aggressive"
+#define EXT3_MB_STATS_NAME "mb_stats"
+#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan"
++#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan"
+
-+
-+static struct proc_dir_entry *proc_root_ext3;
-+
-+
-+static int ext3_mb_aggressive_read(char *page, char **start, off_t off,
-+ int count, int *eof, void *data)
++static int ext3_mb_stats_read(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
+{
+ int len;
+
+ if (off != 0)
+ return 0;
+
-+ len = sprintf(page, "%ld\n", ext3_mb_aggressive);
++ len = sprintf(page, "%ld\n", ext3_mb_stats);
+ *start = page;
+ return len;
+}
+
-+static int ext3_mb_aggressive_write(struct file *file, const char *buffer,
-+ unsigned long count, void *data)
++static int ext3_mb_stats_write(struct file *file, const char *buffer,
++ unsigned long count, void *data)
+{
+ char str[32];
+
+ if (count >= sizeof(str)) {
+ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n",
-+ EXT3_MB_AGGRESSIVE_NAME, sizeof(str));
++ EXT3_MB_STATS_NAME, (int)sizeof(str));
+ return -EOVERFLOW;
+ }
+
+ return -EFAULT;
+
+ /* Coerce the value to a boolean: zero -> 0, non-zero -> 1 */
-+ ext3_mb_aggressive = (simple_strtol(str, NULL, 0) != 0);
++ ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0);
+ return count;
+}
+
-+static int ext3_mb_stats_read(char *page, char **start, off_t off,
-+ int count, int *eof, void *data)
++static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
+{
+ int len;
+
+ if (off != 0)
+ return 0;
+
-+ len = sprintf(page, "%ld\n", ext3_mb_stats);
++ len = sprintf(page, "%ld\n", ext3_mb_max_to_scan);
+ *start = page;
+ return len;
+}
+
-+static int ext3_mb_stats_write(struct file *file, const char *buffer,
-+ unsigned long count, void *data)
++static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer,
++ unsigned long count, void *data)
+{
+ char str[32];
++ long value;
+
+ if (count >= sizeof(str)) {
+ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n",
-+ EXT3_MB_STATS_NAME, sizeof(str));
++ EXT3_MB_MAX_TO_SCAN_NAME, (int)sizeof(str));
+ return -EOVERFLOW;
+ }
+
+ return -EFAULT;
+
+ /* Parse the value; it must be a positive integer */
-+ ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0);
++ value = simple_strtol(str, NULL, 0);
++ if (value <= 0)
++ return -ERANGE;
++
++ ext3_mb_max_to_scan = value;
++
+ return count;
+}
+
-+static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off,
-+ int count, int *eof, void *data)
++static int ext3_mb_min_to_scan_read(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
+{
+ int len;
+
+ if (off != 0)
+ return 0;
+
-+ len = sprintf(page, "%ld\n", ext3_mb_max_to_scan);
++ len = sprintf(page, "%ld\n", ext3_mb_min_to_scan);
+ *start = page;
+ return len;
+}
+
-+static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer,
-+ unsigned long count, void *data)
++static int ext3_mb_min_to_scan_write(struct file *file, const char *buffer,
++ unsigned long count, void *data)
+{
+ char str[32];
+ long value;
+
+ if (count >= sizeof(str)) {
+ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n",
-+ EXT3_MB_MAX_TO_SCAN_NAME, sizeof(str));
++ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str));
+ return -EOVERFLOW;
+ }
+
+
+ /* Parse the value; it must be a positive integer */
+ value = simple_strtol(str, NULL, 0);
+ if (value <= 0)
+ return -ERANGE;
+ return -ERANGE;
+
-+ ext3_mb_max_to_scan = value;
++ ext3_mb_min_to_scan = value;
+
+ return count;
+}
+
+int __init init_ext3_proc(void)
+{
-+ struct proc_dir_entry *proc_ext3_mb_aggressive;
+ struct proc_dir_entry *proc_ext3_mb_stats;
+ struct proc_dir_entry *proc_ext3_mb_max_to_scan;
++ struct proc_dir_entry *proc_ext3_mb_min_to_scan;
+
+ proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs);
+ if (proc_root_ext3 == NULL) {
-+ printk(KERN_ERR "EXT3: Unable to create %s\n", EXT3_ROOT);
-+ return -EIO;
-+ }
-+
-+ /* Initialize EXT3_MB_AGGRESSIVE_NAME */
-+ proc_ext3_mb_aggressive = create_proc_entry(EXT3_MB_AGGRESSIVE_NAME,
-+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
-+ if (proc_ext3_mb_aggressive == NULL) {
-+ printk(KERN_ERR "EXT3: Unable to create %s\n",
-+ EXT3_MB_AGGRESSIVE_NAME);
-+ remove_proc_entry(EXT3_ROOT, proc_root_fs);
++ printk(KERN_ERR "EXT3: Unable to create %s\n", EXT3_ROOT);
+ return -EIO;
+ }
+
-+ proc_ext3_mb_aggressive->data = NULL;
-+ proc_ext3_mb_aggressive->read_proc = ext3_mb_aggressive_read;
-+ proc_ext3_mb_aggressive->write_proc = ext3_mb_aggressive_write;
-+
+ /* Initialize EXT3_MB_STATS_NAME */
+ proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME,
-+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
+ if (proc_ext3_mb_stats == NULL) {
-+ printk(KERN_ERR "EXT3: Unable to create %s\n",
-+ EXT3_MB_STATS_NAME);
-+ remove_proc_entry(EXT3_MB_AGGRESSIVE_NAME, proc_root_ext3);
++ printk(KERN_ERR "EXT3: Unable to create %s\n",
++ EXT3_MB_STATS_NAME);
+ remove_proc_entry(EXT3_ROOT, proc_root_fs);
+ return -EIO;
+ }
+
+ /* Initialize EXT3_MAX_TO_SCAN_NAME */
+ proc_ext3_mb_max_to_scan = create_proc_entry(
-+ EXT3_MB_MAX_TO_SCAN_NAME,
-+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++ EXT3_MB_MAX_TO_SCAN_NAME,
++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
+ if (proc_ext3_mb_max_to_scan == NULL) {
-+ printk(KERN_ERR "EXT3: Unable to create %s\n",
-+ EXT3_MB_STATS_NAME);
++ printk(KERN_ERR "EXT3: Unable to create %s\n",
++ EXT3_MB_MAX_TO_SCAN_NAME);
+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
-+ remove_proc_entry(EXT3_MB_AGGRESSIVE_NAME, proc_root_ext3);
+ remove_proc_entry(EXT3_ROOT, proc_root_fs);
+ return -EIO;
+ }
+ proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read;
+ proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write;
+
++ /* Initialize EXT3_MIN_TO_SCAN_NAME */
++ proc_ext3_mb_min_to_scan = create_proc_entry(
++ EXT3_MB_MIN_TO_SCAN_NAME,
++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++ if (proc_ext3_mb_min_to_scan == NULL) {
++ printk(KERN_ERR "EXT3: Unable to create %s\n",
++ EXT3_MB_MIN_TO_SCAN_NAME);
++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_ROOT, proc_root_fs);
++ return -EIO;
++ }
++
++ proc_ext3_mb_min_to_scan->data = NULL;
++ proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read;
++ proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write;
++
+ return 0;
+}
+
+void exit_ext3_proc(void)
+{
-+ remove_proc_entry(EXT3_MB_AGGRESSIVE_NAME, proc_root_ext3);
+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
+ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3);
+ remove_proc_entry(EXT3_ROOT, proc_root_fs);
+}
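
With the entries registered above the allocator can be inspected and tuned at runtime under /proc/fs/ext3. A small user-space sketch of reading and rewriting mb_max_to_scan (error handling kept minimal; the same pattern works for mb_stats and mb_min_to_scan):

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/proc/fs/ext3/mb_max_to_scan", "r+");
		long v;

		if (f == NULL || fscanf(f, "%ld", &v) != 1)
			return 1;
		printf("mb_max_to_scan = %ld\n", v);
		rewind(f);
		fprintf(f, "%ld\n", v * 2);	/* double the scan limit */
		return fclose(f) == 0 ? 0 : 1;
	}

mb_max_to_scan bounds how many extents a scan measures before the search breaks off, mb_min_to_scan forces at least that many to be examined before a merely adequate extent is accepted, and mb_stats toggles the counters printed at unmount.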
-Index: linux-2.6.5-7.201/fs/ext3/inode.c
-===================================================================
---- linux-2.6.5-7.201.orig/fs/ext3/inode.c 2005-10-14 08:59:38.000000000 +0400
-+++ linux-2.6.5-7.201/fs/ext3/inode.c 2005-10-14 08:59:39.000000000 +0400
-@@ -572,7 +572,7 @@
- ext3_journal_forget(handle, branch[i].bh);
- }
- for (i = 0; i < keys; i++)
-- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
-+ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1);
- return err;
- }
-
-@@ -673,7 +673,7 @@
- if (err == -EAGAIN)
- for (i = 0; i < num; i++)
- ext3_free_blocks(handle, inode,
-- le32_to_cpu(where[i].key), 1);
-+ le32_to_cpu(where[i].key), 1, 1);
- return err;
- }
-
-@@ -1835,7 +1835,7 @@
- }
- }
-
-- ext3_free_blocks(handle, inode, block_to_free, count);
-+ ext3_free_blocks(handle, inode, block_to_free, count, 1);
- }
-
- /**
-@@ -2006,7 +2006,7 @@
- ext3_journal_test_restart(handle, inode);
- }
-
-- ext3_free_blocks(handle, inode, nr, 1);
-+ ext3_free_blocks(handle, inode, nr, 1, 1);
-
- if (parent_bh) {
- /*
-Index: linux-2.6.5-7.201/fs/ext3/super.c
++
+Index: linux-2.6.5-7.201/fs/ext3/Makefile
===================================================================
---- linux-2.6.5-7.201.orig/fs/ext3/super.c 2005-10-14 08:59:38.000000000 +0400
-+++ linux-2.6.5-7.201/fs/ext3/super.c 2005-10-14 09:02:36.000000000 +0400
-@@ -389,6 +389,7 @@
- struct ext3_super_block *es = sbi->s_es;
- int i;
-
-+ ext3_mb_release(sb);
- ext3_ext_release(sb);
- ext3_xattr_put_super(sb);
- journal_destroy(sbi->s_journal);
-@@ -543,6 +544,7 @@
- Opt_commit, Opt_journal_update, Opt_journal_inum,
- Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
- Opt_ignore, Opt_barrier, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
-+ Opt_mballoc, Opt_mbfactor,
- Opt_err, Opt_extents, Opt_extdebug
- };
-
-@@ -590,6 +592,8 @@
- {Opt_iopen_nopriv, "iopen_nopriv"},
- {Opt_extents, "extents"},
- {Opt_extdebug, "extdebug"},
-+ {Opt_mballoc, "mballoc"},
-+ {Opt_mbfactor, "mbfactor=%u"},
- {Opt_err, NULL}
- };
-
-@@ -811,6 +815,16 @@
- case Opt_extdebug:
- set_opt (sbi->s_mount_opt, EXTDEBUG);
- break;
-+ case Opt_mballoc:
-+ set_opt (sbi->s_mount_opt, MBALLOC);
-+ break;
-+ case Opt_mbfactor:
-+ if (match_int(&args[0], &option))
-+ return 0;
-+ if (option < 0)
-+ return 0;
-+ sbi->s_mb_factor = option;
-+ break;
- default:
- printk (KERN_ERR
- "EXT3-fs: Unrecognized mount option \"%s\" "
-@@ -1464,6 +1478,7 @@
- ext3_count_dirs(sb));
-
- ext3_ext_init(sb);
-+ ext3_mb_init(sb, needs_recovery);
-
- return 0;
-
-@@ -2112,7 +2127,13 @@
+--- linux-2.6.5-7.201.orig/fs/ext3/Makefile 2005-12-17 02:53:30.000000000 +0300
++++ linux-2.6.5-7.201/fs/ext3/Makefile 2005-12-17 03:10:23.000000000 +0300
+@@ -5,7 +5,8 @@
+ obj-$(CONFIG_EXT3_FS) += ext3.o
- static int __init init_ext3_fs(void)
- {
-- int err = init_ext3_xattr();
-+ int err;
-+
-+ err = init_ext3_proc();
-+ if (err)
-+ return err;
-+
-+ err = init_ext3_xattr();
- if (err)
- return err;
- err = init_inodecache();
-@@ -2141,6 +2162,7 @@
- unregister_filesystem(&ext3_fs_type);
- destroy_inodecache();
- exit_ext3_xattr();
-+ exit_ext3_proc();
- }
+ ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
+- ioctl.o namei.o super.o symlink.o hash.o extents.o
++ ioctl.o namei.o super.o symlink.o hash.o extents.o \
++ mballoc.o
- int ext3_prep_san_write(struct inode *inode, long *blocks,
+ ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
+ ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
--- /dev/null
+Index: linux-2.6.12.6/include/linux/ext3_fs.h
+===================================================================
+--- linux-2.6.12.6.orig/include/linux/ext3_fs.h 2005-12-17 02:17:16.000000000 +0300
++++ linux-2.6.12.6/include/linux/ext3_fs.h 2005-12-17 02:21:21.000000000 +0300
+@@ -57,6 +57,14 @@ struct statfs;
+ #define ext3_debug(f, a...) do {} while (0)
+ #endif
+
++#define EXT3_MULTIBLOCK_ALLOCATOR 1
++
++#define EXT3_MB_HINT_MERGE 1
++#define EXT3_MB_HINT_RESERVED 2
++#define EXT3_MB_HINT_METADATA 4
++#define EXT3_MB_HINT_FIRST 8
++#define EXT3_MB_HINT_BEST 16
++
+ /*
+ * Special inodes numbers
+ */
+@@ -366,6 +374,7 @@ struct ext3_inode {
+ #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000 /* Make iopen world-readable */
+ #define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */
+ #define EXT3_MOUNT_EXTDEBUG 0x400000/* Extents debug */
++#define EXT3_MOUNT_MBALLOC 0x800000/* Buddy allocation support */
+
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef clear_opt
+@@ -727,7 +736,7 @@ extern int ext3_bg_has_super(struct supe
+ extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
+ extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *);
+ extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long,
+- unsigned long);
++ unsigned long, int);
+ extern void ext3_free_blocks_sb (handle_t *, struct super_block *,
+ unsigned long, unsigned long, int *);
+ extern unsigned long ext3_count_free_blocks (struct super_block *);
+@@ -848,6 +857,17 @@ extern void ext3_extents_initialize_bloc
+ extern int ext3_ext_ioctl(struct inode *inode, struct file *filp,
+ unsigned int cmd, unsigned long arg);
+
++/* mballoc.c */
++extern long ext3_mb_stats;
++extern long ext3_mb_max_to_scan;
++extern long ext3_mb_min_to_scan;
++extern int ext3_mb_init(struct super_block *, int);
++extern int ext3_mb_release(struct super_block *);
++extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *);
++extern int ext3_mb_reserve_blocks(struct super_block *, int);
++extern void ext3_mb_release_blocks(struct super_block *, int);
++int __init init_ext3_proc(void);
++void exit_ext3_proc(void);
++
+ #endif /* __KERNEL__ */
+
+ /* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */
+Index: linux-2.6.12.6/include/linux/ext3_fs_sb.h
+===================================================================
+--- linux-2.6.12.6.orig/include/linux/ext3_fs_sb.h 2005-08-29 20:55:27.000000000 +0400
++++ linux-2.6.12.6/include/linux/ext3_fs_sb.h 2005-12-17 02:21:21.000000000 +0300
+@@ -21,8 +21,14 @@
+ #include <linux/wait.h>
+ #include <linux/blockgroup_lock.h>
+ #include <linux/percpu_counter.h>
++#include <linux/list.h>
+ #endif
+ #include <linux/rbtree.h>
++#include <linux/proc_fs.h>
++
++struct ext3_buddy_group_blocks;
++struct ext3_mb_history;
++#define EXT3_BB_MAX_BLOCKS
+
+ /*
+ * third extended-fs super-block data in memory
+@@ -78,6 +84,38 @@ struct ext3_sb_info {
+ char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
+ int s_jquota_fmt; /* Format of quota to use */
+ #endif
++
++ /* for buddy allocator */
++ struct ext3_group_info **s_group_info;
++ struct inode *s_buddy_cache;
++ long s_blocks_reserved;
++ spinlock_t s_reserve_lock;
++ struct list_head s_active_transaction;
++ struct list_head s_closed_transaction;
++ struct list_head s_committed_transaction;
++ spinlock_t s_md_lock;
++ tid_t s_last_transaction;
++ int s_mb_factor;
++ unsigned short *s_mb_offsets, *s_mb_maxs;
++
++ /* history to debug policy */
++ struct ext3_mb_history *s_mb_history;
++ int s_mb_history_cur;
++ int s_mb_history_max;
++ struct proc_dir_entry *s_mb_proc;
++ spinlock_t s_mb_history_lock;
++
++ /* stats for buddy allocator */
++ atomic_t s_bal_reqs; /* number of reqs with len > 1 */
++ atomic_t s_bal_success; /* we found long enough chunks */
++ atomic_t s_bal_allocated; /* in blocks */
++ atomic_t s_bal_ex_scanned; /* total extents scanned */
++ atomic_t s_bal_goals; /* goal hits */
++ atomic_t s_bal_breaks; /* too long searches */
++ atomic_t s_bal_2orders; /* 2^order hits */
++ spinlock_t s_bal_lock;
++ unsigned long s_mb_buddies_generated;
++ unsigned long long s_mb_generation_time;
+ };
+
+ #endif /* _LINUX_EXT3_FS_SB */
+Index: linux-2.6.12.6/fs/ext3/super.c
+===================================================================
+--- linux-2.6.12.6.orig/fs/ext3/super.c 2005-12-17 02:17:16.000000000 +0300
++++ linux-2.6.12.6/fs/ext3/super.c 2005-12-17 02:21:21.000000000 +0300
+@@ -387,6 +387,7 @@ static void ext3_put_super (struct super
+ struct ext3_super_block *es = sbi->s_es;
+ int i;
+
++ ext3_mb_release(sb);
+ ext3_ext_release(sb);
+ ext3_xattr_put_super(sb);
+ journal_destroy(sbi->s_journal);
+@@ -597,6 +598,7 @@ enum {
+ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0,
+ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
+ Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_extents, Opt_extdebug,
++ Opt_mballoc,
+ };
+
+ static match_table_t tokens = {
+@@ -649,6 +651,7 @@ static match_table_t tokens = {
+ {Opt_iopen_nopriv, "iopen_nopriv"},
+ {Opt_extents, "extents"},
+ {Opt_extdebug, "extdebug"},
++ {Opt_mballoc, "mballoc"},
+ {Opt_barrier, "barrier=%u"},
+ {Opt_err, NULL},
+ {Opt_resize, "resize"},
+@@ -964,6 +967,9 @@ clear_qf_name:
+ case Opt_extdebug:
+ set_opt (sbi->s_mount_opt, EXTDEBUG);
+ break;
++ case Opt_mballoc:
++ set_opt (sbi->s_mount_opt, MBALLOC);
++ break;
+ default:
+ printk (KERN_ERR
+ "EXT3-fs: Unrecognized mount option \"%s\" "
+@@ -1669,6 +1675,7 @@ static int ext3_fill_super (struct super
+ ext3_count_dirs(sb));
+
+ ext3_ext_init(sb);
++ ext3_mb_init(sb, needs_recovery);
+ lock_kernel();
+ return 0;
+
+@@ -2548,7 +2555,13 @@ static struct file_system_type ext3_fs_t
+
+ static int __init init_ext3_fs(void)
+ {
+- int err = init_ext3_xattr();
++ int err;
++
++ err = init_ext3_proc();
++ if (err)
++ return err;
++
++ err = init_ext3_xattr();
+ if (err)
+ return err;
+ err = init_inodecache();
+@@ -2570,6 +2583,7 @@ static void __exit exit_ext3_fs(void)
+ unregister_filesystem(&ext3_fs_type);
+ destroy_inodecache();
+ exit_ext3_xattr();
++ exit_ext3_proc();
+ }
+
+ int ext3_prep_san_write(struct inode *inode, long *blocks,
+Index: linux-2.6.12.6/fs/ext3/extents.c
+===================================================================
+--- linux-2.6.12.6.orig/fs/ext3/extents.c 2005-12-17 02:17:16.000000000 +0300
++++ linux-2.6.12.6/fs/ext3/extents.c 2005-12-17 02:21:21.000000000 +0300
+@@ -771,7 +771,7 @@ cleanup:
+ for (i = 0; i < depth; i++) {
+ if (!ablocks[i])
+ continue;
+- ext3_free_blocks(handle, tree->inode, ablocks[i], 1);
++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1, 1);
+ }
+ }
+ kfree(ablocks);
+@@ -1428,7 +1428,7 @@ int ext3_ext_rm_idx(handle_t *handle, st
+ path->p_idx->ei_leaf);
+ bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf);
+ ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf);
+- ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1);
++ ext3_free_blocks(handle, tree->inode, path->p_idx->ei_leaf, 1, 1);
+ return err;
+ }
+
+@@ -1913,10 +1913,12 @@ ext3_remove_blocks(struct ext3_extents_t
+ int needed = ext3_remove_blocks_credits(tree, ex, from, to);
+ handle_t *handle = ext3_journal_start(tree->inode, needed);
+ struct buffer_head *bh;
+- int i;
++ int i, metadata = 0;
+
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
++ if (S_ISDIR(tree->inode->i_mode) || S_ISLNK(tree->inode->i_mode))
++ metadata = 1;
+ if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) {
+ /* tail removal */
+ unsigned long num, start;
+@@ -1928,7 +1930,7 @@ ext3_remove_blocks(struct ext3_extents_t
+ bh = sb_find_get_block(tree->inode->i_sb, start + i);
+ ext3_forget(handle, 0, tree->inode, bh, start + i);
+ }
+- ext3_free_blocks(handle, tree->inode, start, num);
++ ext3_free_blocks(handle, tree->inode, start, num, metadata);
+ } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) {
+ printk("strange request: removal %lu-%lu from %u:%u\n",
+ from, to, ex->ee_block, ex->ee_len);
+Index: linux-2.6.12.6/fs/ext3/inode.c
+===================================================================
+--- linux-2.6.12.6.orig/fs/ext3/inode.c 2005-12-17 02:17:16.000000000 +0300
++++ linux-2.6.12.6/fs/ext3/inode.c 2005-12-17 02:21:21.000000000 +0300
+@@ -564,7 +564,7 @@ static int ext3_alloc_branch(handle_t *h
+ ext3_journal_forget(handle, branch[i].bh);
+ }
+ for (i = 0; i < keys; i++)
+- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
++ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1);
+ return err;
+ }
+
+@@ -1850,7 +1850,7 @@ ext3_clear_blocks(handle_t *handle, stru
+ }
+ }
+
+- ext3_free_blocks(handle, inode, block_to_free, count);
++ ext3_free_blocks(handle, inode, block_to_free, count, 1);
+ }
+
+ /**
+@@ -2023,7 +2023,7 @@ static void ext3_free_branches(handle_t
+ ext3_journal_test_restart(handle, inode);
+ }
+
+- ext3_free_blocks(handle, inode, nr, 1);
++ ext3_free_blocks(handle, inode, nr, 1, 1);
+
+ if (parent_bh) {
+ /*
+Index: linux-2.6.12.6/fs/ext3/balloc.c
+===================================================================
+--- linux-2.6.12.6.orig/fs/ext3/balloc.c 2005-08-29 20:55:27.000000000 +0400
++++ linux-2.6.12.6/fs/ext3/balloc.c 2005-12-17 02:21:21.000000000 +0300
+@@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_
+ *
+ * Return buffer_head on success or NULL in case of failure.
+ */
+-static struct buffer_head *
++struct buffer_head *
+ read_block_bitmap(struct super_block *sb, unsigned int block_group)
+ {
+ struct ext3_group_desc * desc;
+@@ -490,24 +490,6 @@ error_return:
+ return;
+ }
+
+-/* Free given blocks, update quota and i_blocks field */
+-void ext3_free_blocks(handle_t *handle, struct inode *inode,
+- unsigned long block, unsigned long count)
+-{
+- struct super_block * sb;
+- int dquot_freed_blocks;
+-
+- sb = inode->i_sb;
+- if (!sb) {
+- printk ("ext3_free_blocks: nonexistent device");
+- return;
+- }
+- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
+- if (dquot_freed_blocks)
+- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
+- return;
+-}
+-
+ /*
+ * For ext3 allocations, we must not reuse any blocks which are
+ * allocated in the bitmap buffer's "last committed data" copy. This
+@@ -1162,7 +1144,7 @@ int ext3_should_retry_alloc(struct super
+ * bitmap, and then for any free bit if that fails.
+ * This function also updates quota and i_blocks field.
+ */
+-int ext3_new_block(handle_t *handle, struct inode *inode,
++int ext3_new_block_old(handle_t *handle, struct inode *inode,
+ unsigned long goal, int *errp)
+ {
+ struct buffer_head *bitmap_bh = NULL;
+Index: linux-2.6.12.6/fs/ext3/xattr.c
+===================================================================
+--- linux-2.6.12.6.orig/fs/ext3/xattr.c 2005-08-29 20:55:27.000000000 +0400
++++ linux-2.6.12.6/fs/ext3/xattr.c 2005-12-17 02:21:33.000000000 +0300
+@@ -484,7 +484,7 @@ ext3_xattr_release_block(handle_t *handl
+ ea_bdebug(bh, "refcount now=0; freeing");
+ if (ce)
+ mb_cache_entry_free(ce);
+- ext3_free_blocks(handle, inode, bh->b_blocknr, 1);
++ ext3_free_blocks(handle, inode, bh->b_blocknr, 1, 1);
+ get_bh(bh);
+ ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
+ } else {
+@@ -804,7 +804,7 @@ inserted:
+ new_bh = sb_getblk(sb, block);
+ if (!new_bh) {
+ getblk_failed:
+- ext3_free_blocks(handle, inode, block, 1);
++ ext3_free_blocks(handle, inode, block, 1, 1);
+ error = -EIO;
+ goto cleanup;
+ }
+Index: linux-2.6.12.6/fs/ext3/mballoc.c
+===================================================================
+--- linux-2.6.12.6.orig/fs/ext3/mballoc.c 2005-12-09 13:08:53.191437750 +0300
++++ linux-2.6.12.6/fs/ext3/mballoc.c 2005-12-17 02:21:21.000000000 +0300
+@@ -0,0 +1,2434 @@
++/*
++ * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com
++ * Written by Alex Tomas <alex@clusterfs.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 as
++ * published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
++ */
++
++
++/*
++ * mballoc.c contains the multiblocks allocation routines
++ */
++
++#include <linux/config.h>
++#include <linux/time.h>
++#include <linux/fs.h>
++#include <linux/namei.h>
++#include <linux/jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_jbd.h>
++#include <linux/quotaops.h>
++#include <linux/buffer_head.h>
++#include <linux/module.h>
++#include <linux/swap.h>
++#include <linux/proc_fs.h>
++#include <linux/pagemap.h>
++#include <linux/seq_file.h>
++
++/*
++ * TODO:
++ * - bitmap read-ahead (proposed by Oleg Drokin aka green)
++ * - track min/max extents in each group for better group selection
++ * - mb_mark_used() may allocate chunk right after splitting buddy
++ * - special flag to advise the allocator to look for requested + N blocks;
++ * this may improve interaction between extents and mballoc
++ * - tree of groups sorted by number of free blocks
++ * - percpu reservation code (hotpath)
++ * - error handling
++ */
++
++/*
++ * with AGGRESSIVE_CHECK the allocator runs consistency checks over its
++ * structures. these checks slow things down a lot
++ */
++#define AGGRESSIVE_CHECK__
++
++/*
++ * with MB_DEBUG mballoc prints verbose debugging messages via printk
++ */
++#define MB_DEBUG__
++#ifdef MB_DEBUG
++#define mb_debug(fmt,a...) printk(fmt, ##a)
++#else
++#define mb_debug(fmt,a...)
++#endif
++
++/*
++ * with EXT3_MB_HISTORY mballoc stores last N allocations in memory
++ * and you can monitor it in /proc/fs/ext3/<dev>/mb_history
++ */
++#define EXT3_MB_HISTORY
++
++/*
++ * Upper bound on how many found extents mballoc may examine
++ * while searching for the best one
++ */
++long ext3_mb_max_to_scan = 500;
++
++/*
++ * Lower bound: mballoc keeps scanning until at least this many
++ * extents have been examined
++ */
++long ext3_mb_min_to_scan = 30;
++
++/*
++ * with 'ext3_mb_stats' the allocator collects stats that are
++ * shown at umount. The collecting has a cost, though!
++ */
++
++long ext3_mb_stats = 1;
++
++#ifdef EXT3_BB_MAX_BLOCKS
++#undef EXT3_BB_MAX_BLOCKS
++#endif
++#define EXT3_BB_MAX_BLOCKS 30
++
++struct ext3_free_metadata {
++ unsigned short group;
++ unsigned short num;
++ unsigned short blocks[EXT3_BB_MAX_BLOCKS];
++ struct list_head list;
++};
++
++struct ext3_group_info {
++ unsigned long bb_state;
++ unsigned long bb_tid;
++ struct ext3_free_metadata *bb_md_cur;
++ unsigned short bb_first_free;
++ unsigned short bb_free;
++ unsigned short bb_fragments;
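++ /* bb_counters[order] counts free 2^order chunks; this is a
++ * flexible array, sized when the group info is allocated in
++ * ext3_mb_init_backend() */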
++ unsigned short bb_counters[];
++};
++
++
++#define EXT3_GROUP_INFO_NEED_INIT_BIT 0
++#define EXT3_GROUP_INFO_LOCKED_BIT 1
++
++#define EXT3_MB_GRP_NEED_INIT(grp) \
++ (test_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &(grp)->bb_state))
++
++struct ext3_free_extent {
++ __u16 fe_start;
++ __u16 fe_len;
++ __u16 fe_group;
++};
++
++struct ext3_allocation_context {
++ struct super_block *ac_sb;
++
++ /* search goals */
++ struct ext3_free_extent ac_g_ex;
++
++ /* the best found extent */
++ struct ext3_free_extent ac_b_ex;
++
++ /* number of extents scanned; we track this to limit searching */
++ unsigned long ac_ex_scanned;
++ __u16 ac_groups_scanned;
++ __u16 ac_found;
++ __u16 ac_tail;
++ __u16 ac_buddy;
++ __u8 ac_status;
++ __u8 ac_flags; /* allocation hints */
++ __u8 ac_criteria;
++ __u8 ac_repeats;
++ __u8 ac_2order; /* if request is to allocate 2^N blocks and
++ * N > 0, the field stores N, otherwise 0 */
++};
++
++#define AC_STATUS_CONTINUE 1
++#define AC_STATUS_FOUND 2
++#define AC_STATUS_BREAK 3
++
++struct ext3_mb_history {
++ struct ext3_free_extent goal; /* goal allocation */
++ struct ext3_free_extent result; /* result allocation */
++ __u16 found; /* how many extents have been found */
++ __u16 groups; /* how many groups have been scanned */
++ __u16 tail; /* what tail broke some buddy */
++ __u16 buddy; /* buddy the tail ^^^ broke */
++ __u8 cr; /* which phase the result extent was found at */
++ __u8 merged;
++};
++
++struct ext3_buddy {
++ struct page *bd_buddy_page;
++ void *bd_buddy;
++ struct page *bd_bitmap_page;
++ void *bd_bitmap;
++ struct ext3_group_info *bd_info;
++ struct super_block *bd_sb;
++ __u16 bd_blkbits;
++ __u16 bd_group;
++};
++#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bitmap)
++#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy)
++
++#ifndef EXT3_MB_HISTORY
++#define ext3_mb_store_history(sb,ac)
++#else
++static void ext3_mb_store_history(struct super_block *,
++ struct ext3_allocation_context *ac);
++#endif
++
++#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
++
++static struct proc_dir_entry *proc_root_ext3;
++
++int ext3_create (struct inode *, struct dentry *, int, struct nameidata *);
++struct buffer_head * read_block_bitmap(struct super_block *, unsigned int);
++int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *);
++int ext3_mb_reserve_blocks(struct super_block *, int);
++void ext3_mb_release_blocks(struct super_block *, int);
++void ext3_mb_poll_new_transaction(struct super_block *, handle_t *);
++void ext3_mb_free_committed_blocks(struct super_block *);
++
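++/*
++ * The ext2_*_bit() helpers operate on longs, so on some architectures
++ * the address must be aligned to sizeof(long). These macros align the
++ * address down to a long boundary and fold the difference into the
++ * bit number.
++ */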
++#if BITS_PER_LONG == 64
++#define mb_correct_addr_and_bit(bit,addr) \
++{ \
++ bit += ((unsigned long) addr & 7UL) << 3; \
++ addr = (void *) ((unsigned long) addr & ~7UL); \
++}
++#elif BITS_PER_LONG == 32
++#define mb_correct_addr_and_bit(bit,addr) \
++{ \
++ bit += ((unsigned long) addr & 3UL) << 3; \
++ addr = (void *) ((unsigned long) addr & ~3UL); \
++}
++#else
++#error "unsupported BITS_PER_LONG"
++#endif
++
++static inline int mb_test_bit(int bit, void *addr)
++{
++ mb_correct_addr_and_bit(bit,addr);
++ return ext2_test_bit(bit, addr);
++}
++
++static inline void mb_set_bit(int bit, void *addr)
++{
++ mb_correct_addr_and_bit(bit,addr);
++ ext2_set_bit(bit, addr);
++}
++
++static inline void mb_set_bit_atomic(int bit, void *addr)
++{
++ mb_correct_addr_and_bit(bit,addr);
++ ext2_set_bit_atomic(NULL, bit, addr);
++}
++
++static inline void mb_clear_bit(int bit, void *addr)
++{
++ mb_correct_addr_and_bit(bit,addr);
++ ext2_clear_bit(bit, addr);
++}
++
++static inline void mb_clear_bit_atomic(int bit, void *addr)
++{
++ mb_correct_addr_and_bit(bit,addr);
++ ext2_clear_bit_atomic(NULL, bit, addr);
++}
++
++static inline int mb_find_next_zero_bit(void *addr, int max, int start)
++{
++ int fix;
++#if BITS_PER_LONG == 64
++ fix = ((unsigned long) addr & 7UL) << 3;
++ addr = (void *) ((unsigned long) addr & ~7UL);
++#elif BITS_PER_LONG == 32
++ fix = ((unsigned long) addr & 3UL) << 3;
++ addr = (void *) ((unsigned long) addr & ~3UL);
++#else
++#error "unsupported BITS_PER_LONG"
++#endif
++ max += fix;
++ start += fix;
++ return ext2_find_next_zero_bit(addr, max, start) - fix;
++}
++
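++/*
++ * Return the buddy bitmap of the given order and the number of bits
++ * valid at that order. Order 0 is the block bitmap itself; higher
++ * orders live inside the buddy block at s_mb_offsets[order]. E.g. with
++ * 4KB blocks, order 0 has 32768 bits, order 1 16384, order 2 8192, ...
++ */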
++static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max)
++{
++ char *bb;
++
++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b));
++ J_ASSERT(max != NULL);
++
++ if (order > e3b->bd_blkbits + 1) {
++ *max = 0;
++ return NULL;
++ }
++
++ /* at order 0 we see each particular block */
++ *max = 1 << (e3b->bd_blkbits + 3);
++ if (order == 0)
++ return EXT3_MB_BITMAP(e3b);
++
++ bb = EXT3_MB_BUDDY(e3b) + EXT3_SB(e3b->bd_sb)->s_mb_offsets[order];
++ *max = EXT3_SB(e3b->bd_sb)->s_mb_maxs[order];
++
++ return bb;
++}
++
++#ifdef AGGRESSIVE_CHECK
++
++static void mb_check_buddy(struct ext3_buddy *e3b)
++{
++ int order = e3b->bd_blkbits + 1;
++ int max, max2, i, j, k, count;
++ int fragments = 0, fstart;
++ void *buddy, *buddy2;
++
++ if (!test_opt(e3b->bd_sb, MBALLOC))
++ return;
++
++ {
++ static int mb_check_counter = 0;
++ if (mb_check_counter++ % 300 != 0)
++ return;
++ }
++
++ while (order > 1) {
++ buddy = mb_find_buddy(e3b, order, &max);
++ J_ASSERT(buddy);
++ buddy2 = mb_find_buddy(e3b, order - 1, &max2);
++ J_ASSERT(buddy2);
++ J_ASSERT(buddy != buddy2);
++ J_ASSERT(max * 2 == max2);
++
++ count = 0;
++ for (i = 0; i < max; i++) {
++
++ if (mb_test_bit(i, buddy)) {
++ /* only single bit in buddy2 may be 1 */
++ if (!mb_test_bit(i << 1, buddy2))
++ J_ASSERT(mb_test_bit((i<<1)+1, buddy2));
++ else if (!mb_test_bit((i << 1) + 1, buddy2))
++ J_ASSERT(mb_test_bit(i << 1, buddy2));
++ continue;
++ }
++
++ /* both bits in buddy2 must be 0 */
++ J_ASSERT(mb_test_bit(i << 1, buddy2));
++ J_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
++
++ for (j = 0; j < (1 << order); j++) {
++ k = (i * (1 << order)) + j;
++ J_ASSERT(!mb_test_bit(k, EXT3_MB_BITMAP(e3b)));
++ }
++ count++;
++ }
++ J_ASSERT(e3b->bd_info->bb_counters[order] == count);
++ order--;
++ }
++
++ fstart = -1;
++ buddy = mb_find_buddy(e3b, 0, &max);
++ for (i = 0; i < max; i++) {
++ if (!mb_test_bit(i, buddy)) {
++ J_ASSERT(i >= e3b->bd_info->bb_first_free);
++ if (fstart == -1) {
++ fragments++;
++ fstart = i;
++ }
++ continue;
++ }
++ fstart = -1;
++ /* check used bits only */
++ for (j = 0; j < e3b->bd_blkbits + 1; j++) {
++ buddy2 = mb_find_buddy(e3b, j, &max2);
++ k = i >> j;
++ J_ASSERT(k < max2);
++ J_ASSERT(mb_test_bit(k, buddy2));
++ }
++ }
++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(e3b->bd_info));
++ J_ASSERT(e3b->bd_info->bb_fragments == fragments);
++}
++
++#else
++#define mb_check_buddy(e3b)
++#endif
++
++/* find most significant bit */
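++/* e.g. fmsb(24) == 4: 24 is 0b11000 and its highest set bit is bit 4 */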
++static int inline fmsb(unsigned short word)
++{
++ int order;
++
++ if (word > 255) {
++ order = 7;
++ word >>= 8;
++ } else {
++ order = -1;
++ }
++
++ do {
++ order++;
++ word >>= 1;
++ } while (word != 0);
++
++ return order;
++}
++
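++/*
++ * Mark a free range in the buddy by carving it into aligned power-of-2
++ * chunks, largest first. E.g. first=5, len=11 becomes the chunks
++ * 1@5, 2@6 and 8@8, updating bb_counters[] for each order used.
++ */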
++static void inline
++ext3_mb_mark_free_simple(struct super_block *sb, void *buddy, unsigned first,
++ int len, struct ext3_group_info *grp)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ unsigned short min, max, chunk, border;
++
++ mb_debug("mark %u/%u free\n", first, len);
++ J_ASSERT(len < EXT3_BLOCKS_PER_GROUP(sb));
++
++ border = 2 << sb->s_blocksize_bits;
++
++ while (len > 0) {
++ /* the largest aligned chunk that can start at this position */
++ max = ffs(first | border) - 1;
++
++ /* the largest power-of-2 chunk that fits in the remaining length */
++ min = fmsb(len);
++
++ mb_debug(" %u/%u -> max %u, min %u\n",
++ first & ((2 << sb->s_blocksize_bits) - 1),
++ len, max, min);
++
++ if (max < min)
++ min = max;
++ chunk = 1 << min;
++
++ /* mark multiblock chunks only */
++ grp->bb_counters[min]++;
++ if (min > 0) {
++ mb_debug(" set %u at %u \n", first >> min,
++ sbi->s_mb_offsets[min]);
++ mb_clear_bit(first >> min, buddy + sbi->s_mb_offsets[min]);
++ }
++
++ len -= chunk;
++ first += chunk;
++ }
++}
++
++static void
++ext3_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap,
++ struct ext3_group_info *grp)
++{
++ unsigned short max = EXT3_BLOCKS_PER_GROUP(sb);
++ unsigned short i = 0, first, len;
++ unsigned free = 0, fragments = 0;
++ unsigned long long period = get_cycles();
++
++ i = mb_find_next_zero_bit(bitmap, max, 0);
++ grp->bb_first_free = i;
++ while (i < max) {
++ fragments++;
++ first = i;
++ i = find_next_bit(bitmap, max, i);
++ len = i - first;
++ free += len;
++ if (len > 1)
++ ext3_mb_mark_free_simple(sb, buddy, first, len, grp);
++ else
++ grp->bb_counters[0]++;
++ if (i < max)
++ i = mb_find_next_zero_bit(bitmap, max, i);
++ }
++ grp->bb_fragments = fragments;
++
++ /* no locking is needed to modify bb_state here: everyone
++ * else waits for init completion on the page lock */
++ clear_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &grp->bb_state);
++ if (free != grp->bb_free) {
++ printk("EXT3-fs: %u blocks in bitmap, %u in group descriptor\n",
++ free, grp->bb_free);
++ grp->bb_free = free;
++ }
++
++ period = get_cycles() - period;
++ spin_lock(&EXT3_SB(sb)->s_bal_lock);
++ EXT3_SB(sb)->s_mb_buddies_generated++;
++ EXT3_SB(sb)->s_mb_generation_time += period;
++ spin_unlock(&EXT3_SB(sb)->s_bal_lock);
++}
++
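++/*
++ * The buddy cache inode holds two blocks per group: block 2*N is a
++ * copy of group N's on-disk block bitmap, block 2*N+1 holds the buddy
++ * bitmaps of all orders. One page may therefore cover several groups.
++ */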
++static int ext3_mb_init_cache(struct page *page)
++{
++ int blocksize, blocks_per_page, groups_per_page;
++ int err = 0, i, first_group, first_block;
++ struct super_block *sb;
++ struct buffer_head *bhs = NULL;
++ struct buffer_head **bh;
++ struct inode *inode;
++ char *data, *bitmap;
++
++ mb_debug("init page %lu\n", page->index);
++
++ inode = page->mapping->host;
++ sb = inode->i_sb;
++ blocksize = 1 << inode->i_blkbits;
++ blocks_per_page = PAGE_CACHE_SIZE / blocksize;
++
++ groups_per_page = blocks_per_page >> 1;
++ if (groups_per_page == 0)
++ groups_per_page = 1;
++
++ /* allocate buffer_heads to read bitmaps */
++ if (groups_per_page > 1) {
++ err = -ENOMEM;
++ i = sizeof(struct buffer_head *) * groups_per_page;
++ bh = kmalloc(i, GFP_NOFS);
++ if (bh == NULL)
++ goto out;
++ memset(bh, 0, i);
++ } else
++ bh = &bhs;
++
++ first_group = page->index * blocks_per_page / 2;
++
++ /* read all groups the page covers into the cache */
++ for (i = 0; i < groups_per_page; i++) {
++ struct ext3_group_desc * desc;
++
++ if (first_group + i >= EXT3_SB(sb)->s_groups_count)
++ break;
++
++ err = -EIO;
++ desc = ext3_get_group_desc(sb, first_group + i, NULL);
++ if (desc == NULL)
++ goto out;
++
++ err = -ENOMEM;
++ bh[i] = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap));
++ if (bh[i] == NULL)
++ goto out;
++
++ if (buffer_uptodate(bh[i]))
++ continue;
++
++ lock_buffer(bh[i]);
++ if (buffer_uptodate(bh[i])) {
++ unlock_buffer(bh[i]);
++ continue;
++ }
++
++ get_bh(bh[i]);
++ bh[i]->b_end_io = end_buffer_read_sync;
++ submit_bh(READ, bh[i]);
++ mb_debug("read bitmap for group %u\n", first_group + i);
++ }
++
++ /* wait for I/O completion */
++ for (i = 0; i < groups_per_page && bh[i]; i++)
++ wait_on_buffer(bh[i]);
++
++ /* XXX: I/O error handling here */
++
++ first_block = page->index * blocks_per_page;
++ for (i = 0; i < blocks_per_page; i++) {
++ int group;
++
++ group = (first_block + i) >> 1;
++ if (group >= EXT3_SB(sb)->s_groups_count)
++ break;
++
++ data = page_address(page) + (i * blocksize);
++ bitmap = bh[group - first_group]->b_data;
++
++ if ((first_block + i) & 1) {
++ /* this block holds the buddy data */
++ mb_debug("put buddy for group %u in page %lu/%x\n",
++ group, page->index, i * blocksize);
++ memset(data, 0xff, blocksize);
++ EXT3_SB(sb)->s_group_info[group]->bb_fragments = 0;
++ memset(EXT3_SB(sb)->s_group_info[group]->bb_counters, 0,
++ sizeof(unsigned short)*(sb->s_blocksize_bits+2));
++ ext3_mb_generate_buddy(sb, data, bitmap,
++ EXT3_SB(sb)->s_group_info[group]);
++ } else {
++ /* this block holds the bitmap copy */
++ mb_debug("put bitmap for group %u in page %lu/%x\n",
++ group, page->index, i * blocksize);
++ memcpy(data, bitmap, blocksize);
++ }
++ }
++ SetPageUptodate(page);
++
++out:
++ if (bh) {
++ for (i = 0; i < groups_per_page && bh[i]; i++)
++ brelse(bh[i]);
++ if (bh != &bhs)
++ kfree(bh);
++ }
++ return err;
++}
++
++static int ext3_mb_load_buddy(struct super_block *sb, int group,
++ struct ext3_buddy *e3b)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ struct inode *inode = sbi->s_buddy_cache;
++ int blocks_per_page, block, pnum, poff;
++ struct page *page;
++
++ mb_debug("load group %u\n", group);
++
++ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
++
++ e3b->bd_blkbits = sb->s_blocksize_bits;
++ e3b->bd_info = sbi->s_group_info[group];
++ e3b->bd_sb = sb;
++ e3b->bd_group = group;
++ e3b->bd_buddy_page = NULL;
++ e3b->bd_bitmap_page = NULL;
++
++ block = group * 2;
++ pnum = block / blocks_per_page;
++ poff = block % blocks_per_page;
++
++ page = find_get_page(inode->i_mapping, pnum);
++ if (page == NULL || !PageUptodate(page)) {
++ if (page)
++ page_cache_release(page);
++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
++ if (page) {
++ if (!PageUptodate(page))
++ ext3_mb_init_cache(page);
++ unlock_page(page);
++ }
++ }
++ if (page == NULL || !PageUptodate(page))
++ goto err;
++ e3b->bd_bitmap_page = page;
++ e3b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
++ mark_page_accessed(page);
++
++ block++;
++ pnum = block / blocks_per_page;
++ poff = block % blocks_per_page;
++
++ page = find_get_page(inode->i_mapping, pnum);
++ if (page == NULL || !PageUptodate(page)) {
++ if (page)
++ page_cache_release(page);
++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
++ if (page) {
++ if (!PageUptodate(page))
++ ext3_mb_init_cache(page);
++ unlock_page(page);
++ }
++ }
++ if (page == NULL || !PageUptodate(page))
++ goto err;
++ e3b->bd_buddy_page = page;
++ e3b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
++ mark_page_accessed(page);
++
++ J_ASSERT(e3b->bd_bitmap_page != NULL);
++ J_ASSERT(e3b->bd_buddy_page != NULL);
++
++ return 0;
++
++err:
++ if (e3b->bd_bitmap_page)
++ page_cache_release(e3b->bd_bitmap_page);
++ if (e3b->bd_buddy_page)
++ page_cache_release(e3b->bd_buddy_page);
++ e3b->bd_buddy = NULL;
++ e3b->bd_bitmap = NULL;
++ return -EIO;
++}
++
++static void ext3_mb_release_desc(struct ext3_buddy *e3b)
++{
++ if (e3b->bd_bitmap_page)
++ page_cache_release(e3b->bd_bitmap_page);
++ if (e3b->bd_buddy_page)
++ page_cache_release(e3b->bd_buddy_page);
++}
++
++
++static inline void
++ext3_lock_group(struct super_block *sb, int group)
++{
++ bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT,
++ &EXT3_SB(sb)->s_group_info[group]->bb_state);
++}
++
++static inline void
++ext3_unlock_group(struct super_block *sb, int group)
++{
++ bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT,
++ &EXT3_SB(sb)->s_group_info[group]->bb_state);
++}
++
++static int mb_find_order_for_block(struct ext3_buddy *e3b, int block)
++{
++ int order = 1;
++ void *bb;
++
++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b));
++ J_ASSERT(block < (1 << (e3b->bd_blkbits + 3)));
++
++ bb = EXT3_MB_BUDDY(e3b);
++ while (order <= e3b->bd_blkbits + 1) {
++ block = block >> 1;
++ if (!mb_test_bit(block, bb)) {
++ /* this block is part of buddy of order 'order' */
++ return order;
++ }
++ bb += 1 << (e3b->bd_blkbits - order);
++ order++;
++ }
++ return 0;
++}
++
++static inline void mb_clear_bits(void *bm, int cur, int len)
++{
++ __u32 *addr;
++
++ len = cur + len;
++ while (cur < len) {
++ if ((cur & 31) == 0 && (len - cur) >= 32) {
++ /* fast path: clear whole word at once */
++ addr = bm + (cur >> 3);
++ *addr = 0;
++ cur += 32;
++ continue;
++ }
++ mb_clear_bit_atomic(cur, bm);
++ cur++;
++ }
++}
++
++static inline void mb_set_bits(void *bm, int cur, int len)
++{
++ __u32 *addr;
++
++ len = cur + len;
++ while (cur < len) {
++ if ((cur & 31) == 0 && (len - cur) >= 32) {
++ /* fast path: set whole word at once */
++ addr = bm + (cur >> 3);
++ *addr = 0xffffffff;
++ cur += 32;
++ continue;
++ }
++ mb_set_bit_atomic(cur, bm);
++ cur++;
++ }
++}
++
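++/*
++ * Free 'count' blocks starting at 'first': clear them in the block
++ * bitmap and then, for each freed block, merge it with its buddy as
++ * long as the buddy is free too, moving the pair up one order at a
++ * time (classic buddy coalescing).
++ */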
++static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count)
++{
++ int block = 0, max = 0, order;
++ void *buddy, *buddy2;
++
++ mb_check_buddy(e3b);
++
++ e3b->bd_info->bb_free += count;
++ if (first < e3b->bd_info->bb_first_free)
++ e3b->bd_info->bb_first_free = first;
++
++ /* let's maintain fragments counter */
++ if (first != 0)
++ block = !mb_test_bit(first - 1, EXT3_MB_BITMAP(e3b));
++ if (first + count < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0])
++ max = !mb_test_bit(first + count, EXT3_MB_BITMAP(e3b));
++ if (block && max)
++ e3b->bd_info->bb_fragments--;
++ else if (!block && !max)
++ e3b->bd_info->bb_fragments++;
++
++ /* let's maintain buddy itself */
++ while (count-- > 0) {
++ block = first++;
++ order = 0;
++
++ J_ASSERT(mb_test_bit(block, EXT3_MB_BITMAP(e3b)));
++ mb_clear_bit(block, EXT3_MB_BITMAP(e3b));
++ e3b->bd_info->bb_counters[order]++;
++
++ /* start of the buddy */
++ buddy = mb_find_buddy(e3b, order, &max);
++
++ do {
++ block &= ~1UL;
++ if (mb_test_bit(block, buddy) ||
++ mb_test_bit(block + 1, buddy))
++ break;
++
++ /* both the buddies are free, try to coalesce them */
++ buddy2 = mb_find_buddy(e3b, order + 1, &max);
++
++ if (!buddy2)
++ break;
++
++ if (order > 0) {
++ /* for special purposes, we don't set
++ * free bits in bitmap */
++ mb_set_bit(block, buddy);
++ mb_set_bit(block + 1, buddy);
++ }
++ e3b->bd_info->bb_counters[order]--;
++ e3b->bd_info->bb_counters[order]--;
++
++ block = block >> 1;
++ order++;
++ e3b->bd_info->bb_counters[order]++;
++
++ mb_clear_bit(block, buddy2);
++ buddy = buddy2;
++ } while (1);
++ }
++ mb_check_buddy(e3b);
++
++ return 0;
++}
++
++static int mb_find_extent(struct ext3_buddy *e3b, int order, int block,
++ int needed, struct ext3_free_extent *ex)
++{
++ int next, max, ord;
++ void *buddy;
++
++ J_ASSERT(ex != NULL);
++
++ buddy = mb_find_buddy(e3b, order, &max);
++ J_ASSERT(buddy);
++ J_ASSERT(block < max);
++ if (mb_test_bit(block, buddy)) {
++ ex->fe_len = 0;
++ ex->fe_start = 0;
++ ex->fe_group = 0;
++ return 0;
++ }
++
++ if (likely(order == 0)) {
++ /* find actual order */
++ order = mb_find_order_for_block(e3b, block);
++ block = block >> order;
++ }
++
++ ex->fe_len = 1 << order;
++ ex->fe_start = block << order;
++ ex->fe_group = e3b->bd_group;
++
++ while (needed > ex->fe_len && (buddy = mb_find_buddy(e3b, order, &max))) {
++
++ if (block + 1 >= max)
++ break;
++
++ next = (block + 1) * (1 << order);
++ if (mb_test_bit(next, EXT3_MB_BITMAP(e3b)))
++ break;
++
++ ord = mb_find_order_for_block(e3b, next);
++
++ order = ord;
++ block = next >> order;
++ ex->fe_len += 1 << order;
++ }
++
++ J_ASSERT(ex->fe_start + ex->fe_len <= (1 << (e3b->bd_blkbits + 3)));
++ return ex->fe_len;
++}
++
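++/*
++ * The converse of mb_free_blocks(): mark the extent as used, taking
++ * whole aligned 2^order chunks where possible and splitting larger
++ * buddies into halves when the extent only partially covers them.
++ */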
++static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex)
++{
++ int ord, mlen = 0, max = 0, cur;
++ int start = ex->fe_start;
++ int len = ex->fe_len;
++ unsigned ret = 0;
++ int len0 = len;
++ void *buddy;
++
++ mb_check_buddy(e3b);
++
++ e3b->bd_info->bb_free -= len;
++ if (e3b->bd_info->bb_first_free == start)
++ e3b->bd_info->bb_first_free += len;
++
++ /* let's maintain fragments counter */
++ if (start != 0)
++ mlen = !mb_test_bit(start - 1, EXT3_MB_BITMAP(e3b));
++ if (start + len < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0])
++ max = !mb_test_bit(start + len, EXT3_MB_BITMAP(e3b));
++ if (mlen && max)
++ e3b->bd_info->bb_fragments++;
++ else if (!mlen && !max)
++ e3b->bd_info->bb_fragments--;
++
++ /* let's maintain buddy itself */
++ while (len) {
++ ord = mb_find_order_for_block(e3b, start);
++
++ if (((start >> ord) << ord) == start && len >= (1 << ord)) {
++ /* the whole chunk may be allocated at once! */
++ mlen = 1 << ord;
++ buddy = mb_find_buddy(e3b, ord, &max);
++ J_ASSERT((start >> ord) < max);
++ mb_set_bit(start >> ord, buddy);
++ e3b->bd_info->bb_counters[ord]--;
++ start += mlen;
++ len -= mlen;
++ J_ASSERT(len >= 0);
++ continue;
++ }
++
++ /* store for history */
++ if (ret == 0)
++ ret = len | (ord << 16);
++
++ /* we have to split large buddy */
++ J_ASSERT(ord > 0);
++ buddy = mb_find_buddy(e3b, ord, &max);
++ mb_set_bit(start >> ord, buddy);
++ e3b->bd_info->bb_counters[ord]--;
++
++ ord--;
++ cur = (start >> ord) & ~1U;
++ buddy = mb_find_buddy(e3b, ord, &max);
++ mb_clear_bit(cur, buddy);
++ mb_clear_bit(cur + 1, buddy);
++ e3b->bd_info->bb_counters[ord]++;
++ e3b->bd_info->bb_counters[ord]++;
++ }
++
++ /* now drop all the bits in bitmap */
++ mb_set_bits(EXT3_MB_BITMAP(e3b), ex->fe_start, len0);
++
++ mb_check_buddy(e3b);
++
++ return ret;
++}
++
++/*
++ * Must be called under group lock!
++ */
++static void ext3_mb_use_best_found(struct ext3_allocation_context *ac,
++ struct ext3_buddy *e3b)
++{
++ unsigned long ret;
++
++ ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
++ ret = mb_mark_used(e3b, &ac->ac_b_ex);
++
++ ac->ac_status = AC_STATUS_FOUND;
++ ac->ac_tail = ret & 0xffff;
++ ac->ac_buddy = ret >> 16;
++}
++
++/*
++ * The routine checks whether the found extent is good enough. If it is,
++ * the extent gets marked used and a flag is set in the context
++ * to stop scanning. Otherwise, the extent is compared with the
++ * previously found extent, and if the new one is better, it's stored
++ * in the context. Later, the best extent found will be used, if
++ * mballoc can't find a good enough one.
++ *
++ * FIXME: the real allocation policy is yet to be designed!
++ */
++static void ext3_mb_measure_extent(struct ext3_allocation_context *ac,
++ struct ext3_free_extent *ex,
++ struct ext3_buddy *e3b)
++{
++ struct ext3_free_extent *bex = &ac->ac_b_ex;
++ struct ext3_free_extent *gex = &ac->ac_g_ex;
++
++ J_ASSERT(ex->fe_len > 0);
++ J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8);
++ J_ASSERT(ex->fe_start < (1 << ac->ac_sb->s_blocksize_bits) * 8);
++
++ ac->ac_found++;
++
++ /*
++ * The special case - take what you catch first
++ */
++ if (unlikely(ac->ac_flags & EXT3_MB_HINT_FIRST)) {
++ *bex = *ex;
++ ext3_mb_use_best_found(ac, e3b);
++ return;
++ }
++
++ /*
++ * Let's check whether the chunk is good enough
++ */
++ if (ex->fe_len == gex->fe_len) {
++ *bex = *ex;
++ ext3_mb_use_best_found(ac, e3b);
++ return;
++ }
++
++ /*
++ * If this is the first extent found, just store it in the context
++ */
++ if (bex->fe_len == 0) {
++ *bex = *ex;
++ return;
++ }
++
++ /*
++ * If the newly found extent is better, store it in the context
++ */
++ if (bex->fe_len < gex->fe_len) {
++ /* if the request isn't satisfied, any found extent
++ * larger than the previous best one is better */
++ if (ex->fe_len > bex->fe_len)
++ *bex = *ex;
++ } else if (ex->fe_len > gex->fe_len) {
++ /* if the request is satisfied, then we try to find
++ * an extent that still satisfies the request, but is
++ * smaller than the previous one */
++ *bex = *ex;
++ }
++
++ /*
++ * Let's scan at least a few extents and not just pick the first one
++ */
++ if (bex->fe_len > gex->fe_len && ac->ac_found > ext3_mb_min_to_scan)
++ ac->ac_status = AC_STATUS_BREAK;
++
++ /*
++ * We don't want to scan for a whole year
++ */
++ if (ac->ac_found > ext3_mb_max_to_scan)
++ ac->ac_status = AC_STATUS_BREAK;
++}
++
++static int ext3_mb_try_best_found(struct ext3_allocation_context *ac,
++ struct ext3_buddy *e3b)
++{
++ struct ext3_free_extent ex = ac->ac_b_ex;
++ int group = ex.fe_group, max, err;
++
++ J_ASSERT(ex.fe_len > 0);
++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b);
++ if (err)
++ return err;
++
++ ext3_lock_group(ac->ac_sb, group);
++ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex);
++
++ if (max > 0) {
++ ac->ac_b_ex = ex;
++ ext3_mb_use_best_found(ac, e3b);
++ }
++
++ ext3_unlock_group(ac->ac_sb, group);
++
++ ext3_mb_release_desc(e3b);
++
++ return 0;
++}
++
++static int ext3_mb_find_by_goal(struct ext3_allocation_context *ac,
++ struct ext3_buddy *e3b)
++{
++ int group = ac->ac_g_ex.fe_group, max, err;
++ struct ext3_free_extent ex;
++
++ err = ext3_mb_load_buddy(ac->ac_sb, group, e3b);
++ if (err)
++ return err;
++
++ ext3_lock_group(ac->ac_sb, group);
++ max = mb_find_extent(e3b, 0, ac->ac_g_ex.fe_start,
++ ac->ac_g_ex.fe_len, &ex);
++
++ if (max > 0) {
++ J_ASSERT(ex.fe_len > 0);
++ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group);
++ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start);
++ ac->ac_found++;
++ ac->ac_b_ex = ex;
++ ext3_mb_use_best_found(ac, e3b);
++ }
++ ext3_unlock_group(ac->ac_sb, group);
++
++ ext3_mb_release_desc(e3b);
++
++ return 0;
++}
++
++/*
++ * The routine scans buddy structures (not the bitmap!) from the given
++ * order up to the max order and tries to find a big enough chunk to
++ * satisfy the request
++ */
++static void ext3_mb_simple_scan_group(struct ext3_allocation_context *ac,
++ struct ext3_buddy *e3b)
++{
++ struct super_block *sb = ac->ac_sb;
++ struct ext3_group_info *grp = e3b->bd_info;
++ void *buddy;
++ int i, k, max;
++
++ J_ASSERT(ac->ac_2order > 0);
++ for (i = ac->ac_2order; i < sb->s_blocksize_bits + 1; i++) {
++ if (grp->bb_counters[i] == 0)
++ continue;
++
++ buddy = mb_find_buddy(e3b, i, &max);
++ if (buddy == NULL) {
++ printk(KERN_ALERT "looking for wrong order?\n");
++ break;
++ }
++
++ k = mb_find_next_zero_bit(buddy, max, 0);
++ J_ASSERT(k < max);
++
++ ac->ac_found++;
++
++ ac->ac_b_ex.fe_len = 1 << i;
++ ac->ac_b_ex.fe_start = k << i;
++ ac->ac_b_ex.fe_group = e3b->bd_group;
++
++ ext3_mb_use_best_found(ac, e3b);
++ J_ASSERT(ac->ac_b_ex.fe_len == ac->ac_g_ex.fe_len);
++
++ if (unlikely(ext3_mb_stats))
++ atomic_inc(&EXT3_SB(sb)->s_bal_2orders);
++
++ break;
++ }
++}
++
++/*
++ * The routine scans the group and measures all found extents.
++ * In order to optimize scanning, the number of free blocks in
++ * the group (bb_free) is used as the upper limit.
++ */
++static void ext3_mb_complex_scan_group(struct ext3_allocation_context *ac,
++ struct ext3_buddy *e3b)
++{
++ struct super_block *sb = ac->ac_sb;
++ void *bitmap = EXT3_MB_BITMAP(e3b);
++ struct ext3_free_extent ex;
++ int i, free;
++
++ free = e3b->bd_info->bb_free;
++ J_ASSERT(free > 0);
++
++ i = e3b->bd_info->bb_first_free;
++
++ while (free && ac->ac_status == AC_STATUS_CONTINUE) {
++ i = mb_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i);
++ if (i >= sb->s_blocksize * 8) {
++ J_ASSERT(free == 0);
++ break;
++ }
++
++ mb_find_extent(e3b, 0, i, ac->ac_g_ex.fe_len, &ex);
++ J_ASSERT(ex.fe_len > 0);
++ J_ASSERT(free >= ex.fe_len);
++
++ ext3_mb_measure_extent(ac, &ex, e3b);
++
++ i += ex.fe_len;
++ free -= ex.fe_len;
++ }
++}
++
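++/*
++ * Check whether a group is suitable at the given criterion level
++ * (cr 0 is the strictest, cr 3 the loosest):
++ * cr=0: the buddy counters show a free chunk of order >= ac_2order
++ * cr=1: the average free fragment is at least the goal length
++ * cr=2: the total free space covers the goal length
++ * cr=3: any group with free blocks will do
++ * Note the switch below falls through, so a group passing any
++ * looser criterion is accepted as well.
++ */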
++static int ext3_mb_good_group(struct ext3_allocation_context *ac,
++ int group, int cr)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb);
++ struct ext3_group_info *grp = sbi->s_group_info[group];
++ unsigned free, fragments, i, bits;
++
++ J_ASSERT(cr >= 0 && cr < 4);
++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(grp));
++
++ free = grp->bb_free;
++ fragments = grp->bb_fragments;
++ if (free == 0)
++ return 0;
++ if (fragments == 0)
++ return 0;
++
++ switch (cr) {
++ case 0:
++ J_ASSERT(ac->ac_2order != 0);
++ bits = ac->ac_sb->s_blocksize_bits + 1;
++ for (i = ac->ac_2order; i < bits; i++)
++ if (grp->bb_counters[i] > 0)
++ return 1;
++ case 1:
++ if ((free / fragments) >= ac->ac_g_ex.fe_len)
++ return 1;
++ case 2:
++ if (free >= ac->ac_g_ex.fe_len)
++ return 1;
++ case 3:
++ return 1;
++ default:
++ BUG();
++ }
++
++ return 0;
++}
++
++int ext3_mb_new_blocks(handle_t *handle, struct inode *inode,
++ unsigned long goal, int *len, int flags, int *errp)
++{
++ struct buffer_head *bitmap_bh = NULL;
++ struct ext3_allocation_context ac;
++ int i, group, block, cr, err = 0;
++ struct ext3_group_desc *gdp;
++ struct ext3_super_block *es;
++ struct buffer_head *gdp_bh;
++ struct ext3_sb_info *sbi;
++ struct super_block *sb;
++ struct ext3_buddy e3b;
++
++ J_ASSERT(len != NULL);
++ J_ASSERT(*len > 0);
++
++ sb = inode->i_sb;
++ if (!sb) {
++ printk("ext3_mb_new_nblocks: nonexistent device");
++ return 0;
++ }
++
++ if (!test_opt(sb, MBALLOC)) {
++ static int ext3_mballoc_warning = 0;
++ if (ext3_mballoc_warning == 0) {
++ printk(KERN_ERR "EXT3-fs: multiblock request with "
++ "mballoc disabled!\n");
++ ext3_mballoc_warning++;
++ }
++ *len = 1;
++ err = ext3_new_block_old(handle, inode, goal, errp);
++ return err;
++ }
++
++ ext3_mb_poll_new_transaction(sb, handle);
++
++ sbi = EXT3_SB(sb);
++ es = EXT3_SB(sb)->s_es;
++
++ /*
++ * We can't allocate > group size
++ */
++ if (*len >= EXT3_BLOCKS_PER_GROUP(sb) - 10)
++ *len = EXT3_BLOCKS_PER_GROUP(sb) - 10;
++
++ if (!(flags & EXT3_MB_HINT_RESERVED)) {
++ /* someone asks for non-reserved blocks */
++ BUG_ON(*len > 1);
++ err = ext3_mb_reserve_blocks(sb, 1);
++ if (err) {
++ *errp = err;
++ return 0;
++ }
++ }
++
++ /*
++ * Check quota for allocation of these blocks.
++ */
++ while (*len && DQUOT_ALLOC_BLOCK(inode, *len))
++ *len -= 1;
++ if (*len == 0) {
++ *errp = -EDQUOT;
++ block = 0;
++ goto out;
++ }
++
++ /* start searching from the goal */
++ if (goal < le32_to_cpu(es->s_first_data_block) ||
++ goal >= le32_to_cpu(es->s_blocks_count))
++ goal = le32_to_cpu(es->s_first_data_block);
++ group = (goal - le32_to_cpu(es->s_first_data_block)) /
++ EXT3_BLOCKS_PER_GROUP(sb);
++ block = ((goal - le32_to_cpu(es->s_first_data_block)) %
++ EXT3_BLOCKS_PER_GROUP(sb));
++
++ /* set up allocation goals */
++ ac.ac_b_ex.fe_group = 0;
++ ac.ac_b_ex.fe_start = 0;
++ ac.ac_b_ex.fe_len = 0;
++ ac.ac_status = AC_STATUS_CONTINUE;
++ ac.ac_groups_scanned = 0;
++ ac.ac_ex_scanned = 0;
++ ac.ac_found = 0;
++ ac.ac_sb = inode->i_sb;
++ ac.ac_g_ex.fe_group = group;
++ ac.ac_g_ex.fe_start = block;
++ ac.ac_g_ex.fe_len = *len;
++ ac.ac_flags = flags;
++ ac.ac_2order = 0;
++ ac.ac_criteria = 0;
++
++ /* probably, the request is for 2^8+ blocks (1/2/3/... MB) */
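++ /* e.g. *len == 256: ffs(256) == 9, i becomes 8,
++ * 256 & ~(1 << 8) == 0, so ac_2order = 8 */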
++ i = ffs(*len);
++ if (i >= 8) {
++ i--;
++ if ((*len & (~(1 << i))) == 0)
++ ac.ac_2order = i;
++ }
++
++ /* Sometimes, the caller may want to merge even a small
++ * number of blocks into an existing extent */
++ if (ac.ac_flags & EXT3_MB_HINT_MERGE) {
++ err = ext3_mb_find_by_goal(&ac, &e3b);
++ if (err)
++ goto out_err;
++ if (ac.ac_status == AC_STATUS_FOUND)
++ goto found;
++ }
++
++ /* Let's just scan groups to find more or less suitable blocks */
++ cr = ac.ac_2order ? 0 : 1;
++repeat:
++ for (; cr < 4 && ac.ac_status == AC_STATUS_CONTINUE; cr++) {
++ ac.ac_criteria = cr;
++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) {
++ if (group == EXT3_SB(sb)->s_groups_count)
++ group = 0;
++
++ if (EXT3_MB_GRP_NEED_INIT(sbi->s_group_info[group])) {
++ /* we need full data about the group
++ * to make a good selection */
++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b);
++ if (err)
++ goto out_err;
++ ext3_mb_release_desc(&e3b);
++ }
++
++ /* check whether the group is good for our criteria */
++ if (!ext3_mb_good_group(&ac, group, cr))
++ continue;
++
++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b);
++ if (err)
++ goto out_err;
++
++ ext3_lock_group(sb, group);
++ if (!ext3_mb_good_group(&ac, group, cr)) {
++ /* someone did allocation from this group */
++ ext3_unlock_group(sb, group);
++ ext3_mb_release_desc(&e3b);
++ continue;
++ }
++
++ ac.ac_groups_scanned++;
++ if (cr == 0)
++ ext3_mb_simple_scan_group(&ac, &e3b);
++ else
++ ext3_mb_complex_scan_group(&ac, &e3b);
++
++ ext3_unlock_group(sb, group);
++
++ ext3_mb_release_desc(&e3b);
++
++ if (err)
++ goto out_err;
++ if (ac.ac_status != AC_STATUS_CONTINUE)
++ break;
++ }
++ }
++
++ if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND &&
++ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) {
++ /*
++ * We've been searching too long. Let's try to allocate
++ * the best chunk we've found so far
++ */
++
++ /*if (ac.ac_found > ext3_mb_max_to_scan)
++ printk(KERN_ERR "EXT3-fs: too long searching at "
++ "%u (%d/%d)\n", cr, ac.ac_b_ex.fe_len,
++ ac.ac_g_ex.fe_len);*/
++ ext3_mb_try_best_found(&ac, &e3b);
++ if (ac.ac_status != AC_STATUS_FOUND) {
++ /*
++ * Someone luckier has already allocated it.
++ * The only thing we can do is just take the first
++ * block(s) we find
++ */
++ printk(KERN_ERR "EXT3-fs: and someone won our chunk\n");
++ ac.ac_b_ex.fe_group = 0;
++ ac.ac_b_ex.fe_start = 0;
++ ac.ac_b_ex.fe_len = 0;
++ ac.ac_status = AC_STATUS_CONTINUE;
++ ac.ac_flags |= EXT3_MB_HINT_FIRST;
++ cr = 3;
++ goto repeat;
++ }
++ }
++
++ if (ac.ac_status != AC_STATUS_FOUND) {
++ /*
++ * We definitely weren't lucky
++ */
++ DQUOT_FREE_BLOCK(inode, *len);
++ *errp = -ENOSPC;
++ block = 0;
++#if 1
++ printk(KERN_ERR "EXT3-fs: cant allocate: status %d, flags %d\n",
++ ac.ac_status, ac.ac_flags);
++ printk(KERN_ERR "EXT3-fs: goal %d, best found %d/%d/%d, cr %d\n",
++ ac.ac_g_ex.fe_len, ac.ac_b_ex.fe_group,
++ ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len, cr);
++ printk(KERN_ERR "EXT3-fs: %lu block reserved, %d found\n",
++ sbi->s_blocks_reserved, ac.ac_found);
++ printk("EXT3-fs: groups: ");
++ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++)
++ printk("%d: %d ", i,
++ sbi->s_group_info[i]->bb_free);
++ printk("\n");
++#endif
++ goto out;
++ }
++
++found:
++ J_ASSERT(ac.ac_b_ex.fe_len > 0);
++
++ /* good news - free block(s) have been found. now it's time
++ * to mark the block(s) in the good old journaled bitmap */
++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb)
++ + ac.ac_b_ex.fe_start
++ + le32_to_cpu(es->s_first_data_block);
++
++ /* we made a decision, now mark the found blocks in the good
++ * old bitmap to be journaled */
++
++ ext3_debug("using block group %d(%d)\n",
++ ac.ac_b_group.group, gdp->bg_free_blocks_count);
++
++ bitmap_bh = read_block_bitmap(sb, ac.ac_b_ex.fe_group);
++ if (!bitmap_bh) {
++ *errp = -EIO;
++ goto out_err;
++ }
++
++ err = ext3_journal_get_write_access(handle, bitmap_bh);
++ if (err) {
++ *errp = err;
++ goto out_err;
++ }
++
++ gdp = ext3_get_group_desc(sb, ac.ac_b_ex.fe_group, &gdp_bh);
++ if (!gdp) {
++ *errp = -EIO;
++ goto out_err;
++ }
++
++ err = ext3_journal_get_write_access(handle, gdp_bh);
++ if (err)
++ goto out_err;
++
++ block = ac.ac_b_ex.fe_group * EXT3_BLOCKS_PER_GROUP(sb)
++ + ac.ac_b_ex.fe_start
++ + le32_to_cpu(es->s_first_data_block);
++
++ if (block == le32_to_cpu(gdp->bg_block_bitmap) ||
++ block == le32_to_cpu(gdp->bg_inode_bitmap) ||
++ in_range(block, le32_to_cpu(gdp->bg_inode_table),
++ EXT3_SB(sb)->s_itb_per_group))
++ ext3_error(sb, "ext3_new_block",
++ "Allocating block in system zone - "
++ "block = %u", block);
++#ifdef AGGRESSIVE_CHECK
++ for (i = 0; i < ac.ac_b_ex.fe_len; i++)
++ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data));
++#endif
++ mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len);
++
++ spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group));
++ gdp->bg_free_blocks_count =
++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)
++ - ac.ac_b_ex.fe_len);
++ spin_unlock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group));
++ percpu_counter_mod(&sbi->s_freeblocks_counter, - ac.ac_b_ex.fe_len);
++
++ err = ext3_journal_dirty_metadata(handle, bitmap_bh);
++ if (err)
++ goto out_err;
++ err = ext3_journal_dirty_metadata(handle, gdp_bh);
++ if (err)
++ goto out_err;
++
++ sb->s_dirt = 1;
++ *errp = 0;
++ brelse(bitmap_bh);
++
++ /* drop non-allocated, but dquote'd blocks */
++ J_ASSERT(*len >= ac.ac_b_ex.fe_len);
++ DQUOT_FREE_BLOCK(inode, *len - ac.ac_b_ex.fe_len);
++
++ *len = ac.ac_b_ex.fe_len;
++ J_ASSERT(*len > 0);
++ J_ASSERT(block != 0);
++ goto out;
++
++out_err:
++ /* if we've already allocated something, roll it back */
++ if (ac.ac_status == AC_STATUS_FOUND) {
++ /* FIXME: free blocks here */
++ }
++
++ DQUOT_FREE_BLOCK(inode, *len);
++ brelse(bitmap_bh);
++ *errp = err;
++ block = 0;
++out:
++ if (!(flags & EXT3_MB_HINT_RESERVED)) {
++ /* the block wasn't reserved before and we reserved it
++ * at the beginning of the allocation. it doesn't matter
++ * whether we allocated anything or failed: it's time
++ * to release the reservation. NOTE: because I expect
++ * multiblock requests from the delayed allocation
++ * path only, this is always a single block */
++ ext3_mb_release_blocks(sb, 1);
++ }
++
++ if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) {
++ atomic_inc(&sbi->s_bal_reqs);
++ atomic_add(*len, &sbi->s_bal_allocated);
++ if (*len >= ac.ac_g_ex.fe_len)
++ atomic_inc(&sbi->s_bal_success);
++ atomic_add(ac.ac_found, &sbi->s_bal_ex_scanned);
++ if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start &&
++ ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group)
++ atomic_inc(&sbi->s_bal_goals);
++ if (ac.ac_found > ext3_mb_max_to_scan)
++ atomic_inc(&sbi->s_bal_breaks);
++ }
++
++ ext3_mb_store_history(sb, &ac);
++
++ return block;
++}
++EXPORT_SYMBOL(ext3_mb_new_blocks);
++
++#ifdef EXT3_MB_HISTORY
++struct ext3_mb_proc_session {
++ struct ext3_mb_history *history;
++ struct super_block *sb;
++ int start;
++ int max;
++};
++
++static void *ext3_mb_history_skip_empty(struct ext3_mb_proc_session *s,
++ struct ext3_mb_history *hs,
++ int first)
++{
++ if (hs == s->history + s->max)
++ hs = s->history;
++ if (!first && hs == s->history + s->start)
++ return NULL;
++ while (hs->goal.fe_len == 0) {
++ hs++;
++ if (hs == s->history + s->max)
++ hs = s->history;
++ if (hs == s->history + s->start)
++ return NULL;
++ }
++ return hs;
++}
++
++static void *ext3_mb_seq_history_start(struct seq_file *seq, loff_t *pos)
++{
++ struct ext3_mb_proc_session *s = seq->private;
++ struct ext3_mb_history *hs;
++ int l = *pos;
++
++ if (l == 0)
++ return SEQ_START_TOKEN;
++ hs = ext3_mb_history_skip_empty(s, s->history + s->start, 1);
++ if (!hs)
++ return NULL;
++ while (--l && (hs = ext3_mb_history_skip_empty(s, ++hs, 0)) != NULL);
++ return hs;
++}
++
++static void *ext3_mb_seq_history_next(struct seq_file *seq, void *v, loff_t *pos)
++{
++ struct ext3_mb_proc_session *s = seq->private;
++ struct ext3_mb_history *hs = v;
++
++ ++*pos;
++ if (v == SEQ_START_TOKEN)
++ return ext3_mb_history_skip_empty(s, s->history + s->start, 1);
++ else
++ return ext3_mb_history_skip_empty(s, ++hs, 0);
++}
++
++static int ext3_mb_seq_history_show(struct seq_file *seq, void *v)
++{
++ struct ext3_mb_history *hs = v;
++ char buf[20], buf2[20];
++
++ if (v == SEQ_START_TOKEN) {
++ seq_printf(seq, "%-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n",
++ "goal", "result", "found", "grps", "cr", "merge",
++ "tail", "broken");
++ return 0;
++ }
++
++ sprintf(buf, "%u/%u/%u", hs->goal.fe_group,
++ hs->goal.fe_start, hs->goal.fe_len);
++ sprintf(buf2, "%u/%u/%u", hs->result.fe_group,
++ hs->result.fe_start, hs->result.fe_len);
++ seq_printf(seq, "%-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", buf,
++ buf2, hs->found, hs->groups, hs->cr,
++ hs->merged ? "M" : "", hs->tail,
++ hs->buddy ? 1 << hs->buddy : 0);
++ return 0;
++}
++
++static void ext3_mb_seq_history_stop(struct seq_file *seq, void *v)
++{
++}
++
++static struct seq_operations ext3_mb_seq_history_ops = {
++ .start = ext3_mb_seq_history_start,
++ .next = ext3_mb_seq_history_next,
++ .stop = ext3_mb_seq_history_stop,
++ .show = ext3_mb_seq_history_show,
++};
++
++static int ext3_mb_seq_history_open(struct inode *inode, struct file *file)
++{
++ struct super_block *sb = PDE(inode)->data;
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ struct ext3_mb_proc_session *s;
++ int rc, size;
++
++ s = kmalloc(sizeof(*s), GFP_KERNEL);
++ if (s == NULL)
++ return -ENOMEM;
++ size = sizeof(struct ext3_mb_history) * sbi->s_mb_history_max;
++ s->history = kmalloc(size, GFP_KERNEL);
++ if (s->history == NULL) {
++ kfree(s);
++ return -ENOMEM;
++ }
++
++ spin_lock(&sbi->s_mb_history_lock);
++ memcpy(s->history, sbi->s_mb_history, size);
++ s->max = sbi->s_mb_history_max;
++ s->start = sbi->s_mb_history_cur % s->max;
++ spin_unlock(&sbi->s_mb_history_lock);
++
++ rc = seq_open(file, &ext3_mb_seq_history_ops);
++ if (rc == 0) {
++ struct seq_file *m = (struct seq_file *)file->private_data;
++ m->private = s;
++ } else {
++ kfree(s->history);
++ kfree(s);
++ }
++ return rc;
++
++}
++
++static int ext3_mb_seq_history_release(struct inode *inode, struct file *file)
++{
++ struct seq_file *seq = (struct seq_file *)file->private_data;
++ struct ext3_mb_proc_session *s = seq->private;
++ kfree(s->history);
++ kfree(s);
++ return seq_release(inode, file);
++}
++
++static struct file_operations ext3_mb_seq_history_fops = {
++ .owner = THIS_MODULE,
++ .open = ext3_mb_seq_history_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = ext3_mb_seq_history_release,
++};
++
++static void ext3_mb_history_release(struct super_block *sb)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ char name[64];
++
++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name));
++ remove_proc_entry("mb_history", sbi->s_mb_proc);
++ remove_proc_entry(name, proc_root_ext3);
++
++ if (sbi->s_mb_history)
++ kfree(sbi->s_mb_history);
++}
++
++static void ext3_mb_history_init(struct super_block *sb)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ char name[64];
++ int i;
++
++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name));
++ sbi->s_mb_proc = proc_mkdir(name, proc_root_ext3);
++ if (sbi->s_mb_proc != NULL) {
++ struct proc_dir_entry *p;
++ p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc);
++ if (p) {
++ p->proc_fops = &ext3_mb_seq_history_fops;
++ p->data = sb;
++ }
++ }
++
++ sbi->s_mb_history_max = 1000;
++ sbi->s_mb_history_cur = 0;
++ spin_lock_init(&sbi->s_mb_history_lock);
++ i = sbi->s_mb_history_max * sizeof(struct ext3_mb_history);
++ sbi->s_mb_history = kmalloc(i, GFP_KERNEL);
++ if (sbi->s_mb_history != NULL)
++ memset(sbi->s_mb_history, 0, i);
++ /* if we can't allocate the history, then we simply won't use it */
++}
++
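++/*
++ * s_mb_history is a fixed-size ring buffer: s_mb_history_cur wraps at
++ * s_mb_history_max, so the proc reader above starts at the oldest
++ * entry (cur % max) and skips slots that have never been filled.
++ */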
++static void
++ext3_mb_store_history(struct super_block *sb, struct ext3_allocation_context *ac)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ struct ext3_mb_history h;
++
++ if (likely(sbi->s_mb_history == NULL))
++ return;
++
++ h.goal = ac->ac_g_ex;
++ h.result = ac->ac_b_ex;
++ h.found = ac->ac_found;
++ h.cr = ac->ac_criteria;
++ h.groups = ac->ac_groups_scanned;
++ h.tail = ac->ac_tail;
++ h.buddy = ac->ac_buddy;
++ h.merged = 0;
++ if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
++ ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
++ h.merged = 1;
++
++ spin_lock(&sbi->s_mb_history_lock);
++ memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h));
++ if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max)
++ sbi->s_mb_history_cur = 0;
++ spin_unlock(&sbi->s_mb_history_lock);
++}
++
++#else
++#define ext3_mb_history_release(sb)
++#define ext3_mb_history_init(sb)
++#endif
++
++int ext3_mb_init_backend(struct super_block *sb)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ int i, len;
++
++ len = sizeof(struct ext3_group_info *) * sbi->s_groups_count;
++ sbi->s_group_info = kmalloc(len, GFP_KERNEL);
++ if (sbi->s_group_info == NULL) {
++ printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n");
++ return -ENOMEM;
++ }
++ memset(sbi->s_group_info, 0, len);
++
++ sbi->s_buddy_cache = new_inode(sb);
++ if (sbi->s_buddy_cache == NULL) {
++ printk(KERN_ERR "EXT3-fs: can't get new inode\n");
++ kfree(sbi->s_group_info);
++ return -ENOMEM;
++ }
++
++ /*
++ * calculate the needed size. if you change the size of bb_counters,
++ * don't forget to update ext3_mb_generate_buddy()
++ */
++ len = sizeof(struct ext3_group_info);
++ len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
++ for (i = 0; i < sbi->s_groups_count; i++) {
++ struct ext3_group_desc * desc;
++
++ sbi->s_group_info[i] = kmalloc(len, GFP_KERNEL);
++ if (sbi->s_group_info[i] == NULL) {
++ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n");
++ goto err_out;
++ }
++ desc = ext3_get_group_desc(sb, i, NULL);
++ if (desc == NULL) {
++ printk(KERN_ERR "EXT3-fs: cant read descriptor %u\n", i);
++ goto err_out;
++ }
++ memset(sbi->s_group_info[i], 0, len);
++ set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT,
++ &sbi->s_group_info[i]->bb_state);
++ sbi->s_group_info[i]->bb_free =
++ le16_to_cpu(desc->bg_free_blocks_count);
++ }
++
++ return 0;
++
++err_out:
++ while (--i >= 0)
++ kfree(sbi->s_group_info[i]);
++ iput(sbi->s_buddy_cache);
++
++ return -ENOMEM;
++}
++
++int ext3_mb_init(struct super_block *sb, int needs_recovery)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ struct inode *root = sb->s_root->d_inode;
++ unsigned i, offset, max;
++ struct dentry *dentry;
++
++ if (!test_opt(sb, MBALLOC))
++ return 0;
++
++ i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);
++
++ sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
++ if (sbi->s_mb_offsets == NULL) {
++ clear_opt(sbi->s_mount_opt, MBALLOC);
++ return -ENOMEM;
++ }
++ sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
++ if (sbi->s_mb_maxs == NULL) {
++ clear_opt(sbi->s_mount_opt, MBALLOC);
++ kfree(sbi->s_mb_maxs);
++ return -ENOMEM;
++ }
++
++ /* order 0 is regular bitmap */
++ sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
++ sbi->s_mb_offsets[0] = 0;
++
++ i = 1;
++ offset = 0;
++ max = sb->s_blocksize << 2;
++ do {
++ sbi->s_mb_offsets[i] = offset;
++ sbi->s_mb_maxs[i] = max;
++ offset += 1 << (sb->s_blocksize_bits - i);
++ max = max >> 1;
++ i++;
++ } while (i <= sb->s_blocksize_bits + 1);
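++
++ /*
++ * Layout example (assuming 4KB blocks, s_blocksize_bits == 12):
++ * order 0: the on-disk block bitmap itself, 32768 bits;
++ * order 1: offset 0, 16384 bits (bytes 0..2047 of the buddy block);
++ * order 2: offset 2048, 8192 bits;
++ * order 3: offset 3072, 4096 bits; ... down to order 13 with 4 bits.
++ * All orders together fit into a single 4KB buddy block.
++ */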
++
++
++ /* init file for buddy data */
++ if ((i = ext3_mb_init_backend(sb))) {
++ clear_opt(sbi->s_mount_opt, MBALLOC);
++ kfree(sbi->s_mb_offsets);
++ kfree(sbi->s_mb_maxs);
++ return i;
++ }
++
++ spin_lock_init(&sbi->s_reserve_lock);
++ spin_lock_init(&sbi->s_md_lock);
++ INIT_LIST_HEAD(&sbi->s_active_transaction);
++ INIT_LIST_HEAD(&sbi->s_closed_transaction);
++ INIT_LIST_HEAD(&sbi->s_committed_transaction);
++ spin_lock_init(&sbi->s_bal_lock);
++
++ /* remove old on-disk buddy file */
++ down(&root->i_sem);
++ dentry = lookup_one_len(".buddy", sb->s_root, strlen(".buddy"));
++ if (!IS_ERR(dentry)) {
++ if (dentry->d_inode != NULL) {
++ i = vfs_unlink(root, dentry);
++ if (i != 0)
++ printk(KERN_ERR "EXT3-fs: can't remove .buddy file: %d\n", i);
++ }
++ dput(dentry);
++ }
++ up(&root->i_sem);
++
++ ext3_mb_history_init(sb);
++
++ printk("EXT3-fs: mballoc enabled\n");
++ return 0;
++}
++
++int ext3_mb_release(struct super_block *sb)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ int i;
++
++ if (!test_opt(sb, MBALLOC))
++ return 0;
++
++ /* release freed, non-committed blocks */
++ spin_lock(&sbi->s_md_lock);
++ list_splice_init(&sbi->s_closed_transaction,
++ &sbi->s_committed_transaction);
++ list_splice_init(&sbi->s_active_transaction,
++ &sbi->s_committed_transaction);
++ spin_unlock(&sbi->s_md_lock);
++ ext3_mb_free_committed_blocks(sb);
++
++ if (sbi->s_group_info) {
++ for (i = 0; i < sbi->s_groups_count; i++) {
++ if (sbi->s_group_info[i] == NULL)
++ continue;
++ kfree(sbi->s_group_info[i]);
++ }
++ kfree(sbi->s_group_info);
++ }
++ if (sbi->s_mb_offsets)
++ kfree(sbi->s_mb_offsets);
++ if (sbi->s_mb_maxs)
++ kfree(sbi->s_mb_maxs);
++ if (sbi->s_buddy_cache)
++ iput(sbi->s_buddy_cache);
++ if (sbi->s_blocks_reserved)
++ printk("ext3-fs: %ld blocks being reserved at umount!\n",
++ sbi->s_blocks_reserved);
++ if (ext3_mb_stats) {
++ printk("EXT3-fs: mballoc: %u blocks %u reqs (%u success)\n",
++ atomic_read(&sbi->s_bal_allocated),
++ atomic_read(&sbi->s_bal_reqs),
++ atomic_read(&sbi->s_bal_success));
++ printk("EXT3-fs: mballoc: %u extents scanned, %u goal hits, "
++ "%u 2^N hits, %u breaks\n",
++ atomic_read(&sbi->s_bal_ex_scanned),
++ atomic_read(&sbi->s_bal_goals),
++ atomic_read(&sbi->s_bal_2orders),
++ atomic_read(&sbi->s_bal_breaks));
++ printk("EXT3-fs: mballoc: %lu generated and it took %Lu\n",
++ sbi->s_mb_buddies_generated++,
++ sbi->s_mb_generation_time);
++ }
++
++ ext3_mb_history_release(sb);
++
++ return 0;
++}
++
++void ext3_mb_free_committed_blocks(struct super_block *sb)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ int err, i, count = 0, count2 = 0;
++ struct ext3_free_metadata *md;
++ struct ext3_buddy e3b;
++
++ if (list_empty(&sbi->s_committed_transaction))
++ return;
++
++ /* there are committed blocks still to be freed */
++ do {
++ /* get next array of blocks */
++ md = NULL;
++ spin_lock(&sbi->s_md_lock);
++ if (!list_empty(&sbi->s_committed_transaction)) {
++ md = list_entry(sbi->s_committed_transaction.next,
++ struct ext3_free_metadata, list);
++ list_del(&md->list);
++ }
++ spin_unlock(&sbi->s_md_lock);
++
++ if (md == NULL)
++ break;
++
++ mb_debug("gonna free %u blocks in group %u (0x%p):",
++ md->num, md->group, md);
++
++ err = ext3_mb_load_buddy(sb, md->group, &e3b);
++ BUG_ON(err != 0);
++
++ /* there are blocks to put in buddy to make them really free */
++ count += md->num;
++ count2++;
++ ext3_lock_group(sb, md->group);
++ for (i = 0; i < md->num; i++) {
++ mb_debug(" %u", md->blocks[i]);
++ mb_free_blocks(&e3b, md->blocks[i], 1);
++ }
++ mb_debug("\n");
++ ext3_unlock_group(sb, md->group);
++
++ /* balance refcounts from ext3_mb_free_metadata() */
++ page_cache_release(e3b.bd_buddy_page);
++ page_cache_release(e3b.bd_bitmap_page);
++
++ kfree(md);
++ ext3_mb_release_desc(&e3b);
++
++ } while (md);
++ mb_debug("freed %u blocks in %u structures\n", count, count2);
++}
++
++void ext3_mb_poll_new_transaction(struct super_block *sb, handle_t *handle)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++
++ if (sbi->s_last_transaction == handle->h_transaction->t_tid)
++ return;
++
++ /* a new transaction has started: it is time to close the last
++ * one and free the blocks of the committed transaction. only one
++ * transaction can be active at a time, so the previous one may
++ * still be committing, but the transaction before that is known
++ * to be committed already. this means we may now free the blocks
++ * freed in all transactions before the previous one. */
++
++ spin_lock(&sbi->s_md_lock);
++ if (sbi->s_last_transaction != handle->h_transaction->t_tid) {
++ mb_debug("new transaction %lu, old %lu\n",
++ (unsigned long) handle->h_transaction->t_tid,
++ (unsigned long) sbi->s_last_transaction);
++ list_splice_init(&sbi->s_closed_transaction,
++ &sbi->s_committed_transaction);
++ list_splice_init(&sbi->s_active_transaction,
++ &sbi->s_closed_transaction);
++ sbi->s_last_transaction = handle->h_transaction->t_tid;
++ }
++ spin_unlock(&sbi->s_md_lock);
++
++ ext3_mb_free_committed_blocks(sb);
++}
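++
++/*
++ * The three lists form a pipeline keyed by transaction id: blocks
++ * freed during T(n) sit on s_active_transaction; when T(n+1) starts
++ * they move to s_closed_transaction, and when T(n+2) starts they move
++ * to s_committed_transaction and are returned to the buddy. A freed
++ * block is thus reused only once the freeing transaction is on disk.
++ */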
++
++int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b,
++ int group, int block, int count)
++{
++ struct ext3_group_info *db = e3b->bd_info;
++ struct super_block *sb = e3b->bd_sb;
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ struct ext3_free_metadata *md;
++ int i;
++
++ J_ASSERT(e3b->bd_bitmap_page != NULL);
++ J_ASSERT(e3b->bd_buddy_page != NULL);
++
++ ext3_lock_group(sb, group);
++ for (i = 0; i < count; i++) {
++ md = db->bb_md_cur;
++ if (md && db->bb_tid != handle->h_transaction->t_tid) {
++ db->bb_md_cur = NULL;
++ md = NULL;
++ }
++
++ if (md == NULL) {
++ ext3_unlock_group(sb, group);
++ md = kmalloc(sizeof(*md), GFP_KERNEL);
++ if (md == NULL)
++ return -ENOMEM;
++ md->num = 0;
++ md->group = group;
++
++ ext3_lock_group(sb, group);
++ if (db->bb_md_cur == NULL) {
++ spin_lock(&sbi->s_md_lock);
++ list_add(&md->list, &sbi->s_active_transaction);
++ spin_unlock(&sbi->s_md_lock);
++ /* protect buddy cache from being freed,
++ * otherwise we'll refresh it from
++ * on-disk bitmap and lose not-yet-available
++ * blocks */
++ page_cache_get(e3b->bd_buddy_page);
++ page_cache_get(e3b->bd_bitmap_page);
++ db->bb_md_cur = md;
++ db->bb_tid = handle->h_transaction->t_tid;
++ mb_debug("new md 0x%p for group %u\n",
++ md, md->group);
++ } else {
++ kfree(md);
++ md = db->bb_md_cur;
++ }
++ }
++
++ BUG_ON(md->num >= EXT3_BB_MAX_BLOCKS);
++ md->blocks[md->num] = block + i;
++ md->num++;
++ if (md->num == EXT3_BB_MAX_BLOCKS) {
++ /* no more space, put full container on a sb's list */
++ db->bb_md_cur = NULL;
++ }
++ }
++ ext3_unlock_group(sb, group);
++ return 0;
++}
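++
++/*
++ * Each ext3_free_metadata container collects up to EXT3_BB_MAX_BLOCKS
++ * block numbers freed in one group within one transaction; once it
++ * fills up (or a new transaction starts) bb_md_cur is dropped and the
++ * next free allocates a fresh container.
++ */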
++
++void ext3_mb_free_blocks(handle_t *handle, struct inode *inode,
++ unsigned long block, unsigned long count,
++ int metadata, int *freed)
++{
++ struct buffer_head *bitmap_bh = NULL;
++ struct ext3_group_desc *gdp;
++ struct ext3_super_block *es;
++ unsigned long bit, overflow;
++ struct buffer_head *gd_bh;
++ unsigned long block_group;
++ struct ext3_sb_info *sbi;
++ struct super_block *sb;
++ struct ext3_buddy e3b;
++ int err = 0, ret;
++
++ *freed = 0;
++ sb = inode->i_sb;
++ if (!sb) {
++ printk ("ext3_free_blocks: nonexistent device");
++ return;
++ }
++
++ ext3_mb_poll_new_transaction(sb, handle);
++
++ sbi = EXT3_SB(sb);
++ es = EXT3_SB(sb)->s_es;
++ if (block < le32_to_cpu(es->s_first_data_block) ||
++ block + count < block ||
++ block + count > le32_to_cpu(es->s_blocks_count)) {
++ ext3_error (sb, "ext3_free_blocks",
++ "Freeing blocks not in datazone - "
++ "block = %lu, count = %lu", block, count);
++ goto error_return;
++ }
++
++ ext3_debug("freeing block %lu\n", block);
++
++do_more:
++ overflow = 0;
++ block_group = (block - le32_to_cpu(es->s_first_data_block)) /
++ EXT3_BLOCKS_PER_GROUP(sb);
++ bit = (block - le32_to_cpu(es->s_first_data_block)) %
++ EXT3_BLOCKS_PER_GROUP(sb);
++ /*
++ * Check to see if we are freeing blocks across a group
++ * boundary.
++ */
++ if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) {
++ overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb);
++ count -= overflow;
++ }
++ brelse(bitmap_bh);
++ bitmap_bh = read_block_bitmap(sb, block_group);
++ if (!bitmap_bh)
++ goto error_return;
++ gdp = ext3_get_group_desc (sb, block_group, &gd_bh);
++ if (!gdp)
++ goto error_return;
++
++ if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) ||
++ in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) ||
++ in_range (block, le32_to_cpu(gdp->bg_inode_table),
++ EXT3_SB(sb)->s_itb_per_group) ||
++ in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table),
++ EXT3_SB(sb)->s_itb_per_group))
++ ext3_error (sb, "ext3_free_blocks",
++ "Freeing blocks in system zones - "
++ "Block = %lu, count = %lu",
++ block, count);
++
++ BUFFER_TRACE(bitmap_bh, "getting write access");
++ err = ext3_journal_get_write_access(handle, bitmap_bh);
++ if (err)
++ goto error_return;
++
++ /*
++ * We are about to modify some metadata. Call the journal APIs
++ * to unshare ->b_data if a currently-committing transaction is
++ * using it
++ */
++ BUFFER_TRACE(gd_bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, gd_bh);
++ if (err)
++ goto error_return;
++
++ err = ext3_mb_load_buddy(sb, block_group, &e3b);
++ if (err)
++ goto error_return;
++
++#ifdef AGGRESSIVE_CHECK
++ {
++ int i;
++ for (i = 0; i < count; i++)
++ J_ASSERT(mb_test_bit(bit + i, bitmap_bh->b_data));
++ }
++#endif
++ mb_clear_bits(bitmap_bh->b_data, bit, count);
++
++ /* We dirtied the bitmap block */
++ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
++ err = ext3_journal_dirty_metadata(handle, bitmap_bh);
++
++ if (metadata) {
++ /* blocks being freed are metadata. these blocks shouldn't
++ * be used until this transaction is committed */
++ ext3_mb_free_metadata(handle, &e3b, block_group, bit, count);
++ } else {
++ ext3_lock_group(sb, block_group);
++ mb_free_blocks(&e3b, bit, count);
++ ext3_unlock_group(sb, block_group);
++ }
++
++ spin_lock(sb_bgl_lock(sbi, block_group));
++ gdp->bg_free_blocks_count =
++ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count);
++ spin_unlock(sb_bgl_lock(sbi, block_group));
++ percpu_counter_mod(&sbi->s_freeblocks_counter, count);
++
++ ext3_mb_release_desc(&e3b);
++
++ *freed = count;
++
++ /* And the group descriptor block */
++ BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
++ ret = ext3_journal_dirty_metadata(handle, gd_bh);
++ if (!err) err = ret;
++
++ if (overflow && !err) {
++ block += count;
++ count = overflow;
++ goto do_more;
++ }
++ sb->s_dirt = 1;
++error_return:
++ brelse(bitmap_bh);
++ ext3_std_error(sb, err);
++ return;
++}
++
++int ext3_mb_reserve_blocks(struct super_block *sb, int blocks)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ int free, ret = -ENOSPC;
++
++ BUG_ON(blocks < 0);
++ spin_lock(&sbi->s_reserve_lock);
++ free = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
++ if (blocks <= free - sbi->s_blocks_reserved) {
++ sbi->s_blocks_reserved += blocks;
++ ret = 0;
++ }
++ spin_unlock(&sbi->s_reserve_lock);
++ return ret;
++}
++
++void ext3_mb_release_blocks(struct super_block *sb, int blocks)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++
++ BUG_ON(blocks < 0);
++ spin_lock(&sbi->s_reserve_lock);
++ sbi->s_blocks_reserved -= blocks;
++ WARN_ON(sbi->s_blocks_reserved < 0);
++ if (sbi->s_blocks_reserved < 0)
++ sbi->s_blocks_reserved = 0;
++ spin_unlock(&sbi->s_reserve_lock);
++}
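++
++/*
++ * Callers are expected to pair the two, e.g. (sketch):
++ *
++ * if (ext3_mb_reserve_blocks(sb, nblocks) == 0) {
++ * ... allocate and use up to nblocks blocks ...
++ * ext3_mb_release_blocks(sb, nblocks);
++ * }
++ *
++ * so that s_blocks_reserved drops back to zero before umount.
++ */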
++
++int ext3_new_block(handle_t *handle, struct inode *inode,
++ unsigned long goal, int *errp)
++{
++ int ret, len;
++
++ if (!test_opt(inode->i_sb, MBALLOC)) {
++ ret = ext3_new_block_old(handle, inode, goal, errp);
++ goto out;
++ }
++ len = 1;
++ ret = ext3_mb_new_blocks(handle, inode, goal, &len, 0, errp);
++out:
++ return ret;
++}
++
++
++void ext3_free_blocks(handle_t *handle, struct inode * inode,
++ unsigned long block, unsigned long count, int metadata)
++{
++ struct super_block *sb;
++ int freed;
++
++ sb = inode->i_sb;
++ if (!test_opt(sb, MBALLOC))
++ ext3_free_blocks_sb(handle, sb, block, count, &freed);
++ else
++ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed);
++ if (freed)
++ DQUOT_FREE_BLOCK(inode, freed);
++ return;
++}
++
++#define EXT3_ROOT "ext3"
++#define EXT3_MB_STATS_NAME "mb_stats"
++#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan"
++#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan"
++
++static int ext3_mb_stats_read(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
++{
++ int len;
++
++ *eof = 1;
++ if (off != 0)
++ return 0;
++
++ len = sprintf(page, "%ld\n", ext3_mb_stats);
++ *start = page;
++ return len;
++}
++
++static int ext3_mb_stats_write(struct file *file, const char *buffer,
++ unsigned long count, void *data)
++{
++ char str[32];
++
++ if (count >= sizeof(str)) {
++ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n",
++ EXT3_MB_STATS_NAME, (int)sizeof(str));
++ return -EOVERFLOW;
++ }
++
++ if (copy_from_user(str, buffer, count))
++ return -EFAULT;
++
++ /* store 0 or 1: zero stays zero, any non-zero value becomes 1 */
++ ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0);
++ return count;
++}
++
++static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
++{
++ int len;
++
++ *eof = 1;
++ if (off != 0)
++ return 0;
++
++ len = sprintf(page, "%ld\n", ext3_mb_max_to_scan);
++ *start = page;
++ return len;
++}
++
++static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer,
++ unsigned long count, void *data)
++{
++ char str[32];
++ long value;
++
++ if (count >= sizeof(str)) {
++ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n",
++ EXT3_MB_MAX_TO_SCAN_NAME, (int)sizeof(str));
++ return -EOVERFLOW;
++ }
++
++ if (copy_from_user(str, buffer, count))
++ return -EFAULT;
++
++ /* accept only positive values */
++ value = simple_strtol(str, NULL, 0);
++ if (value <= 0)
++ return -ERANGE;
++
++ ext3_mb_max_to_scan = value;
++
++ return count;
++}
++
++static int ext3_mb_min_to_scan_read(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
++{
++ int len;
++
++ *eof = 1;
++ if (off != 0)
++ return 0;
++
++ len = sprintf(page, "%ld\n", ext3_mb_min_to_scan);
++ *start = page;
++ return len;
++}
++
++static int ext3_mb_min_to_scan_write(struct file *file, const char *buffer,
++ unsigned long count, void *data)
++{
++ char str[32];
++ long value;
++
++ if (count >= sizeof(str)) {
++ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n",
++ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str));
++ return -EOVERFLOW;
++ }
++
++ if (copy_from_user(str, buffer, count))
++ return -EFAULT;
++
++ /* accept only positive values */
++ value = simple_strtol(str, NULL, 0);
++ if (value <= 0)
++ return -ERANGE;
++
++ ext3_mb_min_to_scan = value;
++
++ return count;
++}
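++
++/*
++ * The tunables above are exposed under /proc/fs/ext3/, e.g.:
++ *
++ * cat /proc/fs/ext3/mb_stats
++ * echo 200 > /proc/fs/ext3/mb_max_to_scan
++ * echo 20 > /proc/fs/ext3/mb_min_to_scan
++ */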
++
++int __init init_ext3_proc(void)
++{
++ struct proc_dir_entry *proc_ext3_mb_stats;
++ struct proc_dir_entry *proc_ext3_mb_max_to_scan;
++ struct proc_dir_entry *proc_ext3_mb_min_to_scan;
++
++ proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs);
++ if (proc_root_ext3 == NULL) {
++ printk(KERN_ERR "EXT3: Unable to create %s\n", EXT3_ROOT);
++ return -EIO;
++ }
++
++ /* Initialize EXT3_MB_STATS_NAME */
++ proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME,
++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++ if (proc_ext3_mb_stats == NULL) {
++ printk(KERN_ERR "EXT3: Unable to create %s\n",
++ EXT3_MB_STATS_NAME);
++ remove_proc_entry(EXT3_ROOT, proc_root_fs);
++ return -EIO;
++ }
++
++ proc_ext3_mb_stats->data = NULL;
++ proc_ext3_mb_stats->read_proc = ext3_mb_stats_read;
++ proc_ext3_mb_stats->write_proc = ext3_mb_stats_write;
++
++ /* Initialize EXT3_MAX_TO_SCAN_NAME */
++ proc_ext3_mb_max_to_scan = create_proc_entry(
++ EXT3_MB_MAX_TO_SCAN_NAME,
++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++ if (proc_ext3_mb_max_to_scan == NULL) {
++ printk(KERN_ERR "EXT3: Unable to create %s\n",
++ EXT3_MB_MAX_TO_SCAN_NAME);
++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_ROOT, proc_root_fs);
++ return -EIO;
++ }
++
++ proc_ext3_mb_max_to_scan->data = NULL;
++ proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read;
++ proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write;
++
++ /* Initialize EXT3_MIN_TO_SCAN_NAME */
++ proc_ext3_mb_min_to_scan = create_proc_entry(
++ EXT3_MB_MIN_TO_SCAN_NAME,
++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++ if (proc_ext3_mb_min_to_scan == NULL) {
++ printk(KERN_ERR "EXT3: Unable to create %s\n",
++ EXT3_MB_MIN_TO_SCAN_NAME);
++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_ROOT, proc_root_fs);
++ return -EIO;
++ }
++
++ proc_ext3_mb_min_to_scan->data = NULL;
++ proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read;
++ proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write;
++
++ return 0;
++}
++
++void exit_ext3_proc(void)
++{
++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_ROOT, proc_root_fs);
++}
++
+Index: linux-2.6.12.6/fs/ext3/Makefile
+===================================================================
+--- linux-2.6.12.6.orig/fs/ext3/Makefile 2005-12-17 02:17:16.000000000 +0300
++++ linux-2.6.12.6/fs/ext3/Makefile 2005-12-17 02:21:21.000000000 +0300
+@@ -5,7 +5,8 @@
+ obj-$(CONFIG_EXT3_FS) += ext3.o
+
+ ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o\
+- ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o
++ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
++ mballoc.o
+
+ ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
+ ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
-Index: linux-2.6.9/include/linux/ext3_fs_sb.h
+Index: linux-2.6.9-full/include/linux/ext3_fs.h
===================================================================
---- linux-2.6.9.orig/include/linux/ext3_fs_sb.h 2005-10-14 09:10:05.000000000 +0400
-+++ linux-2.6.9/include/linux/ext3_fs_sb.h 2005-10-14 09:10:13.000000000 +0400
-@@ -23,10 +23,30 @@
- #define EXT_INCLUDE
- #include <linux/blockgroup_lock.h>
- #include <linux/percpu_counter.h>
-+#include <linux/list.h>
- #endif
- #endif
- #include <linux/rbtree.h>
-
-+#define EXT3_BB_MAX_BLOCKS 30
-+struct ext3_free_metadata {
-+ unsigned short group;
-+ unsigned short num;
-+ unsigned short blocks[EXT3_BB_MAX_BLOCKS];
-+ struct list_head list;
-+};
-+
-+struct ext3_buddy_group_blocks {
-+ __u32 bb_bitmap;
-+ __u32 bb_buddy;
-+ spinlock_t bb_lock;
-+ unsigned long bb_tid;
-+ struct ext3_free_metadata *bb_md_cur;
-+ unsigned short bb_first_free;
-+ unsigned short bb_free;
-+ unsigned bb_counters[];
-+};
-+
- /*
- * third extended-fs super-block data in memory
- */
-@@ -81,6 +101,27 @@
- char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
- int s_jquota_fmt; /* Format of quota to use */
- #endif
-+
-+ /* for buddy allocator */
-+ struct ext3_buddy_group_blocks **s_buddy_blocks;
-+ struct inode *s_buddy;
-+ long s_blocks_reserved;
-+ spinlock_t s_reserve_lock;
-+ struct list_head s_active_transaction;
-+ struct list_head s_closed_transaction;
-+ struct list_head s_committed_transaction;
-+ spinlock_t s_md_lock;
-+ tid_t s_last_transaction;
-+ int s_mb_factor;
-+
-+ /* stats for buddy allocator */
-+ spinlock_t s_bal_lock;
-+ unsigned long s_bal_reqs; /* number of reqs with len > 1 */
-+ unsigned long s_bal_success; /* we found long enough chunks */
-+ unsigned long s_bal_allocated; /* in blocks */
-+ unsigned long s_bal_ex_scanned; /* total extents scanned */
-+ unsigned long s_bal_goals; /* goal hits */
-+ unsigned long s_bal_breaks; /* too long searches */
- };
-
- #endif /* _LINUX_EXT3_FS_SB */
-Index: linux-2.6.9/include/linux/ext3_fs.h
-===================================================================
---- linux-2.6.9.orig/include/linux/ext3_fs.h 2005-10-14 09:10:12.000000000 +0400
-+++ linux-2.6.9/include/linux/ext3_fs.h 2005-10-14 09:10:31.000000000 +0400
-@@ -57,6 +57,14 @@
+--- linux-2.6.9-full.orig/include/linux/ext3_fs.h 2005-12-16 23:16:41.000000000 +0300
++++ linux-2.6.9-full/include/linux/ext3_fs.h 2005-12-16 23:16:42.000000000 +0300
+@@ -57,6 +57,14 @@ struct statfs;
#define ext3_debug(f, a...) do {} while (0)
#endif
/*
* Special inodes numbers
*/
-@@ -365,6 +373,7 @@
+@@ -365,6 +373,7 @@ struct ext3_inode {
#define EXT3_MOUNT_IOPEN_NOPRIV 0x80000 /* Make iopen world-readable */
#define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */
#define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */
-+#define EXT3_MOUNT_MBALLOC 0x400000/* Buddy allocation support */
++#define EXT3_MOUNT_MBALLOC 0x800000/* Buddy allocation support */
/* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
#ifndef clear_opt
-@@ -726,7 +735,7 @@
+@@ -726,7 +735,7 @@ extern int ext3_bg_has_super(struct supe
extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *);
extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long,
extern void ext3_free_blocks_sb (handle_t *, struct super_block *,
unsigned long, unsigned long, int *);
extern unsigned long ext3_count_free_blocks (struct super_block *);
-@@ -857,6 +866,44 @@
+@@ -857,6 +866,17 @@ extern void ext3_extents_initialize_bloc
extern int ext3_ext_ioctl(struct inode *inode, struct file *filp,
unsigned int cmd, unsigned long arg);
+/* mballoc.c */
-+extern long ext3_mb_aggressive;
+extern long ext3_mb_stats;
+extern long ext3_mb_max_to_scan;
+extern int ext3_mb_init(struct super_block *, int);
+extern int ext3_mb_new_blocks(handle_t *, struct inode *, unsigned long, int *, int, int *);
+extern int ext3_mb_reserve_blocks(struct super_block *, int);
+extern void ext3_mb_release_blocks(struct super_block *, int);
-+
-+/* writeback.c */
-+extern int ext3_wb_writepages(struct address_space *, struct writeback_control *);
-+extern int ext3_wb_prepare_write(struct file *file, struct page *page,
-+ unsigned from, unsigned to);
-+extern int ext3_wb_commit_write(struct file *, struct page *, unsigned, unsigned);
-+extern int ext3_wb_writepage(struct page *, struct writeback_control *);
-+extern int ext3_wb_invalidatepage(struct page *, unsigned long);
-+extern int ext3_wb_releasepage(struct page *, int);
-+extern int ext3_wb_block_truncate_page(handle_t *, struct page *, struct address_space *, loff_t);
-+extern void ext3_wb_init(struct super_block *);
-+extern void ext3_wb_release(struct super_block *);
-+
-+/* proc.c */
-+extern int init_ext3_proc(void);
-+extern void exit_ext3_proc(void);
++int __init init_ext3_proc(void);
++void exit_ext3_proc(void);
+
#endif /* __KERNEL__ */
/* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */
-Index: linux-2.6.9/fs/ext3/balloc.c
+Index: linux-2.6.9-full/include/linux/ext3_fs_sb.h
===================================================================
---- linux-2.6.9.orig/fs/ext3/balloc.c 2005-05-13 21:39:03.000000000 +0400
-+++ linux-2.6.9/fs/ext3/balloc.c 2005-10-14 09:10:13.000000000 +0400
-@@ -79,7 +79,7 @@
- *
- * Return buffer_head on success or NULL in case of failure.
- */
--static struct buffer_head *
-+struct buffer_head *
- read_block_bitmap(struct super_block *sb, unsigned int block_group)
- {
- struct ext3_group_desc * desc;
-@@ -450,24 +450,6 @@
- return;
- }
+--- linux-2.6.9-full.orig/include/linux/ext3_fs_sb.h 2005-12-16 23:16:39.000000000 +0300
++++ linux-2.6.9-full/include/linux/ext3_fs_sb.h 2005-12-16 23:16:42.000000000 +0300
+@@ -23,9 +23,15 @@
+ #define EXT_INCLUDE
+ #include <linux/blockgroup_lock.h>
+ #include <linux/percpu_counter.h>
++#include <linux/list.h>
+ #endif
+ #endif
+ #include <linux/rbtree.h>
++#include <linux/proc_fs.h>
++
++struct ext3_buddy_group_blocks;
++struct ext3_mb_history;
++#define EXT3_BB_MAX_BLOCKS
--/* Free given blocks, update quota and i_blocks field */
--void ext3_free_blocks(handle_t *handle, struct inode *inode,
-- unsigned long block, unsigned long count)
--{
-- struct super_block * sb;
-- int dquot_freed_blocks;
--
-- sb = inode->i_sb;
-- if (!sb) {
-- printk ("ext3_free_blocks: nonexistent device");
-- return;
-- }
-- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
-- if (dquot_freed_blocks)
-- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
-- return;
--}
--
/*
- * For ext3 allocations, we must not reuse any blocks which are
- * allocated in the bitmap buffer's "last committed data" copy. This
-@@ -1140,7 +1122,7 @@
- * bitmap, and then for any free bit if that fails.
- * This function also updates quota and i_blocks field.
- */
--int ext3_new_block(handle_t *handle, struct inode *inode,
-+int ext3_new_block_old(handle_t *handle, struct inode *inode,
- unsigned long goal, int *errp)
+ * third extended-fs super-block data in memory
+@@ -81,6 +87,38 @@ struct ext3_sb_info {
+ char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
+ int s_jquota_fmt; /* Format of quota to use */
+ #endif
++
++ /* for buddy allocator */
++ struct ext3_group_info **s_group_info;
++ struct inode *s_buddy_cache;
++ long s_blocks_reserved;
++ spinlock_t s_reserve_lock;
++ struct list_head s_active_transaction;
++ struct list_head s_closed_transaction;
++ struct list_head s_committed_transaction;
++ spinlock_t s_md_lock;
++ tid_t s_last_transaction;
++ int s_mb_factor;
++ unsigned short *s_mb_offsets, *s_mb_maxs;
++
++ /* history to debug policy */
++ struct ext3_mb_history *s_mb_history;
++ int s_mb_history_cur;
++ int s_mb_history_max;
++ struct proc_dir_entry *s_mb_proc;
++ spinlock_t s_mb_history_lock;
++
++ /* stats for buddy allocator */
++ atomic_t s_bal_reqs; /* number of reqs with len > 1 */
++ atomic_t s_bal_success; /* we found long enough chunks */
++ atomic_t s_bal_allocated; /* in blocks */
++ atomic_t s_bal_ex_scanned; /* total extents scanned */
++ atomic_t s_bal_goals; /* goal hits */
++ atomic_t s_bal_breaks; /* too long searches */
++ atomic_t s_bal_2orders; /* 2^order hits */
++ spinlock_t s_bal_lock;
++ unsigned long s_mb_buddies_generated;
++ unsigned long long s_mb_generation_time;
+ };
+
+ #endif /* _LINUX_EXT3_FS_SB */
+Index: linux-2.6.9-full/fs/ext3/super.c
+===================================================================
+--- linux-2.6.9-full.orig/fs/ext3/super.c 2005-12-16 23:16:41.000000000 +0300
++++ linux-2.6.9-full/fs/ext3/super.c 2005-12-16 23:16:42.000000000 +0300
+@@ -394,6 +394,7 @@ void ext3_put_super (struct super_block
+ struct ext3_super_block *es = sbi->s_es;
+ int i;
+
++ ext3_mb_release(sb);
+ ext3_ext_release(sb);
+ ext3_xattr_put_super(sb);
+ journal_destroy(sbi->s_journal);
+@@ -596,6 +597,7 @@ enum {
+ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0,
+ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
+ Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_extents, Opt_extdebug,
++ Opt_mballoc,
+ };
+
+ static match_table_t tokens = {
+@@ -647,6 +649,7 @@ static match_table_t tokens = {
+ {Opt_iopen_nopriv, "iopen_nopriv"},
+ {Opt_extents, "extents"},
+ {Opt_extdebug, "extdebug"},
++ {Opt_mballoc, "mballoc"},
+ {Opt_barrier, "barrier=%u"},
+ {Opt_err, NULL},
+ {Opt_resize, "resize"},
+@@ -957,6 +960,9 @@ clear_qf_name:
+ case Opt_extdebug:
+ set_opt (sbi->s_mount_opt, EXTDEBUG);
+ break;
++ case Opt_mballoc:
++ set_opt (sbi->s_mount_opt, MBALLOC);
++ break;
+ default:
+ printk (KERN_ERR
+ "EXT3-fs: Unrecognized mount option \"%s\" "
+@@ -1646,6 +1652,7 @@ static int ext3_fill_super (struct super
+ ext3_count_dirs(sb));
+
+ ext3_ext_init(sb);
++ ext3_mb_init(sb, needs_recovery);
+
+ return 0;
+
+@@ -2428,7 +2435,13 @@ static struct file_system_type ext3_fs_t
+
+ static int __init init_ext3_fs(void)
{
- struct buffer_head *bitmap_bh = NULL;
-Index: linux-2.6.9/fs/ext3/extents.c
+- int err = init_ext3_xattr();
++ int err;
++
++ err = init_ext3_proc();
++ if (err)
++ return err;
++
++ err = init_ext3_xattr();
+ if (err)
+ return err;
+ err = init_inodecache();
+@@ -2450,6 +2463,7 @@ static void __exit exit_ext3_fs(void)
+ unregister_filesystem(&ext3_fs_type);
+ destroy_inodecache();
+ exit_ext3_xattr();
++ exit_ext3_proc();
+ }
+
+ int ext3_prep_san_write(struct inode *inode, long *blocks,
+Index: linux-2.6.9-full/fs/ext3/extents.c
===================================================================
---- linux-2.6.9.orig/fs/ext3/extents.c 2005-10-14 09:10:12.000000000 +0400
-+++ linux-2.6.9/fs/ext3/extents.c 2005-10-14 09:10:13.000000000 +0400
-@@ -771,7 +771,7 @@
+--- linux-2.6.9-full.orig/fs/ext3/extents.c 2005-12-16 23:16:41.000000000 +0300
++++ linux-2.6.9-full/fs/ext3/extents.c 2005-12-16 23:16:42.000000000 +0300
+@@ -771,7 +771,7 @@ cleanup:
for (i = 0; i < depth; i++) {
if (!ablocks[i])
continue;
}
}
kfree(ablocks);
-@@ -1428,7 +1428,7 @@
+@@ -1428,7 +1428,7 @@ int ext3_ext_rm_idx(handle_t *handle, st
path->p_idx->ei_leaf);
bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf);
ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf);
return err;
}
-@@ -1913,10 +1913,12 @@
+@@ -1913,10 +1913,12 @@ ext3_remove_blocks(struct ext3_extents_t
int needed = ext3_remove_blocks_credits(tree, ex, from, to);
handle_t *handle = ext3_journal_start(tree->inode, needed);
struct buffer_head *bh;
if (IS_ERR(handle))
return PTR_ERR(handle);
-+ if (S_ISDIR(tree->inode->i_mode))
++ if (S_ISDIR(tree->inode->i_mode) || S_ISLNK(tree->inode->i_mode))
+ metadata = 1;
if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) {
/* tail removal */
unsigned long num, start;
-@@ -1928,7 +1930,7 @@
+@@ -1928,7 +1930,7 @@ ext3_remove_blocks(struct ext3_extents_t
bh = sb_find_get_block(tree->inode->i_sb, start + i);
ext3_forget(handle, 0, tree->inode, bh, start + i);
}
} else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) {
printk("strange request: removal %lu-%lu from %u:%u\n",
from, to, ex->ee_block, ex->ee_len);
-Index: linux-2.6.9/fs/ext3/namei.c
+Index: linux-2.6.9-full/fs/ext3/inode.c
+===================================================================
+--- linux-2.6.9-full.orig/fs/ext3/inode.c 2005-12-16 23:16:41.000000000 +0300
++++ linux-2.6.9-full/fs/ext3/inode.c 2005-12-16 23:16:42.000000000 +0300
+@@ -572,7 +572,7 @@ static int ext3_alloc_branch(handle_t *h
+ ext3_journal_forget(handle, branch[i].bh);
+ }
+ for (i = 0; i < keys; i++)
+- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
++ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1);
+ return err;
+ }
+
+@@ -673,7 +673,7 @@ err_out:
+ if (err == -EAGAIN)
+ for (i = 0; i < num; i++)
+ ext3_free_blocks(handle, inode,
+- le32_to_cpu(where[i].key), 1);
++ le32_to_cpu(where[i].key), 1, 1);
+ return err;
+ }
+
+@@ -1831,7 +1831,7 @@ ext3_clear_blocks(handle_t *handle, stru
+ }
+ }
+
+- ext3_free_blocks(handle, inode, block_to_free, count);
++ ext3_free_blocks(handle, inode, block_to_free, count, 1);
+ }
+
+ /**
+@@ -2004,7 +2004,7 @@ static void ext3_free_branches(handle_t
+ ext3_journal_test_restart(handle, inode);
+ }
+
+- ext3_free_blocks(handle, inode, nr, 1);
++ ext3_free_blocks(handle, inode, nr, 1, 1);
+
+ if (parent_bh) {
+ /*
+Index: linux-2.6.9-full/fs/ext3/balloc.c
===================================================================
---- linux-2.6.9.orig/fs/ext3/namei.c 2005-10-14 09:10:04.000000000 +0400
-+++ linux-2.6.9/fs/ext3/namei.c 2005-10-14 09:10:13.000000000 +0400
-@@ -1639,7 +1639,7 @@
- * If the create succeeds, we fill in the inode information
- * with d_instantiate().
+--- linux-2.6.9-full.orig/fs/ext3/balloc.c 2005-10-27 21:44:24.000000000 +0400
++++ linux-2.6.9-full/fs/ext3/balloc.c 2005-12-16 23:16:42.000000000 +0300
+@@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_
+ *
+ * Return buffer_head on success or NULL in case of failure.
+ */
+-static struct buffer_head *
++struct buffer_head *
+ read_block_bitmap(struct super_block *sb, unsigned int block_group)
+ {
+ struct ext3_group_desc * desc;
+@@ -450,24 +450,6 @@ error_return:
+ return;
+ }
+
+-/* Free given blocks, update quota and i_blocks field */
+-void ext3_free_blocks(handle_t *handle, struct inode *inode,
+- unsigned long block, unsigned long count)
+-{
+- struct super_block * sb;
+- int dquot_freed_blocks;
+-
+- sb = inode->i_sb;
+- if (!sb) {
+- printk ("ext3_free_blocks: nonexistent device");
+- return;
+- }
+- ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
+- if (dquot_freed_blocks)
+- DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
+- return;
+-}
+-
+ /*
+ * For ext3 allocations, we must not reuse any blocks which are
+ * allocated in the bitmap buffer's "last committed data" copy. This
+@@ -1140,7 +1122,7 @@ int ext3_should_retry_alloc(struct super
+ * bitmap, and then for any free bit if that fails.
+ * This function also updates quota and i_blocks field.
*/
--static int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
-+int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
- struct nameidata *nd)
+-int ext3_new_block(handle_t *handle, struct inode *inode,
++int ext3_new_block_old(handle_t *handle, struct inode *inode,
+ unsigned long goal, int *errp)
{
- handle_t *handle;
-Index: linux-2.6.9/fs/ext3/xattr.c
+ struct buffer_head *bitmap_bh = NULL;
+Index: linux-2.6.9-full/fs/ext3/xattr.c
===================================================================
---- linux-2.6.9.orig/fs/ext3/xattr.c 2005-10-14 09:10:08.000000000 +0400
-+++ linux-2.6.9/fs/ext3/xattr.c 2005-10-14 09:10:13.000000000 +0400
-@@ -1281,7 +1281,7 @@
+--- linux-2.6.9-full.orig/fs/ext3/xattr.c 2005-12-16 23:16:40.000000000 +0300
++++ linux-2.6.9-full/fs/ext3/xattr.c 2005-12-16 23:16:42.000000000 +0300
+@@ -1281,7 +1281,7 @@ ext3_xattr_set_handle2(handle_t *handle,
new_bh = sb_getblk(sb, block);
if (!new_bh) {
getblk_failed:
error = -EIO;
goto cleanup;
}
-@@ -1328,7 +1328,7 @@
+@@ -1328,7 +1328,7 @@ getblk_failed:
if (ce)
mb_cache_entry_free(ce);
ea_bdebug(old_bh, "freeing");
/* ext3_forget() calls bforget() for us, but we
let our caller release old_bh, so we need to
-@@ -1427,7 +1427,7 @@
+@@ -1427,7 +1427,7 @@ ext3_xattr_delete_inode(handle_t *handle
if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
if (ce)
mb_cache_entry_free(ce);
get_bh(bh);
ext3_forget(handle, 1, inode, bh, EXT3_I(inode)->i_file_acl);
} else {
-Index: linux-2.6.9/fs/ext3/Makefile
-===================================================================
---- linux-2.6.9.orig/fs/ext3/Makefile 2005-10-14 09:10:12.000000000 +0400
-+++ linux-2.6.9/fs/ext3/Makefile 2005-10-14 09:10:13.000000000 +0400
-@@ -5,7 +5,8 @@
- obj-$(CONFIG_EXT3_FS) += ext3.o
-
- ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o\
-- ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o
-+ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
-+ mballoc.o
-
- ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
- ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
-Index: linux-2.6.9/fs/ext3/mballoc.c
+Index: linux-2.6.9-full/fs/ext3/mballoc.c
===================================================================
---- linux-2.6.9.orig/fs/ext3/mballoc.c 2005-10-13 19:40:57.851699336 +0400
-+++ linux-2.6.9/fs/ext3/mballoc.c 2005-10-14 09:10:31.000000000 +0400
-@@ -0,0 +1,1865 @@
+--- linux-2.6.9-full.orig/fs/ext3/mballoc.c 2005-12-16 17:46:19.148560250 +0300
++++ linux-2.6.9-full/fs/ext3/mballoc.c 2005-12-17 00:10:15.000000000 +0300
+@@ -0,0 +1,2434 @@
+/*
-+ * Copyright(c) 2003, 2004, 2005, Cluster File Systems, Inc, info@clusterfs.com
++ * Copyright (c) 2003-2005, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+#include <linux/module.h>
++#include <linux/swap.h>
++#include <linux/proc_fs.h>
++#include <linux/pagemap.h>
++#include <linux/seq_file.h>
+
+/*
+ * TODO:
-+ * - bitmap/buddy read-ahead (proposed by Oleg Drokin aka green)
++ * - bitmap read-ahead (proposed by Oleg Drokin aka green)
+ * - track min/max extents in each group for better group selection
-+ * - is it worthwhile to use buddies directly if req is 2^N blocks?
+ * - mb_mark_used() may allocate chunk right after splitting buddy
+ * - special flag to advise the allocator to look for requested + N blocks
+ * this may improve interaction between extents and mballoc
+ */
+
+/*
-+ * with 'ext3_mb_aggressive' set the allocator runs consistency checks over
++ * with AGGRESSIVE_CHECK the allocator runs consistency checks over
+ * structures. these checks slow things down a lot
+ */
-+long ext3_mb_aggressive = 0;
-+
-+
-+/*
-+ * with 'ext3_mb_stats' allocator will collect stats that will be
-+ * shown at umount. The collecting costs though!
-+ */
-+long ext3_mb_stats = 1;
++#define AGGRESSIVE_CHECK__
+
+/*
+ */
+#endif
+
+/*
-+ * where to save buddies structures beetween umount/mount (clean case only)
++ * with EXT3_MB_HISTORY mballoc stores last N allocations in memory
++ * and you can monitor it in /proc/fs/ext3/<dev>/mb_history
+ */
-+#define EXT3_BUDDY_FILE ".buddy"
++#define EXT3_MB_HISTORY
+
+/*
+ * How long mballoc can look for a best extent (in found extents)
+ */
-+long ext3_mb_max_to_scan = 100;
++long ext3_mb_max_to_scan = 500;
+
+/*
-+ * This structure is on-disk description of a group for mballoc
++ * How long mballoc must look for a best extent
+ */
-+struct ext3_mb_group_descr {
-+ __u16 mgd_first_free; /* first free block in the group */
-+ __u16 mgd_free; /* number of free blocks in the group */
-+ __u16 mgd_counters[16]; /* number of free blocks by order */
-+};
++long ext3_mb_min_to_scan = 30;
+
+/*
-+ * This structure is header of mballoc's file
++ * with 'ext3_mb_stats' allocator will collect stats that will be
++ * shown at umount. The collecting costs though!
+ */
-+struct ext3_mb_grp_header {
-+ __u32 mh_magic;
++
++long ext3_mb_stats = 1;
++
++#ifdef EXT3_BB_MAX_BLOCKS
++#undef EXT3_BB_MAX_BLOCKS
++#endif
++#define EXT3_BB_MAX_BLOCKS 30
++
++struct ext3_free_metadata {
++ unsigned short group;
++ unsigned short num;
++ unsigned short blocks[EXT3_BB_MAX_BLOCKS];
++ struct list_head list;
++};
++
++struct ext3_group_info {
++ unsigned long bb_state;
++ unsigned long bb_tid;
++ struct ext3_free_metadata *bb_md_cur;
++ unsigned short bb_first_free;
++ unsigned short bb_free;
++ unsigned short bb_fragments;
++ unsigned short bb_counters[];
+};
+
-+#define EXT3_MB_MAGIC_V1 0xbabd16fd
+
++#define EXT3_GROUP_INFO_NEED_INIT_BIT 0
++#define EXT3_GROUP_INFO_LOCKED_BIT 1
++
++#define EXT3_MB_GRP_NEED_INIT(grp) \
++ (test_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &(grp)->bb_state))
+
+struct ext3_free_extent {
+ __u16 fe_start;
+ unsigned long ac_ex_scanned;
+ __u16 ac_groups_scanned;
+ __u16 ac_found;
++ __u16 ac_tail;
++ __u16 ac_buddy;
+ __u8 ac_status;
+ __u8 ac_flags; /* allocation hints */
++ __u8 ac_criteria;
+ __u8 ac_repeats;
++ __u8 ac_2order; /* if request is to allocate 2^N blocks and
++ * N > 0, the field stores N, otherwise 0 */
+};
+
+#define AC_STATUS_CONTINUE 1
+#define AC_STATUS_FOUND 2
+#define AC_STATUS_BREAK 3
+
++struct ext3_mb_history {
++ struct ext3_free_extent goal; /* goal allocation */
++ struct ext3_free_extent result; /* result allocation */
++ __u16 found; /* how many extents have been found */
++ __u16 groups; /* how many groups have been scanned */
++ __u16 tail; /* what tail broke some buddy */
++ __u16 buddy; /* buddy the tail ^^^ broke */
++ __u8 cr; /* which phase the result extent was found at */
++ __u8 merged;
++};
++
+struct ext3_buddy {
-+ struct buffer_head *bd_bh;
-+ struct buffer_head *bd_bh2;
-+ struct ext3_buddy_group_blocks *bd_bd;
++ struct page *bd_buddy_page;
++ void *bd_buddy;
++ struct page *bd_bitmap_page;
++ void *bd_bitmap;
++ struct ext3_group_info *bd_info;
+ struct super_block *bd_sb;
+ __u16 bd_blkbits;
+ __u16 bd_group;
+};
-+#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bh->b_data)
-+#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_bh2->b_data)
++#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bitmap)
++#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy)
++
++#ifndef EXT3_MB_HISTORY
++#define ext3_mb_store_history(sb,ac)
++#else
++static void ext3_mb_store_history(struct super_block *,
++ struct ext3_allocation_context *ac);
++#endif
+
+#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
+
++static struct proc_dir_entry *proc_root_ext3;
++
+int ext3_create (struct inode *, struct dentry *, int, struct nameidata *);
+struct buffer_head * read_block_bitmap(struct super_block *, unsigned int);
+int ext3_new_block_old(handle_t *, struct inode *, unsigned long, int *);
+static inline int mb_test_bit(int bit, void *addr)
+{
+ mb_correct_addr_and_bit(bit,addr);
-+ return ext3_test_bit(bit, addr);
++ return ext2_test_bit(bit, addr);
+}
+
+static inline void mb_set_bit(int bit, void *addr)
+{
+ mb_correct_addr_and_bit(bit,addr);
-+ ext3_set_bit(bit, addr);
++ ext2_set_bit(bit, addr);
+}
+
+static inline void mb_set_bit_atomic(int bit, void *addr)
+{
+ mb_correct_addr_and_bit(bit,addr);
-+ ext3_set_bit_atomic(NULL, bit, addr);
++ ext2_set_bit_atomic(NULL, bit, addr);
+}
+
+static inline void mb_clear_bit(int bit, void *addr)
+{
+ mb_correct_addr_and_bit(bit,addr);
-+ ext3_clear_bit(bit, addr);
++ ext2_clear_bit(bit, addr);
+}
+
+static inline void mb_clear_bit_atomic(int bit, void *addr)
+{
+ mb_correct_addr_and_bit(bit,addr);
-+ ext3_clear_bit_atomic(NULL, bit, addr);
++ ext2_clear_bit_atomic(NULL, bit, addr);
+}
+
-+static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max)
++static inline int mb_find_next_zero_bit(void *addr, int max, int start)
+{
-+ int i = 1;
-+ char *bb;
-+
-+ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b));
-+ J_ASSERT(max != NULL);
-+
-+ if (order > e3b->bd_blkbits + 1) {
-+ *max = 0;
-+ return NULL;
-+ }
-+
-+ /* at order 0 we see each particular block */
-+ *max = 1 << (e3b->bd_blkbits + 3);
-+ if (order == 0)
-+ return EXT3_MB_BITMAP(e3b);
-+
-+ bb = EXT3_MB_BUDDY(e3b);
-+ *max = *max >> 1;
-+ while (i < order) {
-+ bb += 1 << (e3b->bd_blkbits - i);
-+ i++;
-+ *max = *max >> 1;
-+ }
-+ J_ASSERT((unsigned) (bb - (char *) EXT3_MB_BUDDY(e3b)) <
-+ e3b->bd_sb->s_blocksize);
-+ return bb;
++ int fix;
++#if BITS_PER_LONG == 64
++ fix = ((unsigned long) addr & 7UL) << 3;
++ addr = (void *) ((unsigned long) addr & ~7UL);
++#elif BITS_PER_LONG == 32
++ fix = ((unsigned long) addr & 3UL) << 3;
++ addr = (void *) ((unsigned long) addr & ~3UL);
++#else
++#error "how many bits you are?!"
++#endif
++ max += fix;
++ start += fix;
++ return ext2_find_next_zero_bit(addr, max, start) - fix;
+}
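++
++/*
++ * Example (64-bit, illustrative): for addr == base + 2 the code above
++ * computes fix = 16, searches the long-aligned bitmap at base with max
++ * and start shifted up by 16 bits, then subtracts 16 from the result.
++ * This lets callers pass arbitrarily aligned bitmaps even though
++ * ext2_find_next_zero_bit() expects a long-aligned address.
++ */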
+
-+static int ext3_mb_load_buddy(struct super_block *sb, int group,
-+ struct ext3_buddy *e3b)
++static inline void *mb_find_buddy(struct ext3_buddy *e3b, int order, int *max)
+{
-+ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ char *bb;
+
-+ J_ASSERT(sbi->s_buddy_blocks[group]->bb_bitmap);
-+ J_ASSERT(sbi->s_buddy_blocks[group]->bb_buddy);
++ J_ASSERT(EXT3_MB_BITMAP(e3b) != EXT3_MB_BUDDY(e3b));
++ J_ASSERT(max != NULL);
+
-+ /* load bitmap */
-+ e3b->bd_bh = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_bitmap);
-+ if (e3b->bd_bh == NULL) {
-+ ext3_error(sb, "ext3_mb_load_buddy",
-+ "can't get block for buddy bitmap\n");
-+ goto out;
-+ }
-+ /* load buddy */
-+ e3b->bd_bh2 = sb_getblk(sb, sbi->s_buddy_blocks[group]->bb_buddy);
-+ if (e3b->bd_bh2 == NULL) {
-+ ext3_error(sb, "ext3_mb_load_buddy",
-+ "can't get block for buddy bitmap\n");
-+ goto out;
++ if (order > e3b->bd_blkbits + 1) {
++ *max = 0;
++ return NULL;
+ }
+
-+ if (!buffer_uptodate(e3b->bd_bh))
-+ ll_rw_block(READ, 1, &e3b->bd_bh);
-+ if (!buffer_uptodate(e3b->bd_bh2))
-+ ll_rw_block(READ, 1, &e3b->bd_bh2);
-+
-+ wait_on_buffer(e3b->bd_bh);
-+ J_ASSERT(buffer_uptodate(e3b->bd_bh));
-+ wait_on_buffer(e3b->bd_bh2);
-+ J_ASSERT(buffer_uptodate(e3b->bd_bh2));
-+
-+ e3b->bd_blkbits = sb->s_blocksize_bits;
-+ e3b->bd_bd = sbi->s_buddy_blocks[group];
-+ e3b->bd_sb = sb;
-+ e3b->bd_group = group;
++ /* at order 0 we see each particular block */
++ *max = 1 << (e3b->bd_blkbits + 3);
++ if (order == 0)
++ return EXT3_MB_BITMAP(e3b);
+
-+ return 0;
-+out:
-+ brelse(e3b->bd_bh);
-+ brelse(e3b->bd_bh2);
-+ e3b->bd_bh = NULL;
-+ e3b->bd_bh2 = NULL;
-+ return -EIO;
-+}
++ bb = EXT3_MB_BUDDY(e3b) + EXT3_SB(e3b->bd_sb)->s_mb_offsets[order];
++ *max = EXT3_SB(e3b->bd_sb)->s_mb_maxs[order];
+
-+static void ext3_mb_dirty_buddy(struct ext3_buddy *e3b)
-+{
-+ mark_buffer_dirty(e3b->bd_bh);
-+ mark_buffer_dirty(e3b->bd_bh2);
++ return bb;
+}
+
-+static void ext3_mb_release_desc(struct ext3_buddy *e3b)
-+{
-+ brelse(e3b->bd_bh);
-+ brelse(e3b->bd_bh2);
-+}
++#ifdef AGGRESSIVE_CHECK
+
+static void mb_check_buddy(struct ext3_buddy *e3b)
+{
+ int order = e3b->bd_blkbits + 1;
+ int max, max2, i, j, k, count;
++ int fragments = 0, fstart;
+ void *buddy, *buddy2;
+
-+ if (likely(!ext3_mb_aggressive))
-+ return;
-+
+ if (!test_opt(e3b->bd_sb, MBALLOC))
+ return;
+
++ {
++ static int mb_check_counter = 0;
++ if (mb_check_counter++ % 300 != 0)
++ return;
++ }
++
+ while (order > 1) {
+ buddy = mb_find_buddy(e3b, order, &max);
+ J_ASSERT(buddy);
+ }
+ count++;
+ }
-+ J_ASSERT(e3b->bd_bd->bb_counters[order] == count);
++ J_ASSERT(e3b->bd_info->bb_counters[order] == count);
+ order--;
+ }
+
++ fstart = -1;
+ buddy = mb_find_buddy(e3b, 0, &max);
+ for (i = 0; i < max; i++) {
-+ if (!mb_test_bit(i, buddy))
++ if (!mb_test_bit(i, buddy)) {
++ J_ASSERT(i >= e3b->bd_info->bb_first_free);
++ if (fstart == -1) {
++ fragments++;
++ fstart = i;
++ }
+ continue;
++ }
++ fstart = -1;
+ /* check used bits only */
+ for (j = 0; j < e3b->bd_blkbits + 1; j++) {
+ buddy2 = mb_find_buddy(e3b, j, &max2);
+ J_ASSERT(mb_test_bit(k, buddy2));
+ }
+ }
++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(e3b->bd_info));
++ J_ASSERT(e3b->bd_info->bb_fragments == fragments);
++}
++
++#else
++#define mb_check_buddy(e3b)
++#endif
++
++/* find most significant bit */
++static int inline fmsb(unsigned short word)
++{
++ int order;
++
++ if (word > 255) {
++ order = 7;
++ word >>= 8;
++ } else {
++ order = -1;
++ }
++
++ do {
++ order++;
++ word >>= 1;
++ } while (word != 0);
++
++ return order;
++}
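++
++/*
++ * fmsb() returns the index of the most significant set bit, i.e.
++ * floor(log2(word)): fmsb(1) == 0, fmsb(12) == 3, fmsb(256) == 8.
++ * The word > 255 test just skips over the low byte in one step.
++ */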
++
++static void inline
++ext3_mb_mark_free_simple(struct super_block *sb, void *buddy, unsigned first,
++ int len, struct ext3_group_info *grp)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ unsigned short min, max, chunk, border;
++
++ mb_debug("mark %u/%u free\n", first, len);
++ J_ASSERT(len < EXT3_BLOCKS_PER_GROUP(sb));
++
++ border = 2 << sb->s_blocksize_bits;
++
++ while (len > 0) {
++ /* find how many blocks can be covered since this position */
++ max = ffs(first | border) - 1;
++
++ /* find how many blocks of power 2 we need to mark */
++ min = fmsb(len);
++
++ mb_debug(" %u/%u -> max %u, min %u\n",
++ first & ((2 << sb->s_blocksize_bits) - 1),
++ len, max, min);
++
++ if (max < min)
++ min = max;
++ chunk = 1 << min;
++
++ /* mark multiblock chunks only */
++ grp->bb_counters[min]++;
++ if (min > 0) {
++ mb_debug(" set %u at %u \n", first >> min,
++ sbi->s_mb_offsets[min]);
++ mb_clear_bit(first >> min, buddy + sbi->s_mb_offsets[min]);
++ }
++
++ len -= chunk;
++ first += chunk;
++ }
++}
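++
++/*
++ * Worked example: first = 5, len = 13 (free blocks 5..17) gets split
++ * into the aligned power-of-two chunks 5/1, 6/2, 8/8 and 16/2:
++ * 'max' caps the chunk order by the alignment of 'first' (an order-k
++ * chunk must start at a multiple of 2^k), 'min' by the remaining
++ * length, and bb_counters[] is bumped once per chunk.
++ */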
++
++static void
++ext3_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap,
++ struct ext3_group_info *grp)
++{
++ unsigned short max = EXT3_BLOCKS_PER_GROUP(sb);
++ unsigned short i = 0, first, len;
++ unsigned free = 0, fragments = 0;
++ unsigned long long period = get_cycles();
++
++ i = mb_find_next_zero_bit(bitmap, max, 0);
++ grp->bb_first_free = i;
++ while (i < max) {
++ fragments++;
++ first = i;
++ i = find_next_bit(bitmap, max, i);
++ len = i - first;
++ free += len;
++ if (len > 1)
++ ext3_mb_mark_free_simple(sb, buddy, first, len, grp);
++ else
++ grp->bb_counters[0]++;
++ if (i < max)
++ i = mb_find_next_zero_bit(bitmap, max, i);
++ }
++ grp->bb_fragments = fragments;
++
++ /* bb_state needs no locking here: everyone else waits on the
++ * page lock for init to complete */
++ clear_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &grp->bb_state);
++ if (free != grp->bb_free) {
++ printk("EXT3-fs: %u blocks in bitmap, %u in group descriptor\n",
++ free, grp->bb_free);
++ grp->bb_free = free;
++ }
++
++ period = get_cycles() - period;
++ spin_lock(&EXT3_SB(sb)->s_bal_lock);
++ EXT3_SB(sb)->s_mb_buddies_generated++;
++ EXT3_SB(sb)->s_mb_generation_time += period;
++ spin_unlock(&EXT3_SB(sb)->s_bal_lock);
++}
++
++static int ext3_mb_init_cache(struct page *page)
++{
++ int blocksize, blocks_per_page, groups_per_page;
++ int err = 0, i, first_group, first_block;
++ struct super_block *sb;
++ struct buffer_head *bhs;
++ struct buffer_head **bh;
++ struct inode *inode;
++ char *data, *bitmap;
++
++ mb_debug("init page %lu\n", page->index);
++
++ inode = page->mapping->host;
++ sb = inode->i_sb;
++ blocksize = 1 << inode->i_blkbits;
++ blocks_per_page = PAGE_CACHE_SIZE / blocksize;
++
++ groups_per_page = blocks_per_page >> 1;
++ if (groups_per_page == 0)
++ groups_per_page = 1;
++
++ /* allocate buffer_heads to read bitmaps */
++ if (groups_per_page > 1) {
++ err = -ENOMEM;
++ i = sizeof(struct buffer_head *) * groups_per_page;
++ bh = kmalloc(i, GFP_NOFS);
++ if (bh == NULL)
++ goto out;
++ memset(bh, 0, i);
++ } else
++ bh = &bhs;
++
++ first_group = page->index * blocks_per_page / 2;
++
++ /* read all groups the page covers into the cache */
++ for (i = 0; i < groups_per_page; i++) {
++ struct ext3_group_desc * desc;
++
++ if (first_group + i >= EXT3_SB(sb)->s_groups_count)
++ break;
++
++ err = -EIO;
++ desc = ext3_get_group_desc(sb, first_group + i, NULL);
++ if (desc == NULL)
++ goto out;
++
++ err = -ENOMEM;
++ bh[i] = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap));
++ if (bh[i] == NULL)
++ goto out;
++
++ if (buffer_uptodate(bh[i]))
++ continue;
++
++ lock_buffer(bh[i]);
++ if (buffer_uptodate(bh[i])) {
++ unlock_buffer(bh[i]);
++ continue;
++ }
++
++ get_bh(bh[i]);
++ bh[i]->b_end_io = end_buffer_read_sync;
++ submit_bh(READ, bh[i]);
++ mb_debug("read bitmap for group %u\n", first_group + i);
++ }
++
++ /* wait for I/O completion */
++ for (i = 0; i < groups_per_page && bh[i]; i++)
++ wait_on_buffer(bh[i]);
++
++ /* XXX: I/O error handling here */
++
++ first_block = page->index * blocks_per_page;
++ for (i = 0; i < blocks_per_page; i++) {
++ int group;
++
++ group = (first_block + i) >> 1;
++ if (group >= EXT3_SB(sb)->s_groups_count)
++ break;
++
++ data = page_address(page) + (i * blocksize);
++ bitmap = bh[group - first_group]->b_data;
++
++ if ((first_block + i) & 1) {
++ /* this is block of buddy */
++ mb_debug("put buddy for group %u in page %lu/%x\n",
++ group, page->index, i * blocksize);
++ memset(data, 0xff, blocksize);
++ EXT3_SB(sb)->s_group_info[group]->bb_fragments = 0;
++ memset(EXT3_SB(sb)->s_group_info[group]->bb_counters, 0,
++ sizeof(unsigned short)*(sb->s_blocksize_bits+2));
++ ext3_mb_generate_buddy(sb, data, bitmap,
++ EXT3_SB(sb)->s_group_info[group]);
++ } else {
++ /* this is block of bitmap */
++ mb_debug("put bitmap for group %u in page %lu/%x\n",
++ group, page->index, i * blocksize);
++ memcpy(data, bitmap, blocksize);
++ }
++ }
++ SetPageUptodate(page);
++
++out:
++ for (i = 0; i < groups_per_page && bh[i]; i++)
++ brelse(bh[i]);
++ if (bh && bh != &bhs)
++ kfree(bh);
++ return err;
++}
++
++static int ext3_mb_load_buddy(struct super_block *sb, int group,
++ struct ext3_buddy *e3b)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ struct inode *inode = sbi->s_buddy_cache;
++ int blocks_per_page, block, pnum, poff;
++ struct page *page;
++
++ mb_debug("load group %u\n", group);
++
++ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
++
++ e3b->bd_blkbits = sb->s_blocksize_bits;
++ e3b->bd_info = sbi->s_group_info[group];
++ e3b->bd_sb = sb;
++ e3b->bd_group = group;
++ e3b->bd_buddy_page = NULL;
++ e3b->bd_bitmap_page = NULL;
++
++ block = group * 2;
++ pnum = block / blocks_per_page;
++ poff = block % blocks_per_page;
++
++ page = find_get_page(inode->i_mapping, pnum);
++ if (page == NULL || !PageUptodate(page)) {
++ if (page)
++ page_cache_release(page);
++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
++ if (page) {
++ if (!PageUptodate(page))
++ ext3_mb_init_cache(page);
++ unlock_page(page);
++ }
++ }
++ if (page == NULL || !PageUptodate(page))
++ goto err;
++ e3b->bd_bitmap_page = page;
++ e3b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
++ mark_page_accessed(page);
++
++ block++;
++ pnum = block / blocks_per_page;
++ poff = block % blocks_per_page;
++
++ page = find_get_page(inode->i_mapping, pnum);
++ if (page == NULL || !PageUptodate(page)) {
++ if (page)
++ page_cache_release(page);
++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
++ if (page) {
++ if (!PageUptodate(page))
++ ext3_mb_init_cache(page);
++ unlock_page(page);
++ }
++ }
++ if (page == NULL || !PageUptodate(page))
++ goto err;
++ e3b->bd_buddy_page = page;
++ e3b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
++ mark_page_accessed(page);
++
++ J_ASSERT(e3b->bd_bitmap_page != NULL);
++ J_ASSERT(e3b->bd_buddy_page != NULL);
++
++ return 0;
++
++err:
++ if (e3b->bd_bitmap_page)
++ page_cache_release(e3b->bd_bitmap_page);
++ if (e3b->bd_buddy_page)
++ page_cache_release(e3b->bd_buddy_page);
++ e3b->bd_buddy = NULL;
++ e3b->bd_bitmap = NULL;
++ return -EIO;
+}
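++
++/*
++ * Buddy cache layout: group g's bitmap lives in logical block 2g of
++ * the s_buddy_cache inode and its buddy array in block 2g + 1. With
++ * blocksize == PAGE_CACHE_SIZE that is pages 2g and 2g + 1; with two
++ * blocks per page both land in page g at offsets 0 and 1.
++ */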
+
++static void ext3_mb_release_desc(struct ext3_buddy *e3b)
++{
++ if (e3b->bd_bitmap_page)
++ page_cache_release(e3b->bd_bitmap_page);
++ if (e3b->bd_buddy_page)
++ page_cache_release(e3b->bd_buddy_page);
++}
++
++
+static inline void
+ext3_lock_group(struct super_block *sb, int group)
+{
-+ spin_lock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock);
++ bit_spin_lock(EXT3_GROUP_INFO_LOCKED_BIT,
++ &EXT3_SB(sb)->s_group_info[group]->bb_state);
+}
+
+static inline void
+ext3_unlock_group(struct super_block *sb, int group)
+{
-+ spin_unlock(&EXT3_SB(sb)->s_buddy_blocks[group]->bb_lock);
++ bit_spin_unlock(EXT3_GROUP_INFO_LOCKED_BIT,
++ &EXT3_SB(sb)->s_group_info[group]->bb_state);
+}
+
+static int mb_find_order_for_block(struct ext3_buddy *e3b, int block)
+
+static int mb_free_blocks(struct ext3_buddy *e3b, int first, int count)
+{
-+ int block, max, order;
++ int block = 0, max = 0, order;
+ void *buddy, *buddy2;
+
+ mb_check_buddy(e3b);
+
-+ e3b->bd_bd->bb_free += count;
-+ if (first < e3b->bd_bd->bb_first_free)
-+ e3b->bd_bd->bb_first_free = first;
-+
++ e3b->bd_info->bb_free += count;
++ if (first < e3b->bd_info->bb_first_free)
++ e3b->bd_info->bb_first_free = first;
++
++ /* let's maintain fragments counter */
++ if (first != 0)
++ block = !mb_test_bit(first - 1, EXT3_MB_BITMAP(e3b));
++ if (first + count < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0])
++ max = !mb_test_bit(first + count, EXT3_MB_BITMAP(e3b));
++ if (block && max)
++ e3b->bd_info->bb_fragments--;
++ else if (!block && !max)
++ e3b->bd_info->bb_fragments++;
++
++ /* let's maintain buddy itself */
+ while (count-- > 0) {
+ block = first++;
+ order = 0;
+
+ J_ASSERT(mb_test_bit(block, EXT3_MB_BITMAP(e3b)));
+ mb_clear_bit(block, EXT3_MB_BITMAP(e3b));
-+ e3b->bd_bd->bb_counters[order]++;
++ e3b->bd_info->bb_counters[order]++;
+
+ /* start of the buddy */
+ buddy = mb_find_buddy(e3b, order, &max);
+ mb_set_bit(block, buddy);
+ mb_set_bit(block + 1, buddy);
+ }
-+ e3b->bd_bd->bb_counters[order]--;
-+ e3b->bd_bd->bb_counters[order]--;
++ e3b->bd_info->bb_counters[order]--;
++ e3b->bd_info->bb_counters[order]--;
+
+ block = block >> 1;
+ order++;
-+ e3b->bd_bd->bb_counters[order]++;
++ e3b->bd_info->bb_counters[order]++;
+
+ mb_clear_bit(block, buddy2);
+ buddy = buddy2;
+}
+
+static int mb_find_extent(struct ext3_buddy *e3b, int order, int block,
-+ int needed, struct ext3_free_extent *ex)
++ int needed, struct ext3_free_extent *ex)
+{
+ int next, max, ord;
+ void *buddy;
+ return 0;
+ }
+
-+ if (order == 0) {
++ if (likely(order == 0)) {
+ /* find actual order */
+ order = mb_find_order_for_block(e3b, block);
+ block = block >> order;
+ ex->fe_start = block << order;
+ ex->fe_group = e3b->bd_group;
+
-+ while ((buddy = mb_find_buddy(e3b, order, &max))) {
++ while (needed > ex->fe_len &&
++        (buddy = mb_find_buddy(e3b, order, &max))) {
+
+ if (block + 1 >= max)
+ break;
+
+static int mb_mark_used(struct ext3_buddy *e3b, struct ext3_free_extent *ex)
+{
++ int ord, mlen = 0, max = 0, cur;
+ int start = ex->fe_start;
+ int len = ex->fe_len;
-+ int ord, mlen, max, cur;
++ unsigned ret = 0;
+ int len0 = len;
+ void *buddy;
+
-+ e3b->bd_bd->bb_free -= len;
-+ if (e3b->bd_bd->bb_first_free == start)
-+ e3b->bd_bd->bb_first_free += len;
++ mb_check_buddy(e3b);
+
++ e3b->bd_info->bb_free -= len;
++ if (e3b->bd_info->bb_first_free == start)
++ e3b->bd_info->bb_first_free += len;
++
++ /* let's maintain fragments counter */
++ if (start != 0)
++ mlen = !mb_test_bit(start - 1, EXT3_MB_BITMAP(e3b));
++ if (start + len < EXT3_SB(e3b->bd_sb)->s_mb_maxs[0])
++ max = !mb_test_bit(start + len, EXT3_MB_BITMAP(e3b));
++ if (mlen && max)
++ e3b->bd_info->bb_fragments++;
++ else if (!mlen && !max)
++ e3b->bd_info->bb_fragments--;
++
++ /* let's maintain buddy itself */
+ while (len) {
+ ord = mb_find_order_for_block(e3b, start);
+
+ buddy = mb_find_buddy(e3b, ord, &max);
+ J_ASSERT((start >> ord) < max);
+ mb_set_bit(start >> ord, buddy);
-+ e3b->bd_bd->bb_counters[ord]--;
++ e3b->bd_info->bb_counters[ord]--;
+ start += mlen;
+ len -= mlen;
+ J_ASSERT(len >= 0);
+ continue;
+ }
+
++ /* store for history */
++ if (ret == 0)
++ ret = len | (ord << 16);
++
+ /* we have to split large buddy */
+ J_ASSERT(ord > 0);
+ buddy = mb_find_buddy(e3b, ord, &max);
+ mb_set_bit(start >> ord, buddy);
-+ e3b->bd_bd->bb_counters[ord]--;
++ e3b->bd_info->bb_counters[ord]--;
+
+ ord--;
+ cur = (start >> ord) & ~1U;
+ buddy = mb_find_buddy(e3b, ord, &max);
+ mb_clear_bit(cur, buddy);
+ mb_clear_bit(cur + 1, buddy);
-+ e3b->bd_bd->bb_counters[ord]++;
-+ e3b->bd_bd->bb_counters[ord]++;
++ e3b->bd_info->bb_counters[ord]++;
++ e3b->bd_info->bb_counters[ord]++;
+ }
+
+ /* now drop all the bits in bitmap */
+
+ mb_check_buddy(e3b);
+
-+ return 0;
++ return ret;
+}
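++
++/*
++ * The packed return value feeds the allocation history: the low 16
++ * bits hold the extent length remaining when the first buddy split
++ * was needed, the upper bits hold the order that was split.
++ * ext3_mb_use_best_found() unpacks this into ac_tail and ac_buddy.
++ */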
+
+/*
+static void ext3_mb_use_best_found(struct ext3_allocation_context *ac,
+ struct ext3_buddy *e3b)
+{
++ unsigned long ret;
++
+ ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
-+ mb_mark_used(e3b, &ac->ac_b_ex);
++ ret = mb_mark_used(e3b, &ac->ac_b_ex);
++
+ ac->ac_status = AC_STATUS_FOUND;
++ ac->ac_tail = ret & 0xffff;
++ ac->ac_buddy = ret >> 16;
+}
+
+/*
+ struct ext3_free_extent *ex,
+ struct ext3_buddy *e3b)
+{
-+ int factor = EXT3_SB(ac->ac_sb)->s_mb_factor;
+ struct ext3_free_extent *bex = &ac->ac_b_ex;
-+ int diff = ac->ac_g_ex.fe_len - ex->fe_len;
++ struct ext3_free_extent *gex = &ac->ac_g_ex;
+
+ J_ASSERT(ex->fe_len > 0);
+ J_ASSERT(ex->fe_len < (1 << ac->ac_sb->s_blocksize_bits) * 8);
+ /*
+ * The special case - take what you catch first
+ */
-+ if (ac->ac_flags & EXT3_MB_HINT_FIRST) {
++ if (unlikely(ac->ac_flags & EXT3_MB_HINT_FIRST)) {
+ *bex = *ex;
+ ext3_mb_use_best_found(ac, e3b);
+ return;
+ /*
+ * Let's check whether the chunk is good enough
+ */
-+ if (ex->fe_len >= ac->ac_g_ex.fe_len) {
-+ *bex = *ex;
-+ ext3_mb_use_best_found(ac, e3b);
-+ return;
-+ }
-+
-+ /*
-+ * If the request is vey large, then it makes sense to use large
-+ * chunks for it. Even if they don't satisfy whole request.
-+ */
-+ if (ex->fe_len > 1000) {
-+ *bex = *ex;
-+ ext3_mb_use_best_found(ac, e3b);
-+ return;
-+ }
-+
-+ /*
-+ * Sometimes it's worty to take close chunk
-+ */
-+ if (factor && (ac->ac_g_ex.fe_len * 100) / (diff * 100) >= factor) {
++ if (ex->fe_len == gex->fe_len) {
+ *bex = *ex;
+ ext3_mb_use_best_found(ac, e3b);
+ return;
+
+ /*
+ * If new found extent is better, store it in the context
-+ * FIXME: possible the policy should be more complex?
+ */
-+ if (ex->fe_len > bex->fe_len) {
++ if (bex->fe_len < gex->fe_len) {
++ /* if the request isn't satisfied, any found extent
++ * larger than previous best one is better */
++ if (ex->fe_len > bex->fe_len)
++ *bex = *ex;
++ } else if (ex->fe_len > gex->fe_len) {
++ /* if the request is satisfied, then we try to find
++ * an extent that still satisfies the request, but is
++ * smaller than the previous one */
+ *bex = *ex;
+ }
+
+ /*
++ * Let's scan at least a few extents and not just pick the first one
++ */
++ if (bex->fe_len > gex->fe_len && ac->ac_found > ext3_mb_min_to_scan)
++ ac->ac_status = AC_STATUS_BREAK;
++
++ /*
+ * We don't want to scan for a whole year
+ */
+ if (ac->ac_found > ext3_mb_max_to_scan)
+ ext3_lock_group(ac->ac_sb, group);
+ max = mb_find_extent(e3b, 0, ex.fe_start, ex.fe_len, &ex);
+
-+ if (max > 0)
++ if (max > 0) {
++ ac->ac_b_ex = ex;
+ ext3_mb_use_best_found(ac, e3b);
++ }
+
+ ext3_unlock_group(ac->ac_sb, group);
+
-+ if (ac->ac_status == AC_STATUS_FOUND)
-+ ext3_mb_dirty_buddy(e3b);
+ ext3_mb_release_desc(e3b);
+
+ return 0;
+ J_ASSERT(ex.fe_len > 0);
+ J_ASSERT(ex.fe_group == ac->ac_g_ex.fe_group);
+ J_ASSERT(ex.fe_start == ac->ac_g_ex.fe_start);
++ ac->ac_found++;
+ ac->ac_b_ex = ex;
+ ext3_mb_use_best_found(ac, e3b);
+ }
+ ext3_unlock_group(ac->ac_sb, group);
+
-+ if (ac->ac_status == AC_STATUS_FOUND)
-+ ext3_mb_dirty_buddy(e3b);
+ ext3_mb_release_desc(e3b);
+
+ return 0;
+}
++
++/*
++ * The routine scans buddy structures (not bitmap!) from given order
++ * to max order and tries to find big enough chunk to satisfy the req
++ */
++static void ext3_mb_simple_scan_group(struct ext3_allocation_context *ac,
++ struct ext3_buddy *e3b)
++{
++ struct super_block *sb = ac->ac_sb;
++ struct ext3_group_info *grp = e3b->bd_info;
++ void *buddy;
++ int i, k, max;
++
++ J_ASSERT(ac->ac_2order > 0);
++ for (i = ac->ac_2order; i < sb->s_blocksize_bits + 1; i++) {
++ if (grp->bb_counters[i] == 0)
++ continue;
++
++ buddy = mb_find_buddy(e3b, i, &max);
++ if (buddy == NULL) {
++ printk(KERN_ALERT "looking for wrong order?\n");
++ break;
++ }
++
++ k = mb_find_next_zero_bit(buddy, max, 0);
++ J_ASSERT(k < max);
++
++ ac->ac_found++;
++
++ ac->ac_b_ex.fe_len = 1 << i;
++ ac->ac_b_ex.fe_start = k << i;
++ ac->ac_b_ex.fe_group = e3b->bd_group;
++
++ ext3_mb_use_best_found(ac, e3b);
++ J_ASSERT(ac->ac_b_ex.fe_len == ac->ac_g_ex.fe_len);
++
++ if (unlikely(ext3_mb_stats))
++ atomic_inc(&EXT3_SB(sb)->s_bal_2orders);
++
++ break;
++ }
++}
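++
++/*
++ * Example: for ac_2order == 3 (an 8-block request) the first order
++ * >= 3 with a non-zero bb_counters[] entry is guaranteed to contain
++ * a free chunk; mb_find_next_zero_bit() locates it and the extent
++ * of 1 << i blocks starting at block k << i becomes the best found.
++ */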
++
+/*
+ * The routine scans the group and measures all found extents.
+ * In order to optimize scanning, caller must pass number of
-+ * free blocks in the group, so the routine can upper limit.
++ * free blocks in the group, so the routine knows the upper limit.
+ */
-+static void ext3_mb_scan_group(struct ext3_allocation_context *ac,
-+ struct ext3_buddy *e3b)
++static void ext3_mb_complex_scan_group(struct ext3_allocation_context *ac,
++ struct ext3_buddy *e3b)
+{
+ struct super_block *sb = ac->ac_sb;
+ void *bitmap = EXT3_MB_BITMAP(e3b);
+ struct ext3_free_extent ex;
+ int i, free;
+
-+ free = e3b->bd_bd->bb_free;
++ free = e3b->bd_info->bb_free;
+ J_ASSERT(free > 0);
+
-+ i = e3b->bd_bd->bb_first_free;
++ i = e3b->bd_info->bb_first_free;
+
-+ while (free && ac->ac_status != AC_STATUS_FOUND) {
-+ i = ext3_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i);
++ while (free && ac->ac_status == AC_STATUS_CONTINUE) {
++ i = mb_find_next_zero_bit(bitmap, sb->s_blocksize * 8, i);
+ if (i >= sb->s_blocksize * 8) {
+ J_ASSERT(free == 0);
+ break;
+static int ext3_mb_good_group(struct ext3_allocation_context *ac,
+ int group, int cr)
+{
-+ int free;
++ struct ext3_sb_info *sbi = EXT3_SB(ac->ac_sb);
++ struct ext3_group_info *grp = sbi->s_group_info[group];
++ unsigned free, fragments, i, bits;
+
-+ J_ASSERT(cr >= 0 && cr < 3);
++ J_ASSERT(cr >= 0 && cr < 4);
++ J_ASSERT(!EXT3_MB_GRP_NEED_INIT(grp));
+
-+ free = EXT3_SB(ac->ac_sb)->s_buddy_blocks[group]->bb_free;
++ free = grp->bb_free;
++ fragments = grp->bb_fragments;
+ if (free == 0)
+ return 0;
++ if (fragments == 0)
++ return 0;
+
-+ if (cr == 0) {
-+ if (free >= ac->ac_g_ex.fe_len >> 1)
++ switch (cr) {
++ case 0:
++ J_ASSERT(ac->ac_2order != 0);
++ bits = ac->ac_sb->s_blocksize_bits + 1;
++ for (i = ac->ac_2order; i < bits; i++)
++ if (grp->bb_counters[i] > 0)
++ return 1;
++ break;
++ case 1:
++ if ((free / fragments) >= ac->ac_g_ex.fe_len)
++ return 1;
++ break;
++ case 2:
++ if (free >= ac->ac_g_ex.fe_len)
++ return 1;
++ break;
++ case 3:
+ return 1;
-+ } else if (cr == 1) {
-+ if (free >= ac->ac_g_ex.fe_len >> 2)
-+ return 1;
-+ } else if (cr == 2) {
-+ return 1;
++ default:
++ BUG();
+ }
++
+ return 0;
+}
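++
++/*
++ * In short: cr=0 accepts a group only if the buddy counters prove a
++ * free chunk of order >= ac_2order, cr=1 wants the average fragment
++ * (free/fragments) to cover the goal length, cr=2 is satisfied with
++ * enough free blocks in total, and cr=3 takes any non-empty group.
++ */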
+
+ ac.ac_g_ex.fe_start = block;
+ ac.ac_g_ex.fe_len = *len;
+ ac.ac_flags = flags;
++ ac.ac_2order = 0;
++ ac.ac_criteria = 0;
+
-+ /*
-+ * Sometimes, caller may want to merge even small number
-+ * of blocks to an existing extent
-+ */
++ /* probably, the request is for 2^8+ blocks (1/2/3/... MB) */
++ i = ffs(*len);
++ if (i >= 8) {
++ i--;
++ if ((*len & (~(1 << i))) == 0)
++ ac.ac_2order = i;
++ }
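++ /* ffs() returns the 1-based index of the lowest set bit, so the
++ * test above recognizes power-of-two requests: e.g. *len == 256
++ * gives i == 8 after the decrement, 256 & ~(1 << 8) == 0, and
++ * ac_2order becomes 8, enabling the criteria-0 buddy scan */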
++
++ /* Sometimes, caller may want to merge even small
++ * number of blocks to an existing extent */
+ if (ac.ac_flags & EXT3_MB_HINT_MERGE) {
+ err = ext3_mb_find_by_goal(&ac, &e3b);
+ if (err)
+ goto found;
+ }
+
-+ /*
-+ * FIXME
-+ * If requested chunk is power of 2 length, we can try
-+ * to exploit buddy nature to speed allocation up
-+ */
-+
-+
-+ /*
-+ * Let's just scan groups to find more-less suitable blocks
-+ */
-+ cr = 0;
++ /* Let's just scan groups to find more or less suitable blocks */
++ cr = ac.ac_2order ? 0 : 1;
+repeat:
-+ for (; cr < 3 && ac.ac_status == AC_STATUS_CONTINUE; cr++) {
++ for (; cr < 4 && ac.ac_status == AC_STATUS_CONTINUE; cr++) {
++ ac.ac_criteria = cr;
+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; group++, i++) {
+ if (group == EXT3_SB(sb)->s_groups_count)
+ group = 0;
+
++ if (EXT3_MB_GRP_NEED_INIT(sbi->s_group_info[group])) {
++ /* we need full data about the group
++ * to make a good selection */
++ err = ext3_mb_load_buddy(ac.ac_sb, group, &e3b);
++ if (err)
++ goto out_err;
++ ext3_mb_release_desc(&e3b);
++ }
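++ /* the load above apparently runs ext3_mb_init_cache() for a
++ * fresh group (the find_or_create_page path), filling
++ * bb_counters and bb_fragments from the on-disk bitmap before
++ * the group is judged */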
++
+ /* check whether the group is good for our criteria */
+ if (!ext3_mb_good_group(&ac, group, cr))
+ continue;
+ continue;
+ }
+
-+ ext3_mb_scan_group(&ac, &e3b);
++ ac.ac_groups_scanned++;
++ if (cr == 0)
++ ext3_mb_simple_scan_group(&ac, &e3b);
++ else
++ ext3_mb_complex_scan_group(&ac, &e3b);
++
+ ext3_unlock_group(sb, group);
+
-+ if (ac.ac_status == AC_STATUS_FOUND)
-+ ext3_mb_dirty_buddy(&e3b);
+ ext3_mb_release_desc(&e3b);
+
+ if (err)
+ }
+ }
+
-+ if (ac.ac_status == AC_STATUS_BREAK &&
++ if (ac.ac_b_ex.fe_len > 0 && ac.ac_status != AC_STATUS_FOUND &&
+ !(ac.ac_flags & EXT3_MB_HINT_FIRST)) {
-+ /* We've been searching too long. Let's try to allocate
-+ * the best chunk we've found so far. */
-+ if (ac.ac_g_ex.fe_len >= 128 &&
-+ ac.ac_b_ex.fe_len < ac.ac_g_ex.fe_len / 4)
-+ ext3_warning(inode->i_sb, __FUNCTION__,
-+ "too long searching: got %d want %d\n",
-+ ac.ac_b_ex.fe_len, ac.ac_g_ex.fe_len);
++ /*
++ * We've been searching too long. Let's try to allocate
++ * the best chunk we've found so far
++ */
++
++ /*if (ac.ac_found > ext3_mb_max_to_scan)
++ printk(KERN_ERR "EXT3-fs: too long searching at "
++ "%u (%d/%d)\n", cr, ac.ac_b_ex.fe_len,
++ ac.ac_g_ex.fe_len);*/
+ ext3_mb_try_best_found(&ac, &e3b);
+ if (ac.ac_status != AC_STATUS_FOUND) {
+ /*
+ * The only thing we can do is just take the first
+ * found block(s)
+ */
-+ mb_debug(KERN_ERR "EXT3-fs: and someone won our chunk\n");
++ printk(KERN_ERR "EXT3-fs: and someone won our chunk\n");
+ ac.ac_b_ex.fe_group = 0;
+ ac.ac_b_ex.fe_start = 0;
+ ac.ac_b_ex.fe_len = 0;
+ ac.ac_status = AC_STATUS_CONTINUE;
+ ac.ac_flags |= EXT3_MB_HINT_FIRST;
-+ cr = 2;
++ cr = 3;
+ goto repeat;
+ }
+ }
+ printk("EXT3-fs: groups: ");
+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++)
+ printk("%d: %d ", i,
-+ sbi->s_buddy_blocks[i]->bb_free);
++ sbi->s_group_info[i]->bb_free);
+ printk("\n");
+#endif
+ goto out;
+ ext3_error(sb, "ext3_new_block",
+ "Allocating block in system zone - "
+ "block = %u", block);
-+ if (unlikely(ext3_mb_aggressive)) {
-+ for (i = 0; i < ac.ac_b_ex.fe_len; i++)
-+ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i,
-+ bitmap_bh->b_data));
-+ }
-+
++#ifdef AGGRESSIVE_CHECK
++ for (i = 0; i < ac.ac_b_ex.fe_len; i++)
++ J_ASSERT(!mb_test_bit(ac.ac_b_ex.fe_start + i, bitmap_bh->b_data));
++#endif
+ mb_set_bits(bitmap_bh->b_data, ac.ac_b_ex.fe_start, ac.ac_b_ex.fe_len);
+
+ spin_lock(sb_bgl_lock(sbi, ac.ac_b_ex.fe_group));
+ * path only, here is single block always */
+ ext3_mb_release_blocks(sb, 1);
+ }
-+
-+ if ((ext3_mb_stats) && (ac.ac_g_ex.fe_len > 1)) {
-+ spin_lock(&sbi->s_bal_lock);
-+ sbi->s_bal_reqs++;
-+ sbi->s_bal_allocated += *len;
++
++ if (unlikely(ext3_mb_stats) && ac.ac_g_ex.fe_len > 1) {
++ atomic_inc(&sbi->s_bal_reqs);
++ atomic_add(*len, &sbi->s_bal_allocated);
+ if (*len >= ac.ac_g_ex.fe_len)
-+ sbi->s_bal_success++;
-+ sbi->s_bal_ex_scanned += ac.ac_found;
++ atomic_inc(&sbi->s_bal_success);
++ atomic_add(ac.ac_found, &sbi->s_bal_ex_scanned);
+ if (ac.ac_g_ex.fe_start == ac.ac_b_ex.fe_start &&
+ ac.ac_g_ex.fe_group == ac.ac_b_ex.fe_group)
-+ sbi->s_bal_goals++;
++ atomic_inc(&sbi->s_bal_goals);
+ if (ac.ac_found > ext3_mb_max_to_scan)
-+ sbi->s_bal_breaks++;
-+ spin_unlock(&sbi->s_bal_lock);
++ atomic_inc(&sbi->s_bal_breaks);
+ }
+
++ ext3_mb_store_history(sb, &ac);
++
+ return block;
+}
++EXPORT_SYMBOL(ext3_mb_new_blocks);
++
++#ifdef EXT3_MB_HISTORY
++struct ext3_mb_proc_session {
++ struct ext3_mb_history *history;
++ struct super_block *sb;
++ int start;
++ int max;
++};
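++
++/*
++ * Each /proc reader works on a private snapshot of the history ring:
++ * "start" records where the ring wrapped at open time, so the
++ * iteration below replays the records oldest-first and stops after a
++ * single lap.
++ */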
+
-+int ext3_mb_get_descr_loc(struct ext3_buddy *e3b, struct buffer_head **bh,
-+ struct ext3_mb_group_descr **grp)
++static void *ext3_mb_history_skip_empty(struct ext3_mb_proc_session *s,
++ struct ext3_mb_history *hs,
++ int first)
+{
-+ struct super_block *sb = e3b->bd_sb;
-+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ int descr_per_block, err, offset;
-+ struct ext3_mb_grp_header *hdr;
-+ unsigned long block;
-+
-+ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header))
-+ / sizeof(struct ext3_mb_group_descr);
-+ block = e3b->bd_group / descr_per_block;
-+ *bh = ext3_bread(NULL, sbi->s_buddy, block, 0, &err);
-+ if (*bh == NULL) {
-+ printk(KERN_ERR "EXT3-fs: cant getblk descr for group %d: %d\n",
-+ e3b->bd_group, err);
-+ return err;
-+ }
++ if (hs == s->history + s->max)
++ hs = s->history;
++ if (!first && hs == s->history + s->start)
++ return NULL;
++ while (hs->goal.fe_len == 0) {
++ hs++;
++ if (hs == s->history + s->max)
++ hs = s->history;
++ if (hs == s->history + s->start)
++ return NULL;
++ }
++ return hs;
++}
+
-+ hdr = (struct ext3_mb_grp_header *) (*bh)->b_data;
-+ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) {
-+ printk(KERN_ERR "EXT3-fs: invalid magic in group %d!\n",
-+ e3b->bd_group);
-+ brelse(*bh);
-+ *bh = NULL;
-+ return -EIO;
-+ }
++static void *ext3_mb_seq_history_start(struct seq_file *seq, loff_t *pos)
++{
++ struct ext3_mb_proc_session *s = seq->private;
++ struct ext3_mb_history *hs;
++ int l = *pos;
++
++ if (l == 0)
++ return SEQ_START_TOKEN;
++ hs = ext3_mb_history_skip_empty(s, s->history + s->start, 1);
++ if (!hs)
++ return NULL;
++ while (--l && (hs = ext3_mb_history_skip_empty(s, ++hs, 0)) != NULL);
++ return hs;
++}
+
-+ offset = e3b->bd_group % descr_per_block
-+ * sizeof(struct ext3_mb_group_descr)
-+ + sizeof(struct ext3_mb_grp_header);
-+ *grp = (struct ext3_mb_group_descr *) ((*bh)->b_data + offset);
++static void *ext3_mb_seq_history_next(struct seq_file *seq, void *v, loff_t *pos)
++{
++ struct ext3_mb_proc_session *s = seq->private;
++ struct ext3_mb_history *hs = v;
+
-+ return 0;
++ ++*pos;
++ if (v == SEQ_START_TOKEN)
++ return ext3_mb_history_skip_empty(s, s->history + s->start, 1);
++ else
++ return ext3_mb_history_skip_empty(s, ++hs, 0);
+}
+
-+int ext3_mb_load_descr(struct ext3_buddy *e3b)
++static int ext3_mb_seq_history_show(struct seq_file *seq, void *v)
+{
-+ struct ext3_mb_group_descr *grp;
-+ struct ext3_group_desc *gdp;
-+ struct buffer_head *bh;
-+ int err, i;
-+
-+ err = ext3_mb_get_descr_loc(e3b, &bh, &grp);
-+ if (err)
-+ return err;
-+
-+ e3b->bd_bd->bb_first_free = grp->mgd_first_free;
-+ e3b->bd_bd->bb_free = grp->mgd_free;
-+ for (i = 0; i <= e3b->bd_blkbits + 1; i++) {
-+ J_ASSERT(i < 16);
-+ e3b->bd_bd->bb_counters[i] = grp->mgd_counters[i];
-+ }
-+ brelse(bh);
++ struct ext3_mb_history *hs = v;
++ char buf[20], buf2[20];
+
-+ /* additional checks against old group descriptor */
-+ gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL);
-+ if (!gdp)
-+ return -EIO;
-+ if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) {
-+ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n",
-+ e3b->bd_group, e3b->bd_bd->bb_free,
-+ le16_to_cpu(gdp->bg_free_blocks_count));
-+ return -ENODATA;
++ if (v == SEQ_START_TOKEN) {
++ seq_printf(seq, "%-17s %-17s %-5s %-5s %-2s %-5s %-5s %-6s\n",
++ "goal", "result", "found", "grps", "cr", "merge",
++ "tail", "broken");
++ return 0;
+ }
+
++ sprintf(buf, "%u/%u/%u", hs->goal.fe_group,
++ hs->goal.fe_start, hs->goal.fe_len);
++ sprintf(buf2, "%u/%u/%u", hs->result.fe_group,
++ hs->result.fe_start, hs->result.fe_len);
++ seq_printf(seq, "%-17s %-17s %-5u %-5u %-2u %-5s %-5u %-6u\n", buf,
++ buf2, hs->found, hs->groups, hs->cr,
++ hs->merged ? "M" : "", hs->tail,
++ hs->buddy ? 1 << hs->buddy : 0);
+ return 0;
+}
+
++static void ext3_mb_seq_history_stop(struct seq_file *seq, void *v)
++{
++}
++
++static struct seq_operations ext3_mb_seq_history_ops = {
++ .start = ext3_mb_seq_history_start,
++ .next = ext3_mb_seq_history_next,
++ .stop = ext3_mb_seq_history_stop,
++ .show = ext3_mb_seq_history_show,
++};
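++
++/*
++ * Standard seq_file iterator: _start() returns SEQ_START_TOKEN for
++ * the header line and then positions within the ring, _next()
++ * advances while skipping empty slots, _show() formats one record.
++ */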
+
-+int ext3_mb_update_descr(struct ext3_buddy *e3b)
++static int ext3_mb_seq_history_open(struct inode *inode, struct file *file)
+{
-+ struct ext3_mb_group_descr *grp;
-+ struct ext3_group_desc *gdp;
-+ struct buffer_head *bh;
-+ handle_t *handle;
-+ int err, i;
++ struct super_block *sb = PDE(inode)->data;
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ struct ext3_mb_proc_session *s;
++ int rc, size;
+
-+ /* additional checks against old group descriptor */
-+ gdp = ext3_get_group_desc(e3b->bd_sb, e3b->bd_group, NULL);
-+ if (!gdp)
++ s = kmalloc(sizeof(*s), GFP_KERNEL);
++ if (s == NULL)
++ return -EIO;
++ size = sizeof(struct ext3_mb_history) * sbi->s_mb_history_max;
++ s->history = kmalloc(size, GFP_KERNEL);
++ if (s->history == NULL) {
++ kfree(s);
+ return -EIO;
-+ if (e3b->bd_bd->bb_free != le16_to_cpu(gdp->bg_free_blocks_count)) {
-+ printk(KERN_ERR "EXT3-fs: mbgroup %d corrupted (%d != %d)\n",
-+ e3b->bd_group, e3b->bd_bd->bb_free,
-+ le16_to_cpu(gdp->bg_free_blocks_count));
-+ return -ENODATA;
+ }
+
-+ err = ext3_mb_get_descr_loc(e3b, &bh, &grp);
-+ if (err)
-+ return err;
++ spin_lock(&sbi->s_mb_history_lock);
++ memcpy(s->history, sbi->s_mb_history, size);
++ s->max = sbi->s_mb_history_max;
++ s->start = sbi->s_mb_history_cur % s->max;
++ spin_unlock(&sbi->s_mb_history_lock);
+
-+ handle = ext3_journal_start_sb(e3b->bd_sb, 1);
-+ if (IS_ERR(handle)) {
-+ err = PTR_ERR(handle);
-+ handle = NULL;
-+ goto out;
++ rc = seq_open(file, &ext3_mb_seq_history_ops);
++ if (rc == 0) {
++ struct seq_file *m = (struct seq_file *)file->private_data;
++ m->private = s;
++ } else {
++ kfree(s->history);
++ kfree(s);
+ }
++ return rc;
+
-+ err = ext3_journal_get_write_access(handle, bh);
-+ if (err)
-+ goto out;
-+ grp->mgd_first_free = e3b->bd_bd->bb_first_free;
-+ grp->mgd_free = e3b->bd_bd->bb_free;
-+ for (i = 0; i <= e3b->bd_blkbits + 1; i++) {
-+ J_ASSERT(i < 16);
-+ grp->mgd_counters[i] = e3b->bd_bd->bb_counters[i];
-+ }
-+ err = ext3_journal_dirty_metadata(handle, bh);
-+ if (err)
-+ goto out;
-+ err = 0;
-+out:
-+ brelse(bh);
-+ if (handle)
-+ ext3_journal_stop(handle);
-+ return err;
+}
+
-+int ext3_mb_generate_buddy(struct ext3_buddy *e3b)
++static int ext3_mb_seq_history_release(struct inode *inode, struct file *file)
+{
-+ struct super_block *sb = e3b->bd_sb;
-+ struct buffer_head *bh;
-+ int i, count = 0;
++ struct seq_file *seq = (struct seq_file *)file->private_data;
++ struct ext3_mb_proc_session *s = seq->private;
++ kfree(s->history);
++ kfree(s);
++ return seq_release(inode, file);
++}
++
++static struct file_operations ext3_mb_seq_history_fops = {
++ .owner = THIS_MODULE,
++ .open = ext3_mb_seq_history_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = ext3_mb_seq_history_release,
++};
+
-+ mb_debug("generate buddy for group %d\n", e3b->bd_group);
-+ memset(e3b->bd_bh->b_data, 0xff, sb->s_blocksize);
-+ memset(e3b->bd_bh2->b_data, 0xff, sb->s_blocksize);
++static void ext3_mb_history_release(struct super_block *sb)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ char name[64];
+
-+ bh = read_block_bitmap(sb, e3b->bd_group);
-+ if (bh == NULL)
-+ return -EIO;
++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name));
++ remove_proc_entry("mb_history", sbi->s_mb_proc);
++ remove_proc_entry(name, proc_root_ext3);
++
++ if (sbi->s_mb_history)
++ kfree(sbi->s_mb_history);
++}
+
-+ /* mb_free_blocks will set real free */
-+ e3b->bd_bd->bb_free = 0;
-+ e3b->bd_bd->bb_first_free = 1 << 15;
-+ /*
-+ * if change bb_counters size, don't forget about
-+ * ext3_mb_init_backend() -bzzz
-+ */
-+ memset(e3b->bd_bd->bb_counters, 0,
-+ sizeof(unsigned) * (sb->s_blocksize_bits + 2));
++static void ext3_mb_history_init(struct super_block *sb)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ char name[64];
++ int i;
+
-+ /* loop over the blocks, and create buddies for free ones */
-+ for (i = 0; i < sb->s_blocksize * 8; i++) {
-+ if (!mb_test_bit(i, (void *) bh->b_data)) {
-+ mb_free_blocks(e3b, i, 1);
-+ count++;
++ snprintf(name, sizeof(name) - 1, "%s", bdevname(sb->s_bdev, name));
++ sbi->s_mb_proc = proc_mkdir(name, proc_root_ext3);
++ if (sbi->s_mb_proc != NULL) {
++ struct proc_dir_entry *p;
++ p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc);
++ if (p) {
++ p->proc_fops = &ext3_mb_seq_history_fops;
++ p->data = sb;
+ }
+ }
-+ brelse(bh);
-+ mb_check_buddy(e3b);
-+ ext3_mb_dirty_buddy(e3b);
+
-+ return 0;
++ sbi->s_mb_history_max = 1000;
++ sbi->s_mb_history_cur = 0;
++ spin_lock_init(&sbi->s_mb_history_lock);
++ i = sbi->s_mb_history_max * sizeof(struct ext3_mb_history);
++ sbi->s_mb_history = kmalloc(i, GFP_KERNEL);
++ if (sbi->s_mb_history != NULL)
++ memset(sbi->s_mb_history, 0, i);
++ /* if we can't allocate history, then we simply won't use it */
+}
+
-+EXPORT_SYMBOL(ext3_mb_new_blocks);
++static void
++ext3_mb_store_history(struct super_block *sb, struct ext3_allocation_context *ac)
++{
++ struct ext3_sb_info *sbi = EXT3_SB(sb);
++ struct ext3_mb_history h;
++
++ if (unlikely(sbi->s_mb_history == NULL))
++ return;
++
++ h.goal = ac->ac_g_ex;
++ h.result = ac->ac_b_ex;
++ h.found = ac->ac_found;
++ h.cr = ac->ac_criteria;
++ h.groups = ac->ac_groups_scanned;
++ h.tail = ac->ac_tail;
++ h.buddy = ac->ac_buddy;
++ h.merged = 0;
++ if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
++ ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
++ h.merged = 1;
++
++ spin_lock(&sbi->s_mb_history_lock);
++ memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h));
++ if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max)
++ sbi->s_mb_history_cur = 0;
++ spin_unlock(&sbi->s_mb_history_lock);
++}
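++
++/*
++ * One fixed-size record is appended per allocation under
++ * s_mb_history_lock; the cursor wraps at s_mb_history_max, so
++ * /proc/fs/ext3/<dev>/mb_history shows the most recent allocations
++ * (goal vs. result, extents scanned, criteria, tail and buddy order).
++ */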
+
-+#define MB_CREDITS \
-+ (EXT3_DATA_TRANS_BLOCKS + 3 + EXT3_INDEX_EXTRA_TRANS_BLOCKS + \
-+ 2 * EXT3_SINGLEDATA_TRANS_BLOCKS)
++#else
++#define ext3_mb_history_release(sb)
++#define ext3_mb_history_init(sb)
++#endif
+
-+int ext3_mb_init_backend(struct super_block *sb, int *created)
++int ext3_mb_init_backend(struct super_block *sb)
+{
-+ int err, i, len, descr_per_block, buddy_offset, size;
-+ struct inode *root = sb->s_root->d_inode;
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ struct ext3_mb_grp_header *hdr;
-+ struct buffer_head *bh = NULL;
-+ unsigned long block;
-+ struct dentry *db;
-+ handle_t *handle;
-+ tid_t target;
-+
-+ *created = 0;
++ int i, len;
++
+ len = sizeof(struct ext3_group_info *) * sbi->s_groups_count;
-+ sbi->s_buddy_blocks = kmalloc(len, GFP_KERNEL);
-+ if (sbi->s_buddy_blocks == NULL) {
-+ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n");
++ sbi->s_group_info = kmalloc(len, GFP_KERNEL);
++ if (sbi->s_group_info == NULL) {
++ printk(KERN_ERR "EXT3-fs: can't allocate mem for buddy\n");
+ return -ENOMEM;
+ }
-+ memset(sbi->s_buddy_blocks, 0, len);
-+ sbi->s_buddy = NULL;
-+
-+ down(&root->i_sem);
-+ len = strlen(EXT3_BUDDY_FILE);
-+ db = lookup_one_len(EXT3_BUDDY_FILE, sb->s_root, len);
-+ if (IS_ERR(db)) {
-+ err = PTR_ERR(db);
-+ printk(KERN_ERR "EXT3-fs: cant lookup buddy: %d\n", err);
-+ up(&root->i_sem);
-+ goto out;
-+ }
++ memset(sbi->s_group_info, 0, len);
+
-+ if (db->d_inode == NULL) {
-+ err = ext3_create(root, db, S_IFREG, NULL);
-+ if (err) {
-+ printk(KERN_ERR "EXT3-fs: cant create buddy: %d\n", err);
-+ up(&root->i_sem);
-+ goto out;
-+ }
-+ db->d_inode->i_flags |= S_IMMUTABLE | S_NOATIME;
-+ *created = 1;
-+ mb_debug("no buddy file, regenerate\n");
-+ }
-+ up(&root->i_sem);
-+ sbi->s_buddy = igrab(db->d_inode);
-+
-+ /* calculate needed size */
-+ descr_per_block = (sb->s_blocksize - sizeof(struct ext3_mb_grp_header))
-+ / sizeof(struct ext3_mb_group_descr);
-+ buddy_offset = (sbi->s_groups_count + descr_per_block - 1)
-+ / descr_per_block;
-+ len = sbi->s_groups_count * sb->s_blocksize * 2 +
-+ buddy_offset * sb->s_blocksize;
-+ if (len != i_size_read(sbi->s_buddy)) {
-+ if (*created == 0)
-+ printk("EXT3-fs: wrong i_size (%u != %u), regenerate\n",
-+ (unsigned) len,
-+ (unsigned) i_size_read(sbi->s_buddy));
-+ *created = 1;
-+ }
-+
-+ /* read/create mb group descriptors */
-+ for (i = 0; i < buddy_offset; i++) {
-+ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS);
-+ if (IS_ERR(handle)) {
-+ printk(KERN_ERR "EXT3-fs: cant start transaction\n");
-+ err = PTR_ERR(handle);
-+ goto err_out;
-+ }
-+
-+ bh = ext3_bread(handle, sbi->s_buddy, i, 1, &err);
-+ if (bh == NULL) {
-+ printk(KERN_ERR "EXT3-fs: cant getblk grp: %d\n", err);
-+ goto err_out;
-+ }
-+ hdr = (struct ext3_mb_grp_header *) bh->b_data;
-+ if (hdr->mh_magic != EXT3_MB_MAGIC_V1) {
-+ err = ext3_journal_get_write_access(handle, bh);
-+ if (err)
-+ goto err_out;
-+ if (*created == 0)
-+ printk(KERN_ERR
-+ "EXT3-fs: invalid header 0x%x in %d,"
-+ "regenerate\n", hdr->mh_magic, i);
-+ *created = 1;
-+ hdr->mh_magic = EXT3_MB_MAGIC_V1;
-+ err = ext3_journal_dirty_metadata(handle, bh);
-+ if (err)
-+ goto err_out;
-+ }
-+ brelse(bh);
-+ ext3_journal_stop(handle);
++ sbi->s_buddy_cache = new_inode(sb);
++ if (sbi->s_buddy_cache == NULL) {
++ printk(KERN_ERR "EXT3-fs: can't get new inode\n");
++ kfree(sbi->s_group_info);
++ return -ENOMEM;
+ }
+
+ /*
-+ * if change bb_counters size, don't forget about ext3_mb_generate_buddy()
++ * calculate needed size. if change bb_counters size,
++ * don't forget about ext3_mb_generate_buddy()
+ */
-+ len = sizeof(struct ext3_buddy_group_blocks);
-+ len += sizeof(unsigned) * (sb->s_blocksize_bits + 2);
++ len = sizeof(struct ext3_group_info);
++ len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
+ for (i = 0; i < sbi->s_groups_count; i++) {
++ struct ext3_group_desc * desc;
+
-+ sbi->s_buddy_blocks[i] = kmalloc(len, GFP_KERNEL);
-+ if (sbi->s_buddy_blocks[i] == NULL) {
++ sbi->s_group_info[i] = kmalloc(len, GFP_KERNEL);
++ if (sbi->s_group_info[i] == NULL) {
+ printk(KERN_ERR "EXT3-fs: cant allocate mem for buddy\n");
-+ err = -ENOMEM;
-+ goto out2;
-+ }
-+ memset(sbi->s_buddy_blocks[i], 0, len);
-+
-+ handle = ext3_journal_start(sbi->s_buddy, MB_CREDITS);
-+ if (IS_ERR(handle)) {
-+ printk(KERN_ERR "EXT3-fs: cant start transaction\n");
-+ err = PTR_ERR(handle);
-+ goto out2;
-+ }
-+
-+ /* allocate block for bitmap */
-+ block = buddy_offset + i * 2;
-+ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err);
-+ if (bh == NULL) {
-+ printk(KERN_ERR "EXT3-fs: cant getblk bitmap: %d\n", err);
-+ goto out2;
-+ }
-+ sbi->s_buddy_blocks[i]->bb_bitmap = bh->b_blocknr;
-+ brelse(bh);
-+
-+ /* allocate block for buddy */
-+ block = buddy_offset + i * 2 + 1;
-+ bh = ext3_getblk(handle, sbi->s_buddy, block, 1, &err);
-+ if (bh == NULL) {
-+ printk(KERN_ERR "EXT3-fs: cant getblk for buddy: %d\n", err);
-+ goto out2;
++ goto err_out;
+ }
-+ sbi->s_buddy_blocks[i]->bb_buddy = bh->b_blocknr;
-+ brelse(bh);
-+
-+ size = (block + 1) << sbi->s_buddy->i_blkbits;
-+ if (size > sbi->s_buddy->i_size) {
-+ *created = 1;
-+ EXT3_I(sbi->s_buddy)->i_disksize = size;
-+ i_size_write(sbi->s_buddy, size);
-+ mark_inode_dirty(sbi->s_buddy);
++ desc = ext3_get_group_desc(sb, i, NULL);
++ if (desc == NULL) {
++ printk(KERN_ERR "EXT3-fs: cant read descriptor %u\n", i);
++ goto err_out;
+ }
-+ ext3_journal_stop(handle);
-+
-+ spin_lock_init(&sbi->s_buddy_blocks[i]->bb_lock);
-+ sbi->s_buddy_blocks[i]->bb_md_cur = NULL;
-+ sbi->s_buddy_blocks[i]->bb_tid = 0;
++ memset(sbi->s_group_info[i], 0, len);
++ set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT,
++ &sbi->s_group_info[i]->bb_state);
++ sbi->s_group_info[i]->bb_free =
++ le16_to_cpu(desc->bg_free_blocks_count);
+ }
+
-+ if (journal_start_commit(sbi->s_journal, &target))
-+ log_wait_commit(sbi->s_journal, target);
-+
-+out2:
-+ dput(db);
-+out:
-+ return err;
++ return 0;
+
+err_out:
-+ return err;
++ while (--i >= 0)
++ kfree(sbi->s_group_info[i]);
++ iput(sbi->s_buddy_cache);
++
++ return -ENOMEM;
+}
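++
++/*
++ * After this point the bitmap and buddy data for every group live as
++ * pages of s_buddy_cache, the in-core inode that ext3_mb_load_buddy()
++ * indexes with find_get_page(); nothing is ever written back, the
++ * cache is simply rebuilt from the on-disk block bitmaps on the next
++ * mount.
++ */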
+
-+int ext3_mb_write_descriptors(struct super_block *sb)
++int ext3_mb_init(struct super_block *sb, int needs_recovery)
+{
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
-+ struct ext3_buddy e3b;
-+ int ret = 0, i, err;
++ struct inode *root = sb->s_root->d_inode;
++ unsigned i, offset, max;
++ struct dentry *dentry;
+
-+ for (i = 0; i < sbi->s_groups_count; i++) {
-+ if (sbi->s_buddy_blocks[i] == NULL)
-+ continue;
++ if (!test_opt(sb, MBALLOC))
++ return 0;
+
-+ err = ext3_mb_load_buddy(sb, i, &e3b);
-+ if (err == 0) {
-+ ext3_mb_update_descr(&e3b);
-+ ext3_mb_release_desc(&e3b);
-+ } else
-+ ret = err;
++ i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);
++
++ sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
++ if (sbi->s_mb_offsets == NULL) {
++ clear_opt(sbi->s_mount_opt, MBALLOC);
++ return -ENOMEM;
+ }
-+ return ret;
++ sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
++ if (sbi->s_mb_maxs == NULL) {
++ clear_opt(sbi->s_mount_opt, MBALLOC);
++ kfree(sbi->s_mb_offsets);
++ return -ENOMEM;
++ }
++
++ /* order 0 is regular bitmap */
++ sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
++ sbi->s_mb_offsets[0] = 0;
++
++ i = 1;
++ offset = 0;
++ max = sb->s_blocksize << 2;
++ do {
++ sbi->s_mb_offsets[i] = offset;
++ sbi->s_mb_maxs[i] = max;
++ offset += 1 << (sb->s_blocksize_bits - i);
++ max = max >> 1;
++ i++;
++ } while (i <= sb->s_blocksize_bits + 1);
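++ /* worked example for 4KB blocks (s_blocksize_bits == 12):
++ * maxs[0] = 32768 bits is the plain bitmap; inside the buddy
++ * block, order 1 starts at offset 0 with 16384 bits, order 2 at
++ * offset 2048 with 8192 bits, order 3 at offset 3072 with 4096
++ * bits, and so on; the halving bitmaps pack into a single block */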
++
++ /* init file for buddy data */
++ if ((i = ext3_mb_init_backend(sb))) {
++ clear_opt(sbi->s_mount_opt, MBALLOC);
++ kfree(sbi->s_mb_offsets);
++ kfree(sbi->s_mb_maxs);
++ return i;
++ }
++
++ spin_lock_init(&sbi->s_reserve_lock);
++ spin_lock_init(&sbi->s_md_lock);
++ INIT_LIST_HEAD(&sbi->s_active_transaction);
++ INIT_LIST_HEAD(&sbi->s_closed_transaction);
++ INIT_LIST_HEAD(&sbi->s_committed_transaction);
++ spin_lock_init(&sbi->s_bal_lock);
++
++ /* remove old on-disk buddy file */
++ down(&root->i_sem);
++ dentry = lookup_one_len(".buddy", sb->s_root, strlen(".buddy"));
++ if (dentry->d_inode != NULL) {
++ i = vfs_unlink(root, dentry);
++ if (i != 0)
++ printk("EXT3-fs: can't remove .buddy file: %d\n", i);
++ }
++ dput(dentry);
++ up(&root->i_sem);
++
++ ext3_mb_history_init(sb);
++
++ printk("EXT3-fs: mballoc enabled\n");
++ return 0;
+}
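++
++/*
++ * Everything above is a no-op unless the filesystem is mounted with
++ * -o mballoc; with the option set, all block allocation and freeing
++ * goes through the buddy cache initialized here.
++ */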
+
+int ext3_mb_release(struct super_block *sb)
+ spin_unlock(&sbi->s_md_lock);
+ ext3_mb_free_committed_blocks(sb);
+
-+ if (sbi->s_buddy_blocks) {
-+ ext3_mb_write_descriptors(sb);
++ if (sbi->s_group_info) {
+ for (i = 0; i < sbi->s_groups_count; i++) {
-+ if (sbi->s_buddy_blocks[i] == NULL)
++ if (sbi->s_group_info[i] == NULL)
+ continue;
-+ kfree(sbi->s_buddy_blocks[i]);
++ kfree(sbi->s_group_info[i]);
+ }
-+ kfree(sbi->s_buddy_blocks);
-+ }
-+ if (sbi->s_buddy)
-+ iput(sbi->s_buddy);
++ kfree(sbi->s_group_info);
++ }
++ if (sbi->s_mb_offsets)
++ kfree(sbi->s_mb_offsets);
++ if (sbi->s_mb_maxs)
++ kfree(sbi->s_mb_maxs);
++ if (sbi->s_buddy_cache)
++ iput(sbi->s_buddy_cache);
+ if (sbi->s_blocks_reserved)
+ printk("ext3-fs: %ld blocks being reserved at umount!\n",
+ sbi->s_blocks_reserved);
+ if (ext3_mb_stats) {
-+ printk("EXT3-fs: mballoc: %lu blocks %lu reqs "
-+ "(%lu success)\n", sbi->s_bal_allocated,
-+ sbi->s_bal_reqs, sbi->s_bal_success);
-+ printk("EXT3-fs: mballoc: %lu extents scanned, "
-+ "%lu goal hits, %lu breaks\n", sbi->s_bal_ex_scanned,
-+ sbi->s_bal_goals, sbi->s_bal_breaks);
-+ }
-+
-+ return 0;
-+}
-+
-+int ext3_mb_init(struct super_block *sb, int needs_recovery)
-+{
-+ struct ext3_buddy e3b;
-+ int i, err, created;
-+
-+ if (!test_opt(sb, MBALLOC))
-+ return 0;
-+
-+ /* init file for buddy data */
-+ clear_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC);
-+ if ((err = ext3_mb_init_backend(sb, &created)))
-+ return err;
-+
-+repeat:
-+ for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
-+ err = ext3_mb_load_buddy(sb, i, &e3b);
-+ if (err) {
-+ /* FIXME: release backend */
-+ return err;
-+ }
-+ if (created || needs_recovery)
-+ ext3_mb_generate_buddy(&e3b);
-+ else
-+ err = ext3_mb_load_descr(&e3b);
-+ ext3_mb_release_desc(&e3b);
-+ if (err == -ENODATA) {
-+ created = 1;
-+ goto repeat;
-+ }
-+ }
-+ if (created || needs_recovery)
-+ printk(KERN_ERR "EXT3-fs: generated buddies for %lu groups\n",
-+ EXT3_SB(sb)->s_groups_count);
-+ spin_lock_init(&EXT3_SB(sb)->s_reserve_lock);
-+ spin_lock_init(&EXT3_SB(sb)->s_md_lock);
-+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_active_transaction);
-+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_closed_transaction);
-+ INIT_LIST_HEAD(&EXT3_SB(sb)->s_committed_transaction);
-+ set_opt(EXT3_SB(sb)->s_mount_opt, MBALLOC);
-+
-+ spin_lock_init(&EXT3_SB(sb)->s_bal_lock);
-+ if (ext3_mb_stats) {
-+ printk("EXT3-fs: mballoc enabled (stats)\n");
-+ } else {
-+ printk("EXT3-fs: mballoc enabled\n");
-+ }
++ printk("EXT3-fs: mballoc: %u blocks %u reqs (%u success)\n",
++ atomic_read(&sbi->s_bal_allocated),
++ atomic_read(&sbi->s_bal_reqs),
++ atomic_read(&sbi->s_bal_success));
++ printk("EXT3-fs: mballoc: %u extents scanned, %u goal hits, "
++ "%u 2^N hits, %u breaks\n",
++ atomic_read(&sbi->s_bal_ex_scanned),
++ atomic_read(&sbi->s_bal_goals),
++ atomic_read(&sbi->s_bal_2orders),
++ atomic_read(&sbi->s_bal_breaks));
++ printk("EXT3-fs: mballoc: %lu generated and it took %Lu\n",
++ sbi->s_mb_buddies_generated++,
++ sbi->s_mb_generation_time);
++ }
++
++ ext3_mb_history_release(sb);
+
+ return 0;
+}
+ mb_debug("\n");
+ ext3_unlock_group(sb, md->group);
+
++ /* balance refcounts from ext3_mb_free_metadata() */
++ page_cache_release(e3b.bd_buddy_page);
++ page_cache_release(e3b.bd_bitmap_page);
++
+ kfree(md);
-+ ext3_mb_dirty_buddy(&e3b);
+ ext3_mb_release_desc(&e3b);
+
+ } while (md);
+ /* new transaction! time to close last one and free blocks for
+ * committed transaction. we know that only one transaction can be
+ * active, so the previous transaction can be being logged and we
-+ * know that transaction before previous is known to be alreade
++ * know that transaction before previous is known to be already
+ * logged. this means that now we may free blocks freed in all
+ * transactions before previous one. hope I'm clear enough ... */
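++
++/*
++ * In list terms: frees land on s_active_transaction, move to
++ * s_closed_transaction when a new transaction starts, and are finally
++ * replayed into the buddy cache by ext3_mb_free_committed_blocks()
++ * once that older commit is safely on disk.
++ */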
+
+int ext3_mb_free_metadata(handle_t *handle, struct ext3_buddy *e3b,
+ int group, int block, int count)
+{
-+ struct ext3_buddy_group_blocks *db = e3b->bd_bd;
++ struct ext3_group_info *db = e3b->bd_info;
+ struct super_block *sb = e3b->bd_sb;
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
+ struct ext3_free_metadata *md;
+ int i;
+
++ J_ASSERT(e3b->bd_bitmap_page != NULL);
++ J_ASSERT(e3b->bd_buddy_page != NULL);
++
+ ext3_lock_group(sb, group);
+ for (i = 0; i < count; i++) {
+ md = db->bb_md_cur;
+ spin_lock(&sbi->s_md_lock);
+ list_add(&md->list, &sbi->s_active_transaction);
+ spin_unlock(&sbi->s_md_lock);
++ /* protect buddy cache from being freed,
++ * otherwise we'll refresh it from
++ * on-disk bitmap and lose not-yet-available
++ * blocks */
++ page_cache_get(e3b->bd_buddy_page);
++ page_cache_get(e3b->bd_bitmap_page);
+ db->bb_md_cur = md;
+ db->bb_tid = handle->h_transaction->t_tid;
+ mb_debug("new md 0x%p for group %u\n",
+ if (err)
+ goto error_return;
+
-+ if (unlikely(ext3_mb_aggressive)) {
++#ifdef AGGRESSIVE_CHECK
++ {
+ int i;
+ for (i = 0; i < count; i++)
+ J_ASSERT(mb_test_bit(bit + i, bitmap_bh->b_data));
+ }
-+
++#endif
+ mb_clear_bits(bitmap_bh->b_data, bit, count);
+
+ /* We dirtied the bitmap block */
+ spin_unlock(sb_bgl_lock(sbi, block_group));
+ percpu_counter_mod(&sbi->s_freeblocks_counter, count);
+
-+ ext3_mb_dirty_buddy(&e3b);
+ ext3_mb_release_desc(&e3b);
+
+ *freed = count;
+ return ret;
+}
+
-+void ext3_free_blocks(handle_t *handle, struct inode *inode,
-+ unsigned long block, unsigned long count, int metadata)
++
++void ext3_free_blocks(handle_t *handle, struct inode * inode,
++ unsigned long block, unsigned long count, int metadata)
+{
++ struct super_block *sb;
+ int freed;
+
-+ if (!test_opt(inode->i_sb, MBALLOC) ||
-+ EXT3_SB(inode->i_sb)->s_buddy_blocks == NULL)
-+ ext3_free_blocks_sb(handle, inode->i_sb, block, count, &freed);
++ sb = inode->i_sb;
++ if (!test_opt(sb, MBALLOC))
++ ext3_free_blocks_sb(handle, sb, block, count, &freed);
+ else
-+ ext3_mb_free_blocks(handle, inode, block,count,metadata,&freed);
-+
++ ext3_mb_free_blocks(handle, inode, block, count, metadata, &freed);
+ if (freed)
+ DQUOT_FREE_BLOCK(inode, freed);
+ return;
+}
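++
++/*
++ * The metadata flag matters: metadata blocks freed here are parked on
++ * the transaction lists by ext3_mb_free_metadata() and only returned
++ * to the buddy cache after commit, so they cannot be reallocated and
++ * overwritten while the journal may still need the old contents.
++ */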
-Index: linux-2.6.5-7.201/fs/ext3/proc.c
-===================================================================
---- linux-2.6.5-7.201.orig/fs/ext3/proc.c 2005-10-13 19:40:57.851699336 +0400
-+++ linux-2.6.5-7.201/fs/ext3/proc.c 2005-10-14 09:02:36.000000000 +0400
-@@ -0,0 +1,195 @@
-+#include <linux/config.h>
-+#include <linux/fs.h>
-+#include <linux/init.h>
-+#include <linux/module.h>
-+#include <linux/kernel.h>
-+#include <linux/jbd.h>
-+#include <linux/ext3_fs.h>
-+#include <linux/ext3_jbd.h>
-+#include <linux/proc_fs.h>
-+#include <linux/errno.h>
-+#include <asm/uaccess.h>
-+
+
+#define EXT3_ROOT "ext3"
-+#define EXT3_MB_AGGRESSIVE_NAME "mb_aggressive"
+#define EXT3_MB_STATS_NAME "mb_stats"
+#define EXT3_MB_MAX_TO_SCAN_NAME "mb_max_to_scan"
++#define EXT3_MB_MIN_TO_SCAN_NAME "mb_min_to_scan"
+
-+
-+static struct proc_dir_entry *proc_root_ext3;
-+
-+
-+static int ext3_mb_aggressive_read(char *page, char **start, off_t off,
-+ int count, int *eof, void *data)
++static int ext3_mb_stats_read(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
+{
+ int len;
+
+ if (off != 0)
+ return 0;
+
-+ len = sprintf(page, "%ld\n", ext3_mb_aggressive);
++ len = sprintf(page, "%ld\n", ext3_mb_stats);
+ *start = page;
+ return len;
+}
+
-+static int ext3_mb_aggressive_write(struct file *file, const char *buffer,
-+ unsigned long count, void *data)
++static int ext3_mb_stats_write(struct file *file, const char *buffer,
++ unsigned long count, void *data)
+{
+ char str[32];
+
+ if (count >= sizeof(str)) {
+ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n",
-+ EXT3_MB_AGGRESSIVE_NAME, sizeof(str));
++ EXT3_MB_STATS_NAME, (int)sizeof(str));
+ return -EOVERFLOW;
+ }
+
+ return -EFAULT;
+
+ /* Convert: zero -> 0, any non-zero value -> 1 */
-+ ext3_mb_aggressive = (simple_strtol(str, NULL, 0) != 0);
++ ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0);
+ return count;
+}
+
-+static int ext3_mb_stats_read(char *page, char **start, off_t off,
-+ int count, int *eof, void *data)
++static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
+{
+ int len;
+
+ if (off != 0)
+ return 0;
+
-+ len = sprintf(page, "%ld\n", ext3_mb_stats);
++ len = sprintf(page, "%ld\n", ext3_mb_max_to_scan);
+ *start = page;
+ return len;
+}
+
-+static int ext3_mb_stats_write(struct file *file, const char *buffer,
-+ unsigned long count, void *data)
++static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer,
++ unsigned long count, void *data)
+{
+ char str[32];
++ long value;
+
+ if (count >= sizeof(str)) {
+ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n",
-+ EXT3_MB_STATS_NAME, sizeof(str));
++ EXT3_MB_MAX_TO_SCAN_NAME, (int)sizeof(str));
+ return -EOVERFLOW;
+ }
+
+ return -EFAULT;
+
+ /* Accept only positive integer values */
-+ ext3_mb_stats = (simple_strtol(str, NULL, 0) != 0);
++ value = simple_strtol(str, NULL, 0);
++ if (value <= 0)
++ return -ERANGE;
++
++ ext3_mb_max_to_scan = value;
++
+ return count;
+}
+
-+static int ext3_mb_max_to_scan_read(char *page, char **start, off_t off,
-+ int count, int *eof, void *data)
++static int ext3_mb_min_to_scan_read(char *page, char **start, off_t off,
++ int count, int *eof, void *data)
+{
+ int len;
+
+ if (off != 0)
+ return 0;
+
-+ len = sprintf(page, "%ld\n", ext3_mb_max_to_scan);
++ len = sprintf(page, "%ld\n", ext3_mb_min_to_scan);
+ *start = page;
+ return len;
+}
+
-+static int ext3_mb_max_to_scan_write(struct file *file, const char *buffer,
-+ unsigned long count, void *data)
++static int ext3_mb_min_to_scan_write(struct file *file, const char *buffer,
++ unsigned long count, void *data)
+{
+ char str[32];
+ long value;
+
+ if (count >= sizeof(str)) {
+ printk(KERN_ERR "EXT3: %s string to long, max %u bytes\n",
-+ EXT3_MB_MAX_TO_SCAN_NAME, sizeof(str));
++ EXT3_MB_MIN_TO_SCAN_NAME, (int)sizeof(str));
+ return -EOVERFLOW;
+ }
+
+
+ /* Accept only positive integer values */
+ value = simple_strtol(str, NULL, 0);
-+ if (value <= 0)
++ if (value <= 0)
+ return -ERANGE;
+
-+ ext3_mb_max_to_scan = value;
++ ext3_mb_min_to_scan = value;
+
+ return count;
+}
+
+int __init init_ext3_proc(void)
+{
-+ struct proc_dir_entry *proc_ext3_mb_aggressive;
+ struct proc_dir_entry *proc_ext3_mb_stats;
+ struct proc_dir_entry *proc_ext3_mb_max_to_scan;
++ struct proc_dir_entry *proc_ext3_mb_min_to_scan;
+
+ proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs);
+ if (proc_root_ext3 == NULL) {
-+ printk(KERN_ERR "EXT3: Unable to create %s\n", EXT3_ROOT);
-+ return -EIO;
-+ }
-+
-+ /* Initialize EXT3_MB_AGGRESSIVE_NAME */
-+ proc_ext3_mb_aggressive = create_proc_entry(EXT3_MB_AGGRESSIVE_NAME,
-+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
-+ if (proc_ext3_mb_aggressive == NULL) {
-+ printk(KERN_ERR "EXT3: Unable to create %s\n",
-+ EXT3_MB_AGGRESSIVE_NAME);
-+ remove_proc_entry(EXT3_ROOT, proc_root_fs);
++ printk(KERN_ERR "EXT3: Unable to create %s\n", EXT3_ROOT);
+ return -EIO;
+ }
+
-+ proc_ext3_mb_aggressive->data = NULL;
-+ proc_ext3_mb_aggressive->read_proc = ext3_mb_aggressive_read;
-+ proc_ext3_mb_aggressive->write_proc = ext3_mb_aggressive_write;
-+
+ /* Initialize EXT3_MB_STATS_NAME */
+ proc_ext3_mb_stats = create_proc_entry(EXT3_MB_STATS_NAME,
-+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
+ if (proc_ext3_mb_stats == NULL) {
-+ printk(KERN_ERR "EXT3: Unable to create %s\n",
-+ EXT3_MB_STATS_NAME);
-+ remove_proc_entry(EXT3_MB_AGGRESSIVE_NAME, proc_root_ext3);
++ printk(KERN_ERR "EXT3: Unable to create %s\n",
++ EXT3_MB_STATS_NAME);
+ remove_proc_entry(EXT3_ROOT, proc_root_fs);
+ return -EIO;
+ }
+
+ /* Initialize EXT3_MB_MAX_TO_SCAN_NAME */
+ proc_ext3_mb_max_to_scan = create_proc_entry(
-+ EXT3_MB_MAX_TO_SCAN_NAME,
-+ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++ EXT3_MB_MAX_TO_SCAN_NAME,
++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
+ if (proc_ext3_mb_max_to_scan == NULL) {
-+ printk(KERN_ERR "EXT3: Unable to create %s\n",
-+ EXT3_MB_STATS_NAME);
++ printk(KERN_ERR "EXT3: Unable to create %s\n",
++ EXT3_MB_MAX_TO_SCAN_NAME);
+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
-+ remove_proc_entry(EXT3_MB_AGGRESSIVE_NAME, proc_root_ext3);
+ remove_proc_entry(EXT3_ROOT, proc_root_fs);
+ return -EIO;
+ }
+ proc_ext3_mb_max_to_scan->read_proc = ext3_mb_max_to_scan_read;
+ proc_ext3_mb_max_to_scan->write_proc = ext3_mb_max_to_scan_write;
+
++ /* Initialize EXT3_MB_MIN_TO_SCAN_NAME */
++ proc_ext3_mb_min_to_scan = create_proc_entry(
++ EXT3_MB_MIN_TO_SCAN_NAME,
++ S_IFREG | S_IRUGO | S_IWUSR, proc_root_ext3);
++ if (proc_ext3_mb_min_to_scan == NULL) {
++ printk(KERN_ERR "EXT3: Unable to create %s\n",
++ EXT3_MB_MIN_TO_SCAN_NAME);
++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_ROOT, proc_root_fs);
++ return -EIO;
++ }
++
++ proc_ext3_mb_min_to_scan->data = NULL;
++ proc_ext3_mb_min_to_scan->read_proc = ext3_mb_min_to_scan_read;
++ proc_ext3_mb_min_to_scan->write_proc = ext3_mb_min_to_scan_write;
++
+ return 0;
+}
+
+void exit_ext3_proc(void)
+{
-+ remove_proc_entry(EXT3_MB_AGGRESSIVE_NAME, proc_root_ext3);
+ remove_proc_entry(EXT3_MB_STATS_NAME, proc_root_ext3);
+ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, proc_root_ext3);
++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, proc_root_ext3);
+ remove_proc_entry(EXT3_ROOT, proc_root_fs);
+}
-Index: linux-2.6.9/fs/ext3/inode.c
-===================================================================
---- linux-2.6.9.orig/fs/ext3/inode.c 2005-10-14 09:10:12.000000000 +0400
-+++ linux-2.6.9/fs/ext3/inode.c 2005-10-14 09:10:13.000000000 +0400
-@@ -572,7 +572,7 @@
- ext3_journal_forget(handle, branch[i].bh);
- }
- for (i = 0; i < keys; i++)
-- ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
-+ ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1, 1);
- return err;
- }
-
-@@ -673,7 +673,7 @@
- if (err == -EAGAIN)
- for (i = 0; i < num; i++)
- ext3_free_blocks(handle, inode,
-- le32_to_cpu(where[i].key), 1);
-+ le32_to_cpu(where[i].key), 1, 1);
- return err;
- }
-
-@@ -1831,7 +1831,7 @@
- }
- }
-
-- ext3_free_blocks(handle, inode, block_to_free, count);
-+ ext3_free_blocks(handle, inode, block_to_free, count, 1);
- }
-
- /**
-@@ -2004,7 +2004,7 @@
- ext3_journal_test_restart(handle, inode);
- }
-
-- ext3_free_blocks(handle, inode, nr, 1);
-+ ext3_free_blocks(handle, inode, nr, 1, 1);
-
- if (parent_bh) {
- /*
-Index: linux-2.6.9/fs/ext3/super.c
++
+Index: linux-2.6.9-full/fs/ext3/Makefile
===================================================================
---- linux-2.6.9.orig/fs/ext3/super.c 2005-10-14 09:10:12.000000000 +0400
-+++ linux-2.6.9/fs/ext3/super.c 2005-10-14 09:10:31.000000000 +0400
-@@ -394,6 +394,7 @@
- struct ext3_super_block *es = sbi->s_es;
- int i;
-
-+ ext3_mb_release(sb);
- ext3_ext_release(sb);
- ext3_xattr_put_super(sb);
- journal_destroy(sbi->s_journal);
-@@ -590,7 +591,7 @@
- Opt_commit, Opt_journal_update, Opt_journal_inum,
- Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
- Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
-- Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0,
-+ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_mballoc, Opt_mbfactor,
- Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
- Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_extents, Opt_extdebug,
- };
-@@ -644,6 +645,8 @@
- {Opt_iopen_nopriv, "iopen_nopriv"},
- {Opt_extents, "extents"},
- {Opt_extdebug, "extdebug"},
-+ {Opt_mballoc, "mballoc"},
-+ {Opt_mbfactor, "mbfactor=%u"},
- {Opt_barrier, "barrier=%u"},
- {Opt_err, NULL},
- {Opt_resize, "resize"},
-@@ -954,6 +957,16 @@
- case Opt_extdebug:
- set_opt (sbi->s_mount_opt, EXTDEBUG);
- break;
-+ case Opt_mballoc:
-+ set_opt (sbi->s_mount_opt, MBALLOC);
-+ break;
-+ case Opt_mbfactor:
-+ if (match_int(&args[0], &option))
-+ return 0;
-+ if (option < 0)
-+ return 0;
-+ sbi->s_mb_factor = option;
-+ break;
- default:
- printk (KERN_ERR
- "EXT3-fs: Unrecognized mount option \"%s\" "
-@@ -1637,6 +1650,7 @@
- ext3_count_dirs(sb));
-
- ext3_ext_init(sb);
-+ ext3_mb_init(sb, needs_recovery);
-
- return 0;
-
-@@ -2419,7 +2433,13 @@
+--- linux-2.6.9-full.orig/fs/ext3/Makefile 2005-12-16 23:16:41.000000000 +0300
++++ linux-2.6.9-full/fs/ext3/Makefile 2005-12-16 23:16:42.000000000 +0300
+@@ -5,7 +5,8 @@
+ obj-$(CONFIG_EXT3_FS) += ext3.o
- static int __init init_ext3_fs(void)
- {
-- int err = init_ext3_xattr();
-+ int err;
-+
-+ err = init_ext3_proc();
-+ if (err)
-+ return err;
-+
-+ err = init_ext3_xattr();
- if (err)
- return err;
- err = init_inodecache();
-@@ -2441,6 +2461,7 @@
- unregister_filesystem(&ext3_fs_type);
- destroy_inodecache();
- exit_ext3_xattr();
-+ exit_ext3_proc();
- }
+ ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o\
+- ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o
++ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
++ mballoc.o
- int ext3_prep_san_write(struct inode *inode, long *blocks,
+ ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
+ ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o
--- /dev/null
+Index: linux-2.6.7/fs/ext3/namei.c
+===================================================================
+--- linux-2.6.7.orig/fs/ext3/namei.c 2004-06-15 23:19:36.000000000 -0600
++++ linux-2.6.7/fs/ext3/namei.c 2004-08-20 17:48:54.000000000 -0600
+@@ -1596,11 +1596,17 @@ static int ext3_delete_entry (handle_t *
+ static inline void ext3_inc_count(handle_t *handle, struct inode *inode)
+ {
+ inode->i_nlink++;
++ if (is_dx(inode) && inode->i_nlink > 1) {
++ /* limit is 16-bit i_links_count */
++ if (inode->i_nlink >= EXT3_LINK_MAX || inode->i_nlink == 2)
++ inode->i_nlink = 1;
++ }
+ }
+
+ static inline void ext3_dec_count(handle_t *handle, struct inode *inode)
+ {
+- inode->i_nlink--;
++ if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
++ inode->i_nlink--;
+ }
+
+ static int ext3_add_nondir(handle_t *handle,
+@@ -1693,7 +1698,7 @@ static int ext3_mkdir(struct inode * dir
+ struct ext3_dir_entry_2 * de;
+ int err;
+
+- if (dir->i_nlink >= EXT3_LINK_MAX)
++ if (EXT3_DIR_LINK_MAXED(dir))
+ return -EMLINK;
+
+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
+@@ -1715,7 +1720,7 @@ static int ext3_mkdir(struct inode * dir
+ inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+ dir_block = ext3_bread (handle, inode, 0, 1, &err);
+ if (!dir_block) {
+- inode->i_nlink--; /* is this nlink == 0? */
++ ext3_dec_count(handle, inode); /* is this nlink == 0? */
+ ext3_mark_inode_dirty(handle, inode);
+ iput (inode);
+ goto out_stop;
+@@ -1747,7 +1752,7 @@ static int ext3_mkdir(struct inode * dir
+ iput (inode);
+ goto out_stop;
+ }
+- dir->i_nlink++;
++ ext3_inc_count(handle, dir);
+ ext3_update_dx_flag(dir);
+ ext3_mark_inode_dirty(handle, dir);
+ d_instantiate(dentry, inode);
+@@ -2010,10 +2015,10 @@ static int ext3_rmdir (struct inode * di
+ retval = ext3_delete_entry(handle, dir, de, bh);
+ if (retval)
+ goto end_rmdir;
+- if (inode->i_nlink != 2)
+- ext3_warning (inode->i_sb, "ext3_rmdir",
+- "empty directory has nlink!=2 (%d)",
+- inode->i_nlink);
++ if (!EXT3_DIR_LINK_EMPTY(inode))
++ ext3_warning(inode->i_sb, "ext3_rmdir",
++ "empty directory has too many links (%d)",
++ inode->i_nlink);
+ inode->i_version++;
+ inode->i_nlink = 0;
+ /* There's no need to set i_disksize: the fact that i_nlink is
+@@ -2023,7 +2028,7 @@ static int ext3_rmdir (struct inode * di
+ ext3_orphan_add(handle, inode);
+ inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ ext3_mark_inode_dirty(handle, inode);
+- dir->i_nlink--;
++ ext3_dec_count(handle, dir);
+ ext3_update_dx_flag(dir);
+ ext3_mark_inode_dirty(handle, dir);
+
+@@ -2074,7 +2079,7 @@ static int ext3_unlink(struct inode * di
+ dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ ext3_update_dx_flag(dir);
+ ext3_mark_inode_dirty(handle, dir);
+- inode->i_nlink--;
++ ext3_dec_count(handle, inode);
+ if (!inode->i_nlink)
+ ext3_orphan_add(handle, inode);
+ inode->i_ctime = dir->i_ctime;
+@@ -2146,7 +2151,7 @@ static int ext3_link (struct dentry * ol
+ struct inode *inode = old_dentry->d_inode;
+ int err;
+
+- if (inode->i_nlink >= EXT3_LINK_MAX)
++ if (EXT3_DIR_LINK_MAXED(inode))
+ return -EMLINK;
+
+ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
+@@ -2230,8 +2235,8 @@ static int ext3_rename (struct inode * o
+ if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
+ goto end_rename;
+ retval = -EMLINK;
+- if (!new_inode && new_dir!=old_dir &&
+- new_dir->i_nlink >= EXT3_LINK_MAX)
++ if (!new_inode && new_dir != old_dir &&
++ EXT3_DIR_LINK_MAXED(new_dir))
+ goto end_rename;
+ }
+ if (!new_bh) {
+@@ -2288,7 +2293,7 @@ static int ext3_rename (struct inode * o
+ }
+
+ if (new_inode) {
+- new_inode->i_nlink--;
++ ext3_dec_count(handle, new_inode);
+ new_inode->i_ctime = CURRENT_TIME_SEC;
+ }
+ old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
+@@ -2299,11 +2304,13 @@ static int ext3_rename (struct inode * o
+ PARENT_INO(dir_bh->b_data) = le32_to_cpu(new_dir->i_ino);
+ BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
+ ext3_journal_dirty_metadata(handle, dir_bh);
+- old_dir->i_nlink--;
++ ext3_dec_count(handle, old_dir);
+ if (new_inode) {
+- new_inode->i_nlink--;
++ /* checked empty_dir above, can't have another parent,
++ * ext3_dec_count() won't work for many-linked dirs */
++ new_inode->i_nlink = 0;
+ } else {
+- new_dir->i_nlink++;
++ ext3_inc_count(handle, new_dir);
+ ext3_update_dx_flag(new_dir);
+ ext3_mark_inode_dirty(handle, new_dir);
+ }
+--- linux-2.6.7.orig/include/linux/ext3_fs.h 2004-06-15 23:19:36.000000000 -0600
++++ linux-2.6.7/include/linux/ext3_fs.h 2004-08-20 17:41:27.000000000 -0600
+@@ -79,7 +81,7 @@
+ /*
+ * Maximal count of links to a file
+ */
+-#define EXT3_LINK_MAX 32000
++#define EXT3_LINK_MAX 65000
+
+ /*
+ * Macro-instructions used to manage several block sizes
+@@ -595,14 +595,15 @@ struct ext3_dir_entry_2 {
+ */
+
+ #ifdef CONFIG_EXT3_INDEX
+- #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
+- EXT3_FEATURE_COMPAT_DIR_INDEX) && \
++#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
++ EXT3_FEATURE_COMPAT_DIR_INDEX) && \
+ (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
+-#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
+-#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
++#define EXT3_DIR_LINK_MAXED(dir) (!is_dx(dir) && (dir)->i_nlink >=EXT3_LINK_MAX)
++#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || \
++ (is_dx(dir) && (dir)->i_nlink == 1))
+ #else
+ #define is_dx(dir) 0
+-#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
++#define EXT3_DIR_LINK_MAXED(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
+ #define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2)
+ #endif
+
--- /dev/null
+diff -Nur orig/fs/ext3/namei.c patch/fs/ext3/namei.c
+--- orig/fs/ext3/namei.c 2005-10-12 13:58:19.000000000 -0700
++++ patch/fs/ext3/namei.c 2005-10-12 14:00:33.000000000 -0700
+@@ -1603,11 +1603,17 @@
+ static inline void ext3_inc_count(handle_t *handle, struct inode *inode)
+ {
+ inode->i_nlink++;
++ if (is_dx(inode) && inode->i_nlink > 1) {
++ /* limit is 16-bit i_links_count */
++ if (inode->i_nlink >= EXT3_LINK_MAX || inode->i_nlink == 2)
++ inode->i_nlink = 1;
++ }
+ }
+
+ static inline void ext3_dec_count(handle_t *handle, struct inode *inode)
+ {
+- inode->i_nlink--;
++ if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
++ inode->i_nlink--;
+ }
+
+ static int ext3_add_nondir(handle_t *handle,
+@@ -1706,7 +1712,7 @@
+ struct ext3_dir_entry_2 * de;
+ int err, retries = 0;
+
+- if (dir->i_nlink >= EXT3_LINK_MAX)
++ if (EXT3_DIR_LINK_MAXED(dir))
+ return -EMLINK;
+
+ retry:
+@@ -1729,7 +1735,7 @@
+ inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+ dir_block = ext3_bread (handle, inode, 0, 1, &err);
+ if (!dir_block) {
+- inode->i_nlink--; /* is this nlink == 0? */
++ ext3_dec_count(handle, inode); /* is this nlink == 0? */
+ ext3_mark_inode_dirty(handle, inode);
+ iput (inode);
+ goto out_stop;
+@@ -1761,7 +1767,7 @@
+ iput (inode);
+ goto out_stop;
+ }
+- dir->i_nlink++;
++ ext3_inc_count(handle, dir);
+ ext3_update_dx_flag(dir);
+ ext3_mark_inode_dirty(handle, dir);
+ d_instantiate(dentry, inode);
+@@ -2026,10 +2032,10 @@
+ retval = ext3_delete_entry(handle, dir, de, bh);
+ if (retval)
+ goto end_rmdir;
+- if (inode->i_nlink != 2)
+- ext3_warning (inode->i_sb, "ext3_rmdir",
+- "empty directory has nlink!=2 (%d)",
+- inode->i_nlink);
++ if (!EXT3_DIR_LINK_EMPTY(inode))
++ ext3_warning(inode->i_sb, "ext3_rmdir",
++ "empty directory has too many links (%d)",
++ inode->i_nlink);
+ inode->i_version++;
+ inode->i_nlink = 0;
+ /* There's no need to set i_disksize: the fact that i_nlink is
+@@ -2039,7 +2045,7 @@
+ ext3_orphan_add(handle, inode);
+ inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
+ ext3_mark_inode_dirty(handle, inode);
+- dir->i_nlink--;
++ ext3_dec_count(handle, dir);
+ ext3_update_dx_flag(dir);
+ ext3_mark_inode_dirty(handle, dir);
+
+@@ -2090,7 +2096,7 @@
+ dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
+ ext3_update_dx_flag(dir);
+ ext3_mark_inode_dirty(handle, dir);
+- inode->i_nlink--;
++ ext3_dec_count(handle, inode);
+ if (!inode->i_nlink)
+ ext3_orphan_add(handle, inode);
+ inode->i_ctime = dir->i_ctime;
+@@ -2165,7 +2171,7 @@
+ struct inode *inode = old_dentry->d_inode;
+ int err, retries = 0;
+
+- if (inode->i_nlink >= EXT3_LINK_MAX)
++ if (EXT3_DIR_LINK_MAXED(inode))
+ return -EMLINK;
+
+ retry:
+@@ -2252,8 +2258,8 @@
+ if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
+ goto end_rename;
+ retval = -EMLINK;
+- if (!new_inode && new_dir!=old_dir &&
+- new_dir->i_nlink >= EXT3_LINK_MAX)
++ if (!new_inode && new_dir != old_dir &&
++ EXT3_DIR_LINK_MAXED(new_dir))
+ goto end_rename;
+ }
+ if (!new_bh) {
+@@ -2310,7 +2316,7 @@
+ }
+
+ if (new_inode) {
+- new_inode->i_nlink--;
++ ext3_dec_count(handle, new_inode);
+ new_inode->i_ctime = CURRENT_TIME_SEC;
+ }
+ old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
+@@ -2321,11 +2327,13 @@
+ PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
+ BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
+ ext3_journal_dirty_metadata(handle, dir_bh);
+- old_dir->i_nlink--;
++ ext3_dec_count(handle, old_dir);
+ if (new_inode) {
+- new_inode->i_nlink--;
++ /* checked empty_dir above, can't have another parent,
++ * ext3_dec_count() won't work for many-linked dirs */
++ new_inode->i_nlink = 0;
+ } else {
+- new_dir->i_nlink++;
++ ext3_inc_count(handle, new_dir);
+ ext3_update_dx_flag(new_dir);
+ ext3_mark_inode_dirty(handle, new_dir);
+ }
+
+Index: linux-2.6.7/include/linux/ext3_fs.h
+===================================================================
+--- linux-2.6.7.orig/include/linux/ext3_fs.h 2004-06-15 23:19:36.000000000 -0600
++++ linux-2.6.7/include/linux/ext3_fs.h 2004-08-20 17:41:27.000000000 -0600
+@@ -79,7 +81,7 @@
+ /*
+ * Maximal count of links to a file
+ */
+-#define EXT3_LINK_MAX 32000
++#define EXT3_LINK_MAX 65000
+
+ /*
+ * Macro-instructions used to manage several block sizes
+@@ -595,14 +595,15 @@ struct ext3_dir_entry_2 {
+ */
+
+ #ifdef CONFIG_EXT3_INDEX
+- #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
+- EXT3_FEATURE_COMPAT_DIR_INDEX) && \
++#define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
++ EXT3_FEATURE_COMPAT_DIR_INDEX) && \
+ (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
+-#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
+-#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
++#define EXT3_DIR_LINK_MAXED(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
++#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || \
++ (is_dx(dir) && (dir)->i_nlink == 1))
+ #else
+ #define is_dx(dir) 0
+-#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
++#define EXT3_DIR_LINK_MAXED(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
+ #define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2)
+ #endif
+
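The nlink changes above overload i_nlink == 1 to mean "subdirectory count
unknown" for indexed directories, since the on-disk i_links_count is only 16
bits wide and a dx directory can grow past EXT3_LINK_MAX (65000) entries. A
user-space sketch of what that convention implies for stat() consumers; the
helper below is illustrative, not part of the patch:

#include <stdio.h>
#include <sys/stat.h>

/* With the patch above, a directory whose st_nlink is 1 has overflowed
 * its link count, so the classic "subdirs == nlink - 2" shortcut (one
 * link from ".", one from the parent) no longer applies to it. */
static long subdir_estimate(const struct stat *st)
{
	if (!S_ISDIR(st->st_mode))
		return -1;
	if (st->st_nlink == 1)		/* overflowed: count not maintained */
		return -1;		/* caller must scan the directory */
	return (long)st->st_nlink - 2;
}

int main(int argc, char **argv)
{
	struct stat st;

	if (argc > 1 && stat(argv[1], &st) == 0)
		printf("subdirs: %ld\n", subdir_estimate(&st));
	return 0;
}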
--- /dev/null
+Index: linux-stage/fs/ext3/ialloc.c
+===================================================================
+--- linux-stage.orig/fs/ext3/ialloc.c 2005-06-26 10:59:43.048185981 +0200
++++ linux-stage/fs/ext3/ialloc.c 2005-06-26 11:01:21.317716027 +0200
+@@ -775,7 +775,6 @@
+ if (!gdp)
+ continue;
+ desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+- cond_resched();
+ }
+ return desc_count;
+ #endif
+Index: linux-stage/fs/ext3/super.c
+===================================================================
+--- linux-stage.orig/fs/ext3/super.c 2005-06-26 10:59:43.205412542 +0200
++++ linux-stage/fs/ext3/super.c 2005-06-26 11:02:29.599941754 +0200
+@@ -2236,11 +2232,9 @@
+ * block group descriptors. If the sparse superblocks
+ * feature is turned on, then not all groups have this.
+ */
+- for (i = 0; i < ngroups; i++) {
++ for (i = 0; i < ngroups; i++)
+ overhead += ext3_bg_has_super(sb, i) +
+ ext3_bg_num_gdb(sb, i);
+- cond_resched();
+- }
+
+ /*
+ * Every block group has an inode bitmap, a block
--- /dev/null
+Index: linux-2.6.12/fs/ext3/super.c
+===================================================================
+--- linux-2.6.12.orig/fs/ext3/super.c 2005-06-17 13:48:29.000000000 -0600
++++ linux-2.6.12/fs/ext3/super.c 2005-11-25 05:59:47.000000000 -0700
+@@ -2165,13 +2165,13 @@
+ {
+ struct ext3_super_block *es = EXT3_SB(sb)->s_es;
+ unsigned long overhead;
+- int i;
+
+ if (test_opt (sb, MINIX_DF))
+ overhead = 0;
+ else {
+- unsigned long ngroups;
+- ngroups = EXT3_SB(sb)->s_groups_count;
++ unsigned long ngroups = EXT3_SB(sb)->s_groups_count, group;
++ unsigned long three = 1, five = 5, seven = 7;
++ unsigned long metabg = -1UL;
+ smp_rmb();
+
+ /*
+@@ -2189,11 +2188,14 @@
+ * block group descriptors. If the sparse superblocks
+ * feature is turned on, then not all groups have this.
+ */
+- for (i = 0; i < ngroups; i++) {
+- overhead += ext3_bg_has_super(sb, i) +
+- ext3_bg_num_gdb(sb, i);
+- cond_resched();
+- }
++ overhead += 1 + EXT3_SB(sb)->s_gdb_count; /* group 0 */
++	if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_META_BG))
++		metabg = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_meta_bg);
++
++ while ((group = ext3_list_backups(sb, &three, &five, &seven)) <
++ ngroups) /* sb + group descriptors backups */
++ overhead += 1 + (group >= metabg ? 1 :
++ EXT3_SB(sb)->s_gdb_count);
+
+ /*
+ * Every block group has an inode bitmap, a block
+@@ -2205,12 +2204,16 @@
+ buf->f_type = EXT3_SUPER_MAGIC;
+ buf->f_bsize = sb->s_blocksize;
+ buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead;
+- buf->f_bfree = ext3_count_free_blocks (sb);
++ buf->f_bfree = percpu_counter_read(&EXT3_SB(sb)->s_freeblocks_counter);
++ if (buf->f_bfree < 0)
++ buf->f_bfree = 0;
+ buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count);
+ if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count))
+ buf->f_bavail = 0;
+ buf->f_files = le32_to_cpu(es->s_inodes_count);
+- buf->f_ffree = ext3_count_free_inodes (sb);
++ buf->f_ffree = percpu_counter_read(&EXT3_SB(sb)->s_freeinodes_counter);
++ if (buf->f_ffree < 0)
++ buf->f_ffree = 0;
+ buf->f_namelen = EXT3_NAME_LEN;
+ return 0;
+ }
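The statfs() change above swaps the full recount (ext3_count_free_blocks and
ext3_count_free_inodes) for a read of the lockless per-CPU counters, which is
why the result can transiently be negative and is clamped to 0. A toy model
of that trade-off, assuming nothing beyond the patch itself; NR_CPUS, BATCH,
and the struct layout below are invented for illustration:

#include <stdio.h>

#define NR_CPUS 4
#define BATCH   32

/* Toy percpu counter: writers update a local delta and fold it into the
 * shared count only every BATCH updates, so readers get a cheap but
 * approximate value, possibly even negative, which is why the statfs
 * change above clamps f_bfree/f_ffree at 0. */
struct percpu_counter {
	long count;		/* shared, approximate */
	long local[NR_CPUS];	/* per-CPU deltas */
};

static void pc_add(struct percpu_counter *pc, int cpu, long amount)
{
	pc->local[cpu] += amount;
	if (pc->local[cpu] >= BATCH || pc->local[cpu] <= -BATCH) {
		pc->count += pc->local[cpu];	/* would take a lock here */
		pc->local[cpu] = 0;
	}
}

static long pc_read(const struct percpu_counter *pc)
{
	return pc->count;	/* no folding: fast, off by < NR_CPUS*BATCH */
}

int main(void)
{
	struct percpu_counter pc = { 0, { 0 } };

	pc_add(&pc, 0, 40);	/* folds: count = 40 */
	pc_add(&pc, 1, -5);	/* stays local, invisible to readers */
	printf("approx=%ld\n", pc_read(&pc));
	return 0;
}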
+Index: linux-2.6.12/fs/ext3/resize.c
+===================================================================
+--- linux-2.6.12.orig/fs/ext3/resize.c 2005-11-24 15:17:06.000000000 -0700
++++ linux-2.6.12/fs/ext3/resize.c 2005-11-25 06:01:01.000000000 -0700
+@@ -285,17 +285,17 @@
+ * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ...
+ * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ...
+ */
+-static unsigned ext3_list_backups(struct super_block *sb, unsigned *three,
+- unsigned *five, unsigned *seven)
++unsigned long ext3_list_backups(struct super_block *sb, unsigned long *three,
++ unsigned long *five, unsigned long *seven)
+ {
+- unsigned *min = three;
++ unsigned long metabg = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_meta_bg);
++ unsigned long *min = three, ret;
+ int mult = 3;
+- unsigned ret;
+
+ if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
+ EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
+- ret = *min;
+- *min += 1;
++ ret = *three;
++ *three += 1;
+ return ret;
+ }
+
+@@ -308,8 +307,26 @@
+ mult = 7;
+ }
+
+- ret = *min;
+- *min *= mult;
++	if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_META_BG) &&
++ *min >= metabg * EXT3_DESC_PER_BLOCK(sb)) {
++ ret = *min;
++ switch (ret & (EXT3_DESC_PER_BLOCK(sb) - 1)) {
++ case 0:
++ *three = ret + 1;
++ break;
++ case 1:
++ *three = ret + EXT3_DESC_PER_BLOCK(sb) - 2;
++ break;
++ default:
++ *three = (ret | (EXT3_DESC_PER_BLOCK(sb) - 1)) + 1;
++ break;
++ }
++ *five = -1UL;
++ *seven = -1UL;
++ } else {
++ ret = *min;
++ *min *= mult;
++ }
+
+ return ret;
+ }
+@@ -324,17 +337,17 @@
+ {
+ const unsigned long blk = primary->b_blocknr;
+ const unsigned long end = EXT3_SB(sb)->s_groups_count;
+- unsigned three = 1;
+- unsigned five = 5;
+- unsigned seven = 7;
+- unsigned grp;
++ unsigned long three = 1;
++ unsigned long five = 5;
++ unsigned long seven = 7;
++ unsigned long grp;
+ __u32 *p = (__u32 *)primary->b_data;
+ int gdbackups = 0;
+
+ while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) {
+ if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){
+ ext3_warning(sb, __FUNCTION__,
+- "reserved GDT %ld missing grp %d (%ld)\n",
++ "reserved GDT %ld missing grp %ld (%ld)\n",
+ blk, grp,
+ grp * EXT3_BLOCKS_PER_GROUP(sb) + blk);
+ return -EINVAL;
+@@ -618,10 +631,8 @@
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
+ const unsigned long last = sbi->s_groups_count;
+ const int bpg = EXT3_BLOCKS_PER_GROUP(sb);
+- unsigned three = 1;
+- unsigned five = 5;
+- unsigned seven = 7;
+- unsigned group;
++ unsigned long three = 1, five = 5, seven = 7;
++ unsigned long group;
+ int rest = sb->s_blocksize - size;
+ handle_t *handle;
+ int err = 0, err2;
+@@ -672,7 +683,7 @@
+ exit_err:
+ if (err) {
+ ext3_warning(sb, __FUNCTION__,
+- "can't update backup for group %d (err %d), "
++ "can't update backup for group %ld (err %d), "
+ "forcing fsck on next reboot\n", group, err);
+ sbi->s_mount_state &= ~EXT3_VALID_FS;
+ sbi->s_es->s_state &= ~cpu_to_le16(EXT3_VALID_FS);
+Index: linux-2.6.12/include/linux/ext3_fs.h
+===================================================================
+--- linux-2.6.12.orig/include/linux/ext3_fs.h 2005-06-17 13:48:29.000000000 -0600
++++ linux-2.6.12/include/linux/ext3_fs.h 2005-11-25 05:59:47.000000000 -0700
+@@ -788,6 +788,10 @@
+ extern int ext3_group_extend(struct super_block *sb,
+ struct ext3_super_block *es,
+ unsigned long n_blocks_count);
++extern unsigned long ext3_list_backups(struct super_block *sb,
++ unsigned long *three,
++ unsigned long *five,
++ unsigned long *seven);
+
+ /* super.c */
+ extern void ext3_error (struct super_block *, const char *, const char *, ...)
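For reference, the 3/5/7 walk that ext3_list_backups() performs under
sparse_super can be reproduced in isolation. The sketch below prints the
backup group numbers 1, 3, 5, 7, 9, 25, 27, 49, 81, ... and deliberately
omits the META_BG special-casing the patch above adds:

#include <stdio.h>

/* Sketch of the sparse_super backup-group walk: all powers of 3, 5 and 7,
 * merged in ascending order. Group 0 always holds the primary superblock
 * and descriptors, so the kernel caller accounts for it separately. */
static unsigned long next_backup(unsigned long *three, unsigned long *five,
				 unsigned long *seven)
{
	unsigned long *min = three;
	unsigned long mult = 3, ret;

	if (*five < *min) { min = five; mult = 5; }
	if (*seven < *min) { min = seven; mult = 7; }
	ret = *min;
	*min *= mult;
	return ret;
}

int main(void)
{
	unsigned long three = 1, five = 5, seven = 7, grp;

	printf("0");	/* group 0: primary sb + group descriptors */
	while ((grp = next_backup(&three, &five, &seven)) < 100000)
		printf(" %lu", grp);
	printf("\n");
	return 0;
}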
--- /dev/null
+Index: linux-2.6.10/include/linux/namei.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/namei.h 2005-04-06 09:38:35.000000000 -0600
++++ linux-2.6.10/include/linux/namei.h 2006-01-03 15:32:11.000000000 -0700
+@@ -46,6 +46,7 @@
+ #define LOOKUP_PARENT 16
+ #define LOOKUP_NOALT 32
+ #define LOOKUP_ATOMIC 64
++#define LOOKUP_REVAL 128
+
+ /*
+ * Intent data
===================================================================
--- linux.mcp2.orig/fs/ext3/iopen.c 2002-04-11 07:25:15.000000000 -0700
+++ linux.mcp2/fs/ext3/iopen.c 2004-05-17 15:21:55.000000000 -0700
-@@ -0,0 +1,282 @@
+@@ -0,0 +1,285 @@
+/*
+ * linux/fs/ext3/iopen.c
+ *
+ if (!inode)
+ goto do_rehash;
+
++ if (!test_opt(inode->i_sb, IOPEN))
++ goto do_instantiate;
++
+ /* preferrably return a connected dentry */
+ list_for_each(lp, &inode->i_dentry) {
+ tmp = list_entry(lp, struct dentry, d_alias);
===================================================================
--- linux-stage.orig/fs/ext3/iopen.c 2005-02-25 14:41:01.017787968 +0200
+++ linux-stage/fs/ext3/iopen.c 2005-02-25 14:41:01.045783712 +0200
-@@ -0,0 +1,277 @@
+@@ -0,0 +1,278 @@
+/*
+ * linux/fs/ext3/iopen.c
+ *
+ goto do_instantiate;
+
+ /* Move the goal to the de hash queue */
-+ goal->d_flags &= ~ DCACHE_DISCONNECTED;
++ goal->d_flags &= ~DCACHE_DISCONNECTED;
+ security_d_instantiate(goal, inode);
++ __d_drop(dentry);
+ __d_rehash(dentry, 0);
+ __d_move(goal, dentry);
+ spin_unlock(&dcache_lock);
===================================================================
--- linux-2.6.5-sles9.orig/fs/ext3/iopen.c 2003-01-30 13:24:37.000000000 +0300
+++ linux-2.6.5-sles9/fs/ext3/iopen.c 2004-11-09 02:18:27.611913312 +0300
-@@ -0,0 +1,275 @@
+@@ -0,0 +1,278 @@
+/*
+ * linux/fs/ext3/iopen.c
+ *
+ alternate = list_entry(inode->i_dentry.next,
+ struct dentry, d_alias);
+ dget_locked(alternate);
++ spin_lock(&alternate->d_lock);
+ alternate->d_vfs_flags |= DCACHE_REFERENCED;
++ spin_unlock(&alternate->d_lock);
+ iput(inode);
+ spin_unlock(&dcache_lock);
+ return alternate;
+ goto do_instantiate;
+
+ /* Move the goal to the de hash queue */
-+ goal->d_flags &= ~ DCACHE_DISCONNECTED;
++ goal->d_flags &= ~DCACHE_DISCONNECTED;
+ security_d_instantiate(goal, inode);
++ __d_drop(dentry);
+ __d_rehash(dentry, 0);
+ __d_move(goal, dentry);
+ spin_unlock(&dcache_lock);
--- /dev/null
+Index: linux-2.6.12-rc6/fs/ext3/Makefile
+===================================================================
+--- linux-2.6.12-rc6.orig/fs/ext3/Makefile 2005-06-14 16:00:45.206720992 +0200
++++ linux-2.6.12-rc6/fs/ext3/Makefile 2005-06-14 16:14:33.595382720 +0200
+@@ -4,7 +4,7 @@
+
+ obj-$(CONFIG_EXT3_FS) += ext3.o
+
+-ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
++ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o\
+ ioctl.o namei.o super.o symlink.o hash.o resize.o
+
+ ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
+Index: linux-2.6.12-rc6/fs/ext3/inode.c
+===================================================================
+--- linux-2.6.12-rc6.orig/fs/ext3/inode.c 2005-06-14 16:01:16.272150299 +0200
++++ linux-2.6.12-rc6/fs/ext3/inode.c 2005-06-14 16:24:55.686195412 +0200
+@@ -37,6 +37,7 @@
+ #include <linux/mpage.h>
+ #include <linux/uio.h>
+ #include "xattr.h"
++#include "iopen.h"
+ #include "acl.h"
+
+ static int ext3_writepage_trans_blocks(struct inode *inode);
+@@ -2437,6 +2438,8 @@
+ ei->i_default_acl = EXT3_ACL_NOT_CACHED;
+ #endif
+ ei->i_block_alloc_info = NULL;
++ if (ext3_iopen_get_inode(inode))
++ return;
+
+ if (__ext3_get_inode_loc(inode, &iloc, 0))
+ goto bad_inode;
+Index: linux-2.6.12-rc6/fs/ext3/iopen.c
+===================================================================
+--- linux-2.6.12-rc6.orig/fs/ext3/iopen.c 2005-06-14 16:14:33.530929595 +0200
++++ linux-2.6.12-rc6/fs/ext3/iopen.c 2005-06-14 16:14:33.626632719 +0200
+@@ -0,0 +1,278 @@
++/*
++ * linux/fs/ext3/iopen.c
++ *
++ * Special support for open by inode number
++ *
++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu).
++ *
++ * This file may be redistributed under the terms of the GNU General
++ * Public License.
++ *
++ *
++ * Invariants:
++ * - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias
++ * for an inode at one time.
++ * - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry
++ * aliases on an inode at the same time.
++ *
++ * If we have any connected dentry aliases for an inode, use one of those
++ * in iopen_lookup(). Otherwise, we instantiate a single NFSD_DISCONNECTED
++ * dentry for this inode, which thereafter will be found by the dcache
++ * when looking up this inode number in __iopen__, so we don't return here
++ * until it is gone.
++ *
++ * If we get an inode via a regular name lookup, then we "rename" the
++ * NFSD_DISCONNECTED dentry to the proper name and parent. This ensures
++ * existing users of the disconnected dentry will continue to use the same
++ * dentry as the connected users, and there will never be both kinds of
++ * dentry aliases at one time.
++ */
++
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/ext3_jbd.h>
++#include <linux/jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/smp_lock.h>
++#include <linux/dcache.h>
++#include <linux/security.h>
++#include "iopen.h"
++
++#ifndef assert
++#define assert(test) J_ASSERT(test)
++#endif
++
++#define IOPEN_NAME_LEN 32
++
++/*
++ * This implements looking up an inode by number.
++ */
++static struct dentry *iopen_lookup(struct inode * dir, struct dentry *dentry,
++ struct nameidata *nd)
++{
++ struct inode *inode;
++ unsigned long ino;
++ struct list_head *lp;
++ struct dentry *alternate;
++ char buf[IOPEN_NAME_LEN];
++
++ if (dentry->d_name.len >= IOPEN_NAME_LEN)
++ return ERR_PTR(-ENAMETOOLONG);
++
++ memcpy(buf, dentry->d_name.name, dentry->d_name.len);
++ buf[dentry->d_name.len] = 0;
++
++ if (strcmp(buf, ".") == 0)
++ ino = dir->i_ino;
++ else if (strcmp(buf, "..") == 0)
++ ino = EXT3_ROOT_INO;
++ else
++ ino = simple_strtoul(buf, 0, 0);
++
++ if ((ino != EXT3_ROOT_INO &&
++ //ino != EXT3_ACL_IDX_INO &&
++ //ino != EXT3_ACL_DATA_INO &&
++ ino < EXT3_FIRST_INO(dir->i_sb)) ||
++ ino > le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count))
++ return ERR_PTR(-ENOENT);
++
++ inode = iget(dir->i_sb, ino);
++ if (!inode)
++ return ERR_PTR(-EACCES);
++ if (is_bad_inode(inode)) {
++ iput(inode);
++ return ERR_PTR(-ENOENT);
++ }
++
++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */
++ assert(d_unhashed(dentry)); /* d_rehash */
++
++	/* preferably return a connected dentry */
++ spin_lock(&dcache_lock);
++ list_for_each(lp, &inode->i_dentry) {
++ alternate = list_entry(lp, struct dentry, d_alias);
++ assert(!(alternate->d_flags & DCACHE_DISCONNECTED));
++ }
++
++ if (!list_empty(&inode->i_dentry)) {
++ alternate = list_entry(inode->i_dentry.next,
++ struct dentry, d_alias);
++ dget_locked(alternate);
++ spin_lock(&alternate->d_lock);
++ alternate->d_flags |= DCACHE_REFERENCED;
++ spin_unlock(&alternate->d_lock);
++ iput(inode);
++ spin_unlock(&dcache_lock);
++ return alternate;
++ }
++ dentry->d_flags |= DCACHE_DISCONNECTED;
++
++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */
++ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */
++ dentry->d_inode = inode;
++
++ d_rehash_cond(dentry, 0); /* d_rehash */
++ spin_unlock(&dcache_lock);
++
++ return NULL;
++}
++
++#define do_switch(x,y) do { \
++ __typeof__ (x) __tmp = x; \
++ x = y; y = __tmp; } while (0)
++
++static inline void switch_names(struct dentry *dentry, struct dentry *target)
++{
++ const unsigned char *old_name, *new_name;
++
++ memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN_MIN);
++ old_name = target->d_name.name;
++ new_name = dentry->d_name.name;
++ if (old_name == target->d_iname)
++ old_name = dentry->d_iname;
++ if (new_name == dentry->d_iname)
++ new_name = target->d_iname;
++ target->d_name.name = new_name;
++ dentry->d_name.name = old_name;
++}
++
++/* This function is spliced into ext3_lookup and does the move of a
++ * disconnected dentry (if it exists) to a connected dentry.
++ */
++struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode,
++ int rehash)
++{
++ struct dentry *tmp, *goal = NULL;
++ struct list_head *lp;
++
++ /* verify this dentry is really new */
++ assert(dentry->d_inode == NULL);
++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */
++ if (rehash)
++ assert(d_unhashed(dentry)); /* d_rehash */
++ assert(list_empty(&dentry->d_subdirs));
++
++ spin_lock(&dcache_lock);
++ if (!inode)
++ goto do_rehash;
++
++ if (!test_opt(inode->i_sb, IOPEN))
++ goto do_instantiate;
++
++	/* preferably return a connected dentry */
++ list_for_each(lp, &inode->i_dentry) {
++ tmp = list_entry(lp, struct dentry, d_alias);
++ if (tmp->d_flags & DCACHE_DISCONNECTED) {
++ assert(tmp->d_alias.next == &inode->i_dentry);
++ assert(tmp->d_alias.prev == &inode->i_dentry);
++ goal = tmp;
++ dget_locked(goal);
++ break;
++ }
++ }
++
++ if (!goal)
++ goto do_instantiate;
++
++ /* Move the goal to the de hash queue */
++ goal->d_flags &= ~DCACHE_DISCONNECTED;
++ security_d_instantiate(goal, inode);
++ __d_drop(dentry);
++ d_rehash_cond(dentry, 0);
++ __d_move(goal, dentry);
++ spin_unlock(&dcache_lock);
++ iput(inode);
++
++ return goal;
++
++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */
++do_instantiate:
++ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */
++ dentry->d_inode = inode;
++do_rehash:
++ if (rehash)
++ d_rehash_cond(dentry, 0); /* d_rehash */
++ spin_unlock(&dcache_lock);
++
++ return NULL;
++}
++
++/*
++ * These are the special structures for the iopen pseudo directory.
++ */
++
++static struct inode_operations iopen_inode_operations = {
++ lookup: iopen_lookup, /* BKL held */
++};
++
++static struct file_operations iopen_file_operations = {
++ read: generic_read_dir,
++};
++
++static int match_dentry(struct dentry *dentry, const char *name)
++{
++ int len;
++
++ len = strlen(name);
++ if (dentry->d_name.len != len)
++ return 0;
++ if (strncmp(dentry->d_name.name, name, len))
++ return 0;
++ return 1;
++}
++
++/*
++ * This function is spliced into ext3_lookup and returns 1 if the file
++ * name is __iopen__ and the dentry has been filled in appropriately.
++ */
++int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry)
++{
++ struct inode *inode;
++
++ if (dir->i_ino != EXT3_ROOT_INO ||
++ !test_opt(dir->i_sb, IOPEN) ||
++ !match_dentry(dentry, "__iopen__"))
++ return 0;
++
++ inode = iget(dir->i_sb, EXT3_BAD_INO);
++
++ if (!inode)
++ return 0;
++ d_add(dentry, inode);
++ return 1;
++}
++
++/*
++ * This function is spliced into read_inode; it returns 1 if the inode
++ * number is the one for /__iopen__, in which case the inode is filled
++ * in appropriately. Otherwise, this function returns 0.
++ */
++int ext3_iopen_get_inode(struct inode *inode)
++{
++ if (inode->i_ino != EXT3_BAD_INO)
++ return 0;
++
++ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR;
++ if (test_opt(inode->i_sb, IOPEN_NOPRIV))
++ inode->i_mode |= 0777;
++ inode->i_uid = 0;
++ inode->i_gid = 0;
++ inode->i_nlink = 1;
++ inode->i_size = 4096;
++ inode->i_atime = CURRENT_TIME;
++ inode->i_ctime = CURRENT_TIME;
++ inode->i_mtime = CURRENT_TIME;
++ EXT3_I(inode)->i_dtime = 0;
++ inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size
++ * (for stat), not the fs block
++ * size */
++ inode->i_blocks = 0;
++ inode->i_version = 1;
++ inode->i_generation = 0;
++
++ inode->i_op = &iopen_inode_operations;
++ inode->i_fop = &iopen_file_operations;
++ inode->i_mapping->a_ops = 0;
++
++ return 1;
++}
+Index: linux-2.6.12-rc6/fs/ext3/iopen.h
+===================================================================
+--- linux-2.6.12-rc6.orig/fs/ext3/iopen.h 2005-06-14 16:14:33.534835845 +0200
++++ linux-2.6.12-rc6/fs/ext3/iopen.h 2005-06-14 16:14:33.633468657 +0200
+@@ -0,0 +1,15 @@
++/*
++ * iopen.h
++ *
++ * Special support for opening files by inode number.
++ *
++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu).
++ *
++ * This file may be redistributed under the terms of the GNU General
++ * Public License.
++ */
++
++extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry);
++extern int ext3_iopen_get_inode(struct inode *inode);
++extern struct dentry *iopen_connect_dentry(struct dentry *dentry,
++ struct inode *inode, int rehash);
+Index: linux-2.6.12-rc6/fs/ext3/namei.c
+===================================================================
+--- linux-2.6.12-rc6.orig/fs/ext3/namei.c 2005-06-14 16:01:14.701837819 +0200
++++ linux-2.6.12-rc6/fs/ext3/namei.c 2005-06-14 16:14:33.644210844 +0200
+@@ -37,6 +37,7 @@
+ #include <linux/buffer_head.h>
+ #include <linux/smp_lock.h>
+ #include "xattr.h"
++#include "iopen.h"
+ #include "acl.h"
+
+ /*
+@@ -985,6 +986,9 @@
+ if (dentry->d_name.len > EXT3_NAME_LEN)
+ return ERR_PTR(-ENAMETOOLONG);
+
++ if (ext3_check_for_iopen(dir, dentry))
++ return NULL;
++
+ bh = ext3_find_entry(dentry, &de);
+ inode = NULL;
+ if (bh) {
+@@ -995,10 +999,8 @@
+ if (!inode)
+ return ERR_PTR(-EACCES);
+ }
+- if (inode)
+- return d_splice_alias(inode, dentry);
+- d_add(dentry, inode);
+- return NULL;
++
++ return iopen_connect_dentry(dentry, inode, 1);
+ }
+
+
+@@ -2042,10 +2044,6 @@
+ inode->i_nlink);
+ inode->i_version++;
+ inode->i_nlink = 0;
+- /* There's no need to set i_disksize: the fact that i_nlink is
+- * zero will ensure that the right thing happens during any
+- * recovery. */
+- inode->i_size = 0;
+ ext3_orphan_add(handle, inode);
+ inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
+ ext3_mark_inode_dirty(handle, inode);
+@@ -2168,6 +2166,23 @@
+ return err;
+ }
+
++/* Like ext3_add_nondir() except for call to iopen_connect_dentry */
++static int ext3_add_link(handle_t *handle, struct dentry *dentry,
++ struct inode *inode)
++{
++ int err = ext3_add_entry(handle, dentry, inode);
++ if (!err) {
++ err = ext3_mark_inode_dirty(handle, inode);
++ if (err == 0) {
++ dput(iopen_connect_dentry(dentry, inode, 0));
++ return 0;
++ }
++ }
++ ext3_dec_count(handle, inode);
++ iput(inode);
++ return err;
++}
++
+ static int ext3_link (struct dentry * old_dentry,
+ struct inode * dir, struct dentry *dentry)
+ {
+@@ -2191,7 +2206,8 @@
+ ext3_inc_count(handle, inode);
+ atomic_inc(&inode->i_count);
+
+- err = ext3_add_nondir(handle, dentry, inode);
++ err = ext3_add_link(handle, dentry, inode);
++	ext3_orphan_del(handle, inode);
+ ext3_journal_stop(handle);
+ if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
+ goto retry;
+Index: linux-2.6.12-rc6/fs/ext3/super.c
+===================================================================
+--- linux-2.6.12-rc6.orig/fs/ext3/super.c 2005-06-14 16:01:16.287775299 +0200
++++ linux-2.6.12-rc6/fs/ext3/super.c 2005-06-14 16:14:33.656906156 +0200
+@@ -590,6 +590,7 @@
+ Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+ Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
+ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0,
++ Opt_iopen, Opt_noiopen, Opt_iopen_nopriv,
+ Opt_ignore, Opt_barrier, Opt_err, Opt_resize,
+ };
+
+@@ -638,6 +639,9 @@
+ {Opt_ignore, "noquota"},
+ {Opt_ignore, "quota"},
+ {Opt_ignore, "usrquota"},
++ {Opt_iopen, "iopen"},
++ {Opt_noiopen, "noiopen"},
++ {Opt_iopen_nopriv, "iopen_nopriv"},
+ {Opt_barrier, "barrier=%u"},
+ {Opt_err, NULL},
+ {Opt_resize, "resize"},
+@@ -921,6 +925,18 @@
+ else
+ clear_opt(sbi->s_mount_opt, BARRIER);
+ break;
++ case Opt_iopen:
++ set_opt (sbi->s_mount_opt, IOPEN);
++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
++ break;
++ case Opt_noiopen:
++ clear_opt (sbi->s_mount_opt, IOPEN);
++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
++ break;
++ case Opt_iopen_nopriv:
++ set_opt (sbi->s_mount_opt, IOPEN);
++ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
++ break;
+ case Opt_ignore:
+ break;
+ case Opt_resize:
+Index: linux-2.6.12-rc6/include/linux/ext3_fs.h
+===================================================================
+--- linux-2.6.12-rc6.orig/include/linux/ext3_fs.h 2005-06-14 16:01:14.709650318 +0200
++++ linux-2.6.12-rc6/include/linux/ext3_fs.h 2005-06-14 16:28:38.452794245 +0200
+@@ -358,6 +358,8 @@
+ #define EXT3_MOUNT_RESERVATION 0x10000 /* Preallocation */
+ #define EXT3_MOUNT_BARRIER 0x20000 /* Use block barriers */
+ #define EXT3_MOUNT_NOBH 0x40000 /* No bufferheads */
++#define EXT3_MOUNT_IOPEN 0x80000 /* Allow access via iopen */
++#define EXT3_MOUNT_IOPEN_NOPRIV 0x100000 /* Make iopen world-readable */
+
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef _LINUX_EXT2_FS_H
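Usage of the feature this patch adds (and which the ext2.txt hunk in the next
patch documents): with -o iopen or -o iopen_nopriv, inode N of a filesystem
mounted at mntpt is reachable as mntpt/__iopen__/N. A hedged user-space
sketch; the mount point and inode number below are examples only:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

/* Build the __iopen__ pathname for an inode number and open it. */
static int open_by_ino(const char *mntpt, unsigned long ino, int flags)
{
	char path[96];

	snprintf(path, sizeof(path), "%s/__iopen__/%lu", mntpt, ino);
	return open(path, flags);
}

int main(void)
{
	int fd = open_by_ino("/mnt/ext3", 3145, O_RDONLY);

	if (fd >= 0)
		close(fd);
	return fd >= 0 ? 0 : 1;
}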
--- /dev/null
+Index: linux-2.6.4-51.0/Documentation/filesystems/ext2.txt
+===================================================================
+--- linux-2.6.4-51.0.orig/Documentation/filesystems/ext2.txt 2004-05-06 22:21:26.000000000 -0400
++++ linux-2.6.4-51.0/Documentation/filesystems/ext2.txt 2004-05-06 22:24:42.000000000 -0400
+@@ -35,6 +35,22 @@
+
+ sb=n Use alternate superblock at this location.
+
++iopen Makes an invisible pseudo-directory called
++ __iopen__ available in the root directory
++ of the filesystem. Allows open-by-inode-
++			number, i.e., inode 3145 can be accessed
++			via /mntpt/__iopen__/3145
++
++iopen_nopriv		This option makes the iopen directory
++			world-readable. This may be safer, since it
++			allows daemons to run as an unprivileged user;
++			however, it significantly changes the security
++			model of a Unix filesystem, since previously
++			all files under a mode 700 directory were not
++			generally available even if the permissions
++			on the files themselves were
++			world-readable.
++
+ grpquota,noquota,quota,usrquota Quota options are silently ignored by ext2.
+
+
+Index: linux-2.6.4-51.0/fs/dcache.c
+===================================================================
+--- linux-2.6.4-51.0.orig/fs/dcache.c 2004-05-06 22:24:42.000000000 -0400
++++ linux-2.6.4-51.0/fs/dcache.c 2004-05-06 22:58:37.000000000 -0400
+@@ -1195,14 +1195,13 @@
+ * dcache entries should not be moved in this way.
+ */
+
+-void d_move(struct dentry * dentry, struct dentry * target)
++void __d_move(struct dentry * dentry, struct dentry * target)
+ {
+ struct hlist_head *list;
+
+ if (!dentry->d_inode)
+ printk(KERN_WARNING "VFS: moving negative dcache entry\n");
+
+- spin_lock(&dcache_lock);
+ write_seqlock(&rename_lock);
+ /*
+ * XXXX: do we really need to take target->d_lock?
+@@ -1253,6 +1252,14 @@
+ spin_unlock(&target->d_lock);
+ spin_unlock(&dentry->d_lock);
+ write_sequnlock(&rename_lock);
++}
++
++EXPORT_SYMBOL(__d_move);
++
++void d_move(struct dentry *dentry, struct dentry *target)
++{
++ spin_lock(&dcache_lock);
++ __d_move(dentry, target);
+ spin_unlock(&dcache_lock);
+ }
+
+Index: linux-2.6.4-51.0/include/linux/dcache.h
+===================================================================
+--- linux-2.6.4-51.0.orig/include/linux/dcache.h 2004-05-06 22:24:42.000000000 -0400
++++ linux-2.6.4-51.0/include/linux/dcache.h 2004-05-06 23:03:43.000000000 -0400
+@@ -234,6 +234,7 @@
+ * This adds the entry to the hash queues.
+ */
+ extern void d_rehash(struct dentry *);
++extern void d_rehash_cond(struct dentry *, int lock);
+
+ /**
+ * d_add - add dentry to hash queues
+@@ -252,6 +253,7 @@
+
+ /* used for rename() and baskets */
+ extern void d_move(struct dentry *, struct dentry *);
++extern void __d_move(struct dentry *, struct dentry *);
+
+ /* appendix may either be NULL or be used for transname suffixes */
+ extern struct dentry * d_lookup(struct dentry *, struct qstr *);
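The dcache.c/dcache.h hunks follow the kernel's usual locked/unlocked split:
__d_move() assumes the caller already holds dcache_lock, while d_move() keeps
its old take-the-lock behaviour, so iopen_connect_dentry() can perform the
move under the same lock hold as its alias-list scan. A generic sketch of
that pattern; the pthread mutex and all names below are illustrative:

#include <pthread.h>

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
static int cache_value;

/* __-variant: caller must already hold cache_lock (cf. __d_move). */
static void __cache_set(int v)
{
	cache_value = v;
}

/* Wrapper keeps the old interface: take the lock, call the __-variant
 * (cf. the reworked d_move). */
static void cache_set(int v)
{
	pthread_mutex_lock(&cache_lock);
	__cache_set(v);
	pthread_mutex_unlock(&cache_lock);
}

/* A composite caller, the reason the split exists: several steps must
 * happen under one hold of the lock, as in iopen_connect_dentry when it
 * scans the alias list and then calls __d_move. */
static void cache_replace(int scan, int move)
{
	pthread_mutex_lock(&cache_lock);
	__cache_set(scan);
	__cache_set(move);
	pthread_mutex_unlock(&cache_lock);
}

int main(void)
{
	cache_set(1);
	cache_replace(2, 3);
	return cache_value == 3 ? 0 : 1;
}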
--- /dev/null
+
+--- linux-2.4.18/fs/jbd/checkpoint.c~jbd-commit-tricks Mon Jul 28 13:52:05 2003
++++ linux-2.4.18-alexey/fs/jbd/checkpoint.c Mon Jul 28 14:03:59 2003
+@@ -77,19 +77,23 @@ static int __try_to_free_cp_buf(struct j
+ * to wait for a checkpoint to free up some space in the log.
+ */
+
+-void log_wait_for_space(journal_t *journal, int nblocks)
++void log_wait_for_space(journal_t *journal)
+ {
++ int nblocks;
++
++ nblocks = jbd_space_needed(journal);
+ while (log_space_left(journal) < nblocks) {
+ if (journal->j_flags & JFS_ABORT)
+ return;
+ unlock_journal(journal);
+ down(&journal->j_checkpoint_sem);
+ lock_journal(journal);
++ nblocks = jbd_space_needed(journal);
+
+ /* Test again, another process may have checkpointed
+ * while we were waiting for the checkpoint lock */
+ if (log_space_left(journal) < nblocks) {
+- log_do_checkpoint(journal, nblocks);
++ log_do_checkpoint(journal);
+ }
+ up(&journal->j_checkpoint_sem);
+ }
+@@ -260,8 +264,7 @@ static int __flush_buffer(journal_t *jou
+ * The journal should be locked before calling this function.
+ */
+
+-/* @@@ `nblocks' is unused. Should it be used? */
+-int log_do_checkpoint (journal_t *journal, int nblocks)
++int log_do_checkpoint (journal_t *journal)
+ {
+ transaction_t *transaction, *last_transaction, *next_transaction;
+ int result;
+@@ -315,6 +318,8 @@ repeat:
+ retry = __flush_buffer(journal, jh, bhs, &batch_count,
+ &drop_count);
+ } while (jh != last_jh && !retry);
++ if (journal->j_checkpoint_transactions != transaction)
++ goto done;
+ if (batch_count) {
+ __flush_batch(bhs, &batch_count);
+ goto repeat;
+@@ -328,6 +333,8 @@ repeat:
+ */
+ cleanup_ret = __cleanup_transaction(journal, transaction);
+ J_ASSERT(drop_count != 0 || cleanup_ret != 0);
++ if (journal->j_checkpoint_transactions != transaction)
++ goto done;
+ goto repeat; /* __cleanup may have dropped lock */
+ } while (transaction != last_transaction);
+
+--- linux-2.4.18/fs/jbd/journal.c~jbd-commit-tricks Mon Jul 28 13:52:05 2003
++++ linux-2.4.18-alexey/fs/jbd/journal.c Mon Jul 28 14:03:59 2003
+@@ -1115,7 +1115,7 @@ void journal_destroy (journal_t *journal
+ /* Force any old transactions to disk */
+ lock_journal(journal);
+ while (journal->j_checkpoint_transactions != NULL)
+- log_do_checkpoint(journal, 1);
++ log_do_checkpoint(journal);
+
+ J_ASSERT(journal->j_running_transaction == NULL);
+ J_ASSERT(journal->j_committing_transaction == NULL);
+@@ -1302,7 +1302,7 @@ int journal_flush (journal_t *journal)
+ /* ...and flush everything in the log out to disk. */
+ lock_journal(journal);
+ while (!err && journal->j_checkpoint_transactions != NULL)
+- err = log_do_checkpoint(journal, journal->j_maxlen);
++ err = log_do_checkpoint(journal);
+ cleanup_journal_tail(journal);
+
+ /* Finally, mark the journal as really needing no recovery.
+--- linux-2.4.18/fs/jbd/transaction.c~jbd-commit-tricks Mon Jul 28 13:52:05 2003
++++ linux-2.4.18-alexey/fs/jbd/transaction.c Mon Jul 28 14:03:59 2003
+@@ -182,14 +182,9 @@ repeat_locked:
+ * Also, this test is inconsitent with the matching one in
+ * journal_extend().
+ */
+- needed = journal->j_max_transaction_buffers;
+- if (journal->j_committing_transaction)
+- needed += journal->j_committing_transaction->
+- t_outstanding_credits;
+-
+- if (log_space_left(journal) < needed) {
++ if (log_space_left(journal) < jbd_space_needed(journal)) {
+ jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
+- log_wait_for_space(journal, needed);
++ log_wait_for_space(journal);
+ goto repeat_locked;
+ }
+
+--- linux-2.4.18/include/linux/jbd.h~jbd-commit-tricks Mon Jul 28 13:52:17 2003
++++ linux-2.4.18-alexey/include/linux/jbd.h Mon Jul 28 14:03:59 2003
+@@ -740,9 +740,9 @@ extern void journal_brelse_array(stru
+ extern int log_space_left (journal_t *); /* Called with journal locked */
+ extern tid_t log_start_commit (journal_t *, transaction_t *);
+ extern int log_wait_commit (journal_t *, tid_t);
+-extern int log_do_checkpoint (journal_t *, int);
++extern int log_do_checkpoint (journal_t *);
+
+-extern void log_wait_for_space(journal_t *, int nblocks);
++extern void log_wait_for_space(journal_t *);
+ extern void __journal_drop_transaction(journal_t *, transaction_t *);
+ extern int cleanup_journal_tail(journal_t *);
+
+@@ -815,6 +815,19 @@ static inline int tid_geq(tid_t x, tid_t
+ }
+
+ extern int journal_blocks_per_page(struct inode *inode);
++
++/*
++ * Return the minimum number of blocks which must be free in the journal
++ * before a new transaction may be started. Must be called under j_state_lock.
++ */
++static inline int jbd_space_needed(journal_t *journal)
++{
++ int nblocks = journal->j_max_transaction_buffers;
++ if (journal->j_committing_transaction)
++ nblocks += journal->j_committing_transaction->
++ t_outstanding_credits;
++ return nblocks;
++}
+
+ /*
+ * Definitions which augment the buffer_head layer
+
+_
+ Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+
-+#ident "$Id: kallsyms-2.4-bgl.patch,v 1.1.20.1 2005/03/24 22:50:28 jacob Exp $"
++#ident "$Id: kallsyms-2.4-bgl.patch,v 1.1.20.2 2005/04/01 21:30:19 green Exp $"
+
+/*
+ This code uses the list of all kernel and module symbols to :-
+ Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+
-+#ident "$Id: kallsyms-2.4-bgl.patch,v 1.1.20.1 2005/03/24 22:50:28 jacob Exp $"
++#ident "$Id: kallsyms-2.4-bgl.patch,v 1.1.20.2 2005/04/01 21:30:19 green Exp $"
+
+#ifndef MODUTILS_KALLSYMS_H
+#define MODUTILS_KALLSYMS_H 1
===================================================================
--- linux-2.4.21-chaos.orig/fs/ext2/inode.c 2003-07-15 04:41:01.000000000 +0400
+++ linux-2.4.21-chaos/fs/ext2/inode.c 2003-12-14 15:11:46.000000000 +0300
-@@ -39,6 +39,18 @@
- static int ext2_update_inode(struct inode * inode, int do_sync);
-
- /*
-+ * Test whether an inode is a fast symlink.
-+ */
-+static inline int ext2_inode_is_fast_symlink(struct inode *inode)
-+{
-+ int ea_blocks = inode->u.ext2_i.i_file_acl ?
-+ (inode->i_sb->s_blocksize >> 9) : 0;
-+
-+ return (S_ISLNK(inode->i_mode) &&
-+ inode->i_blocks - ea_blocks == 0);
-+}
-+
-+/*
- * Called at each iput()
- */
- void ext2_put_inode (struct inode * inode)
@@ -53,9 +65,7 @@
{
lock_kernel();
inode->i_op = &ext2_file_inode_operations;
inode->i_fop = &ext2_file_operations;
inode->i_mapping->a_ops = &ext2_aops;
-@@ -1002,15 +1010,17 @@
- inode->i_fop = &ext2_dir_operations;
- inode->i_mapping->a_ops = &ext2_aops;
- } else if (S_ISLNK(inode->i_mode)) {
-- if (!inode->i_blocks)
-+ if (ext2_inode_is_fast_symlink(inode))
+@@ -1002,12 +1010,14 @@
+ if (ext2_inode_is_fast_symlink(inode))
inode->i_op = &ext2_fast_symlink_inode_operations;
else {
- inode->i_op = &page_symlink_inode_operations;
--- /dev/null
+diff -X /home/nikita/src/linux-git/linux-2.6.git/Documentation/dontdiff -rupbB linux-2.4.24.orig/fs/jbd/commit.c linux-2.4.24/fs/jbd/commit.c
+--- linux-2.4.24.orig/fs/jbd/commit.c 2005-06-23 17:39:32.000000000 +0400
++++ linux-2.4.24/fs/jbd/commit.c 2005-06-23 15:56:05.000000000 +0400
+@@ -505,6 +505,9 @@ start_journal_io:
+ goto wait_for_iobuf;
+ }
+
++ if (unlikely(!buffer_uptodate(bh)))
++ err = -EIO;
++
+ clear_bit(BH_JWrite, &jh2bh(jh)->b_state);
+
+ JBUFFER_TRACE(jh, "ph4: unfile after journal write");
+@@ -566,6 +569,9 @@ start_journal_io:
+ goto wait_for_ctlbuf;
+ }
+
++ if (unlikely(!buffer_uptodate(bh)))
++ err = -EIO;
++
+ BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
+ clear_bit(BH_JWrite, &bh->b_state);
+ journal_unfile_buffer(jh);
--- /dev/null
+--- linux/arch/i386/kernel/process.c.seg 2005-03-27 13:07:14.000000000 -0800
++++ linux/arch/i386/kernel/process.c 2005-03-28 10:28:47.000000000 -0800
+@@ -597,8 +597,8 @@ struct task_struct fastcall * __switch_t
+ * Save away %fs and %gs. No need to save %es and %ds, as
+ * those are always kernel segments while inside the kernel.
+ */
+- asm volatile("movl %%fs,%0":"=m" (*(int *)&prev->fs));
+- asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->gs));
++ asm volatile("mov %%fs,%0":"=m" (prev->fs));
++ asm volatile("mov %%gs,%0":"=m" (prev->gs));
+
+ /*
+ * Restore %fs and %gs if needed.
+--- linux/arch/i386/kernel/vm86.c.seg 2005-03-27 13:07:14.000000000 -0800
++++ linux/arch/i386/kernel/vm86.c 2005-03-28 10:28:47.000000000 -0800
+@@ -294,8 +294,8 @@ static void do_sys_vm86(struct kernel_vm
+ */
+ info->regs32->eax = 0;
+ tsk->thread.saved_esp0 = tsk->thread.esp0;
+- asm volatile("movl %%fs,%0":"=m" (tsk->thread.saved_fs));
+- asm volatile("movl %%gs,%0":"=m" (tsk->thread.saved_gs));
++ asm volatile("mov %%fs,%0":"=m" (tsk->thread.saved_fs));
++ asm volatile("mov %%gs,%0":"=m" (tsk->thread.saved_gs));
+
+ tss = &per_cpu(init_tss, get_cpu());
+ tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0;
+--- linux/arch/x86_64/kernel/process.c.seg 2005-03-27 13:07:49.000000000 -0800
++++ linux/arch/x86_64/kernel/process.c 2005-03-28 11:11:04.206766410 -0800
+@@ -391,10 +391,10 @@ int copy_thread(int nr, unsigned long cl
+ p->thread.fs = me->thread.fs;
+ p->thread.gs = me->thread.gs;
+
+- asm("movl %%gs,%0" : "=m" (p->thread.gsindex));
+- asm("movl %%fs,%0" : "=m" (p->thread.fsindex));
+- asm("movl %%es,%0" : "=m" (p->thread.es));
+- asm("movl %%ds,%0" : "=m" (p->thread.ds));
++ asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
++ asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
++ asm("mov %%es,%0" : "=m" (p->thread.es));
++ asm("mov %%ds,%0" : "=m" (p->thread.ds));
+
+ if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
+ p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
+@@ -457,11 +457,11 @@ struct task_struct *__switch_to(struct t
+ * Switch DS and ES.
+ * This won't pick up thread selector changes, but I guess that is ok.
+ */
+- asm volatile("movl %%es,%0" : "=m" (prev->es));
++ asm volatile("mov %%es,%0" : "=m" (prev->es));
+ if (unlikely(next->es | prev->es))
+ loadsegment(es, next->es);
+
+- asm volatile ("movl %%ds,%0" : "=m" (prev->ds));
++ asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
+ if (unlikely(next->ds | prev->ds))
+ loadsegment(ds, next->ds);
+
+@@ -472,7 +472,7 @@ struct task_struct *__switch_to(struct t
+ */
+ {
+ unsigned fsindex;
+- asm volatile("movl %%fs,%0" : "=g" (fsindex));
++ asm volatile("movl %%fs,%0" : "=r" (fsindex));
+ /* segment register != 0 always requires a reload.
+ also reload when it has changed.
+ when prev process used 64bit base always reload
+@@ -493,7 +493,7 @@ struct task_struct *__switch_to(struct t
+ }
+ {
+ unsigned gsindex;
+- asm volatile("movl %%gs,%0" : "=g" (gsindex));
++ asm volatile("movl %%gs,%0" : "=r" (gsindex));
+ if (unlikely(gsindex | next->gsindex | prev->gs)) {
+ load_gs_index(next->gsindex);
+ if (gsindex)
+--- linux/include/asm-i386/system.h.seg 2005-03-27 13:09:12.000000000 -0800
++++ linux/include/asm-i386/system.h 2005-03-28 10:28:47.000000000 -0800
+@@ -81,7 +81,7 @@ static inline unsigned long _get_base(ch
+ #define loadsegment(seg,value) \
+ asm volatile("\n" \
+ "1:\t" \
+- "movl %0,%%" #seg "\n" \
++ "mov %0,%%" #seg "\n" \
+ "2:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "3:\t" \
+@@ -93,13 +93,13 @@ static inline unsigned long _get_base(ch
+ ".align 4\n\t" \
+ ".long 1b,3b\n" \
+ ".previous" \
+- : :"m" (*(unsigned int *)&(value)))
++ : :"m" (value))
+
+ /*
+ * Save a segment register away
+ */
+ #define savesegment(seg, value) \
+- asm volatile("movl %%" #seg ",%0":"=m" (*(int *)&(value)))
++ asm volatile("mov %%" #seg ",%0":"=m" (value))
+
+ /*
+ * Clear and set 'TS' bit respectively
--- /dev/null
+Index: linux-2.6.10/drivers/char/qtronix.c
+===================================================================
+--- linux-2.6.10.orig/drivers/char/qtronix.c 2004-12-24 14:35:50.000000000 -0700
++++ linux-2.6.10/drivers/char/qtronix.c 2006-01-03 16:16:52.000000000 -0700
+@@ -537,7 +537,8 @@
+ 		i--;
+ 	}
+ 	if (count-i) {
+-		file->f_dentry->d_inode->i_atime = CURRENT_TIME;
++		struct inode *inode = file->f_dentry->d_inode;
++		inode->i_atime = current_fs_time(inode->i_sb);
+ return count-i;
+ }
+ if (signal_pending(current))
+Index: linux-2.6.10/drivers/char/random.c
+===================================================================
+--- linux-2.6.10.orig/drivers/char/random.c 2005-04-06 09:38:33.000000000 -0600
++++ linux-2.6.10/drivers/char/random.c 2006-01-03 16:16:52.000000000 -0700
+@@ -1743,8 +1743,9 @@
+ if (p == buffer) {
+ return (ssize_t)ret;
+ } else {
+- file->f_dentry->d_inode->i_mtime = CURRENT_TIME;
+- mark_inode_dirty(file->f_dentry->d_inode);
++ struct inode *inode = file->f_dentry->d_inode;
++ inode->i_mtime = current_fs_time(inode->i_sb);
++ mark_inode_dirty(inode);
+ return (ssize_t)(p - buffer);
+ }
+ }
+Index: linux-2.6.10/drivers/char/sonypi.c
+===================================================================
+--- linux-2.6.10.orig/drivers/char/sonypi.c 2004-12-24 14:35:23.000000000 -0700
++++ linux-2.6.10/drivers/char/sonypi.c 2006-01-03 16:18:31.000000000 -0700
+@@ -537,7 +537,9 @@
+ 	}
+
+-	if (ret > 0)
+-		file->f_dentry->d_inode->i_atime = CURRENT_TIME;
++	if (ret > 0) {
++		struct inode *inode = file->f_dentry->d_inode;
++		inode->i_atime = current_fs_time(inode->i_sb);
++	}
+
+ return ret;
+ }
+Index: linux-2.6.10/drivers/char/tty_io.c
+===================================================================
+--- linux-2.6.10.orig/drivers/char/tty_io.c 2005-04-06 09:38:33.000000000 -0600
++++ linux-2.6.10/drivers/char/tty_io.c 2006-01-03 16:16:52.000000000 -0700
+@@ -1018,7 +1018,7 @@
+ tty_ldisc_deref(ld);
+ unlock_kernel();
+ if (i > 0)
+- inode->i_atime = CURRENT_TIME;
++ inode->i_atime = current_fs_time(inode->i_sb);
+ return i;
+ }
+
+@@ -1095,7 +1095,8 @@
+ cond_resched();
+ }
+ if (written) {
+- file->f_dentry->d_inode->i_mtime = CURRENT_TIME;
++ struct inode *inode = file->f_dentry->d_inode;
++ inode->i_mtime = current_fs_time(inode->i_sb);
+ ret = written;
+ }
+ up(&tty->atomic_write);
+Index: linux-2.6.10/fs/attr.c
+===================================================================
+--- linux-2.6.10.orig/fs/attr.c 2004-12-24 14:34:00.000000000 -0700
++++ linux-2.6.10/fs/attr.c 2006-01-03 16:16:52.000000000 -0700
+@@ -14,6 +14,7 @@
+ #include <linux/fcntl.h>
+ #include <linux/quotaops.h>
+ #include <linux/security.h>
++#include <linux/time.h>
+
+ /* Taken over from the old code... */
+
+@@ -87,11 +88,14 @@
+ if (ia_valid & ATTR_GID)
+ inode->i_gid = attr->ia_gid;
+ if (ia_valid & ATTR_ATIME)
+- inode->i_atime = attr->ia_atime;
++ inode->i_atime = timespec_trunc(attr->ia_atime,
++ get_sb_time_gran(inode->i_sb));
+ if (ia_valid & ATTR_MTIME)
+- inode->i_mtime = attr->ia_mtime;
++ inode->i_mtime = timespec_trunc(attr->ia_mtime,
++ get_sb_time_gran(inode->i_sb));
+ if (ia_valid & ATTR_CTIME)
+- inode->i_ctime = attr->ia_ctime;
++ inode->i_ctime = timespec_trunc(attr->ia_ctime,
++ get_sb_time_gran(inode->i_sb));
+ if (ia_valid & ATTR_MODE) {
+ umode_t mode = attr->ia_mode;
+
+@@ -131,14 +135,17 @@
+ int notify_change(struct dentry * dentry, struct iattr * attr)
+ {
+ struct inode *inode = dentry->d_inode;
+- mode_t mode = inode->i_mode;
++ mode_t mode;
+ int error;
+- struct timespec now = CURRENT_TIME;
++ struct timespec now;
+ unsigned int ia_valid = attr->ia_valid;
+
+ if (!inode)
+ BUG();
+
++ mode = inode->i_mode;
++ now = current_fs_time(inode->i_sb);
++
+ attr->ia_ctime = now;
+ if (!(ia_valid & ATTR_ATIME_SET))
+ attr->ia_atime = now;
+Index: linux-2.6.10/fs/bad_inode.c
+===================================================================
+--- linux-2.6.10.orig/fs/bad_inode.c 2004-12-24 14:35:50.000000000 -0700
++++ linux-2.6.10/fs/bad_inode.c 2006-01-03 16:16:52.000000000 -0700
+@@ -105,7 +105,8 @@
+ remove_inode_hash(inode);
+
+ inode->i_mode = S_IFREG;
+- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
++ inode->i_atime = inode->i_mtime = inode->i_ctime =
++ current_fs_time(inode->i_sb);
+ inode->i_op = &bad_inode_ops;
+ inode->i_fop = &bad_file_ops;
+ }
+Index: linux-2.6.10/fs/binfmt_misc.c
+===================================================================
+--- linux-2.6.10.orig/fs/binfmt_misc.c 2004-12-24 14:34:31.000000000 -0700
++++ linux-2.6.10/fs/binfmt_misc.c 2006-01-03 16:16:52.000000000 -0700
+@@ -509,7 +509,8 @@
+ inode->i_gid = 0;
+ inode->i_blksize = PAGE_CACHE_SIZE;
+ inode->i_blocks = 0;
+- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
++ inode->i_atime = inode->i_mtime = inode->i_ctime =
++ current_fs_time(inode->i_sb);
+ }
+ return inode;
+ }
+Index: linux-2.6.10/fs/ext2/dir.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext2/dir.c 2004-12-24 14:34:58.000000000 -0700
++++ linux-2.6.10/fs/ext2/dir.c 2006-01-03 16:16:52.000000000 -0700
+@@ -426,7 +426,7 @@
+ ext2_set_de_type (de, inode);
+ err = ext2_commit_chunk(page, from, to);
+ ext2_put_page(page);
+- dir->i_mtime = dir->i_ctime = CURRENT_TIME;
++ dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+ EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
+ mark_inode_dirty(dir);
+ }
+@@ -516,7 +516,7 @@
+ de->inode = cpu_to_le32(inode->i_ino);
+ ext2_set_de_type (de, inode);
+ err = ext2_commit_chunk(page, from, to);
+- dir->i_mtime = dir->i_ctime = CURRENT_TIME;
++ dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+ EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
+ mark_inode_dirty(dir);
+ /* OFFSET_CACHE */
+@@ -564,7 +564,7 @@
+ pde->rec_len = cpu_to_le16(to-from);
+ dir->inode = 0;
+ err = ext2_commit_chunk(page, from, to);
+- inode->i_ctime = inode->i_mtime = CURRENT_TIME;
++ inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
+ EXT2_I(inode)->i_flags &= ~EXT2_BTREE_FL;
+ mark_inode_dirty(inode);
+ out:
+Index: linux-2.6.10/fs/ext2/ialloc.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext2/ialloc.c 2004-12-24 14:34:47.000000000 -0700
++++ linux-2.6.10/fs/ext2/ialloc.c 2006-01-03 16:16:52.000000000 -0700
+@@ -577,7 +577,7 @@
+ inode->i_ino = ino;
+ inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size (for stat), not the fs block size */
+ inode->i_blocks = 0;
+- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
++ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+ memset(ei->i_data, 0, sizeof(ei->i_data));
+ ei->i_flags = EXT2_I(dir)->i_flags & ~EXT2_BTREE_FL;
+ if (S_ISLNK(mode))
+Index: linux-2.6.10/fs/ext2/inode.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext2/inode.c 2004-12-24 14:33:51.000000000 -0700
++++ linux-2.6.10/fs/ext2/inode.c 2006-01-03 16:16:52.000000000 -0700
+@@ -493,7 +493,7 @@
+
+ /* We are done with atomic stuff, now do the rest of housekeeping */
+
+- inode->i_ctime = CURRENT_TIME;
++ inode->i_ctime = CURRENT_TIME_SEC;
+
+ /* had we spliced it onto indirect block? */
+ if (where->bh)
+@@ -953,7 +953,7 @@
+ case EXT2_TIND_BLOCK:
+ ;
+ }
+- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
++ inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
+ if (inode_needs_sync(inode)) {
+ sync_mapping_buffers(inode->i_mapping);
+ ext2_sync_inode (inode);
+Index: linux-2.6.10/fs/ext2/ioctl.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext2/ioctl.c 2004-12-24 14:35:49.000000000 -0700
++++ linux-2.6.10/fs/ext2/ioctl.c 2006-01-03 16:16:52.000000000 -0700
+@@ -59,7 +59,7 @@
+ ei->i_flags = flags;
+
+ ext2_set_inode_flags(inode);
+- inode->i_ctime = CURRENT_TIME;
++ inode->i_ctime = CURRENT_TIME_SEC;
+ mark_inode_dirty(inode);
+ return 0;
+ }
+@@ -72,7 +72,7 @@
+ return -EROFS;
+ if (get_user(inode->i_generation, (int __user *) arg))
+ return -EFAULT;
+- inode->i_ctime = CURRENT_TIME;
++ inode->i_ctime = CURRENT_TIME_SEC;
+ mark_inode_dirty(inode);
+ return 0;
+ default:
+Index: linux-2.6.10/fs/ext2/namei.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext2/namei.c 2004-12-24 14:35:25.000000000 -0700
++++ linux-2.6.10/fs/ext2/namei.c 2006-01-03 16:16:52.000000000 -0700
+@@ -211,7 +211,7 @@
+ if (inode->i_nlink >= EXT2_LINK_MAX)
+ return -EMLINK;
+
+- inode->i_ctime = CURRENT_TIME;
++ inode->i_ctime = CURRENT_TIME_SEC;
+ ext2_inc_count(inode);
+ atomic_inc(&inode->i_count);
+
+@@ -337,7 +337,7 @@
+ goto out_dir;
+ ext2_inc_count(old_inode);
+ ext2_set_link(new_dir, new_de, new_page, old_inode);
+- new_inode->i_ctime = CURRENT_TIME;
++ new_inode->i_ctime = CURRENT_TIME_SEC;
+ if (dir_de)
+ new_inode->i_nlink--;
+ ext2_dec_count(new_inode);
+@@ -362,7 +362,7 @@
+ * rename.
+ * ext2_dec_count() will mark the inode dirty.
+ */
+- old_inode->i_ctime = CURRENT_TIME;
++ old_inode->i_ctime = CURRENT_TIME_SEC;
+
+ ext2_delete_entry (old_de, old_page);
+ ext2_dec_count(old_inode);
+Index: linux-2.6.10/fs/ext2/super.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext2/super.c 2004-12-24 14:35:01.000000000 -0700
++++ linux-2.6.10/fs/ext2/super.c 2006-01-03 16:19:06.000000000 -0700
+@@ -595,7 +595,7 @@
+ es = (struct ext2_super_block *) (((char *)bh->b_data) + offset);
+ sbi->s_es = es;
+ sb->s_magic = le16_to_cpu(es->s_magic);
+- sb->s_flags |= MS_ONE_SECOND;
++ set_sb_time_gran(sb, 1000000000U);
+
+ if (sb->s_magic != EXT2_SUPER_MAGIC)
+ goto cantfind_ext2;
+Index: linux-2.6.10/fs/ext2/xattr.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext2/xattr.c 2005-04-06 09:38:35.000000000 -0600
++++ linux-2.6.10/fs/ext2/xattr.c 2006-01-03 16:16:52.000000000 -0700
+@@ -702,7 +702,7 @@
+
+ /* Update the inode. */
+ EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
+- inode->i_ctime = CURRENT_TIME;
++ inode->i_ctime = CURRENT_TIME_SEC;
+ if (IS_SYNC(inode)) {
+ error = ext2_sync_inode (inode);
+ if (error)
+Index: linux-2.6.10/fs/ext3/ialloc.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/ialloc.c 2004-12-24 14:34:45.000000000 -0700
++++ linux-2.6.10/fs/ext3/ialloc.c 2006-01-03 16:16:52.000000000 -0700
+@@ -558,7 +558,7 @@
+ /* This is the optimal IO size (for stat), not the fs block size */
+ inode->i_blksize = PAGE_SIZE;
+ inode->i_blocks = 0;
+- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
++ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+
+ memset(ei->i_data, 0, sizeof(ei->i_data));
+ ei->i_next_alloc_block = 0;
+Index: linux-2.6.10/fs/ext3/inode.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/inode.c 2005-04-06 09:38:35.000000000 -0600
++++ linux-2.6.10/fs/ext3/inode.c 2006-01-03 16:16:52.000000000 -0700
+@@ -626,7 +626,7 @@
+
+ /* We are done with atomic stuff, now do the rest of housekeeping */
+
+- inode->i_ctime = CURRENT_TIME;
++ inode->i_ctime = CURRENT_TIME_SEC;
+ ext3_mark_inode_dirty(handle, inode);
+
+ /* had we spliced it onto indirect block? */
+@@ -2199,7 +2199,7 @@
+ ;
+ }
+ up(&ei->truncate_sem);
+- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
++ inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
+ ext3_mark_inode_dirty(handle, inode);
+
+ /* In a multi-transaction truncate, we only make the final
+Index: linux-2.6.10/fs/ext3/ioctl.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/ioctl.c 2004-12-24 14:34:31.000000000 -0700
++++ linux-2.6.10/fs/ext3/ioctl.c 2006-01-03 16:16:52.000000000 -0700
+@@ -87,7 +87,7 @@
+ ei->i_flags = flags;
+
+ ext3_set_inode_flags(inode);
+- inode->i_ctime = CURRENT_TIME;
++ inode->i_ctime = CURRENT_TIME_SEC;
+
+ err = ext3_mark_iloc_dirty(handle, inode, &iloc);
+ flags_err:
+@@ -121,7 +121,7 @@
+ return PTR_ERR(handle);
+ err = ext3_reserve_inode_write(handle, inode, &iloc);
+ if (err == 0) {
+- inode->i_ctime = CURRENT_TIME;
++ inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_generation = generation;
+ err = ext3_mark_iloc_dirty(handle, inode, &iloc);
+ }
+Index: linux-2.6.10/fs/ext3/namei.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/namei.c 2004-12-24 14:34:58.000000000 -0700
++++ linux-2.6.10/fs/ext3/namei.c 2006-01-03 16:16:52.000000000 -0700
+@@ -1251,7 +1251,7 @@
+ * happen is that the times are slightly out of date
+ * and/or different from the directory change time.
+ */
+- dir->i_mtime = dir->i_ctime = CURRENT_TIME;
++ dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+ ext3_update_dx_flag(dir);
+ dir->i_version++;
+ ext3_mark_inode_dirty(handle, dir);
+@@ -2029,7 +2029,7 @@
+ * recovery. */
+ inode->i_size = 0;
+ ext3_orphan_add(handle, inode);
+- inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
++ inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
+ ext3_mark_inode_dirty(handle, inode);
+ dir->i_nlink--;
+ ext3_update_dx_flag(dir);
+@@ -2079,7 +2079,7 @@
+ retval = ext3_delete_entry(handle, dir, de, bh);
+ if (retval)
+ goto end_unlink;
+- dir->i_ctime = dir->i_mtime = CURRENT_TIME;
++ dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
+ ext3_update_dx_flag(dir);
+ ext3_mark_inode_dirty(handle, dir);
+ inode->i_nlink--;
+@@ -2169,7 +2169,7 @@
+ if (IS_DIRSYNC(dir))
+ handle->h_sync = 1;
+
+- inode->i_ctime = CURRENT_TIME;
++ inode->i_ctime = CURRENT_TIME_SEC;
+ ext3_inc_count(handle, inode);
+ atomic_inc(&inode->i_count);
+
+@@ -2270,7 +2270,7 @@
+ * Like most other Unix systems, set the ctime for inodes on a
+ * rename.
+ */
+- old_inode->i_ctime = CURRENT_TIME;
++ old_inode->i_ctime = CURRENT_TIME_SEC;
+ ext3_mark_inode_dirty(handle, old_inode);
+
+ /*
+@@ -2303,9 +2303,9 @@
+
+ if (new_inode) {
+ new_inode->i_nlink--;
+- new_inode->i_ctime = CURRENT_TIME;
++ new_inode->i_ctime = CURRENT_TIME_SEC;
+ }
+- old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
++ old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
+ ext3_update_dx_flag(old_dir);
+ if (dir_bh) {
+ BUFFER_TRACE(dir_bh, "get_write_access");
+Index: linux-2.6.10/fs/ext3/super.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/super.c 2005-04-06 09:38:35.000000000 -0600
++++ linux-2.6.10/fs/ext3/super.c 2006-01-03 16:16:52.000000000 -0700
+@@ -1318,7 +1318,7 @@
+ if (!parse_options ((char *) data, sb, &journal_inum, NULL, 0))
+ goto failed_mount;
+
+- sb->s_flags |= MS_ONE_SECOND;
++ set_sb_time_gran(sb, 1000000000U);
+ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+ ((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+
+Index: linux-2.6.10/fs/ext3/xattr.c
+===================================================================
+--- linux-2.6.10.orig/fs/ext3/xattr.c 2005-04-06 09:38:35.000000000 -0600
++++ linux-2.6.10/fs/ext3/xattr.c 2006-01-03 16:16:52.000000000 -0700
+@@ -723,7 +723,7 @@
+
+ /* Update the inode. */
+ EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
+- inode->i_ctime = CURRENT_TIME;
++ inode->i_ctime = CURRENT_TIME_SEC;
+ ext3_mark_inode_dirty(handle, inode);
+ if (IS_SYNC(inode))
+ handle->h_sync = 1;
+Index: linux-2.6.10/fs/inode.c
+===================================================================
+--- linux-2.6.10.orig/fs/inode.c 2006-01-03 15:33:21.000000000 -0700
++++ linux-2.6.10/fs/inode.c 2006-01-03 16:16:52.000000000 -0700
+@@ -1131,19 +1131,6 @@
+
+ EXPORT_SYMBOL(bmap);
+
+-/*
+- * Return true if the filesystem which backs this inode considers the two
+- * passed timespecs to be sufficiently different to warrant flushing the
+- * altered time out to disk.
+- */
+-static int inode_times_differ(struct inode *inode,
+- struct timespec *old, struct timespec *new)
+-{
+- if (IS_ONE_SECOND(inode))
+- return old->tv_sec != new->tv_sec;
+- return !timespec_equal(old, new);
+-}
+-
+ /**
+ * update_atime - update the access time
+ * @inode: inode accessed
+@@ -1163,8 +1150,8 @@
+ if (IS_RDONLY(inode))
+ return;
+
+- now = current_kernel_time();
+- if (inode_times_differ(inode, &inode->i_atime, &now)) {
++ now = current_fs_time(inode->i_sb);
++ if (!timespec_equal(&inode->i_atime, &now)) {
+ inode->i_atime = now;
+ mark_inode_dirty_sync(inode);
+ } else {
+@@ -1194,14 +1181,13 @@
+ if (IS_RDONLY(inode))
+ return;
+
+- now = current_kernel_time();
+-
+- if (inode_times_differ(inode, &inode->i_mtime, &now))
++ now = current_fs_time(inode->i_sb);
++ if (!timespec_equal(&inode->i_mtime, &now))
+ sync_it = 1;
+ inode->i_mtime = now;
+
+ if (ctime_too) {
+- if (inode_times_differ(inode, &inode->i_ctime, &now))
++ if (!timespec_equal(&inode->i_ctime, &now))
+ sync_it = 1;
+ inode->i_ctime = now;
+ }
+Index: linux-2.6.10/fs/locks.c
+===================================================================
+--- linux-2.6.10.orig/fs/locks.c 2004-12-24 14:35:28.000000000 -0700
++++ linux-2.6.10/fs/locks.c 2006-01-03 16:16:52.000000000 -0700
+@@ -1228,7 +1228,7 @@
+ {
+ struct file_lock *flock = inode->i_flock;
+ if (flock && IS_LEASE(flock) && (flock->fl_type & F_WRLCK))
+- *time = CURRENT_TIME;
++ *time = current_fs_time(inode->i_sb);
+ else
+ *time = inode->i_mtime;
+ }
+Index: linux-2.6.10/include/linux/fs.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/fs.h 2006-01-03 16:04:26.000000000 -0700
++++ linux-2.6.10/include/linux/fs.h 2006-01-03 16:16:52.000000000 -0700
+@@ -124,7 +124,8 @@
+ #define MS_REC 16384
+ #define MS_VERBOSE 32768
+ #define MS_POSIXACL (1<<16) /* VFS does not apply the umask */
+-#define MS_ONE_SECOND (1<<17) /* fs has 1 sec a/m/ctime resolution */
++#define MS_ONE_SECOND (1<<17) /* fs has 1 sec time resolution (obsolete) */
++#define MS_TIME_GRAN (1<<18) /* fs has s_time_gran field */
+ #define MS_ACTIVE (1<<30)
+ #define MS_NOUSER (1<<31)
+
+@@ -803,8 +804,33 @@
+ * even looking at it. You had been warned.
+ */
+ struct semaphore s_vfs_rename_sem; /* Kludge */
++
++ /* Granularity of c/m/atime in ns.
++ Cannot be coarser than a second */
++#ifndef __GENKSYMS__
++ u32 s_time_gran;
++#endif
+ };
+
++extern struct timespec current_fs_time(struct super_block *sb);
++
++static inline u32 get_sb_time_gran(struct super_block *sb)
++{
++ if (sb->s_flags & MS_TIME_GRAN)
++ return sb->s_time_gran;
++ if (sb->s_flags & MS_ONE_SECOND)
++ return 1000000000U;
++ return 1;
++}
++
++static inline void set_sb_time_gran(struct super_block *sb, u32 time_gran)
++{
++ sb->s_time_gran = time_gran;
++ sb->s_flags |= MS_TIME_GRAN;
++ if (time_gran == 1000000000U)
++ sb->s_flags |= MS_ONE_SECOND;
++}
++
+ /*
+ * Snapshotting support.
+ */
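
How a filesystem is expected to use the two helpers above at mount time; a
minimal sketch, where myfs_fill_super() is a hypothetical example and not
part of this patch:

    /* Hypothetical fill_super for a filesystem whose on-disk format
     * stores timestamps with one-second resolution. */
    static int myfs_fill_super(struct super_block *sb, void *data, int silent)
    {
            /* ... read the on-disk superblock, set sb->s_op, etc. ... */

            /* Declare 1 s timestamp granularity; the helper sets
             * MS_TIME_GRAN and, for this value, MS_ONE_SECOND too. */
            set_sb_time_gran(sb, 1000000000U);
            return 0;
    }

A filesystem with full nanosecond timestamps would pass 1 instead, which is
also what get_sb_time_gran() assumes when neither flag is set.
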
+Index: linux-2.6.10/include/linux/time.h
+===================================================================
+--- linux-2.6.10.orig/include/linux/time.h 2004-12-24 14:35:00.000000000 -0700
++++ linux-2.6.10/include/linux/time.h 2006-01-03 16:16:52.000000000 -0700
+@@ -90,6 +90,7 @@
+ struct timespec current_kernel_time(void);
+
+ #define CURRENT_TIME (current_kernel_time())
++#define CURRENT_TIME_SEC ((struct timespec) { xtime.tv_sec, 0 })
+
+ extern void do_gettimeofday(struct timeval *tv);
+ extern int do_settimeofday(struct timespec *tv);
+@@ -103,6 +104,8 @@
+ extern int do_getitimer(int which, struct itimerval *value);
+ extern void getnstimeofday (struct timespec *tv);
+
++extern struct timespec timespec_trunc(struct timespec t, unsigned gran);
++
+ static inline void
+ set_normalized_timespec (struct timespec *ts, time_t sec, long nsec)
+ {
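
The difference between the two macros, as seen from any kernel context; the
numeric values below are illustrative only:

    struct timespec full = CURRENT_TIME;     /* e.g. { 1136239012, 733000000 } */
    struct timespec secs = CURRENT_TIME_SEC; /* e.g. { 1136239012, 0 } */

CURRENT_TIME calls current_kernel_time() and keeps tv_nsec; CURRENT_TIME_SEC
reads xtime.tv_sec directly and zeroes tv_nsec, which is all a filesystem
with one-second timestamps such as ext3 can store anyway.
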
+Index: linux-2.6.10/kernel/time.c
+===================================================================
+--- linux-2.6.10.orig/kernel/time.c 2004-12-24 14:34:26.000000000 -0700
++++ linux-2.6.10/kernel/time.c 2006-01-03 16:16:52.000000000 -0700
+@@ -36,6 +36,7 @@
+
+ #include <asm/uaccess.h>
+ #include <asm/unistd.h>
++#include <linux/fs.h>
+
+ /*
+ * The timezone where the local system is located. Used as a default by some
+@@ -433,6 +434,50 @@
+
+ EXPORT_SYMBOL(current_kernel_time);
+
++/**
++ * current_fs_time - Return FS time
++ * @sb: Superblock.
++ *
++ * Return the current time truncated to the time granularity supported by
++ * the fs.
++ */
++struct timespec current_fs_time(struct super_block *sb)
++{
++ struct timespec now = current_kernel_time();
++ return timespec_trunc(now, get_sb_time_gran(sb));
++}
++EXPORT_SYMBOL(current_fs_time);
++
++/**
++ * timespec_trunc - Truncate timespec to a granularity
++ * @t: Timespec
++ * @gran: Granularity in ns.
++ *
++ * Truncate a timespec to a granularity. gran must not be larger than a second.
++ * Always rounds down.
++ *
++ * This function should only be used for timestamps returned by
++ * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because
++ * it doesn't handle the finer resolution of the latter.
++ */
++struct timespec timespec_trunc(struct timespec t, unsigned gran)
++{
++ /*
++ * Division is pretty slow so avoid it for common cases.
++ * Currently current_kernel_time() never returns better than
++ * jiffies resolution. Exploit that.
++ */
++ if (gran <= jiffies_to_usecs(1) * 1000) {
++ /* nothing */
++ } else if (gran == 1000000000) {
++ t.tv_nsec = 0;
++ } else {
++ t.tv_nsec -= t.tv_nsec % gran;
++ }
++ return t;
++}
++EXPORT_SYMBOL(timespec_trunc);
++
+ #ifdef CONFIG_TIME_INTERPOLATION
+ void getnstimeofday (struct timespec *tv)
+ {
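
The rounding semantics of timespec_trunc() with example values; a sketch,
assuming a caller inside the kernel:

    static void example_trunc(void)
    {
            struct timespec t = { .tv_sec = 5, .tv_nsec = 123456789 };
            struct timespec a, b;

            a = timespec_trunc(t, 100000000U);  /* 100 ms gran: 5.100000000 */
            b = timespec_trunc(t, 1000000000U); /* 1 s gran:    5.000000000 */
    }

For gran at or below the jiffy resolution the input is returned unchanged,
which is safe precisely because current_kernel_time() never carries
sub-jiffy precision.
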
--- /dev/null
+Index: linux-2.6.12-rc6/fs/nfs/dir.c
+===================================================================
+--- linux-2.6.12-rc6.orig/fs/nfs/dir.c 2005-06-14 14:22:14.585699648 +0200
++++ linux-2.6.12-rc6/fs/nfs/dir.c 2005-06-14 14:26:39.884524523 +0200
+@@ -783,7 +783,7 @@
+ if (nd->flags & LOOKUP_DIRECTORY)
+ return 0;
+ /* Are we trying to write to a read only partition? */
+- if (IS_RDONLY(dir) && (nd->intent.open.flags & (O_CREAT|O_TRUNC|FMODE_WRITE)))
++ if (IS_RDONLY(dir) && (nd->intent.it_flags & (O_CREAT|O_TRUNC|FMODE_WRITE)))
+ return 0;
+ return 1;
+ }
+@@ -805,7 +805,7 @@
+ dentry->d_op = NFS_PROTO(dir)->dentry_ops;
+
+ /* Let vfs_create() deal with O_EXCL */
+- if (nd->intent.open.flags & O_EXCL)
++ if (nd->intent.it_flags & O_EXCL)
+ goto no_entry;
+
+ /* Open the file on the server */
+@@ -817,7 +817,7 @@
+ goto out;
+ }
+
+- if (nd->intent.open.flags & O_CREAT) {
++ if (nd->intent.it_flags & O_CREAT) {
+ nfs_begin_data_update(dir);
+ inode = nfs4_atomic_open(dir, dentry, nd);
+ nfs_end_data_update(dir);
+@@ -833,7 +833,7 @@
+ break;
+ /* This turned out not to be a regular file */
+ case -ELOOP:
+- if (!(nd->intent.open.flags & O_NOFOLLOW))
++ if (!(nd->intent.it_flags & O_NOFOLLOW))
+ goto no_open;
+ /* case -EISDIR: */
+ /* case -EINVAL: */
+@@ -874,7 +874,7 @@
+ /* NFS only supports OPEN on regular files */
+ if (!S_ISREG(inode->i_mode))
+ goto no_open;
+- openflags = nd->intent.open.flags;
++ openflags = nd->intent.it_flags;
+ /* We cannot do exclusive creation on a positive dentry */
+ if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
+ goto no_open;
+Index: linux-2.6.12-rc6/fs/nfs/nfs4proc.c
+===================================================================
+--- linux-2.6.12-rc6.orig/fs/nfs/nfs4proc.c 2005-06-06 17:22:29.000000000 +0200
++++ linux-2.6.12-rc6/fs/nfs/nfs4proc.c 2005-06-14 14:30:18.499756220 +0200
+@@ -877,19 +877,19 @@
+ struct nfs4_state *state;
+
+ if (nd->flags & LOOKUP_CREATE) {
+- attr.ia_mode = nd->intent.open.create_mode;
++ attr.ia_mode = nd->intent.it_create_mode;
+ attr.ia_valid = ATTR_MODE;
+ if (!IS_POSIXACL(dir))
+ attr.ia_mode &= ~current->fs->umask;
+ } else {
+ attr.ia_valid = 0;
+- BUG_ON(nd->intent.open.flags & O_CREAT);
++ BUG_ON(nd->intent.it_flags & O_CREAT);
+ }
+
+ cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0);
+ if (IS_ERR(cred))
+ return (struct inode *)cred;
+- state = nfs4_do_open(dir, dentry, nd->intent.open.flags, &attr, cred);
++ state = nfs4_do_open(dir, dentry, nd->intent.it_flags, &attr, cred);
+ put_rpccred(cred);
+ if (IS_ERR(state))
+ return (struct inode *)state;
+Index: linux-2.6.12-rc6/fs/cifs/dir.c
+===================================================================
+--- linux-2.6.12-rc6.orig/fs/cifs/dir.c 2005-06-06 17:22:29.000000000 +0200
++++ linux-2.6.12-rc6/fs/cifs/dir.c 2005-06-14 14:26:39.915774522 +0200
+@@ -146,23 +146,23 @@
+ }
+
+ if(nd) {
+- if ((nd->intent.open.flags & O_ACCMODE) == O_RDONLY)
++ if ((nd->intent.it_flags & O_ACCMODE) == O_RDONLY)
+ desiredAccess = GENERIC_READ;
+- else if ((nd->intent.open.flags & O_ACCMODE) == O_WRONLY) {
++ else if ((nd->intent.it_flags & O_ACCMODE) == O_WRONLY) {
+ desiredAccess = GENERIC_WRITE;
+ write_only = TRUE;
+- } else if ((nd->intent.open.flags & O_ACCMODE) == O_RDWR) {
++ } else if ((nd->intent.it_flags & O_ACCMODE) == O_RDWR) {
+ /* GENERIC_ALL is too much permission to request */
+ /* can cause unnecessary access denied on create */
+ /* desiredAccess = GENERIC_ALL; */
+ desiredAccess = GENERIC_READ | GENERIC_WRITE;
+ }
+
+- if((nd->intent.open.flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
++ if((nd->intent.it_flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
+ disposition = FILE_CREATE;
+- else if((nd->intent.open.flags & (O_CREAT | O_TRUNC)) == (O_CREAT | O_TRUNC))
++ else if((nd->intent.it_flags & (O_CREAT | O_TRUNC)) == (O_CREAT | O_TRUNC))
+ disposition = FILE_OVERWRITE_IF;
+- else if((nd->intent.open.flags & O_CREAT) == O_CREAT)
++ else if((nd->intent.it_flags & O_CREAT) == O_CREAT)
+ disposition = FILE_OPEN_IF;
+ else {
+ cFYI(1,("Create flag not set in create function"));
===================================================================
--- linux-bgl.orig/fs/nfsd/vfs.c 2003-07-02 08:44:33.000000000 -0700
+++ linux-bgl/fs/nfsd/vfs.c 2004-12-28 17:13:59.940919832 -0800
-@@ -77,6 +77,128 @@
+@@ -77,6 +77,129 @@
static struct raparms * raparml;
static struct raparms * raparm_cache;
+ struct nameidata nd = { .dentry = ddir, .last = dnew->d_name };
+ struct inode_operations *op = nd.dentry->d_inode->i_op;
+ err = op->link_raw(&old_nd, &nd);
++ igrab(dold->d_inode);
+ d_instantiate(dnew, dold->d_inode);
+ if(dold->d_inode->i_op && dold->d_inode->i_op->revalidate_it)
+ dold->d_inode->i_op->revalidate_it(dnew, NULL);
--- linux-2.4.20-hp4-pnnl13/fs/nfsd/vfs.c~nfs_export_kernel-2.4.20-hp 2002-11-29 02:53:15.000000000 +0300
+++ linux-2.4.20-hp4-pnnl13-alexey/fs/nfsd/vfs.c 2003-10-08 10:54:08.000000000 +0400
-@@ -77,6 +77,129 @@ struct raparms {
+@@ -77,6 +77,130 @@ struct raparms {
static struct raparms * raparml;
static struct raparms * raparm_cache;
+ struct nameidata nd = { .dentry = ddir, .last = dnew->d_name };
+ struct inode_operations *op = nd.dentry->d_inode->i_op;
+ err = op->link_raw(&old_nd, &nd);
++ igrab(dold->d_inode);
+ d_instantiate(dnew, dold->d_inode);
+ if (dold->d_inode->i_op && dold->d_inode->i_op->revalidate_it)
+ dold->d_inode->i_op->revalidate_it(dnew, NULL);
===================================================================
--- linux-2.4.21-chaos.orig/fs/nfsd/vfs.c 2003-09-19 03:49:54.000000000 +0400
+++ linux-2.4.21-chaos/fs/nfsd/vfs.c 2003-12-12 16:19:25.000000000 +0300
-@@ -78,6 +78,126 @@
+@@ -78,6 +78,127 @@
static struct raparms * raparml;
static struct raparms * raparm_cache;
+ struct nameidata nd = { .dentry = ddir, .last = dnew->d_name };
+ struct inode_operations *op = nd.dentry->d_inode->i_op;
+ err = op->link_raw(&old_nd, &nd);
++ igrab(dold->d_inode);
+ d_instantiate(dnew, dold->d_inode);
+ if (dold->d_inode->i_op && dold->d_inode->i_op->revalidate_it)
+ dold->d_inode->i_op->revalidate_it(dnew, NULL);
===================================================================
--- linux-2.4.22-vanilla.orig/fs/nfsd/vfs.c 2003-11-03 23:22:11.000000000 +0300
+++ linux-2.4.22-vanilla/fs/nfsd/vfs.c 2003-11-03 23:47:41.000000000 +0300
-@@ -77,6 +77,126 @@
+@@ -77,6 +77,127 @@
static struct raparms * raparml;
static struct raparms * raparm_cache;
+ struct nameidata nd = { .dentry = ddir, .last = dnew->d_name };
+ struct inode_operations *op = nd.dentry->d_inode->i_op;
+ err = op->link_raw(&old_nd, &nd);
++ igrab(dold->d_inode);
+ d_instantiate(dnew, dold->d_inode);
+ if (dold->d_inode->i_op && dold->d_inode->i_op->revalidate_it)
+ dold->d_inode->i_op->revalidate_it(dnew, NULL);
===================================================================
--- linux-2.4.29.orig/fs/nfsd/vfs.c 2005-05-03 16:28:21.000000000 +0300
+++ linux-2.4.29/fs/nfsd/vfs.c 2005-05-03 18:46:09.372133224 +0300
-@@ -77,6 +77,126 @@
+@@ -77,6 +77,127 @@
static struct raparms * raparml;
static struct raparms * raparm_cache;
+ struct nameidata nd = { .dentry = ddir, .last = dnew->d_name };
+ struct inode_operations *op = nd.dentry->d_inode->i_op;
+ err = op->link_raw(&old_nd, &nd);
++ igrab(dold->d_inode);
+ d_instantiate(dnew, dold->d_inode);
+ if (dold->d_inode->i_op && dold->d_inode->i_op->revalidate_it)
+ dold->d_inode->i_op->revalidate_it(dnew, NULL);
--- linux-2.4.21.orig/arch/ia64/kernel/ia64_ksyms.c 2005-06-01 22:51:59.000000000 -0400
+++ linux-2.4.21/arch/ia64/kernel/ia64_ksyms.c 2005-06-01 23:14:43.773842072 -0400
@@ -207,3 +207,13 @@
- EXPORT_SYMBOL_GPL(show_mem);
EXPORT_SYMBOL_GPL(show_state);
EXPORT_SYMBOL_GPL(show_regs);
+ EXPORT_SYMBOL(pm_power_off);
+
+#define __KERNEL_SYSCALLS__ 1
+#include <asm/unistd.h>
#include <linux/shmem_fs.h>
#include <linux/security.h>
+#include <linux/module.h>
+ #include <linux/audit.h>
#include <asm/uaccess.h>
- #include "util.h"
@@ -850,6 +851,44 @@
return retval;
}
spin_unlock(&mm->page_table_lock);
out:
return ret;
-@@ -1552,6 +1562,7 @@
+@@ -1555,6 +1565,7 @@
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, addr, entry);
+ ioproc_update_page(vma, addr);
+ lazy_mmu_prot_update(entry);
spin_unlock(&mm->page_table_lock);
out:
- return VM_FAULT_MINOR;
-@@ -1669,6 +1680,7 @@
+@@ -1673,6 +1684,7 @@
/* no need to invalidate: a not-present page shouldn't be cached */
update_mmu_cache(vma, address, entry);
+ ioproc_update_page(vma, address);
+ lazy_mmu_prot_update(entry);
spin_unlock(&mm->page_table_lock);
out:
- return ret;
@@ -1853,6 +1865,7 @@
return ret;
return ret == len ? 0 : -1;
sg->page = virt_to_page(buf);
sg->offset = (unsigned long) buf & ~PAGE_MASK;
+#endif /* !SMALL_SCATTERLIST */
- sg_dma_len(sg) = buflen;
+ sg->length = buflen;
}
@@ -2297,8 +2302,13 @@
/* get the current page and offset */
page = nth_page(page, (offset >> PAGE_SHIFT));
@@ -2339,8 +2349,13 @@
+ next_sg:
sg = &qc->sg[qc->cursg];
- next_page:
+#if SMALL_SCATTERLIST
+ page = sg->u.page.page;
+ offset = sg->u.page.offset + qc->cursg_ofs;
--- /dev/null
+--- uml-2.4.24/arch/um/kernel/tt/ksyms.c.orig 2005-05-04 13:59:58.806659456 +0300
++++ uml-2.4.24/arch/um/kernel/tt/ksyms.c 2005-05-04 14:00:18.358687096 +0300
+@@ -12,6 +12,8 @@
+ EXPORT_SYMBOL(__do_strncpy_from_user);
+ EXPORT_SYMBOL(__do_strnlen_user);
+ EXPORT_SYMBOL(__do_clear_user);
++EXPORT_SYMBOL(clear_user_tt);
++EXPORT_SYMBOL(clear_user_skas);
+
+ EXPORT_SYMBOL(tracing_pid);
+ EXPORT_SYMBOL(honeypot);
+ }
+ if (lookup_flags & LOOKUP_DIRECTORY) {
+ err = -ENOTDIR;
-+ if(!nd->dentry->d_inode->i_op ||
-+ !nd->dentry->d_inode->i_op->lookup) {
++ if (!nd->dentry->d_inode->i_op ||
++ !nd->dentry->d_inode->i_op->lookup) {
+ path_release(nd);
+ goto return_err;
+ }
{
struct super_block *sb = mnt->mnt_sb;
dput(mnt->mnt_root);
-+ spin_lock(&dcache_lock);
-+ list_del(&mnt->mnt_lustre_list);
-+ spin_unlock(&dcache_lock);
++ spin_lock(&dcache_lock);
++ list_del(&mnt->mnt_lustre_list);
++ spin_unlock(&dcache_lock);
free_vfsmnt(mnt);
deactivate_super(sb);
}
lock_kernel();
+ if (sb->s_op->umount_lustre)
-+ sb->s_op->umount_lustre(sb);
++ sb->s_op->umount_lustre(sb);
if( (flags&MNT_FORCE) && sb->s_op->umount_begin)
sb->s_op->umount_begin(sb);
unlock_kernel();
};
/*
-@@ -46,6 +77,8 @@
- #define LOOKUP_PARENT 16
+@@ -47,6 +78,8 @@
#define LOOKUP_NOALT 32
#define LOOKUP_ATOMIC 64
-+#define LOOKUP_LAST (0x1000)
-+#define LOOKUP_LINK_NOTLAST (0x2000)
+ #define LOOKUP_REVAL 128
++#define LOOKUP_LAST (0x1000)
++#define LOOKUP_LINK_NOTLAST (0x2000)
/*
* Intent data
-@@ -55,6 +88,12 @@
+@@ -56,6 +90,12 @@
#define LOOKUP_ACCESS (0x0400)
extern int FASTCALL(__user_walk(const char __user *, unsigned, struct nameidata *));
#define user_path_walk(name,nd) \
__user_walk(name, LOOKUP_FOLLOW, nd)
#define user_path_walk_link(name,nd) \
-@@ -67,7 +106,6 @@
+@@ -68,7 +108,6 @@
extern struct dentry * lookup_one_len(const char *, struct dentry *, int);
extern struct dentry * lookup_hash(struct qstr *, struct dentry *);
};
static inline struct vfsmount *mntget(struct vfsmount *mnt)
-Index: linux-2.6.9-5.0.3.EL/kernel/exit.c
-===================================================================
---- linux-2.6.9-5.0.3.EL.orig/kernel/exit.c 2005-02-26 14:28:01.000000000 +0200
-+++ linux-2.6.9-5.0.3.EL/kernel/exit.c 2005-02-26 23:29:02.000000000 +0200
-@@ -244,6 +244,8 @@
- write_unlock_irq(&tasklist_lock);
- }
-
-+EXPORT_SYMBOL(reparent_to_init);
-+
- void __set_special_pids(pid_t session, pid_t pgrp)
- {
- struct task_struct *curr = current;
-@@ -428,6 +430,8 @@
- __exit_files(tsk);
- }
-
-+EXPORT_SYMBOL(exit_files);
-+
- static inline void __put_fs_struct(struct fs_struct *fs)
- {
- /* No need to hold fs->lock if we are killing it */
#define FSHOOK_END_USER_WALK(type, err, field) ((void)0);}
-Index: linux-2.6.5-12.1/kernel/exit.c
-===================================================================
---- linux-2.6.5-12.1.orig/kernel/exit.c 2004-05-10 12:21:56.000000000 -0400
-+++ linux-2.6.5-12.1/kernel/exit.c 2004-06-03 18:31:28.000000000 -0400
-@@ -260,6 +260,8 @@
- write_unlock_irq(&tasklist_lock);
- }
-
-+EXPORT_SYMBOL(reparent_to_init);
-+
- void __set_special_pids(pid_t session, pid_t pgrp)
- {
- struct task_struct *curr = current;
-@@ -429,6 +431,8 @@
- __exit_files(tsk);
- }
-
-+EXPORT_SYMBOL(exit_files);
-+
- static inline void __put_fs_struct(struct fs_struct *fs)
- {
- /* No need to hold fs->lock if we are killing it */
--- /dev/null
+Index: linux-2.6.12.5/fs/exec.c
+===================================================================
+--- linux-2.6.12.5.orig/fs/exec.c 2005-08-17 17:51:28.000000000 +0200
++++ linux-2.6.12.5/fs/exec.c 2005-08-17 17:51:44.000000000 +0200
+@@ -122,9 +122,10 @@
+ struct file * file;
+ struct nameidata nd;
+ int error;
++ intent_init(&nd.intent, IT_OPEN);
+
+- nd.intent.open.flags = FMODE_READ;
+- error = __user_walk(library, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd);
++ nd.intent.it_flags = FMODE_READ|FMODE_EXEC;
++ error = __user_walk_it(library, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd);
+ if (error)
+ goto out;
+
+@@ -136,7 +137,7 @@
+ if (error)
+ goto exit;
+
+- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &nd.intent);
+ error = PTR_ERR(file);
+ if (IS_ERR(file))
+ goto out;
+@@ -492,8 +493,9 @@
+ int err;
+ struct file *file;
+
+- nd.intent.open.flags = FMODE_READ;
+- err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd);
++ intent_init(&nd.intent, IT_OPEN);
++ nd.intent.it_flags = FMODE_READ|FMODE_EXEC;
++ err = path_lookup(name, LOOKUP_FOLLOW, &nd);
+ file = ERR_PTR(err);
+
+ if (!err) {
+@@ -506,7 +508,7 @@
+ err = -EACCES;
+ file = ERR_PTR(err);
+ if (!err) {
+- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &nd.intent);
+ if (!IS_ERR(file)) {
+ err = deny_write_access(file);
+ if (err) {
+Index: linux-2.6.12.5/fs/namei.c
+===================================================================
+--- linux-2.6.12.5.orig/fs/namei.c 2005-08-17 17:51:28.000000000 +0200
++++ linux-2.6.12.5/fs/namei.c 2005-08-17 17:52:57.000000000 +0200
+@@ -301,8 +301,19 @@
+ return 0;
+ }
+
++void intent_release(struct lookup_intent *it)
++{
++ if (!it)
++ return;
++ if (it->it_magic != INTENT_MAGIC)
++ return;
++ if (it->it_op_release)
++ it->it_op_release(it);
++}
++
+ void path_release(struct nameidata *nd)
+ {
++ intent_release(&nd->intent);
+ dput(nd->dentry);
+ mntput(nd->mnt);
+ }
+@@ -392,8 +403,11 @@
+ {
+ struct dentry * result;
+ struct inode *dir = parent->d_inode;
++ int counter = 0;
+
+ down(&dir->i_sem);
++again:
++ counter++;
+ /*
+ * First re-do the cached lookup just in case it was created
+ * while we waited for the directory semaphore..
+@@ -427,13 +441,16 @@
+ * Uhhuh! Nasty case: the cache was re-populated while
+ * we waited on the semaphore. Need to revalidate.
+ */
+- up(&dir->i_sem);
+ if (result->d_op && result->d_op->d_revalidate) {
+ if (!result->d_op->d_revalidate(result, nd) && !d_invalidate(result)) {
+ dput(result);
+- result = ERR_PTR(-ENOENT);
++ if (counter > 10)
++ result = ERR_PTR(-ESTALE);
++ if (!IS_ERR(result))
++ goto again;
+ }
+ }
++ up(&dir->i_sem);
+ return result;
+ }
+
+@@ -461,7 +478,9 @@
+ static inline int __vfs_follow_link(struct nameidata *nd, const char *link)
+ {
+ int res = 0;
++ struct lookup_intent it = nd->intent;
+ char *name;
++
+ if (IS_ERR(link))
+ goto fail;
+
+@@ -471,6 +490,9 @@
+ /* weird __emul_prefix() stuff did it */
+ goto out;
+ }
++ intent_init(&nd->intent, it.it_op);
++ nd->intent.it_flags = it.it_flags;
++ nd->intent.it_create_mode = it.it_create_mode;
+ res = link_path_walk(link, nd);
+ out:
+ if (nd->depth || res || nd->last_type!=LAST_NORM)
+@@ -703,6 +725,33 @@
+ return PTR_ERR(dentry);
+ }
+
++static int revalidate_special(struct nameidata *nd)
++{
++ struct dentry *dentry = nd->dentry;
++ int err, counter = 0;
++
++ revalidate_again:
++ if (!dentry->d_op || !dentry->d_op->d_revalidate)
++ return 0;
++ if (!dentry->d_op->d_revalidate(dentry, nd)) {
++ struct dentry *new;
++ if ((err = permission(dentry->d_parent->d_inode, MAY_EXEC, nd)))
++ return err;
++ new = real_lookup(dentry->d_parent, &dentry->d_name, nd);
++ if (IS_ERR(new))
++ return PTR_ERR(new);
++ d_invalidate(dentry);
++ dput(dentry);
++ nd->dentry = dentry = new;
++ counter++;
++ if (counter < 10)
++ goto revalidate_again;
++ printk("excessive revalidate_it loops\n");
++ return -ESTALE;
++ }
++ return 0;
++}
++
+ /*
+ * Name resolution.
+ * This is the basic name resolution function, turning a pathname into
+@@ -800,7 +849,11 @@
+ goto out_dput;
+
+ if (inode->i_op->follow_link) {
++ int save_flags = nd->flags;
++ nd->flags |= LOOKUP_LINK_NOTLAST;
+ err = do_follow_link(&next, nd);
++ if (!(save_flags & LOOKUP_LINK_NOTLAST))
++ nd->flags &= ~LOOKUP_LINK_NOTLAST;
+ if (err)
+ goto return_err;
+ err = -ENOENT;
+@@ -839,6 +892,23 @@
+ inode = nd->dentry->d_inode;
+ /* fallthrough */
+ case 1:
++ nd->flags |= LOOKUP_LAST;
++ err = revalidate_special(nd);
++ nd->flags &= ~LOOKUP_LAST;
++ if (!nd->dentry->d_inode)
++ err = -ENOENT;
++ if (err) {
++ path_release(nd);
++ goto return_err;
++ }
++ if (lookup_flags & LOOKUP_DIRECTORY) {
++ err = -ENOTDIR;
++ if(!nd->dentry->d_inode->i_op ||
++ !nd->dentry->d_inode->i_op->lookup) {
++ path_release(nd);
++ goto return_err;
++ }
++ }
+ goto return_reval;
+ }
+ if (nd->dentry->d_op && nd->dentry->d_op->d_hash) {
+@@ -846,7 +916,9 @@
+ if (err < 0)
+ break;
+ }
++ nd->flags |= LOOKUP_LAST;
+ err = do_lookup(nd, &this, &next);
++ nd->flags &= ~LOOKUP_LAST;
+ if (err)
+ break;
+ inode = next.dentry->d_inode;
+@@ -1097,7 +1169,7 @@
+ }
+
+ /* SMP-safe */
+-struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
++struct dentry * lookup_one_len_it(const char * name, struct dentry * base, int len, struct nameidata *nd)
+ {
+ unsigned long hash;
+ struct qstr this;
+@@ -1117,11 +1189,16 @@
+ }
+ this.hash = end_name_hash(hash);
+
+- return lookup_hash(&this, base);
++ return __lookup_hash(&this, base, nd);
+ access:
+ return ERR_PTR(-EACCES);
+ }
+
++struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
++{
++ return lookup_one_len_it(name, base, len, NULL);
++}
++
+ /*
+ * namei()
+ *
+@@ -1133,7 +1210,7 @@
+ * that namei follows links, while lnamei does not.
+ * SMP-safe
+ */
+-int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd)
++int fastcall __user_walk_it(const char __user *name, unsigned flags, struct nameidata *nd)
+ {
+ char *tmp = getname(name);
+ int err = PTR_ERR(tmp);
+@@ -1145,6 +1222,12 @@
+ return err;
+ }
+
++int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd)
++{
++ intent_init(&nd->intent, IT_LOOKUP);
++ return __user_walk_it(name, flags, nd);
++}
++
+ /*
+ * It's inline, so penalty for filesystems that don't use sticky bit is
+ * minimal.
+@@ -1426,8 +1509,8 @@
+ acc_mode |= MAY_APPEND;
+
+ /* Fill in the open() intent data */
+- nd->intent.open.flags = flag;
+- nd->intent.open.create_mode = mode;
++ nd->intent.it_flags = flag;
++ nd->intent.it_create_mode = mode;
+
+ /*
+ * The simplest case - just a plain lookup.
+@@ -1442,6 +1525,7 @@
+ /*
+ * Create - we need to know the parent.
+ */
++ nd->intent.it_op |= IT_CREAT;
+ error = path_lookup(pathname, LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE, nd);
+ if (error)
+ return error;
+@@ -1458,7 +1542,9 @@
+ dir = nd->dentry;
+ nd->flags &= ~LOOKUP_PARENT;
+ down(&dir->d_inode->i_sem);
++ nd->flags |= LOOKUP_LAST;
+ path.dentry = __lookup_hash(&nd->last, nd->dentry, nd);
++ nd->flags &= ~LOOKUP_LAST;
+ path.mnt = nd->mnt;
+
+ do_last:
+@@ -1564,7 +1650,9 @@
+ }
+ dir = nd->dentry;
+ down(&dir->d_inode->i_sem);
++ nd->flags |= LOOKUP_LAST;
+ path.dentry = __lookup_hash(&nd->last, nd->dentry, nd);
++ nd->flags &= ~LOOKUP_LAST;
+ path.mnt = nd->mnt;
+ putname(nd->last.name);
+ goto do_last;
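
The LOOKUP_LAST / LOOKUP_LINK_NOTLAST bits set around do_lookup() above let
a d_revalidate method tell whether it is looking at the final component of
the walk, and hence whether nd->intent applies to it. A hypothetical sketch
(myfs_revalidate_it() is assumed, not defined by this patch):

    static int myfs_d_revalidate(struct dentry *de, struct nameidata *nd)
    {
            if (nd && (nd->flags & LOOKUP_LAST) &&
                !(nd->flags & LOOKUP_LINK_NOTLAST))
                    /* final component: check it against the intent */
                    return myfs_revalidate_it(de, &nd->intent);
            /* intermediate component: cheap local check only */
            return 1;
    }
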
+Index: linux-2.6.12.5/fs/namespace.c
+===================================================================
+--- linux-2.6.12.5.orig/fs/namespace.c 2005-08-17 17:51:28.000000000 +0200
++++ linux-2.6.12.5/fs/namespace.c 2005-08-17 17:51:44.000000000 +0200
+@@ -62,6 +62,7 @@
+ INIT_LIST_HEAD(&mnt->mnt_mounts);
+ INIT_LIST_HEAD(&mnt->mnt_list);
+ INIT_LIST_HEAD(&mnt->mnt_fslink);
++ INIT_LIST_HEAD(&mnt->mnt_lustre_list);
+ if (name) {
+ int size = strlen(name)+1;
+ char *newname = kmalloc(size, GFP_KERNEL);
+@@ -113,6 +114,7 @@
+
+ static void detach_mnt(struct vfsmount *mnt, struct nameidata *old_nd)
+ {
++ memset(old_nd, 0, sizeof(*old_nd));
+ old_nd->dentry = mnt->mnt_mountpoint;
+ old_nd->mnt = mnt->mnt_parent;
+ mnt->mnt_parent = mnt;
+@@ -176,6 +178,9 @@
+ {
+ struct super_block *sb = mnt->mnt_sb;
+ dput(mnt->mnt_root);
++ spin_lock(&dcache_lock);
++ list_del(&mnt->mnt_lustre_list);
++ spin_unlock(&dcache_lock);
+ free_vfsmnt(mnt);
+ deactivate_super(sb);
+ }
+@@ -402,6 +407,8 @@
+ */
+
+ lock_kernel();
++ if (sb->s_op->umount_lustre)
++ sb->s_op->umount_lustre(sb);
+ if( (flags&MNT_FORCE) && sb->s_op->umount_begin)
+ sb->s_op->umount_begin(sb);
+ unlock_kernel();
+@@ -627,6 +634,7 @@
+ return err;
+ if (!old_name || !*old_name)
+ return -EINVAL;
++ intent_init(&old_nd.intent, IT_LOOKUP);
+ err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd);
+ if (err)
+ return err;
+@@ -701,6 +709,7 @@
+ return -EPERM;
+ if (!old_name || !*old_name)
+ return -EINVAL;
++ intent_init(&old_nd.intent, IT_LOOKUP);
+ err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd);
+ if (err)
+ return err;
+@@ -1012,6 +1021,7 @@
+ int retval = 0;
+ int mnt_flags = 0;
+
++ intent_init(&nd.intent, IT_LOOKUP);
+ /* Discard magic */
+ if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
+ flags &= ~MS_MGC_MSK;
+Index: linux-2.6.12.5/fs/open.c
+===================================================================
+--- linux-2.6.12.5.orig/fs/open.c 2005-08-17 17:51:28.000000000 +0200
++++ linux-2.6.12.5/fs/open.c 2005-08-17 17:51:44.000000000 +0200
+@@ -215,12 +215,12 @@
+ struct nameidata nd;
+ struct inode * inode;
+ int error;
+-
++ intent_init(&nd.intent, IT_GETATTR);
+ error = -EINVAL;
+ if (length < 0) /* sorry, but loff_t says... */
+ goto out;
+
+- error = user_path_walk(path, &nd);
++ error = user_path_walk_it(path, &nd);
+ if (error)
+ goto out;
+ inode = nd.dentry->d_inode;
+@@ -474,6 +474,7 @@
+ int old_fsuid, old_fsgid;
+ kernel_cap_t old_cap;
+ int res;
++ intent_init(&nd.intent, IT_GETATTR);
+
+ if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */
+ return -EINVAL;
+@@ -498,13 +499,14 @@
+ else
+ current->cap_effective = current->cap_permitted;
+
+- res = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd);
++ res = __user_walk_it(filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd);
+ if (!res) {
+ res = permission(nd.dentry->d_inode, mode, &nd);
+ /* SuS v2 requires we report a read only fs too */
+ if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode)
+ && !special_file(nd.dentry->d_inode->i_mode))
+ res = -EROFS;
++
+ path_release(&nd);
+ }
+
+@@ -519,8 +521,9 @@
+ {
+ struct nameidata nd;
+ int error;
++ intent_init(&nd.intent, IT_GETATTR);
+
+- error = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd);
++ error = __user_walk_it(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd);
+ if (error)
+ goto out;
+
+@@ -570,8 +573,9 @@
+ {
+ struct nameidata nd;
+ int error;
++ intent_init(&nd.intent, IT_GETATTR);
+
+- error = __user_walk(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd);
++ error = __user_walk_it(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd);
+ if (error)
+ goto out;
+
+@@ -750,27 +754,8 @@
+ * for the internal routines (ie open_namei()/follow_link() etc). 00 is
+ * used by symlinks.
+ */
+-struct file *filp_open(const char * filename, int flags, int mode)
+-{
+- int namei_flags, error;
+- struct nameidata nd;
+-
+- namei_flags = flags;
+- if ((namei_flags+1) & O_ACCMODE)
+- namei_flags++;
+- if (namei_flags & O_TRUNC)
+- namei_flags |= 2;
+-
+- error = open_namei(filename, namei_flags, mode, &nd);
+- if (!error)
+- return dentry_open(nd.dentry, nd.mnt, flags);
+-
+- return ERR_PTR(error);
+-}
+-
+-EXPORT_SYMBOL(filp_open);
+-
+-struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
++struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, int flags,
++ struct lookup_intent *it)
+ {
+ struct file * f;
+ struct inode *inode;
+@@ -782,6 +767,7 @@
+ goto cleanup_dentry;
+ f->f_flags = flags;
+ f->f_mode = ((flags+1) & O_ACCMODE) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
++ f->f_it = it;
+ inode = dentry->d_inode;
+ if (f->f_mode & FMODE_WRITE) {
+ error = get_write_access(inode);
+@@ -800,6 +786,7 @@
+ error = f->f_op->open(inode,f);
+ if (error)
+ goto cleanup_all;
++ intent_release(it);
+ }
+ f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
+
+@@ -825,6 +812,7 @@
+ cleanup_file:
+ put_filp(f);
+ cleanup_dentry:
++ intent_release(it);
+ dput(dentry);
+ mntput(mnt);
+ return ERR_PTR(error);
+@@ -832,6 +820,36 @@
+
+ EXPORT_SYMBOL(dentry_open);
+
++struct file *filp_open(const char * filename, int flags, int mode)
++{
++ int namei_flags, error;
++ struct file * temp_filp;
++ struct nameidata nd;
++ intent_init(&nd.intent, IT_OPEN);
++
++ namei_flags = flags;
++ if ((namei_flags+1) & O_ACCMODE)
++ namei_flags++;
++ if (namei_flags & O_TRUNC)
++ namei_flags |= 2;
++
++ error = open_namei(filename, namei_flags, mode, &nd);
++ if (!error) {
++ temp_filp = dentry_open_it(nd.dentry, nd.mnt, flags, &nd.intent);
++ return temp_filp;
++ }
++ return ERR_PTR(error);
++}
++
++
++struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
++{
++ struct lookup_intent it;
++ intent_init(&it, IT_LOOKUP);
++
++ return dentry_open_it(dentry, mnt, flags, &it);
++}
++
+ /*
+ * Find an empty file descriptor entry, and mark it busy.
+ */
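
Where the plumbed-through intent ends up: dentry_open_it() stores it in
f->f_it before calling the filesystem's open method, then releases it once
the open succeeds. A hypothetical consumer (myfs_open() is illustrative,
not part of the patch):

    static int myfs_open(struct inode *inode, struct file *file)
    {
            struct lookup_intent *it = file->f_it;

            /* If the intent lookup already opened the file on the
             * server, reuse that result instead of a second request. */
            if (it && it->it_magic == INTENT_MAGIC &&
                it->d.lustre.it_disposition)
                    return it->d.lustre.it_status;

            return 0;       /* plain open */
    }
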
+Index: linux-2.6.12.5/fs/stat.c
+===================================================================
+--- linux-2.6.12.5.orig/fs/stat.c 2005-08-17 17:51:28.000000000 +0200
++++ linux-2.6.12.5/fs/stat.c 2005-08-17 17:51:44.000000000 +0200
+@@ -38,7 +38,7 @@
+
+ EXPORT_SYMBOL(generic_fillattr);
+
+-int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
++int vfs_getattr_it(struct vfsmount *mnt, struct dentry *dentry, struct lookup_intent *it, struct kstat *stat)
+ {
+ struct inode *inode = dentry->d_inode;
+ int retval;
+@@ -47,6 +47,8 @@
+ if (retval)
+ return retval;
+
++ if (inode->i_op->getattr_it)
++ return inode->i_op->getattr_it(mnt, dentry, it, stat);
+ if (inode->i_op->getattr)
+ return inode->i_op->getattr(mnt, dentry, stat);
+
+@@ -63,14 +65,20 @@
+
+ EXPORT_SYMBOL(vfs_getattr);
+
++int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
++{
++ return vfs_getattr_it(mnt, dentry, NULL, stat);
++}
++
+ int vfs_stat(char __user *name, struct kstat *stat)
+ {
+ struct nameidata nd;
+ int error;
++ intent_init(&nd.intent, IT_GETATTR);
+
+- error = user_path_walk(name, &nd);
++ error = user_path_walk_it(name, &nd);
+ if (!error) {
+- error = vfs_getattr(nd.mnt, nd.dentry, stat);
++ error = vfs_getattr_it(nd.mnt, nd.dentry, &nd.intent, stat);
+ path_release(&nd);
+ }
+ return error;
+@@ -82,10 +90,11 @@
+ {
+ struct nameidata nd;
+ int error;
++ intent_init(&nd.intent, IT_GETATTR);
+
+- error = user_path_walk_link(name, &nd);
++ error = user_path_walk_link_it(name, &nd);
+ if (!error) {
+- error = vfs_getattr(nd.mnt, nd.dentry, stat);
++ error = vfs_getattr_it(nd.mnt, nd.dentry, &nd.intent, stat);
+ path_release(&nd);
+ }
+ return error;
+@@ -97,9 +106,12 @@
+ {
+ struct file *f = fget(fd);
+ int error = -EBADF;
++ struct nameidata nd;
++ intent_init(&nd.intent, IT_GETATTR);
+
+ if (f) {
+- error = vfs_getattr(f->f_vfsmnt, f->f_dentry, stat);
++ error = vfs_getattr_it(f->f_vfsmnt, f->f_dentry, &nd.intent, stat);
++ intent_release(&nd.intent);
+ fput(f);
+ }
+ return error;
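
On the filesystem side, getattr_it takes priority over getattr in
vfs_getattr_it(); a minimal sketch of an implementation (myfs_* is
hypothetical):

    static int myfs_getattr_it(struct vfsmount *mnt, struct dentry *de,
                               struct lookup_intent *it, struct kstat *stat)
    {
            /* 'it' may be NULL: vfs_getattr() passes NULL through
             * vfs_getattr_it(), so the method must tolerate that. */
            if (it) {
                    /* e.g. revalidate attributes under a lock matched
                     * against it->it_op before reporting them */
            }
            generic_fillattr(de->d_inode, stat);
            return 0;
    }
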
+Index: linux-2.6.12.5/fs/nfs/dir.c
+===================================================================
+--- linux-2.6.12.5.orig/fs/nfs/dir.c 2005-08-17 17:51:28.000000000 +0200
++++ linux-2.6.12.5/fs/nfs/dir.c 2005-08-17 17:51:44.000000000 +0200
+@@ -727,7 +727,7 @@
+ return 0;
+ if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_CREATE) == 0)
+ return 0;
+- return (nd->intent.open.flags & O_EXCL) != 0;
++ return (nd->intent.it_flags & O_EXCL) != 0;
+ }
+
+ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
+@@ -1028,7 +1028,7 @@
+ attr.ia_valid = ATTR_MODE;
+
+ if (nd && (nd->flags & LOOKUP_CREATE))
+- open_flags = nd->intent.open.flags;
++ open_flags = nd->intent.it_flags;
+
+ lock_kernel();
+ nfs_begin_data_update(dir);
+Index: linux-2.6.12.5/fs/inode.c
+===================================================================
+--- linux-2.6.12.5.orig/fs/inode.c 2005-08-17 17:51:28.000000000 +0200
++++ linux-2.6.12.5/fs/inode.c 2005-08-17 17:51:44.000000000 +0200
+@@ -230,6 +230,7 @@
+ inodes_stat.nr_unused--;
+ }
+
++EXPORT_SYMBOL(__iget);
+ /**
+ * clear_inode - clear an inode
+ * @inode: inode to clear
+Index: linux-2.6.12.5/include/linux/dcache.h
+===================================================================
+--- linux-2.6.12.5.orig/include/linux/dcache.h 2005-08-17 17:51:28.000000000 +0200
++++ linux-2.6.12.5/include/linux/dcache.h 2005-08-17 17:51:44.000000000 +0200
+@@ -4,6 +4,7 @@
+ #ifdef __KERNEL__
+
+ #include <asm/atomic.h>
++#include <linux/string.h>
+ #include <linux/list.h>
+ #include <linux/spinlock.h>
+ #include <linux/cache.h>
+@@ -37,6 +38,8 @@
+ const unsigned char *name;
+ };
+
++#include <linux/namei.h>
++
+ struct dentry_stat_t {
+ int nr_dentry;
+ int nr_unused;
+Index: linux-2.6.12.5/include/linux/fs.h
+===================================================================
+--- linux-2.6.12.5.orig/include/linux/fs.h 2005-08-17 17:51:28.000000000 +0200
++++ linux-2.6.12.5/include/linux/fs.h 2005-08-17 17:51:44.000000000 +0200
+@@ -58,6 +58,7 @@
+
+ #define FMODE_READ 1
+ #define FMODE_WRITE 2
++#define FMODE_EXEC 4
+
+ /* Internal kernel extensions */
+ #define FMODE_LSEEK 4
+@@ -260,6 +261,8 @@
+ #define ATTR_ATTR_FLAG 1024
+ #define ATTR_KILL_SUID 2048
+ #define ATTR_KILL_SGID 4096
++#define ATTR_RAW 8192 /* file system, not vfs will massage attrs */
++#define ATTR_FROM_OPEN 16384 /* called from open path, ie O_TRUNC */
+
+ /*
+ * This is the Inode Attributes structure, used for notify_change(). It
+@@ -463,6 +466,7 @@
+ struct block_device *i_bdev;
+ struct cdev *i_cdev;
+ int i_cindex;
++ void *i_filterdata;
+
+ __u32 i_generation;
+
+@@ -600,6 +604,7 @@
+ spinlock_t f_ep_lock;
+ #endif /* #ifdef CONFIG_EPOLL */
+ struct address_space *f_mapping;
++ struct lookup_intent *f_it;
+ };
+ extern spinlock_t files_lock;
+ #define file_list_lock() spin_lock(&files_lock);
+@@ -968,7 +973,9 @@
+ void (*truncate) (struct inode *);
+ int (*permission) (struct inode *, int, struct nameidata *);
+ int (*setattr) (struct dentry *, struct iattr *);
++ int (*setattr_raw) (struct inode *, struct iattr *);
+ int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
++ int (*getattr_it) (struct vfsmount *, struct dentry *, struct lookup_intent *, struct kstat *);
+ int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
+ ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
+ ssize_t (*listxattr) (struct dentry *, char *, size_t);
+@@ -1008,6 +1015,7 @@
+ int (*remount_fs) (struct super_block *, int *, char *);
+ void (*clear_inode) (struct inode *);
+ void (*umount_begin) (struct super_block *);
++ void (*umount_lustre) (struct super_block *);
+
+ int (*show_options)(struct seq_file *, struct vfsmount *);
+
+@@ -1210,6 +1218,7 @@
+ extern struct vfsmount *kern_mount(struct file_system_type *);
+ extern int may_umount_tree(struct vfsmount *);
+ extern int may_umount(struct vfsmount *);
++struct vfsmount *do_kern_mount(const char *type, int flags, const char *name, void *data);
+ extern long do_mount(char *, char *, char *, unsigned long, void *);
+
+ extern int vfs_statfs(struct super_block *, struct kstatfs *);
+@@ -1262,6 +1271,7 @@
+ extern int do_truncate(struct dentry *, loff_t start);
+ extern struct file *filp_open(const char *, int, int);
+ extern struct file * dentry_open(struct dentry *, struct vfsmount *, int);
++extern struct file * dentry_open_it(struct dentry *, struct vfsmount *, int, struct lookup_intent *);
+ extern int filp_close(struct file *, fl_owner_t id);
+ extern char * getname(const char __user *);
+
+Index: linux-2.6.12.5/include/linux/namei.h
+===================================================================
+--- linux-2.6.12.5.orig/include/linux/namei.h 2005-08-17 17:51:28.000000000 +0200
++++ linux-2.6.12.5/include/linux/namei.h 2005-08-17 17:51:44.000000000 +0200
+@@ -2,14 +2,48 @@
+ #define _LINUX_NAMEI_H
+
+ #include <linux/linkage.h>
++#include <linux/string.h>
+
+ struct vfsmount;
++struct nameidata;
+
+-struct open_intent {
+- int flags;
+- int create_mode;
++/* intent opcodes */
++#define IT_OPEN (1)
++#define IT_CREAT (1<<1)
++#define IT_READDIR (1<<2)
++#define IT_GETATTR (1<<3)
++#define IT_LOOKUP (1<<4)
++#define IT_UNLINK (1<<5)
++#define IT_TRUNC (1<<6)
++#define IT_GETXATTR (1<<7)
++
++struct lustre_intent_data {
++ int it_disposition;
++ int it_status;
++ __u64 it_lock_handle;
++ void *it_data;
++ int it_lock_mode;
+ };
+
++#define INTENT_MAGIC 0x19620323
++struct lookup_intent {
++ int it_magic;
++ void (*it_op_release)(struct lookup_intent *);
++ int it_op;
++ int it_flags;
++ int it_create_mode;
++ union {
++ struct lustre_intent_data lustre;
++ } d;
++};
++
++static inline void intent_init(struct lookup_intent *it, int op)
++{
++ memset(it, 0, sizeof(*it));
++ it->it_magic = INTENT_MAGIC;
++ it->it_op = op;
++}
++
+ enum { MAX_NESTED_LINKS = 5 };
+
+ struct nameidata {
+@@ -21,10 +55,7 @@
+ unsigned depth;
+ char *saved_names[MAX_NESTED_LINKS + 1];
+
+- /* Intent data */
+- union {
+- struct open_intent open;
+- } intent;
++ struct lookup_intent intent;
+ };
+
+ /*
+@@ -47,6 +78,8 @@
+ #define LOOKUP_PARENT 16
+ #define LOOKUP_NOALT 32
+ #define LOOKUP_REVAL 64
++#define LOOKUP_LAST (0x1000)
++#define LOOKUP_LINK_NOTLAST (0x2000)
+ /*
+ * Intent data
+ */
+@@ -55,6 +88,12 @@
+ #define LOOKUP_ACCESS (0x0400)
+
+ extern int FASTCALL(__user_walk(const char __user *, unsigned, struct nameidata *));
++extern int FASTCALL(__user_walk_it(const char __user *name, unsigned flags, struct nameidata *nd));
++#define user_path_walk_it(name,nd) \
++ __user_walk_it(name, LOOKUP_FOLLOW, nd)
++#define user_path_walk_link_it(name,nd) \
++ __user_walk_it(name, 0, nd)
++extern void intent_release(struct lookup_intent *);
+ #define user_path_walk(name,nd) \
+ __user_walk(name, LOOKUP_FOLLOW, nd)
+ #define user_path_walk_link(name,nd) \
+@@ -67,7 +106,6 @@
+
+ extern struct dentry * lookup_one_len(const char *, struct dentry *, int);
+ extern struct dentry * lookup_hash(struct qstr *, struct dentry *);
+-
+ extern int follow_down(struct vfsmount **, struct dentry **);
+ extern int follow_up(struct vfsmount **, struct dentry **);
+
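
The calling convention these declarations imply is the one used throughout
the patch: initialize the intent before the walk and let path_release()
release it. A condensed sketch:

    int example_lookup(const char *name)
    {
            struct nameidata nd;
            int err;

            intent_init(&nd.intent, IT_GETATTR); /* before path_lookup() */
            err = path_lookup(name, LOOKUP_FOLLOW, &nd);
            if (err)
                    return err;

            /* ... use nd.dentry and nd.intent ... */

            path_release(&nd); /* intent_release(), then dput/mntput */
            return 0;
    }
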
+Index: linux-2.6.12.5/include/linux/mount.h
+===================================================================
+--- linux-2.6.12.5.orig/include/linux/mount.h 2005-08-17 17:51:28.000000000 +0200
++++ linux-2.6.12.5/include/linux/mount.h 2005-08-17 17:51:44.000000000 +0200
+@@ -36,6 +36,8 @@
+ struct list_head mnt_list;
+ struct list_head mnt_fslink; /* link in fs-specific expiry list */
+ struct namespace *mnt_namespace; /* containing namespace */
++ struct list_head mnt_lustre_list; /* GNS mount list */
++ unsigned long mnt_last_used; /* for GNS auto-umount (jiffies) */
+ };
+
+ static inline struct vfsmount *mntget(struct vfsmount *mnt)
if (!IS_ERR(tmp)) {
struct dentry *dentry;
struct nameidata nd;
-+ intent_init(&nd.intent, IT_LOOKUP);
++ intent_init(&nd.intent, IT_LOOKUP);
error = path_lookup(tmp, LOOKUP_PARENT, &nd);
if (error)
char * name;
struct dentry *dentry;
struct nameidata nd;
-+ intent_init(&nd.intent, IT_LOOKUP);
++ intent_init(&nd.intent, IT_LOOKUP);
name = getname(pathname);
if(IS_ERR(name))
struct dentry *dentry;
struct nameidata nd;
struct inode *inode = NULL;
-+ intent_init(&nd.intent, IT_LOOKUP);
++ intent_init(&nd.intent, IT_LOOKUP);
name = getname(pathname);
if(IS_ERR(name))
if (!IS_ERR(to)) {
struct dentry *dentry;
struct nameidata nd;
-+ intent_init(&nd.intent, IT_LOOKUP);
++ intent_init(&nd.intent, IT_LOOKUP);
error = path_lookup(to, LOOKUP_PARENT, &nd);
if (error)
struct nameidata nd, old_nd;
int error;
char * to;
-+ intent_init(&nd.intent, IT_LOOKUP);
-+ intent_init(&old_nd.intent, IT_LOOKUP);
++ intent_init(&nd.intent, IT_LOOKUP);
++ intent_init(&old_nd.intent, IT_LOOKUP);
to = getname(newname);
if (IS_ERR(to))
error = -EXDEV;
if (old_nd.mnt != nd.mnt)
goto out_release;
-+ if (nd.dentry->d_inode->i_op->link_raw) {
-+ struct inode_operations *op = nd.dentry->d_inode->i_op;
-+ error = op->link_raw(&old_nd, &nd);
-+ /* the file system wants to use normal vfs path now */
-+ if (error != -EOPNOTSUPP)
-+ goto out_release;
-+ }
++ if (nd.dentry->d_inode->i_op->link_raw) {
++ struct inode_operations *op = nd.dentry->d_inode->i_op;
++ error = op->link_raw(&old_nd, &nd);
++ /* the file system wants to use normal vfs path now */
++ if (error != -EOPNOTSUPP)
++ goto out_release;
++ }
new_dentry = lookup_create(&nd, 0);
error = PTR_ERR(new_dentry);
if (!IS_ERR(new_dentry)) {
-@@ -2101,7 +2158,7 @@
- * locking].
- */
- int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
-- struct inode *new_dir, struct dentry *new_dentry)
-+ struct inode *new_dir, struct dentry *new_dentry)
- {
- int error = 0;
- struct inode *target;
-@@ -2146,7 +2203,7 @@
- }
-
- int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
-- struct inode *new_dir, struct dentry *new_dentry)
-+ struct inode *new_dir, struct dentry *new_dentry)
- {
- struct inode *target;
- int error;
@@ -2223,6 +2280,8 @@
struct dentry * old_dentry, *new_dentry;
struct dentry * trap;
struct nameidata oldnd, newnd;
-+ intent_init(&oldnd.intent, IT_LOOKUP);
-+ intent_init(&newnd.intent, IT_LOOKUP);
++ intent_init(&oldnd.intent, IT_LOOKUP);
++ intent_init(&newnd.intent, IT_LOOKUP);
error = path_lookup(oldname, LOOKUP_PARENT, &oldnd);
if (error)
+ if (error != -EOPNOTSUPP)
+ goto dput_and_out;
+ } else {
-+ down(&inode->i_sem);
-+ error = notify_change(nd.dentry, &newattrs);
-+ up(&inode->i_sem);
-+ }
++ down(&inode->i_sem);
++ error = notify_change(nd.dentry, &newattrs);
++ up(&inode->i_sem);
++ }
dput_and_out:
path_release(&nd);
out:
+ if (error != -EOPNOTSUPP)
+ goto dput_and_out;
+ } else {
-+ down(&inode->i_sem);
-+ error = notify_change(nd.dentry, &newattrs);
-+ up(&inode->i_sem);
-+ }
++ down(&inode->i_sem);
++ error = notify_change(nd.dentry, &newattrs);
++ up(&inode->i_sem);
++ }
dput_and_out:
path_release(&nd);
out:
--- /dev/null
+Index: linux-2.6.12.2/fs/namei.c
+===================================================================
+--- linux-2.6.12.2.orig/fs/namei.c 2005-07-23 12:25:12.241868120 +0200
++++ linux-2.6.12.2/fs/namei.c 2005-07-23 12:25:14.440533872 +0200
+@@ -1466,7 +1466,7 @@
+ if (!error) {
+ DQUOT_INIT(inode);
+
+- error = do_truncate(dentry, 0);
++ error = do_truncate(dentry, 0, 1);
+ }
+ put_write_access(inode);
+ if (error)
+@@ -1719,6 +1719,7 @@
+ char * tmp;
+ struct dentry * dentry;
+ struct nameidata nd;
++ intent_init(&nd.intent, IT_LOOKUP);
+
+ if (S_ISDIR(mode))
+ return -EPERM;
+@@ -1729,6 +1730,15 @@
+ error = path_lookup(tmp, LOOKUP_PARENT, &nd);
+ if (error)
+ goto out;
++
++ if (nd.dentry->d_inode->i_op->mknod_raw) {
++ struct inode_operations *op = nd.dentry->d_inode->i_op;
++ error = op->mknod_raw(&nd, mode, dev);
++ /* the file system wants to use normal vfs path now */
++ if (error != -EOPNOTSUPP)
++ goto out2;
++ }
++
+ dentry = lookup_create(&nd, 0);
+ error = PTR_ERR(dentry);
+
+@@ -1755,6 +1765,7 @@
+ dput(dentry);
+ }
+ up(&nd.dentry->d_inode->i_sem);
++out2:
+ path_release(&nd);
+ out:
+ putname(tmp);
+@@ -1796,10 +1807,18 @@
+ if (!IS_ERR(tmp)) {
+ struct dentry *dentry;
+ struct nameidata nd;
++ intent_init(&nd.intent, IT_LOOKUP);
+
+ error = path_lookup(tmp, LOOKUP_PARENT, &nd);
+ if (error)
+ goto out;
++ if (nd.dentry->d_inode->i_op->mkdir_raw) {
++ struct inode_operations *op = nd.dentry->d_inode->i_op;
++ error = op->mkdir_raw(&nd, mode);
++ /* the file system wants to use normal vfs path now */
++ if (error != -EOPNOTSUPP)
++ goto out2;
++ }
+ dentry = lookup_create(&nd, 1);
+ error = PTR_ERR(dentry);
+ if (!IS_ERR(dentry)) {
+@@ -1809,6 +1828,7 @@
+ dput(dentry);
+ }
+ up(&nd.dentry->d_inode->i_sem);
++out2:
+ path_release(&nd);
+ out:
+ putname(tmp);
+@@ -1885,6 +1905,7 @@
+ char * name;
+ struct dentry *dentry;
+ struct nameidata nd;
++ intent_init(&nd.intent, IT_LOOKUP);
+
+ name = getname(pathname);
+ if(IS_ERR(name))
+@@ -1905,6 +1926,16 @@
+ error = -EBUSY;
+ goto exit1;
+ }
++
++ if (nd.dentry->d_inode->i_op->rmdir_raw) {
++ struct inode_operations *op = nd.dentry->d_inode->i_op;
++
++ error = op->rmdir_raw(&nd);
++ /* the file system wants to use normal vfs path now */
++ if (error != -EOPNOTSUPP)
++ goto exit1;
++ }
++
+ down(&nd.dentry->d_inode->i_sem);
+ dentry = lookup_hash(&nd.last, nd.dentry);
+ error = PTR_ERR(dentry);
+@@ -1963,6 +1994,7 @@
+ struct dentry *dentry;
+ struct nameidata nd;
+ struct inode *inode = NULL;
++ intent_init(&nd.intent, IT_LOOKUP);
+
+ name = getname(pathname);
+ if(IS_ERR(name))
+@@ -1974,6 +2006,13 @@
+ error = -EISDIR;
+ if (nd.last_type != LAST_NORM)
+ goto exit1;
++ if (nd.dentry->d_inode->i_op->unlink_raw) {
++ struct inode_operations *op = nd.dentry->d_inode->i_op;
++ error = op->unlink_raw(&nd);
++ /* the file system wants to use normal vfs path now */
++ if (error != -EOPNOTSUPP)
++ goto exit1;
++ }
+ down(&nd.dentry->d_inode->i_sem);
+ dentry = lookup_hash(&nd.last, nd.dentry);
+ error = PTR_ERR(dentry);
+@@ -2040,10 +2079,18 @@
+ if (!IS_ERR(to)) {
+ struct dentry *dentry;
+ struct nameidata nd;
++ intent_init(&nd.intent, IT_LOOKUP);
+
+ error = path_lookup(to, LOOKUP_PARENT, &nd);
+ if (error)
+ goto out;
++ if (nd.dentry->d_inode->i_op->symlink_raw) {
++ struct inode_operations *op = nd.dentry->d_inode->i_op;
++ error = op->symlink_raw(&nd, from);
++ /* the file system wants to use normal vfs path now */
++ if (error != -EOPNOTSUPP)
++ goto out2;
++ }
+ dentry = lookup_create(&nd, 0);
+ error = PTR_ERR(dentry);
+ if (!IS_ERR(dentry)) {
+@@ -2051,6 +2098,7 @@
+ dput(dentry);
+ }
+ up(&nd.dentry->d_inode->i_sem);
++out2:
+ path_release(&nd);
+ out:
+ putname(to);
+@@ -2114,6 +2162,8 @@
+ struct nameidata nd, old_nd;
+ int error;
+ char * to;
++ intent_init(&nd.intent, IT_LOOKUP);
++ intent_init(&old_nd.intent, IT_LOOKUP);
+
+ to = getname(newname);
+ if (IS_ERR(to))
+@@ -2128,6 +2178,13 @@
+ error = -EXDEV;
+ if (old_nd.mnt != nd.mnt)
+ goto out_release;
++ if (nd.dentry->d_inode->i_op->link_raw) {
++ struct inode_operations *op = nd.dentry->d_inode->i_op;
++ error = op->link_raw(&old_nd, &nd);
++ /* the file system wants to use normal vfs path now */
++ if (error != -EOPNOTSUPP)
++ goto out_release;
++ }
+ new_dentry = lookup_create(&nd, 0);
+ error = PTR_ERR(new_dentry);
+ if (!IS_ERR(new_dentry)) {
+@@ -2300,6 +2357,8 @@
+ struct dentry * old_dentry, *new_dentry;
+ struct dentry * trap;
+ struct nameidata oldnd, newnd;
++ intent_init(&oldnd.intent, IT_LOOKUP);
++ intent_init(&newnd.intent, IT_LOOKUP);
+
+ error = path_lookup(oldname, LOOKUP_PARENT, &oldnd);
+ if (error)
+@@ -2322,6 +2381,13 @@
+ if (newnd.last_type != LAST_NORM)
+ goto exit2;
+
++ if (old_dir->d_inode->i_op->rename_raw) {
++ error = old_dir->d_inode->i_op->rename_raw(&oldnd, &newnd);
++ /* the file system wants to use normal vfs path now */
++ if (error != -EOPNOTSUPP)
++ goto exit2;
++ }
++
+ trap = lock_rename(new_dir, old_dir);
+
+ old_dentry = lookup_hash(&oldnd.last, old_dir);
+@@ -2353,8 +2419,7 @@
+ if (new_dentry == trap)
+ goto exit5;
+
+- error = vfs_rename(old_dir->d_inode, old_dentry,
+- new_dir->d_inode, new_dentry);
++ error = vfs_rename(old_dir->d_inode, old_dentry, new_dir->d_inode, new_dentry);
+ exit5:
+ dput(new_dentry);
+ exit4:
+Index: linux-2.6.12.2/fs/open.c
+===================================================================
+--- linux-2.6.12.2.orig/fs/open.c 2005-07-23 12:25:12.248867056 +0200
++++ linux-2.6.12.2/fs/open.c 2005-07-23 12:28:13.221355056 +0200
+@@ -192,9 +192,10 @@
+ return error;
+ }
+
+-int do_truncate(struct dentry *dentry, loff_t length)
++int do_truncate(struct dentry *dentry, loff_t length, int called_from_open)
+ {
+ int err;
++ struct inode_operations *op = dentry->d_inode->i_op;
+ struct iattr newattrs;
+
+ /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */
+@@ -205,7 +206,16 @@
+ newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
+
+ down(&dentry->d_inode->i_sem);
+- err = notify_change(dentry, &newattrs);
++ if (called_from_open)
++ newattrs.ia_valid |= ATTR_FROM_OPEN;
++ if (op->setattr_raw) {
++ newattrs.ia_valid |= ATTR_RAW;
++ newattrs.ia_ctime = CURRENT_TIME;
++ down_write(&dentry->d_inode->i_alloc_sem);
++ err = op->setattr_raw(dentry->d_inode, &newattrs);
++ up_write(&dentry->d_inode->i_alloc_sem);
++ } else
++ err = notify_change(dentry, &newattrs);
+ up(&dentry->d_inode->i_sem);
+ return err;
+ }
+@@ -260,7 +270,7 @@
+ error = locks_verify_truncate(inode, NULL, length);
+ if (!error) {
+ DQUOT_INIT(inode);
+- error = do_truncate(nd.dentry, length);
++ error = do_truncate(nd.dentry, length, 0);
+ }
+ put_write_access(inode);
+
+@@ -312,7 +322,7 @@
+
+ error = locks_verify_truncate(inode, file, length);
+ if (!error)
+- error = do_truncate(dentry, length);
++ error = do_truncate(dentry, length, 0);
+ out_putf:
+ fput(file);
+ out:
+@@ -391,9 +401,19 @@
+ (error = permission(inode,MAY_WRITE,&nd)) != 0)
+ goto dput_and_out;
+ }
+- down(&inode->i_sem);
+- error = notify_change(nd.dentry, &newattrs);
+- up(&inode->i_sem);
++ if (inode->i_op->setattr_raw) {
++ struct inode_operations *op = nd.dentry->d_inode->i_op;
++
++ newattrs.ia_valid |= ATTR_RAW;
++ error = op->setattr_raw(inode, &newattrs);
++ /* the file system wants to use normal vfs path now */
++ if (error != -EOPNOTSUPP)
++ goto dput_and_out;
++ } else {
++ down(&inode->i_sem);
++ error = notify_change(nd.dentry, &newattrs);
++ up(&inode->i_sem);
++ }
+ dput_and_out:
+ path_release(&nd);
+ out:
+@@ -444,9 +464,19 @@
+ (error = permission(inode,MAY_WRITE,&nd)) != 0)
+ goto dput_and_out;
+ }
+- down(&inode->i_sem);
+- error = notify_change(nd.dentry, &newattrs);
+- up(&inode->i_sem);
++ if (inode->i_op->setattr_raw) {
++ struct inode_operations *op = nd.dentry->d_inode->i_op;
++
++ newattrs.ia_valid |= ATTR_RAW;
++ error = op->setattr_raw(inode, &newattrs);
++ /* the file system wants to use normal vfs path now */
++ if (error != -EOPNOTSUPP)
++ goto dput_and_out;
++ } else {
++ down(&inode->i_sem);
++ error = notify_change(nd.dentry, &newattrs);
++ up(&inode->i_sem);
++ }
+ dput_and_out:
+ path_release(&nd);
+ out:
+@@ -596,36 +626,52 @@
+ return error;
+ }
+
+-asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
++int chmod_common(struct dentry *dentry, mode_t mode)
+ {
+- struct inode * inode;
+- struct dentry * dentry;
+- struct file * file;
+- int err = -EBADF;
++ struct inode * inode = dentry->d_inode;
+ struct iattr newattrs;
++ int error = -EROFS;
+
+- file = fget(fd);
+- if (!file)
++ if (IS_RDONLY(inode))
+ goto out;
++
++ if (inode->i_op->setattr_raw) {
++ struct inode_operations *op = dentry->d_inode->i_op;
++
++ newattrs.ia_mode = mode;
++ newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
++ newattrs.ia_valid |= ATTR_RAW;
++ error = op->setattr_raw(inode, &newattrs);
++ /* the file system wants to use normal vfs path now */
++ if (error != -EOPNOTSUPP)
++ goto out;
++ }
+
+- dentry = file->f_dentry;
+- inode = dentry->d_inode;
+-
+- err = -EROFS;
+- if (IS_RDONLY(inode))
+- goto out_putf;
+- err = -EPERM;
++ error = -EPERM;
+ if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+- goto out_putf;
++ goto out;
++
+ down(&inode->i_sem);
+ if (mode == (mode_t) -1)
+ mode = inode->i_mode;
+ newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
+ newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
+- err = notify_change(dentry, &newattrs);
++ error = notify_change(dentry, &newattrs);
+ up(&inode->i_sem);
++out:
++ return error;
++}
+
+-out_putf:
++asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
++{
++ struct file * file;
++ int err = -EBADF;
++
++ file = fget(fd);
++ if (!file)
++ goto out;
++
++ err = chmod_common(file->f_dentry, mode);
+ fput(file);
+ out:
+ return err;
+@@ -634,32 +680,13 @@
+ asmlinkage long sys_chmod(const char __user * filename, mode_t mode)
+ {
+ struct nameidata nd;
+- struct inode * inode;
+ int error;
+- struct iattr newattrs;
+
+ error = user_path_walk(filename, &nd);
+ if (error)
+ goto out;
+- inode = nd.dentry->d_inode;
+-
+- error = -EROFS;
+- if (IS_RDONLY(inode))
+- goto dput_and_out;
+-
+- error = -EPERM;
+- if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+- goto dput_and_out;
+-
+- down(&inode->i_sem);
+- if (mode == (mode_t) -1)
+- mode = inode->i_mode;
+- newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
+- newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
+- error = notify_change(nd.dentry, &newattrs);
+- up(&inode->i_sem);
+
+-dput_and_out:
++ error = chmod_common(nd.dentry, mode);
+ path_release(&nd);
+ out:
+ return error;
+@@ -680,6 +707,18 @@
+ if (IS_RDONLY(inode))
+ goto out;
+ error = -EPERM;
++ if (inode->i_op->setattr_raw) {
++ struct inode_operations *op = dentry->d_inode->i_op;
++
++ newattrs.ia_uid = user;
++ newattrs.ia_gid = group;
++ newattrs.ia_valid = ATTR_UID | ATTR_GID | ATTR_CTIME;
++ newattrs.ia_valid |= ATTR_RAW;
++ error = op->setattr_raw(inode, &newattrs);
++ /* the file system wants to use normal vfs path now */
++ if (error != -EOPNOTSUPP)
++ return error;
++ }
+ if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+ goto out;
+ newattrs.ia_valid = ATTR_CTIME;
+@@ -693,6 +732,7 @@
+ }
+ if (!S_ISDIR(inode->i_mode))
+ newattrs.ia_valid |= ATTR_KILL_SUID|ATTR_KILL_SGID;
++
+ down(&inode->i_sem);
+ error = notify_change(dentry, &newattrs);
+ up(&inode->i_sem);
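
A setattr_raw handler sees the extra ia_valid bits defined in this series; a
hedged sketch of how one might key off them (myfs_* is hypothetical):

    static int myfs_setattr_raw(struct inode *inode, struct iattr *attr)
    {
            if (!(attr->ia_valid & ATTR_RAW))
                    return -EOPNOTSUPP; /* fall back to notify_change() */

            if (attr->ia_valid & ATTR_FROM_OPEN) {
                    /* O_TRUNC from the open path (see do_truncate()):
                     * the open may already hold the relevant lock */
            }

            /* ... apply the change on the server, massage attrs ... */
            return 0;
    }
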
+Index: linux-2.6.12.2/fs/exec.c
+===================================================================
+--- linux-2.6.12.2.orig/fs/exec.c 2005-07-23 12:25:12.229869944 +0200
++++ linux-2.6.12.2/fs/exec.c 2005-07-23 12:25:14.442533568 +0200
+@@ -1488,7 +1488,7 @@
+ goto close_fail;
+ if (!file->f_op->write)
+ goto close_fail;
+- if (do_truncate(file->f_dentry, 0) != 0)
++ if (do_truncate(file->f_dentry, 0, 0) != 0)
+ goto close_fail;
+
+ retval = binfmt->core_dump(signr, regs, file);
+Index: linux-2.6.12.2/include/linux/fs.h
+===================================================================
+--- linux-2.6.12.2.orig/include/linux/fs.h 2005-07-23 12:25:12.279862344 +0200
++++ linux-2.6.12.2/include/linux/fs.h 2005-07-23 12:25:14.443533416 +0200
+@@ -960,13 +960,20 @@
+ int (*create) (struct inode *,struct dentry *,int, struct nameidata *);
+ struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *);
+ int (*link) (struct dentry *,struct inode *,struct dentry *);
++ int (*link_raw) (struct nameidata *,struct nameidata *);
+ int (*unlink) (struct inode *,struct dentry *);
++ int (*unlink_raw) (struct nameidata *);
+ int (*symlink) (struct inode *,struct dentry *,const char *);
++ int (*symlink_raw) (struct nameidata *,const char *);
+ int (*mkdir) (struct inode *,struct dentry *,int);
++ int (*mkdir_raw) (struct nameidata *,int);
+ int (*rmdir) (struct inode *,struct dentry *);
++ int (*rmdir_raw) (struct nameidata *);
+ int (*mknod) (struct inode *,struct dentry *,int,dev_t);
++ int (*mknod_raw) (struct nameidata *,int,dev_t);
+ int (*rename) (struct inode *, struct dentry *,
+ struct inode *, struct dentry *);
++ int (*rename_raw) (struct nameidata *, struct nameidata *);
+ int (*readlink) (struct dentry *, char __user *,int);
+ int (*follow_link) (struct dentry *, struct nameidata *);
+ void (*put_link) (struct dentry *, struct nameidata *);
+@@ -1268,7 +1275,7 @@
+
+ /* fs/open.c */
+
+-extern int do_truncate(struct dentry *, loff_t start);
++extern int do_truncate(struct dentry *, loff_t start, int called_from_open);
+ extern struct file *filp_open(const char *, int, int);
+ extern struct file * dentry_open(struct dentry *, struct vfsmount *, int);
+ extern struct file * dentry_open_it(struct dentry *, struct vfsmount *, int, struct lookup_intent *);
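
For each directory-modifying method, the hunk above adds a parallel *_raw variant that receives the nameidata (the resolved path plus lookup intent) instead of a locked parent inode and dentry, so an intent-aware filesystem can execute the whole operation in one server round trip. A hedged sketch of how such a filesystem might wire both entry points, assuming the my_fs_* handlers exist (the names are illustrative):

/* Sketch only: registering raw methods next to the standard ones. */
static struct inode_operations my_fs_dir_inode_ops = {
	.mkdir      = my_fs_mkdir,	/* classic VFS path */
	.mkdir_raw  = my_fs_mkdir_raw,	/* one-RPC path, takes nameidata */
	.unlink     = my_fs_unlink,
	.unlink_raw = my_fs_unlink_raw,
};
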
+Index: linux-2.6.12.2/net/unix/af_unix.c
+===================================================================
+--- linux-2.6.12.2.orig/net/unix/af_unix.c 2005-06-30 01:00:53.000000000 +0200
++++ linux-2.6.12.2/net/unix/af_unix.c 2005-07-23 12:25:14.445533112 +0200
+@@ -673,6 +673,7 @@
+ int err = 0;
+
+ if (sunname->sun_path[0]) {
++ intent_init(&nd.intent, IT_LOOKUP);
+ err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd);
+ if (err)
+ goto fail;
--- /dev/null
+Index: linux-2.6.7-vanilla/fs/dcache.c
+===================================================================
+--- linux-2.6.7-vanilla.orig/fs/dcache.c 2004-07-01 12:09:19.000000000 +0300
++++ linux-2.6.7-vanilla/fs/dcache.c 2004-07-01 12:29:12.510193264 +0300
+@@ -219,7 +219,14 @@
+ spin_unlock(&dcache_lock);
+ return 0;
+ }
+- /*
++
++ /* network invalidation by Lustre */
++ if (dentry->d_flags & DCACHE_LUSTRE_INVALID) {
++ spin_unlock(&dcache_lock);
++ return 0;
++ }
++
++ /*
+ * Check whether to do a partial shrink_dcache
+ * to get rid of unused child entries.
+ */
+@@ -1199,16 +1199,25 @@
+ * Adds a dentry to the hash according to its name.
+ */
+
+-void d_rehash(struct dentry * entry)
++void d_rehash_cond(struct dentry * entry, int lock)
+ {
+ struct hlist_head *list = d_hash(entry->d_parent, entry->d_name.hash);
+
+- spin_lock(&dcache_lock);
++ if (lock)
++ spin_lock(&dcache_lock);
+ spin_lock(&entry->d_lock);
+ __d_rehash(entry, list);
+ spin_unlock(&entry->d_lock);
+- spin_unlock(&dcache_lock);
++ if (lock)
++ spin_unlock(&dcache_lock);
+ }
+
++EXPORT_SYMBOL(d_rehash_cond);
++
++void d_rehash(struct dentry * entry)
++{
++ d_rehash_cond(entry, 1);
++}
++
+ #define do_switch(x,y) do { \
+ __typeof__ (x) __tmp = x; \
+ x = y; y = __tmp; } while (0)
+Index: linux-2.6.7-vanilla/include/linux/dcache.h
+===================================================================
+--- linux-2.6.7-vanilla.orig/include/linux/dcache.h 2004-07-01 12:24:53.602553208 +0300
++++ linux-2.6.7-vanilla/include/linux/dcache.h 2004-07-01 12:27:29.757814000 +0300
+@@ -159,6 +159,8 @@
+
+ #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. */
+ #define DCACHE_UNHASHED 0x0010
++#define DCACHE_LUSTRE_INVALID 0x0020 /* Lustre invalidated */
++
+
+ extern spinlock_t dcache_lock;
+
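
DCACHE_LUSTRE_INVALID lets a server-driven callback mark a dentry stale without unhashing it, and the shrink path patched above then treats it as freeable; d_rehash_cond() is the companion primitive for callers that already hold dcache_lock. A sketch of client-side revalidation under these hooks (my_fs_reuse_dentry is an illustrative name):

/* Sketch only: clear the invalid flag and rehash while the caller
 * already holds dcache_lock. */
static void my_fs_reuse_dentry(struct dentry *de)
{
	spin_lock(&dcache_lock);
	spin_lock(&de->d_lock);
	de->d_flags &= ~DCACHE_LUSTRE_INVALID;
	spin_unlock(&de->d_lock);
	if (d_unhashed(de))
		d_rehash_cond(de, 0);	/* 0: dcache_lock already held */
	spin_unlock(&dcache_lock);
}
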
uml-2.6.10-fc3.patch
lustre_version.patch
+fc3_to_rhel4_updates.patch
vfs_intent-2.6-rhel4.patch
vfs_nointent-2.6-rhel4.patch
vfs_races-2.6-fc3.patch
ext3-patch-fuzz-fixup-fc3.patch
uml-exprt-clearuser.patch
fsprivate-2.6.patch
+linux-2.6.9-ext3-sub-second-timestamp.patch
dev_read_only-2.6-suse.patch
export-2.6-suse.patch
lookup_bdev_init_intent.patch
-8kstack-2.6-rhel4.patch
remove-suid-2.6-suse.patch
export-show_task-2.6-vanilla.patch
sd_iostats-2.6-rhel4.patch
fsprivate-2.6.patch
export_symbol_numa.patch
qsnet-rhel4-2.6.patch
+linux-2.6-binutils-2.16.patch
+compile-fixes-2.6.9-rhel4-22.patch
uml-exprt-clearuser.patch
qsnet-suse-2.6.patch
fsprivate-2.6.patch
+dcache-qstr-api-fix-2.6-suse.patch
--- /dev/null
+lustre_version.patch
+vfs_intent-2.6.12.patch
+vfs_nointent-2.6.12.patch
+vfs_races-2.6.12.patch
+ext3-wantedi-misc-2.6-suse.patch
+jbd-2.6.10-jcberr.patch
+nfs-cifs-intent-2.6.12.patch
+iopen-misc-2.6.12.patch
+export-truncate-2.6-suse.patch
+export_symbols-2.6.12.patch
+dev_read_only-2.6-suse.patch
+export-2.6-suse.patch
+lookup_bdev_init_intent.patch
+8kstack-2.6.12.patch
+remove-suid-2.6-suse.patch
+export-show_task-2.6-vanilla.patch
+sd_iostats-2.6-rhel4.patch
+fsprivate-2.6.patch
+export_symbol_numa.patch
ext3-include-fixes-2.6-rhel4.patch
ext3-extents-2.6.9-rhel4.patch
ext3-mballoc2-2.6.9-rhel4.patch
-ext3-nlinks-2.6.7.patch
-ext3-htree-dot-2.6.patch
+ext3-nlinks-2.6.9.patch
ext3-ialloc-2.6.patch
--- /dev/null
+ext3-wantedi-2.6-rhel4.patch
+ext3-san-jdike-2.6-suse.patch
+iopen-2.6.12.patch
+ext3-map_inode_page-2.6-suse.patch
+export-ext3-2.6-rhel4.patch
+ext3-include-fixes-2.6-rhel4.patch
+ext3-extents-2.6.12.patch
+ext3-mballoc2-2.6.12.patch
+ext3-nlinks-2.6.9.patch
+ext3-ialloc-2.6.patch
+ext3-remove-cond_resched-calls-2.6.12.patch
+ext3-htree-dot-2.6.patch
+ext3-external-journal-2.6.12.patch
iopen-2.4.21-chaos.patch
tcp-zero-copy-2.4.21-chaos.patch
jbd-dont-account-blocks-twice.patch
-jbd-commit-tricks.patch
+jbd-commit-tricks-rhel3.patch
ext3-o_direct-2.4.21-chaos.patch
ext3-no-write-super-chaos.patch
add_page_private.patch
statfs64-cast-unsigned-2.4-rhel.patch
fsprivate-2.4.patch
nfsd_iallocsem.patch
-linux-2.4.24-jbd-handle-EIO.patch
+linux-2.4.24-jbd-handle-EIO-rhel3.patch
lnxmaj="2.6.9"
-lnxrel="5.0.5.EL"
+lnxrel="22.EL"
KERNEL=linux-${lnxmaj}-${lnxrel}.tar.bz2
SERIES=2.6-rhel4.series
lnxmaj="2.4.21"
-lnxrel="32.0.1.EL"
+lnxrel="37.EL"
KERNEL=linux-${lnxmaj}-${lnxrel}.tar.bz2
SERIES=rhel-2.4.21
default: all
-MODULES := ldiskfs #quotafmt_test
+MODULES := ldiskfs
+
+@QUOTA_TRUE@MODULES += quotafmt_test
# copy makefile over to not break patches
ext3_extra := $(wildcard @LINUX@/fs/ext3/Makefile)
linux_headers := $(wildcard @LINUX@/include/linux/ext3*.h)
ext3_sources := $(filter-out %.mod.c,$(wildcard @LINUX@/fs/ext3/*.c))
-new_sources := iopen.c iopen.h extents.c mballoc.c proc.c
+new_sources := iopen.c iopen.h extents.c mballoc.c
new_headers := ext3_extents.h
-#quotafmt_sources := lustre_quota_fmt.c
-#quotafmt_headers := lustre_quota_fmt.h
ldiskfs_patched_sources := $(notdir $(ext3_sources) $(ext3_headers)) $(new_sources) $(new_headers)
-ldiskfs_sources := $(ldiskfs_patched_sources) #$(quotafmt_sources) $(quotafmt_headers)
+ldiskfs_sources := $(ldiskfs_patched_sources)
+
+quotafmt_sources := lustre_quota_fmt.c
+quotafmt_headers := lustre_quota_fmt.h
+@QUOTA_TRUE@ldiskfs_sources += $(quotafmt_sources) $(quotafmt_headers)
ldiskfs-objs := $(filter %.o,$(ldiskfs_sources:.c=.o))
-#quotafmt-objs := quotafmt_test.o
+
+@QUOTA_TRUE@quotafmt-objs := quotafmt_test.o
EXTRA_PRE_CFLAGS := -I@LINUX@/fs -I@LUSTRE@ -I@LUSTRE@/ldiskfs
* linux/fs/quota_v2.c
*/
-
#ifndef EXPORT_SYMTAB
# define EXPORT_SYMTAB
#endif
#define GETIDINDEX(id, depth) (((id) >> ((LUSTRE_DQTREEDEPTH-(depth)-1)*8)) & 0xff)
#define GETENTRIES(buf) ((struct lustre_disk_dqblk *)(((char *)buf)+sizeof(struct lustre_disk_dqdbheader)))
+static int check_quota_file(struct file *f, struct inode *inode, int type)
+{
+ struct lustre_disk_dqheader dqhead;
+ mm_segment_t fs;
+ ssize_t size;
+ loff_t offset = 0;
+ static const uint quota_magics[] = LUSTRE_INITQMAGICS;
+ static const uint quota_versions[] = LUSTRE_INITQVERSIONS;
+
+ if (f) {
+ fs = get_fs();
+ set_fs(KERNEL_DS);
+ size = f->f_op->read(f, (char *)&dqhead,
+ sizeof(struct lustre_disk_dqheader),
+ &offset);
+ set_fs(fs);
+ } else {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,12)
+ size = 0;
+#else
+ struct super_block *sb = inode->i_sb;
+ size = sb->s_op->quota_read(sb, type, (char *)&dqhead,
+ sizeof(struct lustre_disk_dqheader), 0);
+#endif
+ }
+ if (size != sizeof(struct lustre_disk_dqheader))
+ return 0;
+ if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type] ||
+ le32_to_cpu(dqhead.dqh_version) != quota_versions[type])
+ return 0;
+ return 1;
+}
+
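
check_quota_file() above, like the read_blk()/write_blk() helpers further down, relies on the get_fs()/set_fs(KERNEL_DS) idiom: f_op->read() expects a user-space buffer, and widening the address limit lets it accept a kernel pointer. The pattern distilled into a sketch, not an exported helper:

/* Sketch only: reading into a kernel buffer through f_op->read(). */
static ssize_t kernel_read_at(struct file *f, void *buf, size_t len,
                              loff_t pos)
{
	mm_segment_t old_fs = get_fs();
	ssize_t ret;

	set_fs(KERNEL_DS);	/* permit kernel-space buffers */
	ret = f->f_op->read(f, (char *)buf, len, &pos);
	set_fs(old_fs);		/* always restore the old limit */
	return ret;
}
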
/* Check whether given file is really lustre admin quotafile */
int lustre_check_quota_file(struct lustre_quota_info *lqi, int type)
{
- struct lustre_disk_dqheader dqhead;
- struct file *f = lqi->qi_files[type];
- mm_segment_t fs;
- ssize_t size;
- loff_t offset = 0;
- static const uint quota_magics[] = LUSTRE_INITQMAGICS;
- static const uint quota_versions[] = LUSTRE_INITQVERSIONS;
-
- fs = get_fs();
- set_fs(KERNEL_DS);
- size = f->f_op->read(f, (char *)&dqhead, sizeof(struct lustre_disk_dqheader), &offset);
- set_fs(fs);
- if (size != sizeof(struct lustre_disk_dqheader))
- return 0;
- if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type] ||
- le32_to_cpu(dqhead.dqh_version) != quota_versions[type])
- return 0;
- return 1;
+ struct file *f = lqi->qi_files[type];
+ return check_quota_file(f, NULL, type);
}
/* Read information header from quota file */
int lustre_read_quota_info(struct lustre_quota_info *lqi, int type)
{
- mm_segment_t fs;
- struct lustre_disk_dqinfo dinfo;
- struct lustre_mem_dqinfo *info = &lqi->qi_info[type];
- struct file *f = lqi->qi_files[type];
- ssize_t size;
- loff_t offset = LUSTRE_DQINFOOFF;
-
- fs = get_fs();
- set_fs(KERNEL_DS);
- size = f->f_op->read(f, (char *)&dinfo, sizeof(struct lustre_disk_dqinfo), &offset);
- set_fs(fs);
- if (size != sizeof(struct lustre_disk_dqinfo)) {
- printk(KERN_WARNING "Can't read info structure on device %s.\n",
- f->f_vfsmnt->mnt_sb->s_id);
- return -1;
- }
- info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
- info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
- info->dqi_flags = le32_to_cpu(dinfo.dqi_flags);
- info->dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
- info->dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
- info->dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
- return 0;
+ mm_segment_t fs;
+ struct lustre_disk_dqinfo dinfo;
+ struct lustre_mem_dqinfo *info = &lqi->qi_info[type];
+ struct file *f = lqi->qi_files[type];
+ ssize_t size;
+ loff_t offset = LUSTRE_DQINFOOFF;
+
+ fs = get_fs();
+ set_fs(KERNEL_DS);
+ size = f->f_op->read(f, (char *)&dinfo,
+ sizeof(struct lustre_disk_dqinfo), &offset);
+ set_fs(fs);
+ if (size != sizeof(struct lustre_disk_dqinfo)) {
+ printk(KERN_WARNING "Can't read info structure on device %s.\n",
+ f->f_vfsmnt->mnt_sb->s_id);
+ return -1;
+ }
+ info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
+ info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
+ info->dqi_flags = le32_to_cpu(dinfo.dqi_flags);
+ info->dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
+ info->dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
+ info->dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
+ return 0;
}
/* Write information header to quota file */
int lustre_write_quota_info(struct lustre_quota_info *lqi, int type)
{
- mm_segment_t fs;
- struct lustre_disk_dqinfo dinfo;
- struct lustre_mem_dqinfo *info = &lqi->qi_info[type];
- struct file *f = lqi->qi_files[type];
- ssize_t size;
- loff_t offset = LUSTRE_DQINFOOFF;
-
- info->dqi_flags &= ~DQF_INFO_DIRTY;
- dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace);
- dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
- dinfo.dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK);
- dinfo.dqi_blocks = cpu_to_le32(info->dqi_blocks);
- dinfo.dqi_free_blk = cpu_to_le32(info->dqi_free_blk);
- dinfo.dqi_free_entry = cpu_to_le32(info->dqi_free_entry);
- fs = get_fs();
- set_fs(KERNEL_DS);
- size = f->f_op->write(f, (char *)&dinfo, sizeof(struct lustre_disk_dqinfo), &offset);
- set_fs(fs);
- if (size != sizeof(struct lustre_disk_dqinfo)) {
- printk(KERN_WARNING "Can't write info structure on device %s.\n",
- f->f_vfsmnt->mnt_sb->s_id);
- return -1;
- }
- return 0;
+ mm_segment_t fs;
+ struct lustre_disk_dqinfo dinfo;
+ struct lustre_mem_dqinfo *info = &lqi->qi_info[type];
+ struct file *f = lqi->qi_files[type];
+ ssize_t size;
+ loff_t offset = LUSTRE_DQINFOOFF;
+
+ info->dqi_flags &= ~DQF_INFO_DIRTY;
+ dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace);
+ dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
+ dinfo.dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK);
+ dinfo.dqi_blocks = cpu_to_le32(info->dqi_blocks);
+ dinfo.dqi_free_blk = cpu_to_le32(info->dqi_free_blk);
+ dinfo.dqi_free_entry = cpu_to_le32(info->dqi_free_entry);
+ fs = get_fs();
+ set_fs(KERNEL_DS);
+ size = f->f_op->write(f, (char *)&dinfo,
+ sizeof(struct lustre_disk_dqinfo), &offset);
+ set_fs(fs);
+ if (size != sizeof(struct lustre_disk_dqinfo)) {
+ printk(KERN_WARNING
+ "Can't write info structure on device %s.\n",
+ f->f_vfsmnt->mnt_sb->s_id);
+ return -1;
+ }
+ return 0;
}
static void disk2memdqb(struct mem_dqblk *m, struct lustre_disk_dqblk *d)
{
- m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit);
- m->dqb_isoftlimit = le32_to_cpu(d->dqb_isoftlimit);
- m->dqb_curinodes = le32_to_cpu(d->dqb_curinodes);
- m->dqb_itime = le64_to_cpu(d->dqb_itime);
- m->dqb_bhardlimit = le32_to_cpu(d->dqb_bhardlimit);
- m->dqb_bsoftlimit = le32_to_cpu(d->dqb_bsoftlimit);
- m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
- m->dqb_btime = le64_to_cpu(d->dqb_btime);
+ m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit);
+ m->dqb_isoftlimit = le32_to_cpu(d->dqb_isoftlimit);
+ m->dqb_curinodes = le32_to_cpu(d->dqb_curinodes);
+ m->dqb_itime = le64_to_cpu(d->dqb_itime);
+ m->dqb_bhardlimit = le32_to_cpu(d->dqb_bhardlimit);
+ m->dqb_bsoftlimit = le32_to_cpu(d->dqb_bsoftlimit);
+ m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
+ m->dqb_btime = le64_to_cpu(d->dqb_btime);
}
-static void mem2diskdqb(struct lustre_disk_dqblk *d, struct mem_dqblk *m, qid_t id)
+static void mem2diskdqb(struct lustre_disk_dqblk *d, struct mem_dqblk *m,
+ qid_t id)
{
- d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit);
- d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit);
- d->dqb_curinodes = cpu_to_le32(m->dqb_curinodes);
- d->dqb_itime = cpu_to_le64(m->dqb_itime);
- d->dqb_bhardlimit = cpu_to_le32(m->dqb_bhardlimit);
- d->dqb_bsoftlimit = cpu_to_le32(m->dqb_bsoftlimit);
- d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
- d->dqb_btime = cpu_to_le64(m->dqb_btime);
- d->dqb_id = cpu_to_le32(id);
+ d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit);
+ d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit);
+ d->dqb_curinodes = cpu_to_le32(m->dqb_curinodes);
+ d->dqb_itime = cpu_to_le64(m->dqb_itime);
+ d->dqb_bhardlimit = cpu_to_le32(m->dqb_bhardlimit);
+ d->dqb_bsoftlimit = cpu_to_le32(m->dqb_bsoftlimit);
+ d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
+ d->dqb_btime = cpu_to_le64(m->dqb_btime);
+ d->dqb_id = cpu_to_le32(id);
}
static dqbuf_t getdqbuf(void)
{
- dqbuf_t buf = kmalloc(LUSTRE_DQBLKSIZE, GFP_NOFS);
- if (!buf)
- printk(KERN_WARNING "VFS: Not enough memory for quota buffers.\n");
- return buf;
+ dqbuf_t buf = kmalloc(LUSTRE_DQBLKSIZE, GFP_NOFS);
+ if (!buf)
+ printk(KERN_WARNING
+ "VFS: Not enough memory for quota buffers.\n");
+ return buf;
}
static inline void freedqbuf(dqbuf_t buf)
{
- kfree(buf);
+ kfree(buf);
}
static ssize_t read_blk(struct file *filp, uint blk, dqbuf_t buf)
{
- mm_segment_t fs;
- ssize_t ret;
- loff_t offset = blk<<LUSTRE_DQBLKSIZE_BITS;
-
- memset(buf, 0, LUSTRE_DQBLKSIZE);
- fs = get_fs();
- set_fs(KERNEL_DS);
- ret = filp->f_op->read(filp, (char *)buf, LUSTRE_DQBLKSIZE, &offset);
- set_fs(fs);
- return ret;
+ mm_segment_t fs;
+ ssize_t ret;
+ loff_t offset = blk << LUSTRE_DQBLKSIZE_BITS;
+
+ memset(buf, 0, LUSTRE_DQBLKSIZE);
+ fs = get_fs();
+ set_fs(KERNEL_DS);
+ ret = filp->f_op->read(filp, (char *)buf, LUSTRE_DQBLKSIZE, &offset);
+ set_fs(fs);
+ return ret;
}
static ssize_t write_blk(struct file *filp, uint blk, dqbuf_t buf)
{
- mm_segment_t fs;
- ssize_t ret;
- loff_t offset = blk<<LUSTRE_DQBLKSIZE_BITS;
+ mm_segment_t fs;
+ ssize_t ret;
+ loff_t offset = blk << LUSTRE_DQBLKSIZE_BITS;
- fs = get_fs();
- set_fs(KERNEL_DS);
- ret = filp->f_op->write(filp, (char *)buf, LUSTRE_DQBLKSIZE, &offset);
- set_fs(fs);
- return ret;
+ fs = get_fs();
+ set_fs(KERNEL_DS);
+ ret = filp->f_op->write(filp, (char *)buf, LUSTRE_DQBLKSIZE, &offset);
+ set_fs(fs);
+ return ret;
}
static void lustre_mark_info_dirty(struct lustre_mem_dqinfo *info)
{
- set_bit(DQF_INFO_DIRTY_B, &info->dqi_flags);
+ set_bit(DQF_INFO_DIRTY_B, &info->dqi_flags);
}
#define lustre_info_dirty(info) test_bit(DQF_INFO_DIRTY_B, &(info)->dqi_flags)
/* Remove empty block from list and return it */
static int get_free_dqblk(struct file *filp, struct lustre_mem_dqinfo *info)
{
- dqbuf_t buf = getdqbuf();
- struct lustre_disk_dqdbheader *dh = (struct lustre_disk_dqdbheader *)buf;
- int ret, blk;
-
- if (!buf)
- return -ENOMEM;
- if (info->dqi_free_blk) {
- blk = info->dqi_free_blk;
- if ((ret = read_blk(filp, blk, buf)) < 0)
- goto out_buf;
- info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free);
- }
- else {
- memset(buf, 0, LUSTRE_DQBLKSIZE);
- if ((ret = write_blk(filp, info->dqi_blocks, buf)) < 0) /* Assure block allocation... */
- goto out_buf;
- blk = info->dqi_blocks++;
- }
- lustre_mark_info_dirty(info);
- ret = blk;
+ dqbuf_t buf = getdqbuf();
+ struct lustre_disk_dqdbheader *dh =
+ (struct lustre_disk_dqdbheader *)buf;
+ int ret, blk;
+
+ if (!buf)
+ return -ENOMEM;
+ if (info->dqi_free_blk) {
+ blk = info->dqi_free_blk;
+ if ((ret = read_blk(filp, blk, buf)) < 0)
+ goto out_buf;
+ info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free);
+ } else {
+ memset(buf, 0, LUSTRE_DQBLKSIZE);
+ if ((ret = write_blk(filp, info->dqi_blocks, buf)) < 0) /* Assure block allocation... */
+ goto out_buf;
+ blk = info->dqi_blocks++;
+ }
+ lustre_mark_info_dirty(info);
+ ret = blk;
out_buf:
- freedqbuf(buf);
- return ret;
+ freedqbuf(buf);
+ return ret;
}
/* Insert empty block to the list */
static int put_free_dqblk(struct file *filp, struct lustre_mem_dqinfo *info,
- dqbuf_t buf, uint blk)
+ dqbuf_t buf, uint blk)
{
- struct lustre_disk_dqdbheader *dh =(struct lustre_disk_dqdbheader *)buf;
- int err;
-
- dh->dqdh_next_free = cpu_to_le32(info->dqi_free_blk);
- dh->dqdh_prev_free = cpu_to_le32(0);
- dh->dqdh_entries = cpu_to_le16(0);
- info->dqi_free_blk = blk;
- lustre_mark_info_dirty(info);
- if ((err = write_blk(filp, blk, buf)) < 0)
- /* Some strange block. We had better leave it... */
- return err;
- return 0;
+ struct lustre_disk_dqdbheader *dh =
+ (struct lustre_disk_dqdbheader *)buf;
+ int err;
+
+ dh->dqdh_next_free = cpu_to_le32(info->dqi_free_blk);
+ dh->dqdh_prev_free = cpu_to_le32(0);
+ dh->dqdh_entries = cpu_to_le16(0);
+ info->dqi_free_blk = blk;
+ lustre_mark_info_dirty(info);
+ if ((err = write_blk(filp, blk, buf)) < 0)
+ /* Some strange block. We had better leave it... */
+ return err;
+ return 0;
}
/* Remove given block from the list of blocks with free entries */
-static int remove_free_dqentry(struct file *filp, struct lustre_mem_dqinfo *info, dqbuf_t buf, uint blk)
-{
- dqbuf_t tmpbuf = getdqbuf();
- struct lustre_disk_dqdbheader *dh = (struct lustre_disk_dqdbheader *)buf;
- uint nextblk = le32_to_cpu(dh->dqdh_next_free), prevblk = le32_to_cpu(dh->dqdh_prev_free);
- int err;
-
- if (!tmpbuf)
- return -ENOMEM;
- if (nextblk) {
- if ((err = read_blk(filp, nextblk, tmpbuf)) < 0)
- goto out_buf;
- ((struct lustre_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = dh->dqdh_prev_free;
- if ((err = write_blk(filp, nextblk, tmpbuf)) < 0)
- goto out_buf;
- }
- if (prevblk) {
- if ((err = read_blk(filp, prevblk, tmpbuf)) < 0)
- goto out_buf;
- ((struct lustre_disk_dqdbheader *)tmpbuf)->dqdh_next_free = dh->dqdh_next_free;
- if ((err = write_blk(filp, prevblk, tmpbuf)) < 0)
- goto out_buf;
- }
- else {
- info->dqi_free_entry = nextblk;
- lustre_mark_info_dirty(info);
- }
- freedqbuf(tmpbuf);
- dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
- if (write_blk(filp, blk, buf) < 0) /* No matter whether write succeeds block is out of list */
- printk(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk);
- return 0;
+static int remove_free_dqentry(struct file *filp,
+ struct lustre_mem_dqinfo *info, dqbuf_t buf,
+ uint blk)
+{
+ dqbuf_t tmpbuf = getdqbuf();
+ struct lustre_disk_dqdbheader *dh =
+ (struct lustre_disk_dqdbheader *)buf;
+ uint nextblk = le32_to_cpu(dh->dqdh_next_free), prevblk =
+ le32_to_cpu(dh->dqdh_prev_free);
+ int err;
+
+ if (!tmpbuf)
+ return -ENOMEM;
+ if (nextblk) {
+ if ((err = read_blk(filp, nextblk, tmpbuf)) < 0)
+ goto out_buf;
+ ((struct lustre_disk_dqdbheader *)tmpbuf)->dqdh_prev_free =
+ dh->dqdh_prev_free;
+ if ((err = write_blk(filp, nextblk, tmpbuf)) < 0)
+ goto out_buf;
+ }
+ if (prevblk) {
+ if ((err = read_blk(filp, prevblk, tmpbuf)) < 0)
+ goto out_buf;
+ ((struct lustre_disk_dqdbheader *)tmpbuf)->dqdh_next_free =
+ dh->dqdh_next_free;
+ if ((err = write_blk(filp, prevblk, tmpbuf)) < 0)
+ goto out_buf;
+ } else {
+ info->dqi_free_entry = nextblk;
+ lustre_mark_info_dirty(info);
+ }
+ freedqbuf(tmpbuf);
+ dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
+	if (write_blk(filp, blk, buf) < 0)	/* the block is off the list whether or not the write succeeds */
+ printk(KERN_ERR
+ "VFS: Can't write block (%u) with free entries.\n", blk);
+ return 0;
out_buf:
- freedqbuf(tmpbuf);
- return err;
+ freedqbuf(tmpbuf);
+ return err;
}
/* Insert given block to the beginning of list with free entries */
-static int insert_free_dqentry(struct file *filp, struct lustre_mem_dqinfo *info, dqbuf_t buf, uint blk)
-{
- dqbuf_t tmpbuf = getdqbuf();
- struct lustre_disk_dqdbheader *dh = (struct lustre_disk_dqdbheader *)buf;
- int err;
-
- if (!tmpbuf)
- return -ENOMEM;
- dh->dqdh_next_free = cpu_to_le32(info->dqi_free_entry);
- dh->dqdh_prev_free = cpu_to_le32(0);
- if ((err = write_blk(filp, blk, buf)) < 0)
- goto out_buf;
- if (info->dqi_free_entry) {
- if ((err = read_blk(filp, info->dqi_free_entry, tmpbuf)) < 0)
- goto out_buf;
- ((struct lustre_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = cpu_to_le32(blk);
- if ((err = write_blk(filp, info->dqi_free_entry, tmpbuf)) < 0)
- goto out_buf;
- }
- freedqbuf(tmpbuf);
- info->dqi_free_entry = blk;
- lustre_mark_info_dirty(info);
- return 0;
+static int insert_free_dqentry(struct file *filp,
+ struct lustre_mem_dqinfo *info, dqbuf_t buf,
+ uint blk)
+{
+ dqbuf_t tmpbuf = getdqbuf();
+ struct lustre_disk_dqdbheader *dh =
+ (struct lustre_disk_dqdbheader *)buf;
+ int err;
+
+ if (!tmpbuf)
+ return -ENOMEM;
+ dh->dqdh_next_free = cpu_to_le32(info->dqi_free_entry);
+ dh->dqdh_prev_free = cpu_to_le32(0);
+ if ((err = write_blk(filp, blk, buf)) < 0)
+ goto out_buf;
+ if (info->dqi_free_entry) {
+ if ((err = read_blk(filp, info->dqi_free_entry, tmpbuf)) < 0)
+ goto out_buf;
+ ((struct lustre_disk_dqdbheader *)tmpbuf)->dqdh_prev_free =
+ cpu_to_le32(blk);
+ if ((err = write_blk(filp, info->dqi_free_entry, tmpbuf)) < 0)
+ goto out_buf;
+ }
+ freedqbuf(tmpbuf);
+ info->dqi_free_entry = blk;
+ lustre_mark_info_dirty(info);
+ return 0;
out_buf:
- freedqbuf(tmpbuf);
- return err;
+ freedqbuf(tmpbuf);
+ return err;
}
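
The four helpers above maintain two free lists inside the quota file: dqi_free_blk heads a singly linked list of completely empty blocks (get_free_dqblk()/put_free_dqblk()), while dqi_free_entry heads a doubly linked list, chained through dqdh_next_free/dqdh_prev_free, of blocks that still have spare entry slots (remove_free_dqentry()/insert_free_dqentry()). A sketch that walks the second list, purely to illustrate the invariant; the real code never needs a full traversal:

/* Sketch only: counting blocks on the free-entry list. */
static int count_free_entry_blocks(struct file *filp,
                                   struct lustre_mem_dqinfo *info)
{
	dqbuf_t buf = getdqbuf();
	uint blk = info->dqi_free_entry;
	int n = 0;

	if (!buf)
		return -ENOMEM;
	while (blk) {
		if (read_blk(filp, blk, buf) < 0)
			break;
		n++;
		blk = le32_to_cpu(((struct lustre_disk_dqdbheader *)
				   buf)->dqdh_next_free);
	}
	freedqbuf(buf);
	return n;
}
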
/* Find space for dquot */
static uint find_free_dqentry(struct lustre_dquot *dquot, int *err)
{
- struct lustre_quota_info *lqi = dquot->dq_info;
- struct file *filp = lqi->qi_files[dquot->dq_type];
- struct lustre_mem_dqinfo *info = &lqi->qi_info[dquot->dq_type];
- uint blk, i;
- struct lustre_disk_dqdbheader *dh;
- struct lustre_disk_dqblk *ddquot;
- struct lustre_disk_dqblk fakedquot;
- dqbuf_t buf;
-
- *err = 0;
- if (!(buf = getdqbuf())) {
- *err = -ENOMEM;
- return 0;
- }
- dh = (struct lustre_disk_dqdbheader *)buf;
- ddquot = GETENTRIES(buf);
- if (info->dqi_free_entry) {
- blk = info->dqi_free_entry;
- if ((*err = read_blk(filp, blk, buf)) < 0)
- goto out_buf;
- }
- else {
- blk = get_free_dqblk(filp, info);
- if ((int)blk < 0) {
- *err = blk;
- freedqbuf(buf);
- return 0;
- }
- memset(buf, 0, LUSTRE_DQBLKSIZE);
- info->dqi_free_entry = blk; /* This is enough as block is already zeroed and entry list is empty... */
- lustre_mark_info_dirty(info);
- }
- if (le16_to_cpu(dh->dqdh_entries)+1 >= LUSTRE_DQSTRINBLK) /* Block will be full? */
- if ((*err = remove_free_dqentry(filp, info, buf, blk)) < 0) {
- printk(KERN_ERR "VFS: find_free_dqentry(): Can't remove block (%u) from entry free list.\n", blk);
- goto out_buf;
- }
- dh->dqdh_entries = cpu_to_le16(le16_to_cpu(dh->dqdh_entries)+1);
- memset(&fakedquot, 0, sizeof(struct lustre_disk_dqblk));
- /* Find free structure in block */
- for (i = 0; i < LUSTRE_DQSTRINBLK && memcmp(&fakedquot, ddquot+i, sizeof(struct lustre_disk_dqblk)); i++);
-
- if (i == LUSTRE_DQSTRINBLK) {
- printk(KERN_ERR "VFS: find_free_dqentry(): Data block full but it shouldn't.\n");
- *err = -EIO;
- goto out_buf;
- }
-
- if ((*err = write_blk(filp, blk, buf)) < 0) {
- printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota data block %u.\n", blk);
- goto out_buf;
- }
- dquot->dq_off = (blk<<LUSTRE_DQBLKSIZE_BITS)+sizeof(struct lustre_disk_dqdbheader)+i*sizeof(struct lustre_disk_dqblk);
- freedqbuf(buf);
- return blk;
+ struct lustre_quota_info *lqi = dquot->dq_info;
+ struct file *filp = lqi->qi_files[dquot->dq_type];
+ struct lustre_mem_dqinfo *info = &lqi->qi_info[dquot->dq_type];
+ uint blk, i;
+ struct lustre_disk_dqdbheader *dh;
+ struct lustre_disk_dqblk *ddquot;
+ struct lustre_disk_dqblk fakedquot;
+ dqbuf_t buf;
+
+ *err = 0;
+ if (!(buf = getdqbuf())) {
+ *err = -ENOMEM;
+ return 0;
+ }
+ dh = (struct lustre_disk_dqdbheader *)buf;
+ ddquot = GETENTRIES(buf);
+ if (info->dqi_free_entry) {
+ blk = info->dqi_free_entry;
+ if ((*err = read_blk(filp, blk, buf)) < 0)
+ goto out_buf;
+ } else {
+ blk = get_free_dqblk(filp, info);
+ if ((int)blk < 0) {
+ *err = blk;
+ freedqbuf(buf);
+ return 0;
+ }
+ memset(buf, 0, LUSTRE_DQBLKSIZE);
+ info->dqi_free_entry = blk; /* This is enough as block is already zeroed and entry list is empty... */
+ lustre_mark_info_dirty(info);
+ }
+ if (le16_to_cpu(dh->dqdh_entries) + 1 >= LUSTRE_DQSTRINBLK) /* Block will be full? */
+ if ((*err = remove_free_dqentry(filp, info, buf, blk)) < 0) {
+ printk(KERN_ERR
+ "VFS: find_free_dqentry(): Can't remove block (%u) from entry free list.\n",
+ blk);
+ goto out_buf;
+ }
+ dh->dqdh_entries = cpu_to_le16(le16_to_cpu(dh->dqdh_entries) + 1);
+ memset(&fakedquot, 0, sizeof(struct lustre_disk_dqblk));
+ /* Find free structure in block */
+ for (i = 0; i < LUSTRE_DQSTRINBLK &&
+ memcmp(&fakedquot, ddquot + i, sizeof(fakedquot)); i++) ;
+
+ if (i == LUSTRE_DQSTRINBLK) {
+ printk(KERN_ERR
+ "VFS: find_free_dqentry(): Data block full but it shouldn't.\n");
+ *err = -EIO;
+ goto out_buf;
+ }
+
+ if ((*err = write_blk(filp, blk, buf)) < 0) {
+ printk(KERN_ERR
+ "VFS: find_free_dqentry(): Can't write quota data block %u.\n",
+ blk);
+ goto out_buf;
+ }
+ dquot->dq_off =
+ (blk << LUSTRE_DQBLKSIZE_BITS) +
+ sizeof(struct lustre_disk_dqdbheader) +
+ i * sizeof(struct lustre_disk_dqblk);
+ freedqbuf(buf);
+ return blk;
out_buf:
- freedqbuf(buf);
- return 0;
+ freedqbuf(buf);
+ return 0;
}
/* Insert reference to structure into the trie */
-static int do_insert_tree(struct lustre_dquot *dquot, uint *treeblk, int depth)
-{
- struct lustre_quota_info *lqi = dquot->dq_info;
- struct file *filp = lqi->qi_files[dquot->dq_type];
- struct lustre_mem_dqinfo *info = &lqi->qi_info[dquot->dq_type];
- dqbuf_t buf;
- int ret = 0, newson = 0, newact = 0;
- u32 *ref;
- uint newblk;
-
- if (!(buf = getdqbuf()))
- return -ENOMEM;
- if (!*treeblk) {
- ret = get_free_dqblk(filp, info);
- if (ret < 0)
- goto out_buf;
- *treeblk = ret;
- memset(buf, 0, LUSTRE_DQBLKSIZE);
- newact = 1;
- }
- else {
- if ((ret = read_blk(filp, *treeblk, buf)) < 0) {
- printk(KERN_ERR "VFS: Can't read tree quota block %u.\n", *treeblk);
- goto out_buf;
- }
- }
- ref = (u32 *)buf;
- newblk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
- if (!newblk)
- newson = 1;
- if (depth == LUSTRE_DQTREEDEPTH-1) {
-
- if (newblk) {
- printk(KERN_ERR "VFS: Inserting already present quota entry (block %u).\n", ref[GETIDINDEX(dquot->dq_id, depth)]);
- ret = -EIO;
- goto out_buf;
- }
-
- newblk = find_free_dqentry(dquot, &ret);
- }
- else
- ret = do_insert_tree(dquot, &newblk, depth+1);
- if (newson && ret >= 0) {
- ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(newblk);
- ret = write_blk(filp, *treeblk, buf);
- }
- else if (newact && ret < 0)
- put_free_dqblk(filp, info, buf, *treeblk);
+static int do_insert_tree(struct lustre_dquot *dquot, uint * treeblk, int depth)
+{
+ struct lustre_quota_info *lqi = dquot->dq_info;
+ struct file *filp = lqi->qi_files[dquot->dq_type];
+ struct lustre_mem_dqinfo *info = &lqi->qi_info[dquot->dq_type];
+ dqbuf_t buf;
+ int ret = 0, newson = 0, newact = 0;
+ u32 *ref;
+ uint newblk;
+
+ if (!(buf = getdqbuf()))
+ return -ENOMEM;
+ if (!*treeblk) {
+ ret = get_free_dqblk(filp, info);
+ if (ret < 0)
+ goto out_buf;
+ *treeblk = ret;
+ memset(buf, 0, LUSTRE_DQBLKSIZE);
+ newact = 1;
+ } else {
+ if ((ret = read_blk(filp, *treeblk, buf)) < 0) {
+ printk(KERN_ERR
+ "VFS: Can't read tree quota block %u.\n",
+ *treeblk);
+ goto out_buf;
+ }
+ }
+ ref = (u32 *) buf;
+ newblk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
+ if (!newblk)
+ newson = 1;
+ if (depth == LUSTRE_DQTREEDEPTH - 1) {
+
+ if (newblk) {
+ printk(KERN_ERR
+ "VFS: Inserting already present quota entry (block %u).\n",
+ ref[GETIDINDEX(dquot->dq_id, depth)]);
+ ret = -EIO;
+ goto out_buf;
+ }
+
+ newblk = find_free_dqentry(dquot, &ret);
+ } else
+ ret = do_insert_tree(dquot, &newblk, depth + 1);
+ if (newson && ret >= 0) {
+ ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(newblk);
+ ret = write_blk(filp, *treeblk, buf);
+ } else if (newact && ret < 0)
+ put_free_dqblk(filp, info, buf, *treeblk);
out_buf:
- freedqbuf(buf);
- return ret;
+ freedqbuf(buf);
+ return ret;
}
/* Wrapper for inserting quota structure into tree */
static inline int dq_insert_tree(struct lustre_dquot *dquot)
{
- int tmp = LUSTRE_DQTREEOFF;
- return do_insert_tree(dquot, &tmp, 0);
+ int tmp = LUSTRE_DQTREEOFF;
+ return do_insert_tree(dquot, &tmp, 0);
}
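
do_insert_tree() descends a fixed-depth radix tree: with LUSTRE_DQTREEDEPTH == 4 and 256 slots per block, GETIDINDEX() simply peels the bytes of the 32-bit id off from most to least significant, so every id has exactly one four-step path to its data block. A worked example, illustrative only:

/* Sketch only: the radix path for id 0x12345678 is
 *   GETIDINDEX(id, 0) == 0x12   (slot in the root block)
 *   GETIDINDEX(id, 1) == 0x34
 *   GETIDINDEX(id, 2) == 0x56
 *   GETIDINDEX(id, 3) == 0x78   (leaf slot -> data block) */
static void print_quota_path(qid_t id)
{
	int depth;

	for (depth = 0; depth < LUSTRE_DQTREEDEPTH; depth++)
		printk(KERN_DEBUG "depth %d -> slot %u\n", depth,
		       GETIDINDEX(id, depth));
}
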
/*
*/
static int lustre_write_dquot(struct lustre_dquot *dquot)
{
- int type = dquot->dq_type;
- struct file *filp;
- mm_segment_t fs;
- loff_t offset;
- ssize_t ret;
- struct lustre_disk_dqblk ddquot, empty;
-
- if (!dquot->dq_off)
- if ((ret = dq_insert_tree(dquot)) < 0) {
- printk(KERN_ERR "VFS: Error %Zd occurred while creating quota.\n", ret);
- return ret;
- }
- filp = dquot->dq_info->qi_files[type];
- offset = dquot->dq_off;
- mem2diskdqb(&ddquot, &dquot->dq_dqb, dquot->dq_id);
- /* Argh... We may need to write structure full of zeroes but that would be
- * treated as an empty place by the rest of the code. Format change would
- * be definitely cleaner but the problems probably are not worth it */
- memset(&empty, 0, sizeof(struct lustre_disk_dqblk));
- if (!memcmp(&empty, &ddquot, sizeof(struct lustre_disk_dqblk)))
- ddquot.dqb_itime = cpu_to_le64(1);
- fs = get_fs();
- set_fs(KERNEL_DS);
- ret = filp->f_op->write(filp, (char *)&ddquot, sizeof(struct lustre_disk_dqblk), &offset);
- set_fs(fs);
- if (ret != sizeof(struct lustre_disk_dqblk)) {
- printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", filp->f_dentry->d_sb->s_id);
- if (ret >= 0)
- ret = -ENOSPC;
- }
- else
- ret = 0;
-
- return ret;
+ int type = dquot->dq_type;
+ struct file *filp;
+ mm_segment_t fs;
+ loff_t offset;
+ ssize_t ret;
+ struct lustre_disk_dqblk ddquot, empty;
+
+ if (!dquot->dq_off)
+ if ((ret = dq_insert_tree(dquot)) < 0) {
+ printk(KERN_ERR
+ "VFS: Error %Zd occurred while creating quota.\n",
+ ret);
+ return ret;
+ }
+ filp = dquot->dq_info->qi_files[type];
+ offset = dquot->dq_off;
+ mem2diskdqb(&ddquot, &dquot->dq_dqb, dquot->dq_id);
+ /* Argh... We may need to write structure full of zeroes but that would be
+ * treated as an empty place by the rest of the code. Format change would
+ * be definitely cleaner but the problems probably are not worth it */
+ memset(&empty, 0, sizeof(struct lustre_disk_dqblk));
+ if (!memcmp(&empty, &ddquot, sizeof(struct lustre_disk_dqblk)))
+ ddquot.dqb_itime = cpu_to_le64(1);
+ fs = get_fs();
+ set_fs(KERNEL_DS);
+ ret = filp->f_op->write(filp, (char *)&ddquot,
+ sizeof(struct lustre_disk_dqblk), &offset);
+ set_fs(fs);
+ if (ret != sizeof(struct lustre_disk_dqblk)) {
+ printk(KERN_WARNING "VFS: dquota write failed on dev %s\n",
+ filp->f_dentry->d_sb->s_id);
+ if (ret >= 0)
+ ret = -ENOSPC;
+ } else
+ ret = 0;
+
+ return ret;
}
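
The escape in lustre_write_dquot() exists because find_free_dqentry() treats an all-zero slot as free; a dquot whose limits and usage are all genuinely zero must therefore be made distinguishable, and dqb_itime == 1 (a one-second-after-the-epoch deadline, otherwise meaningless) serves as the marker that lustre_read_dquot() strips again on the way in. The write-side half as an isolated sketch:

/* Sketch only: the all-zero escape; ddquot is the on-disk image. */
static void escape_zero_dqblk(struct lustre_disk_dqblk *ddquot)
{
	struct lustre_disk_dqblk empty;

	memset(&empty, 0, sizeof(empty));
	if (!memcmp(&empty, ddquot, sizeof(empty)))
		ddquot->dqb_itime = cpu_to_le64(1);
}
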
/* Free dquot entry in data block */
static int free_dqentry(struct lustre_dquot *dquot, uint blk)
{
- struct file *filp = dquot->dq_info->qi_files[dquot->dq_type];
- struct lustre_mem_dqinfo *info = &dquot->dq_info->qi_info[dquot->dq_type];
- struct lustre_disk_dqdbheader *dh;
- dqbuf_t buf = getdqbuf();
- int ret = 0;
-
- if (!buf)
- return -ENOMEM;
- if (dquot->dq_off >> LUSTRE_DQBLKSIZE_BITS != blk) {
- printk(KERN_ERR "VFS: Quota structure has offset to other block (%u) than it should (%u).\n", blk, (uint)(dquot->dq_off >> LUSTRE_DQBLKSIZE_BITS));
- goto out_buf;
- }
- if ((ret = read_blk(filp, blk, buf)) < 0) {
- printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk);
- goto out_buf;
- }
- dh = (struct lustre_disk_dqdbheader *)buf;
- dh->dqdh_entries = cpu_to_le16(le16_to_cpu(dh->dqdh_entries)-1);
- if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? */
- if ((ret = remove_free_dqentry(filp, info, buf, blk)) < 0 ||
- (ret = put_free_dqblk(filp, info, buf, blk)) < 0) {
- printk(KERN_ERR "VFS: Can't move quota data block (%u) to free list.\n", blk);
- goto out_buf;
- }
- }
- else {
- memset(buf+(dquot->dq_off & ((1 << LUSTRE_DQBLKSIZE_BITS)-1)), 0, sizeof(struct lustre_disk_dqblk));
- if (le16_to_cpu(dh->dqdh_entries) == LUSTRE_DQSTRINBLK-1) {
- /* Insert will write block itself */
- if ((ret = insert_free_dqentry(filp, info, buf, blk)) < 0) {
- printk(KERN_ERR "VFS: Can't insert quota data block (%u) to free entry list.\n", blk);
- goto out_buf;
- }
- }
- else
- if ((ret = write_blk(filp, blk, buf)) < 0) {
- printk(KERN_ERR "VFS: Can't write quota data block %u\n", blk);
- goto out_buf;
- }
- }
- dquot->dq_off = 0; /* Quota is now unattached */
+ struct file *filp = dquot->dq_info->qi_files[dquot->dq_type];
+ struct lustre_mem_dqinfo *info =
+ &dquot->dq_info->qi_info[dquot->dq_type];
+ struct lustre_disk_dqdbheader *dh;
+ dqbuf_t buf = getdqbuf();
+ int ret = 0;
+
+ if (!buf)
+ return -ENOMEM;
+ if (dquot->dq_off >> LUSTRE_DQBLKSIZE_BITS != blk) {
+ printk(KERN_ERR
+ "VFS: Quota structure has offset to other block (%u) than it should (%u).\n",
+ blk, (uint) (dquot->dq_off >> LUSTRE_DQBLKSIZE_BITS));
+ goto out_buf;
+ }
+ if ((ret = read_blk(filp, blk, buf)) < 0) {
+ printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk);
+ goto out_buf;
+ }
+ dh = (struct lustre_disk_dqdbheader *)buf;
+ dh->dqdh_entries = cpu_to_le16(le16_to_cpu(dh->dqdh_entries) - 1);
+ if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? */
+ if ((ret = remove_free_dqentry(filp, info, buf, blk)) < 0 ||
+ (ret = put_free_dqblk(filp, info, buf, blk)) < 0) {
+ printk(KERN_ERR
+ "VFS: Can't move quota data block (%u) to free list.\n",
+ blk);
+ goto out_buf;
+ }
+ } else {
+ memset(buf +
+ (dquot->dq_off & ((1 << LUSTRE_DQBLKSIZE_BITS) - 1)), 0,
+ sizeof(struct lustre_disk_dqblk));
+ if (le16_to_cpu(dh->dqdh_entries) == LUSTRE_DQSTRINBLK - 1) {
+ /* Insert will write block itself */
+ if ((ret =
+ insert_free_dqentry(filp, info, buf, blk)) < 0) {
+ printk(KERN_ERR
+ "VFS: Can't insert quota data block (%u) to free entry list.\n",
+ blk);
+ goto out_buf;
+ }
+ } else if ((ret = write_blk(filp, blk, buf)) < 0) {
+ printk(KERN_ERR
+ "VFS: Can't write quota data block %u\n", blk);
+ goto out_buf;
+ }
+ }
+ dquot->dq_off = 0; /* Quota is now unattached */
out_buf:
- freedqbuf(buf);
- return ret;
+ freedqbuf(buf);
+ return ret;
}
/* Remove reference to dquot from tree */
-static int remove_tree(struct lustre_dquot *dquot, uint *blk, int depth)
-{
- struct file *filp = dquot->dq_info->qi_files[dquot->dq_type];
- struct lustre_mem_dqinfo *info = &dquot->dq_info->qi_info[dquot->dq_type];
- dqbuf_t buf = getdqbuf();
- int ret = 0;
- uint newblk;
- u32 *ref = (u32 *)buf;
-
- if (!buf)
- return -ENOMEM;
- if ((ret = read_blk(filp, *blk, buf)) < 0) {
- printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk);
- goto out_buf;
- }
- newblk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
- if (depth == LUSTRE_DQTREEDEPTH-1) {
- ret = free_dqentry(dquot, newblk);
- newblk = 0;
- }
- else
- ret = remove_tree(dquot, &newblk, depth+1);
- if (ret >= 0 && !newblk) {
- int i;
- ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(0);
- for (i = 0; i < LUSTRE_DQBLKSIZE && !buf[i]; i++); /* Block got empty? */
- /* don't put the root block into free blk list! */
- if (i == LUSTRE_DQBLKSIZE && *blk != LUSTRE_DQTREEOFF) {
- put_free_dqblk(filp, info, buf, *blk);
- *blk = 0;
- }
- else
- if ((ret = write_blk(filp, *blk, buf)) < 0)
- printk(KERN_ERR "VFS: Can't write quota tree block %u.\n", *blk);
- }
+static int remove_tree(struct lustre_dquot *dquot, uint * blk, int depth)
+{
+ struct file *filp = dquot->dq_info->qi_files[dquot->dq_type];
+ struct lustre_mem_dqinfo *info =
+ &dquot->dq_info->qi_info[dquot->dq_type];
+ dqbuf_t buf = getdqbuf();
+ int ret = 0;
+ uint newblk;
+ u32 *ref = (u32 *) buf;
+
+ if (!buf)
+ return -ENOMEM;
+ if ((ret = read_blk(filp, *blk, buf)) < 0) {
+ printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk);
+ goto out_buf;
+ }
+ newblk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
+ if (depth == LUSTRE_DQTREEDEPTH - 1) {
+ ret = free_dqentry(dquot, newblk);
+ newblk = 0;
+ } else
+ ret = remove_tree(dquot, &newblk, depth + 1);
+ if (ret >= 0 && !newblk) {
+ int i;
+ ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(0);
+ for (i = 0; i < LUSTRE_DQBLKSIZE && !buf[i]; i++) ; /* Block got empty? */
+ /* don't put the root block into free blk list! */
+ if (i == LUSTRE_DQBLKSIZE && *blk != LUSTRE_DQTREEOFF) {
+ put_free_dqblk(filp, info, buf, *blk);
+ *blk = 0;
+ } else if ((ret = write_blk(filp, *blk, buf)) < 0)
+ printk(KERN_ERR
+ "VFS: Can't write quota tree block %u.\n", *blk);
+ }
out_buf:
- freedqbuf(buf);
- return ret;
+ freedqbuf(buf);
+ return ret;
}
/* Delete dquot from tree */
-#ifndef QFMT_NO_DELETE
static int lustre_delete_dquot(struct lustre_dquot *dquot)
{
- uint tmp = LUSTRE_DQTREEOFF;
+ uint tmp = LUSTRE_DQTREEOFF;
- if (!dquot->dq_off) /* Even not allocated? */
- return 0;
- return remove_tree(dquot, &tmp, 0);
+ if (!dquot->dq_off) /* Even not allocated? */
+ return 0;
+ return remove_tree(dquot, &tmp, 0);
}
-#endif
/* Find entry in block */
static loff_t find_block_dqentry(struct lustre_dquot *dquot, uint blk)
{
- struct file *filp = dquot->dq_info->qi_files[dquot->dq_type];
- dqbuf_t buf = getdqbuf();
- loff_t ret = 0;
- int i;
- struct lustre_disk_dqblk *ddquot = GETENTRIES(buf);
-
- if (!buf)
- return -ENOMEM;
- if ((ret = read_blk(filp, blk, buf)) < 0) {
- printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
- goto out_buf;
- }
- if (dquot->dq_id)
- for (i = 0; i < LUSTRE_DQSTRINBLK && le32_to_cpu(ddquot[i].dqb_id) != dquot->dq_id; i++);
- else { /* ID 0 as a bit more complicated searching... */
- struct lustre_disk_dqblk fakedquot;
-
- memset(&fakedquot, 0, sizeof(struct lustre_disk_dqblk));
- for (i = 0; i < LUSTRE_DQSTRINBLK; i++)
- if (!le32_to_cpu(ddquot[i].dqb_id) && memcmp(&fakedquot, ddquot+i, sizeof(struct lustre_disk_dqblk)))
- break;
- }
- if (i == LUSTRE_DQSTRINBLK) {
- printk(KERN_ERR "VFS: Quota for id %u referenced but not present.\n", dquot->dq_id);
- ret = -EIO;
- goto out_buf;
- }
- else
- ret = (blk << LUSTRE_DQBLKSIZE_BITS) + sizeof(struct lustre_disk_dqdbheader) + i * sizeof(struct lustre_disk_dqblk);
+ struct file *filp = dquot->dq_info->qi_files[dquot->dq_type];
+ dqbuf_t buf = getdqbuf();
+ loff_t ret = 0;
+ int i;
+ struct lustre_disk_dqblk *ddquot = GETENTRIES(buf);
+
+ if (!buf)
+ return -ENOMEM;
+ if ((ret = read_blk(filp, blk, buf)) < 0) {
+ printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
+ goto out_buf;
+ }
+ if (dquot->dq_id)
+ for (i = 0;
+ i < LUSTRE_DQSTRINBLK
+ && le32_to_cpu(ddquot[i].dqb_id) != dquot->dq_id; i++) ;
+ else { /* ID 0 as a bit more complicated searching... */
+ struct lustre_disk_dqblk fakedquot;
+
+ memset(&fakedquot, 0, sizeof(struct lustre_disk_dqblk));
+ for (i = 0; i < LUSTRE_DQSTRINBLK; i++)
+ if (!le32_to_cpu(ddquot[i].dqb_id)
+ && memcmp(&fakedquot, ddquot + i,
+ sizeof(struct lustre_disk_dqblk)))
+ break;
+ }
+ if (i == LUSTRE_DQSTRINBLK) {
+ printk(KERN_ERR
+ "VFS: Quota for id %u referenced but not present.\n",
+ dquot->dq_id);
+ ret = -EIO;
+ goto out_buf;
+ } else
+ ret =
+ (blk << LUSTRE_DQBLKSIZE_BITS) +
+ sizeof(struct lustre_disk_dqdbheader) +
+ i * sizeof(struct lustre_disk_dqblk);
out_buf:
- freedqbuf(buf);
- return ret;
+ freedqbuf(buf);
+ return ret;
}
/* Find entry for given id in the tree */
static loff_t find_tree_dqentry(struct lustre_dquot *dquot, uint blk, int depth)
{
- struct file *filp = dquot->dq_info->qi_files[dquot->dq_type];
- dqbuf_t buf = getdqbuf();
- loff_t ret = 0;
- u32 *ref = (u32 *)buf;
-
- if (!buf)
- return -ENOMEM;
- if ((ret = read_blk(filp, blk, buf)) < 0) {
- printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
- goto out_buf;
- }
- ret = 0;
- blk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
- if (!blk) /* No reference? */
- goto out_buf;
- if (depth < LUSTRE_DQTREEDEPTH-1)
- ret = find_tree_dqentry(dquot, blk, depth+1);
- else
- ret = find_block_dqentry(dquot, blk);
+ struct file *filp = dquot->dq_info->qi_files[dquot->dq_type];
+ dqbuf_t buf = getdqbuf();
+ loff_t ret = 0;
+ u32 *ref = (u32 *) buf;
+
+ if (!buf)
+ return -ENOMEM;
+ if ((ret = read_blk(filp, blk, buf)) < 0) {
+ printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
+ goto out_buf;
+ }
+ ret = 0;
+ blk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
+ if (!blk) /* No reference? */
+ goto out_buf;
+ if (depth < LUSTRE_DQTREEDEPTH - 1)
+ ret = find_tree_dqentry(dquot, blk, depth + 1);
+ else
+ ret = find_block_dqentry(dquot, blk);
out_buf:
- freedqbuf(buf);
- return ret;
+ freedqbuf(buf);
+ return ret;
}
/* Find entry for given id in the tree - wrapper function */
static inline loff_t find_dqentry(struct lustre_dquot *dquot)
{
- return find_tree_dqentry(dquot, LUSTRE_DQTREEOFF, 0);
+ return find_tree_dqentry(dquot, LUSTRE_DQTREEOFF, 0);
}
int lustre_read_dquot(struct lustre_dquot *dquot)
{
- int type = dquot->dq_type;
- struct file *filp;
- mm_segment_t fs;
- loff_t offset;
- struct lustre_disk_dqblk ddquot, empty;
- int ret = 0;
-
- filp = dquot->dq_info->qi_files[type];
-
- if (!filp || !dquot->dq_info) { /* Invalidated quota? */
- printk(KERN_ERR "VFS: Quota invalidated while reading!\n");
- return -EIO;
- }
-
- offset = find_dqentry(dquot);
- if (offset <= 0) { /* Entry not present? */
- if (offset < 0)
- printk(KERN_ERR "VFS: Can't read quota structure for id %u.\n", dquot->dq_id);
- dquot->dq_off = 0;
- set_bit(DQ_FAKE_B, &dquot->dq_flags);
- memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
- ret = offset;
- }
- else {
- dquot->dq_off = offset;
- fs = get_fs();
- set_fs(KERNEL_DS);
- if ((ret = filp->f_op->read(filp, (char *)&ddquot, sizeof(struct lustre_disk_dqblk), &offset)) != sizeof(struct lustre_disk_dqblk)) {
- if (ret >= 0)
- ret = -EIO;
- printk(KERN_ERR "VFS: Error while reading quota structure for id %u.\n", dquot->dq_id);
- memset(&ddquot, 0, sizeof(struct lustre_disk_dqblk));
- }
- else {
- ret = 0;
- /* We need to escape back all-zero structure */
- memset(&empty, 0, sizeof(struct lustre_disk_dqblk));
- empty.dqb_itime = cpu_to_le64(1);
- if (!memcmp(&empty, &ddquot,
- sizeof(struct lustre_disk_dqblk)))
- ddquot.dqb_itime = 0;
- }
- set_fs(fs);
- disk2memdqb(&dquot->dq_dqb, &ddquot);
- }
-
- return ret;
+ int type = dquot->dq_type;
+ struct file *filp;
+ mm_segment_t fs;
+ loff_t offset;
+ struct lustre_disk_dqblk ddquot, empty;
+ int ret = 0;
+
+	/* check dq_info before dereferencing it for qi_files */
+	if (!dquot->dq_info || !dquot->dq_info->qi_files[type]) {
+		/* Invalidated quota? */
+		printk(KERN_ERR "VFS: Quota invalidated while reading!\n");
+		return -EIO;
+	}
+	filp = dquot->dq_info->qi_files[type];
+
+ offset = find_dqentry(dquot);
+ if (offset <= 0) { /* Entry not present? */
+ if (offset < 0)
+ printk(KERN_ERR
+ "VFS: Can't read quota structure for id %u.\n",
+ dquot->dq_id);
+ dquot->dq_off = 0;
+ set_bit(DQ_FAKE_B, &dquot->dq_flags);
+ memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
+ ret = offset;
+ } else {
+ dquot->dq_off = offset;
+ fs = get_fs();
+ set_fs(KERNEL_DS);
+ if ((ret = filp->f_op->read(filp, (char *)&ddquot,
+ sizeof(struct lustre_disk_dqblk),
+ &offset)) !=
+ sizeof(struct lustre_disk_dqblk)) {
+ if (ret >= 0)
+ ret = -EIO;
+ printk(KERN_ERR
+ "VFS: Error while reading quota structure for id %u.\n",
+ dquot->dq_id);
+ memset(&ddquot, 0, sizeof(struct lustre_disk_dqblk));
+ } else {
+ ret = 0;
+			/* Undo the all-zeroes escape applied by
+			 * lustre_write_dquot() */
+ memset(&empty, 0, sizeof(struct lustre_disk_dqblk));
+ empty.dqb_itime = cpu_to_le64(1);
+ if (!memcmp(&empty, &ddquot,
+ sizeof(struct lustre_disk_dqblk)))
+ ddquot.dqb_itime = 0;
+ }
+ set_fs(fs);
+ disk2memdqb(&dquot->dq_dqb, &ddquot);
+ }
+
+ return ret;
}
/* Commit changes of dquot to disk - it might also mean deleting it when quota became fake */
int lustre_commit_dquot(struct lustre_dquot *dquot)
{
- int rc = 0;
- /* always clear the flag so we don't loop on an IO error... */
- clear_bit(DQ_MOD_B, &dquot->dq_flags);
+ int rc = 0;
+ /* always clear the flag so we don't loop on an IO error... */
+ clear_bit(DQ_MOD_B, &dquot->dq_flags);
- /* The block/inode usage in admin quotafile isn't the real usage
- * over all cluster, so keep the fake dquot entry on disk is
- * meaningless, just remove it */
- if (test_bit(DQ_FAKE_B, &dquot->dq_flags))
- rc = lustre_delete_dquot(dquot);
- else
- rc = lustre_write_dquot(dquot);
- if (rc < 0)
- return rc;
+	/* The block/inode usage in the admin quotafile isn't the real
+	 * usage across the whole cluster, so keeping a fake dquot entry
+	 * on disk is meaningless; just remove it */
+ if (test_bit(DQ_FAKE_B, &dquot->dq_flags))
+ rc = lustre_delete_dquot(dquot);
+ else
+ rc = lustre_write_dquot(dquot);
- if (lustre_info_dirty(&dquot->dq_info->qi_info[dquot->dq_type]))
- rc = lustre_write_quota_info(dquot->dq_info, dquot->dq_type);
+ if (rc < 0)
+ return rc;
- return rc;
+ if (lustre_info_dirty(&dquot->dq_info->qi_info[dquot->dq_type]))
+ rc = lustre_write_quota_info(dquot->dq_info, dquot->dq_type);
+
+ return rc;
}
/* We need to export this function to initialize quotafile, because we haven't
* user level check utility */
int lustre_init_quota_info(struct lustre_quota_info *lqi, int type)
{
- struct lustre_mem_dqinfo *dqinfo = &lqi->qi_info[type];
- struct lustre_disk_dqheader dqhead;
- struct file *fp = lqi->qi_files[type];
- ssize_t size;
- loff_t offset = 0;
- int rc = 0;
- static const uint quota_magics[] = LUSTRE_INITQMAGICS;
- static const uint quota_versions[] = LUSTRE_INITQVERSIONS;
-
- /* write quotafile header */
- dqhead.dqh_magic = cpu_to_le32(quota_magics[type]);
- dqhead.dqh_version = cpu_to_le32(quota_versions[type]);
- size = fp->f_op->write(fp, (char *)&dqhead,
- sizeof(struct lustre_disk_dqheader), &offset);
-
- if (size != sizeof(struct lustre_disk_dqheader)) {
- printk(KERN_ERR "error writing quoafile header (rc:%d)\n", rc);
- rc = size;
- }
- if (rc)
- return rc;
-
- /* write init quota info */
- memset(dqinfo, 0, sizeof(*dqinfo));
- dqinfo->dqi_bgrace = MAX_DQ_TIME;
- dqinfo->dqi_igrace = MAX_IQ_TIME;
- dqinfo->dqi_blocks = LUSTRE_DQTREEOFF + 1;
-
- return lustre_write_quota_info(lqi, type);
+ struct lustre_mem_dqinfo *dqinfo = &lqi->qi_info[type];
+ struct lustre_disk_dqheader dqhead;
+ struct file *fp = lqi->qi_files[type];
+ ssize_t size;
+ loff_t offset = 0;
+ int rc = 0;
+ static const uint quota_magics[] = LUSTRE_INITQMAGICS;
+ static const uint quota_versions[] = LUSTRE_INITQVERSIONS;
+
+ /* write quotafile header */
+ dqhead.dqh_magic = cpu_to_le32(quota_magics[type]);
+ dqhead.dqh_version = cpu_to_le32(quota_versions[type]);
+ size = fp->f_op->write(fp, (char *)&dqhead,
+ sizeof(struct lustre_disk_dqheader), &offset);
+
+ if (size != sizeof(struct lustre_disk_dqheader)) {
+ printk(KERN_ERR "error writing quoafile header (rc:%d)\n", rc);
+ rc = size;
+ }
+ if (rc)
+ return rc;
+
+ /* write init quota info */
+ memset(dqinfo, 0, sizeof(*dqinfo));
+ dqinfo->dqi_bgrace = MAX_DQ_TIME;
+ dqinfo->dqi_igrace = MAX_IQ_TIME;
+ dqinfo->dqi_blocks = LUSTRE_DQTREEOFF + 1;
+
+ return lustre_write_quota_info(lqi, type);
+}
+
+struct dqblk {
+ struct list_head link;
+ uint blk;
+};
+
+static ssize_t quota_read(struct file *file, struct inode *inode, int type,
+ uint blk, dqbuf_t buf)
+{
+ if (file) {
+ return read_blk(file, blk, buf);
+ } else {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,12)
+ return -ENOTSUPP;
+#else
+ struct super_block *sb = inode->i_sb;
+ memset(buf, 0, LUSTRE_DQBLKSIZE);
+ return sb->s_op->quota_read(sb, type, (char *)buf,
+ LUSTRE_DQBLKSIZE,
+ blk << LUSTRE_DQBLKSIZE_BITS);
+#endif
+ }
+}
+
+static int walk_block_dqentry(struct file *filp, struct inode *inode, int type,
+ uint blk, struct list_head *list)
+{
+ dqbuf_t buf = getdqbuf();
+ loff_t ret = 0;
+ struct lustre_disk_dqdbheader *dqhead =
+ (struct lustre_disk_dqdbheader *)buf;
+ struct dqblk *blk_item;
+ struct dqblk *pos;
+ struct list_head *tmp;
+
+ if (!buf)
+ return -ENOMEM;
+ if ((ret = quota_read(filp, inode, type, blk, buf)) < 0) {
+ printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
+ goto out_buf;
+ }
+ ret = 0;
+
+ if (!le32_to_cpu(dqhead->dqdh_entries))
+ goto out_buf;
+
+ if (list_empty(list)) {
+ tmp = list;
+ goto done;
+ }
+
+ list_for_each_entry(pos, list, link) {
+ if (blk == pos->blk) /* we got this blk already */
+ goto out_buf;
+ if (blk > pos->blk)
+ continue;
+ break;
+ }
+ tmp = &pos->link;
+done:
+ blk_item = kmalloc(sizeof(*blk_item), GFP_NOFS);
+ if (!blk_item) {
+ ret = -ENOMEM;
+ goto out_buf;
+ }
+ blk_item->blk = blk;
+ INIT_LIST_HEAD(&blk_item->link);
+
+ list_add_tail(&blk_item->link, tmp);
+
+out_buf:
+ freedqbuf(buf);
+ return ret;
+}
+
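
Many ids share one data block, so walk_tree_dqentry() reaches the same leaf block through many slots; walk_block_dqentry() therefore keeps blk_list sorted in ascending block order and bails out on a match, which makes duplicate detection a cheap early exit. The ordered-insert-with-dedup idiom reduced to its core, as a sketch over the same struct dqblk items:

/* Sketch only: insert blk into a sorted, duplicate-free list. */
static int blk_list_add_sorted(struct list_head *list, uint blk)
{
	struct dqblk *pos, *item;
	struct list_head *where = list;	/* default: append at tail */

	list_for_each_entry(pos, list, link) {
		if (pos->blk == blk)
			return 0;	/* already recorded */
		if (pos->blk > blk) {
			where = &pos->link;	/* insert before pos */
			break;
		}
	}
	item = kmalloc(sizeof(*item), GFP_NOFS);
	if (!item)
		return -ENOMEM;
	item->blk = blk;
	list_add_tail(&item->link, where);
	return 0;
}
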
+static int walk_tree_dqentry(struct file *filp, struct inode *inode, int type,
+ uint blk, int depth, struct list_head *list)
+{
+ dqbuf_t buf = getdqbuf();
+ loff_t ret = 0;
+ int index;
+ u32 *ref = (u32 *) buf;
+
+ if (!buf)
+ return -ENOMEM;
+ if ((ret = quota_read(filp, inode, type, blk, buf)) < 0) {
+ printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
+ goto out_buf;
+ }
+ ret = 0;
+
+ for (index = 0; index <= 0xff && !ret; index++) {
+ blk = le32_to_cpu(ref[index]);
+ if (!blk) /* No reference */
+ continue;
+
+ if (depth < LUSTRE_DQTREEDEPTH - 1)
+ ret = walk_tree_dqentry(filp, inode, type, blk,
+ depth + 1, list);
+ else
+ ret = walk_block_dqentry(filp, inode, type, blk, list);
+ }
+out_buf:
+ freedqbuf(buf);
+ return ret;
+}
+
+/* Walk through the quota file (v2 format) to get all ids with quota limit */
+int lustre_get_qids(struct file *fp, struct inode *inode, int type,
+ struct list_head *list)
+{
+ struct list_head blk_list;
+ struct dqblk *blk_item, *tmp;
+ dqbuf_t buf = NULL;
+ struct lustre_disk_dqblk *ddquot;
+ int rc;
+
+ if (!check_quota_file(fp, inode, type)) {
+ printk(KERN_ERR "unknown quota file format!\n");
+ return -EINVAL;
+ }
+ if (!list_empty(list)) {
+ printk(KERN_ERR "not empty list\n");
+ return -EINVAL;
+ }
+
+ INIT_LIST_HEAD(&blk_list);
+ rc = walk_tree_dqentry(fp, inode, type, LUSTRE_DQTREEOFF, 0, &blk_list);
+ if (rc) {
+ printk(KERN_ERR "walk through quota file failed!(%d)\n", rc);
+ goto out_free;
+ }
+ if (list_empty(&blk_list))
+ return 0;
+
+ buf = getdqbuf();
+ if (!buf)
+ return -ENOMEM;
+ ddquot = GETENTRIES(buf);
+
+ list_for_each_entry(blk_item, &blk_list, link) {
+ loff_t ret = 0;
+ int i;
+ struct lustre_disk_dqblk fakedquot;
+
+ memset(buf, 0, LUSTRE_DQBLKSIZE);
+ if ((ret = quota_read(fp, inode, type, blk_item->blk, buf))<0) {
+ printk(KERN_ERR
+ "VFS: Can't read quota tree block %u.\n",
+ blk_item->blk);
+ rc = ret;
+ goto out_free;
+ }
+
+ memset(&fakedquot, 0, sizeof(struct lustre_disk_dqblk));
+ for (i = 0; i < LUSTRE_DQSTRINBLK; i++) {
+ struct dquot_id *dqid;
+ /* skip empty entry */
+ if (!memcmp
+ (&fakedquot, ddquot + i,
+ sizeof(struct lustre_disk_dqblk)))
+ continue;
+
+ dqid = kmalloc(sizeof(*dqid), GFP_NOFS);
+ if (!dqid) {
+ rc = -ENOMEM;
+ goto out_free;
+ }
+ dqid->di_id = le32_to_cpu(ddquot[i].dqb_id);
+ INIT_LIST_HEAD(&dqid->di_link);
+ list_add(&dqid->di_link, list);
+ }
+ }
+
+out_free:
+ list_for_each_entry_safe(blk_item, tmp, &blk_list, link) {
+ list_del_init(&blk_item->link);
+ kfree(blk_item);
+ }
+ if (buf)
+ freedqbuf(buf);
+ return rc;
}
EXPORT_SYMBOL(lustre_check_quota_file);
EXPORT_SYMBOL(lustre_read_dquot);
EXPORT_SYMBOL(lustre_commit_dquot);
EXPORT_SYMBOL(lustre_init_quota_info);
+EXPORT_SYMBOL(lustre_get_qids);
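
A hedged sketch of a consumer of the newly exported lustre_get_qids(): the function fills the caller's list with kmalloc'd struct dquot_id items, so the caller owns and must free them. show_limited_ids and the trimmed error handling are illustrative only:

/* Sketch only: consuming and freeing the id list. */
static void show_limited_ids(struct file *fp, int type)
{
	LIST_HEAD(ids);
	struct dquot_id *dqid, *tmp;

	if (lustre_get_qids(fp, NULL, type, &ids))
		return;
	list_for_each_entry_safe(dqid, tmp, &ids, di_link) {
		printk(KERN_INFO "quota id %u has limits set\n",
		       dqid->di_id);
		list_del_init(&dqid->di_link);
		kfree(dqid);
	}
}
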
-/*
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
* Lustre administrative quota format
*
* from
* to blocks of these structures.
*/
struct lustre_disk_dqblk {
- __u32 dqb_id; /* id this quota applies to */
- __u32 dqb_ihardlimit; /* absolute limit on allocated inodes */
- __u32 dqb_isoftlimit; /* preferred inode limit */
- __u32 dqb_curinodes; /* current # allocated inodes */
- __u32 dqb_bhardlimit; /* absolute limit on disk space (in QUOTABLOCK_SIZE) */
- __u32 dqb_bsoftlimit; /* preferred limit on disk space (in QUOTABLOCK_SIZE) */
- __u64 dqb_curspace; /* current space occupied (in bytes) */
- __u64 dqb_btime; /* time limit for excessive disk use */
- __u64 dqb_itime; /* time limit for excessive inode use */
+ __u32 dqb_id; /* id this quota applies to */
+ __u32 dqb_ihardlimit; /* absolute limit on allocated inodes */
+ __u32 dqb_isoftlimit; /* preferred inode limit */
+ __u32 dqb_curinodes; /* current # allocated inodes */
+ __u32 dqb_bhardlimit; /* absolute limit on disk space (in QUOTABLOCK_SIZE) */
+ __u32 dqb_bsoftlimit; /* preferred limit on disk space (in QUOTABLOCK_SIZE) */
+ __u64 dqb_curspace; /* current space occupied (in bytes) */
+ __u64 dqb_btime; /* time limit for excessive disk use */
+ __u64 dqb_itime; /* time limit for excessive inode use */
};
/*
*/
/* First generic header */
struct lustre_disk_dqheader {
- __u32 dqh_magic; /* Magic number identifying file */
- __u32 dqh_version; /* File version */
+ __u32 dqh_magic; /* Magic number identifying file */
+ __u32 dqh_version; /* File version */
};
/* Header with type and version specific information */
struct lustre_disk_dqinfo {
- __u32 dqi_bgrace; /* Time before block soft limit becomes hard limit */
- __u32 dqi_igrace; /* Time before inode soft limit becomes hard limit */
- __u32 dqi_flags; /* Flags for quotafile (DQF_*) */
- __u32 dqi_blocks; /* Number of blocks in file */
- __u32 dqi_free_blk; /* Number of first free block in the list */
- __u32 dqi_free_entry; /* Number of block with at least one free entry */
+ __u32 dqi_bgrace; /* Time before block soft limit becomes hard limit */
+ __u32 dqi_igrace; /* Time before inode soft limit becomes hard limit */
+ __u32 dqi_flags; /* Flags for quotafile (DQF_*) */
+ __u32 dqi_blocks; /* Number of blocks in file */
+ __u32 dqi_free_blk; /* Number of first free block in the list */
+ __u32 dqi_free_entry; /* Number of block with at least one free entry */
};
/*
* there will be space for exactly 21 quota-entries in a block
*/
struct lustre_disk_dqdbheader {
- __u32 dqdh_next_free; /* Number of next block with free entry */
- __u32 dqdh_prev_free; /* Number of previous block with free entry */
- __u16 dqdh_entries; /* Number of valid entries in block */
- __u16 dqdh_pad1;
- __u32 dqdh_pad2;
+ __u32 dqdh_next_free; /* Number of next block with free entry */
+ __u32 dqdh_prev_free; /* Number of previous block with free entry */
+ __u16 dqdh_entries; /* Number of valid entries in block */
+ __u16 dqdh_pad1;
+ __u32 dqdh_pad2;
};
-#define LUSTRE_DQINFOOFF sizeof(struct lustre_disk_dqheader) /* Offset of info header in file */
+#define LUSTRE_DQINFOOFF sizeof(struct lustre_disk_dqheader) /* Offset of info header in file */
#define LUSTRE_DQBLKSIZE_BITS 10
-#define LUSTRE_DQBLKSIZE (1 << LUSTRE_DQBLKSIZE_BITS) /* Size of block with quota structures */
-#define LUSTRE_DQTREEOFF 1 /* Offset of tree in file in blocks */
-#define LUSTRE_DQTREEDEPTH 4 /* Depth of quota tree */
-#define LUSTRE_DQSTRINBLK ((LUSTRE_DQBLKSIZE - sizeof(struct lustre_disk_dqdbheader)) / sizeof(struct lustre_disk_dqblk)) /* Number of entries in one blocks */
-
+#define LUSTRE_DQBLKSIZE (1 << LUSTRE_DQBLKSIZE_BITS) /* Size of block with quota structures */
+#define LUSTRE_DQTREEOFF 1 /* Offset of tree in file in blocks */
+#define LUSTRE_DQTREEDEPTH 4 /* Depth of quota tree */
+#define LUSTRE_DQSTRINBLK ((LUSTRE_DQBLKSIZE - sizeof(struct lustre_disk_dqdbheader)) / sizeof(struct lustre_disk_dqblk)) /* Number of entries in one block */
-#endif /* lustre_quota_fmt.h */
+#endif /* lustre_quota_fmt.h */
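A quick arithmetic check of the constants above (a sketch, not part of the patch): with the layouts in this header, sizeof(struct lustre_disk_dqblk) = 6*4 + 3*8 = 48 bytes and sizeof(struct lustre_disk_dqdbheader) = 4+4+2+2+4 = 16 bytes, so LUSTRE_DQSTRINBLK = (1024 - 16) / 48 = 21, matching the "exactly 21 quota-entries in a block" comment. Assuming each interior tree block stores 4-byte block references, as in the Linux v2 quota tree, one block holds 1024/4 = 256 references, and LUSTRE_DQTREEDEPTH = 4 levels address 256^4 = 2^32 ids, covering the whole __u32 dqb_id space. The size assumptions can be pinned down with hypothetical compile-time checks:

/* Hypothetical compile-time checks; a negative array size fails the
 * build if any of the size assumptions above is wrong. */
typedef char dqblk_is_48[sizeof(struct lustre_disk_dqblk) == 48 ? 1 : -1];
typedef char dqdbhdr_is_16[sizeof(struct lustre_disk_dqdbheader) == 16 ? 1 : -1];
typedef char dqstrinblk_is_21[LUSTRE_DQSTRINBLK == 21 ? 1 : -1];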
#include "lustre_quota_fmt.h"
-char *test_quotafile[2] = {"usrquota_test", "grpquota_test"};
+char *test_quotafile[2] = { "usrquota_test", "grpquota_test" };
static int quotfmt_initialize(struct lustre_quota_info *lqi,
struct obd_device *tgt,
- struct lvfs_run_ctxt *saved)
+ struct lvfs_run_ctxt *saved)
{
- struct lustre_disk_dqheader dqhead;
- static const uint quota_magics[] = LUSTRE_INITQMAGICS;
- static const uint quota_versions[] = LUSTRE_INITQVERSIONS;
- struct file *fp;
- struct inode *parent_inode = tgt->obd_lvfs_ctxt.pwd->d_inode;
- size_t size;
- struct dentry *de;
- int i, rc = 0;
- ENTRY;
-
- push_ctxt(saved, &tgt->obd_lvfs_ctxt, NULL);
-
- sema_init(&lqi->qi_sem, 1);
-
- for (i = 0; i < MAXQUOTAS; i++) {
- loff_t offset = 0;
- char *name = test_quotafile[i];
- int namelen = strlen(name);
-
- /* remove the stale test quotafile */
- down(&parent_inode->i_sem);
- de = lookup_one_len(name, tgt->obd_lvfs_ctxt.pwd, namelen);
- if (!IS_ERR(de) && de->d_inode)
- vfs_unlink(parent_inode, de);
- if (!IS_ERR(de))
- dput(de);
- up(&parent_inode->i_sem);
-
- /* create quota file */
- fp = filp_open(name, O_CREAT | O_EXCL, 0644);
- if (IS_ERR(fp)) {
- rc = PTR_ERR(fp);
- CERROR("error creating test quotafile %s (rc = %d)\n",
- name, rc);
- break;
- }
- lqi->qi_files[i] = fp;
-
- /* write quotafile header */
- dqhead.dqh_magic = cpu_to_le32(quota_magics[i]);
- dqhead.dqh_version = cpu_to_le32(quota_versions[i]);
- size = fp->f_op->write(fp, (char *)&dqhead,
- sizeof(struct lustre_disk_dqheader),
- &offset);
- if (size != sizeof(struct lustre_disk_dqheader)) {
- CERROR("error writing quoafile header %s (rc = %d)\n",
- name, rc);
- rc = size;
- break;
- }
- }
-
- RETURN(rc);
+ struct lustre_disk_dqheader dqhead;
+ static const uint quota_magics[] = LUSTRE_INITQMAGICS;
+ static const uint quota_versions[] = LUSTRE_INITQVERSIONS;
+ struct file *fp;
+ struct inode *parent_inode = tgt->obd_lvfs_ctxt.pwd->d_inode;
+ size_t size;
+ struct dentry *de;
+ int i, rc = 0;
+ ENTRY;
+
+ push_ctxt(saved, &tgt->obd_lvfs_ctxt, NULL);
+
+ for (i = 0; i < MAXQUOTAS; i++) {
+ loff_t offset = 0;
+ char *name = test_quotafile[i];
+ int namelen = strlen(name);
+
+ /* remove the stale test quotafile */
+ down(&parent_inode->i_sem);
+ de = lookup_one_len(name, tgt->obd_lvfs_ctxt.pwd, namelen);
+ if (!IS_ERR(de) && de->d_inode)
+ vfs_unlink(parent_inode, de);
+ if (!IS_ERR(de))
+ dput(de);
+ up(&parent_inode->i_sem);
+
+ /* create quota file */
+ fp = filp_open(name, O_CREAT | O_EXCL, 0644);
+ if (IS_ERR(fp)) {
+ rc = PTR_ERR(fp);
+ CERROR("error creating test quotafile %s (rc = %d)\n",
+ name, rc);
+ break;
+ }
+ lqi->qi_files[i] = fp;
+
+ /* write quotafile header */
+ dqhead.dqh_magic = cpu_to_le32(quota_magics[i]);
+ dqhead.dqh_version = cpu_to_le32(quota_versions[i]);
+ size = fp->f_op->write(fp, (char *)&dqhead,
+ sizeof(struct lustre_disk_dqheader),
+ &offset);
+ if (size != sizeof(struct lustre_disk_dqheader)) {
+ rc = size;
+ CERROR("error writing quotafile header %s (rc = %d)\n",
+ name, rc);
+ break;
+ }
+ }
+
+ RETURN(rc);
}
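quotfmt_initialize() stores the two header fields with cpu_to_le32(), so validating a quota file only requires reading those 8 bytes back and comparing. A minimal userspace sketch of that check (check_quota_header() is a hypothetical stand-in for what lustre_check_quota_file() is expected to verify):

#include <stdio.h>
#include <stdint.h>

/* Hypothetical validator for the 8-byte header written above; the
 * on-disk fields were stored with cpu_to_le32(), so this assumes a
 * little-endian reader. */
static int check_quota_header(FILE *fp, uint32_t magic, uint32_t version)
{
        struct {
                uint32_t dqh_magic;
                uint32_t dqh_version;
        } h;

        if (fseek(fp, 0, SEEK_SET) != 0 || fread(&h, sizeof(h), 1, fp) != 1)
                return -1;
        return (h.dqh_magic == magic && h.dqh_version == version) ? 0 : -1;
}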
static int quotfmt_finalize(struct lustre_quota_info *lqi,
- struct obd_device *tgt,
- struct lvfs_run_ctxt *saved)
+ struct obd_device *tgt, struct lvfs_run_ctxt *saved)
{
- struct dentry *de;
- struct inode *parent_inode = tgt->obd_lvfs_ctxt.pwd->d_inode;
- int i, rc = 0;
- ENTRY;
-
- for (i = 0; i < MAXQUOTAS; i++) {
- char *name = test_quotafile[i];
- int namelen = strlen(name);
-
- if (lqi->qi_files[i] == NULL)
- continue;
-
- /* close quota file */
- filp_close(lqi->qi_files[i], 0);
-
- /* unlink quota file */
- down(&parent_inode->i_sem);
-
- de = lookup_one_len(name, tgt->obd_lvfs_ctxt.pwd, namelen);
- if (IS_ERR(de) || de->d_inode == NULL) {
- rc = IS_ERR(de) ? PTR_ERR(de) : -ENOENT;
- CERROR("error lookup quotafile %s (rc = %d)\n",
- name, rc);
- goto dput;
- }
-
- rc = vfs_unlink(parent_inode, de);
- if (rc)
- CERROR("error unlink quotafile %s (rc = %d)\n",
- name, rc);
-dput:
- if (!IS_ERR(de))
- dput(de);
- up(&parent_inode->i_sem);
- }
-
- pop_ctxt(saved, &tgt->obd_lvfs_ctxt, NULL);
- RETURN(rc);
+ struct dentry *de;
+ struct inode *parent_inode = tgt->obd_lvfs_ctxt.pwd->d_inode;
+ int i, rc = 0;
+ ENTRY;
+
+ for (i = 0; i < MAXQUOTAS; i++) {
+ char *name = test_quotafile[i];
+ int namelen = strlen(name);
+
+ if (lqi->qi_files[i] == NULL)
+ continue;
+
+ /* close quota file */
+ filp_close(lqi->qi_files[i], 0);
+
+ /* unlink quota file */
+ down(&parent_inode->i_sem);
+
+ de = lookup_one_len(name, tgt->obd_lvfs_ctxt.pwd, namelen);
+ if (IS_ERR(de) || de->d_inode == NULL) {
+ rc = IS_ERR(de) ? PTR_ERR(de) : -ENOENT;
+ CERROR("error lookup quotafile %s (rc = %d)\n",
+ name, rc);
+ goto dput;
+ }
+
+ rc = vfs_unlink(parent_inode, de);
+ if (rc)
+ CERROR("error unlink quotafile %s (rc = %d)\n",
+ name, rc);
+ dput:
+ if (!IS_ERR(de))
+ dput(de);
+ up(&parent_inode->i_sem);
+ }
+
+ pop_ctxt(saved, &tgt->obd_lvfs_ctxt, NULL);
+ RETURN(rc);
}
static int quotfmt_test_1(struct lustre_quota_info *lqi)
{
- int i;
- ENTRY;
-
- for (i = 0; i < MAXQUOTAS; i++) {
- if (!lustre_check_quota_file(lqi, i))
- RETURN(-EINVAL);
- }
- RETURN(0);
+ int i;
+ ENTRY;
+
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if (!lustre_check_quota_file(lqi, i))
+ RETURN(-EINVAL);
+ }
+ RETURN(0);
}
static void print_quota_info(struct lustre_quota_info *lqi)
{
#if 0
- struct lustre_mem_dqinfo *dqinfo;
- int i;
-
- for (i = 0; i < MAXQUOTAS; i++) {
- dqinfo = &lqi->qi_info[i];
- printk("%s quota info:\n", i == USRQUOTA ? "user " : "group");
- printk("dqi_bgrace(%u) dqi_igrace(%u) dqi_flags(%lu) dqi_blocks(%u) "
- "dqi_free_blk(%u) dqi_free_entry(%u)\n",
- dqinfo->dqi_bgrace, dqinfo->dqi_igrace, dqinfo->dqi_flags,
- dqinfo->dqi_blocks, dqinfo->dqi_free_blk,
- dqinfo->dqi_free_entry);
- }
+ struct lustre_mem_dqinfo *dqinfo;
+ int i;
+
+ for (i = 0; i < MAXQUOTAS; i++) {
+ dqinfo = &lqi->qi_info[i];
+ printk("%s quota info:\n", i == USRQUOTA ? "user " : "group");
+ printk("dqi_bgrace(%u) dqi_igrace(%u) dqi_flags(%lu) "
+ "dqi_blocks(%u) dqi_free_blk(%u) dqi_free_entry(%u)\n",
+ dqinfo->dqi_bgrace, dqinfo->dqi_igrace, dqinfo->dqi_flags,
+ dqinfo->dqi_blocks, dqinfo->dqi_free_blk,
+ dqinfo->dqi_free_entry);
+ }
#endif
}
static int quotfmt_test_2(struct lustre_quota_info *lqi)
{
- int i, rc = 0;
- ENTRY;
-
- for (i = 0; i < MAXQUOTAS; i++) {
- struct lustre_mem_dqinfo dqinfo;
-
- rc = lustre_init_quota_info(lqi, i);
- if (rc) {
- CERROR("init quotainfo(%d) failed! (rc:%d)\n", i, rc);
- break;
- }
- memcpy(&dqinfo, &lqi->qi_info[i], sizeof(dqinfo));
-
- rc = lustre_read_quota_info(lqi, i);
- if (rc) {
- CERROR("read quotainfo(%d) failed! (rc:%d)\n", i, rc);
- break;
- }
-
- if(memcmp(&dqinfo, &lqi->qi_info[i], sizeof(dqinfo))) {
- rc = -EINVAL;
- break;
- }
- }
- RETURN(rc);
+ int i, rc = 0;
+ ENTRY;
+
+ for (i = 0; i < MAXQUOTAS; i++) {
+ struct lustre_mem_dqinfo dqinfo;
+
+ rc = lustre_init_quota_info(lqi, i);
+ if (rc) {
+ CERROR("init quotainfo(%d) failed! (rc:%d)\n", i, rc);
+ break;
+ }
+ memcpy(&dqinfo, &lqi->qi_info[i], sizeof(dqinfo));
+
+ rc = lustre_read_quota_info(lqi, i);
+ if (rc) {
+ CERROR("read quotainfo(%d) failed! (rc:%d)\n", i, rc);
+ break;
+ }
+
+ if (memcmp(&dqinfo, &lqi->qi_info[i], sizeof(dqinfo))) {
+ rc = -EINVAL;
+ break;
+ }
+ }
+ RETURN(rc);
}
static struct lustre_dquot *get_rand_dquot(struct lustre_quota_info *lqi)
{
- struct lustre_dquot *dquot;
- unsigned int rand;
-
- OBD_ALLOC(dquot, sizeof(*dquot));
- if (dquot == NULL)
- return NULL;
-
- get_random_bytes(&rand, sizeof(rand));
- if (!rand)
- rand = 1000;
-
- dquot->dq_info = lqi;
- dquot->dq_id = rand % 1000 + 1;
- dquot->dq_type = rand % MAXQUOTAS;
-
- dquot->dq_dqb.dqb_bhardlimit = rand;
- dquot->dq_dqb.dqb_bsoftlimit = rand / 2;
- dquot->dq_dqb.dqb_curspace = rand / 3;
- dquot->dq_dqb.dqb_ihardlimit = rand;
- dquot->dq_dqb.dqb_isoftlimit = rand / 2;
- dquot->dq_dqb.dqb_curinodes = rand / 3;
- dquot->dq_dqb.dqb_btime = jiffies;
- dquot->dq_dqb.dqb_itime = jiffies;
-
- return dquot;
+ struct lustre_dquot *dquot;
+ unsigned int rand;
+
+ OBD_ALLOC(dquot, sizeof(*dquot));
+ if (dquot == NULL)
+ return NULL;
+
+ get_random_bytes(&rand, sizeof(rand));
+ if (!rand)
+ rand = 1000;
+
+ dquot->dq_info = lqi;
+ dquot->dq_id = rand % 1000 + 1;
+ dquot->dq_type = rand % MAXQUOTAS;
+
+ dquot->dq_dqb.dqb_bhardlimit = rand;
+ dquot->dq_dqb.dqb_bsoftlimit = rand / 2;
+ dquot->dq_dqb.dqb_curspace = rand / 3;
+ dquot->dq_dqb.dqb_ihardlimit = rand;
+ dquot->dq_dqb.dqb_isoftlimit = rand / 2;
+ dquot->dq_dqb.dqb_curinodes = rand / 3;
+ dquot->dq_dqb.dqb_btime = jiffies;
+ dquot->dq_dqb.dqb_itime = jiffies;
+
+ return dquot;
}
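get_rand_dquot() keeps dq_id in the range 1..1000 on purpose: repeated iterations revisit a small id space, so the tests exercise in-place rewrites of existing entries as well as fresh inserts. The usual pairing, as in write_check_dquot() below (a sketch):

struct lustre_dquot *dquot = get_rand_dquot(lqi);

if (dquot == NULL)
        return -ENOMEM;
/* ... lustre_read_dquot() / lustre_commit_dquot() round trips ... */
put_rand_dquot(dquot);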
static void put_rand_dquot(struct lustre_dquot *dquot)
{
- OBD_FREE(dquot, sizeof(*dquot));
+ OBD_FREE(dquot, sizeof(*dquot));
}
static int write_check_dquot(struct lustre_quota_info *lqi)
{
- struct lustre_dquot *dquot;
- struct mem_dqblk dqblk;
- int rc = 0;
- ENTRY;
-
- dquot = get_rand_dquot(lqi);
- if (dquot == NULL)
- RETURN(-ENOMEM);
-
- /* for already exists entry, we set the dq_off by read_dquot */
- rc = lustre_read_dquot(dquot);
- if (rc) {
- CERROR("read dquot failed! (rc:%d)\n", rc);
- GOTO(out, rc);
- }
-
- /* for already exists entry, we rewrite it */
- rc = lustre_commit_dquot(dquot);
- if (rc) {
- CERROR("commit dquot failed! (rc:%d)\n", rc);
- GOTO(out, rc);
- }
- memcpy(&dqblk, &dquot->dq_dqb, sizeof(dqblk));
- memset(&dquot->dq_dqb, 0, sizeof(dqblk));
-
- rc = lustre_read_dquot(dquot);
- if (rc) {
- CERROR("read dquot failed! (rc:%d)\n", rc);
- GOTO(out, rc);
- }
-
- if (memcmp(&dqblk, &dquot->dq_dqb, sizeof(dqblk))) {
- rc = -EINVAL;
- GOTO(out, rc);
- }
-out:
- put_rand_dquot(dquot);
- RETURN(rc);
+ struct lustre_dquot *dquot;
+ struct mem_dqblk dqblk;
+ int rc = 0;
+ ENTRY;
+
+ dquot = get_rand_dquot(lqi);
+ if (dquot == NULL)
+ RETURN(-ENOMEM);
+
+ /* for an already existing entry, lustre_read_dquot() sets dq_off */
+ rc = lustre_read_dquot(dquot);
+ if (rc) {
+ CERROR("read dquot failed! (rc:%d)\n", rc);
+ GOTO(out, rc);
+ }
+
+ clear_bit(DQ_FAKE_B, &dquot->dq_flags);
+ /* for an already existing entry, rewrite it in place */
+ rc = lustre_commit_dquot(dquot);
+ if (rc) {
+ CERROR("commit dquot failed! (rc:%d)\n", rc);
+ GOTO(out, rc);
+ }
+ memcpy(&dqblk, &dquot->dq_dqb, sizeof(dqblk));
+ memset(&dquot->dq_dqb, 0, sizeof(dqblk));
+
+ rc = lustre_read_dquot(dquot);
+ if (rc) {
+ CERROR("read dquot failed! (rc:%d)\n", rc);
+ GOTO(out, rc);
+ }
+
+ if (memcmp(&dqblk, &dquot->dq_dqb, sizeof(dqblk))) {
+ rc = -EINVAL;
+ GOTO(out, rc);
+ }
+ out:
+ put_rand_dquot(dquot);
+ RETURN(rc);
}
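The invariant this function checks, restated as a hypothetical helper: committing a dquot and reading it back must reproduce the in-memory dqb bytes exactly (the memset() in the middle ensures the read actually refills the structure rather than leaving stale data):

/* Sketch: returns 1 if the dqb survives a commit/read round trip. */
static int dqb_round_trips(struct lustre_dquot *dquot)
{
        struct mem_dqblk before;

        memcpy(&before, &dquot->dq_dqb, sizeof(before));
        if (lustre_commit_dquot(dquot))
                return 0;
        memset(&dquot->dq_dqb, 0, sizeof(dquot->dq_dqb));
        if (lustre_read_dquot(dquot))
                return 0;
        return memcmp(&before, &dquot->dq_dqb, sizeof(before)) == 0;
}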
static int quotfmt_test_3(struct lustre_quota_info *lqi)
{
- struct lustre_dquot *dquot;
- int i = 0, rc = 0;
- ENTRY;
-
- dquot = get_rand_dquot(lqi);
- if (dquot == NULL)
- RETURN(-ENOMEM);
-repeat:
- clear_bit(DQ_FAKE_B, &dquot->dq_flags);
- /* write a new dquot */
- rc = lustre_commit_dquot(dquot);
- if (rc) {
- CERROR("commit dquot failed! (rc:%d)\n", rc);
- GOTO(out, rc);
- }
- dquot->dq_off = 0;
- memset(&dquot->dq_dqb, 0, sizeof(dquot->dq_dqb));
-
- /* check if this dquot is on disk now */
- rc = lustre_read_dquot(dquot);
- if (rc) {
- CERROR("read dquot failed! (rc:%d)\n", rc);
- GOTO(out, rc);
- }
- if (!dquot->dq_off || test_bit(DQ_FAKE_B, &dquot->dq_flags)) {
- CERROR("the dquot isn't committed\n");
- GOTO(out, rc = -EINVAL);
- }
-
- /* remove this dquot */
- set_bit(DQ_FAKE_B, &dquot->dq_flags);
- dquot->dq_dqb.dqb_curspace = 0;
- dquot->dq_dqb.dqb_curinodes = 0;
- rc = lustre_commit_dquot(dquot);
- if (rc) {
- CERROR("remove dquot failed! (rc:%d)\n", rc);
- GOTO(out, rc);
- }
-
- /* check if the dquot is really removed */
- clear_bit(DQ_FAKE_B, &dquot->dq_flags);
- dquot->dq_off = 0;
- rc = lustre_read_dquot(dquot);
- if (rc) {
- CERROR("read dquot failed! (rc:%d)\n", rc);
- GOTO(out, rc);
- }
- if (!test_bit(DQ_FAKE_B, &dquot->dq_flags) || dquot->dq_off) {
- CERROR("the dquot isn't removed!\n");
- GOTO(out, rc = -EINVAL);
- }
-
- /* check if this dquot can be write again */
- if (++i < 2)
- goto repeat;
+ struct lustre_dquot *dquot;
+ int i = 0, rc = 0;
+ ENTRY;
+
+ dquot = get_rand_dquot(lqi);
+ if (dquot == NULL)
+ RETURN(-ENOMEM);
+ repeat:
+ clear_bit(DQ_FAKE_B, &dquot->dq_flags);
+ /* write a new dquot */
+ rc = lustre_commit_dquot(dquot);
+ if (rc) {
+ CERROR("commit dquot failed! (rc:%d)\n", rc);
+ GOTO(out, rc);
+ }
+ dquot->dq_off = 0;
+ memset(&dquot->dq_dqb, 0, sizeof(dquot->dq_dqb));
+
+ /* check if this dquot is on disk now */
+ rc = lustre_read_dquot(dquot);
+ if (rc) {
+ CERROR("read dquot failed! (rc:%d)\n", rc);
+ GOTO(out, rc);
+ }
+ if (!dquot->dq_off || test_bit(DQ_FAKE_B, &dquot->dq_flags)) {
+ CERROR("the dquot isn't committed\n");
+ GOTO(out, rc = -EINVAL);
+ }
+
+ /* remove this dquot */
+ set_bit(DQ_FAKE_B, &dquot->dq_flags);
+ dquot->dq_dqb.dqb_curspace = 0;
+ dquot->dq_dqb.dqb_curinodes = 0;
+ rc = lustre_commit_dquot(dquot);
+ if (rc) {
+ CERROR("remove dquot failed! (rc:%d)\n", rc);
+ GOTO(out, rc);
+ }
+
+ /* check if the dquot is really removed */
+ clear_bit(DQ_FAKE_B, &dquot->dq_flags);
+ dquot->dq_off = 0;
+ rc = lustre_read_dquot(dquot);
+ if (rc) {
+ CERROR("read dquot failed! (rc:%d)\n", rc);
+ GOTO(out, rc);
+ }
+ if (!test_bit(DQ_FAKE_B, &dquot->dq_flags) || dquot->dq_off) {
+ CERROR("the dquot isn't removed!\n");
+ GOTO(out, rc = -EINVAL);
+ }
+
+ /* check if this dquot can be written again */
+ if (++i < 2)
+ goto repeat;
print_quota_info(lqi);
-out:
- put_rand_dquot(dquot);
- RETURN(rc);
+ out:
+ put_rand_dquot(dquot);
+ RETURN(rc);
}
static int quotfmt_test_4(struct lustre_quota_info *lqi)
{
- int i, rc = 0;
- ENTRY;
-
- for (i = 0; i < 30000; i++) {
- rc = write_check_dquot(lqi);
- if (rc) {
- CERROR("write/check dquot failed at %d! (rc:%d)\n",
- i, rc);
- break;
- }
- }
- print_quota_info(lqi);
- RETURN(rc);
+ int i, rc = 0;
+ ENTRY;
+
+ for (i = 0; i < 30000; i++) {
+ rc = write_check_dquot(lqi);
+ if (rc) {
+ CERROR("write/check dquot failed at %d! (rc:%d)\n",
+ i, rc);
+ break;
+ }
+ }
+ print_quota_info(lqi);
+ RETURN(rc);
+}
+
+static int quotfmt_test_5(struct lustre_quota_info *lqi)
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,12)
+ int i, rc = 0;
+
+ for (i = USRQUOTA; i < MAXQUOTAS && !rc; i++) {
+ struct list_head list;
+ struct dquot_id *dqid, *tmp;
+
+ INIT_LIST_HEAD(&list);
+ rc = lustre_get_qids(lqi->qi_files[i], NULL, i, &list);
+ if (rc) {
+ CERROR("%s get all %ss (rc:%d):\n",
+ rc ? "error" : "success",
+ i == USRQUOTA ? "uid" : "gid", rc);
+ }
+ list_for_each_entry_safe(dqid, tmp, &list, di_link) {
+ list_del_init(&dqid->di_link);
+ if (rc == 0)
+ printk("%d ", dqid->di_id);
+ kfree(dqid);
+ }
+ printk("\n");
+ }
+ return rc;
+#else
+ CWARN("kernel version >= 2.6.12, test skipped\n");
+ return 0;
+#endif
}
static int quotfmt_run_tests(struct obd_device *obd, struct obd_device *tgt)
{
struct lvfs_run_ctxt saved;
- struct lustre_quota_info *lqi = NULL;
- int rc = 0;
- ENTRY;
-
- OBD_ALLOC(lqi, sizeof(*lqi));
- if (lqi == NULL) {
- CERROR("not enough memory\n");
- RETURN(-ENOMEM);
- }
-
- CWARN("=== Initialize quotafile test\n");
- rc = quotfmt_initialize(lqi, tgt, &saved);
- if (rc)
- GOTO(out, rc);
-
- CWARN("=== test 1: check quota header\n");
- rc = quotfmt_test_1(lqi);
- if (rc) {
- CERROR("check quota header failed! (rc:%d)\n", rc);
- GOTO(out, rc);
- }
-
- CWARN("=== test 2: write/read quota info\n");
- rc = quotfmt_test_2(lqi);
- if (rc) {
- CERROR("write/read quota info failed! (rc:%d)\n", rc);
- GOTO(out, rc);
- }
-
- CWARN("=== test 3: write/remove dquot\n");
- rc = quotfmt_test_3(lqi);
- if (rc) {
- CERROR("write/remove dquot failed! (rc:%d)\n", rc);
- GOTO(out, rc);
- }
-
- CWARN("=== test 4: write/read 30000 dquot\n");
- rc = quotfmt_test_4(lqi);
- if (rc) {
- CERROR("write/read 30000 dquot failed\n");
- GOTO(out, rc);
- }
-out:
- CWARN("=== Finalize quotafile test\n");
- rc = quotfmt_finalize(lqi, tgt, &saved);
- OBD_FREE(lqi, sizeof(*lqi));
- RETURN(rc);
+ struct lustre_quota_info *lqi = NULL;
+ int rc = 0;
+ ENTRY;
+
+ OBD_ALLOC(lqi, sizeof(*lqi));
+ if (lqi == NULL) {
+ CERROR("not enough memory\n");
+ RETURN(-ENOMEM);
+ }
+
+ CWARN("=== Initialize quotafile test\n");
+ rc = quotfmt_initialize(lqi, tgt, &saved);
+ if (rc)
+ GOTO(out, rc);
+
+ CWARN("=== test 1: check quota header\n");
+ rc = quotfmt_test_1(lqi);
+ if (rc) {
+ CERROR("check quota header failed! (rc:%d)\n", rc);
+ GOTO(out, rc);
+ }
+
+ CWARN("=== test 2: write/read quota info\n");
+ rc = quotfmt_test_2(lqi);
+ if (rc) {
+ CERROR("write/read quota info failed! (rc:%d)\n", rc);
+ GOTO(out, rc);
+ }
+
+ CWARN("=== test 3: write/remove dquot\n");
+ rc = quotfmt_test_3(lqi);
+ if (rc) {
+ CERROR("write/remove dquot failed! (rc:%d)\n", rc);
+ GOTO(out, rc);
+ }
+
+ CWARN("=== test 4: write/read 30000 dquot\n");
+ rc = quotfmt_test_4(lqi);
+ if (rc) {
+ CERROR("write/read 30000 dquot failed\n");
+ GOTO(out, rc);
+ }
+
+ CWARN("=== test 5: walk through quota file to get all ids\n");
+ rc = quotfmt_test_5(lqi);
+ if (rc) {
+ CERROR("walk through quota file failed\n");
+ GOTO(out, rc);
+ }
+
+ out:
+ CWARN("=== Finalize quotafile test\n");
+ rc = quotfmt_finalize(lqi, tgt, &saved);
+ OBD_FREE(lqi, sizeof(*lqi));
+ RETURN(rc);
}
static int quotfmt_test_cleanup(struct obd_device *obd)
{
- ENTRY;
+ ENTRY;
lprocfs_obd_cleanup(obd);
RETURN(0);
}
{
struct lprocfs_static_vars lvars;
struct lustre_cfg *lcfg = buf;
- struct obd_device *tgt;
+ struct obd_device *tgt;
int rc;
ENTRY;
}
static struct obd_ops quotfmt_obd_ops = {
- .o_owner = THIS_MODULE,
- .o_setup = quotfmt_test_setup,
- .o_cleanup = quotfmt_test_cleanup,
+ .o_owner = THIS_MODULE,
+ .o_setup = quotfmt_test_setup,
+ .o_cleanup = quotfmt_test_cleanup,
};
#ifdef LPROCFS
static struct lprocfs_vars lprocfs_obd_vars[] = { {0} };
static struct lprocfs_vars lprocfs_module_vars[] = { {0} };
+
LPROCFS_INIT_VARS(quotfmt_test, lprocfs_module_vars, lprocfs_obd_vars)
#endif
-
static int __init quotfmt_test_init(void)
{
struct lprocfs_static_vars lvars;
lprocfs_init_vars(quotfmt_test, &lvars);
return class_register_type("fmt_obd_ops, lvars.module_vars,
- "quotfmt_test");
+ "quotfmt_test");
}
static void __exit quotfmt_test_exit(void)
MOSTLYCLEANFILES := @MOSTLYCLEANFILES@
DIST_SOURCES = ldlm_extent.c ldlm_flock.c ldlm_internal.h ldlm_lib.c \
ldlm_lock.c ldlm_lockd.c ldlm_plain.c ldlm_request.c \
- ldlm_resource.c l_lock.c
+ ldlm_resource.c l_lock.c ldlm_inodebits.c
/* Determine if the lock is compatible with all locks on the queue. */
static int
ldlm_inodebits_compat_queue(struct list_head *queue, struct ldlm_lock *req,
- struct list_head *work_list)
+ int send_cbs)
{
struct list_head *tmp;
struct ldlm_lock *lock;
int compat = 1;
ENTRY;
- LASSERT(req_bits); /* There is no sence in lock with no bits set,
+ LASSERT(req_bits); /* There is no sense in lock with no bits set,
I think. Also such a lock would be compatible
with any other bit lock */
list_for_each(tmp, queue) {
if (!(lock->l_policy_data.l_inodebits.bits & req_bits))
continue;
- if (!work_list)
+ if (!send_cbs)
RETURN(0);
compat = 0;
if (lock->l_blocking_ast)
- ldlm_add_ast_work_item(lock, req, work_list);
+ ldlm_add_ast_work_item(lock, req, NULL, 0);
}
RETURN(compat);
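The compatibility rule above is purely bitwise: a granted inodebits lock conflicts with a request only when their bit masks intersect, which is why a lock with no bits set would be vacuously compatible with everything. The per-lock predicate the loop applies, as a one-line sketch (hypothetical helper):

/* conflict iff the granted lock's bits overlap the requested bits */
static inline int ibits_conflict(__u64 granted_bits, __u64 req_bits)
{
        return (granted_bits & req_bits) != 0;
}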
* - the caller has NOT initialized req->lr_tmp, so we must
* - must call this function with the ns lock held once */
int ldlm_process_inodebits_lock(struct ldlm_lock *lock, int *flags,
- int first_enq, ldlm_error_t *err,
- struct list_head *work_list)
+ int first_enq, ldlm_error_t *err)
{
struct ldlm_resource *res = lock->l_resource;
struct list_head rpc_list = LIST_HEAD_INIT(rpc_list);
ENTRY;
LASSERT(list_empty(&res->lr_converting));
- check_res_locked(res);
if (!first_enq) {
- LASSERT(work_list != NULL);
- rc = ldlm_inodebits_compat_queue(&res->lr_granted, lock, NULL);
+ LASSERT(res->lr_tmp != NULL);
+ rc = ldlm_inodebits_compat_queue(&res->lr_granted, lock, 0);
if (!rc)
RETURN(LDLM_ITER_STOP);
- rc = ldlm_inodebits_compat_queue(&res->lr_waiting, lock, NULL);
+ rc = ldlm_inodebits_compat_queue(&res->lr_waiting, lock, 0);
if (!rc)
RETURN(LDLM_ITER_STOP);
ldlm_resource_unlink_lock(lock);
- ldlm_grant_lock(lock, work_list);
+ ldlm_grant_lock(lock, NULL, 0, 1);
RETURN(LDLM_ITER_CONTINUE);
}
restart:
- rc = ldlm_inodebits_compat_queue(&res->lr_granted, lock, &rpc_list);
- rc += ldlm_inodebits_compat_queue(&res->lr_waiting, lock, &rpc_list);
+ LASSERT(res->lr_tmp == NULL);
+ res->lr_tmp = &rpc_list;
+ rc = ldlm_inodebits_compat_queue(&res->lr_granted, lock, 1);
+ rc += ldlm_inodebits_compat_queue(&res->lr_waiting, lock, 1);
+ res->lr_tmp = NULL;
if (rc != 2) {
/* If either of the compat_queue()s returned 0, then we
* re-ordered! Causes deadlock, because ASTs aren't sent! */
if (list_empty(&lock->l_res_link))
ldlm_resource_add_lock(res, &res->lr_waiting, lock);
- unlock_res(res);
- rc = ldlm_run_bl_ast_work(&rpc_list);
- lock_res(res);
+ l_unlock(&res->lr_namespace->ns_lock);
+ rc = ldlm_run_ast_work(res->lr_namespace, &rpc_list);
+ l_lock(&res->lr_namespace->ns_lock);
if (rc == -ERESTART)
GOTO(restart, -ERESTART);
*flags |= LDLM_FL_BLOCK_GRANTED;
} else {
ldlm_resource_unlink_lock(lock);
- ldlm_grant_lock(lock, NULL);
+ ldlm_grant_lock(lock, NULL, 0, 0);
}
RETURN(0);
}
struct ldlm_lock *
ldlm_lock_create(struct ldlm_namespace *ns,
struct lustre_handle *parent_lock_handle, struct ldlm_res_id,
- __u32 type, ldlm_mode_t, ldlm_blocking_callback,
+ ldlm_type_t type, ldlm_mode_t, ldlm_blocking_callback,
ldlm_completion_callback, ldlm_glimpse_callback, void *data,
__u32 lvb_len);
ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *, struct ldlm_lock **,
int ldlm_process_llog_lock(struct ldlm_lock *lock, int *flags, int first_enq,
ldlm_error_t *err);
+
+/* ldlm_inodebits.c */
+int ldlm_process_inodebits_lock(struct ldlm_lock *lock, int *flags,
+ int first_enq, ldlm_error_t *err);
+
/* l_lock.c */
void l_check_ns_lock(struct ldlm_namespace *ns);
void l_check_no_ns_lock(struct ldlm_namespace *ns);
}
}
- spin_lock_init(&cli->cl_qchk_lock);
- cli->cl_qchk_stat = CL_NO_QUOTACHECK;
+ cli->cl_qchk_stat = CL_NOT_QUOTACHECKED;
RETURN(rc);
GOTO(out_ldlm, rc);
ocd = &imp->imp_connect_data;
- if (data)
+ if (data) {
*ocd = *data;
+ imp->imp_connect_flags_orig = data->ocd_connect_flags;
+ }
rc = ptlrpc_connect_import(imp, NULL);
if (rc != 0) {
req->rq_repmsg->handle = conn;
+ /* ownership of this export ref transfers to the request AFTER we
+ * drop any previous reference the request had, but we don't want
+ * that to go to zero before we get our new export reference. */
+ export = class_conn2export(&conn);
+ LASSERT(export != NULL);
+
/* If the client and the server are the same node, we will already
* have an export that really points to the client's DLM export,
* because we have a shared handles table.
if (req->rq_export != NULL)
class_export_put(req->rq_export);
- /* ownership of this export ref transfers to the request */
- export = req->rq_export = class_conn2export(&conn);
- LASSERT(export != NULL);
+ req->rq_export = export;
spin_lock_irqsave(&export->exp_lock, flags);
if (export->exp_conn_cnt >= req->rq_reqmsg->conn_cnt) {
CWARN("%s: all clients recovered, %d MDS "
"orphans deleted\n", obd->obd_name, rc);
else
- CERROR("postrecov failed %d\n", rc);
+ CWARN("postrecov failed %d\n", rc);
}
list_for_each_safe(tmp, n, &obd->obd_delayed_reply_queue) {
target_release_saved_req(req);
}
obd->obd_recovery_end = CURRENT_SECONDS;
- return;
}
static void abort_recovery_queue(struct obd_device *obd)
obd->obd_replayed_requests++;
reset_recovery_timer(obd);
/* bug 1580: decide how to properly sync() in recovery */
- //mds_fsync_super(mds->mds_sb);
+ //mds_fsync_super(obd->u.obt.obt_sb);
class_export_put(req->rq_export);
if (req->rq_reply_state != NULL) {
ptlrpc_rs_decref(req->rq_reply_state);
OBD_ALLOC(reqmsg, req->rq_reqlen);
if (!reqmsg)
LBUG();
- memcpy(saved_req, req, sizeof *saved_req);
+ *saved_req = *req;
memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen);
/* Don't race cleanup */
}
EXPORT_SYMBOL(target_committed_to_req);
+
+#ifdef HAVE_QUOTA_SUPPORT
+int target_handle_qc_callback(struct ptlrpc_request *req)
+{
+ struct obd_quotactl *oqctl;
+ struct client_obd *cli = &req->rq_export->exp_obd->u.cli;
+
+ oqctl = lustre_swab_reqbuf(req, 0, sizeof(*oqctl),
+ lustre_swab_obd_quotactl);
+
+ cli->cl_qchk_stat = oqctl->qc_stat;
+
+ return 0;
+}
+
+int target_handle_dqacq_callback(struct ptlrpc_request *req)
+{
+#ifdef __KERNEL__
+ struct obd_device *obd = req->rq_export->exp_obd;
+ struct obd_device *master_obd;
+ struct lustre_quota_ctxt *qctxt;
+ struct qunit_data *qdata, *rep;
+ int rc = 0, repsize = sizeof(struct qunit_data);
+ ENTRY;
+
+ rc = lustre_pack_reply(req, 1, &repsize, NULL);
+ if (rc) {
+ CERROR("packing reply failed!: rc = %d\n", rc);
+ RETURN(rc);
+ }
+ rep = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*rep));
+ LASSERT(rep);
+
+ qdata = lustre_swab_reqbuf(req, 0, sizeof(*qdata), lustre_swab_qdata);
+ if (qdata == NULL) {
+ CERROR("unpacking request buffer failed!");
+ RETURN(-EPROTO);
+ }
+
+ /* route to the quota master: the local obd's observer's observer */
+ LASSERT(obd->obd_observer && obd->obd_observer->obd_observer);
+ master_obd = obd->obd_observer->obd_observer;
+ qctxt = &master_obd->u.obt.obt_qctxt;
+
+ LASSERT(qctxt->lqc_handler);
+ rc = qctxt->lqc_handler(master_obd, qdata, req->rq_reqmsg->opc);
+ if (rc && rc != -EDQUOT)
+ CDEBUG(rc == -EBUSY ? D_QUOTA : D_ERROR,
+ "dqacq failed! (rc:%d)\n", rc);
+
+ /* the qd_count might be changed in lqc_handler */
+ memcpy(rep, qdata, sizeof(*rep));
+ req->rq_status = rc;
+ rc = ptlrpc_reply(req);
+
+ RETURN(rc);
+#else
+ return 0;
+#endif /* !__KERNEL__ */
+}
+#endif /* HAVE_QUOTA_SUPPORT */
+
+ldlm_mode_t lck_compat_array[] = {
+ [LCK_EX] LCK_COMPAT_EX,
+ [LCK_PW] LCK_COMPAT_PW,
+ [LCK_PR] LCK_COMPAT_PR,
+ [LCK_CW] LCK_COMPAT_CW,
+ [LCK_CR] LCK_COMPAT_CR,
+ [LCK_NL] LCK_COMPAT_NL,
+ [LCK_GROUP] LCK_COMPAT_GROUP
+};
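The table maps each lock mode to the set of modes it is compatible with. This works because the ldlm_mode_t values are one-hot bits, so the LCK_COMPAT_* masks are just OR-ed mode sets and a compatibility test is a single mask-and. A sketch of how such a table is typically consulted (helper name illustrative):

static inline int mode_compat(ldlm_mode_t exist_mode, ldlm_mode_t new_mode)
{
        /* assumes each LCK_* mode is a distinct bit */
        return (lck_compat_array[exist_mode] & new_mode) != 0;
}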
[LDLM_PLAIN] "PLN",
[LDLM_EXTENT] "EXT",
[LDLM_FLOCK] "FLK",
+ [LDLM_IBITS] "IBT",
};
char *ldlm_it2str(int it)
[LDLM_FLOCK] ldlm_process_flock_lock,
//[LDLM_LLOG] ldlm_process_llog_lock,
#endif
+ [LDLM_IBITS] ldlm_process_inodebits_lock,
};
ldlm_processing_policy ldlm_get_processing_policy(struct ldlm_resource *res)
struct obd_export *export = NULL;
l_lock(&ns->ns_lock);
- LDLM_DEBUG(lock, "final lock_put on destroyed lock, freeing");
+ LDLM_DEBUG(lock, "final lock_put on destroyed lock, freeing it.");
LASSERT(lock->l_destroyed);
LASSERT(list_empty(&lock->l_res_link));
void ldlm_lock2desc(struct ldlm_lock *lock, struct ldlm_lock_desc *desc)
{
- ldlm_res2desc(lock->l_resource, &desc->l_resource);
- desc->l_req_mode = lock->l_req_mode;
- desc->l_granted_mode = lock->l_granted_mode;
- memcpy(&desc->l_policy_data, &lock->l_policy_data,
- sizeof(desc->l_policy_data));
+ struct obd_export *exp = lock->l_export?:lock->l_conn_export;
+ /* INODEBITS_INTEROP: If the other side does not support
+ * inodebits, reply with a plain lock descriptor.
+ */
+ if ((lock->l_resource->lr_type == LDLM_IBITS) &&
+ (exp && !(exp->exp_connect_flags & OBD_CONNECT_IBITS))) {
+ struct ldlm_resource res = *lock->l_resource;
+
+ /* Make sure all the right bits are set in this lock we
+ are going to pass to client */
+ LASSERTF(lock->l_policy_data.l_inodebits.bits ==
+ (MDS_INODELOCK_LOOKUP|MDS_INODELOCK_UPDATE),
+ "Inappropriate inode lock bits during "
+ "conversion " LPU64 "\n",
+ lock->l_policy_data.l_inodebits.bits);
+ res.lr_type = LDLM_PLAIN;
+ ldlm_res2desc(&res, &desc->l_resource);
+ /* Convert "new" lock mode to something old client can
+ understand */
+ if ((lock->l_req_mode == LCK_CR) ||
+ (lock->l_req_mode == LCK_CW))
+ desc->l_req_mode = LCK_PR;
+ else
+ desc->l_req_mode = lock->l_req_mode;
+ if ((lock->l_granted_mode == LCK_CR) ||
+ (lock->l_granted_mode == LCK_CW)) {
+ desc->l_granted_mode = LCK_PR;
+ } else {
+ /* We never grant PW/EX locks to clients */
+ LASSERT((lock->l_granted_mode != LCK_PW) &&
+ (lock->l_granted_mode != LCK_EX));
+ desc->l_granted_mode = lock->l_granted_mode;
+ }
+
+ /* We do not copy policy here, because there is no
+ policy for plain locks */
+ } else {
+ ldlm_res2desc(lock->l_resource, &desc->l_resource);
+ desc->l_req_mode = lock->l_req_mode;
+ desc->l_granted_mode = lock->l_granted_mode;
+ desc->l_policy_data = lock->l_policy_data;
+ }
}
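For pre-inodebits clients the descriptor downgrades the newer CR/CW modes to PR, the closest mode an old client understands, relying on the invariant that the server never grants PW/EX to clients. The mapping, pulled out as a standalone sketch (hypothetical helper):

static inline ldlm_mode_t ibits_interop_mode(ldlm_mode_t mode)
{
        /* CR/CW appear only on inodebits metadata locks; PR is the
         * nearest shared mode known to old clients */
        return (mode == LCK_CR || mode == LCK_CW) ? LCK_PR : mode;
}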
void ldlm_add_ast_work_item(struct ldlm_lock *lock, struct ldlm_lock *new,
lock->l_policy_data.l_extent.gid != policy->l_extent.gid)
continue;
+ /* We match only if the existing lock has the same or a wider
+ set of bits, i.e. (existing & requested) == requested. */
+ if (lock->l_resource->lr_type == LDLM_IBITS &&
+ ((lock->l_policy_data.l_inodebits.bits &
+ policy->l_inodebits.bits) !=
+ policy->l_inodebits.bits))
+ continue;
+
if (lock->l_destroyed || (lock->l_flags & LDLM_FL_FAILED))
continue;
if (rc) {
l_lock(&ns->ns_lock);
LDLM_DEBUG(lock, "matched ("LPU64" "LPU64")",
- type == LDLM_PLAIN ? res_id->name[2] :
- policy->l_extent.start,
- type == LDLM_PLAIN ? res_id->name[3] :
- policy->l_extent.end);
+ (type == LDLM_PLAIN || type == LDLM_IBITS) ?
+ res_id->name[2] : policy->l_extent.start,
+ (type == LDLM_PLAIN || type == LDLM_IBITS) ?
+ res_id->name[3] : policy->l_extent.end);
l_unlock(&ns->ns_lock);
} else if (!(flags & LDLM_FL_TEST_LOCK)) {/*less verbose for test-only*/
LDLM_DEBUG_NOLOCK("not matched ns %p type %u mode %u res "
LPU64"/"LPU64" ("LPU64" "LPU64")", ns,
type, mode, res_id->name[0], res_id->name[1],
- type == LDLM_PLAIN ? res_id->name[2] :
- policy->l_extent.start,
- type == LDLM_PLAIN ? res_id->name[3] :
- policy->l_extent.end);
+ (type == LDLM_PLAIN || type == LDLM_IBITS) ?
+ res_id->name[2] :policy->l_extent.start,
+ (type == LDLM_PLAIN || type == LDLM_IBITS) ?
+ res_id->name[3] : policy->l_extent.end);
}
if (old_lock)
LDLM_LOCK_PUT(old_lock);
lock->l_policy_data.l_flock.pid,
lock->l_policy_data.l_flock.start,
lock->l_policy_data.l_flock.end);
+ else if (lock->l_resource->lr_type == LDLM_IBITS)
+ CDEBUG(level, " Bits: "LPX64"\n",
+ lock->l_policy_data.l_inodebits.bits);
}
void ldlm_lock_dump_handle(int level, struct lustre_handle *lockh)
LDLM_ERROR(lock, "%s AST failed (%d): evicting client %s@%s NID %s"
" (%s)", ast_type, rc, lock->l_export->exp_client_uuid.uuid,
- conn->c_remote_uuid.uuid, libcfs_nid2str(conn->c_peer.nid),
+ conn->c_remote_uuid.uuid, libcfs_nid2str(conn->c_peer.nid),
str);
if (obd_dump_on_timeout)
instant_cancel = 1;
req = ptlrpc_prep_req(lock->l_export->exp_imp_reverse,
- LDLM_BL_CALLBACK, 1, &size, NULL);
+ LUSTRE_DLM_VERSION, LDLM_BL_CALLBACK,
+ 1, &size, NULL);
if (req == NULL) {
l_unlock(&lock->l_resource->lr_namespace->ns_lock);
RETURN(-ENOMEM);
}
body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
- memcpy(&body->lock_handle1, &lock->l_remote_handle,
- sizeof(body->lock_handle1));
- memcpy(&body->lock_desc, desc, sizeof(*desc));
+ body->lock_handle1 = lock->l_remote_handle;
+ body->lock_desc = *desc;
body->lock_flags |= (lock->l_flags & LDLM_AST_FLAGS);
LDLM_DEBUG(lock, "server preparing blocking AST");
req->rq_send_state = LUSTRE_IMP_FULL;
req->rq_timeout = ldlm_timeout; /* timeout for initial AST reply */
if (unlikely(instant_cancel)) {
- rc = ptl_send_rpc_nowait(req);
+ rc = ptl_send_rpc(req, 1);
} else {
rc = ptlrpc_queue_wait(req);
}
ptlrpc_req_finished(req);
+ /* If we cancelled the lock, we need to restart ldlm_reprocess_queue */
+ if (!rc && instant_cancel)
+ rc = -ERESTART;
+
RETURN(rc);
}
struct ptlrpc_request *req;
struct timeval granted_time;
long total_enqueue_wait;
- int rc = 0, size[2] = {sizeof(*body)}, buffers = 1;
+ int rc = 0, size[2] = {sizeof(*body)}, buffers = 1, instant_cancel = 0;
ENTRY;
LASSERT(lock != NULL);
up(&lock->l_resource->lr_lvb_sem);
req = ptlrpc_prep_req(lock->l_export->exp_imp_reverse,
- LDLM_CP_CALLBACK, buffers, size, NULL);
+ LUSTRE_DLM_VERSION, LDLM_CP_CALLBACK,
+ buffers, size, NULL);
if (req == NULL)
RETURN(-ENOMEM);
body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
- memcpy(&body->lock_handle1, &lock->l_remote_handle,
- sizeof(body->lock_handle1));
+ body->lock_handle1 = lock->l_remote_handle;
body->lock_flags = flags;
ldlm_lock2desc(lock, &body->lock_desc);
ldlm_handle_enqueue will call ldlm_lock_cancel() still, that
 would not only cancel the lock, but will also remove it from
waiting list */
- if (lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK)
+ if (lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK) {
ldlm_lock_cancel(lock);
- else
+ instant_cancel = 1;
+ } else {
ldlm_add_waiting_lock(lock); /* start the lock-timeout
clock */
+ }
}
l_unlock(&lock->l_resource->lr_namespace->ns_lock);
ptlrpc_req_finished(req);
+ /* If we cancelled the lock, we need to restart ldlm_reprocess_queue */
+ if (!rc && instant_cancel)
+ rc = -ERESTART;
+
RETURN(rc);
}
LASSERT(lock != NULL);
req = ptlrpc_prep_req(lock->l_export->exp_imp_reverse,
- LDLM_GL_CALLBACK, 1, &size, NULL);
+ LUSTRE_DLM_VERSION, LDLM_GL_CALLBACK,
+ 1, &size, NULL);
if (req == NULL)
RETURN(-ENOMEM);
body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof(*body));
- memcpy(&body->lock_handle1, &lock->l_remote_handle,
- sizeof(body->lock_handle1));
+ body->lock_handle1 = lock->l_remote_handle;
ldlm_lock2desc(lock, &body->lock_desc);
down(&lock->l_resource->lr_lvb_sem);
}
+/*
+ * Main server-side entry point into LDLM. This is called by ptlrpc service
+ * threads to carry out client lock enqueueing requests.
+ */
int ldlm_handle_enqueue(struct ptlrpc_request *req,
ldlm_completion_callback completion_callback,
ldlm_blocking_callback blocking_callback,
LDLM_DEBUG_NOLOCK("server-side enqueue handler START");
- dlm_req = lustre_swab_reqbuf (req, 0, sizeof (*dlm_req),
+ dlm_req = lustre_swab_reqbuf (req, MDS_REQ_INTENT_LOCKREQ_OFF,
+ sizeof (*dlm_req),
lustre_swab_ldlm_request);
if (dlm_req == NULL) {
CERROR ("Can't unpack dlm_req\n");
LASSERT(req->rq_export);
- if (flags & LDLM_FL_REPLAY) {
- lock = find_existing_lock(req->rq_export,
- &dlm_req->lock_handle1);
- if (lock != NULL) {
- DEBUG_REQ(D_HA, req, "found existing lock cookie "LPX64,
- lock->l_handle.h_cookie);
- GOTO(existing_lock, rc = 0);
- }
- }
-
if (dlm_req->lock_desc.l_resource.lr_type < LDLM_MIN_TYPE ||
dlm_req->lock_desc.l_resource.lr_type >= LDLM_MAX_TYPE) {
DEBUG_REQ(D_ERROR, req, "invalid lock request type %d\n",
GOTO(out, rc = -EFAULT);
}
+ if (req->rq_export->exp_connect_flags & OBD_CONNECT_IBITS) {
+ if (dlm_req->lock_desc.l_resource.lr_type == LDLM_PLAIN) {
+ DEBUG_REQ(D_ERROR, req,
+ "PLAIN lock request from IBITS client?\n");
+ GOTO(out, rc = -EPROTO);
+ }
+ } else if (dlm_req->lock_desc.l_resource.lr_type == LDLM_IBITS) {
+ DEBUG_REQ(D_ERROR, req,
+ "IBITS lock request from unaware client?\n");
+ GOTO(out, rc = -EPROTO);
+ }
+
+#if 0
+ /* FIXME this makes it impossible to use plain locks -- check against
+ server's *_CONNECT_SUPPORTED flags? (I don't want to use ibits
+ for mgc/mgs) */
+
+ /* INODEBITS_INTEROP: Perform conversion from plain lock to
+ * inodebits lock if client does not support them. */
+ if (!(req->rq_export->exp_connect_flags & OBD_CONNECT_IBITS) &&
+ (dlm_req->lock_desc.l_resource.lr_type == LDLM_PLAIN)) {
+ dlm_req->lock_desc.l_resource.lr_type = LDLM_IBITS;
+ dlm_req->lock_desc.l_policy_data.l_inodebits.bits =
+ MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE;
+ if (dlm_req->lock_desc.l_req_mode == LCK_PR)
+ dlm_req->lock_desc.l_req_mode = LCK_CR;
+ }
+#endif
+
+ if (flags & LDLM_FL_REPLAY) {
+ lock = find_existing_lock(req->rq_export,
+ &dlm_req->lock_handle1);
+ if (lock != NULL) {
+ DEBUG_REQ(D_HA, req, "found existing lock cookie "LPX64,
+ lock->l_handle.h_cookie);
+ GOTO(existing_lock, rc = 0);
+ }
+ }
+
/* The lock's callback data might be set in the policy function */
lock = ldlm_lock_create(obddev->obd_namespace, &dlm_req->lock_handle2,
dlm_req->lock_desc.l_resource.lr_name,
GOTO(out, rc = -ENOMEM);
do_gettimeofday(&lock->l_enqueued_time);
- memcpy(&lock->l_remote_handle, &dlm_req->lock_handle1,
- sizeof(lock->l_remote_handle));
+ lock->l_remote_handle = dlm_req->lock_handle1;
LDLM_DEBUG(lock, "server-side enqueue handler, new lock created");
OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_ENQUEUE_BLOCKED, obd_timeout * 2);
}
if (dlm_req->lock_desc.l_resource.lr_type != LDLM_PLAIN)
- memcpy(&lock->l_policy_data, &dlm_req->lock_desc.l_policy_data,
- sizeof(ldlm_policy_data_t));
+ lock->l_policy_data = dlm_req->lock_desc.l_policy_data;
if (dlm_req->lock_desc.l_resource.lr_type == LDLM_EXTENT)
- memcpy(&lock->l_req_extent, &lock->l_policy_data.l_extent,
- sizeof(lock->l_req_extent));
+ lock->l_req_extent = lock->l_policy_data.l_extent;
err = ldlm_lock_enqueue(obddev->obd_namespace, &lock, cookie, &flags);
if (err)
else if (lock->l_granted_mode == lock->l_req_mode)
ldlm_add_waiting_lock(lock);
}
- if ((dlm_req->lock_desc.l_resource.lr_type == LDLM_PLAIN) &&
+ /* Make sure we never ever grant usual metadata locks to liblustre
+ clients */
+ if ((dlm_req->lock_desc.l_resource.lr_type == LDLM_PLAIN ||
+ dlm_req->lock_desc.l_resource.lr_type == LDLM_IBITS) &&
req->rq_export->exp_libclient) {
if (!(lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK) ||
!(dlm_rep->lock_flags & LDLM_FL_CANCEL_ON_BLOCK)) {
}
if (lock->l_resource->lr_type != LDLM_PLAIN) {
- memcpy(&lock->l_policy_data, &dlm_req->lock_desc.l_policy_data,
- sizeof(lock->l_policy_data));
+ lock->l_policy_data = dlm_req->lock_desc.l_policy_data;
LDLM_DEBUG(lock, "completion AST, new policy data");
}
restart:
LASSERT(res->lr_tmp == NULL);
res->lr_tmp = &rpc_list;
+
rc = ldlm_plain_compat_queue(&res->lr_granted, lock, 1);
rc += ldlm_plain_compat_queue(&res->lr_waiting, lock, 1);
res->lr_tmp = NULL;
lock->l_flags |= *flags & LDLM_INHERIT_FLAGS;
lock->l_lvb_swabber = lvb_swabber;
if (policy != NULL)
- memcpy(&lock->l_policy_data, policy, sizeof(*policy));
+ lock->l_policy_data = *policy;
if (type == LDLM_EXTENT)
- memcpy(&lock->l_req_extent, &policy->l_extent,
- sizeof(policy->l_extent));
+ lock->l_req_extent = policy->l_extent;
err = ldlm_lock_enqueue(ns, &lock, policy, flags);
if (err != ELDLM_OK)
GOTO(out, err);
if (policy != NULL)
- memcpy(policy, &lock->l_policy_data, sizeof(*policy));
+ *policy = lock->l_policy_data;
if ((*flags) & LDLM_FL_LOCK_CHANGED)
- memcpy(&res_id, &lock->l_resource->lr_name, sizeof(res_id));
+ res_id = lock->l_resource->lr_name;
LDLM_DEBUG_NOLOCK("client-side local enqueue handler END (lock %p)",
lock);
struct ldlm_lock *lock;
struct ldlm_request *body;
struct ldlm_reply *reply;
- int rc, size[2] = {sizeof(*body), lvb_len}, req_passed_in = 1;
+ int rc, size[] = {sizeof(*body), lvb_len}, req_passed_in = 1;
int is_replay = *flags & LDLM_FL_REPLAY;
int cleanup_phase = 0;
ENTRY;
ldlm_lock_addref_internal(lock, mode);
ldlm_lock2handle(lock, lockh);
lock->l_lvb_swabber = lvb_swabber;
- if (policy != NULL)
- memcpy(&lock->l_policy_data, policy, sizeof(*policy));
+ if (policy != NULL) {
+ /* INODEBITS_INTEROP: If the server does not support
+ * inodebits, we will request a plain lock in the
+ * descriptor (ldlm_lock2desc() below) but use an
+ * inodebits lock internally with both bits set.
+ */
+ if (type == LDLM_IBITS && !(exp->exp_connect_flags &
+ OBD_CONNECT_IBITS))
+ lock->l_policy_data.l_inodebits.bits =
+ MDS_INODELOCK_LOOKUP |
+ MDS_INODELOCK_UPDATE;
+ else
+ lock->l_policy_data = *policy;
+ }
+
if (type == LDLM_EXTENT)
- memcpy(&lock->l_req_extent, &policy->l_extent,
- sizeof(policy->l_extent));
+ lock->l_req_extent = policy->l_extent;
LDLM_DEBUG(lock, "client-side enqueue START");
}
cleanup_phase = 2;
if (req == NULL) {
- req = ptlrpc_prep_req(class_exp2cliimp(exp), LDLM_ENQUEUE, 1,
- size, NULL);
+ req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_DLM_VERSION,
+ LDLM_ENQUEUE, 1, size, NULL);
if (req == NULL)
GOTO(cleanup, rc = -ENOMEM);
req_passed_in = 0;
- } else if (req->rq_reqmsg->buflens[0] != sizeof(*body))
- LBUG();
+ } else {
+ LASSERTF(req->rq_reqmsg->buflens[MDS_REQ_INTENT_LOCKREQ_OFF] ==
+ sizeof(*body), "buflen[%d] = %d, not %d\n",
+ MDS_REQ_INTENT_LOCKREQ_OFF,
+ req->rq_reqmsg->buflens[MDS_REQ_INTENT_LOCKREQ_OFF],
+ sizeof(*body));
+ }
+
+ lock->l_conn_export = exp;
+ lock->l_export = NULL;
+ lock->l_blocking_ast = blocking;
/* Dump lock data into the request buffer */
- body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
+ body = lustre_msg_buf(req->rq_reqmsg, MDS_REQ_INTENT_LOCKREQ_OFF,
+ sizeof(*body));
ldlm_lock2desc(lock, &body->lock_desc);
body->lock_flags = *flags;
- memcpy(&body->lock_handle1, lockh, sizeof(*lockh));
+ body->lock_handle1 = *lockh;
/* Continue as normal. */
if (!req_passed_in) {
- int buffers = 1;
- if (lvb_len > 0)
- buffers = 2;
size[0] = sizeof(*reply);
- req->rq_replen = lustre_msg_size(buffers, size);
+ req->rq_replen = lustre_msg_size(1 + (lvb_len > 0), size);
}
- lock->l_conn_export = exp;
- lock->l_export = NULL;
- lock->l_blocking_ast = blocking;
-
LDLM_DEBUG(lock, "sending request");
rc = ptlrpc_queue_wait(req);
/* lock enqueued on the server */
cleanup_phase = 1;
- memcpy(&lock->l_remote_handle, &reply->lock_handle,
- sizeof(lock->l_remote_handle));
+ lock->l_remote_handle = reply->lock_handle;
*flags = reply->lock_flags;
lock->l_flags |= reply->lock_flags & LDLM_INHERIT_FLAGS;
LDLM_DEBUG(lock, "client-side enqueue, new resource");
}
if (policy != NULL)
- memcpy(&lock->l_policy_data,
- &reply->lock_desc.l_policy_data,
- sizeof(reply->lock_desc.l_policy_data));
+ if (!(type == LDLM_IBITS && !(exp->exp_connect_flags &
+ OBD_CONNECT_IBITS)))
+ lock->l_policy_data =
+ reply->lock_desc.l_policy_data;
if (type != LDLM_PLAIN)
LDLM_DEBUG(lock,"client-side enqueue, new policy data");
}
LDLM_DEBUG(lock, "client-side convert");
req = ptlrpc_prep_req(class_exp2cliimp(lock->l_conn_export),
- LDLM_CONVERT, 1, &size, NULL);
+ LUSTRE_DLM_VERSION, LDLM_CONVERT, 1, &size, NULL);
if (!req)
GOTO(out, rc = -ENOMEM);
body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
- memcpy(&body->lock_handle1, &lock->l_remote_handle,
- sizeof(body->lock_handle1));
+ body->lock_handle1 = lock->l_remote_handle;
body->lock_desc.l_req_mode = new_mode;
body->lock_flags = *flags;
goto local_cancel;
}
- req = ptlrpc_prep_req(imp, LDLM_CANCEL, 1, &size, NULL);
+ req = ptlrpc_prep_req(imp, LUSTRE_DLM_VERSION, LDLM_CANCEL,
+ 1, &size, NULL);
if (!req)
GOTO(out, rc = -ENOMEM);
req->rq_no_resend = 1;
req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL;
body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
- memcpy(&body->lock_handle1, &lock->l_remote_handle,
- sizeof(body->lock_handle1));
+ body->lock_handle1 = lock->l_remote_handle;
req->rq_replen = lustre_msg_size(0, NULL);
rc = ptlrpc_queue_wait(req);
if (rc == ESTALE) {
- CERROR("client/server (nid %s) out of sync"
- " -- not fatal, flags %d\n",
- libcfs_nid2str(req->rq_import->
- imp_connection->c_peer.nid),
- lock->l_flags);
+ /* For PLAIN (inodebits) locks on liblustre clients
+ this is a valid race between us cancelling a lock
+ from lru and sending notification and server
+ cancelling our lock at the same time */
+#ifndef __KERNEL__
+ if (lock->l_resource->lr_type != LDLM_PLAIN /* IBITS */)
+#endif
+ CERROR("client/server (nid %s) out of sync"
+ " -- not fatal, flags %d\n",
+ libcfs_nid2str(req->rq_import->
+ imp_connection->c_peer.nid),
+ lock->l_flags);
} else if (rc == -ETIMEDOUT) {
ptlrpc_req_finished(req);
GOTO(restart, rc);
list_for_each_entry_safe(lock, next, &ns->ns_unused_list, l_lru) {
LASSERT(!lock->l_readers && !lock->l_writers);
+ /* If we have chosen to cancel this lock voluntarily, we had
+ better send a cancel notification to the server, so that it
+ frees the appropriate state. This might lead to a race where
+ we cancel the lock here while the server is silently
+ cancelling it at the same time. */
+ lock->l_flags &= ~LDLM_FL_CANCEL_ON_BLOCK;
+
/* Setting the CBPENDING flag is a little misleading, but
* prevents an important race; namely, once CBPENDING is set,
* the lock can accumulate no more readers/writers. Since
GOTO (out, rc = -EPROTO);
}
- memcpy(&lock->l_remote_handle, &reply->lock_handle,
- sizeof(lock->l_remote_handle));
+ lock->l_remote_handle = reply->lock_handle;
LDLM_DEBUG(lock, "replayed lock:");
ptlrpc_import_recovery_state_machine(req->rq_import);
out:
int size[2];
int flags;
+ /* If this is reply-less callback lock, we cannot replay it, since
+ * server might have long dropped it, but notification of that event was
+ * lost by network. (and server granted conflicting lock already) */
+ if (lock->l_flags & LDLM_FL_CANCEL_ON_BLOCK) {
+ LDLM_DEBUG(lock, "Not replaying reply-less lock:");
+ ldlm_lock_cancel(lock);
+ RETURN(0);
+ }
/*
* If granted mode matches the requested mode, this lock is granted.
*
flags = LDLM_FL_REPLAY;
size[0] = sizeof(*body);
- req = ptlrpc_prep_req(imp, LDLM_ENQUEUE, 1, size, NULL);
+ req = ptlrpc_prep_req(imp, LUSTRE_DLM_VERSION, LDLM_ENQUEUE,
+ 1, size, NULL);
if (!req)
RETURN(-ENOMEM);
RETURN(NULL);
l_lock(&ns->ns_lock);
- memcpy(&res->lr_name, &name, sizeof(res->lr_name));
+ res->lr_name = name;
res->lr_namespace = ns;
atomic_inc(&ns->ns_refcount);
void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc)
{
desc->lr_type = res->lr_type;
- memcpy(&desc->lr_name, &res->lr_name, sizeof(desc->lr_name));
+ desc->lr_name = res->lr_name;
}
void ldlm_dump_all_namespaces(int level)
$(top_builddir)/lustre/obdclass/liblustreclass.a \
$(top_builddir)/lustre/lvfs/liblvfs.a
+if QUOTA
+QUOTA_LIBS = $(top_builddir)/lustre/quota/libquota.a
+endif
+
LND_LIBS =
if BUILD_USOCKLND
LND_LIBS += $(top_builddir)/lnet/ulnds/socklnd/libsocklnd.a
liblustre_a_SOURCES = llite_lib.c super.c namei.c rw.c file.c dir.c \
llite_lib.h
-liblustre.a : $(LUSTRE_LIBS) $(LND_LIBS) $(LNET_LIBS) $(SYSIO_LIBS)
- sh $(srcdir)/genlib.sh "$(SYSIO)" "$(LIBS)" "$(LND_LIBS)" "$(PTHREAD_LIBS)"
+liblustre.a : $(LUSTRE_LIBS) $(LND_LIBS) $(LNET_LIBS) $(SYSIO_LIBS) $(QUOTA_LIBS)
+ sh $(srcdir)/genlib.sh "$(SYSIO)" "$(LIBS)" "$(LND_LIBS)" "$(PTHREAD_LIBS)" "$(QUOTA_LIBS)" "$(CAP_LIBS)"
EXTRA_DIST = genlib.sh
struct obd_device *obddev = class_exp2obd(sbi->ll_mdc_exp);
struct ldlm_res_id res_id =
{ .name = {st->st_ino, (__u64)lli->lli_st_generation} };
+ ldlm_policy_data_t policy = { .l_inodebits = { MDS_INODELOCK_UPDATE } };
ENTRY;
rc = ldlm_lock_match(obddev->obd_namespace, LDLM_FL_BLOCK_GRANTED,
- &res_id, LDLM_PLAIN, NULL, LCK_PR, &lockh);
+ &res_id, LDLM_IBITS, &policy, LCK_CR, &lockh);
if (!rc) {
llu_prepare_mdc_op_data(&data, inode, NULL, NULL, 0, 0);
- rc = mdc_enqueue(sbi->ll_mdc_exp, LDLM_PLAIN, &it, LCK_PR,
+ rc = mdc_enqueue(sbi->ll_mdc_exp, LDLM_IBITS, &it, LCK_CR,
&data, &lockh, NULL, 0,
ldlm_completion_ast, llu_mdc_blocking_ast,
inode, LDLM_FL_CANCEL_ON_BLOCK);
ptlrpc_req_finished(request);
EXIT;
- ldlm_lock_decref(&lockh, LCK_PR);
+ ldlm_lock_decref(&lockh, LCK_CR);
return rc;
}
return 0;
}
-ssize_t llu_iop_getdirentries(struct inode *ino, char *buf, size_t nbytes,
- _SYSIO_OFF_T *basep)
+ssize_t llu_iop_filldirentries(struct inode *ino, _SYSIO_OFF_T *basep,
+ char *buf, size_t nbytes)
{
struct llu_inode_info *lli = llu_i2info(ino);
struct intnl_stat *st = llu_i2stat(ino);
int rc;
ENTRY;
- oti.oti_thread = request->rq_svc_thread;
/* req is swabbed so this is safe */
body = lustre_msg_buf(request->rq_repmsg, 0, sizeof(*body));
GOTO(out, rc = -EPROTO);
}
- rc = obd_unpackmd(llu_i2obdexp(dir), &lsm, eadata, body->eadatasize);
+ rc = obd_unpackmd(llu_i2obdexp(dir), &lsm, eadata,body->eadatasize);
if (rc < 0) {
CERROR("obd_unpackmd: %d\n", rc);
GOTO(out, rc);
}
}
- rc = obd_destroy(llu_i2obdexp(dir), oa, lsm, &oti);
+ rc = obd_destroy(llu_i2obdexp(dir), oa, lsm, &oti, NULL);
obdo_free(oa);
if (rc)
CERROR("obd destroy objid 0x"LPX64" error %d\n",
}
oa.o_id = lsm->lsm_object_id;
- oa.o_valid = OBD_MD_FLID;
- obdo_from_inode(&oa, inode,
- OBD_MD_FLTYPE|OBD_MD_FLMODE|OBD_MD_FLATIME|
- OBD_MD_FLMTIME | OBD_MD_FLCTIME | OBD_MD_FLFLAGS);
+ oa.o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS;
+ oa.o_flags = flags; /* We don't actually want to copy inode flags */
- oa.o_flags |= flags; /* OBD_MD_FLFLAGS is already set at this point */
+ obdo_from_inode(&oa, inode,
+ OBD_MD_FLTYPE | OBD_MD_FLMODE | OBD_MD_FLATIME |
+ OBD_MD_FLMTIME | OBD_MD_FLCTIME);
obd_adjust_kms(llu_i2obdexp(inode), lsm, st->st_size, 1);
LIBS=$2
LND_LIBS=$3
PTHREAD_LIBS=$4
+QUOTA_LIBS=$5
+CAP_LIBS=$6
if [ ! -f $SYSIO/lib/libsysio.a ]; then
echo "ERROR: $SYSIO/lib/libsysio.a dosen't exist"
fi
build_obj_list ../../lnet/lnet liblnet.a
+if [ "x$QUOTA_LIBS" != "x" ]; then
+ build_obj_list ../quota libquota.a
+fi
+
# create static lib lsupport
rm -f $CWD/liblsupport.a
$AR -cru $CWD/liblsupport.a $ALL_OBJS
gcc -shared -o $CWD/liblustre.so $ALL_OBJS -lpthread -Xlinker -bnoipath ../../libsyscall.so
else
$LD -shared -o $CWD/liblustre.so -init __liblustre_setup_ -fini __liblustre_cleanup_ \
- $ALL_OBJS -lcap $PTHREAD_LIBS
+ $ALL_OBJS $CAP_LIBS $PTHREAD_LIBS
fi
rm -rf $sysio_tmp
#include <file.h>
#endif
-/* env variables */
-#define ENV_LUSTRE_MNTPNT "LIBLUSTRE_MOUNT_POINT"
-#define ENV_LUSTRE_MNTTGT "LIBLUSTRE_MOUNT_TARGET"
-#define ENV_LUSTRE_TIMEOUT "LIBLUSTRE_TIMEOUT"
-#define ENV_LUSTRE_DUMPFILE "LIBLUSTRE_DUMPFILE"
-#define ENV_LUSTRE_DEBUG_MASK "LIBLUSTRE_DEBUG_MASK"
-#define ENV_LUSTRE_DEBUG_SUBSYS "LIBLUSTRE_DEBUG_SUBSYS"
-
/* both sys/queue.h (libsysio require it) and portals/lists.h have definition
* of 'LIST_HEAD'. undef it to suppress warnings
*/
static int lllib_init(void)
{
if (liblustre_init_current("liblustre") ||
- init_obdclass() ||
init_lib_portals() ||
+ init_obdclass() ||
ptlrpc_init() ||
mdc_init() ||
lov_init() ||
perror("init llite driver");
return err;
}
- timeout = getenv(ENV_LUSTRE_TIMEOUT);
+ timeout = getenv("LIBLUSTRE_TIMEOUT");
if (timeout) {
obd_timeout = (unsigned int) strtol(timeout, NULL, 0);
printf("LibLustre: set obd timeout as %u seconds\n",
}
/* debug masks */
- debug_mask = getenv(ENV_LUSTRE_DEBUG_MASK);
+ debug_mask = getenv("LIBLUSTRE_DEBUG_MASK");
if (debug_mask)
libcfs_debug = (unsigned int) strtol(debug_mask, NULL, 0);
- debug_subsys = getenv(ENV_LUSTRE_DEBUG_SUBSYS);
+ debug_subsys = getenv("LIBLUSTRE_DEBUG_SUBSYS");
if (debug_subsys)
libcfs_subsystem_debug =
(unsigned int) strtol(debug_subsys, NULL, 0);
unsigned mntflgs = 0;
int err;
- lustre_path = getenv(ENV_LUSTRE_MNTPNT);
+ lustre_path = getenv("LIBLUSTRE_MOUNT_POINT");
if (!lustre_path) {
lustre_path = "/mnt/lustre";
}
/* mount target */
- target = getenv(ENV_LUSTRE_MNTTGT);
+ target = getenv("LIBLUSTRE_MOUNT_TARGET");
if (!target) {
printf("LibLustre: no mount target specified\n");
exit(1);
 * liblustre. This dilemma led to another hack in
* libsysio/src/file_hack.c FIXME
*/
- _sysio_shutdown();
#ifdef INIT_SYSIO
+ _sysio_shutdown();
cleanup_lib_portals();
LNetFini();
#endif
void *data, int flag);
/* dir.c */
-ssize_t llu_iop_getdirentries(struct inode *ino, char *buf, size_t nbytes,
- _SYSIO_OFF_T *basep);
+ssize_t llu_iop_filldirentries(struct inode *ino, _SYSIO_OFF_T *basep,
+ char *buf, size_t nbytes);
/* ext2 related */
#define EXT2_NAME_LEN (255)
return ldlm_namespace_cleanup;
else if (!strcmp(arg, "ldlm_replay_locks"))
return ldlm_replay_locks;
+#ifdef HAVE_QUOTA_SUPPORT
+ else if (!strcmp(arg, "osc_quota_interface"))
+ return &osc_quota_interface;
+ else if (!strcmp(arg, "mdc_quota_interface"))
+ return &mdc_quota_interface;
+ else if (!strcmp(arg, "lov_quota_interface"))
+ return &lov_quota_interface;
+#endif
else
return NULL;
}
/*
* random number generator stuff
*/
-#ifdef LIBLUSTRE_USE_URANDOM
-static int _rand_dev_fd = -1;
-#endif
#ifdef HAVE_GETHOSTBYNAME
static int get_ipv4_addr()
void liblustre_init_random()
{
- int seed;
+ int _rand_dev_fd;
+ int seed[2];
struct timeval tv;
#ifdef LIBLUSTRE_USE_URANDOM
_rand_dev_fd = syscall(SYS_open, "/dev/urandom", O_RDONLY);
if (_rand_dev_fd >= 0) {
if (syscall(SYS_read, _rand_dev_fd,
- &seed, sizeof(int)) == sizeof(int)) {
- srand(seed);
+ &seed, sizeof(seed)) == sizeof(seed)) {
+ ll_srand(seed[0], seed[1]);
return;
}
syscall(SYS_close, _rand_dev_fd);
- _rand_dev_fd = -1;
}
#endif /* LIBLUSTRE_USE_URANDOM */
#ifdef HAVE_GETHOSTBYNAME
- seed = get_ipv4_addr();
+ seed[0] = get_ipv4_addr();
#else
- seed = _my_pnid;
+ seed[0] = _my_pnid;
#endif
gettimeofday(&tv, NULL);
- srand(tv.tv_sec + tv.tv_usec + getpid() + __swab32(seed));
+ ll_srand(tv.tv_usec | __swab32(getpid()), tv.tv_sec|__swab32(seed[0]));
}
void get_random_bytes(void *buf, int size)
{
- char *p = buf;
+ int *p = buf;
+ int rem;
LASSERT(size >= 0);
-#ifdef LIBLUSTRE_USE_URANDOM
- if (_rand_dev_fd >= 0) {
- if (syscall(SYS_read, _rand_dev_fd, buf, size) == size)
- return;
- syscall(SYS_close, _rand_dev_fd);
- _rand_dev_fd = -1;
+ rem = min((unsigned long)buf & (sizeof(int) - 1), size);
+ if (rem) {
+ int val = ll_rand();
+ memcpy(buf, &val, rem);
+ p = buf + rem;
+ size -= rem;
}
-#endif
- while (size--)
- *p++ = rand();
+ while (size >= sizeof(int)) {
+ *p = ll_rand();
+ size -= sizeof(int);
+ p++;
+ }
+ buf = p;
+ if (size) {
+ int val = ll_rand();
+ memcpy(buf, &val, size);
+ }
}
-
+
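The rewritten get_random_bytes() draws from the ll_rand() stream an int at a time, with memcpy() covering the unaligned head and tail bytes; callers use it exactly as before, e.g. get_rand_dquot() earlier in this patch:

unsigned int rand;

get_random_bytes(&rand, sizeof(rand));
if (!rand)
        rand = 1000;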
static void init_capability(int *res)
{
#ifdef HAVE_LIBCAP
int rc;
ENTRY;
+ rc = libcfs_debug_init(5 * 1024 * 1024);
+ if (rc != 0) {
+ CERROR("libcfs_debug_init() failed: %d\n", rc);
+ RETURN (-ENXIO);
+ }
+
rc = LNetInit();
if (rc != 0) {
- CERROR("LNetInit failed: %d\n", rc);
+ CERROR("LNetInit() failed: %d\n", rc);
RETURN (-ENXIO);
}
RETURN(0);
extern void ptlrpc_exit_portals(void);
void cleanup_lib_portals()
{
+ libcfs_debug_cleanup();
ptlrpc_exit_portals();
}
struct inode *inode = llu_inode_from_lock(lock);
struct llu_inode_info *lli;
struct intnl_stat *st;
+ __u64 bits = lock->l_policy_data.l_inodebits.bits;
/* Invalidate all dentries associated with this inode */
if (inode == NULL)
lli = llu_i2info(inode);
st = llu_i2stat(inode);
- clear_bit(LLI_F_HAVE_MDS_SIZE_LOCK, &lli->lli_flags);
+ if (bits & MDS_INODELOCK_UPDATE)
+ clear_bit(LLI_F_HAVE_MDS_SIZE_LOCK, &lli->lli_flags);
if (lock->l_resource->lr_name.name[0] != st->st_ino ||
lock->l_resource->lr_name.name[1] !=lli->lli_st_generation){
LDLM_ERROR(lock, "data mismatch with ino %llu/%lu",
(long long)st->st_ino,lli->lli_st_generation);
}
- if (S_ISDIR(st->st_mode)) {
+ if (S_ISDIR(st->st_mode) &&
+ (bits & MDS_INODELOCK_UPDATE)) {
CDEBUG(D_INODE, "invalidating inode %llu\n",
(long long)st->st_ino);
dst->o_generation = lli->lli_st_generation;
newvalid |= OBD_MD_FLGENER;
}
+ if (valid & OBD_MD_FLFID) {
+ dst->o_fid = st->st_ino;
+ newvalid |= OBD_MD_FLFID;
+ }
dst->o_valid |= newvalid;
}
struct inode *inode;
struct llu_inode_info *lli;
struct intnl_stat st = {
- st_dev: 0,
+ .st_dev = 0,
#ifndef AUTOMOUNT_FILE_NAME
- st_mode: fid->f_type & S_IFMT,
+ .st_mode = fid->f_type & S_IFMT,
#else
- st_mode: fid->f_type /* all of the bits! */
+ .st_mode = fid->f_type /* all of the bits! */
#endif
- st_uid: geteuid(),
- st_gid: getegid(),
+ .st_uid = geteuid(),
+ .st_gid = getegid(),
};
OBD_ALLOC(lli, sizeof(*lli));
lli->lli_sysio_fid.fid_data = &lli->lli_fid;
lli->lli_sysio_fid.fid_len = sizeof(lli->lli_fid);
-
- memcpy(&lli->lli_fid, fid, sizeof(*fid));
+ lli->lli_fid = *fid;
/* file identifier is needed by functions like _sysio_i_find() */
inode = _sysio_i_new(fs, &lli->lli_sysio_fid,
return inode;
}
-static int llu_have_md_lock(struct inode *inode)
+static int llu_have_md_lock(struct inode *inode, __u64 lockpart)
{
struct llu_sb_info *sbi = llu_i2sbi(inode);
struct llu_inode_info *lli = llu_i2info(inode);
struct lustre_handle lockh;
struct ldlm_res_id res_id = { .name = {0} };
struct obd_device *obddev;
+ ldlm_policy_data_t policy = { .l_inodebits = { lockpart } };
int flags;
ENTRY;
/* FIXME use LDLM_FL_TEST_LOCK instead */
flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
- if (ldlm_lock_match(obddev->obd_namespace, flags, &res_id, LDLM_PLAIN,
- NULL, LCK_PR, &lockh)) {
+ if (ldlm_lock_match(obddev->obd_namespace, flags, &res_id, LDLM_IBITS,
+ &policy, LCK_PR, &lockh)) {
ldlm_lock_decref(&lockh, LCK_PR);
RETURN(1);
}
- if (ldlm_lock_match(obddev->obd_namespace, flags, &res_id, LDLM_PLAIN,
- NULL, LCK_PW, &lockh)) {
+ if (ldlm_lock_match(obddev->obd_namespace, flags, &res_id, LDLM_IBITS,
+ &policy, LCK_PW, &lockh)) {
ldlm_lock_decref(&lockh, LCK_PW);
RETURN(1);
}
RETURN(0);
}
- if (!llu_have_md_lock(inode)) {
+ if (!llu_have_md_lock(inode, MDS_INODELOCK_UPDATE)) {
struct lustre_md md;
struct ptlrpc_request *req = NULL;
struct llu_sb_info *sbi = llu_i2sbi(inode);
obd_set_info(obd->obd_self_export, strlen("async"), "async",
sizeof(async), &async);
- ocd.ocd_connect_flags = OBD_CONNECT_VERSION;
+ ocd.ocd_connect_flags = OBD_CONNECT_IBITS|OBD_CONNECT_VERSION;
+ ocd.ocd_ibits_known = MDS_INODELOCK_FULL;
ocd.ocd_version = LUSTRE_VERSION_CODE;
/* setup mdc */
inop_lookup: llu_iop_lookup,
inop_getattr: llu_iop_getattr,
inop_setattr: llu_iop_setattr,
- inop_getdirentries: llu_iop_getdirentries,
+ /*
+ FIXME doesn't work on 2.6.10fc3?
+ inop_filldirentries: llu_iop_filldirentries,
+ */
inop_mkdir: llu_iop_mkdir_raw,
inop_rmdir: llu_iop_rmdir_raw,
inop_symlink: llu_iop_symlink_raw,
#include "../lutil.h"
-#ifdef CRAY_XT3
+#if CRAY_XT3
int _sysio_lustre_init(void)
{
/*
void *buf_alloc;
int buf_size;
+int opt_verbose;
extern char *lustre_path;
snprintf(path, MAX_PATH_LENGTH, "%s/test_t1", lustre_path);
+ if (opt_verbose)
+ printf("touch+unlink %s\n", path);
+
t_touch(path);
t_unlink(path);
LEAVE();
ENTRY(">1 block(4k) directory readdir");
snprintf(dir, MAX_PATH_LENGTH, "%s/test_t14_dir/", lustre_path);
- t_mkdir(dir);
+ rc = mkdir(dir, 0755);
+ if (rc < 0 && errno != EEXIST) {
+ printf("mkdir(%s) error: %s\n", dir, strerror(errno));
+ exit(1);
+ }
printf("Creating %d files...\n", nfiles);
for (i = 0; i < nfiles; i++) {
sprintf(path, "%s%s%05d", dir, prefix, i);
{
struct stat statbuf;
- if(stat(file, &statbuf) != 0) {
+ if (stat(file, &statbuf) != 0) {
printf("Error stat(%s)\n", file);
return(1);
}
ENTRY("4k aligned i/o sanity");
while (np <= _npages) {
printf("%3d per xfer(total %d)...\t", np, _npages);
+ fflush(stdout);
pages_io(np, offset);
np += np;
}
int result;
ENTRY("truncate() should truncate file to proper length");
- snprintf(file, MAX_PATH_LENGTH, "%s/test_t19_file", lustre_path);
+ snprintf(file, MAX_PATH_LENGTH, "%s/test_t51_file", lustre_path);
for (size = 0; size < T51_NR * T51_STEP; size += T51_STEP) {
t_echo_create(file, "");
if (truncate(file, size) != 0) {
- printf("error truncating file: %s\n", strerror(errno));
+ printf("\nerror truncating file: %s\n",strerror(errno));
return(-1);
}
result = check_file_size(file, size);
t_echo_create(file, "");
fd = open(file, O_RDWR|O_CREAT, (mode_t)0666);
if (fd < 0) {
- printf("error open file: %s\n", strerror(errno));
+ printf("\nerror open file: %s\n", strerror(errno));
return(-1);
}
if (ftruncate(fd, size) != 0) {
- printf("error ftruncating file: %s\n", strerror(errno));
+ printf("\nerror ftruncating file:%s\n",strerror(errno));
return(-1);
}
close(fd);
if (result != 0)
return result;
t_unlink(file);
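+ /* emit roughly 75 progress dots over the whole run */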
+ if (size % (T51_STEP * (T51_NR / 75)) == 0) {
+ printf(".");
+ fflush(stdout);
+ }
}
+ printf("\n");
LEAVE();
}
{"dumpfile", 1, 0, 'd'},
{"only", 1, 0, 'o'},
{"target", 1, 0, 't'},
+ {"verbose", 1, 0, 'v'},
{0, 0, 0, 0}
};
- while ((c = getopt_long(argc, argv, "d:o:t:", long_opts, &opt_index)) != -1) {
+ while ((c = getopt_long(argc, argv, "d:o:t:v", long_opts, &opt_index)) != -1) {
switch (c) {
case 'd':
setenv(ENV_LUSTRE_DUMPFILE, optarg, 1);
case 't':
setenv(ENV_LUSTRE_MNTTGT, optarg, 1);
break;
+ case 'v':
+ opt_verbose++;
+ break;
default:
usage(argv[0]);
break;
__liblustre_setup_();
buf_size = _npages * PAGE_SIZE;
- buf_alloc = malloc(buf_size);
+ if (opt_verbose)
+ printf("allocating %d bytes buffer\n", buf_size);
+ buf_alloc = calloc(1, buf_size);
+ if (buf_alloc == NULL) {
+ fprintf(stderr, "error allocating %d\n", buf_size);
+ exit(-ENOMEM);
+ }
for (test = testlist; test->test != NULL; test++) {
int run = 1, i;
* an AST before calling d_revalidate_it(). The dentry still exists (marked
* INVALID) so d_lookup() matches it, but we have no lock on it (so
* lock_match() fails) and we spin around real_lookup(). */
-static int ll_dcompare(struct dentry *parent, struct qstr *d_name,
- struct qstr *name)
+int ll_dcompare(struct dentry *parent, struct qstr *d_name, struct qstr *name)
{
struct dentry *dchild;
ENTRY;
if (memcmp(d_name->name, name->name, name->len))
RETURN(1);
+ /* XXX: d_name must be the qstr embedded in the dentry */
dchild = container_of(d_name, struct dentry, d_name); /* ugh */
if (dchild->d_flags & DCACHE_LUSTRE_INVALID) {
CDEBUG(D_DENTRY,"INVALID dentry %p not matched, was bug 3784\n",
ll_intent_drop_lock(it);
it->it_magic = 0;
it->it_op_release = 0;
+ /* We are still holding extra reference on a request, need to free it */
+ if (it_disposition(it, DISP_ENQ_COMPLETE))
+ ptlrpc_req_finished(it->d.lustre.it_data);
+
it->d.lustre.it_disposition = 0;
it->d.lustre.it_data = NULL;
EXIT;
continue;
}
+ lock_dentry(dentry);
if (atomic_read(&dentry->d_count) == 0) {
CDEBUG(D_DENTRY, "deleting dentry %.*s (%p) parent %p "
"inode %p\n", dentry->d_name.len,
dentry->d_inode);
dget_locked(dentry);
__d_drop(dentry);
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
- INIT_HLIST_NODE(&dentry->d_hash);
-#endif
+ unlock_dentry(dentry);
spin_unlock(&dcache_lock);
dput(dentry);
goto restart;
"inode %p refc %d\n", dentry->d_name.len,
dentry->d_name.name, dentry, dentry->d_parent,
dentry->d_inode, atomic_read(&dentry->d_count));
- hlist_del_init(&dentry->d_hash);
+ /* actually we don't unhash the dentry, rather just
+ * mark it inaccessible to __d_lookup(), otherwise
+ * sys_getcwd() could return -ENOENT -bzzz */
dentry->d_flags |= DCACHE_LUSTRE_INVALID;
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+ __d_drop(dentry);
hlist_add_head(&dentry->d_hash,
&sbi->ll_orphan_dentry_list);
+#endif
}
+ unlock_dentry(dentry);
}
spin_unlock(&dcache_lock);
EXIT;
/* unfortunately ll_intent_lock may cause a callback and revoke our
* dentry */
spin_lock(&dcache_lock);
- hlist_del_init(&de->d_hash);
+ lock_dentry(de);
+ __d_drop(de);
+ unlock_dentry(de);
__d_rehash(de, 0);
spin_unlock(&dcache_lock);
out:
- /* If we had succesful it lookup on mds, but it happened to be negative,
- we do not free request as it will be reused during lookup (see
- comment in mdc/mdc_locks.c::mdc_intent_lock(). But if
+ /* We do not free the request as it may be reused during a following lookup
+ (see comment in mdc/mdc_locks.c::mdc_intent_lock()); the request will
+ be freed in ll_lookup_it or in ll_intent_release. But if the
request was not completed, we need to free it. (bug 5154) */
- if (req != NULL && (rc == 1 || !it_disposition(it, DISP_ENQ_COMPLETE)))
+ if (req != NULL && !it_disposition(it, DISP_ENQ_COMPLETE))
ptlrpc_req_finished(req);
if (rc == 0) {
ll_unhash_aliases(de->d_inode);
de->d_name.name, de, de->d_parent, de->d_inode,
atomic_read(&de->d_count));
ll_lookup_finish_locks(it, de);
+ lock_dentry(de);
de->d_flags &= ~DCACHE_LUSTRE_INVALID;
+ unlock_dentry(de);
}
RETURN(rc);
}
struct obd_device *obddev = class_exp2obd(ll_i2sbi(dir)->ll_mdc_exp);
struct address_space *mapping = dir->i_mapping;
struct page *page;
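+ /* cached directory pages are guarded by the UPDATE inodebit; CR is
+ * presumably the weakest mode that still conflicts with an MDS
+ * modification and so gets us a cancellation callback */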
+ ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_UPDATE} };
int rc;
rc = ldlm_lock_match(obddev->obd_namespace, LDLM_FL_BLOCK_GRANTED,
- &res_id, LDLM_PLAIN, NULL, LCK_PR, &lockh);
+ &res_id, LDLM_IBITS, &policy, LCK_CR, &lockh);
if (!rc) {
struct lookup_intent it = { .it_op = IT_READDIR };
struct ptlrpc_request *request;
ll_prepare_mdc_op_data(&data, dir, NULL, NULL, 0, 0);
- rc = mdc_enqueue(ll_i2sbi(dir)->ll_mdc_exp, LDLM_PLAIN, &it,
- LCK_PR, &data, &lockh, NULL, 0,
+ rc = mdc_enqueue(ll_i2sbi(dir)->ll_mdc_exp, LDLM_IBITS, &it,
+ LCK_CR, &data, &lockh, NULL, 0,
ldlm_completion_ast, ll_mdc_blocking_ast, dir,
0);
}
out_unlock:
- ldlm_lock_decref(&lockh, LCK_PR);
+ ldlm_lock_decref(&lockh, LCK_CR);
return page;
fail:
RETURN(rc);
}
-#define Q_CONV(tgt, src, member) (tgt)->member = (src)->member
-
-#define QCTLCONV(tgt, src) \
-do { \
- Q_CONV(tgt, src, qc_cmd); \
- Q_CONV(tgt, src, qc_type); \
- Q_CONV(tgt, src, qc_id); \
- Q_CONV(tgt, src, qc_stat); \
- Q_CONV(tgt, src, qc_dqinfo.dqi_bgrace); \
- Q_CONV(tgt, src, qc_dqinfo.dqi_igrace); \
- Q_CONV(tgt, src, qc_dqinfo.dqi_flags); \
- Q_CONV(tgt, src, qc_dqblk.dqb_ihardlimit); \
- Q_CONV(tgt, src, qc_dqblk.dqb_isoftlimit); \
- Q_CONV(tgt, src, qc_dqblk.dqb_curinodes); \
- Q_CONV(tgt, src, qc_dqblk.dqb_bhardlimit); \
- Q_CONV(tgt, src, qc_dqblk.dqb_bsoftlimit); \
- Q_CONV(tgt, src, qc_dqblk.dqb_curspace); \
- Q_CONV(tgt, src, qc_dqblk.dqb_btime); \
- Q_CONV(tgt, src, qc_dqblk.dqb_itime); \
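+/* unlike the old QCTLCONV, copy qc_dqinfo/qc_dqblk as whole structs;
+ * this assumes both quotactl structures share those member layouts */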
+#define QCTL_COPY(out, in) \
+do { \
+ Q_COPY(out, in, qc_cmd); \
+ Q_COPY(out, in, qc_type); \
+ Q_COPY(out, in, qc_id); \
+ Q_COPY(out, in, qc_stat); \
+ Q_COPY(out, in, qc_dqinfo); \
+ Q_COPY(out, in, qc_dqblk); \
} while (0)
static int ll_dir_ioctl(struct inode *inode, struct file *file,
int rc, lmmsize;
ll_inode2fid(&fid, inode);
+
+ rc = ll_get_max_mdsize(sbi, &lmmsize);
+ if (rc)
+ RETURN(rc);
+
rc = mdc_getattr(sbi->ll_mdc_exp, &fid, OBD_MD_FLDIREA,
- obd_size_diskmd(sbi->ll_osc_exp, NULL),
- &request);
+ lmmsize, &request);
if (rc < 0) {
CDEBUG(D_INFO, "mdc_getattr failed: rc = %d\n", rc);
RETURN(rc);
RETURN(PTR_ERR(filename));
ll_inode2fid(&fid, inode);
+
+ rc = ll_get_max_mdsize(sbi, &lmmsize);
+ if (rc)
+ RETURN(rc);
+
rc = mdc_getattr_name(sbi->ll_mdc_exp, &fid, filename,
strlen(filename) + 1, OBD_MD_FLEASIZE,
- obd_size_diskmd(sbi->ll_osc_exp, NULL),
- &request);
+ lmmsize, &request);
if (rc < 0) {
CDEBUG(D_INFO, "mdc_getattr_name failed on %s: rc %d\n",
filename, rc);
if (lmm->lmm_magic == __swab32(LOV_MAGIC)) {
lustre_swab_lov_user_md((struct lov_user_md *)lmm);
lustre_swab_lov_user_md_objects((struct lov_user_md *)lmm);
+ } else if (lmm->lmm_magic == __swab32(LOV_MAGIC_JOIN)) {
+ lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
+ }
+ if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
+ struct lov_stripe_md *lsm;
+ struct lov_user_md_join *lmj;
+ int lmj_size, i, aindex = 0, rc;
+
+ rc = obd_unpackmd(sbi->ll_osc_exp, &lsm, lmm, lmmsize);
+ if (rc < 0)
+ GOTO(out_req, rc = -ENOMEM);
+ rc = obd_checkmd(sbi->ll_osc_exp, sbi->ll_mdc_exp, lsm);
+ if (rc)
+ GOTO(out_free_memmd, rc);
+
+ lmj_size = sizeof(struct lov_user_md_join) +
+ lsm->lsm_stripe_count *
+ sizeof(struct lov_user_ost_data_join);
+ OBD_ALLOC(lmj, lmj_size);
+ if (!lmj)
+ GOTO(out_free_memmd, rc = -ENOMEM);
+
+ memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
+ for (i = 0; i < lsm->lsm_stripe_count; i++) {
+ struct lov_array_info *lai = lsm->lsm_array;
+ if ((lai->lai_ext_array[aindex].le_loi_idx +
+ lai->lai_ext_array[aindex].le_stripe_count) <= i) {
+ aindex++;
+ }
+ CDEBUG(D_INFO, "aindex %d i %d l_extent_start "LPU64
+ " len %d\n", aindex, i,
+ lai->lai_ext_array[aindex].le_start,
+ (int)lai->lai_ext_array[aindex].le_len);
+ lmj->lmm_objects[i].l_extent_start =
+ lai->lai_ext_array[aindex].le_start;
+
+ if ((int)lai->lai_ext_array[aindex].le_len == -1) {
+ lmj->lmm_objects[i].l_extent_end = -1;
+ } else {
+ lmj->lmm_objects[i].l_extent_end =
+ lai->lai_ext_array[aindex].le_start +
+ lai->lai_ext_array[aindex].le_len;
+ }
+ lmj->lmm_objects[i].l_object_id =
+ lsm->lsm_oinfo[i].loi_id;
+ lmj->lmm_objects[i].l_object_gr =
+ lsm->lsm_oinfo[i].loi_gr;
+ lmj->lmm_objects[i].l_ost_gen =
+ lsm->lsm_oinfo[i].loi_ost_gen;
+ lmj->lmm_objects[i].l_ost_idx =
+ lsm->lsm_oinfo[i].loi_ost_idx;
+ }
+ lmm = (struct lov_mds_md *)lmj;
+ lmmsize = lmj_size;
+out_free_memmd:
+ obd_free_memmd(sbi->ll_osc_exp, &lsm);
+ if (rc)
+ GOTO(out_req, rc);
}
-
if (cmd == IOC_MDC_GETFILEINFO) {
struct lov_user_mds_data *lmdp;
lstat_t st = { 0 };
}
rc = copy_to_user(lump, lmm, lmmsize);
+ if (lmm->lmm_magic == LOV_MAGIC_JOIN)
+ OBD_FREE(lmm, lmmsize);
if (rc)
GOTO(out_req, rc = -EFAULT);
bufs[1] = NULL;
}
size = data->ioc_plen1;
- req = ptlrpc_prep_req(sbi2mdc(sbi)->cl_import, LLOG_CATINFO,
+ req = ptlrpc_prep_req(sbi2mdc(sbi)->cl_import,
+ LUSTRE_LOG_VERSION, LLOG_CATINFO,
2, lens, bufs);
if (!req)
GOTO(out_catinfo, rc = -ENOMEM);
RETURN(rc);
}
case OBD_IOC_QUOTACHECK: {
- struct obd_quotactl oqctl = { 0, };
+ struct obd_quotactl *oqctl;
int rc, error = 0;
if (!capable(CAP_SYS_ADMIN))
RETURN(-EPERM);
- oqctl.qc_type = arg;
- rc = obd_quotacheck(sbi->ll_mdc_exp, &oqctl);
+ OBD_ALLOC_PTR(oqctl);
+ if (!oqctl)
+ RETURN(-ENOMEM);
+ oqctl->qc_type = arg;
+ rc = obd_quotacheck(sbi->ll_mdc_exp, oqctl);
if (rc < 0) {
CDEBUG(D_INFO, "mdc_quotacheck failed: rc %d\n", rc);
error = rc;
}
- rc = obd_quotacheck(sbi->ll_osc_exp, &oqctl);
+ rc = obd_quotacheck(sbi->ll_osc_exp, oqctl);
if (rc < 0)
CDEBUG(D_INFO, "osc_quotacheck failed: rc %d\n", rc);
- if (error)
- rc = error;
- return rc;
+ OBD_FREE_PTR(oqctl);
+ return error ?: rc;
}
case OBD_IOC_POLL_QUOTACHECK: {
- struct if_quotacheck check;
+ struct if_quotacheck *check;
int rc;
if (!capable(CAP_SYS_ADMIN))
RETURN(-EPERM);
- rc = obd_iocontrol(cmd, sbi->ll_mdc_exp, 0, (void *)&check,
+ OBD_ALLOC_PTR(check);
+ if (!check)
+ RETURN(-ENOMEM);
+
+ rc = obd_iocontrol(cmd, sbi->ll_mdc_exp, 0, (void *)check,
NULL);
- if (check.stat == -ENODATA)
- rc = check.stat;
if (rc) {
- CDEBUG(D_QUOTA, "mdc ioctl %d failed: rc %d\n",
- cmd, check.stat);
- if (copy_to_user((void *)arg, &check, sizeof(check)))
- RETURN(-EFAULT);
- RETURN(rc);
+ CDEBUG(D_QUOTA, "mdc ioctl %d failed: %d\n", cmd, rc);
+ if (copy_to_user((void *)arg, check, sizeof(*check)))
+ rc = -EFAULT;
+ GOTO(out_poll, rc);
}
- rc = obd_iocontrol(cmd, sbi->ll_osc_exp, 0, (void *)&check,
+ rc = obd_iocontrol(cmd, sbi->ll_osc_exp, 0, (void *)check,
NULL);
- if (check.stat == -ENODATA)
- rc = check.stat;
if (rc) {
- CDEBUG(D_QUOTA, "osc ioctl %d failed: rc %d\n",
- cmd, rc);
- if (copy_to_user((void *)arg, &check, sizeof(check)))
- RETURN(-EFAULT);
- RETURN(rc);
+ CDEBUG(D_QUOTA, "osc ioctl %d failed: %d\n", cmd, rc);
+ if (copy_to_user((void *)arg, check, sizeof(*check)))
+ rc = -EFAULT;
+ GOTO(out_poll, rc);
}
-
- RETURN(0);
+ out_poll:
+ OBD_FREE_PTR(check);
+ RETURN(rc);
}
#if HAVE_QUOTA_SUPPORT
case OBD_IOC_QUOTACTL: {
- struct if_quotactl qctl;
- struct obd_quotactl oqctl;
+ struct if_quotactl *qctl;
+ struct obd_quotactl *oqctl;
- int cmd, type, id, rc = 0, error = 0;
+ int cmd, type, id, rc = 0;
- if (copy_from_user(&qctl, (void *)arg, sizeof(qctl)))
- RETURN(-EFAULT);
+ OBD_ALLOC_PTR(qctl);
+ if (!qctl)
+ RETURN(-ENOMEM);
+
+ OBD_ALLOC_PTR(oqctl);
+ if (!oqctl) {
+ OBD_FREE_PTR(qctl);
+ RETURN(-ENOMEM);
+ }
+ if (copy_from_user(qctl, (void *)arg, sizeof(*qctl)))
+ GOTO(out_quotactl, rc = -EFAULT);
- cmd = qctl.qc_cmd;
- type = qctl.qc_type;
- id = qctl.qc_id;
+ cmd = qctl->qc_cmd;
+ type = qctl->qc_type;
+ id = qctl->qc_id;
switch (cmd) {
case Q_QUOTAON:
case Q_QUOTAOFF:
case Q_SETQUOTA:
case Q_SETINFO:
if (!capable(CAP_SYS_ADMIN))
- RETURN(-EPERM);
+ GOTO(out_quotactl, rc = -EPERM);
break;
case Q_GETQUOTA:
if (((type == USRQUOTA && current->euid != id) ||
(type == GRPQUOTA && !in_egroup_p(id))) &&
!capable(CAP_SYS_ADMIN))
- RETURN(-EPERM);
+ GOTO(out_quotactl, rc = -EPERM);
+
+ /* XXX: dqb_valid is borrowed as a flag to mark that
+ * only mds quota is wanted */
+ if (qctl->qc_dqblk.dqb_valid)
+ qctl->obd_uuid =
+ sbi->ll_mdc_exp->exp_obd->u.cli.
+ cl_import->imp_target_uuid;
break;
case Q_GETINFO:
break;
default:
- RETURN(-EINVAL);
+ CERROR("unsupported quotactl op: %#x\n", cmd);
+ GOTO(out_quotactl, rc = -ENOTTY);
}
- QCTLCONV(&oqctl, &qctl);
+ QCTL_COPY(oqctl, qctl);
- if (qctl.obd_uuid.uuid[0]) {
+ if (qctl->obd_uuid.uuid[0]) {
struct obd_device *obd;
- struct obd_uuid *uuid = &qctl.obd_uuid;
-
- if (cmd == Q_GETINFO)
- oqctl.qc_cmd = Q_GETOINFO;
- else if (cmd == Q_GETQUOTA)
- oqctl.qc_cmd = Q_GETOQUOTA;
- else
- RETURN(-EINVAL);
+ struct obd_uuid *uuid = &qctl->obd_uuid;
- rc = -ENOENT;
obd = class_find_client_notype(uuid,
&sbi->ll_osc_exp->exp_obd->obd_uuid);
if (!obd)
- RETURN(rc);
+ GOTO(out_quotactl, rc = -ENOENT);
+
+ if (cmd == Q_GETINFO)
+ oqctl->qc_cmd = Q_GETOINFO;
+ else if (cmd == Q_GETQUOTA)
+ oqctl->qc_cmd = Q_GETOQUOTA;
+ else
+ GOTO(out_quotactl, rc = -EINVAL);
if (sbi->ll_mdc_exp->exp_obd == obd) {
- rc = obd_quotactl(sbi->ll_mdc_exp, &oqctl);
+ rc = obd_quotactl(sbi->ll_mdc_exp, oqctl);
} else {
int i;
struct obd_export *exp;
continue;
if (exp->exp_obd == obd) {
- rc = obd_quotactl(exp, &oqctl);
+ rc = obd_quotactl(exp, oqctl);
break;
}
}
}
- QCTLCONV(&qctl, &oqctl);
+ oqctl->qc_cmd = cmd;
+ QCTL_COPY(qctl, oqctl);
- if (copy_to_user((void *)arg, &qctl, sizeof(qctl)))
- RETURN(-EFAULT);
+ if (copy_to_user((void *)arg, qctl, sizeof(*qctl)))
+ rc = -EFAULT;
- RETURN(rc);
+ GOTO(out_quotactl, rc);
}
- if (cmd == Q_SETQUOTA)
- oqctl.qc_dqblk.dqb_valid = QIF_LIMITS;
-
- rc = obd_quotactl(sbi->ll_mdc_exp, &oqctl);
- if (rc) {
- if (rc == -EBUSY && cmd == Q_QUOTAON)
- error = rc;
- else
- RETURN(rc);
+ rc = obd_quotactl(sbi->ll_mdc_exp, oqctl);
+ if (rc && rc != -EBUSY && cmd == Q_QUOTAON) {
+ oqctl->qc_cmd = Q_QUOTAOFF;
+ obd_quotactl(sbi->ll_mdc_exp, oqctl);
}
- if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF) {
- rc = obd_quotactl(sbi->ll_osc_exp, &oqctl);
- if (rc) {
- if (rc != -EBUSY && cmd == Q_QUOTAON) {
- oqctl.qc_cmd = Q_QUOTAOFF;
- obd_quotactl(sbi->ll_mdc_exp, &oqctl);
- obd_quotactl(sbi->ll_osc_exp, &oqctl);
- }
- RETURN(rc);
- }
- }
-
- QCTLCONV(&qctl, &oqctl);
+ QCTL_COPY(qctl, oqctl);
- if (copy_to_user((void *)arg, &qctl, sizeof(qctl)))
- return -EFAULT;
-
- RETURN(rc?:error);
+ if (copy_to_user((void *)arg, qctl, sizeof(*qctl)))
+ rc = -EFAULT;
+ out_quotactl:
+ OBD_FREE_PTR(qctl);
+ OBD_FREE_PTR(oqctl);
+ RETURN(rc);
}
#endif /* HAVE_QUOTA_SUPPORT */
case OBD_IOC_GETNAME: {
ll_prepare_mdc_op_data(&data, parent->d_inode, NULL, name, len, O_RDWR);
- rc = mdc_enqueue(sbi->ll_mdc_exp, LDLM_PLAIN, itp, LCK_PW, &data,
+ rc = mdc_enqueue(sbi->ll_mdc_exp, LDLM_IBITS, itp, LCK_PW, &data,
&lockh, lmm, lmmsize, ldlm_completion_ast,
ll_mdc_blocking_ast, NULL, 0);
if (rc < 0)
int rc;
ENTRY;
- LASSERT(lockh->cookie == 0);
+ LASSERT(!lustre_handle_is_used(lockh));
LASSERT(lsm != NULL);
/* don't drop the mmapped file to LRU */
oa->o_id = ucreatp.lrc_id;
oa->o_nlink = ucreatp.lrc_ost_idx;
- oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS;
oa->o_flags |= OBD_FL_RECREATE_OBJS;
+ oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS;
obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
- OBD_MD_FLMTIME | OBD_MD_FLCTIME);
+ OBD_MD_FLMTIME | OBD_MD_FLCTIME);
oti.oti_objid = NULL;
memcpy(lsm2, lsm, lsm_size);
rc = mdc_req2lustre_md(req, 1, exp, &md);
if (rc)
GOTO(out, rc);
- ll_update_inode(f->f_dentry->d_inode, md.body, md.lsm);
+ ll_update_inode(f->f_dentry->d_inode, &md);
rc = ll_local_open(f, &oit, fd);
if (rc)
RETURN(0);
}
+static int join_sanity_check(struct inode *head, struct inode *tail)
+{
+ ENTRY;
+ if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
+ CERROR("server do not support join \n");
+ RETURN(-EINVAL);
+ }
+ if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
+ CERROR("tail ino %lu and ino head %lu must be regular\n",
+ head->i_ino, tail->i_ino);
+ RETURN(-EINVAL);
+ }
+ if (head->i_ino == tail->i_ino) {
+ CERROR("file %lu can not be joined to itself \n", head->i_ino);
+ RETURN(-EINVAL);
+ }
+ if (head->i_size % JOIN_FILE_ALIGN) {
+ CERROR("hsize" LPU64 " must be times of 64K\n",
+ head->i_size);
+ RETURN(-EINVAL);
+ }
+ RETURN(0);
+}
+
+static int join_file(struct inode *head_inode, struct file *head_filp,
+ struct file *tail_filp)
+{
+ struct inode *tail_inode, *tail_parent;
+ struct dentry *tail_dentry;
+ struct lookup_intent oit = {.it_op = IT_OPEN,
+ .it_flags = head_filp->f_flags | O_JOIN_FILE};
+ struct ptlrpc_request *req = NULL;
+ struct ll_file_data *fd;
+ struct lustre_handle lockh;
+ struct mdc_op_data *op_data;
+ __u32 hsize = head_inode->i_size >> 32;
+ __u32 tsize = head_inode->i_size;
+ struct file *f;
+ int rc;
+ ENTRY;
+
+ tail_dentry = tail_filp->f_dentry;
+ tail_inode = tail_dentry->d_inode;
+ tail_parent = tail_dentry->d_parent->d_inode;
+
+ fd = ll_file_data_get();
+ if (fd == NULL)
+ RETURN(-ENOMEM);
+
+ OBD_ALLOC_PTR(op_data);
+ if (op_data == NULL) {
+ ll_file_data_put(fd);
+ RETURN(-ENOMEM);
+ }
+
+ f = get_empty_filp();
+ if (f == NULL)
+ GOTO(out, rc = -ENOMEM);
+
+ f->f_dentry = head_filp->f_dentry;
+ f->f_vfsmnt = head_filp->f_vfsmnt;
+
+ ll_prepare_mdc_op_data(op_data, head_inode, tail_parent,
+ tail_dentry->d_name.name,
+ tail_dentry->d_name.len, 0);
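+ /* the 64-bit head size travels split into two __u32 halves: tsize
+ * goes in the lmm buffer slot, hsize in the blocking-ast data slot */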
+ rc = mdc_enqueue(ll_i2mdcexp(head_inode), LDLM_IBITS, &oit, LCK_PW,
+ op_data, &lockh, &tsize, 0, ldlm_completion_ast,
+ ll_mdc_blocking_ast, &hsize, 0);
+
+ if (rc < 0)
+ GOTO(out, rc);
+
+ req = oit.d.lustre.it_data;
+ rc = oit.d.lustre.it_status;
+
+ if (rc < 0)
+ GOTO(out, rc);
+
+ rc = ll_local_open(f, &oit, fd);
+ LASSERTF(rc == 0, "rc = %d\n", rc);
+
+ fd = NULL;
+ ll_intent_release(&oit);
+
+ rc = ll_file_release(f->f_dentry->d_inode, f);
+out:
+ if (op_data)
+ OBD_FREE_PTR(op_data);
+ if (f)
+ put_filp(f);
+ ll_file_data_put(fd);
+ ptlrpc_req_finished(req);
+ RETURN(rc);
+}
+
+static int ll_file_join(struct inode *head, struct file *filp,
+ char *filename_tail)
+{
+ struct inode *tail = NULL, *first, *second;
+ struct dentry *tail_dentry;
+ struct file *tail_filp, *first_filp, *second_filp;
+ struct ll_lock_tree first_tree, second_tree;
+ struct ll_lock_tree_node *first_node, *second_node;
+ struct ll_inode_info *hlli = ll_i2info(head), *tlli;
+ int rc = 0, cleanup_phase = 0;
+ ENTRY;
+
+ CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
+ head->i_ino, head->i_generation, head, filename_tail);
+
+ tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
+ if (IS_ERR(tail_filp)) {
+ CERROR("Can not open tail file %s", filename_tail);
+ rc = PTR_ERR(tail_filp);
+ GOTO(cleanup, rc);
+ }
+ tail = igrab(tail_filp->f_dentry->d_inode);
+
+ tlli = ll_i2info(tail);
+ tail_dentry = tail_filp->f_dentry;
+ LASSERT(tail_dentry);
+ cleanup_phase = 1;
+
+ /* reorder the inodes to establish a consistent lock ordering */
+ first = head->i_ino > tail->i_ino ? head : tail;
+ second = head->i_ino > tail->i_ino ? tail : head;
+ first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
+ second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
+
+ CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
+ head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
+ first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
+ if (IS_ERR(first_node)) {
+ rc = PTR_ERR(first_node);
+ GOTO(cleanup, rc);
+ }
+ first_tree.lt_fd = first_filp->private_data;
+ rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
+ if (rc != 0)
+ GOTO(cleanup, rc);
+ cleanup_phase = 2;
+
+ second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
+ if (IS_ERR(second_node)) {
+ rc = PTR_ERR(second_node);
+ GOTO(cleanup, rc);
+ }
+ second_tree.lt_fd = second_filp->private_data;
+ rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
+ if (rc != 0)
+ GOTO(cleanup, rc);
+ cleanup_phase = 3;
+
+ rc = join_sanity_check(head, tail);
+ if (rc)
+ GOTO(cleanup, rc);
+
+ rc = join_file(head, filp, tail_filp);
+ if (rc)
+ GOTO(cleanup, rc);
+cleanup:
+ switch (cleanup_phase) {
+ case 3:
+ ll_tree_unlock(&second_tree);
+ obd_cancel_unused(ll_i2obdexp(second),
+ ll_i2info(second)->lli_smd, 0, NULL);
+ case 2:
+ ll_tree_unlock(&first_tree);
+ obd_cancel_unused(ll_i2obdexp(first),
+ ll_i2info(first)->lli_smd, 0, NULL);
+ case 1:
+ filp_close(tail_filp, 0);
+ if (tail)
+ iput(tail);
+ if (head && rc == 0) {
+ obd_free_memmd(ll_i2sbi(head)->ll_osc_exp,
+ &hlli->lli_smd);
+ hlli->lli_smd = NULL;
+ }
+ case 0:
+ break;
+ default:
+ CERROR("invalid cleanup_phase %d\n", cleanup_phase);
+ LBUG();
+ }
+ RETURN(rc);
+}
int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
unsigned long arg)
{
case EXT3_IOC_GETVERSION_OLD:
case EXT3_IOC_GETVERSION:
RETURN(put_user(inode->i_generation, (int *) arg));
+ case LL_IOC_JOIN: {
+ char *ftail;
+ int rc;
+
+ ftail = getname((const char *)arg);
+ if (IS_ERR(ftail))
+ RETURN(PTR_ERR(ftail));
+ rc = ll_file_join(inode, file, ftail);
+ putname(ftail);
+ RETURN(rc);
+ }
case LL_IOC_GROUP_LOCK:
RETURN(ll_get_grouplock(inode, file, arg));
case LL_IOC_GROUP_UNLOCK:
struct lustre_handle lockh;
struct ldlm_res_id res_id = { .name = {0} };
struct obd_device *obddev;
+ ldlm_policy_data_t policy = { .l_inodebits = {MDS_INODELOCK_UPDATE}};
int flags;
ENTRY;
CDEBUG(D_INFO, "trying to match res "LPU64"\n", res_id.name[0]);
- /* FIXME use LDLM_FL_TEST_LOCK instead */
- flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
- if (ldlm_lock_match(obddev->obd_namespace, flags, &res_id, LDLM_PLAIN,
- NULL, LCK_PR, &lockh)) {
- ldlm_lock_decref(&lockh, LCK_PR);
+ flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
+ if (ldlm_lock_match(obddev->obd_namespace, flags, &res_id, LDLM_IBITS,
+ &policy, LCK_CR|LCK_CW|LCK_PR, &lockh)) {
RETURN(1);
}
- if (ldlm_lock_match(obddev->obd_namespace, flags, &res_id, LDLM_PLAIN,
- NULL, LCK_PW, &lockh)) {
- ldlm_lock_decref(&lockh, LCK_PW);
- RETURN(1);
- }
RETURN(0);
}
struct ptlrpc_request *req = NULL;
struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
struct ll_fid fid;
- unsigned long valid = OBD_MD_FLGETATTR;
+ obd_valid valid = OBD_MD_FLGETATTR;
int ealen = 0;
if (S_ISREG(inode->i_mode)) {
- ealen = obd_size_diskmd(sbi->ll_osc_exp, NULL);
- valid |= OBD_MD_FLEASIZE;
+ rc = ll_get_max_mdsize(sbi, &ealen);
+ if (rc)
+ RETURN(rc);
+ valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
}
ll_inode2fid(&fid, inode);
rc = mdc_getattr(sbi->ll_mdc_exp, &fid, valid, ealen, &req);
}
#endif
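+/* ACL callback for generic_permission(): returns -EAGAIN when no ACL is
+ * cached on the inode so the caller falls back to the mode bits */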
+static
+int lustre_check_acl(struct inode *inode, int mask)
+{
+#ifdef CONFIG_FS_POSIX_ACL
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct posix_acl *acl;
+ int rc;
+ ENTRY;
+
+ spin_lock(&lli->lli_lock);
+ acl = posix_acl_dup(lli->lli_posix_acl);
+ spin_unlock(&lli->lli_lock);
+
+ if (!acl)
+ RETURN(-EAGAIN);
+
+ rc = posix_acl_permission(inode, acl, mask);
+ posix_acl_release(acl);
+
+ RETURN(rc);
+#else
+ return -EAGAIN;
+#endif
+}
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
+int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
+{
+ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
+ inode->i_ino, inode->i_generation, inode, mask);
+ return generic_permission(inode, mask, lustre_check_acl);
+}
+#else
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
+int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
+#else
+int ll_inode_permission(struct inode *inode, int mask)
+#endif
+{
+ int mode = inode->i_mode;
+ int rc;
+
+ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
+ inode->i_ino, inode->i_generation, inode, mask);
+
+ if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
+ (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
+ return -EROFS;
+ if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
+ return -EACCES;
+ if (current->fsuid == inode->i_uid) {
+ mode >>= 6;
+ } else if (1) {
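+ /* this unconditional branch keeps check_groups reachable by goto:
+ * try the "other" bits first, then the ACL, then fall back */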
+ if (((mode >> 3) & mask & S_IRWXO) != mask)
+ goto check_groups;
+ rc = lustre_check_acl(inode, mask);
+ if (rc == -EAGAIN)
+ goto check_groups;
+ if (rc == -EACCES)
+ goto check_capabilities;
+ return rc;
+ } else {
+check_groups:
+ if (in_group_p(inode->i_gid))
+ mode >>= 3;
+ }
+ if ((mode & mask & S_IRWXO) == mask)
+ return 0;
+
+check_capabilities:
+ if (!(mask & MAY_EXEC) ||
+ (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
+ if (capable(CAP_DAC_OVERRIDE))
+ return 0;
+
+ if (capable(CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
+ (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
+ return 0;
+ return -EACCES;
+}
+#endif
+
struct file_operations ll_file_operations = {
.read = ll_file_read,
.write = ll_file_write,
#else
.revalidate_it = ll_inode_revalidate_it,
#endif
+ .permission = ll_inode_permission,
.setxattr = ll_setxattr,
.getxattr = ll_getxattr,
.listxattr = ll_listxattr,
#ifndef LLITE_INTERNAL_H
#define LLITE_INTERNAL_H
+#ifdef CONFIG_FS_POSIX_ACL
+# include <linux/fs.h>
+# include <linux/xattr_acl.h>
+#endif
+
#include <linux/lustre_debug.h>
#include <linux/lustre_version.h>
#include <linux/lustre_disk.h> /* for s2sbi */
struct file_operations *ll_save_ffop;
struct file_operations *ll_save_wfop;
struct file_operations *ll_save_wrfop;
+
+ struct posix_acl *lli_posix_acl;
+
+ struct list_head lli_dead_list;
+
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
struct inode lli_vfs_inode;
#endif
};
/* flags for sbi->ll_flags */
-#define LL_SBI_NOLCK 0x1 /* DLM locking disabled (directio-only) */
-#define LL_SBI_CHECKSUM 0x2 /* checksum each page as it's written */
-#define LL_SBI_FLOCK 0x4
-#define LL_SBI_USER_XATTR 0x8 /* support user xattr */
+#define LL_SBI_NOLCK 0x01 /* DLM locking disabled (directio-only) */
+#define LL_SBI_CHECKSUM 0x02 /* checksum each page as it's written */
+#define LL_SBI_FLOCK 0x04
+#define LL_SBI_USER_XATTR 0x08 /* support user xattr */
+#define LL_SBI_ACL 0x10 /* support ACL */
+#define LL_SBI_JOIN 0x20 /* support JOIN */
struct ll_sb_info {
struct list_head ll_list;
struct ll_ra_info ll_ra_info;
unsigned int ll_namelen;
struct file_operations *ll_fop;
+
+ struct list_head ll_deathrow; /* inodes to be destroyed (b1443) */
+ spinlock_t ll_deathrow_lock;
};
struct ll_ra_read {
extern struct proc_dir_entry *proc_lustre_fs_root;
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-# define hlist_del_init list_del_init
-#endif
-
static inline struct inode *ll_info2i(struct ll_inode_info *lli)
{
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
unsigned int llap_write_queued:1,
llap_defer_uptodate:1,
llap_origin:3,
- llap_ra_used:1;
+ llap_ra_used:1,
+ llap_ignore_quota:1;
void *llap_cookie;
struct page *llap_page;
struct list_head llap_pending_write;
struct lookup_intent *it, struct kstat *stat);
#endif
struct ll_file_data *ll_file_data_get(void);
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
+int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd);
+#else
+int ll_inode_permission(struct inode *inode, int mask);
+#endif
/* llite/dcache.c */
void ll_intent_drop_lock(struct lookup_intent *);
void ll_unhash_aliases(struct inode *);
void ll_frob_intent(struct lookup_intent **itp, struct lookup_intent *deft);
void ll_lookup_finish_locks(struct lookup_intent *it, struct dentry *dentry);
+int ll_dcompare(struct dentry *parent, struct qstr *d_name, struct qstr *name);
/* llite/llite_lib.c */
int ll_statfs(struct super_block *sb, struct kstatfs *sfs);
int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs,
unsigned long maxage);
-void ll_update_inode(struct inode *inode, struct mds_body *body,
- struct lov_stripe_md *lsm);
+void ll_update_inode(struct inode *inode, struct lustre_md *md);
void ll_read_inode2(struct inode *inode, void *opaque);
int ll_iocontrol(struct inode *inode, struct file *file,
unsigned int cmd, unsigned long arg);
void lustre_dump_inode(struct inode *);
struct ll_async_page *llite_pglist_next_llap(struct ll_sb_info *sbi,
struct list_head *list);
+int ll_get_max_mdsize(struct ll_sb_info *sbi, int *max_mdsize);
/* llite/llite_nfs.c */
__u32 get_uuid2int(const char *name, int len);
/* llite/xattr.c */
int ll_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags);
-int ll_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size);
-int ll_listxattr(struct dentry *dentry, char *buffer, size_t size);
+ssize_t ll_getxattr(struct dentry *dentry, const char *name,
+ void *buffer, size_t size);
+ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size);
int ll_removexattr(struct dentry *dentry, const char *name);
#endif /* LLITE_INTERNAL_H */
spin_lock(&ll_sb_lock);
list_add_tail(&sbi->ll_list, &ll_super_blocks);
spin_unlock(&ll_sb_lock);
+
+ INIT_LIST_HEAD(&sbi->ll_deathrow);
+ spin_lock_init(&sbi->ll_deathrow_lock);
RETURN(sbi);
}
EXIT;
}
+static struct dentry_operations ll_d_root_ops = {
+ .d_compare = ll_dcompare,
+};
+
int client_common_fill_super(struct super_block *sb, char *mdc, char *osc)
{
struct inode *root = 0;
CERROR("could not register mount in /proc/lustre");
}
+ /* indicate that inodebits locking is supported by this client */
+ data->ocd_connect_flags |= OBD_CONNECT_IBITS;
+ data->ocd_ibits_known = MDS_INODELOCK_FULL;
+
if (sb->s_flags & MS_RDONLY)
data->ocd_connect_flags |= OBD_CONNECT_RDONLY;
+ if (sbi->ll_flags & LL_SBI_USER_XATTR)
+ data->ocd_connect_flags |= OBD_CONNECT_XATTR;
+ data->ocd_connect_flags |= OBD_CONNECT_ACL | OBD_CONNECT_JOIN;
if (sbi->ll_flags & LL_SBI_FLOCK) {
sbi->ll_fop = &ll_file_operations_flock;
data->ocd_connect_flags |= OBD_CONNECT_VERSION;
data->ocd_version = LUSTRE_VERSION_CODE;
+
err = obd_connect(&mdc_conn, obd, &sbi->ll_sb_uuid, data);
if (err == -EBUSY) {
CERROR("An MDT (mdc %s) is performing recovery, of which this"
if (err)
GOTO(out_mdc, err);
+ /* async connect is surely finished by now */
+ *data = class_exp2cliimp(sbi->ll_mdc_exp)->imp_connect_data;
+
LASSERT(osfs.os_bsize);
sb->s_blocksize = osfs.os_bsize;
sb->s_blocksize_bits = log2(osfs.os_bsize);
sb->s_maxbytes = PAGE_CACHE_MAXBYTES;
sbi->ll_namelen = osfs.os_namelen;
+ if ((sbi->ll_flags & LL_SBI_USER_XATTR) &&
+ !(data->ocd_connect_flags & OBD_CONNECT_XATTR)) {
+ LCONSOLE_INFO("Disabling user_xattr feature because "
+ "it is not supported on the server\n");
+ sbi->ll_flags &= ~LL_SBI_USER_XATTR;
+ }
+
+ if (data->ocd_connect_flags & OBD_CONNECT_ACL) {
+#ifdef MS_POSIXACL
+ sb->s_flags |= MS_POSIXACL;
+#endif
+ sbi->ll_flags |= LL_SBI_ACL;
+ } else
+ sbi->ll_flags &= ~LL_SBI_ACL;
+
+ if (data->ocd_connect_flags & OBD_CONNECT_JOIN)
+ sbi->ll_flags |= LL_SBI_JOIN;
+
#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0))
/* We set sb->s_dev equal on all lustre clients in order to support
* NFS export clustering. NFSD requires that the FSID be the same
/* make root inode
* XXX: move this to after cbd setup? */
err = mdc_getattr(sbi->ll_mdc_exp, &rootfid,
- OBD_MD_FLGETATTR | OBD_MD_FLBLOCKS, 0, &request);
+ OBD_MD_FLGETATTR | OBD_MD_FLBLOCKS |
+ (sbi->ll_flags & LL_SBI_ACL ? OBD_MD_FLACL : 0),
+ 0, &request);
if (err) {
CERROR("mdc_getattr failed for root: rc = %d\n", err);
GOTO(out_osc, err);
ptlrpc_req_finished(request);
if (root == NULL || is_bad_inode(root)) {
- if (md.lsm != NULL)
- obd_free_memmd(sbi->ll_osc_exp, &md.lsm);
+ mdc_free_lustre_md(sbi->ll_osc_exp, &md);
CERROR("lustre_lite: bad iget4 for root\n");
GOTO(out_root, err = -EBADF);
}
sb->s_root = d_alloc_root(root);
if (data != NULL)
OBD_FREE(data, sizeof(*data));
+ sb->s_root->d_op = &ll_d_root_ops;
RETURN(err);
out_root:
RETURN(err);
}
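+/* ask the MDC for the largest EA size seen on this mount so callers can
+ * size getattr buffers for the widest striping (e.g. joined files) */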
+int ll_get_max_mdsize(struct ll_sb_info *sbi, int *lmmsize)
+{
+ int size, rc;
+
+ *lmmsize = obd_size_diskmd(sbi->ll_osc_exp, NULL);
+ size = sizeof(int);
+ rc = obd_get_info(sbi->ll_mdc_exp, strlen("max_easize"), "max_easize",
+ &size, lmmsize);
+ if (rc)
+ CERROR("Get max mdsize error rc %d \n", rc);
+
+ RETURN(rc);
+}
+
void ll_dump_inode(struct inode *inode)
{
struct list_head *tmp;
}
}
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+void lustre_throw_orphan_dentries(struct super_block *sb)
+{
+ struct hlist_node *tmp, *next;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+
+ /* Do this to get rid of orphaned dentries. That is not really the right way. */
+ hlist_for_each_safe(tmp, next, &sbi->ll_orphan_dentry_list) {
+ struct dentry *dentry = hlist_entry(tmp, struct dentry, d_hash);
+ CWARN("found orphan dentry %.*s (%p->%p) at unmount, dumping "
+ "before and after shrink_dcache_parent\n",
+ dentry->d_name.len, dentry->d_name.name, dentry, next);
+ lustre_dump_dentry(dentry, 1);
+ shrink_dcache_parent(dentry);
+ lustre_dump_dentry(dentry, 1);
+ }
+}
+#else
+#define lustre_throw_orphan_dentries(sb)
+#endif
+
+static void prune_deathrow(struct ll_sb_info *sbi, int try)
+{
+ LIST_HEAD(throw_away);
+ int locked = 0;
+ ENTRY;
+
+ if (try) {
+ locked = spin_trylock(&sbi->ll_deathrow_lock);
+ } else {
+ spin_lock(&sbi->ll_deathrow_lock);
+ locked = 1;
+ }
+
+ if (!locked) {
+ EXIT;
+ return;
+ }
+
+ list_splice_init(&sbi->ll_deathrow, &throw_away);
+ spin_unlock(&sbi->ll_deathrow_lock);
+
+ while (!list_empty(&throw_away)) {
+ struct ll_inode_info *lli;
+ struct inode *inode;
+
+ lli = list_entry(throw_away.next, struct ll_inode_info,
+ lli_dead_list);
+ list_del_init(&lli->lli_dead_list);
+
+ inode = ll_info2i(lli);
+ d_prune_aliases(inode);
+
+ CDEBUG(D_INODE, "prune duplicate inode %p inum %lu count %u\n",
+ inode, inode->i_ino, atomic_read(&inode->i_count));
+ iput(inode);
+ }
+ EXIT;
+}
+
void client_common_put_super(struct super_block *sb)
{
struct ll_sb_info *sbi = ll_s2sbi(sb);
- struct hlist_node *tmp, *next;
ENTRY;
ll_close_thread_shutdown(sbi->ll_lcq);
+ /* destroy inodes in deathrow */
+ prune_deathrow(sbi, 0);
+
list_del(&sbi->ll_conn_chain);
obd_disconnect(sbi->ll_osc_exp);
obd_disconnect(sbi->ll_mdc_exp);
- // We do this to get rid of orphaned dentries. That is not really trw.
- hlist_for_each_safe(tmp, next, &sbi->ll_orphan_dentry_list) {
- struct dentry *dentry = hlist_entry(tmp, struct dentry, d_hash);
- CWARN("found orphan dentry %.*s (%p->%p) at unmount, dumping "
- "before and after shrink_dcache_parent\n",
- dentry->d_name.len, dentry->d_name.name, dentry, next);
- lustre_dump_dentry(dentry, 1);
- shrink_dcache_parent(dentry);
- lustre_dump_dentry(dentry, 1);
- }
+ lustre_throw_orphan_dentries(sb);
+
EXIT;
}
ENTRY;
CDEBUG(D_SUPER, "option: %s, data %s\n", opt, data);
- if (strncmp(opt, data, strlen(opt)))
+ if (strncmp(opt, data, strlen(opt)) != 0)
RETURN(0);
else
RETURN(fl);
*flags &= ~tmp;
continue;
}
+ tmp = ll_set_opt("acl", this_char, LL_SBI_ACL);
+ if (tmp) {
+ /* Ignore deprecated mount option. The client will
+ * always try to mount with ACL support, whether this
+ * is used depends on whether server supports it. */
+ continue;
+ }
+ tmp = ll_set_opt("noacl", this_char, LL_SBI_ACL);
+ if (tmp) {
+ continue;
+ }
}
CERROR("flags %#x\n", *flags);
spin_lock_init(&lli->lli_lock);
INIT_LIST_HEAD(&lli->lli_pending_write_llaps);
lli->lli_inode_magic = LLI_INODE_MAGIC;
+ INIT_LIST_HEAD(&lli->lli_dead_list);
}
int ll_fill_super(struct super_block *sb)
strlen(lli->lli_symlink_name) + 1);
lli->lli_symlink_name = NULL;
}
+
+#ifdef CONFIG_FS_POSIX_ACL
+ if (lli->lli_posix_acl) {
+ LASSERT(atomic_read(&lli->lli_posix_acl->a_refcount) == 1);
+ posix_acl_release(lli->lli_posix_acl);
+ lli->lli_posix_acl = NULL;
+ }
+#endif
+
lli->lli_inode_magic = LLI_INODE_DEAD;
+ spin_lock(&sbi->ll_deathrow_lock);
+ list_del_init(&lli->lli_dead_list);
+ spin_unlock(&sbi->ll_deathrow_lock);
+
EXIT;
}
*/
int ll_setattr_raw(struct inode *inode, struct iattr *attr)
{
- struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct lov_stripe_md *lsm = lli->lli_smd;
struct ll_sb_info *sbi = ll_i2sbi(inode);
struct ptlrpc_request *request = NULL;
struct mdc_op_data op_data;
* above to avoid invoking vmtruncate, otherwise it is important
* to call vmtruncate in inode_setattr to update inode->i_size
* (bug 6196) */
- inode_setattr(inode, attr);
+ rc = inode_setattr(inode, attr);
- ll_update_inode(inode, md.body, md.lsm);
+ ll_update_inode(inode, &md);
ptlrpc_req_finished(request);
if (!lsm || !S_ISREG(inode->i_mode)) {
CDEBUG(D_INODE, "no lsm: not setting attrs on OST\n");
- RETURN(0);
+ RETURN(rc);
}
} else {
/* The OST doesn't check permissions, but the alternative is
}
/* Won't invoke vmtruncate, as we already cleared ATTR_SIZE */
- inode_setattr(inode, attr);
+ rc = inode_setattr(inode, attr);
}
/* We really need to get our PW lock before we change inode->i_size.
ldlm_policy_data_t policy = { .l_extent = {attr->ia_size,
OBD_OBJECT_EOF } };
struct lustre_handle lockh = { 0 };
- struct ll_inode_info *lli = ll_i2info(inode);
int err, ast_flags = 0;
/* XXX when we fix the AST intents to pass the discard-range
* XXX extent, make ast_flags always LDLM_AST_DISCARD_DATA
rc = err;
}
} else if (ia_valid & (ATTR_MTIME | ATTR_MTIME_SET)) {
+ obd_flag flags;
struct obdo oa;
CDEBUG(D_INODE, "set mtime on OST inode %lu to %lu\n",
inode->i_ino, LTIME_S(attr->ia_mtime));
+
oa.o_id = lsm->lsm_object_id;
oa.o_valid = OBD_MD_FLID;
- obdo_from_inode(&oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
- OBD_MD_FLMTIME | OBD_MD_FLCTIME);
+
+ flags = OBD_MD_FLTYPE | OBD_MD_FLATIME |
+ OBD_MD_FLMTIME | OBD_MD_FLCTIME |
+ OBD_MD_FLFID | OBD_MD_FLGENER;
+
+ obdo_from_inode(&oa, inode, flags);
rc = obd_setattr(sbi->ll_osc_exp, &oa, lsm, NULL);
if (rc)
CERROR("obd_setattr fails: rc=%d\n", rc);
up(&lli->lli_size_sem);
}
-void ll_update_inode(struct inode *inode, struct mds_body *body,
- struct lov_stripe_md *lsm)
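+/* install a new (joined) layout: free the cached lsm, replace it and
+ * re-clamp lli_maxbytes to what the client can address */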
+static void ll_replace_lsm(struct inode *inode, struct lov_stripe_md *lsm)
+{
+ struct ll_inode_info *lli = ll_i2info(inode);
+
+ dump_lsm(D_INODE, lsm);
+ dump_lsm(D_INODE, lli->lli_smd);
+ LASSERTF(lsm->lsm_magic == LOV_MAGIC_JOIN,
+ "lsm %p must be a joined lsm\n", lsm);
+ obd_free_memmd(ll_i2obdexp(inode), &lli->lli_smd);
+ CDEBUG(D_INODE, "replace lsm %p to lli_smd %p for inode %lu%u(%p)\n",
+ lsm, lli->lli_smd, inode->i_ino, inode->i_generation, inode);
+ lli->lli_smd = lsm;
+ lli->lli_maxbytes = lsm->lsm_maxbytes;
+ if (lli->lli_maxbytes > PAGE_CACHE_MAXBYTES)
+ lli->lli_maxbytes = PAGE_CACHE_MAXBYTES;
+}
+
+void ll_update_inode(struct inode *inode, struct lustre_md *md)
{
struct ll_inode_info *lli = ll_i2info(inode);
+ struct mds_body *body = md->body;
+ struct lov_stripe_md *lsm = md->lsm;
LASSERT ((lsm != NULL) == ((body->valid & OBD_MD_FLEASIZE) != 0));
if (lsm != NULL) {
if (lli->lli_smd == NULL) {
- if (lsm->lsm_magic != LOV_MAGIC) {
+ if (lsm->lsm_magic != LOV_MAGIC &&
+ lsm->lsm_magic != LOV_MAGIC_JOIN) {
dump_lsm(D_ERROR, lsm);
LBUG();
}
if (lli->lli_maxbytes > PAGE_CACHE_MAXBYTES)
lli->lli_maxbytes = PAGE_CACHE_MAXBYTES;
} else {
- if (lov_stripe_md_cmp(lli->lli_smd, lsm)) {
- CERROR("lsm mismatch for inode %ld\n",
- inode->i_ino);
- CERROR("lli_smd:\n");
- dump_lsm(D_ERROR, lli->lli_smd);
- CERROR("lsm:\n");
- dump_lsm(D_ERROR, lsm);
- LBUG();
- }
+ if (lli->lli_smd->lsm_magic == lsm->lsm_magic &&
+ lli->lli_smd->lsm_stripe_count ==
+ lsm->lsm_stripe_count) {
+ if (lov_stripe_md_cmp(lli->lli_smd, lsm)) {
+ CERROR("lsm mismatch for inode %ld\n",
+ inode->i_ino);
+ CERROR("lli_smd:\n");
+ dump_lsm(D_ERROR, lli->lli_smd);
+ CERROR("lsm:\n");
+ dump_lsm(D_ERROR, lsm);
+ LBUG();
+ }
+ } else
+ ll_replace_lsm(inode, lsm);
}
/* bug 2844 - limit i_blksize for broken user-space apps */
LASSERTF(lsm->lsm_xfersize != 0, "%lu\n", lsm->lsm_xfersize);
inode->i_sb->s_blocksize);
}
+#ifdef CONFIG_FS_POSIX_ACL
+ LASSERT(!md->posix_acl || (body->valid & OBD_MD_FLACL));
+ if (body->valid & OBD_MD_FLACL) {
+ spin_lock(&lli->lli_lock);
+ if (lli->lli_posix_acl)
+ posix_acl_release(lli->lli_posix_acl);
+ lli->lli_posix_acl = md->posix_acl;
+ spin_unlock(&lli->lli_lock);
+ }
+#endif
+
if (body->valid & OBD_MD_FLID)
inode->i_ino = body->ino;
if (body->valid & OBD_MD_FLATIME)
LTIME_S(inode->i_atime) = 0;
LTIME_S(inode->i_ctime) = 0;
inode->i_rdev = 0;
- ll_update_inode(inode, md->body, md->lsm);
+ ll_update_inode(inode, md);
/* OIDEBUG(inode); */
oa->o_flags = flags;
oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS;
+ obdo_from_inode(oa, inode, OBD_MD_FLFID | OBD_MD_FLGENER);
rc = obd_setattr(sbi->ll_osc_exp, oa, lsm, NULL);
obdo_free(oa);
if (rc) {
struct ptlrpc_request *req, int offset,struct super_block *sb)
{
struct lustre_md md;
+ struct ll_sb_info *sbi = NULL;
int rc = 0;
+ ENTRY;
+
+ LASSERT(*inode || sb);
+ sbi = sb ? ll_s2sbi(sb) : ll_i2sbi(*inode);
+ prune_deathrow(sbi, 1);
rc = mdc_req2lustre_md(req, offset, exp, &md);
if (rc)
RETURN(rc);
if (*inode) {
- ll_update_inode(*inode, md.body, md.lsm);
+ ll_update_inode(*inode, &md);
} else {
LASSERT(sb);
*inode = ll_iget(sb, md.body->ino, &md);
if (*inode == NULL || is_bad_inode(*inode)) {
- /* free the lsm if we allocated one above */
- if (md.lsm != NULL)
- obd_free_memmd(exp, &md.lsm);
+ mdc_free_lustre_md(exp, &md);
rc = -ENOMEM;
CERROR("new_inode -fatal: rc %d\n", rc);
+ GOTO(out, rc);
}
}
+ rc = obd_checkmd(exp, ll_i2mdcexp(*inode),
+ ll_i2info(*inode)->lli_smd);
+out:
RETURN(rc);
}
if (inode)
return inode;
if (S_ISREG(mode)) {
- eadatalen = obd_size_diskmd(sbi->ll_osc_exp, NULL);
+ rc = ll_get_max_mdsize(sbi, &eadatalen);
+ if (rc)
+ return ERR_PTR(rc);
valid |= OBD_MD_FLEASIZE;
}
fid.id = (__u64)ino;
spin_lock(&dcache_lock);
for (lp = inode->i_dentry.next; lp != &inode->i_dentry ; lp=lp->next) {
result = list_entry(lp,struct dentry, d_alias);
+ lock_dentry(result);
if (!(result->d_flags & DCACHE_DISCONNECTED)) {
dget_locked(result);
ll_set_dflags(result, DCACHE_REFERENCED);
+ unlock_dentry(result);
spin_unlock(&dcache_lock);
iput(inode);
return result;
}
+ unlock_dentry(result);
}
spin_unlock(&dcache_lock);
result = d_alloc_root(inode);
/* methods */
+/* called from iget{4,5_locked}->find_inode() under inode_lock spinlock */
#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
static int ll_test_inode(struct inode *inode, unsigned long ino, void *opaque)
#else
if (inode->i_ino != md->body->ino)
return 0;
#endif
- if (inode->i_generation != md->body->generation)
+ if (inode->i_generation != md->body->generation) {
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct ll_inode_info *lli = ll_i2info(inode);
+
+ if (inode->i_state & (I_FREEING | I_CLEAR))
+ return 0;
+
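+ /* same inum but stale generation: pin the old inode, mark it dead
+ * and park it on deathrow so prune_deathrow() can iput it outside
+ * the inode_lock spinlock */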
+ atomic_inc(&inode->i_count);
+ inode->i_nlink = 0;
+ inode->i_state |= I_FREEING;
+ LASSERT(list_empty(&lli->lli_dead_list));
+ /* add "duplicate" inode into deathrow for destroy */
+ spin_lock(&sbi->ll_deathrow_lock);
+ list_add(&lli->lli_dead_list, &sbi->ll_deathrow);
+ spin_unlock(&sbi->ll_deathrow_lock);
+
+ /* remove inode from dirty/io lists */
+ list_del_init(&inode->i_list);
+
return 0;
+ }
/* Apply the attributes in 'opaque' to this inode */
if (!(inode->i_state & (I_FREEING | I_CLEAR)))
- ll_update_inode(inode, md->body, md->lsm);
+ ll_update_inode(inode, md);
return 1;
}
break;
case LDLM_CB_CANCELING: {
struct inode *inode = ll_inode_from_lock(lock);
+ __u64 bits = lock->l_policy_data.l_inodebits.bits;
/* Invalidate all dentries associated with this inode */
if (inode == NULL)
break;
- clear_bit(LLI_F_HAVE_MDS_SIZE_LOCK,
- &(ll_i2info(inode)->lli_flags));
-
if (lock->l_resource->lr_name.name[0] != inode->i_ino ||
lock->l_resource->lr_name.name[1] != inode->i_generation) {
LDLM_ERROR(lock, "data mismatch with ino %lu/%u (%p)",
inode->i_ino, inode->i_generation, inode);
}
- if (S_ISDIR(inode->i_mode)) {
+
+ if (bits & MDS_INODELOCK_UPDATE)
+ clear_bit(LLI_F_HAVE_MDS_SIZE_LOCK,
+ &(ll_i2info(inode)->lli_flags));
+
+
+ if (S_ISDIR(inode->i_mode) &&
+ (bits & MDS_INODELOCK_UPDATE)) {
CDEBUG(D_INODE, "invalidating inode %lu\n",
inode->i_ino);
-
truncate_inode_pages(inode->i_mapping, 0);
}
if (inode->i_sb->s_root &&
- inode != inode->i_sb->s_root->d_inode)
+ inode != inode->i_sb->s_root->d_inode &&
+ (bits & MDS_INODELOCK_LOOKUP))
ll_unhash_aliases(inode);
iput(inode);
break;
de->d_name.len) != 0)
continue;
- if (!list_empty(&dentry->d_lru))
- list_del_init(&dentry->d_lru);
-
- hlist_del_init(&dentry->d_hash);
- __d_rehash(dentry, 0); /* avoid taking dcache_lock inside */
+ dget_locked(dentry);
+ lock_dentry(dentry);
+ __d_drop(dentry);
dentry->d_flags &= ~DCACHE_LUSTRE_INVALID;
- atomic_inc(&dentry->d_count);
+ unlock_dentry(dentry);
+ __d_rehash(dentry, 0); /* avoid taking dcache_lock inside */
spin_unlock(&dcache_lock);
iput(inode);
CDEBUG(D_DENTRY, "alias dentry %.*s (%p) parent %p inode %p "
RETURN(0);
}
-
static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry,
struct lookup_intent *it, int lookup_flags)
{
if (rc)
RETURN(rc);
- mdc_store_inode_generation(request, 2, 1);
+ mdc_store_inode_generation(request, MDS_REQ_INTENT_REC_OFF, 1);
inode = ll_create_node(dir, dentry->d_name.name, dentry->d_name.len,
NULL, 0, mode, 0, it);
if (IS_ERR(inode)) {
ll_update_times(request, 0, dir);
- err = ll_prep_inode(sbi->ll_osc_exp, &inode, request, 0,
+ err = ll_prep_inode(sbi->ll_osc_exp, &inode, request, 0,
dchild->d_sb);
if (err)
GOTO(out_err, err);
int rc;
ENTRY;
- oti.oti_thread = request->rq_svc_thread;
-
/* req is swabbed so this is safe */
body = lustre_msg_buf(request->rq_repmsg, 0, sizeof(*body));
}
LASSERT(rc >= sizeof(*lsm));
+ rc = obd_checkmd(ll_i2obdexp(dir), ll_i2mdcexp(dir), lsm);
+ if (rc)
+ GOTO(out_free_memmd, rc);
+
oa = obdo_alloc();
if (oa == NULL)
GOTO(out_free_memmd, rc = -ENOMEM);
}
}
- rc = obd_destroy(ll_i2obdexp(dir), oa, lsm, &oti);
+ rc = obd_destroy(ll_i2obdexp(dir), oa, lsm, &oti, ll_i2mdcexp(dir));
obdo_free(oa);
if (rc)
CERROR("obd destroy objid "LPX64" error %d\n",
.create = ll_create_nd,
.getattr_it = ll_getattr,
#endif
+ .permission = ll_inode_permission,
.setxattr = ll_setxattr,
.getxattr = ll_getxattr,
.listxattr = ll_listxattr,
/* If the truncate leaves behind a partial page, update its
* checksum. */
struct page *page = find_get_page(inode->i_mapping,
- inode->i_size >> PAGE_CACHE_SHIFT);
+ inode->i_size >> PAGE_CACHE_SHIFT);
if (page != NULL) {
struct ll_async_page *llap = llap_cast_private(page);
if (llap != NULL) {
oa.o_id = lsm->lsm_object_id;
oa.o_valid = OBD_MD_FLID;
+
obdo_from_inode(&oa, inode, OBD_MD_FLTYPE | OBD_MD_FLMODE |
- OBD_MD_FLATIME |OBD_MD_FLMTIME |OBD_MD_FLCTIME);
+ OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME |
+ OBD_MD_FLFID | OBD_MD_FLGENER);
ll_inode_size_unlock(inode, 0);
if (rc)
CERROR("obd_truncate fails (%d) ino %lu\n", rc, inode->i_ino);
else
- obdo_to_inode(inode, &oa, OBD_MD_FLSIZE|OBD_MD_FLBLOCKS|
- OBD_MD_FLATIME | OBD_MD_FLMTIME |
- OBD_MD_FLCTIME);
+ obdo_to_inode(inode, &oa, OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
+ OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME);
EXIT;
return;
out_unlock:
ll_inode_size_unlock(inode, 0);
- EXIT;
} /* ll_truncate */
int ll_prepare_write(struct file *file, struct page *page, unsigned from,
pga.count = PAGE_SIZE;
pga.flag = 0;
- oa.o_id = lsm->lsm_object_id;
oa.o_mode = inode->i_mode;
+ oa.o_id = lsm->lsm_object_id;
oa.o_valid = OBD_MD_FLID | OBD_MD_FLMODE | OBD_MD_FLTYPE;
+ obdo_from_inode(&oa, inode, OBD_MD_FLFID | OBD_MD_FLGENER);
- rc = obd_brw(OBD_BRW_CHECK, ll_i2obdexp(inode), &oa, lsm, 1, &pga,
- NULL);
+ rc = obd_brw(OBD_BRW_CHECK, ll_i2obdexp(inode), &oa, lsm,
+ 1, &pga, NULL);
if (rc)
RETURN(rc);
oa->o_valid = OBD_MD_FLID;
valid_flags = OBD_MD_FLTYPE | OBD_MD_FLATIME;
if (cmd & OBD_BRW_WRITE) {
- oa->o_valid |= OBD_MD_FLIFID | OBD_MD_FLEPOCH;
- mdc_pack_fid(obdo_fid(oa), inode->i_ino, 0, inode->i_mode);
+ oa->o_valid |= OBD_MD_FLEPOCH;
oa->o_easize = ll_i2info(inode)->lli_io_epoch;
oa->o_uid = inode->i_uid;
oa->o_gid = inode->i_gid;
valid_flags |= OBD_MD_FLMTIME | OBD_MD_FLCTIME |
- OBD_MD_FLUID | OBD_MD_FLGID;
+ OBD_MD_FLUID | OBD_MD_FLGID |
+ OBD_MD_FLFID | OBD_MD_FLGENER;
}
obdo_from_inode(oa, inode, valid_flags);
llap = LLAP_FROM_COOKIE(data);
ll_inode_fill_obdo(llap->llap_page->mapping->host, cmd, oa);
+
EXIT;
}
unsigned long size_index = inode->i_size >> PAGE_SHIFT;
struct obd_io_group *oig;
struct ll_sb_info *sbi = ll_i2sbi(inode);
- int rc, noquot = capable(CAP_SYS_RESOURCE) ? OBD_BRW_NOQUOTA : 0;
+ int rc, noquot = llap->llap_ignore_quota ? OBD_BRW_NOQUOTA : 0;
ENTRY;
/* _make_ready only sees llap once we've unlocked the page */
if (exp == NULL)
RETURN(-EINVAL);
+ llap->llap_ignore_quota = capable(CAP_SYS_RESOURCE);
+
/* queue a write for some time in the future the first time we
* dirty the page */
if (!PageDirty(page)) {
flags = 0 /* | OBD_BRW_DIRECTIO */;
offset = ((obd_off)blocknr << inode->i_blkbits);
length = iobuf->length;
+ rw = rw ? OBD_BRW_WRITE : OBD_BRW_READ;
for (i = 0, length = iobuf->length; length > 0;
length -= pga[i].count, offset += pga[i].count, i++) { /*i last!*/
pga[i].count = min_t(int, PAGE_SIZE - (offset & ~PAGE_MASK),
length);
pga[i].flag = flags;
- if (rw == READ)
+ if (rw == OBD_BRW_READ)
POISON_PAGE(iobuf->maplist[i], 0x0d);
}
ll_inode_fill_obdo(inode, rw, &oa);
- if (rw == WRITE)
+ if (rw == OBD_BRW_WRITE)
lprocfs_counter_add(ll_i2sbi(inode)->ll_stats,
LPROC_LL_DIRECT_WRITE, iobuf->length);
else
lprocfs_counter_add(ll_i2sbi(inode)->ll_stats,
LPROC_LL_DIRECT_READ, iobuf->length);
- rc = obd_brw_async(rw == WRITE ? OBD_BRW_WRITE : OBD_BRW_READ,
- ll_i2obdexp(inode), &oa, lsm, iobuf->nr_pages, pga,
- set, NULL);
+ rc = obd_brw_async(rw, ll_i2obdexp(inode), &oa, lsm, iobuf->nr_pages,
+ pga, set, NULL);
if (rc) {
CDEBUG(rc == -ENOSPC ? D_INODE : D_ERROR,
"error from obd_brw_async: rc = %d\n", rc);
ptlrpc_set_destroy(set);
if (rc == 0) {
rc = iobuf->length;
- if (rw == WRITE) {
+ if (rw == OBD_BRW_WRITE) {
lov_stripe_lock(lsm);
obd_adjust_kms(ll_i2obdexp(inode), lsm, offset, 0);
lov_stripe_unlock(lsm);
#else
.revalidate_it = ll_inode_revalidate_it,
#endif
+ .permission = ll_inode_permission,
.setxattr = ll_setxattr,
.getxattr = ll_getxattr,
.listxattr = ll_listxattr,
#include <linux/lustre_dlm.h>
#include <linux/init.h>
#include <linux/fs.h>
+#include <linux/random.h>
#include <linux/cache_def.h>
#include <linux/lprocfs_status.h>
#include "llite_internal.h"
static int __init init_lustre_lite(void)
{
- int rc;
+ int rc, seed[2];
printk(KERN_INFO "Lustre: Lustre Lite Client File System; "
"info@clusterfs.com\n");
lustre_register_client_fill_super(ll_fill_super);
+ get_random_bytes(seed, sizeof(seed));
+ ll_srand(seed[0], seed[1]);
+
return rc;
}
static int __init init_lustre_lite(void)
{
- int rc;
+ int rc, seed[2];
printk(KERN_INFO "Lustre: Lustre Lite Client File System; "
"info@clusterfs.com\n");
rc = ll_init_inodecache();
lustre_register_client_fill_super(ll_fill_super);
+ get_random_bytes(seed, sizeof(seed));
+ ll_srand(seed[0], seed[1]);
+
return rc;
}
#else
.getattr_it = ll_getattr,
#endif
+ .permission = ll_inode_permission,
.setxattr = ll_setxattr,
.getxattr = ll_getxattr,
.listxattr = ll_listxattr,
#define XATTR_USER_T (1)
#define XATTR_TRUSTED_T (2)
#define XATTR_SECURITY_T (3)
-#define XATTR_POSIXACL_T (4)
+#define XATTR_ACL_T (4)
#define XATTR_OTHER_T (5)
static
{
if (!strcmp(name, XATTR_NAME_ACL_ACCESS) ||
!strcmp(name, XATTR_NAME_ACL_DEFAULT))
- return XATTR_POSIXACL_T;
+ return XATTR_ACL_T;
if (!strncmp(name, XATTR_USER_PREFIX,
sizeof(XATTR_USER_PREFIX) - 1))
}
static
+int xattr_type_filter(struct ll_sb_info *sbi, int xattr_type)
+{
+ if (xattr_type == XATTR_ACL_T && !(sbi->ll_flags & LL_SBI_ACL))
+ return -EOPNOTSUPP;
+ if (xattr_type == XATTR_USER_T && !(sbi->ll_flags & LL_SBI_USER_XATTR))
+ return -EOPNOTSUPP;
+ if (xattr_type == XATTR_TRUSTED_T && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (xattr_type == XATTR_OTHER_T)
+ return -EOPNOTSUPP;
+
+ return 0;
+}
+
+static
int ll_setxattr_common(struct inode *inode, const char *name,
const void *value, size_t size,
int flags, __u64 valid)
lprocfs_counter_incr(sbi->ll_stats, LPROC_LL_SETXATTR);
xattr_type = get_xattr_type(name);
- if (xattr_type == XATTR_USER_T && !(sbi->ll_flags & LL_SBI_USER_XATTR))
- RETURN(-EOPNOTSUPP);
- if (xattr_type == XATTR_TRUSTED_T && !capable(CAP_SYS_ADMIN))
- RETURN(-EPERM);
- if (xattr_type == XATTR_OTHER_T)
- RETURN(-EOPNOTSUPP);
+ rc = xattr_type_filter(sbi, xattr_type);
+ if (rc)
+ RETURN(rc);
ll_inode2fid(&fid, inode);
rc = mdc_setxattr(sbi->ll_mdc_exp, &fid, valid,
name, value, size, 0, flags, &req);
if (rc) {
if (rc == -EOPNOTSUPP && xattr_type == XATTR_USER_T) {
- CWARN("disable user_xattr from now on\n");
+ LCONSOLE_INFO("Disabling user_xattr feature because "
+ "it is not supported on the server\n");
sbi->ll_flags &= ~LL_SBI_USER_XATTR;
}
RETURN(rc);
}
int ll_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+ const void *value, size_t size, int flags)
{
struct inode *inode = dentry->d_inode;
}
xattr_type = get_xattr_type(name);
- if (xattr_type == XATTR_USER_T && !(sbi->ll_flags & LL_SBI_USER_XATTR))
- RETURN(-EOPNOTSUPP);
- if (xattr_type == XATTR_TRUSTED_T && !capable(CAP_SYS_ADMIN))
- RETURN(-EPERM);
- if (xattr_type == XATTR_OTHER_T)
- RETURN(-EOPNOTSUPP);
+ rc = xattr_type_filter(sbi, xattr_type);
+ if (rc)
+ RETURN(rc);
do_getxattr:
ll_inode2fid(&fid, inode);
size, &req);
if (rc) {
if (rc == -EOPNOTSUPP && xattr_type == XATTR_USER_T) {
- CWARN("disable user_xattr from now on\n");
+ LCONSOLE_INFO("Disabling user_xattr feature because "
+ "it is not supported on the server\n");
sbi->ll_flags &= ~LL_SBI_USER_XATTR;
}
RETURN(rc);
if (size < body->eadatasize) {
CERROR("server bug: replied size %u > %u\n",
- body->eadatasize, size);
+ body->eadatasize, (int)size);
GOTO(out, rc = -ERANGE);
}
RETURN(rc);
}
-int ll_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size)
+ssize_t ll_getxattr(struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
struct inode *inode = dentry->d_inode;
return ll_getxattr_common(inode, name, buffer, size, OBD_MD_FLXATTR);
}
-int ll_listxattr(struct dentry *dentry, char *buffer, size_t size)
+ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
struct inode *inode = dentry->d_inode;
MODULES := lov
-lov-objs := lov_log.o lov_obd.o lov_pack.o lproc_lov.o lov_offset.o lov_merge.o lov_request.o lov_qos.o
+lov-objs := lov_log.o lov_obd.o lov_pack.o lproc_lov.o lov_offset.o lov_merge.o lov_request.o lov_qos.o lov_ea.o
@INCLUDE_RULES@
if LIBLUSTRE
noinst_LIBRARIES = liblov.a
-liblov_a_SOURCES = lov_log.c lov_obd.c lov_pack.c lov_request.c lov_offset.c lov_qos.c lov_merge.c lov_internal.h
+liblov_a_SOURCES = lov_log.c lov_obd.c lov_pack.c lov_request.c lov_offset.c lov_qos.c lov_merge.c lov_ea.c lov_internal.h
liblov_a_CPPFLAGS = $(LLCPPFLAGS)
liblov_a_CFLAGS = $(LLCFLAGS)
endif
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (c) 2001-2005 Cluster File Systems, Inc.
+ * Author: Wang Di <wangdi@clusterfs.com>
+ *
+ * This file is part of the Lustre file system, http://www.lustre.org
+ * Lustre is a trademark of Cluster File Systems, Inc.
+ *
+ * You may have signed or agreed to another license before downloading
+ * this software. If so, you are bound by the terms and conditions
+ * of that agreement, and the following does not apply to you. See the
+ * LICENSE file included with this distribution for more information.
+ *
+ * If you did not agree to a different license, then this copy of Lustre
+ * is open source software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * In either case, Lustre is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * license text for more details.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_LOV
+
+#ifdef __KERNEL__
+#include <asm/div64.h>
+#else
+#include <liblustre.h>
+#endif
+
+#include <linux/obd_class.h>
+#include <linux/obd_lov.h>
+#include <linux/lustre_idl.h>
+#include <linux/lustre_log.h>
+
+#include "lov_internal.h"
+
+struct lovea_unpack_args {
+ struct lov_stripe_md *lsm;
+ int cursor;
+};
+
+static int lsm_lmm_verify_common(struct lov_mds_md *lmm, int lmm_bytes,
+ int stripe_count)
+{
+ if (stripe_count == 0) {
+ CERROR("bad stripe count %d\n", stripe_count);
+ lov_dump_lmm_v1(D_WARNING, lmm);
+ return -EINVAL;
+ }
+
+ if (lmm->lmm_object_id == 0) {
+ CERROR("zero object id\n");
+ lov_dump_lmm_v1(D_WARNING, lmm);
+ return -EINVAL;
+ }
+
+ if (lmm->lmm_pattern != cpu_to_le32(LOV_PATTERN_RAID0)) {
+ CERROR("bad striping pattern\n");
+ lov_dump_lmm_v1(D_WARNING, lmm);
+ return -EINVAL;
+ }
+
+ if (lmm->lmm_stripe_size == 0 ||
+ (__u64)le32_to_cpu(lmm->lmm_stripe_size) * stripe_count > ~0UL) {
+ CERROR("bad stripe size %u\n",
+ le32_to_cpu(lmm->lmm_stripe_size));
+ lov_dump_lmm_v1(D_WARNING, lmm);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static void lsm_unpackmd_common(struct lov_stripe_md *lsm,
+ struct lov_mds_md *lmm)
+{
+ lsm->lsm_object_id = le64_to_cpu(lmm->lmm_object_id);
+ lsm->lsm_object_gr = le64_to_cpu(lmm->lmm_object_gr);
+ lsm->lsm_stripe_size = le32_to_cpu(lmm->lmm_stripe_size);
+ lsm->lsm_pattern = le32_to_cpu(lmm->lmm_pattern);
+ lsm->lsm_xfersize = lsm->lsm_stripe_size * lsm->lsm_stripe_count;
+}
+
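+/* The *_plain helpers below cover classic RAID0 files: a single
+ * extent spans the whole file, so stripe numbers and file offsets
+ * pass through unchanged and only the stripe width
+ * (stripe_size * stripe_count) needs to be reported. */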
+static void
+lsm_stripe_by_index_plain(struct lov_stripe_md *lsm, int *stripeno,
+ obd_off *lov_off, unsigned long *swidth)
+{
+ if (swidth)
+ *swidth = lsm->lsm_stripe_size * lsm->lsm_stripe_count;
+}
+
+static void
+lsm_stripe_by_offset_plain(struct lov_stripe_md *lsm, int *stripeno,
+ obd_off *lov_off, unsigned long *swidth)
+{
+ if (swidth)
+ *swidth = lsm->lsm_stripe_size * lsm->lsm_stripe_count;
+}
+
+static obd_off
+lsm_stripe_offset_by_index_plain(struct lov_stripe_md *lsm,
+ int stripe_index)
+{
+ return 0;
+}
+
+static int
+lsm_stripe_index_by_offset_plain(struct lov_stripe_md *lsm,
+ obd_off lov_off)
+{
+ return 0;
+}
+
+static void lsm_free_plain(struct lov_stripe_md *lsm)
+{
+ OBD_FREE(lsm, lov_stripe_md_size(lsm->lsm_stripe_count));
+}
+
+static int lsm_revalidate_plain(struct lov_stripe_md *lsm,
+ struct obd_device *obd)
+{
+ return 0;
+}
+
+static int lsm_destroy_plain(struct lov_stripe_md *lsm, struct obdo *oa,
+ struct obd_export *md_exp)
+{
+ return 0;
+}
+
+static int lsm_lmm_verify_plain(struct lov_mds_md *lmm, int lmm_bytes,
+ int *stripe_count)
+{
+ if (lmm_bytes < sizeof(*lmm)) {
+ CERROR("lov_mds_md too small: %d, need at least %d\n",
+ lmm_bytes, (int)sizeof(*lmm));
+ return -EINVAL;
+ }
+
+ *stripe_count = le32_to_cpu(lmm->lmm_stripe_count);
+
+ if (lmm_bytes < lov_mds_md_v1_size(*stripe_count)) {
+ CERROR("LOV EA too small: %d, need %d\n",
+ lmm_bytes, lov_mds_md_v1_size(*stripe_count));
+ lov_dump_lmm_v1(D_WARNING, lmm);
+ return -EINVAL;
+ }
+
+ return lsm_lmm_verify_common(lmm, lmm_bytes, *stripe_count);
+}
+
+int lsm_unpackmd_plain(struct lov_obd *lov, struct lov_stripe_md *lsm,
+ struct lov_mds_md_v1 *lmm)
+{
+ struct lov_oinfo *loi;
+ int i;
+
+ lsm_unpackmd_common(lsm, lmm);
+
+ for (i = 0, loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++) {
+ /* XXX LOV STACKING call down to osc_unpackmd() */
+ loi->loi_id = le64_to_cpu(lmm->lmm_objects[i].l_object_id);
+ loi->loi_gr = le64_to_cpu(lmm->lmm_objects[i].l_object_gr);
+ loi->loi_ost_idx = le32_to_cpu(lmm->lmm_objects[i].l_ost_idx);
+ loi->loi_ost_gen = le32_to_cpu(lmm->lmm_objects[i].l_ost_gen);
+ if (loi->loi_ost_idx >= lov->desc.ld_tgt_count) {
+ CERROR("OST index %d >= OST count %d\n",
+ loi->loi_ost_idx, lov->desc.ld_tgt_count);
+ lov_dump_lmm_v1(D_WARNING, lmm);
+ return -EINVAL;
+ }
+ loi++;
+ }
+
+ return 0;
+}
+
+struct lsm_operations lsm_plain_ops = {
+ .lsm_free = lsm_free_plain,
+ .lsm_destroy = lsm_destroy_plain,
+ .lsm_stripe_by_index = lsm_stripe_by_index_plain,
+ .lsm_stripe_by_offset = lsm_stripe_by_offset_plain,
+ .lsm_revalidate = lsm_revalidate_plain,
+ .lsm_stripe_offset_by_index = lsm_stripe_offset_by_index_plain,
+ .lsm_stripe_index_by_offset = lsm_stripe_index_by_offset_plain,
+ .lsm_lmm_verify = lsm_lmm_verify_plain,
+ .lsm_unpackmd = lsm_unpackmd_plain,
+};
+
+struct lov_extent *lovea_off2le(struct lov_stripe_md *lsm, obd_off lov_off)
+{
+ struct lov_array_info *lai;
+ struct lov_extent *le;
+ int i = 0;
+
+ LASSERT(lsm->lsm_array != NULL);
+ lai = lsm->lsm_array;
+ LASSERT(lai->lai_ext_count > 1);
+
+ for (le = lai->lai_ext_array, i = 0;
+ i < lai->lai_ext_count && le->le_start + le->le_len <= lov_off
+ && le->le_len != -1;
+ i++, le++) {
+ ; /* empty loop */
+ }
+
+ CDEBUG(D_INFO, "off "LPU64" idx%d, ext"LPU64":"LPU64"idx%d sc%d\n",
+ lov_off, i, le->le_start, le->le_len, le->le_loi_idx,
+ le->le_stripe_count);
+
+ RETURN(le);
+}
+
+struct lov_extent *lovea_idx2le(struct lov_stripe_md *lsm, int stripe_no)
+{
+ struct lov_extent *le;
+ struct lov_array_info *lai;
+ int i, stripe_index;
+
+ LASSERT(lsm->lsm_array != NULL);
+ LASSERT(stripe_no >= 0 && stripe_no <= lsm->lsm_stripe_count);
+ lai = lsm->lsm_array;
+ LASSERT(lai->lai_ext_count > 1);
+
+ for (le = lai->lai_ext_array, i = 0, stripe_index = le->le_stripe_count;
+ i < lai->lai_ext_count && stripe_index <= stripe_no &&
+ le->le_len != -1; i++, le++,
+ stripe_index += le->le_stripe_count) {
+ ; /* empty loop */
+ }
+
+ CDEBUG(D_INFO, "stripe %d idx%d, ext"LPU64":"LPU64"idx %d scount%d\n",
+ stripe_no, i, le->le_start, le->le_len, le->le_loi_idx,
+ le->le_stripe_count);
+ RETURN(le);
+}
+
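+/* Worked example (hypothetical layout): a joined file built from two
+ * extents, ext0 = [0, 4MB) striped over 2 objects (le_loi_idx 0) and
+ * ext1 = [4MB, EOF) striped over 3 objects (le_loi_idx 2).  Then
+ * lovea_off2le(lsm, 5MB) returns ext1, and lovea_idx2le(lsm, 3) also
+ * returns ext1, since stripes 0-1 belong to ext0 and 2-4 to ext1. */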
+
+static void lovea_free_array_info(struct lov_stripe_md *lsm)
+{
+ if (!lsm || !lsm->lsm_array)
+ return;
+
+ if (lsm->lsm_array->lai_ext_array)
+ OBD_FREE(lsm->lsm_array->lai_ext_array,
+ lsm->lsm_array->lai_ext_count *
+ sizeof(struct lov_extent));
+
+ OBD_FREE_PTR(lsm->lsm_array);
+}
+
+static void lsm_free_join(struct lov_stripe_md *lsm)
+{
+ lovea_free_array_info(lsm);
+ OBD_FREE(lsm, lov_stripe_md_size(lsm->lsm_stripe_count));
+}
+
+static void
+lsm_stripe_by_index_join(struct lov_stripe_md *lsm, int *stripeno,
+ obd_off *lov_off, unsigned long *swidth)
+{
+ struct lov_extent *le;
+
+ LASSERT(stripeno != NULL);
+
+ le = lovea_idx2le(lsm, *stripeno);
+
+ LASSERT(le != NULL && le->le_stripe_count != 0);
+
+ *stripeno -= le->le_loi_idx;
+
+ if (swidth)
+ *swidth = lsm->lsm_stripe_size * le->le_stripe_count;
+
+ if (lov_off) {
+ struct lov_extent *lov_le = lovea_off2le(lsm, *lov_off);
+ if (lov_le == le) {
+ *lov_off = (*lov_off > le->le_start) ?
+ (*lov_off - le->le_start) : 0;
+ } else {
+ *lov_off = (*lov_off > le->le_start) ?
+ le->le_len : 0;
+ LASSERT(*lov_off != -1);
+ }
+ }
+}
+
+static void
+lsm_stripe_by_offset_join(struct lov_stripe_md *lsm, int *stripeno,
+ obd_off *lov_off, unsigned long *swidth)
+{
+ struct lov_extent *le;
+
+ LASSERT(lov_off != NULL);
+
+ le = lovea_off2le(lsm, *lov_off);
+
+ LASSERT(le != NULL && le->le_stripe_count != 0);
+
+ *lov_off = (*lov_off > le->le_start) ? (*lov_off - le->le_start) : 0;
+
+ if (stripeno)
+ *stripeno -= le->le_loi_idx;
+
+ if (swidth)
+ *swidth = lsm->lsm_stripe_size * le->le_stripe_count;
+}
+
+static obd_off
+lsm_stripe_offset_by_index_join(struct lov_stripe_md *lsm,
+ int stripe_index)
+{
+ struct lov_extent *le;
+
+ le = lovea_idx2le(lsm, stripe_index);
+
+ return le ? le->le_start : 0;
+}
+
+static int
+lsm_stripe_index_by_offset_join(struct lov_stripe_md *lsm,
+ obd_off lov_off)
+{
+ struct lov_extent *le = NULL;
+
+ le = lovea_off2le(lsm, lov_off);
+
+ return le ? le->le_loi_idx : 0;
+}
+
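+/* llog_process() callback: each log record describes one extent of a
+ * joined file.  args->cursor counts the records seen so far, and
+ * le_loi_idx accumulates the stripe counts of the preceding extents,
+ * so every extent knows where its objects begin in lsm_oinfo[]. */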
+static int lovea_unpack_array(struct llog_handle *handle,
+ struct llog_rec_hdr *rec, void *data)
+{
+ struct lovea_unpack_args *args = (struct lovea_unpack_args *)data;
+ struct llog_array_rec *la_rec = (struct llog_array_rec*)rec;
+ struct mds_extent_desc *med = &la_rec->lmr_med;
+ struct lov_stripe_md *lsm = args->lsm;
+ int cursor = args->cursor++;
+ struct lov_mds_md *lmm;
+ struct lov_array_info *lai;
+ struct lov_oinfo *loi;
+ int i, loi_index;
+ ENTRY;
+
+ /* sanity check */
+ LASSERT(lsm->lsm_stripe_count != 0);
+ lmm = &med->med_lmm;
+ LASSERT(lsm->lsm_array != NULL);
+
+ lai = lsm->lsm_array;
+
+ if (cursor == 0) {
+ lai->lai_ext_array[cursor].le_loi_idx = 0;
+ } else {
+ int next_loi_index = lai->lai_ext_array[cursor - 1].le_loi_idx +
+ lai->lai_ext_array[cursor - 1].le_stripe_count;
+ lai->lai_ext_array[cursor].le_loi_idx = next_loi_index;
+ }
+ /* insert extent desc into lsm extent array */
+ lai->lai_ext_array[cursor].le_start = le64_to_cpu(med->med_start);
+ lai->lai_ext_array[cursor].le_len = le64_to_cpu(med->med_len);
+ lai->lai_ext_array[cursor].le_stripe_count =
+ le32_to_cpu(lmm->lmm_stripe_count);
+
+ /* unpack extent's lmm to lov_oinfo array */
+ loi_index = lai->lai_ext_array[cursor].le_loi_idx;
+ loi = &lsm->lsm_oinfo[loi_index];
+ CDEBUG(D_INFO, "lovea upackmd cursor %d, loi_index %d extent "
+ LPU64":"LPU64"\n", cursor, loi_index, med->med_start,
+ med->med_len);
+
+ for (i = 0; i < le32_to_cpu(lmm->lmm_stripe_count); i++) {
+ /* XXX LOV STACKING call down to osc_unpackmd() */
+ loi->loi_id = le64_to_cpu(lmm->lmm_objects[i].l_object_id);
+ loi->loi_gr = le64_to_cpu(lmm->lmm_objects[i].l_object_gr);
+ loi->loi_ost_idx = le32_to_cpu(lmm->lmm_objects[i].l_ost_idx);
+ loi->loi_ost_gen = le32_to_cpu(lmm->lmm_objects[i].l_ost_gen);
+ loi++;
+ }
+
+ RETURN(0);
+}
+
+static int lsm_revalidate_join(struct lov_stripe_md *lsm,
+ struct obd_device *obd)
+{
+ struct llog_handle *llh;
+ struct llog_ctxt *ctxt;
+ struct lovea_unpack_args args;
+ int rc, rc2;
+ ENTRY;
+
+ LASSERT(lsm->lsm_array != NULL);
+
+ /* Revalidating an lsm can happen on either the client or the MDS,
+ * so the llog context may live in a different slot on each. */
+ ctxt = llog_get_context(obd, LLOG_LOVEA_REPL_CTXT);
+ if (!ctxt)
+ ctxt = llog_get_context(obd, LLOG_LOVEA_ORIG_CTXT);
+
+ LASSERT(ctxt);
+
+ if (lsm->lsm_array && lsm->lsm_array->lai_ext_array)
+ RETURN(0);
+
+ CDEBUG(D_INFO, "get lsm logid: "LPU64":"LPU64"\n",
+ lsm->lsm_array->lai_array_id.lgl_oid,
+ lsm->lsm_array->lai_array_id.lgl_ogr);
+ OBD_ALLOC(lsm->lsm_array->lai_ext_array,
+ lsm->lsm_array->lai_ext_count * sizeof(struct lov_extent));
+ if (!lsm->lsm_array->lai_ext_array)
+ RETURN(-ENOMEM);
+
+ CDEBUG(D_INFO, "get lsm logid: "LPU64":"LPU64"\n",
+ lsm->lsm_array->lai_array_id.lgl_oid,
+ lsm->lsm_array->lai_array_id.lgl_ogr);
+
+ rc = llog_create(ctxt, &llh, &lsm->lsm_array->lai_array_id, NULL);
+ if (rc)
+ GOTO(out, rc);
+
+ args.lsm = lsm;
+ args.cursor = 0;
+ rc = llog_init_handle(llh, LLOG_F_IS_PLAIN, NULL);
+ if (rc == 0)
+ rc = llog_process(llh, lovea_unpack_array, &args, NULL);
+ rc2 = llog_close(llh);
+ if (rc == 0)
+ rc = rc2;
+out:
+ if (rc)
+ lovea_free_array_info(lsm);
+ RETURN(rc);
+}
+
+int lsm_destroy_join(struct lov_stripe_md *lsm, struct obdo *oa,
+ struct obd_export *md_exp)
+{
+ struct llog_ctxt *ctxt;
+ struct llog_handle *llh;
+ int rc = 0;
+ ENTRY;
+
+ LASSERT(md_exp != NULL);
+ ctxt = llog_get_context(md_exp->exp_obd, LLOG_LOVEA_REPL_CTXT);
+ if (!ctxt)
+ GOTO(out, rc = -EINVAL);
+
+ LASSERT(lsm->lsm_array != NULL);
+ /* for orphan inodes we must keep the array id */
+ if (!(oa->o_valid & OBD_MD_FLCOOKIE))
+ RETURN(0);
+
+ LASSERT(ctxt != NULL);
+ rc = llog_create(ctxt, &llh, &lsm->lsm_array->lai_array_id,
+ NULL);
+ if (rc)
+ GOTO(out, rc);
+
+ rc = llog_init_handle(llh, LLOG_F_IS_PLAIN, NULL);
+ if (rc == 0)
+ rc = llog_destroy(llh);
+ llog_free_handle(llh);
+out:
+ RETURN(rc);
+}
+
+static int lsm_lmm_verify_join(struct lov_mds_md *lmm, int lmm_bytes,
+ int *stripe_count)
+{
+ struct lov_mds_md_join *lmmj = (struct lov_mds_md_join *)lmm;
+
+ if (lmm_bytes < sizeof(*lmmj)) {
+ CERROR("lov_mds_md too small: %d, need at least %d\n",
+ lmm_bytes, (int)sizeof(*lmmj));
+ return -EINVAL;
+ }
+
+ if (lmmj->lmmj_array_id.lgl_oid == 0) {
+ CERROR("zero array object id\n");
+ return -EINVAL;
+ }
+
+ *stripe_count = le32_to_cpu(lmmj->lmmj_md.lmm_stripe_count);
+
+ return lsm_lmm_verify_common(&lmmj->lmmj_md, lmm_bytes, *stripe_count);
+}
+
+static int lovea_init_array_info(struct lov_stripe_md *lsm,
+ struct llog_logid *logid,
+ __u32 extent_count)
+{
+ struct lov_array_info *lai;
+ ENTRY;
+
+ OBD_ALLOC_PTR(lai);
+ if (!lai)
+ RETURN(-ENOMEM);
+
+ lai->lai_array_id = *logid;
+ lai->lai_ext_count = extent_count;
+ lsm->lsm_array = lai;
+ RETURN(0);
+}
+
+static int lsm_unpackmd_join(struct lov_obd *lov, struct lov_stripe_md *lsm,
+ struct lov_mds_md *lmm)
+{
+ struct lov_mds_md_join *lmmj = (struct lov_mds_md_join*)lmm;
+ int rc;
+ ENTRY;
+
+ lsm_unpackmd_common(lsm, &lmmj->lmmj_md);
+
+ rc = lovea_init_array_info(lsm, &lmmj->lmmj_array_id,
+ lmmj->lmmj_extent_count);
+ if (rc) {
+ CERROR("Init joined lsm id"LPU64" arrary error %d",
+ lsm->lsm_object_id, rc);
+ GOTO(out, rc);
+ }
+out:
+ RETURN(rc);
+}
+
+struct lsm_operations lsm_join_ops = {
+ .lsm_free = lsm_free_join,
+ .lsm_destroy = lsm_destroy_join,
+ .lsm_stripe_by_index = lsm_stripe_by_index_join,
+ .lsm_stripe_by_offset = lsm_stripe_by_offset_join,
+ .lsm_revalidate = lsm_revalidate_join,
+ .lsm_stripe_offset_by_index = lsm_stripe_offset_by_index_join,
+ .lsm_stripe_index_by_offset = lsm_stripe_index_by_offset_join,
+ .lsm_lmm_verify = lsm_lmm_verify_join,
+ .lsm_unpackmd = lsm_unpackmd_join,
+};
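+
+/*
+ * Callers reach these tables through lsm_op_find(); a minimal sketch
+ * of the magic-to-ops mapping it is assumed to implement (the actual
+ * definition lives in lov_internal.h):
+ *
+ *   static inline struct lsm_operations *lsm_op_find(int magic)
+ *   {
+ *           switch (magic) {
+ *           case LOV_MAGIC_V1:   return &lsm_plain_ops;
+ *           case LOV_MAGIC_JOIN: return &lsm_join_ops;
+ *           default:             return NULL;
+ *           }
+ *   }
+ */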
+
+
/* lov_obd.c */
int lov_get_stripecnt(struct lov_obd *lov, int stripe_count);
-int lov_alloc_memmd(struct lov_stripe_md **lsmp, int stripe_count, int pattern);
-void lov_free_memmd(struct lov_stripe_md **lsmp);
/* lov_log.c */
int lov_llog_init(struct obd_device *obd, struct obd_device *tgt,
struct lov_user_md *lump);
int lov_getstripe(struct obd_export *exp,
struct lov_stripe_md *lsm, struct lov_user_md *lump);
+int lov_alloc_memmd(struct lov_stripe_md **lsmp, int stripe_count,
+ int pattern, int magic);
+void lov_free_memmd(struct lov_stripe_md **lsmp);
+void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm);
+void lov_dump_lmm_join(int level, struct lov_mds_md_join *lmmj);
+/* lov_ea.c */
+int lov_unpackmd_join(struct lov_obd *lov, struct lov_stripe_md *lsm,
+ struct lov_mds_md *lmm);
+
+struct lov_extent *lovea_idx2le(struct lov_stripe_md *lsm, int stripe_no);
+struct lov_extent *lovea_off2le(struct lov_stripe_md *lsm, obd_off lov_off);
+int lovea_destroy_object(struct lov_obd *lov, struct lov_stripe_md *lsm,
+ struct obdo *oa, void *data);
/* lproc_lov.c */
extern struct file_operations lov_proc_target_fops;
-/* Quota stuff */
-int lov_quotacheck(struct obd_export *exp, struct obd_quotactl *oqctl);
-int lov_quotactl(struct obd_export *exp, struct obd_quotactl *oqctl);
-
#endif
* Unset cookies should be all-zero (which will never occur naturally). */
static int lov_llog_origin_add(struct llog_ctxt *ctxt,
struct llog_rec_hdr *rec, struct lov_stripe_md *lsm,
- struct llog_cookie *logcookies, int numcookies,
- llog_fill_rec_cb_t fill_cb)
+ struct llog_cookie *logcookies, int numcookies)
{
struct obd_device *obd = ctxt->loc_obd;
struct lov_obd *lov = &obd->u.lov;
int i, rc = 0;
ENTRY;
- LASSERT(logcookies && numcookies >= lsm->lsm_stripe_count);
+ LASSERTF(logcookies && numcookies >= lsm->lsm_stripe_count,
+ "logcookies %p, numcookies %d lsm->lsm_stripe_count %d \n",
+ logcookies, numcookies, lsm->lsm_stripe_count);
for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
struct obd_device *child = lov->tgts[loi->loi_ost_idx].ltd_exp->exp_obd;
struct llog_ctxt *cctxt = llog_get_context(child, ctxt->loc_idx);
- struct llog_fill_rec_data data;
/* fill mds unlink/setattr log record */
- data.lfd_id = loi->loi_id;
- data.lfd_ogen = loi->loi_gr;
- fill_cb(rec, &data);
+ switch (rec->lrh_type) {
+ case MDS_UNLINK_REC: {
+ struct llog_unlink_rec *lur = (struct llog_unlink_rec *)rec;
+ lur->lur_oid = loi->loi_id;
+ lur->lur_ogen = loi->loi_gr;
+ break;
+ }
+ case MDS_SETATTR_REC: {
+ struct llog_setattr_rec *lsr = (struct llog_setattr_rec *)rec;
+ lsr->lsr_oid = loi->loi_id;
+ lsr->lsr_ogen = loi->loi_gr;
+ break;
+ }
+ default:
+ break;
+ }
rc += llog_add(cctxt, rec, NULL, logcookies + rc,
- numcookies - rc, fill_cb);
-
+ numcookies - rc);
}
RETURN(rc);
enum obd_notify_event ev, void *data)
{
int rc = 0;
+ ENTRY;
if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE) {
struct obd_uuid *uuid;
LASSERT(watched);
-
+
if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) {
CERROR("unexpected notification of %s %s!\n",
watched->obd_type->typ_name,
watched->obd_name);
- return -EINVAL;
+ RETURN(-EINVAL);
}
uuid = &watched->u.cli.cl_import->imp_target_uuid;
rc = lov_notify(obd, tgt->ltd_exp->exp_obd, OBD_NOTIFY_ACTIVE,
(void *)&idx);
- out:
+out:
if (rc) {
CERROR("add failed (%d), deleting %s\n", rc,
(char *)tgt->uuid.uuid);
RETURN(rc);
}
-#define ASSERT_LSM_MAGIC(lsmp) \
-do { \
- LASSERT((lsmp) != NULL); \
- LASSERTF((lsmp)->lsm_magic == LOV_MAGIC, "%p->lsm_magic=%x\n", \
- (lsmp), (lsmp)->lsm_magic); \
+#define ASSERT_LSM_MAGIC(lsmp) \
+do { \
+ LASSERT((lsmp) != NULL); \
+ LASSERTF(((lsmp)->lsm_magic == LOV_MAGIC || \
+ (lsmp)->lsm_magic == LOV_MAGIC_JOIN), "%p->lsm_magic=%x\n", \
+ (lsmp), (lsmp)->lsm_magic); \
} while (0)
static int lov_destroy(struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *lsm, struct obd_trans_info *oti)
+ struct lov_stripe_md *lsm, struct obd_trans_info *oti,
+ struct obd_export *md_exp)
{
struct lov_request_set *set;
struct lov_request *req;
/* XXX update the cookie position */
oti->oti_logcookies = set->set_cookies + req->rq_stripe;
rc = obd_destroy(lov->tgts[req->rq_idx].ltd_exp, req->rq_oa,
- NULL, oti);
+ NULL, oti, NULL);
err = lov_update_common_set(set, req, rc);
if (rc) {
CERROR("error: destroying objid "LPX64" subobj "
}
}
lov_fini_destroy_set(set);
+ if (rc == 0) {
+ LASSERT(lsm_op_find(lsm->lsm_magic) != NULL);
+ rc = lsm_op_find(lsm->lsm_magic)->lsm_destroy(lsm, oa, md_exp);
+ }
RETURN(rc);
}
if (!exp || !exp->exp_obd)
RETURN(-ENODEV);
- /* for now, we only expect time updates here */
- LASSERT(!(src_oa->o_valid & ~(OBD_MD_FLID|OBD_MD_FLTYPE | OBD_MD_FLMODE|
- OBD_MD_FLATIME | OBD_MD_FLMTIME |
- OBD_MD_FLCTIME | OBD_MD_FLFLAGS |
- OBD_MD_FLSIZE)));
+ /* for now, we only expect the following updates here */
+ LASSERT(!(src_oa->o_valid & ~(OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMODE |
+ OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME |
+ OBD_MD_FLFLAGS | OBD_MD_FLSIZE | OBD_MD_FLGROUP |
+ OBD_MD_FLUID | OBD_MD_FLGID | OBD_MD_FLINLINE |
+ OBD_MD_FLFID | OBD_MD_FLGENER)));
lov = &exp->exp_obd->u.lov;
- rc = lov_prep_setattr_set(exp, src_oa, lsm, NULL, &set);
+ rc = lov_prep_setattr_set(exp, src_oa, lsm, oti, &set);
if (rc)
RETURN(rc);
if (!exp || !exp->exp_obd)
RETURN(-ENODEV);
- /* support OBD_MD_FLUID, OBD_MD_FLGID and OBD_MD_FLCOOKIE now */
LASSERT(!(src_oa->o_valid & ~(OBD_MD_FLID | OBD_MD_FLUID |
- OBD_MD_FLGID| OBD_MD_FLCOOKIE)));
+ OBD_MD_FLGID| OBD_MD_FLCOOKIE |
+ OBD_MD_FLFID | OBD_MD_FLGENER)));
lov = &exp->exp_obd->u.lov;
loi = lsm->lsm_oinfo;
}
src_oa->o_id = loi->loi_id;
+ src_oa->o_stripe_idx = i;
+
/* do chown/chgrp on OST asynchronously */
err = obd_setattr_async(lov->tgts[loi->loi_ost_idx].ltd_exp,
src_oa, NULL, oti);
return lap->lap_caller_ops->ap_make_ready(lap->lap_caller_data, cmd);
}
+
static int lov_ap_refresh_count(void *data, int cmd)
{
struct lov_async_page *lap = LAP_FROM_COOKIE(data);
return lap->lap_caller_ops->ap_refresh_count(lap->lap_caller_data,
cmd);
}
+
static void lov_ap_fill_obdo(void *data, int cmd, struct obdo *oa)
{
struct lov_async_page *lap = LAP_FROM_COOKIE(data);
lap->lap_caller_ops->ap_fill_obdo(lap->lap_caller_data, cmd, oa);
/* XXX woah, shouldn't we be altering more here? size? */
oa->o_id = lap->lap_loi_id;
+ oa->o_stripe_idx = lap->lap_stripe;
}
static void lov_ap_completion(void *data, int cmd, struct obdo *oa, int rc)
break;
default: {
int set = 0;
+
if (count == 0)
RETURN(-ENOTTY);
+
rc = 0;
for (i = 0; i < count; i++) {
int err;
err = obd_iocontrol(cmd, lov->tgts[i].ltd_exp,
len, karg, uarg);
- if (err) {
+ if (err == -ENODATA && cmd == OBD_IOC_POLL_QUOTACHECK) {
+ RETURN(err);
+ } else if (err) {
if (lov->tgts[i].active) {
CERROR("error: iocontrol OSC %s on OST "
"idx %d cmd %x: err = %d\n",
struct ldlm_lock *lock;
struct lov_stripe_md *lsm;
} *data = key;
+ struct ldlm_res_id *res_id = &data->lock->l_resource->lr_name;
struct lov_oinfo *loi;
__u32 *stripe = val;
for (i = 0, loi = data->lsm->lsm_oinfo;
i < data->lsm->lsm_stripe_count;
i++, loi++) {
- if (lov->tgts[loi->loi_ost_idx].ltd_exp ==
- data->lock->l_conn_export) {
+ if (lov->tgts[loi->loi_ost_idx].ltd_exp ==
+ data->lock->l_conn_export &&
+ loi->loi_id == res_id->name[0] &&
+ loi->loi_gr == res_id->name[2]) {
*stripe = i;
GOTO(out, rc = 0);
}
}
- LDLM_ERROR(data->lock, "lock on inode without such object\n");
+ LDLM_ERROR(data->lock, "lock on inode without such object");
dump_lsm(D_ERROR, data->lsm);
GOTO(out, rc = -ENXIO);
- } else if (keylen >= strlen("size_to_stripe") &&
- strcmp(key, "size_to_stripe") == 0) {
- struct {
- int stripe_number;
- __u64 size;
- struct lov_stripe_md *lsm;
- } *data = val;
-
- if (*vallen < sizeof(*data))
- GOTO(out, rc = -EFAULT);
-
- data->size = lov_size_to_stripe(data->lsm, data->size,
- data->stripe_number);
- GOTO(out, rc = 0);
} else if (keylen >= strlen("last_id") && strcmp(key, "last_id") == 0) {
obd_id *ids = val;
int size = sizeof(obd_id);
RETURN(rc);
}
+static int lov_checkmd(struct obd_export *exp, struct obd_export *md_exp,
+ struct lov_stripe_md *lsm)
+{
+ int rc;
+ ENTRY;
+
+ if (!lsm)
+ RETURN(0);
+ LASSERT(md_exp);
+ LASSERT(lsm_op_find(lsm->lsm_magic) != NULL);
+ rc = lsm_op_find(lsm->lsm_magic)->lsm_revalidate(lsm, md_exp->exp_obd);
+
+ RETURN(rc);
+}
+
int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm)
{
struct lov_oinfo *loi;
}
#endif
-
void lov_stripe_lock(struct lov_stripe_md *md)
{
LASSERT(md->lsm_lock_owner != current);
.o_statfs = lov_statfs,
.o_packmd = lov_packmd,
.o_unpackmd = lov_unpackmd,
+ .o_checkmd = lov_checkmd,
.o_create = lov_create,
.o_destroy = lov_destroy,
.o_getattr = lov_getattr,
.o_llog_init = lov_llog_init,
.o_llog_finish = lov_llog_finish,
.o_notify = lov_notify,
-#ifdef HAVE_QUOTA_SUPPORT
- .o_quotacheck = lov_quotacheck,
- .o_quotactl = lov_quotactl,
-#endif
};
+static quota_interface_t *quota_interface;
+extern quota_interface_t lov_quota_interface;
+
int __init lov_init(void)
{
struct lprocfs_static_vars lvars;
ENTRY;
lprocfs_init_vars(lov, &lvars);
+
+ quota_interface = PORTAL_SYMBOL_GET(lov_quota_interface);
+ init_obd_quota_ops(quota_interface, &lov_obd_ops);
+
rc = class_register_type(&lov_obd_ops, lvars.module_vars,
OBD_LOV_DEVICENAME);
+ if (rc && quota_interface)
+ PORTAL_SYMBOL_PUT(lov_quota_interface);
+
RETURN(rc);
}
#ifdef __KERNEL__
static void /*__exit*/ lov_exit(void)
{
+ if (quota_interface)
+ PORTAL_SYMBOL_PUT(lov_quota_interface);
+
class_unregister_type(OBD_LOV_DEVICENAME);
}
int stripeno)
{
unsigned long ssize = lsm->lsm_stripe_size;
- unsigned long swidth = ssize * lsm->lsm_stripe_count;
- unsigned long stripe_size;
+ unsigned long swidth, stripe_size;
+ int sindex = stripeno;
obd_size lov_size;
+ int magic = lsm->lsm_magic;
ENTRY;
if (ost_size == 0)
RETURN(0);
+ LASSERT(lsm_op_find(magic) != NULL);
+ lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, NULL, &swidth);
+
/* do_div(a, b) returns a % b, and a = a / b */
stripe_size = do_div(ost_size, ssize);
if (stripe_size)
else
lov_size = (ost_size - 1) * swidth + (stripeno + 1) * ssize;
+ lov_size += lsm_op_find(magic)->lsm_stripe_offset_by_index(lsm, sindex);
RETURN(lov_size);
}
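+/* Worked example (hypothetical numbers): ssize = 1MB, 2 plain
+ * stripes, so swidth = 2MB.  For ost_size = 3MB on stripe 1, do_div()
+ * leaves ost_size = 3 with remainder 0, giving
+ * lov_size = (3 - 1) * 2MB + (1 + 1) * 1MB = 6MB.  For ost_size =
+ * 2.5MB the remainder is 0.5MB, giving 2 * 2MB + 1MB + 0.5MB = 5.5MB.
+ * For joined files lsm_stripe_offset_by_index() then adds the start
+ * of the extent; for plain files it adds 0. */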
int stripeno, obd_off *obd_off)
{
unsigned long ssize = lsm->lsm_stripe_size;
- unsigned long swidth = ssize * lsm->lsm_stripe_count;
- unsigned long stripe_off, this_stripe;
+ unsigned long swidth, stripe_off, this_stripe;
+ int magic = lsm->lsm_magic;
int ret = 0;
if (lov_off == OBD_OBJECT_EOF) {
return 0;
}
+ LASSERT(lsm_op_find(magic) != NULL);
+ lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, &lov_off,
+ &swidth);
+
/* do_div(a, b) returns a % b, and a = a / b */
stripe_off = do_div(lov_off, swidth);
int stripeno)
{
unsigned long ssize = lsm->lsm_stripe_size;
- unsigned long swidth = ssize * lsm->lsm_stripe_count;
- unsigned long stripe_off, this_stripe;
+ unsigned long swidth, stripe_off, this_stripe;
+ int magic = lsm->lsm_magic;
if (file_size == OBD_OBJECT_EOF)
return OBD_OBJECT_EOF;
+ LASSERT(lsm_op_find(magic) != NULL);
+ lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, &file_size,
+ &swidth);
+
/* do_div(a, b) returns a % b, and a = a / b */
stripe_off = do_div(file_size, swidth);
int lov_stripe_number(struct lov_stripe_md *lsm, obd_off lov_off)
{
unsigned long ssize = lsm->lsm_stripe_size;
- unsigned long swidth = ssize * lsm->lsm_stripe_count;
- unsigned long stripe_off;
+ unsigned long swidth, stripe_off;
+ obd_off offset = lov_off;
+ int magic = lsm->lsm_magic;
+
+ LASSERT(lsm_op_find(magic) != NULL);
+ lsm_op_find(magic)->lsm_stripe_by_offset(lsm, NULL, &lov_off, &swidth);
stripe_off = do_div(lov_off, swidth);
- return stripe_off / ssize;
+ return (stripe_off / ssize +
+ lsm_op_find(magic)->lsm_stripe_index_by_offset(lsm, offset));
}
le64_to_cpu(lod->l_object_id));
}
+void lov_dump_lmm_join(int level, struct lov_mds_md_join *lmmj)
+{
+ CDEBUG(level, "objid "LPX64", magic 0x%08X, pattern %#X\n",
+ le64_to_cpu(lmmj->lmmj_md.lmm_object_id),
+ le32_to_cpu(lmmj->lmmj_md.lmm_magic),
+ le32_to_cpu(lmmj->lmmj_md.lmm_pattern));
+ CDEBUG(level,"stripe_size %u, stripe_count %u extent_count %u \n",
+ le32_to_cpu(lmmj->lmmj_md.lmm_stripe_size),
+ le32_to_cpu(lmmj->lmmj_md.lmm_stripe_count),
+ le32_to_cpu(lmmj->lmmj_extent_count));
+}
+
#define LMM_ASSERT(test) \
do { \
if (!(test)) lov_dump_lmm(D_ERROR, lmm); \
return stripe_count;
}
-static int lov_verify_lmm_v1(struct lov_mds_md_v1 *lmm, int lmm_bytes,
- int *stripe_count)
-{
- if (lmm_bytes < sizeof(*lmm)) {
- CERROR("lov_mds_md too small: %d, need at least %d\n",
- lmm_bytes, (int)sizeof(*lmm));
- return -EINVAL;
- }
-
- if (lmm->lmm_magic != le32_to_cpu(LOV_MAGIC_V1)) {
- CERROR("bad disk LOV MAGIC: 0x%08X\n",
- le32_to_cpu(*(__u32 *)lmm));
- return -EINVAL;
- }
-
- *stripe_count = le32_to_cpu(lmm->lmm_stripe_count);
-
- if (*stripe_count == 0) {
- CERROR("bad stripe count %d\n", *stripe_count);
- lov_dump_lmm_v1(D_WARNING, lmm);
- return -EINVAL;
- }
-
- if (lmm_bytes < lov_mds_md_v1_size(*stripe_count)) {
- CERROR("LOV EA too small: %d, need %d\n",
- lmm_bytes, lov_mds_md_v1_size(*stripe_count));
- lov_dump_lmm_v1(D_WARNING, lmm);
- return -EINVAL;
- }
-
- if (lmm->lmm_object_id == 0) {
- CERROR("zero object id\n");
- lov_dump_lmm_v1(D_WARNING, lmm);
- return -EINVAL;
- }
-
- if (lmm->lmm_pattern != cpu_to_le32(LOV_PATTERN_RAID0)) {
- CERROR("bad striping pattern\n");
- lov_dump_lmm_v1(D_WARNING, lmm);
- return -EINVAL;
- }
-
- if (lmm->lmm_stripe_size == 0 ||
- (__u64)le32_to_cpu(lmm->lmm_stripe_size) * *stripe_count > ~0UL) {
- CERROR("bad stripe size %u\n",
- le32_to_cpu(lmm->lmm_stripe_size));
- lov_dump_lmm_v1(D_WARNING, lmm);
- return -EINVAL;
- }
-
- return 0;
-}
static int lov_verify_lmm(void *lmm, int lmm_bytes, int *stripe_count)
{
- switch (le32_to_cpu(*(__u32 *)lmm)) {
- case LOV_MAGIC_V1:
- return lov_verify_lmm_v1(lmm, lmm_bytes, stripe_count);
- default:
+ int rc;
+
+ if (lsm_op_find(le32_to_cpu(*(__u32 *)lmm)) == NULL) {
CERROR("bad disk LOV MAGIC: 0x%08X; dumping V1 LMM:\n",
le32_to_cpu(*(__u32 *)lmm));
lov_dump_lmm_v1(D_WARNING, lmm);
return -EINVAL;
}
+ rc = lsm_op_find(le32_to_cpu(*(__u32 *)lmm))->lsm_lmm_verify(lmm,
+ lmm_bytes, stripe_count);
+ return rc;
}
-int lov_alloc_memmd(struct lov_stripe_md **lsmp, int stripe_count, int pattern)
+int lov_alloc_memmd(struct lov_stripe_md **lsmp, int stripe_count,
+ int pattern, int magic)
{
int lsm_size = lov_stripe_md_size(stripe_count);
struct lov_oinfo *loi;
int i;
+ ENTRY;
+
+ CDEBUG(D_INFO, "alloc lsm, stripe_count %d, lsm_size %d\n",
+ stripe_count, lsm_size);
OBD_ALLOC(*lsmp, lsm_size);
- if (!*lsmp)
- return -ENOMEM;
+ if (!*lsmp) {
+ CERROR("can not allocate lsmp lsm_size %d stripe_count %d\n",
+ lsm_size, stripe_count);
+ RETURN(-ENOMEM);
+ }
spin_lock_init(&(*lsmp)->lsm_lock);
- (*lsmp)->lsm_magic = LOV_MAGIC;
+ (*lsmp)->lsm_magic = magic;
(*lsmp)->lsm_stripe_count = stripe_count;
(*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES * stripe_count;
(*lsmp)->lsm_xfersize = PTLRPC_MAX_BRW_SIZE * stripe_count;
(*lsmp)->lsm_pattern = pattern;
(*lsmp)->lsm_oinfo[0].loi_ost_idx = ~0;
-
+
for (i = 0, loi = (*lsmp)->lsm_oinfo; i < stripe_count; i++, loi++)
loi_init(loi);
- return lsm_size;
+ RETURN(lsm_size);
}
void lov_free_memmd(struct lov_stripe_md **lsmp)
{
- OBD_FREE(*lsmp, lov_stripe_md_size((*lsmp)->lsm_stripe_count));
+ struct lov_stripe_md *lsm = *lsmp;
+
+ LASSERT(lsm_op_find(lsm->lsm_magic) != NULL);
+ lsm_op_find(lsm->lsm_magic)->lsm_free(lsm);
+
*lsmp = NULL;
}
-int lov_unpackmd_v1(struct lov_obd *lov, struct lov_stripe_md *lsm,
- struct lov_mds_md_v1 *lmm)
-{
- struct lov_oinfo *loi;
- int i;
-
- lsm->lsm_object_id = le64_to_cpu(lmm->lmm_object_id);
- lsm->lsm_object_gr = le64_to_cpu(lmm->lmm_object_gr);
- lsm->lsm_stripe_size = le32_to_cpu(lmm->lmm_stripe_size);
- lsm->lsm_pattern = le32_to_cpu(lmm->lmm_pattern);
- lsm->lsm_xfersize = lsm->lsm_stripe_size * lsm->lsm_stripe_count;
-
- for (i = 0, loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++) {
- /* XXX LOV STACKING call down to osc_unpackmd() */
- loi->loi_id = le64_to_cpu(lmm->lmm_objects[i].l_object_id);
- loi->loi_gr = le64_to_cpu(lmm->lmm_objects[i].l_object_gr);
- loi->loi_ost_idx = le32_to_cpu(lmm->lmm_objects[i].l_ost_idx);
- loi->loi_ost_gen = le32_to_cpu(lmm->lmm_objects[i].l_ost_gen);
- if (loi->loi_ost_idx > lov->desc.ld_tgt_count) {
- CERROR("OST index %d more than OST count %d\n",
- loi->loi_ost_idx, lov->desc.ld_tgt_count);
- lov_dump_lmm_v1(D_WARNING, lmm);
- return -EINVAL;
- }
- loi++;
- }
-
- return 0;
-}
/* Unpack LOV object metadata from disk storage. It is packed in LE byte
* order and is opaque to the networking layer.
*/
-int lov_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp,
+int lov_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp,
struct lov_mds_md *lmm, int lmm_bytes)
{
struct obd_device *obd = class_exp2obd(exp);
struct lov_obd *lov = &obd->u.lov;
int rc = 0, stripe_count, lsm_size;
+ __u32 magic;
ENTRY;
/* If passed an MDS struct use values from there, otherwise defaults */
rc = lov_verify_lmm(lmm, lmm_bytes, &stripe_count);
if (rc)
RETURN(rc);
+ magic = le32_to_cpu(lmm->lmm_magic);
} else {
stripe_count = lov_get_stripecnt(lov, 0);
+ magic = LOV_MAGIC;
}
/* If we aren't passed an lsmp struct, we just want the size */
RETURN(0);
}
- lsm_size = lov_alloc_memmd(lsmp, stripe_count, LOV_PATTERN_RAID0);
+ lsm_size = lov_alloc_memmd(lsmp, stripe_count, LOV_PATTERN_RAID0,
+ magic);
if (lsm_size < 0)
RETURN(lsm_size);
if (!lmm)
RETURN(lsm_size);
- switch (le32_to_cpu(lmm->lmm_magic)) {
- case LOV_MAGIC_V1:
- rc = lov_unpackmd_v1(lov, *lsmp, lmm);
- break;
- }
-
+ LASSERT(lsm_op_find(magic) != NULL);
+ rc = lsm_op_find(magic)->lsm_unpackmd(lov, *lsmp, lmm);
if (rc) {
lov_free_memmd(lsmp);
RETURN(rc);
RETURN(-EINVAL);
}
- rc = lov_alloc_memmd(lsmp, stripe_count, lum.lmm_pattern);
+ rc = lov_alloc_memmd(lsmp, stripe_count, lum.lmm_pattern, LOV_MAGIC);
if (rc < 0)
RETURN(rc);
int stripe, i, rc = -EIO;
ENTRY;
- ost_idx = (req->rq_idx + 1) % ost_count;
+ ost_idx = (req->rq_idx + lsm->lsm_stripe_count) % ost_count;
for (i = 0; i < ost_count; i++, ost_idx = (ost_idx + 1) % ost_count) {
if (lov->tgts[ost_idx].active == 0) {
CDEBUG(D_HA, "lov idx %d inactive\n", ost_idx);
continue;
}
 /* check if an object has already been created on this ost */
- for (stripe = req->rq_stripe; stripe >= 0; stripe--) {
+ for (stripe = 0; stripe < lsm->lsm_stripe_count; stripe++) {
+ if (stripe == req->rq_stripe)
+ continue;
if (ost_idx == lsm->lsm_oinfo[stripe].loi_ost_idx)
break;
}
- if (stripe < 0) {
+ if (stripe >= lsm->lsm_stripe_count) {
req->rq_idx = ost_idx;
- rc = obd_create(lov->tgts[ost_idx].ltd_exp, req->rq_oa,
+ rc = obd_create(lov->tgts[ost_idx].ltd_exp, req->rq_oa,
&req->rq_md, set->set_oti);
if (!rc)
break;
if (newea || lsm->lsm_oinfo[0].loi_ost_idx >= ost_count) {
if (--ost_start_count <= 0) {
- ost_start_idx = ll_insecure_random_int();
+ ost_start_idx = ll_rand();
ost_start_count =
(LOV_CREATE_RESEED_MIN / max(ost_active_count, 1U) +
LOV_CREATE_RESEED_MULT) * max(ost_active_count, 1U);
lov_lockhp = set->set_lockh->llh_handles + req->rq_stripe;
LASSERT(lov_lockhp);
- if (lov_lockhp->cookie == 0)
+ if (!lustre_handle_is_used(lov_lockhp))
continue;
rc = obd_cancel(lov->tgts[req->rq_idx].ltd_exp, req->rq_md,
struct lustre_handle *lov_lockhp;
lov_lockhp = set->set_lockh->llh_handles + i;
- if (lov_lockhp->cookie == 0) {
+ if (!lustre_handle_is_used(lov_lockhp)) {
CDEBUG(D_HA, "lov idx %d subobj "LPX64" no lock?\n",
loi->loi_ost_idx, loi->loi_id);
continue;
continue;
sub_exp = lov->tgts[req->rq_idx].ltd_exp;
- err = obd_destroy(sub_exp, req->rq_oa, NULL, oti);
+ err = obd_destroy(sub_exp, req->rq_oa, NULL, oti, NULL);
if (err)
CERROR("Failed to uncreate objid "LPX64" subobj "
LPX64" on OST idx %d: rc = %d\n",
rc = lov_alloc_memmd(&set->set_md, stripes,
lov->desc.ld_pattern ?
- lov->desc.ld_pattern : LOV_PATTERN_RAID0);
+ lov->desc.ld_pattern : LOV_PATTERN_RAID0,
+ LOV_MAGIC);
if (rc < 0)
goto out_set;
newea = 1;
if (src_oa)
memcpy(req->rq_oa, src_oa, sizeof(*req->rq_oa));
req->rq_oa->o_id = loi->loi_id;
+ req->rq_oa->o_stripe_idx = i;
req->rq_buflen = sizeof(*req->rq_md);
OBD_ALLOC(req->rq_md, req->rq_buflen);
GOTO(out_set, rc = -ENOMEM);
memcpy(req->rq_oa, src_oa, sizeof(*req->rq_oa));
req->rq_oa->o_id = loi->loi_id;
+ req->rq_oa->o_stripe_idx = i;
if (src_oa->o_valid & OBD_MD_FLSIZE) {
if (lov_stripe_offset(lsm, src_oa->o_size, i,
GOTO(out_set, rc = -ENOMEM);
memcpy(req->rq_oa, src_oa, sizeof(*req->rq_oa));
req->rq_oa->o_id = loi->loi_id;
+ req->rq_oa->o_stripe_idx = i;
req->rq_extent.start = rs;
req->rq_extent.end = re;
GOTO(out_set, rc = -ENOMEM);
memcpy(req->rq_oa, src_oa, sizeof(*req->rq_oa));
req->rq_oa->o_id = loi->loi_id;
+ req->rq_oa->o_stripe_idx = i;
req->rq_extent.start = rs;
req->rq_extent.end = re;
-MODULES := lvfs #quotactl_test quotacheck_test
+MODULES := lvfs
@SERVER_TRUE@MODULES += fsfilt_@BACKINGFS@
lvfs-objs := lvfs_common.o lvfs_linux.o fsfilt.o upcall_cache.o
-#quotactl-objs := quotactl_test.o
-#quotaccheck-objs := quotacheck_test.o
ifeq ($(PATCHLEVEL),6)
fsfilt_@BACKINGFS@-objs := fsfilt-@BACKINGFS@.o
#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
#include <linux/iobuf.h>
#endif
+#include <linux/lustre_compat25.h>
#ifdef EXT3_MULTIBLOCK_ALLOCATOR
#include <linux/ext3_extents.h>
#define EXT3_XATTR_INDEX_TRUSTED 4
#endif
+static char *fsfilt_ext3_label(struct super_block *sb)
+{
+ return EXT3_SB(sb)->s_es->s_volume_name;
+}
+
+static char *fsfilt_ext3_uuid(struct super_block *sb)
+{
+ return EXT3_SB(sb)->s_es->s_uuid;
+}
+
/*
* We don't currently need any additional blocks for rmdir and
* unlink transactions because we are storing the OST oa_id inside
nblocks = (LLOG_CHUNK_SIZE >> inode->i_blkbits) +
EXT3_DELETE_TRANS_BLOCKS * logs;
break;
+ case FSFILT_OP_JOIN:
+ /* delete 2 files (the file and the array id) + create 1 file
+ * (array id); create/update logs for each stripe */
+ nblocks += 2 * EXT3_DELETE_TRANS_BLOCKS;
+
+ /* create array log for head file */
+ nblocks += 3;
+ nblocks += (EXT3_INDEX_EXTRA_TRANS_BLOCKS +
+ EXT3_SINGLEDATA_TRANS_BLOCKS);
+ /* update head file array */
+ nblocks += EXT3_INDEX_EXTRA_TRANS_BLOCKS +
+ EXT3_DATA_TRANS_BLOCKS;
+ break;
default: CERROR("unknown transaction start op %d\n", op);
LBUG();
}
/* make sure _something_ gets set - so new inode
 * goes to disk (probably won't work over XFS) */
if (!(iattr->ia_valid & (ATTR_MODE | ATTR_MTIME | ATTR_CTIME))){
- iattr->ia_valid |= ATTR_MODE;
- iattr->ia_mode = inode->i_mode;
+ iattr->ia_valid |= ATTR_MTIME;
+ iattr->ia_mtime = inode->i_mtime;
}
}
/* Don't allow setattr to change file type */
- iattr->ia_mode = (inode->i_mode & S_IFMT)|(iattr->ia_mode & ~S_IFMT);
+ if (iattr->ia_valid & ATTR_MODE)
+ iattr->ia_mode = (inode->i_mode & S_IFMT) |
+ (iattr->ia_mode & ~S_IFMT);
/* We set these flags on the client, but have already checked perms
* so don't confuse inode_change_ok. */
}
unlock_kernel();
-
return rc;
}
int rc = 0;
ENTRY;
+ /* FIXME: Can't do this because of nested transaction deadlock */
+ if (cmd == EXT3_IOC_SETFLAGS && (*(int *)arg) & EXT3_JOURNAL_DATA_FL) {
+ CERROR("can't set data journal flag on file\n");
+ RETURN(-EPERM);
+ }
+
if (inode->i_fop->ioctl)
rc = inode->i_fop->ioctl(inode, file, cmd, arg);
else
{
int rc;
- LASSERT(down_trylock(&inode->i_sem) != 0);
+ LASSERT_SEM_LOCKED(&inode->i_sem);
if (EXT3_I(inode)->i_file_acl /* || large inode EA flag */)
CWARN("setting EA on %lu/%u again... interesting\n",
{
int rc;
- LASSERT(down_trylock(&inode->i_sem) != 0);
+ LASSERT_SEM_LOCKED(&inode->i_sem);
lock_24kernel();
rc = ext3_xattr_get(inode, EXT3_XATTR_INDEX_TRUSTED,
return bg_start + colour + block;
}
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+#include <linux/locks.h>
+static void ll_unmap_underlying_metadata(struct super_block *sb,
+ unsigned long blocknr)
+{
+ struct buffer_head *old_bh;
+
+ old_bh = get_hash_table(sb->s_dev, blocknr, sb->s_blocksize);
+ if (old_bh) {
+ mark_buffer_clean(old_bh);
+ wait_on_buffer(old_bh);
+ clear_bit(BH_Req, &old_bh->b_state);
+ __brelse(old_bh);
+ }
+}
+#else
+#define ll_unmap_underlying_metadata(sb, blocknr) \
+ unmap_underlying_metadata((sb)->s_bdev, blocknr)
+#endif
+
static int ext3_ext_new_extent_cb(struct ext3_extents_tree *tree,
struct ext3_ext_path *path,
struct ext3_ext_cache *cex)
unlock_24kernel();
map:
if (err >= 0) {
- struct block_device *bdev = inode->i_sb->s_bdev;
-
/* map blocks */
if (bp->num == 0) {
CERROR("hmm. why do we find this extent?\n");
} else {
*(bp->created) = 1;
/* unmap any possible underlying metadata from
- * the block device mapping. bug 6998.
- * This only compiles on 2.6, but there are
- * no users of mballoc on 2.4. */
- unmap_underlying_metadata(bdev, *(bp->blocks));
+ * the block device mapping. bug 6998. */
+ ll_unmap_underlying_metadata(inode->i_sb,
+ *(bp->blocks));
}
bp->created++;
bp->blocks++;
cleanup:
return rc;
}
-#endif
+#endif /* EXT3_MULTIBLOCK_ALLOCATOR */
extern int ext3_map_inode_page(struct inode *inode, struct page *page,
unsigned long *blocks, int *created, int create);
set_opt(EXT3_SB(sb)->s_mount_opt, PDIROPS);
sb->s_flags |= S_PDIROPS;
#endif
+ if (!EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
+ CWARN("filesystem doesn't have dir_index feature enabled\n");
return 0;
}
return 0;
}
+static const char *op_quotafile[] = { "lquota.user", "lquota.group" };
+
+#define DQINFO_COPY(out, in) \
+do { \
+ Q_COPY(out, in, dqi_bgrace); \
+ Q_COPY(out, in, dqi_igrace); \
+ Q_COPY(out, in, dqi_flags); \
+ Q_COPY(out, in, dqi_valid); \
+} while (0)
+
+#define DQBLK_COPY(out, in) \
+do { \
+ Q_COPY(out, in, dqb_bhardlimit); \
+ Q_COPY(out, in, dqb_bsoftlimit); \
+ Q_COPY(out, in, dqb_curspace); \
+ Q_COPY(out, in, dqb_ihardlimit); \
+ Q_COPY(out, in, dqb_isoftlimit); \
+ Q_COPY(out, in, dqb_curinodes); \
+ Q_COPY(out, in, dqb_btime); \
+ Q_COPY(out, in, dqb_itime); \
+ Q_COPY(out, in, dqb_valid); \
+} while (0)
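+
+/* The VFS if_dqinfo/if_dqblk structures and the Lustre obd_quotactl
+ * payload share field names but are distinct types, so they are
+ * copied field by field with the macros above rather than by struct
+ * assignment. */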
+
+static int fsfilt_ext3_quotactl(struct super_block *sb,
+ struct obd_quotactl *oqc)
+{
+ int i, rc = 0, error = 0;
+ struct quotactl_ops *qcop;
+ struct if_dqinfo *info;
+ struct if_dqblk *dqblk;
+ ENTRY;
+
+ if (!sb->s_qcop)
+ RETURN(-ENOSYS);
+
+ OBD_ALLOC_PTR(info);
+ if (!info)
+ RETURN(-ENOMEM);
+ OBD_ALLOC_PTR(dqblk);
+ if (!dqblk) {
+ OBD_FREE_PTR(info);
+ RETURN(-ENOMEM);
+ }
+
+ DQINFO_COPY(info, &oqc->qc_dqinfo);
+ DQBLK_COPY(dqblk, &oqc->qc_dqblk);
+
+ qcop = sb->s_qcop;
+ if (oqc->qc_cmd == Q_QUOTAON || oqc->qc_cmd == Q_QUOTAOFF) {
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if (!Q_TYPESET(oqc, i))
+ continue;
+
+ if (oqc->qc_cmd == Q_QUOTAON) {
+ if (!qcop->quota_on)
+ GOTO(out, rc = -ENOSYS);
+ rc = qcop->quota_on(sb, i, oqc->qc_id,
+ (char *)op_quotafile[i]);
+ } else if (oqc->qc_cmd == Q_QUOTAOFF) {
+ if (!qcop->quota_off)
+ GOTO(out, rc = -ENOSYS);
+ rc = qcop->quota_off(sb, i);
+ }
+
+ if (rc == -EBUSY)
+ error = rc;
+ else if (rc)
+ GOTO(out, rc);
+ }
+ GOTO(out, rc = rc ?: error);
+ }
+
+ switch (oqc->qc_cmd) {
+ case Q_GETOINFO:
+ case Q_GETINFO:
+ if (!qcop->get_info)
+ GOTO(out, rc = -ENOSYS);
+ rc = qcop->get_info(sb, oqc->qc_type, info);
+ break;
+ case Q_SETQUOTA:
+ case Q_INITQUOTA:
+ if (!qcop->set_dqblk)
+ GOTO(out, rc = -ENOSYS);
+ rc = qcop->set_dqblk(sb, oqc->qc_type, oqc->qc_id, dqblk);
+ break;
+ case Q_GETOQUOTA:
+ case Q_GETQUOTA:
+ if (!qcop->get_dqblk)
+ GOTO(out, rc = -ENOSYS);
+ rc = qcop->get_dqblk(sb, oqc->qc_type, oqc->qc_id, dqblk);
+ break;
+ case Q_SYNC:
+ if (!sb->s_qcop->quota_sync)
+ GOTO(out, rc = -ENOSYS);
+ qcop->quota_sync(sb, oqc->qc_type);
+ break;
+ default:
+ CERROR("unsupported quotactl command: %d", oqc->qc_cmd);
+ LBUG();
+ }
+out:
+ DQINFO_COPY(&oqc->qc_dqinfo, info);
+ DQBLK_COPY(&oqc->qc_dqblk, dqblk);
+
+ OBD_FREE_PTR(info);
+ OBD_FREE_PTR(dqblk);
+
+ if (rc)
+ CDEBUG(D_QUOTA, "quotactl command %#x, id %u, type %d "
+ "failed: %d\n",
+ oqc->qc_cmd, oqc->qc_id, oqc->qc_type, rc);
+ RETURN(rc);
+}
+
+struct chk_dqblk{
+ struct hlist_node dqb_hash; /* quotacheck hash */
+ struct list_head dqb_list; /* in list also */
+ qid_t dqb_id; /* uid/gid */
+ short dqb_type; /* USRQUOTA/GRPQUOTA */
+ __u32 dqb_bhardlimit; /* block hard limit */
+ __u32 dqb_bsoftlimit; /* block soft limit */
+ qsize_t dqb_curspace; /* current space */
+ __u32 dqb_ihardlimit; /* inode hard limit */
+ __u32 dqb_isoftlimit; /* inode soft limit */
+ __u32 dqb_curinodes; /* current inodes */
+ __u64 dqb_btime; /* block grace time */
+ __u64 dqb_itime; /* inode grace time */
+ __u32 dqb_valid; /* flag for above fields */
+};
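+
+/* chk_dqblk entries form the in-memory usage cache built up during
+ * quotacheck: one entry per (id, type) pair, kept in a hash keyed on
+ * both the id and the quota type, plus a list for the final commit. */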
+
+static inline unsigned int
+chkquot_hash(qid_t id, int type)
+{
+ return (id * (MAXQUOTAS - type)) % NR_DQHASH;
+}
+
+static inline struct chk_dqblk *
+find_chkquot(struct hlist_head *head, qid_t id, int type)
+{
+ struct hlist_node *node;
+ struct chk_dqblk *cdqb;
+
+ hlist_for_each(node, head) {
+ cdqb = hlist_entry(node, struct chk_dqblk, dqb_hash);
+ if (cdqb->dqb_id == id && cdqb->dqb_type == type)
+ return cdqb;
+ }
+
+ return NULL;
+}
+
+static struct chk_dqblk *alloc_chkquot(qid_t id, int type)
+{
+ struct chk_dqblk *cdqb;
+
+ OBD_ALLOC_PTR(cdqb);
+ if (cdqb) {
+ INIT_HLIST_NODE(&cdqb->dqb_hash);
+ INIT_LIST_HEAD(&cdqb->dqb_list);
+ cdqb->dqb_id = id;
+ cdqb->dqb_type = type;
+ }
+
+ return cdqb;
+}
+
+static struct chk_dqblk *
+cqget(struct super_block *sb, struct hlist_head *hash, struct list_head *list,
+ qid_t id, int type, int first_check)
+{
+ struct hlist_head *head = hash + chkquot_hash(id, type);
+ struct if_dqblk dqb;
+ struct chk_dqblk *cdqb;
+ int rc;
+
+ cdqb = find_chkquot(head, id, type);
+ if (cdqb)
+ return cdqb;
+
+ cdqb = alloc_chkquot(id, type);
+ if (!cdqb)
+ return NULL;
+
+ if (!first_check) {
+ rc = sb->s_qcop->get_dqblk(sb, type, id, &dqb);
+ if (rc) {
+ CERROR("get_dqblk of id %u, type %d failed: %d\n",
+ id, type, rc);
+ } else {
+ DQBLK_COPY(cdqb, &dqb);
+ cdqb->dqb_curspace = 0;
+ cdqb->dqb_curinodes = 0;
+ }
+ }
+
+ hlist_add_head(&cdqb->dqb_hash, head);
+ list_add_tail(&cdqb->dqb_list, list);
+
+ return cdqb;
+}
+
+static inline int quota_onoff(struct super_block *sb, int cmd, int type)
+{
+ struct obd_quotactl *oqctl;
+ int rc;
+
+ OBD_ALLOC_PTR(oqctl);
+ if (!oqctl)
+ RETURN(-ENOMEM);
+
+ oqctl->qc_cmd = cmd;
+ oqctl->qc_id = QFMT_LDISKFS;
+ oqctl->qc_type = type;
+ rc = fsfilt_ext3_quotactl(sb, oqctl);
+
+ OBD_FREE_PTR(oqctl);
+ return rc;
+}
+
+static inline int read_old_dqinfo(struct super_block *sb, int type,
+ struct if_dqinfo *dqinfo)
+{
+ struct obd_quotactl *oqctl;
+ int rc;
+ ENTRY;
+
+ OBD_ALLOC_PTR(oqctl);
+ if (!oqctl)
+ RETURN(-ENOMEM);
+
+ oqctl->qc_cmd = Q_GETINFO;
+ oqctl->qc_type = type;
+ rc = fsfilt_ext3_quotactl(sb, oqctl);
+ if (!rc)
+ DQINFO_COPY(&dqinfo[type], &oqctl->qc_dqinfo);
+
+ OBD_FREE_PTR(oqctl);
+ RETURN(rc);
+}
+
static inline struct ext3_group_desc *
get_group_desc(struct super_block *sb, int group)
{
return inode;
}
+struct qchk_ctxt {
+ struct hlist_head qckt_hash[NR_DQHASH]; /* quotacheck hash */
+ struct list_head qckt_list; /* quotacheck list */
+ int qckt_first_check[MAXQUOTAS]; /* 1 if no old quotafile */
+ struct if_dqinfo qckt_dqinfo[MAXQUOTAS]; /* old dqinfo */
+};
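+
+/*
+ * Quotacheck pipeline implemented below: scan each group's inode
+ * bitmap and accumulate per-uid/gid space and inode usage into
+ * chk_dqblk entries (add_inode_quota), merge in ids that only have
+ * limits in the old quota files, rewrite the quota files from scratch
+ * (create_new_quota_files), then commit the accumulated usage through
+ * Q_SETQUOTA (commit_chkquot, driven by prune_chkquots).
+ */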
+
+static int add_inode_quota(struct inode *inode, struct qchk_ctxt *qctxt,
+ struct obd_quotactl *oqc)
+{
+ struct chk_dqblk *cdqb[MAXQUOTAS] = { NULL, };
+ loff_t size = 0;
+ qid_t qid[MAXQUOTAS];
+ int cnt, i, rc = 0;
+
+ if (!inode)
+ return 0;
+
+ qid[USRQUOTA] = inode->i_uid;
+ qid[GRPQUOTA] = inode->i_gid;
+
+ if (S_ISDIR(inode->i_mode) ||
+ S_ISREG(inode->i_mode) ||
+ S_ISLNK(inode->i_mode))
+ size = inode_get_bytes(inode);
+
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ if (!Q_TYPESET(oqc, cnt))
+ continue;
+
+ cdqb[cnt] = cqget(inode->i_sb, qctxt->qckt_hash,
+ &qctxt->qckt_list, qid[cnt], cnt,
+ qctxt->qckt_first_check[cnt]);
+ if (!cdqb[cnt]) {
+ rc = -ENOMEM;
+ break;
+ }
+
+ cdqb[cnt]->dqb_curspace += size;
+ cdqb[cnt]->dqb_curinodes++;
+ }
+
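+ /* on allocation failure, back out the usage already charged to
+ * the other quota types so the hash stays consistent */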
+ if (rc) {
+ for (i = 0; i < cnt; i++) {
+ if (!Q_TYPESET(oqc, i))
+ continue;
+ LASSERT(cdqb[i]);
+ cdqb[i]->dqb_curspace -= size;
+ cdqb[i]->dqb_curinodes--;
+ }
+ }
+
+ return rc;
+}
+
+static int v2_write_dqheader(struct file *f, int type)
+{
+ static const __u32 quota_magics[] = V2_INITQMAGICS;
+ static const __u32 quota_versions[] = V2_INITQVERSIONS;
+ struct v2_disk_dqheader dqhead;
+ loff_t offset = 0;
+
+ CLASSERT(ARRAY_SIZE(quota_magics) == ARRAY_SIZE(quota_versions));
+ LASSERT(0 <= type && type < ARRAY_SIZE(quota_magics));
+
+ dqhead.dqh_magic = cpu_to_le32(quota_magics[type]);
+ dqhead.dqh_version = cpu_to_le32(quota_versions[type]);
+
+ return cfs_user_write(f, (char *)&dqhead, sizeof(dqhead), &offset);
+}
+
+/* write dqinfo struct in a new quota file */
+static int v2_write_dqinfo(struct file *f, int type, struct if_dqinfo *info)
+{
+ struct v2_disk_dqinfo dqinfo;
+ __u32 blocks = V2_DQTREEOFF + 1;
+ loff_t offset = V2_DQINFOOFF;
+
+ if (info) {
+ dqinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace);
+ dqinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
+ dqinfo.dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK &
+ ~DQF_INFO_DIRTY);
+ } else {
+ dqinfo.dqi_bgrace = cpu_to_le32(MAX_DQ_TIME);
+ dqinfo.dqi_igrace = cpu_to_le32(MAX_IQ_TIME);
+ dqinfo.dqi_flags = 0;
+ }
+
+ dqinfo.dqi_blocks = cpu_to_le32(blocks);
+ dqinfo.dqi_free_blk = 0;
+ dqinfo.dqi_free_entry = 0;
+
+ return cfs_user_write(f, (char *)&dqinfo, sizeof(dqinfo), &offset);
+}
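+
+/* Layout of a freshly created v2 quota file: v2_disk_dqheader
+ * (magic/version) at offset 0 and v2_disk_dqinfo at V2_DQINFOOFF; the
+ * tree blocks that hold the per-id dqblk entries are only written
+ * later, once the limits are committed through Q_SETQUOTA. */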
+
+static int create_new_quota_files(struct qchk_ctxt *qctxt,
+ struct obd_quotactl *oqc)
+{
+ int i, rc = 0;
+ ENTRY;
+
+ for (i = 0; i < MAXQUOTAS; i++) {
+ struct if_dqinfo *info = qctxt->qckt_first_check[i] ?
+ NULL : &qctxt->qckt_dqinfo[i];
+ struct file *file;
+
+ if (!Q_TYPESET(oqc, i))
+ continue;
+
+ file = filp_open(op_quotafile[i], O_RDWR | O_CREAT | O_TRUNC,
+ 0644);
+ if (IS_ERR(file)) {
+ rc = PTR_ERR(file);
+ CERROR("can't create %s file: rc = %d\n",
+ op_quotafile[i], rc);
+ GOTO(out, rc);
+ }
+
+ if (!S_ISREG(file->f_dentry->d_inode->i_mode)) {
+ CERROR("file %s is not regular", op_quotafile[i]);
+ filp_close(file, 0);
+ GOTO(out, rc = -EINVAL);
+ }
+
+ rc = v2_write_dqheader(file, i);
+ if (rc) {
+ filp_close(file, 0);
+ GOTO(out, rc);
+ }
+
+ rc = v2_write_dqinfo(file, i, info);
+ filp_close(file, 0);
+ if (rc)
+ GOTO(out, rc);
+ }
+
+out:
+ RETURN(rc);
+}
+
+
+static int commit_chkquot(struct super_block *sb, struct qchk_ctxt *qctxt,
+ struct chk_dqblk *cdqb)
+{
+ struct obd_quotactl *oqc;
+ struct timeval now;
+ int rc;
+ ENTRY;
+
+ OBD_ALLOC_PTR(oqc);
+ if (!oqc)
+ RETURN(-ENOMEM);
+
+ do_gettimeofday(&now);
+
+ if (cdqb->dqb_bsoftlimit &&
+ toqb(cdqb->dqb_curspace) >= cdqb->dqb_bsoftlimit &&
+ !cdqb->dqb_btime)
+ cdqb->dqb_btime = now.tv_sec +
+ qctxt->qckt_dqinfo[cdqb->dqb_type].dqi_bgrace;
+
+ if (cdqb->dqb_isoftlimit &&
+ cdqb->dqb_curinodes >= cdqb->dqb_isoftlimit &&
+ !cdqb->dqb_itime)
+ cdqb->dqb_itime = now.tv_sec +
+ qctxt->qckt_dqinfo[cdqb->dqb_type].dqi_igrace;
+
+ cdqb->dqb_valid = QIF_ALL;
+
+ oqc->qc_cmd = Q_SETQUOTA;
+ oqc->qc_type = cdqb->dqb_type;
+ oqc->qc_id = cdqb->dqb_id;
+ DQBLK_COPY(&oqc->qc_dqblk, cdqb);
+
+ rc = fsfilt_ext3_quotactl(sb, oqc);
+ OBD_FREE_PTR(oqc);
+ RETURN(rc);
+}
+
+static int prune_chkquots(struct super_block *sb,
+ struct qchk_ctxt *qctxt, int error)
+{
+ struct chk_dqblk *cdqb, *tmp;
+ int rc;
+
+ list_for_each_entry_safe(cdqb, tmp, &qctxt->qckt_list, dqb_list) {
+ if (!error) {
+ rc = commit_chkquot(sb, qctxt, cdqb);
+ if (rc)
+ error = rc;
+ }
+ hlist_del_init(&cdqb->dqb_hash);
+ list_del(&cdqb->dqb_list);
+ OBD_FREE_PTR(cdqb);
+ }
+
+ return error;
+}
+
+static int fsfilt_ext3_quotacheck(struct super_block *sb,
+ struct obd_quotactl *oqc)
+{
+ struct ext3_sb_info *sbi = EXT3_SB(sb);
+ int i, group;
+ struct qchk_ctxt *qctxt;
+ struct buffer_head *bitmap_bh = NULL;
+ unsigned long ino;
+ struct inode *inode;
+ int rc = 0;
+ ENTRY;
+
+ /* turn on quota and read the dqinfo if it already exists */
+ OBD_ALLOC_PTR(qctxt);
+ if (!qctxt) {
+ oqc->qc_stat = -ENOMEM;
+ RETURN(-ENOMEM);
+ }
+
+ for (i = 0; i < NR_DQHASH; i++)
+ INIT_HLIST_HEAD(&qctxt->qckt_hash[i]);
+ INIT_LIST_HEAD(&qctxt->qckt_list);
+
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if (!Q_TYPESET(oqc, i))
+ continue;
+
+ rc = quota_onoff(sb, Q_QUOTAON, i);
+ if (!rc || rc == -EBUSY) {
+ rc = read_old_dqinfo(sb, i, qctxt->qckt_dqinfo);
+ if (rc)
+ GOTO(out, rc);
+ } else if (rc == -ENOENT) {
+ qctxt->qckt_first_check[i] = 1;
+ } else if (rc) {
+ GOTO(out, rc);
+ }
+ }
+
+ /* check quota and update in hash */
+ for (group = 0; group < sbi->s_groups_count; group++) {
+ ino = group * sbi->s_inodes_per_group + 1;
+ bitmap_bh = read_inode_bitmap(sb, group);
+ if (!bitmap_bh) {
+ CERROR("read_inode_bitmap group %d failed", group);
+ GOTO(out, -EIO);
+ }
+
+ for (i = 0; i < sbi->s_inodes_per_group; i++, ino++) {
+ if (ino < sbi->s_first_ino)
+ continue;
+
+ inode = ext3_iget_inuse(sb, bitmap_bh, i, ino);
+ rc = add_inode_quota(inode, qctxt, oqc);
+ iput(inode);
+ if (rc) {
+ brelse(bitmap_bh);
+ GOTO(out, rc);
+ }
+ }
+
+ brelse(bitmap_bh);
+ }
+
+ /* read old quota limits from the old quota file (only for users
+ * that have limits set but own no files, so the inode scan above
+ * never saw them) */
#ifdef HAVE_QUOTA_SUPPORT
-# include "fsfilt_ext3_quota.h"
+ for (i = 0; i < MAXQUOTAS; i++) {
+ struct list_head id_list;
+ struct dquot_id *dqid, *tmp;
+
+ if (!Q_TYPESET(oqc, i))
+ continue;
+
+ if (qctxt->qckt_first_check[i])
+ continue;
+
+ LASSERT(sb_dqopt(sb)->files[i] != NULL);
+ INIT_LIST_HEAD(&id_list);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,12)
+ rc = lustre_get_qids(sb_dqopt(sb)->files[i], NULL, i, &id_list);
+#else
+ rc = lustre_get_qids(NULL, sb_dqopt(sb)->files[i], i, &id_list);
+#endif
+ if (rc)
+ CERROR("read old limits failed. (rc:%d)\n", rc);
+
+ list_for_each_entry_safe(dqid, tmp, &id_list, di_link) {
+ list_del_init(&dqid->di_link);
+
+ if (!rc)
+ cqget(sb, qctxt->qckt_hash, &qctxt->qckt_list,
+ dqid->di_id, i,
+ qctxt->qckt_first_check[i]);
+ kfree(dqid);
+ }
+ }
+#endif
+ /* turn quota off because we are about to dump chk_dqblk to files */
+ quota_onoff(sb, Q_QUOTAOFF, oqc->qc_type);
+
+ rc = create_new_quota_files(qctxt, oqc);
+ if (rc)
+ GOTO(out, rc);
+
+ /* we use vfs functions to set dqblk, so turn quota on */
+ rc = quota_onoff(sb, Q_QUOTAON, oqc->qc_type);
+out:
+ /* dump and free chk_dqblk */
+ rc = prune_chkquots(sb, qctxt, rc);
+ OBD_FREE_PTR(qctxt);
+
+ /* turn quota off; `lfs quotacheck` will turn it back on once
+ * quotacheck has finished on all nodes. */
+ quota_onoff(sb, Q_QUOTAOFF, oqc->qc_type);
+
+ oqc->qc_stat = rc;
+ if (rc)
+ CERROR("quotacheck failed: rc = %d\n", rc);
+
+ RETURN(rc);
+}
+
+#ifdef HAVE_QUOTA_SUPPORT
+static int fsfilt_ext3_quotainfo(struct lustre_quota_info *lqi, int type,
+ int cmd)
+{
+ int rc = 0;
+ ENTRY;
+
+ if (lqi->qi_files[type] == NULL) {
+ CERROR("operate qinfo before it's enabled!\n");
+ RETURN(-EIO);
+ }
+
+ switch (cmd) {
+ case QFILE_CHK:
+ rc = lustre_check_quota_file(lqi, type);
+ break;
+ case QFILE_RD_INFO:
+ rc = lustre_read_quota_info(lqi, type);
+ break;
+ case QFILE_WR_INFO:
+ rc = lustre_write_quota_info(lqi, type);
+ break;
+ case QFILE_INIT_INFO:
+ rc = lustre_init_quota_info(lqi, type);
+ break;
+ default:
+ CERROR("Unsupported admin quota file cmd %d\n", cmd);
+ LBUG();
+ break;
+ }
+ RETURN(rc);
+}
+
+static int fsfilt_ext3_qids(struct file *file, struct inode *inode, int type,
+ struct list_head *list)
+{
+ return lustre_get_qids(file, inode, type, list);
+}
+
+static int fsfilt_ext3_dquot(struct lustre_dquot *dquot, int cmd)
+{
+ int rc = 0;
+ ENTRY;
+
+ if (dquot->dq_info->qi_files[dquot->dq_type] == NULL) {
+ CERROR("operate dquot before it's enabled!\n");
+ RETURN(-EIO);
+ }
+
+ switch (cmd) {
+ case QFILE_RD_DQUOT:
+ rc = lustre_read_dquot(dquot);
+ break;
+ case QFILE_WR_DQUOT:
+ if (dquot->dq_dqb.dqb_ihardlimit ||
+ dquot->dq_dqb.dqb_isoftlimit ||
+ dquot->dq_dqb.dqb_bhardlimit ||
+ dquot->dq_dqb.dqb_bsoftlimit)
+ clear_bit(DQ_FAKE_B, &dquot->dq_flags);
+ else
+ set_bit(DQ_FAKE_B, &dquot->dq_flags);
+
+ rc = lustre_commit_dquot(dquot);
+ if (rc >= 0)
+ rc = 0;
+ break;
+ default:
+ CERROR("Unsupported admin quota file cmd %d\n", cmd);
+ LBUG();
+ break;
+ }
+ RETURN(rc);
+}
#endif
static struct fsfilt_operations fsfilt_ext3_ops = {
.fs_type = "ext3",
.fs_owner = THIS_MODULE,
+ .fs_label = fsfilt_ext3_label,
+ .fs_uuid = fsfilt_ext3_uuid,
.fs_start = fsfilt_ext3_start,
.fs_brw_start = fsfilt_ext3_brw_start,
.fs_commit = fsfilt_ext3_commit,
.fs_setup = fsfilt_ext3_setup,
.fs_send_bio = fsfilt_ext3_send_bio,
.fs_get_op_len = fsfilt_ext3_get_op_len,
-#ifdef HAVE_QUOTA_SUPPORT
.fs_quotactl = fsfilt_ext3_quotactl,
.fs_quotacheck = fsfilt_ext3_quotacheck,
+#ifdef HAVE_QUOTA_SUPPORT
.fs_quotainfo = fsfilt_ext3_quotainfo,
+ .fs_qids = fsfilt_ext3_qids,
.fs_dquot = fsfilt_ext3_dquot,
#endif
};
 atomic_read(&current->fs->pwdmnt->mnt_count));
*/
- LASSERT(current->fs->pwd == new_ctx->pwd);
- LASSERT(current->fs->pwdmnt == new_ctx->pwdmnt);
+ LASSERTF(current->fs->pwd == new_ctx->pwd, "%p != %p\n",
+ current->fs->pwd, new_ctx->pwd);
+ LASSERTF(current->fs->pwdmnt == new_ctx->pwdmnt, "%p != %p\n",
+ current->fs->pwdmnt, new_ctx->pwdmnt);
set_fs(saved->fs);
set_fs_pwd(current->fs, saved->pwdmnt, saved->pwd);
static struct lprocfs_vars lprocfs_obd_vars[] = {
{ "uuid", lprocfs_rd_uuid, 0, 0 },
{ "ping", 0, lprocfs_wr_ping, 0 },
+ { "connect_flags", lprocfs_rd_connect_flags, 0, 0 },
{ "blocksize", lprocfs_rd_blksize, 0, 0 },
{ "kbytestotal", lprocfs_rd_kbytestotal, 0, 0 },
{ "kbytesfree", lprocfs_rd_kbytesfree, 0, 0 },
#include <linux/lustre_mds.h>
-void mdc_pack_req_body(struct ptlrpc_request *);
+void mdc_pack_req_body(struct ptlrpc_request *req, int offset,
+ __u64 valid, struct ll_fid *fid, int ea_size);
void mdc_pack_rep_body(struct ptlrpc_request *);
-void mdc_readdir_pack(struct ptlrpc_request *req, __u64 offset, __u32 size,
- struct ll_fid *mdc_fid);
+void mdc_readdir_pack(struct ptlrpc_request *req, int pos, __u64 offset,
+ __u32 size, struct ll_fid *mdc_fid);
void mdc_getattr_pack(struct ptlrpc_request *req, int valid, int offset,
int flags, struct mdc_op_data *data);
-void mdc_setattr_pack(struct ptlrpc_request *req,
+void mdc_setattr_pack(struct ptlrpc_request *req, int offset,
struct mdc_op_data *data,
struct iattr *iattr, void *ea, int ealen,
void *ea2, int ea2len);
void mdc_open_pack(struct ptlrpc_request *req, int offset,
struct mdc_op_data *op_data, __u32 mode, __u64 rdev,
__u32 flags, const void *data, int datalen);
+void mdc_join_pack(struct ptlrpc_request *req, int offset,
+ struct mdc_op_data *op_data, __u64 head_size);
void mdc_unlink_pack(struct ptlrpc_request *req, int offset,
struct mdc_op_data *data);
void mdc_link_pack(struct ptlrpc_request *req, int offset,
}
/* Quota stuff */
-#ifdef HAVE_QUOTA_SUPPORT
-int mdc_quotacheck(struct obd_export *exp, struct obd_quotactl *oqctl);
-int mdc_poll_quotacheck(struct obd_export *exp, struct if_quotacheck *qchk);
-int mdc_quotactl(struct obd_export *exp, struct obd_quotactl *oqctl);
-#else
-static inline int mdc_quotacheck(struct obd_export *exp, struct obd_quotactl *oqctl)
-{
- return -ENOTSUPP;
-}
-
-static inline int mdc_poll_quotacheck(struct obd_export *exp, struct if_quotacheck *qchk)
-{
- return -ENOTSUPP;
-}
-
-static inline int mdc_quotactl(struct obd_export *exp, struct obd_quotactl *oqctl)
-{
- return -ENOTSUPP;
-}
-#endif
-
+extern quota_interface_t *quota_interface;
#endif
#endif
-void mdc_readdir_pack(struct ptlrpc_request *req, __u64 offset, __u32 size,
- struct ll_fid *mdc_fid)
+void mdc_readdir_pack(struct ptlrpc_request *req, int pos, __u64 offset,
+ __u32 size, struct ll_fid *mdc_fid)
{
struct mds_body *b;
- b = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*b));
+ b = lustre_msg_buf(req->rq_reqmsg, pos, sizeof (*b));
b->fsuid = current->fsuid;
b->fsgid = current->fsgid;
b->capability = current->cap_effective;
b->capability = current->cap_effective;
}
-void mdc_pack_req_body(struct ptlrpc_request *req)
+void mdc_pack_req_body(struct ptlrpc_request *req, int offset,
+ __u64 valid, struct ll_fid *fid, int ea_size)
{
- struct mds_body *b = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*b));
+ struct mds_body *b = lustre_msg_buf(req->rq_reqmsg, offset, sizeof(*b));
+
+ if (fid)
+ b->fid1 = *fid;
+ b->valid = valid;
+ b->eadatasize = ea_size;
mdc_pack_body(b);
}
((flags & O_APPEND) ? MDS_OPEN_APPEND : 0) |
((flags & O_SYNC) ? MDS_OPEN_SYNC : 0) |
((flags & O_DIRECTORY) ? MDS_OPEN_DIRECTORY : 0) |
+ ((flags & O_JOIN_FILE) ? MDS_OPEN_JOIN_FILE : 0) |
0;
}
/* packing of MDS records */
+void mdc_join_pack(struct ptlrpc_request *req, int offset,
+ struct mdc_op_data *op_data, __u64 head_size)
+{
+ struct mds_rec_join *rec;
+
+ rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof(*rec));
+ LASSERT(rec != NULL);
+ rec->jr_fid = op_data->fid2;
+ rec->jr_headsize = head_size;
+}
+
void mdc_open_pack(struct ptlrpc_request *req, int offset,
struct mdc_op_data *op_data, __u32 mode, __u64 rdev,
__u32 flags, const void *lmm, int lmmlen)
}
}
-void mdc_setattr_pack(struct ptlrpc_request *req, struct mdc_op_data *data,
- struct iattr *iattr, void *ea, int ealen,
- void *ea2, int ea2len)
+void mdc_setattr_pack(struct ptlrpc_request *req, int offset,
+ struct mdc_op_data *data, struct iattr *iattr,
+ void *ea, int ealen, void *ea2, int ea2len)
{
- struct mds_rec_setattr *rec = lustre_msg_buf(req->rq_reqmsg, 0,
+ struct mds_rec_setattr *rec = lustre_msg_buf(req->rq_reqmsg, offset,
sizeof (*rec));
rec->sa_opcode = REINT_SETATTR;
rec->sa_fsuid = current->fsuid;
if (ealen == 0)
return;
- memcpy(lustre_msg_buf(req->rq_reqmsg, 1, ealen), ea, ealen);
+ memcpy(lustre_msg_buf(req->rq_reqmsg, offset + 1, ealen), ea, ealen);
if (ea2len == 0)
return;
- memcpy(lustre_msg_buf(req->rq_reqmsg, 2, ea2len), ea2, ea2len);
+ memcpy(lustre_msg_buf(req->rq_reqmsg, offset + 2, ea2len), ea2, ea2len);
}
void mdc_unlink_pack(struct ptlrpc_request *req, int offset,
}
}
-void mdc_getattr_pack(struct ptlrpc_request *req, int valid, int offset,
+void mdc_getattr_pack(struct ptlrpc_request *req, int offset, int valid,
int flags, struct mdc_op_data *data)
{
struct mds_body *b;
}
EXPORT_SYMBOL(it_set_disposition);
+void it_clear_disposition(struct lookup_intent *it, int flag)
+{
+ it->d.lustre.it_disposition &= ~flag;
+}
+
static int it_to_lock_mode(struct lookup_intent *it)
{
/* CREAT needs to be tested before open (both could be set) */
if (it->it_op & IT_CREAT)
- return LCK_PW;
+ return LCK_CW;
else if (it->it_op & (IT_READDIR | IT_GETATTR | IT_OPEN | IT_LOOKUP))
- return LCK_PR;
+ return LCK_CR;
LBUG();
RETURN(-EINVAL);
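+/* Editorial note, hedged: the PW/PR -> CW/CR switch above takes the weakest
+ * write/read modes in the LDLM compatibility matrix for intent locks, so
+ * concurrent intent requests on the same resource no longer serialize
+ * against each other while still conflicting with exclusive users. */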
struct obd_device *obddev = class_exp2obd(exp);
struct ldlm_res_id res_id =
{ .name = {data->fid1.id, data->fid1.generation} };
- int size[5] = {sizeof(struct ldlm_request), sizeof(struct ldlm_intent)};
- int rc, flags = extra_lock_flags | LDLM_FL_HAS_INTENT;
- int repsize[4] = {sizeof(struct ldlm_reply),
- sizeof(struct mds_body),
- obddev->u.cli.cl_max_mds_easize,
- obddev->u.cli.cl_max_mds_cookiesize};
- struct ldlm_reply *dlm_rep;
- struct ldlm_intent *lit;
+ ldlm_policy_data_t policy = { .l_inodebits = { MDS_INODELOCK_LOOKUP } };
struct ldlm_request *lockreq;
+ struct ldlm_intent *lit;
+ int size[6] = {[MDS_REQ_INTENT_LOCKREQ_OFF] = sizeof(*lockreq),
+ [MDS_REQ_INTENT_IT_OFF] = sizeof(*lit) };
+ struct ldlm_reply *dlm_rep;
+ int repsize[4] = {sizeof(*dlm_rep),
+ sizeof(struct mds_body),
+ obddev->u.cli.cl_max_mds_easize};
void *eadata;
unsigned long irqflags;
- int reply_buffers = 0;
+ int repbufcnt = 3, req_buffers = 2;
+ int rc, flags = extra_lock_flags | LDLM_FL_HAS_INTENT;
ENTRY;
+ LASSERTF(lock_type == LDLM_IBITS, "lock type %d\n", lock_type);
// LDLM_DEBUG_NOLOCK("mdsintent=%s,name=%s,dir=%lu",
// ldlm_it2str(it->it_op), it_name, it_inode->i_ino);
if (it->it_op & IT_OPEN) {
it->it_create_mode |= S_IFREG;
- size[2] = sizeof(struct mds_rec_create);
- size[3] = data->namelen + 1;
+ size[req_buffers++] = sizeof(struct mds_rec_create);
+ size[req_buffers++] = data->namelen + 1;
/* As an optimization, we allocate an RPC request buffer for
* at least a default-sized LOV EA even if we aren't sending
* one. We grow the whole request to the next power-of-two
* size since we get that much from a slab allocation anyways.
* This avoids an allocation below in the common case where
* we need to save a default-sized LOV EA for open replay. */
- size[4] = max(lmmsize, obddev->u.cli.cl_default_mds_easize);
- rc = lustre_msg_size(5, size);
+ size[req_buffers++] = max(lmmsize,
+ obddev->u.cli.cl_default_mds_easize);
+ rc = lustre_msg_size(req_buffers, size);
if (rc & (rc - 1))
- size[4] = min(size[4] + round_up(rc) - rc,
- obddev->u.cli.cl_max_mds_easize);
- req = ptlrpc_prep_req(class_exp2cliimp(exp), LDLM_ENQUEUE,
- 5, size, NULL);
+ size[req_buffers - 1] = min(size[req_buffers - 1] +
+ round_up(rc) - rc,
+ obddev->u.cli.cl_max_mds_easize);
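+ /* Worked example (editorial, assuming round_up() rounds up to
+ * the next power of two): if lustre_msg_size() returns 5000,
+ * rc & (rc - 1) is non-zero (5000 is not a power of two),
+ * round_up(5000) = 8192, so the EA buffer grows by
+ * 8192 - 5000 = 3192 bytes, capped at cl_max_mds_easize, and
+ * the request exactly fills the slab object it consumes anyway. */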
+
+ if (it->it_flags & O_JOIN_FILE) {
+ __u64 head_size = *(__u32*)cb_data;
+ __u32 tsize = *(__u32*)lmm;
+
+ /* join is like an unlink of the tail */
+ policy.l_inodebits.bits = MDS_INODELOCK_UPDATE;
+ size[req_buffers++] = sizeof(struct mds_rec_join);
+ req = ptlrpc_prep_req(class_exp2cliimp(exp),
+ LUSTRE_DLM_VERSION, LDLM_ENQUEUE,
+ req_buffers, size, NULL);
+ /* when joining a file, the cb_data and lmm args together
+ * indicate the head file size */
+ mdc_join_pack(req, req_buffers - 1, data,
+ (head_size << 32) | tsize);
+ cb_data = NULL;
+ lmm = NULL;
+ } else
+ req = ptlrpc_prep_req(class_exp2cliimp(exp),
+ LUSTRE_DLM_VERSION, LDLM_ENQUEUE,
+ req_buffers, size, NULL);
if (!req)
RETURN(-ENOMEM);
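+ /* Editorial sketch of the jr_headsize encoding above, with a
+ * hypothetical helper pair (not patch code):
+ * static inline __u64 join_pack_size(__u32 hi, __u32 lo)
+ * { return ((__u64)hi << 32) | lo; }
+ * and on the server side: hi = (__u32)(v >> 32); lo = (__u32)v; */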
spin_unlock_irqrestore (&req->rq_lock, irqflags);
/* pack the intent */
- lit = lustre_msg_buf(req->rq_reqmsg, 1, sizeof (*lit));
+ lit = lustre_msg_buf(req->rq_reqmsg, MDS_REQ_INTENT_IT_OFF,
+ sizeof (*lit));
lit->opc = (__u64)it->it_op;
/* pack the intended request */
- mdc_open_pack(req, 2, data, it->it_create_mode, 0,
+ mdc_open_pack(req, MDS_REQ_INTENT_REC_OFF, data,
+ it->it_create_mode, 0,
it->it_flags, lmm, lmmsize);
- /* get ready for the reply */
- reply_buffers = 3;
- req->rq_replen = lustre_msg_size(3, repsize);
+
+ repsize[repbufcnt++] = LUSTRE_POSIX_ACL_MAX_SIZE;
} else if (it->it_op & IT_UNLINK) {
- size[2] = sizeof(struct mds_rec_unlink);
- size[3] = data->namelen + 1;
- req = ptlrpc_prep_req(class_exp2cliimp(exp), LDLM_ENQUEUE, 4,
- size, NULL);
+ size[req_buffers++] = sizeof(struct mds_rec_unlink);
+ size[req_buffers++] = data->namelen + 1;
+ policy.l_inodebits.bits = MDS_INODELOCK_UPDATE;
+ req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_DLM_VERSION,
+ LDLM_ENQUEUE, req_buffers, size, NULL);
if (!req)
RETURN(-ENOMEM);
/* pack the intent */
- lit = lustre_msg_buf(req->rq_reqmsg, 1, sizeof (*lit));
+ lit = lustre_msg_buf(req->rq_reqmsg, MDS_REQ_INTENT_IT_OFF,
+ sizeof (*lit));
lit->opc = (__u64)it->it_op;
/* pack the intended request */
- mdc_unlink_pack(req, 2, data);
+ mdc_unlink_pack(req, MDS_REQ_INTENT_REC_OFF, data);
/* get ready for the reply */
- reply_buffers = 4;
- req->rq_replen = lustre_msg_size(4, repsize);
+ repsize[repbufcnt++] = obddev->u.cli.cl_max_mds_cookiesize;
} else if (it->it_op & (IT_GETATTR | IT_LOOKUP)) {
- obd_valid valid = OBD_MD_FLGETATTR | OBD_MD_FLEASIZE;
- size[2] = sizeof(struct mds_body);
- size[3] = data->namelen + 1;
+ obd_valid valid = OBD_MD_FLGETATTR | OBD_MD_FLEASIZE |
+ OBD_MD_FLACL | OBD_MD_FLMODEASIZE;
+ size[req_buffers++] = sizeof(struct mds_body);
+ size[req_buffers++] = data->namelen + 1;
+
+ if (it->it_op & IT_GETATTR)
+ policy.l_inodebits.bits = MDS_INODELOCK_UPDATE;
- req = ptlrpc_prep_req(class_exp2cliimp(exp), LDLM_ENQUEUE, 4,
- size, NULL);
+ req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_DLM_VERSION,
+ LDLM_ENQUEUE, req_buffers, size, NULL);
if (!req)
RETURN(-ENOMEM);
/* pack the intent */
- lit = lustre_msg_buf(req->rq_reqmsg, 1, sizeof (*lit));
+ lit = lustre_msg_buf(req->rq_reqmsg, MDS_REQ_INTENT_IT_OFF,
+ sizeof (*lit));
lit->opc = (__u64)it->it_op;
/* pack the intended request */
- mdc_getattr_pack(req, valid, 2, it->it_flags, data);
+ mdc_getattr_pack(req, MDS_REQ_INTENT_REC_OFF, valid,
+ it->it_flags, data);
/* get ready for the reply */
- reply_buffers = 3;
- req->rq_replen = lustre_msg_size(3, repsize);
+ repsize[repbufcnt++] = LUSTRE_POSIX_ACL_MAX_SIZE;
} else if (it->it_op == IT_READDIR) {
- req = ptlrpc_prep_req(class_exp2cliimp(exp), LDLM_ENQUEUE, 1,
- size, NULL);
+ policy.l_inodebits.bits = MDS_INODELOCK_UPDATE;
+ req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_DLM_VERSION,
+ LDLM_ENQUEUE, 1, size, NULL);
if (!req)
RETURN(-ENOMEM);
/* get ready for the reply */
- reply_buffers = 1;
- req->rq_replen = lustre_msg_size(1, repsize);
- } else {
+ repbufcnt = 1;
+ } else {
LBUG();
RETURN(-EINVAL);
}
+ /* get ready for the reply */
+ req->rq_replen = lustre_msg_size(repbufcnt, repsize);
+
mdc_get_rpc_lock(obddev->u.cli.cl_rpc_lock, it);
rc = ldlm_cli_enqueue(exp, req, obddev->obd_namespace, res_id,
- lock_type, NULL, lock_mode, &flags, cb_blocking,
+ lock_type, &policy, lock_mode, &flags, cb_blocking,
cb_completion, NULL, cb_data, NULL, 0, NULL,
lockh);
mdc_put_rpc_lock(obddev->u.cli.cl_rpc_lock, it);
/* Similarly, if we're going to replay this request, we don't want to
* actually get a lock, just perform the intent. */
if (req->rq_transno || req->rq_replay) {
- lockreq = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*lockreq));
+ lockreq = lustre_msg_buf(req->rq_reqmsg,
+ MDS_REQ_INTENT_LOCKREQ_OFF,
+ sizeof(*lockreq));
lockreq->lock_flags |= LDLM_FL_INTENT_ONLY;
}
it->it_op,it->d.lustre.it_disposition,it->d.lustre.it_status);
/* We know what to expect, so we do any byte flipping required here */
- LASSERT(reply_buffers == 4 || reply_buffers == 3 || reply_buffers == 1);
- if (reply_buffers >= 3) {
+ LASSERT(repbufcnt == 4 || repbufcnt == 1);
+ if (repbufcnt == 4) {
struct mds_body *body;
body = lustre_swab_repbuf(req, 1, sizeof (*body),
CERROR ("Missing/short eadata\n");
RETURN (-EPROTO);
}
+ if (body->valid & OBD_MD_FLMODEASIZE) {
+ if (obddev->u.cli.cl_max_mds_easize <
+ body->max_mdsize) {
+ obddev->u.cli.cl_max_mds_easize =
+ body->max_mdsize;
+ CDEBUG(D_INFO, "maxeasize become %d\n",
+ body->max_mdsize);
+ }
+ if (obddev->u.cli.cl_max_mds_cookiesize <
+ body->max_cookiesize) {
+ obddev->u.cli.cl_max_mds_cookiesize =
+ body->max_cookiesize;
+ CDEBUG(D_INFO, "cookiesize become %d\n",
+ body->max_cookiesize);
+ }
+ }
/* We save the reply LOV EA in case we have to replay
* a create for recovery. If we didn't allocate a
* large enough request buffer above we need to
* reallocate it here to hold the actual LOV EA. */
if (it->it_op & IT_OPEN) {
- if (req->rq_reqmsg->buflens[4] <
+ int pos = MDS_REQ_INTENT_REC_OFF + 2;
+
+ if (req->rq_reqmsg->buflens[pos] <
body->eadatasize)
mdc_realloc_openmsg(req, body, size);
- lmm = lustre_msg_buf(req->rq_reqmsg, 4,
+ lmm = lustre_msg_buf(req->rq_reqmsg, pos,
body->eadatasize);
if (lmm)
memcpy(lmm, eadata, body->eadatasize);
struct ldlm_res_id res_id = {.name ={op_data->fid2.id,
op_data->fid2.generation}};
struct lustre_handle lockh;
- int mode = LCK_PR;
+ ldlm_policy_data_t policy;
+ int mode = LCK_CR;
+ policy.l_inodebits.bits = (it->it_op == IT_GETATTR) ?
+ MDS_INODELOCK_UPDATE : MDS_INODELOCK_LOOKUP;
rc = ldlm_lock_match(exp->exp_obd->obd_namespace,
LDLM_FL_BLOCK_GRANTED, &res_id,
- LDLM_PLAIN, NULL, LCK_PR, &lockh);
+ LDLM_IBITS, &policy, LCK_CR, &lockh);
+ if (!rc) {
+ mode = LCK_CW;
+ rc = ldlm_lock_match(exp->exp_obd->obd_namespace,
+ LDLM_FL_BLOCK_GRANTED, &res_id,
+ LDLM_IBITS, &policy, LCK_CW, &lockh);
+ }
if (!rc) {
- mode = LCK_PW;
+ mode = LCK_PR;
rc = ldlm_lock_match(exp->exp_obd->obd_namespace,
LDLM_FL_BLOCK_GRANTED, &res_id,
- LDLM_PLAIN, NULL, LCK_PW, &lockh);
+ LDLM_IBITS, &policy, LCK_PR, &lockh);
}
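+ /* Editorial note: the ladder above probes cached modes weakest
+ * first (CR, then CW, then PR); any hit fills lockh and lets the
+ * revalidate below skip a new enqueue RPC. */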
if (rc) {
memcpy(&it->d.lustre.it_lock_handle, &lockh,
* never dropped its reference, so the refcounts are all OK */
if (!it_disposition(it, DISP_ENQ_COMPLETE)) {
- rc = mdc_enqueue(exp, LDLM_PLAIN, it, it_to_lock_mode(it),
+ rc = mdc_enqueue(exp, LDLM_IBITS, it, it_to_lock_mode(it),
op_data, &lockh, lmm, lmmsize,
ldlm_completion_ast, cb_blocking, NULL,
extra_lock_flags);
if (rc < 0)
RETURN(rc);
memcpy(&it->d.lustre.it_lock_handle, &lockh, sizeof(lockh));
+ } else if (!op_data->fid2.id) {
+ /* DISP_ENQ_COMPLETE set means there is extra reference on
+ * request referenced from this intent, saved for subsequent
+ * lookup. This path is executed when we proceed to this
+ * lookup, so we clear DISP_ENQ_COMPLETE */
+ it_clear_disposition(it, DISP_ENQ_COMPLETE);
}
request = *reqp = it->d.lustre.it_data;
LASSERT(request != NULL);
LASSERT(request != LP_POISON);
+ LASSERT(request->rq_repmsg != LP_POISON);
/* If we're doing an IT_OPEN which did not result in an actual
* successful open, then we need to remove the bit which saves
* intent_finish has performed the iget().) */
lock = ldlm_handle2lock(&lockh);
if (lock) {
+ ldlm_policy_data_t policy = lock->l_policy_data;
LDLM_DEBUG(lock, "matching against this");
LDLM_LOCK_PUT(lock);
memcpy(&old_lock, &lockh, sizeof(lockh));
if (ldlm_lock_match(NULL, LDLM_FL_BLOCK_GRANTED, NULL,
- LDLM_PLAIN, NULL, LCK_NL, &old_lock)) {
+ LDLM_IBITS, &policy, LCK_NL, &old_lock)) {
ldlm_lock_decref_and_cancel(&lockh,
it->d.lustre.it_lock_mode);
memcpy(&lockh, &old_lock, sizeof(old_lock));
struct mds_rec_setattr *rec;
struct mdc_rpc_lock *rpc_lock;
struct obd_device *obd = exp->exp_obd;
- int rc, bufcount = 1, size[3] = {sizeof(*rec), ealen, ea2len};
+ int size[] = { sizeof(*rec), ealen, ea2len};
+ int rc, bufcount = 1;
ENTRY;
LASSERT(iattr != NULL);
if (ealen > 0) {
- bufcount = 2;
+ bufcount++;
if (ea2len > 0)
- bufcount = 3;
+ bufcount++;
}
- req = ptlrpc_prep_req(class_exp2cliimp(exp), MDS_REINT, bufcount,
- size, NULL);
+ req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION,
+ MDS_REINT, bufcount, size, NULL);
if (req == NULL)
RETURN(-ENOMEM);
if (iattr->ia_valid & (ATTR_MTIME | ATTR_CTIME))
CDEBUG(D_INODE, "setting mtime %lu, ctime %lu\n",
LTIME_S(iattr->ia_mtime), LTIME_S(iattr->ia_ctime));
- mdc_setattr_pack(req, data, iattr, ea, ealen, ea2, ea2len);
+ mdc_setattr_pack(req, MDS_REQ_REC_OFF, data, iattr, ea, ealen,
+ ea2, ea2len);
size[0] = sizeof(struct mds_body);
req->rq_replen = lustre_msg_size(1, size);
{
struct obd_device *obd = exp->exp_obd;
struct ptlrpc_request *req;
- int rc, size[3] = {sizeof(struct mds_rec_create), op_data->namelen + 1};
- int level, bufcount = 2;
+ int size[] = { sizeof(struct mds_rec_create), op_data->namelen + 1, 0};
+ int rc, level, bufcount = 2;
ENTRY;
if (data && datalen) {
bufcount++;
}
- req = ptlrpc_prep_req(class_exp2cliimp(exp), MDS_REINT, bufcount,
- size, NULL);
+ req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION,
+ MDS_REINT, bufcount, size, NULL);
if (req == NULL)
RETURN(-ENOMEM);
/* mdc_create_pack fills msg->bufs[1] with name
* and msg->bufs[2] with tgt, for symlinks or lov MD data */
- mdc_create_pack(req, 0, op_data, data, datalen, mode,
+ mdc_create_pack(req, MDS_REQ_REC_OFF, op_data, data, datalen, mode,
uid, gid, cap_effective, rdev);
size[0] = sizeof(struct mds_body);
{
struct obd_device *obd = class_exp2obd(exp);
struct ptlrpc_request *req = *request;
- int rc, size[2] = {sizeof(struct mds_rec_unlink), data->namelen + 1};
+ int rc, size[] = { sizeof(struct mds_rec_unlink), data->namelen + 1};
ENTRY;
LASSERT(req == NULL);
- req = ptlrpc_prep_req(class_exp2cliimp(exp), MDS_REINT, 2, size,
- NULL);
+ req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION,
+ MDS_REINT, 2, size, NULL);
if (req == NULL)
RETURN(-ENOMEM);
*request = req;
size[2] = obd->u.cli.cl_max_mds_cookiesize;
req->rq_replen = lustre_msg_size(3, size);
- mdc_unlink_pack(req, 0, data);
+ mdc_unlink_pack(req, MDS_REQ_REC_OFF, data);
rc = mdc_reint(req, obd->u.cli.cl_rpc_lock, LUSTRE_IMP_FULL);
if (rc == -ERESTARTSYS)
{
struct obd_device *obd = exp->exp_obd;
struct ptlrpc_request *req;
- int rc, size[2] = {sizeof(struct mds_rec_link), data->namelen + 1};
+ int rc, size[] = { sizeof(struct mds_rec_link), data->namelen + 1};
ENTRY;
- req = ptlrpc_prep_req(class_exp2cliimp(exp), MDS_REINT, 2, size,
- NULL);
+ req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION,
+ MDS_REINT, 2, size, NULL);
if (req == NULL)
RETURN(-ENOMEM);
- mdc_link_pack(req, 0, data);
+ mdc_link_pack(req, MDS_REQ_REC_OFF, data);
size[0] = sizeof(struct mds_body);
req->rq_replen = lustre_msg_size(1, size);
{
struct obd_device *obd = exp->exp_obd;
struct ptlrpc_request *req;
- int rc, size[3] = {sizeof(struct mds_rec_rename), oldlen + 1,
- newlen + 1};
+ int rc, size[] = { sizeof(struct mds_rec_rename), oldlen + 1, newlen + 1 };
ENTRY;
- req = ptlrpc_prep_req(class_exp2cliimp(exp), MDS_REINT, 3, size,
- NULL);
+ req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION,
+ MDS_REINT, 3, size, NULL);
if (req == NULL)
RETURN(-ENOMEM);
- mdc_rename_pack(req, 0, data, old, oldlen, new, newlen);
+ mdc_rename_pack(req, MDS_REQ_REC_OFF, data, old, oldlen, new, newlen);
size[0] = sizeof(struct mds_body);
size[1] = obd->u.cli.cl_max_mds_easize;
int level, int msg_flags)
{
struct ptlrpc_request *req;
- struct mds_body *body;
- int rc, size = sizeof(*body);
+ int rc, size[] = { [MDS_REQ_REC_OFF] = sizeof(struct mds_body) };
ENTRY;
- req = ptlrpc_prep_req(imp, MDS_GETSTATUS, 1, &size, NULL);
+ req = ptlrpc_prep_req(imp, LUSTRE_MDS_VERSION, MDS_GETSTATUS,
+ 1, size, NULL);
if (!req)
GOTO(out, rc = -ENOMEM);
- body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
req->rq_send_state = level;
- req->rq_replen = lustre_msg_size(1, &size);
+ req->rq_replen = lustre_msg_size(1, size);
- mdc_pack_req_body(req);
+ mdc_pack_req_body(req, MDS_REQ_REC_OFF, 0, NULL, 0);
req->rq_reqmsg->flags |= msg_flags;
rc = ptlrpc_queue_wait(req);
if (!rc) {
- body = lustre_swab_repbuf (req, 0, sizeof (*body),
- lustre_swab_mds_body);
+ struct mds_body *body;
+
+ body = lustre_swab_repbuf(req, 0, sizeof(*body),
+ lustre_swab_mds_body);
if (body == NULL) {
CERROR ("Can't extract mds_body\n");
GOTO (out, rc = -EPROTO);
0);
}
-int mdc_getattr_common(struct obd_export *exp, unsigned int ea_size,
- struct ptlrpc_request *req)
+static
+int mdc_getattr_common(struct obd_export *exp, unsigned int ea_size,
+ unsigned int acl_size, struct ptlrpc_request *req)
{
struct mds_body *body;
void *eadata;
int rc;
- int size[2] = {sizeof(*body), 0};
+ int size[3] = {sizeof(*body)};
int bufcount = 1;
ENTRY;
CDEBUG(D_INODE, "reserved %u bytes for MD/symlink in packet\n",
ea_size);
}
+ if (acl_size) {
+ size[bufcount++] = acl_size;
+ CDEBUG(D_INODE, "reserved %u bytes for ACL\n", acl_size);
+ }
+
req->rq_replen = lustre_msg_size(bufcount, size);
mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
RETURN (-EPROTO);
}
}
+
+ if (body->valid & OBD_MD_FLMODEASIZE) {
+ if (exp->exp_obd->u.cli.cl_max_mds_easize < body->max_mdsize)
+ exp->exp_obd->u.cli.cl_max_mds_easize =
+ body->max_mdsize;
+ if (exp->exp_obd->u.cli.cl_max_mds_cookiesize <
+ body->max_cookiesize)
+ exp->exp_obd->u.cli.cl_max_mds_cookiesize =
+ body->max_cookiesize;
+ }
RETURN (0);
}
struct ptlrpc_request *req;
struct mds_body *body;
int size = sizeof(*body);
- int rc;
+ int acl_size = 0, rc;
ENTRY;
/* XXX do we need to make another request here? We just did a getattr
* to do the lookup in the first place.
*/
- req = ptlrpc_prep_req(class_exp2cliimp(exp), MDS_GETATTR, 1, &size,
- NULL);
+ req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION,
+ MDS_GETATTR, 1, &size, NULL);
if (!req)
GOTO(out, rc = -ENOMEM);
- body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
- memcpy(&body->fid1, fid, sizeof(*fid));
- body->valid = valid;
- body->eadatasize = ea_size;
- mdc_pack_req_body(req);
+ mdc_pack_req_body(req, MDS_REQ_REC_OFF, valid, fid, ea_size);
+
+ /* currently only root inode will call us with FLACL */
+ if (valid & OBD_MD_FLACL)
+ acl_size = LUSTRE_POSIX_ACL_MAX_SIZE;
- rc = mdc_getattr_common(exp, ea_size, req);
+ rc = mdc_getattr_common(exp, ea_size, acl_size, req);
if (rc != 0) {
ptlrpc_req_finished (req);
req = NULL;
}
int mdc_getattr_name(struct obd_export *exp, struct ll_fid *fid,
- char *filename, int namelen, unsigned long valid,
- unsigned int ea_size, struct ptlrpc_request **request)
+ const char *filename, int namelen, unsigned long valid,
+ unsigned int ea_len, struct ptlrpc_request **request)
{
struct ptlrpc_request *req;
- struct mds_body *body;
- int rc, size[2] = {sizeof(*body), namelen};
+ int rc, size[] = { sizeof(struct mds_body), namelen };
ENTRY;
- req = ptlrpc_prep_req(class_exp2cliimp(exp), MDS_GETATTR_NAME, 2,
- size, NULL);
+ req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION,
+ MDS_GETATTR_NAME, 2, size, NULL);
if (!req)
GOTO(out, rc = -ENOMEM);
- body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
- memcpy(&body->fid1, fid, sizeof(*fid));
- body->valid = valid;
- body->eadatasize = ea_size;
- mdc_pack_req_body(req);
-
+ mdc_pack_req_body(req, MDS_REQ_REC_OFF, valid, fid, ea_len);
+
LASSERT (strnlen (filename, namelen) == namelen - 1);
memcpy(lustre_msg_buf(req->rq_reqmsg, 1, namelen), filename, namelen);
- rc = mdc_getattr_common(exp, ea_size, req);
+ rc = mdc_getattr_common(exp, ea_len, 0, req);
if (rc != 0) {
ptlrpc_req_finished (req);
req = NULL;
size[bufcnt++] = input_size;
}
- req = ptlrpc_prep_req(class_exp2cliimp(exp), opcode,
+ req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION, opcode,
bufcnt, size, NULL);
if (!req)
GOTO(out, rc = -ENOMEM);
/* request data */
- body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
- memcpy(&body->fid1, fid, sizeof(*fid));
- body->valid = valid;
- body->eadatasize = output_size;
+ body = lustre_msg_buf(req->rq_reqmsg, MDS_REQ_REC_OFF, sizeof(*body));
+ mdc_pack_req_body(req, MDS_REQ_REC_OFF, valid, fid, output_size);
body->flags = flags;
- mdc_pack_req_body(req);
if (xattr_name) {
tmp = lustre_msg_buf(req->rq_reqmsg, 1, xattr_namelen);
void mdc_store_inode_generation(struct ptlrpc_request *req, int reqoff,
int repoff)
{
- struct mds_rec_create *rec =
- lustre_msg_buf(req->rq_reqmsg, reqoff, sizeof(*rec));
- struct mds_body *body =
- lustre_msg_buf(req->rq_repmsg, repoff, sizeof(*body));
+ struct mds_rec_create *rec = lustre_msg_buf(req->rq_reqmsg, reqoff,
+ sizeof(*rec));
+ struct mds_body *body = lustre_msg_buf(req->rq_repmsg, repoff,
+ sizeof(*body));
LASSERT (rec != NULL);
LASSERT (body != NULL);
rec->cr_replayfid.generation, rec->cr_replayfid.id);
}
+#ifdef CONFIG_FS_POSIX_ACL
+static
+int mdc_unpack_acl(struct obd_export *exp, struct ptlrpc_request *req,
+ struct lustre_md *md, unsigned int offset)
+{
+ struct mds_body *body = md->body;
+ struct posix_acl *acl;
+ void *buf;
+ int rc;
+
+ if (!body->aclsize)
+ return 0;
+
+ buf = lustre_msg_buf(req->rq_repmsg, offset, body->aclsize);
+ if (!buf) {
+ CERROR("aclsize %u, bufcount %u, bufsize %u\n",
+ body->aclsize, req->rq_repmsg->bufcount,
+ (req->rq_repmsg->bufcount <= offset) ? -1 :
+ req->rq_repmsg->buflens[offset]);
+ return -EPROTO;
+ }
+
+ acl = posix_acl_from_xattr(buf, body->aclsize);
+ if (IS_ERR(acl)) {
+ rc = PTR_ERR(acl);
+ CERROR("convert xattr to acl: %d\n", rc);
+ return rc;
+ }
+
+ rc = posix_acl_valid(acl);
+ if (rc) {
+ CERROR("validate acl: %d\n", rc);
+ posix_acl_release(acl);
+ return rc;
+ }
+
+ md->posix_acl = acl;
+ return 0;
+}
+#else
+#define mdc_unpack_acl(exp, req, md, offset) 0
+#endif
+
int mdc_req2lustre_md(struct ptlrpc_request *req, int offset,
struct obd_export *exp,
struct lustre_md *md)
md->body = lustre_msg_buf(req->rq_repmsg, offset, sizeof (*md->body));
LASSERT (md->body != NULL);
LASSERT_REPSWABBED (req, offset);
+ offset++;
if (md->body->valid & OBD_MD_FLEASIZE) {
int lmmsize;
RETURN(-EPROTO);
}
lmmsize = md->body->eadatasize;
- lmm = lustre_msg_buf(req->rq_repmsg, offset + 1, lmmsize);
+ lmm = lustre_msg_buf(req->rq_repmsg, offset, lmmsize);
LASSERT (lmm != NULL);
- LASSERT_REPSWABBED (req, offset + 1);
+ LASSERT_REPSWABBED (req, offset);
rc = obd_unpackmd(exp, &md->lsm, lmm, lmmsize);
- if (rc >= 0) {
- LASSERT (rc >= sizeof (*md->lsm));
- rc = 0;
- }
+ if (rc < 0)
+ RETURN(rc);
+
+ LASSERT (rc >= sizeof (*md->lsm));
+ rc = 0;
+
+ offset++;
}
+
+ /* For ACL, it's possible that FLACL is set but aclsize is zero:
+ * only when aclsize != 0 is there an actual segment for the ACL
+ * in the reply buffer.
+ */
+ if ((md->body->valid & OBD_MD_FLACL) && md->body->aclsize) {
+ rc = mdc_unpack_acl(exp, req, md, offset);
+ if (rc)
+ GOTO(err_out, rc);
+ offset++;
+ }
+out:
RETURN(rc);
+
+err_out:
+ if (md->lsm)
+ obd_free_memmd(exp, &md->lsm);
+ goto out;
+}
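+
+/* Editorial summary of the reply layout walked above; offsets advance only
+ * for segments that are actually present:
+ * [offset] mds_body
+ * [offset+1] LOV EA, when OBD_MD_FLEASIZE is set
+ * [next] packed ACL, when OBD_MD_FLACL is set and aclsize != 0 */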
+
+void mdc_free_lustre_md(struct obd_export *exp, struct lustre_md *md)
+{
+ if (md->lsm)
+ obd_free_memmd(exp, &md->lsm);
+
+#ifdef CONFIG_FS_POSIX_ACL
+ if (md->posix_acl) {
+ posix_acl_release(md->posix_acl);
+ md->posix_acl = NULL;
+ }
+#endif
}
static void mdc_commit_open(struct ptlrpc_request *req)
if (close_req != NULL) {
struct mds_body *close_body;
LASSERT(close_req->rq_reqmsg->opc == MDS_CLOSE);
- close_body = lustre_msg_buf(close_req->rq_reqmsg, 0,
+ close_body = lustre_msg_buf(close_req->rq_reqmsg,
+ MDS_REQ_REC_OFF,
sizeof(*close_body));
if (och != NULL)
LASSERT(!memcmp(&old, &close_body->handle, sizeof old));
struct ptlrpc_request *open_req)
{
struct mdc_open_data *mod;
- struct mds_rec_create *rec =
- lustre_msg_buf(open_req->rq_reqmsg, 2, sizeof(*rec));
- struct mds_body *body =
- lustre_msg_buf(open_req->rq_repmsg, 1, sizeof(*body));
+ struct mds_rec_create *rec = lustre_msg_buf(open_req->rq_reqmsg,
+ MDS_REQ_INTENT_REC_OFF,
+ sizeof(*rec));
+ struct mds_body *body = lustre_msg_buf(open_req->rq_repmsg, 1,
+ sizeof(*body));
- LASSERT(rec != NULL);
- /* outgoing messages always in my byte order */
LASSERT(body != NULL);
/* incoming message in my byte order (it's been swabbed) */
+ LASSERT(rec != NULL);
+ /* outgoing messages always in my byte order */
LASSERT_REPSWABBED(open_req, 1);
OBD_ALLOC(mod, sizeof(*mod));
struct obd_client_handle *och, struct ptlrpc_request **request)
{
struct obd_device *obd = class_exp2obd(exp);
- int reqsize = sizeof(struct mds_body);
- int rc, repsize[3] = {sizeof(struct mds_body),
+ int size[] = { sizeof(struct mds_body) };
+ int rc, repsize[] = { sizeof(struct mds_body),
obd->u.cli.cl_max_mds_easize,
obd->u.cli.cl_max_mds_cookiesize};
struct ptlrpc_request *req;
struct l_wait_info lwi;
ENTRY;
- req = ptlrpc_prep_req(class_exp2cliimp(exp), MDS_CLOSE, 1, &reqsize,
- NULL);
+ req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION,
+ MDS_CLOSE, 1, size, NULL);
if (req == NULL)
GOTO(out, rc = -ENOMEM);
CDEBUG(D_HA, "couldn't find open req; expecting close error\n");
}
- mdc_close_pack(req, 0, oa, oa->o_valid, och);
+ mdc_close_pack(req, MDS_REQ_REC_OFF, oa, oa->o_valid, och);
req->rq_replen = lustre_msg_size(3, repsize);
req->rq_commit_cb = mdc_commit_close;
LASSERT(req->rq_cb_data == NULL);
req->rq_cb_data = mod;
+ CDEBUG(D_HA, "close req->rep_len %d mdsize %d cookiesize %d\n",
+ req->rq_replen,
+ obd->u.cli.cl_max_mds_easize, obd->u.cli.cl_max_mds_cookiesize);
+
/* We hand a ref to the rpcd here, so we need another one of our own. */
ptlrpc_request_addref(req);
{
struct ptlrpc_request *req;
struct mds_body *body;
- int rc, size = sizeof(*body);
+ int rc, size[] = { [MDS_REQ_REC_OFF] = sizeof(*body) };
ENTRY;
- req = ptlrpc_prep_req(class_exp2cliimp(exp), MDS_DONE_WRITING, 1,
- &size, NULL);
+ req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION,
+ MDS_DONE_WRITING, 1, size, NULL);
if (req == NULL)
RETURN(-ENOMEM);
- body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof(*body));
+ body = lustre_msg_buf(req->rq_reqmsg, MDS_REQ_REC_OFF, sizeof(*body));
mdc_pack_fid(&body->fid1, obdo->o_id, 0, obdo->o_mode);
body->size = obdo->o_size;
body->blocks = obdo->o_blocks;
body->valid = obdo->o_valid;
// memcpy(&body->handle, &och->och_fh, sizeof(body->handle));
- req->rq_replen = lustre_msg_size(1, &size);
+ req->rq_replen = lustre_msg_size(1, size);
rc = ptlrpc_queue_wait(req);
ptlrpc_req_finished(req);
RETURN(rc);
}
-int mdc_readpage(struct obd_export *exp, struct ll_fid *mdc_fid, __u64 offset,
+int mdc_readpage(struct obd_export *exp, struct ll_fid *fid, __u64 offset,
struct page *page, struct ptlrpc_request **request)
{
struct obd_import *imp = class_exp2cliimp(exp);
struct ptlrpc_request *req = NULL;
struct ptlrpc_bulk_desc *desc = NULL;
struct mds_body *body;
- int rc, size = sizeof(*body);
+ int rc, size[] = { sizeof(*body) };
ENTRY;
- CDEBUG(D_INODE, "inode: %ld\n", (long)mdc_fid->id);
+ CDEBUG(D_INODE, "inode: "LPU64"\n", fid->id);
- req = ptlrpc_prep_req(imp, MDS_READPAGE, 1, &size, NULL);
+ req = ptlrpc_prep_req(imp, LUSTRE_MDS_VERSION, MDS_READPAGE,
+ 1, size, NULL);
if (req == NULL)
GOTO(out, rc = -ENOMEM);
+
/* XXX FIXME bug 249 */
req->rq_request_portal = MDS_READPAGE_PORTAL;
ptlrpc_prep_bulk_page(desc, page, 0, PAGE_CACHE_SIZE);
- mdc_readdir_pack(req, offset, PAGE_CACHE_SIZE, mdc_fid);
+ mdc_readdir_pack(req, MDS_REQ_REC_OFF, offset, PAGE_CACHE_SIZE, fid);
- req->rq_replen = lustre_msg_size(1, &size);
+ req->rq_replen = lustre_msg_size(1, size);
rc = ptlrpc_queue_wait(req);
if (rc == 0) {
return rc;
}
+
static int mdc_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
void *karg, void *uarg)
{
}
#endif
case OBD_IOC_POLL_QUOTACHECK:
- rc = mdc_poll_quotacheck(exp, (struct if_quotacheck *)karg);
+ rc = lquota_poll_check(quota_interface, exp,
+ (struct if_quotacheck *)karg);
GOTO(out, rc);
default:
CERROR("mdc_ioctl(): unrecognised ioctl %#x\n", cmd);
struct obd_import *imp = class_exp2cliimp(exp);
int rc = -EINVAL;
- if (keylen == strlen("initial_recov") &&
- memcmp(key, "initial_recov", strlen("initial_recov")) == 0) {
+ if (KEY_IS("initial_recov")) {
if (vallen != sizeof(int))
RETURN(-EINVAL);
imp->imp_initial_recov = *(int *)val;
exp->exp_obd->obd_name, imp->imp_initial_recov);
RETURN(0);
}
- if (keylen == strlen("read-only") &&
- memcmp(key, "read-only", strlen("read-only")) == 0) {
+ /* Turn off initial_recov after we try all backup servers once */
+ if (KEY_IS("init_recov_bk")) {
+ if (vallen != sizeof(int))
+ RETURN(-EINVAL);
+ imp->imp_initial_recov_bk = *(int *)val;
+ if (imp->imp_initial_recov_bk)
+ imp->imp_initial_recov = 1;
+ CDEBUG(D_HA, "%s: set imp_initial_recov_bk = %d\n",
+ exp->exp_obd->obd_name, imp->imp_initial_recov_bk);
+ RETURN(0);
+ }
+ if (KEY_IS("read-only")) {
struct ptlrpc_request *req;
int size[2] = {keylen, vallen};
char *bufs[2] = {key, val};
~OBD_CONNECT_RDONLY;
}
- req = ptlrpc_prep_req(imp, MDS_SET_INFO, 2, size, bufs);
+ req = ptlrpc_prep_req(imp, LUSTRE_MDS_VERSION,
+ MDS_SET_INFO, 2, size, bufs);
if (req == NULL)
RETURN(-ENOMEM);
RETURN(rc);
}
+int mdc_get_info(struct obd_export *exp, __u32 keylen, void *key,
+ __u32 *vallen, void *val)
+{
+ int rc = -EINVAL;
+
+ if (keylen == strlen("max_easize") &&
+ memcmp(key, "max_easize", strlen("max_easize")) == 0) {
+ int mdsize, *max_easize;
+
+ if (*vallen != sizeof(int))
+ RETURN(-EINVAL);
+ mdsize = *(int*)val;
+ if (mdsize > exp->exp_obd->u.cli.cl_max_mds_easize)
+ exp->exp_obd->u.cli.cl_max_mds_easize = mdsize;
+ max_easize = val;
+ *max_easize = exp->exp_obd->u.cli.cl_max_mds_easize;
+ RETURN(0);
+ }
+ RETURN(rc);
+}
+
static int mdc_statfs(struct obd_device *obd, struct obd_statfs *osfs,
unsigned long max_age)
{
* during mount that would help a bit). Having relative timestamps
* is not so great if request processing is slow, while absolute
* timestamps are not ideal because they need time synchronization. */
- req = ptlrpc_prep_req(obd->u.cli.cl_import, MDS_STATFS, 0, NULL, NULL);
+ req = ptlrpc_prep_req(obd->u.cli.cl_import, LUSTRE_MDS_VERSION,
+ MDS_STATFS, 0, NULL, NULL);
if (!req)
RETURN(-ENOMEM);
{
struct ptlrpc_request *req;
struct mds_body *body;
- int rc, size = sizeof(*body);
+ int rc, size[] = { [MDS_REQ_REC_OFF] = sizeof(struct mds_body) };
ENTRY;
- req = ptlrpc_prep_req(class_exp2cliimp(exp), MDS_PIN, 1, &size, NULL);
+ req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION,
+ MDS_PIN, 1, size, NULL);
if (req == NULL)
RETURN(-ENOMEM);
- body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
+ body = lustre_msg_buf(req->rq_reqmsg, MDS_REQ_REC_OFF, sizeof (*body));
mdc_pack_fid(&body->fid1, ino, gen, type);
body->flags = flag;
- req->rq_replen = lustre_msg_size(1, &size);
+ req->rq_replen = lustre_msg_size(1, size);
mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
rc = ptlrpc_queue_wait(req);
{
struct ptlrpc_request *req;
struct mds_body *body;
- int rc, size = sizeof(*body);
+ int rc, size[] = { [MDS_REQ_REC_OFF] = sizeof(struct mds_body) };
ENTRY;
if (handle->och_magic != OBD_CLIENT_HANDLE_MAGIC)
RETURN(0);
- req = ptlrpc_prep_req(class_exp2cliimp(exp), MDS_CLOSE, 1, &size, NULL);
+ req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION,
+ MDS_CLOSE, 1, size, NULL);
if (req == NULL)
RETURN(-ENOMEM);
- body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof(*body));
+ body = lustre_msg_buf(req->rq_reqmsg, MDS_REQ_REC_OFF, sizeof(*body));
memcpy(&body->handle, &handle->och_fh, sizeof(body->handle));
body->flags = flag;
struct ptlrpc_request **request)
{
struct ptlrpc_request *req;
- struct mds_body *body;
- int size = sizeof(*body);
- int rc;
+ int rc, size[] = { [MDS_REQ_REC_OFF] = sizeof(struct mds_body) };
ENTRY;
- req = ptlrpc_prep_req(class_exp2cliimp(exp), MDS_SYNC, 1,&size,NULL);
+ req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION,
+ MDS_SYNC, 1, size, NULL);
if (!req)
RETURN(rc = -ENOMEM);
- if (fid) {
- body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
- memcpy(&body->fid1, fid, sizeof(*fid));
- mdc_pack_req_body(req);
- }
+ mdc_pack_req_body(req, MDS_REQ_REC_OFF, 0, fid, 0);
- req->rq_replen = lustre_msg_size(1, &size);
+ req->rq_replen = lustre_msg_size(1, size);
rc = ptlrpc_queue_wait(req);
if (rc || request == NULL)
struct lov_stripe_md lsm = { .lsm_magic = LOV_MAGIC };
struct lov_desc desc;
__u32 valsize = sizeof(desc);
+ __u32 stripes;
int rc, size;
ENTRY;
- size = obd_size_diskmd(lov_exp, NULL);
- if (cli->cl_max_mds_easize < size)
- cli->cl_max_mds_easize = size;
rc = obd_get_info(lov_exp, strlen(KEY_LOVDESC) + 1, KEY_LOVDESC,
&valsize, &desc);
if (rc)
RETURN(rc);
+ stripes = min(desc.ld_tgt_count, (__u32)LOV_MAX_STRIPE_COUNT);
+ lsm.lsm_stripe_count = stripes;
+ size = obd_size_diskmd(lov_exp, &lsm);
+
+ if (cli->cl_max_mds_easize < size)
+ cli->cl_max_mds_easize = size;
+
lsm.lsm_stripe_count = desc.ld_default_stripe_count;
size = obd_size_diskmd(lov_exp, &lsm);
if (cli->cl_default_mds_easize < size)
cli->cl_default_mds_easize = size;
- size = desc.ld_tgt_count * sizeof(struct llog_cookie);
+ size = stripes * sizeof(struct llog_cookie);
if (cli->cl_max_mds_cookiesize < size)
cli->cl_max_mds_cookiesize = size;
+ CDEBUG(D_HA, "updating max_mdsize/max_cookiesize: %d/%d\n",
+ cli->cl_max_mds_easize, cli->cl_max_mds_cookiesize);
+
RETURN(0);
}
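+
+/* Editorial sizing example (numbers assumed, not from the patch): with
+ * ld_tgt_count = 32 OSTs and LOV_MAX_STRIPE_COUNT >= 32, stripes = 32, so
+ * cl_max_mds_easize is sized for a 32-stripe lov_mds_md while
+ * cl_max_mds_cookiesize covers 32 * sizeof(struct llog_cookie) -- one
+ * unlink llog cookie per stripe. */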
ctxt->loc_imp = obd->u.cli.cl_import;
}
+ rc = llog_setup(obd, LLOG_LOVEA_REPL_CTXT, tgt, 0, NULL,
+ &llog_client_ops);
+ if (rc == 0) {
+ ctxt = llog_get_context(obd, LLOG_LOVEA_REPL_CTXT);
+ ctxt->loc_imp = obd->u.cli.cl_import;
+ }
+
RETURN(rc);
}
int rc;
ENTRY;
+ rc = llog_cleanup(llog_get_context(obd, LLOG_LOVEA_REPL_CTXT));
+ if (rc) {
+ CERROR("can not cleanup LLOG_CONFIG_REPL_CTXT rc %d\n", rc);
+ }
rc = llog_cleanup(llog_get_context(obd, LLOG_CONFIG_REPL_CTXT));
RETURN(rc);
}
.o_disconnect = client_disconnect_export,
.o_iocontrol = mdc_iocontrol,
.o_set_info = mdc_set_info,
+ .o_get_info = mdc_get_info,
.o_statfs = mdc_statfs,
.o_pin = mdc_pin,
.o_unpin = mdc_unpin,
.o_import_event = mdc_import_event,
.o_llog_init = mdc_llog_init,
.o_llog_finish = mdc_llog_finish,
- .o_quotacheck = mdc_quotacheck,
- .o_quotactl = mdc_quotactl,
};
+static quota_interface_t *quota_interface;
+extern quota_interface_t mdc_quota_interface;
+
int __init mdc_init(void)
{
+ int rc;
struct lprocfs_static_vars lvars;
lprocfs_init_vars(mdc, &lvars);
- return class_register_type(&mdc_obd_ops, lvars.module_vars,
- LUSTRE_MDC_NAME);
+
+ quota_interface = PORTAL_SYMBOL_GET(mdc_quota_interface);
+ init_obd_quota_ops(quota_interface, &mdc_obd_ops);
+
+ rc = class_register_type(&mdc_obd_ops, lvars.module_vars,
+ LUSTRE_MDC_NAME);
+ if (rc && quota_interface)
+ PORTAL_SYMBOL_PUT(mdc_quota_interface);
+
+ RETURN(rc);
}
#ifdef __KERNEL__
static void /*__exit*/ mdc_exit(void)
{
+ if (quota_interface)
+ PORTAL_SYMBOL_PUT(mdc_quota_interface);
+
class_unregister_type(LUSTRE_MDC_NAME);
}
MODULE_LICENSE("GPL");
EXPORT_SYMBOL(mdc_req2lustre_md);
+EXPORT_SYMBOL(mdc_free_lustre_md);
EXPORT_SYMBOL(mdc_change_cbdata);
EXPORT_SYMBOL(mdc_getstatus);
EXPORT_SYMBOL(mdc_getattr);
MODULES := mds
mds-objs := mds_log.o mds_unlink_open.o mds_lov.o handler.o mds_reint.o
-mds-objs += mds_fs.o lproc_mds.o mds_open.o mds_lib.o mds_xattr.o
-
-ifeq ($(PATCHLEVEL),6)
-#mds-objs += quota_master.o
-endif
+mds-objs += mds_fs.o lproc_mds.o mds_open.o mds_lib.o mds_xattr.o mds_join.o
@INCLUDE_RULES@
struct dentry *mds_fid2locked_dentry(struct obd_device *obd, struct ll_fid *fid,
struct vfsmount **mnt, int lock_mode,
struct lustre_handle *lockh,
- char *name, int namelen)
+ char *name, int namelen, __u64 lockpart)
{
struct mds_obd *mds = &obd->u.mds;
struct dentry *de = mds_fid2dentry(mds, fid, mnt), *retval = de;
struct ldlm_res_id res_id = { .name = {0} };
int flags = 0, rc;
+ ldlm_policy_data_t policy = { .l_inodebits = { lockpart } };
ENTRY;
if (IS_ERR(de))
res_id.name[0] = de->d_inode->i_ino;
res_id.name[1] = de->d_inode->i_generation;
rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, res_id,
- LDLM_PLAIN, NULL, lock_mode, &flags,
+ LDLM_IBITS, &policy, lock_mode, &flags,
ldlm_blocking_ast, ldlm_completion_ast,
NULL, NULL, NULL, 0, NULL, lockh);
if (rc != ELDLM_OK) {
snprintf(fid_name, sizeof(fid_name), "0x%lx", ino);
CDEBUG(D_DENTRY, "--> mds_fid2dentry: ino/gen %lu/%u, sb %p\n",
- ino, generation, mds->mds_sb);
+ ino, generation, mds->mds_obt.obt_sb);
/* under ext3 this is neither supposed to return bad inodes
nor NULL inodes. */
RETURN(result);
}
+static int mds_connect_internal(struct obd_export *exp,
+ struct obd_connect_data *data)
+{
+ struct obd_device *obd = exp->exp_obd;
+ if (data != NULL) {
+ data->ocd_connect_flags &= MDS_CONNECT_SUPPORTED;
+ data->ocd_ibits_known &= MDS_INODELOCK_FULL;
+
+ /* If no bits are known (which should not normally happen, as
+ every client should support at least the LOOKUP and UPDATE
+ bits), revert to compat mode with plain locks. */
+ if (!data->ocd_ibits_known &&
+ data->ocd_connect_flags & OBD_CONNECT_IBITS)
+ data->ocd_connect_flags &= ~OBD_CONNECT_IBITS;
+
+ if (!obd->u.mds.mds_fl_acl)
+ data->ocd_connect_flags &= ~OBD_CONNECT_ACL;
+
+ if (!obd->u.mds.mds_fl_user_xattr)
+ data->ocd_connect_flags &= ~OBD_CONNECT_XATTR;
+
+ exp->exp_connect_flags = data->ocd_connect_flags;
+ data->ocd_version = LUSTRE_VERSION_CODE;
+ exp->exp_mds_data.med_ibits_known = data->ocd_ibits_known;
+ }
+
+ if (obd->u.mds.mds_fl_acl &&
+ ((exp->exp_connect_flags & OBD_CONNECT_ACL) == 0)) {
+ CWARN("%s: MDS requires ACL support but client does not\n",
+ obd->obd_name);
+ return -EBADE;
+ }
+ return 0;
+}
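+
+/* Editorial example of the negotiation above (flags as used in this
+ * patch): if a client proposes OBD_CONNECT_IBITS | OBD_CONNECT_ACL and
+ * the MDS was mounted without "acl", the ACL bit is masked off, the
+ * reduced set is stored in exp->exp_connect_flags and echoed back through
+ * *data; a client advertising OBD_CONNECT_IBITS with no known bits falls
+ * back to plain locks. */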
+
+static int mds_reconnect(struct obd_export *exp, struct obd_device *obd,
+ struct obd_uuid *cluuid,
+ struct obd_connect_data *data)
+{
+ int rc;
+ ENTRY;
+
+ if (exp == NULL || obd == NULL || cluuid == NULL)
+ RETURN(-EINVAL);
+
+ rc = mds_connect_internal(exp, data);
+
+ RETURN(rc);
+}
/* Establish a connection to the MDS.
*
{
struct obd_export *exp;
struct mds_export_data *med;
- struct mds_client_data *mcd;
+ struct mds_client_data *mcd = NULL;
int rc, abort_recovery;
ENTRY;
LASSERT(exp);
med = &exp->exp_mds_data;
- if (data != NULL) {
- data->ocd_connect_flags &= MDS_CONNECT_SUPPORTED;
- exp->exp_connect_flags = data->ocd_connect_flags;
- data->ocd_version = LUSTRE_VERSION_CODE;
- }
+ rc = mds_connect_internal(exp, data);
+ if (rc)
+ GOTO(out, rc);
OBD_ALLOC(mcd, sizeof(*mcd));
- if (!mcd) {
- CERROR("mds: out of memory for client data\n");
+ if (!mcd)
GOTO(out, rc = -ENOMEM);
- }
memcpy(mcd->mcd_uuid, cluuid, sizeof(mcd->mcd_uuid));
med->med_mcd = mcd;
/* child orphan sem protects orphan_dec_test and
* is_orphan race, mds_mfd_close drops it */
MDS_DOWN_WRITE_ORPHAN_SEM(dentry->d_inode);
- rc = mds_mfd_close(NULL, obd, mfd,
+ rc = mds_mfd_close(NULL, MDS_REQ_REC_OFF, obd, mfd,
!(export->exp_flags & OBD_OPT_FAILOVER));
if (rc)
} else if (rc > 0) {
*size = rc;
}
+ } else {
+ *size = 0;
}
if (lock)
up(&inode->i_sem);
RETURN(rc);
}
+#ifdef CONFIG_FS_POSIX_ACL
+static
+int mds_pack_posix_acl(struct inode *inode, struct lustre_msg *repmsg,
+ struct mds_body *repbody, int repoff)
+{
+ struct dentry de = { .d_inode = inode };
+ int buflen, rc;
+ ENTRY;
+
+ LASSERT(repbody->aclsize == 0);
+ LASSERT(repmsg->bufcount > repoff);
+
+ buflen = lustre_msg_buflen(repmsg, repoff);
+ if (!buflen)
+ GOTO(out, 0);
+
+ if (!inode->i_op || !inode->i_op->getxattr)
+ GOTO(out, 0);
+
+ lock_24kernel();
+ rc = inode->i_op->getxattr(&de, XATTR_NAME_ACL_ACCESS,
+ lustre_msg_buf(repmsg, repoff, buflen),
+ buflen);
+ unlock_24kernel();
+
+ if (rc >= 0)
+ repbody->aclsize = rc;
+ else if (rc != -ENODATA) {
+ CERROR("buflen %d, get acl: %d\n", buflen, rc);
+ RETURN(rc);
+ }
+ EXIT;
+out:
+ repbody->valid |= OBD_MD_FLACL;
+ return 0;
+}
+#else
+#define mds_pack_posix_acl(inode, repmsg, repbody, repoff) 0
+#endif
+
+int mds_pack_acl(struct mds_export_data *med, struct inode *inode,
+ struct lustre_msg *repmsg, struct mds_body *repbody,
+ int repoff)
+{
+ return mds_pack_posix_acl(inode, repmsg, repbody, repoff);
+}
+
static int mds_getattr_internal(struct obd_device *obd, struct dentry *dentry,
struct ptlrpc_request *req,
struct mds_body *reqbody, int reply_off)
mds_pack_inode2fid(&body->fid1, inode);
mds_pack_inode2body(body, inode);
+ reply_off++;
if ((S_ISREG(inode->i_mode) && (reqbody->valid & OBD_MD_FLEASIZE)) ||
(S_ISDIR(inode->i_mode) && (reqbody->valid & OBD_MD_FLDIREA))) {
- rc = mds_pack_md(obd, req->rq_repmsg, reply_off + 1, body,
+ rc = mds_pack_md(obd, req->rq_repmsg, reply_off, body,
inode, 1);
/* If we have LOV EA data, the OST holds size, atime, mtime */
!(body->valid & OBD_MD_FLDIREA))
body->valid |= (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
OBD_MD_FLATIME | OBD_MD_FLMTIME);
+
+ lustre_shrink_reply(req, reply_off, body->eadatasize, 0);
+ if (body->eadatasize)
+ reply_off++;
} else if (S_ISLNK(inode->i_mode) &&
(reqbody->valid & OBD_MD_LINKNAME) != 0) {
- char *symname = lustre_msg_buf(req->rq_repmsg, reply_off + 1,0);
+ char *symname = lustre_msg_buf(req->rq_repmsg, reply_off, 0);
int len;
LASSERT (symname != NULL); /* caller prepped reply */
- len = req->rq_repmsg->buflens[reply_off + 1];
+ len = req->rq_repmsg->buflens[reply_off];
rc = inode->i_op->readlink(dentry, symname, len);
if (rc < 0) {
symname[rc] = 0; /* NULL terminate */
rc = 0;
}
+ reply_off++;
}
+ if (reqbody->valid & OBD_MD_FLMODEASIZE) {
+ struct mds_obd *mds = mds_req2mds(req);
+ body->max_cookiesize = mds->mds_max_cookiesize;
+ body->max_mdsize = mds->mds_max_mdsize;
+ body->valid |= OBD_MD_FLMODEASIZE;
+ }
+
+ if (rc)
+ RETURN(rc);
+
+#ifdef CONFIG_FS_POSIX_ACL
+ if ((req->rq_export->exp_connect_flags & OBD_CONNECT_ACL) &&
+ (reqbody->valid & OBD_MD_FLACL)) {
+ rc = mds_pack_acl(&req->rq_export->exp_mds_data,
+ inode, req->rq_repmsg,
+ body, reply_off);
+
+ lustre_shrink_reply(req, reply_off, body->aclsize, 0);
+ if (body->aclsize)
+ reply_off++;
+ }
+#endif
+
RETURN(rc);
}
{
struct mds_obd *mds = mds_req2mds(req);
struct mds_body *body;
- int rc = 0, size[2] = {sizeof(*body)}, bufcount = 1;
+ int rc, size[2] = {sizeof(*body)}, bufcount = 1;
ENTRY;
body = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*body));
if ((S_ISREG(inode->i_mode) && (body->valid & OBD_MD_FLEASIZE)) ||
(S_ISDIR(inode->i_mode) && (body->valid & OBD_MD_FLDIREA))) {
- int ret;
down(&inode->i_sem);
- ret = fsfilt_get_md(req->rq_export->exp_obd, inode, NULL, 0);
+ rc = fsfilt_get_md(req->rq_export->exp_obd, inode, NULL, 0);
up(&inode->i_sem);
CDEBUG(D_INODE, "got %d bytes MD data for inode %lu\n",
rc, inode->i_ino);
- if (ret < 0) {
- if (ret != -ENODATA) {
+ if (rc < 0) {
+ if (rc != -ENODATA) {
CERROR("error getting inode %lu MD: rc = %d\n",
- inode->i_ino, ret);
- /* should we return ret in req->rq_status? */
+ inode->i_ino, rc);
+ RETURN(rc);
}
size[bufcount] = 0;
- } else if (ret > mds->mds_max_mdsize) {
+ } else if (rc > mds->mds_max_mdsize) {
size[bufcount] = 0;
CERROR("MD size %d larger than maximum possible %u\n",
- ret, mds->mds_max_mdsize);
+ rc, mds->mds_max_mdsize);
} else {
- size[bufcount] = ret;
+ size[bufcount] = rc;
}
bufcount++;
} else if (S_ISLNK(inode->i_mode) && (body->valid & OBD_MD_LINKNAME)) {
inode->i_size + 1, body->eadatasize);
}
+#ifdef CONFIG_FS_POSIX_ACL
+ if ((req->rq_export->exp_connect_flags & OBD_CONNECT_ACL) &&
+ (body->valid & OBD_MD_FLACL)) {
+ struct dentry de = { .d_inode = inode };
+
+ size[bufcount] = 0;
+ if (inode->i_op && inode->i_op->getxattr) {
+ lock_24kernel();
+ rc = inode->i_op->getxattr(&de, XATTR_NAME_ACL_ACCESS,
+ NULL, 0);
+ unlock_24kernel();
+
+ if (rc < 0) {
+ if (rc != -ENODATA) {
+ CERROR("got acl size: %d\n", rc);
+ RETURN(rc);
+ }
+ } else
+ size[bufcount] = rc;
+ }
+ bufcount++;
+ }
+#endif
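+
+ /* Editorial note: the NULL-buffer getxattr() probe above follows
+ * the standard xattr contract -- a NULL value buffer of size 0
+ * returns the byte count the value needs. That count reserves the
+ * reply segment here; mds_pack_posix_acl() fills it in later. */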
+
if (OBD_FAIL_CHECK(OBD_FAIL_MDS_GETATTR_PACK)) {
CERROR("failed MDS_GETATTR_PACK test\n");
req->rq_status = -ENOMEM;
- GOTO(out, rc = -ENOMEM);
+ RETURN(-ENOMEM);
}
rc = lustre_pack_reply(req, bufcount, size, NULL);
if (rc) {
CERROR("lustre_pack_reply failed: rc %d\n", rc);
- GOTO(out, req->rq_status = rc);
+ req->rq_status = rc;
+ RETURN(rc);
}
- EXIT;
- out:
- return(rc);
+ RETURN(0);
}
static int mds_getattr_name(int offset, struct ptlrpc_request *req,
- struct lustre_handle *child_lockh)
+ int child_part, struct lustre_handle *child_lockh)
{
struct obd_device *obd = req->rq_export->exp_obd;
struct mds_obd *mds = &obd->u.mds;
if (rc)
GOTO(cleanup, rc);
- LASSERT (offset == 0 || offset == 2);
+ LASSERT (offset == MDS_REQ_REC_OFF || offset == MDS_REQ_INTENT_REC_OFF);
/* if the request was at the intent record offset, the getattr reply goes back at offset 1 */
- if (offset) {
+ if (offset == MDS_REQ_INTENT_REC_OFF) {
rep = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*rep));
offset = 1;
}
}
#endif
- if (child_lockh->cookie != 0) {
+ if (lustre_handle_is_used(child_lockh)) {
LASSERT(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT);
resent_req = 1;
}
if (resent_req == 0) {
+ if (name) {
rc = mds_get_parent_child_locked(obd, &obd->u.mds, &body->fid1,
&parent_lockh, &dparent,
- LCK_PR, name, namesize,
- child_lockh, &dchild, LCK_PR);
- if (rc)
- GOTO(cleanup, rc);
+ LCK_CR,
+ MDS_INODELOCK_UPDATE,
+ name, namesize,
+ child_lockh, &dchild, LCK_CR,
+ child_part);
+ } else {
+ /* For revalidate by fid we always take UPDATE lock */
+ dchild = mds_fid2locked_dentry(obd, &body->fid2, NULL,
+ LCK_CR, child_lockh,
+ NULL, 0,
+ MDS_INODELOCK_UPDATE);
+ LASSERT(dchild);
+ if (IS_ERR(dchild))
+ rc = PTR_ERR(dchild);
+ }
+ if (rc)
+ GOTO(cleanup, rc);
} else {
struct ldlm_lock *granted_lock;
struct ll_fid child_fid;
case 2:
if (resent_req == 0) {
if (rc && dchild->d_inode)
- ldlm_lock_decref(child_lockh, LCK_PR);
- ldlm_lock_decref(&parent_lockh, LCK_PR);
+ ldlm_lock_decref(child_lockh, LCK_CR);
+ ldlm_lock_decref(&parent_lockh, LCK_CR);
l_dput(dparent);
}
l_dput(dchild);
return rc;
}
-static int mds_getattr(int offset, struct ptlrpc_request *req)
+static int mds_getattr(struct ptlrpc_request *req, int offset)
{
struct mds_obd *mds = mds_req2mds(req);
struct obd_device *obd = req->rq_export->exp_obd;
int rc;
spin_lock(&obd->obd_osfs_lock);
- rc = fsfilt_statfs(obd, obd->u.mds.mds_sb, max_age);
+ rc = fsfilt_statfs(obd, obd->u.obt.obt_sb, max_age);
if (rc == 0)
memcpy(osfs, &obd->obd_osfs, sizeof(*osfs));
spin_unlock(&obd->obd_osfs_lock);
return 0;
}
-static int mds_sync(struct ptlrpc_request *req)
+static int mds_sync(struct ptlrpc_request *req, int offset)
{
struct obd_device *obd = req->rq_export->exp_obd;
struct mds_obd *mds = &obd->u.mds;
if (body->fid1.id == 0) {
/* a fid of zero is taken to mean "sync whole filesystem" */
- rc = fsfilt_sync(obd, mds->mds_sb);
+ rc = fsfilt_sync(obd, obd->u.obt.obt_sb);
GOTO(out, rc);
} else {
struct dentry *de;
*
* If we were to take another one here, a deadlock will result, if another
* thread is already waiting for a PW lock. */
-static int mds_readpage(struct ptlrpc_request *req)
+static int mds_readpage(struct ptlrpc_request *req, int offset)
{
struct obd_device *obd = req->rq_export->exp_obd;
struct mds_obd *mds = &obd->u.mds;
GOTO(out, rc);
}
- body = lustre_swab_reqbuf(req, 0, sizeof(*body), lustre_swab_mds_body);
+ body = lustre_swab_reqbuf(req, offset, sizeof(*body),
+ lustre_swab_mds_body);
if (body == NULL)
GOTO (out, rc = -EFAULT);
RETURN(0);
}
+static int mds_handle_quotacheck(struct ptlrpc_request *req)
+{
+ struct obd_quotactl *oqctl;
+ int rc;
+ ENTRY;
+
+ oqctl = lustre_swab_reqbuf(req, 0, sizeof(*oqctl),
+ lustre_swab_obd_quotactl);
+ if (oqctl == NULL)
+ RETURN(-EPROTO);
+
+ rc = lustre_pack_reply(req, 0, NULL, NULL);
+ if (rc) {
+ CERROR("mds: out of memory while packing quotacheck reply\n");
+ RETURN(rc);
+ }
+
+ req->rq_status = obd_quotacheck(req->rq_export, oqctl);
+ RETURN(0);
+}
+
+static int mds_handle_quotactl(struct ptlrpc_request *req)
+{
+ struct obd_quotactl *oqctl, *repoqc;
+ int rc, size = sizeof(*repoqc);
+ ENTRY;
+
+ oqctl = lustre_swab_reqbuf(req, 0, sizeof(*oqctl),
+ lustre_swab_obd_quotactl);
+ if (oqctl == NULL)
+ RETURN(-EPROTO);
+
+ rc = lustre_pack_reply(req, 1, &size, NULL);
+ if (rc)
+ RETURN(rc);
+
+ repoqc = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*repoqc));
+
+ req->rq_status = obd_quotactl(req->rq_export, oqctl);
+ *repoqc = *oqctl;
+ RETURN(0);
+}
+
+static int mds_msg_check_version(struct lustre_msg *msg)
+{
+ int rc;
+
+ /* TODO: enable the check below once message versioning is really
+ * introduced; it is disabled for now because it would break
+ * compatibility with b1_4.
+ */
+ return 0;
+
+ switch (msg->opc) {
+ case MDS_CONNECT:
+ case MDS_DISCONNECT:
+ case OBD_PING:
+ rc = lustre_msg_check_version(msg, LUSTRE_OBD_VERSION);
+ if (rc)
+ CERROR("bad opc %u version %08x, expecting %08x\n",
+ msg->opc, msg->version, LUSTRE_OBD_VERSION);
+ break;
+ case MDS_GETSTATUS:
+ case MDS_GETATTR:
+ case MDS_GETATTR_NAME:
+ case MDS_STATFS:
+ case MDS_READPAGE:
+ case MDS_REINT:
+ case MDS_CLOSE:
+ case MDS_DONE_WRITING:
+ case MDS_PIN:
+ case MDS_SYNC:
+ case MDS_GETXATTR:
+ case MDS_SETXATTR:
+ case MDS_SET_INFO:
+ case MDS_QUOTACHECK:
+ case MDS_QUOTACTL:
+ case QUOTA_DQACQ:
+ case QUOTA_DQREL:
+ rc = lustre_msg_check_version(msg, LUSTRE_MDS_VERSION);
+ if (rc)
+ CERROR("bad opc %u version %08x, expecting %08x\n",
+ msg->opc, msg->version, LUSTRE_MDS_VERSION);
+ break;
+ case LDLM_ENQUEUE:
+ case LDLM_CONVERT:
+ case LDLM_BL_CALLBACK:
+ case LDLM_CP_CALLBACK:
+ rc = lustre_msg_check_version(msg, LUSTRE_DLM_VERSION);
+ if (rc)
+ CERROR("bad opc %u version %08x, expecting %08x\n",
+ msg->opc, msg->version, LUSTRE_DLM_VERSION);
+ break;
+ case OBD_LOG_CANCEL:
+ case LLOG_ORIGIN_HANDLE_CREATE:
+ case LLOG_ORIGIN_HANDLE_NEXT_BLOCK:
+ case LLOG_ORIGIN_HANDLE_PREV_BLOCK:
+ case LLOG_ORIGIN_HANDLE_READ_HEADER:
+ case LLOG_ORIGIN_HANDLE_CLOSE:
+ case LLOG_CATINFO:
+ rc = lustre_msg_check_version(msg, LUSTRE_LOG_VERSION);
+ if (rc)
+ CERROR("bad opc %u version %08x, expecting %08x\n",
+ msg->opc, msg->version, LUSTRE_LOG_VERSION);
+ break;
+ default:
+ CERROR("MDS unknown opcode %d\n", msg->opc);
+ rc = -ENOTSUPP;
+ }
+ return rc;
+}
int mds_handle(struct ptlrpc_request *req)
{
OBD_FAIL_RETURN(OBD_FAIL_MDS_ALL_REQUEST_NET | OBD_FAIL_ONCE, 0);
LASSERT(current->journal_info == NULL);
+
+ rc = mds_msg_check_version(req->rq_reqmsg);
+ if (rc) {
+ CERROR("MDS drop mal-formed request\n");
+ RETURN(rc);
+ }
+
/* XXX identical to OST */
if (req->rq_reqmsg->opc != MDS_CONNECT) {
struct mds_export_data *med;
case MDS_GETATTR:
DEBUG_REQ(D_INODE, req, "getattr");
OBD_FAIL_RETURN(OBD_FAIL_MDS_GETATTR_NET, 0);
- rc = mds_getattr(0, req);
+ rc = mds_getattr(req, MDS_REQ_REC_OFF);
break;
case MDS_SETXATTR:
break;
case MDS_GETATTR_NAME: {
- struct lustre_handle lockh;
+ struct lustre_handle lockh = { 0 };
DEBUG_REQ(D_INODE, req, "getattr_name");
OBD_FAIL_RETURN(OBD_FAIL_MDS_GETATTR_NAME_NET, 0);
* acquiring any new locks in mds_getattr_name, so we don't
* want to cancel.
*/
- lockh.cookie = 0;
- rc = mds_getattr_name(0, req, &lockh);
+ rc = mds_getattr_name(MDS_REQ_REC_OFF, req,
+ MDS_INODELOCK_UPDATE, &lockh);
/* this non-intent call (from an ioctl) is special */
req->rq_status = rc;
- if (rc == 0 && lockh.cookie)
- ldlm_lock_decref(&lockh, LCK_PR);
+ if (rc == 0 && lustre_handle_is_used(&lockh))
+ ldlm_lock_decref(&lockh, LCK_CR);
break;
}
case MDS_STATFS:
case MDS_READPAGE:
DEBUG_REQ(D_INODE, req, "readpage");
OBD_FAIL_RETURN(OBD_FAIL_MDS_READPAGE_NET, 0);
- rc = mds_readpage(req);
+ rc = mds_readpage(req, MDS_REQ_REC_OFF);
if (OBD_FAIL_CHECK_ONCE(OBD_FAIL_MDS_SENDPAGE)) {
RETURN(0);
break;
case MDS_REINT: {
- __u32 *opcp = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*opcp));
+ __u32 *opcp = lustre_msg_buf(req->rq_reqmsg, MDS_REQ_REC_OFF,
+ sizeof (*opcp));
__u32 opc;
- int size[3] = {sizeof(struct mds_body), mds->mds_max_mdsize,
+ int size[] = { sizeof(struct mds_body), mds->mds_max_mdsize,
mds->mds_max_cookiesize};
int bufcount;
if (rc)
break;
- rc = mds_reint(req, 0, NULL);
+ rc = mds_reint(req, MDS_REQ_REC_OFF, NULL);
fail = OBD_FAIL_MDS_REINT_NET_REP;
break;
}
case MDS_CLOSE:
DEBUG_REQ(D_INODE, req, "close");
OBD_FAIL_RETURN(OBD_FAIL_MDS_CLOSE_NET, 0);
- rc = mds_close(req);
+ rc = mds_close(req, MDS_REQ_REC_OFF);
break;
case MDS_DONE_WRITING:
DEBUG_REQ(D_INODE, req, "done_writing");
OBD_FAIL_RETURN(OBD_FAIL_MDS_DONE_WRITING_NET, 0);
- rc = mds_done_writing(req);
+ rc = mds_done_writing(req, MDS_REQ_REC_OFF);
break;
case MDS_PIN:
DEBUG_REQ(D_INODE, req, "pin");
OBD_FAIL_RETURN(OBD_FAIL_MDS_PIN_NET, 0);
- rc = mds_pin(req);
+ rc = mds_pin(req, MDS_REQ_REC_OFF);
break;
case MDS_SYNC:
DEBUG_REQ(D_INODE, req, "sync");
OBD_FAIL_RETURN(OBD_FAIL_MDS_SYNC_NET, 0);
- rc = mds_sync(req);
+ rc = mds_sync(req, MDS_REQ_REC_OFF);
break;
case MDS_SET_INFO:
case MDS_QUOTACHECK:
DEBUG_REQ(D_INODE, req, "quotacheck");
OBD_FAIL_RETURN(OBD_FAIL_MDS_QUOTACHECK_NET, 0);
- rc = mds_quotacheck(req);
+ rc = mds_handle_quotacheck(req);
break;
case MDS_QUOTACTL:
DEBUG_REQ(D_INODE, req, "quotactl");
OBD_FAIL_RETURN(OBD_FAIL_MDS_QUOTACTL_NET, 0);
- rc = mds_quotactl(req);
+ rc = mds_handle_quotactl(req);
break;
case OBD_PING:
OBD_FAIL_RETURN(OBD_FAIL_OBD_LOGD_NET, 0);
rc = llog_origin_handle_create(req);
break;
+ case LLOG_ORIGIN_HANDLE_DESTROY:
+ DEBUG_REQ(D_INODE, req, "llog_init");
+ OBD_FAIL_RETURN(OBD_FAIL_OBD_LOGD_NET, 0);
+ rc = llog_origin_handle_destroy(req);
+ break;
case LLOG_ORIGIN_HANDLE_NEXT_BLOCK:
DEBUG_REQ(D_INODE, req, "llog next block");
OBD_FAIL_RETURN(OBD_FAIL_OBD_LOGD_NET, 0);
rc = llog_origin_handle_next_block(req);
break;
+ case LLOG_ORIGIN_HANDLE_PREV_BLOCK:
+ DEBUG_REQ(D_INODE, req, "llog prev block");
+ OBD_FAIL_RETURN(OBD_FAIL_OBD_LOGD_NET, 0);
+ rc = llog_origin_handle_prev_block(req);
+ break;
case LLOG_ORIGIN_HANDLE_READ_HEADER:
DEBUG_REQ(D_INODE, req, "llog read header");
OBD_FAIL_RETURN(OBD_FAIL_OBD_LOGD_NET, 0);
lsd->lsd_last_transno = cpu_to_le64(mds->mds_last_transno);
- if (!(lsd->lsd_feature_compat & cpu_to_le32(LR_COMPAT_COMMON_LR))) {
+ if (!(lsd->lsd_feature_compat & cpu_to_le32(OBD_COMPAT_COMMON_LR))) {
/* Swap to the old mds_server_data format, in case
someone wants to revert to a pre-1.6 lustre */
CDEBUG(D_INFO, "writing old last_rcvd format\n");
RETURN(rc);
}
+static
+void fsoptions_to_mds_flags(struct mds_obd *mds, char *options)
+{
+ char *p = options;
+
+ while (*options) {
+ int len;
+
+ while (*p && *p != ',')
+ p++;
+
+ len = p - options;
+ if (len == sizeof("user_xattr") - 1 &&
+ memcmp(options, "user_xattr", len) == 0) {
+ mds->mds_fl_user_xattr = 1;
+ } else if (len == sizeof("acl") - 1 &&
+ memcmp(options, "acl", len) == 0) {
+#ifdef CONFIG_FS_POSIX_ACL
+ mds->mds_fl_acl = 1;
+#else
+ CWARN("ignoring unsupported acl mount option\n");
+ memmove(options, p, strlen(p) + 1);
+#endif
+ }
+
+ options = ++p;
+ }
+}
/* mount the file system (secretly). lustre_cfg parameters are:
* 1 = device
{
struct lprocfs_static_vars lvars;
struct lustre_cfg* lcfg = buf;
- char *options = NULL;
struct mds_obd *mds = &obd->u.mds;
struct lustre_mount_info *lmi;
struct vfsmount *mnt;
+ struct obd_uuid uuid;
+ __u8 *uuid_ptr;
+ char *options, *str, *label;
char ns_name[48];
unsigned long page;
int rc = 0;
ENTRY;
- /* setup 1:/dev/loop/0 2:ext3 3:mdsA 4:errors=remount-ro,iopen_nopriv*/
+ /* setup 1:/dev/loop/0 2:ext3 3:mdsA 4:errors=remount-ro,iopen_nopriv */
+
+ CLASSERT(offsetof(struct obd_device, u.obt) ==
+ offsetof(struct obd_device, u.mds.mds_obt));
if (lcfg->lcfg_bufcount < 3)
RETURN(rc = -EINVAL);
* MDS utility and the rest of options are passed by mount
* options. Probably this should be moved to somewhere else
* like startup scripts or lconf. */
- sprintf(options, "iopen_nopriv");
+ strcpy(options, "iopen_nopriv");
- if (LUSTRE_CFG_BUFLEN(lcfg, 4) > 0 && lustre_cfg_buf(lcfg, 4))
+ if (LUSTRE_CFG_BUFLEN(lcfg, 4) > 0 && lustre_cfg_buf(lcfg, 4)) {
sprintf(options + strlen(options), ",%s",
lustre_cfg_string(lcfg, 4));
+ fsoptions_to_mds_flags(mds, options);
+ }
mnt = do_kern_mount(lustre_cfg_string(lcfg, 2), 0,
- lustre_cfg_string(lcfg, 1), (void *)options);
+ lustre_cfg_string(lcfg, 1),
+ (void *)options);
free_page(page);
if (IS_ERR(mnt)) {
rc = PTR_ERR(mnt);
"mds_ldlm_client", &obd->obd_ldlm_client);
obd->obd_replayable = 1;
+ rc = lquota_setup(quota_interface, obd, lcfg);
+ if (rc)
+ GOTO(err_fs, rc);
+
mds->mds_group_hash = upcall_cache_init(obd->obd_name);
if (IS_ERR(mds->mds_group_hash)) {
rc = PTR_ERR(mds->mds_group_hash);
mds->mds_group_hash = NULL;
- GOTO(err_fs, rc);
+ GOTO(err_qctxt, rc);
}
- mds_quota_setup(mds);
-
/* Don't wait for mds_postrecov trying to clear orphans */
obd->obd_async_recov = 1;
rc = mds_postsetup(obd);
if (rc)
- GOTO(err_fs, rc);
+ GOTO(err_qctxt, rc);
obd->obd_async_recov = 0;
lprocfs_init_vars(mds, &lvars);
lprocfs_obd_setup(obd, lvars.obd_vars);
+ uuid_ptr = fsfilt_uuid(obd, obd->u.obt.obt_sb);
+ if (uuid_ptr != NULL) {
+ class_uuid_unparse(uuid_ptr, &uuid);
+ str = uuid.uuid;
+ } else {
+ str = "no UUID";
+ }
+
+ label = fsfilt_label(obd, obd->u.obt.obt_sb);
if (obd->obd_recovering) {
- LCONSOLE_WARN("MDT %s now serving %s, but will be in recovery "
- "until %d %s reconnect, or if no clients "
- "reconnect for %d:%.02d; during that time new "
+ LCONSOLE_WARN("MDT %s now serving %s (%s%s%s), but will be in "
+ "recovery until %d %s reconnect, or if no clients"
+ " reconnect for %d:%.02d; during that time new "
"clients will not be allowed to connect. "
"Recovery progress can be monitored by watching "
"/proc/fs/lustre/mds/%s/recovery_status.\n",
- obd->obd_name,
- lustre_cfg_string(lcfg, 1),
+ obd->obd_name, lustre_cfg_string(lcfg, 1),
+ label ?: "", label ? "/" : "", str,
obd->obd_recoverable_clients,
(obd->obd_recoverable_clients == 1)
? "client" : "clients",
(int)(OBD_RECOVERY_TIMEOUT / HZ) % 60,
obd->obd_name);
} else {
- LCONSOLE_INFO("MDT %s now serving %s with recovery %s.\n",
- obd->obd_name,
- lustre_cfg_string(lcfg, 1),
+ LCONSOLE_INFO("MDT %s now serving %s (%s%s%s) with recovery "
+ "%s\n", obd->obd_name, lustre_cfg_string(lcfg, 1),
+ label ?: "", label ? "/" : "", str,
obd->obd_replayable ? "enabled" : "disabled");
}
RETURN(0);
+err_qctxt:
+ lquota_cleanup(quota_interface, obd);
err_fs:
/* No extra cleanup needed for llog_init_commit_thread() */
mds_fs_cleanup(obd);
mntput(mds->mds_vfsmnt);
lock_kernel();
}
- mds->mds_sb = 0;
+ obd->u.obt.obt_sb = NULL;
return rc;
}
if (rc)
RETURN(rc);
+ rc = llog_setup(obd, LLOG_LOVEA_ORIG_CTXT, obd, 0, NULL,
+ &llog_lvfs_ops);
+ if (rc)
+ RETURN(rc);
+
if (mds->mds_profile) {
struct lustre_profile *lprof;
#if 0
mds_lov_clean(obd);
//err_llog:
llog_cleanup(llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT));
+ llog_cleanup(llog_get_context(obd, LLOG_LOVEA_ORIG_CTXT));
RETURN(rc);
}
/* FIXME why not put this in the synchronize? */
/* set nextid first, so we are sure it happens */
rc = mds_lov_set_nextid(obd);
- if (rc)
+ if (rc) {
+ CERROR("%s: mds_lov_set_nextid failed\n",
+ obd->obd_name);
GOTO(out, rc);
-
+ }
+
/* clean PENDING dir */
rc = mds_cleanup_pending(obd);
if (rc < 0) {
OBD_NOTIFY_SYNC, NULL);
//mds_lov_start_synchronize(obd, NULL, NULL, obd->obd_async_recov);
+ /* quota recovery */
+ lquota_recovery(quota_interface, obd);
+
out:
RETURN(rc < 0 ? rc : item);
}
mds_lov_disconnect(obd);
mds_lov_clean(obd);
llog_cleanup(llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT));
+ llog_cleanup(llog_get_context(obd, LLOG_LOVEA_ORIG_CTXT));
rc = obd_llog_finish(obd, 0);
}
RETURN(rc);
ping_evictor_stop();
- if (mds->mds_sb == NULL)
+ if (obd->u.obt.obt_sb == NULL)
RETURN(0);
- save_dev = lvfs_sbdev(mds->mds_sb);
+ save_dev = lvfs_sbdev(obd->u.obt.obt_sb);
if (mds->mds_osc_exp)
/* lov export was disconnected by mds_lov_clean;
lprocfs_obd_cleanup(obd);
- mds_quota_cleanup(mds);
+ lquota_cleanup(quota_interface, obd);
mds_update_server_data(obd, 1);
if (mds->mds_lov_objids != NULL)
/* We can only unlock kernel if we are in the context of sys_ioctl,
otherwise we never called lock_kernel */
- if (kernel_locked()) {
+ if (ll_kernel_locked()) {
unlock_kernel();
must_relock++;
}
if (must_put)
/* In case we didn't mount with lustre_get_mount -- old method*/
mntput(mds->mds_vfsmnt);
- mds->mds_sb = NULL;
+ obd->u.obt.obt_sb = NULL;
ldlm_namespace_free(obd->obd_namespace, obd->obd_force);
RETURN(0);
}
-static void fixup_handle_for_resent_req(struct ptlrpc_request *req,
+static void fixup_handle_for_resent_req(struct ptlrpc_request *req, int offset,
struct ldlm_lock *new_lock,
struct ldlm_lock **old_lock,
struct lustre_handle *lockh)
struct obd_export *exp = req->rq_export;
struct obd_device *obd = exp->exp_obd;
struct ldlm_request *dlmreq =
- lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*dlmreq));
+ lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*dlmreq));
struct lustre_handle remote_hdl = dlmreq->lock_handle1;
struct list_head *iter;
struct ldlm_reply *rep;
struct lustre_handle lockh = { 0 };
struct ldlm_lock *new_lock = NULL;
- int rc, offset = 2, repsize[4] = {sizeof(struct ldlm_reply),
- sizeof(struct mds_body),
- mds->mds_max_mdsize,
- mds->mds_max_cookiesize};
+ int getattr_part = MDS_INODELOCK_UPDATE;
+ int repsize[4] = {sizeof(*rep),
+ sizeof(struct mds_body),
+ mds->mds_max_mdsize};
+ int repbufcnt = 3, offset = MDS_REQ_INTENT_REC_OFF;
+ int rc;
ENTRY;
LASSERT(req != NULL);
- if (req->rq_reqmsg->bufcount <= 1) {
+ if (req->rq_reqmsg->bufcount <= MDS_REQ_INTENT_IT_OFF) {
/* No intent was provided */
int size = sizeof(struct ldlm_reply);
rc = lustre_pack_reply(req, 1, &size, NULL);
RETURN(0);
}
- it = lustre_swab_reqbuf(req, 1, sizeof(*it), lustre_swab_ldlm_intent);
+ it = lustre_swab_reqbuf(req, MDS_REQ_INTENT_IT_OFF, sizeof(*it),
+ lustre_swab_ldlm_intent);
if (it == NULL) {
CERROR("Intent missing\n");
RETURN(req->rq_status = -EFAULT);
LDLM_DEBUG(lock, "intent policy, opc: %s", ldlm_it2str(it->opc));
- rc = lustre_pack_reply(req, it->opc == IT_UNLINK ? 4 : 3, repsize,
- NULL);
+ if ((req->rq_export->exp_connect_flags & OBD_CONNECT_ACL) &&
+ (it->opc & (IT_OPEN | IT_GETATTR | IT_LOOKUP)))
+ /* we should never allow OBD_CONNECT_ACL if not configured */
+ repsize[repbufcnt++] = LUSTRE_POSIX_ACL_MAX_SIZE;
+ else if (it->opc & IT_UNLINK)
+ repsize[repbufcnt++] = mds->mds_max_cookiesize;
+
+ rc = lustre_pack_reply(req, repbufcnt, repsize, NULL);
if (rc)
RETURN(req->rq_status = rc);
switch ((long)it->opc) {
case IT_OPEN:
case IT_CREAT|IT_OPEN:
- fixup_handle_for_resent_req(req, lock, NULL, &lockh);
+ fixup_handle_for_resent_req(req, MDS_REQ_INTENT_LOCKREQ_OFF,
+ lock, NULL, &lockh);
/* XXX swab here to assert that an mds_open reint
* packet is following */
rep->lock_policy_res2 = mds_reint(req, offset, &lockh);
#endif
RETURN(ELDLM_LOCK_ABORTED);
break;
- case IT_GETATTR:
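+ /* IT_LOOKUP deliberately falls through to IT_GETATTR: a
+ * lookup needs only the LOOKUP bit, while a getattr needs
+ * the UPDATE bit (the default set above) plus LOOKUP. */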
case IT_LOOKUP:
+ getattr_part = MDS_INODELOCK_LOOKUP;
+ case IT_GETATTR:
+ getattr_part |= MDS_INODELOCK_LOOKUP;
case IT_READDIR:
- fixup_handle_for_resent_req(req, lock, &new_lock, &lockh);
+ fixup_handle_for_resent_req(req, MDS_REQ_INTENT_LOCKREQ_OFF,
+ lock, &new_lock, &lockh);
+
+ /* INODEBITS_INTEROP: if this lock was converted from a
+ * plain lock (client does not support inodebits), then
+ * child lock must be taken with both lookup and update
+ * bits set for all operations.
+ */
+ if (!(req->rq_export->exp_connect_flags & OBD_CONNECT_IBITS))
+ getattr_part = MDS_INODELOCK_LOOKUP |
+ MDS_INODELOCK_UPDATE;
+
rep->lock_policy_res2 = mds_getattr_name(offset, req,
- &lockh);
+ getattr_part, &lockh);
/* FIXME: LDLM can set req->rq_status. MDS sets
policy_res{1,2} with disposition and status.
- replay: returns 0 & req->status is old status
}
static int mds_health_check(struct obd_device *obd)
{
+ struct obd_device_target *odt = &obd->u.obt;
struct mds_obd *mds = &obd->u.mds;
int rc = 0;
- if (mds->mds_sb->s_flags & MS_RDONLY)
+ if (odt->obt_sb->s_flags & MS_RDONLY)
rc = 1;
LASSERT(mds->mds_health_check_filp != NULL);
static struct obd_ops mds_obd_ops = {
.o_owner = THIS_MODULE,
.o_connect = mds_connect,
+ .o_reconnect = mds_reconnect,
.o_init_export = mds_init_export,
.o_destroy_export = mds_destroy_export,
.o_disconnect = mds_disconnect,
.o_health_check = mdt_health_check,
};
+quota_interface_t *quota_interface;
+quota_interface_t mds_quota_interface;
+
static int __init mds_init(void)
{
int rc;
struct lprocfs_static_vars lvars;
- rc = lustre_dquot_init();
- if (rc)
+ quota_interface = PORTAL_SYMBOL_GET(mds_quota_interface);
+ rc = lquota_init(quota_interface);
+ if (rc) {
+ if (quota_interface)
+ PORTAL_SYMBOL_PUT(mds_quota_interface);
return rc;
-
+ }
+ init_obd_quota_ops(quota_interface, &mds_obd_ops);
+
lprocfs_init_vars(mds, &lvars);
class_register_type(&mds_obd_ops, lvars.module_vars, LUSTRE_MDS_NAME);
lprocfs_init_vars(mdt, &lvars);
static void /*__exit*/ mds_exit(void)
{
- lustre_dquot_exit();
+ lquota_exit(quota_interface);
+ if (quota_interface)
+ PORTAL_SYMBOL_PUT(mds_quota_interface);
class_unregister_type(LUSTRE_MDS_NAME);
class_unregister_type(LUSTRE_MDT_NAME);
return count;
}
+#ifdef HAVE_QUOTA_SUPPORT
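+/* Quota tunables below: {b,i}unit_sz are the block/inode chunk sizes
+ * acquired or released in a single dqacq request, while {b,i}tune_sz
+ * are the low-water marks that trigger early acquisition.  The write
+ * handlers keep tune strictly below unit (and block values in
+ * QUOTABLOCK_SIZE multiples) so the two can never invert. */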
+static int lprocfs_mds_rd_bunit(char *page, char **start, off_t off, int count,
+ int *eof, void *data)
+{
+ struct obd_device *obd = (struct obd_device *)data;
+ LASSERT(obd != NULL);
+
+ return snprintf(page, count, "%lu\n",
+ obd->u.obt.obt_qctxt.lqc_bunit_sz);
+}
+
+static int lprocfs_mds_rd_iunit(char *page, char **start, off_t off, int count,
+ int *eof, void *data)
+{
+ struct obd_device *obd = (struct obd_device *)data;
+ LASSERT(obd != NULL);
+
+ return snprintf(page, count, "%lu\n",
+ obd->u.obt.obt_qctxt.lqc_iunit_sz);
+}
+
+static int lprocfs_mds_wr_bunit(struct file *file, const char *buffer,
+ unsigned long count, void *data)
+{
+ struct obd_device *obd = (struct obd_device *)data;
+ int val, rc;
+ LASSERT(obd != NULL);
+
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc)
+ return rc;
+
+ if (val % QUOTABLOCK_SIZE ||
+ val <= obd->u.obt.obt_qctxt.lqc_btune_sz)
+ return -EINVAL;
+
+ obd->u.obt.obt_qctxt.lqc_bunit_sz = val;
+ return count;
+}
+
+static int lprocfs_mds_wr_iunit(struct file *file, const char *buffer,
+ unsigned long count, void *data)
+{
+ struct obd_device *obd = (struct obd_device *)data;
+ int val, rc;
+ LASSERT(obd != NULL);
+
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc)
+ return rc;
+
+ if (val <= obd->u.obt.obt_qctxt.lqc_itune_sz)
+ return -EINVAL;
+
+ obd->u.obt.obt_qctxt.lqc_iunit_sz = val;
+ return count;
+}
+
+static int lprocfs_mds_rd_btune(char *page, char **start, off_t off, int count,
+ int *eof, void *data)
+{
+ struct obd_device *obd = (struct obd_device *)data;
+ LASSERT(obd != NULL);
+
+ return snprintf(page, count, "%lu\n",
+ obd->u.obt.obt_qctxt.lqc_btune_sz);
+}
+
+static int lprocfs_mds_rd_itune(char *page, char **start, off_t off, int count,
+ int *eof, void *data)
+{
+ struct obd_device *obd = (struct obd_device *)data;
+ LASSERT(obd != NULL);
+
+ return snprintf(page, count, "%lu\n",
+ obd->u.obt.obt_qctxt.lqc_itune_sz);
+}
+
+static int lprocfs_mds_wr_btune(struct file *file, const char *buffer,
+ unsigned long count, void *data)
+{
+ struct obd_device *obd = (struct obd_device *)data;
+ int val, rc;
+ LASSERT(obd != NULL);
+
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc)
+ return rc;
+
+ if (val <= QUOTABLOCK_SIZE * MIN_QLIMIT || val % QUOTABLOCK_SIZE ||
+ val >= obd->u.obt.obt_qctxt.lqc_bunit_sz)
+ return -EINVAL;
+
+ obd->u.obt.obt_qctxt.lqc_btune_sz = val;
+ return count;
+}
+
+static int lprocfs_mds_wr_itune(struct file *file, const char *buffer,
+ unsigned long count, void *data)
+{
+ struct obd_device *obd = (struct obd_device *)data;
+ int val, rc;
+ LASSERT(obd != NULL);
+
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc)
+ return rc;
+
+ if (val <= MIN_QLIMIT ||
+ val >= obd->u.obt.obt_qctxt.lqc_iunit_sz)
+ return -EINVAL;
+
+ obd->u.obt.obt_qctxt.lqc_itune_sz = val;
+ return count;
+}
+#endif
+
struct lprocfs_vars lprocfs_mds_obd_vars[] = {
{ "uuid", lprocfs_rd_uuid, 0, 0 },
{ "blocksize", lprocfs_rd_blksize, 0, 0 },
lprocfs_wr_group_acquire_expire, 0},
{ "group_upcall", lprocfs_rd_group_upcall,
lprocfs_wr_group_upcall, 0},
- { "group_flush", 0, lprocfs_wr_group_flush, 0},
- { "group_info", 0, lprocfs_wr_group_info, 0 },
+ { "group_flush", 0, lprocfs_wr_group_flush, 0},
+ { "group_info", 0, lprocfs_wr_group_info, 0 },
{ 0 }
};
#include "mds_internal.h"
-/* This limit is arbitrary (32k clients on x86), but it is convenient to use
- * 2^n * PAGE_SIZE * 8 for the number of bits that fit an order-n allocation. */
-#define MDS_MAX_CLIENTS (PAGE_SIZE * 8)
-
-#define LAST_RCVD "last_rcvd"
-#define LOV_OBJID "lov_objid"
#define HEALTH_CHECK "health_check"
/* Add client data to the MDS. We use a bitmap to locate a free space
* there's no need for extra complication here
*/
if (new_client) {
- cl_idx = find_first_zero_bit(bitmap, MDS_MAX_CLIENTS);
+ cl_idx = find_first_zero_bit(bitmap, LR_MAX_CLIENTS);
repeat:
- if (cl_idx >= MDS_MAX_CLIENTS ||
+ if (cl_idx >= LR_MAX_CLIENTS ||
OBD_FAIL_CHECK_ONCE(OBD_FAIL_MDS_CLIENT_ADD)) {
- CERROR("no room for clients - fix MDS_MAX_CLIENTS\n");
+ CERROR("no room for clients - fix LR_MAX_CLIENTS\n");
return -EOVERFLOW;
}
if (test_and_set_bit(cl_idx, bitmap)) {
- cl_idx = find_next_zero_bit(bitmap, MDS_MAX_CLIENTS,
+ cl_idx = find_next_zero_bit(bitmap, LR_MAX_CLIENTS,
cl_idx);
goto repeat;
}
static int mds_server_free_data(struct mds_obd *mds)
{
- OBD_FREE(mds->mds_client_bitmap, MDS_MAX_CLIENTS / 8);
+ OBD_FREE(mds->mds_client_bitmap, LR_MAX_CLIENTS / 8);
OBD_FREE(mds->mds_server_data, sizeof(*mds->mds_server_data));
mds->mds_server_data = NULL;
LASSERT(offsetof(struct lr_server_data, lsd_padding) +
sizeof(lsd->lsd_padding) == LR_SERVER_SIZE);
LASSERT(offsetof(struct mds_client_data, mcd_padding) +
- sizeof(mcd->mcd_padding) == MDS_LR_CLIENT_SIZE);
- LASSERT(MDS_LR_CLIENT_SIZE == LR_CLIENT_SIZE);
- LASSERT(MDS_LR_CLIENT_START == LR_CLIENT_START);
+ sizeof(mcd->mcd_padding) == LR_CLIENT_SIZE);
OBD_ALLOC_WAIT(lsd, sizeof(*lsd));
if (!lsd)
RETURN(-ENOMEM);
- OBD_ALLOC_WAIT(mds->mds_client_bitmap, MDS_MAX_CLIENTS / 8);
+ OBD_ALLOC_WAIT(mds->mds_client_bitmap, LR_MAX_CLIENTS / 8);
if (!mds->mds_client_bitmap) {
OBD_FREE(lsd, sizeof(*lsd));
RETURN(-ENOMEM);
memcpy(lsd->lsd_uuid, obd->obd_uuid.uuid,sizeof(lsd->lsd_uuid));
lsd->lsd_last_transno = 0;
mount_count = lsd->lsd_mount_count = 0;
- lsd->lsd_server_size = cpu_to_le32(MDS_LR_SERVER_SIZE);
- lsd->lsd_client_start = cpu_to_le32(MDS_LR_CLIENT_START);
- lsd->lsd_client_size = cpu_to_le16(MDS_LR_CLIENT_SIZE);
- lsd->lsd_feature_rocompat = cpu_to_le32(MDS_ROCOMPAT_LOVOBJID);
- lsd->lsd_feature_compat = cpu_to_le32(LR_COMPAT_COMMON_LR);
+ lsd->lsd_server_size = cpu_to_le32(LR_SERVER_SIZE);
+ lsd->lsd_client_start = cpu_to_le32(LR_CLIENT_START);
+ lsd->lsd_client_size = cpu_to_le16(LR_CLIENT_SIZE);
+ lsd->lsd_feature_rocompat = cpu_to_le32(OBD_ROCOMPAT_LOVOBJID);
+ lsd->lsd_feature_compat = cpu_to_le32(OBD_COMPAT_COMMON_LR);
} else {
rc = fsfilt_read_record(obd, file, lsd, sizeof(*lsd), &off);
if (rc) {
}
mount_count = le64_to_cpu(lsd->lsd_mount_count);
}
- if (lsd->lsd_feature_incompat & ~cpu_to_le32(MDS_INCOMPAT_SUPP)) {
- CERROR("unsupported incompat feature %x\n",
- le32_to_cpu(lsd->lsd_feature_incompat) &
- ~MDS_INCOMPAT_SUPP);
+ if (lsd->lsd_feature_incompat & ~cpu_to_le32(MDT_INCOMPAT_SUPP)) {
+ CERROR("%s: unsupported incompat filesystem feature(s) %x\n",
+ obd->obd_name, le32_to_cpu(lsd->lsd_feature_incompat) &
+ ~MDT_INCOMPAT_SUPP);
GOTO(err_msd, rc = -EINVAL);
}
- if (lsd->lsd_feature_rocompat & ~cpu_to_le32(MDS_ROCOMPAT_SUPP)) {
- CERROR("unsupported read-only feature %x\n",
- le32_to_cpu(lsd->lsd_feature_rocompat) &
- ~MDS_ROCOMPAT_SUPP);
+ if (lsd->lsd_feature_rocompat & ~cpu_to_le32(MDT_ROCOMPAT_SUPP)) {
+ CERROR("%s: unsupported read-only filesystem feature(s) %x\n",
+ obd->obd_name, le32_to_cpu(lsd->lsd_feature_rocompat) &
+ ~MDT_ROCOMPAT_SUPP);
/* Do something like remount filesystem read-only */
GOTO(err_msd, rc = -EINVAL);
}
- if (!(lsd->lsd_feature_compat & cpu_to_le32(LR_COMPAT_COMMON_LR))) {
- CDEBUG(D_WARNING, "old last_rcvd format\n");
- lsd->lsd_mount_count = lsd->lsd_last_transno; //msd->msd_mount_count
- lsd->lsd_last_transno = lsd->lsd_unused; //msd->msd_last_transno;
+ if (!(lsd->lsd_feature_compat & cpu_to_le32(OBD_COMPAT_COMMON_LR))) {
+ CDEBUG(D_WARNING, "using old last_rcvd format\n");
+ lsd->lsd_mount_count = lsd->lsd_last_transno;
+ lsd->lsd_last_transno = lsd->lsd_unused;
/* If we update the last_rcvd, we can never go back to
- an old install. Leave this in the old format for now.
+ an old install, so leave this in the old format for now.
lsd->lsd_feature_compat |= cpu_to_le32(LR_COMPAT_COMMON_LR);
*/
}
RETURN(rc);
mds->mds_vfsmnt = mnt;
- mds->mds_sb = mnt->mnt_root->d_inode->i_sb;
+ obd->u.obt.obt_sb = mnt->mnt_root->d_inode->i_sb;
- fsfilt_setup(obd, mds->mds_sb);
+ fsfilt_setup(obd, obd->u.obt.obt_sb);
OBD_SET_CTXT_MAGIC(&obd->obd_lvfs_ctxt);
obd->obd_lvfs_ctxt.pwdmnt = mnt;
int rc = 0;
if (obd->obd_fail)
- CERROR("%s: shutting down for failover; client state will"
- " be preserved.\n", obd->obd_name);
+ CWARN("%s: shutting down for failover; client state will "
+ "be preserved.\n", obd->obd_name);
class_disconnect_exports(obd); /* cleans up client info too */
mds_server_free_data(mds);
mds->mds_pending_dir = NULL;
}
- mds_fs_quota_cleanup(mds);
-
+ lquota_fs_cleanup(quota_interface, obd);
+
pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
shrink_dcache_parent(mds->mds_fid_de);
dput(mds->mds_fid_de);
- LL_DQUOT_OFF(mds->mds_sb);
+ LL_DQUOT_OFF(obd->u.obt.obt_sb);
return rc;
}
{
struct mds_obd *mds = &exp->exp_obd->u.mds;
struct inode *parent_inode = mds->mds_objects_dir->d_inode;
- unsigned int tmpname = ll_insecure_random_int();
+ unsigned int tmpname = ll_rand();
struct file *filp;
struct dentry *new_child;
struct lvfs_run_ctxt saved;
}
int mds_obd_destroy(struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *ea, struct obd_trans_info *oti)
+ struct lov_stripe_md *ea, struct obd_trans_info *oti,
+ struct obd_export *md_exp)
{
struct mds_obd *mds = &exp->exp_obd->u.mds;
struct inode *parent_inode = mds->mds_objects_dir->d_inode;
#define _MDS_INTERNAL_H
#include <linux/lustre_mds.h>
+#include <linux/lustre_disk.h>
+
+#define MDT_ROCOMPAT_SUPP (OBD_ROCOMPAT_LOVOBJID)
+
+#define MDT_INCOMPAT_SUPP (OBD_INCOMPAT_MDT)
+
+/* Data stored per client in the last_rcvd file. In le32 order. */
+struct mds_client_data {
+ __u8 mcd_uuid[40]; /* client UUID */
+ __u64 mcd_last_transno; /* last completed transaction ID */
+ __u64 mcd_last_xid; /* xid for the last transaction */
+ __u32 mcd_last_result; /* result from last RPC */
+ __u32 mcd_last_data; /* per-op data (disposition for open &c.) */
+ __u8 mcd_padding[LR_CLIENT_SIZE - 64];
+};
#define MDS_SERVICE_WATCHDOG_TIMEOUT (obd_timeout * 1000)
}
/* mds/mds_reint.c */
-int res_gt(struct ldlm_res_id *res1, struct ldlm_res_id *res2);
+int res_gt(struct ldlm_res_id *res1, struct ldlm_res_id *res2,
+ ldlm_policy_data_t *p1, ldlm_policy_data_t *p2);
int enqueue_ordered_locks(struct obd_device *obd, struct ldlm_res_id *p1_res_id,
struct lustre_handle *p1_lockh, int p1_lock_mode,
+ ldlm_policy_data_t *p1_policy,
struct ldlm_res_id *p2_res_id,
- struct lustre_handle *p2_lockh, int p2_lock_mode);
+ struct lustre_handle *p2_lockh, int p2_lock_mode,
+ ldlm_policy_data_t *p2_policy);
void mds_commit_cb(struct obd_device *, __u64 last_rcvd, void *data, int error);
int mds_finish_transno(struct mds_obd *mds, struct inode *inode, void *handle,
struct ptlrpc_request *req, int rc, __u32 op_data);
struct ll_fid *fid,
struct lustre_handle *parent_lockh,
struct dentry **dparentp, int parent_mode,
+ __u64 parent_lockpart,
char *name, int namelen,
struct lustre_handle *child_lockh,
- struct dentry **dchildp, int child_mode);
+ struct dentry **dchildp, int child_mode,
+ __u64 child_lockpart);
int mds_lock_new_child(struct obd_device *obd, struct inode *inode,
struct lustre_handle *child_lockh);
int mds_osc_setattr_async(struct obd_device *obd, struct inode *inode,
struct lov_mds_md *lmm, int lmm_size,
- struct llog_cookie *logcookies);
-
+ struct llog_cookie *logcookies, struct ll_fid *fid);
+
+int mds_get_parents_children_locked(struct obd_device *obd,
+ struct mds_obd *mds,
+ struct ll_fid *p1_fid,
+ struct dentry **de_srcdirp,
+ struct ll_fid *p2_fid,
+ struct dentry **de_tgtdirp,
+ int parent_mode,
+ const char *old_name, int old_len,
+ struct dentry **de_oldp,
+ const char *new_name, int new_len,
+ struct dentry **de_newp,
+ struct lustre_handle *dlm_handles,
+ int child_mode);
+
+void mds_shrink_reply(struct obd_device *obd, struct ptlrpc_request *req,
+ struct mds_body *body);
+int mds_get_cookie_size(struct obd_device *obd, struct lov_mds_md *lmm);
/* mds/mds_lib.c */
int mds_update_unpack(struct ptlrpc_request *, int offset,
struct mds_update_record *);
int mds_lov_disconnect(struct obd_device *obd);
int mds_lov_write_objids(struct obd_device *obd);
void mds_lov_update_objids(struct obd_device *obd, obd_id *ids);
+int mds_lov_clear_orphans(struct mds_obd *mds, struct obd_uuid *ost_uuid);
int mds_lov_set_nextid(struct obd_device *obd);
-int mds_lov_clearorphans(struct mds_obd *mds, struct obd_uuid *ost_uuid);
int mds_lov_start_synchronize(struct obd_device *obd,
struct obd_device *watched,
void *data, int nonblock);
struct lov_mds_md *lmm, int lmm_size);
void mds_objids_from_lmm(obd_id *ids, struct lov_mds_md *lmm,
struct lov_desc *desc);
+int mds_init_lov_desc(struct obd_device *obd, struct obd_export *osc_exp);
/* mds/mds_open.c */
int mds_query_write_access(struct inode *inode);
int mds_open(struct mds_update_record *rec, int offset,
struct ptlrpc_request *req, struct lustre_handle *);
-int mds_pin(struct ptlrpc_request *req);
+int mds_pin(struct ptlrpc_request *req, int offset);
void mds_mfd_unlink(struct mds_file_data *mfd, int decref);
-int mds_mfd_close(struct ptlrpc_request *req, struct obd_device *obd,
+int mds_mfd_close(struct ptlrpc_request *req, int offset, struct obd_device *obd,
struct mds_file_data *mfd, int unlink_orphan);
-int mds_close(struct ptlrpc_request *req);
-int mds_done_writing(struct ptlrpc_request *req);
+int mds_close(struct ptlrpc_request *req, int offset);
+int mds_done_writing(struct ptlrpc_request *req, int offset);
+/* mds/mds_join.c */
+int mds_join_file(struct mds_update_record *rec, struct ptlrpc_request *req,
+ struct dentry *dchild, struct lustre_handle *lockh);
/* mds/mds_fs.c */
int mds_client_add(struct obd_device *obd, struct mds_obd *mds,
int mds_obd_create(struct obd_export *exp, struct obdo *oa,
struct lov_stripe_md **ea, struct obd_trans_info *oti);
int mds_obd_destroy(struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *ea, struct obd_trans_info *oti);
+ struct lov_stripe_md *ea, struct obd_trans_info *oti,
+ struct obd_export *md_exp);
/* mds/handler.c */
extern struct lvfs_callback_ops mds_lvfs_ops;
void mds_pack_inode2fid(struct ll_fid *fid, struct inode *inode);
void mds_pack_inode2body(struct mds_body *body, struct inode *inode);
#endif
+int mds_pack_acl(struct mds_export_data *med, struct inode *inode,
+ struct lustre_msg *repmsg, struct mds_body *repbody,
+ int repoff);
+
+/* quota stuff */
+extern quota_interface_t mds_quota_interface;
+extern quota_interface_t *quota_interface;
/* mds/mds_xattr.c */
int mds_setxattr(struct ptlrpc_request *req);
int mds_getxattr(struct ptlrpc_request *req);
-/* mds/quota_master.c */
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) && defined (HAVE_QUOTA_SUPPORT)
-int lustre_dquot_init(void);
-void lustre_dquot_exit(void);
-int dqacq_handler(struct obd_device *obd, struct qunit_data *qdata, int opc);
-void mds_adjust_qunit(struct obd_device *obd, uid_t cuid, gid_t cgid,
- uid_t puid, gid_t pgid, int rc);
-int init_admin_quotafiles(struct obd_device *obd, struct obd_quotactl *oqctl);
-int mds_quota_on(struct obd_device *obd, struct obd_quotactl *oqctl);
-int mds_quota_off(struct obd_device *obd, struct obd_quotactl *oqctl);
-int mds_set_dqinfo(struct obd_device *obd, struct obd_quotactl *oqctl);
-int mds_get_dqinfo(struct obd_device *obd, struct obd_quotactl *oqctl);
-int mds_set_dqblk(struct obd_device *obd, struct obd_quotactl *oqctl);
-int mds_get_dqblk(struct obd_device *obd, struct obd_quotactl *oqctl);
-#else
-static inline int lustre_dquot_init(void) { return 0; }
-static inline void lustre_dquot_exit(void) { return; }
-static inline int dqacq_handler(struct obd_device *obd,
- struct qunit_data *qdata, int opc) {return 0;}
-static inline void mds_adjust_qunit(struct obd_device *obd, uid_t cuid,
- gid_t cgid, uid_t puid,
- gid_t pgid, int rc) { return; }
-static inline int init_admin_quotafiles(struct obd_device *obd,
- struct obd_quotactl *oqctl) {return 0;}
-static inline int mds_quota_on(struct obd_device *obd,
- struct obd_quotactl *oqctl) { return 0; }
-static inline int mds_quota_off(struct obd_device *obd,
- struct obd_quotactl *oqctl) { return 0; }
-static inline int mds_set_dqinfo(struct obd_device *obd,
- struct obd_quotactl *oqctl) { return 0; }
-static inline int mds_get_dqinfo(struct obd_device *obd,
- struct obd_quotactl *oqctl) { return 0; }
-static inline int mds_set_dqblk(struct obd_device *obd,
- struct obd_quotactl *oqctl) { return 0; }
-static inline int mds_get_dqblk(struct obd_device *obd,
- struct obd_quotactl *oqctl) { return 0; }
-#endif /* KERNEL_VERSION(2,5,0) && QUOTA */
-
-#ifdef HAVE_QUOTA_SUPPORT
-/* Internal quota stuff */
-int mds_quotacheck(struct ptlrpc_request *req);
-int mds_quotactl(struct ptlrpc_request *req);
-void mds_quota_setup(struct mds_obd *mds);
-void mds_quota_cleanup(struct mds_obd *mds);
-void mds_fs_quota_cleanup(struct mds_obd *mds);
-
-#ifdef LPROCFS
-int lprocfs_mds_rd_bunit(char *page, char **start, off_t off, int count,
- int *eof, void *data);
-int lprocfs_mds_rd_iunit(char *page, char **start, off_t off, int count,
- int *eof, void *data);
-int lprocfs_mds_wr_bunit(struct file *file, const char *buffer,
- unsigned long count, void *data);
-int lprocfs_mds_wr_iunit(struct file *file, const char *buffer,
- unsigned long count, void *data);
-int lprocfs_mds_rd_btune(char *page, char **start, off_t off, int count,
- int *eof, void *data);
-int lprocfs_mds_rd_itune(char *page, char **start, off_t off, int count,
- int *eof, void *data);
-int lprocfs_mds_wr_btune(struct file *file, const char *buffer,
- unsigned long count, void *data);
-int lprocfs_mds_wr_itune(struct file *file, const char *buffer,
- unsigned long count, void *data);
-#endif /* LPROCFS */
-#else /* QUOTA */
-static inline int mds_quotacheck(struct ptlrpc_request *req)
-{
- req->rq_status = -EOPNOTSUPP;
- return -EOPNOTSUPP;
-}
-static inline int mds_quotactl(struct ptlrpc_request *req)
-{
- req->rq_status = -EOPNOTSUPP;
- return -EOPNOTSUPP;
-}
-static inline void mds_quota_setup(struct mds_obd *mds) {}
-static inline void mds_quota_cleanup(struct mds_obd *mds) {}
-static inline void mds_fs_quota_cleanup(struct mds_obd *mds) {}
-#endif /* Quota */
-
-
#endif /* _MDS_INTERNAL_H */
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * linux/mds/mds_join.c
+ * Lustre Metadata join handler file
+ *
+ * Copyright (c) 2001-2005 Cluster File Systems, Inc.
+ * Author: Wang Di <wangdi@clusterfs.com>
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_MDS
+
+#include <linux/fs.h>
+#include <linux/jbd.h>
+#include <linux/ext3_fs.h>
+#include <linux/obd_support.h>
+#include <linux/obd_class.h>
+#include <linux/obd.h>
+#include <linux/lustre_lib.h>
+#include <linux/lustre_idl.h>
+#include <linux/lustre_mds.h>
+#include <linux/lustre_dlm.h>
+#include <linux/lustre_log.h>
+#include <linux/lustre_fsfilt.h>
+#include <linux/lustre_lite.h>
+#include "mds_internal.h"
+#include <linux/obd_lov.h>
+
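+/* State threaded through the llog iteration callbacks below: the
+ * target llog handle, the last plain lmm seen, the joined lmm being
+ * built, the current end-of-extent offset, and the size of the head
+ * file being joined to. */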
+struct mdsea_cb_data {
+ struct llog_handle *mc_llh;
+ struct lov_mds_md *mc_lmm;
+ struct lov_mds_md_join *mc_lmm_join;
+ __u64 mc_offset;
+ __u64 mc_headfile_sz;
+};
+
+static int mdsea_iterate(struct llog_handle *llh_tail, llog_cb_t cb,
+ void *cbdata)
+{
+ return llog_process(llh_tail, cb, cbdata, NULL);
+}
+
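+/* Write one extent descriptor (the [start, start + len) range plus a
+ * copy of its lmm) into the array llog as a LLOG_JOIN_REC; if a joined
+ * lmm is being built, account for the new stripes and extent. */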
+static int mds_insert_join_lmm(struct llog_handle *llh,
+ struct lov_mds_md *lmm,
+ __u64 start, __u64 len,
+ struct lov_mds_md_join *lmmj)
+{
+ struct llog_rec_hdr rec;
+ struct mds_extent_desc *med;
+ int sz_med, rc;
+ ENTRY;
+
+ sz_med = lov_mds_md_size(le32_to_cpu(lmm->lmm_stripe_count));
+ sz_med += 2 * sizeof(__u64);
+ sz_med = size_round(sz_med);
+
+ rec.lrh_len = cpu_to_le32(sz_med);
+ rec.lrh_type = cpu_to_le32(LLOG_JOIN_REC);
+
+ CDEBUG(D_INFO, "insert extent "LPU64":"LPU64" lmm \n", start, len);
+
+ OBD_ALLOC(med, sz_med);
+ if (med == NULL)
+ RETURN(-ENOMEM);
+
+ med->med_start = start;
+ med->med_len = len;
+ memcpy(&med->med_lmm, lmm,
+ lov_mds_md_size(le32_to_cpu(lmm->lmm_stripe_count)));
+
+ rc = llog_write_rec(llh, &rec, NULL, 0, med, -1);
+ OBD_FREE(med, sz_med);
+
+ if (lmmj) {
+ /* account the new stripes/extent in the joined lmm */
+ lmmj->lmmj_md.lmm_stripe_count += lmm->lmm_stripe_count;
+ lmmj->lmmj_extent_count++;
+ }
+
+ RETURN(rc);
+}
+
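+/* llog_process() callback for the tail file's array log: re-insert
+ * each extent into the head file's log, shifted up by the head file
+ * size, then return LLOG_DEL_RECORD so the source record is dropped. */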
+static int mdsea_append_extent(struct llog_handle *llh_tail,
+ struct llog_rec_hdr *rec_in_tail,
+ struct mdsea_cb_data *cbdata)
+{
+ struct mds_extent_desc *med =
+ &((struct llog_array_rec *)rec_in_tail)->lmr_med;
+ int rc;
+ ENTRY;
+
+ CDEBUG(D_INODE, "insert lmm extent: "LPU64":"LPU64" \n",
+ med->med_start, med->med_len);
+ rc = mds_insert_join_lmm(cbdata->mc_llh, &med->med_lmm,
+ med->med_start + cbdata->mc_headfile_sz,
+ med->med_len, cbdata->mc_lmm_join);
+ if (rc) {
+ CERROR("error %d insert the lmm \n", rc);
+ RETURN(rc);
+ }
+ RETURN(LLOG_DEL_RECORD);
+}
+
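+/* Seed a joined lmm from a plain one: copy the object id/group,
+ * pattern and stripe size, zero the stripe and extent counts (they
+ * are accumulated per inserted extent) and record the array llog id. */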
+static void mds_init_stripe_join(struct lov_mds_md_join *lmmj,
+ struct lov_mds_md *lmm,
+ struct llog_logid *logid)
+{
+ lmmj->lmmj_md.lmm_magic = cpu_to_le32(LOV_MAGIC_JOIN);
+ lmmj->lmmj_md.lmm_object_id = lmm->lmm_object_id;
+ lmmj->lmmj_md.lmm_object_gr = lmm->lmm_object_gr;
+ lmmj->lmmj_md.lmm_pattern = lmm->lmm_pattern;
+ lmmj->lmmj_md.lmm_stripe_size = lmm->lmm_stripe_size;
+ lmmj->lmmj_md.lmm_stripe_count = 0;
+ lmmj->lmmj_extent_count = 0;
+ lmmj->lmmj_array_id = *logid;
+}
+
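+/* llog_process() callback for the head file's array log: assert the
+ * extents are contiguous and cancel any record starting past the head
+ * file size (or the open-ended one, med_len == -1), saving its lmm so
+ * mds_adjust_last_extent() can re-insert a trimmed copy. */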
+static int mdsea_cancel_last_extent(struct llog_handle *llh_tail,
+ struct llog_rec_hdr *rec_in_tail,
+ struct mdsea_cb_data *cbdata)
+{
+ struct mds_extent_desc *med =
+ &((struct llog_array_rec *)rec_in_tail)->lmr_med;
+
+ CDEBUG(D_INODE, "extent: "LPU64":"LPU64" \n", med->med_start,
+ med->med_len);
+
+ LASSERTF(cbdata->mc_offset == med->med_start,
+ "A hole in the extent "LPU64"--"LPU64"\n",
+ cbdata->mc_offset, med->med_start);
+
+ if (med->med_len != -1)
+ cbdata->mc_offset = med->med_start + med->med_len;
+
+ if (med->med_start > cbdata->mc_headfile_sz || (med->med_len == -1)) {
+ CDEBUG(D_INFO, "del rec offset"LPU64", head size "LPU64" \n",
+ med->med_start, cbdata->mc_headfile_sz);
+ if (!cbdata->mc_lmm) {
+ int stripe = le32_to_cpu(med->med_lmm.lmm_stripe_count);
+ OBD_ALLOC(cbdata->mc_lmm, lov_mds_md_size(stripe));
+ if (!cbdata->mc_lmm)
+ RETURN(-ENOMEM);
+ memcpy(cbdata->mc_lmm, &med->med_lmm,
+ lov_mds_md_size(stripe));
+ }
+ RETURN(LLOG_DEL_RECORD);
+ }
+ RETURN(0);
+}
+
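+/* Trim the head file's extent array to its actual size: cancel the
+ * trailing open-ended extent record and re-insert it so that it covers
+ * exactly [mc_offset, head_size). */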
+static int mds_adjust_last_extent(struct llog_handle *llh_head,
+ __u64 head_size)
+{
+ struct mdsea_cb_data *cbdata;
+ int rc;
+ ENTRY;
+
+ OBD_ALLOC_PTR(cbdata);
+
+ if (!cbdata)
+ RETURN(-ENOMEM);
+
+ cbdata->mc_headfile_sz = head_size;
+ /* find the last extent and cancel its record in the llog */
+ rc = mdsea_iterate(llh_head, (llog_cb_t)mdsea_cancel_last_extent,
+ cbdata);
+
+ if (rc) {
+ CERROR("can not find the last extent rc=%d\n", rc);
+ GOTO(exit, rc);
+ }
+
+ LASSERT(cbdata->mc_lmm);
+
+ CDEBUG(D_INODE, "insert lmm extent: "LPU64":"LPU64" \n",
+ cbdata->mc_offset, (head_size - cbdata->mc_offset));
+
+ rc = mds_insert_join_lmm(llh_head, cbdata->mc_lmm,
+ cbdata->mc_offset,
+ (head_size - cbdata->mc_offset),
+ NULL);
+ if (rc)
+ CERROR("error insert the lmm rc %d \n", rc);
+exit:
+ if (cbdata && cbdata->mc_lmm)
+ OBD_FREE(cbdata->mc_lmm,
+ lov_mds_md_size(le32_to_cpu(cbdata->mc_lmm->lmm_stripe_count)));
+ if (cbdata)
+ OBD_FREE_PTR(cbdata);
+
+ RETURN(rc);
+}
+
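+/* Grow the per-MDS max EA/cookie sizes if the joined EA or its unlink
+ * cookies no longer fit, advertise the new limits to the client via
+ * OBD_MD_FLMODEASIZE, and pack the head inode into the reply body. */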
+static void mds_finish_join(struct mds_obd *mds, struct ptlrpc_request *req,
+ struct inode *inode, struct lov_mds_md_join *lmmj)
+{
+ struct mds_body *body = (struct mds_body *)
+ lustre_msg_buf(req->rq_repmsg, 1, 0);
+ int max_cookiesize = lmmj->lmmj_md.lmm_stripe_count *
+ sizeof(struct llog_cookie);
+ int max_easize = sizeof(*lmmj);
+
+ CDEBUG(D_INFO, "change the max md size from %d to %d \n",
+ mds->mds_max_mdsize, sizeof(*lmmj));
+
+ if (mds->mds_max_mdsize < max_easize ||
+ mds->mds_max_cookiesize < max_cookiesize) {
+ body->max_mdsize = mds->mds_max_mdsize > max_easize ?
+ mds->mds_max_mdsize : max_easize;
+ mds->mds_max_mdsize = body->max_mdsize;
+ body->max_cookiesize = mds->mds_max_cookiesize > max_cookiesize?
+ mds->mds_max_cookiesize : max_cookiesize;
+ mds->mds_max_cookiesize = body->max_cookiesize;
+ body->valid |= OBD_MD_FLMODEASIZE;
+ }
+
+ if (body->valid & OBD_MD_FLMODEASIZE)
+ CDEBUG(D_HA, "updating max_mdsize/max_cookiesize: %d/%d\n",
+ mds->mds_max_mdsize, mds->mds_max_cookiesize);
+
+ mds_pack_inode2fid(&body->fid1, inode);
+ mds_pack_inode2body(body, inode);
+}
+
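+/* Lock both parents and both files, unlink the tail file from its
+ * parent, and hand its lmm back to the caller.  On success the lock
+ * on the head inode is returned in *lockh and the tail parent's lock
+ * is saved for the reply; the tail inode's lock is dropped here. */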
+static int mds_join_unlink_tail_inode(struct mds_update_record *rec,
+ struct ptlrpc_request *req,
+ struct mds_rec_join *join_rec,
+ struct lov_mds_md *tail_lmm,
+ int lmm_size, struct dentry *dchild,
+ void **handle,struct lustre_handle *lockh)
+{
+ struct mds_obd *mds = mds_req2mds(req);
+ struct obd_device *obd = req->rq_export->exp_obd;
+ struct inode *tail_inode, *head_inode;
+ struct dentry *de_tailparent = NULL, *de_tail = NULL, *de_head = NULL;
+ struct lustre_handle dlm_handles[4] = {{0}, {0}, {0}, {0}};
+ struct ll_fid head_fid;
+ int rc;
+ ENTRY;
+
+ if (lockh)
+ ldlm_lock_decref(lockh, LCK_EX);
+
+ head_inode = dchild->d_inode;
+ mdc_pack_fid(&head_fid, head_inode->i_ino, head_inode->i_generation,
+ head_inode->i_mode & S_IFMT);
+
+ rc = mds_get_parents_children_locked(obd, mds, &join_rec->jr_fid,
+ &de_tailparent, &head_fid,
+ &de_head, LCK_PW, rec->ur_name,
+ rec->ur_namelen, &de_tail,
+ NULL, 0, NULL, dlm_handles,
+ LCK_EX);
+ if (rc)
+ GOTO(cleanup, rc);
+
+ *lockh = dlm_handles[1];
+ LASSERT(de_tailparent);
+ tail_inode = de_tail->d_inode;
+ if (tail_inode == NULL) {
+ CERROR("tail inode doesn't exist(dir %lu,name %s)!\n",
+ de_tailparent? de_tailparent->d_inode->i_ino : 0,
+ rec->ur_name);
+ GOTO(cleanup, rc = -ENOENT);
+ }
+
+ if (!S_ISREG(tail_inode->i_mode)) {
+ CERROR("tail file is not a regular file (dir %lu, name %s)!\n",
+ de_tailparent ? de_tailparent->d_inode->i_ino : 0,
+ rec->ur_name);
+ GOTO(cleanup, rc = -EINVAL);
+ }
+
+ *handle = fsfilt_start(obd, head_inode, FSFILT_OP_JOIN, NULL);
+ if (IS_ERR(*handle)) {
+ rc = PTR_ERR(*handle);
+ GOTO(cleanup, rc);
+ }
+
+ rc = mds_get_md(obd, tail_inode, tail_lmm, &lmm_size, 1);
+ if (rc < 0) /* get md fails */
+ GOTO(cleanup, rc);
+
+ LASSERT(le32_to_cpu(tail_lmm->lmm_magic) == LOV_MAGIC_JOIN ||
+ le32_to_cpu(tail_lmm->lmm_magic) == LOV_MAGIC);
+
+ LASSERT(de_tailparent);
+ rc = vfs_unlink(de_tailparent->d_inode, de_tail);
+
+ if (rc == 0) {
+ CDEBUG(D_INODE, "delete the tail inode %lu/%u \n",
+ tail_inode->i_ino, tail_inode->i_generation);
+ }
+cleanup:
+ if (dlm_handles[2].cookie != 0)
+ ldlm_lock_decref(&dlm_handles[2], LCK_EX);
+
+ if (dlm_handles[0].cookie != 0) {
+ if (rc)
+ ldlm_lock_decref(&dlm_handles[0], LCK_PW);
+ else
+ ptlrpc_save_lock(req, &dlm_handles[0], LCK_PW);
+ }
+ if (de_tail)
+ l_dput(de_tail);
+
+ if (de_tailparent)
+ l_dput(de_tailparent);
+
+ if (de_head)
+ l_dput(de_head);
+
+ RETURN(rc);
+}
+
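+/* Join the tail file onto the end of the head file: unlink the tail,
+ * convert the head's EA to LOV_MAGIC_JOIN backed by an extent array
+ * llog (created or reopened as needed), merge in the tail's extents,
+ * and update the reply with the resulting EA/cookie sizes. */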
+int mds_join_file(struct mds_update_record *rec, struct ptlrpc_request *req,
+ struct dentry *de_head, struct lustre_handle *lockh)
+{
+ struct mds_obd *mds = mds_req2mds(req);
+ struct obd_device *obd = req->rq_export->exp_obd;
+ struct inode *head_inode = NULL;
+ struct lvfs_run_ctxt saved;
+ void *handle = NULL;
+ struct lov_mds_md *head_lmm, *tail_lmm;
+ struct lov_mds_md_join *head_lmmj = NULL, *tail_lmmj = NULL;
+ int lmm_size, rc = 0, cleanup_phase = 0, size;
+ struct llog_handle *llh_head = NULL, *llh_tail = NULL;
+ struct llog_ctxt *ctxt;
+ struct mds_rec_join *join_rec;
+ ENTRY;
+
+ join_rec = lustre_swab_reqbuf(req, 5, sizeof(*join_rec),
+ lustre_swab_mds_rec_join);
+ if (join_rec == NULL)
+ RETURN(-EFAULT);
+
+ DEBUG_REQ(D_INODE, req,"head "LPU64"/%u, ptail ino "LPU64"/%u, tail %s",
+ rec->ur_fid1->id, rec->ur_fid1->generation,
+ join_rec->jr_fid.id, join_rec->jr_fid.generation,
+ rec->ur_name);
+
+ size = mds->mds_max_mdsize;
+ lmm_size = mds->mds_max_mdsize;
+ OBD_ALLOC(head_lmm, lmm_size);
+ OBD_ALLOC(tail_lmm, lmm_size);
+ if (!head_lmm || !tail_lmm)
+ GOTO(cleanup, rc = -ENOMEM);
+
+ /* acquire head's dentry */
+ LASSERT(de_head);
+ head_inode = de_head->d_inode;
+ if (head_inode == NULL) {
+ CERROR("head inode doesn't exist!\n");
+ GOTO(cleanup, rc = -ENOENT);
+ }
+
+ /* unlink the tail inode and get its lmm back */
+ rc = mds_join_unlink_tail_inode(rec, req, join_rec, tail_lmm, lmm_size,
+ de_head, &handle, lockh);
+ if (rc) {
+ CERROR("unlink tail_inode error %d\n", rc);
+ GOTO(cleanup, rc);
+ }
+
+ down(&head_inode->i_sem);
+ cleanup_phase = 1;
+ rc = mds_get_md(obd, head_inode, head_lmm, &size, 0);
+ if (rc < 0)
+ GOTO(cleanup, rc);
+
+ LASSERTF(le32_to_cpu(head_lmm->lmm_magic) == LOV_MAGIC_JOIN ||
+ le32_to_cpu(head_lmm->lmm_magic) == LOV_MAGIC);
+
+ push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+ ctxt = llog_get_context(obd, LLOG_LOVEA_ORIG_CTXT);
+ cleanup_phase = 2;
+ if (le32_to_cpu(head_lmm->lmm_magic) == LOV_MAGIC) { /* plain file */
+ struct llog_logid *llog_array;
+
+ rc = llog_create(ctxt, &llh_head, NULL, NULL);
+ if (rc) {
+ CERROR("cannot create new log, error = %d\n", rc);
+ GOTO(cleanup, rc);
+ }
+ cleanup_phase = 3;
+ llog_array = &llh_head->lgh_id;
+ CDEBUG(D_INFO,"create arrary for %lu with id "LPU64":"LPU64"\n",
+ head_inode->i_ino, llog_array->lgl_oid,
+ llog_array->lgl_ogr);
+ rc = llog_init_handle(llh_head, LLOG_F_IS_PLAIN, NULL);
+ if (rc)
+ GOTO(cleanup, rc);
+ OBD_ALLOC_PTR(head_lmmj);
+ if (head_lmmj == NULL)
+ GOTO(cleanup, rc = -ENOMEM);
+ mds_init_stripe_join(head_lmmj, head_lmm, llog_array);
+ mds_insert_join_lmm(llh_head, head_lmm, 0, join_rec->jr_headsize,
+ head_lmmj);
+ } else { /* head lmm is already a join file */
+ head_lmmj = (struct lov_mds_md_join *)head_lmm;
+ /* construct and fill extent llog object */
+ rc = llog_create(ctxt, &llh_head,
+ &head_lmmj->lmmj_array_id, NULL);
+ if (rc) {
+ CERROR("cannot open existing log, error = %d\n", rc);
+ GOTO(cleanup, rc);
+ }
+ cleanup_phase = 3;
+ rc = llog_init_handle(llh_head, LLOG_F_IS_PLAIN, NULL);
+ if (rc)
+ GOTO(cleanup, rc);
+ rc = mds_adjust_last_extent(llh_head, join_rec->jr_headsize);
+ if (rc) {
+ CERROR("can't adjust last extent of obj rc=%d\n", rc);
+ GOTO(cleanup, rc);
+ }
+ }
+
+ if (le32_to_cpu(tail_lmm->lmm_magic) != LOV_MAGIC_JOIN) {
+ mds_insert_join_lmm(llh_head, tail_lmm, join_rec->jr_headsize,
+ -1, head_lmmj);
+ } else {
+ struct mdsea_cb_data cbdata;
+ tail_lmmj = (struct lov_mds_md_join *)tail_lmm;
+
+ rc = llog_create(ctxt,&llh_tail,&tail_lmmj->lmmj_array_id,NULL);
+ if (rc) {
+ CERROR("cannot open existing log, error = %d\n", rc);
+ GOTO(cleanup, rc);
+ }
+ rc = llog_init_handle(llh_tail, LLOG_F_IS_PLAIN, NULL);
+ if (rc) {
+ llog_close(llh_tail);
+ GOTO(cleanup, rc);
+ }
+ cbdata.mc_llh = llh_head;
+ cbdata.mc_headfile_sz = join_rec->jr_headsize;
+ cbdata.mc_lmm_join = head_lmmj;
+ rc = mdsea_iterate(llh_tail, (llog_cb_t)mdsea_append_extent,
+ &cbdata);
+ if (rc) {
+ llog_close(llh_tail);
+ CERROR("can not append extent log error %d \n", rc);
+ GOTO(cleanup, rc);
+ }
+ rc = llog_destroy(llh_tail);
+ if (rc) {
+ llog_close(llh_tail);
+ CERROR("can not destroy log error %d \n", rc);
+ GOTO(cleanup, rc);
+ }
+ llog_free_handle(llh_tail);
+ }
+ LASSERT(head_inode);
+ CDEBUG(D_INODE, "join finish, set lmm V2 to inode %lu \n",
+ head_inode->i_ino);
+ fsfilt_set_md(obd, head_inode, handle, head_lmmj,
+ sizeof(struct lov_mds_md_join));
+ mds_finish_join(mds, req, head_inode, head_lmmj);
+cleanup:
+ rc = mds_finish_transno(mds, head_inode, handle, req, rc, 0);
+ switch (cleanup_phase) {
+ case 3:
+ llog_close(llh_head);
+ case 2:
+ if (head_lmmj && ((void*)head_lmmj != (void*)head_lmm))
+ OBD_FREE_PTR(head_lmmj);
+
+ pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+ case 1:
+ up(&head_inode->i_sem);
+ case 0:
+ if (tail_lmm != NULL)
+ OBD_FREE(tail_lmm, lmm_size);
+ if (head_lmm != NULL)
+ OBD_FREE(head_lmm, lmm_size);
+ break;
+ default:
+ CERROR("invalid cleanup_phase %d\n", cleanup_phase);
+ LBUG();
+ }
+ req->rq_status = rc;
+ RETURN(rc);
+}
+
LASSERT(body != NULL); /* previously verified & swabbed by caller */
#if CRAY_XT3
- ucred->luc_fsuid = req->rq_uid;
- if (req->rq_uid == 0) /* allow root to keep capabilities, bug 7305 */
- ucred->luc_cap = body->capability;
-#else
- ucred->luc_fsuid = body->fsuid;
- ucred->luc_fsgid = body->fsgid;
- ucred->luc_cap = body->capability;
+ if (req->rq_uid != LNET_UID_ANY) {
+ /* Non-root local cluster client */
+ LASSERT(req->rq_uid != 0);
+ ucred->luc_fsuid = req->rq_uid;
+ } else
#endif
+ {
+ ucred->luc_fsuid = body->fsuid;
+ ucred->luc_fsgid = body->fsgid;
+ ucred->luc_cap = body->capability;
+ }
ucred->luc_uce = upcall_cache_get_entry(mds->mds_group_hash,
ucred->luc_fsuid,
#include "mds_internal.h"
-/* callback function of lov to fill unlink log record */
-static int mds_log_fill_unlink_rec(struct llog_rec_hdr *rec, void *data)
-{
- struct llog_fill_rec_data *lfd = (struct llog_fill_rec_data *)data;
- struct llog_unlink_rec *lur = (struct llog_unlink_rec *)rec;
-
- lur->lur_oid = lfd->lfd_id;
- lur->lur_ogen = lfd->lfd_ogen;
-
- RETURN(0);
-}
-
-/* callback function of lov to fill setattr log record */
-static int mds_log_fill_setattr_rec(struct llog_rec_hdr *rec, void *data)
-{
- struct llog_fill_rec_data *lfd = (struct llog_fill_rec_data *)data;
- struct llog_setattr_rec *lsr = (struct llog_setattr_rec *)rec;
-
- lsr->lsr_oid = lfd->lfd_id;
- lsr->lsr_ogen = lfd->lfd_ogen;
-
- RETURN(0);
-}
-
static int mds_llog_origin_add(struct llog_ctxt *ctxt,
struct llog_rec_hdr *rec, struct lov_stripe_md *lsm,
- struct llog_cookie *logcookies, int numcookies,
- llog_fill_rec_cb_t fill_cb)
+ struct llog_cookie *logcookies, int numcookies)
{
struct obd_device *obd = ctxt->loc_obd;
struct obd_device *lov_obd = obd->u.mds.mds_osc_obd;
ENTRY;
lctxt = llog_get_context(lov_obd, ctxt->loc_idx);
- rc = llog_add(lctxt, rec, lsm, logcookies, numcookies, fill_cb);
+ rc = llog_add(lctxt, rec, lsm, logcookies, numcookies);
RETURN(rc);
}
{
struct mds_obd *mds = &obd->u.mds;
struct lov_stripe_md *lsm = NULL;
- struct llog_ctxt *ctxt;
struct llog_unlink_rec *lur;
+ struct llog_ctxt *ctxt;
int rc;
ENTRY;
if (IS_ERR(mds->mds_osc_obd))
RETURN(PTR_ERR(mds->mds_osc_obd));
- rc = obd_unpackmd(mds->mds_osc_exp, &lsm,
- lmm, lmm_size);
+ rc = obd_unpackmd(mds->mds_osc_exp, &lsm, lmm, lmm_size);
if (rc < 0)
RETURN(rc);
-
+ rc = obd_checkmd(mds->mds_osc_exp, obd->obd_self_export, lsm);
+ if (rc)
+ GOTO(out, rc);
/* first prepare unlink log record */
OBD_ALLOC(lur, sizeof(*lur));
if (!lur)
- RETURN(-ENOMEM);
+ GOTO(out, rc = -ENOMEM);
lur->lur_hdr.lrh_len = lur->lur_tail.lrt_len = sizeof(*lur);
lur->lur_hdr.lrh_type = MDS_UNLINK_REC;
ctxt = llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT);
rc = llog_add(ctxt, &lur->lur_hdr, lsm, logcookies,
- cookies_size / sizeof(struct llog_cookie),
- mds_log_fill_unlink_rec);
+ cookies_size / sizeof(struct llog_cookie));
- obd_free_memmd(mds->mds_osc_exp, &lsm);
OBD_FREE(lur, sizeof(*lur));
-
+out:
+ obd_free_memmd(mds->mds_osc_exp, &lsm);
RETURN(rc);
}
{
struct mds_obd *mds = &obd->u.mds;
struct lov_stripe_md *lsm = NULL;
- struct llog_ctxt *ctxt;
struct llog_setattr_rec *lsr;
+ struct llog_ctxt *ctxt;
int rc;
ENTRY;
if (rc < 0)
RETURN(rc);
+ rc = obd_checkmd(mds->mds_osc_exp, obd->obd_self_export, lsm);
+ if (rc)
+ GOTO(out, rc);
+
OBD_ALLOC(lsr, sizeof(*lsr));
if (!lsr)
GOTO(out, rc = -ENOMEM);
/* write setattr log */
ctxt = llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT);
rc = llog_add(ctxt, &lsr->lsr_hdr, lsm, logcookies,
- cookies_size / sizeof(struct llog_cookie),
- mds_log_fill_setattr_rec);
+ cookies_size / sizeof(struct llog_cookie));
OBD_FREE(lsr, sizeof(*lsr));
out:
RETURN(rc);
}
-int mds_lov_clearorphans(struct mds_obd *mds, struct obd_uuid *ost_uuid)
+int mds_lov_clear_orphans(struct mds_obd *mds, struct obd_uuid *ost_uuid)
{
int rc;
struct obdo oa;
{
struct mds_obd *mds = &obd->u.mds;
struct lov_desc *ld;
- __u32 valsize = sizeof(mds->mds_lov_desc);
- int rc = 0, i, size;
+ __u32 size, stripes, valsize = sizeof(mds->mds_lov_desc);
+ int rc = 0;
ENTRY;
OBD_ALLOC(ld, sizeof(*ld));
size = 1;
while (size < ld->ld_tgt_count)
size = size << 1;
- CERROR("Next size=%d\n", size);
size = size * sizeof(obd_id);
OBD_ALLOC(ids, size);
mds->mds_lov_objids_size = size;
}
+ /* Don't change the mds_lov_desc until the objids size matches the
+ count (paranoia) */
mds->mds_lov_desc = *ld;
+
+ CDEBUG(D_HA, "updated lov_desc, tgt_count: %d\n",
+ mds->mds_lov_desc.ld_tgt_count);
+
+ stripes = min(mds->mds_lov_desc.ld_tgt_count,
+ (__u32)LOV_MAX_STRIPE_COUNT);
+
+ mds->mds_max_mdsize = lov_mds_md_size(stripes);
+ mds->mds_max_cookiesize = stripes * sizeof(struct llog_cookie);
+
+ CDEBUG(D_HA, "updated max_mdsize/max_cookiesize: %d/%d\n",
+ mds->mds_max_mdsize, mds->mds_max_cookiesize);
- i = lov_mds_md_size(mds->mds_lov_desc.ld_tgt_count);
- if (i > mds->mds_max_mdsize)
- mds->mds_max_mdsize = i;
- mds->mds_max_cookiesize = mds->mds_lov_desc.ld_tgt_count *
- sizeof(struct llog_cookie);
out:
OBD_FREE(ld, sizeof(*ld));
RETURN(rc);
RETURN(rc);
}
+/* update the LOV-OSC knowledge of the last used object id's */
int mds_lov_connect(struct obd_device *obd, char * lov_name)
{
struct mds_obd *mds = &obd->u.mds;
case OBD_IOC_SYNC: {
CDEBUG(D_HA, "syncing mds %s\n", obd->obd_name);
- rc = fsfilt_sync(obd, obd->u.mds.mds_sb);
+ rc = fsfilt_sync(obd, obd->u.obt.obt_sb);
RETURN(rc);
}
case OBD_IOC_SET_READONLY: {
void *handle;
- struct inode *inode = obd->u.mds.mds_sb->s_root->d_inode;
+ struct inode *inode = obd->u.obt.obt_sb->s_root->d_inode;
BDEVNAME_DECLARE_STORAGE(tmp);
CERROR("*** setting device %s read-only ***\n",
- ll_bdevname(obd->u.mds.mds_sb, tmp));
+ ll_bdevname(obd->u.obt.obt_sb, tmp));
handle = fsfilt_start(obd, inode, FSFILT_OP_MKNOD, NULL);
if (!IS_ERR(handle))
rc = fsfilt_commit(obd, inode, handle, 1);
CDEBUG(D_HA, "syncing mds %s\n", obd->obd_name);
- rc = fsfilt_sync(obd, obd->u.mds.mds_sb);
+ rc = fsfilt_sync(obd, obd->u.obt.obt_sb);
- lvfs_set_rdonly(lvfs_sbdev(obd->u.mds.mds_sb));
+ lvfs_set_rdonly(lvfs_sbdev(obd->u.obt.obt_sb));
RETURN(0);
}
if (obd->obd_stopping)
GOTO(out, rc = -ENODEV);
- rc = mds_lov_clearorphans(mds, uuid);
+ rc = mds_lov_clear_orphans(mds, uuid);
if (rc != 0) {
- CERROR("%s: failed at mds_lov_clearorphans: %d\n",
+ CERROR("%s: failed at mds_lov_clear_orphans: %d\n",
obd->obd_name, rc);
GOTO(out, rc);
}
+ EXIT;
out:
class_decref(obd);
- RETURN(rc);
+ return rc;
}
int mds_lov_synchronize(void *data)
if (rc < 0) {
CERROR("%s: error starting mds_lov_synchronize: %d\n",
obd->obd_name, rc);
- class_export_put(obd->obd_self_export);
+ class_decref(obd);
} else {
CDEBUG(D_HA, "%s: mds_lov_synchronize thread: %d\n",
obd->obd_name, rc);
}
if (obd->obd_recovering) {
+ /* in the case OBD is in recovery we do not reinit desc and
+ * easize, as that will be done in mds_lov_connect() after
+ * recovery is finished. */
CWARN("MDS %s: in recovery, not resetting orphans on %s\n",
obd->obd_name,
watched->u.cli.cl_import->imp_target_uuid.uuid);
LASSERT(llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT) != NULL);
rc = mds_lov_start_synchronize(obd, watched, data,
!(ev == OBD_NOTIFY_SYNC));
+ lquota_recovery(quota_interface, obd);
+
RETURN(rc);
}
int rc, err;
ENTRY;
- if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC)
+ if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC ||
+ le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_JOIN)
RETURN(0);
CDEBUG(D_INODE, "converting LOV EA on %lu/%u from %#08x to %#08x\n",
inode->i_ino, inode->i_generation, le32_to_cpu(lmm->lmm_magic),
LOV_MAGIC);
+
rc = obd_unpackmd(obd->u.mds.mds_osc_exp, &lsm, lmm, lmm_size);
if (rc < 0)
GOTO(conv_end, rc);
static int mds_create_objects(struct ptlrpc_request *req, int offset,
struct mds_update_record *rec,
struct mds_obd *mds, struct obd_device *obd,
- struct dentry *dchild, void **handle,
+ struct dentry *dchild, void **handle,
obd_id **ids)
{
- struct obdo *oa;
+ struct inode *inode = dchild->d_inode;
struct obd_trans_info oti = { 0 };
- struct mds_body *body;
struct lov_stripe_md *lsm = NULL;
struct lov_mds_md *lmm = NULL;
- struct inode *inode = dchild->d_inode;
- void *lmm_buf;
int rc, lmm_bufsize, lmm_size;
+ struct mds_body *body;
+ struct obdo *oa;
+ void *lmm_buf;
ENTRY;
+ if (!S_ISREG(inode->i_mode))
+ RETURN(0);
if (rec->ur_flags & MDS_OPEN_DELAY_CREATE ||
!(rec->ur_flags & FMODE_WRITE))
RETURN(0);
body = lustre_msg_buf(req->rq_repmsg, 1, sizeof(*body));
- if (!S_ISREG(inode->i_mode))
- RETURN(0);
if (body->valid & OBD_MD_FLEASIZE)
RETURN(0);
OBD_ALLOC(*ids, mds->mds_lov_desc.ld_tgt_count * sizeof(**ids));
if (*ids == NULL)
RETURN(-ENOMEM);
+ oti_init(&oti, req);
oti.oti_objid = *ids;
- oti.oti_thread = req->rq_svc_thread;
/* replay case */
if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) {
oa = obdo_alloc();
if (oa == NULL)
GOTO(out_ids, rc = -ENOMEM);
- oa->o_mode = S_IFREG | 0600;
- oa->o_id = inode->i_ino;
- oa->o_generation = inode->i_generation;
oa->o_uid = 0; /* must have 0 uid / gid on OST */
oa->o_gid = 0;
- oa->o_valid = OBD_MD_FLID | OBD_MD_FLGENER | OBD_MD_FLTYPE |
+ oa->o_mode = S_IFREG | 0600;
+ oa->o_id = inode->i_ino;
+ oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLFLAGS |
OBD_MD_FLMODE | OBD_MD_FLUID | OBD_MD_FLGID;
oa->o_size = 0;
- obdo_from_inode(oa, inode, OBD_MD_FLTYPE|OBD_MD_FLATIME|OBD_MD_FLMTIME|
- OBD_MD_FLCTIME);
+ obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
+ OBD_MD_FLMTIME | OBD_MD_FLCTIME);
if (!(rec->ur_flags & MDS_OPEN_HAS_OBJS)) {
/* check if things like lfs setstripe are sending us the ea */
}
if (inode->i_size) {
oa->o_size = inode->i_size;
- obdo_from_inode(oa, inode, OBD_MD_FLTYPE|OBD_MD_FLATIME|
- OBD_MD_FLMTIME| OBD_MD_FLCTIME| OBD_MD_FLSIZE);
+ obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
+ OBD_MD_FLMTIME | OBD_MD_FLCTIME | OBD_MD_FLSIZE);
+
+ /* pack lustre id to OST */
+ oa->o_fid = body->fid1.id;
+ oa->o_generation = body->fid1.generation;
+ oa->o_valid |= OBD_MD_FLFID | OBD_MD_FLGENER;
+
rc = obd_setattr(mds->mds_osc_exp, oa, lsm, &oti);
if (rc) {
CERROR("error setting attrs for inode %lu: rc %d\n",
LASSERT(lsm && lsm->lsm_object_id);
lmm = NULL;
rc = obd_packmd(mds->mds_osc_exp, &lmm, lsm);
- LASSERT(rc >= 0);
+ if (rc < 0) {
+ CERROR("cannot pack lsm, err = %d\n", rc);
+ GOTO(out_oa, rc);
+ }
lmm_size = rc;
body->eadatasize = rc;
OBD_MD_FLATIME | OBD_MD_FLMTIME);
}
+ if (!(rec->ur_flags & MDS_OPEN_JOIN_FILE))
+ lustre_shrink_reply(req, 2, body->eadatasize, 0);
+
+ if (req->rq_export->exp_connect_flags & OBD_CONNECT_ACL &&
+ !(rec->ur_flags & MDS_OPEN_JOIN_FILE)) {
+ int acl_off = body->eadatasize ? 3 : 2;
+
+ rc = mds_pack_acl(med, dchild->d_inode, req->rq_repmsg,
+ body, acl_off);
+ lustre_shrink_reply(req, acl_off, body->aclsize, 0);
+ if (!req->rq_status && rc)
+ req->rq_status = rc;
+ }
+
/* If we have -EEXIST as the status, and we were asked to create
* exclusively, we can tell we failed because the file already existed.
*/
* Now that exp_outstanding_reply is a list, it's just using mfd != NULL
* to detect a re-open */
if (mfd == NULL) {
+ if (rec->ur_flags & MDS_OPEN_JOIN_FILE) {
+ rc = mds_join_file(rec, req, dchild, NULL);
+ if (rc)
+ GOTO(out_dput, rc);
+ }
mntget(mds->mds_vfsmnt);
CERROR("Re-opened file \n");
mfd = mds_dentry_open(dchild, mds->mds_vfsmnt,
/* Handles object creation, actual opening, and I/O epoch */
static int mds_finish_open(struct ptlrpc_request *req, struct dentry *dchild,
struct mds_body *body, int flags, void **handle,
- struct mds_update_record *rec,struct ldlm_reply *rep)
+ struct mds_update_record *rec,struct ldlm_reply *rep,
+ struct lustre_handle *lockh)
{
struct mds_obd *mds = mds_req2mds(req);
struct obd_device *obd = req->rq_export->exp_obd;
/* atomically create objects if necessary */
down(&dchild->d_inode->i_sem);
+
if (S_ISREG(dchild->d_inode->i_mode) &&
!(body->valid & OBD_MD_FLEASIZE)) {
rc = mds_pack_md(obd, req->rq_repmsg, 2, body,
up(&dchild->d_inode->i_sem);
RETURN(-EEXIST);
}
-
- if (!(body->valid & OBD_MD_FLEASIZE)) {
+ if (rec->ur_flags & MDS_OPEN_JOIN_FILE) {
+ up(&dchild->d_inode->i_sem);
+ rc = mds_join_file(rec, req, dchild, lockh);
+ if (rc)
+ RETURN(rc);
+ down(&dchild->d_inode->i_sem);
+ }
+ if (!(body->valid & OBD_MD_FLEASIZE) &&
+ !(body->valid & OBD_MD_FLMODEASIZE)) {
/* no EA: create objects */
rc = mds_create_objects(req, 2, rec, mds, obd,
dchild, handle, &ids);
}
up(&dchild->d_inode->i_sem);
+ if (!(rec->ur_flags & MDS_OPEN_JOIN_FILE))
+ lustre_shrink_reply(req, 2, body->eadatasize, 0);
+
+ if (req->rq_export->exp_connect_flags & OBD_CONNECT_ACL &&
+ !(rec->ur_flags & MDS_OPEN_JOIN_FILE)) {
+ int acl_off = body->eadatasize ? 3 : 2;
+
+ rc = mds_pack_acl(&req->rq_export->exp_mds_data,
+ dchild->d_inode, req->rq_repmsg,
+ body, acl_off);
+ lustre_shrink_reply(req, acl_off, body->aclsize, 0);
+ if (rc)
+ RETURN(rc);
+ }
+
intent_set_disposition(rep, DISP_OPEN_OPEN);
mfd = mds_dentry_open(dchild, mds->mds_vfsmnt, flags, req);
if (IS_ERR(mfd))
intent_set_disposition(rep, DISP_LOOKUP_POS);
open:
- rc = mds_finish_open(req, dchild, body, flags, &handle, rec, rep);
+ rc = mds_finish_open(req, dchild, body, flags, &handle, rec, rep,
+ NULL);
rc = mds_finish_transno(mds, dchild ? dchild->d_inode : NULL, handle,
req, rc, rep ? rep->lock_policy_res1 : 0);
/* XXX what do we do here if mds_finish_transno itself failed? */
RETURN(rc);
}
-int mds_pin(struct ptlrpc_request *req)
+int mds_pin(struct ptlrpc_request *req, int offset)
{
struct obd_device *obd = req->rq_export->exp_obd;
struct mds_body *request_body, *reply_body;
int rc, size = sizeof(*reply_body);
ENTRY;
- request_body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof(*request_body));
+ request_body = lustre_msg_buf(req->rq_reqmsg, offset,
+ sizeof(*request_body));
rc = lustre_pack_reply(req, 1, &size, NULL);
if (rc)
RETURN(rc);
}
-/* Get a lock on the ino to sync with creation WRT inode reuse (bug 2029).
- * If child_lockh is NULL we just get the lock as a barrier to wait for
- * other holders of this lock, and drop it right away again. */
+/* Get an internal lock on the inode number (but not generation) to sync
+ * new inode creation with inode unlink (bug 2029). If child_lockh is NULL
+ * we just get the lock as a barrier to wait for other holders of this lock,
+ * and drop it right away again. */
int mds_lock_new_child(struct obd_device *obd, struct inode *inode,
struct lustre_handle *child_lockh)
{
struct mds_export_data *med;
struct lustre_handle parent_lockh;
int rc = 0, cleanup_phase = 0, acc_mode, created = 0;
- int parent_mode = LCK_PR;
+ int parent_mode = LCK_CR;
void *handle = NULL;
struct dentry_params dp;
- uid_t parent_uid = 0;
- gid_t parent_gid = 0;
+ unsigned int qcids[MAXQUOTAS] = {current->fsuid, current->fsgid};
+ unsigned int qpids[MAXQUOTAS] = {0, 0};
ENTRY;
+ CLASSERT(MAXQUOTAS < 4);
if (offset == 2) { /* intent */
rep = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*rep));
body = lustre_msg_buf(req->rq_repmsg, 1, sizeof (*body));
- } else if (offset == 0) { /* non-intent reint */
+ } else if (offset == MDS_REQ_REC_OFF) { /* non-intent reint */
body = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*body));
} else {
body = NULL;
/* Step 0: If we are passed a fid, then we assume the client already
* opened this file and is only replaying the RPC, so we open the
* inode by fid (at some large expense in security). */
- if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) {
+ if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY &&
+ !(rec->ur_flags & MDS_OPEN_JOIN_FILE)) {
if (rec->ur_fid2->id == 0) {
struct ldlm_lock *lock = ldlm_handle2lock(child_lockh);
if (lock) {
}
/* Step 1: Find and lock the parent */
- if (rec->ur_flags & MDS_OPEN_CREAT)
- parent_mode = LCK_PW;
+ if (rec->ur_flags & (MDS_OPEN_CREAT | MDS_OPEN_JOIN_FILE))
+ parent_mode = LCK_EX;
dparent = mds_fid2locked_dentry(obd, rec->ur_fid1, NULL, parent_mode,
&parent_lockh, rec->ur_name,
- rec->ur_namelen - 1);
+ rec->ur_namelen - 1,
+ MDS_INODELOCK_UPDATE);
if (IS_ERR(dparent)) {
rc = PTR_ERR(dparent);
if (rc != -ENOENT) {
cleanup_phase = 1; /* parent dentry and lock */
+ if (rec->ur_flags & MDS_OPEN_JOIN_FILE) {
+ dchild = dget(dparent);
+ cleanup_phase = 2; /* child dentry */
+ acc_mode = accmode(dchild->d_inode, rec->ur_flags);
+ GOTO(found_child, rc);
+ }
+
/* Step 2: Lookup the child */
dchild = ll_lookup_one_len(rec->ur_name, dparent, rec->ur_namelen - 1);
if (IS_ERR(dchild)) {
acc_mode = accmode(dchild->d_inode, rec->ur_flags);
}
-
LASSERTF(!mds_inode_is_orphan(dchild->d_inode),
"dchild %.*s (%p) inode %p/%lu/%u\n", dchild->d_name.len,
dchild->d_name.name, dchild, dchild->d_inode,
dchild->d_inode->i_ino, dchild->d_inode->i_generation);
+found_child:
mds_pack_inode2fid(&body->fid1, dchild->d_inode);
mds_pack_inode2body(body, dchild->d_inode);
/* Step 5: mds_open it */
rc = mds_finish_open(req, dchild, body, rec->ur_flags, &handle, rec,
- rep);
+ rep, &parent_lockh);
GOTO(cleanup, rc);
cleanup:
} else if (created) {
mds_lock_new_child(obd, dchild->d_inode, NULL);
/* save uid/gid for quota acquire/release */
- parent_uid = dparent->d_inode->i_uid;
- parent_gid = dparent->d_inode->i_gid;
-
+ qpids[USRQUOTA] = dparent->d_inode->i_uid;
+ qpids[GRPQUOTA] = dparent->d_inode->i_gid;
}
l_dput(dchild);
case 1:
}
/* trigger dqacq on the owner of child and parent */
- mds_adjust_qunit(obd, current->fsuid, current->fsgid,
- parent_uid, parent_gid, rc);
+ lquota_adjust(quota_interface, obd, qcids, qpids, rc, FSFILT_OP_CREATE);
RETURN(rc);
}
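
The lquota_adjust() calls introduced here pass the uid/gid pairs as small arrays
indexed by quota type. A standalone userspace sketch of that convention (not
part of the patch; the values and the main() harness are illustrative only):

#include <stdio.h>

enum { USRQUOTA = 0, GRPQUOTA = 1, MAXQUOTAS = 2 };

int main(void)
{
        /* qcids describes the owner of the child inode, qpids the
         * parent whose quota may need adjusting after the operation */
        unsigned int qcids[MAXQUOTAS] = { 500, 500 };   /* e.g. fsuid/fsgid */
        unsigned int qpids[MAXQUOTAS] = { 0, 0 };       /* e.g. parent dir owner */

        printf("dqacq for %u/%u, dqrel for %u/%u\n",
               qcids[USRQUOTA], qcids[GRPQUOTA],
               qpids[USRQUOTA], qpids[GRPQUOTA]);
        return 0;
}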
* (it will not even _have_ an entry in last_rcvd anymore).
*
* Returns EAGAIN if the client needs to get more data and re-close. */
-int mds_mfd_close(struct ptlrpc_request *req, struct obd_device *obd,
+int mds_mfd_close(struct ptlrpc_request *req, int offset,struct obd_device *obd,
struct mds_file_data *mfd, int unlink_orphan)
{
struct inode *inode = mfd->mfd_dentry->d_inode;
ENTRY;
if (req && req->rq_reqmsg != NULL)
- request_body = lustre_msg_buf(req->rq_reqmsg, 0,
+ request_body = lustre_msg_buf(req->rq_reqmsg, offset,
sizeof(*request_body));
if (req && req->rq_repmsg != NULL)
reply_body = lustre_msg_buf(req->rq_repmsg, 0,
RETURN(rc);
}
-int mds_close(struct ptlrpc_request *req)
+int mds_close(struct ptlrpc_request *req, int offset)
{
struct mds_export_data *med = &req->rq_export->exp_mds_data;
struct obd_device *obd = req->rq_export->exp_obd;
if (rc) {
CERROR("lustre_pack_reply: rc = %d\n", rc);
req->rq_status = rc;
- /* Continue on to drop local open count even if we can't send the reply */
+ /* continue on to drop local open even if we can't send reply */
} else {
MDS_CHECK_RESENT(req, mds_reconstruct_generic(req));
}
- body = lustre_swab_reqbuf(req, 0, sizeof(*body), lustre_swab_mds_body);
+ CDEBUG(D_HA, "close req->rep_len %d mdsize %d cookiesize %d\n",
+ req->rq_replen,
+ obd->u.mds.mds_max_mdsize, obd->u.mds.mds_max_cookiesize);
+
+ body = lustre_swab_reqbuf(req, offset, sizeof(*body),
+ lustre_swab_mds_body);
if (body == NULL) {
CERROR("Can't unpack body\n");
req->rq_status = -EFAULT;
}
push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
- req->rq_status = mds_mfd_close(req, obd, mfd, 1);
+ req->rq_status = mds_mfd_close(req, offset, obd, mfd, 1);
pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+ mds_shrink_reply(obd, req, body);
if (OBD_FAIL_CHECK(OBD_FAIL_MDS_CLOSE_PACK)) {
CERROR("test case OBD_FAIL_MDS_CLOSE_PACK\n");
req->rq_status = -ENOMEM;
RETURN(rc);
}
-int mds_done_writing(struct ptlrpc_request *req)
+int mds_done_writing(struct ptlrpc_request *req, int offset)
{
struct mds_body *body;
int rc, size = sizeof(struct mds_body);
MDS_CHECK_RESENT(req, mds_reconstruct_generic(req));
- body = lustre_swab_reqbuf(req, 0, sizeof(*body), lustre_swab_mds_body);
+ body = lustre_swab_reqbuf(req, offset, sizeof(*body),
+ lustre_swab_mds_body);
if (body == NULL) {
CERROR("Can't unpack body\n");
req->rq_status = -EFAULT;
CDEBUG(D_HA, "cancelling %d cookies\n",
(int)(mlcd->mlcd_cookielen / sizeof(*mlcd->mlcd_cookies)));
- rc = obd_unpackmd(obd->u.mds.mds_osc_exp, &lsm, mlcd->mlcd_lmm,
+ rc = obd_unpackmd(obd->u.mds.mds_osc_exp, &lsm, mlcd->mlcd_lmm,
mlcd->mlcd_eadatalen);
if (rc < 0) {
CERROR("bad LSM cancelling %d log cookies: rc %d\n",
rc);
} else {
///* XXX 0 normally, SENDNOW for debug */);
+ rc = obd_checkmd(obd->u.mds.mds_osc_exp, obd->obd_self_export,
+ lsm);
+ if (rc)
+ CERROR("Can not revalidate lsm %p \n", lsm);
+
ctxt = llog_get_context(obd,mlcd->mlcd_cookies[0].lgc_subsys+1);
rc = llog_cancel(ctxt, lsm, mlcd->mlcd_cookielen /
sizeof(*mlcd->mlcd_cookies),
/* times */
if ((ia_valid & (ATTR_MTIME|ATTR_ATIME)) == (ATTR_MTIME|ATTR_ATIME)) {
- if (rec->ur_uc.luc_fsuid != inode->i_uid &&
+ if (current->fsuid != inode->i_uid &&
(error = ll_permission(inode, MAY_WRITE, NULL)) != 0)
RETURN(error);
}
int mds_osc_setattr_async(struct obd_device *obd, struct inode *inode,
struct lov_mds_md *lmm, int lmm_size,
- struct llog_cookie *logcookies)
+ struct llog_cookie *logcookies, struct ll_fid *fid)
{
struct mds_obd *mds = &obd->u.mds;
struct lov_stripe_md *lsm = NULL;
GOTO(out, rc);
}
+ rc = obd_checkmd(mds->mds_osc_exp, obd->obd_self_export, lsm);
+ if (rc) {
+ CERROR("Error revalidate lsm %p \n", lsm);
+ GOTO(out, rc);
+ }
+
/* then fill oa */
oa->o_id = lsm->lsm_object_id;
oa->o_uid = inode->i_uid;
oti.oti_logcookies = logcookies;
}
+ LASSERT(fid != NULL);
+ oa->o_fid = fid->id;
+ oa->o_generation = fid->generation;
+ oa->o_valid |= OBD_MD_FLFID | OBD_MD_FLGENER;
+
/* do setattr from mds to ost asynchronously */
rc = obd_setattr_async(mds->mds_osc_exp, oa, lsm, &oti);
if (rc)
CDEBUG(D_INODE, "mds to ost setattr objid 0x"LPX64
" on ost error %d\n", lsm->lsm_object_id, rc);
-
- obd_free_memmd(mds->mds_osc_exp, &lsm);
- out:
+out:
+ if (lsm)
+ obd_free_memmd(mds->mds_osc_exp, &lsm);
obdo_free(oa);
RETURN(rc);
}
struct ptlrpc_request *req,
struct lustre_handle *lh)
{
+ unsigned int ia_valid = rec->ur_iattr.ia_valid;
struct mds_obd *mds = mds_req2mds(req);
struct obd_device *obd = req->rq_export->exp_obd;
struct mds_body *body;
struct mds_logcancel_data *mlcd = NULL;
struct lov_mds_md *lmm = NULL;
struct llog_cookie *logcookies = NULL;
- int lmm_size = 0, need_lock = 1;
+ int lmm_size = 0, need_lock = 1, cookie_size = 0;
int rc = 0, cleanup_phase = 0, err, locked = 0;
- uid_t child_uid = 0;
- gid_t child_gid = 0;
+ unsigned int qcids[MAXQUOTAS] = {0, 0};
+ unsigned int qpids[MAXQUOTAS] = {rec->ur_iattr.ia_uid,
+ rec->ur_iattr.ia_gid};
ENTRY;
- LASSERT(offset == 0);
+ LASSERT(offset == MDS_REQ_REC_OFF);
DEBUG_REQ(D_INODE, req, "setattr "LPU64"/%u %x", rec->ur_fid1->id,
rec->ur_fid1->generation, rec->ur_iattr.ia_valid);
if (req->rq_export->exp_connect_flags & OBD_CONNECT_RDONLY)
GOTO(cleanup, rc = -EROFS);
} else {
- de = mds_fid2locked_dentry(obd, rec->ur_fid1, NULL, LCK_PW,
- &lockh, NULL, 0);
+ __u64 lockpart = MDS_INODELOCK_UPDATE;
+ if (rec->ur_iattr.ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID))
+ lockpart |= MDS_INODELOCK_LOOKUP;
+
+ de = mds_fid2locked_dentry(obd, rec->ur_fid1, NULL, LCK_EX,
+ &lockh, NULL, 0, lockpart);
if (IS_ERR(de))
GOTO(cleanup, rc = PTR_ERR(de));
locked = 1;
LASSERT(inode);
/* save uid/gid for quota acq/rel */
- child_uid = inode->i_uid;
- child_gid = inode->i_gid;
+ qcids[USRQUOTA] = inode->i_uid;
+ qcids[GRPQUOTA] = inode->i_gid;
if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) &&
rec->ur_eadata != NULL) {
} else if (rec->ur_iattr.ia_valid) { /* setattr */
rc = fsfilt_setattr(obd, de, handle, &rec->ur_iattr, 0);
/* journal chown/chgrp in llog, just like unlink */
- if (rc == 0 && S_ISREG(inode->i_mode) &&
- rec->ur_iattr.ia_valid & (ATTR_UID | ATTR_GID) && lmm_size){
- OBD_ALLOC(logcookies, mds->mds_max_cookiesize);
+ if (rc == 0 && lmm_size){
+ cookie_size = mds_get_cookie_size(obd, lmm);
+ OBD_ALLOC(logcookies, cookie_size);
if (logcookies == NULL)
GOTO(cleanup, rc = -ENOMEM);
if (mds_log_op_setattr(obd, inode, lmm, lmm_size,
- logcookies,
- mds->mds_max_cookiesize) <= 0) {
- OBD_FREE(logcookies, mds->mds_max_cookiesize);
+ logcookies, cookie_size) <= 0) {
+ OBD_FREE(logcookies, cookie_size);
logcookies = NULL;
}
}
mds_pack_inode2fid(&body->fid1, inode);
mds_pack_inode2body(body, inode);
- /* Don't return OST-specific attributes if we didn't just set them */
- if (rec->ur_iattr.ia_valid & ATTR_SIZE)
+ /* don't return OST-specific attributes if we didn't just set them. */
+ if (ia_valid & ATTR_SIZE)
body->valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
- if (rec->ur_iattr.ia_valid & (ATTR_MTIME | ATTR_MTIME_SET))
+ if (ia_valid & (ATTR_MTIME | ATTR_MTIME_SET))
body->valid |= OBD_MD_FLMTIME;
- if (rec->ur_iattr.ia_valid & (ATTR_ATIME | ATTR_ATIME_SET))
+ if (ia_valid & (ATTR_ATIME | ATTR_ATIME_SET))
body->valid |= OBD_MD_FLATIME;
if (rc == 0 && rec->ur_cookielen && !IS_ERR(mds->mds_osc_obd)) {
err = mds_finish_transno(mds, inode, handle, req, rc, 0);
/* do mds to ost setattr if needed */
if (!rc && !err && lmm_size)
- mds_osc_setattr_async(obd, inode, lmm, lmm_size, logcookies);
+ mds_osc_setattr_async(obd, inode, lmm, lmm_size,
+ logcookies, rec->ur_fid1);
switch (cleanup_phase) {
case 2:
OBD_FREE(lmm, mds->mds_max_mdsize);
if (logcookies)
- OBD_FREE(logcookies, mds->mds_max_cookiesize);
+ OBD_FREE(logcookies, cookie_size);
case 1:
if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) &&
rec->ur_eadata != NULL)
l_dput(de);
if (locked) {
if (rc) {
- ldlm_lock_decref(&lockh, LCK_PW);
+ ldlm_lock_decref(&lockh, LCK_EX);
} else {
- ptlrpc_save_lock (req, &lockh, LCK_PW);
+ ptlrpc_save_lock(req, &lockh, LCK_EX);
}
}
case 0:
req->rq_status = rc;
/* trigger dqrel/dqacq for original owner and new owner */
- if (rec->ur_iattr.ia_valid & (ATTR_UID | ATTR_GID)) {
- mds_adjust_qunit(obd, rec->ur_iattr.ia_uid,
- rec->ur_iattr.ia_gid, 0, 0, rc);
- mds_adjust_qunit(obd, child_uid, child_gid, 0, 0, rc);
- }
+ if (ia_valid & (ATTR_UID | ATTR_GID))
+ lquota_adjust(quota_interface, obd, qcids, qpids, rc,
+ FSFILT_OP_SETATTR);
+
return 0;
}
struct lustre_handle lockh;
int rc = 0, err, type = rec->ur_mode & S_IFMT, cleanup_phase = 0;
int created = 0;
- uid_t parent_uid = 0;
- gid_t parent_gid = 0;
+ unsigned int qcids[MAXQUOTAS] = {current->fsuid, current->fsgid};
+ unsigned int qpids[MAXQUOTAS] = {0, 0};
struct dentry_params dp;
ENTRY;
- LASSERT(offset == 0);
+ LASSERT(offset == MDS_REQ_REC_OFF);
LASSERT(!strcmp(req->rq_export->exp_obd->obd_type->typ_name,
LUSTRE_MDS_NAME));
if (OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_CREATE))
GOTO(cleanup, rc = -ESTALE);
- dparent = mds_fid2locked_dentry(obd, rec->ur_fid1, NULL, LCK_PW, &lockh,
- rec->ur_name, rec->ur_namelen - 1);
+ dparent = mds_fid2locked_dentry(obd, rec->ur_fid1, NULL, LCK_EX, &lockh,
+ rec->ur_name, rec->ur_namelen - 1,
+ MDS_INODELOCK_UPDATE);
if (IS_ERR(dparent)) {
rc = PTR_ERR(dparent);
if (rc != -ENOENT)
* See bug 2029 for more detail.*/
mds_lock_new_child(obd, dchild->d_inode, NULL);
/* save uid/gid of create inode and parent */
- parent_uid = dir->i_uid;
- parent_gid = dir->i_gid;
+ qpids[USRQUOTA] = dir->i_uid;
+ qpids[GRPQUOTA] = dir->i_gid;
} else {
rc = err;
}
l_dput(dchild);
case 1: /* locked parent dentry */
if (rc) {
- ldlm_lock_decref(&lockh, LCK_PW);
+ ldlm_lock_decref(&lockh, LCK_EX);
} else {
- ptlrpc_save_lock (req, &lockh, LCK_PW);
+ ptlrpc_save_lock(req, &lockh, LCK_EX);
}
l_dput(dparent);
case 0:
req->rq_status = rc;
/* trigger dqacq on the owner of child and parent */
- mds_adjust_qunit(obd, current->fsuid, current->fsgid,
- parent_uid, parent_gid, rc);
+ lquota_adjust(quota_interface, obd, qcids, qpids, rc, FSFILT_OP_CREATE);
return 0;
}
-int res_gt(struct ldlm_res_id *res1, struct ldlm_res_id *res2)
+int res_gt(struct ldlm_res_id *res1, struct ldlm_res_id *res2,
+ ldlm_policy_data_t *p1, ldlm_policy_data_t *p2)
{
int i;
if (res1->name[i] < res2->name[i])
return 0;
}
+ if (!p1 || !p2)
+ return 0;
+ if (memcmp(p1, p2, sizeof(*p1)) < 0)
+ return 1;
return 0;
}
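
res_gt() exists so callers can always take multiple locks in one canonical
order (by resource name, falling back to the policy bits on ties), which
prevents ABBA deadlocks. A standalone sketch of the same idea (not part of
the patch), with pthread mutexes ordered by address standing in for LDLM
resources:

#include <stdint.h>
#include <pthread.h>

static void lock_pair_ordered(pthread_mutex_t *a, pthread_mutex_t *b)
{
        if ((uintptr_t)a > (uintptr_t)b) {      /* impose one global order */
                pthread_mutex_t *t = a;
                a = b;
                b = t;
        }
        pthread_mutex_lock(a);                  /* lower resource first... */
        if (a != b)
                pthread_mutex_lock(b);          /* ...then the higher one */
}

int main(void)
{
        pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

        lock_pair_ordered(&m1, &m2);    /* same order from every caller */
        pthread_mutex_unlock(&m2);
        pthread_mutex_unlock(&m1);
        return 0;
}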
* no lock is taken for that res_id. Must be at least one non-zero res_id. */
int enqueue_ordered_locks(struct obd_device *obd, struct ldlm_res_id *p1_res_id,
struct lustre_handle *p1_lockh, int p1_lock_mode,
+ ldlm_policy_data_t *p1_policy,
struct ldlm_res_id *p2_res_id,
- struct lustre_handle *p2_lockh, int p2_lock_mode)
+ struct lustre_handle *p2_lockh, int p2_lock_mode,
+ ldlm_policy_data_t *p2_policy)
{
struct ldlm_res_id *res_id[2] = { p1_res_id, p2_res_id };
struct lustre_handle *handles[2] = { p1_lockh, p2_lockh };
int lock_modes[2] = { p1_lock_mode, p2_lock_mode };
- int flags = LDLM_FL_LOCAL_ONLY;
- int rc;
+ ldlm_policy_data_t *policies[2] = {p1_policy, p2_policy};
+ int rc, flags;
ENTRY;
LASSERT(p1_res_id != NULL && p2_res_id != NULL);
CDEBUG(D_INFO, "locks before: "LPU64"/"LPU64"\n",
res_id[0]->name[0], res_id[1]->name[0]);
- if (res_gt(p1_res_id, p2_res_id)) {
+ if (res_gt(p1_res_id, p2_res_id, p1_policy, p2_policy)) {
handles[1] = p1_lockh;
handles[0] = p2_lockh;
res_id[1] = p1_res_id;
res_id[0] = p2_res_id;
lock_modes[1] = p1_lock_mode;
lock_modes[0] = p2_lock_mode;
+ policies[1] = p1_policy;
+ policies[0] = p2_policy;
}
CDEBUG(D_DLMTRACE, "lock order: "LPU64"/"LPU64"\n",
res_id[0]->name[0], res_id[1]->name[0]);
+ flags = LDLM_FL_LOCAL_ONLY;
rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, *res_id[0],
- LDLM_PLAIN, NULL, lock_modes[0], &flags,
+ LDLM_IBITS, policies[0], lock_modes[0], &flags,
ldlm_blocking_ast, ldlm_completion_ast,
NULL, NULL, NULL, 0, NULL, handles[0]);
if (rc != ELDLM_OK)
RETURN(-EIO);
ldlm_lock_dump_handle(D_OTHER, handles[0]);
- if (memcmp(res_id[0], res_id[1], sizeof(*res_id[0])) == 0) {
+ if (memcmp(res_id[0], res_id[1], sizeof(*res_id[0])) == 0 &&
+ (policies[0]->l_inodebits.bits & policies[1]->l_inodebits.bits)) {
memcpy(handles[1], handles[0], sizeof(*(handles[1])));
ldlm_lock_addref(handles[1], lock_modes[1]);
} else if (res_id[1]->name[0] != 0) {
+ flags = LDLM_FL_LOCAL_ONLY;
rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace,
- *res_id[1], LDLM_PLAIN, NULL,
+ *res_id[1], LDLM_IBITS, policies[1],
lock_modes[1], &flags,
ldlm_blocking_ast, ldlm_completion_ast,
NULL, NULL, NULL, 0, NULL, handles[1]);
int enqueue_4ordered_locks(struct obd_device *obd,struct ldlm_res_id *p1_res_id,
struct lustre_handle *p1_lockh, int p1_lock_mode,
+ ldlm_policy_data_t *p1_policy,
struct ldlm_res_id *p2_res_id,
struct lustre_handle *p2_lockh, int p2_lock_mode,
+ ldlm_policy_data_t *p2_policy,
struct ldlm_res_id *c1_res_id,
struct lustre_handle *c1_lockh, int c1_lock_mode,
+ ldlm_policy_data_t *c1_policy,
struct ldlm_res_id *c2_res_id,
- struct lustre_handle *c2_lockh, int c2_lock_mode)
+ struct lustre_handle *c2_lockh, int c2_lock_mode,
+ ldlm_policy_data_t *c2_policy)
{
struct ldlm_res_id *res_id[5] = { p1_res_id, p2_res_id,
c1_res_id, c2_res_id };
c1_lockh, c2_lockh };
int lock_modes[5] = { p1_lock_mode, p2_lock_mode,
c1_lock_mode, c2_lock_mode };
+ ldlm_policy_data_t *policies[5] = {p1_policy, p2_policy,
+ c1_policy, c2_policy};
int rc, i, j, sorted, flags;
ENTRY;
dlm_handles[4] = dlm_handles[i];
res_id[4] = res_id[i];
lock_modes[4] = lock_modes[i];
+ policies[4] = policies[i];
sorted = 0;
do {
- if (res_gt(res_id[j], res_id[4])) {
+ if (res_gt(res_id[j], res_id[4], policies[j],
+ policies[4])) {
dlm_handles[j + 1] = dlm_handles[j];
res_id[j + 1] = res_id[j];
lock_modes[j + 1] = lock_modes[j];
+ policies[j + 1] = policies[j];
j--;
} else {
sorted = 1;
dlm_handles[j + 1] = dlm_handles[4];
res_id[j + 1] = res_id[4];
lock_modes[j + 1] = lock_modes[4];
+ policies[j + 1] = policies[4];
}
CDEBUG(D_DLMTRACE, "lock order: "LPU64"/"LPU64"/"LPU64"/"LPU64"\n",
if (res_id[i]->name[0] == 0)
break;
if (i != 0 &&
- memcmp(res_id[i], res_id[i-1], sizeof(*res_id[i])) == 0) {
+ memcmp(res_id[i], res_id[i-1], sizeof(*res_id[i])) == 0 &&
+ (policies[i]->l_inodebits.bits &
+ policies[i-1]->l_inodebits.bits)) {
memcpy(dlm_handles[i], dlm_handles[i-1],
sizeof(*(dlm_handles[i])));
ldlm_lock_addref(dlm_handles[i], lock_modes[i]);
} else {
rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace,
- *res_id[i], LDLM_PLAIN, NULL,
+ *res_id[i], LDLM_IBITS,
+ policies[i],
lock_modes[i], &flags,
ldlm_blocking_ast,
ldlm_completion_ast, NULL, NULL,
struct ldlm_res_id *child_res_id,
struct lustre_handle *child_lockh,
struct dentry **dchildp, int child_mode,
+ ldlm_policy_data_t *child_policy,
const char *name, int namelen,
struct ldlm_res_id *maxres)
{
child_res_id->name[0] = dchild->d_inode->i_ino;
child_res_id->name[1] = dchild->d_inode->i_generation;
- if (res_gt(parent_res_id, child_res_id) ||
- res_gt(maxres, child_res_id)) {
+ if (res_gt(parent_res_id, child_res_id, NULL, NULL) ||
+ res_gt(maxres, child_res_id, NULL, NULL)) {
CDEBUG(D_DLMTRACE, "relock "LPU64"<("LPU64"|"LPU64")\n",
child_res_id->name[0], parent_res_id->name[0],
maxres->name[0]);
}
rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace,
- *child_res_id, LDLM_PLAIN, NULL,
+ *child_res_id, LDLM_IBITS, child_policy,
child_mode, &flags, ldlm_blocking_ast,
ldlm_completion_ast, NULL, NULL, NULL, 0,
NULL, child_lockh);
struct ll_fid *fid,
struct lustre_handle *parent_lockh,
struct dentry **dparentp, int parent_mode,
+ __u64 parent_lockpart,
char *name, int namelen,
struct lustre_handle *child_lockh,
- struct dentry **dchildp, int child_mode)
+ struct dentry **dchildp, int child_mode,
+ __u64 child_lockpart)
{
struct ldlm_res_id child_res_id = { .name = {0} };
struct ldlm_res_id parent_res_id = { .name = {0} };
+ ldlm_policy_data_t parent_policy = {.l_inodebits = { parent_lockpart }};
+ ldlm_policy_data_t child_policy = {.l_inodebits = { child_lockpart }};
struct inode *inode;
int rc = 0, cleanup_phase = 0;
ENTRY;
/* Step 3: Lock parent and child in resource order. If child doesn't
* exist, we still have to lock the parent and re-lookup. */
rc = enqueue_ordered_locks(obd,&parent_res_id,parent_lockh,parent_mode,
- &child_res_id, child_lockh, child_mode);
+ &parent_policy,
+ &child_res_id, child_lockh, child_mode,
+ &child_policy);
if (rc)
GOTO(cleanup, rc);
/* Step 4: Re-lookup child to verify it hasn't changed since locking */
rc = mds_verify_child(obd, &parent_res_id, parent_lockh, *dparentp,
parent_mode, &child_res_id, child_lockh, dchildp,
- child_mode, name, namelen, &parent_res_id);
+ child_mode,&child_policy, name, namelen, &parent_res_id);
if (rc > 0)
goto retry_locks;
if (rc < 0) {
RETURN(rc);
}
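
The lookup path above now threads per-lock inodebits policies. Two inodebits
locks on the same resource conflict only if their bit masks intersect, so a
LOOKUP (name) lock and an UPDATE (attribute) lock can coexist. A standalone
sketch (not part of the patch; types and bit values are stand-ins for the
real LDLM definitions):

#include <stdio.h>

#define MDS_INODELOCK_LOOKUP 0x1ULL
#define MDS_INODELOCK_UPDATE 0x2ULL

typedef union {
        struct { unsigned long long bits; } l_inodebits;
} ldlm_policy_data_t;

int main(void)
{
        ldlm_policy_data_t c1 = {.l_inodebits = {MDS_INODELOCK_LOOKUP}};
        ldlm_policy_data_t c2 = {.l_inodebits = {MDS_INODELOCK_UPDATE}};

        printf("conflict: %s\n",
               (c1.l_inodebits.bits & c2.l_inodebits.bits) ?
               "yes" : "no (disjoint bits)");
        return 0;
}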
+int mds_get_cookie_size(struct obd_device *obd, struct lov_mds_md *lmm)
+{
+ int count = le32_to_cpu(lmm->lmm_stripe_count);
+ int real_csize = count * sizeof(struct llog_cookie);
+ return real_csize;
+}
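
A standalone sketch of the sizing rule mds_get_cookie_size() encodes (not
part of the patch): one unlink llog cookie is needed per stripe, so
stripe_count * sizeof(cookie) suffices instead of always reserving
mds_max_cookiesize. The struct below is a stand-in, not the real
struct llog_cookie:

#include <stdio.h>

struct demo_cookie { unsigned long long id[2]; unsigned int subsys, index; };

int main(void)
{
        unsigned int stripe_count = 4;  /* would come from lmm_stripe_count */

        printf("cookie buffer: %zu bytes\n",
               stripe_count * sizeof(struct demo_cookie));
        return 0;
}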
+
+void mds_shrink_reply(struct obd_device *obd, struct ptlrpc_request *req,
+ struct mds_body *body)
+{
+ int cookie_size = 0, md_size = 0;
+
+ if (body && body->valid & OBD_MD_FLEASIZE) {
+ md_size = body->eadatasize;
+ }
+ if (body && body->valid & OBD_MD_FLCOOKIE) {
+ LASSERT(body->valid & OBD_MD_FLEASIZE);
+ cookie_size = mds_get_cookie_size(obd, lustre_msg_buf(
+ req->rq_repmsg, 1, 0));
+ }
+
+ CDEBUG(D_INFO, "Shrink to md_size %d cookie_size %d \n", md_size,
+ cookie_size);
+
+ lustre_shrink_reply(req, 1, md_size, 1);
+
+ lustre_shrink_reply(req, md_size ? 2 : 1, cookie_size, 0);
+}
+
static int mds_reint_unlink(struct mds_update_record *rec, int offset,
struct ptlrpc_request *req,
struct lustre_handle *lh)
struct lustre_handle parent_lockh, child_lockh, child_reuse_lockh;
void *handle = NULL;
int rc = 0, cleanup_phase = 0;
- uid_t child_uid = 0, parent_uid = 0;
- gid_t child_gid = 0, parent_gid = 0;
+ unsigned int qcids[MAXQUOTAS] = {0, 0};
+ unsigned int qpids[MAXQUOTAS] = {0, 0};
ENTRY;
- LASSERT(offset == 0 || offset == 2);
+ LASSERT(offset == MDS_REQ_REC_OFF || offset == 2);
DEBUG_REQ(D_INODE, req, "parent ino "LPU64"/%u, child %s",
rec->ur_fid1->id, rec->ur_fid1->generation, rec->ur_name);
GOTO(cleanup, rc = -ENOENT);
rc = mds_get_parent_child_locked(obd, mds, rec->ur_fid1,
- &parent_lockh, &dparent, LCK_PW,
+ &parent_lockh, &dparent, LCK_EX,
+ MDS_INODELOCK_UPDATE,
rec->ur_name, rec->ur_namelen,
- &child_lockh, &dchild, LCK_EX);
+ &child_lockh, &dchild, LCK_EX,
+ MDS_INODELOCK_FULL);
if (rc)
GOTO(cleanup, rc);
}
/* save uid/gid for quota acquire/release */
- child_uid = child_inode->i_uid;
- child_gid = child_inode->i_gid;
- parent_uid = dparent->d_inode->i_uid;
- parent_gid = dparent->d_inode->i_gid;
+ qcids[USRQUOTA] = child_inode->i_uid;
+ qcids[GRPQUOTA] = child_inode->i_gid;
+ qpids[USRQUOTA] = dparent->d_inode->i_uid;
+ qpids[GRPQUOTA] = dparent->d_inode->i_gid;
cleanup_phase = 2; /* dchild has a lock */
ldlm_lock_decref(&child_lockh, LCK_EX);
case 1: /* child and parent dentry, parent lock */
if (rc)
- ldlm_lock_decref(&parent_lockh, LCK_PW);
+ ldlm_lock_decref(&parent_lockh, LCK_EX);
else
- ptlrpc_save_lock(req, &parent_lockh, LCK_PW);
+ ptlrpc_save_lock(req, &parent_lockh, LCK_EX);
l_dput(dchild);
l_dput(dchild);
l_dput(dparent);
}
req->rq_status = rc;
+ mds_shrink_reply(obd, req, body);
+
/* trigger dqrel on the owner of child and parent */
- mds_adjust_qunit(obd, child_uid, child_gid, parent_uid, parent_gid, rc);
+ lquota_adjust(quota_interface, obd, qcids, qpids, rc, FSFILT_OP_UNLINK);
return 0;
}
struct lustre_handle *handle = NULL, tgt_dir_lockh, src_lockh;
struct ldlm_res_id src_res_id = { .name = {0} };
struct ldlm_res_id tgt_dir_res_id = { .name = {0} };
+ ldlm_policy_data_t src_policy ={.l_inodebits = {MDS_INODELOCK_UPDATE}};
+ ldlm_policy_data_t tgt_dir_policy =
+ {.l_inodebits = {MDS_INODELOCK_UPDATE}};
int rc = 0, cleanup_phase = 0;
ENTRY;
- LASSERT(offset == 0);
+ LASSERT(offset == MDS_REQ_REC_OFF);
DEBUG_REQ(D_INODE, req, "original "LPU64"/%u to "LPU64"/%u %s",
rec->ur_fid1->id, rec->ur_fid1->generation,
tgt_dir_res_id.name[1] = de_tgt_dir->d_inode->i_generation;
rc = enqueue_ordered_locks(obd, &src_res_id, &src_lockh, LCK_EX,
- &tgt_dir_res_id, &tgt_dir_lockh, LCK_EX);
+ &src_policy,
+ &tgt_dir_res_id, &tgt_dir_lockh, LCK_EX,
+ &tgt_dir_policy);
if (rc)
GOTO(cleanup, rc);
* lock on the parent after the lookup is done, so dentry->d_inode may change
* at any time, and igrab() itself doesn't like getting passed a NULL argument.
*/
-static int mds_get_parents_children_locked(struct obd_device *obd,
- struct mds_obd *mds,
- struct ll_fid *p1_fid,
- struct dentry **de_srcdirp,
- struct ll_fid *p2_fid,
- struct dentry **de_tgtdirp,
- int parent_mode,
- const char *old_name, int old_len,
- struct dentry **de_oldp,
- const char *new_name, int new_len,
- struct dentry **de_newp,
- struct lustre_handle *dlm_handles,
- int child_mode)
+int mds_get_parents_children_locked(struct obd_device *obd,
+ struct mds_obd *mds,
+ struct ll_fid *p1_fid,
+ struct dentry **de_srcdirp,
+ struct ll_fid *p2_fid,
+ struct dentry **de_tgtdirp,
+ int parent_mode,
+ const char *old_name, int old_len,
+ struct dentry **de_oldp,
+ const char *new_name, int new_len,
+ struct dentry **de_newp,
+ struct lustre_handle *dlm_handles,
+ int child_mode)
{
struct ldlm_res_id p1_res_id = { .name = {0} };
struct ldlm_res_id p2_res_id = { .name = {0} };
struct ldlm_res_id c1_res_id = { .name = {0} };
struct ldlm_res_id c2_res_id = { .name = {0} };
+ ldlm_policy_data_t p_policy = {.l_inodebits = {MDS_INODELOCK_UPDATE}};
+ /* Only the dentry should disappear; the inode itself stays
+ intact. */
+ ldlm_policy_data_t c1_policy = {.l_inodebits = {MDS_INODELOCK_LOOKUP}};
+ /* If something is going to be replaced, both the dentry and the
+ inode locks are needed. */
+ ldlm_policy_data_t c2_policy = {.l_inodebits = {MDS_INODELOCK_FULL}};
struct ldlm_res_id *maxres_src, *maxres_tgt;
struct inode *inode;
int rc = 0, cleanup_phase = 0;
iput(inode);
/* Step 4: Lookup the target child entry */
+ if (!new_name)
+ GOTO(retry_locks, rc);
*de_newp = ll_lookup_one_len(new_name, *de_tgtdirp, new_len - 1);
if (IS_ERR(*de_newp)) {
rc = PTR_ERR(*de_newp);
maxres_tgt = &p2_res_id;
cleanup_phase = 4; /* target dentry */
- if (c1_res_id.name[0] != 0 && res_gt(&c1_res_id, &p1_res_id))
+ if (c1_res_id.name[0] != 0 && res_gt(&c1_res_id, &p1_res_id, NULL, NULL))
maxres_src = &c1_res_id;
- if (c2_res_id.name[0] != 0 && res_gt(&c2_res_id, &p2_res_id))
+ if (c2_res_id.name[0] != 0 && res_gt(&c2_res_id, &p2_res_id, NULL, NULL))
maxres_tgt = &c2_res_id;
rc = enqueue_4ordered_locks(obd, &p1_res_id,&dlm_handles[0],parent_mode,
+ &p_policy,
&p2_res_id, &dlm_handles[1], parent_mode,
+ &p_policy,
&c1_res_id, &dlm_handles[2], child_mode,
- &c2_res_id, &dlm_handles[3], child_mode);
+ &c1_policy,
+ &c2_res_id, &dlm_handles[3], child_mode,
+ &c2_policy);
if (rc)
GOTO(cleanup, rc);
/* Step 6a: Re-lookup source child to verify it hasn't changed */
rc = mds_verify_child(obd, &p1_res_id, &dlm_handles[0], *de_srcdirp,
parent_mode, &c1_res_id, &dlm_handles[2], de_oldp,
- child_mode, old_name, old_len, maxres_tgt);
+ child_mode, &c1_policy, old_name, old_len,
+ maxres_tgt);
if (rc) {
if (c2_res_id.name[0] != 0)
ldlm_lock_decref(&dlm_handles[3], child_mode);
if ((*de_oldp)->d_inode == NULL)
GOTO(cleanup, rc = -ENOENT);
+ if (!new_name)
+ GOTO(cleanup, rc);
/* Step 6b: Re-lookup target child to verify it hasn't changed */
rc = mds_verify_child(obd, &p2_res_id, &dlm_handles[1], *de_tgtdirp,
parent_mode, &c2_res_id, &dlm_handles[3], de_newp,
- child_mode, new_name, new_len, maxres_src);
+ child_mode, &c2_policy, new_name, new_len,
+ maxres_src);
if (rc) {
ldlm_lock_decref(&dlm_handles[2], child_mode);
ldlm_lock_decref(&dlm_handles[0], parent_mode);
struct lov_mds_md *lmm = NULL;
int rc = 0, lock_count = 3, cleanup_phase = 0;
void *handle = NULL;
+ unsigned int qcids[MAXQUOTAS] = {0, 0};
+ unsigned int qpids[4] = {0, 0, 0, 0};
ENTRY;
- LASSERT(offset == 0);
+ LASSERT(offset == MDS_REQ_REC_OFF);
DEBUG_REQ(D_INODE, req, "parent "LPU64"/%u %s to "LPU64"/%u %s",
rec->ur_fid1->id, rec->ur_fid1->generation, rec->ur_name,
MDS_CHECK_RESENT(req, mds_reconstruct_generic(req));
rc = mds_get_parents_children_locked(obd, mds, rec->ur_fid1, &de_srcdir,
- rec->ur_fid2, &de_tgtdir, LCK_PW,
+ rec->ur_fid2, &de_tgtdir, LCK_EX,
rec->ur_name, rec->ur_namelen,
&de_old, rec->ur_tgt,
rec->ur_tgtlen, &de_new,
if (old_inode == new_inode)
GOTO(cleanup, rc = 0);
+ /* save uids/gids for qunit acquire/release */
+ qcids[USRQUOTA] = old_inode->i_uid;
+ qcids[GRPQUOTA] = old_inode->i_gid;
+ qpids[USRQUOTA] = de_tgtdir->d_inode->i_uid;
+ qpids[GRPQUOTA] = de_tgtdir->d_inode->i_gid;
+ qpids[2] = de_srcdir->d_inode->i_uid;
+ qpids[3] = de_srcdir->d_inode->i_gid;
+
/* if we are about to remove the target at first, pass the EA of
* that inode to client to perform and cleanup on OST */
body = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*body));
if (lock_count == 4)
ldlm_lock_decref(&(dlm_handles[3]), LCK_EX);
ldlm_lock_decref(&(dlm_handles[2]), LCK_EX);
- ldlm_lock_decref(&(dlm_handles[1]), LCK_PW);
- ldlm_lock_decref(&(dlm_handles[0]), LCK_PW);
+ ldlm_lock_decref(&(dlm_handles[1]), LCK_EX);
+ ldlm_lock_decref(&(dlm_handles[0]), LCK_EX);
} else {
if (lock_count == 4)
ptlrpc_save_lock(req,&(dlm_handles[3]), LCK_EX);
ptlrpc_save_lock(req, &(dlm_handles[2]), LCK_EX);
- ptlrpc_save_lock(req, &(dlm_handles[1]), LCK_PW);
- ptlrpc_save_lock(req, &(dlm_handles[0]), LCK_PW);
+ ptlrpc_save_lock(req, &(dlm_handles[1]), LCK_EX);
+ ptlrpc_save_lock(req, &(dlm_handles[0]), LCK_EX);
}
l_dput(de_new);
l_dput(de_old);
LBUG();
}
req->rq_status = rc;
+
+ /* acquire/release qunit */
+ lquota_adjust(quota_interface, obd, qcids, qpids, rc, FSFILT_OP_RENAME);
return 0;
}
ENTRY;
#if CRAY_XT3
- rec->ur_uc.luc_fsuid = req->rq_uid;
+ if (req->rq_uid != LNET_UID_ANY) {
+ /* non-root local cluster client;
+ * NB root's creds are believed... */
+ LASSERT(req->rq_uid != 0);
+ rec->ur_uc.luc_fsuid = req->rq_uid;
+ rec->ur_uc.luc_cap = 0;
+ }
#endif
/* get group info of this user */
#include "mds_internal.h"
-static int mds_osc_destroy_orphan(struct mds_obd *mds,
+static int mds_osc_destroy_orphan(struct obd_device *obd,
struct inode *inode,
struct lov_mds_md *lmm,
int lmm_size,
struct llog_cookie *logcookies,
int log_unlink)
{
+ struct mds_obd *mds = &obd->u.mds;
struct lov_stripe_md *lsm = NULL;
struct obd_trans_info oti = { 0 };
struct obdo *oa;
rc = 0;
}
+ rc = obd_checkmd(mds->mds_osc_exp, obd->obd_self_export, lsm);
+ if (rc)
+ GOTO(out_free_memmd, rc);
+
oa = obdo_alloc();
if (oa == NULL)
GOTO(out_free_memmd, rc = -ENOMEM);
oa->o_valid |= OBD_MD_FLCOOKIE;
oti.oti_logcookies = logcookies;
}
-
- rc = obd_destroy(mds->mds_osc_exp, oa, lsm, &oti);
+ rc = obd_destroy(mds->mds_osc_exp, oa, lsm, &oti, obd->obd_self_export);
obdo_free(oa);
if (rc)
CDEBUG(D_INODE, "destroy orphan objid 0x"LPX64" on ost error "
struct mds_obd *mds = &obd->u.mds;
struct lov_mds_md *lmm = NULL;
struct llog_cookie *logcookies = NULL;
- int lmm_size, log_unlink = 0;
+ int lmm_size, log_unlink = 0, cookie_size = 0;
void *handle = NULL;
int rc, err;
ENTRY;
CERROR("error %d unlinking orphan %.*s from PENDING\n",
rc, dchild->d_name.len, dchild->d_name.name);
} else if (lmm_size) {
- OBD_ALLOC(logcookies, mds->mds_max_cookiesize);
+ cookie_size = mds_get_cookie_size(obd, lmm);
+ OBD_ALLOC(logcookies, cookie_size);
if (logcookies == NULL)
rc = -ENOMEM;
else if (mds_log_op_unlink(obd, inode, lmm,lmm_size,logcookies,
- mds->mds_max_cookiesize) > 0)
+ cookie_size) > 0)
log_unlink = 1;
}
if (!rc)
rc = err;
} else if (!rc) {
- rc = mds_osc_destroy_orphan(mds, inode, lmm, lmm_size,
+ rc = mds_osc_destroy_orphan(obd, inode, lmm, lmm_size,
logcookies, log_unlink);
}
if (logcookies != NULL)
- OBD_FREE(logcookies, mds->mds_max_cookiesize);
+ OBD_FREE(logcookies, cookie_size);
out_free_lmm:
OBD_FREE(lmm, mds->mds_max_mdsize);
RETURN(rc);
return -EFAULT;
}
+ if (!(req->rq_export->exp_connect_flags & OBD_CONNECT_XATTR) &&
+ (strncmp(xattr_name, "user.", 5) == 0))
+ return -EOPNOTSUPP;
+
if (inode->i_op && inode->i_op->getxattr)
rc = inode->i_op->getxattr(de, xattr_name, NULL, 0);
} else if (body->valid & OBD_MD_FLXATTRLS) {
char *xattr = NULL;
int xattrlen;
int rc = -EOPNOTSUPP, err = 0;
+ __u64 lockpart;
ENTRY;
body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
MDS_CHECK_RESENT(req, mds_reconstruct_generic(req));
- de = mds_fid2locked_dentry(obd, &body->fid1, NULL, LCK_PW,
- &lockh, NULL, 0);
+ lockpart = MDS_INODELOCK_UPDATE;
+
+ de = mds_fid2locked_dentry(obd, &body->fid1, NULL, LCK_EX,
+ &lockh, NULL, 0, lockpart);
if (IS_ERR(de))
GOTO(out, rc = PTR_ERR(de));
body->valid & OBD_MD_FLXATTR ? "set" : "remove",
xattr_name);
- if (!strncmp(xattr_name, "trusted.", 8)) {
- if (!strcmp(xattr_name, "trusted."XATTR_LUSTRE_MDS_LOV_EA))
+ if (strncmp(xattr_name, "trusted.", 8) == 0) {
+ if (strcmp(xattr_name + 8, XATTR_LUSTRE_MDS_LOV_EA) == 0)
GOTO(out_dput, rc = -EACCES);
}
+ if (!(req->rq_export->exp_connect_flags & OBD_CONNECT_XATTR) &&
+ (strncmp(xattr_name, "user.", 5) == 0)) {
+ GOTO(out_dput, rc = -EOPNOTSUPP);
+ }
+
/* filter_op simply use setattr one */
handle = fsfilt_start(obd, inode, FSFILT_OP_SETATTR, NULL);
if (IS_ERR(handle))
out_dput:
l_dput(de);
if (rc)
- ldlm_lock_decref(&lockh, LCK_PW);
+ ldlm_lock_decref(&lockh, LCK_EX);
else
- ptlrpc_save_lock (req, &lockh, LCK_PW);
+ ptlrpc_save_lock(req, &lockh, LCK_EX);
if (err && !rc)
rc = err;
+++ /dev/null
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- * lustre/mds/quota_context.c
- * Lustre Quota Context
- *
- * Copyright (c) 2001-2003 Cluster File Systems, Inc.
- * Author: Niu YaWei <niu@clusterfs.com>
- *
- * This file is part of Lustre, http://www.lustre.org.
- *
- * No redistribution or use is permitted outside of Cluster File Systems, Inc.
- *
- */
-#ifndef EXPORT_SYMTAB
-# define EXPORT_SYMTAB
-#endif
-
-#define DEBUG_SUBSYSTEM S_MDS
-
-#include <linux/version.h>
-#include <linux/fs.h>
-#include <asm/unistd.h>
-#include <linux/slab.h>
-#include <linux/quotaops.h>
-#include <linux/module.h>
-#include <linux/init.h>
-
-#include <linux/obd_class.h>
-#include <linux/lustre_quota.h>
-#include <linux/lustre_fsfilt.h>
-
-const unsigned long default_bunit_sz = 100 * 1024 * 1024; /* 100M bytes */
-const unsigned long default_btune_sz = 50 * 1024 * 1024; /* 50M bytes */
-const unsigned long default_iunit_sz = 5000; /* 5000 inodes */
-const unsigned long default_itune_sz = 2500; /* 2500 inodes */
-
-static inline int const
-qunit_hashfn(struct lustre_quota_ctxt *qctxt, struct qunit_data *qdata)
-{
- unsigned int id = qdata->qd_id;
- unsigned int type = qdata->qd_type;
-
- unsigned long tmp = ((unsigned long)qctxt >> L1_CACHE_SHIFT) ^ id;
- tmp = (tmp * (MAXQUOTAS - type)) % NR_DQHASH;
- return tmp;
-}
-
-static inline struct lustre_qunit *find_qunit(unsigned int hashent,
- struct lustre_quota_ctxt *qctxt,
- struct qunit_data *qdata)
-{
- struct list_head *pos;
- struct lustre_qunit *qunit = NULL;
- struct qunit_data *tmp;
-
- list_for_each(pos, qunit_hash + hashent) {
- qunit = list_entry(pos, struct lustre_qunit, lq_hash);
- tmp = &qunit->lq_data;
- if (qunit->lq_ctxt == qctxt &&
- qdata->qd_id == tmp->qd_id && qdata->qd_type == tmp->qd_type
- && qdata->qd_isblk == tmp->qd_isblk)
- return qunit;
- }
- return NULL;
-}
-
-/* check_cur_qunit - check the current usage of qunit.
- * @qctxt: quota context
- * @qdata: the type of quota unit to be checked
- *
- * return: 1 - need acquire qunit;
- * 2 - need release qunit;
- * 0 - need do nothing.
- * < 0 - error.
- */
-static int
-check_cur_qunit(struct obd_device *obd,
- struct lustre_quota_ctxt *qctxt, struct qunit_data *qdata)
-{
- struct super_block *sb = qctxt->lqc_sb;
- unsigned long qunit_sz, tune_sz;
- __u64 usage, limit;
- struct obd_quotactl *qctl = NULL;
- int ret = 0;
- ENTRY;
-
- if (!sb_any_quota_enabled(sb))
- RETURN(0);
-
- /* ignore root user */
- if (qdata->qd_id == 0 && qdata->qd_type == USRQUOTA)
- RETURN(0);
-
- OBD_ALLOC(qctl, sizeof(*qctl));
- if (qctl == NULL)
- RETURN(-ENOMEM);
-
- /* get fs quota usage & limit */
- qctl->qc_cmd = Q_GETQUOTA;
- qctl->qc_id = qdata->qd_id;
- qctl->qc_type = qdata->qd_type;
- ret = fsfilt_quotactl(obd, sb, qctl);
- if (ret) {
- if (ret == -ESRCH) /* no limit */
- ret = 0;
- else
- CERROR("can't get fs quota usage! (rc:%d)\n", ret);
- GOTO(out, ret);
- }
-
- if (qdata->qd_isblk) {
- usage = qctl->qc_dqblk.dqb_curspace;
- limit = qctl->qc_dqblk.dqb_bhardlimit;
- qunit_sz = qctxt->lqc_bunit_sz;
- tune_sz = qctxt->lqc_btune_sz;
-
- LASSERT(!(qunit_sz % QUOTABLOCK_SIZE));
- LASSERT(limit == MIN_QLIMIT
- || !((__u32) limit % toqb(qunit_sz)));
- limit = limit << QUOTABLOCK_BITS;
- } else {
- usage = qctl->qc_dqblk.dqb_curinodes;
- limit = qctl->qc_dqblk.dqb_ihardlimit;
- qunit_sz = qctxt->lqc_iunit_sz;
- tune_sz = qctxt->lqc_itune_sz;
- }
-
- /* if it's not first time to set quota, ignore the no quota limit
- * case */
- if (!limit)
- GOTO(out, ret = 0);
-
- /* we don't count the MIN_QLIMIT */
- if ((limit == MIN_QLIMIT && !qdata->qd_isblk) ||
- (toqb(limit) == MIN_QLIMIT && qdata->qd_isblk))
- limit = 0;
-
- LASSERT(qdata->qd_count == 0);
- if (limit <= usage + tune_sz) {
- while (qdata->qd_count + limit <= usage + tune_sz)
- qdata->qd_count += qunit_sz;
- ret = 1;
- } else if (limit > usage + qunit_sz + tune_sz) {
- while (limit - qdata->qd_count > usage + qunit_sz + tune_sz)
- qdata->qd_count += qunit_sz;
- ret = 2;
- }
- LASSERT(ret == 0 || qdata->qd_count);
-out:
- OBD_FREE(qctl, sizeof(*qctl));
- RETURN(ret);
-}
-
-/* must hold qctxt->lqc_qunit_lock */
-static struct lustre_qunit *dqacq_in_flight(struct lustre_quota_ctxt *qctxt,
- struct qunit_data *qdata)
-{
- unsigned int hashent = qunit_hashfn(qctxt, qdata);
- struct lustre_qunit *qunit = NULL;
- ENTRY;
-
- qunit = find_qunit(hashent, qctxt, qdata);
- RETURN(qunit);
-}
-
-static struct lustre_qunit *alloc_qunit(struct lustre_quota_ctxt *qctxt,
- struct qunit_data *qdata, int opc)
-{
- struct lustre_qunit *qunit = NULL;
- ENTRY;
-
- OBD_SLAB_ALLOC(qunit, qunit_cachep, SLAB_NOFS, sizeof(*qunit));
- if (qunit == NULL)
- RETURN(NULL);
-
- INIT_LIST_HEAD(&qunit->lq_hash);
- INIT_LIST_HEAD(&qunit->lq_waiters);
- atomic_set(&qunit->lq_refcnt, 1);
- qunit->lq_ctxt = qctxt;
- memcpy(&qunit->lq_data, qdata, sizeof(*qdata));
- qunit->lq_opc = opc;
-
- RETURN(qunit);
-}
-
-static inline void free_qunit(struct lustre_qunit *qunit)
-{
- OBD_SLAB_FREE(qunit, qunit_cachep, sizeof(*qunit));
-}
-
-static inline void qunit_get(struct lustre_qunit *qunit)
-{
- atomic_inc(&qunit->lq_refcnt);
-}
-
-static void qunit_put(struct lustre_qunit *qunit)
-{
- LASSERT(atomic_read(&qunit->lq_refcnt));
- if (atomic_dec_and_test(&qunit->lq_refcnt))
- free_qunit(qunit);
-}
-
-static void
-insert_qunit_nolock(struct lustre_quota_ctxt *qctxt, struct lustre_qunit *qunit)
-{
- struct list_head *head;
-
- head = qunit_hash + qunit_hashfn(qctxt, &qunit->lq_data);
- list_add(&qunit->lq_hash, head);
-}
-
-static void remove_qunit_nolock(struct lustre_qunit *qunit)
-{
- LASSERT(!list_empty(&qunit->lq_hash));
- list_del_init(&qunit->lq_hash);
-}
-
-struct qunit_waiter {
- struct list_head qw_entry;
- wait_queue_head_t qw_waitq;
- int qw_rc;
-};
-
-#define QDATA_DEBUG(qd, fmt, arg...) \
- CDEBUG(D_QUOTA, "id(%u) type(%u) count(%u) isblk(%u):" \
- fmt, qd->qd_id, qd->qd_type, qd->qd_count, qd->qd_isblk, \
- ## arg); \
-
-#define INC_QLIMIT(limit, count) (limit == MIN_QLIMIT) ? \
- (limit = count) : (limit += count)
-
-static int
-dqacq_completion(struct obd_device *obd,
- struct lustre_quota_ctxt *qctxt,
- struct qunit_data *qdata, int rc, int opc)
-{
- struct lustre_qunit *qunit = NULL;
- struct super_block *sb = qctxt->lqc_sb;
- unsigned long qunit_sz;
- struct list_head *pos, *tmp;
- int err = 0;
- ENTRY;
-
- LASSERT(qdata);
- qunit_sz =
- (qdata->qd_isblk) ? qctxt->lqc_bunit_sz : qctxt->lqc_iunit_sz;
- LASSERT(!(qdata->qd_count % qunit_sz));
-
- /* update local operational quota file */
- if (rc == 0) {
- __u32 count = QUSG(qdata->qd_count, qdata->qd_isblk);
- struct obd_quotactl *qctl = NULL;
-
- OBD_ALLOC(qctl, sizeof(*qctl));
- if (qctl == NULL)
- GOTO(out, err = -ENOMEM);
-
- /* acq/rel qunit for specified uid/gid is serialized,
- * so there is no race between get fs quota limit and
- * set fs quota limit */
- qctl->qc_cmd = Q_GETQUOTA;
- qctl->qc_id = qdata->qd_id;
- qctl->qc_type = qdata->qd_type;
- err = fsfilt_quotactl(obd, sb, qctl);
- if (err) {
- CERROR("error get quota fs limit! (rc:%d)\n", err);
- GOTO(out_mem, err);
- }
-
- switch (opc) {
- case QUOTA_DQACQ:
- if (qdata->qd_isblk) {
- qctl->qc_dqblk.dqb_valid = QIF_BLIMITS;
- INC_QLIMIT(qctl->qc_dqblk.dqb_bhardlimit,
- count);
- } else {
- qctl->qc_dqblk.dqb_valid = QIF_ILIMITS;
- INC_QLIMIT(qctl->qc_dqblk.dqb_ihardlimit,
- count);
- }
- break;
- case QUOTA_DQREL:
- if (qdata->qd_isblk) {
- LASSERT(count < qctl->qc_dqblk.dqb_bhardlimit);
- qctl->qc_dqblk.dqb_valid = QIF_BLIMITS;
- qctl->qc_dqblk.dqb_bhardlimit -= count;
- } else {
- LASSERT(count < qctl->qc_dqblk.dqb_ihardlimit);
- qctl->qc_dqblk.dqb_valid = QIF_ILIMITS;
- qctl->qc_dqblk.dqb_ihardlimit -= count;
- }
- break;
- default:
- LBUG();
- break;
- }
-
- /* clear quota limit */
- if (count == 0) {
- if (qdata->qd_isblk)
- qctl->qc_dqblk.dqb_bhardlimit = 0;
- else
- qctl->qc_dqblk.dqb_ihardlimit = 0;
- }
-
- qctl->qc_cmd = Q_SETQUOTA;
- err = fsfilt_quotactl(obd, sb, qctl);
- if (err)
- CERROR("error set quota fs limit! (rc:%d)\n", err);
-
- QDATA_DEBUG(qdata, "%s completion\n",
- opc == QUOTA_DQACQ ? "DQACQ" : "DQREL");
-out_mem:
- OBD_FREE(qctl, sizeof(*qctl));
- } else if (rc == -EDQUOT) {
- CWARN("acquire qunit got EDQUOT\n");
- } else {
- CERROR("acquire qunit got error! (rc:%d)\n", rc);
- }
-out:
- /* remove the qunit from hash */
- spin_lock(&qunit_hash_lock);
-
- qunit = dqacq_in_flight(qctxt, qdata);
-
- LASSERT(qunit);
- LASSERT(opc == qunit->lq_opc);
- remove_qunit_nolock(qunit);
-
- /* wake up all waiters */
- list_for_each_safe(pos, tmp, &qunit->lq_waiters) {
- struct qunit_waiter *qw = list_entry(pos, struct qunit_waiter,
- qw_entry);
- list_del_init(&qw->qw_entry);
- qw->qw_rc = rc;
- wake_up(&qw->qw_waitq);
- }
-
- spin_unlock(&qunit_hash_lock);
-
- qunit_put(qunit);
- RETURN(err);
-}
-
-struct dqacq_async_args {
- struct lustre_quota_ctxt *aa_ctxt;
- struct lustre_qunit *aa_qunit;
-};
-
-static int dqacq_interpret(struct ptlrpc_request *req, void *data, int rc)
-{
- struct dqacq_async_args *aa = (struct dqacq_async_args *)data;
- struct lustre_quota_ctxt *qctxt = aa->aa_ctxt;
- struct lustre_qunit *qunit = aa->aa_qunit;
- struct obd_device *obd = req->rq_import->imp_obd;
- struct qunit_data *qdata = NULL;
- ENTRY;
-
- qdata = lustre_swab_repbuf(req, 0, sizeof(*qdata), lustre_swab_qdata);
- if (rc == 0 && qdata == NULL)
- RETURN(-EPROTO);
-
- LASSERT(qdata->qd_id == qunit->lq_data.qd_id &&
- qdata->qd_type == qunit->lq_data.qd_type &&
- (qdata->qd_count == qunit->lq_data.qd_count ||
- qdata->qd_count == 0));
-
- QDATA_DEBUG(qdata, "%s interpret rc(%d).\n",
- req->rq_reqmsg->opc == QUOTA_DQACQ ? "DQACQ" : "DQREL", rc);
-
- rc = dqacq_completion(obd, qctxt, qdata, rc, req->rq_reqmsg->opc);
-
- RETURN(rc);
-}
-
-static int got_qunit(struct qunit_waiter *waiter)
-{
- int rc = 0;
- ENTRY;
- spin_lock(&qunit_hash_lock);
- rc = list_empty(&waiter->qw_entry);
- spin_unlock(&qunit_hash_lock);
- RETURN(rc);
-}
-
-static int
-schedule_dqacq(struct obd_device *obd,
- struct lustre_quota_ctxt *qctxt,
- struct qunit_data *qdata, int opc, int wait)
-{
- struct lustre_qunit *qunit = NULL;
- struct qunit_waiter qw;
- struct l_wait_info lwi = { 0 };
- int rc = 0;
- ENTRY;
-
- INIT_LIST_HEAD(&qw.qw_entry);
- init_waitqueue_head(&qw.qw_waitq);
- qw.qw_rc = 0;
-
- spin_lock(&qunit_hash_lock);
-
- qunit = dqacq_in_flight(qctxt, qdata);
- if (qunit && wait) {
- list_add_tail(&qw.qw_entry, &qunit->lq_waiters);
- spin_unlock(&qunit_hash_lock);
- goto wait_completion;
- } else if (qunit && !wait) {
- qunit = NULL;
- } else if (!qunit && (qunit = alloc_qunit(qctxt, qdata, opc)) != NULL)
- insert_qunit_nolock(qctxt, qunit);
-
- spin_unlock(&qunit_hash_lock);
-
- if (qunit) {
- struct ptlrpc_request *req;
- struct qunit_data *reqdata;
- struct dqacq_async_args *aa;
- int size = sizeof(*reqdata);
-
- /* master is going to dqacq/dqrel from itself */
- if (qctxt->lqc_handler) {
- int rc2;
- QDATA_DEBUG(qdata, "local %s.\n",
- opc == QUOTA_DQACQ ? "DQACQ" : "DQREL");
- rc = qctxt->lqc_handler(obd, qdata, opc);
- rc2 = dqacq_completion(obd, qctxt, qdata, rc, opc);
- RETURN((rc && rc != -EDQUOT) ? rc : rc2);
- }
-
- /* build dqacq/dqrel request */
- LASSERT(qctxt->lqc_import);
- req = ptlrpc_prep_req(qctxt->lqc_import, opc, 1, &size, NULL);
- if (!req) {
- dqacq_completion(obd, qctxt, qdata, -ENOMEM, opc);
- RETURN(-ENOMEM);
- }
-
- reqdata = lustre_msg_buf(req->rq_reqmsg, 0, sizeof(*reqdata));
- memcpy(reqdata, qdata, sizeof(*reqdata));
- size = sizeof(*reqdata);
- req->rq_replen = lustre_msg_size(1, &size);
-
- LASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
- aa = (struct dqacq_async_args *)&req->rq_async_args;
- aa->aa_ctxt = qctxt;
- aa->aa_qunit = qunit;
-
- req->rq_interpret_reply = dqacq_interpret;
- ptlrpcd_add_req(req);
-
- QDATA_DEBUG(qdata, "%s scheduled.\n",
- opc == QUOTA_DQACQ ? "DQACQ" : "DQREL");
- }
-wait_completion:
- if (wait && qunit) {
- struct qunit_data *p = &qunit->lq_data;
- QDATA_DEBUG(p, "wait for dqacq.\n");
-
- l_wait_event(qw.qw_waitq, got_qunit(&qw), &lwi);
- if (qw.qw_rc == 0)
- rc = -EAGAIN;
-
- CDEBUG(D_QUOTA, "wait dqacq done. (rc:%d)\n", qw.qw_rc);
- }
- RETURN(rc);
-}
-
-int
-qctxt_adjust_qunit(struct obd_device *obd, struct lustre_quota_ctxt *qctxt,
- uid_t uid, gid_t gid, __u32 isblk)
-{
- int ret, rc = 0, i = USRQUOTA;
- struct qunit_data qdata[MAXQUOTAS];
- ENTRY;
-
- if (!sb_any_quota_enabled(qctxt->lqc_sb))
- RETURN(0);
-
- qdata[USRQUOTA].qd_id = uid;
- qdata[USRQUOTA].qd_type = USRQUOTA;
- qdata[USRQUOTA].qd_isblk = isblk;
- qdata[USRQUOTA].qd_count = 0;
- qdata[GRPQUOTA].qd_id = gid;
- qdata[GRPQUOTA].qd_type = GRPQUOTA;
- qdata[GRPQUOTA].qd_isblk = isblk;
- qdata[GRPQUOTA].qd_count = 0;
-
-next:
- ret = check_cur_qunit(obd, qctxt, &qdata[i]);
- if (ret > 0) {
- int opc;
- /* need acquire or release */
- opc = ret == 1 ? QUOTA_DQACQ : QUOTA_DQREL;
- ret = schedule_dqacq(obd, qctxt, &qdata[i], opc, 0);
- if (!rc)
- rc = ret;
- }
- if (++i < MAXQUOTAS)
- goto next;
-
- RETURN(rc);
-}
-EXPORT_SYMBOL(qctxt_adjust_qunit);
-
-int
-qctxt_wait_on_dqacq(struct obd_device *obd, struct lustre_quota_ctxt *qctxt,
- uid_t uid, gid_t gid, __u32 isblk)
-{
- struct qunit_data qdata[MAXQUOTAS];
- int i = USRQUOTA, ret, rc = -EAGAIN;
- ENTRY;
-
- if (!sb_any_quota_enabled(qctxt->lqc_sb))
- RETURN(0);
-
- qdata[USRQUOTA].qd_id = uid;
- qdata[USRQUOTA].qd_type = USRQUOTA;
- qdata[USRQUOTA].qd_isblk = isblk;
- qdata[USRQUOTA].qd_count = 0;
- qdata[GRPQUOTA].qd_id = gid;
- qdata[GRPQUOTA].qd_type = GRPQUOTA;
- qdata[GRPQUOTA].qd_isblk = isblk;
- qdata[GRPQUOTA].qd_count = 0;
-
-next:
- ret = check_cur_qunit(obd, qctxt, &qdata[i]);
- if (ret > 0)
- rc = schedule_dqacq(obd, qctxt, &qdata[i], QUOTA_DQACQ, 1);
-
- if (++i < MAXQUOTAS)
- goto next;
-
- RETURN(rc);
-}
-EXPORT_SYMBOL(qctxt_wait_on_dqacq);
-
-int
-qctxt_init(struct lustre_quota_ctxt *qctxt, struct super_block *sb,
- dqacq_handler_t handler)
-{
- int rc = 0;
- ENTRY;
-
- rc = ptlrpcd_addref();
- if (rc)
- RETURN(rc);
-
- qctxt->lqc_handler = handler;
- qctxt->lqc_sb = sb;
- qctxt->lqc_import = NULL;
- qctxt->lqc_flags = 0;
- qctxt->lqc_bunit_sz = default_bunit_sz;
- qctxt->lqc_btune_sz = default_btune_sz;
- qctxt->lqc_iunit_sz = default_iunit_sz;
- qctxt->lqc_itune_sz = default_itune_sz;
-
- RETURN(0);
-}
-EXPORT_SYMBOL(qctxt_init);
-
-void qctxt_cleanup(struct lustre_quota_ctxt *qctxt, int force)
-{
- struct list_head *pos, *tmp;
- struct lustre_qunit *qunit;
- int i;
- ENTRY;
-
- ptlrpcd_decref();
-
- spin_lock(&qunit_hash_lock);
-
- for (i = 0; i < NR_DQHASH; i++) {
- list_for_each_safe(pos, tmp, &qunit_hash[i]) {
- qunit = list_entry(pos, struct lustre_qunit, lq_hash);
- LASSERT(qunit->lq_ctxt != qctxt);
- }
- }
-
- spin_unlock(&qunit_hash_lock);
- EXIT;
-}
-EXPORT_SYMBOL(qctxt_cleanup);
int mgc_logname2resid(char *logname, struct ldlm_res_id *res_id)
{
char *name_end;
-
+ int len;
+ __u64 resname = 0;
+
/* fsname is at most 8 chars long at the beginning of the logname
e.g. "lustre-MDT0001" or "lustre" */
name_end = strchr(logname, '-');
- if (!name_end)
- name_end = logname + strlen(logname);
- LASSERT(name_end - logname <= 8);
-
- memcpy(&res_id->name[0], logname, name_end - logname);
- CDEBUG(D_MGC, "log %s to resid "LPX64"\n", logname, res_id->name[0]);
+ if (name_end)
+ len = name_end - logname;
+ else
+ len = strlen(logname);
+ LASSERT(len <= 8);
+ memcpy(&resname, logname, len);
+
+ memset(res_id, 0, sizeof(*res_id));
+ /* FIXME are resid names swabbed across the wire? */
+ res_id->name[0] = cpu_to_le64(resname);
+ CDEBUG(D_MGC, "log %s to resid "LPX64"/"LPX64" (%.8s)\n", logname,
+ res_id->name[0], res_id->name[1], (char *)&res_id->name[0]);
return 0;
}
EXPORT_SYMBOL(mgc_logname2resid);
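
A standalone sketch of the mapping mgc_logname2resid() performs (not part of
the patch): the fsname, i.e. everything before '-' and at most 8 bytes, is
byte-copied into the first 64-bit word of the resource name:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
        const char *logname = "lustre-MDT0001";
        const char *dash = strchr(logname, '-');
        size_t len = dash ? (size_t)(dash - logname) : strlen(logname);
        uint64_t resname = 0;

        if (len > 8)
                return 1;               /* fsname must fit in 8 bytes */
        memcpy(&resname, logname, len);
        printf("resid %#llx (%.8s)\n",
               (unsigned long long)resname, (char *)&resname);
        return 0;
}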
switch (flag) {
case LDLM_CB_BLOCKING:
/* mgs wants the lock, give it up... */
- LDLM_DEBUG(lock, "MGC blocking CB");
+ LDLM_ERROR(lock, "MGC blocking CB");
ldlm_lock2handle(lock, &lockh);
rc = ldlm_cli_cancel(&lockh);
break;
CERROR("Lock res "LPX64" (%.8s)\n",
lock->l_resource->lr_name.name[0],
- (char *)lock->l_resource->lr_name.name);
+ (char *)&lock->l_resource->lr_name.name[0]);
/* Make sure not to re-enqueue when the mgc is stopping
(we get called from client_disconnect_export) */
int rc;
ENTRY;
- LASSERT(type == LDLM_PLAIN);
-
CDEBUG(D_MGC, "Enqueue for %s (res "LPX64")\n", cld->cld_logname,
cld->cld_resid.name[0]);
int rc;
ENTRY;
- req = ptlrpc_prep_req(class_exp2cliimp(exp), MGMT_TARGET_ADD,
- 1, &size, NULL);
+ req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MGS_VERSION,
+ MGMT_TARGET_ADD, 1, &size, NULL);
if (!req)
RETURN(rc = -ENOMEM);
int rc;
ENTRY;
- req = ptlrpc_prep_req(class_exp2cliimp(exp), MGMT_TARGET_DEL,
- 1, &size, NULL);
+ req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MGS_VERSION,
+ MGMT_TARGET_DEL, 1, &size, NULL);
if (!req)
RETURN(rc = -ENOMEM);
.o_del_conn = client_import_del_conn,
.o_connect = client_connect_import,
.o_disconnect = client_disconnect_export,
- .o_enqueue = mgc_enqueue,
+ //.o_enqueue = mgc_enqueue,
.o_cancel = mgc_cancel,
//.o_iocontrol = mgc_iocontrol,
.o_set_info = mgc_set_info,
#include <linux/lustre_fsfilt.h>
#include <linux/lustre_commit_confd.h>
#include <linux/lustre_disk.h>
+#include <linux/lustre_ver.h>
#include "mgs_internal.h"
static int mgs_cleanup(struct obd_device *obd);
if (data != NULL) {
data->ocd_connect_flags &= MGMT_CONNECT_SUPPORTED;
+ data->ocd_ibits_known &= MDS_INODELOCK_FULL;
+
+ /* If no known bits remain (which should not happen, since every
+ client should support at least the LOOKUP and UPDATE bits),
+ revert to compatibility mode with plain locks. */
+ if (!data->ocd_ibits_known &&
+ data->ocd_connect_flags & OBD_CONNECT_IBITS)
+ data->ocd_connect_flags &= ~OBD_CONNECT_IBITS;
+
exp->exp_connect_flags = data->ocd_connect_flags;
+ data->ocd_version = LUSTRE_VERSION_CODE;
}
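
A standalone sketch of the connect-time negotiation above (not part of the
patch): the server masks the client's known inodebits to what it supports,
and if nothing survives it clears the IBITS connect flag so both ends fall
back to plain locks. Flag values here are illustrative:

#include <stdio.h>

#define OBD_CONNECT_IBITS   0x1000ULL
#define MDS_INODELOCK_FULL  0x3ULL      /* LOOKUP | UPDATE */

int main(void)
{
        unsigned long long connect_flags = OBD_CONNECT_IBITS;
        unsigned long long ibits_known = 0; /* client that knows no bits */

        ibits_known &= MDS_INODELOCK_FULL;
        if (!ibits_known && (connect_flags & OBD_CONNECT_IBITS))
                connect_flags &= ~OBD_CONNECT_IBITS;

        printf("inodebits %s\n", (connect_flags & OBD_CONNECT_IBITS) ?
               "negotiated" : "disabled, plain-lock compat");
        return 0;
}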
if (rc) {
static int mgs_get_cfg_lock(struct obd_device *obd, char *fsname,
struct lustre_handle *lockh)
{
- /* FIXME resource should be based on fsname,
- one lock per fs. One lock per config log? */
- struct ldlm_res_id res_id = {.name = {12321}};
+ struct ldlm_res_id res_id;
int rc, flags = 0;
+ ENTRY;
- CERROR("mgs_lock %s\n", fsname);
rc = mgc_logname2resid(fsname, &res_id);
rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, res_id,
if (rc) {
CERROR("can't take cfg lock %d\n", rc);
}
-
- return rc;
+
+ RETURN(rc);
}
static int mgs_put_cfg_lock(struct lustre_handle *lockh)
obdclass-objs += class_obd.o
obdclass-objs += debug.o genops.o sysctl.o uuid.o llog_ioctl.o
obdclass-objs += lprocfs_status.o lustre_handles.o lustre_peer.o
-obdclass-objs += statfs_pack.o obdo.o obd_config.o obd_mount.o
+obdclass-objs += statfs_pack.o obdo.o obd_config.o obd_mount.o prng.o
+
ifeq ($(PATCHLEVEL),6)
llog_test-objs := llog-test.o
liblustreclass_a_SOURCES = class_obd.c debug.c genops.c statfs_pack.c uuid.c
liblustreclass_a_SOURCES += lustre_handles.c lustre_peer.c lprocfs_status.c
liblustreclass_a_SOURCES += obdo.c obd_config.c llog.c llog_obd.c llog_cat.c
-liblustreclass_a_SOURCES += llog_lvfs.c llog_swab.c #llog_ioctl.c rbtree.c
+liblustreclass_a_SOURCES += llog_lvfs.c llog_swab.c
+liblustreclass_a_SOURCES += prng.c #llog_ioctl.c rbtree.c
liblustreclass_a_CPPFLAGS = $(LLCPPFLAGS) -DLUSTRE_VERSION=\"32\" -DBUILD_VERSION=\"1\"
liblustreclass_a_CFLAGS = $(LLCFLAGS)
CERROR("Device %d not attached\n", obd->obd_minor);
GOTO(out, err = -ENODEV);
}
- CDEBUG(D_IOCTL,
- "disabling committed-transno notifications on %d\n",
- obd->obd_minor);
+ CDEBUG(D_HA, "%s: disabling committed-transno notification\n",
+ obd->obd_name);
obd->obd_no_transno = 1;
GOTO(out, err = 0);
}
if (obd->obd_type == NULL)
continue;
+ if (obd->obd_stopping)
+ continue;
+
atomic_inc(&obd->obd_refcount);
spin_unlock(&obd_dev_lock);
#include <linux/obd_ost.h>
#include <linux/obd_class.h>
#include <linux/lprocfs_status.h>
-#include <linux/lustre_quota.h>
extern struct list_head obd_types;
static spinlock_t obd_types_lock = SPIN_LOCK_UNLOCKED;
EXPORT_SYMBOL(obdo_cachep);
kmem_cache_t *import_cachep = NULL;
-#ifdef HAVE_QUOTA_SUPPORT
-kmem_cache_t *qunit_cachep = NULL;
-struct list_head qunit_hash[NR_DQHASH];
-spinlock_t qunit_hash_lock = SPIN_LOCK_UNLOCKED;
-EXPORT_SYMBOL(qunit_cachep);
-EXPORT_SYMBOL(qunit_hash);
-EXPORT_SYMBOL(qunit_hash_lock);
-#endif
-
-
int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c);
void (*ptlrpc_abort_inflight_superhack)(struct obd_import *imp);
#ifdef LPROCFS
type->typ_procroot = lprocfs_register(type->typ_name, proc_lustre_root,
vars, type);
-#endif
if (IS_ERR(type->typ_procroot)) {
rc = PTR_ERR(type->typ_procroot);
type->typ_procroot = NULL;
GOTO (failed, rc);
}
+#endif
spin_lock(&obd_types_lock);
list_add(&type->typ_chain, &obd_types);
return NULL;
}
-static void obd_cleanup_qunit_cache(void)
-{
-#ifdef HAVE_QUOTA_SUPPORT
- int i;
- ENTRY;
-
- spin_lock(&qunit_hash_lock);
- for (i = 0; i < NR_DQHASH; i++)
- LASSERT(list_empty(qunit_hash + i));
- spin_unlock(&qunit_hash_lock);
-
- if (qunit_cachep) {
- LASSERTF(kmem_cache_destroy(qunit_cachep) == 0,
- "Cannot destroy ll_qunit_cache\n");
- qunit_cachep = NULL;
- }
- EXIT;
-#endif
-}
void obd_cleanup_caches(void)
{
"Cannot destory ll_import_cache\n");
import_cachep = NULL;
}
- obd_cleanup_qunit_cache();
EXIT;
}
-static int obd_init_qunit_cache(void)
-{
-
-#ifdef HAVE_QUOTA_SUPPORT
- int i;
- ENTRY;
-
- LASSERT(qunit_cachep == NULL);
- qunit_cachep = kmem_cache_create("ll_qunit_cache",
- sizeof(struct lustre_qunit),
- 0, 0, NULL, NULL);
- if (!qunit_cachep)
- RETURN(-ENOMEM);
-
- spin_lock(&qunit_hash_lock);
- for (i = 0; i < NR_DQHASH; i++)
- INIT_LIST_HEAD(qunit_hash + i);
- spin_unlock(&qunit_hash_lock);
-#endif
- RETURN(0);
-}
-
int obd_init_caches(void)
{
- int rc = 0;
ENTRY;
LASSERT(obdo_cachep == NULL);
if (!import_cachep)
GOTO(out, -ENOMEM);
- rc = obd_init_qunit_cache();
- if (rc)
- GOTO(out, rc);
-
RETURN(0);
out:
obd_cleanup_caches();
ENTRY;
lock_kernel();
- libcfs_daemonize("ping_evictor");
+
+ /* ptlrpc_daemonize() */
+ exit_mm(current);
+ lustre_daemonize_helper();
+ set_fs_pwd(current->fs, init_task.fs->pwdmnt, init_task.fs->pwd);
+ exit_files(current);
+ reparent_to_init();
+ THREAD_NAME(current->comm, sizeof(current->comm), "ping_evictor");
+
SIGNAL_MASK_LOCK(current, flags);
sigfillset(¤t->blocked);
RECALC_SIGPENDING;
class_export_get(exp);
spin_unlock(&obd->obd_dev_lock);
LCONSOLE_WARN("%s: haven't heard from %s in %ld"
- " seconds. I think it's dead, "
- "and I am evicting it.\n",
- obd->obd_name,
+ " seconds. Last request was at %ld. "
+ "I think it's dead, and I am evicting "
+ "it.\n", obd->obd_name,
obd_export_nid2str(exp),
(long)(CURRENT_SECONDS -
- exp->exp_last_request_time));
+ exp->exp_last_request_time),
+ exp->exp_last_request_time);
class_fail_export(exp);
/* Check if the oldest entry is expired. */
if (CURRENT_SECONDS > (oldest_time +
(3 * obd_timeout / 2) + extra_delay)) {
- /* We need a second timer, in case the net was
- * down and it just came back. Since the pinger
- * may skip every other PING_INTERVAL (see note in
- * ptlrpc_pinger_main), we better wait for 3. */
+ /* We need a second timer, in case the net was down and
+ * it just came back. Since the pinger may skip every
+ * other PING_INTERVAL (see note in ptlrpc_pinger_main),
+ * we better wait for 3. */
exp->exp_obd->obd_eviction_timer = CURRENT_SECONDS +
3 * PING_INTERVAL;
CDEBUG(D_HA, "%s: Think about evicting %s from %ld\n",
for (i = 0; i < num_to_evict; i++) {
exports_evicted++;
- CERROR("evicting NID '%s' (%s) #%d at adminstrative request\n",
- nid, doomed_exp[i]->exp_client_uuid.uuid,
+ CWARN("%s: evict NID '%s' (%s) #%d at adminstrative request\n",
+ obd->obd_name, nid, doomed_exp[i]->exp_client_uuid.uuid,
exports_evicted);
class_fail_export(doomed_exp[i]);
class_export_put(doomed_exp[i]);
}
if (!exports_evicted)
- CERROR("can't disconnect NID '%s': no exports found\n", nid);
+ CERROR("%s: can't disconnect NID '%s': no exports found\n",
+ obd->obd_name, nid);
return exports_evicted;
}
EXPORT_SYMBOL(obd_export_evict_by_nid);
spin_unlock(&obd->obd_dev_lock);
if (doomed_exp == NULL) {
- CERROR("can't disconnect %s: no exports found\n", uuid);
+ CERROR("%s: can't disconnect %s: no exports found\n",
+ obd->obd_name, uuid);
} else {
- CERROR("evicting %s at adminstrative request\n",
- doomed_exp->exp_client_uuid.uuid);
+ CWARN("%s: evicting %s at adminstrative request\n",
+ obd->obd_name, doomed_exp->exp_client_uuid.uuid);
class_fail_export(doomed_exp);
class_export_put(doomed_exp);
exports_evicted++;
loghandle->lgh_id.lgl_oid,
loghandle->lgh_id.lgl_ogen);
GOTO(out, rc);
+ } else if (rc == LLOG_DEL_RECORD) {
+ llog_cancel_rec(loghandle, rec->lrh_index);
+ rc = 0;
}
if (rc)
GOTO(out, rc);
}
EXPORT_SYMBOL(llog_get_size);
+int llog_reverse_process(struct llog_handle *loghandle, llog_cb_t cb,
+ void *data, void *catdata)
+{
+ struct llog_log_hdr *llh = loghandle->lgh_hdr;
+ struct llog_process_cat_data *cd = catdata;
+ void *buf;
+ int rc = 0, first_index = 1, index, idx;
+ struct llog_rec_tail *tail;
+ ENTRY;
+
+ OBD_ALLOC(buf, LLOG_CHUNK_SIZE);
+ if (!buf)
+ RETURN(-ENOMEM);
+
+ if (cd != NULL)
+ first_index = cd->first_idx + 1;
+ if (cd != NULL && cd->last_idx)
+ index = cd->last_idx;
+ else
+ index = LLOG_BITMAP_BYTES * 8 - 1;
+
+ while (rc == 0) {
+ struct llog_rec_hdr *rec;
+
+ /* skip records not set in bitmap */
+ while (index >= first_index &&
+ !ext2_test_bit(index, llh->llh_bitmap))
+ --index;
+
+ LASSERT(index >= first_index - 1);
+ if (index == first_index - 1)
+ break;
+
+ /* get the buf with our target record; avoid old garbage */
+ memset(buf, 0, LLOG_CHUNK_SIZE);
+ rc = llog_prev_block(loghandle, index, buf, LLOG_CHUNK_SIZE);
+ if (rc)
+ GOTO(out, rc);
+
+ rec = buf;
+ idx = le32_to_cpu(rec->lrh_index);
+ if (idx < index)
+ CDEBUG(D_HA, "index %u : idx %u\n", index, idx);
+ while (idx < index) {
+ rec = ((void *)rec + le32_to_cpu(rec->lrh_len));
+ idx++;
+ }
+
+ /* process records in buffer, starting where we found one */
+ while ((void *)rec >= buf) {
+ if (rec->lrh_index == 0)
+ GOTO(out, 0); /* no more records */
+
+ /* if set, process the callback on this record */
+ if (ext2_test_bit(index, llh->llh_bitmap)) {
+ rc = cb(loghandle, rec, data);
+ if (rc == LLOG_PROC_BREAK) {
+ CWARN("recovery from log: "LPX64":%x"
+ " stopped\n",
+ loghandle->lgh_id.lgl_oid,
+ loghandle->lgh_id.lgl_ogen);
+ GOTO(out, rc);
+ }
+ if (rc)
+ GOTO(out, rc);
+ }
+
+ /* previous record, still in buffer? */
+ --index;
+ if (index < first_index)
+ GOTO(out, rc = 0);
+ tail = (void *)rec - sizeof(struct llog_rec_tail);
+ rec = ((void *)rec - le32_to_cpu(tail->lrt_len));
+ }
+ }
+
+out:
+ if (buf)
+ OBD_FREE(buf, LLOG_CHUNK_SIZE);
+ RETURN(rc);
+}
+EXPORT_SYMBOL(llog_reverse_process);
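The backward walk above relies on each llog record ending in a llog_rec_tail whose lrt_len equals the record's total length: the bytes immediately before a record are the previous record's tail, so stepping back lrt_len bytes from a record's start lands on the previous record. A minimal user-space sketch of that tail walk, using invented demo structures rather than the real llog types:

/* Hedged sketch, not Lustre code: two fake records of 12 and 8 bytes,
 * each ending in a 4-byte tail holding the record's total length. */
#include <stdio.h>
#include <string.h>

struct tail { unsigned int len; };

int main(void)
{
        unsigned char buf[20];
        struct tail t1 = { 12 }, t2 = { 8 };
        unsigned char *rec;

        memcpy(buf + 12 - sizeof(t1), &t1, sizeof(t1)); /* record 1 tail */
        memcpy(buf + 20 - sizeof(t2), &t2, sizeof(t2)); /* record 2 tail */

        rec = buf + 12;                 /* start of the last record */
        for (;;) {
                struct tail *t;

                printf("record at offset %ld\n", (long)(rec - buf));
                if (rec == buf)
                        break;
                /* the previous record's tail sits right before us */
                t = (struct tail *)(rec - sizeof(struct tail));
                rec -= t->len;          /* step back one whole record */
        }
        return 0;
}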
}
EXPORT_SYMBOL(llog_cat_process);
+static int llog_cat_reverse_process_cb(struct llog_handle *cat_llh,
+ struct llog_rec_hdr *rec, void *data)
+{
+ struct llog_process_data *d = data;
+ struct llog_logid_rec *lir = (struct llog_logid_rec *)rec;
+ struct llog_handle *llh;
+ int rc;
+
+ if (le32_to_cpu(rec->lrh_type) != LLOG_LOGID_MAGIC) {
+ CERROR("invalid record in catalog\n");
+ RETURN(-EINVAL);
+ }
+ CWARN("processing log "LPX64":%x at index %u of catalog "LPX64"\n",
+ lir->lid_id.lgl_oid, lir->lid_id.lgl_ogen,
+ le32_to_cpu(rec->lrh_index), cat_llh->lgh_id.lgl_oid);
+
+ rc = llog_cat_id2handle(cat_llh, &llh, &lir->lid_id);
+ if (rc) {
+ CERROR("Cannot find handle for log "LPX64"\n",
+ lir->lid_id.lgl_oid);
+ RETURN(rc);
+ }
+
+ rc = llog_reverse_process(llh, d->lpd_cb, d->lpd_data, NULL);
+ RETURN(rc);
+}
+
+int llog_cat_reverse_process(struct llog_handle *cat_llh,
+ llog_cb_t cb, void *data)
+{
+ struct llog_process_data d;
+ struct llog_process_cat_data cd;
+ struct llog_log_hdr *llh = cat_llh->lgh_hdr;
+ int rc;
+ ENTRY;
+
+ LASSERT(llh->llh_flags & cpu_to_le32(LLOG_F_IS_CAT));
+ d.lpd_data = data;
+ d.lpd_cb = cb;
+
+ if (llh->llh_cat_idx > cat_llh->lgh_last_idx) {
+ CWARN("catalog "LPX64" crosses index zero\n",
+ cat_llh->lgh_id.lgl_oid);
+
+ cd.first_idx = 0;
+ cd.last_idx = cat_llh->lgh_last_idx;
+ rc = llog_reverse_process(cat_llh, llog_cat_reverse_process_cb,
+ &d, &cd);
+ if (rc != 0)
+ RETURN(rc);
+
+ cd.first_idx = le32_to_cpu(llh->llh_cat_idx);
+ cd.last_idx = 0;
+ rc = llog_reverse_process(cat_llh, llog_cat_reverse_process_cb,
+ &d, &cd);
+ } else {
+ rc = llog_reverse_process(cat_llh, llog_cat_reverse_process_cb,
+ &d, NULL);
+ }
+
+ RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_reverse_process);
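Concretely, the split above kicks in when the catalog's first live index (llh_cat_idx) exceeds its last written index (lgh_last_idx), meaning the live range wraps past index zero. A small sketch with invented index values showing which ranges the two llog_reverse_process() passes end up covering:

/* Hedged illustration only; the index values are invented. */
#include <stdio.h>

int main(void)
{
        unsigned int cat_idx = 60000;   /* llh_cat_idx: first live index */
        unsigned int last_idx = 3;      /* lgh_last_idx: last written */

        if (cat_idx > last_idx) {       /* live range wraps past zero */
                printf("pass 1: indices %u down to 1\n", last_idx);
                printf("pass 2: bitmap end down to %u\n", cat_idx + 1);
        } else {
                printf("one pass: bitmap end down to 1\n");
        }
        return 0;
}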
+
int llog_cat_set_first_idx(struct llog_handle *cathandle, int index)
{
struct llog_log_hdr *llh = cathandle->lgh_hdr;
rc = fsfilt_read_record(loghandle->lgh_ctxt->loc_exp->exp_obd,
loghandle->lgh_file, buf, len,
&ppos);
-
if (rc) {
CERROR("Cant read llog block at log id "LPU64
"/%u offset "LPU64"\n",
RETURN(-EIO);
}
+static int llog_lvfs_prev_block(struct llog_handle *loghandle,
+ int prev_idx, void *buf, int len)
+{
+ __u64 cur_offset;
+ int rc;
+ ENTRY;
+
+ if (len == 0 || len & (LLOG_CHUNK_SIZE - 1))
+ RETURN(-EINVAL);
+
+ CDEBUG(D_OTHER, "looking for log index %u n", prev_idx);
+
+ cur_offset = LLOG_CHUNK_SIZE;
+ llog_skip_over(&cur_offset, 0, prev_idx);
+
+ while (cur_offset < loghandle->lgh_file->f_dentry->d_inode->i_size) {
+ struct llog_rec_hdr *rec;
+ struct llog_rec_tail *tail;
+ loff_t ppos;
+
+ ppos = cur_offset;
+
+ rc = fsfilt_read_record(loghandle->lgh_ctxt->loc_exp->exp_obd,
+ loghandle->lgh_file, buf, len,
+ &ppos);
+ if (rc) {
+ CERROR("Cant read llog block at log id "LPU64
+ "/%u offset "LPU64"\n",
+ loghandle->lgh_id.lgl_oid,
+ loghandle->lgh_id.lgl_ogen,
+ cur_offset);
+ RETURN(rc);
+ }
+
+ /* put number of bytes read into rc to make code simpler */
+ rc = ppos - cur_offset;
+ cur_offset = ppos;
+
+ if (rc == 0) /* end of file, nothing to do */
+ RETURN(0);
+
+ if (rc < sizeof(*tail)) {
+ CERROR("Invalid llog block at log id "LPU64"/%u offset "
+ LPU64"\n", loghandle->lgh_id.lgl_oid,
+ loghandle->lgh_id.lgl_ogen, cur_offset);
+ RETURN(-EINVAL);
+ }
+
+ tail = buf + rc - sizeof(struct llog_rec_tail);
+
+ /* this shouldn't happen */
+ if (tail->lrt_index == 0) {
+ CERROR("Invalid llog tail at log id "LPU64"/%u offset "
+ LPU64"\n", loghandle->lgh_id.lgl_oid,
+ loghandle->lgh_id.lgl_ogen, cur_offset);
+ RETURN(-EINVAL);
+ }
+ if (le32_to_cpu(tail->lrt_index) < prev_idx)
+ continue;
+
+ /* sanity check that the start of the new buffer is no farther
+ * than the record that we wanted. This shouldn't happen. */
+ rec = buf;
+ if (le32_to_cpu(rec->lrh_index) > prev_idx) {
+ CERROR("missed desired record? %u > %u\n",
+ le32_to_cpu(rec->lrh_index), prev_idx);
+ RETURN(-ENOENT);
+ }
+ RETURN(0);
+ }
+ RETURN(-EIO);
+}
+
static struct file *llog_filp_open(char *name, int flags, int mode)
{
char *logname;
oa = obdo_alloc();
if (oa == NULL)
GOTO(cleanup, rc = -ENOMEM);
- /* XXX get some filter group constants */
- oa->o_gr = 1;
+
+ oa->o_gr = FILTER_GROUP_LLOG;
oa->o_valid = OBD_MD_FLGENER | OBD_MD_FLGROUP;
+
rc = obd_create(ctxt->loc_exp, oa, NULL, NULL);
if (rc)
GOTO(cleanup, rc);
if (rc)
GOTO(out, rc);
- rc = obd_destroy(handle->lgh_ctxt->loc_exp, oa, NULL, NULL);
+ rc = obd_destroy(handle->lgh_ctxt->loc_exp, oa, NULL, NULL, NULL);
out:
obdo_free(oa);
RETURN(rc);
struct llog_operations llog_lvfs_ops = {
lop_write_rec: llog_lvfs_write_rec,
lop_next_block: llog_lvfs_next_block,
+ lop_prev_block: llog_lvfs_prev_block,
lop_read_header: llog_lvfs_read_header,
lop_create: llog_lvfs_create,
lop_destroy: llog_lvfs_destroy,
return 0;
}
+static int llog_lvfs_prev_block(struct llog_handle *loghandle,
+ int prev_idx, void *buf, int len)
+{
+ LBUG();
+ return 0;
+}
+
static int llog_lvfs_create(struct llog_ctxt *ctxt, struct llog_handle **res,
struct llog_logid *logid, char *name)
{
struct llog_operations llog_lvfs_ops = {
lop_write_rec: llog_lvfs_write_rec,
lop_next_block: llog_lvfs_next_block,
+ lop_prev_block: llog_lvfs_prev_block,
lop_read_header: llog_lvfs_read_header,
lop_create: llog_lvfs_create,
lop_destroy: llog_lvfs_destroy,
int llog_add(struct llog_ctxt *ctxt, struct llog_rec_hdr *rec,
struct lov_stripe_md *lsm, struct llog_cookie *logcookies,
- int numcookies, llog_fill_rec_cb_t fill_cb)
+ int numcookies)
{
int rc;
ENTRY;
CTXT_CHECK_OP(ctxt, add, -EOPNOTSUPP);
- rc = CTXTP(ctxt, add)(ctxt, rec, lsm, logcookies, numcookies, fill_cb);
+ rc = CTXTP(ctxt, add)(ctxt, rec, lsm, logcookies, numcookies);
RETURN(rc);
}
EXPORT_SYMBOL(llog_add);
/* add for obdfilter/sz and mds/unlink */
int llog_obd_origin_add(struct llog_ctxt *ctxt,
struct llog_rec_hdr *rec, struct lov_stripe_md *lsm,
- struct llog_cookie *logcookies, int numcookies,
- llog_fill_rec_cb_t fill_cb)
+ struct llog_cookie *logcookies, int numcookies)
{
struct llog_handle *cathandle;
int rc;
}
EXPORT_SYMBOL(lustre_swab_llogd_conn_body);
-void lustre_swab_ll_fid (struct ll_fid *fid)
+void lustre_swab_ll_fid(struct ll_fid *fid)
{
__swab64s (&fid->id);
__swab32s (&fid->generation);
GOTO(out, rc);
}
+ CWARN("5f: print plain log entries reversely.. expect 6\n");
+ rc = llog_cat_reverse_process(llh, plain_print_cb, "foobar");
+ if (rc) {
+ CERROR("5f: reversely process with plain_print_cb failed: %d\n", rc);
+ GOTO(out, rc);
+ }
+
out:
CWARN("5: close re-opened catalog\n");
if (llh)
if (rc)
CERROR("6: llog_process failed %d\n", rc);
+ rc = llog_reverse_process(llh, (llog_cb_t)plain_print_cb, NULL, NULL);
+ if (rc)
+ CERROR("6: llog_reverse_process failed %d\n", rc);
+
parse_out:
rc = llog_close(llh);
if (rc) {
if (rc)
RETURN(rc);
- llog_test_rand = ll_insecure_random_int();
+ llog_test_rand = ll_rand();
rc = llog_run_tests(obd);
if (rc)
return snprintf(page, count, "%s\n", conn->c_remote_uuid.uuid);
}
+static const char *obd_connect_names[] = {
+ "read_only",
+ "lov_index",
+ "unused",
+ "write_grant",
+ "server_lock",
+ "version",
+ "request_portal",
+ "acl",
+ "xattr",
+ "create_on_write",
+ "truncate_lock",
+ "initial_transno",
+ "inode_bit_locks",
+ "join_file",
+ NULL
+};
+
+int lprocfs_rd_connect_flags(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ struct obd_device *obd = data;
+ __u64 mask = 1, flags;
+ int i, ret;
+
+ if (obd == NULL)
+ return 0;
+
+ flags = obd->u.cli.cl_import->imp_connect_data.ocd_connect_flags;
+ ret = snprintf(page, count, "flags="LPX64"\n", flags);
+ for (i = 0; obd_connect_names[i] != NULL; i++, mask <<= 1) {
+ if (flags & mask)
+ ret += snprintf(page + ret, count - ret, "%s\n",
+ obd_connect_names[i]);
+ }
+ if (flags & ~(mask - 1))
+ ret += snprintf(page + ret, count - ret,
+ "unknown flags "LPX64"\n", flags & ~(mask - 1));
+
+ return ret;
+}
+EXPORT_SYMBOL(lprocfs_rd_connect_flags);
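The handler above walks a one-bit mask across the name table, printing a line per set flag, and finally reports any bits beyond the known names. A user-space sketch of the same decode loop, with an abbreviated name table and an invented flags value:

/* Hedged sketch; the table is a truncated copy of obd_connect_names[]. */
#include <stdio.h>

static const char *names[] = { "read_only", "lov_index", "unused",
                               "write_grant", "server_lock", NULL };

int main(void)
{
        unsigned long long flags = 0x13;        /* invented example */
        unsigned long long mask = 1;
        int i;

        printf("flags=%#llx\n", flags);
        for (i = 0; names[i] != NULL; i++, mask <<= 1)
                if (flags & mask)
                        printf("%s\n", names[i]);
        if (flags & ~(mask - 1))
                printf("unknown flags %#llx\n", flags & ~(mask - 1));
        return 0;
}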
+
int lprocfs_rd_num_exports(char *page, char **start, off_t off, int count,
int *eof, void *data)
{
LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, packmd);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, unpackmd);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, checkmd);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, preallocate);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, create);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, unpin);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, import_event);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, notify);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, health_check);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotacheck);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotactl);
- LPROCFS_OBD_OP_INIT(num_private_stats, stats, health_check);
for (i = num_private_stats; i < num_stats; i++) {
/* If this LBUGs, it is likely that an obd
data = list_entry(deathrow.next, struct uuid_nid_data, un_list);
list_del (&data->un_list);
+ CDEBUG(D_INFO, "del uuid %s\n", data->un_uuid);
OBD_FREE(data->un_uuid, strlen(data->un_uuid) + 1);
OBD_FREE(data, sizeof(*data));
static int lustre_start_mgc(struct super_block *sb)
{
struct lustre_handle mgc_conn = {0, };
+ struct obd_connect_data ocd = { 0 };
struct lustre_sb_info *lsi = s2lsi(sb);
struct obd_device *obd;
struct obd_export *exp;
CERROR("can't set init_recov_bk %d\n", rc);
GOTO(out, rc);
}
+
+ /* FIXME add ACL support? */
+ //ocd.ocd_connect_flags = OBD_CONNECT_ACL;
/* We connect to the MGS at setup, and don't disconnect until cleanup */
- rc = obd_connect(&mgc_conn, obd, &(obd->obd_uuid), NULL);
+ rc = obd_connect(&mgc_conn, obd, &(obd->obd_uuid), &ocd);
if (rc) {
CERROR("connect failed %d\n", rc);
GOTO(out, rc);
RETURN(-EINVAL);
}
- /* Try to detect old lmd data in options */
+ /* Options should be a string - try to detect old lmd data */
if ((raw->lmd_magic & 0xffffff00) == (LMD_MAGIC & 0xffffff00)) {
LCONSOLE_ERROR("You're using an old version of "
"/sbin/mount.lustre. Please install version "
while(*s1) {
while (*s1 == ' ' || *s1 == ',')
s1++;
+ /* FIXME do something with the RECOVER flag - see lconf */
if (strncmp(s1, "recov", 5) == 0)
lmd->lmd_flags |= LMD_FLG_RECOVER;
if (strncmp(s1, "norecov", 7) == 0)
lmd->lmd_flags &= ~LMD_FLG_RECOVER;
if (strncmp(s1, "nosvc", 5) == 0)
lmd->lmd_flags |= LMD_FLG_NOSVC;
+ /* Client options are parsed in ll_options: e.g. flock,
+ user_xattr, acl */
/* Linux 2.4 doesn't pass the device, so we stuck it at the
end of the options. */
attr->ia_gid = oa->o_gid;
attr->ia_valid |= ATTR_GID;
}
+
if (valid & OBD_MD_FLFLAGS) {
attr->ia_attr_flags = oa->o_flags;
attr->ia_valid |= ATTR_ATTR_FLAG;
dst->o_generation = src->i_generation;
newvalid |= OBD_MD_FLGENER;
}
+ if (valid & OBD_MD_FLFID) {
+ dst->o_fid = src->i_ino;
+ newvalid |= OBD_MD_FLFID;
+ }
dst->o_valid |= newvalid;
}
void obdo_cpy_md(struct obdo *dst, struct obdo *src, obd_flag valid)
{
#ifdef __KERNEL__
+ CLASSERT(sizeof(struct lustre_handle) +
+ sizeof(struct llog_cookie) <= sizeof(src->o_inline));
+
CDEBUG(D_INODE, "src obdo "LPX64" valid "LPX64", dst obdo "LPX64"\n",
src->o_id, src->o_valid, dst->o_id);
#endif
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * This file is part of the Lustre file system, http://www.lustre.org
+ * Lustre is a trademark of Cluster File Systems, Inc.
+ *
+ * concatenation of the following two 16-bit multiply-with-carry generators
+ * x(n)=a*x(n-1)+carry mod 2^16 and y(n)=b*y(n-1)+carry mod 2^16,
+ * number and carry packed within the same 32-bit integer.
+ * algorithm recommended by Marsaglia
+ ******************************************************************/
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+
+#ifdef __KERNEL__
+#include <linux/module.h>
+#else
+#include <liblustre.h>
+#endif
+
+/*
+From: George Marsaglia <geo@stat.fsu.edu>
+Newsgroups: sci.math
+Subject: Re: A RANDOM NUMBER GENERATOR FOR C
+Date: Tue, 30 Sep 1997 05:29:35 -0700
+
+ * You may replace the two constants 36969 and 18000 by any
+ * pair of distinct constants from this list:
+ * 18000 18030 18273 18513 18879 19074 19098 19164 19215 19584
+ * 19599 19950 20088 20508 20544 20664 20814 20970 21153 21243
+ * 21423 21723 21954 22125 22188 22293 22860 22938 22965 22974
+ * 23109 23124 23163 23208 23508 23520 23553 23658 23865 24114
+ * 24219 24660 24699 24864 24948 25023 25308 25443 26004 26088
+ * 26154 26550 26679 26838 27183 27258 27753 27795 27810 27834
+ * 27960 28320 28380 28689 28710 28794 28854 28959 28980 29013
+ * 29379 29889 30135 30345 30459 30714 30903 30963 31059 31083
+ * (or any other 16-bit constants k for which both k*2^16-1
+ * and k*2^15-1 are prime) */
+
+#define RANDOM_CONST_A 18030
+#define RANDOM_CONST_B 29013
+
+static unsigned int seed_x = 521288629;
+static unsigned int seed_y = 362436069;
+unsigned int ll_rand(void)
+{
+
+ seed_x = RANDOM_CONST_A * (seed_x & 65535) + (seed_x >> 16);
+ seed_y = RANDOM_CONST_B * (seed_y & 65535) + (seed_y >> 16);
+
+ return ((seed_x << 16) + (seed_y & 65535));
+}
+EXPORT_SYMBOL(ll_rand);
+
+void ll_srand(unsigned int seed1, unsigned int seed2)
+{
+ if (seed1)
+ seed_x = seed1; /* use default seeds if parameter is 0 */
+ if (seed2)
+ seed_y = seed2;
+}
+EXPORT_SYMBOL(ll_srand);
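Since ll_rand()/ll_srand() use nothing from the kernel beyond EXPORT_SYMBOL, the generator can be exercised verbatim in user space. A minimal sketch with the same constants and default seeds as above; the seeds passed to ll_srand() below are arbitrary:

/* Hedged user-space copy of the generator above, for experimentation. */
#include <stdio.h>

static unsigned int seed_x = 521288629;
static unsigned int seed_y = 362436069;

static void ll_srand(unsigned int seed1, unsigned int seed2)
{
        if (seed1)
                seed_x = seed1;         /* 0 keeps the default seed */
        if (seed2)
                seed_y = seed2;
}

static unsigned int ll_rand(void)
{
        seed_x = 18030 * (seed_x & 65535) + (seed_x >> 16);
        seed_y = 29013 * (seed_y & 65535) + (seed_y >> 16);
        return (seed_x << 16) + (seed_y & 65535);
}

int main(void)
{
        int i;

        ll_srand(123, 456);             /* arbitrary demo seeds */
        for (i = 0; i < 4; i++)
                printf("%u\n", ll_rand());
        return 0;
}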
}
int echo_destroy(struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *ea, struct obd_trans_info *oti)
+ struct lov_stripe_md *ea, struct obd_trans_info *oti,
+ struct obd_export *md_exp)
{
struct obd_device *obd = class_exp2obd(exp);
if (lsm->lsm_stripe_size == 0)
lsm->lsm_stripe_size = PAGE_SIZE;
- idx = ll_insecure_random_int();
+ idx = ll_rand();
/* setup stripes: indices + default ids if required */
for (i = 0; i < lsm->lsm_stripe_count; i++) {
oa->o_id = ++last_object_id;
if (on_target) {
- /* XXX get some filter group constants */
- oa->o_gr = 2;
+ oa->o_gr = FILTER_GROUP_ECHO;
oa->o_valid |= OBD_MD_FLGROUP;
+
rc = obd_create(ec->ec_exp, oa, &lsm, oti);
if (rc != 0)
goto failed;
oa->o_id, on_target ? " (undoing create)" : "");
if (on_target)
- obd_destroy(ec->ec_exp, oa, lsm, oti);
+ obd_destroy(ec->ec_exp, oa, lsm, oti, NULL);
rc = -EEXIST;
goto failed;
rnb[i].len = PAGE_SIZE;
}
- /* XXX this can't be the best.. */
- memset(oti, 0, sizeof(*oti));
ioo.ioo_bufcnt = npages;
+ oti->oti_transno = 0;
ret = obd_preprw(rw, exp, oa, 1, &ioo, npages, rnb, lnb, oti);
if (ret != 0)
{
struct obd_device *obd = class_exp2obd(exp);
struct echo_client_obd *ec = &obd->u.echo_client;
- struct obd_trans_info dummy_oti;
+ struct obd_trans_info dummy_oti = { .oti_thread_id = -1 };
struct ec_object *eco;
int rc;
ENTRY;
if (rc)
RETURN(rc);
- memset(&dummy_oti, 0, sizeof(dummy_oti));
-
data->ioc_obdo1.o_valid &= ~OBD_MD_FLHANDLE;
data->ioc_obdo1.o_valid |= OBD_MD_FLGROUP;
- data->ioc_obdo1.o_gr = 2;
+ data->ioc_obdo1.o_gr = FILTER_GROUP_ECHO;
switch((long)data->ioc_pbuf1) {
case 1:
rc = echo_get_object (&eco, obd, &data->ioc_obdo1);
if (rc == 0) {
oa = &data->ioc_obdo1;
- oa->o_gr = 2;
+ oa->o_gr = FILTER_GROUP_ECHO;
oa->o_valid |= OBD_MD_FLGROUP;
rc = obd_destroy(ec->ec_exp, oa, eco->eco_lsm,
- &dummy_oti);
+ &dummy_oti, NULL);
if (rc == 0)
eco->eco_deleted = 1;
echo_put_object(eco);
#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
# include <linux/mount.h>
# include <linux/buffer_head.h>
+# include <linux/security.h>
#endif
#include <linux/obd_class.h>
#include <libcfs/list.h>
#include <linux/lustre_disk.h>
#include <linux/lustre_quota.h>
+#include <linux/quotaops.h>
#include "filter_internal.h"
* there's no need for extra complication here
*/
if (new_client) {
- cl_idx = find_first_zero_bit(bitmap, FILTER_LR_MAX_CLIENTS);
+ cl_idx = find_first_zero_bit(bitmap, LR_MAX_CLIENTS);
repeat:
- if (cl_idx >= FILTER_LR_MAX_CLIENTS) {
- CERROR("no client slots - fix FILTER_LR_MAX_CLIENTS\n");
+ if (cl_idx >= LR_MAX_CLIENTS) {
+ CERROR("no client slots - fix LR_MAX_CLIENTS\n");
RETURN(-EOVERFLOW);
}
if (test_and_set_bit(cl_idx, bitmap)) {
- cl_idx = find_next_zero_bit(bitmap,
- FILTER_LR_MAX_CLIENTS,
+ cl_idx = find_next_zero_bit(bitmap, LR_MAX_CLIENTS,
cl_idx);
goto repeat;
}
{
OBD_FREE(filter->fo_fsd, sizeof(*filter->fo_fsd));
filter->fo_fsd = NULL;
- OBD_FREE(filter->fo_last_rcvd_slots, FILTER_LR_MAX_CLIENTS / 8);
+ OBD_FREE(filter->fo_last_rcvd_slots, LR_MAX_CLIENTS / 8);
filter->fo_last_rcvd_slots = NULL;
return 0;
}
int rc;
ENTRY;
+ if (filter->fo_last_objid_files[group] == NULL) {
+ CERROR("Object group "LPU64" not fully setup; not updating "
+ "last_objid\n", group);
+ RETURN(-EINVAL);
+ }
+
CDEBUG(D_INODE, "%s: server last_objid for group "LPU64": "LPU64"\n",
obd->obd_name, group, filter->fo_last_objids[group]);
int rc;
/* ensure padding in the struct is the correct size */
- LASSERT (offsetof(struct lr_server_data, lsd_padding) +
- sizeof(fsd->lsd_padding) == FILTER_LR_SERVER_SIZE);
- LASSERT (offsetof(struct filter_client_data, fcd_padding) +
- sizeof(fcd->fcd_padding) == FILTER_LR_CLIENT_SIZE);
- LASSERT(FILTER_LR_CLIENT_SIZE == LR_CLIENT_SIZE);
- LASSERT(FILTER_LR_CLIENT_START == LR_CLIENT_START);
+ CLASSERT (offsetof(struct lr_server_data, lsd_padding) +
+ sizeof(fsd->lsd_padding) == LR_SERVER_SIZE);
+ CLASSERT (offsetof(struct filter_client_data, fcd_padding) +
+ sizeof(fcd->fcd_padding) == LR_CLIENT_SIZE);
OBD_ALLOC(fsd, sizeof(*fsd));
if (!fsd)
RETURN(-ENOMEM);
filter->fo_fsd = fsd;
- OBD_ALLOC(filter->fo_last_rcvd_slots, FILTER_LR_MAX_CLIENTS / 8);
+ OBD_ALLOC(filter->fo_last_rcvd_slots, LR_MAX_CLIENTS / 8);
if (filter->fo_last_rcvd_slots == NULL) {
OBD_FREE(fsd, sizeof(*fsd));
RETURN(-ENOMEM);
memcpy(fsd->lsd_uuid, obd->obd_uuid.uuid,sizeof(fsd->lsd_uuid));
fsd->lsd_last_transno = 0;
mount_count = fsd->lsd_mount_count = 0;
- fsd->lsd_server_size = cpu_to_le32(FILTER_LR_SERVER_SIZE);
- fsd->lsd_client_start = cpu_to_le32(FILTER_LR_CLIENT_START);
- fsd->lsd_client_size = cpu_to_le16(FILTER_LR_CLIENT_SIZE);
+ fsd->lsd_server_size = cpu_to_le32(LR_SERVER_SIZE);
+ fsd->lsd_client_start = cpu_to_le32(LR_CLIENT_START);
+ fsd->lsd_client_size = cpu_to_le16(LR_CLIENT_SIZE);
fsd->lsd_subdir_count = cpu_to_le16(FILTER_SUBDIR_COUNT);
filter->fo_subdir_count = FILTER_SUBDIR_COUNT;
- fsd->lsd_feature_compat = cpu_to_le32(LR_COMPAT_COMMON_LR);
+ fsd->lsd_feature_compat = cpu_to_le32(OBD_COMPAT_COMMON_LR);
} else {
rc = fsfilt_read_record(obd, filp, fsd, sizeof(*fsd), &off);
if (rc) {
}
if (fsd->lsd_feature_incompat & ~cpu_to_le32(FILTER_INCOMPAT_SUPP)) {
- CERROR("unsupported feature %x\n",
- le32_to_cpu(fsd->lsd_feature_incompat) &
+ CERROR("%s: unsupported incompat filesystem feature(s) %x\n",
+ obd->obd_name, le32_to_cpu(fsd->lsd_feature_incompat) &
~FILTER_INCOMPAT_SUPP);
GOTO(err_fsd, rc = -EINVAL);
}
if (fsd->lsd_feature_rocompat & ~cpu_to_le32(FILTER_ROCOMPAT_SUPP)) {
- CERROR("read-only feature %x\n",
- le32_to_cpu(fsd->lsd_feature_rocompat) &
+ CERROR("%s: unsupported read-only filesystem feature(s) %x\n",
+ obd->obd_name, le32_to_cpu(fsd->lsd_feature_rocompat) &
~FILTER_ROCOMPAT_SUPP);
/* Do something like remount filesystem read-only */
GOTO(err_fsd, rc = -EINVAL);
GOTO(cleanup_O0, rc);
}
filter->fo_fsd->lsd_feature_incompat |=
- cpu_to_le32(FILTER_INCOMPAT_GROUPS);
+ cpu_to_le32(OBD_INCOMPAT_GROUPS);
rc = filter_update_server_data(obd, filter->fo_rcvd_filp,
filter->fo_fsd, 1);
GOTO(cleanup_O0, rc);
filter->fo_last_objid_files[i] = filp;
if (filp->f_dentry->d_inode->i_size == 0) {
- if (i == 0 && filter->fo_fsd->lsd_unused != 0) {
- /* OST conversion, remove sometime post 1.0 */
- filter->fo_last_objids[0] =
- le64_to_cpu(filter->fo_fsd->lsd_unused);
- CWARN("saving old objid "LPU64" to LAST_ID\n",
- filter->fo_last_objids[0]);
- } else {
- filter->fo_last_objids[i] = FILTER_INIT_OBJID;
- }
+ filter->fo_last_objids[i] = FILTER_INIT_OBJID;
rc = filter_update_last_objid(obd, i, 1);
if (rc)
GOTO(cleanup, rc);
GOTO(err_filp, rc = -ENOENT);
}
- /* steal operations */
- inode = file->f_dentry->d_inode;
- filter->fo_fop = file->f_op;
- filter->fo_iop = inode->i_op;
- filter->fo_aops = inode->i_mapping->a_ops;
+ inode = file->f_dentry->d_parent->d_inode;
+ /* We use i_op->unlink directly in filter_vfs_unlink() */
+ if (!inode->i_op || !inode->i_op->create || !inode->i_op->unlink) {
+ CERROR("%s: filesystem does not support create/unlink ops\n",
+ obd->obd_name);
+ GOTO(err_filp, rc = -EOPNOTSUPP);
+ }
rc = filter_init_server_data(obd, file);
if (rc) {
file = filp_open(HEALTH_CHECK, O_RDWR | O_CREAT, 0644);
if (IS_ERR(file)) {
rc = PTR_ERR(file);
- CERROR("OBD filter: cannot open/create %s rc = %d\n",
+ CERROR("OBD filter: cannot open/create %s rc = %d\n",
HEALTH_CHECK, rc);
GOTO(err_filp, rc);
}
}
/* We never dget the object parent, so DON'T dput it either */
-static void filter_parent_unlock(struct dentry *dparent)
-{
- up(&dparent->d_inode->i_sem);
-}
-
-/* We never dget the object parent, so DON'T dput it either */
struct dentry *filter_parent(struct obd_device *obd, obd_gr group, obd_id objid)
{
struct filter_obd *filter = &obd->u.filter;
return rc ? ERR_PTR(rc) : dparent;
}
+/* We never dget the object parent, so DON'T dput it either */
+static void filter_parent_unlock(struct dentry *dparent)
+{
+ up(&dparent->d_inode->i_sem);
+}
+
/* How to get files, dentries, inodes from object id's.
*
* If dir_dentry is passed, the caller has already locked the parent
int len;
ENTRY;
- if (OBD_FAIL_CHECK(OBD_FAIL_OST_ENOENT)) {
- CERROR("test case OBD_FAIL_OST_ENOENT\n");
+ if (OBD_FAIL_CHECK(OBD_FAIL_OST_ENOENT))
RETURN(ERR_PTR(-ENOENT));
- }
if (id == 0) {
CERROR("fatal: invalid object id 0\n");
RETURN(rc);
}
+/* This is vfs_unlink() without down(i_sem). If we call regular vfs_unlink()
+ * we have 2.6 lock ordering issues with filter_commitrw_write() as it takes
+ * i_sem before starting a handle, while filter_destroy() + vfs_unlink do the
+ * reverse. Caller must take i_sem before starting the transaction and we
+ * drop it here before the inode is removed from the dentry. bug 4180/6984 */
+int filter_vfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+ int rc;
+ ENTRY;
+
+ /* don't need dir->i_zombie for 2.4, it is for rename/unlink of the
+ * dir itself; we already hold dir->i_sem for child create/unlink ops */
+ LASSERT(down_trylock(&dir->i_sem) != 0);
+ LASSERT(down_trylock(&dentry->d_inode->i_sem) != 0);
+
+ /* may_delete() */
+ if (!dentry->d_inode || dentry->d_parent->d_inode != dir)
+ GOTO(out, rc = -ENOENT);
+
+ rc = ll_permission(dir, MAY_WRITE | MAY_EXEC, NULL);
+ if (rc)
+ GOTO(out, rc);
+
+ if (IS_APPEND(dir))
+ GOTO(out, rc = -EPERM);
+
+ /* check_sticky() */
+ if ((dentry->d_inode->i_uid != current->fsuid && !capable(CAP_FOWNER))||
+ IS_APPEND(dentry->d_inode) || IS_IMMUTABLE(dentry->d_inode))
+ GOTO(out, rc = -EPERM);
+
+ /* NOTE: This might need to go outside i_sem, though it isn't clear if
+ * that was done because of journal_start (which is already done
+ * here) or some other ordering issue. */
+ DQUOT_INIT(dir);
+
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+ rc = security_inode_unlink(dir, dentry);
+ if (rc)
+ GOTO(out, rc);
+#endif
+
+ rc = dir->i_op->unlink(dir, dentry);
+out:
+ /* need to drop i_sem before we lose inode reference */
+ up(&dentry->d_inode->i_sem);
+ if (rc == 0)
+ d_delete(dentry);
+
+ RETURN(rc);
+}
+
/* Caller must hold LCK_PW on parent and push us into kernel context.
+ * Caller must hold child i_sem, we drop it always.
* Caller is also required to ensure that dchild->d_inode exists. */
static int filter_destroy_internal(struct obd_device *obd, obd_id objid,
struct dentry *dparent,
{
struct inode *inode = dchild->d_inode;
int rc;
- ENTRY;
if (inode->i_nlink != 1 || atomic_read(&inode->i_count) != 1) {
CERROR("destroying objid %.*s ino %lu nlink %lu count %d\n",
atomic_read(&inode->i_count));
}
- rc = vfs_unlink(dparent->d_inode, dchild);
+ rc = filter_vfs_unlink(dparent->d_inode, dchild);
if (rc)
CERROR("error unlinking objid %.*s: rc %d\n",
dchild->d_name.len, dchild->d_name.name, rc);
- RETURN(rc);
+ return(rc);
}
static int filter_intent_policy(struct ldlm_namespace *ns,
}
RETURN(ELDLM_LOCK_ABORTED);
}
+
/*
* This check is for lock taken in filter_prepare_destroy() that does
* not have l_glimpse_ast set. So the logic is: if there is a lock
* unknown at the time of OST thread creation.
*
* Instead array of iobuf's is attached to struct filter_obd (->fo_iobuf_pool
- * field). This array has size OST_NUM_THREADS, so that each OST thread uses
+ * field). This array has size OST_MAX_THREADS, so that each OST thread uses
* its very own iobuf.
*
* Functions below
*/
static void filter_iobuf_pool_done(struct filter_obd *filter)
{
- void **pool;
+ struct filter_iobuf **pool;
int i;
ENTRY;
pool = filter->fo_iobuf_pool;
if (pool != NULL) {
- for (i = 0; i < OST_NUM_THREADS; ++ i) {
+ for (i = 0; i < filter->fo_iobuf_count; ++ i) {
if (pool[i] != NULL)
filter_free_iobuf(pool[i]);
}
- OBD_FREE(pool, OST_NUM_THREADS * sizeof pool[0]);
+ OBD_FREE(pool, filter->fo_iobuf_count * sizeof pool[0]);
filter->fo_iobuf_pool = NULL;
}
EXIT;
/*
* pre-allocate pool of iobuf's to be used by filter_{prep,commit}rw_write().
*/
-static int filter_iobuf_pool_init(struct filter_obd *filter, int count)
+static int filter_iobuf_pool_init(struct filter_obd *filter)
{
void **pool;
- int i;
- int result = 0;
ENTRY;
- LASSERT(count <= OST_NUM_THREADS);
-
- OBD_ALLOC_GFP(pool, OST_NUM_THREADS * sizeof pool[0], GFP_KERNEL);
- if (pool == NULL)
+ OBD_ALLOC_GFP(filter->fo_iobuf_pool, OST_MAX_THREADS * sizeof(*pool),
+ GFP_KERNEL);
+ if (filter->fo_iobuf_pool == NULL)
RETURN(-ENOMEM);
- filter->fo_iobuf_pool = pool;
- filter->fo_iobuf_count = count;
- for (i = 0; i < count; ++ i) {
- /*
- * allocate kiobuf to be used by i-th OST thread.
- */
- result = filter_alloc_iobuf(filter, OBD_BRW_WRITE,
- PTLRPC_MAX_BRW_PAGES,
- &pool[i]);
- if (result != 0) {
- filter_iobuf_pool_done(filter);
- break;
- }
- }
- RETURN(result);
+ filter->fo_iobuf_count = OST_MAX_THREADS;
+
+ RETURN(0);
}
-/*
- * return iobuf preallocated by filter_iobuf_pool_init() for @thread.
- */
-void *filter_iobuf_get(struct ptlrpc_thread *thread, struct filter_obd *filter)
+/* Return iobuf allocated for @thread_id. We don't know in advance how
+ * many threads there will be, so we allocate a large empty array and only
+ * fill in those slots that are actually in use.
+ * If we haven't allocated a pool entry for this thread before, do so now. */
+void *filter_iobuf_get(struct filter_obd *filter, struct obd_trans_info *oti)
{
- void *kio;
+ int thread_id = oti ? oti->oti_thread_id : -1;
+ struct filter_iobuf *pool = NULL;
+ struct filter_iobuf **pool_place = NULL;
+
+ if (thread_id >= 0) {
+ LASSERT(thread_id < filter->fo_iobuf_count);
+ pool = *(pool_place = &filter->fo_iobuf_pool[thread_id]);
+ }
+
+ if (unlikely(pool == NULL)) {
+ pool = filter_alloc_iobuf(filter, OBD_BRW_WRITE,
+ PTLRPC_MAX_BRW_PAGES);
+ if (pool_place != NULL)
+ *pool_place = pool;
+ }
- LASSERT(thread->t_id < filter->fo_iobuf_count);
- kio = filter->fo_iobuf_pool[thread->t_id];
- LASSERT(kio != NULL);
- return kio;
+ return pool;
}
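The lookup above is a lazily filled per-thread slot array: the first call with a given thread id allocates the slot's buffer, later calls reuse it, and a caller with no thread id (oti == NULL) gets an unpooled buffer. A generic sketch of the pattern (names and sizes invented; presumably the matching filter_iobuf_put() releases buffers that never landed in a slot):

/* Hedged sketch of the lazy per-slot pool in filter_iobuf_get():
 * thread ids index a fixed array; the first use of a slot allocates
 * its buffer, later uses reuse it; id < 0 gets an unpooled buffer
 * that the caller must release itself. */
#include <stdio.h>
#include <stdlib.h>

#define MAX_THREADS 8

static void *pool[MAX_THREADS];

static void *pool_get(int thread_id)
{
        void **place = NULL;
        void *buf = NULL;

        if (thread_id >= 0 && thread_id < MAX_THREADS) {
                place = &pool[thread_id];
                buf = *place;
        }
        if (buf == NULL) {                      /* first use: fill slot */
                buf = malloc(4096);
                if (place != NULL)
                        *place = buf;           /* cache for this thread */
        }
        return buf;
}

int main(void)
{
        void *a = pool_get(2), *b = pool_get(2);

        printf("slot reused: %s\n", a == b ? "yes" : "no");
        free(a);                                /* demo cleanup */
        pool[2] = NULL;
        return 0;
}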
/* mount the file system (secretly). lustre_cfg parameters are:
struct filter_obd *filter = &obd->u.filter;
struct vfsmount *mnt;
struct lustre_mount_info *lmi;
- char *str;
+ struct obd_uuid uuid;
+ __u8 *uuid_ptr;
+ char *str, *label;
char ns_name[48];
int rc;
ENTRY;
if (IS_ERR(obd->obd_fsops))
GOTO(err_mntput, rc = PTR_ERR(obd->obd_fsops));
- rc = filter_iobuf_pool_init(filter, OST_NUM_THREADS);
+ rc = filter_iobuf_pool_init(filter);
if (rc != 0)
GOTO(err_ops, rc);
}
filter->fo_vfsmnt = mnt;
- filter->fo_sb = mnt->mnt_sb;
+ obd->u.obt.obt_sb = mnt->mnt_sb;
filter->fo_fstype = mnt->mnt_sb->s_type->name;
CDEBUG(D_SUPER, "%s: mnt = %p\n", filter->fo_fstype, mnt);
filter->fo_destroy_in_progress = 0;
sema_init(&filter->fo_create_lock, 1);
-
spin_lock_init(&filter->fo_translock);
spin_lock_init(&filter->fo_objidlock);
spin_lock_init(&filter->fo_stats_lock);
GOTO(err_post, rc);
}
- rc = filter_quota_setup(filter);
- if (rc) {
+ rc = lquota_setup(quota_interface, obd, lcfg);
+ if (rc)
GOTO(err_post, rc);
+
+ uuid_ptr = fsfilt_uuid(obd, obd->u.obt.obt_sb);
+ if (uuid_ptr != NULL) {
+ class_uuid_unparse(uuid_ptr, &uuid);
+ str = uuid.uuid;
+ } else {
+ str = "no UUID";
}
+ label = fsfilt_label(obd, obd->u.obt.obt_sb);
if (obd->obd_recovering) {
- LCONSOLE_WARN("OST %s now serving %s, but will be in recovery "
- "until %d %s reconnect, or if no clients "
- "reconnect for %d:%.02d; during that time new "
+ LCONSOLE_WARN("OST %s now serving %s (%s%s%s), but will be in"
+ "recovery until %d %s reconnect, or if no clients"
+ " reconnect for %d:%.02d; during that time new "
"clients will not be allowed to connect. "
"Recovery progress can be monitored by watching "
"/proc/fs/lustre/obdfilter/%s/recovery_status.\n",
- obd->obd_name,
- lustre_cfg_string(lcfg, 1),
+ obd->obd_name, lustre_cfg_string(lcfg, 1),
+ label ?: "", label ? "/" : "", str,
obd->obd_recoverable_clients,
(obd->obd_recoverable_clients == 1)
? "client" : "clients",
(int)(OBD_RECOVERY_TIMEOUT / HZ) % 60,
obd->obd_name);
} else {
- LCONSOLE_INFO("OST %s now serving %s with recovery %s.\n",
- obd->obd_name,
- lustre_cfg_string(lcfg, 1),
+ LCONSOLE_INFO("OST %s now serving %s (%s%s%s) with recovery "
+ "%s\n", obd->obd_name, lustre_cfg_string(lcfg, 1),
+ label ?: "", label ? "/" : "", str,
obd->obd_replayable ? "enabled" : "disabled");
}
mntput(mnt);
lock_kernel();
}
- filter->fo_sb = 0;
+ obd->u.obt.obt_sb = 0;
return rc;
}
unsigned long page;
int rc;
+ CLASSERT(offsetof(struct obd_device, u.obt) ==
+ offsetof(struct obd_device, u.filter.fo_obt));
+
if (!LUSTRE_CFG_BUFLEN(lcfg, 1) || !LUSTRE_CFG_BUFLEN(lcfg, 2))
RETURN(-EINVAL);
ping_evictor_stop();
- filter_quota_cleanup(filter);
+ lquota_cleanup(quota_interface, obd);
ldlm_namespace_free(obd->obd_namespace, obd->obd_force);
- if (filter->fo_sb == NULL)
+ if (obd->u.obt.obt_sb == NULL)
RETURN(0);
- save_dev = lvfs_sbdev(filter->fo_sb);
+ save_dev = lvfs_sbdev(obd->u.obt.obt_sb);
lprocfs_free_obd_stats(obd);
lprocfs_obd_cleanup(obd);
filter_post(obd);
- shrink_dcache_parent(filter->fo_sb->s_root);
+ shrink_dcache_parent(obd->u.obt.obt_sb->s_root);
- LL_DQUOT_OFF(filter->fo_sb);
+ LL_DQUOT_OFF(obd->u.obt.obt_sb);
must_put = server_put_mount(obd->obd_name, filter->fo_vfsmnt);
/* must_put is for old method (l_p_m returns non-0 on err) */
/* We can only unlock kernel if we are in the context of sys_ioctl,
otherwise we never called lock_kernel */
- if (kernel_locked()) {
+ if (ll_kernel_locked()) {
unlock_kernel();
must_relock++;
}
if (must_put)
/* In case we didn't mount with lustre_get_mount -- old method*/
mntput(filter->fo_vfsmnt);
-
- filter->fo_sb = NULL;
+ obd->u.obt.obt_sb = NULL;
lvfs_clear_rdonly(save_dev);
/* nearly identical to mds_connect */
static int filter_connect(struct lustre_handle *conn, struct obd_device *obd,
- struct obd_uuid *cluuid,struct obd_connect_data *data)
+ struct obd_uuid *cluuid, struct obd_connect_data *data)
{
struct obd_export *exp;
struct filter_export_data *fed;
LASSERT(exp != NULL);
fed = &exp->exp_filter_data;
+
spin_lock_init(&fed->fed_lock);
+ rc = filter_connect_internal(exp, data);
+ if (rc)
+ GOTO(cleanup, rc);
+
if (!obd->obd_replayable)
GOTO(cleanup, rc = 0);
fed->fed_fcd = fcd;
rc = filter_client_add(obd, filter, fed, -1);
- if (!rc)
- filter_connect_internal(exp, data);
GOTO(cleanup, rc);
if (exp->exp_obd->obd_replayable)
filter_client_free(exp);
+ else
+ fsfilt_sync(exp->exp_obd, exp->exp_obd->u.obt.obt_sb);
filter_grant_discard(exp);
rc = class_disconnect(exp);
ldlm_cancel_locks_for_export(exp);
- fsfilt_sync(obd, obd->u.filter.fo_sb);
+ fsfilt_sync(obd, obd->u.obt.obt_sb);
/* flush any remaining cancel messages out to the target */
ctxt = llog_get_context(obd, LLOG_MDS_OST_REPL_CTXT);
RETURN(rc);
}
-struct dentry *__filter_oa2dentry(struct obd_device *obd,
- struct obdo *oa, const char *what, int quiet)
+struct dentry *__filter_oa2dentry(struct obd_device *obd, struct obdo *oa,
+ const char *what, int quiet)
{
struct dentry *dchild = NULL;
obd_gr group = 0;
dchild = filter_fid2dentry(obd, NULL, group, oa->o_id);
if (IS_ERR(dchild)) {
- CERROR("%s error looking up object: "LPU64"\n", what, oa->o_id);
+ CERROR("%s error looking up object: "LPU64"\n",
+ what, oa->o_id);
RETURN(dchild);
}
RETURN(rc);
}
+/* this should be enabled/disabled depending on whether large inodes
+ * (fast EAs) are enabled in the backing store FS. */
+int filter_update_fidea(struct obd_export *exp, struct inode *inode,
+ void *handle, struct obdo *oa)
+{
+ struct obd_device *obd = exp->exp_obd;
+ int rc = 0;
+ ENTRY;
+
+ if (oa->o_valid & OBD_MD_FLFID) {
+ struct filter_fid ff;
+ obd_gr group = 0;
+
+ if (oa->o_valid & OBD_MD_FLGROUP)
+ group = oa->o_gr;
+
+ /* packing fid and converting it to LE for storing into EA.
+ * Here ->o_stripe_idx should be filled by LOV and rest of
+ * fields - by client. */
+ ff.ff_fid.id = cpu_to_le64(oa->o_fid);
+ ff.ff_fid.f_type = cpu_to_le32(oa->o_stripe_idx);
+ ff.ff_fid.generation = cpu_to_le32(oa->o_generation);
+ ff.ff_objid = cpu_to_le64(oa->o_id);
+ ff.ff_group = cpu_to_le64(group);
+
+ CDEBUG(D_INODE, "storing filter fid EA ("LPU64"/%u/%u"
+ LPU64"/"LPU64")\n", oa->o_fid, oa->o_stripe_idx,
+ oa->o_generation, oa->o_id, group);
+
+ rc = fsfilt_set_md(obd, inode, handle, &ff, sizeof(ff));
+ if (rc)
+ CERROR("store fid in object failed! rc: %d\n", rc);
+ } else {
+ CDEBUG(D_HA, "OSS object without fid info!\n");
+ }
+
+ RETURN(rc);
+}
+
/* this is called from filter_truncate() until we have filter_punch() */
-int filter_setattr(struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *md, struct obd_trans_info *oti)
+int filter_setattr_internal(struct obd_export *exp, struct dentry *dentry,
+ struct obdo *oa, struct obd_trans_info *oti)
{
- struct obd_device *obd;
- struct lvfs_run_ctxt saved;
+ unsigned int orig_ids[MAXQUOTAS] = {0, 0};
+ struct llog_cookie *fcc = NULL;
struct filter_obd *filter;
- struct dentry *dentry;
+ int rc, err, locked = 0;
+ unsigned int ia_valid;
+ struct inode *inode;
struct iattr iattr;
- uid_t orig_uid = 0;
- gid_t orig_gid = 0;
- struct ldlm_res_id res_id = { .name = { oa->o_id } };
- struct ldlm_resource *res;
void *handle;
- struct llog_cookie *fcc = NULL;
- int rc, rc2;
ENTRY;
- dentry = __filter_oa2dentry(exp->exp_obd, oa, __FUNCTION__, 1);
- if (IS_ERR(dentry))
- RETURN(PTR_ERR(dentry));
+ LASSERT(dentry != NULL);
+ LASSERT(!IS_ERR(dentry));
- obd = exp->exp_obd;
- filter = &obd->u.filter;
+ inode = dentry->d_inode;
+ LASSERT(inode != NULL);
+ filter = &exp->exp_obd->u.filter;
iattr_from_obdo(&iattr, oa, oa->o_valid);
-
- push_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL);
- lock_kernel();
+ ia_valid = iattr.ia_valid;
if (oa->o_valid & OBD_MD_FLCOOKIE) {
OBD_ALLOC(fcc, sizeof(*fcc));
memcpy(fcc, obdo_logcookie(oa), sizeof(*fcc));
}
- if (iattr.ia_valid & ATTR_SIZE)
- down(&dentry->d_inode->i_sem);
+ if (ia_valid & ATTR_SIZE || ia_valid & (ATTR_UID | ATTR_GID)) {
+ down(&inode->i_sem);
+ locked = 1;
+ }
+
+ /* If the inode still has SUID+SGID bits set (see filter_precreate())
+ * then we will accept the UID+GID sent by the client during write for
+ * initializing the ownership of this inode. We only allow this to
+ * happen once so clear these bits in setattr. In 2.6 kernels it is
+ * possible to get ATTR_UID and ATTR_GID separately, so we only clear
+ * the flags that are actually being set. */
+ if (ia_valid & (ATTR_UID | ATTR_GID)) {
+ CDEBUG(D_INODE, "update UID/GID to %lu/%lu\n",
+ (unsigned long)oa->o_uid, (unsigned long)oa->o_gid);
+
+ if ((inode->i_mode & S_ISUID) && (ia_valid & ATTR_UID)) {
+ if (!(ia_valid & ATTR_MODE)) {
+ iattr.ia_mode = inode->i_mode;
+ iattr.ia_valid |= ATTR_MODE;
+ }
+ iattr.ia_mode &= ~S_ISUID;
+ }
+ if ((inode->i_mode & S_ISGID) && (ia_valid & ATTR_GID)) {
+ if (!(iattr.ia_valid & ATTR_MODE)) {
+ iattr.ia_mode = inode->i_mode;
+ iattr.ia_valid |= ATTR_MODE;
+ }
+ iattr.ia_mode &= ~S_ISGID;
+ }
- if (iattr.ia_valid & (ATTR_UID | ATTR_GID)) {
- orig_uid = dentry->d_inode->i_uid;
- orig_gid = dentry->d_inode->i_gid;
- handle = fsfilt_start_log(exp->exp_obd, dentry->d_inode,
+ orig_ids[USRQUOTA] = inode->i_uid;
+ orig_ids[GRPQUOTA] = inode->i_gid;
+ handle = fsfilt_start_log(exp->exp_obd, inode,
FSFILT_OP_SETATTR, oti, 1);
+
+ /* update inode EA only once when inode is suid bit marked. As
+ * on 2.6.x UID and GID may be set separately, we check here
+ * only one of them to avoid double setting. */
+ if (inode->i_mode & S_ISUID)
+ filter_update_fidea(exp, inode, handle, oa);
} else {
- handle = fsfilt_start(exp->exp_obd, dentry->d_inode,
+ handle = fsfilt_start(exp->exp_obd, inode,
FSFILT_OP_SETATTR, oti);
}
if (IS_ERR(handle))
GOTO(out_unlock, rc = PTR_ERR(handle));
- if (iattr.ia_valid & ATTR_ATTR_FLAG) {
- rc = fsfilt_iocontrol(exp->exp_obd, dentry->d_inode, NULL,
- EXT3_IOC_SETFLAGS,
- (long)&iattr.ia_attr_flags);
+ if (oa->o_valid & OBD_MD_FLFLAGS) {
+ rc = fsfilt_iocontrol(exp->exp_obd, inode, NULL,
+ EXT3_IOC_SETFLAGS, (long)&oa->o_flags);
} else {
rc = fsfilt_setattr(exp->exp_obd, dentry, handle, &iattr, 1);
if (fcc != NULL)
/* set cancel cookie callback function */
- fsfilt_add_journal_cb(obd, 0, oti ?
+ fsfilt_add_journal_cb(exp->exp_obd, 0, oti ?
oti->oti_handle : handle,
filter_cancel_cookies_cb,
fcc);
}
+ if (locked) {
+ up(&inode->i_sem);
+ locked = 0;
+ }
+
rc = filter_finish_transno(exp, oti, rc);
- rc2 = fsfilt_commit(exp->exp_obd, dentry->d_inode, handle, 0);
- if (rc2) {
- CERROR("error on commit, err = %d\n", rc2);
+
+ err = fsfilt_commit(exp->exp_obd, inode, handle, 0);
+ if (err) {
+ CERROR("error on commit, err = %d\n", err);
if (!rc)
- rc = rc2;
+ rc = err;
+ }
+ EXIT;
+out_unlock:
+ if (locked)
+ up(&inode->i_sem);
+
+ /* trigger quota release */
+ if (ia_valid & (ATTR_SIZE | ATTR_UID | ATTR_GID)) {
+ unsigned int cur_ids[MAXQUOTAS] = {oa->o_uid, oa->o_gid};
+ int rc2 = lquota_adjust(quota_interface, exp->exp_obd, cur_ids,
+ orig_ids, rc, FSFILT_OP_SETATTR);
+ CDEBUG(rc2 ? D_ERROR : D_QUOTA,
+ "filter adjust qunit. (rc:%d)\n", rc2);
}
+ return rc;
+}
+
+/* this is called from filter_truncate() until we have filter_punch() */
+int filter_setattr(struct obd_export *exp, struct obdo *oa,
+ struct lov_stripe_md *md, struct obd_trans_info *oti)
+{
+ struct ldlm_res_id res_id = { .name = { oa->o_id } };
+ struct ldlm_valblock_ops *ns_lvbo;
+ struct lvfs_run_ctxt saved;
+ struct filter_obd *filter;
+ struct ldlm_resource *res;
+ struct dentry *dentry;
+ int rc;
+ ENTRY;
+
+ dentry = __filter_oa2dentry(exp->exp_obd, oa,
+ __FUNCTION__, 1);
+ if (IS_ERR(dentry))
+ RETURN(PTR_ERR(dentry));
+
+ filter = &exp->exp_obd->u.filter;
+ push_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL);
+ lock_kernel();
+
+ /* setting objects attributes (including owner/group) */
+ rc = filter_setattr_internal(exp, dentry, oa, oti);
+ if (rc)
+ GOTO(out_unlock, rc);
res = ldlm_resource_get(exp->exp_obd->obd_namespace, NULL,
res_id, LDLM_EXTENT, 0);
+
if (res != NULL) {
- if (res->lr_namespace->ns_lvbo &&
- res->lr_namespace->ns_lvbo->lvbo_update)
- rc = res->lr_namespace->ns_lvbo->lvbo_update(res, NULL,
- 0, 0);
+ ns_lvbo = res->lr_namespace->ns_lvbo;
+ if (ns_lvbo && ns_lvbo->lvbo_update)
+ rc = ns_lvbo->lvbo_update(res, NULL, 0, 0);
ldlm_resource_putref(res);
- } else if (iattr.ia_valid & ATTR_SIZE) {
- CERROR("!!! resource_get failed for object "LPU64" -- "
- "filter_setattr with no lock?\n", oa->o_id);
}
oa->o_valid = OBD_MD_FLID;
+
/* Quota release need uid/gid info */
obdo_from_inode(oa, dentry->d_inode,
FILTER_VALID_FLAGS | OBD_MD_FLUID | OBD_MD_FLGID);
+ EXIT;
out_unlock:
- if (iattr.ia_valid & ATTR_SIZE)
- up(&dentry->d_inode->i_sem);
unlock_kernel();
- pop_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL);
-
f_dput(dentry);
-
- /* trigger quota release */
- if (rc == 0 && iattr.ia_valid & (ATTR_SIZE | ATTR_UID | ATTR_GID)) {
- rc2 = qctxt_adjust_qunit(obd, &filter->fo_quota_ctxt,
- oa->o_uid, oa->o_gid, 1);
- if (rc2)
- CERROR("error filter adjust qunit! (rc:%d)\n", rc2);
- /* after owner changed, release quota for the original owner */
- rc2 = qctxt_adjust_qunit(obd, &filter->fo_quota_ctxt,
- orig_uid, orig_gid, 1);
- if (rc2)
- CERROR("error filter adjust qunit! (rc:%d)\n", rc2);
- }
- RETURN(rc);
+ pop_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL);
+ return rc;
}
/* XXX identical to osc_unpackmd */
exp->exp_obd->obd_name, oa->o_id + 1, last);
for (id = oa->o_id + 1; id <= last; id++) {
doa.o_id = id;
- filter_destroy(exp, &doa, NULL, NULL);
+ filter_destroy(exp, &doa, NULL, NULL, NULL);
}
CDEBUG(D_HA, "%s: after destroy: set last_objids["LPU64"] = "LPU64"\n",
unsigned long max_age)
{
struct filter_obd *filter = &obd->u.filter;
- int blockbits = filter->fo_sb->s_blocksize_bits;
+ int blockbits = obd->u.obt.obt_sb->s_blocksize_bits;
int rc;
ENTRY;
* might be under-reporting if clients haven't announced their
* caches with brw recently */
spin_lock(&obd->obd_osfs_lock);
- rc = fsfilt_statfs(obd, filter->fo_sb, max_age);
+ rc = fsfilt_statfs(obd, obd->u.obt.obt_sb, max_age);
memcpy(osfs, &obd->obd_osfs, sizeof(*osfs));
spin_unlock(&obd->obd_osfs_lock);
osfs->os_bavail -= min(osfs->os_bavail, GRANT_FOR_LLOG(obd) +
((filter->fo_tot_dirty + filter->fo_tot_pending +
osfs->os_bsize - 1) >> blockbits));
+
+ /* set EROFS to state field if FS is mounted as RDONLY. The goal is to
+ * stop creating files on the MDS if the OST is not in good shape to
+ * create objects. */
+ osfs->os_state = (filter->fo_obt.obt_sb->s_flags & MS_RDONLY) ?
+ EROFS : 0;
RETURN(rc);
}
rc = filter_statfs(obd, osfs, jiffies - HZ);
if (rc == 0 && osfs->os_bavail < (osfs->os_blocks >> 10)) {
CDEBUG(D_HA, "OST out of space! avail "LPU64"\n",
- osfs->os_bavail<<filter->fo_sb->s_blocksize_bits);
+ osfs->os_bavail<<filter->fo_obt.obt_sb->s_blocksize_bits);
*num=0;
rc = -ENOSPC;
}
GOTO(cleanup, rc = PTR_ERR(handle));
cleanup_phase = 3;
- rc = ll_vfs_create(dparent->d_inode, dchild, S_IFREG | 0666, NULL);
+ rc = ll_vfs_create(dparent->d_inode, dchild,
+ S_IFREG | S_ISUID | S_ISGID | 0666, NULL);
if (rc) {
CERROR("create failed rc = %d\n", rc);
GOTO(cleanup, rc);
}
int filter_destroy(struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *md, struct obd_trans_info *oti)
+ struct lov_stripe_md *md, struct obd_trans_info *oti,
+ struct obd_export *md_exp)
{
+ unsigned int qcids[MAXQUOTAS] = {0, 0};
struct obd_device *obd;
struct filter_obd *filter;
- struct dentry *dchild = NULL, *dparent = NULL;
+ struct dentry *dchild = NULL, *dparent;
struct lvfs_run_ctxt saved;
void *handle = NULL;
struct llog_cookie *fcc = NULL;
- int rc, rc2, cleanup_phase = 0, have_prepared = 0;
+ int rc, rc2, cleanup_phase = 0;
obd_gr group = 0;
+ struct iattr iattr;
ENTRY;
if (oa->o_valid & OBD_MD_FLGROUP)
filter = &obd->u.filter;
push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
-
- acquire_locks:
- dparent = filter_parent_lock(obd, group, oa->o_id);
- if (IS_ERR(dparent))
- GOTO(cleanup, rc = PTR_ERR(dparent));
cleanup_phase = 1;
- dchild = filter_fid2dentry(obd, dparent, group, oa->o_id);
+ dchild = filter_fid2dentry(obd, NULL, group, oa->o_id);
if (IS_ERR(dchild))
GOTO(cleanup, rc = PTR_ERR(dchild));
cleanup_phase = 2;
GOTO(cleanup, rc = -ENOENT);
}
- if (!have_prepared) {
- /* If we're really going to destroy the object, get ready
- * by getting the clients to discard their cached data.
- *
- * We have to drop the parent lock, because
- * filter_prepare_destroy will acquire a PW on the object, and
- * we don't want to deadlock with an incoming write to the
- * object, which has the extent PW and then wants to get the
- * parent dentry to do the lookup.
- *
- * We dput the child because it's not worth the extra
- * complication of condition the above code to skip it on the
- * second time through. */
- f_dput(dchild);
- filter_parent_unlock(dparent);
-
- filter_prepare_destroy(obd, oa->o_id);
- have_prepared = 1;
- goto acquire_locks;
- }
-
- handle = fsfilt_start_log(obd, dparent->d_inode,FSFILT_OP_UNLINK,oti,1);
- if (IS_ERR(handle))
- GOTO(cleanup, rc = PTR_ERR(handle));
- cleanup_phase = 3;
+ filter_prepare_destroy(obd, oa->o_id);
/* Our MDC connection is established by the MDS to us */
if (oa->o_valid & OBD_MD_FLCOOKIE) {
memcpy(fcc, obdo_logcookie(oa), sizeof(*fcc));
}
+ /* we truncate it first in order to avoid a possible deadlock:
+ * P1 P2
+ * open transaction open transaction
+ * down(i_zombie) down(i_zombie)
+ * restart transaction
+ * (see BUG 4180) -bzzz
+ */
+ down(&dchild->d_inode->i_sem);
+ handle = fsfilt_start_log(obd, dchild->d_inode, FSFILT_OP_SETATTR,
+ NULL, 1);
+ if (IS_ERR(handle)) {
+ up(&dchild->d_inode->i_sem);
+ GOTO(cleanup, rc = PTR_ERR(handle));
+ }
+
+ iattr.ia_valid = ATTR_SIZE;
+ iattr.ia_size = 0;
+ rc = fsfilt_setattr(obd, dchild, handle, &iattr, 1);
+ rc2 = fsfilt_commit(obd, dchild->d_inode, handle, 0);
+ up(&dchild->d_inode->i_sem);
+ if (rc)
+ GOTO(cleanup, rc);
+ if (rc2)
+ GOTO(cleanup, rc = rc2);
+
+ /* We don't actually need to lock the parent until we are unlinking
+ * here, and not while truncating above. That avoids holding the
+ * parent lock for a long time during truncate, which can block other
+ * threads from doing anything to objects in that directory. bug 7171 */
+ dparent = filter_parent_lock(obd, group, oa->o_id);
+ if (IS_ERR(dparent))
+ GOTO(cleanup, rc = PTR_ERR(dparent));
+ cleanup_phase = 3; /* filter_parent_unlock */
+
+ down(&dchild->d_inode->i_sem);
+ handle = fsfilt_start_log(obd, dparent->d_inode,FSFILT_OP_UNLINK,oti,1);
+ if (IS_ERR(handle)) {
+ up(&dchild->d_inode->i_sem);
+ GOTO(cleanup, rc = PTR_ERR(handle));
+ }
+ cleanup_phase = 4; /* fsfilt_commit */
+
/* Quota release need uid/gid of inode */
obdo_from_inode(oa, dchild->d_inode, OBD_MD_FLUID|OBD_MD_FLGID);
+
+ /* this drops dchild->d_inode->i_sem unconditionally */
rc = filter_destroy_internal(obd, oa->o_id, dparent, dchild);
+ EXIT;
cleanup:
switch(cleanup_phase) {
- case 3:
+ case 4:
if (fcc != NULL) {
fsfilt_add_journal_cb(obd, 0,
oti ? oti->oti_handle : handle,
if (!rc)
rc = rc2;
}
+ case 3:
+ filter_parent_unlock(dparent);
case 2:
f_dput(dchild);
case 1:
- filter_parent_unlock(dparent);
- case 0:
pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
break;
default:
}
/* trigger quota release */
- if (rc == 0) {
- rc2 = qctxt_adjust_qunit(obd, &filter->fo_quota_ctxt,
- oa->o_uid, oa->o_gid, 1);
- if (rc2)
- CERROR("error filter adjust qunit! (rc:%d)\n", rc2);
- }
-
- RETURN(rc);
+ qcids[USRQUOTA] = oa->o_uid;
+ qcids[GRPQUOTA] = oa->o_gid;
+ rc2 = lquota_adjust(quota_interface, obd, qcids, NULL, rc,
+ FSFILT_OP_UNLINK);
+ CDEBUG(rc2 ? D_ERROR : D_QUOTA,
+ "filter adjust qunit! (rc:%d)\n", rc2);
+ return rc;
}
/* NB start and end are used for punch, but not truncate */
static int filter_truncate(struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *lsm,
- obd_off start, obd_off end,
- struct obd_trans_info *oti)
+ struct lov_stripe_md *lsm, obd_off start,
+ obd_off end, struct obd_trans_info *oti)
{
- int error;
+ int rc;
ENTRY;
if (end != OBD_OBJECT_EOF) {
CDEBUG(D_INODE, "calling truncate for object "LPU64", valid = "LPX64
", o_size = "LPD64"\n", oa->o_id, oa->o_valid, start);
+
oa->o_size = start;
- error = filter_setattr(exp, oa, NULL, oti);
- RETURN(error);
+ rc = filter_setattr(exp, oa, NULL, oti);
+ RETURN(rc);
}
static int filter_sync(struct obd_export *exp, struct obdo *oa,
/* an objid of zero is taken to mean "sync whole filesystem" */
if (!oa || !(oa->o_valid & OBD_MD_FLID)) {
- rc = fsfilt_sync(exp->exp_obd, filter->fo_sb);
+ rc = fsfilt_sync(exp->exp_obd, filter->fo_obt.obt_sb);
/* flush any remaining cancel messages out to the target */
ctxt = llog_get_context(exp->exp_obd, LLOG_MDS_OST_REPL_CTXT);
llog_sync(ctxt, exp);
memcmp(key, "blocksize", keylen) == 0) {
__u32 *blocksize = val;
*vallen = sizeof(*blocksize);
- *blocksize = obd->u.filter.fo_sb->s_blocksize;
+ *blocksize = obd->u.obt.obt_sb->s_blocksize;
RETURN(0);
}
memcmp(key, "blocksize_bits", keylen) == 0) {
__u32 *blocksize_bits = val;
*vallen = sizeof(*blocksize_bits);
- *blocksize_bits = obd->u.filter.fo_sb->s_blocksize_bits;
+ *blocksize_bits = obd->u.obt.obt_sb->s_blocksize_bits;
RETURN(0);
}
/* setup llog imports */
ctxt = llog_get_context(obd, LLOG_MDS_OST_REPL_CTXT);
rc = llog_receptor_accept(ctxt, exp->exp_imp_reverse);
-
- filter_quota_set_info(exp, obd);
+
+ lquota_setinfo(quota_interface, exp, obd);
RETURN(rc);
}
case OBD_IOC_SYNC: {
CDEBUG(D_HA, "syncing ost %s\n", obd->obd_name);
- rc = fsfilt_sync(obd, obd->u.filter.fo_sb);
+ rc = fsfilt_sync(obd, obd->u.obt.obt_sb);
RETURN(rc);
}
case OBD_IOC_SET_READONLY: {
void *handle;
- struct super_block *sb = obd->u.filter.fo_sb;
+ struct super_block *sb = obd->u.obt.obt_sb;
struct inode *inode = sb->s_root->d_inode;
BDEVNAME_DECLARE_STORAGE(tmp);
CERROR("*** setting device %s read-only ***\n",
rc = fsfilt_commit(obd, inode, handle, 1);
CDEBUG(D_HA, "syncing ost %s\n", obd->obd_name);
- rc = fsfilt_sync(obd, obd->u.filter.fo_sb);
+ rc = fsfilt_sync(obd, obd->u.obt.obt_sb);
- lvfs_set_rdonly(lvfs_sbdev(obd->u.filter.fo_sb));
+ lvfs_set_rdonly(lvfs_sbdev(obd->u.obt.obt_sb));
RETURN(0);
}
* health_check to return 0 on healthy
* and 1 on unhealthy.
*/
- if (filter->fo_sb->s_flags & MS_RDONLY)
+ if (obd->u.obt.obt_sb->s_flags & MS_RDONLY)
rc = 1;
LASSERT(filter->fo_health_check_filp != NULL);
.o_llog_init = filter_llog_init,
.o_llog_finish = filter_llog_finish,
.o_iocontrol = filter_iocontrol,
- .o_quotacheck = filter_quotacheck,
- .o_quotactl = filter_quotactl,
.o_health_check = filter_health_check,
};
.o_iocontrol = filter_iocontrol,
};
+quota_interface_t *quota_interface;
+extern quota_interface_t filter_quota_interface;
+
static int __init obdfilter_init(void)
{
struct lprocfs_static_vars lvars;
if (obdfilter_created_scratchpad == NULL)
return -ENOMEM;
+ quota_interface = PORTAL_SYMBOL_GET(filter_quota_interface);
+ init_obd_quota_ops(quota_interface, &filter_obd_ops);
+ init_obd_quota_ops(quota_interface, &filter_sanobd_ops);
+
rc = class_register_type(&filter_obd_ops, lvars.module_vars,
LUSTRE_OST_NAME);
if (rc)
if (rc) {
class_unregister_type(LUSTRE_OST_NAME);
out:
+ if (quota_interface)
+ PORTAL_SYMBOL_PUT(filter_quota_interface);
+
OBD_FREE(obdfilter_created_scratchpad,
OBDFILTER_CREATED_SCRATCHPAD_ENTRIES *
sizeof(*obdfilter_created_scratchpad));
- }
+ }
+
return rc;
}
static void __exit obdfilter_exit(void)
{
+ if (quota_interface)
+ PORTAL_SYMBOL_PUT(filter_quota_interface);
+
class_unregister_type(LUSTRE_OSTSAN_NAME);
class_unregister_type(LUSTRE_OST_NAME);
+
OBD_FREE(obdfilter_created_scratchpad,
OBDFILTER_CREATED_SCRATCHPAD_ENTRIES *
sizeof(*obdfilter_created_scratchpad));
#include <linux/lustre_handles.h>
#include <linux/lustre_debug.h>
#include <linux/obd.h>
+#include <linux/lustre_disk.h>
#define FILTER_LAYOUT_VERSION "2"
-#define LAST_RCVD "last_rcvd"
#define HEALTH_CHECK "health_check"
#define FILTER_INIT_OBJID 0
-#define FILTER_LR_SERVER_SIZE 512
-
-#define FILTER_LR_CLIENT_START 8192
-#define FILTER_LR_CLIENT_SIZE 128
-
-/* This limit is arbitrary, but for now we fit it in 1 page (32k clients) */
-#define FILTER_LR_MAX_CLIENTS (PAGE_SIZE * 8)
-
#define FILTER_SUBDIR_COUNT 32 /* set to zero for no subdirs */
#define FILTER_GROUPS 3 /* must be at least 3; not dynamic yet */
-#define FILTER_RECOVERY_TIMEOUT (obd_timeout * 5 * HZ / 2) /* *waves hands* */
+#define FILTER_ROCOMPAT_SUPP (0)
-#define FILTER_ROCOMPAT_SUPP (0)
+#define FILTER_RECOVERY_TIMEOUT (obd_timeout * 5 * HZ / 2) /* *waves hands* */
-#define FILTER_INCOMPAT_GROUPS 0x00000001
-#define FILTER_INCOMPAT_SUPP (FILTER_INCOMPAT_GROUPS)
+#define FILTER_INCOMPAT_SUPP (OBD_INCOMPAT_GROUPS)
#define FILTER_GRANT_CHUNK (2ULL * PTLRPC_MAX_BRW_SIZE)
#define GRANT_FOR_LLOG(obd) 16
__u8 fcd_uuid[40]; /* client UUID */
__u64 fcd_last_rcvd; /* last completed transaction ID */
__u64 fcd_last_xid; /* client RPC xid for the last transaction */
- __u8 fcd_padding[FILTER_LR_CLIENT_SIZE - 56];
+ __u8 fcd_padding[LR_CLIENT_SIZE - 56];
};
#define FILTER_DENTRY_MAGIC 0x9efba101
OBD_MD_FLSIZE | OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ|\
OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME)
+struct filter_fid {
+ struct ll_fid ff_fid;
+ __u64 ff_objid;
+ __u64 ff_group;
+};
+
enum {
LPROC_FILTER_READ_BYTES = 0,
LPROC_FILTER_WRITE_BYTES = 1,
int filter_finish_transno(struct obd_export *, struct obd_trans_info *, int rc);
__u64 filter_next_id(struct filter_obd *, struct obdo *);
__u64 filter_last_id(struct filter_obd *, struct obdo *);
+int filter_update_fidea(struct obd_export *exp, struct inode *inode,
+ void *handle, struct obdo *oa);
int filter_update_server_data(struct obd_device *, struct file *,
struct lr_server_data *, int force_sync);
int filter_update_last_objid(struct obd_device *, obd_gr, int force_sync);
int filter_common_setup(struct obd_device *, obd_count len, void *buf,
void *option);
int filter_destroy(struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *md, struct obd_trans_info *);
+ struct lov_stripe_md *md, struct obd_trans_info *,
+ struct obd_export *);
+int filter_setattr_internal(struct obd_export *exp, struct dentry *dentry,
+ struct obdo *oa, struct obd_trans_info *oti);
int filter_setattr(struct obd_export *exp, struct obdo *oa,
struct lov_stripe_md *md, struct obd_trans_info *oti);
+struct dentry *filter_create_object(struct obd_device *obd, struct obdo *oa);
+
/* filter_lvb.c */
extern struct ldlm_valblock_ops filter_lvbo;
void flip_into_page_cache(struct inode *inode, struct page *new_page);
/* filter_io_*.c */
+struct filter_iobuf;
int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, int objcount,
struct obd_ioobj *obj, int niocount,
struct niobuf_local *res, struct obd_trans_info *oti,
obd_size want, obd_size fs_space_left);
void filter_grant_commit(struct obd_export *exp, int niocount,
struct niobuf_local *res);
-int filter_alloc_iobuf(struct filter_obd *, int rw, int num_pages, void **ret);
-void filter_free_iobuf(void *iobuf);
-int filter_iobuf_add_page(struct obd_device *obd, void *iobuf,
+struct filter_iobuf *filter_alloc_iobuf(struct filter_obd *, int rw,
+ int num_pages);
+void filter_free_iobuf(struct filter_iobuf *iobuf);
+int filter_iobuf_add_page(struct obd_device *obd, struct filter_iobuf *iobuf,
struct inode *inode, struct page *page);
-void *filter_iobuf_get(struct ptlrpc_thread *thread, struct filter_obd *filter);
-void filter_iobuf_put(void *iobuf);
-int filter_direct_io(int rw, struct dentry *dchild, void *iobuf,
+void *filter_iobuf_get(struct filter_obd *filter, struct obd_trans_info *oti);
+void filter_iobuf_put(struct filter_obd *filter, struct filter_iobuf *iobuf,
+ struct obd_trans_info *oti);
+int filter_direct_io(int rw, struct dentry *dchild, struct filter_iobuf *iobuf,
struct obd_export *exp, struct iattr *attr,
struct obd_trans_info *oti, void **wait_handle);
#endif
/* Quota stuff */
-#ifdef HAVE_QUOTA_SUPPORT
-int filter_quota_setup(struct filter_obd *filter);
-void filter_quota_cleanup(struct filter_obd *filter);
-void filter_quota_set_info(struct obd_export *exp, struct obd_device *obd);
-int filter_quotacheck(struct obd_export *exp, struct obd_quotactl *oqctl);
-int filter_quotactl(struct obd_export *exp, struct obd_quotactl *oqctl);
-int filter_quota_enforcement(struct obd_device *obd,
- unsigned int fsuid, unsigned int fsgid,
- struct lvfs_ucred **ret_uc);
-int filter_get_quota_flag(struct obd_device *obd, struct obdo *oa);
-int filter_quota_check_master(struct obd_device *obd, struct inode *inode);
-
-#ifdef LPROCFS
-int lprocfs_filter_rd_bunit(char *page, char **start, off_t off,
- int count, int *eof, void *data);
-int lprocfs_filter_rd_iunit(char *page, char **start, off_t off,
- int count, int *eof, void *data);
-int lprocfs_filter_wr_bunit(struct file *file, const char *buffer,
- unsigned long count, void *data);
-int lprocfs_filter_wr_iunit(struct file *file, const char *buffer,
- unsigned long count, void *data);
-int lprocfs_filter_rd_btune(char *page, char **start, off_t off,
- int count, int *eof, void *data);
-int lprocfs_filter_rd_itune(char *page, char **start, off_t off,
- int count, int *eof, void *data);
-int lprocfs_filter_wr_btune(struct file *file, const char *buffer,
- unsigned long count, void *data);
-int lprocfs_filter_wr_itune(struct file *file, const char *buffer,
- unsigned long count, void *data);
-#endif /* LPROCFS */
-#else /* !HAVE_QUOTA_SUPPORT */
-static inline int filter_quota_setup(struct filter_obd *filter)
-{
- return 0;
-}
-static inline void filter_quota_cleanup(struct filter_obd *filter) {}
-static inline void filter_quota_set_info(struct obd_export *exp,
- struct obd_device *obd) {}
-static inline int filter_quotacheck(struct obd_export *exp,
- struct obd_quotactl *oqctl)
-{
- return -ENOTSUPP;
-}
-static inline int filter_quotactl(struct obd_export *exp,
- struct obd_quotactl *oqctl)
-{
- return -ENOTSUPP;
-}
-static inline int filter_quota_enforcement(struct obd_device *obd,
- unsigned int fsuid,
- unsigned int fsgid,
- struct lvfs_ucred **ret_uc)
-{
- return 0;
-}
-static inline int filter_get_quota_flag(struct obd_device *obd,
- struct obdo *oa)
-{
- return 0;
-}
-static inline int filter_quota_check_master(struct obd_device *obd,
- struct inode *inode)
-{
- return 0;
-}
-#endif /* HAVE_QUOTA_SUPPORT */
+extern quota_interface_t *quota_interface;
#endif /* _FILTER_INTERNAL_H */
obd_size filter_grant_space_left(struct obd_export *exp)
{
struct obd_device *obd = exp->exp_obd;
- int blockbits = obd->u.filter.fo_sb->s_blocksize_bits;
+ int blockbits = obd->u.obt.obt_sb->s_blocksize_bits;
obd_size tot_granted = obd->u.filter.fo_tot_granted, avail, left = 0;
int rc, statfs_done = 0;
if (time_before(obd->obd_osfs_age, jiffies - HZ)) {
restat:
- rc = fsfilt_statfs(obd, obd->u.filter.fo_sb, jiffies + 1);
+ rc = fsfilt_statfs(obd, obd->u.obt.obt_sb, jiffies + 1);
if (rc) /* N.B. statfs can't really fail */
RETURN(0);
statfs_done = 1;
{
struct obd_device *obd = exp->exp_obd;
struct filter_export_data *fed = &exp->exp_filter_data;
- int blockbits = obd->u.filter.fo_sb->s_blocksize_bits;
+ int blockbits = obd->u.obt.obt_sb->s_blocksize_bits;
__u64 grant = 0;
LASSERT_SPIN_LOCKED(&obd->obd_osfs_lock);
spin_unlock(&obd->obd_osfs_lock);
}
- push_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL);
+ push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
- iobuf = filter_iobuf_get(oti->oti_thread, &exp->exp_obd->u.filter);
+ iobuf = filter_iobuf_get(&obd->u.filter, oti);
+ if (iobuf == NULL)
+ GOTO(cleanup, rc = -ENOMEM);
dentry = filter_oa2dentry(obd, oa);
if (IS_ERR(dentry)) {
dentry = NULL;
GOTO(cleanup, rc);
}
-
+
inode = dentry->d_inode;
-
+
if (oa)
obdo_to_inode(inode, oa, OBD_MD_FLATIME);
fsfilt_check_slow(now, obd_timeout, "start_page_read");
- rc = filter_direct_io(OBD_BRW_READ, dentry, iobuf, exp,
- NULL, NULL, NULL);
+ rc = filter_direct_io(OBD_BRW_READ, dentry, iobuf,
+ exp, NULL, NULL, NULL);
if (rc)
GOTO(cleanup, rc);
f_dput(dentry);
}
- filter_iobuf_put(iobuf);
+ filter_iobuf_put(&obd->u.filter, iobuf, oti);
- pop_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL);
+ pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
if (rc)
CERROR("io error %d\n", rc);
struct inode *inode)
{
struct filter_export_data *fed = &exp->exp_filter_data;
- int blocksize = exp->exp_obd->u.filter.fo_sb->s_blocksize;
+ int blocksize = exp->exp_obd->u.obt.obt_sb->s_blocksize;
unsigned long used = 0, ungranted = 0, using;
int i, rc = -ENOSPC, obj, n = 0, mask = D_CACHE;
/* Rough calc in case we don't refresh cached statfs data */
using = (used + ungranted + 1 ) >>
- exp->exp_obd->u.filter.fo_sb->s_blocksize_bits;
+ exp->exp_obd->u.obt.obt_sb->s_blocksize_bits;
if (exp->exp_obd->obd_osfs.os_bavail > using)
exp->exp_obd->obd_osfs.os_bavail -= using;
else
LASSERT(objcount == 1);
LASSERT(obj->ioo_bufcnt > 0);
- iobuf = filter_iobuf_get(oti->oti_thread, &exp->exp_obd->u.filter);
+ push_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL);
+ iobuf = filter_iobuf_get(&exp->exp_obd->u.filter, oti);
+ if (iobuf == NULL)
+ GOTO(cleanup, rc = -ENOMEM);
cleanup_phase = 1;
- push_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL);
dentry = filter_fid2dentry(exp->exp_obd, NULL, obj->ioo_gr,
obj->ioo_id);
if (IS_ERR(dentry))
cleanup_phase = 2;
if (dentry->d_inode == NULL) {
- CERROR("trying to BRW to non-existent file "LPU64"\n",
- obj->ioo_id);
+ CERROR("%s: trying to BRW to non-existent file "LPU64"\n",
+ exp->exp_obd->obd_name, obj->ioo_id);
GOTO(cleanup, rc = -ENOENT);
}
rc = filter_grant_check(exp, objcount, &fso, niocount, nb, res,
&left, dentry->d_inode);
- /* We're finishing using body->oa as an input variable, so reset
- * o_valid here. */
+ /* do not zero out oa->o_valid, as it is used in filter_commitrw_write()
+ * for setting the UID/GID and fid EA at first-write time. */
if (oa && oa->o_valid & OBD_MD_FLGRANT) {
oa->o_grant = filter_grant(exp,oa->o_grant,oa->o_undirty,left);
- oa->o_valid = OBD_MD_FLGRANT;
- } else if (oa)
- oa->o_valid = 0;
+ oa->o_valid |= OBD_MD_FLGRANT;
+ }
spin_unlock(&exp->exp_obd->obd_osfs_lock);
switch(cleanup_phase) {
case 4:
case 3:
- filter_iobuf_put(iobuf);
+ filter_iobuf_put(&exp->exp_obd->u.filter, iobuf, oti);
case 2:
pop_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL);
if (rc)
f_dput(dentry);
break;
case 1:
+ filter_iobuf_put(&exp->exp_obd->u.filter, iobuf, oti);
+ case 0:
spin_lock(&exp->exp_obd->obd_osfs_lock);
if (oa)
filter_grant_incoming(exp, oa);
spin_unlock(&exp->exp_obd->obd_osfs_lock);
pop_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL);
- filter_iobuf_put(iobuf);
break;
default:;
}
GOTO(out, ret = -ENOMEM);
for (i = 0; i < oa_bufs; i++) {
+ lnb[i].page = pga[i].pg;
rnb[i].offset = pga[i].off;
rnb[i].len = pga[i].count;
}
if (ret != 0)
GOTO(out, ret);
- for (i = 0; i < oa_bufs; i++) {
- void *virt;
- obd_off off;
- void *addr;
-
- if (lnb[i].page == NULL)
- break;
-
- off = pga[i].off & ~PAGE_MASK;
- virt = kmap(pga[i].pg);
- addr = kmap(lnb[i].page);
-
- /* 2 kmaps == vanishingly small deadlock opportunity */
-
- if (cmd & OBD_BRW_WRITE)
- memcpy(addr + off, virt + off, pga[i].count);
- else
- memcpy(virt + off, addr + off, pga[i].count);
-
- kunmap(lnb[i].page);
- kunmap(pga[i].pg);
- }
-
ret = filter_commitrw(cmd, exp, oa, 1, &ioo, oa_bufs, lnb, oti, ret);
out:
}
/* Must be called with i_sem taken for writes; this will drop it */
-int filter_direct_io(int rw, struct dentry *dchild, void *buf,
+int filter_direct_io(int rw, struct dentry *dchild, struct filter_iobuf *buf,
struct obd_export *exp, struct iattr *attr,
struct obd_trans_info *oti, void **wait_handle)
{
struct obd_device *obd = exp->exp_obd;
struct inode *inode = dchild->d_inode;
- struct kiobuf *iobuf = buf;
+ struct kiobuf *iobuf = (void *)buf;
int rc, create = (rw == OBD_BRW_WRITE), committed = 0;
int blocks_per_page = PAGE_SIZE >> inode->i_blkbits, cleanup_phase = 0;
struct semaphore *sem = NULL;
iobuf->length = 0;
}
-void filter_iobuf_put(void *iobuf)
+struct filter_iobuf *filter_alloc_iobuf(struct filter_obd *filter,
+ int rw, int num_pages)
{
- clear_kiobuf(iobuf);
-}
-
-int filter_alloc_iobuf(struct filter_obd *filter, int rw, int num_pages,
- void **ret)
-{
- int rc;
struct kiobuf *iobuf;
+ int rc;
ENTRY;
LASSERTF(rw == OBD_BRW_WRITE || rw == OBD_BRW_READ, "%x\n", rw);
rc = alloc_kiovec(1, &iobuf);
if (rc)
- RETURN(rc);
+ RETURN(ERR_PTR(rc));
rc = expand_kiobuf(iobuf, num_pages);
if (rc) {
free_kiovec(1, &iobuf);
- RETURN(rc);
+ RETURN(ERR_PTR(rc));
}
#ifdef HAVE_KIOBUF_DOVARY
iobuf->dovary = 0; /* this prevents corruption, not present in 2.4.20 */
#endif
clear_kiobuf(iobuf);
- *ret = iobuf;
- RETURN(0);
+ RETURN((void *)iobuf);
}
-void filter_free_iobuf(void *buf)
+void filter_free_iobuf(struct filter_iobuf *buf)
{
- struct kiobuf *iobuf = buf;
+ struct kiobuf *iobuf = (void *)buf;
clear_kiobuf(iobuf);
free_kiovec(1, &iobuf);
}
-int filter_iobuf_add_page(struct obd_device *obd, void *buf,
+void filter_iobuf_put(struct filter_obd *filter, struct filter_iobuf *iobuf,
+ struct obd_trans_info *oti)
+{
+ int thread_id = oti ? oti->oti_thread_id : -1;
+
+ if (unlikely(thread_id < 0)) {
+ filter_free_iobuf(iobuf);
+ return;
+ }
+
+ LASSERTF(filter->fo_iobuf_pool[thread_id] == iobuf,
+ "iobuf mismatch for thread %d: pool %p iobuf %p\n",
+ thread_id, filter->fo_iobuf_pool[thread_id], iobuf);
+ clear_kiobuf((void *)iobuf);
+}
+
+int filter_iobuf_add_page(struct obd_device *obd, struct filter_iobuf *buf,
struct inode *inode, struct page *page)
{
- struct kiobuf *iobuf = buf;
+ struct kiobuf *iobuf = (void *)buf;
iobuf->maplist[iobuf->nr_pages++] = page;
iobuf->length += PAGE_SIZE;
if (rc != 0)
GOTO(cleanup, rc);
- iobuf = filter_iobuf_get(oti->oti_thread, &exp->exp_obd->u.filter);
+ iobuf = filter_iobuf_get(&obd->u.filter, oti);
+ if (iobuf == NULL)
+ GOTO(cleanup, rc = -ENOMEM);
cleanup_phase = 1;
fso.fso_dentry = res->dentry;
fsfilt_check_slow(now, obd_timeout, "brw_start");
- iattr_from_obdo(&iattr,oa,OBD_MD_FLATIME|OBD_MD_FLMTIME|OBD_MD_FLCTIME);
+ i = OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME;
+
+ /* If the inode still has SUID+SGID bits set (see filter_precreate())
+ * then we will accept the UID+GID if sent by the client for
+ * initializing the ownership of this inode. We only allow this to
+ * happen once (so clear these bits) and later only allow setattr. */
+ if (inode->i_mode & S_ISUID)
+ i |= OBD_MD_FLUID;
+ if (inode->i_mode & S_ISGID)
+ i |= OBD_MD_FLGID;
+
+ iattr_from_obdo(&iattr, oa, i);
+ if (iattr.ia_valid & (ATTR_UID | ATTR_GID)) {
+ CDEBUG(D_INODE, "update UID/GID to %lu/%lu\n",
+ (unsigned long)oa->o_uid, (unsigned long)oa->o_gid);
+
+ cap_raise(current->cap_effective, CAP_SYS_RESOURCE);
+
+ iattr.ia_valid |= ATTR_MODE;
+ iattr.ia_mode = inode->i_mode;
+ if (iattr.ia_valid & ATTR_UID)
+ iattr.ia_mode &= ~S_ISUID;
+ if (iattr.ia_valid & ATTR_GID)
+ iattr.ia_mode &= ~S_ISGID;
+
+ rc = filter_update_fidea(exp, inode, oti->oti_handle, oa);
+ }
+
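/* Editor's sketch (paraphrased, not code from this patch): the producer side
 * of the rule above lives in filter_precreate(), which marks freshly
 * precreated objects so that exactly one first write may initialize their
 * ownership. The helper name is hypothetical. */
static inline void filter_mark_uninitialized(struct inode *inode)
{
        /* S_ISUID/S_ISGID double as "ownership not yet set" flags; the
         * first write that carries a UID/GID clears them (see above). */
        inode->i_mode |= S_ISUID | S_ISGID;
        mark_inode_dirty(inode);
}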
/* filter_direct_io drops i_sem */
rc = filter_direct_io(OBD_BRW_WRITE, res->dentry, iobuf, exp, &iattr,
oti, &wait_handle);
pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
LASSERT(current->journal_info == NULL);
case 1:
- filter_iobuf_put(iobuf);
+ filter_iobuf_put(&obd->u.filter, iobuf, oti);
case 0:
/*
* lnb->page automatically returns back into per-thread page
/* 512byte block min */
#define MAX_BLOCKS_PER_PAGE (PAGE_SIZE / 512)
-struct dio_request {
+struct filter_iobuf {
atomic_t dr_numreqs; /* number of reqs being processed */
struct bio *dr_bios; /* list of completed bios */
wait_queue_head_t dr_wait;
unsigned long *dr_blocks;
spinlock_t dr_lock;
unsigned long dr_start_time; /* jiffies */
+ unsigned int dr_ignore_quota:1;
struct filter_obd *dr_filter;
};
-static void record_start_io(struct dio_request *dreq, int rw, int size)
+static void record_start_io(struct filter_iobuf *iobuf, int rw, int size)
{
- struct filter_obd *filter = dreq->dr_filter;
+ struct filter_obd *filter = iobuf->dr_filter;
unsigned long flags;
- atomic_inc(&dreq->dr_numreqs);
+ atomic_inc(&iobuf->dr_numreqs);
if (rw == OBD_BRW_READ) {
lprocfs_oh_tally(&filter->fo_read_rpc_hist,
else
filter->fo_w_in_flight++;
spin_unlock_irqrestore(&filter->fo_stats_lock, flags);
- dreq->dr_start_time = jiffies;
+ iobuf->dr_start_time = jiffies;
}
-static void record_finish_io(struct dio_request *dreq, int rw, int rc)
+static void record_finish_io(struct filter_iobuf *iobuf, int rw, int rc)
{
- struct filter_obd *filter = dreq->dr_filter;
+ struct filter_obd *filter = iobuf->dr_filter;
unsigned long flags, stop_time = jiffies;
spin_lock_irqsave(&filter->fo_stats_lock, flags);
filter->fo_w_in_flight--;
spin_unlock_irqrestore(&filter->fo_stats_lock, flags);
- if (atomic_dec_and_test(&dreq->dr_numreqs))
- wake_up(&dreq->dr_wait);
+ if (atomic_dec_and_test(&iobuf->dr_numreqs))
+ wake_up(&iobuf->dr_wait);
if (rc != 0)
return;
if (rw == OBD_BRW_READ) {
lprocfs_oh_tally_log2(&filter->fo_r_io_time,
- stop_time - dreq->dr_start_time);
+ stop_time - iobuf->dr_start_time);
} else {
lprocfs_oh_tally_log2(&filter->fo_w_io_time,
- stop_time - dreq->dr_start_time);
+ stop_time - iobuf->dr_start_time);
}
}
static int dio_complete_routine(struct bio *bio, unsigned int done, int error)
{
- struct dio_request *dreq = bio->bi_private;
+ struct filter_iobuf *iobuf = bio->bi_private;
unsigned long flags;
if (bio->bi_size) {
return 1;
}
- if (dreq == NULL) {
+ if (iobuf == NULL) {
CERROR("***** bio->bi_private is NULL! This should never "
"happen. Normally, I would crash here, but instead I "
"will dump the bio contents to the console. Please "
return 0;
}
- spin_lock_irqsave(&dreq->dr_lock, flags);
- bio->bi_private = dreq->dr_bios;
- dreq->dr_bios = bio;
- if (dreq->dr_error == 0)
- dreq->dr_error = error;
- spin_unlock_irqrestore(&dreq->dr_lock, flags);
+ spin_lock_irqsave(&iobuf->dr_lock, flags);
+ bio->bi_private = iobuf->dr_bios;
+ iobuf->dr_bios = bio;
+ if (iobuf->dr_error == 0)
+ iobuf->dr_error = error;
+ spin_unlock_irqrestore(&iobuf->dr_lock, flags);
- record_finish_io(dreq, test_bit(BIO_RW, &bio->bi_rw) ?
+ record_finish_io(iobuf, test_bit(BIO_RW, &bio->bi_rw) ?
OBD_BRW_WRITE : OBD_BRW_READ, error);
return 0;
return bio->bi_sector + size == sector ? 1 : 0;
}
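/* Editor's note: a worked example of the sector arithmetic used by
 * can_be_merged() and filter_do_bio(). With 4096-byte filesystem blocks,
 * sector_bits = s_blocksize_bits - 9 = 12 - 9 = 3, so filesystem block N
 * begins at 512-byte sector N << 3. A sketch (hypothetical helper): */
static inline sector_t fs_block_to_sector(struct inode *inode,
                                          unsigned long block)
{
        int sector_bits = inode->i_sb->s_blocksize_bits - 9;

        return (sector_t)block << sector_bits;
}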
-int filter_alloc_iobuf(struct filter_obd *filter, int rw, int num_pages,
- void **ret)
+struct filter_iobuf *filter_alloc_iobuf(struct filter_obd *filter,
+ int rw, int num_pages)
{
- struct dio_request *dreq;
+ struct filter_iobuf *iobuf;
LASSERTF(rw == OBD_BRW_WRITE || rw == OBD_BRW_READ, "%x\n", rw);
- OBD_ALLOC(dreq, sizeof(*dreq));
- if (dreq == NULL)
+ OBD_ALLOC(iobuf, sizeof(*iobuf));
+ if (iobuf == NULL)
goto failed_0;
- OBD_ALLOC(dreq->dr_pages, num_pages * sizeof(*dreq->dr_pages));
- if (dreq->dr_pages == NULL)
+ OBD_ALLOC(iobuf->dr_pages, num_pages * sizeof(*iobuf->dr_pages));
+ if (iobuf->dr_pages == NULL)
goto failed_1;
- OBD_ALLOC(dreq->dr_blocks,
- MAX_BLOCKS_PER_PAGE * num_pages * sizeof(*dreq->dr_blocks));
- if (dreq->dr_blocks == NULL)
+ OBD_ALLOC(iobuf->dr_blocks,
+ MAX_BLOCKS_PER_PAGE * num_pages * sizeof(*iobuf->dr_blocks));
+ if (iobuf->dr_blocks == NULL)
goto failed_2;
- dreq->dr_filter = filter;
- dreq->dr_bios = NULL;
- init_waitqueue_head(&dreq->dr_wait);
- atomic_set(&dreq->dr_numreqs, 0);
- spin_lock_init(&dreq->dr_lock);
- dreq->dr_max_pages = num_pages;
- dreq->dr_npages = 0;
-
- *ret = dreq;
- RETURN(0);
-
+ iobuf->dr_filter = filter;
+ iobuf->dr_bios = NULL;
+ init_waitqueue_head(&iobuf->dr_wait);
+ atomic_set(&iobuf->dr_numreqs, 0);
+ spin_lock_init(&iobuf->dr_lock);
+ iobuf->dr_max_pages = num_pages;
+ iobuf->dr_npages = 0;
+
+ RETURN(iobuf);
+
failed_2:
- OBD_FREE(dreq->dr_pages,
- num_pages * sizeof(*dreq->dr_pages));
+ OBD_FREE(iobuf->dr_pages,
+ num_pages * sizeof(*iobuf->dr_pages));
failed_1:
- OBD_FREE(dreq, sizeof(*dreq));
+ OBD_FREE(iobuf, sizeof(*iobuf));
failed_0:
- RETURN(-ENOMEM);
+ RETURN(ERR_PTR(-ENOMEM));
}
-void filter_iobuf_put(void *iobuf)
+static void filter_clear_iobuf(struct filter_iobuf *iobuf)
{
- struct dio_request *dreq = iobuf;
-
/* free all bios */
- while (dreq->dr_bios) {
- struct bio *bio = dreq->dr_bios;
- dreq->dr_bios = bio->bi_private;
+ while (iobuf->dr_bios) {
+ struct bio *bio = iobuf->dr_bios;
+ iobuf->dr_bios = bio->bi_private;
bio_put(bio);
}
- dreq->dr_npages = 0;
- atomic_set(&dreq->dr_numreqs, 0);
+ iobuf->dr_npages = 0;
+ atomic_set(&iobuf->dr_numreqs, 0);
}
-void filter_free_iobuf(void *iobuf)
+void filter_free_iobuf(struct filter_iobuf *iobuf)
{
- struct dio_request *dreq = iobuf;
- int num_pages = dreq->dr_max_pages;
+ int num_pages = iobuf->dr_max_pages;
- filter_iobuf_put(dreq);
+ filter_clear_iobuf(iobuf);
- OBD_FREE(dreq->dr_blocks,
- MAX_BLOCKS_PER_PAGE * num_pages * sizeof(*dreq->dr_blocks));
- OBD_FREE(dreq->dr_pages,
- num_pages * sizeof(*dreq->dr_pages));
- OBD_FREE_PTR(dreq);
+ OBD_FREE(iobuf->dr_blocks,
+ MAX_BLOCKS_PER_PAGE * num_pages * sizeof(*iobuf->dr_blocks));
+ OBD_FREE(iobuf->dr_pages,
+ num_pages * sizeof(*iobuf->dr_pages));
+ OBD_FREE_PTR(iobuf);
}
-int filter_iobuf_add_page(struct obd_device *obd, void *iobuf,
- struct inode *inode, struct page *page)
+void filter_iobuf_put(struct filter_obd *filter, struct filter_iobuf *iobuf,
+ struct obd_trans_info *oti)
{
- struct dio_request *dreq = iobuf;
+ int thread_id = oti ? oti->oti_thread_id : -1;
+
+ if (unlikely(thread_id < 0)) {
+ filter_free_iobuf(iobuf);
+ return;
+ }
+
+ LASSERTF(filter->fo_iobuf_pool[thread_id] == iobuf,
+ "iobuf mismatch for thread %d: pool %p iobuf %p\n",
+ thread_id, filter->fo_iobuf_pool[thread_id], iobuf);
+ filter_clear_iobuf(iobuf);
+}
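/* Editor's sketch: filter_iobuf_get() is not part of this hunk; a plausible
 * counterpart to the put above, assuming fo_iobuf_pool is indexed by OST
 * service thread id and filled lazily (assumed, not taken from this patch).
 */
void *filter_iobuf_get(struct filter_obd *filter, struct obd_trans_info *oti)
{
        int thread_id = oti ? oti->oti_thread_id : -1;
        struct filter_iobuf *iobuf;

        /* no service-thread context: hand out a one-off iobuf that
         * filter_iobuf_put() will simply free */
        if (thread_id < 0) {
                iobuf = filter_alloc_iobuf(filter, OBD_BRW_WRITE,
                                           PTLRPC_MAX_BRW_PAGES);
                return IS_ERR(iobuf) ? NULL : iobuf;
        }

        /* each thread owns one pool slot, allocated on first use */
        iobuf = filter->fo_iobuf_pool[thread_id];
        if (iobuf == NULL) {
                iobuf = filter_alloc_iobuf(filter, OBD_BRW_WRITE,
                                           PTLRPC_MAX_BRW_PAGES);
                if (IS_ERR(iobuf))
                        return NULL;
                filter->fo_iobuf_pool[thread_id] = iobuf;
        }
        return iobuf;
}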
- LASSERT (dreq->dr_npages < dreq->dr_max_pages);
- dreq->dr_pages[dreq->dr_npages++] = page;
+int filter_iobuf_add_page(struct obd_device *obd, struct filter_iobuf *iobuf,
+ struct inode *inode, struct page *page)
+{
+ LASSERT(iobuf->dr_npages < iobuf->dr_max_pages);
+ iobuf->dr_pages[iobuf->dr_npages++] = page;
return 0;
}
int filter_do_bio(struct obd_device *obd, struct inode *inode,
- struct dio_request *dreq, int rw)
+ struct filter_iobuf *iobuf, int rw)
{
int blocks_per_page = PAGE_SIZE >> inode->i_blkbits;
- struct page **pages = dreq->dr_pages;
- int npages = dreq->dr_npages;
- unsigned long *blocks = dreq->dr_blocks;
+ struct page **pages = iobuf->dr_pages;
+ int npages = iobuf->dr_npages;
+ unsigned long *blocks = iobuf->dr_blocks;
int total_blocks = npages * blocks_per_page;
int sector_bits = inode->i_sb->s_blocksize_bits - 9;
unsigned int blocksize = inode->i_sb->s_blocksize;
int rc = 0;
ENTRY;
- LASSERT(dreq->dr_npages == npages);
+ LASSERT(iobuf->dr_npages == npages);
LASSERT(total_blocks <= OBDFILTER_CREATED_SCRATCHPAD_ENTRIES);
- for (page_idx = 0, block_idx = 0;
- page_idx < npages;
+ for (page_idx = 0, block_idx = 0;
+ page_idx < npages;
page_idx++, block_idx += blocks_per_page) {
-
+
page = pages[page_idx];
LASSERT (block_idx + blocks_per_page <= total_blocks);
- for (i = 0, page_offset = 0;
+ for (i = 0, page_offset = 0;
i < blocks_per_page;
i += nblocks, page_offset += blocksize * nblocks) {
if (bio != NULL &&
can_be_merged(bio, sector) &&
- bio_add_page(bio, page,
+ bio_add_page(bio, page,
blocksize * nblocks, page_offset) != 0)
continue; /* added this frag OK */
/* Dang! I have to fragment this I/O */
CDEBUG(D_INODE, "bio++ sz %d vcnt %d(%d) "
"sectors %d(%d) psg %d(%d) hsg %d(%d)\n",
- bio->bi_size,
+ bio->bi_size,
bio->bi_vcnt, bio->bi_max_vecs,
bio->bi_size >> 9, q->max_sectors,
- bio_phys_segments(q, bio),
+ bio_phys_segments(q, bio),
q->max_phys_segments,
- bio_hw_segments(q, bio),
+ bio_hw_segments(q, bio),
q->max_hw_segments);
- record_start_io(dreq, rw, bio->bi_size);
+ record_start_io(iobuf, rw, bio->bi_size);
rc = fsfilt_send_bio(rw, obd, inode, bio);
if (rc < 0) {
CERROR("Can't send bio: %d\n", rc);
- record_finish_io(dreq, rw, rc);
+ record_finish_io(iobuf, rw, rc);
goto out;
}
}
/* allocate new bio */
- bio = bio_alloc(GFP_NOIO,
+ bio = bio_alloc(GFP_NOIO,
(npages - page_idx) * blocks_per_page);
if (bio == NULL) {
CERROR ("Can't allocate bio\n");
bio->bi_bdev = inode->i_sb->s_bdev;
bio->bi_sector = sector;
bio->bi_end_io = dio_complete_routine;
- bio->bi_private = dreq;
+ bio->bi_private = iobuf;
- rc = bio_add_page(bio, page,
+ rc = bio_add_page(bio, page,
blocksize * nblocks, page_offset);
LASSERT (rc != 0);
}
}
if (bio != NULL) {
- record_start_io(dreq, rw, bio->bi_size);
+ record_start_io(iobuf, rw, bio->bi_size);
rc = fsfilt_send_bio(rw, obd, inode, bio);
if (rc >= 0) {
rc = 0;
} else {
CERROR("Can't send bio: %d\n", rc);
- record_finish_io(dreq, rw, rc);
+ record_finish_io(iobuf, rw, rc);
}
}
out:
- wait_event(dreq->dr_wait, atomic_read(&dreq->dr_numreqs) == 0);
+ wait_event(iobuf->dr_wait, atomic_read(&iobuf->dr_numreqs) == 0);
if (rc == 0)
- rc = dreq->dr_error;
+ rc = iobuf->dr_error;
RETURN(rc);
}
* not be dirty, because we already called fdatasync/fdatawait on them.
*/
static int filter_clear_page_cache(struct inode *inode,
- struct dio_request *iobuf)
+ struct filter_iobuf *iobuf)
{
struct page *page;
int i, rc, rc2;
}
/* Must be called with i_sem taken for writes; this will drop it */
-int filter_direct_io(int rw, struct dentry *dchild, void *iobuf,
+int filter_direct_io(int rw, struct dentry *dchild, struct filter_iobuf *iobuf,
struct obd_export *exp, struct iattr *attr,
struct obd_trans_info *oti, void **wait_handle)
{
struct obd_device *obd = exp->exp_obd;
- struct dio_request *dreq = iobuf;
struct inode *inode = dchild->d_inode;
int blocks_per_page = PAGE_SIZE >> inode->i_blkbits;
int rc, rc2, create;
struct semaphore *sem;
ENTRY;
- LASSERTF(dreq->dr_npages <= dreq->dr_max_pages, "%d,%d\n",
- dreq->dr_npages, dreq->dr_max_pages);
- LASSERT(dreq->dr_npages <= OBDFILTER_CREATED_SCRATCHPAD_ENTRIES);
+ LASSERTF(iobuf->dr_npages <= iobuf->dr_max_pages, "%d,%d\n",
+ iobuf->dr_npages, iobuf->dr_max_pages);
+ LASSERT(iobuf->dr_npages <= OBDFILTER_CREATED_SCRATCHPAD_ENTRIES);
if (rw == OBD_BRW_READ) {
- if (dreq->dr_npages == 0)
+ if (iobuf->dr_npages == 0)
RETURN(0);
create = 0;
sem = NULL;
} else {
LASSERTF(rw == OBD_BRW_WRITE, "%x\n", rw);
- LASSERT(dreq->dr_npages > 0);
+ LASSERT(iobuf->dr_npages > 0);
create = 1;
sem = &obd->u.filter.fo_alloc_lock;
+
+ lquota_enforce(quota_interface, obd, iobuf->dr_ignore_quota);
}
remap:
- rc = fsfilt_map_inode_pages(obd, inode, dreq->dr_pages,
- dreq->dr_npages, dreq->dr_blocks,
+ rc = fsfilt_map_inode_pages(obd, inode, iobuf->dr_pages,
+ iobuf->dr_npages, iobuf->dr_blocks,
obdfilter_created_scratchpad, create, sem);
if (rc == -EDQUOT) {
* pre-dqacq in time or this user has exceeded quota limit, we
* have to wait for the completion of in flight dqacq/dqrel,
* then try again */
- if (filter_quota_check_master(obd, inode))
+ if (lquota_acquire(quota_interface, obd, inode->i_uid,
+ inode->i_gid))
goto remap;
}
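/* Editor's sketch: the lquota_*() wrappers used above presumably dispatch
 * through quota_interface_t so the quota code can live in a separate,
 * optional module. The member name below is hypothetical; only the calling
 * pattern is being illustrated. */
#define lquota_enforce_sketch(iface, obd, ignore)                       \
do {                                                                    \
        if ((iface) != NULL && (iface)->quota_enforce != NULL)          \
                (iface)->quota_enforce(obd, ignore);                    \
} while (0)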
if (rw == OBD_BRW_WRITE) {
if (rc == 0) {
filter_tally_write(&obd->u.filter,
- dreq->dr_pages,
- dreq->dr_npages,
- dreq->dr_blocks,
+ iobuf->dr_pages,
+ iobuf->dr_npages,
+ iobuf->dr_blocks,
blocks_per_page);
if (attr->ia_size > inode->i_size)
attr->ia_valid |= ATTR_SIZE;
RETURN(rc);
}
- rc = filter_clear_page_cache(inode, dreq);
+ rc = filter_clear_page_cache(inode, iobuf);
if (rc != 0)
RETURN(rc);
- RETURN(filter_do_bio(obd, inode, dreq, rw));
+ RETURN(filter_do_bio(obd, inode, iobuf, rw));
}
/* See if there are unallocated parts in given file region */
int rc)
{
struct niobuf_local *lnb;
- struct dio_request *dreq = NULL;
+ struct filter_iobuf *iobuf = NULL;
struct lvfs_run_ctxt saved;
struct fsfilt_objinfo fso;
struct iattr iattr = { 0 };
unsigned long now = jiffies;
int i, err, cleanup_phase = 0;
struct obd_device *obd = exp->exp_obd;
- struct filter_obd *filter = &obd->u.filter;
- struct lvfs_ucred *uc = NULL;
void *wait_handle;
int total_size = 0;
+ unsigned int qcids[MAXQUOTAS] = {0, 0};
ENTRY;
LASSERT(oti != NULL);
if (rc != 0)
GOTO(cleanup, rc);
- dreq = filter_iobuf_get(oti->oti_thread, &exp->exp_obd->u.filter);
+ iobuf = filter_iobuf_get(&obd->u.filter, oti);
+ if (iobuf == NULL)
+ GOTO(cleanup, rc = -ENOMEM);
cleanup_phase = 1;
fso.fso_dentry = res->dentry;
fso.fso_bufcnt = obj->ioo_bufcnt;
inode = res->dentry->d_inode;
+ iobuf->dr_ignore_quota = 0;
for (i = 0, lnb = res; i < obj->ioo_bufcnt; i++, lnb++) {
loff_t this_size;
continue;
}
- err = filter_iobuf_add_page(obd, dreq, inode, lnb->page);
+ err = filter_iobuf_add_page(obd, iobuf, inode, lnb->page);
LASSERT (err == 0);
total_size += lnb->len;
this_size = lnb->offset + lnb->len;
if (this_size > iattr.ia_size)
iattr.ia_size = this_size;
+
+ /* if any page is a write-back page from the client cache, or was
+ * written by root, then mark the whole i/o request as exempt
+ * from quota enforcement */
+ if (lnb->flags & (OBD_BRW_FROM_GRANT | OBD_BRW_NOQUOTA))
+ iobuf->dr_ignore_quota = 1;
}
- /* The client store the user credit information fsuid and fsgid
- * in oa->o_uid and oa->o_gid. In case of quota enabled, we use
- * them to build the lvfs_ucred so as to enforce oss quota check */
- rc = filter_quota_enforcement(obd, oa->o_uid, oa->o_gid, &uc);
- if (rc)
- GOTO(cleanup, rc);
-
- push_ctxt(&saved, &obd->obd_lvfs_ctxt, uc);
+ push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
cleanup_phase = 2;
down(&inode->i_sem);
fsfilt_check_slow(now, obd_timeout, "brw_start");
- iattr_from_obdo(&iattr,oa,OBD_MD_FLATIME|OBD_MD_FLMTIME|OBD_MD_FLCTIME);
+ i = OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME;
+
+ /* If the inode still has SUID+SGID bits set (see filter_precreate())
+ * then we will accept the UID+GID if sent by the client for
+ * initializing the ownership of this inode. We only allow this to
+ * happen once (so clear these bits) and later only allow setattr. */
+ if (inode->i_mode & S_ISUID)
+ i |= OBD_MD_FLUID;
+ if (inode->i_mode & S_ISGID)
+ i |= OBD_MD_FLGID;
+
+ iattr_from_obdo(&iattr, oa, i);
+ if (iattr.ia_valid & (ATTR_UID | ATTR_GID)) {
+ CDEBUG(D_INODE, "update UID/GID to %lu/%lu\n",
+ (unsigned long)oa->o_uid, (unsigned long)oa->o_gid);
+
+ cap_raise(current->cap_effective, CAP_SYS_RESOURCE);
+
+ iattr.ia_valid |= ATTR_MODE;
+ iattr.ia_mode = inode->i_mode;
+ if (iattr.ia_valid & ATTR_UID)
+ iattr.ia_mode &= ~S_ISUID;
+ if (iattr.ia_valid & ATTR_GID)
+ iattr.ia_mode &= ~S_ISGID;
+
+ rc = filter_update_fidea(exp, inode, oti->oti_handle, oa);
+ }
+
/* filter_direct_io drops i_sem */
- rc = filter_direct_io(OBD_BRW_WRITE, res->dentry, dreq, exp, &iattr,
+ rc = filter_direct_io(OBD_BRW_WRITE, res->dentry, iobuf, exp, &iattr,
oti, &wait_handle);
if (rc == 0)
obdo_from_inode(oa, inode,
else
obdo_from_inode(oa, inode, OBD_MD_FLUID | OBD_MD_FLGID);
- filter_get_quota_flag(obd, oa);
+ lquota_getflag(quota_interface, obd, oa);
fsfilt_check_slow(now, obd_timeout, "direct_io");
switch (cleanup_phase) {
case 2:
- pop_ctxt(&saved, &obd->obd_lvfs_ctxt, uc);
- if (uc)
- OBD_FREE(uc, sizeof(*uc));
+ pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
LASSERT(current->journal_info == NULL);
case 1:
- filter_iobuf_put(dreq);
+ filter_iobuf_put(&obd->u.filter, iobuf, oti);
case 0:
/*
* lnb->page automatically returns back into per-thread page
}
/* trigger quota pre-acquire */
- if (rc == 0) {
- err = qctxt_adjust_qunit(obd, &filter->fo_quota_ctxt,
- oa->o_uid, oa->o_gid, 1);
- if (err)
- CERROR("error filter ajust qunit! (rc:%d)\n", err);
- }
+ qcids[USRQUOTA] = oa->o_uid;
+ qcids[GRPQUOTA] = oa->o_gid;
+ err = lquota_adjust(quota_interface, obd, qcids, NULL, rc,
+ FSFILT_OP_CREATE);
+ CDEBUG(err ? D_ERROR : D_QUOTA,
+ "filter adjust qunit: rc = %d\n", err);
+
RETURN(rc);
}
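/* Editor's note: qcids[] above follows the kernel's MAXQUOTAS convention,
 * indexing quota ids by quota type. A minimal sketch of filling it (names
 * as used above; the helper is hypothetical): */
static inline void fill_qcids(unsigned int qcids[MAXQUOTAS], struct obdo *oa)
{
        qcids[USRQUOTA] = oa->o_uid;   /* user quota id */
        qcids[GRPQUOTA] = oa->o_gid;   /* group quota id */
}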
#include "filter_internal.h"
-int filter_log_sz_change(struct llog_handle *cathandle,
+int filter_log_sz_change(struct llog_handle *cathandle,
struct ll_fid *mds_fid,
__u32 io_epoch,
- struct llog_cookie *logcookie,
+ struct llog_cookie *logcookie,
struct inode *inode)
{
struct llog_size_change_rec *lsc;
down(&inode->i_sem);
ofd = inode->i_filterdata;
-
+
if (ofd && ofd->ofd_epoch >= io_epoch) {
if (ofd->ofd_epoch > io_epoch)
- CERROR("client sent old epoch %d for obj ino %ld\n",
+ CERROR("client sent old epoch %d for obj ino %ld\n",
io_epoch, inode->i_ino);
up(&inode->i_sem);
RETURN(0);
}
/* Callback for processing the unlink log record received from MDS by
- * llog_client_api.
- */
+ * llog_client_api. */
static int filter_recov_log_unlink_cb(struct llog_ctxt *ctxt,
struct llog_rec_hdr *rec,
struct llog_cookie *cookie)
memcpy(obdo_logcookie(oa), cookie, sizeof(*cookie));
oid = oa->o_id;
- rc = filter_destroy(exp, oa, NULL, NULL);
+ rc = filter_destroy(exp, oa, NULL, NULL, NULL);
obdo_free(oa);
if (rc == -ENOENT) {
CDEBUG(D_HA, "object already removed, send cookie\n");
}
/* Callback for processing the setattr log record received from MDS by
- * llog_client_api.
- */
+ * llog_client_api. */
static int filter_recov_log_setattr_cb(struct llog_ctxt *ctxt,
struct llog_rec_hdr *rec,
struct llog_cookie *cookie)
CERROR("log is not plain\n");
RETURN(-EINVAL);
}
- if (rec->lrh_type != MDS_UNLINK_REC &&
- rec->lrh_type != MDS_SETATTR_REC &&
- rec->lrh_type != LLOG_GEN_REC) {
- CERROR("log record type error\n");
- RETURN(-EINVAL);
- }
cookie.lgc_lgl = llh->lgh_id;
cookie.lgc_subsys = LLOG_MDS_OST_ORIG_CTXT;
llog_cancel(ctxt, NULL, 1, &cookie, 0);
RETURN(rc);
}
+ break;
default:
+ CERROR("log record type %08x unknown\n", rec->lrh_type);
+ RETURN(-EINVAL);
break;
}
/* Called with res->lr_lvb_sem held */
static int filter_lvbo_init(struct ldlm_resource *res)
{
- int rc = 0;
struct ost_lvb *lvb = NULL;
struct obd_device *obd;
struct dentry *dentry;
+ int rc = 0;
ENTRY;
LASSERT(res);
RETURN(0);
if (res->lr_lvb_data)
- GOTO(out, rc = 0);
+ RETURN(0);
OBD_ALLOC(lvb, sizeof(*lvb));
if (lvb == NULL)
- GOTO(out, rc = -ENOMEM);
+ RETURN(-ENOMEM);
res->lr_lvb_data = lvb;
res->lr_lvb_len = sizeof(*lvb);
LASSERT(obd != NULL);
dentry = filter_fid2dentry(obd, NULL, 0, res->lr_name.name[0]);
- if (IS_ERR(dentry))
- GOTO(out, rc = PTR_ERR(dentry));
+ if (IS_ERR(dentry)) {
+ rc = PTR_ERR(dentry);
+ CERROR("%s: bad object "LPU64"/"LPU64": rc %d\n", obd->obd_name,
+ res->lr_name.name[0], res->lr_name.name[1], rc);
+ RETURN(rc);
+ }
if (dentry->d_inode == NULL)
GOTO(out_dentry, rc = -ENOENT);
res->lr_name.name[0], lvb->lvb_size,
lvb->lvb_mtime, lvb->lvb_blocks);
- out_dentry:
+ EXIT;
+out_dentry:
f_dput(dentry);
- out:
+
/* Don't free lvb data on lookup error */
return rc;
}
return count;
}
+#ifdef HAVE_QUOTA_SUPPORT
+static int lprocfs_filter_rd_bunit(char *page, char **start, off_t off, int count,
+ int *eof, void *data)
+{
+ struct obd_device *obd = (struct obd_device *)data;
+ LASSERT(obd != NULL);
+
+ return snprintf(page, count, "%lu\n",
+ obd->u.obt.obt_qctxt.lqc_bunit_sz);
+}
+
+static int lprocfs_filter_rd_iunit(char *page, char **start, off_t off, int count,
+ int *eof, void *data)
+{
+ struct obd_device *obd = (struct obd_device *)data;
+ LASSERT(obd != NULL);
+
+ return snprintf(page, count, "%lu\n",
+ obd->u.obt.obt_qctxt.lqc_iunit_sz);
+}
+
+static int lprocfs_filter_wr_bunit(struct file *file, const char *buffer,
+ unsigned long count, void *data)
+{
+ struct obd_device *obd = (struct obd_device *)data;
+ int val, rc;
+ LASSERT(obd != NULL);
+
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc)
+ return rc;
+
+ if (val % QUOTABLOCK_SIZE ||
+ val <= obd->u.obt.obt_qctxt.lqc_btune_sz)
+ return -EINVAL;
+
+ obd->u.obt.obt_qctxt.lqc_bunit_sz = val;
+ return count;
+}
+
+static int lprocfs_filter_wr_iunit(struct file *file, const char *buffer,
+ unsigned long count, void *data)
+{
+ struct obd_device *obd = (struct obd_device *)data;
+ int val, rc;
+ LASSERT(obd != NULL);
+
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc)
+ return rc;
+
+ if (val <= obd->u.obt.obt_qctxt.lqc_itune_sz)
+ return -EINVAL;
+
+ obd->u.obt.obt_qctxt.lqc_iunit_sz = val;
+ return count;
+}
+
+static int lprocfs_filter_rd_btune(char *page, char **start, off_t off, int count,
+ int *eof, void *data)
+{
+ struct obd_device *obd = (struct obd_device *)data;
+ LASSERT(obd != NULL);
+
+ return snprintf(page, count, "%lu\n",
+ obd->u.obt.obt_qctxt.lqc_btune_sz);
+}
+
+static int lprocfs_filter_rd_itune(char *page, char **start, off_t off, int count,
+ int *eof, void *data)
+{
+ struct obd_device *obd = (struct obd_device *)data;
+ LASSERT(obd != NULL);
+
+ return snprintf(page, count, "%lu\n",
+ obd->u.obt.obt_qctxt.lqc_itune_sz);
+}
+
+static int lprocfs_filter_wr_btune(struct file *file, const char *buffer,
+ unsigned long count, void *data)
+{
+ struct obd_device *obd = (struct obd_device *)data;
+ int val, rc;
+ LASSERT(obd != NULL);
+
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc)
+ return rc;
+
+ if (val <= QUOTABLOCK_SIZE * MIN_QLIMIT || val % QUOTABLOCK_SIZE ||
+ val >= obd->u.obt.obt_qctxt.lqc_bunit_sz)
+ return -EINVAL;
+
+ obd->u.obt.obt_qctxt.lqc_btune_sz = val;
+ return count;
+}
+
+static int lprocfs_filter_wr_itune(struct file *file, const char *buffer,
+ unsigned long count, void *data)
+{
+ struct obd_device *obd = (struct obd_device *)data;
+ int val, rc;
+ LASSERT(obd != NULL);
+
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc)
+ return rc;
+
+ if (val <= MIN_QLIMIT ||
+ val >= obd->u.obt.obt_qctxt.lqc_iunit_sz)
+ return -EINVAL;
+
+ obd->u.obt.obt_qctxt.lqc_itune_sz = val;
+ return count;
+}
+#endif
+
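/* Editor's note: the four block-quota writers above jointly maintain the
 * invariant QUOTABLOCK_SIZE * MIN_QLIMIT < btune_sz < bunit_sz, with both
 * values multiples of QUOTABLOCK_SIZE (and MIN_QLIMIT < itune_sz < iunit_sz
 * for inodes). A sketch of the block-side check, factored out (hypothetical
 * helper): */
static inline int quota_blk_sizes_valid(unsigned long bunit,
                                        unsigned long btune)
{
        if (bunit % QUOTABLOCK_SIZE || btune % QUOTABLOCK_SIZE)
                return 0;
        return btune > QUOTABLOCK_SIZE * MIN_QLIMIT && btune < bunit;
}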
static struct lprocfs_vars lprocfs_obd_vars[] = {
{ "uuid", lprocfs_rd_uuid, 0, 0 },
{ "blocksize", lprocfs_rd_blksize, 0, 0 },
{ "quota_itune_sz", lprocfs_filter_rd_itune,
lprocfs_filter_wr_itune, 0},
#endif
-
{ 0 }
};
static struct lprocfs_vars lprocfs_obd_vars[] = {
{ "uuid", lprocfs_rd_uuid, 0, 0 },
{ "ping", 0, lprocfs_wr_ping, 0 },
+ { "connect_flags", lprocfs_rd_connect_flags, 0, 0 },
{ "blocksize", lprocfs_rd_blksize, 0, 0 },
{ "kbytestotal", lprocfs_rd_kbytestotal, 0, 0 },
{ "kbytesfree", lprocfs_rd_kbytesfree, 0, 0 },
osc_wr_max_pages_per_rpc, 0 },
{ "max_rpcs_in_flight", osc_rd_max_rpcs_in_flight,
osc_wr_max_rpcs_in_flight, 0 },
- { "max_dirty_mb", osc_rd_max_dirty_mb, osc_wr_max_dirty_mb, 0 },
+ { "max_dirty_mb", osc_rd_max_dirty_mb, osc_wr_max_dirty_mb, 0 },
{ "cur_dirty_bytes", osc_rd_cur_dirty_bytes, 0, 0 },
{ "cur_grant_bytes", osc_rd_cur_grant_bytes, 0, 0 },
- { "create_count", osc_rd_create_count, osc_wr_create_count, 0 },
+ { "create_count", osc_rd_create_count, osc_wr_create_count, 0 },
{ "prealloc_next_id", osc_rd_prealloc_next_id, 0, 0 },
{ "prealloc_last_id", osc_rd_prealloc_last_id, 0, 0 },
- { "checksums", osc_rd_checksum, osc_wr_checksum, 0 },
+ { "checksums", osc_rd_checksum, osc_wr_checksum, 0 },
{ 0 }
};
oscc->oscc_flags |= OSCC_FLAG_CREATING;
spin_unlock(&oscc->oscc_lock);
- request = ptlrpc_prep_req(oscc->oscc_obd->u.cli.cl_import, OST_CREATE,
- 1, &size, NULL);
+ request = ptlrpc_prep_req(oscc->oscc_obd->u.cli.cl_import,
+ LUSTRE_OST_VERSION, OST_CREATE, 1,
+ &size, NULL);
if (request == NULL) {
spin_lock(&oscc->oscc_lock);
oscc->oscc_flags &= ~OSCC_FLAG_CREATING;
spin_lock(&oscc->oscc_lock);
if (oscc->oscc_flags & OSCC_FLAG_SYNC_IN_PROGRESS) {
spin_unlock(&oscc->oscc_lock);
- return -EBUSY;
+ RETURN(-EBUSY);
}
if (!(oscc->oscc_flags & OSCC_FLAG_RECOVERING)) {
spin_unlock(&oscc->oscc_lock);
- return 0;
+ RETURN(0);
}
oscc->oscc_flags |= OSCC_FLAG_SYNC_IN_PROGRESS;
spin_unlock(&oscc->oscc_lock);
oscc->oscc_flags |= OSCC_FLAG_NOSPC;
oscc->oscc_flags &= ~OSCC_FLAG_RECOVERING;
oscc->oscc_last_id = oa->o_id;
- CDEBUG(D_HA, "%s: oscc recovery finished: %d\n",
- oscc->oscc_obd->obd_name, rc);
+ CDEBUG(D_HA, "%s: oscc recovery finished, last_id: "
+ LPU64", rc: %d\n", oscc->oscc_obd->obd_name,
+ oscc->oscc_last_id, rc);
wake_up(&oscc->oscc_waitq);
} else {
CDEBUG(D_ERROR, "%s: oscc recovery failed: %d\n",
*ea = lsm;
oscc->oscc_next_id++;
try_again = 0;
+
+ CDEBUG(D_HA, "%s: set oscc_next_id = "LPU64"\n",
+ exp->exp_obd->obd_name, oscc->oscc_next_id);
} else if (oscc->oscc_flags & OSCC_FLAG_NOSPC) {
rc = -ENOSPC;
spin_unlock(&oscc->oscc_lock);
void oscc_init(struct obd_device *obd);
void osc_wake_cache_waiters(struct client_obd *cli);
-#ifdef HAVE_QUOTA_SUPPORT
-int osc_get_quota_flag(struct client_obd *cli, unsigned int uid,
- unsigned int gid);
-int osc_set_quota_flag(struct client_obd *cli,
- unsigned int uid, unsigned int gid,
- obd_flag valid, obd_flag flags);
-int osc_qinfo_cleanup(struct client_obd *cli);
-int osc_qinfo_init(void);
-void osc_qinfo_exit(void);
-int osc_quotacheck(struct obd_export *exp, struct obd_quotactl *oqctl);
-int osc_poll_quotacheck(struct obd_export *exp, struct if_quotacheck *qchk);
-int osc_quotactl(struct obd_export *exp, struct obd_quotactl *oqctl);
-#else /* !HAVE_QUOTA_SUPPORT */
-static inline int osc_get_quota_flag(struct client_obd *cli,
- unsigned int uid, unsigned int gid)
-{
- return QUOTA_OK;
-}
-static inline int osc_set_quota_flag(struct client_obd *cli,
- unsigned int uid, unsigned int gid,
- obd_flag valid, obd_flag flags)
-{
- return 0;
-}
-static inline int osc_qinfo_cleanup(struct client_obd *cli)
-{
- return 0;
-}
-static inline int osc_qinfo_init(void)
-{
- return 0;
-}
-static inline void osc_qinfo_exit(void) {}
-static inline int osc_quotacheck(struct obd_export *exp,
- struct obd_quotactl *oqctl)
-{
- return -ENOTSUPP;
-}
-static inline int osc_poll_quotacheck(struct obd_export *exp,
- struct if_quotacheck *qchk)
-{
- return -ENOTSUPP;
-}
-static inline int osc_quotactl(struct obd_export *exp,
- struct obd_quotactl *oqctl)
-{
- return -ENOTSUPP;
-}
-#endif /* HAVE_QUOTA_SUPPORT */
+
+/* Quota stuff */
+extern quota_interface_t *quota_interface;
#ifdef LPROCFS
int lproc_osc_attach_seqstat(struct obd_device *dev);
+++ /dev/null
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- * Copyright (c) 2003 Cluster File Systems, Inc.
- *
- * No redistribution or use is permitted outside of Cluster File Systems, Inc.
- *
- */
-
-#ifndef EXPORT_SYMTAB
-# define EXPORT_SYMTAB
-#endif
-#define DEBUG_SUBSYSTEM S_OSC
-
-#ifdef __KERNEL__
-# include <linux/module.h>
-# include <linux/obd_ost.h>
-# include <linux/lustre_net.h>
-# include <linux/lustre_dlm.h>
-# include <linux/lustre_lib.h>
-# include <linux/lustre_compat25.h>
-# if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
-# include <linux/workqueue.h>
-# include <linux/smp_lock.h>
-# else
-# include <linux/locks.h>
-# endif
-#else
-# include <liblustre.h>
-#endif
-
-#include <linux/obd.h>
-#include "osc_internal.h"
-
-struct osc_quota_info {
- struct list_head oqi_hash; /* hash list */
- struct client_obd *oqi_cli; /* osc obd */
- unsigned int oqi_id; /* uid/gid of a file */
- short oqi_type; /* quota type */
- unsigned long oqi_flag; /* flag, NO_QUOTA */
-};
-
-spinlock_t qinfo_list_lock = SPIN_LOCK_UNLOCKED;
-
-static struct list_head qinfo_hash[NR_DQHASH];
-/* SLAB cache for client quota context */
-kmem_cache_t *qinfo_cachep = NULL;
-
-static inline int const hashfn(struct client_obd *cli,
- unsigned long id,
- int type)
-{
- unsigned long tmp = ((unsigned long)cli>>6) ^ id;
- tmp = (tmp * (MAXQUOTAS - type)) % NR_DQHASH;
- return tmp;
-}
-
-static inline void insert_qinfo_hash(struct osc_quota_info *oqi)
-{
- struct list_head *head = qinfo_hash +
- hashfn(oqi->oqi_cli, oqi->oqi_id, oqi->oqi_type);
- list_add(&oqi->oqi_hash, head);
-}
-
-static inline void remove_qinfo_hash(struct osc_quota_info *oqi)
-{
- list_del_init(&oqi->oqi_hash);
-}
-
-static inline struct osc_quota_info *find_qinfo(struct client_obd *cli,
- unsigned int id, int type)
-{
- unsigned int hashent = hashfn(cli, id, type);
- struct list_head *head;
- struct osc_quota_info *oqi;
-
- for (head = qinfo_hash[hashent].next;
- head != qinfo_hash+hashent; head = head->next) {
- oqi = list_entry(head, struct osc_quota_info, oqi_hash);
- LASSERT(oqi->oqi_flag == NO_QUOTA);
- if (oqi->oqi_cli == cli &&
- oqi->oqi_id == id && oqi->oqi_type == type)
- return oqi;
- }
- return NULL;
-}
-
-static struct osc_quota_info *alloc_qinfo(struct client_obd *cli,
- unsigned int id, int type)
-{
- struct osc_quota_info *oqi;
- ENTRY;
-
- OBD_SLAB_ALLOC(oqi, qinfo_cachep, SLAB_KERNEL, sizeof(*oqi));
- if(!oqi)
- RETURN(NULL);
-
- INIT_LIST_HEAD(&oqi->oqi_hash);
- oqi->oqi_cli = cli;
- oqi->oqi_id = id;
- oqi->oqi_type = type;
-
- RETURN(oqi);
-}
-
-static void free_qinfo(struct osc_quota_info *oqi)
-{
- OBD_SLAB_FREE(oqi, qinfo_cachep, sizeof(*oqi));
-}
-
-int osc_get_quota_flag(struct client_obd *cli,
- unsigned int uid, unsigned int gid)
-{
- unsigned int id;
- int cnt, rc = QUOTA_OK;
- ENTRY;
-
- spin_lock(&qinfo_list_lock);
- for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- struct osc_quota_info *oqi = NULL;
-
- id = (cnt == USRQUOTA) ? uid : gid;
- oqi = find_qinfo(cli, id, cnt);
- if (oqi) {
- rc = NO_QUOTA;
- break;
- }
- }
- spin_unlock(&qinfo_list_lock);
-
- RETURN(rc);
-}
-
-int osc_set_quota_flag(struct client_obd *cli,
- unsigned int uid, unsigned int gid,
- obd_flag valid, obd_flag flags)
-{
- unsigned int id;
- obd_flag noquota;
- int cnt, rc = 0;
- ENTRY;
-
- spin_lock(&qinfo_list_lock);
-
- for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- struct osc_quota_info *oqi = NULL;
-
- if (!(valid & ((cnt == USRQUOTA) ?
- OBD_MD_FLUSRQUOTA : OBD_MD_FLGRPQUOTA)))
- continue;
-
- id = (cnt == USRQUOTA) ? uid : gid;
- noquota = (cnt == USRQUOTA) ?
- (flags & OBD_FL_NO_USRQUOTA) : (flags & OBD_FL_NO_GRPQUOTA);
-
- oqi = find_qinfo(cli, id, cnt);
-
- if (oqi && !noquota) {
- remove_qinfo_hash(oqi);
- free_qinfo(oqi);
- } else if (!oqi && noquota) {
- oqi = alloc_qinfo(cli, id, cnt);
- if (!oqi) {
- CERROR("not enough mem!\n");
- rc = -ENOMEM;
- break;
- }
- oqi->oqi_flag = NO_QUOTA;
- insert_qinfo_hash(oqi);
- }
- }
-
- spin_unlock(&qinfo_list_lock);
-
- RETURN(rc);
-}
-
-int osc_qinfo_cleanup(struct client_obd *cli)
-{
- struct osc_quota_info *oqi, *n;
- int i;
- ENTRY;
-
- spin_lock(&qinfo_list_lock);
- for (i = 0; i < NR_DQHASH; i++) {
- list_for_each_entry_safe(oqi, n, &qinfo_hash[i], oqi_hash) {
- if (oqi->oqi_cli != cli)
- continue;
- remove_qinfo_hash(oqi);
- free_qinfo(oqi);
- }
- }
- spin_unlock(&qinfo_list_lock);
-
- RETURN(0);
-}
-
-int osc_qinfo_init(void)
-{
- int i;
- ENTRY;
-
- LASSERT(qinfo_cachep == NULL);
- qinfo_cachep = kmem_cache_create("osc_quota_info",
- sizeof(struct osc_quota_info),
- 0, 0, NULL, NULL);
- if (!qinfo_cachep)
- RETURN(-ENOMEM);
-
- for (i = 0; i < NR_DQHASH; i++)
- INIT_LIST_HEAD(qinfo_hash + i);
-
- RETURN(0);
-}
-
-void osc_qinfo_exit(void)
-{
- struct osc_quota_info *oqi, *n;
- int i;
- ENTRY;
-
- spin_lock(&qinfo_list_lock);
- for (i = 0; i < NR_DQHASH; i++) {
- list_for_each_entry_safe(oqi, n, &qinfo_hash[i], oqi_hash) {
- remove_qinfo_hash(oqi);
- free_qinfo(oqi);
- }
- }
- spin_unlock(&qinfo_list_lock);
-
- LASSERTF(kmem_cache_destroy(qinfo_cachep) == 0,
- "couldn't destroy osc quota info slab\n");
-}
-
-int osc_quotacheck(struct obd_export *exp, struct obd_quotactl *oqctl)
-{
- struct client_obd *cli = &exp->exp_obd->u.cli;
- struct ptlrpc_request *req;
- struct obd_quotactl *body;
- int size = sizeof(*body);
- int rc;
- ENTRY;
-
- req = ptlrpc_prep_req(class_exp2cliimp(exp), OST_QUOTACHECK, 1, &size,
- NULL);
- if (!req)
- GOTO(out, rc = -ENOMEM);
-
- body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof(*body));
- memcpy(body, oqctl, sizeof(*body));
-
- req->rq_replen = lustre_msg_size(0, NULL);
-
- spin_lock(&cli->cl_qchk_lock);
- cli->cl_qchk_stat = CL_QUOTACHECKING;
- spin_unlock(&cli->cl_qchk_lock);
-
- rc = ptlrpc_queue_wait(req);
- if (rc) {
- spin_lock(&cli->cl_qchk_lock);
- cli->cl_qchk_stat = rc;
- spin_unlock(&cli->cl_qchk_lock);
- }
- out:
- ptlrpc_req_finished(req);
- RETURN (rc);
-}
-
-int osc_poll_quotacheck(struct obd_export *exp,
- struct if_quotacheck *qchk)
-{
- struct client_obd *cli = &exp->exp_obd->u.cli;
- int stat;
- ENTRY;
-
- spin_lock(&cli->cl_qchk_lock);
- stat = cli->cl_qchk_stat;
- spin_unlock(&cli->cl_qchk_lock);
-
- qchk->stat = stat;
- if (stat == CL_QUOTACHECKING) {
- qchk->stat = -ENODATA;
- stat = 0;
- } else if (qchk->stat) {
- if (qchk->stat > CL_QUOTACHECKING)
- qchk->stat = stat = -EINTR;
-
- strncpy(qchk->obd_type, "obdfilter", 10);
- qchk->obd_uuid = cli->cl_import->imp_target_uuid;
- }
- RETURN(stat);
-}
-
-int osc_quotactl(struct obd_export *exp, struct obd_quotactl *oqctl)
-{
- struct ptlrpc_request *req;
- struct obd_quotactl *oqc;
- int size = sizeof(*oqctl);
- int rc;
- ENTRY;
-
- req = ptlrpc_prep_req(class_exp2cliimp(exp), OST_QUOTACTL, 1, &size,
- NULL);
- if (!req)
- GOTO(out, rc = -ENOMEM);
-
- memcpy(lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*oqctl)), oqctl, size);
-
- req->rq_replen = lustre_msg_size(1, &size);
-
- rc = ptlrpc_queue_wait(req);
- if (!rc) {
- oqc = lustre_swab_repbuf(req, 0, sizeof (*oqc),
- lustre_swab_obd_quotactl);
- if (oqc == NULL) {
- CERROR ("Can't unpack mds_body\n");
- GOTO(out, rc = -EPROTO);
- }
-
- memcpy(oqctl, oqc, sizeof(*oqctl));
- }
-out:
- ptlrpc_req_finished(req);
- RETURN (rc);
-}
-
struct osc_getattr_async_args *aa;
ENTRY;
- request = ptlrpc_prep_req(class_exp2cliimp(exp), OST_GETATTR, 1,
- &size, NULL);
+ request = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OST_VERSION,
+ OST_GETATTR, 1, &size, NULL);
if (!request)
RETURN(-ENOMEM);
int rc, size = sizeof(*body);
ENTRY;
- request = ptlrpc_prep_req(class_exp2cliimp(exp), OST_GETATTR, 1,
- &size, NULL);
+ request = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OST_VERSION,
+ OST_GETATTR, 1, &size, NULL);
if (!request)
RETURN(-ENOMEM);
int rc, size = sizeof(*body);
ENTRY;
- request = ptlrpc_prep_req(class_exp2cliimp(exp), OST_SETATTR, 1, &size,
- NULL);
+ request = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OST_VERSION,
+ OST_SETATTR, 1, &size, NULL);
if (!request)
RETURN(-ENOMEM);
LASSERT(oti);
- request = ptlrpc_prep_req(class_exp2cliimp(exp), OST_SETATTR, 1,
- &size, NULL);
+ request = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OST_VERSION,
+ OST_SETATTR, 1, &size, NULL);
if (!request)
RETURN(-ENOMEM);
RETURN(rc);
}
- request = ptlrpc_prep_req(class_exp2cliimp(exp), OST_CREATE, 1, &size,
- NULL);
+ request = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OST_VERSION,
+ OST_CREATE, 1, &size, NULL);
if (!request)
GOTO(out, rc = -ENOMEM);
RETURN(-EINVAL);
}
- request = ptlrpc_prep_req(class_exp2cliimp(exp), OST_PUNCH, 1, &size,
- NULL);
+ request = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OST_VERSION,
+ OST_PUNCH, 1, &size, NULL);
if (!request)
RETURN(-ENOMEM);
RETURN(-EINVAL);
}
- request = ptlrpc_prep_req(class_exp2cliimp(exp), OST_SYNC, 1, &size,
- NULL);
+ request = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OST_VERSION,
+ OST_SYNC, 1, &size, NULL);
if (!request)
RETURN(-ENOMEM);
}
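/* Editor's sketch: every ptlrpc_prep_req() call in this patch now passes an
 * explicit wire version ahead of the opcode. A minimal example of the new
 * convention, using only names that appear above (the wrapper itself is
 * hypothetical): */
static struct ptlrpc_request *ost_prep_req_example(struct obd_export *exp)
{
        int size = sizeof(struct ost_body);

        return ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OST_VERSION,
                               OST_SYNC, 1, &size, NULL);
}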
static int osc_destroy(struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *ea, struct obd_trans_info *oti)
+ struct lov_stripe_md *ea, struct obd_trans_info *oti,
+ struct obd_export *md_export)
{
struct ptlrpc_request *request;
struct ost_body *body;
RETURN(-EINVAL);
}
- request = ptlrpc_prep_req(class_exp2cliimp(exp), OST_DESTROY, 1,
- &size, NULL);
+ request = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OST_VERSION,
+ OST_DESTROY, 1, &size, NULL);
if (!request)
RETURN(-ENOMEM);
size[2] = niocount * sizeof(*niobuf);
OBD_FAIL_RETURN(OBD_FAIL_OSC_BRW_PREP_REQ, -ENOMEM);
- req = ptlrpc_prep_req_pool(imp, opc, 3, size, NULL, pool);
+ req = ptlrpc_prep_req_pool(imp, LUSTRE_OST_VERSION, opc, 3,
+ size, NULL, pool);
if (req == NULL)
return (-ENOMEM);
if (rc < 0 && rc != -EDQUOT)
RETURN(rc);
+ LASSERTF(req->rq_repmsg != NULL, "rc = %d\n", rc);
body = lustre_swab_repbuf(req, 0, sizeof(*body), lustre_swab_ost_body);
if (body == NULL) {
CERROR ("Can't unpack body\n");
/* set/clear over quota flag for a uid/gid */
if (req->rq_reqmsg->opc == OST_WRITE &&
body->oa.o_valid & (OBD_MD_FLUSRQUOTA | OBD_MD_FLGRPQUOTA))
- osc_set_quota_flag(cli, body->oa.o_uid, body->oa.o_gid,
- body->oa.o_valid, body->oa.o_flags);
+ lquota_setdq(quota_interface, cli, body->oa.o_uid,
+ body->oa.o_gid, body->oa.o_valid,
+ body->oa.o_flags);
if (rc < 0)
RETURN(rc);
static void osc_exit_cache(struct client_obd *cli, struct osc_async_page *oap,
int sent);
+/* Decide whether the lists of pending pages to read/write for a given object
+ * (lop) are ready to generate an RPC. osc_check_rpcs()->osc_next_loi() and
+ * loi_list_maint() use this to quickly find objects that are ready to send
+ * an RPC. */
static int lop_makes_rpc(struct client_obd *cli, struct loi_oap_pages *lop,
int cmd)
{
!list_empty(&(LOI)->loi_read_lop.lop_urgent), \
args) \
+/* This is called by osc_check_rpcs() to find which objects have pages that
+ * we could be sending; the per-list readiness check is lop_makes_rpc(). */
struct lov_oinfo *osc_next_loi(struct client_obd *cli)
{
ENTRY;
ops = oap->oap_caller_ops;
ops->ap_fill_obdo(oap->oap_caller_data, cmd, oa);
- if (osc_get_quota_flag(cli, oa->o_uid, oa->o_gid) == NO_QUOTA)
+ if (lquota_chkdq(quota_interface, cli, oa->o_uid, oa->o_gid) ==
+ NO_QUOTA)
rc = -EDQUOT;
obdo_free(oa);
size[1] = sizeof(struct obd_ioobj);
size[2] = page_count * sizeof(*nioptr);
- request = ptlrpc_prep_req(imp, OST_SAN_READ, 3, size, NULL);
+ request = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OST_VERSION,
+ OST_SAN_READ, 3, size, NULL);
if (!request)
RETURN(-ENOMEM);
size[1] = sizeof(struct obd_ioobj);
size[2] = page_count * sizeof(*nioptr);
- request = ptlrpc_prep_req_pool(imp, OST_SAN_WRITE,
+ request = ptlrpc_prep_req_pool(class_exp2cliimp(exp),
+ LUSTRE_OST_VERSION, OST_SAN_WRITE,
3, size, NULL, cli->cl_rq_pool);
if (!request)
RETURN(-ENOMEM);
if (*flags & LDLM_FL_HAS_INTENT) {
int size[2] = {sizeof(struct ldlm_request), sizeof(lvb)};
- req = ptlrpc_prep_req(class_exp2cliimp(exp), LDLM_ENQUEUE, 1,
+ req = ptlrpc_prep_req(class_exp2cliimp(exp),
+ LUSTRE_DLM_VERSION, LDLM_ENQUEUE, 1,
size, NULL);
if (req == NULL)
RETURN(-ENOMEM);
* during mount that would help a bit). Having relative timestamps
* is not so great if request processing is slow, while absolute
* timestamps are not ideal because they need time synchronization. */
- request = ptlrpc_prep_req(obd->u.cli.cl_import, OST_STATFS,0,NULL,NULL);
+ request = ptlrpc_prep_req(obd->u.cli.cl_import, LUSTRE_OST_VERSION,
+ OST_STATFS,0,NULL,NULL);
if (!request)
RETURN(-ENOMEM);
RETURN(rc);
}
+
static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
void *karg, void *uarg)
{
data->ioc_offset);
GOTO(out, err);
case OBD_IOC_POLL_QUOTACHECK:
- err = osc_poll_quotacheck(exp, (struct if_quotacheck *)karg);
+ err = lquota_poll_check(quota_interface, exp,
+ (struct if_quotacheck *)karg);
GOTO(out, err);
default:
CDEBUG(D_INODE, "unrecognised ioctl %#x by %s\n",
obd_id *reply;
char *bufs[1] = {key};
int rc;
- req = ptlrpc_prep_req(class_exp2cliimp(exp), OST_GET_INFO, 1,
- &keylen, bufs);
+ req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OST_VERSION,
+ OST_GET_INFO, 1, &keylen, bufs);
if (req == NULL)
RETURN(-ENOMEM);
RETURN(0);
}
-
+
if (KEY_IS("unlinked")) {
struct osc_creator *oscc = &obd->u.cli.cl_oscc;
spin_lock(&oscc->oscc_lock);
RETURN(0);
}
-
if (KEY_IS("initial_recov")) {
struct obd_import *imp = exp->exp_obd->u.cli.cl_import;
if (vallen != sizeof(int))
RETURN(-EINVAL);
- req = ptlrpc_prep_req(imp, OST_SET_INFO, 2, size, bufs);
+ req = ptlrpc_prep_req(imp, LUSTRE_OST_VERSION, OST_SET_INFO,
+ 2, size, bufs);
if (req == NULL)
RETURN(-ENOMEM);
spin_unlock(&oscc->oscc_lock);
/* free memory of osc quota cache */
- osc_qinfo_cleanup(cli);
+ lquota_cleanup(quota_interface, obd);
+
+ rc = client_obd_cleanup(obd);
ptlrpc_free_rq_pool(cli->cl_rq_pool);
- rc = client_obd_cleanup(obd);
ptlrpcd_decref();
RETURN(rc);
}
.o_import_event = osc_import_event,
.o_llog_init = osc_llog_init,
.o_llog_finish = osc_llog_finish,
- .o_quotacheck = osc_quotacheck,
- .o_quotactl = osc_quotactl,
};
#if defined(__KERNEL__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
};
#endif
+static quota_interface_t *quota_interface;
+extern quota_interface_t osc_quota_interface;
+
int __init osc_init(void)
{
struct lprocfs_static_vars lvars;
lprocfs_init_vars(osc, &sanlvars);
#endif
+ quota_interface = PORTAL_SYMBOL_GET(osc_quota_interface);
+ lquota_init(quota_interface);
+ init_obd_quota_ops(quota_interface, &osc_obd_ops);
+
rc = class_register_type(&osc_obd_ops, lvars.module_vars,
LUSTRE_OSC_NAME);
- if (rc)
+ if (rc) {
+ if (quota_interface)
+ PORTAL_SYMBOL_PUT(osc_quota_interface);
RETURN(rc);
+ }
#if defined(__KERNEL__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
rc = class_register_type(&sanosc_obd_ops, sanlvars.module_vars,
LUSTRE_SANOSC_NAME);
- if (rc)
+ if (rc) {
class_unregister_type(LUSTRE_OSC_NAME);
+ if (quota_interface)
+ PORTAL_SYMBOL_PUT(osc_quota_interface);
+ RETURN(rc);
+ }
#endif
- rc = osc_qinfo_init();
-
RETURN(rc);
}
#ifdef __KERNEL__
static void /*__exit*/ osc_exit(void)
{
- osc_qinfo_exit();
+ lquota_exit(quota_interface);
+ if (quota_interface)
+ PORTAL_SYMBOL_PUT(osc_quota_interface);
+
#if defined(__KERNEL__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
class_unregister_type(LUSTRE_SANOSC_NAME);
#endif
#include <linux/lustre_quota.h>
#include "ost_internal.h"
-void oti_init(struct obd_trans_info *oti, struct ptlrpc_request *req)
-{
- if (oti == NULL)
- return;
- memset(oti, 0, sizeof *oti);
-
- if (req->rq_repmsg && req->rq_reqmsg != 0)
- oti->oti_transno = req->rq_repmsg->transno;
- oti->oti_thread = req->rq_svc_thread;
-}
+static int ost_num_threads;
+CFS_MODULE_PARM(ost_num_threads, "i", int, 0444,
+ "number of OST service threads to start");
void oti_to_request(struct obd_trans_info *oti, struct ptlrpc_request *req)
{
if (req->rq_repmsg)
req->rq_repmsg->transno = oti->oti_transno;
+ req->rq_transno = oti->oti_transno;
/* XXX 4 == entries in oti_ack_locks??? */
for (ack_lock = oti->oti_ack_locks, i = 0; i < 4; i++, ack_lock++) {
oti->oti_logcookies = obdo_logcookie(&body->oa);
repbody = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*repbody));
memcpy(&repbody->oa, &body->oa, sizeof(body->oa));
- req->rq_status = obd_destroy(exp, &body->oa, NULL, oti);
+ req->rq_status = obd_destroy(exp, &body->oa, NULL, oti, NULL);
RETURN(0);
}
osfs = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*osfs));
req->rq_status = obd_statfs(req->rq_export->exp_obd, osfs, jiffies-HZ);
+ if (OBD_FAIL_CHECK_ONCE(OBD_FAIL_OST_ENOSPC))
+ osfs->os_bfree = osfs->os_bavail = 64;
if (req->rq_status != 0)
CERROR("ost: statfs failed: rc %d\n", req->rq_status);
LASSERT(mode == LCK_PR || mode == LCK_PW);
LASSERT(!lustre_handle_is_used(lh));
+ if (nrbufs == 0 || !(nb[0].flags & OBD_BRW_SRVLOCK))
+ RETURN(0);
+
/* EXPENSIVE ASSERTION */
for (i = 1; i < nrbufs; i ++)
LASSERT((nb[0].flags & OBD_BRW_SRVLOCK) ==
(nb[i].flags & OBD_BRW_SRVLOCK));
- if (nrbufs == 0 || !(nb[0].flags & OBD_BRW_SRVLOCK))
- RETURN(0);
-
policy.l_extent.start = nb[0].offset & CFS_PAGE_MASK;
policy.l_extent.end = (nb[nrbufs - 1].offset +
nb[nrbufs - 1].len - 1) | ~CFS_PAGE_MASK;
}
}
+ /* Check if the client was evicted while we were doing i/o, before
+ * touching the network */
if (rc == 0) {
- rc = ptlrpc_start_bulk_transfer(desc);
+ if (desc->bd_export->exp_failed)
+ rc = -ENOTCONN;
+ else
+ rc = ptlrpc_start_bulk_transfer(desc);
if (rc == 0) {
- lwi = LWI_TIMEOUT(obd_timeout * HZ / 4,
- ost_bulk_timeout, desc);
+ lwi = LWI_TIMEOUT_INTERVAL(obd_timeout * HZ / 4, HZ,
+ ost_bulk_timeout, desc);
rc = l_wait_event(desc->bd_waitq,
- !ptlrpc_bulk_active(desc), &lwi);
+ !ptlrpc_bulk_active(desc) ||
+ desc->bd_export->exp_failed, &lwi);
LASSERT(rc == 0 || rc == -ETIMEDOUT);
if (rc == -ETIMEDOUT) {
DEBUG_REQ(D_ERROR, req, "timeout on bulk PUT");
ptlrpc_abort_bulk(desc);
+ } else if (desc->bd_export->exp_failed) {
+ DEBUG_REQ(D_ERROR, req, "Eviction on bulk PUT");
+ rc = -ENOTCONN;
+ ptlrpc_abort_bulk(desc);
} else if (!desc->bd_success ||
desc->bd_nob_transferred != desc->bd_nob) {
DEBUG_REQ(D_ERROR, req, "%s bulk PUT %d(%d)",
pp_rnb[i].offset & (PAGE_SIZE - 1),
pp_rnb[i].len);
- rc = ptlrpc_start_bulk_transfer (desc);
+ /* Check if the client was evicted while we were doing i/o, before
+ * touching the network */
+ if (desc->bd_export->exp_failed)
+ rc = -ENOTCONN;
+ else
+ rc = ptlrpc_start_bulk_transfer (desc);
if (rc == 0) {
- lwi = LWI_TIMEOUT(obd_timeout * HZ / 4,
- ost_bulk_timeout, desc);
- rc = l_wait_event(desc->bd_waitq, !ptlrpc_bulk_active(desc),
- &lwi);
+ lwi = LWI_TIMEOUT_INTERVAL(obd_timeout * HZ / 4, HZ,
+ ost_bulk_timeout, desc);
+ rc = l_wait_event(desc->bd_waitq, !ptlrpc_bulk_active(desc) ||
+ desc->bd_export->exp_failed, &lwi);
LASSERT(rc == 0 || rc == -ETIMEDOUT);
if (rc == -ETIMEDOUT) {
DEBUG_REQ(D_ERROR, req, "timeout on bulk GET");
ptlrpc_abort_bulk(desc);
+ } else if (desc->bd_export->exp_failed) {
+ DEBUG_REQ(D_ERROR, req, "Eviction on bulk GET");
+ rc = -ENOTCONN;
+ ptlrpc_abort_bulk(desc);
} else if (!desc->bd_success ||
desc->bd_nob_transferred != desc->bd_nob) {
DEBUG_REQ(D_ERROR, req, "%s bulk GET %d(%d)",
RETURN(rc);
}
+static int ost_handle_quotactl(struct ptlrpc_request *req)
+{
+ struct obd_quotactl *oqctl, *repoqc;
+ int rc, size = sizeof(*repoqc);
+ ENTRY;
+
+ oqctl = lustre_swab_reqbuf(req, 0, sizeof(*oqctl),
+ lustre_swab_obd_quotactl);
+ if (oqctl == NULL)
+ GOTO(out, rc = -EPROTO);
+
+ rc = lustre_pack_reply(req, 1, &size, NULL);
+ if (rc)
+ GOTO(out, rc);
+
+ repoqc = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*repoqc));
+
+ req->rq_status = obd_quotactl(req->rq_export, oqctl);
+ *repoqc = *oqctl;
+out:
+ RETURN(rc);
+}
+
+static int ost_handle_quotacheck(struct ptlrpc_request *req)
+{
+ struct obd_quotactl *oqctl;
+ int rc;
+ ENTRY;
+
+ oqctl = lustre_swab_reqbuf(req, 0, sizeof(*oqctl),
+ lustre_swab_obd_quotactl);
+ if (oqctl == NULL)
+ RETURN(-EPROTO);
+
+ rc = lustre_pack_reply(req, 0, NULL, NULL);
+ if (rc) {
+ CERROR("ost: out of memory while packing quotacheck reply\n");
+ RETURN(-ENOMEM);
+ }
+
+ req->rq_status = obd_quotacheck(req->rq_export, oqctl);
+ RETURN(0);
+}
+
static int ost_filter_recovery_request(struct ptlrpc_request *req,
struct obd_device *obd, int *process)
{
}
}
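+/*
+ * Sanity-check the protocol version of an incoming request against the
+ * version expected for its opcode family (OBD, OST, DLM or LOG). The
+ * check is short-circuited by the early return below until message
+ * versioning can be enabled without breaking b1_4 interoperability.
+ */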
+int ost_msg_check_version(struct lustre_msg *msg)
+{
+ int rc;
+
+ /* TODO: enable the check below once msg versioning is really
+ * introduced; it is disabled for now because it would break
+ * compatibility with b1_4.
+ */
+ return (0);
+ switch(msg->opc) {
+ case OST_CONNECT:
+ case OST_DISCONNECT:
+ case OBD_PING:
+ rc = lustre_msg_check_version(msg, LUSTRE_OBD_VERSION);
+ if (rc)
+ CERROR("bad opc %u version %08x, expecting %08x\n",
+ msg->opc, msg->version, LUSTRE_OBD_VERSION);
+ break;
+ case OST_CREATE:
+ case OST_DESTROY:
+ case OST_GETATTR:
+ case OST_SETATTR:
+ case OST_WRITE:
+ case OST_READ:
+ case OST_SAN_READ:
+ case OST_SAN_WRITE:
+ case OST_PUNCH:
+ case OST_STATFS:
+ case OST_SYNC:
+ case OST_SET_INFO:
+ case OST_GET_INFO:
+ case OST_QUOTACHECK:
+ case OST_QUOTACTL:
+ rc = lustre_msg_check_version(msg, LUSTRE_OST_VERSION);
+ if (rc)
+ CERROR("bad opc %u version %08x, expecting %08x\n",
+ msg->opc, msg->version, LUSTRE_OST_VERSION);
+ break;
+ case LDLM_ENQUEUE:
+ case LDLM_CONVERT:
+ case LDLM_CANCEL:
+ case LDLM_BL_CALLBACK:
+ case LDLM_CP_CALLBACK:
+ rc = lustre_msg_check_version(msg, LUSTRE_DLM_VERSION);
+ if (rc)
+ CERROR("bad opc %u version %08x, expecting %08x\n",
+ msg->opc, msg->version, LUSTRE_DLM_VERSION);
+ break;
+ case LLOG_ORIGIN_CONNECT:
+ case OBD_LOG_CANCEL:
+ rc = lustre_msg_check_version(msg, LUSTRE_LOG_VERSION);
+ if (rc)
+ CERROR("bad opc %u version %08x, expecting %08x\n",
+ msg->opc, msg->version, LUSTRE_LOG_VERSION);
+ break;
+ default:
+ CERROR("Unexpected opcode %d\n", msg->opc);
+ rc = -ENOTSUPP;
+ }
+ return rc;
+}
+
static int ost_handle(struct ptlrpc_request *req)
{
struct obd_trans_info trans_info = { 0, };
}
oti_init(oti, req);
+ rc = ost_msg_check_version(req->rq_reqmsg);
+ if (rc)
+ RETURN(rc);
switch (req->rq_reqmsg->opc) {
case OST_CONNECT: {
case OST_QUOTACHECK:
CDEBUG(D_INODE, "quotacheck\n");
OBD_FAIL_RETURN(OBD_FAIL_OST_QUOTACHECK_NET, 0);
- rc = ost_quotacheck(req);
+ rc = ost_handle_quotacheck(req);
break;
case OST_QUOTACTL:
CDEBUG(D_INODE, "quotactl\n");
OBD_FAIL_RETURN(OBD_FAIL_OST_QUOTACTL_NET, 0);
- rc = ost_quotactl(req);
+ rc = ost_handle_quotactl(req);
break;
case OBD_PING:
DEBUG_REQ(D_INODE, req, "ping");
ENTRY;
LASSERT(thread != NULL);
- LASSERT(thread->t_data != NULL);
/*
* be prepared to handle partially-initialized pools (because this is
LASSERT(thread != NULL);
LASSERT(thread->t_data == NULL);
- LASSERT(thread->t_id < OST_NUM_THREADS);
+ LASSERT(thread->t_id < OST_MAX_THREADS);
OBD_ALLOC_PTR(tls);
if (tls != NULL) {
sema_init(&ost->ost_health_sem, 1);
+ if (ost_num_threads < 2)
+ ost_num_threads = OST_DEF_THREADS;
+ if (ost_num_threads > OST_MAX_THREADS)
+ ost_num_threads = OST_MAX_THREADS;
+
ost->ost_service =
ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE,
OST_MAXREPSIZE, OST_REQUEST_PORTAL,
OSC_REPLY_PORTAL,
obd_timeout * 1000, ost_handle, LUSTRE_OSS_NAME,
obd->obd_proc_entry, ost_print_req,
- OST_NUM_THREADS);
+ ost_num_threads);
if (ost->ost_service == NULL) {
CERROR("failed to start service\n");
GOTO(out_lprocfs, rc = -ENOMEM);
OSC_REPLY_PORTAL,
obd_timeout * 1000, ost_handle, "ost_io",
obd->obd_proc_entry, ost_print_req,
- OST_NUM_THREADS);
+ ost_num_threads);
if (ost->ost_io_service == NULL) {
CERROR("failed to start OST I/O service\n");
GOTO(out_create, rc = -ENOMEM);
.o_health_check = ost_health_check,
};
+
static int __init ost_init(void)
{
struct lprocfs_static_vars lvars;
+ int rc;
ENTRY;
- lprocfs_init_vars(ost,&lvars);
- RETURN(class_register_type(&ost_obd_ops, lvars.module_vars,
- LUSTRE_OSS_NAME));
+ lprocfs_init_vars(ost, &lvars);
+ rc = class_register_type(&ost_obd_ops, lvars.module_vars,
+ LUSTRE_OSS_NAME);
+ RETURN(rc);
}
static void /*__exit*/ ost_exit(void)
/*
* tunables for per-thread page pool (bug 5137)
*/
-enum {
- /*
- * pool size in pages
- */
- OST_THREAD_POOL_SIZE = PTLRPC_MAX_BRW_PAGES,
- /*
- * GFP mask used to allocate pages for pool
- */
- OST_THREAD_POOL_GFP = GFP_HIGHUSER
-};
+#define OST_THREAD_POOL_SIZE PTLRPC_MAX_BRW_PAGES /* pool size in pages */
+#define OST_THREAD_POOL_GFP GFP_HIGHUSER /* GFP mask for pool pages */
struct page;
struct niobuf_local;
struct ost_thread_local_cache *ost_tls(struct ptlrpc_request *r);
-#ifdef HAVE_QUOTA_SUPPORT
/* Quota stuff */
-int ost_quotacheck(struct ptlrpc_request *req);
-int ost_quotactl(struct ptlrpc_request *req);
-#else
-static inline int ost_quotacheck(struct ptlrpc_request *req)
-{
- req->rq_status = -ENOTSUPP;
- return -ENOTSUPP;
-}
-static inline int ost_quotactl(struct ptlrpc_request *req)
-{
- req->rq_status = -ENOTSUPP;
- return -ENOTSUPP;
-}
-#endif
+extern quota_interface_t *quota_interface;
#endif /* OST_INTERNAL_H */
ldlm_objs += $(LDLM)ldlm_resource.o $(LDLM)ldlm_lib.o
ldlm_objs += $(LDLM)ldlm_plain.o $(LDLM)ldlm_extent.o
ldlm_objs += $(LDLM)ldlm_request.o $(LDLM)ldlm_lockd.o
-ldlm_objs += $(LDLM)ldlm_flock.o
+ldlm_objs += $(LDLM)ldlm_flock.o $(LDLM)ldlm_inodebits.o
ptlrpc_objs := client.o recover.o connection.o niobuf.o pack_generic.o
ptlrpc_objs += events.o ptlrpc_module.o service.o pinger.o recov_thread.o
ptlrpc_objs += llog_net.o llog_client.o llog_server.o import.o ptlrpcd.o
ptlrpc-objs := $(ldlm_objs) $(ptlrpc_objs)
-ifeq ($(PATCHLEVEL),6)
-#ptlrpc-objs += @top_srcdir@/lustre/mds/quota_context.o
-endif
-
default: all
ldlm_%.c: @LUSTRE@/ldlm/ldlm_%.c
$(top_srcdir)/lustre/ldlm/ldlm_request.c \
$(top_srcdir)/lustre/ldlm/ldlm_lockd.c \
$(top_srcdir)/lustre/ldlm/ldlm_internal.h \
+ $(top_srcdir)/lustre/ldlm/ldlm_inodebits.c \
$(top_srcdir)/lustre/ldlm/ldlm_flock.c
COMMON_SOURCES = client.c recover.c connection.c niobuf.c pack_generic.c \
return request;
}
-struct ptlrpc_request *ptlrpc_prep_req_pool(struct obd_import *imp, int opcode,
- int count, int *lengths,
- char **bufs,
- struct ptlrpc_request_pool *pool)
+struct ptlrpc_request *
+ptlrpc_prep_req_pool(struct obd_import *imp, __u32 version, int opcode,
+ int count, int *lengths, char **bufs,
+ struct ptlrpc_request_pool *pool)
{
struct ptlrpc_request *request = NULL;
int rc;
RETURN(NULL);
}
+#if 0 /* TODO: enable this once msg versioning is really introduced;
+ * disabled for now because it would break compatibility with b1_4.
+ */
+ request->rq_reqmsg->version |= version;
+#endif
if (imp->imp_server_timeout)
request->rq_timeout = obd_timeout / 2;
else
RETURN(request);
}
-struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, int opcode,
- int count, int *lengths, char **bufs)
+struct ptlrpc_request *
+ptlrpc_prep_req(struct obd_import *imp, __u32 version, int opcode,
+ int count, int *lengths, char **bufs)
{
- return ptlrpc_prep_req_pool(imp, opcode, count, lengths, bufs, NULL);
+ return ptlrpc_prep_req_pool(imp, version, opcode, count, lengths,
+ bufs, NULL);
}
-
struct ptlrpc_request_set *ptlrpc_prep_set(void)
{
struct ptlrpc_request_set *set;
libcfs_nid2str(imp->imp_connection->c_peer.nid),
req->rq_reqmsg->opc);
- rc = ptl_send_rpc(req);
+ rc = ptl_send_rpc(req, 0);
if (rc) {
DEBUG_REQ(D_HA, req, "send failed (%d); expect timeout", rc);
req->rq_net_err = 1;
RETURN(0);
}
+/* this sends any unsent RPCs in @set and returns TRUE if all are sent */
int ptlrpc_check_set(struct ptlrpc_request_set *set)
{
unsigned long flags;
}
}
- rc = ptl_send_rpc(req);
+ rc = ptl_send_rpc(req, 0);
if (rc) {
DEBUG_REQ(D_HA, req, "send failed (%d)",
rc);
list_add_tail(&req->rq_list, &imp->imp_sending_list);
spin_unlock_irqrestore(&imp->imp_lock, flags);
- rc = ptl_send_rpc(req);
+ rc = ptl_send_rpc(req, 0);
if (rc) {
DEBUG_REQ(D_HA, req, "send failed (%d); recovering", rc);
timeout = 1;
}
/* Last chance to free reqs left on the replay list, but we
- * will still leak reqs that haven't comitted. */
+ * will still leak reqs that haven't committed. */
if (imp->imp_replayable)
ptlrpc_free_committed(imp);
}
}
- CDEBUG(D_WARNING,"%s->%s\n", uuid->uuid, libcfs_id2str(*peer));
+ CDEBUG(D_NET,"%s->%s\n", uuid->uuid, libcfs_id2str(*peer));
if (rc != 0)
CERROR("No NID found for %s\n", uuid->uuid);
return rc;
#endif
if (rc == 0)
return 0;
-
+
CERROR ("Failed to allocate event queue: %d\n", rc);
LNetNIFini();
imp->imp_conn_cnt++;
imp->imp_resend_replay = 0;
- if (imp->imp_remote_handle.cookie == 0) {
+ if (!lustre_handle_is_used(&imp->imp_remote_handle))
initial_connect = 1;
- } else {
+ else
committed_before_reconnect = imp->imp_peer_committed_transno;
- }
spin_unlock_irqrestore(&imp->imp_lock, flags);
if (imp->imp_initial_recov_bk && initial_connect &&
/* last in list */
(imp->imp_conn_current->oic_item.next == &imp->imp_conn_list)) {
- CERROR("Last connection (%d) for %s, turning off init_recov\n",
+ CDEBUG(D_HA, "Last connection attempt (%d) for %s\n",
imp->imp_conn_cnt, imp->imp_target_uuid.uuid);
/* Don't retry if connect fails */
rc = 0;
if (rc)
GOTO(out, rc);
- request = ptlrpc_prep_req(imp, imp->imp_connect_op, 4, size, tmp);
+ request = ptlrpc_prep_req(imp, LUSTRE_OBD_VERSION, imp->imp_connect_op,
+ 4, size, tmp);
if (!request)
GOTO(out, rc = -ENOMEM);
if (request->rq_repmsg->last_committed < aa->pcaa_peer_committed) {
CERROR("%s went back in time (transno "LPD64
" was previously committed, server now claims "LPD64
- ")! is shared storage not coherent?\n",
- imp->imp_target_uuid.uuid,
- aa->pcaa_peer_committed,
+ ")! See https://bugzilla.clusterfs.com/"
+ "long_list.cgi?buglist=9646\n",
+ imp->imp_target_uuid.uuid, aa->pcaa_peer_committed,
request->rq_repmsg->last_committed);
}
}
} else {
struct obd_connect_data *ocd;
+ struct obd_export *exp;
ocd = lustre_swab_repbuf(request, 0,
sizeof *ocd, lustre_swab_connect);
GOTO(out, rc);
}
spin_lock_irqsave(&imp->imp_lock, flags);
+
/*
* check that server granted subset of flags we asked for.
*/
LASSERT((ocd->ocd_connect_flags &
imp->imp_connect_data.ocd_connect_flags) ==
ocd->ocd_connect_flags);
+
imp->imp_connect_data = *ocd;
+ if (!ocd->ocd_ibits_known &&
+ ocd->ocd_connect_flags & OBD_CONNECT_IBITS)
+ CERROR("Inodebits aware server returned zero compatible"
+ " bits?\n");
+
+ exp = class_conn2export(&imp->imp_dlm_handle);
+ LASSERT(exp);
+ exp->exp_connect_flags = ocd->ocd_connect_flags;
+ class_export_put(exp);
+
obd_import_event(imp->imp_obd, imp, IMP_EVENT_OCD);
if ((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
#else
char *action = "recompiling this application";
#endif
-
+
CWARN("Server %s version (%d.%d.%d.%d) is much newer. "
"Consider %s (%s).\n",
imp->imp_target_uuid.uuid,
out:
if (rc != 0) {
-
IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON);
- if (aa->pcaa_initial_connect && !imp->imp_initial_recov) {
+ if (aa->pcaa_initial_connect && !imp->imp_initial_recov)
ptlrpc_deactivate_import(imp);
- }
if (rc == -EPROTO) {
struct obd_connect_data *ocd;
ocd = lustre_swab_repbuf(request, 0,
sizeof *ocd,
lustre_swab_connect);
- if (ocd &&
- (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
+ if (ocd &&
+ (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
(ocd->ocd_version != LUSTRE_VERSION_CODE)) {
/* Actually servers are only supposed to refuse
connection from liblustre clients, so we should
never see this from VFS context */
- CERROR("Server %s version (%d.%d.%d.%d) refused"
- " connection from this client as too old "
- "version (%s). Client must be "
- "recompiled\n",
+ CERROR("Server %s version (%d.%d.%d.%d) "
+ "refused connection from this client "
+ "as too old version (%s). Client must "
+ "be recompiled\n",
imp->imp_target_uuid.uuid,
OBD_OCD_VERSION_MAJOR(ocd->ocd_version),
OBD_OCD_VERSION_MINOR(ocd->ocd_version),
OBD_OCD_VERSION_FIX(ocd->ocd_version),
LUSTRE_VERSION_STRING);
IMPORT_SET_STATE(imp, LUSTRE_IMP_CLOSED);
- RETURN(-EPROTO);
}
+ RETURN(-EPROTO);
}
-
+
ptlrpc_maybe_ping_import_soon(imp);
CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n",
LASSERT(atomic_read(&imp->imp_replay_inflight) == 0);
atomic_inc(&imp->imp_replay_inflight);
- req = ptlrpc_prep_req(imp, OBD_PING, 0, NULL, NULL);
+ req = ptlrpc_prep_req(imp, LUSTRE_OBD_VERSION, OBD_PING,
+ 0, NULL, NULL);
if (!req) {
atomic_dec(&imp->imp_replay_inflight);
RETURN(-ENOMEM);
spin_unlock_irqrestore(&imp->imp_lock, flags);
- request = ptlrpc_prep_req(imp, rq_opc, 0, NULL, NULL);
+ request = ptlrpc_prep_req(imp, LUSTRE_OBD_VERSION, rq_opc,
+ 0, NULL, NULL);
if (request) {
/* We are disconnecting, do not retry a failed DISCONNECT rpc if
* it fails. We can get through the above with a down server
bufcount++;
}
- req = ptlrpc_prep_req(imp, LLOG_ORIGIN_HANDLE_CREATE,
- bufcount, size, tmp);
+ req = ptlrpc_prep_req(imp, LUSTRE_LOG_VERSION,
+ LLOG_ORIGIN_HANDLE_CREATE, bufcount, size, tmp);
if (!req)
GOTO(err_free, rc = -ENOMEM);
goto out;
}
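+/*
+ * Client side of llog destroy: send LLOG_ORIGIN_HANDLE_DESTROY carrying
+ * the log id and header flags so that the server can locate the log and
+ * destroy it on disk.
+ */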
+static int llog_client_destroy(struct llog_handle *loghandle)
+{
+ struct obd_import *imp = loghandle->lgh_ctxt->loc_imp;
+ struct ptlrpc_request *req = NULL;
+ struct llogd_body *body;
+ int size = sizeof(*body);
+ int repsize[2] = {sizeof (*body)};
+ int rc;
+ ENTRY;
+
+ req = ptlrpc_prep_req(imp, LUSTRE_LOG_VERSION,
+ LLOG_ORIGIN_HANDLE_DESTROY, 1, &size, NULL);
+ if (!req)
+ RETURN(-ENOMEM);
+
+ body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
+ body->lgd_logid = loghandle->lgh_id;
+ body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags;
+
+ req->rq_replen = lustre_msg_size(1, repsize);
+ rc = ptlrpc_queue_wait(req);
+
+ ptlrpc_req_finished(req);
+ RETURN(rc);
+}
+
static int llog_client_next_block(struct llog_handle *loghandle,
int *cur_idx, int next_idx,
int rc;
ENTRY;
- req = ptlrpc_prep_req(imp, LLOG_ORIGIN_HANDLE_NEXT_BLOCK, 1,&size,NULL);
+ req = ptlrpc_prep_req(imp, LUSTRE_LOG_VERSION,
+ LLOG_ORIGIN_HANDLE_NEXT_BLOCK, 1,&size,NULL);
if (!req)
GOTO(out, rc = -ENOMEM);
RETURN(rc);
}
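+/*
+ * Client side of llog_prev_block: ask the server for the log chunk that
+ * precedes @prev_idx and copy the returned block (reply segment 1, @len
+ * bytes) into @buf.
+ */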
+static int llog_client_prev_block(struct llog_handle *loghandle,
+ int prev_idx, void *buf, int len)
+{
+ struct obd_import *imp = loghandle->lgh_ctxt->loc_imp;
+ struct ptlrpc_request *req = NULL;
+ struct llogd_body *body;
+ void * ptr;
+ int size = sizeof(*body);
+ int repsize[2] = {sizeof (*body)};
+ int rc;
+ ENTRY;
+
+ req = ptlrpc_prep_req(imp, LUSTRE_LOG_VERSION,
+ LLOG_ORIGIN_HANDLE_PREV_BLOCK, 1,&size,NULL);
+ if (!req)
+ GOTO(out, rc = -ENOMEM);
+
+ body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
+ body->lgd_logid = loghandle->lgh_id;
+ body->lgd_ctxt_idx = loghandle->lgh_ctxt->loc_idx - 1;
+ body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags;
+ body->lgd_index = prev_idx;
+ body->lgd_len = len;
+ repsize[1] = len;
+
+ req->rq_replen = lustre_msg_size(2, repsize);
+ rc = ptlrpc_queue_wait(req);
+ if (rc)
+ GOTO(out, rc);
+
+ body = lustre_swab_repbuf(req, 0, sizeof(*body),
+ lustre_swab_llogd_body);
+ if (body == NULL) {
+ CERROR ("Can't unpack llogd_body\n");
+ GOTO(out, rc = -EFAULT);
+ }
+
+ ptr = lustre_msg_buf(req->rq_repmsg, 1, len);
+ if (ptr == NULL) {
+ CERROR ("Can't unpack bitmap\n");
+ GOTO(out, rc = -EFAULT);
+ }
+
+ memcpy(buf, ptr, len);
+
+out:
+ if (req)
+ ptlrpc_req_finished(req);
+ RETURN(rc);
+}
static int llog_client_read_header(struct llog_handle *handle)
{
int rc;
ENTRY;
- req = ptlrpc_prep_req(imp, LLOG_ORIGIN_HANDLE_READ_HEADER,
- 1, &size, NULL);
+ req = ptlrpc_prep_req(imp, LUSTRE_LOG_VERSION,
+ LLOG_ORIGIN_HANDLE_READ_HEADER, 1, &size, NULL);
if (!req)
GOTO(out, rc = -ENOMEM);
struct llog_operations llog_client_ops = {
lop_next_block: llog_client_next_block,
+ lop_prev_block: llog_client_prev_block,
lop_read_header: llog_client_read_header,
lop_create: llog_client_create,
+ lop_destroy: llog_client_destroy,
lop_close: llog_client_close,
};
lgr->lgr_hdr.lrh_len = lgr->lgr_tail.lrt_len = sizeof(*lgr);
lgr->lgr_hdr.lrh_type = LLOG_GEN_REC;
lgr->lgr_gen = ctxt->loc_gen;
- rc = llog_add(ctxt, &lgr->lgr_hdr, NULL, NULL, 1, NULL);
+ rc = llog_add(ctxt, &lgr->lgr_hdr, NULL, NULL, 1);
OBD_FREE(lgr, sizeof(*lgr));
if (rc != 1)
RETURN(rc);
LASSERT(ctxt->loc_imp);
imp = ctxt->loc_imp;
- request = ptlrpc_prep_req(imp, LLOG_ORIGIN_CONNECT, 1, &size, NULL);
+ request = ptlrpc_prep_req(imp, LUSTRE_LOG_VERSION,
+ LLOG_ORIGIN_CONNECT, 1, &size, NULL);
if (!request)
RETURN(-ENOMEM);
RETURN(rc);
}
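+/*
+ * Server side of llog destroy: open the log named in the request body
+ * (lgd_logid, if set), initialize a plain handle for it, destroy it and
+ * free the handle. The log id is echoed back in the reply.
+ */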
+int llog_origin_handle_destroy(struct ptlrpc_request *req)
+{
+ struct obd_export *exp = req->rq_export;
+ struct obd_device *obd = exp->exp_obd;
+ struct obd_device *disk_obd;
+ struct llog_handle *loghandle;
+ struct llogd_body *body;
+ struct lvfs_run_ctxt saved;
+ struct llog_logid *logid = NULL;
+ struct llog_ctxt *ctxt;
+ int size = sizeof (*body);
+ int rc;
+ __u32 flags;
+ ENTRY;
+
+ body = lustre_swab_reqbuf(req, 0, sizeof(*body),
+ lustre_swab_llogd_body);
+ if (body == NULL) {
+ CERROR ("Can't unpack llogd_body\n");
+ GOTO(out, rc = -EFAULT);
+ }
+
+ if (body->lgd_logid.lgl_oid > 0)
+ logid = &body->lgd_logid;
+
+ ctxt = llog_get_context(obd, body->lgd_ctxt_idx);
+ if (ctxt == NULL)
+ GOTO(out, rc = -EINVAL);
+ disk_obd = ctxt->loc_exp->exp_obd;
+ push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+
+ rc = llog_create(ctxt, &loghandle, logid, NULL);
+ if (rc)
+ GOTO(out_pop, rc);
+
+ rc = lustre_pack_reply(req, 1, &size, NULL);
+ if (rc)
+ GOTO(out_close, rc = -ENOMEM);
+
+ body = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*body));
+ body->lgd_logid = loghandle->lgh_id;
+ flags = body->lgd_llh_flags;
+ rc = llog_init_handle(loghandle, LLOG_F_IS_PLAIN, NULL);
+ if (rc)
+ GOTO(out_close, rc);
+ rc = llog_destroy(loghandle);
+ if (rc)
+ GOTO(out_close, rc);
+ llog_free_handle(loghandle);
+
+out_close:
+ if (rc)
+ llog_close(loghandle);
+out_pop:
+ pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+out:
+ RETURN(rc);
+}
+
int llog_origin_handle_next_block(struct ptlrpc_request *req)
{
struct obd_export *exp = req->rq_export;
RETURN(rc);
}
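+/*
+ * Server side of llog_prev_block: open the requested log, read the chunk
+ * preceding lgd_index into a temporary LLOG_CHUNK_SIZE buffer, and return
+ * it to the client as the second reply segment.
+ */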
+int llog_origin_handle_prev_block(struct ptlrpc_request *req)
+{
+ struct obd_export *exp = req->rq_export;
+ struct obd_device *obd = exp->exp_obd;
+ struct llog_handle *loghandle;
+ struct llogd_body *body;
+ struct obd_device *disk_obd;
+ struct lvfs_run_ctxt saved;
+ struct llog_ctxt *ctxt;
+ __u32 flags;
+ __u8 *buf;
+ void * ptr;
+ int size[] = {sizeof (*body),
+ LLOG_CHUNK_SIZE};
+ int rc, rc2;
+ ENTRY;
+
+ body = lustre_swab_reqbuf(req, 0, sizeof(*body),
+ lustre_swab_llogd_body);
+ if (body == NULL) {
+ CERROR ("Can't unpack llogd_body\n");
+ GOTO(out, rc = -EFAULT);
+ }
+
+ OBD_ALLOC(buf, LLOG_CHUNK_SIZE);
+ if (!buf)
+ GOTO(out, rc = -ENOMEM);
+
+ ctxt = llog_get_context(obd, body->lgd_ctxt_idx);
+ LASSERT(ctxt != NULL);
+ disk_obd = ctxt->loc_exp->exp_obd;
+ push_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+
+ rc = llog_create(ctxt, &loghandle, &body->lgd_logid, NULL);
+ if (rc)
+ GOTO(out_pop, rc);
+
+ flags = body->lgd_llh_flags;
+ rc = llog_init_handle(loghandle, flags, NULL);
+ if (rc)
+ GOTO(out_close, rc);
+
+ memset(buf, 0, LLOG_CHUNK_SIZE);
+ rc = llog_prev_block(loghandle, body->lgd_index,
+ buf, LLOG_CHUNK_SIZE);
+ if (rc)
+ GOTO(out_close, rc);
+
+
+ rc = lustre_pack_reply(req, 2, size, NULL);
+ if (rc)
+ GOTO(out_close, rc = -ENOMEM);
+
+ ptr = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*body));
+ memcpy(ptr, body, sizeof(*body));
+
+ ptr = lustre_msg_buf(req->rq_repmsg, 1, LLOG_CHUNK_SIZE);
+ memcpy(ptr, buf, LLOG_CHUNK_SIZE);
+
+out_close:
+ rc2 = llog_close(loghandle);
+ if (!rc)
+ rc = rc2;
+
+out_pop:
+ pop_ctxt(&saved, &disk_obd->obd_lvfs_ctxt, NULL);
+ OBD_FREE(buf, LLOG_CHUNK_SIZE);
+out:
+ RETURN(rc);
+}
+
int llog_origin_handle_read_header(struct ptlrpc_request *req)
{
struct obd_export *exp = req->rq_export;
LBUG();
return 0;
}
+
+int llog_origin_handle_destroy(struct ptlrpc_request *req)
+{
+ LBUG();
+ return 0;
+}
+
int llog_origin_handle_next_block(struct ptlrpc_request *req)
{
LBUG();
return 0;
}
+int llog_origin_handle_prev_block(struct ptlrpc_request *req)
+{
+ LBUG();
+ return 0;
+}
int llog_origin_handle_read_header(struct ptlrpc_request *req)
{
LBUG();
int rc;
ENTRY;
- req = ptlrpc_prep_req(obd->u.cli.cl_import, OBD_PING, 0, NULL, NULL);
+ req = ptlrpc_prep_req(obd->u.cli.cl_import, LUSTRE_OBD_VERSION,
+ OBD_PING, 0, NULL, NULL);
if (req == NULL)
RETURN(-ENOMEM);
if (!ptlrpc_bulk_active(desc)) /* completed or */
return; /* never started */
+ /* Do not send any meaningful data over the wire for evicted clients */
+ if (desc->bd_export && desc->bd_export->exp_failed)
+ ptl_rpc_wipe_bulk_pages(desc);
+
/* The unlink ensures the callback happens ASAP and is the last
* one. If it fails, it must be because completion just happened,
* but we must still l_wait_event() in this case, to give liblustre
RETURN(rc);
}
-int ptl_send_rpc_nowait(struct ptlrpc_request *request)
-{
- int rc;
- struct ptlrpc_connection *connection;
- unsigned long flags;
- ENTRY;
-
- LASSERT (request->rq_type == PTL_RPC_MSG_REQUEST);
-
- if (request->rq_import->imp_obd &&
- request->rq_import->imp_obd->obd_fail) {
- CDEBUG(D_HA, "muting rpc for failed imp obd %s\n",
- request->rq_import->imp_obd->obd_name);
- /* this prevents us from waiting in ptlrpc_queue_wait */
- request->rq_err = 1;
- RETURN(-ENODEV);
- }
-
- connection = request->rq_import->imp_connection;
-
- request->rq_reqmsg->handle = request->rq_import->imp_remote_handle;
- request->rq_reqmsg->type = PTL_RPC_MSG_REQUEST;
- request->rq_reqmsg->conn_cnt = request->rq_import->imp_conn_cnt;
-
- spin_lock_irqsave (&request->rq_lock, flags);
- /* If the MD attach succeeds, there _will_ be a reply_in callback */
- request->rq_receiving_reply = 0;
- /* Clear any flags that may be present from previous sends. */
- request->rq_replied = 0;
- request->rq_err = 0;
- request->rq_timedout = 0;
- request->rq_net_err = 0;
- request->rq_resend = 0;
- request->rq_restart = 0;
- spin_unlock_irqrestore (&request->rq_lock, flags);
-
- ptlrpc_request_addref(request); /* +1 ref for the SENT callback */
-
- request->rq_sent = CURRENT_SECONDS;
- ptlrpc_pinger_sending_on_import(request->rq_import);
- rc = ptl_send_buf(&request->rq_req_md_h,
- request->rq_reqmsg, request->rq_reqlen,
- LNET_NOACK_REQ, &request->rq_req_cbid,
- connection,
- request->rq_request_portal,
- request->rq_xid);
- if (rc == 0) {
- ptlrpc_lprocfs_rpc_sent(request);
- } else {
- ptlrpc_req_finished (request); /* drop callback ref */
- }
-
- return rc;
-}
-
-
-int ptl_send_rpc(struct ptlrpc_request *request)
+int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
{
int rc;
int rc2;
request->rq_reqmsg->type = PTL_RPC_MSG_REQUEST;
request->rq_reqmsg->conn_cnt = request->rq_import->imp_conn_cnt;
- LASSERT (request->rq_replen != 0);
- if (request->rq_repmsg == NULL)
- OBD_ALLOC(request->rq_repmsg, request->rq_replen);
- if (request->rq_repmsg == NULL)
- GOTO(cleanup_bulk, rc = -ENOMEM);
-
- rc = LNetMEAttach(request->rq_reply_portal, /* XXX FIXME bug 249 */
- connection->c_peer, request->rq_xid, 0,
- LNET_UNLINK, LNET_INS_AFTER, &reply_me_h);
- if (rc != 0) {
- CERROR("LNetMEAttach failed: %d\n", rc);
- LASSERT (rc == -ENOMEM);
- GOTO(cleanup_repmsg, rc = -ENOMEM);
+ if (!noreply) {
+ LASSERT (request->rq_replen != 0);
+ if (request->rq_repmsg == NULL)
+ OBD_ALLOC(request->rq_repmsg, request->rq_replen);
+ if (request->rq_repmsg == NULL)
+ GOTO(cleanup_bulk, rc = -ENOMEM);
+
+ rc = LNetMEAttach(request->rq_reply_portal,/*XXX FIXME bug 249*/
+ connection->c_peer, request->rq_xid, 0,
+ LNET_UNLINK, LNET_INS_AFTER, &reply_me_h);
+ if (rc != 0) {
+ CERROR("LNetMEAttach failed: %d\n", rc);
+ LASSERT (rc == -ENOMEM);
+ GOTO(cleanup_repmsg, rc = -ENOMEM);
+ }
}
spin_lock_irqsave (&request->rq_lock, flags);
/* If the MD attach succeeds, there _will_ be a reply_in callback */
- request->rq_receiving_reply = 1;
+ request->rq_receiving_reply = !noreply;
/* Clear any flags that may be present from previous sends. */
request->rq_replied = 0;
request->rq_err = 0;
request->rq_restart = 0;
spin_unlock_irqrestore (&request->rq_lock, flags);
- reply_md.start = request->rq_repmsg;
- reply_md.length = request->rq_replen;
- reply_md.threshold = 1;
- reply_md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT;
- reply_md.user_ptr = &request->rq_reply_cbid;
- reply_md.eq_handle = ptlrpc_eq_h;
-
- rc = LNetMDAttach(reply_me_h, reply_md, LNET_UNLINK,
- &request->rq_reply_md_h);
- if (rc != 0) {
- CERROR("LNetMDAttach failed: %d\n", rc);
- LASSERT (rc == -ENOMEM);
- spin_lock_irqsave (&request->rq_lock, flags);
- /* ...but the MD attach didn't succeed... */
- request->rq_receiving_reply = 0;
- spin_unlock_irqrestore (&request->rq_lock, flags);
- GOTO(cleanup_me, rc -ENOMEM);
+ if (!noreply) {
+ reply_md.start = request->rq_repmsg;
+ reply_md.length = request->rq_replen;
+ reply_md.threshold = 1;
+ reply_md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT;
+ reply_md.user_ptr = &request->rq_reply_cbid;
+ reply_md.eq_handle = ptlrpc_eq_h;
+
+ rc = LNetMDAttach(reply_me_h, reply_md, LNET_UNLINK,
+ &request->rq_reply_md_h);
+ if (rc != 0) {
+ CERROR("LNetMDAttach failed: %d\n", rc);
+ LASSERT (rc == -ENOMEM);
+ spin_lock_irqsave (&request->rq_lock, flags);
+ /* ...but the MD attach didn't succeed... */
+ request->rq_receiving_reply = 0;
+ spin_unlock_irqrestore (&request->rq_lock, flags);
+ GOTO(cleanup_me, rc = -ENOMEM);
+ }
+
+ CDEBUG(D_NET, "Setup reply buffer: %u bytes, xid "LPU64
+ ", portal %u\n",
+ request->rq_replen, request->rq_xid,
+ request->rq_reply_portal);
}
- CDEBUG(D_NET, "Setup reply buffer: %u bytes, xid "LPU64
- ", portal %u\n",
- request->rq_replen, request->rq_xid,
- request->rq_reply_portal);
-
ptlrpc_request_addref(request); /* +1 ref for the SENT callback */
request->rq_sent = CURRENT_SECONDS;
ptlrpc_req_finished (request); /* drop callback ref */
+ if (noreply)
+ RETURN(rc);
+ else
+ GOTO(cleanup_me, rc);
cleanup_me:
/* MEUnlink is safe; the PUT didn't even get off the ground, and
* nobody apart from the PUT's target has the right nid+XID to
return (msg->magic == __swab32(PTLRPC_MSG_MAGIC));
}
+int lustre_msg_check_version(struct lustre_msg *msg, __u32 version)
+{
+ if (lustre_msg_swabbed(msg))
+ return (__swab32(msg->version) & LUSTRE_VERSION_MASK) != version;
+
+ return (msg->version & LUSTRE_VERSION_MASK) != version;
+}
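+
+/*
+ * Usage sketch (cf. ost_msg_check_version() in the OST handler): callers
+ * compare the incoming message against the version expected for its
+ * opcode, e.g.
+ *
+ *	if (lustre_msg_check_version(msg, LUSTRE_OST_VERSION))
+ *		CERROR("bad opc %u version %08x\n", msg->opc, msg->version);
+ *
+ * Only the bits under LUSTRE_VERSION_MASK take part in the comparison,
+ * so unrelated flag bits in msg->version are ignored.
+ */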
+
static void
lustre_init_msg (struct lustre_msg *msg, int count, int *lens, char **bufs)
{
RETURN (0);
}
+/*
+ * shrink @segment to size @newlen. if @move_data is non-zero, we also move
+ * data forward from @segment + 1.
+ *
+ * if @newlen == 0, we remove the segment completely, but we still keep the
+ * total bufcount the same to avoid unnecessary data moving. this leaves an
+ * unused segment with size 0 at the tail, but that's ok.
+ *
+ * CAUTION:
+ * + if any buffers higher than @segment have been filled in, shrink must
+ * be called with non-zero @move_data.
+ * + the caller should NOT keep pointers to msg buffers higher than
+ * @segment after calling shrink.
+ */
+void lustre_shrink_reply(struct ptlrpc_request *req,
+ int segment, unsigned int newlen, int move_data)
+{
+ struct lustre_msg *msg = req->rq_repmsg;
+ char *tail = NULL, *newpos;
+ int tail_len = 0, n;
+
+ LASSERT(req->rq_reply_state);
+ LASSERT(msg);
+ LASSERT(msg->bufcount > segment);
+ LASSERT(msg->buflens[segment] >= newlen);
+
+ if (msg->buflens[segment] == newlen)
+ return;
+
+ if (move_data && msg->bufcount > segment + 1) {
+ tail = lustre_msg_buf(msg, segment + 1, 0);
+ for (n = segment + 1; n < msg->bufcount; n++)
+ tail_len += size_round(msg->buflens[n]);
+ }
+
+ msg->buflens[segment] = newlen;
+
+ if (tail && tail_len) {
+ newpos = lustre_msg_buf(msg, segment + 1, 0);
+ LASSERT(newpos <= tail);
+ if (newpos != tail)
+ memcpy(newpos, tail, tail_len);
+ }
+
+ if (newlen == 0 && msg->bufcount > segment + 1) {
+ memmove(&msg->buflens[segment], &msg->buflens[segment + 1],
+ (msg->bufcount - segment - 1) * sizeof(__u32));
+ msg->buflens[msg->bufcount - 1] = 0;
+ }
+
+ req->rq_replen = lustre_msg_size(msg->bufcount, msg->buflens);
+}
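+
+/*
+ * Usage sketch (illustrative only; "easize" stands for whatever size the
+ * handler actually used): a service that packed a worst-case buffer can
+ * trim the reply before sending it, e.g.
+ *
+ *	lustre_shrink_reply(req, 1, easize, 1);
+ *
+ * Here segment 1 is assumed to hold the oversized buffer; @move_data is
+ * non-zero because segments beyond it may already be filled in.
+ */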
+
void lustre_free_reply_state (struct ptlrpc_reply_state *rs)
{
PTLRPC_RS_DEBUG_LRU_DEL(rs);
RETURN (-EINVAL);
}
- if (m->version != PTLRPC_MSG_VERSION) {
+ if ((m->version & ~LUSTRE_VERSION_MASK) != PTLRPC_MSG_VERSION) {
CERROR("wrong lustre_msg version %#08x\n", m->version);
RETURN (-EINVAL);
}
__swab32s (&ocd->ocd_grant);
__swab32s (&ocd->ocd_index);
__swab32s (&ocd->ocd_unused);
- CLASSERT(offsetof(typeof(*ocd), padding1) != 0);
+ __swab64s (&ocd->ocd_ibits_known);
CLASSERT(offsetof(typeof(*ocd), padding2) != 0);
CLASSERT(offsetof(typeof(*ocd), padding3) != 0);
CLASSERT(offsetof(typeof(*ocd), padding4) != 0);
__swab32s (&o->o_misc);
__swab32s (&o->o_easize);
__swab32s (&o->o_mds);
- CLASSERT(offsetof(typeof(*o), o_padding_1) != 0);
- CLASSERT(offsetof(typeof(*o), o_padding_2) != 0);
+ __swab32s (&o->o_stripe_idx);
+ __swab32s (&o->o_padding_1);
/* o_inline is opaque */
}
__swab32s (&os->os_bsize);
__swab32s (&os->os_namelen);
__swab64s (&os->os_maxbytes);
- CLASSERT(offsetof(typeof(*os), os_spare) != 0);
+ __swab32s (&os->os_state);
+ /* no need to swap os_spare */
}
void lustre_swab_obd_ioobj (struct obd_ioobj *ioo)
__swab32s (&b->generation);
__swab32s (&b->suppgid);
__swab32s (&b->eadatasize);
- CLASSERT(offsetof(typeof(*b), padding_1) != 0);
- CLASSERT(offsetof(typeof(*b), padding_2) != 0);
- CLASSERT(offsetof(typeof(*b), padding_3) != 0);
- CLASSERT(offsetof(typeof(*b), padding_4) != 0);
+ __swab32s (&b->aclsize);
+ __swab32s (&b->max_mdsize);
+ __swab32s (&b->max_cookiesize);
+ __swab32s (&b->padding_4);
}
void lustre_swab_mgs_target_info(struct mgmt_target_info *mti)
{
int i;
+ LASSERT(sizeof(lnet_nid_t) == sizeof(__u64));
for (i = 0; i < MTI_NIDS_MAX; i++) {
__swab64s(&mti->mti_nids[i]);
__swab64s(&mti->mti_failnids[i]);
CLASSERT(offsetof(typeof(*sa), sa_padding) != 0);
}
+void lustre_swab_mds_rec_join (struct mds_rec_join *jr)
+{
+ __swab64s(&jr->jr_headsize);
+ lustre_swab_ll_fid(&jr->jr_fid);
+}
+
void lustre_swab_mds_rec_create (struct mds_rec_create *cr)
{
__swab32s (&cr->cr_opcode);
EXIT;
}
+static void print_lumj (struct lov_user_md_join *lumj)
+{
+ CDEBUG(D_OTHER, "lov_user_md %p:\n", lumj);
+ CDEBUG(D_OTHER, "\tlmm_magic: %#x\n", lumj->lmm_magic);
+ CDEBUG(D_OTHER, "\tlmm_pattern: %#x\n", lumj->lmm_pattern);
+ CDEBUG(D_OTHER, "\tlmm_object_id: "LPU64"\n", lumj->lmm_object_id);
+ CDEBUG(D_OTHER, "\tlmm_object_gr: "LPU64"\n", lumj->lmm_object_gr);
+ CDEBUG(D_OTHER, "\tlmm_stripe_size: %#x\n", lumj->lmm_stripe_size);
+ CDEBUG(D_OTHER, "\tlmm_stripe_count: %#x\n", lumj->lmm_stripe_count);
+ CDEBUG(D_OTHER, "\tlmm_extent_count: %#x\n", lumj->lmm_extent_count);
+}
+
+void lustre_swab_lov_user_md_join(struct lov_user_md_join *lumj)
+{
+ ENTRY;
+ CDEBUG(D_IOCTL, "swabbing lov_user_md_join\n");
+ __swab32s(&lumj->lmm_magic);
+ __swab32s(&lumj->lmm_pattern);
+ __swab64s(&lumj->lmm_object_id);
+ __swab64s(&lumj->lmm_object_gr);
+ __swab32s(&lumj->lmm_stripe_size);
+ __swab32s(&lumj->lmm_stripe_count);
+ __swab32s(&lumj->lmm_extent_count);
+ print_lumj(lumj);
+ EXIT;
+}
+
static void print_lum_objs(struct lov_user_md *lum)
{
struct lov_user_ost_data *lod;
void lustre_assert_wire_constants(void)
{
/* Wire protocol assertions generated by 'wirecheck'
- * running on Linux schatzie.adilger.int 2.6.12-1.1378_FC3 #1 Wed Sep 14 04:24:31 EDT 2005 i6
+ * running on Linux schatzie.adilger.int 2.6.12-1.1381_FC3 #1 Fri Oct 21 03:46:55 EDT 2005 i6
* with gcc version 3.3.4 20040817 (Red Hat Linux 3.3.4-2) */
(long long)MDS_STATUS_CONN);
LASSERTF(MDS_STATUS_LOV == 2, " found %lld\n",
(long long)MDS_STATUS_LOV);
- LASSERTF(MDS_OPEN_HAS_EA == 1073741824, " found %lld\n",
- (long long)MDS_OPEN_HAS_EA);
LASSERTF(LDLM_ENQUEUE == 101, " found %lld\n",
(long long)LDLM_ENQUEUE);
LASSERTF(LDLM_CONVERT == 102, " found %lld\n",
(long long)OBD_CONNECT_GRANT);
LASSERTF(OBD_CONNECT_SRVLOCK == 16, " found %lld\n",
(long long)OBD_CONNECT_SRVLOCK);
+ LASSERTF(OBD_CONNECT_VERSION == 32, " found %lld\n",
+ (long long)OBD_CONNECT_VERSION);
+ LASSERTF(OBD_CONNECT_REQPORTAL == 64, " found %lld\n",
+ (long long)OBD_CONNECT_REQPORTAL);
LASSERTF(OBD_CONNECT_ACL == 128, " found %lld\n",
(long long)OBD_CONNECT_ACL);
LASSERTF(OBD_CONNECT_XATTR == 256, " found %lld\n",
(long long)OBD_CONNECT_XATTR);
LASSERTF(OBD_CONNECT_CROW == 512, " found %lld\n",
(long long)OBD_CONNECT_CROW);
+ LASSERTF(OBD_CONNECT_TRUNCLOCK == 1024, " found %lld\n",
+ (long long)OBD_CONNECT_TRUNCLOCK);
+ LASSERTF(OBD_CONNECT_TRANSNO == 2048, " found %lld\n",
+ (long long)OBD_CONNECT_TRANSNO);
/* Sizes and Offsets */
(long long)(int)offsetof(struct obdo, o_mds));
LASSERTF((int)sizeof(((struct obdo *)0)->o_mds) == 4, " found %lld\n",
(long long)(int)sizeof(((struct obdo *)0)->o_mds));
- LASSERTF((int)offsetof(struct obdo, o_padding_1) == 120, " found %lld\n",
+ LASSERTF((int)offsetof(struct obdo, o_padding_1) == 124, " found %lld\n",
(long long)(int)offsetof(struct obdo, o_padding_1));
LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_1) == 4, " found %lld\n",
(long long)(int)sizeof(((struct obdo *)0)->o_padding_1));
- LASSERTF((int)offsetof(struct obdo, o_padding_2) == 124, " found %lld\n",
- (long long)(int)offsetof(struct obdo, o_padding_2));
- LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_2) == 4, " found %lld\n",
- (long long)(int)sizeof(((struct obdo *)0)->o_padding_2));
LASSERTF((int)offsetof(struct obdo, o_inline) == 128, " found %lld\n",
(long long)(int)offsetof(struct obdo, o_inline));
LASSERTF((int)sizeof(((struct obdo *)0)->o_inline) == 80, " found %lld\n",
(long long)OBD_MD_FLCOOKIE);
LASSERTF(OBD_MD_FLGROUP == 16777216, " found %lld\n",
(long long)OBD_MD_FLGROUP);
- LASSERTF(OBD_MD_FLIFID == 33554432, " found %lld\n",
- (long long)OBD_MD_FLIFID);
+ LASSERTF(OBD_MD_FLFID == 33554432, " found %lld\n",
+ (long long)OBD_MD_FLFID);
LASSERTF(OBD_MD_FLEPOCH == 67108864, " found %lld\n",
(long long)OBD_MD_FLEPOCH);
LASSERTF(OBD_MD_FLGRANT == 134217728, " found %lld\n",
(long long)(int)offsetof(struct obd_statfs, os_namelen));
LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_namelen) == 4, " found %lld\n",
(long long)(int)sizeof(((struct obd_statfs *)0)->os_namelen));
- LASSERTF((int)offsetof(struct obd_statfs, os_spare) == 104, " found %lld\n",
- (long long)(int)offsetof(struct obd_statfs, os_spare));
- LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare) == 40, " found %lld\n",
- (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare));
+ LASSERTF((int)offsetof(struct obd_statfs, os_state) == 104, " found %lld\n",
+ (long long)(int)offsetof(struct obd_statfs, os_state));
+ LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_state) == 4, " found %lld\n",
+ (long long)(int)sizeof(((struct obd_statfs *)0)->os_state));
/* Checks for struct obd_ioobj */
LASSERTF((int)sizeof(struct obd_ioobj) == 24, " found %lld\n",
(long long)(int)offsetof(struct obd_dqblk, padding));
LASSERTF((int)sizeof(((struct obd_dqblk *)0)->padding) == 4, " found %lld\n",
(long long)(int)sizeof(((struct obd_dqblk *)0)->padding));
+ LASSERTF(Q_QUOTACHECK == 0x800100," found %lld\n",
+ (long long)Q_QUOTACHECK);
+ LASSERTF(Q_INITQUOTA == 0x800101," found %lld\n",
+ (long long)Q_INITQUOTA);
+ LASSERTF(Q_GETOINFO == 0x800102," found %lld\n",
+ (long long)Q_GETOINFO);
+ LASSERTF(Q_GETOQUOTA == 0x800103," found %lld\n",
+ (long long)Q_GETOQUOTA);
/* Checks for struct niobuf_remote */
LASSERTF((int)sizeof(struct niobuf_remote) == 16, " found %lld\n",
(long long)(int)offsetof(struct mds_body, eadatasize));
LASSERTF((int)sizeof(((struct mds_body *)0)->eadatasize) == 4, " found %lld\n",
(long long)(int)sizeof(((struct mds_body *)0)->eadatasize));
- LASSERTF((int)offsetof(struct mds_body, padding_1) == 152, " found %lld\n",
- (long long)(int)offsetof(struct mds_body, padding_1));
- LASSERTF((int)sizeof(((struct mds_body *)0)->padding_1) == 4, " found %lld\n",
- (long long)(int)sizeof(((struct mds_body *)0)->padding_1));
- LASSERTF((int)offsetof(struct mds_body, padding_2) == 156, " found %lld\n",
- (long long)(int)offsetof(struct mds_body, padding_2));
- LASSERTF((int)sizeof(((struct mds_body *)0)->padding_2) == 4, " found %lld\n",
- (long long)(int)sizeof(((struct mds_body *)0)->padding_2));
- LASSERTF((int)offsetof(struct mds_body, padding_3) == 160, " found %lld\n",
- (long long)(int)offsetof(struct mds_body, padding_3));
- LASSERTF((int)sizeof(((struct mds_body *)0)->padding_3) == 4, " found %lld\n",
- (long long)(int)sizeof(((struct mds_body *)0)->padding_3));
+ LASSERTF((int)offsetof(struct mds_body, aclsize) == 152, " found %lld\n",
+ (long long)(int)offsetof(struct mds_body, aclsize));
+ LASSERTF((int)sizeof(((struct mds_body *)0)->aclsize) == 4, " found %lld\n",
+ (long long)(int)sizeof(((struct mds_body *)0)->aclsize));
+ LASSERTF((int)offsetof(struct mds_body, max_mdsize) == 156, " found %lld\n",
+ (long long)(int)offsetof(struct mds_body, max_mdsize));
+ LASSERTF((int)sizeof(((struct mds_body *)0)->max_mdsize) == 4, " found %lld\n",
+ (long long)(int)sizeof(((struct mds_body *)0)->max_mdsize));
+ LASSERTF((int)offsetof(struct mds_body, max_cookiesize) == 160, " found %lld\n",
+ (long long)(int)offsetof(struct mds_body, max_cookiesize));
+ LASSERTF((int)sizeof(((struct mds_body *)0)->max_cookiesize) == 4, " found %lld\n",
+ (long long)(int)sizeof(((struct mds_body *)0)->max_cookiesize));
LASSERTF((int)offsetof(struct mds_body, padding_4) == 164, " found %lld\n",
(long long)(int)offsetof(struct mds_body, padding_4));
LASSERTF((int)sizeof(((struct mds_body *)0)->padding_4) == 4, " found %lld\n",
(long long)MDS_OPEN_DIRECTORY);
LASSERTF(MDS_OPEN_DELAY_CREATE == 16777216, " found %lld\n",
(long long)MDS_OPEN_DELAY_CREATE);
- LASSERTF(MDS_OPEN_HAS_EA == 1073741824, " found %lld\n",
- (long long)MDS_OPEN_HAS_EA);
+ CLASSERT(MDS_OPEN_OWNEROVERRIDE == 0200000000);
+ CLASSERT(MDS_OPEN_JOIN_FILE == 0400000000);
+ CLASSERT(MDS_OPEN_HAS_EA == 010000000000);
+ CLASSERT(MDS_OPEN_HAS_OBJS == 020000000000);
/* Checks for struct mds_rec_setattr */
LASSERTF((int)sizeof(struct mds_rec_setattr) == 96, " found %lld\n",
desc->bd_iov_count++;
}
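+/*
+ * Overwrite every page of a bulk descriptor with a 0xab poison pattern.
+ * Called before starting a bulk transfer to an evicted client so that no
+ * meaningful file data reaches a peer that has lost its locks.
+ */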
+void ptl_rpc_wipe_bulk_pages(struct ptlrpc_bulk_desc *desc)
+{
+ int i;
+
+ for (i = 0; i < desc->bd_iov_count ; i++) {
+ lnet_kiov_t *kiov = &desc->bd_iov[i];
+ memset(kmap(kiov->kiov_page)+kiov->kiov_offset, 0xab,
+ kiov->kiov_len);
+ kunmap(kiov->kiov_page);
+ }
+}
+
#else /* !__KERNEL__ */
void ptlrpc_fill_bulk_md(lnet_md_t *md, struct ptlrpc_bulk_desc *desc)
}
}
+void ptl_rpc_wipe_bulk_pages(struct ptlrpc_bulk_desc *desc)
+{
+ int i;
+
+ for(i = 0; i < desc->bd_iov_count; i++) {
+ lnet_md_iovec_t *iov = &desc->bd_iov[i];
+
+ memset(iov->iov_base, 0xab, iov->iov_len);
+ }
+}
#endif /* !__KERNEL__ */
int rc = 0;
ENTRY;
- req = ptlrpc_prep_req(imp, OBD_PING, 0, NULL,
- NULL);
+ req = ptlrpc_prep_req(imp, LUSTRE_OBD_VERSION, OBD_PING, 0, NULL, NULL);
if (req) {
DEBUG_REQ(D_INFO, req, "pinging %s->%s",
imp->imp_obd->obd_uuid.uuid,
RETURN(rc);
}
-static inline void ptlrpc_update_next_ping(struct obd_import *imp)
+static void ptlrpc_update_next_ping(struct obd_import *imp)
{
imp->imp_next_ping = jiffies + HZ *
(imp->imp_state == LUSTRE_IMP_DISCON ? 10 : PING_INTERVAL);
/* And now, loop forever, pinging as needed. */
while (1) {
unsigned long this_ping = jiffies;
- long time_to_next_ping;
- struct l_wait_info lwi = LWI_TIMEOUT(PING_INTERVAL * HZ,
- NULL, NULL);
+ long time_to_next_ping = 0;
+ struct l_wait_info lwi;
struct list_head *iter;
down(&pinger_sem);
int force, level;
unsigned long flags;
-
spin_lock_irqsave(&imp->imp_lock, flags);
level = imp->imp_state;
force = imp->imp_force_verify;
imp->imp_force_verify = 0;
spin_unlock_irqrestore(&imp->imp_lock, flags);
+
CDEBUG(level == LUSTRE_IMP_FULL ? D_INFO : D_HA,
"level %s/%u force %u deactive %u pingable %u\n",
ptlrpc_import_state_name(level), level,
/* Wait until the next ping time, or until we're stopped. */
time_to_next_ping = this_ping + (PING_INTERVAL * HZ) - jiffies;
+
/* The ping sent by ptlrpc_send_rpc may get sent out
say .01 second after this.
- ptlrpc_pinger_sending_on_import will then set the
+ ptlrpc_pinger_sending_on_import will then set the
next ping time to next_ping + .01 sec, which means
we will SKIP the next ping at next_ping, and the
ping will get sent 2 timeouts from now! Beware. */
if (rc < 0) {
CERROR("cannot start thread: %d\n", rc);
OBD_FREE(pinger_thread, sizeof(*pinger_thread));
+ pinger_thread = NULL;
RETURN(rc);
}
l_wait_event(pinger_thread->t_ctl_waitq,
pinger_thread->t_flags & SVC_RUNNING, &lwi);
- RETURN(rc);
+ RETURN(0);
}
int ptlrpc_stop_pinger(void)
continue;
}
- req = ptlrpc_prep_req(imp, OBD_PING, 0, NULL,
- NULL);
+ req = ptlrpc_prep_req(imp, LUSTRE_OBD_VERSION, OBD_PING,
+ 0, NULL, NULL);
if (!req) {
CERROR("out of memory\n");
break;
DEBUG_REQ(D_HA, req, "pinging %s->%s",
req->rq_import->imp_obd->obd_uuid.uuid,
req->rq_import->imp_target_uuid.uuid);
- (void)ptl_send_rpc(req);
+ (void)ptl_send_rpc(req, 0);
}
do_check_set:
void ptlrpc_fill_bulk_md(lnet_md_t *md, struct ptlrpc_bulk_desc *desc);
void ptlrpc_add_bulk_page(struct ptlrpc_bulk_desc *desc, struct page *page,
int pageoffset, int len);
+void ptl_rpc_wipe_bulk_pages(struct ptlrpc_bulk_desc *desc);
/* pinger.c */
int ptlrpc_start_pinger(void);
__init int ptlrpc_init(void)
{
- int rc;
+ int rc, cleanup_phase = 0;
ENTRY;
lustre_assert_wire_constants();
rc = ptlrpc_init_portals();
if (rc)
RETURN(rc);
+ cleanup_phase = 1;
ptlrpc_init_connection();
- llog_init_commit_master();
+ rc = llog_init_commit_master();
+ if (rc)
+ GOTO(cleanup, rc);
+ cleanup_phase = 2;
ptlrpc_put_connection_superhack = ptlrpc_put_connection;
ptlrpc_abort_inflight_superhack = ptlrpc_abort_inflight;
- ptlrpc_start_pinger();
- ldlm_init();
+ rc = ptlrpc_start_pinger();
+ if (rc)
+ GOTO(cleanup, rc);
+ cleanup_phase = 3;
+
+ rc = ldlm_init();
+ if (rc)
+ GOTO(cleanup, rc);
RETURN(0);
+
+cleanup:
+ switch(cleanup_phase) {
+ case 3:
+ ptlrpc_stop_pinger();
+ case 2:
+ llog_cleanup_commit_master(1);
+ ptlrpc_cleanup_connection();
+ case 1:
+ ptlrpc_exit_portals();
+ default: ;
+ }
+
+ return rc;
}
#ifdef __KERNEL__
EXPORT_SYMBOL(ptlrpc_error);
EXPORT_SYMBOL(ptlrpc_resend_req);
EXPORT_SYMBOL(ptl_send_rpc);
-EXPORT_SYMBOL(ptl_send_rpc_nowait);
/* client.c */
EXPORT_SYMBOL(ptlrpc_init_client);
/* pack_generic.c */
EXPORT_SYMBOL(lustre_msg_swabbed);
+EXPORT_SYMBOL(lustre_msg_check_version);
EXPORT_SYMBOL(lustre_pack_request);
EXPORT_SYMBOL(lustre_pack_reply);
+EXPORT_SYMBOL(lustre_shrink_reply);
EXPORT_SYMBOL(lustre_free_reply_state);
EXPORT_SYMBOL(lustre_msg_size);
EXPORT_SYMBOL(lustre_unpack_msg);
EXPORT_SYMBOL(lustre_swab_obd_quotactl);
EXPORT_SYMBOL(lustre_swab_mds_rec_setattr);
EXPORT_SYMBOL(lustre_swab_mds_rec_create);
+EXPORT_SYMBOL(lustre_swab_mds_rec_join);
EXPORT_SYMBOL(lustre_swab_mds_rec_link);
EXPORT_SYMBOL(lustre_swab_mds_rec_unlink);
EXPORT_SYMBOL(lustre_swab_mds_rec_rename);
EXPORT_SYMBOL(lustre_swab_lov_desc);
EXPORT_SYMBOL(lustre_swab_lov_user_md);
EXPORT_SYMBOL(lustre_swab_lov_user_md_objects);
+EXPORT_SYMBOL(lustre_swab_lov_user_md_join);
EXPORT_SYMBOL(lustre_swab_ldlm_res_id);
EXPORT_SYMBOL(lustre_swab_ldlm_policy_data);
EXPORT_SYMBOL(lustre_swab_ldlm_intent);
/* llogd.c */
EXPORT_SYMBOL(llog_origin_handle_create);
+EXPORT_SYMBOL(llog_origin_handle_destroy);
EXPORT_SYMBOL(llog_origin_handle_next_block);
+EXPORT_SYMBOL(llog_origin_handle_prev_block);
EXPORT_SYMBOL(llog_origin_handle_read_header);
EXPORT_SYMBOL(llog_origin_handle_close);
EXPORT_SYMBOL(llog_client_ops);
wake_up(&pc->pc_waitq);
}
+/* requests that are added to the ptlrpcd queue are sent via
+ * ptlrpcd_check->ptlrpc_check_set() */
void ptlrpcd_add_req(struct ptlrpc_request *req)
{
struct ptlrpcd_ctl *pc;
/* single threaded!! */
pc->pc_recurred++;
- if (pc->pc_recurred == 1)
+ if (pc->pc_recurred == 1) {
rc = ptlrpcd_check(pc);
+ if (!rc)
+ ptlrpc_expired_set(pc->pc_set);
+ }
pc->pc_recurred--;
return rc;
static int ptlrpcd_start(char *name, struct ptlrpcd_ctl *pc)
{
- int rc = 0;
+ int rc;
memset(pc, 0, sizeof(*pc));
init_completion(&pc->pc_starting);
pc->pc_set = ptlrpc_prep_set();
if (pc->pc_set == NULL)
- GOTO(out, rc = -ENOMEM);
+ RETURN(-ENOMEM);
#ifdef __KERNEL__
- if (kernel_thread(ptlrpcd, pc, 0) < 0) {
+ rc = kernel_thread(ptlrpcd, pc, 0);
+ if (rc < 0) {
ptlrpc_set_destroy(pc->pc_set);
- GOTO(out, rc = -ECHILD);
+ RETURN(rc);
}
wait_for_completion(&pc->pc_starting);
#else
pc->pc_callback =
liblustre_register_wait_callback(&ptlrpcd_check_async_rpcs, pc);
+ (void)rc;
#endif
-out:
- RETURN(rc);
+ RETURN(0);
}
static void ptlrpcd_stop(struct ptlrpcd_ctl *pc)
continue;
}
- request = ptlrpc_prep_req(import, OBD_LOG_CANCEL, 1,
+ request = ptlrpc_prep_req(import, LUSTRE_LOG_VERSION,
+ OBD_LOG_CANCEL, 1,
&llcd->llcd_cookiebytes,
bufs);
if (request == NULL) {
imp->imp_obd->obd_name);
ptlrpc_deactivate_import(imp);
}
- ptlrpc_connect_import(imp, NULL);
+ /* to control recovery via lctl {disable|enable}_recovery */
+ if (imp->imp_deactive == 0)
+ ptlrpc_connect_import(imp, NULL);
}
/* Wait for recovery to complete and resend. If evicted, then
/* Now allocate pool of reply buffers */
/* Increase max reply size to next power of two */
service->srv_max_reply_size = 1;
- while(service->srv_max_reply_size < max_reply_size)
+ while (service->srv_max_reply_size < max_reply_size)
service->srv_max_reply_size <<= 1;
if (proc_entry != NULL)
timediff = timeval_sub(&work_end, &work_start);
if (timediff / 1000000 > (long)obd_timeout)
- CERROR("request "LPU64" opc %u from %s processed in %lds\n",
+ CERROR("request "LPU64" opc %u from %s processed in %lds "
+ "trans "LPU64" rc %d/%d\n",
request->rq_xid, request->rq_reqmsg->opc,
libcfs_id2str(request->rq_peer),
timeval_sub(&work_end,
- &request->rq_arrival_time) / 1000000);
+ &request->rq_arrival_time) / 1000000,
+ request->rq_repmsg ? request->rq_repmsg->transno :
+ request->rq_transno, request->rq_status,
+ request->rq_repmsg ? request->rq_repmsg->status : -999);
else
- CDEBUG(D_HA,"request "LPU64" opc %u from %s processed in %ldus"
- " (%ldus total)\n", request->rq_xid,
- request->rq_reqmsg->opc,
+ CDEBUG(D_HA, "request "LPU64" opc %u from %s processed in "
+ "%ldus (%ldus total) trans "LPU64" rc %d/%d\n",
+ request->rq_xid, request->rq_reqmsg->opc,
libcfs_id2str(request->rq_peer), timediff,
- timeval_sub(&work_end, &request->rq_arrival_time));
+ timeval_sub(&work_end, &request->rq_arrival_time),
+ request->rq_transno, request->rq_status,
+ request->rq_repmsg ? request->rq_repmsg->status : -999);
if (svc->srv_stats != NULL) {
int opc = opcode_offset(request->rq_reqmsg->opc);
list_del_init (&rs->rs_list);
- /* Disengage from notifiers carefully (lock ordering!) */
+ /* Disengage from notifiers carefully (lock order - irqrestore below!)*/
spin_unlock(&svc->srv_lock);
spin_lock (&obd->obd_uncommitted_replies_lock);
svc->srv_done(thread);
out:
- spin_lock_irqsave(&svc->srv_lock, flags);
+ CDEBUG(D_NET, "service thread %d exiting: rc %d\n", thread->t_id, rc);
+ spin_lock_irqsave(&svc->srv_lock, flags);
svc->srv_nthreads--; /* must know immediately */
+ thread->t_id = rc;
thread->t_flags = SVC_STOPPED;
- wake_up(&thread->t_ctl_waitq);
+ wake_up(&thread->t_ctl_waitq);
spin_unlock_irqrestore(&svc->srv_lock, flags);
- CDEBUG(D_NET, "service thread %d exiting: rc %d\n", thread->t_id, rc);
- thread->t_id = rc;
-
return rc;
}
spin_unlock_irqrestore(&svc->srv_lock, flags);
}
-/* @base_name should be 12 characters or less - 3 will be added on */
+/* @base_name should be 11 characters or less - 3 will be added on */
int ptlrpc_start_threads(struct obd_device *dev, struct ptlrpc_service *svc,
char *base_name)
{
int i, rc = 0;
ENTRY;
+ LASSERT(svc->srv_num_threads > 0);
for (i = 0; i < svc->srv_num_threads; i++) {
char name[32];
sprintf(name, "%s_%02d", base_name, i);
--- /dev/null
+.Xrefs
+config.log
+config.status
+configure
+Makefile
+.deps
+TAGS
+.*.cmd
+autoMakefile.in
+autoMakefile
+*.ko
+*.mod.c
+.*.o.flags
+.tmp_versions
+.depend
--- /dev/null
+MODULES := lquota
+MODULES += quotactl_test quotacheck_test
+
+lquota-objs := quota_check.o quota_context.o quota_ctl.o quota_interface.o
+lquota-objs += quota_master.o
+quotactl-objs := quotactl_test.o
+quotacheck-objs := quotacheck_test.o
+
+@INCLUDE_RULES@
+
--- /dev/null
+# Copyright (C) 2005 Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+if LIBLUSTRE
+noinst_LIBRARIES = libquota.a
+libquota_a_SOURCES = quota_check.c quota_ctl.c quota_interface.c
+libquota_a_CPPFLAGS = $(LLCPPFLAGS)
+libquota_a_CFLAGS = $(LLCFLAGS)
+endif
+
+if MODULES
+modulefs_DATA = lquota$(KMODEXT)
+endif
+
+MOSTLYCLEANFILES := @MOSTLYCLEANFILES@
+DIST_SOURCES := $(lquota-objs:%.o=%.c) quota_internal.h
+DIST_SOURCES += quotactl_test.c quotacheck_test.c
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lustre/quota/quota_check.c
+ *
+ * Copyright (c) 2005 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * No redistribution or use is permitted outside of Cluster File Systems, Inc.
+ *
+ */
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_MDS
+
+#ifdef __KERNEL__
+# include <linux/version.h>
+# include <linux/module.h>
+# include <linux/init.h>
+# include <linux/fs.h>
+# include <linux/jbd.h>
+# include <linux/ext3_fs.h>
+# if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+# include <linux/smp_lock.h>
+# include <linux/buffer_head.h>
+# include <linux/workqueue.h>
+# include <linux/mount.h>
+# else
+# include <linux/locks.h>
+# endif
+#else /* __KERNEL__ */
+# include <liblustre.h>
+#endif
+
+#include <linux/obd_class.h>
+#include <linux/lustre_mds.h>
+#include <linux/lustre_dlm.h>
+#include <linux/lustre_cfg.h>
+#include <linux/obd_ost.h>
+#include <linux/lustre_fsfilt.h>
+#include <linux/lustre_quota.h>
+#include "quota_internal.h"
+
+#ifdef __KERNEL__
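+/*
+ * Report quotacheck completion back to the requesting client: send an
+ * OBD_QC_CALLBACK request over the reverse import carrying the final
+ * obd_quotactl and wait for the (empty) reply.
+ */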
+static int target_quotacheck_callback(struct obd_export *exp,
+ struct obd_quotactl *oqctl)
+{
+ struct ptlrpc_request *req;
+ struct obd_quotactl *body;
+ int rc, size = sizeof(*oqctl);
+ ENTRY;
+
+ req = ptlrpc_prep_req(exp->exp_imp_reverse, LUSTRE_OBD_VERSION,
+ OBD_QC_CALLBACK, 1, &size, NULL);
+ if (!req)
+ RETURN(-ENOMEM);
+
+ body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof(*body));
+ *body = *oqctl;
+
+ req->rq_replen = lustre_msg_size(0, NULL);
+
+ rc = ptlrpc_queue_wait(req);
+ ptlrpc_req_finished(req);
+
+ RETURN(rc);
+}
+
+static int target_quotacheck_thread(void *data)
+{
+ unsigned long flags;
+ struct quotacheck_thread_args *qta = data;
+ struct obd_export *exp;
+ struct obd_device *obd;
+ struct obd_quotactl *oqctl;
+ struct lvfs_run_ctxt saved;
+ int rc;
+
+ lock_kernel();
+ ptlrpc_daemonize();
+
+ SIGNAL_MASK_LOCK(current, flags);
+ sigfillset(¤t->blocked);
+ RECALC_SIGPENDING;
+ SIGNAL_MASK_UNLOCK(current, flags);
+
+ THREAD_NAME(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX, "%s",
+ "quotacheck");
+ unlock_kernel();
+
+ exp = qta->qta_exp;
+ obd = exp->exp_obd;
+ oqctl = &qta->qta_oqctl;
+
+ push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+
+ rc = fsfilt_quotacheck(obd, qta->qta_sb, oqctl);
+ if (rc)
+ CERROR("%s: fsfilt_quotacheck: %d\n", obd->obd_name, rc);
+
+ pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+
+ rc = target_quotacheck_callback(exp, oqctl);
+
+ atomic_inc(qta->qta_sem);
+
+ OBD_FREE_PTR(qta);
+ return rc;
+}
+
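+/*
+ * Start a quotacheck on this target. obt_quotachecking acts as a binary
+ * semaphore: atomic_dec_and_test() below admits a single checker, and the
+ * quotacheck thread re-increments the counter when the scan completes. On
+ * the quota master (MDS) the admin quota files are initialized first.
+ */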
+int target_quota_check(struct obd_export *exp, struct obd_quotactl *oqctl)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct obd_device_target *obt = &obd->u.obt;
+ struct quotacheck_thread_args *qta;
+ int rc = 0;
+ ENTRY;
+
+ if (!atomic_dec_and_test(&obt->obt_quotachecking)) {
+ CDEBUG(D_INFO, "other people are doing quotacheck\n");
+ GOTO(out, rc = -EBUSY);
+ }
+
+ OBD_ALLOC_PTR(qta);
+ if (!qta)
+ GOTO(out, rc = -ENOMEM);
+
+ qta->qta_exp = exp;
+ qta->qta_oqctl = *oqctl;
+ qta->qta_sb = obt->obt_sb;
+ qta->qta_sem = &obt->obt_quotachecking;
+
+ if (!strcmp(obd->obd_type->typ_name, LUSTRE_MDS_NAME)) {
+ /* quota master */
+ rc = init_admin_quotafiles(obd, &qta->qta_oqctl);
+ if (rc) {
+ CERROR("init_admin_quotafiles failed: %d\n", rc);
+ OBD_FREE_PTR(qta);
+ GOTO(out, rc);
+ }
+ }
+
+ rc = kernel_thread(target_quotacheck_thread, qta, CLONE_VM|CLONE_FILES);
+ if (rc >= 0) {
+ CDEBUG(D_INFO, "%s: target_quotacheck_thread: %d\n",
+ obd->obd_name, rc);
+ RETURN(0);
+ }
+
+ CERROR("%s: error starting quotacheck_thread: %d\n",
+ obd->obd_name, rc);
+ OBD_FREE_PTR(qta);
+out:
+ atomic_inc(&obt->obt_quotachecking);
+ RETURN(rc);
+}
+
+#endif /* __KERNEL__ */
+
+int client_quota_check(struct obd_export *exp, struct obd_quotactl *oqctl)
+{
+ struct client_obd *cli = &exp->exp_obd->u.cli;
+ struct ptlrpc_request *req;
+ struct obd_quotactl *body;
+ int size = sizeof(*body), opc, version;
+ int rc;
+ ENTRY;
+
+ if (!strcmp(exp->exp_obd->obd_type->typ_name, LUSTRE_MDC_NAME)) {
+ version = LUSTRE_MDS_VERSION;
+ opc = MDS_QUOTACHECK;
+ } else if (!strcmp(exp->exp_obd->obd_type->typ_name, LUSTRE_OSC_NAME)) {
+ version = LUSTRE_OST_VERSION;
+ opc = OST_QUOTACHECK;
+ } else {
+ RETURN(-EINVAL);
+ }
+
+ req = ptlrpc_prep_req(class_exp2cliimp(exp), version, opc, 1, &size,
+ NULL);
+ if (!req)
+ GOTO(out, rc = -ENOMEM);
+
+ body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof(*body));
+ *body = *oqctl;
+
+ req->rq_replen = lustre_msg_size(0, NULL);
+
+ /* the next poll will find -ENODATA, which means the quotacheck is
+ * still in progress */
+ cli->cl_qchk_stat = -ENODATA;
+ rc = ptlrpc_queue_wait(req);
+ if (rc)
+ cli->cl_qchk_stat = rc;
+out:
+ ptlrpc_req_finished(req);
+ RETURN(rc);
+}
+
+int client_quota_poll_check(struct obd_export *exp, struct if_quotacheck *qchk)
+{
+ struct client_obd *cli = &exp->exp_obd->u.cli;
+ int rc;
+ ENTRY;
+
+ rc = cli->cl_qchk_stat;
+
+ /* the poller is not the client that started the quotacheck */
+ if (rc == CL_NOT_QUOTACHECKED)
+ rc = -EINTR;
+
+ qchk->obd_uuid = cli->cl_import->imp_target_uuid;
+ if (!strncmp(exp->exp_obd->obd_type->typ_name, LUSTRE_OSC_NAME,
+ strlen(LUSTRE_OSC_NAME)))
+ memcpy(qchk->obd_type, LUSTRE_FILTER_NAME,
+ strlen(LUSTRE_FILTER_NAME));
+ else if (!strncmp(exp->exp_obd->obd_type->typ_name, LUSTRE_MDC_NAME,
+ strlen(LUSTRE_MDC_NAME)))
+ memcpy(qchk->obd_type, LUSTRE_MDS_NAME,
+ strlen(LUSTRE_MDS_NAME));
+
+ RETURN(rc);
+}
+
+int lov_quota_check(struct obd_export *exp, struct obd_quotactl *oqctl)
+{
+ struct obd_device *obd = class_exp2obd(exp);
+ struct lov_obd *lov = &obd->u.lov;
+ int i, rc = 0;
+ ENTRY;
+
+ for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+ int err;
+
+ if (!lov->tgts[i].active) {
+ CERROR("lov idx %d inactive\n", i);
+ RETURN(-EIO);
+ }
+
+ err = obd_quotacheck(lov->tgts[i].ltd_exp, oqctl);
+ if (err && lov->tgts[i].active && !rc)
+ rc = err;
+ }
+
+ RETURN(rc);
+}
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lustre/quota/quota_context.c
+ * Lustre Quota Context
+ *
+ * Copyright (c) 2001-2005 Cluster File Systems, Inc.
+ * Author: Niu YaWei <niu@clusterfs.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * No redistribution or use is permitted outside of Cluster File Systems, Inc.
+ *
+ */
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+
+#define DEBUG_SUBSYSTEM S_MDS
+
+#include <linux/version.h>
+#include <linux/fs.h>
+#include <asm/unistd.h>
+#include <linux/slab.h>
+#include <linux/quotaops.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <linux/obd_class.h>
+#include <linux/lustre_quota.h>
+#include <linux/lustre_fsfilt.h>
+#include "quota_internal.h"
+
+unsigned long default_bunit_sz = 100 * 1024 * 1024; /* 100M bytes */
+unsigned long default_btune_ratio = 50; /* 50 percent */
+unsigned long default_iunit_sz = 5000; /* 5000 inodes */
+unsigned long default_itune_ratio = 50; /* 50 percent */
+
+kmem_cache_t *qunit_cachep = NULL;
+struct list_head qunit_hash[NR_DQHASH];
+spinlock_t qunit_hash_lock = SPIN_LOCK_UNLOCKED;
+
+struct lustre_qunit {
+ struct list_head lq_hash; /* Hash list in memory */
+ atomic_t lq_refcnt; /* Use count */
+ struct lustre_quota_ctxt *lq_ctxt; /* Quota context this applies to */
+ struct qunit_data lq_data; /* See qunit_data */
+ unsigned int lq_opc; /* QUOTA_DQACQ, QUOTA_DQREL */
+ struct list_head lq_waiters; /* All write threads waiting for this qunit */
+};
+
+void qunit_cache_cleanup(void)
+{
+ int i;
+ ENTRY;
+
+ spin_lock(&qunit_hash_lock);
+ for (i = 0; i < NR_DQHASH; i++)
+ LASSERT(list_empty(qunit_hash + i));
+ spin_unlock(&qunit_hash_lock);
+
+ if (qunit_cachep) {
+ int rc;
+ rc = kmem_cache_destroy(qunit_cachep);
+ LASSERT(rc == 0);
+ qunit_cachep = NULL;
+ }
+ EXIT;
+}
+
+int qunit_cache_init(void)
+{
+ int i;
+ ENTRY;
+
+ LASSERT(qunit_cachep == NULL);
+ qunit_cachep = kmem_cache_create("ll_qunit_cache",
+ sizeof(struct lustre_qunit),
+ 0, 0, NULL, NULL);
+ if (!qunit_cachep)
+ RETURN(-ENOMEM);
+
+ spin_lock(&qunit_hash_lock);
+ for (i = 0; i < NR_DQHASH; i++)
+ INIT_LIST_HEAD(qunit_hash + i);
+ spin_unlock(&qunit_hash_lock);
+ RETURN(0);
+}
+
+static inline int
+qunit_hashfn(struct lustre_quota_ctxt *qctxt, struct qunit_data *qdata)
+{
+ unsigned int id = qdata->qd_id;
+ unsigned int type = qdata->qd_type;
+
+ unsigned long tmp = ((unsigned long)qctxt >> L1_CACHE_SHIFT) ^ id;
+ tmp = (tmp * (MAXQUOTAS - type)) % NR_DQHASH;
+ return tmp;
+}
+
+/* caller must hold qunit_hash_lock */
+static inline struct lustre_qunit *find_qunit(unsigned int hashent,
+ struct lustre_quota_ctxt *qctxt,
+ struct qunit_data *qdata)
+{
+ struct lustre_qunit *qunit = NULL;
+ struct qunit_data *tmp;
+
+ LASSERT_SPIN_LOCKED(&qunit_hash_lock);
+ list_for_each_entry(qunit, qunit_hash + hashent, lq_hash) {
+ tmp = &qunit->lq_data;
+ if (qunit->lq_ctxt == qctxt &&
+ qdata->qd_id == tmp->qd_id && qdata->qd_type == tmp->qd_type
+ && qdata->qd_isblk == tmp->qd_isblk)
+ return qunit;
+ }
+ return NULL;
+}
+
+/* check_cur_qunit - check the current usage of a qunit.
+ * @qctxt: quota context
+ * @qdata: the quota unit to be checked; qd_count is computed here
+ *
+ * return: 1 - need to acquire a qunit;
+ * 2 - need to release a qunit;
+ * 0 - nothing to do;
+ * < 0 - error.
+ */
+static int
+check_cur_qunit(struct obd_device *obd,
+ struct lustre_quota_ctxt *qctxt, struct qunit_data *qdata)
+{
+ struct super_block *sb = qctxt->lqc_sb;
+ unsigned long qunit_sz, tune_sz;
+ __u64 usage, limit;
+ struct obd_quotactl *qctl;
+ int ret = 0;
+ ENTRY;
+
+ if (!sb_any_quota_enabled(sb))
+ RETURN(0);
+
+ /* ignore root user */
+ if (qdata->qd_id == 0 && qdata->qd_type == USRQUOTA)
+ RETURN(0);
+
+ OBD_ALLOC_PTR(qctl);
+ if (qctl == NULL)
+ RETURN(-ENOMEM);
+
+ /* get fs quota usage & limit */
+ qctl->qc_cmd = Q_GETQUOTA;
+ qctl->qc_id = qdata->qd_id;
+ qctl->qc_type = qdata->qd_type;
+ ret = fsfilt_quotactl(obd, sb, qctl);
+ if (ret) {
+ if (ret == -ESRCH) /* no limit */
+ ret = 0;
+ else
+ CERROR("can't get fs quota usage! (rc:%d)\n", ret);
+ GOTO(out, ret);
+ }
+
+ if (qdata->qd_isblk) {
+ usage = qctl->qc_dqblk.dqb_curspace;
+ limit = qctl->qc_dqblk.dqb_bhardlimit << QUOTABLOCK_BITS;
+ qunit_sz = qctxt->lqc_bunit_sz;
+ tune_sz = qctxt->lqc_btune_sz;
+
+ LASSERT(!(qunit_sz % QUOTABLOCK_SIZE));
+ } else {
+ usage = qctl->qc_dqblk.dqb_curinodes;
+ limit = qctl->qc_dqblk.dqb_ihardlimit;
+ qunit_sz = qctxt->lqc_iunit_sz;
+ tune_sz = qctxt->lqc_itune_sz;
+ }
+
+ /* ignore the no quota limit case */
+ if (!limit)
+ GOTO(out, ret = 0);
+
+ /* we don't count the MIN_QLIMIT */
+ if ((limit == MIN_QLIMIT && !qdata->qd_isblk) ||
+ (toqb(limit) == MIN_QLIMIT && qdata->qd_isblk))
+ limit = 0;
+
+ LASSERT(qdata->qd_count == 0);
+ if (limit <= usage + tune_sz) {
+ while (qdata->qd_count + limit <= usage + tune_sz)
+ qdata->qd_count += qunit_sz;
+ ret = 1;
+ } else if (limit > usage + qunit_sz + tune_sz) {
+ while (limit - qdata->qd_count > usage + qunit_sz + tune_sz)
+ qdata->qd_count += qunit_sz;
+ ret = 2;
+ }
+ LASSERT(ret == 0 || qdata->qd_count);
+ EXIT;
+out:
+ OBD_FREE_PTR(qctl);
+ return ret;
+}
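+
+/* Worked example for the window logic above (illustrative numbers only,
+ * using the defaults bunit_sz = 100MB and btune_sz = 50MB): with a block
+ * hardlimit of 400MB and usage of 360MB, limit <= usage + tune_sz holds,
+ * so qd_count grows by one 100MB qunit (100MB + 400MB > 360MB + 50MB ends
+ * the loop) and ret = 1 asks to acquire it. With usage of 100MB instead,
+ * limit > usage + qunit_sz + tune_sz holds, qd_count accumulates 200MB of
+ * surplus, and ret = 2 asks to release it. */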
+
+/* caller must hold qunit_hash_lock */
+static struct lustre_qunit *dqacq_in_flight(struct lustre_quota_ctxt *qctxt,
+ struct qunit_data *qdata)
+{
+ unsigned int hashent = qunit_hashfn(qctxt, qdata);
+ struct lustre_qunit *qunit;
+ ENTRY;
+
+ LASSERT_SPIN_LOCKED(&qunit_hash_lock);
+ qunit = find_qunit(hashent, qctxt, qdata);
+ RETURN(qunit);
+}
+
+static struct lustre_qunit *alloc_qunit(struct lustre_quota_ctxt *qctxt,
+ struct qunit_data *qdata, int opc)
+{
+ struct lustre_qunit *qunit = NULL;
+ ENTRY;
+
+ OBD_SLAB_ALLOC(qunit, qunit_cachep, SLAB_NOFS, sizeof(*qunit));
+ if (qunit == NULL)
+ RETURN(NULL);
+
+ INIT_LIST_HEAD(&qunit->lq_hash);
+ INIT_LIST_HEAD(&qunit->lq_waiters);
+ atomic_set(&qunit->lq_refcnt, 1);
+ qunit->lq_ctxt = qctxt;
+ memcpy(&qunit->lq_data, qdata, sizeof(*qdata));
+ qunit->lq_opc = opc;
+
+ RETURN(qunit);
+}
+
+static inline void free_qunit(struct lustre_qunit *qunit)
+{
+ OBD_SLAB_FREE(qunit, qunit_cachep, sizeof(*qunit));
+}
+
+static inline void qunit_get(struct lustre_qunit *qunit)
+{
+ atomic_inc(&qunit->lq_refcnt);
+}
+
+static void qunit_put(struct lustre_qunit *qunit)
+{
+ LASSERT(atomic_read(&qunit->lq_refcnt));
+ if (atomic_dec_and_test(&qunit->lq_refcnt))
+ free_qunit(qunit);
+}
+
+static void
+insert_qunit_nolock(struct lustre_quota_ctxt *qctxt, struct lustre_qunit *qunit)
+{
+ struct list_head *head;
+
+ LASSERT(list_empty(&qunit->lq_hash));
+ head = qunit_hash + qunit_hashfn(qctxt, &qunit->lq_data);
+ list_add(&qunit->lq_hash, head);
+}
+
+static void remove_qunit_nolock(struct lustre_qunit *qunit)
+{
+ LASSERT(!list_empty(&qunit->lq_hash));
+ list_del_init(&qunit->lq_hash);
+}
+
+struct qunit_waiter {
+ struct list_head qw_entry;
+ wait_queue_head_t qw_waitq;
+ int qw_rc;
+};
+
+#define QDATA_DEBUG(qd, fmt, arg...) \
+ CDEBUG(D_QUOTA, "id(%u) type(%u) count(%u) isblk(%u):" \
+ fmt, qd->qd_id, qd->qd_type, qd->qd_count, qd->qd_isblk, \
+ ## arg);
+
+#define INC_QLIMIT(limit, count) ((limit) == MIN_QLIMIT ? \
+ ((limit) = (count)) : ((limit) += (count)))
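+/* e.g. INC_QLIMIT(*hardlimit, count): a slave whose stored hard limit is
+ * still the MIN_QLIMIT placeholder (as set up by Q_INITQUOTA) takes the
+ * acquired count as its new limit instead of adding to it */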
+
+
+/* FIXME check if this mds is the master of specified id */
+static int
+is_master(struct obd_device *obd, struct lustre_quota_ctxt *qctxt,
+ unsigned int id, int type)
+{
+ return qctxt->lqc_handler ? 1 : 0;
+}
+
+static int
+schedule_dqacq(struct obd_device *obd, struct lustre_quota_ctxt *qctxt,
+ struct qunit_data *qdata, int opc, int wait);
+
+static int
+dqacq_completion(struct obd_device *obd,
+ struct lustre_quota_ctxt *qctxt,
+ struct qunit_data *qdata, int rc, int opc)
+{
+ struct lustre_qunit *qunit = NULL;
+ struct super_block *sb = qctxt->lqc_sb;
+ unsigned long qunit_sz;
+ struct qunit_waiter *qw, *tmp;
+ int err = 0;
+ ENTRY;
+
+ LASSERT(qdata);
+ qunit_sz = qdata->qd_isblk ? qctxt->lqc_bunit_sz : qctxt->lqc_iunit_sz;
+ LASSERT(!(qdata->qd_count % qunit_sz));
+
+ /* update local operational quota file */
+ if (rc == 0) {
+ __u32 count = QUSG(qdata->qd_count, qdata->qd_isblk);
+ struct obd_quotactl *qctl;
+ __u64 *hardlimit;
+
+ OBD_ALLOC_PTR(qctl);
+ if (qctl == NULL)
+ GOTO(out, err = -ENOMEM);
+
+ /* acq/rel qunit for specified uid/gid is serialized,
+ * so there is no race between get fs quota limit and
+ * set fs quota limit */
+ qctl->qc_cmd = Q_GETQUOTA;
+ qctl->qc_id = qdata->qd_id;
+ qctl->qc_type = qdata->qd_type;
+ err = fsfilt_quotactl(obd, sb, qctl);
+ if (err) {
+ CERROR("error get quota fs limit! (rc:%d)\n", err);
+ GOTO(out_mem, err);
+ }
+
+ if (qdata->qd_isblk) {
+ qctl->qc_dqblk.dqb_valid = QIF_BLIMITS;
+ hardlimit = &qctl->qc_dqblk.dqb_bhardlimit;
+ } else {
+ qctl->qc_dqblk.dqb_valid = QIF_ILIMITS;
+ hardlimit = &qctl->qc_dqblk.dqb_ihardlimit;
+ }
+
+ switch (opc) {
+ case QUOTA_DQACQ:
+ INC_QLIMIT(*hardlimit, count);
+ break;
+ case QUOTA_DQREL:
+ LASSERT(count < *hardlimit);
+ *hardlimit -= count;
+ break;
+ default:
+ LBUG();
+ }
+
+ /* clear quota limit */
+ if (count == 0)
+ *hardlimit = 0;
+
+ qctl->qc_cmd = Q_SETQUOTA;
+ err = fsfilt_quotactl(obd, sb, qctl);
+ if (err)
+ CERROR("error set quota fs limit! (rc:%d)\n", err);
+
+ QDATA_DEBUG(qdata, "%s completion\n",
+ opc == QUOTA_DQACQ ? "DQACQ" : "DQREL");
+out_mem:
+ OBD_FREE_PTR(qctl);
+ } else if (rc == -EDQUOT) {
+ QDATA_DEBUG(qdata, "acquire qunit got EDQUOT.\n");
+ } else if (rc == -EBUSY) {
+ QDATA_DEBUG(qdata, "it's is recovering, got EBUSY.\n");
+ } else {
+ CERROR("acquire qunit got error! (rc:%d)\n", rc);
+ }
+out:
+ /* remove the qunit from hash */
+ spin_lock(&qunit_hash_lock);
+
+ qunit = dqacq_in_flight(qctxt, qdata);
+ /* this qunit has been removed by qctxt_cleanup() */
+ if (!qunit) {
+ spin_unlock(&qunit_hash_lock);
+ RETURN(err);
+ }
+
+ LASSERT(opc == qunit->lq_opc);
+ remove_qunit_nolock(qunit);
+
+ /* wake up all waiters */
+ list_for_each_entry_safe(qw, tmp, &qunit->lq_waiters, qw_entry) {
+ list_del_init(&qw->qw_entry);
+ qw->qw_rc = rc;
+ wake_up(&qw->qw_waitq);
+ }
+
+ spin_unlock(&qunit_hash_lock);
+
+ qunit_put(qunit);
+
+ /* don't reschedule in these cases:
+ * - acq/rel failure, unless it is -EBUSY (quota recovery
+ * in progress).
+ * - local dqacq/dqrel (the master serves itself).
+ * - local disk io failure.
+ */
+ if (err || (rc && rc != -EBUSY) ||
+ is_master(obd, qctxt, qdata->qd_id, qdata->qd_type))
+ RETURN(err);
+
+ /* reschedule another dqacq/dqrel if needed */
+ qdata->qd_count = 0;
+ rc = check_cur_qunit(obd, qctxt, qdata);
+ if (rc > 0) {
+ int opc;
+ opc = rc == 1 ? QUOTA_DQACQ : QUOTA_DQREL;
+ rc = schedule_dqacq(obd, qctxt, qdata, opc, 0);
+ QDATA_DEBUG(qdata, "reschedudle opc(%d) rc(%d)\n", opc, rc);
+ }
+ RETURN(err);
+}
+
+struct dqacq_async_args {
+ struct lustre_quota_ctxt *aa_ctxt;
+ struct lustre_qunit *aa_qunit;
+};
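+
+/* the async args ride in the request's rq_async_args scratch space; the
+ * CLASSERT in schedule_dqacq() verifies the struct actually fits there */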
+
+static int dqacq_interpret(struct ptlrpc_request *req, void *data, int rc)
+{
+ struct dqacq_async_args *aa = (struct dqacq_async_args *)data;
+ struct lustre_quota_ctxt *qctxt = aa->aa_ctxt;
+ struct lustre_qunit *qunit = aa->aa_qunit;
+ struct obd_device *obd = req->rq_import->imp_obd;
+ struct qunit_data *qdata = NULL;
+ ENTRY;
+
+ qdata = lustre_swab_repbuf(req, 0, sizeof(*qdata), lustre_swab_qdata);
+ if (qdata == NULL) {
+ /* a failed request may carry no reply buffer at all;
+ * don't dereference it in the assertions below */
+ RETURN(rc ? rc : -EPROTO);
+ }
+
+ LASSERT(qdata->qd_id == qunit->lq_data.qd_id &&
+ qdata->qd_type == qunit->lq_data.qd_type &&
+ (qdata->qd_count == qunit->lq_data.qd_count ||
+ qdata->qd_count == 0));
+
+ QDATA_DEBUG(qdata, "%s interpret rc(%d).\n",
+ req->rq_reqmsg->opc == QUOTA_DQACQ ? "DQACQ" : "DQREL", rc);
+
+ rc = dqacq_completion(obd, qctxt, qdata, rc, req->rq_reqmsg->opc);
+
+ RETURN(rc);
+}
+
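+/* dqacq_completion() removes a waiter from lq_waiters before waking it up,
+ * so an empty qw_entry means the dqacq/dqrel we slept on has completed */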
+static int got_qunit(struct qunit_waiter *waiter)
+{
+ int rc = 0;
+ ENTRY;
+ spin_lock(&qunit_hash_lock);
+ rc = list_empty(&waiter->qw_entry);
+ spin_unlock(&qunit_hash_lock);
+ RETURN(rc);
+}
+
+static int
+schedule_dqacq(struct obd_device *obd,
+ struct lustre_quota_ctxt *qctxt,
+ struct qunit_data *qdata, int opc, int wait)
+{
+ struct lustre_qunit *qunit, *empty;
+ struct qunit_waiter qw;
+ struct l_wait_info lwi = { 0 };
+ struct ptlrpc_request *req;
+ struct qunit_data *reqdata;
+ struct dqacq_async_args *aa;
+ int size = sizeof(*reqdata);
+ int rc = 0;
+ ENTRY;
+
+ INIT_LIST_HEAD(&qw.qw_entry);
+ init_waitqueue_head(&qw.qw_waitq);
+ qw.qw_rc = 0;
+
+ if ((empty = alloc_qunit(qctxt, qdata, opc)) == NULL)
+ RETURN(-ENOMEM);
+
+ spin_lock(&qunit_hash_lock);
+
+ qunit = dqacq_in_flight(qctxt, qdata);
+ if (qunit) {
+ if (wait)
+ list_add_tail(&qw.qw_entry, &qunit->lq_waiters);
+ spin_unlock(&qunit_hash_lock);
+
+ free_qunit(empty);
+ goto wait_completion;
+ }
+ qunit = empty;
+ insert_qunit_nolock(qctxt, qunit);
+ if (wait)
+ list_add_tail(&qw.qw_entry, &qunit->lq_waiters);
+ spin_unlock(&qunit_hash_lock);
+
+ LASSERT(qunit);
+
+ /* master is going to dqacq/dqrel from itself */
+ if (is_master(obd, qctxt, qdata->qd_id, qdata->qd_type)) {
+ int rc2;
+ QDATA_DEBUG(qdata, "local %s.\n",
+ opc == QUOTA_DQACQ ? "DQACQ" : "DQREL");
+ rc = qctxt->lqc_handler(obd, qdata, opc);
+ rc2 = dqacq_completion(obd, qctxt, qdata, rc, opc);
+ RETURN((rc && rc != -EDQUOT) ? rc : rc2);
+ }
+
+ /* build dqacq/dqrel request */
+ LASSERT(qctxt->lqc_import);
+ req = ptlrpc_prep_req(qctxt->lqc_import, LUSTRE_MDS_VERSION, opc, 1,
+ &size, NULL);
+ if (!req) {
+ dqacq_completion(obd, qctxt, qdata, -ENOMEM, opc);
+ RETURN(-ENOMEM);
+ }
+
+ reqdata = lustre_msg_buf(req->rq_reqmsg, 0, sizeof(*reqdata));
+ *reqdata = *qdata;
+ size = sizeof(*reqdata);
+ req->rq_replen = lustre_msg_size(1, &size);
+
+ CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
+ aa = (struct dqacq_async_args *)&req->rq_async_args;
+ aa->aa_ctxt = qctxt;
+ aa->aa_qunit = qunit;
+
+ req->rq_interpret_reply = dqacq_interpret;
+ ptlrpcd_add_req(req);
+
+ QDATA_DEBUG(qdata, "%s scheduled.\n",
+ opc == QUOTA_DQACQ ? "DQACQ" : "DQREL");
+wait_completion:
+ if (wait && qunit) {
+ struct qunit_data *p = &qunit->lq_data;
+ QDATA_DEBUG(p, "wait for dqacq.\n");
+
+ l_wait_event(qw.qw_waitq, got_qunit(&qw), &lwi);
+ if (qw.qw_rc == 0)
+ rc = -EAGAIN;
+
+ CDEBUG(D_QUOTA, "wait dqacq done. (rc:%d)\n", qw.qw_rc);
+ }
+ RETURN(rc);
+}
+
+int
+qctxt_adjust_qunit(struct obd_device *obd, struct lustre_quota_ctxt *qctxt,
+ uid_t uid, gid_t gid, __u32 isblk, int wait)
+{
+ int ret, rc = 0, i = USRQUOTA;
+ __u32 id[MAXQUOTAS] = { uid, gid };
+ struct qunit_data qdata[MAXQUOTAS];
+ ENTRY;
+
+ CLASSERT(MAXQUOTAS < 4);
+ if (!sb_any_quota_enabled(qctxt->lqc_sb))
+ RETURN(0);
+
+ for (i = 0; i < MAXQUOTAS; i++) {
+ qdata[i].qd_id = id[i];
+ qdata[i].qd_type = i;
+ qdata[i].qd_isblk = isblk;
+ qdata[i].qd_count = 0;
+
+ ret = check_cur_qunit(obd, qctxt, &qdata[i]);
+ if (ret > 0) {
+ int opc;
+ /* need acquire or release */
+ opc = ret == 1 ? QUOTA_DQACQ : QUOTA_DQREL;
+ ret = schedule_dqacq(obd, qctxt, &qdata[i], opc, wait);
+ if (!rc)
+ rc = ret;
+ }
+ }
+
+ RETURN(rc);
+}
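+
+/* Caller sketch (mirroring filter_quota_acquire() later in this patch): an
+ * OST write path asks for block quota on both owners and blocks until the
+ * dqacq RPC completes:
+ *
+ * rc = qctxt_adjust_qunit(obd, qctxt, uid, gid, 1, 1);
+ *
+ * while the MDS paths pass isblk = 0 and/or wait = 0 to adjust quota
+ * without blocking. */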
+
+int
+qctxt_wait_pending_dqacq(struct lustre_quota_ctxt *qctxt, unsigned int id,
+ unsigned short type, int isblk)
+{
+ struct lustre_qunit *qunit = NULL;
+ struct qunit_waiter qw;
+ struct qunit_data qdata;
+ struct l_wait_info lwi = { 0 };
+ ENTRY;
+
+ INIT_LIST_HEAD(&qw.qw_entry);
+ init_waitqueue_head(&qw.qw_waitq);
+ qw.qw_rc = 0;
+
+ qdata.qd_id = id;
+ qdata.qd_type = type;
+ qdata.qd_isblk = isblk;
+ qdata.qd_count = 0;
+
+ spin_lock(&qunit_hash_lock);
+
+ qunit = dqacq_in_flight(qctxt, &qdata);
+ if (qunit)
+ list_add_tail(&qw.qw_entry, &qunit->lq_waiters);
+
+ spin_unlock(&qunit_hash_lock);
+
+ if (qunit) {
+ struct qunit_data *p = &qdata;
+ QDATA_DEBUG(p, "wait for dqacq completion.\n");
+ l_wait_event(qw.qw_waitq, got_qunit(&qw), &lwi);
+ QDATA_DEBUG(p, "wait dqacq done. (rc:%d)\n", qw.qw_rc);
+ }
+ RETURN(0);
+}
+
+int
+qctxt_init(struct lustre_quota_ctxt *qctxt, struct super_block *sb,
+ dqacq_handler_t handler)
+{
+ int rc = 0;
+ ENTRY;
+
+ rc = ptlrpcd_addref();
+ if (rc)
+ RETURN(rc);
+
+ qctxt->lqc_handler = handler;
+ qctxt->lqc_sb = sb;
+ qctxt->lqc_import = NULL;
+ qctxt->lqc_recovery = 0;
+ qctxt->lqc_bunit_sz = default_bunit_sz;
+ qctxt->lqc_btune_sz = default_bunit_sz / 100 * default_btune_ratio;
+ qctxt->lqc_iunit_sz = default_iunit_sz;
+ qctxt->lqc_itune_sz = default_iunit_sz * default_itune_ratio / 100;
+
+ RETURN(0);
+}
+
+void qctxt_cleanup(struct lustre_quota_ctxt *qctxt, int force)
+{
+ struct lustre_qunit *qunit, *tmp;
+ struct qunit_waiter *qw, *tmp2;
+ int i;
+ ENTRY;
+
+ spin_lock(&qunit_hash_lock);
+
+ for (i = 0; i < NR_DQHASH; i++) {
+ list_for_each_entry_safe(qunit, tmp, &qunit_hash[i], lq_hash) {
+ if (qunit->lq_ctxt != qctxt)
+ continue;
+
+ remove_qunit_nolock(qunit);
+ /* wake up all waiters */
+ list_for_each_entry_safe(qw, tmp2, &qunit->lq_waiters,
+ qw_entry) {
+ list_del_init(&qw->qw_entry);
+ qw->qw_rc = 0;
+ wake_up(&qw->qw_waitq);
+ }
+ qunit_put(qunit);
+ }
+ }
+
+ spin_unlock(&qunit_hash_lock);
+
+ ptlrpcd_decref();
+
+ EXIT;
+}
+
+struct qslave_recov_thread_data {
+ struct obd_device *obd;
+ struct lustre_quota_ctxt *qctxt;
+ struct completion comp;
+};
+
+/* FIXME: only block quota is recovered for now */
+static int qslave_recovery_main(void *arg)
+{
+ struct qslave_recov_thread_data *data = arg;
+ struct obd_device *obd = data->obd;
+ struct lustre_quota_ctxt *qctxt = data->qctxt;
+ unsigned long flags;
+ unsigned int type;
+ int rc = 0;
+ ENTRY;
+
+ lock_kernel();
+ ptlrpc_daemonize();
+
+ SIGNAL_MASK_LOCK(current, flags);
+ sigfillset(&current->blocked);
+ RECALC_SIGPENDING;
+ SIGNAL_MASK_UNLOCK(current, flags);
+ THREAD_NAME(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX - 1, "%s", "qslave_recovd");
+ unlock_kernel();
+
+ complete(&data->comp);
+
+ if (qctxt->lqc_recovery)
+ RETURN(0);
+ qctxt->lqc_recovery = 1;
+
+ for (type = USRQUOTA; type < MAXQUOTAS; type++) {
+ struct qunit_data qdata;
+ struct quota_info *dqopt = sb_dqopt(qctxt->lqc_sb);
+ struct list_head id_list;
+ struct dquot_id *dqid, *tmp;
+ int ret;
+
+ down(&dqopt->dqonoff_sem);
+ if (!sb_has_quota_enabled(qctxt->lqc_sb, type)) {
+ up(&dqopt->dqonoff_sem);
+ break;
+ }
+
+ LASSERT(dqopt->files[type] != NULL);
+ INIT_LIST_HEAD(&id_list);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,12)
+ rc = fsfilt_qids(obd, dqopt->files[type], NULL, type, &id_list);
+#else
+ rc = fsfilt_qids(obd, NULL, dqopt->files[type], type, &id_list);
+#endif
+ up(&dqopt->dqonoff_sem);
+ if (rc)
+ CERROR("Get ids from quota file failed. (rc:%d)\n", rc);
+
+ list_for_each_entry_safe(dqid, tmp, &id_list, di_link) {
+ list_del_init(&dqid->di_link);
+ /* skip slave recovery on itself */
+ if (is_master(obd, qctxt, dqid->di_id, type))
+ goto free;
+ if (rc && rc != -EBUSY)
+ goto free;
+
+ qdata.qd_id = dqid->di_id;
+ qdata.qd_type = type;
+ qdata.qd_isblk = 1;
+ qdata.qd_count = 0;
+
+ ret = check_cur_qunit(obd, qctxt, &qdata);
+ if (ret > 0) {
+ int opc;
+ opc = ret == 1 ? QUOTA_DQACQ : QUOTA_DQREL;
+ rc = schedule_dqacq(obd, qctxt, &qdata, opc, 0);
+ } else
+ rc = 0;
+
+ if (rc)
+ CDEBUG(rc == -EBUSY ? D_QUOTA : D_ERROR,
+ "qslave recovery failed! (id:%d type:%d "
+ " rc:%d)\n", dqid->di_id, type, rc);
+free:
+ kfree(dqid);
+ }
+ }
+
+ qctxt->lqc_recovery = 0;
+ RETURN(rc);
+}
+
+void
+qslave_start_recovery(struct obd_device *obd, struct lustre_quota_ctxt *qctxt)
+{
+ struct qslave_recov_thread_data data;
+ int rc;
+ ENTRY;
+
+ if (!sb_any_quota_enabled(qctxt->lqc_sb))
+ goto exit;
+
+ data.obd = obd;
+ data.qctxt = qctxt;
+ init_completion(&data.comp);
+
+ rc = kernel_thread(qslave_recovery_main, &data, CLONE_VM|CLONE_FILES);
+ if (rc < 0) {
+ CERROR("Cannot start quota recovery thread: rc %d\n", rc);
+ goto exit;
+ }
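+ /* only data.comp is waited on here; the recovery thread copies
+ * the obd/qctxt pointers out of 'data' before calling complete(),
+ * since 'data' lives on this function's stack */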
+ wait_for_completion(&data.comp);
+exit:
+ EXIT;
+}
+
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lustre/quota/quota_ctl.c
+ *
+ * Copyright (c) 2005 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * No redistribution or use is permitted outside of Cluster File Systems, Inc.
+ *
+ */
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_MDS
+
+#ifdef __KERNEL__
+# include <linux/version.h>
+# include <linux/module.h>
+# include <linux/init.h>
+# include <linux/fs.h>
+# include <linux/jbd.h>
+# include <linux/ext3_fs.h>
+# include <linux/quota.h>
+# if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+# include <linux/smp_lock.h>
+# include <linux/buffer_head.h>
+# include <linux/workqueue.h>
+# include <linux/mount.h>
+# else
+# include <linux/locks.h>
+# endif
+#else /* __KERNEL__ */
+# include <liblustre.h>
+#endif
+
+#include <linux/obd_class.h>
+#include <linux/lustre_mds.h>
+#include <linux/lustre_dlm.h>
+#include <linux/lustre_cfg.h>
+#include <linux/obd_ost.h>
+#include <linux/lustre_fsfilt.h>
+#include <linux/lustre_quota.h>
+#include "quota_internal.h"
+
+#ifdef __KERNEL__
+int mds_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl)
+{
+ struct obd_device *obd = exp->exp_obd;
+ int rc = 0;
+ ENTRY;
+
+ switch (oqctl->qc_cmd) {
+ case Q_QUOTAON:
+ rc = mds_quota_on(obd, oqctl);
+ break;
+ case Q_QUOTAOFF:
+ mds_quota_off(obd, oqctl);
+ break;
+ case Q_SETINFO:
+ rc = mds_set_dqinfo(obd, oqctl);
+ break;
+ case Q_GETINFO:
+ rc = mds_get_dqinfo(obd, oqctl);
+ break;
+ case Q_SETQUOTA:
+ rc = mds_set_dqblk(obd, oqctl);
+ break;
+ case Q_GETQUOTA:
+ rc = mds_get_dqblk(obd, oqctl);
+ break;
+ case Q_GETOINFO:
+ case Q_GETOQUOTA:
+ rc = mds_get_obd_quota(obd, oqctl);
+ break;
+ default:
+ CERROR("%s: unsupported mds_quotactl command: %d\n",
+ obd->obd_name, oqctl->qc_cmd);
+ RETURN(-EFAULT);
+ }
+
+ if (rc)
+ CDEBUG(D_INFO, "mds_quotactl admin quota command %d, id %u, "
+ "type %d, failed: rc = %d\n",
+ oqctl->qc_cmd, oqctl->qc_id, oqctl->qc_type, rc);
+
+ RETURN(rc);
+}
+
+int filter_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct lvfs_run_ctxt saved;
+ int rc = 0;
+ ENTRY;
+
+ switch (oqctl->qc_cmd) {
+ case Q_QUOTAON:
+ case Q_QUOTAOFF:
+ case Q_GETOINFO:
+ case Q_GETOQUOTA:
+ case Q_GETQUOTA:
+ /* In a recovery scenario, a pending dqacq/dqrel may already have
+ * been processed successfully by the master before its dquot on
+ * the master entered recovery mode. We must wait for that
+ * dqacq/dqrel to finish before returning the correct limits to
+ * the master */
+ if (oqctl->qc_stat == QUOTA_RECOVERING)
+ qctxt_wait_pending_dqacq(&obd->u.obt.obt_qctxt,
+ oqctl->qc_id, oqctl->qc_type,
+ 1);
+
+ push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+ rc = fsfilt_quotactl(obd, obd->u.obt.obt_sb, oqctl);
+ pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+ break;
+ case Q_INITQUOTA:
+ {
+ unsigned int uid = 0, gid = 0;
+
+ /* Initialize quota limit to MIN_QLIMIT */
+ LASSERT(oqctl->qc_dqblk.dqb_valid == QIF_BLIMITS);
+ LASSERT(oqctl->qc_dqblk.dqb_bhardlimit == MIN_QLIMIT);
+ LASSERT(oqctl->qc_dqblk.dqb_bsoftlimit == 0);
+
+ /* There might be a pending dqacq/dqrel (which is going to
+ * clear stale limits on the slave). We should wait for its
+ * completion before initializing the limits */
+ qctxt_wait_pending_dqacq(&obd->u.obt.obt_qctxt,
+ oqctl->qc_id, oqctl->qc_type, 1);
+
+ push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+ rc = fsfilt_quotactl(obd, obd->u.obt.obt_sb, oqctl);
+
+ /* Sync the quota file to disk, so the changed limits
+ * (MIN_QLIMIT) are not lost on a crash, where they could not
+ * be recovered */
+ if (!rc) {
+ oqctl->qc_cmd = Q_SYNC;
+ fsfilt_quotactl(obd, obd->u.obt.obt_sb, oqctl);
+ oqctl->qc_cmd = Q_INITQUOTA;
+ }
+ pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+
+ if (rc)
+ RETURN(rc);
+
+ /* Trigger qunit pre-acquire */
+ if (oqctl->qc_type == USRQUOTA)
+ uid = oqctl->qc_id;
+ else
+ gid = oqctl->qc_id;
+
+ rc = qctxt_adjust_qunit(obd, &obd->u.obt.obt_qctxt,
+ uid, gid, 1, 0);
+ break;
+ }
+ default:
+ CERROR("%s: unsupported filter_quotactl command: %d\n",
+ obd->obd_name, oqctl->qc_cmd);
+ RETURN(-EFAULT);
+ }
+
+ RETURN(rc);
+}
+#endif /* __KERNEL__ */
+
+int client_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl)
+{
+ struct ptlrpc_request *req;
+ struct obd_quotactl *oqc;
+ int size = sizeof(*oqctl), opc, version;
+ int rc;
+ ENTRY;
+
+ if (!strcmp(exp->exp_obd->obd_type->typ_name, LUSTRE_MDC_NAME)) {
+ opc = MDS_QUOTACTL;
+ version = LUSTRE_MDS_VERSION;
+ } else if (!strcmp(exp->exp_obd->obd_type->typ_name, LUSTRE_OSC_NAME)) {
+ opc = OST_QUOTACTL;
+ version = LUSTRE_OST_VERSION;
+ } else {
+ RETURN(-EINVAL);
+ }
+
+ req = ptlrpc_prep_req(class_exp2cliimp(exp), version, opc, 1, &size,
+ NULL);
+ if (!req)
+ GOTO(out, rc = -ENOMEM);
+
+ oqc = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*oqctl));
+ *oqc = *oqctl;
+
+ req->rq_replen = lustre_msg_size(1, &size);
+
+ rc = ptlrpc_queue_wait(req);
+ if (!rc) {
+ oqc = lustre_swab_repbuf(req, 0, sizeof(*oqc),
+ lustre_swab_obd_quotactl);
+ if (oqc == NULL) {
+ CERROR("Can't unpack obd_quotactl\n");
+ GOTO(out, rc = -EPROTO);
+ }
+
+ *oqctl = *oqc;
+ }
+out:
+ ptlrpc_req_finished(req);
+ RETURN (rc);
+}
+
+int lov_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl)
+{
+ struct obd_device *obd = class_exp2obd(exp);
+ struct lov_obd *lov = &obd->u.lov;
+ __u64 curspace = 0;
+ __u32 bhardlimit = 0;
+ int i, rc = 0;
+ ENTRY;
+
+ if (oqctl->qc_cmd != Q_QUOTAON && oqctl->qc_cmd != Q_QUOTAOFF &&
+ oqctl->qc_cmd != Q_GETOQUOTA && oqctl->qc_cmd != Q_INITQUOTA) {
+ CERROR("bad quota opc %x for lov obd", oqctl->qc_cmd);
+ RETURN(-EFAULT);
+ }
+
+ for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+ int err;
+
+ if (!lov->tgts[i].active) {
+ if (oqctl->qc_cmd == Q_GETOQUOTA) {
+ CERROR("ost %d is inactive\n", i);
+ rc = -EIO;
+ break;
+ } else {
+ CDEBUG(D_HA, "ost %d is inactive\n", i);
+ continue;
+ }
+ }
+
+ err = obd_quotactl(lov->tgts[i].ltd_exp, oqctl);
+ if (err) {
+ if (lov->tgts[i].active && !rc)
+ rc = err;
+ continue;
+ }
+
+ if (oqctl->qc_cmd == Q_GETOQUOTA) {
+ curspace += oqctl->qc_dqblk.dqb_curspace;
+ bhardlimit += oqctl->qc_dqblk.dqb_bhardlimit;
+ }
+ }
+
+ if (oqctl->qc_cmd == Q_GETOQUOTA) {
+ oqctl->qc_dqblk.dqb_curspace = curspace;
+ oqctl->qc_dqblk.dqb_bhardlimit = bhardlimit;
+ }
+ RETURN(rc);
+}
+
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lustre/quota/quota_interface.c
+ *
+ * Copyright (c) 2001-2005 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * No redistribution or use is permitted outside of Cluster File Systems, Inc.
+ *
+ */
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_MDS
+
+#ifdef __KERNEL__
+# include <linux/version.h>
+# include <linux/module.h>
+# include <linux/init.h>
+# include <linux/fs.h>
+# include <linux/jbd.h>
+# include <linux/ext3_fs.h>
+# include <linux/parser.h>
+# if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+# include <linux/smp_lock.h>
+# include <linux/buffer_head.h>
+# include <linux/workqueue.h>
+# include <linux/mount.h>
+# else
+# include <linux/locks.h>
+# endif
+#else /* __KERNEL__ */
+# include <liblustre.h>
+#endif
+
+#include <linux/obd_class.h>
+#include <linux/lustre_mds.h>
+#include <linux/lustre_dlm.h>
+#include <linux/lustre_cfg.h>
+#include <linux/obd_ost.h>
+#include <linux/lustre_fsfilt.h>
+#include <linux/lustre_quota.h>
+#include "quota_internal.h"
+
+
+#ifdef __KERNEL__
+extern unsigned long default_bunit_sz;
+extern unsigned long default_btune_ratio;
+extern unsigned long default_iunit_sz;
+extern unsigned long default_itune_ratio;
+
+enum {
+ Opt_quotaon, Opt_iunit_sz, Opt_bunit_sz,
+ Opt_itune_ratio, Opt_btune_ratio, Opt_err,
+};
+
+static match_table_t tokens = {
+ {Opt_quotaon, "quotaon=%10s"},
+ {Opt_iunit_sz, "iunit=%u"},
+ {Opt_bunit_sz, "bunit=%u"},
+ {Opt_itune_ratio, "itune=%u"},
+ {Opt_btune_ratio, "btune=%u"},
+ {Opt_err, NULL}
+};
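+
+/* e.g. the config string "quotaon=ug,iunit=5000,bunit=100,itune=50,btune=50"
+ * (values purely illustrative) enables user+group quota with a 5000-inode
+ * and 100MB qunit and 50% tune ratios; this is the same grammar as the
+ * usage string printed on parse errors below */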
+
+static int
+quota_parse_config_args(char *options, int *quotaon, int *type,
+ struct lustre_quota_ctxt *qctxt)
+{
+ char *opt;
+ substring_t args[MAX_OPT_ARGS];
+ int option;
+ int rc = 0;
+ unsigned long iunit = 0, bunit = 0, itune = 0, btune = 0;
+ ENTRY;
+
+ while ((opt = strsep (&options, ",")) != NULL) {
+ int token;
+ if (!*opt)
+ continue;
+
+ token = match_token(opt, tokens, args);
+ switch(token) {
+ case Opt_quotaon: {
+ char *quota_type = match_strdup(&args[0]);
+ if (!quota_type)
+ GOTO(out, rc = -EINVAL);
+
+ *quotaon = 1;
+ if (strchr(quota_type, 'u') && strchr(quota_type, 'g'))
+ *type = UGQUOTA;
+ else if (strchr(quota_type, 'u'))
+ *type = USRQUOTA;
+ else if (strchr(quota_type, 'g'))
+ *type = GRPQUOTA;
+ else {
+ *quotaon = 0;
+ rc = -EINVAL;
+ }
+ break;
+ }
+ case Opt_iunit_sz:
+ if (match_int(&args[0], &option)) {
+ rc = -EINVAL;
+ break;
+ }
+ iunit = option;
+ break;
+ case Opt_bunit_sz:
+ if (match_int(&args[0], &option)) {
+ rc = -EINVAL;
+ break;
+ }
+ bunit = option;
+ break;
+ case Opt_itune_ratio:
+ if (match_int(&args[0], &option) ||
+ option <= 0 || option >= 100) {
+ rc = -EINVAL;
+ break;
+ }
+ itune = option;
+ break;
+ case Opt_btune_ratio:
+ if (match_int(&args[0], &option) ||
+ option <= 0 || option >= 100) {
+ rc = -EINVAL;
+ break;
+ }
+ btune = option;
+ break;
+ default:
+ rc = -EINVAL;
+ }
+
+ if (rc)
+ GOTO(out, rc);
+ }
+
+ /* adjust the tunables of qunits based on quota config args */
+ if (iunit)
+ qctxt->lqc_iunit_sz = iunit;
+ if (itune)
+ qctxt->lqc_itune_sz = qctxt->lqc_iunit_sz *
+ itune / 100;
+ else
+ qctxt->lqc_itune_sz = qctxt->lqc_iunit_sz *
+ default_itune_ratio / 100;
+ if (bunit)
+ qctxt->lqc_bunit_sz = bunit << 20;
+ if (btune)
+ qctxt->lqc_btune_sz = ((qctxt->lqc_bunit_sz >> 20) *
+ btune / 100) << 20;
+ else
+ qctxt->lqc_btune_sz = ((qctxt->lqc_bunit_sz >> 20) *
+ default_btune_ratio / 100) << 20;
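+ /* note: bunit/btune are given in MB on the config line but kept
+ * in bytes internally, hence the << 20 conversions above */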
+
+ CDEBUG(D_INFO, "iunit=%lu bunit=%lu itune=%lu btune=%lu\n",
+ qctxt->lqc_iunit_sz, qctxt->lqc_bunit_sz,
+ qctxt->lqc_itune_sz, qctxt->lqc_btune_sz);
+ EXIT;
+
+ out:
+ if (rc)
+ CERROR("quota config args parse error!(rc = %d) usage: "
+ "--quota quotaon=u|g|ug,iunit=100,bunit=100,itune=50,btune=50\n",
+ rc);
+
+ return rc;
+}
+
+static int auto_quota_on(struct obd_device *obd, int type,
+ struct super_block *sb, int is_master)
+{
+ struct obd_quotactl *oqctl;
+ struct lvfs_run_ctxt saved;
+ int rc;
+ ENTRY;
+
+ LASSERT(type == USRQUOTA || type == GRPQUOTA || type == UGQUOTA);
+
+ OBD_ALLOC_PTR(oqctl);
+ if (!oqctl)
+ RETURN(-ENOMEM);
+
+ oqctl->qc_type = type;
+ oqctl->qc_cmd = Q_QUOTAON;
+ oqctl->qc_id = QFMT_LDISKFS;
+
+ push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+
+ if (!is_master)
+ goto local_quota;
+
+ /* turn on cluster wide quota */
+ rc = mds_admin_quota_on(obd, oqctl);
+ if (rc) {
+ CERROR("auto enable admin quota error! err = %d\n", rc);
+ GOTO(out_pop, rc);
+ }
+local_quota:
+ /* turn on local quota */
+ rc = fsfilt_quotactl(obd, sb, oqctl);
+ CDEBUG(rc ? D_ERROR : D_INFO, "auto-enable quota. rc=%d\n", rc);
+ if (rc && is_master)
+ mds_quota_off(obd, oqctl);
+out_pop:
+ pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+
+ OBD_FREE_PTR(oqctl);
+ RETURN(rc);
+}
+
+static int mds_auto_quota_on(struct obd_device *obd, int type)
+{
+ int rc;
+ ENTRY;
+ rc = auto_quota_on(obd, type, obd->u.obt.obt_sb, 1);
+ RETURN(rc);
+}
+
+static int filter_auto_quota_on(struct obd_device *obd, int type)
+{
+ int rc = 0;
+ ENTRY;
+ rc = auto_quota_on(obd, type, obd->u.obt.obt_sb, 0);
+ RETURN(rc);
+}
+
+static int filter_quota_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+ int rc = 0;
+ struct obd_device_target *obt = &obd->u.obt;
+ ENTRY;
+
+ atomic_set(&obt->obt_quotachecking, 1);
+ rc = qctxt_init(&obt->obt_qctxt, obt->obt_sb, NULL);
+ if (rc) {
+ CERROR("initialize quota context failed! (rc:%d)\n", rc);
+ RETURN(rc);
+ }
+
+ /* Based on quota config args, set qunit sizes and enable quota */
+ if (LUSTRE_CFG_BUFLEN(lcfg, 5) > 0 && lustre_cfg_buf(lcfg, 5)) {
+ char *args = lustre_cfg_string(lcfg, 5);
+ int quotaon = 0, type;
+ int err = 0;
+
+ err = quota_parse_config_args(args, &quotaon, &type,
+ &obd->u.obt.obt_qctxt);
+ if (!err && quotaon)
+ filter_auto_quota_on(obd, type);
+ }
+
+ RETURN(rc);
+}
+
+static int filter_quota_cleanup(struct obd_device *obd)
+{
+ qctxt_cleanup(&obd->u.obt.obt_qctxt, 0);
+ return 0;
+}
+
+static int filter_quota_setinfo(struct obd_export *exp, struct obd_device *obd)
+{
+ /* setup the quota context import */
+ obd->u.obt.obt_qctxt.lqc_import = exp->exp_imp_reverse;
+ /* start quota slave recovery thread. (release high limits) */
+ qslave_start_recovery(obd, &obd->u.obt.obt_qctxt);
+ return 0;
+}
+
+static int filter_quota_enforce(struct obd_device *obd, unsigned int ignore)
+{
+ ENTRY;
+
+ if (!sb_any_quota_enabled(obd->u.obt.obt_sb))
+ RETURN(0);
+
+ if (ignore)
+ cap_raise(current->cap_effective, CAP_SYS_RESOURCE);
+ else
+ cap_lower(current->cap_effective, CAP_SYS_RESOURCE);
+
+ RETURN(0);
+}
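+
+/* CAP_SYS_RESOURCE is the capability the kernel quota code checks before
+ * letting a task exceed its limits, so raising it above lets this server
+ * thread bypass local enforcement and lowering it re-enables the check */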
+
+static int filter_quota_getflag(struct obd_device *obd, struct obdo *oa)
+{
+ struct obd_device_target *obt = &obd->u.obt;
+ int err, cnt, rc = 0;
+ struct obd_quotactl *oqctl;
+ ENTRY;
+
+ if (!sb_any_quota_enabled(obt->obt_sb))
+ RETURN(0);
+
+ oa->o_flags &= ~(OBD_FL_NO_USRQUOTA | OBD_FL_NO_GRPQUOTA);
+
+ OBD_ALLOC_PTR(oqctl);
+ if (!oqctl) {
+ CERROR("Not enough memory!");
+ RETURN(-ENOMEM);
+ }
+
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ memset(oqctl, 0, sizeof(*oqctl));
+
+ oqctl->qc_cmd = Q_GETQUOTA;
+ oqctl->qc_type = cnt;
+ oqctl->qc_id = (cnt == USRQUOTA) ? oa->o_uid : oa->o_gid;
+ err = fsfilt_quotactl(obd, obt->obt_sb, oqctl);
+ if (err) {
+ if (!rc)
+ rc = err;
+ continue;
+ }
+
+ /* set over quota flags for a uid/gid */
+ oa->o_valid |= (cnt == USRQUOTA) ?
+ OBD_MD_FLUSRQUOTA : OBD_MD_FLGRPQUOTA;
+ if (oqctl->qc_dqblk.dqb_bhardlimit &&
+ (toqb(oqctl->qc_dqblk.dqb_curspace) >
+ oqctl->qc_dqblk.dqb_bhardlimit))
+ oa->o_flags |= (cnt == USRQUOTA) ?
+ OBD_FL_NO_USRQUOTA : OBD_FL_NO_GRPQUOTA;
+ }
+ OBD_FREE_PTR(oqctl);
+ RETURN(rc);
+}
+
+static int filter_quota_acquire(struct obd_device *obd, unsigned int uid,
+ unsigned int gid)
+{
+ struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt;
+ int rc;
+ ENTRY;
+
+ rc = qctxt_adjust_qunit(obd, qctxt, uid, gid, 1, 1);
+ RETURN(rc == -EAGAIN);
+}
+
+static int mds_quota_init(void)
+{
+ return lustre_dquot_init();
+}
+
+static int mds_quota_exit(void)
+{
+ lustre_dquot_exit();
+ return 0;
+}
+
+static int mds_quota_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+ struct obd_device_target *obt = &obd->u.obt;
+ struct mds_obd *mds = &obd->u.mds;
+ int rc;
+ ENTRY;
+
+ atomic_set(&obt->obt_quotachecking, 1);
+ /* initialize quota master and quota context */
+ sema_init(&mds->mds_qonoff_sem, 1);
+ rc = qctxt_init(&obt->obt_qctxt, obt->obt_sb, dqacq_handler);
+ if (rc) {
+ CERROR("initialize quota context failed! (rc:%d)\n", rc);
+ RETURN(rc);
+ }
+
+ /* Based on quota config args, set qunit sizes and enable quota */
+ if (LUSTRE_CFG_BUFLEN(lcfg, 5) > 0 && lustre_cfg_buf(lcfg, 5)) {
+ char *args = lustre_cfg_string(lcfg, 5);
+ int quotaon = 0, type;
+ int err;
+
+ err = quota_parse_config_args(args, &quotaon, &type,
+ &obt->obt_qctxt);
+ if (!err && quotaon)
+ mds_auto_quota_on(obd, type);
+ }
+ RETURN(rc);
+}
+
+static int mds_quota_cleanup(struct obd_device *obd)
+{
+ qctxt_cleanup(&obd->u.obt.obt_qctxt, 0);
+ RETURN(0);
+}
+
+static int mds_quota_fs_cleanup(struct obd_device *obd)
+{
+ struct mds_obd *mds = &obd->u.mds;
+ int i;
+ ENTRY;
+
+ /* close admin quota files */
+ down(&mds->mds_qonoff_sem);
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if (mds->mds_quota_info.qi_files[i]) {
+ filp_close(mds->mds_quota_info.qi_files[i], 0);
+ mds->mds_quota_info.qi_files[i] = NULL;
+ }
+ }
+ up(&mds->mds_qonoff_sem);
+ RETURN(0);
+}
+#endif /* __KERNEL__ */
+
+struct osc_quota_info {
+ struct list_head oqi_hash; /* hash list */
+ struct client_obd *oqi_cli; /* osc obd */
+ unsigned int oqi_id; /* uid/gid of a file */
+ short oqi_type; /* quota type */
+};
+
+spinlock_t qinfo_list_lock = SPIN_LOCK_UNLOCKED;
+
+static struct list_head qinfo_hash[NR_DQHASH];
+/* SLAB cache for client quota context */
+kmem_cache_t *qinfo_cachep = NULL;
+
+static inline int hashfn(struct client_obd *cli,
+ unsigned long id,
+ int type)
+{
+ unsigned long tmp = ((unsigned long)cli>>6) ^ id;
+ tmp = (tmp * (MAXQUOTAS - type)) % NR_DQHASH;
+ return tmp;
+}
+
+/* caller must hold qinfo_list_lock */
+static inline void insert_qinfo_hash(struct osc_quota_info *oqi)
+{
+ struct list_head *head = qinfo_hash +
+ hashfn(oqi->oqi_cli, oqi->oqi_id, oqi->oqi_type);
+
+ LASSERT_SPIN_LOCKED(&qinfo_list_lock);
+ list_add(&oqi->oqi_hash, head);
+}
+
+/* caller must hold qinfo_list_lock */
+static inline void remove_qinfo_hash(struct osc_quota_info *oqi)
+{
+ LASSERT_SPIN_LOCKED(&qinfo_list_lock);
+ list_del_init(&oqi->oqi_hash);
+}
+
+/* caller must hold qinfo_list_lock */
+static inline struct osc_quota_info *find_qinfo(struct client_obd *cli,
+ unsigned int id, int type)
+{
+ unsigned int hashent = hashfn(cli, id, type);
+ struct osc_quota_info *oqi;
+
+ LASSERT_SPIN_LOCKED(&qinfo_list_lock);
+ list_for_each_entry(oqi, &qinfo_hash[hashent], oqi_hash) {
+ if (oqi->oqi_cli == cli &&
+ oqi->oqi_id == id && oqi->oqi_type == type)
+ return oqi;
+ }
+ return NULL;
+}
+
+static struct osc_quota_info *alloc_qinfo(struct client_obd *cli,
+ unsigned int id, int type)
+{
+ struct osc_quota_info *oqi;
+ ENTRY;
+
+ OBD_SLAB_ALLOC(oqi, qinfo_cachep, SLAB_KERNEL, sizeof(*oqi));
+ if (!oqi)
+ RETURN(NULL);
+
+ INIT_LIST_HEAD(&oqi->oqi_hash);
+ oqi->oqi_cli = cli;
+ oqi->oqi_id = id;
+ oqi->oqi_type = type;
+
+ RETURN(oqi);
+}
+
+static void free_qinfo(struct osc_quota_info *oqi)
+{
+ OBD_SLAB_FREE(oqi, qinfo_cachep, sizeof(*oqi));
+}
+
+int osc_quota_chkdq(struct client_obd *cli,
+ unsigned int uid, unsigned int gid)
+{
+ unsigned int id;
+ int cnt, rc = QUOTA_OK;
+ ENTRY;
+
+ spin_lock(&qinfo_list_lock);
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ struct osc_quota_info *oqi = NULL;
+
+ id = (cnt == USRQUOTA) ? uid : gid;
+ oqi = find_qinfo(cli, id, cnt);
+ if (oqi) {
+ rc = NO_QUOTA;
+ break;
+ }
+ }
+ spin_unlock(&qinfo_list_lock);
+
+ RETURN(rc);
+}
+
+int osc_quota_setdq(struct client_obd *cli,
+ unsigned int uid, unsigned int gid,
+ obd_flag valid, obd_flag flags)
+{
+ unsigned int id;
+ obd_flag noquota;
+ int cnt, rc = 0;
+ ENTRY;
+
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ struct osc_quota_info *oqi, *old;
+
+ if (!(valid & ((cnt == USRQUOTA) ?
+ OBD_MD_FLUSRQUOTA : OBD_MD_FLGRPQUOTA)))
+ continue;
+
+ id = (cnt == USRQUOTA) ? uid : gid;
+ noquota = (cnt == USRQUOTA) ?
+ (flags & OBD_FL_NO_USRQUOTA) : (flags & OBD_FL_NO_GRPQUOTA);
+
+ oqi = alloc_qinfo(cli, id, cnt);
+ if (oqi) {
+ spin_lock(&qinfo_list_lock);
+
+ old = find_qinfo(cli, id, cnt);
+ if (old && !noquota)
+ remove_qinfo_hash(old);
+ else if (!old && noquota)
+ insert_qinfo_hash(oqi);
+
+ spin_unlock(&qinfo_list_lock);
+
+ if (old || !noquota)
+ free_qinfo(oqi);
+ if (old && !noquota)
+ free_qinfo(old);
+ } else {
+ CERROR("not enough mem!\n");
+ rc = -ENOMEM;
+ break;
+ }
+ }
+
+ RETURN(rc);
+}
+
+int osc_quota_cleanup(struct obd_device *obd)
+{
+ struct client_obd *cli = &obd->u.cli;
+ struct osc_quota_info *oqi, *n;
+ int i;
+ ENTRY;
+
+ spin_lock(&qinfo_list_lock);
+ for (i = 0; i < NR_DQHASH; i++) {
+ list_for_each_entry_safe(oqi, n, &qinfo_hash[i], oqi_hash) {
+ if (oqi->oqi_cli != cli)
+ continue;
+ remove_qinfo_hash(oqi);
+ free_qinfo(oqi);
+ }
+ }
+ spin_unlock(&qinfo_list_lock);
+
+ RETURN(0);
+}
+
+int osc_quota_init(void)
+{
+ int i;
+ ENTRY;
+
+ LASSERT(qinfo_cachep == NULL);
+ qinfo_cachep = kmem_cache_create("osc_quota_info",
+ sizeof(struct osc_quota_info),
+ 0, 0, NULL, NULL);
+ if (!qinfo_cachep)
+ RETURN(-ENOMEM);
+
+ for (i = 0; i < NR_DQHASH; i++)
+ INIT_LIST_HEAD(qinfo_hash + i);
+
+ RETURN(0);
+}
+
+int osc_quota_exit(void)
+{
+ struct osc_quota_info *oqi, *n;
+ int i, rc;
+ ENTRY;
+
+ spin_lock(&qinfo_list_lock);
+ for (i = 0; i < NR_DQHASH; i++) {
+ list_for_each_entry_safe(oqi, n, &qinfo_hash[i], oqi_hash) {
+ remove_qinfo_hash(oqi);
+ free_qinfo(oqi);
+ }
+ }
+ spin_unlock(&qinfo_list_lock);
+
+ rc = kmem_cache_destroy(qinfo_cachep);
+ LASSERT(rc == 0);
+ RETURN(0);
+}
+
+#ifdef __KERNEL__
+quota_interface_t mds_quota_interface = {
+ .quota_init = mds_quota_init,
+ .quota_exit = mds_quota_exit,
+ .quota_setup = mds_quota_setup,
+ .quota_cleanup = mds_quota_cleanup,
+ .quota_check = target_quota_check,
+ .quota_ctl = mds_quota_ctl,
+ .quota_fs_cleanup = mds_quota_fs_cleanup,
+ .quota_recovery = mds_quota_recovery,
+ .quota_adjust = mds_quota_adjust,
+};
+
+quota_interface_t filter_quota_interface = {
+ .quota_setup = filter_quota_setup,
+ .quota_cleanup = filter_quota_cleanup,
+ .quota_check = target_quota_check,
+ .quota_ctl = filter_quota_ctl,
+ .quota_setinfo = filter_quota_setinfo,
+ .quota_enforce = filter_quota_enforce,
+ .quota_getflag = filter_quota_getflag,
+ .quota_acquire = filter_quota_acquire,
+ .quota_adjust = filter_quota_adjust,
+};
+#endif /* __KERNEL__ */
+
+quota_interface_t mdc_quota_interface = {
+ .quota_ctl = client_quota_ctl,
+ .quota_check = client_quota_check,
+ .quota_poll_check = client_quota_poll_check,
+};
+
+quota_interface_t osc_quota_interface = {
+ .quota_ctl = client_quota_ctl,
+ .quota_check = client_quota_check,
+ .quota_poll_check = client_quota_poll_check,
+ .quota_init = osc_quota_init,
+ .quota_exit = osc_quota_exit,
+ .quota_chkdq = osc_quota_chkdq,
+ .quota_setdq = osc_quota_setdq,
+ .quota_cleanup = osc_quota_cleanup,
+};
+
+quota_interface_t lov_quota_interface = {
+ .quota_check = lov_quota_check,
+ .quota_ctl = lov_quota_ctl,
+};
+
+#ifdef __KERNEL__
+static int __init init_lustre_quota(void)
+{
+ int rc = qunit_cache_init();
+ if (rc)
+ return rc;
+ PORTAL_SYMBOL_REGISTER(filter_quota_interface);
+ PORTAL_SYMBOL_REGISTER(mds_quota_interface);
+ PORTAL_SYMBOL_REGISTER(mdc_quota_interface);
+ PORTAL_SYMBOL_REGISTER(osc_quota_interface);
+ PORTAL_SYMBOL_REGISTER(lov_quota_interface);
+ return 0;
+}
+
+static void /*__exit*/ exit_lustre_quota(void)
+{
+ PORTAL_SYMBOL_UNREGISTER(filter_quota_interface);
+ PORTAL_SYMBOL_UNREGISTER(mds_quota_interface);
+ PORTAL_SYMBOL_UNREGISTER(mdc_quota_interface);
+ PORTAL_SYMBOL_UNREGISTER(osc_quota_interface);
+ PORTAL_SYMBOL_UNREGISTER(lov_quota_interface);
+
+ qunit_cache_cleanup();
+}
+
+MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_DESCRIPTION("Lustre Quota");
+MODULE_LICENSE("GPL");
+
+cfs_module(lquota, "1.0.0", init_lustre_quota, exit_lustre_quota);
+
+EXPORT_SYMBOL(mds_quota_interface);
+EXPORT_SYMBOL(filter_quota_interface);
+EXPORT_SYMBOL(mdc_quota_interface);
+EXPORT_SYMBOL(osc_quota_interface);
+EXPORT_SYMBOL(lov_quota_interface);
+#endif /* __KERNEL__ */
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lustre/quota/quota_internal.h
+ *
+ * Copyright (c) 2001-2005 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * No redistribution or use is permitted outside of Cluster File Systems, Inc.
+ *
+ */
+
+#ifndef __QUOTA_INTERNAL_H
+#define __QUOTA_INTERNAL_H
+
+#include <linux/lustre_quota.h>
+
+/* QUSG converts bytes to blocks when counting block quota */
+#define QUSG(count, isblk) (isblk ? toqb(count) : count)
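+
+/* e.g. with the kernel's 1KB quota blocks, QUSG(1048576, 1) = toqb(1MB) =
+ * 1024 blocks, while QUSG(n, 0) returns the inode count n unchanged */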
+
+/* This flag is set in qc_stat to distinguish if the current getquota
+ * operation is for quota recovery */
+#define QUOTA_RECOVERING 0x01
+
+#ifdef __KERNEL__
+
+#define DQUOT_DEBUG(dquot, fmt, arg...) \
+ CDEBUG(D_QUOTA, "refcnt(%u) id(%u) type(%u) off(%llu) flags(%lu) " \
+ "bhardlimit(%u) curspace("LPX64") ihardlimit(%u) " \
+ "curinodes(%u): " fmt, dquot->dq_refcnt, \
+ dquot->dq_id, dquot->dq_type, dquot->dq_off, dquot->dq_flags, \
+ dquot->dq_dqb.dqb_bhardlimit, dquot->dq_dqb.dqb_curspace, \
+ dquot->dq_dqb.dqb_ihardlimit, dquot->dq_dqb.dqb_curinodes, \
+ ## arg);
+
+#define QINFO_DEBUG(qinfo, fmt, arg...) \
+ CDEBUG(D_QUOTA, "files (%p/%p) flags(%lu/%lu) blocks(%u/%u) " \
+ "free_blk(/%u/%u) free_entry(%u/%u): " fmt, \
+ qinfo->qi_files[0], qinfo->qi_files[1], \
+ qinfo->qi_info[0].dqi_flags, qinfo->qi_info[1].dqi_flags, \
+ qinfo->qi_info[0].dqi_blocks, qinfo->qi_info[1].dqi_blocks, \
+ qinfo->qi_info[0].dqi_free_blk, qinfo->qi_info[1].dqi_free_blk,\
+ qinfo->qi_info[0].dqi_free_entry, \
+ qinfo->qi_info[1].dqi_free_entry, ## arg);
+
+/* quota_context.c */
+void qunit_cache_cleanup(void);
+int qunit_cache_init(void);
+int qctxt_adjust_qunit(struct obd_device *obd, struct lustre_quota_ctxt *qctxt,
+ uid_t uid, gid_t gid, __u32 isblk, int wait);
+int qctxt_wait_pending_dqacq(struct lustre_quota_ctxt *qctxt, unsigned int id,
+ unsigned short type, int isblk);
+int qctxt_init(struct lustre_quota_ctxt *qctxt, struct super_block *sb,
+ dqacq_handler_t handler);
+void qctxt_cleanup(struct lustre_quota_ctxt *qctxt, int force);
+void qslave_start_recovery(struct obd_device *obd,
+ struct lustre_quota_ctxt *qctxt);
+/* quota_master.c */
+int lustre_dquot_init(void);
+void lustre_dquot_exit(void);
+int dqacq_handler(struct obd_device *obd, struct qunit_data *qdata, int opc);
+int mds_quota_adjust(struct obd_device *obd, unsigned int qcids[],
+ unsigned int qpids[], int rc, int opc);
+int filter_quota_adjust(struct obd_device *obd, unsigned int qcids[],
+ unsigned int qpids[], int rc, int opc);
+int init_admin_quotafiles(struct obd_device *obd, struct obd_quotactl *oqctl);
+int mds_admin_quota_on(struct obd_device *obd, struct obd_quotactl *oqctl);
+int mds_quota_on(struct obd_device *obd, struct obd_quotactl *oqctl);
+int mds_quota_off(struct obd_device *obd, struct obd_quotactl *oqctl);
+int mds_set_dqinfo(struct obd_device *obd, struct obd_quotactl *oqctl);
+int mds_get_dqinfo(struct obd_device *obd, struct obd_quotactl *oqctl);
+int mds_set_dqblk(struct obd_device *obd, struct obd_quotactl *oqctl);
+int mds_get_dqblk(struct obd_device *obd, struct obd_quotactl *oqctl);
+int mds_quota_recovery(struct obd_device *obd);
+int mds_get_obd_quota(struct obd_device *obd, struct obd_quotactl *oqctl);
+#endif
+
+/* quota_ctl.c */
+int mds_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl);
+int filter_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl);
+int client_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl);
+int lov_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl);
+
+/* quota_chk.c */
+int target_quota_check(struct obd_export *exp, struct obd_quotactl *oqctl);
+int client_quota_check(struct obd_export *exp, struct obd_quotactl *oqctl);
+int lov_quota_check(struct obd_export *exp, struct obd_quotactl *oqctl);
+int client_quota_poll_check(struct obd_export *exp, struct if_quotacheck *qchk);
+
+#endif /* __QUOTA_INTERNAL_H */
/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
* vim:expandtab:shiftwidth=8:tabstop=8:
*
- * lustre/mds/quota_master.c
+ * lustre/quota/quota_master.c
* Lustre Quota Master request handler
*
- * Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ * Copyright (c) 2001-2005 Cluster File Systems, Inc.
* Author: Niu YaWei <niu@clusterfs.com>
*
* This file is part of Lustre, http://www.lustre.org.
#include <linux/lustre_fsfilt.h>
#include <linux/lustre_mds.h>
-#include "mds_internal.h"
+#include "quota_internal.h"
+/* lock ordering:
+ * mds->mds_qonoff_sem > dquot->dq_sem */
static struct list_head lustre_dquot_hash[NR_DQHASH];
static spinlock_t dquot_hash_lock = SPIN_LOCK_UNLOCKED;
LASSERT(list_empty(lustre_dquot_hash + i));
}
if (lustre_dquot_cachep) {
- LASSERTF(kmem_cache_destroy(lustre_dquot_cachep) == 0,
- "Cannot destroy lustre_dquot_cache\n");
+ int rc;
+ rc = kmem_cache_destroy(lustre_dquot_cachep);
+ LASSERT(rc == 0);
lustre_dquot_cachep = NULL;
}
EXIT;
return tmp;
}
+/* caller must hold dquot_hash_lock */
static struct lustre_dquot *find_dquot(int hashent,
struct lustre_quota_info *lqi, qid_t id,
int type)
{
- struct list_head *head;
struct lustre_dquot *dquot;
ENTRY;
- for (head = lustre_dquot_hash[hashent].next;
- head != lustre_dquot_hash + hashent; head = head->next) {
- dquot = list_entry(head, struct lustre_dquot, dq_hash);
+ LASSERT_SPIN_LOCKED(&dquot_hash_lock);
+ list_for_each_entry(dquot, &lustre_dquot_hash[hashent], dq_hash) {
if (dquot->dq_info == lqi &&
dquot->dq_id == id && dquot->dq_type == type)
RETURN(dquot);
RETURN(NULL);
INIT_LIST_HEAD(&dquot->dq_hash);
- INIT_LIST_HEAD(&dquot->dq_unused);
- sema_init(&dquot->dq_sem, 1);
- atomic_set(&dquot->dq_refcnt, 1);
+ init_mutex_locked(&dquot->dq_sem);
+ dquot->dq_refcnt = 1;
dquot->dq_info = lqi;
dquot->dq_id = id;
dquot->dq_type = type;
+ dquot->dq_status = DQ_STATUS_AVAIL;
RETURN(dquot);
}
{
struct list_head *head = lustre_dquot_hash +
dquot_hashfn(dquot->dq_info, dquot->dq_id, dquot->dq_type);
+ LASSERT(list_empty(&dquot->dq_hash));
list_add(&dquot->dq_hash, head);
}
{
ENTRY;
spin_lock(&dquot_hash_lock);
- LASSERT(atomic_read(&dquot->dq_refcnt));
- if (atomic_dec_and_test(&dquot->dq_refcnt)) {
+ LASSERT(dquot->dq_refcnt);
+ dquot->dq_refcnt--;
+ if (!dquot->dq_refcnt) {
remove_dquot_nolock(dquot);
free_dquot(dquot);
}
EXIT;
}
-#define DQUOT_DEBUG(dquot, fmt, arg...) \
- CDEBUG(D_QUOTA, "refcnt(%u) id(%u) type(%u) off(%llu) flags(%lu) " \
- "bhardlimit(%u) curspace("LPX64") ihardlimit(%u) " \
- "curinodes(%u): " fmt, atomic_read(&dquot->dq_refcnt), \
- dquot->dq_id, dquot->dq_type, dquot->dq_off, dquot->dq_flags, \
- dquot->dq_dqb.dqb_bhardlimit, dquot->dq_dqb.dqb_curspace, \
- dquot->dq_dqb.dqb_ihardlimit, dquot->dq_dqb.dqb_curinodes, \
- ## arg); \
-
-#define QINFO_DEBUG(qinfo, fmt, arg...) \
- CDEBUG(D_QUOTA, "files (%p/%p) flags(%lu/%lu) blocks(%u/%u) " \
- "free_blk(/%u/%u) free_entry(%u/%u): " fmt, \
- qinfo->qi_files[0], qinfo->qi_files[1], \
- qinfo->qi_info[0].dqi_flags, qinfo->qi_info[1].dqi_flags, \
- qinfo->qi_info[0].dqi_blocks, qinfo->qi_info[1].dqi_blocks, \
- qinfo->qi_info[0].dqi_free_blk, qinfo->qi_info[1].dqi_free_blk,\
- qinfo->qi_info[0].dqi_free_entry, \
- qinfo->qi_info[1].dqi_free_entry, ## arg);
-
static struct lustre_dquot *lustre_dqget(struct obd_device *obd,
struct lustre_quota_info *lqi,
qid_t id, int type)
{
unsigned int hashent = dquot_hashfn(lqi, id, type);
- struct lustre_dquot *dquot = NULL;
- int read = 0;
+ struct lustre_dquot *dquot, *empty;
ENTRY;
+ if ((empty = alloc_dquot(lqi, id, type)) == NULL)
+ RETURN(ERR_PTR(-ENOMEM));
+
spin_lock(&dquot_hash_lock);
if ((dquot = find_dquot(hashent, lqi, id, type)) != NULL) {
- atomic_inc(&dquot->dq_refcnt);
+ dquot->dq_refcnt++;
+ spin_unlock(&dquot_hash_lock);
+ free_dquot(empty);
} else {
- dquot = alloc_dquot(lqi, id, type);
- if (dquot) {
- insert_dquot_nolock(dquot);
- read = 1;
- }
- }
- spin_unlock(&dquot_hash_lock);
-
- if (dquot == NULL)
- RETURN(ERR_PTR(-ENOMEM));
+ int rc;
- if (read) {
- int rc = 0;
+ dquot = empty;
+ insert_dquot_nolock(dquot);
+ spin_unlock(&dquot_hash_lock);
- down(&dquot->dq_info->qi_sem);
- down(&dquot->dq_sem);
rc = fsfilt_dquot(obd, dquot, QFILE_RD_DQUOT);
up(&dquot->dq_sem);
- up(&dquot->dq_info->qi_sem);
if (rc) {
- CERROR("can't read dquot from admin qutoafile! "
+ CERROR("can't read dquot from admin quotafile! "
"(rc:%d)\n", rc);
lustre_dqput(dquot);
RETURN(ERR_PTR(rc));
}
+
}
+
+ LASSERT(dquot);
RETURN(dquot);
}
int rc = 0;
ENTRY;
+ OBD_FAIL_RETURN(OBD_FAIL_OBD_DQACQ, -EIO);
+
/* slaves never acquires qunit for user root */
LASSERT(qdata->qd_id || qdata->qd_type == GRPQUOTA);
DQUOT_DEBUG(dquot, "get dquot in dqacq_handler\n");
QINFO_DEBUG(dquot->dq_info, "get dquot in dqadq_handler\n");
- down(&dquot->dq_info->qi_sem);
+ down(&mds->mds_qonoff_sem);
down(&dquot->dq_sem);
+ if (dquot->dq_status & DQ_STATUS_RECOVERY) {
+ DQUOT_DEBUG(dquot, "this dquot is under recovering.\n");
+ GOTO(out, rc = -EBUSY);
+ }
+
if (qdata->qd_isblk) {
grace = info->qi_info[qdata->qd_type].dqi_bgrace;
usage = &dquot->dq_dqb.dqb_curspace;
hlimit = dquot->dq_dqb.dqb_ihardlimit;
slimit = dquot->dq_dqb.dqb_isoftlimit;
time = &dquot->dq_dqb.dqb_itime;
- }
+ }
/* if the quota limit in admin quotafile is zero, we just inform
* slave to clear quota limit with zero qd_count */
qdata->qd_count = 0;
GOTO(out, rc);
}
-
- if (opc == QUOTA_DQACQ) {
+
+ switch (opc) {
+ case QUOTA_DQACQ:
if (hlimit &&
QUSG(*usage + qdata->qd_count, qdata->qd_isblk) > hlimit)
GOTO(out, rc = -EDQUOT);
if (slimit &&
QUSG(*usage + qdata->qd_count, qdata->qd_isblk) > slimit) {
- if (*time && CURRENT_SECONDS >= *time)
+ if (*time && cfs_time_current_sec() >= *time)
GOTO(out, rc = -EDQUOT);
else if (!*time)
- *time = CURRENT_SECONDS + grace;
+ *time = cfs_time_current_sec() + grace;
}
*usage += qdata->qd_count;
-
- } else if (opc == QUOTA_DQREL) {
- LASSERT(*usage - qdata->qd_count >= 0);
- *usage -= qdata->qd_count;
+ break;
+ case QUOTA_DQREL:
+ /* The usage in the administrative file might be incorrect
+ * before recovery is done; guard against underflowing the
+ * unsigned usage counter */
+ if (*usage < qdata->qd_count)
+ *usage = 0;
+ else
+ *usage -= qdata->qd_count;
/* (usage <= soft limit) but not (usage < soft limit) */
if (!slimit || QUSG(*usage, qdata->qd_isblk) <= slimit)
*time = 0;
- } else {
+ break;
+ default:
LBUG();
}
rc = fsfilt_dquot(obd, dquot, QFILE_WR_DQUOT);
+ EXIT;
out:
up(&dquot->dq_sem);
- up(&dquot->dq_info->qi_sem);
+ up(&mds->mds_qonoff_sem);
lustre_dqput(dquot);
- RETURN(rc);
+ return rc;
}
-void mds_adjust_qunit(struct obd_device *obd, uid_t cuid, gid_t cgid,
- uid_t puid, gid_t pgid, int rc)
+int mds_quota_adjust(struct obd_device *obd, unsigned int qcids[],
+ unsigned int qpids[], int rc, int opc)
{
- struct mds_obd *mds = &obd->u.mds;
- struct lustre_quota_ctxt *qctxt = &mds->mds_quota_ctxt;
+ struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt;
+ int rc2 = 0;
ENTRY;
- if (rc && rc != -EDQUOT) {
- EXIT;
- return;
+ if (rc && rc != -EDQUOT)
+ RETURN(0);
+
+ switch (opc) {
+ case FSFILT_OP_RENAME:
+ /* acquire/release block quota on owner of original parent */
+ rc2 = qctxt_adjust_qunit(obd, qctxt, qpids[2], qpids[3], 1, 0);
+ /* fall-through */
+ case FSFILT_OP_SETATTR:
+ /* acquire/release file quota on original owner */
+ rc2 |= qctxt_adjust_qunit(obd, qctxt, qpids[0], qpids[1], 0, 0);
+ /* fall-through */
+ case FSFILT_OP_CREATE:
+ case FSFILT_OP_UNLINK:
+ /* acquire/release file/block quota on owner of child (or current owner) */
+ rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 0, 0);
+ rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 1, 0);
+ /* acquire/release block quota on owner of parent (or original owner) */
+ rc2 |= qctxt_adjust_qunit(obd, qctxt, qpids[0], qpids[1], 1, 0);
+ break;
+ default:
+ LBUG();
+ break;
}
- /* dqacq/dqrel file quota on owner of child */
- rc = qctxt_adjust_qunit(obd, qctxt, cuid, cgid, 0);
- if (rc)
- CERROR("error mds adjust child qunit! (rc:%d)\n", rc);
- /* dqacq/dqrel block quota on owner of parent directory */
- rc = qctxt_adjust_qunit(obd, qctxt, puid, pgid, 1);
- if (rc)
- CERROR("error mds adjust parent qunit! (rc:%d)\n", rc);
- EXIT;
+
+ if (rc2)
+ CERROR("mds adjust qunit failed! (opc:%d rc:%d)\n", opc, rc2);
+ RETURN(0);
}
+int filter_quota_adjust(struct obd_device *obd, unsigned int qcids[],
+ unsigned int qpids[], int rc, int opc)
+{
+ struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt;
+ int rc2 = 0;
+ ENTRY;
+
+ if (rc && rc != -EDQUOT)
+ RETURN(0);
+
+ switch (opc) {
+ case FSFILT_OP_SETATTR:
+ /* acquire/release block quota on original & current owner */
+ rc = qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 1, 0);
+ rc2 = qctxt_adjust_qunit(obd, qctxt, qpids[0], qpids[1], 1, 0);
+ break;
+ case FSFILT_OP_UNLINK:
+ /* release block quota on this owner */
+ case FSFILT_OP_CREATE: /* XXX for write operation on obdfilter */
+ /* acquire block quota on this owner */
+ rc = qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 1, 0);
+ break;
+ default:
+ LBUG();
+ break;
+ }
+
+ if (rc || rc2)
+ CERROR("filter adjust qunit failed! (opc:%d rc%d)\n",
+ opc, rc ?: rc2);
+ RETURN(0);
+}
+
+#define LUSTRE_ADMIN_QUOTAFILES {\
+ "admin_quotafile.usr", /* user admin quotafile */\
+ "admin_quotafile.grp" /* group admin quotafile */\
+}
+static const char prefix[] = "OBJECTS/";
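+
+/* the names built from these below are "OBJECTS/admin_quotafile.usr" and
+ * "OBJECTS/admin_quotafile.grp", i.e. the admin quota files live in the
+ * OBJECTS directory of the MDS */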
+
int init_admin_quotafiles(struct obd_device *obd, struct obd_quotactl *oqctl)
{
struct mds_obd *mds = &obd->u.mds;
LASSERT(iparent);
push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
- down(&qinfo->qi_sem);
+ down(&mds->mds_qonoff_sem);
for (i = 0; i < MAXQUOTAS; i++) {
- struct dentry *de = NULL;
- struct file *fp = NULL;
+ struct dentry *de;
+ struct file *fp;
if (!Q_TYPESET(oqctl, i))
continue;
/* lookup quota file */
rc = 0;
down(&iparent->i_sem);
-
de = lookup_one_len(quotafiles[i], dparent,
strlen(quotafiles[i]));
- if (IS_ERR(de) || de->d_inode == NULL)
+ up(&iparent->i_sem);
+ if (IS_ERR(de) || de->d_inode == NULL ||
+ !S_ISREG(de->d_inode->i_mode))
rc = IS_ERR(de) ? PTR_ERR(de) : -ENOENT;
if (!IS_ERR(de))
dput(de);
- up(&iparent->i_sem);
if (rc && rc != -ENOENT) {
CERROR("error lookup quotafile %s! (rc:%d)\n",
continue;
}
- sprintf(name, "OBJECTS/%s", quotafiles[i]);
+ LASSERT(strlen(quotafiles[i]) + sizeof(prefix) <= sizeof(name));
+ sprintf(name, "%s%s", prefix, quotafiles[i]);
LASSERT(rc == -ENOENT);
/* create quota file */
fp = filp_open(name, O_CREAT | O_EXCL, 0644);
- if (IS_ERR(fp)) {
+ if (IS_ERR(fp) || !S_ISREG(fp->f_dentry->d_inode->i_mode)) {
rc = PTR_ERR(fp);
CERROR("error creating admin quotafile %s (rc:%d)\n",
name, rc);
break;
}
}
- up(&qinfo->qi_sem);
+ up(&mds->mds_qonoff_sem);
pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
RETURN(rc);
}
-int mds_quota_on(struct obd_device *obd, struct obd_quotactl *oqctl)
+static int close_quota_files(struct obd_quotactl *oqctl,
+ struct lustre_quota_info *qinfo)
+{
+ int i, rc = 0;
+ ENTRY;
+
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if (!Q_TYPESET(oqctl, i))
+ continue;
+ if (qinfo->qi_files[i] == NULL) {
+ rc = -ESRCH;
+ continue;
+ }
+ filp_close(qinfo->qi_files[i], 0);
+ qinfo->qi_files[i] = NULL;
+ }
+ RETURN(rc);
+}
+
+int mds_admin_quota_on(struct obd_device *obd, struct obd_quotactl *oqctl)
{
struct mds_obd *mds = &obd->u.mds;
struct lustre_quota_info *qinfo = &mds->mds_quota_info;
const char *quotafiles[] = LUSTRE_ADMIN_QUOTAFILES;
- struct lvfs_run_ctxt saved;
char name[64];
int i, rc = 0;
struct inode *iparent = mds->mds_objects_dir->d_inode;
ENTRY;
LASSERT(iparent);
- push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
- down(&qinfo->qi_sem);
/* open admin quota files and read quotafile info */
for (i = 0; i < MAXQUOTAS; i++) {
- struct file *fp = NULL;
+ struct file *fp;
if (!Q_TYPESET(oqctl, i))
continue;
- sprintf(name, "OBJECTS/%s", quotafiles[i]);
+ LASSERT(strlen(quotafiles[i]) + sizeof(prefix) <= sizeof(name));
+ sprintf(name, "%s%s", prefix, quotafiles[i]);
if (qinfo->qi_files[i] != NULL) {
rc = -EBUSY;
}
fp = filp_open(name, O_RDWR | O_EXCL, 0644);
- if (IS_ERR(fp)) {
+ if (IS_ERR(fp) || !S_ISREG(fp->f_dentry->d_inode->i_mode)) {
rc = PTR_ERR(fp);
CERROR("error open %s! (rc:%d)\n", name, rc);
break;
break;
}
}
- up(&qinfo->qi_sem);
- pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+ if (rc && rc != -EBUSY)
+ close_quota_files(oqctl, qinfo);
- if (rc && rc != -EBUSY) {
- down(&qinfo->qi_sem);
- for (i = 0; i < MAXQUOTAS; i++) {
- if (!Q_TYPESET(oqctl, i))
- continue;
- if (qinfo->qi_files[i])
- filp_close(qinfo->qi_files[i], 0);
- qinfo->qi_files[i] = NULL;
- }
- up(&qinfo->qi_sem);
- }
RETURN(rc);
}
-int mds_quota_off(struct obd_device *obd, struct obd_quotactl *oqctl)
+static int mds_admin_quota_off(struct obd_device *obd,
+ struct obd_quotactl *oqctl)
{
struct mds_obd *mds = &obd->u.mds;
struct lustre_quota_info *qinfo = &mds->mds_quota_info;
- int i, rc = 0;
+ int rc;
ENTRY;
- down(&qinfo->qi_sem);
/* close admin quota files */
- for (i = 0; i < MAXQUOTAS; i++) {
- if (!Q_TYPESET(oqctl, i))
- continue;
+ rc = close_quota_files(oqctl, qinfo);
+ RETURN(rc);
+}
- if (qinfo->qi_files[i] == NULL) {
- rc = -ESRCH;
- continue;
- }
- filp_close(qinfo->qi_files[i], 0);
- qinfo->qi_files[i] = NULL;
- }
- up(&qinfo->qi_sem);
+int mds_quota_on(struct obd_device *obd, struct obd_quotactl *oqctl)
+{
+ struct mds_obd *mds = &obd->u.mds;
+ struct lvfs_run_ctxt saved;
+ int rc;
+ ENTRY;
+
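+ /* enable quota in order: admin files on the MDS, then the OSTs via
+ * the osc export, and finally the local MDS filesystem */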
+ down(&mds->mds_qonoff_sem);
+ push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+ rc = mds_admin_quota_on(obd, oqctl);
+ if (rc)
+ goto out;
+ rc = obd_quotactl(mds->mds_osc_exp, oqctl);
+ if (rc)
+ goto out;
+
+ rc = fsfilt_quotactl(obd, obd->u.obt.obt_sb, oqctl);
+out:
+ pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+ up(&mds->mds_qonoff_sem);
RETURN(rc);
}
+int mds_quota_off(struct obd_device *obd, struct obd_quotactl *oqctl)
+{
+ struct mds_obd *mds = &obd->u.mds;
+ struct lvfs_run_ctxt saved;
+ int rc, rc2;
+ ENTRY;
+
+ down(&mds->mds_qonoff_sem);
+ /* close admin quota files */
+ push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+ mds_admin_quota_off(obd, oqctl);
+
+ rc = obd_quotactl(mds->mds_osc_exp, oqctl);
+ rc2 = fsfilt_quotactl(obd, obd->u.obt.obt_sb, oqctl);
+
+ pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+ up(&mds->mds_qonoff_sem);
+ RETURN(rc ?: rc2);
+}
+
int mds_set_dqinfo(struct obd_device *obd, struct obd_quotactl *oqctl)
{
struct mds_obd *mds = &obd->u.mds;
struct lustre_quota_info *qinfo = &mds->mds_quota_info;
struct obd_dqinfo *dqinfo = &oqctl->qc_dqinfo;
- int rc = 0;
+ int rc;
ENTRY;
- if (qinfo->qi_files[oqctl->qc_type] == NULL)
- RETURN(-ESRCH);
+ down(&mds->mds_qonoff_sem);
+ if (qinfo->qi_files[oqctl->qc_type] == NULL) {
+ rc = -ESRCH;
+ goto out;
+ }
- down(&qinfo->qi_sem);
qinfo->qi_info[oqctl->qc_type].dqi_bgrace = dqinfo->dqi_bgrace;
qinfo->qi_info[oqctl->qc_type].dqi_igrace = dqinfo->dqi_igrace;
qinfo->qi_info[oqctl->qc_type].dqi_flags = dqinfo->dqi_flags;
rc = fsfilt_quotainfo(obd, qinfo, oqctl->qc_type, QFILE_WR_INFO);
- up(&qinfo->qi_sem);
+out:
+ up(&mds->mds_qonoff_sem);
RETURN(rc);
}
struct mds_obd *mds = &obd->u.mds;
struct lustre_quota_info *qinfo = &mds->mds_quota_info;
struct obd_dqinfo *dqinfo = &oqctl->qc_dqinfo;
+ int rc = 0;
ENTRY;
- if (qinfo->qi_files[oqctl->qc_type] == NULL)
- RETURN(-ESRCH);
+ down(&mds->mds_qonoff_sem);
+ if (qinfo->qi_files[oqctl->qc_type] == NULL) {
+ rc = -ESRCH;
+ goto out;
+ }
- down(&qinfo->qi_sem);
dqinfo->dqi_bgrace = qinfo->qi_info[oqctl->qc_type].dqi_bgrace;
dqinfo->dqi_igrace = qinfo->qi_info[oqctl->qc_type].dqi_igrace;
dqinfo->dqi_flags = qinfo->qi_info[oqctl->qc_type].dqi_flags;
- up(&qinfo->qi_sem);
- RETURN(0);
+out:
+ up(&mds->mds_qonoff_sem);
+ RETURN(rc);
}
static int mds_init_slave_ilimits(struct obd_device *obd,
struct obd_quotactl *oqctl)
{
/* XXX: for file limits only adjust local now */
- struct mds_obd *mds = &obd->u.mds;
unsigned int uid = 0, gid = 0;
struct obd_quotactl *ioqc;
int rc;
if (!oqctl->qc_dqblk.dqb_ihardlimit && !oqctl->qc_dqblk.dqb_isoftlimit)
RETURN(0);
- OBD_ALLOC(ioqc, sizeof(*ioqc));
+ OBD_ALLOC_PTR(ioqc);
if (!ioqc)
RETURN(-ENOMEM);
- ioqc->qc_cmd = Q_SETQUOTA;
+ ioqc->qc_cmd = Q_INITQUOTA;
ioqc->qc_id = oqctl->qc_id;
ioqc->qc_type = oqctl->qc_type;
ioqc->qc_dqblk.dqb_valid = QIF_ILIMITS;
ioqc->qc_dqblk.dqb_ihardlimit = MIN_QLIMIT;
/* set local limit to MIN_QLIMIT */
- rc = fsfilt_quotactl(obd, mds->mds_sb, ioqc);
+ rc = fsfilt_quotactl(obd, obd->u.obt.obt_sb, ioqc);
if (rc)
GOTO(out, rc);
else
gid = oqctl->qc_id;
- rc = qctxt_adjust_qunit(obd, &mds->mds_quota_ctxt, uid, gid, 0);
+ rc = qctxt_adjust_qunit(obd, &obd->u.obt.obt_qctxt, uid, gid, 0, 0);
if (rc) {
CERROR("error mds adjust local file quota! (rc:%d)\n", rc);
GOTO(out, rc);
}
/* FIXME initialize all slaves in CMD */
+ EXIT;
out:
- OBD_FREE(ioqc, sizeof(*ioqc));
- RETURN(rc);
+ OBD_FREE_PTR(ioqc);
+ return rc;
}
static int mds_init_slave_blimits(struct obd_device *obd,
if (!oqctl->qc_dqblk.dqb_bhardlimit && !oqctl->qc_dqblk.dqb_bsoftlimit)
RETURN(0);
- OBD_ALLOC(ioqc, sizeof(*ioqc));
+ OBD_ALLOC_PTR(ioqc);
if (!ioqc)
RETURN(-ENOMEM);
- ioqc->qc_cmd = Q_SETQUOTA;
+ ioqc->qc_cmd = Q_INITQUOTA;
ioqc->qc_id = oqctl->qc_id;
ioqc->qc_type = oqctl->qc_type;
ioqc->qc_dqblk.dqb_valid = QIF_BLIMITS;
ioqc->qc_dqblk.dqb_bhardlimit = MIN_QLIMIT;
/* set local limit to MIN_QLIMIT */
- rc = fsfilt_quotactl(obd, mds->mds_sb, ioqc);
+ rc = fsfilt_quotactl(obd, obd->u.obt.obt_sb, ioqc);
if (rc)
GOTO(out, rc);
else
gid = oqctl->qc_id;
- rc = qctxt_adjust_qunit(obd, &mds->mds_quota_ctxt, uid, gid, 1);
+ rc = qctxt_adjust_qunit(obd, &obd->u.obt.obt_qctxt, uid, gid, 1, 0);
if (rc) {
CERROR("error mds adjust local block quota! (rc:%d)\n", rc);
GOTO(out, rc);
}
/* initialize all slave's limit */
- ioqc->qc_cmd = Q_INITQUOTA;
rc = obd_quotactl(mds->mds_osc_exp, ioqc);
+ EXIT;
out:
- OBD_FREE(ioqc, sizeof(*ioqc));
- RETURN(rc);
+ OBD_FREE_PTR(ioqc);
+ return rc;
}
int mds_set_dqblk(struct obd_device *obd, struct obd_quotactl *oqctl)
time_t btime, itime;
struct lustre_dquot *dquot;
struct obd_dqblk *dqblk = &oqctl->qc_dqblk;
- int rc = 0;
+ int rc;
ENTRY;
+ down(&mds->mds_qonoff_sem);
if (qinfo->qi_files[oqctl->qc_type] == NULL)
- RETURN(-ESRCH);
+ GOTO(out_sem, rc = -ESRCH);
dquot = lustre_dqget(obd, qinfo, oqctl->qc_id, oqctl->qc_type);
if (IS_ERR(dquot))
- RETURN(PTR_ERR(dquot));
+ GOTO(out_sem, rc = PTR_ERR(dquot));
DQUOT_DEBUG(dquot, "get dquot in mds_set_blk\n");
QINFO_DEBUG(dquot->dq_info, "get dquot in mds_set_blk\n");
- down(&dquot->dq_info->qi_sem);
down(&dquot->dq_sem);
+ if (dquot->dq_status) {
+ up(&dquot->dq_sem);
+ lustre_dqput(dquot);
+ GOTO(out_sem, rc = -EBUSY);
+ }
+ dquot->dq_status |= DQ_STATUS_SET;
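+ /* DQ_STATUS_SET keeps concurrent setquota and recovery off this id;
+ * it is cleared again at "out" below */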
+
ihardlimit = dquot->dq_dqb.dqb_ihardlimit;
isoftlimit = dquot->dq_dqb.dqb_isoftlimit;
bhardlimit = dquot->dq_dqb.dqb_bhardlimit;
btime = dquot->dq_dqb.dqb_btime;
itime = dquot->dq_dqb.dqb_itime;
+ if (dqblk->dqb_valid & QIF_BTIME)
+ dquot->dq_dqb.dqb_btime = dqblk->dqb_btime;
+ if (dqblk->dqb_valid & QIF_ITIME)
+ dquot->dq_dqb.dqb_itime = dqblk->dqb_itime;
+
if (dqblk->dqb_valid & QIF_BLIMITS) {
dquot->dq_dqb.dqb_bhardlimit = dqblk->dqb_bhardlimit;
dquot->dq_dqb.dqb_bsoftlimit = dqblk->dqb_bsoftlimit;
if (!dquot->dq_dqb.dqb_bhardlimit &&
!dquot->dq_dqb.dqb_bsoftlimit)
dquot->dq_dqb.dqb_curspace = 0;
+
+ /* clear grace time */
+ if (!dqblk->dqb_bsoftlimit ||
+ toqb(dquot->dq_dqb.dqb_curspace) <= dqblk->dqb_bsoftlimit)
+ dquot->dq_dqb.dqb_btime = 0;
+ /* set grace only if user hasn't provided his own */
+ else if (!(dqblk->dqb_valid & QIF_BTIME))
+ dquot->dq_dqb.dqb_btime = cfs_time_current_sec() +
+ qinfo->qi_info[dquot->dq_type].dqi_bgrace;
}
if (dqblk->dqb_valid & QIF_ILIMITS) {
if (!dquot->dq_dqb.dqb_ihardlimit &&
!dquot->dq_dqb.dqb_isoftlimit)
dquot->dq_dqb.dqb_curinodes = 0;
- }
- if (dqblk->dqb_valid & QIF_BTIME)
- dquot->dq_dqb.dqb_btime = dqblk->dqb_btime;
-
- if (dqblk->dqb_valid & QIF_ITIME)
- dquot->dq_dqb.dqb_itime = dqblk->dqb_itime;
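+ /* mirror the block grace-time handling above for inodes */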
+ if (!dqblk->dqb_isoftlimit ||
+ dquot->dq_dqb.dqb_curinodes <= dqblk->dqb_isoftlimit)
+ dquot->dq_dqb.dqb_itime = 0;
+ else if (!(dqblk->dqb_valid & QIF_ITIME))
+ dquot->dq_dqb.dqb_itime = cfs_time_current_sec() +
+ qinfo->qi_info[dquot->dq_type].dqi_igrace;
+ }
rc = fsfilt_dquot(obd, dquot, QFILE_WR_DQUOT);
up(&dquot->dq_sem);
- up(&dquot->dq_info->qi_sem);
- if (rc)
- GOTO(out, rc);
+ if (rc) {
+ CERROR("set limit failed! (rc:%d)\n", rc);
+ goto out;
+ }
+ up(&mds->mds_qonoff_sem);
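+ /* drop the semaphore across the slave calls below; it is re-taken
+ * before the quota file is written again in the revoke path */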
if (dqblk->dqb_valid & QIF_ILIMITS && !(ihardlimit || isoftlimit)) {
rc = mds_init_slave_ilimits(obd, oqctl);
if (rc) {
CERROR("init slave ilimits failed! (rc:%d)\n", rc);
- GOTO(revoke_out, rc);
+ goto revoke_out;
}
}
rc = mds_init_slave_blimits(obd, oqctl);
if (rc) {
CERROR("init slave blimits failed! (rc:%d)\n", rc);
- GOTO(revoke_out, rc);
+ goto revoke_out;
}
}
+ down(&mds->mds_qonoff_sem);
revoke_out:
if (rc) {
/* cancel previous setting */
- down(&dquot->dq_info->qi_sem);
down(&dquot->dq_sem);
dquot->dq_dqb.dqb_ihardlimit = ihardlimit;
dquot->dq_dqb.dqb_isoftlimit = isoftlimit;
dquot->dq_dqb.dqb_itime = itime;
fsfilt_dquot(obd, dquot, QFILE_WR_DQUOT);
up(&dquot->dq_sem);
- up(&dquot->dq_info->qi_sem);
}
out:
+ down(&dquot->dq_sem);
+ dquot->dq_status &= ~DQ_STATUS_SET;
+ up(&dquot->dq_sem);
lustre_dqput(dquot);
- RETURN(rc);
+ EXIT;
+out_sem:
+ up(&mds->mds_qonoff_sem);
+ return rc;
}
static int mds_get_space(struct obd_device *obd, struct obd_quotactl *oqctl)
{
struct obd_quotactl *soqc;
+ struct lvfs_run_ctxt saved;
int rc;
+ ENTRY;
- OBD_ALLOC(soqc, sizeof(*soqc));
+ OBD_ALLOC_PTR(soqc);
if (!soqc)
RETURN(-ENOMEM);
- soqc->qc_cmd = oqctl->qc_cmd;
+ soqc->qc_cmd = Q_GETOQUOTA;
soqc->qc_id = oqctl->qc_id;
soqc->qc_type = oqctl->qc_type;
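+ /* total usage is what the OSTs report plus the master's own
+ * local accounting, gathered below */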
rc = obd_quotactl(obd->u.mds.mds_osc_exp, soqc);
+ if (rc)
+ GOTO(out, rc);
oqctl->qc_dqblk.dqb_curspace = soqc->qc_dqblk.dqb_curspace;
- OBD_FREE(soqc, sizeof(*soqc));
+ push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+ soqc->qc_dqblk.dqb_curspace = 0;
+ rc = fsfilt_quotactl(obd, obd->u.obt.obt_sb, soqc);
+ pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+
+ if (rc)
+ GOTO(out, rc);
+
+ oqctl->qc_dqblk.dqb_curinodes += soqc->qc_dqblk.dqb_curinodes;
+ oqctl->qc_dqblk.dqb_curspace += soqc->qc_dqblk.dqb_curspace;
+ EXIT;
+out:
+ OBD_FREE_PTR(soqc);
return rc;
}
int rc;
ENTRY;
+ down(&mds->mds_qonoff_sem);
if (qinfo->qi_files[oqctl->qc_type] == NULL)
- RETURN(-ESRCH);
+ GOTO(out, rc = -ESRCH);
dquot = lustre_dqget(obd, qinfo, oqctl->qc_id, oqctl->qc_type);
if (IS_ERR(dquot))
- RETURN(PTR_ERR(dquot));
+ GOTO(out, rc = PTR_ERR(dquot));
down(&dquot->dq_sem);
dqblk->dqb_ihardlimit = dquot->dq_dqb.dqb_ihardlimit;
dqblk->dqb_itime = dquot->dq_dqb.dqb_itime;
up(&dquot->dq_sem);
+ lustre_dqput(dquot);
+
/* the usages in admin quota file is inaccurate */
dqblk->dqb_curinodes = 0;
dqblk->dqb_curspace = 0;
rc = mds_get_space(obd, oqctl);
+ EXIT;
+out:
+ up(&mds->mds_qonoff_sem);
+ return rc;
+}
+
+int mds_get_obd_quota(struct obd_device *obd, struct obd_quotactl *oqctl)
+{
+ struct lvfs_run_ctxt saved;
+ int rc;
+ ENTRY;
+
+ push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+ rc = fsfilt_quotactl(obd, obd->u.obt.obt_sb, oqctl);
+ pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+
+ RETURN(rc);
+}
+
+
+/* FIXME: only the block limit is recovered for now; inode limits will
+ * also need to be recovered once CMD is involved */
+static int
+dquot_recovery(struct obd_device *obd, unsigned int id, unsigned short type)
+{
+ struct mds_obd *mds = &obd->u.mds;
+ struct lustre_quota_info *qinfo = &obd->u.mds.mds_quota_info;
+ struct lustre_dquot *dquot;
+ struct obd_quotactl *qctl;
+ __u64 total_limits = 0;
+ int rc;
+ ENTRY;
+
+ OBD_ALLOC_PTR(qctl);
+ if (qctl == NULL)
+ RETURN(-ENOMEM);
+
+ dquot = lustre_dqget(obd, qinfo, id, type);
+ if (IS_ERR(dquot)) {
+ CERROR("Get dquot failed. (rc:%ld)\n", PTR_ERR(dquot));
+ OBD_FREE_PTR(qctl);
+ RETURN(PTR_ERR(dquot));
+ }
+
+ down(&dquot->dq_sem);
+
+ /* don't recover a dquot that has no limits or is being set */
+ if (!(dquot->dq_dqb.dqb_bhardlimit || dquot->dq_dqb.dqb_bsoftlimit) ||
+ dquot->dq_status)
+ GOTO(skip, rc = 0);
+ dquot->dq_status |= DQ_STATUS_RECOVERY;
+
+ up(&dquot->dq_sem);
+
+ /* get real bhardlimit from all slaves. */
+ qctl->qc_cmd = Q_GETOQUOTA;
+ qctl->qc_type = type;
+ qctl->qc_id = id;
+ qctl->qc_stat = QUOTA_RECOVERING;
+ rc = obd_quotactl(obd->u.mds.mds_osc_exp, qctl);
+ if (rc)
+ GOTO(out, rc);
+ total_limits = qctl->qc_dqblk.dqb_bhardlimit;
+
+ /* get real bhardlimit from master */
+ rc = fsfilt_quotactl(obd, obd->u.obt.obt_sb, qctl);
+ if (rc)
+ GOTO(out, rc);
+ total_limits += qctl->qc_dqblk.dqb_bhardlimit;
+
+ /* amend the usage of the administrative quotafile */
+ down(&mds->mds_qonoff_sem);
+ down(&dquot->dq_sem);
+
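+ /* the collected limits are in quota blocks; shift by QUOTABLOCK_BITS
+ * to get the byte count stored in dqb_curspace */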
+ dquot->dq_dqb.dqb_curspace = total_limits << QUOTABLOCK_BITS;
+
+ rc = fsfilt_dquot(obd, dquot, QFILE_WR_DQUOT);
+ if (rc)
+ CERROR("write dquot failed! (rc:%d)\n", rc);
+
+ up(&dquot->dq_sem);
+ up(&mds->mds_qonoff_sem);
+ EXIT;
+out:
+ down(&dquot->dq_sem);
+ dquot->dq_status &= ~DQ_STATUS_RECOVERY;
+skip:
+ up(&dquot->dq_sem);
lustre_dqput(dquot);
+ OBD_FREE_PTR(qctl);
+ return rc;
+}
+
+struct qmaster_recov_thread_data {
+ struct obd_device *obd;
+ struct completion comp;
+};
+
+static int qmaster_recovery_main(void *arg)
+{
+ struct qmaster_recov_thread_data *data = arg;
+ struct obd_device *obd = data->obd;
+ unsigned long flags;
+ int rc = 0;
+ unsigned short type;
+ ENTRY;
+
+ lock_kernel();
+ ptlrpc_daemonize();
+
+ SIGNAL_MASK_LOCK(current, flags);
+ sigfillset(&current->blocked);
+ RECALC_SIGPENDING;
+ SIGNAL_MASK_UNLOCK(current, flags);
+ THREAD_NAME(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX - 1, "%s",
+ "qmaster_recovd");
+ unlock_kernel();
+
+ complete(&data->comp);
+
+ for (type = USRQUOTA; type < MAXQUOTAS; type++) {
+ struct mds_obd *mds = &obd->u.mds;
+ struct lustre_quota_info *qinfo = &mds->mds_quota_info;
+ struct list_head id_list;
+ struct dquot_id *dqid, *tmp;
+
+ down(&mds->mds_qonoff_sem);
+ if (qinfo->qi_files[type] == NULL) {
+ up(&mds->mds_qonoff_sem);
+ continue;
+ }
+ INIT_LIST_HEAD(&id_list);
+ rc = fsfilt_qids(obd, qinfo->qi_files[type], NULL, type,
+ &id_list);
+ up(&mds->mds_qonoff_sem);
+
+ if (rc)
+ CERROR("error get ids from admin quotafile.(%d)\n", rc);
+
+ list_for_each_entry_safe(dqid, tmp, &id_list, di_link) {
+ list_del_init(&dqid->di_link);
+ if (rc)
+ goto free;
+
+ rc = dquot_recovery(obd, dqid->di_id, type);
+ if (rc)
+ CERROR("qmaster recovery failed! (id:%d type:%d"
+ " rc:%d)\n", dqid->di_id, type, rc);
+free:
+ kfree(dqid);
+ }
+ }
+ RETURN(rc);
+}
+
+int mds_quota_recovery(struct obd_device *obd)
+{
+ struct lov_obd *lov = &obd->u.mds.mds_osc_obd->u.lov;
+ struct qmaster_recov_thread_data data;
+ int rc = 0;
+ ENTRY;
+
+ down(&lov->lov_lock);
+ if (lov->desc.ld_tgt_count != lov->desc.ld_active_tgt_count) {
+ CWARN("Not all osts are active, abort quota recovery\n");
+ up(&lov->lov_lock);
+ RETURN(rc);
+ }
+ up(&lov->lov_lock);
+
+ data.obd = obd;
+ init_completion(&data.comp);
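+ /* data lives on this stack; the completion makes sure the thread has
+ * finished reading it before we return */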
+
+ rc = kernel_thread(qmaster_recovery_main, &data, CLONE_VM|CLONE_FILES);
+ if (rc < 0)
+ CERROR("Cannot start quota recovery thread: rc %d\n", rc);
+
+ wait_for_completion(&data.comp);
RETURN(rc);
}
/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
* vim:expandtab:shiftwidth=8:tabstop=8:
*
- * Copyright (C) 2003 Cluster File Systems, Inc.
+ * Copyright (C) 2005 Cluster File Systems, Inc.
* Author: Lai Siyao <lsy@clusterfs.com>
*
* This file is part of Lustre, http://www.lustre.org/
{
unsigned long desc_block, desc;
struct ext3_group_desc *gdp;
-
+
desc_block = group / EXT3_DESC_PER_BLOCK(sb);
desc = group % EXT3_DESC_PER_BLOCK(sb);
gdp = (struct ext3_group_desc *)
EXT3_SB(sb)->s_group_desc[desc_block]->b_data;
-
+
return gdp + desc;
}
{
struct ext3_group_desc *desc;
struct buffer_head *bh;
-
+
desc = get_group_desc(sb, group);
bh = sb_bread(sb, le32_to_cpu(desc->bg_inode_bitmap));
-
+
return bh;
}
int index, unsigned long ino)
{
struct inode *inode = NULL;
-
+
if (ext3_test_bit(index, bitmap_bh->b_data)) {
CERROR("i: %d, ino: %lu\n", index, ino);
ll_sleep(1);
inode = iget(sb, ino);
}
-
+
return inode;
}
brelse(bitmap_bh);
bitmap_bh = read_inode_bitmap(sb, group);
- if (group == 0)
+ if (group == 0)
CERROR("groups_count: %lu, inodes_per_group: %lu, first_ino: %u, inodes_count: %u\n",
sbi->s_groups_count, sbi->s_inodes_per_group,
sbi->s_first_ino, le32_to_cpu(sbi->s_es->s_inodes_count));
* ------------------------------------------------------------------------- */
static int quotacheck_run_tests(struct obd_device *obd, struct obd_device *tgt)
{
- struct super_block *sb;
int rc;
ENTRY;
- if (!strcmp(tgt->obd_type->typ_name, LUSTRE_MDS_NAME))
- sb = tgt->u.mds.mds_sb;
- else if (!strcmp(tgt->obd_type->typ_name, "obdfilter"))
- sb = tgt->u.filter.fo_sb;
- else {
+ if (strcmp(tgt->obd_type->typ_name, LUSTRE_MDS_NAME) &&
+ strcmp(tgt->obd_type->typ_name, "obdfilter")) {
CERROR("TARGET OBD should be mds or ost\n");
RETURN(-EINVAL);
}
- rc = quotacheck_test_1(tgt, sb);
+ rc = quotacheck_test_1(tgt, tgt->u.obt.obt_sb);
return rc;
}
/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
* vim:expandtab:shiftwidth=8:tabstop=8:
*
- * Copyright (C) 2003 Cluster File Systems, Inc.
+ * Copyright (C) 2005 Cluster File Systems, Inc.
* Author: Lai Siyao <lsy@clusterfs.com>
*
* This file is part of Lustre, http://www.lustre.org/
#include <linux/lustre_mds.h>
#include <linux/obd_ost.h>
-char *test_quotafile[] = {"aquotactl.user", "aquotactl.group"};
+static struct obd_quotactl oqctl;
/* Test quotaon */
static int quotactl_test_1(struct obd_device *obd, struct super_block *sb)
{
- struct obd_quotactl oqctl;
int rc;
ENTRY;
oqctl.qc_id = QFMT_LDISKFS;
oqctl.qc_type = UGQUOTA;
rc = fsfilt_quotactl(obd, sb, &oqctl);
- if (rc) {
+ if (rc)
CERROR("1a: quotactl Q_QUOTAON failed: %d\n", rc);
- RETURN(rc);
- }
-
- RETURN(0);
+ RETURN(rc);
}
#if 0 /* set/getinfo not supported, this is for cluster-wide quotas */
/* Test set/getquota */
static int quotactl_test_3(struct obd_device *obd, struct super_block *sb)
{
- struct obd_quotactl oqctl;
int rc;
ENTRY;
/* Test quotaoff */
static int quotactl_test_4(struct obd_device *obd, struct super_block *sb)
{
- struct obd_quotactl oqctl;
int rc;
ENTRY;
int rc;
ENTRY;
- if (!strcmp(tgt->obd_type->typ_name, LUSTRE_MDS_NAME))
- sb = tgt->u.mds.mds_sb;
- else if (!strcmp(tgt->obd_type->typ_name, "obdfilter"))
- sb = tgt->u.filter.fo_sb;
- else {
- CERROR("TARGET OBD should be mds or obdfilter\n");
+ if (strcmp(tgt->obd_type->typ_name, LUSTRE_MDS_NAME) &&
+ strcmp(tgt->obd_type->typ_name, "obdfilter")) {
+ CERROR("TARGET OBD should be mds or ost\n");
RETURN(-EINVAL);
}
+ sb = tgt->u.obt.obt_sb;
+
push_ctxt(&saved, &tgt->obd_lvfs_ctxt, NULL);
rc = quotactl_test_1(tgt, sb);
flock_test
writemany
random-reads
+chownmany
# Lustre test Makefile
-AM_CPPFLAGS = $(LLCPPFLAGS) -I/opt/lam/include -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64
+AM_CPPFLAGS = $(LLCPPFLAGS) -I/opt/lam/include -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -DLUSTRE_UTILS
AM_CFLAGS = $(LLCFLAGS)
# LDADD = -lldap
# LDADD := -lreadline -ltermcap # -lefence
pkgexample_scripts = llmount.sh llmountcleanup.sh llecho.sh llechocleanup.sh
pkgexample_scripts += local.sh echo.sh uml.sh lov.sh
noinst_DATA =
-noinst_SCRIPTS = leak_finder.pl llecho.sh llmount.sh llmountcleanup.sh tbox.sh
+noinst_SCRIPTS = leak_finder.pl llecho.sh llmount.sh llmountcleanup.sh
noinst_SCRIPTS += llrmount.sh runfailure-mds runvmstat runfailure-net
noinst_SCRIPTS += runfailure-ost runiozone runregression-net.sh runtests
noinst_SCRIPTS += sanity.sh rundbench
pkgexample_SCRIPTS = $(pkgexample_scripts)
noinst_PROGRAMS = openunlink testreq truncate directio openme writeme
noinst_PROGRAMS += tchmod toexcl fsx test_brw openclose createdestroy
-noinst_PROGRAMS += stat createmany statmany multifstat createtest mlink utime
+noinst_PROGRAMS += stat createmany chownmany statmany multifstat createtest mlink utime
noinst_PROGRAMS += opendirunlink opendevunlink unlinkmany fchdir_test checkstat
noinst_PROGRAMS += wantedi statone runas openfile getdents o_directory
noinst_PROGRAMS += small_write multiop sleeptest ll_sparseness_verify cmknod
-copied from acl-2.2.23/test/
+- copied from acl-2.2.23/test/
+- add inheritance.test from HP
+- some tests depend on the environment, e.g. some succeed on FC2 but fail on
+ FC3 etc. We comment out those items; more may be commented out later.
$ echo i > tree/dir1/f
$ ls -l tree/dir1/f | awk -- '{ print $1 }'
> -rw-r--r--+
- $ su bin
- $ echo i > tree/dir6/dir2/f
- > tree/dir6/dir2/f: No such file or directory
- $ su
+in the following item the error message is distribution-dependent:
+it succeeds on FC3 but not on FC2, SLES3 etc., so it is commented out by CFS.
+# $ su bin
+# $ echo i > tree/dir6/dir2/f
+# > tree/dir6/dir2/f: No such file or directory
+# $ su
$ rm -rf tree
TIMEOUT=${TIMEOUT:-20}
UPCALL=${UPCALL:-DEFAULT}
+MDSOPT=${MDSOPT:-"user_xattr,acl"}
+CLIENTOPT=${CLIENTOPT:-"user_xattr,acl"}
+MOUNTOPT=${MOUNTOPT:-"user_xattr,acl"}
+
STRIPE_BYTES=${STRIPE_BYTES:-1048576}
STRIPES_PER_OBJ=${STRIPES_PER_OBJ:-0}
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <time.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+
+void usage(char *prog)
+{
+ printf("usage: %s owner filenamefmt count\n", prog);
+ printf(" %s owner filenamefmt start count\n", prog);
+}
+
+int main(int argc, char ** argv)
+{
+ int i, rc = 0, mask = 0;
+ char format[4096], *fmt;
+ char filename[4096];
+ long start, last;
+ long begin = 0, count;
+
+ if (argc < 4 || argc > 5) {
+ usage(argv[0]);
+ return 1;
+ }
+
+ mask = strtol(argv[1], NULL, 0);
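+ /* first argument is the numeric uid the files are chowned to */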
+
+ if (strlen(argv[2]) > 4080) {
+ printf("name too long\n");
+ return 1;
+ }
+
+ start = last = time(0);
+
+ if (argc == 4) {
+ count = strtol(argv[3], NULL, 0);
+ if (count < 1) {
+ printf("count must be at least one\n");
+ return 1;
+ }
+ } else {
+ begin = strtol(argv[3], NULL, 0);
+ count = strtol(argv[4], NULL, 0);
+ }
+
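+ /* if the name template has no %-conversion of its own, append
+ * a %d counter */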
+ if (strchr(argv[2], '%')) {
+ fmt = argv[2];
+ } else {
+ sprintf(format, "%s%%d", argv[2]);
+ fmt = format;
+ }
+ for (i = 0; i < count; i++, begin++) {
+ sprintf(filename, fmt, begin);
+ rc = chown(filename, mask, -1);
+ if (rc) {
+ printf("chown (%s) error: %s\n",
+ filename, strerror(errno));
+ rc = errno;
+ break;
+ }
+ if ((i % 10000) == 0) {
+ printf(" - chowned %d (time %ld ; total %ld ; last "
+ "%ld)\n", i, time(0), time(0) - start,
+ time(0) - last);
+ last = time(0);
+ }
+ }
+ printf("total: %d chowns in %ld seconds: %f chowns/second\n", i,
+ time(0) - start, ((float)i / (time(0) - start)));
+
+ return rc;
+}
ONLY=${ONLY:-"$*"}
# bug number for skipped test:
-ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-""}
+ALWAYS_EXCEPT=" $CONF_SANITY_EXCEPT"
# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
-[ "$ALWAYS_EXCEPT$EXCEPT" ] && echo "Skipping tests: $ALWAYS_EXCEPT $EXCEPT"
-
SRCDIR=`dirname $0`
PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH
[ -d $MOUNT ] || mkdir -p $MOUNT
$LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null
- llmount -o nettype=$NETTYPE $mds_HOST://mds_svc/client_facet $MOUNT && exit 1
+ llmount -o nettype=$NETTYPE,$MOUNTOPT $mds_HOST://mds_svc/client_facet $MOUNT && exit 1
# cleanup client modules
$LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null
[ -d $MOUNT ] || mkdir -p $MOUNT
$LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null
- llmount -o nettype=$NETTYPE $mds_HOST://wrong_mds_svc/client_facet $MOUNT && return 1
+ llmount -o nettype=$NETTYPE,$MOUNTOPT $mds_HOST://wrong_mds_svc/client_facet $MOUNT && return 1
# cleanup client modules
$LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null
[ -d $MOUNT ] || mkdir -p $MOUNT
$LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null
- llmount -o nettype=$NETTYPE $mds_HOST://mds_svc/client_facet $MOUNT || return 1
+ llmount -o nettype=$NETTYPE,$MOUNTOPT $mds_HOST://mds_svc/client_facet $MOUNT || return 1
umount $MOUNT || return 2
# cleanup client modules
do_node `hostname` mkdir -p $MOUNT 2> /dev/null
# load llite module on the client if it isn't in /lib/modules
do_node `hostname` lconf --nosetup --node client_facet $XMLCONFIG
- do_node `hostname` mount -t lustre -o nettype=$NETTYPE \
+ do_node `hostname` mount -t lustre -o nettype=$NETTYPE,$MOUNTOPT \
`facet_active_host mds`:/mds_svc/client_facet $MOUNT ||return $?
echo "mount lustre on $MOUNT with $MOUNTLUSTRE: success"
[ -d /r ] && $LCTL modules > /r/tmp/ogdb-`hostname`
[ -f "$MOUNTLUSTRE" ] && rm -f $MOUNTLUSTRE
echo "mount lustre on ${MOUNT} without $MOUNTLUSTRE....."
- do_node `hostname` mount -t lustre -o nettype=$NETTYPE \
+ do_node `hostname` mount -t lustre -o nettype=$NETTYPE,$MOUNTOPT \
`facet_active_host mds`:/mds_svc/client_facet $MOUNT &&return $?
echo "mount lustre on $MOUNT without $MOUNTLUSTRE failed as expected"
cleanup || return $?
SERVER=${SERVER:-$HOSTNAME}
CLIENT=${CLIENT:-$HOSTNAME}
NET=${NET:-tcp}
+[ "$ACCEPTOR_PORT" ] && PORT_OPT="--port $ACCEPTOR_PORT"
h2tcp () {
case $1 in
rm -f $config
# create nodes
$LMC --add node --node $SERVER || exit 1
-$LMC --add net --node $SERVER --nid `h2$NET $SERVER` --nettype $NET || exit 2
+$LMC --add net --node $SERVER --nid `h2$NET $SERVER` --nettype $NET $PORT_OPT|| exit 2
if (($LOV)); then
$LMC --add mds --node $SERVER --mds mds1 --fstype $FSTYPE --dev $MDSDEV --size $MDSSIZE || exit 10
if [ "$SERVER" != "$CLIENT" ]; then
$LMC --add node --node $CLIENT || exit 1
- $LMC --add net --node $CLIENT --nid `h2$NET $CLIENT` --nettype $NET || exit 2
+ $LMC --add net --node $CLIENT --nid `h2$NET $CLIENT` --nettype $NET $PORT_OPT || exit 2
fi
$LMC --add echo_client --node $CLIENT --ost ${OBD_NAME} || exit 3
. ${CONFIG:=$LUSTRE/tests/cfg/insanity-local.sh}
-ALWAYS_EXCEPT="10"
+ALWAYS_EXCEPT="10 $INSANITY_EXCEPT"
SETUP=${SETUP:-"setup"}
CLEANUP=${CLEANUP:-"cleanup"}
[ "$NODE" ] && node_opt="--node $NODE"
[ "$DEBUG" ] && debug_opt="--ptldebug=$DEBUG"
+[ "$PTLDEBUG" ] && debug_opt="--ptldebug=$PTLDEBUG"
${LCONF} $NOMOD $portals_opt $lustre_opt $debug_opt $node_opt ${REFORMAT:---reformat} $@ \
- $conf_opt || exit 2
+ $conf_opt || {
+ # maybe acceptor error, dump tcp port usage
+ netstat -tpn
+ exit 2
+}
if [ "$MOUNT2" ]; then
- $LLMOUNT -v `hostname`:/mds1/client $MOUNT2 || exit 3
+ $LLMOUNT -v -o user_xattr,acl `hostname`:/mds1/client $MOUNT2 || exit 3
fi
--- /dev/null
+#!/bin/sh
+
+set -e
+
+LUSTRE=${LUSTRE:-`dirname $0`/..}
+. $LUSTRE/tests/test-framework.sh
+
+init_test_env $@
+
+. ${CONFIG:=$LUSTRE/tests/cfg/local.sh}
+
+ostfailover_HOST=${ostfailover_HOST:-$ost_HOST}
+
+gen_config() {
+ rm -f "$XMLCONFIG"
+ add_mds mds --dev "$MDSDEV" --size "$MDSSIZE"
+ add_lov lov1 mds --stripe_sz $STRIPE_BYTES \
+ --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
+ add_ost ost --lov lov1 --dev $OSTDEV --size $OSTSIZE --failover
+ if [ ! -z "$ostfailover_HOST" ]; then
+ add_ostfailover ost --dev $OSTDEV --size $OSTSIZE
+ fi
+ add_client client mds --lov lov1 --path $MOUNT
+}
+
+cleanup() {
+ # make sure we are using the primary MDS, so the config log will
+ # be able to clean up properly.
+ activeost=`facet_active ost`
+ if [ $activeost != "ost" ]; then
+ fail ost
+ fi
+ zconf_umount `hostname` $MOUNT
+ stop mds ${FORCE} $MDSLCONFARGS
+ stop ost ${FORCE} --dump $TMP/replay-ost-single-`hostname`.log
+}
+
+if [ "$ONLY" == "cleanup" ]; then
+ sysctl -w portals.debug=0
+ FORCE=--force cleanup
+ exit
+fi
+
+build_test_filter
+
+SETUP=${SETUP:-"setup"}
+CLEANUP=${CLEANUP:-"cleanup"}
+
+setup() {
+ gen_config
+
+ start ost --reformat $OSTLCONFARGS
+ [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
+ start mds --reformat $MDSLCONFARGS
+
+ if [ -z "`grep " $MOUNT " /proc/mounts`" ]; then
+ # test "-1" needed during initial client->OST connection
+ log "== test 00: target handle mismatch (bug 5317) === `date +%H:%M:%S`"
+
+ #define OBD_FAIL_OST_ALL_REPLY_NET 0x211
+ do_facet ost "sysctl -w lustre.fail_loc=0x80000211"
+
+ zconf_mount `hostname` $MOUNT && df $MOUNT && pass || error "mount fail"
+ fi
+}
+
+mkdir -p $DIR
+
+$SETUP
+
+LCOUNT=${LCOUNT:-10000}
+
+test_0() {
+ ./createmany -o $DIR/llog-%d $LCOUNT
+ #replay_barrier ost
+}
+run_test 0 "Prepare fileset"
+
+test_1() {
+ ./chownmany 1000 $DIR/llog-%d $LCOUNT
+ sleep 5
+}
+run_test 1 "Do chowns"
+
+test_2() {
+ HALFCOUNT=${HALFCOUNT:-17}
+ ./chownmany 500 $DIR/llog-%d 0 $HALFCOUNT
+ fail ost
+ ./chownmany 500 $DIR/llog-%d $HALFCOUNT $LCOUNT
+ sleep 5
+}
+#run_test 2 "Fail OST during chown"
+
+test_3() {
+ ./unlinkmany $DIR/llog-%d $LCOUNT
+ sleep 2
+ $CHECKSTAT -t file $DIR/llog-* && return 1 || true
+}
+run_test 3 "Remove testset"
+
+equals_msg test complete, cleaning up
+FORCE=--force $CLEANUP
#!/bin/sh
+# vim:expandtab:shiftwidth=4:softtabstop=4:tabstop=4:
export PATH=`dirname $0`/../utils:$PATH
fi
[ "$NODE" ] && node_opt="--node $NODE"
+[ "$DEBUG" ] && portals_opt="$portals_opt --ptldebug=$DEBUG"
+[ "$PTLDEBUG" ] && portals_opt="$portals_opt --ptldebug=$PTLDEBUG"
-${LCONF} $NOMOD $portals_opt $lustre_opt $node_opt $@ $conf_opt || exit 2
+${LCONF} $NOMOD $portals_opt $lustre_opt $node_opt $@ $conf_opt || {
+ # maybe acceptor error, dump tcp port usage
+ netstat -tpn
+ exit 2
+}
-[ $DEBUG ] && sysctl -w lnet.debug=$DEBUG
if [ "$MOUNT2" ]; then
- $LLMOUNT -v `hostname`:/mds1/client $MOUNT2 || exit 3
+ $LLMOUNT -v -o user_xattr,acl `hostname`:/mds1/client $MOUNT2 || exit 3
fi
MOUNT=${MOUNT:-/mnt/lustre}
MOUNT2=${MOUNT2:-${MOUNT}2}
NETTYPE=${NETTYPE:-tcp}
+[ "$ACCEPTOR_PORT" ] && PORT_OPT="--port $ACCEPTOR_PORT"
OSTDEV=${OSTDEV:-$TMP/ost1-`hostname`}
OSTSIZE=${OSTSIZE:-400000}
-CLIENTOPT="user_xattr,${CLIENTOPT:-""}"
+MDS_MOUNT_OPTS="user_xattr,acl,${MDS_MOUNT_OPTS:-""}"
+CLIENTOPT="user_xattr,acl,${CLIENTOPT:-""}"
# specific journal size for the ost, in MB
JSIZE=${JSIZE:-0}
# create nodes
${LMC} --add node --node $HOSTNAME || exit 10
-${LMC} --add net --node $HOSTNAME --nid `h2$NETTYPE $HOSTNAME` --nettype $NETTYPE || exit 11
-${LMC} --add net --node client --nid '*' --nettype $NETTYPE || exit 12
+${LMC} --add net --node $HOSTNAME --nid `h2$NETTYPE $HOSTNAME` \
+ --nettype $NETTYPE $PORT_OPT || exit 11
+${LMC} --add net --node client --nid '*' --nettype $NETTYPE $PORT_OPT|| exit 12
[ "x$MDS_MOUNT_OPTS" != "x" ] &&
MDS_MOUNT_OPTS="--mountfsoptions $MDS_MOUNT_OPTS"
+[ "x$QUOTA_OPTS" != "x" ] &&
+ QUOTA_OPTS="--quota $QUOTA_OPTS"
+
# configure mds server
${LMC} --add mds --node $HOSTNAME --mds mds1 --fstype $FSTYPE \
- --dev $MDSDEV \
- $MDS_MOUNT_OPTS --size $MDSSIZE $JARG $IARG $MDSOPT || exit 20
+ --dev $MDSDEV $MDS_MOUNT_OPTS $QUOTA_OPTS\
+ --size $MDSSIZE $JARG $IARG $MDSOPT || exit 20
[ "x$OST_MOUNT_OPTS" != "x" ] &&
OST_MOUNT_OPTS="--mountfsoptions $OST_MOUNT_OPTS"
--stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0 $LOVOPT || exit 20
${LMC} --add ost --node $HOSTNAME --lov lov1 --fstype $FSTYPE \
- --dev $OSTDEV \
+ --dev $OSTDEV $QUOTA_OPTS\
$OST_MOUNT_OPTS --size $OSTSIZE $JARG $OSTOPT || exit 30
# create client config
cleanup() {
[ $CR_PID ] && kill -9 $CR_PID
+ [ $ST_PID ] && kill -9 $ST_PID
}
-trap cleanup 0
+trap cleanup EXIT
LOCKDIR=$DIR/lockdir
LOCKFILE=$LOCKDIR/lockfile
MOUNT=${MOUNT:-/mnt/lustre}
MOUNT2=${MOUNT2:-${MOUNT}2}
NETTYPE=${NETTYPE:-tcp}
+[ "$ACCEPTOR_PORT" ] && PORT_OPT="--port $ACCEPTOR_PORT"
OSTCOUNT=${OSTCOUNT:-5}
# OSTDEVN will still override the device for OST N
STRIPE_BYTES=${STRIPE_BYTES:-1048576}
STRIPES_PER_OBJ=${STRIPES_PER_OBJ:-$((OSTCOUNT -1))}
-CLIENTOPT="user_xattr,${CLIENTOPT:-""}"
+MDS_MOUNT_OPTS="user_xattr,acl,${MDS_MOUNT_OPTS:-""}"
+CLIENTOPT="user_xattr,acl,${CLIENTOPT:-""}"
# specific journal size for the ost, in MB
JSIZE=${JSIZE:-0}
# create nodes
${LMC} --add node --node $HOSTNAME || exit 10
-${LMC} --add net --node $HOSTNAME --nid $HOSTNAME --nettype $NETTYPE || exit 11
-${LMC} --add net --node client --nid '*' --nettype $NETTYPE || exit 12
+${LMC} --add net --node $HOSTNAME --nid $HOSTNAME \
+ --nettype $NETTYPE $PORT_OPT || exit 11
+${LMC} --add net --node client --nid '*' --nettype $NETTYPE $PORT_OPT || exit 12
+
+[ "x$QUOTA_OPTS" != "x" ] &&
+ QUOTA_OPTS="--quota $QUOTA_OPTS"
# configure mds server
+[ "x$MDS_MOUNT_OPTS" != "x" ] &&
+ MDS_MOUNT_OPTS="--mountfsoptions $MDS_MOUNT_OPTS"
+
${LMC} --format --add mds --node $HOSTNAME --mds mds1 --fstype $FSTYPE \
- --dev $MDSDEV --size $MDSSIZE $MDSOPT || exit 20
+ --dev $MDSDEV $MDS_MOUNT_OPTS $QUOTA_OPTS --size $MDSSIZE $MDSOPT || exit 20
# configure ost
${LMC} --add lov --lov lov1 --mds mds1 --stripe_sz $STRIPE_BYTES \
DEVPTR=OSTDEV$num
eval $DEVPTR=${!DEVPTR:=$TMP/$OST-`hostname`}
${LMC} --add ost --node $HOSTNAME --lov lov1 --ost $OST --fstype $FSTYPE \
- --dev ${!DEVPTR} --size $OSTSIZE $JARG $OSTOPT || exit 30
+ --dev ${!DEVPTR} --size $OSTSIZE $JARG $OSTOPT $QUOTA_OPTS|| exit 30
done
mem = calloc(numchunk, sizeof(*mem));
if (mem == NULL) {
fprintf(stderr, "error allocating initial chunk array\n");
- exit(1);
+ exit(-1);
}
alloc = CHUNK;
printf("[%d] allocating %lld kbytes in %u kbyte chunks\n",
getpid(), kbtotal, alloc);
- for (i = kballoc = 0; i < numchunk; i++, kballoc += alloc) {
+ for (i = kballoc = 0; i < numchunk && alloc > 0; i++, kballoc += alloc) {
if (kbtotal - kballoc < alloc)
alloc = kbtotal - kballoc;
- tmp = mem[i] = malloc(alloc * 1024);
- if (tmp == NULL) {
+ while (alloc > 0 && (mem[i] = malloc(alloc * 1024)) == NULL) {
fprintf(stderr, "malloc(%u) failed (%lld/%lld)\n",
alloc * 1024, kballoc, kbtotal);
- } else {
- printf("touching %p (%lld/%lld)\n",
- tmp, kballoc, kbtotal);
- for (j = 0; j < alloc; j += 4) {
- for (k = 0, sum = 0; k < 4095; k++, tmp++)
- sum += *tmp;
- *tmp = sum;
- }
+ alloc /= 2;
+ }
+ if (alloc == 0)
+ break;
+
+ printf("touching %p ([%lld-%lld]/%lld)\n", mem[i], kballoc,
+ kballoc + alloc - 1, kbtotal);
+ for (j = 0, tmp = mem[i]; j < alloc; j += 4) {
+ for (k = 0, sum = 0; k < 4095; k++, tmp++)
+ sum += *tmp;
+ *tmp = sum;
}
}
+ if (kballoc == 0)
+ exit(-2);
+
+ kbtotal = kballoc;
printf("touched %lld kbytes\n", kballoc);
alloc = CHUNK;
if (*tmp != sum) {
fprintf(stderr, "sum %x != %x at %p\n",
*tmp, sum, tmp - 4092);
- rc = 1;
+ rc++;
}
}
}
SUCCESS=1
-rm -f $OOS
+rm -f $OOS $LOG
-sleep 1 # to ensure we get up-to-date statfs info
+sync; sleep 1; sync # to ensure we get up-to-date statfs info
#echo -1 > /proc/sys/lnet/debug
#echo 0x40a8 > /proc/sys/lnet/subsystem_debug
export LANG=C LC_LANG=C # for "No space left on device" message
-rm -f $LOG >/dev/null 2>&1
[ -f $LOG ] && echo "ERROR: log file wasn't removed?" && exit 1
# make sure we stripe over all OSTs to avoid OOS on only a subset of OSTs
fi
if [ "`grep -c 'No space left on device' $LOG`" -ne 1 ]; then
- echo "ERROR: dd not return ENOSPC"
+ echo "ERROR: dd not return ENOSPC"
+ sed "s/^/LOG: /" $LOG
SUCCESS=0
fi
fi
RECORDSOUT=`grep "records out" $LOG | cut -d + -f1`
-
FILESIZE=`ls -l $OOS | awk '{ print $5 }'`
-if [ "$RECORDSOUT" -ne $((FILESIZE / 1024)) ]; then
- echo "ERROR: blocks written by dd not equal to the size of file"
- SUCCESS=0
+if [ -z "$RECORDSOUT" ]; then
+ echo "ERROR: no blocks written by dd?"
+ sed "s/^/LOG: /" $LOG
+ SUCCESS=0
+elif [ "$RECORDSOUT" -ne $((FILESIZE / 1024)) ]; then
+ echo "ERROR: blocks written by dd not equal to the size of file"
+ SUCCESS=0
fi
#lctl debug_daemon stop
rm -f $OOS $OOS2 $LOG $LOG2
-sleep 1 # to ensure we get up-to-date statfs info
+sync; sleep 1; sync # to ensure we get up-to-date statfs info
STRIPECOUNT=`cat /proc/fs/lustre/lov/*/activeobd | head -n 1`
ORIGFREE=`cat /proc/fs/lustre/llite/*/kbytesavail | head -n 1`
+++ /dev/null
-#!/bin/bash
-
-set -e
-#set -vx
-
-SRCDIR=`dirname $0`
-export PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH:/sbin
-. $SRCDIR/test-framework.sh
-
-LFS=${LFS:-lfs}
-LCTL=${LCTL:-lctl}
-USER="quota_usr"
-TSTID=${TSTID:-60000}
-RUNAS=${RUNAS:-"runas -u $TSTID"}
-BLK_SZ=1024
-BUNIT_SZ=1000 # 1000 quota blocks
-BTUNE_SZ=500 # 500 quota blocks
-IUNIT_SZ=10 # 10 files
-ITUNE_SZ=5 # 5 files
-MAX_DQ_TIME=604800
-MAX_IQ_TIME=604800
-
-MOUNT="`cat /proc/mounts | grep "lustre" | awk '{print $2}'`"
-if [ -z "$MOUNT" ]; then
- echo "ERROR: lustre not mounted, quit test!"
- exit 1;
-fi
-OSTCOUNT=`cat /proc/fs/lustre/lov/*/activeobd | head -n 1`
-TSTDIR="$MOUNT/quota_dir"
-
-# set_blk_tunables(btune_sz)
-set_blk_tunesz() {
- # set btune size on all obdfilters
- for i in `ls /proc/fs/lustre/obdfilter/*/quota_btune_sz`; do
- echo $(($1 * $BLK_SZ)) > $i
- done
- # set btune size on mds
- for i in `ls /proc/fs/lustre/mds/mds*/quota_btune_sz`; do
- echo $(($1 * $BLK_SZ)) > $i
- done
-}
-
-# se_blk_unitsz(bunit_sz)
-set_blk_unitsz() {
- for i in `ls /proc/fs/lustre/obdfilter/*/quota_bunit_sz`; do
- echo $(($1 * $BLK_SZ)) > $i
- done
- for i in `ls /proc/fs/lustre/mds/mds*/quota_bunit_sz`; do
- echo $(($1 * $BLK_SZ)) > $i
- done
-}
-
-# set_file_tunesz(itune_sz)
-set_file_tunesz() {
- # set iunit and itune size on all obdfilters
- for i in `ls /proc/fs/lustre/obdfilter/*/quota_itune_sz`; do
- echo $1 > $i
- done
- # set iunit and itune size on mds
- for i in `ls /proc/fs/lustre/mds/mds*/quota_itune_sz`; do
- echo $1 > $i
- done
-
-
-}
-# set_file_unitsz(iunit_sz)
-set_file_unitsz() {
- for i in `ls /proc/fs/lustre/obdfilter/*/quota_iunit_sz`; do
- echo $1 > $i
- done;
- for i in `ls /proc/fs/lustre/mds/mds*/quota_iunit_sz`; do
- echo $1 > $i
- done
-}
-
-prepare_test() {
- # create test group
- GRP="`cat /etc/group | grep "$USER" | awk -F: '{print $1}'`"
- if [ -z "$GRP" ]; then
- groupadd -g $TSTID "$USER"
- fi
- TSTID="`cat /etc/group | grep "$USER" | awk -F: '{print $3}'`"
-
- # create test user
- USR="`cat /etc/passwd | grep "$USER" | awk -F: '{print $1}'`"
- if [ -z "$USR" ]; then
- useradd -u $TSTID -g $TSTID -d /tmp "$USER"
- fi
-
- RUNAS="runas -u $TSTID"
-
- # set block tunables
- set_blk_tunesz $BTUNE_SZ
- set_blk_unitsz $BUNIT_SZ
- # set file tunaables
- set_file_tunesz $ITUNE_SZ
- set_file_unitsz $IUNIT_SZ
-
- [ -d $TSTDIR ] || mkdir $TSTDIR
- chmod 777 $TSTDIR
-}
-
-cleanup_test() {
- # restore block tunables to default size
- set_blk_unitsz $((1024 * 100))
- set_blk_tunesz $((1024 * 50))
- # restore file tunables to default size
- set_file_unitsz 5000
- set_file_tunesz 2500
-
- rm -fr $TSTDIR
- # delete test user and group
- userdel "$USER"
-}
-
-# set quota
-test_1() {
- echo "== Enable quota"
- $LFS quotaoff -ug $MOUNT
- $LFS quotacheck -ug $MOUNT
-
- $LFS setquota -u $USER 0 0 0 0 $MOUNT
- $LFS setquota -g $USER 0 0 0 0 $MOUNT
- return 0
-}
-
-# block hard limit (normal use and out of quota)
-test_2() {
- echo "== Block hard limit"
- LIMIT=$(( $BUNIT_SZ * $(($OSTCOUNT + 1)) * 10)) # 10 bunits each sever
- TESTFILE="$TSTDIR/quota_tst20"
-
- echo " User quota (limit: $LIMIT bytes)"
- $LFS setquota -u $USER 0 $LIMIT 0 0 $MOUNT
-
- $RUNAS touch $TESTFILE >/dev/null 2>&1
-
- echo " Write ..."
- $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$(($LIMIT/2)) > /dev/null 2>&1 || error "(usr) write failure, but expect success"
- echo " Done"
- echo " Write out of block quota ..."
- # this time maybe cache write, ignore it's failure
- $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$(($LIMIT/2)) seek=$(($LIMIT/2)) > /dev/null 2>&1 || echo " " > /dev/null
- # flush cache, ensure noquota flag is setted on client
- sync; sleep 1; sync;
- $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$BUNIT_SZ seek=$LIMIT > /dev/null 2>&1 && error "(usr) write success, but expect EDQUOT"
- echo " EDQUOT"
-
- rm -f $TESTFILE
-
- echo " Group quota (limit: $LIMIT bytes)"
- $LFS setquota -u $USER 0 0 0 0 $MOUNT # clear user limit
- $LFS setquota -g $USER 0 $LIMIT 0 0 $MOUNT
- TESTFILE="$TSTDIR/quota_tst21"
-
- $RUNAS touch $TESTFILE >/dev/null 2>&1
-
- echo " Write ..."
- $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$(($LIMIT/2)) > /dev/null 2>&1 || error "(grp) write failure, but expect success"
- echo " Done"
- echo " Write out of block quota ..."
- # this time maybe cache write, ignore it's failure
- $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$(($LIMIT/2)) seek=$(($LIMIT/2)) > /dev/null 2>&1 || echo " " > /dev/null
- sync; sleep 1; sync;
- $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$BUNIT_SZ seek=$LIMIT > /dev/null 2>&1 && error "(grp) write success, but expect EDQUOT"
- echo " EDQUOT"
-
- # cleanup
- rm -f $TESTFILE
- $LFS setquota -g $USER 0 0 0 0 $MOUNT
- return 0
-}
-
-# file hard limit (normal use and out of quota)
-test_3() {
- echo "== File hard limit"
- LIMIT=$(($IUNIT_SZ * 10)) # 10 iunits on mds
- TESTFILE="$TSTDIR/quota_tst30"
-
- echo " User quota (limit: $LIMIT files)"
- $LFS setquota -u $USER 0 0 0 $LIMIT $MOUNT
-
- echo " Create $LIMIT files ..."
- for i in `seq ${LIMIT}`; do
- $RUNAS touch ${TESTFILE}_$i > /dev/null 2>&1 || error "(usr) touch failure, but except success"
- done
- echo " Done"
- echo " Create out of file quota ..."
- $RUNAS touch ${TESTFILE}_xxx > /dev/null 2>&1 && error "(usr) touch success, but expect EDQUOT"
- echo " EDQUOT"
-
- for i in `seq ${LIMIT}`; do
- rm -f ${TESTFILE}_$i
- done
-
- echo " Group quota (limit: $LIMIT files)"
- $LFS setquota -u $USER 0 0 0 0 $MOUNT # clear user limit
- $LFS setquota -g $USER 0 0 0 $LIMIT $MOUNT
- TESTFILE="$TSTDIR/quota_tst31"
-
- echo " Create $LIMIT files ..."
- for i in `seq ${LIMIT}`; do
- $RUNAS touch ${TESTFILE}_$i > /dev/null 2>&1 || error "(grp) touch failure, but expect success"
- done
- echo " Done"
- echo " Create out of file quota ..."
- $RUNAS touch ${TESTFILE}_xxx > /dev/null 2>&1 && error "(grp) touch success, but expect EDQUOT"
- echo " EDQUOT"
-
- # cleanup
- for i in `seq ${LIMIT}`; do
- rm -f ${TESTFILE}_$i
- done
- $LFS setquota -g $USER 0 0 0 0 $MOUNT
- return 0
-}
-
-test_block_soft() {
- TESTFILE=$1
- GRACE=$2
- BS=$(($BUNIT_SZ * $BLK_SZ))
-
- echo " Write to exceed soft limit"
- $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$BUNIT_SZ >/dev/null 2>&1 || error "write failure, but expect success"
- sync; sleep 1; sync;
-
- echo " Write before timer goes off"
- $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$BUNIT_SZ seek=$BUNIT_SZ >/dev/null 2>&1 || error "write failure, but expect success"
- sync; sleep 1; sync;
- echo " Done"
-
- echo " Sleep $GRACE seconds ..."
- sleep $GRACE
-
- echo " Write after timer goes off"
- # maybe cache write, ignore.
- $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$BUNIT_SZ seek=$(($BUNIT_SZ * 2)) >/dev/null 2>&1 || echo " " > /dev/null
- sync; sleep 1; sync;
- $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=1 seek=$(($BUNIT_SZ * 3)) >/dev/null 2>&1 && error "write success, but expect EDQUOT"
- echo " EDQUOT"
-
- echo " Unlink file to stop timer"
- rm -f $TESTFILE
- echo " Done"
-
- echo " Write ..."
- $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$BUNIT_SZ >/dev/null 2>&1 || error "write failure, but expect success"
- echo " Done"
-
- # cleanup
- rm -f $TESTFILE
-}
-
-# block soft limit (start timer, timer goes off, stop timer)
-test_4() {
- echo "== Block soft limit"
- LIMIT=$(( $BUNIT_SZ * $(($OSTCOUNT + 1)) )) # 1 bunits each sever
- TESTFILE="$TSTDIR/quota_tst40"
- GRACE=10
-
- echo " User quota (soft limit: $LIMIT bytes grace: $GRACE seconds)"
- $LFS setquota -t -u $GRACE $MAX_IQ_TIME $MOUNT
- $LFS setquota -u $USER $LIMIT 0 0 0 $MOUNT
-
- test_block_soft $TESTFILE $GRACE
- $LFS setquota -u $USER 0 0 0 0 $MOUNT
-
- echo " Group quota (soft limit: $LIMIT bytes grace: $GRACE seconds)"
- $LFS setquota -t -g $GRACE $MAX_IQ_TIME $MOUNT
- $LFS setquota -g $USER $LIMIT 0 0 0 $MOUNT
- TESTFILE="$TSTDIR/quota_tst41"
-
- test_block_soft $TESTFILE $GRACE
- $LFS setquota -g $USER 0 0 0 0 $MOUNT
-
- return 0
-}
-
-test_file_soft() {
- TESTFILE=$1
- LIMIT=$2
- GRACE=$3
-
- echo " Create files to exceed soft limit"
- for i in `seq $LIMIT`; do
- $RUNAS touch ${TESTFILE}_$i >/dev/null 2>&1 || error "touch failure, but expect success"
- done
- echo " Done"
-
- echo " Create file before timer goes off"
- $RUNAS touch ${TESTFILE}_before >/dev/null 2>&1 || error "touch before timer goes off failure, but expect success"
- echo " Done"
-
- echo " Sleep $GRACE seconds ..."
- sleep $GRACE
-
- echo " Create file after timer goes off"
- for i in `seq $(($IUNIT_SZ - 1))`; do
- $RUNAS touch ${TESTFILE}_after_$i >/dev/null 2>&1 || error "touch ${TESTFILE}_after_$i failure, but expect success"
- done
- $RUNAS touch ${TESTFILE}_after >/dev/null 2>&1 && error "touch after timer goes off success, but expect EDQUOT"
- echo " EDQUOT"
-
- echo " Unlink files to stop timer"
- for i in `seq $LIMIT`; do
- rm -f ${TESTFILE}_$i >/dev/null 2>&1 || error "rm ${TESTFILE}_$i failure"
- done
- rm -f ${TESTFILE}_before
- for i in `seq $(($IUNIT_SZ - 1))`; do
- rm -f ${TESTFILE}_after_$i >/dev/null 2>&1 || error "rm ${TESTFILE}_after_$i failure"
- done
- echo " Done"
-
- echo " Create file"
- $RUNAS touch ${TESTFILE}_xxx >/dev/null 2>&1 || error "touch after timer stop failure, but expect success"
- echo " Done"
-
- # cleanup
- rm -f ${TESTFILE}_xxx
-}
-
-# file soft limit (start timer, timer goes off, stop timer)
-test_5() {
- echo "== File soft limit"
- LIMIT=$(($IUNIT_SZ * 10)) # 10 iunits on mds
- TESTFILE="$TSTDIR/quota_tst50"
- GRACE=5
-
- echo " User quota (soft limit: $LIMIT files grace: $GRACE seconds)"
- $LFS setquota -t -u $MAX_DQ_TIME $GRACE $MOUNT
- $LFS setquota -u $USER 0 0 $LIMIT 0 $MOUNT
-
- test_file_soft $TESTFILE $LIMIT $GRACE
- $LFS setquota -u $USER 0 0 0 0 $MOUNT
-
- echo " Group quota (soft limit: $LIMIT files grace: $GRACE seconds)"
- $LFS setquota -t -g $MAX_DQ_TIME $GRACE $MOUNT
- $LFS setquota -g $USER 0 0 $LIMIT 0 $MOUNT
- TESTFILE="$TSTDIR/quota_tst51"
-
- test_file_soft $TESTFILE $LIMIT $GRACE
- $LFS setquota -g $USER 0 0 0 0 $MOUNT
-
- # cleanup
- $LFS setquota -t -u $MAX_DQ_TIME $MAX_IQ_TIME $MOUNT
- $LFS setquota -t -g $MAX_DQ_TIME $MAX_IQ_TIME $MOUNT
- return 0
-}
-
-# chown & chgrp (chown & chgrp successfully even out of block/file quota)
-test_6() {
- echo "== Chown/Chgrp ignore quota"
- BLIMIT=$(( $BUNIT_SZ * $((OSTCOUNT + 1)) * 10)) # 10 bunits on each server
- ILIMIT=$(( $IUNIT_SZ * 10 )) # 10 iunits on mds
-
- echo " Set quota limit (0 $BLIMIT 0 $ILIMIT) for $USER.$USER"
- $LFS setquota -u $USER 0 $BLIMIT 0 $ILIMIT $MOUNT
- $LFS setquota -g $USER 0 $BLIMIT 0 $ILIMIT $MOUNT
-
- echo " Create more than $ILIMIT files and alloc more than $BLIMIT blocks ..."
- for i in `seq $(($ILIMIT + 1))`; do
- touch $TSTDIR/quota_tst60_$i > /dev/null 2>&1 || error "touch failure, expect success"
- done
- dd if=/dev/zero of=$TSTDIR/quota_tst60_1 bs=$BLK_SZ count=$(($BLIMIT+1)) > /dev/null 2>&1 || error "write failure, expect success"
-
- echo " Chown files to $USER.$USER ..."
- for i in `seq $(($ILIMIT + 1))`; do
- chown $USER.$USER $TSTDIR/quota_tst60_$i > /dev/null 2>&1 || error "chown failure, but expect success"
- done
-
- # cleanup
- for i in `seq $(($ILIMIT + 1))`; do
- rm -f $TSTDIR/quota_tst60_$i
- done
- $LFS setquota -u $USER 0 0 0 0 $MOUNT
- $LFS setquota -g $USER 0 0 0 0 $MOUNT
- return 0
-}
-
-# block quota acquire & release
-test_7() {
- echo "== Block quota acqurie / release"
-
- if [ $OSTCOUNT -lt 2 ]; then
- echo "WARN: too few osts, skip this test."
- return 0;
- fi
-
- LIMIT=$(($BUNIT_SZ * $(($OSTCOUNT + 1)) * 10)) # 10 bunits per server
- FILEA="$TSTDIR/quota_tst70_a"
- FILEB="$TSTDIR/quota_tst70_b"
-
- echo " Set block limit $LIMIT bytes to $USER.$USER"
- $LFS setquota -u $USER 0 $LIMIT 0 0 $MOUNT
- $LFS setquota -g $USER 0 $LIMIT 0 0 $MOUNT
-
- echo " Create filea on OST0 and fileb on OST1"
- $LFS setstripe $FILEA 65536 0 1
- $LFS setstripe $FILEB 65536 1 1
- chown $USER.$USER $FILEA
- chown $USER.$USER $FILEB
-
- echo " Exceed quota limit ..."
- $RUNAS dd if=/dev/zero of=$FILEB bs=$BLK_SZ count=$(($LIMIT - $BUNIT_SZ * $OSTCOUNT)) >/dev/null 2>&1 || error "write fileb failure, but expect success"
- sync; sleep 1; sync;
- $RUNAS dd if=/dev/zero of=$FILEB bs=$BLK_SZ seek=$LIMIT count=$BUNIT_SZ >/dev/null 2>&1 && error "write fileb success, but expect EDQUOT"
- sync; sleep 1; sync;
- echo " Write to OST0 return EDQUOT"
- # this write maybe cache write, ignore it's failure
- $RUNAS dd if=/dev/zero of=$FILEA bs=$BLK_SZ count=$(($BUNIT_SZ * 2)) >/dev/null 2>&1 || echo " " > /dev/null
- sync; sleep 1; sync;
- $RUNAS dd if=/dev/zero of=$FILEA bs=$BLK_SZ count=$(($BUNIT_SZ * 2)) seek=$(($BUNIT_SZ *2)) >/dev/null 2>&1 && error "write filea success, but expect EDQUOT"
- echo " EDQUOT"
-
- echo " Remove fileb to let OST1 release quota"
- rm -f $FILEB
-
- echo " Write to OST0"
- $RUNAS dd if=/dev/zero of=$FILEA bs=$BLK_SZ count=$(($LIMIT - $BUNIT_SZ * $OSTCOUNT)) >/dev/null 2>&1 || error "write filea failure, expect success"
- echo " Done"
-
- # cleanup
- rm -f $FILEA
- $LFS setquota -u $USER 0 0 0 0 $MOUNT
- $LFS setquota -g $USER 0 0 0 0 $MOUNT
- return 0
-}
-
-# turn off quota
-test_8()
-{
- echo "=== Turn off quota"
- $LFS quotaoff $MOUNT
- return 0
-}
-
-prepare_test
-
-# run all tests
-for j in `seq 8`; do
- test_$j
- echo "== Done"
- echo " "
-done
-
-cleanup_test
set -e
# bug 2986 5494 7288
-ALWAYS_EXCEPT="20b 24 27"
-
+ALWAYS_EXCEPT="20b 24 27 $RECOVERY_SMALL_EXCEPT"
LUSTRE=${LUSTRE:-`dirname $0`/..}
set -e
-# bug 6088
-ALWAYS_EXCEPT="8"
+# bug number: 6088
+ALWAYS_EXCEPT="8 $REPLAY_DUAL_EXCEPT"
LUSTRE=${LUSTRE:-`dirname $0`/..}
. $LUSTRE/tests/test-framework.sh
sysctl -w lustre.fail_loc=0
ls $DIR/$tfile
- $CHECKSTAT -t file $DIR/$tfile || return 2
kill -USR1 $MULTIPID || return 3
wait $MULTIPID || return 4
+ $CHECKSTAT -t file $DIR/$tfile || return 2
rm $DIR/$tfile
return 0
df $MOUNT || return 1
unlinkmany $MOUNT1/$tfile- 25 || return 2
+ [ -e $MOUNT1/$tfile-2-0 ] && error "$tfile-2-0 exists"
zconf_mount `hostname` $MOUNT2
return 0
}
run_test 15 "timeout waiting for lost client during replay, 1 client completes"
+test_15a() {
+ local ost_last_id=""
+ local mds_last_id=""
+
+ replay_barrier mds
+ echo "data" > "$MOUNT2/${tfile}-m2"
+
+ umount $MOUNT2
+ facet_failover mds
+ df $MOUNT || return 1
+
+ ost_last_id=`cat /proc/fs/lustre/obdfilter/*/last_id`
+ mds_last_id=`cat /proc/fs/lustre/osc/*mds*/last_id`
+
+ echo "Ids after MDS<->OST synchonizing"
+ echo "--------------------------------"
+ echo "MDS last_id:"
+ echo $mds_last_id
+ echo "OST last_id:"
+ echo $ost_last_id
+
+ local i=0
+ for id in $ost_last_id; do
+ ost_ids[$i]=$id
+ ((i++))
+ done
+
+ i=0
+ for id in $mds_last_id; do
+ mds_ids[$i]=$id
+ ((i++))
+ done
+
+ local arr_len=${#mds_ids[*]}
+ for ((i=0;i<$arr_len;i++)); do
+ mds_id=${mds_ids[i]}
+ ost_id=${ost_ids[i]}
+
+ test $mds_id -ge $ost_id || {
+ echo "MDS last id ($mds_id) is smaller than OST one ($ost_id)"
+ return 2
+ }
+ done
+
+ zconf_mount `hostname` $MOUNT2
+ return 0
+}
+#CROW run_test 15a "OST clear orphans - synchronize ids on MDS and OST"
+
+test_15b() {
+ replay_barrier mds
+ echo "data" > "$MOUNT2/${tfile}-m2"
+ umount $MOUNT2
+
+ do_facet ost "sysctl -w lustre.fail_loc=0x80000802"
+ facet_failover mds
+
+ df $MOUNT || return 1
+ do_facet ost "sysctl -w lustre.fail_loc=0"
+
+ zconf_mount `hostname` $MOUNT2
+ return 0
+}
+#CROW run_test 15b "multiple delayed OST clear orphans"
+
+test_15c() {
+ replay_barrier mds
+ for ((i = 0; i < 2000; i++)); do
+ echo "data" > "$MOUNT2/${tfile}-$i" || error "create ${tfile}-$i failed"
+ done
+
+ umount $MOUNT2
+ facet_failover mds
+
+ df $MOUNT || return 1
+
+ zconf_mount `hostname` $MOUNT2
+ return 0
+}
+run_test 15c "remove multiple OST orphans"
+
test_16() {
replay_barrier mds
createmany -o $MOUNT1/$tfile- 25
# Skip these tests
# BUG NUMBER: 2766?
-ALWAYS_EXCEPT="5"
+ALWAYS_EXCEPT="5 $REPLAY_OST_SINGLE_EXCEPT"
gen_config() {
rm -f $XMLCONFIG
sync && sleep 2 && sync # wait for delete thread
before=`kbytesfree`
dd if=/dev/urandom bs=4096 count=1280 of=$f
+ lfs getstripe $f
#define OBD_FAIL_MDS_REINT_NET_REP 0x119
do_facet mds "sysctl -w lustre.fail_loc=0x80000119"
sync
$CHECKSTAT -t file $f && return 2 || true
sync
# let the delete happen
- sleep 2
+ sleep 5
after=`kbytesfree`
log "before: $before after: $after"
(( $before <= $after + 40 )) || return 3 # take OST logs into account
before=`kbytesfree`
dd if=/dev/urandom bs=4096 count=1280 of=$f
sync
+ sleep 1 # ensure we have a fresh statfs
after_dd=`kbytesfree`
log "before: $before after_dd: $after_dd"
(( $before > $after_dd )) || return 1
. ${CONFIG:=$LUSTRE/tests/cfg/local.sh}
# Skip these tests
-# bug number: 2766 4176
-ALWAYS_EXCEPT="0b 39"
+# bug number: 2766
+ALWAYS_EXCEPT="0b $REPLAY_SINGLE_EXCEPT"
gen_config() {
rm -f $XMLCONFIG
}
run_test 1 "simple create"
+test_1a() {
+ do_facet ost "sysctl -w lustre.fail_loc=0"
+
+ rm -fr $DIR/$tfile
+ local old_last_id=`cat /proc/fs/lustre/obdfilter/*/last_id`
+ touch -o $DIR/$tfile 1
+ sync
+ local new_last_id=`cat /proc/fs/lustre/obdfilter/*/last_id`
+
+ test "$old_last_id" = "$new_last_id" || {
+ echo "OST object create is caused by MDS"
+ return 1
+ }
+
+ old_last_id=`cat /proc/fs/lustre/obdfilter/*/last_id`
+ echo "data" > $DIR/$tfile
+ sync
+ new_last_id=`cat /proc/fs/lustre/obdfilter/*/last_id`
+ test "$old_last_id" = "$new_last_id "&& {
+ echo "CROW does not work on write"
+ return 1
+ }
+
+ rm -fr $DIR/$tfile
+
+#define OBD_FAIL_OST_CROW_EIO | OBD_FAIL_ONCE
+ do_facet ost "sysctl -w lustre.fail_loc=0x80000801"
+
+ rm -fr $DIR/1a1
+ old_last_id=`cat /proc/fs/lustre/obdfilter/*/last_id`
+ echo "data" > $DIR/1a1
+ sync
+ new_last_id=`cat /proc/fs/lustre/obdfilter/*/last_id`
+ test "$old_last_id" = "$new_last_id" || {
+ echo "CROW does work with fail_loc=0x80000801"
+ return 1
+ }
+
+ rm -fr $DIR/1a1
+
+ do_facet ost "sysctl -w lustre.fail_loc=0"
+}
+#CROW run_test 1a "CROW object create (check OST last_id)"
+
test_2a() {
replay_barrier mds
touch $DIR/$tfile
mkdir -p $DIR
TGT=$DIR/client.txt
SRC=${SRC:-/usr/lib/dbench/client.txt}
-[ ! -e $TGT -a -e $SRC ] && echo "copying $SRC to $TGT" && cp $SRC $TGT
+[ ! -s $TGT -a -s $SRC ] && echo "copying $SRC to $TGT" && cp $SRC $TGT
SRC=/usr/lib/dbench/client_plain.txt
-[ ! -e $TGT -a -e $SRC ] && echo "copying $SRC to $TGT" && cp $SRC $TGT
+[ ! -s $TGT -a -s $SRC ] && echo "copying $SRC to $TGT" && cp $SRC $TGT
+[ ! -s $TGT ] && echo "$TGT doesn't exist" && exit 1
cd $DIR
echo "running 'dbench $@' on $PWD at `date`"
dbench -c client.txt $@
OSCMT="`mount | awk '/ lustre_lite / { print $3 }' | tail -n 1`"
if [ -z "$OSCMT" ]; then
$LCONF $@ || exit 1
- trap cleanup 0
+ trap cleanup EXIT
OSCMT="`mount | awk '/ lustre_lite / { print $3 }' | tail -n 1`"
[ -z "$OSCMT" ] && fail "no lustre filesystem mounted" 1
fi
--- /dev/null
+#!/bin/bash
+#
+# Run select tests by setting ONLY, or as arguments to the script.
+# Skip specific tests by setting EXCEPT.
+#
+# Run test by setting NOSETUP=true when ltest has setup env for us
+set -e
+
+SRCDIR=`dirname $0`
+export PATH=$PWD/$SRCDIR:$SRCDIR:$PWD/$SRCDIR/../utils:$PATH:/sbin
+
+ONLY=${ONLY:-"$*"}
+ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-""}
+# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
+
+case `uname -r` in
+2.6*) FSTYPE=${FSTYPE:-ldiskfs};;
+*) error "unsupported kernel" ;;
+esac
+
+[ "$ALWAYS_EXCEPT$EXCEPT" ] && \
+ echo "Skipping tests: `echo $ALWAYS_EXCEPT $EXCEPT`"
+
+TMP=${TMP:-/tmp}
+
+LFS=${LFS:-lfs}
+LCTL=${LCTL:-lctl}
+LSTRIPE=${LSTRIPE:-"$LFS setstripe"}
+TSTID=${TSTID:-60000}
+RUNAS=${RUNAS:-"runas -u $TSTID"}
+TSTUSR=${TSTUSR:-"quota_usr"}
+BLK_SZ=1024
+BUNIT_SZ=${BUNIT_SZ:-1000} # default 1000 quota blocks
+BTUNE_SZ=${BTUNE_SZ:-500} # default 50% of BUNIT_SZ
+IUNIT_SZ=${IUNIT_SZ:-10} # default 10 files
+ITUNE_SZ=${ITUNE_SZ:-5} # default 50% of IUNIT_SZ
+MAX_DQ_TIME=604800
+MAX_IQ_TIME=604800
+
+log() {
+ echo "$*"
+ $LCTL mark "$*" 2> /dev/null || true
+}
+
+trace() {
+ log "STARTING: $*"
+ strace -o $TMP/$1.strace -ttt $*
+ RC=$?
+ log "FINISHED: $*: rc $RC"
+ return $RC
+}
+TRACE=${TRACE:-""}
+
+run_one() {
+ BEFORE=`date +%s`
+ log "== test $1: $2= `date +%H:%M:%S` ($BEFORE)"
+ export TESTNAME=test_$1
+ test_$1 || error "exit with rc=$?"
+ unset TESTNAME
+ pass "($((`date +%s` - $BEFORE))s)"
+ cd $SAVE_PWD
+}
+
+build_test_filter() {
+ for O in $ONLY; do
+ eval ONLY_${O}=true
+ done
+ for E in $EXCEPT $ALWAYS_EXCEPT; do
+ eval EXCEPT_${E}=true
+ done
+ # tests 0 and 9 (quota on/off) must always be included
+ eval ONLY_0=true
+ eval ONLY_9=true
+}
+
+_basetest() {
+ echo $*
+}
+
+basetest() {
+ IFS=abcdefghijklmnopqrstuvwxyz _basetest $1
+}
+
+run_test() {
+ base=`basetest $1`
+ if [ "$ONLY" ]; then
+ testname=ONLY_$1
+ if [ ${!testname}x != x ]; then
+ run_one $1 "$2"
+ return $?
+ fi
+ testname=ONLY_$base
+ if [ ${!testname}x != x ]; then
+ run_one $1 "$2"
+ return $?
+ fi
+ echo -n "."
+ return 0
+ fi
+ testname=EXCEPT_$1
+ if [ ${!testname}x != x ]; then
+ echo "skipping excluded test $1"
+ return 0
+ fi
+ testname=EXCEPT_$base
+ if [ ${!testname}x != x ]; then
+ echo "skipping excluded test $1 (base $base)"
+ return 0
+ fi
+ run_one $1 "$2"
+ return $?
+}
+
+[ "$SANITYLOG" ] && rm -f $SANITYLOG || true
+
+error() {
+ sysctl -w lustre.fail_loc=0
+ log "FAIL: $TESTNAME $@"
+ if [ "$SANITYLOG" ]; then
+ echo "FAIL: $TESTNAME $@" >> $SANITYLOG
+ else
+ exit 1
+ fi
+}
+
+pass() {
+ echo PASS $@
+}
+
+mounted_lustre_filesystems() {
+ awk '($3 ~ "lustre") { print $2 }' /proc/mounts
+}
+MOUNT="`mounted_lustre_filesystems`"
+if [ -z "$MOUNT" ]; then
+ export QUOTA_OPTS="quotaon=ug"
+ sh llmount.sh
+ MOUNT="`mounted_lustre_filesystems`"
+ [ -z "$MOUNT" ] && error "NAME=$NAME not mounted"
+ I_MOUNTED=yes
+fi
+
+[ `echo $MOUNT | wc -w` -gt 1 ] && error "NAME=$NAME mounted more than once"
+
+DIR=${DIR:-$MOUNT}
+[ -z "`echo $DIR | grep $MOUNT`" ] && echo "$DIR not in $MOUNT" && exit 99
+
+LPROC=/proc/fs/lustre
+LOVNAME=`cat $LPROC/llite/*/lov/common_name | tail -n 1`
+OSTCOUNT=`cat $LPROC/lov/$LOVNAME/numobd`
+STRIPECOUNT=`cat $LPROC/lov/$LOVNAME/stripecount`
+STRIPESIZE=`cat $LPROC/lov/$LOVNAME/stripesize`
+ORIGFREE=`cat $LPROC/lov/$LOVNAME/kbytesavail`
+MAXFREE=${MAXFREE:-$((200000 * $OSTCOUNT))}
+MDS=$(\ls $LPROC/mds 2> /dev/null | grep -v num_refs | tail -n 1)
+TSTDIR="$MOUNT/quota_dir"
+
+build_test_filter
+
+
+# set_blk_tunables(btune_sz)
+set_blk_tunesz() {
+ # set btune size on all obdfilters
+ for i in `ls /proc/fs/lustre/obdfilter/*/quota_btune_sz`; do
+ echo $(($1 * $BLK_SZ)) > $i
+ done
+ # set btune size on mds
+ for i in `ls /proc/fs/lustre/mds/mds*/quota_btune_sz`; do
+ echo $(($1 * $BLK_SZ)) > $i
+ done
+}
+# set_blk_unitsz(bunit_sz)
+set_blk_unitsz() {
+ for i in `ls /proc/fs/lustre/obdfilter/*/quota_bunit_sz`; do
+ echo $(($1 * $BLK_SZ)) > $i
+ done
+ for i in `ls /proc/fs/lustre/mds/mds*/quota_bunit_sz`; do
+ echo $(($1 * $BLK_SZ)) > $i
+ done
+}
+# set_file_tunesz(itune_sz)
+set_file_tunesz() {
+ # set iunit and itune size on all obdfilters
+ for i in `ls /proc/fs/lustre/obdfilter/*/quota_itune_sz`; do
+ echo $1 > $i
+ done
+ # set iunit and itune size on mds
+ for i in `ls /proc/fs/lustre/mds/mds*/quota_itune_sz`; do
+ echo $1 > $i
+ done
+}
+# set_file_unitsz(iunit_sz)
+set_file_unitsz() {
+ for i in `ls /proc/fs/lustre/obdfilter/*/quota_iunit_sz`; do
+ echo $1 > $i
+ done;
+ for i in `ls /proc/fs/lustre/mds/mds*/quota_iunit_sz`; do
+ echo $1 > $i
+ done
+}
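+
+# In rough terms: each server acquires quota from the master in
+# unit-sized chunks (bunit/iunit) and re-acquires once local usage comes
+# within tune (btune/itune) of what it holds, so smaller units tighten
+# the EDQUOT boundary at the cost of more acquire RPCs.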
+
+# These are for testing on a local machine; when sanity-quota.sh runs on
+# a real cluster, ltest should already have set up the test environment:
+#
+# - create the test user/group on all servers with the same id.
+# - set the unit/tune sizes on all servers to reasonable values.
+pre_test() {
+ if [ -z "$NOSETUP" ]; then
+ # set block tunables
+ set_blk_tunesz $BTUNE_SZ
+ set_blk_unitsz $BUNIT_SZ
+ # set file tunables
+ set_file_tunesz $ITUNE_SZ
+ set_file_unitsz $IUNIT_SZ
+ fi
+}
+pre_test
+
+post_test() {
+ if [ -z "$NOSETUP" ]; then
+ # restore block tunables to default size
+ set_blk_unitsz $((1024 * 100))
+ set_blk_tunesz $((1024 * 50))
+ # restore file tunables to default size
+ set_file_unitsz 5000
+ set_file_tunesz 2500
+ fi
+}
+
+setup() {
+ # create local test group
+ GRP="`cat /etc/group | grep "$TSTUSR" | awk -F: '{print $1}'`"
+ if [ -z "$GRP" ]; then
+ groupadd -g $TSTID "$TSTUSR"
+ fi
+ TSTID="`cat /etc/group | grep "$TSTUSR" | awk -F: '{print $3}'`"
+
+ # create test user
+ USR="`cat /etc/passwd | grep "$TSTUSR" | awk -F: '{print $1}'`"
+ if [ -z "$USR" ]; then
+ useradd -u $TSTID -g $TSTID -d /tmp "$TSTUSR"
+ fi
+
+ RUNAS="runas -u $TSTID"
+
+ # create test directory
+ [ -d $TSTDIR ] || mkdir $TSTDIR
+ chmod 777 $TSTDIR
+}
+setup
+
+# set quota
+test_0() {
+ $LFS quotaoff -ug $MOUNT
+ $LFS quotacheck -ug $MOUNT
+
+ $LFS setquota -u $TSTUSR 0 0 0 0 $MOUNT
+ $LFS setquota -g $TSTUSR 0 0 0 0 $MOUNT
+}
+run_test 0 "Set quota ============================="
+
+# block hard limit (normal use and out of quota)
+test_1() {
+ LIMIT=$(( $BUNIT_SZ * $(($OSTCOUNT + 1)) * 10)) # 10 bunits on each server
+ TESTFILE="$TSTDIR/quota_tst10"
+
+ echo " User quota (limit: $LIMIT bytes)"
+ $LFS setquota -u $TSTUSR 0 $LIMIT 0 0 $MOUNT
+
+ $LFS setstripe $TESTFILE 65536 0 1
+ chown $TSTUSR.$TSTUSR $TESTFILE
+
+ echo " Write ..."
+ $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$(($LIMIT/2)) > /dev/null 2>&1 || error "(usr) write failure, but expect success"
+ echo " Done"
+ echo " Write out of block quota ..."
+ # this write may only hit the cache, so ignore its failure
+ $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$(($LIMIT/2)) seek=$(($LIMIT/2)) > /dev/null 2>&1 || echo " " > /dev/null
+ # flush cache, ensure the noquota flag is set on the client
+ sync; sleep 1; sync;
+ $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$BUNIT_SZ seek=$LIMIT > /dev/null 2>&1 && error "(usr) write success, but expect EDQUOT"
+ echo " EDQUOT"
+
+ rm -f $TESTFILE
+
+ echo " Group quota (limit: $LIMIT bytes)"
+ $LFS setquota -u $TSTUSR 0 0 0 0 $MOUNT # clear user limit
+ $LFS setquota -g $TSTUSR 0 $LIMIT 0 0 $MOUNT
+ TESTFILE="$TSTDIR/quota_tst11"
+
+ $LFS setstripe $TESTFILE 65536 0 1
+ chown $TSTUSR.$TSTUSR $TESTFILE
+
+ echo " Write ..."
+ $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$(($LIMIT/2)) > /dev/null 2>&1 || error "(grp) write failure, but expect success"
+ echo " Done"
+ echo " Write out of block quota ..."
+ # this write may only hit the cache, so ignore its failure
+ $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$(($LIMIT/2)) seek=$(($LIMIT/2)) > /dev/null 2>&1 || echo " " > /dev/null
+ sync; sleep 1; sync;
+ $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$BUNIT_SZ seek=$LIMIT > /dev/null 2>&1 && error "(grp) write success, but expect EDQUOT"
+ echo " EDQUOT"
+
+ # cleanup
+ rm -f $TESTFILE
+ $LFS setquota -g $TSTUSR 0 0 0 0 $MOUNT
+}
+run_test 1 "Block hard limit (normal use and out of quota) ==="
+
+# file hard limit (normal use and out of quota)
+test_2() {
+ LIMIT=$(($IUNIT_SZ * 10)) # 10 iunits on mds
+ TESTFILE="$TSTDIR/quota_tstr20"
+
+ echo " User quota (limit: $LIMIT files)"
+ $LFS setquota -u $TSTUSR 0 0 0 $LIMIT $MOUNT
+
+ echo " Create $LIMIT files ..."
+ for i in `seq ${LIMIT}`; do
+ $RUNAS touch ${TESTFILE}_$i > /dev/null 2>&1 || error "(usr) touch failure, but expect success"
+ done
+ echo " Done"
+ echo " Create out of file quota ..."
+ $RUNAS touch ${TESTFILE}_xxx > /dev/null 2>&1 && error "(usr) touch success, but expect EDQUOT"
+ echo " EDQUOT"
+
+ for i in `seq ${LIMIT}`; do
+ rm -f ${TESTFILE}_$i
+ done
+
+ echo " Group quota (limit: $LIMIT files)"
+ $LFS setquota -u $TSTUSR 0 0 0 0 $MOUNT # clear user limit
+ $LFS setquota -g $TSTUSR 0 0 0 $LIMIT $MOUNT
+ TESTFILE="$TSTDIR/quota_tst21"
+
+ echo " Create $LIMIT files ..."
+ for i in `seq ${LIMIT}`; do
+ $RUNAS touch ${TESTFILE}_$i > /dev/null 2>&1 || error "(grp) touch failure, but expect success"
+ done
+ echo " Done"
+ echo " Create out of file quota ..."
+ $RUNAS touch ${TESTFILE}_xxx > /dev/null 2>&1 && error "(grp) touch success, but expect EDQUOT"
+ echo " EDQUOT"
+
+ # cleanup
+ for i in `seq ${LIMIT}`; do
+ rm -f ${TESTFILE}_$i
+ done
+ $LFS setquota -g $TSTUSR 0 0 0 0 $MOUNT
+}
+run_test 2 "File hard limit (normal use and out of quota) ==="
+
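+# test_block_soft FILE GRACE: exceed the block soft limit, check that
+# writes still succeed within the grace period, expect EDQUOT after the
+# grace timer expires, and check that unlinking the file stops the timer.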
+test_block_soft() {
+ TESTFILE=$1
+ GRACE=$2
+
+ echo " Write to exceed soft limit"
+ $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$BUNIT_SZ >/dev/null 2>&1 || error "write failure, but expect success"
+ sync; sleep 1; sync;
+
+ echo " Write before timer goes off"
+ $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$BUNIT_SZ seek=$BUNIT_SZ >/dev/null 2>&1 || error "write failure, but expect success"
+ sync; sleep 1; sync;
+ echo " Done"
+
+ echo " Sleep $GRACE seconds ..."
+ sleep $GRACE
+
+ echo " Write after timer goes off"
+ # this write may only hit the cache, so ignore its failure
+ $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$BUNIT_SZ seek=$(($BUNIT_SZ * 2)) >/dev/null 2>&1 || echo " " > /dev/null
+ sync; sleep 1; sync;
+ $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=1 seek=$(($BUNIT_SZ * 3)) >/dev/null 2>&1 && error "write success, but expect EDQUOT"
+ echo " EDQUOT"
+
+ echo " Unlink file to stop timer"
+ rm -f $TESTFILE
+ echo " Done"
+
+ echo " Write ..."
+ $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$BUNIT_SZ >/dev/null 2>&1 || error "write failure, but expect success"
+ echo " Done"
+
+ # cleanup
+ rm -f $TESTFILE
+}
+
+# block soft limit (start timer, timer goes off, stop timer)
+test_3() {
+ LIMIT=$(( $BUNIT_SZ * 2 )) # 1 bunit on mds and 1 bunit on the ost
+ GRACE=10
+
+ echo " User quota (soft limit: $LIMIT bytes grace: $GRACE seconds)"
+ TESTFILE="$TSTDIR/quota_tst30"
+ $LFS setstripe $TESTFILE 65536 0 1
+ chown $TSTUSR.$TSTUSR $TESTFILE
+
+ $LFS setquota -t -u $GRACE $MAX_IQ_TIME $MOUNT
+ $LFS setquota -u $TSTUSR $LIMIT 0 0 0 $MOUNT
+
+ test_block_soft $TESTFILE $GRACE
+ $LFS setquota -u $TSTUSR 0 0 0 0 $MOUNT
+
+ echo " Group quota (soft limit: $LIMIT bytes grace: $GRACE seconds)"
+ TESTFILE="$TSTDIR/quota_tst31"
+ $LFS setstripe $TESTFILE 65536 0 1
+ chown $TSTUSR.$TSTUSR $TESTFILE
+
+ $LFS setquota -t -g $GRACE $MAX_IQ_TIME $MOUNT
+ $LFS setquota -g $TSTUSR $LIMIT 0 0 0 $MOUNT
+
+ test_block_soft $TESTFILE $GRACE
+ $LFS setquota -g $TSTUSR 0 0 0 0 $MOUNT
+}
+run_test 3 "Block soft limit (start timer, timer goes off, stop timer) ==="
+
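+# test_file_soft FILE LIMIT GRACE: the same flow as test_block_soft,
+# but for the inode (file count) soft limit.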
+test_file_soft() {
+ TESTFILE=$1
+ LIMIT=$2
+ GRACE=$3
+
+ echo " Create files to exceed soft limit"
+ for i in `seq $LIMIT`; do
+ $RUNAS touch ${TESTFILE}_$i >/dev/null 2>&1 || error "touch failure, but expect success"
+ done
+ echo " Done"
+
+ echo " Create file before timer goes off"
+ $RUNAS touch ${TESTFILE}_before >/dev/null 2>&1 || error "touch before timer goes off failure, but expect success"
+ echo " Done"
+
+ echo " Sleep $GRACE seconds ..."
+ sleep $GRACE
+
+ echo " Create file after timer goes off"
+ for i in `seq $(($IUNIT_SZ - 1))`; do
+ $RUNAS touch ${TESTFILE}_after_$i >/dev/null 2>&1 || error "touch ${TESTFILE}_after_$i failure, but expect success"
+ done
+ $RUNAS touch ${TESTFILE}_after >/dev/null 2>&1 && error "touch after timer goes off success, but expect EDQUOT"
+ echo " EDQUOT"
+
+ echo " Unlink files to stop timer"
+ for i in `seq $LIMIT`; do
+ rm -f ${TESTFILE}_$i >/dev/null 2>&1 || error "rm ${TESTFILE}_$i failure"
+ done
+ rm -f ${TESTFILE}_before
+ for i in `seq $(($IUNIT_SZ - 1))`; do
+ rm -f ${TESTFILE}_after_$i >/dev/null 2>&1 || error "rm ${TESTFILE}_after_$i failure"
+ done
+ echo " Done"
+
+ echo " Create file"
+ $RUNAS touch ${TESTFILE}_xxx >/dev/null 2>&1 || error "touch after timer stop failure, but expect success"
+ echo " Done"
+
+ # cleanup
+ rm -f ${TESTFILE}_xxx
+}
+
+# file soft limit (start timer, timer goes off, stop timer)
+test_4() {
+ LIMIT=$(($IUNIT_SZ * 10)) # 10 iunits on mds
+ TESTFILE="$TSTDIR/quota_tst40"
+ GRACE=5
+
+ echo " User quota (soft limit: $LIMIT files grace: $GRACE seconds)"
+ $LFS setquota -t -u $MAX_DQ_TIME $GRACE $MOUNT
+ $LFS setquota -u $TSTUSR 0 0 $LIMIT 0 $MOUNT
+
+ test_file_soft $TESTFILE $LIMIT $GRACE
+ $LFS setquota -u $TSTUSR 0 0 0 0 $MOUNT
+
+ echo " Group quota (soft limit: $LIMIT files grace: $GRACE seconds)"
+ $LFS setquota -t -g $MAX_DQ_TIME $GRACE $MOUNT
+ $LFS setquota -g $TSTUSR 0 0 $LIMIT 0 $MOUNT
+ TESTFILE="$TSTDIR/quota_tst41"
+
+ test_file_soft $TESTFILE $LIMIT $GRACE
+ $LFS setquota -g $TSTUSR 0 0 0 0 $MOUNT
+
+ # cleanup
+ $LFS setquota -t -u $MAX_DQ_TIME $MAX_IQ_TIME $MOUNT
+ $LFS setquota -t -g $MAX_DQ_TIME $MAX_IQ_TIME $MOUNT
+}
+run_test 4 "File soft limit (start timer, timer goes off, stop timer) ==="
+
+# chown & chgrp (chown & chgrp successfully even out of block/file quota)
+test_5() {
+ BLIMIT=$(( $BUNIT_SZ * $((OSTCOUNT + 1)) * 10)) # 10 bunits on each server
+ ILIMIT=$(( $IUNIT_SZ * 10 )) # 10 iunits on mds
+
+ echo " Set quota limit (0 $BLIMIT 0 $ILIMIT) for $TSTUSR.$TSTUSR"
+ $LFS setquota -u $TSTUSR 0 $BLIMIT 0 $ILIMIT $MOUNT
+ $LFS setquota -g $TSTUSR 0 $BLIMIT 0 $ILIMIT $MOUNT
+
+ echo " Create more than $ILIMIT files and alloc more than $BLIMIT blocks ..."
+ for i in `seq $(($ILIMIT + 1))`; do
+ touch $TSTDIR/quota_tst50_$i > /dev/null 2>&1 || error "touch failure, expect success"
+ done
+ dd if=/dev/zero of=$TSTDIR/quota_tst50_1 bs=$BLK_SZ count=$(($BLIMIT+1)) > /dev/null 2>&1 || error "write failure, expect success"
+
+ echo " Chown files to $TSTUSR.$TSTUSR ..."
+ for i in `seq $(($ILIMIT + 1))`; do
+ chown $TSTUSR.$TSTUSR $TSTDIR/quota_tst50_$i > /dev/null 2>&1 || error "chown failure, but expect success"
+ done
+
+ # cleanup
+ for i in `seq $(($ILIMIT + 1))`; do
+ rm -f $TSTDIR/quota_tst50_$i
+ done
+ $LFS setquota -u $TSTUSR 0 0 0 0 $MOUNT
+ $LFS setquota -g $TSTUSR 0 0 0 0 $MOUNT
+}
+run_test 5 "Chown & chgrp (chown & chgrp successfully even out of block/file quota) ==="
+
+# block quota acquire & release
+test_6() {
+ if [ $OSTCOUNT -lt 2 ]; then
+ echo "WARN: too few osts, skip this test."
+ return 0;
+ fi
+
+ LIMIT=$(($BUNIT_SZ * $(($OSTCOUNT + 1)) * 10)) # 10 bunits per server
+ FILEA="$TSTDIR/quota_tst60_a"
+ FILEB="$TSTDIR/quota_tst60_b"
+
+ echo " Set block limit $LIMIT bytes to $TSTUSR.$TSTUSR"
+ $LFS setquota -u $TSTUSR 0 $LIMIT 0 0 $MOUNT
+ $LFS setquota -g $TSTUSR 0 $LIMIT 0 0 $MOUNT
+
+ echo " Create filea on OST0 and fileb on OST1"
+ $LFS setstripe $FILEA 65536 0 1
+ $LFS setstripe $FILEB 65536 1 1
+ chown $TSTUSR.$TSTUSR $FILEA
+ chown $TSTUSR.$TSTUSR $FILEB
+
+ echo " Exceed quota limit ..."
+ $RUNAS dd if=/dev/zero of=$FILEB bs=$BLK_SZ count=$(($LIMIT - $BUNIT_SZ * $OSTCOUNT)) >/dev/null 2>&1 || error "write fileb failure, but expect success"
+ sync; sleep 1; sync;
+ $RUNAS dd if=/dev/zero of=$FILEB bs=$BLK_SZ seek=$LIMIT count=$BUNIT_SZ >/dev/null 2>&1 && error "write fileb success, but expect EDQUOT"
+ sync; sleep 1; sync;
+ echo " Write to OST0 return EDQUOT"
+ # this write may only hit the cache, so ignore its failure
+ $RUNAS dd if=/dev/zero of=$FILEA bs=$BLK_SZ count=$(($BUNIT_SZ * 2)) >/dev/null 2>&1 || echo " " > /dev/null
+ sync; sleep 1; sync;
+ $RUNAS dd if=/dev/zero of=$FILEA bs=$BLK_SZ count=$(($BUNIT_SZ * 2)) seek=$(($BUNIT_SZ *2)) >/dev/null 2>&1 && error "write filea success, but expect EDQUOT"
+ echo " EDQUOT"
+
+ echo " Remove fileb to let OST1 release quota"
+ rm -f $FILEB
+
+ echo " Write to OST0"
+ $RUNAS dd if=/dev/zero of=$FILEA bs=$BLK_SZ count=$(($LIMIT - $BUNIT_SZ * $OSTCOUNT)) >/dev/null 2>&1 || error "write filea failure, expect success"
+ echo " Done"
+
+ # cleanup
+ rm -f $FILEA
+ $LFS setquota -u $TSTUSR 0 0 0 0 $MOUNT
+ $LFS setquota -g $TSTUSR 0 0 0 0 $MOUNT
+ return 0
+}
+run_test 6 "Block quota acquire & release ========="
+
+# quota recovery (block quota only by now)
+test_7()
+{
+ if [ -z "`lsmod|grep mds`" ]; then
+ echo "WARN: no local mds, skip this test"
+ return 0
+ fi
+
+ LIMIT=$(( $BUNIT_SZ * $(($OSTCOUNT + 1)) * 10)) # 10 bunits on each server
+ TESTFILE="$TSTDIR/quota_tst70"
+
+ $LFS setquota -u $TSTUSR 0 $LIMIT 0 0 $MOUNT
+
+ $LFS setstripe $TESTFILE 65536 0 1
+ chown $TSTUSR.$TSTUSR $TESTFILE
+
+ echo " Write to OST0..."
+ $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$BUNIT_SZ >/dev/null 2>&1 || error "write failure, but expect success"
+
+ #define OBD_FAIL_OBD_DQACQ 0x604
+ echo 0x604 > /proc/sys/lustre/fail_loc
+ echo " Remove files on OST0"
+ rm -f $TESTFILE
+ echo 0 > /proc/sys/lustre/fail_loc
+
+ echo " Trigger recovery..."
+ OSC0_UUID="`$LCTL dl | awk '/.* OSC_[^ ]+_OST.* / { print $1 }'`"
+ [ -z "$OSC0_UUID" ] && OSC0_UUID="`$LCTL dl | awk '/.* OSC_[^ ]+_ost1.* / { print $1 }'`"
+ for i in $OSC0_UUID; do
+ $LCTL --device $i activate > /dev/null 2>&1 || error "activate osc failed!"
+ done
+
+ # sleep a while to wait for recovery done
+ sleep 20
+
+ # check limits
+ PATTERN="`echo $MOUNT | sed 's/\//\\\\\//g'`"
+ TOTAL_LIMIT="`$LFS quota -u $TSTUSR $MOUNT | awk '/^.*'$PATTERN'.*[[:digit:]+][[:space:]+]/ { print $4 }'`"
+ [ $TOTAL_LIMIT -eq $LIMIT ] || error "total limits not recovered!"
+ echo " total limits = $TOTAL_LIMIT"
+
+ OST0_UUID=`$LCTL dl | awk '/.*OST_[^ ]+_UUID.* / { print $5 }'`
+ [ -z "$OST0_UUID" ] && OST0_UUID=`$LCTL dl | awk '/.*ost1_[^ ]*UUID.* / { print $5 }'`
+ OST0_LIMIT="`$LFS quota -o $OST0_UUID -u $TSTUSR $MOUNT | awk '/^.*[[:digit:]+][[:space:]+]/ { print $3 }'`"
+ [ $OST0_LIMIT -eq $BUNIT_SZ ] || error "high limits not released!"
+ echo " limits on $OST0_UUID = $OST0_LIMIT"
+
+ # cleanup
+ $LFS setquota -u $TSTUSR 0 0 0 0 $MOUNT
+}
+run_test 7 "Quota recovery (only block limit) ======"
+
+# run dbench with quota enabled
+test_8() {
+ BLK_LIMIT=$((100 * 1024 * 1024)) # 100G
+ FILE_LIMIT=1000000
+ DBENCH_LIB=${DBENCH_LIB:-/usr/lib/dbench}
+
+ [ ! -d $DBENCH_LIB ] && echo "dbench not installed, skip this test" && return 0
+
+ echo " Set enough high limit for user: $TSTUSR"
+ $LFS setquota -u $TSTUSR 0 $BLK_LIMIT 0 $FILE_LIMIT $MOUNT
+ echo " Set enough high limit for group: $TSTUSR"
+ $LFS setquota -g $USER 0 $BLK_LIMIT 0 $FILE_LIMIT $MOUNT
+
+
+ TGT=$TSTDIR/client.txt
+ SRC=${SRC:-$DBENCH_LIB/client.txt}
+ [ ! -s $TGT -a -s $SRC ] && echo "copying $SRC to $TGT" && cp $SRC $TGT
+ SRC=$DBENCH_LIB/client_plain.txt
+ [ ! -s $TGT -a -s $SRC ] && echo "copying $SRC to $TGT" && cp $SRC $TGT
+
+ SAVE_PWD=$PWD
+ cd $TSTDIR
+ $RUNAS dbench -c client.txt 3
+ RC=$?
+
+ cd $SAVE_PWD
+ return $RC
+}
+run_test 8 "Run dbench with quota enabled ==========="
+
+# turn off quota
+test_9()
+{
+ $LFS quotaoff $MOUNT
+ return 0
+}
+run_test 9 "Quota off ==============================="
+
+
+log "cleanup: ======================================================"
+if [ "`mount | grep ^$NAME`" ]; then
+ rm -fr $TSTDIR
+ post_test
+ # delete test user and group
+ userdel "$TSTUSR"
+ if [ "$I_MOUNTED" = "yes" ]; then
+ sh llmountcleanup.sh || error "llmountcleanup failed"
+ fi
+fi
+
+echo '=========================== finished ==============================='
+[ -f "$SANITYLOG" ] && cat $SANITYLOG && exit 1 || true
+
ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"42a 42c 45 68"}
# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
-[ "$SLOW" = "no" ] && EXCEPT="$EXCEPT 24o 51b 51c 64b 71 101"
+[ "$SLOW" = "no" ] && EXCEPT="$EXCEPT 24o 27m 51b 51c 64b 71 101"
case `uname -r` in
2.4*) FSTYPE=${FSTYPE:-ext3} ;;
*) error "unsupported kernel" ;;
esac
-[ "$ALWAYS_EXCEPT$EXCEPT" ] && \
- echo "Skipping tests: `echo $ALWAYS_EXCEPT $EXCEPT`"
+[ "$ALWAYS_EXCEPT$EXCEPT$SANITY_EXCEPT" ] && \
+ echo "Skipping tests: `echo $ALWAYS_EXCEPT $EXCEPT $SANITY_EXCEPT`"
SRCDIR=`dirname $0`
export PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH:/sbin
IOPENTEST2=${IOPENTEST2:-iopentest2}
MEMHOG=${MEMHOG:-memhog}
DIRECTIO=${DIRECTIO:-directio}
+ACCEPTOR_PORT=${ACCEPTOR_PORT:-988}
+UMOUNT=${UMOUNT:-"umount -d"}
if [ $UID -ne 0 ]; then
echo "Warning: running as non-root uid $UID"
BEFORE=`date +%s`
log "== test $1: $2= `date +%H:%M:%S` ($BEFORE)"
export TESTNAME=test_$1
+ export tfile=f${testnum}
+ export tdir=d${base}
test_$1 || error "exit with rc=$?"
unset TESTNAME
pass "($((`date +%s` - $BEFORE))s)"
for O in $ONLY; do
eval ONLY_${O}=true
done
- for E in $EXCEPT $ALWAYS_EXCEPT; do
+ for E in $EXCEPT $ALWAYS_EXCEPT $SANITY_EXCEPT; do
eval EXCEPT_${E}=true
done
}
error() {
sysctl -w lustre.fail_loc=0
log "FAIL: $TESTNAME $@"
+ $LCTL dk $TMP/lustre-log-$TESTNAME.log
if [ "$SANITYLOG" ]; then
echo "FAIL: $TESTNAME $@" >> $SANITYLOG
else
test_24m() {
f="$DIR/f24m"
- multiop $f OcLN ${f}2 ${f}2 || error
+ multiop $f OcLN ${f}2 ${f}2 || error "link ${f}2 ${f}2 failed"
+ # on ext3 this does not remove either the source or target files
+ # though the "expected" operation would be to remove the source
+ $CHECKSTAT -t file ${f} || error "${f} missing"
+ $CHECKSTAT -t file ${f}2 || error "${f}2 missing"
}
run_test 24m "Renaming a file to a hard link to itself ========="
mkdir -p $DIR/d27
$LSTRIPE $DIR/d27/fdef 0 -1 0 || error "lstripe failed"
$CHECKSTAT -t file $DIR/d27/fdef || error "checkstat failed"
- #dd if=/dev/zero of=$DIR/d27/fdef bs=4k count=4 || error
+ dd if=/dev/zero of=$DIR/d27/fdef bs=4k count=4 || error
}
run_test 27d "create file with default settings ================"
reset_enospc
rm -f $DIR/d27/f27o
exhaust_all_precreations 0x215
+ sleep 5
touch $DIR/d27/f27o && error
mkdir -p $DIR/d32a/ext2-mountpoint
mount -t ext2 -o loop $EXT2_DEV $DIR/d32a/ext2-mountpoint || error
$CHECKSTAT -t dir $DIR/d32a/ext2-mountpoint/.. || error
- umount $DIR/d32a/ext2-mountpoint || error
+ $UMOUNT $DIR/d32a/ext2-mountpoint || error
}
run_test 32a "stat d32a/ext2-mountpoint/.. ====================="
mkdir -p $DIR/d32b/ext2-mountpoint
mount -t ext2 -o loop $EXT2_DEV $DIR/d32b/ext2-mountpoint || error
ls -al $DIR/d32b/ext2-mountpoint/.. || error
- umount $DIR/d32b/ext2-mountpoint || error
+ $UMOUNT $DIR/d32b/ext2-mountpoint || error
}
run_test 32b "open d32b/ext2-mountpoint/.. ====================="
mount -t ext2 -o loop $EXT2_DEV $DIR/d32c/ext2-mountpoint || error
mkdir -p $DIR/d32c/d2/test_dir
$CHECKSTAT -t dir $DIR/d32c/ext2-mountpoint/../d2/test_dir || error
- umount $DIR/d32c/ext2-mountpoint || error
+ $UMOUNT $DIR/d32c/ext2-mountpoint || error
}
run_test 32c "stat d32c/ext2-mountpoint/../d2/test_dir ========="
mount -t ext2 -o loop $EXT2_DEV $DIR/d32d/ext2-mountpoint || error
mkdir -p $DIR/d32d/d2/test_dir
ls -al $DIR/d32d/ext2-mountpoint/../d2/test_dir || error
- umount $DIR/d32d/ext2-mountpoint || error
+ $UMOUNT $DIR/d32d/ext2-mountpoint || error
}
run_test 32d "open d32d/ext2-mountpoint/../d2/test_dir ========="
mount -t ext2 -o loop $EXT2_DEV $DIR/d32i/ext2-mountpoint || error
touch $DIR/d32i/test_file
$CHECKSTAT -t file $DIR/d32i/ext2-mountpoint/../test_file || error
- umount $DIR/d32i/ext2-mountpoint || error
+ $UMOUNT $DIR/d32i/ext2-mountpoint || error
}
run_test 32i "stat d32i/ext2-mountpoint/../test_file ==========="
mount -t ext2 -o loop $EXT2_DEV $DIR/d32j/ext2-mountpoint || error
touch $DIR/d32j/test_file
cat $DIR/d32j/ext2-mountpoint/../test_file || error
- umount $DIR/d32j/ext2-mountpoint || error
+ $UMOUNT $DIR/d32j/ext2-mountpoint || error
}
run_test 32j "open d32j/ext2-mountpoint/../test_file ==========="
mkdir -p $DIR/d32k/d2
touch $DIR/d32k/d2/test_file || error
$CHECKSTAT -t file $DIR/d32k/ext2-mountpoint/../d2/test_file || error
- umount $DIR/d32k/ext2-mountpoint || error
+ $UMOUNT $DIR/d32k/ext2-mountpoint || error
}
run_test 32k "stat d32k/ext2-mountpoint/../d2/test_file ========"
mkdir -p $DIR/d32l/d2
touch $DIR/d32l/d2/test_file
cat $DIR/d32l/ext2-mountpoint/../d2/test_file || error
- umount $DIR/d32l/ext2-mountpoint || error
+ $UMOUNT $DIR/d32l/ext2-mountpoint || error
}
run_test 32l "open d32l/ext2-mountpoint/../d2/test_file ========"
touch $DIR/d32q/under_the_mount
mount -t ext2 -o loop $EXT2_DEV $DIR/d32q
ls $DIR/d32q/under_the_mount && error || true
- umount $DIR/d32q || error
+ $UMOUNT $DIR/d32q || error
}
run_test 32q "stat follows mountpoints in Lustre (should return error)"
touch $DIR/d32r/under_the_mount
mount -t ext2 -o loop $EXT2_DEV $DIR/d32r
ls $DIR/d32r | grep -q under_the_mount && error || true
- umount $DIR/d32r || error
+ $UMOUNT $DIR/d32r || error
}
run_test 32r "opendir follows mountpoints in Lustre (should return error)"
rm -fr $DIR/d33
mkdir -p $DIR/d33
chown $RUNAS_ID $DIR/d33
- $RUNAS $OPENFILE -f O_RDWR:O_CREAT -m 0444 $DIR/d33/f33 || error
- $RUNAS $OPENFILE -f O_RDWR:O_CREAT -m 0444 $DIR/d33/f33 && error || true
+ $RUNAS $OPENFILE -f O_RDWR:O_CREAT -m 0444 $DIR/d33/f33|| error "create"
+ $RUNAS $OPENFILE -f O_RDWR:O_CREAT -m 0444 $DIR/d33/f33 && \
+ error "open RDWR" || true
}
run_test 33a "test open file(mode=0444) with O_RDWR (should return error)"
echo f > $DIR/dextra/fbugfile
mount -t ext2 -o loop $EXT2_DEV $DIR/dextra
ls $DIR/dextra | grep "\<fbugfile\>" && error
- umount $DIR/dextra || error
+ $UMOUNT $DIR/dextra || error
rm -f $DIR/dextra/fbugfile || error
}
run_test 37 "ls a mounted file system to check old content ====="
# ls -lc $DIR/test_39_file $DIR/test_39_file2
sleep 2
$OPENFILE -f O_CREAT:O_TRUNC:O_WRONLY $DIR/test_39_file2
-# ls -l $DIR/test_39_file $DIR/test_39_file2
-# ls -lu $DIR/test_39_file $DIR/test_39_file2
-# ls -lc $DIR/test_39_file $DIR/test_39_file2
- [ $DIR/test_39_file2 -nt $DIR/test_39_file ] || error
+ if [ ! $DIR/test_39_file2 -nt $DIR/test_39_file ]; then
+ echo "mtime"
+ ls -l $DIR/test_39_file $DIR/test_39_file2
+ echo "atime"
+ ls -lu $DIR/test_39_file $DIR/test_39_file2
+ echo "ctime"
+ ls -lc $DIR/test_39_file $DIR/test_39_file2
+ error "O_TRUNC didn't change timestamps"
+ fi
}
run_test 39 "mtime changed on create ==========================="
WRITEBACK_SAVE=500
start_writeback() {
+ trap 0
# in 2.6, restore /proc/sys/vm/dirty_writeback_centisecs
if [ -f /proc/sys/vm/dirty_writeback_centisecs ]; then
echo $WRITEBACK_SAVE > /proc/sys/vm/dirty_writeback_centisecs
run_test 42d "test complete truncate of file with cached dirty data"
test_43() {
- mkdir $DIR/d43
- cp -p /bin/ls $DIR/d43/f
- exec 100>> $DIR/d43/f
- $DIR/d43/f && error || true
+ mkdir $DIR/$tdir
+ cp -p /bin/ls $DIR/$tdir/$tfile
+ exec 100>> $DIR/$tdir/$tfile
+ $DIR/$tdir/$tfile && error || true
exec 100<&-
}
run_test 43 "execution of file opened for write should return -ETXTBSY"
[ -f $DIR/d52a/foo ] && chattr -a $DIR/d52a/foo
mkdir -p $DIR/d52a
touch $DIR/d52a/foo
- chattr =a $DIR/d52a/foo || error
- echo bar >> $DIR/d52a/foo || error
- cp /etc/hosts $DIR/d52a/foo && error
- rm -f $DIR/d52a/foo 2>/dev/null && error
- link $DIR/d52a/foo $DIR/d52a/foo_link 2>/dev/null && error
- echo foo >> $DIR/d52a/foo || error
- mrename $DIR/d52a/foo $DIR/d52a/foo_ren && error
- lsattr $DIR/d52a/foo | egrep -q "^-+a-+ $DIR/d52a/foo" || error
- chattr -a $DIR/d52a/foo || error
-
- rm -fr $DIR/d52a || error
+ chattr =a $DIR/d52a/foo || error "chattr =a failed"
+ echo bar >> $DIR/d52a/foo || error "append bar failed"
+ cp /etc/hosts $DIR/d52a/foo && error "cp worked"
+ rm -f $DIR/d52a/foo 2>/dev/null && error "rm worked"
+ link $DIR/d52a/foo $DIR/d52a/foo_link 2>/dev/null && error "link worked"
+ echo foo >> $DIR/d52a/foo || error "append foo failed"
+ mrename $DIR/d52a/foo $DIR/d52a/foo_ren && error "rename worked"
+ lsattr $DIR/d52a/foo | egrep -q "^-+a-+ $DIR/d52a/foo" || error "lsattr"
+ chattr -a $DIR/d52a/foo || error "chattr -a failed"
+
+ rm -fr $DIR/d52a || error "cleanup rm failed"
}
run_test 52a "append-only flag test (should return errors) ====="
$SOCKETCLIENT $DIR/socket || error
$MUNLINK $DIR/socket
}
-run_test 54a "unix damain socket test =========================="
+run_test 54a "unix domain socket test =========================="
test_54b() {
f="$DIR/f54b"
tfile="$DIR/f54c"
tdir="$DIR/d54c"
loopdev="$DIR/loop54c"
-
+
find_loop_dev
[ -z "$LOOPNUM" ] && echo "couldn't find empty loop device" && return
mknod $loopdev b 7 $LOOPNUM
- echo "make a loop file system with $tfile on $loopdev ($LOOPNUM)..."
+ echo "make a loop file system with $tfile on $loopdev ($LOOPNUM)..."
dd if=/dev/zero of=$tfile bs=`page_size` seek=1024 count=1 > /dev/null
losetup $loopdev $tfile || error "can't set up $loopdev for $tfile"
mkfs.ext2 $loopdev || error "mke2fs on $loopdev"
dd if=/dev/zero of=$tdir/tmp bs=`page_size` count=30 || error "dd write"
df $tdir
dd if=$tdir/tmp of=/dev/zero bs=`page_size` count=30 || error "dd read"
- umount $tdir
+ $UMOUNT $tdir
losetup -d $loopdev
rm $loopdev
}
$IOPENTEST2 $DIR/d55 || error "running $IOPENTEST2"
echo "check for $EXT2_DEV. Please wait..."
rm -rf $DIR/d55/*
- umount $DIR/d55 || error "unmounting"
+ $UMOUNT $DIR/d55 || error "unmounting"
}
run_test 55 "check iopen_connect_dentry() ======================"
for i in /proc/fs/lustre/osc/*/max_dirty_mb ; do
echo $MAX_DIRTY_MB > $i
done
- true
+ rm -f $DIR/f63 || true
}
run_test 63 "Verify oig_wait interruption does not crash ======="
$LSTRIPE $DIR/d65 $(($STRIPESIZE * 2)) 0 1 || error "setstripe"
$LSTRIPE -d $DIR/d65 || error "setstripe"
$LFS find -v $DIR/d65 | grep "$DIR/d65/ has no stripe info" || \
- error "delete default stripe failed"
+ error "delete default stripe failed"
}
run_test 65g "directory setstripe -d ==========================="
run_test 67 "supplementary group failure (should return error) ="
cleanup_68() {
+ trap 0
if [ "$LOOPDEV" ]; then
swapoff $LOOPDEV || error "swapoff failed"
losetup -d $LOOPDEV || error "losetup -d failed"
mkdir $DIR$LIB71 || error "can't create $DIR$LIB71"
cp $LIB71/libc* $DIR$LIB71 || error "can't copy $LIB71/libc*"
cp $LIB71/ld-* $DIR$LIB71 || error "can't create $LIB71/ld-*"
-
+
echo "chroot $DIR /dbench -c client.txt 2"
chroot $DIR /dbench -c client.txt 2
RC=$?
}
run_test 74 "ldlm_enqueue freed-export error path (shouldn't LBUG)"
+JOIN=${JOIN:-"lfs join"}
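+# test_75: joining B onto A should behave like "cat B >> A; rm B", so
+# each join result is diffed against a reference built with plain cat.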
+test_75() {
+ F=$DIR/$tfile
+ F128k=${F}_128k
+ FHEAD=${F}_head
+ FTAIL=${F}_tail
+ rm -f $F*
+
+ dd if=/dev/urandom of=${F}_128k bs=1024 count=128 || error "dd failed"
+ chmod 777 ${F128k}
+ cp -p ${F128k} ${FHEAD}
+ cp -p ${F128k} ${FTAIL}
+ cat ${F128k} ${F128k} > ${F}_sim_sim
+
+ $JOIN ${FHEAD} ${FTAIL} || error "join ${FHEAD} ${FTAIL} error"
+ diff -u ${FHEAD} ${F}_sim_sim || error "${FHEAD} ${F}_sim_sim differ"
+ $CHECKSTAT -a ${FTAIL} || error "tail ${FTAIL} still exist after join"
+
+ cp -p ${F128k} ${FTAIL}
+ cat ${F}_sim_sim >> ${F}_join_sim
+ cat ${F128k} >> ${F}_join_sim
+ $JOIN ${FHEAD} ${FTAIL} || error "join ${FHEAD} ${FTAIL} error"
+ diff -u ${FHEAD} ${F}_join_sim || \
+ error "${FHEAD} ${F}_join_sim are different"
+ $CHECKSTAT -a ${FTAIL} || error "tail ${FTAIL} exist after join"
+
+ cp -p ${F128k} ${FTAIL}
+ cat ${F128k} >> ${F}_sim_join
+ cat ${F}_join_sim >> ${F}_sim_join
+ $JOIN ${FTAIL} ${FHEAD} || error "join error"
+ diff -u ${FTAIL} ${F}_sim_join || \
+ error "${FTAIL} ${F}_sim_join are different"
+ $CHECKSTAT -a ${FHEAD} || error "tail ${FHEAD} exist after join"
+
+ cp -p ${F128k} ${FHEAD}
+ cp -p ${F128k} ${FHEAD}_tmp
+ cat ${F}_sim_sim >> ${F}_join_join
+ cat ${F}_sim_join >> ${F}_join_join
+ $JOIN ${FHEAD} ${FHEAD}_tmp || error "join ${FHEAD} ${FHEAD}_tmp error"
+ $JOIN ${FHEAD} ${FTAIL} || error "join ${FHEAD} ${FTAIL} error"
+ diff -u ${FHEAD} ${F}_join_join ||error "${FHEAD} ${F}_join_join differ"
+ $CHECKSTAT -a ${FHEAD}_tmp || error "${FHEAD}_tmp exist after join"
+ $CHECKSTAT -a ${FTAIL} || error "tail ${FTAIL} exist after join (2)"
+
+ rm -rf ${FHEAD} || error "delete join file error"
+ cp -p ${F128k} ${F}_join_10_compare
+ cp -p ${F128k} ${F}_join_10
+ for ((i = 0; i < 10; i++)); do
+ cat ${F128k} >> ${F}_join_10_compare
+ cp -p ${F128k} ${FTAIL}
+ $JOIN ${F}_join_10 ${FTAIL} || \
+ error "join ${F}_join_10 ${FTAIL} error"
+ $CHECKSTAT -a ${FTAIL} || error "tail file exist after join"
+ done
+ diff -u ${F}_join_10 ${F}_join_10_compare || \
+ error "files ${F}_join_10 ${F}_join_10_compare are different"
+ $LFS getstripe ${F}_join_10
+ $OPENUNLINK ${F}_join_10 ${F}_join_10 || error "files unlink open"
+
+ rm -f $F*
+}
+run_test 75 "TEST join file"
+
+num_inodes() {
+ awk '/lustre_inode_cache|^inode_cache/ {print $2; exit}' /proc/slabinfo
+}
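+# num_inodes prints the active-object count (2nd /proc/slabinfo field)
+# for the lustre (or generic) inode slab.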
+
+test_76() { # bug 1443
+ BEFORE_INODES=`num_inodes`
+ echo "before inodes: $BEFORE_INODES"
+ for i in `seq 1000`; do
+ touch $DIR/$tfile
+ rm -f $DIR/$tfile
+ done
+ AFTER_INODES=`num_inodes`
+ echo "after inodes: $AFTER_INODES"
+ [ $AFTER_INODES -gt $((BEFORE_INODES + 10)) ] && \
+ error "inode slab grew from $BEFORE_INODES to $AFTER_INODES"
+ true
+}
+run_test 76 "destroy duplicate inodes in client inode cache"
+
# on the LLNL clusters, runas will still pick up root's $TMP settings,
# which will not be writable for the runas user, and then you get a CVS
# error message with a corrupt path string (CVS bug) and panic.
netstat -tna | while read PROT SND RCV LOCAL REMOTE STAT; do
[ "$PROT" != "tcp" ] && continue
RPORT=`echo $REMOTE | cut -d: -f2`
- [ "$RPORT" != "988" ] && continue
+ [ "$RPORT" != "$ACCEPTOR_PORT" ] && continue
LPORT=`echo $LOCAL | cut -d: -f2`
[ $LPORT -ge 1024 ] && error "local port: $LPORT > 1024" || true
done
cat $LPROC/llite/*/read_ahead_stats
error "too many ($discard) discarded pages"
fi
+ rm -f $DIR/f101 || true
}
run_test 101 "check read-ahead for random reads ==========="
touch $testfile
[ "$UID" != 0 ] && echo "skipping $TESTNAME (must run as root)" && return
- [ -z "`mount | grep " $DIR .*\<user_xattr\>"`" ] && echo "skipping $TESTNAME (must have user_xattr)" && return
+ [ -z "grep \<xattr\> /proc/fs/lustre/mdc/MDC*MNT*/connect_flags" ] && echo "skipping $TESTNAME (must have user_xattr)" && return
echo "set/get xattr..."
setfattr -n trusted.name1 -v value1 $testfile || error
[ "`getfattr -n trusted.name1 $testfile 2> /dev/null | \
}
run_test 102 "user xattr test ====================="
+run_acl_subtest()
+{
+ $SAVE_PWD/acl/run $SAVE_PWD/acl/$1.test
+ return $?
+}
+
+test_103 () {
+ SAVE_UMASK=`umask`
+ umask 0022
+ cd $DIR
+
+ [ "$UID" != 0 ] && echo "skipping $TESTNAME (must run as root)" && return
+ [ -z "`mount | grep " $DIR .*\<acl\>"`" ] && echo "skipping $TESTNAME (must have acl)" && return
+ [ -z "`grep acl /proc/fs/lustre/mdc/MDC*MNT*/connect_flags`" ] && echo "skipping $TESTNAME (must have acl)" && return
+
+ echo "performing cp ..."
+ run_acl_subtest cp || error
+ echo "performing getfacl-noacl..."
+ run_acl_subtest getfacl-noacl || error
+ echo "performing misc..."
+ run_acl_subtest misc || error
+# XXX add back permission test when we support supplementary groups.
+# echo "performing permissions..."
+# run_acl_subtest permissions || error
+ echo "performing setfacl..."
+ run_acl_subtest setfacl || error
+
+ # inheritance test got from HP
+ echo "performing inheritance..."
+ cp $SAVE_PWD/acl/make-tree . || error
+ chmod +x make-tree || error
+ run_acl_subtest inheritance || error
+ rm -f make-tree
+
+ cd $SAVE_PWD
+ umask $SAVE_UMASK
+}
+run_test 103 "==============acl test ============="
+
TMPDIR=$OLDTMPDIR
TMP=$OLDTMP
HOME=$OLDHOME
set -e
ONLY=${ONLY:-"$*"}
-# bug number for skipped test: 1768 3192 4035
-ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"4 14b 14c"}
+# bug number for skipped test: 3192 4035
+ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"14b 14c"}
# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
-[ "$ALWAYS_EXCEPT$EXCEPT" ] && echo "Skipping tests: $ALWAYS_EXCEPT $EXCEPT"
+[ "$ALWAYS_EXCEPT$EXCEPT$SANITYN_EXCEPT" ] && \
+ echo "Skipping tests: `echo $ALWAYS_EXCEPT $EXCEPT $SANITYN_EXCEPT`"
SRCDIR=`dirname $0`
PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH
BEFORE=`date +%s`
log "== test $1: $2= `date +%H:%M:%S` ($BEFORE)"
export TESTNAME=test_$1
+ export tfile=f${testnum}
+ export tdir=d${base}
test_$1 || error "test_$1: exit with rc=$?"
unset TESTNAME
pass "($((`date +%s` - $BEFORE))s)"
echo -n "."
fi
done
- for X in $EXCEPT $ALWAYS_EXCEPT; do
+ for X in $EXCEPT $ALWAYS_EXCEPT $SANITYN_EXCEPT; do
if [ "`echo $1 | grep '\<'$X'[a-z]*\>'`" ]; then
echo "skipping excluded test $1"
return 0
run_test 15 "test out-of-space with multiple writers ==========="
test_16() {
- fsx -c 50 -p 100 -N 2500 $MOUNT1/fsxfile $MOUNT2/fsxfile
+ fsx -c 50 -p 100 -N 2500 -S 0 $MOUNT1/fsxfile $MOUNT2/fsxfile
}
run_test 16 "2500 iterations of dual-mount fsx ================="
[ $CNTD -gt 0 ] && \
error $CNTD" page left in cache after lock cancel" || true
}
-
run_test 20 "test extra readahead page left in cache ===="
+cleanup_21() {
+ trap 0
+ umount $DIR1/d21
+}
+
test_21() { # Bug 5907
mkdir $DIR1/d21
- mount /etc $DIR1/d21 --bind # Poor man's mount.
- rmdir $DIR1/d21 && error "Removed mounted directory"
- rmdir $DIR2/d21 && echo "Removed mounted directory from another mountpoint, needs to be fixed"
- test -d $DIR1/d21 || error "Monted directory disappeared"
- umount $DIR1/d21
+ mount /etc $DIR1/d21 --bind || error "mount failed" # Poor man's mount.
+ trap cleanup_21 EXIT
+ rmdir -v $DIR1/d21 && error "Removed mounted directory"
+ rmdir -v $DIR2/d21 && echo "Removed mounted directory from another mountpoint, needs to be fixed"
+ test -d $DIR1/d21 || error "Mounted directory disappeared"
+ cleanup_21
test -d $DIR2/d21 || test -d $DIR1/d21 && error "Removed dir still visible after umount"
true
}
-
run_test 21 " Try to remove mountpoint on another dir ===="
+JOIN=${JOIN:-"lfs join"}
+test_22() { # Bug 9926
+ mkdir -p $DIR1/d21
+ dd if=/dev/urandom of=$DIR1/d21/128k bs=1024 count=128
+ cp -p $DIR1/d21/128k $DIR1/d21/f_head
+ for ((i=0;i<10;i++)); do
+ cp -p $DIR1/d21/128k $DIR1/d21/f_tail
+ $JOIN $DIR1/d21/f_head $DIR1/d21/f_tail || error "join error"
+ $CHECKSTAT -a $DIR1/d21/f_tail || error "tail file exist after join"
+ done
+ echo aaaaaaaaaaa >> $DIR1/d21/no_joined
+
+ mv $DIR2/d21/f_head $DIR2/
+ munlink $DIR2/f_head || error "unlink joined file error"
+ cat $DIR2/d21/no_joined || error "cat error"
+ rm -rf $DIR2/d21/no_joined || error "unlink normal file error"
+}
+run_test 22 "After joining in one dir, open/close unlink file in another dir"
log "cleanup: ======================================================"
rm -rf $DIR1/[df][0-9]* $DIR1/lnk || true
+++ /dev/null
-# tbox.sh - Shell functions to manage tinderbox build reporting
-# Copyright (C) 2002 Cluster File Systems, Inc.
-# Gord Eagle <gord@clusterfs.com>, 2002-08-22
-
-HOSTNAME=`hostname`
-PROGNAME=`echo "$0" | sed -e 's%^.*/%%'`
-MAILPROG="${MAILPROG-mail}"
-
-TBOX_PHASE=build # or test
-TBOX_STARTTIME=`date +%s`
-TBOX_LOG="${TBOX_LOG-/tmp/tbox.$$.$TBOX_STARTTIME.log}"
-TBOX_BUILDMAIL=tinderbox_builds@lustre.org
-TBOX_BUILDNAME="${TBOX_BUILDNAME-$PROGNAME-$HOSTNAME}"
-
-# Send a status message to the list.
-tbox_status() {
- [ -n "$TBOX_BUILDNAME" -a -n "$TBOX_BUILDMAIL" ] || return 0
- [ "$#" -ge 4 ] || return 1
- if [ "$#" -gt 4 ]; then
- log="$5"
- echo >> $log
- else
- log=
- fi
-
- TREE="$1"
- SUBJECT="$2"
- STATUS="$3"
- TIMENOW="$4"
-
- echo "sending tinderbox mail to $TBOX_BUILDMAIL: $TREE $SUBJECT $STATUS"
-
- TMPFILE="/tmp/tinderbox.boilerplate.$$.$TIMENOW"
-
- cat > $TMPFILE <<-EOF
- tinderbox: tree: $TREE
- tinderbox: starttime: $TBOX_STARTTIME
- tinderbox: timenow: $TIMENOW
- tinderbox: builddate: $TBOX_STARTTIME
- tinderbox: status: $STATUS
- tinderbox: buildname: $TBOX_BUILDNAME
- tinderbox: errorparser: unix
- tinderbox: END
-
-EOF
-
- cat $TMPFILE $log | $MAILPROG -s "build $SUBJECT ($TBOX_BUILDNAME)" $TBOX_BUILDMAIL
- rm -f $TMPFILE
-}
-
-# Send out the failure or success message based on exit status.
-tbox_exit() {
- TREE="$1"
- TAILPID="$2"
- CODE=${3-$?}
- if [ $CODE -eq 0 ]; then
- SUBJECT=successful
- STATUS=success
- else
- SUBJECT=failed
- STATUS="${TBOX_PHASE}_failed"
- fi
-
- # Send off the status message.
- trap 0
- tbox_status "$TREE" "$SUBJECT" "$STATUS"
- rm -f $TBOX_LOG
-
- # Wait for tail to display all output, then finish it.
- sleep 1
- kill $TAILPID
- exit $CODE
-}
-
-# Run a subprogram, but stop it from sending its own tinderbox
-# messages.
-tbox_absorb_log() {
- # This probably doesn't do what you think it does... it only prepends
- # TBOX_LOG= to our arguments.
- set TBOX_LOG= "$@"
-
- # Now evaluate the command.
- eval "$@"
-}
-
-# Start the log for a given tree.
-tbox_start_log() {
- TREE="$1"
-
- # Send status messages to stdout, stderr.
- exec 6>&1 7>&2
-
- [ -n "$TBOX_LOG" ] || return 0
-
- # Initialize the output log file.
- : > $TBOX_LOG
-
- # Send all our output to the log.
- exec >>$TBOX_LOG 2>&1
-
- # Monitor it on the old stdout.
- tail -f $TBOX_LOG 1>&6 &
-
- # Allow tail to print our last output before exiting.
- trap "tbox_exit \"$TREE\" $! 1" 1 2 10 15
- trap "tbox_exit \"$TREE\" $!" 0
-}
-
-
-# Begin writing to the log and send out the initial status.
-# tbox_start TREE
-tbox_start() {
- TREE="$1"
- tbox_start_log "$TREE"
- tbox_status "$TREE" starting building "$TBOX_STARTTIME"
-}
#!/bin/bash
+# vim:expandtab:shiftwidth=4:softtabstop=4:tabstop=4:
set -e
export XMLCONFIG=${XMLCONFIG:-${TESTSUITE}.xml}
export LTESTDIR=${LTESTDIR:-$LUSTRE/../ltest}
- [ -d /r ] && export ROOT=/r
+ [ -d /r ] && export ROOT=${ROOT:-/r}
export TMP=${TMP:-$ROOT/tmp}
export PATH=:$PATH:$LUSTRE/utils:$LUSTRE/tests
export CHECKSTAT="${CHECKSTAT:-checkstat} "
-export FSYTPE=${FSTYPE:-"ext3"}
+export FSTYPE=${FSTYPE:-"ext3"}
+ if [ "$ACCEPTOR_PORT" ]; then
+ export PORT_OPT="--port $ACCEPTOR_PORT"
+ fi
+
# Paths on remote nodes, if different
export RLUSTRE=${RLUSTRE:-$LUSTRE}
export RPWD=${RPWD:-$PWD}
do_facet $facet $LCONF --select ${facet}_svc=${active}_facet \
--node ${active}_facet --ptldebug $PTLDEBUG --subsystem $SUBSYSTEM \
$@ $XMLCONFIG
+ RC=${PIPESTATUS[0]}
+ if [ $RC -ne 0 ]; then
+ # maybe acceptor error, dump tcp port usage
+ netstat -tpn
+ fi
+ return $RC
}
stop() {
}
zconf_mount() {
+ local OPTIONS
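+ # build mount options in a local variable so repeated zconf_mount
+ # calls don't keep prepending "-o" to the global MOUNTOPT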
client=$1
mnt=$2
# Only supply -o to mount if we have options
if [ -n "$MOUNTOPT" ]; then
- MOUNTOPT="-o $MOUNTOPT"
+ OPTIONS="-o $MOUNTOPT"
fi
if [ -x /sbin/mount.lustre ] ; then
- do_node $client mount -t lustre $MOUNTOPT \
+ do_node $client mount -t lustre $OPTIONS \
`facet_nid mds`:/mds_svc/client_facet $mnt || return 1
else
# this is so cheating
do_node $client $LCONF --nosetup --node client_facet $XMLCONFIG > \
/dev/null || return 2
- do_node $client $LLMOUNT $MOUNTOPT \
+ do_node $client $LLMOUNT $OPTIONS \
`facet_nid mds`:/mds_svc/client_facet $mnt || return 4
fi
echo "add facet $facet: `facet_host $facet`"
do_lmc --add node --node ${facet}_facet $@ --timeout $TIMEOUT \
--lustre_upcall $UPCALL --ptldebug $PTLDEBUG --subsystem $SUBSYSTEM
- do_lmc --add net --node ${facet}_facet --nid `facet_nid $facet` --nettype lnet
+ do_lmc --add net --node ${facet}_facet --nid `facet_nid $facet` \
+ --nettype lnet $PORT_OPT
}
add_mds() {
- facet=$1
+ local MOUNT_OPTS
+ local facet=$1
shift
rm -f ${facet}active
add_facet $facet
+ [ "x$MDSOPT" != "x" ] && MOUNT_OPTS="--mountfsoptions $MDSOPT"
do_lmc --add mds --node ${facet}_facet --mds ${facet}_svc \
- --fstype $FSTYPE $* $MDSOPT
+ --fstype $FSTYPE $* $MOUNT_OPTS
}
add_mdsfailover() {
- facet=$1
+ local MOUNT_OPTS
+ local facet=$1
shift
add_facet ${facet}failover --lustre_upcall $UPCALL
+ [ "x$MDSOPT" != "x" ] && MOUNT_OPTS="--mountfsoptions $MDSOPT"
do_lmc --add mds --node ${facet}failover_facet --mds ${facet}_svc \
- --fstype $FSTYPE $* $MDSOPT
+ --fstype $FSTYPE $* $MOUNT_OPTS
}
add_ost() {
}
add_client() {
- facet=$1
+ local MOUNT_OPTS
+ local facet=$1
mds=$2
shift; shift
+ [ "x$CLIENTOPT" != "x" ] && MOUNT_OPTS="--clientoptions $CLIENTOPT"
add_facet $facet --lustre_upcall $UPCALL
- do_lmc --add mtpt --node ${facet}_facet --mds ${mds}_svc $* $CLIENTOPT
+ do_lmc --add mtpt --node ${facet}_facet --mds ${mds}_svc $* $MOUNT_OPTS
}
eval ONLY_${O}=true
done
[ "$EXCEPT$ALWAYS_EXCEPT" ] && \
- log "skipping test `echo $EXCEPT $ALWAYS_EXCEPT`"
+ log "skipping tests: `echo $EXCEPT $ALWAYS_EXCEPT`"
for E in $EXCEPT $ALWAYS_EXCEPT; do
eval EXCEPT_${E}=true
done
+++ /dev/null
-#!/usr/bin/expect
-
-spawn lwizard $argv
-HOSTNAME=`hostname`
-set timeout 3
-expect {
- "overwrite existing" {
- interact
- }
-}
-expect "HOSTNAME for mds"
-send -- "$HOSTNAME\n"
-expect "network INTERFACE"
-send -- "192.168.1.29/24 10.0.0.29/24\n"
-expect "enter the device or loop file name for mds"
-send -- "/tmp/mds\n"
-expect "device SIZE"
-send -- "10000\n"
-expect "configure FAILOVER"
-send -- "n\n"
-expect "HOSTNAME for ost"
-send -- "$HOSTNAME\n"
-expect "network INTERFACE"
-send -- "192.168.1.29/24 10.0.0.29/24\n"
-expect "device or loop file name for ost"
-send -- "/tmp/ost\n"
-expect "device SIZE"
-send -- "10000\n"
-expect "configure FAILOVER"
-send -- "n\n"
-expect "HOSTNAME for ost"
-send -- "\n"
-expect "clients' mountpoint"
-send -- "\n"
-expect "configure another client with multiple network interfaces"
-send -- "y\n"
-expect "HOSTNAME"
-send -- "node\n"
-expect "network interface address"
-send -- "192.168.1.29/24 10.0.0.29/24\n"
-expect "configure another client with multiple network interfaces"
-send -- "n\n"
-expect "Lustre configuration has been written"
-send -- "\n"
-close
MOUNT=${MOUNT:-/mnt/lustre}
FSTYPE=${FSTYPE:-ext3}
-CLIENTOPT="user_xattr,${CLIENTOPT:-""}"
+MDS_MOUNT_OPTS="user_xattr,acl,${MDS_MOUNT_OPTS:-""}"
+CLIENTOPT="user_xattr,acl,${CLIENTOPT:-""}"
NETTYPE=${NETTYPE:-tcp}
NIDTYPE=${NIDTYPE:-$NETTYPE}
+[ "$ACCEPTOR_PORT" ] && PORT_OPT="--port $ACCEPTOR_PORT"
# NOTE - You can't have different MDS/OST nodes and also have clients on the
# MDS/OST nodes without using --endlevel and --startlevel during lconf.
echo -n "adding NET for:"
for NODE in `echo $MDSNODE $OSTNODES $CLIENTS | tr -s " " "\n" | sort -u`; do
echo -n " $NODE"
- ${LMC} -m $config --add net --node $NODE --nid `h2$NIDTYPE $NODE` --nettype $NETTYPE || exit 1
+ ${LMC} -m $config --add net --node $NODE --nid `h2$NIDTYPE $NODE` \
+ --nettype $NETTYPE $PORT_OPT || exit 1
done
# configure mds server
+[ "x$MDS_MOUNT_OPTS" != "x" ] &&
+ MDS_MOUNT_OPTS="--mountfsoptions $MDS_MOUNT_OPTS"
+
echo; echo "adding MDS on: $MDSNODE"
${LMC} -m $config --add mds --node $MDSNODE --mds mds1 --fstype $FSTYPE \
- --dev $MDSDEV --size $MDSSIZE $MDSOPT || exit 10
+ --dev $MDSDEV $MDS_MOUNT_OPTS --size $MDSSIZE $MDSOPT || exit 10
# configure ost
${LMC} -m $config --add lov --lov lov1 --mds mds1 --stripe_sz $STRIPE_BYTES \
LIBPTLCTL := $(top_builddir)/lnet/utils/libptlctl.a
sbin_scripts = lconf lmc llanalyze llstat.pl llobdstat.pl lactive \
- load_ldap.sh lrun lwizard
+ load_ldap.sh lrun
bin_scripts = lfind lstripe
if UTILS
#include <pwd.h>
#include <grp.h>
#include <stdarg.h>
+#include <stddef.h>
#include <syslog.h>
#include <lustre/lustre_user.h>
if config.noexec: return (0, [])
child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
- child.tochild.write(cmds + "\n")
+ child.tochild.write(cmds + "\nq\n")
child.tochild.close()
# From "Python Cookbook" from O'Reilly
return 0
return stat.S_ISBLK(s[stat.ST_MODE])
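+
+# my_realpath(path) resolves a symlink alias to the real device or
+# mount-point path, using os.path.realpath when available and otherwise
+# following up to 20 symlink levels by hand.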
+def my_realpath(path):
+ try:
+ if os.path.islink(path):
+ # get the realpath of the mount point path
+ if 'realpath' in dir(os.path):
+ real_path = os.path.realpath(path)
+ else:
+ real_path = path
+ link_count = 0
+ while os.path.islink(real_path) and (link_count < 20):
+ link_count = link_count + 1
+ path_link = os.readlink(real_path)
+ if os.path.isabs(path_link):
+ real_path = path_link
+ else:
+ real_path = os.path.join(os.path.dirname(real_path), path_link)
+ if link_count > 19:
+ panic("Encountered too many symbolic links resolving path:", path)
+ else:
+ real_path = path
+
+ return real_path
+ except:
+ panic("Fatal error realpath()ing path:", path)
+
+
# build fs according to type
# fixme: dangerous
def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
# get the realpath of the device
# it may be the real device, such as /dev/hda7
# or the hardlink created via mknod for a device
- if 'realpath' in dir(os.path):
- real_dev = os.path.realpath(dev)
- else:
- real_dev = dev
- link_count = 0
- while os.path.islink(real_dev) and (link_count < 20):
- link_count = link_count + 1
- dev_link = os.readlink(real_dev)
- if os.path.isabs(dev_link):
- real_dev = dev_link
- else:
- real_dev = os.path.join(os.path.dirname(real_dev), dev_link)
- if link_count > 19:
- panic("Encountered too many symbolic links resolving block device:", dev)
+ real_dev = my_realpath(dev)
# get the major and minor number of the realpath via ls
# it seems python(os.stat) does not return
#else:
# mountfsoptions = "%s,extents,mballoc" % (mountfsoptions)
elif target == 'mds':
- mountfsoptions = "%s,user_xattr" % (mountfsoptions)
+ if config.user_xattr:
+ mountfsoptions = "%s,user_xattr" % (mountfsoptions)
+ if config.acl:
+ mountfsoptions = "%s,acl" % (mountfsoptions)
return mountfsoptions
return ""
def fs_is_mounted(path):
"""Return true if path is a mounted lustre filesystem"""
try:
+ real_path = my_realpath(path)
+
fp = open('/proc/mounts')
lines = fp.readlines()
fp.close()
for l in lines:
a = string.split(l)
- if a[1] == path and a[2] == 'lustre_lite':
+ if a[1] == real_path and a[2] == 'lustre_lite':
return 1
except IOError, e:
log(e)
if mod_loaded(mod) and not config.noexec:
continue
log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
+ options = ''
if mod == 'lnet':
#For LNET we really need modprobe to load defined LNDs
run('/sbin/modprobe lnet')
- #But if that fails, try insmod anyhow
+ #But if that fails, try insmod anyhow with dev option
+ #accept=all for dev liblustre testing
+ options = 'accept=all'
if src_dir:
module = find_module(src_dir, dev_dir, mod)
if not module:
panic('module not found:', mod)
- (rc, out) = run('/sbin/insmod', module)
+ (rc, out) = run('/sbin/insmod', module, options)
if rc and not mod_loaded(mod):
if rc == 1:
print("Bad module options? Check dmesg.")
# remove any self-ref portals created
lctl.unconfigure_network()
if config.dump:
+ debug('dumping debug log to', config.dump)
# debug hack
lctl.dump(config.dump)
log('unloading the network')
self.nspath = self.db.get_val('nspath', '')
self.mkfsoptions = '-i 4096 ' + self.db.get_val('mkfsoptions', '')
self.mountfsoptions = self.db.get_val('mountfsoptions', '')
+ self.quota = self.db.get_val('quota', '')
# overwrite the orignal MDSDEV name and uuid with the MDS name and uuid
target_uuid = self.db.get_first_ref('target')
mds = self.db.lookup(target_uuid)
self.active = 0
self.inode_size = self.db.get_val_int('inodesize', 0)
+ debug('original inode_size ', self.inode_size)
if self.inode_size == 0:
# find the LOV for this MDS
lovconfig_uuid = mds.get_first_ref('lovconfig')
# self.inode_size = 256
else:
self.inode_size = 512
- debug('stripe_count %d, inode_size %d',
- stripe_count, self.inode_size)
+ debug('stripe_count ', stripe_count,' inode_size ',self.inode_size)
self.target_dev_uuid = self.uuid
self.uuid = target_uuid
# loading modules
+ if self.quota:
+ self.add_lustre_module('quota', 'lquota')
self.add_lustre_module('mdc', 'mdc')
self.add_lustre_module('osc', 'osc')
self.add_lustre_module('lov', 'lov')
print 'MDS mount options: ' + mountfsoptions
lctl.newdev("mds", self.name, self.uuid,
- setup ="%s %s %s %s" %(blkdev, self.fstype, self.name, mountfsoptions))
+ setup ="%s %s %s %s %s" %(blkdev, self.fstype, self.name,
+ mountfsoptions, self.quota))
self.group_upcall = self.db.get_val('group_upcall','')
sys_set_group_upcall(self.name, self.group_upcall)
noexec_opt = ('', '-n')
ret, out = run (sys.argv[0],
noexec_opt[old_noexec == 1],
- " -v --record --nomod",
+ " -v --record --nomod --old_conf",
"--record_log", client_name,
"--record_device", self.name,
"--node", client_name,
#change the mtime of LLOG to match the XML creation time
if toplustreDB.get_mtime():
- mtime = string.atof(toplustreDB.get_mtime())
- runcmd("mkdir /tmp/lustre-XXXX/")
- if is_block(self.devpath):
- ret, out = runcmd("mount %s /tmp/lustre-XXXX/" %self.devpath)
- else:
- ret, out = runcmd("mount -o loop %s /tmp/lustre-XXXX/" %self.devpath)
+ mtime = toplustreDB.get_mtime()
+ debug("changing mtime of LOGS to %s" %mtime)
+ ret, mktemp = runcmd("mktemp /tmp/lustre-cmd.XXXXXXXX")
if ret:
- print out[0]
- try:
- # MOUNT_CONFIGS_DIR
- os.utime("/tmp/lustre-XXXX/CONFIGS", (mtime, mtime))
- except OSError:
- runcmd("umount -f /tmp/lustre-XXXX/")
- panic("Can't adjust config creation time!")
- runcmd("umount -f /tmp/lustre-XXXX/")
- else:
- print "XML file does not contain mtime, skip mtime checking."
+ log(self.module_name, "create mtime LOGS cmdfile failed: ", self.name)
+ else:
+ mtimecmdfile = string.split(mktemp[0])[0]
+ fd = os.open(mtimecmdfile, os.O_RDWR | os.O_CREAT)
+ os.write(fd, "\n\n\n\n\n%s\n\n" %mtime)
+ os.close(fd)
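+ # the cmdfile feeds debugfs' interactive "mi /LOGS" prompts: the
+ # leading blank lines keep existing field values and the %s answers
+ # the mtime prompt (assumed prompt order)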
+ cmd = "debugfs -w -R \"mi /LOGS\" <%s %s" %(mtimecmdfile, self.devpath)
+ ret, outs = runcmd(cmd)
+ os.remove(mtimecmdfile)
+ if ret:
+ print "Can not change mtime of LOGS by debugfs."
def mds_remaining(self):
out = lctl.device_list()
self.devpath = self.db.get_val('devpath', '')
self.size = self.db.get_val_int('devsize', 0)
self.journal_size = self.db.get_val_int('journalsize', 0)
- self.inode_size = self.db.get_val_int('inodesize', 0)
+
+ # now as we store fids in EA on OST we need to make inode bigger
+ self.inode_size = self.db.get_val_int('inodesize', 256)
self.mkfsoptions = self.db.get_val('mkfsoptions', '')
# Allocate fewer inodes on large OST devices. Most filesystems
# can be much more aggressive than this, but by default we can't.
if self.size > 1000000:
self.mkfsoptions = '-i 16384 ' + self.mkfsoptions
self.mountfsoptions = self.db.get_val('mountfsoptions', '')
+ self.quota = self.db.get_val('quota', '')
self.fstype = self.db.get_val('fstype', '')
if sys_get_branch() == '2.4' and self.fstype == 'ldiskfs':
self.target_dev_uuid = self.uuid
self.uuid = target_uuid
# modules
+ if self.quota:
+ self.add_lustre_module('quota', 'lquota')
self.add_lustre_module('ost', 'ost')
# FIXME: should we default to ext3 here?
if self.fstype == 'ldiskfs':
print 'OST mount options: ' + mountfsoptions
lctl.newdev(self.osdtype, self.name, self.uuid,
- setup ="%s %s %s %s" %(blkdev, self.fstype,
- self.failover_ost, mountfsoptions))
+ setup ="%s %s %s %s %s" %(blkdev, self.fstype,
+ self.failover_ost, mountfsoptions,
+ self.quota))
if not is_prepared('OSS'):
lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")
self.vosc = VOSC(obd, client_uuid, self.name)
self.mdc = get_mdc(db, client_uuid, self.name, self.mds_uuid)
+ mds_db = self.db.lookup(self.mds_uuid)
+ quota = mds_db.get_val('quota', '')
+ if quota:
+ self.add_lustre_module('quota', 'lquota')
self.add_lustre_module('mdc', 'mdc')
self.add_lustre_module('llite', 'llite')
extra_error = ""
panic("Error creating " + target_symlink + ": " + e[1] + extra_error)
-# Check mtime of CONFIGS
+# Check mtime of config logs
def doCheckMtime(lustreDB, hosts):
for h in hosts:
node_db = lustreDB.lookup_name(h, 'node')
if s[1].get_class() == 'mdsdev':
mdsdb = s[1]
break
- if mdsdb:
- if lustreDB.get_mtime():
- if config.verbose:
- print "Checking XML modification time"
- devpath = mdsdb.get_val('devpath','')
- xmtime = int(lustreDB.get_mtime())
- runcmd("mkdir /tmp/lustre-XXXX/")
- # mounting ro causes confusing syslog errors
- if is_block(devpath):
- ret, out = runcmd("mount %s /tmp/lustre-XXXX/" %devpath)
- else:
- ret, out = runcmd("mount -o loop %s /tmp/lustre-XXXX/" %devpath)
- if ret:
- print out[0]
- else:
- try:
- out = os.stat("/tmp/lustre-XXXX/CONFIGS")
- except OSError:
- runcmd("umount -f /tmp/lustre-XXXX")
- panic("Warning: Can't read Lustre logs."
- " Please run --write_conf to update.")
- runcmd("umount -f /tmp/lustre-XXXX")
- try:
- kmtime = int(out[8])
- except ValueError:
- kmtime = xmtime
- if xmtime > kmtime :
- debug('xmtime ', xmtime, '> kmtime', kmtime)
- panic("Warning: the startup logs are older than the XML file."
- " Please run --write_conf to update.")
+
+ if mdsdb and lustreDB.get_mtime():
+ debug("Checking XML modification time")
+ devpath = mdsdb.get_val('devpath','')
+ xmtime = string.atol(lustreDB.get_mtime())
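+ # read the mtime of /LOGS straight off the device with debugfs (no
+ # mount needed); the grepped line looks like "mtime: 0x<hex> ...",
+ # and string.atoi(..., 0) below parses the hex field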
+ cmd = "debugfs -c -R 'stat /LOGS' %s 2>&1 | grep mtime" %devpath
+ ret, kmtimes = runcmd(cmd)
+ if ret:
+ log("Can not get mtime info of MDS LOGS directory")
else:
- print "XML file does not contain mtime, skip mtime checking."
+ kmtime = string.atoi(string.split(kmtimes[0])[1], 0)
+ if xmtime > kmtime:
+ debug('xmtime ', xmtime, '> kmtime', kmtime)
+ if config.old_conf:
+ log("Warning: MDS startup logs are older than config %s."
+ " Please run --write_conf on stopped MDS to update."
+ %CONFIG_FILE)
+ else:
+ panic("Error: MDS startup logs are older than config %s."
+ " Please run --write_conf on stopped MDS to update."
+ " Use '--old_conf' to start anyways." %CONFIG_FILE)
+ return
#
# Load profile for
('dump', "Dump the kernel debug log to file before portals is unloaded",
PARAM),
('write_conf', "Save all the client config information on mds."),
+ ('old_conf', "Start up service even though config logs appear outdated."),
('record', "Write config information on mds."),
('record_log', "Name of config record log.", PARAM),
('record_device', "MDS device name that will record the config commands",
mounting (currently OST-only). Can be repeated.""",
PARAMLIST),
('user_xattr', """Enable user_xattr support on MDS""", FLAG, 0),
+ ('acl', """Enable ACL support on MDS""", FLAG, 0),
]
def main():
sys.exit(1)
except CommandError, e:
e.dump()
+ rc = e.rc
+ if rc == 0:
+ rc = 1
- sys.exit(e.rc)
+ sys.exit(rc)
if first_cleanup_error:
#include <errno.h>
#include <pwd.h>
#include <grp.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <dirent.h>
#include <lnet/api-support.h>
#include <lnet/lnetctl.h>
unsigned int libcfs_subsystem_debug = 0;
-#ifdef HAVE_QUOTA_SUPPORT
-
-/* FIXME: Q_SYNC ... commands defined in linux/quota.h seems broken,
- * so define new commands with the value in kernel */
-#define LUSTRE_Q_QUOTAON 0x800002 /* turn quotas on */
-#define LUSTRE_Q_QUOTAOFF 0x800003 /* turn quotas off */
-#define LUSTRE_Q_GETINFO 0x800005 /* get information about quota files */
-#define LUSTRE_Q_SETINFO 0x800006 /* set information about quota files */
-#define LUSTRE_Q_GETQUOTA 0x800007 /* get user quota structure */
-#define LUSTRE_Q_SETQUOTA 0x800008 /* set user quota structure */
-
-/* Where is this stupid thing supposed to be defined? */
-#ifndef USRQUOTA
-# define USRQUOTA 0
-# define GRPQUOTA 1
-#endif
-
-#endif /* HAVE_QUOTA_SUPPORT */
-
/* all functions */
static int lfs_setstripe(int argc, char **argv);
static int lfs_find(int argc, char **argv);
static int lfs_check(int argc, char **argv);
static int lfs_catinfo(int argc, char **argv);
#ifdef HAVE_QUOTA_SUPPORT
-static int lfs_quotachog(int argc, char **argv);
+static int lfs_quotachown(int argc, char **argv);
static int lfs_quotacheck(int argc, char **argv);
static int lfs_quotaon(int argc, char **argv);
static int lfs_quotaoff(int argc, char **argv);
static int lfs_setquota(int argc, char **argv);
static int lfs_quota(int argc, char **argv);
#endif
+static int lfs_join(int argc, char **argv);
/* all available commands */
command_t cmdlist[] = {
"usage: setstripe <filename|dirname> <stripe size> <stripe start> <stripe count>\n"
" or \n"
" setstripe -d <dirname> (to delete default striping)\n"
- "\tstripe size: Number of bytes in each stripe (0 default)\n"
- "\tstripe start: OST index of first stripe (-1 default)\n"
+ "\tstripe size: Number of bytes on each OST (0 filesystem default)\n"
+ "\tstripe start: OST index of first stripe (-1 filesystem default)\n"
"\tstripe count: Number of OSTs to stripe over (0 default, -1 all)"},
{"find", lfs_find, 0,
"To list the extended attributes for a given filename or files in a\n"
"usage: catinfo {keyword} [node name]\n"
"\tkeywords are one of followings: config, deletions.\n"
"\tnode name must be provided when use keyword config."},
+ {"join", lfs_join, 0,
+ "join two lustre files into one - join A, B, will be like cat B >> A & del B\n"
+ "usage: join <filename_A> <filename_B>\n"},
{"osts", lfs_osts, 0, "osts"},
#ifdef HAVE_QUOTA_SUPPORT
- {"quotachog",lfs_quotachog, 0,
- "Change all files owner or group in specified filesystem.\n"
- "usage: quotachog [-i] <filesystem>\n"
+ {"quotachown",lfs_quotachown, 0,
+ "Change files' owner or group on the specified filesystem.\n"
+ "usage: quotachown [-i] <filesystem>\n"
"\t-i: ignore error if file is not exist\n"},
{"quotacheck", lfs_quotacheck, 0,
"Scan the specified filesystem for disk usage, and create,\n"
"usage: setquota [ -u | -g ] <name> <block-softlimit> <block-hardlimit> <inode-softlimit> <inode-hardlimit> <filesystem>\n"
" setquota -t [ -u | -g ] <block-grace> <inode-grace> <filesystem>"},
{"quota", lfs_quota, 0, "Display disk usage and limits.\n"
- "usage: quota -t [ -u |-g ] <filesystem>\n"
- " quota [ -o obd_uuid ] [ -u | -g ] [name] <filesystem>"},
+ "usage: quota [ -o obd_uuid ] [ -u | -g ] [name] <filesystem>"},
#endif
{"help", Parser_help, 0, "help"},
{"exit", Parser_quit, 0, "quit"},
endmntent(fp);
}
+ if (!mnt) {
+ fprintf(stderr, "No suitable Lustre mount found\n");
+ return -1;
+ }
+
rc = llapi_target_check(num_types, obd_types, mnt->mnt_dir);
if (rc)
return rc;
}
+int lfs_join(int argc, char **argv)
+{
+ char *name_head, *name_tail;
+ int fd, rc;
+ loff_t size;
+
+ if (argc != 3)
+ return CMD_HELP;
+ name_head = argv[1];
+ fd = open(name_head, O_WRONLY);
+ if (fd < 0) {
+ fprintf(stderr, "Can not open name_head %s rc=%d\n",
+ name_head, fd);
+ return fd;
+ }
+ size = lseek(fd, 0, SEEK_END);
+ if (size % JOIN_FILE_ALIGN) {
+ fprintf(stderr,"head file %s size %llu must be mutiple of %d\n",
+ name_head, size, JOIN_FILE_ALIGN);
+ rc = -EINVAL;
+ goto out;
+ }
+ name_tail = argv[2];
+ rc = ioctl(fd, LL_IOC_JOIN, name_tail);
+out:
+ close(fd);
+ if (rc) {
+ fprintf(stderr, "Lustre joining files: %s, %s, failed\n",
+ argv[1], argv[2]);
+ }
+ return rc;
+}
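+/* Usage sketch (illustrative, not part of the patch):
+ *   $ lfs join /mnt/lustre/fileA /mnt/lustre/fileB
+ * appends fileB to fileA via the LL_IOC_JOIN ioctl and removes fileB;
+ * fileA's size must already be a multiple of JOIN_FILE_ALIGN. */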
+
#ifdef HAVE_QUOTA_SUPPORT
-static int lfs_quotachog(int argc, char **argv)
+static int lfs_quotachown(int argc, char **argv)
{
int c,rc;
flag++;
break;
default:
- fprintf(stderr, "error: %s: option '-%c' unrecognized\n", argv[0], c);
+ fprintf(stderr, "error: %s: option '-%c' "
+ "unrecognized\n", argv[0], c);
return CMD_HELP;
}
}
if (optind == argc)
return CMD_HELP;
- rc = llapi_quotachog(argv[optind], flag);
+ rc = llapi_quotachown(argv[optind], flag);
if(rc)
fprintf(stderr,"error: change file owner/group failed.\n");
return rc;
check_type |= 0x02;
break;
default:
- fprintf(stderr, "error: %s: option '-%c' unrecognized\n", argv[0], c);
+ fprintf(stderr, "error: %s: option '-%c' "
+ "unrecognized\n", argv[0], c);
return CMD_HELP;
}
}
qctl.qc_cmd = LUSTRE_Q_QUOTAOFF;
break;
default:
- fprintf(stderr, "error: %s: option '-%c' unrecognized\n", argv[0], c);
+ fprintf(stderr, "error: %s: option '-%c' "
+ "unrecognized\n", argv[0], c);
return CMD_HELP;
}
}
qctl.qc_type |= 0x02;
break;
default:
- fprintf(stderr, "error: %s: option '-%c' unrecognized\n", argv[0], c);
+ fprintf(stderr, "error: %s: option '-%c' "
+ "unrecognized\n", argv[0], c);
return CMD_HELP;
}
}
qctl.qc_cmd = LUSTRE_Q_SETINFO;
break;
default:
- fprintf(stderr, "error: %s: option '-%c' unrecognized\n", argv[0], c);
+ fprintf(stderr, "error: %s: option '-%c' "
+ "unrecognized\n", argv[0], c);
return CMD_HELP;
}
}
qctl.qc_type--;
if (qctl.qc_type == UGQUOTA) {
- fprintf(stderr, "error: user and group quotas can't be set together\n");
+ fprintf(stderr, "error: user and group quotas can't be set "
+ "both\n");
return CMD_HELP;
}
if (qctl.qc_cmd == LUSTRE_Q_SETQUOTA) {
- struct if_dqblk *dqb = &qctl.qc_dqblk;
+ struct obd_dqblk *dqb = &qctl.qc_dqblk;
if (optind + 6 != argc)
return CMD_HELP;
ARG2INT(dqb->dqb_bhardlimit, argv[optind++], "block-hardlimit");
ARG2INT(dqb->dqb_isoftlimit, argv[optind++], "inode-softlimit");
ARG2INT(dqb->dqb_ihardlimit, argv[optind++], "inode-hardlimit");
+
+ dqb->dqb_valid = QIF_LIMITS;
} else {
- struct if_dqinfo *dqi = &qctl.qc_dqinfo;
+ struct obd_dqinfo *dqi = &qctl.qc_dqinfo;
if (optind + 3 != argc)
return CMD_HELP;
grace2str(seconds - now, buf);
}
+static void print_quota_title(char *name, struct if_quotactl *qctl)
+{
+ printf("Disk quotas for %s %s (%cid %u):\n",
+ type2name(qctl->qc_type), name,
+ *type2name(qctl->qc_type), qctl->qc_id);
+ printf("%15s%8s %7s%8s%8s%8s %7s%8s%8s\n",
+ "Filesystem",
+ "blocks", "quota", "limit", "grace",
+ "files", "quota", "limit", "grace");
+}
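+/* Example of the title printed above, with invented values:
+ *   Disk quotas for user bob (uid 500):
+ *        Filesystem  blocks   quota   limit   grace   files   quota   limit   grace
+ */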
-static void print_quota(char *mnt, char *name, struct if_quotactl *qctl)
+static void print_quota(char *mnt, struct if_quotactl *qctl, int ost_only)
{
time_t now;
if (qctl->qc_cmd == LUSTRE_Q_GETQUOTA || qctl->qc_cmd == Q_GETOQUOTA) {
int bover = 0, iover = 0;
- struct if_dqblk *dqb = &qctl->qc_dqblk;
+ struct obd_dqblk *dqb = &qctl->qc_dqblk;
if (dqb->dqb_bhardlimit &&
toqb(dqb->dqb_curspace) > dqb->dqb_bhardlimit) {
}
}
- printf("Disk quotas for %s %s (%cid %u):\n",
- type2name(qctl->qc_type), name,
- *type2name(qctl->qc_type), qctl->qc_id);
- printf("%15s%8s %7s%8s%8s%8s %7s%8s%8s\n",
- "Filesystem",
- "blocks", "quota", "limit", "grace",
- "files", "quota", "limit", "grace");
-
#if 0 /* XXX: always print quotas even when no usages */
if (dqb->dqb_curspace || dqb->dqb_curinodes)
#endif
printf("%s\n%15s", mnt, "");
else
printf("%15s", mnt);
+
if (bover)
diff2str(dqb->dqb_btime, timebuf, now);
- sprintf(numbuf[0], LPU64, toqb(dqb->dqb_curspace));
- sprintf(numbuf[1], LPU64, dqb->dqb_bsoftlimit);
- sprintf(numbuf[2], LPU64, dqb->dqb_bhardlimit);
- printf(" %7s%c %6s %7s %7s", numbuf[0], bover ? '*' : ' ', numbuf[1],
+
+ sprintf(numbuf[0], "%llu", toqb(dqb->dqb_curspace));
+ sprintf(numbuf[1], "%llu", dqb->dqb_bsoftlimit);
+ sprintf(numbuf[2], "%llu", dqb->dqb_bhardlimit);
+ printf(" %7s%c %6s %7s %7s",
+ numbuf[0], bover ? '*' : ' ', numbuf[1],
numbuf[2], bover > 1 ? timebuf : "");
+
if (iover)
diff2str(dqb->dqb_itime, timebuf, now);
- sprintf(numbuf[0], LPU64, dqb->dqb_curinodes);
- sprintf(numbuf[1], LPU64, dqb->dqb_isoftlimit);
- sprintf(numbuf[2], LPU64, dqb->dqb_ihardlimit);
- printf(" %7s%c %6s %7s %7s\n", numbuf[0], iover ? '*' : ' ', numbuf[1],
- numbuf[2], iover > 1 ? timebuf : "");
+
+ sprintf(numbuf[0], "%llu", dqb->dqb_curinodes);
+ sprintf(numbuf[1], "%llu", dqb->dqb_isoftlimit);
+ sprintf(numbuf[2], "%llu", dqb->dqb_ihardlimit);
+ if (!ost_only)
+ printf(" %7s%c %6s %7s %7s",
+ numbuf[0], iover ? '*' : ' ', numbuf[1],
+ numbuf[2], iover > 1 ? timebuf : "");
+ printf("\n");
}
- } else if (qctl->qc_cmd == LUSTRE_Q_GETINFO || qctl->qc_cmd == Q_GETOINFO) {
+ } else if (qctl->qc_cmd == LUSTRE_Q_GETINFO ||
+ qctl->qc_cmd == Q_GETOINFO) {
char bgtimebuf[40];
char igtimebuf[40];
grace2str(qctl->qc_dqinfo.dqi_bgrace, bgtimebuf);
grace2str(qctl->qc_dqinfo.dqi_igrace, igtimebuf);
- printf("Block grace time: %s; Inode grace time: %s\n", bgtimebuf, igtimebuf);
+ printf("Block grace time: %s; Inode grace time: %s\n",
+ bgtimebuf, igtimebuf);
+ }
+}
+
+static void print_mds_quota(char *mnt, struct if_quotactl *qctl)
+{
+ int rc;
+
+ /* XXX: this is a flag to mark that only mds quota is wanted */
+ qctl->qc_dqblk.dqb_valid = 1;
+ rc = llapi_quotactl(mnt, qctl);
+ if (rc) {
+ fprintf(stderr, "quotactl failed: %s\n", strerror(errno));
+ return;
+ }
+ qctl->qc_dqblk.dqb_valid = 0;
+
+ print_quota(qctl->obd_uuid.uuid, qctl, 0);
+}
+
+static void print_lov_quota(char *mnt, struct if_quotactl *qctl)
+{
+ DIR *dir;
+ struct obd_uuid uuids[1024], *uuidp;
+ int obdcount = 1024;
+ int i, rc;
+
+ dir = opendir(mnt);
+ if (!dir) {
+ fprintf(stderr, "open %s failed: %s\n", mnt, strerror(errno));
+ return;
+ }
+
+ rc = llapi_lov_get_uuids(dirfd(dir), uuids, &obdcount);
+ if (rc != 0) {
+ fprintf(stderr, "get ost uuid failed: %s\n", strerror(errno));
+ goto out;
+ }
+
+ for (i = 0, uuidp = uuids; i < obdcount; i++, uuidp++) {
+ memcpy(&qctl->obd_uuid, uuidp, sizeof(*uuidp));
+
+ rc = llapi_quotactl(mnt, qctl);
+ if (rc) {
+ fprintf(stderr, "%s quotactl failed: %s\n",
+ uuidp->uuid, strerror(errno));
+ continue;
+ }
+
+ print_quota(uuidp->uuid, qctl, 1);
}
+
+out:
+ closedir(dir);
+ return;
}
static int lfs_quota(int argc, char **argv)
strncpy(obd_uuid, optarg, sizeof(qctl.obd_uuid));
break;
default:
- fprintf(stderr, "error: %s: option '-%c' unrecognized\n", argv[0], c);
+ fprintf(stderr, "error: %s: option '-%c' "
+ "unrecognized\n", argv[0], c);
return CMD_HELP;
}
}
qctl.qc_type--;
if (qctl.qc_type == UGQUOTA) {
- fprintf(stderr, "error: user or group can't be specified together\n");
+ fprintf(stderr, "error: user or group can't be specified"
+ "both\n");
return CMD_HELP;
}
- if (qctl.qc_cmd == LUSTRE_Q_GETQUOTA && optind + 2 == argc) {
+ if (qctl.qc_cmd == LUSTRE_Q_GETQUOTA) {
+ if (optind + 2 != argc)
+ return CMD_HELP;
+
name = argv[optind++];
rc = name2id(&qctl.qc_id, name, qctl.qc_type);
if (rc) {
name, strerror(errno));
return CMD_HELP;
}
+ print_quota_title(name, &qctl);
} else if (optind + 1 != argc) {
return CMD_HELP;
}
if (!name)
rc = id2name(&name, getuid(), qctl.qc_type);
- print_quota(mnt, name, &qctl);
+ if (*obd_uuid) {
+ mnt = "";
+ name = obd_uuid;
+ }
+
+ print_quota(mnt, &qctl, 0);
+
+ if (!*obd_uuid && qctl.qc_cmd != LUSTRE_Q_GETINFO) {
+ print_mds_quota(mnt, &qctl);
+ print_lov_quota(mnt, &qctl);
+ }
+
return 0;
}
#endif /* HAVE_QUOTA_SUPPORT */
#include <linux/lustre_lib.h>
#include <lustre/liblustreapi.h>
#include <linux/obd_lov.h>
static void err_msg(char *fmt, ...)
{
}
}
+void lov_dump_user_lmm_join(struct lov_user_md_v1 *lum, char *dname,
+ char *fname, int obdindex, int quiet,
+ int header, int body)
+{
+ struct lov_user_md_join *lumj = (struct lov_user_md_join *)lum;
+ int i, obdstripe = 0;
+
+ if (obdindex != OBD_NOT_FOUND) {
+ for (i = 0; i < lumj->lmm_stripe_count; i++) {
+ if (obdindex == lumj->lmm_objects[i].l_ost_idx) {
+ printf("%s/%s\n", dname, fname);
+ obdstripe = 1;
+ break;
+ }
+ }
+ } else if (!quiet) {
+ printf("%s/%s\n", dname, fname);
+ obdstripe = 1;
+ }
+
+ if (header && obdstripe == 1) {
+ printf("lmm_magic: 0x%08X\n", lumj->lmm_magic);
+ printf("lmm_object_gr: "LPX64"\n", lumj->lmm_object_gr);
+ printf("lmm_object_id: "LPX64"\n", lumj->lmm_object_id);
+ printf("lmm_stripe_count: %u\n", (int)lumj->lmm_stripe_count);
+ printf("lmm_stripe_size: %u\n", lumj->lmm_stripe_size);
+ printf("lmm_stripe_pattern: %x\n", lumj->lmm_pattern);
+ printf("lmm_extent_count: %x\n", lumj->lmm_extent_count);
+ }
+
+ if (body) {
+ unsigned long long start = -1, end = 0;
+ if (!quiet && obdstripe == 1)
+ printf("joined\tobdidx\t\t objid\t\tobjid\t\t group"
+ "\t\tstart\t\tend\n");
+ for (i = 0; i < lumj->lmm_stripe_count; i++) {
+ int idx = lumj->lmm_objects[i].l_ost_idx;
+ long long oid = lumj->lmm_objects[i].l_object_id;
+ long long gr = lumj->lmm_objects[i].l_object_gr;
+ if (obdindex == OBD_NOT_FOUND || obdindex == idx)
+ printf("\t%6u\t%14llu\t%#13llx\t%14llu%s",
+ idx, oid, oid, gr,
+ obdindex == idx ? " *" : "");
+ if (start != lumj->lmm_objects[i].l_extent_start ||
+ end != lumj->lmm_objects[i].l_extent_end) {
+ start = lumj->lmm_objects[i].l_extent_start;
+ printf("\t%14llu", start);
+ end = lumj->lmm_objects[i].l_extent_end;
+ if (end == (unsigned long long)-1)
+ printf("\t\tEOF\n");
+ else
+ printf("\t\t%llu\n", end);
+ } else {
+ printf("\t\t\t\t\n");
+ }
+ }
+ printf("\n");
+ }
+}
+
void llapi_lov_dump_user_lmm(struct find_param *param, char *dname, char *fname)
{
switch(*(__u32 *)¶m->lmd->lmd_lmm) { /* lum->lmm_magic */
case LOV_USER_MAGIC_V1:
- lov_dump_user_lmm_v1(¶m->lmd->lmd_lmm, dname, fname, param->obdindex,
- param->quiet, param->verbose,
- (param->verbose || !param->obduuid));
+ lov_dump_user_lmm_v1(¶m->lmd->lmd_lmm, dname, fname,
+ param->obdindex, param->quiet,
+ param->verbose,
+ (param->verbose || !param->obduuid));
+ break;
+ case LOV_USER_MAGIC_JOIN:
+ lov_dump_user_lmm_join(¶m->lmd->lmd_lmm, dname, fname,
+ param->obdindex, param->quiet,
+ param->verbose,
+ (param->verbose || !param->obduuid));
break;
default:
printf("unknown lmm_magic: %#x (expecting %#x)\n",
return rc;
}
-int llapi_target_check(int type_num, char **obd_type, char *dir)
+int llapi_target_iterate(int type_num, char **obd_type, void *args, llapi_cb_t cb)
{
char buf[MAX_STRING_SIZE];
FILE *fp = fopen(DEVICES_LIST, "r");
while (fgets(buf, sizeof(buf), fp) != NULL) {
char *obd_type_name = NULL;
char *obd_name = NULL;
+ char *obd_uuid = NULL;
char rawbuf[OBD_MAX_IOCTL_BUFFER];
char *bufl = rawbuf;
char *bufp = buf;
obd_type_name = strsep(&bufp, " ");
}
obd_name = strsep(&bufp, " ");
+ obd_uuid = strsep(&bufp, " ");
memset(&osfs_buffer, 0, sizeof (osfs_buffer));
if (strcmp(obd_type_name, obd_type[i]) != 0)
continue;
- rc = llapi_ping(obd_type_name, obd_name);
- if (rc) {
- fprintf(stderr, "error: check %s: %s\n",
- obd_name, strerror(rc = errno));
- } else {
- printf("%s active.\n", obd_name);
- }
+ cb(obd_type_name, obd_name, obd_uuid, args);
}
}
fclose(fp);
return rc;
}
+static void do_target_check(char *obd_type_name, char *obd_name,
+ char *obd_uuid, void *args)
+{
+ int rc;
+
+ rc = llapi_ping(obd_type_name, obd_name);
+ if (rc) {
+ fprintf(stderr, "error: check %s: %s\n",
+ obd_name, strerror(rc = errno));
+ } else {
+ printf("%s active.\n", obd_name);
+ }
+}
+
+int llapi_target_check(int type_num, char **obd_type, char *dir)
+{
+ return llapi_target_iterate(type_num, obd_type, NULL, do_target_check);
+}
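+/* Minimal sketch (not part of this patch) of reusing llapi_target_iterate()
+ * with a custom callback; count_cb and count_targets are hypothetical,
+ * illustration-only names. */
+static void count_cb(char *obd_type_name, char *obd_name,
+ char *obd_uuid, void *args)
+{
+ (*(int *)args)++; /* just count the matching targets */
+}
+
+static int count_targets(int type_num, char **obd_type)
+{
+ int count = 0;
+
+ llapi_target_iterate(type_num, obd_type, &count, count_cb);
+ return count;
+}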
+
#undef MAX_STRING_SIZE
int llapi_catinfo(char *dir, char *keyword, char *node_name)
while (1) {
rc = ioctl(dirfd(root), LL_IOC_POLL_QUOTACHECK, qchk);
- if (!rc || errno != ENODATA)
+ if (!rc)
break;
sleep(poll_intvl);
if (poll_intvl < 30)
return rc;
}
-static int quotachog_process_file(DIR *dir, char *dname, char *fname,
+static int quotachown_process_file(DIR *dir, char *dname, char *fname,
struct find_param *param)
{
lstat_t *st;
st = ¶m->lmd->lmd_st;
snprintf(pathname, sizeof(pathname), "%s/%s", dname, fname);
+
+ /* libc chown() does extra checking and skips the kernel call when
+ * the current owner already matches the requested one, so invoke
+ * the syscall directly to force the change through. */
rc = syscall(SYS_chown, pathname, st->st_uid, st->st_gid);
if (rc)
fprintf(stderr, "chown %s (%u,%u) fail: %s\n",
return rc;
}
-int llapi_quotachog(char *path, int flag)
+int llapi_quotachown(char *path, int flag)
{
struct find_param param;
int ret = 0;
param.recursive = 1;
param.verbose = 0;
param.quiet = 1;
- param.process_file = quotachog_process_file;
+ param.process_file = quotachown_process_file;
ret = prepare_find(¶m);
if (ret)
void usage(FILE *out)
{
fprintf(out, "%s v1.%d\n", progname, LMD_MAGIC & 0xFF);
- fprintf(out, "usage: %s <mdsnode>:/<mdsname>/<cfgname> <mountpt> "
- "[-fhnv] [-o mntopt]\n", progname);
+ fprintf(out, "usage: %s <mdsnode>[,<altmdsnode>]:/<mdsname>/<cfgname>"
+ " <mountpt> [-fhnv] [-o mntopt]\n", progname);
fprintf(out, "\t<mdsnode>: nid of MDS (config) node\n"
"\t<mdsname>: name of MDS service (e.g. mds1)\n"
"\t<cfgname>: name of client config (e.g. client)\n"
"\t-v|--verbose: print verbose config settings\n"
"\t-o: filesystem mount options:\n"
"\t\tflock/noflock: enable/disable flock support\n"
- "\t\tuser_xattr/nouser_xattr: enable/disable user extended attributes\n"
+ "\t\tuser_xattr/nouser_xattr: enable/disable user extended "
+ "attributes\n"
);
exit(out != stdout);
}
{
memset(lmd, 0, sizeof(*lmd));
lmd->lmd_magic = LMD_MAGIC;
- lmd->lmd_nid = LNET_NID_ANY;
return 0;
}
int
print_options(struct lustre_mount_data *lmd, const char *options)
{
- printf("nid: %s\n", libcfs_nid2str(lmd->lmd_nid));
- printf("mds: %s\n", lmd->lmd_mds);
+ int i;
+ for (i = 0; i < lmd->lmd_nid_count; i++) {
+ printf("mds nid %d: %s\n", i,
+ libcfs_nid2str(lmd->lmd_nid[i]));
+ }
+ printf("mds name: %s\n", lmd->lmd_mds);
printf("profile: %s\n", lmd->lmd_profile);
printf("options: %s\n", options);
return 0;
}
+static int parse_nids(struct lustre_mount_data *lmd, char *nids)
+{
+ int i = 0;
+ char *tmp = NULL;
+ lnet_nid_t nid;
+
+ while ((tmp = strsep(&nids, ",:"))) {
+ nid = libcfs_str2nid(tmp);
+ if (nid == LNET_NID_ANY) {
+ fprintf(stderr, "%s: Can't parse NID '%s'\n",
+ progname, tmp);
+ continue;
+ }
+ lmd->lmd_nid[lmd->lmd_nid_count++] = nid;
+ if (lmd->lmd_nid_count >= MAX_FAILOVER_NIDS) {
+ fprintf(stderr, "%s: Too many: ignoring nids after %s\n",
+ progname, tmp);
+ break;
+ }
+ }
+ return (lmd->lmd_nid_count);
+}
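+/* Illustration (assumed NID syntax, not part of the patch): for a device
+ * string like "mds1@tcp0,mds2@tcp0:/mdsA/client", the nid part
+ * "mds1@tcp0,mds2@tcp0" reaches parse_nids(), and strsep() on ",:" stores
+ * one lnet_nid_t per token in lmd->lmd_nid[], primary first. */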
+
+
/*****************************************************************************
*
* This part was cribbed from util-linux/mount/mount.c. There was no clear
{ "noflock", 1, 1, 0, LMD_FLG_FLOCK}, /* Disable flock support */
{ "user_xattr", 0, 0, 0, LMD_FLG_USER_XATTR}, /* Enable get/set user xattr */
{ "nouser_xattr", 1, 1, 0, LMD_FLG_USER_XATTR}, /* Disable user xattr */
+ { "acl", 0, 0, 0, LMD_FLG_ACL}, /* Enable ACL support */
+ { "noacl", 1, 1, 0, LMD_FLG_ACL}, /* Disable ACL support */
/* please add new mount options to usage message */
{ NULL, 0, 0, 0, 0 }
};
if (rc)
return rc;
- lmd->lmd_nid = libcfs_str2nid(nid);
- if (lmd->lmd_nid == LNET_NID_ANY) {
- fprintf(stderr, "%s: can't parse nid '%s'\n", progname, nid);
- return 1;
+ if (parse_nids(lmd, nid) == 0) {
+ fprintf(stderr, "%s: Can't parse any mds nids\n", progname);
+ return(1);
}
if (strlen(mds) + 1 > sizeof(lmd->lmd_mds)) {
if (!fake)
rc = mount(source, target, "lustre", flags, (void *)&lmd);
if (rc) {
- fprintf(stderr, "%s: mount(%s, %s) failed: %s\n", source,
- target, progname, strerror(errno));
+ fprintf(stderr, "%s: mount(%s, %s) failed: %s\n", progname,
+ source, target, strerror(errno));
if (errno == ENODEV)
fprintf(stderr, "Are the lustre modules loaded?\n"
"Check /etc/modules.conf and /proc/filesystems\n");
--mdsuuid uuid
--mkfsoptions options
--mountfsoptions options
+ --quota quotaon=u|g|ug,iunit=,bunit=,itune=,btune=
--add lov
--lov lov_name
--ostuuid uuid
--mkfsoptions options
--mountfsoptions options
+ --quota quotaon=u|g|ug,iunit=,bunit=,itune=,btune=
--add mtpt - Mountpoint
--node node_name
('mdsuuid', "Optional argument to specify MDS UUID", PARAM,""),
('nspath', "Local mount point of server namespace.", PARAM,""),
('format', ""),
-
+ ('quota', "quotaon:enable quota, only u|g|ug is supported now. \
+ iunit: the unit for slave to acquire/release inode quota from/to masteri.\
+ Int type (>0), default value in Lustre is 5000 inodes.\
+ bunit: the unit for slave to acquire/release block quota from/to master.\
+ Mbytes (>0), default value in Lustre is 100(Mbytes).\
+ itune: used to tune the threthold. When inode quota usage reach the threthold,\
+ slave should acquire/release inode quota from/to master.\
+ Int type (100 > btune > 0), default value in Lustre is 50 (percentge).\
+ inode threthold = iunit * itune / 100.\
+ btune: used to tune the threthold. When block quota usage reach the threthold,\
+ slave should acquire/release block quota from/to master.\
+ Int type (100 > btune > 0), default value in Lustre is 50 (percentage).\
+ block threthold = bunit * btune / 100.", PARAM,""),
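+# Worked example (illustrative): with the defaults bunit=100 (MB) and
+# btune=50, block threshold = 100 * 50 / 100 = 50 MB, so a slave
+# acquires/releases block quota once usage comes within 50 MB of its local
+# grant; likewise iunit=5000 with itune=50 gives a 2500-inode threshold.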
# clients: mountpoint and echo
('echo_client', "", PARAM),
('path', "Specify the mountpoint for Lustre.", PARAM),
def osd(self, name, uuid, fstype, osdtype, devname, format, ost_uuid,
node_uuid, dev_size=0, journal_size=0, inode_size=0, nspath="",
- mkfsoptions="", mountfsoptions=""):
+ mkfsoptions="", mountfsoptions="", quota=""):
osd = self.newService("osd", name, uuid)
osd.setAttribute('osdtype', osdtype)
osd.appendChild(self.ref("target", ost_uuid))
self.addElement(osd, "mkfsoptions", mkfsoptions)
if mountfsoptions:
self.addElement(osd, "mountfsoptions", mountfsoptions)
+ if quota:
+ self.addElement(osd, "quota", quota)
if nspath:
self.addElement(osd, "nspath", nspath)
return osd
def mdsdev(self, name, uuid, fstype, devname, format, node_uuid,
mds_uuid, dev_size=0, journal_size=0, inode_size=256,
- nspath="", mkfsoptions="", mountfsoptions="", group_upcall=""):
+ nspath="", mkfsoptions="", mountfsoptions="", quota="", group_upcall=""):
mdd = self.newService("mdsdev", name, uuid)
self.addElement(mdd, "fstype", fstype)
dev = self.addElement(mdd, "devpath", devname)
self.addElement(mdd, "mkfsoptions", mkfsoptions)
if mountfsoptions:
self.addElement(mdd, "mountfsoptions", mountfsoptions)
+ if quota:
+ self.addElement(mdd, "quota", quota)
if group_upcall:
self.addElement(mdd, "group_upcall", group_upcall)
nspath = get_option(options, 'nspath')
mkfsoptions = get_option(options, 'mkfsoptions')
mountfsoptions = get_option(options, 'mountfsoptions')
+ quota = get_option(options, 'quota')
group_upcall = get_option(options, 'group_upcall')
node_uuid = name2uuid(lustre, node_name, 'node')
mdd = gen.mdsdev(mdd_name, mdd_uuid, fstype, devname,
get_format_flag(options), node_uuid, mds_uuid,
size, journal_size, inode_size, nspath, mkfsoptions,
- mountfsoptions, group_upcall)
+ mountfsoptions, quota, group_upcall)
lustre.appendChild(mdd)
journal_size = ''
inode_size = ''
mkfsoptions = ''
- mountfsoptions = ''
+ mountfsoptions = ''
+ quota = ''
else:
devname = get_option(options, 'dev') # can be unset for bluearcs
size = get_option(options, 'size')
journal_size = get_option(options, 'journal_size')
inode_size = get_option(options, 'inode_size')
mkfsoptions = get_option(options, 'mkfsoptions')
- mountfsoptions = get_option(options, 'mountfsoptions')
+ mountfsoptions = get_option(options, 'mountfsoptions')
+ quota = get_option(options, 'quota')
nspath = get_option(options, 'nspath')
osd = gen.osd(osdname, osd_uuid, fstype, osdtype, devname,
get_format_flag(options), ost_uuid, node_uuid, size,
journal_size, inode_size, nspath, mkfsoptions,
- mountfsoptions)
+ mountfsoptions, quota)
node = findByName(lustre, node_name, "node")
lustre_cfg_bufs_reset(&bufs, lcfg_devname);
- if (argc > 5)
+ if (argc > 6)
return CMD_HELP;
for (i = 1; i < argc; i++) {
+++ /dev/null
-#!/bin/sh
-# Copyright (C) 2003 Cluster File Systems, Inc.
-# Create a Lustre configuration file
-#
-# Usage: lwizard
-#
-# Jerrifer <jerrifer@clusterfs.com>
-# wangdi <wangdi@clusterfs.com>
-
-# fatal error to exit
-fatal()
-{
- if [ "$#" -gt "1" ]; then
- echo
- echo "$2"
- exit 1
- fi
-
- exit 1
-}
-
-#print usage and exit
-usage()
-{
- cat <<EOF
-Usage: ${0##*/} [OPTIONS]...
-
-${0##*/} asks the user questions about their cluster configuration,
-and writes an appropriate configuration file to config.xml.
-
-Options:
- --batch=FILE
- save lmc batch commands to FILE
- -o, --file=FILE
- write Lustre configuration to FILE (default: lwizard.xml)
- -f, --force
- force existing files to be overwritten
- --help
- to get this help
- --stripe_size=SIZE
- size (in KB) of each stripe on an OST (default: 64)
- --stripe_count=COUNT
- the number of OSTs files are striped to (default: 1)
-EOF
-
- exit 0
-}
-
-# check if $1 is a number
-check_number()
-{
- local num=$(expr "$1" : "[0-9]*$")
- if [ $num -gt "0" ]; then
- return 0
- fi
-
- return 1
-}
-
-# parse options of this shell
-LMC_BATCH_FILE=
-RM_BATCH_FILE=1
-FORCE=0
-get_option()
-{
- local long_options="batch:,file:,force:,help,stripe_size:,stripe_count:"
- local options
-
- options=$(getopt -o o:hf --long "$long_options" -- "$@")
-
- if [ $? -ne 0 ] ; then
- usage
- fi
- eval set -- "$options"
-
- while true ; do
- case "$1" in
- --batch)
- LMC_BATCH_FILE=$2
- RM_BATCH_FILE=0
- shift 2
- ;;
- -o | --file)
- CONFIG_FILE=$2
- shift 2
- ;;
- -f | --force)
- FORCE=1
- shift
- ;;
- --stripe_count)
- STRIPE_CNT=$2
- check_number $STRIPE_CNT || \
- fatal 1 "Stripe count should be a number."
- shift 2
- ;;
- --stripe_size)
- STRIPE_SIZE=$(($2 * 1024))
- check_number $STRIPE_SIZE || \
- fatal 1 "Stripe size should be a number."
- shift 2
- ;;
- -h | --help)
- usage
- ;;
- --)
- shift 1
- break
- esac
- done
-}
-
-# if $1 in $2
-in_list()
-{
- local node
-
- for node in $2 ; do
- [ "$1" = "$node" ] && return 0
- done
- return 1
-}
-
-# read device size from user and check devive size and convert device
-# size to *K
-
-get_device_size()
-{
- local size
- local tail
-
- read size
- if [ -z $size ]; then
- device_size="0"
- return 0
- fi
-
- device_size=$(echo $size | awk -F[a-z,A-Z] '{print $1; }')
- [ -z $device_size ] && return 1
- tail=$(echo $size | awk -F$device_size '{print $2; }')
- [ -z $tail ] && return 0
- case $tail in
- k | K)
- ;;
- m | M)
- (( device_size *= 1024 ))
- ;;
- t | T)
- (( device_size *= (1024 * 1024) ))
- ;;
- *)
- return 1
- ;;
- esac
-
- return 0
-}
-
-# ask user some questions to add a device
-add_device()
-{
- local hostnames
- local device
- local device_size
- local hostname
- local interfaces
-
- echo "Creating $1 \"$1$2\"..."
- if [ "$2" -gt "1" ]; then
- echo -n "Please enter the HOSTNAME for $1$2, or just hit enter to finish: "
- else
- echo -n "Please enter the HOSTNAME for $1$2: "
- fi
- read hostnames
-
- if [ -z "$hostnames" ] ; then
- return 1
- fi
-
- # Why do we need multiple hosts for a device? If we just want to support
- # failover, we already have.
- for hostname in $hostnames ; do
- break
- done
-
- # Multi-net
- cat <<EOF
-If $hostname has more than one network INTERFACE, enter here, separating them
-by blank space. See lwizard man page for help.
-EOF
- echo -n "(hit enter if only one): "
- read interfaces
- interfaces=`echo $interfaces | sed -e "s/ /,/g"`
-
- device=
- while [ -z "$device" ] ; do
- echo -n "Please enter the device or loop file name for $1$2 on ${hostname}: "
- read device
- echo -n "Please enter the device SIZE or 0 to use entire device (in KB): "
- while ! get_device_size ; do
- echo -n "Please enter the device SIZE or 0 to use entire device (in KB): "
- done
- echo -n "Do you want to configure FAILOVER $1$2? "
- read answer
- if [ "${answer:0:1}" = "y" -o "${answer:0:1}" = "Y" ] ; then
- echo -n "Please enter the HOSTNAME for failover $1$2: "
- read failoverhostname
- echo -n "Please enter the device for failover $1$2 on ${failoverhostname}: "
- read failoverdevice
- else
- failoverhostname=
- failovedevice=
- fi
- done
- newdev="$hostname:$device:$2:$1$2:$CURRENT_MDS:$CURRENT_LOV:$device_size:$failoverhostname:$failoverdevice:$interfaces"
- DEVICE_LIST="$DEVICE_LIST $newdev"
-
- return 0
-}
-
-cur_mds_id=1
-
-# get mds information
-add_mds()
-{
- CURRENT_LOV=
- CURRENT_MDS=
- add_device "mds" "$cur_mds_id" || return 1
- CURRENT_LOV="lov$cur_mds_id"
- CURRENT_MDS="mds$cur_mds_id"
-
- DEVICE_LIST="$DEVICE_LIST *:*:lov:$CURRENT_LOV:$CURRENT_MDS:*:"
-
- (( cur_mds_id++ ))
- return 0
-}
-
-cur_ost_id=1
-
-# ask user to add ost
-add_ost()
-{
- # We have to add one...
- while ! add_device "ost" "$cur_ost_id" ; do
- true
- done
-
- (( cur_ost_id++ ))
-
- # ...and maybe more
- while add_device "ost" "$cur_ost_id" ; do
- (( cur_ost_id++ ))
- done
- return 0
-}
-
-cur_cli_id=1
-
-# ask user to add client to lustre
-add_client()
-{
- echo -n "Please enter the clients' mountpoint (/mnt/lustre): "
- read mtpt
- [ -z "$mtpt" ] && mtpt="/mnt/lustre"
- newdev="*:$mtpt:client:client:$CURRENT_MDS:$CURRENT_LOV"
- DEVICE_LIST="$DEVICE_LIST $newdev"
- (( cur_cli_id++ ))
-
- # Multi-net
- while true ; do
- echo -n "Do you want to configure another client with multiple network interfaces? "
- read answer
- if [ "${answer:0:1}" = "y" -o "${answer:0:1}" = "Y" ] ; then
- echo -n "Please enter the HOSTNAME: "
- read hostname
- if [ -z "$hostname" ] ; then
- echo "No extra client is configured"
- return 0
- fi
-
- echo -n "Please enter network interface address (separated by space): "
- read interfaces
- interfaces=`echo $interfaces | sed -e "s/ /,/g"`
- if [ -z "$interfaces" ] ; then
- echo "No extra client is configured"
- return 0
- fi
-
- newdev="$hostname:$mtpt:client:client:$CURRENT_MDS:$CURRENT_LOV::::$interfaces"
- DEVICE_LIST="$DEVICE_LIST $newdev"
- else
- break
- fi
- done
- return 0
-}
-
-# save node config into config file
-add_node()
-{
- local node=$1
- local interfaces=$2
- local nettype=$DEFAULT_NETTYPE
-
- in_list "$node" "$NODE_LIST" && return 0
- NODE_LIST="$NODE_LIST $node"
-
- run_lmc --add node --node "$node"
-
- interfaces=`echo $interfaces | sed -e "s/,/ /g"`
- extraopt=""
-
- if [ "$interfaces" ] ; then
- for i in $interfaces ; do
- extraopt=" $extraopt --hostaddr $i"
- done
- fi
- run_lmc --add net --node "$node" --nid "$node" \
- --nettype "$nettype" $extraopt
-
- return 0
-}
-
-# save client node config into config file
-add_client_node()
-{
- local node=$1
- local nettype=$DEFAULT_NETTYPE
-
- in_list "$node" "$NODE_LIST" && return 0
- NODE_LIST="$NODE_LIST $node"
- run_lmc --add node --node "$node"
- run_lmc --add net --node "$node" --nid "*" \
- --nettype "$nettype"
-
- return 0
-}
-
-# get hostname, device , device_id and device name
-# from mds node
-get_name_in_list()
-{
- HOST_NAME=$(echo $1 | awk -F: '{ print $1 }')
- DEVICE=$(echo $1 | awk -F: '{ print $2 }')
- DEVICE_ID=$(echo $1 | awk -F: '{ print $3 }')
- DEVICE_NAME=$(echo $1 | awk -F: '{ print $4 }')
- DEVICE_MDS=$(echo $1 | awk -F: '{ print $5 }')
- DEVICE_LOV=$(echo $1 | awk -F: '{ print $6 }')
- DEVICE_SIZE=$(echo $1 | awk -F: '{ print $7 }')
- FAILOVER_HOST=$(echo $1 | awk -F: '{ print $8 }')
- FAILOVER_DEVICE=$(echo $1 | awk -F: '{ print $9 }')
- INTERFACES=$(echo $1 | awk -F: '{ print $10 }')
-}
-
-# save command to file and do the command
-run_lmc()
-{
- echo "$@" >> "$LMC_BATCH_FILE"
-}
-
-# following user input to create xml config file
-create_config()
-{
- local extraopt=""
-
- for device in $DEVICE_LIST ; do
- get_name_in_list $device
- echo -n " $DEVICE_NAME"
-
- case $DEVICE_NAME in
- mds*)
- add_node "$HOST_NAME" "$INTERFACES"
- extraopt=""
- if [ "$FAILOVER_HOST" != "" ] ; then
- extraopt=" --failover --group $HOST_NAME"
- fi
-
- run_lmc --add mds \
- --node "$HOST_NAME" \
- --mds "$DEVICE_NAME" \
- --dev "$DEVICE" \
- --size "$DEVICE_SIZE" \
- --fstype "$DEFAULT_FSTYPE" \
- $extraopt
- if [ "$FAILOVER_HOST" != "" ] ; then
- add_node "$FAILOVER_HOST"
- run_lmc --add mds \
- --node "$FAILOVER_HOST" \
- --mds "$DEVICE_NAME" \
- --dev "$FAILOVER_DEVICE" \
- --size "$DEVICE_SIZE" \
- --fstype "$DEFAULT_FSTYPE" \
- --failover \
- --group "$HOST_NAME"
- fi
- ;;
- lov*)
- run_lmc --add lov \
- --lov "$DEVICE_NAME" \
- --mds "$DEVICE_MDS" \
- --stripe_sz "$STRIPE_SIZE" \
- --stripe_cnt "$STRIPE_CNT" \
- --stripe_pattern "$STRIPE_PATTERN"
- ;;
- ost*)
- add_node "$HOST_NAME" "$INTERFACES"
- extraopt=""
- if [ "$FAILOVER_HOST" != "" ] ; then
- extraopt=" --failover --group $HOST_NAME"
- fi
- run_lmc --add ost \
- --node "$HOST_NAME" \
- --ost "$DEVICE_NAME" \
- --lov "$DEVICE_LOV" \
- --dev "$DEVICE" \
- --size "$DEVICE_SIZE" \
- --fstype "$DEFAULT_FSTYPE" \
- $extraopt
- if [ "$FAILOVER_HOST" != "" ] ; then
- add_node "$FAILOVER_HOST"
- run_lmc --add ost \
- --node "$FAILOVER_HOST" \
- --ost "$DEVICE_NAME" \
- --lov "$DEVICE_LOV" \
- --dev "$FAILOVER_DEVICE" \
- --size "$DEVICE_SIZE" \
- --fstype "$DEFAULT_FSTYPE" \
- --failover \
- --group "$HOST_NAME"
- fi
- ;;
- client*)
- if [ "$INTERFACES" ] ; then
- add_node "$HOST_NAME" "$INTERFACES"
- run_lmc --add mtpt \
- --node "$HOST_NAME" \
- --mds "$DEVICE_MDS" \
- --lov "$DEVICE_LOV" \
- --path "$DEVICE" \
- # --clientoptions "async"
-
- else
- add_client_node "$DEVICE_NAME"
- run_lmc --add mtpt \
- --node "$DEVICE_NAME" \
- --mds "$DEVICE_MDS" \
- --lov "$DEVICE_LOV" \
- --path "$DEVICE" \
- # --clientoptions "async"
- fi
- ;;
- esac
- done
-
- echo
- return 0
-}
-
-maybe_clean()
-{
- [ -f "$1" ] || return 0
- if ! (( $FORCE )) ; then
- echo -n "${0##*/}: overwrite existing $2 \"$1\"? "
- read answer
- if ! [ "${answer:0:1}" = "y" -o "${answer:0:1}" = "Y" ] ; then
- echo "(${0##*/}: (Exiting.)"
- exit 0
- fi
- fi
- rm -f "$1"
-}
-
-# parse options
-get_option "$@"
-
-# some default definitions
-LMC=${LMC:-"./lmc"}
-
-CONFIG_FILE=${CONFIG_FILE:-"lwizard.xml"}
-
-# Remove exiting files.
-
-maybe_clean "$CONFIG_FILE" "Lustre configuration file"
-if [ "$LMC_BATCH_FILE" ] ; then
- maybe_clean "$LMC_BATCH_FILE" "lmc batch file"
-else
- LMC_BATCH_FILE=$(mktemp -q "/tmp/${CONFIG_FILE##*/}.XXXXXX")
- [ $? -eq 0 ] || fatal 1 "Couldn't create temporary batch file."
-fi
-
-DEFAULT_FSTYPE=${DEFAULT_FSTYPE:-"ext3"}
-DEFAULT_NETTYPE=${DEFAULT_NETTYPE:-"tcp"}
-DEFAULT_MNTPT=${DEFAULT_MNTPT:-"/mnt/lustre"}
-
-STRIPE_SIZE=${STRIPE_SIZE:-$((1 * 1024 * 1024))}
-STRIPE_CNT=${STRIPE_CNT:-1}
-STRIPE_PATTERN=${STRIPE_PATTERN:-0}
-
-ANSWER="yes no"
-
-CURRENT_LOV=
-MDS_LIST=
-OST_LIST=
-CLIENT_LIST=
-
-# print program information
-cat <<EOF
-${0##*/} will help you create a Lustre configuration file.
-
-EOF
-if add_mds ; then
- add_ost
- add_client
-fi
-
-create_config
-$LMC --batch "$LMC_BATCH_FILE" -o "$CONFIG_FILE"
-if [ $? -ne 0 ] ; then
- fatal 1 "lmc returned an error; Please check above for more details."
-fi
-
-echo "The Lustre configuration has been written to $CONFIG_FILE."
-
-if (( $RM_BATCH_FILE )) ; then
- rm -f "$LMC_BATCH_FILE"
-else
- echo "The lmc batch file has been written to $LMC_BATCH_FILE."
-fi
-
-exit 0
printf("Target: %s\n", ldd->ldd_svname);
printf("Lustre FS: %s\n", ldd->ldd_fsname);
printf("Mount type: %s\n", MT_STR(ldd));
- printf("Flags: %s%s%s%s\n",
+ printf("Flags: %s%s%s%s%s\n",
IS_MDT(ldd) ? "MDT ":"", IS_OST(ldd) ? "OST ":"",
IS_MGMT(ldd) ? "MGMT ":"",
ldd->ldd_flags & LDD_F_NEED_INDEX ? "needs_index ":"",
void usage(FILE *out)
{
- fprintf(out, "%s v1.%d\n", progname, LMD_MAGIC & 0xFF);
+ fprintf(out, "%s v2.0\n", progname);
fprintf(out, "usage: %s <mgmtnid>[:<altmgtnid>...]:/<filesystem>[/<cfgname>] <mountpt> "
"[-fhnv] [-o mntopt]\n", progname);
fprintf(out, "\t<mdsnode>: nid of MDS (config) node\n"
{ "nouser", 0, 1, 0 }, /* Forbid ordinary user to mount */
{ "noowner", 0, 1, 0 }, /* Device owner has no special privs */
{ "_netdev", 0, 0, 0 }, /* Device accessible only via network */
+ /* These strings are passed through and parsed in lustre ll_options */
{ "flock", 0, 0, 0 }, /* Enable flock support */
{ "noflock", 1, 1, 0 }, /* Disable flock support */
{ "user_xattr", 0, 0, 0 }, /* Enable get/set user xattr */
{ "nouser_xattr", 1, 1, 0 }, /* Disable user xattr */
+ { "acl", 0, 0, 0 }, /* Enable ACL support */
+ { "noacl", 1, 1, 0 }, /* Disable ACL support */
{ "nosvc", 0, 0, 0 }, /* Only start MGS/MGC, no other services */
{ NULL, 0, 0, 0 }
};
return 0;
}
-
void obd_finalize(int argc, char **argv)
{
struct sigaction sigact;
#define STRINGIFY(a) #a
+
+#define CHECK_CDEFINE(a) \
+ printf(" CLASSERT("#a" == "STRINGIFY(a) ");\n")
+
+#define CHECK_CVALUE(a) \
+ printf(" CLASSERT("#a" == %lld);\n", (long long)a)
+
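+/* For example (illustrative), CHECK_CVALUE(LDLM_ENQUEUE) emits
+ *      CLASSERT(LDLM_ENQUEUE == 101);
+ * while CHECK_CDEFINE pastes the macro body textually via STRINGIFY, e.g.
+ *      CLASSERT(MDS_OPEN_HAS_EA == 010000000000);
+ */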
#define CHECK_DEFINE(a) \
do { \
printf(" LASSERTF("#a" == "STRINGIFY(a) \
CHECK_MEMBER(obdo, o_easize);
CHECK_MEMBER(obdo, o_mds);
CHECK_MEMBER(obdo, o_padding_1);
- CHECK_MEMBER(obdo, o_padding_2);
CHECK_MEMBER(obdo, o_inline);
CHECK_VALUE(OBD_INLINESZ);
CHECK_VALUE(OBD_MD_FLQOS);
CHECK_VALUE(OBD_MD_FLCOOKIE);
CHECK_VALUE(OBD_MD_FLGROUP);
- CHECK_VALUE(OBD_MD_FLIFID);
+ CHECK_VALUE(OBD_MD_FLFID);
CHECK_VALUE(OBD_MD_FLEPOCH);
CHECK_VALUE(OBD_MD_FLGRANT);
CHECK_VALUE(OBD_MD_FLDIREA);
CHECK_MEMBER(obd_statfs, os_fsid);
CHECK_MEMBER(obd_statfs, os_bsize);
CHECK_MEMBER(obd_statfs, os_namelen);
- CHECK_MEMBER(obd_statfs, os_spare);
+ CHECK_MEMBER(obd_statfs, os_state);
}
static void
CHECK_MEMBER(obd_dqblk, dqb_itime);
CHECK_MEMBER(obd_dqblk, dqb_valid);
CHECK_MEMBER(obd_dqblk, padding);
+
+ CHECK_DEFINE(Q_QUOTACHECK);
+ CHECK_DEFINE(Q_INITQUOTA);
+ CHECK_DEFINE(Q_GETOINFO);
+ CHECK_DEFINE(Q_GETOQUOTA);
}
static void
CHECK_MEMBER(mds_body, generation);
CHECK_MEMBER(mds_body, suppgid);
CHECK_MEMBER(mds_body, eadatasize);
- CHECK_MEMBER(mds_body, padding_1);
- CHECK_MEMBER(mds_body, padding_2);
- CHECK_MEMBER(mds_body, padding_3);
+ CHECK_MEMBER(mds_body, aclsize);
+ CHECK_MEMBER(mds_body, max_mdsize);
+ CHECK_MEMBER(mds_body, max_cookiesize);
CHECK_MEMBER(mds_body, padding_4);
CHECK_VALUE(FMODE_READ);
CHECK_VALUE(FMODE_WRITE);
CHECK_VALUE(FMODE_EXEC);
+
CHECK_VALUE(MDS_OPEN_CREAT);
CHECK_VALUE(MDS_OPEN_EXCL);
CHECK_VALUE(MDS_OPEN_TRUNC);
CHECK_VALUE(MDS_OPEN_SYNC);
CHECK_VALUE(MDS_OPEN_DIRECTORY);
CHECK_VALUE(MDS_OPEN_DELAY_CREATE);
- CHECK_VALUE(MDS_OPEN_HAS_EA);
+ CHECK_CDEFINE(MDS_OPEN_OWNEROVERRIDE);
+ CHECK_CDEFINE(MDS_OPEN_JOIN_FILE);
+ CHECK_CDEFINE(MDS_OPEN_HAS_EA);
+ CHECK_CDEFINE(MDS_OPEN_HAS_OBJS);
}
static void
CHECK_VALUE(MDS_STATUS_CONN);
CHECK_VALUE(MDS_STATUS_LOV);
- CHECK_VALUE(MDS_OPEN_HAS_EA);
-
CHECK_VALUE(LDLM_ENQUEUE);
CHECK_VALUE(LDLM_CONVERT);
CHECK_VALUE(LDLM_CANCEL);
CHECK_VALUE(OBD_CONNECT_INDEX);
CHECK_VALUE(OBD_CONNECT_GRANT);
CHECK_VALUE(OBD_CONNECT_SRVLOCK);
+ CHECK_VALUE(OBD_CONNECT_VERSION);
+ CHECK_VALUE(OBD_CONNECT_REQPORTAL);
CHECK_VALUE(OBD_CONNECT_ACL);
CHECK_VALUE(OBD_CONNECT_XATTR);
CHECK_VALUE(OBD_CONNECT_CROW);
+ CHECK_VALUE(OBD_CONNECT_TRUNCLOCK);
+ CHECK_VALUE(OBD_CONNECT_TRANSNO);
COMMENT("Sizes and Offsets");
BLANK_LINE();
#undef LASSERT
#undef LASSERTF
+#define CLASSERT(cond) ({ switch(42) { case (cond): case 0: break; } })
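+/* CLASSERT is a compile-time assertion: if cond is false (0) the switch
+ * gains two identical 'case 0:' labels and compilation fails, so cond
+ * must be an integer constant expression. */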
#define LASSERT(cond) if (!(cond)) { printf("failed " #cond "\n"); ret = 1; }
#define LASSERTF(cond, fmt, arg) if (!(cond)) { printf("failed '" #cond "'" fmt, arg);ret = 1;}
#undef LASSERT
#undef LASSERTF
+#define CLASSERT(cond) ({ switch(42) { case (cond): case 0: break; } })
#define LASSERT(cond) if (!(cond)) { printf("failed " #cond "\n"); ret = 1; }
#define LASSERTF(cond, fmt, arg) if (!(cond)) { printf("failed '" #cond "'" fmt, arg);ret = 1;}
void lustre_assert_wire_constants(void)
{
/* Wire protocol assertions generated by 'wirecheck'
- * running on Linux schatzie.adilger.int 2.6.12-1.1378_FC3 #1 Wed Sep 14 04:24:31 EDT 2005 i6
+ * running on Linux schatzie.adilger.int 2.6.12-1.1381_FC3 #1 Fri Oct 21 03:46:55 EDT 2005 i6
* with gcc version 3.3.4 20040817 (Red Hat Linux 3.3.4-2) */
(long long)MDS_STATUS_CONN);
LASSERTF(MDS_STATUS_LOV == 2, " found %lld\n",
(long long)MDS_STATUS_LOV);
- LASSERTF(MDS_OPEN_HAS_EA == 1073741824, " found %lld\n",
- (long long)MDS_OPEN_HAS_EA);
LASSERTF(LDLM_ENQUEUE == 101, " found %lld\n",
(long long)LDLM_ENQUEUE);
LASSERTF(LDLM_CONVERT == 102, " found %lld\n",
(long long)OBD_CONNECT_GRANT);
LASSERTF(OBD_CONNECT_SRVLOCK == 16, " found %lld\n",
(long long)OBD_CONNECT_SRVLOCK);
+ LASSERTF(OBD_CONNECT_VERSION == 32, " found %lld\n",
+ (long long)OBD_CONNECT_VERSION);
+ LASSERTF(OBD_CONNECT_REQPORTAL == 64, " found %lld\n",
+ (long long)OBD_CONNECT_REQPORTAL);
LASSERTF(OBD_CONNECT_ACL == 128, " found %lld\n",
(long long)OBD_CONNECT_ACL);
LASSERTF(OBD_CONNECT_XATTR == 256, " found %lld\n",
(long long)OBD_CONNECT_XATTR);
LASSERTF(OBD_CONNECT_CROW == 512, " found %lld\n",
(long long)OBD_CONNECT_CROW);
+ LASSERTF(OBD_CONNECT_TRUNCLOCK == 1024, " found %lld\n",
+ (long long)OBD_CONNECT_TRUNCLOCK);
+ LASSERTF(OBD_CONNECT_TRANSNO == 2048, " found %lld\n",
+ (long long)OBD_CONNECT_TRANSNO);
/* Sizes and Offsets */
(long long)(int)offsetof(struct obdo, o_mds));
LASSERTF((int)sizeof(((struct obdo *)0)->o_mds) == 4, " found %lld\n",
(long long)(int)sizeof(((struct obdo *)0)->o_mds));
- LASSERTF((int)offsetof(struct obdo, o_padding_1) == 120, " found %lld\n",
+ LASSERTF((int)offsetof(struct obdo, o_padding_1) == 124, " found %lld\n",
(long long)(int)offsetof(struct obdo, o_padding_1));
LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_1) == 4, " found %lld\n",
(long long)(int)sizeof(((struct obdo *)0)->o_padding_1));
- LASSERTF((int)offsetof(struct obdo, o_padding_2) == 124, " found %lld\n",
- (long long)(int)offsetof(struct obdo, o_padding_2));
- LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_2) == 4, " found %lld\n",
- (long long)(int)sizeof(((struct obdo *)0)->o_padding_2));
LASSERTF((int)offsetof(struct obdo, o_inline) == 128, " found %lld\n",
(long long)(int)offsetof(struct obdo, o_inline));
LASSERTF((int)sizeof(((struct obdo *)0)->o_inline) == 80, " found %lld\n",
(long long)OBD_MD_FLCOOKIE);
LASSERTF(OBD_MD_FLGROUP == 16777216, " found %lld\n",
(long long)OBD_MD_FLGROUP);
- LASSERTF(OBD_MD_FLIFID == 33554432, " found %lld\n",
- (long long)OBD_MD_FLIFID);
+ LASSERTF(OBD_MD_FLFID == 33554432, " found %lld\n",
+ (long long)OBD_MD_FLFID);
LASSERTF(OBD_MD_FLEPOCH == 67108864, " found %lld\n",
(long long)OBD_MD_FLEPOCH);
LASSERTF(OBD_MD_FLGRANT == 134217728, " found %lld\n",
(long long)(int)offsetof(struct obd_statfs, os_namelen));
LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_namelen) == 4, " found %lld\n",
(long long)(int)sizeof(((struct obd_statfs *)0)->os_namelen));
- LASSERTF((int)offsetof(struct obd_statfs, os_spare) == 104, " found %lld\n",
- (long long)(int)offsetof(struct obd_statfs, os_spare));
- LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare) == 40, " found %lld\n",
- (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare));
+ LASSERTF((int)offsetof(struct obd_statfs, os_state) == 104, " found %lld\n",
+ (long long)(int)offsetof(struct obd_statfs, os_state));
+ LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_state) == 4, " found %lld\n",
+ (long long)(int)sizeof(((struct obd_statfs *)0)->os_state));
/* Checks for struct obd_ioobj */
LASSERTF((int)sizeof(struct obd_ioobj) == 24, " found %lld\n",
(long long)(int)offsetof(struct obd_dqblk, padding));
LASSERTF((int)sizeof(((struct obd_dqblk *)0)->padding) == 4, " found %lld\n",
(long long)(int)sizeof(((struct obd_dqblk *)0)->padding));
+ LASSERTF(Q_QUOTACHECK == 0x800100," found %lld\n",
+ (long long)Q_QUOTACHECK);
+ LASSERTF(Q_INITQUOTA == 0x800101," found %lld\n",
+ (long long)Q_INITQUOTA);
+ LASSERTF(Q_GETOINFO == 0x800102," found %lld\n",
+ (long long)Q_GETOINFO);
+ LASSERTF(Q_GETOQUOTA == 0x800103," found %lld\n",
+ (long long)Q_GETOQUOTA);
/* Checks for struct niobuf_remote */
LASSERTF((int)sizeof(struct niobuf_remote) == 16, " found %lld\n",
(long long)(int)offsetof(struct mds_body, eadatasize));
LASSERTF((int)sizeof(((struct mds_body *)0)->eadatasize) == 4, " found %lld\n",
(long long)(int)sizeof(((struct mds_body *)0)->eadatasize));
- LASSERTF((int)offsetof(struct mds_body, padding_1) == 152, " found %lld\n",
- (long long)(int)offsetof(struct mds_body, padding_1));
- LASSERTF((int)sizeof(((struct mds_body *)0)->padding_1) == 4, " found %lld\n",
- (long long)(int)sizeof(((struct mds_body *)0)->padding_1));
- LASSERTF((int)offsetof(struct mds_body, padding_2) == 156, " found %lld\n",
- (long long)(int)offsetof(struct mds_body, padding_2));
- LASSERTF((int)sizeof(((struct mds_body *)0)->padding_2) == 4, " found %lld\n",
- (long long)(int)sizeof(((struct mds_body *)0)->padding_2));
- LASSERTF((int)offsetof(struct mds_body, padding_3) == 160, " found %lld\n",
- (long long)(int)offsetof(struct mds_body, padding_3));
- LASSERTF((int)sizeof(((struct mds_body *)0)->padding_3) == 4, " found %lld\n",
- (long long)(int)sizeof(((struct mds_body *)0)->padding_3));
+ LASSERTF((int)offsetof(struct mds_body, aclsize) == 152, " found %lld\n",
+ (long long)(int)offsetof(struct mds_body, aclsize));
+ LASSERTF((int)sizeof(((struct mds_body *)0)->aclsize) == 4, " found %lld\n",
+ (long long)(int)sizeof(((struct mds_body *)0)->aclsize));
+ LASSERTF((int)offsetof(struct mds_body, max_mdsize) == 156, " found %lld\n",
+ (long long)(int)offsetof(struct mds_body, max_mdsize));
+ LASSERTF((int)sizeof(((struct mds_body *)0)->max_mdsize) == 4, " found %lld\n",
+ (long long)(int)sizeof(((struct mds_body *)0)->max_mdsize));
+ LASSERTF((int)offsetof(struct mds_body, max_cookiesize) == 160, " found %lld\n",
+ (long long)(int)offsetof(struct mds_body, max_cookiesize));
+ LASSERTF((int)sizeof(((struct mds_body *)0)->max_cookiesize) == 4, " found %lld\n",
+ (long long)(int)sizeof(((struct mds_body *)0)->max_cookiesize));
LASSERTF((int)offsetof(struct mds_body, padding_4) == 164, " found %lld\n",
(long long)(int)offsetof(struct mds_body, padding_4));
LASSERTF((int)sizeof(((struct mds_body *)0)->padding_4) == 4, " found %lld\n",
(long long)MDS_OPEN_DIRECTORY);
LASSERTF(MDS_OPEN_DELAY_CREATE == 16777216, " found %lld\n",
(long long)MDS_OPEN_DELAY_CREATE);
- LASSERTF(MDS_OPEN_HAS_EA == 1073741824, " found %lld\n",
- (long long)MDS_OPEN_HAS_EA);
+ CLASSERT(MDS_OPEN_OWNEROVERRIDE == 0200000000);
+ CLASSERT(MDS_OPEN_JOIN_FILE == 0400000000);
+ CLASSERT(MDS_OPEN_HAS_EA == 010000000000);
+ CLASSERT(MDS_OPEN_HAS_OBJS == 020000000000);
/* Checks for struct mds_rec_setattr */
LASSERTF((int)sizeof(struct mds_rec_setattr) == 96, " found %lld\n",