/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
* vim:expandtab:shiftwidth=8:tabstop=8:
*
- * linux/fs/obdfilter/filter_io.c
+ * GPL HEADER START
*
- * Copyright (c) 2001-2003 Cluster File Systems, Inc.
- * Author: Peter Braam <braam@clusterfs.com>
- * Author: Andreas Dilger <adilger@clusterfs.com>
- * Author: Phil Schwan <phil@clusterfs.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
- * This file is part of the Lustre file system, http://www.lustre.org
- * Lustre is a trademark of Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
*
- * You may have signed or agreed to another license before downloading
- * this software. If so, you are bound by the terms and conditions
- * of that agreement, and the following does not apply to you. See the
- * LICENSE file included with this distribution for more information.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
*
- * If you did not agree to a different license, then this copy of Lustre
- * is open source software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
*
- * In either case, Lustre is distributed in the hope that it will be
- * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * license text for more details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdfilter/filter_io_26.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
*/
-#ifdef HAVE_KERNEL_CONFIG_H
+#ifndef AUTOCONF_INCLUDED
#include <linux/config.h>
#endif
#include <linux/module.h>
atomic_inc(&filter->fo_r_in_flight);
lprocfs_oh_tally(&filter->fo_filter_stats.hist[BRW_R_RPC_HIST],
atomic_read(&filter->fo_r_in_flight));
- lprocfs_oh_tally_log2(&filter->fo_filter_stats.hist[BRW_R_DISK_IOSIZE], size);
+ lprocfs_oh_tally_log2(&filter->fo_filter_stats.hist[BRW_R_DISK_IOSIZE],
+ size);
lprocfs_oh_tally(&exp->exp_filter_data.fed_brw_stats.hist[BRW_R_RPC_HIST],
atomic_read(&filter->fo_r_in_flight));
lprocfs_oh_tally_log2(&exp->exp_filter_data.fed_brw_stats.hist[BRW_R_DISK_IOSIZE], size);
atomic_inc(&filter->fo_w_in_flight);
lprocfs_oh_tally(&filter->fo_filter_stats.hist[BRW_W_RPC_HIST],
atomic_read(&filter->fo_w_in_flight));
- lprocfs_oh_tally_log2(&filter->fo_filter_stats.hist[BRW_W_DISK_IOSIZE], size);
+ lprocfs_oh_tally_log2(&filter->fo_filter_stats.hist[BRW_W_DISK_IOSIZE],
+ size);
lprocfs_oh_tally(&exp->exp_filter_data.fed_brw_stats.hist[BRW_W_RPC_HIST],
atomic_read(&filter->fo_w_in_flight));
lprocfs_oh_tally_log2(&exp->exp_filter_data.fed_brw_stats.hist[BRW_W_DISK_IOSIZE], size);
struct filter_iobuf *iobuf = bio->bi_private;
unsigned long flags;
+#ifdef HAVE_PAGE_CONSTANT
+ struct bio_vec *bvl;
+ int i;
+#endif
+
/* CAVEAT EMPTOR: possibly in IRQ context
* DO NOT record procfs stats here!!! */
CERROR("***** bio->bi_private is NULL! This should never "
"happen. Normally, I would crash here, but instead I "
"will dump the bio contents to the console. Please "
- "report this to CFS, along with any interesting "
- "messages leading up to this point (like SCSI errors, "
- "perhaps). Because bi_private is NULL, I can't wake up "
- "the thread that initiated this I/O -- so you will "
- "probably have to reboot this node.\n");
+ "report this to <http://bugzilla.lustre.org/> , along "
+ "with any interesting messages leading up to this point "
+ "(like SCSI errors, perhaps). Because bi_private is "
+ "NULL, I can't wake up the thread that initiated this "
+ "I/O -- so you will probably have to reboot this node.\n");
CERROR("bi_next: %p, bi_flags: %lx, bi_rw: %lu, bi_vcnt: %d, "
"bi_idx: %d, bi->size: %d, bi_end_io: %p, bi_cnt: %d, "
"bi_private: %p\n", bio->bi_next, bio->bi_flags,
return 0;
}
+#ifdef HAVE_PAGE_CONSTANT
+ bio_for_each_segment(bvl, bio, i)
+ ClearPageConstant(bvl->bv_page);
+#endif
+
spin_lock_irqsave(&iobuf->dr_lock, flags);
if (iobuf->dr_error == 0)
iobuf->dr_error = error;
spin_lock_init(&iobuf->dr_lock);
iobuf->dr_max_pages = num_pages;
iobuf->dr_npages = 0;
+ iobuf->dr_error = 0;
RETURN(iobuf);
/* Reset an iobuf: zero its page count (dr_npages), clear its stored
 * error code (dr_error) and set its request counter (dr_numreqs) to 0. */
static void filter_clear_iobuf(struct filter_iobuf *iobuf)
{
iobuf->dr_npages = 0;
+ iobuf->dr_error = 0;
atomic_set(&iobuf->dr_numreqs, 0);
}
continue;
}
- sector = blocks[block_idx + i] << sector_bits;
+ sector = (sector_t)blocks[block_idx + i] << sector_bits;
/* Additional contiguous file blocks? */
while (i + nblocks < blocks_per_page &&
- (sector + nblocks*(blocksize>>9)) ==
- (blocks[block_idx + i + nblocks] << sector_bits))
+ (sector + (nblocks << sector_bits)) ==
+ ((sector_t)blocks[block_idx + i + nblocks] <<
+ sector_bits))
nblocks++;
+#ifdef HAVE_PAGE_CONSTANT
+ /* Mark the page constant only if it is mapped to
+ * contiguous underlying disk block(s).
+ * This ensures that the corresponding raid5 device
+ * cache will be overwritten by this page.
+ * - jay */
+ if ((rw == OBD_BRW_WRITE) &&
+ (nblocks == blocks_per_page) &&
+ mapping_cap_page_constant_write(inode->i_mapping))
+ SetPageConstant(page);
+#endif
+
if (bio != NULL &&
can_be_merged(bio, sector) &&
bio_add_page(bio, page,
wait_event(iobuf->dr_wait, atomic_read(&iobuf->dr_numreqs) == 0);
if (rw == OBD_BRW_READ) {
- lprocfs_oh_tally(&obd->u.filter.fo_filter_stats.hist[BRW_R_DIO_FRAGS], frags);
+ lprocfs_oh_tally(&obd->u.filter.fo_filter_stats.hist[BRW_R_DIO_FRAGS],
+ frags);
lprocfs_oh_tally(&exp->exp_filter_data.fed_brw_stats.hist[BRW_R_DIO_FRAGS],
frags);
lprocfs_oh_tally_log2(&obd->u.filter.fo_filter_stats.hist[BRW_R_IO_TIME],
jiffies - start_time);
- lprocfs_oh_tally_log2(&exp->exp_filter_data.fed_brw_stats.hist[BRW_R_IO_TIME],
- jiffies - start_time);
+ lprocfs_oh_tally_log2(&exp->exp_filter_data.fed_brw_stats.hist[BRW_R_IO_TIME], jiffies - start_time);
+ if (exp->exp_nid_stats && exp->exp_nid_stats->nid_brw_stats) {
+ lprocfs_oh_tally(&exp->exp_nid_stats->nid_brw_stats->hist[BRW_R_DIO_FRAGS],
+ frags);
+ lprocfs_oh_tally_log2(&exp->exp_nid_stats->nid_brw_stats->hist[BRW_R_IO_TIME],
+ jiffies - start_time);
+ }
} else {
- lprocfs_oh_tally(&obd->u.filter.fo_filter_stats.hist[BRW_W_DIO_FRAGS], frags);
+ lprocfs_oh_tally(&obd->u.filter.fo_filter_stats.hist[BRW_W_DIO_FRAGS],
+ frags);
lprocfs_oh_tally(&exp->exp_filter_data.fed_brw_stats.hist[BRW_W_DIO_FRAGS],
frags);
lprocfs_oh_tally_log2(&obd->u.filter.fo_filter_stats.hist[BRW_W_IO_TIME],
jiffies - start_time);
- lprocfs_oh_tally_log2(&exp->exp_filter_data.fed_brw_stats.hist[BRW_W_IO_TIME],
- jiffies - start_time);
+ lprocfs_oh_tally_log2(&exp->exp_filter_data.fed_brw_stats.hist[BRW_W_IO_TIME], jiffies - start_time);
+ if (exp->exp_nid_stats && exp->exp_nid_stats->nid_brw_stats) {
+ lprocfs_oh_tally(&exp->exp_nid_stats->nid_brw_stats->hist[BRW_W_DIO_FRAGS],
+ frags);
+ lprocfs_oh_tally_log2(&exp->exp_nid_stats->nid_brw_stats->hist[BRW_W_IO_TIME],
+ jiffies - start_time);
+ }
}
if (rc == 0)
if (!locked)
LOCK_INODE_MUTEX(inode);
if (inode->i_mapping->nrpages) {
+#ifdef PF_SYNCWRITE
current->flags |= PF_SYNCWRITE;
+#endif
rc = filemap_fdatawrite(inode->i_mapping);
if (rc == 0)
rc = filemap_fdatawait(inode->i_mapping);
+#ifdef PF_SYNCWRITE
current->flags &= ~PF_SYNCWRITE;
+#endif
}
if (!locked)
UNLOCK_INODE_MUTEX(inode);
return rc;
}
-
/* Clear pages from the mapping before we do direct IO to that offset.
* Now that the only source of such pages in the truncate path flushes
* these pages to disk and then discards them, this is an error condition.
int rc;
/* Truncate on page boundary, so nothing to flush? */
- if (!(inode->i_size & ~CFS_PAGE_MASK))
+ if (!(i_size_read(inode) & ~CFS_PAGE_MASK))
return 0;
rc = filter_sync_inode_data(inode, 1);
/* be careful to call this after fsync_inode_data_buffers has waited
* for IO to complete before we evict it from the cache */
page = find_lock_page(inode->i_mapping,
- inode->i_size >> CFS_PAGE_SHIFT);
+ i_size_read(inode) >> CFS_PAGE_SHIFT);
if (page) {
if (page->mapping != NULL) {
wait_on_page_writeback(page);
create = 1;
sem = &obd->u.filter.fo_alloc_lock;
- lquota_enforce(filter_quota_interface_ref, obd,
- iobuf->dr_ignore_quota);
+ lquota_enforce(filter_quota_interface_ref, obd, iobuf->dr_ignore_quota);
}
-remap:
+
rc = fsfilt_map_inode_pages(obd, inode, iobuf->dr_pages,
iobuf->dr_npages, iobuf->dr_blocks,
obdfilter_created_scratchpad, create, sem);
- if (rc == -EDQUOT) {
- LASSERT(rw == OBD_BRW_WRITE &&
- !cap_raised(current->cap_effective, CAP_SYS_RESOURCE));
-
- /* Unfortunately, if quota master is too busy to handle the
- * pre-dqacq in time or this user has exceeded quota limit, we
- * have to wait for the completion of in flight dqacq/dqrel,
- * then try again */
- if (lquota_acquire(filter_quota_interface_ref, obd,
- inode->i_uid, inode->i_gid))
- goto remap;
- }
-
if (rw == OBD_BRW_WRITE) {
if (rc == 0) {
- filter_tally_write(exp, iobuf->dr_pages,
- iobuf->dr_npages, iobuf->dr_blocks,
- blocks_per_page);
- if (attr->ia_size > inode->i_size)
+ filter_tally(exp, iobuf->dr_pages,
+ iobuf->dr_npages, iobuf->dr_blocks,
+ blocks_per_page, 1);
+ if (attr->ia_size > i_size_read(inode))
attr->ia_valid |= ATTR_SIZE;
rc = fsfilt_setattr(obd, dchild,
oti->oti_handle, attr, 0);
UNLOCK_INODE_MUTEX(inode);
- rc2 = filter_finish_transno(exp, oti, 0);
+ rc2 = filter_finish_transno(exp, oti, 0, 0);
if (rc2 != 0) {
CERROR("can't close transaction: %d\n", rc2);
if (rc == 0)
rc = rc2;
}
- rc2 =fsfilt_commit_async(obd,inode,oti->oti_handle,wait_handle);
+ rc2 = fsfilt_commit_async(obd,inode,oti->oti_handle,
+ wait_handle);
if (rc == 0)
rc = rc2;
if (rc != 0)
RETURN(rc);
} else if (rc == 0) {
- filter_tally_read(exp, iobuf->dr_pages,
- iobuf->dr_npages, iobuf->dr_blocks,
- blocks_per_page);
+ filter_tally(exp, iobuf->dr_pages, iobuf->dr_npages,
+ iobuf->dr_blocks, blocks_per_page, 0);
}
rc = filter_clear_page_cache(inode, iobuf);
int i, err, cleanup_phase = 0;
struct obd_device *obd = exp->exp_obd;
void *wait_handle;
- int total_size = 0;
+ int total_size = 0, rc2;
unsigned int qcids[MAXQUOTAS] = {0, 0};
ENTRY;
if (rc != 0)
GOTO(cleanup, rc);
+ /* Unfortunately, if quota master is too busy to handle the
+ * pre-dqacq in time and quota hash on ost is used up, we
+ * have to wait for the completion of in flight dqacq/dqrel,
+ * then try again */
+ if ((rc2 = lquota_chkquota(filter_quota_interface_ref, obd, oa->o_uid,
+ oa->o_gid, niocount)) == QUOTA_RET_ACQUOTA) {
+ OBD_FAIL_TIMEOUT(OBD_FAIL_OST_HOLD_WRITE_RPC, 90);
+ lquota_acquire(filter_quota_interface_ref, obd, oa->o_uid,
+ oa->o_gid);
+ }
+
+ if (rc2 < 0) {
+ rc = rc2;
+ GOTO(cleanup, rc);
+ }
+
iobuf = filter_iobuf_get(&obd->u.filter, oti);
if (IS_ERR(iobuf))
GOTO(cleanup, rc = PTR_ERR(iobuf));
DQUOT_INIT(inode);
LOCK_INODE_MUTEX(inode);
- fsfilt_check_slow(obd, now, obd_timeout, "i_mutex");
+ fsfilt_check_slow(obd, now, "i_mutex");
oti->oti_handle = fsfilt_brw_start(obd, objcount, &fso, niocount, res,
oti);
if (IS_ERR(oti->oti_handle)) {
}
/* have to call fsfilt_commit() from this point on */
- fsfilt_check_slow(obd, now, obd_timeout, "brw_start");
+ fsfilt_check_slow(obd, now, "brw_start");
i = OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME;
CDEBUG(D_INODE, "update UID/GID to %lu/%lu\n",
(unsigned long)oa->o_uid, (unsigned long)oa->o_gid);
- cap_raise(current->cap_effective, CAP_SYS_RESOURCE);
+ cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
iattr.ia_valid |= ATTR_MODE;
iattr.ia_mode = inode->i_mode;
lquota_getflag(filter_quota_interface_ref, obd, oa);
- fsfilt_check_slow(obd, now, obd_timeout, "direct_io");
+ fsfilt_check_slow(obd, now, "direct_io");
err = fsfilt_commit_wait(obd, inode, wait_handle);
if (err) {
"oti_transno "LPU64" last_committed "LPU64"\n",
oti->oti_transno, obd->obd_last_committed);
- fsfilt_check_slow(obd, now, obd_timeout, "commitrw commit");
+ fsfilt_check_slow(obd, now, "commitrw commit");
cleanup:
filter_grant_commit(exp, niocount, res);