Whamcloud - gitweb
b=15266
[fs/lustre-release.git] / lustre / obdfilter / filter_io_26.c
index 73bb316..e96513c 100644 (file)
@@ -1,33 +1,46 @@
 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
  * vim:expandtab:shiftwidth=8:tabstop=8:
  *
- *  linux/fs/obdfilter/filter_io.c
+ * GPL HEADER START
  *
- *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
- *   Author: Peter Braam <braam@clusterfs.com>
- *   Author: Andreas Dilger <adilger@clusterfs.com>
- *   Author: Phil Schwan <phil@clusterfs.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
- *   This file is part of the Lustre file system, http://www.lustre.org
- *   Lustre is a trademark of Cluster File Systems, Inc.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
  *
- *   You may have signed or agreed to another license before downloading
- *   this software.  If so, you are bound by the terms and conditions
- *   of that agreement, and the following does not apply to you.  See the
- *   LICENSE file included with this distribution for more information.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
  *
- *   If you did not agree to a different license, then this copy of Lustre
- *   is open source software; you can redistribute it and/or modify it
- *   under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
  *
- *   In either case, Lustre is distributed in the hope that it will be
- *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
- *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   license text for more details.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdfilter/filter_io_26.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
  */
 
-#ifdef HAVE_KERNEL_CONFIG_H
+#ifndef AUTOCONF_INCLUDED
 #include <linux/config.h>
 #endif
 #include <linux/module.h>
@@ -68,7 +81,8 @@ static void record_start_io(struct filter_iobuf *iobuf, int rw, int size,
                 atomic_inc(&filter->fo_r_in_flight);
                 lprocfs_oh_tally(&filter->fo_filter_stats.hist[BRW_R_RPC_HIST],
                                  atomic_read(&filter->fo_r_in_flight));
-                lprocfs_oh_tally_log2(&filter->fo_filter_stats.hist[BRW_R_DISK_IOSIZE], size);
+                lprocfs_oh_tally_log2(&filter->fo_filter_stats.hist[BRW_R_DISK_IOSIZE],
+                                      size);
                 lprocfs_oh_tally(&exp->exp_filter_data.fed_brw_stats.hist[BRW_R_RPC_HIST],
                                  atomic_read(&filter->fo_r_in_flight));
                 lprocfs_oh_tally_log2(&exp->exp_filter_data.fed_brw_stats.hist[BRW_R_DISK_IOSIZE], size);
@@ -76,7 +90,8 @@ static void record_start_io(struct filter_iobuf *iobuf, int rw, int size,
                 atomic_inc(&filter->fo_w_in_flight);
                 lprocfs_oh_tally(&filter->fo_filter_stats.hist[BRW_W_RPC_HIST],
                                  atomic_read(&filter->fo_w_in_flight));
-                lprocfs_oh_tally_log2(&filter->fo_filter_stats.hist[BRW_W_DISK_IOSIZE], size);
+                lprocfs_oh_tally_log2(&filter->fo_filter_stats.hist[BRW_W_DISK_IOSIZE],
+                                      size);
                 lprocfs_oh_tally(&exp->exp_filter_data.fed_brw_stats.hist[BRW_W_RPC_HIST],
                                  atomic_read(&filter->fo_w_in_flight));
                 lprocfs_oh_tally_log2(&exp->exp_filter_data.fed_brw_stats.hist[BRW_W_DISK_IOSIZE], size);
@@ -104,6 +119,11 @@ static int dio_complete_routine(struct bio *bio, unsigned int done, int error)
         struct filter_iobuf *iobuf = bio->bi_private;
         unsigned long        flags;
 
+#ifdef HAVE_PAGE_CONSTANT
+        struct bio_vec *bvl;
+        int i;
+#endif
+
         /* CAVEAT EMPTOR: possibly in IRQ context 
          * DO NOT record procfs stats here!!! */
 
@@ -114,11 +134,11 @@ static int dio_complete_routine(struct bio *bio, unsigned int done, int error)
                 CERROR("***** bio->bi_private is NULL!  This should never "
                        "happen.  Normally, I would crash here, but instead I "
                        "will dump the bio contents to the console.  Please "
-                       "report this to CFS, along with any interesting "
-                       "messages leading up to this point (like SCSI errors, "
-                       "perhaps).  Because bi_private is NULL, I can't wake up "
-                       "the thread that initiated this I/O -- so you will "
-                       "probably have to reboot this node.\n");
+                       "report this to <http://bugzilla.lustre.org/> , along "
+                       "with any interesting messages leading up to this point "
+                       "(like SCSI errors, perhaps).  Because bi_private is "
+                       "NULL, I can't wake up the thread that initiated this "
+                       "I/O -- so you will probably have to reboot this node.\n");
                 CERROR("bi_next: %p, bi_flags: %lx, bi_rw: %lu, bi_vcnt: %d, "
                        "bi_idx: %d, bi->size: %d, bi_end_io: %p, bi_cnt: %d, "
                        "bi_private: %p\n", bio->bi_next, bio->bi_flags,
@@ -128,6 +148,11 @@ static int dio_complete_routine(struct bio *bio, unsigned int done, int error)
                 return 0;
         }
 
+#ifdef HAVE_PAGE_CONSTANT
+        bio_for_each_segment(bvl, bio, i)
+                ClearPageConstant(bvl->bv_page);
+#endif
+
         spin_lock_irqsave(&iobuf->dr_lock, flags);
         if (iobuf->dr_error == 0)
                 iobuf->dr_error = error;
@@ -182,6 +207,7 @@ struct filter_iobuf *filter_alloc_iobuf(struct filter_obd *filter,
         spin_lock_init(&iobuf->dr_lock);
         iobuf->dr_max_pages = num_pages;
         iobuf->dr_npages = 0;
+        iobuf->dr_error = 0;
 
         RETURN(iobuf);
 
@@ -197,6 +223,7 @@ struct filter_iobuf *filter_alloc_iobuf(struct filter_obd *filter,
 static void filter_clear_iobuf(struct filter_iobuf *iobuf)
 {
         iobuf->dr_npages = 0;
+        iobuf->dr_error = 0;
         atomic_set(&iobuf->dr_numreqs, 0);
 }
 
@@ -285,14 +312,27 @@ int filter_do_bio(struct obd_export *exp, struct inode *inode,
                                 continue;
                         }
 
-                        sector = blocks[block_idx + i] << sector_bits;
+                        sector = (sector_t)blocks[block_idx + i] << sector_bits;
 
                         /* Additional contiguous file blocks? */
                         while (i + nblocks < blocks_per_page &&
-                               (sector + nblocks*(blocksize>>9)) ==
-                               (blocks[block_idx + i + nblocks] << sector_bits))
+                               (sector + (nblocks << sector_bits)) ==
+                               ((sector_t)blocks[block_idx + i + nblocks] <<
+                                sector_bits))
                                 nblocks++;
 
+#ifdef HAVE_PAGE_CONSTANT
+                        /* Mark the page constant only if the whole page maps
+                         * to contiguous underlying disk blocks.  This makes
+                         * sure the corresponding raid5 device cache is
+                         * overwritten by this page.
+                         * - jay */
+                        if ((rw == OBD_BRW_WRITE) && 
+                            (nblocks == blocks_per_page) && 
+                            mapping_cap_page_constant_write(inode->i_mapping))
+                               SetPageConstant(page);
+#endif
+
                         if (bio != NULL &&
                             can_be_merged(bio, sector) &&
                             bio_add_page(bio, page,
@@ -362,21 +402,33 @@ int filter_do_bio(struct obd_export *exp, struct inode *inode,
         wait_event(iobuf->dr_wait, atomic_read(&iobuf->dr_numreqs) == 0);
 
         if (rw == OBD_BRW_READ) {
-                lprocfs_oh_tally(&obd->u.filter.fo_filter_stats.hist[BRW_R_DIO_FRAGS], frags);
+                lprocfs_oh_tally(&obd->u.filter.fo_filter_stats.hist[BRW_R_DIO_FRAGS],
+                                 frags);
                 lprocfs_oh_tally(&exp->exp_filter_data.fed_brw_stats.hist[BRW_R_DIO_FRAGS],
                                  frags);
                 lprocfs_oh_tally_log2(&obd->u.filter.fo_filter_stats.hist[BRW_R_IO_TIME],
                                       jiffies - start_time);
-                lprocfs_oh_tally_log2(&exp->exp_filter_data.fed_brw_stats.hist[BRW_R_IO_TIME],
-                                 jiffies - start_time);
+                lprocfs_oh_tally_log2(&exp->exp_filter_data.fed_brw_stats.hist[BRW_R_IO_TIME], jiffies - start_time);
+                if (exp->exp_nid_stats && exp->exp_nid_stats->nid_brw_stats) {
+                        lprocfs_oh_tally(&exp->exp_nid_stats->nid_brw_stats->hist[BRW_R_DIO_FRAGS],
+                                         frags);
+                        lprocfs_oh_tally_log2(&exp->exp_nid_stats->nid_brw_stats->hist[BRW_R_IO_TIME],
+                                              jiffies - start_time);
+                }
         } else {
-                lprocfs_oh_tally(&obd->u.filter.fo_filter_stats.hist[BRW_W_DIO_FRAGS], frags);
+                lprocfs_oh_tally(&obd->u.filter.fo_filter_stats.hist[BRW_W_DIO_FRAGS],
+                                 frags);
                 lprocfs_oh_tally(&exp->exp_filter_data.fed_brw_stats.hist[BRW_W_DIO_FRAGS],
                                  frags);
                 lprocfs_oh_tally_log2(&obd->u.filter.fo_filter_stats.hist[BRW_W_IO_TIME],
                                       jiffies - start_time);
-                lprocfs_oh_tally_log2(&exp->exp_filter_data.fed_brw_stats.hist[BRW_W_IO_TIME],
-                                 jiffies - start_time);
+                lprocfs_oh_tally_log2(&exp->exp_filter_data.fed_brw_stats.hist[BRW_W_IO_TIME], jiffies - start_time);
+                if (exp->exp_nid_stats && exp->exp_nid_stats->nid_brw_stats) {
+                        lprocfs_oh_tally(&exp->exp_nid_stats->nid_brw_stats->hist[BRW_W_DIO_FRAGS],
+                                         frags);
+                        lprocfs_oh_tally_log2(&exp->exp_nid_stats->nid_brw_stats->hist[BRW_W_IO_TIME],
+                                              jiffies - start_time);
+                }
         }
 
         if (rc == 0)
@@ -407,18 +459,21 @@ static int filter_sync_inode_data(struct inode *inode, int locked)
         if (!locked)
                 LOCK_INODE_MUTEX(inode);
         if (inode->i_mapping->nrpages) {
+#ifdef PF_SYNCWRITE
                 current->flags |= PF_SYNCWRITE;
+#endif
                 rc = filemap_fdatawrite(inode->i_mapping);
                 if (rc == 0)
                         rc = filemap_fdatawait(inode->i_mapping);
+#ifdef PF_SYNCWRITE
                 current->flags &= ~PF_SYNCWRITE;
+#endif
         }
         if (!locked)
                 UNLOCK_INODE_MUTEX(inode);
 
         return rc;
 }
-
 /* Clear pages from the mapping before we do direct IO to that offset.
  * Now that the only source of such pages in the truncate path flushes
  * these pages to disk and then discards them, this is error condition.
@@ -461,7 +516,7 @@ int filter_clear_truncated_page(struct inode *inode)
         int rc;
 
         /* Truncate on page boundary, so nothing to flush? */
-        if (!(inode->i_size & ~CFS_PAGE_MASK))
+        if (!(i_size_read(inode) & ~CFS_PAGE_MASK))
                 return 0;
 
         rc = filter_sync_inode_data(inode, 1);
@@ -471,7 +526,7 @@ int filter_clear_truncated_page(struct inode *inode)
         /* be careful to call this after fsync_inode_data_buffers has waited
          * for IO to complete before we evict it from the cache */
         page = find_lock_page(inode->i_mapping,
-                              inode->i_size >> CFS_PAGE_SHIFT);
+                              i_size_read(inode) >> CFS_PAGE_SHIFT);
         if (page) {
                 if (page->mapping != NULL) {
                         wait_on_page_writeback(page);
@@ -511,8 +566,7 @@ int filter_direct_io(int rw, struct dentry *dchild, struct filter_iobuf *iobuf,
                 create = 1;
                 sem = &obd->u.filter.fo_alloc_lock;
 
-                lquota_enforce(filter_quota_interface_ref, obd,
-                               iobuf->dr_ignore_quota);
+                lquota_enforce(filter_quota_interface_ref, obd, iobuf->dr_ignore_quota);
         }
 
         rc = fsfilt_map_inode_pages(obd, inode, iobuf->dr_pages,
@@ -521,10 +575,10 @@ int filter_direct_io(int rw, struct dentry *dchild, struct filter_iobuf *iobuf,
 
         if (rw == OBD_BRW_WRITE) {
                 if (rc == 0) {
-                        filter_tally_write(exp, iobuf->dr_pages,
-                                           iobuf->dr_npages, iobuf->dr_blocks,
-                                           blocks_per_page);
-                        if (attr->ia_size > inode->i_size)
+                        filter_tally(exp, iobuf->dr_pages,
+                                     iobuf->dr_npages, iobuf->dr_blocks,
+                                     blocks_per_page, 1);
+                        if (attr->ia_size > i_size_read(inode))
                                 attr->ia_valid |= ATTR_SIZE;
                         rc = fsfilt_setattr(obd, dchild,
                                             oti->oti_handle, attr, 0);
@@ -532,22 +586,22 @@ int filter_direct_io(int rw, struct dentry *dchild, struct filter_iobuf *iobuf,
 
                 UNLOCK_INODE_MUTEX(inode);
 
-                rc2 = filter_finish_transno(exp, oti, 0);
+                rc2 = filter_finish_transno(exp, oti, 0, 0);
                 if (rc2 != 0) {
                         CERROR("can't close transaction: %d\n", rc2);
                         if (rc == 0)
                                 rc = rc2;
                 }
 
-                rc2 =fsfilt_commit_async(obd,inode,oti->oti_handle,wait_handle);
+                rc2 = fsfilt_commit_async(obd,inode,oti->oti_handle,
+                                          wait_handle);
                 if (rc == 0)
                         rc = rc2;
                 if (rc != 0)
                         RETURN(rc);
         } else if (rc == 0) {
-                filter_tally_read(exp, iobuf->dr_pages,
-                                  iobuf->dr_npages, iobuf->dr_blocks,
-                                  blocks_per_page);
+                filter_tally(exp, iobuf->dr_pages, iobuf->dr_npages,
+                             iobuf->dr_blocks, blocks_per_page, 0);
         }
 
         rc = filter_clear_page_cache(inode, iobuf);
@@ -667,7 +721,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa,
         DQUOT_INIT(inode);
 
         LOCK_INODE_MUTEX(inode);
-        fsfilt_check_slow(obd, now, obd_timeout, "i_mutex");
+        fsfilt_check_slow(obd, now, "i_mutex");
         oti->oti_handle = fsfilt_brw_start(obd, objcount, &fso, niocount, res,
                                            oti);
         if (IS_ERR(oti->oti_handle)) {
@@ -680,7 +734,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa,
         }
         /* have to call fsfilt_commit() from this point on */
 
-        fsfilt_check_slow(obd, now, obd_timeout, "brw_start");
+        fsfilt_check_slow(obd, now, "brw_start");
 
         i = OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME;
 
@@ -700,7 +754,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa,
                 CDEBUG(D_INODE, "update UID/GID to %lu/%lu\n",
                        (unsigned long)oa->o_uid, (unsigned long)oa->o_gid);
 
-                cap_raise(current->cap_effective, CAP_SYS_RESOURCE);
+                cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
 
                 iattr.ia_valid |= ATTR_MODE;
                 iattr.ia_mode = inode->i_mode;
@@ -732,7 +786,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa,
 
         lquota_getflag(filter_quota_interface_ref, obd, oa);
 
-        fsfilt_check_slow(obd, now, obd_timeout, "direct_io");
+        fsfilt_check_slow(obd, now, "direct_io");
 
         err = fsfilt_commit_wait(obd, inode, wait_handle);
         if (err) {
@@ -745,7 +799,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa,
                          "oti_transno "LPU64" last_committed "LPU64"\n",
                          oti->oti_transno, obd->obd_last_committed);
 
-        fsfilt_check_slow(obd, now, obd_timeout, "commitrw commit");
+        fsfilt_check_slow(obd, now, "commitrw commit");
 
 cleanup:
         filter_grant_commit(exp, niocount, res);