Whamcloud - gitweb
LU-848 clio: page writeback support
[fs/lustre-release.git] / lustre / osc / osc_io.c
index 6190433..055f8fb 100644 (file)
  * GPL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
+ *
+ * Copyright (c) 2011 Whamcloud, Inc.
+ *
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
@@ -97,16 +100,9 @@ static void osc_io_unplug(const struct lu_env *env, struct osc_object *osc,
 {
         loi_list_maint(cli, osc->oo_oinfo);
         osc_check_rpcs(env, cli);
-        client_obd_list_unlock(&cli->cl_loi_list_lock);
 }
 
 /**
- * How many pages osc_io_submit() queues before checking whether an RPC is
- * ready.
- */
-#define OSC_QUEUE_GRAIN (32)
-
-/**
  * An implementation of cl_io_operations::cio_io_submit() method for osc
  * layer. Iterates over pages in the in-queue, prepares each for io by calling
  * cl_page_prep() and then either submits them through osc_io_submit_page()
@@ -133,7 +129,7 @@ static int osc_io_submit(const struct lu_env *env,
 
         LASSERT(qin->pl_nr > 0);
 
-        CDEBUG(D_INFO, "%i %i\n", qin->pl_nr, crt);
+        CDEBUG(D_INFO, "%d %d\n", qin->pl_nr, crt);
         /*
          * NOTE: here @page is a top-level page. This is done to avoid
          *       creation of sub-page-list.
@@ -156,26 +152,24 @@ static int osc_io_submit(const struct lu_env *env,
                         oap->oap_async_flags |= ASYNC_HP;
                         cfs_spin_unlock(&oap->oap_lock);
                 }
-                /*
-                 * This can be checked without cli->cl_loi_list_lock, because
-                 * ->oap_*_item are always manipulated when the page is owned.
-                 */
-                if (!cfs_list_empty(&oap->oap_urgent_item) ||
-                    !cfs_list_empty(&oap->oap_rpc_item)) {
-                        result = -EBUSY;
-                        break;
-                }
 
                 if (osc0 == NULL) { /* first iteration */
                         cli = &exp->exp_obd->u.cli;
                         osc0 = osc;
+                        client_obd_list_lock(&cli->cl_loi_list_lock);
                 } else /* check that all pages are against the same object
                         * (for now) */
                         LASSERT(osc == osc0);
-                if (queued++ == 0)
-                        client_obd_list_lock(&cli->cl_loi_list_lock);
+
+                if (!cfs_list_empty(&oap->oap_urgent_item) ||
+                    !cfs_list_empty(&oap->oap_rpc_item)) {
+                        result = -EBUSY;
+                        break;
+                }
+
                 result = cl_page_prep(env, io, page, crt);
                 if (result == 0) {
+                        ++queued;
                         cl_page_list_move(qout, qin, page);
                         if (cfs_list_empty(&oap->oap_pending_item)) {
                                 osc_io_submit_page(env, cl2osc_io(env, ios),
@@ -187,7 +181,7 @@ static int osc_io_submit(const struct lu_env *env,
                                                                   OSC_FLAGS);
                                 /*
                                  * bug 18881: we can't just break out here when
-                                 * error occurrs after cl_page_prep has been
+                                 * error occurs after cl_page_prep has been
                                  * called against the page. The correct
                                  * way is to call page's completion routine,
                                  * as in osc_oap_interrupted.  For simplicity,
@@ -208,30 +202,18 @@ static int osc_io_submit(const struct lu_env *env,
                          */
                         result = 0;
                 }
+
                 /*
-                 * Don't keep client_obd_list_lock() for too long.
-                 *
-                 * XXX client_obd_list lock has to be unlocked periodically to
-                 * avoid soft-lockups that tend to happen otherwise (see bug
-                 * 16651). On the other hand, osc_io_submit_page() queues a
-                 * page with ASYNC_URGENT flag and so all pages queued up
-                 * until this point are sent out immediately by
-                 * osc_io_unplug() resulting in sub-optimal RPCs (sub-optimal
-                 * RPCs only happen during `warm up' phase when less than
-                 * cl_max_rpcs_in_flight RPCs are in flight). To balance these
-                 * conflicting requirements, one might unplug once enough
-                 * pages to form a large RPC were queued (i.e., use
-                 * cli->cl_max_pages_per_rpc as OSC_QUEUE_GRAIN, see
-                 * lop_makes_rpc()), or ignore soft-lockup issue altogether.
+                 * We might hold client_obd_list_lock() for too long and cause
+                 * soft-lockups (see bug 16651). But on the other hand, pages
+                 * are queued here with ASYNC_URGENT flag, thus will be sent
+                 * out immediately once osc_io_unplug() be called, possibly
+                 * resulting sub-optimal RPCs.
                  *
-                 * XXX lock_need_resched() should be used here, but it is not
-                 * available in the older of supported kernels.
+                 * We think creating optimal-sized RPCs is more important than
+                 * avoiding the transient soft-lockups, plus I believe the
+                 * soft-locks only happen in full debug testing.
                  */
-                if (queued > OSC_QUEUE_GRAIN || cfs_need_resched()) {
-                        queued = 0;
-                        osc_io_unplug(env, osc, cli);
-                        cfs_cond_resched();
-                }
         }
 
         LASSERT(ergo(result == 0, cli != NULL));
@@ -239,7 +221,9 @@ static int osc_io_submit(const struct lu_env *env,
 
         if (queued > 0)
                 osc_io_unplug(env, osc, cli);
-        CDEBUG(D_INFO, "%i/%i %i\n", qin->pl_nr, qout->pl_nr, result);
+        if (osc0)
+                client_obd_list_unlock(&cli->cl_loi_list_lock);
+        CDEBUG(D_INFO, "%d/%d %d\n", qin->pl_nr, qout->pl_nr, result);
         return qout->pl_nr > 0 ? 0 : result;
 }
 
@@ -317,14 +301,24 @@ static int osc_io_prepare_write(const struct lu_env *env,
 {
         struct osc_device *dev = lu2osc_dev(slice->cpl_obj->co_lu.lo_dev);
         struct obd_import *imp = class_exp2cliimp(dev->od_exp);
-
+        struct osc_io     *oio = cl2osc_io(env, ios);
+        int result = 0;
         ENTRY;
 
         /*
          * This implements OBD_BRW_CHECK logic from old client.
          */
 
-        RETURN(imp == NULL || imp->imp_invalid ? -EIO : 0);
+        if (imp == NULL || imp->imp_invalid)
+                result = -EIO;
+        if (result == 0 && oio->oi_lockless)
+                /* this page contains `invalid' data, but who cares?
+                 * nobody can access the invalid data.
+                 * in osc_io_commit_write(), we're going to write exact
+                 * [from, to) bytes of this page to OST. -jay */
+                cl_page_export(env, slice->cpl_page, 1);
+
+        RETURN(result);
 }
 
 static int osc_io_commit_write(const struct lu_env *env,
@@ -332,6 +326,7 @@ static int osc_io_commit_write(const struct lu_env *env,
                                const struct cl_page_slice *slice,
                                unsigned from, unsigned to)
 {
+        struct osc_io         *oio = cl2osc_io(env, ios);
         struct osc_page       *opg = cl2osc_page(slice);
         struct osc_object     *obj = cl2osc(opg->ops_cl.cpl_obj);
         struct osc_async_page *oap = &opg->ops_oap;
@@ -349,6 +344,10 @@ static int osc_io_commit_write(const struct lu_env *env,
             cfs_capable(CFS_CAP_SYS_RESOURCE))
                 oap->oap_brw_flags |= OBD_BRW_NOQUOTA;
 
+        if (oio->oi_lockless)
+                /* see osc_io_prepare_write() for lockless io handling. */
+                cl_page_clip(env, slice->cpl_page, from, to);
+
         RETURN(0);
 }
 
@@ -362,7 +361,7 @@ static int osc_io_fault_start(const struct lu_env *env,
 
         io  = ios->cis_io;
         fio = &io->u.ci_fault;
-        CDEBUG(D_INFO, "%lu %i %i\n",
+        CDEBUG(D_INFO, "%lu %d %d\n",
                fio->ft_index, fio->ft_writable, fio->ft_nob);
         /*
          * If mapping is writeable, adjust kms to cover this page,
@@ -375,9 +374,9 @@ static int osc_io_fault_start(const struct lu_env *env,
         RETURN(0);
 }
 
-static int osc_punch_upcall(void *a, int rc)
+static int osc_setattr_upcall(void *a, int rc)
 {
-        struct osc_punch_cbargs *args = a;
+        struct osc_setattr_cbargs *args = a;
 
         args->opc_rc = rc;
         cfs_complete(&args->opc_sync);
@@ -414,7 +413,7 @@ static void osc_trunc_check(const struct lu_env *env, struct cl_io *io,
          * XXX this is quite expensive check.
          */
         cl_page_list_init(list);
-        cl_page_gang_lookup(env, clob, io, start + partial, CL_PAGE_EOF, list, 0);
+        cl_page_gang_lookup(env, clob, io, start + partial, CL_PAGE_EOF, list);
 
         cl_page_list_for_each(page, list)
                 CL_PAGE_DEBUG(D_ERROR, env, page, "exists %lu\n", start);
@@ -433,7 +432,7 @@ static void osc_trunc_check(const struct lu_env *env, struct cl_io *io,
                         /*
                          * XXX Linux specific debugging stuff.
                          */
-                        CL_PAGE_DEBUG(D_ERROR, env, page, "%s/%i %lu\n",
+                        CL_PAGE_DEBUG(D_ERROR, env, page, "%s/%d %lu\n",
                                       submitter->comm, submitter->pid, start);
                         libcfs_debug_dumpstack(submitter);
                 }
@@ -444,8 +443,8 @@ static void osc_trunc_check(const struct lu_env *env, struct cl_io *io,
 # define osc_trunc_check(env, io, oio, size) do {;} while (0)
 #endif
 
-static int osc_io_trunc_start(const struct lu_env *env,
-                              const struct cl_io_slice *slice)
+static int osc_io_setattr_start(const struct lu_env *env,
+                                const struct cl_io_slice *slice)
 {
         struct cl_io            *io     = slice->cis_io;
         struct osc_io           *oio    = cl2osc_io(env, slice);
@@ -453,56 +452,86 @@ static int osc_io_trunc_start(const struct lu_env *env,
         struct lov_oinfo        *loi    = cl2osc(obj)->oo_oinfo;
         struct cl_attr          *attr   = &osc_env_info(env)->oti_attr;
         struct obdo             *oa     = &oio->oi_oa;
-        struct osc_punch_cbargs *cbargs = &oio->oi_punch_cbarg;
-        struct obd_capa         *capa;
-        loff_t                   size   = io->u.ci_truncate.tr_size;
+        struct osc_setattr_cbargs *cbargs = &oio->oi_setattr_cbarg;
+        loff_t                   size   = io->u.ci_setattr.sa_attr.lvb_size;
+        unsigned int             ia_valid = io->u.ci_setattr.sa_valid;
         int                      result = 0;
+        struct obd_info          oinfo = { { { 0 } } };
 
-        osc_trunc_check(env, io, oio, size);
+        if (ia_valid & ATTR_SIZE)
+                osc_trunc_check(env, io, oio, size);
 
         if (oio->oi_lockless == 0) {
                 cl_object_attr_lock(obj);
                 result = cl_object_attr_get(env, obj, attr);
                 if (result == 0) {
-                        attr->cat_size = attr->cat_kms = size;
-                        result = cl_object_attr_set(env, obj, attr,
-                                                    CAT_SIZE|CAT_KMS);
+                        unsigned int cl_valid = 0;
+
+                        if (ia_valid & ATTR_SIZE) {
+                                attr->cat_size = attr->cat_kms = size;
+                                cl_valid = (CAT_SIZE | CAT_KMS);
+                        }
+                        if (ia_valid & ATTR_MTIME_SET) {
+                                attr->cat_mtime = io->u.ci_setattr.sa_attr.lvb_mtime;
+                                cl_valid |= CAT_MTIME;
+                        }
+                        if (ia_valid & ATTR_ATIME_SET) {
+                                attr->cat_atime = io->u.ci_setattr.sa_attr.lvb_atime;
+                                cl_valid |= CAT_ATIME;
+                        }
+                        if (ia_valid & ATTR_CTIME_SET) {
+                                attr->cat_ctime = io->u.ci_setattr.sa_attr.lvb_ctime;
+                                cl_valid |= CAT_CTIME;
+                        }
+                        result = cl_object_attr_set(env, obj, attr, cl_valid);
                 }
                 cl_object_attr_unlock(obj);
         }
-
         memset(oa, 0, sizeof(*oa));
         if (result == 0) {
                 oa->o_id = loi->loi_id;
-                oa->o_gr = loi->loi_gr;
+                oa->o_seq = loi->loi_seq;
                 oa->o_mtime = attr->cat_mtime;
                 oa->o_atime = attr->cat_atime;
                 oa->o_ctime = attr->cat_ctime;
                 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLATIME |
                         OBD_MD_FLCTIME | OBD_MD_FLMTIME;
-                if (oio->oi_lockless) {
-                        oa->o_flags = OBD_FL_SRVLOCK;
-                        oa->o_valid |= OBD_MD_FLFLAGS;
+                if (ia_valid & ATTR_SIZE) {
+                        oa->o_size = size;
+                        oa->o_blocks = OBD_OBJECT_EOF;
+                        oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
+
+                        if (oio->oi_lockless) {
+                                oa->o_flags = OBD_FL_SRVLOCK;
+                                oa->o_valid |= OBD_MD_FLFLAGS;
+                        }
+                } else {
+                        LASSERT(oio->oi_lockless == 0);
                 }
-                oa->o_size = size;
-                oa->o_blocks = OBD_OBJECT_EOF;
-                oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
 
-                capa = io->u.ci_truncate.tr_capa;
+                oinfo.oi_oa = oa;
+                oinfo.oi_capa = io->u.ci_setattr.sa_capa;
                 cfs_init_completion(&cbargs->opc_sync);
-                result = osc_punch_base(osc_export(cl2osc(obj)), oa, capa,
-                                        osc_punch_upcall, cbargs, PTLRPCD_SET);
+
+                if (ia_valid & ATTR_SIZE)
+                        result = osc_punch_base(osc_export(cl2osc(obj)),
+                                                &oinfo, osc_setattr_upcall,
+                                                cbargs, PTLRPCD_SET);
+                else
+                        result = osc_setattr_async_base(osc_export(cl2osc(obj)),
+                                                        &oinfo, NULL,
+                                                        osc_setattr_upcall,
+                                                        cbargs, PTLRPCD_SET);
         }
         return result;
 }
 
-static void osc_io_trunc_end(const struct lu_env *env,
-                             const struct cl_io_slice *slice)
+static void osc_io_setattr_end(const struct lu_env *env,
+                               const struct cl_io_slice *slice)
 {
         struct cl_io            *io     = slice->cis_io;
         struct osc_io           *oio    = cl2osc_io(env, slice);
-        struct osc_punch_cbargs *cbargs = &oio->oi_punch_cbarg;
-        struct obdo             *oa     = &oio->oi_oa;
+        struct osc_setattr_cbargs *cbargs = &oio->oi_setattr_cbarg;
         int result;
 
         cfs_wait_for_completion(&cbargs->opc_sync);
@@ -510,56 +539,75 @@ static void osc_io_trunc_end(const struct lu_env *env,
         result = io->ci_result = cbargs->opc_rc;
         if (result == 0) {
                 struct cl_object *obj = slice->cis_obj;
-                if (oio->oi_lockless == 0) {
-                        struct cl_attr *attr = &osc_env_info(env)->oti_attr;
-                        int valid = 0;
-
-                        /* Update kms & size */
-                        if (oa->o_valid & OBD_MD_FLSIZE) {
-                                attr->cat_size = oa->o_size;
-                                attr->cat_kms  = oa->o_size;
-                                valid |= CAT_KMS|CAT_SIZE;
-                        }
-                        if (oa->o_valid & OBD_MD_FLBLOCKS) {
-                                attr->cat_blocks = oa->o_blocks;
-                                valid |= CAT_BLOCKS;
-                        }
-                        if (oa->o_valid & OBD_MD_FLMTIME) {
-                                attr->cat_mtime = oa->o_mtime;
-                                valid |= CAT_MTIME;
-                        }
-                        if (oa->o_valid & OBD_MD_FLCTIME) {
-                                attr->cat_ctime = oa->o_ctime;
-                                valid |= CAT_CTIME;
-                        }
-                        if (oa->o_valid & OBD_MD_FLATIME) {
-                                attr->cat_atime = oa->o_atime;
-                                valid |= CAT_ATIME;
-                        }
-                        cl_object_attr_lock(obj);
-                        result = cl_object_attr_set(env, obj, attr, valid);
-                        cl_object_attr_unlock(obj);
-                } else {  /* lockless truncate */
+                if (oio->oi_lockless) {
+                        /* lockless truncate */
                         struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev);
+
+                        LASSERT(cl_io_is_trunc(io));
                         /* XXX: Need a lock. */
                         osd->od_stats.os_lockless_truncates++;
                 }
         }
+}
+
+static int osc_io_read_start(const struct lu_env *env,
+                             const struct cl_io_slice *slice)
+{
+        struct osc_io    *oio   = cl2osc_io(env, slice);
+        struct cl_object *obj   = slice->cis_obj;
+        struct cl_attr   *attr  = &osc_env_info(env)->oti_attr;
+        int              result = 0;
+        ENTRY;
+
+        if (oio->oi_lockless == 0) {
+                cl_object_attr_lock(obj);
+                result = cl_object_attr_get(env, obj, attr);
+                if (result == 0) {
+                        attr->cat_atime = LTIME_S(CFS_CURRENT_TIME);
+                        result = cl_object_attr_set(env, obj, attr,
+                                                    CAT_ATIME);
+                }
+                cl_object_attr_unlock(obj);
+        }
+        RETURN(result);
+}
 
-        /* return result; */
+static int osc_io_write_start(const struct lu_env *env,
+                              const struct cl_io_slice *slice)
+{
+        struct osc_io    *oio   = cl2osc_io(env, slice);
+        struct cl_object *obj   = slice->cis_obj;
+        struct cl_attr   *attr  = &osc_env_info(env)->oti_attr;
+        int              result = 0;
+        ENTRY;
+
+        if (oio->oi_lockless == 0) {
+                cl_object_attr_lock(obj);
+                result = cl_object_attr_get(env, obj, attr);
+                if (result == 0) {
+                        attr->cat_mtime = attr->cat_ctime =
+                                LTIME_S(CFS_CURRENT_TIME);
+                        result = cl_object_attr_set(env, obj, attr,
+                                                    CAT_MTIME | CAT_CTIME);
+                }
+                cl_object_attr_unlock(obj);
+        }
+        RETURN(result);
 }
 
 static const struct cl_io_operations osc_io_ops = {
         .op = {
                 [CIT_READ] = {
+                        .cio_start  = osc_io_read_start,
                         .cio_fini   = osc_io_fini
                 },
                 [CIT_WRITE] = {
+                        .cio_start  = osc_io_write_start,
                         .cio_fini   = osc_io_fini
                 },
-                [CIT_TRUNC] = {
-                        .cio_start  = osc_io_trunc_start,
-                        .cio_end    = osc_io_trunc_end
+                [CIT_SETATTR] = {
+                        .cio_start  = osc_io_setattr_start,
+                        .cio_end    = osc_io_setattr_end
                 },
                 [CIT_FAULT] = {
                         .cio_fini   = osc_io_fini,
@@ -604,7 +652,7 @@ static void osc_req_completion(const struct lu_env *env,
 
 /**
  * Implementation of struct cl_req_operations::cro_attr_set() for osc
- * layer. osc is responsible for struct obdo::o_id and struct obdo::o_gr
+ * layer. osc is responsible for struct obdo::o_id and struct obdo::o_seq
  * fields.
  */
 static void osc_req_attr_set(const struct lu_env *env,
@@ -627,7 +675,7 @@ static void osc_req_attr_set(const struct lu_env *env,
                 oa->o_valid |= OBD_MD_FLID;
         }
         if (flags & OBD_MD_FLGROUP) {
-                oa->o_gr = oinfo->loi_gr;
+                oa->o_seq = oinfo->loi_seq;
                 oa->o_valid |= OBD_MD_FLGROUP;
         }
         if (flags & OBD_MD_FLHANDLE) {