LU-12328 flr: avoid reading unhealthy mirror

[fs/lustre-release.git] / lustre / lov / lov_io.c
diff --git a/lustre/lov/lov_io.c b/lustre/lov/lov_io.c

index 4f70a68..92b8ce5 100644 (file)
--- a/lustre/lov/lov_io.c
+++ b/lustre/lov/lov_io.c
@@ -122,8 +122,10 @@ static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio,
  
         /* obtain new environment */
         sub->sub_env = cl_env_get(&sub->sub_refcheck);
-       if (IS_ERR(sub->sub_env))
+       if (IS_ERR(sub->sub_env)) {
                 result = PTR_ERR(sub->sub_env);
+               RETURN(result);
+       }
  
         sub_obj = lovsub2cl(lov_r0(lov, index)->lo_sub[stripe]);
         sub_io  = &sub->sub_io;
@@ -136,10 +138,11 @@ static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio,
         sub_io->ci_type    = io->ci_type;
         sub_io->ci_no_srvlock = io->ci_no_srvlock;
         sub_io->ci_noatime = io->ci_noatime;
-       sub_io->ci_pio = io->ci_pio;
+       sub_io->ci_async_readahead = io->ci_async_readahead;
         sub_io->ci_lock_no_expand = io->ci_lock_no_expand;
         sub_io->ci_ndelay = io->ci_ndelay;
         sub_io->ci_layout_version = io->ci_layout_version;
+       sub_io->ci_tried_all_mirrors = io->ci_tried_all_mirrors;
  
         result = cl_io_sub_init(sub->sub_env, sub_io, io->ci_type, sub_obj);
  
@@ -189,19 +192,6 @@ out:
   * Lov io operations.
   *
   */
-
-int lov_page_index(const struct cl_page *page)
-{
-       const struct cl_page_slice *slice;
-       ENTRY;
-
-       slice = cl_page_at(page, &lov_device_type);
-       LASSERT(slice != NULL);
-       LASSERT(slice->cpl_obj != NULL);
-
-       RETURN(cl2lov_page(slice)->lps_index);
-}
-
  static int lov_io_subio_init(const struct lu_env *env, struct lov_io *lio,
                               struct cl_io *io)
  {
@@ -416,13 +406,13 @@ static int lov_io_mirror_init(struct lov_io *lio, struct lov_object *obj,
                                 found = true;
                                 break;
                         }
-               }
-
+               } /* each component of the mirror */
                 if (found) {
                         index = (index + i) % comp->lo_mirror_count;
                         break;
                 }
-       }
+       } /* each mirror */
+
         if (i == comp->lo_mirror_count) {
                 CERROR(DFID": failed to find a component covering "
                        "I/O region at %llu\n",
@@ -446,16 +436,22 @@ static int lov_io_mirror_init(struct lov_io *lio, struct lov_object *obj,
          * of this client has been partitioned. We should relinquish CPU for
          * a while before trying again.
          */
-       ++io->ci_ndelay_tried;
-       if (io->ci_ndelay && io->ci_ndelay_tried >= comp->lo_mirror_count) {
+       if (io->ci_ndelay && io->ci_ndelay_tried > 0 &&
+           (io->ci_ndelay_tried % comp->lo_mirror_count == 0)) {
                 set_current_state(TASK_INTERRUPTIBLE);
-               schedule_timeout(msecs_to_jiffies(MSEC_PER_SEC)); /* 10ms */
+               schedule_timeout(cfs_time_seconds(1) / 100); /* 10ms */
                 if (signal_pending(current))
                         RETURN(-EINTR);
  
-               /* reset retry counter */
-               io->ci_ndelay_tried = 1;
+               /**
+                * Set ci_tried_all_mirrors to turn off fast mirror
+                * switching for reads once all mirrors have been tried for
+                * several rounds.
+                */
+               io->ci_tried_all_mirrors = io->ci_ndelay_tried %
+                                          (comp->lo_mirror_count * 4) == 0;
         }
+       ++io->ci_ndelay_tried;
  
         CDEBUG(D_VFSTRACE, "use %sdelayed RPC state for this IO\n",
                io->ci_ndelay ? "non-" : "");
@@ -478,8 +474,8 @@ static int lov_io_slice_init(struct lov_io *lio,
         switch (io->ci_type) {
         case CIT_READ:
         case CIT_WRITE:
-               lio->lis_pos = io->u.ci_rw.rw_range.cir_pos;
-               lio->lis_endpos = lio->lis_pos + io->u.ci_rw.rw_range.cir_count;
+               lio->lis_pos = io->u.ci_rw.crw_pos;
+               lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count;
                 lio->lis_io_endpos = lio->lis_endpos;
                 if (cl_io_is_append(io)) {
                         LASSERT(io->ci_type == CIT_WRITE);
@@ -564,7 +560,15 @@ static int lov_io_slice_init(struct lov_io *lio,
          */
         if (cl_io_is_trunc(io)) {
                 io->ci_write_intent.e_start = 0;
-               io->ci_write_intent.e_end = io->u.ci_setattr.sa_attr.lvb_size;
+               /* for writes, e_end is endpos, the location of the file
+                * pointer after the write is completed, so it is not accessed.
+                * For truncate, 'end' is the size, and *is* accessed.
+                * In other words, writes are [start, end), but truncate is
+                * [start, size], where both are included.  So add 1 to the
+                * size when creating the write intent to account for this.
+                */
+               io->ci_write_intent.e_end =
+                       io->u.ci_setattr.sa_attr.lvb_size + 1;
         } else {
                 io->ci_write_intent.e_start = lio->lis_pos;
                 io->ci_write_intent.e_end = lio->lis_endpos;
@@ -639,7 +643,6 @@ static void lov_io_sub_inherit(struct lov_io_sub *sub, struct lov_io *lio,
         int index = lov_comp_entry(sub->sub_subio_index);
         int stripe = lov_comp_stripe(sub->sub_subio_index);
  
-       io->ci_pio = parent->ci_pio;
         switch (io->ci_type) {
         case CIT_SETATTR: {
                 io->u.ci_setattr.sa_attr = parent->u.ci_setattr.sa_attr;
@@ -685,16 +688,13 @@ static void lov_io_sub_inherit(struct lov_io_sub *sub, struct lov_io *lio,
         }
         case CIT_READ:
         case CIT_WRITE: {
-               io->u.ci_rw.rw_ptask = parent->u.ci_rw.rw_ptask;
-               io->u.ci_rw.rw_iter = parent->u.ci_rw.rw_iter;
-               io->u.ci_rw.rw_iocb = parent->u.ci_rw.rw_iocb;
-               io->u.ci_rw.rw_file = parent->u.ci_rw.rw_file;
-               io->u.ci_rw.rw_sync = parent->u.ci_rw.rw_sync;
+               io->u.ci_wr.wr_sync = cl_io_is_sync_write(parent);
+               io->ci_tried_all_mirrors = parent->ci_tried_all_mirrors;
                 if (cl_io_is_append(parent)) {
-                       io->u.ci_rw.rw_append = 1;
+                       io->u.ci_wr.wr_append = 1;
                 } else {
-                       io->u.ci_rw.rw_range.cir_pos = start;
-                       io->u.ci_rw.rw_range.cir_count = end - start;
+                       io->u.ci_rw.crw_pos = start;
+                       io->u.ci_rw.crw_count = end - start;
                 }
                 break;
         }
@@ -787,9 +787,8 @@ static int lov_io_iter_init(const struct lu_env *env,
                         if (rc != 0)
                                 break;
  
-                       CDEBUG(D_VFSTRACE,
-                               "shrink stripe: {%d, %d} range: [%llu, %llu)\n",
-                               index, stripe, start, end);
+                       CDEBUG(D_VFSTRACE, "shrink: %d [%llu, %llu)\n",
+                              stripe, start, end);
  
                         list_add_tail(&sub->sub_linkage, &lio->lis_active);
                 }
@@ -802,11 +801,10 @@ static int lov_io_iter_init(const struct lu_env *env,
  static int lov_io_rw_iter_init(const struct lu_env *env,
                                const struct cl_io_slice *ios)
  {
-       struct cl_io *io = ios->cis_io;
         struct lov_io *lio = cl2lov_io(env, ios);
+       struct cl_io *io = ios->cis_io;
         struct lov_stripe_md_entry *lse;
-       struct cl_io_range *range = &io->u.ci_rw.rw_range;
-       loff_t start = range->cir_pos;
+       loff_t start = io->u.ci_rw.crw_pos;
         loff_t next;
         int index;
  
@@ -816,7 +814,7 @@ static int lov_io_rw_iter_init(const struct lu_env *env,
         if (cl_io_is_append(io))
                 RETURN(lov_io_iter_init(env, ios));
  
-       index = lov_io_layout_at(lio, range->cir_pos);
+       index = lov_io_layout_at(lio, io->u.ci_rw.crw_pos);
         if (index < 0) { /* non-existing layout component */
                 if (io->ci_type == CIT_READ) {
                         /*
@@ -824,8 +822,6 @@ static int lov_io_rw_iter_init(const struct lu_env *env,
                          * then set the next pos
                          */
                         io->ci_continue = 0;
-                       /* execute it in main thread */
-                       io->ci_pio = 0;
  
                         RETURN(lov_io_iter_init(env, ios));
                 }
@@ -849,28 +845,20 @@ static int lov_io_rw_iter_init(const struct lu_env *env,
                         next = MAX_LFS_FILESIZE;
         }
  
-       LASSERTF(range->cir_pos >= lse->lsme_extent.e_start,
-                "pos %lld, [%lld, %lld)\n", range->cir_pos,
+       LASSERTF(io->u.ci_rw.crw_pos >= lse->lsme_extent.e_start,
+                "pos %lld, [%lld, %lld)\n", io->u.ci_rw.crw_pos,
                  lse->lsme_extent.e_start, lse->lsme_extent.e_end);
         next = min_t(__u64, next, lse->lsme_extent.e_end);
         next = min_t(loff_t, next, lio->lis_io_endpos);
  
-       io->ci_continue  = next < lio->lis_io_endpos;
-       range->cir_count = next - range->cir_pos;
-       lio->lis_pos     = range->cir_pos;
-       lio->lis_endpos  = range->cir_pos + range->cir_count;
+       io->ci_continue = next < lio->lis_io_endpos;
+       io->u.ci_rw.crw_count = next - io->u.ci_rw.crw_pos;
+       lio->lis_pos    = io->u.ci_rw.crw_pos;
+       lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count;
         CDEBUG(D_VFSTRACE,
-              "stripe: {%d, %llu} range: [%llu, %llu) end: %llu, count: %zd\n",
-              index, start, lio->lis_pos, lio->lis_endpos,
-              lio->lis_io_endpos, range->cir_count);
-
-       if (!io->ci_continue) {
-               /* the last piece of IO, execute it in main thread */
-               io->ci_pio = 0;
-       }
-
-       if (io->ci_pio)
-               RETURN(0);
+              "stripe: %llu chunk: [%llu, %llu) %llu, %zd\n",
+              (__u64)start, lio->lis_pos, lio->lis_endpos,
+              (__u64)lio->lis_io_endpos, io->u.ci_rw.crw_count);
  
         /*
          * XXX The following call should be optimized: we know, that
@@ -1118,6 +1106,7 @@ static int lov_io_submit(const struct lu_env *env,
         struct lov_io_sub       *sub;
         struct cl_page_list     *plist = &lov_env_info(env)->lti_plist;
         struct cl_page          *page;
+       struct cl_page          *tmp;
         int index;
         int rc = 0;
         ENTRY;
@@ -1143,11 +1132,11 @@ static int lov_io_submit(const struct lu_env *env,
                 cl_2queue_init(cl2q);
                 cl_page_list_move(&cl2q->c2_qin, qin, page);
  
-               index = lov_page_index(page);
-               while (qin->pl_nr > 0) {
-                       page = cl_page_list_first(qin);
-                       if (index != lov_page_index(page))
-                               break;
+               index = page->cp_lov_index;
+               cl_page_list_for_each_safe(page, tmp, qin) {
+                       /* this page is not on this stripe */
+                       if (index != page->cp_lov_index)
+                               continue;
  
                         cl_page_list_move(&cl2q->c2_qin, qin, page);
                 }
@@ -1210,10 +1199,10 @@ static int lov_io_commit_async(const struct lu_env *env,
  
                 cl_page_list_move(plist, queue, page);
  
-               index = lov_page_index(page);
+               index = page->cp_lov_index;
                 while (queue->pl_nr > 0) {
                         page = cl_page_list_first(queue);
-                       if (index != lov_page_index(page))
+                       if (index != page->cp_lov_index)
                                 break;
  
                         cl_page_list_move(plist, queue, page);
@@ -1259,7 +1248,7 @@ static int lov_io_fault_start(const struct lu_env *env,
  
         fio = &ios->cis_io->u.ci_fault;
         lio = cl2lov_io(env, ios);
-       sub = lov_sub_get(env, lio, lov_page_index(fio->ft_page));
+       sub = lov_sub_get(env, lio, fio->ft_page->cp_lov_index);
         sub->sub_io.u.ci_fault.ft_nob = fio->ft_nob;
  
         RETURN(lov_io_start(env, ios));