+/**
+ * Decide whether this I/O needs a write intent RPC.
+ *
+ * io->ci_write_intent is first set to [lio->lis_pos, lio->lis_endpos) and
+ * io->ci_need_write_intent is cleared. Only write, truncate and mkwrite I/O
+ * can require a write intent; every other I/O type returns immediately.
+ *
+ * Side effect: when the file is in the write_pending state,
+ * io->ci_write_intent is left widened to cover every component of the
+ * preferred mirror that overlaps the I/O region.
+ *
+ * \param[in]     lio	LOV I/O state supplying the I/O region
+ * \param[in]     obj	composite (mirrored) LOV object being accessed
+ * \param[in,out] io	cl_io whose ci_write_intent and
+ *			ci_need_write_intent fields are filled in
+ *
+ * \retval 0		success, io->ci_need_write_intent tells the result
+ * \retval -EIO		no component of the preferred mirror covers the
+ *			I/O region
+ */
+static int lov_io_mirror_write_intent(struct lov_io *lio,
+	struct lov_object *obj, struct cl_io *io)
+{
+	struct lov_layout_composite *comp = &obj->u.composite;
+	struct lu_extent *ext = &io->ci_write_intent;
+	struct lov_mirror_entry *lre;
+	struct lov_mirror_entry *primary;
+	struct lov_layout_entry *lle;
+	size_t count = 0;
+	ENTRY;
+
+	*ext = (typeof(*ext)) { lio->lis_pos, lio->lis_endpos };
+	io->ci_need_write_intent = 0;
+
+	/* only I/O that modifies file data can need a write intent */
+	if (!(io->ci_type == CIT_WRITE || cl_io_is_trunc(io) ||
+	      cl_io_is_mkwrite(io)))
+		RETURN(0);
+
+	/*
+	 * FLR: check if it needs to send a write intent RPC to server.
+	 * Writing to sync_pending file needs write intent RPC to change
+	 * the file state back to write_pending, so that the layout version
+	 * can be increased when the state changes to sync_pending at a later
+	 * time. Otherwise there exists a chance that an evicted client may
+	 * dirty the file data while resync client is working on it.
+	 * Designated I/O is allowed for resync workload.
+	 */
+	if (lov_flr_state(obj) == LCM_FL_RDONLY ||
+	    (lov_flr_state(obj) == LCM_FL_SYNC_PENDING &&
+	     io->ci_designated_mirror == 0)) {
+		io->ci_need_write_intent = 1;
+		RETURN(0);
+	}
+
+	LASSERT((lov_flr_state(obj) == LCM_FL_WRITE_PENDING));
+	LASSERT(comp->lo_preferred_mirror >= 0);
+
+	/*
+	 * need to iterate all components to see if there are
+	 * multiple components covering the writing component
+	 */
+	primary = &comp->lo_mirrors[comp->lo_preferred_mirror];
+	LASSERT(!primary->lre_stale);
+	lov_foreach_mirror_layout_entry(obj, lle, primary) {
+		LASSERT(lle->lle_valid);
+		if (!lu_extent_is_overlapped(ext, lle->lle_extent))
+			continue;
+
+		/* widen the intent extent to whole-component boundaries */
+		ext->e_start = MIN(ext->e_start, lle->lle_extent->e_start);
+		ext->e_end = MAX(ext->e_end, lle->lle_extent->e_end);
+		++count;
+	}
+	if (count == 0) {
+		CERROR(DFID ": cannot find any valid components covering "
+		       "file extent "DEXT", mirror: %d\n",
+		       PFID(lu_object_fid(lov2lu(obj))), PEXT(ext),
+		       primary->lre_mirror_id);
+		RETURN(-EIO);
+	}
+
+	/*
+	 * Count the other mirrors that still hold a valid component
+	 * overlapping the widened extent; each mirror is counted at most
+	 * once because the inner loop stops at its first match.
+	 */
+	count = 0;
+	lov_foreach_mirror_entry(obj, lre) {
+		if (lre == primary)
+			continue;
+
+		lov_foreach_mirror_layout_entry(obj, lle, lre) {
+			if (!lle->lle_valid)
+				continue;
+
+			if (lu_extent_is_overlapped(ext, lle->lle_extent)) {
+				++count;
+				break;
+			}
+		}
+	}
+
+	/* count is a size_t, so it must be printed with %zu */
+	CDEBUG(D_VFSTRACE, DFID "there are %zu components to be staled to "
+	       "modify file extent "DEXT", iot: %d\n",
+	       PFID(lu_object_fid(lov2lu(obj))), count, PEXT(ext), io->ci_type);
+
+	/* a write intent is needed only if some other mirror must be staled */
+	io->ci_need_write_intent = count > 0;
+
+	RETURN(0);
+}
+
+/**
+ * Select the mirror this I/O will be served from.
+ *
+ * For a non-FLR object the preferred mirror (asserted to be 0) is used and
+ * io->ci_ndelay is cleared. For designated-mirror I/O the mirror whose id
+ * equals io->ci_designated_mirror is looked up. Otherwise the function
+ * first checks whether a write intent RPC is required and, if not, picks
+ * the first valid mirror that has a valid component covering lio->lis_pos,
+ * starting from the preferred mirror on the first attempt (or after a
+ * layout change) and from the mirror after the last used one on retries.
+ *
+ * \param[in,out] lio	LOV I/O state; lis_mirror_index and
+ *			lis_mirror_layout_gen are updated
+ * \param[in]     obj	LOV object being accessed
+ * \param[in,out] io	cl_io being initialized
+ *
+ * \retval 0		a mirror has been selected
+ * \retval 1		a write intent RPC is needed; stops the
+ *			cl_io_init() loop
+ * \retval -ESTALE	designated write/truncate with a layout version that
+ *			no longer matches the object's layout generation
+ * \retval -EINVAL	the designated mirror id was not found
+ * \retval -EIO		no mirror covers the I/O position
+ * \retval -EINTR	a signal is pending after the retry back-off
+ * \retval negative	error returned by lov_io_mirror_write_intent()
+ */
+static int lov_io_mirror_init(struct lov_io *lio, struct lov_object *obj,
+	struct cl_io *io)
+{
+	struct lov_layout_composite *comp = &obj->u.composite;
+	int index;
+	int i;
+	int result;
+	ENTRY;
+
+	if (!lov_is_flr(obj)) {
+		LASSERT(comp->lo_preferred_mirror == 0);
+		lio->lis_mirror_index = comp->lo_preferred_mirror;
+		io->ci_ndelay = 0;
+		RETURN(0);
+	}
+
+	/* transfer the layout version for verification */
+	if (io->ci_layout_version == 0)
+		io->ci_layout_version = obj->lo_lsm->lsm_layout_gen;
+
+	/* find the corresponding mirror for designated mirror IO */
+	if (io->ci_designated_mirror > 0) {
+		struct lov_mirror_entry *entry;
+
+		LASSERT(!io->ci_ndelay);
+
+		CDEBUG(D_LAYOUT, "designated I/O mirror state: %d\n",
+		       lov_flr_state(obj));
+
+		if ((cl_io_is_trunc(io) || io->ci_type == CIT_WRITE) &&
+		    (io->ci_layout_version != obj->lo_lsm->lsm_layout_gen)) {
+			/*
+			 * For resync I/O, the ci_layout_version was the layout
+			 * version when resync starts. If it doesn't match the
+			 * current object layout version, it means the layout
+			 * has been changed
+			 */
+			RETURN(-ESTALE);
+		}
+
+		io->ci_layout_version |= LU_LAYOUT_RESYNC;
+
+		/* translate the designated mirror id into a mirror index */
+		index = 0;
+		lio->lis_mirror_index = -1;
+		lov_foreach_mirror_entry(obj, entry) {
+			if (entry->lre_mirror_id ==
+			    io->ci_designated_mirror) {
+				lio->lis_mirror_index = index;
+				break;
+			}
+
+			index++;
+		}
+
+		RETURN(lio->lis_mirror_index < 0 ? -EINVAL : 0);
+	}
+
+	result = lov_io_mirror_write_intent(lio, obj, io);
+	if (result)
+		RETURN(result);
+
+	if (io->ci_need_write_intent) {
+		CDEBUG(D_VFSTRACE, DFID " need write intent for [%llu, %llu)\n",
+		       PFID(lu_object_fid(lov2lu(obj))),
+		       lio->lis_pos, lio->lis_endpos);
+
+		if (cl_io_is_trunc(io)) {
+			/**
+			 * For truncate, [size, EOF) is used to judge whether
+			 * a write intent needs to be sent, but the write
+			 * extent has to be restored to [0, size).
+			 */
+			io->ci_write_intent.e_start = 0;
+			io->ci_write_intent.e_end =
+					io->u.ci_setattr.sa_attr.lvb_size;
+		}
+		/* stop cl_io_init() loop */
+		RETURN(1);
+	}
+
+	if (io->ci_ndelay_tried == 0 || /* first time to try */
+	    /* reset the mirror index if layout has changed */
+	    lio->lis_mirror_layout_gen != obj->lo_lsm->lsm_layout_gen) {
+		lio->lis_mirror_layout_gen = obj->lo_lsm->lsm_layout_gen;
+		index = lio->lis_mirror_index = comp->lo_preferred_mirror;
+	} else {
+		index = lio->lis_mirror_index;
+		LASSERT(index >= 0);
+
+		/* move mirror index to the next one */
+		index = (index + 1) % comp->lo_mirror_count;
+	}
+
+	/*
+	 * Starting at \a index, walk every mirror once (wrapping around) and
+	 * stop at the first valid one that has a valid component covering
+	 * the first byte of the I/O.
+	 */
+	for (i = 0; i < comp->lo_mirror_count; i++) {
+		struct lu_extent ext = { .e_start = lio->lis_pos,
+					 .e_end = lio->lis_pos + 1 };
+		struct lov_mirror_entry *lre;
+		struct lov_layout_entry *lle;
+		bool found = false;
+
+		lre = &comp->lo_mirrors[(index + i) % comp->lo_mirror_count];
+		if (!lre->lre_valid)
+			continue;
+
+		lov_foreach_mirror_layout_entry(obj, lle, lre) {
+			if (!lle->lle_valid)
+				continue;
+
+			if (lu_extent_is_overlapped(&ext, lle->lle_extent)) {
+				found = true;
+				break;
+			}
+		}
+
+		if (found) {
+			index = (index + i) % comp->lo_mirror_count;
+			break;
+		}
+	}
+	/* the loop ran to completion, so no mirror covers the position */
+	if (i == comp->lo_mirror_count) {
+		CERROR(DFID": failed to find a component covering "
+		       "I/O region at %llu\n",
+		       PFID(lu_object_fid(lov2lu(obj))), lio->lis_pos);
+
+		dump_lsm(D_ERROR, obj->lo_lsm);
+
+		RETURN(-EIO);
+	}
+
+	CDEBUG(D_VFSTRACE, DFID ": flr state: %d, move mirror from %d to %d, "
+	       "have retried: %d, mirror count: %d\n",
+	       PFID(lu_object_fid(lov2lu(obj))), lov_flr_state(obj),
+	       lio->lis_mirror_index, index, io->ci_ndelay_tried,
+	       comp->lo_mirror_count);
+
+	lio->lis_mirror_index = index;
+
+	/*
+	 * FLR: if all mirrors have been tried once, most likely the network
+	 * of this client has been partitioned. We should relinquish CPU for
+	 * a while before trying again.
+	 */
+	++io->ci_ndelay_tried;
+	if (io->ci_ndelay && io->ci_ndelay_tried >= comp->lo_mirror_count) {
+		/*
+		 * Back off for 10ms. The previous argument, MSEC_PER_SEC,
+		 * slept for a full second, contradicting the documented
+		 * 10ms delay.
+		 */
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(msecs_to_jiffies(10)); /* 10ms */
+		if (signal_pending(current))
+			RETURN(-EINTR);
+
+		/* reset retry counter */
+		io->ci_ndelay_tried = 1;
+	}
+
+	CDEBUG(D_VFSTRACE, "use %sdelayed RPC state for this IO\n",
+	       io->ci_ndelay ? "non-" : "");
+
+	RETURN(0);
+}
+