Whamcloud - gitweb
LU-1934 ofd: implement precreate batching
author Mikhail Pershin <tappro@whamcloud.com>
Mon, 1 Oct 2012 17:17:48 +0000 (21:17 +0400)
committer Oleg Drokin <green@whamcloud.com>
Tue, 2 Oct 2012 22:18:06 +0000 (18:18 -0400)
Bulk precreate objects in a single tx handle.  For ZFS this minimizes
the chance that the creates will be spread over multiple txgs.  This
can occur when the OSTs are under a concurrent write workload, which
can consume the bulk of the available space in a txg.

For ldiskfs this change should be harmless, and the default value
can be set to 1 for ldiskfs if needed to restore the previous behavior.

Change-Id: I81e310a8b630a1a29f3e017f6980901d6b6436fa
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Mikhail Pershin <tappro@whamcloud.com>
Reviewed-on: http://review.whamcloud.com/4147
Reviewed-by: Alex Zhuravlev <bzzz@whamcloud.com>
Tested-by: Hudson
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/ofd/lproc_ofd.c
lustre/ofd/ofd_fs.c
lustre/ofd/ofd_internal.h
lustre/ofd/ofd_obd.c
lustre/ofd/ofd_objects.c

index c40774b..d863df2 100644 (file)
@@ -141,6 +141,38 @@ static int lprocfs_ofd_wr_grant_ratio(struct file *file, const char *buffer,
        return count;
 }
 
+static int lprocfs_ofd_rd_precreate_batch(char *page, char **start, off_t off,
+                                         int count, int *eof, void *data)
+{
+       struct obd_device *obd = (struct obd_device *)data;
+       struct ofd_device *ofd = ofd_dev(obd->obd_lu_dev);
+
+       LASSERT(obd != NULL);
+       *eof = 1;
+       return snprintf(page, count, "%d\n", ofd->ofd_precreate_batch);
+}
+
+static int lprocfs_ofd_wr_precreate_batch(struct file *file, const char *buffer,
+                                         unsigned long count, void *data)
+{
+       struct obd_device *obd = (struct obd_device *)data;
+       struct ofd_device *ofd = ofd_dev(obd->obd_lu_dev);
+       int val;
+       int rc;
+
+       rc = lprocfs_write_helper(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       if (val < 1)
+               return -EINVAL;
+
+       cfs_spin_lock(&ofd->ofd_objid_lock);
+       ofd->ofd_precreate_batch = val;
+       cfs_spin_unlock(&ofd->ofd_objid_lock);
+       return count;
+}
+
 static int lprocfs_ofd_rd_last_id(char *page, char **start, off_t off,
                                  int count, int *eof, void *data)
 {
@@ -438,6 +470,8 @@ static struct lprocfs_vars lprocfs_ofd_obd_vars[] = {
        { "grant_precreate",     lprocfs_ofd_rd_grant_precreate, 0, 0 },
        { "grant_ratio",         lprocfs_ofd_rd_grant_ratio,
                                 lprocfs_ofd_wr_grant_ratio, 0, 0 },
+       { "precreate_batch",     lprocfs_ofd_rd_precreate_batch,
+                                lprocfs_ofd_wr_precreate_batch, 0 },
        { "recovery_status",     lprocfs_obd_rd_recovery_status, 0, 0 },
        { "recovery_time_soft",  lprocfs_obd_rd_recovery_time_soft,
                                 lprocfs_obd_wr_recovery_time_soft, 0},
index 1828758..a21586d 100644 (file)
@@ -68,6 +68,17 @@ int ofd_record_write(const struct lu_env *env, struct ofd_device *ofd,
        RETURN(rc);
 }
 
+int ofd_precreate_batch(struct ofd_device *ofd, int batch)
+{
+       int count;
+
+       cfs_spin_lock(&ofd->ofd_objid_lock);
+       count = min(ofd->ofd_precreate_batch, batch);
+       cfs_spin_unlock(&ofd->ofd_objid_lock);
+
+       return count;
+}
+
 obd_id ofd_last_id(struct ofd_device *ofd, obd_seq group)
 {
        obd_id id;
@@ -221,6 +232,7 @@ int ofd_groups_init(const struct lu_env *env, struct ofd_device *ofd)
        if (rc)
                GOTO(cleanup, rc);
 
+       ofd->ofd_precreate_batch = OFD_PRECREATE_BATCH_DEFAULT;
        groups_size = (unsigned long)info->fti_attr.la_size;
 
        if (groups_size == sizeof(last_group)) {
index 5ab1e61..e130a08 100644 (file)
@@ -49,6 +49,7 @@
 #define OFD_INCOMPAT_SUPP (OBD_INCOMPAT_GROUPS | OBD_INCOMPAT_OST | \
                           OBD_INCOMPAT_COMMON_LR)
 #define OFD_MAX_GROUPS 256
+#define OFD_PRECREATE_BATCH_DEFAULT (FILTER_SUBDIR_COUNT * 4)
 
 /* Limit the returned fields marked valid to those that we actually might set */
 #define OFD_VALID_FLAGS (LA_TYPE | LA_MODE | LA_SIZE | LA_BLOCKS | \
@@ -114,6 +115,7 @@ struct ofd_device {
        struct dt_object        *ofd_lastid_obj[OFD_MAX_GROUPS];
        cfs_spinlock_t           ofd_objid_lock;
        unsigned long            ofd_destroys_in_progress;
+       int                      ofd_precreate_batch;
 
        /* protect all statfs-related counters */
        cfs_spinlock_t           ofd_osfs_lock;
@@ -322,6 +324,7 @@ int ofd_group_load(const struct lu_env *env, struct ofd_device *ofd, int);
 int ofd_fs_setup(const struct lu_env *env, struct ofd_device *ofd,
                 struct obd_device *obd);
 void ofd_fs_cleanup(const struct lu_env *env, struct ofd_device *ofd);
+int ofd_precreate_batch(struct ofd_device *ofd, int batch);
 
 /* ofd_io.c */
 int ofd_preprw(const struct lu_env *env,int cmd, struct obd_export *exp,
@@ -370,8 +373,8 @@ struct ofd_object *ofd_object_find_or_create(const struct lu_env *env,
                                             const struct lu_fid *fid,
                                             struct lu_attr *attr);
 int ofd_object_ff_check(const struct lu_env *env, struct ofd_object *fo);
-int ofd_precreate_object(const struct lu_env *env, struct ofd_device *ofd,
-                        obd_id id, obd_seq seq);
+int ofd_precreate_objects(const struct lu_env *env, struct ofd_device *ofd,
+                         obd_id id, obd_seq group, int nr);
 
 void ofd_object_put(const struct lu_env *env, struct ofd_object *fo);
 int ofd_attr_set(const struct lu_env *env, struct ofd_object *fo,
index 5efa4a4..4895d31 100644 (file)
@@ -1159,8 +1159,10 @@ int ofd_create(const struct lu_env *env, struct obd_export *exp,
                }
        }
        if (diff > 0) {
-               obd_id next_id = ofd_last_id(ofd, oa->o_seq) + 1;
-               int i;
+               cfs_time_t       enough_time = cfs_time_shift(DISK_TIMEOUT);
+               obd_id           next_id;
+               int              created = 0;
+               int              count;
 
                if (!(oa->o_valid & OBD_MD_FLFLAGS) ||
                    !(oa->o_flags & OBD_FL_DELORPHAN)) {
@@ -1176,16 +1178,33 @@ int ofd_create(const struct lu_env *env, struct obd_export *exp,
                        }
                }
 
-               CDEBUG(D_HA,
-                      "%s: reserve %d objects in group "LPU64" at "LPU64"\n",
-                      ofd_obd(ofd)->obd_name, diff, oa->o_seq, next_id);
-               for (i = 0; i < diff; i++) {
-                       rc = ofd_precreate_object(env, ofd, next_id + i,
-                                                 oa->o_seq);
-                       if (rc)
+               while (diff > 0) {
+                       next_id = ofd_last_id(ofd, oa->o_seq) + 1;
+                       count = ofd_precreate_batch(ofd, diff);
+
+                       CDEBUG(D_HA, "%s: reserve %d objects in group "LPU64
+                              " at "LPU64"\n", ofd_obd(ofd)->obd_name,
+                              count, oa->o_seq, next_id);
+
+                       if (cfs_time_after(jiffies, enough_time)) {
+                               LCONSOLE_WARN("%s: Slow creates, %d/%d objects"
+                                             " created at a rate of %d/s\n",
+                                             ofd_obd(ofd)->obd_name,
+                                             created, diff + created,
+                                             created / DISK_TIMEOUT);
+                               break;
+               }
+
+                       rc = ofd_precreate_objects(env, ofd, next_id,
+                                                  oa->o_seq, count);
+                       if (rc > 0) {
+                               created += rc;
+                               diff -= rc;
+                       } else if (rc < 0) {
                                break;
+                       }
                }
-               if (i > 0) {
+               if (created > 0) {
                        /* some objects got created, we can return
                         * them, even if last creation failed */
                        oa->o_id = ofd_last_id(ofd, oa->o_seq);
index 1f48a6c..8ff3c0d 100644 (file)
@@ -145,35 +145,37 @@ void ofd_object_put(const struct lu_env *env, struct ofd_object *fo)
        lu_object_put(env, &fo->ofo_obj.do_lu);
 }
 
-int ofd_precreate_object(const struct lu_env *env, struct ofd_device *ofd,
-                        obd_id id, obd_seq group)
+int ofd_precreate_objects(const struct lu_env *env, struct ofd_device *ofd,
+                         obd_id id, obd_seq seq, int nr)
 {
        struct ofd_thread_info  *info = ofd_info(env);
-       struct ofd_object       *fo;
+       struct ofd_object       *fo = NULL;
        struct dt_object        *next;
        struct thandle          *th;
+       struct ofd_object       **batch;
        obd_id                   tmp;
        int                      rc;
+       int                      i;
+       int                      objects = 0;
+       int                      nr_saved = nr;
 
        ENTRY;
 
        /* Don't create objects beyond the valid range for this SEQ */
-       if (unlikely(fid_seq_is_mdt0(group) && id >= IDIF_MAX_OID)) {
+       if (unlikely(fid_seq_is_mdt0(seq) && (id + nr) >= IDIF_MAX_OID)) {
                CERROR("%s:"POSTID" hit the IDIF_MAX_OID (1<<48)!\n",
-                      ofd_name(ofd), id, group);
+                      ofd_name(ofd), id, seq);
                RETURN(rc = -ENOSPC);
-       } else if (unlikely(!fid_seq_is_mdt0(group) && id >= OBIF_MAX_OID)) {
+       } else if (unlikely(!fid_seq_is_mdt0(seq) &&
+                  (id + nr) >= OBIF_MAX_OID)) {
                CERROR("%s:"POSTID" hit the OBIF_MAX_OID (1<<32)!\n",
-                      ofd_name(ofd), id, group);
+                      ofd_name(ofd), id, seq);
                RETURN(rc = -ENOSPC);
        }
-       info->fti_ostid.oi_id = id;
-       info->fti_ostid.oi_seq = group;
-       fid_ostid_unpack(&info->fti_fid, &info->fti_ostid, 0);
 
-       fo = ofd_object_find(env, ofd, &info->fti_fid);
-       if (IS_ERR(fo))
-               RETURN(PTR_ERR(fo));
+       OBD_ALLOC(batch, nr_saved * sizeof(struct ofd_object *));
+       if (batch == NULL)
+               RETURN(-ENOMEM);
 
        info->fti_attr.la_valid = LA_TYPE | LA_MODE;
        /*
@@ -192,61 +194,113 @@ int ofd_precreate_object(const struct lu_env *env, struct ofd_device *ofd,
        info->fti_attr.la_mtime = 0;
        info->fti_attr.la_ctime = 0;
 
-       next = ofd_object_child(fo);
-       LASSERT(next != NULL);
+       /* prepare objects */
+       for (i = 0; i < nr; i++) {
+               info->fti_ostid.oi_id = id + i;
+               info->fti_ostid.oi_seq = seq;
 
+               rc = fid_ostid_unpack(&info->fti_fid, &info->fti_ostid, 0);
+               if (rc) {
+                       if (i == 0)
+                               GOTO(out, rc = PTR_ERR(fo));
+
+                       nr = i;
+                       break;
+               }
+
+               fo = ofd_object_find(env, ofd, &info->fti_fid);
+               if (IS_ERR(fo)) {
+                       if (i == 0)
+                               GOTO(out, rc = PTR_ERR(fo));
+
+                       nr = i;
+                       break;
+               }
+
+               ofd_write_lock(env, fo);
+               batch[i] = fo;
+       }
        info->fti_buf.lb_buf = &tmp;
        info->fti_buf.lb_len = sizeof(tmp);
        info->fti_off = 0;
 
-       ofd_write_lock(env, fo);
        th = ofd_trans_create(env, ofd);
        if (IS_ERR(th))
-               GOTO(out_unlock, rc = PTR_ERR(th));
+               GOTO(out, rc = PTR_ERR(th));
 
-       rc = dt_declare_record_write(env, ofd->ofd_lastid_obj[group],
+       rc = dt_declare_record_write(env, ofd->ofd_lastid_obj[seq],
                                     sizeof(tmp), info->fti_off, th);
        if (rc)
                GOTO(trans_stop, rc);
 
-       if (unlikely(ofd_object_exists(fo))) {
-               /* object may exist being re-created by write replay */
-               CDEBUG(D_INODE, "object %u/"LPD64" exists: "DFID"\n",
-                      (unsigned) group, id, PFID(&info->fti_fid));
-               rc = dt_trans_start_local(env, ofd->ofd_osd, th);
-               if (rc)
-                       GOTO(trans_stop, rc);
-               GOTO(last_id_write, rc);
+       for (i = 0; i < nr; i++) {
+               fo = batch[i];
+               LASSERT(fo);
+
+               if (unlikely(ofd_object_exists(fo))) {
+                       /* object may exist being re-created by write replay */
+                       CDEBUG(D_INODE, "object "LPD64"/"LPD64" exists: "
+                              DFID"\n", seq, id, PFID(&info->fti_fid));
+                       continue;
+               }
+
+               next = ofd_object_child(fo);
+               LASSERT(next != NULL);
+
+               rc = dt_declare_create(env, next, &info->fti_attr, NULL,
+                                      &info->fti_dof, th);
+               if (rc) {
+                       nr = i;
+                       break;
+               }
        }
-       rc = dt_declare_create(env, next, &info->fti_attr, NULL,
-                              &info->fti_dof, th);
-       if (rc)
-               GOTO(trans_stop, rc);
 
        rc = dt_trans_start_local(env, ofd->ofd_osd, th);
        if (rc)
                GOTO(trans_stop, rc);
 
-       CDEBUG(D_OTHER, "create new object %lu:%llu\n",
-              (unsigned long) info->fti_fid.f_oid, info->fti_fid.f_seq);
+       CDEBUG(D_OTHER, "create new object "DFID"\n", PFID(&info->fti_fid));
 
-       rc = dt_create(env, next, &info->fti_attr, NULL, &info->fti_dof, th);
-       if (rc)
-               GOTO(trans_stop, rc);
-       LASSERT(ofd_object_exists(fo));
+       for (i = 0; i < nr; i++) {
+               fo = batch[i];
+               LASSERT(fo);
+
+               if (likely(!ofd_object_exists(fo))) {
+                       next = ofd_object_child(fo);
+                       LASSERT(next != NULL);
 
-last_id_write:
-       ofd_last_id_set(ofd, id, group);
+                       rc = dt_create(env, next, &info->fti_attr, NULL,
+                                      &info->fti_dof, th);
+                       if (rc)
+                               break;
+                       LASSERT(ofd_object_exists(fo));
+               }
+               ofd_last_id_set(ofd, id + i, seq);
+       }
 
-       tmp = cpu_to_le64(ofd_last_id(ofd, group));
-       rc = dt_record_write(env, ofd->ofd_lastid_obj[group], &info->fti_buf,
-                            &info->fti_off, th);
+       objects = i;
+       if (objects > 0) {
+               tmp = cpu_to_le64(ofd_last_id(ofd, seq));
+               rc = dt_record_write(env, ofd->ofd_lastid_obj[seq],
+                                    &info->fti_buf, &info->fti_off, th);
+       }
 trans_stop:
        ofd_trans_stop(env, ofd, th, rc);
-out_unlock:
-       ofd_write_unlock(env, fo);
-       ofd_object_put(env, fo);
-       RETURN(rc);
+out:
+       for (i = 0; i < nr_saved; i++) {
+               fo = batch[i];
+               if (fo) {
+                       ofd_write_unlock(env, fo);
+                       ofd_object_put(env, fo);
+               }
+       }
+       OBD_FREE(batch, nr_saved * sizeof(struct ofd_object *));
+
+       CDEBUG((objects == 0 && rc == 0) ? D_ERROR : D_OTHER,
+              "created %d/%d objects: %d\n", objects, nr_saved, rc);
+
+       LASSERT(ergo(objects == 0, rc < 0));
+       RETURN(objects > 0 ? objects : rc);
 }
 
 /*