Whamcloud - gitweb
LU-9859 lod: use linux kernel bitmap API
[fs/lustre-release.git] / lustre / lod / lod_dev.c
index e4bbf62..c8ec185 100644 (file)
@@ -96,6 +96,7 @@
 #include <uapi/linux/lustre/lustre_param.h>
 #include <lustre_update.h>
 #include <lustre_log.h>
+#include <lustre_lmv.h>
 
 #include "lod_internal.h"
 
@@ -269,8 +270,10 @@ static int lod_sub_process_config(const struct lu_env *env,
 struct lod_recovery_data {
        struct lod_device       *lrd_lod;
        struct lod_tgt_desc     *lrd_ltd;
-       struct ptlrpc_thread    *lrd_thread;
+       struct task_struct      **lrd_task;     /* slot storing the thread's task */
        u32                     lrd_idx;
+       struct lu_env           lrd_env;        /* env the recovery thread runs with */
+       struct completion       *lrd_started;   /* completed by the thread at start */
 };
 
 
@@ -354,9 +357,8 @@ static int lod_sub_recovery_thread(void *arg)
        struct lod_recovery_data *lrd = arg;
        struct lod_device *lod = lrd->lrd_lod;
        struct dt_device *dt;
-       struct ptlrpc_thread *thread = lrd->lrd_thread;
        struct llog_ctxt *ctxt = NULL;
-       struct lu_env env;
+       struct lu_env *env = &lrd->lrd_env;
        struct lu_target *lut;
        struct lu_tgt_desc *mdt = NULL;
        time64_t start;
@@ -365,17 +367,6 @@ static int lod_sub_recovery_thread(void *arg)
 
        ENTRY;
 
-       thread->t_flags = SVC_RUNNING;
-       wake_up(&thread->t_ctl_waitq);
-
-       rc = lu_env_init(&env, LCT_LOCAL | LCT_MD_THREAD);
-       if (rc != 0) {
-               OBD_FREE_PTR(lrd);
-               CERROR("%s: can't initialize env: rc = %d\n",
-                      lod2obd(lod)->obd_name, rc);
-               RETURN(rc);
-       }
-
        lut = lod2lu_dev(lod)->ld_site->ls_tgt;
        atomic_inc(&lut->lut_tdtd->tdtd_recovery_threads_count);
        if (!lrd->lrd_ltd)
@@ -384,9 +375,17 @@ static int lod_sub_recovery_thread(void *arg)
                dt = lrd->lrd_ltd->ltd_tgt;
 
        start = ktime_get_real_seconds();
+       complete(lrd->lrd_started);
 
 again:
-       rc = lod_sub_prep_llog(&env, lod, dt, lrd->lrd_idx);
+
+       if (unlikely(OBD_FAIL_PRECHECK(OBD_FAIL_TGT_RECOVERY_CONNECT)) &&
+           lrd->lrd_ltd) {
+               OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_RECOVERY_CONNECT, cfs_fail_val);
+               rc = -EIO;
+       } else {
+               rc = lod_sub_prep_llog(env, lod, dt, lrd->lrd_idx);
+       }
        if (!rc && !lod->lod_child->dd_rdonly) {
                /* Process the recovery record */
                ctxt = llog_get_context(dt->dd_lu_dev.ld_obd,
@@ -394,7 +393,7 @@ again:
                LASSERT(ctxt != NULL);
                LASSERT(ctxt->loc_handle != NULL);
 
-               rc = llog_cat_process(&env, ctxt->loc_handle,
+               rc = llog_cat_process(env, ctxt->loc_handle,
                                      lod_process_recovery_updates, lrd, 0, 0);
        }
 
@@ -412,7 +411,7 @@ again:
                    !top_device->ld_obd->obd_stopping) {
                        if (ctxt) {
                                if (ctxt->loc_handle)
-                                       llog_cat_close(&env,
+                                       llog_cat_close(env,
                                                       ctxt->loc_handle);
                                llog_ctxt_put(ctxt);
                        }
@@ -466,13 +465,16 @@ again:
        EXIT;
 
 out:
-       OBD_FREE_PTR(lrd);
-       thread->t_flags = SVC_STOPPED;
        atomic_dec(&lut->lut_tdtd->tdtd_recovery_threads_count);
        wake_up(&lut->lut_tdtd->tdtd_recovery_threads_waitq);
-       wake_up(&thread->t_ctl_waitq);
-       lu_env_fini(&env);
-       return rc;
+       if (xchg(lrd->lrd_task, NULL) == NULL)
+               /* Someone is waiting for us to finish, need
+                * to synchronize cleanly.
+                */
+               wait_var_event(lrd, kthread_should_stop());
+       lu_env_fini(env);
+       OBD_FREE_PTR(lrd);
+       return 0;
 }
 
 /**
@@ -486,21 +488,21 @@ out:
  * \param[in] thread   recovery thread on this sub device
  */
 void lod_sub_fini_llog(const struct lu_env *env,
-                      struct dt_device *dt, struct ptlrpc_thread *thread)
+                      struct dt_device *dt, struct task_struct **thread)
 {
        struct obd_device *obd;
        struct llog_ctxt *ctxt;
+       struct task_struct *task = NULL;
 
        ENTRY;
 
        obd = dt->dd_lu_dev.ld_obd;
        CDEBUG(D_INFO, "%s: finish sub llog\n", obd->obd_name);
-       /* Stop recovery thread first */
-       if (thread && thread->t_flags & SVC_RUNNING) {
-               thread->t_flags = SVC_STOPPING;
-               wake_up(&thread->t_ctl_waitq);
-               wait_event(thread->t_ctl_waitq, thread->t_flags & SVC_STOPPED);
-       }
+       /* Wait for recovery thread to complete */
+       if (thread)
+               task = xchg(thread, NULL);
+       if (task)
+               kthread_stop(task);
 
        ctxt = llog_get_context(obd, LLOG_UPDATELOG_ORIG_CTXT);
        if (!ctxt)
@@ -528,8 +530,8 @@ void lod_sub_fini_llog(const struct lu_env *env,
  */
 int lodname2mdt_index(char *lodname, u32 *mdt_index)
 {
-       unsigned long index;
-       char *ptr, *tmp;
+       u32 index;
+       const char *ptr, *tmp;
        int rc;
 
        /* 1.8 configs don't have "-MDT0000" at the end */
@@ -564,8 +566,8 @@ int lodname2mdt_index(char *lodname, u32 *mdt_index)
                return rc;
        }
 
-       index = simple_strtol(ptr - 4, &tmp, 16);
-       if (*tmp != '-' || index > INT_MAX) {
+       rc = target_name2index(ptr - 7, &index, &tmp);
+       if (rc < 0 || rc & LDD_F_SV_ALL || *tmp != '-') {
                rc = -EINVAL;
                CERROR("invalid MDT index in '%s': rc = %d\n", lodname, rc);
                return rc;
@@ -593,7 +595,8 @@ int lod_sub_init_llog(const struct lu_env *env, struct lod_device *lod,
 {
        struct obd_device *obd;
        struct lod_recovery_data *lrd = NULL;
-       struct ptlrpc_thread *thread;
+       DECLARE_COMPLETION_ONSTACK(started);
+       struct task_struct **taskp;
        struct task_struct *task;
        struct lod_tgt_desc *subtgt = NULL;
        u32 index;
@@ -611,7 +614,7 @@ int lod_sub_init_llog(const struct lu_env *env, struct lod_device *lod,
                RETURN(-ENOMEM);
 
        if (lod->lod_child == dt) {
-               thread = &lod->lod_child_recovery_thread;
+               taskp = &lod->lod_child_recovery_task;
                index = master_index;
        } else {
                struct lu_tgt_desc *mdt;
@@ -624,20 +627,16 @@ int lod_sub_init_llog(const struct lu_env *env, struct lod_device *lod,
                        }
                }
                LASSERT(subtgt != NULL);
-               OBD_ALLOC_PTR(subtgt->ltd_recovery_thread);
-               if (!subtgt->ltd_recovery_thread)
-                       GOTO(free_lrd, rc = -ENOMEM);
-
-               thread = subtgt->ltd_recovery_thread;
+               taskp = &subtgt->ltd_recovery_task;
        }
 
        CDEBUG(D_INFO, "%s init sub log %s\n", lod2obd(lod)->obd_name,
               dt->dd_lu_dev.ld_obd->obd_name);
        lrd->lrd_lod = lod;
        lrd->lrd_ltd = subtgt;
-       lrd->lrd_thread = thread;
+       lrd->lrd_task = taskp;
        lrd->lrd_idx = index;
-       init_waitqueue_head(&thread->t_ctl_waitq);
+       lrd->lrd_started = &started;
 
        obd = dt->dd_lu_dev.ld_obd;
        obd->obd_lvfs_ctxt.dt = dt;
@@ -646,30 +645,33 @@ int lod_sub_init_llog(const struct lu_env *env, struct lod_device *lod,
        if (rc < 0) {
                CERROR("%s: cannot setup updatelog llog: rc = %d\n",
                       obd->obd_name, rc);
-               GOTO(free_thread, rc);
+               GOTO(free_lrd, rc);
+       }
+
+       rc = lu_env_init(&lrd->lrd_env, LCT_LOCAL | LCT_MD_THREAD);
+       if (rc != 0) {
+               CERROR("%s: can't initialize env: rc = %d\n",
+                      lod2obd(lod)->obd_name, rc);
+               GOTO(free_lrd, rc);
        }
 
        /* Start the recovery thread */
-       task = kthread_run(lod_sub_recovery_thread, lrd, "lod%04x_rec%04x",
-                          master_index, index);
+       task = kthread_create(lod_sub_recovery_thread, lrd, "lod%04x_rec%04x",
+                             master_index, index);
        if (IS_ERR(task)) {
                rc = PTR_ERR(task);
                CERROR("%s: cannot start recovery thread: rc = %d\n",
                       obd->obd_name, rc);
+               lu_env_fini(&lrd->lrd_env);
                GOTO(out_llog, rc);
        }
-
-       wait_event_idle(thread->t_ctl_waitq, thread->t_flags & SVC_RUNNING ||
-                       thread->t_flags & SVC_STOPPED);
+       *taskp = task;
+       wake_up_process(task);
+       wait_for_completion(&started);
 
        RETURN(0);
 out_llog:
-       lod_sub_fini_llog(env, dt, thread);
-free_thread:
-       if (lod->lod_child != dt) {
-               OBD_FREE_PTR(subtgt->ltd_recovery_thread);
-               subtgt->ltd_recovery_thread = NULL;
-       }
+       lod_sub_fini_llog(env, dt, taskp);
 free_lrd:
        OBD_FREE_PTR(lrd);
        RETURN(rc);
@@ -686,32 +688,22 @@ free_lrd:
 static void lod_sub_stop_recovery_threads(const struct lu_env *env,
                                           struct lod_device *lod)
 {
-       struct ptlrpc_thread *thread;
+       struct task_struct *task;
        struct lu_tgt_desc *mdt;
 
        /*
         * Stop the update log commit cancel threads and finish master
         * llog ctxt
         */
-       thread = &lod->lod_child_recovery_thread;
-       /* Stop recovery thread first */
-       if (thread && thread->t_flags & SVC_RUNNING) {
-               thread->t_flags = SVC_STOPPING;
-               wake_up(&thread->t_ctl_waitq);
-               wait_event(thread->t_ctl_waitq, thread->t_flags & SVC_STOPPED);
-       }
+       task = xchg(&lod->lod_child_recovery_task, NULL);
+       if (task)       /* NULL: slot was already empty, nothing to stop */
+               kthread_stop(task);
 
        lod_getref(&lod->lod_mdt_descs);
        lod_foreach_mdt(lod, mdt) {
-               thread = mdt->ltd_recovery_thread;
-               if (thread && thread->t_flags & SVC_RUNNING) {
-                       thread->t_flags = SVC_STOPPING;
-                       wake_up(&thread->t_ctl_waitq);
-                       wait_event(thread->t_ctl_waitq,
-                                  thread->t_flags & SVC_STOPPED);
-                       OBD_FREE_PTR(mdt->ltd_recovery_thread);
-                       mdt->ltd_recovery_thread = NULL;
-               }
+               task = xchg(&mdt->ltd_recovery_task, NULL);
+               if (task)       /* same check for each MDT target's task slot */
+                       kthread_stop(task);
        }
        lod_putref(lod, &lod->lod_mdt_descs);
 }
@@ -734,11 +726,11 @@ static void lod_sub_fini_all_llogs(const struct lu_env *env,
         * llog ctxt
         */
        lod_sub_fini_llog(env, lod->lod_child,
-                         &lod->lod_child_recovery_thread);
+                         &lod->lod_child_recovery_task);
        lod_getref(&lod->lod_mdt_descs);
        lod_foreach_mdt(lod, mdt)
                lod_sub_fini_llog(env, mdt->ltd_tgt,
-                                 mdt->ltd_recovery_thread);
+                                 &mdt->ltd_recovery_task);
        lod_putref(lod, &lod->lod_mdt_descs);
 }
 
@@ -770,7 +762,7 @@ static char *lod_show_update_logs_retrievers(void *data, int *size, int *count)
                rc = lodname2mdt_index(lod2obd(lod)->obd_name, &i);
                LASSERTF(rc == 0, "Fail to parse target index: rc = %d\n", rc);
 
-               rc = snprintf(buf + len, *size - len, " %04x", i);
+               rc = scnprintf(buf + len, *size - len, " %04x", i);
                LASSERT(rc > 0);
 
                len += rc;
@@ -779,8 +771,8 @@ static char *lod_show_update_logs_retrievers(void *data, int *size, int *count)
 
        lod_foreach_mdt(lod, mdt) {
                if (!mdt->ltd_got_update_log) {
-                       rc = snprintf(buf + len, *size - len, " %04x",
-                                     mdt->ltd_index);
+                       rc = scnprintf(buf + len, *size - len, " %04x",
+                                      mdt->ltd_index);
                        if (unlikely(rc <= 0))
                                break;
 
@@ -1039,6 +1031,7 @@ static int lod_process_config(const struct lu_env *env,
        case LCFG_PRE_CLEANUP: {
                lod_sub_process_config(env, lod, &lod->lod_mdt_descs, lcfg);
                lod_sub_process_config(env, lod, &lod->lod_ost_descs, lcfg);
+               OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_RECOVERY_CONNECT, cfs_fail_val * 2);
                next = &lod->lod_child->dd_lu_dev;
                rc = next->ld_ops->ldo_process_config(env, next, lcfg);
                if (rc != 0)
@@ -1237,11 +1230,68 @@ out_put:
        RETURN(rc);
 }
 
+/**
+ * Implementation of lu_device_operations::ldo_fid_alloc() for LOD
+ *
+ * Load the striping of @parent. If it has directory stripes and @name is
+ * given, allocate @fid on the device of the stripe @name maps to; otherwise
+ * allocate it on lod_child. Returns dt_fid_alloc()'s result or an error.
+ * see include/lu_object.h for the details.
+ */
+static int lod_fid_alloc(const struct lu_env *env, struct lu_device *d,
+                        struct lu_fid *fid, struct lu_object *parent,
+                        const struct lu_name *name)
+{
+       struct lod_device *lod = lu2lod_dev(d);
+       struct lod_object *lo = lu2lod_obj(parent);
+       struct dt_device *next;
+       int rc;
+
+       ENTRY;
+
+       /* if @parent is remote, we don't know whether its layout was changed,
+        * always reload layout.
+        */
+       if (lu_object_remote(parent))
+               lod_striping_free(env, lo);
+
+       rc = lod_striping_load(env, lo);
+       if (rc)
+               RETURN(rc);
+
+       if (lo->ldo_dir_stripe_count > 0 && name) {
+               struct dt_object *stripe;
+               int idx;
+
+               idx = __lmv_name_to_stripe_index(lo->ldo_dir_hash_type,
+                                                lo->ldo_dir_stripe_count,
+                                                lo->ldo_dir_migrate_hash,
+                                                lo->ldo_dir_migrate_offset,
+                                                name->ln_name,
+                                                name->ln_namelen, true);
+               if (idx < 0)    /* a negative index is passed back as the error */
+                       RETURN(idx);
+
+               stripe = lo->ldo_stripe[idx];
+               if (!stripe || !dt_object_exists(stripe))
+                       RETURN(-ENODEV);        /* no usable stripe at idx */
+
+               next = lu2dt_dev(stripe->do_lu.lo_dev);
+       } else {
+               next = lod->lod_child;  /* no dir stripes, or no name given */
+       }
+
+       rc = dt_fid_alloc(env, next, fid, parent, name);
+
+       RETURN(rc);
+}
+
 const struct lu_device_operations lod_lu_ops = {
        .ldo_object_alloc       = lod_object_alloc,
        .ldo_process_config     = lod_process_config,
        .ldo_recovery_complete  = lod_recovery_complete,
        .ldo_prepare            = lod_prepare,
+       .ldo_fid_alloc          = lod_fid_alloc, /* picks the device to allocate on */
 };
 
 /**
@@ -1346,7 +1396,7 @@ static int lod_statfs(const struct lu_env *env, struct dt_device *dev,
                        (int)sfs->os_bsize, (int)ost_sfs.os_bsize);
        }
        lod_putref(lod, &lod->lod_ost_descs);
-       sfs->os_state |= OS_STATE_SUM;
+       sfs->os_state |= OS_STATFS_SUM;
 
        /* If we have _some_ OSTs, but don't have as many free objects on the
         * OSTs as inodes on the MDTs, reduce the reported number of inodes
@@ -1678,6 +1728,26 @@ out:
        RETURN(rc);
 }
 
+static int lod_lsfs_init(const struct lu_env *env, struct lod_device *d)
+{
+       struct obd_statfs sfs;
+       int rc;
+
+       rc = dt_statfs(env, d->lod_child, &sfs);        /* statfs of the child device */
+       if (rc) {
+               CDEBUG(D_LAYOUT, "%s: failed to get OSD statfs, rc = %d\n",
+                      lod2obd(d)->obd_name, rc);
+               return rc;      /* cached values are left untouched */
+       }
+
+       /* update local OSD cached statfs data (sizes are stored in MiB) */
+       spin_lock_init(&d->lod_lsfs_lock);
+       d->lod_lsfs_age = ktime_get_seconds();  /* time this sample was taken */
+       d->lod_lsfs_total_mb = (sfs.os_blocks * sfs.os_bsize) >> 20;
+       d->lod_lsfs_free_mb = (sfs.os_bfree * sfs.os_bsize) >> 20;
+       return 0;
+}
+
 /**
  * Initialize LOD device at setup.
  *
@@ -1721,7 +1791,17 @@ static int lod_init0(const struct lu_env *env, struct lod_device *lod,
 
        dt_conf_get(env, &lod->lod_dt_dev, &ddp);
        lod->lod_osd_max_easize = ddp.ddp_max_ea_size;
-       lod->lod_dom_max_stripesize = (1ULL << 20); /* 1Mb as default value */
+       lod->lod_dom_stripesize_max_kb = (1ULL << 10); /* 1 MiB is default */
+
+       /* initialize local statfs cached values */
+       rc = lod_lsfs_init(env, lod);
+       if (rc)
+               GOTO(out_disconnect, rc);
+
+       /* default threshold as half of total space, in MiB */
+       lod->lod_dom_threshold_free_mb = lod->lod_lsfs_total_mb / 2;
+       /* set default DoM stripe size based on free space amount */
+       lod_dom_stripesize_recalc(lod);
 
        /* setup obd to be used with old lov code */
        rc = lod_pools_init(lod, cfg);
@@ -1761,10 +1841,9 @@ static struct lu_device *lod_device_free(const struct lu_env *env,
 
        ENTRY;
 
-       if (atomic_read(&lu->ld_ref) > 0 &&
-           !cfs_hash_is_empty(lu->ld_site->ls_obj_hash)) {
-               LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_ERROR, NULL);
-               lu_site_print(env, lu->ld_site, &msgdata, lu_cdebug_printer);
+       if (atomic_read(&lu->ld_site->ls_obj_hash.nelems)) {
+               lu_site_print(env, lu->ld_site, &lu->ld_ref, D_ERROR,
+                             lu_cdebug_printer);
        }
        LASSERTF(atomic_read(&lu->ld_ref) == 0, "lu is %p\n", lu);
        dt_device_fini(&lod->lod_dt_dev);
@@ -1807,10 +1886,9 @@ static struct lu_device *lod_device_alloc(const struct lu_env *env,
 static void lod_avoid_guide_fini(struct lod_avoid_guide *lag)
 {
        if (lag->lag_oss_avoid_array)
-               OBD_FREE(lag->lag_oss_avoid_array,
-                        sizeof(u32) * lag->lag_oaa_size);
-       if (lag->lag_ost_avoid_bitmap)
-               CFS_FREE_BITMAP(lag->lag_ost_avoid_bitmap);
+               OBD_FREE_PTR_ARRAY(lag->lag_oss_avoid_array,
+                                  lag->lag_oaa_size);
+       bitmap_free(lag->lag_ost_avoid_bitmap); /* must tolerate NULL */
 }
 
 /**
@@ -1957,8 +2035,8 @@ static void lod_key_fini(const struct lu_context *ctx,
                lod_free_def_comp_entries(&info->lti_def_striping);
 
        if (info->lti_comp_size > 0)
-               OBD_FREE(info->lti_comp_idx,
-                        info->lti_comp_size * sizeof(u32));
+               OBD_FREE_PTR_ARRAY(info->lti_comp_idx,
+                                  info->lti_comp_size);
 
        lod_avoid_guide_fini(&info->lti_avoid);
 
@@ -2124,7 +2202,7 @@ static int lod_obd_set_info_async(const struct lu_env *env,
        RETURN(rc);
 }
 
-static struct obd_ops lod_obd_device_ops = {
+static const struct obd_ops lod_obd_device_ops = {
        .o_owner        = THIS_MODULE,
        .o_connect      = lod_obd_connect,
        .o_disconnect   = lod_obd_disconnect,