Whamcloud - gitweb
LU-4423 libcfs: remove IS_PO2 and __is_po2
[fs/lustre-release.git] / libcfs / libcfs / workitem.c
index 6533867..fb4fd64 100644 (file)
@@ -1,6 +1,4 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
+/*
  * GPL HEADER START
  *
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * You should have received a copy of the GNU General Public License
  * version 2 along with this program; If not, see
- * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
- *
- * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
- * CA 95054 USA or visit www.sun.com if you need additional information or
- * have any questions.
+ * http://www.gnu.org/licenses/gpl-2.0.html
  *
  * GPL HEADER END
  */
 /*
  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2014, Intel Corporation.
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
 
 #define DEBUG_SUBSYSTEM S_LNET
 
+#include <linux/kthread.h>
 #include <libcfs/libcfs.h>
 
-typedef struct cfs_wi_sched {
-#ifdef __KERNEL__
-        /** serialised workitems */
-        cfs_spinlock_t  ws_lock;
-        /** where schedulers sleep */
-        cfs_waitq_t     ws_waitq;
-#endif
-        /** concurrent workitems */
-        cfs_list_t      ws_runq;
-        /** rescheduled running-workitems */
-        cfs_list_t      ws_rerunq;
-        /** shutting down */
-        int             ws_shuttingdown;
-} cfs_wi_sched_t;
-
-#ifdef __KERNEL__
-/**
- * we have 2 cfs_wi_sched_t so far:
- * one for CFS_WI_SCHED_ANY, another for CFS_WI_SCHED_SERIAL
- * per-cpu implementation will be added for SMP scalability
- */
-
-#define CFS_WI_NSCHED   2
-#else
-/** always 2 for userspace */
-#define CFS_WI_NSCHED   2
-#endif /* __KERNEL__ */
-
-struct cfs_workitem_data {
-        /** serialize */
-        cfs_spinlock_t  wi_glock;
-        /** number of cfs_wi_sched_t */
-        int             wi_nsched;
-        /** number of threads (all schedulers) */
-        int             wi_nthreads;
-        /** default scheduler */
-        cfs_wi_sched_t *wi_scheds;
+#define CFS_WS_NAME_LEN         16
+
+struct cfs_wi_sched {
+       struct list_head                ws_list;        /* chain on global list */
+       /** serialised workitems */
+       spinlock_t                      ws_lock;
+       /** where schedulers sleep */
+       wait_queue_head_t               ws_waitq;
+       /** concurrent workitems */
+       struct list_head                ws_runq;
+       /** rescheduled running-workitems, a workitem can be rescheduled
+        * while running in wi_action(), but we don't want to execute it again
+        * unless it returns from wi_action(), so we put it on ws_rerunq
+        * while rescheduling, and move it to runq after it returns
+        * from wi_action() */
+       struct list_head                ws_rerunq;
+       /** CPT-table for this scheduler */
+       struct cfs_cpt_table    *ws_cptab;
+       /** CPT id for affinity */
+       int                     ws_cpt;
+       /** number of scheduled workitems */
+       int                     ws_nscheduled;
+       /** number of started threads, protected by cfs_wi_data::wi_glock */
+       unsigned int            ws_nthreads:30;
+       /** shutting down, protected by cfs_wi_data::wi_glock */
+       unsigned int            ws_stopping:1;
+       /** serialize starting thread, protected by cfs_wi_data::wi_glock */
+       unsigned int            ws_starting:1;
+       /** scheduler name */
+       char                    ws_name[CFS_WS_NAME_LEN];
+};
+
+static struct cfs_workitem_data {
+       /** serialize */
+       spinlock_t              wi_glock;
+       /** list of all schedulers */
+       struct list_head        wi_scheds;
+       /** WI module is initialized */
+       int                     wi_init;
+       /** shutting down the whole WI module */
+       int                     wi_stopping;
 } cfs_wi_data;
 
-static inline cfs_wi_sched_t *
-cfs_wi_to_sched(cfs_workitem_t *wi)
-{
-        LASSERT(wi->wi_sched_id == CFS_WI_SCHED_ANY ||
-                wi->wi_sched_id == CFS_WI_SCHED_SERIAL ||
-                (wi->wi_sched_id >= 0 &&
-                 wi->wi_sched_id < cfs_wi_data.wi_nsched));
-
-        if (wi->wi_sched_id == CFS_WI_SCHED_ANY)
-                return &cfs_wi_data.wi_scheds[0];
-        if (wi->wi_sched_id == CFS_WI_SCHED_SERIAL)
-                return &cfs_wi_data.wi_scheds[cfs_wi_data.wi_nsched - 1];
-
-        return &cfs_wi_data.wi_scheds[wi->wi_sched_id];
-}
-
-#ifdef __KERNEL__
-static inline void
-cfs_wi_sched_lock(cfs_wi_sched_t *sched)
-{
-        cfs_spin_lock(&sched->ws_lock);
-}
-
-static inline void
-cfs_wi_sched_unlock(cfs_wi_sched_t *sched)
-{
-        cfs_spin_unlock(&sched->ws_lock);
-}
-
 static inline int
-cfs_wi_sched_cansleep(cfs_wi_sched_t *sched)
-{
-        cfs_wi_sched_lock(sched);
-        if (sched->ws_shuttingdown) {
-                cfs_wi_sched_unlock(sched);
-                return 0;
-        }
-
-        if (!cfs_list_empty(&sched->ws_runq)) {
-                cfs_wi_sched_unlock(sched);
-                return 0;
-        }
-        cfs_wi_sched_unlock(sched);
-        return 1;
-}
-
-#else
-
-static inline void
-cfs_wi_sched_lock(cfs_wi_sched_t *sched)
+cfs_wi_sched_cansleep(struct cfs_wi_sched *sched)
 {
-        cfs_spin_lock(&cfs_wi_data.wi_glock);
+       spin_lock(&sched->ws_lock);
+       if (sched->ws_stopping) {
+               spin_unlock(&sched->ws_lock);
+               return 0;
+       }
+
+       if (!list_empty(&sched->ws_runq)) {
+               spin_unlock(&sched->ws_lock);
+               return 0;
+       }
+       spin_unlock(&sched->ws_lock);
+       return 1;
 }
 
-static inline void
-cfs_wi_sched_unlock(cfs_wi_sched_t *sched)
-{
-        cfs_spin_unlock(&cfs_wi_data.wi_glock);
-}
-
-#endif
-
 /* XXX:
  * 0. it only works when called from wi->wi_action.
  * 1. when it returns no one shall try to schedule the workitem.
  */
 void
-cfs_wi_exit(cfs_workitem_t *wi)
+cfs_wi_exit(struct cfs_wi_sched *sched, struct cfs_workitem *wi)
 {
-        cfs_wi_sched_t *sched = cfs_wi_to_sched(wi);
+       LASSERT(!in_interrupt()); /* because we use plain spinlock */
+       LASSERT(!sched->ws_stopping);
 
-        LASSERT (!cfs_in_interrupt()); /* because we use plain spinlock */
-        LASSERT (!sched->ws_shuttingdown);
+       spin_lock(&sched->ws_lock);
 
-        cfs_wi_sched_lock(sched);
+       LASSERT(wi->wi_running);
 
-#ifdef __KERNEL__
-        LASSERT (wi->wi_running);
-#endif
-        if (wi->wi_scheduled) { /* cancel pending schedules */
-                LASSERT (!cfs_list_empty(&wi->wi_list));
-                cfs_list_del_init(&wi->wi_list);
-        }
+       if (wi->wi_scheduled) { /* cancel pending schedules */
+               LASSERT(!list_empty(&wi->wi_list));
+               list_del_init(&wi->wi_list);
+
+               LASSERT(sched->ws_nscheduled > 0);
+               sched->ws_nscheduled--;
+       }
+
+       LASSERT(list_empty(&wi->wi_list));
 
-        LASSERT (cfs_list_empty(&wi->wi_list));
-        wi->wi_scheduled = 1; /* LBUG future schedule attempts */
+       wi->wi_scheduled = 1; /* LBUG future schedule attempts */
+       spin_unlock(&sched->ws_lock);
 
-        cfs_wi_sched_unlock(sched);
-        return;
+       return;
 }
-CFS_EXPORT_SYMBOL(cfs_wi_exit);
+EXPORT_SYMBOL(cfs_wi_exit);
 
 /**
- * cancel a workitem:
+ * cancel schedule request of workitem \a wi
  */
 int
-cfs_wi_cancel (cfs_workitem_t *wi)
+cfs_wi_deschedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi)
 {
-        cfs_wi_sched_t *sched = cfs_wi_to_sched(wi);
-        int             rc;
+       int     rc;
 
-        LASSERT (!cfs_in_interrupt()); /* because we use plain spinlock */
-        LASSERT (!sched->ws_shuttingdown);
+       LASSERT(!in_interrupt()); /* because we use plain spinlock */
+       LASSERT(!sched->ws_stopping);
 
-        cfs_wi_sched_lock(sched);
         /*
          * return 0 if it's running already, otherwise return 1, which
          * means the workitem will not be scheduled and will not have
          * any race with wi_action.
          */
-        rc = !(wi->wi_running);
+       spin_lock(&sched->ws_lock);
 
-        if (wi->wi_scheduled) { /* cancel pending schedules */
-                LASSERT (!cfs_list_empty(&wi->wi_list));
-                cfs_list_del_init(&wi->wi_list);
-                wi->wi_scheduled = 0;
-        }
+       rc = !(wi->wi_running);
 
-        LASSERT (cfs_list_empty(&wi->wi_list));
+       if (wi->wi_scheduled) { /* cancel pending schedules */
+               LASSERT(!list_empty(&wi->wi_list));
+               list_del_init(&wi->wi_list);
 
-        cfs_wi_sched_unlock(sched);
-        return rc;
-}
+               LASSERT(sched->ws_nscheduled > 0);
+               sched->ws_nscheduled--;
 
-CFS_EXPORT_SYMBOL(cfs_wi_cancel);
+               wi->wi_scheduled = 0;
+       }
+
+       LASSERT (list_empty(&wi->wi_list));
+
+       spin_unlock(&sched->ws_lock);
+       return rc;
+}
+EXPORT_SYMBOL(cfs_wi_deschedule);
 
 /*
  * Workitem scheduled with (serial == 1) is strictly serialised not only with
@@ -216,263 +176,294 @@ CFS_EXPORT_SYMBOL(cfs_wi_cancel);
  * be added, and even dynamic creation of serialised queues might be supported.
  */
 void
-cfs_wi_schedule(cfs_workitem_t *wi)
+cfs_wi_schedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi)
 {
-        cfs_wi_sched_t *sched = cfs_wi_to_sched(wi);
-
-        LASSERT (!cfs_in_interrupt()); /* because we use plain spinlock */
-        LASSERT (!sched->ws_shuttingdown);
-
-        cfs_wi_sched_lock(sched);
+       LASSERT(!in_interrupt()); /* because we use plain spinlock */
+       LASSERT(!sched->ws_stopping);
+
+       spin_lock(&sched->ws_lock);
+
+       if (!wi->wi_scheduled) {
+               LASSERT (list_empty(&wi->wi_list));
+
+               wi->wi_scheduled = 1;
+               sched->ws_nscheduled++;
+               if (!wi->wi_running) {
+                       list_add_tail(&wi->wi_list, &sched->ws_runq);
+                       wake_up(&sched->ws_waitq);
+               } else {
+                       list_add(&wi->wi_list, &sched->ws_rerunq);
+               }
+       }
+
+       LASSERT (!list_empty(&wi->wi_list));
+       spin_unlock(&sched->ws_lock);
+       return;
+}
+EXPORT_SYMBOL(cfs_wi_schedule);
 
-        if (!wi->wi_scheduled) {
-                LASSERT (cfs_list_empty(&wi->wi_list));
+static int
+cfs_wi_scheduler(void *arg)
+{
+       struct cfs_wi_sched *sched = (struct cfs_wi_sched *)arg;
 
-                wi->wi_scheduled = 1;
-                if (!wi->wi_running) {
-                        cfs_list_add_tail(&wi->wi_list, &sched->ws_runq);
-#ifdef __KERNEL__
-                        cfs_waitq_signal(&sched->ws_waitq);
-#endif
-                } else {
-                        cfs_list_add(&wi->wi_list, &sched->ws_rerunq);
-                }
-        }
+       cfs_block_allsigs();
 
-        LASSERT (!cfs_list_empty(&wi->wi_list));
-        cfs_wi_sched_unlock(sched);
-        return;
-}
+       /* CPT affinity scheduler? */
+       if (sched->ws_cptab != NULL)
+               if (cfs_cpt_bind(sched->ws_cptab, sched->ws_cpt) != 0)
+                       CWARN("Unable to bind %s on CPU partition %d\n",
+                               sched->ws_name, sched->ws_cpt);
 
-CFS_EXPORT_SYMBOL(cfs_wi_schedule);
+       spin_lock(&cfs_wi_data.wi_glock);
 
-#ifdef __KERNEL__
+       LASSERT(sched->ws_starting == 1);
+       sched->ws_starting--;
+       sched->ws_nthreads++;
 
-static int
-cfs_wi_scheduler (void *arg)
-{
-        int             id     = (int)(long_ptr_t) arg;
-        int             serial = (id == -1);
-        char            name[24];
-        cfs_wi_sched_t *sched;
-
-        if (serial) {
-                sched = &cfs_wi_data.wi_scheds[cfs_wi_data.wi_nsched - 1];
-                cfs_daemonize("wi_serial_sd");
-        } else {
-                /* will be sched = &cfs_wi_data.wi_scheds[id] in the future */
-                sched = &cfs_wi_data.wi_scheds[0];
-                snprintf(name, sizeof(name), "cfs_wi_sd%03d", id);
-                cfs_daemonize(name);
-        }
+       spin_unlock(&cfs_wi_data.wi_glock);
 
-        cfs_block_allsigs();
+       spin_lock(&sched->ws_lock);
 
-        cfs_wi_sched_lock(sched);
+       while (!sched->ws_stopping) {
+               int             nloops = 0;
+               int             rc;
+               struct cfs_workitem *wi;
 
-        while (!sched->ws_shuttingdown) {
-                int             nloops = 0;
-                int             rc;
-                cfs_workitem_t *wi;
+               while (!list_empty(&sched->ws_runq) &&
+                      nloops < CFS_WI_RESCHED) {
+                       wi = list_entry(sched->ws_runq.next,
+                                       struct cfs_workitem, wi_list);
+                       LASSERT(wi->wi_scheduled && !wi->wi_running);
 
-                while (!cfs_list_empty(&sched->ws_runq) &&
-                       nloops < CFS_WI_RESCHED) {
-                        wi = cfs_list_entry(sched->ws_runq.next,
-                                            cfs_workitem_t, wi_list);
-                        LASSERT (wi->wi_scheduled && !wi->wi_running);
+                       list_del_init(&wi->wi_list);
 
-                        cfs_list_del_init(&wi->wi_list);
+                       LASSERT(sched->ws_nscheduled > 0);
+                       sched->ws_nscheduled--;
 
                         wi->wi_running   = 1;
                         wi->wi_scheduled = 0;
-                        cfs_wi_sched_unlock(sched);
+
+                       spin_unlock(&sched->ws_lock);
                         nloops++;
 
                         rc = (*wi->wi_action) (wi);
 
-                        cfs_wi_sched_lock(sched);
+                       spin_lock(&sched->ws_lock);
                         if (rc != 0) /* WI should be dead, even be freed! */
                                 continue;
 
-                        wi->wi_running = 0;
-                        if (cfs_list_empty(&wi->wi_list))
+                       wi->wi_running = 0;
+                       if (list_empty(&wi->wi_list))
                                 continue;
 
-                        LASSERT (wi->wi_scheduled);
-                        /* wi is rescheduled, should be on rerunq now, we
-                         * move it to runq so it can run action now */
-                        cfs_list_move_tail(&wi->wi_list, &sched->ws_runq);
-                }
-
-                if (!cfs_list_empty(&sched->ws_runq)) {
-                        cfs_wi_sched_unlock(sched);
-                        /* don't sleep because some workitems still
-                         * expect me to come back soon */
-                        cfs_cond_resched();
-                        cfs_wi_sched_lock(sched);
-                        continue;
+                       LASSERT(wi->wi_scheduled);
+                       /* wi is rescheduled, should be on rerunq now, we
+                        * move it to runq so it can run action now */
+                       list_move_tail(&wi->wi_list, &sched->ws_runq);
                 }
 
-                cfs_wi_sched_unlock(sched);
-                cfs_wait_event_interruptible_exclusive(sched->ws_waitq,
-                                !cfs_wi_sched_cansleep(sched), rc);
-                cfs_wi_sched_lock(sched);
+               if (!list_empty(&sched->ws_runq)) {
+                       spin_unlock(&sched->ws_lock);
+                       /* don't sleep because some workitems still
+                        * expect me to come back soon */
+                       cond_resched();
+                       spin_lock(&sched->ws_lock);
+                       continue;
+               }
+
+               spin_unlock(&sched->ws_lock);
+               rc = wait_event_interruptible_exclusive(sched->ws_waitq,
+                               !cfs_wi_sched_cansleep(sched));
+               spin_lock(&sched->ws_lock);
         }
 
-        cfs_wi_sched_unlock(sched);
+       spin_unlock(&sched->ws_lock);
 
-        cfs_spin_lock(&cfs_wi_data.wi_glock);
-        cfs_wi_data.wi_nthreads--;
-        cfs_spin_unlock(&cfs_wi_data.wi_glock);
-        return 0;
+       spin_lock(&cfs_wi_data.wi_glock);
+       sched->ws_nthreads--;
+       spin_unlock(&cfs_wi_data.wi_glock);
+
+       return 0;
 }
 
-static int
-cfs_wi_start_thread (int (*func) (void*), void *arg)
+void
+cfs_wi_sched_destroy(struct cfs_wi_sched *sched)
 {
-        long pid;
-
-        pid = cfs_kernel_thread(func, arg, 0);
-        if (pid < 0)
-                return (int)pid;
-
-        cfs_spin_lock(&cfs_wi_data.wi_glock);
-        cfs_wi_data.wi_nthreads++;
-        cfs_spin_unlock(&cfs_wi_data.wi_glock);
-        return 0;
-}
+       LASSERT(cfs_wi_data.wi_init);
+       LASSERT(!cfs_wi_data.wi_stopping);
 
-#else /* __KERNEL__ */
+       spin_lock(&cfs_wi_data.wi_glock);
+       if (sched->ws_stopping) {
+               CDEBUG(D_INFO, "%s is in progress of stopping\n",
+                      sched->ws_name);
+               spin_unlock(&cfs_wi_data.wi_glock);
+               return;
+       }
 
-int
-cfs_wi_check_events (void)
-{
-        int               n = 0;
-        cfs_workitem_t   *wi;
-        cfs_list_t       *q;
+       LASSERT(!list_empty(&sched->ws_list));
+       sched->ws_stopping = 1;
 
-        cfs_spin_lock(&cfs_wi_data.wi_glock);
+       spin_unlock(&cfs_wi_data.wi_glock);
 
-        for (;;) {
-                /** rerunq is always empty for userspace */
-                if (!cfs_list_empty(&cfs_wi_data.wi_scheds[1].ws_runq))
-                        q = &cfs_wi_data.wi_scheds[1].ws_runq;
-                else if (!cfs_list_empty(&cfs_wi_data.wi_scheds[0].ws_runq))
-                        q = &cfs_wi_data.wi_scheds[0].ws_runq;
-                else
-                        break;
+       wake_up_all(&sched->ws_waitq);
 
-                wi = cfs_list_entry(q->next, cfs_workitem_t, wi_list);
-                cfs_list_del_init(&wi->wi_list);
+       spin_lock(&cfs_wi_data.wi_glock);
+       {
+               int i = 2;
 
-                LASSERT (wi->wi_scheduled);
-                wi->wi_scheduled = 0;
-                cfs_spin_unlock(&cfs_wi_data.wi_glock);
+               while (sched->ws_nthreads > 0) {
+                       CDEBUG(is_power_of_2(++i) ? D_WARNING : D_NET,
+                              "waiting for %d threads of WI sched[%s] to "
+                              "terminate\n", sched->ws_nthreads,
+                              sched->ws_name);
 
-                n++;
-                (*wi->wi_action) (wi);
+                       spin_unlock(&cfs_wi_data.wi_glock);
+                       set_current_state(TASK_UNINTERRUPTIBLE);
+                       schedule_timeout(cfs_time_seconds(1) / 20);
+                       spin_lock(&cfs_wi_data.wi_glock);
+               }
+       }
 
-                cfs_spin_lock(&cfs_wi_data.wi_glock);
-        }
+       list_del(&sched->ws_list);
 
-        cfs_spin_unlock(&cfs_wi_data.wi_glock);
-        return n;
-}
+       spin_unlock(&cfs_wi_data.wi_glock);
 
-#endif
+       LASSERT(sched->ws_nscheduled == 0);
 
-static void
-cfs_wi_sched_init(cfs_wi_sched_t *sched)
-{
-        sched->ws_shuttingdown = 0;
-#ifdef __KERNEL__
-        cfs_spin_lock_init(&sched->ws_lock);
-        cfs_waitq_init(&sched->ws_waitq);
-#endif
-        CFS_INIT_LIST_HEAD(&sched->ws_runq);
-        CFS_INIT_LIST_HEAD(&sched->ws_rerunq);
+       LIBCFS_FREE(sched, sizeof(*sched));
 }
+EXPORT_SYMBOL(cfs_wi_sched_destroy);
 
-static void
-cfs_wi_sched_shutdown(cfs_wi_sched_t *sched)
+int
+cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab,
+                   int cpt, int nthrs, struct cfs_wi_sched **sched_pp)
 {
-        cfs_wi_sched_lock(sched);
-
-        LASSERT(cfs_list_empty(&sched->ws_runq));
-        LASSERT(cfs_list_empty(&sched->ws_rerunq));
-
-        sched->ws_shuttingdown = 1;
-
-#ifdef __KERNEL__
-        cfs_waitq_broadcast(&sched->ws_waitq);
-#endif
-        cfs_wi_sched_unlock(sched);
+       struct cfs_wi_sched     *sched;
+
+       LASSERT(cfs_wi_data.wi_init);
+       LASSERT(!cfs_wi_data.wi_stopping);
+       LASSERT(cptab == NULL || cpt == CFS_CPT_ANY ||
+               (cpt >= 0 && cpt < cfs_cpt_number(cptab)));
+
+       LIBCFS_ALLOC(sched, sizeof(*sched));
+       if (sched == NULL)
+               return -ENOMEM;
+
+       if (strlen(name) > sizeof(sched->ws_name)-1) {
+               LIBCFS_FREE(sched, sizeof(*sched));
+               return -E2BIG;
+       }
+       strlcpy(sched->ws_name, name, sizeof(sched->ws_name));
+
+       sched->ws_cptab = cptab;
+       sched->ws_cpt = cpt;
+
+       spin_lock_init(&sched->ws_lock);
+       init_waitqueue_head(&sched->ws_waitq);
+
+       INIT_LIST_HEAD(&sched->ws_runq);
+       INIT_LIST_HEAD(&sched->ws_rerunq);
+       INIT_LIST_HEAD(&sched->ws_list);
+
+       for (; nthrs > 0; nthrs--)  {
+               char                    name[16];
+               struct task_struct      *task;
+
+               spin_lock(&cfs_wi_data.wi_glock);
+               while (sched->ws_starting > 0) {
+                       spin_unlock(&cfs_wi_data.wi_glock);
+                       schedule();
+                       spin_lock(&cfs_wi_data.wi_glock);
+               }
+
+               sched->ws_starting++;
+               spin_unlock(&cfs_wi_data.wi_glock);
+
+               if (sched->ws_cptab != NULL && sched->ws_cpt >= 0) {
+                       snprintf(name, sizeof(name), "%s_%02d_%02d",
+                                sched->ws_name, sched->ws_cpt,
+                                sched->ws_nthreads);
+               } else {
+                       snprintf(name, sizeof(name), "%s_%02d",
+                                sched->ws_name, sched->ws_nthreads);
+               }
+
+               task = kthread_run(cfs_wi_scheduler, sched, name);
+               if (IS_ERR(task)) {
+                       int rc = PTR_ERR(task);
+
+                       CERROR("Failed to create thread for "
+                               "WI scheduler %s: %d\n", name, rc);
+
+                       spin_lock(&cfs_wi_data.wi_glock);
+
+                       /* cfs_wi_sched_destroy() needs sched on the list */
+                       list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
+                       sched->ws_starting--;
+
+                       spin_unlock(&cfs_wi_data.wi_glock);
+
+                       cfs_wi_sched_destroy(sched);
+                       return rc;
+               }
+       }
+
+       spin_lock(&cfs_wi_data.wi_glock);
+       list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
+       spin_unlock(&cfs_wi_data.wi_glock);
+
+       *sched_pp = sched;
+       return 0;
 }
-
+EXPORT_SYMBOL(cfs_wi_sched_create);
 
 int
-cfs_wi_startup (void)
+cfs_wi_startup(void)
 {
-        int i;
-        int n;
-        int rc;
-
-        cfs_wi_data.wi_nthreads = 0;
-        cfs_wi_data.wi_nsched   = CFS_WI_NSCHED;
-        LIBCFS_ALLOC(cfs_wi_data.wi_scheds,
-                     cfs_wi_data.wi_nsched * sizeof(cfs_wi_sched_t));
-        if (cfs_wi_data.wi_scheds == NULL)
-                return -ENOMEM;
-
-        cfs_spin_lock_init(&cfs_wi_data.wi_glock);
-        for (i = 0; i < cfs_wi_data.wi_nsched; i++)
-                cfs_wi_sched_init(&cfs_wi_data.wi_scheds[i]);
-
-#ifdef __KERNEL__
-        n = cfs_num_online_cpus();
-        for (i = 0; i <= n; i++) {
-                rc = cfs_wi_start_thread(cfs_wi_scheduler,
-                                         (void *)(long_ptr_t)(i == n ? -1 : i));
-                if (rc != 0) {
-                        CERROR ("Can't spawn workitem scheduler: %d\n", rc);
-                        cfs_wi_shutdown();
-                        return rc;
-                }
-        }
-#else
-        n = rc = 0;
-#endif
+       memset(&cfs_wi_data, 0, sizeof(struct cfs_workitem_data));
+
+       spin_lock_init(&cfs_wi_data.wi_glock);
+       INIT_LIST_HEAD(&cfs_wi_data.wi_scheds);
+       cfs_wi_data.wi_init = 1;
 
-        return 0;
+       return 0;
 }
 
 void
 cfs_wi_shutdown (void)
 {
-        int i;
-
-        if (cfs_wi_data.wi_scheds == NULL)
-                return;
-
-        for (i = 0; i < cfs_wi_data.wi_nsched; i++)
-                cfs_wi_sched_shutdown(&cfs_wi_data.wi_scheds[i]);
-
-#ifdef __KERNEL__
-        cfs_spin_lock(&cfs_wi_data.wi_glock);
-        i = 2;
-        while (cfs_wi_data.wi_nthreads != 0) {
-                CDEBUG(IS_PO2(++i) ? D_WARNING : D_NET,
-                       "waiting for %d threads to terminate\n",
-                       cfs_wi_data.wi_nthreads);
-                cfs_spin_unlock(&cfs_wi_data.wi_glock);
-
-                cfs_pause(cfs_time_seconds(1));
-
-                cfs_spin_lock(&cfs_wi_data.wi_glock);
-        }
-        cfs_spin_unlock(&cfs_wi_data.wi_glock);
-#endif
-        LIBCFS_FREE(cfs_wi_data.wi_scheds,
-                    cfs_wi_data.wi_nsched * sizeof(cfs_wi_sched_t));
-        return;
+       struct cfs_wi_sched     *sched;
+
+       spin_lock(&cfs_wi_data.wi_glock);
+       cfs_wi_data.wi_stopping = 1;
+       spin_unlock(&cfs_wi_data.wi_glock);
+
+       /* nobody should contend on this list */
+       list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
+               sched->ws_stopping = 1;
+               wake_up_all(&sched->ws_waitq);
+       }
+
+       list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
+               spin_lock(&cfs_wi_data.wi_glock);
+
+               while (sched->ws_nthreads != 0) {
+                       spin_unlock(&cfs_wi_data.wi_glock);
+                       set_current_state(TASK_UNINTERRUPTIBLE);
+                       schedule_timeout(cfs_time_seconds(1) / 20);
+                       spin_lock(&cfs_wi_data.wi_glock);
+               }
+               spin_unlock(&cfs_wi_data.wi_glock);
+       }
+
+       while (!list_empty(&cfs_wi_data.wi_scheds)) {
+               sched = list_entry(cfs_wi_data.wi_scheds.next,
+                                      struct cfs_wi_sched, ws_list);
+               list_del(&sched->ws_list);
+               LIBCFS_FREE(sched, sizeof(*sched));
+       }
+
+       cfs_wi_data.wi_stopping = 0;
+       cfs_wi_data.wi_init = 0;
 }