/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, Whamcloud, Inc.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * libcfs/libcfs/workitem.c
 *
 * Author: Isaac Huang
 *         Liang Zhen
 */

#define DEBUG_SUBSYSTEM S_LNET

#include <libcfs/libcfs.h>

typedef struct cfs_wi_sched {
#ifdef __KERNEL__
        /** serialises access to the workitem queues */
        cfs_spinlock_t  ws_lock;
        /** where scheduler threads sleep */
        cfs_waitq_t     ws_waitq;
#endif
        /** concurrent workitems */
        cfs_list_t      ws_runq;
        /** rescheduled running-workitems */
        cfs_list_t      ws_rerunq;
        /** shutting down */
        int             ws_shuttingdown;
} cfs_wi_sched_t;

#ifdef __KERNEL__
/**
 * we have 2 cfs_wi_sched_t so far:
 * one for CFS_WI_SCHED_ANY, another for CFS_WI_SCHED_SERIAL
 * a per-CPU implementation will be added for SMP scalability
 */
#define CFS_WI_NSCHED   2
#else
/** always 2 for userspace */
#define CFS_WI_NSCHED   2
#endif /* __KERNEL__ */

struct cfs_workitem_data {
        /** serialize */
        cfs_spinlock_t  wi_glock;
        /** number of cfs_wi_sched_t */
        int             wi_nsched;
        /** number of threads (all schedulers) */
        int             wi_nthreads;
        /** array of schedulers */
        cfs_wi_sched_t *wi_scheds;
} cfs_wi_data;

static inline cfs_wi_sched_t *
cfs_wi_to_sched(cfs_workitem_t *wi)
{
        LASSERT(wi->wi_sched_id == CFS_WI_SCHED_ANY ||
                wi->wi_sched_id == CFS_WI_SCHED_SERIAL ||
                (wi->wi_sched_id >= 0 &&
                 wi->wi_sched_id < cfs_wi_data.wi_nsched));

        if (wi->wi_sched_id == CFS_WI_SCHED_ANY)
                return &cfs_wi_data.wi_scheds[0];
        if (wi->wi_sched_id == CFS_WI_SCHED_SERIAL)
                return &cfs_wi_data.wi_scheds[cfs_wi_data.wi_nsched - 1];

        return &cfs_wi_data.wi_scheds[wi->wi_sched_id];
}

#ifdef __KERNEL__
static inline void
cfs_wi_sched_lock(cfs_wi_sched_t *sched)
{
        cfs_spin_lock(&sched->ws_lock);
}

static inline void
cfs_wi_sched_unlock(cfs_wi_sched_t *sched)
{
        cfs_spin_unlock(&sched->ws_lock);
}

static inline int
cfs_wi_sched_cansleep(cfs_wi_sched_t *sched)
{
        cfs_wi_sched_lock(sched);
        if (sched->ws_shuttingdown) {
                cfs_wi_sched_unlock(sched);
                return 0;
        }

        if (!cfs_list_empty(&sched->ws_runq)) {
                cfs_wi_sched_unlock(sched);
                return 0;
        }
        cfs_wi_sched_unlock(sched);
        return 1;
}

#else

static inline void
cfs_wi_sched_lock(cfs_wi_sched_t *sched)
{
        cfs_spin_lock(&cfs_wi_data.wi_glock);
}

static inline void
cfs_wi_sched_unlock(cfs_wi_sched_t *sched)
{
        cfs_spin_unlock(&cfs_wi_data.wi_glock);
}

#endif

/* XXX:
 * 0. it only works when called from wi->wi_action.
 * 1. when it returns, no one shall try to schedule the workitem again.
 */
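/*
 * Illustrative sketch (not part of the original file): a self-destructing
 * workitem action that calls cfs_wi_exit() from within wi->wi_action, as
 * the note above requires.  Once cfs_wi_exit() returns, the action may
 * free the embedding object and must return nonzero so the scheduler never
 * touches the workitem again.  The cfs_wi_init() helper is assumed to come
 * from the workitem header; struct my_worker and my_worker_done() are
 * hypothetical.
 *
 *      struct my_worker {
 *              cfs_workitem_t mw_wi;
 *              int            mw_done;
 *      };
 *
 *      static int
 *      my_worker_action(cfs_workitem_t *wi)
 *      {
 *              struct my_worker *mw = wi->wi_data;
 *
 *              my_worker_done(mw);                // do the actual work
 *              if (mw->mw_done) {
 *                      cfs_wi_exit(wi);           // forbid future schedules
 *                      LIBCFS_FREE(mw, sizeof(*mw));
 *                      return 1;                  // nonzero: wi is dead/freed
 *              }
 *              return 0;                          // zero: wi stays alive
 *      }
 */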
void
cfs_wi_exit(cfs_workitem_t *wi)
{
        cfs_wi_sched_t *sched = cfs_wi_to_sched(wi);

        LASSERT (!cfs_in_interrupt()); /* because we use plain spinlock */
        LASSERT (!sched->ws_shuttingdown);

        cfs_wi_sched_lock(sched);

#ifdef __KERNEL__
        LASSERT (wi->wi_running);
#endif
        if (wi->wi_scheduled) { /* cancel pending schedules */
                LASSERT (!cfs_list_empty(&wi->wi_list));
                cfs_list_del_init(&wi->wi_list);
        }

        LASSERT (cfs_list_empty(&wi->wi_list));
        wi->wi_scheduled = 1; /* LBUG future schedule attempts */

        cfs_wi_sched_unlock(sched);
        return;
}
CFS_EXPORT_SYMBOL(cfs_wi_exit);

/**
 * cancel a workitem: returns 1 if the workitem was cancelled before its
 * action ran, or 0 if the action is already running and cannot be stopped.
 */
int
cfs_wi_cancel (cfs_workitem_t *wi)
{
        cfs_wi_sched_t *sched = cfs_wi_to_sched(wi);
        int             rc;

        LASSERT (!cfs_in_interrupt()); /* because we use plain spinlock */
        LASSERT (!sched->ws_shuttingdown);

        cfs_wi_sched_lock(sched);
        /*
         * return 0 if it's running already, otherwise return 1, which
         * means the workitem will not be scheduled and will not have
         * any race with wi_action.
         */
        rc = !(wi->wi_running);

        if (wi->wi_scheduled) { /* cancel pending schedules */
                LASSERT (!cfs_list_empty(&wi->wi_list));
                cfs_list_del_init(&wi->wi_list);
                wi->wi_scheduled = 0;
        }

        LASSERT (cfs_list_empty(&wi->wi_list));

        cfs_wi_sched_unlock(sched);
        return rc;
}
CFS_EXPORT_SYMBOL(cfs_wi_cancel);

/*
 * A workitem scheduled with (serial == 1) is strictly serialised not only
 * with itself, but also with all others scheduled this way.
 *
 * For now there is only one static serialised queue, but in the future more
 * might be added, and even dynamic creation of serialised queues might be
 * supported.
 */
void
cfs_wi_schedule(cfs_workitem_t *wi)
{
        cfs_wi_sched_t *sched = cfs_wi_to_sched(wi);

        LASSERT (!cfs_in_interrupt()); /* because we use plain spinlock */
        LASSERT (!sched->ws_shuttingdown);

        cfs_wi_sched_lock(sched);

        if (!wi->wi_scheduled) {
                LASSERT (cfs_list_empty(&wi->wi_list));

                wi->wi_scheduled = 1;
                if (!wi->wi_running) {
                        cfs_list_add_tail(&wi->wi_list, &sched->ws_runq);
#ifdef __KERNEL__
                        cfs_waitq_signal(&sched->ws_waitq);
#endif
                } else {
                        cfs_list_add(&wi->wi_list, &sched->ws_rerunq);
                }
        }

        LASSERT (!cfs_list_empty(&wi->wi_list));
        cfs_wi_sched_unlock(sched);
        return;
}
CFS_EXPORT_SYMBOL(cfs_wi_schedule);
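/*
 * Illustrative sketch (not part of the original file): scheduling a
 * workitem on the serialised queue.  cfs_wi_schedule() is idempotent while
 * a schedule is pending, and a schedule issued while wi_action is running
 * parks the workitem on ws_rerunq until the action returns.  The
 * cfs_wi_init() helper is assumed from the workitem header; ping and
 * ping_action are hypothetical.
 *
 *      static int ping_action(cfs_workitem_t *wi);
 *      static cfs_workitem_t ping;
 *
 *      cfs_wi_init(&ping, NULL, ping_action);
 *      ping.wi_sched_id = CFS_WI_SCHED_SERIAL;
 *      cfs_wi_schedule(&ping);    // queued on the serial scheduler's runq
 *      cfs_wi_schedule(&ping);    // no-op: wi_scheduled is already set
 */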
#ifdef __KERNEL__

static int
cfs_wi_scheduler (void *arg)
{
        int             id     = (int)(long_ptr_t) arg;
        int             serial = (id == -1);
        char            name[24];
        cfs_wi_sched_t *sched;

        if (serial) {
                sched = &cfs_wi_data.wi_scheds[cfs_wi_data.wi_nsched - 1];
                cfs_daemonize("wi_serial_sd");
        } else {
                /* will be sched = &cfs_wi_data.wi_scheds[id] in the future */
                sched = &cfs_wi_data.wi_scheds[0];
                snprintf(name, sizeof(name), "cfs_wi_sd%03d", id);
                cfs_daemonize(name);
        }

        cfs_block_allsigs();

        cfs_wi_sched_lock(sched);

        while (!sched->ws_shuttingdown) {
                int             nloops = 0;
                int             rc;
                cfs_workitem_t *wi;

                while (!cfs_list_empty(&sched->ws_runq) &&
                       nloops < CFS_WI_RESCHED) {
                        wi = cfs_list_entry(sched->ws_runq.next,
                                            cfs_workitem_t, wi_list);
                        LASSERT (wi->wi_scheduled && !wi->wi_running);

                        cfs_list_del_init(&wi->wi_list);

                        wi->wi_running   = 1;
                        wi->wi_scheduled = 0;
                        cfs_wi_sched_unlock(sched);
                        nloops++;

                        rc = (*wi->wi_action) (wi);

                        cfs_wi_sched_lock(sched);
                        if (rc != 0) /* the workitem is dead and may even
                                      * have been freed; don't touch it */
                                continue;

                        wi->wi_running = 0;
                        if (cfs_list_empty(&wi->wi_list))
                                continue;

                        LASSERT (wi->wi_scheduled);
                        /* wi was rescheduled while running, so it is on
                         * the rerunq now; move it back to the runq so its
                         * action can run again */
                        cfs_list_move_tail(&wi->wi_list, &sched->ws_runq);
                }

                if (!cfs_list_empty(&sched->ws_runq)) {
                        cfs_wi_sched_unlock(sched);
                        /* don't sleep because some workitems still
                         * expect me to come back soon */
                        cfs_cond_resched();
                        cfs_wi_sched_lock(sched);
                        continue;
                }

                cfs_wi_sched_unlock(sched);
                cfs_wait_event_interruptible_exclusive(sched->ws_waitq,
                                !cfs_wi_sched_cansleep(sched), rc);
                cfs_wi_sched_lock(sched);
        }

        cfs_wi_sched_unlock(sched);

        cfs_spin_lock(&cfs_wi_data.wi_glock);
        cfs_wi_data.wi_nthreads--;
        cfs_spin_unlock(&cfs_wi_data.wi_glock);

        return 0;
}

static int
cfs_wi_start_thread (int (*func) (void*), void *arg)
{
        long pid;

        pid = cfs_create_thread(func, arg, 0);
        if (pid < 0)
                return (int)pid;

        cfs_spin_lock(&cfs_wi_data.wi_glock);
        cfs_wi_data.wi_nthreads++;
        cfs_spin_unlock(&cfs_wi_data.wi_glock);
        return 0;
}

#else /* __KERNEL__ */

/** run all pending workitems and return the number of actions executed */
int
cfs_wi_check_events (void)
{
        int             n = 0;
        cfs_workitem_t *wi;
        cfs_list_t     *q;

        cfs_spin_lock(&cfs_wi_data.wi_glock);

        for (;;) {
                /** rerunq is always empty for userspace */
                if (!cfs_list_empty(&cfs_wi_data.wi_scheds[1].ws_runq))
                        q = &cfs_wi_data.wi_scheds[1].ws_runq;
                else if (!cfs_list_empty(&cfs_wi_data.wi_scheds[0].ws_runq))
                        q = &cfs_wi_data.wi_scheds[0].ws_runq;
                else
                        break;

                wi = cfs_list_entry(q->next, cfs_workitem_t, wi_list);
                cfs_list_del_init(&wi->wi_list);

                LASSERT (wi->wi_scheduled);
                wi->wi_scheduled = 0;
                cfs_spin_unlock(&cfs_wi_data.wi_glock);

                n++;
                (*wi->wi_action) (wi);

                cfs_spin_lock(&cfs_wi_data.wi_glock);
        }

        cfs_spin_unlock(&cfs_wi_data.wi_glock);
        return n;
}

#endif

static void
cfs_wi_sched_init(cfs_wi_sched_t *sched)
{
        sched->ws_shuttingdown = 0;
#ifdef __KERNEL__
        cfs_spin_lock_init(&sched->ws_lock);
        cfs_waitq_init(&sched->ws_waitq);
#endif
        CFS_INIT_LIST_HEAD(&sched->ws_runq);
        CFS_INIT_LIST_HEAD(&sched->ws_rerunq);
}

static void
cfs_wi_sched_shutdown(cfs_wi_sched_t *sched)
{
        cfs_wi_sched_lock(sched);

        LASSERT(cfs_list_empty(&sched->ws_runq));
        LASSERT(cfs_list_empty(&sched->ws_rerunq));

        sched->ws_shuttingdown = 1;

#ifdef __KERNEL__
        cfs_waitq_broadcast(&sched->ws_waitq);
#endif
        cfs_wi_sched_unlock(sched);
}

int
cfs_wi_startup (void)
{
        int i;
        int n, rc;

        cfs_wi_data.wi_nthreads = 0;
        cfs_wi_data.wi_nsched   = CFS_WI_NSCHED;
        LIBCFS_ALLOC(cfs_wi_data.wi_scheds,
                     cfs_wi_data.wi_nsched * sizeof(cfs_wi_sched_t));
        if (cfs_wi_data.wi_scheds == NULL)
                return -ENOMEM;

        cfs_spin_lock_init(&cfs_wi_data.wi_glock);
        for (i = 0; i < cfs_wi_data.wi_nsched; i++)
                cfs_wi_sched_init(&cfs_wi_data.wi_scheds[i]);

#ifdef __KERNEL__
        /* spawn one scheduler thread per online CPU, plus one thread
         * for the serialised queue */
        n = cfs_num_online_cpus();
        for (i = 0; i <= n; i++) {
                rc = cfs_wi_start_thread(cfs_wi_scheduler,
                                         (void *)(long_ptr_t)(i == n ? -1 : i));
                if (rc != 0) {
                        CERROR ("Can't spawn workitem scheduler: %d\n", rc);
                        cfs_wi_shutdown();
                        return rc;
                }
        }
#else
        SET_BUT_UNUSED(rc);
        SET_BUT_UNUSED(n);
#endif

        return 0;
}
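/*
 * Illustrative sketch (not part of the original file): userspace builds
 * have no scheduler threads, so the application must drain the run queues
 * itself by polling cfs_wi_check_events() from its event loop.
 * wait_for_more_work() is a hypothetical blocking helper.
 *
 *      for (;;) {
 *              int did = cfs_wi_check_events();   // run pending wi_actions
 *              if (did == 0)
 *                      wait_for_more_work();      // nothing queued; block
 *      }
 */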
void
cfs_wi_shutdown (void)
{
        int i;

        if (cfs_wi_data.wi_scheds == NULL)
                return;

        for (i = 0; i < cfs_wi_data.wi_nsched; i++)
                cfs_wi_sched_shutdown(&cfs_wi_data.wi_scheds[i]);

#ifdef __KERNEL__
        cfs_spin_lock(&cfs_wi_data.wi_glock);
        i = 2;
        while (cfs_wi_data.wi_nthreads != 0) {
                CDEBUG(IS_PO2(++i) ? D_WARNING : D_NET,
                       "waiting for %d threads to terminate\n",
                       cfs_wi_data.wi_nthreads);
                cfs_spin_unlock(&cfs_wi_data.wi_glock);
                cfs_pause(cfs_time_seconds(1));
                cfs_spin_lock(&cfs_wi_data.wi_glock);
        }
        cfs_spin_unlock(&cfs_wi_data.wi_glock);
#endif
        LIBCFS_FREE(cfs_wi_data.wi_scheds,
                    cfs_wi_data.wi_nsched * sizeof(cfs_wi_sched_t));
        return;
}
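/*
 * Illustrative sketch (not part of the original file): module lifecycle.
 * cfs_wi_startup() pairs with cfs_wi_shutdown(); all workitems must have
 * finished (or been cancelled) before shutdown, since cfs_wi_sched_shutdown()
 * asserts that both the runq and the rerunq are empty.
 *
 *      rc = cfs_wi_startup();
 *      if (rc != 0)
 *              return rc;
 *      // ... schedule workitems, let them run or cancel them ...
 *      cfs_wi_shutdown();
 */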