4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2012, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
36 * libcfs/libcfs/workitem.c
38 * Author: Isaac Huang <isaac@clusterfs.com>
39 * Liang Zhen <zhen.liang@sun.com>
42 #define DEBUG_SUBSYSTEM S_LNET
44 #include <libcfs/libcfs.h>
46 #define CFS_WS_NAME_LEN 16
/*
 * Per-scheduler state: a named pool of worker threads draining a queue of
 * workitems.  NOTE(review): this extract appears to be missing several
 * member declarations (e.g. ws_lock, ws_cpt, ws_nscheduled) whose doc
 * comments survive below -- confirm against the full source.
 */
typedef struct cfs_wi_sched {
	struct list_head	ws_list;	/* chain on global list */
	/** serialised workitems */
	/** where schedulers sleep */
	wait_queue_head_t	ws_waitq;
	/** concurrent workitems */
	struct list_head	ws_runq;
	/** rescheduled running-workitems, a workitem can be rescheduled
	 * while running in wi_action(), but we don't want to execute it again
	 * unless it returns from wi_action(), so we put it on ws_rerunq
	 * while rescheduling, and move it to runq after it returns */
	struct list_head	ws_rerunq;
	/** CPT-table for this scheduler */
	struct cfs_cpt_table	*ws_cptab;
	/** CPT id for affinity */
	/** number of scheduled workitems */
	/** started scheduler thread, protected by cfs_wi_data::wi_glock */
	unsigned int		ws_nthreads:30;
	/** shutting down, protected by cfs_wi_data::wi_glock */
	unsigned int		ws_stopping:1;
	/** serialize starting thread, protected by cfs_wi_data::wi_glock */
	unsigned int		ws_starting:1;
	/** scheduler name, used as prefix for worker thread names */
	char			ws_name[CFS_WS_NAME_LEN];
/* Global module state shared by all schedulers (single instance). */
struct cfs_workitem_data {
	/** list of all schedulers */
	struct list_head	wi_scheds;
	/** WI module is initialized */
	/** shutting down the whole WI module */
/* Kernel build: take this scheduler's private spinlock. */
cfs_wi_sched_lock(cfs_wi_sched_t *sched)
	spin_lock(&sched->ws_lock);
/* Kernel build: release this scheduler's private spinlock. */
cfs_wi_sched_unlock(cfs_wi_sched_t *sched)
	spin_unlock(&sched->ws_lock);
/*
 * May a scheduler thread go to sleep?  Not while the scheduler is
 * stopping or its runq still holds pending workitems.
 * NOTE(review): the return statements are elided in this extract.
 */
cfs_wi_sched_cansleep(cfs_wi_sched_t *sched)
	cfs_wi_sched_lock(sched);
	if (sched->ws_stopping) {
		/* shutting down: thread must wake up and exit */
		cfs_wi_sched_unlock(sched);

	if (!list_empty(&sched->ws_runq)) {
		/* work still pending: don't sleep */
		cfs_wi_sched_unlock(sched);

	cfs_wi_sched_unlock(sched);
121 #else /* !__KERNEL__ */
/* Userspace build: serialise everything via the single global lock. */
cfs_wi_sched_lock(cfs_wi_sched_t *sched)
	spin_lock(&cfs_wi_data.wi_glock);
/* Userspace build: release the single global lock. */
cfs_wi_sched_unlock(cfs_wi_sched_t *sched)
	spin_unlock(&cfs_wi_data.wi_glock);
135 #endif /* __KERNEL__ */
/**
 * Detach workitem \a wi from scheduler \a sched.  Constraints:
 * 0. it only works when called from wi->wi_action.
 * 1. when it returns no one shall try to schedule the workitem.
 */
cfs_wi_exit(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
	LASSERT(!in_interrupt()); /* because we use plain spinlock */
	LASSERT(!sched->ws_stopping);

	cfs_wi_sched_lock(sched);

	/* only legal from inside wi_action(), so the item must be running */
	LASSERT(wi->wi_running);

	if (wi->wi_scheduled) { /* cancel pending schedules */
		LASSERT(!list_empty(&wi->wi_list));
		list_del_init(&wi->wi_list);

		LASSERT(sched->ws_nscheduled > 0);
		sched->ws_nscheduled--;

	LASSERT(list_empty(&wi->wi_list));

	wi->wi_scheduled = 1; /* LBUG future schedule attempts */
	cfs_wi_sched_unlock(sched);
EXPORT_SYMBOL(cfs_wi_exit);
/**
 * cancel schedule request of workitem \a wi
 * Returns 1 if \a wi was descheduled before its wi_action ran (so no
 * race with wi_action is possible), 0 if it is already running.
 */
cfs_wi_deschedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
	LASSERT(!in_interrupt()); /* because we use plain spinlock */
	LASSERT(!sched->ws_stopping);

	/*
	 * return 0 if it's running already, otherwise return 1, which
	 * means the workitem will not be scheduled and will not have
	 * any race with wi_action.
	 */
	cfs_wi_sched_lock(sched);

	rc = !(wi->wi_running);

	if (wi->wi_scheduled) { /* cancel pending schedules */
		LASSERT(!list_empty(&wi->wi_list));
		list_del_init(&wi->wi_list);

		LASSERT(sched->ws_nscheduled > 0);
		sched->ws_nscheduled--;

		wi->wi_scheduled = 0;

	LASSERT (list_empty(&wi->wi_list));

	cfs_wi_sched_unlock(sched);
EXPORT_SYMBOL(cfs_wi_deschedule);
/**
 * Workitem scheduled with (serial == 1) is strictly serialised not only with
 * itself, but also with others scheduled this way.
 *
 * Now there's only one static serialised queue, but in the future more might
 * be added, and even dynamic creation of serialised queues might be supported.
 */
cfs_wi_schedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
	LASSERT(!in_interrupt()); /* because we use plain spinlock */
	LASSERT(!sched->ws_stopping);

	cfs_wi_sched_lock(sched);

	if (!wi->wi_scheduled) {
		LASSERT (list_empty(&wi->wi_list));

		wi->wi_scheduled = 1;
		sched->ws_nscheduled++;
		if (!wi->wi_running) {
			/* idle: queue on runq and wake a scheduler thread */
			list_add_tail(&wi->wi_list, &sched->ws_runq);
			wake_up(&sched->ws_waitq);
			/* (else branch) running: park on rerunq; the
			 * scheduler moves it back to runq after its
			 * wi_action() returns */
			list_add(&wi->wi_list, &sched->ws_rerunq);

	LASSERT (!list_empty(&wi->wi_list));
	cfs_wi_sched_unlock(sched);
EXPORT_SYMBOL(cfs_wi_schedule);
/**
 * Main loop of a workitem scheduler thread: pull workitems off ws_runq,
 * run their wi_action() callbacks, and sleep when the queue is drained.
 * NOTE(review): this extract is missing some lines (declarations of
 * wi/rc/nloops, several branch bodies and braces); comments below only
 * describe what is visible.
 */
cfs_wi_scheduler (void *arg)
	struct cfs_wi_sched *sched = (cfs_wi_sched_t *)arg;

	/* CPT affinity scheduler? */
	if (sched->ws_cptab != NULL)
		if (cfs_cpt_bind(sched->ws_cptab, sched->ws_cpt) != 0)
			CWARN("Failed to bind %s on CPT %d\n",
			      sched->ws_name, sched->ws_cpt);

	/* hand-shake with cfs_wi_sched_create(): account this thread as
	 * started, under the global lock */
	spin_lock(&cfs_wi_data.wi_glock);

	LASSERT(sched->ws_starting == 1);
	sched->ws_starting--;
	sched->ws_nthreads++;

	spin_unlock(&cfs_wi_data.wi_glock);

	cfs_wi_sched_lock(sched);

	while (!sched->ws_stopping) {
		/* drain up to CFS_WI_RESCHED items before yielding */
		while (!list_empty(&sched->ws_runq) &&
		       nloops < CFS_WI_RESCHED) {
			wi = list_entry(sched->ws_runq.next,
					cfs_workitem_t, wi_list);
			LASSERT(wi->wi_scheduled && !wi->wi_running);

			list_del_init(&wi->wi_list);

			LASSERT(sched->ws_nscheduled > 0);
			sched->ws_nscheduled--;

			wi->wi_scheduled = 0;

			/* drop the lock across the callback: wi_action()
			 * may reschedule or even free the workitem */
			cfs_wi_sched_unlock(sched);

			rc = (*wi->wi_action) (wi);

			cfs_wi_sched_lock(sched);
			if (rc != 0) /* WI should be dead, even be freed! */

			if (list_empty(&wi->wi_list))

			LASSERT(wi->wi_scheduled);
			/* wi is rescheduled, should be on rerunq now, we
			 * move it to runq so it can run action now */
			list_move_tail(&wi->wi_list, &sched->ws_runq);
		if (!list_empty(&sched->ws_runq)) {
			cfs_wi_sched_unlock(sched);
			/* don't sleep because some workitems still
			 * expect me to come back soon */
			cfs_wi_sched_lock(sched);

		cfs_wi_sched_unlock(sched);
		/* runq empty: sleep until woken or stopping */
		rc = wait_event_interruptible_exclusive(sched->ws_waitq,
				!cfs_wi_sched_cansleep(sched));
		cfs_wi_sched_lock(sched);

	cfs_wi_sched_unlock(sched);

	/* account this thread as gone, for destroy/shutdown waiters */
	spin_lock(&cfs_wi_data.wi_glock);
	sched->ws_nthreads--;
	spin_unlock(&cfs_wi_data.wi_glock);
330 #else /* __KERNEL__ */
/**
 * Userspace event loop: make one pass over all schedulers, executing
 * pending workitems under the global lock.
 * NOTE(review): the extract is truncated here (outer loop header,
 * scheduler-selection and return paths are missing); comments describe
 * only the visible logic.
 */
cfs_wi_check_events (void)
	spin_lock(&cfs_wi_data.wi_glock);

		struct cfs_wi_sched *sched = NULL;
		struct cfs_wi_sched *tmp;

		/** rerunq is always empty for userspace */
		list_for_each_entry(tmp, &cfs_wi_data.wi_scheds, ws_list) {
			if (!list_empty(&tmp->ws_runq)) {

		/* dequeue the first pending workitem of the chosen sched */
		wi = list_entry(sched->ws_runq.next,
				cfs_workitem_t, wi_list);
		list_del_init(&wi->wi_list);

		LASSERT(sched->ws_nscheduled > 0);
		sched->ws_nscheduled--;

		LASSERT(wi->wi_scheduled);
		wi->wi_scheduled = 0;
		/* drop the global lock while running the callback */
		spin_unlock(&cfs_wi_data.wi_glock);

		(*wi->wi_action) (wi);

		spin_lock(&cfs_wi_data.wi_glock);

	spin_unlock(&cfs_wi_data.wi_glock);
/**
 * Stop scheduler \a sched: mark it stopping, wake all of its threads,
 * wait for them to exit, unlink it from the global list and free it.
 */
cfs_wi_sched_destroy(struct cfs_wi_sched *sched)
	LASSERT(cfs_wi_data.wi_init);
	LASSERT(!cfs_wi_data.wi_stopping);

	spin_lock(&cfs_wi_data.wi_glock);
	if (sched->ws_stopping) {
		/* someone else is already tearing this scheduler down */
		CDEBUG(D_INFO, "%s is in progress of stopping\n",
		spin_unlock(&cfs_wi_data.wi_glock);

	LASSERT(!list_empty(&sched->ws_list));
	sched->ws_stopping = 1;

	spin_unlock(&cfs_wi_data.wi_glock);

	wake_up_all(&sched->ws_waitq);

	spin_lock(&cfs_wi_data.wi_glock);
	/* poll until every worker thread has decremented ws_nthreads */
	while (sched->ws_nthreads > 0) {
		CDEBUG(IS_PO2(++i) ? D_WARNING : D_NET,
		       "waiting for %d threads of WI sched[%s] to "
		       "terminate\n", sched->ws_nthreads,

		spin_unlock(&cfs_wi_data.wi_glock);
		cfs_pause(cfs_time_seconds(1) / 20);
		spin_lock(&cfs_wi_data.wi_glock);

	list_del(&sched->ws_list);

	spin_unlock(&cfs_wi_data.wi_glock);

	/* nothing may remain scheduled once all threads have exited */
	LASSERT(sched->ws_nscheduled == 0);

	LIBCFS_FREE(sched, sizeof(*sched));
EXPORT_SYMBOL(cfs_wi_sched_destroy);
427 cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab,
428 int cpt, int nthrs, struct cfs_wi_sched **sched_pp)
430 struct cfs_wi_sched *sched;
432 LASSERT(cfs_wi_data.wi_init);
433 LASSERT(!cfs_wi_data.wi_stopping);
434 LASSERT(cptab == NULL || cpt == CFS_CPT_ANY ||
435 (cpt >= 0 && cpt < cfs_cpt_number(cptab)));
437 LIBCFS_ALLOC(sched, sizeof(*sched));
441 if (strlen(name) > sizeof(sched->ws_name)-1) {
442 LIBCFS_FREE(sched, sizeof(*sched));
445 strlcpy(sched->ws_name, name, sizeof(sched->ws_name));
447 sched->ws_cptab = cptab;
451 spin_lock_init(&sched->ws_lock);
452 init_waitqueue_head(&sched->ws_waitq);
454 INIT_LIST_HEAD(&sched->ws_runq);
455 INIT_LIST_HEAD(&sched->ws_rerunq);
456 INIT_LIST_HEAD(&sched->ws_list);
459 for (; nthrs > 0; nthrs--) {
461 struct task_struct *task;
463 spin_lock(&cfs_wi_data.wi_glock);
464 while (sched->ws_starting > 0) {
465 spin_unlock(&cfs_wi_data.wi_glock);
467 spin_lock(&cfs_wi_data.wi_glock);
470 sched->ws_starting++;
471 spin_unlock(&cfs_wi_data.wi_glock);
473 if (sched->ws_cptab != NULL && sched->ws_cpt >= 0) {
474 snprintf(name, sizeof(name), "%s_%02d_%02d",
475 sched->ws_name, sched->ws_cpt,
478 snprintf(name, sizeof(name), "%s_%02d",
479 sched->ws_name, sched->ws_nthreads);
482 task = kthread_run(cfs_wi_scheduler, sched, name);
484 int rc = PTR_ERR(task);
486 CERROR("Failed to create thread for "
487 "WI scheduler %s: %d\n", name, rc);
489 spin_lock(&cfs_wi_data.wi_glock);
491 /* make up for cfs_wi_sched_destroy */
492 list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
493 sched->ws_starting--;
495 spin_unlock(&cfs_wi_data.wi_glock);
497 cfs_wi_sched_destroy(sched);
502 spin_lock(&cfs_wi_data.wi_glock);
503 list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
504 spin_unlock(&cfs_wi_data.wi_glock);
509 EXPORT_SYMBOL(cfs_wi_sched_create);
	/* Body of cfs_wi_startup(): reset global state, initialize the
	 * global lock and scheduler list, and mark the module initialized.
	 * NOTE(review): the function header is not visible in this extract
	 * -- presumably this is cfs_wi_startup(); confirm in full source. */
	memset(&cfs_wi_data, 0, sizeof(cfs_wi_data));

	spin_lock_init(&cfs_wi_data.wi_glock);
	INIT_LIST_HEAD(&cfs_wi_data.wi_scheds);
	cfs_wi_data.wi_init = 1;
/**
 * Module shutdown: stop every scheduler, wait for all worker threads to
 * exit, then free all scheduler structures.
 */
cfs_wi_shutdown (void)
	struct cfs_wi_sched *sched;

	spin_lock(&cfs_wi_data.wi_glock);
	cfs_wi_data.wi_stopping = 1;
	spin_unlock(&cfs_wi_data.wi_glock);

	/* nobody should contend on this list */
	list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
		sched->ws_stopping = 1;
		wake_up_all(&sched->ws_waitq);

	/* second pass: poll until each scheduler's threads are all gone */
	list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
		spin_lock(&cfs_wi_data.wi_glock);

		while (sched->ws_nthreads != 0) {
			spin_unlock(&cfs_wi_data.wi_glock);
			cfs_pause(cfs_time_seconds(1) / 20);
			spin_lock(&cfs_wi_data.wi_glock);

		spin_unlock(&cfs_wi_data.wi_glock);

	/* all threads gone: unlink and free every scheduler */
	while (!list_empty(&cfs_wi_data.wi_scheds)) {
		sched = list_entry(cfs_wi_data.wi_scheds.next,
				   struct cfs_wi_sched, ws_list);
		list_del(&sched->ws_list);
		LIBCFS_FREE(sched, sizeof(*sched));

	cfs_wi_data.wi_stopping = 0;
	cfs_wi_data.wi_init = 0;