4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2014, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
36 * libcfs/libcfs/workitem.c
38 * Author: Isaac Huang <isaac@clusterfs.com>
39 * Liang Zhen <zhen.liang@sun.com>
42 #define DEBUG_SUBSYSTEM S_LNET
44 #include <linux/kthread.h>
45 #include <libcfs/libcfs.h>
47 #define CFS_WS_NAME_LEN 16
/*
 * Per-scheduler state for the workitem (WI) module.  One of these exists per
 * call to cfs_wi_sched_create(); service threads (cfs_wi_scheduler) pull
 * workitems off ws_runq and execute their wi_action callbacks.
 *
 * NOTE(review): this extract appears to be missing members that the rest of
 * the file references: ws_lock (spinlock used by cfs_wi_sched_lock()),
 * ws_cpt (CPT id used by cfs_wi_scheduler()), ws_nscheduled (counter used by
 * cfs_wi_schedule()), and the closing "} cfs_wi_sched_t;" — confirm against
 * the original source.
 */
49 typedef struct cfs_wi_sched {
50 struct list_head ws_list; /* chain on global list */
51 /** serialised workitems */
53 /** where schedulers sleep */
54 wait_queue_head_t ws_waitq;
55 /** concurrent workitems */
56 struct list_head ws_runq;
57 /** rescheduled running-workitems: a workitem can be rescheduled
58 * while running in wi_action(), but we don't want to execute it again
59 * until it returns from wi_action(), so we put it on ws_rerunq
60 * while rescheduling, and move it to runq after it returns
62 struct list_head ws_rerunq;
63 /** CPT-table for this scheduler */
64 struct cfs_cpt_table *ws_cptab;
65 /** CPT id for affinity */
67 /** number of scheduled workitems */
69 /** started scheduler threads, protected by cfs_wi_data::wi_glock */
70 unsigned int ws_nthreads:30;
71 /** shutting down, protected by cfs_wi_data::wi_glock */
72 unsigned int ws_stopping:1;
73 /** serialize starting thread, protected by cfs_wi_data::wi_glock */
74 unsigned int ws_starting:1;
76 char ws_name[CFS_WS_NAME_LEN];
/*
 * Global module state: the list of all schedulers plus init/shutdown flags.
 *
 * NOTE(review): members referenced elsewhere in this file — wi_glock
 * (spinlock), wi_init and wi_stopping flags — and the closing
 * "} cfs_wi_data;" are not visible in this extract; confirm against the
 * original source.
 */
79 static struct cfs_workitem_data {
82 /** list of all schedulers */
83 struct list_head wi_scheds;
84 /** WI module is initialized */
86 /** shutting down the whole WI module */
/* Acquire the scheduler's private lock.  Plain (non-IRQ) spinlock: callers
 * assert !in_interrupt() before using it. */
91 cfs_wi_sched_lock(cfs_wi_sched_t *sched)
93 spin_lock(&sched->ws_lock);
/* Release the scheduler's private lock; pairs with cfs_wi_sched_lock(). */
97 cfs_wi_sched_unlock(cfs_wi_sched_t *sched)
99 spin_unlock(&sched->ws_lock);
/*
 * Predicate for the scheduler thread's wait loop (see
 * wait_event_interruptible_exclusive in cfs_wi_scheduler): the thread may
 * sleep only while it is not stopping and ws_runq is empty.
 *
 * NOTE(review): the return statements (presumably 0 on either early-exit
 * branch, 1 at the end) are not visible in this extract — confirm against
 * the original source.
 */
103 cfs_wi_sched_cansleep(cfs_wi_sched_t *sched)
105 cfs_wi_sched_lock(sched);
106 if (sched->ws_stopping) {
107 cfs_wi_sched_unlock(sched);
111 if (!list_empty(&sched->ws_runq)) {
112 cfs_wi_sched_unlock(sched);
115 cfs_wi_sched_unlock(sched);
120 * 0. it only works when called from wi->wi_action.
121 * 1. when it returns no one shall try to schedule the workitem.
/*
 * Permanently retire workitem \a wi: remove any pending reschedule from the
 * queues, then leave wi_scheduled set so that a later cfs_wi_schedule() on
 * this workitem trips the LASSERT there ("LBUG future schedule attempts").
 * Must be called from within wi->wi_action (wi_running is asserted).
 */
124 cfs_wi_exit(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
126 LASSERT(!in_interrupt()); /* because we use plain spinlock */
127 LASSERT(!sched->ws_stopping);
129 cfs_wi_sched_lock(sched);
131 LASSERT(wi->wi_running);
133 if (wi->wi_scheduled) { /* cancel pending schedules */
134 LASSERT(!list_empty(&wi->wi_list));
135 list_del_init(&wi->wi_list);
137 LASSERT(sched->ws_nscheduled > 0);
138 sched->ws_nscheduled--;
141 LASSERT(list_empty(&wi->wi_list));
143 wi->wi_scheduled = 1; /* LBUG future schedule attempts */
144 cfs_wi_sched_unlock(sched);
148 EXPORT_SYMBOL(cfs_wi_exit);
151 * cancel schedule request of workitem \a wi
/*
 * Returns 1 when \a wi was not running at the time of the call (so it is
 * guaranteed not to be scheduled and cannot race with wi_action), 0 when
 * wi_action() is currently executing.
 *
 * NOTE(review): the "int rc;" declaration and final "return rc;" are not
 * visible in this extract — confirm against the original source.
 */
154 cfs_wi_deschedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
158 LASSERT(!in_interrupt()); /* because we use plain spinlock */
159 LASSERT(!sched->ws_stopping);
162 * return 0 if it's running already, otherwise return 1, which
163 * means the workitem will not be scheduled and will not have
164 * any race with wi_action.
166 cfs_wi_sched_lock(sched);
168 rc = !(wi->wi_running);
170 if (wi->wi_scheduled) { /* cancel pending schedules */
171 LASSERT(!list_empty(&wi->wi_list));
172 list_del_init(&wi->wi_list);
174 LASSERT(sched->ws_nscheduled > 0);
175 sched->ws_nscheduled--;
177 wi->wi_scheduled = 0;
180 LASSERT (list_empty(&wi->wi_list));
182 cfs_wi_sched_unlock(sched);
185 EXPORT_SYMBOL(cfs_wi_deschedule);
188 * Workitem scheduled with (serial == 1) is strictly serialised not only with
189 * itself, but also with others scheduled this way.
191 * Now there's only one static serialised queue, but in the future more might
192 * be added, and even dynamic creation of serialised queues might be supported.
/*
 * Queue workitem \a wi on \a sched for execution.  Idempotent while already
 * scheduled.  If the workitem is currently running its wi_action, it is
 * parked on ws_rerunq instead of ws_runq so it is not executed concurrently
 * with itself; cfs_wi_scheduler() moves it back to ws_runq once the current
 * wi_action invocation returns.
 */
195 cfs_wi_schedule(struct cfs_wi_sched *sched, cfs_workitem_t *wi)
197 LASSERT(!in_interrupt()); /* because we use plain spinlock */
198 LASSERT(!sched->ws_stopping);
200 cfs_wi_sched_lock(sched);
202 if (!wi->wi_scheduled) {
203 LASSERT (list_empty(&wi->wi_list));
205 wi->wi_scheduled = 1;
206 sched->ws_nscheduled++;
207 if (!wi->wi_running) {
208 list_add_tail(&wi->wi_list, &sched->ws_runq);
209 wake_up(&sched->ws_waitq);
211 list_add(&wi->wi_list, &sched->ws_rerunq);
215 LASSERT (!list_empty(&wi->wi_list));
216 cfs_wi_sched_unlock(sched);
219 EXPORT_SYMBOL(cfs_wi_schedule);
/*
 * Main loop of a workitem service thread (started by kthread_run in
 * cfs_wi_sched_create).  Binds to the scheduler's CPT when affinity is
 * configured, announces itself under cfs_wi_data.wi_glock, then drains
 * ws_runq — running up to CFS_WI_RESCHED workitems between reschedule
 * points — until ws_stopping is set.
 *
 * NOTE(review): several lines are not visible in this extract (the local
 * declarations of wi/rc/nloops, setting wi->wi_running around the action
 * call, the "continue" after a dead workitem, and the cond_resched()-style
 * call in the "don't sleep" branch) — confirm against the original source.
 */
222 cfs_wi_scheduler (void *arg)
224 struct cfs_wi_sched *sched = (cfs_wi_sched_t *)arg;
228 /* CPT affinity scheduler? */
229 if (sched->ws_cptab != NULL)
230 if (cfs_cpt_bind(sched->ws_cptab, sched->ws_cpt) != 0)
231 CWARN("Failed to bind %s on CPT %d\n",
232 sched->ws_name, sched->ws_cpt);
234 spin_lock(&cfs_wi_data.wi_glock);
/* exactly one thread starts at a time; see ws_starting handshake in
 * cfs_wi_sched_create() */
236 LASSERT(sched->ws_starting == 1);
237 sched->ws_starting--;
238 sched->ws_nthreads++;
240 spin_unlock(&cfs_wi_data.wi_glock);
242 cfs_wi_sched_lock(sched);
244 while (!sched->ws_stopping) {
249 while (!list_empty(&sched->ws_runq) &&
250 nloops < CFS_WI_RESCHED) {
251 wi = list_entry(sched->ws_runq.next,
252 cfs_workitem_t, wi_list);
253 LASSERT(wi->wi_scheduled && !wi->wi_running);
255 list_del_init(&wi->wi_list);
257 LASSERT(sched->ws_nscheduled > 0);
258 sched->ws_nscheduled--;
261 wi->wi_scheduled = 0;
/* drop the lock across the callback: wi_action may block or call
 * cfs_wi_schedule()/cfs_wi_exit() on this scheduler */
264 cfs_wi_sched_unlock(sched);
267 rc = (*wi->wi_action) (wi);
269 cfs_wi_sched_lock(sched);
270 if (rc != 0) /* WI should be dead, maybe even freed! */
274 if (list_empty(&wi->wi_list))
277 LASSERT(wi->wi_scheduled);
278 /* wi is rescheduled, should be on rerunq now, we
279 * move it to runq so it can run action now */
280 list_move_tail(&wi->wi_list, &sched->ws_runq);
283 if (!list_empty(&sched->ws_runq)) {
284 cfs_wi_sched_unlock(sched);
285 /* don't sleep because some workitems still
286 * expect me to come back soon */
288 cfs_wi_sched_lock(sched);
292 cfs_wi_sched_unlock(sched);
293 rc = wait_event_interruptible_exclusive(sched->ws_waitq,
294 !cfs_wi_sched_cansleep(sched));
295 cfs_wi_sched_lock(sched);
298 cfs_wi_sched_unlock(sched);
/* de-register this thread before exiting; cfs_wi_sched_destroy()
 * waits for ws_nthreads to reach zero */
300 spin_lock(&cfs_wi_data.wi_glock);
301 sched->ws_nthreads--;
302 spin_unlock(&cfs_wi_data.wi_glock);
/*
 * Tear down scheduler \a sched: mark it stopping, wake all of its service
 * threads, poll (50 ms intervals) until every thread has exited, then unlink
 * it from the global list and free it.  A concurrent destroy of the same
 * scheduler returns early while the first caller finishes the job.
 *
 * NOTE(review): the declaration of the poll counter "i", the early "return"
 * in the already-stopping branch, and the closing brace are not visible in
 * this extract — confirm against the original source.
 */
308 cfs_wi_sched_destroy(struct cfs_wi_sched *sched)
310 LASSERT(cfs_wi_data.wi_init);
311 LASSERT(!cfs_wi_data.wi_stopping);
313 spin_lock(&cfs_wi_data.wi_glock);
314 if (sched->ws_stopping) {
315 CDEBUG(D_INFO, "%s is in progress of stopping\n",
317 spin_unlock(&cfs_wi_data.wi_glock);
321 LASSERT(!list_empty(&sched->ws_list));
322 sched->ws_stopping = 1;
324 spin_unlock(&cfs_wi_data.wi_glock);
326 wake_up_all(&sched->ws_waitq);
328 spin_lock(&cfs_wi_data.wi_glock);
332 while (sched->ws_nthreads > 0) {
/* escalate to D_WARNING at power-of-two iterations so a stuck
 * shutdown becomes visible without flooding the log */
333 CDEBUG(IS_PO2(++i) ? D_WARNING : D_NET,
334 "waiting for %d threads of WI sched[%s] to "
335 "terminate\n", sched->ws_nthreads,
338 spin_unlock(&cfs_wi_data.wi_glock);
339 set_current_state(TASK_UNINTERRUPTIBLE);
340 schedule_timeout(cfs_time_seconds(1) / 20);
341 spin_lock(&cfs_wi_data.wi_glock);
345 list_del(&sched->ws_list);
347 spin_unlock(&cfs_wi_data.wi_glock);
349 LASSERT(sched->ws_nscheduled == 0);
351 LIBCFS_FREE(sched, sizeof(*sched));
353 EXPORT_SYMBOL(cfs_wi_sched_destroy);
356 cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab,
357 int cpt, int nthrs, struct cfs_wi_sched **sched_pp)
359 struct cfs_wi_sched *sched;
361 LASSERT(cfs_wi_data.wi_init);
362 LASSERT(!cfs_wi_data.wi_stopping);
363 LASSERT(cptab == NULL || cpt == CFS_CPT_ANY ||
364 (cpt >= 0 && cpt < cfs_cpt_number(cptab)));
366 LIBCFS_ALLOC(sched, sizeof(*sched));
370 if (strlen(name) > sizeof(sched->ws_name)-1) {
371 LIBCFS_FREE(sched, sizeof(*sched));
374 strlcpy(sched->ws_name, name, sizeof(sched->ws_name));
376 sched->ws_cptab = cptab;
379 spin_lock_init(&sched->ws_lock);
380 init_waitqueue_head(&sched->ws_waitq);
382 INIT_LIST_HEAD(&sched->ws_runq);
383 INIT_LIST_HEAD(&sched->ws_rerunq);
384 INIT_LIST_HEAD(&sched->ws_list);
386 for (; nthrs > 0; nthrs--) {
388 struct task_struct *task;
390 spin_lock(&cfs_wi_data.wi_glock);
391 while (sched->ws_starting > 0) {
392 spin_unlock(&cfs_wi_data.wi_glock);
394 spin_lock(&cfs_wi_data.wi_glock);
397 sched->ws_starting++;
398 spin_unlock(&cfs_wi_data.wi_glock);
400 if (sched->ws_cptab != NULL && sched->ws_cpt >= 0) {
401 snprintf(name, sizeof(name), "%s_%02d_%02d",
402 sched->ws_name, sched->ws_cpt,
405 snprintf(name, sizeof(name), "%s_%02d",
406 sched->ws_name, sched->ws_nthreads);
409 task = kthread_run(cfs_wi_scheduler, sched, name);
411 int rc = PTR_ERR(task);
413 CERROR("Failed to create thread for "
414 "WI scheduler %s: %d\n", name, rc);
416 spin_lock(&cfs_wi_data.wi_glock);
418 /* make up for cfs_wi_sched_destroy */
419 list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
420 sched->ws_starting--;
422 spin_unlock(&cfs_wi_data.wi_glock);
424 cfs_wi_sched_destroy(sched);
429 spin_lock(&cfs_wi_data.wi_glock);
430 list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
431 spin_unlock(&cfs_wi_data.wi_glock);
436 EXPORT_SYMBOL(cfs_wi_sched_create);
/*
 * Module init: zero the global state, initialize its lock and scheduler
 * list, and mark the module initialized.
 *
 * NOTE(review): the function header (presumably "int cfs_wi_startup(void)")
 * and its "return 0;" are not visible in this extract — confirm against the
 * original source.
 */
441 memset(&cfs_wi_data, 0, sizeof(cfs_wi_data));
443 spin_lock_init(&cfs_wi_data.wi_glock);
444 INIT_LIST_HEAD(&cfs_wi_data.wi_scheds);
445 cfs_wi_data.wi_init = 1;
/*
 * Module shutdown: flag every scheduler as stopping and wake its threads,
 * poll (50 ms intervals) until all threads have exited, then free every
 * scheduler left on the global list and clear the module flags.  Runs with
 * no concurrent scheduler creation/destruction (wi_stopping blocks both).
 */
453 struct cfs_wi_sched *sched;
455 spin_lock(&cfs_wi_data.wi_glock);
456 cfs_wi_data.wi_stopping = 1;
457 spin_unlock(&cfs_wi_data.wi_glock);
459 /* nobody should contend on this list */
460 list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
461 sched->ws_stopping = 1;
462 wake_up_all(&sched->ws_waitq);
465 list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
466 spin_lock(&cfs_wi_data.wi_glock);
468 while (sched->ws_nthreads != 0) {
469 spin_unlock(&cfs_wi_data.wi_glock);
470 set_current_state(TASK_UNINTERRUPTIBLE);
471 schedule_timeout(cfs_time_seconds(1) / 20);
472 spin_lock(&cfs_wi_data.wi_glock);
474 spin_unlock(&cfs_wi_data.wi_glock);
477 while (!list_empty(&cfs_wi_data.wi_scheds)) {
478 sched = list_entry(cfs_wi_data.wi_scheds.next,
479 struct cfs_wi_sched, ws_list);
480 list_del(&sched->ws_list);
481 LIBCFS_FREE(sched, sizeof(*sched));
484 cfs_wi_data.wi_stopping = 0;
485 cfs_wi_data.wi_init = 0;