/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2014, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * libcfs/libcfs/workitem.c
 *
 * Author: Isaac Huang  <isaac@clusterfs.com>
 *         Liang Zhen   <zhen.liang@sun.com>
 */
42 #define DEBUG_SUBSYSTEM S_LNET
44 #include <linux/kthread.h>
45 #include <libcfs/libcfs.h>
47 #define CFS_WS_NAME_LEN 16
50 struct list_head ws_list; /* chain on global list */
51 /** serialised workitems */
53 /** where schedulers sleep */
54 wait_queue_head_t ws_waitq;
55 /** concurrent workitems */
56 struct list_head ws_runq;
57 /** rescheduled running-workitems, a workitem can be rescheduled
58 * while running in wi_action(), but we don't to execute it again
59 * unless it returns from wi_action(), so we put it on ws_rerunq
60 * while rescheduling, and move it to runq after it returns
62 struct list_head ws_rerunq;
63 /** CPT-table for this scheduler */
64 struct cfs_cpt_table *ws_cptab;
65 /** CPT id for affinity */
67 /** number of scheduled workitems */
69 /** started scheduler thread, protected by cfs_wi_data::wi_glock */
70 unsigned int ws_nthreads:30;
71 /** shutting down, protected by cfs_wi_data::wi_glock */
72 unsigned int ws_stopping:1;
73 /** serialize starting thread, protected by cfs_wi_data::wi_glock */
74 unsigned int ws_starting:1;
76 char ws_name[CFS_WS_NAME_LEN];
79 static struct cfs_workitem_data {
82 /** list of all schedulers */
83 struct list_head wi_scheds;
84 /** WI module is initialized */
86 /** shutting down the whole WI module */
91 cfs_wi_sched_cansleep(struct cfs_wi_sched *sched)
93 spin_lock(&sched->ws_lock);
94 if (sched->ws_stopping) {
95 spin_unlock(&sched->ws_lock);
99 if (!list_empty(&sched->ws_runq)) {
100 spin_unlock(&sched->ws_lock);
103 spin_unlock(&sched->ws_lock);
108 * 0. it only works when called from wi->wi_action.
109 * 1. when it returns no one shall try to schedule the workitem.
112 cfs_wi_exit(struct cfs_wi_sched *sched, struct cfs_workitem *wi)
114 LASSERT(!in_interrupt()); /* because we use plain spinlock */
115 LASSERT(!sched->ws_stopping);
117 spin_lock(&sched->ws_lock);
119 LASSERT(wi->wi_running);
121 if (wi->wi_scheduled) { /* cancel pending schedules */
122 LASSERT(!list_empty(&wi->wi_list));
123 list_del_init(&wi->wi_list);
125 LASSERT(sched->ws_nscheduled > 0);
126 sched->ws_nscheduled--;
129 LASSERT(list_empty(&wi->wi_list));
131 wi->wi_scheduled = 1; /* LBUG future schedule attempts */
132 spin_unlock(&sched->ws_lock);
136 EXPORT_SYMBOL(cfs_wi_exit);
139 * cancel schedule request of workitem \a wi
142 cfs_wi_deschedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi)
146 LASSERT(!in_interrupt()); /* because we use plain spinlock */
147 LASSERT(!sched->ws_stopping);
150 * return 0 if it's running already, otherwise return 1, which
151 * means the workitem will not be scheduled and will not have
152 * any race with wi_action.
154 spin_lock(&sched->ws_lock);
156 rc = !(wi->wi_running);
158 if (wi->wi_scheduled) { /* cancel pending schedules */
159 LASSERT(!list_empty(&wi->wi_list));
160 list_del_init(&wi->wi_list);
162 LASSERT(sched->ws_nscheduled > 0);
163 sched->ws_nscheduled--;
165 wi->wi_scheduled = 0;
168 LASSERT (list_empty(&wi->wi_list));
170 spin_unlock(&sched->ws_lock);
173 EXPORT_SYMBOL(cfs_wi_deschedule);
176 * Workitem scheduled with (serial == 1) is strictly serialised not only with
177 * itself, but also with others scheduled this way.
179 * Now there's only one static serialised queue, but in the future more might
180 * be added, and even dynamic creation of serialised queues might be supported.
183 cfs_wi_schedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi)
185 LASSERT(!in_interrupt()); /* because we use plain spinlock */
186 LASSERT(!sched->ws_stopping);
188 spin_lock(&sched->ws_lock);
190 if (!wi->wi_scheduled) {
191 LASSERT (list_empty(&wi->wi_list));
193 wi->wi_scheduled = 1;
194 sched->ws_nscheduled++;
195 if (!wi->wi_running) {
196 list_add_tail(&wi->wi_list, &sched->ws_runq);
197 wake_up(&sched->ws_waitq);
199 list_add(&wi->wi_list, &sched->ws_rerunq);
203 LASSERT (!list_empty(&wi->wi_list));
204 spin_unlock(&sched->ws_lock);
207 EXPORT_SYMBOL(cfs_wi_schedule);
210 cfs_wi_scheduler(void *arg)
212 struct cfs_wi_sched *sched = (struct cfs_wi_sched *)arg;
216 /* CPT affinity scheduler? */
217 if (sched->ws_cptab != NULL)
218 if (cfs_cpt_bind(sched->ws_cptab, sched->ws_cpt) != 0)
219 CWARN("Failed to bind %s on CPT %d\n",
220 sched->ws_name, sched->ws_cpt);
222 spin_lock(&cfs_wi_data.wi_glock);
224 LASSERT(sched->ws_starting == 1);
225 sched->ws_starting--;
226 sched->ws_nthreads++;
228 spin_unlock(&cfs_wi_data.wi_glock);
230 spin_lock(&sched->ws_lock);
232 while (!sched->ws_stopping) {
235 struct cfs_workitem *wi;
237 while (!list_empty(&sched->ws_runq) &&
238 nloops < CFS_WI_RESCHED) {
239 wi = list_entry(sched->ws_runq.next,
240 struct cfs_workitem, wi_list);
241 LASSERT(wi->wi_scheduled && !wi->wi_running);
243 list_del_init(&wi->wi_list);
245 LASSERT(sched->ws_nscheduled > 0);
246 sched->ws_nscheduled--;
249 wi->wi_scheduled = 0;
251 spin_unlock(&sched->ws_lock);
254 rc = (*wi->wi_action) (wi);
256 spin_lock(&sched->ws_lock);
257 if (rc != 0) /* WI should be dead, even be freed! */
261 if (list_empty(&wi->wi_list))
264 LASSERT(wi->wi_scheduled);
265 /* wi is rescheduled, should be on rerunq now, we
266 * move it to runq so it can run action now */
267 list_move_tail(&wi->wi_list, &sched->ws_runq);
270 if (!list_empty(&sched->ws_runq)) {
271 spin_unlock(&sched->ws_lock);
272 /* don't sleep because some workitems still
273 * expect me to come back soon */
275 spin_lock(&sched->ws_lock);
279 spin_unlock(&sched->ws_lock);
280 rc = wait_event_interruptible_exclusive(sched->ws_waitq,
281 !cfs_wi_sched_cansleep(sched));
282 spin_lock(&sched->ws_lock);
285 spin_unlock(&sched->ws_lock);
287 spin_lock(&cfs_wi_data.wi_glock);
288 sched->ws_nthreads--;
289 spin_unlock(&cfs_wi_data.wi_glock);
295 cfs_wi_sched_destroy(struct cfs_wi_sched *sched)
297 LASSERT(cfs_wi_data.wi_init);
298 LASSERT(!cfs_wi_data.wi_stopping);
300 spin_lock(&cfs_wi_data.wi_glock);
301 if (sched->ws_stopping) {
302 CDEBUG(D_INFO, "%s is in progress of stopping\n",
304 spin_unlock(&cfs_wi_data.wi_glock);
308 LASSERT(!list_empty(&sched->ws_list));
309 sched->ws_stopping = 1;
311 spin_unlock(&cfs_wi_data.wi_glock);
313 wake_up_all(&sched->ws_waitq);
315 spin_lock(&cfs_wi_data.wi_glock);
319 while (sched->ws_nthreads > 0) {
320 CDEBUG(IS_PO2(++i) ? D_WARNING : D_NET,
321 "waiting for %d threads of WI sched[%s] to "
322 "terminate\n", sched->ws_nthreads,
325 spin_unlock(&cfs_wi_data.wi_glock);
326 set_current_state(TASK_UNINTERRUPTIBLE);
327 schedule_timeout(cfs_time_seconds(1) / 20);
328 spin_lock(&cfs_wi_data.wi_glock);
332 list_del(&sched->ws_list);
334 spin_unlock(&cfs_wi_data.wi_glock);
336 LASSERT(sched->ws_nscheduled == 0);
338 LIBCFS_FREE(sched, sizeof(*sched));
340 EXPORT_SYMBOL(cfs_wi_sched_destroy);
343 cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab,
344 int cpt, int nthrs, struct cfs_wi_sched **sched_pp)
346 struct cfs_wi_sched *sched;
348 LASSERT(cfs_wi_data.wi_init);
349 LASSERT(!cfs_wi_data.wi_stopping);
350 LASSERT(cptab == NULL || cpt == CFS_CPT_ANY ||
351 (cpt >= 0 && cpt < cfs_cpt_number(cptab)));
353 LIBCFS_ALLOC(sched, sizeof(*sched));
357 if (strlen(name) > sizeof(sched->ws_name)-1) {
358 LIBCFS_FREE(sched, sizeof(*sched));
361 strlcpy(sched->ws_name, name, sizeof(sched->ws_name));
363 sched->ws_cptab = cptab;
366 spin_lock_init(&sched->ws_lock);
367 init_waitqueue_head(&sched->ws_waitq);
369 INIT_LIST_HEAD(&sched->ws_runq);
370 INIT_LIST_HEAD(&sched->ws_rerunq);
371 INIT_LIST_HEAD(&sched->ws_list);
373 for (; nthrs > 0; nthrs--) {
375 struct task_struct *task;
377 spin_lock(&cfs_wi_data.wi_glock);
378 while (sched->ws_starting > 0) {
379 spin_unlock(&cfs_wi_data.wi_glock);
381 spin_lock(&cfs_wi_data.wi_glock);
384 sched->ws_starting++;
385 spin_unlock(&cfs_wi_data.wi_glock);
387 if (sched->ws_cptab != NULL && sched->ws_cpt >= 0) {
388 snprintf(name, sizeof(name), "%s_%02d_%02d",
389 sched->ws_name, sched->ws_cpt,
392 snprintf(name, sizeof(name), "%s_%02d",
393 sched->ws_name, sched->ws_nthreads);
396 task = kthread_run(cfs_wi_scheduler, sched, name);
398 int rc = PTR_ERR(task);
400 CERROR("Failed to create thread for "
401 "WI scheduler %s: %d\n", name, rc);
403 spin_lock(&cfs_wi_data.wi_glock);
405 /* make up for cfs_wi_sched_destroy */
406 list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
407 sched->ws_starting--;
409 spin_unlock(&cfs_wi_data.wi_glock);
411 cfs_wi_sched_destroy(sched);
416 spin_lock(&cfs_wi_data.wi_glock);
417 list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
418 spin_unlock(&cfs_wi_data.wi_glock);
423 EXPORT_SYMBOL(cfs_wi_sched_create);
428 memset(&cfs_wi_data, 0, sizeof(struct cfs_workitem_data));
430 spin_lock_init(&cfs_wi_data.wi_glock);
431 INIT_LIST_HEAD(&cfs_wi_data.wi_scheds);
432 cfs_wi_data.wi_init = 1;
438 cfs_wi_shutdown (void)
440 struct cfs_wi_sched *sched;
442 spin_lock(&cfs_wi_data.wi_glock);
443 cfs_wi_data.wi_stopping = 1;
444 spin_unlock(&cfs_wi_data.wi_glock);
446 /* nobody should contend on this list */
447 list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
448 sched->ws_stopping = 1;
449 wake_up_all(&sched->ws_waitq);
452 list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
453 spin_lock(&cfs_wi_data.wi_glock);
455 while (sched->ws_nthreads != 0) {
456 spin_unlock(&cfs_wi_data.wi_glock);
457 set_current_state(TASK_UNINTERRUPTIBLE);
458 schedule_timeout(cfs_time_seconds(1) / 20);
459 spin_lock(&cfs_wi_data.wi_glock);
461 spin_unlock(&cfs_wi_data.wi_glock);
464 while (!list_empty(&cfs_wi_data.wi_scheds)) {
465 sched = list_entry(cfs_wi_data.wi_scheds.next,
466 struct cfs_wi_sched, ws_list);
467 list_del(&sched->ws_list);
468 LIBCFS_FREE(sched, sizeof(*sched));
471 cfs_wi_data.wi_stopping = 0;
472 cfs_wi_data.wi_init = 0;