/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.gnu.org/licenses/gpl-2.0.html
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2013, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * libcfs/libcfs/workitem.c
 *
 * Author: Isaac Huang <isaac@clusterfs.com>
 *         Liang Zhen  <zhen.liang@sun.com>
 */
#define DEBUG_SUBSYSTEM S_LNET

#include <linux/kthread.h>
#include <libcfs/libcfs.h>

#define CFS_WS_NAME_LEN		16
struct cfs_wi_sched {
	/* chain on global list */
	struct list_head	ws_list;
	/** serialised workitems */
	spinlock_t		ws_lock;
	/** where scheduler threads sleep */
	wait_queue_head_t	ws_waitq;
	/** concurrent workitems */
	struct list_head	ws_runq;
	/** rescheduled running-workitems: a workitem can be rescheduled
	 * while running in wi_action(), but we don't want to execute it
	 * again until it returns from wi_action(), so we put it on
	 * ws_rerunq while rescheduling, and move it back to ws_runq
	 * after it returns from wi_action() */
	struct list_head	ws_rerunq;
	/** CPT-table for this scheduler */
	struct cfs_cpt_table	*ws_cptab;
	/** CPT id for affinity */
	int			ws_cpt;
	/** number of scheduled workitems */
	int			ws_nscheduled;
	/** number of started scheduler threads, protected by cfs_wi_data::wi_glock */
	unsigned int		ws_nthreads:30;
	/** shutting down, protected by cfs_wi_data::wi_glock */
	unsigned int		ws_stopping:1;
	/** serialize starting thread, protected by cfs_wi_data::wi_glock */
	unsigned int		ws_starting:1;
	/** scheduler name */
	char			ws_name[CFS_WS_NAME_LEN];
};
static struct cfs_workitem_data {
	/** serialize */
	spinlock_t		wi_glock;
	/** list of all schedulers */
	struct list_head	wi_scheds;
	/** WI module is initialized */
	int			wi_init;
	/** shutting down the whole WI module */
	int			wi_stopping;
} cfs_wi_data;
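
/*
 * Note for readers: struct cfs_workitem itself is defined in the libcfs
 * headers, not in this file.  The fields this file relies on are roughly
 * the following (a sketch only; the exact layout lives in the header):
 *
 *	struct cfs_workitem {
 *		struct list_head	wi_list;	chain on ws_runq/ws_rerunq
 *		cfs_wi_action_t		wi_action;	callback run by a scheduler thread
 *		void		       *wi_data;	opaque argument for wi_action
 *		unsigned short		wi_running:1;	wi_action is currently executing
 *		unsigned short		wi_scheduled:1;	queued, or poisoned by cfs_wi_exit()
 *	};
 */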
static inline int
cfs_wi_sched_cansleep(struct cfs_wi_sched *sched)
{
	spin_lock(&sched->ws_lock);
	if (sched->ws_stopping) {
		spin_unlock(&sched->ws_lock);
		return 0;
	}

	if (!list_empty(&sched->ws_runq)) {
		spin_unlock(&sched->ws_lock);
		return 0;
	}

	spin_unlock(&sched->ws_lock);
	return 1;
}
/* XXX:
 * 0. it only works when called from wi->wi_action().
 * 1. when it returns, no one shall try to schedule the workitem again.
 */
void
cfs_wi_exit(struct cfs_wi_sched *sched, struct cfs_workitem *wi)
{
	LASSERT(!in_interrupt()); /* because we use plain spinlock */
	LASSERT(!sched->ws_stopping);

	spin_lock(&sched->ws_lock);

	LASSERT(wi->wi_running);

	if (wi->wi_scheduled) { /* cancel pending schedules */
		LASSERT(!list_empty(&wi->wi_list));
		list_del_init(&wi->wi_list);

		LASSERT(sched->ws_nscheduled > 0);
		sched->ws_nscheduled--;
	}

	LASSERT(list_empty(&wi->wi_list));

	wi->wi_scheduled = 1; /* LBUG future schedule attempts */
	spin_unlock(&sched->ws_lock);
}
EXPORT_SYMBOL(cfs_wi_exit);
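
/*
 * Illustrative (hypothetical) caller of cfs_wi_exit(), with made-up my_*
 * names: an action callback that decides the workitem is finished for good
 * calls cfs_wi_exit() and then returns non-zero, so the scheduler never
 * touches the item again and the owner may free it.
 *
 *	static int my_final_action(struct cfs_workitem *wi)
 *	{
 *		struct my_obj *obj = wi->wi_data;
 *
 *		my_do_last_work(obj);
 *		cfs_wi_exit(my_sched, wi);
 *		return 1;
 *	}
 */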
/**
 * cancel schedule request of workitem \a wi
 */
int
cfs_wi_deschedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi)
{
	int	rc;

	LASSERT(!in_interrupt()); /* because we use plain spinlock */
	LASSERT(!sched->ws_stopping);

	/*
	 * return 0 if it's running already, otherwise return 1, which
	 * means the workitem will not be scheduled and will not have
	 * any race with wi_action.
	 */
	spin_lock(&sched->ws_lock);

	rc = !(wi->wi_running);

	if (wi->wi_scheduled) { /* cancel pending schedules */
		LASSERT(!list_empty(&wi->wi_list));
		list_del_init(&wi->wi_list);

		LASSERT(sched->ws_nscheduled > 0);
		sched->ws_nscheduled--;

		wi->wi_scheduled = 0;
	}

	LASSERT(list_empty(&wi->wi_list));

	spin_unlock(&sched->ws_lock);
	return rc;
}
EXPORT_SYMBOL(cfs_wi_deschedule);
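
/*
 * Illustrative (hypothetical) use of the cfs_wi_deschedule() return value,
 * with made-up my_* names: a return of 1 means the workitem is no longer
 * queued and wi_action will not be entered, so it can be torn down at once;
 * a return of 0 means wi_action is running right now and the owner must
 * wait for it to finish (or have the action call cfs_wi_exit() itself).
 *
 *	if (cfs_wi_deschedule(my_sched, &obj->o_wi))
 *		my_obj_free(obj);
 *	else
 *		... wait for the running wi_action to complete ...
 */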
/*
 * Workitem scheduled with (serial == 1) is strictly serialised not only with
 * itself, but also with others scheduled this way.
 *
 * Now there's only one static serialised queue, but in the future more might
 * be added, and even dynamic creation of serialised queues might be supported.
 */
void
cfs_wi_schedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi)
{
	LASSERT(!in_interrupt()); /* because we use plain spinlock */
	LASSERT(!sched->ws_stopping);

	spin_lock(&sched->ws_lock);

	if (!wi->wi_scheduled) {
		LASSERT(list_empty(&wi->wi_list));

		wi->wi_scheduled = 1;
		sched->ws_nscheduled++;
		if (!wi->wi_running) {
			list_add_tail(&wi->wi_list, &sched->ws_runq);
			wake_up(&sched->ws_waitq);
		} else {
			list_add(&wi->wi_list, &sched->ws_rerunq);
		}
	}

	LASSERT(!list_empty(&wi->wi_list));
	spin_unlock(&sched->ws_lock);
}
EXPORT_SYMBOL(cfs_wi_schedule);
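
/*
 * Illustrative (hypothetical) setup and scheduling of a workitem; the my_*
 * names are made up for the example.  The embedding object owns the struct
 * cfs_workitem.  An action returning 0 leaves the workitem alive so it can
 * be rescheduled; rescheduling while the action is still running parks the
 * item on ws_rerunq and it runs once more after the current call returns.
 *
 *	static int my_action(struct cfs_workitem *wi)
 *	{
 *		struct my_obj *obj = wi->wi_data;
 *
 *		my_handle_event(obj);
 *		return 0;
 *	}
 *
 *	INIT_LIST_HEAD(&obj->o_wi.wi_list);
 *	obj->o_wi.wi_action = my_action;
 *	obj->o_wi.wi_data   = obj;
 *	cfs_wi_schedule(my_sched, &obj->o_wi);
 */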
static int
cfs_wi_scheduler(void *arg)
{
	struct cfs_wi_sched *sched = (struct cfs_wi_sched *)arg;

	cfs_block_allsigs();

	/* CPT affinity scheduler? */
	if (sched->ws_cptab != NULL)
		if (cfs_cpt_bind(sched->ws_cptab, sched->ws_cpt) != 0)
			CWARN("Unable to bind %s on CPU partition %d\n",
			      sched->ws_name, sched->ws_cpt);

	spin_lock(&cfs_wi_data.wi_glock);

	LASSERT(sched->ws_starting == 1);
	sched->ws_starting--;
	sched->ws_nthreads++;

	spin_unlock(&cfs_wi_data.wi_glock);

	spin_lock(&sched->ws_lock);

	while (!sched->ws_stopping) {
		int	nloops = 0;
		int	rc;
		struct cfs_workitem *wi;

		/* run up to CFS_WI_RESCHED workitems before dropping out of
		 * the inner loop, so a busy scheduler still yields regularly */
		while (!list_empty(&sched->ws_runq) &&
		       nloops < CFS_WI_RESCHED) {
			wi = list_entry(sched->ws_runq.next,
					struct cfs_workitem, wi_list);
			LASSERT(wi->wi_scheduled && !wi->wi_running);

			list_del_init(&wi->wi_list);

			LASSERT(sched->ws_nscheduled > 0);
			sched->ws_nscheduled--;

			wi->wi_running	 = 1;
			wi->wi_scheduled = 0;

			spin_unlock(&sched->ws_lock);
			nloops++;

			rc = (*wi->wi_action) (wi);

			spin_lock(&sched->ws_lock);
			if (rc != 0) /* WI should be dead, even be freed! */
				continue;

			wi->wi_running = 0;
			if (list_empty(&wi->wi_list))
				continue;

			LASSERT(wi->wi_scheduled);
			/* wi is rescheduled, should be on rerunq now, we
			 * move it to runq so it can run action now */
			list_move_tail(&wi->wi_list, &sched->ws_runq);
		}

		if (!list_empty(&sched->ws_runq)) {
			spin_unlock(&sched->ws_lock);
			/* don't sleep because some workitems still
			 * expect me to come back soon */
			cond_resched();
			spin_lock(&sched->ws_lock);
			continue;
		}

		spin_unlock(&sched->ws_lock);
		rc = wait_event_interruptible_exclusive(sched->ws_waitq,
				!cfs_wi_sched_cansleep(sched));
		spin_lock(&sched->ws_lock);
	}

	spin_unlock(&sched->ws_lock);

	spin_lock(&cfs_wi_data.wi_glock);
	sched->ws_nthreads--;
	spin_unlock(&cfs_wi_data.wi_glock);

	return 0;
}
void
cfs_wi_sched_destroy(struct cfs_wi_sched *sched)
{
	LASSERT(cfs_wi_data.wi_init);
	LASSERT(!cfs_wi_data.wi_stopping);

	spin_lock(&cfs_wi_data.wi_glock);
	if (sched->ws_stopping) {
		CDEBUG(D_INFO, "%s is in the process of stopping\n",
		       sched->ws_name);
		spin_unlock(&cfs_wi_data.wi_glock);
		return;
	}

	LASSERT(!list_empty(&sched->ws_list));
	sched->ws_stopping = 1;

	spin_unlock(&cfs_wi_data.wi_glock);

	wake_up_all(&sched->ws_waitq);

	spin_lock(&cfs_wi_data.wi_glock);
	{
		int i = 2;

		while (sched->ws_nthreads > 0) {
			CDEBUG(is_power_of_2(++i / 20) ? D_WARNING : D_NET,
			       "waiting %us for %d %s worker threads to exit\n",
			       i / 20, sched->ws_nthreads, sched->ws_name);

			spin_unlock(&cfs_wi_data.wi_glock);
			set_current_state(TASK_UNINTERRUPTIBLE);
			schedule_timeout(cfs_time_seconds(1) / 20);
			spin_lock(&cfs_wi_data.wi_glock);
		}
	}

	list_del(&sched->ws_list);

	spin_unlock(&cfs_wi_data.wi_glock);

	LASSERT(sched->ws_nscheduled == 0);

	LIBCFS_FREE(sched, sizeof(*sched));
}
EXPORT_SYMBOL(cfs_wi_sched_destroy);
int
cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab,
		    int cpt, int nthrs, struct cfs_wi_sched **sched_pp)
{
	struct cfs_wi_sched *sched;

	LASSERT(cfs_wi_data.wi_init);
	LASSERT(!cfs_wi_data.wi_stopping);
	LASSERT(cptab == NULL || cpt == CFS_CPT_ANY ||
		(cpt >= 0 && cpt < cfs_cpt_number(cptab)));

	LIBCFS_ALLOC(sched, sizeof(*sched));
	if (sched == NULL)
		return -ENOMEM;

	if (strlen(name) > sizeof(sched->ws_name) - 1) {
		LIBCFS_FREE(sched, sizeof(*sched));
		return -E2BIG;
	}
	strlcpy(sched->ws_name, name, sizeof(sched->ws_name));

	sched->ws_cptab = cptab;
	sched->ws_cpt = cpt;

	spin_lock_init(&sched->ws_lock);
	init_waitqueue_head(&sched->ws_waitq);

	INIT_LIST_HEAD(&sched->ws_runq);
	INIT_LIST_HEAD(&sched->ws_rerunq);
	INIT_LIST_HEAD(&sched->ws_list);

	for (; nthrs > 0; nthrs--) {
		char			name[16];
		struct task_struct	*task;

		spin_lock(&cfs_wi_data.wi_glock);
		while (sched->ws_starting > 0) {
			spin_unlock(&cfs_wi_data.wi_glock);
			schedule();
			spin_lock(&cfs_wi_data.wi_glock);
		}

		sched->ws_starting++;
		spin_unlock(&cfs_wi_data.wi_glock);

		if (sched->ws_cptab != NULL && sched->ws_cpt >= 0) {
			snprintf(name, sizeof(name), "%s_%02d_%02d",
				 sched->ws_name, sched->ws_cpt,
				 sched->ws_nthreads);
		} else {
			snprintf(name, sizeof(name), "%s_%02d",
				 sched->ws_name, sched->ws_nthreads);
		}

		task = kthread_run(cfs_wi_scheduler, sched, name);
		if (IS_ERR(task)) {
			int rc = PTR_ERR(task);

			CERROR("Failed to create thread for WI scheduler %s: %d\n",
			       name, rc);

			spin_lock(&cfs_wi_data.wi_glock);

			/* make up for cfs_wi_sched_destroy */
			list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
			sched->ws_starting--;

			spin_unlock(&cfs_wi_data.wi_glock);

			cfs_wi_sched_destroy(sched);
			return rc;
		}
	}

	spin_lock(&cfs_wi_data.wi_glock);
	list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
	spin_unlock(&cfs_wi_data.wi_glock);

	*sched_pp = sched;
	return 0;
}
EXPORT_SYMBOL(cfs_wi_sched_create);
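
/*
 * Illustrative (hypothetical) creation and teardown of a scheduler, e.g.
 * from a module init/fini path.  The name "my_wi", the thread count and
 * my_sched are made up; passing the global cfs_cpt_table with CFS_CPT_ANY
 * requests no particular CPU-partition affinity.  Kernel threads are named
 * after the scheduler ("my_wi_00", "my_wi_01", ...), and the scheduler name
 * must fit in CFS_WS_NAME_LEN - 1 characters.
 *
 *	static struct cfs_wi_sched *my_sched;
 *
 *	rc = cfs_wi_sched_create("my_wi", cfs_cpt_table, CFS_CPT_ANY,
 *				 4, &my_sched);
 *	if (rc != 0)
 *		return rc;
 *	...
 *	cfs_wi_sched_destroy(my_sched);
 */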
int
cfs_wi_startup(void)
{
	memset(&cfs_wi_data, 0, sizeof(struct cfs_workitem_data));

	spin_lock_init(&cfs_wi_data.wi_glock);
	INIT_LIST_HEAD(&cfs_wi_data.wi_scheds);
	cfs_wi_data.wi_init = 1;

	return 0;
}
void
cfs_wi_shutdown(void)
{
	struct cfs_wi_sched *sched;

	spin_lock(&cfs_wi_data.wi_glock);
	cfs_wi_data.wi_stopping = 1;
	spin_unlock(&cfs_wi_data.wi_glock);

	/* nobody should contend on this list */
	list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
		sched->ws_stopping = 1;
		wake_up_all(&sched->ws_waitq);
	}

	list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
		spin_lock(&cfs_wi_data.wi_glock);

		while (sched->ws_nthreads != 0) {
			spin_unlock(&cfs_wi_data.wi_glock);
			set_current_state(TASK_UNINTERRUPTIBLE);
			schedule_timeout(cfs_time_seconds(1) / 20);
			spin_lock(&cfs_wi_data.wi_glock);
		}
		spin_unlock(&cfs_wi_data.wi_glock);
	}

	while (!list_empty(&cfs_wi_data.wi_scheds)) {
		sched = list_entry(cfs_wi_data.wi_scheds.next,
				   struct cfs_wi_sched, ws_list);
		list_del(&sched->ws_list);
		LIBCFS_FREE(sched, sizeof(*sched));
	}

	cfs_wi_data.wi_stopping = 0;
	cfs_wi_data.wi_init = 0;
}