4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2013, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
31 * libcfs/libcfs/workitem.c
33 * Author: Isaac Huang <isaac@clusterfs.com>
34 * Liang Zhen <zhen.liang@sun.com>
37 #define DEBUG_SUBSYSTEM S_LNET
39 #include <linux/kthread.h>
40 #include <libcfs/libcfs.h>
42 #define CFS_WS_NAME_LEN 16
45 struct list_head ws_list; /* chain on global list */
46 /** serialised workitems */
48 /** where schedulers sleep */
49 wait_queue_head_t ws_waitq;
50 /** concurrent workitems */
51 struct list_head ws_runq;
52 /** rescheduled running-workitems, a workitem can be rescheduled
53 * while running in wi_action(), but we don't to execute it again
54 * unless it returns from wi_action(), so we put it on ws_rerunq
55 * while rescheduling, and move it to runq after it returns
57 struct list_head ws_rerunq;
58 /** CPT-table for this scheduler */
59 struct cfs_cpt_table *ws_cptab;
60 /** CPT id for affinity */
62 /** number of scheduled workitems */
64 /** started scheduler thread, protected by cfs_wi_data::wi_glock */
65 unsigned int ws_nthreads:30;
66 /** shutting down, protected by cfs_wi_data::wi_glock */
67 unsigned int ws_stopping:1;
68 /** serialize starting thread, protected by cfs_wi_data::wi_glock */
69 unsigned int ws_starting:1;
71 char ws_name[CFS_WS_NAME_LEN];
74 static struct cfs_workitem_data {
77 /** list of all schedulers */
78 struct list_head wi_scheds;
79 /** WI module is initialized */
81 /** shutting down the whole WI module */
86 cfs_wi_sched_cansleep(struct cfs_wi_sched *sched)
88 spin_lock(&sched->ws_lock);
89 if (sched->ws_stopping) {
90 spin_unlock(&sched->ws_lock);
94 if (!list_empty(&sched->ws_runq)) {
95 spin_unlock(&sched->ws_lock);
98 spin_unlock(&sched->ws_lock);
103 * 0. it only works when called from wi->wi_action.
104 * 1. when it returns no one shall try to schedule the workitem.
107 cfs_wi_exit(struct cfs_wi_sched *sched, struct cfs_workitem *wi)
109 LASSERT(!in_interrupt()); /* because we use plain spinlock */
110 LASSERT(!sched->ws_stopping);
112 spin_lock(&sched->ws_lock);
114 LASSERT(wi->wi_running);
116 if (wi->wi_scheduled) { /* cancel pending schedules */
117 LASSERT(!list_empty(&wi->wi_list));
118 list_del_init(&wi->wi_list);
120 LASSERT(sched->ws_nscheduled > 0);
121 sched->ws_nscheduled--;
124 LASSERT(list_empty(&wi->wi_list));
126 wi->wi_scheduled = 1; /* LBUG future schedule attempts */
127 spin_unlock(&sched->ws_lock);
129 EXPORT_SYMBOL(cfs_wi_exit);
132 * cancel schedule request of workitem \a wi
135 cfs_wi_deschedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi)
139 LASSERT(!in_interrupt()); /* because we use plain spinlock */
140 LASSERT(!sched->ws_stopping);
143 * return 0 if it's running already, otherwise return 1, which
144 * means the workitem will not be scheduled and will not have
145 * any race with wi_action.
147 spin_lock(&sched->ws_lock);
149 rc = !(wi->wi_running);
151 if (wi->wi_scheduled) { /* cancel pending schedules */
152 LASSERT(!list_empty(&wi->wi_list));
153 list_del_init(&wi->wi_list);
155 LASSERT(sched->ws_nscheduled > 0);
156 sched->ws_nscheduled--;
158 wi->wi_scheduled = 0;
161 LASSERT (list_empty(&wi->wi_list));
163 spin_unlock(&sched->ws_lock);
166 EXPORT_SYMBOL(cfs_wi_deschedule);
169 * Workitem scheduled with (serial == 1) is strictly serialised not only with
170 * itself, but also with others scheduled this way.
172 * Now there's only one static serialised queue, but in the future more might
173 * be added, and even dynamic creation of serialised queues might be supported.
176 cfs_wi_schedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi)
178 LASSERT(!in_interrupt()); /* because we use plain spinlock */
179 LASSERT(!sched->ws_stopping);
181 spin_lock(&sched->ws_lock);
183 if (!wi->wi_scheduled) {
184 LASSERT (list_empty(&wi->wi_list));
186 wi->wi_scheduled = 1;
187 sched->ws_nscheduled++;
188 if (!wi->wi_running) {
189 list_add_tail(&wi->wi_list, &sched->ws_runq);
190 wake_up(&sched->ws_waitq);
192 list_add(&wi->wi_list, &sched->ws_rerunq);
196 LASSERT (!list_empty(&wi->wi_list));
197 spin_unlock(&sched->ws_lock);
199 EXPORT_SYMBOL(cfs_wi_schedule);
202 cfs_wi_scheduler(void *arg)
204 struct cfs_wi_sched *sched = (struct cfs_wi_sched *)arg;
206 /* CPT affinity scheduler? */
207 if (sched->ws_cptab != NULL)
208 if (cfs_cpt_bind(sched->ws_cptab, sched->ws_cpt) != 0)
209 CWARN("Unable to bind %s on CPU partition %d\n",
210 sched->ws_name, sched->ws_cpt);
212 spin_lock(&cfs_wi_data.wi_glock);
214 LASSERT(sched->ws_starting == 1);
215 sched->ws_starting--;
216 sched->ws_nthreads++;
218 spin_unlock(&cfs_wi_data.wi_glock);
220 spin_lock(&sched->ws_lock);
222 while (!sched->ws_stopping) {
225 struct cfs_workitem *wi;
227 while (!list_empty(&sched->ws_runq) &&
228 nloops < CFS_WI_RESCHED) {
229 wi = list_entry(sched->ws_runq.next,
230 struct cfs_workitem, wi_list);
231 LASSERT(wi->wi_scheduled && !wi->wi_running);
233 list_del_init(&wi->wi_list);
235 LASSERT(sched->ws_nscheduled > 0);
236 sched->ws_nscheduled--;
239 wi->wi_scheduled = 0;
241 spin_unlock(&sched->ws_lock);
244 rc = (*wi->wi_action) (wi);
246 spin_lock(&sched->ws_lock);
247 if (rc != 0) /* WI should be dead, even be freed! */
251 if (list_empty(&wi->wi_list))
254 LASSERT(wi->wi_scheduled);
255 /* wi is rescheduled, should be on rerunq now, we
256 * move it to runq so it can run action now */
257 list_move_tail(&wi->wi_list, &sched->ws_runq);
260 if (!list_empty(&sched->ws_runq)) {
261 spin_unlock(&sched->ws_lock);
262 /* don't sleep because some workitems still
263 * expect me to come back soon */
265 spin_lock(&sched->ws_lock);
269 spin_unlock(&sched->ws_lock);
270 rc = wait_event_interruptible_exclusive(sched->ws_waitq,
271 !cfs_wi_sched_cansleep(sched));
272 spin_lock(&sched->ws_lock);
275 spin_unlock(&sched->ws_lock);
277 spin_lock(&cfs_wi_data.wi_glock);
278 sched->ws_nthreads--;
279 spin_unlock(&cfs_wi_data.wi_glock);
285 cfs_wi_sched_destroy(struct cfs_wi_sched *sched)
287 LASSERT(cfs_wi_data.wi_init);
288 LASSERT(!cfs_wi_data.wi_stopping);
290 spin_lock(&cfs_wi_data.wi_glock);
291 if (sched->ws_stopping) {
292 CDEBUG(D_INFO, "%s is in progress of stopping\n",
294 spin_unlock(&cfs_wi_data.wi_glock);
298 LASSERT(!list_empty(&sched->ws_list));
299 sched->ws_stopping = 1;
301 spin_unlock(&cfs_wi_data.wi_glock);
303 wake_up_all(&sched->ws_waitq);
305 spin_lock(&cfs_wi_data.wi_glock);
309 while (sched->ws_nthreads > 0) {
310 CDEBUG(is_power_of_2(++i / 20) ? D_WARNING : D_NET,
311 "waiting %us for %d %s worker threads to exit\n",
312 i / 20, sched->ws_nthreads, sched->ws_name);
314 spin_unlock(&cfs_wi_data.wi_glock);
315 schedule_timeout_uninterruptible(cfs_time_seconds(1)
317 spin_lock(&cfs_wi_data.wi_glock);
321 list_del(&sched->ws_list);
323 spin_unlock(&cfs_wi_data.wi_glock);
325 LASSERT(sched->ws_nscheduled == 0);
327 LIBCFS_FREE(sched, sizeof(*sched));
329 EXPORT_SYMBOL(cfs_wi_sched_destroy);
332 cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab,
333 int cpt, int nthrs, struct cfs_wi_sched **sched_pp)
335 struct cfs_wi_sched *sched;
337 LASSERT(cfs_wi_data.wi_init);
338 LASSERT(!cfs_wi_data.wi_stopping);
339 LASSERT(cptab == NULL || cpt == CFS_CPT_ANY ||
340 (cpt >= 0 && cpt < cfs_cpt_number(cptab)));
342 LIBCFS_ALLOC(sched, sizeof(*sched));
346 if (strlen(name) > sizeof(sched->ws_name)-1) {
347 LIBCFS_FREE(sched, sizeof(*sched));
350 strlcpy(sched->ws_name, name, sizeof(sched->ws_name));
352 sched->ws_cptab = cptab;
355 spin_lock_init(&sched->ws_lock);
356 init_waitqueue_head(&sched->ws_waitq);
358 INIT_LIST_HEAD(&sched->ws_runq);
359 INIT_LIST_HEAD(&sched->ws_rerunq);
360 INIT_LIST_HEAD(&sched->ws_list);
362 for (; nthrs > 0; nthrs--) {
364 struct task_struct *task;
366 spin_lock(&cfs_wi_data.wi_glock);
367 while (sched->ws_starting > 0) {
368 spin_unlock(&cfs_wi_data.wi_glock);
370 spin_lock(&cfs_wi_data.wi_glock);
373 sched->ws_starting++;
374 spin_unlock(&cfs_wi_data.wi_glock);
376 if (sched->ws_cptab != NULL && sched->ws_cpt >= 0) {
377 snprintf(name, sizeof(name), "%s_%02d_%02d",
378 sched->ws_name, sched->ws_cpt,
381 snprintf(name, sizeof(name), "%s_%02d",
382 sched->ws_name, sched->ws_nthreads);
385 task = kthread_run(cfs_wi_scheduler, sched, "%s", name);
387 int rc = PTR_ERR(task);
389 CERROR("Failed to create thread for "
390 "WI scheduler %s: %d\n", name, rc);
392 spin_lock(&cfs_wi_data.wi_glock);
394 /* make up for cfs_wi_sched_destroy */
395 list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
396 sched->ws_starting--;
398 spin_unlock(&cfs_wi_data.wi_glock);
400 cfs_wi_sched_destroy(sched);
405 spin_lock(&cfs_wi_data.wi_glock);
406 list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
407 spin_unlock(&cfs_wi_data.wi_glock);
412 EXPORT_SYMBOL(cfs_wi_sched_create);
/*
 * Module initialisation statements: clear the global cfs_wi_data, set up
 * its lock and its (empty) list of schedulers, then mark the WI module as
 * initialised so that the wi_init assertions in cfs_wi_sched_create() and
 * cfs_wi_sched_destroy() pass.
 *
 * NOTE(review): the header line of the function these statements belong
 * to is missing from this listing (presumably the module start-up
 * routine) - confirm against the full file before editing.
 */
417 memset(&cfs_wi_data, 0, sizeof(struct cfs_workitem_data));
419 spin_lock_init(&cfs_wi_data.wi_glock);
420 INIT_LIST_HEAD(&cfs_wi_data.wi_scheds);
421 cfs_wi_data.wi_init = 1;
427 cfs_wi_shutdown (void)
429 struct cfs_wi_sched *sched;
431 spin_lock(&cfs_wi_data.wi_glock);
432 cfs_wi_data.wi_stopping = 1;
433 spin_unlock(&cfs_wi_data.wi_glock);
435 /* nobody should contend on this list */
436 list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
437 sched->ws_stopping = 1;
438 wake_up_all(&sched->ws_waitq);
441 list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
442 spin_lock(&cfs_wi_data.wi_glock);
444 while (sched->ws_nthreads != 0) {
445 spin_unlock(&cfs_wi_data.wi_glock);
446 schedule_timeout_uninterruptible(cfs_time_seconds(1)
448 spin_lock(&cfs_wi_data.wi_glock);
450 spin_unlock(&cfs_wi_data.wi_glock);
453 while (!list_empty(&cfs_wi_data.wi_scheds)) {
454 sched = list_entry(cfs_wi_data.wi_scheds.next,
455 struct cfs_wi_sched, ws_list);
456 list_del(&sched->ws_list);
457 LIBCFS_FREE(sched, sizeof(*sched));
460 cfs_wi_data.wi_stopping = 0;
461 cfs_wi_data.wi_init = 0;