4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2014, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
32 * libcfs/libcfs/workitem.c
34 * Author: Isaac Huang <isaac@clusterfs.com>
35 * Liang Zhen <zhen.liang@sun.com>
38 #define DEBUG_SUBSYSTEM S_LNET
40 #include <linux/kthread.h>
41 #include <libcfs/libcfs.h>
43 #define CFS_WS_NAME_LEN 16
/* Per-scheduler state: one instance per workitem scheduler, each backed by
 * ws_nthreads kernel threads.  Fields marked "protected by cfs_wi_data::wi_glock"
 * are serialised by the module-global lock; queue fields are serialised by the
 * scheduler's own ws_lock (lock field not visible in this chunk). */
46 struct list_head ws_list; /* chain on global list */
47 /** serialised workitems */
49 /** where schedulers sleep */
50 wait_queue_head_t ws_waitq;
51 /** concurrent workitems */
52 struct list_head ws_runq;
53 /** rescheduled running-workitems, a workitem can be rescheduled
54 * while running in wi_action(), but we don't want to execute it again
55 * unless it returns from wi_action(), so we put it on ws_rerunq
56 * while rescheduling, and move it to runq after it returns
58 struct list_head ws_rerunq;
59 /** CPT-table for this scheduler */
60 struct cfs_cpt_table *ws_cptab;
61 /** CPT id for affinity */
63 /** number of scheduled workitems */
65 /** started scheduler thread, protected by cfs_wi_data::wi_glock */
66 unsigned int ws_nthreads:30;
67 /** shutting down, protected by cfs_wi_data::wi_glock */
68 unsigned int ws_stopping:1;
69 /** serialize starting thread, protected by cfs_wi_data::wi_glock */
70 unsigned int ws_starting:1;
/** human-readable scheduler name, also used as kthread name prefix */
72 char ws_name[CFS_WS_NAME_LEN];
/* Module-global state: every scheduler created by cfs_wi_sched_create() is
 * chained on wi_scheds; accesses are serialised by a global spinlock
 * (wi_glock — its declaration is not visible in this chunk, but see the
 * spin_lock(&cfs_wi_data.wi_glock) callers below). */
75 static struct cfs_workitem_data {
78 /** list of all schedulers */
79 struct list_head wi_scheds;
80 /** WI module is initialized */
82 /** shutting down the whole WI module */
/*
 * Decide whether a scheduler thread may go to sleep: it must stay awake
 * while the scheduler is stopping (so it can notice ws_stopping and exit)
 * or while ws_runq still has pending workitems.
 * NOTE(review): the return statements are elided in this chunk; from the
 * wait_event_interruptible_exclusive(..., !cfs_wi_sched_cansleep(sched))
 * caller below, the two early-unlock branches presumably return "cannot
 * sleep" and the fall-through returns "can sleep" — confirm against the
 * full source.
 */
87 cfs_wi_sched_cansleep(struct cfs_wi_sched *sched)
89 spin_lock(&sched->ws_lock);
/* shutting down: thread must not block, it has to exit its main loop */
90 if (sched->ws_stopping) {
91 spin_unlock(&sched->ws_lock);
/* work pending on the run queue: keep running */
95 if (!list_empty(&sched->ws_runq)) {
96 spin_unlock(&sched->ws_lock);
99 spin_unlock(&sched->ws_lock);
/**
 * Permanently retire workitem \a wi.  Preconditions (enforced below):
 * 0. it only works when called from wi->wi_action.
 * 1. when it returns no one shall try to schedule the workitem.
 */
108 cfs_wi_exit(struct cfs_wi_sched *sched, struct cfs_workitem *wi)
110 LASSERT(!in_interrupt()); /* because we use plain spinlock */
111 LASSERT(!sched->ws_stopping);
113 spin_lock(&sched->ws_lock);
/* only legal from inside wi_action(), hence the workitem is running */
115 LASSERT(wi->wi_running);
117 if (wi->wi_scheduled) { /* cancel pending schedules */
118 LASSERT(!list_empty(&wi->wi_list));
119 list_del_init(&wi->wi_list);
121 LASSERT(sched->ws_nscheduled > 0);
122 sched->ws_nscheduled--;
125 LASSERT(list_empty(&wi->wi_list));
/* leave wi_scheduled set while wi is on no queue: a later
 * cfs_wi_schedule() would then hit its LASSERT(!list_empty(&wi->wi_list))
 * and LBUG, catching illegal post-exit schedule attempts */
127 wi->wi_scheduled = 1; /* LBUG future schedule attempts */
128 spin_unlock(&sched->ws_lock);
132 EXPORT_SYMBOL(cfs_wi_exit);
/**
 * cancel schedule request of workitem \a wi
 * Returns non-zero when the pending schedule was cancelled before wi_action
 * could run, zero when wi_action is already running (and so may still run
 * to completion) — see rc below.
 */
138 cfs_wi_deschedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi)
142 LASSERT(!in_interrupt()); /* because we use plain spinlock */
143 LASSERT(!sched->ws_stopping);
/*
146 * return 0 if it's running already, otherwise return 1, which
147 * means the workitem will not be scheduled and will not have
148 * any race with wi_action.
 */
150 spin_lock(&sched->ws_lock);
152 rc = !(wi->wi_running);
154 if (wi->wi_scheduled) { /* cancel pending schedules */
155 LASSERT(!list_empty(&wi->wi_list));
156 list_del_init(&wi->wi_list);
158 LASSERT(sched->ws_nscheduled > 0);
159 sched->ws_nscheduled--;
161 wi->wi_scheduled = 0;
/* after cancellation the workitem must be on no queue */
164 LASSERT (list_empty(&wi->wi_list));
166 spin_unlock(&sched->ws_lock);
169 EXPORT_SYMBOL(cfs_wi_deschedule);
/**
 * Queue workitem \a wi on scheduler \a sched for execution.
 * Workitem scheduled with (serial == 1) is strictly serialised not only with
 * itself, but also with others scheduled this way.
 *
 * Now there's only one static serialised queue, but in the future more might
 * be added, and even dynamic creation of serialised queues might be supported.
 */
179 cfs_wi_schedule(struct cfs_wi_sched *sched, struct cfs_workitem *wi)
181 LASSERT(!in_interrupt()); /* because we use plain spinlock */
182 LASSERT(!sched->ws_stopping);
184 spin_lock(&sched->ws_lock);
/* no-op if already scheduled; double-schedule collapses into one run */
186 if (!wi->wi_scheduled) {
187 LASSERT (list_empty(&wi->wi_list));
189 wi->wi_scheduled = 1;
190 sched->ws_nscheduled++;
191 if (!wi->wi_running) {
/* idle: queue on runq and wake a scheduler thread */
192 list_add_tail(&wi->wi_list, &sched->ws_runq);
193 wake_up(&sched->ws_waitq);
/* currently executing wi_action(): park on rerunq; the scheduler
 * thread moves it back to runq once wi_action() returns */
195 list_add(&wi->wi_list, &sched->ws_rerunq);
/* scheduled items must always be on runq or rerunq */
199 LASSERT (!list_empty(&wi->wi_list));
200 spin_unlock(&sched->ws_lock);
203 EXPORT_SYMBOL(cfs_wi_schedule);
/*
 * Main loop of a scheduler kthread: drain ws_runq, running each workitem's
 * wi_action() with ws_lock dropped, and sleep when idle.  Started by
 * cfs_wi_sched_create() via kthread_run(); exits when ws_stopping is set.
 * NOTE(review): several lines (nloops declaration/increment, wi_running
 * updates, cond_resched()) are elided in this chunk — comments below are
 * based on the visible code only.
 */
206 cfs_wi_scheduler(void *arg)
208 struct cfs_wi_sched *sched = (struct cfs_wi_sched *)arg;
/* CPT affinity scheduler? */
213 if (sched->ws_cptab != NULL)
214 if (cfs_cpt_bind(sched->ws_cptab, sched->ws_cpt) != 0)
215 CWARN("Unable to bind %s on CPU partition %d\n",
216 sched->ws_name, sched->ws_cpt);
/* startup handshake: clear ws_starting so the creator can launch the
 * next thread, and count ourselves in ws_nthreads */
218 spin_lock(&cfs_wi_data.wi_glock);
220 LASSERT(sched->ws_starting == 1);
221 sched->ws_starting--;
222 sched->ws_nthreads++;
224 spin_unlock(&cfs_wi_data.wi_glock);
226 spin_lock(&sched->ws_lock);
228 while (!sched->ws_stopping) {
231 struct cfs_workitem *wi;
/* process up to CFS_WI_RESCHED items per pass so one thread
 * cannot monopolise the CPU indefinitely */
233 while (!list_empty(&sched->ws_runq) &&
234 nloops < CFS_WI_RESCHED) {
235 wi = list_entry(sched->ws_runq.next,
236 struct cfs_workitem, wi_list);
237 LASSERT(wi->wi_scheduled && !wi->wi_running);
239 list_del_init(&wi->wi_list);
241 LASSERT(sched->ws_nscheduled > 0);
242 sched->ws_nscheduled--;
245 wi->wi_scheduled = 0;
/* run the action with ws_lock dropped — wi_action() may sleep
 * or reschedule the workitem */
247 spin_unlock(&sched->ws_lock);
250 rc = (*wi->wi_action) (wi);
252 spin_lock(&sched->ws_lock);
253 if (rc != 0) /* WI should be dead, even be freed! */
/* empty wi_list => not rescheduled while running: done */
257 if (list_empty(&wi->wi_list))
260 LASSERT(wi->wi_scheduled);
/* wi is rescheduled, should be on rerunq now, we
262 * move it to runq so it can run action now */
263 list_move_tail(&wi->wi_list, &sched->ws_runq);
266 if (!list_empty(&sched->ws_runq)) {
267 spin_unlock(&sched->ws_lock);
/* don't sleep because some workitems still
269 * expect me to come back soon */
271 spin_lock(&sched->ws_lock);
/* idle: block until there is work or we are told to stop */
275 spin_unlock(&sched->ws_lock);
276 rc = wait_event_interruptible_exclusive(sched->ws_waitq,
277 !cfs_wi_sched_cansleep(sched));
278 spin_lock(&sched->ws_lock);
281 spin_unlock(&sched->ws_lock);
/* de-register before exiting so cfs_wi_sched_destroy()'s
 * ws_nthreads poll can complete */
283 spin_lock(&cfs_wi_data.wi_glock);
284 sched->ws_nthreads--;
285 spin_unlock(&cfs_wi_data.wi_glock);
/*
 * Tear down scheduler \a sched: flag it stopping, wake every scheduler
 * thread, poll until they have all exited, then unlink and free it.
 * Must not be called during module init/shutdown (asserted below).
 */
291 cfs_wi_sched_destroy(struct cfs_wi_sched *sched)
293 LASSERT(cfs_wi_data.wi_init);
294 LASSERT(!cfs_wi_data.wi_stopping);
296 spin_lock(&cfs_wi_data.wi_glock);
/* another caller is already destroying this scheduler */
297 if (sched->ws_stopping) {
298 CDEBUG(D_INFO, "%s is in progress of stopping\n",
300 spin_unlock(&cfs_wi_data.wi_glock);
304 LASSERT(!list_empty(&sched->ws_list));
305 sched->ws_stopping = 1;
307 spin_unlock(&cfs_wi_data.wi_glock);
/* kick all threads out of their idle wait so they see ws_stopping */
309 wake_up_all(&sched->ws_waitq);
311 spin_lock(&cfs_wi_data.wi_glock);
/* poll (1/20 s naps) until every scheduler thread has exited;
 * IS_PO2(++i) escalates the log level at power-of-two iterations
 * so long waits become visible without flooding the log */
315 while (sched->ws_nthreads > 0) {
316 CDEBUG(IS_PO2(++i) ? D_WARNING : D_NET,
317 "waiting for %d threads of WI sched[%s] to "
318 "terminate\n", sched->ws_nthreads,
321 spin_unlock(&cfs_wi_data.wi_glock);
322 set_current_state(TASK_UNINTERRUPTIBLE);
323 schedule_timeout(cfs_time_seconds(1) / 20);
324 spin_lock(&cfs_wi_data.wi_glock);
328 list_del(&sched->ws_list);
330 spin_unlock(&cfs_wi_data.wi_glock);
/* all workitems must be done or descheduled by now */
332 LASSERT(sched->ws_nscheduled == 0);
334 LIBCFS_FREE(sched, sizeof(*sched));
336 EXPORT_SYMBOL(cfs_wi_sched_destroy);
/*
 * Create a workitem scheduler named \a name with \a nthrs kthreads,
 * optionally bound to CPT \a cpt of \a cptab, and return it via
 * \a sched_pp.  Threads are started one at a time, serialised by
 * ws_starting, so each thread's name gets a sequential ws_nthreads index.
 *
 * NOTE(review): the kthread-name snprintf calls below write into the
 * char *name PARAMETER with sizeof(name) — that is sizeof a pointer
 * (4/8 bytes), not a buffer length, and it also clobbers the caller's
 * string.  A local char buf[CFS_WS_NAME_LEN + 8] should be used instead;
 * not fixed here because adjacent lines are elided from this chunk.
 */
339 cfs_wi_sched_create(char *name, struct cfs_cpt_table *cptab,
340 int cpt, int nthrs, struct cfs_wi_sched **sched_pp)
342 struct cfs_wi_sched *sched;
344 LASSERT(cfs_wi_data.wi_init);
345 LASSERT(!cfs_wi_data.wi_stopping);
346 LASSERT(cptab == NULL || cpt == CFS_CPT_ANY ||
347 (cpt >= 0 && cpt < cfs_cpt_number(cptab)));
349 LIBCFS_ALLOC(sched, sizeof(*sched));
/* reject names that would be silently truncated by strlcpy below */
353 if (strlen(name) > sizeof(sched->ws_name)-1) {
354 LIBCFS_FREE(sched, sizeof(*sched));
357 strlcpy(sched->ws_name, name, sizeof(sched->ws_name));
359 sched->ws_cptab = cptab;
362 spin_lock_init(&sched->ws_lock);
363 init_waitqueue_head(&sched->ws_waitq);
365 INIT_LIST_HEAD(&sched->ws_runq);
366 INIT_LIST_HEAD(&sched->ws_rerunq);
367 INIT_LIST_HEAD(&sched->ws_list);
369 for (; nthrs > 0; nthrs--) {
371 struct task_struct *task;
/* wait for the previous thread to finish starting (it clears
 * ws_starting) so ws_nthreads is stable for the name below */
373 spin_lock(&cfs_wi_data.wi_glock);
374 while (sched->ws_starting > 0) {
375 spin_unlock(&cfs_wi_data.wi_glock);
377 spin_lock(&cfs_wi_data.wi_glock);
380 sched->ws_starting++;
381 spin_unlock(&cfs_wi_data.wi_glock);
/* see NOTE(review) in the header comment: sizeof(name) here is the
 * size of a pointer, and writing through name mutates the caller's
 * buffer */
383 if (sched->ws_cptab != NULL && sched->ws_cpt >= 0) {
384 snprintf(name, sizeof(name), "%s_%02d_%02d",
385 sched->ws_name, sched->ws_cpt,
388 snprintf(name, sizeof(name), "%s_%02d",
389 sched->ws_name, sched->ws_nthreads);
392 task = kthread_run(cfs_wi_scheduler, sched, name);
394 int rc = PTR_ERR(task);
396 CERROR("Failed to create thread for "
397 "WI scheduler %s: %d\n", name, rc);
399 spin_lock(&cfs_wi_data.wi_glock);
/* make up for cfs_wi_sched_destroy */
402 list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
403 sched->ws_starting--;
405 spin_unlock(&cfs_wi_data.wi_glock);
/* destroy reaps the already-started threads and frees sched */
407 cfs_wi_sched_destroy(sched);
/* success: publish the scheduler on the global list */
412 spin_lock(&cfs_wi_data.wi_glock);
413 list_add(&sched->ws_list, &cfs_wi_data.wi_scheds);
414 spin_unlock(&cfs_wi_data.wi_glock);
419 EXPORT_SYMBOL(cfs_wi_sched_create);
/* Module init (function signature elided in this chunk): zero the global
 * state, set up the lock and scheduler list, and mark the module ready. */
424 memset(&cfs_wi_data, 0, sizeof(struct cfs_workitem_data));
426 spin_lock_init(&cfs_wi_data.wi_glock);
427 INIT_LIST_HEAD(&cfs_wi_data.wi_scheds);
428 cfs_wi_data.wi_init = 1;
/*
 * Module shutdown: flag every scheduler stopping, wake their threads,
 * wait for all threads to exit, then free every remaining scheduler.
 */
434 cfs_wi_shutdown (void)
436 struct cfs_wi_sched *sched;
438 spin_lock(&cfs_wi_data.wi_glock);
439 cfs_wi_data.wi_stopping = 1;
440 spin_unlock(&cfs_wi_data.wi_glock);
/* nobody should contend on this list */
443 list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
444 sched->ws_stopping = 1;
445 wake_up_all(&sched->ws_waitq);
/* second pass: poll (1/20 s naps) until each scheduler's threads exit */
448 list_for_each_entry(sched, &cfs_wi_data.wi_scheds, ws_list) {
449 spin_lock(&cfs_wi_data.wi_glock);
451 while (sched->ws_nthreads != 0) {
452 spin_unlock(&cfs_wi_data.wi_glock);
453 set_current_state(TASK_UNINTERRUPTIBLE);
454 schedule_timeout(cfs_time_seconds(1) / 20);
455 spin_lock(&cfs_wi_data.wi_glock);
457 spin_unlock(&cfs_wi_data.wi_glock);
/* all threads gone: safe to free schedulers without the lock */
460 while (!list_empty(&cfs_wi_data.wi_scheds)) {
461 sched = list_entry(cfs_wi_data.wi_scheds.next,
462 struct cfs_wi_sched, ws_list);
463 list_del(&sched->ws_list);
464 LIBCFS_FREE(sched, sizeof(*sched));
467 cfs_wi_data.wi_stopping = 0;
468 cfs_wi_data.wi_init = 0;