/*
 * Source: lustre/ptlrpc/recov_thread.c from fs/lustre-release.git
 * (gitweb snapshot of the b_devel merge into HEAD that became 0.7.3).
 */
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  *  Copyright (C) 2003 Cluster File Systems, Inc.
5  *   Author: Andreas Dilger <adilger@clusterfs.com>
6  *
7  *   This file is part of Lustre, http://www.lustre.org.
8  *
9  *   Lustre is free software; you can redistribute it and/or
10  *   modify it under the terms of version 2 of the GNU General Public
11  *   License as published by the Free Software Foundation.
12  *
13  *   Lustre is distributed in the hope that it will be useful,
14  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
15  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  *   GNU General Public License for more details.
17  *
18  *   You should have received a copy of the GNU General Public License
19  *   along with Lustre; if not, write to the Free Software
20  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21  *
22  * OST<->MDS recovery logging thread.
23  *
24  * Invariants in implementation:
25  * - we do not share logs among different OST<->MDS connections, so that
26  *   if an OST or MDS fails it need only look at log(s) relevant to itself
27  */
28
29 #define DEBUG_SUBSYSTEM S_LOG
30
31 #ifndef EXPORT_SYMTAB
32 # define EXPORT_SYMTAB
33 #endif
34
35 #ifndef __KERNEL__
36 # include <portals/list.h>
37 # include <liblustre.h>
38 #endif
39 #include <linux/kp30.h>
40 #include <linux/fs.h>
41 #include <linux/obd_class.h>
42 #include <linux/lustre_commit_confd.h>
43 #include <linux/obd_support.h>
44 #include <linux/obd_class.h>
45 #include <linux/lustre_net.h>
46 #include <portals/types.h>
47 #include <portals/list.h>
48 #include "ptlrpc_internal.h"
49
50 static struct llog_commit_master lustre_lcm;
51 static struct llog_commit_master *lcm = &lustre_lcm;
52
53 /* Allocate new commit structs in case we do not have enough */
54 static int llcd_alloc(void)
55 {
56         struct llog_commit_data *llcd;
57
58         OBD_ALLOC(llcd, PAGE_SIZE);
59         if (llcd == NULL)
60                 return -ENOMEM;
61
62         llcd->llcd_lcm = lcm;
63
64         spin_lock(&lcm->lcm_llcd_lock);
65         list_add(&llcd->llcd_list, &lcm->lcm_llcd_free);
66         atomic_inc(&lcm->lcm_llcd_numfree);
67         spin_unlock(&lcm->lcm_llcd_lock);
68
69         return 0;
70 }
71
72 /* Get a free cookie struct from the list */
73 struct llog_commit_data *llcd_grab(void)
74 {
75         struct llog_commit_data *llcd;
76
77         spin_lock(&lcm->lcm_llcd_lock);
78         if (list_empty(&lcm->lcm_llcd_free)) {
79                 spin_unlock(&lcm->lcm_llcd_lock);
80                 if (llcd_alloc() < 0) {
81                         CERROR("unable to allocate log commit data!\n");
82                         return NULL;
83                 }
84                 spin_lock(&lcm->lcm_llcd_lock);
85         }
86
87         llcd = list_entry(lcm->lcm_llcd_free.next, typeof(*llcd), llcd_list);
88         list_del(&llcd->llcd_list);
89         atomic_dec(&lcm->lcm_llcd_numfree);
90         spin_unlock(&lcm->lcm_llcd_lock);
91
92         llcd->llcd_tries = 0;
93         llcd->llcd_cookiebytes = 0;
94
95         return llcd;
96 }
97 EXPORT_SYMBOL(llcd_grab);
98
99 static void llcd_put(struct llog_commit_data *llcd)
100 {
101         if (atomic_read(&lcm->lcm_llcd_numfree) >= lcm->lcm_llcd_maxfree) {
102                 OBD_FREE(llcd, PAGE_SIZE);
103         } else {
104                 spin_lock(&lcm->lcm_llcd_lock);
105                 list_add(&llcd->llcd_list, &lcm->lcm_llcd_free);
106                 atomic_inc(&lcm->lcm_llcd_numfree);
107                 spin_unlock(&lcm->lcm_llcd_lock);
108         }
109 }
110
111 /* Send some cookies to the appropriate target */
112 void llcd_send(struct llog_commit_data *llcd)
113 {
114         spin_lock(&llcd->llcd_lcm->lcm_llcd_lock);
115         list_add_tail(&llcd->llcd_list, &llcd->llcd_lcm->lcm_llcd_pending);
116         spin_unlock(&llcd->llcd_lcm->lcm_llcd_lock);
117
118         wake_up_nr(&llcd->llcd_lcm->lcm_waitq, 1);
119 }
120 EXPORT_SYMBOL(llcd_send);
121
122 static int log_commit_thread(void *arg)
123 {
124         struct llog_commit_master *lcm = arg;
125         struct llog_commit_daemon *lcd;
126         struct llog_commit_data *llcd, *n;
127         unsigned long flags;
128         ENTRY;
129
130         OBD_ALLOC(lcd, sizeof(*lcd));
131         if (lcd == NULL)
132                 RETURN(-ENOMEM);
133
134         lock_kernel();
135         ptlrpc_daemonize(); /* thread never needs to do IO */
136
137         SIGNAL_MASK_LOCK(current, flags);
138         sigfillset(&current->blocked);
139         RECALC_SIGPENDING;
140         SIGNAL_MASK_UNLOCK(current, flags);
141
142         spin_lock(&lcm->lcm_thread_lock);
143         THREAD_NAME(current->comm, "ll_log_commit_%d",
144                     atomic_read(&lcm->lcm_thread_total));
145         atomic_inc(&lcm->lcm_thread_total);
146         spin_unlock(&lcm->lcm_thread_lock);
147         unlock_kernel();
148
149         INIT_LIST_HEAD(&lcd->lcd_lcm_list);
150         INIT_LIST_HEAD(&lcd->lcd_llcd_list);
151         lcd->lcd_lcm = lcm;
152
153         CDEBUG(D_HA, "%s started\n", current->comm);
154         do {
155                 struct ptlrpc_request *request;
156                 struct obd_import *import = NULL;
157                 struct list_head *sending_list;
158                 int rc = 0;
159
160                 /* If we do not have enough pages available, allocate some */
161                 while (atomic_read(&lcm->lcm_llcd_numfree) <
162                        lcm->lcm_llcd_minfree) {
163                         if (llcd_alloc() < 0)
164                                 break;
165                 }
166
167                 spin_lock(&lcm->lcm_thread_lock);
168                 atomic_inc(&lcm->lcm_thread_numidle);
169                 list_move(&lcd->lcd_lcm_list, &lcm->lcm_thread_idle);
170                 spin_unlock(&lcm->lcm_thread_lock);
171
172                 wait_event_interruptible(lcm->lcm_waitq,
173                                          !list_empty(&lcm->lcm_llcd_pending) ||
174                                          lcm->lcm_flags & LLOG_LCM_FL_EXIT);
175
176                 /* If we are the last available thread, start a new one in case
177                  * we get blocked on an RPC (nobody else will start a new one)*/
178                 spin_lock(&lcm->lcm_thread_lock);
179                 atomic_dec(&lcm->lcm_thread_numidle);
180                 list_move(&lcd->lcd_lcm_list, &lcm->lcm_thread_busy);
181                 spin_unlock(&lcm->lcm_thread_lock);
182
183                 sending_list = &lcm->lcm_llcd_pending;
184         resend:
185                 if (lcm->lcm_flags & LLOG_LCM_FL_EXIT) {
186                         lcm->lcm_llcd_maxfree = 0;
187                         lcm->lcm_llcd_minfree = 0;
188                         lcm->lcm_thread_max = 0;
189
190                         if (list_empty(&lcm->lcm_llcd_pending) ||
191                             lcm->lcm_flags & LLOG_LCM_FL_EXIT_FORCE)
192                                 break;
193                 }
194
195                 if (atomic_read(&lcm->lcm_thread_numidle) <= 1 &&
196                     atomic_read(&lcm->lcm_thread_total) < lcm->lcm_thread_max) {
197                         rc = llog_start_commit_thread();
198                         if (rc < 0)
199                                 CERROR("error starting thread: rc %d\n", rc);
200                 }
201
202                 /* Move all of the pending cancels from the same OST off of
203                  * the list, so we don't get multiple threads blocked and/or
204                  * doing upcalls on the same OST in case of failure. */
205                 spin_lock(&lcm->lcm_llcd_lock);
206                 if (!list_empty(sending_list)) {
207                         list_move_tail(sending_list->next,
208                                        &lcd->lcd_llcd_list);
209                         llcd = list_entry(lcd->lcd_llcd_list.next,
210                                           typeof(*llcd), llcd_list);
211                         LASSERT(llcd->llcd_lcm == lcm);
212                         import = llcd->llcd_import;
213                 }
214                 list_for_each_entry_safe(llcd, n, sending_list, llcd_list) {
215                         LASSERT(llcd->llcd_lcm == lcm);
216                         if (import == llcd->llcd_import)
217                                 list_move_tail(&llcd->llcd_list,
218                                                &lcd->lcd_llcd_list);
219                 }
220                 if (sending_list != &lcm->lcm_llcd_resend) {
221                         list_for_each_entry_safe(llcd, n, &lcm->lcm_llcd_resend,
222                                                  llcd_list) {
223                                 LASSERT(llcd->llcd_lcm == lcm);
224                                 if (import == llcd->llcd_import)
225                                         list_move_tail(&llcd->llcd_list,
226                                                        &lcd->lcd_llcd_list);
227                         }
228                 }
229                 spin_unlock(&lcm->lcm_llcd_lock);
230
231                 /* We are the only one manipulating our local list - no lock */
232                 list_for_each_entry_safe(llcd,n, &lcd->lcd_llcd_list,llcd_list){
233                         char *bufs[1] = {(char *)llcd->llcd_cookies};
234                         list_del(&llcd->llcd_list);
235
236                         request = ptlrpc_prep_req(import, OBD_LOG_CANCEL, 1,
237                                                   &llcd->llcd_cookiebytes,
238                                                   bufs);
239                         if (request == NULL) {
240                                 rc = -ENOMEM;
241                                 CERROR("error preparing commit: rc %d\n", rc);
242
243                                 spin_lock(&lcm->lcm_llcd_lock);
244                                 list_splice(&lcd->lcd_llcd_list,
245                                             &lcm->lcm_llcd_resend);
246                                 INIT_LIST_HEAD(&lcd->lcd_llcd_list);
247                                 spin_unlock(&lcm->lcm_llcd_lock);
248                                 break;
249                         }
250
251                         request->rq_replen = lustre_msg_size(0, NULL);
252                         rc = ptlrpc_queue_wait(request);
253                         ptlrpc_req_finished(request);
254
255                         /* If the RPC failed, we put this and the remaining
256                          * messages onto the resend list for another time. */
257                         if (rc == 0) {
258                                 llcd_put(llcd);
259                                 continue;
260                         }
261
262                         spin_lock(&lcm->lcm_llcd_lock);
263                         list_splice(&lcd->lcd_llcd_list, &lcm->lcm_llcd_resend);
264                         if (++llcd->llcd_tries < 5) {
265                                 CERROR("commit %p failed %dx: rc %d\n",
266                                        llcd, llcd->llcd_tries, rc);
267
268                                 list_add_tail(&llcd->llcd_list,
269                                               &lcm->lcm_llcd_resend);
270                                 spin_unlock(&lcm->lcm_llcd_lock);
271                         } else {
272                                 spin_unlock(&lcm->lcm_llcd_lock);
273                                 CERROR("commit %p dropped %d cookies: rc %d\n",
274                                        llcd, (int)(llcd->llcd_cookiebytes /
275                                                    sizeof(*llcd->llcd_cookies)),
276                                        rc);
277                                 llcd_put(llcd);
278                         }
279                         break;
280                 }
281
282                 if (rc == 0) {
283                         sending_list = &lcm->lcm_llcd_resend;
284                         if (!list_empty(sending_list))
285                                 goto resend;
286                 }
287         } while(1);
288
289         /* If we are force exiting, just drop all of the cookies. */
290         if (lcm->lcm_flags & LLOG_LCM_FL_EXIT_FORCE) {
291                 spin_lock(&lcm->lcm_llcd_lock);
292                 list_splice(&lcm->lcm_llcd_pending, &lcd->lcd_llcd_list);
293                 list_splice(&lcm->lcm_llcd_resend, &lcd->lcd_llcd_list);
294                 list_splice(&lcm->lcm_llcd_free, &lcd->lcd_llcd_list);
295                 spin_unlock(&lcm->lcm_llcd_lock);
296
297                 list_for_each_entry_safe(llcd, n, &lcd->lcd_llcd_list,llcd_list)
298                         llcd_put(llcd);
299         }
300
301         OBD_FREE(lcd, sizeof(*lcd));
302
303         spin_lock(&lcm->lcm_thread_lock);
304         atomic_dec(&lcm->lcm_thread_total);
305         spin_unlock(&lcm->lcm_thread_lock);
306         wake_up(&lcm->lcm_waitq);
307
308         CDEBUG(D_HA, "%s exiting\n", current->comm);
309         return 0;
310 }
311
312 int llog_start_commit_thread(void)
313 {
314         int rc;
315         ENTRY;
316
317         if (atomic_read(&lcm->lcm_thread_total) >= lcm->lcm_thread_max)
318                 RETURN(0);
319
320         rc = kernel_thread(log_commit_thread, lcm, CLONE_VM | CLONE_FILES);
321         if (rc < 0) {
322                 CERROR("error starting thread #%d: %d\n",
323                        atomic_read(&lcm->lcm_thread_total), rc);
324                 RETURN(rc);
325         }
326
327         RETURN(0);
328 }
329 EXPORT_SYMBOL(llog_start_commit_thread);
330
331 int llog_init_commit_master(void)
332 {
333         INIT_LIST_HEAD(&lcm->lcm_thread_busy);
334         INIT_LIST_HEAD(&lcm->lcm_thread_idle);
335         spin_lock_init(&lcm->lcm_thread_lock);
336         atomic_set(&lcm->lcm_thread_numidle, 0);
337         init_waitqueue_head(&lcm->lcm_waitq);
338         INIT_LIST_HEAD(&lcm->lcm_llcd_pending);
339         INIT_LIST_HEAD(&lcm->lcm_llcd_resend);
340         INIT_LIST_HEAD(&lcm->lcm_llcd_free);
341         spin_lock_init(&lcm->lcm_llcd_lock);
342         atomic_set(&lcm->lcm_llcd_numfree, 0);
343         lcm->lcm_llcd_minfree = 0;
344         lcm->lcm_thread_max = 5;
345         return 0;
346 }
347
348 int llog_cleanup_commit_master(int force)
349 {
350         lcm->lcm_flags |= LLOG_LCM_FL_EXIT;
351         if (force)
352                 lcm->lcm_flags |= LLOG_LCM_FL_EXIT_FORCE;
353         wake_up(&lcm->lcm_waitq);
354
355         wait_event_interruptible(lcm->lcm_waitq,
356                                  atomic_read(&lcm->lcm_thread_total) == 0);
357         return 0;
358 }