Whamcloud - gitweb
- debug.c was only used by obdclass, so I moved it there
[fs/lustre-release.git] / lustre / obdfs / flushd.c
1 /*
2  * OBDFS Super operations - also used for Lustre file system
3  *
4  *
5  * This code is issued under the GNU General Public License.
6  * See the file COPYING in this distribution
7  *
8  *  Copyright (C) 1991, 1992  Linus Torvalds
9  * Copyright (C) 1999 Stelias Computing Inc. <braam@stelias.com>
10  * Copyright (C) 1999 Seagate Technology Inc.
11  *
12  */
13 #define __NO_VERSION__
14 #include <linux/fs.h>
15 #include <linux/locks.h>
16 #include <linux/swap.h>
17
18 #include <linux/obd_support.h>
19 #include <linux/obd_class.h>
20 #include <linux/obdfs.h>
21
22
23 /* XXX temporary until the real function is available from kernel
24  * XXX set this to memory size in pages for max page cache size
25  */
26 #define nr_free_buffer_pages() 32768
27
28 /* Defines for page buf daemon */
29 struct pupd_prm {
30         int nfract;  /* Percentage of buffer cache dirty to 
31                         activate bdflush */
32         int ndirty;  /* Maximum number of dirty blocks to write out per
33                         wake-cycle */
34         int nrefill; /* Number of clean buffers to try to obtain
35                                 each time we call refill */
36         int nref_dirt; /* Dirty buffer threshold for activating bdflush
37                           when trying to refill buffers. */
38         int interval; /* jiffies delay between pupdate flushes */
39         int age_buffer;  /* Time for normal buffer to age before we flush it */
40         int age_super;  /* Time for superblock to age before we flush it */
41 };
42
43
44 static struct pupdated {
45         int active;
46         wait_queue_head_t waitq;
47         struct timer_list timer;
48         struct pupd_prm parms;
49 } pupdated = {
50         active: -1,
51         parms: {40, 1024, 64, 256, 1*HZ, 30*HZ, 5*HZ }
52 };
53
54
55 /* Called with the superblock list lock held */
56 static int obdfs_enqueue_pages(struct inode *inode, struct obdo **obdo,
57                                int nr_slots, struct page **pages, char **bufs,
58                                obd_size *counts, obd_off *offsets,
59                                obd_flag *flag, unsigned long check_time)
60 {
61         struct list_head *page_list = obdfs_iplist(inode);
62         struct list_head *tmp;
63         int num = 0;
64
65         ENTRY;
66
67         tmp = page_list;
68         /* Traverse list in reverse order, so we do FIFO, not LIFO order */
69         while ( (tmp = tmp->prev) != page_list && num < nr_slots ) {
70                 struct obdfs_pgrq *req;
71                 struct page *page;
72                 
73                 req = list_entry(tmp, struct obdfs_pgrq, rq_plist);
74                 page = req->rq_page;
75
76                 
77                 if (req->rq_jiffies > check_time)
78                         break;          /* pages are in chronological order */
79
80                 /* Only allocate the obdo if we will actually do I/O here */
81                 if ( !*obdo ) {
82                         OIDEBUG(inode);
83                         *obdo = obdo_fromid(IID(inode), inode->i_ino,
84                                             OBD_MD_FLNOTOBD);
85                         if ( IS_ERR(*obdo) ) {
86                                 int err = PTR_ERR(*obdo);
87                                 *obdo = NULL;
88
89                                 EXIT;
90                                 return err;
91                         }
92
93                         /* FIXME revisit fromid & from_inode */
94                         obdfs_from_inode(*obdo, inode);
95                         *flag = OBD_BRW_CREATE;
96                 }
97
98                 /* Remove request from list before write to avoid conflict.
99                  * Note that obdfs_pgrq_del() also deletes the request.
100                  */
101                 obdfs_pgrq_del(req);
102                 if ( !page ) {
103                         CDEBUG(D_CACHE, "no page \n");
104                         continue;
105                 }
106
107                 bufs[num] = (char *)page_address(page);
108                 pages[num] = page;
109                 counts[num] = PAGE_SIZE;
110                 offsets[num] = ((obd_off)page->index) << PAGE_SHIFT;
111                 CDEBUG(D_INFO, "ENQ inode %ld, page %p addr %p to vector\n", 
112                        inode->i_ino, page, (char *)page_address(page));
113                 num++;
114         }
115
116         if (!list_empty(page_list))
117                 CDEBUG(D_INFO, "inode %ld list not empty\n", inode->i_ino);
118         CDEBUG(D_INFO, "added %d page(s) to vector\n", num);
119
120         EXIT;
121         return num;  
122 } /* obdfs_enqueue_pages */
123
124 /* Dequeue cached pages for a dying inode without writing them to disk. */
125 void obdfs_dequeue_pages(struct inode *inode)
126 {
127         struct list_head *tmp;
128
129         ENTRY;
130         obd_down(&obdfs_i2sbi(inode)->osi_list_mutex);
131         tmp = obdfs_islist(inode);
132         if ( list_empty(tmp) ) {
133                 CDEBUG(D_INFO, "no dirty pages for inode %ld\n", inode->i_ino);
134                 obd_up(&obdfs_i2sbi(inode)->osi_list_mutex);
135                 EXIT;
136                 return;
137         }
138
139         /* take it out of the super list */
140         list_del(tmp);
141         INIT_LIST_HEAD(obdfs_islist(inode));
142
143         tmp = obdfs_iplist(inode);
144         while ( (tmp = tmp->prev) != obdfs_iplist(inode) ) {
145                 struct obdfs_pgrq *req;
146                 struct page *page;
147                 
148                 req = list_entry(tmp, struct obdfs_pgrq, rq_plist);
149                 page = req->rq_page;
150                 /* take it out of the list and free */
151                 obdfs_pgrq_del(req);
152                 /* now put the page away */
153                 put_page(page);
154         }
155
156         obd_up(&obdfs_i2sbi(inode)->osi_list_mutex);
157
158         /* decrement inode reference for page cache */
159         atomic_dec(&inode->i_count);
160         EXIT;
161 }
162
163 /* This value is not arbitrarily chosen.  KIO_STATIC_PAGES from linux/iobuf.h */
164 #define MAX_IOVEC       (KIO_STATIC_PAGES - 1)
165
166 /* Remove writeback requests for the superblock */
167 int obdfs_flush_reqs(struct list_head *inode_list, unsigned long check_time)
168 {
169         struct list_head *tmp;
170         unsigned long     max_io, total_io = 0;
171         obd_count         num_io;
172         obd_count         num_obdos;
173         struct inode     *inodes[MAX_IOVEC];    /* write data back to these */
174         struct page      *pages[MAX_IOVEC];     /* call put_page on these */
175         struct obdo      *obdos[MAX_IOVEC];
176         char             *bufs[MAX_IOVEC];
177         obd_size          counts[MAX_IOVEC];
178         obd_off           offsets[MAX_IOVEC];
179         obd_flag          flags[MAX_IOVEC];
180         obd_count         bufs_per_obdo[MAX_IOVEC];
181         int               err = 0;
182         struct obdfs_sb_info *sbi;
183
184         ENTRY;
185         if (!inode_list) {
186                 CDEBUG(D_INODE, "no list\n");
187                 EXIT;
188                 return 0;
189         }
190
191         sbi = list_entry(inode_list, struct obdfs_sb_info, osi_inodes);
192
193         obd_down(&sbi->osi_list_mutex);
194         if ( list_empty(inode_list) ) {
195                 CDEBUG(D_INFO, "list empty\n");
196                 obd_up(&sbi->osi_list_mutex);
197                 EXIT;
198                 return 0;
199         }
200
201         /* If we are forcing a write, write out all dirty pages */
202         max_io = check_time == ~0UL ? 1<<31 : pupdated.parms.ndirty;
203         CDEBUG(D_INFO, "max_io = %lu\n", max_io);
204
205         /* Add each inode's dirty pages to a write vector, and write it.
206          * Traverse list in reverse order, so we do FIFO, not LIFO order
207          */
208  again:
209         tmp = inode_list;
210         num_io = 0;
211         num_obdos = 0;
212         while ( (tmp = tmp->prev) != inode_list && total_io < max_io) {
213                 struct obdfs_inode_info *ii;
214                 struct inode *inode;
215                 int res;
216
217                 ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes);
218                 inode = list_entry(ii, struct inode, u);
219                 inodes[num_obdos] = inode;
220                 obdos[num_obdos] = NULL;
221                 CDEBUG(D_INFO, "checking inode %ld pages\n", inode->i_ino);
222
223                 /* Make sure we reference "inode" and not "inodes[num_obdos]",
224                  * as num_obdos will change after the loop is run.
225                  */
226                 if (!list_empty(obdfs_iplist(inode))) {
227                         res = obdfs_enqueue_pages(inode, &obdos[num_obdos],
228                                                   MAX_IOVEC - num_io,
229                                                   &pages[num_io], &bufs[num_io],
230                                                   &counts[num_io],
231                                                   &offsets[num_io],
232                                                   &flags[num_obdos],
233                                                   check_time);
234                         CDEBUG(D_INFO, "FLUSH inode %ld, pages flushed: %d\n",
235                                inode->i_ino, res);
236                         if ( res < 0 ) {
237                                 CDEBUG(D_INODE,
238                                        "fatal: unable to enqueue inode %ld (err %d)\n",
239                                        inode->i_ino, res);
240                                 /* XXX Move bad inode to end of list so we can
241                                  * continue with flushing list.  This is a
242                                  * temporary measure to avoid machine lockups.
243                                  * Maybe if we have -ENOENT, simply discard.
244                                  */
245                                 list_del(tmp);
246                                 list_add(tmp, inode_list);
247                                 err = res;
248                                 EXIT;
249                                 goto BREAK;
250                         }
251                         if (res == 0)
252                                 continue;
253
254                         num_io += res;
255                         total_io += res;
256                         bufs_per_obdo[num_obdos] = res;
257                         num_obdos++;
258
259                         if ( num_io == MAX_IOVEC ) {
260                                 obd_up(&sbi->osi_list_mutex);
261                                 err = obdfs_do_vec_wr(inodes, num_io, num_obdos,
262                                                       obdos, bufs_per_obdo,
263                                                       pages, bufs, counts,
264                                                       offsets, flags);
265                                 if ( err ) {
266                                         CDEBUG(D_INODE,
267                                                "fatal: do_vec_wr err=%d\n",
268                                                err);
269                                         EXIT;
270                                         goto ERR;
271                                 }
272                                 obd_down(&sbi->osi_list_mutex);
273                                 goto again;
274                         }
275                 }
276         }
277
278 BREAK:
279         obd_up(&sbi->osi_list_mutex);
280
281         /* flush any remaining I/Os */
282         if ( num_io ) {
283                 err = obdfs_do_vec_wr(inodes, num_io, num_obdos, obdos,
284                                       bufs_per_obdo, pages, bufs, counts,
285                                       offsets, flags);
286                 if (err)
287                         CDEBUG(D_INODE, "fatal: unable to do vec_wr (err %d)\n", err);
288                 num_io = 0;
289                 num_obdos = 0;
290         }
291
292         /* Remove inode from superblock dirty list when no more pages.
293          * Make sure we don't point at the current inode with tmp
294          * when we re-init the list on the inode, or we will loop.
295          */
296         obd_down(&sbi->osi_list_mutex);
297         tmp = inode_list;
298         while ( (tmp = tmp->prev) != inode_list ) {
299                 struct obdfs_inode_info *ii;
300                 struct inode *inode;
301
302                 ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes);
303                 inode = list_entry(ii, struct inode, u);
304                 CDEBUG(D_INFO, "checking inode %ld empty\n", inode->i_ino);
305                 if (list_empty(obdfs_iplist(inode))) {
306                         CDEBUG(D_INFO, "remove inode %ld from dirty list\n",
307                                inode->i_ino);
308                         tmp = tmp->next;
309                         list_del(obdfs_islist(inode));
310                         /* decrement inode reference for page cache */
311                         atomic_dec(&inode->i_count);
312                         INIT_LIST_HEAD(obdfs_islist(inode));
313                 }
314         }
315         obd_up(&sbi->osi_list_mutex);
316
317         CDEBUG(D_INFO, "flushed %ld pages in total\n", total_io);
318         EXIT;
319 ERR:
320         return err ? err : total_io;
321 } /* obdfs_flush_reqs */
322
323
324 /* Walk all of the superblocks and write out blocks which are too old.
325  * Return the maximum number of blocks written for a single filesystem.
326  */
327 int obdfs_flush_dirty_pages(unsigned long check_time)
328 {
329         struct list_head *sl;
330         int max = 0;
331
332         /*        ENTRY; */
333         sl = &obdfs_super_list;
334         while ( (sl = sl->prev) != &obdfs_super_list ) {
335                 struct obdfs_sb_info *sbi = 
336                         list_entry(sl, struct obdfs_sb_info, osi_list);
337                 int ret;
338
339                 /* walk write requests here, use the sb, check the time */
340                 ret = obdfs_flush_reqs(&sbi->osi_inodes, check_time);
341                 /* XXX handle error?  What to do with it? */
342
343                 max = ret > max ? ret : max;
344         }
345         if (max) { EXIT; }
346         return max;
347 } /* obdfs_flush_dirty_pages */
348
349
350 static void pupdate_wakeup(unsigned long l)
351 {
352         wake_up(&pupdated.waitq);
353 }
354
355
356 static int pupdate(void *unused) 
357 {
358         u_long flags;
359         int interval = pupdated.parms.interval;
360         long age = pupdated.parms.age_buffer;
361         int wrote = 0;
362
363         if (pupdated.active >= 0) {
364                 CDEBUG(D_CACHE, "attempted to run multiple pupdates\n");
365                 return 1;
366         }
367
368         init_timer(&pupdated.timer);
369         init_waitqueue_head(&pupdated.waitq);
370         pupdated.timer.function = pupdate_wakeup;
371         
372         exit_files(current);
373         exit_mm(current);
374         daemonize();
375
376         current->session = 1;
377         current->pgrp = 1;
378         strcpy(current->comm, "pupdated");
379
380         CDEBUG(D_CACHE, "pupdated activated...\n");
381         pupdated.active = 1;
382
383         spin_lock_irqsave(&current->sigmask_lock, flags);
384         flush_signals(current);
385         sigfillset(&current->blocked);
386         recalc_sigpending(current);
387         spin_unlock_irqrestore(&current->sigmask_lock, flags);
388
389         do {
390                 long dirty_limit;
391
392                 /* update interval */
393                 if (pupdated.active == 1 && interval) {
394                         mod_timer(&pupdated.timer, jiffies + interval);
395                         interruptible_sleep_on(&pupdated.waitq);
396                 }
397                 if (pupdated.active == 0) {
398                         del_timer(&pupdated.timer);
399                         /* If stopped, we flush one last time... */
400                 }
401
402                 /* asynchronous setattr etc for the future ...
403                 obdfs_flush_dirty_inodes(jiffies - pupdated.parms.age_super);
404                  */
405                 dirty_limit = nr_free_buffer_pages() * pupdated.parms.nfract / 100;
406
407                 if (obdfs_cache_count > dirty_limit) {
408                         interval = 0;
409                         if (wrote < pupdated.parms.ndirty)
410                                 age >>= 1;
411                         if (wrote) 
412                           CDEBUG(D_CACHE, "wrote %d, age %ld, interval %d\n",
413                                 wrote, age, interval);
414                 } else {
415                         if (wrote < pupdated.parms.ndirty >> 1 &&
416                             obdfs_cache_count < dirty_limit / 2) {
417                                 interval = pupdated.parms.interval;
418                                 age = pupdated.parms.age_buffer;
419                                 if (wrote) 
420                                   CDEBUG(D_INFO,
421                                        "wrote %d, age %ld, interval %d\n",
422                                        wrote, age, interval);
423                         } else if (obdfs_cache_count > dirty_limit / 2) {
424                                 interval >>= 1;
425                                 if (wrote < pupdated.parms.ndirty)
426                                         age >>= 1;
427                                 if (wrote) 
428                                   CDEBUG(D_CACHE,
429                                        "wrote %d, age %ld, interval %d\n",
430                                        wrote, age, interval);
431                         }
432                 }
433
434                 wrote = obdfs_flush_dirty_pages(jiffies - age);
435                 if (wrote) {
436                         CDEBUG(D_CACHE,
437                                "dirty_limit %ld, cache_count %ld, wrote %d\n",
438                                dirty_limit, obdfs_cache_count, wrote);
439                         run_task_queue(&tq_disk);
440                 }
441         } while (pupdated.active == 1);
442
443         CDEBUG(D_CACHE, "pupdated stopped...\n");
444         pupdated.active = -1;
445         wake_up(&pupdated.waitq);
446         return 0;
447 }
448
449
450 int obdfs_flushd_init(void)
451 {
452         /*
453         kernel_thread(bdflush, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
454          */
455         kernel_thread(pupdate, NULL, 0);
456         CDEBUG(D_PSDEV, "flushd inited\n");
457         return 0;
458 }
459
460 int obdfs_flushd_cleanup(void)
461 {
462         ENTRY;
463
464         /* Shut down pupdated. */
465         if (pupdated.active > 0) {
466                 CDEBUG(D_CACHE, "inform pupdated\n");
467                 pupdated.active = 0;
468                 wake_up(&pupdated.waitq);
469
470                 CDEBUG(D_CACHE, "wait for pupdated\n");
471                 while (pupdated.active == 0) {
472                         interruptible_sleep_on(&pupdated.waitq);
473                 }
474                 CDEBUG(D_CACHE, "done waiting for pupdated\n");
475         }               
476
477         EXIT;
478         return 0;
479 }