Whamcloud - gitweb
b56cc8a5ec1227dd4595bf0846d66d3ddb8a11f7
[fs/lustre-release.git] / lustre / obdfs / flushd.c
1 /*
2  * OBDFS Super operations - also used for Lustre file system
3  *
4  *
5  *  Copyright (C) 1991, 1992  Linus Torvalds
 * Copyright (C) 1999 Stelias Computing Inc. <braam@stelias.com>
 * Copyright (C) 1999 Seagate Technology Inc.
8  *
9  */
10 #define __NO_VERSION__
11 #include <linux/fs.h>
12 #include <linux/locks.h>
13 #include <linux/swap.h>
14
15 #include <linux/obd_support.h>
16 #include <linux/obd_class.h>
17 #include <linux/obdfs.h>
18
19
20 /* XXX temporary until the real function is available from kernel
21  * XXX set this to memory size in pages for max page cache size
22  */
23 #define nr_free_buffer_pages() 32768
24
25 struct {
26         int nfract;  /* Percentage of buffer cache dirty to 
27                         activate bdflush */
28         int ndirty;  /* Maximum number of dirty blocks to write out per
29                         wake-cycle */
30         int nrefill; /* Number of clean buffers to try to obtain
31                                 each time we call refill */
32         int nref_dirt; /* Dirty buffer threshold for activating bdflush
33                           when trying to refill buffers. */
34         int interval; /* jiffies delay between pupdate flushes */
35         int age_buffer;  /* Time for normal buffer to age before we flush it */
36         int age_super;  /* Time for superblock to age before we flush it */
37 } pupd_prm = {40, 1024, 64, 256, 1*HZ, 30*HZ, 5*HZ };
38
39 /* Called with the superblock list lock held */
40 static int obdfs_enqueue_pages(struct inode *inode, struct obdo **obdo,
41                                int nr_slots, struct page **pages, char **bufs,
42                                obd_size *counts, obd_off *offsets,
43                                obd_flag *flag, unsigned long check_time)
44 {
45         struct list_head *page_list = obdfs_iplist(inode);
46         struct list_head *tmp;
47         int num = 0;
48
49         ENTRY;
50
51         tmp = page_list;
52         /* Traverse list in reverse order, so we do FIFO, not LIFO order */
53         while ( (tmp = tmp->prev) != page_list && num < nr_slots ) {
54                 struct obdfs_pgrq *req;
55                 struct page *page;
56                 
57                 req = list_entry(tmp, struct obdfs_pgrq, rq_plist);
58                 page = req->rq_page;
59
60                 
61                 if (req->rq_jiffies > check_time)
62                         break;          /* pages are in chronological order */
63
64                 /* Only allocate the obdo if we will actually do I/O here */
65                 if ( !*obdo ) {
66                         OIDEBUG(inode);
67                         *obdo = obdo_fromid(IID(inode), inode->i_ino,
68                                             OBD_MD_FLNOTOBD);
69                         if ( IS_ERR(*obdo) ) {
70                                 int err = PTR_ERR(*obdo);
71                                 *obdo = NULL;
72
73                                 EXIT;
74                                 return err;
75                         }
76
77                         /* FIXME revisit fromid & from_inode */
78                         obdfs_from_inode(*obdo, inode);
79                         *flag = OBD_BRW_CREATE;
80                 }
81
82                 /* Remove request from list before write to avoid conflict.
83                  * Note that obdfs_pgrq_del() also deletes the request.
84                  */
85                 obdfs_pgrq_del(req);
86                 if ( !page ) {
87                         CDEBUG(D_CACHE, "no page \n");
88                         continue;
89                 }
90
91                 bufs[num] = (char *)page_address(page);
92                 pages[num] = page;
93                 counts[num] = PAGE_SIZE;
94                 offsets[num] = ((obd_off)page->index) << PAGE_SHIFT;
95                 CDEBUG(D_INFO, "ENQ inode %ld, page %p addr %p to vector\n", 
96                        inode->i_ino, page, (char *)page_address(page));
97                 num++;
98         }
99
100         if (!list_empty(page_list))
101                 CDEBUG(D_INFO, "inode %ld list not empty\n", inode->i_ino);
102         CDEBUG(D_INFO, "added %d page(s) to vector\n", num);
103
104         EXIT;
105         return num;  
106 } /* obdfs_enqueue_pages */
107
108 /* Dequeue cached pages for a dying inode without writing them to disk. */
109 void obdfs_dequeue_pages(struct inode *inode)
110 {
111         struct list_head *tmp;
112
113         ENTRY;
114         obd_down(&obdfs_i2sbi(inode)->osi_list_mutex);
115         tmp = obdfs_islist(inode);
116         if ( list_empty(tmp) ) {
117                 CDEBUG(D_INFO, "no dirty pages for inode %ld\n", inode->i_ino);
118                 obd_up(&obdfs_i2sbi(inode)->osi_list_mutex);
119                 EXIT;
120                 return;
121         }
122
123         /* take it out of the super list */
124         list_del(tmp);
125         INIT_LIST_HEAD(obdfs_islist(inode));
126
127         tmp = obdfs_iplist(inode);
128         while ( (tmp = tmp->prev) != obdfs_iplist(inode) ) {
129                 struct obdfs_pgrq *req;
130                 struct page *page;
131                 
132                 req = list_entry(tmp, struct obdfs_pgrq, rq_plist);
133                 page = req->rq_page;
134                 /* take it out of the list and free */
135                 obdfs_pgrq_del(req);
136                 /* now put the page away */
137                 put_page(page);
138         }
139
140         obd_up(&obdfs_i2sbi(inode)->osi_list_mutex);
141
142         /* decrement inode reference for page cache */
143         atomic_dec(&inode->i_count);
144         EXIT;
145 }
146
147 /* Remove writeback requests for the superblock */
148 int obdfs_flush_reqs(struct list_head *inode_list, unsigned long check_time)
149 {
150         struct list_head *tmp;
151         unsigned long     max_io, total_io = 0;
152         obd_count         num_io;
153         obd_count         num_obdos;
154         struct inode     *inodes[MAX_IOVEC];    /* write data back to these */
155         struct page      *pages[MAX_IOVEC];     /* call put_page on these */
156         struct obdo      *obdos[MAX_IOVEC];
157         char             *bufs[MAX_IOVEC];
158         obd_size          counts[MAX_IOVEC];
159         obd_off           offsets[MAX_IOVEC];
160         obd_flag          flags[MAX_IOVEC];
161         obd_count         bufs_per_obdo[MAX_IOVEC];
162         int               err = 0;
163         struct obdfs_sb_info *sbi;
164
165         ENTRY;
166         if (!inode_list) {
167                 CDEBUG(D_INODE, "no list\n");
168                 EXIT;
169                 return 0;
170         }
171
172         sbi = list_entry(inode_list, struct obdfs_sb_info, osi_inodes);
173
174         obd_down(&sbi->osi_list_mutex);
175         if ( list_empty(inode_list) ) {
176                 CDEBUG(D_INFO, "list empty\n");
177                 obd_up(&sbi->osi_list_mutex);
178                 EXIT;
179                 return 0;
180         }
181
182         /* If we are forcing a write, write out all dirty pages */
183         max_io = check_time == ~0UL ? 1<<31 : pupd_prm.ndirty;
184         CDEBUG(D_INFO, "max_io = %lu\n", max_io);
185
186         /* Add each inode's dirty pages to a write vector, and write it.
187          * Traverse list in reverse order, so we do FIFO, not LIFO order
188          */
189  again:
190         tmp = inode_list;
191         num_io = 0;
192         num_obdos = 0;
193         while ( (tmp = tmp->prev) != inode_list && total_io < max_io) {
194                 struct obdfs_inode_info *ii;
195                 struct inode *inode;
196                 int res;
197
198                 ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes);
199                 inode = list_entry(ii, struct inode, u);
200                 inodes[num_obdos] = inode;
201                 obdos[num_obdos] = NULL;
202                 CDEBUG(D_INFO, "checking inode %ld pages\n", inode->i_ino);
203
204                 /* Make sure we reference "inode" and not "inodes[num_obdos]",
205                  * as num_obdos will change after the loop is run.
206                  */
207                 if (!list_empty(obdfs_iplist(inode))) {
208                         res = obdfs_enqueue_pages(inode, &obdos[num_obdos],
209                                                   MAX_IOVEC - num_io,
210                                                   &pages[num_io], &bufs[num_io],
211                                                   &counts[num_io],
212                                                   &offsets[num_io],
213                                                   &flags[num_obdos],
214                                                   check_time);
215                         CDEBUG(D_INFO, "FLUSH inode %ld, pages flushed: %d\n",
216                                inode->i_ino, res);
217                         if ( res < 0 ) {
218                                 CDEBUG(D_INODE,
219                                        "fatal: unable to enqueue inode %ld (err %d)\n",
220                                        inode->i_ino, res);
221                                 /* XXX Move bad inode to end of list so we can
222                                  * continue with flushing list.  This is a
223                                  * temporary measure to avoid machine lockups.
224                                  * Maybe if we have -ENOENT, simply discard.
225                                  */
226                                 list_del(tmp);
227                                 list_add(tmp, inode_list);
228                                 err = res;
229                                 EXIT;
230                                 goto BREAK;
231                         }
232                         if (res == 0)
233                                 continue;
234
235                         num_io += res;
236                         total_io += res;
237                         bufs_per_obdo[num_obdos] = res;
238                         num_obdos++;
239
240                         if ( num_io == MAX_IOVEC ) {
241                                 obd_up(&sbi->osi_list_mutex);
242                                 err = obdfs_do_vec_wr(inodes, num_io, num_obdos,
243                                                       obdos, bufs_per_obdo,
244                                                       pages, bufs, counts,
245                                                       offsets, flags);
246                                 if ( err ) {
247                                         CDEBUG(D_INODE,
248                                                "fatal: do_vec_wr err=%d\n",
249                                                err);
250                                         EXIT;
251                                         goto ERR;
252                                 }
253                                 obd_down(&sbi->osi_list_mutex);
254                                 goto again;
255                         }
256                 }
257         }
258
259 BREAK:
260         obd_up(&sbi->osi_list_mutex);
261
262         /* flush any remaining I/Os */
263         if ( num_io ) {
264                 err = obdfs_do_vec_wr(inodes, num_io, num_obdos, obdos,
265                                       bufs_per_obdo, pages, bufs, counts,
266                                       offsets, flags);
267                 if (err)
268                         CDEBUG(D_INODE, "fatal: unable to do vec_wr (err %d)\n", err);
269                 num_io = 0;
270                 num_obdos = 0;
271         }
272
273         /* Remove inode from superblock dirty list when no more pages.
274          * Make sure we don't point at the current inode with tmp
275          * when we re-init the list on the inode, or we will loop.
276          */
277         obd_down(&sbi->osi_list_mutex);
278         tmp = inode_list;
279         while ( (tmp = tmp->prev) != inode_list ) {
280                 struct obdfs_inode_info *ii;
281                 struct inode *inode;
282
283                 ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes);
284                 inode = list_entry(ii, struct inode, u);
285                 CDEBUG(D_INFO, "checking inode %ld empty\n", inode->i_ino);
286                 if (list_empty(obdfs_iplist(inode))) {
287                         CDEBUG(D_INFO, "remove inode %ld from dirty list\n",
288                                inode->i_ino);
289                         tmp = tmp->next;
290                         list_del(obdfs_islist(inode));
291                         /* decrement inode reference for page cache */
292                         atomic_dec(&inode->i_count);
293                         INIT_LIST_HEAD(obdfs_islist(inode));
294                 }
295         }
296         obd_up(&sbi->osi_list_mutex);
297
298         CDEBUG(D_INFO, "flushed %ld pages in total\n", total_io);
299         EXIT;
300 ERR:
301         return err ? err : total_io;
302 } /* obdfs_flush_reqs */
303
304
305 /* Walk all of the superblocks and write out blocks which are too old.
306  * Return the maximum number of blocks written for a single filesystem.
307  */
308 int obdfs_flush_dirty_pages(unsigned long check_time)
309 {
310         struct list_head *sl;
311         int max = 0;
312
313         ENTRY;
314         sl = &obdfs_super_list;
315         while ( (sl = sl->prev) != &obdfs_super_list ) {
316                 struct obdfs_sb_info *sbi = 
317                         list_entry(sl, struct obdfs_sb_info, osi_list);
318                 int ret;
319
320                 /* walk write requests here, use the sb, check the time */
321                 ret = obdfs_flush_reqs(&sbi->osi_inodes, check_time);
322                 /* XXX handle error?  What to do with it? */
323
324                 max = ret > max ? ret : max;
325         }
326         EXIT;
327         return max;
328 } /* obdfs_flush_dirty_pages */
329
330
331 static struct task_struct *pupdated;
332
333 static int pupdate(void *unused) 
334 {
335         int interval = pupd_prm.interval;
336         long age = pupd_prm.age_buffer;
337         int wrote = 0;
338         
339         exit_files(current);
340         exit_mm(current);
341
342         pupdated = current;
343         pupdated->session = 1;
344         pupdated->pgrp = 1;
345         strcpy(pupdated->comm, "pupdated");
346
347         printk("pupdated activated...\n");
348
349         spin_lock_irq(&pupdated->sigmask_lock);
350         sigfillset(&pupdated->blocked);
351         siginitsetinv(&pupdated->blocked, sigmask(SIGTERM));
352         recalc_sigpending(pupdated);
353         spin_unlock_irq(&pupdated->sigmask_lock);
354
355         for (;;) {
356                 long dirty_limit;
357
358                 /* update interval */
359                 if (interval) {
360                         set_task_state(pupdated, TASK_INTERRUPTIBLE);
361                         schedule_timeout(interval);
362                 }
363                 if (signal_pending(pupdated))
364                 {
365                         int stopped = 0;
366                         spin_lock_irq(&pupdated->sigmask_lock);
367                         if (sigismember(&pupdated->pending.signal, SIGTERM))
368                         {
369                                 sigdelset(&pupdated->pending.signal, SIGTERM);
370                                 stopped = 1;
371                         }
372                         recalc_sigpending(pupdated);
373                         spin_unlock_irq(&pupdated->sigmask_lock);
374                         if (stopped) {
375                                 printk("pupdated stopped...\n");
376                                 set_task_state(pupdated, TASK_STOPPED);
377                                 pupdated = NULL;
378                                 return 0;
379                         }
380                 }
381                 /* asynchronous setattr etc for the future ...
382                 obdfs_flush_dirty_inodes(jiffies - pupd_prm.age_super);
383                  */
384                 dirty_limit = nr_free_buffer_pages() * pupd_prm.nfract / 100;
385
386                 if (obdfs_cache_count > dirty_limit) {
387                         interval = 0;
388                         if ( wrote < pupd_prm.ndirty )
389                                 age >>= 1;
390                         CDEBUG(D_CACHE, "wrote %d, age %ld, interval %d\n",
391                                 wrote, age, interval);
392                 } else {
393                         if ( wrote < pupd_prm.ndirty >> 1 &&
394                              obdfs_cache_count < dirty_limit / 2) {
395                                 interval = pupd_prm.interval;
396                                 age = pupd_prm.age_buffer;
397                                 CDEBUG(D_INFO,
398                                        "wrote %d, age %ld, interval %d\n",
399                                        wrote, age, interval);
400                         } else if (obdfs_cache_count > dirty_limit / 2) {
401                                 interval >>= 1;
402                                 if ( wrote < pupd_prm.ndirty )
403                                         age >>= 1;
404                                 CDEBUG(D_CACHE,
405                                        "wrote %d, age %ld, interval %d\n",
406                                        wrote, age, interval);
407                         }
408                 }
409
410                 wrote = obdfs_flush_dirty_pages(jiffies - age);
411                 if (wrote)
412                         CDEBUG(D_CACHE,
413                                "dirty_limit %ld, cache_count %ld, wrote %d\n",
414                                dirty_limit, obdfs_cache_count, wrote);
415         }
416 }
417
418
419 int obdfs_flushd_init(void)
420 {
421         /*
422         kernel_thread(bdflush, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
423          */
424         kernel_thread(pupdate, NULL, 0);
425         CDEBUG(D_PSDEV, __FUNCTION__ ": flushd inited\n");
426         return 0;
427 }
428
429 int obdfs_flushd_cleanup(void)
430 {
431         ENTRY;
432
433         if (pupdated) /* for debugging purposes only */
434                 CDEBUG(D_CACHE, "pupdated->state = %lx\n", pupdated->state);
435
436         /* deliver a signal to pupdated to shut it down */
437         if (pupdated && (pupdated->state == TASK_RUNNING ||
438                          pupdated->state == TASK_INTERRUPTIBLE )) {
439                 unsigned long timeout = HZ/20;
440                 unsigned long count = 0;
441                 send_sig_info(SIGTERM, (struct siginfo *)1, pupdated);
442                 while (pupdated) {
443                         if ((count % 2*HZ) == timeout)
444                                 printk(KERN_INFO "wait for pupdated to stop\n");
445                         count += timeout;
446                         set_current_state(TASK_INTERRUPTIBLE);
447                         schedule_timeout(timeout);
448                 }
449         }
450
451         EXIT;
452         /* not reached */
453         return 0;
454
455 }