Whamcloud - gitweb
obdfs/flushd.c: added constant for nr_free_buffer_pages() function call as
[fs/lustre-release.git] / lustre / obdfs / flushd.c
1 /*
2  * OBDFS Super operations - also used for Lustre file system
3  *
4  *
5  *  Copyright (C) 1991, 1992  Linus Torvalds
6  * Copyright (C) 1999 Stelias Computing Inc. <braam@stelias.com>
7  * Copyright (C) 1999 Seagate Technology Inc.
8  *
9  */
10 #define __NO_VERSION__
11 #include <linux/fs.h>
12 #include <linux/locks.h>
13 #include <linux/swap.h>
14
15 #include <linux/obd_support.h>
16 #include <linux/obd_class.h>
17 #include <linux/obdfs.h>
18
19
20 /* XXX temporary until the real function is available from kernel
21  * XXX set this to memory size in pages for max page cache size
22  */
23 #define nr_free_buffer_pages() 32768
24
/* Writeback tunables, modelled on the kernel bdflush/bdf_prm parameters.
 * Initialized to {40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ} below; only
 * nfract, ndirty, interval and age_buffer are read in this file.
 */
struct {
        int nfract;  /* Percentage of buffer cache dirty to 
                        activate bdflush */
        int ndirty;  /* Maximum number of dirty blocks to write out per
                        wake-cycle */
        int nrefill; /* Number of clean buffers to try to obtain
                                each time we call refill */
        int nref_dirt; /* Dirty buffer threshold for activating bdflush
                          when trying to refill buffers. */
        int interval; /* jiffies delay between pupdate flushes */
        int age_buffer;  /* Time for normal buffer to age before we flush it */
        int age_super;  /* Time for superblock to age before we flush it */
} pupd_prm = {40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ };
38
39 /* Called with the superblock list lock */
40 static int obdfs_enqueue_pages(struct inode *inode, struct obdo **obdo,
41                                int nr_slots, struct page **pages, char **bufs,
42                                obd_size *counts, obd_off *offsets,
43                                obd_flag *flag, unsigned long check_time)
44 {
45         struct list_head *page_list = obdfs_iplist(inode);
46         struct list_head *tmp;
47         int num = 0;
48
49         ENTRY;
50
51         tmp = page_list;
52         /* Traverse list in reverse order, so we do FIFO, not LIFO order */
53         while ( (tmp = tmp->prev) != page_list && num < nr_slots ) {
54                 struct obdfs_pgrq *req;
55                 struct page *page;
56                 
57                 req = list_entry(tmp, struct obdfs_pgrq, rq_plist);
58                 page = req->rq_page;
59
60                 
61                 if (req->rq_jiffies > check_time)
62                         break;          /* pages are in chronological order */
63
64                 /* Only allocate the obdo if we will actually do I/O here */
65                 if ( !*obdo ) {
66                         OIDEBUG(inode);
67                         *obdo = obdo_fromid(IID(inode), inode->i_ino,
68                                             OBD_MD_FLNOTOBD);
69                         if ( IS_ERR(*obdo) ) {
70                                 int err = PTR_ERR(*obdo);
71                                 *obdo = NULL;
72
73                                 EXIT;
74                                 return err;
75                         }
76
77                         /* FIXME revisit fromid & from_inode */
78                         obdfs_from_inode(*obdo, inode);
79                         *flag = OBD_BRW_CREATE;
80                 }
81
82                 /* Remove request from list before write to avoid conflict.
83                  * Note that obdfs_pgrq_del() also deletes the request.
84                  */
85                 obdfs_pgrq_del(req);
86                 if ( !page ) {
87                         CDEBUG(D_CACHE, "no page \n");
88                         continue;
89                 }
90
91                 bufs[num] = (char *)page_address(page);
92                 pages[num] = page;
93                 counts[num] = PAGE_SIZE;
94                 offsets[num] = ((obd_off)page->index) << PAGE_SHIFT;
95                 CDEBUG(D_INFO, "ENQ inode %ld, page %p addr %p to vector\n", 
96                        inode->i_ino, page, (char *)page_address(page));
97                 num++;
98         }
99
100         if (!list_empty(page_list))
101                 CDEBUG(D_INFO, "inode %ld list not empty\n", inode->i_ino);
102         CDEBUG(D_INFO, "added %d page(s) to vector\n", num);
103
104         EXIT;
105         return num;  
106 } /* obdfs_enqueue_pages */
107
108 /* Dequeue cached pages for a dying inode without writing them to disk. */
109 void obdfs_dequeue_pages(struct inode *inode)
110 {
111         struct list_head *tmp;
112
113         obd_down(&obdfs_i2sbi(inode)->osi_list_mutex);
114         tmp = obdfs_islist(inode);
115         if ( list_empty(tmp) ) {
116                 CDEBUG(D_INFO, "no dirty pages for inode %ld\n", inode->i_ino);
117                 obd_up(&obdfs_i2sbi(inode)->osi_list_mutex);
118                 EXIT;
119                 return;
120         }
121
122         /* take it out of the super list */
123         list_del(tmp);
124         INIT_LIST_HEAD(obdfs_islist(inode));
125
126         tmp = obdfs_iplist(inode);
127         while ( (tmp = tmp->prev) != obdfs_iplist(inode) ) {
128                 struct obdfs_pgrq *req;
129                 struct page *page;
130                 
131                 req = list_entry(tmp, struct obdfs_pgrq, rq_plist);
132                 page = req->rq_page;
133                 /* take it out of the list and free */
134                 obdfs_pgrq_del(req);
135                 /* now put the page away */
136                 put_page(page);
137         }
138
139         obd_up(&obdfs_i2sbi(inode)->osi_list_mutex);
140
141         /* decrement inode reference for page cache */
142         inode->i_count--;
143 }
144
145 /* Remove writeback requests for the superblock */
146 int obdfs_flush_reqs(struct list_head *inode_list, unsigned long check_time)
147 {
148         struct list_head *tmp;
149         int               total_io = 0;
150         obd_count         num_io;
151         obd_count         num_obdos;
152         struct inode     *inodes[MAX_IOVEC];    /* write data back to these */
153         struct page      *pages[MAX_IOVEC];     /* call put_page on these */
154         struct obdo      *obdos[MAX_IOVEC];
155         char             *bufs[MAX_IOVEC];
156         obd_size          counts[MAX_IOVEC];
157         obd_off           offsets[MAX_IOVEC];
158         obd_flag          flags[MAX_IOVEC];
159         obd_count         bufs_per_obdo[MAX_IOVEC];
160         int               err = 0;
161         struct obdfs_sb_info *sbi;
162
163         ENTRY;
164         if (!inode_list) {
165                 CDEBUG(D_INODE, "no list\n");
166                 EXIT;
167                 return 0;
168         }
169
170         sbi = list_entry(inode_list, struct obdfs_sb_info, osi_inodes);
171
172         obd_down(&sbi->osi_list_mutex);
173         if ( list_empty(inode_list) ) {
174                 CDEBUG(D_CACHE, "list empty: memory %ld\n", obd_memory);
175                 obd_up(&sbi->osi_list_mutex);
176                 EXIT;
177                 return 0;
178         }
179
180         /* Add each inode's dirty pages to a write vector, and write it.
181          * Traverse list in reverse order, so we do FIFO, not LIFO order
182          */
183  again:
184         tmp = inode_list;
185         num_io = 0;
186         num_obdos = 0;
187         while ( (tmp = tmp->prev) != inode_list && total_io < pupd_prm.ndirty) {
188                 struct obdfs_inode_info *ii;
189                 struct inode *inode;
190                 int res;
191
192                 ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes);
193                 inode = list_entry(ii, struct inode, u);
194                 inodes[num_obdos] = inode;
195                 obdos[num_obdos] = NULL;
196                 CDEBUG(D_INFO, "checking inode %ld pages\n", inode->i_ino);
197
198                 /* Make sure we reference "inode" and not "inodes[num_obdos]",
199                  * as num_obdos will change after the loop is run.
200                  */
201                 if (!list_empty(obdfs_iplist(inode))) {
202                         res = obdfs_enqueue_pages(inode, &obdos[num_obdos],
203                                                   MAX_IOVEC - num_io,
204                                                   &pages[num_io], &bufs[num_io],
205                                                   &counts[num_io],
206                                                   &offsets[num_io],
207                                                   &flags[num_obdos],
208                                                   check_time);
209                         CDEBUG(D_INFO, "FLUSH inode %ld, pages flushed: %d\n",
210                                inode->i_ino, res);
211                         if ( res < 0 ) {
212                                 CDEBUG(D_INODE,
213                                        "fatal: unable to enqueue inode %ld (err %d)\n",
214                                        inode->i_ino, err);
215                                 /* XXX Move bad inode to end of list so we can
216                                  * continue with flushing list.  This is a
217                                  * temporary measure to avoid machine lockups.
218                                  */
219                                 list_del(tmp);
220                                 list_add(tmp, inode_list);
221                                 err = res;
222                                 EXIT;
223                                 goto BREAK;
224                         } else if (res) {
225                                 num_io += res;
226                                 total_io += res;
227                                 bufs_per_obdo[num_obdos] = res;
228                                 num_obdos++;
229                         }
230
231                         if ( num_io == MAX_IOVEC ) {
232                                 obd_up(&sbi->osi_list_mutex);
233                                 err = obdfs_do_vec_wr(inodes, num_io, num_obdos,
234                                                       obdos, bufs_per_obdo,
235                                                       pages, bufs, counts,
236                                                       offsets, flags);
237                                 if ( err ) {
238                                         CDEBUG(D_INODE,
239                                                 "fatal: unable to do vec_wr (err %d)\n", err);
240                                         EXIT;
241                                         goto ERR;
242                                 }
243                                 obd_down(&sbi->osi_list_mutex);
244                                 goto again;
245                         }
246                 }
247         }
248
249 BREAK:
250         obd_up(&sbi->osi_list_mutex);
251
252         /* flush any remaining I/Os */
253         if ( num_io ) {
254                 err = obdfs_do_vec_wr(inodes, num_io, num_obdos, obdos,
255                                       bufs_per_obdo, pages, bufs, counts,
256                                       offsets, flags);
257                 if (err)
258                         CDEBUG(D_INODE, "fatal: unable to do vec_wr (err %d)\n", err);
259                 num_io = 0;
260                 num_obdos = 0;
261         }
262
263         /* Remove inode from superblock dirty list when no more pages.
264          * Make sure we don't point at the current inode with tmp
265          * when we re-init the list on the inode, or we will loop.
266          */
267         obd_down(&sbi->osi_list_mutex);
268         tmp = inode_list;
269         while ( (tmp = tmp->prev) != inode_list ) {
270                 struct obdfs_inode_info *ii;
271                 struct inode *inode;
272
273                 ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes);
274                 inode = list_entry(ii, struct inode, u);
275                 CDEBUG(D_INFO, "checking inode %ld empty\n", inode->i_ino);
276                 if (list_empty(obdfs_iplist(inode))) {
277                         CDEBUG(D_INFO, "remove inode %ld from dirty list\n",
278                                inode->i_ino);
279                         tmp = tmp->next;
280                         list_del(obdfs_islist(inode));
281                         /* decrement inode reference for page cache */
282                         inode->i_count--;
283                         INIT_LIST_HEAD(obdfs_islist(inode));
284                 }
285         }
286         obd_up(&sbi->osi_list_mutex);
287
288         CDEBUG(D_INFO, "flushed %d pages in total\n", total_io);
289         EXIT;
290 ERR:
291         return err ? err : total_io;
292 } /* obdfs_flush_reqs */
293
294
295 /* Walk all of the superblocks and write out blocks which are too old.
296  * Return the maximum number of blocks written for a single filesystem.
297  */
298 int obdfs_flush_dirty_pages(unsigned long check_time)
299 {
300         struct list_head *sl;
301         int max = 0;
302
303         ENTRY;
304         sl = &obdfs_super_list;
305         while ( (sl = sl->prev) != &obdfs_super_list ) {
306                 struct obdfs_sb_info *sbi = 
307                         list_entry(sl, struct obdfs_sb_info, osi_list);
308                 int ret;
309
310                 /* walk write requests here, use the sb, check the time */
311                 ret = obdfs_flush_reqs(&sbi->osi_inodes, check_time);
312                 /* XXX handle error?  What to do with it? */
313
314                 max = ret > max ? ret : max;
315         }
316         EXIT;
317         return max;
318 } /* obdfs_flush_dirty_pages */
319
320
321 static struct task_struct *pupdated;
322
/* Kernel-thread main loop: a pupdate/bdflush look-alike for obdfs.
 *
 * Sleeps for "interval" jiffies, then writes back dirty pages older
 * than "age" jiffies via obdfs_flush_dirty_pages().  Both knobs start
 * at the pupd_prm defaults and are tightened or relaxed depending on
 * how full the page cache is relative to the nfract dirty limit.
 * Exits when it receives SIGTERM (sent by obdfs_flushd_cleanup()).
 */
static int pupdate(void *unused) 
{
        int interval = pupd_prm.interval;
        long age = pupd_prm.age_buffer;
        int wrote = 0;
        
        /* detach from the spawner's open files and address space */
        exit_files(current);
        exit_mm(current);

        pupdated = current;
        pupdated->session = 1;
        pupdated->pgrp = 1;
        strcpy(pupdated->comm, "pupdated");

        printk("pupdated activated...\n");

        /* block every signal except SIGTERM, our shutdown request */
        spin_lock_irq(&pupdated->sigmask_lock);
        sigfillset(&pupdated->blocked);
        siginitsetinv(&pupdated->blocked, sigmask(SIGTERM));
        recalc_sigpending(pupdated);
        spin_unlock_irq(&pupdated->sigmask_lock);

        for (;;) {
                long dirty_limit;

                /* update interval */
                if (interval) {
                        set_task_state(pupdated, TASK_INTERRUPTIBLE);
                        schedule_timeout(interval);
                }
                /* SIGTERM means stop; any other pending signal is cleared */
                if (signal_pending(pupdated))
                {
                        int stopped = 0;
                        spin_lock_irq(&pupdated->sigmask_lock);
                        if (sigismember(&pupdated->signal, SIGTERM))
                        {
                                sigdelset(&pupdated->signal, SIGTERM);
                                stopped = 1;
                        }
                        recalc_sigpending(pupdated);
                        spin_unlock_irq(&pupdated->sigmask_lock);
                        if (stopped) {
                                printk("pupdated stopped...\n");
                                set_task_state(pupdated, TASK_STOPPED);
                                /* cleanup polls this pointer to see us exit */
                                pupdated = NULL;
                                return 0;
                        }
                }
                /* asynchronous setattr etc for the future ...
                obdfs_flush_dirty_inodes(jiffies - pupd_prm.age_super);
                 */
                /* flush when dirty pages exceed nfract% of the page cache */
                dirty_limit = nr_free_buffer_pages() * pupd_prm.nfract / 100;
                CDEBUG(D_CACHE, "dirty_limit %ld, cache_count %ld, wrote %d\n",
                       dirty_limit, obdfs_cache_count, wrote);

                if (obdfs_cache_count > dirty_limit) {
                        /* over the limit: flush again immediately, and halve
                         * the age cutoff if the last pass wrote little */
                        interval = 0;
                        if ( wrote < pupd_prm.ndirty )
                                age >>= 1;
                        CDEBUG(D_CACHE, "age %ld, interval %d\n",
                                age, interval);
                } else {
                        if ( wrote < pupd_prm.ndirty >> 1 &&
                             obdfs_cache_count < dirty_limit / 2) {
                                /* mostly clean: relax back to the defaults */
                                interval = pupd_prm.interval;
                                age = pupd_prm.age_buffer;
                        } else if (obdfs_cache_count > dirty_limit / 2) {
                                /* getting dirty: tighten interval and age */
                                interval >>= 1;
                                if ( wrote < pupd_prm.ndirty )
                                        age >>= 1;
                                CDEBUG(D_CACHE, "age %ld, interval %d\n",
                                       age, interval);
                        }
                }

                wrote = obdfs_flush_dirty_pages(jiffies - age);
        }
}
401
402
403 int obdfs_flushd_init(void)
404 {
405         /*
406         kernel_thread(bdflush, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
407          */
408         kernel_thread(pupdate, NULL, 0);
409         CDEBUG(D_PSDEV, __FUNCTION__ ": flushd inited\n");
410         return 0;
411 }
412
413 int obdfs_flushd_cleanup(void)
414 {
415         ENTRY;
416
417         if (pupdated) /* for debugging purposes only */
418                 CDEBUG(D_CACHE, "pupdated->state = %lx\n", pupdated->state);
419
420         /* deliver a signal to pupdated to shut it down */
421         if (pupdated && (pupdated->state == TASK_RUNNING ||
422                          pupdated->state == TASK_INTERRUPTIBLE )) {
423                 unsigned long timeout = HZ/20;
424                 unsigned long count = 0;
425                 send_sig_info(SIGTERM, (struct siginfo *)1, pupdated);
426                 while (pupdated) {
427                         if ((count % 2*HZ) == timeout)
428                                 printk(KERN_INFO "wait for pupdated to stop\n");
429                         count += timeout;
430                         set_current_state(TASK_INTERRUPTIBLE);
431                         schedule_timeout(timeout);
432                 }
433         }
434
435         EXIT;
436         /* not reached */
437         return 0;
438
439 }