/*
 * lustre/obdfs/flushd.c
 *
 * OBDFS Super operations - also used for Lustre file system
 *
 * This code is issued under the GNU General Public License.
 * See the file COPYING in this distribution.
 *
 * Copyright (C) 1991, 1992  Linus Torvalds
 * Copyright (C) 1999 Stelias Computing Inc. <braam@stelias.com>
 * Copyright (C) 1999 Seagate Technology Inc.
 */
#define __NO_VERSION__
#include <linux/fs.h>
#include <linux/locks.h>
#include <linux/swap.h>

#include <linux/obd_support.h>
#include <linux/obd_class.h>
#include <linux/obdfs.h>


/* XXX temporary until the real function is available from the kernel
 * XXX set this to memory size in pages for max page cache size
 */
#define nr_free_buffer_pages() 32768
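/* Note: assuming 4 KB pages, this stub corresponds to 32768 * 4 KB = 128 MB
 * of page cache; it should go away once the real nr_free_buffer_pages() is
 * available from the kernel, as the XXX above says. */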

struct {
        int nfract;     /* Percentage of buffer cache dirty to
                           activate bdflush */
        int ndirty;     /* Maximum number of dirty blocks to write out per
                           wake-cycle */
        int nrefill;    /* Number of clean buffers to try to obtain
                           each time we call refill */
        int nref_dirt;  /* Dirty buffer threshold for activating bdflush
                           when trying to refill buffers. */
        int interval;   /* jiffies delay between pupdate flushes */
        int age_buffer; /* Time for normal buffer to age before we flush it */
        int age_super;  /* Time for superblock to age before we flush it */
} pupd_prm = {40, 1024, 64, 256, 1*HZ, 30*HZ, 5*HZ };
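/* Defaults above, in field order: nfract 40%, ndirty 1024 blocks per
 * wake-cycle, nrefill 64, nref_dirt 256, interval 1 s, age_buffer 30 s,
 * age_super 5 s (the last three expressed in jiffies via HZ). */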

/* Called with the superblock list lock held. */
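/* Moves up to nr_slots pages that were queued at or before check_time from
 * the inode's dirty-page list into the caller's I/O vector (pages/bufs/
 * counts/offsets).  Returns the number of pages enqueued, or a negative
 * errno if the obdo could not be allocated, in which case *obdo stays NULL. */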
static int obdfs_enqueue_pages(struct inode *inode, struct obdo **obdo,
                               int nr_slots, struct page **pages, char **bufs,
                               obd_size *counts, obd_off *offsets,
                               obd_flag *flag, unsigned long check_time)
{
        struct list_head *page_list = obdfs_iplist(inode);
        struct list_head *tmp;
        int num = 0;

        ENTRY;

        tmp = page_list;
        /* Traverse list in reverse order, so we do FIFO, not LIFO order */
        while ( (tmp = tmp->prev) != page_list && num < nr_slots ) {
                struct obdfs_pgrq *req;
                struct page *page;

                req = list_entry(tmp, struct obdfs_pgrq, rq_plist);
                page = req->rq_page;

                if (req->rq_jiffies > check_time)
                        break;          /* pages are in chronological order */

                /* Only allocate the obdo if we will actually do I/O here */
                if ( !*obdo ) {
                        OIDEBUG(inode);
                        *obdo = obdo_fromid(IID(inode), inode->i_ino,
                                            OBD_MD_FLNOTOBD);
                        if ( IS_ERR(*obdo) ) {
                                int err = PTR_ERR(*obdo);
                                *obdo = NULL;

                                EXIT;
                                return err;
                        }

                        /* FIXME revisit fromid & from_inode */
                        obdfs_from_inode(*obdo, inode);
                        *flag = OBD_BRW_CREATE;
                }

                /* Remove request from list before write to avoid conflict.
                 * Note that obdfs_pgrq_del() also frees the request.
                 */
                obdfs_pgrq_del(req);
                if ( !page ) {
                        CDEBUG(D_CACHE, "no page\n");
                        continue;
                }

                bufs[num] = (char *)page_address(page);
                pages[num] = page;
                counts[num] = PAGE_SIZE;
                /* byte offset of this page within the object */
                offsets[num] = ((obd_off)page->index) << PAGE_SHIFT;
                CDEBUG(D_INFO, "ENQ inode %ld, page %p addr %p to vector\n",
                       inode->i_ino, page, (char *)page_address(page));
                num++;
        }

        if (!list_empty(page_list))
                CDEBUG(D_INFO, "inode %ld list not empty\n", inode->i_ino);
        CDEBUG(D_INFO, "added %d page(s) to vector\n", num);

        EXIT;
        return num;
} /* obdfs_enqueue_pages */

/* Dequeue cached pages for a dying inode without writing them to disk. */
void obdfs_dequeue_pages(struct inode *inode)
{
        struct list_head *tmp;

        ENTRY;
        obd_down(&obdfs_i2sbi(inode)->osi_list_mutex);
        tmp = obdfs_islist(inode);
        if ( list_empty(tmp) ) {
                CDEBUG(D_INFO, "no dirty pages for inode %ld\n", inode->i_ino);
                obd_up(&obdfs_i2sbi(inode)->osi_list_mutex);
                EXIT;
                return;
        }

        /* take it out of the super list */
        list_del(tmp);
        INIT_LIST_HEAD(obdfs_islist(inode));

        tmp = obdfs_iplist(inode);
        while ( (tmp = tmp->prev) != obdfs_iplist(inode) ) {
                struct obdfs_pgrq *req;
                struct page *page;

                req = list_entry(tmp, struct obdfs_pgrq, rq_plist);
                page = req->rq_page;
                /* take it out of the list and free */
                obdfs_pgrq_del(req);
                /* now put the page away */
                put_page(page);
        }

        obd_up(&obdfs_i2sbi(inode)->osi_list_mutex);

        /* decrement inode reference for page cache */
        atomic_dec(&inode->i_count);
        EXIT;
}

/* Write out and dequeue the writeback requests queued on this superblock */
int obdfs_flush_reqs(struct list_head *inode_list, unsigned long check_time)
{
        struct list_head *tmp;
        unsigned long     max_io, total_io = 0;
        obd_count         num_io;
        obd_count         num_obdos;
        struct inode     *inodes[MAX_IOVEC];    /* write data back to these */
        struct page      *pages[MAX_IOVEC];     /* call put_page on these */
        struct obdo      *obdos[MAX_IOVEC];
        char             *bufs[MAX_IOVEC];
        obd_size          counts[MAX_IOVEC];
        obd_off           offsets[MAX_IOVEC];
        obd_flag          flags[MAX_IOVEC];
        obd_count         bufs_per_obdo[MAX_IOVEC];
        int               err = 0;
        struct obdfs_sb_info *sbi;

        ENTRY;
        if (!inode_list) {
                CDEBUG(D_INODE, "no list\n");
                EXIT;
                return 0;
        }

        sbi = list_entry(inode_list, struct obdfs_sb_info, osi_inodes);

        obd_down(&sbi->osi_list_mutex);
        if ( list_empty(inode_list) ) {
                CDEBUG(D_INFO, "list empty\n");
                obd_up(&sbi->osi_list_mutex);
                EXIT;
                return 0;
        }

        /* If we are forcing a write, write out all dirty pages */
        max_io = check_time == ~0UL ? 1UL << 31 : pupd_prm.ndirty;
        CDEBUG(D_INFO, "max_io = %lu\n", max_io);

        /* Add each inode's dirty pages to a write vector, and write it.
         * Traverse list in reverse order, so we do FIFO, not LIFO order
         */
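        /* Flow of the loop below: up to MAX_IOVEC pages are gathered at a
         * time; when the vector fills, osi_list_mutex is dropped, the batch
         * is written out via obdfs_do_vec_wr(), and the scan restarts at
         * "again" with an empty vector.
         */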
 again:
        tmp = inode_list;
        num_io = 0;
        num_obdos = 0;
        while ( (tmp = tmp->prev) != inode_list && total_io < max_io) {
                struct obdfs_inode_info *ii;
                struct inode *inode;
                int res;

                ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes);
                inode = list_entry(ii, struct inode, u);
                inodes[num_obdos] = inode;
                obdos[num_obdos] = NULL;
                CDEBUG(D_INFO, "checking inode %ld pages\n", inode->i_ino);

                /* Make sure we reference "inode" and not "inodes[num_obdos]",
                 * as num_obdos will change after the loop is run.
                 */
                if (!list_empty(obdfs_iplist(inode))) {
                        res = obdfs_enqueue_pages(inode, &obdos[num_obdos],
                                                  MAX_IOVEC - num_io,
                                                  &pages[num_io], &bufs[num_io],
                                                  &counts[num_io],
                                                  &offsets[num_io],
                                                  &flags[num_obdos],
                                                  check_time);
                        CDEBUG(D_INFO, "FLUSH inode %ld, pages flushed: %d\n",
                               inode->i_ino, res);
                        if ( res < 0 ) {
                                CDEBUG(D_INODE,
                                       "fatal: unable to enqueue inode %ld (err %d)\n",
                                       inode->i_ino, res);
                                /* XXX Move bad inode to end of list so we can
                                 * continue with flushing list.  This is a
                                 * temporary measure to avoid machine lockups.
                                 * Maybe if we have -ENOENT, simply discard.
                                 */
                                list_del(tmp);
                                list_add(tmp, inode_list);
                                err = res;
                                EXIT;
                                goto BREAK;
                        }
                        if (res == 0)
                                continue;

                        num_io += res;
                        total_io += res;
                        bufs_per_obdo[num_obdos] = res;
                        num_obdos++;

                        if ( num_io == MAX_IOVEC ) {
                                obd_up(&sbi->osi_list_mutex);
                                err = obdfs_do_vec_wr(inodes, num_io, num_obdos,
                                                      obdos, bufs_per_obdo,
                                                      pages, bufs, counts,
                                                      offsets, flags);
                                if ( err ) {
                                        CDEBUG(D_INODE,
                                               "fatal: do_vec_wr err=%d\n",
                                               err);
                                        EXIT;
                                        goto ERR;
                                }
                                obd_down(&sbi->osi_list_mutex);
                                goto again;
                        }
                }
        }

BREAK:
        obd_up(&sbi->osi_list_mutex);

        /* flush any remaining I/Os */
        if ( num_io ) {
                err = obdfs_do_vec_wr(inodes, num_io, num_obdos, obdos,
                                      bufs_per_obdo, pages, bufs, counts,
                                      offsets, flags);
                if (err)
                        CDEBUG(D_INODE, "fatal: unable to do vec_wr (err %d)\n",
                               err);
                num_io = 0;
                num_obdos = 0;
        }

        /* Remove inode from superblock dirty list when no more pages.
         * Make sure we don't point at the current inode with tmp
         * when we re-init the list on the inode, or we will loop.
         */
        obd_down(&sbi->osi_list_mutex);
        tmp = inode_list;
        while ( (tmp = tmp->prev) != inode_list ) {
                struct obdfs_inode_info *ii;
                struct inode *inode;

                ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes);
                inode = list_entry(ii, struct inode, u);
                CDEBUG(D_INFO, "checking inode %ld empty\n", inode->i_ino);
                if (list_empty(obdfs_iplist(inode))) {
                        CDEBUG(D_INFO, "remove inode %ld from dirty list\n",
                               inode->i_ino);
                        tmp = tmp->next;
                        list_del(obdfs_islist(inode));
                        /* decrement inode reference for page cache */
                        atomic_dec(&inode->i_count);
                        INIT_LIST_HEAD(obdfs_islist(inode));
                }
        }
        obd_up(&sbi->osi_list_mutex);

        CDEBUG(D_INFO, "flushed %ld pages in total\n", total_io);
        EXIT;
ERR:
        return err ? err : total_io;
} /* obdfs_flush_reqs */


/* Walk all of the superblocks and write out blocks which are too old.
 * Return the maximum number of blocks written for a single filesystem.
 */
int obdfs_flush_dirty_pages(unsigned long check_time)
{
        struct list_head *sl;
        int max = 0;

        /*        ENTRY; */
        sl = &obdfs_super_list;
        while ( (sl = sl->prev) != &obdfs_super_list ) {
                struct obdfs_sb_info *sbi =
                        list_entry(sl, struct obdfs_sb_info, osi_list);
                int ret;

                /* walk write requests here, use the sb, check the time */
                ret = obdfs_flush_reqs(&sbi->osi_inodes, check_time);
                /* XXX handle error?  What to do with it? */

                max = ret > max ? ret : max;
        }
        if (max) { EXIT; }
        return max;
} /* obdfs_flush_dirty_pages */


static struct task_struct *pupdated;

static int pupdate(void *unused)
{
        int interval = pupd_prm.interval;
        long age = pupd_prm.age_buffer;
        int wrote = 0;

        exit_files(current);
        exit_mm(current);

        pupdated = current;
        pupdated->session = 1;
        pupdated->pgrp = 1;
        strcpy(pupdated->comm, "pupdated");

        printk("pupdated activated...\n");

        /* block every signal except SIGTERM, which tells us to shut down */
        spin_lock_irq(&pupdated->sigmask_lock);
        sigfillset(&pupdated->blocked);
        siginitsetinv(&pupdated->blocked, sigmask(SIGTERM));
        recalc_sigpending(pupdated);
        spin_unlock_irq(&pupdated->sigmask_lock);

        for (;;) {
                long dirty_limit;

                /* update interval */
                if (interval) {
                        set_task_state(pupdated, TASK_INTERRUPTIBLE);
                        schedule_timeout(interval);
                }
                if (signal_pending(pupdated)) {
                        int stopped = 0;

                        spin_lock_irq(&pupdated->sigmask_lock);
                        if (sigismember(&pupdated->pending.signal, SIGTERM)) {
                                sigdelset(&pupdated->pending.signal, SIGTERM);
                                stopped = 1;
                        }
                        recalc_sigpending(pupdated);
                        spin_unlock_irq(&pupdated->sigmask_lock);
                        if (stopped) {
                                printk("pupdated stopped...\n");
                                set_task_state(pupdated, TASK_STOPPED);
                                pupdated = NULL;
                                return 0;
                        }
                }
                /* asynchronous setattr etc for the future ...
                obdfs_flush_dirty_inodes(jiffies - pupd_prm.age_super);
                 */
                dirty_limit = nr_free_buffer_pages() * pupd_prm.nfract / 100;
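
                /* Worked example, assuming the 32768-page stub above and 4 KB
                 * pages: dirty_limit = 32768 * 40 / 100 = 13107 pages (~51 MB).
                 * The checks below adapt the daemon: well under half that limit
                 * the default interval and age are restored; over half of it
                 * the wakeup interval is halved; over the full limit the next
                 * flush runs immediately (interval 0).  In the latter two cases
                 * the age threshold is also halved while fewer than ndirty
                 * pages get written per pass.
                 */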
                if (obdfs_cache_count > dirty_limit) {
                        interval = 0;
                        if ( wrote < pupd_prm.ndirty )
                                age >>= 1;
                        if (wrote)
                                CDEBUG(D_CACHE, "wrote %d, age %ld, interval %d\n",
                                       wrote, age, interval);
                } else {
                        if ( wrote < pupd_prm.ndirty >> 1 &&
                             obdfs_cache_count < dirty_limit / 2) {
                                interval = pupd_prm.interval;
                                age = pupd_prm.age_buffer;
                                if (wrote)
                                        CDEBUG(D_INFO,
                                               "wrote %d, age %ld, interval %d\n",
                                               wrote, age, interval);
                        } else if (obdfs_cache_count > dirty_limit / 2) {
                                interval >>= 1;
                                if ( wrote < pupd_prm.ndirty )
                                        age >>= 1;
                                if (wrote)
                                        CDEBUG(D_CACHE,
                                               "wrote %d, age %ld, interval %d\n",
                                               wrote, age, interval);
                        }
                }

                wrote = obdfs_flush_dirty_pages(jiffies - age);
                if (wrote)
                        CDEBUG(D_CACHE,
                               "dirty_limit %ld, cache_count %ld, wrote %d\n",
                               dirty_limit, obdfs_cache_count, wrote);
        }
}


int obdfs_flushd_init(void)
{
        /*
        kernel_thread(bdflush, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
         */
        kernel_thread(pupdate, NULL, 0);
        CDEBUG(D_PSDEV, __FUNCTION__ ": flushd inited\n");
        return 0;
}

int obdfs_flushd_cleanup(void)
{
        ENTRY;

        if (pupdated) /* for debugging purposes only */
                CDEBUG(D_CACHE, "pupdated->state = %lx\n", pupdated->state);

        /* deliver a signal to pupdated to shut it down */
        if (pupdated && (pupdated->state == TASK_RUNNING ||
                         pupdated->state == TASK_INTERRUPTIBLE)) {
                unsigned long timeout = HZ/20;
                unsigned long count = 0;

                send_sig_info(SIGTERM, (struct siginfo *)1, pupdated);
                /* poll until pupdate() clears pupdated on its way out,
                 * nagging roughly every two seconds */
                while (pupdated) {
                        if ((count % (2*HZ)) == timeout)
                                printk(KERN_INFO "wait for pupdated to stop\n");
                        count += timeout;
                        set_current_state(TASK_INTERRUPTIBLE);
                        schedule_timeout(timeout);
                }
        }

        EXIT;
        return 0;
}