obdfs/*.c: finished truncate implementation
[fs/lustre-release.git] / lustre / obdfs / flushd.c
/*
 * OBDFS Super operations - also used for Lustre file system
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 1999 Stelias Computing Inc. <braam@stelias.com>
 *  Copyright (C) 1999 Seagate Technology Inc.
 */
#define __NO_VERSION__
#include <linux/fs.h>
#include <linux/locks.h>
#include <linux/swap.h>

#include <linux/obd_support.h>
#include <linux/obd_class.h>
#include <linux/obdfs.h>

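/* Tunables for the pupdated flush daemon, modeled on bdflush's parameters.
 * The initializer below sets nfract=40%, ndirty=500 pages, nrefill=64,
 * nref_dirt=256, interval=5s, age_buffer=30s and age_super=5s (the last
 * three in jiffies).
 */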
struct {
        int nfract;     /* Percentage of buffer cache dirty to
                           activate bdflush */
        int ndirty;     /* Maximum number of dirty blocks to write out per
                           wake-cycle */
        int nrefill;    /* Number of clean buffers to try to obtain
                           each time we call refill */
        int nref_dirt;  /* Dirty buffer threshold for activating bdflush
                           when trying to refill buffers. */
        int interval;   /* jiffies delay between pupdate flushes */
        int age_buffer; /* Time for normal buffer to age before we flush it */
        int age_super;  /* Time for superblock to age before we flush it */
} pupd_prm = {40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ};

/* Add up to nr_slots of the inode's oldest dirty pages to the supplied I/O
 * vectors, stopping at pages newer than check_time.  Allocates *obdo on
 * first use.  Called with the superblock list lock held.  Returns the
 * number of pages queued, or a negative error.
 */
static int obdfs_enqueue_pages(struct inode *inode, struct obdo **obdo,
                               int nr_slots, struct page **pages, char **bufs,
                               obd_size *counts, obd_off *offsets,
                               obd_flag *flag, unsigned long check_time)
{
        struct list_head *page_list = obdfs_iplist(inode);
        struct list_head *tmp;
        int num = 0;

        ENTRY;

        tmp = page_list;
        /* Traverse the list in reverse order, so we do FIFO, not LIFO order */
        while ( (tmp = tmp->prev) != page_list && num < nr_slots ) {
                struct obdfs_pgrq *req;
                struct page *page;

                req = list_entry(tmp, struct obdfs_pgrq, rq_plist);
                page = req->rq_page;

                if (req->rq_jiffies > check_time)
                        break;          /* pages are in chronological order */

                /* Only allocate the obdo if we will actually do I/O here */
                if ( !*obdo ) {
                        OIDEBUG(inode);
                        *obdo = obdo_fromid(IID(inode), inode->i_ino,
                                            OBD_MD_FLNOTOBD);
                        if ( IS_ERR(*obdo) ) {
                                int err = PTR_ERR(*obdo);
                                *obdo = NULL;

                                EXIT;
                                return err;
                        }

                        /* FIXME revisit fromid & from_inode */
                        obdfs_from_inode(*obdo, inode);
                        *flag = OBD_BRW_CREATE;
                }

                /* Remove the request from the list before the write to avoid
                 * conflicts.  Note that obdfs_pgrq_del() also frees the
                 * request, so step tmp back to a live node first, or the
                 * reverse traversal above would walk freed memory.
                 */
                tmp = tmp->next;
                obdfs_pgrq_del(req);
                if ( !page ) {
                        CDEBUG(D_CACHE, "no page\n");
                        continue;
                }

                bufs[num] = (char *)page_address(page);
                pages[num] = page;
                counts[num] = PAGE_SIZE;
                offsets[num] = ((obd_off)page->index) << PAGE_SHIFT;
                CDEBUG(D_INFO, "ENQ inode %ld, page %p addr %p to vector\n",
                       inode->i_ino, page, (char *)page_address(page));
                num++;
        }

        if (!list_empty(page_list))
                CDEBUG(D_INFO, "inode %ld list not empty\n", inode->i_ino);
        CDEBUG(D_INFO, "added %d page(s) to vector\n", num);

        EXIT;
        return num;
} /* obdfs_enqueue_pages */

/* Dequeue cached pages for a dying inode without writing them to disk. */
void obdfs_dequeue_pages(struct inode *inode)
{
        struct list_head *tmp;

        ENTRY;
        obd_down(&obdfs_i2sbi(inode)->osi_list_mutex);
        tmp = obdfs_islist(inode);
        if ( list_empty(tmp) ) {
                CDEBUG(D_INFO, "no dirty pages for inode %ld\n", inode->i_ino);
                obd_up(&obdfs_i2sbi(inode)->osi_list_mutex);
                EXIT;
                return;
        }

        /* take it out of the super list */
        list_del(tmp);
        INIT_LIST_HEAD(obdfs_islist(inode));

        /* obdfs_pgrq_del() frees the list node we would otherwise step
         * through, so always take the request at the tail of the list.
         */
        tmp = obdfs_iplist(inode);
        while ( !list_empty(tmp) ) {
                struct obdfs_pgrq *req;
                struct page *page;

                req = list_entry(tmp->prev, struct obdfs_pgrq, rq_plist);
                page = req->rq_page;
                /* take it out of the list and free */
                obdfs_pgrq_del(req);
                /* now put the page away */
                put_page(page);
        }

        obd_up(&obdfs_i2sbi(inode)->osi_list_mutex);

        /* decrement the inode reference held for the page cache */
        inode->i_count--;
        EXIT;
}

/* Write back dirty pages for all inodes on the superblock's dirty list that
 * are older than check_time.  Returns the number of pages written, or a
 * negative error.
 */
int obdfs_flush_reqs(struct list_head *inode_list, unsigned long check_time)
{
        struct list_head *tmp;
        int               total_io = 0;
        obd_count         num_io;
        obd_count         num_obdos;
        struct inode     *inodes[MAX_IOVEC];    /* write data back to these */
        struct page      *pages[MAX_IOVEC];     /* call put_page on these */
        struct obdo      *obdos[MAX_IOVEC];
        char             *bufs[MAX_IOVEC];
        obd_size          counts[MAX_IOVEC];
        obd_off           offsets[MAX_IOVEC];
        obd_flag          flags[MAX_IOVEC];
        obd_count         bufs_per_obdo[MAX_IOVEC];
        int               err = 0;
        struct obdfs_sb_info *sbi;
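        /* inodes[], obdos[], flags[] and bufs_per_obdo[] are indexed per
         * object (num_obdos); pages[], bufs[], counts[] and offsets[] are
         * indexed per page (num_io), with bufs_per_obdo[i] pages per obdo.
         */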

        ENTRY;
        if (!inode_list) {
                CDEBUG(D_INODE, "no list\n");
                EXIT;
                return 0;
        }

        sbi = list_entry(inode_list, struct obdfs_sb_info, osi_inodes);

        obd_down(&sbi->osi_list_mutex);
        if ( list_empty(inode_list) ) {
                CDEBUG(D_CACHE, "list empty: memory %ld\n", obd_memory);
                obd_up(&sbi->osi_list_mutex);
                EXIT;
                return 0;
        }

        /* Add each inode's dirty pages to a write vector, and write it.
         * Traverse the list in reverse order, so we do FIFO, not LIFO order.
         */
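        /* The vectors hold at most MAX_IOVEC entries; when they fill up we
         * drop the list lock, write out the batch, and restart the scan
         * from the tail of the list.
         */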
again:
        tmp = inode_list;
        num_io = 0;
        num_obdos = 0;
        while ( (tmp = tmp->prev) != inode_list && total_io < pupd_prm.ndirty) {
                struct obdfs_inode_info *ii;
                struct inode *inode;
                int res;

                ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes);
                inode = list_entry(ii, struct inode, u);
                inodes[num_obdos] = inode;
                obdos[num_obdos] = NULL;
                CDEBUG(D_INFO, "checking inode %ld pages\n", inode->i_ino);

                /* Make sure we reference "inode" and not "inodes[num_obdos]",
                 * as num_obdos changes while the loop runs.
                 */
                if (!list_empty(obdfs_iplist(inode))) {
                        res = obdfs_enqueue_pages(inode, &obdos[num_obdos],
                                                  MAX_IOVEC - num_io,
                                                  &pages[num_io], &bufs[num_io],
                                                  &counts[num_io],
                                                  &offsets[num_io],
                                                  &flags[num_obdos],
                                                  check_time);
                        CDEBUG(D_INFO, "FLUSH inode %ld, pages flushed: %d\n",
                               inode->i_ino, res);
                        if ( res < 0 ) {
                                CDEBUG(D_INODE,
                                       "fatal: unable to enqueue inode %ld (err %d)\n",
                                       inode->i_ino, res);
                                /* XXX Move the bad inode to the end of the
                                 * list so we can continue flushing the rest.
                                 * This is a temporary measure to avoid
                                 * machine lockups.
                                 */
                                list_del(tmp);
                                list_add(tmp, inode_list);
                                err = res;
                                goto BREAK;
                        } else if (res) {
                                num_io += res;
                                total_io += res;
                                bufs_per_obdo[num_obdos] = res;
                                num_obdos++;
                        }

                        if ( num_io == MAX_IOVEC ) {
                                obd_up(&sbi->osi_list_mutex);
                                err = obdfs_do_vec_wr(inodes, num_io, num_obdos,
                                                      obdos, bufs_per_obdo,
                                                      pages, bufs, counts,
                                                      offsets, flags);
                                if ( err ) {
                                        CDEBUG(D_INODE,
                                               "fatal: unable to do vec_wr (err %d)\n",
                                               err);
                                        EXIT;
                                        goto ERR;
                                }
                                obd_down(&sbi->osi_list_mutex);
                                goto again;
                        }
                }
        }

BREAK:
        obd_up(&sbi->osi_list_mutex);

        /* flush any remaining I/Os */
        if ( num_io ) {
                err = obdfs_do_vec_wr(inodes, num_io, num_obdos, obdos,
                                      bufs_per_obdo, pages, bufs, counts,
                                      offsets, flags);
                if (err)
                        CDEBUG(D_INODE, "fatal: unable to do vec_wr (err %d)\n",
                               err);
                num_io = 0;
                num_obdos = 0;
        }

        /* Remove an inode from the superblock dirty list when it has no more
         * dirty pages.  Make sure tmp doesn't point at the current inode when
         * we re-init the list on the inode, or we will loop forever.
         */
        obd_down(&sbi->osi_list_mutex);
        tmp = inode_list;
        while ( (tmp = tmp->prev) != inode_list ) {
                struct obdfs_inode_info *ii;
                struct inode *inode;

                ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes);
                inode = list_entry(ii, struct inode, u);
                CDEBUG(D_INFO, "checking inode %ld empty\n", inode->i_ino);
                if (list_empty(obdfs_iplist(inode))) {
                        CDEBUG(D_INFO, "remove inode %ld from dirty list\n",
                               inode->i_ino);
                        tmp = tmp->next;
                        list_del(obdfs_islist(inode));
                        /* decrement inode reference for page cache */
                        inode->i_count--;
                        INIT_LIST_HEAD(obdfs_islist(inode));
                }
        }
        obd_up(&sbi->osi_list_mutex);

        CDEBUG(D_INFO, "flushed %d pages in total\n", total_io);
        EXIT;
ERR:
        return err ? err : total_io;
} /* obdfs_flush_reqs */


/* Walk all of the superblocks and write out pages which are too old.
 * Return the maximum number of pages written for a single filesystem.
 */
int obdfs_flush_dirty_pages(unsigned long check_time)
{
        struct list_head *sl;
        int max = 0;

        ENTRY;
        sl = &obdfs_super_list;
        while ( (sl = sl->prev) != &obdfs_super_list ) {
                struct obdfs_sb_info *sbi =
                        list_entry(sl, struct obdfs_sb_info, osi_list);
                int ret;

                /* walk write requests here, use the sb, check the time */
                ret = obdfs_flush_reqs(&sbi->osi_inodes, check_time);
                /* XXX handle error?  What to do with it? */

                max = ret > max ? ret : max;
        }
        EXIT;
        return max;
} /* obdfs_flush_dirty_pages */


static struct task_struct *pupdated;

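/* pupdate: kernel thread in the style of bdflush/kupdate that periodically
 * writes aged dirty pages back to the object store, adapting its wakeup
 * interval and page-age threshold to the size of the page cache.
 */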
static int pupdate(void *unused)
{
        int interval = pupd_prm.interval;
        long age = pupd_prm.age_buffer;
        int wrote = 0;

        exit_files(current);
        exit_mm(current);

        pupdated = current;
        pupdated->session = 1;
        pupdated->pgrp = 1;
        strcpy(pupdated->comm, "pupdated");

        printk("pupdated activated...\n");

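        /* Block every signal except SIGTERM, which obdfs_flushd_cleanup()
         * uses to ask us to exit.
         */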
        spin_lock_irq(&pupdated->sigmask_lock);
        siginitsetinv(&pupdated->blocked, sigmask(SIGTERM));
        recalc_sigpending(pupdated);
        spin_unlock_irq(&pupdated->sigmask_lock);

        for (;;) {
                long dirty_limit;

                /* update interval */
                if (interval) {
                        set_task_state(pupdated, TASK_INTERRUPTIBLE);
                        schedule_timeout(interval);
                }
                if (signal_pending(pupdated)) {
                        int stopped = 0;

                        spin_lock_irq(&pupdated->sigmask_lock);
                        if (sigismember(&pupdated->signal, SIGTERM)) {
                                sigdelset(&pupdated->signal, SIGTERM);
                                stopped = 1;
                        }
                        recalc_sigpending(pupdated);
                        spin_unlock_irq(&pupdated->sigmask_lock);
                        if (stopped) {
                                printk("pupdated stopped...\n");
                                set_task_state(pupdated, TASK_STOPPED);
                                pupdated = NULL;
                                return 0;
                        }
                }
                /* asynchronous setattr etc. for the future ...
                obdfs_flush_dirty_inodes(jiffies - pupd_prm.age_super);
                 */
                /* XXX for debugging
                dirty_limit = nr_free_buffer_pages() * pupd_prm.nfract / 100;
                 * XXX */
                dirty_limit = 16384 * pupd_prm.nfract / 100;
                CDEBUG(D_CACHE, "dirty_limit %ld, cache_count %ld, wrote %d\n",
                       dirty_limit, obdfs_cache_count, wrote);

                if (obdfs_cache_count > dirty_limit) {
                        interval = 0;
                        if ( wrote < pupd_prm.ndirty )
                                age >>= 1;
                        CDEBUG(D_CACHE, "age %ld, interval %d\n",
                               age, interval);
                } else {
                        if ( wrote < pupd_prm.ndirty >> 1 &&
                             obdfs_cache_count < dirty_limit / 2) {
                                interval = pupd_prm.interval;
                                age = pupd_prm.age_buffer;
                        } else if (obdfs_cache_count > dirty_limit / 2) {
                                interval >>= 1;
                                if ( wrote < pupd_prm.ndirty )
                                        age >>= 1;
                                CDEBUG(D_CACHE, "age %ld, interval %d\n",
                                       age, interval);
                        }
                }

                wrote = obdfs_flush_dirty_pages(jiffies - age);
        }
}


int obdfs_flushd_init(void)
{
        /*
        kernel_thread(bdflush, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
         */
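        /* Start the flush daemon.  pupdate() detaches itself from the
         * parent's files and mm, and runs until obdfs_flushd_cleanup()
         * stops it with SIGTERM.
         */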
        kernel_thread(pupdate, NULL, 0);
        CDEBUG(D_PSDEV, __FUNCTION__ ": flushd inited\n");
        return 0;
}

int obdfs_flushd_cleanup(void)
{
        ENTRY;

        if (pupdated) /* for debugging purposes only */
                CDEBUG(D_CACHE, "pupdated->state = %lx\n", pupdated->state);

        /* deliver a signal to pupdated to shut it down */
        if (pupdated && (pupdated->state == TASK_RUNNING ||
                         pupdated->state == TASK_INTERRUPTIBLE)) {
                unsigned long timeout = HZ/20;
                unsigned long count = 0;

                send_sig_info(SIGTERM, (struct siginfo *)1, pupdated);
                while (pupdated) {
                        if ((count % (2 * HZ)) == timeout)
                                printk(KERN_INFO "waiting for pupdated to stop\n");
                        count += timeout;
                        set_current_state(TASK_INTERRUPTIBLE);
                        schedule_timeout(timeout);
                }
        }

        EXIT;
        return 0;
}