Whamcloud - gitweb
large file printk cleanups
[fs/lustre-release.git] / lustre / obdfs / rw.c
1 /*
2  * OBDFS Super operations
3  *
4  * This code is issued under the GNU General Public License.
5  * See the file COPYING in this distribution
6  *
7  * Copyright (C) 1996, 1997, Olaf Kirch <okir@monad.swb.de>
8  * Copyright (C) 1999 Stelias Computing Inc, 
9  *                (author Peter J. Braam <braam@stelias.com>)
10  * Copyright (C) 1999 Seagate Technology Inc.
11 */
12
13
14 #include <linux/config.h>
15 #include <linux/kernel.h>
16 #include <linux/mm.h>
17 #include <linux/string.h>
18 #include <linux/stat.h>
19 #include <linux/errno.h>
20 #include <linux/locks.h>
21 #include <linux/unistd.h>
22
23 #include <asm/system.h>
24 #include <asm/uaccess.h>
25
26 #include <linux/fs.h>
27 #include <linux/stat.h>
28 #include <asm/uaccess.h>
29 #include <linux/vmalloc.h>
30 #include <asm/segment.h>
31 #include <linux/mm.h>
32 #include <linux/pagemap.h>
33 #include <linux/smp_lock.h>
34
35 #include <linux/obd_support.h>
36 #include <linux/obd_ext2.h>
37 #include <linux/obdfs.h>
38
39 void obdfs_change_inode(struct inode *inode);
40
41 static int cache_writes = 0;
42
43
44 /* page cache support stuff */ 
45
46 /*
47  * Add a page to the dirty page list.
48  */
49 void __set_page_dirty(struct page *page)
50 {
51         struct address_space *mapping;
52         spinlock_t *pg_lock;
53
54         pg_lock = PAGECACHE_LOCK(page);
55         spin_lock(pg_lock);
56
57         mapping = page->mapping;
58         spin_lock(&mapping->page_lock);
59
60         list_del(&page->list);
61         list_add(&page->list, &mapping->dirty_pages);
62
63         spin_unlock(&mapping->page_lock);
64         spin_unlock(pg_lock);
65
66         if (mapping->host)
67                 mark_inode_dirty_pages(mapping->host);
68 }
69
70 /*
71  * Add a page to the dirty page list.
72  */
73 void __set_page_clean(struct page *page)
74 {
75         struct inode *inode;
76         struct address_space *mapping;
77         spinlock_t *pg_lock;
78
79         pg_lock = PAGECACHE_LOCK(page);
80         spin_lock(pg_lock);
81
82         mapping = page->mapping;
83         spin_lock(&mapping->page_lock);
84
85         list_del(&page->list);
86         list_add(&page->list, &mapping->clean_pages);
87
88         spin_unlock(&mapping->page_lock);
89         spin_unlock(pg_lock);
90
91         inode = mapping->host;
92         if (list_empty(&mapping->dirty_pages)) { 
93                 CDEBUG(D_INODE, "inode clean\n");
94                 inode->i_state &= ~I_DIRTY_PAGES;
95         }
96         EXIT;
97 }
98
/*
 * Clear the dirty flag of a page and move it to the clean list.
 * Does nothing when the page is not flagged dirty.
 */
inline void set_page_clean(struct page *page)
{
        if (!PageDirty(page))
                return;

        ClearPageDirty(page);
        __set_page_clean(page);
}
106
107 /* SYNCHRONOUS I/O to object storage for an inode -- object attr will be updated too */
108 static int obdfs_brw(int rw, struct inode *inode, struct page *page, int create)
109 {
110         obd_count        num_obdo = 1;
111         obd_count        bufs_per_obdo = 1;
112         struct obdo     *oa;
113         obd_size         count = PAGE_SIZE;
114         obd_off          offset = ((obd_off)page->index) << PAGE_SHIFT;
115         obd_flag         flags = create ? OBD_BRW_CREATE : 0;
116         int              err;
117
118         ENTRY;
119         if (IOPS(inode, brw) == NULL) {
120                 printk(KERN_ERR __FUNCTION__ ": no brw method!\n");
121                 EXIT;
122                 return -EIO;
123         }
124
125         oa = obdo_alloc();
126         if ( !oa ) {
127                 EXIT;
128                 return -ENOMEM;
129         }
130         oa->o_valid = OBD_MD_FLNOTOBD;
131         obdfs_from_inode(oa, inode);
132
133         err = IOPS(inode, brw)(rw, IID(inode), num_obdo, &oa, &bufs_per_obdo,
134                                &page, &count, &offset, &flags);
135         //if ( !err )
136         //      obdfs_to_inode(inode, oa); /* copy o_blocks to i_blocks */
137
138         obdo_free(oa);
139         EXIT;
140         return err;
141 } /* obdfs_brw */
142
143 extern void set_page_clean(struct page *);
144
145 /* SYNCHRONOUS I/O to object storage for an inode -- object attr will be updated too */
146 static int obdfs_commit_page(struct page *page, int create, int from, int to)
147 {
148         struct inode *inode = page->mapping->host;
149         obd_count        num_obdo = 1;
150         obd_count        bufs_per_obdo = 1;
151         struct obdo     *oa;
152         obd_size         count = to;
153         obd_off          offset = (((obd_off)page->index) << PAGE_SHIFT);
154         obd_flag         flags = create ? OBD_BRW_CREATE : 0;
155         int              err;
156
157         ENTRY;
158         if (IOPS(inode, brw) == NULL) {
159                 printk(KERN_ERR __FUNCTION__ ": no brw method!\n");
160                 EXIT;
161                 return -EIO;
162         }
163
164         oa = obdo_alloc();
165         if ( !oa ) {
166                 EXIT;
167                 return -ENOMEM;
168         }
169         oa->o_valid = OBD_MD_FLNOTOBD;
170         obdfs_from_inode(oa, inode);
171
172         CDEBUG(D_INODE, "commit_page writing (at %d) to %d, count %Ld\n", 
173                from, to, count);
174
175         err = IOPS(inode, brw)(WRITE, IID(inode), num_obdo, &oa, &bufs_per_obdo,
176                                &page, &count, &offset, &flags);
177         if ( !err ) {
178                 SetPageUptodate(page);
179                 set_page_clean(page);
180         }
181
182         //if ( !err )
183         //      obdfs_to_inode(inode, oa); /* copy o_blocks to i_blocks */
184
185         obdo_free(oa);
186         EXIT;
187         return err;
188 } /* obdfs_brw */
189
190
191 /* returns the page unlocked, but with a reference */
192 int obdfs_readpage(struct file *file, struct page *page)
193 {
194         struct inode *inode = page->mapping->host;
195         int rc;
196
197         ENTRY;
198
199         if ( ((inode->i_size + PAGE_CACHE_SIZE -1)>>PAGE_SHIFT) 
200              <= page->index) {
201                 memset(kmap(page), 0, PAGE_CACHE_SIZE);
202                 kunmap(page);
203                 goto readpage_out;
204         }
205
206         if (Page_Uptodate(page)) {
207                 EXIT;
208                 goto readpage_out;
209         }
210
211         rc = obdfs_brw(READ, inode, page, 0);
212         if ( rc ) {
213                 EXIT; 
214                 return rc;
215         } 
216         /* PDEBUG(page, "READ"); */
217
218  readpage_out:
219         SetPageUptodate(page);
220         obd_unlock_page(page);
221         EXIT;
222         return 0;
223 } /* obdfs_readpage */
224
225 int obdfs_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to)
226 {
227         struct inode *inode = page->mapping->host;
228         obd_off offset = ((obd_off)page->index) << PAGE_SHIFT;
229         int rc = 0;
230         ENTRY; 
231         
232         kmap(page);
233         if (Page_Uptodate(page)) { 
234                 EXIT;
235                 goto prepare_done;
236         }
237
238         if ( (from <= offset) && (to >= offset + PAGE_SIZE) ) {
239                 EXIT;
240                 return 0;
241         }
242         
243         rc = obdfs_brw(READ, inode, page, 0);
244         if ( !rc ) {
245                 SetPageUptodate(page);
246         } 
247
248  prepare_done:
249         set_page_dirty(page);
250         //SetPageDirty(page);
251         EXIT;
252         return rc;
253 }
254
255
256
257
258
259
260 static kmem_cache_t *obdfs_pgrq_cachep = NULL;
261
262 int obdfs_init_pgrqcache(void)
263 {
264         ENTRY;
265         if (obdfs_pgrq_cachep == NULL) {
266                 CDEBUG(D_CACHE, "allocating obdfs_pgrq_cache\n");
267                 obdfs_pgrq_cachep = kmem_cache_create("obdfs_pgrq",
268                                                       sizeof(struct obdfs_pgrq),
269                                                       0, SLAB_HWCACHE_ALIGN,
270                                                       NULL, NULL);
271                 if (obdfs_pgrq_cachep == NULL) {
272                         EXIT;
273                         return -ENOMEM;
274                 } else {
275                         CDEBUG(D_CACHE, "allocated cache at %p\n",
276                                obdfs_pgrq_cachep);
277                 }
278         } else {
279                 CDEBUG(D_CACHE, "using existing cache at %p\n",
280                        obdfs_pgrq_cachep);
281         }
282         EXIT;
283         return 0;
284 } /* obdfs_init_wreqcache */
285
286 inline void obdfs_pgrq_del(struct obdfs_pgrq *pgrq)
287 {
288         --obdfs_cache_count;
289         CDEBUG(D_INFO, "deleting page %p from list [count %ld]\n",
290                pgrq->rq_page, obdfs_cache_count);
291         list_del(&pgrq->rq_plist);
292         OBDClearCachePage(pgrq->rq_page);
293         kmem_cache_free(obdfs_pgrq_cachep, pgrq);
294 }
295
296 void obdfs_cleanup_pgrqcache(void)
297 {
298         ENTRY;
299         if (obdfs_pgrq_cachep != NULL) {
300                 CDEBUG(D_CACHE, "destroying obdfs_pgrqcache at %p, count %ld\n",
301                        obdfs_pgrq_cachep, obdfs_cache_count);
302                 if (kmem_cache_destroy(obdfs_pgrq_cachep))
303                         printk(KERN_INFO __FUNCTION__
304                                ": unable to free all of cache\n");
305                 obdfs_pgrq_cachep = NULL;
306         } else
307                 printk(KERN_INFO __FUNCTION__ ": called with NULL pointer\n");
308
309         EXIT;
310 } /* obdfs_cleanup_wreqcache */
311
312
313 /* called with the list lock held */
314 static struct page *obdfs_find_page_index(struct inode *inode,
315                                           unsigned long index)
316 {
317         struct list_head *page_list = obdfs_iplist(inode);
318         struct list_head *tmp;
319         struct page *page;
320
321         ENTRY;
322
323         CDEBUG(D_INFO, "looking for inode %ld pageindex %ld\n",
324                inode->i_ino, index);
325         OIDEBUG(inode);
326
327         if (list_empty(page_list)) {
328                 EXIT;
329                 return NULL;
330         }
331         tmp = page_list;
332         while ( (tmp = tmp->next) != page_list ) {
333                 struct obdfs_pgrq *pgrq;
334
335                 pgrq = list_entry(tmp, struct obdfs_pgrq, rq_plist);
336                 page = pgrq->rq_page;
337                 if (index == page->index) {
338                         CDEBUG(D_INFO,
339                                "INDEX SEARCH found page %p, index %ld\n",
340                                page, index);
341                         EXIT;
342                         return page;
343                 }
344         } 
345
346         EXIT;
347         return NULL;
348 } /* obdfs_find_page_index */
349
350
351 /* call and free pages from Linux page cache: called with io lock on inodes */
352 int obdfs_do_vec_wr(struct inode **inodes, obd_count num_io,
353                     obd_count num_obdos, struct obdo **obdos,
354                     obd_count *oa_bufs, struct page **pages, char **bufs,
355                     obd_size *counts, obd_off *offsets, obd_flag *flags)
356 {
357         int err;
358
359         ENTRY;
360         if (IOPS(inodes[0], brw) == NULL) {
361                 printk(KERN_ERR __FUNCTION__ ": no brw method!\n");
362                 EXIT;
363                 return -EIO;
364         }
365
366         CDEBUG(D_INFO, "writing %d page(s), %d obdo(s) in vector\n",
367                num_io, num_obdos);
368         if (obd_debug_level & D_INFO) { /* DEBUGGING */
369                 int i;
370                 printk("OBDOS: ");
371                 for (i = 0; i < num_obdos; i++)
372                         printk("%ld:0x%p ", (long)obdos[i]->o_id, obdos[i]);
373
374                 printk("\nPAGES: ");
375                 for (i = 0; i < num_io; i++)
376                         printk("0x%p ", pages[i]);
377                 printk("\n");
378         }
379
380         err = IOPS(inodes[0], brw)(WRITE, IID(inodes[0]), num_obdos, obdos,
381                                   oa_bufs, pages, counts, offsets, flags);
382
383         CDEBUG(D_INFO, "BRW done\n");
384         /* release the pages from the page cache */
385         while ( num_io > 0 ) {
386                 --num_io;
387                 CDEBUG(D_INFO, "calling put_page for %p, index %ld\n",
388                        pages[num_io], pages[num_io]->index);
389                 /* PDEBUG(pages[num_io], "do_vec_wr"); */
390                 put_page(pages[num_io]);
391                 /* PDEBUG(pages[num_io], "do_vec_wr"); */
392         }
393         CDEBUG(D_INFO, "put_page done\n");
394
395         while ( num_obdos > 0) {
396                 --num_obdos;
397                 CDEBUG(D_INFO, "free obdo %ld\n",(long)obdos[num_obdos]->o_id);
398                 /* copy o_blocks to i_blocks */
399                 obdfs_set_size (inodes[num_obdos], obdos[num_obdos]->o_size);
400                 //obdfs_to_inode(inodes[num_obdos], obdos[num_obdos]);
401                 obdo_free(obdos[num_obdos]);
402         }
403         CDEBUG(D_INFO, "obdo_free done\n");
404         EXIT;
405         return err;
406 }
407
408
409 /*
410  * Add a page to the write request cache list for later writing.
411  * ASYNCHRONOUS write method.
412  */
413 static int obdfs_add_page_to_cache(struct inode *inode, struct page *page)
414 {
415         int err = 0;
416         ENTRY;
417
418         /* The PG_obdcache bit is cleared by obdfs_pgrq_del() BEFORE the page
419          * is written, so at worst we will write the page out twice.
420          *
421          * If the page has the PG_obdcache bit set, then the inode MUST be
422          * on the superblock dirty list so we don't need to check this.
423          * Dirty inodes are removed from the superblock list ONLY when they
424          * don't have any more cached pages.  It is possible to have an inode
425          * with no dirty pages on the superblock list, but not possible to
426          * have an inode with dirty pages NOT on the superblock dirty list.
427          */
428         if (!OBDAddCachePage(page)) {
429                 struct obdfs_pgrq *pgrq;
430                 pgrq = kmem_cache_alloc(obdfs_pgrq_cachep, SLAB_KERNEL);
431                 if (!pgrq) {
432                         OBDClearCachePage(page);
433                         EXIT;
434                         return -ENOMEM;
435                 }
436                 /* not really necessary since we set all pgrq fields here
437                 memset(pgrq, 0, sizeof(*pgrq)); 
438                 */
439                 
440                 pgrq->rq_page = page;
441                 pgrq->rq_jiffies = jiffies;
442                 get_page(pgrq->rq_page);
443
444                 obd_down(&obdfs_i2sbi(inode)->osi_list_mutex);
445                 list_add(&pgrq->rq_plist, obdfs_iplist(inode));
446                 obdfs_cache_count++;
447                 //printk("-- count %d\n", obdfs_cache_count);
448
449                 /* If inode isn't already on superblock inodes list, add it.
450                  *
451                  * We increment the reference count on the inode to keep it
452                  * from being freed from memory.  This _should_ be an iget()
453                  * with an iput() in both flush_reqs() and put_inode(), but
454                  * since put_inode() is called from iput() we can't call iput()
455                  * again there.  Instead we just increment/decrement i_count,
456                  * which is mostly what iget/iput do for an inode in memory.
457                  */
458                 if ( list_empty(obdfs_islist(inode)) ) {
459                         atomic_inc(&inode->i_count);
460                         CDEBUG(D_INFO,
461                                "adding inode %ld to superblock list %p\n",
462                                inode->i_ino, obdfs_slist(inode));
463                         list_add(obdfs_islist(inode), obdfs_slist(inode));
464                 }
465                 obd_up(&obdfs_i2sbi(inode)->osi_list_mutex);
466
467         }
468
469         /* XXX For testing purposes, we can write out the page here.
470         err = obdfs_flush_reqs(obdfs_slist(inode), ~0UL);
471          */
472
473         EXIT;
474         return err;
475 } /* obdfs_add_page_to_cache */
476
477 void rebalance(void)
478 {
479         if (obdfs_cache_count > 60000) {
480                 printk("-- count %ld\n", obdfs_cache_count);
481                 //obdfs_flush_dirty_pages(~0UL);
482                 printk("-- count %ld\n", obdfs_cache_count);
483         }
484 }
485
486 /* select between SYNC and ASYNC I/O methods */
487 int obdfs_do_writepage(struct page *page, int sync)
488 {
489         struct inode *inode = page->mapping->host;
490         int err;
491
492         ENTRY;
493         /* PDEBUG(page, "WRITEPAGE"); */
494         if ( sync )
495                 err = obdfs_brw(WRITE, inode, page, 1);
496         else {
497                 err = obdfs_add_page_to_cache(inode, page);
498                 CDEBUG(D_INFO, "DO_WR ino: %ld, page %p, err %d, uptodate %d\n",
499                        inode->i_ino, page, err, Page_Uptodate(page));
500         }
501                 
502         if ( !err ) {
503                 SetPageUptodate(page);
504                 set_page_clean(page);
505         }
506         /* PDEBUG(page,"WRITEPAGE"); */
507         EXIT;
508         return err;
509 } /* obdfs_do_writepage */
510
511
512
513 /* returns the page unlocked, but with a reference */
514 int obdfs_writepage(struct page *page)
515 {
516         int rc;
517         struct inode *inode = page->mapping->host;
518         ENTRY;
519         printk("---> writepage called ino %ld!\n", inode->i_ino);
520         BUG();
521         rc = obdfs_do_writepage(page, 1);
522         if ( !rc ) {
523                 set_page_clean(page);
524         } else {
525                 CDEBUG(D_INODE, "--> GRR %d\n", rc);
526         }
527         EXIT;
528         return rc;
529 }
530
531 void write_inode_pages(struct inode *inode)
532 {
533         struct list_head *tmp = &inode->i_mapping->dirty_pages;
534         
535         while ( (tmp = tmp->next) != &inode->i_mapping->dirty_pages) { 
536                 struct page *page;
537                 page = list_entry(tmp, struct page, list);
538                 obdfs_writepage(page);
539         }
540 }
541
542
543 int obdfs_commit_write(struct file *file, struct page *page, unsigned from, unsigned to)
544 {
545         struct inode *inode = page->mapping->host;
546         int rc = 0;
547         loff_t len = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
548         ENTRY;
549         CDEBUG(D_INODE, "commit write ino %ld (end at %Ld) from %d to %d ,ind %ld\n",
550                inode->i_ino, len, from, to, page->index);
551
552
553         if (cache_writes == 0) { 
554                 rc = obdfs_commit_page(page, 1, from, to);
555         }
556
557         if (len > inode->i_size) {
558                 obdfs_set_size(inode, len);
559         }
560
561         kunmap(page);
562         EXIT;
563         return rc;
564 }
565
566
567 /*
568  * This does the "real" work of the write. The generic routine has
569  * allocated the page, locked it, done all the page alignment stuff
570  * calculations etc. Now we should just copy the data from user
571  * space and write it back to the real medium..
572  *
573  * If the writer ends up delaying the write, the writer needs to
574  * increment the page use counts until he is done with the page.
575  *
576  * Return value is the number of bytes written.
577  */
578 int obdfs_write_one_page(struct file *file, struct page *page,
579                          unsigned long offset, unsigned long bytes,
580                          const char * buf)
581 {
582         struct inode *inode = file->f_dentry->d_inode;
583         int err;
584
585         ENTRY;
586         /* We check for complete page writes here, as we then don't have to
587          * get the page before writing over everything anyways.
588          */
589         if ( !Page_Uptodate(page) && (offset != 0 || bytes != PAGE_SIZE) ) {
590                 err = obdfs_brw(READ, inode, page, 0);
591                 if ( err )
592                         return err;
593                 SetPageUptodate(page);
594         }
595
596         if (copy_from_user((u8*)page_address(page) + offset, buf, bytes))
597                 return -EFAULT;
598
599         lock_kernel();
600         err = obdfs_writepage(page);
601         unlock_kernel();
602
603         return (err < 0 ? err : bytes);
604 } /* obdfs_write_one_page */
605
606 /* 
607  * return an up to date page:
608  *  - if locked is true then is returned locked
609  *  - if create is true the corresponding disk blocks are created 
610  *  - page is held, i.e. caller must release the page
611  *
612  * modeled on NFS code.
613  */
614 struct page *obdfs_getpage(struct inode *inode, unsigned long offset,
615                            int create, int locked)
616 {
617         struct page * page;
618         int index;
619         int err;
620
621         ENTRY;
622
623         offset = offset & PAGE_CACHE_MASK;
624         CDEBUG(D_INFO, "ino: %ld, offset %ld, create %d, locked %d\n",
625                inode->i_ino, offset, create, locked);
626         index = offset >> PAGE_CACHE_SHIFT;
627
628         page = grab_cache_page(&inode->i_data, index);
629
630         /* Yuck, no page */
631         if (! page) {
632             printk(KERN_WARNING " grab_cache_page says no dice ...\n");
633             EXIT;
634             return NULL;
635         }
636
637         /* PDEBUG(page, "GETPAGE: got page - before reading\n"); */
638         /* now check if the data in the page is up to date */
639         if ( Page_Uptodate(page)) { 
640                 if (!locked) {
641                         if (PageLocked(page))
642                                 obd_unlock_page(page);
643                 } else {
644                         printk("file %s, line %d: expecting locked page\n",
645                                __FILE__, __LINE__); 
646                 }
647                 EXIT;
648                 return page;
649         } 
650
651
652 #ifdef EXT2_OBD_DEBUG
653         if ((obd_debug_level & D_INFO) && obdfs_find_page_index(inode, index)) {
654                 CDEBUG(D_INFO, "OVERWRITE: found dirty page %p, index %ld\n",
655                        page, page->index);
656         }
657 #endif
658
659         err = obdfs_brw(READ, inode, page, create);
660
661         if ( err ) {
662                 SetPageError(page);
663                 obd_unlock_page(page);
664                 EXIT;
665                 return page;
666         }
667
668         if ( !locked )
669                 obd_unlock_page(page);
670         SetPageUptodate(page);
671         /* PDEBUG(page,"GETPAGE - after reading"); */
672         EXIT;
673         return page;
674 } /* obdfs_getpage */
675
676
677 void obdfs_truncate(struct inode *inode)
678 {
679         struct obdo *oa;
680         int err;
681         ENTRY;
682
683         //obdfs_dequeue_pages(inode);
684
685         if (IOPS(inode, punch) == NULL) {
686                 printk(KERN_ERR __FUNCTION__ ": no punch method!\n");
687                 EXIT;
688                 return;
689         }
690
691         oa = obdo_alloc();
692         if ( !oa ) {
693                 /* XXX This would give an inconsistent FS, so deal with it as
694                  * best we can for now - an obdo on the stack is not pretty.
695                  */
696                 struct obdo obdo;
697
698                 printk(__FUNCTION__ ": obdo_alloc failed - using stack!\n");
699
700                 obdo.o_valid = OBD_MD_FLNOTOBD;
701                 obdfs_from_inode(&obdo, inode);
702
703                 err = IOPS(inode, punch)(IID(inode), &obdo, obdo.o_size, 0);
704         } else {
705                 oa->o_valid = OBD_MD_FLNOTOBD;
706                 obdfs_from_inode(oa, inode);
707
708                 CDEBUG(D_INFO, "calling punch for %ld (%Lu bytes at 0)\n",
709                        (long)oa->o_id, oa->o_size);
710                 err = IOPS(inode, punch)(IID(inode), oa, oa->o_size, 0);
711
712                 obdo_free(oa);
713         }
714
715         if (err) {
716                 printk(__FUNCTION__ ": obd_truncate fails (%d)\n", err);
717                 EXIT;
718                 return;
719         }
720         EXIT;
721 } /* obdfs_truncate */
722
723 struct address_space_operations obdfs_aops = {
724         readpage: obdfs_readpage,
725         writepage: obdfs_writepage,
726         sync_page: block_sync_page,
727         prepare_write: obdfs_prepare_write, 
728         commit_write: obdfs_commit_write,
729         bmap: NULL
730 };