1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (c) 2002, 2003 Cluster File Systems, Inc.
5 * Author: Peter Braam <braam@clusterfs.com>
6 * Author: Phil Schwan <phil@clusterfs.com>
7 * Author: Andreas Dilger <adilger@clusterfs.com>
9 * This file is part of Lustre, http://www.lustre.org.
11 * Lustre is free software; you can redistribute it and/or
12 * modify it under the terms of version 2 of the GNU General Public
13 * License as published by the Free Software Foundation.
15 * Lustre is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with Lustre; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 #define DEBUG_SUBSYSTEM S_LLITE
26 #include <linux/lustre_dlm.h>
27 #include <linux/lustre_lite.h>
28 #include <linux/obd_lov.h> /* for lov_mds_md_size() in lov_setstripe() */
29 #include <linux/random.h>
30 #include <linux/pagemap.h>
31 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
32 #include <linux/lustre_compat25.h>
35 int ll_inode_setattr(struct inode *inode, struct iattr *attr, int do_trunc);
36 extern int ll_setattr(struct dentry *de, struct iattr *attr);
/*
 * ll_mdc_close(): send the MDS close RPC for this file handle and finish
 * the open-replay bookkeeping: the retained open request stops being
 * "held for replay" and either joins the import replay list (if the open
 * created the file, i.e. has a transno) or is simply released.
 * NOTE(review): this extract is missing lines (e.g. the `file` parameter,
 * `rc`/`flags` declarations, braces) — do not treat it as compilable.
 */
38 static int ll_mdc_close(struct lustre_handle *mdc_conn, struct inode *inode,
41         struct ll_file_data *fd = file->private_data;
42         struct ptlrpc_request *req = NULL;
44         struct obd_import *imp;
48         /* Complete the open request and remove it from replay list */
49         rc = mdc_close(&ll_i2sbi(inode)->ll_mdc_conn, inode->i_ino,
50                        inode->i_mode, &fd->fd_mds_och.och_fh, &req);
52                 CERROR("inode %lu close failed: rc = %d\n", inode->i_ino, rc);
54         imp = fd->fd_mds_och.och_req->rq_import;
/* imp_lock guards the import replay list manipulated below */
56         spin_lock_irqsave(&imp->imp_lock, flags);
58         DEBUG_REQ(D_HA, fd->fd_mds_och.och_req, "matched open req %p",
59                   fd->fd_mds_och.och_req);
61         /* We held on to the request for replay until we saw a close for that
62          * file.  Now that we've closed it, it gets replayed on the basis of
63          * its transno only. */
64         spin_lock (&fd->fd_mds_och.och_req->rq_lock);
65         fd->fd_mds_och.och_req->rq_replay = 0;
66         spin_unlock (&fd->fd_mds_och.och_req->rq_lock);
68         if (fd->fd_mds_och.och_req->rq_transno) {
69                 /* This open created a file, so it needs replay as a
70                  * normal transaction now.  Our reference to it now
71                  * effectively owned by the imp_replay_list, and it'll
72                  * be committed just like other transno-having
73                  * requests from here on out. */
75                 /* We now retain this close request, so that it is
76                  * replayed if the open is replayed.  We duplicate the
77                  * transno, so that we get freed at the right time,
78                  * and rely on the difference in xid to keep
79                  * everything ordered correctly.
81                  * But! If this close was already given a transno
82                  * (because it caused real unlinking of an
83                  * open-unlinked file, f.e.), then we'll be ordered on
84                  * the basis of that and we don't need to do anything
86                 if (!req->rq_transno) {
87                         req->rq_transno = fd->fd_mds_och.och_req->rq_transno;
88                         ptlrpc_retain_replayable_request(req, imp);
90                 spin_unlock_irqrestore(&imp->imp_lock, flags);
92                 /* Should we free_committed now? we always free before
93                  * replay, so it's probably a wash.  We could check to
94                  * see if the fd_req should already be committed, in
95                  * which case we can avoid the whole retain_replayable
98                 /* No transno means that we can just drop our ref. */
99                 spin_unlock_irqrestore(&imp->imp_lock, flags);
101                 ptlrpc_req_finished(fd->fd_mds_och.och_req);
103         /* Do this after the fd_req->rq_transno check, because we don't want
104          * to bounce off zero references. */
105         ptlrpc_req_finished(req);
/* Poison the handle and free the per-fd state; the file is closed. */
106         fd->fd_mds_och.och_fh.cookie = DEAD_HANDLE_MAGIC;
107         file->private_data = NULL;
108         OBD_SLAB_FREE(fd, ll_file_data_slab, sizeof *fd);
113 /* While this returns an error code, fput() the caller does not, so we need
114  * to make every effort to clean up all of our state here.  Also, applications
115  * rarely check close errors and even if an error is returned they will not
116  * re-try the close call.
 *
 * ll_file_release(): VFS release hook.  Flushes the write cache for regular
 * files, closes the OST object(s) via obd_close(), then closes the MDS
 * handle via ll_mdc_close().
 * NOTE(review): extract is missing lines (e.g. `oa`/`rc`/`rc2` declarations,
 * RETURN paths, braces).
 */
118 int ll_file_release(struct inode *inode, struct file *file)
120         struct ll_file_data *fd;
122         struct ll_sb_info *sbi = ll_i2sbi(inode);
123         struct ll_inode_info *lli = ll_i2info(inode);
124         struct lov_stripe_md *lsm = lli->lli_smd;
128         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
129                inode->i_generation, inode);
131         /* don't do anything for / */
132         if (inode->i_sb->s_root == file->f_dentry)
135         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_RELEASE);
136         fd = (struct ll_file_data *)file->private_data;
137         if (!fd) /* no process opened the file after an mcreate */
140         /* we might not be able to get a valid handle on this file
141          * again so we really want to flush our write cache.. */
142         if (S_ISREG(inode->i_mode)) {
143                 filemap_fdatasync(inode->i_mapping);
144                 filemap_fdatawait(inode->i_mapping);
/* Build an obdo identifying the OST object(s) and close them. */
147                 memset(&oa, 0, sizeof(oa));
148                 oa.o_id = lsm->lsm_object_id;
150                 oa.o_valid = OBD_MD_FLTYPE | OBD_MD_FLID;
152                         memcpy(&oa.o_inline, &fd->fd_ost_och, FD_OSTDATA_SIZE);
153                         oa.o_valid |= OBD_MD_FLHANDLE;
155                 rc = obd_close(&sbi->ll_osc_conn, &oa, lsm, NULL);
157                         CERROR("inode %lu object close failed: rc = "
158                                "%d\n", inode->i_ino, rc);
/* Close the MDS side regardless of OST close outcome. */
162         rc2 = ll_mdc_close(&sbi->ll_mdc_conn, inode, file);
/*
 * ll_local_open(): allocate and initialize the per-file-descriptor state
 * (struct ll_file_data) from the MDS open reply carried in the lookup
 * intent, and attach it to file->private_data.
 * NOTE(review): extract is missing the allocation-failure branch the
 * comment below refers to, plus braces/returns.
 */
169 static int ll_local_open(struct file *file, struct lookup_intent *it)
171         struct ptlrpc_request *req = it->it_data;
172         struct ll_file_data *fd;
173         struct mds_body *body;
176         body = lustre_msg_buf (req->rq_repmsg, 1, sizeof (*body));
177         LASSERT (body != NULL);                 /* reply already checked out */
178         LASSERT_REPSWABBED (req, 1);            /* and swabbed down */
180         LASSERT(!file->private_data);
182         OBD_SLAB_ALLOC(fd, ll_file_data_slab, SLAB_KERNEL, sizeof *fd);
183         /* We can't handle this well without reorganizing ll_file_open and
184          * ll_mdc_close, so don't even try right now. */
187         memset(fd, 0, sizeof(*fd));
/* Copy the MDS file handle out of the reply and keep the open request
 * around (for replay — see ll_mdc_close). */
189         memcpy(&fd->fd_mds_och.och_fh, &body->handle, sizeof(body->handle));
190         fd->fd_mds_och.och_req = it->it_data;
191         file->private_data = fd;
/*
 * ll_osc_open(): open the OST object(s) described by `lsm` and store the
 * resulting handle in fd->fd_ost_och; on success clears O_LOV_DELAY_CREATE
 * and copies size/time attributes from the obdo back into the inode.
 * NOTE(review): extract is missing the `oa` allocation, `rc` declaration,
 * and error paths.
 */
196 static int ll_osc_open(struct lustre_handle *conn, struct inode *inode,
197                        struct file *file, struct lov_stripe_md *lsm)
199         struct ll_file_data *fd = file->private_data;
207         oa->o_id = lsm->lsm_object_id;
208         oa->o_mode = S_IFREG;
209         oa->o_valid = (OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLBLOCKS |
210                        OBD_MD_FLMTIME | OBD_MD_FLCTIME);
211         rc = obd_open(conn, oa, lsm, NULL, &fd->fd_ost_och);
215         file->f_flags &= ~O_LOV_DELAY_CREATE;
216         obdo_to_inode(inode, oa, OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
217                                  OBD_MD_FLMTIME | OBD_MD_FLCTIME);
225 /* Caller must hold lli_open_sem to protect lli->lli_smd from changing and
226  * duplicate objects from being created.  We only install lsm to lli_smd if
227  * the mdc open was successful (hence stored stripe MD on MDS), otherwise
228  * other nodes could try to create different objects for the same file.
 *
 * ll_create_obj(): create the OST object(s) for a new file, pack the stripe
 * metadata and store it on the MDS via mdc_setattr(); on failure the
 * freshly created objects are destroyed (out_destroy path) to avoid leaks.
 * NOTE(review): extract is missing lines (e.g. `oa`/`iattr` declarations,
 * labels, braces, returns).
 */
230 static int ll_create_obj(struct lustre_handle *conn, struct inode *inode,
231                          struct file *file, struct lov_stripe_md *lsm)
233         struct ptlrpc_request *req = NULL;
234         struct ll_inode_info *lli = ll_i2info(inode);
235         struct lov_mds_md *lmm = NULL;
238         struct mdc_op_data op_data;
/* NOTE(review): stray double semicolon at end of next line */
239         int rc, err, lmm_size = 0;;
246         oa->o_mode = S_IFREG | 0600;
247         oa->o_id = inode->i_ino;
248         /* Keep these 0 for now, because chown/chgrp does not change the
249          * ownership on the OST, and we don't want to allow BA OST NFS
250          * users to access these objects by mistake. */
253         oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMODE |
254                       OBD_MD_FLUID | OBD_MD_FLGID;
256         rc = obd_create(conn, oa, &lsm, NULL);
258                 CERROR("error creating objects for inode %lu: rc = %d\n",
261                         CERROR("obd_create returned invalid rc %d\n", rc);
266         obdo_to_inode(inode, oa, OBD_MD_FLBLKSZ);
268         LASSERT(lsm && lsm->lsm_object_id);
/* Pack the in-memory stripe MD into on-disk form for the MDS. */
269         rc = obd_packmd(conn, &lmm, lsm);
271                 GOTO(out_destroy, rc);
275         /* Save the stripe MD with this file on the MDS */
276         memset(&iattr, 0, sizeof(iattr));
277         iattr.ia_valid = ATTR_FROM_OPEN;
279         ll_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0);
281         rc = mdc_setattr(&ll_i2sbi(inode)->ll_mdc_conn, &op_data,
282                          &iattr, lmm, lmm_size, &req);
283         ptlrpc_req_finished(req);
285         obd_free_diskmd (conn, &lmm);
287         /* If we couldn't complete mdc_open() and store the stripe MD on the
288          * MDS, we need to destroy the objects now or they will be leaked.
291                 CERROR("error: storing stripe MD for %lu: rc %d\n",
293                 GOTO(out_destroy, rc);
296         lli->lli_maxbytes = lsm->lsm_maxbytes;
/* out_destroy: undo the obd_create above so objects aren't leaked. */
304         obdo_from_inode(oa, inode, OBD_MD_FLTYPE);
305         oa->o_id = lsm->lsm_object_id;
306         oa->o_valid |= OBD_MD_FLID;
307         err = obd_destroy(conn, oa, lsm, NULL);
308         obd_free_memmd(conn, &lsm);
310                 CERROR("error uncreating inode %lu objects: rc %d\n",
315 /* Open a file, and (for the very first open) create objects on the OSTs at
316  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
317  * creation or open until ll_lov_setstripe() ioctl is called.  We grab
318  * lli_open_sem to ensure no other process will create objects, send the
319  * stripe MD to the MDS, or try to destroy the objects if that fails.
321  * If we already have the stripe MD locally then we don't request it in
322  * mdc_open(), by passing a lmm_size = 0.
324  * It is up to the application to ensure no other processes open this file
325  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
326  * used.  We might be able to avoid races of that sort by getting lli_open_sem
327  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
328  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 */
330 extern int ll_it_open_error(int phase, struct lookup_intent *it);
/*
 * ll_file_open(): VFS open hook — see the comment block above for the
 * overall protocol.  Error path closes the MDS handle via ll_mdc_close().
 * NOTE(review): extract is missing lines (`rc` declaration, branches,
 * RETURN paths, braces).
 */
332 int ll_file_open(struct inode *inode, struct file *file)
334         struct ll_sb_info *sbi = ll_i2sbi(inode);
335         struct ll_inode_info *lli = ll_i2info(inode);
336         struct lustre_handle *conn = ll_i2obdconn(inode);
337         struct lookup_intent *it;
338         struct lov_stripe_md *lsm;
342         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
343                inode->i_generation, inode);
345         /* don't do anything for / */
346         if (inode->i_sb->s_root == file->f_dentry)
349         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_OPEN);
350         LL_GET_INTENT(file->f_dentry, it);
351         rc = ll_it_open_error(IT_OPEN_OPEN, it);
355         rc = ll_local_open(file, it);
/* Register the open request so it is replayed on recovery. */
359         mdc_set_open_replay_data(&((struct ll_file_data *)
360                                    file->private_data)->fd_mds_och);
361         if (!S_ISREG(inode->i_mode))
366         if (file->f_flags & O_LOV_DELAY_CREATE) {
367                 CDEBUG(D_INODE, "delaying object creation\n");
370                 down(&lli->lli_open_sem);
372                         rc = ll_create_obj(conn, inode, file, NULL);
373                         up(&lli->lli_open_sem);
377                         CERROR("warning: stripe already set on ino %lu\n",
379                 up(&lli->lli_open_sem);
384         rc = ll_osc_open(conn, inode, file, lsm);
390         ll_mdc_close(&sbi->ll_mdc_conn, inode, file);
395  * really does the getattr on the inode and updates its fields
 *
 * ll_inode_getattr(): issue an OST getattr for the object(s) behind the
 * inode and fold the returned attributes into it.  Samples the farthest
 * cached dirty offset before and after the RPC so a getattr that raced
 * with writeback cannot shrink i_size below locally cached writes.
 * NOTE(review): extract is missing lines (`oa`/`rc`/`bef`/`aft`/`ostdata`
 * declarations, the sync-vs-async selection, braces, returns).
 */
397 int ll_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm,
400         struct ll_sb_info *sbi = ll_i2sbi(inode);
401         struct ll_inode_info *lli = ll_i2info(inode);
402         struct ptlrpc_request_set *set;
405         unsigned long before, after;
413         memset(&oa, 0, sizeof oa);
414         oa.o_id = lsm->lsm_object_id;
416         oa.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE |
417                 OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | OBD_MD_FLMTIME |
420         if (ostdata != NULL) {
421                 memcpy(&oa.o_inline, ostdata, FD_OSTDATA_SIZE);
422                 oa.o_valid |= OBD_MD_FLHANDLE;
425         /* getattr can race with writeback.  we don't want to trust a getattr
426          * that doesn't include the writeback of our farthest cached pages
427          * that it raced with. */
428         /* Now that the OSC knows the cached-page status, it can and should be
429          * adjusting its getattr results to include the maximum cached offset
430          * for its stripe(s). */
432         bef = obd_last_dirty_offset(ll_i2obdconn(inode), lli->lli_smd,
435                 rc = obd_getattr(&sbi->ll_osc_conn, &oa, lsm);
437                 set = ptlrpc_prep_set ();
439                         CERROR ("ENOMEM allocing request set\n");
442                         rc = obd_getattr_async(&sbi->ll_osc_conn, &oa, lsm, set);
444                         rc = ptlrpc_set_wait (set);
445                 ptlrpc_set_destroy (set);
451         aft = obd_last_dirty_offset(ll_i2obdconn(inode), lli->lli_smd,
453         CDEBUG(D_INODE, " %d,%lu -> %d,%lu\n", bef, before, aft, after);
455                 (aft != 0 || after < before) &&
456                 oa.o_size < ((u64)before + 1) << PAGE_CACHE_SHIFT);
458         obdo_to_inode(inode, &oa, (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
459                                    OBD_MD_FLMTIME | OBD_MD_FLCTIME));
460         if (inode->i_blksize < PAGE_CACHE_SIZE)
461                 inode->i_blksize = PAGE_CACHE_SIZE;
463         /* make sure getattr doesn't return a size that causes writeback
464          * to forget about cached writes */
465         if ((aft == 0) && oa.o_size < ((u64)after + 1) << PAGE_CACHE_SHIFT) {
466                 CDEBUG(D_INODE, "cached at %lu, keeping %llu i_size instead "
467                        "of oa "LPU64"\n", after, inode->i_size,
472                 obdo_to_inode(inode, &oa, OBD_MD_FLSIZE);
474         CDEBUG(D_INODE, "objid "LPX64" size %Lu/%Lu blksize %lu\n",
475                lsm->lsm_object_id, inode->i_size, inode->i_size,
481  * some callers, notably truncate, really don't want i_size set based
482  * on the the size returned by the getattr, or lock acquisition in
 *
 * ll_extent_lock_no_validate(): enqueue a DLM extent lock without the
 * i_size-validation work that ll_extent_lock() adds.  A no-op when the fd
 * or superblock is flagged to ignore locking.
 * NOTE(review): `sizeof(extent)` below takes the size of the POINTER, not
 * of struct ldlm_extent — looks wrong (ll_extent_lock passes
 * sizeof(size_lock), an object size); confirm against obd_enqueue's
 * expectations before changing.
 * NOTE(review): extract is missing lines (`rc`/`flags` declarations,
 * early-return bodies, trailing arguments, braces).
 */
485 int ll_extent_lock_no_validate(struct ll_file_data *fd, struct inode *inode,
486                                struct lov_stripe_md *lsm,
487                                int mode, struct ldlm_extent *extent,
488                                struct lustre_handle *lockh)
490         struct ll_sb_info *sbi = ll_i2sbi(inode);
494         LASSERT(lockh->cookie == 0);
496         /* XXX phil: can we do this? won't it screw the file size up? */
497         if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
498             (sbi->ll_flags & LL_SBI_NOLCK))
501         CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
502                inode->i_ino, extent->start, extent->end);
504         rc = obd_enqueue(&sbi->ll_osc_conn, lsm, NULL, LDLM_EXTENT, extent,
505                          sizeof(extent), mode, &flags, ll_extent_lock_callback,
512  * this grabs a lock and manually implements behaviour that makes it look like
513  * the OST is returning the file size with each lock acquisition.
 *
 * ll_extent_lock(): take the extent lock, then (unless a size lock is
 * already cached — LLI_F_HAVE_SIZE_LOCK) do an OST getattr to refresh
 * i_size and try to obd_match() a PR lock covering [i_size, EOF] so the
 * size can be trusted until that lock is cancelled.
 * NOTE(review): extract is missing lines (early returns, the `matched`
 * branch structure, trailing obd_cancel arguments, braces).
 */
515 int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
516                    struct lov_stripe_md *lsm, int mode,
517                    struct ldlm_extent *extent, struct lustre_handle *lockh)
519         struct ll_inode_info *lli = ll_i2info(inode);
520         struct ldlm_extent size_lock;
521         struct lustre_handle match_lockh = {0};
522         int flags, rc, matched;
525         rc = ll_extent_lock_no_validate(fd, inode, lsm, mode, extent, lockh);
529         if (test_bit(LLI_F_HAVE_SIZE_LOCK, &lli->lli_flags))
532         rc = ll_inode_getattr(inode, lsm, fd ? &fd->fd_ost_och : NULL);
534                 ll_extent_unlock(fd, inode, lsm, mode, lockh);
538         size_lock.start = inode->i_size;
539         size_lock.end = OBD_OBJECT_EOF;
541         /* XXX I bet we should be checking the lock ignore flags.. */
542         flags = LDLM_FL_CBPENDING | LDLM_FL_BLOCK_GRANTED | LDLM_FL_MATCH_DATA;
543         matched = obd_match(&ll_i2sbi(inode)->ll_osc_conn, lsm, LDLM_EXTENT,
544                             &size_lock, sizeof(size_lock), LCK_PR, &flags,
545                             inode, &match_lockh);
547         /* hey, alright, we hold a size lock that covers the size we
548          * just found, its not going to change for a while.. */
550         set_bit(LLI_F_HAVE_SIZE_LOCK, &lli->lli_flags);
551         obd_cancel(&ll_i2sbi(inode)->ll_osc_conn, lsm, LCK_PR,
/*
 * ll_extent_unlock(): release an extent lock taken by ll_extent_lock*()
 * via obd_cancel(); a no-op under the same ignore-lock flags the lock
 * paths honor.
 * NOTE(review): extract is missing lines (`rc` declaration, early-return
 * body, braces, return).
 */
558 int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
559                 struct lov_stripe_md *lsm, int mode,
560                 struct lustre_handle *lockh)
562         struct ll_sb_info *sbi = ll_i2sbi(inode);
566         /* XXX phil: can we do this? won't it screw the file size up? */
567         if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
568             (sbi->ll_flags & LL_SBI_NOLCK))
571         rc = obd_cancel(&sbi->ll_osc_conn, lsm, mode, lockh);
/*
 * ll_remove_suid(): strip the setuid bit (and setgid when group-execute is
 * set) from the inode mode on write by a caller without CAP_FSETID —
 * standard security behaviour for writes to suid files.
 * NOTE(review): extract is missing lines (`mode` declaration, the body
 * after the mode clear, braces).
 */
576 static inline void ll_remove_suid(struct inode *inode)
580         /* set S_IGID if S_IXGRP is set, and always set S_ISUID */
581         mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
583         /* was any of the uid bits set? */
584         mode &= inode->i_mode;
585         if (mode && !capable(CAP_FSETID)) {
586                 inode->i_mode &= ~mode;
587                 // XXX careful here - we cannot change the size
/*
 * ll_update_atime(): push an access-time update through ll_inode_setattr()
 * unless atime is unchanged or the inode is read-only/noatime, then update
 * the in-core atime locally.
 * NOTE(review): extract is missing lines (`attr` declaration, the #if/#else
 * split the two code paths presumably sit under).
 */
592 static void ll_update_atime(struct inode *inode)
597         attr.ia_atime = LTIME_S(CURRENT_TIME);
598         attr.ia_valid = ATTR_ATIME;
600         if (inode->i_atime == attr.ia_atime) return;
601         if (IS_RDONLY(inode)) return;
602         if (IS_NOATIME(inode)) return;
604         /* ll_inode_setattr() sets inode->i_atime from attr.ia_atime */
605         ll_inode_setattr(inode, &attr, 0);
607         /* update atime, but don't explicitly write it out just this change */
608         inode->i_atime = CURRENT_TIME;
614  * flush the page cache for an extent as its canceled.  when we're on an
615  * lov we get a lock cancelation for each of the obd locks under the lov
616  * so we have to map the obd's region back onto the stripes in the file
619  * no one can dirty the extent until we've finished our work and they
620  * can enqueue another lock.
622  * XXX this could be asking the inode's dirty tree for info
 *
 * ll_pgcache_remove_extent(): on lock cancel, (1) write back any dirty
 * pages in the cancelled extent when the lock was granted PW, then
 * (2) drop the (now clean) pages from the page cache.  For striped files
 * the per-OST extent is first mapped back to file-page indices via the
 * "lock_to_stripe" obd_get_info call.
 * NOTE(review): extract is missing many lines (`page`/`rc`/`ret`/`stripe`
 * declarations, loop-body braces, #else/#endif lines, continue/break
 * statements) — stride arithmetic below should be read with that in mind.
 */
624 void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm,
625                               struct ldlm_lock *lock)
627         struct ldlm_extent *extent = &lock->l_extent;
628         unsigned long start, end, count, skip, i, j;
633         CDEBUG(D_INODE, "obdo %lu inode %p ["LPU64"->"LPU64"] size: %llu\n",
634                inode->i_ino, inode, extent->start, extent->end, inode->i_size);
/* Convert the byte extent to page indices (end rounded up). */
636         start = extent->start >> PAGE_CACHE_SHIFT;
639         end = (extent->end >> PAGE_CACHE_SHIFT) + 1;
640         if ((end << PAGE_CACHE_SHIFT) < extent->end)
642         if (lsm->lsm_stripe_count > 1) {
645                         struct ldlm_lock *lock;
646                         struct lov_stripe_md *lsm;
647                 } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm };
649                 __u32 vallen = sizeof(stripe);
652                 /* get our offset in the lov */
653                 rc = obd_get_info(ll_i2obdconn(inode), sizeof(key),
654                                   &key, &vallen, &stripe);
656                         CERROR("obd_get_info: rc = %d\n", rc);
659                 LASSERT(stripe < lsm->lsm_stripe_count);
/* Map stripe-local page indices back to file page indices. */
661                 count = lsm->lsm_stripe_size >> PAGE_CACHE_SHIFT;
662                 skip = (lsm->lsm_stripe_count - 1) * count;
663                 start += (start/count * skip) + (stripe * count);
665                 end += (end/count * skip) + (stripe * count);
668         i = (inode->i_size + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
/* The cached size can no longer be trusted once a lock is cancelled. */
670                 clear_bit(LLI_F_HAVE_SIZE_LOCK, &(ll_i2info(inode)->lli_flags));
674         CDEBUG(D_INODE, "start: %lu j: %lu count: %lu skip: %lu end: %lu\n",
675                start, start % count, count, skip, end);
677         /* start writeback on dirty pages in the extent when its PW */
678         for (i = start, j = start % count;
679              lock->l_granted_mode == LCK_PW && i < end; j++, i++) {
684                 /* its unlikely, but give us a chance to bail when we're out */
685                 PGCACHE_WRLOCK(inode->i_mapping);
686                 if (list_empty(&inode->i_mapping->dirty_pages)) {
687                         CDEBUG(D_INODE, "dirty list empty\n");
688                         PGCACHE_WRUNLOCK(inode->i_mapping);
691                 PGCACHE_WRUNLOCK(inode->i_mapping);
696                 page = find_get_page(inode->i_mapping, i);
699                 if (!PageDirty(page) || TryLockPage(page)) {
700                         page_cache_release(page);
703                 if (PageDirty(page)) {
704                         CDEBUG(D_INODE, "writing page %p\n", page);
705                         PGCACHE_WRLOCK(inode->i_mapping);
706                         list_del(&page->list);
707                         list_add(&page->list, &inode->i_mapping->locked_pages);
708                         PGCACHE_WRUNLOCK(inode->i_mapping);
710                         /* this writepage might write out pages outside
711                          * this extent, but that's ok, the pages are only
712                          * still dirty because a lock still covers them */
713                         ClearPageDirty(page);
714 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
715                         ret = inode->i_mapping->a_ops->writepage(page);
717                         ret = inode->i_mapping->a_ops->writepage(page, NULL);
724                 page_cache_release(page);
728         /* our locks are page granular thanks to osc_enqueue, we invalidate the
730         LASSERT((extent->start & ~PAGE_CACHE_MASK) == 0);
731         LASSERT(((extent->end+1) & ~PAGE_CACHE_MASK) == 0);
/* Second pass: drop the now-clean pages from the page cache. */
732         for (i = start, j = start % count ; i < end ; j++, i++) {
737                 PGCACHE_WRLOCK(inode->i_mapping);
738                 if (list_empty(&inode->i_mapping->dirty_pages) &&
739                      list_empty(&inode->i_mapping->clean_pages) &&
740                      list_empty(&inode->i_mapping->locked_pages)) {
741                         CDEBUG(D_INODE, "nothing left\n");
742                         PGCACHE_WRUNLOCK(inode->i_mapping);
745                 PGCACHE_WRUNLOCK(inode->i_mapping);
748                 page = find_get_page(inode->i_mapping, i);
751                 CDEBUG(D_INODE, "dropping page %p at %lu\n", page, page->index);
753                 if (page->mapping) /* might have raced */
754 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
755                         truncate_complete_page(page);
757                         truncate_complete_page(page->mapping, page);
760                 page_cache_release(page);
/*
 * ll_extent_lock_callback(): DLM blocking/cancel callback for extent locks.
 * LDLM_CB_BLOCKING: cancel our lock so the conflicting request can proceed.
 * LDLM_CB_CANCELING: flush/drop cached pages covered by the cancelled lock
 * via ll_pgcache_remove_extent().
 * NOTE(review): extract is missing lines (`rc` declaration, the `switch`
 * statement itself, breaks, default case, return).
 */
765 int ll_extent_lock_callback(struct ldlm_lock *lock, struct ldlm_lock_desc *new,
766                             void *data, int flag)
768         struct inode *inode = data;
769         struct ll_inode_info *lli = ll_i2info(inode);
770         struct lustre_handle lockh = { 0 };
774         LASSERT(inode != NULL);
777         case LDLM_CB_BLOCKING:
778                 ldlm_lock2handle(lock, &lockh);
779                 rc = ldlm_cli_cancel(&lockh);
781                         CERROR("ldlm_cli_cancel failed: %d\n", rc);
783         case LDLM_CB_CANCELING:
784                 /* FIXME: we could be given 'canceling intents' so that we
785                  * could know to write-back or simply throw away the pages
786                  * based on if the cancel comes from a desire to, say,
787                  * read or truncate.. */
/* Sanity checks against obviously-bogus pointers before dereferencing. */
788                 LASSERT((unsigned long)inode > 0x1000);
789                 LASSERT((unsigned long)lli > 0x1000);
790                 LASSERT((unsigned long)lli->lli_smd > 0x1000);
791                 ll_pgcache_remove_extent(inode, lli->lli_smd, lock);
/*
 * ll_file_read(): VFS read — take a PR extent lock from *ppos to EOF,
 * register the read extent on the inode (so ll_readpage can see which
 * pages are covered), run generic_file_read(), then unregister and unlock.
 * NOTE(review): extract is missing lines (`ppos` parameter, `err`/`retval`
 * declarations, zero-count early return, braces).
 */
800 static ssize_t ll_file_read(struct file *filp, char *buf, size_t count,
803         struct ll_file_data *fd = filp->private_data;
804         struct inode *inode = filp->f_dentry->d_inode;
805         struct ll_inode_info *lli = ll_i2info(inode);
806         struct lov_stripe_md *lsm = lli->lli_smd;
807         struct lustre_handle lockh = { 0 };
808         struct ll_read_extent rextent;
812         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
813                inode->i_ino, inode->i_generation, inode, count, *ppos);
815         /* "If nbyte is 0, read() will return 0 and have no other results."
816          *                      -- Single Unix Spec */
820         lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_READ_BYTES,
822         /* grab a -> eof extent to push extending writes out of node's caches
823          * so we can see them at the getattr after lock acquisition. this will
824          * turn into a seperate [*ppos + count, EOF] 'size intent' lock attempt
826         rextent.re_extent.start = *ppos;
827         rextent.re_extent.end = OBD_OBJECT_EOF;
829         err = ll_extent_lock(fd, inode, lsm, LCK_PR, &rextent.re_extent,&lockh);
833         /* XXX tell ll_readpage what pages have a PR lock.. */
834         rextent.re_task = current;
835         spin_lock(&lli->lli_read_extent_lock);
836         list_add(&rextent.re_lli_item, &lli->lli_read_extents);
837         spin_unlock(&lli->lli_read_extent_lock);
839         CDEBUG(D_INFO, "Reading inode %lu, "LPSZ" bytes, offset %Ld\n",
840                inode->i_ino, count, *ppos);
841         retval = generic_file_read(filp, buf, count, ppos);
/* rextent lives on this stack frame — must unlink before returning. */
843         spin_lock(&lli->lli_read_extent_lock);
844         list_del(&rextent.re_lli_item);
845         spin_unlock(&lli->lli_read_extent_lock);
848         ll_extent_unlock(fd, inode, lsm, LCK_PR, &lockh);
853  * Write to a file (through the page cache).
 *
 * ll_file_write(): take a PW extent lock over the write range (to EOF for
 * O_APPEND; skipping size validation for fully page-aligned writes),
 * enforce the per-file maxbytes limit (SIGXFSZ/-EFBIG per POSIX), then run
 * generic_file_write() and unlock.
 * NOTE(review): extract is missing lines (return type line, `err`/`retval`
 * declarations, else branches, out: label, braces).
 */
856 ll_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
858         struct ll_file_data *fd = file->private_data;
859         struct inode *inode = file->f_dentry->d_inode;
860         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
861         struct lustre_handle lockh = { 0 };
862         struct ldlm_extent extent;
863         loff_t maxbytes = ll_file_maxbytes(inode);
866         char should_validate = 1;
868         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
869                inode->i_ino, inode->i_generation, inode, count, *ppos);
872          * sleep doing some writeback work of this mount's dirty data
873          * if the VM thinks we're low on memory.. other dirtying code
874          * paths should think about doing this, too, but they should be
875          * careful not to hold locked pages while they do so.  like
876          * ll_prepare_write.  *cough*
878         LL_CHECK_DIRTY(inode->i_sb);
880         /* POSIX, but surprised the VFS doesn't check this already */
884         if (file->f_flags & O_APPEND) {
886                 extent.end = OBD_OBJECT_EOF;
888                 extent.start = *ppos;
889                 extent.end = *ppos + count - 1;
890                 /* we really don't care what i_size is if we're doing
891                  * fully page aligned writes */
892                 if ((*ppos & ~PAGE_CACHE_MASK) == 0 &&
893                     (count & ~PAGE_CACHE_MASK) == 0)
898                 err = ll_extent_lock(fd, inode, lsm, LCK_PW, &extent, &lockh);
900                 err = ll_extent_lock_no_validate(fd, inode, lsm, LCK_PW,
905         /* this is ok, g_f_w will overwrite this under i_sem if it races
906          * with a local truncate, it just makes our maxbyte checking easier */
907         if (file->f_flags & O_APPEND)
908                 *ppos = inode->i_size;
910         if (*ppos >= maxbytes) {
911                 if (count || *ppos > maxbytes) {
912                         send_sig(SIGXFSZ, current, 0);
913                         GOTO(out, retval = -EFBIG);
916         if (*ppos + count > maxbytes)
917                 count = maxbytes - *ppos;
919         CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
920                inode->i_ino, count, *ppos);
922         /* generic_file_write handles O_APPEND after getting i_sem */
923         retval = generic_file_write(file, buf, count, ppos);
927         lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_WRITE_BYTES,
929         ll_extent_unlock(fd, inode, lsm, LCK_PW, &lockh);
/*
 * ll_lov_setstripe(): LL_IOC_LOV_SETSTRIPE handler.  Under lli_open_sem:
 * if a stripe already exists, just do the delayed OSC open; otherwise
 * unpack the user-supplied striping via obd_iocontrol(), create the OST
 * objects, and open them.
 * NOTE(review): extract is missing lines (`arg` parameter, `rc`
 * declaration, the lsm-exists test, error paths, braces).
 */
933 static int ll_lov_setstripe(struct inode *inode, struct file *file,
936         struct ll_inode_info *lli = ll_i2info(inode);
937         struct lustre_handle *conn = ll_i2obdconn(inode);
938         struct lov_stripe_md *lsm;
942         down(&lli->lli_open_sem);
945                 up(&lli->lli_open_sem);
946                 CERROR("stripe already exists for ino %lu\n", inode->i_ino);
947                 /* If we haven't already done the open, do so now */
948                 if (file->f_flags & O_LOV_DELAY_CREATE) {
949                         int rc2 = ll_osc_open(conn, inode, file, lsm);
957         rc = obd_iocontrol(LL_IOC_LOV_SETSTRIPE, conn, 0, &lsm, (void *)arg);
959                 up(&lli->lli_open_sem);
962         rc = ll_create_obj(conn, inode, file, lsm);
963         up(&lli->lli_open_sem);
966                 obd_free_memmd(conn, &lsm);
969         rc = ll_osc_open(conn, inode, file, lli->lli_smd);
/*
 * ll_lov_getstripe(): LL_IOC_LOV_GETSTRIPE handler — copy the inode's
 * striping info out to user space via obd_iocontrol().
 * NOTE(review): extract is missing lines (likely a !lsm check and braces).
 */
973 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
975         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
976         struct lustre_handle *conn = ll_i2obdconn(inode);
981         return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, conn, 0, lsm, (void *)arg);
/*
 * ll_file_ioctl(): VFS ioctl dispatch — handles the llite GET/SET/CLR
 * flags ioctls and LOV stripe get/set locally; everything else is passed
 * down through obd_iocontrol().
 * NOTE(review): extract is missing lines (`arg` parameter, `flags`
 * declaration, the `switch` statement itself, returns, braces).
 */
984 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
987         struct ll_file_data *fd = file->private_data;
988         struct lustre_handle *conn;
990         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%u\n", inode->i_ino,
991                inode->i_generation, inode, cmd);
993         if (_IOC_TYPE(cmd) == 'T') /* tty ioctls */
996         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_IOCTL);
998         case LL_IOC_GETFLAGS:
999                 /* Get the current value of the file flags */
1000                 return put_user(fd->fd_flags, (int *)arg);
1001         case LL_IOC_SETFLAGS:
1002         case LL_IOC_CLRFLAGS:
1003                 /* Set or clear specific file flags */
1004                 /* XXX This probably needs checks to ensure the flags are
1005                  *     not abused, and to handle any flag side effects.
1007                 if (get_user(flags, (int *) arg))
1010                 if (cmd == LL_IOC_SETFLAGS)
1011                         fd->fd_flags |= flags;
1013                         fd->fd_flags &= ~flags;
1015         case LL_IOC_LOV_SETSTRIPE:
1016                 return ll_lov_setstripe(inode, file, arg);
1017         case LL_IOC_LOV_GETSTRIPE:
1018                 return ll_lov_getstripe(inode, arg);
1020         /* We need to special case any other ioctls we want to handle,
1021          * to send them to the MDS/OST as appropriate and to properly
1022          * network encode the arg field.
1023         case EXT2_IOC_GETFLAGS:
1024         case EXT2_IOC_SETFLAGS:
1025         case EXT2_IOC_GETVERSION_OLD:
1026         case EXT2_IOC_GETVERSION_NEW:
1027         case EXT2_IOC_SETVERSION_OLD:
1028         case EXT2_IOC_SETVERSION_NEW:
/* Default case: forward unrecognized ioctls to the OBD layer. */
1031                 conn = ll_i2obdconn(inode);
1032                 return obd_iocontrol(cmd, conn, 0, NULL, (void *)arg);
/*
 * ll_file_seek(): VFS llseek.  SEEK_END needs an up-to-date i_size, so it
 * takes a whole-file PR extent lock first (which refreshes i_size via
 * ll_extent_lock) and drops it at the end.  Offsets are range-checked
 * against ll_file_maxbytes().
 * NOTE(review): extract is missing lines (`err`/`retval` declarations,
 * #else branch for f_version, return paths, braces).
 */
1036 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
1038         struct inode *inode = file->f_dentry->d_inode;
1039         struct ll_file_data *fd = file->private_data;
1040         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1041         struct lustre_handle lockh = {0};
1044         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),to=%llu\n", inode->i_ino,
1045                inode->i_generation, inode,
1046                offset + ((origin==2) ? inode->i_size : file->f_pos));
1048         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_LLSEEK);
1049         if (origin == 2) { /* SEEK_END */
1051                 struct ldlm_extent extent = {0, OBD_OBJECT_EOF};
1052                 err = ll_extent_lock(fd, inode, lsm, LCK_PR, &extent, &lockh);
1053                 if (err != ELDLM_OK)
1056                 offset += inode->i_size;
1057         } else if (origin == 1) { /* SEEK_CUR */
1058                 offset += file->f_pos;
1062         if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
1063                 if (offset != file->f_pos) {
1064                         file->f_pos = offset;
1065 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
1067                         file->f_version = ++event;
/* Drop the PR lock taken for the SEEK_END case. */
1074                 ll_extent_unlock(fd, inode, lsm, LCK_PR, &lockh);
/*
 * ll_fsync(): VFS fsync — kick off writeback of the inode's mapping and
 * wait for it to complete.
 * NOTE(review): extract is missing lines (`ret` declaration, the check of
 * the fdatasync result before fdatawait, return).
 */
1078 int ll_fsync(struct file *file, struct dentry *dentry, int data)
1081         struct inode *inode = dentry->d_inode;
1083         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
1084                inode->i_generation, inode);
1086         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_FSYNC);
1088          * filemap_fdata{sync,wait} are also called at PW lock cancelation so
1089          * we know that they can only find data to writeback here if we are
1090          * still holding the PW lock that covered the dirty pages.  XXX we
1091          * should probably get a reference on it, though, just to be clear.
1093         ret = filemap_fdatasync(dentry->d_inode->i_mapping);
1095                 ret = filemap_fdatawait(dentry->d_inode->i_mapping);
/*
 * ll_inode_revalidate(): refresh inode attributes.  If no MDS lock covers
 * the dentry, do an mdc_getattr() RPC (requesting EA/stripe data for
 * regular files), unpack any returned stripe MD, and fold the reply into
 * the inode; then, for striped regular files, take and drop a whole-file
 * PR extent lock so i_size is brought up to date for stat().
 * NOTE(review): extract is missing lines (`fid` declaration, early
 * returns, GOTO labels, braces) — control flow below is partial.
 */
1100 int ll_inode_revalidate(struct dentry *dentry)
1102         struct inode *inode = dentry->d_inode;
1103         struct lov_stripe_md *lsm = NULL;
1107                 CERROR("REPORT THIS LINE TO PETER\n");
1110         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
1111                inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
1112 #if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,5,0))
1113         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_REVALIDATE);
1116         /* this is very tricky.  it is unsafe to call ll_have_md_lock
1117            when we have a referenced lock: because it may cause an RPC
1118            below when the lock is marked CB_PENDING.  That RPC may not
1119            go out because someone else may be in another RPC waiting for
1121         if (!(dentry->d_it && dentry->d_it->it_lock_mode) &&
1122             !ll_have_md_lock(dentry)) {
1123                 struct ptlrpc_request *req = NULL;
1124                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
1126                 struct mds_body *body;
1127                 struct lov_mds_md *lmm;
1128                 unsigned long valid = 0;
1129                 int eadatalen = 0, rc;
1131                 /* Why don't we update all valid MDS fields here, if we're
1132                  * doing an RPC anyways? -phil */
1133                 if (S_ISREG(inode->i_mode)) {
1134                         eadatalen = obd_size_diskmd(&sbi->ll_osc_conn, NULL);
1135                         valid |= OBD_MD_FLEASIZE;
1137                 ll_inode2fid(&fid, inode);
1138                 rc = mdc_getattr(&sbi->ll_mdc_conn, &fid,
1139                                  valid, eadatalen, &req);
1141                         CERROR("failure %d inode %lu\n", rc, inode->i_ino);
1145                 body = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*body));
1146                 LASSERT (body != NULL);         /* checked by mdc_getattr() */
1147                 LASSERT_REPSWABBED (req, 0);    /* swabbed by mdc_getattr() */
/* The OSTs, not the MDS, are authoritative for regular-file size. */
1149                 if (S_ISREG(inode->i_mode) &&
1150                     (body->valid & (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS))) {
1151                         CERROR("MDS sent back size for regular file\n");
1152                         body->valid &= ~(OBD_MD_FLSIZE | OBD_MD_FLBLOCKS);
1155                 /* XXX Too paranoid? */
1156                 if ((body->valid ^ valid) & OBD_MD_FLEASIZE)
1157                         CERROR("Asked for %s eadata but got %s\n",
1158                                (valid & OBD_MD_FLEASIZE) ? "some" : "no",
1159                                (body->valid & OBD_MD_FLEASIZE) ? "some":"none");
1161                 if (S_ISREG(inode->i_mode) &&
1162                     (body->valid & OBD_MD_FLEASIZE)) {
1163                         if (body->eadatasize == 0) { /* no EA data */
1164                                 CERROR("OBD_MD_FLEASIZE set but no data\n");
1167                         /* Only bother with this if inode's lsm not set? */
1168                         lmm = lustre_msg_buf(req->rq_repmsg,1,body->eadatasize);
1169                         LASSERT(lmm != NULL);        /* mdc_getattr() checked */
1170                         LASSERT_REPSWABBED(req, 1);  /* mdc_getattr() swabbed */
1172                         rc = obd_unpackmd (&sbi->ll_osc_conn,
1173                                            &lsm, lmm, body->eadatasize);
1175                                 CERROR("Error %d unpacking eadata\n", rc);
1176                                 ptlrpc_req_finished(req);
1179                         LASSERT(rc >= sizeof(*lsm));
1182                 ll_update_inode(inode, body, lsm);
/* Free the unpacked lsm if ll_update_inode didn't adopt it. */
1183                 if (lsm != NULL && ll_i2info(inode)->lli_smd != lsm)
1184                         obd_free_memmd(&sbi->ll_osc_conn, &lsm);
1186                 ptlrpc_req_finished(req);
1189         lsm = ll_i2info(inode)->lli_smd;
1190         if (!lsm)       /* object not yet allocated, don't validate size */
1194          * unfortunately stat comes in through revalidate and we don't
1195          * differentiate this use from initial instantiation.  we're
1196          * also being wildly conservative and flushing write caches
1197          * so that stat really returns the proper size.
1200                 struct ldlm_extent extent = {0, OBD_OBJECT_EOF};
1201                 struct lustre_handle lockh = {0};
1204                 err = ll_extent_lock(NULL, inode, lsm, LCK_PR, &extent, &lockh);
1205                 if (err != ELDLM_OK)
1208                 ll_extent_unlock(NULL, inode, lsm, LCK_PR, &lockh);
1213 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
/*
 * ll_getattr(): 2.5+ VFS getattr hook — revalidate the inode then copy
 * its fields into the kstat structure.
 * NOTE(review): extract is missing lines (the `stat` parameter, `res`
 * declaration, #else for the dev field, blocks/blksize fields, return).
 */
1214 static int ll_getattr(struct vfsmount *mnt, struct dentry *de,
1218         struct inode *inode = de->d_inode;
1220         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_GETATTR);
1221         res = ll_inode_revalidate(de);
1224 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
1225         stat->dev = inode->i_dev;
1227         stat->ino = inode->i_ino;
1228         stat->mode = inode->i_mode;
1229         stat->nlink = inode->i_nlink;
1230         stat->uid = inode->i_uid;
1231         stat->gid = inode->i_gid;
1232         stat->rdev = kdev_t_to_nr(inode->i_rdev);
1233         stat->atime = inode->i_atime;
1234         stat->mtime = inode->i_mtime;
1235         stat->ctime = inode->i_ctime;
1236         stat->size = inode->i_size;
/*
 * VFS operation tables wiring the llite methods above into the kernel
 * (GNU-style designated initializers, as used in 2.4 kernels).
 * NOTE(review): extract is missing lines — e.g. the `read` entry of
 * ll_file_operations and the closing `};` of each initializer.
 */
1241 struct file_operations ll_file_operations = {
1243         write:          ll_file_write,
1244         ioctl:          ll_file_ioctl,
1246         release:        ll_file_release,
1247         mmap:           generic_file_mmap,
1248         llseek:         ll_file_seek,
1252 struct inode_operations ll_file_inode_operations = {
1253         setattr_raw:    ll_setattr_raw,
1254         setattr:        ll_setattr,
1255         truncate:       ll_truncate,
1256 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
1257         getattr:        ll_getattr,
1259         revalidate:     ll_inode_revalidate,
/* Table for special files (no data methods, metadata ops only). */
1263 struct inode_operations ll_special_inode_operations = {
1264         setattr_raw:    ll_setattr_raw,
1265         setattr:        ll_setattr,
1266 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
1267         getattr:        ll_getattr,
1269         revalidate:     ll_inode_revalidate,