1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (c) 2002, 2003 Cluster File Systems, Inc.
5 * Author: Peter Braam <braam@clusterfs.com>
6 * Author: Phil Schwan <phil@clusterfs.com>
7 * Author: Andreas Dilger <adilger@clusterfs.com>
9 * This file is part of Lustre, http://www.lustre.org.
11 * Lustre is free software; you can redistribute it and/or
12 * modify it under the terms of version 2 of the GNU General Public
13 * License as published by the Free Software Foundation.
15 * Lustre is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with Lustre; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 #define DEBUG_SUBSYSTEM S_LLITE
26 #include <linux/lustre_dlm.h>
27 #include <linux/lustre_lite.h>
28 #include <linux/obd_lov.h> /* for lov_mds_md_size() in lov_setstripe() */
29 #include <linux/random.h>
30 #include <linux/pagemap.h>
31 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
32 #include <linux/lustre_compat25.h>
35 #include "llite_internal.h"
/*
 * Close the MDS open handle held in file->private_data and tear down the
 * per-open state.
 *
 * Steps visible in this excerpt:
 *  - send mdc_close() for the handle in fd->fd_mds_och.och_fh;
 *  - clear rq_replay on the saved open request (under its rq_lock);
 *  - if the open request has a transno, copy it to the close request when
 *    the close has none of its own and retain the close for replay;
 *  - drop request references, poison the handle cookie with
 *    DEAD_HANDLE_MAGIC, clear file->private_data and free the ll_file_data.
 *
 * NOTE(review): several original lines (rest of the parameter list, some
 * declarations, the branch structure, the return) are not present in this
 * excerpt, so which statements sit on which branch should be confirmed
 * against the full file.  The mdc_conn parameter is not referenced by any
 * visible line; mdc_close() is given ll_i2sbi(inode)->ll_mdc_conn instead.
 */
37 static int ll_mdc_close(struct lustre_handle *mdc_conn, struct inode *inode,
40 struct ll_file_data *fd = file->private_data;
41 struct ptlrpc_request *req = NULL;
43 struct obd_import *imp;
47 /* Complete the open request and remove it from replay list */
48 rc = mdc_close(&ll_i2sbi(inode)->ll_mdc_conn, inode->i_ino,
49 inode->i_mode, &fd->fd_mds_och.och_fh, &req);
51 CERROR("inode %lu close failed: rc = %d\n", inode->i_ino, rc);
/* The import is taken from the saved open request. */
53 imp = fd->fd_mds_och.och_req->rq_import;
55 spin_lock_irqsave(&imp->imp_lock, flags);
57 DEBUG_REQ(D_HA, fd->fd_mds_och.och_req, "matched open req %p",
58 fd->fd_mds_och.och_req);
60 /* We held on to the request for replay until we saw a close for that
61 * file. Now that we've closed it, it gets replayed on the basis of
62 * its transno only. */
63 spin_lock (&fd->fd_mds_och.och_req->rq_lock);
64 fd->fd_mds_och.och_req->rq_replay = 0;
65 spin_unlock (&fd->fd_mds_och.och_req->rq_lock);
67 if (fd->fd_mds_och.och_req->rq_transno) {
68 /* This open created a file, so it needs replay as a
69 * normal transaction now. Our reference to it is now
70 * effectively owned by the imp_replay_list, and it'll
71 * be committed just like other transno-having
72 * requests from here on out. */
74 /* We now retain this close request, so that it is
75 * replayed if the open is replayed. We duplicate the
76 * transno, so that we get freed at the right time,
77 * and rely on the difference in xid to keep
78 * everything ordered correctly.
80 * But! If this close was already given a transno
81 * (because it caused real unlinking of an
82 * open-unlinked file, f.e.), then we'll be ordered on
83 * the basis of that and we don't need to do anything
85 if (!req->rq_transno) {
86 req->rq_transno = fd->fd_mds_och.och_req->rq_transno;
87 ptlrpc_retain_replayable_request(req, imp);
89 spin_unlock_irqrestore(&imp->imp_lock, flags);
91 /* Should we free_committed now? we always free before
92 * replay, so it's probably a wash. We could check to
93 * see if the fd_req should already be committed, in
94 * which case we can avoid the whole retain_replayable
97 /* No transno means that we can just drop our ref. */
98 spin_unlock_irqrestore(&imp->imp_lock, flags);
100 ptlrpc_req_finished(fd->fd_mds_och.och_req);
102 /* Do this after the fd_req->rq_transno check, because we don't want
103 * to bounce off zero references. */
104 ptlrpc_req_finished(req);
/* Poison the handle before the structure goes back to the slab. */
105 fd->fd_mds_och.och_fh.cookie = DEAD_HANDLE_MAGIC;
106 file->private_data = NULL;
107 OBD_SLAB_FREE(fd, ll_file_data_slab, sizeof *fd);
112 /* While this returns an error code, fput() the caller does not, so we need
113 * to make every effort to clean up all of our state here. Also, applications
114 * rarely check close errors and even if an error is returned they will not
115 * re-try the close call.
/*
 * Release handler for a file.
 *
 * Visible behaviour: logs the call, tests for the filesystem root dentry
 * and for a missing ll_file_data (the action taken in either case is not
 * visible here), and counts the operation in LPROC_LL_RELEASE.  For a
 * regular file that has a stripe MD the inode is written back with
 * write_inode_now(), an obdo is filled from the inode and the saved OST
 * handle, and obd_close() is issued.  The MDS handle is then closed through
 * ll_mdc_close(), whose result is kept in rc2.
 *
 * NOTE(review): the return statements and the way rc and rc2 are combined
 * are not present in this excerpt.
 */
117 int ll_file_release(struct inode *inode, struct file *file)
119 struct ll_file_data *fd;
121 struct ll_sb_info *sbi = ll_i2sbi(inode);
122 struct ll_inode_info *lli = ll_i2info(inode);
123 struct lov_stripe_md *lsm = lli->lli_smd;
127 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
128 inode->i_generation, inode);
130 /* don't do anything for / */
131 if (inode->i_sb->s_root == file->f_dentry)
134 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_RELEASE);
135 fd = (struct ll_file_data *)file->private_data;
136 if (!fd) /* no process opened the file after an mcreate */
139 /* we might not be able to get a valid handle on this file
140 * again so we really want to flush our write cache.. */
141 if (S_ISREG(inode->i_mode) && lsm) {
142 write_inode_now(inode, 0);
/* Describe the object to the OST: type and timestamps from the inode,
 * plus the OST open handle saved in the ll_file_data. */
143 obdo_from_inode(&oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
144 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
145 memcpy(obdo_handle(&oa), &fd->fd_ost_och, FD_OSTDATA_SIZE);
146 oa.o_valid |= OBD_MD_FLHANDLE;
148 rc = obd_close(&sbi->ll_osc_conn, &oa, lsm, NULL);
150 CERROR("inode %lu object close failed: rc %d\n",
/* ll_mdc_close() clears file->private_data and frees fd (see its body). */
154 rc2 = ll_mdc_close(&sbi->ll_mdc_conn, inode, file);
/*
 * Build the per-open ll_file_data from an already completed open intent.
 *
 * The MDS reply body is read from reply buffer 1 of the intent's request
 * (it->it_data), asserted to be present and swabbed, and its handle is
 * copied into a freshly allocated, zeroed ll_file_data.  The request pointer
 * is saved in fd_mds_och.och_req and the structure is attached to
 * file->private_data, which must be NULL on entry.
 *
 * NOTE(review): the handling of a failed OBD_SLAB_ALLOC and the return
 * statement are not present in this excerpt.
 */
161 static int ll_local_open(struct file *file, struct lookup_intent *it)
163 struct ptlrpc_request *req = it->it_data;
164 struct ll_file_data *fd;
165 struct mds_body *body;
168 body = lustre_msg_buf (req->rq_repmsg, 1, sizeof (*body));
169 LASSERT (body != NULL); /* reply already checked out */
170 LASSERT_REPSWABBED (req, 1); /* and swabbed down */
172 LASSERT(!file->private_data);
174 OBD_SLAB_ALLOC(fd, ll_file_data_slab, SLAB_KERNEL, sizeof *fd);
175 /* We can't handle this well without reorganizing ll_file_open and
176 * ll_mdc_close, so don't even try right now. */
179 memset(fd, 0, sizeof(*fd));
/* Keep the MDS file handle and the open request; ll_mdc_close() uses
 * both when the file is closed. */
181 memcpy(&fd->fd_mds_och.och_fh, &body->handle, sizeof(body->handle));
182 fd->fd_mds_och.och_req = it->it_data;
183 file->private_data = fd;
/*
 * Open the file's objects through the OSC/LOV connection.
 *
 * An obdo is filled with the object id from the stripe MD, S_IFREG mode and
 * the type taken from the inode, then obd_open() is called with the result
 * handle stored in fd->fd_ost_och.  The visible tail clears
 * O_LOV_DELAY_CREATE from file->f_flags and refreshes the inode's blocks
 * and timestamps from the obdo.
 *
 * NOTE(review): the allocation of oa, the check of rc and the return are
 * not present in this excerpt.
 */
188 static int ll_osc_open(struct lustre_handle *conn, struct inode *inode,
189 struct file *file, struct lov_stripe_md *lsm)
191 struct ll_file_data *fd = file->private_data;
199 oa->o_id = lsm->lsm_object_id;
200 oa->o_mode = S_IFREG;
201 oa->o_valid = OBD_MD_FLID;
202 obdo_from_inode(oa, inode, OBD_MD_FLTYPE);
203 rc = obd_open(conn, oa, lsm, NULL, &fd->fd_ost_och);
207 file->f_flags &= ~O_LOV_DELAY_CREATE;
208 obdo_refresh_inode(inode, oa, (OBD_MD_FLBLOCKS | OBD_MD_FLATIME |
209 OBD_MD_FLMTIME | OBD_MD_FLCTIME));
216 /* Caller must hold lli_open_sem to protect lli->lli_smd from changing and
217 * duplicate objects from being created. We only install lsm to lli_smd if
218 * the mdc open was successful (hence stored stripe MD on MDS), otherwise
219 * other nodes could try to create different objects for the same file.
/*
 * Create the OST objects for a regular file and record the resulting
 * stripe MD on the MDS.
 *
 * Visible sequence:
 *  - fill an obdo from the inode (id = i_ino, generation, mode
 *    S_IFREG | 0600, timestamps, and size when i_size is non-zero);
 *  - obd_create() to create the objects and obtain lsm;
 *  - obd_packmd() to get the on-disk form (lmm) of the stripe MD;
 *  - mdc_setattr() with ATTR_FROM_OPEN to store lmm and the log cookies;
 *  - on success, copy lsm_maxbytes into lli_maxbytes;
 *  - on the out_destroy path, cancel the log cookies, obd_destroy() the
 *    objects and free the in-memory stripe MD.
 *
 * NOTE(review): labels, several condition lines, the assignment of
 * lli->lli_smd and the return are not present in this excerpt.
 */
221 static int ll_create_obj(struct lustre_handle *conn, struct inode *inode,
222 struct file *file, struct lov_stripe_md *lsm)
224 struct ptlrpc_request *req = NULL;
225 struct ll_inode_info *lli = ll_i2info(inode);
226 struct lov_mds_md *lmm = NULL;
229 struct mdc_op_data op_data;
230 struct obd_trans_info oti = { 0 };
231 int rc, err, lmm_size = 0;
238 LASSERT(S_ISREG(inode->i_mode));
239 oa->o_mode = S_IFREG | 0600;
240 oa->o_id = inode->i_ino;
241 oa->o_generation = inode->i_generation;
242 /* Keep these 0 for now, because chown/chgrp does not change the
243 * ownership on the OST, and we don't want to allow BA OST NFS
244 * users to access these objects by mistake. */
247 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGENER | OBD_MD_FLTYPE |
248 OBD_MD_FLMODE | OBD_MD_FLUID | OBD_MD_FLGID;
249 #ifdef ENABLE_ORPHANS
250 oa->o_valid |= OBD_MD_FLCOOKIE;
/* The size is only sent to the OST when the inode already has one. */
253 obdo_from_inode(oa, inode, OBD_MD_FLTYPE|OBD_MD_FLATIME|OBD_MD_FLMTIME|
254 OBD_MD_FLCTIME | (inode->i_size ? OBD_MD_FLSIZE : 0));
256 rc = obd_create(conn, oa, &lsm, &oti);
258 CERROR("error creating objects for inode %lu: rc = %d\n",
261 CERROR("obd_create returned invalid rc %d\n", rc);
267 LASSERT(lsm && lsm->lsm_object_id);
268 rc = obd_packmd(conn, &lmm, lsm);
270 GOTO(out_destroy, rc);
274 /* Save the stripe MD with this file on the MDS */
275 memset(&iattr, 0, sizeof(iattr));
276 iattr.ia_valid = ATTR_FROM_OPEN;
278 ll_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0);
281 #warning FIXME: next line is for debugging purposes only
282 obd_log_cancel(&ll_i2sbi(inode)->ll_osc_conn, lsm, oti.oti_numcookies,
283 oti.oti_logcookies, OBD_LLOG_FL_SENDNOW);
286 rc = mdc_setattr(&ll_i2sbi(inode)->ll_mdc_conn, &op_data, &iattr,
287 lmm, lmm_size, oti.oti_logcookies,
288 oti.oti_numcookies * sizeof(oti.oti_onecookie), &req);
289 ptlrpc_req_finished(req);
291 obd_free_diskmd(conn, &lmm);
293 /* If we couldn't complete mdc_open() and store the stripe MD on the
294 * MDS, we need to destroy the objects now or they will be leaked.
297 CERROR("error: storing stripe MD for %lu: rc %d\n",
299 GOTO(out_destroy, rc);
302 lli->lli_maxbytes = lsm->lsm_maxbytes;
306 oti_free_cookies(&oti);
311 oa->o_id = lsm->lsm_object_id;
312 oa->o_valid = OBD_MD_FLID;
313 obdo_from_inode(oa, inode, OBD_MD_FLTYPE);
315 err = obd_log_cancel(conn, lsm, oti.oti_numcookies, oti.oti_logcookies,
316 OBD_LLOG_FL_SENDNOW);
318 CERROR("error cancelling inode %lu log cookies: rc %d\n",
321 err = obd_destroy(conn, oa, lsm, NULL);
322 obd_free_memmd(conn, &lsm);
324 CERROR("error uncreating inode %lu objects: rc %d\n",
329 /* Open a file, and (for the very first open) create objects on the OSTs at
330 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
331 * creation or open until ll_lov_setstripe() ioctl is called. We grab
332 * lli_open_sem to ensure no other process will create objects, send the
333 * stripe MD to the MDS, or try to destroy the objects if that fails.
335 * If we already have the stripe MD locally then we don't request it in
336 * mdc_open(), by passing a lmm_size = 0.
338 * It is up to the application to ensure no other processes open this file
339 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
340 * used. We might be able to avoid races of that sort by getting lli_open_sem
341 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
342 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * Open handler for a file.
 *
 * Visible sequence: log the call, test for the filesystem root dentry,
 * count the operation, check the open intent with ll_it_open_error(), build
 * the per-open state with ll_local_open() and register the open for replay
 * with mdc_set_open_replay_data().  For regular files, object creation is
 * delayed when O_LOV_DELAY_CREATE is set or the file is not opened for
 * writing; otherwise ll_create_obj() is called under lli_open_sem.  The
 * objects are then opened with ll_osc_open().  An ll_mdc_close() call is
 * visible near the end, presumably on an error path - confirm against the
 * full file.
 *
 * NOTE(review): the assignments of it and lsm, the conditions between the
 * visible statements and all returns are not present in this excerpt.
 */
344 int ll_file_open(struct inode *inode, struct file *file)
346 struct ll_sb_info *sbi = ll_i2sbi(inode);
347 struct ll_inode_info *lli = ll_i2info(inode);
348 struct lustre_handle *conn = ll_i2obdconn(inode);
349 struct lookup_intent *it;
350 struct lov_stripe_md *lsm;
354 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
355 inode->i_generation, inode);
357 /* don't do anything for / */
358 if (inode->i_sb->s_root == file->f_dentry)
362 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_OPEN);
364 rc = ll_it_open_error(DISP_OPEN_OPEN, it);
368 rc = ll_local_open(file, it);
/* ll_local_open() attached the ll_file_data to file->private_data. */
372 mdc_set_open_replay_data(&((struct ll_file_data *)
373 file->private_data)->fd_mds_och);
374 if (!S_ISREG(inode->i_mode))
379 if (file->f_flags & O_LOV_DELAY_CREATE ||
380 !(file->f_mode & FMODE_WRITE)) {
381 CDEBUG(D_INODE, "delaying object creation\n");
384 down(&lli->lli_open_sem);
386 rc = ll_create_obj(conn, inode, file, NULL);
387 up(&lli->lli_open_sem);
391 CERROR("warning: stripe already set on ino %lu\n",
393 up(&lli->lli_open_sem);
398 rc = ll_osc_open(conn, inode, file, lsm);
404 ll_mdc_close(&sbi->ll_mdc_conn, inode, file);
409 * really does the getattr on the inode and updates its fields
/*
 * Fetch object attributes from the OSTs and fold them into the inode.
 *
 * An obdo is prepared with the object id from lsm and, when ostdata is not
 * NULL, the OST handle.  obd_last_dirty_offset() is sampled before (bef,
 * before) and after (aft, after) the getattr so that results which raced
 * with writeback can be detected.  Both a synchronous obd_getattr() and an
 * obd_getattr_async() on a request set are visible; presumably they are
 * alternatives selected by lines missing from this excerpt - TODO confirm.
 * Blocks and times are refreshed from the obdo, i_blksize is raised to at
 * least PAGE_CACHE_SIZE, and the size is applied with obdo_to_inode() except
 * in the case logged as "cached at ..., keeping ... i_size".
 *
 * NOTE(review): the third parameter (used below as ostdata), the loop or
 * branch structure and the return are not present in this excerpt.
 */
411 int ll_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm,
414 struct ll_sb_info *sbi = ll_i2sbi(inode);
415 struct ll_inode_info *lli = ll_i2info(inode);
416 struct ptlrpc_request_set *set;
419 unsigned long before, after;
427 memset(&oa, 0, sizeof oa);
428 oa.o_id = lsm->lsm_object_id;
430 oa.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE |
431 OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | OBD_MD_FLMTIME |
434 if (ostdata != NULL) {
435 memcpy(obdo_handle(&oa), ostdata, FD_OSTDATA_SIZE);
436 oa.o_valid |= OBD_MD_FLHANDLE;
439 /* getattr can race with writeback. we don't want to trust a getattr
440 * that doesn't include the writeback of our farthest cached pages
441 * that it raced with. */
442 /* Now that the OSC knows the cached-page status, it can and should be
443 * adjusting its getattr results to include the maximum cached offset
444 * for its stripe(s). */
446 bef = obd_last_dirty_offset(ll_i2obdconn(inode), lli->lli_smd,
449 rc = obd_getattr(&sbi->ll_osc_conn, &oa, lsm);
451 set = ptlrpc_prep_set ();
453 CERROR ("ENOMEM allocing request set\n");
456 rc = obd_getattr_async(&sbi->ll_osc_conn, &oa, lsm, set);
458 rc = ptlrpc_set_wait (set);
459 ptlrpc_set_destroy (set);
465 aft = obd_last_dirty_offset(ll_i2obdconn(inode), lli->lli_smd,
467 CDEBUG(D_INODE, " %d,%lu -> %d,%lu\n", bef, before, aft, after);
469 (aft != 0 || after < before) &&
470 oa.o_size < ((u64)before + 1) << PAGE_CACHE_SHIFT);
472 obdo_refresh_inode(inode, &oa, (OBD_MD_FLBLOCKS | OBD_MD_FLMTIME |
474 if (inode->i_blksize < PAGE_CACHE_SIZE)
475 inode->i_blksize = PAGE_CACHE_SIZE;
477 /* make sure getattr doesn't return a size that causes writeback
478 * to forget about cached writes */
479 if ((aft == 0) && oa.o_size < ((u64)after + 1) << PAGE_CACHE_SHIFT) {
480 CDEBUG(D_INODE, "cached at %lu, keeping %llu i_size instead "
481 "of oa "LPU64"\n", after, inode->i_size,
486 obdo_to_inode(inode, &oa, OBD_MD_FLSIZE);
/* NOTE(review): the message below prints inode->i_size for both halves of
 * "size %Lu/%Lu" - confirm whether the second value was meant to differ. */
488 CDEBUG(D_INODE, "objid "LPX64" size %Lu/%Lu blksize %lu\n",
489 lsm->lsm_object_id, inode->i_size, inode->i_size,
/*
 * Strip the set-id bits from an inode's mode.
 *
 * The candidate mask is S_ISUID plus S_ISGID when S_IXGRP is set; it is
 * reduced to the bits actually present in i_mode, and those bits are cleared
 * unless the caller has CAP_FSETID.
 *
 * NOTE(review): the declaration of mode and anything following the mode
 * update are not present in this excerpt.
 */
494 static inline void ll_remove_suid(struct inode *inode)
498 /* set S_IGID if S_IXGRP is set, and always set S_ISUID */
499 mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
501 /* was any of the uid bits set? */
502 mode &= inode->i_mode;
503 if (mode && !capable(CAP_FSETID)) {
504 inode->i_mode &= ~mode;
505 // XXX careful here - we cannot change the size
/*
 * Set the in-memory access time of an inode to CURRENT_TIME, unless the
 * inode is read-only (IS_RDONLY), in which case nothing is changed.
 */
510 static void ll_update_atime(struct inode *inode)
512 if (IS_RDONLY(inode)) return;
514 /* update atime, but don't explicitly write it out just this change */
515 inode->i_atime = CURRENT_TIME;
520 * flush the page cache for an extent as it's canceled. when we're on an
521 * lov we get a lock cancellation for each of the obd locks under the lov
522 * so we have to map the obd's region back onto the stripes in the file
525 * no one can dirty the extent until we've finished our work and they
526 * can enqueue another lock.
528 * XXX this could be asking the inode's dirty tree for info
/*
 * Write back and then drop the cached pages covered by a lock's extent.
 *
 * The lock's byte extent is converted to page indices.  When the file has
 * more than one stripe, the stripe index of the lock is obtained through
 * obd_get_info() with the "lock_to_stripe" key and the page range is mapped
 * from object offsets to file offsets.  LLI_F_HAVE_SIZE_LOCK is cleared
 * (the guarding condition is not visible here).  A first pass, which only
 * runs while the lock's granted mode is LCK_PW, starts writepage on dirty
 * pages; a second pass truncates every page found in the range.
 *
 * NOTE(review): several conditions, the stripe-skipping logic driven by j,
 * and the loop exits are not present in this excerpt.
 */
530 void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm,
531 struct ldlm_lock *lock)
533 struct ldlm_extent *extent = &lock->l_extent;
534 unsigned long start, end, count, skip, i, j;
539 CDEBUG(D_INODE, "obdo %lu inode %p ["LPU64"->"LPU64"] size: %llu\n",
540 inode->i_ino, inode, extent->start, extent->end, inode->i_size);
/* Byte extent -> page indices; the loops below treat end as exclusive. */
542 start = extent->start >> PAGE_CACHE_SHIFT;
545 end = (extent->end >> PAGE_CACHE_SHIFT) + 1;
546 if ((end << PAGE_CACHE_SHIFT) < extent->end)
548 if (lsm->lsm_stripe_count > 1) {
551 struct ldlm_lock *lock;
552 struct lov_stripe_md *lsm;
553 } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm };
555 __u32 vallen = sizeof(stripe);
558 /* get our offset in the lov */
559 rc = obd_get_info(ll_i2obdconn(inode), sizeof(key),
560 &key, &vallen, &stripe);
562 CERROR("obd_get_info: rc = %d\n", rc);
565 LASSERT(stripe < lsm->lsm_stripe_count);
/* count: pages per stripe unit; skip: pages owned by the other stripes
 * in one full round of stripes. */
567 count = lsm->lsm_stripe_size >> PAGE_CACHE_SHIFT;
568 skip = (lsm->lsm_stripe_count - 1) * count;
569 start += (start/count * skip) + (stripe * count);
571 end += (end/count * skip) + (stripe * count);
574 i = (inode->i_size + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
576 clear_bit(LLI_F_HAVE_SIZE_LOCK, &(ll_i2info(inode)->lli_flags));
580 CDEBUG(D_INODE, "start: %lu j: %lu count: %lu skip: %lu end: %lu\n",
581 start, start % count, count, skip, end);
583 /* start writeback on dirty pages in the extent when its PW */
584 for (i = start, j = start % count;
585 lock->l_granted_mode == LCK_PW && i < end; j++, i++) {
590 /* its unlikely, but give us a chance to bail when we're out */
591 ll_pgcache_lock(inode->i_mapping);
592 if (list_empty(&inode->i_mapping->dirty_pages)) {
593 CDEBUG(D_INODE, "dirty list empty\n");
594 ll_pgcache_unlock(inode->i_mapping);
597 ll_pgcache_unlock(inode->i_mapping);
602 page = find_get_page(inode->i_mapping, i);
605 if (!PageDirty(page) || TryLockPage(page)) {
606 page_cache_release(page);
/* Dirty state is re-tested now that the page lock is held. */
609 if (PageDirty(page)) {
610 CDEBUG(D_INODE, "writing page %p\n", page);
611 ll_pgcache_lock(inode->i_mapping);
612 list_del(&page->list);
613 list_add(&page->list, &inode->i_mapping->locked_pages);
614 ll_pgcache_unlock(inode->i_mapping);
616 /* this writepage might write out pages outside
617 * this extent, but that's ok, the pages are only
618 * still dirty because a lock still covers them */
619 ClearPageDirty(page);
620 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
621 ret = inode->i_mapping->a_ops->writepage(page);
623 ret = inode->i_mapping->a_ops->writepage(page, NULL);
630 page_cache_release(page);
634 /* our locks are page granular thanks to osc_enqueue, we invalidate the
636 LASSERT((extent->start & ~PAGE_CACHE_MASK) == 0);
637 LASSERT(((extent->end+1) & ~PAGE_CACHE_MASK) == 0);
638 for (i = start, j = start % count ; i < end ; j++, i++) {
643 ll_pgcache_lock(inode->i_mapping);
644 if (list_empty(&inode->i_mapping->dirty_pages) &&
645 list_empty(&inode->i_mapping->clean_pages) &&
646 list_empty(&inode->i_mapping->locked_pages)) {
647 CDEBUG(D_INODE, "nothing left\n");
648 ll_pgcache_unlock(inode->i_mapping);
651 ll_pgcache_unlock(inode->i_mapping);
654 page = find_get_page(inode->i_mapping, i);
657 CDEBUG(D_INODE, "dropping page %p at %lu\n", page, page->index);
659 if (page->mapping) /* might have raced */
660 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
661 truncate_complete_page(page);
663 truncate_complete_page(page->mapping, page);
666 page_cache_release(page);
/*
 * Callback passed to obd_enqueue() for extent locks; data is the inode the
 * lock was taken for.
 *
 * LDLM_CB_BLOCKING: cancel the lock through ldlm_cli_cancel().
 * LDLM_CB_CANCELING: flush and drop the cached pages under the lock with
 * ll_pgcache_remove_extent().
 *
 * Both the inode pointer and lli->lli_smd are sanity-checked against small
 * values (below 0x1000) and reported with LDLM_ERROR when they look bad.
 *
 * NOTE(review): the last parameter, the switch statement itself and the
 * returns are not present in this excerpt.
 */
671 static int ll_extent_lock_callback(struct ldlm_lock *lock,
672 struct ldlm_lock_desc *new, void *data,
675 struct inode *inode = data;
676 struct ll_inode_info *lli = ll_i2info(inode);
677 struct lustre_handle lockh = { 0 };
681 if ((unsigned long)inode < 0x1000) {
682 LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
687 case LDLM_CB_BLOCKING:
688 ldlm_lock2handle(lock, &lockh);
689 rc = ldlm_cli_cancel(&lockh);
691 CERROR("ldlm_cli_cancel failed: %d\n", rc);
693 case LDLM_CB_CANCELING:
694 /* FIXME: we could be given 'canceling intents' so that we
695 * could know to write-back or simply throw away the pages
696 * based on if the cancel comes from a desire to, say,
697 * read or truncate.. */
698 if ((unsigned long)lli->lli_smd < 0x1000) {
699 /* note that lli is part of the inode itself, so it
700 * is valid if as checked the inode pointer above. */
701 CERROR("inode %lu, sb %p, lli %p, lli_smd %p\n",
702 inode->i_ino, inode->i_sb, lli, lli->lli_smd);
703 LDLM_ERROR(lock, "cancel lock on bad inode %p", inode);
707 ll_pgcache_remove_extent(inode, lli->lli_smd, lock);
717 * some callers, notably truncate, really don't want i_size set based
718 * on the size returned by the getattr, or lock acquisition in
/*
 * Enqueue an extent lock without refreshing the inode size afterwards.
 *
 * lockh must be zeroed on entry (asserted).  Locking is bypassed when the
 * file has LL_FILE_IGNORE_LOCK set or the superblock has LL_SBI_NOLCK set;
 * otherwise obd_enqueue() is called for an LDLM_EXTENT lock of the given
 * mode with ll_extent_lock_callback as the callback.
 *
 * NOTE(review): the declarations of rc and flags, the remaining
 * obd_enqueue() arguments and the returns are not present in this excerpt.
 */
721 int ll_extent_lock_no_validate(struct ll_file_data *fd, struct inode *inode,
722 struct lov_stripe_md *lsm,
723 int mode, struct ldlm_extent *extent,
724 struct lustre_handle *lockh)
726 struct ll_sb_info *sbi = ll_i2sbi(inode);
730 LASSERT(lockh->cookie == 0);
732 /* XXX phil: can we do this? won't it screw the file size up? */
733 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
734 (sbi->ll_flags & LL_SBI_NOLCK))
737 CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
738 inode->i_ino, extent->start, extent->end);
/* NOTE(review): extent is a pointer parameter, so sizeof(extent) below is
 * the size of a pointer rather than of struct ldlm_extent.  The obd_match()
 * call in ll_extent_lock() passes sizeof of a struct object instead;
 * confirm which length obd_enqueue() expects (likely sizeof(*extent)). */
740 rc = obd_enqueue(&sbi->ll_osc_conn, lsm, NULL, LDLM_EXTENT, extent,
741 sizeof(extent), mode, &flags, ll_extent_lock_callback,
748 * this grabs a lock and manually implements behaviour that makes it look like
749 * the OST is returning the file size with each lock acquisition.
/*
 * Take an extent lock and bring the inode size up to date.
 *
 * The lock is obtained with ll_extent_lock_no_validate().  The
 * LLI_F_HAVE_SIZE_LOCK flag is tested (the action on a set flag is not
 * visible here), then ll_inode_getattr() refreshes the inode; a call to
 * ll_extent_unlock() follows it, presumably on getattr failure - confirm.
 * Afterwards obd_match() looks for an existing LCK_PR lock covering
 * [i_size, OBD_OBJECT_EOF]; when the visible tail runs,
 * LLI_F_HAVE_SIZE_LOCK is set and obd_cancel() is called on the matched
 * LCK_PR lock.
 *
 * NOTE(review): the conditions guarding these steps and the returns are not
 * present in this excerpt.
 */
751 int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
752 struct lov_stripe_md *lsm, int mode,
753 struct ldlm_extent *extent, struct lustre_handle *lockh)
755 struct ll_inode_info *lli = ll_i2info(inode);
756 struct ldlm_extent size_lock;
757 struct lustre_handle match_lockh = {0};
758 int flags, rc, matched;
761 rc = ll_extent_lock_no_validate(fd, inode, lsm, mode, extent, lockh);
765 if (test_bit(LLI_F_HAVE_SIZE_LOCK, &lli->lli_flags))
768 rc = ll_inode_getattr(inode, lsm, fd ? &fd->fd_ost_och : NULL);
770 ll_extent_unlock(fd, inode, lsm, mode, lockh);
/* Probe for a lock covering everything from the current size to EOF. */
774 size_lock.start = inode->i_size;
775 size_lock.end = OBD_OBJECT_EOF;
777 /* XXX I bet we should be checking the lock ignore flags.. */
778 flags = LDLM_FL_CBPENDING | LDLM_FL_BLOCK_GRANTED | LDLM_FL_MATCH_DATA;
779 matched = obd_match(&ll_i2sbi(inode)->ll_osc_conn, lsm, LDLM_EXTENT,
780 &size_lock, sizeof(size_lock), LCK_PR, &flags,
781 inode, &match_lockh);
783 /* hey, alright, we hold a size lock that covers the size we
784 * just found, its not going to change for a while.. */
786 set_bit(LLI_F_HAVE_SIZE_LOCK, &lli->lli_flags);
787 obd_cancel(&ll_i2sbi(inode)->ll_osc_conn, lsm, LCK_PR,
/*
 * Release an extent lock taken with ll_extent_lock() or
 * ll_extent_lock_no_validate().
 *
 * The same bypass test as on the lock side is applied
 * (LL_FILE_IGNORE_LOCK on the file, LL_SBI_NOLCK on the superblock);
 * otherwise the lock is dropped with obd_cancel().
 *
 * NOTE(review): the returns are not present in this excerpt.
 */
794 int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
795 struct lov_stripe_md *lsm, int mode,
796 struct lustre_handle *lockh)
798 struct ll_sb_info *sbi = ll_i2sbi(inode);
802 /* XXX phil: can we do this? won't it screw the file size up? */
803 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
804 (sbi->ll_flags & LL_SBI_NOLCK))
807 rc = obd_cancel(&sbi->ll_osc_conn, lsm, mode, lockh);
/*
 * Read from a file through the page cache under an LCK_PR extent lock.
 *
 * The lock covers [*ppos, OBD_OBJECT_EOF].  While generic_file_read() runs,
 * an ll_read_extent describing the locked range and the current task is
 * linked on lli->lli_read_extents (under lli_read_extent_lock); it is
 * unlinked again before the lock is released with ll_extent_unlock().
 *
 * NOTE(review): the ppos parameter declaration, the zero-count and error
 * checks and the return are not present in this excerpt.
 */
812 static ssize_t ll_file_read(struct file *filp, char *buf, size_t count,
815 struct ll_file_data *fd = filp->private_data;
816 struct inode *inode = filp->f_dentry->d_inode;
817 struct ll_inode_info *lli = ll_i2info(inode);
818 struct lov_stripe_md *lsm = lli->lli_smd;
819 struct lustre_handle lockh = { 0 };
820 struct ll_read_extent rextent;
824 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
825 inode->i_ino, inode->i_generation, inode, count, *ppos);
827 /* "If nbyte is 0, read() will return 0 and have no other results."
828 * -- Single Unix Spec */
832 lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_READ_BYTES,
838 /* grab a -> eof extent to push extending writes out of node's caches
839 * so we can see them at the getattr after lock acquisition. this will
840 * turn into a seperate [*ppos + count, EOF] 'size intent' lock attempt
842 rextent.re_extent.start = *ppos;
843 rextent.re_extent.end = OBD_OBJECT_EOF;
845 err = ll_extent_lock(fd, inode, lsm, LCK_PR, &rextent.re_extent,&lockh);
849 /* XXX tell ll_readpage what pages have a PR lock.. */
850 rextent.re_task = current;
851 spin_lock(&lli->lli_read_extent_lock);
852 list_add(&rextent.re_lli_item, &lli->lli_read_extents);
853 spin_unlock(&lli->lli_read_extent_lock);
855 CDEBUG(D_INFO, "Reading inode %lu, "LPSZ" bytes, offset %Ld\n",
856 inode->i_ino, count, *ppos);
857 retval = generic_file_read(filp, buf, count, ppos);
/* rextent lives on this stack frame, so it is unlinked before returning. */
859 spin_lock(&lli->lli_read_extent_lock);
860 list_del(&rextent.re_lli_item);
861 spin_unlock(&lli->lli_read_extent_lock);
864 ll_extent_unlock(fd, inode, lsm, LCK_PR, &lockh);
869 * Write to a file (through the page cache).
/*
 * Write to a file through the page cache under an LCK_PW extent lock.
 *
 * The lock covers [*ppos, *ppos + count - 1], or runs to OBD_OBJECT_EOF for
 * O_APPEND.  Either ll_extent_lock() or ll_extent_lock_no_validate() takes
 * it; the choice presumably depends on should_validate, which the visible
 * page-alignment test appears to control - TODO confirm.  For O_APPEND
 * *ppos is moved to i_size.  A write starting at or beyond maxbytes raises
 * SIGXFSZ and fails with -EFBIG; a write crossing maxbytes is shortened.
 * The data is written by generic_file_write(), the byte counter
 * LPROC_LL_WRITE_BYTES is updated and the lock is released.
 *
 * NOTE(review): the ppos parameter declaration, several conditions, the out
 * label and the return are not present in this excerpt.
 */
871 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
874 struct ll_file_data *fd = file->private_data;
875 struct inode *inode = file->f_dentry->d_inode;
876 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
877 struct lustre_handle lockh = { 0 };
878 struct ldlm_extent extent;
879 loff_t maxbytes = ll_file_maxbytes(inode);
882 char should_validate = 1;
884 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
885 inode->i_ino, inode->i_generation, inode, count, *ppos);
887 SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
889 * sleep doing some writeback work of this mount's dirty data
890 * if the VM thinks we're low on memory.. other dirtying code
891 * paths should think about doing this, too, but they should be
892 * careful not to hold locked pages while they do so. like
893 * ll_prepare_write. *cough*
895 ll_check_dirty(inode->i_sb);
897 /* POSIX, but surprised the VFS doesn't check this already */
903 if (file->f_flags & O_APPEND) {
905 extent.end = OBD_OBJECT_EOF;
907 extent.start = *ppos;
908 extent.end = *ppos + count - 1;
909 /* we really don't care what i_size is if we're doing
910 * fully page aligned writes */
911 if ((*ppos & ~PAGE_CACHE_MASK) == 0 &&
912 (count & ~PAGE_CACHE_MASK) == 0)
917 err = ll_extent_lock(fd, inode, lsm, LCK_PW, &extent, &lockh);
919 err = ll_extent_lock_no_validate(fd, inode, lsm, LCK_PW,
924 /* this is ok, g_f_w will overwrite this under i_sem if it races
925 * with a local truncate, it just makes our maxbyte checking easier */
926 if (file->f_flags & O_APPEND)
927 *ppos = inode->i_size;
929 if (*ppos >= maxbytes) {
930 if (count || *ppos > maxbytes) {
931 send_sig(SIGXFSZ, current, 0);
932 GOTO(out, retval = -EFBIG);
/* Shorten a write that would cross the maximum file size. */
935 if (*ppos + count > maxbytes)
936 count = maxbytes - *ppos;
938 CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
939 inode->i_ino, count, *ppos);
941 /* generic_file_write handles O_APPEND after getting i_sem */
942 retval = generic_file_write(file, buf, count, ppos);
946 lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_WRITE_BYTES,
948 ll_extent_unlock(fd, inode, lsm, LCK_PW, &lockh);
/*
 * Handler for the LL_IOC_LOV_SETSTRIPE ioctl: set a striping pattern on a
 * file and create its objects.
 *
 * All work happens under lli_open_sem.  When a stripe already exists
 * (logged as "stripe already exists"), the semaphore is dropped and, if the
 * file was opened with O_LOV_DELAY_CREATE, the deferred ll_osc_open() is
 * performed.  Otherwise the requested pattern is turned into a stripe MD by
 * obd_iocontrol(), the objects are created with ll_create_obj(), and the
 * file is opened on the OSTs with ll_osc_open() using lli->lli_smd.
 *
 * NOTE(review): the arg parameter declaration, the conditions between these
 * steps and the returns are not present in this excerpt.
 */
952 static int ll_lov_setstripe(struct inode *inode, struct file *file,
955 struct ll_inode_info *lli = ll_i2info(inode);
956 struct lustre_handle *conn = ll_i2obdconn(inode);
957 struct lov_stripe_md *lsm;
961 down(&lli->lli_open_sem);
964 up(&lli->lli_open_sem);
965 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
967 /* If we haven't already done the open, do so now */
968 if (file->f_flags & O_LOV_DELAY_CREATE) {
969 int rc2 = ll_osc_open(conn, inode, file, lsm);
977 rc = obd_iocontrol(LL_IOC_LOV_SETSTRIPE, conn, 0, &lsm, (void *)arg);
979 up(&lli->lli_open_sem);
982 rc = ll_create_obj(conn, inode, file, lsm);
983 up(&lli->lli_open_sem);
986 obd_free_memmd(conn, &lsm);
989 rc = ll_osc_open(conn, inode, file, lli->lli_smd);
/*
 * Handler for the LL_IOC_LOV_GETSTRIPE ioctl: forward the request, together
 * with the inode's stripe MD (lli_smd), to obd_iocontrol() and return its
 * result.
 *
 * NOTE(review): any check made on lsm before the call is not present in
 * this excerpt.
 */
993 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
995 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
996 struct lustre_handle *conn = ll_i2obdconn(inode);
1001 return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, conn, 0, lsm, (void *)arg);
/*
 * ioctl entry point for files.
 *
 * tty ioctls (type 'T') are tested for first; the action taken is not
 * visible here.  Handled commands visible in this excerpt:
 *  - LL_IOC_GETFLAGS: copy fd->fd_flags to user space;
 *  - LL_IOC_SETFLAGS / LL_IOC_CLRFLAGS: read a flag word from user space
 *    and set or clear those bits in fd->fd_flags;
 *  - LL_IOC_LOV_SETSTRIPE / LL_IOC_LOV_GETSTRIPE: dispatch to
 *    ll_lov_setstripe() / ll_lov_getstripe();
 *  - remaining commands are passed to obd_iocontrol() on the inode's OBD
 *    connection.
 *
 * NOTE(review): the arg parameter declaration, the switch statement and
 * several returns are not present in this excerpt.
 */
1004 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
1007 struct ll_file_data *fd = file->private_data;
1008 struct lustre_handle *conn;
1011 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%u\n", inode->i_ino,
1012 inode->i_generation, inode, cmd);
1014 if (_IOC_TYPE(cmd) == 'T') /* tty ioctls */
1017 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_IOCTL);
1019 case LL_IOC_GETFLAGS:
1020 /* Get the current value of the file flags */
1021 return put_user(fd->fd_flags, (int *)arg);
1022 case LL_IOC_SETFLAGS:
1023 case LL_IOC_CLRFLAGS:
1024 /* Set or clear specific file flags */
1025 /* XXX This probably needs checks to ensure the flags are
1026 * not abused, and to handle any flag side effects.
1028 if (get_user(flags, (int *) arg))
1031 if (cmd == LL_IOC_SETFLAGS)
1032 fd->fd_flags |= flags;
1034 fd->fd_flags &= ~flags;
1036 case LL_IOC_LOV_SETSTRIPE:
1037 return ll_lov_setstripe(inode, file, arg);
1038 case LL_IOC_LOV_GETSTRIPE:
1039 return ll_lov_getstripe(inode, arg);
1041 /* We need to special case any other ioctls we want to handle,
1042 * to send them to the MDS/OST as appropriate and to properly
1043 * network encode the arg field.
1044 case EXT2_IOC_GETFLAGS:
1045 case EXT2_IOC_SETFLAGS:
1046 case EXT2_IOC_GETVERSION_OLD:
1047 case EXT2_IOC_GETVERSION_NEW:
1048 case EXT2_IOC_SETVERSION_OLD:
1049 case EXT2_IOC_SETVERSION_NEW:
1052 conn = ll_i2obdconn(inode);
1053 return obd_iocontrol(cmd, conn, 0, NULL, (void *)arg);
/*
 * llseek handler.
 *
 * origin 2 (SEEK_END): an LCK_PR lock on [0, OBD_OBJECT_EOF] is taken with
 * ll_extent_lock() before i_size is added to the offset.
 * origin 1 (SEEK_CUR): the current f_pos is added to the offset.
 * The resulting offset is accepted when it lies in
 * [0, ll_file_maxbytes(inode)], and f_pos is updated if it changed.  An
 * ll_extent_unlock() of the LCK_PR lock is visible at the end.
 *
 * NOTE(review): the retval handling, the condition around the final unlock
 * and the return are not present in this excerpt.
 */
1057 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
1059 struct inode *inode = file->f_dentry->d_inode;
1060 struct ll_file_data *fd = file->private_data;
1061 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1062 struct lustre_handle lockh = {0};
1065 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),to=%llu\n", inode->i_ino,
1066 inode->i_generation, inode,
1067 offset + ((origin==2) ? inode->i_size : file->f_pos));
1069 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_LLSEEK);
1070 if (origin == 2) { /* SEEK_END */
1072 struct ldlm_extent extent = {0, OBD_OBJECT_EOF};
1073 err = ll_extent_lock(fd, inode, lsm, LCK_PR, &extent, &lockh);
1074 if (err != ELDLM_OK)
1077 offset += inode->i_size;
1078 } else if (origin == 1) { /* SEEK_CUR */
1079 offset += file->f_pos;
1083 if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
1084 if (offset != file->f_pos) {
1085 file->f_pos = offset;
1086 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
1088 file->f_version = ++event;
1095 ll_extent_unlock(fd, inode, lsm, LCK_PR, &lockh);
/*
 * fsync handler: logs the call, counts it in LPROC_LL_FSYNC, then starts
 * writeback of the inode's mapping with filemap_fdatasync() and waits for
 * it with filemap_fdatawait().
 *
 * NOTE(review): how the two results are combined into the return value is
 * not present in this excerpt.
 */
1099 int ll_fsync(struct file *file, struct dentry *dentry, int data)
1101 struct inode *inode = dentry->d_inode;
1104 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
1105 inode->i_generation, inode);
1107 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_FSYNC);
1109 * filemap_fdata{sync,wait} are also called at PW lock cancelation so
1110 * we know that they can only find data to writeback here if we are
1111 * still holding the PW lock that covered the dirty pages. XXX we
1112 * should probably get a reference on it, though, just to be clear.
1114 rc = filemap_fdatasync(inode->i_mapping);
1116 rc = filemap_fdatawait(inode->i_mapping);
/*
 * Revalidate an inode's attributes.
 *
 * Unless the intent already carries a lock mode or ll_have_md_lock()
 * reports a held lock, the attributes are fetched from the MDS with
 * mdc_getattr() (requesting the stripe EA for regular files), unpacked with
 * mdc_req2lustre_md() and applied through ll_update_inode().  A stripe MD
 * that was not installed into lli_smd is freed.  When the inode has a
 * stripe MD, an LCK_PR lock on [0, OBD_OBJECT_EOF] is taken with
 * ll_extent_lock() and released again; ll_extent_lock() as shown in this
 * file refreshes the inode through ll_inode_getattr().
 *
 * NOTE(review): the version guard on the LPROC_LL_REVALIDATE counter uses
 * "<=" where other guards in this file use "<" - confirm that is intended.
 * Several conditions and all returns are not present in this excerpt.
 */
1121 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
1123 struct inode *inode = dentry->d_inode;
1124 struct lov_stripe_md *lsm;
1128 CERROR("REPORT THIS LINE TO PETER\n");
1131 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
1132 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
1133 #if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,5,0))
1134 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_REVALIDATE);
1137 /* this is very tricky. it is unsafe to call ll_have_md_lock
1138 when we have a referenced lock: because it may cause an RPC
1139 below when the lock is marked CB_PENDING. That RPC may not
1140 go out because someone else may be in another RPC waiting for
1142 if (!(it && it->it_lock_mode) && !ll_have_md_lock(dentry)) {
1143 struct lustre_md md;
1144 struct ptlrpc_request *req = NULL;
1145 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
1147 unsigned long valid = 0;
1151 if (S_ISREG(inode->i_mode)) {
1152 ealen = obd_size_diskmd(&sbi->ll_osc_conn, NULL);
1153 valid |= OBD_MD_FLEASIZE;
1155 ll_inode2fid(&fid, inode);
1156 rc = mdc_getattr(&sbi->ll_mdc_conn, &fid, valid, ealen, &req);
1158 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
1161 rc = mdc_req2lustre_md(req, 0, &sbi->ll_osc_conn, &md);
1163 /* XXX Too paranoid? */
1164 if ((md.body->valid ^ valid) & OBD_MD_FLEASIZE)
1165 CERROR("Asked for %s eadata but got %s\n",
1166 (valid & OBD_MD_FLEASIZE) ? "some" : "no",
1167 (md.body->valid & OBD_MD_FLEASIZE) ? "some":
1170 ptlrpc_req_finished(req);
1174 ll_update_inode(inode, md.body, md.lsm);
/* Free the unpacked stripe MD unless it is the one now in lli_smd. */
1175 if (md.lsm != NULL && ll_i2info(inode)->lli_smd != md.lsm)
1176 obd_free_memmd(&sbi->ll_osc_conn, &md.lsm);
1178 ptlrpc_req_finished(req);
1181 lsm = ll_i2info(inode)->lli_smd;
1182 if (!lsm) /* object not yet allocated, don't validate size */
1186 * unfortunately stat comes in through revalidate and we don't
1187 * differentiate this use from initial instantiation. we're
1188 * also being wildly conservative and flushing write caches
1189 * so that stat really returns the proper size.
1192 struct ldlm_extent extent = {0, OBD_OBJECT_EOF};
1193 struct lustre_handle lockh = {0};
1196 err = ll_extent_lock(NULL, inode, lsm, LCK_PR, &extent, &lockh);
1197 if (err != ELDLM_OK)
1200 ll_extent_unlock(NULL, inode, lsm, LCK_PR, &lockh);
/*
 * getattr handler, only built for kernels newer than 2.5.0.
 *
 * The inode is revalidated through ll_inode_revalidate_it(), the call is
 * counted in LPROC_LL_GETATTR, and the stat structure is filled field by
 * field from the inode.
 *
 * NOTE(review): the stat parameter declaration, the check of res and the
 * return are not present in this excerpt.
 */
1205 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
1206 int ll_getattr(struct vfsmount *mnt, struct dentry *de,
1207 struct lookup_intent *it,
1211 struct inode *inode = de->d_inode;
1213 res = ll_inode_revalidate_it(de, it);
1214 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_GETATTR);
1219 stat->dev = inode->i_sb->s_dev;
1220 stat->ino = inode->i_ino;
1221 stat->mode = inode->i_mode;
1222 stat->nlink = inode->i_nlink;
1223 stat->uid = inode->i_uid;
1224 stat->gid = inode->i_gid;
1225 stat->rdev = kdev_t_to_nr(inode->i_rdev);
1226 stat->atime = inode->i_atime;
1227 stat->mtime = inode->i_mtime;
1228 stat->ctime = inode->i_ctime;
1229 stat->size = inode->i_size;
1230 stat->blksize = inode->i_blksize;
1231 stat->blocks = inode->i_blocks;
/*
 * File operations table for files.  The entries visible in this excerpt
 * wire write, ioctl, release and llseek to the handlers defined in this
 * file and use generic_file_mmap for mmap; further entries may exist on
 * lines not present here.
 */
1236 struct file_operations ll_file_operations = {
1238 write: ll_file_write,
1239 ioctl: ll_file_ioctl,
1241 release: ll_file_release,
1242 mmap: generic_file_mmap,
1243 llseek: ll_file_seek,
/*
 * Inode operations table for files: setattr_raw, setattr, truncate and
 * revalidate_it, plus getattr_it on kernels newer than 2.5.0.
 */
1247 struct inode_operations ll_file_inode_operations = {
1248 setattr_raw: ll_setattr_raw,
1249 setattr: ll_setattr,
1250 truncate: ll_truncate,
1251 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
1252 getattr_it: ll_getattr,
1254 revalidate_it: ll_inode_revalidate_it,
/*
 * Inode operations table for special inodes.  The visible entries match
 * ll_file_inode_operations except that no truncate handler is listed.
 */
1258 struct inode_operations ll_special_inode_operations = {
1259 setattr_raw: ll_setattr_raw,
1260 setattr: ll_setattr,
1261 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
1262 getattr_it: ll_getattr,
1264 revalidate_it: ll_inode_revalidate_it,