1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (c) 2002, 2003 Cluster File Systems, Inc.
5 * Author: Peter Braam <braam@clusterfs.com>
6 * Author: Phil Schwan <phil@clusterfs.com>
7 * Author: Andreas Dilger <adilger@clusterfs.com>
9 * This file is part of Lustre, http://www.lustre.org.
11 * Lustre is free software; you can redistribute it and/or
12 * modify it under the terms of version 2 of the GNU General Public
13 * License as published by the Free Software Foundation.
15 * Lustre is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with Lustre; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 #define DEBUG_SUBSYSTEM S_LLITE
26 #include <linux/lustre_dlm.h>
27 #include <linux/lustre_lite.h>
28 #include <linux/obd_lov.h> /* for lov_mds_md_size() in lov_setstripe() */
29 #include <linux/random.h>
30 #include <linux/pagemap.h>
31 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
32 #include <linux/lustre_compat25.h>
35 #include "llite_internal.h"
// ll_mdc_close(): send the MDS close for an open file and tear down the
// per-open state stored in file->private_data.
// NOTE(review): this listing omits some lines of the function (braces,
// declarations, return paths), so parts of the control flow are not visible.
//
// Visible steps:
//  - mdc_close() is called with the saved open handle (fd_mds_och.och_fh);
//    a failure is logged with CERROR.
//  - with imp->imp_lock held, rq_replay is cleared on the saved open request.
//  - if the open request has a transno and the close request has none, the
//    transno is copied to the close request and the close is retained with
//    ptlrpc_retain_replayable_request().
//  - request references are dropped with ptlrpc_req_finished(), the handle
//    cookie is set to DEAD_HANDLE_MAGIC, file->private_data is cleared and
//    fd is returned to ll_file_data_slab.
// NOTE(review): mdc_conn is not referenced in the visible lines; mdc_close()
// is passed ll_i2sbi(inode)->ll_mdc_conn instead -- confirm this is intended.
37 static int ll_mdc_close(struct lustre_handle *mdc_conn, struct inode *inode,
40 struct ll_file_data *fd = file->private_data;
41 struct ptlrpc_request *req = NULL;
43 struct obd_import *imp;
47 /* Complete the open request and remove it from replay list */
48 rc = mdc_close(&ll_i2sbi(inode)->ll_mdc_conn, inode->i_ino,
49 inode->i_mode, &fd->fd_mds_och.och_fh, &req);
51 CERROR("inode %lu close failed: rc = %d\n", inode->i_ino, rc);
// imp is the import of the saved open request; its imp_lock is taken
// before the replay flag and transno are touched below.
53 imp = fd->fd_mds_och.och_req->rq_import;
55 spin_lock_irqsave(&imp->imp_lock, flags);
57 DEBUG_REQ(D_HA, fd->fd_mds_och.och_req, "matched open req %p",
58 fd->fd_mds_och.och_req);
60 /* We held on to the request for replay until we saw a close for that
61 * file. Now that we've closed it, it gets replayed on the basis of
62 * its transno only. */
63 spin_lock (&fd->fd_mds_och.och_req->rq_lock);
64 fd->fd_mds_och.och_req->rq_replay = 0;
65 spin_unlock (&fd->fd_mds_och.och_req->rq_lock);
67 if (fd->fd_mds_och.och_req->rq_transno) {
68 /* This open created a file, so it needs replay as a
69 * normal transaction now. Our reference to it is now
70 * effectively owned by the imp_replay_list, and it'll
71 * be committed just like other transno-having
72 * requests from here on out. */
74 /* We now retain this close request, so that it is
75 * replayed if the open is replayed. We duplicate the
76 * transno, so that we get freed at the right time,
77 * and rely on the difference in xid to keep
78 * everything ordered correctly.
80 * But! If this close was already given a transno
81 * (because it caused real unlinking of an
82 * open-unlinked file, f.e.), then we'll be ordered on
83 * the basis of that and we don't need to do anything
85 if (!req->rq_transno) {
86 req->rq_transno = fd->fd_mds_och.och_req->rq_transno;
87 ptlrpc_retain_replayable_request(req, imp);
89 spin_unlock_irqrestore(&imp->imp_lock, flags);
91 /* Should we free_committed now? we always free before
92 * replay, so it's probably a wash. We could check to
93 * see if the fd_req should already be committed, in
94 * which case we can avoid the whole retain_replayable
97 /* No transno means that we can just drop our ref. */
98 spin_unlock_irqrestore(&imp->imp_lock, flags);
100 ptlrpc_req_finished(fd->fd_mds_och.och_req);
102 /* Do this after the fd_req->rq_transno check, because we don't want
103 * to bounce off zero references. */
104 ptlrpc_req_finished(req);
// Poison the handle cookie and detach fd from the file before freeing it.
105 fd->fd_mds_och.och_fh.cookie = DEAD_HANDLE_MAGIC;
106 file->private_data = NULL;
107 OBD_SLAB_FREE(fd, ll_file_data_slab, sizeof *fd);
// ll_file_release(): release handler; it is the `release` member of
// ll_file_operations in this file.
// NOTE(review): this listing omits some lines of the function (local
// declarations, return statements, error branches).
//
// Visible steps: trace the call, special-case the filesystem root dentry,
// bump the LPROC_LL_RELEASE counter and special-case a NULL
// file->private_data.  For a regular file that has stripe metadata it calls
// write_inode_now(), builds an obdo that carries the OST open handle and
// calls obd_close() (failure is logged).  Finally ll_mdc_close() is called
// for the MDS side and its result is stored in rc2.
112 /* While this returns an error code, fput(), the caller, does not, so we need
113 * to make every effort to clean up all of our state here. Also, applications
114 * rarely check close errors and even if an error is returned they will not
115 * re-try the close call.
117 int ll_file_release(struct inode *inode, struct file *file)
119 struct ll_file_data *fd;
121 struct ll_sb_info *sbi = ll_i2sbi(inode);
122 struct ll_inode_info *lli = ll_i2info(inode);
123 struct lov_stripe_md *lsm = lli->lli_smd;
127 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
128 inode->i_generation, inode);
130 /* don't do anything for / */
131 if (inode->i_sb->s_root == file->f_dentry)
134 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_RELEASE);
135 fd = (struct ll_file_data *)file->private_data;
136 if (!fd) /* no process opened the file after an mcreate */
139 /* we might not be able to get a valid handle on this file
140 * again so we really want to flush our write cache.. */
141 if (S_ISREG(inode->i_mode) && lsm) {
142 write_inode_now(inode, 0);
143 obdo_from_inode(&oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
144 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
// Copy the OST open handle into the obdo and mark the handle field valid.
145 memcpy(obdo_handle(&oa), &fd->fd_ost_och, FD_OSTDATA_SIZE);
146 oa.o_valid |= OBD_MD_FLHANDLE;
148 rc = obd_close(&sbi->ll_osc_conn, &oa, lsm, NULL);
150 CERROR("inode %lu object close failed: rc %d\n",
154 rc2 = ll_mdc_close(&sbi->ll_mdc_conn, inode, file);
// ll_local_open(): build the client-side open state for `file` from the MDS
// reply attached to the lookup intent.
// NOTE(review): this listing omits some lines of the function; the handling
// of an allocation failure and the return value are not visible.
//
// Visible steps: take the mds_body from reply buffer 1 of it->it_data,
// assert that file->private_data is still unset, allocate an ll_file_data
// from ll_file_data_slab and zero it, copy body->handle into
// fd_mds_och.och_fh, remember the request in fd_mds_och.och_req and install
// fd as file->private_data.
161 static int ll_local_open(struct file *file, struct lookup_intent *it)
163 struct ptlrpc_request *req = it->it_data;
164 struct ll_file_data *fd;
165 struct mds_body *body;
168 body = lustre_msg_buf (req->rq_repmsg, 1, sizeof (*body));
169 LASSERT (body != NULL); /* reply already checked out */
170 LASSERT_REPSWABBED (req, 1); /* and swabbed down */
172 LASSERT(!file->private_data);
174 OBD_SLAB_ALLOC(fd, ll_file_data_slab, SLAB_KERNEL, sizeof *fd);
175 /* We can't handle this well without reorganizing ll_file_open and
176 * ll_mdc_close, so don't even try right now. */
179 memset(fd, 0, sizeof(*fd));
// Keep both the MDS file handle and the open request; ll_mdc_close() in
// this file reads och_fh and och_req from the same structure.
181 memcpy(&fd->fd_mds_och.och_fh, &body->handle, sizeof(body->handle));
182 fd->fd_mds_och.och_req = it->it_data;
183 file->private_data = fd;
// ll_osc_open(): open the objects of a file on the OSTs.
// NOTE(review): this listing omits some lines of the function; the
// allocation of `oa`, the error handling and the tail of the attribute mask
// are not visible.
//
// Visible steps: fill the obdo with the object id from the stripe MD and
// S_IFREG, call obd_open() passing &fd->fd_ost_och, clear
// O_LOV_DELAY_CREATE in file->f_flags and refresh inode attributes from the
// obdo with obdo_refresh_inode().
188 static int ll_osc_open(struct lustre_handle *conn, struct inode *inode,
189 struct file *file, struct lov_stripe_md *lsm)
191 struct ll_file_data *fd = file->private_data;
199 oa->o_id = lsm->lsm_object_id;
200 oa->o_mode = S_IFREG;
201 oa->o_valid = OBD_MD_FLID;
202 obdo_from_inode(oa, inode, OBD_MD_FLTYPE);
203 rc = obd_open(conn, oa, lsm, NULL, &fd->fd_ost_och);
207 file->f_flags &= ~O_LOV_DELAY_CREATE;
208 obdo_refresh_inode(inode, oa, (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
209 OBD_MD_FLATIME | OBD_MD_FLMTIME |
// ll_create_obj(): create the OST objects for a regular file and store the
// resulting stripe metadata on the MDS.
// NOTE(review): this listing omits some lines of the function (allocation of
// `oa`, labels, branch conditions, return statements).
//
// Visible steps:
//  - fill an obdo (id = i_ino, generation, mode S_IFREG | 0600, times and,
//    when i_size is non-zero, size) and call obd_create().
//  - pack the stripe MD with obd_packmd(); failure jumps to out_destroy.
//  - send it to the MDS with mdc_setattr() using ATTR_FROM_OPEN, together
//    with the log cookies collected in oti; failure jumps to out_destroy.
//  - afterwards lsm->lsm_maxbytes is recorded in lli->lli_maxbytes and the
//    cookies are released with oti_free_cookies().
//  - the trailing lines (presumably the out_destroy path -- the label itself
//    is not visible) cancel the log cookies, destroy the objects with
//    obd_destroy() and free the in-memory stripe MD.
217 /* Caller must hold lli_open_sem to protect lli->lli_smd from changing and
218 * duplicate objects from being created. We only install lsm to lli_smd if
219 * the mdc open was successful (hence stored stripe MD on MDS), otherwise
220 * other nodes could try to create different objects for the same file.
222 static int ll_create_obj(struct lustre_handle *conn, struct inode *inode,
223 struct file *file, struct lov_stripe_md *lsm)
225 struct ptlrpc_request *req = NULL;
226 struct ll_inode_info *lli = ll_i2info(inode);
227 struct lov_mds_md *lmm = NULL;
230 struct mdc_op_data op_data;
231 struct obd_trans_info oti = { 0 };
232 int rc, err, lmm_size = 0;
239 LASSERT(S_ISREG(inode->i_mode));
240 oa->o_mode = S_IFREG | 0600;
241 oa->o_id = inode->i_ino;
242 oa->o_generation = inode->i_generation;
243 /* Keep these 0 for now, because chown/chgrp does not change the
244 * ownership on the OST, and we don't want to allow BA OST NFS
245 * users to access these objects by mistake. */
248 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGENER | OBD_MD_FLTYPE |
249 OBD_MD_FLMODE | OBD_MD_FLUID | OBD_MD_FLGID;
250 #ifdef ENABLE_ORPHANS
251 oa->o_valid |= OBD_MD_FLCOOKIE;
254 obdo_from_inode(oa, inode, OBD_MD_FLTYPE|OBD_MD_FLATIME|OBD_MD_FLMTIME|
255 OBD_MD_FLCTIME | (inode->i_size ? OBD_MD_FLSIZE : 0));
257 rc = obd_create(conn, oa, &lsm, &oti);
259 CERROR("error creating objects for inode %lu: rc = %d\n",
262 CERROR("obd_create returned invalid rc %d\n", rc);
267 obdo_refresh_inode(inode, oa, OBD_MD_FLBLKSZ);
269 LASSERT(lsm && lsm->lsm_object_id);
270 rc = obd_packmd(conn, &lmm, lsm);
272 GOTO(out_destroy, rc);
276 /* Save the stripe MD with this file on the MDS */
277 memset(&iattr, 0, sizeof(iattr));
278 iattr.ia_valid = ATTR_FROM_OPEN;
280 ll_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0);
283 #warning FIXME: next line is for debugging purposes only
284 obd_log_cancel(&ll_i2sbi(inode)->ll_osc_conn, lsm, oti.oti_numcookies,
285 oti.oti_logcookies, OBD_LLOG_FL_SENDNOW);
// The cookie buffer length passed below is the cookie count times the size
// of one cookie.
288 rc = mdc_setattr(&ll_i2sbi(inode)->ll_mdc_conn, &op_data, &iattr,
289 lmm, lmm_size, oti.oti_logcookies,
290 oti.oti_numcookies * sizeof(oti.oti_onecookie), &req);
291 ptlrpc_req_finished(req);
293 obd_free_diskmd(conn, &lmm);
295 /* If we couldn't complete mdc_open() and store the stripe MD on the
296 * MDS, we need to destroy the objects now or they will be leaked.
299 CERROR("error: storing stripe MD for %lu: rc %d\n",
301 GOTO(out_destroy, rc);
304 lli->lli_maxbytes = lsm->lsm_maxbytes;
308 oti_free_cookies(&oti);
313 oa->o_id = lsm->lsm_object_id;
314 oa->o_valid = OBD_MD_FLID;
315 obdo_from_inode(oa, inode, OBD_MD_FLTYPE);
317 err = obd_log_cancel(conn, lsm, oti.oti_numcookies, oti.oti_logcookies,
318 OBD_LLOG_FL_SENDNOW);
320 CERROR("error cancelling inode %lu log cookies: rc %d\n",
323 err = obd_destroy(conn, oa, lsm, NULL);
324 obd_free_memmd(conn, &lsm);
326 CERROR("error uncreating inode %lu objects: rc %d\n",
// ll_file_open(): open a file on the client.
// NOTE(review): this listing omits some lines of the function (assignments
// of `it` and `lsm`, branch conditions, labels, return statements).
//
// Visible steps: special-case the root dentry, bump LPROC_LL_OPEN, check the
// intent with ll_it_open_error(DISP_OPEN_OPEN, it), build the local open
// state with ll_local_open() and pass the new fd_mds_och to
// mdc_set_open_replay_data().  Non-regular files are special-cased (branch
// body not visible).  Object creation is delayed when O_LOV_DELAY_CREATE is
// set or the file is not opened with FMODE_WRITE; otherwise ll_create_obj()
// runs with lli->lli_open_sem held.  ll_osc_open() then opens the objects.
// The final ll_mdc_close() call appears to be the error cleanup -- TODO
// confirm against the complete source.
331 /* Open a file, and (for the very first open) create objects on the OSTs at
332 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
333 * creation or open until ll_lov_setstripe() ioctl is called. We grab
334 * lli_open_sem to ensure no other process will create objects, send the
335 * stripe MD to the MDS, or try to destroy the objects if that fails.
337 * If we already have the stripe MD locally then we don't request it in
338 * mdc_open(), by passing a lmm_size = 0.
340 * It is up to the application to ensure no other processes open this file
341 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
342 * used. We might be able to avoid races of that sort by getting lli_open_sem
343 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
344 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
346 int ll_file_open(struct inode *inode, struct file *file)
348 struct ll_sb_info *sbi = ll_i2sbi(inode);
349 struct ll_inode_info *lli = ll_i2info(inode);
350 struct lustre_handle *conn = ll_i2obdconn(inode);
351 struct lookup_intent *it;
352 struct lov_stripe_md *lsm;
356 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
357 inode->i_generation, inode);
359 /* don't do anything for / */
360 if (inode->i_sb->s_root == file->f_dentry)
364 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_OPEN);
366 rc = ll_it_open_error(DISP_OPEN_OPEN, it);
370 rc = ll_local_open(file, it);
374 mdc_set_open_replay_data(&((struct ll_file_data *)
375 file->private_data)->fd_mds_och);
376 if (!S_ISREG(inode->i_mode))
381 if (file->f_flags & O_LOV_DELAY_CREATE ||
382 !(file->f_mode & FMODE_WRITE)) {
383 CDEBUG(D_INODE, "delaying object creation\n");
386 down(&lli->lli_open_sem);
388 rc = ll_create_obj(conn, inode, file, NULL);
389 up(&lli->lli_open_sem);
393 CERROR("warning: stripe already set on ino %lu\n",
395 up(&lli->lli_open_sem);
400 rc = ll_osc_open(conn, inode, file, lsm);
406 ll_mdc_close(&sbi->ll_mdc_conn, inode, file);
// ll_inode_getattr(): fetch object attributes from the OSTs and apply them
// to the inode, while trying not to replace i_size with a value that
// predates locally cached dirty pages.
// NOTE(review): this listing omits some lines of the function (third
// parameter line, declarations, loop keywords, branch conditions, returns).
//
// Visible steps:
//  - build an obdo for lsm->lsm_object_id; when ostdata is non-NULL the OST
//    handle is copied in and OBD_MD_FLHANDLE is set.
//  - sample obd_last_dirty_offset() before (bef/before) and after
//    (aft/after) the getattr, which is done either with obd_getattr() or
//    with obd_getattr_async() on a request set (the selecting condition is
//    not visible).
//  - the expression ending in `oa.o_size < ...);` looks like the test of a
//    retry loop around the getattr -- TODO confirm.
//  - block count, block size, mtime and ctime are refreshed from the obdo
//    and i_blksize is raised to at least PAGE_CACHE_SIZE.
//  - when aft == 0 and the returned size is below the last cached page, the
//    debug message says i_size is kept; obdo_to_inode(OBD_MD_FLSIZE) applies
//    the returned size (presumably in the other branch -- not visible).
411 * really does the getattr on the inode and updates its fields
413 int ll_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm,
416 struct ll_sb_info *sbi = ll_i2sbi(inode);
417 struct ll_inode_info *lli = ll_i2info(inode);
418 struct ptlrpc_request_set *set;
421 unsigned long before, after;
429 memset(&oa, 0, sizeof oa);
430 oa.o_id = lsm->lsm_object_id;
432 oa.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE |
433 OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | OBD_MD_FLMTIME |
436 if (ostdata != NULL) {
437 memcpy(obdo_handle(&oa), ostdata, FD_OSTDATA_SIZE);
438 oa.o_valid |= OBD_MD_FLHANDLE;
441 /* getattr can race with writeback. we don't want to trust a getattr
442 * that doesn't include the writeback of our farthest cached pages
443 * that it raced with. */
444 /* Now that the OSC knows the cached-page status, it can and should be
445 * adjusting its getattr results to include the maximum cached offset
446 * for its stripe(s). */
448 bef = obd_last_dirty_offset(ll_i2obdconn(inode), lli->lli_smd,
451 rc = obd_getattr(&sbi->ll_osc_conn, &oa, lsm);
453 set = ptlrpc_prep_set ();
455 CERROR ("ENOMEM allocing request set\n");
458 rc = obd_getattr_async(&sbi->ll_osc_conn, &oa, lsm, set);
460 rc = ptlrpc_set_wait (set);
461 ptlrpc_set_destroy (set);
467 aft = obd_last_dirty_offset(ll_i2obdconn(inode), lli->lli_smd,
469 CDEBUG(D_INODE, " %d,%lu -> %d,%lu\n", bef, before, aft, after);
471 (aft != 0 || after < before) &&
472 oa.o_size < ((u64)before + 1) << PAGE_CACHE_SHIFT);
474 obdo_refresh_inode(inode, &oa, (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
475 OBD_MD_FLMTIME | OBD_MD_FLCTIME));
476 if (inode->i_blksize < PAGE_CACHE_SIZE)
477 inode->i_blksize = PAGE_CACHE_SIZE;
479 /* make sure getattr doesn't return a size that causes writeback
480 * to forget about cached writes */
481 if ((aft == 0) && oa.o_size < ((u64)after + 1) << PAGE_CACHE_SHIFT) {
482 CDEBUG(D_INODE, "cached at %lu, keeping %llu i_size instead "
483 "of oa "LPU64"\n", after, inode->i_size,
488 obdo_to_inode(inode, &oa, OBD_MD_FLSIZE);
// NOTE(review): inode->i_size is passed for both size fields of the trace
// below -- confirm whether a different value was intended for one of them.
490 CDEBUG(D_INODE, "objid "LPX64" size %Lu/%Lu blksize %lu\n",
491 lsm->lsm_object_id, inode->i_size, inode->i_size,
// ll_remove_suid(): clear the setuid bit, and the setgid bit when group
// execute is also set, from inode->i_mode unless the caller has CAP_FSETID.
// NOTE(review): this listing omits some lines of the function (the
// declaration of `mode` and whatever follows the i_mode update).
496 static inline void ll_remove_suid(struct inode *inode)
// Candidate bits: S_ISUID always, plus S_ISGID when S_IXGRP is set in
// i_mode (the multiplication moves the S_IXGRP bit to the S_ISGID position).
500 /* set S_ISGID if S_IXGRP is set, and always set S_ISUID */
501 mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
// Keep only the candidate bits that are actually present in i_mode.
503 /* was any of the uid bits set? */
504 mode &= inode->i_mode;
505 if (mode && !capable(CAP_FSETID)) {
506 inode->i_mode &= ~mode;
507 // XXX careful here - we cannot change the size
// ll_update_atime(): set inode->i_atime to CURRENT_TIME, unless
// IS_RDONLY(inode) is true, in which case nothing is changed.
// NOTE(review): lines following the assignment are not visible in this
// listing.
512 static void ll_update_atime(struct inode *inode)
514 if (IS_RDONLY(inode)) return;
516 /* update atime, but don't explicitly write it out just this change */
517 inode->i_atime = CURRENT_TIME;
522 * flush the page cache for an extent as it is canceled. when we're on an
523 * lov we get a lock cancelation for each of the obd locks under the lov
524 * so we have to map the obd's region back onto the stripes in the file
527 * no one can dirty the extent until we've finished our work and they
528 * can enqueue another lock.
530 * XXX this could be asking the inode's dirty tree for info
// ll_pgcache_remove_extent(): start writeback for, and then drop, the page
// cache pages covered by the extent of `lock`.
// NOTE(review): this listing omits some lines of the function (declarations,
// branch bodies, loop exits, #else/#endif lines).
//
// Visible steps:
//  - convert the byte extent to page indices start/end, with a check for
//    shift overflow on end.
//  - for a file with more than one stripe, ask obd_get_info() with the
//    lock_to_stripe key for the stripe index, then map start/end from
//    per-object page indices to file page indices using count and skip.
//  - clear LLI_F_HAVE_SIZE_LOCK (the guarding condition is not visible).
//  - first loop: only while the lock was granted LCK_PW, find dirty pages
//    in the range, move them to locked_pages, clear the dirty bit and call
//    a_ops->writepage().
//  - second loop: for each page found in the range call
//    truncate_complete_page() when page->mapping is still set, then drop
//    the reference taken by find_get_page().
532 void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm,
533 struct ldlm_lock *lock)
535 struct ldlm_extent *extent = &lock->l_extent;
536 unsigned long start, end, count, skip, i, j;
541 CDEBUG(D_INODE, "obdo %lu inode %p ["LPU64"->"LPU64"] size: %llu\n",
542 inode->i_ino, inode, extent->start, extent->end, inode->i_size);
544 start = extent->start >> PAGE_CACHE_SHIFT;
547 end = (extent->end >> PAGE_CACHE_SHIFT) + 1;
// Overflow check: shifting the page index back up gives less than
// extent->end when the index does not fit.
548 if ((end << PAGE_CACHE_SHIFT) < extent->end)
550 if (lsm->lsm_stripe_count > 1) {
553 struct ldlm_lock *lock;
554 struct lov_stripe_md *lsm;
555 } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm };
557 __u32 vallen = sizeof(stripe);
560 /* get our offset in the lov */
561 rc = obd_get_info(ll_i2obdconn(inode), sizeof(key),
562 &key, &vallen, &stripe);
564 CERROR("obd_get_info: rc = %d\n", rc);
567 LASSERT(stripe < lsm->lsm_stripe_count);
// count: stripe size in pages (assuming lsm_stripe_size is in bytes --
// TODO confirm); skip: pages of the other stripes in each stripe row.
569 count = lsm->lsm_stripe_size >> PAGE_CACHE_SHIFT;
570 skip = (lsm->lsm_stripe_count - 1) * count;
571 start += (start/count * skip) + (stripe * count);
573 end += (end/count * skip) + (stripe * count);
576 i = (inode->i_size + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
578 clear_bit(LLI_F_HAVE_SIZE_LOCK, &(ll_i2info(inode)->lli_flags));
582 CDEBUG(D_INODE, "start: %lu j: %lu count: %lu skip: %lu end: %lu\n",
583 start, start % count, count, skip, end);
585 /* start writeback on dirty pages in the extent when it is PW */
586 for (i = start, j = start % count;
587 lock->l_granted_mode == LCK_PW && i < end; j++, i++) {
592 /* it is unlikely, but give us a chance to bail when we're out */
593 ll_pgcache_lock(inode->i_mapping);
594 if (list_empty(&inode->i_mapping->dirty_pages)) {
595 CDEBUG(D_INODE, "dirty list empty\n");
596 ll_pgcache_unlock(inode->i_mapping);
599 ll_pgcache_unlock(inode->i_mapping);
604 page = find_get_page(inode->i_mapping, i);
607 if (!PageDirty(page) || TryLockPage(page)) {
608 page_cache_release(page);
611 if (PageDirty(page)) {
612 CDEBUG(D_INODE, "writing page %p\n", page);
613 ll_pgcache_lock(inode->i_mapping);
614 list_del(&page->list);
615 list_add(&page->list, &inode->i_mapping->locked_pages);
616 ll_pgcache_unlock(inode->i_mapping);
618 /* this writepage might write out pages outside
619 * this extent, but that's ok, the pages are only
620 * still dirty because a lock still covers them */
621 ClearPageDirty(page);
622 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
623 ret = inode->i_mapping->a_ops->writepage(page);
625 ret = inode->i_mapping->a_ops->writepage(page, NULL);
632 page_cache_release(page);
636 /* our locks are page granular thanks to osc_enqueue, we invalidate the
638 LASSERT((extent->start & ~PAGE_CACHE_MASK) == 0);
639 LASSERT(((extent->end+1) & ~PAGE_CACHE_MASK) == 0);
640 for (i = start, j = start % count ; i < end ; j++, i++) {
645 ll_pgcache_lock(inode->i_mapping);
646 if (list_empty(&inode->i_mapping->dirty_pages) &&
647 list_empty(&inode->i_mapping->clean_pages) &&
648 list_empty(&inode->i_mapping->locked_pages)) {
649 CDEBUG(D_INODE, "nothing left\n");
650 ll_pgcache_unlock(inode->i_mapping);
653 ll_pgcache_unlock(inode->i_mapping);
656 page = find_get_page(inode->i_mapping, i);
659 CDEBUG(D_INODE, "dropping page %p at %lu\n", page, page->index);
661 if (page->mapping) /* might have raced */
662 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
663 truncate_complete_page(page);
665 truncate_complete_page(page->mapping, page);
668 page_cache_release(page);
// ll_extent_lock_callback(): callback attached to extent locks; `data` is
// used as the inode the lock belongs to.
// NOTE(review): this listing omits some lines of the function (last
// parameter line, the switch statement header, break/return lines).
//
// Visible behaviour:
//  - an inode pointer below 0x1000 is reported as bad data with LDLM_ERROR.
//  - LDLM_CB_BLOCKING: the lock is converted to a handle and cancelled with
//    ldlm_cli_cancel(); a failure is logged.
//  - LDLM_CB_CANCELING: an lli_smd pointer below 0x1000 is reported as a bad
//    inode; ll_pgcache_remove_extent() is called for the pages under the
//    lock.
673 static int ll_extent_lock_callback(struct ldlm_lock *lock,
674 struct ldlm_lock_desc *new, void *data,
677 struct inode *inode = data;
678 struct ll_inode_info *lli = ll_i2info(inode);
679 struct lustre_handle lockh = { 0 };
683 if ((unsigned long)inode < 0x1000) {
684 LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
689 case LDLM_CB_BLOCKING:
690 ldlm_lock2handle(lock, &lockh);
691 rc = ldlm_cli_cancel(&lockh);
693 CERROR("ldlm_cli_cancel failed: %d\n", rc);
695 case LDLM_CB_CANCELING:
696 /* FIXME: we could be given 'canceling intents' so that we
697 * could know to write-back or simply throw away the pages
698 * based on if the cancel comes from a desire to, say,
699 * read or truncate.. */
700 if ((unsigned long)lli->lli_smd < 0x1000) {
701 /* note that lli is part of the inode itself, so it
702 * is valid as long as the inode pointer checked above is. */
703 CERROR("inode %lu, sb %p, lli %p, lli_smd %p\n",
704 inode->i_ino, inode->i_sb, lli, lli->lli_smd);
705 LDLM_ERROR(lock, "cancel lock on bad inode %p", inode);
709 ll_pgcache_remove_extent(inode, lli->lli_smd, lock);
719 * some callers, notably truncate, really don't want i_size set based
720 * on the size returned by the getattr, or lock acquisition in
// ll_extent_lock_no_validate(): enqueue an extent lock of the given mode on
// the objects of the file through obd_enqueue(), with
// ll_extent_lock_callback as the lock callback.  The visible lines do not
// call ll_inode_getattr(), so i_size is not refreshed here.
// NOTE(review): this listing omits some lines of the function (declarations
// of rc and flags, the body of the ignore-lock branch, the tail of the
// obd_enqueue() call and the return).
//
// Parameters: fd may be NULL (it is tested before use); extent describes the
// byte range to lock; lockh receives the lock handle and must arrive with a
// zero cookie (asserted).
// Locking is special-cased when fd has LL_FILE_IGNORE_LOCK set or the
// superblock has LL_SBI_NOLCK set (branch body not visible).
//
// The length passed with the extent is sizeof(*extent): `extent` is a
// pointer, so sizeof(extent) would give the size of the pointer rather than
// of struct ldlm_extent.
723 int ll_extent_lock_no_validate(struct ll_file_data *fd, struct inode *inode,
724 struct lov_stripe_md *lsm,
725 int mode, struct ldlm_extent *extent,
726 struct lustre_handle *lockh)
728 struct ll_sb_info *sbi = ll_i2sbi(inode);
732 LASSERT(lockh->cookie == 0);
734 /* XXX phil: can we do this? won't it screw the file size up? */
735 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
736 (sbi->ll_flags & LL_SBI_NOLCK))
739 CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
740 inode->i_ino, extent->start, extent->end);
742 rc = obd_enqueue(&sbi->ll_osc_conn, lsm, NULL, LDLM_EXTENT, extent,
743 sizeof(*extent), mode, &flags, ll_extent_lock_callback,
// ll_extent_lock(): take an extent lock and then bring the inode size up to
// date.
// NOTE(review): this listing omits some lines of the function (branch
// conditions and bodies, the tail of the obd_cancel() call, returns).
//
// Visible steps:
//  - take the lock with ll_extent_lock_no_validate().
//  - LLI_F_HAVE_SIZE_LOCK is tested first (branch body not visible);
//    otherwise ll_inode_getattr() is called, passing the OST handle when fd
//    is non-NULL.  ll_extent_unlock() is called on what appears to be the
//    getattr failure path -- TODO confirm.
//  - obd_match() looks for a LCK_PR extent lock covering
//    [i_size, OBD_OBJECT_EOF]; on the visible success path
//    LLI_F_HAVE_SIZE_LOCK is set and obd_cancel() is called for the
//    matched lock.
750 * this grabs a lock and manually implements behaviour that makes it look like
751 * the OST is returning the file size with each lock acquisition.
753 int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
754 struct lov_stripe_md *lsm, int mode,
755 struct ldlm_extent *extent, struct lustre_handle *lockh)
757 struct ll_inode_info *lli = ll_i2info(inode);
758 struct ldlm_extent size_lock;
759 struct lustre_handle match_lockh = {0};
760 int flags, rc, matched;
763 rc = ll_extent_lock_no_validate(fd, inode, lsm, mode, extent, lockh);
767 if (test_bit(LLI_F_HAVE_SIZE_LOCK, &lli->lli_flags))
770 rc = ll_inode_getattr(inode, lsm, fd ? &fd->fd_ost_och : NULL);
772 ll_extent_unlock(fd, inode, lsm, mode, lockh);
776 size_lock.start = inode->i_size;
777 size_lock.end = OBD_OBJECT_EOF;
779 /* XXX I bet we should be checking the lock ignore flags.. */
780 flags = LDLM_FL_CBPENDING | LDLM_FL_BLOCK_GRANTED | LDLM_FL_MATCH_DATA;
781 matched = obd_match(&ll_i2sbi(inode)->ll_osc_conn, lsm, LDLM_EXTENT,
782 &size_lock, sizeof(size_lock), LCK_PR, &flags,
783 inode, &match_lockh);
785 /* hey, alright, we hold a size lock that covers the size we
786 * just found, it is not going to change for a while.. */
788 set_bit(LLI_F_HAVE_SIZE_LOCK, &lli->lli_flags);
789 obd_cancel(&ll_i2sbi(inode)->ll_osc_conn, lsm, LCK_PR,
// ll_extent_unlock(): release an extent lock of the given mode through
// obd_cancel().
// The same test as on the lock side is made first: fd (which may be NULL)
// having LL_FILE_IGNORE_LOCK set, or the superblock having LL_SBI_NOLCK
// set; the body of that branch is not visible.
// NOTE(review): this listing omits some lines of the function (declaration
// of rc, branch body, return).
796 int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
797 struct lov_stripe_md *lsm, int mode,
798 struct lustre_handle *lockh)
800 struct ll_sb_info *sbi = ll_i2sbi(inode);
804 /* XXX phil: can we do this? won't it screw the file size up? */
805 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
806 (sbi->ll_flags & LL_SBI_NOLCK))
809 rc = obd_cancel(&sbi->ll_osc_conn, lsm, mode, lockh);
// ll_file_read(): read through the page cache under a LCK_PR extent lock.
// NOTE(review): this listing omits some lines of the function (the ppos
// parameter line, declarations, the zero-count check, error branches and
// the return).
//
// Visible steps: trace the call, add to the LPROC_LL_READ_BYTES counter,
// lock the extent [*ppos, OBD_OBJECT_EOF] with ll_extent_lock(), link a
// ll_read_extent tagged with the current task into lli->lli_read_extents
// (under lli_read_extent_lock) for the duration of generic_file_read(),
// unlink it again and release the lock with ll_extent_unlock().
814 static ssize_t ll_file_read(struct file *filp, char *buf, size_t count,
817 struct ll_file_data *fd = filp->private_data;
818 struct inode *inode = filp->f_dentry->d_inode;
819 struct ll_inode_info *lli = ll_i2info(inode);
820 struct lov_stripe_md *lsm = lli->lli_smd;
821 struct lustre_handle lockh = { 0 };
822 struct ll_read_extent rextent;
826 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
827 inode->i_ino, inode->i_generation, inode, count, *ppos);
829 /* "If nbyte is 0, read() will return 0 and have no other results."
830 * -- Single Unix Spec */
834 lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_READ_BYTES,
840 /* grab a -> eof extent to push extending writes out of node's caches
841 * so we can see them at the getattr after lock acquisition. this will
842 * turn into a separate [*ppos + count, EOF] 'size intent' lock attempt
844 rextent.re_extent.start = *ppos;
845 rextent.re_extent.end = OBD_OBJECT_EOF;
847 err = ll_extent_lock(fd, inode, lsm, LCK_PR, &rextent.re_extent,&lockh);
851 /* XXX tell ll_readpage what pages have a PR lock.. */
852 rextent.re_task = current;
853 spin_lock(&lli->lli_read_extent_lock);
854 list_add(&rextent.re_lli_item, &lli->lli_read_extents);
855 spin_unlock(&lli->lli_read_extent_lock);
857 CDEBUG(D_INFO, "Reading inode %lu, "LPSZ" bytes, offset %Ld\n",
858 inode->i_ino, count, *ppos);
859 retval = generic_file_read(filp, buf, count, ppos);
861 spin_lock(&lli->lli_read_extent_lock);
862 list_del(&rextent.re_lli_item);
863 spin_unlock(&lli->lli_read_extent_lock);
866 ll_extent_unlock(fd, inode, lsm, LCK_PR, &lockh);
// ll_file_write(): write handler; it is the `write` member of
// ll_file_operations in this file.
// NOTE(review): this listing omits some lines of the function (the ppos
// parameter line, declarations, branch conditions and bodies, the out
// label and the return).
//
// Visible steps:
//  - trace the call and run ll_check_dirty() on the superblock.
//  - choose the lock extent: with O_APPEND it ends at OBD_OBJECT_EOF (the
//    start assignment is not visible); otherwise it is
//    [*ppos, *ppos + count - 1].
//  - take a LCK_PW lock with ll_extent_lock() or
//    ll_extent_lock_no_validate(); presumably should_validate selects
//    between them and is cleared for fully page-aligned writes -- the
//    selecting lines are not visible, TODO confirm.
//  - with O_APPEND set *ppos to i_size; a write at or beyond maxbytes sends
//    SIGXFSZ and fails with -EFBIG; count is clamped so the write ends at
//    maxbytes.
//  - call generic_file_write(), add to LPROC_LL_WRITE_BYTES and release the
//    lock with ll_extent_unlock().
871 * Write to a file (through the page cache).
873 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
876 struct ll_file_data *fd = file->private_data;
877 struct inode *inode = file->f_dentry->d_inode;
878 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
879 struct lustre_handle lockh = { 0 };
880 struct ldlm_extent extent;
881 loff_t maxbytes = ll_file_maxbytes(inode);
884 char should_validate = 1;
886 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
887 inode->i_ino, inode->i_generation, inode, count, *ppos);
889 SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
891 * sleep doing some writeback work of this mount's dirty data
892 * if the VM thinks we're low on memory.. other dirtying code
893 * paths should think about doing this, too, but they should be
894 * careful not to hold locked pages while they do so. like
895 * ll_prepare_write. *cough*
897 ll_check_dirty(inode->i_sb);
899 /* POSIX, but surprised the VFS doesn't check this already */
905 if (file->f_flags & O_APPEND) {
907 extent.end = OBD_OBJECT_EOF;
909 extent.start = *ppos;
910 extent.end = *ppos + count - 1;
911 /* we really don't care what i_size is if we're doing
912 * fully page aligned writes */
913 if ((*ppos & ~PAGE_CACHE_MASK) == 0 &&
914 (count & ~PAGE_CACHE_MASK) == 0)
919 err = ll_extent_lock(fd, inode, lsm, LCK_PW, &extent, &lockh);
921 err = ll_extent_lock_no_validate(fd, inode, lsm, LCK_PW,
926 /* this is ok, g_f_w will overwrite this under i_sem if it races
927 * with a local truncate, it just makes our maxbyte checking easier */
928 if (file->f_flags & O_APPEND)
929 *ppos = inode->i_size;
931 if (*ppos >= maxbytes) {
932 if (count || *ppos > maxbytes) {
933 send_sig(SIGXFSZ, current, 0);
934 GOTO(out, retval = -EFBIG);
937 if (*ppos + count > maxbytes)
938 count = maxbytes - *ppos;
940 CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
941 inode->i_ino, count, *ppos);
943 /* generic_file_write handles O_APPEND after getting i_sem */
944 retval = generic_file_write(file, buf, count, ppos);
948 lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_WRITE_BYTES,
950 ll_extent_unlock(fd, inode, lsm, LCK_PW, &lockh);
// ll_lov_setstripe(): handler for the LL_IOC_LOV_SETSTRIPE ioctl (called
// from ll_file_ioctl() in this file).
// NOTE(review): this listing omits some lines of the function (the arg
// parameter line, branch conditions, returns).
//
// Visible steps, all bracketed by lli->lli_open_sem:
//  - one path logs that the stripe already exists and, when
//    O_LOV_DELAY_CREATE is still set on the file, opens the objects with
//    ll_osc_open().
//  - otherwise obd_iocontrol(LL_IOC_LOV_SETSTRIPE) fills lsm from the user
//    argument, ll_create_obj() creates the objects with it, lsm is freed
//    with obd_free_memmd() on one visible path, and ll_osc_open() opens the
//    objects using lli->lli_smd.
954 static int ll_lov_setstripe(struct inode *inode, struct file *file,
957 struct ll_inode_info *lli = ll_i2info(inode);
958 struct lustre_handle *conn = ll_i2obdconn(inode);
959 struct lov_stripe_md *lsm;
963 down(&lli->lli_open_sem);
966 up(&lli->lli_open_sem);
967 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
969 /* If we haven't already done the open, do so now */
970 if (file->f_flags & O_LOV_DELAY_CREATE) {
971 int rc2 = ll_osc_open(conn, inode, file, lsm);
979 rc = obd_iocontrol(LL_IOC_LOV_SETSTRIPE, conn, 0, &lsm, (void *)arg);
981 up(&lli->lli_open_sem);
984 rc = ll_create_obj(conn, inode, file, lsm);
985 up(&lli->lli_open_sem);
988 obd_free_memmd(conn, &lsm);
991 rc = ll_osc_open(conn, inode, file, lli->lli_smd);
// ll_lov_getstripe(): handler for the LL_IOC_LOV_GETSTRIPE ioctl; passes the
// stripe MD of the inode and the user argument to obd_iocontrol() and
// returns its result.
// NOTE(review): lines between the declarations and the return are missing
// from this listing, so any early-exit check is not visible.
995 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
997 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
998 struct lustre_handle *conn = ll_i2obdconn(inode);
1003 return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, conn, 0, lsm, (void *)arg);
// ll_file_ioctl(): ioctl handler; it is the `ioctl` member of
// ll_file_operations in this file.
// NOTE(review): this listing omits some lines of the function (the arg
// parameter line, the switch header, several returns, comment terminators).
//
// Visible behaviour:
//  - ioctls of type T (tty ioctls) are special-cased before anything else
//    (branch body not visible); then LPROC_LL_IOCTL is incremented.
//  - LL_IOC_GETFLAGS copies fd->fd_flags to user space with put_user().
//  - LL_IOC_SETFLAGS / LL_IOC_CLRFLAGS read an int with get_user() and set
//    or clear those bits in fd->fd_flags.
//  - LL_IOC_LOV_SETSTRIPE / LL_IOC_LOV_GETSTRIPE go to ll_lov_setstripe()
//    and ll_lov_getstripe().
//  - the EXT2_IOC_* case labels follow an unterminated comment in this
//    listing; presumably they are commented out -- TODO confirm.
//  - the last visible lines forward cmd and arg to obd_iocontrol().
1006 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
1009 struct ll_file_data *fd = file->private_data;
1010 struct lustre_handle *conn;
1013 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%u\n", inode->i_ino,
1014 inode->i_generation, inode, cmd);
1016 if (_IOC_TYPE(cmd) == 'T') /* tty ioctls */
1019 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_IOCTL);
1021 case LL_IOC_GETFLAGS:
1022 /* Get the current value of the file flags */
1023 return put_user(fd->fd_flags, (int *)arg);
1024 case LL_IOC_SETFLAGS:
1025 case LL_IOC_CLRFLAGS:
1026 /* Set or clear specific file flags */
1027 /* XXX This probably needs checks to ensure the flags are
1028 * not abused, and to handle any flag side effects.
1030 if (get_user(flags, (int *) arg))
1033 if (cmd == LL_IOC_SETFLAGS)
1034 fd->fd_flags |= flags;
1036 fd->fd_flags &= ~flags;
1038 case LL_IOC_LOV_SETSTRIPE:
1039 return ll_lov_setstripe(inode, file, arg);
1040 case LL_IOC_LOV_GETSTRIPE:
1041 return ll_lov_getstripe(inode, arg);
1043 /* We need to special case any other ioctls we want to handle,
1044 * to send them to the MDS/OST as appropriate and to properly
1045 * network encode the arg field.
1046 case EXT2_IOC_GETFLAGS:
1047 case EXT2_IOC_SETFLAGS:
1048 case EXT2_IOC_GETVERSION_OLD:
1049 case EXT2_IOC_GETVERSION_NEW:
1050 case EXT2_IOC_SETVERSION_OLD:
1051 case EXT2_IOC_SETVERSION_NEW:
1054 conn = ll_i2obdconn(inode);
1055 return obd_iocontrol(cmd, conn, 0, NULL, (void *)arg);
// ll_file_seek(): llseek handler; it is the `llseek` member of
// ll_file_operations in this file.
// NOTE(review): this listing omits some lines of the function (declarations,
// branch bodies, the return value handling).
//
// Visible behaviour:
//  - origin 2 (SEEK_END): take a LCK_PR extent lock over
//    [0, OBD_OBJECT_EOF] with ll_extent_lock() so that i_size is current,
//    then add i_size to offset.
//  - origin 1 (SEEK_CUR): add file->f_pos to offset.
//  - an offset within [0, ll_file_maxbytes(inode)] that differs from f_pos
//    is stored in file->f_pos (plus an f_version update on kernels older
//    than 2.5.0).
//  - ll_extent_unlock() releases the LCK_PR lock (the guarding condition is
//    not visible).
// NOTE(review): the trace below adds file->f_pos for every origin other
// than 2, including origin 0, so the logged target can differ from the
// offset actually used -- confirm whether that matters.
1059 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
1061 struct inode *inode = file->f_dentry->d_inode;
1062 struct ll_file_data *fd = file->private_data;
1063 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1064 struct lustre_handle lockh = {0};
1067 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),to=%llu\n", inode->i_ino,
1068 inode->i_generation, inode,
1069 offset + ((origin==2) ? inode->i_size : file->f_pos));
1071 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_LLSEEK);
1072 if (origin == 2) { /* SEEK_END */
1074 struct ldlm_extent extent = {0, OBD_OBJECT_EOF};
1075 err = ll_extent_lock(fd, inode, lsm, LCK_PR, &extent, &lockh);
1076 if (err != ELDLM_OK)
1079 offset += inode->i_size;
1080 } else if (origin == 1) { /* SEEK_CUR */
1081 offset += file->f_pos;
1085 if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
1086 if (offset != file->f_pos) {
1087 file->f_pos = offset;
1088 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
1090 file->f_version = ++event;
1097 ll_extent_unlock(fd, inode, lsm, LCK_PR, &lockh);
// ll_fsync(): fsync handler.  Traces the call, increments LPROC_LL_FSYNC,
// then calls filemap_fdatasync() followed by filemap_fdatawait() on the
// mapping of the inode.
// NOTE(review): this listing omits some lines of the function; how the two
// return codes are combined and returned is not visible.
1101 int ll_fsync(struct file *file, struct dentry *dentry, int data)
1103 struct inode *inode = dentry->d_inode;
1106 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
1107 inode->i_generation, inode);
1109 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_FSYNC);
1111 * filemap_fdata{sync,wait} are also called at PW lock cancelation so
1112 * we know that they can only find data to writeback here if we are
1113 * still holding the PW lock that covered the dirty pages. XXX we
1114 * should probably get a reference on it, though, just to be clear.
1116 rc = filemap_fdatasync(inode->i_mapping);
1118 rc = filemap_fdatawait(inode->i_mapping);
// ll_inode_revalidate_it(): revalidate the attributes of an inode; it is the
// `revalidate_it` member of both inode_operations tables in this file.
// NOTE(review): this listing omits some lines of the function (declarations,
// branch conditions and bodies, comment terminators, returns).
//
// Visible steps:
//  - an error line is logged on a path whose condition is not visible.
//  - when the intent holds no lock mode and ll_have_md_lock() reports no
//    lock, attributes are fetched from the MDS with mdc_getattr(); for
//    regular files the request also asks for the stripe EA
//    (OBD_MD_FLEASIZE, sized by obd_size_diskmd()).
//  - the reply is unpacked with mdc_req2lustre_md(); a mismatch between the
//    requested and returned OBD_MD_FLEASIZE bit is logged; ll_update_inode()
//    applies the result, and an lsm that was not installed in the inode is
//    freed with obd_free_memmd().
//  - if the inode has no stripe MD the size is not validated; otherwise a
//    LCK_PR extent lock over [0, OBD_OBJECT_EOF] is taken with
//    ll_extent_lock() and released again with ll_extent_unlock().
1123 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
1125 struct inode *inode = dentry->d_inode;
1126 struct lov_stripe_md *lsm;
1130 CERROR("REPORT THIS LINE TO PETER\n");
1133 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
1134 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
1135 #if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,5,0))
1136 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_REVALIDATE);
1139 /* this is very tricky. it is unsafe to call ll_have_md_lock
1140 when we have a referenced lock: because it may cause an RPC
1141 below when the lock is marked CB_PENDING. That RPC may not
1142 go out because someone else may be in another RPC waiting for
1144 if (!(it && it->it_lock_mode) && !ll_have_md_lock(dentry)) {
1145 struct lustre_md md;
1146 struct ptlrpc_request *req = NULL;
1147 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
1149 unsigned long valid = 0;
1153 if (S_ISREG(inode->i_mode)) {
1154 ealen = obd_size_diskmd(&sbi->ll_osc_conn, NULL);
1155 valid |= OBD_MD_FLEASIZE;
1157 ll_inode2fid(&fid, inode);
1158 rc = mdc_getattr(&sbi->ll_mdc_conn, &fid, valid, ealen, &req);
1160 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
1163 rc = mdc_req2lustre_md(req, 0, &sbi->ll_osc_conn, &md);
1165 /* XXX Too paranoid? */
1166 if ((md.body->valid ^ valid) & OBD_MD_FLEASIZE)
1167 CERROR("Asked for %s eadata but got %s\n",
1168 (valid & OBD_MD_FLEASIZE) ? "some" : "no",
1169 (md.body->valid & OBD_MD_FLEASIZE) ? "some":
1172 ptlrpc_req_finished(req);
1176 ll_update_inode(inode, md.body, md.lsm);
1177 if (md.lsm != NULL && ll_i2info(inode)->lli_smd != md.lsm)
1178 obd_free_memmd(&sbi->ll_osc_conn, &md.lsm);
1180 ptlrpc_req_finished(req);
1183 lsm = ll_i2info(inode)->lli_smd;
1184 if (!lsm) /* object not yet allocated, don't validate size */
1188 * unfortunately stat comes in through revalidate and we don't
1189 * differentiate this use from initial instantiation. we're
1190 * also being wildly conservative and flushing write caches
1191 * so that stat really returns the proper size.
1194 struct ldlm_extent extent = {0, OBD_OBJECT_EOF};
1195 struct lustre_handle lockh = {0};
1198 err = ll_extent_lock(NULL, inode, lsm, LCK_PR, &extent, &lockh);
1199 if (err != ELDLM_OK)
1202 ll_extent_unlock(NULL, inode, lsm, LCK_PR, &lockh);
// ll_getattr(): getattr handler, compiled only for kernels newer than 2.5.0;
// it is the `getattr_it` member of both inode_operations tables in this
// file.
// It revalidates the inode with ll_inode_revalidate_it(), increments
// LPROC_LL_GETATTR and copies the inode fields (device, inode number, mode,
// link count, owner, rdev, times, size, block size, block count) into
// *stat.
// NOTE(review): this listing omits some lines of the function (the stat
// parameter line, the check of `res`, the return and the closing #endif).
1207 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
1208 int ll_getattr(struct vfsmount *mnt, struct dentry *de,
1209 struct lookup_intent *it,
1213 struct inode *inode = de->d_inode;
1215 res = ll_inode_revalidate_it(de, it);
1216 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_GETATTR);
1221 stat->dev = inode->i_sb->s_dev;
1222 stat->ino = inode->i_ino;
1223 stat->mode = inode->i_mode;
1224 stat->nlink = inode->i_nlink;
1225 stat->uid = inode->i_uid;
1226 stat->gid = inode->i_gid;
1227 stat->rdev = kdev_t_to_nr(inode->i_rdev);
1228 stat->atime = inode->i_atime;
1229 stat->mtime = inode->i_mtime;
1230 stat->ctime = inode->i_ctime;
1231 stat->size = inode->i_size;
1232 stat->blksize = inode->i_blksize;
1233 stat->blocks = inode->i_blocks;
// File operations table.  The visible members map write, ioctl, release and
// llseek to the ll_* handlers defined in this file; mmap uses
// generic_file_mmap.
// NOTE(review): some members are missing from this listing (the embedded
// line numbers skip), so this is not the complete table.
1238 struct file_operations ll_file_operations = {
1240 write: ll_file_write,
1241 ioctl: ll_file_ioctl,
1243 release: ll_file_release,
1244 mmap: generic_file_mmap,
1245 llseek: ll_file_seek,
// Inode operations table with setattr_raw, setattr, truncate and
// revalidate_it handlers; getattr_it is added only for kernels newer than
// 2.5.0.  ll_setattr_raw, ll_setattr and ll_truncate are defined outside the
// lines visible here.
// NOTE(review): the closing #endif and brace are missing from this listing.
1249 struct inode_operations ll_file_inode_operations = {
1250 setattr_raw: ll_setattr_raw,
1251 setattr: ll_setattr,
1252 truncate: ll_truncate,
1253 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
1254 getattr_it: ll_getattr,
1256 revalidate_it: ll_inode_revalidate_it,
// Inode operations table with setattr_raw, setattr and revalidate_it
// handlers (no truncate member is visible); getattr_it is added only for
// kernels newer than 2.5.0.  Presumably used for special (non-regular)
// inodes, judging by the name -- TODO confirm against the users of this
// table.
// NOTE(review): the closing #endif and brace are missing from this listing.
1260 struct inode_operations ll_special_inode_operations = {
1261 setattr_raw: ll_setattr_raw,
1262 setattr: ll_setattr,
1263 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
1264 getattr_it: ll_getattr,
1266 revalidate_it: ll_inode_revalidate_it,