/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * * Copyright (c) 2002, 2003 Cluster File Systems, Inc. * Author: Peter Braam * Author: Phil Schwan * Author: Andreas Dilger * * This file is part of Lustre, http://www.lustre.org. * * Lustre is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. * * Lustre is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Lustre; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #define DEBUG_SUBSYSTEM S_LLITE #include #include #include /* for lov_mds_md_size() in lov_setstripe() */ #include #include #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) #include #endif int ll_inode_setattr(struct inode *inode, struct iattr *attr, int do_trunc); extern int ll_setattr(struct dentry *de, struct iattr *attr); static int ll_mdc_close(struct lustre_handle *mdc_conn, struct inode *inode, struct file *file) { struct ll_file_data *fd = file->private_data; struct ptlrpc_request *req = NULL; unsigned long flags; struct obd_import *imp; int rc; ENTRY; /* Complete the open request and remove it from replay list */ rc = mdc_close(&ll_i2sbi(inode)->ll_mdc_conn, inode->i_ino, inode->i_mode, &fd->fd_mds_och.och_fh, &req); if (rc) CERROR("inode %lu close failed: rc = %d\n", inode->i_ino, rc); imp = fd->fd_mds_och.och_req->rq_import; LASSERT(imp != NULL); spin_lock_irqsave(&imp->imp_lock, flags); DEBUG_REQ(D_HA, fd->fd_mds_och.och_req, "matched open req %p", fd->fd_mds_och.och_req); /* We held on to the request for replay until we saw a close for that * file. Now that we've closed it, it gets replayed on the basis of * its transno only. */ spin_lock (&fd->fd_mds_och.och_req->rq_lock); fd->fd_mds_och.och_req->rq_replay = 0; spin_unlock (&fd->fd_mds_och.och_req->rq_lock); if (fd->fd_mds_och.och_req->rq_transno) { /* This open created a file, so it needs replay as a * normal transaction now. Our reference to it now * effectively owned by the imp_replay_list, and it'll * be committed just like other transno-having * requests from here on out. */ /* We now retain this close request, so that it is * replayed if the open is replayed. We duplicate the * transno, so that we get freed at the right time, * and rely on the difference in xid to keep * everything ordered correctly. * * But! If this close was already given a transno * (because it caused real unlinking of an * open-unlinked file, f.e.), then we'll be ordered on * the basis of that and we don't need to do anything * magical here. */ if (!req->rq_transno) { req->rq_transno = fd->fd_mds_och.och_req->rq_transno; ptlrpc_retain_replayable_request(req, imp); } spin_unlock_irqrestore(&imp->imp_lock, flags); /* Should we free_committed now? we always free before * replay, so it's probably a wash. We could check to * see if the fd_req should already be committed, in * which case we can avoid the whole retain_replayable * dance. */ } else { /* No transno means that we can just drop our ref. */ spin_unlock_irqrestore(&imp->imp_lock, flags); } ptlrpc_req_finished(fd->fd_mds_och.och_req); /* Do this after the fd_req->rq_transno check, because we don't want * to bounce off zero references. */ ptlrpc_req_finished(req); fd->fd_mds_och.och_fh.cookie = DEAD_HANDLE_MAGIC; file->private_data = NULL; OBD_SLAB_FREE(fd, ll_file_data_slab, sizeof *fd); RETURN(-abs(rc)); } /* While this returns an error code, fput() the caller does not, so we need * to make every effort to clean up all of our state here. Also, applications * rarely check close errors and even if an error is returned they will not * re-try the close call. */ int ll_file_release(struct inode *inode, struct file *file) { struct ll_file_data *fd; struct obdo oa; struct ll_sb_info *sbi = ll_i2sbi(inode); struct ll_inode_info *lli = ll_i2info(inode); struct lov_stripe_md *lsm = lli->lli_smd; int rc = 0, rc2; ENTRY; CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino, inode->i_generation, inode); /* don't do anything for / */ if (inode->i_sb->s_root == file->f_dentry) RETURN(0); lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_RELEASE); fd = (struct ll_file_data *)file->private_data; if (!fd) /* no process opened the file after an mcreate */ RETURN(rc = 0); /* we might not be able to get a valid handle on this file * again so we really want to flush our write cache.. */ if (S_ISREG(inode->i_mode)) { filemap_fdatasync(inode->i_mapping); filemap_fdatawait(inode->i_mapping); if (lsm != NULL) { memset(&oa, 0, sizeof(oa)); oa.o_id = lsm->lsm_object_id; oa.o_mode = S_IFREG; oa.o_valid = OBD_MD_FLTYPE | OBD_MD_FLID; memcpy(&oa.o_inline, &fd->fd_ost_och, FD_OSTDATA_SIZE); oa.o_valid |= OBD_MD_FLHANDLE; rc = obd_close(&sbi->ll_osc_conn, &oa, lsm, NULL); if (rc) CERROR("inode %lu object close failed: rc = " "%d\n", inode->i_ino, rc); } } rc2 = ll_mdc_close(&sbi->ll_mdc_conn, inode, file); if (rc2 && !rc) rc = rc2; RETURN(rc); } static int ll_local_open(struct file *file, struct lookup_intent *it) { struct ptlrpc_request *req = it->it_data; struct ll_file_data *fd; struct mds_body *body; ENTRY; body = lustre_msg_buf (req->rq_repmsg, 1, sizeof (*body)); LASSERT (body != NULL); /* reply already checked out */ LASSERT_REPSWABBED (req, 1); /* and swabbed down */ LASSERT(!file->private_data); OBD_SLAB_ALLOC(fd, ll_file_data_slab, SLAB_KERNEL, sizeof *fd); /* We can't handle this well without reorganizing ll_file_open and * ll_mdc_close, so don't even try right now. */ LASSERT(fd != NULL); memset(fd, 0, sizeof(*fd)); memcpy(&fd->fd_mds_och.och_fh, &body->handle, sizeof(body->handle)); fd->fd_mds_och.och_req = it->it_data; file->private_data = fd; RETURN(0); } static int ll_osc_open(struct lustre_handle *conn, struct inode *inode, struct file *file, struct lov_stripe_md *lsm) { struct ll_file_data *fd = file->private_data; struct obdo *oa; int rc; ENTRY; oa = obdo_alloc(); if (!oa) RETURN(-ENOMEM); oa->o_id = lsm->lsm_object_id; oa->o_mode = S_IFREG; oa->o_valid = (OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLBLOCKS | OBD_MD_FLMTIME | OBD_MD_FLCTIME); rc = obd_open(conn, oa, lsm, NULL, &fd->fd_ost_och); if (rc) GOTO(out, rc); file->f_flags &= ~O_LOV_DELAY_CREATE; obdo_to_inode(inode, oa, OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | OBD_MD_FLMTIME | OBD_MD_FLCTIME); EXIT; out: obdo_free(oa); return rc; } /* Caller must hold lli_open_sem to protect lli->lli_smd from changing and * duplicate objects from being created. We only install lsm to lli_smd if * the mdc open was successful (hence stored stripe MD on MDS), otherwise * other nodes could try to create different objects for the same file. */ static int ll_create_obj(struct lustre_handle *conn, struct inode *inode, struct file *file, struct lov_stripe_md *lsm) { struct ptlrpc_request *req = NULL; struct ll_inode_info *lli = ll_i2info(inode); struct lov_mds_md *lmm = NULL; struct obdo *oa; struct iattr iattr; struct mdc_op_data op_data; int rc, err, lmm_size = 0;; ENTRY; oa = obdo_alloc(); if (!oa) RETURN(-ENOMEM); oa->o_mode = S_IFREG | 0600; oa->o_id = inode->i_ino; /* Keep these 0 for now, because chown/chgrp does not change the * ownership on the OST, and we don't want to allow BA OST NFS * users to access these objects by mistake. */ oa->o_uid = 0; oa->o_gid = 0; oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMODE | OBD_MD_FLUID | OBD_MD_FLGID; rc = obd_create(conn, oa, &lsm, NULL); if (rc) { CERROR("error creating objects for inode %lu: rc = %d\n", inode->i_ino, rc); if (rc > 0) { CERROR("obd_create returned invalid rc %d\n", rc); rc = -EIO; } GOTO(out_oa, rc); } obdo_to_inode(inode, oa, OBD_MD_FLBLKSZ); LASSERT(lsm && lsm->lsm_object_id); rc = obd_packmd(conn, &lmm, lsm); if (rc < 0) GOTO(out_destroy, rc); lmm_size = rc; /* Save the stripe MD with this file on the MDS */ memset(&iattr, 0, sizeof(iattr)); iattr.ia_valid = ATTR_FROM_OPEN; ll_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0); rc = mdc_setattr(&ll_i2sbi(inode)->ll_mdc_conn, &op_data, &iattr, lmm, lmm_size, &req); ptlrpc_req_finished(req); obd_free_diskmd (conn, &lmm); /* If we couldn't complete mdc_open() and store the stripe MD on the * MDS, we need to destroy the objects now or they will be leaked. */ if (rc) { CERROR("error: storing stripe MD for %lu: rc %d\n", inode->i_ino, rc); GOTO(out_destroy, rc); } lli->lli_smd = lsm; lli->lli_maxbytes = lsm->lsm_maxbytes; EXIT; out_oa: obdo_free(oa); return rc; out_destroy: obdo_from_inode(oa, inode, OBD_MD_FLTYPE); oa->o_id = lsm->lsm_object_id; oa->o_valid |= OBD_MD_FLID; err = obd_destroy(conn, oa, lsm, NULL); obd_free_memmd(conn, &lsm); if (err) CERROR("error uncreating inode %lu objects: rc %d\n", inode->i_ino, err); goto out_oa; } /* Open a file, and (for the very first open) create objects on the OSTs at * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object * creation or open until ll_lov_setstripe() ioctl is called. We grab * lli_open_sem to ensure no other process will create objects, send the * stripe MD to the MDS, or try to destroy the objects if that fails. * * If we already have the stripe MD locally then we don't request it in * mdc_open(), by passing a lmm_size = 0. * * It is up to the application to ensure no other processes open this file * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be * used. We might be able to avoid races of that sort by getting lli_open_sem * before returning in the O_LOV_DELAY_CREATE case and dropping it here * or in ll_file_release(), but I'm not sure that is desirable/necessary. */ extern int ll_it_open_error(int phase, struct lookup_intent *it); int ll_file_open(struct inode *inode, struct file *file) { struct ll_sb_info *sbi = ll_i2sbi(inode); struct ll_inode_info *lli = ll_i2info(inode); struct lustre_handle *conn = ll_i2obdconn(inode); struct lookup_intent *it; struct lov_stripe_md *lsm; int rc = 0; ENTRY; CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino, inode->i_generation, inode); /* don't do anything for / */ if (inode->i_sb->s_root == file->f_dentry) RETURN(0); lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_OPEN); LL_GET_INTENT(file->f_dentry, it); rc = ll_it_open_error(IT_OPEN_OPEN, it); if (rc) RETURN(rc); rc = ll_local_open(file, it); if (rc) LBUG(); mdc_set_open_replay_data(&((struct ll_file_data *) file->private_data)->fd_mds_och); if (!S_ISREG(inode->i_mode)) RETURN(0); lsm = lli->lli_smd; if (lsm == NULL) { if (file->f_flags & O_LOV_DELAY_CREATE) { CDEBUG(D_INODE, "delaying object creation\n"); RETURN(0); } down(&lli->lli_open_sem); if (!lli->lli_smd) { rc = ll_create_obj(conn, inode, file, NULL); up(&lli->lli_open_sem); if (rc) GOTO(out_close, rc); } else { CERROR("warning: stripe already set on ino %lu\n", inode->i_ino); up(&lli->lli_open_sem); } lsm = lli->lli_smd; } rc = ll_osc_open(conn, inode, file, lsm); if (rc) GOTO(out_close, rc); RETURN(0); out_close: ll_mdc_close(&sbi->ll_mdc_conn, inode, file); return rc; } /* * really does the getattr on the inode and updates its fields */ int ll_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm, void *ostdata) { struct ll_sb_info *sbi = ll_i2sbi(inode); struct ll_inode_info *lli = ll_i2info(inode); struct ptlrpc_request_set *set; struct obdo oa; int bef, aft; unsigned long before, after; int rc; ENTRY; LASSERT(lsm); LASSERT(sbi); LASSERT(lli); memset(&oa, 0, sizeof oa); oa.o_id = lsm->lsm_object_id; oa.o_mode = S_IFREG; oa.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE | OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | OBD_MD_FLMTIME | OBD_MD_FLCTIME; if (ostdata != NULL) { memcpy(&oa.o_inline, ostdata, FD_OSTDATA_SIZE); oa.o_valid |= OBD_MD_FLHANDLE; } /* getattr can race with writeback. we don't want to trust a getattr * that doesn't include the writeback of our farthest cached pages * that it raced with. */ /* Now that the OSC knows the cached-page status, it can and should be * adjusting its getattr results to include the maximum cached offset * for its stripe(s). */ do { bef = obd_last_dirty_offset(ll_i2obdconn(inode), lli->lli_smd, &before); #if 0 rc = obd_getattr(&sbi->ll_osc_conn, &oa, lsm); #else set = ptlrpc_prep_set (); if (set == NULL) { CERROR ("ENOMEM allocing request set\n"); rc = -ENOMEM; } else { rc = obd_getattr_async(&sbi->ll_osc_conn, &oa, lsm, set); if (rc == 0) rc = ptlrpc_set_wait (set); ptlrpc_set_destroy (set); } #endif if (rc) RETURN(rc); aft = obd_last_dirty_offset(ll_i2obdconn(inode), lli->lli_smd, &after); CDEBUG(D_INODE, " %d,%lu -> %d,%lu\n", bef, before, aft, after); } while (bef == 0 && (aft != 0 || after < before) && oa.o_size < ((u64)before + 1) << PAGE_CACHE_SHIFT); obdo_to_inode(inode, &oa, (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | OBD_MD_FLMTIME | OBD_MD_FLCTIME)); if (inode->i_blksize < PAGE_CACHE_SIZE) inode->i_blksize = PAGE_CACHE_SIZE; /* make sure getattr doesn't return a size that causes writeback * to forget about cached writes */ if ((aft == 0) && oa.o_size < ((u64)after + 1) << PAGE_CACHE_SHIFT) { CDEBUG(D_INODE, "cached at %lu, keeping %llu i_size instead " "of oa "LPU64"\n", after, inode->i_size, oa.o_size); RETURN(0); } obdo_to_inode(inode, &oa, OBD_MD_FLSIZE); CDEBUG(D_INODE, "objid "LPX64" size %Lu/%Lu blksize %lu\n", lsm->lsm_object_id, inode->i_size, inode->i_size, inode->i_blksize); RETURN(0); } /* * some callers, notably truncate, really don't want i_size set based * on the the size returned by the getattr, or lock acquisition in * the future. */ int ll_extent_lock_no_validate(struct ll_file_data *fd, struct inode *inode, struct lov_stripe_md *lsm, int mode, struct ldlm_extent *extent, struct lustre_handle *lockh) { struct ll_sb_info *sbi = ll_i2sbi(inode); int rc, flags = 0; ENTRY; LASSERT(lockh->cookie == 0); /* XXX phil: can we do this? won't it screw the file size up? */ if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) || (sbi->ll_flags & LL_SBI_NOLCK)) RETURN(0); CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n", inode->i_ino, extent->start, extent->end); rc = obd_enqueue(&sbi->ll_osc_conn, lsm, NULL, LDLM_EXTENT, extent, sizeof(extent), mode, &flags, ll_extent_lock_callback, inode, lockh); RETURN(rc); } /* * this grabs a lock and manually implements behaviour that makes it look like * the OST is returning the file size with each lock acquisition. */ int ll_extent_lock(struct ll_file_data *fd, struct inode *inode, struct lov_stripe_md *lsm, int mode, struct ldlm_extent *extent, struct lustre_handle *lockh) { struct ll_inode_info *lli = ll_i2info(inode); struct ldlm_extent size_lock; struct lustre_handle match_lockh = {0}; int flags, rc, matched; ENTRY; rc = ll_extent_lock_no_validate(fd, inode, lsm, mode, extent, lockh); if (rc != ELDLM_OK) RETURN(rc); if (test_bit(LLI_F_HAVE_SIZE_LOCK, &lli->lli_flags)) RETURN(0); rc = ll_inode_getattr(inode, lsm, fd ? &fd->fd_ost_och : NULL); if (rc) { ll_extent_unlock(fd, inode, lsm, mode, lockh); RETURN(rc); } size_lock.start = inode->i_size; size_lock.end = OBD_OBJECT_EOF; /* XXX I bet we should be checking the lock ignore flags.. */ flags = LDLM_FL_CBPENDING | LDLM_FL_BLOCK_GRANTED | LDLM_FL_MATCH_DATA; matched = obd_match(&ll_i2sbi(inode)->ll_osc_conn, lsm, LDLM_EXTENT, &size_lock, sizeof(size_lock), LCK_PR, &flags, inode, &match_lockh); /* hey, alright, we hold a size lock that covers the size we * just found, its not going to change for a while.. */ if (matched == 1) { set_bit(LLI_F_HAVE_SIZE_LOCK, &lli->lli_flags); obd_cancel(&ll_i2sbi(inode)->ll_osc_conn, lsm, LCK_PR, &match_lockh); } RETURN(0); } int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode, struct lov_stripe_md *lsm, int mode, struct lustre_handle *lockh) { struct ll_sb_info *sbi = ll_i2sbi(inode); int rc; ENTRY; /* XXX phil: can we do this? won't it screw the file size up? */ if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) || (sbi->ll_flags & LL_SBI_NOLCK)) RETURN(0); rc = obd_cancel(&sbi->ll_osc_conn, lsm, mode, lockh); RETURN(rc); } static inline void ll_remove_suid(struct inode *inode) { unsigned int mode; /* set S_IGID if S_IXGRP is set, and always set S_ISUID */ mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID; /* was any of the uid bits set? */ mode &= inode->i_mode; if (mode && !capable(CAP_FSETID)) { inode->i_mode &= ~mode; // XXX careful here - we cannot change the size } } #if 0 static void ll_update_atime(struct inode *inode) { #ifdef USE_ATIME struct iattr attr; attr.ia_atime = LTIME_S(CURRENT_TIME); attr.ia_valid = ATTR_ATIME; if (inode->i_atime == attr.ia_atime) return; if (IS_RDONLY(inode)) return; if (IS_NOATIME(inode)) return; /* ll_inode_setattr() sets inode->i_atime from attr.ia_atime */ ll_inode_setattr(inode, &attr, 0); #else /* update atime, but don't explicitly write it out just this change */ inode->i_atime = CURRENT_TIME; #endif } #endif /* * flush the page cache for an extent as its canceled. when we're on an * lov we get a lock cancelation for each of the obd locks under the lov * so we have to map the obd's region back onto the stripes in the file * that it held. * * no one can dirty the extent until we've finished our work and they * can enqueue another lock. * * XXX this could be asking the inode's dirty tree for info */ void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm, struct ldlm_lock *lock) { struct ldlm_extent *extent = &lock->l_extent; unsigned long start, end, count, skip, i, j; struct page *page; int ret; ENTRY; CDEBUG(D_INODE, "obdo %lu inode %p ["LPU64"->"LPU64"] size: %llu\n", inode->i_ino, inode, extent->start, extent->end, inode->i_size); start = extent->start >> PAGE_CACHE_SHIFT; count = ~0; skip = 0; end = (extent->end >> PAGE_CACHE_SHIFT) + 1; if ((end << PAGE_CACHE_SHIFT) < extent->end) end = ~0; if (lsm->lsm_stripe_count > 1) { struct { char name[16]; struct ldlm_lock *lock; struct lov_stripe_md *lsm; } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm }; __u32 stripe; __u32 vallen = sizeof(stripe); int rc; /* get our offset in the lov */ rc = obd_get_info(ll_i2obdconn(inode), sizeof(key), &key, &vallen, &stripe); if (rc != 0) { CERROR("obd_get_info: rc = %d\n", rc); LBUG(); } LASSERT(stripe < lsm->lsm_stripe_count); count = lsm->lsm_stripe_size >> PAGE_CACHE_SHIFT; skip = (lsm->lsm_stripe_count - 1) * count; start += (start/count * skip) + (stripe * count); if (end != ~0) end += (end/count * skip) + (stripe * count); } i = (inode->i_size + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; if (end >= i) clear_bit(LLI_F_HAVE_SIZE_LOCK, &(ll_i2info(inode)->lli_flags)); if (i < end) end = i; CDEBUG(D_INODE, "start: %lu j: %lu count: %lu skip: %lu end: %lu\n", start, start % count, count, skip, end); /* start writeback on dirty pages in the extent when its PW */ for (i = start, j = start % count; lock->l_granted_mode == LCK_PW && i < end; j++, i++) { if (j == count) { i += skip; j = 0; } /* its unlikely, but give us a chance to bail when we're out */ PGCACHE_WRLOCK(inode->i_mapping); if (list_empty(&inode->i_mapping->dirty_pages)) { CDEBUG(D_INODE, "dirty list empty\n"); PGCACHE_WRUNLOCK(inode->i_mapping); break; } PGCACHE_WRUNLOCK(inode->i_mapping); if (need_resched()) schedule(); page = find_get_page(inode->i_mapping, i); if (page == NULL) continue; if (!PageDirty(page) || TryLockPage(page)) { page_cache_release(page); continue; } if (PageDirty(page)) { CDEBUG(D_INODE, "writing page %p\n", page); PGCACHE_WRLOCK(inode->i_mapping); list_del(&page->list); list_add(&page->list, &inode->i_mapping->locked_pages); PGCACHE_WRUNLOCK(inode->i_mapping); /* this writepage might write out pages outside * this extent, but that's ok, the pages are only * still dirty because a lock still covers them */ ClearPageDirty(page); #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) ret = inode->i_mapping->a_ops->writepage(page); #else ret = inode->i_mapping->a_ops->writepage(page, NULL); #endif if (ret != 0) unlock_page(page); } else { unlock_page(page); } page_cache_release(page); } /* our locks are page granular thanks to osc_enqueue, we invalidate the * whole page. */ LASSERT((extent->start & ~PAGE_CACHE_MASK) == 0); LASSERT(((extent->end+1) & ~PAGE_CACHE_MASK) == 0); for (i = start, j = start % count ; i < end ; j++, i++) { if ( j == count ) { i += skip; j = 0; } PGCACHE_WRLOCK(inode->i_mapping); if (list_empty(&inode->i_mapping->dirty_pages) && list_empty(&inode->i_mapping->clean_pages) && list_empty(&inode->i_mapping->locked_pages)) { CDEBUG(D_INODE, "nothing left\n"); PGCACHE_WRUNLOCK(inode->i_mapping); break; } PGCACHE_WRUNLOCK(inode->i_mapping); if (need_resched()) schedule(); page = find_get_page(inode->i_mapping, i); if (page == NULL) continue; CDEBUG(D_INODE, "dropping page %p at %lu\n", page, page->index); lock_page(page); if (page->mapping) /* might have raced */ #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) truncate_complete_page(page); #else truncate_complete_page(page->mapping, page); #endif unlock_page(page); page_cache_release(page); } EXIT; } int ll_extent_lock_callback(struct ldlm_lock *lock, struct ldlm_lock_desc *new, void *data, int flag) { struct inode *inode = data; struct ll_inode_info *lli = ll_i2info(inode); struct lustre_handle lockh = { 0 }; int rc; ENTRY; LASSERT(inode != NULL); switch (flag) { case LDLM_CB_BLOCKING: ldlm_lock2handle(lock, &lockh); rc = ldlm_cli_cancel(&lockh); if (rc != ELDLM_OK) CERROR("ldlm_cli_cancel failed: %d\n", rc); break; case LDLM_CB_CANCELING: /* FIXME: we could be given 'canceling intents' so that we * could know to write-back or simply throw away the pages * based on if the cancel comes from a desire to, say, * read or truncate.. */ LASSERT((unsigned long)inode > 0x1000); LASSERT((unsigned long)lli > 0x1000); LASSERT((unsigned long)lli->lli_smd > 0x1000); ll_pgcache_remove_extent(inode, lli->lli_smd, lock); break; default: LBUG(); } RETURN(0); } static ssize_t ll_file_read(struct file *filp, char *buf, size_t count, loff_t *ppos) { struct ll_file_data *fd = filp->private_data; struct inode *inode = filp->f_dentry->d_inode; struct ll_inode_info *lli = ll_i2info(inode); struct lov_stripe_md *lsm = lli->lli_smd; struct lustre_handle lockh = { 0 }; struct ll_read_extent rextent; ldlm_error_t err; ssize_t retval; ENTRY; CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n", inode->i_ino, inode->i_generation, inode, count, *ppos); /* "If nbyte is 0, read() will return 0 and have no other results." * -- Single Unix Spec */ if (count == 0) RETURN(0); lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_READ_BYTES, count); /* grab a -> eof extent to push extending writes out of node's caches * so we can see them at the getattr after lock acquisition. this will * turn into a seperate [*ppos + count, EOF] 'size intent' lock attempt * in the future. */ rextent.re_extent.start = *ppos; rextent.re_extent.end = OBD_OBJECT_EOF; err = ll_extent_lock(fd, inode, lsm, LCK_PR, &rextent.re_extent,&lockh); if (err != ELDLM_OK) RETURN(-ENOLCK); /* XXX tell ll_readpage what pages have a PR lock.. */ rextent.re_task = current; spin_lock(&lli->lli_read_extent_lock); list_add(&rextent.re_lli_item, &lli->lli_read_extents); spin_unlock(&lli->lli_read_extent_lock); CDEBUG(D_INFO, "Reading inode %lu, "LPSZ" bytes, offset %Ld\n", inode->i_ino, count, *ppos); retval = generic_file_read(filp, buf, count, ppos); spin_lock(&lli->lli_read_extent_lock); list_del(&rextent.re_lli_item); spin_unlock(&lli->lli_read_extent_lock); /* XXX errors? */ ll_extent_unlock(fd, inode, lsm, LCK_PR, &lockh); RETURN(retval); } /* * Write to a file (through the page cache). */ static ssize_t ll_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos) { struct ll_file_data *fd = file->private_data; struct inode *inode = file->f_dentry->d_inode; struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd; struct lustre_handle lockh = { 0 }; struct ldlm_extent extent; loff_t maxbytes = ll_file_maxbytes(inode); ldlm_error_t err; ssize_t retval; char should_validate = 1; ENTRY; CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n", inode->i_ino, inode->i_generation, inode, count, *ppos); /* * sleep doing some writeback work of this mount's dirty data * if the VM thinks we're low on memory.. other dirtying code * paths should think about doing this, too, but they should be * careful not to hold locked pages while they do so. like * ll_prepare_write. *cough* */ LL_CHECK_DIRTY(inode->i_sb); /* POSIX, but surprised the VFS doesn't check this already */ if (count == 0) RETURN(0); if (file->f_flags & O_APPEND) { extent.start = 0; extent.end = OBD_OBJECT_EOF; } else { extent.start = *ppos; extent.end = *ppos + count - 1; /* we really don't care what i_size is if we're doing * fully page aligned writes */ if ((*ppos & ~PAGE_CACHE_MASK) == 0 && (count & ~PAGE_CACHE_MASK) == 0) should_validate = 0; } if (should_validate) err = ll_extent_lock(fd, inode, lsm, LCK_PW, &extent, &lockh); else err = ll_extent_lock_no_validate(fd, inode, lsm, LCK_PW, &extent, &lockh); if (err != ELDLM_OK) RETURN(-ENOLCK); /* this is ok, g_f_w will overwrite this under i_sem if it races * with a local truncate, it just makes our maxbyte checking easier */ if (file->f_flags & O_APPEND) *ppos = inode->i_size; if (*ppos >= maxbytes) { if (count || *ppos > maxbytes) { send_sig(SIGXFSZ, current, 0); GOTO(out, retval = -EFBIG); } } if (*ppos + count > maxbytes) count = maxbytes - *ppos; CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n", inode->i_ino, count, *ppos); /* generic_file_write handles O_APPEND after getting i_sem */ retval = generic_file_write(file, buf, count, ppos); out: /* XXX errors? */ lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_WRITE_BYTES, retval); ll_extent_unlock(fd, inode, lsm, LCK_PW, &lockh); RETURN(retval); } static int ll_lov_setstripe(struct inode *inode, struct file *file, unsigned long arg) { struct ll_inode_info *lli = ll_i2info(inode); struct lustre_handle *conn = ll_i2obdconn(inode); struct lov_stripe_md *lsm; int rc; ENTRY; down(&lli->lli_open_sem); lsm = lli->lli_smd; if (lsm) { up(&lli->lli_open_sem); CERROR("stripe already exists for ino %lu\n", inode->i_ino); /* If we haven't already done the open, do so now */ if (file->f_flags & O_LOV_DELAY_CREATE) { int rc2 = ll_osc_open(conn, inode, file, lsm); if (rc2) RETURN(rc2); } RETURN(-EEXIST); } rc = obd_iocontrol(LL_IOC_LOV_SETSTRIPE, conn, 0, &lsm, (void *)arg); if (rc) { up(&lli->lli_open_sem); RETURN(rc); } rc = ll_create_obj(conn, inode, file, lsm); up(&lli->lli_open_sem); if (rc) { obd_free_memmd(conn, &lsm); RETURN(rc); } rc = ll_osc_open(conn, inode, file, lli->lli_smd); RETURN(rc); } static int ll_lov_getstripe(struct inode *inode, unsigned long arg) { struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd; struct lustre_handle *conn = ll_i2obdconn(inode); if (!lsm) RETURN(-ENODATA); return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, conn, 0, lsm, (void *)arg); } int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { struct ll_file_data *fd = file->private_data; struct lustre_handle *conn; int flags; CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%u\n", inode->i_ino, inode->i_generation, inode, cmd); if (_IOC_TYPE(cmd) == 'T') /* tty ioctls */ return -ENOTTY; lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_IOCTL); switch(cmd) { case LL_IOC_GETFLAGS: /* Get the current value of the file flags */ return put_user(fd->fd_flags, (int *)arg); case LL_IOC_SETFLAGS: case LL_IOC_CLRFLAGS: /* Set or clear specific file flags */ /* XXX This probably needs checks to ensure the flags are * not abused, and to handle any flag side effects. */ if (get_user(flags, (int *) arg)) return -EFAULT; if (cmd == LL_IOC_SETFLAGS) fd->fd_flags |= flags; else fd->fd_flags &= ~flags; return 0; case LL_IOC_LOV_SETSTRIPE: return ll_lov_setstripe(inode, file, arg); case LL_IOC_LOV_GETSTRIPE: return ll_lov_getstripe(inode, arg); /* We need to special case any other ioctls we want to handle, * to send them to the MDS/OST as appropriate and to properly * network encode the arg field. case EXT2_IOC_GETFLAGS: case EXT2_IOC_SETFLAGS: case EXT2_IOC_GETVERSION_OLD: case EXT2_IOC_GETVERSION_NEW: case EXT2_IOC_SETVERSION_OLD: case EXT2_IOC_SETVERSION_NEW: */ default: conn = ll_i2obdconn(inode); return obd_iocontrol(cmd, conn, 0, NULL, (void *)arg); } } loff_t ll_file_seek(struct file *file, loff_t offset, int origin) { struct inode *inode = file->f_dentry->d_inode; struct ll_file_data *fd = file->private_data; struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd; struct lustre_handle lockh = {0}; loff_t retval; ENTRY; CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),to=%llu\n", inode->i_ino, inode->i_generation, inode, offset + ((origin==2) ? inode->i_size : file->f_pos)); lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_LLSEEK); if (origin == 2) { /* SEEK_END */ ldlm_error_t err; struct ldlm_extent extent = {0, OBD_OBJECT_EOF}; err = ll_extent_lock(fd, inode, lsm, LCK_PR, &extent, &lockh); if (err != ELDLM_OK) RETURN(-ENOLCK); offset += inode->i_size; } else if (origin == 1) { /* SEEK_CUR */ offset += file->f_pos; } retval = -EINVAL; if (offset >= 0 && offset <= ll_file_maxbytes(inode)) { if (offset != file->f_pos) { file->f_pos = offset; #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) file->f_reada = 0; file->f_version = ++event; #endif } retval = offset; } if (origin == 2) ll_extent_unlock(fd, inode, lsm, LCK_PR, &lockh); RETURN(retval); } int ll_fsync(struct file *file, struct dentry *dentry, int data) { int ret; struct inode *inode = dentry->d_inode; ENTRY; CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino, inode->i_generation, inode); lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_FSYNC); /* * filemap_fdata{sync,wait} are also called at PW lock cancelation so * we know that they can only find data to writeback here if we are * still holding the PW lock that covered the dirty pages. XXX we * should probably get a reference on it, though, just to be clear. */ ret = filemap_fdatasync(dentry->d_inode->i_mapping); if ( ret == 0 ) ret = filemap_fdatawait(dentry->d_inode->i_mapping); RETURN(ret); } int ll_inode_revalidate(struct dentry *dentry) { struct inode *inode = dentry->d_inode; struct lov_stripe_md *lsm = NULL; ENTRY; if (!inode) { CERROR("REPORT THIS LINE TO PETER\n"); RETURN(0); } CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n", inode->i_ino, inode->i_generation, inode, dentry->d_name.name); #if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,5,0)) lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_REVALIDATE); #endif /* this is very tricky. it is unsafe to call ll_have_md_lock when we have a referenced lock: because it may cause an RPC below when the lock is marked CB_PENDING. That RPC may not go out because someone else may be in another RPC waiting for that lock*/ if (!(dentry->d_it && dentry->d_it->it_lock_mode) && !ll_have_md_lock(dentry)) { struct ptlrpc_request *req = NULL; struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode); struct ll_fid fid; struct mds_body *body; struct lov_mds_md *lmm; unsigned long valid = 0; int eadatalen = 0, rc; /* Why don't we update all valid MDS fields here, if we're * doing an RPC anyways? -phil */ if (S_ISREG(inode->i_mode)) { eadatalen = obd_size_diskmd(&sbi->ll_osc_conn, NULL); valid |= OBD_MD_FLEASIZE; } ll_inode2fid(&fid, inode); rc = mdc_getattr(&sbi->ll_mdc_conn, &fid, valid, eadatalen, &req); if (rc) { CERROR("failure %d inode %lu\n", rc, inode->i_ino); RETURN(-abs(rc)); } body = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*body)); LASSERT (body != NULL); /* checked by mdc_getattr() */ LASSERT_REPSWABBED (req, 0); /* swabbed by mdc_getattr() */ if (S_ISREG(inode->i_mode) && (body->valid & (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS))) { CERROR("MDS sent back size for regular file\n"); body->valid &= ~(OBD_MD_FLSIZE | OBD_MD_FLBLOCKS); } /* XXX Too paranoid? */ if ((body->valid ^ valid) & OBD_MD_FLEASIZE) CERROR("Asked for %s eadata but got %s\n", (valid & OBD_MD_FLEASIZE) ? "some" : "no", (body->valid & OBD_MD_FLEASIZE) ? "some":"none"); if (S_ISREG(inode->i_mode) && (body->valid & OBD_MD_FLEASIZE)) { if (body->eadatasize == 0) { /* no EA data */ CERROR("OBD_MD_FLEASIZE set but no data\n"); RETURN(-EPROTO); } /* Only bother with this if inode's lsm not set? */ lmm = lustre_msg_buf(req->rq_repmsg,1,body->eadatasize); LASSERT(lmm != NULL); /* mdc_getattr() checked */ LASSERT_REPSWABBED(req, 1); /* mdc_getattr() swabbed */ rc = obd_unpackmd (&sbi->ll_osc_conn, &lsm, lmm, body->eadatasize); if (rc < 0) { CERROR("Error %d unpacking eadata\n", rc); ptlrpc_req_finished(req); RETURN(rc); } LASSERT(rc >= sizeof(*lsm)); } ll_update_inode(inode, body, lsm); if (lsm != NULL && ll_i2info(inode)->lli_smd != lsm) obd_free_memmd(&sbi->ll_osc_conn, &lsm); ptlrpc_req_finished(req); } lsm = ll_i2info(inode)->lli_smd; if (!lsm) /* object not yet allocated, don't validate size */ RETURN(0); /* * unfortunately stat comes in through revalidate and we don't * differentiate this use from initial instantiation. we're * also being wildly conservative and flushing write caches * so that stat really returns the proper size. */ { struct ldlm_extent extent = {0, OBD_OBJECT_EOF}; struct lustre_handle lockh = {0}; ldlm_error_t err; err = ll_extent_lock(NULL, inode, lsm, LCK_PR, &extent, &lockh); if (err != ELDLM_OK) RETURN(err); ll_extent_unlock(NULL, inode, lsm, LCK_PR, &lockh); } RETURN(0); } #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) static int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat) { int res = 0; struct inode *inode = de->d_inode; lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_GETATTR); res = ll_inode_revalidate(de); if (res) return res; #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) stat->dev = inode->i_dev; #endif stat->ino = inode->i_ino; stat->mode = inode->i_mode; stat->nlink = inode->i_nlink; stat->uid = inode->i_uid; stat->gid = inode->i_gid; stat->rdev = kdev_t_to_nr(inode->i_rdev); stat->atime = inode->i_atime; stat->mtime = inode->i_mtime; stat->ctime = inode->i_ctime; stat->size = inode->i_size; return 0; } #endif struct file_operations ll_file_operations = { read: ll_file_read, write: ll_file_write, ioctl: ll_file_ioctl, open: ll_file_open, release: ll_file_release, mmap: generic_file_mmap, llseek: ll_file_seek, fsync: ll_fsync, }; struct inode_operations ll_file_inode_operations = { setattr_raw: ll_setattr_raw, setattr: ll_setattr, truncate: ll_truncate, #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) getattr: ll_getattr, #else revalidate: ll_inode_revalidate, #endif }; struct inode_operations ll_special_inode_operations = { setattr_raw: ll_setattr_raw, setattr: ll_setattr, #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) getattr: ll_getattr, #else revalidate: ll_inode_revalidate, #endif };