/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
* vim:expandtab:shiftwidth=8:tabstop=8:
*
- * Copyright (c) 2002, 2003 Cluster File Systems, Inc.
- * Author: Peter Braam <braam@clusterfs.com>
- * Author: Phil Schwan <phil@clusterfs.com>
- * Author: Andreas Dilger <adilger@clusterfs.com>
+ * GPL HEADER START
*
- * This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
- * Lustre is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
*
- * Lustre is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
*
- * You should have received a copy of the GNU General Public License
- * along with Lustre; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/llite/file.c
+ *
+ * Author: Peter Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Andreas Dilger <adilger@clusterfs.com>
*/
#define DEBUG_SUBSYSTEM S_LLITE
#include <linux/pagemap.h>
#include <linux/file.h>
#include "llite_internal.h"
+#include <lustre/ll_fiemap.h>
/* also used by llite/special.c:ll_special_open() */
struct ll_file_data *ll_file_data_get(void)
let's close it somehow. This will decref request. */
rc = it_open_error(DISP_OPEN_OPEN, it);
if (rc) {
+ up(&lli->lli_och_sem);
ll_file_data_put(fd);
- GOTO(out_och_free, rc);
+ GOTO(out_openerr, rc);
}
ll_release_openhandle(file->f_dentry, it);
lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
rc = ll_local_open(file, it, fd, NULL);
if (rc) {
+ (*och_usecount)--;
up(&lli->lli_och_sem);
ll_file_data_put(fd);
- RETURN(rc);
+ GOTO(out_openerr, rc);
}
} else {
LASSERT(*och_usecount == 0);
ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
rc = ll_local_open(file, it, fd, *och_p);
if (rc) {
- up(&lli->lli_och_sem);
ll_file_data_put(fd);
GOTO(out_och_free, rc);
}
RETURN(0);
}
-static inline void ll_remove_suid(struct inode *inode)
-{
- unsigned int mode;
-
- /* set S_IGID if S_IXGRP is set, and always set S_ISUID */
- mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
-
- /* was any of the uid bits set? */
- mode &= inode->i_mode;
- if (mode && !capable(CAP_FSETID)) {
- inode->i_mode &= ~mode;
- // XXX careful here - we cannot change the size
- }
-}
-
static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
{
struct ll_inode_info *lli = ll_i2info(inode);
struct {
char name[16];
struct ldlm_lock *lock;
- struct lov_stripe_md *lsm;
- } key = { .name = KEY_LOCK_TO_STRIPE, .lock = lock, .lsm = lsm };
+ } key = { .name = KEY_LOCK_TO_STRIPE, .lock = lock };
__u32 stripe, vallen = sizeof(stripe);
+ struct lov_oinfo *loinfo;
int rc;
ENTRY;
GOTO(check, stripe = 0);
/* get our offset in the lov */
- rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe);
+ rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe, lsm);
if (rc != 0) {
CERROR("obd_get_info: rc = %d\n", rc);
RETURN(rc);
LASSERT(stripe < lsm->lsm_stripe_count);
check:
- if (lsm->lsm_oinfo[stripe]->loi_id != lock->l_resource->lr_name.name[0]||
- lsm->lsm_oinfo[stripe]->loi_gr != lock->l_resource->lr_name.name[2]){
+ loinfo = lsm->lsm_oinfo[stripe];
+ if (!osc_res_name_eq(loinfo->loi_id, loinfo->loi_gr,
+ &lock->l_resource->lr_name)){
LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64,
- lsm->lsm_oinfo[stripe]->loi_id,
- lsm->lsm_oinfo[stripe]->loi_gr);
+ loinfo->loi_id, loinfo->loi_gr);
RETURN(-ELDLM_NO_LOCK_DATA);
}
struct inode * inode = file->f_dentry->d_inode;
ENTRY;
- append = (rw == WRITE) && (file->f_flags & O_APPEND);
+ append = (rw == OBD_BRW_WRITE) && (file->f_flags & O_APPEND);
if (append || !ll_is_file_contended(file)) {
struct ll_lock_tree_node *node;
if (file->f_flags & O_NONBLOCK)
ast_flags |= LDLM_FL_BLOCK_NOWAIT;
node = ll_node_from_inode(inode, start, end,
- (rw == WRITE) ? LCK_PW : LCK_PR);
+ (rw == OBD_BRW_WRITE) ? LCK_PW : LCK_PR);
if (IS_ERR(node)) {
rc = PTR_ERR(node);
GOTO(out, rc);
return rc;
}
+/**
+ * Tries to reuse the extent lock attached to a page.
+ *
+ * Asks the data export whether the lock under \a page is compatible with a
+ * read or write request (selected by \a rw) on the extent
+ * [\a start , \a end].
+ *
+ * \param page   the page under which lock is considered
+ * \param rw     OBD_BRW_READ if requested for reading,
+ *               OBD_BRW_WRITE if requested for writing
+ * \param start  start of the requested extent
+ * \param end    end of the requested extent
+ * \param cookie transparent parameter for passing locking context
+ *
+ * \post result == 1, *cookie == context, appropriate lock is referenced or
+ * \post result == 0
+ *
+ * \retval 1 owned lock is reused for the request
+ * \retval 0 no lock reused for the request; this is also the answer when
+ *           the inode has no data export or llap_cast_private() returns
+ *           NULL for the page
+ *
+ * \see ll_release_short_lock
+ */
+static int ll_reget_short_lock(struct page *page, int rw,
+                               obd_off start, obd_off end,
+                               void **cookie)
+{
+        struct inode *inode = page->mapping->host;
+        struct obd_export *dt_exp;
+        struct ll_async_page *async_page;
+        struct lov_stripe_md *lsm;
+        int reused;
+
+        ENTRY;
+
+        /* llap_cast_private() is only consulted once an export is known. */
+        dt_exp = ll_i2dtexp(inode);
+        async_page = (dt_exp != NULL) ? llap_cast_private(page) : NULL;
+        if (async_page == NULL)
+                RETURN(0);
+
+        lsm = ll_i2info(inode)->lli_smd;
+        reused = obd_reget_short_lock(dt_exp, lsm, &async_page->llap_cookie,
+                                      rw, start, end, cookie);
+        RETURN(reused);
+}
+
+/**
+ * Drops a reference to a lock taken in a "fast" way.
+ *
+ * Releases the read or write (selected by \a rw) lock referenced by
+ * \a cookie. A failure reported by obd_release_short_lock() is logged but
+ * not propagated, and nothing is done when the inode has no data export.
+ *
+ * \param inode  inode to which data belong
+ * \param end    end of the locked extent
+ * \param cookie transparent parameter for passing locking context
+ * \param rw     OBD_BRW_READ if requested for reading,
+ *               OBD_BRW_WRITE if requested for writing
+ *
+ * \post appropriate lock is dereferenced
+ *
+ * \see ll_reget_short_lock
+ */
+static void ll_release_short_lock(struct inode *inode, obd_off end,
+                                  void *cookie, int rw)
+{
+        struct obd_export *dt_exp = ll_i2dtexp(inode);
+        int err;
+
+        if (dt_exp != NULL) {
+                err = obd_release_short_lock(dt_exp,
+                                             ll_i2info(inode)->lli_smd,
+                                             end, cookie, rw);
+                if (err < 0)
+                        CERROR("unlock failed (%d)\n", err);
+        }
+}
+
+
+/**
+ * Tries to reuse the lock under a page cache page for a file request.
+ *
+ * Looks up the page cache page at index \a ppos >> CFS_PAGE_SHIFT and checks
+ * whether the lock under it is compatible with a read or write lock
+ * (specified by \a rw) for the extent [\a ppos , \a end]. The attempt is
+ * skipped when ll_region_mapped() reports a hit for the user buffer.
+ *
+ * \param file   the file under which lock is considered
+ * \param ppos   start of the requested extent
+ * \param end    end of the requested extent
+ * \param buf    userspace buffer for the data
+ * \param cookie transparent parameter for passing locking context
+ * \param rw     OBD_BRW_READ if requested for reading,
+ *               OBD_BRW_WRITE if requested for writing
+ *
+ * \post result == 1, *cookie == context, appropriate lock is referenced or
+ * \post result == 0
+ *
+ * \retval 1 owned lock is reused for the request
+ * \retval 0 no lock reused for the request
+ *
+ * \see ll_file_put_fast_lock
+ */
+static inline int ll_file_get_fast_lock(struct file *file,
+                                        obd_off ppos, obd_off end,
+                                        char *buf, void **cookie, int rw)
+{
+        struct page *cached;
+        int reused;
+
+        ENTRY;
+
+        if (ll_region_mapped((unsigned long)buf, end - ppos))
+                RETURN(0);
+
+        cached = find_lock_page(file->f_dentry->d_inode->i_mapping,
+                                ppos >> CFS_PAGE_SHIFT);
+        if (cached == NULL)
+                RETURN(0);
+
+        reused = ll_reget_short_lock(cached, rw, ppos, end, cookie) ? 1 : 0;
+
+        /* The page lock and page reference are dropped whether or not the
+         * extent lock could be reused. */
+        unlock_page(cached);
+        page_cache_release(cached);
+
+        RETURN(reused);
+}
+
+
+/**
+ * Releases a reference to a lock taken in a "fast" way.
+ *
+ * Releases a read or a write (specified by \a rw) lock referenced by
+ * \a cookie. This is a thin wrapper: all four arguments are forwarded
+ * unchanged to ll_release_short_lock().
+ *
+ * \param inode  inode to which data belong
+ * \param end    end of the locked extent
+ * \param cookie transparent parameter for passing locking context
+ * \param rw     OBD_BRW_READ if requested for reading,
+ *               OBD_BRW_WRITE if requested for writing
+ *
+ * \post appropriate lock is dereferenced
+ *
+ * \see ll_file_get_fast_lock
+ */
+static inline void ll_file_put_fast_lock(struct inode *inode, obd_off end,
+ void *cookie, int rw)
+{
+ ll_release_short_lock(inode, end, cookie, rw);
+}
+
+enum ll_lock_style { /* how ll_file_get_lock() obtained its lock; all values are >= 0 */
+ LL_LOCK_STYLE_NOLOCK = 0, /* got no lock */
+ LL_LOCK_STYLE_FASTLOCK = 1, /* owned lock is reused through fast lock */
+ LL_LOCK_STYLE_TREELOCK = 2 /* got a lock through tree lock */
+};
+
+/**
+ * Takes the extent lock needed for a file I/O request.
+ *
+ * First tries to reuse the lock under the page cache page at \a ppos via
+ * ll_file_get_fast_lock(); if that does not succeed, falls back to
+ * ll_file_get_tree_lock() for the extent [\a ppos , \a end].
+ *
+ * \param file   file under which I/O is processed
+ * \param ppos   start of the requested extent
+ * \param end    end of the requested extent (inclusive)
+ * \param buf    userspace buffer for the data
+ * \param cookie transparent parameter for passing locking context
+ *               (only used with LL_LOCK_STYLE_FASTLOCK)
+ * \param tree   lock tree (only used with LL_LOCK_STYLE_TREELOCK)
+ * \param rw     OBD_BRW_READ if requested for reading,
+ *               OBD_BRW_WRITE if requested for writing
+ *
+ * \retval LL_LOCK_STYLE_FASTLOCK owned lock is reused through fast lock
+ * \retval LL_LOCK_STYLE_TREELOCK got a lock through tree lock
+ * \retval LL_LOCK_STYLE_NOLOCK   got no lock
+ * \retval <0                     -errno reported by ll_file_get_tree_lock()
+ *
+ * \see ll_file_put_lock
+ */
+static inline int ll_file_get_lock(struct file *file, obd_off ppos,
+                                   obd_off end, char *buf, void **cookie,
+                                   struct ll_lock_tree *tree, int rw)
+{
+        int rc;
+
+        ENTRY;
+
+        if (ll_file_get_fast_lock(file, ppos, end, buf, cookie, rw))
+                RETURN(LL_LOCK_STYLE_FASTLOCK);
+
+        /* The extent [ppos, end] is inclusive, so the buffer length is
+         * end - ppos + 1.  It must not be computed as ppos - end: obd_off
+         * is unsigned and that difference wraps to a huge byte count. */
+        rc = ll_file_get_tree_lock(tree, file, buf, end - ppos + 1,
+                                   ppos, end, rw);
+        /* rc: 1 for tree lock, 0 for no lock, <0 for error */
+        switch (rc) {
+        case 1:
+                RETURN(LL_LOCK_STYLE_TREELOCK);
+        case 0:
+                RETURN(LL_LOCK_STYLE_NOLOCK);
+        default:
+                break;
+        }
+
+        /* an error happened if we reached this point, rc = -errno here */
+        RETURN(rc);
+}
+
+
+/**
+ * Releases the lock obtained by ll_file_get_lock.
+ *
+ * Depending on \a lock_style, drops either the tree lock held in \a tree
+ * or the fast lock referenced by \a cookie. Any other style, including
+ * LL_LOCK_STYLE_NOLOCK, is reported as an error and nothing is released.
+ *
+ * \param inode      inode to which data belong
+ * \param end        end of the locked extent
+ * \param lock_style facility through which the lock was taken
+ * \param cookie     transparent parameter for passing locking context
+ *                   (only used with LL_LOCK_STYLE_FASTLOCK)
+ * \param tree       lock tree (only used with LL_LOCK_STYLE_TREELOCK)
+ * \param rw         OBD_BRW_READ if requested for reading,
+ *                   OBD_BRW_WRITE if requested for writing
+ *
+ * \post appropriate lock is dereferenced
+ *
+ * \see ll_file_get_lock
+ */
+static inline void ll_file_put_lock(struct inode *inode, obd_off end,
+                                    enum ll_lock_style lock_style,
+                                    void *cookie, struct ll_lock_tree *tree,
+                                    int rw)
+{
+        if (lock_style == LL_LOCK_STYLE_TREELOCK)
+                ll_tree_unlock(tree);
+        else if (lock_style == LL_LOCK_STYLE_FASTLOCK)
+                ll_file_put_fast_lock(inode, end, cookie, rw);
+        else
+                CERROR("invalid locking style (%d)\n", lock_style);
+}
+
+
static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
loff_t *ppos)
{
struct ost_lvb lvb;
struct ll_ra_read bead;
int ra = 0;
- loff_t end;
+ obd_off end;
ssize_t retval, chunk, sum = 0;
- int tree_locked;
+ int lock_style;
+ void *cookie;
__u64 kms;
ENTRY;
if (sbi->ll_max_rw_chunk != 0) {
/* first, let's know the end of the current stripe */
end = *ppos;
- obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END,
- (obd_off *)&end);
+ obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END, &end);
/* correct, the end is beyond the request */
if (end > *ppos + count - 1)
end = *ppos + count - 1;
}
- tree_locked = ll_file_get_tree_lock(&tree, file, buf,
- count, *ppos, end, READ);
- if (tree_locked < 0)
- GOTO(out, retval = tree_locked);
+ lock_style = ll_file_get_lock(file, (obd_off)(*ppos), end,
+ buf, &cookie, &tree, OBD_BRW_READ);
+ if (lock_style < 0)
+ GOTO(out, retval = lock_style);
ll_inode_size_lock(inode, 1);
/*
ll_inode_size_unlock(inode, 1);
retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
if (retval) {
- if (tree_locked)
- ll_tree_unlock(&tree);
+ if (lock_style != LL_LOCK_STYLE_NOLOCK)
+ ll_file_put_lock(inode, end, lock_style,
+ cookie, &tree, OBD_BRW_READ);
goto out;
}
} else {
CDEBUG(D_INODE, "Read ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
inode->i_ino, chunk, *ppos, i_size_read(inode));
- if (tree_locked) {
+ if (lock_style != LL_LOCK_STYLE_NOLOCK) {
/* turn off the kernel's read-ahead */
file->f_ra.ra_pages = 0;
/* BUG: 5972 */
file_accessed(file);
retval = generic_file_read(file, buf, chunk, ppos);
- ll_tree_unlock(&tree);
+ ll_file_put_lock(inode, end, lock_style, cookie, &tree,
+ OBD_BRW_READ);
} else {
retval = ll_file_lockless_io(file, buf, chunk, ppos, READ);
}
}
tree_locked = ll_file_get_tree_lock(&tree, file, buf, count,
- lock_start, lock_end, WRITE);
+ lock_start, lock_end, OBD_BRW_WRITE);
if (tree_locked < 0)
GOTO(out, retval = tree_locked);
RETURN(rc);
}
+/**
+ * Get size for inode for which FIEMAP mapping is requested.
+ * Make the FIEMAP get_info call and returns the result.
+ *
+ * \param inode     inode whose extent mapping is requested
+ * \param fiemap    request header on input; passed to obd_get_info() as the
+ *                  value buffer, so it carries the reply on output
+ * \param num_bytes size in bytes of the buffer at \a fiemap, passed to
+ *                  obd_get_info() as the value length
+ *
+ * \retval 0           success; fm_mapped_extents is set to 0 directly when
+ *                     the file size is 0
+ * \retval -ENOENT     the inode has no stripe metadata
+ * \retval -EOPNOTSUPP file has more than one stripe and the caller did not
+ *                     set FIEMAP_FLAG_DEVICE_ORDER
+ * \retval <0          other error returned by obd_get_info()
+ */
+int ll_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
+              int num_bytes)
+{
+        struct obd_export *exp = ll_i2dtexp(inode);
+        struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
+        struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
+        int vallen = num_bytes;
+        int rc;
+        ENTRY;
+
+        /* lsm is dereferenced below; bail out instead of oopsing if the
+         * inode carries no stripe metadata.
+         * NOTE(review): confirm -ENOENT is the preferred errno here. */
+        if (lsm == NULL)
+                RETURN(-ENOENT);
+
+        /* If the stripe_count > 1 and the application does not understand
+         * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
+         * RETURN() rather than a bare return keeps the exit paired with
+         * ENTRY above.
+         */
+        if (lsm->lsm_stripe_count > 1 &&
+            !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
+                RETURN(-EOPNOTSUPP);
+
+        fm_key.oa.o_id = lsm->lsm_object_id;
+        fm_key.oa.o_gr = lsm->lsm_object_gr;
+        fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
+
+        obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLFID | OBD_MD_FLGROUP |
+                        OBD_MD_FLSIZE);
+
+        /* If filesize is 0, then there would be no objects for mapping */
+        if (fm_key.oa.o_size == 0) {
+                fiemap->fm_mapped_extents = 0;
+                RETURN(0);
+        }
+
+        memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
+
+        rc = obd_get_info(exp, sizeof(fm_key), &fm_key, &vallen, fiemap, lsm);
+        if (rc)
+                CERROR("obd_get_info failed: rc = %d\n", rc);
+
+        RETURN(rc);
+}
+
+
int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
unsigned long arg)
{
RETURN(ll_lov_getstripe(inode, arg));
case LL_IOC_RECREATE_OBJ:
RETURN(ll_lov_recreate_obj(inode, file, arg));
+        case EXT3_IOC_FIEMAP: {
+                /* FIEMAP ioctl: copy the request header (plus the first
+                 * extent, if any) in from userspace, run ll_fiemap() on a
+                 * kernel buffer sized for fm_extent_count extents, and copy
+                 * the header and mapped extents back out. */
+                struct ll_user_fiemap *fiemap_s;
+                size_t num_bytes, ret_bytes;
+                unsigned int extent_count;
+                int rc = 0;
+
+                /* Get the extent count so we can calculate the size of
+                 * required fiemap buffer */
+                if (get_user(extent_count,
+                    &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
+                        RETURN(-EFAULT);
+
+                /* extent_count is supplied by userspace.  Reject counts whose
+                 * buffer size would overflow, or would not fit in the int
+                 * length that ll_fiemap() takes. */
+                if (extent_count > (INT_MAX - sizeof(*fiemap_s)) /
+                                   sizeof(struct ll_fiemap_extent))
+                        RETURN(-EINVAL);
+
+                num_bytes = sizeof(*fiemap_s) + (extent_count *
+                                                 sizeof(struct ll_fiemap_extent));
+                OBD_VMALLOC(fiemap_s, num_bytes);
+                if (fiemap_s == NULL)
+                        RETURN(-ENOMEM);
+
+                if (copy_from_user(fiemap_s,(struct ll_user_fiemap __user *)arg,
+                                   sizeof(*fiemap_s)))
+                        GOTO(error, rc = -EFAULT);
+
+                /* The header was read a second time; userspace may have
+                 * changed the count in between.  Keep the kernel copy in
+                 * line with the size of the buffer that was allocated. */
+                fiemap_s->fm_extent_count = extent_count;
+
+                if (fiemap_s->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
+                        /* Hand back only the flags that are not supported */
+                        fiemap_s->fm_flags = fiemap_s->fm_flags &
+                                                    ~LUSTRE_FIEMAP_FLAGS_COMPAT;
+                        if (copy_to_user((char *)arg, fiemap_s,
+                                         sizeof(*fiemap_s)))
+                                GOTO(error, rc = -EFAULT);
+
+                        GOTO(error, rc = -EBADR);
+                }
+
+                /* If fm_extent_count is non-zero, read the first extent since
+                 * it is used to calculate end_offset and device from previous
+                 * fiemap call. */
+                if (extent_count) {
+                        if (copy_from_user(&fiemap_s->fm_extents[0],
+                            (char __user *)arg + sizeof(*fiemap_s),
+                            sizeof(struct ll_fiemap_extent)))
+                                GOTO(error, rc = -EFAULT);
+                }
+
+                if (fiemap_s->fm_flags & FIEMAP_FLAG_SYNC) {
+                        /* Use the outer rc: a shadowing local here made a
+                         * failed flush leave through "error" with rc == 0. */
+                        rc = filemap_fdatawrite(inode->i_mapping);
+                        if (rc)
+                                GOTO(error, rc);
+                }
+
+                rc = ll_fiemap(inode, fiemap_s, num_bytes);
+                if (rc)
+                        GOTO(error, rc);
+
+                ret_bytes = sizeof(struct ll_user_fiemap);
+
+                if (extent_count != 0) {
+                        unsigned int mapped = fiemap_s->fm_mapped_extents;
+
+                        /* Never copy out more extents than the buffer was
+                         * allocated for, whatever count came back. */
+                        if (mapped > extent_count)
+                                mapped = extent_count;
+                        ret_bytes += (mapped *
+                                      sizeof(struct ll_fiemap_extent));
+                }
+
+                if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
+                        rc = -EFAULT;
+
+error:
+                OBD_VFREE(fiemap_s, num_bytes);
+                RETURN(rc);
+        }
case EXT3_IOC_GETFLAGS:
case EXT3_IOC_SETFLAGS:
RETURN(ll_iocontrol(inode, file, cmd, arg));
rc = ldlm_cli_enqueue(sbi->ll_md_exp, NULL, &einfo, &res_id,
&flock, &flags, NULL, 0, NULL, &lockh, 0);
- if ((file_lock->fl_flags & FL_FLOCK) && (rc == 0))
+ if ((file_lock->fl_flags & FL_FLOCK) &&
+ (rc == 0 || file_lock->fl_type == F_UNLCK))
ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
#ifdef HAVE_F_OP_FLOCK
- if ((file_lock->fl_flags & FL_POSIX) && (rc == 0) &&
+ if ((file_lock->fl_flags & FL_POSIX) &&
+ (rc == 0 || file_lock->fl_type == F_UNLCK) &&
!(flags & LDLM_FL_TEST_LOCK))
posix_lock_file_wait(file, file_lock);
#endif