#include <linux/quotaops.h>
#include <linux/extN_fs.h>
#include <linux/extN_jbd.h>
-#include <linux/extN_xattr.h>
+#include <linux/version.h>
+/* XXX ugh */
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+ #include <linux/extN_xattr.h>
+#else
+ #include <linux/../../fs/extN/xattr.h>
+#endif
#include <linux/kp30.h>
#include <linux/lustre_fsfilt.h>
#include <linux/obd.h>
void *cb_data; /* MDS/OST completion function data */
};
-#define EXTN_XATTR_INDEX_LUSTRE 5
-#define XATTR_LUSTRE_MDS_OBJID "system.lustre_mds_objid"
+#ifndef EXTN_XATTR_INDEX_TRUSTED /* temporary until we hit l28 kernel */
+#define EXTN_XATTR_INDEX_TRUSTED 4
+#endif
+#define XATTR_LUSTRE_MDS_LOV_EA "lov"
+
+#define EXTN_XATTR_INDEX_LUSTRE 5 /* old */
+#define XATTR_LUSTRE_MDS_OBJID "system.lustre_mds_objid" /* old */
/*
* We don't currently need any additional blocks for rmdir and
int nblocks = EXTN_DATA_TRANS_BLOCKS;
void *handle;
- LASSERT(current->journal_info == NULL);
+ if (current->journal_info) {
+ CDEBUG(D_INODE, "increasing refcount on %p\n", current->journal_info);
+ goto journal_start;
+ }
switch(op) {
case FSFILT_OP_CREATE_LOG:
}
LASSERT(current->journal_info == desc_private);
+
+ journal_start:
lock_kernel();
handle = journal_start(EXTN_JOURNAL(inode), nblocks);
unlock_kernel();
rc = journal_stop(handle);
unlock_kernel();
- LASSERT(current->journal_info == NULL);
+ // LASSERT(current->journal_info == NULL);
return rc;
}
+static int fsfilt_extN_commit_async(struct inode *inode, void *h,
+ void **wait_handle)
+{
+ transaction_t *transaction;
+ unsigned long tid, rtid;
+ handle_t *handle = h;
+ journal_t *journal;
+ int rc;
+
+ LASSERT(current->journal_info == handle);
+
+ lock_kernel();
+ transaction = handle->h_transaction;
+ journal = transaction->t_journal;
+ tid = transaction->t_tid;
+ /* we don't want to be blocked */
+ handle->h_sync = 0;
+ rc = journal_stop(handle);
+ if (rc) {
+ CERROR("error while stopping transaction: %d\n", rc);
+ unlock_kernel();
+ return rc;
+ }
+
+ rtid = log_start_commit(journal, transaction);
+ if (rtid != tid)
+ CERROR("strange race: %lu != %lu\n",
+ (unsigned long) tid, (unsigned long) rtid);
+ unlock_kernel();
+
+ *wait_handle = (void *) tid;
+ CDEBUG(D_INODE, "commit async: %lu\n", (unsigned long) tid);
+ return 0;
+}
+
+static int fsfilt_extN_commit_wait(struct inode *inode, void *h)
+{
+ tid_t tid = (tid_t)(long)h;
+
+ CDEBUG(D_INODE, "commit wait: %lu\n", (unsigned long) tid);
+ if (is_journal_aborted(EXTN_JOURNAL(inode)))
+ return -EIO;
+
+ log_wait_commit(EXTN_JOURNAL(inode), tid);
+
+ return 0;
+}
+
static int fsfilt_extN_setattr(struct dentry *dentry, void *handle,
struct iattr *iattr, int do_trunc)
{
return rc;
}
+static int fsfilt_extN_iocontrol(struct inode * inode, struct file *file,
+ unsigned int cmd, unsigned long arg)
+{
+ int rc = 0;
+ ENTRY;
+
+ if (inode->i_fop->ioctl)
+ rc = inode->i_fop->ioctl(inode, file, cmd, arg);
+ else
+ RETURN(-ENOTTY);
+
+ RETURN(rc);
+}
+
+#undef INLINE_EA
+#undef OLD_EA
static int fsfilt_extN_set_md(struct inode *inode, void *handle,
void *lmm, int lmm_size)
{
- int rc;
+ int rc, old_ea = 0;
+#ifdef INLINE_EA /* can go away before 1.0 - just for testing bug 2097 now */
/* Nasty hack city - store stripe MD data in the block pointers if
* it will fit, because putting it in an EA currently kills the MDS
* performance. We'll fix this with "fast EAs" in the future.
*/
if (inode->i_blocks == 0 && lmm_size <= sizeof(EXTN_I(inode)->i_data) -
sizeof(EXTN_I(inode)->i_data[0])) {
- /* XXX old_size is debugging only */
- int old_size = EXTN_I(inode)->i_data[0];
+ unsigned old_size = EXTN_I(inode)->i_data[0];
if (old_size != 0) {
LASSERT(old_size < sizeof(EXTN_I(inode)->i_data));
- CERROR("setting EA on %lu again... interesting\n",
- inode->i_ino);
+ CERROR("setting EA on %lu/%u again... interesting\n",
+ inode->i_ino, inode->i_generation);
}
EXTN_I(inode)->i_data[0] = cpu_to_le32(lmm_size);
memcpy(&EXTN_I(inode)->i_data[1], lmm, lmm_size);
mark_inode_dirty(inode);
return 0;
- } else {
- down(&inode->i_sem);
- lock_kernel();
- rc = extN_xattr_set(handle, inode, EXTN_XATTR_INDEX_LUSTRE,
- XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size, 0);
- unlock_kernel();
- up(&inode->i_sem);
}
+#endif
+#ifdef OLD_EA
+ /* keep this when we get rid of OLD_EA (too noisy during conversion) */
+ if (EXTN_I(inode)->i_file_acl /* || large inode EA flag */) {
+ CWARN("setting EA on %lu/%u again... interesting\n",
+ inode->i_ino, inode->i_generation);
+ old_ea = 1;
+ }
+
+ lock_kernel();
+ /* this can go away before 1.0. For bug 2097 testing only. */
+ rc = extN_xattr_set_handle(handle, inode, EXTN_XATTR_INDEX_LUSTRE,
+ XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size, 0);
+#else
+ lock_kernel();
+ rc = extN_xattr_set_handle(handle, inode, EXTN_XATTR_INDEX_TRUSTED,
+ XATTR_LUSTRE_MDS_LOV_EA, lmm, lmm_size, 0);
+
+ /* This tries to delete the old-format LOV EA, but only as long as we
+ * have successfully saved the new-format LOV EA (we can always try
+ * the conversion again the next time the file is accessed). It is
+ * possible (although unlikely) that the new-format LOV EA couldn't be
+ * saved because it ran out of space but we would need a file striped
+ * over at least 123 OSTs before the two EAs filled a 4kB block.
+ *
+ * This can be removed when all filesystems have converted to the
+ * new EA format, but otherwise adds little if any overhead. If we
+ * wanted backward compatibility for existing files, we could keep
+ * the old EA around for a while but we'd have to clean it up later. */
+ if (rc >= 0 && old_ea) {
+ int err = extN_xattr_set_handle(handle, inode,
+ EXTN_XATTR_INDEX_LUSTRE,
+ XATTR_LUSTRE_MDS_OBJID,
+ NULL, 0, 0);
+ if (err)
+ CERROR("error deleting old LOV EA on %lu/%u: rc %d\n",
+ inode->i_ino, inode->i_generation, err);
+ }
+#endif
+ unlock_kernel();
if (rc)
CERROR("error adding MD data to inode %lu: rc = %d\n",
return rc;
}
+/* Must be called with i_sem held */
static int fsfilt_extN_get_md(struct inode *inode, void *lmm, int lmm_size)
{
int rc;
+ LASSERT(down_trylock(&inode->i_sem) != 0);
+ lock_kernel();
+ /* Keep support for reading "inline EAs" until we convert
+ * users over to new format entirely. See bug 841/2097. */
if (inode->i_blocks == 0 && EXTN_I(inode)->i_data[0]) {
- int size = le32_to_cpu(EXTN_I(inode)->i_data[0]);
+ unsigned size = le32_to_cpu(EXTN_I(inode)->i_data[0]);
+ void *handle;
+
LASSERT(size < sizeof(EXTN_I(inode)->i_data));
if (lmm) {
- if (size > lmm_size)
+ if (size > lmm_size) {
+ CERROR("inline EA on %lu/%u bad size %u > %u\n",
+ inode->i_ino, inode->i_generation,
+ size, lmm_size);
return -ERANGE;
+ }
memcpy(lmm, &EXTN_I(inode)->i_data[1], size);
}
+
+#ifndef INLINE_EA
+ /* migrate LOV EA data to external block - keep same format */
+ CWARN("DEBUG: migrate inline EA for inode %lu/%u to block\n",
+ inode->i_ino, inode->i_generation);
+
+ handle = journal_start(EXTN_JOURNAL(inode),
+ EXTN_XATTR_TRANS_BLOCKS);
+ if (!IS_ERR(handle)) {
+ int err;
+ rc = fsfilt_extN_set_md(inode, handle,
+ &EXTN_I(inode)->i_data[1],size);
+ if (rc == 0) {
+ memset(EXTN_I(inode)->i_data, 0,
+ sizeof(EXTN_I(inode)->i_data));
+ mark_inode_dirty(inode);
+ }
+ err = journal_stop(handle);
+ if (err && rc == 0)
+ rc = err;
+ } else {
+ rc = PTR_ERR(handle);
+ }
+#endif
+ unlock_kernel();
return size;
}
- down(&inode->i_sem);
- lock_kernel();
- rc = extN_xattr_get(inode, EXTN_XATTR_INDEX_LUSTRE,
- XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size);
+ rc = extN_xattr_get(inode, EXTN_XATTR_INDEX_TRUSTED,
+ XATTR_LUSTRE_MDS_LOV_EA, lmm, lmm_size);
+ /* try old EA type if new one failed - MDS will convert it for us */
+ if (rc == -ENODATA) {
+ CDEBUG(D_INFO,"failed new LOV EA %d/%s from inode %lu: rc %d\n",
+ EXTN_XATTR_INDEX_TRUSTED, XATTR_LUSTRE_MDS_LOV_EA,
+ inode->i_ino, rc);
+
+ rc = extN_xattr_get(inode, EXTN_XATTR_INDEX_LUSTRE,
+ XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size);
+ }
unlock_kernel();
- up(&inode->i_sem);
/* This gives us the MD size */
if (lmm == NULL)
return (rc == -ENODATA) ? 0 : rc;
if (rc < 0) {
- CDEBUG(D_INFO, "error getting EA %s from inode %lu: "
- "rc = %d\n", XATTR_LUSTRE_MDS_OBJID, inode->i_ino, rc);
+ CDEBUG(D_INFO, "error getting EA %d/%s from inode %lu: rc %d\n",
+ EXTN_XATTR_INDEX_LUSTRE, XATTR_LUSTRE_MDS_OBJID,
+ inode->i_ino, rc);
memset(lmm, 0, lmm_size);
return (rc == -ENODATA) ? 0 : rc;
}
atomic_dec(&fcb_cache_count);
}
-static int fsfilt_extN_set_last_rcvd(struct obd_device *obd, __u64 last_rcvd,
- void *handle, fsfilt_cb_t cb_func,
- void *cb_data)
+static int fsfilt_extN_add_journal_cb(struct obd_device *obd, __u64 last_rcvd,
+ void *handle, fsfilt_cb_t cb_func,
+ void *cb_data)
{
struct fsfilt_cb_data *fcb;
return 0;
}
-static int fsfilt_extN_journal_data(struct file *filp)
-{
- struct inode *inode = filp->f_dentry->d_inode;
-
- EXTN_I(inode)->i_flags |= EXTN_JOURNAL_DATA_FL;
-
- return 0;
-}
-
/*
* We need to hack the return value for the free inode counts because
* the current EA code requires one filesystem block per inode with EAs,
return extN_force_commit(sb);
}
+extern int extN_map_inode_page(struct inode *inode, struct page *page,
+ unsigned long *blocks, int *created, int create);
+int fsfilt_extN_map_inode_page(struct inode *inode, struct page *page,
+ unsigned long *blocks, int *created, int create)
+{
+ return extN_map_inode_page(inode, page, blocks, created, create);
+}
+
extern int extN_prep_san_write(struct inode *inode, long *blocks,
- int nblocks, loff_t newsize);
+ int nblocks, loff_t newsize);
static int fsfilt_extN_prep_san_write(struct inode *inode, long *blocks,
int nblocks, loff_t newsize)
{
int err;
if (inode->i_size < *offs + size) {
- CERROR("file size %llu is too short for read %u@%llu\n",
- inode->i_size, size, *offs);
- return -EIO;
+ size = inode->i_size - *offs;
+ if (size < 0) {
+ CERROR("size %llu is too short for read %u@%llu\n",
+ inode->i_size, size, *offs);
+ return -EIO;
+ } else if (size == 0)
+ return 0;
}
block = *offs >> inode->i_blkbits;
memcpy(buf, bh->b_data + boffs, size);
brelse(bh);
*offs += size;
- return size;
+ return 0;
}
-static int fsfilt_extN_write_record(struct file * file, void *buf,
- int size, loff_t *offs)
+static int fsfilt_extN_write_record(struct file *file, void *buf, int size,
+ loff_t *offs, int force_sync)
{
struct buffer_head *bh;
unsigned long block, boffs;
journal = EXTN_SB(inode->i_sb)->s_journal;
handle = journal_start(journal, EXTN_DATA_TRANS_BLOCKS + 2);
- if (handle == NULL) {
+ if (IS_ERR(handle)) {
CERROR("can't start transaction\n");
- return -EIO;
+ return PTR_ERR(handle);
}
block = *offs >> inode->i_blkbits;
if (*offs + size > inode->i_size) {
down(&inode->i_sem);
if (*offs + size > inode->i_size)
- inode->i_size = ((loff_t)block + 1) << inode->i_blkbits;
+ inode->i_size = *offs + size;
+ if (inode->i_size > EXTN_I(inode)->i_disksize)
+ EXTN_I(inode)->i_disksize = inode->i_size;
up(&inode->i_sem);
}
CERROR("journal_dirty_metadata() returned error %d\n", err);
goto out;
}
- err = size;
+
+ if (force_sync)
+ handle->h_sync = 1; /* recovery likes this */
out:
if (bh)
brelse(bh);
journal_stop(handle);
- if (err > 0)
+ if (err == 0)
*offs += size;
return err;
}
+static int fsfilt_extN_setup(struct super_block *sb)
+{
+#if 0
+ EXTN_SB(sb)->dx_lock = fsfilt_extN_dx_lock;
+ EXTN_SB(sb)->dx_unlock = fsfilt_extN_dx_unlock;
+#endif
+#ifdef S_PDIROPS
+ CWARN("Enabling PDIROPS\n");
+ set_opt(EXTN_SB(sb)->s_mount_opt, PDIROPS);
+ sb->s_flags |= S_PDIROPS;
+#endif
+ return 0;
+}
+
static struct fsfilt_operations fsfilt_extN_ops = {
fs_type: "extN",
fs_owner: THIS_MODULE,
fs_start: fsfilt_extN_start,
fs_brw_start: fsfilt_extN_brw_start,
fs_commit: fsfilt_extN_commit,
+ fs_commit_async: fsfilt_extN_commit_async,
+ fs_commit_wait: fsfilt_extN_commit_wait,
fs_setattr: fsfilt_extN_setattr,
+ fs_iocontrol: fsfilt_extN_iocontrol,
fs_set_md: fsfilt_extN_set_md,
fs_get_md: fsfilt_extN_get_md,
fs_readpage: fsfilt_extN_readpage,
- fs_journal_data: fsfilt_extN_journal_data,
- fs_set_last_rcvd: fsfilt_extN_set_last_rcvd,
+ fs_add_journal_cb: fsfilt_extN_add_journal_cb,
fs_statfs: fsfilt_extN_statfs,
fs_sync: fsfilt_extN_sync,
+ fs_map_inode_page: fsfilt_extN_map_inode_page,
fs_prep_san_write: fsfilt_extN_prep_san_write,
fs_write_record: fsfilt_extN_write_record,
fs_read_record: fsfilt_extN_read_record,
+ fs_setup: fsfilt_extN_setup,
};
static int __init fsfilt_extN_init(void)
//rc = extN_xattr_unregister();
}
+module_init(fsfilt_extN_init);
+module_exit(fsfilt_extN_exit);
+
MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
MODULE_DESCRIPTION("Lustre extN Filesystem Helper v0.1");
MODULE_LICENSE("GPL");
-
-module_init(fsfilt_extN_init);
-module_exit(fsfilt_extN_exit);