X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Fliblustre%2Fsuper.c;h=028c2c817f2e548afec6ab0394d53b6ccd9dde07;hb=c39489126f88bb5b30643ebb11c72fbe9f9d2241;hp=0e889330d285bd7972f4e970d088587fed90f692;hpb=576c9a8212bc6607146d99e3413f7a24cbf91b5c;p=fs%2Flustre-release.git diff --git a/lustre/liblustre/super.c b/lustre/liblustre/super.c index 0e88933..028c2c8 100644 --- a/lustre/liblustre/super.c +++ b/lustre/liblustre/super.c @@ -3,7 +3,7 @@ * * Lustre Light Super operations * - * Copyright (c) 2002, 2003 Cluster File Systems, Inc. + * Copyright (c) 2002-2004 Cluster File Systems, Inc. * * This file is part of Lustre, http://www.lustre.org. * @@ -25,11 +25,17 @@ #include #include -#include #include #include #include +#include +#include #include +#ifndef __CYGWIN__ +# include +#else +# include +#endif #include #include @@ -37,11 +43,77 @@ #include #include +#undef LIST_HEAD + #include "llite_lib.h" +#ifndef MAY_EXEC +#define MAY_EXEC 1 +#define MAY_WRITE 2 +#define MAY_READ 4 +#endif + +#define S_IXUGO (S_IXUSR|S_IXGRP|S_IXOTH) + +static int ll_permission(struct inode *inode, int mask) +{ + struct llu_inode_info *lli = llu_i2info(inode); + mode_t mode = lli->lli_st_mode; + + if (current->fsuid == lli->lli_st_uid) + mode >>= 6; + else if (in_group_p(lli->lli_st_gid)) + mode >>= 3; + + if ((mode & mask & (MAY_READ|MAY_WRITE|MAY_EXEC)) == mask) + return 0; + + if ((mask & (MAY_READ|MAY_WRITE)) || + (lli->lli_st_mode & S_IXUGO)) + if (capable(CAP_DAC_OVERRIDE)) + return 0; + + if (mask == MAY_READ || + (S_ISDIR(lli->lli_st_mode) && !(mask & MAY_WRITE))) { + if (capable(CAP_DAC_READ_SEARCH)) + return 0; + } + + return -EACCES; +} + static void llu_fsop_gone(struct filesys *fs) { - /* FIXME */ + struct llu_sb_info *sbi = (struct llu_sb_info *) fs->fs_private; + struct obd_device *obd = class_exp2obd(sbi->ll_mdc_exp); + struct lustre_cfg lcfg; + int next = 0; + ENTRY; + + list_del(&sbi->ll_conn_chain); + obd_disconnect(sbi->ll_osc_exp, 0); + obd_disconnect(sbi->ll_mdc_exp, 0); + + while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next)) != NULL) + { + int err; + + LCFG_INIT(lcfg, LCFG_CLEANUP, obd->obd_name); + err = class_process_config(&lcfg); + if (err) { + CERROR("cleanup failed: %s\n", obd->obd_name); + } + + LCFG_INIT(lcfg, LCFG_DETACH, obd->obd_name); + err = class_process_config(&lcfg); + if (err) { + CERROR("detach failed: %s\n", obd->obd_name); + } + } + + OBD_FREE(sbi, sizeof(*sbi)); + + EXIT; } static struct inode_ops llu_inode_ops; @@ -53,11 +125,18 @@ void llu_update_inode(struct inode *inode, struct mds_body *body, LASSERT ((lsm != NULL) == ((body->valid & OBD_MD_FLEASIZE) != 0)); if (lsm != NULL) { - if (lli->lli_smd == NULL) + if (lli->lli_smd == NULL) { lli->lli_smd = lsm; - else - LASSERT (!memcmp (lli->lli_smd, lsm, - sizeof (*lsm))); + lli->lli_maxbytes = lsm->lsm_maxbytes; + if (lli->lli_maxbytes > PAGE_CACHE_MAXBYTES) + lli->lli_maxbytes = PAGE_CACHE_MAXBYTES; + } else { + if (memcmp(lli->lli_smd, lsm, sizeof(*lsm))) { + CERROR("lsm mismatch for inode %ld\n", + lli->lli_st_ino); + LBUG(); + } + } } if (body->valid & OBD_MD_FLID) @@ -104,6 +183,12 @@ void obdo_to_inode(struct inode *dst, struct obdo *src, obd_flag valid) valid &= src->o_valid; + if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME)) + CDEBUG(D_INODE, "valid %x, cur time %lu/%lu, new %lu/%lu\n", + src->o_valid, + LTIME_S(lli->lli_st_mtime), LTIME_S(lli->lli_st_ctime), + (long)src->o_mtime, (long)src->o_ctime); + if (valid & OBD_MD_FLATIME) LTIME_S(lli->lli_st_atime) = src->o_atime; if (valid & OBD_MD_FLMTIME) @@ -126,83 +211,138 @@ void obdo_to_inode(struct inode *dst, struct obdo *src, obd_flag valid) lli->lli_st_gid = src->o_gid; if (valid & OBD_MD_FLFLAGS) lli->lli_st_flags = src->o_flags; - if (valid & OBD_MD_FLNLINK) - lli->lli_st_nlink = src->o_nlink; if (valid & OBD_MD_FLGENER) lli->lli_st_generation = src->o_generation; - if (valid & OBD_MD_FLRDEV) - lli->lli_st_rdev = src->o_rdev; } +#define S_IRWXUGO (S_IRWXU|S_IRWXG|S_IRWXO) +#define S_IALLUGO (S_ISUID|S_ISGID|S_ISVTX|S_IRWXUGO) + void obdo_from_inode(struct obdo *dst, struct inode *src, obd_flag valid) { struct llu_inode_info *lli = llu_i2info(src); + obd_flag newvalid = 0; - if (valid & OBD_MD_FLATIME) + if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME)) + CDEBUG(D_INODE, "valid %x, new time %lu/%lu\n", + valid, LTIME_S(lli->lli_st_mtime), + LTIME_S(lli->lli_st_ctime)); + + if (valid & OBD_MD_FLATIME) { dst->o_atime = LTIME_S(lli->lli_st_atime); - if (valid & OBD_MD_FLMTIME) + newvalid |= OBD_MD_FLATIME; + } + if (valid & OBD_MD_FLMTIME) { dst->o_mtime = LTIME_S(lli->lli_st_mtime); - if (valid & OBD_MD_FLCTIME) + newvalid |= OBD_MD_FLMTIME; + } + if (valid & OBD_MD_FLCTIME) { dst->o_ctime = LTIME_S(lli->lli_st_ctime); - if (valid & OBD_MD_FLSIZE) + newvalid |= OBD_MD_FLCTIME; + } + if (valid & OBD_MD_FLSIZE) { dst->o_size = lli->lli_st_size; - if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ + newvalid |= OBD_MD_FLSIZE; + } + if (valid & OBD_MD_FLBLOCKS) { /* allocation of space (x512 bytes) */ dst->o_blocks = lli->lli_st_blocks; - if (valid & OBD_MD_FLBLKSZ) + newvalid |= OBD_MD_FLBLOCKS; + } + if (valid & OBD_MD_FLBLKSZ) { /* optimal block size */ dst->o_blksize = lli->lli_st_blksize; - if (valid & OBD_MD_FLTYPE) - dst->o_mode = (dst->o_mode & ~S_IFMT) | (lli->lli_st_mode & S_IFMT); - if (valid & OBD_MD_FLMODE) - dst->o_mode = (dst->o_mode & S_IFMT) | (lli->lli_st_mode & ~S_IFMT); - if (valid & OBD_MD_FLUID) + newvalid |= OBD_MD_FLBLKSZ; + } + if (valid & OBD_MD_FLTYPE) { + dst->o_mode = (dst->o_mode & S_IALLUGO)|(lli->lli_st_mode & S_IFMT); + newvalid |= OBD_MD_FLTYPE; + } + if (valid & OBD_MD_FLMODE) { + dst->o_mode = (dst->o_mode & S_IFMT)|(lli->lli_st_mode & S_IALLUGO); + newvalid |= OBD_MD_FLMODE; + } + if (valid & OBD_MD_FLUID) { dst->o_uid = lli->lli_st_uid; - if (valid & OBD_MD_FLGID) + newvalid |= OBD_MD_FLUID; + } + if (valid & OBD_MD_FLGID) { dst->o_gid = lli->lli_st_gid; - if (valid & OBD_MD_FLFLAGS) + newvalid |= OBD_MD_FLGID; + } + if (valid & OBD_MD_FLFLAGS) { dst->o_flags = lli->lli_st_flags; - if (valid & OBD_MD_FLNLINK) - dst->o_nlink = lli->lli_st_nlink; - if (valid & OBD_MD_FLGENER) + newvalid |= OBD_MD_FLFLAGS; + } + if (valid & OBD_MD_FLGENER) { dst->o_generation = lli->lli_st_generation; - if (valid & OBD_MD_FLRDEV) - dst->o_rdev = (__u32)(lli->lli_st_rdev); + newvalid |= OBD_MD_FLGENER; + } - dst->o_valid |= (valid & ~OBD_MD_FLID); + dst->o_valid |= newvalid; } -int llu_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm, - char *ostdata) +/* + * really does the getattr on the inode and updates its fields + */ +int llu_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm) { - struct llu_sb_info *sbi = llu_i2sbi(inode); + struct llu_inode_info *lli = llu_i2info(inode); + struct obd_export *exp = llu_i2obdexp(inode); + struct ptlrpc_request_set *set; struct obdo oa; + obd_flag refresh_valid; int rc; ENTRY; LASSERT(lsm); - LASSERT(sbi); + LASSERT(lli); memset(&oa, 0, sizeof oa); oa.o_id = lsm->lsm_object_id; oa.o_mode = S_IFREG; oa.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE | - OBD_MD_FLBLOCKS | OBD_MD_FLMTIME | OBD_MD_FLCTIME; + OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | OBD_MD_FLMTIME | + OBD_MD_FLCTIME; - if (ostdata != NULL) { - memcpy(&oa.o_inline, ostdata, FD_OSTDATA_SIZE); - oa.o_valid |= OBD_MD_FLHANDLE; + set = ptlrpc_prep_set(); + if (set == NULL) { + CERROR ("ENOMEM allocing request set\n"); + rc = -ENOMEM; + } else { + rc = obd_getattr_async(exp, &oa, lsm, set); + if (rc == 0) + rc = ptlrpc_set_wait(set); + ptlrpc_set_destroy(set); } - - rc = obd_getattr(&sbi->ll_osc_conn, &oa, lsm); if (rc) RETURN(rc); - obdo_to_inode(inode, &oa, OBD_MD_FLSIZE | OBD_MD_FLBLOCKS | - OBD_MD_FLMTIME | OBD_MD_FLCTIME); + refresh_valid = OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | OBD_MD_FLMTIME | + OBD_MD_FLCTIME | OBD_MD_FLSIZE; + + /* We set this flag in commit write as we extend the file size. When + * the bit is set and the lock is canceled that covers the file size, + * we clear the bit. This is enough to protect the window where our + * local size extension is needed for writeback. However, it relies on + * behaviour that won't be true in the near future. This assumes that + * all getattr callers get extent locks, which they currnetly do. It + * also assumes that we only send discarding asts for {0,eof} truncates + * as is currently the case. This will have to be replaced by the + * proper eoc communication between clients and the ost, which is on + * its way. */ + if (test_bit(LLI_F_PREFER_EXTENDED_SIZE, &lli->lli_flags)) { + if (oa.o_size < lli->lli_st_size) + refresh_valid &= ~OBD_MD_FLSIZE; + else + clear_bit(LLI_F_PREFER_EXTENDED_SIZE, &lli->lli_flags); + } + + obdo_refresh_inode(inode, &oa, refresh_valid); RETURN(0); } -struct inode* llu_new_inode(struct filesys *fs, ino_t ino, mode_t mode) +static struct inode* llu_new_inode(struct filesys *fs, + struct ll_fid *fid) { struct inode *inode; struct llu_inode_info *lli; @@ -216,17 +356,22 @@ struct inode* llu_new_inode(struct filesys *fs, ino_t ino, mode_t mode) lli->lli_smd = NULL; lli->lli_symlink_name = NULL; lli->lli_flags = 0; - INIT_LIST_HEAD(&lli->lli_read_extents); + lli->lli_maxbytes = (__u64)(~0UL); lli->lli_file_data = NULL; - /* could file_identifier be 0 ? FIXME */ - inode = _sysio_i_new(fs, ino, NULL, + lli->lli_sysio_fid.fid_data = &lli->lli_fid; + lli->lli_sysio_fid.fid_len = sizeof(lli->lli_fid); + + memcpy(&lli->lli_fid, fid, sizeof(*fid)); + + /* file identifier is needed by functions like _sysio_i_find() */ + inode = _sysio_i_new(fs, &lli->lli_sysio_fid, #ifndef AUTOMOUNT_FILE_NAME - mode & S_IFMT, + fid->f_type & S_IFMT, #else - mode, /* all of the bits! */ + fid->f_type, /* all of the bits! */ #endif - 0, + 0, 0, &llu_inode_ops, lli); if (!inode) @@ -235,113 +380,110 @@ struct inode* llu_new_inode(struct filesys *fs, ino_t ino, mode_t mode) return inode; } -static int llu_iop_lookup(struct pnode *pnode, - struct inode **inop, - struct intent *intnt __IS_UNUSED, - const char *path __IS_UNUSED) +static int llu_have_md_lock(struct inode *inode, __u64 lockpart) { - struct pnode_base *pb_dir = pnode->p_parent->p_base; - struct ptlrpc_request *request = NULL; - struct llu_sb_info *sbi = llu_i2sbi(pb_dir->pb_ino); - struct ll_fid *fid = &llu_i2info(pb_dir->pb_ino)->lli_fid; - struct qstr *name = &pnode->p_base->pb_name; - struct mds_body *body; - unsigned long valid; - char *pname; - int rc, easize; - struct ll_read_inode2_cookie lic = {.lic_body = NULL, .lic_lsm = NULL}; - - /* the mount root inode have no name, so don't call - * remote in this case. but probably we need revalidate - * it here? FIXME */ - if (pnode->p_mount->mnt_root == pnode) { - struct inode *i = pnode->p_base->pb_ino; - I_REF(i); - *inop = i; - return 0; + struct llu_sb_info *sbi = llu_i2sbi(inode); + struct llu_inode_info *lli = llu_i2info(inode); + struct lustre_handle lockh; + struct ldlm_res_id res_id = { .name = {0} }; + struct obd_device *obddev; + ldlm_policy_data_t policy = { .l_inodebits = { lockpart } }; + int flags; + ENTRY; + + LASSERT(inode); + + obddev = sbi->ll_mdc_exp->exp_obd; + res_id.name[0] = lli->lli_st_ino; + res_id.name[1] = lli->lli_st_generation; + + CDEBUG(D_INFO, "trying to match res "LPU64"\n", res_id.name[0]); + + /* FIXME use LDLM_FL_TEST_LOCK instead */ + flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING; + if (ldlm_lock_match(obddev->obd_namespace, flags, &res_id, LDLM_IBITS, + &policy, LCK_PR, &lockh)) { + ldlm_lock_decref(&lockh, LCK_PR); + RETURN(1); + } + + if (ldlm_lock_match(obddev->obd_namespace, flags, &res_id, LDLM_IBITS, + &policy, LCK_PW, &lockh)) { + ldlm_lock_decref(&lockh, LCK_PW); + RETURN(1); } + RETURN(0); +} + +static int llu_inode_revalidate(struct inode *inode) +{ + struct llu_inode_info *lli = llu_i2info(inode); + struct lov_stripe_md *lsm = NULL; + ENTRY; - if (!name->len) - return -EINVAL; - - /* mdc_getattr_name require NULL-terminated name */ - OBD_ALLOC(pname, name->len + 1); - if (!pname) - return -ENOMEM; - memcpy(pname, name->name, name->len); - pname[name->len] = 0; - - valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE; - - /* FIXME before getattr_name, we don't know whether - * the inode we are finding is regular or not, so here - * we blindly require server feed in EA data */ - easize = obd_size_diskmd(&sbi->ll_osc_conn, NULL); - valid |= OBD_MD_FLEASIZE; - - rc = mdc_getattr_name(&sbi->ll_mdc_conn, fid, - pname, name->len + 1, - valid, easize, &request); - if (rc < 0) { - CERROR("mdc_getattr_name: %d\n", rc); - rc = -ENOENT; - goto out; - } - body = lustre_msg_buf(request->rq_repmsg, 0, sizeof(*body)); - - *inop = llu_new_inode(pnode->p_mount->mnt_fs, body->ino, body->mode); - if (!inop) - goto out; - - lic.lic_body = lustre_msg_buf(request->rq_repmsg, 0, sizeof(*lic.lic_body)); - LASSERT (lic.lic_body != NULL); - LASSERT_REPSWABBED (request, 0); - - if (S_ISREG(lic.lic_body->mode) && - lic.lic_body->valid & OBD_MD_FLEASIZE) { - struct lov_mds_md *lmm; - int lmm_size; - int rc; - - lmm_size = lic.lic_body->eadatasize; - if (lmm_size == 0) { - CERROR ("OBD_MD_FLEASIZE set but eadatasize 0\n"); - RETURN (-EPROTO); + if (!inode) { + CERROR("REPORT THIS LINE TO PETER\n"); + RETURN(0); + } + + if (!llu_have_md_lock(inode, MDS_INODELOCK_UPDATE)) { + struct lustre_md md; + struct ptlrpc_request *req = NULL; + struct llu_sb_info *sbi = llu_i2sbi(inode); + struct ll_fid fid; + unsigned long valid = 0; + int rc, ealen = 0; + + /* Why don't we update all valid MDS fields here, if we're + * doing an RPC anyways? -phil */ + if (S_ISREG(lli->lli_st_mode)) { + ealen = obd_size_diskmd(sbi->ll_osc_exp, NULL); + valid |= OBD_MD_FLEASIZE; + } + ll_inode2fid(&fid, inode); + rc = mdc_getattr(sbi->ll_mdc_exp, &fid, valid, ealen, &req); + if (rc) { + CERROR("failure %d inode %lu\n", rc, lli->lli_st_ino); + RETURN(-abs(rc)); } - lmm = lustre_msg_buf(request->rq_repmsg, 0 + 1, lmm_size); - LASSERT(lmm != NULL); - LASSERT_REPSWABBED (request, 0 + 1); - - rc = obd_unpackmd (&sbi->ll_osc_conn, - &lic.lic_lsm, lmm, lmm_size); - if (rc < 0) { - CERROR ("Error %d unpacking eadata\n", rc); - RETURN (rc); + rc = mdc_req2lustre_md(sbi->ll_mdc_exp, req, 0, sbi->ll_osc_exp, + &md); + + /* XXX Too paranoid? */ + if (((md.body->valid ^ valid) & OBD_MD_FLEASIZE) && + !((md.body->valid & OBD_MD_FLNLINK) && + (md.body->nlink == 0))) { + CERROR("Asked for %s eadata but got %s (%d)\n", + (valid & OBD_MD_FLEASIZE) ? "some" : "no", + (md.body->valid & OBD_MD_FLEASIZE) ? "some":"none", + md.body->eadatasize); + } + if (rc) { + ptlrpc_req_finished(req); + RETURN(rc); } - LASSERT (rc >= sizeof (*lic.lic_lsm)); - } else { - lic.lic_lsm = NULL; - } - llu_update_inode(*inop, body, lic.lic_lsm); + llu_update_inode(inode, md.body, md.lsm); + if (md.lsm != NULL && llu_i2info(inode)->lli_smd != md.lsm) + obd_free_memmd(sbi->ll_osc_exp, &md.lsm); - if (llu_i2info(*inop)->lli_smd) { - rc = llu_inode_getattr(*inop, llu_i2info(*inop)->lli_smd, NULL); - if (rc) - _sysio_i_gone(*inop); + if (md.body->valid & OBD_MD_FLSIZE) + set_bit(LLI_F_HAVE_MDS_SIZE_LOCK, + &llu_i2info(inode)->lli_flags); + ptlrpc_req_finished(req); } -out: - ptlrpc_req_finished(request); - OBD_FREE(pname, name->len + 1); + lsm = llu_i2info(inode)->lli_smd; + if (!lsm) /* object not yet allocated, don't validate size */ + RETURN(0); - return rc; + /* ll_glimpse_size will prefer locally cached writes if they extend + * the file */ + RETURN(llu_glimpse_size(inode)); } -static int llu_iop_getattr(struct pnode *pno, - struct inode *ino, - struct intnl_stat *b) +static void copy_stat_buf(struct inode *ino, struct intnl_stat *b) { struct llu_inode_info *lli = llu_i2info(ino); @@ -358,46 +500,71 @@ static int llu_iop_getattr(struct pnode *pno, b->st_atime = lli->lli_st_atime; b->st_mtime = lli->lli_st_mtime; b->st_ctime = lli->lli_st_ctime; - - return 0; } -int llu_mdc_cancel_unused(struct lustre_handle *conn, - struct llu_inode_info *lli, - int flags) +static int llu_iop_getattr(struct pnode *pno, + struct inode *ino, + struct intnl_stat *b) { - struct ldlm_res_id res_id = - { .name = {lli->lli_st_ino, lli->lli_st_generation} }; - struct obd_device *obddev = class_conn2obd(conn); + int rc; ENTRY; - RETURN(ldlm_cli_cancel_unused(obddev->obd_namespace, &res_id, flags)); + + liblustre_wait_event(0); + + if (!ino) { + LASSERT(pno); + LASSERT(pno->p_base->pb_ino); + ino = pno->p_base->pb_ino; + } else { + LASSERT(!pno || pno->p_base->pb_ino == ino); + } + + /* libsysio might call us directly without intent lock, + * we must re-fetch the attrs here + */ + rc = llu_inode_revalidate(ino); + if (!rc) { + copy_stat_buf(ino, b); + LASSERT(!llu_i2info(ino)->lli_it); + } + + RETURN(rc); } -static void llu_clear_inode(struct inode *inode) +static int null_if_equal(struct ldlm_lock *lock, void *data) { - struct llu_sb_info *sbi = llu_i2sbi(inode); + if (data == lock->l_ast_data) { + lock->l_ast_data = NULL; + + if (lock->l_req_mode != lock->l_granted_mode) + LDLM_ERROR(lock,"clearing inode with ungranted lock\n"); + } + + return LDLM_ITER_CONTINUE; +} + +void llu_clear_inode(struct inode *inode) +{ + struct ll_fid fid; struct llu_inode_info *lli = llu_i2info(inode); - int rc; + struct llu_sb_info *sbi = llu_i2sbi(inode); ENTRY; - CDEBUG(D_INODE, "clear inode: %lu\n", lli->lli_st_ino); - rc = llu_mdc_cancel_unused(&sbi->ll_mdc_conn, lli, - LDLM_FL_NO_CALLBACK); - if (rc < 0) { - CERROR("ll_mdc_cancel_unused: %d\n", rc); - /* XXX FIXME do something dramatic */ - } + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%lu(%p)\n", lli->lli_st_ino, + lli->lli_st_generation, inode); - if (lli->lli_smd) { - rc = obd_cancel_unused(&sbi->ll_osc_conn, lli->lli_smd, 0); - if (rc < 0) { - CERROR("obd_cancel_unused: %d\n", rc); - /* XXX FIXME do something dramatic */ - } - } + ll_inode2fid(&fid, inode); + clear_bit(LLI_F_HAVE_MDS_SIZE_LOCK, &(lli->lli_flags)); + mdc_change_cbdata(sbi->ll_mdc_exp, &fid, null_if_equal, inode); if (lli->lli_smd) - obd_free_memmd(&sbi->ll_osc_conn, &lli->lli_smd); + obd_change_cbdata(sbi->ll_osc_exp, lli->lli_smd, + null_if_equal, inode); + + if (lli->lli_smd) { + obd_free_memmd(sbi->ll_osc_exp, &lli->lli_smd); + lli->lli_smd = NULL; + } if (lli->lli_symlink_name) { OBD_FREE(lli->lli_symlink_name, @@ -411,66 +578,207 @@ static void llu_clear_inode(struct inode *inode) void llu_iop_gone(struct inode *inode) { struct llu_inode_info *lli = llu_i2info(inode); + ENTRY; + liblustre_wait_event(0); llu_clear_inode(inode); OBD_FREE(lli, sizeof(*lli)); + EXIT; } -static int llu_setattr_raw(struct inode *inode, struct iattr *attr) +static int inode_setattr(struct inode * inode, struct iattr * attr) { - struct ptlrpc_request *request = NULL; + unsigned int ia_valid = attr->ia_valid; + struct llu_inode_info *lli = llu_i2info(inode); + int error = 0; + + if (ia_valid & ATTR_SIZE) { + error = llu_vmtruncate(inode, attr->ia_size); + if (error) + goto out; + } + + if (ia_valid & ATTR_UID) + lli->lli_st_uid = attr->ia_uid; + if (ia_valid & ATTR_GID) + lli->lli_st_gid = attr->ia_gid; + if (ia_valid & ATTR_ATIME) + lli->lli_st_atime = attr->ia_atime; + if (ia_valid & ATTR_MTIME) + lli->lli_st_mtime = attr->ia_mtime; + if (ia_valid & ATTR_CTIME) + lli->lli_st_ctime = attr->ia_ctime; + if (ia_valid & ATTR_MODE) { + lli->lli_st_mode = attr->ia_mode; + if (!in_group_p(lli->lli_st_gid) && !capable(CAP_FSETID)) + lli->lli_st_mode &= ~S_ISGID; + } + /* mark_inode_dirty(inode); */ +out: + return error; +} + +/* If this inode has objects allocated to it (lsm != NULL), then the OST + * object(s) determine the file size and mtime. Otherwise, the MDS will + * keep these values until such a time that objects are allocated for it. + * We do the MDS operations first, as it is checking permissions for us. + * We don't to the MDS RPC if there is nothing that we want to store there, + * otherwise there is no harm in updating mtime/atime on the MDS if we are + * going to do an RPC anyways. + * + * If we are doing a truncate, we will send the mtime and ctime updates + * to the OST with the punch RPC, otherwise we do an explicit setattr RPC. + * I don't believe it is possible to get e.g. ATTR_MTIME_SET and ATTR_SIZE + * at the same time. + */ +int llu_setattr_raw(struct inode *inode, struct iattr *attr) +{ + struct lov_stripe_md *lsm = llu_i2info(inode)->lli_smd; struct llu_sb_info *sbi = llu_i2sbi(inode); struct llu_inode_info *lli = llu_i2info(inode); + struct ptlrpc_request *request = NULL; struct mdc_op_data op_data; - int err = 0; + int ia_valid = attr->ia_valid; + int rc = 0; ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu\n", lli->lli_st_ino); - /* if need truncate, do it at first */ - if (attr->ia_valid & ATTR_SIZE) { - printf("************* don't support truncate now !!!!!!!!\n"); - LBUG(); + if (ia_valid & ATTR_SIZE) { + if (attr->ia_size > ll_file_maxbytes(inode)) { + CDEBUG(D_INODE, "file too large %llu > "LPU64"\n", + attr->ia_size, ll_file_maxbytes(inode)); + RETURN(-EFBIG); + } + + attr->ia_valid |= ATTR_MTIME | ATTR_CTIME; } - /* Don't send size changes to MDS to avoid "fast EA" problems, and - * also avoid a pointless RPC (we get file size from OST anyways). - */ - attr->ia_valid &= ~ATTR_SIZE; - if (!attr->ia_valid) - RETURN(0); + /* We mark all of the fields "set" so MDS/OST does not re-set them */ + if (attr->ia_valid & ATTR_CTIME) { + attr->ia_ctime = CURRENT_TIME; + attr->ia_valid |= ATTR_CTIME_SET; + } + if (!(ia_valid & ATTR_ATIME_SET) && (attr->ia_valid & ATTR_ATIME)) { + attr->ia_atime = CURRENT_TIME; + attr->ia_valid |= ATTR_ATIME_SET; + } + if (!(ia_valid & ATTR_MTIME_SET) && (attr->ia_valid & ATTR_MTIME)) { + attr->ia_mtime = CURRENT_TIME; + attr->ia_valid |= ATTR_MTIME_SET; + } - llu_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0); + if (attr->ia_valid & (ATTR_MTIME | ATTR_CTIME)) + CDEBUG(D_INODE, "setting mtime %lu, ctime %lu, now = %lu\n", + LTIME_S(attr->ia_mtime), LTIME_S(attr->ia_ctime), + LTIME_S(CURRENT_TIME)); + if (lsm) + attr->ia_valid &= ~ATTR_SIZE; + + /* If only OST attributes being set on objects, don't do MDS RPC. + * In that case, we need to check permissions and update the local + * inode ourselves so we can call obdo_from_inode() always. */ + if (ia_valid & (lsm ? ~(ATTR_SIZE | ATTR_FROM_OPEN | ATTR_RAW) : ~0)) { + struct lustre_md md; + llu_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0); + + rc = mdc_setattr(sbi->ll_mdc_exp, &op_data, + attr, NULL, 0, NULL, 0, &request); + + if (rc) { + ptlrpc_req_finished(request); + if (rc != -EPERM && rc != -EACCES) + CERROR("mdc_setattr fails: rc = %d\n", rc); + RETURN(rc); + } - err = mdc_setattr(&sbi->ll_mdc_conn, &op_data, - attr, NULL, 0, &request); - if (err) - CERROR("mdc_setattr fails: err = %d\n", err); + rc = mdc_req2lustre_md(sbi->ll_mdc_exp, request, 0, + sbi->ll_osc_exp, &md); + if (rc) { + ptlrpc_req_finished(request); + RETURN(rc); + } - ptlrpc_req_finished(request); + /* Won't invoke vmtruncate as we already cleared ATTR_SIZE, + * but needed to set timestamps backwards on utime. */ + inode_setattr(inode, attr); + llu_update_inode(inode, md.body, md.lsm); + ptlrpc_req_finished(request); + + if (!md.lsm || !S_ISREG(lli->lli_st_mode)) { + CDEBUG(D_INODE, "no lsm: not setting attrs on OST\n"); + RETURN(0); + } + } else { + /* The OST doesn't check permissions, but the alternative is + * a gratuitous RPC to the MDS. We already rely on the client + * to do read/write/truncate permission checks, so is mtime OK? + */ + if (ia_valid & (ATTR_MTIME | ATTR_ATIME)) { + /* from sys_utime() */ + if (!(ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET))) { + if (current->fsuid != lli->lli_st_uid && + (rc = ll_permission(inode, MAY_WRITE)) != 0) + RETURN(rc); + } else { + /* from inode_change_ok() */ + if (current->fsuid != lli->lli_st_uid && + !capable(CAP_FOWNER)) + RETURN(-EPERM); + } + } - if (S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_MTIME_SET) { - struct lov_stripe_md *lsm = lli->lli_smd; + /* Won't invoke vmtruncate, as we already cleared ATTR_SIZE */ + inode_setattr(inode, attr); + } + + if (ia_valid & ATTR_SIZE) { + ldlm_policy_data_t policy = { .l_extent = {attr->ia_size, + OBD_OBJECT_EOF} }; + struct lustre_handle lockh = { 0 }; + int err, ast_flags = 0; + /* XXX when we fix the AST intents to pass the discard-range + * XXX extent, make ast_flags always LDLM_AST_DISCARD_DATA + * XXX here. */ + if (attr->ia_size == 0) + ast_flags = LDLM_AST_DISCARD_DATA; + + rc = llu_extent_lock(NULL, inode, lsm, LCK_PW, &policy, + &lockh, ast_flags); + if (rc != ELDLM_OK) { + if (rc > 0) + RETURN(-ENOLCK); + RETURN(rc); + } + + rc = llu_vmtruncate(inode, attr->ia_size); + + /* unlock now as we don't mind others file lockers racing with + * the mds updates below? */ + err = llu_extent_unlock(NULL, inode, lsm, LCK_PW, &lockh); + if (err) { + CERROR("llu_extent_unlock failed: %d\n", err); + if (!rc) + rc = err; + } + } else if (ia_valid & (ATTR_MTIME | ATTR_MTIME_SET)) { struct obdo oa; - int err2; CDEBUG(D_INODE, "set mtime on OST inode %lu to %lu\n", - lli->lli_st_ino, attr->ia_mtime); + lli->lli_st_ino, LTIME_S(attr->ia_mtime)); oa.o_id = lsm->lsm_object_id; - oa.o_mode = S_IFREG; - oa.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMTIME; - oa.o_mtime = attr->ia_mtime; - err2 = obd_setattr(&sbi->ll_osc_conn, &oa, lsm, NULL); - if (err2) { - CERROR("obd_setattr fails: rc=%d\n", err); - if (!err) - err = err2; - } + oa.o_valid = OBD_MD_FLID; + obdo_from_inode(&oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME | + OBD_MD_FLMTIME | OBD_MD_FLCTIME); + rc = obd_setattr(sbi->ll_osc_exp, &oa, lsm, NULL); + if (rc) + CERROR("obd_setattr fails: rc=%d\n", rc); } - RETURN(err); + RETURN(rc); } -/* FIXME here we simply act as a thin layer to glue it with +/* here we simply act as a thin layer to glue it with * llu_setattr_raw(), which is copy from kernel */ static int llu_iop_setattr(struct pnode *pno, @@ -479,7 +787,13 @@ static int llu_iop_setattr(struct pnode *pno, struct intnl_stat *stbuf) { struct iattr iattr; + ENTRY; + liblustre_wait_event(0); + + LASSERT(!(mask & ~(SETATTR_MTIME | SETATTR_ATIME | + SETATTR_UID | SETATTR_GID | + SETATTR_LEN | SETATTR_MODE))); memset(&iattr, 0, sizeof(iattr)); if (mask & SETATTR_MODE) { @@ -503,104 +817,532 @@ static int llu_iop_setattr(struct pnode *pno, iattr.ia_valid |= ATTR_GID; } if (mask & SETATTR_LEN) { - iattr.ia_size = stbuf->st_size; /* FIXME signed expansion problem */ + iattr.ia_size = stbuf->st_size; /* XXX signed expansion problem */ iattr.ia_valid |= ATTR_SIZE; } iattr.ia_valid |= ATTR_RAW; - /* FIXME FIXME FIXME FIXME FIXME FIXME FIXME - * without ATTR_FROM_OPEN, mds_reint_setattr will call - * mds_fid2locked_dentry() and deadlocked at completion_ast call. - * Here we workaround it and avoid any locking. - * FIXME FIXME FIXME FIXME FIXME FIXME FIXME - */ - iattr.ia_valid |= ATTR_FROM_OPEN; - return llu_setattr_raw(ino, &iattr); + RETURN(llu_setattr_raw(ino, &iattr)); } +#define EXT2_LINK_MAX 32000 -static int llu_mkdir2(struct inode *dir, const char *name, int len, int mode) +static int llu_iop_symlink_raw(struct pnode *pno, const char *tgt) { + struct inode *dir = pno->p_base->pb_parent->pb_ino; + struct qstr *qstr = &pno->p_base->pb_name; + const char *name = qstr->name; + int len = qstr->len; struct ptlrpc_request *request = NULL; - time_t curtime = CURRENT_TIME; struct llu_sb_info *sbi = llu_i2sbi(dir); - struct llu_inode_info *lli = llu_i2info(dir); struct mdc_op_data op_data; int err = -EMLINK; ENTRY; - CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu\n", - name, lli->lli_st_ino); - /* FIXME check this later */ -#if 0 - if (dir->i_nlink >= EXT2_LINK_MAX) + if (llu_i2info(dir)->lli_st_nlink >= EXT2_LINK_MAX) RETURN(err); - mode = (mode & (S_IRWXUGO|S_ISVTX) & ~current->fs->umask) | S_IFDIR; -#endif - mode |= S_IFDIR; + llu_prepare_mdc_op_data(&op_data, dir, NULL, name, len, 0); - err = mdc_create(&sbi->ll_mdc_conn, &op_data, NULL, 0, mode, - current->fsuid, current->fsgid, - curtime, 0, &request); + err = mdc_create(sbi->ll_mdc_exp, &op_data, + tgt, strlen(tgt) + 1, S_IFLNK | S_IRWXUGO, + current->fsuid, current->fsgid, 0, &request); + ptlrpc_req_finished(request); + RETURN(err); +} + +static int llu_readlink_internal(struct inode *inode, + struct ptlrpc_request **request, + char **symname) +{ + struct llu_inode_info *lli = llu_i2info(inode); + struct llu_sb_info *sbi = llu_i2sbi(inode); + struct ll_fid fid; + struct mds_body *body; + int rc, symlen = lli->lli_st_size + 1; + ENTRY; + + *request = NULL; + + if (lli->lli_symlink_name) { + *symname = lli->lli_symlink_name; + CDEBUG(D_INODE, "using cached symlink %s\n", *symname); + RETURN(0); + } + + ll_inode2fid(&fid, inode); + rc = mdc_getattr(sbi->ll_mdc_exp, &fid, + OBD_MD_LINKNAME, symlen, request); + if (rc) { + CERROR("inode %lu: rc = %d\n", lli->lli_st_ino, rc); + RETURN(rc); + } + + body = lustre_msg_buf ((*request)->rq_repmsg, 0, sizeof (*body)); + LASSERT (body != NULL); + LASSERT_REPSWABBED (*request, 0); + + if ((body->valid & OBD_MD_LINKNAME) == 0) { + CERROR ("OBD_MD_LINKNAME not set on reply\n"); + GOTO (failed, rc = -EPROTO); + } + + LASSERT (symlen != 0); + if (body->eadatasize != symlen) { + CERROR ("inode %lu: symlink length %d not expected %d\n", + lli->lli_st_ino, body->eadatasize - 1, symlen - 1); + GOTO (failed, rc = -EPROTO); + } + + *symname = lustre_msg_buf ((*request)->rq_repmsg, 1, symlen); + if (*symname == NULL || + strnlen (*symname, symlen) != symlen - 1) { + /* not full/NULL terminated */ + CERROR ("inode %lu: symlink not NULL terminated string" + "of length %d\n", lli->lli_st_ino, symlen - 1); + GOTO (failed, rc = -EPROTO); + } + + OBD_ALLOC(lli->lli_symlink_name, symlen); + /* do not return an error if we cannot cache the symlink locally */ + if (lli->lli_symlink_name) + memcpy(lli->lli_symlink_name, *symname, symlen); + + RETURN(0); + + failed: + ptlrpc_req_finished (*request); + RETURN (-EPROTO); +} + +static int llu_iop_readlink(struct pnode *pno, char *data, size_t bufsize) +{ + struct inode *inode = pno->p_base->pb_ino; + struct ptlrpc_request *request; + char *symname; + int rc; + ENTRY; + + rc = llu_readlink_internal(inode, &request, &symname); + if (rc) + GOTO(out, rc); + + LASSERT(symname); + strncpy(data, symname, bufsize); + ptlrpc_req_finished(request); + out: + RETURN(rc); +} + +static int llu_iop_mknod_raw(struct pnode *pno, + mode_t mode, + dev_t dev) +{ + struct ptlrpc_request *request = NULL; + struct inode *dir = pno->p_parent->p_base->pb_ino; + struct llu_sb_info *sbi = llu_i2sbi(dir); + struct mdc_op_data op_data; + int err = -EMLINK; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu\n", + pno->p_base->pb_name.name, llu_i2info(dir)->lli_st_ino); + + if (llu_i2info(dir)->lli_st_nlink >= EXT2_LINK_MAX) + RETURN(err); + + mode &= ~current->fs->umask; + + switch (mode & S_IFMT) { + case 0: + case S_IFREG: + mode |= S_IFREG; /* for mode = 0 case, fallthrough */ + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + llu_prepare_mdc_op_data(&op_data, dir, NULL, + pno->p_base->pb_name.name, + pno->p_base->pb_name.len, + 0); + err = mdc_create(sbi->ll_mdc_exp, &op_data, NULL, 0, mode, + current->fsuid, current->fsgid, dev, &request); + ptlrpc_req_finished(request); + break; + case S_IFDIR: + err = -EPERM; + break; + default: + err = -EINVAL; + } RETURN(err); } -static int llu_iop_mkdir(struct pnode *pno, mode_t mode) +static int llu_iop_link_raw(struct pnode *old, struct pnode *new) +{ + struct inode *src = old->p_base->pb_ino; + struct inode *dir = new->p_parent->p_base->pb_ino; + const char *name = new->p_base->pb_name.name; + int namelen = new->p_base->pb_name.len; + struct ptlrpc_request *request = NULL; + struct mdc_op_data op_data; + int rc; + ENTRY; + + LASSERT(src); + LASSERT(dir); + + liblustre_wait_event(0); + llu_prepare_mdc_op_data(&op_data, src, dir, name, namelen, 0); + rc = mdc_link(llu_i2sbi(src)->ll_mdc_exp, &op_data, &request); + ptlrpc_req_finished(request); + liblustre_wait_event(0); + + RETURN(rc); +} + +/* + * libsysio will clear the inode immediately after return + */ +static int llu_iop_unlink_raw(struct pnode *pno) { struct inode *dir = pno->p_base->pb_parent->pb_ino; struct qstr *qstr = &pno->p_base->pb_name; + const char *name = qstr->name; + int len = qstr->len; + struct inode *target = pno->p_base->pb_ino; + struct ptlrpc_request *request = NULL; + struct mdc_op_data op_data; int rc; + ENTRY; - LASSERT(dir); + LASSERT(target); - rc = llu_mkdir2(dir, qstr->name, qstr->len, mode); + liblustre_wait_event(0); + llu_prepare_mdc_op_data(&op_data, dir, NULL, name, len, 0); + rc = mdc_unlink(llu_i2sbi(dir)->ll_mdc_exp, &op_data, &request); + if (!rc) + rc = llu_objects_destroy(request, dir); + ptlrpc_req_finished(request); + liblustre_wait_event(0); - return rc; + RETURN(rc); } -#ifndef S_IRWXUGO -#define S_IRWXUGO (S_IRWXU|S_IRWXG|S_IRWXO) +static int llu_iop_rename_raw(struct pnode *old, struct pnode *new) +{ + struct inode *src = old->p_parent->p_base->pb_ino; + struct inode *tgt = new->p_parent->p_base->pb_ino; + const char *oldname = old->p_base->pb_name.name; + int oldnamelen = old->p_base->pb_name.len; + const char *newname = new->p_base->pb_name.name; + int newnamelen = new->p_base->pb_name.len; + struct ptlrpc_request *request = NULL; + struct mdc_op_data op_data; + int rc; + ENTRY; + + LASSERT(src); + LASSERT(tgt); + + llu_prepare_mdc_op_data(&op_data, src, tgt, NULL, 0, 0); + rc = mdc_rename(llu_i2sbi(src)->ll_mdc_exp, &op_data, + oldname, oldnamelen, newname, newnamelen, + &request); + if (!rc) { + rc = llu_objects_destroy(request, src); + } + + ptlrpc_req_finished(request); + + RETURN(rc); +} + +#ifdef _HAVE_STATVFS +static int llu_statfs_internal(struct llu_sb_info *sbi, + struct obd_statfs *osfs, + unsigned long max_age) +{ + struct obd_statfs obd_osfs; + int rc; + ENTRY; + + rc = obd_statfs(class_exp2obd(sbi->ll_mdc_exp), osfs, max_age); + if (rc) { + CERROR("mdc_statfs fails: rc = %d\n", rc); + RETURN(rc); + } + + CDEBUG(D_SUPER, "MDC blocks "LPU64"/"LPU64" objects "LPU64"/"LPU64"\n", + osfs->os_bavail, osfs->os_blocks, osfs->os_ffree,osfs->os_files); + + rc = obd_statfs(class_exp2obd(sbi->ll_osc_exp), &obd_osfs, max_age); + if (rc) { + CERROR("obd_statfs fails: rc = %d\n", rc); + RETURN(rc); + } + + CDEBUG(D_SUPER, "OSC blocks "LPU64"/"LPU64" objects "LPU64"/"LPU64"\n", + obd_osfs.os_bavail, obd_osfs.os_blocks, obd_osfs.os_ffree, + obd_osfs.os_files); + + osfs->os_blocks = obd_osfs.os_blocks; + osfs->os_bfree = obd_osfs.os_bfree; + osfs->os_bavail = obd_osfs.os_bavail; + + /* If we don't have as many objects free on the OST as inodes + * on the MDS, we reduce the total number of inodes to + * compensate, so that the "inodes in use" number is correct. + */ + if (obd_osfs.os_ffree < osfs->os_ffree) { + osfs->os_files = (osfs->os_files - osfs->os_ffree) + + obd_osfs.os_ffree; + osfs->os_ffree = obd_osfs.os_ffree; + } + + RETURN(rc); +} + +static int llu_statfs(struct llu_sb_info *sbi, struct statfs *sfs) +{ + struct obd_statfs osfs; + int rc; + + CDEBUG(D_VFSTRACE, "VFS Op:\n"); + + /* For now we will always get up-to-date statfs values, but in the + * future we may allow some amount of caching on the client (e.g. + * from QOS or lprocfs updates). */ + rc = llu_statfs_internal(sbi, &osfs, jiffies - 1); + if (rc) + return rc; + + statfs_unpack(sfs, &osfs); + + if (sizeof(sfs->f_blocks) == 4) { + while (osfs.os_blocks > ~0UL) { + sfs->f_bsize <<= 1; + + osfs.os_blocks >>= 1; + osfs.os_bfree >>= 1; + osfs.os_bavail >>= 1; + } + } + + sfs->f_blocks = osfs.os_blocks; + sfs->f_bfree = osfs.os_bfree; + sfs->f_bavail = osfs.os_bavail; + + return 0; +} + +static int llu_iop_statvfs(struct pnode *pno, + struct inode *ino, + struct intnl_statvfs *buf) +{ + struct statfs fs; + int rc; + ENTRY; + + liblustre_wait_event(0); + +#ifndef __CYGWIN__ + LASSERT(pno->p_base->pb_ino); + rc = llu_statfs(llu_i2sbi(pno->p_base->pb_ino), &fs); + if (rc) + RETURN(rc); + + /* from native driver */ + buf->f_bsize = fs.f_bsize; /* file system block size */ + buf->f_frsize = fs.f_bsize; /* file system fundamental block size */ + buf->f_blocks = fs.f_blocks; + buf->f_bfree = fs.f_bfree; + buf->f_bavail = fs.f_bavail; + buf->f_files = fs.f_files; /* Total number serial numbers */ + buf->f_ffree = fs.f_ffree; /* Number free serial numbers */ + buf->f_favail = fs.f_ffree; /* Number free ser num for non-privileged*/ + buf->f_fsid = fs.f_fsid.__val[1]; + buf->f_flag = 0; /* No equiv in statfs; maybe use type? */ + buf->f_namemax = fs.f_namelen; #endif -static int llu_symlink2(struct inode *dir, const char *name, int len, - const char *tgt) + RETURN(0); +} +#endif /* _HAVE_STATVFS */ + +static int llu_iop_mkdir_raw(struct pnode *pno, mode_t mode) { + struct inode *dir = pno->p_base->pb_parent->pb_ino; + struct qstr *qstr = &pno->p_base->pb_name; + const char *name = qstr->name; + int len = qstr->len; struct ptlrpc_request *request = NULL; - time_t curtime = CURRENT_TIME; - struct llu_sb_info *sbi = llu_i2sbi(dir); struct llu_inode_info *lli = llu_i2info(dir); struct mdc_op_data op_data; int err = -EMLINK; ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%lu(%p)\n", + name, lli->lli_st_ino, lli->lli_st_generation, dir); - CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu,target=%s\n", - name, lli->lli_st_ino, tgt); - -#if 0 - if (dir->i_nlink >= EXT2_LINK_MAX) + if (lli->lli_st_nlink >= EXT2_LINK_MAX) RETURN(err); -#endif + + mode = (mode & (S_IRWXUGO|S_ISVTX) & ~current->fs->umask) | S_IFDIR; llu_prepare_mdc_op_data(&op_data, dir, NULL, name, len, 0); - err = mdc_create(&sbi->ll_mdc_conn, &op_data, - tgt, strlen(tgt) + 1, S_IFLNK | S_IRWXUGO, - current->fsuid, current->fsgid, curtime, 0, &request); + err = mdc_create(llu_i2sbi(dir)->ll_mdc_exp, &op_data, NULL, 0, mode, + current->fsuid, current->fsgid, 0, &request); ptlrpc_req_finished(request); RETURN(err); } -static int llu_iop_symlink(struct pnode *pno, const char *data) +static int llu_iop_rmdir_raw(struct pnode *pno) { struct inode *dir = pno->p_base->pb_parent->pb_ino; struct qstr *qstr = &pno->p_base->pb_name; + const char *name = qstr->name; + int len = qstr->len; + struct ptlrpc_request *request = NULL; + struct mdc_op_data op_data; + struct llu_inode_info *lli = llu_i2info(dir); int rc; - - LASSERT(dir); + ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%lu(%p)\n", + name, lli->lli_st_ino, lli->lli_st_generation, dir); - rc = llu_symlink2(dir, qstr->name, qstr->len, data); + llu_prepare_mdc_op_data(&op_data, dir, NULL, name, len, S_IFDIR); + rc = mdc_unlink(llu_i2sbi(dir)->ll_mdc_exp, &op_data, &request); + ptlrpc_req_finished(request); - return rc; + RETURN(rc); +} + +#ifdef O_DIRECT +#define FCNTL_FLMASK (O_APPEND|O_NONBLOCK|O_ASYNC|O_DIRECT) +#else +#define FCNTL_FLMASK (O_APPEND|O_NONBLOCK|O_ASYNC) +#endif +#define FCNTL_FLMASK_INVALID (O_NONBLOCK|O_ASYNC) + +static int llu_iop_fcntl(struct inode *ino, int cmd, va_list ap, int *rtn) +{ + struct llu_inode_info *lli = llu_i2info(ino); + long flags; + + switch (cmd) { + case F_GETFL: + *rtn = lli->lli_open_flags; + return 0; + case F_SETFL: + flags = va_arg(ap, long); + flags &= FCNTL_FLMASK; + if (flags & FCNTL_FLMASK_INVALID) { + CERROR("liblustre don't support O_NONBLOCK, O_ASYNC, " + "and O_DIRECT on file descriptor\n"); + *rtn = -1; + return EINVAL; + } + lli->lli_open_flags = (int) flags; + *rtn = 0; + return 0; + } + + CERROR("unsupported fcntl cmd %x\n", cmd); + *rtn = -1; + return ENOSYS; +} + +static int llu_get_grouplock(struct inode *inode, unsigned long arg) +{ + struct llu_inode_info *lli = llu_i2info(inode); + struct ll_file_data *fd = lli->lli_file_data; + ldlm_policy_data_t policy = { .l_extent = { .start = 0, + .end = OBD_OBJECT_EOF}}; + struct lustre_handle lockh = { 0 }; + struct lov_stripe_md *lsm = lli->lli_smd; + ldlm_error_t err; + int flags = 0; + ENTRY; + + if (fd->fd_flags & LL_FILE_GROUP_LOCKED) { + RETURN(-EINVAL); + } + + policy.l_extent.gid = arg; + if (lli->lli_open_flags & O_NONBLOCK) + flags = LDLM_FL_BLOCK_NOWAIT; + + err = llu_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, + flags); + if (err) + RETURN(err); + + fd->fd_flags |= LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK; + fd->fd_gid = arg; + memcpy(&fd->fd_cwlockh, &lockh, sizeof(lockh)); + + RETURN(0); +} + +static int llu_put_grouplock(struct inode *inode, unsigned long arg) +{ + struct llu_inode_info *lli = llu_i2info(inode); + struct ll_file_data *fd = lli->lli_file_data; + struct lov_stripe_md *lsm = lli->lli_smd; + ldlm_error_t err; + ENTRY; + + if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) + RETURN(-EINVAL); + + if (fd->fd_gid != arg) + RETURN(-EINVAL); + + fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK); + + err = llu_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh); + if (err) + RETURN(err); + + fd->fd_gid = 0; + memset(&fd->fd_cwlockh, 0, sizeof(fd->fd_cwlockh)); + + RETURN(0); +} + +static int llu_iop_ioctl(struct inode *ino, unsigned long int request, + va_list ap) +{ + unsigned long arg; + + liblustre_wait_event(0); + + switch (request) { + case LL_IOC_GROUP_LOCK: + arg = va_arg(ap, unsigned long); + return llu_get_grouplock(ino, arg); + case LL_IOC_GROUP_UNLOCK: + arg = va_arg(ap, unsigned long); + return llu_put_grouplock(ino, arg); + } + + CERROR("did not support ioctl cmd %lx\n", request); + return -ENOSYS; +} + +/* + * we already do syncronous read/write + */ +static int llu_iop_sync(struct inode *inode) +{ + liblustre_wait_event(0); + return 0; +} + +static int llu_iop_datasync(struct inode *inode) +{ + liblustre_wait_event(0); + return 0; } struct filesys_ops llu_filesys_ops = @@ -608,31 +1350,47 @@ struct filesys_ops llu_filesys_ops = fsop_gone: llu_fsop_gone, }; +struct inode *llu_iget(struct filesys *fs, struct lustre_md *md) +{ + struct inode *inode; + struct ll_fid fid; + struct file_identifier fileid = {&fid, sizeof(fid)}; + + if ((md->body->valid & + (OBD_MD_FLGENER | OBD_MD_FLID | OBD_MD_FLTYPE)) != + (OBD_MD_FLGENER | OBD_MD_FLID | OBD_MD_FLTYPE)) { + CERROR("bad md body valid mask 0x%x\n", md->body->valid); + LBUG(); + return ERR_PTR(-EPERM); + } -static struct inode_ops llu_inode_ops = { - inop_lookup: llu_iop_lookup, - inop_getattr: llu_iop_getattr, - inop_setattr: llu_iop_setattr, - inop_getdirentries: NULL, - inop_mkdir: llu_iop_mkdir, - inop_rmdir: NULL, - inop_symlink: llu_iop_symlink, - inop_readlink: NULL, - inop_open: llu_iop_open, - inop_close: llu_iop_close, - inop_unlink: NULL, - inop_ipreadv: llu_iop_ipreadv, - inop_ipwritev: llu_iop_ipwritev, - inop_iodone: llu_iop_iodone, - inop_fcntl: NULL, - inop_sync: NULL, - inop_datasync: NULL, - inop_ioctl: NULL, - inop_mknod: NULL, - inop_statvfs: NULL, - inop_gone: llu_iop_gone, -}; + /* try to find existing inode */ + fid.id = md->body->ino; + fid.generation = md->body->generation; + fid.f_type = md->body->mode & S_IFMT; + inode = _sysio_i_find(fs, &fileid); + if (inode) { + struct llu_inode_info *lli = llu_i2info(inode); + + if (inode->i_zombie || + lli->lli_st_generation != md->body->generation) { + I_RELE(inode); + } + else { + llu_update_inode(inode, md->body, md->lsm); + return inode; + } + } + + inode = llu_new_inode(fs, &fid); + if (inode) + llu_update_inode(inode, md->body, md->lsm); + + return inode; +} + +extern struct list_head lustre_profile_list; static int llu_fsswop_mount(const char *source, @@ -644,22 +1402,24 @@ llu_fsswop_mount(const char *source, struct filesys *fs; struct inode *root; struct pnode_base *rootpb; - static struct qstr noname = { NULL, 0, 0 }; + struct obd_device *obd; struct ll_fid rootfid; - struct llu_sb_info *sbi; - struct ptlrpc_connection *mdc_conn; + struct obd_statfs osfs; + static struct qstr noname = { NULL, 0, 0 }; struct ptlrpc_request *request = NULL; - struct mds_body *root_body; - struct obd_uuid param_uuid; + struct lustre_handle mdc_conn = {0, }; + struct lustre_handle osc_conn = {0, }; + struct lustre_md md; class_uuid_t uuid; - struct obd_device *obd; - char *osc=mount_option.osc_uuid; - char *mdc=mount_option.mdc_uuid; - int err = -EINVAL; + struct config_llog_instance cfg; + struct lustre_profile *lprof; + char *osc = NULL, *mdc = NULL; + int async = 1, err = -EINVAL; ENTRY; + /* allocate & initialize sbi */ OBD_ALLOC(sbi, sizeof(*sbi)); if (!sbi) RETURN(-ENOMEM); @@ -668,6 +1428,48 @@ llu_fsswop_mount(const char *source, generate_random_uuid(uuid); class_uuid_unparse(uuid, &sbi->ll_sb_uuid); + /* generate a string unique to this super, let's try + the address of the super itself.*/ + OBD_ALLOC(sbi->ll_instance, sizeof(sbi) * 2 + 1); + if (sbi->ll_instance == NULL) + GOTO(out_free, err = -ENOMEM); + sprintf(sbi->ll_instance, "%p", sbi); + + /* retrive & parse config log */ + cfg.cfg_instance = sbi->ll_instance; + cfg.cfg_uuid = sbi->ll_sb_uuid; + err = liblustre_process_log(&cfg, 1); + if (err < 0) { + CERROR("Unable to process log: %s\n", g_zconf_profile); + GOTO(out_free, err); + } + + lprof = class_get_profile(g_zconf_profile); + if (lprof == NULL) { + CERROR("No profile found: %s\n", g_zconf_profile); + GOTO(out_free, err = -EINVAL); + } + if (osc) + OBD_FREE(osc, strlen(osc) + 1); + OBD_ALLOC(osc, strlen(lprof->lp_osc) + + strlen(sbi->ll_instance) + 2); + sprintf(osc, "%s-%s", lprof->lp_osc, sbi->ll_instance); + + if (mdc) + OBD_FREE(mdc, strlen(mdc) + 1); + OBD_ALLOC(mdc, strlen(lprof->lp_mdc) + + strlen(sbi->ll_instance) + 2); + sprintf(mdc, "%s-%s", lprof->lp_mdc, sbi->ll_instance); + + if (!osc) { + CERROR("no osc\n"); + GOTO(out_free, err = -EINVAL); + } + if (!mdc) { + CERROR("no mdc\n"); + GOTO(out_free, err = -EINVAL); + } + fs = _sysio_fs_new(&llu_filesys_ops, flags, sbi); if (!fs) { err = -ENOMEM; @@ -677,65 +1479,78 @@ llu_fsswop_mount(const char *source, obd = class_name2obd(mdc); if (!obd) { CERROR("MDC %s: not setup or attached\n", mdc); - err = -EINVAL; - goto out_free; + GOTO(out_free, err = -EINVAL); } - + obd_set_info(obd->obd_self_export, strlen("async"), "async", + sizeof(async), &async); +#warning "FIXME ASAP!" +#if 0 + if (mdc_init_ea_size(obd, osc)) + GOTO(out_free, err = -EINVAL); +#endif /* setup mdc */ - /* FIXME need recover stuff */ - err = obd_connect(&sbi->ll_mdc_conn, obd, &sbi->ll_sb_uuid); + err = obd_connect(&mdc_conn, obd, &sbi->ll_sb_uuid, 0); if (err) { CERROR("cannot connect to %s: rc = %d\n", mdc, err); - goto out_free; + GOTO(out_free, err); } + sbi->ll_mdc_exp = class_conn2export(&mdc_conn); + + err = obd_statfs(obd, &osfs, 100000000); + if (err) + GOTO(out_mdc, err); - mdc_conn = sbi2mdc(sbi)->cl_import->imp_connection; + /* + * FIXME fill fs stat data into sbi here!!! FIXME + */ /* setup osc */ obd = class_name2obd(osc); if (!obd) { CERROR("OSC %s: not setup or attached\n", osc); - err = -EINVAL; - goto out_mdc; + GOTO(out_mdc, err = -EINVAL); } + obd_set_info(obd->obd_self_export, strlen("async"), "async", + sizeof(async), &async); - err = obd_connect(&sbi->ll_osc_conn, obd, &sbi->ll_sb_uuid); + err = obd_connect(&osc_conn, obd, &sbi->ll_sb_uuid, 0); if (err) { CERROR("cannot connect to %s: rc = %d\n", osc, err); - goto out_mdc; + GOTO(out_mdc, err); } + sbi->ll_osc_exp = class_conn2export(&osc_conn); - err = mdc_getstatus(&sbi->ll_mdc_conn, &rootfid); + err = mdc_getstatus(sbi->ll_mdc_exp, &rootfid); if (err) { CERROR("cannot mds_connect: rc = %d\n", err); - goto out_osc; + GOTO(out_osc, err); } CDEBUG(D_SUPER, "rootfid "LPU64"\n", rootfid.id); sbi->ll_rootino = rootfid.id; -/* XXX do we need this?? - memset(&osfs, 0, sizeof(osfs)); - rc = obd_statfs(&sbi->ll_mdc_conn, &osfs); -*/ /* fetch attr of root inode */ - err = mdc_getattr(&sbi->ll_mdc_conn, &rootfid, + err = mdc_getattr(sbi->ll_mdc_exp, &rootfid, OBD_MD_FLNOTOBD|OBD_MD_FLBLOCKS, 0, &request); if (err) { CERROR("mdc_getattr failed for root: rc = %d\n", err); - goto out_request; + GOTO(out_osc, err); + } + + err = mdc_req2lustre_md(sbi->ll_mdc_exp, request, 0, sbi->ll_osc_exp, + &md); + if (err) { + CERROR("failed to understand root inode md: rc = %d\n",err); + GOTO(out_request, err); } - root_body = lustre_msg_buf(request->rq_repmsg, 0, sizeof(*root_body)); LASSERT(sbi->ll_rootino != 0); - root = llu_new_inode(fs, root_body->ino, root_body->mode); - if (!root) { - err = -ENOMEM; - goto out_request; + root = llu_iget(fs, &md); + if (!root || IS_ERR(root)) { + CERROR("fail to generate root inode\n"); + GOTO(out_request, err = -EBADF); } - llu_update_inode(root, root_body, NULL); - /* * Generate base path-node for root. */ @@ -745,18 +1560,15 @@ llu_fsswop_mount(const char *source, goto out_inode; } - err = _sysio_do_mount(fs, rootpb, flags, NULL, mntp); + err = _sysio_do_mount(fs, rootpb, flags, tocover, mntp); if (err) { _sysio_pb_gone(rootpb); goto out_inode; } ptlrpc_req_finished(request); - request = NULL; - printf("************************************************\n"); - printf("* Mount successfully!!!!!!! *\n"); - printf("************************************************\n"); + printf("LibLustre: namespace mounted successfully!\n"); return 0; @@ -765,9 +1577,9 @@ out_inode: out_request: ptlrpc_req_finished(request); out_osc: - obd_disconnect(&sbi->ll_osc_conn); + obd_disconnect(sbi->ll_osc_exp, 0); out_mdc: - obd_disconnect(&sbi->ll_mdc_conn); + obd_disconnect(sbi->ll_mdc_exp, 0); out_free: OBD_FREE(sbi, sizeof(*sbi)); return err; @@ -777,3 +1589,31 @@ struct fssw_ops llu_fssw_ops = { llu_fsswop_mount }; +static struct inode_ops llu_inode_ops = { + inop_lookup: llu_iop_lookup, + inop_getattr: llu_iop_getattr, + inop_setattr: llu_iop_setattr, + inop_getdirentries: llu_iop_getdirentries, + inop_mkdir: llu_iop_mkdir_raw, + inop_rmdir: llu_iop_rmdir_raw, + inop_symlink: llu_iop_symlink_raw, + inop_readlink: llu_iop_readlink, + inop_open: llu_iop_open, + inop_close: llu_iop_close, + inop_link: llu_iop_link_raw, + inop_unlink: llu_iop_unlink_raw, + inop_rename: llu_iop_rename_raw, + inop_pos: llu_iop_pos, + inop_read: llu_iop_read, + inop_write: llu_iop_write, + inop_iodone: llu_iop_iodone, + inop_fcntl: llu_iop_fcntl, + inop_sync: llu_iop_sync, + inop_datasync: llu_iop_datasync, + inop_ioctl: llu_iop_ioctl, + inop_mknod: llu_iop_mknod_raw, +#ifdef _HAVE_STATVFS + inop_statvfs: llu_iop_statvfs, +#endif + inop_gone: llu_iop_gone, +};