X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Fmds%2Fmds_lib.c;h=bc0fb6f73fbd905e7bbb0943f3cb0896cc950c23;hb=02be224ce61aa34c95d5c6323027de99d4485e6b;hp=93ac3007ad0e80b8e716d76828890777882fa693;hpb=a2a0746305449dbd925879b14dc2c0d6040bb8bf;p=fs%2Flustre-release.git diff --git a/lustre/mds/mds_lib.c b/lustre/mds/mds_lib.c index 93ac300..bc0fb6f 100644 --- a/lustre/mds/mds_lib.c +++ b/lustre/mds/mds_lib.c @@ -44,30 +44,337 @@ #include #include #include +#include #include #include -#include -#include +#include +#include +#include +#include -void mds_pack_inode2fid(struct ll_fid *fid, struct inode *inode) +#include "mds_internal.h" + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,4) +struct group_info *groups_alloc(int ngroups) { - fid->id = inode->i_ino; - fid->generation = inode->i_generation; - fid->f_type = (S_IFMT & inode->i_mode); + struct group_info *ginfo; + + LASSERT(ngroups <= NGROUPS_SMALL); + + OBD_ALLOC(ginfo, sizeof(*ginfo) + 1 * sizeof(gid_t *)); + if (!ginfo) + return NULL; + ginfo->ngroups = ngroups; + ginfo->nblocks = 1; + ginfo->blocks[0] = ginfo->small_block; + atomic_set(&ginfo->usage, 1); + + return ginfo; } -/* Note that we can copy all of the fields, just some will not be "valid" */ -void mds_pack_inode2body(struct mds_body *b, struct inode *inode) +void groups_free(struct group_info *ginfo) +{ + LASSERT(ginfo->ngroups <= NGROUPS_SMALL); + LASSERT(ginfo->nblocks == 1); + LASSERT(ginfo->blocks[0] == ginfo->small_block); + + OBD_FREE(ginfo, sizeof(*ginfo) + 1 * sizeof(gid_t *)); +} + +/* for 2.4 the group number is small, so simply search the + * whole array. + */ +int groups_search(struct group_info *ginfo, gid_t grp) +{ + int i; + + if (!ginfo) + return 0; + + for (i = 0; i < ginfo->ngroups; i++) + if (GROUP_AT(ginfo, i) == grp) + return 1; + return 0; +} + +#else /* >= 2.6.4 */ + +void groups_sort(struct group_info *ginfo) +{ + int base, max, stride; + int gidsetsize = ginfo->ngroups; + + for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1) + ; /* nothing */ + stride /= 3; + + while (stride) { + max = gidsetsize - stride; + for (base = 0; base < max; base++) { + int left = base; + int right = left + stride; + gid_t tmp = GROUP_AT(ginfo, right); + + while (left >= 0 && GROUP_AT(ginfo, left) > tmp) { + GROUP_AT(ginfo, right) = + GROUP_AT(ginfo, left); + right = left; + left -= stride; + } + GROUP_AT(ginfo, right) = tmp; + } + stride /= 3; + } +} + +int groups_search(struct group_info *ginfo, gid_t grp) +{ + int left, right; + + if (!ginfo) + return 0; + + left = 0; + right = ginfo->ngroups; + while (left < right) { + int mid = (left + right) / 2; + int cmp = grp - GROUP_AT(ginfo, mid); + if (cmp > 0) + left = mid + 1; + else if (cmp < 0) + right = mid; + else + return 1; + } + return 0; +} +#endif + +void groups_from_buffer(struct group_info *ginfo, __u32 *gids) +{ + int i, ngroups = ginfo->ngroups; + + for (i = 0; i < ginfo->nblocks; i++) { + int count = min(NGROUPS_PER_BLOCK, ngroups); + + memcpy(ginfo->blocks[i], gids, count * sizeof(__u32)); + gids += NGROUPS_PER_BLOCK; + ngroups -= count; + } +} + +void mds_pack_dentry2id(struct obd_device *obd, + struct lustre_id *id, + struct dentry *dentry, + int fid) +{ + id_ino(id) = dentry->d_inum; + id_gen(id) = dentry->d_generation; + + if (fid) { + id_fid(id) = dentry->d_fid; + id_group(id) = dentry->d_mdsnum; + } +} + +void mds_pack_dentry2body(struct obd_device *obd, + struct mds_body *b, + struct dentry *dentry, + int fid) { - b->valid = OBD_MD_FLID | OBD_MD_FLCTIME | OBD_MD_FLUID | OBD_MD_FLGID | - OBD_MD_FLTYPE | OBD_MD_FLMODE | OBD_MD_FLNLINK | OBD_MD_FLGENER; + b->valid |= OBD_MD_FLID | OBD_MD_FLGENER | + OBD_MD_MDS; - if (!S_ISREG(inode->i_mode)) - b->valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS | OBD_MD_FLATIME | - OBD_MD_FLMTIME; + if (fid) + b->valid |= OBD_MD_FID; + + mds_pack_dentry2id(obd, &b->id1, dentry, fid); +} + +int mds_pack_inode2id(struct obd_device *obd, + struct lustre_id *id, + struct inode *inode, + int fid) +{ + int rc = 0; + ENTRY; + + if (fid) { + /* we have to avoid deadlock. */ + if (!down_trylock(&inode->i_sem)) { + rc = mds_read_inode_sid(obd, inode, id); + up(&inode->i_sem); + } else { + rc = mds_read_inode_sid(obd, inode, id); + } + } + + if (rc == 0) { + id_ino(id) = inode->i_ino; + id_gen(id) = inode->i_generation; + id_type(id) = (S_IFMT & inode->i_mode); + } + RETURN(rc); +} + +void mds_inode2id(struct obd_device *obd, struct lustre_id *id, + struct inode *inode, __u64 fid) +{ + struct mds_obd *mds = &obd->u.mds; + ENTRY; + + LASSERT(inode != NULL); + LASSERT(id != NULL); + LASSERT(fid != 0); + + id_fid(id) = fid; + id_ino(id) = inode->i_ino; + id_group(id) = mds->mds_num; + id_gen(id) = inode->i_generation; + id_type(id) = (S_IFMT & inode->i_mode); + + EXIT; +} + +int mds_pack_gskey(struct obd_device *obd, struct lustre_msg *repmsg, + int *offset, struct mds_body *body, struct inode *inode) +{ + struct crypto_key_md *md_key; + struct crypto_key *ckey; + __u32 buflen, *sizep; + void *buf; + int size, rc = 0; + ENTRY; + + sizep = lustre_msg_buf(repmsg, (*offset)++, 4); + if (!sizep) { + CERROR("can't locate returned ckey size buf\n"); + RETURN(-EPROTO); + } + *sizep = cpu_to_le32(sizeof(*ckey)); + + OBD_ALLOC(md_key, sizeof(*md_key)); + if (!md_key) + RETURN(-ENOMEM); + + buflen = repmsg->buflens[*offset]; + buf = lustre_msg_buf(repmsg, (*offset)++, buflen); - b->ino = inode->i_ino; + size = fsfilt_get_md(obd, inode, md_key, sizeof(*md_key), + EA_KEY); + if (size <= 0) { + if (size < 0) + CERROR("Can not get gskey from MDS ino %lu rc %d\n", + inode->i_ino, size); + GOTO(out, rc = size); + } + if (le32_to_cpu(md_key->md_magic) != MD_KEY_MAGIC) { + CDEBUG(D_INFO, "given match %x != magic %x\n", + md_key->md_magic, MD_KEY_MAGIC); + GOTO(out, rc = 0); + } + + CDEBUG(D_INFO, "get key %s mac %s for ino %lu size %d \n", + md_key->md_ck.ck_key, md_key->md_ck.ck_mac, inode->i_ino, size); + ckey=(struct crypto_key*)buf; + + memcpy(ckey, &md_key->md_ck, sizeof(*ckey)); + body->valid |= OBD_MD_FLKEY; +out: + OBD_FREE(md_key, sizeof(*md_key)); + RETURN(rc); +} + +static int mds_get_gskey(struct inode *inode, struct crypto_key *ckey) +{ + LASSERT(ckey); + /*tmp create gs key here*/ + LASSERT(ckey->ck_type == MKS_TYPE); + get_random_bytes(ckey->ck_key, KEY_SIZE); + RETURN(0); +} + +int mds_set_gskey(struct obd_device *obd, void *handle, + struct inode *inode, void *key, int key_len, + int valid) +{ + struct crypto_key_md *md_key = NULL; + struct crypto_key *ckey = (struct crypto_key *)key; + int rc = 0; + ENTRY; + + if (!ckey) + RETURN(0); + + LASSERT(ckey->ck_type == MKS_TYPE || ckey->ck_type == GKS_TYPE); + + OBD_ALLOC(md_key, sizeof(*md_key)); + if (ckey->ck_type == MKS_TYPE) + mds_get_gskey(inode, ckey); + + rc = fsfilt_get_md(obd, inode, md_key, sizeof(*md_key), + EA_KEY); + if (rc < 0) + GOTO(free, rc); + LASSERT(le32_to_cpu(md_key->md_magic) == MD_KEY_MAGIC || + md_key->md_magic == 0); + + if (le32_to_cpu(md_key->md_magic) == MD_KEY_MAGIC) { + CDEBUG(D_INFO, "reset key %s mac %s", md_key->md_ck.ck_mac, + md_key->md_ck.ck_key); + } + + md_key->md_magic = cpu_to_le32(MD_KEY_MAGIC); + /*get key and mac from request buffer*/ + if (valid & ATTR_MAC) { + memcpy(md_key->md_ck.ck_mac, ckey->ck_mac, MAC_SIZE); + CDEBUG(D_INFO, "set mac %s for ino %lu \n", + md_key->md_ck.ck_mac, inode->i_ino); + } + if (valid & ATTR_KEY) { + memcpy(md_key->md_ck.ck_key, ckey->ck_key, KEY_SIZE); + CDEBUG(D_INFO, "set key %s for ino %lu \n", + md_key->md_ck.ck_key, inode->i_ino); + } + rc = fsfilt_set_md(obd, inode, handle, md_key, sizeof(*md_key), EA_KEY); +free: + if (md_key) + OBD_FREE(md_key, sizeof(*md_key)); + RETURN(rc); +} + +int mds_set_crypto_type(struct obd_device *obd, void *val, __u32 vallen) +{ + struct mds_obd *mds = &obd->u.mds; + ENTRY; + if (vallen >= strlen("mks") && + memcmp(val, "mks", vallen) == 0) { + mds->mds_crypto_type = MKS_TYPE; + CDEBUG(D_IOCTL, "mks type\n"); + } + if (vallen >= strlen("gks") && + memcmp(val, "gks", vallen) == 0) { + mds->mds_crypto_type = GKS_TYPE; + CDEBUG(D_IOCTL, "gks type \n"); + } + RETURN(0); +} + +/* Note that we can copy all of the fields, just some will not be "valid" */ +void mds_pack_inode2body(struct obd_device *obd, struct mds_body *b, + struct inode *inode, int fid) +{ + b->valid |= OBD_MD_FLID | OBD_MD_FLCTIME | OBD_MD_FLUID | + OBD_MD_FLGID | OBD_MD_FLFLAGS | OBD_MD_FLTYPE | + OBD_MD_FLMODE | OBD_MD_FLNLINK | OBD_MD_FLGENER | + OBD_MD_FLATIME | OBD_MD_FLMTIME; /* bug 2020 */ + + if (!S_ISREG(inode->i_mode)) { + b->valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS | + OBD_MD_FLATIME | OBD_MD_FLMTIME | + OBD_MD_FLRDEV; + } b->atime = LTIME_S(inode->i_atime); b->mtime = LTIME_S(inode->i_mtime); b->ctime = LTIME_S(inode->i_ctime); @@ -77,11 +384,21 @@ void mds_pack_inode2body(struct mds_body *b, struct inode *inode) b->uid = inode->i_uid; b->gid = inode->i_gid; b->flags = inode->i_flags; - b->rdev = b->rdev; + b->rdev = inode->i_rdev; + /* Return the correct link count for orphan inodes */ - b->nlink = mds_inode_is_orphan(inode) ? 0 : inode->i_nlink; - b->generation = inode->i_generation; - b->suppgid = -1; + if (mds_inode_is_orphan(inode)) { + b->nlink = 0; + } else if (S_ISDIR(inode->i_mode)) { + b->nlink = 1; + } else { + b->nlink = inode->i_nlink; + } + + if (fid) + b->valid |= OBD_MD_FID; + + mds_pack_inode2id(obd, &b->id1, inode, fid); } /* unpacking */ @@ -97,12 +414,8 @@ static int mds_setattr_unpack(struct ptlrpc_request *req, int offset, if (rec == NULL) RETURN (-EFAULT); - r->ur_fsuid = rec->sa_fsuid; - r->ur_fsgid = rec->sa_fsgid; - r->ur_cap = rec->sa_cap; - r->ur_suppgid1 = rec->sa_suppgid; - r->ur_suppgid2 = -1; - r->ur_fid1 = &rec->sa_fid; + r->ur_id1 = &rec->sa_id; + r->ur_flags = rec->sa_flags; attr->ia_valid = rec->sa_valid; attr->ia_mode = rec->sa_mode; attr->ia_uid = rec->sa_uid; @@ -113,21 +426,29 @@ static int mds_setattr_unpack(struct ptlrpc_request *req, int offset, LTIME_S(attr->ia_ctime) = rec->sa_ctime; attr->ia_attr_flags = rec->sa_attr_flags; - LASSERT_REQSWAB (req, offset + 1); + LASSERT_REQSWAB(req, offset + 1); if (req->rq_reqmsg->bufcount > offset + 1) { - r->ur_eadata = lustre_msg_buf (req->rq_reqmsg, - offset + 1, 0); + r->ur_eadata = lustre_msg_buf(req->rq_reqmsg, + offset + 1, 0); if (r->ur_eadata == NULL) RETURN (-EFAULT); r->ur_eadatalen = req->rq_reqmsg->buflens[offset + 1]; } if (req->rq_reqmsg->bufcount > offset + 2) { - r->ur_logcookies = lustre_msg_buf(req->rq_reqmsg, offset + 2,0); - if (r->ur_eadata == NULL) + r->ur_ea2data = lustre_msg_buf(req->rq_reqmsg, offset + 2, 0); + if (r->ur_ea2data == NULL) RETURN (-EFAULT); - r->ur_cookielen = req->rq_reqmsg->buflens[offset + 2]; + r->ur_ea2datalen = req->rq_reqmsg->buflens[offset + 2]; + } + + if (req->rq_reqmsg->bufcount > offset + 3) { + r->ur_ea3data = lustre_msg_buf(req->rq_reqmsg, offset + 3, 0); + if (r->ur_ea3data == NULL) + RETURN (-EFAULT); + + r->ur_ea3datalen = req->rq_reqmsg->buflens[offset + 3]; } RETURN(0); @@ -139,44 +460,52 @@ static int mds_create_unpack(struct ptlrpc_request *req, int offset, struct mds_rec_create *rec; ENTRY; - rec = lustre_swab_reqbuf (req, offset, sizeof (*rec), - lustre_swab_mds_rec_create); + rec = lustre_swab_reqbuf(req, offset, sizeof(*rec), + lustre_swab_mds_rec_create); if (rec == NULL) RETURN (-EFAULT); - r->ur_fsuid = rec->cr_fsuid; - r->ur_fsgid = rec->cr_fsgid; - r->ur_cap = rec->cr_cap; - r->ur_fid1 = &rec->cr_fid; - r->ur_fid2 = &rec->cr_replayfid; + r->ur_id1 = &rec->cr_id; + r->ur_id2 = &rec->cr_replayid; r->ur_mode = rec->cr_mode; r->ur_rdev = rec->cr_rdev; - r->ur_uid = rec->cr_uid; - r->ur_gid = rec->cr_gid; r->ur_time = rec->cr_time; r->ur_flags = rec->cr_flags; - r->ur_suppgid1 = rec->cr_suppgid; - r->ur_suppgid2 = -1; - LASSERT_REQSWAB (req, offset + 1); - r->ur_name = lustre_msg_string (req->rq_reqmsg, offset + 1, 0); + LASSERT_REQSWAB(req, offset + 1); + r->ur_name = lustre_msg_string(req->rq_reqmsg, offset + 1, 0); if (r->ur_name == NULL) - RETURN (-EFAULT); + RETURN(-EFAULT); r->ur_namelen = req->rq_reqmsg->buflens[offset + 1]; - LASSERT_REQSWAB (req, offset + 2); + LASSERT_REQSWAB(req, offset + 2); if (req->rq_reqmsg->bufcount > offset + 2) { - /* NB for now, we only seem to pass NULL terminated symlink - * target strings here. If this ever changes, we'll have - * to stop checking for a buffer filled completely with a - * NULL terminated string here, and make the callers check - * depending on what they expect. We should probably stash - * it in r->ur_eadata in that case, so it's obvious... -eeb - */ - r->ur_tgt = lustre_msg_string(req->rq_reqmsg, offset + 2, 0); - if (r->ur_tgt == NULL) - RETURN (-EFAULT); - r->ur_tgtlen = req->rq_reqmsg->buflens[offset + 2]; + if (S_ISLNK(r->ur_mode)) { + r->ur_tgt = lustre_msg_string(req->rq_reqmsg, + offset + 2, 0); + if (r->ur_tgt == NULL) + RETURN(-EFAULT); + r->ur_tgtlen = req->rq_reqmsg->buflens[offset + 2]; + } else if (S_ISDIR(r->ur_mode) ) { + /* Stripe info for mkdir - just a 16bit integer */ + if (req->rq_reqmsg->buflens[offset + 2] != 2) { + CERROR("mkdir stripe info does not match " + "expected size %d vs 2\n", + req->rq_reqmsg->buflens[offset + 2]); + RETURN(-EINVAL); + } + r->ur_eadata = lustre_swab_buf(req->rq_reqmsg, + offset + 2, 2, + __swab16s); + r->ur_eadatalen = req->rq_reqmsg->buflens[offset + 2]; + } else if (S_ISREG(r->ur_mode)){ + r->ur_eadata = lustre_msg_buf(req->rq_reqmsg, + offset + 2, 0); + r->ur_eadatalen = req->rq_reqmsg->buflens[offset + 2]; + } else { + /* Hm, no other users so far? */ + LBUG(); + } } RETURN(0); } @@ -187,23 +516,20 @@ static int mds_link_unpack(struct ptlrpc_request *req, int offset, struct mds_rec_link *rec; ENTRY; - rec = lustre_swab_reqbuf (req, offset, sizeof (*rec), - lustre_swab_mds_rec_link); + rec = lustre_swab_reqbuf(req, offset, sizeof(*rec), + lustre_swab_mds_rec_link); if (rec == NULL) - RETURN (-EFAULT); + RETURN(-EFAULT); - r->ur_fsuid = rec->lk_fsuid; - r->ur_fsgid = rec->lk_fsgid; - r->ur_cap = rec->lk_cap; - r->ur_suppgid1 = rec->lk_suppgid1; - r->ur_suppgid2 = rec->lk_suppgid2; - r->ur_fid1 = &rec->lk_fid1; - r->ur_fid2 = &rec->lk_fid2; + r->ur_id1 = &rec->lk_id1; + r->ur_id2 = &rec->lk_id2; + r->ur_time = rec->lk_time; + r->ur_flags = rec->lk_flags; - LASSERT_REQSWAB (req, offset + 1); - r->ur_name = lustre_msg_string (req->rq_reqmsg, offset + 1, 0); + LASSERT_REQSWAB(req, offset + 1); + r->ur_name = lustre_msg_string(req->rq_reqmsg, offset + 1, 0); if (r->ur_name == NULL) - RETURN (-EFAULT); + RETURN(-EFAULT); r->ur_namelen = req->rq_reqmsg->buflens[offset + 1]; RETURN(0); } @@ -214,21 +540,18 @@ static int mds_unlink_unpack(struct ptlrpc_request *req, int offset, struct mds_rec_unlink *rec; ENTRY; - rec = lustre_swab_reqbuf (req, offset, sizeof (*rec), - lustre_swab_mds_rec_unlink); + rec = lustre_swab_reqbuf(req, offset, sizeof (*rec), + lustre_swab_mds_rec_unlink); if (rec == NULL) RETURN(-EFAULT); - r->ur_fsuid = rec->ul_fsuid; - r->ur_fsgid = rec->ul_fsgid; - r->ur_cap = rec->ul_cap; r->ur_mode = rec->ul_mode; - r->ur_suppgid1 = rec->ul_suppgid; - r->ur_suppgid2 = -1; - r->ur_fid1 = &rec->ul_fid1; - r->ur_fid2 = &rec->ul_fid2; + r->ur_id1 = &rec->ul_id1; + r->ur_id2 = &rec->ul_id2; + r->ur_time = rec->ul_time; + r->ur_flags = rec->ul_flags; - LASSERT_REQSWAB (req, offset + 1); + LASSERT_REQSWAB(req, offset + 1); r->ur_name = lustre_msg_string(req->rq_reqmsg, offset + 1, 0); if (r->ur_name == NULL) RETURN(-EFAULT); @@ -242,26 +565,23 @@ static int mds_rename_unpack(struct ptlrpc_request *req, int offset, struct mds_rec_rename *rec; ENTRY; - rec = lustre_swab_reqbuf (req, offset, sizeof (*rec), - lustre_swab_mds_rec_unlink); + rec = lustre_swab_reqbuf(req, offset, sizeof (*rec), + lustre_swab_mds_rec_rename); if (rec == NULL) RETURN(-EFAULT); - r->ur_fsuid = rec->rn_fsuid; - r->ur_fsgid = rec->rn_fsgid; - r->ur_cap = rec->rn_cap; - r->ur_suppgid1 = rec->rn_suppgid1; - r->ur_suppgid2 = rec->rn_suppgid2; - r->ur_fid1 = &rec->rn_fid1; - r->ur_fid2 = &rec->rn_fid2; + r->ur_id1 = &rec->rn_id1; + r->ur_id2 = &rec->rn_id2; + r->ur_time = rec->rn_time; + r->ur_flags = rec->rn_flags; - LASSERT_REQSWAB (req, offset + 1); + LASSERT_REQSWAB(req, offset + 1); r->ur_name = lustre_msg_string(req->rq_reqmsg, offset + 1, 0); if (r->ur_name == NULL) RETURN(-EFAULT); r->ur_namelen = req->rq_reqmsg->buflens[offset + 1]; - LASSERT_REQSWAB (req, offset + 2); + LASSERT_REQSWAB(req, offset + 2); r->ur_tgt = lustre_msg_string(req->rq_reqmsg, offset + 2, 0); if (r->ur_tgt == NULL) RETURN(-EFAULT); @@ -269,6 +589,48 @@ static int mds_rename_unpack(struct ptlrpc_request *req, int offset, RETURN(0); } +static int mds_open_unpack(struct ptlrpc_request *req, int offset, + struct mds_update_record *r) +{ + struct mds_rec_create *rec; + ENTRY; + + rec = lustre_swab_reqbuf(req, offset, sizeof (*rec), + lustre_swab_mds_rec_create); + if (rec == NULL) + RETURN(-EFAULT); + + r->ur_id1 = &rec->cr_id; + r->ur_id2 = &rec->cr_replayid; + r->ur_mode = rec->cr_mode; + r->ur_rdev = rec->cr_rdev; + r->ur_time = rec->cr_time; + r->ur_flags = rec->cr_flags; + r->ur_ioepoch = rec->cr_ioepoch; + + LASSERT_REQSWAB(req, offset + 1); + r->ur_name = lustre_msg_string(req->rq_reqmsg, offset + 1, 0); + + if (r->ur_name == NULL) + RETURN(-EFAULT); + r->ur_namelen = req->rq_reqmsg->buflens[offset + 1]; + + LASSERT_REQSWAB(req, offset + 2); + if (req->rq_reqmsg->bufcount > offset + 2) { + r->ur_eadata = lustre_msg_buf(req->rq_reqmsg, offset + 2, 0); + if (r->ur_eadata == NULL) + RETURN(-EFAULT); + r->ur_eadatalen = req->rq_reqmsg->buflens[offset + 2]; + } + + if (rec->cr_flags & MDS_OPEN_HAS_KEY) { + LASSERT(req->rq_reqmsg->bufcount > offset + 3); + r->ur_ea2data = lustre_msg_buf(req->rq_reqmsg, offset + 3, 0); + r->ur_ea2datalen = req->rq_reqmsg->buflens[offset + 3]; + } + RETURN(0); +} + typedef int (*update_unpacker)(struct ptlrpc_request *req, int offset, struct mds_update_record *r); @@ -278,7 +640,7 @@ static update_unpacker mds_unpackers[REINT_MAX + 1] = { [REINT_LINK] mds_link_unpack, [REINT_UNLINK] mds_unlink_unpack, [REINT_RENAME] mds_rename_unpack, - [REINT_OPEN] mds_create_unpack, + [REINT_OPEN] mds_open_unpack, }; int mds_update_unpack(struct ptlrpc_request *req, int offset, @@ -289,24 +651,664 @@ int mds_update_unpack(struct ptlrpc_request *req, int offset, int rc; ENTRY; - /* NB don't lustre_swab_reqbuf() here. We're just taking a peek - * and we want to leave it to the specific unpacker once we've - * identified the message type */ - opcodep = lustre_msg_buf (req->rq_reqmsg, offset, sizeof (*opcodep)); + /* NB don't lustre_swab_reqbuf() here. We're just taking a peek and we + * want to leave it to the specific unpacker once we've identified the + * message type. */ + opcodep = lustre_msg_buf(req->rq_reqmsg, offset, sizeof(*opcodep)); if (opcodep == NULL) RETURN(-EFAULT); opcode = *opcodep; - if (lustre_msg_swabbed (req->rq_reqmsg)) - __swab32s (&opcode); + if (lustre_msg_swabbed(req->rq_reqmsg)) + __swab32s(&opcode); if (opcode > REINT_MAX || mds_unpackers[opcode] == NULL) { - CERROR ("Unexpected opcode %d\n", opcode); + CERROR("Unexpected opcode %d\n", opcode); RETURN(-EFAULT); } + rec->ur_id1 = NULL; + rec->ur_id2 = NULL; rec->ur_opcode = opcode; + rc = mds_unpackers[opcode](req, offset, rec); + +#if CRAY_PORTALS + rec->ur_fsuid = req->rq_uid; +#endif RETURN(rc); } + +/* + * here we take simple rule: once uid/fsuid is root, we also squash + * the gid/fsgid, don't care setuid/setgid attributes. + */ +static +int mds_squash_root(struct mds_obd *mds, struct mds_req_sec_desc *rsd, + ptl_nid_t *peernid) +{ + if (!mds->mds_squash_uid || *peernid == mds->mds_nosquash_nid) + return 0; + + if (rsd->rsd_uid && rsd->rsd_fsuid) + return 0; + + CDEBUG(D_SEC, "squash req from "LPX64":" + "(%u:%u-%u:%u/%x)=>(%u:%u-%u:%u/%x)\n", *peernid, + rsd->rsd_uid, rsd->rsd_gid, + rsd->rsd_fsuid, rsd->rsd_fsgid, rsd->rsd_cap, + rsd->rsd_uid ? rsd->rsd_uid : mds->mds_squash_uid, + rsd->rsd_uid ? rsd->rsd_gid : mds->mds_squash_gid, + rsd->rsd_fsuid ? rsd->rsd_fsuid : mds->mds_squash_uid, + rsd->rsd_fsuid ? rsd->rsd_fsgid : mds->mds_squash_gid, + rsd->rsd_cap & ~CAP_FS_MASK); + + if (rsd->rsd_uid == 0) { + rsd->rsd_uid = mds->mds_squash_uid; + rsd->rsd_gid = mds->mds_squash_gid; + } + if (rsd->rsd_fsuid == 0) { + rsd->rsd_fsuid = mds->mds_squash_uid; + rsd->rsd_fsgid = mds->mds_squash_gid; + } + rsd->rsd_cap &= ~CAP_FS_MASK; + + return 1; +} + +/******************************** + * MDS uid/gid mapping handling * + ********************************/ + +static +struct mds_idmap_entry* idmap_alloc_entry(__u32 rmt_id, __u32 lcl_id) +{ + struct mds_idmap_entry *e; + + OBD_ALLOC(e, sizeof(*e)); + if (!e) + return NULL; + + INIT_LIST_HEAD(&e->rmt_hash); + INIT_LIST_HEAD(&e->lcl_hash); + atomic_set(&e->refcount, 1); + e->rmt_id = rmt_id; + e->lcl_id = lcl_id; + + return e; +} + +void idmap_free_entry(struct mds_idmap_entry *e) +{ + if (!list_empty(&e->rmt_hash)) + list_del(&e->rmt_hash); + if (!list_empty(&e->lcl_hash)) + list_del(&e->lcl_hash); + OBD_FREE(e, sizeof(*e)); +} + +static +int idmap_insert_entry(struct list_head *rmt_hash, struct list_head *lcl_hash, + struct mds_idmap_entry *new, const char *warn_msg) +{ + struct list_head *rmt_head = &rmt_hash[MDS_IDMAP_HASHFUNC(new->rmt_id)]; + struct list_head *lcl_head = &lcl_hash[MDS_IDMAP_HASHFUNC(new->lcl_id)]; + struct mds_idmap_entry *e; + + list_for_each_entry(e, rmt_head, rmt_hash) { + if (e->rmt_id == new->rmt_id && + e->lcl_id == new->lcl_id) { + atomic_inc(&e->refcount); + return 1; + } + if (e->rmt_id == new->rmt_id && warn_msg) + CWARN("%s: rmt id %u already map to %u (new %u)\n", + warn_msg, e->rmt_id, e->lcl_id, new->lcl_id); + if (e->lcl_id == new->lcl_id && warn_msg) + CWARN("%s: lcl id %u already be mapped from %u " + "(new %u)\n", warn_msg, + e->lcl_id, e->rmt_id, new->rmt_id); + } + + list_add_tail(rmt_head, &new->rmt_hash); + list_add_tail(lcl_head, &new->lcl_hash); + return 0; +} + +static +int idmap_remove_entry(struct list_head *rmt_hash, struct list_head *lcl_hash, + __u32 rmt_id, __u32 lcl_id) +{ + struct list_head *rmt_head = &rmt_hash[MDS_IDMAP_HASHFUNC(rmt_id)]; + struct mds_idmap_entry *e; + + list_for_each_entry(e, rmt_head, rmt_hash) { + if (e->rmt_id == rmt_id && e->lcl_id == lcl_id) { + if (atomic_dec_and_test(&e->refcount)) { + list_del(&e->rmt_hash); + list_del(&e->lcl_hash); + OBD_FREE(e, sizeof(*e)); + return 0; + } else + return 1; + } + } + return -ENOENT; +} + +int mds_idmap_add(struct mds_idmap_table *tbl, + uid_t rmt_uid, uid_t lcl_uid, + gid_t rmt_gid, gid_t lcl_gid) +{ + struct mds_idmap_entry *ue, *ge; + ENTRY; + + if (!tbl) + RETURN(-EPERM); + + ue = idmap_alloc_entry(rmt_uid, lcl_uid); + if (!ue) + RETURN(-ENOMEM); + ge = idmap_alloc_entry(rmt_gid, lcl_gid); + if (!ge) { + idmap_free_entry(ue); + RETURN(-ENOMEM); + } + + spin_lock(&tbl->mit_lock); + + if (idmap_insert_entry(tbl->mit_idmaps[MDS_RMT_UIDMAP_IDX], + tbl->mit_idmaps[MDS_LCL_UIDMAP_IDX], + ue, "UID mapping")) { + idmap_free_entry(ue); + } + + if (idmap_insert_entry(tbl->mit_idmaps[MDS_RMT_GIDMAP_IDX], + tbl->mit_idmaps[MDS_LCL_GIDMAP_IDX], + ge, "GID mapping")) { + idmap_free_entry(ge); + } + + spin_unlock(&tbl->mit_lock); + RETURN(0); +} + +int mds_idmap_del(struct mds_idmap_table *tbl, + uid_t rmt_uid, uid_t lcl_uid, + gid_t rmt_gid, gid_t lcl_gid) +{ + ENTRY; + + if (!tbl) + RETURN(0); + + spin_lock(&tbl->mit_lock); + idmap_remove_entry(tbl->mit_idmaps[MDS_RMT_UIDMAP_IDX], + tbl->mit_idmaps[MDS_LCL_UIDMAP_IDX], + rmt_uid, lcl_uid); + idmap_remove_entry(tbl->mit_idmaps[MDS_RMT_GIDMAP_IDX], + tbl->mit_idmaps[MDS_LCL_GIDMAP_IDX], + rmt_gid, lcl_gid); + spin_unlock(&tbl->mit_lock); + RETURN(0); +} + +static +__u32 idmap_lookup_id(struct list_head *hash, int reverse, __u32 id) +{ + struct list_head *head = &hash[MDS_IDMAP_HASHFUNC(id)]; + struct mds_idmap_entry *e; + + if (!reverse) { + list_for_each_entry(e, head, rmt_hash) { + if (e->rmt_id == id) + return e->lcl_id; + } + return MDS_IDMAP_NOTFOUND; + } else { + list_for_each_entry(e, head, lcl_hash) { + if (e->lcl_id == id) + return e->rmt_id; + } + return MDS_IDMAP_NOTFOUND; + } +} + +int mds_idmap_lookup_uid(struct mds_idmap_table *tbl, int reverse, uid_t uid) +{ + struct list_head *hash; + + if (!tbl) + return MDS_IDMAP_NOTFOUND; + + if (!reverse) + hash = tbl->mit_idmaps[MDS_RMT_UIDMAP_IDX]; + else + hash = tbl->mit_idmaps[MDS_LCL_UIDMAP_IDX]; + + spin_lock(&tbl->mit_lock); + uid = idmap_lookup_id(hash, reverse, uid); + spin_unlock(&tbl->mit_lock); + + return uid; +} + +int mds_idmap_lookup_gid(struct mds_idmap_table *tbl, int reverse, gid_t gid) +{ + struct list_head *hash; + + if (!tbl) + return MDS_IDMAP_NOTFOUND; + + if (!reverse) + hash = tbl->mit_idmaps[MDS_RMT_GIDMAP_IDX]; + else + hash = tbl->mit_idmaps[MDS_LCL_GIDMAP_IDX]; + + spin_lock(&tbl->mit_lock); + gid = idmap_lookup_id(hash, reverse, gid); + spin_unlock(&tbl->mit_lock); + + return gid; +} + +struct mds_idmap_table *mds_idmap_alloc() +{ + struct mds_idmap_table *tbl; + int i, j; + + OBD_ALLOC(tbl, sizeof(*tbl)); + if (!tbl) + return NULL; + + spin_lock_init(&tbl->mit_lock); + for (i = 0; i < MDS_IDMAP_N_HASHES; i++) + for (j = 0; j < MDS_IDMAP_HASHSIZE; j++) + INIT_LIST_HEAD(&tbl->mit_idmaps[i][j]); + + return tbl; +} + +static void idmap_clear_rmt_hash(struct list_head *list) +{ + struct mds_idmap_entry *e; + int i; + + for (i = 0; i < MDS_IDMAP_HASHSIZE; i++) { + while (!list_empty(&list[i])) { + e = list_entry(list[i].next, struct mds_idmap_entry, + rmt_hash); + idmap_free_entry(e); + } + } +} + +void mds_idmap_free(struct mds_idmap_table *tbl) +{ + int i; + + spin_lock(&tbl->mit_lock); + idmap_clear_rmt_hash(tbl->mit_idmaps[MDS_RMT_UIDMAP_IDX]); + idmap_clear_rmt_hash(tbl->mit_idmaps[MDS_RMT_GIDMAP_IDX]); + + /* paranoid checking */ + for (i = 0; i < MDS_IDMAP_HASHSIZE; i++) { + LASSERT(list_empty(&tbl->mit_idmaps[MDS_LCL_UIDMAP_IDX][i])); + LASSERT(list_empty(&tbl->mit_idmaps[MDS_LCL_GIDMAP_IDX][i])); + } + spin_unlock(&tbl->mit_lock); + + OBD_FREE(tbl, sizeof(*tbl)); +} + +/********************************* + * helpers doing mapping for MDS * + *********************************/ + +/* + * we allow remote setuid/setgid to an "authencated" one, + * this policy probably change later. + */ +static +int mds_req_secdesc_do_map(struct mds_export_data *med, + struct mds_req_sec_desc *rsd) +{ + struct mds_idmap_table *idmap = med->med_idmap; + uid_t uid, fsuid; + gid_t gid, fsgid; + + uid = mds_idmap_lookup_uid(idmap, 0, rsd->rsd_uid); + if (uid == MDS_IDMAP_NOTFOUND) { + CERROR("can't find map for uid %u\n", rsd->rsd_uid); + return -EPERM; + } + + if (rsd->rsd_uid == rsd->rsd_fsuid) + fsuid = uid; + else { + fsuid = mds_idmap_lookup_uid(idmap, 0, rsd->rsd_fsuid); + if (fsuid == MDS_IDMAP_NOTFOUND) { + CERROR("can't find map for fsuid %u\n", rsd->rsd_fsuid); + return -EPERM; + } + } + + gid = mds_idmap_lookup_gid(idmap, 0, rsd->rsd_gid); + if (gid == MDS_IDMAP_NOTFOUND) { + CERROR("can't find map for gid %u\n", rsd->rsd_gid); + return -EPERM; + } + + if (rsd->rsd_gid == rsd->rsd_fsgid) + fsgid = gid; + else { + fsgid = mds_idmap_lookup_gid(idmap, 0, rsd->rsd_fsgid); + if (fsgid == MDS_IDMAP_NOTFOUND) { + CERROR("can't find map for fsgid %u\n", rsd->rsd_fsgid); + return -EPERM; + } + } + + rsd->rsd_uid = uid; + rsd->rsd_gid = gid; + rsd->rsd_fsuid = fsuid; + rsd->rsd_fsgid = fsgid; + + return 0; +} + +void mds_body_do_reverse_map(struct mds_export_data *med, + struct mds_body *body) +{ + uid_t uid; + gid_t gid; + + if (!med->med_remote) + return; + + ENTRY; + if (body->valid & OBD_MD_FLUID) { + uid = mds_idmap_lookup_uid(med->med_idmap, 1, body->uid); + if (uid == MDS_IDMAP_NOTFOUND) { + uid = med->med_nllu; + if (body->valid & OBD_MD_FLMODE) { + body->mode = (body->mode & ~S_IRWXU) | + ((body->mode & S_IRWXO) << 6); + } + } + body->uid = uid; + } + if (body->valid & OBD_MD_FLGID) { + gid = mds_idmap_lookup_gid(med->med_idmap, 1, body->gid); + if (gid == MDS_IDMAP_NOTFOUND) { + gid = med->med_nllg; + if (body->valid & OBD_MD_FLMODE) { + body->mode = (body->mode & ~S_IRWXG) | + ((body->mode & S_IRWXO) << 3); + } + } + body->gid = gid; + } + + EXIT; +} + +/* + * return error if can't find mapping, it's a error so should not + * fall into nllu/nllg. + */ +int mds_remote_perm_do_reverse_map(struct mds_export_data *med, + struct mds_remote_perm *perm) +{ + uid_t uid; + gid_t gid; + + LASSERT(med->med_remote); + + uid = mds_idmap_lookup_uid(med->med_idmap, 1, perm->mrp_auth_uid); + if (uid == MDS_IDMAP_NOTFOUND) { + CERROR("no map for uid %u\n", perm->mrp_auth_uid); + return -EPERM; + } + gid = mds_idmap_lookup_gid(med->med_idmap, 1, perm->mrp_auth_gid); + if (gid == MDS_IDMAP_NOTFOUND) { + CERROR("no map for uid %u\n", perm->mrp_auth_uid); + return -EPERM; + } + + perm->mrp_auth_uid = uid; + perm->mrp_auth_gid = gid; + return 0; +} + +/********************** + * MDS ucred handling * + **********************/ + +static inline void drop_ucred_ginfo(struct lvfs_ucred *ucred) +{ + if (ucred->luc_ginfo) { + put_group_info(ucred->luc_ginfo); + ucred->luc_ginfo = NULL; + } +} + +static inline void drop_ucred_lsd(struct lvfs_ucred *ucred) +{ + if (ucred->luc_lsd) { + mds_put_lsd(ucred->luc_lsd); + ucred->luc_lsd = NULL; + } +} + +/* + * the heart of the uid/gid handling and security checking. + * + * root could set any group_info if we allowed setgroups, while + * normal user only could 'reduce' their group members -- which + * is somewhat expensive. + * + * authenticated as mds user (using mds service credential) could + * bypass all checkings. + */ +int mds_init_ucred(struct lvfs_ucred *ucred, + struct ptlrpc_request *req, + struct mds_req_sec_desc *rsd) +{ + struct mds_obd *mds = &req->rq_export->exp_obd->u.mds; + struct mds_export_data *med = &req->rq_export->u.eu_mds_data; + struct lustre_sec_desc *lsd; + ptl_nid_t peernid = req->rq_peer.peer_id.nid; + struct group_info *gnew; + unsigned int setuid, setgid, strong_sec, root_squashed; + __u32 lsd_perms; + ENTRY; + + LASSERT(ucred); + LASSERT(rsd); + LASSERT(rsd->rsd_ngroups <= LUSTRE_MAX_GROUPS); + + if (SEC_FLAVOR_MAJOR(req->rq_req_secflvr) == PTLRPCS_FLVR_MAJOR_GSS && + (SEC_FLAVOR_SVC(req->rq_req_secflvr) == PTLRPCS_SVC_AUTH || + SEC_FLAVOR_SVC(req->rq_req_secflvr) == PTLRPCS_SVC_PRIV)) + strong_sec = 1; + else + strong_sec = 0; + + LASSERT(!(req->rq_remote_realm && !strong_sec)); + + if (strong_sec && req->rq_auth_uid == -1) { + CWARN("user not authenticated, deny access\n"); + RETURN(-EPERM); + } + + /* sanity check: if we use strong authentication, we expect the + * uid which client claimed is true. + * not apply to special mds user . + */ + if (!req->rq_auth_usr_mds && strong_sec) { + if (!med->med_remote) { + if (req->rq_auth_uid != rsd->rsd_uid) { + CERROR("local client "LPU64": auth uid %u " + "while client claim %u:%u/%u:%u\n", + peernid, req->rq_auth_uid, + rsd->rsd_uid, rsd->rsd_gid, + rsd->rsd_fsuid, rsd->rsd_fsgid); + RETURN(-EPERM); + } + } else { + if (req->rq_mapped_uid == MDS_IDMAP_NOTFOUND) { + CWARN("no mapping found, deny\n"); + RETURN(-EPERM); + } + + if (mds_req_secdesc_do_map(med, rsd)) + RETURN(-EPERM); + + if (req->rq_mapped_uid != rsd->rsd_uid) { + CERROR("remote client "LPU64": auth uid %u " + "while client claim %u:%u/%u:%u\n", + peernid, req->rq_auth_uid, + rsd->rsd_uid, rsd->rsd_gid, + rsd->rsd_fsuid, rsd->rsd_fsgid); + RETURN(-EPERM); + } + } + } + + /* now LSD come into play */ + ucred->luc_ginfo = NULL; + ucred->luc_lsd = lsd = mds_get_lsd(rsd->rsd_uid); + + if (!lsd) { + CERROR("Deny access without LSD: uid %d\n", rsd->rsd_uid); + RETURN(-EPERM); + } + + lsd_perms = mds_lsd_get_perms(lsd, med->med_remote, 0, peernid); + + /* check setuid/setgid permissions. + * again not apply to special mds user. + */ + if (!req->rq_auth_usr_mds) { + /* find out the setuid/setgid attempt */ + setuid = (rsd->rsd_uid != rsd->rsd_fsuid); + setgid = (rsd->rsd_gid != rsd->rsd_fsgid || + rsd->rsd_gid != lsd->lsd_gid); + + /* check permission of setuid */ + if (setuid && !(lsd_perms & LSD_PERM_SETUID)) { + CWARN("mds blocked setuid attempt (%u -> %u) " + "from "LPU64"\n", rsd->rsd_uid, rsd->rsd_fsuid, + peernid); + RETURN(-EPERM); + } + + /* check permission of setgid */ + if (setgid && !(lsd_perms & LSD_PERM_SETGID)) { + CWARN("mds blocked setgid attempt (%u:%u/%u:%u -> %u) " + "from "LPU64"\n", rsd->rsd_uid, rsd->rsd_gid, + rsd->rsd_fsuid, rsd->rsd_fsgid, lsd->lsd_gid, + peernid); + RETURN(-EPERM); + } + } + + root_squashed = mds_squash_root(mds, rsd, &peernid); + + /* remove privilege for non-root user */ + if (rsd->rsd_fsuid) + rsd->rsd_cap &= ~CAP_FS_MASK; + + /* by now every fields other than groups in rsd have been granted */ + ucred->luc_nid = peernid; + ucred->luc_uid = rsd->rsd_uid; + ucred->luc_gid = rsd->rsd_gid; + ucred->luc_fsuid = rsd->rsd_fsuid; + ucred->luc_fsgid = rsd->rsd_fsgid; + ucred->luc_cap = rsd->rsd_cap; + + /* don't use any supplementary group if we squashed root. + * XXX The exact behavior of root_squash is not defined, we just + * keep the reminder here */ + if (root_squashed) + RETURN(0); + + /* install groups from LSD */ + if (lsd->lsd_ginfo) { + ucred->luc_ginfo = lsd->lsd_ginfo; + get_group_info(ucred->luc_ginfo); + } + + /* everything is done if we don't allow setgroups, or it is + * from remote client (which implies forced to be no-setgroups). + * + * Note: remote user's supplementary groups sent along the request + * (if any) are all ignored, but we make the mapped local user's + * supplementary groups take effect. + */ + if (med->med_remote || !(lsd_perms & LSD_PERM_SETGRP)) + RETURN(0); + + /* root could set any groups as he want (if allowed), normal + * users only could reduce his group array. + */ + if (ucred->luc_uid == 0) { + drop_ucred_ginfo(ucred); + + if (rsd->rsd_ngroups == 0) + RETURN(0); + + gnew = groups_alloc(rsd->rsd_ngroups); + if (!gnew) { + CERROR("out of memory\n"); + drop_ucred_lsd(ucred); + RETURN(-ENOMEM); + } + groups_from_buffer(gnew, rsd->rsd_groups); + groups_sort(gnew); /* don't rely on client doing this */ + + ucred->luc_ginfo = gnew; + } else { + __u32 set = 0, cur = 0; + struct group_info *ginfo = ucred->luc_ginfo; + + if (!ginfo) + RETURN(0); + + /* Note: freeing a group_info count on 'nblocks' instead of + * 'ngroups', thus we can safely alloc enough buffer and reduce + * and ngroups number later. + */ + gnew = groups_alloc(rsd->rsd_ngroups); + if (!gnew) { + CERROR("out of memory\n"); + drop_ucred_ginfo(ucred); + drop_ucred_lsd(ucred); + RETURN(-ENOMEM); + } + + while (cur < rsd->rsd_ngroups) { + if (groups_search(ginfo, rsd->rsd_groups[cur])) { + GROUP_AT(gnew, set) = rsd->rsd_groups[cur]; + set++; + } + cur++; + } + gnew->ngroups = set; + + put_group_info(ucred->luc_ginfo); + ucred->luc_ginfo = gnew; + } + RETURN(0); +} + +void mds_exit_ucred(struct lvfs_ucred *ucred) +{ + ENTRY; + drop_ucred_ginfo(ucred); + drop_ucred_lsd(ucred); + EXIT; +} +