From: adilger Date: Wed, 10 Aug 2005 19:36:08 +0000 (+0000) Subject: Land b1_4_bug3389 onto b1_4 (20050810_1124) X-Git-Tag: v1_7_140~1^12~3^2~55^5~35 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=45a08b69dc32bb56596612ff1eb3fb3c6d889b11;p=fs%2Flustre-release.git Land b1_4_bug3389 onto b1_4 (20050810_1124) Description: Add support for supplementary groups on the MDS. Details : The MDS has an upcall /proc/fs/lustre/mds/{mds}/group_upcall (set to /usr/sbin/l_getgroups if enabled) which will do MDS-side lookups for user supplementary groups into a cache. b=3389, b=6253 Description: Add support for compiling against Cray portals. Details : Conditional compiling for some areas that are different on Cray Portals. --- diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 299b731..d19a51d 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -26,6 +26,13 @@ Details : If a client has max_dirty_mb smaller than max_rpcs_in_flight, to complete instead of consuming its dirty limit. With change we get improved performance when max_dirty_mb is small. +Severity : enhancement +Bugzilla : 3389, 6253 +Description: Add support for supplementary groups on the MDS. +Details : The MDS has an upcall /proc/fs/lustre/mds/{mds}/group_upcall + (set to /usr/sbin/l_getgroups if enabled) which will do MDS-side + lookups for user supplementary groups into a cache. + 2005-08-08 Cluster File Systems, Inc. * version 1.4.4 * bug fixes @@ -266,6 +273,11 @@ Details : The interpretation of the default stripe count (0, to lfs OST, rather than all available. For general usage we have found a stripe count of 1 or 2 works best. +Severity : enhancement +Description: Add support for compiling against Cray portals. +Details : Conditional compiling for some areas that are different + on Cray Portals. 
+ Severity : major Frequency : occasional Bugzilla : 6409, 6834 @@ -312,7 +324,7 @@ Severity : minor Bugzilla : 6147 Description: Changes the "SCSI I/O Stats" kernel patch to default to "enabled" ------------------------------------------------------------------------------- +----------------------------------------------------------------------------- 2005-05-05 Cluster File Systems, Inc. * version 1.4.2 diff --git a/lustre/autoconf/lustre-version.ac b/lustre/autoconf/lustre-version.ac index 9328b6c..1bdd117 100644 --- a/lustre/autoconf/lustre-version.ac +++ b/lustre/autoconf/lustre-version.ac @@ -1 +1 @@ -m4_define([LUSTRE_VERSION],[1.4.4.1]) +m4_define([LUSTRE_VERSION],[1.4.4.5]) diff --git a/lustre/doc/lconf.8 b/lustre/doc/lconf.8 index aee8352..6143c6b 100644 --- a/lustre/doc/lconf.8 +++ b/lustre/doc/lconf.8 @@ -53,6 +53,9 @@ Full name of gdb debug script. Default is /tmp/ogdb. --group The group of devices to cleanup/configure. .TP +--group_upcall +Pathname to the MDS upcall to resolve secondary group membership. Defaults to NONE, meaning that the MDS will use whatever group the client supplies, but this is limited to a single supplementary group. +.TP -h,--help Print help. .TP diff --git a/lustre/doc/lconf.lyx b/lustre/doc/lconf.lyx index e00d94d..2846f48 100644 --- a/lustre/doc/lconf.lyx +++ b/lustre/doc/lconf.lyx @@ -122,6 +122,12 @@ This program configures a node following directives in the . The group of devices to cleanup/configure. \layout Description +--group_upcall\SpecialChar ~ + Pathname to the MDS upcall to resolve secondary group membership. + Defaults to NONE, meaning that the MDS will use whatever group the client + supplies, but this is limited to a single supplementary group. +\layout Description + -h,--help Print help. 
\layout Description diff --git a/lustre/doc/lmc.1 b/lustre/doc/lmc.1 index bcb55bf..1f05f0b 100644 --- a/lustre/doc/lmc.1 +++ b/lustre/doc/lmc.1 @@ -115,11 +115,21 @@ Optional argument indicating the size (in KB) of the device to be created (used --node Adds an MDS to the specified node. This requires a --node argument, and it must not be a profile node. .TP ---fstype extN|ext3 +--fstype ldiskfs|ext3 Optional argument used to specify the file system type. Default is ext3. +For 2.6 kernels the ldiskfs filesystem must be used. .TP --inode_size -Specify new inode size for underlying ext3 file system. +Specify new inode size for underlying ext3 file system. Must be a power of 2 +between 128 and 4096. The default inode size is selected based on the default +number of stripes specified for the filesystem. +.TP +--group_upcall +The group upcall program to call for resolving a user's secondary groups. +The default value is NONE, which means that the MDS will use whatever +supplementary group is passed from the client. The supplied upcall is +/usr/sbin/l_getgroups, which gets groups from the MDS's /etc/group file +based on the client-supplied UID. .TP --mkfsoptions Optional argument to mkfs. @@ -128,7 +138,7 @@ Optional argument to mkfs. Optional argument to mount fs. Mount options will be passed by this argument. For example, extents are to be enabled by adding ",extents" to the --mountfsoptions option. "errors=remount-ro" and "asyncdel" can also be added to it. .TP --journal_size -Optional arguement to specify the journal size for the ext2/ext3 file system. The size should be in the units expected by mkfs, so for ext3 it should be in MB. If this is option is not used, the ext2/ext3 filesystem will be configured with the default journal size. +Optional arguement to specify the journal size for the ext3 file system. The size should be in the units expected by mkfs, so for ext3 it should be in MB. 
If this option is not used, the ext3 filesystem will be configured with the default journal size. .PP .B --add lov Creates an LOV with the specified parameters. The mds_name must already exist in the descriptor. @@ -191,7 +201,7 @@ Optional argument to mkfs. Optional argument to mount fs. Mount options will be passed by this argument. For example, extents are to be enabled by adding ",extents" to the --mountfsoptions option. "errors=remount-ro" and "asyncdel" can also be added to it. .TP --journal_size -Optional arguement to specify the journal size for the ext2/ext3 file system. The size should be in the units expected by mkfs, so for ext3 it should be in MB. If this is option is not used, the ext2/ext3 filesystem will be configured with the default journal size. +Optional argument to specify the journal size for the ext3 file system. The size should be in the units expected by mkfs, so for ext3 it should be in MB. If this option is not used, the ext3 filesystem will be configured with a journal size dependent upon how large the filesystem is. .PP .B --add mtpt Creates a mount-point on the specified node. Either an LOV or OSC name can be used. diff --git a/lustre/doc/lmc.lyx b/lustre/doc/lmc.lyx index 55fe26b..837c61f 100644 --- a/lustre/doc/lmc.lyx +++ b/lustre/doc/lmc.lyx @@ -313,14 +313,23 @@ name> Name of the node on which the MDS resides \layout Description --fstype\SpecialChar ~ -extN|ext3 Optional argument used to specify the file system type. - Default is ext3. +ldiskfs|ext3 Optional argument used to specify the file system type. + Default is ext3. For 2.6 kernels the ldiskfs filesystem must be used. \layout Description --inode_size\SpecialChar ~ - Specify new inode size for underlying ext3 file system. + Specify new inode size for underlying ext3 file system. Must be a + power of 2 between 128 and 4096. The default inode size is selected based + on the default number of stripes specified for the filesystem. 
\layout Description +--group_upcall\SpecialChar ~ + The group upcall program to call for resolving a user's secondary + groups. The default value is NONE, which means that the MDS will use whatever + supplementary group is passed from the client. The supplied upcall is + /usr/sbin/l_getgroups, which gets groups from the MDS's /etc/group file + based on the client-supplied UID. + --mkfsoptions\SpecialChar ~ Optional argument to mkfs. \layout Description @@ -351,14 +360,14 @@ asyncdel --journal_size\SpecialChar ~ Optional arguement to specify the journal size for - the ext2/ext3 file system. + the ext3 file system. The size should be in the units expected by \series bold mkfs \series default , so for ext3 it should be in MB. - If this is option is not used, the ext2/ext3 filesystem will be configured - with the default journal size. + If this is option is not used, the ext3 filesystem will be configured + with a journal size dependent upon how large the filesystem is. \end_deeper \layout Description @@ -478,14 +487,14 @@ asyncdel --journal_size\SpecialChar ~ Optional arguement to specify the journal size for - the ext2/ext3 file system. + the ext3 file system. The size should be in the units expected by \series bold mkfs \series default , so for ext3 it should be in MB. - If this is option is not used, the ext2/ext3 filesystem will be configured - with the default journal size. + If this is option is not used, the ext3 filesystem will be configured + with a journal size dependent upon how large the filesystem is. 
\end_deeper \layout Description diff --git a/lustre/include/linux/Makefile.am b/lustre/include/linux/Makefile.am index f0d145f..aa0153d 100644 --- a/lustre/include/linux/Makefile.am +++ b/lustre/include/linux/Makefile.am @@ -15,4 +15,4 @@ EXTRA_DIST = lprocfs_status.h lustre_debug.h lustre_ha.h lustre_lib.h \ lustre_export.h lustre_log.h obd_echo.h \ lustre_compat25.h lustre_fsfilt.h lustre_import.h lustre_mds.h obd.h \ lvfs.h lvfs_linux.h lustre_cfg.h lustre_lite.h lustre_idl.h \ - lustre_quota.h + lustre_quota.h lustre_ucache.h diff --git a/lustre/include/linux/lustre_compat25.h b/lustre/include/linux/lustre_compat25.h index 96f4d7e..28c2092 100644 --- a/lustre/include/linux/lustre_compat25.h +++ b/lustre/include/linux/lustre_compat25.h @@ -31,6 +31,29 @@ #include +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,4) +#define NGROUPS_SMALL NGROUPS +#define NGROUPS_PER_BLOCK ((int)(EXEC_PAGESIZE / sizeof(gid_t))) + +struct group_info { + int ngroups; + atomic_t usage; + gid_t small_block[NGROUPS_SMALL]; + int nblocks; + gid_t *blocks[0]; +}; +#define current_ngroups current->ngroups +#define current_groups current->groups + +struct group_info *groups_alloc(int gidsetsize); +void groups_free(struct group_info *ginfo); +#else /* >= 2.6.4 */ + +#define current_ngroups current->group_info->ngroups +#define current_groups current->group_info->small_block + +#endif + #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0) /* @@ -88,11 +111,10 @@ static inline int cleanup_group_info(void) { struct group_info *ginfo; - ginfo = groups_alloc(2); + ginfo = groups_alloc(0); if (!ginfo) return -ENOMEM; - ginfo->ngroups = 0; set_current_groups(ginfo); put_group_info(ginfo); diff --git a/lustre/include/linux/lustre_idl.h b/lustre/include/linux/lustre_idl.h index 93c4434..88a7bf8 100644 --- a/lustre/include/linux/lustre_idl.h +++ b/lustre/include/linux/lustre_idl.h @@ -419,6 +419,7 @@ extern void lustre_swab_obd_statfs (struct obd_statfs *os); #define OBD_BRW_FROM_GRANT 0x20 /* the osc manages 
this under llite */ #define OBD_BRW_GRANTED 0x40 /* the ost manages this */ #define OBD_BRW_DROP 0x80 /* drop the page after IO */ +#define OBD_BRW_NOQUOTA 0x100 #define OBD_OBJECT_EOF 0xffffffffffffffffULL @@ -445,8 +446,6 @@ extern void lustre_swab_niobuf_remote (struct niobuf_remote *nbr); /* request structure for OST's */ -#define OST_REQ_HAS_OA1 0x1 - struct ost_body { struct obdo oa; }; diff --git a/lustre/include/linux/lustre_lib.h b/lustre/include/linux/lustre_lib.h index 13ad7f8..a05285b 100644 --- a/lustre/include/linux/lustre_lib.h +++ b/lustre/include/linux/lustre_lib.h @@ -235,8 +235,8 @@ static inline int obd_ioctl_is_invalid(struct obd_ioctl_data *data) CERROR("OBD ioctl: plen2 set but NULL pointer\n"); return 1; } - if (obd_ioctl_packlen(data) != data->ioc_len) { - CERROR("OBD ioctl: packlen exceeds ioc_len (%d != %d)\n", + if (obd_ioctl_packlen(data) > data->ioc_len) { + CERROR("OBD ioctl: packlen exceeds ioc_len (%d > %d)\n", obd_ioctl_packlen(data), data->ioc_len); return 1; } diff --git a/lustre/include/linux/lustre_mds.h b/lustre/include/linux/lustre_mds.h index ce1ebb0..66d1b2b 100644 --- a/lustre/include/linux/lustre_mds.h +++ b/lustre/include/linux/lustre_mds.h @@ -40,19 +40,14 @@ struct lustre_md { struct lov_stripe_md *lsm; }; -struct ll_uctxt { - __u32 gid1; - __u32 gid2; -}; - struct mdc_op_data { - struct ll_fid fid1; - struct ll_fid fid2; - struct ll_uctxt ctxt; - __u64 mod_time; - const char *name; - int namelen; - __u32 create_mode; + struct ll_fid fid1; + struct ll_fid fid2; + __u64 mod_time; + const char *name; + int namelen; + __u32 create_mode; + __u32 suppgids[2]; }; struct mds_update_record { @@ -73,14 +68,9 @@ struct mds_update_record { __u64 ur_time; __u32 ur_mode; __u32 ur_flags; + struct lvfs_grp_hash_entry *ur_grp_entry; }; -#define ur_fsuid ur_uc.luc_fsuid -#define ur_fsgid ur_uc.luc_fsgid -#define ur_cap ur_uc.luc_cap -#define ur_suppgid1 ur_uc.luc_suppgid1 -#define ur_suppgid2 ur_uc.luc_suppgid2 - #define 
MDS_LR_SERVER_SIZE 512 #define MDS_LR_CLIENT_START 8192 @@ -164,11 +154,10 @@ int it_open_error(int phase, struct lookup_intent *it); void mdc_set_lock_data(__u64 *lockh, void *data); int mdc_change_cbdata(struct obd_export *exp, struct ll_fid *fid, ldlm_iterator_t it, void *data); -int mdc_intent_lock(struct obd_export *exp, struct ll_uctxt *, - struct ll_fid *parent, - const char *name, int len, void *lmm, int lmmsize, - struct ll_fid *child, - struct lookup_intent *, int, +int mdc_intent_lock(struct obd_export *exp, + struct mdc_op_data *, + void *lmm, int lmmsize, + struct lookup_intent *, int, struct ptlrpc_request **reqp, ldlm_blocking_callback cb_blocking); int mdc_enqueue(struct obd_export *exp, @@ -210,7 +199,7 @@ int mdc_readpage(struct obd_export *exp, struct ll_fid *mdc_fid, __u64 offset, struct page *, struct ptlrpc_request **); int mdc_create(struct obd_export *exp, struct mdc_op_data *op_data, const void *data, int datalen, int mode, __u32 uid, __u32 gid, - __u64 rdev, struct ptlrpc_request **request); + __u32 cap_effective, __u64 rdev,struct ptlrpc_request **request); int mdc_unlink(struct obd_export *exp, struct mdc_op_data *data, struct ptlrpc_request **request); int mdc_link(struct obd_export *exp, struct mdc_op_data *data, diff --git a/lustre/include/linux/lustre_ucache.h b/lustre/include/linux/lustre_ucache.h index ac89de5..db28cef 100644 --- a/lustre/include/linux/lustre_ucache.h +++ b/lustre/include/linux/lustre_ucache.h @@ -27,53 +27,42 @@ #define UC_CACHE_CLEAR_INVALID(i) (i)->ue_flags &= ~UC_CACHE_INVALID #define UC_CACHE_CLEAR_EXPIRED(i) (i)->ue_flags &= ~UC_CACHE_EXPIRED -struct upcall_cache; - struct upcall_cache_entry { struct list_head ue_hash; - atomic_t ue_refcount; __u64 ue_key; - struct upcall_cache *ue_cache; + __u64 ue_primary; + struct group_info *ue_group_info; + atomic_t ue_refcount; int ue_flags; wait_queue_head_t ue_waitq; unsigned long ue_acquire_expire; unsigned long ue_expire; }; -#define UC_CACHE_UPCALL_MAXPATH (1024) 
+#define UC_CACHE_HASH_SIZE (128) +#define UC_CACHE_HASH_INDEX(id) ((id) & (UC_CACHE_HASH_SIZE - 1)) +#define UC_CACHE_UPCALL_MAXPATH (1024UL) struct upcall_cache { - struct list_head *uc_hashtable; - int uc_hashsize; - rwlock_t uc_hashlock; + struct list_head uc_hashtable[UC_CACHE_HASH_SIZE]; + spinlock_t uc_lock; - char *uc_name; + char uc_name[40]; /* for upcall */ char uc_upcall[UC_CACHE_UPCALL_MAXPATH]; - unsigned long uc_acquire_expire; /* max acquire time */ - unsigned long uc_entry_expire; /* max entry life time */ - unsigned long uc_err_entry_expire; /* err entry life time */ - - /* functions */ - unsigned int (*hash)(struct upcall_cache *, __u64); - struct upcall_cache_entry* (*alloc_entry)(struct upcall_cache *, __u64); - void (*free_entry)(struct upcall_cache *, - struct upcall_cache_entry *); - int (*make_upcall)(struct upcall_cache *, - struct upcall_cache_entry *); - int (*parse_downcall)(struct upcall_cache *, - struct upcall_cache_entry *, - void *args); + unsigned long uc_acquire_expire; /* jiffies */ + unsigned long uc_entry_expire; /* jiffies */ }; -void upcall_cache_init_entry(struct upcall_cache *cache, - struct upcall_cache_entry *entry, - __u64 key); -struct upcall_cache_entry * -upcall_cache_get_entry(struct upcall_cache *cache, __u64 key); -void upcall_cache_put_entry(struct upcall_cache_entry *entry); -int upcall_cache_downcall(struct upcall_cache *cache, __u64 key, void *args); -void upcall_cache_flush_one(struct upcall_cache *cache, __u64 key); +struct upcall_cache_entry *upcall_cache_get_entry(struct upcall_cache *hash, + __u64 key, __u32 primary, + __u32 ngroups, __u32 *groups); +void upcall_cache_put_entry(struct upcall_cache *hash, + struct upcall_cache_entry *entry); +int upcall_cache_downcall(struct upcall_cache *hash, __u32 err, __u64 key, + __u32 primary, __u32 ngroups, __u32 *groups); void upcall_cache_flush_idle(struct upcall_cache *cache); void upcall_cache_flush_all(struct upcall_cache *cache); +struct upcall_cache 
*upcall_cache_init(const char *name); +void upcall_cache_cleanup(struct upcall_cache *hash); #endif /* _UPCALL_CACHE_H */ diff --git a/lustre/include/linux/lvfs.h b/lustre/include/linux/lvfs.h index c9604ff..858e3f5 100644 --- a/lustre/include/linux/lvfs.h +++ b/lustre/include/linux/lvfs.h @@ -25,11 +25,15 @@ #define __LVFS_H__ #include +#include #define LL_FID_NAMELEN (16 + 1 + 8 + 1) #if defined __KERNEL__ +#include #include +#else +struct group_info { /* unused */ }; #endif #ifdef LIBLUSTRE @@ -37,7 +41,9 @@ #endif /* simple.c */ + struct lvfs_ucred { + struct upcall_cache_entry *luc_uce; __u32 luc_fsuid; __u32 luc_fsgid; __u32 luc_cap; @@ -53,14 +59,19 @@ struct lvfs_callback_ops { #define OBD_RUN_CTXT_MAGIC 0xC0FFEEAA #define OBD_CTXT_DEBUG /* development-only debugging */ struct lvfs_run_ctxt { - struct vfsmount *pwdmnt; - struct dentry *pwd; - mm_segment_t fs; - struct lvfs_ucred luc; - int ngroups; + struct vfsmount *pwdmnt; + struct dentry *pwd; + mm_segment_t fs; + struct lvfs_ucred luc; + int ngroups; struct lvfs_callback_ops cb_ops; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,4) + struct group_info *group_info; +#else + struct group_info group_info; +#endif #ifdef OBD_CTXT_DEBUG - __u32 magic; + __u32 magic; #endif }; diff --git a/lustre/include/linux/obd.h b/lustre/include/linux/obd.h index 3bed2e9..399ecd9 100644 --- a/lustre/include/linux/obd.h +++ b/lustre/include/linux/obd.h @@ -366,6 +366,7 @@ struct mds_obd { struct file *mds_lov_objid_filp; unsigned long *mds_client_bitmap; struct semaphore mds_orphan_recovery_sem; + struct upcall_cache *mds_group_hash; struct lustre_quota_info mds_quota_info; struct lustre_quota_ctxt mds_quota_ctxt; atomic_t mds_quotachecking; diff --git a/lustre/include/lustre/lustre_user.h b/lustre/include/lustre/lustre_user.h index 079e23d..30e55e6 100644 --- a/lustre/include/lustre/lustre_user.h +++ b/lustre/include/lustre/lustre_user.h @@ -149,6 +149,17 @@ struct if_quotacheck { int stat; }; +#define 
MDS_GRP_DOWNCALL_MAGIC 0x6d6dd620 + +struct mds_grp_downcall_data { + __u32 mgd_magic; + __u32 mgd_err; + __u32 mgd_uid; + __u32 mgd_gid; + __u32 mgd_ngroups; + __u32 mgd_groups[0]; +}; + #ifndef __KERNEL__ #define NEED_QUOTA_DEFS #else diff --git a/lustre/liblustre/file.c b/lustre/liblustre/file.c index a5c403d..c5e3cf8 100644 --- a/lustre/liblustre/file.c +++ b/lustre/liblustre/file.c @@ -47,6 +47,30 @@ #include "llite_lib.h" +/* Pack the required supplementary groups into the supplied groups array. + * If we don't need to use the groups from the target inode(s) then we + * instead pack one or more groups from the user's supplementary group + * array in case it might be useful. Not needed if doing an MDS-side upcall. */ +void ll_i2gids(__u32 *suppgids, struct inode *i1, struct inode *i2) +{ + LASSERT(i1 != NULL); + LASSERT(suppgids != NULL); + + if (in_group_p(i1->i_stbuf.st_gid)) + suppgids[0] = i1->i_stbuf.st_gid; + else + suppgids[0] = -1; + + if (i2) { + if (in_group_p(i2->i_stbuf.st_gid)) + suppgids[1] = i2->i_stbuf.st_gid; + else + suppgids[1] = -1; + } else { + suppgids[1] = -1; + } +} + void llu_prepare_mdc_op_data(struct mdc_op_data *data, struct inode *i1, struct inode *i2, @@ -55,13 +79,14 @@ void llu_prepare_mdc_op_data(struct mdc_op_data *data, int mode) { LASSERT(i1); - - ll_i2uctxt(&data->ctxt, i1, i2); + + ll_i2gids(data->suppgids, i1, i2); ll_inode2fid(&data->fid1, i1); - if (i2) { + if (i2) ll_inode2fid(&data->fid2, i2); - } + else + memset(&data->fid2, 0, sizeof(data->fid2)); data->name = name; data->namelen = namelen; diff --git a/lustre/liblustre/genlib.sh b/lustre/liblustre/genlib.sh index f70116d..08a9037 100755 --- a/lustre/liblustre/genlib.sh +++ b/lustre/liblustre/genlib.sh @@ -1,5 +1,6 @@ #!/bin/bash #set -xv +set -e # # This script is to generate lib lustre library as a whole. 
It will leave diff --git a/lustre/liblustre/llite_lib.c b/lustre/liblustre/llite_lib.c index b8b9997..9c2e22b 100644 --- a/lustre/liblustre/llite_lib.c +++ b/lustre/liblustre/llite_lib.c @@ -62,7 +62,7 @@ static int lllib_init(void) { liblustre_set_nal_nid(); - if (liblustre_init_current("dummy") || + if (liblustre_init_current("liblustre") || init_obdclass() || init_lib_portals() || ptlrpc_init() || diff --git a/lustre/liblustre/llite_lib.h b/lustre/liblustre/llite_lib.h index 0f96e8d..0aeae7a 100644 --- a/lustre/liblustre/llite_lib.h +++ b/lustre/liblustre/llite_lib.h @@ -146,29 +146,7 @@ struct it_cb_data { obd_id hash; }; -static inline void ll_i2uctxt(struct ll_uctxt *ctxt, struct inode *i1, - struct inode *i2) -{ - struct intnl_stat *st = llu_i2stat(i1); - - LASSERT(i1); - LASSERT(ctxt); - - if (in_group_p(st->st_gid)) - ctxt->gid1 = st->st_gid; - else - ctxt->gid1 = -1; - - if (i2) { - st = llu_i2stat(i2); - if (in_group_p(st->st_gid)) - ctxt->gid2 = st->st_gid; - else - ctxt->gid2 = -1; - } else - ctxt->gid2 = 0; -} - +void ll_i2gids(__u32 *suppgids, struct inode *i1,struct inode *i2); typedef int (*intent_finish_cb)(struct ptlrpc_request *, struct inode *parent, struct pnode *pnode, diff --git a/lustre/liblustre/lutil.c b/lustre/liblustre/lutil.c index a99d4a0..8eaacd5 100644 --- a/lustre/liblustre/lutil.c +++ b/lustre/liblustre/lutil.c @@ -76,7 +76,7 @@ char *portals_nid2str(int nal, ptl_nid_t nid, char *str) return str; } - switch(nal){ + switch(NALID_FROM_IFACE(nal)){ #if !CRAY_PORTALS case TCPNAL: /* userspace NAL */ @@ -91,6 +91,11 @@ char *portals_nid2str(int nal, ptl_nid_t nid, char *str) snprintf(str, PTL_NALFMT_SIZE, "%u:%u", (__u32)(nid >> 32), (__u32)nid); break; +#else + case PTL_IFACE_SS: + case PTL_IFACE_SS_ACCEL: + snprintf(str, PTL_NALFMT_SIZE, "%u", (__u32)nid); + break; #endif default: snprintf(str, PTL_NALFMT_SIZE, "?%x? 
%llx", @@ -131,7 +136,7 @@ static int get_ipv4_addr() if (hptr == NULL || hptr->h_addrtype != AF_INET || *hptr->h_addr_list == NULL) { - printf("LibLustre: Warning: fail to get local IPv4 address\n"); + CWARN("Warning: fail to get local IPv4 address\n"); return 0; } @@ -197,8 +202,8 @@ static void init_capability(int *res) syscap = cap_get_proc(); if (!syscap) { - printf("Liblustre: Warning: failed to get system capability, " - "set to minimal\n"); + CWARN("Warning: failed to get system capability, " + "set to minimal\n"); return; } diff --git a/lustre/liblustre/namei.c b/lustre/liblustre/namei.c index 772ef7f..0df523a 100644 --- a/lustre/liblustre/namei.c +++ b/lustre/liblustre/namei.c @@ -215,9 +215,8 @@ static int pnode_revalidate_finish(struct ptlrpc_request *req, int llu_pb_revalidate(struct pnode *pnode, int flags, struct lookup_intent *it) { struct pnode_base *pb = pnode->p_base; - struct ll_fid pfid, cfid; struct it_cb_data icbd; - struct ll_uctxt ctxt; + struct mdc_op_data op_data; struct ptlrpc_request *req = NULL; struct lookup_intent lookup_it = { .it_op = IT_LOOKUP }; struct obd_export *exp; @@ -253,8 +252,6 @@ int llu_pb_revalidate(struct pnode *pnode, int flags, struct lookup_intent *it) } exp = llu_i2mdcexp(pb->pb_ino); - ll_inode2fid(&pfid, pnode->p_parent->p_base->pb_ino); - ll_inode2fid(&cfid, pb->pb_ino); icbd.icbd_parent = pnode->p_parent->p_base->pb_ino; icbd.icbd_child = pnode; @@ -263,12 +260,11 @@ int llu_pb_revalidate(struct pnode *pnode, int flags, struct lookup_intent *it) it->it_op_release = ll_intent_release; } - ll_i2uctxt(&ctxt, pnode->p_parent->p_base->pb_ino, pb->pb_ino); + llu_prepare_mdc_op_data(&op_data, pnode->p_parent->p_base->pb_ino, + pb->pb_ino, pb->pb_name.name,pb->pb_name.len,0); - rc = mdc_intent_lock(exp, &ctxt, &pfid, - pb->pb_name.name, pb->pb_name.len, - NULL, 0, &cfid, it, flags, &req, - llu_mdc_blocking_ast); + rc = mdc_intent_lock(exp, &op_data, NULL, 0, it, flags, + &req, llu_mdc_blocking_ast); /* If req is NULL, 
then mdc_intent_lock only tried to do a lock match; * if all was well, it will return 1 if it found locks, 0 otherwise. */ if (req == NULL && rc >= 0) @@ -411,8 +407,7 @@ struct inode *llu_inode_from_lock(struct ldlm_lock *lock) static int llu_lookup_it(struct inode *parent, struct pnode *pnode, struct lookup_intent *it, int flags) { - struct ll_fid pfid; - struct ll_uctxt ctxt; + struct mdc_op_data op_data; struct it_cb_data icbd; struct ptlrpc_request *req = NULL; struct lookup_intent lookup_it = { .it_op = IT_LOOKUP }; @@ -429,18 +424,16 @@ static int llu_lookup_it(struct inode *parent, struct pnode *pnode, icbd.icbd_child = pnode; icbd.icbd_parent = parent; - icbd.icbd_child = pnode; - ll_inode2fid(&pfid, parent); - ll_i2uctxt(&ctxt, parent, NULL); - - rc = mdc_intent_lock(llu_i2mdcexp(parent), &ctxt, &pfid, - pnode->p_base->pb_name.name, - pnode->p_base->pb_name.len, - NULL, 0, NULL, it, flags, &req, - llu_mdc_blocking_ast); + + llu_prepare_mdc_op_data(&op_data, parent, NULL, + pnode->p_base->pb_name.name, + pnode->p_base->pb_name.len, flags); + + rc = mdc_intent_lock(llu_i2mdcexp(parent), &op_data, NULL, 0, it, + flags, &req, llu_mdc_blocking_ast); if (rc < 0) GOTO(out, rc); - + rc = lookup_it_finish(req, 1, it, &icbd); if (rc != 0) { ll_intent_release(it); diff --git a/lustre/liblustre/rw.c b/lustre/liblustre/rw.c index 4e38460..f548b88 100644 --- a/lustre/liblustre/rw.c +++ b/lustre/liblustre/rw.c @@ -332,7 +332,7 @@ static void llu_ap_fill_obdo(void *data, int cmd, struct obdo *oa) oa->o_id = lsm->lsm_object_id; oa->o_valid = OBD_MD_FLID; valid_flags = OBD_MD_FLTYPE | OBD_MD_FLATIME; - if (cmd == OBD_BRW_WRITE) + if (cmd & OBD_BRW_WRITE) valid_flags |= OBD_MD_FLMTIME | OBD_MD_FLCTIME; obdo_from_inode(oa, inode, valid_flags); @@ -351,7 +351,7 @@ static void llu_ap_completion(void *data, int cmd, struct obdo *oa, int rc) page = llap->llap_page; if (rc != 0) { - if (cmd == OBD_BRW_WRITE) + if (cmd & OBD_BRW_WRITE) CERROR("writeback error on page %p index %ld: 
%d\n", page, page->index, rc); } diff --git a/lustre/liblustre/super.c b/lustre/liblustre/super.c index 76bf207..2b4b71a 100644 --- a/lustre/liblustre/super.c +++ b/lustre/liblustre/super.c @@ -835,7 +835,8 @@ static int llu_iop_symlink_raw(struct pnode *pno, const char *tgt) llu_prepare_mdc_op_data(&op_data, dir, NULL, name, len, 0); err = mdc_create(sbi->ll_mdc_exp, &op_data, tgt, strlen(tgt) + 1, S_IFLNK | S_IRWXUGO, - current->fsuid, current->fsgid, 0, &request); + current->fsuid, current->fsgid, current->cap_effective, + 0, &request); ptlrpc_req_finished(request); RETURN(err); } @@ -957,7 +958,8 @@ static int llu_iop_mknod_raw(struct pnode *pno, pno->p_base->pb_name.len, 0); err = mdc_create(sbi->ll_mdc_exp, &op_data, NULL, 0, mode, - current->fsuid, current->fsgid, dev, &request); + current->fsuid, current->fsgid, + current->cap_effective, dev, &request); ptlrpc_req_finished(request); break; case S_IFDIR: @@ -1181,7 +1183,8 @@ static int llu_iop_mkdir_raw(struct pnode *pno, mode_t mode) llu_prepare_mdc_op_data(&op_data, dir, NULL, name, len, 0); err = mdc_create(llu_i2sbi(dir)->ll_mdc_exp, &op_data, NULL, 0, mode, - current->fsuid, current->fsgid, 0, &request); + current->fsuid, current->fsgid, current->cap_effective, + 0, &request); ptlrpc_req_finished(request); RETURN(err); } diff --git a/lustre/llite/dcache.c b/lustre/llite/dcache.c index 8f55aa4..55759ef 100644 --- a/lustre/llite/dcache.c +++ b/lustre/llite/dcache.c @@ -272,9 +272,8 @@ int ll_revalidate_it(struct dentry *de, int lookup_flags, struct lookup_intent *it) { int rc; - struct ll_fid pfid, cfid; struct it_cb_data icbd; - struct ll_uctxt ctxt; + struct mdc_op_data op_data; struct ptlrpc_request *req = NULL; struct lookup_intent lookup_it = { .it_op = IT_LOOKUP }; struct obd_export *exp; @@ -288,8 +287,6 @@ int ll_revalidate_it(struct dentry *de, int lookup_flags, RETURN(0); exp = ll_i2mdcexp(de->d_inode); - ll_inode2fid(&pfid, de->d_parent->d_inode); - ll_inode2fid(&cfid, de->d_inode); 
icbd.icbd_parent = de->d_parent->d_inode; icbd.icbd_childp = &de; @@ -302,11 +299,11 @@ int ll_revalidate_it(struct dentry *de, int lookup_flags, ll_frob_intent(&it, &lookup_it); LASSERT(it); - ll_i2uctxt(&ctxt, de->d_parent->d_inode, de->d_inode); + ll_prepare_mdc_op_data(&op_data, de->d_parent->d_inode, de->d_inode, + de->d_name.name, de->d_name.len, 0); - rc = mdc_intent_lock(exp, &ctxt, &pfid, de->d_name.name, de->d_name.len, - NULL, 0, - &cfid, it, lookup_flags, &req,ll_mdc_blocking_ast); + rc = mdc_intent_lock(exp, &op_data, NULL, 0, it, lookup_flags, + &req, ll_mdc_blocking_ast); /* If req is NULL, then mdc_intent_lock only tried to do a lock match; * if all was well, it will return 1 if it found locks, 0 otherwise. */ if (req == NULL && rc >= 0) diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h index 64a5d45..47e33f3 100644 --- a/lustre/llite/llite_internal.h +++ b/lustre/llite/llite_internal.h @@ -224,32 +224,14 @@ static inline struct inode *ll_info2i(struct ll_inode_info *lli) #endif } -static inline void ll_i2uctxt(struct ll_uctxt *ctxt, struct inode *i1, - struct inode *i2) -{ - LASSERT(i1); - LASSERT(ctxt); - - if (in_group_p(i1->i_gid)) - ctxt->gid1 = i1->i_gid; - else - ctxt->gid1 = -1; - - if (i2) { - if (in_group_p(i2->i_gid)) - ctxt->gid2 = i2->i_gid; - else - ctxt->gid2 = -1; - } else - ctxt->gid2 = 0; -} - struct it_cb_data { struct inode *icbd_parent; struct dentry **icbd_childp; obd_id hash; }; +void ll_i2gids(__u32 *suppgids, struct inode *i1,struct inode *i2); + #define LLAP_MAGIC 98764321 extern kmem_cache_t *ll_async_page_slab; diff --git a/lustre/llite/namei.c b/lustre/llite/namei.c index d9571c7..c70d4b1 100644 --- a/lustre/llite/namei.c +++ b/lustre/llite/namei.c @@ -195,6 +195,45 @@ int ll_mdc_cancel_unused(struct lustre_handle *conn, struct inode *inode, opaque)); } +/* Pack the required supplementary groups into the supplied groups array. 
+ * If we don't need to use the groups from the target inode(s) then we + * instead pack one or more groups from the user's supplementary group + * array in case it might be useful. Not needed if doing an MDS-side upcall. */ +void ll_i2gids(__u32 *suppgids, struct inode *i1, struct inode *i2) +{ + int i; + + LASSERT(i1 != NULL); + LASSERT(suppgids != NULL); + + if (in_group_p(i1->i_gid)) + suppgids[0] = i1->i_gid; + else + suppgids[0] = -1; + + if (i2) { + if (in_group_p(i2->i_gid)) + suppgids[1] = i2->i_gid; + else + suppgids[1] = -1; + } else { + suppgids[1] = -1; + } + + for (i = 0; i < current_ngroups; i++) { + if (suppgids[0] == -1) { + if (current_groups[i] != suppgids[1]) + suppgids[0] = current_groups[i]; + continue; + } + if (suppgids[1] == -1) { + if (current_groups[i] != suppgids[0]) + suppgids[1] = current_groups[i]; + continue; + } + break; + } +} void ll_prepare_mdc_op_data(struct mdc_op_data *data, struct inode *i1, struct inode *i2, const char *name, int namelen, @@ -202,11 +241,13 @@ void ll_prepare_mdc_op_data(struct mdc_op_data *data, struct inode *i1, { LASSERT(i1); - ll_i2uctxt(&data->ctxt, i1, i2); + ll_i2gids(data->suppgids, i1, i2); ll_inode2fid(&data->fid1, i1); if (i2) ll_inode2fid(&data->fid2, i2); + else + memset(&data->fid2, 0, sizeof(data->fid2)); data->name = name; data->namelen = namelen; @@ -348,8 +389,7 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, struct lookup_intent *it, int lookup_flags) { struct dentry *save = dentry, *retval; - struct ll_fid pfid; - struct ll_uctxt ctxt; + struct mdc_op_data op_data; struct it_cb_data icbd; struct ptlrpc_request *req = NULL; struct lookup_intent lookup_it = { .it_op = IT_LOOKUP }; @@ -370,14 +410,15 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, icbd.icbd_childp = &dentry; icbd.icbd_parent = parent; - ll_inode2fid(&pfid, parent); - ll_i2uctxt(&ctxt, parent, NULL); + + ll_prepare_mdc_op_data(&op_data, parent, NULL, 
dentry->d_name.name, + dentry->d_name.len, lookup_flags); it->it_create_mode &= ~current->fs->umask; - rc = mdc_intent_lock(ll_i2mdcexp(parent), &ctxt, &pfid, - dentry->d_name.name, dentry->d_name.len, NULL, 0, - NULL, it, lookup_flags, &req, ll_mdc_blocking_ast); + rc = mdc_intent_lock(ll_i2mdcexp(parent), &op_data, NULL, 0, it, + lookup_flags, &req, ll_mdc_blocking_ast); + if (rc < 0) GOTO(out, retval = ERR_PTR(rc)); @@ -541,7 +582,7 @@ static int ll_mknod_raw(struct nameidata *nd, int mode, dev_t rdev) nd->last.len, 0); err = mdc_create(sbi->ll_mdc_exp, &op_data, NULL, 0, mode, current->fsuid, current->fsgid, - rdev, &request); + current->cap_effective, rdev, &request); if (err == 0) ll_update_times(request, 0, dir); ptlrpc_req_finished(request); @@ -583,7 +624,7 @@ static int ll_mknod(struct inode *dir, struct dentry *dchild, int mode, dchild->d_name.len, 0); err = mdc_create(sbi->ll_mdc_exp, &op_data, NULL, 0, mode, current->fsuid, current->fsgid, - rdev, &request); + current->cap_effective, rdev, &request); if (err) GOTO(out_err, err); @@ -624,7 +665,8 @@ static int ll_symlink_raw(struct nameidata *nd, const char *tgt) nd->last.len, 0); err = mdc_create(sbi->ll_mdc_exp, &op_data, tgt, strlen(tgt) + 1, S_IFLNK | S_IRWXUGO, - current->fsuid, current->fsgid, 0, &request); + current->fsuid, current->fsgid, current->cap_effective, + 0, &request); if (err == 0) ll_update_times(request, 0, dir); @@ -674,7 +716,8 @@ static int ll_mkdir_raw(struct nameidata *nd, int mode) ll_prepare_mdc_op_data(&op_data, dir, NULL, nd->last.name, nd->last.len, 0); err = mdc_create(sbi->ll_mdc_exp, &op_data, NULL, 0, mode, - current->fsuid, current->fsgid, 0, &request); + current->fsuid, current->fsgid, current->cap_effective, + 0, &request); if (err == 0) ll_update_times(request, 0, dir); diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c index d7f721e..b8f2b2c 100644 --- a/lustre/llite/rw.c +++ b/lustre/llite/rw.c @@ -70,7 +70,7 @@ static int ll_brw(int cmd, struct inode *inode, 
struct obdo *oa, pg.pg = page; pg.off = ((obd_off)page->index) << PAGE_SHIFT; - if (cmd == OBD_BRW_WRITE && (pg.off + PAGE_SIZE > inode->i_size)) + if ((cmd & OBD_BRW_WRITE) && (pg.off + PAGE_SIZE > inode->i_size)) pg.count = inode->i_size % PAGE_SIZE; else pg.count = PAGE_SIZE; @@ -87,7 +87,7 @@ static int ll_brw(int cmd, struct inode *inode, struct obdo *oa, pg.flag = flags; - if (cmd == OBD_BRW_WRITE) + if (cmd & OBD_BRW_WRITE) lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_BRW_WRITE, pg.count); else @@ -266,7 +266,7 @@ static int ll_ap_make_ready(void *data, int cmd) llap = LLAP_FROM_COOKIE(data); page = llap->llap_page; - LASSERT(cmd != OBD_BRW_READ); + LASSERT(!(cmd & OBD_BRW_READ)); /* we're trying to write, but the page is locked.. come back later */ if (TryLockPage(page)) @@ -343,7 +343,7 @@ void ll_inode_fill_obdo(struct inode *inode, int cmd, struct obdo *oa) oa->o_id = lsm->lsm_object_id; oa->o_valid = OBD_MD_FLID; valid_flags = OBD_MD_FLTYPE | OBD_MD_FLATIME; - if (cmd == OBD_BRW_WRITE) { + if (cmd & OBD_BRW_WRITE) { oa->o_valid |= OBD_MD_FLIFID | OBD_MD_FLEPOCH; mdc_pack_fid(obdo_fid(oa), inode->i_ino, 0, inode->i_mode); oa->o_easize = ll_i2info(inode)->lli_io_epoch; @@ -580,14 +580,14 @@ static int queue_or_sync_write(struct obd_export *exp, struct inode *inode, unsigned long size_index = inode->i_size >> PAGE_SHIFT; struct obd_io_group *oig; struct ll_sb_info *sbi = ll_i2sbi(inode); - int rc; + int rc, noquot = capable(CAP_SYS_RESOURCE) ? 
OBD_BRW_NOQUOTA : 0; ENTRY; /* _make_ready only sees llap once we've unlocked the page */ llap->llap_write_queued = 1; rc = obd_queue_async_io(exp, ll_i2info(inode)->lli_smd, NULL, - llap->llap_cookie, OBD_BRW_WRITE, 0, 0, 0, - async_flags); + llap->llap_cookie, OBD_BRW_WRITE | noquot, + 0, 0, 0, async_flags); if (rc == 0) { LL_CDEBUG_PAGE(D_PAGE, llap->llap_page, "write queued\n"); //llap_write_pending(inode, llap); @@ -632,8 +632,8 @@ static int queue_or_sync_write(struct obd_export *exp, struct inode *inode, } rc = obd_queue_group_io(exp, ll_i2info(inode)->lli_smd, NULL, oig, - llap->llap_cookie, OBD_BRW_WRITE, 0, to, 0, - ASYNC_READY | ASYNC_URGENT | + llap->llap_cookie, OBD_BRW_WRITE | noquot, + 0, to, 0, ASYNC_READY | ASYNC_URGENT | ASYNC_COUNT_STABLE | ASYNC_GROUP_SYNC); if (rc) GOTO(free_oig, rc); @@ -762,11 +762,11 @@ void ll_ap_completion(void *data, int cmd, struct obdo *oa, int rc) LL_CDEBUG_PAGE(D_PAGE, page, "completing cmd %d with %d\n", cmd, rc); - if (cmd == OBD_BRW_READ && llap->llap_defer_uptodate) + if (cmd & OBD_BRW_READ && llap->llap_defer_uptodate) ll_ra_count_put(ll_i2sbi(page->mapping->host), 1); if (rc == 0) { - if (cmd == OBD_BRW_READ) { + if (cmd & OBD_BRW_READ) { if (!llap->llap_defer_uptodate) SetPageUptodate(page); } else { @@ -774,7 +774,7 @@ void ll_ap_completion(void *data, int cmd, struct obdo *oa, int rc) } ClearPageError(page); } else { - if (cmd == OBD_BRW_READ) { + if (cmd & OBD_BRW_READ) { llap->llap_defer_uptodate = 0; } else { ll_redirty_page(page); @@ -784,7 +784,7 @@ void ll_ap_completion(void *data, int cmd, struct obdo *oa, int rc) unlock_page(page); - if (0 && cmd == OBD_BRW_WRITE) { + if (0 && cmd & OBD_BRW_WRITE) { llap_write_complete(page->mapping->host, llap); ll_try_done_writing(page->mapping->host); } @@ -891,8 +891,8 @@ static int ll_issue_page_read(struct obd_export *exp, llap->llap_ra_used = 0; rc = obd_queue_group_io(exp, ll_i2info(page->mapping->host)->lli_smd, NULL, oig, llap->llap_cookie, OBD_BRW_READ, 0, 
- PAGE_SIZE, 0, ASYNC_COUNT_STABLE | ASYNC_READY - | ASYNC_URGENT); + PAGE_SIZE, 0, ASYNC_COUNT_STABLE | ASYNC_READY | + ASYNC_URGENT); if (rc) { LL_CDEBUG_PAGE(D_ERROR, page, "read queue failed: rc %d\n", rc); page_cache_release(page); diff --git a/lustre/lvfs/Makefile.in b/lustre/lvfs/Makefile.in index 02fb755..dc286b7 100644 --- a/lustre/lvfs/Makefile.in +++ b/lustre/lvfs/Makefile.in @@ -1,7 +1,7 @@ MODULES := lvfs #quotactl_test quotacheck_test @SERVER_TRUE@MODULES += fsfilt_@BACKINGFS@ -lvfs-objs := lvfs_common.o lvfs_linux.o fsfilt.o +lvfs-objs := lvfs_common.o lvfs_linux.o fsfilt.o upcall_cache.o #quotactl-objs := quotactl_test.o #quotaccheck-objs := quotacheck_test.o diff --git a/lustre/lvfs/lvfs_linux.c b/lustre/lvfs/lvfs_linux.c index 52f0f0d..b47be6b 100644 --- a/lustre/lvfs/lvfs_linux.c +++ b/lustre/lvfs/lvfs_linux.c @@ -65,13 +65,57 @@ int obd_memmax; # define ASSERT_KERNEL_CTXT(msg) do {} while(0) #endif -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,4)) -#define current_ngroups current->group_info->ngroups -#define current_groups current->group_info->small_block +static void push_group_info(struct lvfs_run_ctxt *save, + struct upcall_cache_entry *uce) +{ + struct group_info *ginfo = uce ? 
uce->ue_group_info : NULL; + + if (!ginfo) { + save->ngroups = current_ngroups; + current_ngroups = 0; + } else { +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,4) + task_lock(current); + save->group_info = current->group_info; + current->group_info = ginfo; + task_unlock(current); #else -#define current_ngroups current->ngroups -#define current_groups current->groups + LASSERT(ginfo->ngroups <= NGROUPS); + LASSERT(current->ngroups <= NGROUPS_SMALL); + /* save old */ + save->group_info.ngroups = current->ngroups; + if (current->ngroups) + memcpy(save->group_info.small_block, current->groups, + current->ngroups * sizeof(gid_t)); + /* push new */ + current->ngroups = ginfo->ngroups; + if (ginfo->ngroups) + memcpy(current->groups, ginfo->small_block, + current->ngroups * sizeof(gid_t)); #endif + } +} + +static void pop_group_info(struct lvfs_run_ctxt *save, + struct upcall_cache_entry *uce) +{ + struct group_info *ginfo = uce ? uce->ue_group_info : NULL; + + if (!ginfo) { + current_ngroups = save->ngroups; + } else { +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,4) + task_lock(current); + current->group_info = save->group_info; + task_unlock(current); +#else + current->ngroups = save->group_info.ngroups; + if (current->ngroups) + memcpy(current->groups, save->group_info.small_block, + current->ngroups * sizeof(gid_t)); +#endif + } +} /* push / pop to root of obd store */ void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx, @@ -97,7 +141,6 @@ void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx, LASSERT(atomic_read(&new_ctx->pwd->d_count)); save->pwd = dget(current->fs->pwd); save->pwdmnt = mntget(current->fs->pwdmnt); - save->ngroups = current_ngroups; save->luc.luc_umask = current->fs->umask; LASSERT(save->pwd); @@ -109,25 +152,11 @@ void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx, save->luc.luc_fsuid = current->fsuid; save->luc.luc_fsgid = current->fsgid; save->luc.luc_cap = current->cap_effective; - 
save->luc.luc_suppgid1 = current_groups[0]; - save->luc.luc_suppgid2 = current_groups[1]; current->fsuid = uc->luc_fsuid; current->fsgid = uc->luc_fsgid; current->cap_effective = uc->luc_cap; - current_ngroups = 0; - - if (uc->luc_suppgid1 != -1) - current_groups[current_ngroups++] = uc->luc_suppgid1; - if (uc->luc_suppgid2 != -1) - current_groups[current_ngroups++] = uc->luc_suppgid2; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,4) - if (uc->luc_suppgid1 != -1 && uc->luc_suppgid2 != -1 && - (uc->luc_suppgid1 > uc->luc_suppgid2)) { - current_groups[0] = uc->luc_suppgid2; - current_groups[1] = uc->luc_suppgid1; - } -#endif + push_group_info(save, uc->luc_uce); } current->fs->umask = 0; /* umask already applied on client */ set_fs(new_ctx->fs); @@ -178,9 +207,7 @@ void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx, current->fsuid = saved->luc.luc_fsuid; current->fsgid = saved->luc.luc_fsgid; current->cap_effective = saved->luc.luc_cap; - current_ngroups = saved->ngroups; - current_groups[0] = saved->luc.luc_suppgid1; - current_groups[1] = saved->luc.luc_suppgid2; + pop_group_info(saved, uc->luc_uce); } /* diff --git a/lustre/lvfs/upcall_cache.c b/lustre/lvfs/upcall_cache.c new file mode 100644 index 0000000..a8bb2b6 --- /dev/null +++ b/lustre/lvfs/upcall_cache.c @@ -0,0 +1,519 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Supplementary groups cache. + * + * Copyright (c) 2004 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,4) +struct group_info *groups_alloc(int ngroups) +{ + struct group_info *ginfo; + + LASSERT(ngroups <= NGROUPS_SMALL); + + OBD_ALLOC(ginfo, sizeof(*ginfo) + 1 * sizeof(gid_t *)); + if (!ginfo) + return NULL; + ginfo->ngroups = ngroups; + ginfo->nblocks = 1; + ginfo->blocks[0] = ginfo->small_block; + atomic_set(&ginfo->usage, 1); + + return ginfo; +} + +void groups_free(struct group_info *ginfo) +{ + if (!ginfo) + return; + + LASSERT(ginfo->ngroups <= NGROUPS_SMALL); + LASSERT(ginfo->nblocks == 1); + LASSERT(ginfo->blocks[0] == ginfo->small_block); + + OBD_FREE(ginfo, sizeof(*ginfo) + 1 * sizeof(gid_t *)); +} +#endif + +static struct upcall_cache_entry *alloc_entry(__u64 key) +{ + struct upcall_cache_entry *entry; + + OBD_ALLOC(entry, sizeof(*entry)); + if (!entry) + return NULL; + + UC_CACHE_SET_NEW(entry); + INIT_LIST_HEAD(&entry->ue_hash); + entry->ue_key = key; + atomic_set(&entry->ue_refcount, 0); + init_waitqueue_head(&entry->ue_waitq); + return entry; +} + +/* protected by hash lock */ +static void free_entry(struct upcall_cache_entry *entry) +{ + groups_free(entry->ue_group_info); + list_del(&entry->ue_hash); + CDEBUG(D_OTHER, "destroy cache entry %p for key "LPU64"\n", + entry, entry->ue_key); + OBD_FREE(entry, sizeof(*entry)); +} + +static void get_entry(struct upcall_cache_entry *entry) +{ + atomic_inc(&entry->ue_refcount); +} + +static void put_entry(struct upcall_cache_entry *entry) +{ + if 
(atomic_dec_and_test(&entry->ue_refcount) && + (UC_CACHE_IS_INVALID(entry) || UC_CACHE_IS_EXPIRED(entry))) { + free_entry(entry); + } +} + +static int check_unlink_entry(struct upcall_cache_entry *entry) +{ + if (UC_CACHE_IS_VALID(entry) && + time_before(jiffies, entry->ue_expire)) + return 0; + + if (UC_CACHE_IS_ACQUIRING(entry)) { + if (time_before(jiffies, entry->ue_acquire_expire)) + return 0; + + UC_CACHE_SET_EXPIRED(entry); + wake_up_all(&entry->ue_waitq); + } else if (!UC_CACHE_IS_INVALID(entry)) { + UC_CACHE_SET_EXPIRED(entry); + } + + list_del_init(&entry->ue_hash); + if (!atomic_read(&entry->ue_refcount)) + free_entry(entry); + return 1; +} + +static int refresh_entry(struct upcall_cache *hash, + struct upcall_cache_entry *entry) +{ + char *argv[4]; + char *envp[3]; + char keystr[16]; + int rc; + ENTRY; + + snprintf(keystr, 16, LPU64, entry->ue_key); + + CDEBUG(D_INFO, "The groups upcall is: %s \n", hash->uc_upcall); + argv[0] = hash->uc_upcall; + argv[1] = hash->uc_name; + argv[2] = keystr; + argv[3] = NULL; + + envp[0] = "HOME=/"; + envp[1] = "PATH=/sbin:/usr/sbin"; + envp[2] = NULL; + + rc = USERMODEHELPER(argv[0], argv, envp); + if (rc < 0) { + CERROR("%s: error invoking getgroups upcall %s %s %s: rc %d; " + "check /proc/fs/lustre/mds/%s/group_upcall\n", + hash->uc_name, argv[0], argv[1], argv[2], rc, argv[1]); + } else { + CDEBUG(D_HA, "%s: invoked upcall %s %s %s\n", hash->uc_name, + argv[0], argv[1], argv[2]); + rc = 0; + } + RETURN(rc); +} + +static int entry_set_group_info(struct upcall_cache_entry *entry, __u32 primary, + __u32 ngroups, __u32 *groups) +{ + struct group_info *ginfo; + int i, j; + ENTRY; + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,4) + if (ngroups > NGROUPS) + ngroups = NGROUPS; +#endif + + if (ngroups > NGROUPS_MAX) { + CERROR("using first %d supplementary groups for uid "LPU64"\n", + NGROUPS_MAX, entry->ue_key); + ngroups = NGROUPS_MAX; + } + + ginfo = groups_alloc(ngroups); + if (!ginfo) { + CERROR("uid "LPU64" update can't 
alloc ginfo for %d groups\n", + entry->ue_key, ngroups); + RETURN(-ENOMEM); + } + entry->ue_group_info = ginfo; + entry->ue_primary = primary; + + for (i = 0; i < ginfo->nblocks; i++) { + int cp_count = min(NGROUPS_PER_BLOCK, (int)ngroups); + int off = i * NGROUPS_PER_BLOCK; + + for (j = 0; j < cp_count; j++) + ginfo->blocks[i][j] = groups[off + j]; + + ngroups -= cp_count; + } + RETURN(0); +} + +struct upcall_cache_entry *upcall_cache_get_entry(struct upcall_cache *hash, + __u64 key, __u32 primary, + __u32 ngroups, __u32 *groups) +{ + struct upcall_cache_entry *entry = NULL, *new = NULL, *next; + struct list_head *head; + wait_queue_t wait; + int rc, found; + ENTRY; + + LASSERT(hash); + + if (strcmp(hash->uc_upcall, "NONE") == 0) { + new = alloc_entry(key); + if (!new) { + CERROR("fail to alloc entry\n"); + RETURN(NULL); + } + get_entry(new); + + /* We have to sort the groups for 2.6 kernels */ + LASSERT(ngroups <= 2); + if (ngroups == 2 && groups[1] == -1) + ngroups--; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,4) + /* 2.6 needs groups array sorted */ + if (ngroups == 2 && groups[0] > groups[1]) { + __u32 tmp = groups[1]; + groups[1] = groups[0]; + groups[0] = tmp; + } +#endif + if (ngroups > 0 && groups[0] == -1) { + groups[0] = groups[1]; + ngroups--; + } + + rc = entry_set_group_info(new, primary, ngroups, groups); + + /* We can't cache this entry as it only has a subset of + * the user's groups, as sent in suppgid1, suppgid2. 
*/ + UC_CACHE_SET_EXPIRED(new); + RETURN(new); + } + head = &hash->uc_hashtable[UC_CACHE_HASH_INDEX(key)]; +find_again: + found = 0; + spin_lock(&hash->uc_lock); + list_for_each_entry_safe(entry, next, head, ue_hash) { + /* check invalid & expired items */ + if (check_unlink_entry(entry)) + continue; + if (entry->ue_key == key) { + found = 1; + break; + } + } + + if (!found) { /* didn't find it */ + if (!new) { + spin_unlock(&hash->uc_lock); + new = alloc_entry(key); + if (!new) { + CERROR("fail to alloc entry\n"); + RETURN(ERR_PTR(-ENOMEM)); + } + goto find_again; + } else { + list_add(&new->ue_hash, head); + entry = new; + } + } else { + if (new) { + free_entry(new); + new = NULL; + } + list_move(&entry->ue_hash, head); + } + get_entry(entry); + + /* acquire for new one */ + if (UC_CACHE_IS_NEW(entry)) { + UC_CACHE_SET_ACQUIRING(entry); + UC_CACHE_CLEAR_NEW(entry); + entry->ue_acquire_expire = jiffies + hash->uc_acquire_expire; + spin_unlock(&hash->uc_lock); + rc = refresh_entry(hash, entry); + spin_lock(&hash->uc_lock); + if (rc < 0) { + UC_CACHE_CLEAR_ACQUIRING(entry); + UC_CACHE_SET_INVALID(entry); + } + /* fall through */ + } + /* someone (and only one) is doing upcall upon + * this item, just wait it complete + */ + if (UC_CACHE_IS_ACQUIRING(entry)) { + init_waitqueue_entry(&wait, current); + add_wait_queue(&entry->ue_waitq, &wait); + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock(&hash->uc_lock); + + schedule_timeout(hash->uc_acquire_expire); + + spin_lock(&hash->uc_lock); + remove_wait_queue(&entry->ue_waitq, &wait); + if (UC_CACHE_IS_ACQUIRING(entry)) { + static unsigned long next; + /* we're interrupted or upcall failed in the middle */ + if (time_after(jiffies, next)) { + CERROR("key "LPU64" update failed: check %s\n", + entry->ue_key, hash->uc_upcall); + next = jiffies + 1800; + } + put_entry(entry); + GOTO(out, entry = ERR_PTR(-EIDRM)); + } + /* fall through */ + } + + /* invalid means error, don't need to try again */ + if 
(UC_CACHE_IS_INVALID(entry)) { + put_entry(entry); + GOTO(out, entry = ERR_PTR(-EIDRM)); + } + + /* check expired + * We can't refresh the existing one because some + * memory might be shared by multiple processes. + */ + if (check_unlink_entry(entry)) { + /* if expired, try again. but if this entry is + * created by me but too quickly turn to expired + * without any error, should at least give a + * chance to use it once. + */ + if (entry != new) { + put_entry(entry); + spin_unlock(&hash->uc_lock); + new = NULL; + goto find_again; + } + } + + /* Now we know it's good */ +out: + spin_unlock(&hash->uc_lock); + RETURN(entry); +} +EXPORT_SYMBOL(upcall_cache_get_entry); + +void upcall_cache_put_entry(struct upcall_cache *hash, + struct upcall_cache_entry *entry) +{ + ENTRY; + + if (!entry) { + EXIT; + return; + } + + LASSERT(atomic_read(&entry->ue_refcount) > 0); + spin_lock(&hash->uc_lock); + put_entry(entry); + spin_unlock(&hash->uc_lock); + EXIT; +} +EXPORT_SYMBOL(upcall_cache_put_entry); + +int upcall_cache_downcall(struct upcall_cache *hash, __u32 err, __u64 key, + __u32 primary, __u32 ngroups, __u32 *groups) +{ + struct upcall_cache_entry *entry = NULL; + struct list_head *head; + int found = 0, rc = 0; + ENTRY; + + LASSERT(hash); + + head = &hash->uc_hashtable[UC_CACHE_HASH_INDEX(key)]; + + spin_lock(&hash->uc_lock); + list_for_each_entry(entry, head, ue_hash) { + if (entry->ue_key == key) { + found = 1; + get_entry(entry); + break; + } + } + + if (!found) { + CDEBUG(D_OTHER, "%s: upcall for key "LPU64" not expected\n", + hash->uc_name, entry->ue_key); + /* haven't found, it's possible */ + spin_unlock(&hash->uc_lock); + RETURN(-EINVAL); + } + + if (err) { + CDEBUG(D_OTHER, "%s: upcall for key "LPU64" returned %d\n", + hash->uc_name, entry->ue_key, err); + GOTO(out, rc = -EINVAL); + } + + if (!UC_CACHE_IS_ACQUIRING(entry)) { + CERROR("%s: found uptodate entry %p (key "LPU64") in ioctl\n", + hash->uc_name, entry, entry->ue_key); + GOTO(out, rc = 0); + } + + if 
(UC_CACHE_IS_INVALID(entry) || UC_CACHE_IS_EXPIRED(entry)) { + CERROR("%s: found a stale entry %p (key "LPU64") in ioctl\n", + hash->uc_name, entry, entry->ue_key); + GOTO(out, rc = -EINVAL); + } + + spin_unlock(&hash->uc_lock); + rc = entry_set_group_info(entry, primary, ngroups, groups); + spin_lock(&hash->uc_lock); + if (rc) + GOTO(out, rc); + + entry->ue_expire = jiffies + hash->uc_entry_expire; + UC_CACHE_SET_VALID(entry); + CDEBUG(D_OTHER, "%s: created upcall cache entry %p for key "LPU64"\n", + hash->uc_name, entry, entry->ue_key); +out: + if (rc) { + UC_CACHE_SET_INVALID(entry); + list_del_init(&entry->ue_hash); + } + UC_CACHE_CLEAR_ACQUIRING(entry); + spin_unlock(&hash->uc_lock); + wake_up_all(&entry->ue_waitq); + put_entry(entry); + + RETURN(rc); +} +EXPORT_SYMBOL(upcall_cache_downcall); + +static void cache_flush(struct upcall_cache *hash, int force) +{ + struct upcall_cache_entry *entry, *next; + int i; + ENTRY; + + spin_lock(&hash->uc_lock); + for (i = 0; i < UC_CACHE_HASH_SIZE; i++) { + list_for_each_entry_safe(entry, next, + &hash->uc_hashtable[i], ue_hash) { + if (!force && atomic_read(&entry->ue_refcount)) { + UC_CACHE_SET_EXPIRED(entry); + continue; + } + LASSERT(!atomic_read(&entry->ue_refcount)); + free_entry(entry); + } + } + spin_unlock(&hash->uc_lock); + EXIT; +} + +void upcall_cache_flush_idle(struct upcall_cache *cache) +{ + cache_flush(cache, 0); +} +EXPORT_SYMBOL(upcall_cache_flush_idle); + +void upcall_cache_flush_all(struct upcall_cache *cache) +{ + cache_flush(cache, 1); +} +EXPORT_SYMBOL(upcall_cache_flush_all); + +struct upcall_cache *upcall_cache_init(const char *name) +{ + struct upcall_cache *hash; + int i; + ENTRY; + + OBD_ALLOC(hash, sizeof(*hash)); + if (!hash) + RETURN(ERR_PTR(-ENOMEM)); + + spin_lock_init(&hash->uc_lock); + for (i = 0; i < UC_CACHE_HASH_SIZE; i++) + INIT_LIST_HEAD(&hash->uc_hashtable[i]); + strncpy(hash->uc_name, name, sizeof(hash->uc_name) - 1); + /* set default value, proc tunable */ + 
strcpy(hash->uc_upcall, "NONE"); + hash->uc_entry_expire = 5 * 60 * HZ; + hash->uc_acquire_expire = 5 * HZ; + + RETURN(hash); +} +EXPORT_SYMBOL(upcall_cache_init); + +void upcall_cache_cleanup(struct upcall_cache *hash) +{ + if (!hash) + return; + upcall_cache_flush_all(hash); + OBD_FREE(hash, sizeof(*hash)); +} +EXPORT_SYMBOL(upcall_cache_cleanup); diff --git a/lustre/mdc/mdc_internal.h b/lustre/mdc/mdc_internal.h index c08b0ab..8d79fe3 100644 --- a/lustre/mdc/mdc_internal.h +++ b/lustre/mdc/mdc_internal.h @@ -10,8 +10,9 @@ void mdc_setattr_pack(struct ptlrpc_request *req, struct iattr *iattr, void *ea, int ealen, void *ea2, int ea2len); void mdc_create_pack(struct ptlrpc_request *req, int offset, - struct mdc_op_data *op_data, __u32 mode, __u64 rdev, - const void *data, int datalen); + struct mdc_op_data *op_data, const void *data, int datalen, + __u32 mode, __u32 uid, __u32 gid, __u32 cap_effective, + __u64 rdev); void mdc_open_pack(struct ptlrpc_request *req, int offset, struct mdc_op_data *op_data, __u32 mode, __u64 rdev, __u32 flags, const void *data, int datalen); diff --git a/lustre/mdc/mdc_lib.c b/lustre/mdc/mdc_lib.c index 740d22b..8bfc1a6 100644 --- a/lustre/mdc/mdc_lib.c +++ b/lustre/mdc/mdc_lib.c @@ -71,23 +71,24 @@ void mdc_pack_req_body(struct ptlrpc_request *req) /* packing of MDS records */ void mdc_create_pack(struct ptlrpc_request *req, int offset, - struct mdc_op_data *op_data, __u32 mode, __u64 rdev, - const void *data, int datalen) + struct mdc_op_data *op_data, const void *data, int datalen, + __u32 mode, __u32 uid, __u32 gid, __u32 cap_effective, + __u64 rdev) { struct mds_rec_create *rec; char *tmp; rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*rec)); rec->cr_opcode = REINT_CREATE; - rec->cr_fsuid = current->fsuid; - rec->cr_fsgid = current->fsgid; - rec->cr_cap = current->cap_effective; + rec->cr_fsuid = uid; + rec->cr_fsgid = gid; + rec->cr_cap = cap_effective; rec->cr_fid = op_data->fid1; memset(&rec->cr_replayfid, 0, 
sizeof(rec->cr_replayfid)); rec->cr_mode = mode; rec->cr_rdev = rdev; rec->cr_time = op_data->mod_time; - rec->cr_suppgid = op_data->ctxt.gid1; + rec->cr_suppgid = op_data->suppgids[0]; tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1, op_data->namelen + 1); LOGL0(op_data->name, op_data->namelen, tmp); @@ -134,7 +135,7 @@ void mdc_open_pack(struct ptlrpc_request *req, int offset, rec->cr_flags = mds_pack_open_flags(flags); rec->cr_rdev = rdev; rec->cr_time = op_data->mod_time; - rec->cr_suppgid = op_data->ctxt.gid1; + rec->cr_suppgid = op_data->suppgids[0]; if (op_data->name) { tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1, @@ -175,10 +176,10 @@ void mdc_setattr_pack(struct ptlrpc_request *req, struct mdc_op_data *data, rec->sa_suppgid = iattr->ia_gid; else if ((iattr->ia_valid & ATTR_MODE) && in_group_p(iattr->ia_gid)) - rec->sa_suppgid = data->ctxt.gid1; + rec->sa_suppgid = data->suppgids[0]; else if ((iattr->ia_valid & (ATTR_MTIME|ATTR_CTIME)) && - data->ctxt.gid1 != -1) - rec->sa_suppgid = data->ctxt.gid1; + data->suppgids[0] != -1) + rec->sa_suppgid = data->suppgids[0]; } if (ealen == 0) @@ -206,7 +207,7 @@ void mdc_unlink_pack(struct ptlrpc_request *req, int offset, rec->ul_fsgid = current->fsgid; rec->ul_cap = current->cap_effective; rec->ul_mode = data->create_mode; - rec->ul_suppgid = data->ctxt.gid1; + rec->ul_suppgid = data->suppgids[0]; rec->ul_fid1 = data->fid1; rec->ul_fid2 = data->fid2; rec->ul_time = data->mod_time; @@ -228,8 +229,8 @@ void mdc_link_pack(struct ptlrpc_request *req, int offset, rec->lk_fsuid = current->fsuid; rec->lk_fsgid = current->fsgid; rec->lk_cap = current->cap_effective; - rec->lk_suppgid1 = data->ctxt.gid1; - rec->lk_suppgid2 = data->ctxt.gid2; + rec->lk_suppgid1 = data->suppgids[0]; + rec->lk_suppgid2 = data->suppgids[1]; rec->lk_fid1 = data->fid1; rec->lk_fid2 = data->fid2; rec->lk_time = data->mod_time; @@ -252,8 +253,8 @@ void mdc_rename_pack(struct ptlrpc_request *req, int offset, rec->rn_fsuid = current->fsuid; 
rec->rn_fsgid = current->fsgid; rec->rn_cap = current->cap_effective; - rec->rn_suppgid1 = data->ctxt.gid1; - rec->rn_suppgid2 = data->ctxt.gid2; + rec->rn_suppgid1 = data->suppgids[0]; + rec->rn_suppgid2 = data->suppgids[1]; rec->rn_fid1 = data->fid1; rec->rn_fid2 = data->fid2; rec->rn_time = data->mod_time; @@ -278,7 +279,7 @@ void mdc_getattr_pack(struct ptlrpc_request *req, int valid, int offset, b->capability = current->cap_effective; b->valid = valid; b->flags = flags; - b->suppgid = data->ctxt.gid1; + b->suppgid = data->suppgids[0]; b->fid1 = data->fid1; if (data->name) { diff --git a/lustre/mdc/mdc_locks.c b/lustre/mdc/mdc_locks.c index d18d421..e1475d0 100644 --- a/lustre/mdc/mdc_locks.c +++ b/lustre/mdc/mdc_locks.c @@ -54,26 +54,6 @@ void it_set_disposition(struct lookup_intent *it, int flag) } EXPORT_SYMBOL(it_set_disposition); -static void mdc_fid2mdc_op_data(struct mdc_op_data *data, struct ll_uctxt *ctxt, - struct ll_fid *f1, struct ll_fid *f2, - const char *name, int namelen, int mode) -{ - LASSERT(data); - LASSERT(ctxt); - LASSERT(f1); - - data->ctxt = *ctxt; - data->fid1 = *f1; - if (f2) - data->fid2 = *f2; - else - memset(&data->fid2, 0, sizeof(data->fid2)); - data->name = name; - data->namelen = namelen; - data->create_mode = mode; - data->mod_time = CURRENT_SECONDS; -} - static int it_to_lock_mode(struct lookup_intent *it) { /* CREAT needs to be tested before open (both could be set) */ @@ -489,10 +469,8 @@ EXPORT_SYMBOL(mdc_enqueue); * Else, if DISP_LOOKUP_EXECD then d.lustre.it_status is the rc of the * child lookup. 
*/ -int mdc_intent_lock(struct obd_export *exp, struct ll_uctxt *uctxt, - struct ll_fid *pfid, const char *name, int len, - void *lmm, int lmmsize, - struct ll_fid *cfid, struct lookup_intent *it, +int mdc_intent_lock(struct obd_export *exp, struct mdc_op_data *op_data, + void *lmm, int lmmsize, struct lookup_intent *it, int lookup_flags, struct ptlrpc_request **reqp, ldlm_blocking_callback cb_blocking) { @@ -506,14 +484,16 @@ int mdc_intent_lock(struct obd_export *exp, struct ll_uctxt *uctxt, LASSERT(it); CDEBUG(D_DLMTRACE,"name: %.*s in inode "LPU64", intent: %s flags %#o\n", - len, name, pfid->id, ldlm_it2str(it->it_op), it->it_flags); + op_data->namelen, op_data->name, op_data->fid1.id, + ldlm_it2str(it->it_op), it->it_flags); - if (cfid && (it->it_op == IT_LOOKUP || it->it_op == IT_GETATTR)) { + if (op_data->fid2.id && + (it->it_op == IT_LOOKUP || it->it_op == IT_GETATTR)) { /* We could just return 1 immediately, but since we should only * be called in revalidate_it if we already have a lock, let's * verify that. */ - struct ldlm_res_id res_id = { .name = { cfid->id, - cfid->generation}}; + struct ldlm_res_id res_id = {.name ={op_data->fid2.id, + op_data->fid2.generation}}; struct lustre_handle lockh; int mode = LCK_PR; @@ -543,11 +523,9 @@ int mdc_intent_lock(struct obd_export *exp, struct ll_uctxt *uctxt, * this and use the request from revalidate. 
In this case, revalidate * never dropped its reference, so the refcounts are all OK */ if (!it_disposition(it, DISP_ENQ_COMPLETE)) { - struct mdc_op_data op_data; - mdc_fid2mdc_op_data(&op_data, uctxt, pfid, cfid, name, len, 0); rc = mdc_enqueue(exp, LDLM_PLAIN, it, it_to_lock_mode(it), - &op_data, &lockh, lmm, lmmsize, + op_data, &lockh, lmm, lmmsize, ldlm_completion_ast, cb_blocking, NULL); if (rc < 0) RETURN(rc); @@ -584,10 +562,10 @@ int mdc_intent_lock(struct obd_export *exp, struct ll_uctxt *uctxt, /* If we were revalidating a fid/name pair, mark the intent in * case we fail and get called again from lookup */ - if (cfid != NULL) { + if (op_data->fid2.id) { it_set_disposition(it, DISP_ENQ_COMPLETE); /* Also: did we find the same inode? */ - if (memcmp(cfid, &mds_body->fid1, sizeof(*cfid))) + if (memcmp(&op_data->fid2, &mds_body->fid1, sizeof(op_data->fid2))) RETURN(-ESTALE); } @@ -633,8 +611,8 @@ int mdc_intent_lock(struct obd_export *exp, struct ll_uctxt *uctxt, } } CDEBUG(D_DENTRY,"D_IT dentry %.*s intent: %s status %d disp %x rc %d\n", - len, name, ldlm_it2str(it->it_op), it->d.lustre.it_status, - it->d.lustre.it_disposition, rc); + op_data->namelen, op_data->name, ldlm_it2str(it->it_op), + it->d.lustre.it_status, it->d.lustre.it_disposition, rc); RETURN(rc); } diff --git a/lustre/mdc/mdc_reint.c b/lustre/mdc/mdc_reint.c index 583d025..f282a69 100644 --- a/lustre/mdc/mdc_reint.c +++ b/lustre/mdc/mdc_reint.c @@ -115,7 +115,7 @@ int mdc_setattr(struct obd_export *exp, struct mdc_op_data *data, int mdc_create(struct obd_export *exp, struct mdc_op_data *op_data, const void *data, int datalen, int mode, __u32 uid, __u32 gid, - __u64 rdev, struct ptlrpc_request **request) + __u32 cap_effective, __u64 rdev, struct ptlrpc_request **request) { struct obd_device *obd = exp->exp_obd; struct ptlrpc_request *req; @@ -135,7 +135,8 @@ int mdc_create(struct obd_export *exp, struct mdc_op_data *op_data, /* mdc_create_pack fills msg->bufs[1] with name * and msg->bufs[2] 
with tgt, for symlinks or lov MD data */ - mdc_create_pack(req, 0, op_data, mode, rdev, data, datalen); + mdc_create_pack(req, 0, op_data, data, datalen, mode, + uid, gid, cap_effective, rdev); size[0] = sizeof(struct mds_body); req->rq_replen = lustre_msg_size(1, size); diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index 298331b..8aa7787 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -664,11 +664,12 @@ static int mds_getattr_name(int offset, struct ptlrpc_request *req, struct lustre_handle *child_lockh) { struct obd_device *obd = req->rq_export->exp_obd; + struct mds_obd *mds = &obd->u.mds; struct ldlm_reply *rep = NULL; struct lvfs_run_ctxt saved; struct mds_body *body; struct dentry *dparent = NULL, *dchild = NULL; - struct lvfs_ucred uc; + struct lvfs_ucred uc = {NULL,}; struct lustre_handle parent_lockh; int namesize; int rc = 0, cleanup_phase = 0, resent_req = 0; @@ -683,16 +684,20 @@ static int mds_getattr_name(int offset, struct ptlrpc_request *req, lustre_swab_mds_body); if (body == NULL) { CERROR("Can't swab mds_body\n"); - GOTO(cleanup, rc = -EFAULT); + RETURN(-EFAULT); } LASSERT_REQSWAB(req, offset + 1); name = lustre_msg_string(req->rq_reqmsg, offset + 1, 0); if (name == NULL) { CERROR("Can't unpack name\n"); - GOTO(cleanup, rc = -EFAULT); + RETURN(-EFAULT); } - namesize = req->rq_reqmsg->buflens[offset + 1]; + namesize = lustre_msg_buflen(req->rq_reqmsg, offset + 1); + + rc = mds_init_ucred(&uc, req, offset); + if (rc) + GOTO(cleanup, rc); LASSERT (offset == 0 || offset == 2); /* if requests were at offset 2, the getattr reply goes back at 1 */ @@ -701,15 +706,6 @@ static int mds_getattr_name(int offset, struct ptlrpc_request *req, offset = 1; } -#if CRAY_PORTALS - uc.luc_fsuid = req->rq_uid; -#else - uc.luc_fsuid = body->fsuid; -#endif - uc.luc_fsgid = body->fsgid; - uc.luc_cap = body->capability; - uc.luc_suppgid1 = body->suppgid; - uc.luc_suppgid2 = -1; push_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc); cleanup_phase = 1; /* 
kernel context */ intent_set_disposition(rep, DISP_LOOKUP_EXECD); @@ -802,7 +798,12 @@ static int mds_getattr_name(int offset, struct ptlrpc_request *req, l_dput(dchild); case 1: pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc); - default: ; + default: + mds_exit_ucred(&uc, mds); + if (req->rq_reply_state == NULL) { + req->rq_status = rc; + lustre_pack_reply(req, 0, NULL, NULL); + } } return rc; } @@ -814,7 +815,7 @@ static int mds_getattr(int offset, struct ptlrpc_request *req) struct lvfs_run_ctxt saved; struct dentry *de; struct mds_body *body; - struct lvfs_ucred uc; + struct lvfs_ucred uc = {NULL,}; int rc = 0; ENTRY; @@ -825,13 +826,10 @@ static int mds_getattr(int offset, struct ptlrpc_request *req) RETURN(-EFAULT); } -#if CRAY_PORTALS - uc.luc_fsuid = req->rq_uid; -#else - uc.luc_fsuid = body->fsuid; -#endif - uc.luc_fsgid = body->fsgid; - uc.luc_cap = body->capability; + rc = mds_init_ucred(&uc, req, offset); + if (rc) + GOTO(out_ucred, rc); + push_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc); de = mds_fid2dentry(mds, &body->fid1, NULL); if (IS_ERR(de)) { @@ -851,6 +849,12 @@ static int mds_getattr(int offset, struct ptlrpc_request *req) GOTO(out_pop, rc); out_pop: pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc); +out_ucred: + if (req->rq_reply_state == NULL) { + req->rq_status = rc; + lustre_pack_reply(req, 0, NULL, NULL); + } + mds_exit_ucred(&uc, mds); return rc; } @@ -953,13 +957,14 @@ out: static int mds_readpage(struct ptlrpc_request *req) { struct obd_device *obd = req->rq_export->exp_obd; + struct mds_obd *mds = &obd->u.mds; struct vfsmount *mnt; struct dentry *de; struct file *file; struct mds_body *body, *repbody; struct lvfs_run_ctxt saved; int rc, size = sizeof(*repbody); - struct lvfs_ucred uc; + struct lvfs_ucred uc = {NULL,}; ENTRY; if (OBD_FAIL_CHECK(OBD_FAIL_MDS_READPAGE_PACK)) @@ -975,13 +980,10 @@ static int mds_readpage(struct ptlrpc_request *req) if (body == NULL) GOTO (out, rc = -EFAULT); -#if CRAY_PORTALS - uc.luc_fsuid = req->rq_uid; -#else - 
uc.luc_fsuid = body->fsuid; -#endif - uc.luc_fsgid = body->fsgid; - uc.luc_cap = body->capability; + rc = mds_init_ucred(&uc, req, 0); + if (rc) + GOTO(out, rc); + push_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc); de = mds_fid2dentry(&obd->u.mds, &body->fid1, &mnt); if (IS_ERR(de)) @@ -1023,6 +1025,7 @@ out_file: out_pop: pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc); out: + mds_exit_ucred(&uc, mds); req->rq_status = rc; RETURN(0); } @@ -1042,6 +1045,7 @@ int mds_reint(struct ptlrpc_request *req, int offset, CERROR("invalid record\n"); GOTO(out, req->rq_status = -EINVAL); } + /* rc will be used to interrupt a for loop over multiple records */ rc = mds_reint_rec(rec, offset, req, lockh); out: @@ -1548,6 +1552,13 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf) "mds_ldlm_client", &obd->obd_ldlm_client); obd->obd_replayable = 1; + mds->mds_group_hash = upcall_cache_init(obd->obd_name); + if (IS_ERR(mds->mds_group_hash)) { + rc = PTR_ERR(mds->mds_group_hash); + mds->mds_group_hash = NULL; + GOTO(err_fs, rc); + } + mds_quota_setup(mds); rc = mds_postsetup(obd); @@ -1587,6 +1598,8 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf) err_fs: /* No extra cleanup needed for llog_init_commit_thread() */ mds_fs_cleanup(obd); + upcall_cache_cleanup(mds->mds_group_hash); + mds->mds_group_hash = NULL; err_ns: ldlm_namespace_free(obd->obd_namespace, 0); obd->obd_namespace = NULL; @@ -1794,6 +1807,9 @@ static int mds_cleanup(struct obd_device *obd) } mds_fs_cleanup(obd); + upcall_cache_cleanup(mds->mds_group_hash); + mds->mds_group_hash = NULL; + /* 2 seems normal on mds, (may_umount() also expects 2 fwiw), but we only see 1 at this point in obdfilter. 
*/ if (atomic_read(&obd->u.mds.mds_vfsmnt->mnt_count) > 2) diff --git a/lustre/mds/lproc_mds.c b/lustre/mds/lproc_mds.c index e947b1f..54949d7 100644 --- a/lustre/mds/lproc_mds.c +++ b/lustre/mds/lproc_mds.c @@ -31,7 +31,6 @@ #include #include #include - #include "mds_internal.h" #ifdef LPROCFS @@ -47,25 +46,187 @@ static int lprocfs_mds_rd_mntdev(char *page, char **start, off_t off, int count, return snprintf(page, count, "%s\n",obd->u.mds.mds_vfsmnt->mnt_devname); } +static int lprocfs_wr_group_info(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = data; + struct mds_obd *mds = &obd->u.mds; + struct mds_grp_downcall_data sparam, *param = &sparam; + int size = 0, rc = count; + + if (count < sizeof(param)) { + CERROR("%s: invalid data size %lu\n", obd->obd_name, count); + return count; + } + + if (copy_from_user(param, buffer, sizeof(*param)) || + param->mgd_magic != MDS_GRP_DOWNCALL_MAGIC) { + CERROR("%s: MDS group downcall bad params\n", obd->obd_name); + return count; + } + + if (param->mgd_ngroups > NGROUPS_MAX) { + CWARN("%s: uid %u groups %d more than maximum %d\n", + obd->obd_name, param->mgd_uid, param->mgd_ngroups, + NGROUPS_MAX); + param->mgd_ngroups = NGROUPS_MAX; + } + + if (param->mgd_ngroups > 0) { + size = offsetof(struct mds_grp_downcall_data, + mgd_groups[param->mgd_ngroups]); + OBD_ALLOC(param, size); + if (!param) { + CERROR("%s: fail to alloc %d bytes for uid %u" + " with %d groups\n", obd->obd_name, size, + sparam.mgd_uid, sparam.mgd_ngroups); + param = &sparam; + param->mgd_ngroups = 0; + } else if (copy_from_user(param, buffer, size)) { + CERROR("%s: uid %u bad supplementary group data\n", + obd->obd_name, sparam.mgd_uid); + OBD_FREE(param, size); + param = &sparam; + param->mgd_ngroups = 0; + } + } + rc = upcall_cache_downcall(mds->mds_group_hash, param->mgd_err, + param->mgd_uid, param->mgd_gid, + param->mgd_ngroups, param->mgd_groups); + + if (param && param != &sparam) + 
OBD_FREE(param, size); + + return rc; +} + +static int lprocfs_rd_group_expire(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = data; + + *eof = 1; + return snprintf(page, count, "%lu\n", + obd->u.mds.mds_group_hash->uc_entry_expire / HZ); +} + +static int lprocfs_wr_group_expire(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = data; + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val > 5) + obd->u.mds.mds_group_hash->uc_entry_expire = val * HZ; + else + CERROR("invalid expire time %u for group cache\n", val); + + return count; +} + +static int lprocfs_rd_group_acquire_expire(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = data; + + *eof = 1; + return snprintf(page, count, "%lu\n", + obd->u.mds.mds_group_hash->uc_acquire_expire / HZ); +} + +static int lprocfs_wr_group_acquire_expire(struct file *file,const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = data; + int val, rc = 0; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val > 2) + obd->u.mds.mds_group_hash->uc_acquire_expire = val * HZ; + + return count; +} + +static int lprocfs_rd_group_upcall(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = data; + + *eof = 1; + return snprintf(page, count, "%s\n", + obd->u.mds.mds_group_hash->uc_upcall); +} + +static int lprocfs_wr_group_upcall(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = data; + struct upcall_cache *hash = obd->u.mds.mds_group_hash; + char kernbuf[UC_CACHE_UPCALL_MAXPATH] = { '\0' }; + + if (count >= UC_CACHE_UPCALL_MAXPATH) { + CERROR("%s: group upcall too long\n", obd->obd_name); + return -EINVAL; + } + + if (copy_from_user(kernbuf, buffer, + min(count, 
UC_CACHE_UPCALL_MAXPATH - 1))) + return -EFAULT; + + /* Remove any extraneous bits from the upcall (e.g. linefeeds) */ + sscanf(kernbuf, "%s", hash->uc_upcall); + + if (strcmp(hash->uc_name, obd->obd_name) != 0) + CWARN("%s: write to upcall name %s for MDS %s\n", + obd->obd_name, hash->uc_upcall, obd->obd_name); + CWARN("%s: group upcall set to %s\n", obd->obd_name, hash->uc_upcall); + + return count; +} + +static int lprocfs_wr_group_flush(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = data; + + upcall_cache_flush_idle(obd->u.mds.mds_group_hash); + return count; +} + struct lprocfs_vars lprocfs_mds_obd_vars[] = { - { "uuid", lprocfs_rd_uuid, 0, 0 }, - { "blocksize", lprocfs_rd_blksize, 0, 0 }, - { "kbytestotal", lprocfs_rd_kbytestotal, 0, 0 }, - { "kbytesfree", lprocfs_rd_kbytesfree, 0, 0 }, - { "kbytesavail", lprocfs_rd_kbytesavail, 0, 0 }, - { "filestotal", lprocfs_rd_filestotal, 0, 0 }, - { "filesfree", lprocfs_rd_filesfree, 0, 0 }, - { "fstype", lprocfs_rd_fstype, 0, 0 }, - { "mntdev", lprocfs_mds_rd_mntdev, 0, 0 }, + { "uuid", lprocfs_rd_uuid, 0, 0 }, + { "blocksize", lprocfs_rd_blksize, 0, 0 }, + { "kbytestotal", lprocfs_rd_kbytestotal, 0, 0 }, + { "kbytesfree", lprocfs_rd_kbytesfree, 0, 0 }, + { "kbytesavail", lprocfs_rd_kbytesavail, 0, 0 }, + { "filestotal", lprocfs_rd_filestotal, 0, 0 }, + { "filesfree", lprocfs_rd_filesfree, 0, 0 }, + { "fstype", lprocfs_rd_fstype, 0, 0 }, + { "mntdev", lprocfs_mds_rd_mntdev, 0, 0 }, { "recovery_status", lprocfs_obd_rd_recovery_status, 0, 0 }, - { "evict_client", 0, lprocfs_wr_evict_client, 0 }, - { "num_exports", lprocfs_rd_num_exports, 0, 0 }, + { "evict_client", 0, lprocfs_wr_evict_client, 0 }, + { "num_exports", lprocfs_rd_num_exports, 0, 0 }, #ifdef HAVE_QUOTA_SUPPORT - { "quota_bunit_sz", lprocfs_mds_rd_bunit, lprocfs_mds_wr_bunit, 0 }, - { "quota_btune_sz", lprocfs_mds_rd_btune, lprocfs_mds_wr_btune, 0 }, - { "quota_iunit_sz", lprocfs_mds_rd_iunit, 
lprocfs_mds_wr_iunit, 0 }, - { "quota_itune_sz", lprocfs_mds_rd_itune, lprocfs_mds_wr_itune, 0 }, + { "quota_bunit_sz", lprocfs_mds_rd_bunit, lprocfs_mds_wr_bunit, 0 }, + { "quota_btune_sz", lprocfs_mds_rd_btune, lprocfs_mds_wr_btune, 0 }, + { "quota_iunit_sz", lprocfs_mds_rd_iunit, lprocfs_mds_wr_iunit, 0 }, + { "quota_itune_sz", lprocfs_mds_rd_itune, lprocfs_mds_wr_itune, 0 }, #endif + { "group_expire_interval", lprocfs_rd_group_expire, + lprocfs_wr_group_expire, 0}, + { "group_acquire_expire", lprocfs_rd_group_acquire_expire, + lprocfs_wr_group_acquire_expire, 0}, + { "group_upcall", lprocfs_rd_group_upcall, + lprocfs_wr_group_upcall, 0}, + { "group_flush", 0, lprocfs_wr_group_flush, 0}, + { "group_info", 0, lprocfs_wr_group_info, 0 }, { 0 } }; diff --git a/lustre/mds/mds_fs.c b/lustre/mds/mds_fs.c index 35b09b4..166e6d2 100644 --- a/lustre/mds/mds_fs.c +++ b/lustre/mds/mds_fs.c @@ -597,14 +597,13 @@ int mds_obd_create(struct obd_export *exp, struct obdo *oa, struct lvfs_run_ctxt saved; char fidname[LL_FID_NAMELEN]; void *handle; - struct lvfs_ucred ucred; + struct lvfs_ucred ucred = { 0 }; int rc = 0, err, namelen; ENTRY; /* the owner of object file should always be root */ - memset(&ucred, 0, sizeof(ucred)); ucred.luc_cap = current->cap_effective | CAP_SYS_RESOURCE; - + push_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, &ucred); sprintf(fidname, "OBJECTS/%u.%u", tmpname, current->pid); @@ -680,14 +679,13 @@ int mds_obd_destroy(struct obd_export *exp, struct obdo *oa, struct inode *parent_inode = mds->mds_objects_dir->d_inode; struct obd_device *obd = exp->exp_obd; struct lvfs_run_ctxt saved; - struct lvfs_ucred ucred; + struct lvfs_ucred ucred = { 0 }; char fidname[LL_FID_NAMELEN]; struct dentry *de; void *handle; int err, namelen, rc = 0; ENTRY; - memset(&ucred, 0, sizeof(ucred)); ucred.luc_cap = current->cap_effective | CAP_SYS_RESOURCE; push_ctxt(&saved, &obd->obd_lvfs_ctxt, &ucred); @@ -715,7 +713,7 @@ int mds_obd_destroy(struct obd_export *exp, struct obdo 
*oa, if (IS_ERR(handle)) GOTO(out_dput, rc = PTR_ERR(handle)); - + rc = vfs_unlink(mds->mds_objects_dir->d_inode, de); if (rc) CERROR("error destroying object "LPU64":%u: rc %d\n", diff --git a/lustre/mds/mds_internal.h b/lustre/mds/mds_internal.h index f57ac96..4ebc93a 100644 --- a/lustre/mds/mds_internal.h +++ b/lustre/mds/mds_internal.h @@ -125,6 +125,9 @@ int mds_osc_setattr_async(struct obd_device *obd, struct inode *inode, /* mds/mds_lib.c */ int mds_update_unpack(struct ptlrpc_request *, int offset, struct mds_update_record *); +int mds_init_ucred(struct lvfs_ucred *ucred, struct ptlrpc_request *req, + int offset); +void mds_exit_ucred(struct lvfs_ucred *ucred, struct mds_obd *obd); /* mds/mds_unlink_open.c */ int mds_cleanup_orphans(struct obd_device *obd); diff --git a/lustre/mds/mds_lib.c b/lustre/mds/mds_lib.c index ceab774..c780cb1 100644 --- a/lustre/mds/mds_lib.c +++ b/lustre/mds/mds_lib.c @@ -101,11 +101,11 @@ static int mds_setattr_unpack(struct ptlrpc_request *req, int offset, if (rec == NULL) RETURN (-EFAULT); - r->ur_fsuid = rec->sa_fsuid; - r->ur_fsgid = rec->sa_fsgid; - r->ur_cap = rec->sa_cap; - r->ur_suppgid1 = rec->sa_suppgid; - r->ur_suppgid2 = -1; + r->ur_uc.luc_fsuid = rec->sa_fsuid; + r->ur_uc.luc_fsgid = rec->sa_fsgid; + r->ur_uc.luc_cap = rec->sa_cap; + r->ur_uc.luc_suppgid1 = rec->sa_suppgid; + r->ur_uc.luc_suppgid2 = -1; r->ur_fid1 = &rec->sa_fid; attr->ia_valid = rec->sa_valid; attr->ia_mode = rec->sa_mode; @@ -148,17 +148,17 @@ static int mds_create_unpack(struct ptlrpc_request *req, int offset, if (rec == NULL) RETURN (-EFAULT); - r->ur_fsuid = rec->cr_fsuid; - r->ur_fsgid = rec->cr_fsgid; - r->ur_cap = rec->cr_cap; + r->ur_uc.luc_fsuid = rec->cr_fsuid; + r->ur_uc.luc_fsgid = rec->cr_fsgid; + r->ur_uc.luc_cap = rec->cr_cap; + r->ur_uc.luc_suppgid1 = rec->cr_suppgid; + r->ur_uc.luc_suppgid2 = -1; r->ur_fid1 = &rec->cr_fid; r->ur_fid2 = &rec->cr_replayfid; r->ur_mode = rec->cr_mode; r->ur_rdev = rec->cr_rdev; r->ur_time = 
rec->cr_time; r->ur_flags = rec->cr_flags; - r->ur_suppgid1 = rec->cr_suppgid; - r->ur_suppgid2 = -1; LASSERT_REQSWAB (req, offset + 1); r->ur_name = lustre_msg_string (req->rq_reqmsg, offset + 1, 0); @@ -194,11 +194,11 @@ static int mds_link_unpack(struct ptlrpc_request *req, int offset, if (rec == NULL) RETURN (-EFAULT); - r->ur_fsuid = rec->lk_fsuid; - r->ur_fsgid = rec->lk_fsgid; - r->ur_cap = rec->lk_cap; - r->ur_suppgid1 = rec->lk_suppgid1; - r->ur_suppgid2 = rec->lk_suppgid2; + r->ur_uc.luc_fsuid = rec->lk_fsuid; + r->ur_uc.luc_fsgid = rec->lk_fsgid; + r->ur_uc.luc_cap = rec->lk_cap; + r->ur_uc.luc_suppgid1 = rec->lk_suppgid1; + r->ur_uc.luc_suppgid2 = rec->lk_suppgid2; r->ur_fid1 = &rec->lk_fid1; r->ur_fid2 = &rec->lk_fid2; r->ur_time = rec->lk_time; @@ -222,12 +222,12 @@ static int mds_unlink_unpack(struct ptlrpc_request *req, int offset, if (rec == NULL) RETURN(-EFAULT); - r->ur_fsuid = rec->ul_fsuid; - r->ur_fsgid = rec->ul_fsgid; - r->ur_cap = rec->ul_cap; + r->ur_uc.luc_fsuid = rec->ul_fsuid; + r->ur_uc.luc_fsgid = rec->ul_fsgid; + r->ur_uc.luc_cap = rec->ul_cap; + r->ur_uc.luc_suppgid1 = rec->ul_suppgid; + r->ur_uc.luc_suppgid2 = -1; r->ur_mode = rec->ul_mode; - r->ur_suppgid1 = rec->ul_suppgid; - r->ur_suppgid2 = -1; r->ur_fid1 = &rec->ul_fid1; r->ur_fid2 = &rec->ul_fid2; r->ur_time = rec->ul_time; @@ -251,11 +251,11 @@ static int mds_rename_unpack(struct ptlrpc_request *req, int offset, if (rec == NULL) RETURN(-EFAULT); - r->ur_fsuid = rec->rn_fsuid; - r->ur_fsgid = rec->rn_fsgid; - r->ur_cap = rec->rn_cap; - r->ur_suppgid1 = rec->rn_suppgid1; - r->ur_suppgid2 = rec->rn_suppgid2; + r->ur_uc.luc_fsuid = rec->rn_fsuid; + r->ur_uc.luc_fsgid = rec->rn_fsgid; + r->ur_uc.luc_cap = rec->rn_cap; + r->ur_uc.luc_suppgid1 = rec->rn_suppgid1; + r->ur_uc.luc_suppgid2 = rec->rn_suppgid2; r->ur_fid1 = &rec->rn_fid1; r->ur_fid2 = &rec->rn_fid2; r->ur_time = rec->rn_time; @@ -285,17 +285,17 @@ static int mds_open_unpack(struct ptlrpc_request *req, int offset, if 
(rec == NULL) RETURN (-EFAULT); - r->ur_fsuid = rec->cr_fsuid; - r->ur_fsgid = rec->cr_fsgid; - r->ur_cap = rec->cr_cap; + r->ur_uc.luc_fsuid = rec->cr_fsuid; + r->ur_uc.luc_fsgid = rec->cr_fsgid; + r->ur_uc.luc_cap = rec->cr_cap; + r->ur_uc.luc_suppgid1 = rec->cr_suppgid; + r->ur_uc.luc_suppgid2 = -1; r->ur_fid1 = &rec->cr_fid; r->ur_fid2 = &rec->cr_replayfid; r->ur_mode = rec->cr_mode; r->ur_rdev = rec->cr_rdev; r->ur_time = rec->cr_time; r->ur_flags = rec->cr_flags; - r->ur_suppgid1 = rec->cr_suppgid; - r->ur_suppgid2 = -1; LASSERT_REQSWAB (req, offset + 1); r->ur_name = lustre_msg_string (req->rq_reqmsg, offset + 1, 0); @@ -350,8 +350,47 @@ int mds_update_unpack(struct ptlrpc_request *req, int offset, rec->ur_opcode = opcode; rc = mds_unpackers[opcode](req, offset, rec); + + RETURN(rc); +} + +int mds_init_ucred(struct lvfs_ucred *ucred, struct ptlrpc_request *req, + int offset) +{ + struct mds_body *body = lustre_msg_buf(req->rq_reqmsg, offset, + sizeof(*body)); + struct mds_obd *mds = mds_req2mds(req); + int rc; + + LASSERT(body != NULL); /* previously verified & swabbed by caller */ + #if CRAY_PORTALS - rec->ur_fsuid = req->rq_uid; + ucred->luc_fsuid = req->rq_uid; +#else + ucred->luc_fsuid = body->fsuid; + ucred->luc_fsgid = body->fsgid; + ucred->luc_cap = body->capability; #endif - RETURN(rc); + + ucred->luc_uce = upcall_cache_get_entry(mds->mds_group_hash, + ucred->luc_fsuid, + ucred->luc_fsgid, 1, + &body->suppgid); + if (IS_ERR(ucred->luc_uce)) { + rc = PTR_ERR(ucred->luc_uce); + ucred->luc_uce = NULL; + return rc; + } + +#if CRAY_PORTALS + if (ucred->luc_uce) + ucred->luc_fsgid = ucred->luc_uce->ue_primary; +#endif + + return 0; +} + +void mds_exit_ucred(struct lvfs_ucred *ucred, struct mds_obd *mds) +{ + upcall_cache_put_entry(mds->mds_group_hash, ucred->luc_uce); } diff --git a/lustre/mds/mds_open.c b/lustre/mds/mds_open.c index 1c96f5f..4a4cdd0 100644 --- a/lustre/mds/mds_open.c +++ b/lustre/mds/mds_open.c @@ -963,11 +963,11 @@ int mds_open(struct 
mds_update_record *rec, int offset, LTIME_S(iattr.ia_ctime) = rec->ur_time; LTIME_S(iattr.ia_mtime) = rec->ur_time; - iattr.ia_uid = rec->ur_fsuid; + iattr.ia_uid = current->fsuid; /* set by push_ctxt already */ if (dparent->d_inode->i_mode & S_ISGID) iattr.ia_gid = dparent->d_inode->i_gid; else - iattr.ia_gid = rec->ur_fsgid; + iattr.ia_gid = current->fsgid; iattr.ia_valid = ATTR_UID | ATTR_GID | ATTR_ATIME | ATTR_MTIME | ATTR_CTIME; diff --git a/lustre/mds/mds_reint.c b/lustre/mds/mds_reint.c index 5144a52..ea63e1f 100644 --- a/lustre/mds/mds_reint.c +++ b/lustre/mds/mds_reint.c @@ -42,6 +42,7 @@ #include #include #include +#include #include "mds_internal.h" @@ -218,7 +219,7 @@ int mds_fix_attr(struct inode *inode, struct mds_update_record *rec) /* times */ if ((ia_valid & (ATTR_MTIME|ATTR_ATIME)) == (ATTR_MTIME|ATTR_ATIME)) { - if (rec->ur_fsuid != inode->i_uid && + if (rec->ur_uc.luc_fsuid != inode->i_uid && (error = ll_permission(inode, MAY_WRITE, NULL)) != 0) RETURN(error); } @@ -727,10 +728,8 @@ static int mds_reint_create(struct mds_update_record *rec, int offset, GOTO(cleanup, rc = -EROFS); } - if (dir->i_mode & S_ISGID) { - if (S_ISDIR(rec->ur_mode)) - rec->ur_mode |= S_ISGID; - } + if (dir->i_mode & S_ISGID && S_ISDIR(rec->ur_mode)) + rec->ur_mode |= S_ISGID; dchild->d_fsdata = (void *)&dp; dp.p_inum = (unsigned long)rec->ur_fid2->id; @@ -798,11 +797,11 @@ static int mds_reint_create(struct mds_update_record *rec, int offset, LTIME_S(iattr.ia_atime) = rec->ur_time; LTIME_S(iattr.ia_ctime) = rec->ur_time; LTIME_S(iattr.ia_mtime) = rec->ur_time; - iattr.ia_uid = rec->ur_fsuid; + iattr.ia_uid = current->fsuid; /* set by push_ctxt already */ if (dir->i_mode & S_ISGID) iattr.ia_gid = dir->i_gid; else - iattr.ia_gid = rec->ur_fsgid; + iattr.ia_gid = current->fsgid; iattr.ia_valid = ATTR_UID | ATTR_GID | ATTR_ATIME | ATTR_MTIME | ATTR_CTIME; @@ -2076,16 +2075,39 @@ int mds_reint_rec(struct mds_update_record *rec, int offset, struct ptlrpc_request *req, struct 
lustre_handle *lockh) { struct obd_device *obd = req->rq_export->exp_obd; + struct mds_obd *mds = &obd->u.mds; struct lvfs_run_ctxt saved; int rc; ENTRY; +#if CRAY_PORTALS + rec->ur_uc.luc_fsuid = req->rq_uid; +#endif + + /* get group info of this user */ + rec->ur_uc.luc_uce = upcall_cache_get_entry(mds->mds_group_hash, + rec->ur_uc.luc_fsuid, + rec->ur_uc.luc_fsgid, 2, + &rec->ur_uc.luc_suppgid1); + + if (IS_ERR(rec->ur_uc.luc_uce)) { + rc = PTR_ERR(rec->ur_uc.luc_uce); + rec->ur_uc.luc_uce = NULL; + RETURN(rc); + } + /* checked by unpacker */ LASSERT(rec->ur_opcode < REINT_MAX && reinters[rec->ur_opcode] != NULL); +#if CRAY_PORTALS + if (rec->ur_uc.luc_uce) + rec->ur_uc.luc_fsgid = rec->ur_uc.luc_uce->ue_primary; +#endif + push_ctxt(&saved, &obd->obd_lvfs_ctxt, &rec->ur_uc); rc = reinters[rec->ur_opcode] (rec, offset, req, lockh); pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &rec->ur_uc); + upcall_cache_put_entry(mds->mds_group_hash, rec->ur_uc.luc_uce); RETURN(rc); } diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index 713569d..7b38cc0 100644 --- a/lustre/obdclass/class_obd.c +++ b/lustre/obdclass/class_obd.c @@ -692,6 +692,9 @@ int init_obdclass(void) #ifdef __KERNEL__ printk(KERN_INFO "Lustre: OBD class driver Build Version: " BUILD_VERSION", info@clusterfs.com\n"); +#else + CDEBUG(D_INFO, "Lustre: OBD class driver Build Version: " + BUILD_VERSION", info@clusterfs.com\n"); #endif err = obd_init_checks(); diff --git a/lustre/obdfilter/filter_io_26.c b/lustre/obdfilter/filter_io_26.c index 7118c76..1db5ff5 100644 --- a/lustre/obdfilter/filter_io_26.c +++ b/lustre/obdfilter/filter_io_26.c @@ -577,6 +577,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, cleanup_phase = 2; down(&inode->i_sem); + fsfilt_check_slow(now, obd_timeout, "i_sem"); oti->oti_handle = fsfilt_brw_start(obd, objcount, &fso, niocount, res, oti); if (IS_ERR(oti->oti_handle)) { diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 
4d66928..08276f7 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -1153,7 +1153,7 @@ static int osc_brw(int cmd, struct obd_export *exp, struct obdo *oa, { ENTRY; - if (cmd == OBD_BRW_CHECK) { + if (cmd & OBD_BRW_CHECK) { /* The caller just wants to know if there's a chance that this * I/O can succeed */ struct obd_import *imp = class_exp2cliimp(exp); @@ -1193,7 +1193,7 @@ static int osc_brw_async(int cmd, struct obd_export *exp, struct obdo *oa, { ENTRY; - if (cmd == OBD_BRW_CHECK) { + if (cmd & OBD_BRW_CHECK) { /* The caller just wants to know if there's a chance that this * I/O can succeed */ struct obd_import *imp = class_exp2cliimp(exp); @@ -1229,9 +1229,84 @@ static int osc_brw_async(int cmd, struct obd_export *exp, struct obdo *oa, static void osc_check_rpcs(struct client_obd *cli); static void osc_exit_cache(struct client_obd *cli, struct osc_async_page *oap, int sent); -static void loi_list_maint(struct client_obd *cli, struct lov_oinfo *loi); + +static int lop_makes_rpc(struct client_obd *cli, struct loi_oap_pages *lop, + int cmd) +{ + int optimal; + ENTRY; + + if (lop->lop_num_pending == 0) + RETURN(0); + + /* if we have an invalid import we want to drain the queued pages + * by forcing them through rpcs that immediately fail and complete + * the pages. recovery relies on this to empty the queued pages + * before canceling the locks and evicting down the llite pages */ + if (cli->cl_import == NULL || cli->cl_import->imp_invalid) + RETURN(1); + + /* stream rpcs in queue order as long as as there is an urgent page + * queued. this is our cheap solution for good batching in the case + * where writepage marks some random page in the middle of the file + * as urgent because of, say, memory pressure */ + if (!list_empty(&lop->lop_urgent)) + RETURN(1); + + /* fire off rpcs when we have 'optimal' rpcs as tuned for the wire. 
*/ + optimal = cli->cl_max_pages_per_rpc; + if (cmd & OBD_BRW_WRITE) { + /* trigger a write rpc stream as long as there are dirtiers + * waiting for space. as they're waiting, they're not going to + * create more pages to coallesce with what's waiting.. */ + if (!list_empty(&cli->cl_cache_waiters)) + RETURN(1); + + /* +16 to avoid triggering rpcs that would want to include pages + * that are being queued but which can't be made ready until + * the queuer finishes with the page. this is a wart for + * llite::commit_write() */ + optimal += 16; + } + if (lop->lop_num_pending >= optimal) + RETURN(1); + + RETURN(0); +} + +static void on_list(struct list_head *item, struct list_head *list, + int should_be_on) +{ + if (list_empty(item) && should_be_on) + list_add_tail(item, list); + else if (!list_empty(item) && !should_be_on) + list_del_init(item); +} + +/* maintain the loi's cli list membership invariants so that osc_send_oap_rpc + * can find pages to build into rpcs quickly */ +static void loi_list_maint(struct client_obd *cli, struct lov_oinfo *loi) +{ + on_list(&loi->loi_cli_item, &cli->cl_loi_ready_list, + lop_makes_rpc(cli, &loi->loi_write_lop, OBD_BRW_WRITE) || + lop_makes_rpc(cli, &loi->loi_read_lop, OBD_BRW_READ)); + + on_list(&loi->loi_write_item, &cli->cl_loi_write_list, + loi->loi_write_lop.lop_num_pending); + + on_list(&loi->loi_read_item, &cli->cl_loi_read_list, + loi->loi_read_lop.lop_num_pending); +} + static void lop_update_pending(struct client_obd *cli, - struct loi_oap_pages *lop, int cmd, int delta); + struct loi_oap_pages *lop, int cmd, int delta) +{ + lop->lop_num_pending += delta; + if (cmd & OBD_BRW_WRITE) + cli->cl_pending_w_pages += delta; + else + cli->cl_pending_r_pages += delta; +} /* this is called when a sync waiter receives an interruption. Its job is to * get the caller woken as soon as possible. 
If its page hasn't been put in an @@ -1267,7 +1342,7 @@ static void osc_occ_interrupted(struct oig_callback_context *occ) list_del_init(&oap->oap_urgent_item); loi = oap->oap_loi; - lop = (oap->oap_cmd == OBD_BRW_WRITE) ? + lop = (oap->oap_cmd & OBD_BRW_WRITE) ? &loi->loi_write_lop : &loi->loi_read_lop; lop_update_pending(oap->oap_cli, lop, oap->oap_cmd, -1); loi_list_maint(oap->oap_cli, oap->oap_loi); @@ -1311,7 +1386,7 @@ static void osc_ap_completion(struct client_obd *cli, struct obdo *oa, oap->oap_async_flags = 0; oap->oap_interrupted = 0; - if (oap->oap_cmd == OBD_BRW_WRITE) { + if (oap->oap_cmd & OBD_BRW_WRITE) { osc_process_ar(&cli->cl_ar, oap->oap_request, rc); osc_process_ar(&oap->oap_loi->loi_ar, oap->oap_request, rc); } @@ -1461,16 +1536,6 @@ out: RETURN(req); } -static void lop_update_pending(struct client_obd *cli, - struct loi_oap_pages *lop, int cmd, int delta) -{ - lop->lop_num_pending += delta; - if (cmd == OBD_BRW_WRITE) - cli->cl_pending_w_pages += delta; - else - cli->cl_pending_r_pages += delta; -} - /* the loi lock is held across this function but it's allowed to release * and reacquire it during its work */ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi, @@ -1683,74 +1748,6 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi, RETURN(1); } -static int lop_makes_rpc(struct client_obd *cli, struct loi_oap_pages *lop, - int cmd) -{ - int optimal; - ENTRY; - - if (lop->lop_num_pending == 0) - RETURN(0); - - /* if we have an invalid import we want to drain the queued pages - * by forcing them through rpcs that immediately fail and complete - * the pages. recovery relies on this to empty the queued pages - * before canceling the locks and evicting down the llite pages */ - if (cli->cl_import == NULL || cli->cl_import->imp_invalid) - RETURN(1); - - /* stream rpcs in queue order as long as as there is an urgent page - * queued. 
this is our cheap solution for good batching in the case - * where writepage marks some random page in the middle of the file as - * urgent because of, say, memory pressure */ - if (!list_empty(&lop->lop_urgent)) - RETURN(1); - - /* fire off rpcs when we have 'optimal' rpcs as tuned for the wire. */ - optimal = cli->cl_max_pages_per_rpc; - if (cmd == OBD_BRW_WRITE) { - /* trigger a write rpc stream as long as there are dirtiers - * waiting for space. as they're waiting, they're not going to - * create more pages to coallesce with what's waiting.. */ - if (!list_empty(&cli->cl_cache_waiters)) - RETURN(1); - - /* +16 to avoid triggering rpcs that would want to include pages - * that are being queued but which can't be made ready until - * the queuer finishes with the page. this is a wart for - * llite::commit_write() */ - optimal += 16; - } - if (lop->lop_num_pending >= optimal) - RETURN(1); - - RETURN(0); -} - -static void on_list(struct list_head *item, struct list_head *list, - int should_be_on) -{ - if (list_empty(item) && should_be_on) - list_add_tail(item, list); - else if (!list_empty(item) && !should_be_on) - list_del_init(item); -} - -/* maintain the loi's cli list membership invariants so that osc_send_oap_rpc - * can find pages to build into rpcs quickly */ -static void loi_list_maint(struct client_obd *cli, struct lov_oinfo *loi) -{ - on_list(&loi->loi_cli_item, &cli->cl_loi_ready_list, - lop_makes_rpc(cli, &loi->loi_write_lop, OBD_BRW_WRITE) || - lop_makes_rpc(cli, &loi->loi_read_lop, OBD_BRW_READ)); - - on_list(&loi->loi_write_item, &cli->cl_loi_write_list, - loi->loi_write_lop.lop_num_pending); - - on_list(&loi->loi_read_item, &cli->cl_loi_read_list, - loi->loi_read_lop.lop_num_pending); -} - #define LOI_DEBUG(LOI, STR, args...) 
\ CDEBUG(D_INODE, "loi ready %d wr %d:%d rd %d:%d " STR, \ !list_empty(&(LOI)->loi_cli_item), \ @@ -2026,9 +2023,9 @@ static int osc_queue_async_io(struct obd_export *exp, struct lov_stripe_md *lsm, !list_empty(&oap->oap_rpc_item)) RETURN(-EBUSY); -#ifdef HAVE_QUOTA_SUPPORT /* check if the file's owner/group is over quota */ - if (cmd == OBD_BRW_WRITE){ +#ifdef HAVE_QUOTA_SUPPORT + if ((cmd & OBD_BRW_WRITE) && !(cmd & OBD_BRW_NOQUOTA)){ struct obd_async_page_ops *ops; struct obdo *oa = NULL; @@ -2053,12 +2050,12 @@ static int osc_queue_async_io(struct obd_export *exp, struct lov_stripe_md *lsm, spin_lock(&cli->cl_loi_list_lock); oap->oap_cmd = cmd; - oap->oap_async_flags = async_flags; oap->oap_page_off = off; oap->oap_count = count; oap->oap_brw_flags = brw_flags; + oap->oap_async_flags = async_flags; - if (cmd == OBD_BRW_WRITE) { + if (cmd & OBD_BRW_WRITE) { rc = osc_enter_cache(cli, loi, oap); if (rc) { spin_unlock(&cli->cl_loi_list_lock); @@ -2109,7 +2106,7 @@ static int osc_set_async_flags(struct obd_export *exp, if (loi == NULL) loi = &lsm->lsm_oinfo[0]; - if (oap->oap_cmd == OBD_BRW_WRITE) { + if (oap->oap_cmd & OBD_BRW_WRITE) { lop = &loi->loi_write_lop; } else { lop = &loi->loi_read_lop; @@ -2176,7 +2173,7 @@ static int osc_queue_group_io(struct obd_export *exp, struct lov_stripe_md *lsm, oap->oap_brw_flags = brw_flags; oap->oap_async_flags = async_flags; - if (cmd == OBD_BRW_WRITE) + if (cmd & OBD_BRW_WRITE) lop = &loi->loi_write_lop; else lop = &loi->loi_read_lop; @@ -2249,7 +2246,7 @@ static int osc_teardown_async_page(struct obd_export *exp, if (loi == NULL) loi = &lsm->lsm_oinfo[0]; - if (oap->oap_cmd == OBD_BRW_WRITE) { + if (oap->oap_cmd & OBD_BRW_WRITE) { lop = &loi->loi_write_lop; } else { lop = &loi->loi_read_lop; diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c index b106311..2c37488 100644 --- a/lustre/ptlrpc/events.c +++ b/lustre/ptlrpc/events.c @@ -486,9 +486,7 @@ int ptlrpc_ni_init(int number, char *name, struct ptlrpc_ni *pni) * 
the event queue. In fact lustre never pulls events off this queue, * so it's only sized for some debug history. */ # if CRAY_PORTALS - rc = PtlNIDebug(pni->pni_ni_h, 0xffffffff); - if (rc != PTL_OK) - CDEBUG(D_ERROR, "Can't enable Cray Portals Debug: rc %d\n", rc); + PtlNIDebug(pni->pni_ni_h, 0xffffffff); # endif rc = PtlEQAlloc(pni->pni_ni_h, 1024, ptlrpc_master_callback, &pni->pni_eq_h); diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index ef777f9..039911b 100644 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -120,6 +120,11 @@ check_mount2() { build_test_filter +if [ "$ONLY" == "setup" ]; then + setup + exit +fi + if [ "$ONLY" == "cleanup" ]; then cleanup exit @@ -217,8 +222,6 @@ run_test 5 "force cleanup mds, then cleanup" test_5b() { start_ost - start_mds - stop_mds [ -d $MOUNT ] || mkdir -p $MOUNT $LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null diff --git a/lustre/tests/ll_dirstripe_verify.c b/lustre/tests/ll_dirstripe_verify.c index 56eea6e..903e2a7 100644 --- a/lustre/tests/ll_dirstripe_verify.c +++ b/lustre/tests/ll_dirstripe_verify.c @@ -30,12 +30,12 @@ int read_proc_entry(char *proc_path, char *buf, int len) { - int rcnt = -1, fd; + int rcnt = -2, fd; if ((fd = open(proc_path, O_RDONLY)) == -1) { fprintf(stderr, "open('%s') failed: %s\n", proc_path, strerror(errno)); - rcnt = -1; + rcnt = -3; } else if ((rcnt = read(fd, buf, len)) <= 0) { fprintf(stderr, "read('%s') failed: %s\n", proc_path, strerror(errno)); @@ -59,56 +59,65 @@ int compare(struct lov_user_md *lum_dir, struct lov_user_md *lum_file1, char buf[128]; char lov_path[PATH_MAX]; char tmp_path[PATH_MAX]; - int i; + int i, rc; - if (read_proc_entry("/proc/fs/lustre/llite/fs0/lov/common_name", - buf, sizeof(buf)) <= 0) - return -1; + rc = read_proc_entry("/proc/fs/lustre/llite/fs0/lov/common_name", + buf, sizeof(buf)) <= 0; + if (rc < 0) + return -rc; snprintf(lov_path, sizeof(lov_path) - 1, "/proc/fs/lustre/lov/%s", buf); stripe_count = 
(int)lum_dir->lmm_stripe_count; - if (stripe_count == 0) { + if (stripe_count == 0 || stripe_count == (__u16)-1) { snprintf(tmp_path, sizeof(tmp_path) - 1, "%s/stripecount", lov_path); if (read_proc_entry(tmp_path, buf, sizeof(buf)) <= 0) - return -1; + return 4; stripe_count = atoi(buf); } + snprintf(tmp_path, sizeof(tmp_path) - 1, "%s/numobd", lov_path); + if (read_proc_entry(tmp_path, buf, sizeof(buf)) <= 0) + return 6; + + ost_count = atoi(buf); + stripe_count = stripe_count ? stripe_count : ost_count; + + if (lum_file1->lmm_stripe_count != stripe_count) { + fprintf(stderr, "stripe count %d != %d\n", + lum_file1->lmm_stripe_count, stripe_count); + return 7; + } + stripe_size = (int)lum_dir->lmm_stripe_size; if (stripe_size == 0) { snprintf(tmp_path, sizeof(tmp_path) - 1, "%s/stripesize", lov_path); if (read_proc_entry(tmp_path, buf, sizeof(buf)) <= 0) - return -1; + return 5; stripe_size = atoi(buf); } - snprintf(tmp_path, sizeof(tmp_path) - 1, "%s/numobd", lov_path); - if (read_proc_entry(tmp_path, buf, sizeof(buf)) <= 0) - return -1; - - ost_count = atoi(buf); - stripe_count = stripe_count ? 
stripe_count : ost_count; - - if ((lum_file1->lmm_stripe_count != stripe_count) || - (lum_file1->lmm_stripe_size != stripe_size)) - return -1; + if (lum_file1->lmm_stripe_size != stripe_size) { + fprintf(stderr, "stripe size %d != %d\n", + lum_file1->lmm_stripe_size, stripe_size); + return 8; + } stripe_offset = (short int)lum_dir->lmm_stripe_offset; if (stripe_offset != -1) { for (i = 0; i < stripe_count; i++) if (lum_file1->lmm_objects[i].l_ost_idx != (stripe_offset + i) % ost_count) - return -1; + return 9; } else if (lum_file2 != NULL) { int next, idx; next = (lum_file1->lmm_objects[stripe_count-1].l_ost_idx + 1) % ost_count; idx = lum_file2->lmm_objects[0].l_ost_idx; if (idx != next) - return -1; + return 10; } return 0; @@ -129,7 +138,7 @@ int main(int argc, char **argv) } dir = opendir(argv[1]); - if (dir == NULL) { + if (dir == NULL) { fprintf(stderr, "%s opendir failed\n", argv[1]); return errno; } diff --git a/lustre/tests/runas.c b/lustre/tests/runas.c index f36a9ff..4db7617 100644 --- a/lustre/tests/runas.c +++ b/lustre/tests/runas.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -37,21 +38,43 @@ int main(int argc, char **argv) uid_t user_id = 0; gid_t grp_id = 0, supp_groups[NGROUPS_MAX] = { 0 }; - if (argc == 1) + if (argc == 1) { + fprintf(stderr, "No parameter count\n"); Usage_and_abort(name); + } // get UID and GID while ((c = getopt(argc, argv, "+u:g:hG::")) != -1) { switch (c) { case 'u': - user_id = (uid_t)atoi(optarg); + if (!isdigit(optarg[0])) { + struct passwd *pw = getpwnam(optarg); + if (pw == NULL) { + fprintf(stderr, "parameter '%s' bad\n", + optarg); + Usage_and_abort(name); + } + user_id = pw->pw_uid; + } else { + user_id = (uid_t)atoi(optarg); + } uid_is_set = 1; if (!gid_is_set) grp_id = user_id; break; case 'g': - grp_id = (gid_t)atoi(optarg); + if (!isdigit(optarg[0])) { + struct group *gr = getgrnam(optarg); + if (gr == NULL) { + fprintf(stderr, "getgrname %s failed\n", + optarg); + 
Usage_and_abort(name); + } + grp_id = gr->gr_gid; + } else { + grp_id = (gid_t)atoi(optarg); + } gid_is_set = 1; break; @@ -74,11 +97,13 @@ int main(int argc, char **argv) } } - if (!uid_is_set) + if (!uid_is_set) { + fprintf(stderr, "Must specify uid to run.\n"); Usage_and_abort(name); + } if (optind == argc) { - fputs("Must specify command to run.\n", stderr); + fprintf(stderr, "Must specify command to run.\n"); Usage_and_abort(name); } diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 7ab0317..4de3216 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -11,6 +11,8 @@ ONLY=${ONLY:-"$*"} ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"42a 42c 45 68"} # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! +[ "$SLOW" = "no" ] && EXCEPT="$EXCEPT 24o 51b 51c 64b 71" + [ "$ALWAYS_EXCEPT$EXCEPT" ] && \ echo "Skipping tests: `echo $ALWAYS_EXCEPT $EXCEPT`" @@ -193,6 +195,7 @@ STRIPECOUNT=`cat $LPROC/lov/$LOVNAME/stripecount` STRIPESIZE=`cat $LPROC/lov/$LOVNAME/stripesize` ORIGFREE=`cat $LPROC/lov/$LOVNAME/kbytesavail` MAXFREE=${MAXFREE:-$((200000 * $OSTCOUNT))} +MDS=$(\ls $LPROC/mds 2> /dev/null | grep -v num_refs | tail -n 1) [ -f $DIR/d52a/foo ] && chattr -a $DIR/d52a/foo [ -f $DIR/d52b/foo ] && chattr -i $DIR/d52b/foo @@ -2238,13 +2241,9 @@ run_test 58 "verify cross-platform wire constants ==============" test_59() { echo "touch 130 files" - for i in `seq 1 130` ; do - touch $DIR/59-$i - done + createmany -o $DIR/f59- 130 echo "rm 130 files" - for i in `seq 1 130` ; do - rm -f $DIR/59-$i - done + unlinkmany $DIR/f59- 130 sync sleep 2 # wait for commitment of removal @@ -2260,7 +2259,7 @@ run_test 60 "llog sanity tests run from kernel module ==========" test_60b() { # bug 6411 dmesg > $DIR/dmesg LLOG_COUNT=`dmesg | grep -c llog_test` - [ $LLOG_COUNT -gt 50 ] && error "CDEBUG_LIMIT broken" || true + [ $LLOG_COUNT -gt 50 ] && error "CDEBUG_LIMIT not limiting messages"|| true } run_test 60b "limit repeated messages from CERROR/CWARN 
========" @@ -2439,7 +2438,11 @@ test_67() { # bug 3285 - supplementary group fails on MDS, passes on client mkdir $DIR/d67 chmod 771 $DIR/d67 chgrp $RUNAS_ID $DIR/d67 - $RUNAS -g $(($RUNAS_ID + 1)) -G1,2,$RUNAS_ID ls $DIR/d67 && error ||true + $RUNAS -u $RUNAS_ID -g $(($RUNAS_ID + 1)) -G1,2,$RUNAS_ID ls $DIR/d67 + RC=$? + GROUP_UPCALL=`cat /proc/fs/lustre/mds/$MDS/group_upcall` + [ "$GROUP_UPCALL" = "NONE" -a $RC -eq 0 ] && error "no-upcall passwd" || true + [ "$GROUP_UPCALL" != "NONE" -a $RC -ne 0 ] && error "upcall failed" || true } run_test 67 "supplementary group failure (should return error) =" diff --git a/lustre/utils/.cvsignore b/lustre/utils/.cvsignore index 59147ac..6fbc4de 100644 --- a/lustre/utils/.cvsignore +++ b/lustre/utils/.cvsignore @@ -16,6 +16,7 @@ lload wirecheck lfs llmount +l_getgroups mount.lustre wiretest .*.cmd diff --git a/lustre/utils/Makefile.am b/lustre/utils/Makefile.am index 868835b..daf9480 100644 --- a/lustre/utils/Makefile.am +++ b/lustre/utils/Makefile.am @@ -14,7 +14,8 @@ bin_scripts = lfind lstripe if UTILS rootsbin_SCRIPTS = mount.lustre -sbin_PROGRAMS = lctl obdio obdbarrier lload wirecheck wiretest llmount +sbin_PROGRAMS = lctl obdio obdbarrier lload wirecheck wiretest llmount \ + l_getgroups bin_PROGRAMS = lfs lib_LIBRARIES = liblustreapi.a sbin_SCRIPTS = $(sbin_scripts) diff --git a/lustre/utils/l_getgroups.c b/lustre/utils/l_getgroups.c index 8909422..b569e4a 100644 --- a/lustre/utils/l_getgroups.c +++ b/lustre/utils/l_getgroups.c @@ -29,8 +29,40 @@ #include #include #include +#include +#include + #include +static char *progname; + +void usage(FILE *out) +{ + fprintf(out, "\nusage: %s {-d | mdsname} {uid}\n" + "Normally invoked as an upcall from Lustre, set via:\n" + " /proc/fs/lustre/mds/{mdsname}/group_upcall\n" + "\t-d: debug, print values to stdout instead of Lustre\n", + progname); +} + +static int compare_u32(const void *v1, const void *v2) +{ + return (*(__u32 *)v1 - *(__u32 *)v2); +} + +static void errlog(const 
char *fmt, ...) +{ + va_list arg; + + openlog(progname, LOG_PERROR, LOG_AUTHPRIV); + + va_start(arg, fmt); + vsyslog(LOG_NOTICE, fmt, arg); + va_end(arg); + + closelog(); +} + int get_groups_local(struct mds_grp_downcall_data **grp) { struct mds_grp_downcall_data *param; @@ -40,19 +72,23 @@ int get_groups_local(struct mds_grp_downcall_data **grp) pw = getpwuid((*grp)->mgd_uid); if (!pw) { - (*grp)->mgd_err = -errno; + errlog("no such user %u\n", (*grp)->mgd_uid); + (*grp)->mgd_err = errno ? errno : EIDRM; return sizeof(*param); } + (*grp)->mgd_gid = pw->pw_gid; maxgroups = sysconf(_SC_NGROUPS_MAX); size = offsetof(struct mds_grp_downcall_data, mgd_groups[maxgroups]); param = malloc(size); if (param == NULL) { - (*grp)->mgd_err = -ENOMEM; + errlog("fail to alloc %d bytes for uid %u with %d groups\n", + size, (*grp)->mgd_uid, maxgroups); return sizeof(*param); } memcpy(param, *grp, sizeof(*param)); + param->mgd_groups[param->mgd_ngroups++] = pw->pw_gid; *grp = param; while ((gr = getgrent())) { if (!gr->gr_mem) @@ -68,6 +104,8 @@ int get_groups_local(struct mds_grp_downcall_data **grp) break; } endgrent(); + qsort(param->mgd_groups, param->mgd_ngroups, + sizeof(param->mgd_groups[0]), compare_u32); return size; } @@ -76,29 +114,65 @@ int get_groups_local(struct mds_grp_downcall_data **grp) * MDS doesn't continue to wait on the upcall. 
*/ int main(int argc, char **argv) { - int fd, rc, size; + int fd, rc, size, debug = 0; struct mds_grp_downcall_data sparam = { MDS_GRP_DOWNCALL_MAGIC }; struct mds_grp_downcall_data *param = &sparam; - char pathname[1024]; + char pathname[1024], *end; - if (argc != 3) { - printf("bad parameter\n"); - return -1; - } + progname = strrchr(argv[0], '/'); + if (progname == NULL) + progname = argv[0]; + else + progname++; - snprintf(pathname, 1024, "/proc/fs/lustre/mds/%s/group_info", argv[1]); - param->mgd_uid = atoi(argv[2]); + if (strcmp(argv[1], "-d") == 0) + debug = 1; - fd = open(pathname, O_WRONLY); - if (fd < 0) { - printf("can't open device %s\n", pathname); - return -1; + if (argc != 3) { + fprintf(stderr, "%s: bad parameter count\n", progname); + usage(stderr); + return EINVAL; + } + param->mgd_uid = strtoul(argv[2], &end, 0); + if (*end) { + fprintf(stderr, "%s: invalid uid '%s'\n", progname, argv[2]); + usage(stderr); + return EINVAL; } size = get_groups_local(¶m); + if (debug) { + int i; + if (param->mgd_err) { + if (param->mgd_err != ENXIO) + fprintf(stderr, + "%s: error getting uid %d groups: %s\n", + progname, param->mgd_uid, + strerror(param->mgd_err)); + rc = param->mgd_err; + } else { + printf("uid=%d gid=", param->mgd_uid); + for (i = 0; i < param->mgd_ngroups; i++) + printf("%s%d", i > 0 ? 
"," : "", + param->mgd_groups[i]); + printf("\n"); + rc = 0; + } + } else { + snprintf(pathname, 1024, "/proc/fs/lustre/mds/%s/group_info", + argv[1]); + fd = open(pathname, O_WRONLY); + if (fd < 0) { + fprintf(stderr, "%s: can't open device %s: %s\n", + progname, pathname, strerror(errno)); + rc = errno; + } else { + rc = write(fd, param, size); + if (rc > 0) + rc = 0; - rc = write(fd, param, size); - - close(fd); + close(fd); + } + } return rc; } diff --git a/lustre/utils/lconf b/lustre/utils/lconf index 61b73b3..0ee82ef 100755 --- a/lustre/utils/lconf +++ b/lustre/utils/lconf @@ -1791,6 +1791,9 @@ class MDSDEV(Module): lctl.newdev("mds", self.name, self.uuid, setup ="%s %s %s %s" %(blkdev, self.fstype, self.name, mountfsoptions)) + self.group_upcall = self.db.get_val('group_upcall','') + sys_set_group_upcall(self.name, self.group_upcall) + except CommandError, e: if e.rc == 2: panic("MDS is missing the config log. Need to run " + @@ -3079,7 +3082,7 @@ def validate_upcall(upcall): print "WARNING invalid upcall script specified: %s" % upcall def sys_set_lustre_upcall(upcall): - # the command overrides the value in the node config + # the command line overrides the value in the node config if config.lustre_upcall: upcall = config.lustre_upcall elif config.upcall: @@ -3089,7 +3092,7 @@ def sys_set_lustre_upcall(upcall): lctl.set_lustre_upcall(upcall) def sys_set_portals_upcall(upcall): - # the command overrides the value in the node config + # the command line overrides the value in the node config if config.portals_upcall: upcall = config.portals_upcall elif config.upcall: @@ -3098,6 +3101,20 @@ def sys_set_portals_upcall(upcall): validate_upcall(upcall) sysctl('portals/upcall', upcall) +def sys_set_group_upcall(mds, upcall): + if config.noexec: + return + # the command line overrides the value in the MDS config + if config.group_upcall: + upcall = config.group_upcall + if upcall: + validate_upcall(upcall) + debug("setting MDS", mds, "upcall to:", upcall) + path = 
"/proc/fs/lustre/mds/" + mds + "/group_upcall" + fp = open(path, 'w') + fp.write(upcall) + fp.close() + def sys_set_timeout(timeout): # the command overrides the value in the node config if config.timeout and config.timeout > 0: @@ -3266,6 +3283,7 @@ lconf_options = [ ('upcall', "Set both portals and lustre upcall script", PARAM), ('lustre_upcall', "Set lustre upcall script", PARAM), ('portals_upcall', "Set portals upcall script", PARAM), + ('group_upcall', "Set supplementary group upcall program", PARAM), ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM), ('ptldebug', "Set the portals debug level", PARAM), ('subsystem', "Set the portals debug subsystem", PARAM), diff --git a/lustre/utils/lmc b/lustre/utils/lmc index f60def2..78435c0 100755 --- a/lustre/utils/lmc +++ b/lustre/utils/lmc @@ -66,6 +66,7 @@ Object creation command summary: --timeout num --upcall path --lustre_upcall path + --groups_upcall path --portals_upcall path --ptldebug debug_level --subsystem subsystem_name @@ -88,6 +89,7 @@ Object creation command summary: --fstype ldiskfs|ext3 --size size --nspath + --group_upcall upcall --journal_size size --inode_size size --mdsuuid uuid @@ -160,6 +162,7 @@ lmc_options = [ ('timeout', "Set timeout to initiate recovery.", PARAM), ('upcall', "Set both lustre and portals upcall scripts.", PARAM), ('lustre_upcall', "Set location of lustre upcall script.", PARAM), + ('groups_upcall', "Set location of extended groups upcall script.", PARAM), ('portals_upcall', "Set location of portals upcall script.", PARAM), ('ptldebug', "Set the portals debug level", PARAM), ('subsystem', "Specify which Lustre subsystems have debug output recorded in the log", PARAM), @@ -189,6 +192,7 @@ lmc_options = [ ('group', "", PARAM), ('dev', "Path of the device on local system.", PARAM,""), ('size', "Specify the size of the device if needed.", PARAM,"0"), + ('group_upcall', "Set location of supplementary group upcall.", PARAM,""), ('journal_size', "Specify new journal 
size for underlying ext3 file system.", PARAM,"0"), ('inode_size', "Specify new inode size for underlying ext3 file system.", PARAM,"0"), ('fstype', "Optional argument to specify the filesystem type.", PARAM, "ext3"), @@ -442,7 +446,7 @@ class GenConfig: def mdsdev(self, name, uuid, fstype, devname, format, node_uuid, mds_uuid, dev_size=0, journal_size=0, inode_size=256, - nspath="", mkfsoptions="", mountfsoptions=""): + nspath="", mkfsoptions="", mountfsoptions="", group_upcall=""): mdd = self.newService("mdsdev", name, uuid) self.addElement(mdd, "fstype", fstype) dev = self.addElement(mdd, "devpath", devname) @@ -459,6 +463,8 @@ class GenConfig: self.addElement(mdd, "mkfsoptions", mkfsoptions) if mountfsoptions: self.addElement(mdd, "mountfsoptions", mountfsoptions) + if group_upcall: + self.addElement(mdd, "group_upcall", group_upcall) mdd.appendChild(self.ref("node", node_uuid)) mdd.appendChild(self.ref("target", mds_uuid)) @@ -607,6 +613,11 @@ def set_node_options(gen, node, options): gen.addElement(node, 'lustreUpcall', options.lustre_upcall) else: gen.addElement(node, 'lustreUpcall', default_upcall) + if default_upcall or options.groups_upcall: + if options.groups_upcall: + gen.addElement(node, 'groupsUpcall', options.groups_upcall) + else: + gen.addElement(node, 'groupsUpcall', default_upcall) if default_upcall or options.portals_upcall: if options.portals_upcall: gen.addElement(node, 'portalsUpcall', options.portals_upcall) @@ -736,6 +747,7 @@ def add_mds(gen, lustre, options): nspath = get_option(options, 'nspath') mkfsoptions = get_option(options, 'mkfsoptions') mountfsoptions = get_option(options, 'mountfsoptions') + group_upcall = get_option(options, 'group_upcall') node_uuid = name2uuid(lustre, node_name, 'node') @@ -748,7 +760,7 @@ def add_mds(gen, lustre, options): mdd = gen.mdsdev(mdd_name, mdd_uuid, fstype, devname, get_format_flag(options), node_uuid, mds_uuid, size, journal_size, inode_size, nspath, mkfsoptions, - mountfsoptions) + 
mountfsoptions, group_upcall) lustre.appendChild(mdd) diff --git a/lustre/utils/wirecheck.c b/lustre/utils/wirecheck.c index 16db4e7..08a4721 100644 --- a/lustre/utils/wirecheck.c +++ b/lustre/utils/wirecheck.c @@ -270,6 +270,7 @@ check_niobuf_remote(void) CHECK_VALUE(OBD_BRW_WRITE); CHECK_VALUE(OBD_BRW_SYNC); CHECK_VALUE(OBD_BRW_FROM_GRANT); + CHECK_VALUE(OBD_BRW_NOQUOTA); } void