include/linux/ext3_fs.h | 5 ++++-
5 files changed, 85 insertions(+), 6 deletions(-)
-Index: uml-2.6.3/fs/ext3/ialloc.c
+Index: linux-2.6.7/fs/ext3/ialloc.c
===================================================================
---- uml-2.6.3.orig/fs/ext3/ialloc.c 2004-02-20 15:00:48.000000000 +0800
-+++ uml-2.6.3/fs/ext3/ialloc.c 2004-02-21 00:24:45.202693776 +0800
+--- linux-2.6.7.orig/fs/ext3/ialloc.c 2005-03-24 00:27:43.282608616 +0800
++++ linux-2.6.7/fs/ext3/ialloc.c 2005-03-24 00:27:43.888516504 +0800
@@ -420,7 +420,8 @@
* For other inodes, search forward from the parent directory's block
* group to find a free inode.
if (S_ISDIR(mode)) {
if (test_opt (sb, OLDALLOC))
group = find_group_dir(sb, dir);
-Index: uml-2.6.3/fs/ext3/ioctl.c
+Index: linux-2.6.7/fs/ext3/ioctl.c
===================================================================
---- uml-2.6.3.orig/fs/ext3/ioctl.c 2004-01-09 14:59:26.000000000 +0800
-+++ uml-2.6.3/fs/ext3/ioctl.c 2004-02-21 00:21:04.541239416 +0800
-@@ -24,6 +24,31 @@
+--- linux-2.6.7.orig/fs/ext3/ioctl.c 2004-06-16 13:19:13.000000000 +0800
++++ linux-2.6.7/fs/ext3/ioctl.c 2005-03-24 00:31:16.113253440 +0800
+@@ -9,6 +9,7 @@
+
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
++#include <linux/namei.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/time.h>
+@@ -24,6 +25,31 @@
ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg);
switch (cmd) {
+ }
case EXT3_IOC_GETFLAGS:
flags = ei->i_flags & EXT3_FL_USER_VISIBLE;
- return put_user(flags, (int *) arg);
-Index: uml-2.6.3/fs/ext3/namei.c
+ return put_user(flags, (int __user *) arg);
+Index: linux-2.6.7/fs/ext3/namei.c
===================================================================
---- uml-2.6.3.orig/fs/ext3/namei.c 2004-02-20 15:01:27.000000000 +0800
-+++ uml-2.6.3/fs/ext3/namei.c 2004-02-21 00:21:04.611228776 +0800
-@@ -1617,6 +1617,19 @@
+--- linux-2.6.7.orig/fs/ext3/namei.c 2005-03-24 00:27:43.536570008 +0800
++++ linux-2.6.7/fs/ext3/namei.c 2005-03-24 00:27:43.893515744 +0800
+@@ -1939,6 +1939,19 @@
return err;
}
/*
* By the time this is called, we already have created
* the directory cache entry for the new file, but it
-@@ -1640,7 +1653,7 @@
+@@ -1963,7 +1976,7 @@
if (IS_DIRSYNC(dir))
handle->h_sync = 1;
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
inode->i_op = &ext3_file_inode_operations;
-@@ -1670,7 +1683,7 @@
+@@ -1994,7 +2007,7 @@
if (IS_DIRSYNC(dir))
handle->h_sync = 1;
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
init_special_inode(inode, inode->i_mode, rdev);
-@@ -1702,7 +1715,7 @@
+@@ -2027,7 +2040,7 @@
if (IS_DIRSYNC(dir))
handle->h_sync = 1;
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_stop;
-@@ -2094,7 +2107,7 @@
+@@ -2439,7 +2452,7 @@
if (IS_DIRSYNC(dir))
handle->h_sync = 1;
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_stop;
-Index: uml-2.6.3/include/linux/ext3_fs.h
+Index: linux-2.6.7/include/linux/ext3_fs.h
===================================================================
---- uml-2.6.3.orig/include/linux/ext3_fs.h 2004-01-09 14:59:44.000000000 +0800
-+++ uml-2.6.3/include/linux/ext3_fs.h 2004-02-21 00:21:04.613228472 +0800
+--- linux-2.6.7.orig/include/linux/ext3_fs.h 2005-03-24 00:27:43.542569096 +0800
++++ linux-2.6.7/include/linux/ext3_fs.h 2005-03-24 00:27:43.893515744 +0800
@@ -203,6 +203,7 @@
#define EXT3_IOC_SETFLAGS _IOW('f', 2, long)
#define EXT3_IOC_GETVERSION _IOR('f', 3, long)
#define EXT3_IOC_GETVERSION_OLD _IOR('v', 1, long)
#define EXT3_IOC_SETVERSION_OLD _IOW('v', 2, long)
#ifdef CONFIG_JBD_DEBUG
-@@ -707,7 +708,8 @@
+@@ -708,7 +709,8 @@
dx_hash_info *hinfo);
/* ialloc.c */
extern void ext3_free_inode (handle_t *, struct inode *);
extern struct inode * ext3_orphan_get (struct super_block *, unsigned long);
extern unsigned long ext3_count_free_inodes (struct super_block *);
-@@ -792,4 +794,5 @@
+@@ -793,4 +795,5 @@
#endif /* __KERNEL__ */
subdir-m += lvfs
subdir-m += obdclass
+subdir-m += sec
subdir-m += lov
subdir-m += lmv
subdir-m += ptlrpc
AUTOMAKE_OPTIONS = foreign
-SUBDIRS = include ldiskfs lvfs obdclass lov ldlm ptlrpc \
+SUBDIRS = include ldiskfs lvfs obdclass lov ldlm sec ptlrpc \
obdecho osc mdc lmv mds obdfilter ost llite cobd ptlbd smfs snapfs \
cmobd liblustre doc utils tests conf scripts autoconf
])
#
+# LC_CONFIG_GSS
+#
+# whether build-in gss/krb5 capability
+#
+AC_DEFUN([LC_CONFIG_GSS],
+[AC_MSG_CHECKING([whether to enable gss/krb5 support])
+AC_ARG_ENABLE([gss],
+ AC_HELP_STRING([--enable-gss],
+ [enable gss/krb5 support]),
+ [],[enable_gss='yes'])
+AC_MSG_RESULT([$enable_gss])
+if test x$enable_gss != xno ; then
+ AC_DEFINE(ENABLE_GSS, 1, Support GSS/krb5)
+fi
+])
+
+#
# LC_CONFIG_SNAPFS
#
# Whether snapfs is desired
AC_DEFUN([LC_PROG_LINUX],
[LC_CONFIG_BACKINGFS
LC_CONFIG_PINGER
+LC_CONFIG_GSS
LC_CONFIG_SNAPFS
LC_CONFIG_SMFS
AM_CONDITIONAL(MPITESTS, test x$enable_mpitests = xyes, Build MPI Tests)
AM_CONDITIONAL(SNAPFS, test x$enable_snapfs = xyes)
AM_CONDITIONAL(SMFS, test x$enable_smfs = xyes)
+AM_CONDITIONAL(GSS, test x$enable_gss = xyes)
AM_CONDITIONAL(LIBLUSTRE, test x$enable_liblustre = xyes)
AM_CONDITIONAL(MPITESTS, test x$enable_mpitests = xyes, Build MPI Tests)
])
lustre/ldiskfs/autoMakefile
lustre/ldlm/Makefile
lustre/liblustre/Makefile
-lustre/liblustre/tests/Makefile
lustre/llite/Makefile
lustre/llite/autoMakefile
lustre/lmv/Makefile
lustre/ptlrpc/autoMakefile
lustre/scripts/Makefile
lustre/scripts/version_tag.pl
+lustre/sec/Makefile
+lustre/sec/autoMakefile
+lustre/sec/gss/Makefile
+lustre/sec/gss/autoMakefile
lustre/smfs/Makefile
lustre/smfs/autoMakefile
lustre/snapfs/Makefile
}
static int cobd_getattr(struct obd_export *exp, struct obdo *oa,
- struct lov_stripe_md *lsm)
+ struct lov_stripe_md *ea)
{
struct obd_device *obd = class_exp2obd(exp);
struct obd_export *cobd_exp;
return -EINVAL;
}
cobd_exp = cobd_get_exp(obd);
- return obd_getattr(cobd_exp, oa, lsm);
+ return obd_getattr(cobd_exp, oa, ea);
}
static int cobd_getattr_async(struct obd_export *exp,
}
static int cobd_md_getattr(struct obd_export *exp, struct lustre_id *id,
- __u64 valid, unsigned int ea_size,
- struct ptlrpc_request **request)
+ __u64 valid, const char *ea_name, int ea_namelen,
+ unsigned int ea_size, struct ptlrpc_request **request)
{
struct obd_device *obd = class_exp2obd(exp);
struct obd_export *cobd_exp;
return -EINVAL;
}
cobd_exp = cobd_get_exp(obd);
- return md_getattr(cobd_exp, id, valid, ea_size, request);
+ return md_getattr(cobd_exp, id, valid, NULL, 0, ea_size, request);
}
static int cobd_md_req2lustre_md (struct obd_export *mdc_exp,
int count;
};
-static inline void MODULE_AUTHOR(char *name)
-{
- printf("%s\n", name);
-}
-#define MODULE_DESCRIPTION(name) MODULE_AUTHOR(name)
-#define MODULE_LICENSE(name) MODULE_AUTHOR(name)
+#define MODULE_AUTHOR(name)
+#define MODULE_DESCRIPTION(name)
+#define MODULE_LICENSE(name)
+
+#define module_init(init)
+#define module_exit(exit)
#define THIS_MODULE NULL
#define __init
#define __exit
+#define __user
/* devices */
static inline void spin_lock_irqsave(spinlock_t *a, unsigned long b) {}
static inline void spin_unlock_irqrestore(spinlock_t *a, unsigned long b) {}
+typedef struct { } rwlock_t;
+#define rwlock_init(x) do {} while(0)
+#define RW_LOCK_UNLOCKED (rwlock_t) {}
+#define read_lock(l)
+#define read_unlock(l)
+#define write_lock(l)
+#define write_unlock(l)
+
#define min(x,y) ((x)<(y) ? (x) : (y))
#define max(x,y) ((x)>(y) ? (x) : (y))
({ type __x = (x); type __y = (y); __x > __y ? __x: __y; })
#endif
+#define container_of(ptr, type, member) ({ \
+ const typeof( ((type *)0)->member ) *__mptr = (ptr); \
+ (type *)( (char *)__mptr - offsetof(type,member) );})
+
/* registering symbols */
#define ERESTARTSYS ERESTART
return 0;
}
+static inline long strncpy_from_user(char *dest, const char *src, long n)
+{
+ char *s;
+ s = strncpy(dest, src, n);
+ return strnlen(s, n);
+}
/* slabs */
typedef struct {
#define ATTR_ATTR_FLAG 0x0400
#define ATTR_RAW 0x0800 /* file system, not vfs will massage attrs */
#define ATTR_FROM_OPEN 0x1000 /* called from open path, ie O_TRUNC */
-#define ATTR_CTIME_SET 0x2000
+/* ATTR_CTIME_SET has been defined in lustre_idl.h */
struct iattr {
unsigned int ia_valid;
#define INTENT_MAGIC 0x19620323
-struct lustre_intent_data {
- int it_disposition;
- int it_status;
- __u64 it_lock_handle;
- void *it_data;
- int it_lock_mode;
- int it_int_flags;
-};
struct lookup_intent {
int it_magic;
void (*it_op_release)(struct lookup_intent *);
int it_op;
int it_flags;
int it_create_mode;
- union {
- struct lustre_intent_data lustre;
- } d;
+ union {
+ void *fs_data; /* FS-specific intent data */
+ } d;
};
+struct lustre_intent_data {
+ int it_disposition;
+ int it_status;
+ __u64 it_lock_handle;
+ void *it_data;
+ int it_lock_mode;
+ int it_int_flags;
+};
+
+#define LUSTRE_IT(it) ((struct lustre_intent_data *)((it)->d.fs_data))
+
static inline void intent_init(struct lookup_intent *it, int op, int flags)
{
memset(it, 0, sizeof(*it));
struct signal pending;
char comm[32];
int pid;
+ uid_t uid;
+ gid_t gid;
int fsuid;
int fsgid;
int max_groups;
#define time_after(a, b) ((long)(b) - (long)(a) < 0)
#define time_before(a, b) time_after(b,a)
+static inline unsigned long get_seconds(void)
+{
+ struct timeval tv;
+
+ gettimeofday(&tv, NULL);
+ return (tv.tv_sec + tv.tv_usec / 1000000);
+}
+
struct timer_list {
struct list_head tl_list;
void (*function)(unsigned long unused);
lustre_export.h lustre_log.h obd_echo.h obd_ptlbd.h obd_trace.h \
lustre_compat25.h lustre_fsfilt.h lustre_import.h lustre_mds.h obd.h \
lvfs.h lvfs_linux.h lustre_cfg.h lustre_lite.h lustre_idl.h lustre_smfs.h \
- lustre_cmobd.h obd_lmv.h lustre_snap.h
+ lustre_cmobd.h obd_lmv.h lustre_snap.h lustre_sec.h lustre_ucache.h \
+ lustre_acl.h
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002, 2003 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifndef _LUSTRE_ACL_H_
+#define _LUSTRE_ACL_H_
+
+#include <linux/xattr_acl.h>
+
+/*
+* the value of LL_ACL_MAX_ENTRIES and LL_ACL_NOT_CACHED should be
+* kept step with related definition in ext3 (EXT3_ACL_MAX_ENTRIES and
+* EXT3_ACL_NOT_CACHED)
+*/
+#define LL_ACL_MAX_ENTRIES 32 // EXT3_ACL_MAX_ENTRIES
+#define LL_ACL_NOT_CACHED ((void *)-1) //EXT3_ACL_NOT_CACHED
+
+#endif
LCFG_LOV_DEL_OBD = 0x00cf00c,
LCFG_ADD_CONN = 0x00cf00d,
LCFG_DEL_CONN = 0x00cf00e,
+ LCFG_SET_SECURITY = 0x00cf00f,
};
struct lustre_cfg {
uint32_t lmd_nal;
uint32_t lmd_server_ipaddr;
uint32_t lmd_port;
+ uint32_t lmd_nllu;
+ uint32_t lmd_nllg;
+ char lmd_security[16];
char lmd_mds[64];
char lmd_profile[64];
};
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)
+/* New (actually old) intent naming */
+#define lookup_intent open_intent
+
+/* And internals */
+#define it_flags flags
+#define it_op op
+#define it_magic magic
+#define it_op_release op_release
+#define it_create_mode create_mode
+
/*
* OBD need working random driver, thus all our
* initialization routines must be called after device
#include <linux/lustre_dlm.h>
struct mds_client_data;
+struct mds_idmap_table;
struct mds_export_data {
struct list_head med_open_head;
struct mds_client_data *med_mcd;
loff_t med_off;
int med_idx;
+ unsigned int med_local:1;
+ __u32 med_nllu;
+ __u32 med_nllg;
+ /* simple idmapping */
+ spinlock_t med_idmap_lock;
+ struct mds_idmap_table *med_idmap;
};
struct osc_creator {
#define OBD_MD_FLUID (0x0000000000000200LL) /* user ID */
#define OBD_MD_FLGID (0x0000000000000400LL) /* group ID */
#define OBD_MD_FLFLAGS (0x0000000000000800LL) /* flags word */
+#define OBD_MD_FLEA (0x0000000000001000LL) /* extended attributes */
#define OBD_MD_FLNLINK (0x0000000000002000LL) /* link count */
#define OBD_MD_FLGENER (0x0000000000004000LL) /* generation number */
#define OBD_MD_FLINLINE (0x0000000000008000LL) /* inline data */
#define OBD_MD_FLDIREA (0x0000000020000000LL) /* dir's extended attribute data */
#define OBD_MD_REINT (0x0000000040000000LL) /* reintegrate oa */
#define OBD_MD_FID (0x0000000080000000LL) /* lustre_id data */
+#define OBD_MD_FLEALIST (0x0000000100000000LL) /* list extended attributes */
+#define OBD_MD_FLACL_ACCESS (0x0000000200000000LL) /*access acl*/
#define OBD_MD_FLNOTOBD (~(OBD_MD_FLBLOCKS | OBD_MD_LINKNAME | \
OBD_MD_FLEASIZE | OBD_MD_FLHANDLE | \
OBD_MD_FLCKSUM | OBD_MD_FLQOS | \
OBD_MD_FLOSCOPQ | OBD_MD_FLCOOKIE | \
- OBD_MD_MDS))
+ OBD_MD_FLEA | OBD_MD_FLEALIST | \
+ OBD_MD_FLACL_ACCESS | OBD_MD_MDS))
static inline struct lustre_handle *obdo_handle(struct obdo *oa)
{
/*
* security descriptor in mds request
- *
- * note gid & cap might need be removed later:
- * - cap should be obtained on mds
- * - gid is actually not used.
*/
struct mds_req_sec_desc {
__u32 rsd_uid;
struct mds_body *body;
struct lov_stripe_md *lsm;
struct mea *mea;
+ struct posix_acl *acl_access;
};
struct mdc_op_data {
__u64 sa_ctime;
};
-/* Remove this once we declare it in include/linux/fs.h (v21 kernel patch?) */
-#ifndef ATTR_CTIME_SET
-#define ATTR_CTIME_SET 0x2000
+/* XXX Following ATTR_XXX should go to vfs patch... */
+#ifdef ATTR_CTIME_SET
+#error "ATTR_CTIME_SET has been defined somewhere else"
+#endif
+#ifdef ATTR_EA
+#error "ATTR_EA has been defined somewhere else"
+#endif
+#ifdef ATTR_EA_RM
+#error "ATTR_EA_RM has been defined somewhere else"
#endif
+#define ATTR_CTIME_SET 0x00002000
+#define ATTR_EA 0x00040000
+#define ATTR_EA_RM 0x00080000
+
extern void lustre_swab_mds_rec_setattr (struct mds_rec_setattr *sa);
#ifndef FMODE_READ
return (struct lustre_id *)raw_id;
}
+/* security negotiate */
+typedef enum {
+ SEC_INIT = 600,
+ SEC_INIT_CONTINUE = 601,
+ SEC_FINI = 602,
+ SEC_LAST_OPC
+} sec_cmd_t;
+#define SEC_FIRST_OPC SEC_INIT
+
#endif
IMP_EVENT_ACTIVE = 0x808004,
};
+struct ptlrpc_sec;
+
struct obd_import_conn {
struct list_head oic_item;
struct ptlrpc_connection *oic_conn;
unsigned long oic_last_attempt; /* in jiffies */
};
-
struct obd_import {
struct portals_handle imp_handle;
atomic_t imp_refcount;
struct list_head imp_sending_list;
struct list_head imp_delayed_list;
+ /* list of ongoing raw rpcs (only used by gss) */
+ struct list_head imp_rawrpc_list;
+
struct obd_device *imp_obd;
+ struct ptlrpc_sec *imp_sec;
wait_queue_head_t imp_recovery_waitq;
__u64 imp_last_replay_transno;
atomic_t imp_inflight;
#include <linux/rbtree.h>
#include <linux/lustre_compat25.h>
#include <linux/pagemap.h>
+#include <linux/namei.h>
+
/* careful, this is easy to screw up */
#define PAGE_CACHE_MAXBYTES ((__u64)(~0UL) << PAGE_CACHE_SHIFT)
static inline struct lookup_intent *ll_nd2it(struct nameidata *nd)
{
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
- return &nd->intent;
+ return &nd->intent.open;
#else
return nd->intent;
#endif
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
struct inode lli_vfs_inode;
#endif
+ struct posix_acl *lli_acl_access;
};
// FIXME: replace the name of this with LL_I to conform to kernel stuff
LPROC_LL_DIRECT_READ,
LPROC_LL_DIRECT_WRITE,
- LPROC_LL_FILE_OPCODES
+ LPROC_LL_SETXATTR,
+ LPROC_LL_GETXATTR,
+ LPROC_LL_FILE_OPCODES,
+};
+
+struct lustre_intent_data {
+ int it_disposition;
+ int it_status;
+ __u64 it_lock_handle;
+ void *it_data;
+ int it_lock_mode;
};
+#define LUSTRE_IT(it) ((struct lustre_intent_data *)((it)->d.fs_data))
static inline void
ll_inode2id(struct lustre_id *id, struct inode *inode)
#include <linux/lustre_dlm.h>
#include <linux/lustre_log.h>
#include <linux/lustre_export.h>
+#include <linux/lustre_ucache.h>
struct ldlm_lock_desc;
struct mds_obd;
char *ur_tgt;
int ur_eadatalen;
void *ur_eadata;
- int ur_cookielen;
- struct llog_cookie *ur_logcookies;
+ int ur_ea2datalen;
+ void *ur_ea2data;
+ int ur_cookielen; /* obsolete? */
+ struct llog_cookie *ur_logcookies; /* obsolete? */
struct iattr ur_iattr;
struct lvfs_ucred ur_uc;
__u64 ur_rdev;
__u8 mcd_padding[MDS_LR_CLIENT_SIZE - 64];
};
+/* simple uid/gid mapping hash table */
+struct mds_idmap_item {
+ struct list_head hash;
+ __u32 id1;
+ __u32 id2;
+};
+
+#define MDS_IDMAP_HASHSIZE (32)
+struct mds_idmap_table {
+ struct list_head uidmap[MDS_IDMAP_HASHSIZE];
+ struct list_head gidmap[MDS_IDMAP_HASHSIZE];
+};
+
/* file data for open files on MDS */
struct mds_file_data {
struct portals_handle mfd_handle; /* must be first */
unsigned int gh_allow_setgroups:1;
};
+/* lustre security descriptor */
+struct lustre_sec_desc {
+ uid_t lsd_uid;
+ gid_t lsd_gid;
+ struct group_info *lsd_ginfo;
+ unsigned int lsd_allow_setuid:1,
+ lsd_allow_setgid:1,
+ lsd_allow_setgrp:1;
+};
+
+struct lsd_cache_entry {
+ struct upcall_cache_entry base;
+ struct lustre_sec_desc lsd;
+};
+
+struct lsd_downcall_args {
+ int err;
+ uid_t uid;
+ gid_t gid;
+ __u32 ngroups;
+ gid_t *groups;
+ __u32 allow_setuid;
+ __u32 allow_setgid;
+ __u32 allow_setgrp;
+};
+
/* mds/mds_reint.c */
int mds_reint_rec(struct mds_update_record *r, int offset,
struct ptlrpc_request *req, struct lustre_handle *);
struct lustre_md *md);
int mdc_getstatus(struct obd_export *exp, struct lustre_id *rootid);
int mdc_getattr(struct obd_export *exp, struct lustre_id *id,
- __u64 valid, unsigned int ea_size,
- struct ptlrpc_request **request);
+ __u64 valid, const char *ea_name, int ea_namelen,
+ unsigned int ea_size, struct ptlrpc_request **request);
int mdc_getattr_lock(struct obd_export *exp, struct lustre_id *id,
char *filename, int namelen, __u64 valid,
unsigned int ea_size, struct ptlrpc_request **request);
void *cbid_arg; /* additional arg */
};
+struct ptlrpc_cred;
+struct ptlrpc_svcsec;
+
#define RS_MAX_LOCKS 4
#define RS_DEBUG 1
unsigned int rs_handled:1; /* been handled yet? */
unsigned int rs_on_net:1; /* reply_out_callback pending? */
- int rs_size;
+ struct ptlrpc_svcsec *rs_svcsec;
+ char *rs_buf; /* backend buffer */
+ int rs_buf_len; /* backend buffer length */
+ char *rs_repbuf; /* will be sent on wire */
+ int rs_repbuf_len; /* max on-wire data length */
+ int rs_repdata_len; /* actual on-wire data length */
+ struct lustre_msg *rs_msg; /* lustre msg pointer */
+ int rs_msg_len; /* length of lustre msg */
+
__u64 rs_transno;
__u64 rs_xid;
struct obd_export *rs_export;
struct lustre_handle rs_locks[RS_MAX_LOCKS];
ldlm_mode_t rs_modes[RS_MAX_LOCKS];
struct llog_create_locks *rs_llog_locks;
-
- /* last member: variable sized reply message */
- struct lustre_msg rs_msg;
};
struct ptlrpc_request {
unsigned int rq_intr:1, rq_replied:1, rq_err:1,
rq_timedout:1, rq_resend:1, rq_restart:1, rq_replay:1,
rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1,
- rq_no_delay:1, rq_net_err:1;
+ rq_no_delay:1, rq_net_err:1, rq_req_wrapped:1,
+ rq_ptlrpcs_restart:1;
int rq_phase;
/* client-side refcount for SENT race */
atomic_t rq_refcount;
__u64 rq_xid;
struct list_head rq_replay_list;
+ struct ptlrpc_cred *rq_cred; /* client side credit */
+ struct ptlrpc_svcsec *rq_svcsec; /* server side security */
+ /* XXX temporarily put here XXX */
+ void *rq_sec_svcdata; /* server security data */
+ unsigned int rq_remote; /* from remote client */
+ uid_t rq_auth_uid;
+
+ char *rq_reqbuf; /* backend request buffer */
+ int rq_reqbuf_len; /* backend request buffer length */
+ int rq_reqdata_len; /* actual request data length */
+ char *rq_repbuf; /* backend reply buffer */
+ int rq_repbuf_len; /* backend reply buffer length */
+ int rq_repdata_len; /* actual reply data length, not used yet */
+
#if SWAB_PARANOIA
__u32 rq_req_swab_mask;
__u32 rq_rep_swab_mask;
void ptlrpc_resend_req(struct ptlrpc_request *request);
int ptl_send_rpc(struct ptlrpc_request *request);
int ptlrpc_register_rqbd (struct ptlrpc_request_buffer_desc *rqbd);
+int ptlrpc_do_rawrpc(struct obd_import *imp, char *reqbuf, int reqlen,
+ char *repbuf, int *replenp, int timeout);
/* ptlrpc/client.c */
void ptlrpc_init_client(int req_portal, int rep_portal, char *name,
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef __LINUX_SEC_H_
+#define __LINUX_SEC_H_
+
+/* forward declaration */
+struct obd_import;
+struct ptlrpc_request;
+struct ptlrpc_cred;
+struct ptlrpc_credops;
+struct ptlrpc_sec;
+struct ptlrpc_secops;
+
+#define PTLRPC_SEC_MAX_FLAVORS (4)
+
+typedef struct ptlrpcs_flavor_s {
+ __u32 flavor;
+ __u32 subflavor;
+} ptlrpcs_flavor_t;
+
+enum ptlrpcs_security_type {
+ PTLRPC_SEC_TYPE_NONE = 0, /* no security */
+ PTLRPC_SEC_TYPE_AUTH = 1, /* authentication */
+ PTLRPC_SEC_TYPE_PRIV = 2, /* privacy */
+};
+
+/*
+ * This header is prepended at any on-wire ptlrpc packets
+ */
+struct ptlrpcs_wire_hdr {
+ __u32 flavor;
+ __u32 sectype;
+ __u32 msg_len;
+ __u32 sec_len;
+};
+
+static inline
+struct ptlrpcs_wire_hdr *buf_to_sec_hdr(void *buf)
+{
+ return (struct ptlrpcs_wire_hdr *) buf;
+}
+
+static inline
+struct lustre_msg *buf_to_lustre_msg(void *buf)
+{
+ return (struct lustre_msg *)
+ ((char *) buf + sizeof(struct ptlrpcs_wire_hdr));
+}
+
+static inline
+__u8 *buf_to_sec_data(void *buf)
+{
+ struct ptlrpcs_wire_hdr *hdr = buf_to_sec_hdr(buf);
+ return (__u8 *) (buf + sizeof(*hdr) + hdr->msg_len);
+}
+
+enum ptlrpcs_flavors {
+ PTLRPC_SEC_NULL = 0,
+ PTLRPC_SEC_GSS = 1,
+};
+
+#define PTLRPC_SEC_GSS_VERSION (1)
+
+enum ptlrpcs_gss_subflavors {
+ PTLRPC_SEC_GSS_KRB5 = 0,
+ PTLRPC_SEC_GSS_KRB5I = 1,
+ PTLRPC_SEC_GSS_KRB5P = 2,
+};
+
+enum ptlrpcs_gss_proc {
+ PTLRPC_GSS_PROC_DATA = 0,
+ PTLRPC_GSS_PROC_INIT = 1,
+ PTLRPC_GSS_PROC_CONTINUE_INIT = 2,
+ PTLRPC_GSS_PROC_DESTROY = 3,
+ PTLRPC_GSS_PROC_ERR = 4,
+};
+
+enum ptlrpcs_gss_svc {
+ PTLRPC_GSS_SVC_NONE = 1,
+ PTLRPC_GSS_SVC_INTEGRITY = 2,
+ PTLRPC_GSS_SVC_PRIVACY = 3,
+};
+
+enum ptlrpcs_error {
+ PTLRPCS_OK = 0,
+ PTLRPCS_BADCRED = 1,
+ PTLRPCS_REJECTEDCRED = 2,
+ PTLRPCS_BADVERF = 3,
+ PTLRPCS_REJECTEDVERF = 4,
+ PTLRPCS_TOOWEAK = 5,
+ /* GSS errors */
+ PTLRPCS_GSS_CREDPROBLEM = 13,
+ PTLRPCS_GSS_CTXPROBLEM = 14,
+};
+
+struct vfs_cred {
+ __u64 vc_pag;
+ uid_t vc_uid;
+ gid_t vc_gid;
+ struct group_info *vc_ginfo;
+};
+
+struct ptlrpc_credops {
+ int (*refresh)(struct ptlrpc_cred *cred);
+ int (*match) (struct ptlrpc_cred *cred,
+ struct ptlrpc_request *req,
+ struct vfs_cred *vcred);
+ int (*sign) (struct ptlrpc_cred *cred, struct ptlrpc_request *req);
+ int (*verify) (struct ptlrpc_cred *cred, struct ptlrpc_request *req);
+ int (*seal) (struct ptlrpc_cred *cred, struct ptlrpc_request *req);
+ int (*unseal) (struct ptlrpc_cred *cred, struct ptlrpc_request *req);
+ void (*destroy)(struct ptlrpc_cred *cred);
+};
+
+#define PTLRPC_CRED_UPTODATE 0x00000001
+#define PTLRPC_CRED_DEAD 0x00000002
+
+struct ptlrpc_cred {
+ struct list_head pc_hash; /* linked into hash table */
+ atomic_t pc_refcount;
+ struct ptlrpc_sec *pc_sec;
+ struct ptlrpc_credops *pc_ops;
+ struct ptlrpc_request *pc_req;
+ unsigned long pc_expire;
+ int pc_flags;
+ /* XXX maybe should not be here */
+ __u64 pc_pag;
+ uid_t pc_uid;
+};
+
+struct ptlrpc_secops {
+ struct ptlrpc_sec * (*create_sec) (ptlrpcs_flavor_t *flavor,
+ const char *pipe_dir,
+ void *pipe_data);
+ void (*destroy_sec) (struct ptlrpc_sec *sec);
+ struct ptlrpc_cred * (*create_cred) (struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req,
+ struct vfs_cred *vcred);
+ /* buffer manipulation */
+ int (*alloc_reqbuf) (struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req,
+ int lustre_msg_size);
+ int (*alloc_repbuf) (struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req,
+ int lustre_msg_size);
+ void (*free_reqbuf) (struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req);
+ void (*free_repbuf) (struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req);
+ /* security payload size estimation */
+ int (*est_req_payload)(struct ptlrpc_sec *sec,
+ int msgsize);
+ int (*est_rep_payload)(struct ptlrpc_sec *sec,
+ int msgsize);
+};
+
+struct ptlrpc_sec_type {
+ struct module *pst_owner;
+ char *pst_name;
+ atomic_t pst_inst; /* instance, debug only */
+ ptlrpcs_flavor_t pst_flavor;
+ struct ptlrpc_secops *pst_ops;
+};
+
+#define PTLRPC_CREDCACHE_NR 8
+#define PTLRPC_CREDCACHE_MASK (PTLRPC_CREDCACHE_NR - 1)
+
+struct ptlrpc_sec {
+ struct ptlrpc_sec_type *ps_type;
+ struct list_head ps_credcache[PTLRPC_CREDCACHE_NR];
+ spinlock_t ps_lock; /* protect cred cache */
+ __u32 ps_sectype;
+ ptlrpcs_flavor_t ps_flavor;
+ atomic_t ps_refcount;
+ atomic_t ps_credcount;
+ struct obd_import *ps_import;
+ /* actual security model need initialize following fields */
+ unsigned long ps_expire; /* cache expire interval */
+ unsigned long ps_nextgc; /* next gc time */
+ unsigned int ps_flags;
+};
+
+/* sec.c */
+int ptlrpcs_register(struct ptlrpc_sec_type *type);
+int ptlrpcs_unregister(struct ptlrpc_sec_type *type);
+
+struct ptlrpc_sec * ptlrpcs_sec_create(ptlrpcs_flavor_t *flavor,
+ struct obd_import *import,
+ const char *pipe_dir,
+ void *pipe_data);
+void ptlrpcs_sec_put(struct ptlrpc_sec *sec);
+void ptlrpcs_sec_invalidate_cache(struct ptlrpc_sec *sec);
+
+struct ptlrpc_cred * ptlrpcs_cred_lookup(struct ptlrpc_sec *sec,
+ struct vfs_cred *vcred);
+void ptlrpcs_cred_put(struct ptlrpc_cred *cred, int sync);
+
+static inline void ptlrpcs_cred_get(struct ptlrpc_cred *cred)
+{
+ LASSERT(atomic_read(&cred->pc_refcount));
+ atomic_inc(&cred->pc_refcount);
+}
+
+static inline int ptlrpcs_cred_is_uptodate(struct ptlrpc_cred *cred)
+{
+ LASSERT(cred);
+ LASSERT(atomic_read(&cred->pc_refcount));
+ return (cred->pc_flags & PTLRPC_CRED_UPTODATE);
+}
+static inline int ptlrpcs_cred_refresh(struct ptlrpc_cred *cred)
+{
+ LASSERT(cred);
+ LASSERT(atomic_read(&cred->pc_refcount));
+ LASSERT(cred->pc_ops);
+ LASSERT(cred->pc_ops->refresh);
+ return cred->pc_ops->refresh(cred);
+}
+static inline void ptlrpcs_cred_die(struct ptlrpc_cred *cred)
+{
+ LASSERT(atomic_read(&cred->pc_refcount));
+ LASSERT(cred->pc_sec);
+ if (!(cred->pc_flags & PTLRPC_CRED_DEAD)) {
+ spin_lock(&cred->pc_sec->ps_lock);
+ cred->pc_flags |= PTLRPC_CRED_DEAD;
+ cred->pc_flags &= ~PTLRPC_CRED_UPTODATE;
+ list_del_init(&cred->pc_hash);
+ spin_unlock(&cred->pc_sec->ps_lock);
+ }
+}
+static inline int ptlrpcs_cred_is_dead(struct ptlrpc_cred *cred)
+{
+ return(cred->pc_flags & PTLRPC_CRED_DEAD);
+}
+
+static inline int ptlrpcs_est_req_payload(struct ptlrpc_sec *sec,
+ int datasize)
+{
+ struct ptlrpc_secops *ops;
+
+ LASSERT(sec);
+ LASSERT(sec->ps_type);
+ LASSERT(sec->ps_type->pst_ops);
+
+ ops = sec->ps_type->pst_ops;
+ if (ops->est_req_payload)
+ return ops->est_req_payload(sec, datasize);
+ else
+ return 0;
+}
+
+static inline int ptlrpcs_est_rep_payload(struct ptlrpc_sec *sec,
+ int datasize)
+{
+ struct ptlrpc_secops *ops;
+
+ LASSERT(sec);
+ LASSERT(sec->ps_type);
+ LASSERT(sec->ps_type->pst_ops);
+
+ ops = sec->ps_type->pst_ops;
+ if (ops->est_rep_payload)
+ return ops->est_rep_payload(sec, datasize);
+ else
+ return 0;
+}
+
+int ptlrpcs_cli_wrap_request(struct ptlrpc_request *req);
+int ptlrpcs_cli_unwrap_reply(struct ptlrpc_request *req);
+int ptlrpcs_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize);
+int ptlrpcs_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize);
+void ptlrpcs_cli_free_reqbuf(struct ptlrpc_request *req);
+void ptlrpcs_cli_free_repbuf(struct ptlrpc_request *req);
+
+/* higher interface */
+int ptlrpcs_import_get_sec(struct obd_import *imp);
+void ptlrpcs_import_drop_sec(struct obd_import *imp);
+int ptlrpcs_req_get_cred(struct ptlrpc_request *req);
+void ptlrpcs_req_drop_cred(struct ptlrpc_request *req);
+int ptlrpcs_req_replace_dead_cred(struct ptlrpc_request *req);
+int ptlrpcs_req_refresh_cred(struct ptlrpc_request *req);
+
+/* internal helpers */
+int sec_alloc_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req,
+ int msgsize, int secsize);
+void sec_free_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req);
+
+/* sec_null.c */
+int ptlrpcs_null_init(void);
+int ptlrpcs_null_exit(void);
+
+/**********************************************************
+ * Server side stuff
+ **********************************************************/
+
+struct ptlrpc_reply_state;
+
+struct ptlrpc_svcsec {
+ struct module *pss_owner;
+ char *pss_name;
+ ptlrpcs_flavor_t pss_flavor;
+ int pss_sec_size;
+
+ int (*accept) (struct ptlrpc_request *req,
+ enum ptlrpcs_error *res);
+ int (*authorize) (struct ptlrpc_request *req);
+ int (*alloc_repbuf)(struct ptlrpc_svcsec *svcsec,
+ struct ptlrpc_request *req,
+ int msgsize);
+ void (*free_repbuf) (struct ptlrpc_svcsec *svcsec,
+ struct ptlrpc_reply_state *rs);
+ void (*cleanup_req) (struct ptlrpc_svcsec *svcsec,
+ struct ptlrpc_request *req);
+};
+
+#define SVC_OK 1
+#define SVC_COMPLETE 2
+#define SVC_DROP 3
+#define SVC_LOGIN 4
+#define SVC_LOGOUT 5
+
+int svcsec_register(struct ptlrpc_svcsec *ss);
+int svcsec_unregister(struct ptlrpc_svcsec *ss);
+int svcsec_accept(struct ptlrpc_request *req, enum ptlrpcs_error *res);
+int svcsec_authorize(struct ptlrpc_request *req);
+int svcsec_alloc_repbuf(struct ptlrpc_svcsec *svcsec,
+ struct ptlrpc_request *req, int msgsize);
+void svcsec_cleanup_req(struct ptlrpc_request *req);
+
+struct ptlrpc_svcsec * svcsec_get(struct ptlrpc_svcsec *sec);
+void svcsec_put(struct ptlrpc_svcsec *sec);
+
+/* internal helpers */
+int svcsec_alloc_reply_state(struct ptlrpc_request *req,
+ int msgsize, int secsize);
+void svcsec_free_reply_state(struct ptlrpc_reply_state *rs);
+
+/* svcsec_null.c */
+int svcsec_null_init(void);
+int svcsec_null_exit(void);
+
+#endif /* __LINUX_SEC_H_ */
#ifndef __LUSTRE_SMFS_H
#define __LUSTRE_SMFS_H
+#include <linux/namei.h>
struct snap_inode_info {
int sn_flags; /*the flags indicated inode type */
int sn_gen; /*the inode generation*/
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+
+#ifndef _UPCALL_CACHE_H
+#define _UPCALL_CACHE_H
+
+#define UC_CACHE_NEW 0x01
+#define UC_CACHE_ACQUIRING 0x02
+#define UC_CACHE_INVALID 0x04
+#define UC_CACHE_EXPIRED 0x08
+
+#define UC_CACHE_IS_NEW(i) ((i)->ue_flags & UC_CACHE_NEW)
+#define UC_CACHE_IS_INVALID(i) ((i)->ue_flags & UC_CACHE_INVALID)
+#define UC_CACHE_IS_ACQUIRING(i) ((i)->ue_flags & UC_CACHE_ACQUIRING)
+#define UC_CACHE_IS_EXPIRED(i) ((i)->ue_flags & UC_CACHE_EXPIRED)
+#define UC_CACHE_IS_VALID(i) ((i)->ue_flags == 0)
+
+#define UC_CACHE_SET_NEW(i) (i)->ue_flags |= UC_CACHE_NEW
+#define UC_CACHE_SET_INVALID(i) (i)->ue_flags |= UC_CACHE_INVALID
+#define UC_CACHE_SET_ACQUIRING(i) (i)->ue_flags |= UC_CACHE_ACQUIRING
+#define UC_CACHE_SET_EXPIRED(i) (i)->ue_flags |= UC_CACHE_EXPIRED
+#define UC_CACHE_SET_VALID(i) (i)->ue_flags = 0
+
+#define UC_CACHE_CLEAR_NEW(i) (i)->ue_flags &= ~UC_CACHE_NEW
+#define UC_CACHE_CLEAR_ACQUIRING(i) (i)->ue_flags &= ~UC_CACHE_ACQUIRING
+#define UC_CACHE_CLEAR_INVALID(i) (i)->ue_flags &= ~UC_CACHE_INVALID
+#define UC_CACHE_CLEAR_EXPIRED(i) (i)->ue_flags &= ~UC_CACHE_EXPIRED
+
+struct upcall_cache;
+
+struct upcall_cache_entry {
+ struct list_head ue_hash;
+ atomic_t ue_refcount;
+ __u64 ue_key;
+ struct upcall_cache *ue_cache;
+ int ue_flags;
+ wait_queue_head_t ue_waitq;
+ unsigned long ue_acquire_expire;
+ unsigned long ue_expire;
+};
+
+#define UC_CACHE_UPCALL_MAXPATH (1024)
+
+struct upcall_cache {
+ struct list_head *uc_hashtable;
+ int uc_hashsize;
+ rwlock_t uc_hashlock;
+
+ char *uc_name;
+ char uc_upcall[UC_CACHE_UPCALL_MAXPATH];
+ unsigned long uc_acquire_expire;
+ unsigned long uc_entry_expire;
+
+ /* functions */
+ unsigned int (*hash)(struct upcall_cache *, __u64);
+ struct upcall_cache_entry* (*alloc_entry)(struct upcall_cache *, __u64);
+ void (*free_entry)(struct upcall_cache *,
+ struct upcall_cache_entry *);
+ int (*make_upcall)(struct upcall_cache *,
+ struct upcall_cache_entry *);
+ int (*parse_downcall)(struct upcall_cache *,
+ struct upcall_cache_entry *,
+ void *args);
+};
+
+void upcall_cache_init_entry(struct upcall_cache *cache,
+ struct upcall_cache_entry *entry,
+ __u64 key);
+struct upcall_cache_entry *
+upcall_cache_get_entry(struct upcall_cache *cache, __u64 key);
+void upcall_cache_put_entry(struct upcall_cache_entry *entry);
+int upcall_cache_downcall(struct upcall_cache *cache, __u64 key,
+ int err, void *args);
+void upcall_cache_flush_one(struct upcall_cache *cache, __u64 key);
+void upcall_cache_flush_idle(struct upcall_cache *cache);
+void upcall_cache_flush_all(struct upcall_cache *cache);
+
+#endif /* _UPCALL_CACHE_H */
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
#ifndef __LVFS_H__
#define __LVFS_H__
#define LL_ID_NAMELEN (16 + 1 + 8 + 1)
#if defined __KERNEL__
+#include <linux/dcache.h>
+#include <linux/namei.h>
#include <linux/lustre_compat25.h>
#include <linux/lvfs_linux.h>
#endif
/* simple.c */
struct lvfs_ucred {
- struct mds_grp_hash_entry *luc_ghash;
- struct group_info *luc_ginfo;
+ struct lustre_sec_desc *luc_lsd;
+ struct group_info *luc_ginfo;
__u32 luc_fsuid;
__u32 luc_fsgid;
__u32 luc_cap;
__u32 luc_uid;
- __u32 luc_umask;
+ __u32 luc_umask;
};
struct lvfs_callback_ops {
{
struct dentry *dchild;
#ifdef S_PDIROPS
- struct qstr qstr;
- void *lock;
- qstr.name = name;
- qstr.len = namelen;
- lock = lock_dir(dparent->d_inode, &qstr);
+ struct qstr qstr;
+ void *lock;
+ qstr.name = name;
+ qstr.len = namelen;
+ lock = lock_dir(dparent->d_inode, &qstr);
#else
down(&dparent->d_inode->i_sem);
#endif
dchild = lookup_one_len(name, dparent, namelen);
#ifdef S_PDIROPS
- unlock_dir(dparent->d_inode, lock);
+ unlock_dir(dparent->d_inode, lock);
#else
up(&dparent->d_inode->i_sem);
#endif
schedule_timeout(t * HZ);
set_current_state(TASK_RUNNING);
}
+
+static inline struct dentry *
+ll_d_lookup(const char *name,
+ struct dentry *dparent, int len)
+{
+ struct qstr qstr;
+
+ qstr.len = len;
+ qstr.name = name;
+ qstr.hash = full_name_hash(name, len);
+ return d_lookup(dparent, &qstr);
+}
#endif
static inline int ll_id2str(char *str, __u64 id, __u32 generation)
#define IOC_MDC_LOOKUP _IOWR(IOC_MDC_TYPE, 20, struct obd_device *)
/* Moved to lustre_user.h
#define IOC_MDC_GETSTRIPE _IOWR(IOC_MDC_TYPE, 21, struct lov_mds_md *) */
-#define IOC_MDC_FINISH_GNS _IOWR(IOC_MDC_TYPE, 22, struct obd_device *)
#define IOC_MDC_MAX_NR 50
#ifdef __KERNEL__
int cl_max_mds_cookiesize;
kdev_t cl_sandev;
+ /* security flavors */
+ __u32 cl_sec_flavor;
+ __u32 cl_sec_subflavor;
+ __u32 cl_nllu; /* non lustre local user */
+ __u32 cl_nllg; /* non lustre local group */
+
//struct llog_canceld_ctxt *cl_llcd; /* it's included by obd_llog_ctxt */
void *cl_llcd_offset;
struct dentry *mds_id_dir;
int mds_obd_type;
struct dentry *mds_unnamed_dir; /* for mdt_obd_create only */
+
+ /* security related */
+ char *mds_mds_sec;
+ char *mds_ost_sec;
};
struct echo_obd {
void *, int, ldlm_completion_callback,
ldlm_blocking_callback, void *);
int (*m_getattr)(struct obd_export *, struct lustre_id *,
- __u64, unsigned int,
- struct ptlrpc_request **);
+ __u64, const char *, int,
+ unsigned int, struct ptlrpc_request **);
int (*m_getattr_lock)(struct obd_export *, struct lustre_id *,
char *, int, __u64,
unsigned int, struct ptlrpc_request **);
}
static inline int md_getattr(struct obd_export *exp, struct lustre_id *id,
- __u64 valid, unsigned int ea_size,
- struct ptlrpc_request **request)
+ __u64 valid, const char *ea_name, int ea_namelen,
+ unsigned int ea_size, struct ptlrpc_request **request)
{
int rc;
ENTRY;
EXP_CHECK_MD_OP(exp, getattr);
MD_COUNTER_INCREMENT(exp->exp_obd, getattr);
- rc = MDP(exp->exp_obd, getattr)(exp, id, valid, ea_size, request);
+ rc = MDP(exp->exp_obd, getattr)(exp, id, valid, ea_name, ea_namelen, ea_size, request);
RETURN(rc);
}
#define OBD_FAIL_TGT_REPLY_NET 0x700
#define OBD_FAIL_TGT_CONN_RACE 0x701
+#define OBD_FAIL_SVCSEC_ACCEPT_BEG 0x750
+#define OBD_FAIL_SVCSEC_ACCEPT_END 0x751
+#define OBD_FAIL_SVCSEC_WRAP_BEG 0x752
+#define OBD_FAIL_SVCSEC_WRAP_END 0x753
+#define OBD_FAIL_SVCGSS_ERR_NOTIFY 0x760
+#define OBD_FAIL_SVCGSS_INIT_REQ 0x780
+#define OBD_FAIL_SVCGSS_INIT_REP 0x781
+
/* preparation for a more advanced failure testbed (not functional yet) */
#define OBD_FAIL_MASK_SYS 0x0000FF00
#define OBD_FAIL_MASK_LOC (0x000000FF | OBD_FAIL_MASK_SYS)
Index: linux-2.6.7/include/linux/dcache.h
===================================================================
---- linux-2.6.7.orig/include/linux/dcache.h 2004-08-30 17:20:57.000000000 +0800
-+++ linux-2.6.7/include/linux/dcache.h 2004-08-30 17:39:12.000000000 +0800
-@@ -94,6 +94,9 @@
+--- linux-2.6.7.orig/include/linux/dcache.h 2005-03-23 23:28:49.669799416 +0800
++++ linux-2.6.7/include/linux/dcache.h 2005-03-23 23:38:25.648237384 +0800
+@@ -86,6 +86,9 @@
spinlock_t d_lock; /* per dentry lock */
struct inode *d_inode; /* Where the name belongs to - NULL is
* negative */
/*
* The next three fields are touched by __d_lookup. Place them here
* so they all fit in a 16-byte range, with 16-byte alignment.
-@@ -166,6 +169,7 @@
+@@ -158,6 +161,8 @@
#define DCACHE_UNHASHED 0x0010
- #define DCACHE_LUSTRE_INVALID 0x0020 /* Lustre invalidated */
+ #define DCACHE_LUSTRE_INVALID 0x0020 /* invalidated by Lustre */
+#define DCACHE_CROSS_REF 0x0040 /* entry points to inode on another MDS */
-
++
extern spinlock_t dcache_lock;
+ /**
--- /dev/null
+Index: linux-2.6.7/mm/truncate.c
+===================================================================
+--- linux-2.6.7.orig/mm/truncate.c 2004-06-16 13:20:04.000000000 +0800
++++ linux-2.6.7/mm/truncate.c 2005-03-23 23:30:30.676444072 +0800
+@@ -42,7 +42,7 @@
+ * its lock, b) when a concurrent invalidate_inode_pages got there first and
+ * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
+ */
+-static void
++void
+ truncate_complete_page(struct address_space *mapping, struct page *page)
+ {
+ if (page->mapping != mapping)
+@@ -58,6 +58,8 @@
+ page_cache_release(page); /* pagecache ref */
+ }
+
++EXPORT_SYMBOL(truncate_complete_page);
++
+ /*
+ * This is for invalidate_inode_pages(). That function can be called at
+ * any time, and is not supposed to throw away dirty pages. But pages can
+Index: linux-2.6.7/fs/super.c
+===================================================================
+--- linux-2.6.7.orig/fs/super.c 2004-06-16 13:19:22.000000000 +0800
++++ linux-2.6.7/fs/super.c 2005-03-23 23:30:30.648448328 +0800
+@@ -804,6 +804,8 @@
+ return (struct vfsmount *)sb;
+ }
+
++EXPORT_SYMBOL(do_kern_mount);
++
+ struct vfsmount *kern_mount(struct file_system_type *type)
+ {
+ return do_kern_mount(type->name, 0, type->name, NULL);
+Index: linux-2.6.7/fs/jbd/journal.c
+===================================================================
+--- linux-2.6.7.orig/fs/jbd/journal.c 2004-06-16 13:18:59.000000000 +0800
++++ linux-2.6.7/fs/jbd/journal.c 2005-03-23 23:30:30.647448480 +0800
+@@ -71,6 +71,7 @@
+ EXPORT_SYMBOL(journal_errno);
+ EXPORT_SYMBOL(journal_ack_err);
+ EXPORT_SYMBOL(journal_clear_err);
++EXPORT_SYMBOL(log_start_commit);
+ EXPORT_SYMBOL(log_wait_commit);
+ EXPORT_SYMBOL(journal_start_commit);
+ EXPORT_SYMBOL(journal_wipe);
+Index: linux-2.6.7/kernel/exit.c
+===================================================================
+--- linux-2.6.7.orig/kernel/exit.c 2004-06-16 13:19:52.000000000 +0800
++++ linux-2.6.7/kernel/exit.c 2005-03-23 23:34:17.539955576 +0800
+@@ -256,6 +256,8 @@
+ write_unlock_irq(&tasklist_lock);
+ }
+
++EXPORT_SYMBOL(reparent_to_init);
++
+ void __set_special_pids(pid_t session, pid_t pgrp)
+ {
+ struct task_struct *curr = current;
+@@ -435,6 +437,7 @@
+ {
+ __exit_files(tsk);
+ }
++EXPORT_SYMBOL(exit_files);
+
+ static inline void __put_fs_struct(struct fs_struct *fs)
+ {
+Index: linux-2.6.7/include/linux/fs.h
+===================================================================
+--- linux-2.6.7.orig/include/linux/fs.h 2005-03-23 23:30:08.535809960 +0800
++++ linux-2.6.7/include/linux/fs.h 2005-03-23 23:30:30.675444224 +0800
+@@ -1133,6 +1133,7 @@
+ extern struct vfsmount *kern_mount(struct file_system_type *);
+ extern int may_umount_tree(struct vfsmount *);
+ extern int may_umount(struct vfsmount *);
++struct vfsmount *do_kern_mount(const char *type, int flags, const char *name, void *data);
+ extern long do_mount(char *, char *, char *, unsigned long, void *);
+
+ extern int vfs_statfs(struct super_block *, struct kstatfs *);
+Index: linux-2.6.7/include/linux/mm.h
+===================================================================
+--- linux-2.6.7.orig/include/linux/mm.h 2004-06-16 13:18:56.000000000 +0800
++++ linux-2.6.7/include/linux/mm.h 2005-03-23 23:30:30.673444528 +0800
+@@ -653,6 +653,9 @@
+
+ extern unsigned long do_brk(unsigned long, unsigned long);
+
++/* truncate.c */
++extern void truncate_complete_page(struct address_space *mapping,struct page *);
++
+ /* filemap.c */
+ extern unsigned long page_unuse(struct page *);
+ extern void truncate_inode_pages(struct address_space *, loff_t);
include/linux/ext3_fs.h | 5 ++++-
5 files changed, 85 insertions(+), 6 deletions(-)
-Index: uml-2.6.3/fs/ext3/ialloc.c
+Index: linux-2.6.7/fs/ext3/ialloc.c
===================================================================
---- uml-2.6.3.orig/fs/ext3/ialloc.c 2004-02-20 15:00:48.000000000 +0800
-+++ uml-2.6.3/fs/ext3/ialloc.c 2004-02-21 00:24:45.202693776 +0800
+--- linux-2.6.7.orig/fs/ext3/ialloc.c 2005-03-24 00:27:43.282608616 +0800
++++ linux-2.6.7/fs/ext3/ialloc.c 2005-03-24 00:27:43.888516504 +0800
@@ -420,7 +420,8 @@
* For other inodes, search forward from the parent directory's block
* group to find a free inode.
if (S_ISDIR(mode)) {
if (test_opt (sb, OLDALLOC))
group = find_group_dir(sb, dir);
-Index: uml-2.6.3/fs/ext3/ioctl.c
+Index: linux-2.6.7/fs/ext3/ioctl.c
===================================================================
---- uml-2.6.3.orig/fs/ext3/ioctl.c 2004-01-09 14:59:26.000000000 +0800
-+++ uml-2.6.3/fs/ext3/ioctl.c 2004-02-21 00:21:04.541239416 +0800
-@@ -24,6 +24,31 @@
+--- linux-2.6.7.orig/fs/ext3/ioctl.c 2004-06-16 13:19:13.000000000 +0800
++++ linux-2.6.7/fs/ext3/ioctl.c 2005-03-24 00:31:16.113253440 +0800
+@@ -9,6 +9,7 @@
+
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
++#include <linux/namei.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/time.h>
+@@ -24,6 +25,31 @@
ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg);
switch (cmd) {
+ }
case EXT3_IOC_GETFLAGS:
flags = ei->i_flags & EXT3_FL_USER_VISIBLE;
- return put_user(flags, (int *) arg);
-Index: uml-2.6.3/fs/ext3/namei.c
+ return put_user(flags, (int __user *) arg);
+Index: linux-2.6.7/fs/ext3/namei.c
===================================================================
---- uml-2.6.3.orig/fs/ext3/namei.c 2004-02-20 15:01:27.000000000 +0800
-+++ uml-2.6.3/fs/ext3/namei.c 2004-02-21 00:21:04.611228776 +0800
-@@ -1617,6 +1617,19 @@
+--- linux-2.6.7.orig/fs/ext3/namei.c 2005-03-24 00:27:43.536570008 +0800
++++ linux-2.6.7/fs/ext3/namei.c 2005-03-24 00:27:43.893515744 +0800
+@@ -1939,6 +1939,19 @@
return err;
}
/*
* By the time this is called, we already have created
* the directory cache entry for the new file, but it
-@@ -1640,7 +1653,7 @@
+@@ -1963,7 +1976,7 @@
if (IS_DIRSYNC(dir))
handle->h_sync = 1;
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
inode->i_op = &ext3_file_inode_operations;
-@@ -1670,7 +1683,7 @@
+@@ -1994,7 +2007,7 @@
if (IS_DIRSYNC(dir))
handle->h_sync = 1;
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
init_special_inode(inode, inode->i_mode, rdev);
-@@ -1702,7 +1715,7 @@
+@@ -2027,7 +2040,7 @@
if (IS_DIRSYNC(dir))
handle->h_sync = 1;
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_stop;
-@@ -2094,7 +2107,7 @@
+@@ -2439,7 +2452,7 @@
if (IS_DIRSYNC(dir))
handle->h_sync = 1;
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_stop;
-Index: uml-2.6.3/include/linux/ext3_fs.h
+Index: linux-2.6.7/include/linux/ext3_fs.h
===================================================================
---- uml-2.6.3.orig/include/linux/ext3_fs.h 2004-01-09 14:59:44.000000000 +0800
-+++ uml-2.6.3/include/linux/ext3_fs.h 2004-02-21 00:21:04.613228472 +0800
+--- linux-2.6.7.orig/include/linux/ext3_fs.h 2005-03-24 00:27:43.542569096 +0800
++++ linux-2.6.7/include/linux/ext3_fs.h 2005-03-24 00:27:43.893515744 +0800
@@ -203,6 +203,7 @@
#define EXT3_IOC_SETFLAGS _IOW('f', 2, long)
#define EXT3_IOC_GETVERSION _IOR('f', 3, long)
#define EXT3_IOC_GETVERSION_OLD _IOR('v', 1, long)
#define EXT3_IOC_SETVERSION_OLD _IOW('v', 2, long)
#ifdef CONFIG_JBD_DEBUG
-@@ -707,7 +708,8 @@
+@@ -708,7 +709,8 @@
dx_hash_info *hinfo);
/* ialloc.c */
extern void ext3_free_inode (handle_t *, struct inode *);
extern struct inode * ext3_orphan_get (struct super_block *, unsigned long);
extern unsigned long ext3_count_free_inodes (struct super_block *);
-@@ -792,4 +794,5 @@
+@@ -793,4 +795,5 @@
#endif /* __KERNEL__ */
--- /dev/null
+%diffstat
+ blockgroup_lock.h | 4 +++-
+ percpu_counter.h | 4 ++++
+ 2 files changed, 7 insertions(+), 1 deletion(-)
+
+%patch
+Index: linux-2.6.6/include/linux/percpu_counter.h
+===================================================================
+--- linux-2.6.6.orig/include/linux/percpu_counter.h 2004-04-04 11:37:23.000000000 +0800
++++ linux-2.6.6/include/linux/percpu_counter.h 2004-05-22 16:08:16.000000000 +0800
+@@ -3,6 +3,8 @@
+ *
+ * WARNING: these things are HUGE. 4 kbytes per counter on 32-way P4.
+ */
++#ifndef _LINUX_PERCPU_COUNTER_H
++#define _LINUX_PERCPU_COUNTER_H
+
+ #include <linux/config.h>
+ #include <linux/spinlock.h>
+@@ -101,3 +103,5 @@ static inline void percpu_counter_dec(st
+ {
+ percpu_counter_mod(fbc, -1);
+ }
++
++#endif /* _LINUX_PERCPU_COUNTER_H */
+Index: linux-2.6.6/include/linux/blockgroup_lock.h
+===================================================================
+--- linux-2.6.6.orig/include/linux/blockgroup_lock.h 2004-04-04 11:36:26.000000000 +0800
++++ linux-2.6.6/include/linux/blockgroup_lock.h 2004-05-22 16:08:45.000000000 +0800
+@@ -3,6 +3,8 @@
+ *
+ * Simple hashed spinlocking.
+ */
++#ifndef _LINUX_BLOCKGROUP_LOCK_H
++#define _LINUX_BLOCKGROUP_LOCK_H
+
+ #include <linux/config.h>
+ #include <linux/spinlock.h>
+@@ -55,4 +57,4 @@ static inline void bgl_lock_init(struct
+ #define sb_bgl_lock(sb, block_group) \
+ (&(sb)->s_blockgroup_lock.locks[(block_group) & (NR_BG_LOCKS-1)].lock)
+
+-
++#endif
+
+ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */
+ dentry->d_inode = inode;
+
-+ __d_rehash(dentry, 0); /* d_rehash */
++ __d_rehash(dentry); /* d_rehash */
+ spin_unlock(&dcache_lock);
+
+ return NULL;
+ /* Move the goal to the de hash queue */
+ goal->d_flags &= ~ DCACHE_DISCONNECTED;
+ security_d_instantiate(goal, inode);
-+ __d_rehash(dentry, 0);
++ __d_rehash(dentry);
+ __d_move(goal, dentry);
+ spin_unlock(&dcache_lock);
+ iput(inode);
+ dentry->d_inode = inode;
+do_rehash:
+ if (rehash)
-+ __d_rehash(dentry, 0); /* d_rehash */
++ __d_rehash(dentry); /* d_rehash */
+ spin_unlock(&dcache_lock);
+
+ return NULL;
--- /dev/null
+--- linux-2.6.7/Documentation/filesystems/00-INDEX.lsec 2004-06-15 23:20:26.000000000 -0600
++++ linux-2.6.7/Documentation/filesystems/00-INDEX 2005-03-23 14:28:24.576313528 -0700
+@@ -28,6 +28,8 @@ jfs.txt
+ - info and mount options for the JFS filesystem.
+ ncpfs.txt
+ - info on Novell Netware(tm) filesystem using NCP protocol.
++nfs4.txt
++ - info and mount options for the nfs4 filesystem.
+ ntfs.txt
+ - info and mount options for the NTFS filesystem (Windows NT).
+ proc.txt
+--- linux-2.6.7/Documentation/filesystems/nfs4.txt.lsec 2005-03-23 14:28:24.576313528 -0700
++++ linux-2.6.7/Documentation/filesystems/nfs4.txt 2005-03-23 14:28:24.576313528 -0700
+@@ -0,0 +1,20 @@
++NFS version 4
++=============
++
++NFS version 4 is specified by RFC3530. Compared to earlier NFS versions,
++it provides enhanced security and better client caching, among other features.
++
++In addition to basic file operations, the NFS client supports locking, kerberos
++(basic authentication and integrity), and reboot recovery.
++
++As of this writing (July 2004), patches to nfs-utils and util-linux are required
++for NFSv4 support; see http://www.citi.umich.edu/projects/nfsv4/linux/ for
++patches and instructions.
++
++The kernel treats NFS version 4 as a separate filesystem type, nfs4, so it is
++mounted using "mount -tnfs4 server:/path /mntpoint", not by mounting the nfs
++filesystem with -onfsver=4.
++
++Mount options:
++
++XXX?
+--- linux-2.6.7/fs/locks.c.lsec 2004-06-15 23:20:03.000000000 -0600
++++ linux-2.6.7/fs/locks.c 2005-03-23 14:28:22.425640480 -0700
+@@ -317,7 +317,7 @@ static int flock_to_posix_lock(struct fi
+ if (l->l_len == 0)
+ fl->fl_end = OFFSET_MAX;
+
+- fl->fl_owner = current->files;
++ fl->fl_owner = 0;
+ fl->fl_pid = current->tgid;
+ fl->fl_file = filp;
+ fl->fl_flags = FL_POSIX;
+@@ -357,7 +357,7 @@ static int flock64_to_posix_lock(struct
+ if (l->l_len == 0)
+ fl->fl_end = OFFSET_MAX;
+
+- fl->fl_owner = current->files;
++ fl->fl_owner = 0;
+ fl->fl_pid = current->tgid;
+ fl->fl_file = filp;
+ fl->fl_flags = FL_POSIX;
+@@ -920,7 +920,7 @@ int posix_lock_file(struct file *filp, s
+ */
+ int locks_mandatory_locked(struct inode *inode)
+ {
+- fl_owner_t owner = current->files;
++ unsigned int pid = current->tgid;
+ struct file_lock *fl;
+
+ /*
+@@ -930,7 +930,9 @@ int locks_mandatory_locked(struct inode
+ for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
+ if (!IS_POSIX(fl))
+ continue;
+- if (fl->fl_owner != owner)
++ if (fl->fl_owner != 0)
++ break;
++ if (fl->fl_pid != pid)
+ break;
+ }
+ unlock_kernel();
+@@ -958,7 +960,7 @@ int locks_mandatory_area(int read_write,
+ int error;
+
+ locks_init_lock(&fl);
+- fl.fl_owner = current->files;
++ fl.fl_owner = 0;
+ fl.fl_pid = current->tgid;
+ fl.fl_file = filp;
+ fl.fl_flags = FL_POSIX | FL_ACCESS;
+@@ -1684,7 +1686,7 @@ void locks_remove_posix(struct file *fil
+ lock_kernel();
+ while (*before != NULL) {
+ struct file_lock *fl = *before;
+- if (IS_POSIX(fl) && (fl->fl_owner == owner)) {
++ if (IS_POSIX(fl) && posix_same_owner(fl, &lock)) {
+ locks_delete_lock(before);
+ continue;
+ }
+@@ -1982,18 +1984,6 @@ int lock_may_write(struct inode *inode,
+
+ EXPORT_SYMBOL(lock_may_write);
+
+-static inline void __steal_locks(struct file *file, fl_owner_t from)
+-{
+- struct inode *inode = file->f_dentry->d_inode;
+- struct file_lock *fl = inode->i_flock;
+-
+- while (fl) {
+- if (fl->fl_file == file && fl->fl_owner == from)
+- fl->fl_owner = current->files;
+- fl = fl->fl_next;
+- }
+-}
+-
+ /* When getting ready for executing a binary, we make sure that current
+ * has a files_struct on its own. Before dropping the old files_struct,
+ * we take over ownership of all locks for all file descriptors we own.
+@@ -2002,31 +1992,6 @@ static inline void __steal_locks(struct
+ */
+ void steal_locks(fl_owner_t from)
+ {
+- struct files_struct *files = current->files;
+- int i, j;
+-
+- if (from == files)
+- return;
+-
+- lock_kernel();
+- j = 0;
+- for (;;) {
+- unsigned long set;
+- i = j * __NFDBITS;
+- if (i >= files->max_fdset || i >= files->max_fds)
+- break;
+- set = files->open_fds->fds_bits[j++];
+- while (set) {
+- if (set & 1) {
+- struct file *file = files->fd[i];
+- if (file)
+- __steal_locks(file, from);
+- }
+- i++;
+- set >>= 1;
+- }
+- }
+- unlock_kernel();
+ }
+ EXPORT_SYMBOL(steal_locks);
+
+--- linux-2.6.7/fs/hostfs/hostfs_kern.c.lsec 2005-03-23 14:25:58.982447160 -0700
++++ linux-2.6.7/fs/hostfs/hostfs_kern.c 2005-03-23 14:33:11.946626600 -0700
+@@ -290,7 +290,6 @@ static void hostfs_delete_inode(struct i
+ {
+ if(HOSTFS_I(inode)->fd != -1) {
+ close_file(&HOSTFS_I(inode)->fd);
+- printk("Closing host fd in .delete_inode\n");
+ HOSTFS_I(inode)->fd = -1;
+ }
+ clear_inode(inode);
+@@ -303,7 +302,6 @@ static void hostfs_destroy_inode(struct
+
+ if(HOSTFS_I(inode)->fd != -1) {
+ close_file(&HOSTFS_I(inode)->fd);
+- printk("Closing host fd in .destroy_inode\n");
+ }
+
+ kfree(HOSTFS_I(inode));
+--- linux-2.6.7/fs/open.c.lsec 2005-03-23 14:26:01.774022776 -0700
++++ linux-2.6.7/fs/open.c 2005-03-23 14:28:23.226518728 -0700
+@@ -1025,7 +1025,7 @@ int filp_close(struct file *filp, fl_own
+ }
+
+ dnotify_flush(filp, id);
+- locks_remove_posix(filp, id);
++ locks_remove_posix(filp, 0);
+ fput(filp);
+ return retval;
+ }
+--- linux-2.6.7/fs/nfsd/export.c.lsec 2004-06-15 23:19:36.000000000 -0600
++++ linux-2.6.7/fs/nfsd/export.c 2005-03-23 14:28:24.686296808 -0700
+@@ -255,7 +255,7 @@ static inline void svc_expkey_update(str
+ new->ek_export = item->ek_export;
+ }
+
+-static DefineSimpleCacheLookup(svc_expkey,0) /* no inplace updates */
++static DefineSimpleCacheLookup(svc_expkey)
+
+ #define EXPORT_HASHBITS 8
+ #define EXPORT_HASHMAX (1<< EXPORT_HASHBITS)
+@@ -487,8 +487,72 @@ static inline void svc_export_update(str
+ new->ex_fsid = item->ex_fsid;
+ }
+
+-static DefineSimpleCacheLookup(svc_export,1) /* allow inplace updates */
++struct svc_export *
++svc_export_lookup(struct svc_export *item, int set)
++{
++ struct svc_export *tmp, *new = NULL;
++ struct cache_head **hp, **head;
+
++ head = &svc_export_cache.hash_table[svc_export_hash(item)];
++retry:
++ if (set||new)
++ write_lock(&svc_export_cache.hash_lock);
++ else
++ read_lock(&svc_export_cache.hash_lock);
++ for(hp=head; *hp != NULL; hp = &tmp->h.next) {
++ tmp = container_of(*hp, struct svc_export, h);
++ if (svc_export_match(item, tmp)) { /* found a match */
++ cache_get(&tmp->h);
++ if (set) {
++ if (test_bit(CACHE_NEGATIVE, &item->h.flags))
++ set_bit(CACHE_NEGATIVE, &tmp->h.flags);
++ else {
++ clear_bit(CACHE_NEGATIVE, &tmp->h.flags);
++ svc_export_update(tmp, item);
++ }
++ }
++ if (set||new)
++ write_unlock(&svc_export_cache.hash_lock);
++ else
++ read_unlock(&svc_export_cache.hash_lock);
++ if (set)
++ cache_fresh(&svc_export_cache, &tmp->h,
++ item->h.expiry_time);
++ if (new)
++ svc_export_put(&new->h, &svc_export_cache);
++ return tmp;
++ }
++ }
++ /* Didn't find anything */
++ if (new) {
++ svc_export_init(new, item);
++ new->h.next = *head;
++ *head = &new->h;
++ set_bit(CACHE_HASHED, &new->h.flags);
++ svc_export_cache.entries++;
++ if (set) {
++ tmp = new;
++ if (test_bit(CACHE_NEGATIVE, &item->h.flags))
++ set_bit(CACHE_NEGATIVE, &tmp->h.flags);
++ else
++ svc_export_update(tmp, item);
++ }
++ }
++ if (set||new)
++ write_unlock(&svc_export_cache.hash_lock);
++ else
++ read_unlock(&svc_export_cache.hash_lock);
++ if (new && set)
++ cache_fresh(&svc_export_cache, &new->h, item->h.expiry_time);
++ if (new)
++ return new;
++ new = kmalloc(sizeof(*new), GFP_KERNEL);
++ if (new) {
++ cache_init(&new->h);
++ goto retry;
++ }
++ return NULL;
++}
+
+ struct svc_expkey *
+ exp_find_key(svc_client *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp)
+--- linux-2.6.7/fs/nfsd/nfs4callback.c.lsec 2005-03-23 14:28:24.578313224 -0700
++++ linux-2.6.7/fs/nfsd/nfs4callback.c 2005-03-23 14:28:24.578313224 -0700
+@@ -0,0 +1,631 @@
++/*
++ * linux/fs/nfsd/nfs4callback.c
++ *
++ * Copyright (c) 2001 The Regents of the University of Michigan.
++ * All rights reserved.
++ *
++ * Kendrick Smith <kmsmith@umich.edu>
++ * Andy Adamson <andros@umich.edu>
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ * 2. Redistributions in binary form must reproduce the above copyright
++ * notice, this list of conditions and the following disclaimer in the
++ * documentation and/or other materials provided with the distribution.
++ * 3. Neither the name of the University nor the names of its
++ * contributors may be used to endorse or promote products derived
++ * from this software without specific prior written permission.
++ *
++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
++
++#include <linux/config.h>
++#include <linux/module.h>
++#include <linux/list.h>
++#include <linux/inet.h>
++#include <linux/errno.h>
++#include <linux/sunrpc/xdr.h>
++#include <linux/sunrpc/svc.h>
++#include <linux/sunrpc/clnt.h>
++#include <linux/nfsd/nfsd.h>
++#include <linux/nfsd/state.h>
++#include <linux/sunrpc/sched.h>
++#include <linux/nfs4.h>
++
++#define NFSDDBG_FACILITY NFSDDBG_PROC
++
++#define NFSPROC4_CB_NULL 0
++#define NFSPROC4_CB_COMPOUND 1
++
++/* forward declarations */
++static void nfs4_cb_null(struct rpc_task *task);
++
++/* Index of predefined Linux callback client operations */
++
++enum {
++ NFSPROC4_CLNT_CB_NULL = 0,
++ NFSPROC4_CLNT_CB_GETATTR,
++ NFSPROC4_CLNT_CB_RECALL,
++};
++
++enum nfs_cb_opnum4 {
++ OP_CB_GETATTR = 3,
++ OP_CB_RECALL = 4,
++ OP_CB_ILLEGAL = 10044
++};
++
++
++#define NFS4_MAXTAGLEN 20
++
++#define cb_compound_enc_hdr_sz 4
++#define cb_compound_dec_hdr_sz (3 + (NFS4_MAXTAGLEN >> 2))
++#define op_enc_sz 1
++#define op_dec_sz 2
++#define enc_nfs4_fh_sz (1 + (NFS4_FHSIZE >> 2))
++#define enc_stateid_sz 16
++
++#define NFS4_enc_cb_getattr_sz (cb_compound_enc_hdr_sz + \
++ op_enc_sz + \
++ enc_nfs4_fh_sz + 4)
++
++#define NFS4_dec_cb_getattr_sz (cb_compound_dec_hdr_sz + \
++ op_dec_sz + \
++ 11)
++
++#define NFS4_enc_cb_recall_sz (cb_compound_enc_hdr_sz + \
++ 1 + enc_stateid_sz + \
++ enc_nfs4_fh_sz)
++
++#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \
++ op_dec_sz)
++
++/*
++* Generic encode routines from fs/nfs/nfs4xdr.c
++*/
++static inline u32 *
++xdr_writemem(u32 *p, const void *ptr, int nbytes)
++{
++ int tmp = XDR_QUADLEN(nbytes);
++ if (!tmp)
++ return p;
++ p[tmp-1] = 0;
++ memcpy(p, ptr, nbytes);
++ return p + tmp;
++}
++
++#define WRITE32(n) *p++ = htonl(n)
++#define WRITEMEM(ptr,nbytes) do { \
++ p = xdr_writemem(p, ptr, nbytes); \
++} while (0)
++#define RESERVE_SPACE(nbytes) do { \
++ p = xdr_reserve_space(xdr, nbytes); \
++ if (!p) dprintk("NFSD: RESERVE_SPACE(%d) failed in function %s\n", (int) (nbytes), __FUNCTION__); \
++ BUG_ON(!p); \
++} while (0)
++
++/*
++ * Generic decode routines from fs/nfs/nfs4xdr.c
++ */
++#define DECODE_TAIL \
++ status = 0; \
++out: \
++ return status; \
++xdr_error: \
++ dprintk("NFSD: xdr error! (%s:%d)\n", __FILE__, __LINE__); \
++ status = -EIO; \
++ goto out
++
++#define READ32(x) (x) = ntohl(*p++)
++#define READ64(x) do { \
++ (x) = (u64)ntohl(*p++) << 32; \
++ (x) |= ntohl(*p++); \
++} while (0)
++#define READTIME(x) do { \
++ p++; \
++ (x.tv_sec) = ntohl(*p++); \
++ (x.tv_nsec) = ntohl(*p++); \
++} while (0)
++#define READ_BUF(nbytes) do { \
++ p = xdr_inline_decode(xdr, nbytes); \
++ if (!p) { \
++ dprintk("NFSD: %s: reply buffer overflowed in line %d.", \
++ __FUNCTION__, __LINE__); \
++ return -EIO; \
++ } \
++} while (0)
++
++struct nfs4_cb_compound_hdr {
++ int status;
++ u32 ident;
++ u32 nops;
++ u32 taglen;
++ char * tag;
++};
++
++struct nfs4_cb_getattr {
++ struct nfs_fh fh;
++ u32 bm0;
++ u32 bm1;
++ __u64 change_attr;
++ __u64 size;
++ struct timespec mtime;
++};
++
++struct nfs4_cb_recall {
++ nfs4_stateid stateid;
++ int trunc;
++ struct nfs_fh fh;
++};
++
++static struct {
++ int stat;
++ int errno;
++} nfs_cb_errtbl[] = {
++ { NFS4_OK, 0 },
++ { NFS4ERR_PERM, EPERM },
++ { NFS4ERR_NOENT, ENOENT },
++ { NFS4ERR_IO, EIO },
++ { NFS4ERR_NXIO, ENXIO },
++ { NFS4ERR_ACCESS, EACCES },
++ { NFS4ERR_EXIST, EEXIST },
++ { NFS4ERR_XDEV, EXDEV },
++ { NFS4ERR_NOTDIR, ENOTDIR },
++ { NFS4ERR_ISDIR, EISDIR },
++ { NFS4ERR_INVAL, EINVAL },
++ { NFS4ERR_FBIG, EFBIG },
++ { NFS4ERR_NOSPC, ENOSPC },
++ { NFS4ERR_ROFS, EROFS },
++ { NFS4ERR_MLINK, EMLINK },
++ { NFS4ERR_NAMETOOLONG, ENAMETOOLONG },
++ { NFS4ERR_NOTEMPTY, ENOTEMPTY },
++ { NFS4ERR_DQUOT, EDQUOT },
++ { NFS4ERR_STALE, ESTALE },
++ { NFS4ERR_BADHANDLE, EBADHANDLE },
++ { NFS4ERR_BAD_COOKIE, EBADCOOKIE },
++ { NFS4ERR_NOTSUPP, ENOTSUPP },
++ { NFS4ERR_TOOSMALL, ETOOSMALL },
++ { NFS4ERR_SERVERFAULT, ESERVERFAULT },
++ { NFS4ERR_BADTYPE, EBADTYPE },
++ { NFS4ERR_LOCKED, EAGAIN },
++ { NFS4ERR_RESOURCE, EREMOTEIO },
++ { NFS4ERR_SYMLINK, ELOOP },
++ { NFS4ERR_OP_ILLEGAL, EOPNOTSUPP },
++ { NFS4ERR_DEADLOCK, EDEADLK },
++ { -1, EIO }
++};
++
++static int
++nfs_cb_stat_to_errno(int stat)
++{
++ int i;
++ for (i = 0; nfs_cb_errtbl[i].stat != -1; i++) {
++ if (nfs_cb_errtbl[i].stat == stat)
++ return nfs_cb_errtbl[i].errno;
++ }
++ /* If we cannot translate the error, the recovery routines should
++ * handle it.
++ * Note: remaining NFSv4 error codes have values > 10000, so should
++ * not conflict with native Linux error codes.
++ */
++ return stat;
++}
++
++/*
++ * XDR encode
++ */
++
++static int
++encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr)
++{
++ u32 * p;
++
++ RESERVE_SPACE(16);
++ WRITE32(0); /* tag length is always 0 */
++ WRITE32(NFS4_MINOR_VERSION);
++ WRITE32(hdr->ident);
++ WRITE32(hdr->nops);
++ return 0;
++}
++
++static int
++encode_cb_getattr(struct xdr_stream *xdr, struct nfs4_cb_getattr *cb_get)
++{
++ u32 *p;
++ int len = cb_get->fh.size;
++
++ RESERVE_SPACE(20 + len);
++ WRITE32(OP_CB_GETATTR);
++ WRITE32(len);
++ WRITEMEM(cb_get->fh.data, len);
++ WRITE32(2);
++ WRITE32(cb_get->bm0);
++ WRITE32(cb_get->bm1);
++ return 0;
++}
++
++static int
++encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
++{
++ u32 *p;
++ int len = cb_rec->fh.size;
++
++ RESERVE_SPACE(8+sizeof(cb_rec->stateid.data));
++ WRITE32(OP_CB_RECALL);
++ WRITEMEM(cb_rec->stateid.data, sizeof(cb_rec->stateid.data));
++ WRITE32(cb_rec->trunc);
++ WRITE32(len);
++ WRITEMEM(cb_rec->fh.data, len);
++ return 0;
++}
++
++static int
++nfs4_xdr_enc_cb_getattr(struct rpc_rqst *req, u32 *p, struct nfs4_cb_getattr *args)
++{
++ struct xdr_stream xdr;
++ struct nfs4_cb_compound_hdr hdr = {
++ .nops = 1,
++ };
++
++ xdr_init_encode(&xdr, &req->rq_snd_buf, p);
++ encode_cb_compound_hdr(&xdr, &hdr);
++ return (encode_cb_getattr(&xdr, args));
++}
++
++static int
++nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, u32 *p, struct nfs4_cb_recall *args)
++{
++ struct xdr_stream xdr;
++ struct nfs4_cb_compound_hdr hdr = {
++ .nops = 1,
++ };
++
++ xdr_init_encode(&xdr, &req->rq_snd_buf, p);
++ encode_cb_compound_hdr(&xdr, &hdr);
++ return (encode_cb_recall(&xdr, args));
++}
++
++
++static int
++decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){
++ u32 *p;
++
++ READ_BUF(8);
++ READ32(hdr->status);
++ READ32(hdr->taglen);
++ READ_BUF(hdr->taglen + 4);
++ hdr->tag = (char *)p;
++ p += XDR_QUADLEN(hdr->taglen);
++ READ32(hdr->nops);
++ return 0;
++}
++
++static int
++decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
++{
++ u32 *p;
++ u32 op;
++ int32_t nfserr;
++
++ READ_BUF(8);
++ READ32(op);
++ if (op != expected) {
++ dprintk("NFSD: decode_cb_op_hdr: Callback server returned operation"
++ " %d but we issued a request for %d\n",
++ op, expected);
++ return -EIO;
++ }
++ READ32(nfserr);
++ if (nfserr != NFS_OK)
++ return -nfs_cb_stat_to_errno(nfserr);
++ return 0;
++}
++
++static int
++decode_cb_getattr(struct xdr_stream *xdr, struct nfs4_cb_getattr *cb_get)
++{
++ int status;
++ u32 bmlen,
++ attrlen =0,
++ bmval0 =0,
++ bmval1 =0,
++ len = 0;
++ u32 *p;
++
++ status = decode_cb_op_hdr(xdr, OP_CB_GETATTR);
++ if (status)
++ return status;
++ READ_BUF(4);
++ READ32(bmlen);
++ if( (bmlen < 1) || (bmlen > 2))
++ goto xdr_error;
++ READ_BUF((bmlen << 2) + 4);
++ READ32(bmval0);
++ if (bmval0 & ~(FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE))
++ goto out_bad_bitmap;
++ if (bmlen == 2) {
++ READ32(bmval1);
++ if (bmval1 & ~ FATTR4_WORD1_TIME_MODIFY)
++ goto out_bad_bitmap;
++ }
++ READ32(attrlen);
++ if (bmval0 & FATTR4_WORD0_CHANGE) {
++ READ_BUF(8);
++ len += 8;
++ READ64(cb_get->change_attr);
++ dprintk("decode_cb_getattr: changeid=%Ld\n",
++ (long long)cb_get->change_attr);
++ }
++ if (bmval0 & FATTR4_WORD0_SIZE) {
++ READ_BUF(8);
++ len += 8;
++ READ64(cb_get->size);
++ dprintk("decode_cb_getattr: size=%Ld\n",
++ (long long)cb_get->size);
++ }
++ if (bmval1 & FATTR4_WORD1_TIME_MODIFY) {
++ READ_BUF(12);
++ len += 12;
++ READTIME(cb_get->mtime);
++		dprintk("decode_cb_getattr: mtime=%ld\n",
++ (long)cb_get->mtime.tv_sec);
++ }
++ if (len != attrlen)
++ goto xdr_error;
++
++ DECODE_TAIL;
++
++out_bad_bitmap:
++ dprintk("NFSD: %s Callback server returned bad attribute bitmap\n",
++ __FUNCTION__);
++ return -EIO;
++
++}
++
++static int
++nfs4_xdr_dec_cb_getattr(struct rpc_rqst *rqstp, u32 *p, struct nfs4_cb_getattr *res)
++{
++ struct xdr_stream xdr;
++ struct nfs4_cb_compound_hdr hdr;
++ int status;
++
++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
++ status = decode_cb_compound_hdr(&xdr, &hdr);
++ if (status)
++ goto out;
++ status = decode_cb_getattr(&xdr, res);
++out:
++ return status;
++}
++
++static int
++nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, u32 *p)
++{
++ struct xdr_stream xdr;
++ struct nfs4_cb_compound_hdr hdr;
++ int status;
++
++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
++ status = decode_cb_compound_hdr(&xdr, &hdr);
++ if (status)
++ goto out;
++ status = decode_cb_op_hdr(&xdr, OP_CB_RECALL);
++out:
++ return status;
++}
++
++static int
++nfs4_xdr_enc_null(struct rpc_rqst *req, u32 *p)
++{
++ struct xdr_stream xdrs, *xdr = &xdrs;
++
++ xdr_init_encode(&xdrs, &req->rq_snd_buf, p);
++ RESERVE_SPACE(0);
++ return 0;
++}
++
++static int
++nfs4_xdr_dec_null(struct rpc_rqst *req, u32 *p)
++{
++ return 0;
++}
++
++/*
++ * RPC procedure tables
++ */
++#ifndef MAX
++# define MAX(a, b) (((a) > (b))? (a) : (b))
++#endif
++
++#define PROC(proc, argtype, restype) \
++[NFSPROC4_CLNT_##proc] = { \
++ .p_proc = NFSPROC4_CB_COMPOUND, \
++ .p_encode = (kxdrproc_t) nfs4_xdr_##argtype, \
++ .p_decode = (kxdrproc_t) nfs4_xdr_##restype, \
++ .p_bufsiz = MAX(NFS4_##argtype##_sz,NFS4_##restype##_sz) << 2, \
++}
++
++struct rpc_procinfo nfs4_cb_procedures[] = {
++ PROC(CB_GETATTR, enc_cb_getattr, dec_cb_getattr),
++ PROC(CB_RECALL, enc_cb_recall, dec_cb_recall),
++};
++
++struct rpc_version nfs_cb_version4 = {
++ .number = 1,
++ .nrprocs = sizeof(nfs4_cb_procedures)/sizeof(nfs4_cb_procedures[0]),
++ .procs = nfs4_cb_procedures
++};
++
++static struct rpc_version * nfs_cb_version[] = {
++ NULL,
++ &nfs_cb_version4,
++};
++
++struct rpc_procinfo nfs4_cb_null_proc= {
++ .p_proc = NFSPROC4_CB_NULL,
++ .p_encode = (kxdrproc_t)nfs4_xdr_enc_null,
++ .p_decode = (kxdrproc_t) nfs4_xdr_dec_null,
++ .p_bufsiz = 0,
++};
++
++/*
++ * Use the SETCLIENTID credential
++ */
++struct rpc_cred *
++nfsd4_lookupcred(struct nfs4_client *clp, int taskflags)
++{
++ struct auth_cred acred;
++ struct rpc_clnt *clnt = clp->cl_callback.cb_client;
++ struct rpc_cred *ret = NULL;
++
++ if (!clnt)
++ goto out;
++ get_group_info(clp->cl_cred.cr_group_info);
++ acred.uid = clp->cl_cred.cr_uid;
++ acred.gid = clp->cl_cred.cr_gid;
++ acred.group_info = clp->cl_cred.cr_group_info;
++
++ dprintk("NFSD: looking up %s cred\n",
++ clnt->cl_auth->au_ops->au_name);
++ ret = rpcauth_lookup_credcache(clnt->cl_auth, &acred, taskflags);
++ put_group_info(clp->cl_cred.cr_group_info);
++out:
++ return ret;
++}
++
++/*
++ * Set up the callback client and put a NFSPROC4_CB_NULL on the wire...
++ */
++void
++nfsd4_probe_callback(struct nfs4_client *clp)
++{
++ struct sockaddr_in addr;
++ struct nfs4_callback *cb = &clp->cl_callback;
++ struct rpc_timeout timeparms;
++ struct rpc_xprt * xprt;
++ struct rpc_program * program = &cb->cb_program;
++ struct rpc_stat * stat = &cb->cb_stat;
++ struct rpc_clnt * clnt;
++ struct rpc_message msg = {
++ .rpc_proc = &nfs4_cb_null_proc,
++ .rpc_argp = clp,
++ };
++ char hostname[32];
++ int status;
++
++	dprintk("NFSD: probe_callback. cb_parsed %d cb_set %d\n",
++ cb->cb_parsed, cb->cb_set);
++ if (!cb->cb_parsed || cb->cb_set)
++ goto out_err;
++
++ /* Currently, we only support tcp for the callback channel */
++ if (cb->cb_netid.len !=3 || memcmp((char *)cb->cb_netid.data, "tcp", 3))
++ goto out_err;
++
++ /* Initialize address */
++ memset(&addr, 0, sizeof(addr));
++ addr.sin_family = AF_INET;
++ addr.sin_port = htons(cb->cb_port);
++ addr.sin_addr.s_addr = htonl(cb->cb_addr);
++
++ /* Initialize timeout */
++ timeparms.to_initval = HZ;
++ timeparms.to_retries = 5;
++ timeparms.to_maxval = NFSD_LEASE_TIME*HZ;
++ timeparms.to_exponential = 1;
++
++ /* Create RPC transport */
++ if (!(xprt = xprt_create_proto(IPPROTO_TCP, &addr, &timeparms))) {
++ dprintk("NFSD: couldn't create callback transport!\n");
++ goto out_err;
++ }
++
++ /* Initialize rpc_program */
++ program->name = "nfs4_cb";
++ program->number = cb->cb_prog;
++ program->nrvers = sizeof(nfs_cb_version)/sizeof(nfs_cb_version[0]);
++ program->version = nfs_cb_version;
++ program->stats = stat;
++
++ /* Initialize rpc_stat */
++ memset(stat, 0, sizeof(struct rpc_stat));
++ stat->program = program;
++
++ /* Create RPC client
++ *
++ * XXX AUTH_UNIX only - need AUTH_GSS....
++ */
++ sprintf(hostname, "%u.%u.%u.%u", NIPQUAD(addr.sin_addr.s_addr));
++ if (!(clnt = rpc_create_client(xprt, hostname, program, 1, RPC_AUTH_UNIX))) {
++ dprintk("NFSD: couldn't create callback client\n");
++ goto out_xprt;
++ }
++ clnt->cl_intr = 1;
++ clnt->cl_softrtry = 1;
++ clnt->cl_chatty = 1;
++ cb->cb_client = clnt;
++
++ /* Kick rpciod, put the call on the wire. */
++
++ if (rpciod_up() != 0) {
++ dprintk("nfsd: couldn't start rpciod for callbacks!\n");
++ goto out_clnt;
++ }
++
++ /* the task holds a reference to the nfs4_client struct */
++ atomic_inc(&clp->cl_count);
++
++ msg.rpc_cred = nfsd4_lookupcred(clp,0);
++ status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, nfs4_cb_null, 0);
++
++ if (status != 0) {
++ dprintk("NFSD: asynchronous NFSPROC4_CB_NULL failed!\n");
++ goto out_rpciod;
++ }
++ return;
++
++out_rpciod:
++ rpciod_down();
++out_clnt:
++ rpc_shutdown_client(clnt);
++ goto out_err;
++out_xprt:
++ xprt_destroy(xprt);
++out_err:
++ dprintk("NFSD: warning: no callback path to client %.*s\n",
++ clp->cl_name.len, clp->cl_name.data);
++ cb->cb_client = NULL;
++}
++
++static void
++nfs4_cb_null(struct rpc_task *task)
++{
++ struct nfs4_client *clp = (struct nfs4_client *)task->tk_msg.rpc_argp;
++ struct nfs4_callback *cb = &clp->cl_callback;
++ u32 addr = htonl(cb->cb_addr);
++
++ dprintk("NFSD: nfs4_cb_null task->tk_status %d\n", task->tk_status);
++
++ if (task->tk_status < 0) {
++ dprintk("NFSD: callback establishment to client %.*s failed\n",
++ clp->cl_name.len, clp->cl_name.data);
++ goto out;
++ }
++ cb->cb_set = 1;
++ dprintk("NFSD: callback set to client %u.%u.%u.%u\n", NIPQUAD(addr));
++out:
++ put_nfs4_client(clp);
++}
+--- linux-2.6.7/fs/nfsd/nfs4xdr.c.lsec 2004-06-15 23:19:52.000000000 -0600
++++ linux-2.6.7/fs/nfsd/nfs4xdr.c 2005-03-23 14:28:23.924412632 -0700
+@@ -55,6 +55,8 @@
+ #include <linux/nfsd/state.h>
+ #include <linux/nfsd/xdr4.h>
+ #include <linux/nfsd_idmap.h>
++#include <linux/nfs4.h>
++#include <linux/nfs4_acl.h>
+
+ #define NFSDDBG_FACILITY NFSDDBG_XDR
+
+@@ -287,27 +289,40 @@ u32 *read_buf(struct nfsd4_compoundargs
+ return p;
+ }
+
+-char *savemem(struct nfsd4_compoundargs *argp, u32 *p, int nbytes)
++static int
++defer_free(struct nfsd4_compoundargs *argp,
++ void (*release)(const void *), void *p)
+ {
+ struct tmpbuf *tb;
++
++ tb = kmalloc(sizeof(*tb), GFP_KERNEL);
++ if (!tb)
++ return -ENOMEM;
++ tb->buf = p;
++ tb->release = release;
++ tb->next = argp->to_free;
++ argp->to_free = tb;
++ return 0;
++}
++
++char *savemem(struct nfsd4_compoundargs *argp, u32 *p, int nbytes)
++{
++ void *new = NULL;
+ if (p == argp->tmp) {
+- p = kmalloc(nbytes, GFP_KERNEL);
+- if (!p) return NULL;
++ new = kmalloc(nbytes, GFP_KERNEL);
++ if (!new) return NULL;
++ p = new;
+ memcpy(p, argp->tmp, nbytes);
+ } else {
+ if (p != argp->tmpp)
+ BUG();
+ argp->tmpp = NULL;
+ }
+- tb = kmalloc(sizeof(*tb), GFP_KERNEL);
+- if (!tb) {
+- kfree(p);
++ if (defer_free(argp, kfree, p)) {
++ kfree(new);
+ return NULL;
+- }
+- tb->buf = p;
+- tb->next = argp->to_free;
+- argp->to_free = tb;
+- return (char*)p;
++ } else
++ return (char *)p;
+ }
+
+
+@@ -335,7 +350,8 @@ nfsd4_decode_bitmap(struct nfsd4_compoun
+ }
+
+ static int
+-nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *iattr)
++nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *iattr,
++ struct nfs4_acl **acl)
+ {
+ int expected_len, len = 0;
+ u32 dummy32;
+@@ -364,6 +380,51 @@ nfsd4_decode_fattr(struct nfsd4_compound
+ READ64(iattr->ia_size);
+ iattr->ia_valid |= ATTR_SIZE;
+ }
++ if (bmval[0] & FATTR4_WORD0_ACL) {
++ int nace, i;
++ struct nfs4_ace ace;
++
++ READ_BUF(4); len += 4;
++ READ32(nace);
++
++ *acl = nfs4_acl_new();
++ if (*acl == NULL) {
++ status = -ENOMEM;
++ goto out_nfserr;
++ }
++ defer_free(argp, (void (*)(const void *))nfs4_acl_free, *acl);
++
++ for (i = 0; i < nace; i++) {
++ READ_BUF(16); len += 16;
++ READ32(ace.type);
++ READ32(ace.flag);
++ READ32(ace.access_mask);
++ READ32(dummy32);
++ READ_BUF(dummy32);
++ len += XDR_QUADLEN(dummy32) << 2;
++ READMEM(buf, dummy32);
++ if (check_utf8(buf, dummy32))
++ return nfserr_inval;
++ ace.whotype = nfs4_acl_get_whotype(buf, dummy32);
++ status = 0;
++ if (ace.whotype != NFS4_ACL_WHO_NAMED)
++ ace.who = 0;
++ else if (ace.flag & NFS4_ACE_IDENTIFIER_GROUP)
++ status = nfsd_map_name_to_gid(argp->rqstp,
++ buf, dummy32, &ace.who);
++ else
++ status = nfsd_map_name_to_uid(argp->rqstp,
++ buf, dummy32, &ace.who);
++ if (status)
++ goto out_nfserr;
++ if (nfs4_acl_add_ace(*acl, ace.type, ace.flag,
++ ace.access_mask, ace.whotype, ace.who) != 0) {
++ status = -ENOMEM;
++ goto out_nfserr;
++ }
++ }
++ } else
++ *acl = NULL;
+ if (bmval[1] & FATTR4_WORD1_MODE) {
+ READ_BUF(4);
+ len += 4;
+@@ -549,7 +610,7 @@ nfsd4_decode_create(struct nfsd4_compoun
+ if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval)))
+ return status;
+
+- if ((status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr)))
++ if ((status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, &create->cr_acl)))
+ goto out;
+
+ DECODE_TAIL;
+@@ -698,7 +759,7 @@ nfsd4_decode_open(struct nfsd4_compounda
+ switch (open->op_createmode) {
+ case NFS4_CREATE_UNCHECKED:
+ case NFS4_CREATE_GUARDED:
+- if ((status = nfsd4_decode_fattr(argp, open->op_bmval, &open->op_iattr)))
++ if ((status = nfsd4_decode_fattr(argp, open->op_bmval, &open->op_iattr, &open->op_acl)))
+ goto out;
+ break;
+ case NFS4_CREATE_EXCLUSIVE:
+@@ -875,7 +936,7 @@ nfsd4_decode_setattr(struct nfsd4_compou
+ READ_BUF(sizeof(stateid_t));
+ READ32(setattr->sa_stateid.si_generation);
+ COPYMEM(&setattr->sa_stateid.si_opaque, sizeof(stateid_opaque_t));
+- if ((status = nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr)))
++ if ((status = nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr, &setattr->sa_acl)))
+ goto out;
+
+ DECODE_TAIL;
+@@ -1288,32 +1349,24 @@ static u32 nfs4_ftypes[16] = {
+ NF4SOCK, NF4BAD, NF4LNK, NF4BAD,
+ };
+
+-static inline int
+-xdr_padding(int l)
+-{
+- return 3 - ((l - 1) & 3); /* smallest i>=0 such that (l+i)%4 = 0 */
+-}
+-
+ static int
+-nfsd4_encode_name(struct svc_rqst *rqstp, int group, uid_t id,
++nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group,
+ u32 **p, int *buflen)
+ {
+ int status;
+- u32 len;
+
+ if (*buflen < (XDR_QUADLEN(IDMAP_NAMESZ) << 2) + 4)
+ return nfserr_resource;
+- if (group)
++ if (whotype != NFS4_ACL_WHO_NAMED)
++ status = nfs4_acl_write_who(whotype, (u8 *)(*p + 1));
++ else if (group)
+ status = nfsd_map_gid_to_name(rqstp, id, (u8 *)(*p + 1));
+ else
+ status = nfsd_map_uid_to_name(rqstp, id, (u8 *)(*p + 1));
+ if (status < 0)
+ return nfserrno(status);
+- len = (unsigned)status;
+- *(*p)++ = htonl(len);
+- memset((u8 *)*p + len, 0, xdr_padding(len));
+- *p += XDR_QUADLEN(len);
+- *buflen -= (XDR_QUADLEN(len) << 2) + 4;
++ *p = xdr_encode_opaque(*p, NULL, status);
++ *buflen -= (XDR_QUADLEN(status) << 2) + 4;
+ BUG_ON(*buflen < 0);
+ return 0;
+ }
+@@ -1321,13 +1374,20 @@ nfsd4_encode_name(struct svc_rqst *rqstp
+ static inline int
+ nfsd4_encode_user(struct svc_rqst *rqstp, uid_t uid, u32 **p, int *buflen)
+ {
+- return nfsd4_encode_name(rqstp, uid, 0, p, buflen);
++ return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, uid, 0, p, buflen);
+ }
+
+ static inline int
+ nfsd4_encode_group(struct svc_rqst *rqstp, uid_t gid, u32 **p, int *buflen)
+ {
+- return nfsd4_encode_name(rqstp, gid, 1, p, buflen);
++ return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, gid, 1, p, buflen);
++}
++
++static inline int
++nfsd4_encode_aclname(struct svc_rqst *rqstp, int whotype, uid_t id, int group,
++ u32 **p, int *buflen)
++{
++ return nfsd4_encode_name(rqstp, whotype, id, group, p, buflen);
+ }
+
+
+@@ -1354,6 +1414,8 @@ nfsd4_encode_fattr(struct svc_fh *fhp, s
+ u64 dummy64;
+ u32 *p = buffer;
+ int status;
++ int aclsupport = 0;
++ struct nfs4_acl *acl = NULL;
+
+ BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1);
+ BUG_ON(bmval0 & ~NFSD_SUPPORTED_ATTRS_WORD0);
+@@ -1376,6 +1438,17 @@ nfsd4_encode_fattr(struct svc_fh *fhp, s
+ goto out;
+ fhp = &tempfh;
+ }
++ if (bmval0 & (FATTR4_WORD0_ACL | FATTR4_WORD0_ACLSUPPORT
++ | FATTR4_WORD0_SUPPORTED_ATTRS)) {
++ status = nfsd4_get_nfs4_acl(rqstp, dentry, &acl);
++ aclsupport = (status == 0);
++ if (bmval0 & FATTR4_WORD0_ACL) {
++ if (status == -EOPNOTSUPP)
++ bmval0 &= ~FATTR4_WORD0_ACL;
++ else if (status != 0)
++ goto out_nfserr;
++ }
++ }
+ if ((buflen -= 16) < 0)
+ goto out_resource;
+
+@@ -1388,7 +1461,9 @@ nfsd4_encode_fattr(struct svc_fh *fhp, s
+ if ((buflen -= 12) < 0)
+ goto out_resource;
+ WRITE32(2);
+- WRITE32(NFSD_SUPPORTED_ATTRS_WORD0);
++ WRITE32(aclsupport ?
++ NFSD_SUPPORTED_ATTRS_WORD0 :
++ NFSD_SUPPORTED_ATTRS_WORD0 & ~FATTR4_WORD0_ACL);
+ WRITE32(NFSD_SUPPORTED_ATTRS_WORD1);
+ }
+ if (bmval0 & FATTR4_WORD0_TYPE) {
+@@ -1459,10 +1534,44 @@ nfsd4_encode_fattr(struct svc_fh *fhp, s
+ goto out_resource;
+ WRITE32(0);
+ }
++ if (bmval0 & FATTR4_WORD0_ACL) {
++ struct nfs4_ace *ace;
++ struct list_head *h;
++
++ if (acl == NULL) {
++ if ((buflen -= 4) < 0)
++ goto out_resource;
++
++ WRITE32(0);
++ goto out_acl;
++ }
++ if ((buflen -= 4) < 0)
++ goto out_resource;
++ WRITE32(acl->naces);
++
++ list_for_each(h, &acl->ace_head) {
++ ace = list_entry(h, struct nfs4_ace, l_ace);
++
++ if ((buflen -= 4*3) < 0)
++ goto out_resource;
++ WRITE32(ace->type);
++ WRITE32(ace->flag);
++ WRITE32(ace->access_mask & NFS4_ACE_MASK_ALL);
++ status = nfsd4_encode_aclname(rqstp, ace->whotype,
++ ace->who, ace->flag & NFS4_ACE_IDENTIFIER_GROUP,
++ &p, &buflen);
++ if (status == nfserr_resource)
++ goto out_resource;
++ if (status)
++ goto out;
++ }
++ }
++out_acl:
+ if (bmval0 & FATTR4_WORD0_ACLSUPPORT) {
+ if ((buflen -= 4) < 0)
+ goto out_resource;
+- WRITE32(0);
++ WRITE32(aclsupport ?
++ ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL : 0);
+ }
+ if (bmval0 & FATTR4_WORD0_CANSETTIME) {
+ if ((buflen -= 4) < 0)
+@@ -1645,6 +1754,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, s
+ status = nfs_ok;
+
+ out:
++ nfs4_acl_free(acl);
+ if (fhp == &tempfh)
+ fh_put(&tempfh);
+ return status;
+@@ -2471,6 +2581,24 @@ nfs4svc_encode_voidres(struct svc_rqst *
+ return xdr_ressize_check(rqstp, p);
+ }
+
++void nfsd4_release_compoundargs(struct nfsd4_compoundargs *args)
++{
++ if (args->ops != args->iops) {
++ kfree(args->ops);
++ args->ops = args->iops;
++ }
++ if (args->tmpp) {
++ kfree(args->tmpp);
++ args->tmpp = NULL;
++ }
++ while (args->to_free) {
++ struct tmpbuf *tb = args->to_free;
++ args->to_free = tb->next;
++ tb->release(tb->buf);
++ kfree(tb);
++ }
++}
++
+ int
+ nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, u32 *p, struct nfsd4_compoundargs *args)
+ {
+@@ -2487,20 +2615,7 @@ nfs4svc_decode_compoundargs(struct svc_r
+
+ status = nfsd4_decode_compound(args);
+ if (status) {
+- if (args->ops != args->iops) {
+- kfree(args->ops);
+- args->ops = args->iops;
+- }
+- if (args->tmpp) {
+- kfree(args->tmpp);
+- args->tmpp = NULL;
+- }
+- while (args->to_free) {
+- struct tmpbuf *tb = args->to_free;
+- args->to_free = tb->next;
+- kfree(tb->buf);
+- kfree(tb);
+- }
++ nfsd4_release_compoundargs(args);
+ }
+ return !status;
+ }
+--- linux-2.6.7/fs/nfsd/nfs4proc.c.lsec 2004-06-15 23:20:26.000000000 -0600
++++ linux-2.6.7/fs/nfsd/nfs4proc.c 2005-03-23 14:28:24.080388920 -0700
+@@ -52,6 +52,7 @@
+ #include <linux/nfs4.h>
+ #include <linux/nfsd/state.h>
+ #include <linux/nfsd/xdr4.h>
++#include <linux/nfs4_acl.h>
+
+ #define NFSDDBG_FACILITY NFSDDBG_PROC
+
+@@ -135,9 +136,11 @@ do_open_fhandle(struct svc_rqst *rqstp,
+ {
+ int status;
+
+- dprintk("NFSD: do_open_fhandle\n");
++ /* Only reclaims from previously confirmed clients are valid */
++ if ((status = nfs4_check_open_reclaim(&open->op_clientid)))
++ return status;
+
+- /* we don't know the target directory, and therefore can not
++ /* We don't know the target directory, and therefore can not
+ * set the change info
+ */
+
+@@ -172,8 +175,7 @@ nfsd4_open(struct svc_rqst *rqstp, struc
+ if (nfs4_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
+ return nfserr_grace;
+
+- if (nfs4_in_no_grace() &&
+- open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
++ if (!nfs4_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
+ return nfserr_no_grace;
+
+ /* This check required by spec. */
+@@ -318,7 +320,7 @@ nfsd4_commit(struct svc_rqst *rqstp, str
+ return status;
+ }
+
+-static inline int
++static int
+ nfsd4_create(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_create *create)
+ {
+ struct svc_fh resfh;
+@@ -435,7 +437,7 @@ nfsd4_link(struct svc_rqst *rqstp, struc
+ return status;
+ }
+
+-static inline int
++static int
+ nfsd4_lookupp(struct svc_rqst *rqstp, struct svc_fh *current_fh)
+ {
+ struct svc_fh tmp_fh;
+@@ -619,7 +621,7 @@ nfsd4_setattr(struct svc_rqst *rqstp, st
+ status = nfserr_bad_stateid;
+ if (ZERO_STATEID(&setattr->sa_stateid) || ONE_STATEID(&setattr->sa_stateid)) {
+ dprintk("NFSD: nfsd4_setattr: magic stateid!\n");
+- return status;
++ goto out;
+ }
+
+ nfs4_lock_state();
+@@ -627,17 +629,25 @@ nfsd4_setattr(struct svc_rqst *rqstp, st
+ &setattr->sa_stateid,
+ CHECK_FH | RDWR_STATE, &stp))) {
+ dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n");
+- goto out;
++ goto out_unlock;
+ }
+ status = nfserr_openmode;
+ if (!access_bits_permit_write(stp->st_access_bmap)) {
+ dprintk("NFSD: nfsd4_setattr: not opened for write!\n");
+- goto out;
++ goto out_unlock;
+ }
+ nfs4_unlock_state();
+ }
+- return (nfsd_setattr(rqstp, current_fh, &setattr->sa_iattr, 0, (time_t)0));
++ status = nfs_ok;
++ if (setattr->sa_acl != NULL)
++ status = nfsd4_set_nfs4_acl(rqstp, current_fh, setattr->sa_acl);
++ if (status)
++ goto out;
++ status = nfsd_setattr(rqstp, current_fh, &setattr->sa_iattr,
++ 0, (time_t)0);
+ out:
++ return status;
++out_unlock:
+ nfs4_unlock_state();
+ return status;
+ }
+@@ -773,13 +783,20 @@ nfsd4_proc_compound(struct svc_rqst *rqs
+ struct nfsd4_compoundres *resp)
+ {
+ struct nfsd4_op *op;
+- struct svc_fh current_fh;
+- struct svc_fh save_fh;
++ struct svc_fh *current_fh = NULL;
++ struct svc_fh *save_fh = NULL;
+ int slack_space; /* in words, not bytes! */
+ int status;
+
+- fh_init(¤t_fh, NFS4_FHSIZE);
+- fh_init(&save_fh, NFS4_FHSIZE);
++ status = nfserr_resource;
++ current_fh = kmalloc(sizeof(*current_fh), GFP_KERNEL);
++ if (current_fh == NULL)
++ goto out;
++ fh_init(current_fh, NFS4_FHSIZE);
++ save_fh = kmalloc(sizeof(*save_fh), GFP_KERNEL);
++ if (save_fh == NULL)
++ goto out;
++ fh_init(save_fh, NFS4_FHSIZE);
+
+ resp->xbuf = &rqstp->rq_res;
+ resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len;
+@@ -831,7 +848,7 @@ nfsd4_proc_compound(struct svc_rqst *rqs
+ * SETATTR NOFILEHANDLE error handled in nfsd4_setattr
+ * due to required returned bitmap argument
+ */
+- if ((!current_fh.fh_dentry) &&
++ if ((!current_fh->fh_dentry) &&
+ !((op->opnum == OP_PUTFH) || (op->opnum == OP_PUTROOTFH) ||
+ (op->opnum == OP_SETCLIENTID) ||
+ (op->opnum == OP_SETCLIENTID_CONFIRM) ||
+@@ -843,105 +860,105 @@ nfsd4_proc_compound(struct svc_rqst *rqs
+ }
+ switch (op->opnum) {
+ case OP_ACCESS:
+- op->status = nfsd4_access(rqstp, ¤t_fh, &op->u.access);
++ op->status = nfsd4_access(rqstp, current_fh, &op->u.access);
+ break;
+ case OP_CLOSE:
+- op->status = nfsd4_close(rqstp, ¤t_fh, &op->u.close);
++ op->status = nfsd4_close(rqstp, current_fh, &op->u.close);
+ if (op->u.close.cl_stateowner)
+ op->replay =
+ &op->u.close.cl_stateowner->so_replay;
+ break;
+ case OP_COMMIT:
+- op->status = nfsd4_commit(rqstp, ¤t_fh, &op->u.commit);
++ op->status = nfsd4_commit(rqstp, current_fh, &op->u.commit);
+ break;
+ case OP_CREATE:
+- op->status = nfsd4_create(rqstp, ¤t_fh, &op->u.create);
++ op->status = nfsd4_create(rqstp, current_fh, &op->u.create);
+ break;
+ case OP_GETATTR:
+- op->status = nfsd4_getattr(rqstp, ¤t_fh, &op->u.getattr);
++ op->status = nfsd4_getattr(rqstp, current_fh, &op->u.getattr);
+ break;
+ case OP_GETFH:
+- op->status = nfsd4_getfh(¤t_fh, &op->u.getfh);
++ op->status = nfsd4_getfh(current_fh, &op->u.getfh);
+ break;
+ case OP_LINK:
+- op->status = nfsd4_link(rqstp, ¤t_fh, &save_fh, &op->u.link);
++ op->status = nfsd4_link(rqstp, current_fh, save_fh, &op->u.link);
+ break;
+ case OP_LOCK:
+- op->status = nfsd4_lock(rqstp, ¤t_fh, &op->u.lock);
++ op->status = nfsd4_lock(rqstp, current_fh, &op->u.lock);
+ if (op->u.lock.lk_stateowner)
+ op->replay =
+ &op->u.lock.lk_stateowner->so_replay;
+ break;
+ case OP_LOCKT:
+- op->status = nfsd4_lockt(rqstp, ¤t_fh, &op->u.lockt);
++ op->status = nfsd4_lockt(rqstp, current_fh, &op->u.lockt);
+ break;
+ case OP_LOCKU:
+- op->status = nfsd4_locku(rqstp, ¤t_fh, &op->u.locku);
++ op->status = nfsd4_locku(rqstp, current_fh, &op->u.locku);
+ if (op->u.locku.lu_stateowner)
+ op->replay =
+ &op->u.locku.lu_stateowner->so_replay;
+ break;
+ case OP_LOOKUP:
+- op->status = nfsd4_lookup(rqstp, ¤t_fh, &op->u.lookup);
++ op->status = nfsd4_lookup(rqstp, current_fh, &op->u.lookup);
+ break;
+ case OP_LOOKUPP:
+- op->status = nfsd4_lookupp(rqstp, ¤t_fh);
++ op->status = nfsd4_lookupp(rqstp, current_fh);
+ break;
+ case OP_NVERIFY:
+- op->status = nfsd4_verify(rqstp, ¤t_fh, &op->u.nverify);
++ op->status = nfsd4_verify(rqstp, current_fh, &op->u.nverify);
+ if (op->status == nfserr_not_same)
+ op->status = nfs_ok;
+ break;
+ case OP_OPEN:
+- op->status = nfsd4_open(rqstp, ¤t_fh, &op->u.open);
++ op->status = nfsd4_open(rqstp, current_fh, &op->u.open);
+ if (op->u.open.op_stateowner)
+ op->replay =
+ &op->u.open.op_stateowner->so_replay;
+ break;
+ case OP_OPEN_CONFIRM:
+- op->status = nfsd4_open_confirm(rqstp, ¤t_fh, &op->u.open_confirm);
++ op->status = nfsd4_open_confirm(rqstp, current_fh, &op->u.open_confirm);
+ if (op->u.open_confirm.oc_stateowner)
+ op->replay =
+ &op->u.open_confirm.oc_stateowner->so_replay;
+ break;
+ case OP_OPEN_DOWNGRADE:
+- op->status = nfsd4_open_downgrade(rqstp, ¤t_fh, &op->u.open_downgrade);
++ op->status = nfsd4_open_downgrade(rqstp, current_fh, &op->u.open_downgrade);
+ if (op->u.open_downgrade.od_stateowner)
+ op->replay =
+ &op->u.open_downgrade.od_stateowner->so_replay;
+ break;
+ case OP_PUTFH:
+- op->status = nfsd4_putfh(rqstp, ¤t_fh, &op->u.putfh);
++ op->status = nfsd4_putfh(rqstp, current_fh, &op->u.putfh);
+ break;
+ case OP_PUTROOTFH:
+- op->status = nfsd4_putrootfh(rqstp, ¤t_fh);
++ op->status = nfsd4_putrootfh(rqstp, current_fh);
+ break;
+ case OP_READ:
+- op->status = nfsd4_read(rqstp, ¤t_fh, &op->u.read);
++ op->status = nfsd4_read(rqstp, current_fh, &op->u.read);
+ break;
+ case OP_READDIR:
+- op->status = nfsd4_readdir(rqstp, ¤t_fh, &op->u.readdir);
++ op->status = nfsd4_readdir(rqstp, current_fh, &op->u.readdir);
+ break;
+ case OP_READLINK:
+- op->status = nfsd4_readlink(rqstp, ¤t_fh, &op->u.readlink);
++ op->status = nfsd4_readlink(rqstp, current_fh, &op->u.readlink);
+ break;
+ case OP_REMOVE:
+- op->status = nfsd4_remove(rqstp, ¤t_fh, &op->u.remove);
++ op->status = nfsd4_remove(rqstp, current_fh, &op->u.remove);
+ break;
+ case OP_RENAME:
+- op->status = nfsd4_rename(rqstp, ¤t_fh, &save_fh, &op->u.rename);
++ op->status = nfsd4_rename(rqstp, current_fh, save_fh, &op->u.rename);
+ break;
+ case OP_RENEW:
+ op->status = nfsd4_renew(&op->u.renew);
+ break;
+ case OP_RESTOREFH:
+- op->status = nfsd4_restorefh(¤t_fh, &save_fh);
++ op->status = nfsd4_restorefh(current_fh, save_fh);
+ break;
+ case OP_SAVEFH:
+- op->status = nfsd4_savefh(¤t_fh, &save_fh);
++ op->status = nfsd4_savefh(current_fh, save_fh);
+ break;
+ case OP_SETATTR:
+- op->status = nfsd4_setattr(rqstp, ¤t_fh, &op->u.setattr);
++ op->status = nfsd4_setattr(rqstp, current_fh, &op->u.setattr);
+ break;
+ case OP_SETCLIENTID:
+ op->status = nfsd4_setclientid(rqstp, &op->u.setclientid);
+@@ -950,12 +967,12 @@ nfsd4_proc_compound(struct svc_rqst *rqs
+ op->status = nfsd4_setclientid_confirm(rqstp, &op->u.setclientid_confirm);
+ break;
+ case OP_VERIFY:
+- op->status = nfsd4_verify(rqstp, ¤t_fh, &op->u.verify);
++ op->status = nfsd4_verify(rqstp, current_fh, &op->u.verify);
+ if (op->status == nfserr_same)
+ op->status = nfs_ok;
+ break;
+ case OP_WRITE:
+- op->status = nfsd4_write(rqstp, ¤t_fh, &op->u.write);
++ op->status = nfsd4_write(rqstp, current_fh, &op->u.write);
+ break;
+ case OP_RELEASE_LOCKOWNER:
+ op->status = nfsd4_release_lockowner(rqstp, &op->u.release_lockowner);
+@@ -976,22 +993,13 @@ encode_op:
+ }
+
+ out:
+- if (args->ops != args->iops) {
+- kfree(args->ops);
+- args->ops = args->iops;
+- }
+- if (args->tmpp) {
+- kfree(args->tmpp);
+- args->tmpp = NULL;
+- }
+- while (args->to_free) {
+- struct tmpbuf *tb = args->to_free;
+- args->to_free = tb->next;
+- kfree(tb->buf);
+- kfree(tb);
+- }
+- fh_put(¤t_fh);
+- fh_put(&save_fh);
++ nfsd4_release_compoundargs(args);
++ if (current_fh)
++ fh_put(current_fh);
++ kfree(current_fh);
++ if (save_fh)
++ fh_put(save_fh);
++ kfree(save_fh);
+ return status;
+ }
+
+--- linux-2.6.7/fs/nfsd/nfs4state.c.lsec 2004-06-15 23:19:43.000000000 -0600
++++ linux-2.6.7/fs/nfsd/nfs4state.c 2005-03-23 14:28:24.028396824 -0700
+@@ -51,6 +51,9 @@
+ #define NFSDDBG_FACILITY NFSDDBG_PROC
+
+ /* Globals */
++static time_t lease_time = 90; /* default lease time */
++static time_t old_lease_time = 90; /* past incarnation lease time */
++static u32 nfs4_reclaim_init = 0;
+ time_t boot_time;
+ static time_t grace_end = 0;
+ static u32 current_clientid = 1;
+@@ -82,7 +85,7 @@ struct nfs4_stateid * find_stateid(state
+ * protects clientid_hashtbl[], clientstr_hashtbl[],
+ * unconfstr_hashtbl[], uncofid_hashtbl[].
+ */
+-static struct semaphore client_sema;
++static DECLARE_MUTEX(client_sema);
+
+ void
+ nfs4_lock_state(void)
+@@ -131,8 +134,11 @@ static void release_file(struct nfs4_fil
+ ((id) & CLIENT_HASH_MASK)
+ #define clientstr_hashval(name, namelen) \
+ (opaque_hashval((name), (namelen)) & CLIENT_HASH_MASK)
+-
+-/* conf_id_hashtbl[], and conf_str_hashtbl[] hold confirmed
++/*
++ * reclaim_str_hashtbl[] holds known client info from previous reset/reboot
++ * used in reboot/reset lease grace period processing
++ *
++ * conf_id_hashtbl[], and conf_str_hashtbl[] hold confirmed
+ * setclientid_confirmed info.
+ *
+ * unconf_str_hastbl[] and unconf_id_hashtbl[] hold unconfirmed
+@@ -144,6 +150,8 @@ static void release_file(struct nfs4_fil
+ * close_lru holds (open) stateowner queue ordered by nfs4_stateowner.so_time
+ * for last close replay.
+ */
++static struct list_head reclaim_str_hashtbl[CLIENT_HASH_SIZE];
++static int reclaim_str_hashtbl_size;
+ static struct list_head conf_id_hashtbl[CLIENT_HASH_SIZE];
+ static struct list_head conf_str_hashtbl[CLIENT_HASH_SIZE];
+ static struct list_head unconf_str_hashtbl[CLIENT_HASH_SIZE];
+@@ -208,12 +216,20 @@ free_client(struct nfs4_client *clp)
+ kfree(clp);
+ }
+
+-static void
++void
++put_nfs4_client(struct nfs4_client *clp)
++{
++ if (atomic_dec_and_test(&clp->cl_count))
++ free_client(clp);
++}
++
++void
+ expire_client(struct nfs4_client *clp)
+ {
+ struct nfs4_stateowner *sop;
+
+- dprintk("NFSD: expire_client\n");
++ dprintk("NFSD: expire_client cl_count %d\n",
++ atomic_read(&clp->cl_count));
+ list_del(&clp->cl_idhash);
+ list_del(&clp->cl_strhash);
+ list_del(&clp->cl_lru);
+@@ -221,7 +237,7 @@ expire_client(struct nfs4_client *clp)
+ sop = list_entry(clp->cl_perclient.next, struct nfs4_stateowner, so_perclient);
+ release_stateowner(sop);
+ }
+- free_client(clp);
++ put_nfs4_client(clp);
+ }
+
+ static struct nfs4_client *
+@@ -230,6 +246,7 @@ create_client(struct xdr_netobj name) {
+
+ if(!(clp = alloc_client(name)))
+ goto out;
++ atomic_set(&clp->cl_count, 1);
+ INIT_LIST_HEAD(&clp->cl_idhash);
+ INIT_LIST_HEAD(&clp->cl_strhash);
+ INIT_LIST_HEAD(&clp->cl_perclient);
+@@ -339,6 +356,99 @@ move_to_confirmed(struct nfs4_client *cl
+ renew_client(clp);
+ }
+
++
++/* a helper function for parse_callback */
++static int
++parse_octet(unsigned int *lenp, char **addrp)
++{
++ unsigned int len = *lenp;
++ char *p = *addrp;
++ int n = -1;
++ char c;
++
++ for (;;) {
++ if (!len)
++ break;
++ len--;
++ c = *p++;
++ if (c == '.')
++ break;
++ if ((c < '0') || (c > '9')) {
++ n = -1;
++ break;
++ }
++ if (n < 0)
++ n = 0;
++ n = (n * 10) + (c - '0');
++ if (n > 255) {
++ n = -1;
++ break;
++ }
++ }
++ *lenp = len;
++ *addrp = p;
++ return n;
++}
++
++/* parse and set the setclientid ipv4 callback address */
++int
++parse_ipv4(unsigned int addr_len, char *addr_val, unsigned int *cbaddrp, unsigned short *cbportp)
++{
++ int temp = 0;
++ u32 cbaddr = 0;
++ u16 cbport = 0;
++ u32 addrlen = addr_len;
++ char *addr = addr_val;
++ int i, shift;
++
++ /* ipaddress */
++ shift = 24;
++ for(i = 4; i > 0 ; i--) {
++ if ((temp = parse_octet(&addrlen, &addr)) < 0) {
++ return 0;
++ }
++ cbaddr |= (temp << shift);
++ if(shift > 0)
++ shift -= 8;
++ }
++ *cbaddrp = cbaddr;
++
++ /* port */
++ shift = 8;
++ for(i = 2; i > 0 ; i--) {
++ if ((temp = parse_octet(&addrlen, &addr)) < 0) {
++ return 0;
++ }
++ cbport |= (temp << shift);
++ if(shift > 0)
++ shift -= 8;
++ }
++ *cbportp = cbport;
++ return 1;
++}
++
++void
++gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se)
++{
++ struct nfs4_callback *cb = &clp->cl_callback;
++
++ if( !(parse_ipv4(se->se_callback_addr_len, se->se_callback_addr_val,
++ &cb->cb_addr, &cb->cb_port))) {
++		printk(KERN_INFO "NFSD: BAD callback address.\n");
++ printk(KERN_INFO "NFSD: this client (clientid %08x/%08x) "
++ "will not receive delegations\n",
++ clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id);
++
++ cb->cb_parsed = 0;
++ return;
++ }
++ cb->cb_netid.len = se->se_callback_netid_len;
++ cb->cb_netid.data = se->se_callback_netid_val;
++ cb->cb_prog = se->se_callback_prog;
++ cb->cb_ident = se->se_callback_ident;
++ cb->cb_parsed = 1;
++}
++
+ /*
+ * RFC 3010 has a complex implmentation description of processing a
+ * SETCLIENTID request consisting of 5 bullets, labeled as
+@@ -450,6 +560,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp
+ copy_cred(&new->cl_cred,&rqstp->rq_cred);
+ gen_clid(new);
+ gen_confirm(new);
++ gen_callback(new, setclid);
+ add_to_unconfirmed(new, strhashval);
+ } else if (cmp_verf(&conf->cl_verifier, &clverifier)) {
+ /*
+@@ -477,6 +588,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp
+ copy_cred(&new->cl_cred,&rqstp->rq_cred);
+ copy_clid(new, conf);
+ gen_confirm(new);
++ gen_callback(new, setclid);
+ add_to_unconfirmed(new,strhashval);
+ } else if (!unconf) {
+ /*
+@@ -494,6 +606,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp
+ copy_cred(&new->cl_cred,&rqstp->rq_cred);
+ gen_clid(new);
+ gen_confirm(new);
++ gen_callback(new, setclid);
+ add_to_unconfirmed(new, strhashval);
+ } else if (!cmp_verf(&conf->cl_confirm, &unconf->cl_confirm)) {
+ /*
+@@ -519,6 +632,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp
+ copy_cred(&new->cl_cred,&rqstp->rq_cred);
+ gen_clid(new);
+ gen_confirm(new);
++ gen_callback(new, setclid);
+ add_to_unconfirmed(new, strhashval);
+ } else {
+ /* No cases hit !!! */
+@@ -529,7 +643,6 @@ nfsd4_setclientid(struct svc_rqst *rqstp
+ setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
+ setclid->se_clientid.cl_id = new->cl_clientid.cl_id;
+ memcpy(setclid->se_confirm.data, new->cl_confirm.data, sizeof(setclid->se_confirm.data));
+- printk(KERN_INFO "NFSD: this client will not receive delegations\n");
+ status = nfs_ok;
+ out:
+ nfs4_unlock_state();
+@@ -575,7 +688,7 @@ nfsd4_setclientid_confirm(struct svc_rqs
+ * not been found.
+ */
+ if (clp->cl_addr != ip_addr) {
+- printk("NFSD: setclientid: string in use by client"
++ dprintk("NFSD: setclientid: string in use by client"
+ "(clientid %08x/%08x)\n",
+ clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id);
+ goto out;
+@@ -588,7 +701,7 @@ nfsd4_setclientid_confirm(struct svc_rqs
+ continue;
+ status = nfserr_inval;
+ if (clp->cl_addr != ip_addr) {
+- printk("NFSD: setclientid: string in use by client"
++ dprintk("NFSD: setclientid: string in use by client"
+ "(clientid %08x/%08x)\n",
+ clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id);
+ goto out;
+@@ -610,6 +723,7 @@ nfsd4_setclientid_confirm(struct svc_rqs
+ status = nfserr_clid_inuse;
+ else {
+ expire_client(conf);
++ clp = unconf;
+ move_to_confirmed(unconf, idhashval);
+ status = nfs_ok;
+ }
+@@ -627,6 +741,7 @@ nfsd4_setclientid_confirm(struct svc_rqs
+ if (!cmp_creds(&conf->cl_cred,&rqstp->rq_cred)) {
+ status = nfserr_clid_inuse;
+ } else {
++ clp = conf;
+ status = nfs_ok;
+ }
+ goto out;
+@@ -641,6 +756,7 @@ nfsd4_setclientid_confirm(struct svc_rqs
+ status = nfserr_clid_inuse;
+ } else {
+ status = nfs_ok;
++ clp = unconf;
+ move_to_confirmed(unconf, idhashval);
+ }
+ goto out;
+@@ -660,7 +776,9 @@ nfsd4_setclientid_confirm(struct svc_rqs
+ status = nfserr_inval;
+ goto out;
+ out:
+- /* XXX if status == nfs_ok, probe callback path */
++ if (!status)
++ nfsd4_probe_callback(clp);
++
+ nfs4_unlock_state();
+ return status;
+ }
+@@ -1510,10 +1628,12 @@ nfs4_preprocess_seqid_op(struct svc_fh *
+
+ status = nfserr_bad_stateid;
+
+- /* for new lock stateowners, check that the lock->v.new.open_stateid
+- * refers to an open stateowner, and that the lockclid
+- * (nfs4_lock->v.new.clientid) is the same as the
+- * open_stateid->st_stateowner->so_client->clientid
++ /* for new lock stateowners:
++ * check that the lock->v.new.open_stateid
++ * refers to an open stateowner
++ *
++ * check that the lockclid (nfs4_lock->v.new.clientid) is the same
++ * as the open_stateid->st_stateowner->so_client->clientid
+ */
+ if (lockclid) {
+ struct nfs4_stateowner *sop = stp->st_stateowner;
+@@ -1599,6 +1719,17 @@ check_replay:
+ }
+
+ /*
++ * eventually, this will perform an upcall to the 'state daemon' as well as
++ * set the cl_first_state field.
++ */
++void
++first_state(struct nfs4_client *clp)
++{
++ if (!clp->cl_first_state)
++ clp->cl_first_state = get_seconds();
++}
++
++/*
+ * nfs4_unlock_state(); called in encode
+ */
+ int
+@@ -1635,6 +1766,7 @@ nfsd4_open_confirm(struct svc_rqst *rqst
+ stp->st_stateid.si_fileid,
+ stp->st_stateid.si_generation);
+ status = nfs_ok;
++ first_state(sop->so_client);
+ out:
+ return status;
+ }
+@@ -1850,6 +1982,21 @@ nfs4_set_lock_denied(struct file_lock *f
+ deny->ld_type = NFS4_WRITE_LT;
+ }
+
++static struct nfs4_stateowner *
++find_lockstateowner(struct xdr_netobj *owner, clientid_t *clid)
++{
++ struct nfs4_stateowner *local = NULL;
++ int i;
++
++ for (i = 0; i < LOCK_HASH_SIZE; i++) {
++ list_for_each_entry(local, &lock_ownerid_hashtbl[i], so_idhash) {
++ if(!cmp_owner_str(local, owner, clid))
++ continue;
++ return local;
++ }
++ }
++ return NULL;
++}
+
+ static int
+ find_lockstateowner_str(unsigned int hashval, struct xdr_netobj *owner, clientid_t *clid, struct nfs4_stateowner **op) {
+@@ -1969,7 +2116,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struc
+
+ if (nfs4_in_grace() && !lock->lk_reclaim)
+ return nfserr_grace;
+- if (nfs4_in_no_grace() && lock->lk_reclaim)
++ if (!nfs4_in_grace() && lock->lk_reclaim)
+ return nfserr_no_grace;
+
+ if (check_lock_length(lock->lk_offset, lock->lk_length))
+@@ -1992,7 +2139,11 @@ nfsd4_lock(struct svc_rqst *rqstp, struc
+ printk("NFSD: nfsd4_lock: clientid is stale!\n");
+ goto out;
+ }
+- /* does the clientid in the lock owner own the open stateid? */
++
++ /* is the new lock seqid presented by the client zero? */
++ status = nfserr_bad_seqid;
++ if (lock->v.new.lock_seqid != 0)
++ goto out;
+
+ /* validate and update open stateid and open seqid */
+ status = nfs4_preprocess_seqid_op(current_fh,
+@@ -2011,15 +2162,15 @@ nfsd4_lock(struct svc_rqst *rqstp, struc
+ strhashval = lock_ownerstr_hashval(fp->fi_inode,
+ open_sop->so_client->cl_clientid.cl_id,
+ lock->v.new.owner);
+-
+ /*
+ * If we already have this lock owner, the client is in
+ * error (or our bookeeping is wrong!)
+ * for asking for a 'new lock'.
+ */
+ status = nfserr_bad_stateid;
+- if (find_lockstateowner_str(strhashval, &lock->v.new.owner,
+- &lock->v.new.clientid, &lock_sop))
++ lock_sop = find_lockstateowner(&lock->v.new.owner,
++ &lock->v.new.clientid);
++ if (lock_sop)
+ goto out;
+ status = nfserr_resource;
+ if (!(lock->lk_stateowner = alloc_init_lock_stateowner(strhashval, open_sop->so_client, open_stp, lock)))
+@@ -2315,7 +2466,7 @@ nfsd4_release_lockowner(struct svc_rqst
+ clientid_t *clid = &rlockowner->rl_clientid;
+ struct nfs4_stateowner *local = NULL;
+ struct xdr_netobj *owner = &rlockowner->rl_owner;
+- int status, i;
++ int status;
+
+ dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n",
+ clid->cl_boot, clid->cl_id);
+@@ -2330,34 +2481,136 @@ nfsd4_release_lockowner(struct svc_rqst
+
+ nfs4_lock_state();
+
+- /* find the lockowner */
+ status = nfs_ok;
+- for (i=0; i < LOCK_HASH_SIZE; i++)
+- list_for_each_entry(local, &lock_ownerstr_hashtbl[i], so_strhash)
+- if(cmp_owner_str(local, owner, clid)) {
+- struct nfs4_stateid *stp;
+-
+- /* check for any locks held by any stateid
+- * associated with the (lock) stateowner */
+- status = nfserr_locks_held;
+- list_for_each_entry(stp, &local->so_perfilestate,
+- st_perfilestate) {
+- if(stp->st_vfs_set) {
+- if (check_for_locks(&stp->st_vfs_file,
+- local))
+- goto out;
+- }
+- }
+- /* no locks held by (lock) stateowner */
+- status = nfs_ok;
+- release_stateowner(local);
+- goto out;
++ local = find_lockstateowner(owner, clid);
++ if (local) {
++ struct nfs4_stateid *stp;
++
++ /* check for any locks held by any stateid
++ * associated with the (lock) stateowner */
++ status = nfserr_locks_held;
++ list_for_each_entry(stp, &local->so_perfilestate,
++ st_perfilestate) {
++ if(stp->st_vfs_set) {
++ if (check_for_locks(&stp->st_vfs_file, local))
++ goto out;
+ }
++ }
++ /* no locks held by (lock) stateowner */
++ status = nfs_ok;
++ release_stateowner(local);
++ }
+ out:
+ nfs4_unlock_state();
+ return status;
+ }
+
++static inline struct nfs4_client_reclaim *
++alloc_reclaim(int namelen)
++{
++ struct nfs4_client_reclaim *crp = NULL;
++
++ crp = kmalloc(sizeof(struct nfs4_client_reclaim), GFP_KERNEL);
++ if (!crp)
++ return NULL;
++ crp->cr_name.data = kmalloc(namelen, GFP_KERNEL);
++ if (!crp->cr_name.data) {
++ kfree(crp);
++ return NULL;
++ }
++ return crp;
++}
++
++/*
++ * failure => all reset bets are off, nfserr_no_grace...
++ */
++static int
++nfs4_client_to_reclaim(struct nfs4_client *clp)
++{
++ unsigned int strhashval;
++ struct nfs4_client_reclaim *crp = NULL;
++
++ crp = alloc_reclaim(clp->cl_name.len);
++ if (!crp)
++ return 0;
++ strhashval = clientstr_hashval(clp->cl_name.data, clp->cl_name.len);
++ INIT_LIST_HEAD(&crp->cr_strhash);
++ list_add(&crp->cr_strhash, &reclaim_str_hashtbl[strhashval]);
++ memcpy(crp->cr_name.data, clp->cl_name.data, clp->cl_name.len);
++ crp->cr_name.len = clp->cl_name.len;
++ crp->cr_first_state = clp->cl_first_state;
++ crp->cr_expired = 0;
++ return 1;
++}
++
++static void
++nfs4_release_reclaim(void)
++{
++ struct nfs4_client_reclaim *crp = NULL;
++ int i;
++
++ BUG_ON(!nfs4_reclaim_init);
++ for (i = 0; i < CLIENT_HASH_SIZE; i++) {
++ while (!list_empty(&reclaim_str_hashtbl[i])) {
++ crp = list_entry(reclaim_str_hashtbl[i].next,
++ struct nfs4_client_reclaim, cr_strhash);
++ list_del(&crp->cr_strhash);
++ kfree(crp->cr_name.data);
++ kfree(crp);
++ reclaim_str_hashtbl_size--;
++ }
++ }
++ BUG_ON(reclaim_str_hashtbl_size);
++}
++
++/*
++ * called from OPEN, CLAIM_PREVIOUS with a new clientid. */
++struct nfs4_client_reclaim *
++nfs4_find_reclaim_client(clientid_t *clid)
++{
++ unsigned int idhashval = clientid_hashval(clid->cl_id);
++ unsigned int strhashval;
++ struct nfs4_client *clp, *client = NULL;
++ struct nfs4_client_reclaim *crp = NULL;
++
++
++ /* find clientid in conf_id_hashtbl */
++ list_for_each_entry(clp, &conf_id_hashtbl[idhashval], cl_idhash) {
++ if (cmp_clid(&clp->cl_clientid, clid)) {
++ client = clp;
++ break;
++ }
++ }
++ if (!client)
++ return NULL;
++
++ /* find clp->cl_name in reclaim_str_hashtbl */
++ strhashval = clientstr_hashval(client->cl_name.data,
++ client->cl_name.len);
++ list_for_each_entry(crp, &reclaim_str_hashtbl[strhashval], cr_strhash) {
++ if(cmp_name(&crp->cr_name, &client->cl_name)) {
++ return crp;
++ }
++ }
++ return NULL;
++}
++
++/*
++ * Called from OPEN. Look for clientid in reclaim list.
++ */
++int
++nfs4_check_open_reclaim(clientid_t *clid)
++{
++ struct nfs4_client_reclaim *crp;
++
++ if ((crp = nfs4_find_reclaim_client(clid)) == NULL)
++ return nfserr_reclaim_bad;
++ if (crp->cr_expired)
++ return nfserr_no_grace;
++ return nfs_ok;
++}
++
++
+ /*
+ * Start and stop routines
+ */
+@@ -2366,10 +2619,16 @@ void
+ nfs4_state_init(void)
+ {
+ int i;
+- time_t start = get_seconds();
++ time_t grace_time;
+
+ if (nfs4_init)
+ return;
++ if (!nfs4_reclaim_init) {
++ for (i = 0; i < CLIENT_HASH_SIZE; i++)
++ INIT_LIST_HEAD(&reclaim_str_hashtbl[i]);
++ reclaim_str_hashtbl_size = 0;
++ nfs4_reclaim_init = 1;
++ }
+ for (i = 0; i < CLIENT_HASH_SIZE; i++) {
+ INIT_LIST_HEAD(&conf_id_hashtbl[i]);
+ INIT_LIST_HEAD(&conf_str_hashtbl[i]);
+@@ -2396,27 +2655,36 @@ nfs4_state_init(void)
+
+ INIT_LIST_HEAD(&close_lru);
+ INIT_LIST_HEAD(&client_lru);
+- init_MUTEX(&client_sema);
+- boot_time = start;
+- grace_end = start + NFSD_LEASE_TIME;
++ boot_time = get_seconds();
++ grace_time = max(old_lease_time, lease_time);
++ if (reclaim_str_hashtbl_size == 0)
++ grace_time = 0;
++ if (grace_time)
++ printk("NFSD: starting %ld-second grace period\n", grace_time);
++ grace_end = boot_time + grace_time;
+ INIT_WORK(&laundromat_work,laundromat_main, NULL);
+ schedule_delayed_work(&laundromat_work, NFSD_LEASE_TIME*HZ);
+ nfs4_init = 1;
+-
+ }
+
+ int
+ nfs4_in_grace(void)
+ {
+- return time_before(get_seconds(), (unsigned long)grace_end);
++ return get_seconds() < grace_end;
+ }
+
+-int
+-nfs4_in_no_grace(void)
++void
++set_no_grace(void)
+ {
+- return (grace_end < get_seconds());
++ printk("NFSD: ERROR in reboot recovery. State reclaims will fail.\n");
++ grace_end = get_seconds();
+ }
+
++time_t
++nfs4_lease_time(void)
++{
++ return lease_time;
++}
+
+ static void
+ __nfs4_state_shutdown(void)
+@@ -2454,6 +2722,61 @@ void
+ nfs4_state_shutdown(void)
+ {
+ nfs4_lock_state();
++ nfs4_release_reclaim();
+ __nfs4_state_shutdown();
+ nfs4_unlock_state();
+ }
++
++/*
++ * Called when leasetime is changed.
++ *
++ * if nfsd is not started, simply set the global lease.
++ *
++ * if nfsd(s) are running, lease change requires nfsv4 state to be reset.
++ * e.g: boot_time is reset, existing nfs4_client structs are
++ * used to fill reclaim_str_hashtbl, then all state (except for the
++ * reclaim_str_hashtbl) is re-initialized.
++ *
++ * if the old lease time is greater than the new lease time, the grace
++ * period needs to be set to the old lease time to allow clients to reclaim
++ * their state. XXX - we may want to set the grace period == lease time
++ * after an initial grace period == old lease time
++ *
++ * if an error occurs in this process, the new lease is set, but the server
++ * will not honor OPEN or LOCK reclaims, and will return nfserr_no_grace
++ * which means OPEN/LOCK/READ/WRITE will fail during grace period.
++ *
++ * clients will attempt to reset all state with SETCLIENTID/CONFIRM, and
++ * OPEN and LOCK reclaims.
++ */
++void
++nfs4_reset_lease(time_t leasetime)
++{
++ struct nfs4_client *clp;
++ int i;
++
++ printk("NFSD: New leasetime %ld\n",leasetime);
++ if (!nfs4_init)
++ return;
++ nfs4_lock_state();
++ old_lease_time = lease_time;
++ lease_time = leasetime;
++
++ nfs4_release_reclaim();
++
++ /* populate reclaim_str_hashtbl with current confirmed nfs4_clientid */
++ for (i = 0; i < CLIENT_HASH_SIZE; i++) {
++ list_for_each_entry(clp, &conf_id_hashtbl[i], cl_idhash) {
++ if (!nfs4_client_to_reclaim(clp)) {
++ nfs4_release_reclaim();
++ goto init_state;
++ }
++ reclaim_str_hashtbl_size++;
++ }
++ }
++init_state:
++ __nfs4_state_shutdown();
++ nfs4_state_init();
++ nfs4_unlock_state();
++}
++
+--- linux-2.6.7/fs/nfsd/vfs.c.lsec 2004-06-15 23:19:13.000000000 -0600
++++ linux-2.6.7/fs/nfsd/vfs.c 2005-03-23 14:28:24.520322040 -0700
+@@ -44,6 +44,16 @@
+ #include <linux/nfsd/nfsfh.h>
+ #include <linux/quotaops.h>
+ #include <linux/dnotify.h>
++#ifdef CONFIG_NFSD_V4
++#include <linux/posix_acl.h>
++#include <linux/posix_acl_xattr.h>
++#include <linux/xattr_acl.h>
++#include <linux/xattr.h>
++#include <linux/nfs4.h>
++#include <linux/nfs4_acl.h>
++#include <linux/nfsd_idmap.h>
++#include <linux/security.h>
++#endif /* CONFIG_NFSD_V4 */
+
+ #include <asm/uaccess.h>
+
+@@ -344,6 +354,177 @@ out_nfserr:
+ goto out;
+ }
+
++#if defined(CONFIG_NFSD_V4)
++
++static int
++set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key)
++{
++ int len;
++ size_t buflen;
++ char *buf = NULL;
++ int error = 0;
++ struct inode *inode = dentry->d_inode;
++
++ buflen = posix_acl_xattr_size(pacl->a_count);
++ buf = kmalloc(buflen, GFP_KERNEL);
++ error = -ENOMEM;
++ if (buf == NULL)
++ goto out;
++
++ len = posix_acl_to_xattr(pacl, buf, buflen);
++ if (len < 0) {
++ error = len;
++ goto out;
++ }
++
++ error = -EOPNOTSUPP;
++ if (inode->i_op && inode->i_op->setxattr) {
++ down(&inode->i_sem);
++		error = security_inode_setxattr(dentry, key, buf, len, 0);
++		if (!error) error = inode->i_op->setxattr(dentry, key, buf, len, 0);
++ if (!error)
++ security_inode_post_setxattr(dentry, key, buf, len, 0);
++ up(&inode->i_sem);
++ }
++out:
++ kfree(buf);
++ return (error);
++}
++
++int
++nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
++ struct nfs4_acl *acl)
++{
++ int error;
++ struct dentry *dentry;
++ struct inode *inode;
++ struct posix_acl *pacl = NULL, *dpacl = NULL;
++ unsigned int flags = 0;
++
++ /* Get inode */
++ error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, MAY_SATTR);
++ if (error)
++ goto out;
++
++ dentry = fhp->fh_dentry;
++ inode = dentry->d_inode;
++ if (S_ISDIR(inode->i_mode))
++ flags = NFS4_ACL_DIR;
++
++ error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags);
++ if (error < 0)
++ goto out_nfserr;
++
++ if (pacl) {
++ error = set_nfsv4_acl_one(dentry, pacl, XATTR_NAME_ACL_ACCESS);
++ if (error < 0)
++ goto out_nfserr;
++ }
++
++ if (dpacl) {
++ error = set_nfsv4_acl_one(dentry, dpacl, XATTR_NAME_ACL_DEFAULT);
++ if (error < 0)
++ goto out_nfserr;
++ }
++
++ error = nfs_ok;
++
++out:
++ posix_acl_release(pacl);
++ posix_acl_release(dpacl);
++ return (error);
++out_nfserr:
++ error = nfserrno(error);
++ goto out;
++}
++
++static struct posix_acl *
++_get_posix_acl(struct dentry *dentry, char *key)
++{
++ struct inode *inode = dentry->d_inode;
++ char *buf = NULL;
++ int buflen, error = 0;
++ struct posix_acl *pacl = NULL;
++
++ down(&inode->i_sem);
++
++	error = -EOPNOTSUPP;
++	if (!inode->i_op || !inode->i_op->getxattr)
++		goto out_sem;
++	error = security_inode_getxattr(dentry, key);
++	if (error)
++		goto out_sem;
++
++	buflen = inode->i_op->getxattr(dentry, key, NULL, 0);
++	if (buflen <= 0) {
++		error = buflen < 0 ? buflen : -ENODATA;
++		goto out_sem;
++	}
++
++	buf = kmalloc(buflen, GFP_KERNEL);
++	if (buf == NULL) {
++		error = -ENOMEM;
++		goto out_sem;
++	}
++	error = inode->i_op->getxattr(dentry, key, buf, buflen);
++ if (error < 0)
++ goto out_sem;
++
++ error = 0;
++ up(&inode->i_sem);
++
++ pacl = posix_acl_from_xattr(buf, buflen);
++ out:
++ kfree(buf);
++ return pacl;
++ out_sem:
++ up(&inode->i_sem);
++ pacl = ERR_PTR(error);
++ goto out;
++}
++
++int
++nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_acl **acl)
++{
++ struct inode *inode = dentry->d_inode;
++ int error = 0;
++ struct posix_acl *pacl = NULL, *dpacl = NULL;
++ unsigned int flags = 0;
++
++ pacl = _get_posix_acl(dentry, XATTR_NAME_ACL_ACCESS);
++ if (IS_ERR(pacl) && PTR_ERR(pacl) == -ENODATA)
++ pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
++ if (IS_ERR(pacl)) {
++ error = PTR_ERR(pacl);
++ pacl = NULL;
++ goto out;
++ }
++
++ if (S_ISDIR(inode->i_mode)) {
++ dpacl = _get_posix_acl(dentry, XATTR_NAME_ACL_DEFAULT);
++ if (IS_ERR(dpacl) && PTR_ERR(dpacl) == -ENODATA)
++ dpacl = NULL;
++ else if (IS_ERR(dpacl)) {
++ error = PTR_ERR(dpacl);
++ dpacl = NULL;
++ goto out;
++ }
++ flags = NFS4_ACL_DIR;
++ }
++
++ *acl = nfs4_acl_posix_to_nfsv4(pacl, dpacl, flags);
++ if (IS_ERR(*acl)) {
++ error = PTR_ERR(*acl);
++ *acl = NULL;
++ }
++ out:
++ posix_acl_release(pacl);
++ posix_acl_release(dpacl);
++ return error;
++}
++
++#endif /* defined(CONFIG_NFSD_V4) */
++
+ #ifdef CONFIG_NFSD_V3
+ /*
+ * Check server access rights to a file system object
+--- linux-2.6.7/fs/nfsd/nfs4idmap.c.lsec 2004-06-15 23:19:43.000000000 -0600
++++ linux-2.6.7/fs/nfsd/nfs4idmap.c 2005-03-23 14:28:24.687296656 -0700
+@@ -78,9 +78,9 @@ struct ent {
+
+ #define DefineSimpleCacheLookupMap(STRUCT, FUNC) \
+ DefineCacheLookup(struct STRUCT, h, FUNC##_lookup, \
+- (struct STRUCT *item, int set), /*no setup */, \
++ (struct STRUCT *item, int set), \
+ & FUNC##_cache, FUNC##_hash(item), FUNC##_match(item, tmp), \
+- STRUCT##_init(new, item), STRUCT##_update(tmp, item), 0)
++ STRUCT##_init(new, item), STRUCT##_update(tmp, item))
+
+ /* Common entry handling */
+
+--- linux-2.6.7/fs/nfsd/nfs4acl.c.lsec 2005-03-23 14:28:24.463330704 -0700
++++ linux-2.6.7/fs/nfsd/nfs4acl.c 2005-03-23 14:28:24.463330704 -0700
+@@ -0,0 +1,974 @@
++/*
++ * fs/nfs4acl/acl.c
++ *
++ * Common NFSv4 ACL handling code.
++ *
++ * Copyright (c) 2002, 2003 The Regents of the University of Michigan.
++ * All rights reserved.
++ *
++ * Marius Aamodt Eriksen <marius@umich.edu>
++ * Jeff Sedlak <jsedlak@umich.edu>
++ * J. Bruce Fields <bfields@umich.edu>
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ * 2. Redistributions in binary form must reproduce the above copyright
++ * notice, this list of conditions and the following disclaimer in the
++ * documentation and/or other materials provided with the distribution.
++ * 3. Neither the name of the University nor the names of its
++ * contributors may be used to endorse or promote products derived
++ * from this software without specific prior written permission.
++ *
++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
++
++#include <linux/string.h>
++#include <linux/slab.h>
++#include <linux/list.h>
++#include <linux/types.h>
++#include <linux/fs.h>
++#include <linux/module.h>
++#include <linux/nfs_fs.h>
++#include <linux/posix_acl.h>
++#include <linux/nfs4.h>
++#include <linux/nfs4_acl.h>
++
++
++/* mode bit translations: */
++#define NFS4_READ_MODE (NFS4_ACE_READ_DATA | NFS4_ACE_READ_NAMED_ATTRS)
++#define NFS4_WRITE_MODE (NFS4_ACE_WRITE_DATA | NFS4_ACE_WRITE_NAMED_ATTRS | NFS4_ACE_APPEND_DATA)
++#define NFS4_EXECUTE_MODE NFS4_ACE_EXECUTE
++#define NFS4_ANYONE_MODE (NFS4_ACE_READ_ATTRIBUTES | NFS4_ACE_READ_ACL | NFS4_ACE_SYNCHRONIZE)
++#define NFS4_OWNER_MODE (NFS4_ACE_WRITE_ATTRIBUTES | NFS4_ACE_WRITE_ACL)
++
++/* flags used to simulate posix default ACLs */
++#define NFS4_INHERITANCE_FLAGS (NFS4_ACE_FILE_INHERIT_ACE \
++ | NFS4_ACE_DIRECTORY_INHERIT_ACE | NFS4_ACE_INHERIT_ONLY_ACE)
++
++#define MASK_EQUAL(mask1, mask2) \
++ ( ((mask1) & NFS4_ACE_MASK_ALL) == ((mask2) & NFS4_ACE_MASK_ALL) )
++
++static u32
++mask_from_posix(unsigned short perm, unsigned int flags)
++{
++ int mask = NFS4_ANYONE_MODE;
++
++ if (flags & NFS4_ACL_OWNER)
++ mask |= NFS4_OWNER_MODE;
++ if (perm & ACL_READ)
++ mask |= NFS4_READ_MODE;
++ if (perm & ACL_WRITE)
++ mask |= NFS4_WRITE_MODE;
++ if ((perm & ACL_WRITE) && (flags & NFS4_ACL_DIR))
++ mask |= NFS4_ACE_DELETE_CHILD;
++ if (perm & ACL_EXECUTE)
++ mask |= NFS4_EXECUTE_MODE;
++ return mask;
++}
++
++static u32
++deny_mask(u32 allow_mask, unsigned int flags)
++{
++ u32 ret = ~allow_mask & ~NFS4_ACE_DELETE;
++ if (!(flags & NFS4_ACL_DIR))
++ ret &= ~NFS4_ACE_DELETE_CHILD;
++ return ret;
++}
++
++static int
++mode_from_nfs4(u32 perm, unsigned short *mode, unsigned int flags)
++{
++ u32 ignore = 0;
++
++ if (!(flags & NFS4_ACL_DIR))
++ ignore |= NFS4_ACE_DELETE_CHILD; /* ignore it */
++ perm |= ignore;
++ *mode = 0;
++ if ((perm & NFS4_READ_MODE) == NFS4_READ_MODE)
++ *mode |= ACL_READ;
++ if ((perm & NFS4_WRITE_MODE) == NFS4_WRITE_MODE)
++ *mode |= ACL_WRITE;
++ if ((perm & NFS4_EXECUTE_MODE) == NFS4_EXECUTE_MODE)
++ *mode |= ACL_EXECUTE;
++ if (!MASK_EQUAL(perm, ignore|mask_from_posix(*mode, flags)))
++ return -EINVAL;
++ return 0;
++}
++
++struct ace_container {
++ struct nfs4_ace *ace;
++ struct list_head ace_l;
++};
++
++static short ace2type(struct nfs4_ace *);
++static int _posix_to_nfsv4_one(struct posix_acl *, struct nfs4_acl *, unsigned int);
++static struct posix_acl *_nfsv4_to_posix_one(struct nfs4_acl *, unsigned int);
++int nfs4_acl_add_ace(struct nfs4_acl *, u32, u32, u32, int, uid_t);
++int nfs4_acl_split(struct nfs4_acl *, struct nfs4_acl *);
++
++struct nfs4_acl *
++nfs4_acl_posix_to_nfsv4(struct posix_acl *pacl, struct posix_acl *dpacl,
++ unsigned int flags)
++{
++ struct nfs4_acl *acl;
++ int error = -EINVAL;
++
++ if ((pacl != NULL &&
++ (posix_acl_valid(pacl) < 0 || pacl->a_count == 0)) ||
++ (dpacl != NULL &&
++ (posix_acl_valid(dpacl) < 0 || dpacl->a_count == 0)))
++ goto out_err;
++
++ acl = nfs4_acl_new();
++ if (acl == NULL) {
++ error = -ENOMEM;
++ goto out_err;
++ }
++
++ if (pacl != NULL) {
++ error = _posix_to_nfsv4_one(pacl, acl,
++ flags & ~NFS4_ACL_TYPE_DEFAULT);
++ if (error < 0)
++ goto out_acl;
++ }
++
++ if (dpacl != NULL) {
++ error = _posix_to_nfsv4_one(dpacl, acl,
++ flags | NFS4_ACL_TYPE_DEFAULT);
++ if (error < 0)
++ goto out_acl;
++ }
++
++ return acl;
++
++out_acl:
++ nfs4_acl_free(acl);
++out_err:
++ acl = ERR_PTR(error);
++
++ return acl;
++}
++
++static int
++nfs4_acl_add_pair(struct nfs4_acl *acl, int eflag, u32 mask, int whotype,
++ uid_t owner, unsigned int flags)
++{
++ int error;
++
++ error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE,
++ eflag, mask, whotype, owner);
++ if (error < 0)
++ return error;
++ error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE,
++ eflag, deny_mask(mask, flags), whotype, owner);
++ return error;
++}
++
++/* We assume the acl has been verified with posix_acl_valid. */
++static int
++_posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl,
++ unsigned int flags)
++{
++ struct posix_acl_entry *pa, *pe, *group_owner_entry;
++ int error = -EINVAL;
++ u32 mask, mask_mask;
++ int eflag = ((flags & NFS4_ACL_TYPE_DEFAULT) ?
++ NFS4_INHERITANCE_FLAGS : 0);
++
++ BUG_ON(pacl->a_count < 3);
++ pe = pacl->a_entries + pacl->a_count;
++ pa = pe - 2; /* if mask entry exists, it's second from the last. */
++ if (pa->e_tag == ACL_MASK)
++ mask_mask = deny_mask(mask_from_posix(pa->e_perm, flags), flags);
++ else
++ mask_mask = 0;
++
++ pa = pacl->a_entries;
++ BUG_ON(pa->e_tag != ACL_USER_OBJ);
++ mask = mask_from_posix(pa->e_perm, flags | NFS4_ACL_OWNER);
++ error = nfs4_acl_add_pair(acl, eflag, mask, NFS4_ACL_WHO_OWNER, 0, flags);
++ if (error < 0)
++ goto out;
++ pa++;
++
++ while (pa->e_tag == ACL_USER) {
++ mask = mask_from_posix(pa->e_perm, flags);
++ error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE,
++ eflag, mask_mask, NFS4_ACL_WHO_NAMED, pa->e_id);
++ if (error < 0)
++ goto out;
++
++
++ error = nfs4_acl_add_pair(acl, eflag, mask,
++ NFS4_ACL_WHO_NAMED, pa->e_id, flags);
++ if (error < 0)
++ goto out;
++ pa++;
++ }
++
++ /* In the case of groups, we apply allow ACEs first, then deny ACEs,
++ * since a user can be in more than one group. */
++
++ /* allow ACEs */
++
++ if (pacl->a_count > 3) {
++ BUG_ON(pa->e_tag != ACL_GROUP_OBJ);
++ error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE,
++ NFS4_ACE_IDENTIFIER_GROUP | eflag, mask_mask,
++ NFS4_ACL_WHO_GROUP, 0);
++ if (error < 0)
++ goto out;
++ }
++ group_owner_entry = pa;
++ mask = mask_from_posix(pa->e_perm, flags);
++ error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE,
++ NFS4_ACE_IDENTIFIER_GROUP | eflag, mask,
++ NFS4_ACL_WHO_GROUP, 0);
++ if (error < 0)
++ goto out;
++ pa++;
++
++ while (pa->e_tag == ACL_GROUP) {
++ mask = mask_from_posix(pa->e_perm, flags);
++ error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE,
++ NFS4_ACE_IDENTIFIER_GROUP | eflag, mask_mask,
++ NFS4_ACL_WHO_NAMED, pa->e_id);
++ if (error < 0)
++ goto out;
++
++ error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE,
++ NFS4_ACE_IDENTIFIER_GROUP | eflag, mask,
++ NFS4_ACL_WHO_NAMED, pa->e_id);
++ if (error < 0)
++ goto out;
++ pa++;
++ }
++
++ /* deny ACEs */
++
++ pa = group_owner_entry;
++ mask = mask_from_posix(pa->e_perm, flags);
++ error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE,
++ NFS4_ACE_IDENTIFIER_GROUP | eflag,
++ deny_mask(mask, flags), NFS4_ACL_WHO_GROUP, 0);
++ if (error < 0)
++ goto out;
++ pa++;
++ while (pa->e_tag == ACL_GROUP) {
++ mask = mask_from_posix(pa->e_perm, flags);
++ error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE,
++ NFS4_ACE_IDENTIFIER_GROUP | eflag,
++ deny_mask(mask, flags), NFS4_ACL_WHO_NAMED, pa->e_id);
++ if (error < 0)
++ goto out;
++ pa++;
++ }
++
++ if (pa->e_tag == ACL_MASK)
++ pa++;
++ BUG_ON(pa->e_tag != ACL_OTHER);
++ mask = mask_from_posix(pa->e_perm, flags);
++ error = nfs4_acl_add_pair(acl, eflag, mask, NFS4_ACL_WHO_EVERYONE, 0, flags);
++
++out:
++ return error;
++}
++
++static void
++sort_pacl_range(struct posix_acl *pacl, int start, int end) {
++ int sorted = 0, i;
++ struct posix_acl_entry tmp;
++
++ /* We just do a bubble sort; easy to do in place, and we're not
++ * expecting acl's to be long enough to justify anything more. */
++ while (!sorted) {
++ sorted = 1;
++ for (i = start; i < end; i++) {
++ if (pacl->a_entries[i].e_id
++ > pacl->a_entries[i+1].e_id) {
++ sorted = 0;
++ tmp = pacl->a_entries[i];
++ pacl->a_entries[i] = pacl->a_entries[i+1];
++ pacl->a_entries[i+1] = tmp;
++ }
++ }
++ }
++}
++
++static void
++sort_pacl(struct posix_acl *pacl)
++{
++ /* posix_acl_valid requires that users and groups be in order
++ * by uid/gid. */
++ int i, j;
++
++ if (pacl->a_count <= 4)
++ return; /* no users or groups */
++ i = 1;
++ while (pacl->a_entries[i].e_tag == ACL_USER)
++ i++;
++ sort_pacl_range(pacl, 1, i-1);
++
++ BUG_ON(pacl->a_entries[i].e_tag != ACL_GROUP_OBJ);
++	j = ++i;
++ while (pacl->a_entries[j].e_tag == ACL_GROUP)
++ j++;
++ sort_pacl_range(pacl, i, j-1);
++ return;
++}
++
++static int
++write_pace(struct nfs4_ace *ace, struct posix_acl *pacl,
++ struct posix_acl_entry **pace, short tag, unsigned int flags)
++{
++ struct posix_acl_entry *this = *pace;
++
++ if (*pace == pacl->a_entries + pacl->a_count)
++ return -EINVAL; /* fell off the end */
++ (*pace)++;
++ this->e_tag = tag;
++ if (tag == ACL_USER_OBJ)
++ flags |= NFS4_ACL_OWNER;
++ if (mode_from_nfs4(ace->access_mask, &this->e_perm, flags))
++ return -EINVAL;
++ this->e_id = (tag == ACL_USER || tag == ACL_GROUP ?
++ ace->who : ACL_UNDEFINED_ID);
++ return 0;
++}
++
++static struct nfs4_ace *
++get_next_v4_ace(struct list_head **p, struct list_head *head)
++{
++ struct nfs4_ace *ace;
++
++ *p = (*p)->next;
++ if (*p == head)
++ return NULL;
++ ace = list_entry(*p, struct nfs4_ace, l_ace);
++
++ return ace;
++}
++
++int
++nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, struct posix_acl **pacl,
++ struct posix_acl **dpacl, unsigned int flags)
++{
++ struct nfs4_acl *dacl;
++ int error = -ENOMEM;
++
++ *pacl = NULL;
++ *dpacl = NULL;
++
++ dacl = nfs4_acl_new();
++ if (dacl == NULL)
++ goto out;
++
++ error = nfs4_acl_split(acl, dacl);
++ if (error < 0)
++ goto out_acl;
++
++ if (pacl != NULL) {
++ if (acl->naces == 0) {
++ error = -ENODATA;
++ goto try_dpacl;
++ }
++
++ *pacl = _nfsv4_to_posix_one(acl, flags);
++ if (IS_ERR(*pacl)) {
++ error = PTR_ERR(*pacl);
++ *pacl = NULL;
++ goto out_acl;
++ }
++ }
++
++try_dpacl:
++ if (dpacl != NULL) {
++ if (dacl->naces == 0) {
++ if (pacl == NULL || *pacl == NULL)
++ error = -ENODATA;
++ goto out_acl;
++ }
++
++ error = 0;
++ *dpacl = _nfsv4_to_posix_one(dacl, flags);
++ if (IS_ERR(*dpacl)) {
++ error = PTR_ERR(*dpacl);
++ *dpacl = NULL;
++ goto out_acl;
++ }
++ }
++
++out_acl:
++ if (error && pacl) {
++ posix_acl_release(*pacl);
++ *pacl = NULL;
++ }
++ nfs4_acl_free(dacl);
++out:
++ return error;
++}
++
++static int
++same_who(struct nfs4_ace *a, struct nfs4_ace *b)
++{
++ return a->whotype == b->whotype &&
++ (a->whotype != NFS4_ACL_WHO_NAMED || a->who == b->who);
++}
++
++static int
++complementary_ace_pair(struct nfs4_ace *allow, struct nfs4_ace *deny,
++ unsigned int flags)
++{
++ int ignore = 0;
++ if (!(flags & NFS4_ACL_DIR))
++ ignore |= NFS4_ACE_DELETE_CHILD;
++ return MASK_EQUAL(ignore|deny_mask(allow->access_mask, flags),
++ ignore|deny->access_mask) &&
++ allow->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE &&
++ deny->type == NFS4_ACE_ACCESS_DENIED_ACE_TYPE &&
++ allow->flag == deny->flag &&
++ same_who(allow, deny);
++}
++
++static inline int
++user_obj_from_v4(struct nfs4_acl *n4acl, struct list_head **p,
++ struct posix_acl *pacl, struct posix_acl_entry **pace,
++ unsigned int flags)
++{
++ int error = -EINVAL;
++ struct nfs4_ace *ace, *ace2;
++
++ ace = get_next_v4_ace(p, &n4acl->ace_head);
++ if (ace == NULL)
++ goto out;
++ if (ace2type(ace) != ACL_USER_OBJ)
++ goto out;
++ error = write_pace(ace, pacl, pace, ACL_USER_OBJ, flags);
++ if (error < 0)
++ goto out;
++ error = -EINVAL;
++ ace2 = get_next_v4_ace(p, &n4acl->ace_head);
++ if (ace2 == NULL)
++ goto out;
++ if (!complementary_ace_pair(ace, ace2, flags))
++ goto out;
++ error = 0;
++out:
++ return error;
++}
++
++static inline int
++users_from_v4(struct nfs4_acl *n4acl, struct list_head **p,
++ struct nfs4_ace **mask_ace,
++ struct posix_acl *pacl, struct posix_acl_entry **pace,
++ unsigned int flags)
++{
++ int error = -EINVAL;
++ struct nfs4_ace *ace, *ace2;
++
++ ace = get_next_v4_ace(p, &n4acl->ace_head);
++ if (ace == NULL)
++ goto out;
++ while (ace2type(ace) == ACL_USER) {
++ if (ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE)
++ goto out;
++ if (*mask_ace &&
++ !MASK_EQUAL(ace->access_mask, (*mask_ace)->access_mask))
++ goto out;
++ *mask_ace = ace;
++ ace = get_next_v4_ace(p, &n4acl->ace_head);
++ if (ace == NULL)
++ goto out;
++ if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE)
++ goto out;
++ error = write_pace(ace, pacl, pace, ACL_USER, flags);
++ if (error < 0)
++ goto out;
++ error = -EINVAL;
++ ace2 = get_next_v4_ace(p, &n4acl->ace_head);
++ if (ace2 == NULL)
++ goto out;
++ if (!complementary_ace_pair(ace, ace2, flags))
++ goto out;
++ if ((*mask_ace)->flag != ace2->flag ||
++ !same_who(*mask_ace, ace2))
++ goto out;
++ ace = get_next_v4_ace(p, &n4acl->ace_head);
++ if (ace == NULL)
++ goto out;
++ }
++ error = 0;
++out:
++ return error;
++}
++
++static inline int
++group_obj_and_groups_from_v4(struct nfs4_acl *n4acl, struct list_head **p,
++ struct nfs4_ace **mask_ace,
++ struct posix_acl *pacl, struct posix_acl_entry **pace,
++ unsigned int flags)
++{
++ int error = -EINVAL;
++ struct nfs4_ace *ace, *ace2;
++ struct ace_container *ac;
++ struct list_head group_l;
++
++ INIT_LIST_HEAD(&group_l);
++ ace = list_entry(*p, struct nfs4_ace, l_ace);
++
++ /* group owner (mask and allow aces) */
++
++ if (pacl->a_count != 3) {
++ /* then the group owner should be preceded by mask */
++ if (ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE)
++ goto out;
++ if (*mask_ace &&
++ !MASK_EQUAL(ace->access_mask, (*mask_ace)->access_mask))
++ goto out;
++ *mask_ace = ace;
++ ace = get_next_v4_ace(p, &n4acl->ace_head);
++ if (ace == NULL)
++ goto out;
++
++ if ((*mask_ace)->flag != ace->flag || !same_who(*mask_ace, ace))
++ goto out;
++ }
++
++ if (ace2type(ace) != ACL_GROUP_OBJ)
++ goto out;
++
++ ac = kmalloc(sizeof(*ac), GFP_KERNEL);
++ error = -ENOMEM;
++ if (ac == NULL)
++ goto out;
++ ac->ace = ace;
++ list_add_tail(&ac->ace_l, &group_l);
++
++ error = -EINVAL;
++ if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE)
++ goto out;
++
++ error = write_pace(ace, pacl, pace, ACL_GROUP_OBJ, flags);
++ if (error < 0)
++ goto out;
++
++ error = -EINVAL;
++ ace = get_next_v4_ace(p, &n4acl->ace_head);
++ if (ace == NULL)
++ goto out;
++
++ /* groups (mask and allow aces) */
++
++ while (ace2type(ace) == ACL_GROUP) {
++ if (*mask_ace == NULL)
++ goto out;
++
++ if (ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE ||
++ !MASK_EQUAL(ace->access_mask, (*mask_ace)->access_mask))
++ goto out;
++ *mask_ace = ace;
++
++ ace = get_next_v4_ace(p, &n4acl->ace_head);
++ if (ace == NULL)
++ goto out;
++ ac = kmalloc(sizeof(*ac), GFP_KERNEL);
++ error = -ENOMEM;
++ if (ac == NULL)
++ goto out;
++ error = -EINVAL;
++ if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE ||
++ !same_who(ace, *mask_ace))
++ goto out;
++
++ ac->ace = ace;
++ list_add_tail(&ac->ace_l, &group_l);
++
++ error = write_pace(ace, pacl, pace, ACL_GROUP, flags);
++ if (error < 0)
++ goto out;
++ error = -EINVAL;
++ ace = get_next_v4_ace(p, &n4acl->ace_head);
++ if (ace == NULL)
++ goto out;
++ }
++
++ /* group owner (deny ace) */
++
++ if (ace2type(ace) != ACL_GROUP_OBJ)
++ goto out;
++ ac = list_entry(group_l.next, struct ace_container, ace_l);
++ ace2 = ac->ace;
++ if (!complementary_ace_pair(ace2, ace, flags))
++ goto out;
++ list_del(group_l.next);
++ kfree(ac);
++
++ /* groups (deny aces) */
++
++ while (!list_empty(&group_l)) {
++ ace = get_next_v4_ace(p, &n4acl->ace_head);
++ if (ace == NULL)
++ goto out;
++ if (ace2type(ace) != ACL_GROUP)
++ goto out;
++ ac = list_entry(group_l.next, struct ace_container, ace_l);
++ ace2 = ac->ace;
++ if (!complementary_ace_pair(ace2, ace, flags))
++ goto out;
++ list_del(group_l.next);
++ kfree(ac);
++ }
++
++ ace = get_next_v4_ace(p, &n4acl->ace_head);
++ if (ace == NULL)
++ goto out;
++ if (ace2type(ace) != ACL_OTHER)
++ goto out;
++ error = 0;
++out:
++ while (!list_empty(&group_l)) {
++ ac = list_entry(group_l.next, struct ace_container, ace_l);
++ list_del(group_l.next);
++ kfree(ac);
++ }
++ return error;
++}
++
++static inline int
++mask_from_v4(struct nfs4_acl *n4acl, struct list_head **p,
++ struct nfs4_ace **mask_ace,
++ struct posix_acl *pacl, struct posix_acl_entry **pace,
++ unsigned int flags)
++{
++ int error = -EINVAL;
++ struct nfs4_ace *ace;
++
++ ace = list_entry(*p, struct nfs4_ace, l_ace);
++ if (pacl->a_count != 3) {
++ if (*mask_ace == NULL)
++ goto out;
++ (*mask_ace)->access_mask = deny_mask((*mask_ace)->access_mask, flags);
++		error = write_pace(*mask_ace, pacl, pace, ACL_MASK, flags);
++	} else
++		error = 0;
++out:
++ return error;
++}
++
++static inline int
++other_from_v4(struct nfs4_acl *n4acl, struct list_head **p,
++ struct posix_acl *pacl, struct posix_acl_entry **pace,
++ unsigned int flags)
++{
++ int error = -EINVAL;
++ struct nfs4_ace *ace, *ace2;
++
++ ace = list_entry(*p, struct nfs4_ace, l_ace);
++ if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE)
++ goto out;
++ error = write_pace(ace, pacl, pace, ACL_OTHER, flags);
++ if (error < 0)
++ goto out;
++ error = -EINVAL;
++ ace2 = get_next_v4_ace(p, &n4acl->ace_head);
++ if (ace2 == NULL)
++ goto out;
++ if (!complementary_ace_pair(ace, ace2, flags))
++ goto out;
++ error = 0;
++out:
++ return error;
++}
++
++static int
++calculate_posix_ace_count(struct nfs4_acl *n4acl)
++{
++ if (n4acl->naces == 6) /* owner, owner group, and other only */
++ return 3;
++ else { /* Otherwise there must be a mask entry. */
++ /* Also, the remaining entries are for named users and
++ * groups, and come in threes (mask, allow, deny): */
++ if (n4acl->naces < 7)
++ return -1;
++ if ((n4acl->naces - 7) % 3)
++ return -1;
++ return 4 + (n4acl->naces - 7)/3;
++ }
++}
++
++
++static struct posix_acl *
++_nfsv4_to_posix_one(struct nfs4_acl *n4acl, unsigned int flags)
++{
++ struct posix_acl *pacl;
++ int error = -EINVAL, nace = 0;
++ struct list_head *p;
++ struct nfs4_ace *mask_ace = NULL;
++ struct posix_acl_entry *pace;
++
++ nace = calculate_posix_ace_count(n4acl);
++ if (nace < 0)
++ goto out_err;
++
++ pacl = posix_acl_alloc(nace, GFP_KERNEL);
++ error = -ENOMEM;
++ if (pacl == NULL)
++ goto out_err;
++
++ pace = &pacl->a_entries[0];
++ p = &n4acl->ace_head;
++
++ error = user_obj_from_v4(n4acl, &p, pacl, &pace, flags);
++ if (error)
++ goto out_acl;
++
++ error = users_from_v4(n4acl, &p, &mask_ace, pacl, &pace, flags);
++ if (error)
++ goto out_acl;
++
++ error = group_obj_and_groups_from_v4(n4acl, &p, &mask_ace, pacl, &pace,
++ flags);
++ if (error)
++ goto out_acl;
++
++ error = mask_from_v4(n4acl, &p, &mask_ace, pacl, &pace, flags);
++ if (error)
++ goto out_acl;
++ error = other_from_v4(n4acl, &p, pacl, &pace, flags);
++ if (error)
++ goto out_acl;
++
++ error = -EINVAL;
++ if (p->next != &n4acl->ace_head)
++ goto out_acl;
++ if (pace != pacl->a_entries + pacl->a_count)
++ goto out_acl;
++
++ sort_pacl(pacl);
++
++ return pacl;
++out_acl:
++ posix_acl_release(pacl);
++out_err:
++ pacl = ERR_PTR(error);
++ return pacl;
++}
++
++int
++nfs4_acl_split(struct nfs4_acl *acl, struct nfs4_acl *dacl)
++{
++ struct list_head *h, *n;
++ struct nfs4_ace *ace;
++ int error = 0;
++
++ list_for_each_safe(h, n, &acl->ace_head) {
++ ace = list_entry(h, struct nfs4_ace, l_ace);
++
++ if ((ace->flag & NFS4_INHERITANCE_FLAGS)
++ != NFS4_INHERITANCE_FLAGS)
++ continue;
++
++ error = nfs4_acl_add_ace(dacl, ace->type, ace->flag,
++			ace->access_mask, ace->whotype, ace->who);
++ if (error < 0)
++ goto out;
++
++ list_del(h);
++ kfree(ace);
++ acl->naces--;
++ }
++
++out:
++ return error;
++}
++
++static short
++ace2type(struct nfs4_ace *ace)
++{
++ switch (ace->whotype) {
++ case NFS4_ACL_WHO_NAMED:
++ return (ace->flag & NFS4_ACE_IDENTIFIER_GROUP ?
++ ACL_GROUP : ACL_USER);
++ case NFS4_ACL_WHO_OWNER:
++ return ACL_USER_OBJ;
++ case NFS4_ACL_WHO_GROUP:
++ return ACL_GROUP_OBJ;
++ case NFS4_ACL_WHO_EVERYONE:
++ return ACL_OTHER;
++ }
++ BUG();
++ return -1;
++}
++
++EXPORT_SYMBOL(nfs4_acl_posix_to_nfsv4);
++EXPORT_SYMBOL(nfs4_acl_nfsv4_to_posix);
++
++struct nfs4_acl *
++nfs4_acl_new(void)
++{
++ struct nfs4_acl *acl;
++
++ if ((acl = kmalloc(sizeof(*acl), GFP_KERNEL)) == NULL)
++ return NULL;
++
++ acl->naces = 0;
++ INIT_LIST_HEAD(&acl->ace_head);
++
++ return acl;
++}
++
++void
++nfs4_acl_free(struct nfs4_acl *acl)
++{
++ struct list_head *h;
++ struct nfs4_ace *ace;
++
++ if (!acl)
++ return;
++
++ while (!list_empty(&acl->ace_head)) {
++ h = acl->ace_head.next;
++ list_del(h);
++ ace = list_entry(h, struct nfs4_ace, l_ace);
++ kfree(ace);
++ }
++
++ kfree(acl);
++
++ return;
++}
++
++int
++nfs4_acl_add_ace(struct nfs4_acl *acl, u32 type, u32 flag, u32 access_mask,
++ int whotype, uid_t who)
++{
++ struct nfs4_ace *ace;
++
++ if ((ace = kmalloc(sizeof(*ace), GFP_KERNEL)) == NULL)
++ return -1;
++
++ ace->type = type;
++ ace->flag = flag;
++ ace->access_mask = access_mask;
++ ace->whotype = whotype;
++ ace->who = who;
++
++ list_add_tail(&ace->l_ace, &acl->ace_head);
++ acl->naces++;
++
++ return 0;
++}
++
++static struct {
++ char *string;
++ int stringlen;
++ int type;
++} s2t_map[] = {
++ {
++ .string = "OWNER@",
++ .stringlen = sizeof("OWNER@") - 1,
++ .type = NFS4_ACL_WHO_OWNER,
++ },
++ {
++ .string = "GROUP@",
++ .stringlen = sizeof("GROUP@") - 1,
++ .type = NFS4_ACL_WHO_GROUP,
++ },
++ {
++ .string = "EVERYONE@",
++ .stringlen = sizeof("EVERYONE@") - 1,
++ .type = NFS4_ACL_WHO_EVERYONE,
++ },
++};
++
++int
++nfs4_acl_get_whotype(char *p, u32 len)
++{
++ int i;
++
++ for (i=0; i < sizeof(s2t_map) / sizeof(*s2t_map); i++) {
++ if (s2t_map[i].stringlen == len &&
++ 0 == memcmp(s2t_map[i].string, p, len))
++ return s2t_map[i].type;
++ }
++ return NFS4_ACL_WHO_NAMED;
++}
++
++int
++nfs4_acl_write_who(int who, char *p)
++{
++ int i;
++
++ for (i=0; i < sizeof(s2t_map) / sizeof(*s2t_map); i++) {
++ if (s2t_map[i].type == who) {
++ memcpy(p, s2t_map[i].string, s2t_map[i].stringlen);
++ return s2t_map[i].stringlen;
++ }
++ }
++ BUG();
++ return -1;
++}
++
++static inline int
++match_who(struct nfs4_ace *ace, uid_t owner, gid_t group, uid_t who)
++{
++ switch (ace->whotype) {
++ case NFS4_ACL_WHO_NAMED:
++ return who == ace->who;
++ case NFS4_ACL_WHO_OWNER:
++ return who == owner;
++ case NFS4_ACL_WHO_GROUP:
++ return who == group;
++ case NFS4_ACL_WHO_EVERYONE:
++ return 1;
++ default:
++ return 0;
++ }
++}
++
++/* 0 = granted, -EACCES = denied; mask is an nfsv4 mask, not mode bits */
++int
++nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group,
++ uid_t who, u32 mask)
++{
++ struct nfs4_ace *ace;
++ u32 allowed = 0;
++
++ list_for_each_entry(ace, &acl->ace_head, l_ace) {
++		if (!match_who(ace, owner, group, who))
++ continue;
++ switch (ace->type) {
++ case NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE:
++ allowed |= ace->access_mask;
++ if ((allowed & mask) == mask)
++ return 0;
++ break;
++ case NFS4_ACE_ACCESS_DENIED_ACE_TYPE:
++ if (ace->access_mask & mask)
++ return -EACCES;
++ break;
++ }
++ }
++ return -EACCES;
++}
++
++EXPORT_SYMBOL(nfs4_acl_new);
++EXPORT_SYMBOL(nfs4_acl_free);
++EXPORT_SYMBOL(nfs4_acl_add_ace);
++EXPORT_SYMBOL(nfs4_acl_get_whotype);
++EXPORT_SYMBOL(nfs4_acl_write_who);
++EXPORT_SYMBOL(nfs4_acl_permission);
+--- linux-2.6.7/fs/nfsd/Makefile.lsec 2004-06-15 23:19:13.000000000 -0600
++++ linux-2.6.7/fs/nfsd/Makefile 2005-03-23 14:28:24.461331008 -0700
+@@ -7,5 +7,6 @@ obj-$(CONFIG_NFSD) += nfsd.o
+ nfsd-y := nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
+ export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o
+ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o
+-nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o
++nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
++ nfs4acl.o nfs4callback.o
+ nfsd-objs := $(nfsd-y)
+--- linux-2.6.7/fs/nfsd/nfsctl.c.lsec 2004-06-15 23:19:01.000000000 -0600
++++ linux-2.6.7/fs/nfsd/nfsctl.c 2005-03-23 14:28:24.132381016 -0700
+@@ -36,7 +36,7 @@
+ #include <asm/uaccess.h>
+
+ /*
+- * We have a single directory with 8 nodes in it.
++ * We have a single directory with 9 nodes in it.
+ */
+ enum {
+ NFSD_Root = 1,
+@@ -50,6 +50,7 @@ enum {
+ NFSD_List,
+ NFSD_Fh,
+ NFSD_Threads,
++ NFSD_Leasetime,
+ };
+
+ /*
+@@ -64,6 +65,7 @@ static ssize_t write_getfd(struct file *
+ static ssize_t write_getfs(struct file *file, char *buf, size_t size);
+ static ssize_t write_filehandle(struct file *file, char *buf, size_t size);
+ static ssize_t write_threads(struct file *file, char *buf, size_t size);
++static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
+
+ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
+ [NFSD_Svc] = write_svc,
+@@ -75,6 +77,7 @@ static ssize_t (*write_op[])(struct file
+ [NFSD_Getfs] = write_getfs,
+ [NFSD_Fh] = write_filehandle,
+ [NFSD_Threads] = write_threads,
++ [NFSD_Leasetime] = write_leasetime,
+ };
+
+ /* an argresp is stored in an allocated page and holds the
+@@ -393,6 +396,29 @@ static ssize_t write_threads(struct file
+ return strlen(buf);
+ }
+
++extern time_t nfs4_leasetime(void);
++
++static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
++{
++	/* if new input was written, validate it (10..3600 seconds) and
++	 * call nfs4_reset_lease(); then write out the current lease
++	 * (in seconds) as the reply */
++ char *mesg = buf;
++ int rv;
++
++ if (size > 0) {
++ int lease;
++ rv = get_int(&mesg, &lease);
++ if (rv)
++ return rv;
++ if (lease < 10 || lease > 3600)
++ return -EINVAL;
++ nfs4_reset_lease(lease);
++ }
++ sprintf(buf, "%ld\n", nfs4_lease_time());
++ return strlen(buf);
++}
++
+ /*----------------------------------------------------------------------------*/
+ /*
+ * populating the filesystem.
+@@ -411,6 +437,7 @@ static int nfsd_fill_super(struct super_
+ [NFSD_List] = {"exports", &exports_operations, S_IRUGO},
+ [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR},
+ [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR},
++ [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
+ /* last one */ {""}
+ };
+ return simple_fill_super(sb, 0x6e667364, nfsd_files);
+--- linux-2.6.7/fs/nfs/callback_proc.c.lsec 2005-03-23 14:28:22.485631360 -0700
++++ linux-2.6.7/fs/nfs/callback_proc.c 2005-03-23 14:28:22.485631360 -0700
+@@ -0,0 +1,85 @@
++/*
++ * linux/fs/nfs/callback_proc.c
++ *
++ * Copyright (C) 2004 Trond Myklebust
++ *
++ * NFSv4 callback procedures
++ */
++#include <linux/config.h>
++#include <linux/nfs4.h>
++#include <linux/nfs_fs.h>
++#include "callback.h"
++#include "delegation.h"
++
++#define NFSDBG_FACILITY NFSDBG_CALLBACK
++
++unsigned nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res)
++{
++ struct nfs4_client *clp;
++ struct nfs_delegation *delegation;
++ struct nfs_inode *nfsi;
++ struct inode *inode;
++
++ res->bitmap[0] = res->bitmap[1] = 0;
++ res->status = htonl(NFS4ERR_BADHANDLE);
++ clp = nfs4_find_client(&args->addr->sin_addr);
++ if (clp == NULL)
++ goto out;
++ inode = nfs_delegation_find_inode(clp, &args->fh);
++ if (inode == NULL)
++ goto out_putclient;
++ nfsi = NFS_I(inode);
++ down_read(&nfsi->rwsem);
++ delegation = nfsi->delegation;
++ if (delegation == NULL || (delegation->type & FMODE_WRITE) == 0)
++ goto out_iput;
++ res->size = i_size_read(inode);
++ res->change_attr = NFS_CHANGE_ATTR(inode);
++ res->ctime = inode->i_ctime;
++ res->mtime = inode->i_mtime;
++ res->bitmap[0] = (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) &
++ args->bitmap[0];
++ res->bitmap[1] = (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) &
++ args->bitmap[1];
++ res->status = 0;
++out_iput:
++ up_read(&nfsi->rwsem);
++ iput(inode);
++out_putclient:
++ nfs4_put_client(clp);
++out:
++ dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(res->status));
++ return res->status;
++}
++
++unsigned nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
++{
++ struct nfs4_client *clp;
++ struct inode *inode;
++ unsigned res;
++
++ res = htonl(NFS4ERR_BADHANDLE);
++ clp = nfs4_find_client(&args->addr->sin_addr);
++ if (clp == NULL)
++ goto out;
++ inode = nfs_delegation_find_inode(clp, &args->fh);
++ if (inode == NULL)
++ goto out_putclient;
++ /* Set up a helper thread to actually return the delegation */
++ switch(nfs_async_inode_return_delegation(inode, &args->stateid)) {
++ case 0:
++ res = 0;
++ break;
++ case -ENOENT:
++ res = htonl(NFS4ERR_BAD_STATEID);
++ break;
++ default:
++ res = htonl(NFS4ERR_RESOURCE);
++ }
++ iput(inode);
++out_putclient:
++ nfs4_put_client(clp);
++out:
++ dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(res));
++ return res;
++}
+--- linux-2.6.7/fs/nfs/delegation.c.lsec 2005-03-23 14:28:22.546622088 -0700
++++ linux-2.6.7/fs/nfs/delegation.c 2005-03-23 14:28:22.545622240 -0700
+@@ -0,0 +1,320 @@
++/*
++ * linux/fs/nfs/delegation.c
++ *
++ * Copyright (C) 2004 Trond Myklebust
++ *
++ * NFS file delegation management
++ *
++ */
++#include <linux/config.h>
++#include <linux/completion.h>
++#include <linux/module.h>
++#include <linux/sched.h>
++#include <linux/spinlock.h>
++
++#include <linux/nfs4.h>
++#include <linux/nfs_fs.h>
++#include <linux/nfs_xdr.h>
++
++#include "delegation.h"
++
++static struct nfs_delegation *nfs_alloc_delegation(void)
++{
++ return (struct nfs_delegation *)kmalloc(sizeof(struct nfs_delegation), GFP_KERNEL);
++}
++
++static void nfs_free_delegation(struct nfs_delegation *delegation)
++{
++ if (delegation->cred)
++ put_rpccred(delegation->cred);
++ kfree(delegation);
++}
++
++static void nfs_delegation_claim_opens(struct inode *inode)
++{
++ struct nfs_inode *nfsi = NFS_I(inode);
++ struct nfs_open_context *ctx;
++ struct nfs4_state *state;
++
++again:
++ spin_lock(&inode->i_lock);
++ list_for_each_entry(ctx, &nfsi->open_files, list) {
++ state = ctx->state;
++ if (state == NULL)
++ continue;
++ if (!test_bit(NFS_DELEGATED_STATE, &state->flags))
++ continue;
++ get_nfs_open_context(ctx);
++ spin_unlock(&inode->i_lock);
++ if (nfs4_open_delegation_recall(ctx->dentry, state) < 0)
++ return;
++ put_nfs_open_context(ctx);
++ goto again;
++ }
++ spin_unlock(&inode->i_lock);
++}
++
++/*
++ * Set up a delegation on an inode
++ */
++void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res)
++{
++ struct nfs_delegation *delegation = NFS_I(inode)->delegation;
++
++ if (delegation == NULL)
++ return;
++ memcpy(delegation->stateid.data, res->delegation.data,
++ sizeof(delegation->stateid.data));
++ delegation->type = res->delegation_type;
++ delegation->maxsize = res->maxsize;
++	put_rpccred(delegation->cred);
++	delegation->cred = get_rpccred(cred);
++ delegation->flags &= ~NFS_DELEGATION_NEED_RECLAIM;
++ NFS_I(inode)->delegation_state = delegation->type;
++ wmb();
++}
++
++/*
++ * Set up a delegation on an inode
++ */
++int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res)
++{
++ struct nfs4_client *clp = NFS_SERVER(inode)->nfs4_state;
++ struct nfs_inode *nfsi = NFS_I(inode);
++ struct nfs_delegation *delegation;
++ int status = 0;
++
++ delegation = nfs_alloc_delegation();
++ if (delegation == NULL)
++ return -ENOMEM;
++ memcpy(delegation->stateid.data, res->delegation.data,
++ sizeof(delegation->stateid.data));
++ delegation->type = res->delegation_type;
++ delegation->maxsize = res->maxsize;
++ delegation->cred = get_rpccred(cred);
++ delegation->inode = inode;
++
++ spin_lock(&clp->cl_lock);
++ if (nfsi->delegation == NULL) {
++ list_add(&delegation->super_list, &clp->cl_delegations);
++ nfsi->delegation = delegation;
++ nfsi->delegation_state = delegation->type;
++ delegation = NULL;
++ } else {
++ if (memcmp(&delegation->stateid, &nfsi->delegation->stateid,
++ sizeof(delegation->stateid)) != 0 ||
++ delegation->type != nfsi->delegation->type) {
++ printk("%s: server %u.%u.%u.%u, handed out a duplicate delegation!\n",
++ __FUNCTION__, NIPQUAD(clp->cl_addr));
++ status = -EIO;
++ }
++ }
++ spin_unlock(&clp->cl_lock);
++ if (delegation != NULL)
++ kfree(delegation);
++ return status;
++}
++
++static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation)
++{
++ int res = 0;
++
++ __nfs_revalidate_inode(NFS_SERVER(inode), inode);
++
++ res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid);
++ nfs_free_delegation(delegation);
++ return res;
++}
++
++/* Sync all data to disk upon delegation return */
++static void nfs_msync_inode(struct inode *inode)
++{
++ down(&inode->i_sem);
++ filemap_fdatawrite(inode->i_mapping);
++ nfs_wb_all(inode);
++ filemap_fdatawait(inode->i_mapping);
++ up(&inode->i_sem);
++}
++
++/*
++ * Basic procedure for returning a delegation to the server
++ */
++int nfs_inode_return_delegation(struct inode *inode)
++{
++ struct nfs4_client *clp = NFS_SERVER(inode)->nfs4_state;
++ struct nfs_inode *nfsi = NFS_I(inode);
++ struct nfs_delegation *delegation;
++ int res = 0;
++
++ nfs_msync_inode(inode);
++ down_read(&clp->cl_sem);
++ /* Guard against new delegated open calls */
++ down_write(&nfsi->rwsem);
++ spin_lock(&clp->cl_lock);
++ delegation = nfsi->delegation;
++ if (delegation != NULL) {
++ list_del_init(&delegation->super_list);
++ nfsi->delegation = NULL;
++ nfsi->delegation_state = 0;
++ }
++ spin_unlock(&clp->cl_lock);
++ nfs_delegation_claim_opens(inode);
++ up_write(&nfsi->rwsem);
++ up_read(&clp->cl_sem);
++ nfs_msync_inode(inode);
++
++ if (delegation != NULL)
++ res = nfs_do_return_delegation(inode, delegation);
++ return res;
++}
++
++/*
++ * Return all delegations associated to a super block
++ */
++void nfs_return_all_delegations(struct super_block *sb)
++{
++ struct nfs4_client *clp = NFS_SB(sb)->nfs4_state;
++ struct nfs_delegation *delegation;
++ struct inode *inode;
++
++ if (clp == NULL)
++ return;
++restart:
++ spin_lock(&clp->cl_lock);
++ list_for_each_entry(delegation, &clp->cl_delegations, super_list) {
++ if (delegation->inode->i_sb != sb)
++ continue;
++ inode = igrab(delegation->inode);
++ if (inode == NULL)
++ continue;
++ spin_unlock(&clp->cl_lock);
++ nfs_inode_return_delegation(inode);
++ iput(inode);
++ goto restart;
++ }
++ spin_unlock(&clp->cl_lock);
++}
++
++struct recall_threadargs {
++ struct inode *inode;
++ struct nfs4_client *clp;
++ const nfs4_stateid *stateid;
++
++ struct completion started;
++ int result;
++};
++
++static int recall_thread(void *data)
++{
++ struct recall_threadargs *args = (struct recall_threadargs *)data;
++ struct inode *inode = igrab(args->inode);
++ struct nfs4_client *clp = NFS_SERVER(inode)->nfs4_state;
++ struct nfs_inode *nfsi = NFS_I(inode);
++ struct nfs_delegation *delegation;
++
++ daemonize("nfsv4-delegreturn");
++
++ nfs_msync_inode(inode);
++ down_read(&clp->cl_sem);
++ down_write(&nfsi->rwsem);
++ spin_lock(&clp->cl_lock);
++ delegation = nfsi->delegation;
++ if (delegation != NULL && memcmp(delegation->stateid.data,
++ args->stateid->data,
++ sizeof(delegation->stateid.data)) == 0) {
++ list_del_init(&delegation->super_list);
++ nfsi->delegation = NULL;
++ nfsi->delegation_state = 0;
++ args->result = 0;
++ } else {
++ delegation = NULL;
++ args->result = -ENOENT;
++ }
++ spin_unlock(&clp->cl_lock);
++ complete(&args->started);
++ nfs_delegation_claim_opens(inode);
++ up_write(&nfsi->rwsem);
++ up_read(&clp->cl_sem);
++ nfs_msync_inode(inode);
++
++ if (delegation != NULL)
++ nfs_do_return_delegation(inode, delegation);
++ iput(inode);
++ module_put_and_exit(0);
++}
++
++/*
++ * Asynchronous delegation recall!
++ */
++int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid)
++{
++ struct recall_threadargs data = {
++ .inode = inode,
++ .stateid = stateid,
++ };
++ int status;
++
++ init_completion(&data.started);
++ __module_get(THIS_MODULE);
++ status = kernel_thread(recall_thread, &data, CLONE_KERNEL);
++ if (status < 0)
++ goto out_module_put;
++ wait_for_completion(&data.started);
++ return data.result;
++out_module_put:
++ module_put(THIS_MODULE);
++ return status;
++}
++
++/*
++ * Retrieve the inode associated with a delegation
++ */
++struct inode *nfs_delegation_find_inode(struct nfs4_client *clp, const struct nfs_fh *fhandle)
++{
++ struct nfs_delegation *delegation;
++ struct inode *res = NULL;
++ spin_lock(&clp->cl_lock);
++ list_for_each_entry(delegation, &clp->cl_delegations, super_list) {
++ if (nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) {
++ res = igrab(delegation->inode);
++ break;
++ }
++ }
++ spin_unlock(&clp->cl_lock);
++ return res;
++}
++
++/*
++ * Mark all delegations as needing to be reclaimed
++ */
++void nfs_delegation_mark_reclaim(struct nfs4_client *clp)
++{
++ struct nfs_delegation *delegation;
++ spin_lock(&clp->cl_lock);
++ list_for_each_entry(delegation, &clp->cl_delegations, super_list)
++ delegation->flags |= NFS_DELEGATION_NEED_RECLAIM;
++ spin_unlock(&clp->cl_lock);
++}
++
++/*
++ * Reap all unclaimed delegations after reboot recovery is done
++ */
++void nfs_delegation_reap_unclaimed(struct nfs4_client *clp)
++{
++ struct nfs_delegation *delegation, *n;
++ LIST_HEAD(head);
++ spin_lock(&clp->cl_lock);
++ list_for_each_entry_safe(delegation, n, &clp->cl_delegations, super_list) {
++ if ((delegation->flags & NFS_DELEGATION_NEED_RECLAIM) == 0)
++ continue;
++ list_move(&delegation->super_list, &head);
++ NFS_I(delegation->inode)->delegation = NULL;
++ NFS_I(delegation->inode)->delegation_state = 0;
++ }
++ spin_unlock(&clp->cl_lock);
++ while(!list_empty(&head)) {
++ delegation = list_entry(head.next, struct nfs_delegation, super_list);
++ list_del(&delegation->super_list);
++ nfs_free_delegation(delegation);
++ }
++}
+--- linux-2.6.7/fs/nfs/delegation.h.lsec 2005-03-23 14:28:22.546622088 -0700
++++ linux-2.6.7/fs/nfs/delegation.h 2005-03-23 14:28:22.546622088 -0700
+@@ -0,0 +1,56 @@
++/*
++ * linux/fs/nfs/delegation.h
++ *
++ * Copyright (c) Trond Myklebust
++ *
++ * Definitions pertaining to NFS delegated files
++ */
++#ifndef FS_NFS_DELEGATION_H
++#define FS_NFS_DELEGATION_H
++
++#if defined(CONFIG_NFS_V4)
++/*
++ * NFSv4 delegation
++ */
++struct nfs_delegation {
++ struct list_head super_list;
++ struct rpc_cred *cred;
++ struct inode *inode;
++ nfs4_stateid stateid;
++ int type;
++#define NFS_DELEGATION_NEED_RECLAIM 1
++ long flags;
++ loff_t maxsize;
++};
++
++int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
++void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
++int nfs_inode_return_delegation(struct inode *inode);
++int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid);
++
++struct inode *nfs_delegation_find_inode(struct nfs4_client *clp, const struct nfs_fh *fhandle);
++void nfs_return_all_delegations(struct super_block *sb);
++
++void nfs_delegation_mark_reclaim(struct nfs4_client *clp);
++void nfs_delegation_reap_unclaimed(struct nfs4_client *clp);
++
++/* NFSv4 delegation-related procedures */
++int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid);
++int nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state);
++
++static inline int nfs_have_delegation(struct inode *inode, int flags)
++{
++ flags &= FMODE_READ|FMODE_WRITE;
++ rmb();
++ if ((NFS_I(inode)->delegation_state & flags) == flags)
++ return 1;
++ return 0;
++}
++#else
++static inline int nfs_have_delegation(struct inode *inode, int flags)
++{
++ return 0;
++}
++#endif
++
++#endif
+--- linux-2.6.7/fs/nfs/nfs3proc.c.lsec 2004-06-15 23:19:23.000000000 -0600
++++ linux-2.6.7/fs/nfs/nfs3proc.c 2005-03-23 14:28:22.820580440 -0700
+@@ -68,18 +68,6 @@ nfs3_async_handle_jukebox(struct rpc_tas
+ return 1;
+ }
+
+-static struct rpc_cred *
+-nfs_cred(struct inode *inode, struct file *filp)
+-{
+- struct rpc_cred *cred = NULL;
+-
+- if (filp)
+- cred = (struct rpc_cred *)filp->private_data;
+- if (!cred)
+- cred = NFS_I(inode)->mm_cred;
+- return cred;
+-}
+-
+ /*
+ * Bare-bones access to getattr: this is for nfs_read_super.
+ */
+@@ -164,8 +152,7 @@ nfs3_proc_lookup(struct inode *dir, stru
+ return status;
+ }
+
+-static int
+-nfs3_proc_access(struct inode *inode, struct rpc_cred *cred, int mode)
++static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
+ {
+ struct nfs_fattr fattr;
+ struct nfs3_accessargs arg = {
+@@ -178,9 +165,10 @@ nfs3_proc_access(struct inode *inode, st
+ .rpc_proc = &nfs3_procedures[NFS3PROC_ACCESS],
+ .rpc_argp = &arg,
+ .rpc_resp = &res,
+- .rpc_cred = cred
++ .rpc_cred = entry->cred
+ };
+- int status;
++ int mode = entry->mask;
++ int status;
+
+ dprintk("NFS call access\n");
+ fattr.valid = 0;
+@@ -200,10 +188,16 @@ nfs3_proc_access(struct inode *inode, st
+ }
+ status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+ nfs_refresh_inode(inode, &fattr);
+- dprintk("NFS reply access\n");
+-
+- if (status == 0 && (arg.access & res.access) != arg.access)
+- status = -EACCES;
++ if (status == 0) {
++ entry->mask = 0;
++ if (res.access & NFS3_ACCESS_READ)
++ entry->mask |= MAY_READ;
++ if (res.access & (NFS3_ACCESS_MODIFY | NFS3_ACCESS_EXTEND | NFS3_ACCESS_DELETE))
++ entry->mask |= MAY_WRITE;
++ if (res.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE))
++ entry->mask |= MAY_EXEC;
++ }
++ dprintk("NFS reply access, status = %d\n", status);
+ return status;
+ }
+
+@@ -227,8 +221,7 @@ nfs3_proc_readlink(struct inode *inode,
+ return status;
+ }
+
+-static int
+-nfs3_proc_read(struct nfs_read_data *rdata, struct file *filp)
++static int nfs3_proc_read(struct nfs_read_data *rdata)
+ {
+ int flags = rdata->flags;
+ struct inode * inode = rdata->inode;
+@@ -237,13 +230,13 @@ nfs3_proc_read(struct nfs_read_data *rda
+ .rpc_proc = &nfs3_procedures[NFS3PROC_READ],
+ .rpc_argp = &rdata->args,
+ .rpc_resp = &rdata->res,
++ .rpc_cred = rdata->cred,
+ };
+ int status;
+
+ dprintk("NFS call read %d @ %Ld\n", rdata->args.count,
+ (long long) rdata->args.offset);
+ fattr->valid = 0;
+- msg.rpc_cred = nfs_cred(inode, filp);
+ status = rpc_call_sync(NFS_CLIENT(inode), &msg, flags);
+ if (status >= 0)
+ nfs_refresh_inode(inode, fattr);
+@@ -251,8 +244,7 @@ nfs3_proc_read(struct nfs_read_data *rda
+ return status;
+ }
+
+-static int
+-nfs3_proc_write(struct nfs_write_data *wdata, struct file *filp)
++static int nfs3_proc_write(struct nfs_write_data *wdata)
+ {
+ int rpcflags = wdata->flags;
+ struct inode * inode = wdata->inode;
+@@ -261,13 +253,13 @@ nfs3_proc_write(struct nfs_write_data *w
+ .rpc_proc = &nfs3_procedures[NFS3PROC_WRITE],
+ .rpc_argp = &wdata->args,
+ .rpc_resp = &wdata->res,
++ .rpc_cred = wdata->cred,
+ };
+ int status;
+
+ dprintk("NFS call write %d @ %Ld\n", wdata->args.count,
+ (long long) wdata->args.offset);
+ fattr->valid = 0;
+- msg.rpc_cred = nfs_cred(inode, filp);
+ status = rpc_call_sync(NFS_CLIENT(inode), &msg, rpcflags);
+ if (status >= 0)
+ nfs_refresh_inode(inode, fattr);
+@@ -275,8 +267,7 @@ nfs3_proc_write(struct nfs_write_data *w
+ return status < 0? status : wdata->res.count;
+ }
+
+-static int
+-nfs3_proc_commit(struct nfs_write_data *cdata, struct file *filp)
++static int nfs3_proc_commit(struct nfs_write_data *cdata)
+ {
+ struct inode * inode = cdata->inode;
+ struct nfs_fattr * fattr = cdata->res.fattr;
+@@ -284,13 +275,13 @@ nfs3_proc_commit(struct nfs_write_data *
+ .rpc_proc = &nfs3_procedures[NFS3PROC_COMMIT],
+ .rpc_argp = &cdata->args,
+ .rpc_resp = &cdata->res,
++ .rpc_cred = cdata->cred,
+ };
+ int status;
+
+ dprintk("NFS call commit %d @ %Ld\n", cdata->args.count,
+ (long long) cdata->args.offset);
+ fattr->valid = 0;
+- msg.rpc_cred = nfs_cred(inode, filp);
+ status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+ if (status >= 0)
+ nfs_refresh_inode(inode, fattr);
+@@ -534,6 +525,8 @@ nfs3_proc_symlink(struct inode *dir, str
+ };
+ int status;
+
++ if (path->len > NFS3_MAXPATHLEN)
++ return -ENAMETOOLONG;
+ dprintk("NFS call symlink %s -> %s\n", name->name, path->name);
+ dir_attr.valid = 0;
+ fattr->valid = 0;
+@@ -832,27 +825,6 @@ nfs3_proc_commit_setup(struct nfs_write_
+ rpc_call_setup(task, &msg, 0);
+ }
+
+-/*
+- * Set up the nfspage struct with the right credentials
+- */
+-void
+-nfs3_request_init(struct nfs_page *req, struct file *filp)
+-{
+- req->wb_cred = get_rpccred(nfs_cred(req->wb_inode, filp));
+-}
+-
+-static int
+-nfs3_request_compatible(struct nfs_page *req, struct file *filp, struct page *page)
+-{
+- if (req->wb_file != filp)
+- return 0;
+- if (req->wb_page != page)
+- return 0;
+- if (req->wb_cred != nfs_file_cred(filp))
+- return 0;
+- return 1;
+-}
+-
+ static int
+ nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
+ {
+@@ -863,6 +835,7 @@ struct nfs_rpc_ops nfs_v3_clientops = {
+ .version = 3, /* protocol version */
+ .dentry_ops = &nfs_dentry_operations,
+ .dir_inode_ops = &nfs_dir_inode_operations,
++ .file_inode_ops = &nfs_file_inode_operations,
+ .getroot = nfs3_proc_get_root,
+ .getattr = nfs3_proc_getattr,
+ .setattr = nfs3_proc_setattr,
+@@ -892,7 +865,5 @@ struct nfs_rpc_ops nfs_v3_clientops = {
+ .commit_setup = nfs3_proc_commit_setup,
+ .file_open = nfs_open,
+ .file_release = nfs_release,
+- .request_init = nfs3_request_init,
+- .request_compatible = nfs3_request_compatible,
+ .lock = nfs3_proc_lock,
+ };
+--- linux-2.6.7/fs/nfs/proc.c.lsec 2004-06-15 23:20:03.000000000 -0600
++++ linux-2.6.7/fs/nfs/proc.c 2005-03-23 14:28:23.058544264 -0700
+@@ -49,18 +49,6 @@
+
+ extern struct rpc_procinfo nfs_procedures[];
+
+-static struct rpc_cred *
+-nfs_cred(struct inode *inode, struct file *filp)
+-{
+- struct rpc_cred *cred = NULL;
+-
+- if (filp)
+- cred = (struct rpc_cred *)filp->private_data;
+- if (!cred)
+- cred = NFS_I(inode)->mm_cred;
+- return cred;
+-}
+-
+ /*
+ * Bare-bones access to getattr: this is for nfs_read_super.
+ */
+@@ -167,8 +155,7 @@ nfs_proc_readlink(struct inode *inode, s
+ return status;
+ }
+
+-static int
+-nfs_proc_read(struct nfs_read_data *rdata, struct file *filp)
++static int nfs_proc_read(struct nfs_read_data *rdata)
+ {
+ int flags = rdata->flags;
+ struct inode * inode = rdata->inode;
+@@ -177,15 +164,14 @@ nfs_proc_read(struct nfs_read_data *rdat
+ .rpc_proc = &nfs_procedures[NFSPROC_READ],
+ .rpc_argp = &rdata->args,
+ .rpc_resp = &rdata->res,
++ .rpc_cred = rdata->cred,
+ };
+ int status;
+
+ dprintk("NFS call read %d @ %Ld\n", rdata->args.count,
+ (long long) rdata->args.offset);
+ fattr->valid = 0;
+- msg.rpc_cred = nfs_cred(inode, filp);
+ status = rpc_call_sync(NFS_CLIENT(inode), &msg, flags);
+-
+ if (status >= 0) {
+ nfs_refresh_inode(inode, fattr);
+ /* Emulate the eof flag, which isn't normally needed in NFSv2
+@@ -198,8 +184,7 @@ nfs_proc_read(struct nfs_read_data *rdat
+ return status;
+ }
+
+-static int
+-nfs_proc_write(struct nfs_write_data *wdata, struct file *filp)
++static int nfs_proc_write(struct nfs_write_data *wdata)
+ {
+ int flags = wdata->flags;
+ struct inode * inode = wdata->inode;
+@@ -208,13 +193,13 @@ nfs_proc_write(struct nfs_write_data *wd
+ .rpc_proc = &nfs_procedures[NFSPROC_WRITE],
+ .rpc_argp = &wdata->args,
+ .rpc_resp = &wdata->res,
++ .rpc_cred = wdata->cred,
+ };
+ int status;
+
+ dprintk("NFS call write %d @ %Ld\n", wdata->args.count,
+ (long long) wdata->args.offset);
+ fattr->valid = 0;
+- msg.rpc_cred = nfs_cred(inode, filp);
+ status = rpc_call_sync(NFS_CLIENT(inode), &msg, flags);
+ if (status >= 0) {
+ nfs_refresh_inode(inode, fattr);
+@@ -400,6 +385,8 @@ nfs_proc_symlink(struct inode *dir, stru
+ };
+ int status;
+
++ if (path->len > NFS2_MAXPATHLEN)
++ return -ENAMETOOLONG;
+ dprintk("NFS call symlink %s -> %s\n", name->name, path->name);
+ fattr->valid = 0;
+ status = rpc_call(NFS_CLIENT(dir), NFSPROC_SYMLINK, &arg, NULL, 0);
+@@ -619,27 +606,6 @@ nfs_proc_commit_setup(struct nfs_write_d
+ BUG();
+ }
+
+-/*
+- * Set up the nfspage struct with the right credentials
+- */
+-static void
+-nfs_request_init(struct nfs_page *req, struct file *filp)
+-{
+- req->wb_cred = get_rpccred(nfs_cred(req->wb_inode, filp));
+-}
+-
+-static int
+-nfs_request_compatible(struct nfs_page *req, struct file *filp, struct page *page)
+-{
+- if (req->wb_file != filp)
+- return 0;
+- if (req->wb_page != page)
+- return 0;
+- if (req->wb_cred != nfs_file_cred(filp))
+- return 0;
+- return 1;
+-}
+-
+ static int
+ nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
+ {
+@@ -651,6 +617,7 @@ struct nfs_rpc_ops nfs_v2_clientops = {
+ .version = 2, /* protocol version */
+ .dentry_ops = &nfs_dentry_operations,
+ .dir_inode_ops = &nfs_dir_inode_operations,
++ .file_inode_ops = &nfs_file_inode_operations,
+ .getroot = nfs_proc_get_root,
+ .getattr = nfs_proc_getattr,
+ .setattr = nfs_proc_setattr,
+@@ -680,7 +647,5 @@ struct nfs_rpc_ops nfs_v2_clientops = {
+ .commit_setup = nfs_proc_commit_setup,
+ .file_open = nfs_open,
+ .file_release = nfs_release,
+- .request_init = nfs_request_init,
+- .request_compatible = nfs_request_compatible,
+ .lock = nfs_proc_lock,
+ };
+--- linux-2.6.7/fs/nfs/file.c.lsec 2004-06-15 23:19:37.000000000 -0600
++++ linux-2.6.7/fs/nfs/file.c 2005-03-23 14:28:22.760589560 -0700
+@@ -31,6 +31,8 @@
+ #include <asm/uaccess.h>
+ #include <asm/system.h>
+
++#include "delegation.h"
++
+ #define NFSDBG_FACILITY NFSDBG_FILE
+
+ static long nfs_file_fcntl(int fd, unsigned int cmd,
+@@ -66,6 +68,19 @@ struct inode_operations nfs_file_inode_o
+ .setattr = nfs_setattr,
+ };
+
++#ifdef CONFIG_NFS_V4
++
++struct inode_operations nfs4_file_inode_operations = {
++ .permission = nfs_permission,
++ .getattr = nfs_getattr,
++ .setattr = nfs_setattr,
++ .getxattr = nfs_getxattr,
++ .setxattr = nfs_setxattr,
++ .listxattr = nfs_listxattr,
++};
++
++#endif /* CONFIG_NFS_V4 */
++
+ /* Hack for future NFS swap support */
+ #ifndef IS_SWAPFILE
+ # define IS_SWAPFILE(inode) (0)
+@@ -127,6 +142,7 @@ nfs_file_release(struct inode *inode, st
+ static int
+ nfs_file_flush(struct file *file)
+ {
++ struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data;
+ struct inode *inode = file->f_dentry->d_inode;
+ int status;
+
+@@ -138,9 +154,9 @@ nfs_file_flush(struct file *file)
+ /* Ensure that data+attribute caches are up to date after close() */
+ status = nfs_wb_all(inode);
+ if (!status) {
+- status = file->f_error;
+- file->f_error = 0;
+- if (!status)
++ status = ctx->error;
++ ctx->error = 0;
++ if (!status && !nfs_have_delegation(inode, FMODE_READ))
+ __nfs_revalidate_inode(NFS_SERVER(inode), inode);
+ }
+ unlock_kernel();
+@@ -211,6 +227,7 @@ nfs_file_mmap(struct file * file, struct
+ static int
+ nfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+ {
++ struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data;
+ struct inode *inode = dentry->d_inode;
+ int status;
+
+@@ -219,8 +236,8 @@ nfs_fsync(struct file *file, struct dent
+ lock_kernel();
+ status = nfs_wb_all(inode);
+ if (!status) {
+- status = file->f_error;
+- file->f_error = 0;
++ status = ctx->error;
++ ctx->error = 0;
+ }
+ unlock_kernel();
+ return status;
+@@ -302,6 +319,90 @@ out_swapfile:
+ goto out;
+ }
+
++static int do_getlk(struct file *filp, int cmd, struct file_lock *fl)
++{
++ struct inode *inode = filp->f_mapping->host;
++ int status;
++
++ lock_kernel();
++ status = NFS_PROTO(inode)->lock(filp, cmd, fl);
++ unlock_kernel();
++ return status;
++}
++
++static int do_unlk(struct file *filp, int cmd, struct file_lock *fl)
++{
++ struct inode *inode = filp->f_mapping->host;
++ sigset_t oldset;
++ int status;
++
++ rpc_clnt_sigmask(NFS_CLIENT(inode), &oldset);
++ /*
++ * Flush all pending writes before doing anything
++ * with locks..
++ */
++ filemap_fdatawrite(filp->f_mapping);
++ down(&inode->i_sem);
++ nfs_wb_all(inode);
++ up(&inode->i_sem);
++ filemap_fdatawait(filp->f_mapping);
++
++ /* NOTE: special case
++ * If we're signalled while cleaning up locks on process exit, we
++ * still need to complete the unlock.
++ */
++ lock_kernel();
++ status = NFS_PROTO(inode)->lock(filp, cmd, fl);
++ rpc_clnt_sigunmask(NFS_CLIENT(inode), &oldset);
++ return status;
++}
++
++static int do_setlk(struct file *filp, int cmd, struct file_lock *fl)
++{
++ struct inode *inode = filp->f_mapping->host;
++ int status;
++
++ /*
++ * Flush all pending writes before doing anything
++ * with locks..
++ */
++ status = filemap_fdatawrite(filp->f_mapping);
++ if (status == 0) {
++ down(&inode->i_sem);
++ status = nfs_wb_all(inode);
++ up(&inode->i_sem);
++ if (status == 0)
++ status = filemap_fdatawait(filp->f_mapping);
++ }
++ if (status < 0)
++ return status;
++
++ lock_kernel();
++ status = NFS_PROTO(inode)->lock(filp, cmd, fl);
++ /* If we were signalled we still need to ensure that
++ * we clean up any state on the server. We therefore
++ * record the lock call as having succeeded in order to
++ * ensure that locks_remove_posix() cleans it out when
++ * the process exits.
++ */
++ if (status == -EINTR || status == -ERESTARTSYS)
++ posix_lock_file(filp, fl);
++ unlock_kernel();
++ if (status < 0)
++ return status;
++ /*
++ * Make sure we clear the cache whenever we try to get the lock.
++ * This makes locking act as a cache coherency point.
++ */
++ filemap_fdatawrite(filp->f_mapping);
++ down(&inode->i_sem);
++ nfs_wb_all(inode); /* we may have slept */
++ up(&inode->i_sem);
++ filemap_fdatawait(filp->f_mapping);
++ nfs_zap_caches(inode);
++ return 0;
++}
++
+ /*
+ * Lock a (portion of) a file
+ */
+@@ -309,8 +410,6 @@ int
+ nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
+ {
+ struct inode * inode = filp->f_mapping->host;
+- int status = 0;
+- int status2;
+
+ dprintk("NFS: nfs_lock(f=%s/%ld, t=%x, fl=%x, r=%Ld:%Ld)\n",
+ inode->i_sb->s_id, inode->i_ino,
+@@ -328,8 +427,8 @@ nfs_lock(struct file *filp, int cmd, str
+ /* Fake OK code if mounted without NLM support */
+ if (NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM) {
+ if (IS_GETLK(cmd))
+- status = LOCK_USE_CLNT;
+- goto out_ok;
++ return LOCK_USE_CLNT;
++ return 0;
+ }
+ }
+
+@@ -340,45 +439,12 @@ nfs_lock(struct file *filp, int cmd, str
+ * Not sure whether that would be unique, though, or whether
+ * that would break in other places.
+ */
+- if (!fl->fl_owner || !(fl->fl_flags & FL_POSIX))
++ if (!(fl->fl_flags & FL_POSIX))
+ return -ENOLCK;
+
+- /*
+- * Flush all pending writes before doing anything
+- * with locks..
+- */
+- status = filemap_fdatawrite(filp->f_mapping);
+- down(&inode->i_sem);
+- status2 = nfs_wb_all(inode);
+- if (!status)
+- status = status2;
+- up(&inode->i_sem);
+- status2 = filemap_fdatawait(filp->f_mapping);
+- if (!status)
+- status = status2;
+- if (status < 0)
+- return status;
+-
+- lock_kernel();
+- status = NFS_PROTO(inode)->lock(filp, cmd, fl);
+- unlock_kernel();
+- if (status < 0)
+- return status;
+-
+- status = 0;
+-
+- /*
+- * Make sure we clear the cache whenever we try to get the lock.
+- * This makes locking act as a cache coherency point.
+- */
+- out_ok:
+- if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
+- filemap_fdatawrite(filp->f_mapping);
+- down(&inode->i_sem);
+- nfs_wb_all(inode); /* we may have slept */
+- up(&inode->i_sem);
+- filemap_fdatawait(filp->f_mapping);
+- nfs_zap_caches(inode);
+- }
+- return status;
++ if (IS_GETLK(cmd))
++ return do_getlk(filp, cmd, fl);
++ if (fl->fl_type == F_UNLCK)
++ return do_unlk(filp, cmd, fl);
++ return do_setlk(filp, cmd, fl);
+ }
+--- linux-2.6.7/fs/nfs/write.c.lsec 2004-06-15 23:19:43.000000000 -0600
++++ linux-2.6.7/fs/nfs/write.c 2005-03-23 14:28:23.225518880 -0700
+@@ -63,6 +63,8 @@
+ #include <linux/smp_lock.h>
+ #include <linux/mempool.h>
+
++#include "delegation.h"
++
+ #define NFSDBG_FACILITY NFSDBG_PAGECACHE
+
+ #define MIN_POOL_WRITE (32)
+@@ -71,7 +73,8 @@
+ /*
+ * Local function declarations
+ */
+-static struct nfs_page * nfs_update_request(struct file*, struct inode *,
++static struct nfs_page * nfs_update_request(struct nfs_open_context*,
++ struct inode *,
+ struct page *,
+ unsigned int, unsigned int);
+ static void nfs_writeback_done_partial(struct nfs_write_data *, int);
+@@ -173,7 +176,7 @@ static void nfs_mark_uptodate(struct pag
+ * Write a page synchronously.
+ * Offset is the data offset within the page.
+ */
+-static int nfs_writepage_sync(struct file *file, struct inode *inode,
++static int nfs_writepage_sync(struct nfs_open_context *ctx, struct inode *inode,
+ struct page *page, unsigned int offset, unsigned int count,
+ int how)
+ {
+@@ -187,9 +190,10 @@ static int nfs_writepage_sync(struct fil
+
+ memset(wdata, 0, sizeof(*wdata));
+ wdata->flags = how;
++ wdata->cred = ctx->cred;
+ wdata->inode = inode;
+ wdata->args.fh = NFS_FH(inode);
+- wdata->args.lockowner = current->files;
++ wdata->args.context = ctx;
+ wdata->args.pages = &page;
+ wdata->args.stable = NFS_FILE_SYNC;
+ wdata->args.pgbase = offset;
+@@ -208,7 +212,7 @@ static int nfs_writepage_sync(struct fil
+ wdata->args.count = count;
+ wdata->args.offset = page_offset(page) + wdata->args.pgbase;
+
+- result = NFS_PROTO(inode)->write(wdata, file);
++ result = NFS_PROTO(inode)->write(wdata);
+
+ if (result < 0) {
+ /* Must mark the page invalid after I/O error */
+@@ -241,13 +245,14 @@ io_error:
+ return written ? written : result;
+ }
+
+-static int nfs_writepage_async(struct file *file, struct inode *inode,
+- struct page *page, unsigned int offset, unsigned int count)
++static int nfs_writepage_async(struct nfs_open_context *ctx,
++ struct inode *inode, struct page *page,
++ unsigned int offset, unsigned int count)
+ {
+ struct nfs_page *req;
+ int status;
+
+- req = nfs_update_request(file, inode, page, offset, count);
++ req = nfs_update_request(ctx, inode, page, offset, count);
+ status = (IS_ERR(req)) ? PTR_ERR(req) : 0;
+ if (status < 0)
+ goto out;
+@@ -274,6 +279,7 @@ static int wb_priority(struct writeback_
+ */
+ int nfs_writepage(struct page *page, struct writeback_control *wbc)
+ {
++ struct nfs_open_context *ctx;
+ struct inode *inode = page->mapping->host;
+ unsigned long end_index;
+ unsigned offset = PAGE_CACHE_SIZE;
+@@ -308,16 +314,21 @@ int nfs_writepage(struct page *page, str
+ if (page->index >= end_index+1 || !offset)
+ goto out;
+ do_it:
++ ctx = nfs_find_open_context(inode, FMODE_WRITE);
++ if (ctx == NULL) {
++ err = -EBADF;
++ goto out;
++ }
+ lock_kernel();
+ if (!IS_SYNC(inode) && inode_referenced) {
+- err = nfs_writepage_async(NULL, inode, page, 0, offset);
++ err = nfs_writepage_async(ctx, inode, page, 0, offset);
+ if (err >= 0) {
+ err = 0;
+ if (wbc->for_reclaim)
+ nfs_flush_inode(inode, 0, 0, FLUSH_STABLE);
+ }
+ } else {
+- err = nfs_writepage_sync(NULL, inode, page, 0,
++ err = nfs_writepage_sync(ctx, inode, page, 0,
+ offset, priority);
+ if (err >= 0) {
+ if (err != offset)
+@@ -326,6 +337,7 @@ do_it:
+ }
+ }
+ unlock_kernel();
++ put_nfs_open_context(ctx);
+ out:
+ unlock_page(page);
+ if (inode_referenced)
+@@ -374,8 +386,7 @@ out:
+ /*
+ * Insert a write request into an inode
+ */
+-static inline int
+-nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
++static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
+ {
+ struct nfs_inode *nfsi = NFS_I(inode);
+ int error;
+@@ -387,6 +398,8 @@ nfs_inode_add_request(struct inode *inod
+ if (!nfsi->npages) {
+ igrab(inode);
+ nfs_begin_data_update(inode);
++ if (nfs_have_delegation(inode, FMODE_WRITE))
++ nfsi->change_attr++;
+ }
+ nfsi->npages++;
+ req->wb_count++;
+@@ -404,7 +417,7 @@ nfs_inode_remove_request(struct nfs_page
+
+ BUG_ON (!NFS_WBACK_BUSY(req));
+ spin_lock(&nfs_wreq_lock);
+- inode = req->wb_inode;
++ inode = req->wb_context->dentry->d_inode;
+ nfsi = NFS_I(inode);
+ radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);
+ nfsi->npages--;
+@@ -450,7 +463,7 @@ nfs_find_request(struct inode *inode, un
+ static void
+ nfs_mark_request_dirty(struct nfs_page *req)
+ {
+- struct inode *inode = req->wb_inode;
++ struct inode *inode = req->wb_context->dentry->d_inode;
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ spin_lock(&nfs_wreq_lock);
+@@ -467,7 +480,7 @@ nfs_mark_request_dirty(struct nfs_page *
+ static inline int
+ nfs_dirty_request(struct nfs_page *req)
+ {
+- struct nfs_inode *nfsi = NFS_I(req->wb_inode);
++ struct nfs_inode *nfsi = NFS_I(req->wb_context->dentry->d_inode);
+ return !list_empty(&req->wb_list) && req->wb_list_head == &nfsi->dirty;
+ }
+
+@@ -478,7 +491,7 @@ nfs_dirty_request(struct nfs_page *req)
+ static void
+ nfs_mark_request_commit(struct nfs_page *req)
+ {
+- struct inode *inode = req->wb_inode;
++ struct inode *inode = req->wb_context->dentry->d_inode;
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ spin_lock(&nfs_wreq_lock);
+@@ -619,9 +632,9 @@ static int nfs_wait_on_write_congestion(
+ *
+ * Note: Should always be called with the Page Lock held!
+ */
+-static struct nfs_page *
+-nfs_update_request(struct file* file, struct inode *inode, struct page *page,
+- unsigned int offset, unsigned int bytes)
++static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
++ struct inode *inode, struct page *page,
++ unsigned int offset, unsigned int bytes)
+ {
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs_page *req, *new = NULL;
+@@ -668,13 +681,9 @@ nfs_update_request(struct file* file, st
+ }
+ spin_unlock(&nfs_wreq_lock);
+
+- new = nfs_create_request(file, inode, page, offset, bytes);
++ new = nfs_create_request(ctx, inode, page, offset, bytes);
+ if (IS_ERR(new))
+ return new;
+- if (file) {
+- new->wb_file = file;
+- get_file(file);
+- }
+ }
+
+ /* We have a request for our page.
+@@ -684,7 +693,7 @@ nfs_update_request(struct file* file, st
+ * request.
+ */
+ rqend = req->wb_offset + req->wb_bytes;
+- if (req->wb_file != file
++ if (req->wb_context != ctx
+ || req->wb_page != page
+ || !nfs_dirty_request(req)
+ || offset > rqend || end < req->wb_offset) {
+@@ -705,9 +714,9 @@ nfs_update_request(struct file* file, st
+ return req;
+ }
+
+-int
+-nfs_flush_incompatible(struct file *file, struct page *page)
++int nfs_flush_incompatible(struct file *file, struct page *page)
+ {
++ struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data;
+ struct inode *inode = page->mapping->host;
+ struct nfs_page *req;
+ int status = 0;
+@@ -721,7 +730,7 @@ nfs_flush_incompatible(struct file *file
+ */
+ req = nfs_find_request(inode, page->index);
+ if (req) {
+- if (!NFS_PROTO(inode)->request_compatible(req, file, page))
++ if (req->wb_page != page || ctx != req->wb_context)
+ status = nfs_wb_page(inode, page);
+ nfs_release_request(req);
+ }
+@@ -737,6 +746,7 @@ nfs_flush_incompatible(struct file *file
+ int nfs_updatepage(struct file *file, struct page *page,
+ unsigned int offset, unsigned int count)
+ {
++ struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data;
+ struct dentry *dentry = file->f_dentry;
+ struct inode *inode = page->mapping->host;
+ struct nfs_page *req;
+@@ -747,7 +757,7 @@ int nfs_updatepage(struct file *file, st
+ count, (long long)(page_offset(page) +offset));
+
+ if (IS_SYNC(inode)) {
+- status = nfs_writepage_sync(file, inode, page, offset, count, 0);
++ status = nfs_writepage_sync(ctx, inode, page, offset, count, 0);
+ if (status > 0) {
+ if (offset == 0 && status == PAGE_CACHE_SIZE)
+ SetPageUptodate(page);
+@@ -784,7 +794,7 @@ int nfs_updatepage(struct file *file, st
+ * it out now.
+ */
+ do {
+- req = nfs_update_request(file, inode, page, offset, count);
++ req = nfs_update_request(ctx, inode, page, offset, count);
+ status = (IS_ERR(req)) ? PTR_ERR(req) : 0;
+ if (status != -EBUSY)
+ break;
+@@ -860,16 +870,15 @@ static void nfs_write_rpcsetup(struct nf
+ * NB: take care not to mess about with data->commit et al. */
+
+ data->req = req;
+- data->inode = inode = req->wb_inode;
+- data->cred = req->wb_cred;
++ data->inode = inode = req->wb_context->dentry->d_inode;
++ data->cred = req->wb_context->cred;
+
+ data->args.fh = NFS_FH(inode);
+ data->args.offset = req_offset(req) + offset;
+ data->args.pgbase = req->wb_pgbase + offset;
+ data->args.pages = data->pagevec;
+ data->args.count = count;
+- data->args.lockowner = req->wb_lockowner;
+- data->args.state = req->wb_state;
++ data->args.context = req->wb_context;
+
+ data->res.fattr = &data->fattr;
+ data->res.count = count;
+@@ -1029,7 +1038,7 @@ nfs_flush_list(struct list_head *head, i
+ while (!list_empty(head)) {
+ pages += nfs_coalesce_requests(head, &one_request, wpages);
+ req = nfs_list_entry(one_request.next);
+- error = nfs_flush_one(&one_request, req->wb_inode, how);
++ error = nfs_flush_one(&one_request, req->wb_context->dentry->d_inode, how);
+ if (error < 0)
+ break;
+ }
+@@ -1054,16 +1063,15 @@ static void nfs_writeback_done_partial(s
+ struct page *page = req->wb_page;
+
+ dprintk("NFS: write (%s/%Ld %d@%Ld)",
+- req->wb_inode->i_sb->s_id,
+- (long long)NFS_FILEID(req->wb_inode),
++ req->wb_context->dentry->d_inode->i_sb->s_id,
++ (long long)NFS_FILEID(req->wb_context->dentry->d_inode),
+ req->wb_bytes,
+ (long long)req_offset(req));
+
+ if (status < 0) {
+ ClearPageUptodate(page);
+ SetPageError(page);
+- if (req->wb_file)
+- req->wb_file->f_error = status;
++ req->wb_context->error = status;
+ dprintk(", error = %d\n", status);
+ } else {
+ #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
+@@ -1104,16 +1112,15 @@ static void nfs_writeback_done_full(stru
+ page = req->wb_page;
+
+ dprintk("NFS: write (%s/%Ld %d@%Ld)",
+- req->wb_inode->i_sb->s_id,
+- (long long)NFS_FILEID(req->wb_inode),
++ req->wb_context->dentry->d_inode->i_sb->s_id,
++ (long long)NFS_FILEID(req->wb_context->dentry->d_inode),
+ req->wb_bytes,
+ (long long)req_offset(req));
+
+ if (status < 0) {
+ ClearPageUptodate(page);
+ SetPageError(page);
+- if (req->wb_file)
+- req->wb_file->f_error = status;
++ req->wb_context->error = status;
+ end_page_writeback(page);
+ nfs_inode_remove_request(req);
+ dprintk(", error = %d\n", status);
+@@ -1232,7 +1239,7 @@ static void nfs_commit_rpcsetup(struct l
+ list_splice_init(head, &data->pages);
+ first = nfs_list_entry(data->pages.next);
+ last = nfs_list_entry(data->pages.prev);
+- inode = first->wb_inode;
++ inode = first->wb_context->dentry->d_inode;
+
+ /*
+ * Determine the offset range of requests in the COMMIT call.
+@@ -1246,7 +1253,7 @@ static void nfs_commit_rpcsetup(struct l
+ len = 0;
+
+ data->inode = inode;
+- data->cred = first->wb_cred;
++ data->cred = first->wb_context->cred;
+
+ data->args.fh = NFS_FH(data->inode);
+ data->args.offset = start;
+@@ -1313,13 +1320,12 @@ nfs_commit_done(struct rpc_task *task)
+ nfs_list_remove_request(req);
+
+ dprintk("NFS: commit (%s/%Ld %d@%Ld)",
+- req->wb_inode->i_sb->s_id,
+- (long long)NFS_FILEID(req->wb_inode),
++ req->wb_context->dentry->d_inode->i_sb->s_id,
++ (long long)NFS_FILEID(req->wb_context->dentry->d_inode),
+ req->wb_bytes,
+ (long long)req_offset(req));
+ if (task->tk_status < 0) {
+- if (req->wb_file)
+- req->wb_file->f_error = task->tk_status;
++ req->wb_context->error = task->tk_status;
+ nfs_inode_remove_request(req);
+ dprintk(", error = %d\n", task->tk_status);
+ goto next;
+--- linux-2.6.7/fs/nfs/nfs4xdr.c.lsec 2004-06-15 23:20:26.000000000 -0600
++++ linux-2.6.7/fs/nfs/nfs4xdr.c 2005-03-23 14:28:23.056544568 -0700
+@@ -84,9 +84,13 @@ static int nfs_stat_to_errno(int);
+ ((3+NFS4_FHSIZE) >> 2))
+ #define encode_getattr_maxsz (op_encode_hdr_maxsz + 3)
+ #define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2))
++#define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2))
+ #define nfs4_fattr_bitmap_maxsz (36 + 2 * nfs4_name_maxsz)
+ #define decode_getattr_maxsz (op_decode_hdr_maxsz + 3 + \
+ nfs4_fattr_bitmap_maxsz)
++#define encode_setattr_maxsz (op_encode_hdr_maxsz + 4 + \
++ nfs4_fattr_bitmap_maxsz)
++#define decode_setattr_maxsz (op_decode_hdr_maxsz + 3)
+ #define encode_savefh_maxsz (op_encode_hdr_maxsz)
+ #define decode_savefh_maxsz (op_decode_hdr_maxsz)
+ #define encode_fsinfo_maxsz (op_encode_hdr_maxsz + 2)
+@@ -118,10 +122,17 @@ static int nfs_stat_to_errno(int);
+ #define encode_link_maxsz (op_encode_hdr_maxsz + \
+ nfs4_name_maxsz)
+ #define decode_link_maxsz (op_decode_hdr_maxsz + 5)
++#define encode_symlink_maxsz (op_encode_hdr_maxsz + \
++ 1 + nfs4_name_maxsz + \
++ nfs4_path_maxsz + \
++ nfs4_fattr_bitmap_maxsz)
++#define decode_symlink_maxsz (op_decode_hdr_maxsz + 8)
+ #define encode_create_maxsz (op_encode_hdr_maxsz + \
+- 2 + 2 * nfs4_name_maxsz + \
++ 2 + nfs4_name_maxsz + \
+ nfs4_fattr_bitmap_maxsz)
+ #define decode_create_maxsz (op_decode_hdr_maxsz + 8)
++#define encode_delegreturn_maxsz (op_encode_hdr_maxsz + 4)
++#define decode_delegreturn_maxsz (op_decode_hdr_maxsz)
+ #define NFS4_enc_compound_sz (1024) /* XXX: large enough? */
+ #define NFS4_dec_compound_sz (1024) /* XXX: large enough? */
+ #define NFS4_enc_read_sz (compound_encode_hdr_maxsz + \
+@@ -172,16 +183,14 @@ static int nfs_stat_to_errno(int);
+ #define NFS4_dec_open_confirm_sz (compound_decode_hdr_maxsz + \
+ decode_putfh_maxsz + \
+ op_decode_hdr_maxsz + 4)
+-#define NFS4_enc_open_reclaim_sz (compound_encode_hdr_maxsz + \
++#define NFS4_enc_open_noattr_sz (compound_encode_hdr_maxsz + \
+ encode_putfh_maxsz + \
+ op_encode_hdr_maxsz + \
+- 11 + \
+- encode_getattr_maxsz)
+-#define NFS4_dec_open_reclaim_sz (compound_decode_hdr_maxsz + \
++ 11)
++#define NFS4_dec_open_noattr_sz (compound_decode_hdr_maxsz + \
+ decode_putfh_maxsz + \
+ op_decode_hdr_maxsz + \
+- 4 + 5 + 2 + 3 + \
+- decode_getattr_maxsz)
++ 4 + 5 + 2 + 3)
+ #define NFS4_enc_open_downgrade_sz \
+ (compound_encode_hdr_maxsz + \
+ encode_putfh_maxsz + \
+@@ -313,6 +322,16 @@ static int nfs_stat_to_errno(int);
+ decode_savefh_maxsz + \
+ decode_putfh_maxsz + \
+ decode_link_maxsz)
++#define NFS4_enc_symlink_sz (compound_encode_hdr_maxsz + \
++ encode_putfh_maxsz + \
++ encode_symlink_maxsz + \
++ encode_getattr_maxsz + \
++ encode_getfh_maxsz)
++#define NFS4_dec_symlink_sz (compound_decode_hdr_maxsz + \
++ decode_putfh_maxsz + \
++ decode_symlink_maxsz + \
++ decode_getattr_maxsz + \
++ decode_getfh_maxsz)
+ #define NFS4_enc_create_sz (compound_encode_hdr_maxsz + \
+ encode_putfh_maxsz + \
+ encode_create_maxsz + \
+@@ -339,6 +358,33 @@ static int nfs_stat_to_errno(int);
+ encode_getattr_maxsz)
+ #define NFS4_dec_server_caps_sz (compound_decode_hdr_maxsz + \
+ decode_getattr_maxsz)
++#define NFS4_enc_delegreturn_sz (compound_encode_hdr_maxsz + \
++ encode_putfh_maxsz + \
++ encode_delegreturn_maxsz)
++#define NFS4_dec_delegreturn_sz (compound_decode_hdr_maxsz + \
++ decode_delegreturn_maxsz)
++#define username_maxsz (1 + ((IDMAP_NAMESZ + 3) >> 2))
++/* XXX: fix ACL bounds */
++#define ace_maxsz (3 + username_maxsz)
++#define NFS_ACL_MAX_ENTRIES 32
++#define acl_maxentries ((NFS_ACL_MAX_ENTRIES - 3) * 3 + 6)
++#define acl_maxsz (1 + acl_maxentries * ace_maxsz)
++#define NFS4_enc_getacl_sz compound_encode_hdr_maxsz + \
++ encode_putfh_maxsz + \
++ encode_getattr_maxsz
++#define username_maxsz (1 + ((IDMAP_NAMESZ + 3) >> 2))
++#define ace_maxsz (3 + username_maxsz)
++#define acl_maxentries ((NFS_ACL_MAX_ENTRIES - 3) * 3 + 6)
++#define acl_maxsz (1 + acl_maxentries * ace_maxsz)
++#define NFS4_dec_getacl_sz (compound_decode_hdr_maxsz + \
++ decode_putfh_maxsz + \
++ op_decode_hdr_maxsz + 3 + 1 + acl_maxsz)
++#define NFS4_enc_setacl_sz (compound_encode_hdr_maxsz + \
++ encode_putfh_maxsz + \
++ op_encode_hdr_maxsz + 4 + 1 + acl_maxsz)
++#define NFS4_dec_setacl_sz (compound_decode_hdr_maxsz + \
++ decode_putfh_maxsz + \
++ decode_setattr_maxsz)
+
+ static struct {
+ unsigned int mode;
+@@ -388,6 +434,15 @@ struct compound_hdr {
+ BUG_ON(!p); \
+ } while (0)
+
++static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
++{
++ uint32_t *p;
++
++ p = xdr_reserve_space(xdr, 4 + len);
++ BUG_ON(p == NULL);
++ xdr_encode_opaque(p, str, len);
++}
++
+ static int encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
+ {
+ uint32_t *p;
+@@ -402,6 +457,15 @@ static int encode_compound_hdr(struct xd
+ return 0;
+ }
+
++static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf)
++{
++ uint32_t *p;
++
++ p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE);
++ BUG_ON(p == NULL);
++ xdr_encode_opaque_fixed(p, verf->data, NFS4_VERIFIER_SIZE);
++}
++
+ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server)
+ {
+ char owner_name[IDMAP_NAMESZ];
+@@ -420,7 +484,7 @@ static int encode_attrs(struct xdr_strea
+ * In the worst-case, this would be
+ * 12(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime)
+ * = 36 bytes, plus any contribution from variable-length fields
+- * such as owner/group/acl's.
++ * such as owner/group.
+ */
+ len = 16;
+
+@@ -742,19 +806,12 @@ static int encode_lookup(struct xdr_stre
+ return 0;
+ }
+
+-static int encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg)
++static void encode_share_access(struct xdr_stream *xdr, int open_flags)
+ {
+- int status;
+ uint32_t *p;
+
+- /*
+- * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4,
+- * owner 4, opentype 4 = 36
+- */
+- RESERVE_SPACE(36);
+- WRITE32(OP_OPEN);
+- WRITE32(arg->seqid);
+- switch (arg->share_access) {
++ RESERVE_SPACE(8);
++ switch (open_flags & (FMODE_READ|FMODE_WRITE)) {
+ case FMODE_READ:
+ WRITE32(NFS4_SHARE_ACCESS_READ);
+ break;
+@@ -767,84 +824,135 @@ static int encode_open(struct xdr_stream
+ default:
+ BUG();
+ }
+- WRITE32(0); /* for linux, share_deny = 0 always */
++ WRITE32(0); /* for linux, share_deny = 0 always */
++}
++
++static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg)
++{
++ uint32_t *p;
++ /*
++ * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4,
++ * owner 4 = 32
++ */
++ RESERVE_SPACE(8);
++ WRITE32(OP_OPEN);
++ WRITE32(arg->seqid);
++ encode_share_access(xdr, arg->open_flags);
++ RESERVE_SPACE(16);
+ WRITE64(arg->clientid);
+ WRITE32(4);
+ WRITE32(arg->id);
+- WRITE32(arg->opentype);
++}
+
+- if (arg->opentype == NFS4_OPEN_CREATE) {
+- if (arg->createmode == NFS4_CREATE_EXCLUSIVE) {
+- RESERVE_SPACE(12);
+- WRITE32(arg->createmode);
+- WRITEMEM(arg->u.verifier.data, sizeof(arg->u.verifier.data));
+- }
+- else if (arg->u.attrs) {
+- RESERVE_SPACE(4);
+- WRITE32(arg->createmode);
+- if ((status = encode_attrs(xdr, arg->u.attrs, arg->server)))
+- return status;
+- }
+- else {
+- RESERVE_SPACE(12);
+- WRITE32(arg->createmode);
+- WRITE32(0);
+- WRITE32(0);
+- }
++static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
++{
++ uint32_t *p;
++
++ RESERVE_SPACE(4);
++ switch(arg->open_flags & O_EXCL) {
++ case 0:
++ WRITE32(NFS4_CREATE_UNCHECKED);
++ encode_attrs(xdr, arg->u.attrs, arg->server);
++ break;
++ default:
++ WRITE32(NFS4_CREATE_EXCLUSIVE);
++ encode_nfs4_verifier(xdr, &arg->u.verifier);
+ }
++}
+
+- RESERVE_SPACE(8 + arg->name->len);
+- WRITE32(NFS4_OPEN_CLAIM_NULL);
+- WRITE32(arg->name->len);
+- WRITEMEM(arg->name->name, arg->name->len);
++static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *arg)
++{
++ uint32_t *p;
+
+- return 0;
++ RESERVE_SPACE(4);
++ switch (arg->open_flags & O_CREAT) {
++ case 0:
++ WRITE32(NFS4_OPEN_NOCREATE);
++ break;
++ default:
++ BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL);
++ WRITE32(NFS4_OPEN_CREATE);
++ encode_createmode(xdr, arg);
++ }
+ }
+
+-static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg)
++static inline void encode_delegation_type(struct xdr_stream *xdr, int delegation_type)
+ {
+ uint32_t *p;
+
+- RESERVE_SPACE(8+sizeof(arg->stateid.data));
+- WRITE32(OP_OPEN_CONFIRM);
+- WRITEMEM(arg->stateid.data, sizeof(arg->stateid.data));
+- WRITE32(arg->seqid);
++ RESERVE_SPACE(4);
++ switch (delegation_type) {
++ case 0:
++ WRITE32(NFS4_OPEN_DELEGATE_NONE);
++ break;
++ case FMODE_READ:
++ WRITE32(NFS4_OPEN_DELEGATE_READ);
++ break;
++ case FMODE_WRITE|FMODE_READ:
++ WRITE32(NFS4_OPEN_DELEGATE_WRITE);
++ break;
++ default:
++ BUG();
++ }
++}
+
+- return 0;
++static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *name)
++{
++ uint32_t *p;
++
++ RESERVE_SPACE(4);
++ WRITE32(NFS4_OPEN_CLAIM_NULL);
++ encode_string(xdr, name->len, name->name);
+ }
+
++static inline void encode_claim_previous(struct xdr_stream *xdr, int type)
++{
++ uint32_t *p;
++
++ RESERVE_SPACE(4);
++ WRITE32(NFS4_OPEN_CLAIM_PREVIOUS);
++ encode_delegation_type(xdr, type);
++}
+
+-static int encode_open_reclaim(struct xdr_stream *xdr, const struct nfs_open_reclaimargs *arg)
++static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struct qstr *name, const nfs4_stateid *stateid)
+ {
+ uint32_t *p;
+
+- /*
+- * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4,
+- * owner 4, opentype 4, claim 4, delegation_type 4 = 44
+- */
+- RESERVE_SPACE(44);
+- WRITE32(OP_OPEN);
+- WRITE32(arg->seqid);
+- switch (arg->share_access) {
+- case FMODE_READ:
+- WRITE32(NFS4_SHARE_ACCESS_READ);
++ RESERVE_SPACE(4+sizeof(stateid->data));
++ WRITE32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
++ WRITEMEM(stateid->data, sizeof(stateid->data));
++ encode_string(xdr, name->len, name->name);
++}
++
++static int encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg)
++{
++ encode_openhdr(xdr, arg);
++ encode_opentype(xdr, arg);
++ switch (arg->claim) {
++ case NFS4_OPEN_CLAIM_NULL:
++ encode_claim_null(xdr, arg->name);
+ break;
+- case FMODE_WRITE:
+- WRITE32(NFS4_SHARE_ACCESS_WRITE);
++ case NFS4_OPEN_CLAIM_PREVIOUS:
++ encode_claim_previous(xdr, arg->u.delegation_type);
+ break;
+- case FMODE_READ|FMODE_WRITE:
+- WRITE32(NFS4_SHARE_ACCESS_BOTH);
++ case NFS4_OPEN_CLAIM_DELEGATE_CUR:
++ encode_claim_delegate_cur(xdr, arg->name, &arg->u.delegation);
+ break;
+ default:
+ BUG();
+ }
+- WRITE32(0); /* for linux, share_deny = 0 always */
+- WRITE64(arg->clientid);
+- WRITE32(4);
+- WRITE32(arg->id);
+- WRITE32(NFS4_OPEN_NOCREATE);
+- WRITE32(NFS4_OPEN_CLAIM_PREVIOUS);
+- WRITE32(NFS4_OPEN_DELEGATE_NONE);
++ return 0;
++}
++
++static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg)
++{
++ uint32_t *p;
++
++ RESERVE_SPACE(8+sizeof(arg->stateid.data));
++ WRITE32(OP_OPEN_CONFIRM);
++ WRITEMEM(arg->stateid.data, sizeof(arg->stateid.data));
++ WRITE32(arg->seqid);
++
+ return 0;
+ }
+
+@@ -852,14 +960,11 @@ static int encode_open_downgrade(struct
+ {
+ uint32_t *p;
+
+- RESERVE_SPACE(16+sizeof(arg->stateid.data));
++ RESERVE_SPACE(8+sizeof(arg->stateid.data));
+ WRITE32(OP_OPEN_DOWNGRADE);
+ WRITEMEM(arg->stateid.data, sizeof(arg->stateid.data));
+ WRITE32(arg->seqid);
+- WRITE32(arg->share_access);
+- /* No deny modes */
+- WRITE32(0);
+-
++ encode_share_access(xdr, arg->open_flags);
+ return 0;
+ }
+
+@@ -887,15 +992,15 @@ static int encode_putrootfh(struct xdr_s
+ return 0;
+ }
+
+-static void encode_stateid(struct xdr_stream *xdr, struct nfs4_state *state, fl_owner_t lockowner)
++static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx)
+ {
+ extern nfs4_stateid zero_stateid;
+ nfs4_stateid stateid;
+ uint32_t *p;
+
+ RESERVE_SPACE(16);
+- if (state != NULL) {
+- nfs4_copy_stateid(&stateid, state, lockowner);
++ if (ctx->state != NULL) {
++ nfs4_copy_stateid(&stateid, ctx->state, ctx->pid);
+ WRITEMEM(stateid.data, sizeof(stateid.data));
+ } else
+ WRITEMEM(zero_stateid.data, sizeof(zero_stateid.data));
+@@ -908,7 +1013,7 @@ static int encode_read(struct xdr_stream
+ RESERVE_SPACE(4);
+ WRITE32(OP_READ);
+
+- encode_stateid(xdr, args->state, args->lockowner);
++ encode_stateid(xdr, args->context);
+
+ RESERVE_SPACE(12);
+ WRITE64(args->offset);
+@@ -1003,6 +1108,45 @@ static int encode_renew(struct xdr_strea
+ return 0;
+ }
+
++extern nfs4_stateid zero_stateid;
++
++static int
++encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg)
++{
++ uint32_t *p;
++ uint32_t *q = (uint32_t *)arg->acl;
++ uint32_t *end = (uint32_t *)(arg->acl + arg->acl_len);
++ uint32_t tmp;
++ int naces, i;
++
++ RESERVE_SPACE(4+sizeof(zero_stateid.data));
++ WRITE32(OP_SETATTR);
++ WRITEMEM(zero_stateid.data, sizeof(zero_stateid.data));
++ RESERVE_SPACE(4*4);
++ WRITE32(1);
++ WRITE32(FATTR4_WORD0_ACL);
++ WRITE32(arg->acl_len);
++ if (q + 1 > end)
++ return -EINVAL;
++ naces = ntohl(*q++);
++ WRITE32(naces);
++ for (i = 0; i < naces; i++) {
++ if (q + 4 > end)
++ return -EINVAL;
++ RESERVE_SPACE(3*4);
++ memcpy(p, q, 3*4); /* type, flag, access_mask, length */
++ q += 3;
++ tmp = ntohl(*q++); /* length */
++ if (tmp > XDR_MAX_NETOBJ)
++ return -EINVAL;
++ if (q + XDR_QUADLEN(tmp) > end)
++ return -EINVAL;
++ RESERVE_SPACE((XDR_QUADLEN(tmp) << 2) + 4);
++ p = xdr_encode_opaque(p, q, tmp);
++ }
++ return 0;
++}
++
+ static int
+ encode_savefh(struct xdr_stream *xdr)
+ {
+@@ -1031,26 +1175,18 @@ static int encode_setattr(struct xdr_str
+
+ static int encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid)
+ {
+- uint32_t total_len;
+- uint32_t len1, len2, len3;
+ uint32_t *p;
+
+- len1 = strlen(setclientid->sc_name);
+- len2 = strlen(setclientid->sc_netid);
+- len3 = strlen(setclientid->sc_uaddr);
+- total_len = XDR_QUADLEN(len1) + XDR_QUADLEN(len2) + XDR_QUADLEN(len3);
+- total_len = (total_len << 2) + 24 + sizeof(setclientid->sc_verifier.data);
+-
+- RESERVE_SPACE(total_len);
++ RESERVE_SPACE(4 + sizeof(setclientid->sc_verifier->data));
+ WRITE32(OP_SETCLIENTID);
+- WRITEMEM(setclientid->sc_verifier.data, sizeof(setclientid->sc_verifier.data));
+- WRITE32(len1);
+- WRITEMEM(setclientid->sc_name, len1);
++ WRITEMEM(setclientid->sc_verifier->data, sizeof(setclientid->sc_verifier->data));
++
++ encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name);
++ RESERVE_SPACE(4);
+ WRITE32(setclientid->sc_prog);
+- WRITE32(len2);
+- WRITEMEM(setclientid->sc_netid, len2);
+- WRITE32(len3);
+- WRITEMEM(setclientid->sc_uaddr, len3);
++ encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid);
++ encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
++ RESERVE_SPACE(4);
+ WRITE32(setclientid->sc_cb_ident);
+
+ return 0;
+@@ -1075,7 +1211,7 @@ static int encode_write(struct xdr_strea
+ RESERVE_SPACE(4);
+ WRITE32(OP_WRITE);
+
+- encode_stateid(xdr, args->state, args->lockowner);
++ encode_stateid(xdr, args->context);
+
+ RESERVE_SPACE(16);
+ WRITE64(args->offset);
+@@ -1086,6 +1222,18 @@ static int encode_write(struct xdr_strea
+
+ return 0;
+ }
++
++static int encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid)
++{
++ uint32_t *p;
++
++ RESERVE_SPACE(20);
++
++ WRITE32(OP_DELEGRETURN);
++ WRITEMEM(stateid->data, sizeof(stateid->data));
++ return 0;
++
++}
+ /*
+ * END OF "GENERIC" ENCODE ROUTINES.
+ */
+@@ -1244,6 +1392,14 @@ out:
+ }
+
+ /*
++ * Encode SYMLINK request
++ */
++static int nfs4_xdr_enc_symlink(struct rpc_rqst *req, uint32_t *p, const struct nfs4_create_arg *args)
++{
++ return nfs4_xdr_enc_create(req, p, args);
++}
++
++/*
+ * Encode GETATTR request
+ */
+ static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, uint32_t *p, const struct nfs4_getattr_arg *args)
+@@ -1331,13 +1487,13 @@ out:
+ }
+
+ /*
+- * Encode an OPEN request
++ * Encode an OPEN request with no attributes.
+ */
+-static int nfs4_xdr_enc_open_reclaim(struct rpc_rqst *req, uint32_t *p, struct nfs_open_reclaimargs *args)
++static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, uint32_t *p, struct nfs_openargs *args)
+ {
+ struct xdr_stream xdr;
+ struct compound_hdr hdr = {
+- .nops = 3,
++ .nops = 2,
+ };
+ int status;
+
+@@ -1346,10 +1502,7 @@ static int nfs4_xdr_enc_open_reclaim(str
+ status = encode_putfh(&xdr, args->fh);
+ if (status)
+ goto out;
+- status = encode_open_reclaim(&xdr, args);
+- if (status)
+- goto out;
+- status = encode_getfattr(&xdr, args->bitmask);
++ status = encode_open(&xdr, args);
+ out:
+ return status;
+ }
+@@ -1538,6 +1691,52 @@ out:
+ }
+
+ /*
++ * Encode an SETACL request
++ */
++static int
++nfs4_xdr_enc_setacl(struct rpc_rqst *req, uint32_t *p, struct nfs_setaclargs *args)
++
++{
++ struct xdr_stream xdr;
++ struct compound_hdr hdr = {
++ .nops = 2,
++ };
++ int status;
++
++ xdr_init_encode(&xdr, &req->rq_snd_buf, p);
++ encode_compound_hdr(&xdr, &hdr);
++ status = encode_putfh(&xdr, args->fh);
++ if(status)
++ goto out;
++ status = encode_setacl(&xdr, args);
++out:
++ return status;
++}
++
++/*
++ * Encode a GETACL request
++ */
++static int
++nfs4_xdr_enc_getacl(struct rpc_rqst *req, uint32_t *p,struct nfs_fh *fhandle)
++{
++ struct xdr_stream xdr;
++ struct compound_hdr hdr = {
++ .nops = 2,
++ };
++ int status;
++
++ xdr_init_encode(&xdr, &req->rq_snd_buf, p);
++ encode_compound_hdr(&xdr, &hdr);
++ status = encode_putfh(&xdr, fhandle);
++ if (status)
++ goto out;
++ status = encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0);
++out:
++ return status;
++
++}
++
++/*
+ * Encode a WRITE request
+ */
+ static int nfs4_xdr_enc_write(struct rpc_rqst *req, uint32_t *p, struct nfs_writeargs *args)
+@@ -1716,6 +1915,24 @@ static int nfs4_xdr_enc_setclientid_conf
+ }
+
+ /*
++ * DELEGRETURN request
++ */
++static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, uint32_t *p, const struct nfs4_delegreturnargs *args)
++{
++ struct xdr_stream xdr;
++ struct compound_hdr hdr = {
++ .nops = 2,
++ };
++ int status;
++
++ xdr_init_encode(&xdr, &req->rq_snd_buf, p);
++ encode_compound_hdr(&xdr, &hdr);
++ if ((status = encode_putfh(&xdr, args->fhandle)) == 0)
++ status = encode_delegreturn(&xdr, args->stateid);
++ return status;
++}
++
++/*
+ * START OF "GENERIC" DECODE ROUTINES.
+ * These may look a little ugly since they are imported from a "generic"
+ * set of XDR encode/decode routines which are intended to be shared by
+@@ -1749,6 +1966,17 @@ static int nfs4_xdr_enc_setclientid_conf
+ } \
+ } while (0)
+
++static int decode_opaque_inline(struct xdr_stream *xdr, uint32_t *len, char **string)
++{
++ uint32_t *p;
++
++ READ_BUF(4);
++ READ32(*len);
++ READ_BUF(*len);
++ *string = (char *)p;
++ return 0;
++}
++
+ static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
+ {
+ uint32_t *p;
+@@ -1785,6 +2013,17 @@ static int decode_op_hdr(struct xdr_stre
+ return 0;
+ }
+
++/* Dummy routine */
++static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs4_client *clp)
++{
++ uint32_t *p;
++ uint32_t strlen;
++ char *str;
++
++ READ_BUF(12);
++ return decode_opaque_inline(xdr, &strlen, &str);
++}
++
+ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
+ {
+ uint32_t bmlen, *p;
+@@ -2717,10 +2956,56 @@ static int decode_lookup(struct xdr_stre
+ return decode_op_hdr(xdr, OP_LOOKUP);
+ }
+
++/* This is too sick! */
++static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
++{
++ uint32_t *p;
++ uint32_t limit_type, nblocks, blocksize;
++
++ READ_BUF(12);
++ READ32(limit_type);
++ switch (limit_type) {
++ case 1:
++ READ64(*maxsize);
++ break;
++ case 2:
++ READ32(nblocks);
++ READ32(blocksize);
++ *maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
++ }
++ return 0;
++}
++
++static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
++{
++ uint32_t *p;
++ uint32_t delegation_type;
++
++ READ_BUF(4);
++ READ32(delegation_type);
++ if (delegation_type == NFS4_OPEN_DELEGATE_NONE) {
++ res->delegation_type = 0;
++ return 0;
++ }
++ READ_BUF(20);
++ COPYMEM(res->delegation.data, sizeof(res->delegation.data));
++ READ32(res->do_recall);
++ switch (delegation_type) {
++ case NFS4_OPEN_DELEGATE_READ:
++ res->delegation_type = FMODE_READ;
++ break;
++ case NFS4_OPEN_DELEGATE_WRITE:
++ res->delegation_type = FMODE_WRITE|FMODE_READ;
++ if (decode_space_limit(xdr, &res->maxsize) < 0)
++ return -EIO;
++ }
++ return decode_ace(xdr, NULL, res->server->nfs4_state);
++}
++
+ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
+ {
+ uint32_t *p;
+- uint32_t bmlen, delegation_type;
++ uint32_t bmlen;
+ int status;
+
+ status = decode_op_hdr(xdr, OP_OPEN);
+@@ -2737,11 +3022,9 @@ static int decode_open(struct xdr_stream
+ if (bmlen > 10)
+ goto xdr_error;
+
+- READ_BUF((bmlen << 2) + 4);
++ READ_BUF(bmlen << 2);
+ p += bmlen;
+- READ32(delegation_type);
+- if (delegation_type == NFS4_OPEN_DELEGATE_NONE)
+- return 0;
++ return decode_delegation(xdr, res);
+ xdr_error:
+ printk(KERN_NOTICE "%s: xdr error!\n", __FUNCTION__);
+ return -EIO;
+@@ -2967,6 +3250,72 @@ static int decode_renew(struct xdr_strea
+ return decode_op_hdr(xdr, OP_RENEW);
+ }
+
++static int decode_attr_acl(struct xdr_stream *xdr, uint32_t *bitmap,
++ struct nfs_getaclres *res)
++{
++ uint32_t *p;
++
++ if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U)))
++ return -EIO;
++ if (likely(bitmap[0] & FATTR4_WORD0_ACL)) {
++ ssize_t size = res->acl_len;
++ uint32_t nace, tmp;
++ u32 *start;
++ int i;
++
++ res->acl_len = 0;
++ READ_BUF(4);
++ start = p;
++ READ32(nace);
++ res->acl_len += 4;
++
++ for (i = 0; i < nace; i++) {
++ READ_BUF(4*4);
++ res->acl_len += 4*4;
++ p += 3;
++ READ32(tmp); /* namelen */
++ READ_BUF(tmp);
++ if (tmp > XDR_MAX_NETOBJ) {
++ printk(KERN_WARNING "%s: name too long (%u)!\n",
++ __FUNCTION__, tmp);
++ return -EIO;
++ }
++ res->acl_len += XDR_QUADLEN(tmp) << 2;
++ }
++ if (size && res->acl_len > size)
++ return -ERANGE;
++ if (size == 0 && res->acl_len <= XATTR_SIZE_MAX)
++ res->acl = kmalloc(res->acl_len, GFP_KERNEL);
++ if (res->acl)
++ memcpy(res->acl, start, res->acl_len);
++ }
++ return 0;
++}
++
++static int decode_getacl(struct xdr_stream *xdr, struct nfs_getaclres *res)
++{
++ uint32_t *savep;
++ uint32_t attrlen,
++ bitmap[2] = {0};
++ int status;
++
++ if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
++ goto xdr_error;
++ if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
++ goto xdr_error;
++ if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
++ goto xdr_error;
++
++ if ((status = decode_attr_acl(xdr, bitmap, res)) != 0)
++ goto xdr_error;
++
++ status = verify_attr_len(xdr, savep, attrlen);
++xdr_error:
++ if (status != 0)
++ printk(KERN_NOTICE "%s: xdr error %d!\n", __FUNCTION__, -status);
++ return status;
++}
++
+ static int
+ decode_savefh(struct xdr_stream *xdr)
+ {
+@@ -3048,6 +3397,11 @@ static int decode_write(struct xdr_strea
+ return 0;
+ }
+
++static int decode_delegreturn(struct xdr_stream *xdr)
++{
++ return decode_op_hdr(xdr, OP_DELEGRETURN);
++}
++
+ /*
+ * Decode OPEN_DOWNGRADE response
+ */
+@@ -3222,6 +3576,14 @@ out:
+ }
+
+ /*
++ * Decode SYMLINK response
++ */
++static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_create_res *res)
++{
++ return nfs4_xdr_dec_create(rqstp, p, res);
++}
++
++/*
+ * Decode GETATTR response
+ */
+ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_getattr_res *res)
+@@ -3243,6 +3605,50 @@ out:
+
+ }
+
++/*
++ * Decode SETACL response
++ */
++static int
++nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, uint32_t *p, void *res)
++{
++ struct xdr_stream xdr;
++ struct compound_hdr hdr;
++ int status;
++
++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
++ status = decode_compound_hdr(&xdr, &hdr);
++ if (status)
++ goto out;
++ status = decode_putfh(&xdr);
++ if (status)
++ goto out;
++ status = decode_setattr(&xdr, res);
++out:
++ return status;
++}
++
++/*
++ * Decode GETACL response
++ */
++static int
++nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_getaclres *res)
++{
++ struct xdr_stream xdr;
++ struct compound_hdr hdr;
++ int status;
++
++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
++ status = decode_compound_hdr(&xdr, &hdr);
++ if (status)
++ goto out;
++ status = decode_putfh(&xdr);
++ if (status)
++ goto out;
++ status = decode_getacl(&xdr, res);
++
++out:
++ return status;
++}
+
+ /*
+ * Decode CLOSE response
+@@ -3314,9 +3720,9 @@ out:
+ }
+
+ /*
+- * Decode OPEN_RECLAIM response
++ * Decode OPEN response
+ */
+-static int nfs4_xdr_dec_open_reclaim(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_openres *res)
++static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_openres *res)
+ {
+ struct xdr_stream xdr;
+ struct compound_hdr hdr;
+@@ -3330,9 +3736,6 @@ static int nfs4_xdr_dec_open_reclaim(str
+ if (status)
+ goto out;
+ status = decode_open(&xdr, res);
+- if (status)
+- goto out;
+- status = decode_getfattr(&xdr, res->f_attr, res->server);
+ out:
+ return status;
+ }
+@@ -3665,6 +4068,25 @@ static int nfs4_xdr_dec_setclientid_conf
+ return status;
+ }
+
++/*
++ * DELEGRETURN request
++ */
++static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, uint32_t *p, void *dummy)
++{
++ struct xdr_stream xdr;
++ struct compound_hdr hdr;
++ int status;
++
++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
++ status = decode_compound_hdr(&xdr, &hdr);
++ if (status == 0) {
++ status = decode_putfh(&xdr);
++ if (status == 0)
++ status = decode_delegreturn(&xdr);
++ }
++ return status;
++}
++
+ uint32_t *nfs4_decode_dirent(uint32_t *p, struct nfs_entry *entry, int plus)
+ {
+ uint32_t len;
+@@ -3756,7 +4178,7 @@ nfs_stat_to_errno(int stat)
+ if (nfs_errtbl[i].stat == stat)
+ return nfs_errtbl[i].errno;
+ }
+- if (stat < 0) {
++ if (stat <= 10000 || stat > 10100) {
+ /* The server is looney tunes. */
+ return ESERVERFAULT;
+ }
+@@ -3786,7 +4208,7 @@ struct rpc_procinfo nfs4_procedures[] =
+ PROC(COMMIT, enc_commit, dec_commit),
+ PROC(OPEN, enc_open, dec_open),
+ PROC(OPEN_CONFIRM, enc_open_confirm, dec_open_confirm),
+- PROC(OPEN_RECLAIM, enc_open_reclaim, dec_open_reclaim),
++ PROC(OPEN_NOATTR, enc_open_noattr, dec_open_noattr),
+ PROC(OPEN_DOWNGRADE, enc_open_downgrade, dec_open_downgrade),
+ PROC(CLOSE, enc_close, dec_close),
+ PROC(SETATTR, enc_setattr, dec_setattr),
+@@ -3804,12 +4226,16 @@ struct rpc_procinfo nfs4_procedures[] =
+ PROC(REMOVE, enc_remove, dec_remove),
+ PROC(RENAME, enc_rename, dec_rename),
+ PROC(LINK, enc_link, dec_link),
++ PROC(SYMLINK, enc_symlink, dec_symlink),
+ PROC(CREATE, enc_create, dec_create),
+ PROC(PATHCONF, enc_pathconf, dec_pathconf),
+ PROC(STATFS, enc_statfs, dec_statfs),
+ PROC(READLINK, enc_readlink, dec_readlink),
+ PROC(READDIR, enc_readdir, dec_readdir),
+ PROC(SERVER_CAPS, enc_server_caps, dec_server_caps),
++ PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn),
++ PROC(GETACL, enc_getacl, dec_getacl),
++ PROC(SETACL, enc_setacl, dec_setacl),
+ };
+
+ struct rpc_version nfs_version4 = {
+--- linux-2.6.7/fs/nfs/pagelist.c.lsec 2004-06-15 23:20:03.000000000 -0600
++++ linux-2.6.7/fs/nfs/pagelist.c 2005-03-23 14:28:23.057544416 -0700
+@@ -36,7 +36,6 @@ nfs_page_alloc(void)
+ if (p) {
+ memset(p, 0, sizeof(*p));
+ INIT_LIST_HEAD(&p->wb_list);
+- init_waitqueue_head(&p->wb_wait);
+ }
+ return p;
+ }
+@@ -62,7 +61,7 @@ nfs_page_free(struct nfs_page *p)
+ * User should ensure it is safe to sleep in this function.
+ */
+ struct nfs_page *
+-nfs_create_request(struct file *file, struct inode *inode,
++nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
+ struct page *page,
+ unsigned int offset, unsigned int count)
+ {
+@@ -94,33 +93,38 @@ nfs_create_request(struct file *file, st
+ req->wb_offset = offset;
+ req->wb_pgbase = offset;
+ req->wb_bytes = count;
+- req->wb_inode = inode;
+ req->wb_count = 1;
+- server->rpc_ops->request_init(req, file);
++ req->wb_context = get_nfs_open_context(ctx);
+
+ return req;
+ }
+
+ /**
++ * nfs_unlock_request - Unlock request and wake up sleepers.
++ * @req:
++ */
++void nfs_unlock_request(struct nfs_page *req)
++{
++ if (!NFS_WBACK_BUSY(req)) {
++ printk(KERN_ERR "NFS: Invalid unlock attempted\n");
++ BUG();
++ }
++ smp_mb__before_clear_bit();
++ clear_bit(PG_BUSY, &req->wb_flags);
++ smp_mb__after_clear_bit();
++ wake_up_all(&req->wb_context->waitq);
++ nfs_release_request(req);
++}
++
++/**
+ * nfs_clear_request - Free up all resources allocated to the request
+ * @req:
+ *
+- * Release all resources associated with a write request after it
++ * Release page resources associated with a write request after it
+ * has completed.
+ */
+ void nfs_clear_request(struct nfs_page *req)
+ {
+- if (req->wb_state)
+- req->wb_state = NULL;
+- /* Release struct file or cached credential */
+- if (req->wb_file) {
+- fput(req->wb_file);
+- req->wb_file = NULL;
+- }
+- if (req->wb_cred) {
+- put_rpccred(req->wb_cred);
+- req->wb_cred = NULL;
+- }
+ if (req->wb_page) {
+ page_cache_release(req->wb_page);
+ req->wb_page = NULL;
+@@ -151,6 +155,7 @@ nfs_release_request(struct nfs_page *req
+
+ /* Release struct file or cached credential */
+ nfs_clear_request(req);
++ put_nfs_open_context(req->wb_context);
+ nfs_page_free(req);
+ }
+
+@@ -194,12 +199,12 @@ nfs_list_add_request(struct nfs_page *re
+ int
+ nfs_wait_on_request(struct nfs_page *req)
+ {
+- struct inode *inode = req->wb_inode;
++ struct inode *inode = req->wb_context->dentry->d_inode;
+ struct rpc_clnt *clnt = NFS_CLIENT(inode);
+
+ if (!NFS_WBACK_BUSY(req))
+ return 0;
+- return nfs_wait_event(clnt, req->wb_wait, !NFS_WBACK_BUSY(req));
++ return nfs_wait_event(clnt, req->wb_context->waitq, !NFS_WBACK_BUSY(req));
+ }
+
+ /**
+@@ -224,7 +229,11 @@ nfs_coalesce_requests(struct list_head *
+
+ req = nfs_list_entry(head->next);
+ if (prev) {
+- if (req->wb_cred != prev->wb_cred)
++ if (req->wb_context->cred != prev->wb_context->cred)
++ break;
++ if (req->wb_context->pid != prev->wb_context->pid)
++ break;
++ if (req->wb_context->state != prev->wb_context->state)
+ break;
+ if (req->wb_index != (prev->wb_index + 1))
+ break;
+--- linux-2.6.7/fs/nfs/nfs4proc.c.lsec 2004-06-15 23:19:44.000000000 -0600
++++ linux-2.6.7/fs/nfs/nfs4proc.c 2005-03-23 14:32:35.532162440 -0700
+@@ -47,12 +47,16 @@
+ #include <linux/smp_lock.h>
+ #include <linux/namei.h>
+
++#include "delegation.h"
++
+ #define NFSDBG_FACILITY NFSDBG_PROC
+
+-#define NFS4_POLL_RETRY_TIME (15*HZ)
++#define NFS4_POLL_RETRY_MIN (1*HZ)
++#define NFS4_POLL_RETRY_MAX (15*HZ)
+
+ static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
+ static int nfs4_async_handle_error(struct rpc_task *, struct nfs_server *);
++static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry);
+ extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus);
+ extern struct rpc_procinfo nfs4_procedures[];
+
+@@ -189,53 +193,296 @@ static void update_changeattr(struct ino
+ * reclaim state on the server after a reboot.
+ * Assumes caller is holding the sp->so_sem
+ */
+-int
+-nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state)
++static int _nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state)
+ {
+ struct inode *inode = state->inode;
+ struct nfs_server *server = NFS_SERVER(inode);
+- struct nfs_fattr fattr = {
+- .valid = 0,
+- };
+- struct nfs_open_reclaimargs o_arg = {
++ struct nfs_delegation *delegation = NFS_I(inode)->delegation;
++ struct nfs_openargs o_arg = {
+ .fh = NFS_FH(inode),
+ .seqid = sp->so_seqid,
+ .id = sp->so_id,
+- .share_access = state->state,
++ .open_flags = state->state,
+ .clientid = server->nfs4_state->cl_clientid,
+ .claim = NFS4_OPEN_CLAIM_PREVIOUS,
+ .bitmask = server->attr_bitmask,
+ };
+ struct nfs_openres o_res = {
+- .f_attr = &fattr,
+ .server = server, /* Grrr */
+ };
+ struct rpc_message msg = {
+- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_RECLAIM],
++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR],
+ .rpc_argp = &o_arg,
+ .rpc_resp = &o_res,
+ .rpc_cred = sp->so_cred,
+ };
+ int status;
+
++ if (delegation != NULL) {
++ if (!(delegation->flags & NFS_DELEGATION_NEED_RECLAIM)) {
++ memcpy(&state->stateid, &delegation->stateid,
++ sizeof(state->stateid));
++ set_bit(NFS_DELEGATED_STATE, &state->flags);
++ return 0;
++ }
++ o_arg.u.delegation_type = delegation->type;
++ }
+ status = rpc_call_sync(server->client, &msg, 0);
+ nfs4_increment_seqid(status, sp);
+- if (status == 0)
++ if (status == 0) {
+ memcpy(&state->stateid, &o_res.stateid, sizeof(state->stateid));
+- /* Update the inode attributes */
+- nfs_refresh_inode(inode, &fattr);
++ if (o_res.delegation_type != 0) {
++ nfs_inode_reclaim_delegation(inode, sp->so_cred, &o_res);
++ /* Did the server issue an immediate delegation recall? */
++ if (o_res.do_recall)
++ nfs_async_inode_return_delegation(inode, &o_res.stateid);
++ }
++ }
++ clear_bit(NFS_DELEGATED_STATE, &state->flags);
++ /* Ensure we update the inode attributes */
++ NFS_CACHEINV(inode);
+ return status;
+ }
+
++int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state)
++{
++ struct nfs_server *server = NFS_SERVER(state->inode);
++ struct nfs4_exception exception = { };
++ int err;
++ do {
++ err = _nfs4_open_reclaim(sp, state);
++ switch (err) {
++ case 0:
++ case -NFS4ERR_STALE_CLIENTID:
++ case -NFS4ERR_STALE_STATEID:
++ case -NFS4ERR_EXPIRED:
++ return err;
++ }
++ err = nfs4_handle_exception(server, err, &exception);
++ } while (exception.retry);
++ return err;
++}
++
++static int _nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state)
++{
++ struct nfs4_state_owner *sp = state->owner;
++ struct inode *inode = dentry->d_inode;
++ struct nfs_server *server = NFS_SERVER(inode);
++ struct dentry *parent = dget_parent(dentry);
++ struct nfs_openargs arg = {
++ .fh = NFS_FH(parent->d_inode),
++ .clientid = server->nfs4_state->cl_clientid,
++ .name = &dentry->d_name,
++ .id = sp->so_id,
++ .server = server,
++ .bitmask = server->attr_bitmask,
++ .claim = NFS4_OPEN_CLAIM_DELEGATE_CUR,
++ };
++ struct nfs_openres res = {
++ .server = server,
++ };
++ struct rpc_message msg = {
++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR],
++ .rpc_argp = &arg,
++ .rpc_resp = &res,
++ .rpc_cred = sp->so_cred,
++ };
++ int status = 0;
++
++ down(&sp->so_sema);
++ if (!test_bit(NFS_DELEGATED_STATE, &state->flags))
++ goto out;
++ if (state->state == 0)
++ goto out;
++ arg.seqid = sp->so_seqid;
++ arg.open_flags = state->state;
++ memcpy(arg.u.delegation.data, state->stateid.data, sizeof(arg.u.delegation.data));
++ status = rpc_call_sync(server->client, &msg, 0);
++ nfs4_increment_seqid(status, sp);
++ if (status >= 0) {
++ memcpy(state->stateid.data, res.stateid.data,
++ sizeof(state->stateid.data));
++ clear_bit(NFS_DELEGATED_STATE, &state->flags);
++ }
++out:
++ up(&sp->so_sema);
++ dput(parent);
++ return status;
++}
++
++int nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state)
++{
++ struct nfs4_exception exception = { };
++ struct nfs_server *server = NFS_SERVER(dentry->d_inode);
++ int err;
++ do {
++ err = _nfs4_open_delegation_recall(dentry, state);
++ switch (err) {
++ case 0:
++ return err;
++ case -NFS4ERR_STALE_CLIENTID:
++ case -NFS4ERR_STALE_STATEID:
++ case -NFS4ERR_EXPIRED:
++ /* Don't recall a delegation if it was lost */
++ nfs4_schedule_state_recovery(server->nfs4_state);
++ return err;
++ }
++ err = nfs4_handle_exception(server, err, &exception);
++ } while (exception.retry);
++ return err;
++}
++
++static int _nfs4_proc_open_confirm(struct rpc_clnt *clnt, const struct nfs_fh *fh, struct nfs4_state_owner *sp, nfs4_stateid *stateid)
++{
++ struct nfs_open_confirmargs arg = {
++ .fh = fh,
++ .seqid = sp->so_seqid,
++ .stateid = *stateid,
++ };
++ struct nfs_open_confirmres res;
++ struct rpc_message msg = {
++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_CONFIRM],
++ .rpc_argp = &arg,
++ .rpc_resp = &res,
++ .rpc_cred = sp->so_cred,
++ };
++ int status;
++
++ status = rpc_call_sync(clnt, &msg, 0);
++ nfs4_increment_seqid(status, sp);
++ if (status >= 0)
++ memcpy(stateid, &res.stateid, sizeof(*stateid));
++ return status;
++}
++
++static int _nfs4_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
++{
++ struct nfs_access_entry cache;
++ int status;
++
++ status = nfs_access_get_cached(inode, cred, &cache);
++ if (status == 0)
++ goto out;
++
++ /* Be clever: ask server to check for all possible rights */
++ cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ;
++ cache.cred = cred;
++ cache.jiffies = jiffies;
++ status = _nfs4_proc_access(inode, &cache);
++ if (status != 0)
++ return status;
++ nfs_access_add_cache(inode, &cache);
++out:
++ if ((cache.mask & mask) == mask)
++ return 0;
++ return -EACCES;
++}
++
++/*
++ * Returns an nfs4_state + an extra reference to the inode
++ */
++int _nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred *cred, struct nfs4_state **res)
++{
++ struct nfs_delegation *delegation;
++ struct nfs_server *server = NFS_SERVER(inode);
++ struct nfs4_client *clp = server->nfs4_state;
++ struct nfs_inode *nfsi = NFS_I(inode);
++ struct nfs4_state_owner *sp = NULL;
++ struct nfs4_state *state = NULL;
++ int open_flags = flags & (FMODE_READ|FMODE_WRITE);
++ int mask = 0;
++ int err;
++
++ /* Protect against reboot recovery - NOTE ORDER! */
++ down_read(&clp->cl_sem);
++ /* Protect against delegation recall */
++ down_read(&nfsi->rwsem);
++ delegation = NFS_I(inode)->delegation;
++ err = -ENOENT;
++ if (delegation == NULL || (delegation->type & open_flags) != open_flags)
++ goto out_err;
++ err = -ENOMEM;
++ if (!(sp = nfs4_get_state_owner(server, cred))) {
++ dprintk("%s: nfs4_get_state_owner failed!\n", __FUNCTION__);
++ goto out_err;
++ }
++ down(&sp->so_sema);
++ state = nfs4_get_open_state(inode, sp);
++ if (state == NULL)
++ goto out_err;
++
++ err = -ENOENT;
++ if ((state->state & open_flags) == open_flags) {
++ spin_lock(&inode->i_lock);
++ if (open_flags & FMODE_READ)
++ state->nreaders++;
++ if (open_flags & FMODE_WRITE)
++ state->nwriters++;
++ spin_unlock(&inode->i_lock);
++ goto out_ok;
++ } else if (state->state != 0)
++ goto out_err;
++
++ lock_kernel();
++ err = _nfs4_do_access(inode, cred, mask);
++ unlock_kernel();
++ if (err != 0)
++ goto out_err;
++ spin_lock(&inode->i_lock);
++ memcpy(state->stateid.data, delegation->stateid.data,
++ sizeof(state->stateid.data));
++ state->state |= open_flags;
++ if (open_flags & FMODE_READ)
++ state->nreaders++;
++ if (open_flags & FMODE_WRITE)
++ state->nwriters++;
++ set_bit(NFS_DELEGATED_STATE, &state->flags);
++ spin_unlock(&inode->i_lock);
++out_ok:
++ up(&sp->so_sema);
++ nfs4_put_state_owner(sp);
++ up_read(&nfsi->rwsem);
++ up_read(&clp->cl_sem);
++ igrab(inode);
++ *res = state;
++ return 0;
++out_err:
++ if (sp != NULL) {
++ if (state != NULL)
++ nfs4_put_open_state(state);
++ up(&sp->so_sema);
++ nfs4_put_state_owner(sp);
++ }
++ up_read(&nfsi->rwsem);
++ up_read(&clp->cl_sem);
++ return err;
++}
++
++static struct nfs4_state *nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred *cred)
++{
++ struct nfs4_exception exception = { };
++ struct nfs4_state *res;
++ int err;
++
++ do {
++ err = _nfs4_open_delegated(inode, flags, cred, &res);
++ if (err == 0)
++ break;
++ res = ERR_PTR(nfs4_handle_exception(NFS_SERVER(inode),
++ err, &exception));
++ } while (exception.retry);
++ return res;
++}
++
+ /*
+ * Returns an nfs4_state + an referenced inode
+ */
+-struct nfs4_state *
+-nfs4_do_open(struct inode *dir, struct qstr *name, int flags, struct iattr *sattr, struct rpc_cred *cred)
++static int _nfs4_do_open(struct inode *dir, struct qstr *name, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res)
+ {
+ struct nfs4_state_owner *sp;
+ struct nfs4_state *state = NULL;
+ struct nfs_server *server = NFS_SERVER(dir);
++ struct nfs4_client *clp = server->nfs4_state;
+ struct inode *inode = NULL;
+ int status;
+ struct nfs_fattr f_attr = {
+@@ -243,12 +490,11 @@ nfs4_do_open(struct inode *dir, struct q
+ };
+ struct nfs_openargs o_arg = {
+ .fh = NFS_FH(dir),
+- .share_access = flags & (FMODE_READ|FMODE_WRITE),
+- .opentype = (flags & O_CREAT) ? NFS4_OPEN_CREATE : NFS4_OPEN_NOCREATE,
+- .createmode = (flags & O_EXCL) ? NFS4_CREATE_EXCLUSIVE : NFS4_CREATE_UNCHECKED,
++ .open_flags = flags,
+ .name = name,
+ .server = server,
+ .bitmask = server->attr_bitmask,
++ .claim = NFS4_OPEN_CLAIM_NULL,
+ };
+ struct nfs_openres o_res = {
+ .f_attr = &f_attr,
+@@ -261,60 +507,44 @@ nfs4_do_open(struct inode *dir, struct q
+ .rpc_cred = cred,
+ };
+
+-retry:
++ /* Protect against reboot recovery conflicts */
++ down_read(&clp->cl_sem);
+ status = -ENOMEM;
+- if (!(sp = nfs4_get_state_owner(NFS_SERVER(dir), cred))) {
++ if (!(sp = nfs4_get_state_owner(server, cred))) {
+ dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n");
+- goto out;
++ goto out_err;
+ }
+- if (o_arg.createmode & NFS4_CREATE_EXCLUSIVE){
++ if (flags & O_EXCL) {
+ u32 *p = (u32 *) o_arg.u.verifier.data;
+ p[0] = jiffies;
+ p[1] = current->pid;
+- } else if (o_arg.createmode == NFS4_CREATE_UNCHECKED) {
++ } else
+ o_arg.u.attrs = sattr;
+- }
+ /* Serialization for the sequence id */
+ down(&sp->so_sema);
+ o_arg.seqid = sp->so_seqid;
+ o_arg.id = sp->so_id;
+- o_arg.clientid = NFS_SERVER(dir)->nfs4_state->cl_clientid,
++ o_arg.clientid = clp->cl_clientid;
+
+ status = rpc_call_sync(server->client, &msg, 0);
+ nfs4_increment_seqid(status, sp);
+ if (status)
+- goto out_up;
++ goto out_err;
+ update_changeattr(dir, &o_res.cinfo);
++ if(o_res.rflags & NFS4_OPEN_RESULT_CONFIRM) {
++ status = _nfs4_proc_open_confirm(server->client, &o_res.fh, sp, &o_res.stateid);
++ if (status)
++ goto out_err;
++ }
+
+ status = -ENOMEM;
+ inode = nfs_fhget(dir->i_sb, &o_res.fh, &f_attr);
+ if (!inode)
+- goto out_up;
++ goto out_err;
+ state = nfs4_get_open_state(inode, sp);
+ if (!state)
+- goto out_up;
+-
+- if(o_res.rflags & NFS4_OPEN_RESULT_CONFIRM) {
+- struct nfs_open_confirmargs oc_arg = {
+- .fh = &o_res.fh,
+- .seqid = sp->so_seqid,
+- };
+- struct nfs_open_confirmres oc_res;
+- struct rpc_message msg = {
+- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_CONFIRM],
+- .rpc_argp = &oc_arg,
+- .rpc_resp = &oc_res,
+- .rpc_cred = cred,
+- };
+-
+- memcpy(&oc_arg.stateid, &o_res.stateid, sizeof(oc_arg.stateid));
+- status = rpc_call_sync(server->client, &msg, 0);
+- nfs4_increment_seqid(status, sp);
+- if (status)
+- goto out_up;
+- memcpy(&state->stateid, &oc_res.stateid, sizeof(state->stateid));
+- } else
+- memcpy(&state->stateid, &o_res.stateid, sizeof(state->stateid));
++ goto out_err;
++ memcpy(&state->stateid, &o_res.stateid, sizeof(state->stateid));
+ spin_lock(&inode->i_lock);
+ if (flags & FMODE_READ)
+ state->nreaders++;
+@@ -322,47 +552,62 @@ retry:
+ state->nwriters++;
+ state->state |= flags & (FMODE_READ|FMODE_WRITE);
+ spin_unlock(&inode->i_lock);
+-
++ if (o_res.delegation_type != 0)
++ nfs_inode_set_delegation(inode, cred, &o_res);
+ up(&sp->so_sema);
+ nfs4_put_state_owner(sp);
+- return state;
+-
+-out_up:
+- up(&sp->so_sema);
+- nfs4_put_state_owner(sp);
+- if (state) {
+- nfs4_put_open_state(state);
+- state = NULL;
+- }
+- if (inode) {
++ up_read(&clp->cl_sem);
++ *res = state;
++ return 0;
++out_err:
++ if (sp != NULL) {
++ if (state != NULL)
++ nfs4_put_open_state(state);
++ up(&sp->so_sema);
++ nfs4_put_state_owner(sp);
++ }
++ /* Note: clp->cl_sem must be released before nfs4_put_open_state()! */
++ up_read(&clp->cl_sem);
++ if (inode != NULL)
+ iput(inode);
+- inode = NULL;
+- }
+- /* NOTE: BAD_SEQID means the server and client disagree about the
+- * book-keeping w.r.t. state-changing operations
+- * (OPEN/CLOSE/LOCK/LOCKU...)
+- * It is actually a sign of a bug on the client or on the server.
+- *
+- * If we receive a BAD_SEQID error in the particular case of
+- * doing an OPEN, we assume that nfs4_increment_seqid() will
+- * have unhashed the old state_owner for us, and that we can
+- * therefore safely retry using a new one. We should still warn
+- * the user though...
+- */
+- if (status == -NFS4ERR_BAD_SEQID) {
+- printk(KERN_WARNING "NFS: v4 server returned a bad sequence-id error!\n");
+- goto retry;
+- }
+- status = nfs4_handle_error(server, status);
+- if (!status)
+- goto retry;
+- BUG_ON(status < -1000 || status > 0);
+-out:
+- return ERR_PTR(status);
++ *res = NULL;
++ return status;
+ }
+
+-int
+-nfs4_do_setattr(struct nfs_server *server, struct nfs_fattr *fattr,
++
++struct nfs4_state *nfs4_do_open(struct inode *dir, struct qstr *name, int flags, struct iattr *sattr, struct rpc_cred *cred)
++{
++ struct nfs4_exception exception = { };
++ struct nfs4_state *res;
++ int status;
++
++ do {
++ status = _nfs4_do_open(dir, name, flags, sattr, cred, &res);
++ if (status == 0)
++ break;
++ /* NOTE: BAD_SEQID means the server and client disagree about the
++ * book-keeping w.r.t. state-changing operations
++ * (OPEN/CLOSE/LOCK/LOCKU...)
++ * It is actually a sign of a bug on the client or on the server.
++ *
++ * If we receive a BAD_SEQID error in the particular case of
++ * doing an OPEN, we assume that nfs4_increment_seqid() will
++ * have unhashed the old state_owner for us, and that we can
++ * therefore safely retry using a new one. We should still warn
++ * the user though...
++ */
++ if (status == -NFS4ERR_BAD_SEQID) {
++ printk(KERN_WARNING "NFS: v4 server returned a bad sequence-id error!\n");
++ exception.retry = 1;
++ continue;
++ }
++ res = ERR_PTR(nfs4_handle_exception(NFS_SERVER(dir),
++ status, &exception));
++ } while (exception.retry);
++ return res;
++}
++
++static int _nfs4_do_setattr(struct nfs_server *server, struct nfs_fattr *fattr,
+ struct nfs_fh *fhandle, struct iattr *sattr,
+ struct nfs4_state *state)
+ {
+@@ -381,9 +626,7 @@ nfs4_do_setattr(struct nfs_server *serve
+ .rpc_argp = &arg,
+ .rpc_resp = &res,
+ };
+- int status;
+
+-retry:
+ fattr->valid = 0;
+
+ if (sattr->ia_valid & ATTR_SIZE)
+@@ -391,13 +634,22 @@ retry:
+ else
+ memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid));
+
+- status = rpc_call_sync(server->client, &msg, 0);
+- if (status) {
+- status = nfs4_handle_error(server, status);
+- if (!status)
+- goto retry;
+- }
+- return status;
++ return rpc_call_sync(server->client, &msg, 0);
++}
++
++int nfs4_do_setattr(struct nfs_server *server, struct nfs_fattr *fattr,
++ struct nfs_fh *fhandle, struct iattr *sattr,
++ struct nfs4_state *state)
++{
++ struct nfs4_exception exception = { };
++ int err;
++ do {
++ err = nfs4_handle_exception(server,
++ _nfs4_do_setattr(server, fattr, fhandle, sattr,
++ state),
++ &exception);
++ } while (exception.retry);
++ return err;
+ }
+
+ /*
+@@ -411,8 +663,7 @@ retry:
+ *
+ * NOTE: Caller must be holding the sp->so_owner semaphore!
+ */
+-int
+-nfs4_do_close(struct inode *inode, struct nfs4_state *state)
++static int _nfs4_do_close(struct inode *inode, struct nfs4_state *state)
+ {
+ struct nfs4_state_owner *sp = state->owner;
+ int status = 0;
+@@ -426,6 +677,8 @@ nfs4_do_close(struct inode *inode, struc
+ .rpc_resp = &res,
+ };
+
++ if (test_bit(NFS_DELEGATED_STATE, &state->flags))
++ return 0;
+ memcpy(&arg.stateid, &state->stateid, sizeof(arg.stateid));
+ /* Serialization for the sequence id */
+ arg.seqid = sp->so_seqid,
+@@ -441,15 +694,34 @@ nfs4_do_close(struct inode *inode, struc
+ return status;
+ }
+
+-int
+-nfs4_do_downgrade(struct inode *inode, struct nfs4_state *state, mode_t mode)
++int nfs4_do_close(struct inode *inode, struct nfs4_state *state)
++{
++ struct nfs_server *server = NFS_SERVER(state->inode);
++ struct nfs4_exception exception = { };
++ int err;
++ do {
++ err = _nfs4_do_close(inode, state);
++ switch (err) {
++ case -NFS4ERR_STALE_STATEID:
++ case -NFS4ERR_EXPIRED:
++ nfs4_schedule_state_recovery(server->nfs4_state);
++ case 0:
++ state->state = 0;
++ return 0;
++ }
++ err = nfs4_handle_exception(server, err, &exception);
++ } while (exception.retry);
++ return err;
++}
++
++static int _nfs4_do_downgrade(struct inode *inode, struct nfs4_state *state, mode_t mode)
+ {
+ struct nfs4_state_owner *sp = state->owner;
+ int status = 0;
+ struct nfs_closeargs arg = {
+ .fh = NFS_FH(inode),
+ .seqid = sp->so_seqid,
+- .share_access = mode,
++ .open_flags = mode,
+ };
+ struct nfs_closeres res;
+ struct rpc_message msg = {
+@@ -458,6 +730,8 @@ nfs4_do_downgrade(struct inode *inode, s
+ .rpc_resp = &res,
+ };
+
++ if (test_bit(NFS_DELEGATED_STATE, &state->flags))
++ return 0;
+ memcpy(&arg.stateid, &state->stateid, sizeof(arg.stateid));
+ status = rpc_call_sync(NFS_SERVER(inode)->client, &msg, 0);
+ nfs4_increment_seqid(status, sp);
+@@ -467,6 +741,26 @@ nfs4_do_downgrade(struct inode *inode, s
+ return status;
+ }
+
++int nfs4_do_downgrade(struct inode *inode, struct nfs4_state *state, mode_t mode)
++{
++ struct nfs_server *server = NFS_SERVER(state->inode);
++ struct nfs4_exception exception = { };
++ int err;
++ do {
++ err = _nfs4_do_downgrade(inode, state, mode);
++ switch (err) {
++ case -NFS4ERR_STALE_STATEID:
++ case -NFS4ERR_EXPIRED:
++ nfs4_schedule_state_recovery(server->nfs4_state);
++ case 0:
++ state->state = mode;
++ return 0;
++ }
++ err = nfs4_handle_exception(server, err, &exception);
++ } while (exception.retry);
++ return err;
++}
++
+ struct inode *
+ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+ {
+@@ -500,7 +794,9 @@ nfs4_open_revalidate(struct inode *dir,
+ struct inode *inode;
+
+ cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0);
+- state = nfs4_do_open(dir, &dentry->d_name, openflags, NULL, cred);
++ state = nfs4_open_delegated(dentry->d_inode, openflags, cred);
++ if (IS_ERR(state))
++ state = nfs4_do_open(dir, &dentry->d_name, openflags, NULL, cred);
+ put_rpccred(cred);
+ if (state == ERR_PTR(-ENOENT) && dentry->d_inode == 0)
+ return 1;
+@@ -518,7 +814,7 @@ nfs4_open_revalidate(struct inode *dir,
+ }
+
+
+-static int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
++static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
+ {
+ struct nfs4_server_caps_res res = {};
+ struct rpc_message msg = {
+@@ -542,7 +838,19 @@ static int nfs4_server_capabilities(stru
+ return status;
+ }
+
+-static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
++static int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
++{
++ struct nfs4_exception exception = { };
++ int err;
++ do {
++ err = nfs4_handle_exception(server,
++ _nfs4_server_capabilities(server, fhandle),
++ &exception);
++ } while (exception.retry);
++ return err;
++}
++
++static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fsinfo *info)
+ {
+ struct nfs_fattr * fattr = info->fattr;
+@@ -563,6 +871,19 @@ static int nfs4_lookup_root(struct nfs_s
+ return rpc_call_sync(server->client, &msg, 0);
+ }
+
++static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
++ struct nfs_fsinfo *info)
++{
++ struct nfs4_exception exception = { };
++ int err;
++ do {
++ err = nfs4_handle_exception(server,
++ _nfs4_lookup_root(server, fhandle, info),
++ &exception);
++ } while (exception.retry);
++ return err;
++}
++
+ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fsinfo *info)
+ {
+@@ -597,6 +918,8 @@ static int nfs4_proc_get_root(struct nfs
+
+ p = server->mnt_path;
+ for (;;) {
++ struct nfs4_exception exception = { };
++
+ while (*p == '/')
+ p++;
+ if (!*p)
+@@ -606,9 +929,13 @@ static int nfs4_proc_get_root(struct nfs
+ p++;
+ q.len = p - q.name;
+
+- fattr->valid = 0;
+- status = rpc_call_sync(server->client, &msg, 0);
+- if (!status)
++ do {
++ fattr->valid = 0;
++ status = nfs4_handle_exception(server,
++ rpc_call_sync(server->client, &msg, 0),
++ &exception);
++ } while (exception.retry);
++ if (status == 0)
+ continue;
+ if (status == -ENOENT) {
+ printk(KERN_NOTICE "NFS: mount path %s does not exist!\n", server->mnt_path);
+@@ -621,10 +948,10 @@ static int nfs4_proc_get_root(struct nfs
+ if (status == 0)
+ status = nfs4_do_fsinfo(server, fhandle, info);
+ out:
+- return nfs4_map_errors(status);
++ return status;
+ }
+
+-static int nfs4_proc_getattr(struct inode *inode, struct nfs_fattr *fattr)
++static int _nfs4_proc_getattr(struct inode *inode, struct nfs_fattr *fattr)
+ {
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs4_getattr_arg args = {
+@@ -642,8 +969,19 @@ static int nfs4_proc_getattr(struct inod
+ };
+
+ fattr->valid = 0;
++ return rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
++}
+
+- return nfs4_map_errors(rpc_call_sync(NFS_CLIENT(inode), &msg, 0));
++static int nfs4_proc_getattr(struct inode *inode, struct nfs_fattr *fattr)
++{
++ struct nfs4_exception exception = { };
++ int err;
++ do {
++ err = nfs4_handle_exception(NFS_SERVER(inode),
++ _nfs4_proc_getattr(inode, fattr),
++ &exception);
++ } while (exception.retry);
++ return err;
+ }
+
+ /*
+@@ -678,9 +1016,13 @@ nfs4_proc_setattr(struct dentry *dentry,
+ if (size_change) {
+ struct rpc_cred *cred = rpcauth_lookupcred(NFS_SERVER(inode)->client->cl_auth, 0);
+ state = nfs4_find_state(inode, cred, FMODE_WRITE);
+- if (!state) {
+- state = nfs4_do_open(dentry->d_parent->d_inode,
+- &dentry->d_name, FMODE_WRITE, NULL, cred);
++ if (state == NULL) {
++ state = nfs4_open_delegated(dentry->d_inode,
++ FMODE_WRITE, cred);
++ if (IS_ERR(state))
++ state = nfs4_do_open(dentry->d_parent->d_inode,
++ &dentry->d_name, FMODE_WRITE,
++ NULL, cred);
+ need_iput = 1;
+ }
+ put_rpccred(cred);
+@@ -705,7 +1047,7 @@ out:
+ return status;
+ }
+
+-static int nfs4_proc_lookup(struct inode *dir, struct qstr *name,
++static int _nfs4_proc_lookup(struct inode *dir, struct qstr *name,
+ struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+ {
+ int status;
+@@ -731,12 +1073,23 @@ static int nfs4_proc_lookup(struct inode
+ dprintk("NFS call lookup %s\n", name->name);
+ status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+ dprintk("NFS reply lookup: %d\n", status);
+- return nfs4_map_errors(status);
++ return status;
+ }
+
+-static int nfs4_proc_access(struct inode *inode, struct rpc_cred *cred, int mode)
++static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
++{
++ struct nfs4_exception exception = { };
++ int err;
++ do {
++ err = nfs4_handle_exception(NFS_SERVER(dir),
++ _nfs4_proc_lookup(dir, name, fhandle, fattr),
++ &exception);
++ } while (exception.retry);
++ return err;
++}
++
++static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry)
+ {
+- int status;
+ struct nfs4_accessargs args = {
+ .fh = NFS_FH(inode),
+ };
+@@ -745,8 +1098,10 @@ static int nfs4_proc_access(struct inode
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+- .rpc_cred = cred,
++ .rpc_cred = entry->cred,
+ };
++ int mode = entry->mask;
++ int status;
+
+ /*
+ * Determine which access bits we want to ask for...
+@@ -758,8 +1113,7 @@ static int nfs4_proc_access(struct inode
+ args.access |= NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE;
+ if (mode & MAY_EXEC)
+ args.access |= NFS4_ACCESS_LOOKUP;
+- }
+- else {
++ } else {
+ if (mode & MAY_WRITE)
+ args.access |= NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND;
+ if (mode & MAY_EXEC)
+@@ -767,13 +1121,27 @@ static int nfs4_proc_access(struct inode
+ }
+ status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+ if (!status) {
+- if (args.access != res.supported) {
+- printk(KERN_NOTICE "NFS: server didn't support all access bits!\n");
+- status = -ENOTSUPP;
+- } else if ((args.access & res.access) != args.access)
+- status = -EACCES;
++ entry->mask = 0;
++ if (res.access & NFS4_ACCESS_READ)
++ entry->mask |= MAY_READ;
++ if (res.access & (NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE))
++ entry->mask |= MAY_WRITE;
++ if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE))
++ entry->mask |= MAY_EXEC;
+ }
+- return nfs4_map_errors(status);
++ return status;
++}
++
++static int nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry)
++{
++ struct nfs4_exception exception = { };
++ int err;
++ do {
++ err = nfs4_handle_exception(NFS_SERVER(inode),
++ _nfs4_proc_access(inode, entry),
++ &exception);
++ } while (exception.retry);
++ return err;
+ }
+
+ /*
+@@ -800,7 +1168,7 @@ static int nfs4_proc_access(struct inode
+ * Both of these changes to the XDR layer would in fact be quite
+ * minor, but I decided to leave them for a subsequent patch.
+ */
+-static int nfs4_proc_readlink(struct inode *inode, struct page *page)
++static int _nfs4_proc_readlink(struct inode *inode, struct page *page)
+ {
+ struct nfs4_readlink args = {
+ .fh = NFS_FH(inode),
+@@ -813,11 +1181,22 @@ static int nfs4_proc_readlink(struct ino
+ .rpc_resp = NULL,
+ };
+
+- return nfs4_map_errors(rpc_call_sync(NFS_CLIENT(inode), &msg, 0));
++ return rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+ }
+
+-static int
+-nfs4_proc_read(struct nfs_read_data *rdata, struct file *filp)
++static int nfs4_proc_readlink(struct inode *inode, struct page *page)
++{
++ struct nfs4_exception exception = { };
++ int err;
++ do {
++ err = nfs4_handle_exception(NFS_SERVER(inode),
++ _nfs4_proc_readlink(inode, page),
++ &exception);
++ } while (exception.retry);
++ return err;
++}
++
++static int _nfs4_proc_read(struct nfs_read_data *rdata)
+ {
+ int flags = rdata->flags;
+ struct inode *inode = rdata->inode;
+@@ -827,6 +1206,7 @@ nfs4_proc_read(struct nfs_read_data *rda
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ],
+ .rpc_argp = &rdata->args,
+ .rpc_resp = &rdata->res,
++ .rpc_cred = rdata->cred,
+ };
+ unsigned long timestamp = jiffies;
+ int status;
+@@ -834,29 +1214,27 @@ nfs4_proc_read(struct nfs_read_data *rda
+ dprintk("NFS call read %d @ %Ld\n", rdata->args.count,
+ (long long) rdata->args.offset);
+
+- /*
+- * Try first to use O_RDONLY, then O_RDWR stateid.
+- */
+- if (filp) {
+- struct nfs4_state *state;
+- state = (struct nfs4_state *)filp->private_data;
+- rdata->args.state = state;
+- msg.rpc_cred = state->owner->so_cred;
+- } else {
+- rdata->args.state = NULL;
+- msg.rpc_cred = NFS_I(inode)->mm_cred;
+- }
+-
+ fattr->valid = 0;
+ status = rpc_call_sync(server->client, &msg, flags);
+ if (!status)
+ renew_lease(server, timestamp);
+ dprintk("NFS reply read: %d\n", status);
+- return nfs4_map_errors(status);
++ return status;
+ }
+
+-static int
+-nfs4_proc_write(struct nfs_write_data *wdata, struct file *filp)
++static int nfs4_proc_read(struct nfs_read_data *rdata)
++{
++ struct nfs4_exception exception = { };
++ int err;
++ do {
++ err = nfs4_handle_exception(NFS_SERVER(rdata->inode),
++ _nfs4_proc_read(rdata),
++ &exception);
++ } while (exception.retry);
++ return err;
++}
++
++static int _nfs4_proc_write(struct nfs_write_data *wdata)
+ {
+ int rpcflags = wdata->flags;
+ struct inode *inode = wdata->inode;
+@@ -866,33 +1244,32 @@ nfs4_proc_write(struct nfs_write_data *w
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE],
+ .rpc_argp = &wdata->args,
+ .rpc_resp = &wdata->res,
++ .rpc_cred = wdata->cred,
+ };
+ int status;
+
+ dprintk("NFS call write %d @ %Ld\n", wdata->args.count,
+ (long long) wdata->args.offset);
+
+- /*
+- * Try first to use O_WRONLY, then O_RDWR stateid.
+- */
+- if (filp) {
+- struct nfs4_state *state;
+- state = (struct nfs4_state *)filp->private_data;
+- wdata->args.state = state;
+- msg.rpc_cred = state->owner->so_cred;
+- } else {
+- wdata->args.state = NULL;
+- msg.rpc_cred = NFS_I(inode)->mm_cred;
+- }
+-
+ fattr->valid = 0;
+ status = rpc_call_sync(server->client, &msg, rpcflags);
+ dprintk("NFS reply write: %d\n", status);
+- return nfs4_map_errors(status);
++ return status;
+ }
+
+-static int
+-nfs4_proc_commit(struct nfs_write_data *cdata, struct file *filp)
++static int nfs4_proc_write(struct nfs_write_data *wdata)
++{
++ struct nfs4_exception exception = { };
++ int err;
++ do {
++ err = nfs4_handle_exception(NFS_SERVER(wdata->inode),
++ _nfs4_proc_write(wdata),
++ &exception);
++ } while (exception.retry);
++ return err;
++}
++
++static int _nfs4_proc_commit(struct nfs_write_data *cdata)
+ {
+ struct inode *inode = cdata->inode;
+ struct nfs_fattr *fattr = cdata->res.fattr;
+@@ -901,24 +1278,29 @@ nfs4_proc_commit(struct nfs_write_data *
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT],
+ .rpc_argp = &cdata->args,
+ .rpc_resp = &cdata->res,
++ .rpc_cred = cdata->cred,
+ };
+ int status;
+
+ dprintk("NFS call commit %d @ %Ld\n", cdata->args.count,
+ (long long) cdata->args.offset);
+
+- /*
+- * Try first to use O_WRONLY, then O_RDWR stateid.
+- */
+- if (filp)
+- msg.rpc_cred = ((struct nfs4_state *)filp->private_data)->owner->so_cred;
+- else
+- msg.rpc_cred = NFS_I(inode)->mm_cred;
+-
+ fattr->valid = 0;
+ status = rpc_call_sync(server->client, &msg, 0);
+ dprintk("NFS reply commit: %d\n", status);
+- return nfs4_map_errors(status);
++ return status;
++}
++
++static int nfs4_proc_commit(struct nfs_write_data *cdata)
++{
++ struct nfs4_exception exception = { };
++ int err;
++ do {
++ err = nfs4_handle_exception(NFS_SERVER(cdata->inode),
++ _nfs4_proc_commit(cdata),
++ &exception);
++ } while (exception.retry);
++ return err;
+ }
+
+ /*
+@@ -965,7 +1347,7 @@ nfs4_proc_create(struct inode *dir, stru
+ return inode;
+ }
+
+-static int nfs4_proc_remove(struct inode *dir, struct qstr *name)
++static int _nfs4_proc_remove(struct inode *dir, struct qstr *name)
+ {
+ struct nfs4_remove_arg args = {
+ .fh = NFS_FH(dir),
+@@ -982,7 +1364,19 @@ static int nfs4_proc_remove(struct inode
+ status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+ if (status == 0)
+ update_changeattr(dir, &res);
+- return nfs4_map_errors(status);
++ return status;
++}
++
++static int nfs4_proc_remove(struct inode *dir, struct qstr *name)
++{
++ struct nfs4_exception exception = { };
++ int err;
++ do {
++ err = nfs4_handle_exception(NFS_SERVER(dir),
++ _nfs4_proc_remove(dir, name),
++ &exception);
++ } while (exception.retry);
++ return err;
+ }
+
+ struct unlink_desc {
+@@ -1023,7 +1417,7 @@ static int nfs4_proc_unlink_done(struct
+ return 0;
+ }
+
+-static int nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
++static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
+ struct inode *new_dir, struct qstr *new_name)
+ {
+ struct nfs4_rename_arg arg = {
+@@ -1046,10 +1440,24 @@ static int nfs4_proc_rename(struct inode
+ update_changeattr(old_dir, &res.old_cinfo);
+ update_changeattr(new_dir, &res.new_cinfo);
+ }
+- return nfs4_map_errors(status);
++ return status;
+ }
+
+-static int nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
++static int nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
++ struct inode *new_dir, struct qstr *new_name)
++{
++ struct nfs4_exception exception = { };
++ int err;
++ do {
++ err = nfs4_handle_exception(NFS_SERVER(old_dir),
++ _nfs4_proc_rename(old_dir, old_name,
++ new_dir, new_name),
++ &exception);
++ } while (exception.retry);
++ return err;
++}
++
++static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
+ {
+ struct nfs4_link_arg arg = {
+ .fh = NFS_FH(inode),
+@@ -1068,10 +1476,22 @@ static int nfs4_proc_link(struct inode *
+ if (!status)
+ update_changeattr(dir, &cinfo);
+
+- return nfs4_map_errors(status);
++ return status;
++}
++
++static int nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
++{
++ struct nfs4_exception exception = { };
++ int err;
++ do {
++ err = nfs4_handle_exception(NFS_SERVER(inode),
++ _nfs4_proc_link(inode, dir, name),
++ &exception);
++ } while (exception.retry);
++ return err;
+ }
+
+-static int nfs4_proc_symlink(struct inode *dir, struct qstr *name,
++static int _nfs4_proc_symlink(struct inode *dir, struct qstr *name,
+ struct qstr *path, struct iattr *sattr, struct nfs_fh *fhandle,
+ struct nfs_fattr *fattr)
+ {
+@@ -1090,22 +1510,39 @@ static int nfs4_proc_symlink(struct inod
+ .fattr = fattr,
+ };
+ struct rpc_message msg = {
+- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE],
++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK],
+ .rpc_argp = &arg,
+ .rpc_resp = &res,
+ };
+ int status;
+
++ if (path->len > NFS4_MAXPATHLEN)
++ return -ENAMETOOLONG;
+ arg.u.symlink = path;
+ fattr->valid = 0;
+
+ status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+ if (!status)
+ update_changeattr(dir, &res.dir_cinfo);
+- return nfs4_map_errors(status);
++ return status;
+ }
+
+-static int nfs4_proc_mkdir(struct inode *dir, struct qstr *name,
++static int nfs4_proc_symlink(struct inode *dir, struct qstr *name,
++ struct qstr *path, struct iattr *sattr, struct nfs_fh *fhandle,
++ struct nfs_fattr *fattr)
++{
++ struct nfs4_exception exception = { };
++ int err;
++ do {
++ err = nfs4_handle_exception(NFS_SERVER(dir),
++ _nfs4_proc_symlink(dir, name, path, sattr,
++ fhandle, fattr),
++ &exception);
++ } while (exception.retry);
++ return err;
++}
++
++static int _nfs4_proc_mkdir(struct inode *dir, struct qstr *name,
+ struct iattr *sattr, struct nfs_fh *fhandle,
+ struct nfs_fattr *fattr)
+ {
+@@ -1135,10 +1572,25 @@ static int nfs4_proc_mkdir(struct inode
+ status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+ if (!status)
+ update_changeattr(dir, &res.dir_cinfo);
+- return nfs4_map_errors(status);
++ return status;
+ }
+
+-static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
++static int nfs4_proc_mkdir(struct inode *dir, struct qstr *name,
++ struct iattr *sattr, struct nfs_fh *fhandle,
++ struct nfs_fattr *fattr)
++{
++ struct nfs4_exception exception = { };
++ int err;
++ do {
++ err = nfs4_handle_exception(NFS_SERVER(dir),
++ _nfs4_proc_mkdir(dir, name, sattr,
++ fhandle, fattr),
++ &exception);
++ } while (exception.retry);
++ return err;
++}
++
++static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
+ u64 cookie, struct page *page, unsigned int count, int plus)
+ {
+ struct inode *dir = dentry->d_inode;
+@@ -1164,10 +1616,24 @@ static int nfs4_proc_readdir(struct dent
+ if (status == 0)
+ memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE);
+ unlock_kernel();
+- return nfs4_map_errors(status);
++ return status;
+ }
+
+-static int nfs4_proc_mknod(struct inode *dir, struct qstr *name,
++static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
++ u64 cookie, struct page *page, unsigned int count, int plus)
++{
++ struct nfs4_exception exception = { };
++ int err;
++ do {
++ err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode),
++ _nfs4_proc_readdir(dentry, cred, cookie,
++ page, count, plus),
++ &exception);
++ } while (exception.retry);
++ return err;
++}
++
++static int _nfs4_proc_mknod(struct inode *dir, struct qstr *name,
+ struct iattr *sattr, dev_t rdev, struct nfs_fh *fh,
+ struct nfs_fattr *fattr)
+ {
+@@ -1214,10 +1680,25 @@ static int nfs4_proc_mknod(struct inode
+ status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+ if (!status)
+ update_changeattr(dir, &res.dir_cinfo);
+- return nfs4_map_errors(status);
++ return status;
++}
++
++static int nfs4_proc_mknod(struct inode *dir, struct qstr *name,
++ struct iattr *sattr, dev_t rdev, struct nfs_fh *fh,
++ struct nfs_fattr *fattr)
++{
++ struct nfs4_exception exception = { };
++ int err;
++ do {
++ err = nfs4_handle_exception(NFS_SERVER(dir),
++ _nfs4_proc_mknod(dir, name, sattr, rdev,
++ fh, fattr),
++ &exception);
++ } while (exception.retry);
++ return err;
+ }
+
+-static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
++static int _nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fsstat *fsstat)
+ {
+ struct nfs4_statfs_arg args = {
+@@ -1231,10 +1712,22 @@ static int nfs4_proc_statfs(struct nfs_s
+ };
+
+ fsstat->fattr->valid = 0;
+- return nfs4_map_errors(rpc_call_sync(server->client, &msg, 0));
++ return rpc_call_sync(server->client, &msg, 0);
+ }
+
+-static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
++static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsstat *fsstat)
++{
++ struct nfs4_exception exception = { };
++ int err;
++ do {
++ err = nfs4_handle_exception(server,
++ _nfs4_proc_statfs(server, fhandle, fsstat),
++ &exception);
++ } while (exception.retry);
++ return err;
++}
++
++static int _nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fsinfo *fsinfo)
+ {
+ struct nfs4_fsinfo_arg args = {
+@@ -1247,16 +1740,29 @@ static int nfs4_do_fsinfo(struct nfs_ser
+ .rpc_resp = fsinfo,
+ };
+
+- return nfs4_map_errors(rpc_call_sync(server->client, &msg, 0));
++ return rpc_call_sync(server->client, &msg, 0);
++}
++
++static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo)
++{
++ struct nfs4_exception exception = { };
++ int err;
++
++ do {
++ err = nfs4_handle_exception(server,
++ _nfs4_do_fsinfo(server, fhandle, fsinfo),
++ &exception);
++ } while (exception.retry);
++ return err;
+ }
+
+ static int nfs4_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo)
+ {
+ fsinfo->fattr->valid = 0;
+- return nfs4_map_errors(nfs4_do_fsinfo(server, fhandle, fsinfo));
++ return nfs4_do_fsinfo(server, fhandle, fsinfo);
+ }
+
+-static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
++static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_pathconf *pathconf)
+ {
+ struct nfs4_pathconf_arg args = {
+@@ -1276,7 +1782,21 @@ static int nfs4_proc_pathconf(struct nfs
+ }
+
+ pathconf->fattr->valid = 0;
+- return nfs4_map_errors(rpc_call_sync(server->client, &msg, 0));
++ return rpc_call_sync(server->client, &msg, 0);
++}
++
++static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
++ struct nfs_pathconf *pathconf)
++{
++ struct nfs4_exception exception = { };
++ int err;
++
++ do {
++ err = nfs4_handle_exception(server,
++ _nfs4_proc_pathconf(server, fhandle, pathconf),
++ &exception);
++ } while (exception.retry);
++ return err;
+ }
+
+ static void
+@@ -1467,8 +1987,10 @@ static int
+ nfs4_proc_file_open(struct inode *inode, struct file *filp)
+ {
+ struct dentry *dentry = filp->f_dentry;
+- struct nfs4_state *state;
++ struct nfs_open_context *ctx;
++ struct nfs4_state *state = NULL;
+ struct rpc_cred *cred;
++ int status = -ENOMEM;
+
+ dprintk("nfs4_proc_file_open: starting on (%.*s/%.*s)\n",
+ (int)dentry->d_parent->d_name.len,
+@@ -1478,21 +2000,28 @@ nfs4_proc_file_open(struct inode *inode,
+
+ /* Find our open stateid */
+ cred = rpcauth_lookupcred(NFS_SERVER(inode)->client->cl_auth, 0);
+- state = nfs4_find_state(inode, cred, filp->f_mode);
++ if (unlikely(cred == NULL))
++ return -ENOMEM;
++ ctx = alloc_nfs_open_context(dentry, cred);
+ put_rpccred(cred);
+- if (state == NULL) {
+- printk(KERN_WARNING "NFS: v4 raced in function %s\n", __FUNCTION__);
+- return -EIO; /* ERACE actually */
+- }
++ if (unlikely(ctx == NULL))
++ return -ENOMEM;
++ status = -EIO; /* ERACE actually */
++ state = nfs4_find_state(inode, cred, filp->f_mode);
++ if (unlikely(state == NULL))
++ goto no_state;
++ ctx->state = state;
+ nfs4_close_state(state, filp->f_mode);
+- if (filp->f_mode & FMODE_WRITE) {
+- lock_kernel();
+- nfs_set_mmcred(inode, state->owner->so_cred);
++ ctx->mode = filp->f_mode;
++ nfs_file_set_open_context(filp, ctx);
++ put_nfs_open_context(ctx);
++ if (filp->f_mode & FMODE_WRITE)
+ nfs_begin_data_update(inode);
+- unlock_kernel();
+- }
+- filp->private_data = state;
+ return 0;
++no_state:
++ printk(KERN_WARNING "NFS: v4 raced in function %s\n", __FUNCTION__);
++ put_nfs_open_context(ctx);
++ return status;
+ }
+
+ /*
+@@ -1501,35 +2030,148 @@ nfs4_proc_file_open(struct inode *inode,
+ static int
+ nfs4_proc_file_release(struct inode *inode, struct file *filp)
+ {
+- struct nfs4_state *state = (struct nfs4_state *)filp->private_data;
+-
+- if (state)
+- nfs4_close_state(state, filp->f_mode);
+- if (filp->f_mode & FMODE_WRITE) {
+- lock_kernel();
++ if (filp->f_mode & FMODE_WRITE)
+ nfs_end_data_update(inode);
+- unlock_kernel();
+- }
++ nfs_file_clear_open_context(filp);
+ return 0;
+ }
+
+-/*
+- * Set up the nfspage struct with the right state info and credentials
+- */
++static ssize_t
++nfs4_read_acl_attr(struct inode *inode, char *buf, ssize_t buflen)
++{
++ struct nfs_inode *nfsi = NFS_I(inode);
++ int ret;
++
++ spin_lock(&inode->i_lock);
++ if (buf == NULL && nfsi->acl_len)
++ goto out_len;
++ ret = -ENOENT;
++ if (nfsi->acl_len == 0)
++ goto out;
++ ret = -ERANGE; /* see getxattr(2) man page */
++ if (nfsi->acl_len > buflen)
++ goto out;
++ memcpy(buf, nfsi->acl, nfsi->acl_len);
++out_len:
++ ret = nfsi->acl_len;
++out:
++ spin_unlock(&inode->i_lock);
++ return ret;
++}
++
+ static void
+-nfs4_request_init(struct nfs_page *req, struct file *filp)
++nfs4_set_acl_attr(struct inode *inode, char *buf, ssize_t buflen)
+ {
+- struct nfs4_state *state;
++ struct nfs_inode *nfsi = NFS_I(inode);
+
+- if (!filp) {
+- req->wb_cred = get_rpccred(NFS_I(req->wb_inode)->mm_cred);
+- req->wb_state = NULL;
+- return;
++ spin_lock(&inode->i_lock);
++ kfree(nfsi->acl);
++ nfsi->acl = buf;
++ nfsi->acl_len = buflen;
++ spin_unlock(&inode->i_lock);
++}
++
++static int
++nfs4_write_acl_attr(struct inode *inode, const char *buf, ssize_t buflen)
++{
++ void *abuf = NULL;
++
++ if (buflen > PAGE_SIZE)
++ goto out_nomem;
++ abuf = kmalloc(buflen, GFP_KERNEL);
++ if (abuf == NULL)
++ goto out_nomem;
++ memcpy(abuf, buf, buflen);
++ nfs4_set_acl_attr(inode, abuf, buflen);
++ return 0;
++out_nomem:
++ nfs4_set_acl_attr(inode, NULL, 0);
++ return -ENOMEM;
++}
++
++void
++nfs4_zap_acl_attr(struct inode *inode)
++{
++ nfs4_set_acl_attr(inode, NULL, 0);
++}
++
++static int
++nfs4_server_supports_acls(struct nfs_server *server)
++{
++ return (server->caps & NFS_CAP_ACLS)
++ && (server->acl_bitmask & ACL4_SUPPORT_ALLOW_ACL)
++ && (server->acl_bitmask & ACL4_SUPPORT_DENY_ACL);
++}
++
++ssize_t
++nfs4_proc_get_acl(struct inode *inode, void *buf, ssize_t buflen)
++{
++ struct nfs_server *server = NFS_SERVER(inode);
++ struct nfs_getaclres res = {
++ .acl = buf,
++ .acl_len = buflen,
++ .server = server,
++ };
++ struct rpc_message msg = {
++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETACL],
++ .rpc_argp = NFS_FH(inode),
++ .rpc_resp = &res,
++ };
++ int ret;
++
++ if (!nfs4_server_supports_acls(server))
++ return -EOPNOTSUPP;
++ lock_kernel();
++ ret = nfs_revalidate_inode(NFS_SERVER(inode), inode);
++ if (ret < 0)
++ goto out;
++ ret = nfs4_read_acl_attr(inode, buf, buflen);
++ if (ret == -ENOENT) {
++ ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
++ if (ret == 0) {
++ nfs4_write_acl_attr(inode, res.acl, res.acl_len);
++ ret = res.acl_len;
++ }
++ if (res.acl != buf) {
++ /* xdr decode allocated the memory: */
++ kfree(res.acl);
++ }
+ }
+- state = (struct nfs4_state *)filp->private_data;
+- req->wb_state = state;
+- req->wb_cred = get_rpccred(state->owner->so_cred);
+- req->wb_lockowner = current->files;
++out:
++ unlock_kernel();
++ return ret;
++}
++
++int
++nfs4_proc_set_acl(struct inode *inode, const void *buf, ssize_t buflen)
++{
++ struct nfs_server *server = NFS_SERVER(inode);
++ struct nfs_setaclargs arg = {
++ .fh = NFS_FH(inode),
++ .server = server,
++ .acl = buf,
++ .acl_len = buflen,
++ };
++ struct rpc_message msg = {
++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETACL],
++ .rpc_argp = &arg,
++ .rpc_resp = NULL,
++ };
++ int ret;
++
++ if (!nfs4_server_supports_acls(server))
++ return -EOPNOTSUPP;
++
++ /* XXX: should check for buflen too large? */
++
++ lock_kernel();
++ ret = rpc_call_sync(NFS_SERVER(inode)->client, &msg, 0);
++ unlock_kernel();
++
++ if (ret == 0)
++ nfs4_write_acl_attr(inode, buf, buflen);
++
++ return ret;
+ }
+
+ static int
+@@ -1545,11 +2187,13 @@ nfs4_async_handle_error(struct rpc_task
+ case -NFS4ERR_EXPIRED:
+ rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL, NULL);
+ nfs4_schedule_state_recovery(clp);
++ if (test_bit(NFS4CLNT_OK, &clp->cl_state))
++ rpc_wake_up_task(task);
+ task->tk_status = 0;
+ return -EAGAIN;
+ case -NFS4ERR_GRACE:
+ case -NFS4ERR_DELAY:
+- rpc_delay(task, NFS4_POLL_RETRY_TIME);
++ rpc_delay(task, NFS4_POLL_RETRY_MAX);
+ task->tk_status = 0;
+ return -EAGAIN;
+ case -NFS4ERR_OLD_STATEID:
+@@ -1560,12 +2204,11 @@ nfs4_async_handle_error(struct rpc_task
+ return 0;
+ }
+
+-int
+-nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs4_client *clp)
++int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs4_client *clp)
+ {
+ DEFINE_WAIT(wait);
+ sigset_t oldset;
+- int interruptible, res;
++ int interruptible, res = 0;
+
+ might_sleep();
+
+@@ -1573,101 +2216,85 @@ nfs4_wait_clnt_recover(struct rpc_clnt *
+ interruptible = TASK_UNINTERRUPTIBLE;
+ if (clnt->cl_intr)
+ interruptible = TASK_INTERRUPTIBLE;
+- do {
+- res = 0;
+- prepare_to_wait(&clp->cl_waitq, &wait, interruptible);
+- nfs4_schedule_state_recovery(clp);
+- if (test_bit(NFS4CLNT_OK, &clp->cl_state) &&
+- !test_bit(NFS4CLNT_SETUP_STATE, &clp->cl_state))
+- break;
+- if (clnt->cl_intr && signalled()) {
+- res = -ERESTARTSYS;
+- break;
+- }
++ prepare_to_wait(&clp->cl_waitq, &wait, interruptible);
++ nfs4_schedule_state_recovery(clp);
++ if (clnt->cl_intr && signalled())
++ res = -ERESTARTSYS;
++ else if (!test_bit(NFS4CLNT_OK, &clp->cl_state))
+ schedule();
+- } while(!test_bit(NFS4CLNT_OK, &clp->cl_state));
+ finish_wait(&clp->cl_waitq, &wait);
+ rpc_clnt_sigunmask(clnt, &oldset);
+ return res;
+ }
+
+-static int
+-nfs4_delay(struct rpc_clnt *clnt)
++static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
+ {
+ sigset_t oldset;
+ int res = 0;
+
+ might_sleep();
+
++ if (*timeout <= 0)
++ *timeout = NFS4_POLL_RETRY_MIN;
++ if (*timeout > NFS4_POLL_RETRY_MAX)
++ *timeout = NFS4_POLL_RETRY_MAX;
+ rpc_clnt_sigmask(clnt, &oldset);
+ if (clnt->cl_intr) {
+ set_current_state(TASK_INTERRUPTIBLE);
+- schedule_timeout(NFS4_POLL_RETRY_TIME);
++ schedule_timeout(*timeout);
+ if (signalled())
+ res = -ERESTARTSYS;
+ } else {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+- schedule_timeout(NFS4_POLL_RETRY_TIME);
++ schedule_timeout(*timeout);
+ }
+ rpc_clnt_sigunmask(clnt, &oldset);
++ *timeout <<= 1;
+ return res;
+ }
+
+ /* This is the error handling routine for processes that are allowed
+ * to sleep.
+ */
+-int
+-nfs4_handle_error(struct nfs_server *server, int errorcode)
++int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
+ {
+ struct nfs4_client *clp = server->nfs4_state;
+ int ret = errorcode;
+
++ exception->retry = 0;
+ switch(errorcode) {
++ case 0:
++ return 0;
+ case -NFS4ERR_STALE_CLIENTID:
+ case -NFS4ERR_STALE_STATEID:
+ case -NFS4ERR_EXPIRED:
+ ret = nfs4_wait_clnt_recover(server->client, clp);
++ if (ret == 0)
++ exception->retry = 1;
+ break;
+ case -NFS4ERR_GRACE:
+ case -NFS4ERR_DELAY:
+- ret = nfs4_delay(server->client);
++ ret = nfs4_delay(server->client, &exception->timeout);
++ if (ret == 0)
++ exception->retry = 1;
+ break;
+ case -NFS4ERR_OLD_STATEID:
+- ret = 0;
++ if (ret == 0)
++ exception->retry = 1;
+ }
+ /* We failed to handle the error */
+ return nfs4_map_errors(ret);
+ }
+
+-
+-static int
+-nfs4_request_compatible(struct nfs_page *req, struct file *filp, struct page *page)
+-{
+- struct nfs4_state *state = NULL;
+- struct rpc_cred *cred = NULL;
+-
+- if (req->wb_file != filp)
+- return 0;
+- if (req->wb_page != page)
+- return 0;
+- state = (struct nfs4_state *)filp->private_data;
+- if (req->wb_state != state)
+- return 0;
+- if (req->wb_lockowner != current->files)
+- return 0;
+- cred = state->owner->so_cred;
+- if (req->wb_cred != cred)
+- return 0;
+- return 1;
+-}
+-
+-int
+-nfs4_proc_setclientid(struct nfs4_client *clp,
+- u32 program, unsigned short port)
++int nfs4_proc_setclientid(struct nfs4_client *clp, u32 program, unsigned short port)
+ {
+- u32 *p;
+- struct nfs4_setclientid setclientid;
+- struct timespec tv;
++ static nfs4_verifier sc_verifier;
++ static int initialized;
++
++ struct nfs4_setclientid setclientid = {
++ .sc_verifier = &sc_verifier,
++ .sc_prog = program,
++ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID],
+ .rpc_argp = &setclientid,
+@@ -1675,15 +2302,24 @@ nfs4_proc_setclientid(struct nfs4_client
+ .rpc_cred = clp->cl_cred,
+ };
+
+- tv = CURRENT_TIME;
+- p = (u32*)setclientid.sc_verifier.data;
+- *p++ = (u32)tv.tv_sec;
+- *p = (u32)tv.tv_nsec;
+- setclientid.sc_name = clp->cl_ipaddr;
+- sprintf(setclientid.sc_netid, "tcp");
+- sprintf(setclientid.sc_uaddr, "%s.%d.%d", clp->cl_ipaddr, port >> 8, port & 255);
+- setclientid.sc_prog = htonl(program);
+- setclientid.sc_cb_ident = 0;
++ if (!initialized) {
++ struct timespec boot_time;
++ u32 *p;
++
++ initialized = 1;
++ boot_time = CURRENT_TIME;
++ p = (u32*)sc_verifier.data;
++ *p++ = htonl((u32)boot_time.tv_sec);
++ *p = htonl((u32)boot_time.tv_nsec);
++ }
++ setclientid.sc_name_len = scnprintf(setclientid.sc_name,
++ sizeof(setclientid.sc_name), "%s/%u.%u.%u.%u",
++ clp->cl_ipaddr, NIPQUAD(clp->cl_addr.s_addr));
++ setclientid.sc_netid_len = scnprintf(setclientid.sc_netid,
++ sizeof(setclientid.sc_netid), "tcp");
++ setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr,
++ sizeof(setclientid.sc_uaddr), "%s.%d.%d",
++ clp->cl_ipaddr, port >> 8, port & 255);
+
+ return rpc_call_sync(clp->cl_rpcclient, &msg, 0);
+ }
+@@ -1712,6 +2348,40 @@ nfs4_proc_setclientid_confirm(struct nfs
+ return status;
+ }
+
++static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid)
++{
++ struct nfs4_delegreturnargs args = {
++ .fhandle = NFS_FH(inode),
++ .stateid = stateid,
++ };
++ struct rpc_message msg = {
++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DELEGRETURN],
++ .rpc_argp = &args,
++ .rpc_cred = cred,
++ };
++
++ return rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
++}
++
++int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid)
++{
++ struct nfs_server *server = NFS_SERVER(inode);
++ struct nfs4_exception exception = { };
++ int err;
++ do {
++ err = _nfs4_proc_delegreturn(inode, cred, stateid);
++ switch (err) {
++ case -NFS4ERR_STALE_STATEID:
++ case -NFS4ERR_EXPIRED:
++ nfs4_schedule_state_recovery(server->nfs4_state);
++ case 0:
++ return 0;
++ }
++ err = nfs4_handle_exception(server, err, &exception);
++ } while (exception.retry);
++ return err;
++}
++
+ #define NFS4_LOCK_MINTIMEOUT (1 * HZ)
+ #define NFS4_LOCK_MAXTIMEOUT (30 * HZ)
+
+@@ -1753,8 +2423,7 @@ nfs4_lck_length(struct file_lock *reques
+ return request->fl_end - request->fl_start + 1;
+ }
+
+-int
+-nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request)
++static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request)
+ {
+ struct inode *inode = state->inode;
+ struct nfs_server *server = NFS_SERVER(inode);
+@@ -1778,9 +2447,10 @@ nfs4_proc_getlk(struct nfs4_state *state
+ struct nfs4_lock_state *lsp;
+ int status;
+
++ down_read(&clp->cl_sem);
+ nlo.clientid = clp->cl_clientid;
+ down(&state->lock_sema);
+- lsp = nfs4_find_lock_state(state, request->fl_owner);
++ lsp = nfs4_find_lock_state(state, request->fl_pid);
+ if (lsp)
+ nlo.id = lsp->ls_id;
+ else {
+@@ -1811,14 +2481,28 @@ nfs4_proc_getlk(struct nfs4_state *state
+ if (lsp)
+ nfs4_put_lock_state(lsp);
+ up(&state->lock_sema);
+- return nfs4_map_errors(status);
++ up_read(&clp->cl_sem);
++ return status;
+ }
+
+-int
+-nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request)
++static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request)
++{
++ struct nfs4_exception exception = { };
++ int err;
++
++ do {
++ err = nfs4_handle_exception(NFS_SERVER(state->inode),
++ _nfs4_proc_getlk(state, cmd, request),
++ &exception);
++ } while (exception.retry);
++ return err;
++}
++
++static int _nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request)
+ {
+ struct inode *inode = state->inode;
+ struct nfs_server *server = NFS_SERVER(inode);
++ struct nfs4_client *clp = server->nfs4_state;
+ struct nfs_lockargs arg = {
+ .fh = NFS_FH(inode),
+ .type = nfs4_lck_type(cmd, request),
+@@ -1838,29 +2522,46 @@ nfs4_proc_unlck(struct nfs4_state *state
+ struct nfs_locku_opargs luargs;
+ int status = 0;
+
++ down_read(&clp->cl_sem);
+ down(&state->lock_sema);
+- lsp = nfs4_find_lock_state(state, request->fl_owner);
++ lsp = nfs4_find_lock_state(state, request->fl_pid);
+ if (!lsp)
+ goto out;
+- luargs.seqid = lsp->ls_seqid;
+- memcpy(&luargs.stateid, &lsp->ls_stateid, sizeof(luargs.stateid));
+- arg.u.locku = &luargs;
+- status = rpc_call_sync(server->client, &msg, 0);
+- nfs4_increment_lock_seqid(status, lsp);
++ /* We might have lost the locks! */
++ if ((lsp->flags & NFS_LOCK_INITIALIZED) != 0) {
++ luargs.seqid = lsp->ls_seqid;
++ memcpy(&luargs.stateid, &lsp->ls_stateid, sizeof(luargs.stateid));
++ arg.u.locku = &luargs;
++ status = rpc_call_sync(server->client, &msg, 0);
++ nfs4_increment_lock_seqid(status, lsp);
++ }
+
+ if (status == 0) {
+ memcpy(&lsp->ls_stateid, &res.u.stateid,
+ sizeof(lsp->ls_stateid));
+- nfs4_notify_unlck(inode, request, lsp);
++ nfs4_notify_unlck(state, request, lsp);
+ }
+ nfs4_put_lock_state(lsp);
+ out:
+ up(&state->lock_sema);
+- return nfs4_map_errors(status);
++ up_read(&clp->cl_sem);
++ return status;
+ }
+
+-static int
+-nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
++static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request)
++{
++ struct nfs4_exception exception = { };
++ int err;
++
++ do {
++ err = nfs4_handle_exception(NFS_SERVER(state->inode),
++ _nfs4_proc_unlck(state, cmd, request),
++ &exception);
++ } while (exception.retry);
++ return err;
++}
++
++static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *request, int reclaim)
+ {
+ struct inode *inode = state->inode;
+ struct nfs_server *server = NFS_SERVER(inode);
+@@ -1881,23 +2582,22 @@ nfs4_proc_setlk(struct nfs4_state *state
+ .rpc_cred = state->owner->so_cred,
+ };
+ struct nfs_lock_opargs largs = {
++ .reclaim = reclaim,
+ .new_lock_owner = 0,
+ };
+ int status;
+
+- down(&state->lock_sema);
+- lsp = nfs4_find_lock_state(state, request->fl_owner);
+- if (lsp == NULL) {
++ lsp = nfs4_get_lock_state(state, request->fl_pid);
++ if (lsp == NULL)
++ return -ENOMEM;
++ if (!(lsp->flags & NFS_LOCK_INITIALIZED)) {
+ struct nfs4_state_owner *owner = state->owner;
+ struct nfs_open_to_lock otl = {
+ .lock_owner = {
+ .clientid = server->nfs4_state->cl_clientid,
+ },
+ };
+- status = -ENOMEM;
+- lsp = nfs4_alloc_lock_state(state, request->fl_owner);
+- if (!lsp)
+- goto out;
++
+ otl.lock_seqid = lsp->ls_seqid;
+ otl.lock_owner.id = lsp->ls_id;
+ memcpy(&otl.open_stateid, &state->stateid, sizeof(otl.open_stateid));
+@@ -1926,25 +2626,60 @@ nfs4_proc_setlk(struct nfs4_state *state
+ /* save the returned stateid. */
+ if (status == 0) {
+ memcpy(&lsp->ls_stateid, &res.u.stateid, sizeof(nfs4_stateid));
+- nfs4_notify_setlk(inode, request, lsp);
++ if (!reclaim)
++ nfs4_notify_setlk(state, request, lsp);
+ } else if (status == -NFS4ERR_DENIED)
+ status = -EAGAIN;
+ nfs4_put_lock_state(lsp);
+-out:
++ return status;
++}
++
++int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request)
++{
++#ifdef F_SETLK64
++ return _nfs4_do_setlk(state, F_SETLK64, request, 1);
++#else
++ return _nfs4_do_setlk(state, F_SETLK, request, 1);
++#endif
++}
++
++static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
++{
++ struct nfs4_client *clp = state->owner->so_client;
++ int status;
++
++ down_read(&clp->cl_sem);
++ down(&state->lock_sema);
++ status = _nfs4_do_setlk(state, cmd, request, 0);
+ up(&state->lock_sema);
+- return nfs4_map_errors(status);
++ up_read(&clp->cl_sem);
++ return status;
++}
++
++static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
++{
++ struct nfs4_exception exception = { };
++ int err;
++
++ do {
++ err = nfs4_handle_exception(NFS_SERVER(state->inode),
++ _nfs4_proc_setlk(state, cmd, request),
++ &exception);
++ } while (exception.retry);
++ return err;
+ }
+
+ static int
+ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
+ {
++ struct nfs_open_context *ctx;
+ struct nfs4_state *state;
+ unsigned long timeout = NFS4_LOCK_MINTIMEOUT;
+ int status;
+
+ /* verify open state */
+- state = (struct nfs4_state *)filp->private_data;
+- BUG_ON(!state);
++ ctx = (struct nfs_open_context *)filp->private_data;
++ state = ctx->state;
+
+ if (request->fl_start < 0 || request->fl_end < 0)
+ return -EINVAL;
+@@ -1975,6 +2710,7 @@ struct nfs_rpc_ops nfs_v4_clientops = {
+ .version = 4, /* protocol version */
+ .dentry_ops = &nfs4_dentry_operations,
+ .dir_inode_ops = &nfs4_dir_inode_operations,
++ .file_inode_ops = &nfs4_file_inode_operations,
+ .getroot = nfs4_proc_get_root,
+ .getattr = nfs4_proc_getattr,
+ .setattr = nfs4_proc_setattr,
+@@ -2004,8 +2740,6 @@ struct nfs_rpc_ops nfs_v4_clientops = {
+ .commit_setup = nfs4_proc_commit_setup,
+ .file_open = nfs4_proc_file_open,
+ .file_release = nfs4_proc_file_release,
+- .request_init = nfs4_request_init,
+- .request_compatible = nfs4_request_compatible,
+ .lock = nfs4_proc_lock,
+ };
+
+--- linux-2.6.7/fs/nfs/callback.h.lsec 2005-03-23 14:28:22.484631512 -0700
++++ linux-2.6.7/fs/nfs/callback.h 2005-03-23 14:28:22.484631512 -0700
+@@ -0,0 +1,70 @@
++/*
++ * linux/fs/nfs/callback.h
++ *
++ * Copyright (C) 2004 Trond Myklebust
++ *
++ * NFSv4 callback definitions
++ */
++#ifndef __LINUX_FS_NFS_CALLBACK_H
++#define __LINUX_FS_NFS_CALLBACK_H
++
++#define NFS4_CALLBACK 0x40000000
++#define NFS4_CALLBACK_XDRSIZE 2048
++#define NFS4_CALLBACK_BUFSIZE (1024 + NFS4_CALLBACK_XDRSIZE)
++
++enum nfs4_callback_procnum {
++ CB_NULL = 0,
++ CB_COMPOUND = 1,
++};
++
++enum nfs4_callback_opnum {
++ OP_CB_GETATTR = 3,
++ OP_CB_RECALL = 4,
++ OP_CB_ILLEGAL = 10044,
++};
++
++struct cb_compound_hdr_arg {
++ int taglen;
++ const char *tag;
++ unsigned int callback_ident;
++ unsigned nops;
++};
++
++struct cb_compound_hdr_res {
++ uint32_t *status;
++ int taglen;
++ const char *tag;
++ uint32_t *nops;
++};
++
++struct cb_getattrargs {
++ struct sockaddr_in *addr;
++ struct nfs_fh fh;
++ uint32_t bitmap[2];
++};
++
++struct cb_getattrres {
++ uint32_t status;
++ uint32_t bitmap[2];
++ uint64_t size;
++ uint64_t change_attr;
++ struct timespec ctime;
++ struct timespec mtime;
++};
++
++struct cb_recallargs {
++ struct sockaddr_in *addr;
++ struct nfs_fh fh;
++ nfs4_stateid stateid;
++ uint32_t truncate;
++};
++
++extern unsigned nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res);
++extern unsigned nfs4_callback_recall(struct cb_recallargs *args, void *dummy);
++
++extern int nfs_callback_up(void);
++extern int nfs_callback_down(void);
++
++extern unsigned short nfs_callback_tcpport;
++
++#endif /* __LINUX_FS_NFS_CALLBACK_H */
+--- linux-2.6.7/fs/nfs/direct.c.lsec 2004-06-15 23:19:53.000000000 -0600
++++ linux-2.6.7/fs/nfs/direct.c 2005-03-23 14:28:22.702598376 -0700
+@@ -110,7 +110,7 @@ nfs_free_user_pages(struct page **pages,
+ * nfs_direct_read_seg - Read in one iov segment. Generate separate
+ * read RPCs for each "rsize" bytes.
+ * @inode: target inode
+- * @file: target file (may be NULL)
++ * @ctx: target file open context
+ * user_addr: starting address of this segment of user's buffer
+ * count: size of this segment
+ * file_offset: offset in file to begin the operation
+@@ -118,7 +118,7 @@ nfs_free_user_pages(struct page **pages,
+ * nr_pages: size of pages array
+ */
+ static int
+-nfs_direct_read_seg(struct inode *inode, struct file *file,
++nfs_direct_read_seg(struct inode *inode, struct nfs_open_context *ctx,
+ unsigned long user_addr, size_t count, loff_t file_offset,
+ struct page **pages, int nr_pages)
+ {
+@@ -127,9 +127,10 @@ nfs_direct_read_seg(struct inode *inode,
+ int curpage = 0;
+ struct nfs_read_data rdata = {
+ .inode = inode,
++ .cred = ctx->cred,
+ .args = {
+ .fh = NFS_FH(inode),
+- .lockowner = current->files,
++ .context = ctx,
+ },
+ .res = {
+ .fattr = &rdata.fattr,
+@@ -151,7 +152,7 @@ nfs_direct_read_seg(struct inode *inode,
+ user_addr + tot_bytes, rdata.args.pgbase, curpage);
+
+ lock_kernel();
+- result = NFS_PROTO(inode)->read(&rdata, file);
++ result = NFS_PROTO(inode)->read(&rdata);
+ unlock_kernel();
+
+ if (result <= 0) {
+@@ -183,7 +184,7 @@ nfs_direct_read_seg(struct inode *inode,
+ * nfs_direct_read - For each iov segment, map the user's buffer
+ * then generate read RPCs.
+ * @inode: target inode
+- * @file: target file (may be NULL)
++ * @ctx: target file open context
+ * @iov: array of vectors that define I/O buffer
+ * file_offset: offset in file to begin the operation
+ * nr_segs: size of iovec array
+@@ -193,7 +194,7 @@ nfs_direct_read_seg(struct inode *inode,
+ * server.
+ */
+ static ssize_t
+-nfs_direct_read(struct inode *inode, struct file *file,
++nfs_direct_read(struct inode *inode, struct nfs_open_context *ctx,
+ const struct iovec *iov, loff_t file_offset,
+ unsigned long nr_segs)
+ {
+@@ -216,7 +217,7 @@ nfs_direct_read(struct inode *inode, str
+ return page_count;
+ }
+
+- result = nfs_direct_read_seg(inode, file, user_addr, size,
++ result = nfs_direct_read_seg(inode, ctx, user_addr, size,
+ file_offset, pages, page_count);
+
+ nfs_free_user_pages(pages, page_count, 1);
+@@ -239,7 +240,7 @@ nfs_direct_read(struct inode *inode, str
+ * nfs_direct_write_seg - Write out one iov segment. Generate separate
+ * write RPCs for each "wsize" bytes, then commit.
+ * @inode: target inode
+- * @file: target file (may be NULL)
++ * @ctx: target file open context
+ * user_addr: starting address of this segment of user's buffer
+ * count: size of this segment
+ * file_offset: offset in file to begin the operation
+@@ -247,7 +248,7 @@ nfs_direct_read(struct inode *inode, str
+ * nr_pages: size of pages array
+ */
+ static int
+-nfs_direct_write_seg(struct inode *inode, struct file *file,
++nfs_direct_write_seg(struct inode *inode, struct nfs_open_context *ctx,
+ unsigned long user_addr, size_t count, loff_t file_offset,
+ struct page **pages, int nr_pages)
+ {
+@@ -257,9 +258,10 @@ nfs_direct_write_seg(struct inode *inode
+ struct nfs_writeverf first_verf;
+ struct nfs_write_data wdata = {
+ .inode = inode,
++ .cred = ctx->cred,
+ .args = {
+ .fh = NFS_FH(inode),
+- .lockowner = current->files,
++ .context = ctx,
+ },
+ .res = {
+ .fattr = &wdata.fattr,
+@@ -290,7 +292,7 @@ retry:
+ user_addr + tot_bytes, wdata.args.pgbase, curpage);
+
+ lock_kernel();
+- result = NFS_PROTO(inode)->write(&wdata, file);
++ result = NFS_PROTO(inode)->write(&wdata);
+ unlock_kernel();
+
+ if (result <= 0) {
+@@ -325,7 +327,7 @@ retry:
+ wdata.args.offset = file_offset;
+
+ lock_kernel();
+- result = NFS_PROTO(inode)->commit(&wdata, file);
++ result = NFS_PROTO(inode)->commit(&wdata);
+ unlock_kernel();
+
+ if (result < 0 || memcmp(&first_verf.verifier,
+@@ -349,7 +351,7 @@ sync_retry:
+ * nfs_direct_write - For each iov segment, map the user's buffer
+ * then generate write and commit RPCs.
+ * @inode: target inode
+- * @file: target file (may be NULL)
++ * @ctx: target file open context
+ * @iov: array of vectors that define I/O buffer
+ * file_offset: offset in file to begin the operation
+ * nr_segs: size of iovec array
+@@ -358,8 +360,7 @@ sync_retry:
+ * that non-direct readers might access, so they will pick up these
+ * writes immediately.
+ */
+-static ssize_t
+-nfs_direct_write(struct inode *inode, struct file *file,
++static int nfs_direct_write(struct inode *inode, struct nfs_open_context *ctx,
+ const struct iovec *iov, loff_t file_offset,
+ unsigned long nr_segs)
+ {
+@@ -382,7 +383,7 @@ nfs_direct_write(struct inode *inode, st
+ return page_count;
+ }
+
+- result = nfs_direct_write_seg(inode, file, user_addr, size,
++ result = nfs_direct_write_seg(inode, ctx, user_addr, size,
+ file_offset, pages, page_count);
+ nfs_free_user_pages(pages, page_count, 0);
+
+@@ -414,6 +415,7 @@ nfs_direct_IO(int rw, struct kiocb *iocb
+ {
+ ssize_t result = -EINVAL;
+ struct file *file = iocb->ki_filp;
++ struct nfs_open_context *ctx;
+ struct dentry *dentry = file->f_dentry;
+ struct inode *inode = dentry->d_inode;
+
+@@ -423,19 +425,20 @@ nfs_direct_IO(int rw, struct kiocb *iocb
+ if (!is_sync_kiocb(iocb))
+ return result;
+
++ ctx = (struct nfs_open_context *)file->private_data;
+ switch (rw) {
+ case READ:
+ dprintk("NFS: direct_IO(read) (%s) off/no(%Lu/%lu)\n",
+ dentry->d_name.name, file_offset, nr_segs);
+
+- result = nfs_direct_read(inode, file, iov,
++ result = nfs_direct_read(inode, ctx, iov,
+ file_offset, nr_segs);
+ break;
+ case WRITE:
+ dprintk("NFS: direct_IO(write) (%s) off/no(%Lu/%lu)\n",
+ dentry->d_name.name, file_offset, nr_segs);
+
+- result = nfs_direct_write(inode, file, iov,
++ result = nfs_direct_write(inode, ctx, iov,
+ file_offset, nr_segs);
+ break;
+ default:
+@@ -471,6 +474,8 @@ nfs_file_direct_read(struct kiocb *iocb,
+ ssize_t retval = -EINVAL;
+ loff_t *ppos = &iocb->ki_pos;
+ struct file *file = iocb->ki_filp;
++ struct nfs_open_context *ctx =
++ (struct nfs_open_context *) file->private_data;
+ struct dentry *dentry = file->f_dentry;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+@@ -502,7 +507,7 @@ nfs_file_direct_read(struct kiocb *iocb,
+ goto out;
+ }
+
+- retval = nfs_direct_read(inode, file, &iov, pos, 1);
++ retval = nfs_direct_read(inode, ctx, &iov, pos, 1);
+ if (retval > 0)
+ *ppos = pos + retval;
+
+@@ -542,6 +547,8 @@ nfs_file_direct_write(struct kiocb *iocb
+ loff_t *ppos = &iocb->ki_pos;
+ unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
+ struct file *file = iocb->ki_filp;
++ struct nfs_open_context *ctx =
++ (struct nfs_open_context *) file->private_data;
+ struct dentry *dentry = file->f_dentry;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+@@ -589,7 +596,7 @@ nfs_file_direct_write(struct kiocb *iocb
+ goto out;
+ }
+
+- retval = nfs_direct_write(inode, file, &iov, pos, 1);
++ retval = nfs_direct_write(inode, ctx, &iov, pos, 1);
+ if (mapping->nrpages)
+ invalidate_inode_pages2(mapping);
+ if (retval > 0)
+--- linux-2.6.7/fs/nfs/nfs4state.c.lsec 2004-06-15 23:18:47.000000000 -0600
++++ linux-2.6.7/fs/nfs/nfs4state.c 2005-03-23 14:28:22.939562352 -0700
+@@ -40,11 +40,15 @@
+
+ #include <linux/config.h>
+ #include <linux/slab.h>
++#include <linux/smp_lock.h>
+ #include <linux/nfs_fs.h>
+ #include <linux/nfs_idmap.h>
+ #include <linux/workqueue.h>
+ #include <linux/bitops.h>
+
++#include "callback.h"
++#include "delegation.h"
++
+ #define OPENOWNER_POOL_SIZE 8
+
+ static spinlock_t state_spinlock = SPIN_LOCK_UNLOCKED;
+@@ -93,21 +97,26 @@ nfs4_alloc_client(struct in_addr *addr)
+ {
+ struct nfs4_client *clp;
+
+- if ((clp = kmalloc(sizeof(*clp), GFP_KERNEL))) {
+- memset(clp, 0, sizeof(*clp));
+- memcpy(&clp->cl_addr, addr, sizeof(clp->cl_addr));
+- init_rwsem(&clp->cl_sem);
+- INIT_LIST_HEAD(&clp->cl_state_owners);
+- INIT_LIST_HEAD(&clp->cl_unused);
+- spin_lock_init(&clp->cl_lock);
+- atomic_set(&clp->cl_count, 1);
+- INIT_WORK(&clp->cl_recoverd, nfs4_recover_state, clp);
+- INIT_WORK(&clp->cl_renewd, nfs4_renew_state, clp);
+- INIT_LIST_HEAD(&clp->cl_superblocks);
+- init_waitqueue_head(&clp->cl_waitq);
+- rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS4 client");
+- clp->cl_state = 1 << NFS4CLNT_NEW;
++ if (nfs_callback_up() < 0)
++ return NULL;
++ if ((clp = kmalloc(sizeof(*clp), GFP_KERNEL)) == NULL) {
++ nfs_callback_down();
++ return NULL;
+ }
++ memset(clp, 0, sizeof(*clp));
++ memcpy(&clp->cl_addr, addr, sizeof(clp->cl_addr));
++ init_rwsem(&clp->cl_sem);
++ INIT_LIST_HEAD(&clp->cl_delegations);
++ INIT_LIST_HEAD(&clp->cl_state_owners);
++ INIT_LIST_HEAD(&clp->cl_unused);
++ spin_lock_init(&clp->cl_lock);
++ atomic_set(&clp->cl_count, 1);
++ INIT_WORK(&clp->cl_recoverd, nfs4_recover_state, clp);
++ INIT_WORK(&clp->cl_renewd, nfs4_renew_state, clp);
++ INIT_LIST_HEAD(&clp->cl_superblocks);
++ init_waitqueue_head(&clp->cl_waitq);
++ rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS4 client");
++ clp->cl_state = 1 << NFS4CLNT_OK;
+ return clp;
+ }
+
+@@ -130,25 +139,52 @@ nfs4_free_client(struct nfs4_client *clp
+ if (clp->cl_rpcclient)
+ rpc_shutdown_client(clp->cl_rpcclient);
+ kfree(clp);
++ nfs_callback_down();
++}
++
++static struct nfs4_client *__nfs4_find_client(struct in_addr *addr)
++{
++ struct nfs4_client *clp;
++ list_for_each_entry(clp, &nfs4_clientid_list, cl_servers) {
++ if (memcmp(&clp->cl_addr, addr, sizeof(clp->cl_addr)) == 0) {
++ atomic_inc(&clp->cl_count);
++ return clp;
++ }
++ }
++ return NULL;
++}
++
++struct nfs4_client *nfs4_find_client(struct in_addr *addr)
++{
++ struct nfs4_client *clp;
++ spin_lock(&state_spinlock);
++ clp = __nfs4_find_client(addr);
++ spin_unlock(&state_spinlock);
++ return clp;
+ }
+
+ struct nfs4_client *
+ nfs4_get_client(struct in_addr *addr)
+ {
+- struct nfs4_client *new, *clp = NULL;
++ struct nfs4_client *clp, *new = NULL;
+
+- new = nfs4_alloc_client(addr);
+ spin_lock(&state_spinlock);
+- list_for_each_entry(clp, &nfs4_clientid_list, cl_servers) {
+- if (memcmp(&clp->cl_addr, addr, sizeof(clp->cl_addr)) == 0)
+- goto found;
++ for (;;) {
++ clp = __nfs4_find_client(addr);
++ if (clp != NULL)
++ break;
++ clp = new;
++ if (clp != NULL) {
++ list_add(&clp->cl_servers, &nfs4_clientid_list);
++ new = NULL;
++ break;
++ }
++ spin_unlock(&state_spinlock);
++ new = nfs4_alloc_client(addr);
++ spin_lock(&state_spinlock);
++ if (new == NULL)
++ break;
+ }
+- if (new)
+- list_add(&new->cl_servers, &nfs4_clientid_list);
+- spin_unlock(&state_spinlock);
+- return new;
+-found:
+- atomic_inc(&clp->cl_count);
+ spin_unlock(&state_spinlock);
+ if (new)
+ nfs4_free_client(new);
+@@ -169,6 +205,16 @@ nfs4_put_client(struct nfs4_client *clp)
+ nfs4_free_client(clp);
+ }
+
++int nfs4_init_client(struct nfs4_client *clp)
++{
++ int status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, nfs_callback_tcpport);
++ if (status == 0)
++ status = nfs4_proc_setclientid_confirm(clp);
++ if (status == 0)
++ nfs4_schedule_state_renewal(clp);
++ return status;
++}
++
+ u32
+ nfs4_alloc_lockowner_id(struct nfs4_client *clp)
+ {
+@@ -185,7 +231,6 @@ nfs4_client_grab_unused(struct nfs4_clie
+ atomic_inc(&sp->so_count);
+ sp->so_cred = cred;
+ list_move(&sp->so_list, &clp->cl_state_owners);
+- sp->so_generation = clp->cl_generation;
+ clp->cl_nunused--;
+ }
+ return sp;
+@@ -224,6 +269,7 @@ nfs4_alloc_state_owner(void)
+ init_MUTEX(&sp->so_sema);
+ sp->so_seqid = 0; /* arbitrary */
+ INIT_LIST_HEAD(&sp->so_states);
++ INIT_LIST_HEAD(&sp->so_delegations);
+ atomic_set(&sp->so_count, 1);
+ return sp;
+ }
+@@ -237,8 +283,11 @@ nfs4_unhash_state_owner(struct nfs4_stat
+ spin_unlock(&clp->cl_lock);
+ }
+
+-struct nfs4_state_owner *
+-nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred)
++/*
++ * Note: must be called with clp->cl_sem held in order to prevent races
++ * with reboot recovery!
++ */
++struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred)
+ {
+ struct nfs4_client *clp = server->nfs4_state;
+ struct nfs4_state_owner *sp, *new;
+@@ -254,23 +303,23 @@ nfs4_get_state_owner(struct nfs_server *
+ new->so_client = clp;
+ new->so_id = nfs4_alloc_lockowner_id(clp);
+ new->so_cred = cred;
+- new->so_generation = clp->cl_generation;
+ sp = new;
+ new = NULL;
+ }
+ spin_unlock(&clp->cl_lock);
+ if (new)
+ kfree(new);
+- if (sp) {
+- if (!test_bit(NFS4CLNT_OK, &clp->cl_state))
+- nfs4_wait_clnt_recover(server->client, clp);
+- } else
+- put_rpccred(cred);
+- return sp;
++ if (sp != NULL)
++ return sp;
++ put_rpccred(cred);
++ return NULL;
+ }
+
+-void
+-nfs4_put_state_owner(struct nfs4_state_owner *sp)
++/*
++ * Must be called with clp->cl_sem held in order to avoid races
++ * with state recovery...
++ */
++void nfs4_put_state_owner(struct nfs4_state_owner *sp)
+ {
+ struct nfs4_client *clp = sp->so_client;
+ struct rpc_cred *cred = sp->so_cred;
+@@ -330,8 +379,6 @@ __nfs4_find_state(struct inode *inode, s
+ continue;
+ if ((state->state & mode) != mode)
+ continue;
+- /* Add the state to the head of the inode's list */
+- list_move(&state->inode_states, &nfsi->open_states);
+ atomic_inc(&state->count);
+ if (mode & FMODE_READ)
+ state->nreaders++;
+@@ -353,8 +400,6 @@ __nfs4_find_state_byowner(struct inode *
+ if (state->nreaders == 0 && state->nwriters == 0)
+ continue;
+ if (state->owner == owner) {
+- /* Add the state to the head of the inode's list */
+- list_move(&state->inode_states, &nfsi->open_states);
+ atomic_inc(&state->count);
+ return state;
+ }
+@@ -411,51 +456,40 @@ out:
+ return state;
+ }
+
+-static void
+-__nfs4_put_open_state(struct nfs4_state *state)
++/*
++ * Beware! Caller must be holding exactly one
++ * reference to clp->cl_sem and owner->so_sema!
++ */
++void nfs4_put_open_state(struct nfs4_state *state)
+ {
+ struct inode *inode = state->inode;
+ struct nfs4_state_owner *owner = state->owner;
+- int status = 0;
+
+- if (!atomic_dec_and_lock(&state->count, &inode->i_lock)) {
+- up(&owner->so_sema);
++ if (!atomic_dec_and_lock(&state->count, &inode->i_lock))
+ return;
+- }
+ if (!list_empty(&state->inode_states))
+ list_del(&state->inode_states);
+ spin_unlock(&inode->i_lock);
+ list_del(&state->open_states);
+- if (state->state != 0) {
+- do {
+- status = nfs4_do_close(inode, state);
+- if (!status)
+- break;
+- up(&owner->so_sema);
+- status = nfs4_handle_error(NFS_SERVER(inode), status);
+- down(&owner->so_sema);
+- } while (!status);
+- }
+- up(&owner->so_sema);
++ BUG_ON (state->state != 0);
+ nfs4_free_open_state(state);
+ nfs4_put_state_owner(owner);
+ }
+
+-void
+-nfs4_put_open_state(struct nfs4_state *state)
+-{
+- down(&state->owner->so_sema);
+- __nfs4_put_open_state(state);
+-}
+-
+-void
+-nfs4_close_state(struct nfs4_state *state, mode_t mode)
++/*
++ * Beware! Caller must be holding no references to clp->cl_sem!
++ * or owner->so_sema!
++ */
++void nfs4_close_state(struct nfs4_state *state, mode_t mode)
+ {
+ struct inode *inode = state->inode;
+ struct nfs4_state_owner *owner = state->owner;
++ struct nfs4_client *clp = owner->so_client;
+ int newstate;
+ int status = 0;
+
++ atomic_inc(&owner->so_count);
++ down_read(&clp->cl_sem);
+ down(&owner->so_sema);
+ /* Protect against nfs4_find_state() */
+ spin_lock(&inode->i_lock);
+@@ -466,29 +500,24 @@ nfs4_close_state(struct nfs4_state *stat
+ if (state->nwriters == 0 && state->nreaders == 0)
+ list_del_init(&state->inode_states);
+ spin_unlock(&inode->i_lock);
+- do {
+- newstate = 0;
+- if (state->state == 0)
+- break;
++ newstate = 0;
++ if (state->state != 0) {
+ if (state->nreaders)
+ newstate |= FMODE_READ;
+ if (state->nwriters)
+ newstate |= FMODE_WRITE;
+ if (state->state == newstate)
+- break;
++ goto out;
+ if (newstate != 0)
+ status = nfs4_do_downgrade(inode, state, newstate);
+ else
+ status = nfs4_do_close(inode, state);
+- if (!status) {
+- state->state = newstate;
+- break;
+- }
+- up(&owner->so_sema);
+- status = nfs4_handle_error(NFS_SERVER(inode), status);
+- down(&owner->so_sema);
+- } while (!status);
+- __nfs4_put_open_state(state);
++ }
++out:
++ nfs4_put_open_state(state);
++ up(&owner->so_sema);
++ nfs4_put_state_owner(owner);
++ up_read(&clp->cl_sem);
+ }
+
+ /*
+@@ -496,11 +525,11 @@ nfs4_close_state(struct nfs4_state *stat
+ * that is compatible with current->files
+ */
+ static struct nfs4_lock_state *
+-__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
++__nfs4_find_lock_state(struct nfs4_state *state, unsigned int pid)
+ {
+ struct nfs4_lock_state *pos;
+ list_for_each_entry(pos, &state->lock_states, ls_locks) {
+- if (pos->ls_owner != fl_owner)
++ if (pos->ls_pid != pid)
+ continue;
+ atomic_inc(&pos->ls_count);
+ return pos;
+@@ -509,23 +538,16 @@ __nfs4_find_lock_state(struct nfs4_state
+ }
+
+ struct nfs4_lock_state *
+-nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
++nfs4_find_lock_state(struct nfs4_state *state, unsigned int pid)
+ {
+ struct nfs4_lock_state *lsp;
+ read_lock(&state->state_lock);
+- lsp = __nfs4_find_lock_state(state, fl_owner);
++ lsp = __nfs4_find_lock_state(state, pid);
+ read_unlock(&state->state_lock);
+ return lsp;
+ }
+
+-/*
+- * Return a compatible lock_state. If no initialized lock_state structure
+- * exists, return an uninitialized one.
+- *
+- * The caller must be holding state->lock_sema
+- */
+-struct nfs4_lock_state *
+-nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
++static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, unsigned int pid)
+ {
+ struct nfs4_lock_state *lsp;
+ struct nfs4_client *clp = state->owner->so_client;
+@@ -533,12 +555,12 @@ nfs4_alloc_lock_state(struct nfs4_state
+ lsp = kmalloc(sizeof(*lsp), GFP_KERNEL);
+ if (lsp == NULL)
+ return NULL;
++ lsp->flags = 0;
+ lsp->ls_seqid = 0; /* arbitrary */
+ lsp->ls_id = -1;
+ memset(lsp->ls_stateid.data, 0, sizeof(lsp->ls_stateid.data));
+ atomic_set(&lsp->ls_count, 1);
+- lsp->ls_owner = fl_owner;
+- lsp->ls_parent = state;
++ lsp->ls_pid = pid;
+ INIT_LIST_HEAD(&lsp->ls_locks);
+ spin_lock(&clp->cl_lock);
+ lsp->ls_id = nfs4_alloc_lockowner_id(clp);
+@@ -547,16 +569,32 @@ nfs4_alloc_lock_state(struct nfs4_state
+ }
+
+ /*
++ * Return a compatible lock_state. If no initialized lock_state structure
++ * exists, return an uninitialized one.
++ *
++ * The caller must be holding state->lock_sema and clp->cl_sem
++ */
++struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, unsigned int pid)
++{
++ struct nfs4_lock_state * lsp;
++
++ lsp = nfs4_find_lock_state(state, pid);
++ if (lsp == NULL)
++ lsp = nfs4_alloc_lock_state(state, pid);
++ return lsp;
++}
++
++/*
+ * Byte-range lock aware utility to initialize the stateid of read/write
+ * requests.
+ */
+ void
+-nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner)
++nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, unsigned int pid)
+ {
+ if (test_bit(LK_STATE_IN_USE, &state->flags)) {
+ struct nfs4_lock_state *lsp;
+
+- lsp = nfs4_find_lock_state(state, fl_owner);
++ lsp = nfs4_find_lock_state(state, pid);
+ if (lsp) {
+ memcpy(dst, &lsp->ls_stateid, sizeof(*dst));
+ nfs4_put_lock_state(lsp);
+@@ -567,13 +605,14 @@ nfs4_copy_stateid(nfs4_stateid *dst, str
+ }
+
+ /*
+-* Called with state->lock_sema held.
++* Called with state->lock_sema and clp->cl_sem held.
+ */
+-void
+-nfs4_increment_lock_seqid(int status, struct nfs4_lock_state *lsp)
++void nfs4_increment_lock_seqid(int status, struct nfs4_lock_state *lsp)
+ {
+- if (status == NFS_OK || seqid_mutating_err(-status))
++ if (status == NFS_OK || seqid_mutating_err(-status)) {
+ lsp->ls_seqid++;
++ lsp->flags |= NFS_LOCK_INITIALIZED;
++ }
+ }
+
+ /*
+@@ -598,12 +637,11 @@ nfs4_check_unlock(struct file_lock *fl,
+ * Post an initialized lock_state on the state->lock_states list.
+ */
+ void
+-nfs4_notify_setlk(struct inode *inode, struct file_lock *request, struct nfs4_lock_state *lsp)
++nfs4_notify_setlk(struct nfs4_state *state, struct file_lock *request, struct nfs4_lock_state *lsp)
+ {
+- struct nfs4_state *state = lsp->ls_parent;
+-
+ if (!list_empty(&lsp->ls_locks))
+ return;
++ atomic_inc(&lsp->ls_count);
+ write_lock(&state->state_lock);
+ list_add(&lsp->ls_locks, &state->lock_states);
+ set_bit(LK_STATE_IN_USE, &state->flags);
+@@ -620,15 +658,15 @@ nfs4_notify_setlk(struct inode *inode, s
+ *
+ */
+ void
+-nfs4_notify_unlck(struct inode *inode, struct file_lock *request, struct nfs4_lock_state *lsp)
++nfs4_notify_unlck(struct nfs4_state *state, struct file_lock *request, struct nfs4_lock_state *lsp)
+ {
+- struct nfs4_state *state = lsp->ls_parent;
++ struct inode *inode = state->inode;
+ struct file_lock *fl;
+
+ for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
+ if (!(fl->fl_flags & FL_POSIX))
+ continue;
+- if (fl->fl_owner != lsp->ls_owner)
++ if (fl->fl_pid != lsp->ls_pid)
+ continue;
+ /* Exit if we find at least one lock which is not consumed */
+ if (nfs4_check_unlock(fl,request) == 0)
+@@ -640,6 +678,7 @@ nfs4_notify_unlck(struct inode *inode, s
+ if (list_empty(&state->lock_states))
+ clear_bit(LK_STATE_IN_USE, &state->flags);
+ write_unlock(&state->state_lock);
++ nfs4_put_lock_state(lsp);
+ }
+
+ /*
+@@ -651,20 +690,18 @@ nfs4_put_lock_state(struct nfs4_lock_sta
+ {
+ if (!atomic_dec_and_test(&lsp->ls_count))
+ return;
+- if (!list_empty(&lsp->ls_locks))
+- return;
++ BUG_ON (!list_empty(&lsp->ls_locks));
+ kfree(lsp);
+ }
+
+ /*
+-* Called with sp->so_sema held.
++* Called with sp->so_sema and clp->cl_sem held.
+ *
+ * Increment the seqid if the OPEN/OPEN_DOWNGRADE/CLOSE succeeded, or
+ * failed with a seqid incrementing error -
+ * see comments nfs_fs.h:seqid_mutating_error()
+ */
+-void
+-nfs4_increment_seqid(int status, struct nfs4_state_owner *sp)
++void nfs4_increment_seqid(int status, struct nfs4_state_owner *sp)
+ {
+ if (status == NFS_OK || seqid_mutating_err(-status))
+ sp->so_seqid++;
+@@ -693,21 +730,14 @@ nfs4_recover_state(void *data)
+
+ init_completion(&args.complete);
+
+- down_read(&clp->cl_sem);
+- if (test_and_set_bit(NFS4CLNT_SETUP_STATE, &clp->cl_state))
+- goto out_failed;
+ if (kernel_thread(reclaimer, &args, CLONE_KERNEL) < 0)
+ goto out_failed_clear;
+ wait_for_completion(&args.complete);
+ return;
+ out_failed_clear:
+- smp_mb__before_clear_bit();
+- clear_bit(NFS4CLNT_SETUP_STATE, &clp->cl_state);
+- smp_mb__after_clear_bit();
++ set_bit(NFS4CLNT_OK, &clp->cl_state);
+ wake_up_all(&clp->cl_waitq);
+ rpc_wake_up(&clp->cl_rpcwaitq);
+-out_failed:
+- up_read(&clp->cl_sem);
+ }
+
+ /*
+@@ -718,24 +748,66 @@ nfs4_schedule_state_recovery(struct nfs4
+ {
+ if (!clp)
+ return;
+- smp_mb__before_clear_bit();
+- clear_bit(NFS4CLNT_OK, &clp->cl_state);
+- smp_mb__after_clear_bit();
+- schedule_work(&clp->cl_recoverd);
++ if (test_and_clear_bit(NFS4CLNT_OK, &clp->cl_state))
++ schedule_work(&clp->cl_recoverd);
+ }
+
+-static int
+-nfs4_reclaim_open_state(struct nfs4_state_owner *sp)
++static int nfs4_reclaim_locks(struct nfs4_state *state)
++{
++ struct inode *inode = state->inode;
++ struct file_lock *fl;
++ int status = 0;
++
++ for (fl = inode->i_flock; fl != 0; fl = fl->fl_next) {
++ if (!(fl->fl_flags & FL_POSIX))
++ continue;
++ if (((struct nfs_open_context *)fl->fl_file->private_data)->state != state)
++ continue;
++ status = nfs4_lock_reclaim(state, fl);
++ if (status >= 0)
++ continue;
++ switch (status) {
++ default:
++ printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n",
++ __FUNCTION__, status);
++ case -NFS4ERR_EXPIRED:
++ case -NFS4ERR_NO_GRACE:
++ case -NFS4ERR_RECLAIM_BAD:
++ case -NFS4ERR_RECLAIM_CONFLICT:
++ /* kill_proc(fl->fl_pid, SIGLOST, 1); */
++ break;
++ case -NFS4ERR_STALE_CLIENTID:
++ goto out_err;
++ }
++ }
++ return 0;
++out_err:
++ return status;
++}
++
++static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp)
+ {
+ struct nfs4_state *state;
++ struct nfs4_lock_state *lock;
+ int status = 0;
+
+ list_for_each_entry(state, &sp->so_states, open_states) {
+ if (state->state == 0)
+ continue;
+ status = nfs4_open_reclaim(sp, state);
+- if (status >= 0)
++ list_for_each_entry(lock, &state->lock_states, ls_locks)
++ lock->flags &= ~NFS_LOCK_INITIALIZED;
++ if (status >= 0) {
++ status = nfs4_reclaim_locks(state);
++ if (status < 0)
++ goto out_err;
++ list_for_each_entry(lock, &state->lock_states, ls_locks) {
++ if (!(lock->flags & NFS_LOCK_INITIALIZED))
++ printk("%s: Lock reclaim failed!\n",
++ __FUNCTION__);
++ }
+ continue;
++ }
+ switch (status) {
+ default:
+ printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n",
+@@ -762,75 +834,55 @@ out_err:
+ return status;
+ }
+
+-static int
+-reclaimer(void *ptr)
++static int reclaimer(void *ptr)
+ {
+ struct reclaimer_args *args = (struct reclaimer_args *)ptr;
+ struct nfs4_client *clp = args->clp;
+ struct nfs4_state_owner *sp;
+- int generation;
+ int status;
+
+ daemonize("%u.%u.%u.%u-reclaim", NIPQUAD(clp->cl_addr));
+ allow_signal(SIGKILL);
+
++ atomic_inc(&clp->cl_count);
+ complete(&args->complete);
+
++ /* Ensure exclusive access to NFSv4 state */
++ lock_kernel();
++ down_write(&clp->cl_sem);
+ /* Are there any NFS mounts out there? */
+ if (list_empty(&clp->cl_superblocks))
+ goto out;
+- if (!test_bit(NFS4CLNT_NEW, &clp->cl_state)) {
+- status = nfs4_proc_renew(clp);
+- if (status == 0) {
+- set_bit(NFS4CLNT_OK, &clp->cl_state);
+- goto out;
+- }
+- }
+- status = nfs4_proc_setclientid(clp, 0, 0);
+- if (status)
+- goto out_error;
+- status = nfs4_proc_setclientid_confirm(clp);
++restart_loop:
++ status = nfs4_proc_renew(clp);
++ if (status == 0)
++ goto out;
++ status = nfs4_init_client(clp);
+ if (status)
+ goto out_error;
+- generation = ++(clp->cl_generation);
+- clear_bit(NFS4CLNT_NEW, &clp->cl_state);
+- set_bit(NFS4CLNT_OK, &clp->cl_state);
+- up_read(&clp->cl_sem);
+- nfs4_schedule_state_renewal(clp);
+-restart_loop:
+- spin_lock(&clp->cl_lock);
++ /* Mark all delegations for reclaim */
++ nfs_delegation_mark_reclaim(clp);
++ /* Note: list is protected by exclusive lock on cl->cl_sem */
+ list_for_each_entry(sp, &clp->cl_state_owners, so_list) {
+- if (sp->so_generation - generation >= 0)
+- continue;
+- atomic_inc(&sp->so_count);
+- spin_unlock(&clp->cl_lock);
+- down(&sp->so_sema);
+- if (sp->so_generation - generation < 0) {
+- smp_rmb();
+- sp->so_generation = clp->cl_generation;
+- status = nfs4_reclaim_open_state(sp);
+- }
+- up(&sp->so_sema);
+- nfs4_put_state_owner(sp);
++ status = nfs4_reclaim_open_state(sp);
+ if (status < 0) {
+ if (status == -NFS4ERR_STALE_CLIENTID)
+- nfs4_schedule_state_recovery(clp);
+- goto out;
++ goto restart_loop;
++ goto out_error;
+ }
+- goto restart_loop;
+ }
+- spin_unlock(&clp->cl_lock);
++ nfs_delegation_reap_unclaimed(clp);
+ out:
+- smp_mb__before_clear_bit();
+- clear_bit(NFS4CLNT_SETUP_STATE, &clp->cl_state);
+- smp_mb__after_clear_bit();
++ set_bit(NFS4CLNT_OK, &clp->cl_state);
++ up_write(&clp->cl_sem);
++ unlock_kernel();
+ wake_up_all(&clp->cl_waitq);
+ rpc_wake_up(&clp->cl_rpcwaitq);
++ nfs4_put_client(clp);
+ return 0;
+ out_error:
+- printk(KERN_WARNING "Error: state recovery failed on NFSv4 server %u.%u.%u.%u\n",
+- NIPQUAD(clp->cl_addr.s_addr));
+- up_read(&clp->cl_sem);
++ printk(KERN_WARNING "Error: state recovery failed on NFSv4 server %u.%u.%u.%u with error %d\n",
++ NIPQUAD(clp->cl_addr.s_addr), -status);
+ goto out;
+ }
+
+--- linux-2.6.7/fs/nfs/inode.c.lsec 2004-06-15 23:19:44.000000000 -0600
++++ linux-2.6.7/fs/nfs/inode.c 2005-03-23 14:28:22.818580744 -0700
+@@ -39,6 +39,8 @@
+ #include <asm/system.h>
+ #include <asm/uaccess.h>
+
++#include "delegation.h"
++
+ #define NFSDBG_FACILITY NFSDBG_VFS
+ #define NFS_PARANOIA 1
+
+@@ -123,8 +125,9 @@ nfs_delete_inode(struct inode * inode)
+ {
+ dprintk("NFS: delete_inode(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino);
+
++ nfs_wb_all(inode);
+ /*
+- * The following can never actually happen...
++ * The following should never happen...
+ */
+ if (nfs_have_writebacks(inode)) {
+ printk(KERN_ERR "nfs_delete_inode: inode %ld has pending RPC requests\n", inode->i_ino);
+@@ -133,18 +136,15 @@ nfs_delete_inode(struct inode * inode)
+ clear_inode(inode);
+ }
+
+-/*
+- * For the moment, the only task for the NFS clear_inode method is to
+- * release the mmap credential
+- */
+ static void
+ nfs_clear_inode(struct inode *inode)
+ {
+ struct nfs_inode *nfsi = NFS_I(inode);
+- struct rpc_cred *cred = nfsi->mm_cred;
++ struct rpc_cred *cred;
+
+- if (cred)
+- put_rpccred(cred);
++ nfs4_zap_acl_attr(inode);
++ nfs_wb_all(inode);
++ BUG_ON (!list_empty(&nfsi->open_files));
+ cred = nfsi->cache_access.cred;
+ if (cred)
+ put_rpccred(cred);
+@@ -704,7 +704,7 @@ nfs_fhget(struct super_block *sb, struct
+ /* Why so? Because we want revalidate for devices/FIFOs, and
+ * that's precisely what we have in nfs_file_inode_operations.
+ */
+- inode->i_op = &nfs_file_inode_operations;
++ inode->i_op = NFS_SB(sb)->rpc_ops->file_inode_ops;
+ if (S_ISREG(inode->i_mode)) {
+ inode->i_fop = &nfs_file_operations;
+ inode->i_data.a_ops = &nfs_file_aops;
+@@ -859,53 +859,114 @@ int nfs_getattr(struct vfsmount *mnt, st
+ return err;
+ }
+
++struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, struct rpc_cred *cred)
++{
++ struct nfs_open_context *ctx;
++
++ ctx = (struct nfs_open_context *)kmalloc(sizeof(*ctx), GFP_KERNEL);
++ if (ctx != NULL) {
++ atomic_set(&ctx->count, 1);
++ ctx->dentry = dget(dentry);
++ ctx->cred = get_rpccred(cred);
++ ctx->state = NULL;
++ ctx->pid = current->tgid;
++ ctx->error = 0;
++ init_waitqueue_head(&ctx->waitq);
++ }
++ return ctx;
++}
++
++struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
++{
++ if (ctx != NULL)
++ atomic_inc(&ctx->count);
++ return ctx;
++}
++
++void put_nfs_open_context(struct nfs_open_context *ctx)
++{
++ if (atomic_dec_and_test(&ctx->count)) {
++ if (ctx->state != NULL)
++ nfs4_close_state(ctx->state, ctx->mode);
++ if (ctx->cred != NULL)
++ put_rpccred(ctx->cred);
++ dput(ctx->dentry);
++ kfree(ctx);
++ }
++}
++
+ /*
+ * Ensure that mmap has a recent RPC credential for use when writing out
+ * shared pages
+ */
+-void
+-nfs_set_mmcred(struct inode *inode, struct rpc_cred *cred)
++void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
++{
++ struct inode *inode = filp->f_dentry->d_inode;
++ struct nfs_inode *nfsi = NFS_I(inode);
++
++ filp->private_data = get_nfs_open_context(ctx);
++ spin_lock(&inode->i_lock);
++ list_add(&ctx->list, &nfsi->open_files);
++ spin_unlock(&inode->i_lock);
++}
++
++struct nfs_open_context *nfs_find_open_context(struct inode *inode, int mode)
++{
++ struct nfs_inode *nfsi = NFS_I(inode);
++ struct nfs_open_context *pos, *ctx = NULL;
++
++ spin_lock(&inode->i_lock);
++ list_for_each_entry(pos, &nfsi->open_files, list) {
++ if ((pos->mode & mode) == mode) {
++ ctx = get_nfs_open_context(pos);
++ break;
++ }
++ }
++ spin_unlock(&inode->i_lock);
++ return ctx;
++}
++
++void nfs_file_clear_open_context(struct file *filp)
+ {
+- struct rpc_cred **p = &NFS_I(inode)->mm_cred,
+- *oldcred = *p;
++ struct inode *inode = filp->f_dentry->d_inode;
++ struct nfs_open_context *ctx = (struct nfs_open_context *)filp->private_data;
+
+- *p = get_rpccred(cred);
+- if (oldcred)
+- put_rpccred(oldcred);
++ if (ctx) {
++ filp->private_data = NULL;
++ spin_lock(&inode->i_lock);
++ list_del(&ctx->list);
++ spin_unlock(&inode->i_lock);
++ put_nfs_open_context(ctx);
++ }
+ }
+
+ /*
+- * These are probably going to contain hooks for
+- * allocating and releasing RPC credentials for
+- * the file. I'll have to think about Tronds patch
+- * a bit more..
++ * These allocate and release file read/write context information.
+ */
+ int nfs_open(struct inode *inode, struct file *filp)
+ {
+- struct rpc_auth *auth;
++ struct nfs_open_context *ctx;
+ struct rpc_cred *cred;
+
+- auth = NFS_CLIENT(inode)->cl_auth;
+- cred = rpcauth_lookupcred(auth, 0);
+- filp->private_data = cred;
+- if ((filp->f_mode & FMODE_WRITE) != 0) {
+- nfs_set_mmcred(inode, cred);
++ if ((cred = rpcauth_lookupcred(NFS_CLIENT(inode)->cl_auth, 0)) == NULL)
++ return -ENOMEM;
++ ctx = alloc_nfs_open_context(filp->f_dentry, cred);
++ put_rpccred(cred);
++ if (ctx == NULL)
++ return -ENOMEM;
++ ctx->mode = filp->f_mode;
++ nfs_file_set_open_context(filp, ctx);
++ put_nfs_open_context(ctx);
++ if ((filp->f_mode & FMODE_WRITE) != 0)
+ nfs_begin_data_update(inode);
+- }
+ return 0;
+ }
+
+ int nfs_release(struct inode *inode, struct file *filp)
+ {
+- struct rpc_cred *cred;
+-
+- lock_kernel();
+ if ((filp->f_mode & FMODE_WRITE) != 0)
+ nfs_end_data_update(inode);
+- cred = nfs_file_cred(filp);
+- if (cred)
+- put_rpccred(cred);
+- unlock_kernel();
++ nfs_file_clear_open_context(filp);
+ return 0;
+ }
+
+@@ -1002,6 +1063,30 @@ out:
+ return status;
+ }
+
++int nfs_attribute_timeout(struct inode *inode)
++{
++ struct nfs_inode *nfsi = NFS_I(inode);
++
++ if (nfs_have_delegation(inode, FMODE_READ))
++ return 0;
++ return time_after(jiffies, nfsi->read_cache_jiffies+nfsi->attrtimeo);
++}
++
++/**
++ * nfs_revalidate_inode - Revalidate the inode attributes
++ * @server - pointer to nfs_server struct
++ * @inode - pointer to inode struct
++ *
++ * Updates inode attribute information by retrieving the data from the server.
++ */
++int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
++{
++ if (!(NFS_FLAGS(inode) & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))
++ && !nfs_attribute_timeout(inode))
++ return NFS_STALE(inode) ? -ESTALE : 0;
++ return __nfs_revalidate_inode(server, inode);
++}
++
+ /**
+ * nfs_begin_data_update
+ * @inode - pointer to inode
+@@ -1023,11 +1108,13 @@ void nfs_end_data_update(struct inode *i
+ {
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+- /* Mark the attribute cache for revalidation */
+- nfsi->flags |= NFS_INO_INVALID_ATTR;
+- /* Directories and symlinks: invalidate page cache too */
+- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+- nfsi->flags |= NFS_INO_INVALID_DATA;
++ if (!nfs_have_delegation(inode, FMODE_READ)) {
++ /* Mark the attribute cache for revalidation */
++ nfsi->flags |= NFS_INO_INVALID_ATTR;
++ /* Directories and symlinks: invalidate page cache too */
++ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
++ nfsi->flags |= NFS_INO_INVALID_DATA;
++ }
+ nfsi->cache_change_attribute ++;
+ atomic_dec(&nfsi->data_updates);
+ }
+@@ -1068,6 +1155,10 @@ int nfs_refresh_inode(struct inode *inod
+ loff_t cur_size, new_isize;
+ int data_unstable;
+
++ /* Do we hold a delegation? */
++ if (nfs_have_delegation(inode, FMODE_READ))
++ return 0;
++
+ /* Are we in the process of updating data on the server? */
+ data_unstable = nfs_caches_unstable(inode);
+
+@@ -1240,6 +1331,7 @@ static int nfs_update_inode(struct inode
+ inode->i_nlink = fattr->nlink;
+ inode->i_uid = fattr->uid;
+ inode->i_gid = fattr->gid;
++ nfs4_zap_acl_attr(inode);
+
+ if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) {
+ /*
+@@ -1265,7 +1357,8 @@ static int nfs_update_inode(struct inode
+ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
+ || S_ISLNK(inode->i_mode)))
+ invalid &= ~NFS_INO_INVALID_DATA;
+- nfsi->flags |= invalid;
++ if (!nfs_have_delegation(inode, FMODE_READ))
++ nfsi->flags |= invalid;
+
+ return 0;
+ out_changed:
+@@ -1400,6 +1493,52 @@ static struct file_system_type nfs_fs_ty
+
+ #ifdef CONFIG_NFS_V4
+
++#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
++
++int
++nfs_setxattr(struct dentry *dentry, const char *key, const void *buf,
++ size_t buflen, int flags)
++{
++ struct inode *inode = dentry->d_inode;
++
++ if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0)
++ return -EINVAL;
++
++ if (!S_ISREG(inode->i_mode) &&
++ (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
++ return -EPERM;
++
++ return nfs4_proc_set_acl(inode, buf, buflen);
++}
++
++/* The getxattr man page suggests returning -ENODATA for unknown attributes,
++ * and that's what we'll do for e.g. user attributes that haven't been set.
++ * But we'll follow ext2/ext3's lead by returning -EOPNOTSUPP for unsupported
++ * attributes in kernel-managed attribute namespaces. */
++ssize_t
++nfs_getxattr(struct dentry *dentry, const char *key, void *buf,
++ size_t buflen)
++{
++ struct inode *inode = dentry->d_inode;
++
++ if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0)
++ return -EOPNOTSUPP;
++
++ return nfs4_proc_get_acl(inode, buf, buflen);
++}
++
++ssize_t
++nfs_listxattr(struct dentry *dentry, char *buf, size_t buflen)
++{
++ ssize_t len = strlen(XATTR_NAME_NFSV4_ACL) + 1;
++
++ if (buf && buflen < len)
++ return -ERANGE;
++ if (buf)
++ memcpy(buf, XATTR_NAME_NFSV4_ACL, len);
++ return len;
++}
++
+ static void nfs4_clear_inode(struct inode *);
+
+ static struct super_operations nfs4_sops = {
+@@ -1423,6 +1562,12 @@ static void nfs4_clear_inode(struct inod
+ {
+ struct nfs_inode *nfsi = NFS_I(inode);
+
++ /* If we are holding a delegation, return it! */
++ if (nfsi->delegation != NULL)
++ nfs_inode_return_delegation(inode);
++ /* First call standard NFS clear_inode() code */
++ nfs_clear_inode(inode);
++ /* Now clear out any remaining state */
+ while (!list_empty(&nfsi->open_states)) {
+ struct nfs4_state *state;
+
+@@ -1437,8 +1582,6 @@ static void nfs4_clear_inode(struct inod
+ BUG_ON(atomic_read(&state->count) != 1);
+ nfs4_close_state(state, state->state);
+ }
+- /* Now call standard NFS clear_inode() code */
+- nfs_clear_inode(inode);
+ }
+
+
+@@ -1536,8 +1679,19 @@ static int nfs4_fill_super(struct super_
+ memcpy(clp->cl_ipaddr, server->ip_addr, sizeof(clp->cl_ipaddr));
+ nfs_idmap_new(clp);
+ }
+- if (list_empty(&clp->cl_superblocks))
+- clear_bit(NFS4CLNT_OK, &clp->cl_state);
++ /* Fire up rpciod if not yet running */
++ if (rpciod_up() != 0) {
++ printk(KERN_WARNING "NFS: couldn't start rpciod!\n");
++ goto out_fail;
++ }
++
++ if (list_empty(&clp->cl_superblocks)) {
++ err = nfs4_init_client(clp);
++ if (err != 0) {
++ up_write(&clp->cl_sem);
++ goto out_rpciod;
++ }
++ }
+ list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks);
+ clnt = rpc_clone_client(clp->cl_rpcclient);
+ if (!IS_ERR(clnt))
+@@ -1567,17 +1721,10 @@ static int nfs4_fill_super(struct super_
+ }
+ }
+
+- /* Fire up rpciod if not yet running */
+- if (rpciod_up() != 0) {
+- printk(KERN_WARNING "NFS: couldn't start rpciod!\n");
+- goto out_shutdown;
+- }
+-
+ sb->s_op = &nfs4_sops;
+ err = nfs_sb_init(sb, authflavour);
+ if (err == 0)
+ return 0;
+- rpciod_down();
+ out_shutdown:
+ rpc_shutdown_client(server->client);
+ out_remove_list:
+@@ -1585,6 +1732,8 @@ out_remove_list:
+ list_del_init(&server->nfs4_siblings);
+ up_write(&server->nfs4_state->cl_sem);
+ destroy_nfsv4_state(server);
++out_rpciod:
++ rpciod_down();
+ out_fail:
+ if (clp)
+ nfs4_put_client(clp);
+@@ -1709,22 +1858,31 @@ out_free:
+ return s;
+ }
+
++static void nfs4_kill_super(struct super_block *sb)
++{
++ nfs_return_all_delegations(sb);
++ nfs_kill_super(sb);
++}
++
+ static struct file_system_type nfs4_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "nfs4",
+ .get_sb = nfs4_get_sb,
+- .kill_sb = nfs_kill_super,
++ .kill_sb = nfs4_kill_super,
+ .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+ };
+
+-#define nfs4_zero_state(nfsi) \
++#define nfs4_init_once(nfsi) \
+ do { \
+ INIT_LIST_HEAD(&(nfsi)->open_states); \
++ nfsi->delegation = NULL; \
++ nfsi->delegation_state = 0; \
++ init_rwsem(&nfsi->rwsem); \
+ } while(0)
+ #define register_nfs4fs() register_filesystem(&nfs4_fs_type)
+ #define unregister_nfs4fs() unregister_filesystem(&nfs4_fs_type)
+ #else
+-#define nfs4_zero_state(nfsi) \
++#define nfs4_init_once(nfsi) \
+ do { } while (0)
+ #define register_nfs4fs() (0)
+ #define unregister_nfs4fs()
+@@ -1746,8 +1904,8 @@ static struct inode *nfs_alloc_inode(str
+ if (!nfsi)
+ return NULL;
+ nfsi->flags = 0;
+- nfsi->mm_cred = NULL;
+- nfs4_zero_state(nfsi);
++ nfsi->acl_len = 0;
++ nfsi->acl = NULL;
+ return &nfsi->vfs_inode;
+ }
+
+@@ -1765,12 +1923,14 @@ static void init_once(void * foo, kmem_c
+ inode_init_once(&nfsi->vfs_inode);
+ INIT_LIST_HEAD(&nfsi->dirty);
+ INIT_LIST_HEAD(&nfsi->commit);
++ INIT_LIST_HEAD(&nfsi->open_files);
+ INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
+ atomic_set(&nfsi->data_updates, 0);
+ nfsi->ndirty = 0;
+ nfsi->ncommit = 0;
+ nfsi->npages = 0;
+ init_waitqueue_head(&nfsi->nfs_i_wait);
++ nfs4_init_once(nfsi);
+ }
+ }
+
+--- linux-2.6.7/fs/nfs/dir.c.lsec 2004-06-15 23:19:23.000000000 -0600
++++ linux-2.6.7/fs/nfs/dir.c 2005-03-23 14:28:22.701598528 -0700
+@@ -32,6 +32,8 @@
+ #include <linux/smp_lock.h>
+ #include <linux/namei.h>
+
++#include "delegation.h"
++
+ #define NFS_PARANOIA 1
+ /* #define NFS_DEBUG_VERBOSE 1 */
+
+@@ -88,6 +90,9 @@ struct inode_operations nfs4_dir_inode_o
+ .permission = nfs_permission,
+ .getattr = nfs_getattr,
+ .setattr = nfs_setattr,
++ .getxattr = nfs_getxattr,
++ .setxattr = nfs_setxattr,
++ .listxattr = nfs_listxattr,
+ };
+
+ #endif /* CONFIG_NFS_V4 */
+@@ -850,22 +855,22 @@ static int nfs_open_revalidate(struct de
+ unsigned long verifier;
+ int openflags, ret = 0;
+
+- /* NFS only supports OPEN for regular files */
+- if (inode && !S_ISREG(inode->i_mode))
+- goto no_open;
+ parent = dget_parent(dentry);
+ dir = parent->d_inode;
+ if (!is_atomic_open(dir, nd))
+ goto no_open;
++ /* We can't create new files in nfs_open_revalidate(), so we
++ * optimize away revalidation of negative dentries.
++ */
++ if (inode == NULL)
++ goto out;
++ /* NFS only supports OPEN on regular files */
++ if (!S_ISREG(inode->i_mode))
++ goto no_open;
+ openflags = nd->intent.open.flags;
+- if (openflags & O_CREAT) {
+- /* If this is a negative dentry, just drop it */
+- if (!inode)
+- goto out;
+- /* If this is exclusive open, just revalidate */
+- if (openflags & O_EXCL)
+- goto no_open;
+- }
++ /* We cannot do exclusive creation on a positive dentry */
++ if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
++ goto no_open;
+ /* We can't create new files, or truncate existing ones here */
+ openflags &= ~(O_CREAT|O_TRUNC);
+
+@@ -887,6 +892,8 @@ out:
+ return ret;
+ no_open:
+ dput(parent);
++ if (inode != NULL && nfs_have_delegation(inode, FMODE_READ))
++ return 1;
+ return nfs_lookup_revalidate(dentry, nd);
+ }
+ #endif /* CONFIG_NFSV4 */
+@@ -1299,19 +1306,6 @@ nfs_symlink(struct inode *dir, struct de
+ dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s)\n", dir->i_sb->s_id,
+ dir->i_ino, dentry->d_name.name, symname);
+
+- error = -ENAMETOOLONG;
+- switch (NFS_PROTO(dir)->version) {
+- case 2:
+- if (strlen(symname) > NFS2_MAXPATHLEN)
+- goto out;
+- break;
+- case 3:
+- if (strlen(symname) > NFS3_MAXPATHLEN)
+- goto out;
+- default:
+- break;
+- }
+-
+ #ifdef NFS_PARANOIA
+ if (dentry->d_inode)
+ printk("nfs_proc_symlink: %s/%s not negative!\n",
+@@ -1341,8 +1335,6 @@ dentry->d_parent->d_name.name, dentry->d
+ d_drop(dentry);
+ }
+ unlock_kernel();
+-
+-out:
+ return error;
+ }
+
+@@ -1498,10 +1490,56 @@ out:
+ return error;
+ }
+
+-int
+-nfs_permission(struct inode *inode, int mask, struct nameidata *nd)
++int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
++{
++ struct nfs_access_entry *cache = &NFS_I(inode)->cache_access;
++
++ if (cache->cred != cred
++ || time_after(jiffies, cache->jiffies + NFS_ATTRTIMEO(inode))
++ || (NFS_FLAGS(inode) & NFS_INO_INVALID_ATTR))
++ return -ENOENT;
++ memcpy(res, cache, sizeof(*res));
++ return 0;
++}
++
++void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
++{
++ struct nfs_access_entry *cache = &NFS_I(inode)->cache_access;
++
++ if (cache->cred != set->cred) {
++ if (cache->cred)
++ put_rpccred(cache->cred);
++ cache->cred = get_rpccred(set->cred);
++ }
++ cache->jiffies = set->jiffies;
++ cache->mask = set->mask;
++}
++
++static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
++{
++ struct nfs_access_entry cache;
++ int status;
++
++ status = nfs_access_get_cached(inode, cred, &cache);
++ if (status == 0)
++ goto out;
++
++ /* Be clever: ask server to check for all possible rights */
++ cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ;
++ cache.cred = cred;
++ cache.jiffies = jiffies;
++ status = NFS_PROTO(inode)->access(inode, &cache);
++ if (status != 0)
++ return status;
++ nfs_access_add_cache(inode, &cache);
++out:
++ if ((cache.mask & mask) == mask)
++ return 0;
++ return -EACCES;
++}
++
++int nfs_permission(struct inode *inode, int mask, struct nameidata *nd)
+ {
+- struct nfs_access_cache *cache = &NFS_I(inode)->cache_access;
+ struct rpc_cred *cred;
+ int mode = inode->i_mode;
+ int res;
+@@ -1542,24 +1580,7 @@ nfs_permission(struct inode *inode, int
+ goto out_notsup;
+
+ cred = rpcauth_lookupcred(NFS_CLIENT(inode)->cl_auth, 0);
+- if (cache->cred == cred
+- && time_before(jiffies, cache->jiffies + NFS_ATTRTIMEO(inode))
+- && !(NFS_FLAGS(inode) & NFS_INO_INVALID_ATTR)) {
+- if (!(res = cache->err)) {
+- /* Is the mask a subset of an accepted mask? */
+- if ((cache->mask & mask) == mask)
+- goto out;
+- } else {
+- /* ...or is it a superset of a rejected mask? */
+- if ((cache->mask & mask) == cache->mask)
+- goto out;
+- }
+- }
+-
+- res = NFS_PROTO(inode)->access(inode, cred, mask);
+- if (!res || res == -EACCES)
+- goto add_cache;
+-out:
++ res = nfs_do_access(inode, cred, mask);
+ put_rpccred(cred);
+ unlock_kernel();
+ return res;
+@@ -1568,15 +1589,6 @@ out_notsup:
+ res = vfs_permission(inode, mask);
+ unlock_kernel();
+ return res;
+-add_cache:
+- cache->jiffies = jiffies;
+- if (cache->cred)
+- put_rpccred(cache->cred);
+- cache->cred = cred;
+- cache->mask = mask;
+- cache->err = res;
+- unlock_kernel();
+- return res;
+ }
+
+ /*
+--- linux-2.6.7/fs/nfs/unlink.c.lsec 2004-06-15 23:20:04.000000000 -0600
++++ linux-2.6.7/fs/nfs/unlink.c 2005-03-23 14:28:23.170527240 -0700
+@@ -215,7 +215,6 @@ nfs_complete_unlink(struct dentry *dentr
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
+ spin_unlock(&dentry->d_lock);
+- if (data->task.tk_rpcwait == &nfs_delete_queue)
+- rpc_wake_up_task(&data->task);
++ rpc_wake_up_task(&data->task);
+ nfs_put_unlinkdata(data);
+ }
+--- linux-2.6.7/fs/nfs/callback_xdr.c.lsec 2005-03-23 14:28:22.545622240 -0700
++++ linux-2.6.7/fs/nfs/callback_xdr.c 2005-03-23 14:28:22.544622392 -0700
+@@ -0,0 +1,481 @@
++/*
++ * linux/fs/nfs/callback_xdr.c
++ *
++ * Copyright (C) 2004 Trond Myklebust
++ *
++ * NFSv4 callback encode/decode procedures
++ */
++#include <linux/config.h>
++#include <linux/kernel.h>
++#include <linux/sunrpc/svc.h>
++#include <linux/nfs4.h>
++#include <linux/nfs_fs.h>
++#include "callback.h"
++
++#define CB_OP_TAGLEN_MAXSZ (512)
++#define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ)
++#define CB_OP_GETATTR_BITMAP_MAXSZ (4)
++#define CB_OP_GETATTR_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
++ CB_OP_GETATTR_BITMAP_MAXSZ + \
++ 2 + 2 + 3 + 3)
++#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
++
++#define NFSDBG_FACILITY NFSDBG_CALLBACK
++
++typedef unsigned (*callback_process_op_t)(void *, void *);
++typedef unsigned (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *);
++typedef unsigned (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *);
++
++
++struct callback_op {
++ callback_process_op_t process_op;
++ callback_decode_arg_t decode_args;
++ callback_encode_res_t encode_res;
++ long res_maxsize;
++};
++
++static struct callback_op callback_ops[];
++
++static int nfs4_callback_null(struct svc_rqst *rqstp, void *argp, void *resp)
++{
++ return htonl(NFS4_OK);
++}
++
++static int nfs4_decode_void(struct svc_rqst *rqstp, uint32_t *p, void *dummy)
++{
++ return xdr_argsize_check(rqstp, p);
++}
++
++static int nfs4_encode_void(struct svc_rqst *rqstp, uint32_t *p, void *dummy)
++{
++ return xdr_ressize_check(rqstp, p);
++}
++
++static uint32_t *read_buf(struct xdr_stream *xdr, int nbytes)
++{
++ uint32_t *p;
++
++ p = xdr_inline_decode(xdr, nbytes);
++ if (unlikely(p == NULL))
++ printk(KERN_WARNING "NFSv4 callback reply buffer overflowed!\n");
++ return p;
++}
++
++static unsigned decode_string(struct xdr_stream *xdr, unsigned int *len, const char **str)
++{
++ uint32_t *p;
++
++ p = read_buf(xdr, 4);
++ if (unlikely(p == NULL))
++ return htonl(NFS4ERR_RESOURCE);
++ *len = ntohl(*p);
++
++ if (*len != 0) {
++ p = read_buf(xdr, *len);
++ if (unlikely(p == NULL))
++ return htonl(NFS4ERR_RESOURCE);
++ *str = (const char *)p;
++ } else
++ *str = NULL;
++
++ return 0;
++}
++
++static unsigned decode_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
++{
++ uint32_t *p;
++
++ p = read_buf(xdr, 4);
++ if (unlikely(p == NULL))
++ return htonl(NFS4ERR_RESOURCE);
++ fh->size = ntohl(*p);
++ if (fh->size > NFS4_FHSIZE)
++ return htonl(NFS4ERR_BADHANDLE);
++ p = read_buf(xdr, fh->size);
++ if (unlikely(p == NULL))
++ return htonl(NFS4ERR_RESOURCE);
++ memcpy(&fh->data[0], p, fh->size);
++ memset(&fh->data[fh->size], 0, sizeof(fh->data) - fh->size);
++ return 0;
++}
++
++static unsigned decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
++{
++ uint32_t *p;
++ unsigned int attrlen;
++
++ p = read_buf(xdr, 4);
++ if (unlikely(p == NULL))
++ return htonl(NFS4ERR_RESOURCE);
++ attrlen = ntohl(*p);
++ p = read_buf(xdr, attrlen << 2);
++ if (unlikely(p == NULL))
++ return htonl(NFS4ERR_RESOURCE);
++ if (likely(attrlen > 0))
++ bitmap[0] = ntohl(*p++);
++ if (attrlen > 1)
++ bitmap[1] = ntohl(*p);
++ return 0;
++}
++
++static unsigned decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
++{
++ uint32_t *p;
++
++ p = read_buf(xdr, 16);
++ if (unlikely(p == NULL))
++ return htonl(NFS4ERR_RESOURCE);
++ memcpy(stateid->data, p, 16);
++ return 0;
++}
++
++static unsigned decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr)
++{
++ uint32_t *p;
++ unsigned int minor_version;
++ unsigned status;
++
++ status = decode_string(xdr, &hdr->taglen, &hdr->tag);
++ if (unlikely(status != 0))
++ return status;
++ /* We do not like overly long tags! */
++ if (hdr->taglen > CB_OP_TAGLEN_MAXSZ-12 || hdr->taglen < 0) {
++ printk("NFSv4 CALLBACK %s: client sent tag of length %u\n",
++ __FUNCTION__, hdr->taglen);
++ return htonl(NFS4ERR_RESOURCE);
++ }
++ p = read_buf(xdr, 12);
++ if (unlikely(p == NULL))
++ return htonl(NFS4ERR_RESOURCE);
++ minor_version = ntohl(*p++);
++ /* Check minor version is zero. */
++ if (minor_version != 0) {
++ printk(KERN_WARNING "%s: NFSv4 server callback with illegal minor version %u!\n",
++ __FUNCTION__, minor_version);
++ return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
++ }
++ hdr->callback_ident = ntohl(*p++);
++ hdr->nops = ntohl(*p);
++ return 0;
++}
++
++static unsigned decode_op_hdr(struct xdr_stream *xdr, unsigned int *op)
++{
++ uint32_t *p;
++ p = read_buf(xdr, 4);
++ if (unlikely(p == NULL))
++ return htonl(NFS4ERR_RESOURCE);
++ *op = ntohl(*p);
++ return 0;
++}
++
++static unsigned decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_getattrargs *args)
++{
++ unsigned status;
++
++ status = decode_fh(xdr, &args->fh);
++ if (unlikely(status != 0))
++ goto out;
++ args->addr = &rqstp->rq_addr;
++ status = decode_bitmap(xdr, args->bitmap);
++out:
++ dprintk("%s: exit with status = %d\n", __FUNCTION__, status);
++ return status;
++}
++
++static unsigned decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_recallargs *args)
++{
++ uint32_t *p;
++ unsigned status;
++
++ args->addr = &rqstp->rq_addr;
++ status = decode_stateid(xdr, &args->stateid);
++ if (unlikely(status != 0))
++ goto out;
++ p = read_buf(xdr, 4);
++ if (unlikely(p == NULL)) {
++ status = htonl(NFS4ERR_RESOURCE);
++ goto out;
++ }
++ args->truncate = ntohl(*p);
++ status = decode_fh(xdr, &args->fh);
++out:
++ dprintk("%s: exit with status = %d\n", __FUNCTION__, status);
++ return status;
++}
++
++static unsigned encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
++{
++ uint32_t *p;
++
++ p = xdr_reserve_space(xdr, 4 + len);
++ if (unlikely(p == NULL))
++ return htonl(NFS4ERR_RESOURCE);
++ xdr_encode_opaque(p, str, len);
++ return 0;
++}
++
++#define CB_SUPPORTED_ATTR0 (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE)
++#define CB_SUPPORTED_ATTR1 (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY)
++static unsigned encode_attr_bitmap(struct xdr_stream *xdr, const uint32_t *bitmap, uint32_t **savep)
++{
++ uint32_t bm[2];
++ uint32_t *p;
++
++ bm[0] = htonl(bitmap[0] & CB_SUPPORTED_ATTR0);
++ bm[1] = htonl(bitmap[1] & CB_SUPPORTED_ATTR1);
++ if (bm[1] != 0) {
++ p = xdr_reserve_space(xdr, 16);
++ if (unlikely(p == NULL))
++ return htonl(NFS4ERR_RESOURCE);
++ *p++ = htonl(2);
++ *p++ = bm[0];
++ *p++ = bm[1];
++ } else if (bm[0] != 0) {
++ p = xdr_reserve_space(xdr, 12);
++ if (unlikely(p == NULL))
++ return htonl(NFS4ERR_RESOURCE);
++ *p++ = htonl(1);
++ *p++ = bm[0];
++ } else {
++ p = xdr_reserve_space(xdr, 8);
++ if (unlikely(p == NULL))
++ return htonl(NFS4ERR_RESOURCE);
++ *p++ = htonl(0);
++ }
++ *savep = p;
++ return 0;
++}
++
++static unsigned encode_attr_change(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t change)
++{
++ uint32_t *p;
++
++ if (!(bitmap[0] & FATTR4_WORD0_CHANGE))
++ return 0;
++ p = xdr_reserve_space(xdr, 8);
++ if (unlikely(p == 0))
++ return htonl(NFS4ERR_RESOURCE);
++ p = xdr_encode_hyper(p, change);
++ return 0;
++}
++
++static unsigned encode_attr_size(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t size)
++{
++ uint32_t *p;
++
++ if (!(bitmap[0] & FATTR4_WORD0_SIZE))
++ return 0;
++ p = xdr_reserve_space(xdr, 8);
++ if (unlikely(p == 0))
++ return htonl(NFS4ERR_RESOURCE);
++ p = xdr_encode_hyper(p, size);
++ return 0;
++}
++
++static unsigned encode_attr_time(struct xdr_stream *xdr, const struct timespec *time)
++{
++ uint32_t *p;
++
++ p = xdr_reserve_space(xdr, 12);
++ if (unlikely(p == 0))
++ return htonl(NFS4ERR_RESOURCE);
++ p = xdr_encode_hyper(p, time->tv_sec);
++ *p = htonl(time->tv_nsec);
++ return 0;
++}
++
++static unsigned encode_attr_ctime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time)
++{
++ if (!(bitmap[1] & FATTR4_WORD1_TIME_METADATA))
++ return 0;
++ return encode_attr_time(xdr,time);
++}
++
++static unsigned encode_attr_mtime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time)
++{
++ if (!(bitmap[1] & FATTR4_WORD1_TIME_MODIFY))
++ return 0;
++ return encode_attr_time(xdr,time);
++}
++
++static unsigned encode_compound_hdr_res(struct xdr_stream *xdr, struct cb_compound_hdr_res *hdr)
++{
++ unsigned status;
++
++ hdr->status = xdr_reserve_space(xdr, 4);
++ if (unlikely(hdr->status == NULL))
++ return htonl(NFS4ERR_RESOURCE);
++ status = encode_string(xdr, hdr->taglen, hdr->tag);
++ if (unlikely(status != 0))
++ return status;
++ hdr->nops = xdr_reserve_space(xdr, 4);
++ if (unlikely(hdr->nops == NULL))
++ return htonl(NFS4ERR_RESOURCE);
++ return 0;
++}
++
++static unsigned encode_op_hdr(struct xdr_stream *xdr, uint32_t op, uint32_t res)
++{
++ uint32_t *p;
++
++ p = xdr_reserve_space(xdr, 8);
++ if (unlikely(p == NULL))
++ return htonl(NFS4ERR_RESOURCE);
++ *p++ = htonl(op);
++ *p = htonl(res);
++ return 0;
++}
++
++static unsigned encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr, const struct cb_getattrres *res)
++{
++ uint32_t *savep;
++ unsigned status = res->status;
++
++ if (unlikely(status != 0))
++ goto out;
++ status = encode_attr_bitmap(xdr, res->bitmap, &savep);
++ if (unlikely(status != 0))
++ goto out;
++ status = encode_attr_change(xdr, res->bitmap, res->change_attr);
++ if (unlikely(status != 0))
++ goto out;
++ status = encode_attr_size(xdr, res->bitmap, res->size);
++ if (unlikely(status != 0))
++ goto out;
++ status = encode_attr_ctime(xdr, res->bitmap, &res->ctime);
++ if (unlikely(status != 0))
++ goto out;
++ status = encode_attr_mtime(xdr, res->bitmap, &res->mtime);
++ *savep = htonl((unsigned int)((char *)xdr->p - (char *)(savep+1)));
++out:
++ dprintk("%s: exit with status = %d\n", __FUNCTION__, status);
++ return status;
++}
++
++static unsigned process_op(struct svc_rqst *rqstp,
++ struct xdr_stream *xdr_in, void *argp,
++ struct xdr_stream *xdr_out, void *resp)
++{
++ struct callback_op *op;
++ unsigned int op_nr;
++ unsigned int status = 0;
++ long maxlen;
++ unsigned res;
++
++ dprintk("%s: start\n", __FUNCTION__);
++ status = decode_op_hdr(xdr_in, &op_nr);
++ if (unlikely(status != 0)) {
++ op_nr = OP_CB_ILLEGAL;
++ op = &callback_ops[0];
++ } else if (unlikely(op_nr != OP_CB_GETATTR && op_nr != OP_CB_RECALL)) {
++ op_nr = OP_CB_ILLEGAL;
++ op = &callback_ops[0];
++ status = htonl(NFS4ERR_OP_ILLEGAL);
++ } else
++ op = &callback_ops[op_nr];
++
++ maxlen = xdr_out->end - xdr_out->p;
++ if (maxlen > 0 && maxlen < PAGE_SIZE) {
++ if (likely(status == 0 && op->decode_args != NULL))
++ status = op->decode_args(rqstp, xdr_in, argp);
++ if (likely(status == 0 && op->process_op != NULL))
++ status = op->process_op(argp, resp);
++ } else
++ status = htonl(NFS4ERR_RESOURCE);
++
++ res = encode_op_hdr(xdr_out, op_nr, status);
++ if (status == 0)
++ status = res;
++ if (op->encode_res != NULL && status == 0)
++ status = op->encode_res(rqstp, xdr_out, resp);
++ dprintk("%s: done, status = %d\n", __FUNCTION__, status);
++ return status;
++}
++
++/*
++ * Decode, process and encode a COMPOUND
++ */
++static int nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *resp)
++{
++ struct cb_compound_hdr_arg hdr_arg;
++ struct cb_compound_hdr_res hdr_res;
++ struct xdr_stream xdr_in, xdr_out;
++ uint32_t *p;
++ unsigned int status;
++ unsigned int nops = 1;
++
++ dprintk("%s: start\n", __FUNCTION__);
++
++ xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base);
++
++ p = (uint32_t*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len);
++ rqstp->rq_res.head[0].iov_len = PAGE_SIZE;
++ xdr_init_encode(&xdr_out, &rqstp->rq_res, p);
++
++ decode_compound_hdr_arg(&xdr_in, &hdr_arg);
++ hdr_res.taglen = hdr_arg.taglen;
++ hdr_res.tag = hdr_arg.tag;
++ encode_compound_hdr_res(&xdr_out, &hdr_res);
++
++ for (;;) {
++ status = process_op(rqstp, &xdr_in, argp, &xdr_out, resp);
++ if (status != 0)
++ break;
++ if (nops == hdr_arg.nops)
++ break;
++ nops++;
++ }
++ *hdr_res.status = status;
++ *hdr_res.nops = htonl(nops);
++ dprintk("%s: done, status = %u\n", __FUNCTION__, status);
++ return rpc_success;
++}
++
++/*
++ * Define NFS4 callback COMPOUND ops.
++ */
++static struct callback_op callback_ops[] = {
++ [0] = {
++ .res_maxsize = CB_OP_HDR_RES_MAXSZ,
++ },
++ [OP_CB_GETATTR] = {
++ .process_op = (callback_process_op_t)nfs4_callback_getattr,
++ .decode_args = (callback_decode_arg_t)decode_getattr_args,
++ .encode_res = (callback_encode_res_t)encode_getattr_res,
++ .res_maxsize = CB_OP_GETATTR_RES_MAXSZ,
++ },
++ [OP_CB_RECALL] = {
++ .process_op = (callback_process_op_t)nfs4_callback_recall,
++ .decode_args = (callback_decode_arg_t)decode_recall_args,
++ .res_maxsize = CB_OP_RECALL_RES_MAXSZ,
++ }
++};
++
++/*
++ * Define NFS4 callback procedures
++ */
++static struct svc_procedure nfs4_callback_procedures1[] = {
++ [CB_NULL] = {
++ .pc_func = nfs4_callback_null,
++ .pc_decode = (kxdrproc_t)nfs4_decode_void,
++ .pc_encode = (kxdrproc_t)nfs4_encode_void,
++ .pc_xdrressize = 1,
++ },
++ [CB_COMPOUND] = {
++ .pc_func = nfs4_callback_compound,
++ .pc_encode = (kxdrproc_t)nfs4_encode_void,
++ .pc_argsize = 256,
++ .pc_ressize = 256,
++ .pc_xdrressize = NFS4_CALLBACK_BUFSIZE,
++ }
++};
++
++struct svc_version nfs4_callback_version1 = {
++ .vs_vers = 1,
++ .vs_nproc = ARRAY_SIZE(nfs4_callback_procedures1),
++ .vs_proc = nfs4_callback_procedures1,
++ .vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
++ .vs_dispatch = NULL,
++};
++
+--- linux-2.6.7/fs/nfs/callback.c.lsec 2005-03-23 14:28:22.484631512 -0700
++++ linux-2.6.7/fs/nfs/callback.c 2005-03-23 14:28:22.483631664 -0700
+@@ -0,0 +1,325 @@
++/*
++ * linux/fs/nfs/callback.c
++ *
++ * Copyright (C) 2004 Trond Myklebust
++ *
++ * NFSv4 callback handling
++ */
++
++#include <linux/config.h>
++#include <linux/completion.h>
++#include <linux/ip.h>
++#include <linux/module.h>
++#include <linux/smp_lock.h>
++#include <linux/sunrpc/svc.h>
++#include <linux/sunrpc/svcsock.h>
++#include <linux/nfs_fs.h>
++#include "callback.h"
++
++#define NFSDBG_FACILITY NFSDBG_CALLBACK
++
++struct nfs_callback_data {
++ unsigned int users;
++ struct svc_serv *serv;
++ pid_t pid;
++ struct completion started;
++ struct completion stopped;
++};
++
++static struct nfs_callback_data nfs_callback_info;
++static DECLARE_MUTEX(nfs_callback_sema);
++static struct svc_program nfs4_callback_program;
++
++unsigned short nfs_callback_tcpport;
++
++/*
++ * This is the callback kernel thread.
++ */
++static void nfs_callback_svc(struct svc_rqst *rqstp)
++{
++ struct svc_serv *serv = rqstp->rq_server;
++ int err;
++
++ __module_get(THIS_MODULE);
++ lock_kernel();
++
++ nfs_callback_info.pid = current->pid;
++ daemonize("nfsv4-svc");
++ /* Process request with signals blocked, but allow SIGKILL. */
++ allow_signal(SIGKILL);
++
++ complete(&nfs_callback_info.started);
++
++ while (nfs_callback_info.users != 0 || !signalled()) {
++ /*
++ * Listen for a request on the socket
++ */
++ err = svc_recv(serv, rqstp, MAX_SCHEDULE_TIMEOUT);
++ if (err == -EAGAIN || err == -EINTR)
++ continue;
++ if (err < 0) {
++ printk(KERN_WARNING
++ "%s: terminating on error %d\n",
++ __FUNCTION__, -err);
++ break;
++ }
++ dprintk("%s: request from %u.%u.%u.%u\n", __FUNCTION__,
++ NIPQUAD(rqstp->rq_addr.sin_addr.s_addr));
++ svc_process(serv, rqstp);
++ }
++
++ nfs_callback_info.pid = 0;
++ complete(&nfs_callback_info.stopped);
++ unlock_kernel();
++ module_put_and_exit(0);
++}
++
++/*
++ * Bring up the server process if it is not already up.
++ */
++int nfs_callback_up(void)
++{
++ struct svc_serv *serv;
++ struct svc_sock *svsk;
++ int ret = 0;
++
++ lock_kernel();
++ down(&nfs_callback_sema);
++ if (nfs_callback_info.users++ || nfs_callback_info.pid != 0)
++ goto out;
++ init_completion(&nfs_callback_info.started);
++ init_completion(&nfs_callback_info.stopped);
++ serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE);
++ ret = -ENOMEM;
++ if (!serv)
++ goto out_err;
++ /* FIXME: We don't want to register this socket with the portmapper */
++ ret = svc_makesock(serv, IPPROTO_TCP, 0);
++ if (ret < 0)
++ goto out_destroy;
++ if (!list_empty(&serv->sv_permsocks)) {
++ svsk = list_entry(serv->sv_permsocks.next,
++ struct svc_sock, sk_list);
++ nfs_callback_tcpport = ntohs(inet_sk(svsk->sk_sk)->sport);
++ dprintk ("Callback port = 0x%x\n", nfs_callback_tcpport);
++ } else
++ BUG();
++ ret = svc_create_thread(nfs_callback_svc, serv);
++ if (ret < 0)
++ goto out_destroy;
++ nfs_callback_info.serv = serv;
++ wait_for_completion(&nfs_callback_info.started);
++out:
++ up(&nfs_callback_sema);
++ unlock_kernel();
++ return ret;
++out_destroy:
++ svc_destroy(serv);
++out_err:
++ nfs_callback_info.users--;
++ goto out;
++}
++
++/*
++ * Kill the server process if it is not already down.
++ */
++int nfs_callback_down(void)
++{
++ int ret = 0;
++
++ lock_kernel();
++ down(&nfs_callback_sema);
++ if (--nfs_callback_info.users || nfs_callback_info.pid == 0)
++ goto out;
++ kill_proc(nfs_callback_info.pid, SIGKILL, 1);
++ wait_for_completion(&nfs_callback_info.stopped);
++out:
++ up(&nfs_callback_sema);
++ unlock_kernel();
++ return ret;
++}
++
++/*
++ * AUTH_NULL authentication
++ */
++static int nfs_callback_null_accept(struct svc_rqst *rqstp, u32 *authp)
++{
++ struct iovec *argv = &rqstp->rq_arg.head[0];
++ struct iovec *resv = &rqstp->rq_res.head[0];
++
++ if (argv->iov_len < 3*4)
++ return SVC_GARBAGE;
++
++ if (svc_getu32(argv) != 0) {
++ dprintk("svc: bad null cred\n");
++ *authp = rpc_autherr_badcred;
++ return SVC_DENIED;
++ }
++ if (svc_getu32(argv) != RPC_AUTH_NULL || svc_getu32(argv) != 0) {
++ dprintk("svc: bad null verf\n");
++ *authp = rpc_autherr_badverf;
++ return SVC_DENIED;
++ }
++
++ /* Signal that mapping to nobody uid/gid is required */
++ rqstp->rq_cred.cr_uid = (uid_t) -1;
++ rqstp->rq_cred.cr_gid = (gid_t) -1;
++ rqstp->rq_cred.cr_group_info = groups_alloc(0);
++ if (rqstp->rq_cred.cr_group_info == NULL)
++ return SVC_DROP; /* kmalloc failure - client must retry */
++
++ /* Put NULL verifier */
++ svc_putu32(resv, RPC_AUTH_NULL);
++ svc_putu32(resv, 0);
++ dprintk("%s: success, returning %d!\n", __FUNCTION__, SVC_OK);
++ return SVC_OK;
++}
++
++static int nfs_callback_null_release(struct svc_rqst *rqstp)
++{
++ if (rqstp->rq_cred.cr_group_info)
++ put_group_info(rqstp->rq_cred.cr_group_info);
++ rqstp->rq_cred.cr_group_info = NULL;
++ return 0; /* don't drop */
++}
++
++static struct auth_ops nfs_callback_auth_null = {
++ .name = "null",
++ .flavour = RPC_AUTH_NULL,
++ .accept = nfs_callback_null_accept,
++ .release = nfs_callback_null_release,
++};
++
++/*
++ * AUTH_SYS authentication
++ */
++static int nfs_callback_unix_accept(struct svc_rqst *rqstp, u32 *authp)
++{
++ struct iovec *argv = &rqstp->rq_arg.head[0];
++ struct iovec *resv = &rqstp->rq_res.head[0];
++ struct svc_cred *cred = &rqstp->rq_cred;
++ u32 slen, i;
++ int len = argv->iov_len;
++
++ dprintk("%s: start\n", __FUNCTION__);
++ cred->cr_group_info = NULL;
++ rqstp->rq_client = NULL;
++ if ((len -= 3*4) < 0)
++ return SVC_GARBAGE;
++
++ /* Get length, time stamp and machine name */
++ svc_getu32(argv);
++ svc_getu32(argv);
++ slen = XDR_QUADLEN(ntohl(svc_getu32(argv)));
++ if (slen > 64 || (len -= (slen + 3)*4) < 0)
++ goto badcred;
++ argv->iov_base = (void*)((u32*)argv->iov_base + slen);
++ argv->iov_len -= slen*4;
++
++ cred->cr_uid = ntohl(svc_getu32(argv));
++ cred->cr_gid = ntohl(svc_getu32(argv));
++ slen = ntohl(svc_getu32(argv));
++ if (slen > 16 || (len -= (slen + 2)*4) < 0)
++ goto badcred;
++ cred->cr_group_info = groups_alloc(slen);
++ if (cred->cr_group_info == NULL)
++ return SVC_DROP;
++ for (i = 0; i < slen; i++)
++ GROUP_AT(cred->cr_group_info, i) = ntohl(svc_getu32(argv));
++
++ if (svc_getu32(argv) != RPC_AUTH_NULL || svc_getu32(argv) != 0) {
++ *authp = rpc_autherr_badverf;
++ return SVC_DENIED;
++ }
++ /* Put NULL verifier */
++ svc_putu32(resv, RPC_AUTH_NULL);
++ svc_putu32(resv, 0);
++ dprintk("%s: success, returning %d!\n", __FUNCTION__, SVC_OK);
++ return SVC_OK;
++badcred:
++ *authp = rpc_autherr_badcred;
++ return SVC_DENIED;
++}
++
++static int nfs_callback_unix_release(struct svc_rqst *rqstp)
++{
++ if (rqstp->rq_cred.cr_group_info)
++ put_group_info(rqstp->rq_cred.cr_group_info);
++ rqstp->rq_cred.cr_group_info = NULL;
++ return 0;
++}
++
++static struct auth_ops nfs_callback_auth_unix = {
++ .name = "unix",
++ .flavour = RPC_AUTH_UNIX,
++ .accept = nfs_callback_unix_accept,
++ .release = nfs_callback_unix_release,
++};
++
++/*
++ * Hook the authentication protocol
++ */
++static int nfs_callback_auth(struct svc_rqst *rqstp, u32 *authp)
++{
++ struct in_addr *addr = &rqstp->rq_addr.sin_addr;
++ struct nfs4_client *clp;
++ struct iovec *argv = &rqstp->rq_arg.head[0];
++ int flavour;
++ int retval;
++
++ /* Don't talk to strangers */
++ clp = nfs4_find_client(addr);
++ if (clp == NULL)
++ return SVC_DROP;
++ dprintk("%s: %u.%u.%u.%u NFSv4 callback!\n", __FUNCTION__, NIPQUAD(addr));
++ nfs4_put_client(clp);
++ flavour = ntohl(svc_getu32(argv));
++ switch(flavour) {
++ case RPC_AUTH_NULL:
++ if (rqstp->rq_proc != CB_NULL) {
++ *authp = rpc_autherr_tooweak;
++ retval = SVC_DENIED;
++ break;
++ }
++ rqstp->rq_authop = &nfs_callback_auth_null;
++ retval = nfs_callback_null_accept(rqstp, authp);
++ break;
++ case RPC_AUTH_UNIX:
++ /* Eat the authentication flavour */
++ rqstp->rq_authop = &nfs_callback_auth_unix;
++ retval = nfs_callback_unix_accept(rqstp, authp);
++ break;
++ default:
++ /* FIXME: need to add RPCSEC_GSS upcalls */
++#if 0
++ svc_ungetu32(argv);
++ retval = svc_authenticate(rqstp, authp);
++#else
++ *authp = rpc_autherr_rejectedcred;
++ retval = SVC_DENIED;
++#endif
++ }
++ dprintk("%s: flavour %d returning error %d\n", __FUNCTION__, flavour, retval);
++ return retval;
++}
++
++/*
++ * Define NFS4 callback program
++ */
++extern struct svc_version nfs4_callback_version1;
++
++static struct svc_version *nfs4_callback_version[] = {
++ [1] = &nfs4_callback_version1,
++};
++
++static struct svc_stat nfs4_callback_stats;
++
++static struct svc_program nfs4_callback_program = {
++ .pg_prog = NFS4_CALLBACK, /* RPC service number */
++ .pg_nvers = ARRAY_SIZE(nfs4_callback_version), /* Number of entries */
++ .pg_vers = nfs4_callback_version, /* version table */
++ .pg_name = "NFSv4 callback", /* service name */
++ .pg_class = "nfs", /* authentication class */
++ .pg_stats = &nfs4_callback_stats,
++ .pg_authenticate = nfs_callback_auth,
++};
+--- linux-2.6.7/fs/nfs/read.c.lsec 2004-06-15 23:18:37.000000000 -0600
++++ linux-2.6.7/fs/nfs/read.c 2005-03-23 14:28:23.114535752 -0700
+@@ -91,8 +91,8 @@ int nfs_return_empty_page(struct page *p
+ /*
+ * Read a page synchronously.
+ */
+-static int
+-nfs_readpage_sync(struct file *file, struct inode *inode, struct page *page)
++static int nfs_readpage_sync(struct nfs_open_context *ctx, struct inode *inode,
++ struct page *page)
+ {
+ unsigned int rsize = NFS_SERVER(inode)->rsize;
+ unsigned int count = PAGE_CACHE_SIZE;
+@@ -105,10 +105,11 @@ nfs_readpage_sync(struct file *file, str
+
+ memset(rdata, 0, sizeof(*rdata));
+ rdata->flags = (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0);
++ rdata->cred = ctx->cred;
+ rdata->inode = inode;
+ INIT_LIST_HEAD(&rdata->pages);
+ rdata->args.fh = NFS_FH(inode);
+- rdata->args.lockowner = current->files;
++ rdata->args.context = ctx;
+ rdata->args.pages = &page;
+ rdata->args.pgbase = 0UL;
+ rdata->args.count = rsize;
+@@ -134,7 +135,7 @@ nfs_readpage_sync(struct file *file, str
+ rdata->args.count);
+
+ lock_kernel();
+- result = NFS_PROTO(inode)->read(rdata, file);
++ result = NFS_PROTO(inode)->read(rdata);
+ unlock_kernel();
+
+ /*
+@@ -169,8 +170,8 @@ io_error:
+ return result;
+ }
+
+-static int
+-nfs_readpage_async(struct file *file, struct inode *inode, struct page *page)
++static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
++ struct page *page)
+ {
+ LIST_HEAD(one_request);
+ struct nfs_page *new;
+@@ -179,7 +180,7 @@ nfs_readpage_async(struct file *file, st
+ len = nfs_page_length(inode, page);
+ if (len == 0)
+ return nfs_return_empty_page(page);
+- new = nfs_create_request(file, inode, page, 0, len);
++ new = nfs_create_request(ctx, inode, page, 0, len);
+ if (IS_ERR(new)) {
+ unlock_page(page);
+ return PTR_ERR(new);
+@@ -202,8 +203,8 @@ static void nfs_readpage_release(struct
+ nfs_unlock_request(req);
+
+ dprintk("NFS: read done (%s/%Ld %d@%Ld)\n",
+- req->wb_inode->i_sb->s_id,
+- (long long)NFS_FILEID(req->wb_inode),
++ req->wb_context->dentry->d_inode->i_sb->s_id,
++ (long long)NFS_FILEID(req->wb_context->dentry->d_inode),
+ req->wb_bytes,
+ (long long)req_offset(req));
+ }
+@@ -217,16 +218,15 @@ static void nfs_read_rpcsetup(struct nfs
+ struct inode *inode;
+
+ data->req = req;
+- data->inode = inode = req->wb_inode;
+- data->cred = req->wb_cred;
++ data->inode = inode = req->wb_context->dentry->d_inode;
++ data->cred = req->wb_context->cred;
+
+ data->args.fh = NFS_FH(inode);
+ data->args.offset = req_offset(req) + offset;
+ data->args.pgbase = req->wb_pgbase + offset;
+ data->args.pages = data->pagevec;
+ data->args.count = count;
+- data->args.lockowner = req->wb_lockowner;
+- data->args.state = req->wb_state;
++ data->args.context = req->wb_context;
+
+ data->res.fattr = &data->fattr;
+ data->res.count = count;
+@@ -396,7 +396,7 @@ nfs_pagein_list(struct list_head *head,
+ while (!list_empty(head)) {
+ pages += nfs_coalesce_requests(head, &one_request, rpages);
+ req = nfs_list_entry(one_request.next);
+- error = nfs_pagein_one(&one_request, req->wb_inode);
++ error = nfs_pagein_one(&one_request, req->wb_context->dentry->d_inode);
+ if (error < 0)
+ break;
+ }
+@@ -500,9 +500,9 @@ void nfs_readpage_result(struct rpc_task
+ * - The error flag is set for this page. This happens only when a
+ * previous async read operation failed.
+ */
+-int
+-nfs_readpage(struct file *file, struct page *page)
++int nfs_readpage(struct file *file, struct page *page)
+ {
++ struct nfs_open_context *ctx;
+ struct inode *inode = page->mapping->host;
+ int error;
+
+@@ -519,25 +519,33 @@ nfs_readpage(struct file *file, struct p
+ if (error)
+ goto out_error;
+
++ if (file == NULL) {
++ ctx = nfs_find_open_context(inode, FMODE_READ);
++ if (ctx == NULL)
++ return -EBADF;
++ } else
++ ctx = get_nfs_open_context((struct nfs_open_context *)
++ file->private_data);
+ if (!IS_SYNC(inode)) {
+- error = nfs_readpage_async(file, inode, page);
++ error = nfs_readpage_async(ctx, inode, page);
+ goto out;
+ }
+
+- error = nfs_readpage_sync(file, inode, page);
++ error = nfs_readpage_sync(ctx, inode, page);
+ if (error < 0 && IS_SWAPFILE(inode))
+ printk("Aiee.. nfs swap-in of page failed!\n");
+ out:
++ put_nfs_open_context(ctx);
+ return error;
+
+ out_error:
+ unlock_page(page);
+- goto out;
++ return error;
+ }
+
+ struct nfs_readdesc {
+ struct list_head *head;
+- struct file *filp;
++ struct nfs_open_context *ctx;
+ };
+
+ static int
+@@ -552,7 +560,7 @@ readpage_async_filler(void *data, struct
+ len = nfs_page_length(inode, page);
+ if (len == 0)
+ return nfs_return_empty_page(page);
+- new = nfs_create_request(desc->filp, inode, page, 0, len);
++ new = nfs_create_request(desc->ctx, inode, page, 0, len);
+ if (IS_ERR(new)) {
+ SetPageError(page);
+ unlock_page(page);
+@@ -565,13 +573,11 @@ readpage_async_filler(void *data, struct
+ return 0;
+ }
+
+-int
+-nfs_readpages(struct file *filp, struct address_space *mapping,
++int nfs_readpages(struct file *filp, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+ {
+ LIST_HEAD(head);
+ struct nfs_readdesc desc = {
+- .filp = filp,
+ .head = &head,
+ };
+ struct inode *inode = mapping->host;
+@@ -583,12 +589,20 @@ nfs_readpages(struct file *filp, struct
+ (long long)NFS_FILEID(inode),
+ nr_pages);
+
++ if (filp == NULL) {
++ desc.ctx = nfs_find_open_context(inode, FMODE_READ);
++ if (desc.ctx == NULL)
++ return -EBADF;
++ } else
++ desc.ctx = get_nfs_open_context((struct nfs_open_context *)
++ filp->private_data);
+ ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
+ if (!list_empty(&head)) {
+ int err = nfs_pagein_list(&head, server->rpages);
+ if (!ret)
+ ret = err;
+ }
++ put_nfs_open_context(desc.ctx);
+ return ret;
+ }
+
+--- linux-2.6.7/fs/nfs/Makefile.lsec 2004-06-15 23:19:01.000000000 -0600
++++ linux-2.6.7/fs/nfs/Makefile 2005-03-23 14:28:22.819580592 -0700
+@@ -9,6 +9,7 @@ nfs-y := dir.o file.o inode.o nfs2xdr
+ nfs-$(CONFIG_ROOT_NFS) += nfsroot.o mount_clnt.o
+ nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o
+ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
+- idmap.o
++ delegation.o idmap.o \
++ callback.o callback_xdr.o callback_proc.o
+ nfs-$(CONFIG_NFS_DIRECTIO) += direct.o
+ nfs-objs := $(nfs-y)
+--- linux-2.6.7/fs/Kconfig.lsec 2004-06-15 23:19:36.000000000 -0600
++++ linux-2.6.7/fs/Kconfig 2005-03-23 14:28:23.871420688 -0700
+@@ -322,7 +322,7 @@ config FS_POSIX_ACL
+ # Never use this symbol for ifdefs.
+ #
+ bool
+- depends on EXT2_FS_POSIX_ACL || EXT3_FS_POSIX_ACL || JFS_POSIX_ACL || REISERFS_FS_POSIX_ACL
++ depends on EXT2_FS_POSIX_ACL || EXT3_FS_POSIX_ACL || JFS_POSIX_ACL || REISERFS_FS_POSIX_ACL || NFS_V4
+ default y
+
+ config XFS_FS
+@@ -1443,6 +1443,7 @@ config NFSD_V3
+ config NFSD_V4
+ bool "Provide NFSv4 server support (EXPERIMENTAL)"
+ depends on NFSD_V3 && EXPERIMENTAL
++ select NFSD_TCP
+ help
+ If you would like to include the NFSv4 server as well as the NFSv2
+ and NFSv3 servers, say Y here. This feature is experimental, and
+@@ -1450,11 +1451,13 @@ config NFSD_V4
+ If unsure, say N.
+
+ config NFSD_TCP
+- bool "Provide NFS server over TCP support (EXPERIMENTAL)"
+- depends on NFSD && EXPERIMENTAL
++ bool "Provide NFS server over TCP support"
++ depends on NFSD
++ default y
+ help
+- Enable NFS service over TCP connections. This the officially
+- still experimental, but seems to work well.
++ If you want your NFS server to support TCP connections, say Y here.
++ TCP connections usually perform better than the default UDP when
++ the network is lossy or congested. If unsure, say Y.
+
+ config ROOT_NFS
+ bool "Root file system on NFS"
+@@ -1505,6 +1508,22 @@ config RPCSEC_GSS_KRB5
+
+ If unsure, say N.
+
++config RPCSEC_GSS_SPKM3
++ tristate "Secure RPC: SPKM3 mechanism (EXPERIMENTAL)"
++ depends on SUNRPC && EXPERIMENTAL
++ select SUNRPC_GSS
++ select CRYPTO
++ select CRYPTO_MD5
++ select CRYPTO_DES
++ help
++ Provides for secure RPC calls by means of a gss-api
++ mechanism based on the SPKM3 public-key mechanism.
++
++ Note: Requires an auxiliary userspace daemon which may be found on
++ http://www.citi.umich.edu/projects/nfsv4/
++
++ If unsure, say N.
++
+ config SMB_FS
+ tristate "SMB file system support (to mount Windows shares etc.)"
+ depends on INET
+--- linux-2.6.7/include/linux/fs.h.lsec 2005-03-23 14:26:03.300790672 -0700
++++ linux-2.6.7/include/linux/fs.h 2005-03-23 14:28:23.280510520 -0700
+@@ -632,7 +632,7 @@ struct file_lock {
+ struct file_lock *fl_next; /* singly linked list for this inode */
+ struct list_head fl_link; /* doubly linked list of all locks */
+ struct list_head fl_block; /* circular list of blocked processes */
+- fl_owner_t fl_owner;
++ fl_owner_t fl_owner; /* 0 if lock owned by a local process */
+ unsigned int fl_pid;
+ wait_queue_head_t fl_wait;
+ struct file *fl_file;
+--- linux-2.6.7/include/linux/nfs4.h.lsec 2004-06-15 23:19:22.000000000 -0600
++++ linux-2.6.7/include/linux/nfs4.h 2005-03-23 14:28:23.335502160 -0700
+@@ -13,8 +13,12 @@
+ #ifndef _LINUX_NFS4_H
+ #define _LINUX_NFS4_H
+
++#include <linux/types.h>
++#include <linux/list.h>
++
+ #define NFS4_VERIFIER_SIZE 8
+ #define NFS4_FHSIZE 128
++#define NFS4_MAXPATHLEN PATH_MAX
+ #define NFS4_MAXNAMLEN NAME_MAX
+
+ #define NFS4_ACCESS_READ 0x0001
+@@ -52,6 +56,60 @@
+ #define ACL4_SUPPORT_AUDIT_ACL 0x04
+ #define ACL4_SUPPORT_ALARM_ACL 0x08
+
++#define NFS4_ACE_FILE_INHERIT_ACE 0x00000001
++#define NFS4_ACE_DIRECTORY_INHERIT_ACE 0x00000002
++#define NFS4_ACE_NO_PROPAGATE_INHERIT_ACE 0x00000004
++#define NFS4_ACE_INHERIT_ONLY_ACE 0x00000008
++#define NFS4_ACE_SUCCESSFUL_ACCESS_ACE_FLAG 0x00000010
++#define NFS4_ACE_FAILED_ACCESS_ACE_FLAG 0x00000020
++#define NFS4_ACE_IDENTIFIER_GROUP 0x00000040
++#define NFS4_ACE_OWNER 0x00000080
++#define NFS4_ACE_GROUP 0x00000100
++#define NFS4_ACE_EVERYONE 0x00000200
++
++#define NFS4_ACE_READ_DATA 0x00000001
++#define NFS4_ACE_LIST_DIRECTORY 0x00000001
++#define NFS4_ACE_WRITE_DATA 0x00000002
++#define NFS4_ACE_ADD_FILE 0x00000002
++#define NFS4_ACE_APPEND_DATA 0x00000004
++#define NFS4_ACE_ADD_SUBDIRECTORY 0x00000004
++#define NFS4_ACE_READ_NAMED_ATTRS 0x00000008
++#define NFS4_ACE_WRITE_NAMED_ATTRS 0x00000010
++#define NFS4_ACE_EXECUTE 0x00000020
++#define NFS4_ACE_DELETE_CHILD 0x00000040
++#define NFS4_ACE_READ_ATTRIBUTES 0x00000080
++#define NFS4_ACE_WRITE_ATTRIBUTES 0x00000100
++#define NFS4_ACE_DELETE 0x00010000
++#define NFS4_ACE_READ_ACL 0x00020000
++#define NFS4_ACE_WRITE_ACL 0x00040000
++#define NFS4_ACE_WRITE_OWNER 0x00080000
++#define NFS4_ACE_SYNCHRONIZE 0x00100000
++#define NFS4_ACE_GENERIC_READ 0x00120081
++#define NFS4_ACE_GENERIC_WRITE 0x00160106
++#define NFS4_ACE_GENERIC_EXECUTE 0x001200A0
++#define NFS4_ACE_MASK_ALL 0x001F01FF
++
++enum nfs4_acl_whotype {
++ NFS4_ACL_WHO_NAMED = 0,
++ NFS4_ACL_WHO_OWNER,
++ NFS4_ACL_WHO_GROUP,
++ NFS4_ACL_WHO_EVERYONE,
++};
++
++struct nfs4_ace {
++ uint32_t type;
++ uint32_t flag;
++ uint32_t access_mask;
++ int whotype;
++ uid_t who;
++ struct list_head l_ace;
++};
++
++struct nfs4_acl {
++ uint32_t naces;
++ struct list_head ace_head;
++};
++
+ typedef struct { char data[NFS4_VERIFIER_SIZE]; } nfs4_verifier;
+ typedef struct { char data[16]; } nfs4_stateid;
+
+@@ -297,7 +355,7 @@ enum {
+ NFSPROC4_CLNT_COMMIT,
+ NFSPROC4_CLNT_OPEN,
+ NFSPROC4_CLNT_OPEN_CONFIRM,
+- NFSPROC4_CLNT_OPEN_RECLAIM,
++ NFSPROC4_CLNT_OPEN_NOATTR,
+ NFSPROC4_CLNT_OPEN_DOWNGRADE,
+ NFSPROC4_CLNT_CLOSE,
+ NFSPROC4_CLNT_SETATTR,
+@@ -315,12 +373,16 @@ enum {
+ NFSPROC4_CLNT_REMOVE,
+ NFSPROC4_CLNT_RENAME,
+ NFSPROC4_CLNT_LINK,
++ NFSPROC4_CLNT_SYMLINK,
+ NFSPROC4_CLNT_CREATE,
+ NFSPROC4_CLNT_PATHCONF,
+ NFSPROC4_CLNT_STATFS,
+ NFSPROC4_CLNT_READLINK,
+ NFSPROC4_CLNT_READDIR,
+ NFSPROC4_CLNT_SERVER_CAPS,
++ NFSPROC4_CLNT_DELEGRETURN,
++ NFSPROC4_CLNT_GETACL,
++ NFSPROC4_CLNT_SETACL,
+ };
+
+ #endif
+--- linux-2.6.7/include/linux/nfs_page.h.lsec 2004-06-15 23:18:57.000000000 -0600
++++ linux-2.6.7/include/linux/nfs_page.h 2005-03-23 14:28:23.392493496 -0700
+@@ -29,14 +29,9 @@
+ struct nfs_page {
+ struct list_head wb_list, /* Defines state of page: */
+ *wb_list_head; /* read/write/commit */
+- struct file *wb_file;
+- fl_owner_t wb_lockowner;
+- struct inode *wb_inode;
+- struct rpc_cred *wb_cred;
+- struct nfs4_state *wb_state;
+ struct page *wb_page; /* page to read in/write out */
++ struct nfs_open_context *wb_context; /* File state context info */
+ atomic_t wb_complete; /* i/os we're waiting for */
+- wait_queue_head_t wb_wait; /* wait queue */
+ unsigned long wb_index; /* Offset >> PAGE_CACHE_SHIFT */
+ unsigned int wb_offset, /* Offset & ~PAGE_CACHE_MASK */
+ wb_pgbase, /* Start of page data */
+@@ -50,9 +45,11 @@ struct nfs_page {
+ #define NFS_NEED_COMMIT(req) (test_bit(PG_NEED_COMMIT,&(req)->wb_flags))
+ #define NFS_NEED_RESCHED(req) (test_bit(PG_NEED_RESCHED,&(req)->wb_flags))
+
+-extern struct nfs_page *nfs_create_request(struct file *, struct inode *,
+- struct page *,
+- unsigned int, unsigned int);
++extern struct nfs_page *nfs_create_request(struct nfs_open_context *ctx,
++ struct inode *inode,
++ struct page *page,
++ unsigned int offset,
++ unsigned int count);
+ extern void nfs_clear_request(struct nfs_page *req);
+ extern void nfs_release_request(struct nfs_page *req);
+
+@@ -64,6 +61,7 @@ extern int nfs_scan_list(struct list_hea
+ extern int nfs_coalesce_requests(struct list_head *, struct list_head *,
+ unsigned int);
+ extern int nfs_wait_on_request(struct nfs_page *);
++extern void nfs_unlock_request(struct nfs_page *req);
+
+ extern spinlock_t nfs_wreq_lock;
+
+@@ -90,19 +88,6 @@ nfs_lock_request(struct nfs_page *req)
+ return 1;
+ }
+
+-static inline void
+-nfs_unlock_request(struct nfs_page *req)
+-{
+- if (!NFS_WBACK_BUSY(req)) {
+- printk(KERN_ERR "NFS: Invalid unlock attempted\n");
+- BUG();
+- }
+- smp_mb__before_clear_bit();
+- clear_bit(PG_BUSY, &req->wb_flags);
+- smp_mb__after_clear_bit();
+- wake_up_all(&req->wb_wait);
+- nfs_release_request(req);
+-}
+
+ /**
+ * nfs_list_remove_request - Remove a request from its wb_list
+--- linux-2.6.7/include/linux/sunrpc/svc.h.lsec 2004-06-15 23:19:35.000000000 -0600
++++ linux-2.6.7/include/linux/sunrpc/svc.h 2005-03-23 14:28:23.541470848 -0700
+@@ -87,6 +87,14 @@ static inline u32 svc_getu32(struct iove
+ iov->iov_len -= sizeof(u32);
+ return val;
+ }
++
++static inline void svc_ungetu32(struct iovec *iov)
++{
++ u32 *vp = (u32 *)iov->iov_base;
++ iov->iov_base = (void *)(vp - 1);
++ iov->iov_len += sizeof(*vp);
++}
++
+ static inline void svc_putu32(struct iovec *iov, u32 val)
+ {
+ u32 *vp = iov->iov_base + iov->iov_len;
+@@ -243,6 +251,8 @@ struct svc_program {
+ char * pg_name; /* service name */
+ char * pg_class; /* class name: services sharing authentication */
+ struct svc_stat * pg_stats; /* rpc statistics */
++ /* Override authentication. NULL means use default */
++ int (*pg_authenticate)(struct svc_rqst *, u32 *);
+ };
+
+ /*
+--- linux-2.6.7/include/linux/sunrpc/gss_spkm3.h.lsec 2005-03-23 14:28:24.186372808 -0700
++++ linux-2.6.7/include/linux/sunrpc/gss_spkm3.h 2005-03-23 14:28:24.185372960 -0700
+@@ -0,0 +1,61 @@
++/*
++ * linux/include/linux/sunrpc/gss_spkm3.h
++ *
++ * Copyright (c) 2000 The Regents of the University of Michigan.
++ * All rights reserved.
++ *
++ * Andy Adamson <andros@umich.edu>
++ */
++
++#include <linux/sunrpc/auth_gss.h>
++#include <linux/sunrpc/gss_err.h>
++#include <linux/sunrpc/gss_asn1.h>
++
++struct spkm3_ctx {
++ struct xdr_netobj ctx_id; /* per message context id */
++ int qop; /* negotiated qop */
++ struct xdr_netobj mech_used;
++ unsigned int ret_flags ;
++ unsigned int req_flags ;
++ struct xdr_netobj share_key;
++ int conf_alg;
++ struct crypto_tfm* derived_conf_key;
++ int intg_alg;
++ struct crypto_tfm* derived_integ_key;
++ int keyestb_alg; /* alg used to get share_key */
++ int owf_alg; /* one way function */
++};
++
++/* from openssl/objects.h */
++/* XXX need SEAL_ALG_NONE */
++#define NID_md5 4
++#define NID_dhKeyAgreement 28
++#define NID_des_cbc 31
++#define NID_sha1 64
++#define NID_cast5_cbc 108
++
++/* SPKM InnerContext Token types */
++
++#define SPKM_ERROR_TOK 3
++#define SPKM_MIC_TOK 4
++#define SPKM_WRAP_TOK 5
++#define SPKM_DEL_TOK 6
++
++u32 spkm3_make_token(struct spkm3_ctx *ctx, int qop_req, struct xdr_buf * text, struct xdr_netobj * token, int toktype);
++
++u32 spkm3_read_token(struct spkm3_ctx *ctx, struct xdr_netobj *read_token, struct xdr_buf *message_buffer, int *qop_state, int toktype);
++
++#define CKSUMTYPE_RSA_MD5 0x0007
++
++s32 make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body,
++ struct xdr_netobj *cksum);
++void asn1_bitstring_len(struct xdr_netobj *in, int *enclen, int *zerobits);
++int decode_asn1_bitstring(struct xdr_netobj *out, char *in, int enclen,
++ int explen);
++void spkm3_mic_header(unsigned char **hdrbuf, unsigned int *hdrlen,
++ unsigned char *ctxhdr, int elen, int zbit);
++void spkm3_make_mic_token(unsigned char **tokp, int toklen,
++ struct xdr_netobj *mic_hdr,
++ struct xdr_netobj *md5cksum, int md5elen, int md5zbit);
++u32 spkm3_verify_mic_token(unsigned char **tokp, int *mic_hdrlen,
++ unsigned char **cksum);
+--- linux-2.6.7/include/linux/sunrpc/sched.h.lsec 2004-06-15 23:19:42.000000000 -0600
++++ linux-2.6.7/include/linux/sunrpc/sched.h 2005-03-23 14:28:23.540471000 -0700
+@@ -11,7 +11,9 @@
+
+ #include <linux/timer.h>
+ #include <linux/sunrpc/types.h>
++#include <linux/spinlock.h>
+ #include <linux/wait.h>
++#include <linux/workqueue.h>
+ #include <linux/sunrpc/xdr.h>
+
+ /*
+@@ -25,11 +27,18 @@ struct rpc_message {
+ struct rpc_cred * rpc_cred; /* Credentials */
+ };
+
++struct rpc_wait_queue;
++struct rpc_wait {
++ struct list_head list; /* wait queue links */
++ struct list_head links; /* Links to related tasks */
++ wait_queue_head_t waitq; /* sync: sleep on this q */
++ struct rpc_wait_queue * rpc_waitq; /* RPC wait queue we're on */
++};
++
+ /*
+ * This is the RPC task struct
+ */
+ struct rpc_task {
+- struct list_head tk_list; /* wait queue links */
+ #ifdef RPC_DEBUG
+ unsigned long tk_magic; /* 0xf00baa */
+ #endif
+@@ -37,7 +46,6 @@ struct rpc_task {
+ struct rpc_clnt * tk_client; /* RPC client */
+ struct rpc_rqst * tk_rqstp; /* RPC request */
+ int tk_status; /* result of last operation */
+- struct rpc_wait_queue * tk_rpcwait; /* RPC wait queue we're on */
+
+ /*
+ * RPC call state
+@@ -70,13 +78,18 @@ struct rpc_task {
+ * you have a pathological interest in kernel oopses.
+ */
+ struct timer_list tk_timer; /* kernel timer */
+- wait_queue_head_t tk_wait; /* sync: sleep on this q */
+ unsigned long tk_timeout; /* timeout for rpc_sleep() */
+ unsigned short tk_flags; /* misc flags */
+ unsigned char tk_active : 1;/* Task has been activated */
+ unsigned char tk_priority : 2;/* Task priority */
+ unsigned long tk_runstate; /* Task run status */
+- struct list_head tk_links; /* links to related tasks */
++ struct workqueue_struct *tk_workqueue; /* Normally rpciod, but could
++ * be any workqueue
++ */
++ union {
++ struct work_struct tk_work; /* Async task work queue */
++ struct rpc_wait tk_wait; /* RPC wait */
++ } u;
+ #ifdef RPC_DEBUG
+ unsigned short tk_pid; /* debugging aid */
+ #endif
+@@ -87,11 +100,11 @@ struct rpc_task {
+ /* support walking a list of tasks on a wait queue */
+ #define task_for_each(task, pos, head) \
+ list_for_each(pos, head) \
+- if ((task=list_entry(pos, struct rpc_task, tk_list)),1)
++ if ((task=list_entry(pos, struct rpc_task, u.tk_wait.list)),1)
+
+ #define task_for_first(task, head) \
+ if (!list_empty(head) && \
+- ((task=list_entry((head)->next, struct rpc_task, tk_list)),1))
++ ((task=list_entry((head)->next, struct rpc_task, u.tk_wait.list)),1))
+
+ /* .. and walking list of all tasks */
+ #define alltask_for_each(task, pos, head) \
+@@ -124,22 +137,24 @@ typedef void (*rpc_action)(struct rpc_
+ #define RPC_DO_CALLBACK(t) ((t)->tk_callback != NULL)
+ #define RPC_IS_SOFT(t) ((t)->tk_flags & RPC_TASK_SOFT)
+
+-#define RPC_TASK_SLEEPING 0
+-#define RPC_TASK_RUNNING 1
+-#define RPC_IS_SLEEPING(t) (test_bit(RPC_TASK_SLEEPING, &(t)->tk_runstate))
+-#define RPC_IS_RUNNING(t) (test_bit(RPC_TASK_RUNNING, &(t)->tk_runstate))
++#define RPC_TASK_RUNNING 0
++#define RPC_TASK_QUEUED 1
+
++#define RPC_IS_RUNNING(t) (test_bit(RPC_TASK_RUNNING, &(t)->tk_runstate))
+ #define rpc_set_running(t) (set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate))
+-#define rpc_clear_running(t) (clear_bit(RPC_TASK_RUNNING, &(t)->tk_runstate))
+-
+-#define rpc_set_sleeping(t) (set_bit(RPC_TASK_SLEEPING, &(t)->tk_runstate))
+-
+-#define rpc_clear_sleeping(t) \
++#define rpc_test_and_set_running(t) \
++ (test_and_set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate))
++#define rpc_clear_running(t) \
+ do { \
+ smp_mb__before_clear_bit(); \
+- clear_bit(RPC_TASK_SLEEPING, &(t)->tk_runstate); \
++ clear_bit(RPC_TASK_RUNNING, &(t)->tk_runstate); \
+ smp_mb__after_clear_bit(); \
+- } while(0)
++ } while (0)
++
++#define RPC_IS_QUEUED(t) (test_bit(RPC_TASK_QUEUED, &(t)->tk_runstate))
++#define rpc_set_queued(t) (set_bit(RPC_TASK_QUEUED, &(t)->tk_runstate))
++#define rpc_test_and_clear_queued(t) \
++ (test_and_clear_bit(RPC_TASK_QUEUED, &(t)->tk_runstate))
+
+ /*
+ * Task priorities.
+@@ -155,6 +170,7 @@ typedef void (*rpc_action)(struct rpc_
+ * RPC synchronization objects
+ */
+ struct rpc_wait_queue {
++ spinlock_t lock;
+ struct list_head tasks[RPC_NR_PRIORITY]; /* task queue for each priority level */
+ unsigned long cookie; /* cookie of last task serviced */
+ unsigned char maxpriority; /* maximum priority (0 if queue is not a priority queue) */
+@@ -175,6 +191,7 @@ struct rpc_wait_queue {
+
+ #ifndef RPC_DEBUG
+ # define RPC_WAITQ_INIT(var,qname) { \
++ .lock = SPIN_LOCK_UNLOCKED, \
+ .tasks = { \
+ [0] = LIST_HEAD_INIT(var.tasks[0]), \
+ [1] = LIST_HEAD_INIT(var.tasks[1]), \
+@@ -183,6 +200,7 @@ struct rpc_wait_queue {
+ }
+ #else
+ # define RPC_WAITQ_INIT(var,qname) { \
++ .lock = SPIN_LOCK_UNLOCKED, \
+ .tasks = { \
+ [0] = LIST_HEAD_INIT(var.tasks[0]), \
+ [1] = LIST_HEAD_INIT(var.tasks[1]), \
+@@ -207,13 +225,10 @@ void rpc_killall_tasks(struct rpc_clnt
+ int rpc_execute(struct rpc_task *);
+ void rpc_run_child(struct rpc_task *parent, struct rpc_task *child,
+ rpc_action action);
+-int rpc_add_wait_queue(struct rpc_wait_queue *, struct rpc_task *);
+-void rpc_remove_wait_queue(struct rpc_task *);
+ void rpc_init_priority_wait_queue(struct rpc_wait_queue *, const char *);
+ void rpc_init_wait_queue(struct rpc_wait_queue *, const char *);
+ void rpc_sleep_on(struct rpc_wait_queue *, struct rpc_task *,
+ rpc_action action, rpc_action timer);
+-void rpc_add_timer(struct rpc_task *, rpc_action);
+ void rpc_wake_up_task(struct rpc_task *);
+ void rpc_wake_up(struct rpc_wait_queue *);
+ struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *);
+--- linux-2.6.7/include/linux/sunrpc/gss_api.h.lsec 2004-06-15 23:20:03.000000000 -0600
++++ linux-2.6.7/include/linux/sunrpc/gss_api.h 2005-03-23 14:28:24.688296504 -0700
+@@ -47,6 +47,18 @@ u32 gss_verify_mic(
+ struct xdr_buf *message,
+ struct xdr_netobj *mic_token,
+ u32 *qstate);
++u32 gss_wrap(
++ struct gss_ctx *ctx_id,
++ u32 qop,
++ int offset,
++ struct xdr_buf *outbuf,
++ struct page **inpages);
++u32 gss_unwrap(
++ struct gss_ctx *ctx_id,
++ u32 *qop,
++ int offset,
++ struct xdr_buf *inbuf,
++ int *out_offset);
+ u32 gss_delete_sec_context(
+ struct gss_ctx **ctx_id);
+
+@@ -93,6 +105,18 @@ struct gss_api_ops {
+ struct xdr_buf *message,
+ struct xdr_netobj *mic_token,
+ u32 *qstate);
++ u32 (*gss_wrap)(
++ struct gss_ctx *ctx_id,
++ u32 qop,
++ int offset,
++ struct xdr_buf *outbuf,
++ struct page **inpages);
++ u32 (*gss_unwrap)(
++ struct gss_ctx *ctx_id,
++ u32 *qop,
++ int offset,
++ struct xdr_buf *buf,
++ int *out_offset);
+ void (*gss_delete_sec_context)(
+ void *internal_ctx_id);
+ };
+--- linux-2.6.7/include/linux/sunrpc/xprt.h.lsec 2004-06-15 23:19:43.000000000 -0600
++++ linux-2.6.7/include/linux/sunrpc/xprt.h 2005-03-23 14:28:24.783282064 -0700
+@@ -95,7 +95,10 @@ struct rpc_rqst {
+ int rq_cong; /* has incremented xprt->cong */
+ int rq_received; /* receive completed */
+ u32 rq_seqno; /* gss seq no. used on req. */
+-
++ int rq_enc_pages_num;
++ struct page **rq_enc_pages; /* scratch pages for use by
++ gss privacy code */
++ void (*rq_release_snd_buf)(struct rpc_rqst *); /* release rq_enc_pages */
+ struct list_head rq_list;
+
+ struct xdr_buf rq_private_buf; /* The receive buffer
+--- linux-2.6.7/include/linux/sunrpc/gss_krb5.h.lsec 2004-06-15 23:19:29.000000000 -0600
++++ linux-2.6.7/include/linux/sunrpc/gss_krb5.h 2005-03-23 14:28:24.840273400 -0700
+@@ -53,6 +53,8 @@ struct krb5_ctx {
+ struct xdr_netobj mech_used;
+ };
+
++extern spinlock_t krb5_seq_lock;
++
+ #define KG_TOK_MIC_MSG 0x0101
+ #define KG_TOK_WRAP_MSG 0x0201
+
+@@ -116,18 +118,25 @@ enum seal_alg {
+
+ s32
+ make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body,
+- struct xdr_netobj *cksum);
++ int body_offset, struct xdr_netobj *cksum);
+
+ u32
+ krb5_make_token(struct krb5_ctx *context_handle, int qop_req,
+ struct xdr_buf *input_message_buffer,
+- struct xdr_netobj *output_message_buffer, int toktype);
++ struct xdr_netobj *output_message_buffer);
+
+ u32
+ krb5_read_token(struct krb5_ctx *context_handle,
+ struct xdr_netobj *input_token_buffer,
+- struct xdr_buf *message_buffer,
+- int *qop_state, int toktype);
++ struct xdr_buf *message_buffer, int *qop_state);
++
++u32
++gss_wrap_kerberos(struct gss_ctx *ctx_id, u32 qop, int offset,
++ struct xdr_buf *outbuf, struct page **pages);
++
++u32
++gss_unwrap_kerberos(struct gss_ctx *ctx_id, u32 *qop, int offset,
++ struct xdr_buf *buf, int *out_offset);
+
+ u32
+ krb5_encrypt(struct crypto_tfm * key,
+@@ -137,6 +146,13 @@ u32
+ krb5_decrypt(struct crypto_tfm * key,
+ void *iv, void *in, void *out, int length);
+
++int
++gss_encrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *outbuf, int offset,
++ struct page **pages);
++
++int
++gss_decrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *inbuf, int offset);
++
+ s32
+ krb5_make_seq_num(struct crypto_tfm * key,
+ int direction,
+--- linux-2.6.7/include/linux/sunrpc/gss_asn1.h.lsec 2004-06-15 23:20:04.000000000 -0600
++++ linux-2.6.7/include/linux/sunrpc/gss_asn1.h 2005-03-23 14:28:23.706445768 -0700
+@@ -69,7 +69,6 @@ u32 g_verify_token_header(
+ struct xdr_netobj *mech,
+ int *body_size,
+ unsigned char **buf_in,
+- int tok_type,
+ int toksize);
+
+ u32 g_get_mech_oid(struct xdr_netobj *mech, struct xdr_netobj * in_buf);
+--- linux-2.6.7/include/linux/sunrpc/cache.h.lsec 2004-06-15 23:19:28.000000000 -0600
++++ linux-2.6.7/include/linux/sunrpc/cache.h 2005-03-23 14:28:24.349348032 -0700
+@@ -128,20 +128,17 @@ struct cache_deferred_req {
+ * just like a template in C++, this macro does cache lookup
+ * for us.
+ * The function is passed some sort of HANDLE from which a cache_detail
+- * structure can be determined (via SETUP, DETAIL), a template
++ * structure can be determined (via DETAIL), a template
+ * cache entry (type RTN*), and a "set" flag. Using the HASHFN and the
+ * TEST, the function will try to find a matching cache entry in the cache.
+ * If "set" == 0 :
+ * If an entry is found, it is returned
+ * If no entry is found, a new non-VALID entry is created.
+- * If "set" == 1 and INPLACE == 0 :
++ * If "set" == 1:
+ * If no entry is found a new one is inserted with data from "template"
+ * If a non-CACHE_VALID entry is found, it is updated from template using UPDATE
+ * If a CACHE_VALID entry is found, a new entry is swapped in with data
+ * from "template"
+- * If set == 1, and INPLACE == 1 :
+- * As above, except that if a CACHE_VALID entry is found, we UPDATE in place
+- * instead of swapping in a new entry.
+ *
+ * If the passed handle has the CACHE_NEGATIVE flag set, then UPDATE is not
+ * run but insteead CACHE_NEGATIVE is set in any new item.
+@@ -153,21 +150,18 @@ struct cache_deferred_req {
+ * MEMBER is the member of the cache which is cache_head, which must be first
+ * FNAME is the name for the function
+ * ARGS are arguments to function and must contain RTN *item, int set. May
+- * also contain something to be usedby SETUP or DETAIL to find cache_detail.
+- * SETUP locates the cache detail and makes it available as...
+- * DETAIL identifies the cache detail, possibly set up by SETUP
++ * also contain something to be used by DETAIL to find cache_detail.
++ * DETAIL identifies the cache detail
+ * HASHFN returns a hash value of the cache entry "item"
+ * TEST tests if "tmp" matches "item"
+ * INIT copies key information from "item" to "new"
+ * UPDATE copies content information from "item" to "tmp"
+- * INPLACE is true if updates can happen inplace rather than allocating a new structure
+ */
+-#define DefineCacheLookup(RTN,MEMBER,FNAME,ARGS,SETUP,DETAIL,HASHFN,TEST,INIT,UPDATE,INPLACE) \
++#define DefineCacheLookup(RTN,MEMBER,FNAME,ARGS,DETAIL,HASHFN,TEST,INIT,UPDATE) \
+ RTN *FNAME ARGS \
+ { \
+ RTN *tmp, *new=NULL; \
+ struct cache_head **hp, **head; \
+- SETUP; \
+ head = &(DETAIL)->hash_table[HASHFN]; \
+ retry: \
+ if (set||new) write_lock(&(DETAIL)->hash_lock); \
+@@ -176,14 +170,14 @@ RTN *FNAME ARGS \
+ tmp = container_of(*hp, RTN, MEMBER); \
+ if (TEST) { /* found a match */ \
+ \
+- if (set && !INPLACE && test_bit(CACHE_VALID, &tmp->MEMBER.flags) && !new) \
++ if (set && test_bit(CACHE_VALID, &tmp->MEMBER.flags) && !new) \
+ break; \
+ \
+ if (new) \
+ {INIT;} \
+ cache_get(&tmp->MEMBER); \
+ if (set) { \
+- if (!INPLACE && test_bit(CACHE_VALID, &tmp->MEMBER.flags))\
++ if (test_bit(CACHE_VALID, &tmp->MEMBER.flags))\
+ { /* need to swap in new */ \
+ RTN *t2; \
+ \
+@@ -205,7 +199,7 @@ RTN *FNAME ARGS \
+ else read_unlock(&(DETAIL)->hash_lock); \
+ if (set) \
+ cache_fresh(DETAIL, &tmp->MEMBER, item->MEMBER.expiry_time); \
+- if (set && !INPLACE && new) cache_fresh(DETAIL, &new->MEMBER, 0); \
++ if (set && new) cache_fresh(DETAIL, &new->MEMBER, 0); \
+ if (new) (DETAIL)->cache_put(&new->MEMBER, DETAIL); \
+ return tmp; \
+ } \
+@@ -233,16 +227,15 @@ RTN *FNAME ARGS \
+ new = kmalloc(sizeof(*new), GFP_KERNEL); \
+ if (new) { \
+ cache_init(&new->MEMBER); \
+- cache_get(&new->MEMBER); \
+ goto retry; \
+ } \
+ return NULL; \
+ }
+
+-#define DefineSimpleCacheLookup(STRUCT,INPLACE) \
+- DefineCacheLookup(struct STRUCT, h, STRUCT##_lookup, (struct STRUCT *item, int set), /*no setup */, \
++#define DefineSimpleCacheLookup(STRUCT) \
++ DefineCacheLookup(struct STRUCT, h, STRUCT##_lookup, (struct STRUCT *item, int set), \
+ & STRUCT##_cache, STRUCT##_hash(item), STRUCT##_match(item, tmp),\
+- STRUCT##_init(new, item), STRUCT##_update(tmp, item),INPLACE)
++ STRUCT##_init(new, item), STRUCT##_update(tmp, item))
+
+ #define cache_for_each(pos, detail, index, member) \
+ for (({read_lock(&(detail)->hash_lock); index = (detail)->hash_size;}) ; \
+--- linux-2.6.7/include/linux/sunrpc/xdr.h.lsec 2004-06-15 23:20:26.000000000 -0600
++++ linux-2.6.7/include/linux/sunrpc/xdr.h 2005-03-23 14:28:24.783282064 -0700
+@@ -192,6 +192,7 @@ extern void xdr_write_pages(struct xdr_s
+ extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, uint32_t *p);
+ extern uint32_t *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes);
+ extern void xdr_read_pages(struct xdr_stream *xdr, unsigned int len);
++extern void truncate_xdr_buf(struct xdr_buf *xdr, int len);
+
+ #endif /* __KERNEL__ */
+
+--- linux-2.6.7/include/linux/nfsd/state.h.lsec 2004-06-15 23:18:56.000000000 -0600
++++ linux-2.6.7/include/linux/nfsd/state.h 2005-03-23 14:28:24.081388768 -0700
+@@ -38,6 +38,7 @@
+ #define _NFSD4_STATE_H
+
+ #include <linux/list.h>
++#include <linux/sunrpc/clnt.h>
+
+ #define NFS4_OPAQUE_LIMIT 1024
+ typedef struct {
+@@ -65,6 +66,22 @@ extern stateid_t onestateid;
+ #define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t)))
+ #define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t)))
+
++/* client delegation callback info */
++struct nfs4_callback {
++ /* SETCLIENTID info */
++ u32 cb_parsed; /* addr parsed */
++ u32 cb_addr;
++ unsigned short cb_port;
++ u32 cb_prog;
++ u32 cb_ident;
++ struct xdr_netobj cb_netid;
++ /* RPC client info */
++ u32 cb_set; /* successful CB_NULL call */
++ struct rpc_program cb_program;
++ struct rpc_stat cb_stat;
++ struct rpc_clnt * cb_client;
++};
++
+ /*
+ * struct nfs4_client - one per client. Clientids live here.
+ * o Each nfs4_client is hashed by clientid.
+@@ -87,6 +104,21 @@ struct nfs4_client {
+ struct svc_cred cl_cred; /* setclientid principal */
+ clientid_t cl_clientid; /* generated by server */
+ nfs4_verifier cl_confirm; /* generated by server */
++ struct nfs4_callback cl_callback; /* callback info */
++	time_t			cl_first_state; /* first state acquisition */
++ atomic_t cl_count; /* ref count */
++};
++
++/* struct nfs4_client_reclaim
++ * one per old client. Populates reset_str_hashtbl. Filled from conf_id_hashtbl
++ * upon lease reset, or from upcall to state_daemon (to read in state
++ * from non-volatile storage) upon reboot.
++ */
++struct nfs4_client_reclaim {
++ struct list_head cr_strhash; /* hash by cr_name */
++ struct xdr_netobj cr_name; /* id generated by client */
++	time_t		cr_first_state; /* first state acquisition */
++ u32 cr_expired; /* boolean: lease expired? */
+ };
+
+ static inline void
+@@ -216,5 +248,8 @@ extern int nfs4_share_conflict(struct sv
+ extern void nfs4_lock_state(void);
+ extern void nfs4_unlock_state(void);
+ extern int nfs4_in_grace(void);
+-extern int nfs4_in_no_grace(void);
++extern int nfs4_check_open_reclaim(clientid_t *clid);
++extern void nfsd4_probe_callback(struct nfs4_client *clp);
++extern void expire_client(struct nfs4_client *clp);
++extern void put_nfs4_client(struct nfs4_client *clp);
+ #endif /* NFSD4_STATE_H */
+--- linux-2.6.7/include/linux/nfsd/nfsd.h.lsec 2004-06-15 23:20:04.000000000 -0600
++++ linux-2.6.7/include/linux/nfsd/nfsd.h 2005-03-23 14:28:24.133380864 -0700
+@@ -76,6 +76,11 @@ int nfsd_lookup(struct svc_rqst *, stru
+ const char *, int, struct svc_fh *);
+ int nfsd_setattr(struct svc_rqst *, struct svc_fh *,
+ struct iattr *, int, time_t);
++#ifdef CONFIG_NFSD_V4
++int nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *,
++ struct nfs4_acl *);
++int nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **);
++#endif /* CONFIG_NFSD_V4 */
+ int nfsd_create(struct svc_rqst *, struct svc_fh *,
+ char *name, int len, struct iattr *attrs,
+ int type, dev_t rdev, struct svc_fh *res);
+@@ -126,9 +131,13 @@ int nfsd_permission(struct svc_export *
+ #ifdef CONFIG_NFSD_V4
+ void nfs4_state_init(void);
+ void nfs4_state_shutdown(void);
++time_t nfs4_lease_time(void);
++void nfs4_reset_lease(time_t leasetime);
+ #else
+ void static inline nfs4_state_init(void){}
+ void static inline nfs4_state_shutdown(void){}
++time_t static inline nfs4_lease_time(void){return 0;}
++void static inline nfs4_reset_lease(time_t leasetime){}
+ #endif
+
+ /*
+@@ -249,12 +258,11 @@ static inline int is_fsid(struct svc_fh
+ #define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */
+ #define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */
+
+-#define NFSD_LEASE_TIME 60 /* seconds */
++#define NFSD_LEASE_TIME (nfs4_lease_time())
+ #define NFSD_LAUNDROMAT_MINTIMEOUT 10 /* seconds */
+
+ /*
+ * The following attributes are currently not supported by the NFSv4 server:
+- * ACL (will be supported in a forthcoming patch)
+ * ARCHIVE (deprecated anyway)
+ * FS_LOCATIONS (will be supported eventually)
+ * HIDDEN (unlikely to be supported any time soon)
+@@ -274,7 +282,7 @@ static inline int is_fsid(struct svc_fh
+ | FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FILEID | FATTR4_WORD0_FILES_AVAIL \
+ | FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_HOMOGENEOUS \
+ | FATTR4_WORD0_MAXFILESIZE | FATTR4_WORD0_MAXLINK | FATTR4_WORD0_MAXNAME \
+- | FATTR4_WORD0_MAXREAD | FATTR4_WORD0_MAXWRITE)
++ | FATTR4_WORD0_MAXREAD | FATTR4_WORD0_MAXWRITE | FATTR4_WORD0_ACL)
+
+ #define NFSD_SUPPORTED_ATTRS_WORD1 \
+ (FATTR4_WORD1_MODE | FATTR4_WORD1_NO_TRUNC | FATTR4_WORD1_NUMLINKS \
+@@ -289,7 +297,8 @@ static inline int is_fsid(struct svc_fh
+ (FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
+
+ /* These are the only attrs allowed in CREATE/OPEN/SETATTR. */
+-#define NFSD_WRITEABLE_ATTRS_WORD0 FATTR4_WORD0_SIZE
++#define NFSD_WRITEABLE_ATTRS_WORD0 \
++(FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL )
+ #define NFSD_WRITEABLE_ATTRS_WORD1 \
+ (FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \
+ | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_MODIFY_SET)
+--- linux-2.6.7/include/linux/nfsd/xdr4.h.lsec 2004-06-15 23:18:59.000000000 -0600
++++ linux-2.6.7/include/linux/nfsd/xdr4.h 2005-03-23 14:28:24.082388616 -0700
+@@ -39,6 +39,8 @@
+ #ifndef _LINUX_NFSD_XDR4_H
+ #define _LINUX_NFSD_XDR4_H
+
++#include <linux/nfs4.h>
++
+ #define NFSD4_MAX_TAGLEN 128
+ #define XDR_LEN(n) (((n) + 3) & ~3)
+
+@@ -95,6 +97,7 @@ struct nfsd4_create {
+ u32 cr_bmval[2]; /* request */
+ struct iattr cr_iattr; /* request */
+ struct nfsd4_change_info cr_cinfo; /* response */
++ struct nfs4_acl *cr_acl;
+ };
+ #define cr_linklen u.link.namelen
+ #define cr_linkname u.link.name
+@@ -216,7 +219,7 @@ struct nfsd4_open {
+ u32 op_rflags; /* response */
+ int op_truncate; /* used during processing */
+ struct nfs4_stateowner *op_stateowner; /* used during processing */
+-
++ struct nfs4_acl *op_acl;
+ };
+ #define op_iattr u.iattr
+ #define op_verf u.verf
+@@ -291,6 +294,7 @@ struct nfsd4_setattr {
+ stateid_t sa_stateid; /* request */
+ u32 sa_bmval[2]; /* request */
+ struct iattr sa_iattr; /* request */
++ struct nfs4_acl *sa_acl;
+ };
+
+ struct nfsd4_setclientid {
+@@ -378,6 +382,7 @@ struct nfsd4_compoundargs {
+ u32 * tmpp;
+ struct tmpbuf {
+ struct tmpbuf *next;
++ void (*release)(const void *);
+ void *buf;
+ } *to_free;
+
+@@ -449,6 +454,7 @@ extern int nfsd4_locku(struct svc_rqst *
+ extern int
+ nfsd4_release_lockowner(struct svc_rqst *rqstp,
+ struct nfsd4_release_lockowner *rlockowner);
++extern void nfsd4_release_compoundargs(struct nfsd4_compoundargs *);
+ #endif
+
+ /*
+--- linux-2.6.7/include/linux/nfs_fs.h.lsec 2004-06-15 23:19:13.000000000 -0600
++++ linux-2.6.7/include/linux/nfs_fs.h 2005-03-23 14:28:23.338501704 -0700
+@@ -28,6 +28,7 @@
+ #include <linux/nfs3.h>
+ #include <linux/nfs4.h>
+ #include <linux/nfs_xdr.h>
++#include <linux/rwsem.h>
+ #include <linux/workqueue.h>
+
+ /*
+@@ -75,15 +76,33 @@
+ #ifdef __KERNEL__
+
+ /*
+- * NFSv3 Access mode cache
++ * NFSv3/v4 Access mode cache entry
+ */
+-struct nfs_access_cache {
++struct nfs_access_entry {
+ unsigned long jiffies;
+ struct rpc_cred * cred;
+ int mask;
+- int err;
+ };
+
++struct nfs4_state;
++struct nfs_open_context {
++ atomic_t count;
++ struct dentry *dentry;
++ struct rpc_cred *cred;
++ struct nfs4_state *state;
++ unsigned int pid;
++ int mode;
++ int error;
++
++ struct list_head list;
++ wait_queue_head_t waitq;
++};
++
++/*
++ * NFSv4 delegation
++ */
++struct nfs_delegation;
++
+ /*
+ * nfs fs inode data in memory
+ */
+@@ -137,7 +156,7 @@ struct nfs_inode {
+ */
+ atomic_t data_updates;
+
+- struct nfs_access_cache cache_access;
++ struct nfs_access_entry cache_access;
+
+ /*
+ * This is the cookie verifier used for NFSv3 readdir
+@@ -156,16 +175,20 @@ struct nfs_inode {
+ ncommit,
+ npages;
+
+- /* Credentials for shared mmap */
+- struct rpc_cred *mm_cred;
++ /* Open contexts for shared mmap writes */
++ struct list_head open_files;
+
+ wait_queue_head_t nfs_i_wait;
+
+ #ifdef CONFIG_NFS_V4
+ /* NFSv4 state */
+ struct list_head open_states;
++ struct nfs_delegation *delegation;
++ int delegation_state;
++ struct rw_semaphore rwsem;
+ #endif /* CONFIG_NFS_V4*/
+-
++ void *acl;
++ ssize_t acl_len;
+ struct inode vfs_inode;
+ };
+
+@@ -259,6 +282,18 @@ static inline int nfs_verify_change_attr
+ && chattr == NFS_I(inode)->cache_change_attribute;
+ }
+
++/**
++ * nfs_compare_fh - compare two filehandles for equality
++ * @fh1 - pointer to first filehandle
++ * @fh2 - pointer to second filehandle
++ */
++static inline int nfs_compare_fh(const struct nfs_fh *fh1, const struct nfs_fh *fh2)
++{
++ if (fh1->size == fh2->size)
++ return memcmp(fh1->data, fh2->data, fh1->size);
++ return (fh1->size > fh2->size) ? 1 : -1;
++}
++
+ /*
+ * linux/fs/nfs/inode.c
+ */
+@@ -268,9 +303,12 @@ extern struct inode *nfs_fhget(struct su
+ extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *);
+ extern int nfs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
+ extern int nfs_permission(struct inode *, int, struct nameidata *);
+-extern void nfs_set_mmcred(struct inode *, struct rpc_cred *);
++extern int nfs_access_get_cached(struct inode *, struct rpc_cred *, struct nfs_access_entry *);
++extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *);
+ extern int nfs_open(struct inode *, struct file *);
+ extern int nfs_release(struct inode *, struct file *);
++extern int nfs_attribute_timeout(struct inode *inode);
++extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode);
+ extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *);
+ extern int nfs_setattr(struct dentry *, struct iattr *);
+ extern void nfs_begin_attr_update(struct inode *);
+@@ -278,6 +316,12 @@ extern void nfs_end_attr_update(struct i
+ extern void nfs_begin_data_update(struct inode *);
+ extern void nfs_end_data_update(struct inode *);
+ extern void nfs_end_data_update_defer(struct inode *);
++extern struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, struct rpc_cred *cred);
++extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx);
++extern void put_nfs_open_context(struct nfs_open_context *ctx);
++extern void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx);
++extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, int mode);
++extern void nfs_file_clear_open_context(struct file *filp);
+
+ /* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. */
+ extern u32 root_nfs_parse_addr(char *name); /*__init*/
+@@ -289,16 +333,15 @@ extern struct inode_operations nfs_file_
+ extern struct file_operations nfs_file_operations;
+ extern struct address_space_operations nfs_file_aops;
+
+-static __inline__ struct rpc_cred *
+-nfs_file_cred(struct file *file)
++static inline struct rpc_cred *nfs_file_cred(struct file *file)
+ {
+- struct rpc_cred *cred = NULL;
+- if (file)
+- cred = (struct rpc_cred *)file->private_data;
+-#ifdef RPC_DEBUG
+- BUG_ON(cred && cred->cr_magic != RPCAUTH_CRED_MAGIC);
+-#endif
+- return cred;
++ if (file != NULL) {
++ struct nfs_open_context *ctx;
++
++ ctx = (struct nfs_open_context*)file->private_data;
++ return ctx->cred;
++ }
++ return NULL;
+ }
+
+ /*
+@@ -418,28 +461,6 @@ extern int nfsroot_mount(struct sockadd
+ * inline functions
+ */
+
+-static inline int nfs_attribute_timeout(struct inode *inode)
+-{
+- struct nfs_inode *nfsi = NFS_I(inode);
+-
+- return time_after(jiffies, nfsi->read_cache_jiffies+nfsi->attrtimeo);
+-}
+-
+-/**
+- * nfs_revalidate_inode - Revalidate the inode attributes
+- * @server - pointer to nfs_server struct
+- * @inode - pointer to inode struct
+- *
+- * Updates inode attribute information by retrieving the data from the server.
+- */
+-static inline int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
+-{
+- if (!(NFS_FLAGS(inode) & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))
+- && !nfs_attribute_timeout(inode))
+- return NFS_STALE(inode) ? -ESTALE : 0;
+- return __nfs_revalidate_inode(server, inode);
+-}
+-
+ static inline loff_t
+ nfs_size_to_loff_t(__u64 size)
+ {
+@@ -507,8 +528,6 @@ struct idmap;
+
+ enum nfs4_client_state {
+ NFS4CLNT_OK = 0,
+- NFS4CLNT_NEW,
+- NFS4CLNT_SETUP_STATE,
+ };
+
+ /*
+@@ -520,7 +539,6 @@ struct nfs4_client {
+ u64 cl_clientid; /* constant */
+ nfs4_verifier cl_confirm;
+ unsigned long cl_state;
+- long cl_generation;
+
+ u32 cl_lockowner_id;
+
+@@ -530,6 +548,7 @@ struct nfs4_client {
+ */
+ struct rw_semaphore cl_sem;
+
++ struct list_head cl_delegations;
+ struct list_head cl_state_owners;
+ struct list_head cl_unused;
+ int cl_nunused;
+@@ -573,12 +592,11 @@ struct nfs4_state_owner {
+ u32 so_id; /* 32-bit identifier, unique */
+ struct semaphore so_sema;
+ u32 so_seqid; /* protected by so_sema */
+- unsigned int so_flags; /* protected by so_sema */
+ atomic_t so_count;
+- long so_generation;
+
+ struct rpc_cred *so_cred; /* Associated cred */
+ struct list_head so_states;
++ struct list_head so_delegations;
+ };
+
+ /*
+@@ -593,10 +611,13 @@ struct nfs4_state_owner {
+ * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN)
+ */
+
++/* bits for nfs4_lock_state->flags */
++
+ struct nfs4_lock_state {
+ struct list_head ls_locks; /* Other lock stateids */
+- fl_owner_t ls_owner; /* POSIX lock owner */
+- struct nfs4_state * ls_parent; /* Parent nfs4_state */
++ unsigned int ls_pid; /* pid of owner process */
++#define NFS_LOCK_INITIALIZED 1
++ int flags;
+ u32 ls_seqid;
+ u32 ls_id;
+ nfs4_stateid ls_stateid;
+@@ -606,6 +627,7 @@ struct nfs4_lock_state {
+ /* bits for nfs4_state->flags */
+ enum {
+ LK_STATE_IN_USE,
++ NFS_DELEGATED_STATE,
+ };
+
+ struct nfs4_state {
+@@ -629,8 +651,19 @@ struct nfs4_state {
+ };
+
+
++struct nfs4_exception {
++ long timeout;
++ int retry;
++};
++
+ extern struct dentry_operations nfs4_dentry_operations;
+ extern struct inode_operations nfs4_dir_inode_operations;
++extern struct inode_operations nfs4_file_inode_operations;
++
++/* inode.c */
++extern ssize_t nfs_getxattr(struct dentry *, const char *, void *, size_t);
++extern int nfs_setxattr(struct dentry *, const char *, const void *, size_t, int);
++extern ssize_t nfs_listxattr(struct dentry *, char *, size_t);
+
+ /* nfs4proc.c */
+ extern int nfs4_proc_setclientid(struct nfs4_client *, u32, unsigned short);
+@@ -639,10 +672,15 @@ extern int nfs4_open_reclaim(struct nfs4
+ extern int nfs4_proc_async_renew(struct nfs4_client *);
+ extern int nfs4_proc_renew(struct nfs4_client *);
+ extern int nfs4_do_close(struct inode *, struct nfs4_state *);
+-int nfs4_do_downgrade(struct inode *inode, struct nfs4_state *state, mode_t mode);
++extern int nfs4_do_downgrade(struct inode *inode, struct nfs4_state *state, mode_t mode);
+ extern int nfs4_wait_clnt_recover(struct rpc_clnt *, struct nfs4_client *);
+ extern struct inode *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
+ extern int nfs4_open_revalidate(struct inode *, struct dentry *, int);
++extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception *);
++extern int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request);
++extern ssize_t nfs4_proc_get_acl(struct inode *, void *buf, ssize_t buflen);
++extern int nfs4_proc_set_acl(struct inode *, const void *buf, ssize_t buflen);
++extern void nfs4_zap_acl_attr(struct inode *inode);
+
+ /* nfs4renewd.c */
+ extern void nfs4_schedule_state_renewal(struct nfs4_client *);
+@@ -654,6 +692,8 @@ extern void init_nfsv4_state(struct nfs_
+ extern void destroy_nfsv4_state(struct nfs_server *);
+ extern struct nfs4_client *nfs4_get_client(struct in_addr *);
+ extern void nfs4_put_client(struct nfs4_client *clp);
++extern int nfs4_init_client(struct nfs4_client *clp);
++extern struct nfs4_client *nfs4_find_client(struct in_addr *);
+ extern u32 nfs4_alloc_lockowner_id(struct nfs4_client *);
+
+ extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
+@@ -663,15 +703,14 @@ extern void nfs4_put_open_state(struct n
+ extern void nfs4_close_state(struct nfs4_state *, mode_t);
+ extern struct nfs4_state *nfs4_find_state(struct inode *, struct rpc_cred *, mode_t mode);
+ extern void nfs4_increment_seqid(int status, struct nfs4_state_owner *sp);
+-extern int nfs4_handle_error(struct nfs_server *, int);
+ extern void nfs4_schedule_state_recovery(struct nfs4_client *);
+-extern struct nfs4_lock_state *nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t);
+-extern struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t);
++extern struct nfs4_lock_state *nfs4_find_lock_state(struct nfs4_state *state, unsigned int pid);
++extern struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, unsigned int pid);
+ extern void nfs4_put_lock_state(struct nfs4_lock_state *state);
+ extern void nfs4_increment_lock_seqid(int status, struct nfs4_lock_state *ls);
+-extern void nfs4_notify_setlk(struct inode *, struct file_lock *, struct nfs4_lock_state *);
+-extern void nfs4_notify_unlck(struct inode *, struct file_lock *, struct nfs4_lock_state *);
+-extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t);
++extern void nfs4_notify_setlk(struct nfs4_state *, struct file_lock *, struct nfs4_lock_state *);
++extern void nfs4_notify_unlck(struct nfs4_state *, struct file_lock *, struct nfs4_lock_state *);
++extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, unsigned int pid);
+
+
+
+@@ -681,6 +720,7 @@ struct nfs4_mount_data;
+ #define destroy_nfsv4_state(server) do { } while (0)
+ #define nfs4_put_state_owner(inode, owner) do { } while (0)
+ #define nfs4_put_open_state(state) do { } while (0)
++#define nfs4_close_state(a, b) do { } while (0)
+ #define nfs4_renewd_prepare_shutdown(server) do { } while (0)
+ #endif
+
+@@ -697,6 +737,7 @@ struct nfs4_mount_data;
+ #define NFSDBG_XDR 0x0020
+ #define NFSDBG_FILE 0x0040
+ #define NFSDBG_ROOT 0x0080
++#define NFSDBG_CALLBACK 0x0100
+ #define NFSDBG_ALL 0xFFFF
+
+ #ifdef __KERNEL__
+--- linux-2.6.7/include/linux/nfs4_acl.h.lsec 2005-03-23 14:28:24.519322192 -0700
++++ linux-2.6.7/include/linux/nfs4_acl.h 2005-03-23 14:28:24.518322344 -0700
+@@ -0,0 +1,59 @@
++/*
++ * include/linux/nfs4_acl.h
++ *
++ * Common NFSv4 ACL handling definitions.
++ *
++ * Copyright (c) 2002 The Regents of the University of Michigan.
++ * All rights reserved.
++ *
++ * Marius Aamodt Eriksen <marius@umich.edu>
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ * 2. Redistributions in binary form must reproduce the above copyright
++ * notice, this list of conditions and the following disclaimer in the
++ * documentation and/or other materials provided with the distribution.
++ * 3. Neither the name of the University nor the names of its
++ * contributors may be used to endorse or promote products derived
++ * from this software without specific prior written permission.
++ *
++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
++
++#ifndef LINUX_NFS4_ACL_H
++#define LINUX_NFS4_ACL_H
++
++#include <linux/posix_acl.h>
++
++struct nfs4_acl *nfs4_acl_new(void);
++void nfs4_acl_free(struct nfs4_acl *);
++int nfs4_acl_add_ace(struct nfs4_acl *, u32, u32, u32, int, uid_t);
++int nfs4_acl_get_whotype(char *, u32);
++int nfs4_acl_write_who(int who, char *p);
++int nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group,
++ uid_t who, u32 mask);
++
++#define NFS4_ACL_TYPE_DEFAULT 0x01
++#define NFS4_ACL_DIR 0x02
++#define NFS4_ACL_OWNER 0x04
++
++struct nfs4_acl *nfs4_acl_posix_to_nfsv4(struct posix_acl *,
++ struct posix_acl *, unsigned int flags);
++int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *, struct posix_acl **,
++ struct posix_acl **, unsigned int flags);
++
++#endif /* LINUX_NFS4_ACL_H */
+--- linux-2.6.7/include/linux/nfs_xdr.h.lsec 2004-06-15 23:19:52.000000000 -0600
++++ linux-2.6.7/include/linux/nfs_xdr.h 2005-03-23 14:28:23.539471152 -0700
+@@ -99,20 +99,21 @@ struct nfs4_change_info {
+ * Arguments to the open call.
+ */
+ struct nfs_openargs {
+- struct nfs_fh * fh;
++ const struct nfs_fh * fh;
+ __u32 seqid;
+- __u32 share_access;
++ int open_flags;
+ __u64 clientid;
+ __u32 id;
+- __u32 opentype;
+- __u32 createmode;
+ union {
+ struct iattr * attrs; /* UNCHECKED, GUARDED */
+ nfs4_verifier verifier; /* EXCLUSIVE */
++ nfs4_stateid delegation; /* CLAIM_DELEGATE_CUR */
++ int delegation_type; /* CLAIM_PREVIOUS */
+ } u;
+ const struct qstr * name;
+ const struct nfs_server *server; /* Needed for ID mapping */
+ const u32 * bitmask;
++ __u32 claim;
+ };
+
+ struct nfs_openres {
+@@ -122,13 +123,17 @@ struct nfs_openres {
+ __u32 rflags;
+ struct nfs_fattr * f_attr;
+ const struct nfs_server *server;
++ int delegation_type;
++ nfs4_stateid delegation;
++ __u32 do_recall;
++ __u64 maxsize;
+ };
+
+ /*
+ * Arguments to the open_confirm call.
+ */
+ struct nfs_open_confirmargs {
+- struct nfs_fh * fh;
++ const struct nfs_fh * fh;
+ nfs4_stateid stateid;
+ __u32 seqid;
+ };
+@@ -138,26 +143,13 @@ struct nfs_open_confirmres {
+ };
+
+ /*
+- * Arguments to the open_reclaim call.
+- */
+-struct nfs_open_reclaimargs {
+- struct nfs_fh * fh;
+- __u64 clientid;
+- __u32 seqid;
+- __u32 id;
+- __u32 share_access;
+- __u32 claim;
+- const __u32 * bitmask;
+-};
+-
+-/*
+ * Arguments to the close call.
+ */
+ struct nfs_closeargs {
+ struct nfs_fh * fh;
+ nfs4_stateid stateid;
+ __u32 seqid;
+- __u32 share_access;
++ int open_flags;
+ };
+
+ struct nfs_closeres {
+@@ -224,6 +216,11 @@ struct nfs_lockres {
+ const struct nfs_server * server;
+ };
+
++struct nfs4_delegreturnargs {
++ const struct nfs_fh *fhandle;
++ const nfs4_stateid *stateid;
++};
++
+ /*
+ * Arguments to the read call.
+ */
+@@ -235,8 +232,7 @@ struct nfs_lockres {
+
+ struct nfs_readargs {
+ struct nfs_fh * fh;
+- fl_owner_t lockowner;
+- struct nfs4_state * state;
++ struct nfs_open_context *context;
+ __u64 offset;
+ __u32 count;
+ unsigned int pgbase;
+@@ -259,8 +255,7 @@ struct nfs_readres {
+
+ struct nfs_writeargs {
+ struct nfs_fh * fh;
+- fl_owner_t lockowner;
+- struct nfs4_state * state;
++ struct nfs_open_context *context;
+ __u64 offset;
+ __u32 count;
+ enum nfs3_stable_how stable;
+@@ -331,6 +326,19 @@ struct nfs_setattrargs {
+ const u32 * bitmask;
+ };
+
++struct nfs_setaclargs {
++ struct nfs_fh * fh;
++ const char * acl;
++ ssize_t acl_len;
++ const struct nfs_server * server; /* Needed for name mapping */
++};
++
++struct nfs_getaclres {
++ char * acl;
++ ssize_t acl_len;
++ const struct nfs_server * server; /* Needed for name mapping */
++};
++
+ struct nfs_setattrres {
+ struct nfs_fattr * fattr;
+ const struct nfs_server * server;
+@@ -597,13 +605,15 @@ struct nfs4_rename_res {
+ };
+
+ struct nfs4_setclientid {
+- nfs4_verifier sc_verifier; /* request */
+- char * sc_name; /* request */
++ const nfs4_verifier * sc_verifier; /* request */
++ unsigned int sc_name_len;
++ char sc_name[32]; /* request */
+ u32 sc_prog; /* request */
++ unsigned int sc_netid_len;
+ char sc_netid[4]; /* request */
++ unsigned int sc_uaddr_len;
+ char sc_uaddr[24]; /* request */
+ u32 sc_cb_ident; /* request */
+- struct nfs4_client * sc_state; /* response */
+ };
+
+ struct nfs4_statfs_arg {
+@@ -657,6 +667,8 @@ struct nfs_write_data {
+ void (*complete) (struct nfs_write_data *, int);
+ };
+
++struct nfs_access_entry;
++
+ /*
+ * RPC procedure vector for NFSv2/NFSv3 demuxing
+ */
+@@ -664,6 +676,7 @@ struct nfs_rpc_ops {
+ int version; /* Protocol version */
+ struct dentry_operations *dentry_ops;
+ struct inode_operations *dir_inode_ops;
++ struct inode_operations *file_inode_ops;
+
+ int (*getroot) (struct nfs_server *, struct nfs_fh *,
+ struct nfs_fsinfo *);
+@@ -672,11 +685,11 @@ struct nfs_rpc_ops {
+ struct iattr *);
+ int (*lookup) (struct inode *, struct qstr *,
+ struct nfs_fh *, struct nfs_fattr *);
+- int (*access) (struct inode *, struct rpc_cred *, int);
++ int (*access) (struct inode *, struct nfs_access_entry *);
+ int (*readlink)(struct inode *, struct page *);
+- int (*read) (struct nfs_read_data *, struct file *);
+- int (*write) (struct nfs_write_data *, struct file *);
+- int (*commit) (struct nfs_write_data *, struct file *);
++ int (*read) (struct nfs_read_data *);
++ int (*write) (struct nfs_write_data *);
++ int (*commit) (struct nfs_write_data *);
+ struct inode * (*create) (struct inode *, struct qstr *,
+ struct iattr *, int);
+ int (*remove) (struct inode *, struct qstr *);
+@@ -708,8 +721,6 @@ struct nfs_rpc_ops {
+ void (*commit_setup) (struct nfs_write_data *, int how);
+ int (*file_open) (struct inode *, struct file *);
+ int (*file_release) (struct inode *, struct file *);
+- void (*request_init)(struct nfs_page *, struct file *);
+- int (*request_compatible)(struct nfs_page *, struct file *, struct page *);
+ int (*lock)(struct file *, int, struct file_lock *);
+ };
+
+--- linux-2.6.7/arch/s390/defconfig.lsec 2004-06-15 23:19:52.000000000 -0600
++++ linux-2.6.7/arch/s390/defconfig 2005-03-23 14:28:23.869420992 -0700
+@@ -422,7 +422,7 @@ CONFIG_NFS_V3=y
+ CONFIG_NFSD=y
+ CONFIG_NFSD_V3=y
+ # CONFIG_NFSD_V4 is not set
+-# CONFIG_NFSD_TCP is not set
++CONFIG_NFSD_TCP=y
+ CONFIG_LOCKD=y
+ CONFIG_LOCKD_V4=y
+ CONFIG_EXPORTFS=y
+--- linux-2.6.7/arch/ia64/defconfig.lsec 2004-06-15 23:18:57.000000000 -0600
++++ linux-2.6.7/arch/ia64/defconfig 2005-03-23 14:28:23.816429048 -0700
+@@ -987,7 +987,7 @@ CONFIG_NFS_DIRECTIO=y
+ CONFIG_NFSD=y
+ CONFIG_NFSD_V3=y
+ # CONFIG_NFSD_V4 is not set
+-# CONFIG_NFSD_TCP is not set
++CONFIG_NFSD_TCP=y
+ CONFIG_LOCKD=y
+ CONFIG_LOCKD_V4=y
+ CONFIG_EXPORTFS=y
+--- linux-2.6.7/arch/ppc/defconfig.lsec 2004-06-15 23:19:52.000000000 -0600
++++ linux-2.6.7/arch/ppc/defconfig 2005-03-23 14:28:23.817428896 -0700
+@@ -1230,7 +1230,7 @@ CONFIG_NFS_V3=y
+ CONFIG_NFSD=y
+ CONFIG_NFSD_V3=y
+ # CONFIG_NFSD_V4 is not set
+-# CONFIG_NFSD_TCP is not set
++CONFIG_NFSD_TCP=y
+ CONFIG_LOCKD=y
+ CONFIG_LOCKD_V4=y
+ CONFIG_EXPORTFS=y
+--- linux-2.6.7/arch/i386/defconfig.lsec 2004-06-15 23:19:42.000000000 -0600
++++ linux-2.6.7/arch/i386/defconfig 2005-03-23 14:28:23.763437104 -0700
+@@ -1148,7 +1148,7 @@ CONFIG_NFS_FS=y
+ # CONFIG_NFS_DIRECTIO is not set
+ CONFIG_NFSD=y
+ # CONFIG_NFSD_V3 is not set
+-# CONFIG_NFSD_TCP is not set
++CONFIG_NFSD_TCP=y
+ CONFIG_LOCKD=y
+ CONFIG_EXPORTFS=y
+ CONFIG_SUNRPC=y
+--- linux-2.6.7/arch/alpha/defconfig.lsec 2004-06-15 23:19:23.000000000 -0600
++++ linux-2.6.7/arch/alpha/defconfig 2005-03-23 14:28:23.762437256 -0700
+@@ -791,7 +791,7 @@ CONFIG_NFS_V3=y
+ CONFIG_NFSD=m
+ CONFIG_NFSD_V3=y
+ # CONFIG_NFSD_V4 is not set
+-# CONFIG_NFSD_TCP is not set
++CONFIG_NFSD_TCP=y
+ CONFIG_LOCKD=m
+ CONFIG_LOCKD_V4=y
+ CONFIG_EXPORTFS=m
+--- linux-2.6.7/net/sunrpc/svcauth_unix.c.lsec 2004-06-15 23:19:37.000000000 -0600
++++ linux-2.6.7/net/sunrpc/svcauth_unix.c 2005-03-23 14:28:24.295356240 -0700
+@@ -55,12 +55,10 @@ struct auth_domain *unix_domain_find(cha
+ if (new == NULL)
+ return NULL;
+ cache_init(&new->h.h);
+- atomic_inc(&new->h.h.refcnt);
+ new->h.name = strdup(name);
+ new->h.flavour = RPC_AUTH_UNIX;
+ new->addr_changes = 0;
+ new->h.h.expiry_time = NEVER;
+- new->h.h.flags = 0;
+
+ rv = auth_domain_lookup(&new->h, 2);
+ if (rv == &new->h) {
+@@ -262,7 +260,7 @@ struct cache_detail ip_map_cache = {
+ .cache_show = ip_map_show,
+ };
+
+-static DefineSimpleCacheLookup(ip_map, 0)
++static DefineSimpleCacheLookup(ip_map)
+
+
+ int auth_unix_add_addr(struct in_addr addr, struct auth_domain *dom)
+@@ -318,7 +316,8 @@ struct auth_domain *auth_unix_lookup(str
+ return NULL;
+
+ if ((ipm->m_client->addr_changes - ipm->m_add_change) >0) {
+- set_bit(CACHE_NEGATIVE, &ipm->h.flags);
++ if (test_and_set_bit(CACHE_NEGATIVE, &ipm->h.flags) == 0)
++ auth_domain_put(&ipm->m_client->h);
+ rv = NULL;
+ } else {
+ rv = &ipm->m_client->h;
+@@ -405,6 +404,9 @@ svcauth_null_release(struct svc_rqst *rq
+ if (rqstp->rq_client)
+ auth_domain_put(rqstp->rq_client);
+ rqstp->rq_client = NULL;
++ if (rqstp->rq_cred.cr_group_info)
++ put_group_info(rqstp->rq_cred.cr_group_info);
++ rqstp->rq_cred.cr_group_info = NULL;
+
+ return 0; /* don't drop */
+ }
+--- linux-2.6.7/net/sunrpc/xprt.c.lsec 2004-06-15 23:19:42.000000000 -0600
++++ linux-2.6.7/net/sunrpc/xprt.c 2005-03-23 14:28:23.706445768 -0700
+@@ -1099,7 +1099,7 @@ xprt_write_space(struct sock *sk)
+ goto out;
+
+ spin_lock_bh(&xprt->sock_lock);
+- if (xprt->snd_task && xprt->snd_task->tk_rpcwait == &xprt->pending)
++ if (xprt->snd_task)
+ rpc_wake_up_task(xprt->snd_task);
+ spin_unlock_bh(&xprt->sock_lock);
+ out:
+@@ -1357,6 +1357,7 @@ xprt_request_init(struct rpc_task *task,
+ req->rq_task = task;
+ req->rq_xprt = xprt;
+ req->rq_xid = xprt_alloc_xid(xprt);
++ req->rq_release_snd_buf = NULL;
+ dprintk("RPC: %4d reserved req %p xid %08x\n", task->tk_pid,
+ req, req->rq_xid);
+ }
+@@ -1382,6 +1383,8 @@ xprt_release(struct rpc_task *task)
+ mod_timer(&xprt->timer, xprt->last_used + XPRT_IDLE_TIMEOUT);
+ spin_unlock_bh(&xprt->sock_lock);
+ task->tk_rqstp = NULL;
++ if (req->rq_release_snd_buf)
++ req->rq_release_snd_buf(req);
+ memset(req, 0, sizeof(*req)); /* mark unused */
+
+ dprintk("RPC: %4d release request %p\n", task->tk_pid, req);
+--- linux-2.6.7/net/sunrpc/sched.c.lsec 2004-06-15 23:19:35.000000000 -0600
++++ linux-2.6.7/net/sunrpc/sched.c 2005-03-23 14:28:23.651454128 -0700
+@@ -41,13 +41,7 @@ static mempool_t *rpc_buffer_mempool;
+
+ static void __rpc_default_timer(struct rpc_task *task);
+ static void rpciod_killall(void);
+-
+-/*
+- * When an asynchronous RPC task is activated within a bottom half
+- * handler, or while executing another RPC task, it is put on
+- * schedq, and rpciod is woken up.
+- */
+-static RPC_WAITQ(schedq, "schedq");
++static void rpc_async_schedule(void *);
+
+ /*
+ * RPC tasks that create another task (e.g. for contacting the portmapper)
+@@ -68,26 +62,18 @@ static LIST_HEAD(all_tasks);
+ /*
+ * rpciod-related stuff
+ */
+-static DECLARE_WAIT_QUEUE_HEAD(rpciod_idle);
+-static DECLARE_COMPLETION(rpciod_killer);
+ static DECLARE_MUTEX(rpciod_sema);
+ static unsigned int rpciod_users;
+-static pid_t rpciod_pid;
+-static int rpc_inhibit;
++static struct workqueue_struct *rpciod_workqueue;
+
+ /*
+- * Spinlock for wait queues. Access to the latter also has to be
+- * interrupt-safe in order to allow timers to wake up sleeping tasks.
+- */
+-static spinlock_t rpc_queue_lock = SPIN_LOCK_UNLOCKED;
+-/*
+ * Spinlock for other critical sections of code.
+ */
+ static spinlock_t rpc_sched_lock = SPIN_LOCK_UNLOCKED;
+
+ /*
+ * Disable the timer for a given RPC task. Should be called with
+- * rpc_queue_lock and bh_disabled in order to avoid races within
++ * queue->lock and bh_disabled in order to avoid races within
+ * rpc_run_timer().
+ */
+ static inline void
+@@ -105,16 +91,13 @@ __rpc_disable_timer(struct rpc_task *tas
+ * without calling del_timer_sync(). The latter could cause a
+ * deadlock if called while we're holding spinlocks...
+ */
+-static void
+-rpc_run_timer(struct rpc_task *task)
++static void rpc_run_timer(struct rpc_task *task)
+ {
+ void (*callback)(struct rpc_task *);
+
+- spin_lock_bh(&rpc_queue_lock);
+ callback = task->tk_timeout_fn;
+ task->tk_timeout_fn = NULL;
+- spin_unlock_bh(&rpc_queue_lock);
+- if (callback) {
++ if (callback && RPC_IS_QUEUED(task)) {
+ dprintk("RPC: %4d running timer\n", task->tk_pid);
+ callback(task);
+ }
+@@ -140,19 +123,8 @@ __rpc_add_timer(struct rpc_task *task, r
+ }
+
+ /*
+- * Set up a timer for an already sleeping task.
+- */
+-void rpc_add_timer(struct rpc_task *task, rpc_action timer)
+-{
+- spin_lock_bh(&rpc_queue_lock);
+- if (!RPC_IS_RUNNING(task))
+- __rpc_add_timer(task, timer);
+- spin_unlock_bh(&rpc_queue_lock);
+-}
+-
+-/*
+ * Delete any timer for the current task. Because we use del_timer_sync(),
+- * this function should never be called while holding rpc_queue_lock.
++ * this function should never be called while holding queue->lock.
+ */
+ static inline void
+ rpc_delete_timer(struct rpc_task *task)
+@@ -169,16 +141,17 @@ static void __rpc_add_wait_queue_priorit
+ struct list_head *q;
+ struct rpc_task *t;
+
++ INIT_LIST_HEAD(&task->u.tk_wait.links);
+ q = &queue->tasks[task->tk_priority];
+ if (unlikely(task->tk_priority > queue->maxpriority))
+ q = &queue->tasks[queue->maxpriority];
+- list_for_each_entry(t, q, tk_list) {
++ list_for_each_entry(t, q, u.tk_wait.list) {
+ if (t->tk_cookie == task->tk_cookie) {
+- list_add_tail(&task->tk_list, &t->tk_links);
++ list_add_tail(&task->u.tk_wait.list, &t->u.tk_wait.links);
+ return;
+ }
+ }
+- list_add_tail(&task->tk_list, q);
++ list_add_tail(&task->u.tk_wait.list, q);
+ }
+
+ /*
+@@ -189,37 +162,21 @@ static void __rpc_add_wait_queue_priorit
+ * improve overall performance.
+ * Everyone else gets appended to the queue to ensure proper FIFO behavior.
+ */
+-static int __rpc_add_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task)
++static void __rpc_add_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task)
+ {
+- if (task->tk_rpcwait == queue)
+- return 0;
++ BUG_ON (RPC_IS_QUEUED(task));
+
+- if (task->tk_rpcwait) {
+- printk(KERN_WARNING "RPC: doubly enqueued task!\n");
+- return -EWOULDBLOCK;
+- }
+ if (RPC_IS_PRIORITY(queue))
+ __rpc_add_wait_queue_priority(queue, task);
+ else if (RPC_IS_SWAPPER(task))
+- list_add(&task->tk_list, &queue->tasks[0]);
++ list_add(&task->u.tk_wait.list, &queue->tasks[0]);
+ else
+- list_add_tail(&task->tk_list, &queue->tasks[0]);
+- task->tk_rpcwait = queue;
++ list_add_tail(&task->u.tk_wait.list, &queue->tasks[0]);
++ task->u.tk_wait.rpc_waitq = queue;
++ rpc_set_queued(task);
+
+ dprintk("RPC: %4d added to queue %p \"%s\"\n",
+ task->tk_pid, queue, rpc_qname(queue));
+-
+- return 0;
+-}
+-
+-int rpc_add_wait_queue(struct rpc_wait_queue *q, struct rpc_task *task)
+-{
+- int result;
+-
+- spin_lock_bh(&rpc_queue_lock);
+- result = __rpc_add_wait_queue(q, task);
+- spin_unlock_bh(&rpc_queue_lock);
+- return result;
+ }
+
+ /*
+@@ -229,12 +186,12 @@ static void __rpc_remove_wait_queue_prio
+ {
+ struct rpc_task *t;
+
+- if (!list_empty(&task->tk_links)) {
+- t = list_entry(task->tk_links.next, struct rpc_task, tk_list);
+- list_move(&t->tk_list, &task->tk_list);
+- list_splice_init(&task->tk_links, &t->tk_links);
++ if (!list_empty(&task->u.tk_wait.links)) {
++ t = list_entry(task->u.tk_wait.links.next, struct rpc_task, u.tk_wait.list);
++ list_move(&t->u.tk_wait.list, &task->u.tk_wait.list);
++ list_splice_init(&task->u.tk_wait.links, &t->u.tk_wait.links);
+ }
+- list_del(&task->tk_list);
++ list_del(&task->u.tk_wait.list);
+ }
+
+ /*
+@@ -243,31 +200,17 @@ static void __rpc_remove_wait_queue_prio
+ */
+ static void __rpc_remove_wait_queue(struct rpc_task *task)
+ {
+- struct rpc_wait_queue *queue = task->tk_rpcwait;
+-
+- if (!queue)
+- return;
++ struct rpc_wait_queue *queue;
++ queue = task->u.tk_wait.rpc_waitq;
+
+ if (RPC_IS_PRIORITY(queue))
+ __rpc_remove_wait_queue_priority(task);
+ else
+- list_del(&task->tk_list);
+- task->tk_rpcwait = NULL;
+-
++ list_del(&task->u.tk_wait.list);
+ dprintk("RPC: %4d removed from queue %p \"%s\"\n",
+ task->tk_pid, queue, rpc_qname(queue));
+ }
+
+-void
+-rpc_remove_wait_queue(struct rpc_task *task)
+-{
+- if (!task->tk_rpcwait)
+- return;
+- spin_lock_bh(&rpc_queue_lock);
+- __rpc_remove_wait_queue(task);
+- spin_unlock_bh(&rpc_queue_lock);
+-}
+-
+ static inline void rpc_set_waitqueue_priority(struct rpc_wait_queue *queue, int priority)
+ {
+ queue->priority = priority;
+@@ -290,6 +233,7 @@ static void __rpc_init_priority_wait_que
+ {
+ int i;
+
++ spin_lock_init(&queue->lock);
+ for (i = 0; i < ARRAY_SIZE(queue->tasks); i++)
+ INIT_LIST_HEAD(&queue->tasks[i]);
+ queue->maxpriority = maxprio;
+@@ -316,34 +260,27 @@ EXPORT_SYMBOL(rpc_init_wait_queue);
+ * Note: If the task is ASYNC, this must be called with
+ * the spinlock held to protect the wait queue operation.
+ */
+-static inline void
+-rpc_make_runnable(struct rpc_task *task)
++static void rpc_make_runnable(struct rpc_task *task)
+ {
+- if (task->tk_timeout_fn) {
+- printk(KERN_ERR "RPC: task w/ running timer in rpc_make_runnable!!\n");
++ if (rpc_test_and_set_running(task))
+ return;
+- }
+- rpc_set_running(task);
++ BUG_ON(task->tk_timeout_fn);
+ if (RPC_IS_ASYNC(task)) {
+- if (RPC_IS_SLEEPING(task)) {
+- int status;
+- status = __rpc_add_wait_queue(&schedq, task);
+- if (status < 0) {
+- printk(KERN_WARNING "RPC: failed to add task to queue: error: %d!\n", status);
+- task->tk_status = status;
+- return;
+- }
+- rpc_clear_sleeping(task);
+- wake_up(&rpciod_idle);
++ int status;
++
++ INIT_WORK(&task->u.tk_work, rpc_async_schedule, (void *)task);
++ status = queue_work(task->tk_workqueue, &task->u.tk_work);
++ if (status < 0) {
++ printk(KERN_WARNING "RPC: failed to add task to queue: error: %d!\n", status);
++ task->tk_status = status;
++ return;
+ }
+- } else {
+- rpc_clear_sleeping(task);
+- wake_up(&task->tk_wait);
+- }
++ } else
++ wake_up(&task->u.tk_wait.waitq);
+ }
+
+ /*
+- * Place a newly initialized task on the schedq.
++ * Place a newly initialized task on the workqueue.
+ */
+ static inline void
+ rpc_schedule_run(struct rpc_task *task)
+@@ -352,33 +289,18 @@ rpc_schedule_run(struct rpc_task *task)
+ if (RPC_IS_ACTIVATED(task))
+ return;
+ task->tk_active = 1;
+- rpc_set_sleeping(task);
+ rpc_make_runnable(task);
+ }
+
+ /*
+- * For other people who may need to wake the I/O daemon
+- * but should (for now) know nothing about its innards
+- */
+-void rpciod_wake_up(void)
+-{
+- if(rpciod_pid==0)
+- printk(KERN_ERR "rpciod: wot no daemon?\n");
+- wake_up(&rpciod_idle);
+-}
+-
+-/*
+ * Prepare for sleeping on a wait queue.
+ * By always appending tasks to the list we ensure FIFO behavior.
+ * NB: An RPC task will only receive interrupt-driven events as long
+ * as it's on a wait queue.
+ */
+-static void
+-__rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task,
++static void __rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task,
+ rpc_action action, rpc_action timer)
+ {
+- int status;
+-
+ dprintk("RPC: %4d sleep_on(queue \"%s\" time %ld)\n", task->tk_pid,
+ rpc_qname(q), jiffies);
+
+@@ -388,49 +310,36 @@ __rpc_sleep_on(struct rpc_wait_queue *q,
+ }
+
+ /* Mark the task as being activated if so needed */
+- if (!RPC_IS_ACTIVATED(task)) {
++ if (!RPC_IS_ACTIVATED(task))
+ task->tk_active = 1;
+- rpc_set_sleeping(task);
+- }
+
+- status = __rpc_add_wait_queue(q, task);
+- if (status) {
+- printk(KERN_WARNING "RPC: failed to add task to queue: error: %d!\n", status);
+- task->tk_status = status;
+- } else {
+- rpc_clear_running(task);
+- if (task->tk_callback) {
+- dprintk(KERN_ERR "RPC: %4d overwrites an active callback\n", task->tk_pid);
+- BUG();
+- }
+- task->tk_callback = action;
+- __rpc_add_timer(task, timer);
+- }
++ __rpc_add_wait_queue(q, task);
++
++ BUG_ON(task->tk_callback != NULL);
++ task->tk_callback = action;
++ __rpc_add_timer(task, timer);
+ }
+
+-void
+-rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task,
++void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task,
+ rpc_action action, rpc_action timer)
+ {
+ /*
+ * Protect the queue operations.
+ */
+- spin_lock_bh(&rpc_queue_lock);
++ spin_lock_bh(&q->lock);
+ __rpc_sleep_on(q, task, action, timer);
+- spin_unlock_bh(&rpc_queue_lock);
++ spin_unlock_bh(&q->lock);
+ }
+
+ /**
+- * __rpc_wake_up_task - wake up a single rpc_task
++ * __rpc_do_wake_up_task - wake up a single rpc_task
+ * @task: task to be woken up
+ *
+- * Caller must hold rpc_queue_lock
++ * Caller must hold queue->lock, and have cleared the task queued flag.
+ */
+-static void
+-__rpc_wake_up_task(struct rpc_task *task)
++static void __rpc_do_wake_up_task(struct rpc_task *task)
+ {
+- dprintk("RPC: %4d __rpc_wake_up_task (now %ld inh %d)\n",
+- task->tk_pid, jiffies, rpc_inhibit);
++ dprintk("RPC: %4d __rpc_wake_up_task (now %ld)\n", task->tk_pid, jiffies);
+
+ #ifdef RPC_DEBUG
+ if (task->tk_magic != 0xf00baa) {
+@@ -445,12 +354,9 @@ __rpc_wake_up_task(struct rpc_task *task
+ printk(KERN_ERR "RPC: Inactive task (%p) being woken up!\n", task);
+ return;
+ }
+- if (RPC_IS_RUNNING(task))
+- return;
+
+ __rpc_disable_timer(task);
+- if (task->tk_rpcwait != &schedq)
+- __rpc_remove_wait_queue(task);
++ __rpc_remove_wait_queue(task);
+
+ rpc_make_runnable(task);
+
+@@ -458,6 +364,15 @@ __rpc_wake_up_task(struct rpc_task *task
+ }
+
+ /*
++ * Wake up the specified task
++ */
++static void __rpc_wake_up_task(struct rpc_task *task)
++{
++ if (rpc_test_and_clear_queued(task))
++ __rpc_do_wake_up_task(task);
++}
++
++/*
+ * Default timeout handler if none specified by user
+ */
+ static void
+@@ -471,14 +386,15 @@ __rpc_default_timer(struct rpc_task *tas
+ /*
+ * Wake up the specified task
+ */
+-void
+-rpc_wake_up_task(struct rpc_task *task)
++void rpc_wake_up_task(struct rpc_task *task)
+ {
+- if (RPC_IS_RUNNING(task))
+- return;
+- spin_lock_bh(&rpc_queue_lock);
+- __rpc_wake_up_task(task);
+- spin_unlock_bh(&rpc_queue_lock);
++ if (rpc_test_and_clear_queued(task)) {
++ struct rpc_wait_queue *queue = task->u.tk_wait.rpc_waitq;
++
++ spin_lock_bh(&queue->lock);
++ __rpc_do_wake_up_task(task);
++ spin_unlock_bh(&queue->lock);
++ }
+ }
+
+ /*
+@@ -494,11 +410,11 @@ static struct rpc_task * __rpc_wake_up_n
+ */
+ q = &queue->tasks[queue->priority];
+ if (!list_empty(q)) {
+- task = list_entry(q->next, struct rpc_task, tk_list);
++ task = list_entry(q->next, struct rpc_task, u.tk_wait.list);
+ if (queue->cookie == task->tk_cookie) {
+ if (--queue->nr)
+ goto out;
+- list_move_tail(&task->tk_list, q);
++ list_move_tail(&task->u.tk_wait.list, q);
+ }
+ /*
+ * Check if we need to switch queues.
+@@ -516,7 +432,7 @@ static struct rpc_task * __rpc_wake_up_n
+ else
+ q = q - 1;
+ if (!list_empty(q)) {
+- task = list_entry(q->next, struct rpc_task, tk_list);
++ task = list_entry(q->next, struct rpc_task, u.tk_wait.list);
+ goto new_queue;
+ }
+ } while (q != &queue->tasks[queue->priority]);
+@@ -541,14 +457,14 @@ struct rpc_task * rpc_wake_up_next(struc
+ struct rpc_task *task = NULL;
+
+ dprintk("RPC: wake_up_next(%p \"%s\")\n", queue, rpc_qname(queue));
+- spin_lock_bh(&rpc_queue_lock);
++ spin_lock_bh(&queue->lock);
+ if (RPC_IS_PRIORITY(queue))
+ task = __rpc_wake_up_next_priority(queue);
+ else {
+ task_for_first(task, &queue->tasks[0])
+ __rpc_wake_up_task(task);
+ }
+- spin_unlock_bh(&rpc_queue_lock);
++ spin_unlock_bh(&queue->lock);
+
+ return task;
+ }
+@@ -557,25 +473,25 @@ struct rpc_task * rpc_wake_up_next(struc
+ * rpc_wake_up - wake up all rpc_tasks
+ * @queue: rpc_wait_queue on which the tasks are sleeping
+ *
+- * Grabs rpc_queue_lock
++ * Grabs queue->lock
+ */
+ void rpc_wake_up(struct rpc_wait_queue *queue)
+ {
+ struct rpc_task *task;
+
+ struct list_head *head;
+- spin_lock_bh(&rpc_queue_lock);
++ spin_lock_bh(&queue->lock);
+ head = &queue->tasks[queue->maxpriority];
+ for (;;) {
+ while (!list_empty(head)) {
+- task = list_entry(head->next, struct rpc_task, tk_list);
++ task = list_entry(head->next, struct rpc_task, u.tk_wait.list);
+ __rpc_wake_up_task(task);
+ }
+ if (head == &queue->tasks[0])
+ break;
+ head--;
+ }
+- spin_unlock_bh(&rpc_queue_lock);
++ spin_unlock_bh(&queue->lock);
+ }
+
+ /**
+@@ -583,18 +499,18 @@ void rpc_wake_up(struct rpc_wait_queue *
+ * @queue: rpc_wait_queue on which the tasks are sleeping
+ * @status: status value to set
+ *
+- * Grabs rpc_queue_lock
++ * Grabs queue->lock
+ */
+ void rpc_wake_up_status(struct rpc_wait_queue *queue, int status)
+ {
+ struct list_head *head;
+ struct rpc_task *task;
+
+- spin_lock_bh(&rpc_queue_lock);
++ spin_lock_bh(&queue->lock);
+ head = &queue->tasks[queue->maxpriority];
+ for (;;) {
+ while (!list_empty(head)) {
+- task = list_entry(head->next, struct rpc_task, tk_list);
++ task = list_entry(head->next, struct rpc_task, u.tk_wait.list);
+ task->tk_status = status;
+ __rpc_wake_up_task(task);
+ }
+@@ -602,7 +518,7 @@ void rpc_wake_up_status(struct rpc_wait_
+ break;
+ head--;
+ }
+- spin_unlock_bh(&rpc_queue_lock);
++ spin_unlock_bh(&queue->lock);
+ }
+
+ /*
+@@ -626,18 +542,14 @@ __rpc_atrun(struct rpc_task *task)
+ /*
+ * This is the RPC `scheduler' (or rather, the finite state machine).
+ */
+-static int
+-__rpc_execute(struct rpc_task *task)
++static int __rpc_execute(struct rpc_task *task)
+ {
+ int status = 0;
+
+ dprintk("RPC: %4d rpc_execute flgs %x\n",
+ task->tk_pid, task->tk_flags);
+
+- if (!RPC_IS_RUNNING(task)) {
+- printk(KERN_WARNING "RPC: rpc_execute called for sleeping task!!\n");
+- return 0;
+- }
++ BUG_ON(RPC_IS_QUEUED(task));
+
+ restarted:
+ while (1) {
+@@ -657,7 +569,9 @@ __rpc_execute(struct rpc_task *task)
+ */
+ save_callback=task->tk_callback;
+ task->tk_callback=NULL;
++ lock_kernel();
+ save_callback(task);
++ unlock_kernel();
+ }
+
+ /*
+@@ -665,43 +579,41 @@ __rpc_execute(struct rpc_task *task)
+ * tk_action may be NULL when the task has been killed
+ * by someone else.
+ */
+- if (RPC_IS_RUNNING(task)) {
++ if (!RPC_IS_QUEUED(task)) {
+ /*
+ * Garbage collection of pending timers...
+ */
+ rpc_delete_timer(task);
+ if (!task->tk_action)
+ break;
++ lock_kernel();
+ task->tk_action(task);
+- /* micro-optimization to avoid spinlock */
+- if (RPC_IS_RUNNING(task))
+- continue;
++ unlock_kernel();
+ }
+
+ /*
+- * Check whether task is sleeping.
++ * Lockless check for whether task is sleeping or not.
+ */
+- spin_lock_bh(&rpc_queue_lock);
+- if (!RPC_IS_RUNNING(task)) {
+- rpc_set_sleeping(task);
+- if (RPC_IS_ASYNC(task)) {
+- spin_unlock_bh(&rpc_queue_lock);
++ if (!RPC_IS_QUEUED(task))
++ continue;
++ if (RPC_IS_ASYNC(task)) {
++ rpc_clear_running(task);
++ /* Careful! we may have raced... */
++ if (RPC_IS_QUEUED(task))
+ return 0;
+- }
++ if (rpc_test_and_set_running(task))
++ return 0;
++ continue;
+ }
+- spin_unlock_bh(&rpc_queue_lock);
+
+- if (!RPC_IS_SLEEPING(task))
+- continue;
++ init_waitqueue_head(&task->u.tk_wait.waitq);
++ rpc_clear_running(task);
+ /* sync task: sleep here */
+ dprintk("RPC: %4d sync task going to sleep\n", task->tk_pid);
+- if (current->pid == rpciod_pid)
+- printk(KERN_ERR "RPC: rpciod waiting on sync task!\n");
+-
+ if (!task->tk_client->cl_intr) {
+- __wait_event(task->tk_wait, !RPC_IS_SLEEPING(task));
++ __wait_event(task->u.tk_wait.waitq, RPC_IS_RUNNING(task));
+ } else {
+- __wait_event_interruptible(task->tk_wait, !RPC_IS_SLEEPING(task), status);
++ __wait_event_interruptible(task->u.tk_wait.waitq, RPC_IS_RUNNING(task), status);
+ /*
+ * When a sync task receives a signal, it exits with
+ * -ERESTARTSYS. In order to catch any callbacks that
+@@ -719,7 +631,9 @@ __rpc_execute(struct rpc_task *task)
+ }
+
+ if (task->tk_exit) {
++ lock_kernel();
+ task->tk_exit(task);
++ unlock_kernel();
+ /* If tk_action is non-null, the user wants us to restart */
+ if (task->tk_action) {
+ if (!RPC_ASSASSINATED(task)) {
+@@ -738,7 +652,6 @@ __rpc_execute(struct rpc_task *task)
+
+ /* Release all resources associated with the task */
+ rpc_release_task(task);
+-
+ return status;
+ }
+
+@@ -754,57 +667,16 @@ __rpc_execute(struct rpc_task *task)
+ int
+ rpc_execute(struct rpc_task *task)
+ {
+- int status = -EIO;
+- if (rpc_inhibit) {
+- printk(KERN_INFO "RPC: execution inhibited!\n");
+- goto out_release;
+- }
+-
+- status = -EWOULDBLOCK;
+- if (task->tk_active) {
+- printk(KERN_ERR "RPC: active task was run twice!\n");
+- goto out_err;
+- }
++ BUG_ON(task->tk_active);
+
+ task->tk_active = 1;
+ rpc_set_running(task);
+ return __rpc_execute(task);
+- out_release:
+- rpc_release_task(task);
+- out_err:
+- return status;
+ }
+
+-/*
+- * This is our own little scheduler for async RPC tasks.
+- */
+-static void
+-__rpc_schedule(void)
++static void rpc_async_schedule(void *arg)
+ {
+- struct rpc_task *task;
+- int count = 0;
+-
+- dprintk("RPC: rpc_schedule enter\n");
+- while (1) {
+-
+- task_for_first(task, &schedq.tasks[0]) {
+- __rpc_remove_wait_queue(task);
+- spin_unlock_bh(&rpc_queue_lock);
+-
+- __rpc_execute(task);
+- spin_lock_bh(&rpc_queue_lock);
+- } else {
+- break;
+- }
+-
+- if (++count >= 200 || need_resched()) {
+- count = 0;
+- spin_unlock_bh(&rpc_queue_lock);
+- schedule();
+- spin_lock_bh(&rpc_queue_lock);
+- }
+- }
+- dprintk("RPC: rpc_schedule leave\n");
++ __rpc_execute((struct rpc_task *)arg);
+ }
+
+ /*
+@@ -862,7 +734,6 @@ void rpc_init_task(struct rpc_task *task
+ task->tk_client = clnt;
+ task->tk_flags = flags;
+ task->tk_exit = callback;
+- init_waitqueue_head(&task->tk_wait);
+ if (current->uid != current->fsuid || current->gid != current->fsgid)
+ task->tk_flags |= RPC_TASK_SETUID;
+
+@@ -873,7 +744,9 @@ void rpc_init_task(struct rpc_task *task
+
+ task->tk_priority = RPC_PRIORITY_NORMAL;
+ task->tk_cookie = (unsigned long)current;
+- INIT_LIST_HEAD(&task->tk_links);
++
++ /* Initialize workqueue for async tasks */
++ task->tk_workqueue = rpciod_workqueue;
+
+ /* Add to global list of all tasks */
+ spin_lock(&rpc_sched_lock);
+@@ -942,8 +815,7 @@ cleanup:
+ goto out;
+ }
+
+-void
+-rpc_release_task(struct rpc_task *task)
++void rpc_release_task(struct rpc_task *task)
+ {
+ dprintk("RPC: %4d release task\n", task->tk_pid);
+
+@@ -961,19 +833,9 @@ rpc_release_task(struct rpc_task *task)
+ list_del(&task->tk_task);
+ spin_unlock(&rpc_sched_lock);
+
+- /* Protect the execution below. */
+- spin_lock_bh(&rpc_queue_lock);
+-
+- /* Disable timer to prevent zombie wakeup */
+- __rpc_disable_timer(task);
+-
+- /* Remove from any wait queue we're still on */
+- __rpc_remove_wait_queue(task);
+-
++ BUG_ON (rpc_test_and_clear_queued(task));
+ task->tk_active = 0;
+
+- spin_unlock_bh(&rpc_queue_lock);
+-
+ /* Synchronously delete any running timer */
+ rpc_delete_timer(task);
+
+@@ -1003,10 +865,9 @@ rpc_release_task(struct rpc_task *task)
+ * queue 'childq'. If so returns a pointer to the parent.
+ * Upon failure returns NULL.
+ *
+- * Caller must hold rpc_queue_lock
++ * Caller must hold childq.lock
+ */
+-static inline struct rpc_task *
+-rpc_find_parent(struct rpc_task *child)
++static inline struct rpc_task *rpc_find_parent(struct rpc_task *child)
+ {
+ struct rpc_task *task, *parent;
+ struct list_head *le;
+@@ -1019,17 +880,16 @@ rpc_find_parent(struct rpc_task *child)
+ return NULL;
+ }
+
+-static void
+-rpc_child_exit(struct rpc_task *child)
++static void rpc_child_exit(struct rpc_task *child)
+ {
+ struct rpc_task *parent;
+
+- spin_lock_bh(&rpc_queue_lock);
++ spin_lock_bh(&childq.lock);
+ if ((parent = rpc_find_parent(child)) != NULL) {
+ parent->tk_status = child->tk_status;
+ __rpc_wake_up_task(parent);
+ }
+- spin_unlock_bh(&rpc_queue_lock);
++ spin_unlock_bh(&childq.lock);
+ }
+
+ /*
+@@ -1052,22 +912,20 @@ fail:
+ return NULL;
+ }
+
+-void
+-rpc_run_child(struct rpc_task *task, struct rpc_task *child, rpc_action func)
++void rpc_run_child(struct rpc_task *task, struct rpc_task *child, rpc_action func)
+ {
+- spin_lock_bh(&rpc_queue_lock);
++ spin_lock_bh(&childq.lock);
+ /* N.B. Is it possible for the child to have already finished? */
+ __rpc_sleep_on(&childq, task, func, NULL);
+ rpc_schedule_run(child);
+- spin_unlock_bh(&rpc_queue_lock);
++ spin_unlock_bh(&childq.lock);
+ }
+
+ /*
+ * Kill all tasks for the given client.
+ * XXX: kill their descendants as well?
+ */
+-void
+-rpc_killall_tasks(struct rpc_clnt *clnt)
++void rpc_killall_tasks(struct rpc_clnt *clnt)
+ {
+ struct rpc_task *rovr;
+ struct list_head *le;
+@@ -1089,93 +947,14 @@ rpc_killall_tasks(struct rpc_clnt *clnt)
+
+ static DECLARE_MUTEX_LOCKED(rpciod_running);
+
+-static inline int
+-rpciod_task_pending(void)
+-{
+- return !list_empty(&schedq.tasks[0]);
+-}
+-
+-
+-/*
+- * This is the rpciod kernel thread
+- */
+-static int
+-rpciod(void *ptr)
+-{
+- int rounds = 0;
+-
+- lock_kernel();
+- /*
+- * Let our maker know we're running ...
+- */
+- rpciod_pid = current->pid;
+- up(&rpciod_running);
+-
+- daemonize("rpciod");
+- allow_signal(SIGKILL);
+-
+- dprintk("RPC: rpciod starting (pid %d)\n", rpciod_pid);
+- spin_lock_bh(&rpc_queue_lock);
+- while (rpciod_users) {
+- DEFINE_WAIT(wait);
+- if (signalled()) {
+- spin_unlock_bh(&rpc_queue_lock);
+- rpciod_killall();
+- flush_signals(current);
+- spin_lock_bh(&rpc_queue_lock);
+- }
+- __rpc_schedule();
+- if (current->flags & PF_FREEZE) {
+- spin_unlock_bh(&rpc_queue_lock);
+- refrigerator(PF_FREEZE);
+- spin_lock_bh(&rpc_queue_lock);
+- }
+-
+- if (++rounds >= 64) { /* safeguard */
+- spin_unlock_bh(&rpc_queue_lock);
+- schedule();
+- rounds = 0;
+- spin_lock_bh(&rpc_queue_lock);
+- }
+-
+- dprintk("RPC: rpciod back to sleep\n");
+- prepare_to_wait(&rpciod_idle, &wait, TASK_INTERRUPTIBLE);
+- if (!rpciod_task_pending() && !signalled()) {
+- spin_unlock_bh(&rpc_queue_lock);
+- schedule();
+- rounds = 0;
+- spin_lock_bh(&rpc_queue_lock);
+- }
+- finish_wait(&rpciod_idle, &wait);
+- dprintk("RPC: switch to rpciod\n");
+- }
+- spin_unlock_bh(&rpc_queue_lock);
+-
+- dprintk("RPC: rpciod shutdown commences\n");
+- if (!list_empty(&all_tasks)) {
+- printk(KERN_ERR "rpciod: active tasks at shutdown?!\n");
+- rpciod_killall();
+- }
+-
+- dprintk("RPC: rpciod exiting\n");
+- unlock_kernel();
+-
+- rpciod_pid = 0;
+- complete_and_exit(&rpciod_killer, 0);
+- return 0;
+-}
+-
+-static void
+-rpciod_killall(void)
++static void rpciod_killall(void)
+ {
+ unsigned long flags;
+
+ while (!list_empty(&all_tasks)) {
+ clear_thread_flag(TIF_SIGPENDING);
+ rpc_killall_tasks(NULL);
+- spin_lock_bh(&rpc_queue_lock);
+- __rpc_schedule();
+- spin_unlock_bh(&rpc_queue_lock);
++ flush_workqueue(rpciod_workqueue);
+ if (!list_empty(&all_tasks)) {
+ dprintk("rpciod_killall: waiting for tasks to exit\n");
+ yield();
+@@ -1193,28 +972,30 @@ rpciod_killall(void)
+ int
+ rpciod_up(void)
+ {
++ struct workqueue_struct *wq;
+ int error = 0;
+
+ down(&rpciod_sema);
+- dprintk("rpciod_up: pid %d, users %d\n", rpciod_pid, rpciod_users);
++ dprintk("rpciod_up: users %d\n", rpciod_users);
+ rpciod_users++;
+- if (rpciod_pid)
++ if (rpciod_workqueue)
+ goto out;
+ /*
+ * If there's no pid, we should be the first user.
+ */
+ if (rpciod_users > 1)
+- printk(KERN_WARNING "rpciod_up: no pid, %d users??\n", rpciod_users);
++ printk(KERN_WARNING "rpciod_up: no workqueue, %d users??\n", rpciod_users);
+ /*
+ * Create the rpciod thread and wait for it to start.
+ */
+- error = kernel_thread(rpciod, NULL, 0);
+- if (error < 0) {
+- printk(KERN_WARNING "rpciod_up: create thread failed, error=%d\n", error);
++ error = -ENOMEM;
++ wq = create_workqueue("rpciod");
++ if (wq == NULL) {
++ printk(KERN_WARNING "rpciod_up: create workqueue failed, error=%d\n", error);
+ rpciod_users--;
+ goto out;
+ }
+- down(&rpciod_running);
++ rpciod_workqueue = wq;
+ error = 0;
+ out:
+ up(&rpciod_sema);
+@@ -1225,20 +1006,21 @@ void
+ rpciod_down(void)
+ {
+ down(&rpciod_sema);
+- dprintk("rpciod_down pid %d sema %d\n", rpciod_pid, rpciod_users);
++ dprintk("rpciod_down sema %d\n", rpciod_users);
+ if (rpciod_users) {
+ if (--rpciod_users)
+ goto out;
+ } else
+- printk(KERN_WARNING "rpciod_down: pid=%d, no users??\n", rpciod_pid);
++ printk(KERN_WARNING "rpciod_down: no users??\n");
+
+- if (!rpciod_pid) {
++ if (!rpciod_workqueue) {
+ dprintk("rpciod_down: Nothing to do!\n");
+ goto out;
+ }
++ rpciod_killall();
+
+- kill_proc(rpciod_pid, SIGKILL, 1);
+- wait_for_completion(&rpciod_killer);
++ destroy_workqueue(rpciod_workqueue);
++ rpciod_workqueue = NULL;
+ out:
+ up(&rpciod_sema);
+ }
+@@ -1256,7 +1038,12 @@ void rpc_show_tasks(void)
+ }
+ printk("-pid- proc flgs status -client- -prog- --rqstp- -timeout "
+ "-rpcwait -action- --exit--\n");
+- alltask_for_each(t, le, &all_tasks)
++ alltask_for_each(t, le, &all_tasks) {
++ const char *rpc_waitq = "none";
++
++ if (RPC_IS_QUEUED(t))
++ rpc_waitq = rpc_qname(t->u.tk_wait.rpc_waitq);
++
+ printk("%05d %04d %04x %06d %8p %6d %8p %08ld %8s %8p %8p\n",
+ t->tk_pid,
+ (t->tk_msg.rpc_proc ? t->tk_msg.rpc_proc->p_proc : -1),
+@@ -1264,8 +1051,9 @@ void rpc_show_tasks(void)
+ t->tk_client,
+ (t->tk_client ? t->tk_client->cl_prog : 0),
+ t->tk_rqstp, t->tk_timeout,
+- rpc_qname(t->tk_rpcwait),
++ rpc_waitq,
+ t->tk_action, t->tk_exit);
++ }
+ spin_unlock(&rpc_sched_lock);
+ }
+ #endif
+--- linux-2.6.7/net/sunrpc/svcsock.c.lsec 2004-06-15 23:18:57.000000000 -0600
++++ linux-2.6.7/net/sunrpc/svcsock.c 2005-03-23 14:28:24.029396672 -0700
+@@ -414,7 +414,6 @@ svc_sendto(struct svc_rqst *rqstp, struc
+ }
+ /* send tail */
+ if (xdr->tail[0].iov_len) {
+- /* The tail *will* be in respages[0]; */
+ result = sock->ops->sendpage(sock, rqstp->rq_respages[rqstp->rq_restailpage],
+ ((unsigned long)xdr->tail[0].iov_base)& (PAGE_SIZE-1),
+ xdr->tail[0].iov_len, 0);
+--- linux-2.6.7/net/sunrpc/clnt.c.lsec 2004-06-15 23:19:13.000000000 -0600
++++ linux-2.6.7/net/sunrpc/clnt.c 2005-03-23 14:28:23.595462640 -0700
+@@ -351,7 +351,9 @@ int rpc_call_sync(struct rpc_clnt *clnt,
+ rpc_clnt_sigmask(clnt, &oldset);
+
+ /* Create/initialize a new RPC task */
+- rpc_init_task(task, clnt, NULL, flags);
++ task = rpc_new_task(clnt, NULL, flags);
++ if (task == NULL)
++ return -ENOMEM;
+ rpc_call_setup(task, msg, 0);
+
+ /* Set up the call info struct and execute the task */
+@@ -620,8 +622,14 @@ call_encode(struct rpc_task *task)
+ rpc_exit(task, -EIO);
+ return;
+ }
+- if (encode && (status = rpcauth_wrap_req(task, encode, req, p,
+- task->tk_msg.rpc_argp)) < 0) {
++ if (encode == NULL)
++ return;
++
++ status = rpcauth_wrap_req(task, encode, req, p, task->tk_msg.rpc_argp);
++ if (status == -EAGAIN) {
++ printk("XXXJBF: out of memory? Should retry here!!!\n");
++ }
++ if (status < 0) {
+ printk(KERN_WARNING "%s: can't encode arguments: %d\n",
+ clnt->cl_protname, -status);
+ rpc_exit(task, status);
+--- linux-2.6.7/net/sunrpc/sunrpc_syms.c.lsec 2004-06-15 23:19:52.000000000 -0600
++++ linux-2.6.7/net/sunrpc/sunrpc_syms.c 2005-03-23 14:32:35.589153776 -0700
+@@ -58,6 +58,8 @@ EXPORT_SYMBOL(rpc_unlink);
+ EXPORT_SYMBOL(rpc_wake_up);
+ EXPORT_SYMBOL(rpc_queue_upcall);
+ EXPORT_SYMBOL(rpc_mkpipe);
++EXPORT_SYMBOL(rpc_mkdir);
++EXPORT_SYMBOL(rpc_rmdir);
+
+ /* Client transport */
+ EXPORT_SYMBOL(xprt_create_proto);
+@@ -89,6 +91,7 @@ EXPORT_SYMBOL(svc_makesock);
+ EXPORT_SYMBOL(svc_reserve);
+ EXPORT_SYMBOL(svc_auth_register);
+ EXPORT_SYMBOL(auth_domain_lookup);
++EXPORT_SYMBOL(svc_authenticate);
+
+ /* RPC statistics */
+ #ifdef CONFIG_PROC_FS
+--- linux-2.6.7/net/sunrpc/pmap_clnt.c.lsec 2004-06-15 23:19:23.000000000 -0600
++++ linux-2.6.7/net/sunrpc/pmap_clnt.c 2005-03-23 14:28:24.134380712 -0700
+@@ -183,8 +183,10 @@ rpc_register(u32 prog, u32 vers, int pro
+ map.pm_prot = prot;
+ map.pm_port = port;
+
++ rpciod_up();
+ error = rpc_call(pmap_clnt, port? PMAP_SET : PMAP_UNSET,
+ &map, okay, 0);
++ rpciod_down();
+
+ if (error < 0) {
+ printk(KERN_WARNING
+--- linux-2.6.7/net/sunrpc/auth_gss/gss_krb5_unseal.c.lsec 2004-06-15 23:19:44.000000000 -0600
++++ linux-2.6.7/net/sunrpc/auth_gss/gss_krb5_unseal.c 2005-03-23 14:28:23.761437408 -0700
+@@ -68,20 +68,13 @@
+ #endif
+
+
+-/* message_buffer is an input if toktype is MIC and an output if it is WRAP:
+- * If toktype is MIC: read_token is a mic token, and message_buffer is the
+- * data that the mic was supposedly taken over.
+- * If toktype is WRAP: read_token is a wrap token, and message_buffer is used
+- * to return the decrypted data.
+- */
++/* read_token is a mic token, and message_buffer is the data that the mic was
++ * supposedly taken over. */
+
+-/* XXX will need to change prototype and/or just split into a separate function
+- * when we add privacy (because read_token will be in pages too). */
+ u32
+ krb5_read_token(struct krb5_ctx *ctx,
+ struct xdr_netobj *read_token,
+- struct xdr_buf *message_buffer,
+- int *qop_state, int toktype)
++ struct xdr_buf *message_buffer, int *qop_state)
+ {
+ int signalg;
+ int sealalg;
+@@ -96,20 +89,16 @@ krb5_read_token(struct krb5_ctx *ctx,
+
+ dprintk("RPC: krb5_read_token\n");
+
+- if (g_verify_token_header(&ctx->mech_used, &bodysize, &ptr, toktype,
++ if (g_verify_token_header(&ctx->mech_used, &bodysize, &ptr,
+ read_token->len))
+ goto out;
+
+- if ((*ptr++ != ((toktype>>8)&0xff)) || (*ptr++ != (toktype&0xff)))
++ if ((*ptr++ != ((KG_TOK_MIC_MSG>>8)&0xff)) ||
++ (*ptr++ != ( KG_TOK_MIC_MSG &0xff)) )
+ goto out;
+
+ /* XXX sanity-check bodysize?? */
+
+- if (toktype == KG_TOK_WRAP_MSG) {
+- /* XXX gone */
+- goto out;
+- }
+-
+ /* get the sign and seal algorithms */
+
+ signalg = ptr[0] + (ptr[1] << 8);
+@@ -120,14 +109,7 @@ krb5_read_token(struct krb5_ctx *ctx,
+ if ((ptr[4] != 0xff) || (ptr[5] != 0xff))
+ goto out;
+
+- if (((toktype != KG_TOK_WRAP_MSG) && (sealalg != 0xffff)) ||
+- ((toktype == KG_TOK_WRAP_MSG) && (sealalg == 0xffff)))
+- goto out;
+-
+- /* in the current spec, there is only one valid seal algorithm per
+- key type, so a simple comparison is ok */
+-
+- if ((toktype == KG_TOK_WRAP_MSG) && !(sealalg == ctx->sealalg))
++ if (sealalg != 0xffff)
+ goto out;
+
+ /* there are several mappings of seal algorithms to sign algorithms,
+@@ -154,7 +136,7 @@ krb5_read_token(struct krb5_ctx *ctx,
+ switch (signalg) {
+ case SGN_ALG_DES_MAC_MD5:
+ ret = make_checksum(checksum_type, ptr - 2, 8,
+- message_buffer, &md5cksum);
++ message_buffer, 0, &md5cksum);
+ if (ret)
+ goto out;
+
+--- linux-2.6.7/net/sunrpc/auth_gss/auth_gss.c.lsec 2004-06-15 23:19:22.000000000 -0600
++++ linux-2.6.7/net/sunrpc/auth_gss/auth_gss.c 2005-03-23 14:28:24.185372960 -0700
+@@ -45,6 +45,7 @@
+ #include <linux/socket.h>
+ #include <linux/in.h>
+ #include <linux/sched.h>
++#include <linux/pagemap.h>
+ #include <linux/sunrpc/clnt.h>
+ #include <linux/sunrpc/auth.h>
+ #include <linux/sunrpc/auth_gss.h>
+@@ -397,7 +398,7 @@ retry:
+ spin_unlock(&gss_auth->lock);
+ }
+ gss_release_msg(gss_msg);
+- dprintk("RPC: %4u gss_upcall for uid %u result %d", task->tk_pid,
++ dprintk("RPC: %4u gss_upcall for uid %u result %d\n", task->tk_pid,
+ uid, res);
+ return res;
+ out_sleep:
+@@ -740,6 +741,8 @@ gss_marshal(struct rpc_task *task, u32 *
+ maj_stat = gss_get_mic(ctx->gc_gss_ctx,
+ GSS_C_QOP_DEFAULT,
+ &verf_buf, &mic);
++ if (maj_stat == GSS_S_CONTEXT_EXPIRED)
++ cred->cr_flags |= RPCAUTH_CRED_DEAD;
+ if(maj_stat != 0){
+ printk("gss_marshal: gss_get_mic FAILED (%d)\n", maj_stat);
+ goto out_put_ctx;
+@@ -779,6 +782,7 @@ gss_validate(struct rpc_task *task, u32
+ struct xdr_netobj mic;
+ u32 flav,len;
+ u32 service;
++ u32 maj_stat;
+
+ dprintk("RPC: %4u gss_validate\n", task->tk_pid);
+
+@@ -794,8 +798,11 @@ gss_validate(struct rpc_task *task, u32
+ mic.data = (u8 *)p;
+ mic.len = len;
+
+- if (gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic, &qop_state))
+- goto out_bad;
++ maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic, &qop_state);
++ if (maj_stat == GSS_S_CONTEXT_EXPIRED)
++ cred->cr_flags |= RPCAUTH_CRED_DEAD;
++ if (maj_stat)
++ goto out_bad;
+ service = gss_pseudoflavor_to_service(ctx->gc_gss_ctx->mech_type,
+ gss_cred->gc_flavor);
+ switch (service) {
+@@ -807,6 +814,11 @@ gss_validate(struct rpc_task *task, u32
+ /* verifier data, flavor, length, length, sequence number: */
+ task->tk_auth->au_rslack = XDR_QUADLEN(len) + 4;
+ break;
++ case RPC_GSS_SVC_PRIVACY:
++ /* XXXJBF: Ugh. Going for a wild overestimate.
++ * Need some info from krb5 layer? */
++ task->tk_auth->au_rslack = XDR_QUADLEN(len) + 32;
++ break;
+ default:
+ goto out_bad;
+ }
+@@ -821,11 +833,11 @@ out_bad:
+ }
+
+ static inline int
+-gss_wrap_req_integ(struct gss_cl_ctx *ctx,
+- kxdrproc_t encode, void *rqstp, u32 *p, void *obj)
++gss_wrap_req_integ(struct rpc_cred *cred, kxdrproc_t encode,
++ struct rpc_rqst *rqstp, u32 *p, void *obj)
+ {
+- struct rpc_rqst *req = (struct rpc_rqst *)rqstp;
+- struct xdr_buf *snd_buf = &req->rq_snd_buf;
++ struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
++ struct xdr_buf *snd_buf = &rqstp->rq_snd_buf;
+ struct xdr_buf integ_buf;
+ u32 *integ_len = NULL;
+ struct xdr_netobj mic;
+@@ -836,7 +848,7 @@ gss_wrap_req_integ(struct gss_cl_ctx *ct
+
+ integ_len = p++;
+ offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base;
+- *p++ = htonl(req->rq_seqno);
++ *p++ = htonl(rqstp->rq_seqno);
+
+ status = encode(rqstp, p, obj);
+ if (status)
+@@ -848,7 +860,7 @@ gss_wrap_req_integ(struct gss_cl_ctx *ct
+ *integ_len = htonl(integ_buf.len);
+
+ /* guess whether we're in the head or the tail: */
+- if (snd_buf->page_len || snd_buf->tail[0].iov_len)
++ if (snd_buf->page_len || snd_buf->tail[0].iov_len)
+ iov = snd_buf->tail;
+ else
+ iov = snd_buf->head;
+@@ -857,6 +869,8 @@ gss_wrap_req_integ(struct gss_cl_ctx *ct
+
+ maj_stat = gss_get_mic(ctx->gc_gss_ctx,
+ GSS_C_QOP_DEFAULT, &integ_buf, &mic);
++ if (maj_stat == GSS_S_CONTEXT_EXPIRED)
++ cred->cr_flags |= RPCAUTH_CRED_DEAD;
+ status = -EIO; /* XXX? */
+ if (maj_stat)
+ return status;
+@@ -868,6 +882,113 @@ gss_wrap_req_integ(struct gss_cl_ctx *ct
+ return 0;
+ }
+
++static void
++priv_release_snd_buf(struct rpc_rqst *rqstp)
++{
++ int i;
++
++ for (i=0; i < rqstp->rq_enc_pages_num; i++)
++ __free_page(rqstp->rq_enc_pages[i]);
++ kfree(rqstp->rq_enc_pages);
++}
++
++static int
++alloc_enc_pages(struct rpc_rqst *rqstp)
++{
++ struct xdr_buf *snd_buf = &rqstp->rq_snd_buf;
++ int first, last, i;
++
++ if (snd_buf->page_len == 0) {
++ rqstp->rq_enc_pages_num = 0;
++ return 0;
++ }
++
++ first = snd_buf->page_base >> PAGE_CACHE_SHIFT;
++ last = (snd_buf->page_base + snd_buf->page_len - 1) >> PAGE_CACHE_SHIFT;
++ rqstp->rq_enc_pages_num = last - first + 1 + 1;
++ rqstp->rq_enc_pages
++ = kmalloc(rqstp->rq_enc_pages_num * sizeof(struct page *),
++ GFP_NOFS);
++ if (!rqstp->rq_enc_pages)
++ goto out;
++ for (i=0; i < rqstp->rq_enc_pages_num; i++) {
++ rqstp->rq_enc_pages[i] = alloc_page(GFP_NOFS);
++ if (rqstp->rq_enc_pages[i] == NULL)
++ goto out_free;
++ }
++ rqstp->rq_release_snd_buf = priv_release_snd_buf;
++ return 0;
++out_free:
++ for (i--; i >= 0; i--) {
++ __free_page(rqstp->rq_enc_pages[i]);
++ }
++out:
++ return -EAGAIN;
++}
++
++static inline int
++gss_wrap_req_priv(struct rpc_cred *cred, kxdrproc_t encode,
++ struct rpc_rqst *rqstp, u32 *p, void *obj)
++{
++ struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
++ struct xdr_buf *snd_buf = &rqstp->rq_snd_buf;
++ u32 offset;
++ u32 maj_stat;
++ int status;
++ u32 *opaque_len;
++ struct page **inpages;
++ int first;
++ int pad;
++ struct iovec *iov;
++ char *tmp;
++
++ opaque_len = p++;
++ offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base;
++ *p++ = htonl(rqstp->rq_seqno);
++
++ status = encode(rqstp, p, obj);
++ if (status)
++ return status;
++
++ status = alloc_enc_pages(rqstp);
++ if (status)
++ return status;
++ /* XXXJBF: Oops! Do we need rq_enc_pages really any more?? */
++ first = snd_buf->page_base >> PAGE_CACHE_SHIFT;
++ inpages = snd_buf->pages + first;
++ snd_buf->pages = rqstp->rq_enc_pages;
++ snd_buf->page_base -= first << PAGE_CACHE_SHIFT;
++ /* XXX?: tail needs to be separate if we want to be able to expand
++ * the head (since it's often put right after the head). But is
++ * expanding the head safe in any case? */
++ if (snd_buf->page_len || snd_buf->tail[0].iov_len) {
++ tmp = page_address(rqstp->rq_enc_pages[rqstp->rq_enc_pages_num - 1]);
++ memcpy(tmp, snd_buf->tail[0].iov_base, snd_buf->tail[0].iov_len);
++ snd_buf->tail[0].iov_base = tmp;
++ }
++ maj_stat = gss_wrap(ctx->gc_gss_ctx, GSS_C_QOP_DEFAULT, offset,
++ snd_buf, inpages);
++ if (maj_stat == GSS_S_CONTEXT_EXPIRED)
++ cred->cr_flags |= RPCAUTH_CRED_DEAD;
++ status = -EIO; /* XXX? */
++ if (maj_stat)
++ return status;
++
++ *opaque_len = htonl(snd_buf->len - offset);
++ /* guess whether we're in the head or the tail: */
++ if (snd_buf->page_len || snd_buf->tail[0].iov_len)
++ iov = snd_buf->tail;
++ else
++ iov = snd_buf->head;
++ p = iov->iov_base + iov->iov_len;
++ pad = 3 - ((snd_buf->len - offset - 1) & 3);
++ memset(p, 0, pad);
++ iov->iov_len += pad;
++ snd_buf->len += pad;
++
++ return 0;
++}
++
+ static int
+ gss_wrap_req(struct rpc_task *task,
+ kxdrproc_t encode, void *rqstp, u32 *p, void *obj)
+@@ -894,9 +1015,11 @@ gss_wrap_req(struct rpc_task *task,
+ status = encode(rqstp, p, obj);
+ goto out;
+ case RPC_GSS_SVC_INTEGRITY:
+- status = gss_wrap_req_integ(ctx, encode, rqstp, p, obj);
++ status = gss_wrap_req_integ(cred, encode, rqstp, p, obj);
+ goto out;
+ case RPC_GSS_SVC_PRIVACY:
++ status = gss_wrap_req_priv(cred, encode, rqstp, p, obj);
++ goto out;
+ default:
+ goto out;
+ }
+@@ -907,11 +1030,10 @@ out:
+ }
+
+ static inline int
+-gss_unwrap_resp_integ(struct gss_cl_ctx *ctx,
+- kxdrproc_t decode, void *rqstp, u32 **p, void *obj)
++gss_unwrap_resp_integ(struct rpc_cred *cred, struct rpc_rqst *rqstp, u32 **p)
+ {
+- struct rpc_rqst *req = (struct rpc_rqst *)rqstp;
+- struct xdr_buf *rcv_buf = &req->rq_rcv_buf;
++ struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
++ struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf;
+ struct xdr_buf integ_buf;
+ struct xdr_netobj mic;
+ u32 data_offset, mic_offset;
+@@ -926,7 +1048,7 @@ gss_unwrap_resp_integ(struct gss_cl_ctx
+ mic_offset = integ_len + data_offset;
+ if (mic_offset > rcv_buf->len)
+ return status;
+- if (ntohl(*(*p)++) != req->rq_seqno)
++ if (ntohl(*(*p)++) != rqstp->rq_seqno)
+ return status;
+
+ if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset,
+@@ -938,11 +1060,44 @@ gss_unwrap_resp_integ(struct gss_cl_ctx
+
+ maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf,
+ &mic, NULL);
++ if (maj_stat == GSS_S_CONTEXT_EXPIRED)
++ cred->cr_flags |= RPCAUTH_CRED_DEAD;
+ if (maj_stat != GSS_S_COMPLETE)
+ return status;
+ return 0;
+ }
+
++static inline int
++gss_unwrap_resp_priv(struct rpc_cred *cred, struct rpc_rqst *rqstp, u32 **p)
++{
++ struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
++ struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf;
++ u32 offset, out_offset;
++ u32 opaque_len;
++ u32 maj_stat;
++ int status = -EIO;
++
++ opaque_len = ntohl(*(*p)++);
++ offset = (u8 *)(*p) - (u8 *)rcv_buf->head[0].iov_base;
++ if (offset + opaque_len > rcv_buf->len)
++ return status;
++ /* remove padding: */
++ rcv_buf->len = offset + opaque_len;
++
++ maj_stat = gss_unwrap(ctx->gc_gss_ctx, GSS_C_QOP_DEFAULT,
++ offset, rcv_buf, &out_offset);
++ if (maj_stat == GSS_S_CONTEXT_EXPIRED)
++ cred->cr_flags |= RPCAUTH_CRED_DEAD;
++ if (maj_stat != GSS_S_COMPLETE)
++ return status;
++ *p = (u32 *)(rcv_buf->head[0].iov_base + out_offset);
++ if (ntohl(*(*p)++) != rqstp->rq_seqno)
++ return status;
++
++ return 0;
++}
++
++
+ static int
+ gss_unwrap_resp(struct rpc_task *task,
+ kxdrproc_t decode, void *rqstp, u32 *p, void *obj)
+@@ -962,12 +1117,16 @@ gss_unwrap_resp(struct rpc_task *task,
+ case RPC_GSS_SVC_NONE:
+ goto out_decode;
+ case RPC_GSS_SVC_INTEGRITY:
+- status = gss_unwrap_resp_integ(ctx, decode,
+- rqstp, &p, obj);
++ status = gss_unwrap_resp_integ(cred, rqstp, &p);
+ if (status)
+ goto out;
+ break;
+ case RPC_GSS_SVC_PRIVACY:
++ status = gss_unwrap_resp_priv(cred, rqstp, &p);
++ if (status)
++ goto out;
++ break;
++
+ default:
+ goto out;
+ }
+--- linux-2.6.7/net/sunrpc/auth_gss/gss_spkm3_mech.c.lsec 2005-03-23 14:28:24.187372656 -0700
++++ linux-2.6.7/net/sunrpc/auth_gss/gss_spkm3_mech.c 2005-03-23 14:28:24.186372808 -0700
+@@ -0,0 +1,296 @@
++/*
++ * linux/net/sunrpc/gss_spkm3_mech.c
++ *
++ * Copyright (c) 2003 The Regents of the University of Michigan.
++ * All rights reserved.
++ *
++ * Andy Adamson <andros@umich.edu>
++ * J. Bruce Fields <bfields@umich.edu>
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ * 2. Redistributions in binary form must reproduce the above copyright
++ * notice, this list of conditions and the following disclaimer in the
++ * documentation and/or other materials provided with the distribution.
++ * 3. Neither the name of the University nor the names of its
++ * contributors may be used to endorse or promote products derived
++ * from this software without specific prior written permission.
++ *
++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ *
++ */
++
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/types.h>
++#include <linux/slab.h>
++#include <linux/sunrpc/auth.h>
++#include <linux/in.h>
++#include <linux/sunrpc/svcauth_gss.h>
++#include <linux/sunrpc/gss_spkm3.h>
++#include <linux/sunrpc/xdr.h>
++#include <linux/crypto.h>
++
++#ifdef RPC_DEBUG
++# define RPCDBG_FACILITY RPCDBG_AUTH
++#endif
++
++struct xdr_netobj gss_mech_spkm3_oid =
++ {7, "\053\006\001\005\005\001\003"};
++
++static inline int
++get_bytes(char **ptr, const char *end, void *res, int len)
++{
++ char *p, *q;
++ p = *ptr;
++ q = p + len;
++ if (q > end || q < p)
++ return -1;
++ memcpy(res, p, len);
++ *ptr = q;
++ return 0;
++}
++
++static inline int
++get_netobj(char **ptr, const char *end, struct xdr_netobj *res)
++{
++ char *p, *q;
++ p = *ptr;
++ if (get_bytes(&p, end, &res->len, sizeof(res->len)))
++ return -1;
++ q = p + res->len;
++ if(res->len == 0)
++ goto out_nocopy;
++ if (q > end || q < p)
++ return -1;
++ if (!(res->data = kmalloc(res->len, GFP_KERNEL)))
++ return -1;
++ memcpy(res->data, p, res->len);
++out_nocopy:
++ *ptr = q;
++ return 0;
++}
++
++static inline int
++get_key(char **p, char *end, struct crypto_tfm **res, int *resalg)
++{
++ struct xdr_netobj key = {
++ .len = 0,
++ .data = NULL,
++ };
++ int alg_mode,setkey = 0;
++ char *alg_name;
++
++ if (get_bytes(p, end, resalg, sizeof(int)))
++ goto out_err;
++ if ((get_netobj(p, end, &key)))
++ goto out_err;
++
++ switch (*resalg) {
++ case NID_des_cbc:
++ alg_name = "des";
++ alg_mode = CRYPTO_TFM_MODE_CBC;
++ setkey = 1;
++ break;
++ case NID_md5:
++ if (key.len == 0) {
++ dprintk("RPC: SPKM3 get_key: NID_md5 zero Key length\n");
++ }
++ alg_name = "md5";
++ alg_mode = 0;
++ setkey = 0;
++ break;
++ case NID_cast5_cbc:
++ dprintk("RPC: SPKM3 get_key: case cast5_cbc, UNSUPPORTED \n");
++ goto out_err;
++ break;
++ default:
++ dprintk("RPC: SPKM3 get_key: unsupported algorithm %d", *resalg);
++ goto out_err_free_key;
++ }
++ if (!(*res = crypto_alloc_tfm(alg_name, alg_mode)))
++ goto out_err_free_key;
++ if (setkey) {
++ if (crypto_cipher_setkey(*res, key.data, key.len))
++ goto out_err_free_tfm;
++ }
++
++ if(key.len > 0)
++ kfree(key.data);
++ return 0;
++
++out_err_free_tfm:
++ crypto_free_tfm(*res);
++out_err_free_key:
++ if(key.len > 0)
++ kfree(key.data);
++out_err:
++ return -1;
++}
++
++static u32
++gss_import_sec_context_spkm3(struct xdr_netobj *inbuf,
++ struct gss_ctx *ctx_id)
++{
++ char *p = inbuf->data;
++ char *end = inbuf->data + inbuf->len;
++ struct spkm3_ctx *ctx;
++
++ if (!(ctx = kmalloc(sizeof(*ctx), GFP_KERNEL)))
++ goto out_err;
++ memset(ctx, 0, sizeof(*ctx));
++
++ if (get_netobj(&p, end, &ctx->ctx_id))
++ goto out_err_free_ctx;
++
++ if (get_bytes(&p, end, &ctx->qop, sizeof(ctx->qop)))
++ goto out_err_free_ctx_id;
++
++ if (get_netobj(&p, end, &ctx->mech_used))
++ goto out_err_free_mech;
++
++ if (get_bytes(&p, end, &ctx->ret_flags, sizeof(ctx->ret_flags)))
++ goto out_err_free_mech;
++
++ if (get_bytes(&p, end, &ctx->req_flags, sizeof(ctx->req_flags)))
++ goto out_err_free_mech;
++
++ if (get_netobj(&p, end, &ctx->share_key))
++ goto out_err_free_s_key;
++
++ if (get_key(&p, end, &ctx->derived_conf_key, &ctx->conf_alg)) {
++ dprintk("RPC: SPKM3 confidentiality key will be NULL\n");
++ }
++
++ if (get_key(&p, end, &ctx->derived_integ_key, &ctx->intg_alg)) {
++ dprintk("RPC: SPKM3 integrity key will be NULL\n");
++ }
++
++ if (get_bytes(&p, end, &ctx->owf_alg, sizeof(ctx->owf_alg)))
++ goto out_err_free_s_key;
++
++ if (get_bytes(&p, end, &ctx->owf_alg, sizeof(ctx->owf_alg)))
++ goto out_err_free_s_key;
++
++ if (p != end)
++ goto out_err_free_s_key;
++
++ ctx_id->internal_ctx_id = ctx;
++
++	dprintk("Successfully imported new spkm context.\n");
++ return 0;
++
++out_err_free_s_key:
++ kfree(ctx->share_key.data);
++out_err_free_mech:
++ kfree(ctx->mech_used.data);
++out_err_free_ctx_id:
++ kfree(ctx->ctx_id.data);
++out_err_free_ctx:
++ kfree(ctx);
++out_err:
++ return GSS_S_FAILURE;
++}
++
++void
++gss_delete_sec_context_spkm3(void *internal_ctx) {
++ struct spkm3_ctx *sctx = internal_ctx;
++
++ if(sctx->derived_integ_key)
++ crypto_free_tfm(sctx->derived_integ_key);
++ if(sctx->derived_conf_key)
++ crypto_free_tfm(sctx->derived_conf_key);
++ if(sctx->share_key.data)
++ kfree(sctx->share_key.data);
++ if(sctx->mech_used.data)
++ kfree(sctx->mech_used.data);
++ kfree(sctx);
++}
++
++u32
++gss_verify_mic_spkm3(struct gss_ctx *ctx,
++ struct xdr_buf *signbuf,
++ struct xdr_netobj *checksum,
++ u32 *qstate) {
++ u32 maj_stat = 0;
++ int qop_state = 0;
++ struct spkm3_ctx *sctx = ctx->internal_ctx_id;
++
++ dprintk("RPC: gss_verify_mic_spkm3 calling spkm3_read_token\n");
++ maj_stat = spkm3_read_token(sctx, checksum, signbuf, &qop_state,
++ SPKM_MIC_TOK);
++
++ if (!maj_stat && qop_state)
++ *qstate = qop_state;
++
++ dprintk("RPC: gss_verify_mic_spkm3 returning %d\n", maj_stat);
++ return maj_stat;
++}
++
++u32
++gss_get_mic_spkm3(struct gss_ctx *ctx,
++ u32 qop,
++ struct xdr_buf *message_buffer,
++ struct xdr_netobj *message_token) {
++ u32 err = 0;
++ struct spkm3_ctx *sctx = ctx->internal_ctx_id;
++
++ dprintk("RPC: gss_get_mic_spkm3\n");
++
++ err = spkm3_make_token(sctx, qop, message_buffer,
++ message_token, SPKM_MIC_TOK);
++ return err;
++}
++
++static struct gss_api_ops gss_spkm3_ops = {
++ .gss_import_sec_context = gss_import_sec_context_spkm3,
++ .gss_get_mic = gss_get_mic_spkm3,
++ .gss_verify_mic = gss_verify_mic_spkm3,
++ .gss_delete_sec_context = gss_delete_sec_context_spkm3,
++};
++
++static struct pf_desc gss_spkm3_pfs[] = {
++ {RPC_AUTH_GSS_SPKM, 0, RPC_GSS_SVC_NONE, "spkm3"},
++ {RPC_AUTH_GSS_SPKMI, 0, RPC_GSS_SVC_INTEGRITY, "spkm3i"},
++};
++
++static struct gss_api_mech gss_spkm3_mech = {
++ .gm_name = "spkm3",
++ .gm_owner = THIS_MODULE,
++ .gm_ops = &gss_spkm3_ops,
++ .gm_pf_num = ARRAY_SIZE(gss_spkm3_pfs),
++ .gm_pfs = gss_spkm3_pfs,
++};
++
++static int __init init_spkm3_module(void)
++{
++ int status;
++
++ status = gss_mech_register(&gss_spkm3_mech);
++ if (status)
++ printk("Failed to register spkm3 gss mechanism!\n");
++ return 0;
++}
++
++static void __exit cleanup_spkm3_module(void)
++{
++ gss_mech_unregister(&gss_spkm3_mech);
++}
++
++MODULE_LICENSE("GPL");
++module_init(init_spkm3_module);
++module_exit(cleanup_spkm3_module);
+--- linux-2.6.7/net/sunrpc/auth_gss/gss_krb5_crypto.c.lsec 2004-06-15 23:18:55.000000000 -0600
++++ linux-2.6.7/net/sunrpc/auth_gss/gss_krb5_crypto.c 2005-03-23 14:28:24.840273400 -0700
+@@ -139,17 +139,91 @@ buf_to_sg(struct scatterlist *sg, char *
+ sg->length = len;
+ }
+
++static int
++process_xdr_buf(struct xdr_buf *buf, int offset, int len,
++ int (*actor)(struct scatterlist *, void *), void *data)
++{
++ int i, page_len, thislen, page_offset, ret = 0;
++ struct scatterlist sg[1];
++
++ if (offset >= buf->head[0].iov_len) {
++ offset -= buf->head[0].iov_len;
++ } else {
++ thislen = buf->head[0].iov_len - offset;
++ if (thislen > len)
++ thislen = len;
++ buf_to_sg(sg, buf->head[0].iov_base + offset, thislen);
++ ret = actor(sg, data);
++ if (ret)
++ goto out;
++ offset = 0;
++ len -= thislen;
++ }
++ if (len == 0)
++ goto out;
++
++ if (offset >= buf->page_len) {
++ offset -= buf->page_len;
++ } else {
++ page_len = buf->page_len - offset;
++ if (page_len > len)
++ page_len = len;
++ len -= page_len;
++ page_offset = (offset + buf->page_base) & (PAGE_CACHE_SIZE - 1);
++ i = (offset + buf->page_base) >> PAGE_CACHE_SHIFT;
++ thislen = PAGE_CACHE_SIZE - page_offset;
++ do {
++ if (thislen > page_len)
++ thislen = page_len;
++ sg->page = buf->pages[i];
++ sg->offset = page_offset;
++ sg->length = thislen;
++ ret = actor(sg, data);
++ if (ret)
++ goto out;
++ page_len -= thislen;
++ i++;
++ page_offset = 0;
++ thislen = PAGE_CACHE_SIZE;
++ } while (page_len != 0);
++ offset = 0;
++ }
++ if (len == 0)
++ goto out;
++
++ if (offset < buf->tail[0].iov_len) {
++ thislen = buf->tail[0].iov_len - offset;
++ if (thislen > len)
++ thislen = len;
++ buf_to_sg(sg, buf->tail[0].iov_base + offset, thislen);
++ ret = actor(sg, data);
++ len -= thislen;
++ }
++ if (len != 0)
++ ret = -EINVAL;
++out:
++ return ret;
++}
++
++static int
++checksummer(struct scatterlist *sg, void *data)
++{
++ struct crypto_tfm *tfm = (struct crypto_tfm *)data;
++
++ crypto_digest_update(tfm, sg, 1);
++
++ return 0;
++}
++
+ /* checksum the plaintext data and hdrlen bytes of the token header */
+ s32
+ make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body,
+- struct xdr_netobj *cksum)
++ int body_offset, struct xdr_netobj *cksum)
+ {
+ char *cksumname;
+ struct crypto_tfm *tfm = NULL; /* XXX add to ctx? */
+ struct scatterlist sg[1];
+ u32 code = GSS_S_FAILURE;
+- int len, thislen, offset;
+- int i;
+
+ switch (cksumtype) {
+ case CKSUMTYPE_RSA_MD5:
+@@ -169,35 +243,8 @@ make_checksum(s32 cksumtype, char *heade
+ crypto_digest_init(tfm);
+ buf_to_sg(sg, header, hdrlen);
+ crypto_digest_update(tfm, sg, 1);
+- if (body->head[0].iov_len) {
+- buf_to_sg(sg, body->head[0].iov_base, body->head[0].iov_len);
+- crypto_digest_update(tfm, sg, 1);
+- }
+-
+- len = body->page_len;
+- if (len != 0) {
+- offset = body->page_base & (PAGE_CACHE_SIZE - 1);
+- i = body->page_base >> PAGE_CACHE_SHIFT;
+- thislen = PAGE_CACHE_SIZE - offset;
+- do {
+- if (thislen > len)
+- thislen = len;
+- sg->page = body->pages[i];
+- sg->offset = offset;
+- sg->length = thislen;
+- kmap(sg->page); /* XXX kmap_atomic? */
+- crypto_digest_update(tfm, sg, 1);
+- kunmap(sg->page);
+- len -= thislen;
+- i++;
+- offset = 0;
+- thislen = PAGE_CACHE_SIZE;
+- } while(len != 0);
+- }
+- if (body->tail[0].iov_len) {
+- buf_to_sg(sg, body->tail[0].iov_base, body->tail[0].iov_len);
+- crypto_digest_update(tfm, sg, 1);
+- }
++ process_xdr_buf(body, body_offset, body->len - body_offset,
++ checksummer, tfm);
+ crypto_digest_final(tfm, cksum->data);
+ code = 0;
+ out:
+@@ -207,3 +254,154 @@ out:
+ }
+
+ EXPORT_SYMBOL(make_checksum);
++
++struct encryptor_desc {
++ u8 iv[8]; /* XXX hard-coded blocksize */
++ struct crypto_tfm *tfm;
++ int pos;
++ struct xdr_buf *outbuf;
++ struct page **pages;
++ struct scatterlist infrags[4];
++ struct scatterlist outfrags[4];
++ int fragno;
++ int fraglen;
++};
++
++static int
++encryptor(struct scatterlist *sg, void *data)
++{
++ struct encryptor_desc *desc = data;
++ struct xdr_buf *outbuf = desc->outbuf;
++ struct page *in_page;
++ int thislen = desc->fraglen + sg->length;
++ int fraglen, ret;
++ int page_pos;
++
++ /* Worst case is 4 fragments: head, end of page 1, start
++ * of page 2, tail. Anything more is a bug. */
++ BUG_ON(desc->fragno > 3);
++ desc->infrags[desc->fragno] = *sg;
++ desc->outfrags[desc->fragno] = *sg;
++
++ page_pos = desc->pos - outbuf->head[0].iov_len;
++ if (page_pos >= 0 && page_pos < outbuf->page_len) {
++ /* pages are not in place: */
++ int i = (page_pos + outbuf->page_base) >> PAGE_CACHE_SHIFT;
++ in_page = desc->pages[i];
++ } else {
++ in_page = sg->page;
++ }
++ desc->infrags[desc->fragno].page = in_page;
++ desc->fragno++;
++ desc->fraglen += sg->length;
++ desc->pos += sg->length;
++
++ fraglen = thislen & 7; /* XXX hardcoded blocksize */
++ thislen -= fraglen;
++
++ if (thislen == 0)
++ return 0;
++
++ ret = crypto_cipher_encrypt_iv(desc->tfm, desc->outfrags, desc->infrags,
++ thislen, desc->iv);
++ if (ret)
++ return ret;
++ if (fraglen) {
++ desc->outfrags[0].page = sg->page;
++ desc->outfrags[0].offset = sg->offset + sg->length - fraglen;
++ desc->outfrags[0].length = fraglen;
++ desc->infrags[0] = desc->outfrags[0];
++ desc->infrags[0].page = in_page;
++ desc->fragno = 1;
++ desc->fraglen = fraglen;
++ } else {
++ desc->fragno = 0;
++ desc->fraglen = 0;
++ }
++ return 0;
++}
++
++int
++gss_encrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *buf, int offset,
++ struct page **pages)
++{
++ int ret;
++ struct encryptor_desc desc;
++
++ BUG_ON((buf->len - offset) % crypto_tfm_alg_blocksize(tfm) != 0);
++
++ memset(desc.iv, 0, sizeof(desc.iv));
++ desc.tfm = tfm;
++ desc.pos = offset;
++ desc.outbuf = buf;
++ desc.pages = pages;
++ desc.fragno = 0;
++ desc.fraglen = 0;
++
++ ret = process_xdr_buf(buf, offset, buf->len - offset, encryptor, &desc);
++ return ret;
++}
++
++EXPORT_SYMBOL(gss_encrypt_xdr_buf);
++
++struct decryptor_desc {
++ u8 iv[8]; /* XXX hard-coded blocksize */
++ struct crypto_tfm *tfm;
++ struct scatterlist frags[4];
++ int fragno;
++ int fraglen;
++};
++
++static int
++decryptor(struct scatterlist *sg, void *data)
++{
++ struct decryptor_desc *desc = data;
++ int thislen = desc->fraglen + sg->length;
++ int fraglen, ret;
++
++ /* Worst case is 4 fragments: head, end of page 1, start
++ * of page 2, tail. Anything more is a bug. */
++ BUG_ON(desc->fragno > 3);
++ desc->frags[desc->fragno] = *sg;
++ desc->fragno++;
++ desc->fraglen += sg->length;
++
++ fraglen = thislen & 7; /* XXX hardcoded blocksize */
++ thislen -= fraglen;
++
++ if (thislen == 0)
++ return 0;
++
++ ret = crypto_cipher_decrypt_iv(desc->tfm, desc->frags, desc->frags,
++ thislen, desc->iv);
++ if (ret)
++ return ret;
++ if (fraglen) {
++ desc->frags[0].page = sg->page;
++ desc->frags[0].offset = sg->offset + sg->length - fraglen;
++ desc->frags[0].length = fraglen;
++ desc->fragno = 1;
++ desc->fraglen = fraglen;
++ } else {
++ desc->fragno = 0;
++ desc->fraglen = 0;
++ }
++ return 0;
++}
++
++int
++gss_decrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *buf, int offset)
++{
++ struct decryptor_desc desc;
++
++ /* XXXJBF: */
++ BUG_ON((buf->len - offset) % crypto_tfm_alg_blocksize(tfm) != 0);
++
++ memset(desc.iv, 0, sizeof(desc.iv));
++ desc.tfm = tfm;
++ desc.fragno = 0;
++ desc.fraglen = 0;
++ return process_xdr_buf(buf, offset, buf->len - offset, decryptor, &desc);
++}
++
++EXPORT_SYMBOL(gss_decrypt_xdr_buf);
+--- linux-2.6.7/net/sunrpc/auth_gss/gss_spkm3_seal.c.lsec 2005-03-23 14:28:24.239364752 -0700
++++ linux-2.6.7/net/sunrpc/auth_gss/gss_spkm3_seal.c 2005-03-23 14:28:24.238364904 -0700
+@@ -0,0 +1,132 @@
++/*
++ * linux/net/sunrpc/gss_spkm3_seal.c
++ *
++ * Copyright (c) 2003 The Regents of the University of Michigan.
++ * All rights reserved.
++ *
++ * Andy Adamson <andros@umich.edu>
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ * 2. Redistributions in binary form must reproduce the above copyright
++ * notice, this list of conditions and the following disclaimer in the
++ * documentation and/or other materials provided with the distribution.
++ * 3. Neither the name of the University nor the names of its
++ * contributors may be used to endorse or promote products derived
++ * from this software without specific prior written permission.
++ *
++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ *
++ */
++
++#include <linux/types.h>
++#include <linux/slab.h>
++#include <linux/jiffies.h>
++#include <linux/sunrpc/gss_spkm3.h>
++#include <linux/random.h>
++#include <linux/crypto.h>
++
++#ifdef RPC_DEBUG
++# define RPCDBG_FACILITY RPCDBG_AUTH
++#endif
++
++/*
++ * spkm3_make_token()
++ *
++ * Only SPKM_MIC_TOK with md5 intg-alg is supported
++ */
++
++u32
++spkm3_make_token(struct spkm3_ctx *ctx, int qop_req,
++ struct xdr_buf * text, struct xdr_netobj * token,
++ int toktype)
++{
++ s32 checksum_type;
++ char tokhdrbuf[25];
++ struct xdr_netobj md5cksum = {.len = 0, .data = NULL};
++ struct xdr_netobj mic_hdr = {.len = 0, .data = tokhdrbuf};
++ int tmsglen, tokenlen = 0;
++ unsigned char *ptr;
++ s32 now;
++ int ctxelen = 0, ctxzbit = 0;
++ int md5elen = 0, md5zbit = 0;
++
++ dprintk("RPC: spkm3_make_token\n");
++
++ now = jiffies;
++ if (qop_req != 0)
++ goto out_err;
++
++ if (ctx->ctx_id.len != 16) {
++ dprintk("RPC: spkm3_make_token BAD ctx_id.len %d\n",
++ ctx->ctx_id.len);
++ goto out_err;
++ }
++
++ switch (ctx->intg_alg) {
++ case NID_md5:
++ checksum_type = CKSUMTYPE_RSA_MD5;
++ break;
++ default:
++ dprintk("RPC: gss_spkm3_seal: ctx->signalg %d not"
++ " supported\n", ctx->intg_alg);
++ goto out_err;
++ }
++ /* XXX since we don't support WRAP, perhaps we don't care... */
++ if (ctx->conf_alg != NID_cast5_cbc) {
++ dprintk("RPC: gss_spkm3_seal: ctx->sealalg %d not supported\n",
++ ctx->conf_alg);
++ goto out_err;
++ }
++
++ if (toktype == SPKM_MIC_TOK) {
++ tmsglen = 0;
++ /* Calculate checksum over the mic-header */
++ asn1_bitstring_len(&ctx->ctx_id, &ctxelen, &ctxzbit);
++ spkm3_mic_header(&mic_hdr.data, &mic_hdr.len, ctx->ctx_id.data,
++ ctxelen, ctxzbit);
++
++ if (make_checksum(checksum_type, mic_hdr.data, mic_hdr.len,
++ text, &md5cksum))
++ goto out_err;
++
++ asn1_bitstring_len(&md5cksum, &md5elen, &md5zbit);
++ tokenlen = 10 + ctxelen + 1 + 2 + md5elen + 1;
++
++ /* Create token header using generic routines */
++ token->len = g_token_size(&ctx->mech_used, tokenlen + tmsglen);
++
++ ptr = token->data;
++ g_make_token_header(&ctx->mech_used, tokenlen + tmsglen, &ptr);
++
++ spkm3_make_mic_token(&ptr, tokenlen, &mic_hdr, &md5cksum, md5elen, md5zbit);
++ } else if (toktype == SPKM_WRAP_TOK) { /* Not Supported */
++ dprintk("RPC: gss_spkm3_seal: SPKM_WRAP_TOK not supported\n");
++ goto out_err;
++ }
++ kfree(md5cksum.data);
++
++ /* XXX need to implement sequence numbers, and ctx->expired */
++
++ return GSS_S_COMPLETE;
++out_err:
++ if (md5cksum.data)
++ kfree(md5cksum.data);
++ token->data = 0;
++ token->len = 0;
++ return GSS_S_FAILURE;
++}
+--- linux-2.6.7/net/sunrpc/auth_gss/svcauth_gss.c.lsec 2004-06-15 23:19:22.000000000 -0600
++++ linux-2.6.7/net/sunrpc/auth_gss/svcauth_gss.c 2005-03-23 14:28:24.405339520 -0700
+@@ -37,6 +37,7 @@
+ *
+ */
+
++#include <asm/bitops.h>
+ #include <linux/types.h>
+ #include <linux/module.h>
+ #include <linux/pagemap.h>
+@@ -78,7 +79,6 @@ struct rsi {
+
+ static struct cache_head *rsi_table[RSI_HASHMAX];
+ static struct cache_detail rsi_cache;
+-static struct rsi *rsi_lookup(struct rsi *item, int set);
+
+ static void rsi_free(struct rsi *rsii)
+ {
+@@ -125,38 +125,6 @@ static inline int dup_netobj(struct xdr_
+ return dup_to_netobj(dst, src->data, src->len);
+ }
+
+-static inline void rsi_init(struct rsi *new, struct rsi *item)
+-{
+- new->out_handle.data = NULL;
+- new->out_handle.len = 0;
+- new->out_token.data = NULL;
+- new->out_token.len = 0;
+- new->in_handle.len = item->in_handle.len;
+- item->in_handle.len = 0;
+- new->in_token.len = item->in_token.len;
+- item->in_token.len = 0;
+- new->in_handle.data = item->in_handle.data;
+- item->in_handle.data = NULL;
+- new->in_token.data = item->in_token.data;
+- item->in_token.data = NULL;
+-}
+-
+-static inline void rsi_update(struct rsi *new, struct rsi *item)
+-{
+- BUG_ON(new->out_handle.data || new->out_token.data);
+- new->out_handle.len = item->out_handle.len;
+- item->out_handle.len = 0;
+- new->out_token.len = item->out_token.len;
+- item->out_token.len = 0;
+- new->out_handle.data = item->out_handle.data;
+- item->out_handle.data = NULL;
+- new->out_token.data = item->out_token.data;
+- item->out_token.data = NULL;
+-
+- new->major_status = item->major_status;
+- new->minor_status = item->minor_status;
+-}
+-
+ static void rsi_request(struct cache_detail *cd,
+ struct cache_head *h,
+ char **bpp, int *blen)
+@@ -168,6 +136,75 @@ static void rsi_request(struct cache_det
+ (*bpp)[-1] = '\n';
+ }
+
++static inline int
++gssd_reply(struct rsi *item)
++{
++ struct rsi *tmp;
++ struct cache_head **hp, **head;
++
++ head = &rsi_cache.hash_table[rsi_hash(item)];
++ write_lock(&rsi_cache.hash_lock);
++ for (hp = head; *hp != NULL; hp = &tmp->h.next) {
++ tmp = container_of(*hp, struct rsi, h);
++ if (rsi_match(tmp, item)) {
++ cache_get(&tmp->h);
++ clear_bit(CACHE_HASHED, &tmp->h.flags);
++ *hp = tmp->h.next;
++ tmp->h.next = NULL;
++ rsi_cache.entries--;
++ if (test_bit(CACHE_VALID, &tmp->h.flags)) {
++ write_unlock(&rsi_cache.hash_lock);
++ rsi_put(&tmp->h, &rsi_cache);
++ return -EINVAL;
++ }
++ set_bit(CACHE_HASHED, &item->h.flags);
++ item->h.next = *hp;
++ *hp = &item->h;
++ rsi_cache.entries++;
++ set_bit(CACHE_VALID, &item->h.flags);
++ item->h.last_refresh = get_seconds();
++ write_unlock(&rsi_cache.hash_lock);
++ cache_fresh(&rsi_cache, &tmp->h, 0);
++ rsi_put(&tmp->h, &rsi_cache);
++ return 0;
++ }
++ }
++ write_unlock(&rsi_cache.hash_lock);
++ return -EINVAL;
++}
++
++static inline struct rsi *
++gssd_upcall(struct rsi *item, struct svc_rqst *rqstp)
++{
++ struct rsi *tmp;
++ struct cache_head **hp, **head;
++
++ head = &rsi_cache.hash_table[rsi_hash(item)];
++ read_lock(&rsi_cache.hash_lock);
++ for (hp = head; *hp != NULL; hp = &tmp->h.next) {
++ tmp = container_of(*hp, struct rsi, h);
++ if (rsi_match(tmp, item)) {
++ if (!test_bit(CACHE_VALID, &tmp->h.flags)) {
++ read_unlock(&rsi_cache.hash_lock);
++ return NULL;
++ }
++ *hp = tmp->h.next;
++ tmp->h.next = NULL;
++ rsi_cache.entries--;
++ read_unlock(&rsi_cache.hash_lock);
++ return tmp;
++ }
++ }
++ cache_get(&item->h);
++ item->h.next = *head;
++ *head = &item->h;
++ rsi_cache.entries++;
++ read_unlock(&rsi_cache.hash_lock);
++ cache_get(&item->h);
++ if (cache_check(&rsi_cache, &item->h, &rqstp->rq_chandle))
++ return NULL;
++ return item;
++}
+
+ static int rsi_parse(struct cache_detail *cd,
+ char *mesg, int mlen)
+@@ -176,17 +213,22 @@ static int rsi_parse(struct cache_detail
+ char *buf = mesg;
+ char *ep;
+ int len;
+- struct rsi rsii, *rsip = NULL;
++ struct rsi *rsii;
+ time_t expiry;
+ int status = -EINVAL;
+
+- memset(&rsii, 0, sizeof(rsii));
++ rsii = kmalloc(sizeof(*rsii), GFP_KERNEL);
++ if (!rsii)
++ return -ENOMEM;
++ memset(rsii, 0, sizeof(*rsii));
++ cache_init(&rsii->h);
++
+ /* handle */
+ len = qword_get(&mesg, buf, mlen);
+ if (len < 0)
+ goto out;
+ status = -ENOMEM;
+- if (dup_to_netobj(&rsii.in_handle, buf, len))
++ if (dup_to_netobj(&rsii->in_handle, buf, len))
+ goto out;
+
+ /* token */
+@@ -195,10 +237,9 @@ static int rsi_parse(struct cache_detail
+ if (len < 0)
+ goto out;
+ status = -ENOMEM;
+- if (dup_to_netobj(&rsii.in_token, buf, len))
++ if (dup_to_netobj(&rsii->in_token, buf, len))
+ goto out;
+
+- rsii.h.flags = 0;
+ /* expiry */
+ expiry = get_expiry(&mesg);
+ status = -EINVAL;
+@@ -212,13 +253,13 @@ static int rsi_parse(struct cache_detail
+ if (len == 0) {
+ goto out;
+ } else {
+- rsii.major_status = simple_strtoul(buf, &ep, 10);
++ rsii->major_status = simple_strtoul(buf, &ep, 10);
+ if (*ep)
+ goto out;
+ len = qword_get(&mesg, buf, mlen);
+ if (len <= 0)
+ goto out;
+- rsii.minor_status = simple_strtoul(buf, &ep, 10);
++ rsii->minor_status = simple_strtoul(buf, &ep, 10);
+ if (*ep)
+ goto out;
+
+@@ -227,7 +268,7 @@ static int rsi_parse(struct cache_detail
+ if (len < 0)
+ goto out;
+ status = -ENOMEM;
+- if (dup_to_netobj(&rsii.out_handle, buf, len))
++ if (dup_to_netobj(&rsii->out_handle, buf, len))
+ goto out;
+
+ /* out_token */
+@@ -236,16 +277,14 @@ static int rsi_parse(struct cache_detail
+ if (len < 0)
+ goto out;
+ status = -ENOMEM;
+- if (dup_to_netobj(&rsii.out_token, buf, len))
++ if (dup_to_netobj(&rsii->out_token, buf, len))
+ goto out;
+ }
+- rsii.h.expiry_time = expiry;
+- rsip = rsi_lookup(&rsii, 1);
+- status = 0;
++ rsii->h.expiry_time = expiry;
++ status = gssd_reply(rsii);
+ out:
+- rsi_free(&rsii);
+- if (rsip)
+- rsi_put(&rsip->h, &rsi_cache);
++ if (rsii)
++ rsi_put(&rsii->h, &rsi_cache);
+ return status;
+ }
+
+@@ -258,8 +297,6 @@ static struct cache_detail rsi_cache = {
+ .cache_parse = rsi_parse,
+ };
+
+-static DefineSimpleCacheLookup(rsi, 0)
+-
+ /*
+ * The rpcsec_context cache is used to store a context that is
+ * used in data exchange.
+@@ -292,7 +329,6 @@ struct rsc {
+
+ static struct cache_head *rsc_table[RSC_HASHMAX];
+ static struct cache_detail rsc_cache;
+-static struct rsc *rsc_lookup(struct rsc *item, int set);
+
+ static void rsc_free(struct rsc *rsci)
+ {
+@@ -325,26 +361,44 @@ rsc_match(struct rsc *new, struct rsc *t
+ return netobj_equal(&new->handle, &tmp->handle);
+ }
+
+-static inline void
+-rsc_init(struct rsc *new, struct rsc *tmp)
++static struct rsc *rsc_lookup(struct rsc *item, int set)
+ {
+- new->handle.len = tmp->handle.len;
+- tmp->handle.len = 0;
+- new->handle.data = tmp->handle.data;
+- tmp->handle.data = NULL;
+- new->mechctx = NULL;
+- new->cred.cr_group_info = NULL;
+-}
+-
+-static inline void
+-rsc_update(struct rsc *new, struct rsc *tmp)
+-{
+- new->mechctx = tmp->mechctx;
+- tmp->mechctx = NULL;
+- memset(&new->seqdata, 0, sizeof(new->seqdata));
+- spin_lock_init(&new->seqdata.sd_lock);
+- new->cred = tmp->cred;
+- tmp->cred.cr_group_info = NULL;
++ struct rsc *tmp = NULL;
++ struct cache_head **hp, **head;
++ head = &rsc_cache.hash_table[rsc_hash(item)];
++
++ if (set)
++ write_lock(&rsc_cache.hash_lock);
++ else
++ read_lock(&rsc_cache.hash_lock);
++ for (hp = head; *hp != NULL; hp = &tmp->h.next) {
++ tmp = container_of(*hp, struct rsc, h);
++ if (!rsc_match(tmp, item))
++ continue;
++ cache_get(&tmp->h);
++ if (!set)
++ goto out_noset;
++ *hp = tmp->h.next;
++ tmp->h.next = NULL;
++ clear_bit(CACHE_HASHED, &tmp->h.flags);
++ rsc_put(&tmp->h, &rsc_cache);
++ goto out_set;
++ }
++ /* Didn't find anything */
++ if (!set)
++ goto out_noset;
++ rsc_cache.entries++;
++out_set:
++ set_bit(CACHE_HASHED, &item->h.flags);
++ item->h.next = *head;
++ *head = &item->h;
++ write_unlock(&rsc_cache.hash_lock);
++ cache_fresh(&rsc_cache, &item->h, item->h.expiry_time);
++ cache_get(&item->h);
++ return item;
++out_noset:
++ read_unlock(&rsc_cache.hash_lock);
++ return tmp;
+ }
+
+ static int rsc_parse(struct cache_detail *cd,
+@@ -353,19 +407,22 @@ static int rsc_parse(struct cache_detail
+ /* contexthandle expiry [ uid gid N <n gids> mechname ...mechdata... ] */
+ char *buf = mesg;
+ int len, rv;
+- struct rsc rsci, *rscp = NULL;
++ struct rsc *rsci, *res = NULL;
+ time_t expiry;
+ int status = -EINVAL;
+
+- memset(&rsci, 0, sizeof(rsci));
++ rsci = kmalloc(sizeof(*rsci), GFP_KERNEL);
++ if (!rsci)
++ return -ENOMEM;
++ memset(rsci, 0, sizeof(*rsci));
++ cache_init(&rsci->h);
+ /* context handle */
+ len = qword_get(&mesg, buf, mlen);
+ if (len < 0) goto out;
+ status = -ENOMEM;
+- if (dup_to_netobj(&rsci.handle, buf, len))
++ if (dup_to_netobj(&rsci->handle, buf, len))
+ goto out;
+
+- rsci.h.flags = 0;
+ /* expiry */
+ expiry = get_expiry(&mesg);
+ status = -EINVAL;
+@@ -373,26 +430,26 @@ static int rsc_parse(struct cache_detail
+ goto out;
+
+ /* uid, or NEGATIVE */
+- rv = get_int(&mesg, &rsci.cred.cr_uid);
++ rv = get_int(&mesg, &rsci->cred.cr_uid);
+ if (rv == -EINVAL)
+ goto out;
+ if (rv == -ENOENT)
+- set_bit(CACHE_NEGATIVE, &rsci.h.flags);
++ set_bit(CACHE_NEGATIVE, &rsci->h.flags);
+ else {
+ int N, i;
+ struct gss_api_mech *gm;
+ struct xdr_netobj tmp_buf;
+
+ /* gid */
+- if (get_int(&mesg, &rsci.cred.cr_gid))
++ if (get_int(&mesg, &rsci->cred.cr_gid))
+ goto out;
+
+ /* number of additional gid's */
+ if (get_int(&mesg, &N))
+ goto out;
+ status = -ENOMEM;
+- rsci.cred.cr_group_info = groups_alloc(N);
+- if (rsci.cred.cr_group_info == NULL)
++ rsci->cred.cr_group_info = groups_alloc(N);
++ if (rsci->cred.cr_group_info == NULL)
+ goto out;
+
+ /* gid's */
+@@ -401,7 +458,7 @@ static int rsc_parse(struct cache_detail
+ gid_t gid;
+ if (get_int(&mesg, &gid))
+ goto out;
+- GROUP_AT(rsci.cred.cr_group_info, i) = gid;
++ GROUP_AT(rsci->cred.cr_group_info, i) = gid;
+ }
+
+ /* mech name */
+@@ -422,19 +479,21 @@ static int rsc_parse(struct cache_detail
+ }
+ tmp_buf.len = len;
+ tmp_buf.data = buf;
+- if (gss_import_sec_context(&tmp_buf, gm, &rsci.mechctx)) {
++ if (gss_import_sec_context(&tmp_buf, gm, &rsci->mechctx)) {
+ gss_mech_put(gm);
+ goto out;
+ }
+ gss_mech_put(gm);
+ }
+- rsci.h.expiry_time = expiry;
+- rscp = rsc_lookup(&rsci, 1);
++ rsci->h.expiry_time = expiry;
++ spin_lock_init(&rsci->seqdata.sd_lock);
++ res = rsc_lookup(rsci, 1);
++ rsc_put(&res->h, &rsc_cache);
++ rsci = NULL;
+ status = 0;
+ out:
+- rsc_free(&rsci);
+- if (rscp)
+- rsc_put(&rscp->h, &rsc_cache);
++ if (rsci)
++ rsc_put(&rsci->h, &rsc_cache);
+ return status;
+ }
+
+@@ -446,19 +505,14 @@ static struct cache_detail rsc_cache = {
+ .cache_parse = rsc_parse,
+ };
+
+-static DefineSimpleCacheLookup(rsc, 0);
+-
+ struct rsc *
+ gss_svc_searchbyctx(struct xdr_netobj *handle)
+ {
+ struct rsc rsci;
+ struct rsc *found;
+
+- memset(&rsci, 0, sizeof(rsci));
+- if (dup_to_netobj(&rsci.handle, handle->data, handle->len))
+- return NULL;
++ rsci.handle = *handle;
+ found = rsc_lookup(&rsci, 0);
+- rsc_free(&rsci);
+ if (!found)
+ return NULL;
+ if (cache_check(&rsc_cache, &found->h, NULL))
+@@ -643,7 +697,6 @@ svcauth_gss_register_pseudoflavor(u32 ps
+ if (!new)
+ goto out;
+ cache_init(&new->h.h);
+- atomic_inc(&new->h.h.refcnt);
+ new->h.name = kmalloc(strlen(name) + 1, GFP_KERNEL);
+ if (!new->h.name)
+ goto out_free_dom;
+@@ -651,7 +704,6 @@ svcauth_gss_register_pseudoflavor(u32 ps
+ new->h.flavour = RPC_AUTH_GSS;
+ new->pseudoflavor = pseudoflavor;
+ new->h.h.expiry_time = NEVER;
+- new->h.h.flags = 0;
+
+ test = auth_domain_lookup(&new->h, 1);
+ if (test == &new->h) {
+@@ -723,6 +775,45 @@ out:
+ return stat;
+ }
+
++static int
++unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx)
++{
++ int stat = -EINVAL;
++ int out_offset;
++ u32 * lenp;
++ u32 priv_len, maj_stat;
++ int saved_len;
++
++ lenp = buf->head[0].iov_base;
++ priv_len = ntohl(svc_getu32(&buf->head[0]));
++ if (priv_len > buf->len) /* XXXJBF: wrong check */
++ goto out;
++ /* XXXJBF: bizarre hack: to handle revisits (and not decrypt
++ * twice), the first time through we write an offset
++ * telling us where to skip to find the already-decrypted data */
++ if (rqstp->rq_deferred) {
++ buf->head[0].iov_base += priv_len;
++ buf->head[0].iov_len -= priv_len;
++ return 0;
++ }
++ saved_len = buf->len; /* XXX HACK */
++ buf->len = priv_len;
++ maj_stat = gss_unwrap(ctx, GSS_C_QOP_DEFAULT, 0, buf, &out_offset);
++ buf->len = saved_len;
++ buf->head[0].iov_base += out_offset;
++ buf->head[0].iov_len -= out_offset;
++ BUG_ON(buf->head[0].iov_len <= 0);
++ if (maj_stat != GSS_S_COMPLETE)
++ goto out;
++ if (ntohl(svc_getu32(&buf->head[0])) != seq)
++ goto out;
++ /* XXXJBF: see "bizarre hack", above. */
++ *lenp = htonl(out_offset + 4);
++ stat = 0;
++out:
++ return stat;
++}
++
+ struct gss_svc_data {
+ /* decoded gss client cred: */
+ struct rpc_gss_wire_cred clcred;
+@@ -750,7 +841,7 @@ svcauth_gss_accept(struct svc_rqst *rqst
+ struct gss_svc_data *svcdata = rqstp->rq_auth_data;
+ struct rpc_gss_wire_cred *gc;
+ struct rsc *rsci = NULL;
+- struct rsi *rsip, rsikey;
++ struct rsi *rsip, *rsikey = NULL;
+ u32 *rpcstart;
+ u32 *reject_stat = resv->iov_base + resv->iov_len;
+ int ret;
+@@ -843,30 +934,23 @@ svcauth_gss_accept(struct svc_rqst *rqst
+ *authp = rpc_autherr_badcred;
+ if (gc->gc_proc == RPC_GSS_PROC_INIT && gc->gc_ctx.len != 0)
+ goto auth_err;
+- memset(&rsikey, 0, sizeof(rsikey));
+- if (dup_netobj(&rsikey.in_handle, &gc->gc_ctx))
++ rsikey = kmalloc(sizeof(*rsikey), GFP_KERNEL);
++ if (!rsikey)
++ goto drop;
++ memset(rsikey, 0, sizeof(*rsikey));
++ cache_init(&rsikey->h);
++ if (dup_netobj(&rsikey->in_handle, &gc->gc_ctx))
+ goto drop;
+ *authp = rpc_autherr_badverf;
+- if (svc_safe_getnetobj(argv, &tmpobj)) {
+- kfree(rsikey.in_handle.data);
++ if (svc_safe_getnetobj(argv, &tmpobj))
+ goto auth_err;
+- }
+- if (dup_netobj(&rsikey.in_token, &tmpobj)) {
+- kfree(rsikey.in_handle.data);
++ if (dup_netobj(&rsikey->in_token, &tmpobj))
+ goto drop;
+- }
+
+- rsip = rsi_lookup(&rsikey, 0);
+- rsi_free(&rsikey);
+- if (!rsip) {
+- goto drop;
+- }
+- switch(cache_check(&rsi_cache, &rsip->h, &rqstp->rq_chandle)) {
+- case -EAGAIN:
++ rsip = gssd_upcall(rsikey, rqstp);
++ if (!rsip)
+ goto drop;
+- case -ENOENT:
+- goto drop;
+- case 0:
++ else {
+ rsci = gss_svc_searchbyctx(&rsip->out_handle);
+ if (!rsci) {
+ goto drop;
+@@ -921,7 +1005,16 @@ svcauth_gss_accept(struct svc_rqst *rqst
+ svc_putu32(resv, 0);
+ break;
+ case RPC_GSS_SVC_PRIVACY:
+- /* currently unsupported */
++ if (unwrap_priv_data(rqstp, &rqstp->rq_arg,
++ gc->gc_seq, rsci->mechctx))
++ goto auth_err;
++ svcdata->rsci = rsci;
++ cache_get(&rsci->h);
++ /* placeholders for length and seq. number: */
++ svcdata->body_start = resv->iov_base + resv->iov_len;
++ svc_putu32(resv, 0);
++ svc_putu32(resv, 0);
++ break;
+ default:
+ goto auth_err;
+ }
+@@ -939,13 +1032,15 @@ complete:
+ drop:
+ ret = SVC_DROP;
+ out:
++ if (rsikey)
++ rsi_put(&rsikey->h, &rsi_cache);
+ if (rsci)
+ rsc_put(&rsci->h, &rsc_cache);
+ return ret;
+ }
+
+-static int
+-svcauth_gss_release(struct svc_rqst *rqstp)
++static inline int
++svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp)
+ {
+ struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data;
+ struct rpc_gss_wire_cred *gc = &gsd->clcred;
+@@ -957,6 +1052,156 @@ svcauth_gss_release(struct svc_rqst *rqs
+ int integ_offset, integ_len;
+ int stat = -EINVAL;
+
++ p = gsd->body_start;
++ gsd->body_start = 0;
++ /* move accept_stat to right place: */
++ memcpy(p, p + 2, 4);
++ /* Don't wrap in failure case: */
++ /* Counting on not getting here if call was not even accepted! */
++ if (*p != rpc_success) {
++ resbuf->head[0].iov_len -= 2 * 4;
++ goto out;
++ }
++ p++;
++ integ_offset = (u8 *)(p + 1) - (u8 *)resbuf->head[0].iov_base;
++ integ_len = resbuf->len - integ_offset;
++ BUG_ON(integ_len % 4);
++ *p++ = htonl(integ_len);
++ *p++ = htonl(gc->gc_seq);
++ if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset,
++ integ_len))
++ BUG();
++ if (resbuf->page_len == 0
++ && resbuf->tail[0].iov_len + RPC_MAX_AUTH_SIZE
++ < PAGE_SIZE) {
++ BUG_ON(resbuf->tail[0].iov_len);
++ /* Use head for everything */
++ resv = &resbuf->head[0];
++ } else if (resbuf->tail[0].iov_base == NULL) {
++ /* copied from nfsd4_encode_read */
++ svc_take_page(rqstp);
++ resbuf->tail[0].iov_base = page_address(rqstp
++ ->rq_respages[rqstp->rq_resused-1]);
++ rqstp->rq_restailpage = rqstp->rq_resused-1;
++ resbuf->tail[0].iov_len = 0;
++ resv = &resbuf->tail[0];
++ } else {
++ resv = &resbuf->tail[0];
++ }
++ mic.data = (u8 *)resv->iov_base + resv->iov_len + 4;
++ if (gss_get_mic(gsd->rsci->mechctx, 0, &integ_buf, &mic))
++ goto out_err;
++ svc_putu32(resv, htonl(mic.len));
++ memset(mic.data + mic.len, 0,
++ round_up_to_quad(mic.len) - mic.len);
++ resv->iov_len += XDR_QUADLEN(mic.len) << 2;
++ /* not strictly required: */
++ resbuf->len += XDR_QUADLEN(mic.len) << 2;
++ BUG_ON(resv->iov_len > PAGE_SIZE);
++out:
++ stat = 0;
++out_err:
++ return stat;
++}
++
++/* XXXJBF: Look for chances to share code with client */
++/* XXXJBF: Do we need to preallocate these pages somehow? E.g. see
++ * buffer size calculations in svcsock.c */
++/* XXXJBF: how does reference counting on pages work? */
++static struct page **
++svc_alloc_enc_pages(struct xdr_buf *buf)
++{
++ struct page **ret;
++ int last, i;
++
++ if (buf->page_len == 0)
++ return NULL;
++ BUG_ON(buf->page_base >> PAGE_CACHE_SHIFT);
++ last = (buf->page_base + buf->page_len - 1) >> PAGE_CACHE_SHIFT;
++ ret = kmalloc((last + 1) * sizeof(struct page *), GFP_KERNEL);
++ if (!ret)
++ goto out;
++ for (i = 0; i<= last; i++) {
++ ret[i] = alloc_page(GFP_KERNEL);
++ if (ret[i] == NULL)
++ goto out_free;
++ }
++out:
++ return ret;
++out_free:
++ for (i--; i >= 0; i--) {
++ __free_page(ret[i]);
++ }
++ return NULL;
++}
++
++static inline int
++svcauth_gss_wrap_resp_priv(struct svc_rqst *rqstp)
++{
++ struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data;
++ struct rpc_gss_wire_cred *gc = &gsd->clcred;
++ struct xdr_buf *resbuf = &rqstp->rq_res;
++ struct page **inpages;
++ u32 *p;
++ int offset, *len;
++ int pad;
++ int stat = -EINVAL;
++
++ p = gsd->body_start;
++ gsd->body_start = 0;
++ /* move accept_stat to right place: */
++ memcpy(p, p + 2, 4);
++ /* Don't wrap in failure case: */
++ /* Counting on not getting here if call was not even accepted! */
++ if (*p != rpc_success) {
++ resbuf->head[0].iov_len -= 2 * 4;
++ goto out;
++ }
++ p++;
++ len = p++;
++ offset = (u8 *)p - (u8 *)resbuf->head[0].iov_base;
++ *p++ = htonl(gc->gc_seq);
++ stat = -ENOMEM;
++ inpages = resbuf->pages;
++ /* XXXJBF: huge memory leaks here: allocated pages probably aren't
++ * freed, and neither is memory used to hold page array. */
++ resbuf->pages = svc_alloc_enc_pages(resbuf);
++ if (resbuf->page_len && !resbuf->pages)
++ goto out_err; /* XXX sleep and retry? Reserve ahead of time
++ and BUG_ON? */
++ if (resbuf->tail[0].iov_len == 0 || resbuf->tail[0].iov_base == NULL) {
++ /* copied from nfsd4_encode_read */
++ {int i = svc_take_page(rqstp); BUG_ON(i); }
++ resbuf->tail[0].iov_base = page_address(rqstp
++ ->rq_respages[rqstp->rq_resused-1]);
++ rqstp->rq_restailpage = rqstp->rq_resused-1;
++ resbuf->tail[0].iov_len = 0;
++ }
++ /* XXX: Will svc code attempt to free stuff in xdr_buf->pages?
++ * Or can we leave it in any old state on error?? */
++ stat = -EINVAL;
++ if (gss_wrap(gsd->rsci->mechctx, GSS_C_QOP_DEFAULT, offset,
++ resbuf, inpages))
++ goto out_err;
++ *len = htonl(resbuf->len - offset);
++ pad = 3 - ((resbuf->len - offset - 1)&3);
++ p = (u32 *)(resbuf->tail[0].iov_base + resbuf->tail[0].iov_len);
++ memset(p, 0, pad);
++ resbuf->tail[0].iov_len += pad;
++out:
++ return 0;
++out_err:
++ return stat;
++}
++
++static int
++svcauth_gss_release(struct svc_rqst *rqstp)
++{
++ struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data;
++ struct rpc_gss_wire_cred *gc = &gsd->clcred;
++ struct xdr_buf *resbuf = &rqstp->rq_res;
++ int stat = -EINVAL;
++
+ if (gc->gc_proc != RPC_GSS_PROC_DATA)
+ goto out;
+ /* Release can be called twice, but we only wrap once. */
+@@ -969,55 +1214,15 @@ svcauth_gss_release(struct svc_rqst *rqs
+ case RPC_GSS_SVC_NONE:
+ break;
+ case RPC_GSS_SVC_INTEGRITY:
+- p = gsd->body_start;
+- gsd->body_start = 0;
+- /* move accept_stat to right place: */
+- memcpy(p, p + 2, 4);
+- /* don't wrap in failure case: */
+- /* Note: counting on not getting here if call was not even
+- * accepted! */
+- if (*p != rpc_success) {
+- resbuf->head[0].iov_len -= 2 * 4;
+- goto out;
+- }
+- p++;
+- integ_offset = (u8 *)(p + 1) - (u8 *)resbuf->head[0].iov_base;
+- integ_len = resbuf->len - integ_offset;
+- BUG_ON(integ_len % 4);
+- *p++ = htonl(integ_len);
+- *p++ = htonl(gc->gc_seq);
+- if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset,
+- integ_len))
+- BUG();
+- if (resbuf->page_len == 0
+- && resbuf->tail[0].iov_len + RPC_MAX_AUTH_SIZE
+- < PAGE_SIZE) {
+- BUG_ON(resbuf->tail[0].iov_len);
+- /* Use head for everything */
+- resv = &resbuf->head[0];
+- } else if (resbuf->tail[0].iov_base == NULL) {
+- /* copied from nfsd4_encode_read */
+- svc_take_page(rqstp);
+- resbuf->tail[0].iov_base = page_address(rqstp
+- ->rq_respages[rqstp->rq_resused-1]);
+- rqstp->rq_restailpage = rqstp->rq_resused-1;
+- resbuf->tail[0].iov_len = 0;
+- resv = &resbuf->tail[0];
+- } else {
+- resv = &resbuf->tail[0];
+- }
+- mic.data = (u8 *)resv->iov_base + resv->iov_len + 4;
+- if (gss_get_mic(gsd->rsci->mechctx, 0, &integ_buf, &mic))
++ stat = svcauth_gss_wrap_resp_integ(rqstp);
++ if (stat)
+ goto out_err;
+- svc_putu32(resv, htonl(mic.len));
+- memset(mic.data + mic.len, 0,
+- round_up_to_quad(mic.len) - mic.len);
+- resv->iov_len += XDR_QUADLEN(mic.len) << 2;
+- /* not strictly required: */
+- resbuf->len += XDR_QUADLEN(mic.len) << 2;
+- BUG_ON(resv->iov_len > PAGE_SIZE);
+ break;
+ case RPC_GSS_SVC_PRIVACY:
++ stat = svcauth_gss_wrap_resp_priv(rqstp);
++ if (stat)
++ goto out_err;
++ break;
+ default:
+ goto out_err;
+ }
+--- linux-2.6.7/net/sunrpc/auth_gss/gss_krb5_wrap.c.lsec 2005-03-23 14:28:24.900264280 -0700
++++ linux-2.6.7/net/sunrpc/auth_gss/gss_krb5_wrap.c 2005-03-23 14:28:24.900264280 -0700
+@@ -0,0 +1,337 @@
++#include <linux/types.h>
++#include <linux/slab.h>
++#include <linux/jiffies.h>
++#include <linux/sunrpc/gss_krb5.h>
++#include <linux/random.h>
++#include <linux/pagemap.h>
++#include <asm/scatterlist.h>
++#include <linux/crypto.h>
++
++#ifdef RPC_DEBUG
++# define RPCDBG_FACILITY RPCDBG_AUTH
++#endif
++
++static inline int
++gss_krb5_padding(int blocksize, int length)
++{
++ /* Most of the code is block-size independent but currently we
++ * use only 8: */
++ BUG_ON(blocksize != 8);
++ return 8 - (length & 7);
++}
++
++static inline void
++gss_krb5_add_padding(struct xdr_buf *buf, int offset, int blocksize)
++{
++ int padding = gss_krb5_padding(blocksize, buf->len - offset);
++ char *p;
++ struct iovec *iov;
++
++ if (buf->page_len || buf->tail[0].iov_len)
++ iov = &buf->tail[0];
++ else
++ iov = &buf->head[0];
++ p = iov->iov_base + iov->iov_len;
++ iov->iov_len += padding;
++ buf->len += padding;
++ memset(p, padding, padding);
++}
++
++static inline int
++gss_krb5_remove_padding(struct xdr_buf *buf, int blocksize)
++{
++ u8 *ptr;
++ u8 pad;
++ int len = buf->len;
++
++ if (len <= buf->head[0].iov_len) {
++ pad = *(u8 *)(buf->head[0].iov_base + len - 1);
++ goto out;
++ } else
++ len -= buf->head[0].iov_len;
++ if (len <= buf->page_len) {
++ int last = (buf->page_base + len - 1)
++ >>PAGE_CACHE_SHIFT;
++ int offset = (buf->page_base + len - 1)
++ & (PAGE_CACHE_SIZE - 1);
++ ptr = kmap_atomic(buf->pages[last], KM_SKB_SUNRPC_DATA);
++ pad = *(ptr + offset);
++ kunmap_atomic(ptr, KM_SKB_SUNRPC_DATA);
++ goto out;
++ } else
++ len -= buf->page_len;
++ BUG_ON(len > buf->tail[0].iov_len);
++ pad = *(u8 *)(buf->tail[0].iov_base + len - 1);
++out:
++ if (pad > blocksize)
++ return -EINVAL;
++ buf->len -= pad;
++ return 0;
++}
++
++static inline void
++make_confounder(char *p, int blocksize)
++{
++ /* XXX? Is this OK to do on every packet? */
++ get_random_bytes(p, blocksize);
++}
++
++/* Assumptions: the head and tail of inbuf are ours to play with.
++ * The pages, however, may be real pages in the page cache and we replace
++ * them with scratch pages from **pages before writing to them. */
++/* XXX: obviously the above should be documentation of wrap interface,
++ * and shouldn't be in this kerberos-specific file. */
++
++/* XXX factor out common code with seal/unseal. */
++
++u32
++gss_wrap_kerberos(struct gss_ctx *ctx, u32 qop, int offset,
++ struct xdr_buf *buf, struct page **pages)
++{
++ struct krb5_ctx *kctx = ctx->internal_ctx_id;
++ s32 checksum_type;
++ struct xdr_netobj md5cksum = {.len = 0, .data = NULL};
++ int blocksize = 0, plainlen;
++ unsigned char *ptr, *krb5_hdr, *msg_start;
++ s32 now;
++ int headlen;
++ struct page **tmp_pages;
++ u32 seq_send;
++
++ dprintk("RPC: gss_wrap_kerberos\n");
++
++ now = get_seconds();
++
++ if (qop != 0)
++ goto out_err;
++
++ switch (kctx->signalg) {
++ case SGN_ALG_DES_MAC_MD5:
++ checksum_type = CKSUMTYPE_RSA_MD5;
++ break;
++ default:
++ dprintk("RPC: gss_krb5_seal: kctx->signalg %d not"
++ " supported\n", kctx->signalg);
++ goto out_err;
++ }
++ if (kctx->sealalg != SEAL_ALG_NONE && kctx->sealalg != SEAL_ALG_DES) {
++ dprintk("RPC: gss_krb5_seal: kctx->sealalg %d not supported\n",
++ kctx->sealalg);
++ goto out_err;
++ }
++
++ blocksize = crypto_tfm_alg_blocksize(kctx->enc);
++ gss_krb5_add_padding(buf, offset, blocksize);
++ BUG_ON((buf->len - offset) % blocksize);
++ plainlen = blocksize + buf->len - offset;
++
++ headlen = g_token_size(&kctx->mech_used, 22 + plainlen) -
++ (buf->len - offset);
++
++ ptr = buf->head[0].iov_base + offset;
++ /* shift data to make room for header. */
++ /* XXX Would be cleverer to encrypt while copying. */
++ /* XXX bounds checking, slack, etc. */
++ memmove(ptr + headlen, ptr, buf->head[0].iov_len - offset);
++ buf->head[0].iov_len += headlen;
++ buf->len += headlen;
++ BUG_ON((buf->len - offset - headlen) % blocksize);
++
++ g_make_token_header(&kctx->mech_used, 22 + plainlen, &ptr);
++
++
++ *ptr++ = (unsigned char) ((KG_TOK_WRAP_MSG>>8)&0xff);
++ *ptr++ = (unsigned char) (KG_TOK_WRAP_MSG&0xff);
++
++ /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */
++ krb5_hdr = ptr - 2;
++ msg_start = krb5_hdr + 24;
++ /* XXXJBF: */ BUG_ON(buf->head[0].iov_base + offset + headlen != msg_start + blocksize);
++
++ *(u16 *)(krb5_hdr + 2) = htons(kctx->signalg);
++ memset(krb5_hdr + 4, 0xff, 4);
++ *(u16 *)(krb5_hdr + 4) = htons(kctx->sealalg);
++
++ make_confounder(msg_start, blocksize);
++
++ /* XXXJBF: UGH!: */
++ tmp_pages = buf->pages;
++ buf->pages = pages;
++ if (make_checksum(checksum_type, krb5_hdr, 8, buf,
++ offset + headlen - blocksize, &md5cksum))
++ goto out_err;
++ buf->pages = tmp_pages;
++
++ switch (kctx->signalg) {
++ case SGN_ALG_DES_MAC_MD5:
++ if (krb5_encrypt(kctx->seq, NULL, md5cksum.data,
++ md5cksum.data, md5cksum.len))
++ goto out_err;
++ memcpy(krb5_hdr + 16,
++ md5cksum.data + md5cksum.len - KRB5_CKSUM_LENGTH,
++ KRB5_CKSUM_LENGTH);
++
++ dprintk("RPC: make_seal_token: cksum data: \n");
++ print_hexl((u32 *) (krb5_hdr + 16), KRB5_CKSUM_LENGTH, 0);
++ break;
++ default:
++ BUG();
++ }
++
++ kfree(md5cksum.data);
++
++ spin_lock(&krb5_seq_lock);
++ seq_send = kctx->seq_send++;
++ spin_unlock(&krb5_seq_lock);
++
++ /* XXX would probably be more efficient to compute checksum
++ * and encrypt at the same time: */
++ if ((krb5_make_seq_num(kctx->seq, kctx->initiate ? 0 : 0xff,
++ seq_send, krb5_hdr + 16, krb5_hdr + 8)))
++ goto out_err;
++
++ if (gss_encrypt_xdr_buf(kctx->enc, buf, offset + headlen - blocksize,
++ pages))
++ goto out_err;
++
++ return ((kctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE);
++out_err:
++ if (md5cksum.data) kfree(md5cksum.data);
++ return GSS_S_FAILURE;
++}
++
++u32
++gss_unwrap_kerberos(struct gss_ctx *ctx, u32 *qop, int offset,
++ struct xdr_buf *buf, int *out_offset)
++{
++ struct krb5_ctx *kctx = ctx->internal_ctx_id;
++ int signalg;
++ int sealalg;
++ s32 checksum_type;
++ struct xdr_netobj md5cksum = {.len = 0, .data = NULL};
++ s32 now;
++ int direction;
++ s32 seqnum;
++ unsigned char *ptr;
++ int bodysize;
++ u32 ret = GSS_S_DEFECTIVE_TOKEN;
++ u8 *data_start;
++ int blocksize;
++
++ dprintk("RPC: gss_unwrap_kerberos\n");
++
++ ptr = (u8 *)buf->head[0].iov_base + offset;
++ if (g_verify_token_header(&kctx->mech_used, &bodysize, &ptr,
++ buf->len - offset))
++ goto out;
++
++ if ((*ptr++ != ((KG_TOK_WRAP_MSG>>8)&0xff)) ||
++ (*ptr++ != (KG_TOK_WRAP_MSG &0xff)) )
++ goto out;
++
++ /* XXX sanity-check bodysize?? */
++
++ /* get the sign and seal algorithms */
++
++ signalg = ptr[0] + (ptr[1] << 8);
++ sealalg = ptr[2] + (ptr[3] << 8);
++
++ /* Sanity checks */
++
++ if ((ptr[4] != 0xff) || (ptr[5] != 0xff))
++ goto out;
++
++ if (sealalg == 0xffff)
++ goto out;
++
++ /* in the current spec, there is only one valid seal algorithm per
++ key type, so a simple comparison is ok */
++
++ if (sealalg != kctx->sealalg)
++ goto out;
++
++ /* there are several mappings of seal algorithms to sign algorithms,
++ but few enough that we can try them all. */
++
++ if ((kctx->sealalg == SEAL_ALG_NONE && signalg > 1) ||
++ (kctx->sealalg == SEAL_ALG_1 && signalg != SGN_ALG_3) ||
++ (kctx->sealalg == SEAL_ALG_DES3KD &&
++ signalg != SGN_ALG_HMAC_SHA1_DES3_KD))
++ goto out;
++
++ if (gss_decrypt_xdr_buf(kctx->enc, buf,
++ ptr + 22 - (unsigned char *)buf->head[0].iov_base))
++ goto out;
++
++ /* compute the checksum of the message */
++
++	/* initialize the cksum */
++ switch (signalg) {
++ case SGN_ALG_DES_MAC_MD5:
++ checksum_type = CKSUMTYPE_RSA_MD5;
++ break;
++ default:
++ ret = GSS_S_DEFECTIVE_TOKEN;
++ goto out;
++ }
++
++ switch (signalg) {
++ case SGN_ALG_DES_MAC_MD5:
++ ret = make_checksum(checksum_type, ptr - 2, 8, buf,
++ ptr + 22 - (unsigned char *)buf->head[0].iov_base, &md5cksum);
++ if (ret)
++ goto out;
++
++ ret = krb5_encrypt(kctx->seq, NULL, md5cksum.data,
++ md5cksum.data, md5cksum.len);
++ if (ret)
++ goto out;
++
++ if (memcmp(md5cksum.data + 8, ptr + 14, 8)) {
++ ret = GSS_S_BAD_SIG;
++ goto out;
++ }
++ break;
++ default:
++ ret = GSS_S_DEFECTIVE_TOKEN;
++ goto out;
++ }
++
++ /* it got through unscathed. Make sure the context is unexpired */
++
++ if (qop)
++ *qop = GSS_C_QOP_DEFAULT;
++
++ now = get_seconds();
++
++ ret = GSS_S_CONTEXT_EXPIRED;
++ if (now > kctx->endtime)
++ goto out;
++
++ /* do sequencing checks */
++
++ ret = GSS_S_BAD_SIG;
++ if ((ret = krb5_get_seq_num(kctx->seq, ptr + 14, ptr + 6, &direction,
++ &seqnum)))
++ goto out;
++
++ if ((kctx->initiate && direction != 0xff) ||
++ (!kctx->initiate && direction != 0))
++ goto out;
++
++ /* Copy the data back to the right position. XXX: Would probably be
++ * better to copy and encrypt at the same time. */
++
++ blocksize = crypto_tfm_alg_blocksize(kctx->enc);
++ data_start = ptr + 22 + blocksize;
++ *out_offset = data_start - (u8 *)buf->head[0].iov_base;
++
++ ret = GSS_S_DEFECTIVE_TOKEN;
++ if (gss_krb5_remove_padding(buf, blocksize))
++ goto out;
++
++ ret = GSS_S_COMPLETE;
++out:
++ if (md5cksum.data) kfree(md5cksum.data);
++ return ret;
++}
+--- linux-2.6.7/net/sunrpc/auth_gss/gss_mech_switch.c.lsec 2004-06-15 23:19:37.000000000 -0600
++++ linux-2.6.7/net/sunrpc/auth_gss/gss_mech_switch.c 2005-03-23 14:28:24.782282216 -0700
+@@ -279,6 +279,29 @@ gss_verify_mic(struct gss_ctx *context_
+ qstate);
+ }
+
++u32
++gss_wrap(struct gss_ctx *ctx_id,
++ u32 qop,
++ int offset,
++ struct xdr_buf *buf,
++ struct page **inpages)
++{
++ return ctx_id->mech_type->gm_ops
++ ->gss_wrap(ctx_id, qop, offset, buf, inpages);
++}
++
++u32
++gss_unwrap(struct gss_ctx *ctx_id,
++ u32 *qop,
++ int offset,
++ struct xdr_buf *buf,
++ int *out_offset)
++{
++ return ctx_id->mech_type->gm_ops
++ ->gss_unwrap(ctx_id, qop, offset, buf, out_offset);
++}
++
++
+ /* gss_delete_sec_context: free all resources associated with context_handle.
+ * Note this differs from the RFC 2744-specified prototype in that we don't
+ * bother returning an output token, since it would never be used anyway. */
+--- linux-2.6.7/net/sunrpc/auth_gss/gss_krb5_mech.c.lsec 2004-06-15 23:19:42.000000000 -0600
++++ linux-2.6.7/net/sunrpc/auth_gss/gss_krb5_mech.c 2005-03-23 14:28:24.841273248 -0700
+@@ -182,6 +182,7 @@ gss_delete_sec_context_kerberos(void *in
+ kfree(kctx);
+ }
+
++/* XXX the following wrappers have become pointless; kill them. */
+ static u32
+ gss_verify_mic_kerberos(struct gss_ctx *ctx,
+ struct xdr_buf *message,
+@@ -191,8 +192,7 @@ gss_verify_mic_kerberos(struct gss_ctx
+ int qop_state;
+ struct krb5_ctx *kctx = ctx->internal_ctx_id;
+
+- maj_stat = krb5_read_token(kctx, mic_token, message, &qop_state,
+- KG_TOK_MIC_MSG);
++ maj_stat = krb5_read_token(kctx, mic_token, message, &qop_state);
+ if (!maj_stat && qop_state)
+ *qstate = qop_state;
+
+@@ -208,7 +208,7 @@ gss_get_mic_kerberos(struct gss_ctx *ctx
+ u32 err = 0;
+ struct krb5_ctx *kctx = ctx->internal_ctx_id;
+
+- err = krb5_make_token(kctx, qop, message, mic_token, KG_TOK_MIC_MSG);
++ err = krb5_make_token(kctx, qop, message, mic_token);
+
+ dprintk("RPC: gss_get_mic_kerberos returning %d\n",err);
+
+@@ -219,6 +219,8 @@ static struct gss_api_ops gss_kerberos_o
+ .gss_import_sec_context = gss_import_sec_context_kerberos,
+ .gss_get_mic = gss_get_mic_kerberos,
+ .gss_verify_mic = gss_verify_mic_kerberos,
++ .gss_wrap = gss_wrap_kerberos,
++ .gss_unwrap = gss_unwrap_kerberos,
+ .gss_delete_sec_context = gss_delete_sec_context_kerberos,
+ };
+
+@@ -233,6 +235,11 @@ static struct pf_desc gss_kerberos_pfs[]
+ .service = RPC_GSS_SVC_INTEGRITY,
+ .name = "krb5i",
+ },
++ [2] = {
++ .pseudoflavor = RPC_AUTH_GSS_KRB5P,
++ .service = RPC_GSS_SVC_PRIVACY,
++ .name = "krb5p",
++ },
+ };
+
+ static struct gss_api_mech gss_kerberos_mech = {
+--- linux-2.6.7/net/sunrpc/auth_gss/gss_krb5_seal.c.lsec 2004-06-15 23:18:37.000000000 -0600
++++ linux-2.6.7/net/sunrpc/auth_gss/gss_krb5_seal.c 2005-03-23 14:28:24.898264584 -0700
+@@ -70,24 +70,17 @@
+ # define RPCDBG_FACILITY RPCDBG_AUTH
+ #endif
+
+-static inline int
+-gss_krb5_padding(int blocksize, int length) {
+- /* Most of the code is block-size independent but in practice we
+- * use only 8: */
+- BUG_ON(blocksize != 8);
+- return 8 - (length & 7);
+-}
++spinlock_t krb5_seq_lock = SPIN_LOCK_UNLOCKED;
+
+ u32
+ krb5_make_token(struct krb5_ctx *ctx, int qop_req,
+- struct xdr_buf *text, struct xdr_netobj *token,
+- int toktype)
++ struct xdr_buf *text, struct xdr_netobj *token)
+ {
+ s32 checksum_type;
+ struct xdr_netobj md5cksum = {.len = 0, .data = NULL};
+- int blocksize = 0, tmsglen;
+ unsigned char *ptr, *krb5_hdr, *msg_start;
+ s32 now;
++ u32 seq_send;
+
+ dprintk("RPC: gss_krb5_seal\n");
+
+@@ -111,21 +104,13 @@ krb5_make_token(struct krb5_ctx *ctx, in
+ goto out_err;
+ }
+
+- if (toktype == KG_TOK_WRAP_MSG) {
+- blocksize = crypto_tfm_alg_blocksize(ctx->enc);
+- tmsglen = blocksize + text->len
+- + gss_krb5_padding(blocksize, blocksize + text->len);
+- } else {
+- tmsglen = 0;
+- }
+-
+- token->len = g_token_size(&ctx->mech_used, 22 + tmsglen);
++ token->len = g_token_size(&ctx->mech_used, 22);
+
+ ptr = token->data;
+- g_make_token_header(&ctx->mech_used, 22 + tmsglen, &ptr);
++ g_make_token_header(&ctx->mech_used, 22, &ptr);
+
+- *ptr++ = (unsigned char) ((toktype>>8)&0xff);
+- *ptr++ = (unsigned char) (toktype&0xff);
++ *ptr++ = (unsigned char) ((KG_TOK_MIC_MSG>>8)&0xff);
++ *ptr++ = (unsigned char) (KG_TOK_MIC_MSG&0xff);
+
+ /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */
+ krb5_hdr = ptr - 2;
+@@ -133,17 +118,9 @@ krb5_make_token(struct krb5_ctx *ctx, in
+
+ *(u16 *)(krb5_hdr + 2) = htons(ctx->signalg);
+ memset(krb5_hdr + 4, 0xff, 4);
+- if (toktype == KG_TOK_WRAP_MSG)
+- *(u16 *)(krb5_hdr + 4) = htons(ctx->sealalg);
+
+- if (toktype == KG_TOK_WRAP_MSG) {
+- /* XXX removing support for now */
+- goto out_err;
+- } else { /* Sign only. */
+- if (make_checksum(checksum_type, krb5_hdr, 8, text,
+- &md5cksum))
++ if (make_checksum(checksum_type, krb5_hdr, 8, text, 0, &md5cksum))
+ goto out_err;
+- }
+
+ switch (ctx->signalg) {
+ case SGN_ALG_DES_MAC_MD5:
+@@ -163,12 +140,14 @@ krb5_make_token(struct krb5_ctx *ctx, in
+
+ kfree(md5cksum.data);
+
++ spin_lock(&krb5_seq_lock);
++ seq_send = ctx->seq_send++;
++ spin_unlock(&krb5_seq_lock);
++
+ if ((krb5_make_seq_num(ctx->seq, ctx->initiate ? 0 : 0xff,
+- ctx->seq_send, krb5_hdr + 16, krb5_hdr + 8)))
++ seq_send, krb5_hdr + 16, krb5_hdr + 8)))
+ goto out_err;
+
+- ctx->seq_send++;
+-
+ return ((ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE);
+ out_err:
+ if (md5cksum.data) kfree(md5cksum.data);
+--- linux-2.6.7/net/sunrpc/auth_gss/gss_spkm3_token.c.lsec 2005-03-23 14:28:24.240364600 -0700
++++ linux-2.6.7/net/sunrpc/auth_gss/gss_spkm3_token.c 2005-03-23 14:28:24.239364752 -0700
+@@ -0,0 +1,266 @@
++/*
++ * linux/net/sunrpc/gss_spkm3_token.c
++ *
++ * Copyright (c) 2003 The Regents of the University of Michigan.
++ * All rights reserved.
++ *
++ * Andy Adamson <andros@umich.edu>
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ * 2. Redistributions in binary form must reproduce the above copyright
++ * notice, this list of conditions and the following disclaimer in the
++ * documentation and/or other materials provided with the distribution.
++ * 3. Neither the name of the University nor the names of its
++ * contributors may be used to endorse or promote products derived
++ * from this software without specific prior written permission.
++ *
++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ *
++ */
++
++#include <linux/types.h>
++#include <linux/slab.h>
++#include <linux/jiffies.h>
++#include <linux/sunrpc/gss_spkm3.h>
++#include <linux/random.h>
++#include <linux/crypto.h>
++
++#ifdef RPC_DEBUG
++# define RPCDBG_FACILITY RPCDBG_AUTH
++#endif
++
++/*
++ * asn1_bitstring_len()
++ *
++ * calculate the asn1 bitstring length of the xdr_netobject
++ */
++void
++asn1_bitstring_len(struct xdr_netobj *in, int *enclen, int *zerobits)
++{
++ int i, zbit = 0,elen = in->len;
++ char *ptr;
++
++ ptr = &in->data[in->len -1];
++
++ /* count trailing 0's */
++ for(i = in->len; i > 0; i--) {
++ if (*ptr == 0) {
++ ptr--;
++ elen--;
++ } else
++ break;
++ }
++
++ /* count number of 0 bits in final octet */
++ ptr = &in->data[elen - 1];
++ for(i = 0; i < 8; i++) {
++ short mask = 0x01;
++
++ if (!((mask << i) & *ptr))
++ zbit++;
++ else
++ break;
++ }
++ *enclen = elen;
++ *zerobits = zbit;
++}
++
++/*
++ * decode_asn1_bitstring()
++ *
++ * decode a bitstring into a buffer of the expected length.
++ * enclen = bit string length
++ * explen = expected length (define in rfc)
++ */
++int
++decode_asn1_bitstring(struct xdr_netobj *out, char *in, int enclen, int explen)
++{
++ if (!(out->data = kmalloc(explen,GFP_KERNEL)))
++ return 0;
++ out->len = explen;
++ memset(out->data, 0, explen);
++ memcpy(out->data, in, enclen);
++ return 1;
++}
++
++/*
++ * SPKMInnerContextToken choice SPKM_MIC asn1 token layout
++ *
++ * contextid is always 16 bytes plain data. max asn1 bitstring len = 17.
++ *
++ * tokenlen = pos[0] to end of token (max pos[45] with MD5 cksum)
++ *
++ * pos value
++ * ----------
++ * [0] a4 SPKM-MIC tag
++ * [1] ?? innertoken length (max 44)
++ *
++ *
++ * tok_hdr piece of checksum data starts here
++ *
++ * the maximum mic-header len = 9 + 17 = 26
++ * mic-header
++ * ----------
++ * [2] 30 SEQUENCE tag
++ * [3] ?? mic-header length: (max 23) = TokenID + ContextID
++ *
++ * TokenID - all fields constant and can be hardcoded
++ * -------
++ * [4] 02 Type 2
++ * [5] 02 Length 2
++ * [6][7] 01 01 TokenID (SPKM_MIC_TOK)
++ *
++ * ContextID - encoded length not constant, calculated
++ * ---------
++ * [8] 03 Type 3
++ * [9] ?? encoded length
++ * [10] ?? ctxzbit
++ * [11] contextid
++ *
++ * mic_header piece of checksum data ends here.
++ *
++ * int-cksum - encoded length not constant, calculated
++ * ---------
++ * [??] 03 Type 3
++ * [??] ?? encoded length
++ * [??] ?? md5zbit
++ * [??] int-cksum (NID_md5 = 16)
++ *
++ * maximum SPKM-MIC innercontext token length =
++ * 10 + encoded contextid_size(17 max) + 2 + encoded
++ * cksum_size (17 maxfor NID_md5) = 46
++ */
++
++/*
++ * spkm3_mic_header()
++ *
++ * Prepare the SPKM_MIC_TOK mic-header for check-sum calculation
++ * elen: 16 byte context id asn1 bitstring encoded length
++ */
++void
++spkm3_mic_header(unsigned char **hdrbuf, unsigned int *hdrlen, unsigned char *ctxdata, int elen, int zbit)
++{
++ char *hptr = *hdrbuf;
++ char *top = *hdrbuf;
++
++ *(u8 *)hptr++ = 0x30;
++ *(u8 *)hptr++ = elen + 7; /* on the wire header length */
++
++ /* tokenid */
++ *(u8 *)hptr++ = 0x02;
++ *(u8 *)hptr++ = 0x02;
++ *(u8 *)hptr++ = 0x01;
++ *(u8 *)hptr++ = 0x01;
++
++	/* contextid */
++ *(u8 *)hptr++ = 0x03;
++ *(u8 *)hptr++ = elen + 1; /* add 1 to include zbit */
++ *(u8 *)hptr++ = zbit;
++ memcpy(hptr, ctxdata, elen);
++ hptr += elen;
++ *hdrlen = hptr - top;
++}
++
++/*
++ * spkm3_mic_innercontext_token()
++ *
++ * *tokp points to the beginning of the SPKM_MIC token described
++ * in rfc 2025, section 3.2.1:
++ *
++ */
++void
++spkm3_make_mic_token(unsigned char **tokp, int toklen, struct xdr_netobj *mic_hdr, struct xdr_netobj *md5cksum, int md5elen, int md5zbit)
++{
++ unsigned char *ict = *tokp;
++
++ *(u8 *)ict++ = 0xa4;
++ *(u8 *)ict++ = toklen - 2;
++ memcpy(ict, mic_hdr->data, mic_hdr->len);
++ ict += mic_hdr->len;
++
++ *(u8 *)ict++ = 0x03;
++ *(u8 *)ict++ = md5elen + 1; /* add 1 to include zbit */
++ *(u8 *)ict++ = md5zbit;
++ memcpy(ict, md5cksum->data, md5elen);
++}
++
++u32
++spkm3_verify_mic_token(unsigned char **tokp, int *mic_hdrlen, unsigned char **cksum)
++{
++ struct xdr_netobj spkm3_ctx_id = {.len =0, .data = NULL};
++ unsigned char *ptr = *tokp;
++ int ctxelen;
++ u32 ret = GSS_S_DEFECTIVE_TOKEN;
++
++ /* spkm3 innercontext token preamble */
++ if ((ptr[0] != 0xa4) || (ptr[2] != 0x30)) {
++ dprintk("RPC: BAD SPKM ictoken preamble\n");
++ goto out;
++ }
++
++ *mic_hdrlen = ptr[3];
++
++ /* token type */
++ if ((ptr[4] != 0x02) || (ptr[5] != 0x02)) {
++ dprintk("RPC: BAD asn1 SPKM3 token type\n");
++ goto out;
++ }
++
++ /* only support SPKM_MIC_TOK */
++ if((ptr[6] != 0x01) || (ptr[7] != 0x01)) {
++ dprintk("RPC: ERROR unsupported SPKM3 token \n");
++ goto out;
++ }
++
++ /* contextid */
++ if (ptr[8] != 0x03) {
++ dprintk("RPC: BAD SPKM3 asn1 context-id type\n");
++ goto out;
++ }
++
++ ctxelen = ptr[9];
++ if (ctxelen > 17) { /* length includes asn1 zbit octet */
++ dprintk("RPC: BAD SPKM3 contextid len %d\n", ctxelen);
++ goto out;
++ }
++
++ /* ignore ptr[10] */
++
++ if(!decode_asn1_bitstring(&spkm3_ctx_id, &ptr[11], ctxelen - 1, 16))
++ goto out;
++
++ /*
++ * in the current implementation: the optional int-alg is not present
++ * so the default int-alg (md5) is used the optional snd-seq field is
++ * also not present
++ */
++
++ if (*mic_hdrlen != 6 + ctxelen) {
++ dprintk("RPC: BAD SPKM_ MIC_TOK header len %d: we only support default int-alg (should be absent) and do not support snd-seq\n", *mic_hdrlen);
++ goto out;
++ }
++ /* checksum */
++ *cksum = (&ptr[10] + ctxelen); /* ctxelen includes ptr[10] */
++
++ ret = GSS_S_COMPLETE;
++out:
++ if (spkm3_ctx_id.data)
++ kfree(spkm3_ctx_id.data);
++ return ret;
++}
++
+--- linux-2.6.7/net/sunrpc/auth_gss/gss_generic_token.c.lsec 2004-06-15 23:19:10.000000000 -0600
++++ linux-2.6.7/net/sunrpc/auth_gss/gss_generic_token.c 2005-03-23 14:28:23.707445616 -0700
+@@ -179,7 +179,7 @@ EXPORT_SYMBOL(g_make_token_header);
+ */
+ u32
+ g_verify_token_header(struct xdr_netobj *mech, int *body_size,
+- unsigned char **buf_in, int tok_type, int toksize)
++ unsigned char **buf_in, int toksize)
+ {
+ unsigned char *buf = *buf_in;
+ int seqsize;
+--- linux-2.6.7/net/sunrpc/auth_gss/gss_spkm3_unseal.c.lsec 2005-03-23 14:28:24.240364600 -0700
++++ linux-2.6.7/net/sunrpc/auth_gss/gss_spkm3_unseal.c 2005-03-23 14:28:24.240364600 -0700
+@@ -0,0 +1,128 @@
++/*
++ *  linux/net/sunrpc/auth_gss/gss_spkm3_unseal.c
++ *
++ * Copyright (c) 2003 The Regents of the University of Michigan.
++ * All rights reserved.
++ *
++ * Andy Adamson <andros@umich.edu>
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ * 2. Redistributions in binary form must reproduce the above copyright
++ * notice, this list of conditions and the following disclaimer in the
++ * documentation and/or other materials provided with the distribution.
++ * 3. Neither the name of the University nor the names of its
++ * contributors may be used to endorse or promote products derived
++ * from this software without specific prior written permission.
++ *
++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ *
++ */
++
++#include <linux/types.h>
++#include <linux/slab.h>
++#include <linux/jiffies.h>
++#include <linux/sunrpc/gss_spkm3.h>
++#include <linux/crypto.h>
++
++#ifdef RPC_DEBUG
++# define RPCDBG_FACILITY RPCDBG_AUTH
++#endif
++
++/*
++ * spkm3_read_token()
++ *
++ * only SPKM_MIC_TOK with md5 intg-alg is supported
++ */
++u32
++spkm3_read_token(struct spkm3_ctx *ctx,
++ struct xdr_netobj *read_token, /* checksum */
++ struct xdr_buf *message_buffer, /* signbuf */
++ int *qop_state, int toktype)
++{
++ s32 code;
++ struct xdr_netobj wire_cksum = {.len =0, .data = NULL};
++ struct xdr_netobj md5cksum = {.len = 0, .data = NULL};
++ unsigned char *ptr = (unsigned char *)read_token->data;
++ unsigned char *cksum;
++ int bodysize, md5elen;
++ int mic_hdrlen;
++ u32 ret = GSS_S_DEFECTIVE_TOKEN;
++
++ dprintk("RPC: spkm3_read_token read_token->len %d\n", read_token->len);
++
++ if (g_verify_token_header((struct xdr_netobj *) &ctx->mech_used,
++ &bodysize, &ptr, read_token->len))
++ goto out;
++
++ /* decode the token */
++
++ if (toktype == SPKM_MIC_TOK) {
++
++ if ((ret = spkm3_verify_mic_token(&ptr, &mic_hdrlen, &cksum)))
++ goto out;
++
++ if (*cksum++ != 0x03) {
++ dprintk("RPC: spkm3_read_token BAD checksum type\n");
++ goto out;
++ }
++ md5elen = *cksum++;
++ cksum++; /* move past the zbit */
++
++ if(!decode_asn1_bitstring(&wire_cksum, cksum, md5elen - 1, 16))
++ goto out;
++
++ /* HARD CODED FOR MD5 */
++
++ /* compute the checksum of the message.
++ * ptr + 2 = start of header piece of checksum
++ * mic_hdrlen + 2 = length of header piece of checksum
++ */
++ ret = GSS_S_DEFECTIVE_TOKEN;
++ code = make_checksum(CKSUMTYPE_RSA_MD5, ptr + 2,
++ mic_hdrlen + 2,
++ message_buffer, &md5cksum);
++
++ if (code)
++ goto out;
++
++ dprintk("RPC: spkm3_read_token: digest wire_cksum.len %d:\n",
++ wire_cksum.len);
++ dprintk(" md5cksum.data\n");
++ print_hexl((u32 *) md5cksum.data, 16, 0);
++ dprintk(" cksum.data:\n");
++ print_hexl((u32 *) wire_cksum.data, wire_cksum.len, 0);
++
++ ret = GSS_S_BAD_SIG;
++ code = memcmp(md5cksum.data, wire_cksum.data, wire_cksum.len);
++ if (code)
++ goto out;
++
++ } else {
++ dprintk("RPC: BAD or UNSUPPORTED SPKM3 token type: %d\n",toktype);
++ goto out;
++ }
++
++ /* XXX: need to add expiration and sequencing */
++ ret = GSS_S_COMPLETE;
++out:
++ if (md5cksum.data)
++ kfree(md5cksum.data);
++ if (wire_cksum.data)
++ kfree(wire_cksum.data);
++ return ret;
++}
+--- linux-2.6.7/net/sunrpc/auth_gss/Makefile.lsec 2004-06-15 23:19:22.000000000 -0600
++++ linux-2.6.7/net/sunrpc/auth_gss/Makefile 2005-03-23 14:28:24.294356392 -0700
+@@ -10,5 +10,9 @@ auth_rpcgss-objs := auth_gss.o gss_gener
+ obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o
+
+ rpcsec_gss_krb5-objs := gss_krb5_mech.o gss_krb5_seal.o gss_krb5_unseal.o \
+- gss_krb5_seqnum.o
++ gss_krb5_seqnum.o gss_krb5_wrap.o
+
++obj-$(CONFIG_RPCSEC_GSS_SPKM3) += rpcsec_gss_spkm3.o
++
++rpcsec_gss_spkm3-objs := gss_spkm3_mech.o gss_spkm3_seal.o gss_spkm3_unseal.o \
++ gss_spkm3_token.o
+--- linux-2.6.7/net/sunrpc/cache.c.lsec 2004-06-15 23:19:36.000000000 -0600
++++ linux-2.6.7/net/sunrpc/cache.c 2005-03-23 14:28:24.406339368 -0700
+@@ -38,7 +38,7 @@ void cache_init(struct cache_head *h)
+ time_t now = get_seconds();
+ h->next = NULL;
+ h->flags = 0;
+- atomic_set(&h->refcnt, 0);
++ atomic_set(&h->refcnt, 1);
+ h->expiry_time = now + CACHE_NEW_EXPIRY;
+ h->last_refresh = now;
+ }
+--- linux-2.6.7/net/sunrpc/svc.c.lsec 2004-06-15 23:20:03.000000000 -0600
++++ linux-2.6.7/net/sunrpc/svc.c 2005-03-23 14:28:23.652453976 -0700
+@@ -263,6 +263,7 @@ svc_process(struct svc_serv *serv, struc
+ u32 *statp;
+ u32 dir, prog, vers, proc,
+ auth_stat, rpc_stat;
++ int auth_res;
+
+ rpc_stat = rpc_success;
+
+@@ -304,12 +305,17 @@ svc_process(struct svc_serv *serv, struc
+ rqstp->rq_vers = vers = ntohl(svc_getu32(argv)); /* version number */
+ rqstp->rq_proc = proc = ntohl(svc_getu32(argv)); /* procedure number */
+
++ progp = serv->sv_program;
+ /*
+ * Decode auth data, and add verifier to reply buffer.
+ * We do this before anything else in order to get a decent
+ * auth verifier.
+ */
+- switch (svc_authenticate(rqstp, &auth_stat)) {
++ if (progp->pg_authenticate != NULL)
++ auth_res = progp->pg_authenticate(rqstp, &auth_stat);
++ else
++ auth_res = svc_authenticate(rqstp, &auth_stat);
++ switch (auth_res) {
+ case SVC_OK:
+ break;
+ case SVC_GARBAGE:
+@@ -326,7 +332,6 @@ svc_process(struct svc_serv *serv, struc
+ goto sendit;
+ }
+
+- progp = serv->sv_program;
+ if (prog != progp->pg_prog)
+ goto err_bad_prog;
+
+--- linux-2.6.7/net/sunrpc/svcauth.c.lsec 2004-06-15 23:19:44.000000000 -0600
++++ linux-2.6.7/net/sunrpc/svcauth.c 2005-03-23 14:28:24.407339216 -0700
+@@ -156,25 +156,47 @@ static inline int auth_domain_match(stru
+ {
+ return strcmp(tmp->name, item->name) == 0;
+ }
+-DefineCacheLookup(struct auth_domain,
+- h,
+- auth_domain_lookup,
+- (struct auth_domain *item, int set),
+- /* no setup */,
+- &auth_domain_cache,
+- auth_domain_hash(item),
+- auth_domain_match(tmp, item),
+- kfree(new); if(!set) {
+- if (new)
+- write_unlock(&auth_domain_cache.hash_lock);
+- else
+- read_unlock(&auth_domain_cache.hash_lock);
+- return NULL;
+- }
+- new=item; atomic_inc(&new->h.refcnt),
+- /* no update */,
+- 0 /* no inplace updates */
+- )
++
++struct auth_domain *
++auth_domain_lookup(struct auth_domain *item, int set)
++{
++ struct auth_domain *tmp = NULL;
++ struct cache_head **hp, **head;
++ head = &auth_domain_cache.hash_table[auth_domain_hash(item)];
++
++ if (set)
++ write_lock(&auth_domain_cache.hash_lock);
++ else
++ read_lock(&auth_domain_cache.hash_lock);
++ for (hp=head; *hp != NULL; hp = &tmp->h.next) {
++ tmp = container_of(*hp, struct auth_domain, h);
++ if (!auth_domain_match(tmp, item))
++ continue;
++ cache_get(&tmp->h);
++ if (!set)
++ goto out_noset;
++ *hp = tmp->h.next;
++ tmp->h.next = NULL;
++ clear_bit(CACHE_HASHED, &tmp->h.flags);
++ auth_domain_drop(&tmp->h, &auth_domain_cache);
++ goto out_set;
++ }
++ /* Didn't find anything */
++ if (!set)
++ goto out_noset;
++ auth_domain_cache.entries++;
++out_set:
++ set_bit(CACHE_HASHED, &item->h.flags);
++ item->h.next = *head;
++ *head = &item->h;
++ write_unlock(&auth_domain_cache.hash_lock);
++ cache_fresh(&auth_domain_cache, &item->h, item->h.expiry_time);
++ cache_get(&item->h);
++ return item;
++out_noset:
++ read_unlock(&auth_domain_cache.hash_lock);
++ return tmp;
++}
+
+ struct auth_domain *auth_domain_find(char *name)
+ {
--- /dev/null
+Introduce lock-free versions of d_rehash and d_move.
+
+ fs/dcache.c | 22 ++++++++++++++++++----
+ include/linux/dcache.h | 2 ++
+ 2 files changed, 20 insertions(+), 4 deletions(-)
+
+Index: linus-2.6.7-bk5/fs/dcache.c
+===================================================================
+--- linus-2.6.7-bk5.orig/fs/dcache.c 2004-06-24 10:39:11.232154728 +0300
++++ linus-2.6.7-bk5/fs/dcache.c 2004-06-24 10:56:01.043640048 +0300
+@@ -1115,16 +1115,23 @@
+ * Adds a dentry to the hash according to its name.
+ */
+
+-void d_rehash(struct dentry * entry)
++void __d_rehash(struct dentry * entry)
+ {
+ struct hlist_head *list = d_hash(entry->d_parent, entry->d_name.hash);
+
+- spin_lock(&dcache_lock);
+ spin_lock(&entry->d_lock);
+ entry->d_flags &= ~DCACHE_UNHASHED;
+ spin_unlock(&entry->d_lock);
+ entry->d_bucket = list;
+ hlist_add_head_rcu(&entry->d_hash, list);
++}
++
++EXPORT_SYMBOL(__d_rehash);
++
++void d_rehash(struct dentry * entry)
++{
++ spin_lock(&dcache_lock);
++ __d_rehash(entry);
+ spin_unlock(&dcache_lock);
+ }
+
+@@ -1200,12 +1207,11 @@
+ * dcache entries should not be moved in this way.
+ */
+
+-void d_move(struct dentry * dentry, struct dentry * target)
++void __d_move(struct dentry * dentry, struct dentry * target)
+ {
+ if (!dentry->d_inode)
+ printk(KERN_WARNING "VFS: moving negative dcache entry\n");
+
+- spin_lock(&dcache_lock);
+ write_seqlock(&rename_lock);
+ /*
+ * XXXX: do we really need to take target->d_lock?
+@@ -1257,6 +1263,14 @@
+ spin_unlock(&target->d_lock);
+ spin_unlock(&dentry->d_lock);
+ write_sequnlock(&rename_lock);
++}
++
++EXPORT_SYMBOL(__d_move);
++
++void d_move(struct dentry *dentry, struct dentry *target)
++{
++ spin_lock(&dcache_lock);
++ __d_move(dentry, target);
+ spin_unlock(&dcache_lock);
+ }
+
+Index: linus-2.6.7-bk5/include/linux/dcache.h
+===================================================================
+--- linus-2.6.7-bk5.orig/include/linux/dcache.h 2004-06-24 10:39:29.534372368 +0300
++++ linus-2.6.7-bk5/include/linux/dcache.h 2004-06-24 10:53:10.319594048 +0300
+@@ -227,6 +227,7 @@
+ * This adds the entry to the hash queues.
+ */
+ extern void d_rehash(struct dentry *);
++extern void __d_rehash(struct dentry *);
+
+ /**
+ * d_add - add dentry to hash queues
+@@ -245,6 +246,7 @@
+
+ /* used for rename() and baskets */
+ extern void d_move(struct dentry *, struct dentry *);
++extern void __d_move(struct dentry *, struct dentry *);
+
+ /* appendix may either be NULL or be used for transname suffixes */
+ extern struct dentry * d_lookup(struct dentry *, struct qstr *);
--- /dev/null
+%diffstat
+ fs/dcache.c | 7 +++++++
+ include/linux/dcache.h | 1 +
+ 2 files changed, 8 insertions(+)
+
+%patch
+Index: linux-2.6.6/fs/dcache.c
+===================================================================
+--- linux-2.6.6.orig/fs/dcache.c 2004-05-22 02:11:17.000000000 +0800
++++ linux-2.6.6/fs/dcache.c 2004-05-22 02:14:46.000000000 +0800
+@@ -217,6 +217,13 @@ int d_invalidate(struct dentry * dentry)
+ spin_unlock(&dcache_lock);
+ return 0;
+ }
++
++ /* network invalidation by Lustre */
++ if (dentry->d_flags & DCACHE_LUSTRE_INVALID) {
++ spin_unlock(&dcache_lock);
++ return 0;
++ }
++
+ /*
+ * Check whether to do a partial shrink_dcache
+ * to get rid of unused child entries.
+Index: linux-2.6.6/include/linux/dcache.h
+===================================================================
+--- linux-2.6.6.orig/include/linux/dcache.h 2004-05-22 02:10:01.000000000 +0800
++++ linux-2.6.6/include/linux/dcache.h 2004-05-22 02:15:17.000000000 +0800
+@@ -153,6 +153,7 @@ d_iput: no no yes
+
+ #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. */
+ #define DCACHE_UNHASHED 0x0010
++#define DCACHE_LUSTRE_INVALID 0x0020 /* invalidated by Lustre */
+
+ extern spinlock_t dcache_lock;
+
+
--- /dev/null
+Index: linux-2.6.6/fs/namei.c
+===================================================================
+--- linux-2.6.6.orig/fs/namei.c 2004-05-30 23:17:06.267030976 +0300
++++ linux-2.6.6/fs/namei.c 2004-05-30 23:23:15.642877312 +0300
+@@ -1270,7 +1270,7 @@
+ if (!error) {
+ DQUOT_INIT(inode);
+
+- error = do_truncate(dentry, 0);
++ error = do_truncate(dentry, 0, 1);
+ }
+ put_write_access(inode);
+ if (error)
+Index: linux-2.6.6/fs/open.c
+===================================================================
+--- linux-2.6.6.orig/fs/open.c 2004-05-30 20:05:26.857206992 +0300
++++ linux-2.6.6/fs/open.c 2004-05-30 23:24:38.908219056 +0300
+@@ -189,7 +189,7 @@
+ return error;
+ }
+
+-int do_truncate(struct dentry *dentry, loff_t length)
++int do_truncate(struct dentry *dentry, loff_t length, int called_from_open)
+ {
+ int err;
+ struct iattr newattrs;
+@@ -202,6 +202,8 @@
+ newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
+ down(&dentry->d_inode->i_sem);
+ down_write(&dentry->d_inode->i_alloc_sem);
++ if (called_from_open)
++ newattrs.ia_valid |= ATTR_FROM_OPEN;
+ err = notify_change(dentry, &newattrs);
+ up_write(&dentry->d_inode->i_alloc_sem);
+ up(&dentry->d_inode->i_sem);
+@@ -259,7 +261,7 @@
+ error = locks_verify_truncate(inode, NULL, length);
+ if (!error) {
+ DQUOT_INIT(inode);
+- error = do_truncate(nd.dentry, length);
++ error = do_truncate(nd.dentry, length, 0);
+ }
+ put_write_access(inode);
+
+@@ -311,7 +313,7 @@
+
+ error = locks_verify_truncate(inode, file, length);
+ if (!error)
+- error = do_truncate(dentry, length);
++ error = do_truncate(dentry, length, 0);
+ out_putf:
+ fput(file);
+ out:
+Index: linux-2.6.6/fs/exec.c
+===================================================================
+--- linux-2.6.6.orig/fs/exec.c 2004-05-30 20:05:26.862206232 +0300
++++ linux-2.6.6/fs/exec.c 2004-05-30 23:23:15.648876400 +0300
+@@ -1395,7 +1395,7 @@
+ goto close_fail;
+ if (!file->f_op->write)
+ goto close_fail;
+- if (do_truncate(file->f_dentry, 0) != 0)
++ if (do_truncate(file->f_dentry, 0, 0) != 0)
+ goto close_fail;
+
+ retval = binfmt->core_dump(signr, regs, file);
+Index: linux-2.6.6/include/linux/fs.h
+===================================================================
+--- linux-2.6.6.orig/include/linux/fs.h 2004-05-30 23:20:11.979798344 +0300
++++ linux-2.6.6/include/linux/fs.h 2004-05-30 23:25:29.167578472 +0300
+@@ -249,6 +249,7 @@
+ #define ATTR_ATTR_FLAG 1024
+ #define ATTR_KILL_SUID 2048
+ #define ATTR_KILL_SGID 4096
++#define ATTR_FROM_OPEN 16384 /* called from open path, ie O_TRUNC */
+
+ /*
+ * This is the Inode Attributes structure, used for notify_change(). It
+@@ -1189,7 +1190,7 @@
+
+ /* fs/open.c */
+
+-extern int do_truncate(struct dentry *, loff_t start);
++extern int do_truncate(struct dentry *, loff_t start, int called_from_open);
+ extern struct file *filp_open(const char *, int, int);
+ extern struct file * dentry_open(struct dentry *, struct vfsmount *, int);
+ extern struct file * dentry_open_it(struct dentry *, struct vfsmount *, int, struct lookup_intent *);
--- /dev/null
+Index: linux-2.6.7/fs/namespace.c
+===================================================================
+--- linux-2.6.7.orig/fs/namespace.c 2004-11-21 00:25:13.000000000 +0200
++++ linux-2.6.7/fs/namespace.c 2004-11-21 00:25:15.000000000 +0200
+@@ -360,7 +360,7 @@
+ }
+ }
+
+-static int do_umount(struct vfsmount *mnt, int flags)
++int do_umount(struct vfsmount *mnt, int flags)
+ {
+ struct super_block * sb = mnt->mnt_sb;
+ int retval;
+@@ -434,6 +434,8 @@
+ return retval;
+ }
+
++EXPORT_SYMBOL(do_umount);
++
+ /*
+ * Now umount can handle mount points as well as block devices.
+ * This is important for filesystems which use unnamed block devices.
+Index: linux-2.6.7/include/linux/mount.h
+===================================================================
+--- linux-2.6.7.orig/include/linux/mount.h 2004-11-21 00:25:13.000000000 +0200
++++ linux-2.6.7/include/linux/mount.h 2005-01-11 15:28:26.627030408 +0200
+@@ -56,6 +56,7 @@
+ extern struct vfsmount *alloc_vfsmnt(const char *name);
+ extern struct vfsmount *do_kern_mount(const char *fstype, int flags,
+ const char *name, void *data);
++extern int do_umount(struct vfsmount *mnt, int flags);
+ extern spinlock_t vfsmount_lock;
+
+ #endif
--- /dev/null
+Index: linus-2.6.7-bk-latest/include/linux/namei.h
+===================================================================
+--- linus-2.6.7-bk-latest.orig/include/linux/namei.h 2004-07-07 10:56:34.232378296 +0300
++++ linus-2.6.7-bk-latest/include/linux/namei.h 2004-07-07 11:41:48.569736296 +0300
+@@ -2,13 +2,40 @@
+ #define _LINUX_NAMEI_H
+
+ #include <linux/linkage.h>
++#include <linux/string.h>
+
+ struct vfsmount;
+
++/* intent opcodes */
++#define IT_OPEN (1)
++#define IT_CREAT (1<<1)
++#define IT_READDIR (1<<2)
++#define IT_GETATTR (1<<3)
++#define IT_LOOKUP (1<<4)
++#define IT_UNLINK (1<<5)
++#define IT_TRUNC (1<<6)
++#define IT_GETXATTR (1<<7)
++
++#define INTENT_MAGIC 0x19620323
++
+ struct open_intent {
++ int magic;
++ int op;
++ void (*op_release)(struct open_intent *);
+ int flags;
+ int create_mode;
++ union {
++ void *fs_data; /* FS-specific intent data */
++ } d;
+ };
+
++static inline void intent_init(struct open_intent *it, int op)
++{
++ memset(it, 0, sizeof(*it));
++ it->magic = INTENT_MAGIC;
++ it->op = op;
++}
++
++
+ struct nameidata {
+ struct dentry *dentry;
+@@ -53,14 +76,22 @@
+ #define LOOKUP_ACCESS (0x0400)
+
+ extern int FASTCALL(__user_walk(const char __user *, unsigned, struct nameidata *));
++extern int FASTCALL(__user_walk_it(const char __user *, unsigned, struct nameidata *));
+ #define user_path_walk(name,nd) \
+ __user_walk(name, LOOKUP_FOLLOW, nd)
++#define user_path_walk_it(name,nd) \
++ __user_walk_it(name, LOOKUP_FOLLOW, nd)
+ #define user_path_walk_link(name,nd) \
+ __user_walk(name, 0, nd)
++#define user_path_walk_link_it(name,nd) \
++ __user_walk_it(name, 0, nd)
+ extern int FASTCALL(path_lookup(const char *, unsigned, struct nameidata *));
++extern int FASTCALL(path_lookup_it(const char *, unsigned, struct nameidata *));
+ extern int FASTCALL(path_walk(const char *, struct nameidata *));
++extern int FASTCALL(path_walk_it(const char *, struct nameidata *));
+ extern int FASTCALL(link_path_walk(const char *, struct nameidata *));
+ extern void path_release(struct nameidata *);
++extern void intent_release(struct open_intent *);
+
+ extern struct dentry * lookup_one_len(const char *, struct dentry *, int);
+ extern struct dentry * lookup_hash(struct qstr *, struct dentry *);
+Index: linus-2.6.7-bk-latest/include/linux/fs.h
+===================================================================
+--- linus-2.6.7-bk-latest.orig/include/linux/fs.h 2004-07-07 10:56:33.720456120 +0300
++++ linus-2.6.7-bk-latest/include/linux/fs.h 2004-07-07 11:38:42.864967712 +0300
+@@ -583,6 +583,7 @@
+ spinlock_t f_ep_lock;
+ #endif /* #ifdef CONFIG_EPOLL */
+ struct address_space *f_mapping;
++ struct open_intent *f_it;
+ };
+ extern spinlock_t files_lock;
+ #define file_list_lock() spin_lock(&files_lock);
+@@ -1201,6 +1202,7 @@
+ extern int do_truncate(struct dentry *, loff_t start);
+ extern struct file *filp_open(const char *, int, int);
+ extern struct file * dentry_open(struct dentry *, struct vfsmount *, int);
++extern struct file * dentry_open_it(struct dentry *, struct vfsmount *, int, struct open_intent *);
+ extern int filp_close(struct file *, fl_owner_t id);
+ extern char * getname(const char __user *);
+
+Index: linus-2.6.7-bk-latest/fs/namei.c
+===================================================================
+--- linus-2.6.7-bk-latest.orig/fs/namei.c 2004-07-07 10:56:13.455536856 +0300
++++ linus-2.6.7-bk-latest/fs/namei.c 2004-07-07 11:38:42.866967408 +0300
+@@ -272,8 +272,19 @@
+ return 0;
+ }
+
++void intent_release(struct open_intent *it)
++{
++ if (!it)
++ return;
++ if (it->magic != INTENT_MAGIC)
++ return;
++ if (it->op_release)
++ it->op_release(it);
++}
++
+ void path_release(struct nameidata *nd)
+ {
++ intent_release(&nd->intent.open);
+ dput(nd->dentry);
+ mntput(nd->mnt);
+ }
+@@ -790,8 +801,14 @@
+ return err;
+ }
+
++int fastcall path_walk_it(const char * name, struct nameidata *nd)
++{
++ current->total_link_count = 0;
++ return link_path_walk(name, nd);
++}
+ int fastcall path_walk(const char * name, struct nameidata *nd)
+ {
++ intent_init(&nd->intent.open, IT_LOOKUP);
+ current->total_link_count = 0;
+ return link_path_walk(name, nd);
+ }
+@@ -800,7 +817,7 @@
+ /* returns 1 if everything is done */
+ static int __emul_lookup_dentry(const char *name, struct nameidata *nd)
+ {
+- if (path_walk(name, nd))
++ if (path_walk_it(name, nd))
+ return 0; /* something went wrong... */
+
+ if (!nd->dentry->d_inode || S_ISDIR(nd->dentry->d_inode->i_mode)) {
+@@ -878,7 +895,18 @@
+ return 1;
+ }
+
+-int fastcall path_lookup(const char *name, unsigned int flags, struct nameidata *nd)
++static inline int it_mode_from_lookup_flags(int flags)
++{
++ int mode = IT_LOOKUP;
++
++ if (flags & LOOKUP_OPEN)
++ mode = IT_OPEN;
++ if (flags & LOOKUP_CREATE)
++ mode |= IT_CREAT;
++ return mode;
++}
++
++int fastcall path_lookup_it(const char *name, unsigned int flags, struct nameidata *nd)
+ {
+ int retval;
+
+@@ -914,6 +942,12 @@
+ return retval;
+ }
+
++int fastcall path_lookup(const char *name, unsigned int flags, struct nameidata *nd)
++{
++ intent_init(&nd->intent.open, it_mode_from_lookup_flags(flags));
++ return path_lookup_it(name, flags, nd);
++}
++
+ /*
+ * Restricted form of lookup. Doesn't follow links, single-component only,
+ * needs parent already locked. Doesn't follow mounts.
+@@ -964,7 +998,7 @@
+ }
+
+ /* SMP-safe */
+-struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
++struct dentry * lookup_one_len_it(const char * name, struct dentry * base, int len, struct nameidata *nd)
+ {
+ unsigned long hash;
+ struct qstr this;
+@@ -984,11 +1018,16 @@
+ }
+ this.hash = end_name_hash(hash);
+
+- return lookup_hash(&this, base);
++ return __lookup_hash(&this, base, nd);
+ access:
+ return ERR_PTR(-EACCES);
+ }
+
++struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
++{
++ return lookup_one_len_it(name, base, len, NULL);
++}
++
+ /*
+ * namei()
+ *
+@@ -1000,18 +1039,24 @@
+ * that namei follows links, while lnamei does not.
+ * SMP-safe
+ */
+-int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd)
++int fastcall __user_walk_it(const char __user *name, unsigned flags, struct nameidata *nd)
+ {
+ char *tmp = getname(name);
+ int err = PTR_ERR(tmp);
+
+ if (!IS_ERR(tmp)) {
+- err = path_lookup(tmp, flags, nd);
++ err = path_lookup_it(tmp, flags, nd);
+ putname(tmp);
+ }
+ return err;
+ }
+
++int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd)
++{
++ intent_init(&nd->intent.open, it_mode_from_lookup_flags(flags));
++ return __user_walk_it(name, flags, nd);
++}
++
+ /*
+ * It's inline, so penalty for filesystems that don't use sticky bit is
+ * minimal.
+@@ -1296,7 +1341,7 @@
+ * The simplest case - just a plain lookup.
+ */
+ if (!(flag & O_CREAT)) {
+- error = path_lookup(pathname, lookup_flags(flag)|LOOKUP_OPEN, nd);
++ error = path_lookup_it(pathname, lookup_flags(flag), nd);
+ if (error)
+ return error;
+ goto ok;
+@@ -1305,7 +1350,8 @@
+ /*
+ * Create - we need to know the parent.
+ */
+- error = path_lookup(pathname, LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE, nd);
++ nd->intent.open.op |= IT_CREAT;
++ error = path_lookup_it(pathname, LOOKUP_PARENT, nd);
+ if (error)
+ return error;
+
+@@ -2214,6 +2260,7 @@
+ static int __vfs_follow_link(struct nameidata *nd, const char *link)
+ {
+ int res = 0;
++ struct open_intent it = nd->intent.open;
+ char *name;
+ if (IS_ERR(link))
+ goto fail;
+@@ -2224,6 +2271,10 @@
+ /* weird __emul_prefix() stuff did it */
+ goto out;
+ }
++ intent_release(&nd->intent.open);
++ intent_init(&nd->intent.open, it.op);
++ nd->intent.open.flags = it.flags;
++ nd->intent.open.create_mode = it.create_mode;
+ res = link_path_walk(link, nd);
+ out:
+ if (nd->depth || res || nd->last_type!=LAST_NORM)
+@@ -2322,6 +2372,7 @@
+ return res;
+ }
+
++
+ int page_symlink(struct inode *inode, const char *symname, int len)
+ {
+ struct address_space *mapping = inode->i_mapping;
+@@ -2385,8 +2436,10 @@
+ EXPORT_SYMBOL(page_symlink);
+ EXPORT_SYMBOL(page_symlink_inode_operations);
+ EXPORT_SYMBOL(path_lookup);
++EXPORT_SYMBOL(path_lookup_it);
+ EXPORT_SYMBOL(path_release);
+ EXPORT_SYMBOL(path_walk);
++EXPORT_SYMBOL(path_walk_it);
+ EXPORT_SYMBOL(permission);
+ EXPORT_SYMBOL(unlock_rename);
+ EXPORT_SYMBOL(vfs_create);
+Index: linus-2.6.7-bk-latest/fs/open.c
+===================================================================
+--- linus-2.6.7-bk-latest.orig/fs/open.c 2004-07-07 10:56:13.610513296 +0300
++++ linus-2.6.7-bk-latest/fs/open.c 2004-07-07 11:38:42.867967256 +0300
+@@ -216,11 +216,12 @@
+ struct inode * inode;
+ int error;
+
++ intent_init(&nd.intent.open, IT_GETATTR);
+ error = -EINVAL;
+ if (length < 0) /* sorry, but loff_t says... */
+ goto out;
+
+- error = user_path_walk(path, &nd);
++ error = user_path_walk_it(path, &nd);
+ if (error)
+ goto out;
+ inode = nd.dentry->d_inode;
+@@ -475,6 +476,7 @@
+ kernel_cap_t old_cap;
+ int res;
+
++ intent_init(&nd.intent.open, IT_GETATTR);
+ if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */
+ return -EINVAL;
+
+@@ -498,7 +500,7 @@
+ else
+ current->cap_effective = current->cap_permitted;
+
+- res = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd);
++ res = __user_walk_it(filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd);
+ if (!res) {
+ res = permission(nd.dentry->d_inode, mode, &nd);
+ /* SuS v2 requires we report a read only fs too */
+@@ -520,7 +522,8 @@
+ struct nameidata nd;
+ int error;
+
+- error = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd);
++ intent_init(&nd.intent.open, IT_GETATTR);
++ error = __user_walk_it(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd);
+ if (error)
+ goto out;
+
+@@ -571,7 +574,8 @@
+ struct nameidata nd;
+ int error;
+
+- error = __user_walk(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd);
++ intent_init(&nd.intent.open, IT_GETATTR);
++ error = __user_walk_it(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd);
+ if (error)
+ goto out;
+
+@@ -754,6 +758,7 @@
+ {
+ int namei_flags, error;
+ struct nameidata nd;
++ intent_init(&nd.intent.open, IT_OPEN);
+
+ namei_flags = flags;
+ if ((namei_flags+1) & O_ACCMODE)
+@@ -763,14 +768,14 @@
+
+ error = open_namei(filename, namei_flags, mode, &nd);
+ if (!error)
+- return dentry_open(nd.dentry, nd.mnt, flags);
++ return dentry_open_it(nd.dentry, nd.mnt, flags, &nd.intent.open);
+
+ return ERR_PTR(error);
+ }
+
+ EXPORT_SYMBOL(filp_open);
+
+-struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
++struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, int flags, struct open_intent *it)
+ {
+ struct file * f;
+ struct inode *inode;
+@@ -782,6 +787,7 @@
+ goto cleanup_dentry;
+ f->f_flags = flags;
+ f->f_mode = (flags+1) & O_ACCMODE;
++ f->f_it = it;
+ inode = dentry->d_inode;
+ if (f->f_mode & FMODE_WRITE) {
+ error = get_write_access(inode);
+@@ -800,6 +806,7 @@
+ error = f->f_op->open(inode,f);
+ if (error)
+ goto cleanup_all;
++ intent_release(it);
+ }
+ f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
+
+@@ -825,11 +832,20 @@
+ cleanup_file:
+ put_filp(f);
+ cleanup_dentry:
++ intent_release(it);
+ dput(dentry);
+ mntput(mnt);
+ return ERR_PTR(error);
+ }
+
++struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
++{
++ struct open_intent it;
++ intent_init(&it, IT_LOOKUP);
++
++ return dentry_open_it(dentry, mnt, flags, &it);
++}
++
+ EXPORT_SYMBOL(dentry_open);
+
+ /*
+Index: linus-2.6.7-bk-latest/fs/stat.c
+===================================================================
+--- linus-2.6.7-bk-latest.orig/fs/stat.c 2004-07-07 10:56:13.635509496 +0300
++++ linus-2.6.7-bk-latest/fs/stat.c 2004-07-07 11:38:42.868967104 +0300
+@@ -59,15 +59,15 @@
+ }
+ return 0;
+ }
+-
+ EXPORT_SYMBOL(vfs_getattr);
+
+ int vfs_stat(char __user *name, struct kstat *stat)
+ {
+ struct nameidata nd;
+ int error;
++ intent_init(&nd.intent.open, IT_GETATTR);
+
+- error = user_path_walk(name, &nd);
++ error = user_path_walk_it(name, &nd);
+ if (!error) {
+ error = vfs_getattr(nd.mnt, nd.dentry, stat);
+ path_release(&nd);
+@@ -81,8 +81,9 @@
+ {
+ struct nameidata nd;
+ int error;
++ intent_init(&nd.intent.open, IT_GETATTR);
+
+- error = user_path_walk_link(name, &nd);
++ error = user_path_walk_link_it(name, &nd);
+ if (!error) {
+ error = vfs_getattr(nd.mnt, nd.dentry, stat);
+ path_release(&nd);
+@@ -96,9 +97,12 @@
+ {
+ struct file *f = fget(fd);
+ int error = -EBADF;
++ struct nameidata nd;
++ intent_init(&nd.intent.open, IT_GETATTR);
+
+ if (f) {
+ error = vfs_getattr(f->f_vfsmnt, f->f_dentry, stat);
++ intent_release(&nd.intent.open);
+ fput(f);
+ }
+ return error;
+Index: linus-2.6.7-bk-latest/fs/namespace.c
+===================================================================
+--- linus-2.6.7-bk-latest.orig/fs/namespace.c 2004-07-07 10:56:13.605514056 +0300
++++ linus-2.6.7-bk-latest/fs/namespace.c 2004-07-07 11:38:42.868967104 +0300
+@@ -117,6 +117,7 @@
+
+ static void detach_mnt(struct vfsmount *mnt, struct nameidata *old_nd)
+ {
++ memset(old_nd, 0, sizeof(*old_nd));
+ old_nd->dentry = mnt->mnt_mountpoint;
+ old_nd->mnt = mnt->mnt_parent;
+ mnt->mnt_parent = mnt;
+Index: linus-2.6.7-bk-latest/fs/exec.c
+===================================================================
+--- linus-2.6.7-bk-latest.orig/fs/exec.c 2004-07-07 10:56:13.395545976 +0300
++++ linus-2.6.7-bk-latest/fs/exec.c 2004-07-07 11:38:42.869966952 +0300
+@@ -121,8 +121,9 @@
+ struct nameidata nd;
+ int error;
+
++ intent_init(&nd.intent.open, IT_OPEN);
+ nd.intent.open.flags = FMODE_READ;
+- error = __user_walk(library, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd);
++ error = user_path_walk_it(library, &nd);
+ if (error)
+ goto out;
+
+@@ -134,7 +135,7 @@
+ if (error)
+ goto exit;
+
+- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &nd.intent.open);
+ error = PTR_ERR(file);
+ if (IS_ERR(file))
+ goto out;
+@@ -474,8 +475,9 @@
+ int err;
+ struct file *file;
+
++ intent_init(&nd.intent.open, IT_OPEN);
+ nd.intent.open.flags = FMODE_READ;
+- err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd);
++ err = path_lookup_it(name, LOOKUP_FOLLOW, &nd);
+ file = ERR_PTR(err);
+
+ if (!err) {
+@@ -488,7 +490,7 @@
+ err = -EACCES;
+ file = ERR_PTR(err);
+ if (!err) {
+- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &nd.intent.open);
+ if (!IS_ERR(file)) {
+ err = deny_write_access(file);
+ if (err) {
+Index: linus-2.6.7-bk-latest/fs/xattr.c
+===================================================================
+--- linus-2.6.7-bk-latest.orig/fs/xattr.c 2004-07-07 10:56:13.643508280 +0300
++++ linus-2.6.7-bk-latest/fs/xattr.c 2004-07-07 11:38:42.870966800 +0300
+@@ -161,7 +161,8 @@
+ struct nameidata nd;
+ ssize_t error;
+
+- error = user_path_walk(path, &nd);
++ intent_init(&nd.intent.open, IT_GETXATTR);
++ error = user_path_walk_it(path, &nd);
+ if (error)
+ return error;
+ error = getxattr(nd.dentry, name, value, size);
+@@ -176,7 +177,8 @@
+ struct nameidata nd;
+ ssize_t error;
+
+- error = user_path_walk_link(path, &nd);
++ intent_init(&nd.intent.open, IT_GETXATTR);
++ error = user_path_walk_link_it(path, &nd);
+ if (error)
+ return error;
+ error = getxattr(nd.dentry, name, value, size);
+@@ -242,7 +244,8 @@
+ struct nameidata nd;
+ ssize_t error;
+
+- error = user_path_walk(path, &nd);
++ intent_init(&nd.intent.open, IT_GETXATTR);
++ error = user_path_walk_it(path, &nd);
+ if (error)
+ return error;
+ error = listxattr(nd.dentry, list, size);
+@@ -256,7 +259,8 @@
+ struct nameidata nd;
+ ssize_t error;
+
+- error = user_path_walk_link(path, &nd);
++ intent_init(&nd.intent.open, IT_GETXATTR);
++ error = user_path_walk_link_it(path, &nd);
+ if (error)
+ return error;
+ error = listxattr(nd.dentry, list, size);
+
+--- linux-2.6.7.orig/include/linux/mount.h 2004-06-16 13:18:57.000000000 +0800
++++ linux-2.6.7/include/linux/mount.h 2004-09-06 21:05:29.000000000 +0800
+@@ -31,6 +31,8 @@
+ int mnt_flags;
+ char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */
+ struct list_head mnt_list;
++ struct list_head mnt_lustre_list; /* GNS mount list */
++ unsigned long mnt_last_used; /* for GNS auto-umount (jiffies) */
+ };
+
+ static inline struct vfsmount *mntget(struct vfsmount *mnt)
--- /dev/null
+Index: linus-2.6.7/fs/namei.c
+===================================================================
+--- linus-2.6.7.orig/fs/namei.c 2005-03-05 20:24:52.000000000 +0200
++++ linus-2.6.7/fs/namei.c 2005-03-28 17:11:20.486991680 +0300
+@@ -676,8 +676,11 @@
+ goto out_dput;
+
+ if (inode->i_op->follow_link) {
++ int saved_flags = nd->flags;
+ mntget(next.mnt);
++ nd->flags |= LOOKUP_LINK_NOTLAST;
+ err = do_follow_link(next.dentry, nd);
++ nd->flags = saved_flags;
+ dput(next.dentry);
+ mntput(next.mnt);
+ if (err)
+@@ -723,7 +726,9 @@
+ if (err < 0)
+ break;
+ }
++ nd->flags |= LOOKUP_LAST;
+ err = do_lookup(nd, &this, &next);
++ nd->flags &= ~LOOKUP_LAST;
+ if (err)
+ break;
+ follow_mount(&next.mnt, &next.dentry);
+@@ -769,10 +774,14 @@
+ */
+ if (nd->dentry && nd->dentry->d_sb &&
+ (nd->dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
+- err = -ESTALE;
++ nd->flags |= LOOKUP_LAST;
++ err = !nd->dentry->d_op->d_revalidate(nd->dentry, nd);
++ nd->flags &= ~LOOKUP_LAST;
+ /* Note: we do not d_invalidate() */
+- if (!nd->dentry->d_op->d_revalidate(nd->dentry, nd))
++ if (err) {
++ err = -ESTALE;
+ break;
++ }
+ }
+ return_base:
+ return 0;
+@@ -1344,7 +1353,9 @@
+ dir = nd->dentry;
+ nd->flags &= ~LOOKUP_PARENT;
+ down(&dir->d_inode->i_sem);
++ nd->flags |= LOOKUP_LAST;
+ dentry = __lookup_hash(&nd->last, nd->dentry, nd);
++ nd->flags &= ~LOOKUP_LAST;
+
+ do_last:
+ error = PTR_ERR(dentry);
+@@ -1449,7 +1460,9 @@
+ }
+ dir = nd->dentry;
+ down(&dir->d_inode->i_sem);
++ nd->flags |= LOOKUP_LAST;
+ dentry = __lookup_hash(&nd->last, nd->dentry, nd);
++ nd->flags &= ~LOOKUP_LAST;
+ putname(nd->last.name);
+ goto do_last;
+ }
+Index: linus-2.6.7/include/linux/namei.h
+===================================================================
+--- linus-2.6.7.orig/include/linux/namei.h 2005-03-05 20:24:52.000000000 +0200
++++ linus-2.6.7/include/linux/namei.h 2005-03-05 20:24:52.000000000 +0200
+@@ -68,6 +68,9 @@
+ #define LOOKUP_CONTINUE 4
+ #define LOOKUP_PARENT 16
+ #define LOOKUP_NOALT 32
++#define LOOKUP_LAST 64
++#define LOOKUP_LINK_NOTLAST 128
++
+ /*
+ * Intent data
+ */
/* In order to reduce some races, while at the same time doing additional
* checking and hopefully speeding things up, we copy filenames to the
* kernel data space before using them..
-@@ -362,10 +394,11 @@
+@@ -362,8 +394,9 @@
+ {
struct dentry * result;
struct inode *dir = parent->d_inode;
- int counter = 0;
+ void *lock;
- again:
- counter++;
- down(&dir->i_sem);
+ lock = lock_dir(dir, name);
/*
out2:
path_release(&nd);
out:
-@@ -1765,14 +1798,14 @@
- goto exit1;
- }
-
+@@ -1735,14 +1735,14 @@
+ error = -EBUSY;
+ goto exit1;
+ }
- down(&nd.dentry->d_inode->i_sem);
+ nd.lock = lock_dir(nd.dentry->d_inode, &nd.last);
dentry = lookup_hash(&nd.last, nd.dentry);
exit1:
path_release(&nd);
exit:
-@@ -1842,7 +1875,7 @@
- if (error != -EOPNOTSUPP)
- goto exit1;
- }
+@@ -1808,7 +1808,7 @@
+ error = -EISDIR;
+ if (nd.last_type != LAST_NORM)
+ goto exit1;
- down(&nd.dentry->d_inode->i_sem);
+ nd.lock = lock_dir(nd.dentry->d_inode, &nd.last);
dentry = lookup_hash(&nd.last, nd.dentry);
@@ -52,6 +52,7 @@
unsigned int flags;
int last_type;
- struct lookup_intent intent;
+ void *lock;
- };
- /*
+ /* Intent data */
+ union {
+ struct open_intent open;
--- /dev/null
+Index: linus-2.6.7/fs/namei.c
+===================================================================
+--- linus-2.6.7.orig/fs/namei.c 2005-03-05 20:24:52.000000000 +0200
++++ linus-2.6.7/fs/namei.c 2005-03-23 13:37:48.563339840 +0200
+@@ -758,14 +758,20 @@
+ lookup_parent:
+ nd->last = this;
+ nd->last_type = LAST_NORM;
+- if (this.name[0] != '.')
+- goto return_base;
+- if (this.len == 1)
+- nd->last_type = LAST_DOT;
+- else if (this.len == 2 && this.name[1] == '.')
+- nd->last_type = LAST_DOTDOT;
+- else
+- goto return_base;
++ if (this.name[0] == '.') {
++ if (this.len == 1)
++ nd->last_type = LAST_DOT;
++ else if (this.len == 2 && this.name[1] == '.')
++ nd->last_type = LAST_DOTDOT;
++ }
++
++ if ((nd->last_type == LAST_NORM) && inode->i_op &&
++ inode->i_op->endparentlookup) {
++ err = inode->i_op->endparentlookup(nd);
++ if (err)
++ break;
++ }
++ goto return_base;
+ return_reval:
+ /*
+ * We bypassed the ordinary revalidation routines.
+@@ -1535,9 +1541,16 @@
+ if (IS_ERR(tmp))
+ return PTR_ERR(tmp);
+
+- error = path_lookup(tmp, LOOKUP_PARENT, &nd);
++ intent_init(&nd.intent.open, IT_MKNOD);
++ nd.intent.open.create_mode = mode;
++ nd.intent.open.create.dev = dev;
++
++ error = path_lookup_it(tmp, LOOKUP_PARENT, &nd);
+ if (error)
+ goto out;
++ if (nd.intent.open.flags & IT_STATUS_RAW)
++ goto out2;
++
+ dentry = lookup_create(&nd, 0);
+ error = PTR_ERR(dentry);
+
+@@ -1564,6 +1577,7 @@
+ dput(dentry);
+ }
+ up(&nd.dentry->d_inode->i_sem);
++out2:
+ path_release(&nd);
+ out:
+ putname(tmp);
+@@ -1606,9 +1620,13 @@
+ struct dentry *dentry;
+ struct nameidata nd;
+
+- error = path_lookup(tmp, LOOKUP_PARENT, &nd);
++ intent_init(&nd.intent.open, IT_MKDIR);
++ nd.intent.open.create_mode = mode;
++ error = path_lookup_it(tmp, LOOKUP_PARENT, &nd);
+ if (error)
+ goto out;
++ if (nd.intent.open.flags & IT_STATUS_RAW)
++ goto out2;
+ dentry = lookup_create(&nd, 1);
+ error = PTR_ERR(dentry);
+ if (!IS_ERR(dentry)) {
+@@ -1618,6 +1636,7 @@
+ dput(dentry);
+ }
+ up(&nd.dentry->d_inode->i_sem);
++out2:
+ path_release(&nd);
+ out:
+ putname(tmp);
+@@ -1703,9 +1722,12 @@
+ if(IS_ERR(name))
+ return PTR_ERR(name);
+
+- error = path_lookup(name, LOOKUP_PARENT, &nd);
++ intent_init(&nd.intent.open, IT_RMDIR);
++ error = path_lookup_it(name, LOOKUP_PARENT, &nd);
+ if (error)
+ goto exit;
++ if (nd.intent.open.flags & IT_STATUS_RAW)
++ goto exit1;
+
+ switch(nd.last_type) {
+ case LAST_DOTDOT:
+@@ -1781,9 +1803,13 @@
+ if(IS_ERR(name))
+ return PTR_ERR(name);
+
+- error = path_lookup(name, LOOKUP_PARENT, &nd);
++ intent_init(&nd.intent.open, IT_UNLINK);
++ error = path_lookup_it(name, LOOKUP_PARENT, &nd);
+ if (error)
+ goto exit;
++ if (nd.intent.open.flags & IT_STATUS_RAW)
++ goto exit1;
++
+ error = -EISDIR;
+ if (nd.last_type != LAST_NORM)
+ goto exit1;
+@@ -1855,9 +1881,13 @@
+ struct dentry *dentry;
+ struct nameidata nd;
+
+- error = path_lookup(to, LOOKUP_PARENT, &nd);
++ intent_init(&nd.intent.open, IT_SYMLINK);
++ nd.intent.open.create.link = from;
++ error = path_lookup_it(to, LOOKUP_PARENT, &nd);
+ if (error)
+ goto out;
++ if (nd.intent.open.flags & IT_STATUS_RAW)
++ goto out2;
+ dentry = lookup_create(&nd, 0);
+ error = PTR_ERR(dentry);
+ if (!IS_ERR(dentry)) {
+@@ -1865,6 +1895,7 @@
+ dput(dentry);
+ }
+ up(&nd.dentry->d_inode->i_sem);
++out2:
+ path_release(&nd);
+ out:
+ putname(to);
+@@ -1936,9 +1967,13 @@
+ error = __user_walk(oldname, 0, &old_nd);
+ if (error)
+ goto exit;
+- error = path_lookup(to, LOOKUP_PARENT, &nd);
++ intent_init(&nd.intent.open, IT_LINK);
++ nd.intent.open.create.source_nd = &old_nd;
++ error = path_lookup_it(to, LOOKUP_PARENT, &nd);
+ if (error)
+ goto out;
++ if (nd.intent.open.flags & IT_STATUS_RAW)
++ goto out_release;
+ error = -EXDEV;
+ if (old_nd.mnt != nd.mnt)
+ goto out_release;
+@@ -2119,9 +2154,18 @@
+ if (error)
+ goto exit;
+
+- error = path_lookup(newname, LOOKUP_PARENT, &newnd);
++ error = -EBUSY;
++ if (oldnd.last_type != LAST_NORM)
++ goto exit1;
++
++ intent_init(&newnd.intent.open, IT_RENAME);
++ newnd.intent.open.create.source_nd = &oldnd;
++ error = path_lookup_it(newname, LOOKUP_PARENT, &newnd);
+ if (error)
+ goto exit1;
++ if (newnd.intent.open.flags & IT_STATUS_RAW) {
++ goto exit2;
++ }
+
+ error = -EXDEV;
+ if (oldnd.mnt != newnd.mnt)
+@@ -2129,8 +2173,6 @@
+
+ old_dir = oldnd.dentry;
+ error = -EBUSY;
+- if (oldnd.last_type != LAST_NORM)
+- goto exit2;
+
+ new_dir = newnd.dentry;
+ if (newnd.last_type != LAST_NORM)
+@@ -2238,6 +2280,7 @@
+ intent_init(&nd->intent.open, it.op);
+ nd->intent.open.flags = it.flags;
+ nd->intent.open.create_mode = it.create_mode;
++ nd->intent.open.create = it.create;
+ res = link_path_walk(link, nd);
+ out:
+ if (current->link_count || res || nd->last_type!=LAST_NORM)
+Index: linus-2.6.7/include/linux/namei.h
+===================================================================
+--- linus-2.6.7.orig/include/linux/namei.h 2005-03-05 20:24:52.000000000 +0200
++++ linus-2.6.7/include/linux/namei.h 2005-03-23 13:34:56.632477304 +0200
+@@ -15,9 +15,19 @@
+ #define IT_UNLINK (1<<5)
+ #define IT_TRUNC (1<<6)
+ #define IT_GETXATTR (1<<7)
++#define IT_RMDIR (1<<8)
++#define IT_LINK (1<<9)
++#define IT_RENAME (1<<10)
++#define IT_MKDIR (1<<11)
++#define IT_MKNOD (1<<12)
++#define IT_SYMLINK (1<<13)
++#define IT_CHDIR (1<<14)
+
+ #define INTENT_MAGIC 0x19620323
+-
++#define IT_STATUS_RAW (1<<10) /* Setting this in it_flags on exit from lookup
++ means everything was done already and return
++ value from lookup is in fact status of
++ already performed operation */
+ struct open_intent {
+ int magic;
+ int op;
+@@ -25,6 +35,11 @@
+ int flags;
+ int create_mode;
+ union {
++ unsigned dev; /* For mknod */
++ char *link; /* For symlink */
++ struct nameidata *source_nd; /* For link/rename */
++ } create;
++ union {
+ void *fs_data; /* FS-specific intent data */
+ } d;
+ };
+Index: linus-2.6.7/include/linux/fs.h
+===================================================================
+--- linus-2.6.7.orig/include/linux/fs.h 2005-03-05 20:24:52.000000000 +0200
++++ linus-2.6.7/include/linux/fs.h 2005-03-23 13:35:08.796628072 +0200
+@@ -909,6 +909,7 @@
+ ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
+ ssize_t (*listxattr) (struct dentry *, char *, size_t);
+ int (*removexattr) (struct dentry *, const char *);
++ int (*endparentlookup) (struct nameidata *);
+ };
+
+ struct seq_file;
--- /dev/null
+ fs/exec.c | 4 ++--
+ include/linux/fs.h | 1 +
+ 2 files changed, 3 insertions(+), 2 deletions(-)
+
+--- linus-2.6.7-bk-latest/include/linux/fs.h.orig 2004-07-07 12:33:21.246507224 +0300
++++ linus-2.6.7-bk-latest/include/linux/fs.h 2004-07-07 12:33:55.069365368 +0300
+@@ -74,6 +74,7 @@ extern int leases_enable, dir_notify_ena
+
+ #define FMODE_READ 1
+ #define FMODE_WRITE 2
++#define FMODE_EXEC 4
+
+ #define RW_MASK 1
+ #define RWA_MASK 2
+--- linus-2.6.7-bk-latest/fs/exec.c.orig 2004-07-07 12:33:05.466906088 +0300
++++ linus-2.6.7-bk-latest/fs/exec.c 2004-07-07 12:33:38.127940856 +0300
+@@ -122,7 +122,7 @@ asmlinkage long sys_uselib(const char __
+ int error;
+
+ intent_init(&nd.intent.open, IT_OPEN);
+- nd.intent.open.flags = FMODE_READ;
++ nd.intent.open.flags = FMODE_READ|FMODE_EXEC;
+ error = user_path_walk_it(library, &nd);
+ if (error)
+ goto out;
+@@ -476,7 +476,7 @@ struct file *open_exec(const char *name)
+ struct file *file;
+
+ intent_init(&nd.intent.open, IT_OPEN);
+- nd.intent.open.flags = FMODE_READ;
++ nd.intent.open.flags = FMODE_READ|FMODE_EXEC;
+ err = path_lookup_it(name, LOOKUP_FOLLOW, &nd);
+ file = ERR_PTR(err);
+
--- /dev/null
+diff -rupN linux-2.6.7/fs/namei.c linux-2.6.7.new/fs/namei.c
+--- linux-2.6.7/fs/namei.c 2005-03-29 18:54:13.000000000 +0300
++++ linux-2.6.7.new/fs/namei.c 2005-03-31 14:42:01.605302456 +0300
+@@ -422,6 +422,16 @@ static struct dentry * real_lookup(struc
+ result = dentry;
+ }
+ unlock_dir(dir, lock);
++ if (!IS_ERR(result)) {
++ spin_lock(&result->d_lock);
++ if (result->d_flags & DCACHE_GNS_PENDING) {
++ spin_unlock(&result->d_lock);
++ if (result->d_op && result->d_op->d_revalidate)
++ result->d_op->d_revalidate(result, nd);
++ } else {
++ spin_unlock(&result->d_lock);
++ }
++ }
+ return result;
+ }
+
+diff -rupN linux-2.6.7/fs/namespace.c linux-2.6.7.new/fs/namespace.c
+--- linux-2.6.7/fs/namespace.c 2005-03-29 18:54:13.000000000 +0300
++++ linux-2.6.7.new/fs/namespace.c 2005-03-30 17:51:39.000000000 +0300
+@@ -60,6 +60,7 @@ struct vfsmount *alloc_vfsmnt(const char
+ INIT_LIST_HEAD(&mnt->mnt_child);
+ INIT_LIST_HEAD(&mnt->mnt_mounts);
+ INIT_LIST_HEAD(&mnt->mnt_list);
++ INIT_LIST_HEAD(&mnt->mnt_lustre_list);
+ if (name) {
+ int size = strlen(name)+1;
+ char *newname = kmalloc(size, GFP_KERNEL);
+@@ -173,6 +174,9 @@ void __mntput(struct vfsmount *mnt)
+ {
+ struct super_block *sb = mnt->mnt_sb;
+ dput(mnt->mnt_root);
++ spin_lock(&dcache_lock);
++ list_del(&mnt->mnt_lustre_list);
++ spin_unlock(&dcache_lock);
+ free_vfsmnt(mnt);
+ deactivate_super(sb);
+ }
+diff -rupN linux-2.6.7/include/linux/dcache.h linux-2.6.7.new/include/linux/dcache.h
+--- linux-2.6.7/include/linux/dcache.h 2005-03-29 18:54:13.000000000 +0300
++++ linux-2.6.7.new/include/linux/dcache.h 2005-03-31 14:35:51.589553400 +0300
+@@ -167,7 +167,9 @@ d_iput: no no no yes
+ #define DCACHE_UNHASHED 0x0010
+ #define DCACHE_LUSTRE_INVALID 0x0020 /* invalidated by Lustre */
+
+-#define DCACHE_CROSS_REF 0x0040 /* entry points to inode on another MDS */
++#define DCACHE_CROSS_REF 0x0040 /* entry points to inode on another MDS */
++#define DCACHE_GNS_PENDING 0x0080 /* entry is GNS pending mount point */
++#define DCACHE_GNS_MOUNTING 0x0100 /* entry is GNS mount in progress */
+
+ extern spinlock_t dcache_lock;
+
int nr_unused;
Index: linux-2.6.7/include/linux/fs.h
===================================================================
---- linux-2.6.7.orig/include/linux/fs.h 2004-08-26 17:12:41.000000000 +0400
-+++ linux-2.6.7/include/linux/fs.h 2005-01-18 11:27:18.092496832 +0300
-@@ -74,6 +74,7 @@
+--- linux-2.6.7.old/include/linux/fs.h 2005-01-31 14:27:16.000000000 +0800
++++ linux-2.6.7/include/linux/fs.h 2005-01-31 14:32:19.000000000 +0800
+@@ -74,6 +74,7 @@ extern int leases_enable, dir_notify_ena
#define FMODE_READ 1
#define FMODE_WRITE 2
#define RW_MASK 1
#define RWA_MASK 2
-@@ -250,6 +251,8 @@
+@@ -250,6 +251,13 @@ typedef void (dio_iodone_t)(struct inode
#define ATTR_ATTR_FLAG 1024
#define ATTR_KILL_SUID 2048
#define ATTR_KILL_SGID 4096
+#define ATTR_RAW 8192 /* file system, not vfs will massage attrs */
+#define ATTR_FROM_OPEN 16384 /* called from open path, ie O_TRUNC */
++
++#define ATTR_CTIME_SET 0x2000
++/* ea support */
++#define ATTR_EA 0x40000
++#define ATTR_EA_RM 0x80000
/*
* This is the Inode Attributes structure, used for notify_change(). It
-@@ -446,6 +449,7 @@
+@@ -446,6 +454,7 @@ struct inode {
struct block_device *i_bdev;
struct cdev *i_cdev;
int i_cindex;
unsigned long i_dnotify_mask; /* Directory notify events */
struct dnotify_struct *i_dnotify; /* for directory notifications */
-@@ -579,6 +583,7 @@
+@@ -579,6 +588,7 @@ struct file {
spinlock_t f_ep_lock;
#endif /* #ifdef CONFIG_EPOLL */
struct address_space *f_mapping;
};
extern spinlock_t files_lock;
#define file_list_lock() spin_lock(&files_lock);
-@@ -903,7 +908,9 @@
+@@ -903,7 +913,9 @@ struct inode_operations {
void (*truncate) (struct inode *);
int (*permission) (struct inode *, int, struct nameidata *);
int (*setattr) (struct dentry *, struct iattr *);
int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
ssize_t (*listxattr) (struct dentry *, char *, size_t);
-@@ -943,6 +950,7 @@
+@@ -943,6 +955,7 @@ struct super_operations {
int (*remount_fs) (struct super_block *, int *, char *);
void (*clear_inode) (struct inode *);
void (*umount_begin) (struct super_block *);
int (*show_options)(struct seq_file *, struct vfsmount *);
};
-@@ -1131,6 +1139,7 @@
+@@ -1131,6 +1144,7 @@ extern int unregister_filesystem(struct
extern struct vfsmount *kern_mount(struct file_system_type *);
extern int may_umount_tree(struct vfsmount *);
extern int may_umount(struct vfsmount *);
extern long do_mount(char *, char *, char *, unsigned long, void *);
extern int vfs_statfs(struct super_block *, struct kstatfs *);
-@@ -1195,6 +1204,7 @@
+@@ -1195,6 +1209,7 @@ static inline int break_lease(struct ino
extern int do_truncate(struct dentry *, loff_t start);
extern struct file *filp_open(const char *, int, int);
extern struct file * dentry_open(struct dentry *, struct vfsmount *, int);
+extern struct file * dentry_open_it(struct dentry *, struct vfsmount *, int, struct lookup_intent *);
extern int filp_close(struct file *, fl_owner_t id);
extern char * getname(const char __user *);
-
+
Index: linux-2.6.7/include/linux/namei.h
===================================================================
--- linux-2.6.7.orig/include/linux/namei.h 2003-07-24 15:52:31.000000000 +0400
uml-2.6.7-01-bb2.patch
lustre_version.patch
-vfs_intent-2.6-vanilla.patch
-vfs_nointent-2.6-vanilla.patch
-vfs_races-2.6-vanilla.patch
-vfs-wantedi-misc-2.6-suse.patch
-nfs-cifs-intent-2.6-vanilla.patch
-iopen-misc-2.6-suse.patch
-export-truncate-2.6-suse.patch
-export_symbols-2.6-suse.patch
-dev_read_only-2.6-suse.patch
-export-2.6-suse.patch
-header-guards-2.6-suse.patch
+vfs-dcache_locking-vanilla-2.6.patch
+vfs-dcache_lustre_invalid-vanilla-2.6.patch
+vfs-intent_api-vanilla-2.6.patch
+vfs-lookup_last-vanilla-2.6.patch
+vfs-raw_ops-vanilla-2.6.patch
+export-vanilla-2.6.patch
+header_guards-vanilla-2.6.patch
+vfs-do_truncate.patch
+vfs_fmode_exec-2.6.patch
+vfs-gns_export_doumount.patch
ext3-super-ntohl.patch
-lookup_bdev_init_intent.patch
-dcache-mds-num-2.6.7.patch
+dcache-mds-num-2.6.7.patch
dynamic-locks-2.6.7.patch
vfs-pdirops-2.6.7.patch
dcache-fid-2.6.7.patch
-jbd-buffer-release-2.6.7.patch
+vfs-wantedi-misc-2.6-suse.patch
+jbd-buffer-release-2.6.7.patch
+dev_read_only-2.6-suse.patch
+vfs_gns-2.6-vanilla.patch
+linux-2.6.7-CITI_NFS4_ALL-7-lsec.patch
#include <linux/lustre_mgmt.h>
#include <linux/lustre_dlm.h>
#include <linux/lustre_net.h>
+#include <linux/lustre_sec.h>
+
/* @priority: if non-zero, move the selected to the list head
* @nocreate: if non-zero, only search in existed connections
*/
int client_obd_cleanup(struct obd_device *obddev, int flags)
{
struct client_obd *cli = &obddev->u.cli;
+ ENTRY;
if (!cli->cl_import)
RETURN(-EINVAL);
dereg_f(cli->cl_mgmtcli_obd, obddev);
inter_module_put("mgmtcli_deregister_for_events");
}
+
+ /* Here we try to drop the security structure after destroy import,
+ * to avoid issue of "sleep in spinlock".
+ */
+ class_import_get(cli->cl_import);
class_destroy_import(cli->cl_import);
+ ptlrpcs_import_drop_sec(cli->cl_import);
+ class_import_put(cli->cl_import);
cli->cl_import = NULL;
ldlm_put_ref(flags & OBD_OPT_FORCE);
if (obd->obd_namespace == NULL)
GOTO(out_disco, rc = -ENOMEM);
+ rc = ptlrpcs_import_get_sec(imp);
+ if (rc != 0)
+ GOTO(out_ldlm, rc);
+
imp->imp_dlm_handle = *dlm_handle;
rc = ptlrpc_init_import(imp);
if (rc != 0)
memcpy(&conn, lustre_msg_buf(req->rq_reqmsg, 2, sizeof conn),
sizeof conn);
- if (export->exp_imp_reverse != NULL)
+ if (export->exp_imp_reverse != NULL) {
+ /* same logic as client_obd_cleanup */
+ class_import_get(export->exp_imp_reverse);
class_destroy_import(export->exp_imp_reverse);
+ ptlrpcs_import_drop_sec(export->exp_imp_reverse);
+ class_import_put(export->exp_imp_reverse);
+ }
+
+ /* for the rest part, we return -ENOTCONN in case of errors
+ * in order to let client initialize connection again.
+ */
revimp = export->exp_imp_reverse = class_new_import();
+ if (!revimp) {
+ CERROR("fail to alloc new reverse import.\n");
+ GOTO(out, rc = -ENOTCONN);
+ }
+
revimp->imp_connection = ptlrpc_connection_addref(export->exp_connection);
revimp->imp_client = &export->exp_obd->obd_ldlm_client;
revimp->imp_remote_handle = conn;
revimp->imp_obd = target;
revimp->imp_dlm_fake = 1;
revimp->imp_state = LUSTRE_IMP_FULL;
+
+ rc = ptlrpcs_import_get_sec(revimp);
+ if (rc) {
+ CERROR("reverse import can not get sec: %d\n", rc);
+ class_destroy_import(revimp);
+ export->exp_imp_reverse = NULL;
+ GOTO(out, rc = -ENOTCONN);
+ }
+
class_import_put(revimp);
rc = obd_connect_post(export, connect_flags);
{
/* exports created from last_rcvd data, and "fake"
exports created by lctl don't have an import */
- if (exp->exp_imp_reverse != NULL)
+ if (exp->exp_imp_reverse != NULL) {
+ ptlrpcs_import_drop_sec(exp->exp_imp_reverse);
class_destroy_import(exp->exp_imp_reverse);
+ }
/* We cancel locks at disconnect time, but this will catch any locks
* granted in a race with recovery-induced disconnect. */
memcpy(copy_req, orig_req, sizeof *copy_req);
memcpy(copy_reqmsg, orig_req->rq_reqmsg, orig_req->rq_reqlen);
- /* the copied req takes over the reply state */
+ /* the copied req takes over the reply state and security data */
orig_req->rq_reply_state = NULL;
+ orig_req->rq_sec_svcdata = NULL;
copy_req->rq_reqmsg = copy_reqmsg;
class_export_get(copy_req->rq_export);
}
void ptlrpc_free_clone( struct ptlrpc_request *req)
{
+ if (req->rq_svcsec)
+ svcsec_cleanup_req(req);
+
class_export_put(req->rq_export);
list_del(&req->rq_list);
OBD_FREE(req->rq_reqmsg, req->rq_reqlen);
static void target_release_saved_req(struct ptlrpc_request *req)
{
+ if (req->rq_svcsec)
+ svcsec_cleanup_req(req);
+
class_export_put(req->rq_export);
OBD_FREE(req->rq_reqmsg, req->rq_reqlen);
OBD_FREE(req, sizeof *req);
#ifdef __KERNEL__
# include <linux/slab.h>
+# include <linux/dcache.h>
+# include <linux/namei.h>
# include <linux/module.h>
# include <linux/lustre_dlm.h>
#else
## Liblustre excecutables & libraries Makefile
-SUBDIRS = . tests
+
+# FIXME: we disable building any executables for this moment.
+#SUBDIRS = . tests
AM_CPPFLAGS = $(HAVE_EFENCE) -I$(SYSIO)/include -D_LARGEFILE64_SOURCE=1 \
$(LLCPPFLAGS) -I$(top_srcdir)/portals/unals
$(top_builddir)/lustre/osc/libosc.a \
$(top_builddir)/lustre/mdc/libmdc.a \
$(top_builddir)/lustre/ptlrpc/libptlrpc.a \
+ $(top_builddir)/lustre/sec/libptlrpcs.a \
$(top_builddir)/lustre/obdclass/liblustreclass.a \
$(top_builddir)/lustre/lvfs/liblvfs.a
&data, &lockh, NULL, 0,
ldlm_completion_ast, llu_mdc_blocking_ast,
inode);
- request = (struct ptlrpc_request *)it.d.lustre.it_data;
+ request = (struct ptlrpc_request *)LUSTRE_IT(&it)->it_data;
if (request)
ptlrpc_req_finished(request);
if (rc < 0) {
static int llu_local_open(struct llu_inode_info *lli, struct lookup_intent *it)
{
- struct ptlrpc_request *req = it->d.lustre.it_data;
+ struct ptlrpc_request *req = LUSTRE_IT(it)->it_data;
struct ll_file_data *fd;
struct mds_body *body;
ENTRY;
fd->fd_mds_och.och_magic = OBD_CLIENT_HANDLE_MAGIC;
lli->lli_file_data = fd;
- mdc_set_open_replay_data(NULL, &fd->fd_mds_och, it->d.lustre.it_data);
+ mdc_set_open_replay_data(NULL, &fd->fd_mds_och, LUSTRE_IT(it)->it_data);
RETURN(0);
}
CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu\n", lli->lli_st_ino);
LL_GET_INTENT(inode, it);
- if (!it->d.lustre.it_disposition) {
+ if (!LUSTRE_IT(it)->it_disposition)
LBUG();
- }
rc = it_open_error(DISP_OPEN_OPEN, it);
if (rc)
lli->lli_open_flags = flags & ~(O_CREAT | O_EXCL | O_TRUNC);
out_release:
- request = it->d.lustre.it_data;
+ request = LUSTRE_IT(it)->it_data;
ptlrpc_req_finished(request);
it->it_op_release(it);
build_obj_list ../osc libosc.a
build_obj_list ../mdc libmdc.a
build_obj_list ../ptlrpc libptlrpc.a
+build_obj_list ../sec libptlrpcs.a
build_obj_list ../obdclass liblustreclass.a
build_obj_list ../lvfs liblvfs.a
{
struct lustre_handle *handle;
- if (it->it_op && it->d.lustre.it_lock_mode) {
- handle = (struct lustre_handle *)&it->d.lustre.it_lock_handle;
+ if (it->it_op && LUSTRE_IT(it)->it_lock_mode) {
+ handle = (struct lustre_handle *)&LUSTRE_IT(it)->it_lock_handle;
CDEBUG(D_DLMTRACE, "releasing lock with cookie "LPX64
" from it %p\n", handle->cookie, it);
- ldlm_lock_decref(handle, it->d.lustre.it_lock_mode);
+ ldlm_lock_decref(handle, LUSTRE_IT(it)->it_lock_mode);
/* bug 494: intent_release may be called multiple times, from
* this thread and we don't want to double-decref this lock */
- it->d.lustre.it_lock_mode = 0;
+ LUSTRE_IT(it)->it_lock_mode = 0;
}
}
ll_intent_drop_lock(it);
it->it_magic = 0;
it->it_op_release = 0;
- it->d.lustre.it_disposition = 0;
- it->d.lustre.it_data = NULL;
+ LUSTRE_IT(it)->it_disposition = 0;
+ LUSTRE_IT(it)->it_data = NULL;
EXIT;
}
CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%lu)\n",
inode, llu_i2info(inode)->lli_st_ino,
llu_i2info(inode)->lli_st_generation);
- mdc_set_lock_data(NULL, &it->d.lustre.it_lock_handle, inode);
+ mdc_set_lock_data(NULL, &LUSTRE_IT(it)->it_lock_handle, inode);
}
/* drop lookup/getattr locks */
valid &= src->o_valid;
if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME))
- CDEBUG(D_INODE, "valid %x, cur time %lu/%lu, new %lu/%lu\n",
- src->o_valid,
+ CDEBUG(D_INODE, "valid %llx, cur time %lu/%lu, new %lu/%lu\n",
+ (unsigned long long)src->o_valid,
LTIME_S(lli->lli_st_mtime), LTIME_S(lli->lli_st_ctime),
(long)src->o_mtime, (long)src->o_ctime);
obd_valid newvalid = 0;
if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME))
- CDEBUG(D_INODE, "valid %x, new time %lu/%lu\n",
- valid, LTIME_S(lli->lli_st_mtime),
+ CDEBUG(D_INODE, "valid %llx, new time %lu/%lu\n",
+ (unsigned long long)valid, LTIME_S(lli->lli_st_mtime),
LTIME_S(lli->lli_st_ctime));
if (valid & OBD_MD_FLATIME) {
valid |= OBD_MD_FLEASIZE;
}
ll_inode2id(&id, inode);
- rc = mdc_getattr(sbi->ll_md_exp, &id, valid, ealen, &req);
+ rc = mdc_getattr(sbi->ll_md_exp, &id, valid, NULL, 0,
+ ealen, &req);
if (rc) {
CERROR("failure %d inode %lu\n", rc, lli->lli_st_ino);
RETURN(-abs(rc));
ll_inode2id(&id, inode);
rc = mdc_getattr(sbi->ll_md_exp, &id,
- OBD_MD_LINKNAME, symlen, request);
+ OBD_MD_LINKNAME, NULL, 0, symlen, request);
if (rc) {
CERROR("inode %lu: rc = %d\n", lli->lli_st_ino, rc);
RETURN(rc);
if ((md->body->valid &
(OBD_MD_FLGENER | OBD_MD_FLID | OBD_MD_FLTYPE)) !=
(OBD_MD_FLGENER | OBD_MD_FLID | OBD_MD_FLTYPE)) {
- CERROR("bad md body valid mask 0x%x\n", md->body->valid);
+ CERROR("bad md body valid mask 0x%llx\n",
+ (unsigned long long)md->body->valid);
LBUG();
return ERR_PTR(-EPERM);
}
/* fetch attr of root inode */
err = mdc_getattr(sbi->ll_md_exp, &rootid,
- OBD_MD_FLNOTOBD|OBD_MD_FLBLOCKS, 0, &request);
+ OBD_MD_FLNOTOBD|OBD_MD_FLBLOCKS, NULL, 0,
+ 0, &request);
if (err) {
CERROR("mdc_getattr failed for root: rc = %d\n", err);
GOTO(out_lov, err);
struct ll_dentry_data *lld;
ENTRY;
LASSERT(de != NULL);
+
+ CDEBUG(D_DENTRY, "releasing dentry %p\n", de);
+
lld = ll_d2d(de);
- LASSERT(lld != NULL);
- LASSERT(lld->lld_cwd_count == 0);
- LASSERT(lld->lld_mnt_count == 0);
- OBD_FREE(de->d_fsdata, sizeof(struct ll_dentry_data));
+ if (lld) { /* Root dentry does not have ll_dentry_data */
+ LASSERT(lld->lld_cwd_count == 0);
+ LASSERT(lld->lld_mnt_count == 0);
+ OBD_FREE(de->d_fsdata, sizeof(struct ll_dentry_data));
+ }
EXIT;
}
void ll_intent_drop_lock(struct lookup_intent *it)
{
struct lustre_handle *handle;
+ struct lustre_intent_data *itdata = LUSTRE_IT(it);
- if (it->it_op && it->d.lustre.it_lock_mode) {
- handle = (struct lustre_handle *)&it->d.lustre.it_lock_handle;
+ if (it->it_op && itdata && itdata->it_lock_mode) {
+ handle = (struct lustre_handle *)&itdata->it_lock_handle;
CDEBUG(D_DLMTRACE, "releasing lock with cookie "LPX64
" from it %p\n", handle->cookie, it);
- ldlm_lock_decref(handle, it->d.lustre.it_lock_mode);
+ ldlm_lock_decref(handle, itdata->it_lock_mode);
/* bug 494: intent_release may be called multiple times, from
* this thread and we don't want to double-decref this lock */
- it->d.lustre.it_lock_mode = 0;
+ itdata->it_lock_mode = 0;
}
}
ll_intent_drop_lock(it);
it->it_magic = 0;
it->it_op_release = 0;
- it->d.lustre.it_disposition = 0;
- it->d.lustre.it_data = NULL;
+ ll_intent_free(it);
EXIT;
}
+void ll_intent_free(struct lookup_intent *it)
+{
+ if (it->d.fs_data) {
+ OBD_SLAB_FREE(it->d.fs_data, ll_intent_slab,
+ sizeof(struct lustre_intent_data));
+ it->d.fs_data = NULL;
+ }
+}
+
void ll_unhash_aliases(struct inode *inode)
{
struct list_head *tmp, *head;
LASSERT(it != NULL);
LASSERT(dentry != NULL);
- if (it->d.lustre.it_lock_mode && dentry->d_inode != NULL) {
+ if (LUSTRE_IT(it)->it_lock_mode && dentry->d_inode != NULL) {
struct inode *inode = dentry->d_inode;
CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)\n",
inode, inode->i_ino, inode->i_generation);
- mdc_set_lock_data(NULL, &it->d.lustre.it_lock_handle, inode);
+ mdc_set_lock_data(NULL, &LUSTRE_IT(it)->it_lock_handle, inode);
}
/* drop lookup or getattr locks immediately */
void ll_frob_intent(struct lookup_intent **itp, struct lookup_intent *deft)
{
struct lookup_intent *it = *itp;
-
+
#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
if (it) {
LASSERTF(it->it_magic == INTENT_MAGIC, "bad intent magic: %x\n",
if (!it || it->it_op == IT_GETXATTR)
it = *itp = deft;
+ if (it->d.fs_data)
+ return;
+
+ if (ll_intent_alloc(it)) {
+ CERROR("Failed to allocate memory for lustre specific intent "
+ "data\n");
+ /* XXX: we cannot return status just yet */
+ LBUG();
+ }
+}
+
+int ll_intent_alloc(struct lookup_intent *it)
+{
+ if (it->d.fs_data) {
+ CERROR("Intent alloc on already allocated intent\n");
+ return 0;
+ }
+ OBD_SLAB_ALLOC(it->d.fs_data, ll_intent_slab, SLAB_KERNEL,
+ sizeof(struct lustre_intent_data));
+ if (!it->d.fs_data) {
+ CERROR("Failed to allocate memory for lustre specific intent "
+ "data\n");
+ return -ENOMEM;
+ }
+
it->it_op_release = ll_intent_release;
+
+ return 0;
}
int ll_revalidate_it(struct dentry *de, int flags, struct nameidata *nd,
struct obd_export *exp;
struct lustre_id pid;
struct lustre_id cid;
- int rc;
+ int orig_it, rc = 0;
ENTRY;
- CDEBUG(D_VFSTRACE, "VFS Op:name=%s, intent=%s\n", de->d_name.name,
- LL_IT2STR(it));
+ spin_lock(&de->d_lock);
+
+ if ((de->d_flags & DCACHE_GNS_PENDING) &&
+ !(de->d_flags & DCACHE_GNS_MOUNTING))
+ {
+ spin_unlock(&de->d_lock);
+
+ if (nd) {
+ int err = ll_gns_mount_object(de, nd->mnt);
+ if (err)
+ CERROR("can't mount %s, err = %d\n",
+ de->d_name.name, err);
+ }
+ RETURN(1);
+ }
+ spin_unlock(&de->d_lock);
+
+ CDEBUG(D_VFSTRACE, "VFS Op:name=%s (%p), intent=%s\n", de->d_name.name,
+ de, LL_IT2STR(it));
/* Cached negative dentries are unsafe for now - look them up again */
if (de->d_inode == NULL)
RETURN(0);
+ /* Root of the tree is always valid, attributes would be fixed in
+ ll_inode_revalidate_it */
+ if (de->d_sb->s_root == de)
+ RETURN(1);
+
CDEBUG(D_INODE, "revalidate 0x%p: %*s -> %lu/%lu\n",
de, de->d_name.len, de->d_name.name,
(unsigned long) de->d_inode->i_ino,
if (nd != NULL)
nd->mnt->mnt_last_used = jiffies;
+ orig_it = it ? it->it_op : IT_OPEN;
ll_frob_intent(&it, &lookup_it);
LASSERT(it != NULL);
if (it->it_op == IT_GETATTR) { /* We need to check for LOOKUP lock as
well */
+ rc = ll_intent_alloc(&lookup_it);
+ if (rc)
+ LBUG(); /* Can't think of better idea just yet */
+
+
rc = md_intent_lock(exp, &pid, de->d_name.name,
de->d_name.len, NULL, 0, &cid, &lookup_it,
flags, &req, ll_mdc_blocking_ast);
UPDATE lock */
if (!rc) {
it = &lookup_it;
+ if (!req) {
+ ll_intent_free(it);
+ goto do_lookup;
+ }
GOTO(out, rc);
}
if (it_disposition(&lookup_it, DISP_LOOKUP_NEG)) {
- ll_intent_release(&lookup_it);
it = &lookup_it;
+ ll_intent_free(it);
GOTO(out, rc = 0);
}
ptlrpc_req_finished(req);
req = NULL;
ll_lookup_finish_locks(&lookup_it, de);
+ /* XXX: on 2.6 ll_lookup_finish_locks does not call ll_intent_release */
+ ll_intent_release(&lookup_it);
}
rc = md_intent_lock(exp, &pid, de->d_name.name, de->d_name.len,
/* If req is NULL, then mdc_intent_lock only tried to do a lock match;
* if all was well, it will return 1 if it found locks, 0 otherwise. */
- if (req == NULL && rc >= 0)
+ if (req == NULL && rc >= 0) {
+ if (!rc)
+ goto do_lookup;
GOTO(out, rc);
+ }
if (rc < 0) {
if (rc != -ESTALE) {
CDEBUG(D_INFO, "ll_intent_lock(): rc %d : it->it_status "
- "%d\n", rc, it->d.lustre.it_status);
+ "%d\n", rc, LUSTRE_IT(it)->it_status);
}
GOTO(out, rc = 0);
}
-
+revalidate_finish:
rc = revalidate_it_finish(req, 1, it, de);
if (rc != 0) {
ll_intent_release(it);
dentry */
spin_lock(&dcache_lock);
hlist_del_init(&de->d_hash);
- __d_rehash(de, 0);
+ __d_rehash(de);
spin_unlock(&dcache_lock);
GOTO(out, rc);
out:
if (req != NULL && rc == 1)
ptlrpc_req_finished(req);
+
if (rc == 0) {
+ if (it == &lookup_it) {
+ ll_intent_release(it);
+ if (req) /* Special case: We did lookup and it failed,
+ need to free request */
+ ptlrpc_req_finished(req);
+ }
ll_unhash_aliases(de->d_inode);
return rc;
}
atomic_read(&de->d_count));
ll_lookup_finish_locks(it, de);
de->d_flags &= ~DCACHE_LUSTRE_INVALID;
- if (!((de->d_inode->i_mode & S_ISUID) &&S_ISDIR(de->d_inode->i_mode)) ||
- !(flags & LOOKUP_CONTINUE || (it->it_op & (IT_CHDIR | IT_OPEN))))
+ if (it == &lookup_it)
+ ll_intent_release(it);
+
+ if (!((de->d_inode->i_mode & S_ISUID) && S_ISDIR(de->d_inode->i_mode)) ||
+ !(flags & LOOKUP_CONTINUE || (orig_it & (IT_CHDIR | IT_OPEN))))
return rc;
- if (nd)
- (void)ll_dir_process_mount_object(de, nd->mnt);
+ if (nd && !(de->d_flags & DCACHE_GNS_MOUNTING)) {
+ int err = ll_gns_mount_object(de, nd->mnt);
+ if (err)
+ CERROR("can't mount %s, err = %d\n",
+ de->d_name.name, err);
+ }
return rc;
+do_lookup:
+ it = &lookup_it;
+ if (ll_intent_alloc(it))
+ LBUG();
+// We did that already, right? ll_inode2id(&pid, de->d_parent->d_inode);
+ rc = md_intent_lock(exp, &pid, de->d_name.name,
+ de->d_name.len, NULL, 0, NULL,
+ it, 0, &req, ll_mdc_blocking_ast);
+ if (rc >= 0) {
+ struct mds_body *mds_body = lustre_msg_buf(req->rq_repmsg, 1, sizeof(*mds_body));
+
+ /* See if we got same inode, if not - return error */
+ if (id_equal_stc(&cid, &mds_body->id1))
+ goto revalidate_finish;
+ }
+
+ GOTO(out, rc = 0);
}
/*static*/ void ll_pin(struct dentry *de, struct vfsmount *mnt, int flag)
ENTRY;
if (nd && nd->flags & LOOKUP_LAST && !(nd->flags & LOOKUP_LINK_NOTLAST))
- rc = ll_revalidate_it(dentry, nd->flags, nd, &nd->intent);
+ rc = ll_revalidate_it(dentry, nd->flags, nd, &nd->intent.open);
else
rc = ll_revalidate_it(dentry, 0, nd, NULL);
struct ll_sb_info *sbi = ll_i2sbi(inode);
struct lustre_id parent, child;
- LASSERT(dentry->d_parent && dentry->d_parent->d_inode);
- ll_inode2id(&parent, dentry->d_parent->d_inode);
- ll_inode2id(&child, inode);
- md_change_cbdata_name(sbi->ll_md_exp, &parent,
- (char *)dentry->d_name.name,
- dentry->d_name.len, &child,
- null_if_equal, inode);
+ if (dentry->d_parent != dentry) {
+ /* Do not do this for root of the tree */
+ LASSERT(dentry->d_parent && dentry->d_parent->d_inode);
+ ll_inode2id(&parent, dentry->d_parent->d_inode);
+ ll_inode2id(&child, inode);
+ md_change_cbdata_name(sbi->ll_md_exp, &parent,
+ (char *)dentry->d_name.name,
+ dentry->d_name.len, &child,
+ null_if_equal, inode);
+ }
iput(inode);
+
}
#endif
ll_prepare_mdc_data(op_data, dir, NULL, NULL, 0, 0);
+ rc = ll_intent_alloc(&it);
+ if (rc)
+ return ERR_PTR(rc);
+
rc = md_enqueue(ll_i2sbi(dir)->ll_md_exp, LDLM_IBITS, &it,
LCK_PR, op_data, &lockh, NULL, 0,
ldlm_completion_ast, ll_mdc_blocking_ast, dir);
OBD_FREE(op_data, sizeof(*op_data));
- request = (struct ptlrpc_request *)it.d.lustre.it_data;
+ request = (struct ptlrpc_request *)LUSTRE_IT(&it)->it_data;
+ ll_intent_free(&it);
+
if (request)
ptlrpc_req_finished(request);
if (rc < 0) {
}
case LL_IOC_MDC_MKDIRSTRIPE:
RETURN(ll_mkdir_stripe(inode, arg));
- case IOC_MDC_FINISH_GNS:
- RETURN(ll_finish_gns(sbi));
case LL_IOC_LOV_SETSTRIPE: {
struct ptlrpc_request *request = NULL;
struct mdc_op_data *op_data;
valid |= OBD_MD_FLDIREA;
ll_inode2id(&id, inode);
- rc = md_getattr(sbi->ll_md_exp, &id, valid,
+ rc = md_getattr(sbi->ll_md_exp, &id, valid, NULL, 0,
obd_size_diskmd(sbi->ll_dt_exp, NULL),
&request);
if (rc < 0) {
#include <linux/lustre_lite.h>
#include <linux/pagemap.h>
#include <linux/file.h>
+#include <linux/lustre_acl.h>
#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
#include <linux/lustre_compat25.h>
#endif
#include "llite_internal.h"
#include <linux/obd_lov.h>
+#define XATTR_NAME_MAX 255
int ll_md_close(struct obd_export *md_exp, struct inode *inode,
struct file *file)
{
ll_mdc_blocking_ast, NULL);
OBD_FREE(op_data, sizeof(*op_data));
if (rc == 0) {
- if (itp->d.lustre.it_lock_mode)
- memcpy(&itp->d.lustre.it_lock_handle,
+ if (LUSTRE_IT(itp)->it_lock_mode)
+ memcpy(&LUSTRE_IT(itp)->it_lock_handle,
&lockh, sizeof(lockh));
+
} else if (rc < 0) {
CERROR("lock enqueue: err: %d\n", rc);
}
int ll_local_open(struct file *file, struct lookup_intent *it)
{
- struct ptlrpc_request *req = it->d.lustre.it_data;
+ struct ptlrpc_request *req = LUSTRE_IT(it)->it_data;
struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
struct obd_export *md_exp = ll_i2mdexp(file->f_dentry->d_inode);
struct ll_file_data *fd;
lli->lli_io_epoch = body->io_epoch;
- mdc_set_open_replay_data(md_exp, &fd->fd_mds_och, it->d.lustre.it_data);
-
+ mdc_set_open_replay_data(md_exp, &fd->fd_mds_och, LUSTRE_IT(it)->it_data);
+
RETURN(0);
}
it = file->f_it;
- if (!it || !it->d.lustre.it_disposition) {
+ if (!it || !LUSTRE_IT(it) || !LUSTRE_IT(it)->it_disposition) {
it = &oit;
+ rc = ll_intent_alloc(it);
+ if (rc)
+ GOTO(out, rc);
rc = ll_intent_file_open(file, NULL, 0, it);
if (rc)
GOTO(out, rc);
}
+
lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_OPEN);
/* mdc_intent_lock() didn't get a request ref if there was an open
* error, so don't do cleanup on the request here (bug 3430) */
file->f_flags &= ~O_LOV_DELAY_CREATE;
GOTO(out, rc);
out:
- req = it->d.lustre.it_data;
+ req = LUSTRE_IT(it)->it_data;
+ ll_intent_release(it);
+
ptlrpc_req_finished(req);
if (rc == 0)
ll_open_complete(inode);
f->f_dentry = file->f_dentry;
f->f_vfsmnt = file->f_vfsmnt;
+ rc = ll_intent_alloc(&oit);
+ if (rc)
+ GOTO(out, rc);
+
rc = ll_intent_file_open(f, lum, lum_size, &oit);
if (rc)
GOTO(out, rc);
if (it_disposition(&oit, DISP_LOOKUP_NEG))
GOTO(out, -ENOENT);
- req = oit.d.lustre.it_data;
- rc = oit.d.lustre.it_status;
+
+ req = LUSTRE_IT(&oit)->it_data;
+ rc = LUSTRE_IT(&oit)->it_status;
if (rc < 0)
GOTO(out, rc);
rc = ll_file_release(f->f_dentry->d_inode, f);
EXIT;
out:
+ ll_intent_release(&oit);
if (f)
put_filp(f);
up(&lli->lli_open_sem);
RETURN(rc);
}
-int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
+int ll_inode_revalidate_it(struct dentry *dentry)
{
struct lookup_intent oit = { .it_op = IT_GETATTR };
struct inode *inode = dentry->d_inode;
struct ll_sb_info *sbi;
struct lustre_id id;
int rc;
-
ENTRY;
if (!inode) {
lli = ll_i2info(inode);
LASSERT(id_fid(&id) != 0);
- CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), name=%s, intent=%s\n",
+ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), name=%s(%p)\n",
inode->i_ino, inode->i_generation, inode, dentry->d_name.name,
- LL_IT2STR(it));
+ dentry);
#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,5,0))
lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_REVALIDATE);
#endif
+ rc = ll_intent_alloc(&oit);
+ if (rc)
+ RETURN(-ENOMEM);
+
rc = md_intent_lock(sbi->ll_md_exp, &id, NULL, 0, NULL, 0, &id,
&oit, 0, &req, ll_mdc_blocking_ast);
if (rc < 0)
rc = revalidate_it_finish(req, 1, &oit, dentry);
if (rc) {
- ll_intent_release(&oit);
GOTO(out, rc);
}
rc = ll_glimpse_size(inode);
EXIT;
out:
+ ll_intent_release(&oit);
if (req)
ptlrpc_req_finished(req);
return rc;
}
#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
-int ll_getattr(struct vfsmount *mnt, struct dentry *de,
- struct lookup_intent *it, struct kstat *stat)
+int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
{
int res = 0;
struct inode *inode = de->d_inode;
- res = ll_inode_revalidate_it(de, it);
+ res = ll_inode_revalidate_it(de);
lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_GETATTR);
if (res)
}
#endif
+static
+int ll_setxattr_internal(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags,
+ __u64 valid)
+{
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct ptlrpc_request *request = NULL;
+ struct mdc_op_data op_data;
+ struct iattr attr;
+ int rc = 0;
+ ENTRY;
+
+ CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu\n", inode->i_ino);
+ lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_SETXATTR);
+
+ memset(&attr, 0x0, sizeof(attr));
+ attr.ia_valid |= valid;
+ attr.ia_attr_flags = flags;
+
+ ll_prepare_mdc_data(&op_data, inode, NULL, NULL, 0, 0);
+
+ rc = md_setattr(sbi->ll_md_exp, &op_data, &attr,
+ (void*) name, strnlen(name, XATTR_NAME_MAX)+1,
+ (void*) value, size, &request);
+ if (rc) {
+ CERROR("md_setattr fails: rc = %d\n", rc);
+ GOTO(out, rc);
+ }
+
+ out:
+ ptlrpc_req_finished(request);
+ RETURN(rc);
+}
+
+int ll_setxattr(struct dentry *dentry, const char *name, const void *value,
+ size_t size, int flags)
+{
+ int rc, error;
+ struct posix_acl *acl;
+ struct ll_inode_info *lli;
+ ENTRY;
+
+ rc = ll_setxattr_internal(dentry->d_inode, name, value, size,
+ flags, ATTR_EA);
+
+ /* update inode's acl info */
+ if (rc == 0 && strcmp(name, XATTR_NAME_ACL_ACCESS) == 0) {
+ if (value) {
+ acl = posix_acl_from_xattr(value, size);
+ if (IS_ERR(acl)) {
+ CERROR("convert from xattr to acl error: %ld",
+ PTR_ERR(acl));
+ GOTO(out, rc);
+ } else if (acl) {
+ error = posix_acl_valid(acl);
+ if (error) {
+ CERROR("acl valid error: %d", error);
+ posix_acl_release(acl);
+ GOTO(out, rc);
+ }
+ }
+ } else {
+ acl = NULL;
+ }
+
+ lli = ll_i2info(dentry->d_inode);
+ spin_lock(&lli->lli_lock);
+ if (lli->lli_acl_access != NULL)
+ posix_acl_release(lli->lli_acl_access);
+ lli->lli_acl_access = acl;
+ spin_unlock(&lli->lli_lock);
+ }
+ EXIT;
+out:
+ return(rc);
+}
+
+int ll_removexattr(struct dentry *dentry, const char *name)
+{
+ return ll_setxattr_internal(dentry->d_inode, name, NULL, 0, 0,
+ ATTR_EA_RM);
+}
+
+static
+int ll_getxattr_internal(struct inode *inode, const char *name, int namelen,
+ void *value, size_t size, __u64 valid)
+{
+ struct ptlrpc_request *request = NULL;
+ struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct lustre_id id;
+ struct mds_body *body;
+ void *ea_data;
+ int rc, ea_size;
+ ENTRY;
+
+ lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_GETXATTR);
+
+ ll_inode2id(&id, inode);
+ rc = md_getattr(sbi->ll_md_exp, &id, valid, name, namelen,
+ size, &request);
+ if (rc) {
+ if (rc != -ENODATA && rc != -EOPNOTSUPP)
+ CERROR("md_getattr fails: rc = %d\n", rc);
+ GOTO(out, rc);
+ }
+
+ body = lustre_msg_buf(request->rq_repmsg, 0, sizeof(*body));
+ LASSERT(body != NULL);
+ LASSERT_REPSWABBED(request, 0);
+
+ ea_size = body->eadatasize;
+ LASSERT(ea_size <= request->rq_repmsg->buflens[0]);
+
+ if (size == 0)
+ GOTO(out, rc = ea_size);
+
+ ea_data = lustre_msg_buf(request->rq_repmsg, 1, ea_size);
+ LASSERT(ea_data != NULL);
+ LASSERT_REPSWABBED(request, 1);
+
+ if (value)
+ memcpy(value, ea_data, ea_size);
+ rc = ea_size;
+ out:
+ ptlrpc_req_finished(request);
+ RETURN(rc);
+}
+
+int ll_getxattr(struct dentry *dentry, const char *name, void *value,
+ size_t size)
+{
+ return ll_getxattr_internal(dentry->d_inode, name, strlen(name) + 1,
+ value, size, OBD_MD_FLEA);
+}
+
+int ll_listxattr(struct dentry *dentry, char *list, size_t size)
+{
+ return ll_getxattr_internal(dentry->d_inode, NULL, 0, list, size,
+ OBD_MD_FLEALIST);
+}
+
+int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
+{
+ struct lookup_intent it = { .it_op = IT_GETATTR };
+ int mode = inode->i_mode;
+ struct dentry de;
+ struct ll_sb_info *sbi;
+ struct lustre_id id;
+ struct ptlrpc_request *req = NULL;
+ int rc;
+ ENTRY;
+
+ sbi = ll_i2sbi(inode);
+ ll_inode2id(&id, inode);
+
+ /* Nobody gets write access to a read-only fs */
+ if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
+ (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
+ return -EROFS;
+ /* Nobody gets write access to an immutable file */
+ if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
+ return -EACCES;
+ if (current->fsuid == inode->i_uid) {
+ mode >>= 6;
+ } else if (1) {
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct posix_acl *acl;
+
+ /* The access ACL cannot grant access if the group class
+ permission bits don't contain all requested permissions. */
+ if (((mode >> 3) & mask & S_IRWXO) != mask)
+ goto check_groups;
+
+ if (ll_intent_alloc(&it))
+ return -EACCES;
+
+ de.d_inode = inode;
+ rc = md_intent_lock(sbi->ll_md_exp, &id, NULL, 0, NULL, 0, &id,
+ &it, 0, &req, ll_mdc_blocking_ast);
+ if (rc < 0) {
+ ll_intent_free(&it);
+ GOTO(out, rc);
+ }
+
+ rc = revalidate_it_finish(req, 1, &it, &de);
+ if (rc) {
+ ll_intent_release(&it);
+ GOTO(out, rc);
+ }
+
+ ll_lookup_finish_locks(&it, &de);
+ ll_intent_free(&it);
+
+ spin_lock(&lli->lli_lock);
+ acl = posix_acl_dup(ll_i2info(inode)->lli_acl_access);
+ spin_unlock(&lli->lli_lock);
+
+ if (!acl)
+ goto check_groups;
+
+ rc = posix_acl_permission(inode, acl, mask);
+ posix_acl_release(acl);
+ if (rc == -EACCES)
+ goto check_capabilities;
+ GOTO(out, rc);
+ } else {
+check_groups:
+ if (in_group_p(inode->i_gid))
+ mode >>= 3;
+ }
+ if ((mode & mask & S_IRWXO) == mask)
+ GOTO(out, rc = 0);
+
+check_capabilities:
+ rc = -EACCES;
+ /* Allowed to override Discretionary Access Control? */
+ if (!(mask & MAY_EXEC) ||
+ (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
+ if (capable(CAP_DAC_OVERRIDE))
+ GOTO(out, rc = 0);
+ /* Read and search granted if capable(CAP_DAC_READ_SEARCH) */
+ if (capable(CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
+ (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
+ GOTO(out, rc = 0);
+out:
+ if (req)
+ ptlrpc_req_finished(req);
+
+ return rc;
+}
+
struct file_operations ll_file_operations = {
.read = ll_file_read,
.write = ll_file_write,
};
struct inode_operations ll_file_inode_operations = {
- .setattr_raw = ll_setattr_raw,
.setattr = ll_setattr,
.truncate = ll_truncate,
#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
- .getattr_it = ll_getattr,
+ .getattr = ll_getattr,
#else
.revalidate_it = ll_inode_revalidate_it,
#endif
+ .setxattr = ll_setxattr,
+ .getxattr = ll_getxattr,
+ .listxattr = ll_listxattr,
+ .removexattr = ll_removexattr,
+ .permission = ll_inode_permission,
};
#include <linux/lustre_lite.h>
#include "llite_internal.h"
-/* After roughly how long should we remove an inactive mount? */
-#define GNS_MOUNT_TIMEOUT 120
-/* How often should the GNS timer look for mounts to cleanup? */
-#define GNS_TICK 30
+static struct list_head gns_sbi_list = LIST_HEAD_INIT(gns_sbi_list);
+static spinlock_t gns_lock = SPIN_LOCK_UNLOCKED;
+static struct ptlrpc_thread gns_thread;
+static struct ll_gns_ctl gns_ctl;
-int ll_finish_gns(struct ll_sb_info *sbi)
+/*
+ * waits until the passed dentry becomes a mountpoint, or the timeout and
+ * attempts are exhausted. Returns 1 if the dentry became a mountpoint, 0 otherwise.
+ */
+static int
+ll_gns_wait_for_mount(struct dentry *dentry,
+ int timeout, int tries)
{
- down(&sbi->ll_gns_sem);
- if (sbi->ll_gns_state != LL_GNS_STATE_MOUNTING) {
- up(&sbi->ll_gns_sem);
- CERROR("FINISH_GNS called on mount which was not expecting "
- "completion.\n");
- return -EINVAL;
- }
-
- sbi->ll_gns_state = LL_GNS_STATE_FINISHED;
- up(&sbi->ll_gns_sem);
- complete(&sbi->ll_gns_completion);
-
- return 0;
-}
+ struct l_wait_info lwi;
+ struct ll_sb_info *sbi;
+ int rc;
+ ENTRY;
-/* Pass exactly one (1) page in; when this function returns "page" will point
- * somewhere into the middle of the page. */
-int fill_page_with_path(struct dentry *dentry, struct vfsmount *mnt,
- char **pagep)
-{
- char *path = *pagep, *p;
-
- path[PAGE_SIZE - 1] = '\0';
- p = path + PAGE_SIZE - 1;
-
- while (1) {
- if (p - path < dentry->d_name.len + 1)
- return -ENAMETOOLONG;
- if (dentry->d_name.name[0] != '/') {
- p -= dentry->d_name.len;
- memcpy(p, dentry->d_name.name, dentry->d_name.len);
- p--;
- *p = '/';
- }
+ LASSERT(dentry != NULL);
+ LASSERT(!IS_ERR(dentry));
+ sbi = ll_s2sbi(dentry->d_sb);
+
+ for (; !d_mountpoint(dentry) && tries > 0; tries--) {
+ lwi = LWI_TIMEOUT(timeout * HZ, NULL, NULL);
+ l_wait_event(sbi->ll_gns_waitq, d_mountpoint(dentry), &lwi);
+ }
- dentry = dentry->d_parent;
- if (dentry->d_parent == dentry) {
- if (mnt->mnt_parent == mnt)
- break; /* finished walking up */
- mnt = mntget(mnt);
- dget(dentry);
- while (dentry->d_parent == dentry &&
- follow_up(&mnt, &dentry))
- ;
- mntput(mnt);
- dput(dentry);
- }
+ if ((rc = d_mountpoint(dentry) ? 1 : 0)) {
+ spin_lock(&sbi->ll_gns_lock);
+ LASSERT(sbi->ll_gns_state == LL_GNS_MOUNTING);
+ sbi->ll_gns_state = LL_GNS_FINISHED;
+ spin_unlock(&sbi->ll_gns_lock);
}
- *pagep = p;
- return 0;
+
+ complete(&sbi->ll_gns_mount_finished);
+ RETURN(rc);
}
-int ll_dir_process_mount_object(struct dentry *dentry, struct vfsmount *mnt)
+/*
+ * tries to mount the mount object under passed @dentry. In the case of success
+ * @dentry will become mount point and 0 will be returned. Error code will be
+ * returned otherwise.
+ */
+int ll_gns_mount_object(struct dentry *dentry,
+ struct vfsmount *mnt)
{
- struct ll_sb_info *sbi;
+ struct ll_dentry_data *lld = dentry->d_fsdata;
+ char *p, *path, *pathpage, *argv[4];
struct file *mntinfo_fd = NULL;
- struct page *datapage = NULL, *pathpage;
struct address_space *mapping;
- struct ll_dentry_data *lld = dentry->d_fsdata;
- struct dentry *dchild, *tmp_dentry;
- struct vfsmount *tmp_mnt;
- char *p, *path, *argv[4];
- int stage = 0, rc = 0;
+ int cleanup_phase = 0, rc = 0;
+ struct ll_sb_info *sbi;
+ struct dentry *dchild;
+ struct page *datapage;
+ filler_t *filler;
ENTRY;
if (mnt == NULL) {
- CERROR("suid directory found, but no vfsmount available.\n");
- RETURN(-1);
+ CERROR("suid directory found, but no "
+ "vfsmount available.\n");
+ RETURN(-EINVAL);
}
+ CDEBUG(D_INODE, "mounting dentry %p\n", dentry);
+
LASSERT(dentry->d_inode != NULL);
LASSERT(S_ISDIR(dentry->d_inode->i_mode));
LASSERT(lld != NULL);
+
sbi = ll_i2sbi(dentry->d_inode);
LASSERT(sbi != NULL);
- down(&sbi->ll_gns_sem);
- if (sbi->ll_gns_state == LL_GNS_STATE_MOUNTING) {
- up(&sbi->ll_gns_sem);
- wait_for_completion(&sbi->ll_gns_completion);
+ /* another thread is in the process of mounting some entry */
+ spin_lock(&sbi->ll_gns_lock);
+ if (sbi->ll_gns_state == LL_GNS_MOUNTING) {
+ spin_unlock(&sbi->ll_gns_lock);
+
+ wait_for_completion(&sbi->ll_gns_mount_finished);
if (d_mountpoint(dentry))
RETURN(0);
- RETURN(-1);
}
- if (sbi->ll_gns_state == LL_GNS_STATE_FINISHED) {
+
+ /* another thread mounted it already */
+ if (sbi->ll_gns_state == LL_GNS_FINISHED) {
+ spin_unlock(&sbi->ll_gns_lock);
+
/* we lost a race; just return */
- up(&sbi->ll_gns_sem);
if (d_mountpoint(dentry))
RETURN(0);
- RETURN(-1);
}
- LASSERT(sbi->ll_gns_state == LL_GNS_STATE_IDLE);
- sbi->ll_gns_state = LL_GNS_STATE_MOUNTING;
+ LASSERT(sbi->ll_gns_state == LL_GNS_IDLE);
+
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags |= DCACHE_GNS_MOUNTING;
+ spin_unlock(&dentry->d_lock);
+
+ /* mounting started */
+ sbi->ll_gns_state = LL_GNS_MOUNTING;
+ spin_unlock(&sbi->ll_gns_lock);
+
+ /* we need to build an absolute pathname to pass to mount */
+ pathpage = (char *)__get_free_page(GFP_KERNEL);
+ if (!pathpage)
+ GOTO(cleanup, rc = -ENOMEM);
+ cleanup_phase = 1;
+
+ /* getting @dentry path stored in @pathpage. */
+ path = d_path(dentry, mnt, pathpage, PAGE_SIZE);
+ if (IS_ERR(path)) {
+ CERROR("can't build mount object path, err %d\n",
+ (int)PTR_ERR(dchild));
+ GOTO(cleanup, rc = PTR_ERR(dchild));
+ }
+
+ /* synchronizing with possible /proc/fs/... write */
+ down(&sbi->ll_gns_sem);
+
+ /*
+ * mount object name is taken from sbi, where it is set at mount time or
+ * via /proc/fs... tunable. It may be ".mntinfo" or so.
+ */
+ dchild = ll_d_lookup(sbi->ll_gns_oname, dentry,
+ strlen(sbi->ll_gns_oname));
up(&sbi->ll_gns_sem);
- /* We need to build an absolute pathname to pass to mount */
- pathpage = alloc_pages(GFP_HIGHUSER, 0);
- if (pathpage == NULL)
- GOTO(cleanup, rc = -ENOMEM);
- path = kmap(pathpage);
- LASSERT(path != NULL);
- stage = 1;
- fill_page_with_path(dentry, mnt, &path);
-
- dchild = lookup_one_len(".mntinfo", dentry, strlen(".mntinfo"));
- if (dchild == NULL || IS_ERR(dchild)) {
- CERROR("Directory %*s is setuid, but without a mount object.\n",
- dentry->d_name.len, dentry->d_name.name);
- GOTO(cleanup, rc = -1);
+ if (!dchild)
+ GOTO(cleanup, rc = -ENOENT);
+
+ if (IS_ERR(dchild)) {
+ CERROR("can't find mount object %*s/%*s err = %d.\n",
+ (int)dentry->d_name.len, dentry->d_name.name,
+ (int)dchild->d_name.len, dchild->d_name.name,
+ (int)PTR_ERR(dchild));
+ GOTO(cleanup, rc = PTR_ERR(dchild));
}
mntget(mnt);
+ /* ok, mount object is found; opening it. */
mntinfo_fd = dentry_open(dchild, mnt, 0);
if (IS_ERR(mntinfo_fd)) {
+ CERROR("can't open mount object %*s/%*s err = %d.\n",
+ (int)dentry->d_name.len, dentry->d_name.name,
+ (int)dchild->d_name.len, dchild->d_name.name,
+ (int)PTR_ERR(mntinfo_fd));
dput(dchild);
mntput(mnt);
GOTO(cleanup, rc = PTR_ERR(mntinfo_fd));
}
- stage = 2;
+ cleanup_phase = 2;
if (mntinfo_fd->f_dentry->d_inode->i_size > PAGE_SIZE) {
- CERROR("Mount object file is too big (%Ld)\n",
+ CERROR("mount object %*s/%*s is too big (%Ld)\n",
+ (int)dentry->d_name.len, dentry->d_name.name,
+ (int)dchild->d_name.len, dchild->d_name.name,
mntinfo_fd->f_dentry->d_inode->i_size);
- GOTO(cleanup, rc = -1);
+ GOTO(cleanup, rc = -EFBIG);
}
+
+ /* read data from mount object. */
mapping = mntinfo_fd->f_dentry->d_inode->i_mapping;
- datapage = read_cache_page(mapping, 0,
- (filler_t *)mapping->a_ops->readpage,
+ filler = (filler_t *)mapping->a_ops->readpage;
+ datapage = read_cache_page(mapping, 0, filler,
mntinfo_fd);
- if (IS_ERR(datapage))
+ if (IS_ERR(datapage)) {
+ CERROR("can't read data from mount object %*s/%*s\n",
+ (int)dentry->d_name.len, dentry->d_name.name,
+ (int)dchild->d_name.len, dchild->d_name.name);
GOTO(cleanup, rc = PTR_ERR(datapage));
+ }
p = kmap(datapage);
LASSERT(p != NULL);
- stage = 3;
-
p[PAGE_SIZE - 1] = '\0';
+ cleanup_phase = 3;
fput(mntinfo_fd);
mntinfo_fd = NULL;
- argv[0] = "/usr/lib/lustre/gns-upcall.sh";
+ /* synchronizing with possible /proc/fs/... write */
+ down(&sbi->ll_gns_sem);
+
+ /*
+ * upcall is initialized at mount time or via /proc/fs/... tunable and
+ * may be /usr/lib/lustre/gns-upcall.sh
+ */
+ argv[0] = sbi->ll_gns_upcall;
argv[1] = p;
argv[2] = path;
argv[3] = NULL;
- rc = USERMODEHELPER(argv[0], argv, NULL);
+
+ up(&sbi->ll_gns_sem);
- if (rc != 0) {
- CERROR("GNS mount failed: %d\n", rc);
+ rc = USERMODEHELPER(argv[0], argv, NULL);
+ if (rc) {
+ CERROR("failed to call GNS upcall %s, err = %d\n",
+ sbi->ll_gns_upcall, rc);
GOTO(cleanup, rc);
}
- wait_for_completion(&sbi->ll_gns_completion);
- LASSERT(sbi->ll_gns_state == LL_GNS_STATE_FINISHED);
-
- if (d_mountpoint(dentry)) {
- /* successful follow_down will mntput and dput */
- tmp_mnt = mntget(mnt);
- tmp_dentry = dget(dentry);
- rc = follow_down(&tmp_mnt, &tmp_dentry);
- if (rc == 1) {
- struct ll_sb_info *sbi = ll_s2sbi(dentry->d_sb);
+ /*
+ * wait for mount completion. This is actually not needed, because
+ * USERMODEHELPER() returns only when the usermode process finishes.
+ * But we do this just in case USERMODEHELPER() semantics change, or
+ * the usermode upcall program starts mounting in the background and
+ * returns instantly. --umka
+ */
+ if (ll_gns_wait_for_mount(dentry, 1, GNS_WAIT_ATTEMPTS)) {
+ struct dentry *rdentry;
+ struct vfsmount *rmnt;
+
+ /* mount is successful */
+ LASSERT(sbi->ll_gns_state == LL_GNS_FINISHED);
+
+ rmnt = mntget(mnt);
+ rdentry = dget(dentry);
+
+ if (follow_down(&rmnt, &rdentry)) {
+ /*
+ * registering new mount in GNS mounts list and thus
+ * make it accessible from GNS control thread.
+ */
spin_lock(&dcache_lock);
- LASSERT(list_empty(&tmp_mnt->mnt_lustre_list));
- list_add_tail(&tmp_mnt->mnt_lustre_list,
+ LASSERT(list_empty(&rmnt->mnt_lustre_list));
+ list_add_tail(&rmnt->mnt_lustre_list,
&sbi->ll_mnt_list);
spin_unlock(&dcache_lock);
-
- tmp_mnt->mnt_last_used = jiffies;
-
- mntput(tmp_mnt);
- dput(tmp_dentry);
- rc = 0;
+ rmnt->mnt_last_used = jiffies;
+ mntput(rmnt);
+ dput(rdentry);
} else {
mntput(mnt);
dput(dentry);
}
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags &= ~DCACHE_GNS_PENDING;
+ spin_unlock(&dentry->d_lock);
} else {
- CERROR("Woke up from GNS mount, but no mountpoint in place.\n");
- rc = -1;
+ CERROR("usermode upcall %s failed to mount %s\n",
+ sbi->ll_gns_upcall, path);
+ rc = -ETIME;
}
EXIT;
cleanup:
- switch (stage) {
+ switch (cleanup_phase) {
case 3:
kunmap(datapage);
page_cache_release(datapage);
if (mntinfo_fd != NULL)
fput(mntinfo_fd);
case 1:
- kunmap(pathpage);
- __free_pages(pathpage, 0);
+ free_page((unsigned long)pathpage);
case 0:
- down(&sbi->ll_gns_sem);
- sbi->ll_gns_state = LL_GNS_STATE_IDLE;
- up(&sbi->ll_gns_sem);
+ spin_lock(&sbi->ll_gns_lock);
+ sbi->ll_gns_state = LL_GNS_IDLE;
+ spin_unlock(&sbi->ll_gns_lock);
+
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags &= ~DCACHE_GNS_MOUNTING;
+ spin_unlock(&dentry->d_lock);
}
return rc;
}
-/* If timeout == 1, only remove the mounts which are properly aged.
- *
- * If timeout == 0, we are unmounting -- remove them all. */
-int ll_gns_umount_all(struct ll_sb_info *sbi, int timeout)
+/* tries to umount passed @mnt. */
+int ll_gns_umount_object(struct vfsmount *mnt)
{
- struct list_head kill_list = LIST_HEAD_INIT(kill_list);
- struct page *page = NULL;
- char *kpage = NULL, *path;
- int rc;
+ int rc = 0;
ENTRY;
-
- if (timeout == 0) {
- page = alloc_pages(GFP_HIGHUSER, 0);
- if (page == NULL)
- RETURN(-ENOMEM);
- kpage = kmap(page);
- LASSERT(kpage != NULL);
+
+ CDEBUG(D_INODE, "unmounting mnt %p\n", mnt);
+ rc = do_umount(mnt, 0);
+ if (rc) {
+ CDEBUG(D_INODE, "can't umount 0x%p, err = %d\n",
+ mnt, rc);
}
+
+ RETURN(rc);
+}
+
+int ll_gns_check_mounts(struct ll_sb_info *sbi, int flags)
+{
+ struct list_head check_list = LIST_HEAD_INIT(check_list);
+ struct vfsmount *mnt;
+ unsigned long pass;
+ ENTRY;
spin_lock(&dcache_lock);
- list_splice_init(&sbi->ll_mnt_list, &kill_list);
-
- /* Walk the list in reverse order, and put them on the front of the
- * sbi list each iteration; this avoids list-ordering problems if we
- * race with another gns-mounting thread */
- while (!list_empty(&kill_list)) {
- struct vfsmount *mnt =
- list_entry(kill_list.prev, struct vfsmount,
- mnt_lustre_list);
+ list_splice_init(&sbi->ll_mnt_list, &check_list);
+
+ /*
+ * walk the list in reverse order, and put them on the front of the sbi
+ * list each iteration; this avoids list-ordering problems if we race
+ * with another gns-mounting thread.
+ */
+ while (!list_empty(&check_list)) {
+ mnt = list_entry(check_list.prev,
+ struct vfsmount,
+ mnt_lustre_list);
+
mntget(mnt);
+
list_del_init(&mnt->mnt_lustre_list);
- list_add(&mnt->mnt_lustre_list, &sbi->ll_mnt_list);
- if (timeout &&
- jiffies - mnt->mnt_last_used < GNS_MOUNT_TIMEOUT * HZ) {
+ list_add(&mnt->mnt_lustre_list,
+ &sbi->ll_mnt_list);
+
+ /* check for timeout if needed */
+ pass = jiffies - mnt->mnt_last_used;
+
+ if (flags == LL_GNS_CHECK &&
+ pass < sbi->ll_gns_timeout * HZ)
+ {
mntput(mnt);
continue;
}
spin_unlock(&dcache_lock);
- CDEBUG(D_INODE, "unmounting mnt %p from sbi %p\n", mnt, sbi);
+ /* umounting @mnt */
+ ll_gns_umount_object(mnt);
- rc = do_umount(mnt, 0);
- if (rc != 0 && page != NULL) {
- int rc2;
- path = kpage;
- rc2 = fill_page_with_path(mnt->mnt_root, mnt, &path);
- CERROR("GNS umount(%s): %d\n", rc2 == 0 ? path : "",
- rc);
- }
mntput(mnt);
spin_lock(&dcache_lock);
}
spin_unlock(&dcache_lock);
-
- if (page != NULL) {
- kunmap(page);
- __free_pages(page, 0);
- }
RETURN(0);
}
-static struct list_head gns_sbi_list = LIST_HEAD_INIT(gns_sbi_list);
-static spinlock_t gns_lock = SPIN_LOCK_UNLOCKED;
-static struct ptlrpc_thread gns_thread;
-
+/*
+ * GNS timer callback function. It restarts gns timer and wakes up GNS control
+ * thread to process mounts list.
+ */
void ll_gns_timer_callback(unsigned long data)
{
struct ll_sb_info *sbi = (void *)data;
if (list_empty(&sbi->ll_gns_sbi_head))
list_add(&sbi->ll_gns_sbi_head, &gns_sbi_list);
spin_unlock(&gns_lock);
+
wake_up(&gns_thread.t_ctl_waitq);
- mod_timer(&sbi->ll_gns_timer, jiffies + GNS_TICK * HZ);
+ mod_timer(&sbi->ll_gns_timer,
+ jiffies + sbi->ll_gns_tick * HZ);
}
-static int gns_check_event(void)
+/* this function checks whether anything new has appeared in the gns list. */
+static int inline ll_gns_check_event(void)
{
int rc;
+
spin_lock(&gns_lock);
rc = !list_empty(&gns_sbi_list);
spin_unlock(&gns_lock);
+
return rc;
}
-static int inline gns_check_stopping(void)
+/* should we stop the GNS control thread? */
+static int inline ll_gns_check_stop(void)
{
mb();
return (gns_thread.t_flags & SVC_STOPPING) ? 1 : 0;
}
+/* GNS control thread function. */
static int ll_gns_thread_main(void *arg)
{
+ struct ll_gns_ctl *ctl = arg;
unsigned long flags;
ENTRY;
snprintf(name, sizeof(name) - 1, "ll_gns");
kportal_daemonize(name);
}
+
SIGNAL_MASK_LOCK(current, flags);
sigfillset(¤t->blocked);
RECALC_SIGPENDING;
SIGNAL_MASK_UNLOCK(current, flags);
+ /*
+ * letting the starting function know that we are ready and control may be
+ * returned.
+ */
gns_thread.t_flags = SVC_RUNNING;
- wake_up(&gns_thread.t_ctl_waitq);
+ complete(&ctl->gc_starting);
- while (!gns_check_stopping()) {
+ while (!ll_gns_check_stop()) {
struct l_wait_info lwi = { 0 };
- l_wait_event(gns_thread.t_ctl_waitq, gns_check_event() ||
- gns_check_stopping(), &lwi);
-
+ l_wait_event(gns_thread.t_ctl_waitq,
+ (ll_gns_check_event() ||
+ ll_gns_check_stop()), &lwi);
+
spin_lock(&gns_lock);
while (!list_empty(&gns_sbi_list)) {
- struct ll_sb_info *sbi =
- list_entry(gns_sbi_list.prev, struct ll_sb_info,
- ll_gns_sbi_head);
+ struct ll_sb_info *sbi;
+
+ sbi = list_entry(gns_sbi_list.prev,
+ struct ll_sb_info,
+ ll_gns_sbi_head);
+
list_del_init(&sbi->ll_gns_sbi_head);
spin_unlock(&gns_lock);
- ll_gns_umount_all(sbi, 1);
+ ll_gns_check_mounts(sbi, LL_GNS_CHECK);
spin_lock(&gns_lock);
}
spin_unlock(&gns_lock);
}
+ /*
+ * letting the stop function know that the thread is stopped and it may
+ * return.
+ */
+ EXIT;
gns_thread.t_flags = SVC_STOPPED;
- wake_up(&gns_thread.t_ctl_waitq);
- RETURN(0);
+ /* this is an SMP-safe way to finish the thread. */
+ complete_and_exit(&ctl->gc_finishing, 0);
}
void ll_gns_add_timer(struct ll_sb_info *sbi)
{
- mod_timer(&sbi->ll_gns_timer, jiffies + GNS_TICK * HZ);
+ mod_timer(&sbi->ll_gns_timer,
+ jiffies + sbi->ll_gns_tick * HZ);
}
void ll_gns_del_timer(struct ll_sb_info *sbi)
del_timer(&sbi->ll_gns_timer);
}
+/*
+ * starts the GNS control thread and waits for a signal that it is up and
+ * work may continue.
+ */
int ll_gns_start_thread(void)
{
- struct l_wait_info lwi = { 0 };
int rc;
+ ENTRY;
LASSERT(gns_thread.t_flags == 0);
-
+ init_completion(&gns_ctl.gc_starting);
+ init_completion(&gns_ctl.gc_finishing);
init_waitqueue_head(&gns_thread.t_ctl_waitq);
- rc = kernel_thread(ll_gns_thread_main, NULL, CLONE_VM | CLONE_FILES);
+
+ rc = kernel_thread(ll_gns_thread_main, &gns_ctl,
+ (CLONE_VM | CLONE_FILES));
if (rc < 0) {
- CERROR("cannot start thread: %d\n", rc);
- return rc;
+ CERROR("cannot start GNS control thread, "
+ "err = %d\n", rc);
+ RETURN(rc);
}
- l_wait_event(gns_thread.t_ctl_waitq, gns_thread.t_flags & SVC_RUNNING,
- &lwi);
- return 0;
+ wait_for_completion(&gns_ctl.gc_starting);
+ LASSERT(gns_thread.t_flags == SVC_RUNNING);
+ RETURN(0);
}
+/* stops GNS control thread and waits its actual stop. */
void ll_gns_stop_thread(void)
{
- struct l_wait_info lwi = { 0 };
-
+ ENTRY;
gns_thread.t_flags = SVC_STOPPING;
-
wake_up(&gns_thread.t_ctl_waitq);
- l_wait_event(gns_thread.t_ctl_waitq, gns_thread.t_flags & SVC_STOPPED,
- &lwi);
+ wait_for_completion(&gns_ctl.gc_finishing);
+ LASSERT(gns_thread.t_flags == SVC_STOPPED);
gns_thread.t_flags = 0;
+ EXIT;
}
unsigned long ra_stats[_NR_RA_STAT];
};
+/* after roughly how long should we remove an inactive mount? */
+#define GNS_MOUNT_TIMEOUT 120
+
+/* how often should the GNS timer look for mounts to cleanup? */
+#define GNS_TICK_TIMEOUT 1
+
+/* how many times GNS will try to wait for 1 second for mount */
+#define GNS_WAIT_ATTEMPTS 10
+
struct ll_sb_info {
/* this protects pglist and max_r_a_pages. It isn't safe to grab from
* interrupt contexts. */
struct list_head ll_mnt_list;
struct semaphore ll_gns_sem;
+ spinlock_t ll_gns_lock;
wait_queue_head_t ll_gns_waitq;
- struct completion ll_gns_completion;
int ll_gns_state;
struct timer_list ll_gns_timer;
struct list_head ll_gns_sbi_head;
+
+ unsigned long ll_gns_tick;
+ unsigned long ll_gns_timeout;
+ struct completion ll_gns_mount_finished;
+
+ /* path to upcall */
+ char ll_gns_upcall[PATH_MAX];
+
+ /* mount object entry name */
+ char ll_gns_oname[PATH_MAX];
+};
+
+struct ll_gns_ctl {
+ struct completion gc_starting;
+ struct completion gc_finishing;
};
-#define LL_GNS_STATE_IDLE 1100
-#define LL_GNS_STATE_MOUNTING 1101
-#define LL_GNS_STATE_FINISHED 1102
+/* mounting states */
+#define LL_GNS_IDLE (1 << 0)
+#define LL_GNS_MOUNTING (1 << 1)
+#define LL_GNS_FINISHED (1 << 2)
+
+/* mounts checking flags */
+#define LL_GNS_UMOUNT (1 << 0)
+#define LL_GNS_CHECK (1 << 1)
struct ll_readahead_state {
spinlock_t ras_lock;
};
extern kmem_cache_t *ll_file_data_slab;
+extern kmem_cache_t *ll_intent_slab;
struct lustre_handle;
struct ll_file_data {
struct obd_client_handle fd_mds_och;
/* llite/file.c */
extern struct file_operations ll_file_operations;
extern struct inode_operations ll_file_inode_operations;
-extern int ll_inode_revalidate_it(struct dentry *, struct lookup_intent *);
+extern int ll_inode_revalidate_it(struct dentry *);
+extern int ll_setxattr(struct dentry *, const char *, const void *,
+ size_t, int);
+extern int ll_getxattr(struct dentry *, const char *, void *, size_t);
+extern int ll_listxattr(struct dentry *, char *, size_t);
+extern int ll_removexattr(struct dentry *, const char *);
+extern int ll_inode_permission(struct inode *, int, struct nameidata *);
int ll_refresh_lsm(struct inode *inode, struct lov_stripe_md *lsm);
int ll_extent_lock(struct ll_file_data *, struct inode *,
struct lov_stripe_md *, int mode, ldlm_policy_data_t *,
int ll_md_close(struct obd_export *md_exp, struct inode *inode,
struct file *file);
#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
-int ll_getattr(struct vfsmount *mnt, struct dentry *de,
- struct lookup_intent *it, struct kstat *stat);
+int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat);
#endif
void ll_stime_record(struct ll_sb_info *sbi, struct timeval *start,
struct obd_service_time *stime);
/* llite/dcache.c */
void ll_intent_drop_lock(struct lookup_intent *);
void ll_intent_release(struct lookup_intent *);
+int ll_intent_alloc(struct lookup_intent *);
+void ll_intent_free(struct lookup_intent *it);
extern void ll_set_dd(struct dentry *de);
void ll_unhash_aliases(struct inode *);
void ll_frob_intent(struct lookup_intent **itp, struct lookup_intent *deft);
/* llite/llite_gns.c */
-int ll_finish_gns(struct ll_sb_info *sbi);
-int fill_page_with_path(struct dentry *, struct vfsmount *, char **pagep);
-int ll_dir_process_mount_object(struct dentry *, struct vfsmount *);
-int ll_gns_umount_all(struct ll_sb_info *sbi, int timeout);
+int ll_gns_start_thread(void);
+void ll_gns_stop_thread(void);
+
+int ll_gns_mount_object(struct dentry *dentry,
+ struct vfsmount *mnt);
+int ll_gns_umount_object(struct vfsmount *mnt);
+
+int ll_gns_check_mounts(struct ll_sb_info *sbi,
+ int flags);
+
void ll_gns_timer_callback(unsigned long data);
void ll_gns_add_timer(struct ll_sb_info *sbi);
void ll_gns_del_timer(struct ll_sb_info *sbi);
-int ll_gns_start_thread(void);
-void ll_gns_stop_thread(void);
/* llite/llite_lib.c */
extern struct super_operations lustre_super_operations;
char *ll_read_opt(const char *opt, char *data);
int ll_set_opt(const char *opt, char *data, int fl);
-void ll_options(char *options, char **ost, char **mds, int *flags);
+void ll_options(char *options, char **ost, char **mds, char **sec, int *flags);
void ll_lli_init(struct ll_inode_info *lli);
int ll_fill_super(struct super_block *sb, void *data, int silent);
int lustre_fill_super(struct super_block *sb, void *data, int silent);
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
#define ll_s2sbi(sb) ((struct ll_sb_info *)((sb)->s_fs_info))
#define ll_set_sbi(sb, sbi) ((sb)->s_fs_info = sbi)
-void __d_rehash(struct dentry * entry, int lock);
static inline __u64 ll_ts2u64(struct timespec *time)
{
__u64 t = time->tv_sec;
#include <linux/lustre_ha.h>
#include <linux/lustre_dlm.h>
#include <linux/lprocfs_status.h>
+#include <linux/lustre_acl.h>
#include "llite_internal.h"
kmem_cache_t *ll_file_data_slab;
+kmem_cache_t *ll_intent_slab;
extern struct address_space_operations ll_aops;
extern struct address_space_operations ll_dir_aops;
INIT_LIST_HEAD(&sbi->ll_conn_chain);
INIT_HLIST_HEAD(&sbi->ll_orphan_dentry_list);
INIT_LIST_HEAD(&sbi->ll_mnt_list);
+
sema_init(&sbi->ll_gns_sem, 1);
- init_completion(&sbi->ll_gns_completion);
- sbi->ll_gns_state = LL_GNS_STATE_IDLE;
+ spin_lock_init(&sbi->ll_gns_lock);
+ INIT_LIST_HEAD(&sbi->ll_gns_sbi_head);
+ init_waitqueue_head(&sbi->ll_gns_waitq);
+ init_completion(&sbi->ll_gns_mount_finished);
+
+ /* this later may be reset via /proc/fs/... */
+ memcpy(sbi->ll_gns_oname, ".mntinfo", strlen(".mntinfo"));
+ sbi->ll_gns_oname[strlen(sbi->ll_gns_oname) - 1] = '\0';
+
+ /* this later may be reset via /proc/fs/... */
+ memset(sbi->ll_gns_upcall, 0, sizeof(sbi->ll_gns_upcall));
+
+ /* default values, may be changed via /proc/fs/... */
+ sbi->ll_gns_state = LL_GNS_IDLE;
+ sbi->ll_gns_tick = GNS_TICK_TIMEOUT;
+ sbi->ll_gns_timeout = GNS_MOUNT_TIMEOUT;
+
sbi->ll_gns_timer.data = (unsigned long)sbi;
sbi->ll_gns_timer.function = ll_gns_timer_callback;
init_timer(&sbi->ll_gns_timer);
- INIT_LIST_HEAD(&sbi->ll_gns_sbi_head);
ll_set_sbi(sb, sbi);
RETURN(rc);
}
-int lustre_common_fill_super(struct super_block *sb, char *lmv, char *lov)
+extern struct dentry_operations ll_d_ops;
+
+int lustre_common_fill_super(struct super_block *sb, char *lmv, char *lov,
+ char *security, __u32 *nllu)
{
struct ll_sb_info *sbi = ll_s2sbi(sb);
struct ptlrpc_request *request = NULL;
RETURN(-EINVAL);
}
+ if (security == NULL)
+ security = "null";
+
+ err = obd_set_info(obd->obd_self_export, strlen("sec"), "sec",
+ strlen(security), security);
+ if (err) {
+ CERROR("LMV %s: failed to set security %s, err %d\n",
+ lmv, security, err);
+ RETURN(err);
+ }
+
+ err = obd_set_info(obd->obd_self_export, strlen("nllu"), "nllu",
+ sizeof(__u32) * 2, nllu);
+ if (err) {
+ CERROR("LMV %s: failed to set NLLU, err %d\n",
+ lmv, err);
+ RETURN(err);
+ }
+
if (proc_lustre_fs_root) {
err = lprocfs_register_mountpoint(proc_lustre_fs_root, sb,
lov, lmv);
/* make root inode
* XXX: move this to after cbd setup? */
err = md_getattr(sbi->ll_md_exp, &sbi->ll_rootid,
- (OBD_MD_FLNOTOBD | OBD_MD_FLBLOCKS | OBD_MD_FID),
+ (OBD_MD_FLNOTOBD | OBD_MD_FLBLOCKS | OBD_MD_FID), NULL, 0,
0, &request);
if (err) {
CERROR("md_getattr failed for root: rc = %d\n", err);
#endif
sb->s_root = d_alloc_root(root);
+ sb->s_root->d_op = &ll_d_ops;
#ifdef S_PDIROPS
CWARN("Enabling PDIROPS\n");
RETURN(fl);
}
-void ll_options(char *options, char **lov, char **lmv, int *flags)
+void ll_options(char *options, char **lov, char **lmv, char **sec, int *flags)
{
char *this_char;
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
continue;
if (!*lmv && (*lmv = ll_read_opt("mdc", this_char)))
continue;
+ if (!*sec && (*sec = ll_read_opt("sec", this_char)))
+ continue;
if (!(*flags & LL_SBI_NOLCK) &&
((*flags) = (*flags) |
ll_set_opt("nolock", this_char,
struct ll_sb_info *sbi;
char *lov = NULL;
char *lmv = NULL;
+ char *sec = NULL;
+ __u32 nllu[2] = { 99, 99 };
int err;
ENTRY;
RETURN(-ENOMEM);
sbi->ll_flags |= LL_SBI_READAHEAD;
- ll_options(data, &lov, &lmv, &sbi->ll_flags);
+ ll_options(data, &lov, &lmv, &sec, &sbi->ll_flags);
if (!lov) {
CERROR("no osc\n");
GOTO(out, err = -EINVAL);
}
- err = lustre_common_fill_super(sb, lmv, lov);
+ err = lustre_common_fill_super(sb, lmv, lov, sec, nllu);
EXIT;
out:
if (err)
lustre_free_sbi(sb);
+ if (sec)
+ OBD_FREE(sec, strlen(sec) + 1);
if (lmv)
OBD_FREE(lmv, strlen(lmv) + 1);
if (lov)
class_uuid_t uuid;
struct obd_uuid lmv_uuid;
struct llog_ctxt *ctxt;
- int rc = 0;
- int err;
+ int rc, err = 0;
ENTRY;
if (lmd_bad_magic(lmd))
PCFG_INIT(pcfg, NAL_CMD_REGISTER_MYNID);
pcfg.pcfg_nal = lmd->lmd_nal;
pcfg.pcfg_nid = lmd->lmd_local_nid;
- err = libcfs_nal_cmd(&pcfg);
- if (err <0)
- GOTO(out, err);
+ rc = libcfs_nal_cmd(&pcfg);
+ if (rc < 0)
+ GOTO(out, rc);
}
if (lmd->lmd_nal == SOCKNAL ||
pcfg.pcfg_nid = lmd->lmd_server_nid;
pcfg.pcfg_id = lmd->lmd_server_ipaddr;
pcfg.pcfg_misc = lmd->lmd_port;
- err = libcfs_nal_cmd(&pcfg);
- if (err <0)
- GOTO(out, err);
+ rc = libcfs_nal_cmd(&pcfg);
+ if (rc < 0)
+ GOTO(out, rc);
}
LCFG_INIT(lcfg, LCFG_ADD_UUID, name);
lcfg.lcfg_inllen1 = strlen(peer) + 1;
lcfg.lcfg_inlbuf1 = peer;
lcfg.lcfg_nal = lmd->lmd_nal;
- err = class_process_config(&lcfg);
- if (err < 0)
- GOTO(out_del_conn, err);
+ rc = class_process_config(&lcfg);
+ if (rc < 0)
+ GOTO(out_del_conn, rc);
LCFG_INIT(lcfg, LCFG_ATTACH, name);
lcfg.lcfg_inlbuf1 = "mdc";
lcfg.lcfg_inlbuf2 = lmv_uuid.uuid;
lcfg.lcfg_inllen2 = strlen(lcfg.lcfg_inlbuf2) + 1;
err = class_process_config(&lcfg);
- if (err < 0)
- GOTO(out_del_uuid, err);
+ if (rc < 0)
+ GOTO(out_del_uuid, rc);
LCFG_INIT(lcfg, LCFG_SETUP, name);
lcfg.lcfg_inlbuf1 = lmd->lmd_mds;
lcfg.lcfg_inllen1 = strlen(lcfg.lcfg_inlbuf1) + 1;
lcfg.lcfg_inlbuf2 = peer;
lcfg.lcfg_inllen2 = strlen(lcfg.lcfg_inlbuf2) + 1;
- err = class_process_config(&lcfg);
- if (err < 0)
- GOTO(out_detach, err);
+ rc = class_process_config(&lcfg);
+ if (rc < 0)
+ GOTO(out_detach, rc);
obd = class_name2obd(name);
if (obd == NULL)
- GOTO(out_cleanup, err = -EINVAL);
+ GOTO(out_cleanup, rc = -EINVAL);
+
+ rc = obd_set_info(obd->obd_self_export, strlen("sec"), "sec",
+ strlen(lmd->lmd_security), lmd->lmd_security);
+ if (rc)
+ GOTO(out_cleanup, rc);
/* Disable initial recovery on this import */
- err = obd_set_info(obd->obd_self_export,
- strlen("initial_recov"), "initial_recov",
- sizeof(allow_recov), &allow_recov);
- if (err)
- GOTO(out_cleanup, err);
+ rc = obd_set_info(obd->obd_self_export,
+ strlen("initial_recov"), "initial_recov",
+ sizeof(allow_recov), &allow_recov);
+ if (rc)
+ GOTO(out_cleanup, rc);
- err = obd_connect(&md_conn, obd, &lmv_uuid, 0);
- if (err) {
- CERROR("cannot connect to %s: rc = %d\n", lmd->lmd_mds, err);
- GOTO(out_cleanup, err);
+ rc = obd_connect(&md_conn, obd, &lmv_uuid, 0);
+ if (rc) {
+ CERROR("cannot connect to %s: rc = %d\n", lmd->lmd_mds, rc);
+ GOTO(out_cleanup, rc);
}
exp = class_conn2export(&md_conn);
if (rc)
CERROR("class_config_process_llog failed: rc = %d\n", rc);
- err = obd_disconnect(exp, 0);
+ rc = obd_disconnect(exp, 0);
EXIT;
out_cleanup:
lmd->lmd_nal == IIBNAL ||
lmd->lmd_nal == VIBNAL ||
lmd->lmd_nal == RANAL) {
+ int err2;
+
PCFG_INIT(pcfg, NAL_CMD_DEL_PEER);
pcfg.pcfg_nal = lmd->lmd_nal;
pcfg.pcfg_nid = lmd->lmd_server_nid;
pcfg.pcfg_flags = 1; /* single_share */
- err = libcfs_nal_cmd(&pcfg);
- if (err <0)
+ err2 = libcfs_nal_cmd(&pcfg);
+ if (err2 && !err)
+ err = err2;
+ if (err < 0)
GOTO(out, err);
}
out:
CERROR("no mds name\n");
GOTO(out_free, err = -EINVAL);
}
+ lmd->lmd_security[sizeof(lmd->lmd_security) - 1] = 0;
OBD_ALLOC(sbi->ll_lmd, sizeof(*sbi->ll_lmd));
if (sbi->ll_lmd == NULL)
GOTO(out_free, err = -EINVAL);
}
- err = lustre_common_fill_super(sb, lmv, lov);
+ err = lustre_common_fill_super(sb, lmv, lov, lmd->lmd_security,
+ &lmd->lmd_nllu);
if (err)
GOTO(out_free, err);
/* If only OST attributes being set on objects, don't do MDS RPC.
* In that case, we need to check permissions and update the local
* inode ourselves so we can call obdo_from_inode() always. */
- if (ia_valid & (lsm ? ~(ATTR_SIZE | ATTR_FROM_OPEN | ATTR_RAW) : ~0)) {
+ if (ia_valid & (lsm ? ~(ATTR_SIZE | ATTR_FROM_OPEN /*| ATTR_RAW*/) : ~0)) {
struct lustre_md md;
OBD_ALLOC(op_data, sizeof(*op_data));
int ll_setattr(struct dentry *de, struct iattr *attr)
{
- LBUG(); /* code is unused, but leave this in case of VFS changes */
- RETURN(-ENOSYS);
+ LASSERT(de->d_inode);
+ return ll_setattr_raw(de->d_inode, attr);
}
int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs,
struct lov_stripe_md *lsm = md->lsm;
struct mds_body *body = md->body;
struct mea *mea = md->mea;
+ struct posix_acl *ll_acl_access = md->acl_access;
ENTRY;
LASSERT((lsm != NULL) == ((body->valid & OBD_MD_FLEASIZE) != 0));
LASSERT((mea != NULL) == ((body->valid & OBD_MD_FLDIREA) != 0));
+
if (lsm != NULL) {
LASSERT(lsm->lsm_object_gr > 0);
if (lli->lli_smd == NULL) {
if (body->valid & OBD_MD_FLGENER)
id_gen(&lli->lli_id) = id_gen(&body->id1);
+ spin_lock(&lli->lli_lock);
+ if (ll_acl_access != NULL) {
+ if (lli->lli_acl_access != NULL)
+ posix_acl_release(lli->lli_acl_access);
+ lli->lli_acl_access = ll_acl_access;
+ }
+ spin_unlock(&lli->lli_lock);
+
if (body->valid & OBD_MD_FLID)
inode->i_ino = id_ino(&body->id1);
if (body->valid & OBD_MD_FLGENER)
struct mds_body *body;
ll_inode2id(&id, inode);
- rc = md_getattr(sbi->ll_md_exp, &id, valid, 0, &req);
+ rc = md_getattr(sbi->ll_md_exp, &id, valid, NULL, 0, 0, &req);
if (rc) {
CERROR("failure %d inode %lu\n", rc, inode->i_ino);
RETURN(-abs(rc));
id_ino(&id) = (__u64)ino;
id_gen(&id) = generation;
- rc = md_getattr(sbi->ll_md_exp, &id, valid, eadatalen, &req);
+ rc = md_getattr(sbi->ll_md_exp, &id, valid, NULL, 0,
+ eadatalen, &req);
if (rc) {
CERROR("failure %d inode %lu\n", rc, ino);
return ERR_PTR(rc);
struct file_operations ll_ra_stats_fops;
struct file_operations llite_wait_times_fops;
-
#ifndef LPROCFS
int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
struct super_block *sb, char *osc, char *mdc)
return count;
}
+static int ll_rd_gns_upcall(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ struct super_block *sb = (struct super_block *)data;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ int len;
+
+ down(&sbi->ll_gns_sem);
+ len = snprintf(page, count, "%s\n", sbi->ll_gns_upcall);
+ up(&sbi->ll_gns_sem);
+
+ return len;
+}
+
+static int ll_wr_gns_upcall(struct file *file, const char *buffer,
+ unsigned long count, void *data)
+{
+ struct super_block *sb = (struct super_block *)data;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+
+ down(&sbi->ll_gns_sem);
+ snprintf(sbi->ll_gns_upcall, count, "%s", buffer);
+ up(&sbi->ll_gns_sem);
+
+ return count;
+}
+
+static int ll_rd_gns_object_name(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ struct super_block *sb = (struct super_block *)data;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ int len;
+
+ down(&sbi->ll_gns_sem);
+ len = snprintf(page, count, "%s\n", sbi->ll_gns_oname);
+ up(&sbi->ll_gns_sem);
+
+ return len;
+}
+
+static int ll_wr_gns_object_name(struct file *file, const char *buffer,
+ unsigned long count, void *data)
+{
+ struct super_block *sb = (struct super_block *)data;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+
+ down(&sbi->ll_gns_sem);
+ snprintf(sbi->ll_gns_oname, count, "%s", buffer);
+ up(&sbi->ll_gns_sem);
+
+ return count;
+}
+
+static int ll_rd_gns_timeout(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ struct super_block *sb = (struct super_block *)data;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ int len;
+
+ down(&sbi->ll_gns_sem);
+ len = snprintf(page, count, "%lu\n",
+ (unsigned long)sbi->ll_gns_timeout);
+ up(&sbi->ll_gns_sem);
+
+ return len;
+}
+
+static int ll_wr_gns_timeout(struct file *file, const char *buffer,
+ unsigned long count, void *data)
+{
+ struct super_block *sb = (struct super_block *)data;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ int val, rc;
+
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc)
+ return rc;
+
+ down(&sbi->ll_gns_sem);
+ sbi->ll_gns_timeout = val;
+ up(&sbi->ll_gns_sem);
+
+ return count;
+}
+
+static int ll_rd_gns_tick(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ struct super_block *sb = (struct super_block *)data;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ int len;
+
+ down(&sbi->ll_gns_sem);
+ len = snprintf(page, count, "%lu\n",
+ (unsigned long)sbi->ll_gns_tick);
+ up(&sbi->ll_gns_sem);
+
+ return len;
+}
+
+static int ll_wr_gns_tick(struct file *file, const char *buffer,
+ unsigned long count, void *data)
+{
+ struct super_block *sb = (struct super_block *)data;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ int val, rc;
+
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc)
+ return rc;
+
+ down(&sbi->ll_gns_sem);
+ if (sbi->ll_gns_tick < sbi->ll_gns_timeout)
+ sbi->ll_gns_tick = val;
+ up(&sbi->ll_gns_sem);
+
+ return count;
+}
static struct lprocfs_vars lprocfs_obd_vars[] = {
{ "uuid", ll_rd_sb_uuid, 0, 0 },
//{ "mntpt_path", ll_rd_path, 0, 0 },
{ "config_update", 0, ll_wr_config_update, 0 },
{ "max_read_ahead_mb", ll_rd_max_read_ahead_mb,
ll_wr_max_read_ahead_mb, 0 },
+
+ { "gns_upcall", ll_rd_gns_upcall,
+ ll_wr_gns_upcall, 0 },
+
+ { "gns_timeout", ll_rd_gns_timeout,
+ ll_wr_gns_timeout, 0 },
+
+ { "gns_tick", ll_rd_gns_tick,
+ ll_wr_gns_tick, 0 },
+
+ { "gns_object_name", ll_rd_gns_object_name,
+ ll_wr_gns_object_name, 0 },
+
{ 0 }
};
"direct_read" },
{ LPROC_LL_DIRECT_WRITE, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_PAGES,
"direct_write" },
-
+ { LPROC_LL_SETXATTR, LPROCFS_TYPE_REGS, "setxattr" },
+ { LPROC_LL_GETXATTR, LPROCFS_TYPE_REGS, "getxattr" },
};
int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
list_del_init(&dentry->d_lru);
hlist_del_init(&dentry->d_hash);
- __d_rehash(dentry, 0); /* avoid taking dcache_lock inside */
+ __d_rehash(dentry); /* avoid taking dcache_lock inside */
spin_unlock(&dcache_lock);
atomic_inc(&dentry->d_count);
iput(inode);
CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)\n",
inode, inode->i_ino, inode->i_generation);
- mdc_set_lock_data(NULL, &it->d.lustre.it_lock_handle, inode);
+ mdc_set_lock_data(NULL, &LUSTRE_IT(it)->it_lock_handle, inode);
/* If this is a stat, get the authoritative file size */
if (it->it_op == IT_GETATTR && S_ISREG(inode->i_mode) &&
}
static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry,
- struct nameidata *nd, struct lookup_intent *it,
- int flags)
+ struct nameidata *nd, int flags)
{
struct dentry *save = dentry, *retval;
+ struct lookup_intent *it = flags ? &nd->intent.open : NULL;
struct lustre_id pid;
struct it_cb_data icbd;
struct ptlrpc_request *req = NULL;
struct lookup_intent lookup_it = { .it_op = IT_LOOKUP };
- int rc;
+ int rc, orig_it;
ENTRY;
- if (dentry->d_name.len > EXT3_NAME_LEN)
- RETURN(ERR_PTR(-ENAMETOOLONG));
-
CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p),intent=%s\n",
dentry->d_name.name, parent->i_ino, parent->i_generation,
parent, LL_IT2STR(it));
if (nd != NULL)
nd->mnt->mnt_last_used = jiffies;
+ orig_it = it ? it->it_op : IT_OPEN;
ll_frob_intent(&it, &lookup_it);
icbd.icbd_childp = &dentry;
if (nd &&
dentry->d_inode != NULL && dentry->d_inode->i_mode & S_ISUID &&
S_ISDIR(dentry->d_inode->i_mode) &&
- (flags & LOOKUP_CONTINUE || (it->it_op & (IT_CHDIR | IT_OPEN))))
- ll_dir_process_mount_object(dentry, nd->mnt);
+ ((flags & LOOKUP_CONTINUE) || (orig_it & (IT_CHDIR | IT_OPEN))))
+ {
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags |= DCACHE_GNS_PENDING;
+ spin_unlock(&dentry->d_lock);
+ }
if (dentry == save)
GOTO(out, retval = NULL);
out:
if (req)
ptlrpc_req_finished(req);
+ if (it == &lookup_it)
+ ll_intent_release(it);
if (dentry->d_inode)
CDEBUG(D_INODE, "lookup 0x%p in %lu/%lu: %*s -> %lu/%lu\n",
dentry,
ENTRY;
if (nd && nd->flags & LOOKUP_LAST && !(nd->flags & LOOKUP_LINK_NOTLAST))
- de = ll_lookup_it(parent, dentry, nd, &nd->intent, nd->flags);
+ de = ll_lookup_it(parent, dentry, nd, nd->flags);
else
- de = ll_lookup_it(parent, dentry, nd, NULL, 0);
+ de = ll_lookup_it(parent, dentry, nd, 0);
RETURN(de);
}
int rc;
ENTRY;
- LASSERT(it && it->d.lustre.it_disposition);
- request = it->d.lustre.it_data;
+ LASSERT(it && LUSTRE_IT(it)->it_disposition);
+
+ request = LUSTRE_IT(it)->it_data;
rc = ll_prep_inode(sbi->ll_dt_exp, sbi->ll_md_exp,
&inode, request, 1, dir->i_sb);
if (rc)
* stuff it in the lock. */
CDEBUG(D_DLMTRACE, "setting l_ast_data to inode %p (%lu/%u)\n",
inode, inode->i_ino, inode->i_generation);
- mdc_set_lock_data(NULL, &it->d.lustre.it_lock_handle, inode);
+ mdc_set_lock_data(NULL, &LUSTRE_IT(it)->it_lock_handle, inode);
EXIT;
out:
ptlrpc_req_finished(request);
struct lookup_intent *it)
{
struct inode *inode;
- struct ptlrpc_request *request = it->d.lustre.it_data;
+ struct ptlrpc_request *request = LUSTRE_IT(it)->it_data;
struct obd_export *md_exp = ll_i2mdexp(dir);
int rc = 0;
ENTRY;
#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
static int ll_create_nd(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd)
{
- return ll_create_it(dir, dentry, mode, &nd->intent);
+ return ll_create_it(dir, dentry, mode, &nd->intent.open);
}
#endif
CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p)\n",
name, dir->i_ino, dir->i_generation, dir);
- if (dir->i_nlink >= EXT3_LINK_MAX)
- RETURN(err);
-
mode &= ~current->fs->umask;
switch (mode & S_IFMT) {
CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p)\n",
name, dir->i_ino, dir->i_generation, dir);
- if (dir->i_nlink >= EXT3_LINK_MAX)
- RETURN(err);
-
mode &= ~current->fs->umask;
switch (mode & S_IFMT) {
CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p),target=%s\n",
name, dir->i_ino, dir->i_generation, dir, tgt);
-
- if (dir->i_nlink >= EXT3_LINK_MAX)
- RETURN(err);
-
+
OBD_ALLOC(op_data, sizeof(*op_data));
if (op_data == NULL)
RETURN(-ENOMEM);
ll_prepare_mdc_data(op_data, dir, NULL, name, len, 0);
+ LASSERT(tgt);
err = md_create(sbi->ll_md_exp, op_data,
tgt, strlen(tgt) + 1, S_IFLNK | S_IRWXUGO,
current->fsuid, current->fsgid, 0, &request);
RETURN(err);
}
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+#define LLITE_IT_RAWOPS (IT_MKNOD|IT_MKDIR|IT_SYMLINK|IT_LINK|IT_UNLINK|IT_RMDIR|IT_RENAME)
+static int ll_rawop_from_intent(struct nameidata *nd)
+{
+ int error = 0;
+
+ if (!nd || !(nd->intent.open.op & LLITE_IT_RAWOPS))
+ return 0;
+
+ switch (nd->intent.open.op) {
+ case IT_MKNOD:
+ error = ll_mknod_raw(nd, nd->intent.open.create_mode,
+ nd->intent.open.create.dev);
+ break;
+ case IT_MKDIR:
+ error = ll_mkdir_raw(nd, nd->intent.open.create_mode);
+ break;
+ case IT_RMDIR:
+ error = ll_rmdir_raw(nd);
+ break;
+ case IT_UNLINK:
+ error = ll_unlink_raw(nd);
+ break;
+ case IT_SYMLINK:
+ LASSERT(nd->intent.open.create.link);
+ error = ll_symlink_raw(nd, nd->intent.open.create.link);
+ break;
+ case IT_LINK:
+ error = ll_link_raw(nd->intent.open.create.source_nd, nd);
+ break;
+ case IT_RENAME:
+ LASSERT(nd->intent.open.create.source_nd);
+ error = ll_rename_raw(nd->intent.open.create.source_nd, nd);
+ break;
+ default:
+ LBUG();
+ }
+ if (error != -EOPNOTSUPP)
+ nd->intent.open.flags |= IT_STATUS_RAW;
+
+ return error;
+}
+#endif
+
struct inode_operations ll_dir_inode_operations = {
- .link_raw = ll_link_raw,
- .unlink_raw = ll_unlink_raw,
- .symlink_raw = ll_symlink_raw,
- .mkdir_raw = ll_mkdir_raw,
- .rmdir_raw = ll_rmdir_raw,
- .mknod_raw = ll_mknod_raw,
.mknod = ll_mknod,
- .rename_raw = ll_rename_raw,
.setattr = ll_setattr,
- .setattr_raw = ll_setattr_raw,
#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
.create_it = ll_create_it,
.lookup_it = ll_lookup_it,
#else
.lookup = ll_lookup_nd,
.create = ll_create_nd,
- .getattr_it = ll_getattr,
+ .getattr = ll_getattr,
+ .endparentlookup = ll_rawop_from_intent,
#endif
+ .setxattr = ll_setxattr,
+ .getxattr = ll_getxattr,
+ .listxattr = ll_listxattr,
+ .removexattr = ll_removexattr,
+ .permission = ll_inode_permission,
};
rc = err;
}
- req = it->d.lustre.it_data;
+ req = LUSTRE_IT(it)->it_data;
if (req)
ptlrpc_req_finished(req);
}
struct inode_operations ll_special_inode_operations = {
- .setattr_raw = ll_setattr_raw,
.setattr = ll_setattr,
#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
- .getattr_it = ll_getattr,
+ .getattr = ll_getattr,
#else
.revalidate_it = ll_inode_revalidate_it,
#endif
+ .setxattr = ll_setxattr,
+ .getxattr = ll_getxattr,
+ .listxattr = ll_listxattr,
+ .removexattr = ll_removexattr,
+ .permission = ll_inode_permission,
+
};
struct file_operations ll_special_chr_inode_fops = {
static void ll_umount_lustre(struct super_block *sb)
{
struct ll_sb_info *sbi = ll_s2sbi(sb);
-
- ll_gns_umount_all(sbi, 0);
+ ll_gns_check_all(sbi, LL_GNS_UMOUNT);
}
static struct file_system_type lustre_lite_fs_type = {
if (ll_file_data_slab == NULL)
return -ENOMEM;
+ ll_intent_slab = kmem_cache_create("lustre_intent_data",
+ sizeof(struct lustre_intent_data),
+ 0, SLAB_HWCACHE_ALIGN, NULL,
+ NULL);
+ if (ll_intent_slab == NULL) {
+ kmem_cache_destroy(ll_file_data_slab);
+ return -ENOMEM;
+ }
+
+
proc_lustre_fs_root = proc_lustre_root ? proc_mkdir("llite", proc_lustre_root) : NULL;
rc = register_filesystem(&lustre_lite_fs_type);
LASSERTF(kmem_cache_destroy(ll_file_data_slab) == 0,
"couldn't destroy ll_file_data slab\n");
+ LASSERTF(kmem_cache_destroy(ll_intent_slab) == 0,
+ "couldn't destroy ll_intent_slab slab\n");
if (proc_lustre_fs_root) {
lprocfs_remove(proc_lustre_fs_root);
.name = "lustre_lite",
.get_sb = ll_get_sb,
.kill_sb = kill_anon_super,
- .fs_flags = FS_BINARY_MOUNTDATA,
+ .fs_flags = FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
};
struct file_system_type lustre_fs_type = {
.name = "lustre",
.get_sb = lustre_get_sb,
.kill_sb = kill_anon_super,
- .fs_flags = FS_BINARY_MOUNTDATA,
+ .fs_flags = FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
};
static int __init init_lustre_lite(void)
rc = -ENOMEM;
goto out;
}
+ ll_intent_slab = kmem_cache_create("lustre_intent_data",
+ sizeof(struct lustre_intent_data),
+ 0, SLAB_HWCACHE_ALIGN, NULL,
+ NULL);
+ if (ll_intent_slab == NULL) {
+ kmem_cache_destroy(ll_file_data_slab);
+ ll_destroy_inodecache();
+ return -ENOMEM;
+ }
+
proc_lustre_fs_root = proc_lustre_root ?
proc_mkdir("llite", proc_lustre_root) : NULL;
unregister_filesystem(&lustre_fs_type);
unregister_filesystem(&lustre_lite_fs_type);
ll_destroy_inodecache();
+
+ ll_gns_stop_thread();
LASSERTF(kmem_cache_destroy(ll_file_data_slab) == 0,
"couldn't destroy ll_file_data slab\n");
+ LASSERTF(kmem_cache_destroy(ll_intent_slab) == 0,
+ "couldn't destroy ll_intent_slab slab\n");
if (proc_lustre_fs_root) {
lprocfs_remove(proc_lustre_fs_root);
proc_lustre_fs_root = NULL;
}
ll_inode2id(&id, inode);
- rc = md_getattr(sbi->ll_md_exp, &id, OBD_MD_LINKNAME, symlen,
+ rc = md_getattr(sbi->ll_md_exp, &id, OBD_MD_LINKNAME, NULL, 0, symlen,
request);
+
if (rc) {
if (rc != -ENOENT)
CERROR("inode %lu: rc = %d\n", inode->i_ino, rc);
struct inode_operations ll_fast_symlink_inode_operations = {
.readlink = ll_readlink,
.setattr = ll_setattr,
- .setattr_raw = ll_setattr_raw,
.follow_link = ll_follow_link,
+ .setxattr = ll_setxattr,
+ .getxattr = ll_getxattr,
+ .listxattr = ll_listxattr,
+ .removexattr = ll_removexattr,
#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
.revalidate_it = ll_inode_revalidate_it
#else
- .getattr_it = ll_getattr
+ .getattr = ll_getattr
#endif
};
#include <linux/lprocfs_status.h>
#include <linux/lustre_fsfilt.h>
#include <linux/obd_lmv.h>
+#include <linux/namei.h>
+#include <linux/lustre_lite.h>
#include "lmv_internal.h"
static inline void lmv_drop_intent_lock(struct lookup_intent *it)
{
- if (it->d.lustre.it_lock_mode != 0)
- ldlm_lock_decref((void *)&it->d.lustre.it_lock_handle,
- it->d.lustre.it_lock_mode);
+ if (LUSTRE_IT(it)->it_lock_mode != 0)
+ ldlm_lock_decref((void *)&LUSTRE_IT(it)->it_lock_handle,
+ LUSTRE_IT(it)->it_lock_mode);
}
int lmv_handle_remote_inode(struct obd_export *exp, void *lmm,
}
/* we got LOOKUP lock, but we really need attrs */
- pmode = it->d.lustre.it_lock_mode;
+ pmode = LUSTRE_IT(it)->it_lock_mode;
if (pmode) {
- memcpy(&plock, &it->d.lustre.it_lock_handle,
+ memcpy(&plock, &LUSTRE_IT(it)->it_lock_handle,
sizeof(plock));
- it->d.lustre.it_lock_mode = 0;
+ LUSTRE_IT(it)->it_lock_mode = 0;
}
LASSERT((body->valid & OBD_MD_FID) != 0);
nid = body->id1;
- it->d.lustre.it_disposition &= ~DISP_ENQ_COMPLETE;
+ LUSTRE_IT(it)->it_disposition &= ~DISP_ENQ_COMPLETE;
rc = md_intent_lock(lmv->tgts[id_group(&nid)].ltd_exp, &nid, NULL,
0, lmm, lmmsize, NULL, it, flags, &req, cb_blocking);
*/
if (rc == 0) {
lmv_drop_intent_lock(it);
- memcpy(&it->d.lustre.it_lock_handle, &plock,
+ memcpy(&LUSTRE_IT(it)->it_lock_handle, &plock,
sizeof(plock));
- it->d.lustre.it_lock_mode = pmode;
+ LUSTRE_IT(it)->it_lock_mode = pmode;
} else if (pmode)
ldlm_lock_decref(&plock, pmode);
* nothing is found, do not access body->id1 as it is zero and thus
* pointless.
*/
- if (it->d.lustre.it_disposition & DISP_LOOKUP_NEG)
+ if (LUSTRE_IT(it)->it_disposition & DISP_LOOKUP_NEG)
RETURN(0);
/* caller may use attrs MDS returns on IT_OPEN lock request so, we have
* nothing is found, do not access body->id1 as it is zero and thus
* pointless.
*/
- if (it->d.lustre.it_disposition & DISP_LOOKUP_NEG)
+ if (LUSTRE_IT(it)->it_disposition & DISP_LOOKUP_NEG)
RETURN(0);
body = lustre_msg_buf((*reqp)->rq_repmsg, 1, sizeof(*body));
/* is obj valid? */
memset(&it, 0, sizeof(it));
it.it_op = IT_GETATTR;
+ OBD_ALLOC(it.d.fs_data, sizeof(struct lustre_intent_data));
+
rc = md_intent_lock(lmv->tgts[id_group(&id)].ltd_exp, &id,
NULL, 0, NULL, 0, &id, &it, 0, &req,
lmv_dirobj_blocking_ast);
- lockh = (struct lustre_handle *)&it.d.lustre.it_lock_handle;
+ lockh = (struct lustre_handle *)&LUSTRE_IT(&it)->it_lock_handle;
if (rc > 0 && req == NULL) {
/* nice, this slave is valid */
LASSERT(req == NULL);
goto release_lock;
}
- if (rc < 0)
+ if (rc < 0) {
+ OBD_FREE(it.d.fs_data, sizeof(struct lustre_intent_data));
/* error during lookup */
GOTO(cleanup, rc);
-
+ }
lock = ldlm_handle2lock(lockh);
LASSERT(lock);
release_lock:
lmv_update_body_from_obj(body, obj->objs + i);
- if (it.d.lustre.it_lock_mode)
- ldlm_lock_decref(lockh, it.d.lustre.it_lock_mode);
+ if (LUSTRE_IT(&it)->it_lock_mode)
+ ldlm_lock_decref(lockh, LUSTRE_IT(&it)->it_lock_mode);
+ OBD_FREE(it.d.fs_data, sizeof(struct lustre_intent_data));
}
EXIT;
memset(&it, 0, sizeof(it));
it.it_op = IT_GETATTR;
+
cb = lmv_dirobj_blocking_ast;
+ OBD_ALLOC(it.d.fs_data, sizeof(struct lustre_intent_data));
if (id_equal_fid(&id, &obj->id)) {
if (master_valid) {
/* lmv_intent_getattr() already checked
cb = cb_blocking;
}
+
/* is obj valid? */
rc = md_intent_lock(lmv->tgts[id_group(&id)].ltd_exp,
&id, NULL, 0, NULL, 0, &id, &it, 0,
&req, cb);
- lockh = (struct lustre_handle *) &it.d.lustre.it_lock_handle;
+ lockh = (struct lustre_handle *) &LUSTRE_IT(&it)->it_lock_handle;
if (rc > 0 && req == NULL) {
/* nice, this slave is valid */
LASSERT(req == NULL);
goto release_lock;
}
- if (rc < 0)
+ if (rc < 0) {
+ OBD_FREE(it.d.fs_data, sizeof(struct lustre_intent_data));
/* error during revalidation */
GOTO(cleanup, rc);
-
+ }
if (master) {
LASSERT(master_valid == 0);
/* save lock on master to be returned to the caller */
CDEBUG(D_OTHER, "no lock on master yet\n");
memcpy(&master_lockh, lockh, sizeof(master_lockh));
- master_lock_mode = it.d.lustre.it_lock_mode;
- it.d.lustre.it_lock_mode = 0;
+ master_lock_mode = LUSTRE_IT(&it)->it_lock_mode;
+ LUSTRE_IT(&it)->it_lock_mode = 0;
} else {
/* this is slave. we want to control it */
lock = ldlm_handle2lock(lockh);
CDEBUG(D_OTHER, "fresh: %lu\n",
(unsigned long)obj->objs[i].size);
-
+
if (req)
ptlrpc_req_finished(req);
release_lock:
size += obj->objs[i].size;
- if (it.d.lustre.it_lock_mode)
- ldlm_lock_decref(lockh, it.d.lustre.it_lock_mode);
+ if (LUSTRE_IT(&it)->it_lock_mode)
+ ldlm_lock_decref(lockh, LUSTRE_IT(&it)->it_lock_mode);
+ OBD_FREE(it.d.fs_data, sizeof(struct lustre_intent_data));
}
if (*reqp) {
// body->mds = id_group(&obj->id);
}
if (master_valid == 0) {
- memcpy(&oit->d.lustre.it_lock_handle,
+ memcpy(&LUSTRE_IT(oit)->it_lock_handle,
&master_lockh, sizeof(master_lockh));
- oit->d.lustre.it_lock_mode = master_lock_mode;
+ LUSTRE_IT(oit)->it_lock_mode = master_lock_mode;
}
rc = 0;
} else {
/* it seems all the attrs are fresh and we did no request */
CDEBUG(D_OTHER, "all the attrs were fresh\n");
if (master_valid == 0)
- oit->d.lustre.it_lock_mode = master_lock_mode;
+ LUSTRE_IT(oit)->it_lock_mode = master_lock_mode;
rc = 1;
}
#include <linux/pagemap.h>
#include <asm/div64.h>
#include <linux/seq_file.h>
+#include <linux/namei.h>
#else
#include <liblustre.h>
#endif
#include <linux/lprocfs_status.h>
#include <linux/lustre_fsfilt.h>
#include <linux/obd_lmv.h>
+#include <linux/lustre_lite.h>
#include "lmv_internal.h"
/* object cache. */
}
static int lmv_getattr(struct obd_export *exp, struct lustre_id *id,
- __u64 valid, unsigned int ea_size,
- struct ptlrpc_request **request)
+ __u64 valid, const char *ea_name, int ea_namelen,
+ unsigned int ea_size, struct ptlrpc_request **request)
{
struct obd_device *obd = exp->exp_obd;
struct lmv_obd *lmv = &obd->u.lmv;
LASSERT(i < lmv->desc.ld_tgt_count);
+
rc = md_getattr(lmv->tgts[i].ltd_exp, id, valid,
- ea_size, request);
+ ea_name, ea_namelen, ea_size, request);
if (rc)
RETURN(rc);
/* time to update mea of parent id */
rc = md_getattr(lmv->tgts[id_group(id)].ltd_exp,
- id, valid, mealen, &req);
+ id, valid, NULL, 0, mealen, &req);
if (rc) {
CERROR("md_getattr() failed, error %d\n", rc);
GOTO(cleanup, rc);
cb_compl, cb_blocking, cb_data);
CDEBUG(D_OTHER, "take lock on slave "DLID4" -> %d/%d\n",
- OLID4(&mea->mea_ids[i]), rc, it->d.lustre.it_status);
+ OLID4(&mea->mea_ids[i]), rc, LUSTRE_IT(it)->it_status);
if (rc)
GOTO(cleanup, rc);
- if (it->d.lustre.it_data) {
+ if (LUSTRE_IT(it)->it_data) {
struct ptlrpc_request *req;
- req = (struct ptlrpc_request *)it->d.lustre.it_data;
+ req = (struct ptlrpc_request *) LUSTRE_IT(it)->it_data;
ptlrpc_req_finished(req);
}
- if (it->d.lustre.it_status)
- GOTO(cleanup, rc = it->d.lustre.it_status);
+ if (LUSTRE_IT(it)->it_status)
+ GOTO(cleanup, rc = LUSTRE_IT(it)->it_status);
}
OBD_FREE(data2, sizeof(*data2));
lmv_set_timeouts(obd);
RETURN(0);
}
-
+
+ /* maybe this could be default */
+ if ((keylen == strlen("sec") && strcmp(key, "sec") == 0) ||
+ (keylen == strlen("nllu") && strcmp(key, "nllu") == 0)) {
+ struct lmv_tgt_desc *tgt;
+ struct obd_export *exp;
+ int rc = 0, err, i;
+
+ spin_lock(&lmv->lmv_lock);
+ for (i = 0, tgt = lmv->tgts; i < lmv->desc.ld_tgt_count;
+ i++, tgt++) {
+ exp = tgt->ltd_exp;
+ /* during setup time the connections to mdc might
+ * not have been established yet.
+ */
+ if (exp == NULL) {
+ struct obd_device *tgt_obd;
+
+ tgt_obd = class_find_client_obd(&tgt->uuid,
+ LUSTRE_MDC_NAME,
+ &obd->obd_uuid);
+ if (!tgt_obd) {
+ CERROR("can't set info %s, "
+ "device %s not attached?\n",
+ (char *) key, tgt->uuid.uuid);
+ rc = -EINVAL;
+ continue;
+ }
+ exp = tgt_obd->obd_self_export;
+ }
+
+ err = obd_set_info(exp, keylen, key, vallen, val);
+ if (!rc)
+ rc = err;
+ }
+ spin_unlock(&lmv->lmv_lock);
+
+ RETURN(rc);
+ }
+
RETURN(-EINVAL);
}
valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
rc = md_getattr(lmv->tgts[id_group(id)].ltd_exp,
- id, valid, mealen, &req);
+ id, valid, NULL, 0, mealen, &req);
if (rc) {
CERROR("md_getattr() failed, error %d\n", rc);
GOTO(cleanup, obj = ERR_PTR(rc));
} else if (KEY_IS("unlinked") || KEY_IS("unrecovery")) {
if (vallen != 0)
RETURN(-EINVAL);
+ } else if (KEY_IS("sec")) {
+ struct lov_tgt_desc *tgt;
+ struct obd_export *exp;
+ int rc = 0, err, i;
+
+ spin_lock(&lov->lov_lock);
+ for (i = 0, tgt = lov->tgts; i < lov->desc.ld_tgt_count;
+ i++, tgt++) {
+ exp = tgt->ltd_exp;
+ /* during setup time the connections to osc might
+ * not have been established yet.
+ */
+ if (exp == NULL) {
+ struct obd_device *tgt_obd;
+
+ tgt_obd = class_find_client_obd(&tgt->uuid,
+ LUSTRE_OSC_NAME,
+ &obddev->obd_uuid);
+ if (!tgt_obd) {
+ CERROR("can't set security flavor, "
+ "device %s not attached?\n",
+ tgt->uuid.uuid);
+ rc = -EINVAL;
+ continue;
+ }
+ exp = tgt_obd->obd_self_export;
+ }
+
+ err = obd_set_info(exp, keylen, key, vallen, val);
+ if (!rc)
+ rc = err;
+ }
+ spin_unlock(&lov->lov_lock);
+
+ RETURN(rc);
} else {
RETURN(-EINVAL);
}
handle = fsfilt->fs_start(dir, FSFILT_OP_SYMLINK, NULL, 0);
if (IS_ERR(handle))
GOTO(cleanup, rc = PTR_ERR(handle));
- rc = ll_vfs_symlink(dir, dentry, new_path);
+ rc = ll_vfs_symlink(dir, dentry, new_path, S_IALLUGO);
break;
}
case S_IFCHR:
if LIBLUSTRE
noinst_LIBRARIES = libmdc.a
-libmdc_a_SOURCES = mdc_request.c mdc_reint.c mdc_lib.c mdc_internal.h mdc_locks.c
+libmdc_a_SOURCES = #mdc_request.c mdc_reint.c mdc_lib.c mdc_internal.h mdc_locks.c
libmdc_a_CPPFLAGS = $(LLCPPFLAGS)
libmdc_a_CFLAGS = $(LLCFLAGS)
endif
#include <linux/lustre_mds.h>
#include <linux/lustre_dlm.h>
#include <linux/lprocfs_status.h>
+#include <linux/lustre_acl.h>
+#include <linux/lustre_lite.h>
#include "mdc_internal.h"
int it_disposition(struct lookup_intent *it, int flag)
{
- return it->d.lustre.it_disposition & flag;
+ return LUSTRE_IT(it)->it_disposition & flag;
}
EXPORT_SYMBOL(it_disposition);
void it_set_disposition(struct lookup_intent *it, int flag)
{
- it->d.lustre.it_disposition |= flag;
+ LUSTRE_IT(it)->it_disposition |= flag;
}
EXPORT_SYMBOL(it_set_disposition);
{
if (it_disposition(it, DISP_OPEN_OPEN)) {
if (phase == DISP_OPEN_OPEN)
- return it->d.lustre.it_status;
+ return LUSTRE_IT(it)->it_status;
else
return 0;
}
if (it_disposition(it, DISP_OPEN_CREATE)) {
if (phase == DISP_OPEN_CREATE)
- return it->d.lustre.it_status;
+ return LUSTRE_IT(it)->it_status;
else
return 0;
}
if (it_disposition(it, DISP_LOOKUP_EXECD)) {
if (phase == DISP_LOOKUP_EXECD)
- return it->d.lustre.it_status;
+ return LUSTRE_IT(it)->it_status;
else
return 0;
}
if (it_disposition(it, DISP_IT_EXECD)) {
if (phase == DISP_IT_EXECD)
- return it->d.lustre.it_status;
+ return LUSTRE_IT(it)->it_status;
else
return 0;
}
- CERROR("it disp: %X, status: %d\n", it->d.lustre.it_disposition,
- it->d.lustre.it_status);
+ CERROR("it disp: %X, status: %d\n", LUSTRE_IT(it)->it_disposition,
+ LUSTRE_IT(it)->it_status);
LBUG();
return 0;
}
int reqsize[6] = {[MDS_REQ_SECDESC_OFF] = 0,
[MDS_REQ_INTENT_LOCKREQ_OFF] = sizeof(*lockreq),
[MDS_REQ_INTENT_IT_OFF] = sizeof(*lit)};
- int repsize[4] = {sizeof(struct ldlm_reply),
+ int repsize[5] = {sizeof(struct ldlm_reply),
sizeof(struct mds_body),
- obddev->u.cli.cl_max_mds_easize,
- obddev->u.cli.cl_max_mds_cookiesize};
+ obddev->u.cli.cl_max_mds_easize};
int req_buffers = 3, reply_buffers = 0;
int rc, flags = LDLM_FL_HAS_INTENT;
void *eadata;
it->it_create_mode, 0, it->it_flags,
lmm, lmmsize);
/* get ready for the reply */
- reply_buffers = 3;
- req->rq_replen = lustre_msg_size(3, repsize);
+ repsize[3] = 4;
+ repsize[4] = xattr_acl_size(LL_ACL_MAX_ENTRIES);
+ reply_buffers = 5;
+ req->rq_replen = lustre_msg_size(5, repsize);
} else if (it->it_op & (IT_GETATTR | IT_LOOKUP | IT_CHDIR)) {
- __u64 valid = data->valid | OBD_MD_FLNOTOBD | OBD_MD_FLEASIZE;
+ __u64 valid = data->valid | OBD_MD_FLNOTOBD | OBD_MD_FLEASIZE |
+ OBD_MD_FLACL_ACCESS;
reqsize[req_buffers++] = sizeof(struct mds_body);
reqsize[req_buffers++] = data->namelen + 1;
valid, it->it_flags, data);
/* get ready for the reply */
- reply_buffers = 3;
- req->rq_replen = lustre_msg_size(3, repsize);
+ repsize[3] = 4;
+ repsize[4] = xattr_acl_size(LL_ACL_MAX_ENTRIES);
+ reply_buffers = 5;
+ req->rq_replen = lustre_msg_size(5, repsize);
} else if (it->it_op == IT_READDIR) {
policy.l_inodebits.bits = MDS_INODELOCK_UPDATE;
req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_DLM_VERSION,
LASSERT(dlm_rep != NULL); /* checked by ldlm_cli_enqueue() */
LASSERT_REPSWABBED(req, 0); /* swabbed by ldlm_cli_enqueue() */
- it->d.lustre.it_disposition = (int) dlm_rep->lock_policy_res1;
- it->d.lustre.it_status = (int) dlm_rep->lock_policy_res2;
- it->d.lustre.it_lock_mode = lock_mode;
- it->d.lustre.it_data = req;
+ LUSTRE_IT(it)->it_disposition = (int) dlm_rep->lock_policy_res1;
+ LUSTRE_IT(it)->it_status = (int) dlm_rep->lock_policy_res2;
+ LUSTRE_IT(it)->it_lock_mode = lock_mode;
+ LUSTRE_IT(it)->it_data = req;
- if (it->d.lustre.it_status < 0 && req->rq_replay) {
+ if (LUSTRE_IT(it)->it_status < 0 && req->rq_replay) {
LASSERT(req->rq_transno == 0);
/* Don't hold error requests for replay. */
spin_lock(&req->rq_lock);
}
DEBUG_REQ(D_RPCTRACE, req, "disposition: %x, status: %d",
- it->d.lustre.it_disposition, it->d.lustre.it_status);
+ LUSTRE_IT(it)->it_disposition, LUSTRE_IT(it)->it_status);
/* We know what to expect, so we do any byte flipping required here */
- LASSERT(reply_buffers == 4 || reply_buffers == 3 || reply_buffers == 1);
+ LASSERT(reply_buffers == 5 || reply_buffers == 4 ||
+ reply_buffers == 3 || reply_buffers == 1);
if (reply_buffers >= 3) {
struct mds_body *body;
* ll_create/ll_open gets called.
*
* The server will return to us, in it_disposition, an indication of
- * exactly what d.lustre.it_status refers to.
+ * exactly what LUSTRE_IT(it)->it_status refers to.
*
- * If DISP_OPEN_OPEN is set, then d.lustre.it_status refers to the open() call,
+ * If DISP_OPEN_OPEN is set, then LUSTRE_IT(it)->it_status refers to the open() call,
* otherwise if DISP_OPEN_CREATE is set, then it status is the
* creation failure mode. In either case, one of DISP_LOOKUP_NEG or
* DISP_LOOKUP_POS will be set, indicating whether the child lookup
* was successful.
*
- * Else, if DISP_LOOKUP_EXECD then d.lustre.it_status is the rc of the
+ * Else, if DISP_LOOKUP_EXECD then LUSTRE_IT(it)->it_status is the rc of the
* child lookup.
*/
int mdc_intent_lock(struct obd_export *exp, struct lustre_id *pid,
&lockh);
}
if (rc) {
- memcpy(&it->d.lustre.it_lock_handle, &lockh,
+ memcpy(&LUSTRE_IT(it)->it_lock_handle, &lockh,
sizeof(lockh));
- it->d.lustre.it_lock_mode = mode;
+ LUSTRE_IT(it)->it_lock_mode = mode;
}
/* Only return failure if it was not GETATTR by cid (from
if (rc < 0)
RETURN(rc);
- memcpy(&it->d.lustre.it_lock_handle, &lockh, sizeof(lockh));
+ memcpy(&LUSTRE_IT(it)->it_lock_handle, &lockh, sizeof(lockh));
}
- request = *reqp = it->d.lustre.it_data;
+ request = *reqp = LUSTRE_IT(it)->it_data;
LASSERT(request != NULL);
/* If we're doing an IT_OPEN which did not result in an actual
* 3440) */
if (it->it_op & IT_OPEN) {
if (!it_disposition(it, DISP_OPEN_OPEN) ||
- it->d.lustre.it_status != 0) {
+ LUSTRE_IT(it)->it_status != 0) {
unsigned long irqflags;
spin_lock_irqsave(&request->rq_lock, irqflags);
if (!it_disposition(it, DISP_IT_EXECD)) {
/* The server failed before it even started executing the
* intent, i.e. because it couldn't unpack the request. */
- LASSERT(it->d.lustre.it_status != 0);
- RETURN(it->d.lustre.it_status);
+ LASSERT(LUSTRE_IT(it)->it_status != 0);
+ RETURN(LUSTRE_IT(it)->it_status);
}
rc = it_open_error(DISP_IT_EXECD, it);
if (rc)
if (ldlm_lock_match(NULL, LDLM_FL_BLOCK_GRANTED, NULL,
LDLM_IBITS, &policy, LCK_NL, &old_lock)) {
ldlm_lock_decref_and_cancel(&lockh,
- it->d.lustre.it_lock_mode);
+ LUSTRE_IT(it)->it_lock_mode);
memcpy(&lockh, &old_lock, sizeof(old_lock));
- memcpy(&it->d.lustre.it_lock_handle, &lockh,
+ memcpy(&LUSTRE_IT(it)->it_lock_handle, &lockh,
sizeof(lockh));
}
}
CDEBUG(D_DENTRY, "D_IT dentry %*s intent: %s status %d disp %x rc %d\n",
- len, name, ldlm_it2str(it->it_op), it->d.lustre.it_status,
- it->d.lustre.it_disposition, rc);
+ len, name, ldlm_it2str(it->it_op), LUSTRE_IT(it)->it_status,
+ LUSTRE_IT(it)->it_disposition, rc);
RETURN(rc);
}
#include <linux/obd_class.h>
#include <linux/lustre_mds.h>
#include <linux/lustre_dlm.h>
+#include <linux/lustre_sec.h>
#include <linux/lprocfs_status.h>
+#include <linux/lustre_acl.h>
#include "mdc_internal.h"
#define REQUEST_MINOR 244
int mdc_getattr_common(struct obd_export *exp, unsigned int ea_size,
struct ptlrpc_request *req)
{
- struct mds_body *body;
+ struct mds_body *body, *reqbody;
void *eadata;
int rc;
- int repsize[2] = {sizeof(*body), 0};
+ int repsize[4] = {sizeof(*body)};
int bufcount = 1;
ENTRY;
CDEBUG(D_INODE, "reserved %u bytes for MD/symlink in packet\n",
ea_size);
}
+
+ reqbody = lustre_msg_buf(req->rq_reqmsg, 1, sizeof(*reqbody));
+
+ if (reqbody->valid & OBD_MD_FLACL_ACCESS) {
+ repsize[bufcount++] = 4;
+ repsize[bufcount++] = xattr_acl_size(LL_ACL_MAX_ENTRIES);
+ }
+
req->rq_replen = lustre_msg_size(bufcount, repsize);
mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
CDEBUG(D_NET, "mode: %o\n", body->mode);
LASSERT_REPSWAB (req, 1);
- if (body->eadatasize != 0) {
+
+ /* Skip the check if getxattr/listxattr are called with no buffers */
+ if ((reqbody->valid & (OBD_MD_FLEA | OBD_MD_FLEALIST)) &&
+ (reqbody->eadatasize != 0)){
+ if (body->eadatasize != 0) {
/* reply indicates presence of eadata; check it's there... */
- eadata = lustre_msg_buf (req->rq_repmsg, 1, body->eadatasize);
- if (eadata == NULL) {
- CERROR ("Missing/short eadata\n");
- RETURN (-EPROTO);
- }
- }
+ eadata = lustre_msg_buf (req->rq_repmsg, 1,
+ body->eadatasize);
+ if (eadata == NULL) {
+ CERROR ("Missing/short eadata\n");
+ RETURN (-EPROTO);
+ }
+ }
+ }
RETURN (0);
}
int mdc_getattr(struct obd_export *exp, struct lustre_id *id,
- __u64 valid, unsigned int ea_size,
- struct ptlrpc_request **request)
+ __u64 valid, const char *ea_name, int ea_namelen,
+ unsigned int ea_size, struct ptlrpc_request **request)
{
struct ptlrpc_request *req;
struct mds_body *body;
- int size[2] = {0, sizeof(*body)};
+ int bufcount = 2;
+ int size[3] = {0, sizeof(*body)};
int rc;
ENTRY;
*/
size[0] = mdc_get_secdesc_size();
+ LASSERT((ea_name != NULL) == (ea_namelen != 0));
+ if (valid & (OBD_MD_FLEA | OBD_MD_FLEALIST)) {
+ size[bufcount] = ea_namelen;
+ bufcount++;
+ }
+
req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION,
- MDS_GETATTR, 2, size, NULL);
+ MDS_GETATTR, bufcount, size, NULL);
if (!req)
GOTO(out, rc = -ENOMEM);
body->valid = valid;
body->eadatasize = ea_size;
+
+ if (valid & OBD_MD_FLEA) {
+ LASSERT(strnlen(ea_name, ea_namelen) == (ea_namelen - 1));
+ memcpy(lustre_msg_buf(req->rq_reqmsg, 2, ea_namelen),
+ ea_name, ea_namelen);
+ }
+
rc = mdc_getattr_common(exp, ea_size, req);
if (rc != 0) {
ptlrpc_req_finished (req);
unsigned int offset, struct obd_export *exp_lov,
struct lustre_md *md)
{
+ void *buf;
+ int size, acl_off;
+ struct posix_acl *acl;
int rc = 0;
ENTRY;
CERROR("Detected invalid mea, which does not "
"support neither old either new format.\n");
} else {
- LASSERT(0);
+ LASSERT(S_ISCHR(md->body->mode) ||
+ S_ISBLK(md->body->mode) ||
+ S_ISFIFO(md->body->mode)||
+ S_ISLNK(md->body->mode) ||
+ S_ISSOCK(md->body->mode));
}
+
+ acl_off = (md->body->valid & OBD_MD_FLEASIZE) ? (offset + 2) :
+ (offset + 1);
+
+ if (md->body->valid & OBD_MD_FLACL_ACCESS) {
+ size = le32_to_cpu(*(__u32 *) lustre_msg_buf(req->rq_repmsg,
+ acl_off, 4));
+ buf = lustre_msg_buf(req->rq_repmsg, acl_off + 1, size);
+
+ acl = posix_acl_from_xattr(buf, size);
+ if (IS_ERR(acl)) {
+ rc = PTR_ERR(acl);
+ CERROR("convert xattr to acl failed: %d\n", rc);
+ RETURN(rc);
+ } else if (acl) {
+ rc = posix_acl_valid(acl);
+ if (rc) {
+ CERROR("acl valid error: %d\n", rc);
+ posix_acl_release(acl);
+ RETURN(rc);
+ }
+ }
+
+ md->acl_access = acl;
+ }
+
RETURN(rc);
}
imp->imp_server_timeout = 1;
CDEBUG(D_OTHER, "%s: timeout / 2\n", exp->exp_obd->obd_name);
RETURN(0);
+ } else if (keylen == strlen("sec") && memcmp(key, "sec", keylen) == 0) {
+ struct client_obd *cli = &exp->exp_obd->u.cli;
+
+ if (vallen == strlen("null") &&
+ memcmp(val, "null", vallen) == 0) {
+ cli->cl_sec_flavor = PTLRPC_SEC_NULL;
+ cli->cl_sec_subflavor = 0;
+ RETURN(0);
+ }
+ if (vallen == strlen("krb5i") &&
+ memcmp(val, "krb5i", vallen) == 0) {
+ cli->cl_sec_flavor = PTLRPC_SEC_GSS;
+ cli->cl_sec_subflavor = PTLRPC_SEC_GSS_KRB5I;
+ RETURN(0);
+ }
+ if (vallen == strlen("krb5p") &&
+ memcmp(val, "krb5p", vallen) == 0) {
+ cli->cl_sec_flavor = PTLRPC_SEC_GSS;
+ cli->cl_sec_subflavor = PTLRPC_SEC_GSS_KRB5P;
+ RETURN(0);
+ }
+ CERROR("unrecognized security type %s\n", (char*) val);
+ rc = -EINVAL;
+ } else if (keylen == strlen("nllu") && memcmp(key, "nllu", keylen) == 0) {
+ struct client_obd *cli = &exp->exp_obd->u.cli;
+
+ LASSERT(vallen == sizeof(__u32) * 2);
+ cli->cl_nllu = ((__u32 *) val)[0];
+ cli->cl_nllg = ((__u32 *) val)[1];
+ RETURN(0);
}
+
RETURN(rc);
}
MODULES := mds
mds-objs := mds_log.o mds_unlink_open.o mds_lov.o handler.o mds_reint.o
-mds-objs += mds_fs.o lproc_mds.o mds_open.o mds_lib.o mds_lmv.o mds_groups.o
+mds-objs += mds_fs.o lproc_mds.o mds_open.o mds_lib.o mds_lmv.o mds_lsd.o
@INCLUDE_RULES@
#include <linux/random.h>
#include <linux/fs.h>
#include <linux/jbd.h>
+#include <linux/namei.h>
#include <linux/ext3_fs.h>
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
# include <linux/smp_lock.h>
#include <linux/lprocfs_status.h>
#include <linux/lustre_commit_confd.h>
+#include <linux/lustre_acl.h>
#include "mds_internal.h"
static int mds_intent_policy(struct ldlm_namespace *ns,
RETURN(rc);
}
+int mds_pack_link(struct dentry *dentry, struct ptlrpc_request *req,
+ struct mds_body *repbody, int reply_off)
+{
+ struct inode *inode = dentry->d_inode;
+ char *symname;
+ int len, rc;
+ ENTRY;
+
+ symname = lustre_msg_buf(req->rq_repmsg, reply_off + 1,0);
+ LASSERT(symname != NULL);
+ len = req->rq_repmsg->buflens[reply_off + 1];
+
+ rc = inode->i_op->readlink(dentry, symname, len);
+ if (rc < 0) {
+ CERROR("readlink failed: %d\n", rc);
+ } else if (rc != len - 1) {
+ CERROR ("Unexpected readlink rc %d: expecting %d\n",
+ rc, len - 1);
+ rc = -EINVAL;
+ } else {
+ CDEBUG(D_INODE, "read symlink dest %s\n", symname);
+ repbody->valid |= OBD_MD_LINKNAME;
+ repbody->eadatasize = rc + 1;
+ symname[rc] = 0; /* NULL terminate */
+ rc = 0;
+ }
+
+ RETURN(rc);
+}
+
+int mds_pack_ea(struct dentry *dentry, struct ptlrpc_request *req,
+ struct mds_body *repbody, int req_off, int reply_off)
+{
+ struct inode *inode = dentry->d_inode;
+ char *ea_name;
+ void *value = NULL;
+ int len, rc;
+ ENTRY;
+
+ ea_name = lustre_msg_string(req->rq_reqmsg, req_off + 1, 0);
+ len = req->rq_repmsg->buflens[reply_off + 1];
+ if (len != 0)
+ value = lustre_msg_buf(req->rq_repmsg, reply_off + 1, len);
+
+ rc = -EOPNOTSUPP;
+ if (inode->i_op && inode->i_op->getxattr)
+ rc = inode->i_op->getxattr(dentry, ea_name, value, len);
+ if (rc < 0) {
+ if (rc != -ENODATA && rc != -EOPNOTSUPP)
+ CERROR("getxattr failed: %d", rc);
+ } else {
+ repbody->valid |= OBD_MD_FLEA;
+ repbody->eadatasize = rc;
+ rc = 0;
+ }
+
+ RETURN(rc);
+}
+
+int mds_pack_ealist(struct dentry *dentry, struct ptlrpc_request *req,
+ struct mds_body *repbody, int reply_off)
+{
+ struct inode *inode = dentry->d_inode;
+ void *value = NULL;
+ int len, rc;
+ ENTRY;
+
+ len = req->rq_repmsg->buflens[reply_off + 1];
+ if (len != 0)
+ value = lustre_msg_buf(req->rq_repmsg, reply_off + 1, len);
+
+ rc = -EOPNOTSUPP;
+ if (inode->i_op && inode->i_op->getxattr)
+ rc = inode->i_op->listxattr(dentry, value, len);
+
+ if (rc < 0) {
+ CERROR("listxattr failed: %d", rc);
+ } else {
+ repbody->valid |= OBD_MD_FLEALIST;
+ repbody->eadatasize = rc;
+ rc = 0;
+ }
+ RETURN(rc);
+}
+
+int mds_pack_acl(struct obd_device *obd, struct lustre_msg *repmsg, int offset,
+ struct mds_body *body, struct inode *inode)
+{
+ struct dentry de = { .d_inode = inode };
+ void *buf;
+ __u32 buflen, *sizep, size;
+ ENTRY;
+
+ if (!inode->i_op->getxattr)
+ RETURN(0);
+
+ buflen = repmsg->buflens[offset + 1];
+ buf = lustre_msg_buf(repmsg, offset + 1, buflen);
+
+ size = inode->i_op->getxattr(&de, XATTR_NAME_ACL_ACCESS, buf, buflen);
+ if (size == -ENODATA)
+ RETURN(0);
+ if (size < 0)
+ RETURN(size);
+ LASSERT(size);
+
+ sizep = lustre_msg_buf(repmsg, offset, 4);
+ if (!sizep) {
+ CERROR("can't locate returned acl size buf\n");
+ RETURN(-EPROTO);
+ }
+
+ *sizep = cpu_to_le32(size);
+ body->valid |= OBD_MD_FLACL_ACCESS;
+
+ RETURN(0);
+}
+
+/*
+ * we only take care of fsuid/fsgid.
+ */
void mds_squash_root(struct mds_obd *mds, struct mds_req_sec_desc *rsd,
ptl_nid_t *peernid)
{
- if (!mds->mds_squash_uid ||
- (rsd->rsd_uid && rsd->rsd_fsuid))
+ if (!mds->mds_squash_uid || rsd->rsd_fsuid)
return;
if (*peernid == mds->mds_nosquash_nid)
return;
- CDEBUG(D_OTHER, "squash req from 0x%llx, (%d:%d/%x)=>(%d:%d/%x)\n",
+ CDEBUG(D_SEC, "squash req from 0x%llx, (%d:%d/%x)=>(%d:%d/%x)\n",
*peernid, rsd->rsd_fsuid, rsd->rsd_fsgid, rsd->rsd_cap,
mds->mds_squash_uid, mds->mds_squash_gid,
(rsd->rsd_cap & ~CAP_FS_MASK));
- rsd->rsd_uid = mds->mds_squash_uid;
rsd->rsd_fsuid = mds->mds_squash_uid;
rsd->rsd_fsgid = mds->mds_squash_gid;
-
- /* XXX should we remove all capabilities? */
rsd->rsd_cap &= ~CAP_FS_MASK;
}
static int mds_getattr_internal(struct obd_device *obd, struct dentry *dentry,
- struct ptlrpc_request *req, struct mds_body *reqbody,
- int reply_off)
+ struct ptlrpc_request *req, int req_off,
+ struct mds_body *reqbody, int reply_off)
{
struct inode *inode = dentry->d_inode;
struct mds_body *body;
OBD_MD_FLATIME | OBD_MD_FLMTIME);
} else if (S_ISLNK(inode->i_mode) &&
(reqbody->valid & OBD_MD_LINKNAME) != 0) {
- int len = req->rq_repmsg->buflens[reply_off + 1];
- char *symname = lustre_msg_buf(req->rq_repmsg, reply_off + 1, 0);
-
- LASSERT(symname != NULL); /* caller prepped reply */
-
- if (!inode->i_op->readlink) {
- rc = -ENOSYS;
- } else {
- rc = inode->i_op->readlink(dentry, symname, len);
- if (rc < 0) {
- CERROR("readlink failed: %d\n", rc);
- } else if (rc != len - 1) {
- CERROR("Unexpected readlink rc %d: expecting %d\n",
- rc, len - 1);
- rc = -EINVAL;
- } else {
- CDEBUG(D_INODE, "read symlink dest %s\n", symname);
- body->valid |= OBD_MD_LINKNAME;
- body->eadatasize = rc + 1;
- symname[rc] = 0;
- rc = 0;
- }
- }
+ rc = mds_pack_link(dentry, req, body, reply_off);
+ } else if (reqbody->valid & OBD_MD_FLEA) {
+ rc = mds_pack_ea(dentry, req, body, req_off, reply_off);
+ } else if (reqbody->valid & OBD_MD_FLEALIST) {
+ rc = mds_pack_ealist(dentry, req, body, reply_off);
}
+
+ if (reqbody->valid & OBD_MD_FLACL_ACCESS) {
+ int inc = (reqbody->valid & OBD_MD_FLEASIZE) ? 2 : 1;
+ rc = mds_pack_acl(obd, req->rq_repmsg, reply_off + inc,
+ body, inode);
+ }
+
+ /* do reverse uid/gid mapping if needed */
+ if (rc == 0 && req->rq_remote)
+ mds_reverse_map_ugid(req, body);
RETURN(rc);
}
return rc;
}
-static int mds_getattr_pack_msg(struct ptlrpc_request *req,
- struct inode *inode,
+static int mds_getattr_pack_msg(struct ptlrpc_request *req, struct dentry *de,
int offset)
{
+ struct inode *inode = de->d_inode;
struct mds_obd *mds = mds_req2mds(req);
struct mds_body *body;
- int rc = 0, size[2] = {sizeof(*body)}, bufcount = 1;
+ int rc = 0, size[4] = {sizeof(*body)}, bufcount = 1;
ENTRY;
body = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*body));
down(&inode->i_sem);
rc = fsfilt_get_md(req->rq_export->exp_obd, inode, NULL, 0);
up(&inode->i_sem);
- CDEBUG(D_INODE, "got %d bytes MD data for inode %lu\n",
- rc, inode->i_ino);
if (rc < 0) {
if (rc != -ENODATA)
CERROR("error getting inode %lu MD: rc = %d\n",
bufcount++;
CDEBUG(D_INODE, "symlink size: %Lu, reply space: %d\n",
inode->i_size + 1, body->eadatasize);
+ } else if ((body->valid & OBD_MD_FLEA)) {
+ char *ea_name = lustre_msg_string(req->rq_reqmsg,
+ offset + 1, 0);
+ rc = -EOPNOTSUPP;
+ if (inode->i_op && inode->i_op->getxattr)
+ rc = inode->i_op->getxattr(de, ea_name, NULL, 0);
+
+ if (rc < 0) {
+ if (rc != -ENODATA)
+ CERROR("error getting inode %lu EA: rc = %d\n",
+ inode->i_ino, rc);
+ size[bufcount] = 0;
+ } else {
+ size[bufcount] = min_t(int, body->eadatasize, rc);
+ }
+ bufcount++;
+ } else if (body->valid & OBD_MD_FLEALIST) {
+ rc = -EOPNOTSUPP;
+ if (inode->i_op && inode->i_op->getxattr)
+ rc = inode->i_op->listxattr(de, NULL, 0);
+
+ if (rc < 0) {
+ if (rc != -ENODATA)
+ CERROR("error getting inode %lu EA: rc = %d\n",
+ inode->i_ino, rc);
+ size[bufcount] = 0;
+ } else {
+ size[bufcount] = min_t(int, body->eadatasize, rc);
+ }
+ bufcount++;
+ }
+
+ /* may co-exist with OBD_MD_FLEASIZE */
+ if (body->valid & OBD_MD_FLACL_ACCESS) {
+ size[bufcount++] = 4;
+ size[bufcount++] = xattr_acl_size(LL_ACL_MAX_ENTRIES);
}
if (OBD_FAIL_CHECK(OBD_FAIL_MDS_GETATTR_PACK)) {
struct mds_req_sec_desc *rsd;
struct mds_body *body;
struct dentry *dparent = NULL, *dchild = NULL;
- struct lvfs_ucred uc;
+ struct lvfs_ucred uc = {NULL, NULL,};
struct lustre_handle parent_lockh[2] = {{0}, {0}};
unsigned int namesize;
int rc = 0, cleanup_phase = 0, resent_req = 0, update_mode, reply_offset;
CERROR("Can't unpack security desc\n");
RETURN(-EFAULT);
}
- mds_squash_root(mds, rsd, &req->rq_peer.peer_id.nid);
/* swab now, before anyone looks inside the request. */
body = lustre_swab_reqbuf(req, offset, sizeof(*body),
reply_offset = 0;
}
- rc = mds_init_ucred(&uc, rsd);
+ rc = mds_init_ucred(&uc, req, rsd);
if (rc) {
CERROR("can't init ucred\n");
GOTO(cleanup, rc);
id_fid(&body->id1), (unsigned long)id_group(&body->id1),
child_lockh->cookie);
- dparent = mds_id2dentry(obd, &body->id1, NULL);
- LASSERT(dparent);
-
- dchild = ll_lookup_one_len(name, dparent, namesize - 1);
- if (IS_ERR(dchild)) {
- DEBUG_REQ(D_ERROR, req, "resent, not enqueuing new locks");
- CDEBUG(D_ERROR, "lock against [%lu:%lu]/%*s\n",
- (unsigned long) id_ino(&body->id1),
- (unsigned long) id_gen(&body->id1),
- namesize - 1, name);
+ if (name) {
+ /* usual named request */
+ dparent = mds_id2dentry(obd, &body->id1, NULL);
+ LASSERT(!IS_ERR(dparent));
+ dchild = ll_lookup_one_len(name, dparent, namesize - 1);
+ if (IS_ERR(dchild)) {
+ DEBUG_REQ(D_ERROR, req, "resent, not enqueuing new locks");
+ CDEBUG(D_ERROR, "lock against [%lu:%lu]/%*s\n",
+ (unsigned long) id_ino(&body->id1),
+ (unsigned long) id_gen(&body->id1),
+ namesize - 1, name);
+ }
+ LASSERT(!IS_ERR(dchild));
+ } else {
+ /* client wants to get attr. by id */
+ dchild = mds_id2dentry(obd, &body->id1, NULL);
+ if (IS_ERR(dchild)) {
+ DEBUG_REQ(D_ERROR, req, "resent, not enqueuing new locks");
+ CDEBUG(D_ERROR, "lock against [%lu:%lu]\n",
+ (unsigned long) id_ino(&body->id1),
+ (unsigned long) id_gen(&body->id1));
+ }
+ LASSERT(!IS_ERR(dchild));
}
- LASSERT(!IS_ERR(dchild));
LDLM_LOCK_PUT(granted_lock);
}
if (dchild->d_flags & DCACHE_CROSS_REF)
rc = mds_getattr_pack_msg_cf(req, dchild, offset);
else
- rc = mds_getattr_pack_msg(req, dchild->d_inode, offset);
+ rc = mds_getattr_pack_msg(req, dchild, offset);
if (rc != 0) {
CERROR ("mds_getattr_pack_msg: %d\n", rc);
GOTO (cleanup, rc);
}
}
- rc = mds_getattr_internal(obd, dchild, req, body, reply_offset);
+ rc = mds_getattr_internal(obd, dchild, req, offset, body, reply_offset);
GOTO(cleanup, rc); /* returns the lock to the client */
cleanup:
l_dput(dchild);
case 1:
pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
+ default:
mds_exit_ucred(&uc);
}
return rc;
struct dentry *de;
struct mds_req_sec_desc *rsd;
struct mds_body *body;
- struct lvfs_ucred uc;
+ struct lvfs_ucred uc = {NULL, NULL,};
int rc = 0;
ENTRY;
MD_COUNTER_INCREMENT(obd, getattr);
- rc = mds_init_ucred(&uc, rsd);
+ rc = mds_init_ucred(&uc, req, rsd);
if (rc) {
+ mds_exit_ucred(&uc);
CERROR("can't init ucred\n");
RETURN(rc);
}
GOTO(out_pop, rc);
}
- rc = mds_getattr_pack_msg(req, de->d_inode, offset);
+ rc = mds_getattr_pack_msg(req, de, offset);
if (rc != 0) {
CERROR("mds_getattr_pack_msg: %d\n", rc);
GOTO(out_pop, rc);
}
- req->rq_status = mds_getattr_internal(obd, de, req, body, 0);
-
+ req->rq_status = mds_getattr_internal(obd, de, req, offset, body, 0);
l_dput(de);
EXIT;
static int mds_readpage(struct ptlrpc_request *req, int offset)
{
struct obd_device *obd = req->rq_export->exp_obd;
- struct mds_obd *mds = &obd->u.mds;
struct vfsmount *mnt;
struct dentry *de;
struct file *file;
struct mds_body *body, *repbody;
struct lvfs_run_ctxt saved;
int rc, size = sizeof(*repbody);
- struct lvfs_ucred uc;
+ struct lvfs_ucred uc = {NULL, NULL,};
ENTRY;
rc = lustre_pack_reply(req, 1, &size, NULL);
CERROR("Can't unpack security desc\n");
GOTO (out, rc = -EFAULT);
}
- mds_squash_root(mds, rsd, &req->rq_peer.peer_id.nid);
body = lustre_swab_reqbuf(req, offset, sizeof(*body),
lustre_swab_mds_body);
GOTO (out, rc = -EFAULT);
}
- rc = mds_init_ucred(&uc, rsd);
+ rc = mds_init_ucred(&uc, req, rsd);
if (rc) {
CERROR("can't init ucred\n");
GOTO(out, rc);
filp_close(file, 0);
out_pop:
pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
- mds_exit_ucred(&uc);
out:
+ mds_exit_ucred(&uc);
req->rq_status = rc;
return 0;
}
int mds_reint(struct ptlrpc_request *req, int offset,
struct lustre_handle *lockh)
{
- struct mds_obd *mds = &req->rq_export->exp_obd->u.mds;
struct mds_update_record *rec;
struct mds_req_sec_desc *rsd;
int rc;
CERROR("Can't unpack security desc\n");
GOTO(out, rc = -EFAULT);
}
- mds_squash_root(mds, rsd, &req->rq_peer.peer_id.nid);
rc = mds_update_unpack(req, offset, rec);
if (rc || OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_UNPACK)) {
GOTO(out, req->rq_status = -EINVAL);
}
- rc = mds_init_ucred(&rec->ur_uc, rsd);
+ rc = mds_init_ucred(&rec->ur_uc, req, rsd);
if (rc) {
CERROR("can't init ucred\n");
GOTO(out, rc);
/* rc will be used to interrupt a for loop over multiple records */
rc = mds_reint_rec(rec, offset, req, lockh);
- mds_exit_ucred(&rec->ur_uc);
- EXIT;
+
+ /* do reverse uid/gid mapping if needed */
+ if (rc == 0 && req->rq_remote &&
+ (rec->ur_opcode == REINT_SETATTR ||
+ rec->ur_opcode == REINT_OPEN)) {
+ struct mds_body *body;
+ int bodyoff;
+
+ if (rec->ur_opcode == REINT_SETATTR)
+ bodyoff = 0;
+ else /* open */
+ bodyoff = (offset == 3 ? 1 : 0);
+ body = lustre_msg_buf(req->rq_repmsg, bodyoff, sizeof(*body));
+ LASSERT(body);
+
+ mds_reverse_map_ugid(req, body);
+ }
out:
+ mds_exit_ucred(&rec->ur_uc);
OBD_FREE(rec, sizeof(*rec));
- return rc;
+ RETURN(rc);
}
static int mds_filter_recovery_request(struct ptlrpc_request *req,
* this only serve to inter-mds request, don't need check group database
* here. --ericm.
*/
- uc.luc_ghash = NULL;
+ uc.luc_lsd = NULL;
uc.luc_ginfo = NULL;
uc.luc_uid = body->oa.o_uid;
uc.luc_fsuid = body->oa.o_uid;
l_dput(new);
pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
- mds_put_group_entry(mds, uc.luc_ghash);
return rc;
}
RETURN(-EINVAL);
}
+static int mds_init_export_data(struct ptlrpc_request *req)
+{
+ struct mds_export_data *med = &req->rq_export->u.eu_mds_data;
+ __u32 *nllu;
+
+ nllu = lustre_msg_buf(req->rq_reqmsg, 4, sizeof(__u32) * 2);
+ if (nllu == NULL) {
+ CERROR("failed to extract nllu, use 99:99\n");
+ med->med_nllu = 99;
+ med->med_nllg = 99;
+ } else {
+ if (lustre_msg_swabbed(req->rq_reqmsg)) {
+ __swab32s(&nllu[0]);
+ __swab32s(&nllu[1]);
+ }
+ med->med_nllu = nllu[0];
+ med->med_nllg = nllu[1];
+ }
+
+ if (req->rq_remote) {
+ CWARN("exp %p, peer "LPX64": set as remote\n",
+ req->rq_export, req->rq_peer.peer_id.nid);
+ med->med_local = 0;
+ } else
+ med->med_local = 1;
+
+ LASSERT(med->med_idmap == NULL);
+ spin_lock_init(&med->med_idmap_lock);
+
+ return 0;
+}
+
static int mds_msg_check_version(struct lustre_msg *msg)
{
int rc;
CERROR("bad opc %u version %08x, expecting %08x\n",
msg->opc, msg->version, LUSTRE_OBD_VERSION);
break;
+ case SEC_INIT:
+ case SEC_INIT_CONTINUE:
+ case SEC_FINI:
+ rc = 0;
+ break;
default:
CERROR("MDS unknown opcode %d\n", msg->opc);
rc = -ENOTSUPP;
RETURN(rc);
}
+ /* Security opc should NOT trigger any recovery events */
+ if (req->rq_reqmsg->opc == SEC_INIT ||
+ req->rq_reqmsg->opc == SEC_INIT_CONTINUE ||
+ req->rq_reqmsg->opc == SEC_FINI) {
+ GOTO(out, rc = 0);
+ }
+
LASSERT(current->journal_info == NULL);
/* XXX identical to OST */
if (req->rq_reqmsg->opc != MDS_CONNECT) {
DEBUG_REQ(D_INODE, req, "connect");
OBD_FAIL_RETURN(OBD_FAIL_MDS_CONNECT_NET, 0);
rc = target_handle_connect(req);
- if (!rc)
+ if (!rc) {
/* Now that we have an export, set mds. */
mds = mds_req2mds(req);
+ mds_init_export_data(req);
+ }
break;
case MDS_DISCONNECT:
/*
* here we use "iopen_nopriv" hardcoded, because it affects MDS utility
* and the rest of options are passed by mount options. Probably this
- * should be moved to somewhere else like startup scripts or lconf.
- */
- sprintf(options, "iopen_nopriv");
-
+ * should be moved to somewhere else like startup scripts or lconf. */
+ sprintf(options, "iopen_nopriv,acl,user_xattr");
if (lcfg->lcfg_inllen4 > 0 && lcfg->lcfg_inlbuf4)
sprintf(options + strlen(options), ",%s",
lcfg->lcfg_inlbuf4);
RETURN(rc);
}
+extern void lgss_svc_cache_purge_all(void);
static int mds_cleanup(struct obd_device *obd, int flags)
{
struct mds_obd *mds = &obd->u.mds;
dev_clear_rdonly(2);
fsfilt_put_ops(obd->obd_fsops);
+#ifdef ENABLE_GSS
+ /* XXX */
+ lgss_svc_cache_purge_all();
+#endif
RETURN(0);
}
+static int set_security(const char *value, char **sec)
+{
+ int rc = 0;
+
+ if (!strcmp(value, "null"))
+ *sec = "null";
+ else if (!strcmp(value, "krb5i"))
+ *sec = "krb5i";
+ else if (!strcmp(value, "krb5p"))
+ *sec = "krb5p";
+ else {
+ CERROR("Unrecognized value, force use NULL\n");
+ rc = -EINVAL;
+ }
+
+ return rc;
+}
+
+static int mds_process_config(struct obd_device *obd, obd_count len, void *buf)
+{
+ struct lustre_cfg *lcfg = buf;
+ struct mds_obd *mds = &obd->u.mds;
+ int rc = 0;
+ ENTRY;
+
+ switch(lcfg->lcfg_command) {
+ case LCFG_SET_SECURITY: {
+ if (!lcfg->lcfg_inllen1 || !lcfg->lcfg_inllen2)
+ GOTO(out, rc = -EINVAL);
+
+ if (!strcmp(lcfg->lcfg_inlbuf1, "mds_mds_sec"))
+ rc = set_security(lcfg->lcfg_inlbuf2,
+ &mds->mds_mds_sec);
+ else if (!strcmp(lcfg->lcfg_inlbuf1, "mds_ost_sec"))
+ rc = set_security(lcfg->lcfg_inlbuf2,
+ &mds->mds_ost_sec);
+ else {
+ CERROR("Unrecognized key\n");
+ rc = -EINVAL;
+ }
+ break;
+ }
+ default: {
+ CERROR("Unknown command: %d\n", lcfg->lcfg_command);
+ GOTO(out, rc = -EINVAL);
+
+ }
+ }
+out:
+ RETURN(rc);
+}
+
static void fixup_handle_for_resent_req(struct ptlrpc_request *req,
int offset,
struct ldlm_lock *new_lock,
struct lustre_handle lockh[2] = {{0}, {0}};
struct ldlm_lock *new_lock = NULL;
int getattr_part = MDS_INODELOCK_UPDATE;
- int rc, repsize[4] = { sizeof(struct ldlm_reply),
- sizeof(struct mds_body),
- mds->mds_max_mdsize,
- mds->mds_max_cookiesize };
+ int rc, reply_buffers;
+ int repsize[5] = {sizeof(struct ldlm_reply),
+ sizeof(struct mds_body),
+ mds->mds_max_mdsize};
+
int offset = MDS_REQ_INTENT_REC_OFF;
ENTRY;
LDLM_DEBUG(lock, "intent policy, opc: %s", ldlm_it2str(it->opc));
- rc = lustre_pack_reply(req, 3, repsize, NULL);
+ reply_buffers = 3;
+ if (it->opc & ( IT_OPEN | IT_GETATTR | IT_LOOKUP | IT_CHDIR )) {
+ reply_buffers = 5;
+ repsize[3] = 4;
+ repsize[4] = xattr_acl_size(LL_ACL_MAX_ENTRIES);
+ }
+
+ rc = lustre_pack_reply(req, reply_buffers, repsize, NULL);
if (rc)
RETURN(req->rq_status = rc);
.o_setup = mds_setup,
.o_precleanup = mds_precleanup,
.o_cleanup = mds_cleanup,
+ .o_process_config = mds_process_config,
.o_postrecov = mds_postrecov,
.o_statfs = mds_obd_statfs,
.o_iocontrol = mds_iocontrol,
{
struct lprocfs_static_vars lvars;
- mds_group_hash_init();
+ mds_init_lsd_cache();
lprocfs_init_multi_vars(0, &lvars);
class_register_type(&mds_obd_ops, NULL, lvars.module_vars,
static void /*__exit*/ mds_exit(void)
{
- mds_group_hash_cleanup();
+ mds_cleanup_lsd_cache();
class_unregister_type(LUSTRE_MDS_NAME);
class_unregister_type(LUSTRE_MDT_NAME);
};
/*
- * group hash proc entries handler
+ * LSD proc entry handlers
*/
-static int lprocfs_wr_group_info(struct file *file, const char *buffer,
- unsigned long count, void *data)
+static int lprocfs_wr_lsd_downcall(struct file *file, const char *buffer,
+ unsigned long count, void *data)
{
- struct {
- int err;
- uid_t uid;
- uint32_t ngroups;
- gid_t *groups;
- } param;
+ struct upcall_cache *cache = __mds_get_global_lsd_cache();
+ struct lsd_downcall_args param;
gid_t gids_local[NGROUPS_SMALL];
gid_t *gids = NULL;
CERROR("broken downcall\n");
return count;
}
+
+ if (param.err) {
+ CERROR("LSD downcall indicate error %d\n", param.err);
+ goto do_downcall;
+ }
+
if (param.ngroups > NGROUPS_MAX) {
CERROR("%d groups?\n", param.ngroups);
- return count;
+ param.err = -EINVAL;
+ goto do_downcall;
}
if (param.ngroups <= NGROUPS_SMALL)
if (!gids) {
CERROR("fail to alloc memory for %d gids\n",
param.ngroups);
- return count;
+ param.err = -ENOMEM;
+ goto do_downcall;
}
}
if (copy_from_user(gids, param.groups,
param.ngroups * sizeof(gid_t))) {
CERROR("broken downcall\n");
- goto out;
+ param.err = -EFAULT;
+ goto do_downcall;
}
- mds_handle_group_downcall(param.err, param.uid,
- param.ngroups, gids);
+ param.groups = gids;
+
+do_downcall:
+	upcall_cache_downcall(cache, (__u64) param.uid, param.err, &param);
-out:
if (gids && gids != gids_local)
OBD_FREE(gids, param.ngroups * sizeof(gid_t));
return count;
}
-static int lprocfs_rd_expire(char *page, char **start, off_t off, int count,
- int *eof, void *data)
+static int lprocfs_rd_lsd_expire(char *page, char **start, off_t off, int count,
+ int *eof, void *data)
{
- struct mds_grp_hash *hash = __mds_get_global_group_hash();
+ struct upcall_cache *cache= __mds_get_global_lsd_cache();
*eof = 1;
- return snprintf(page, count, "%d\n", hash->gh_entry_expire);
+ return snprintf(page, count, "%lu\n", cache->uc_entry_expire);
}
-
-static int lprocfs_wr_expire(struct file *file, const char *buffer,
- unsigned long count, void *data)
+static int lprocfs_wr_lsd_expire(struct file *file, const char *buffer,
+ unsigned long count, void *data)
{
- struct mds_grp_hash *hash = __mds_get_global_group_hash();
+ struct upcall_cache *cache= __mds_get_global_lsd_cache();
char buf[32];
if (copy_from_user(buf, buffer, min(count, 32UL)))
return count;
buf[31] = 0;
- sscanf(buf, "%d", &hash->gh_entry_expire);
+ sscanf(buf, "%lu", &cache->uc_entry_expire);
return count;
}
-static int lprocfs_rd_ac_expire(char *page, char **start, off_t off, int count,
- int *eof, void *data)
+static int lprocfs_rd_lsd_ac_expire(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
{
- struct mds_grp_hash *hash = __mds_get_global_group_hash();
+ struct upcall_cache *cache= __mds_get_global_lsd_cache();
*eof = 1;
- return snprintf(page, count, "%d\n", hash->gh_acquire_expire);
+ return snprintf(page, count, "%lu\n", cache->uc_acquire_expire);
}
-
-static int lprocfs_wr_ac_expire(struct file *file, const char *buffer,
- unsigned long count, void *data)
+static int lprocfs_wr_lsd_ac_expire(struct file *file, const char *buffer,
+ unsigned long count, void *data)
{
- struct mds_grp_hash *hash = __mds_get_global_group_hash();
+ struct upcall_cache *cache= __mds_get_global_lsd_cache();
char buf[32];
if (copy_from_user(buf, buffer, min(count, 32UL)))
return count;
buf[31] = 0;
- sscanf(buf, "%d", &hash->gh_acquire_expire);
+ sscanf(buf, "%lu", &cache->uc_acquire_expire);
return count;
}
-static int lprocfs_rd_hash_upcall(char *page, char **start, off_t off, int count,
- int *eof, void *data)
+static int lprocfs_rd_lsd_upcall(char *page, char **start, off_t off, int count,
+ int *eof, void *data)
{
- struct mds_grp_hash *hash = __mds_get_global_group_hash();
+ struct upcall_cache *cache= __mds_get_global_lsd_cache();
*eof = 1;
- return snprintf(page, count, "%s\n", hash->gh_upcall);
+ return snprintf(page, count, "%s\n", cache->uc_upcall);
}
-
-static int lprocfs_wr_hash_upcall(struct file *file, const char *buffer,
- unsigned long count, void *data)
+static int lprocfs_wr_lsd_upcall(struct file *file, const char *buffer,
+ unsigned long count, void *data)
{
- struct mds_grp_hash *hash = __mds_get_global_group_hash();
+ struct upcall_cache *cache= __mds_get_global_lsd_cache();
- if (count < MDSGRP_UPCALL_MAXPATH) {
- sscanf(buffer, "%1024s", hash->gh_upcall);
- hash->gh_upcall[MDSGRP_UPCALL_MAXPATH-1] = 0;
+ if (count < UC_CACHE_UPCALL_MAXPATH) {
+ sscanf(buffer, "%1024s", cache->uc_upcall);
+ cache->uc_upcall[UC_CACHE_UPCALL_MAXPATH - 1] = 0;
}
return count;
}
-static int lprocfs_wr_hash_flush(struct file *file, const char *buffer,
- unsigned long count, void *data)
-{
- mds_group_hash_flush_idle();
- return count;
-}
-
-static int lprocfs_rd_allow_setgroups(char *page, char **start, off_t off,
- int count, int *eof, void *data)
-{
- struct mds_grp_hash *hash = __mds_get_global_group_hash();
-
- *eof = 1;
- return snprintf(page, count, "%d\n", hash->gh_allow_setgroups);
-}
-
-static int lprocfs_wr_allow_setgroups(struct file *file, const char *buffer,
- unsigned long count, void *data)
+extern void lgss_svc_cache_flush(__u32 uid);
+static int lprocfs_wr_lsd_flush(struct file *file, const char *buffer,
+ unsigned long count, void *data)
{
- struct mds_grp_hash *hash = __mds_get_global_group_hash();
- char buf[8];
- int val;
+ char buf[32];
+ __u32 uid;
- if (copy_from_user(buf, buffer, min(count, 8UL)))
+ if (copy_from_user(buf, buffer, min(count, 32UL)))
return count;
- buf[7] = 0;
- sscanf(buf, "%d", &val);
- hash->gh_allow_setgroups = (val != 0);
+ buf[31] = 0;
+ sscanf(buf, "%d", &uid);
+
+ mds_flush_lsd(uid);
+#ifdef ENABLE_GSS
+ lgss_svc_cache_flush(uid);
+#endif
return count;
}
struct lprocfs_vars lprocfs_mds_module_vars[] = {
- { "num_refs", lprocfs_rd_numrefs, 0, 0 },
- { "grp_hash_expire_interval",lprocfs_rd_expire,
- lprocfs_wr_expire, 0},
- { "grp_hash_acquire_expire", lprocfs_rd_ac_expire,
- lprocfs_wr_ac_expire, 0},
- { "grp_hash_upcall", lprocfs_rd_hash_upcall,
- lprocfs_wr_hash_upcall, 0},
- { "grp_hash_flush", 0, lprocfs_wr_hash_flush, 0},
- { "group_info", 0, lprocfs_wr_group_info, 0 },
- { "allow_setgroups", lprocfs_rd_allow_setgroups,
- lprocfs_wr_allow_setgroups, 0},
+ { "num_refs", lprocfs_rd_numrefs, 0, 0 },
+ /* LSD stuff */
+ { "lsd_expire_interval", lprocfs_rd_lsd_expire,
+ lprocfs_wr_lsd_expire, 0},
+ { "lsd_acquire_expire", lprocfs_rd_lsd_ac_expire,
+ lprocfs_wr_lsd_ac_expire, 0},
+ { "lsd_upcall", lprocfs_rd_lsd_upcall,
+ lprocfs_wr_lsd_upcall, 0},
+ { "lsd_flush", 0, lprocfs_wr_lsd_flush, 0},
+ { "lsd_downcall", 0, lprocfs_wr_lsd_downcall, 0},
{ 0 }
};
struct lvfs_run_ctxt saved;
int rc;
+ mds_idmap_cleanup(med);
+
if (!med->med_mcd)
RETURN(0);
+++ /dev/null
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- * Copyright (c) 2004 Cluster File Systems, Inc.
- *
- * This file is part of Lustre, http://www.lustre.org.
- *
- * Lustre is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * Lustre is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Lustre; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#define DEBUG_SUBSYSTEM S_MDS
-
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/kmod.h>
-#include <linux/string.h>
-#include <linux/stat.h>
-#include <linux/errno.h>
-#include <linux/version.h>
-#include <linux/unistd.h>
-
-#include <asm/system.h>
-#include <asm/uaccess.h>
-
-#include <linux/fs.h>
-#include <linux/stat.h>
-#include <asm/uaccess.h>
-#include <linux/slab.h>
-#include <asm/segment.h>
-
-#include <libcfs/list.h>
-#include <linux/obd_support.h>
-#include <linux/lustre_lib.h>
-#include <linux/lustre_mds.h>
-#include "mds_internal.h"
-
-#define GRP_HASH_NEW 0x1
-#define GRP_HASH_ACQUIRING 0x2
-#define GRP_HASH_INVALID 0x4
-#define GRP_HASH_EXPIRED 0x8
-
-#define GRP_IS_NEW(i) ((i)->ge_flags & GRP_HASH_NEW)
-#define GRP_IS_INVALID(i) ((i)->ge_flags & GRP_HASH_INVALID)
-#define GRP_IS_ACQUIRING(i) ((i)->ge_flags & GRP_HASH_ACQUIRING)
-#define GRP_IS_EXPIRED(i) ((i)->ge_flags & GRP_HASH_EXPIRED)
-#define GRP_IS_VALID(i) ((i)->ge_flags == 0)
-
-#define GRP_SET_NEW(i) (i)->ge_flags |= GRP_HASH_NEW
-#define GRP_SET_INVALID(i) (i)->ge_flags |= GRP_HASH_INVALID
-#define GRP_SET_ACQUIRING(i) (i)->ge_flags |= GRP_HASH_ACQUIRING
-#define GRP_SET_EXPIRED(i) (i)->ge_flags |= GRP_HASH_EXPIRED
-#define GRP_SET_VALID(i) (i)->ge_flags = 0
-
-#define GRP_CLEAR_NEW(i) (i)->ge_flags &= ~GRP_HASH_NEW
-#define GRP_CLEAR_ACQUIRING(i) (i)->ge_flags &= ~GRP_HASH_ACQUIRING
-#define GRP_CLEAR_INVALID(i) (i)->ge_flags &= ~GRP_HASH_INVALID
-#define GRP_CLEAR_EXPIRED(i) (i)->ge_flags &= ~GRP_HASH_EXPIRED
-
-/*
- * We need share hash table among the groups of MDSs (which server as the same
- * lustre file system), maybe MDT? but there's lprocfs problems of putting this
- * in MDT. so we make it global to the module. which brings the limitation that
- * one node couldn't running multiple MDS which server as different Lustre FS.
- * but which maybe not meaningful.
- */
-static struct mds_grp_hash _group_hash;
-
-struct mds_grp_hash *__mds_get_global_group_hash()
-{
- return &_group_hash;
-}
-
-static struct mds_grp_hash_entry *alloc_entry(uid_t uid)
-{
- struct mds_grp_hash_entry *entry;
-
- OBD_ALLOC(entry, sizeof(*entry));
- if (!entry)
- return NULL;
-
- GRP_SET_NEW(entry);
- INIT_LIST_HEAD(&entry->ge_hash);
- entry->ge_uid = uid;
- atomic_set(&entry->ge_refcount, 0);
- init_waitqueue_head(&entry->ge_waitq);
- return entry;
-}
-
-/* protected by hash lock */
-static void free_entry(struct mds_grp_hash_entry *entry)
-{
- if (entry->ge_group_info)
- groups_free(entry->ge_group_info);
- list_del(&entry->ge_hash);
- CDEBUG(D_OTHER, "destroy mds_grp_entry %p for uid %d\n",
- entry, entry->ge_uid);
- OBD_FREE(entry, sizeof(*entry));
-}
-
-static inline void get_entry(struct mds_grp_hash_entry *entry)
-{
- atomic_inc(&entry->ge_refcount);
-}
-static inline void put_entry(struct mds_grp_hash_entry *entry)
-{
- if (atomic_dec_and_test(&entry->ge_refcount) &&
- (GRP_IS_INVALID(entry) || GRP_IS_EXPIRED(entry))) {
- free_entry(entry);
- }
-}
-static int check_unlink_entry(struct mds_grp_hash_entry *entry)
-{
- if (GRP_IS_VALID(entry) &&
- time_before(jiffies, entry->ge_expire))
- return 0;
-
- if (GRP_IS_ACQUIRING(entry) &&
- time_after(jiffies, entry->ge_acquire_expire)) {
- GRP_SET_EXPIRED(entry);
- wake_up_all(&entry->ge_waitq);
- } else if (!GRP_IS_INVALID(entry)) {
- GRP_SET_EXPIRED(entry);
- }
-
- list_del_init(&entry->ge_hash);
- if (!atomic_read(&entry->ge_refcount))
- free_entry(entry);
- return 1;
-}
-
-static int refresh_entry(struct mds_grp_hash *hash,
- struct mds_grp_hash_entry *entry)
-{
- char *argv[4];
- char *envp[3];
- char uidstr[16];
- int rc;
- ENTRY;
-
- snprintf(uidstr, 16, "%d", entry->ge_uid);
-
- argv[0] = hash->gh_upcall;
- argv[1] = uidstr;
- argv[2] = NULL;
-
- envp[0] = "HOME=/";
- envp[1] = "PATH=/sbin:/usr/sbin";
- envp[2] = NULL;
-
- rc = USERMODEHELPER(argv[0], argv, envp);
- if (rc < 0) {
- CERROR("Error invoking getgroups upcall %s %s: %d; check "
- "/proc/fs/lustre/mds/grp_hash_upcall\n",
- argv[0], argv[1], rc);
- } else {
- CWARN("Invoked upcall %s %s\n",
- argv[0], argv[1]);
- }
- RETURN(rc);
-}
-
-struct mds_grp_hash_entry *mds_get_group_entry(struct mds_obd *mds, uid_t uid)
-{
- struct mds_grp_hash_entry *entry = NULL, *new = NULL, *next;
- struct mds_grp_hash *hash = &_group_hash;
- struct list_head *head;
- wait_queue_t wait;
- int rc, found;
- ENTRY;
-
- head = &hash->gh_table[MDSGRP_HASH_INDEX(uid)];
-
-find_again:
- found = 0;
- spin_lock(&hash->gh_lock);
- list_for_each_entry_safe(entry, next, head, ge_hash) {
- /* check invalid & expired items */
- if (check_unlink_entry(entry))
- continue;
- if (entry->ge_uid == uid) {
- found = 1;
- break;
- }
- }
-
- if (!found) { /* didn't found */
- if (!new) {
- spin_unlock(&hash->gh_lock);
- new = alloc_entry(uid);
- if (!new) {
- CERROR("fail to alloc entry\n");
- RETURN(NULL);
- }
- goto find_again;
- } else {
- list_add(&new->ge_hash, head);
- entry = new;
- }
- } else {
- if (new) {
- free_entry(new);
- new = NULL;
- }
- list_move(&entry->ge_hash, head);
- }
- get_entry(entry);
-
- /* acquire for new one */
- if (GRP_IS_NEW(entry)) {
- GRP_SET_ACQUIRING(entry);
- GRP_CLEAR_NEW(entry);
- entry->ge_acquire_expire = jiffies +
- hash->gh_acquire_expire * HZ;
- spin_unlock(&hash->gh_lock);
-
- rc = refresh_entry(hash, entry);
-
- spin_lock(&hash->gh_lock);
- if (rc) {
- GRP_CLEAR_ACQUIRING(entry);
- GRP_SET_INVALID(entry);
- }
- /* fall through */
- }
-
- /*
- * someone (and only one) is doing upcall upon this item, just wait it
- * complete
- */
- if (GRP_IS_ACQUIRING(entry)) {
- init_waitqueue_entry(&wait, current);
- add_wait_queue(&entry->ge_waitq, &wait);
- set_current_state(TASK_INTERRUPTIBLE);
- spin_unlock(&hash->gh_lock);
-
- schedule_timeout(hash->gh_acquire_expire * HZ);
-
- spin_lock(&hash->gh_lock);
- remove_wait_queue(&entry->ge_waitq, &wait);
- if (GRP_IS_ACQUIRING(entry)) {
- /* we're interrupted or upcall failed
- * in the middle
- */
- put_entry(entry);
- spin_unlock(&hash->gh_lock);
- RETURN(NULL);
- }
- /* fall through */
- }
-
- /* invalid means error, don't need to try again */
- if (GRP_IS_INVALID(entry)) {
- put_entry(entry);
- spin_unlock(&hash->gh_lock);
- RETURN(NULL);
- }
-
- /*
- * check expired. We can't refresh the existed one because some memory
- * might be shared by multiple processes.
- */
- if (check_unlink_entry(entry)) {
- /*
- * if expired, try again. but if this entry is created by me but
- * too quickly turn to expired without any error, should at
- * least give a chance to use it once.
- */
- if (entry != new) {
- put_entry(entry);
- spin_unlock(&hash->gh_lock);
- new = NULL;
- goto find_again;
- }
- }
-
- /* Now we know it's good */
- spin_unlock(&hash->gh_lock);
- RETURN(entry);
-}
-
-void mds_put_group_entry(struct mds_obd *mds, struct mds_grp_hash_entry *entry)
-{
- struct mds_grp_hash *hash = &_group_hash;
- ENTRY;
-
- if (!entry) {
- EXIT;
- return;
- }
-
- spin_lock(&hash->gh_lock);
- LASSERT(atomic_read(&entry->ge_refcount) > 0);
- put_entry(entry);
- spin_unlock(&hash->gh_lock);
- EXIT;
-}
-
-static int entry_set_group_info(struct mds_grp_hash_entry *entry,
- __u32 ngroups, gid_t *groups)
-{
- struct group_info *ginfo;
- ENTRY;
-
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,4)
- if (ngroups > NGROUPS)
- ngroups = NGROUPS;
-#endif
-
- if (ngroups > NGROUPS_MAX) {
- CERROR("too many (%d) supp groups\n", ngroups);
- RETURN(-EINVAL);
- }
-
- ginfo = groups_alloc(ngroups);
- if (!ginfo) {
- CERROR("can't alloc group_info for %d groups\n", ngroups);
- RETURN(-ENOMEM);
- }
- groups_from_buffer(ginfo, groups);
-
- entry->ge_group_info = ginfo;
- RETURN(0);
-}
-
-int mds_handle_group_downcall(int err, uid_t uid, __u32 ngroups, gid_t *groups)
-{
- struct mds_grp_hash *hash = &_group_hash;
- struct mds_grp_hash_entry *entry = NULL;
- struct list_head *head;
- int found = 0, rc = 0;
- ENTRY;
-
- LASSERT(hash);
-
- head = &hash->gh_table[MDSGRP_HASH_INDEX(uid)];
-
- spin_lock(&hash->gh_lock);
- list_for_each_entry(entry, head, ge_hash) {
- if (entry->ge_uid == uid) {
- found = 1;
- break;
- }
- }
- if (!found) {
- /* haven't found, it's possible */
- spin_unlock(&hash->gh_lock);
- RETURN(-EINVAL);
- }
- if (err) {
- GRP_SET_INVALID(entry);
- GOTO(out, rc = -EINVAL);
- }
-
- if (!GRP_IS_ACQUIRING(entry) ||
- GRP_IS_INVALID(entry) ||
- GRP_IS_EXPIRED(entry)) {
- CERROR("found a stale entry %p(uid %d) in ioctl\n",
- entry, entry->ge_uid);
- GOTO(out, rc = -EINVAL);
- }
-
- atomic_inc(&entry->ge_refcount);
- spin_unlock(&hash->gh_lock);
- rc = entry_set_group_info(entry, ngroups, groups);
- spin_lock(&hash->gh_lock);
- atomic_dec(&entry->ge_refcount);
- if (rc) {
- GRP_SET_INVALID(entry);
- list_del_init(&entry->ge_hash);
- GOTO(out, rc);
- }
- entry->ge_acquisition_time = LTIME_S(CURRENT_TIME);
- entry->ge_expire = jiffies + hash->gh_entry_expire * HZ;
- GRP_SET_VALID(entry);
- CDEBUG(D_OTHER, "created mds_grp_entry %p for uid %d\n",
- entry, entry->ge_uid);
-out:
- wake_up_all(&entry->ge_waitq);
- spin_unlock(&hash->gh_lock);
- RETURN(rc);
-}
-
-static void mds_flush_group_hash(struct mds_grp_hash *hash, int force)
-{
- struct mds_grp_hash_entry *entry, *next;
- int i;
- ENTRY;
-
- spin_lock(&hash->gh_lock);
- for (i = 0; i < MDSGRP_HASH_SIZE; i++) {
- list_for_each_entry_safe(entry, next,
- &hash->gh_table[i], ge_hash) {
- if (!force && atomic_read(&entry->ge_refcount)) {
- GRP_SET_EXPIRED(entry);
- continue;
- }
- LASSERT(!atomic_read(&entry->ge_refcount));
- free_entry(entry);
- }
- }
- spin_unlock(&hash->gh_lock);
- EXIT;
-}
-
-void mds_group_hash_flush_idle()
-{
- mds_flush_group_hash(&_group_hash, 0);
-}
-
-int mds_allow_setgroups(void)
-{
- return _group_hash.gh_allow_setgroups;
-}
-
-int mds_group_hash_init()
-{
- struct mds_grp_hash *hash;
- int i;
- ENTRY;
-
- hash = &_group_hash;
-
- spin_lock_init(&hash->gh_lock);
- for (i = 0; i < MDSGRP_HASH_SIZE; i++)
- INIT_LIST_HEAD(&hash->gh_table[i]);
- /* set default value, proc tunable */
- sprintf(hash->gh_upcall, "%s", "/sbin/l_getgroups");
- hash->gh_entry_expire = 5 * 60;
- hash->gh_acquire_expire = 5;
- hash->gh_allow_setgroups = 0;
-
- RETURN(0);
-}
-
-void mds_group_hash_cleanup()
-{
- mds_flush_group_hash(&_group_hash, 1);
-}
void groups_from_buffer(struct group_info *ginfo, __u32 *gids);
int mds_update_unpack(struct ptlrpc_request *, int offset,
struct mds_update_record *);
-int mds_init_ucred(struct lvfs_ucred *ucred, struct mds_req_sec_desc *rsd);
+int mds_idmap_set(struct mds_export_data *med, __u32 id1, __u32 id2,
+ int is_uid_mapping);
+__u32 mds_idmap_get(struct mds_export_data *med, __u32 id,
+ int is_uid_mapping);
+void mds_idmap_cleanup(struct mds_export_data *med);
+void mds_reverse_map_ugid(struct ptlrpc_request *req,
+ struct mds_body *body);
+int mds_init_ucred(struct lvfs_ucred *ucred, struct ptlrpc_request *req,
+ struct mds_req_sec_desc *rsd);
void mds_exit_ucred(struct lvfs_ucred *ucred);
/* mds/mds_unlink_open.c */
int mds_pack_md(struct obd_device *, struct lustre_msg *, int offset,
struct mds_body *, struct inode *, int lock);
-
+int mds_pack_link(struct dentry *dentry, struct ptlrpc_request *req,
+ struct mds_body *repbody, int reply_off);
+int mds_pack_ea(struct dentry *dentry, struct ptlrpc_request *req,
+ struct mds_body *repbody, int req_off, int reply_off);
+int mds_pack_ealist(struct dentry *dentry, struct ptlrpc_request *req,
+ struct mds_body *repbody, int reply_off);
+int mds_pack_acl(struct obd_device *, struct lustre_msg *, int offset,
+ struct mds_body *, struct inode *);
int mds_pack_inode2id(struct obd_device *, struct lustre_id *,
struct inode *, int);
int mds_convert_mea_ea(struct obd_device *, struct inode *, struct lov_mds_md *, int);
int mds_is_dir_empty(struct obd_device *, struct dentry *);
-/* mds_groups.c */
-int mds_group_hash_init(void);
-void mds_group_hash_cleanup(void);
-void mds_group_hash_flush_idle(void);
-int mds_allow_setgroups(void);
-
-extern char mds_getgroups_upcall[PATH_MAX];
-extern int mds_grp_hash_entry_expire;
-extern int mds_grp_hash_acquire_expire;
-
-struct mds_grp_hash *__mds_get_global_group_hash(void);
-struct mds_grp_hash_entry * mds_get_group_entry(struct mds_obd *mds, uid_t uid);
-void mds_put_group_entry(struct mds_obd *mds, struct mds_grp_hash_entry *entry);
-int mds_handle_group_downcall(int err, uid_t uid, __u32 ngroups, gid_t *groups);
+/* mds_lsd.c */
+struct upcall_cache *__mds_get_global_lsd_cache(void);
+int mds_init_lsd_cache(void);
+void mds_cleanup_lsd_cache(void);
+struct lustre_sec_desc * mds_get_lsd(__u32 uid);
+void mds_put_lsd(struct lustre_sec_desc *lsd);
+void mds_flush_lsd(__u32 id);
#endif /* _MDS_INTERNAL_H */
}
if (req->rq_reqmsg->bufcount > offset + 2) {
- r->ur_logcookies = lustre_msg_buf(req->rq_reqmsg, offset + 2, 0);
- if (r->ur_eadata == NULL)
+ r->ur_ea2data = lustre_msg_buf(req->rq_reqmsg, offset + 2, 0);
+ if (r->ur_ea2data == NULL)
RETURN (-EFAULT);
- r->ur_cookielen = req->rq_reqmsg->buflens[offset + 2];
+ r->ur_ea2datalen = req->rq_reqmsg->buflens[offset + 2];
}
RETURN(0);
RETURN(rc);
}
+static
+struct mds_idmap_table *__get_idmap_table(struct mds_export_data *med,
+ int create)
+{
+ struct mds_idmap_table *new;
+ int i;
+
+ if (!create || med->med_idmap)
+ return med->med_idmap;
+
+ spin_unlock(&med->med_idmap_lock);
+ OBD_ALLOC(new, sizeof(*new));
+ spin_lock(&med->med_idmap_lock);
+
+ if (!new) {
+ CERROR("fail to alloc %d\n", sizeof(*new));
+ return NULL;
+ }
+
+ if (med->med_idmap) {
+ OBD_FREE(new, sizeof(*new));
+ return med->med_idmap;
+ }
+
+ for (i = 0; i < MDS_IDMAP_HASHSIZE; i++) {
+ INIT_LIST_HEAD(&new->uidmap[i]);
+ INIT_LIST_HEAD(&new->gidmap[i]);
+ }
+
+ CDEBUG(D_SEC, "allocate idmap table for med %p\n", med);
+ med->med_idmap = new;
+ return new;
+}
+
+static void __flush_mapping_table(struct list_head *table)
+{
+ struct mds_idmap_item *item;
+ int i;
+
+ for (i = 0; i < MDS_IDMAP_HASHSIZE; i++) {
+ while (!list_empty(&table[i])) {
+ item = list_entry(table[i].next, struct mds_idmap_item,
+ hash);
+ list_del(&item->hash);
+ OBD_FREE(item, sizeof(*item));
+ }
+ }
+}
+
+void mds_idmap_cleanup(struct mds_export_data *med)
+{
+ ENTRY;
+
+ if (!med->med_idmap) {
+ EXIT;
+ return;
+ }
+
+ spin_lock(&med->med_idmap_lock);
+ __flush_mapping_table(med->med_idmap->uidmap);
+ __flush_mapping_table(med->med_idmap->gidmap);
+ OBD_FREE(med->med_idmap, sizeof(struct mds_idmap_table));
+ spin_unlock(&med->med_idmap_lock);
+}
+
+static inline int idmap_hash(__u32 id)
+{
+ return (id & (MDS_IDMAP_HASHSIZE - 1));
+}
+
+static
+int __idmap_set_item(struct mds_export_data *med,
+ struct list_head *table,
+ __u32 id1, __u32 id2)
+{
+ struct list_head *head;
+ struct mds_idmap_item *item, *new = NULL;
+ int found = 0;
+
+ head = table + idmap_hash(id1);
+again:
+ list_for_each_entry(item, head, hash) {
+ if (item->id1 == id1) {
+ found = 1;
+ break;
+ }
+ }
+
+ if (!found) {
+ if (new == NULL) {
+ spin_unlock(&med->med_idmap_lock);
+ OBD_ALLOC(new, sizeof(*new));
+ spin_lock(&med->med_idmap_lock);
+ if (!new) {
+ CERROR("fail to alloc %d\n", sizeof(*new));
+ return -ENOMEM;
+ }
+ goto again;
+ }
+ new->id1 = id1;
+ new->id2 = id2;
+ list_add(&new->hash, head);
+ } else {
+ if (new)
+ OBD_FREE(new, sizeof(*new));
+ if (item->id2 != id2) {
+ CWARN("mapping changed: %u ==> (%u -> %u)\n",
+ id1, item->id2, id2);
+ item->id2 = id2;
+ }
+ list_move(&item->hash, head);
+ }
+
+ return 0;
+}
+
+int mds_idmap_set(struct mds_export_data *med, __u32 id1, __u32 id2,
+ int is_uid_mapping)
+{
+ struct mds_idmap_table *idmap;
+ int rc;
+ ENTRY;
+
+ spin_lock(&med->med_idmap_lock);
+
+ idmap = __get_idmap_table(med, 1);
+ if (!idmap)
+ GOTO(out, rc = -ENOMEM);
+
+ if (is_uid_mapping)
+ rc = __idmap_set_item(med, idmap->uidmap, id1, id2);
+ else
+ rc = __idmap_set_item(med, idmap->gidmap, id1, id2);
+
+out:
+ spin_unlock(&med->med_idmap_lock);
+ RETURN(rc);
+}
+
+__u32 mds_idmap_get(struct mds_export_data *med, __u32 id,
+ int is_uid_mapping)
+{
+ struct mds_idmap_table *idmap;
+ struct list_head *table;
+ struct list_head *head;
+ struct mds_idmap_item *item;
+ int found = 0;
+ __u32 res;
+
+ spin_lock(&med->med_idmap_lock);
+ idmap = __get_idmap_table(med, 0);
+ if (!idmap)
+ goto nllu;
+
+ table = is_uid_mapping ? idmap->uidmap : idmap->gidmap;
+ head = table + idmap_hash(id);
+
+ list_for_each_entry(item, head, hash) {
+ if (item->id1 == id) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found)
+ goto nllu;
+
+ res = item->id2;
+out:
+ spin_unlock(&med->med_idmap_lock);
+ return res;
+nllu:
+ res = is_uid_mapping ? med->med_nllu : med->med_nllg;
+ goto out;
+}
+
+void mds_reverse_map_ugid(struct ptlrpc_request *req,
+ struct mds_body *body)
+{
+ struct mds_export_data *med = &req->rq_export->u.eu_mds_data;
+
+ LASSERT(req->rq_remote);
+
+ if (body->valid & OBD_MD_FLUID)
+ body->uid = mds_idmap_get(med, body->uid, 1);
+
+ if (body->valid & OBD_MD_FLGID)
+ body->gid = mds_idmap_get(med, body->gid, 0);
+}
+
static inline void drop_ucred_ginfo(struct lvfs_ucred *ucred)
{
if (ucred->luc_ginfo) {
}
}
+static inline void drop_ucred_lsd(struct lvfs_ucred *ucred)
+{
+ if (ucred->luc_lsd) {
+ mds_put_lsd(ucred->luc_lsd);
+ ucred->luc_lsd = NULL;
+ }
+}
+
/*
+ * the heart of the uid/gid handling and security checking.
+ *
* root could set any group_info if we allowed setgroups, while
* normal user only could 'reduce' their group members -- which
* is somewhat expensive.
*/
-int mds_init_ucred(struct lvfs_ucred *ucred, struct mds_req_sec_desc *rsd)
+int mds_init_ucred(struct lvfs_ucred *ucred,
+ struct ptlrpc_request *req,
+ struct mds_req_sec_desc *rsd)
{
+ struct mds_obd *mds = &req->rq_export->exp_obd->u.mds;
+ struct mds_export_data *med = &req->rq_export->u.eu_mds_data;
+ struct lustre_sec_desc *lsd;
+ ptl_nid_t peernid = req->rq_peer.peer_id.nid;
struct group_info *gnew;
-
+ unsigned int setuid, setgid, strong_sec;
ENTRY;
+
LASSERT(ucred);
LASSERT(rsd);
+ LASSERT(rsd->rsd_ngroups <= LUSTRE_MAX_GROUPS);
+
+ strong_sec = (req->rq_auth_uid != -1);
+ LASSERT(!(req->rq_remote && !strong_sec));
+
+ /* sanity check & set local/remote flag */
+ if (req->rq_remote) {
+ if (med->med_local) {
+ CWARN("exp %p: client on nid "LPX64" was local, "
+ "set to remote\n", req->rq_export, peernid);
+ med->med_local = 0;
+ }
+ } else {
+ if (!med->med_local) {
+ CWARN("exp %p: client on nid "LPX64" was remote, "
+ "set to local\n", req->rq_export, peernid);
+ med->med_local = 1;
+ }
+ }
+
+ setuid = (rsd->rsd_fsuid != rsd->rsd_uid);
+ setgid = (rsd->rsd_fsgid != rsd->rsd_gid);
+
+ /* deny setuid/setgid for remote client */
+ if ((setuid || setgid) && !med->med_local) {
+ CWARN("deny setxid (%u/%u) from remote client "LPX64"\n",
+ setuid, setgid, peernid);
+ RETURN(-EPERM);
+ }
+
+ /* take care of uid/gid mapping for client in remote realm */
+ if (req->rq_remote) {
+ /* record the uid mapping here */
+ mds_idmap_set(med, req->rq_auth_uid, rsd->rsd_uid, 1);
+
+ /* now we act as the authenticated user */
+ rsd->rsd_uid = rsd->rsd_fsuid = req->rq_auth_uid;
+ } else if (strong_sec && req->rq_auth_uid != rsd->rsd_uid) {
+ /* if we use strong authentication on this request, we
+ * expect the uid which client claimed is true.
+ *
+	 * FIXME root's machine_credential in krb5 will be interpreted
+ * as "nobody", which is not good for mds-mds and mds-ost
+ * connection.
+ */
+ CWARN("nid "LPX64": UID %u was authenticated while client "
+ "claimed %u, set %u by force\n",
+ peernid, req->rq_auth_uid, rsd->rsd_uid,
+ req->rq_auth_uid);
+ rsd->rsd_uid = req->rq_auth_uid;
+ }
+
+ /* now lsd come into play */
+ ucred->luc_ginfo = NULL;
+ ucred->luc_lsd = lsd = mds_get_lsd(rsd->rsd_uid);
+
+ if (lsd) {
+ if (req->rq_remote) {
+ /* record the gid mapping here */
+ mds_idmap_set(med, lsd->lsd_gid, rsd->rsd_gid, 0);
+ /* now we act as the authenticated group */
+ rsd->rsd_gid = rsd->rsd_fsgid = lsd->lsd_gid;
+ } else if (rsd->rsd_gid != lsd->lsd_gid) {
+ /* verify gid which client declared is true */
+ CWARN("GID: %u while client declare %u, "
+ "set %u by force\n",
+ lsd->lsd_gid, rsd->rsd_gid,
+ lsd->lsd_gid);
+ rsd->rsd_gid = lsd->lsd_gid;
+ }
+
+ if (lsd->lsd_ginfo) {
+ ucred->luc_ginfo = lsd->lsd_ginfo;
+ get_group_info(ucred->luc_ginfo);
+ }
+
+ /* check permission of setuid */
+ if (setuid) {
+ if (!lsd->lsd_allow_setuid) {
+ CWARN("mds blocked setuid attempt: %u -> %u\n",
+ rsd->rsd_uid, rsd->rsd_fsuid);
+ RETURN(-EPERM);
+ }
+ }
+
+ /* check permission of setgid */
+ if (setgid) {
+ if (!lsd->lsd_allow_setgid) {
+ CWARN("mds blocked setgid attempt: %u -> %u\n",
+ rsd->rsd_gid, rsd->rsd_fsgid);
+ RETURN(-EPERM);
+ }
+ }
+ } else {
+ /* failed to get lsd, right now we simply deny any access
+ * if strong authentication is used,
+ */
+ if (strong_sec) {
+ CWARN("mds deny access without LSD\n");
+ RETURN(-EPERM);
+ }
+
+ /* and otherwise deny setuid/setgid attempt */
+ if (setuid || setgid) {
+ CWARN("mds deny setuid/setgid without LSD\n");
+ RETURN(-EPERM);
+ }
+ }
+	/* NOTE: we have already obtained supplementary groups;
+	 * they will be retained across root_squash. Will this be a
+	 * security problem??
+	 */
+ mds_squash_root(mds, rsd, &peernid);
+
+ /* remove privilege for non-root user */
+ if (rsd->rsd_fsuid)
+ rsd->rsd_cap &= ~CAP_FS_MASK;
+
+ /* by now every fields in rsd have been granted */
ucred->luc_fsuid = rsd->rsd_fsuid;
ucred->luc_fsgid = rsd->rsd_fsgid;
ucred->luc_cap = rsd->rsd_cap;
ucred->luc_uid = rsd->rsd_uid;
- ucred->luc_ghash = mds_get_group_entry(NULL, rsd->rsd_uid);
- ucred->luc_ginfo = NULL;
- if (ucred->luc_ghash && ucred->luc_ghash->ge_group_info) {
- ucred->luc_ginfo = ucred->luc_ghash->ge_group_info;
- get_group_info(ucred->luc_ginfo);
- }
-
- /* everything is done if we don't allow set groups */
- if (!mds_allow_setgroups())
+ /* everything is done if we don't allow setgroups */
+ if (!lsd || !lsd->lsd_allow_setgrp)
RETURN(0);
- if (rsd->rsd_ngroups > LUSTRE_MAX_GROUPS) {
- CERROR("client provide too many groups: %d\n",
- rsd->rsd_ngroups);
- drop_ucred_ginfo(ucred);
- mds_put_group_entry(NULL, ucred->luc_ghash);
- RETURN(-EFAULT);
- }
-
if (ucred->luc_uid == 0) {
if (rsd->rsd_ngroups == 0) {
drop_ucred_ginfo(ucred);
if (!gnew) {
CERROR("out of memory\n");
drop_ucred_ginfo(ucred);
- mds_put_group_entry(NULL, ucred->luc_ghash);
+ drop_ucred_lsd(ucred);
RETURN(-ENOMEM);
}
groups_from_buffer(gnew, rsd->rsd_groups);
- /* can't rely on client to sort them */
- groups_sort(gnew);
+ groups_sort(gnew); /* can't rely on client */
drop_ucred_ginfo(ucred);
ucred->luc_ginfo = gnew;
if (!gnew) {
CERROR("out of memory\n");
drop_ucred_ginfo(ucred);
- mds_put_group_entry(NULL, ucred->luc_ghash);
+ drop_ucred_lsd(ucred);
RETURN(-ENOMEM);
}
while (cur < rsd->rsd_ngroups) {
if (groups_search(ginfo, rsd->rsd_groups[cur])) {
GROUP_AT(gnew, set) = rsd->rsd_groups[cur];
- set++;
- }
+ set++;
+ }
cur++;
}
gnew->ngroups = set;
void mds_exit_ucred(struct lvfs_ucred *ucred)
{
ENTRY;
-
- if (ucred->luc_ginfo)
- put_group_info(ucred->luc_ginfo);
- if (ucred->luc_ghash)
- mds_put_group_entry(NULL, ucred->luc_ghash);
-
+ drop_ucred_ginfo(ucred);
+ drop_ucred_lsd(ucred);
EXIT;
}
#define DEBUG_SUBSYSTEM S_MDS
#include <linux/module.h>
+#include <linux/dcache.h>
+#include <linux/namei.h>
+#include <linux/obd_support.h>
+#include <linux/obd_class.h>
+#include <linux/obd.h>
+#include <linux/lustre_lib.h>
#include <linux/lustre_mds.h>
#include <linux/lustre_idl.h>
#include <linux/obd_class.h>
#include <linux/obd_lov.h>
#include <linux/lustre_lib.h>
#include <linux/lustre_fsfilt.h>
+#include <linux/lustre_lite.h>
#include "mds_internal.h"
-
/*
* TODO:
* - magic in mea struct
if (rc)
GOTO(err_reg, rc);
+ if (mds->mds_mds_sec) {
+ rc = obd_set_info(mds->mds_md_exp, strlen("sec"), "sec",
+ strlen(mds->mds_mds_sec), mds->mds_mds_sec);
+ if (rc)
+ GOTO(err_reg, rc);
+ }
+
mds->mds_md_connected = 1;
up(&mds->mds_md_sem);
RETURN(0);
op_data->mea1 = mea;
it.it_op = IT_UNLINK;
+ OBD_ALLOC(it.d.fs_data, sizeof(struct lustre_intent_data));
+
rc = md_enqueue(mds->mds_md_exp, LDLM_IBITS, &it, LCK_EX,
op_data, *rlockh, NULL, 0, ldlm_completion_ast,
mds_blocking_ast, NULL);
OBD_FREE(op_data, sizeof(*op_data));
+ OBD_FREE(it.d.fs_data, sizeof(struct lustre_intent_data));
EXIT;
cleanup:
OBD_FREE(mea, mea_size);
CERROR("Can't unpack security desc\n");
GOTO(cleanup, rc = -EFAULT);
}
- mds_squash_root(&obd->u.mds, rsd, &req->rq_peer.peer_id.nid);
body = lustre_swab_reqbuf(req, offset, sizeof(*body),
lustre_swab_mds_body);
if (!S_ISDIR(dentry->d_inode->i_mode))
GOTO(cleanup, rc = 0);
- rc = mds_init_ucred(&uc, rsd);
+ rc = mds_init_ucred(&uc, req, rsd);
if (rc) {
CERROR("can't init ucred\n");
GOTO(cleanup, rc);
RETURN(-ENOTCONN);
}
+ if (mds->mds_ost_sec) {
+ rc = obd_set_info(mds->mds_dt_obd->obd_self_export,
+ strlen("sec"), "sec",
+ strlen(mds->mds_ost_sec), mds->mds_ost_sec);
+ if (rc) {
+ mds->mds_dt_obd = ERR_PTR(rc);
+ RETURN(rc);
+ }
+ }
+
CDEBUG(D_HA, "obd: %s osc: %s lov_name: %s\n",
obd->obd_name, mds->mds_dt_obd->obd_name, lov_name);
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (c) 2004 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_MDS
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/kmod.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/version.h>
+#include <linux/unistd.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <asm/uaccess.h>
+#include <linux/slab.h>
+#include <asm/segment.h>
+
+#include <libcfs/list.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_lib.h>
+#include <linux/lustre_mds.h>
+#include <linux/lustre_ucache.h>
+
+#include "mds_internal.h"
+
+/*
+ * We need share hash table among the groups of MDSs (which server as the same
+ * lustre file system), maybe MDT? but there's lprocfs problems of putting this
+ * in MDT. so we make it global to the module. which brings the limitation that
+ * one node couldn't running multiple MDS which server as different Lustre FS.
+ * but which maybe not meaningful.
+ */
+
+
+#define MDS_LSD_HASHSIZE (256)
+static struct upcall_cache _lsd_cache;
+static struct list_head _lsd_hashtable[MDS_LSD_HASHSIZE];
+
+struct upcall_cache *__mds_get_global_lsd_cache()
+{
+ return &_lsd_cache;
+}
+
+static unsigned int lsd_hash(struct upcall_cache *cache, __u64 key)
+{
+ LASSERT(cache == &_lsd_cache);
+ return ((__u32) key) & (MDS_LSD_HASHSIZE - 1);
+}
+
+static struct upcall_cache_entry *
+lsd_alloc_entry(struct upcall_cache *cache, __u64 key)
+{
+ struct lsd_cache_entry *entry;
+ ENTRY;
+
+ OBD_ALLOC(entry, sizeof(*entry));
+ if (!entry) {
+ CERROR("failed to alloc entry\n");
+ RETURN(NULL);
+ }
+ upcall_cache_init_entry(cache, &entry->base, key);
+
+ RETURN(&entry->base);
+}
+
+static void lsd_free_entry(struct upcall_cache *cache,
+ struct upcall_cache_entry *entry)
+{
+ struct lsd_cache_entry *lentry;
+
+ lentry = container_of(entry, struct lsd_cache_entry, base);
+ if (lentry->lsd.lsd_ginfo)
+ put_group_info(lentry->lsd.lsd_ginfo);
+ OBD_FREE(lentry, sizeof(*lentry));
+}
+
+
+static int lsd_make_upcall(struct upcall_cache *cache,
+ struct upcall_cache_entry *entry)
+{
+ char *argv[4];
+ char *envp[3];
+ char uidstr[16];
+ int rc;
+ ENTRY;
+
+ snprintf(uidstr, 16, "%u", (__u32) entry->ue_key);
+
+ argv[0] = cache->uc_upcall;
+ argv[1] = uidstr;
+ argv[2] = NULL;
+
+ envp[0] = "HOME=/";
+ envp[1] = "PATH=/sbin:/usr/sbin";
+ envp[2] = NULL;
+
+ rc = USERMODEHELPER(argv[0], argv, envp);
+ if (rc < 0) {
+ CERROR("Error invoking lsd upcall %s %s: %d; check "
+ "/proc/fs/lustre/mds/lsd_upcall\n",
+ argv[0], argv[1], rc);
+ } else {
+ CWARN("Invoked upcall %s %s\n",
+ argv[0], argv[1]);
+ }
+ RETURN(rc);
+}
+
+static int lsd_parse_downcall(struct upcall_cache *cache,
+ struct upcall_cache_entry *entry,
+ void *args)
+{
+ struct lustre_sec_desc *lsd;
+ struct lsd_cache_entry *lentry;
+ struct lsd_downcall_args *lsd_args;
+ struct group_info *ginfo;
+ ENTRY;
+
+ LASSERT(args);
+
+ lentry = container_of(entry, struct lsd_cache_entry, base);
+ lsd = &lentry->lsd;
+ lsd_args = (struct lsd_downcall_args *) args;
+ LASSERT(lsd_args->err == 0);
+ LASSERT(lsd_args->ngroups <= NGROUPS_MAX);
+
+ ginfo = groups_alloc(lsd_args->ngroups);
+ if (!ginfo) {
+ CERROR("can't alloc group_info for %d groups\n",
+ lsd_args->ngroups);
+ RETURN(-ENOMEM);
+ }
+ groups_from_buffer(ginfo, lsd_args->groups);
+ groups_sort(ginfo);
+
+ lsd->lsd_uid = lsd_args->uid;
+ lsd->lsd_gid = lsd_args->gid;
+ lsd->lsd_ginfo = ginfo;
+ lsd->lsd_allow_setuid = lsd_args->allow_setuid;
+ lsd->lsd_allow_setgid = lsd_args->allow_setgid;
+ lsd->lsd_allow_setgrp = lsd_args->allow_setgrp;
+
+ CWARN("LSD: uid %u gid %u ngroups %u, perm (%d/%d/%d)\n",
+ lsd->lsd_uid, lsd->lsd_gid, ginfo->ngroups,
+ lsd->lsd_allow_setuid, lsd->lsd_allow_setgid,
+ lsd->lsd_allow_setgrp);
+ RETURN(0);
+}
+
+struct lustre_sec_desc * mds_get_lsd(__u32 uid)
+{
+ struct upcall_cache *cache = &_lsd_cache;
+ struct upcall_cache_entry *entry;
+ struct lsd_cache_entry *lentry;
+
+ entry = upcall_cache_get_entry(cache, (__u64) uid);
+ if (!entry)
+ return NULL;
+
+ lentry = container_of(entry, struct lsd_cache_entry, base);
+ return &lentry->lsd;
+}
+
+void mds_put_lsd(struct lustre_sec_desc *lsd)
+{
+ struct lsd_cache_entry *lentry;
+
+ LASSERT(lsd);
+
+ lentry = container_of(lsd, struct lsd_cache_entry, lsd);
+ upcall_cache_put_entry(&lentry->base);
+}
+
+int mds_init_lsd_cache()
+{
+ struct upcall_cache *cache = &_lsd_cache;
+ int i;
+ ENTRY;
+
+ cache->uc_hashtable = _lsd_hashtable;
+ cache->uc_hashsize = MDS_LSD_HASHSIZE;
+ cache->uc_hashlock = RW_LOCK_UNLOCKED;
+ for (i = 0; i < cache->uc_hashsize; i++)
+ INIT_LIST_HEAD(&cache->uc_hashtable[i]);
+ cache->uc_name = "LSD_CACHE";
+
+ /* set default value, proc tunable */
+ sprintf(cache->uc_upcall, "%s", "/sbin/lsd_upcall");
+ cache->uc_entry_expire = 5 * 60;
+ cache->uc_acquire_expire = 5;
+
+ cache->hash = lsd_hash;
+ cache->alloc_entry = lsd_alloc_entry;
+ cache->free_entry = lsd_free_entry;
+ cache->make_upcall = lsd_make_upcall;
+ cache->parse_downcall = lsd_parse_downcall;
+
+ RETURN(0);
+}
+
+void mds_flush_lsd(__u32 id)
+{
+ struct upcall_cache *cache = &_lsd_cache;
+
+ if (id == -1)
+ upcall_cache_flush_idle(cache);
+ else
+ upcall_cache_flush_one(cache, (__u64) id);
+}
+
+void mds_cleanup_lsd_cache()
+{
+ upcall_cache_flush_all(&_lsd_cache);
+}
mds_mfd_put(mfd);
}
-
+#ifdef IFILTERDATA_ACTUALLY_USED
/* Caller must hold mds->mds_epoch_sem */
static int mds_alloc_filterdata(struct inode *inode)
{
inode->i_filterdata = NULL;
iput(inode);
}
+#endif /*IFILTERDATA_ACTUALLY_USED*/
/* Write access to a file: executors cause a negative count,
* writers a positive count. The semaphore is needed to perform
RETURN(-ETXTBSY);
}
-
+#ifdef IFILTERDATA_ACTUALLY_USED
if (MDS_FILTERDATA(inode) && MDS_FILTERDATA(inode)->io_epoch != 0) {
CDEBUG(D_INODE, "continuing MDS epoch "LPU64" for ino %lu/%u\n",
MDS_FILTERDATA(inode)->io_epoch, inode->i_ino,
rc = -ENOMEM;
goto out;
}
+#endif /*IFILTERDATA_ACTUALLY_USED*/
if (epoch > mds->mds_io_epoch)
mds->mds_io_epoch = epoch;
else
mds->mds_io_epoch++;
+#ifdef IFILTERDATA_ACTUALLY_USED
MDS_FILTERDATA(inode)->io_epoch = mds->mds_io_epoch;
CDEBUG(D_INODE, "starting MDS epoch "LPU64" for ino %lu/%u\n",
mds->mds_io_epoch, inode->i_ino, inode->i_generation);
out:
+#endif /*IFILTERDATA_ACTUALLY_USED*/
if (rc == 0)
atomic_inc(&inode->i_writecount);
up(&mds->mds_epoch_sem);
if (!unlinking && !(body->valid & OBD_MD_FLSIZE))
GOTO(out, rc = EAGAIN);
#endif
+#ifdef IFILTERDATA_ACTUALLY_USED
mds_free_filterdata(inode);
+#endif
out:
up(&mds->mds_epoch_sem);
return rc;
error = mds_get_write_access(mds, dentry->d_inode, 0);
if (error)
GOTO(cleanup_mfd, error);
+#ifdef IFILTERDATA_ACTUALLY_USED
body->io_epoch = MDS_FILTERDATA(dentry->d_inode)->io_epoch;
+#endif /*IFILTERDATA_ACTUALLY_USED*/
} else if (flags & FMODE_EXEC) {
error = mds_deny_write_access(mds, dentry->d_inode);
if (error)
}
}
}
+ rc = mds_pack_acl(obd, req->rq_repmsg, 3, body, dchild->d_inode);
+ if (rc < 0) {
+ CERROR("mds_pack_acl: rc = %d\n", rc);
+ up(&dchild->d_inode->i_sem);
+ RETURN(rc);
+ }
+
/* If the inode has no EA data, then MDSs hold size, mtime */
if (S_ISREG(dchild->d_inode->i_mode) &&
!(body->valid & OBD_MD_FLEASIZE)) {
#include <linux/fs.h>
#include <linux/jbd.h>
+#include <linux/namei.h>
#include <linux/ext3_fs.h>
#include <linux/obd_support.h>
#include <linux/obd_class.h>
#include <linux/lustre_dlm.h>
#include <linux/lustre_log.h>
#include <linux/lustre_fsfilt.h>
+#include <linux/lustre_lite.h>
#include "mds_internal.h"
struct mds_logcancel_data {
* chown_common and inode_setattr
* utimes and inode_setattr
*/
+#ifndef ATTR_RAW
+/* Just for the case if we have some clients that know about ATTR_RAW */
+#define ATTR_RAW 8192
+#endif
int mds_fix_attr(struct inode *inode, struct mds_update_record *rec)
{
time_t now = LTIME_S(CURRENT_TIME);
ENTRY;
/* only fix up attrs if the client VFS didn't already */
+
if (!(ia_valid & ATTR_RAW))
RETURN(0);
if (oldrep->rs_xid != req->rq_xid)
continue;
- if (oldrep->rs_msg.opc != req->rq_reqmsg->opc)
+ if (oldrep->rs_msg->opc != req->rq_reqmsg->opc)
CERROR ("Resent req xid "LPX64" has mismatched opc: "
"new %d old %d\n", req->rq_xid,
- req->rq_reqmsg->opc, oldrep->rs_msg.opc);
+ req->rq_reqmsg->opc, oldrep->rs_msg->opc);
svc = oldrep->rs_srv_ni->sni_service;
spin_lock (&svc->srv_lock);
CWARN("Stealing %d locks from rs %p x"LPD64".t"LPD64
" o%d NID %s\n", oldrep->rs_nlocks, oldrep,
- oldrep->rs_xid, oldrep->rs_transno, oldrep->rs_msg.opc,
+ oldrep->rs_xid, oldrep->rs_transno, oldrep->rs_msg->opc,
ptlrpc_peernid2str(&exp->exp_connection->c_peer, str));
for (i = 0; i < oldrep->rs_nlocks; i++)
else /* setattr */
rc = fsfilt_setattr(obd, de, handle, &rec->ur_iattr, 0);
- if (rc == 0 && (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) &&
- rec->ur_eadata != NULL) {
- struct lov_stripe_md *lsm = NULL;
-
- rc = ll_permission(inode, MAY_WRITE, NULL);
- if (rc < 0)
- GOTO(cleanup, rc);
+ if (rc == 0) {
+ if (rec->ur_iattr.ia_valid & ATTR_EA) {
+ int flags = (int)rec->ur_iattr.ia_attr_flags;
+
+ rc = -EOPNOTSUPP;
+ if (inode->i_op && inode->i_op->setxattr)
+ rc = inode->i_op->setxattr(de, rec->ur_eadata,
+ rec->ur_ea2data, rec->ur_ea2datalen,
+ flags);
+ } else if (rec->ur_iattr.ia_valid & ATTR_EA_RM) {
+ rc = -EOPNOTSUPP;
+ if (inode->i_op && inode->i_op->removexattr)
+ rc = inode->i_op->removexattr(de,
+ rec->ur_eadata);
+ } else if ((S_ISREG(inode->i_mode) ||
+ S_ISDIR(inode->i_mode)) && rec->ur_eadata != NULL) {
+ struct lov_stripe_md *lsm = NULL;
+
+ rc = ll_permission(inode, MAY_WRITE, NULL);
+ if (rc < 0)
+ GOTO(cleanup, rc);
- rc = obd_iocontrol(OBD_IOC_LOV_SETSTRIPE, mds->mds_dt_exp,
- 0, &lsm, rec->ur_eadata);
- if (rc)
- GOTO(cleanup, rc);
+ rc = obd_iocontrol(OBD_IOC_LOV_SETSTRIPE, mds->mds_dt_exp,
+ 0, &lsm, rec->ur_eadata);
+ if (rc)
+ GOTO(cleanup, rc);
- obd_free_memmd(mds->mds_dt_exp, &lsm);
+ obd_free_memmd(mds->mds_dt_exp, &lsm);
- rc = fsfilt_set_md(obd, inode, handle, rec->ur_eadata,
- rec->ur_eadatalen);
- if (rc)
- GOTO(cleanup, rc);
+ rc = fsfilt_set_md(obd, inode, handle, rec->ur_eadata,
+ rec->ur_eadatalen);
+ if (rc)
+ GOTO(cleanup, rc);
+ }
}
body = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*body));
if (rec->ur_iattr.ia_valid & (ATTR_ATIME | ATTR_ATIME_SET))
body->valid |= OBD_MD_FLATIME;
+ /* The logcookie should be no use anymore, why nobody remove
+ * following code block?
+ */
+ LASSERT(rec->ur_cookielen == 0);
if (rc == 0 && rec->ur_cookielen && !IS_ERR(mds->mds_dt_obd)) {
OBD_ALLOC(mlcd, sizeof(*mlcd) + rec->ur_cookielen +
rec->ur_eadatalen);
mds_pack_dentry2id(obd, &op_data->id1, dentry, 1);
it.it_op = IT_UNLINK;
+ OBD_ALLOC(it.d.fs_data, sizeof(struct lustre_intent_data));
+ if (!it.d.fs_data)
+ RETURN(-ENOMEM);
rc = md_enqueue(mds->mds_md_exp, LDLM_IBITS, &it, LCK_EX,
op_data, rlockh, NULL, 0, ldlm_completion_ast,
mds_blocking_ast, NULL);
OBD_FREE(op_data, sizeof(*op_data));
- if (rc)
- RETURN(rc);
+ if (rc) {
+ OBD_FREE(it.d.fs_data,
+ sizeof(struct lustre_intent_data));
+ RETURN(rc);
+ }
if (rlockh->cookie != 0)
ldlm_lock_decref(rlockh, LCK_EX);
- if (it.d.lustre.it_data) {
- req = (struct ptlrpc_request *)it.d.lustre.it_data;
+ if (LUSTRE_IT(&it)->it_data) {
+ req = (struct ptlrpc_request *)LUSTRE_IT(&it)->it_data;
ptlrpc_req_finished(req);
}
- if (it.d.lustre.it_status)
- rc = it.d.lustre.it_status;
+ if (LUSTRE_IT(&it)->it_status)
+ rc = LUSTRE_IT(&it)->it_status;
+ OBD_FREE(it.d.fs_data, sizeof(struct lustre_intent_data));
OBD_FREE(rlockh, handle_size);
}
RETURN(rc);
);
}
+#if ENABLE_GSS
+/* FIXME move these staff to proper place */
+int (*lustre_secinit_downcall_handler)(const char *buffer,
+ long count) = NULL;
+EXPORT_SYMBOL(lustre_secinit_downcall_handler);
+
+int obd_proc_write_secinit(struct file *file, const char *buffer,
+ unsigned long count, void *data)
+{
+ int rc = 0;
+
+ if (lustre_secinit_downcall_handler) {
+ rc = (*lustre_secinit_downcall_handler)((char *)buffer, count);
+ if (rc) {
+ LASSERT(rc < 0);
+ return rc;
+ }
+ }
+ return (int)count;
+}
+#endif
+
/* Root for /proc/fs/lustre */
struct proc_dir_entry *proc_lustre_root = NULL;
struct lprocfs_vars lprocfs_base[] = {
{ "version", obd_proc_read_version, NULL, NULL },
{ "kernel_version", obd_proc_read_kernel_version, NULL, NULL },
{ "pinger", obd_proc_read_pinger, NULL, NULL },
+#if ENABLE_GSS
+ { "secinit", NULL, obd_proc_write_secinit, NULL },
+#endif
{ 0 }
};
if (import->imp_connection)
ptlrpc_put_connection_superhack(import->imp_connection);
+ LASSERT(!import->imp_sec);
while (!list_empty(&import->imp_conn_list)) {
struct obd_import_conn *imp_conn;
INIT_LIST_HEAD(&imp->imp_replay_list);
INIT_LIST_HEAD(&imp->imp_sending_list);
INIT_LIST_HEAD(&imp->imp_delayed_list);
+ INIT_LIST_HEAD(&imp->imp_rawrpc_list);
spin_lock_init(&imp->imp_lock);
imp->imp_conn_cnt = 0;
imp->imp_max_transno = 0;
struct inode *inode)
{
struct llog_size_change_rec *lsc;
- int rc;
+#ifdef IFILTERDATA_ACTUALLY_USED
struct ost_filterdata *ofd;
+#endif
+ int rc;
ENTRY;
down(&inode->i_sem);
+#ifdef IFILTERDATA_ACTUALLY_USED
ofd = inode->i_filterdata;
if (ofd && ofd->ofd_epoch >= io_epoch) {
inode->i_filterdata = ofd;
ofd->ofd_epoch = io_epoch;
}
+#endif
/* the decision to write a record is now made, unlock */
up(&inode->i_sem);
rc = 0;
}
- out:
+#ifdef IFILTERDATA_ACTUALLY_USED
+out:
+#endif
RETURN(rc);
}
struct obd_llogs * filter_grab_llog_for_group(struct obd_device *,
#ifdef __KERNEL__
# include <linux/module.h>
+# include <linux/dcache.h>
+# include <linux/namei.h>
# include <linux/obd.h>
# include <linux/obd_ost.h>
# include <linux/lustre_net.h>
#include <linux/lustre_dlm.h>
#include <libcfs/kp30.h>
#include <linux/lustre_net.h>
+#include <linux/lustre_sec.h>
#include <lustre/lustre_user.h>
#include <linux/obd_ost.h>
#include <linux/obd_lov.h>
RETURN(0);
}
+ if (keylen == strlen("sec") && memcmp(key, "sec", keylen) == 0) {
+ struct client_obd *cli = &exp->exp_obd->u.cli;
+
+ if (vallen == strlen("null") &&
+ memcmp(val, "null", vallen) == 0) {
+ cli->cl_sec_flavor = PTLRPC_SEC_NULL;
+ cli->cl_sec_subflavor = 0;
+ RETURN(0);
+ }
+ if (vallen == strlen("krb5i") &&
+ memcmp(val, "krb5i", vallen) == 0) {
+ cli->cl_sec_flavor = PTLRPC_SEC_GSS;
+ cli->cl_sec_subflavor = PTLRPC_SEC_GSS_KRB5I;
+ RETURN(0);
+ }
+ if (vallen == strlen("krb5p") &&
+ memcmp(val, "krb5p", vallen) == 0) {
+ cli->cl_sec_flavor = PTLRPC_SEC_GSS;
+ cli->cl_sec_subflavor = PTLRPC_SEC_GSS_KRB5P;
+ RETURN(0);
+ }
+ CERROR("unrecognized security type %s\n", (char*) val);
+ RETURN(-EINVAL);
+ }
+
if (keylen < strlen("mds_conn") ||
memcmp(key, "mds_conn", strlen("mds_conn")) != 0)
RETURN(-EINVAL);
CERROR("bad opc %u version %08x, expecting %08x\n",
msg->opc, msg->version, LUSTRE_LOG_VERSION);
break;
+ case SEC_INIT:
+ case SEC_INIT_CONTINUE:
+ case SEC_FINI:
+ rc = 0;
+ break;
default:
CERROR("OST unexpected opcode %d\n", msg->opc);
rc = -ENOTSUPP;
RETURN(rc);
}
+ /* Security opc should NOT trigger any recovery events */
+ if (req->rq_reqmsg->opc == SEC_INIT ||
+ req->rq_reqmsg->opc == SEC_INIT_CONTINUE ||
+ req->rq_reqmsg->opc == SEC_FINI) {
+ GOTO(out, rc = 0);
+ }
+
/* XXX identical to MDS */
if (req->rq_reqmsg->opc != OST_CONNECT) {
struct obd_device *obd;
COMMON_SOURCES = client.c recover.c connection.c niobuf.c pack_generic.c \
events.c ptlrpc_module.c service.c pinger.c recov_thread.c llog_net.c \
- llog_client.c llog_server.c import.c ptlrpcd.c pers.c \
+ llog_client.c llog_server.c import.c ptlrpcd.c pers.c \
ptlrpc_internal.h $(LDLM_COMM_SOURCES)
if LIBLUSTRE
#include <linux/lustre_lib.h>
#include <linux/lustre_ha.h>
#include <linux/lustre_import.h>
+#include <linux/lustre_sec.h>
#include "ptlrpc_internal.h"
EXIT;
}
+/* FIXME prep_req now should return error code other than NULL. but
+ * this is called everywhere :(
+ */
struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, __u32 version,
int opcode, int count, int *lengths,
char **bufs)
RETURN(NULL);
}
+ request->rq_import = class_import_get(imp);
+
+ rc = ptlrpcs_req_get_cred(request);
+ if (rc) {
+ CDEBUG(D_SEC, "failed to get credential\n");
+ GOTO(out_free, rc);
+ }
+
+ /* just a try on refresh, but we proceed even if it failed */
+ rc = ptlrpcs_cred_refresh(request->rq_cred);
+ if (!ptlrpcs_cred_is_uptodate(request->rq_cred)) {
+ CERROR("req %p: failed to refresh cred %p, rc %d, continue\n",
+ request, request->rq_cred, rc);
+ }
+
rc = lustre_pack_request(request, count, lengths, bufs);
if (rc) {
CERROR("cannot pack request %d\n", rc);
- OBD_FREE(request, sizeof(*request));
- RETURN(NULL);
+ GOTO(out_cred, rc);
}
request->rq_reqmsg->version |= version;
request->rq_send_state = LUSTRE_IMP_FULL;
request->rq_type = PTL_RPC_MSG_REQUEST;
- request->rq_import = class_import_get(imp);
request->rq_req_cbid.cbid_fn = request_out_callback;
request->rq_req_cbid.cbid_arg = request;
request->rq_reqmsg->opc = opcode;
request->rq_reqmsg->flags = 0;
RETURN(request);
+out_cred:
+ ptlrpcs_req_drop_cred(request);
+out_free:
+ class_import_put(imp);
+ OBD_FREE(request, sizeof(*request));
+ RETURN(NULL);
}
struct ptlrpc_request_set *ptlrpc_prep_set(void)
/* Clear reply swab mask; this is a new reply in sender's byte order */
req->rq_rep_swab_mask = 0;
#endif
- LASSERT (req->rq_nob_received <= req->rq_replen);
- rc = lustre_unpack_msg(req->rq_repmsg, req->rq_nob_received);
+ LASSERT (req->rq_nob_received <= req->rq_repbuf_len);
+ rc = ptlrpcs_cli_unwrap_reply(req);
+ if (rc) {
+ CERROR("verify reply error: %d\n", rc);
+ RETURN(rc);
+ }
+ /* unwrap_reply may request rpc be resend */
+ if (req->rq_ptlrpcs_restart) {
+ req->rq_resend = 1;
+ RETURN(0);
+ }
+
+ /* unwrap_reply will set rq_replen as the actual received
+ * lustre_msg length
+ */
+ rc = lustre_unpack_msg(req->rq_repmsg, req->rq_replen);
if (rc) {
CERROR("unpack_rep failed: %d\n", rc);
RETURN(-EPROTO);
req->rq_waiting = 0;
if (req->rq_resend) {
- lustre_msg_add_flags(req->rq_reqmsg,
- MSG_RESENT);
+ if (!req->rq_ptlrpcs_restart)
+ lustre_msg_add_flags(
+ req->rq_reqmsg,
+ MSG_RESENT);
if (req->rq_bulk) {
__u64 old_xid = req->rq_xid;
LASSERTF(request->rq_rqbd == NULL, "req %p\n",request);/* client-side */
LASSERTF(list_empty(&request->rq_list), "req %p\n", request);
LASSERTF(list_empty(&request->rq_set_chain), "req %p\n", request);
+ LASSERT(request->rq_cred);
/* We must take it off the imp_replay_list first. Otherwise, we'll set
* request->rq_reqmsg to NULL while osc_close is dereferencing it. */
LBUG();
}
- if (request->rq_repmsg != NULL) {
- OBD_FREE(request->rq_repmsg, request->rq_replen);
- request->rq_repmsg = NULL;
- }
- if (request->rq_reqmsg != NULL) {
- OBD_FREE(request->rq_reqmsg, request->rq_reqlen);
- request->rq_reqmsg = NULL;
- }
+ if (request->rq_repbuf != NULL)
+ ptlrpcs_cli_free_repbuf(request);
+ if (request->rq_reqbuf != NULL)
+ ptlrpcs_cli_free_reqbuf(request);
+
if (request->rq_export != NULL) {
class_export_put(request->rq_export);
request->rq_export = NULL;
if (request->rq_bulk != NULL)
ptlrpc_free_bulk(request->rq_bulk);
+ ptlrpcs_req_drop_cred(request);
OBD_FREE(request, sizeof(*request));
EXIT;
}
}
if (req->rq_resend) {
- lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT);
+ if (!req->rq_ptlrpcs_restart)
+ lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT);
if (req->rq_bulk != NULL)
ptlrpc_unregister_bulk (req);
/* Clear reply swab mask; this is a new reply in sender's byte order */
req->rq_rep_swab_mask = 0;
#endif
- LASSERT (req->rq_nob_received <= req->rq_replen);
- rc = lustre_unpack_msg(req->rq_repmsg, req->rq_nob_received);
+ LASSERT (req->rq_nob_received <= req->rq_repbuf_len);
+ rc = lustre_unpack_msg(req->rq_repmsg, req->rq_replen);
if (rc) {
CERROR("unpack_rep failed: %d\n", rc);
GOTO(out, rc = -EPROTO);
spin_unlock (&req->rq_lock);
}
+ list_for_each_safe(tmp, n, &imp->imp_rawrpc_list) {
+ struct ptlrpc_request *req =
+ list_entry(tmp, struct ptlrpc_request, rq_list);
+
+ DEBUG_REQ(D_HA, req, "aborting raw rpc");
+
+ spin_lock(&req->rq_lock);
+ req->rq_err = 1;
+ ptlrpc_wake_client_req(req);
+ spin_unlock(&req->rq_lock);
+ }
+
/* Last chance to free reqs left on the replay list, but we
* will still leak reqs that haven't comitted. */
if (imp->imp_replayable)
LASSERT (ev->type == PTL_EVENT_PUT_END ||
ev->type == PTL_EVENT_UNLINK);
LASSERT (ev->unlinked);
- LASSERT (ev->md.start == req->rq_repmsg);
+ LASSERT (ev->md.start == req->rq_repbuf);
LASSERT (ev->offset == 0);
- LASSERT (ev->mlength <= req->rq_replen);
+ LASSERT (ev->mlength <= req->rq_repbuf_len);
DEBUG_REQ((ev->ni_fail_type == PTL_NI_OK) ? D_NET : D_ERROR, req,
"type %d, status %d", ev->type, ev->ni_fail_type);
* flags are reset and scalars are zero. We only set the message
* size to non-zero if this was a successful receive. */
req->rq_xid = ev->match_bits;
- req->rq_reqmsg = ev->md.start + ev->offset;
+ req->rq_reqbuf = ev->md.start + ev->offset;
if (ev->type == PTL_EVENT_PUT_END &&
ev->ni_fail_type == PTL_NI_OK)
- req->rq_reqlen = ev->mlength;
+ req->rq_reqbuf_len = ev->mlength;
do_gettimeofday(&req->rq_arrival_time);
req->rq_peer.peer_id = ev->initiator;
req->rq_peer.peer_ni = rqbd->rqbd_srv_ni->sni_ni;
#include <linux/lustre_export.h>
#include <linux/obd.h>
#include <linux/obd_class.h>
+#include <linux/lustre_sec.h>
#include "ptlrpc_internal.h"
list_add_tail(&tmp->oic_item, &imp->imp_conn_list);
}
- /* switch connection, don't mind if it's same as the current one */
- if (imp->imp_connection)
- ptlrpc_put_connection(imp->imp_connection);
- imp->imp_connection = ptlrpc_connection_addref(imp_conn->oic_conn);
+ /* switch connection if we chose a new one */
+ if (imp->imp_connection != imp_conn->oic_conn) {
+ if (imp->imp_connection) {
+ ptlrpcs_sec_invalidate_cache(imp->imp_sec);
+ ptlrpc_put_connection(imp->imp_connection);
+ }
+ imp->imp_connection =
+ ptlrpc_connection_addref(imp_conn->oic_conn);
+ }
dlmexp = class_conn2export(&imp->imp_dlm_handle);
LASSERT(dlmexp != NULL);
__u64 committed_before_reconnect = 0;
struct ptlrpc_request *request;
int size[] = {sizeof(imp->imp_target_uuid),
- sizeof(obd->obd_uuid),
- sizeof(imp->imp_dlm_handle),
- sizeof(unsigned long)};
+ sizeof(obd->obd_uuid),
+ sizeof(imp->imp_dlm_handle),
+ sizeof(unsigned long),
+ sizeof(__u32) * 2};
char *tmp[] = {imp->imp_target_uuid.uuid,
obd->obd_uuid.uuid,
(char *)&imp->imp_dlm_handle,
- (char *)&imp->imp_connect_flags}; /* XXX: make this portable! */
+ (char *)&imp->imp_connect_flags, /* XXX: make this portable! */
+ (char*) &obd->u.cli.cl_nllu};
struct ptlrpc_connect_async_args *aa;
unsigned long flags;
if (rc)
GOTO(out, rc);
+ LASSERT(imp->imp_sec);
+
request = ptlrpc_prep_req(imp, LUSTRE_OBD_VERSION,
- imp->imp_connect_op, 4, size, tmp);
+ imp->imp_connect_op, 5, size, tmp);
if (!request)
GOTO(out, rc = -ENOMEM);
{ PTLBD_DISCONNECT, "ptlbd_disconnect" },
{ OBD_PING, "obd_ping" },
{ OBD_LOG_CANCEL, "llog_origin_handle_cancel"},
+ { SEC_INIT, "sec_init"},
+ { SEC_INIT_CONTINUE,"sec_init_continue"},
+ { SEC_FINI, "sec_fini"},
};
const char* ll_opcode2str(__u32 opcode)
#ifndef __KERNEL__
#include <liblustre.h>
#endif
+#include <linux/obd_class.h>
#include <linux/obd_support.h>
#include <linux/lustre_net.h>
#include <linux/lustre_lib.h>
#include <linux/obd.h>
+#include <linux/lustre_sec.h>
#include "ptlrpc_internal.h"
static int ptl_send_buf (ptl_handle_md_t *mdh, void *base, int len,
int rc;
/* We must already have a reply buffer (only ptlrpc_error() may be
- * called without one). We must also have a request buffer which
+ * called without one). We usually also have a request buffer which
* is either the actual (swabbed) incoming request, or a saved copy
- * if this is a req saved in target_queue_final_reply(). */
- LASSERT (req->rq_reqmsg != NULL);
+ * if this is a req saved in target_queue_final_reply(). but this
+ * will not be true since some security handling may skip the reqmsg
+ * setting and prepare reply under normal ptlrpc layer */
LASSERT (rs != NULL);
LASSERT (req->rq_repmsg != NULL);
LASSERT (may_be_difficult || !rs->rs_difficult);
- LASSERT (req->rq_repmsg == &rs->rs_msg);
+ LASSERT (req->rq_repmsg == rs->rs_msg);
LASSERT (rs->rs_cb_id.cbid_fn == reply_out_callback);
LASSERT (rs->rs_cb_id.cbid_arg == rs);
req->rq_repmsg->type = req->rq_type;
req->rq_repmsg->status = req->rq_status;
- req->rq_repmsg->opc = req->rq_reqmsg->opc;
+ req->rq_repmsg->opc = req->rq_reqmsg ? req->rq_reqmsg->opc : 0;
if (req->rq_export == NULL)
conn = ptlrpc_get_connection(&req->rq_peer, NULL);
atomic_inc (&svc->srv_outstanding_replies);
- rc = ptl_send_buf (&rs->rs_md_h, req->rq_repmsg, req->rq_replen,
+ rc = svcsec_authorize(req);
+ if (rc) {
+ CERROR("Error wrap reply message "LPX64"\n", req->rq_xid);
+ goto out;
+ }
+
+ rc = ptl_send_buf (&rs->rs_md_h, rs->rs_repbuf, rs->rs_repdata_len,
rs->rs_difficult ? PTL_ACK_REQ : PTL_NOACK_REQ,
&rs->rs_cb_id, conn,
svc->srv_rep_portal, req->rq_xid);
+out:
if (rc != 0) {
atomic_dec (&svc->srv_outstanding_replies);
request->rq_reqmsg->handle = request->rq_import->imp_remote_handle;
request->rq_reqmsg->type = PTL_RPC_MSG_REQUEST;
request->rq_reqmsg->conn_cnt = request->rq_import->imp_conn_cnt;
-
+
+ /* wrap_request might need to refresh gss cred, if this is called
+ * in ptlrpcd then the whole daemon thread will be waiting on
+ * gss negotiate rpc. FIXME
+ */
+ rc = ptlrpcs_cli_wrap_request(request);
+ if (rc)
+ GOTO(cleanup_bulk, rc);
+
LASSERT (request->rq_replen != 0);
- if (request->rq_repmsg == NULL)
- OBD_ALLOC(request->rq_repmsg, request->rq_replen);
- if (request->rq_repmsg == NULL)
- GOTO(cleanup_bulk, rc = -ENOMEM);
+ if (request->rq_repbuf == NULL) {
+ rc = ptlrpcs_cli_alloc_repbuf(request, request->rq_replen);
+ if (rc)
+ GOTO(cleanup_bulk, rc);
+ }
rc = PtlMEAttach(connection->c_peer.peer_ni->pni_ni_h,
request->rq_reply_portal, /* XXX FIXME bug 249 */
request->rq_timedout = 0;
request->rq_net_err = 0;
request->rq_resend = 0;
+ request->rq_ptlrpcs_restart = 0;
request->rq_restart = 0;
spin_unlock_irqrestore (&request->rq_lock, flags);
- reply_md.start = request->rq_repmsg;
- reply_md.length = request->rq_replen;
+ reply_md.start = request->rq_repbuf;
+ reply_md.length = request->rq_repbuf_len;
reply_md.threshold = 1;
reply_md.options = PTLRPC_MD_OPTIONS | PTL_MD_OP_PUT;
reply_md.user_ptr = &request->rq_reply_cbid;
request->rq_sent = LTIME_S(CURRENT_TIME);
ptlrpc_pinger_sending_on_import(request->rq_import);
rc = ptl_send_buf(&request->rq_req_md_h,
- request->rq_reqmsg, request->rq_reqlen,
+ request->rq_reqbuf, request->rq_reqdata_len,
PTL_NOACK_REQ, &request->rq_req_cbid,
connection,
request->rq_request_portal,
LASSERT (!request->rq_receiving_reply);
cleanup_repmsg:
- OBD_FREE(request->rq_repmsg, request->rq_replen);
- request->rq_repmsg = NULL;
+ ptlrpcs_cli_free_repbuf(request);
cleanup_bulk:
if (request->rq_bulk != NULL)
return (-ENOMEM);
}
+
+static int rawrpc_timedout(void *data)
+{
+ struct ptlrpc_request *req = (struct ptlrpc_request *) data;
+ unsigned long flags;
+
+ spin_lock_irqsave(&req->rq_lock, flags);
+ if (!req->rq_replied)
+ req->rq_timedout = 1;
+ spin_unlock_irqrestore(&req->rq_lock, flags);
+
+ return 1;
+}
+
+/* to make things as simple as possible */
+static int rawrpc_check_reply(struct ptlrpc_request *req)
+{
+ unsigned long flags;
+ int rc;
+
+ spin_lock_irqsave (&req->rq_lock, flags);
+ rc = req->rq_replied || req->rq_net_err || req->rq_err ||
+ req->rq_resend || req->rq_restart;
+ spin_unlock_irqrestore(&req->rq_lock, flags);
+ return rc;
+}
+
+/*
+ * Construct a fake ptlrpc_request to do the work, in order to
+ * user the existing callback/wakeup facilities
+ */
+int ptlrpc_do_rawrpc(struct obd_import *imp,
+ char *reqbuf, int reqlen,
+ char *repbuf, int *replenp,
+ int timeout)
+{
+ struct ptlrpc_connection *conn;
+ struct ptlrpc_request request; /* just a fake one */
+ ptl_handle_me_t reply_me_h;
+ ptl_md_t reply_md, req_md;
+ struct l_wait_info lwi;
+ unsigned long irq_flags;
+ int rc;
+ ENTRY;
+
+ LASSERT(imp);
+ class_import_get(imp);
+ if (imp->imp_state == LUSTRE_IMP_CLOSED) {
+ CWARN("raw rpc on closed imp(=>%s)? send anyway\n",
+ imp->imp_target_uuid.uuid);
+ }
+
+ conn = imp->imp_connection;
+
+ /* initialize request */
+ memset(&request, 0, sizeof(request));
+ request.rq_req_cbid.cbid_fn = request_out_callback;
+ request.rq_req_cbid.cbid_arg = &request;
+ request.rq_reply_cbid.cbid_fn = reply_in_callback;
+ request.rq_reply_cbid.cbid_arg = &request;
+ request.rq_reqbuf = reqbuf;
+ request.rq_reqbuf_len = reqlen;
+ request.rq_repbuf = repbuf;
+ request.rq_repbuf_len = *replenp;
+ request.rq_set = NULL;
+ spin_lock_init(&request.rq_lock);
+ init_waitqueue_head(&request.rq_reply_waitq);
+ atomic_set(&request.rq_refcount, 1000000); /* never be droped */
+ request.rq_xid = ptlrpc_next_xid();
+
+ /* add into sending list */
+ spin_lock_irqsave(&imp->imp_lock, irq_flags);
+ list_add_tail(&request.rq_list, &imp->imp_rawrpc_list);
+ spin_unlock_irqrestore(&imp->imp_lock, irq_flags);
+
+ /* prepare reply buffer */
+ rc = PtlMEAttach(conn->c_peer.peer_ni->pni_ni_h,
+ imp->imp_client->cli_reply_portal,
+ conn->c_peer.peer_id, request.rq_xid, 0, PTL_UNLINK,
+ PTL_INS_AFTER, &reply_me_h);
+ if (rc != PTL_OK) {
+ CERROR("PtlMEAttach failed: %d\n", rc);
+ LASSERT (rc == PTL_NO_SPACE);
+ GOTO(cleanup_imp, rc = -ENOMEM);
+ }
+
+ spin_lock_irqsave(&request.rq_lock, irq_flags);
+ request.rq_receiving_reply = 1;
+ spin_unlock_irqrestore(&request.rq_lock, irq_flags);
+
+ reply_md.start = repbuf;
+ reply_md.length = *replenp;
+ reply_md.threshold = 1;
+ reply_md.options = PTLRPC_MD_OPTIONS | PTL_MD_OP_PUT;
+ reply_md.user_ptr = &request.rq_reply_cbid;
+ reply_md.eq_handle = conn->c_peer.peer_ni->pni_eq_h;
+
+ rc = PtlMDAttach(reply_me_h, reply_md, PTL_UNLINK,
+ &request.rq_reply_md_h);
+ if (rc != PTL_OK) {
+ CERROR("PtlMDAttach failed: %d\n", rc);
+ LASSERT (rc == PTL_NO_SPACE);
+ GOTO(cleanup_me, rc = -ENOMEM);
+ }
+
+ /* prepare request buffer */
+ req_md.start = reqbuf;
+ req_md.length = reqlen;
+ req_md.threshold = 1;
+ req_md.options = PTLRPC_MD_OPTIONS;
+ req_md.user_ptr = &request.rq_req_cbid;
+ req_md.eq_handle = conn->c_peer.peer_ni->pni_eq_h;
+
+ rc = PtlMDBind(conn->c_peer.peer_ni->pni_ni_h,
+ req_md, PTL_UNLINK, &request.rq_req_md_h);
+ if (rc != PTL_OK) {
+ CERROR("PtlMDBind failed %d\n", rc);
+ LASSERT (rc == PTL_NO_SPACE);
+ GOTO(cleanup_me, rc = -ENOMEM);
+ }
+
+ rc = PtlPut(request.rq_req_md_h, PTL_NOACK_REQ, conn->c_peer.peer_id,
+ imp->imp_client->cli_request_portal,
+ 0, request.rq_xid, 0, 0);
+ if (rc != PTL_OK) {
+ CERROR("PtlPut failed %d\n", rc);
+ GOTO(cleanup_md, rc);
+ }
+
+ lwi = LWI_TIMEOUT(timeout * HZ, rawrpc_timedout, &request);
+ l_wait_event(request.rq_reply_waitq,
+ rawrpc_check_reply(&request), &lwi);
+
+ ptlrpc_unregister_reply(&request);
+
+ if (request.rq_err || request.rq_resend || request.rq_intr ||
+ request.rq_timedout || !request.rq_replied) {
+ CERROR("secinit rpc error: err %d, resend %d, "
+ "intr %d, timeout %d, replied %d\n",
+ request.rq_err, request.rq_resend, request.rq_intr,
+ request.rq_timedout, request.rq_replied);
+ rc = -EINVAL;
+ } else {
+ *replenp = request.rq_nob_received;
+ rc = 0;
+ }
+ GOTO(cleanup_imp, rc);
+
+cleanup_md:
+ PtlMDUnlink(request.rq_req_md_h);
+cleanup_me:
+ PtlMEUnlink(reply_me_h);
+cleanup_imp:
+ spin_lock_irqsave(&imp->imp_lock, irq_flags);
+ list_del_init(&request.rq_list);
+ spin_unlock_irqrestore(&imp->imp_lock, irq_flags);
+
+ class_import_put(imp);
+ RETURN(rc);
+}
#include <linux/obd_support.h>
#include <linux/obd_class.h>
#include <linux/lustre_net.h>
+#include <linux/lustre_sec.h>
#include <linux/fcntl.h>
int lustre_pack_request (struct ptlrpc_request *req,
int count, int *lens, char **bufs)
{
+ int rc;
ENTRY;
- req->rq_reqlen = lustre_msg_size (count, lens);
- OBD_ALLOC(req->rq_reqmsg, req->rq_reqlen);
- if (req->rq_reqmsg == NULL)
- RETURN(-ENOMEM);
+ req->rq_reqlen = lustre_msg_size(count, lens);
+ rc = ptlrpcs_cli_alloc_reqbuf(req, req->rq_reqlen);
+ if (rc)
+ RETURN(rc);
- lustre_init_msg (req->rq_reqmsg, count, lens, bufs);
+ lustre_init_msg(req->rq_reqmsg, count, lens, bufs);
RETURN (0);
}
int count, int *lens, char **bufs)
{
struct ptlrpc_reply_state *rs;
- int msg_len;
- int size;
+ int rc;
ENTRY;
- LASSERT (req->rq_reply_state == NULL);
-
- msg_len = lustre_msg_size (count, lens);
- size = offsetof (struct ptlrpc_reply_state, rs_msg) + msg_len;
- OBD_ALLOC (rs, size);
- if (rs == NULL)
- RETURN (-ENOMEM);
-
+ LASSERT(req->rq_reply_state == NULL);
+ LASSERT(req->rq_svcsec);
+ LASSERT(req->rq_repmsg == NULL);
+
+ req->rq_replen = lustre_msg_size(count, lens);
+ rc = svcsec_alloc_repbuf(req->rq_svcsec, req, req->rq_replen);
+ if (rc)
+ RETURN(rc);
+ LASSERT(req->rq_reply_state);
+ LASSERT(req->rq_repmsg == req->rq_reply_state->rs_msg);
+
+ rs = req->rq_reply_state;
+ rs->rs_svcsec = svcsec_get(req->rq_svcsec);
rs->rs_cb_id.cbid_fn = reply_out_callback;
rs->rs_cb_id.cbid_arg = rs;
rs->rs_srv_ni = req->rq_rqbd->rqbd_srv_ni;
- rs->rs_size = size;
INIT_LIST_HEAD(&rs->rs_exp_list);
INIT_LIST_HEAD(&rs->rs_obd_list);
- req->rq_replen = msg_len;
- req->rq_reply_state = rs;
- req->rq_repmsg = &rs->rs_msg;
- lustre_init_msg (&rs->rs_msg, count, lens, bufs);
+ lustre_init_msg(rs->rs_msg, count, lens, bufs);
PTLRPC_RS_DEBUG_LRU_ADD(rs);
void lustre_free_reply_state (struct ptlrpc_reply_state *rs)
{
+ struct ptlrpc_svcsec *svcsec = rs->rs_svcsec;
+
PTLRPC_RS_DEBUG_LRU_DEL(rs);
LASSERT (!rs->rs_difficult || rs->rs_handled);
LASSERT (rs->rs_nlocks == 0);
LASSERT (list_empty(&rs->rs_exp_list));
LASSERT (list_empty(&rs->rs_obd_list));
+ LASSERT (svcsec);
+
+ if (svcsec->free_repbuf)
+ svcsec->free_repbuf(svcsec, rs);
+ else
+ svcsec_free_reply_state(rs);
- OBD_FREE (rs, rs->rs_size);
+ svcsec_put(svcsec);
}
/* This returns the size of the buffer that is required to hold a lustre_msg
__swab32s(&rsd->rsd_ngroups);
}
+ if (rsd->rsd_ngroups > LUSTRE_MAX_GROUPS) {
+ CERROR("%u groups is not allowed\n", rsd->rsd_ngroups);
+ return NULL;
+ }
+
if (m->buflens[offset] !=
sizeof(*rsd) + rsd->rsd_ngroups * sizeof(__u32)) {
CERROR("bufflen %u while contains %u groups\n",
(LDLM_LAST_OPC - LDLM_FIRST_OPC) +
(MDS_LAST_OPC - MDS_FIRST_OPC) +
(OST_LAST_OPC - OST_FIRST_OPC));
+ } else if (opc < SEC_LAST_OPC) {
+ /* Security negotiate */
+ return (opc - SEC_FIRST_OPC +
+ (PTLBD_LAST_OPC - PTLBD_FIRST_OPC) +
+ (LDLM_LAST_OPC - LDLM_FIRST_OPC) +
+ (MDS_LAST_OPC - MDS_FIRST_OPC) +
+ (OST_LAST_OPC - OST_FIRST_OPC) +
+ (OBD_LAST_OPC - OBD_FIRST_OPC));
} else {
/* Unknown Opcode */
return -1;
(LDLM_LAST_OPC - LDLM_FIRST_OPC) + \
(MDS_LAST_OPC - MDS_FIRST_OPC) + \
(OST_LAST_OPC - OST_FIRST_OPC) + \
- (OBD_LAST_OPC - OBD_FIRST_OPC))
+ (OBD_LAST_OPC - OBD_FIRST_OPC) + \
+ (SEC_LAST_OPC - SEC_FIRST_OPC))
enum {
PTLRPC_REQWAIT_CNTR = 0,
EXPORT_SYMBOL(ptlrpc_error);
EXPORT_SYMBOL(ptlrpc_resend_req);
EXPORT_SYMBOL(ptl_send_rpc);
+EXPORT_SYMBOL(ptlrpc_do_rawrpc);
/* client.c */
EXPORT_SYMBOL(ptlrpc_init_client);
#include <linux/obd_support.h>
#include <linux/obd_class.h>
#include <linux/lustre_net.h>
+#include <linux/lustre_sec.h>
#include <linux/lustre_log.h>
#include <portals/types.h>
#include "ptlrpc_internal.h"
static void
ptlrpc_free_server_req (struct ptlrpc_request *req)
{
+ if (req->rq_svcsec) {
+ svcsec_cleanup_req(req);
+ svcsec_put(req->rq_svcsec);
+ req->rq_svcsec = NULL;
+ }
+
/* The last request to be received into a request buffer uses space
* in the request buffer descriptor, otherwise requests are
* allocated dynamically in the incoming reply event handler */
struct timeval work_start;
struct timeval work_end;
long timediff;
- int rc;
+ enum ptlrpcs_error sec_err;
+ int secrc, rc;
ENTRY;
spin_lock_irqsave (&svc->srv_lock, flags);
/* Clear request swab mask; this is a new request */
request->rq_req_swab_mask = 0;
#endif
- rc = lustre_unpack_msg (request->rq_reqmsg, request->rq_reqlen);
+
+ /* go through security check/transform */
+ request->rq_auth_uid = -1;
+ secrc = svcsec_accept(request, &sec_err);
+ switch(secrc) {
+ case SVC_OK:
+ CDEBUG(D_SEC, "request accepted ok\n");
+ break;
+ case SVC_COMPLETE:
+ target_send_reply(request, 0, OBD_FAIL_MDS_ALL_REPLY_NET);
+ goto put_conn;
+ case SVC_DROP:
+ goto out;
+ case SVC_LOGIN:
+ case SVC_LOGOUT:
+ break;
+ default:
+ LBUG();
+ }
+
+ rc = lustre_unpack_msg(request->rq_reqmsg, request->rq_reqlen);
if (rc != 0) {
CERROR ("error unpacking request: ptl %d from %s"
" xid "LPU64"\n", svc->srv_req_portal,
ptlrpc_peernid2str(&request->rq_peer, str),
- request->rq_xid);
+ request->rq_xid);
goto out;
}
CDEBUG((timediff / 1000000 > (long)obd_timeout) ? D_ERROR : D_HA,
"request "LPU64" opc %u from NID %s processed in %ldus "
- "(%ldus total)\n", request->rq_xid, request->rq_reqmsg->opc,
+ "(%ldus total)\n", request->rq_xid,
+ request->rq_reqmsg ? request->rq_reqmsg->opc : 0,
ptlrpc_peernid2str(&request->rq_peer, str),
timediff, timeval_sub(&work_end, &request->rq_arrival_time));
- if (svc->srv_stats != NULL) {
+ if (svc->srv_stats != NULL && request->rq_reqmsg != NULL) {
int opc = opcode_offset(request->rq_reqmsg->opc);
if (opc > 0) {
LASSERT(opc < LUSTRE_MAX_OPCODES);
" o%d NID %s\n",
rs,
rs->rs_xid, rs->rs_transno,
- rs->rs_msg.opc,
+ rs->rs_msg->opc,
ptlrpc_peernid2str(&exp->exp_connection->c_peer, str));
#endif
}
--- /dev/null
+.Xrefs
+config.log
+config.status
+configure
+Makefile
+.deps
+TAGS
+.*.cmd
+autoMakefile.in
+autoMakefile
+*.ko
+*.mod.c
+.*.o.flags
+.tmp_versions
+.depend
--- /dev/null
+MODULES := ptlrpcs
+ptlrpcs-objs := sec.o sec_null.o svcsec.o svcsec_null.o upcall_cache.o
+
+@GSS_TRUE@subdir-m += gss
+
+@INCLUDE_RULES@
--- /dev/null
+# Copyright (C) 2004 Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include $(src)/../portals/Kernelenv
+
+obj-y += ptlrpcs.o
+ptlrpcs-objs := sec.o sec_null.o svcsec.o svcsec_null.o upcall_cache.o
+
--- /dev/null
+# Copyright (C) 2004 Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+if GSS
+SUBDIRS = . gss #kcrypto
+endif
+
+if LIBLUSTRE
+noinst_LIBRARIES = libptlrpcs.a
+libptlrpcs_a_SOURCES = sec.c sec_null.c svcsec.c svcsec_null.c
+libptlrpcs_a_CPPFLAGS = $(LLCPPFLAGS)
+libptlrpcs_a_CFLAGS = $(LLCFLAGS)
+endif
+
+if MODULES
+modulefs_DATA = ptlrpcs$(KMODEXT)
+endif
+
+DIST_SOURCES = $(ptlrpcs-objs:.o=.c)
+MOSTLYCLEANFILES = *.o *.ko *.mod.c
--- /dev/null
+#LyX 1.3 created this file. For more info see http://www.lyx.org/
+\lyxformat 221
+\textclass article
+\language english
+\inputencoding auto
+\fontscheme times
+\graphics default
+\paperfontsize 12
+\spacing single
+\papersize Default
+\paperpackage a4
+\use_geometry 0
+\use_amsmath 0
+\use_natbib 0
+\use_numerical_citations 0
+\paperorientation portrait
+\secnumdepth 3
+\tocdepth 3
+\paragraph_separation skip
+\defskip medskip
+\quotes_language english
+\quotes_times 2
+\papercolumns 1
+\papersides 1
+\paperpagestyle default
+
+\layout Title
+
+High Level Design of Client-OSS Connection
+\layout Author
+
+Peter Braam, Eric Mei
+\layout Date
+
+Feb 13, 2005
+\layout Section
+
+Requirements
+\layout Itemize
+
+Establish gss connections between clients and OSS.
+\layout Itemize
+
+Establish gss connections between servers.
+\layout Section
+
+Functional Specification
+\layout Standard
+
+In Lustre system, there are several kinds of connections and security options
+ can be chosen separately:
+\layout Itemize
+
+between client and MDS's
+\layout Itemize
+
+between client and OSS's
+\layout Itemize
+
+between MDS's
+\layout Itemize
+
+between MDS's and OSS's
+\layout Standard
+
+Currently we are able to establish secure connections between the client
+ and MDS's, simply by adding a mount parameter 'sec=sec_flavor', here sec_flavor
+ could be 'krb5i' or 'krb5p' for this moment.
+ Now we also need the secure connections between client and OSS's to be
+ an option, to prepare for the coming object security features.
+ So the original mount option 'sec' will be broken into 2 options: 'mds_sec'
+ and 'oss_sec'.
+\layout Itemize
+
+mount.lustre should be able to recognize options 'mds_sec=sec_flavor' and
+ 'oss_sec=sec_flavor'.
+\layout Itemize
+
+lmt should be able to add 'mds_sec' and 'oss_sec' into xml file and recognizable
+ by lconf.
+ And lconf should be able to write this info into config log with option
+ --write-conf.
+\layout Standard
+
+Usually we consider MDS and OSS are trusted nodes, but networks are normally
+ not secure.
+ So connections of MDS <=> MDS and MDS <=> OSS must be secure in most cases.
+ We should also provide security on connections between servers.
+\layout Standard
+
+For inter MDS's and MDS's to OSS's, We provide options for lconf and lmt,
+ just like client <=> OSS's case:
+\layout Itemize
+
+lconf should be able to recognize options '--inter_mds_sec=sec_flavor' and
+ '--mds_oss_sec=sec_flavor'.
+\layout Itemize
+
+lmt should be able to add 'inter_mds_sec' and 'mds_oss_sec' into xml file
+ and recognizable by lconf.
+\layout Standard
+
+Servers will have options to accept only certain types of connections.
+ When setup OSS/MDS via lconf, option '--deny_sec=sec_flavor[,sec_flavor...]'
+ should be recognized and notify OSS/MDS kernel.
+ Currently sec_flavor could be 'null', 'krb5i', or 'krb5p'.
+\layout Standard
+
+Maybe privacy connections to the OSS servers are only needed from the MDS,
+ since there will be no secret transfer between OSS and client.
+ And if we in the future support mixed security type in single security
+ context, then integrity type might be enough for most cases.
+ But anyway we provide the flexibility here.
+\layout Section
+
+Use Cases
+\layout Subsection
+
+Mount lustre at client
+\layout Enumerate
+
+Sysadmin add options into config: lmt --mds_sec krb5p --oss_sec krb5i config.xml.
+ And setup OSS/MDS ready.
+\layout Enumerate
+
+User mount lustre by 'mount -t lustre server:/mds1/client /mnt/lustre'
+\layout Enumerate
+
+Connections to MDS's are privacy protected, connections to OSS's are integrity
+ protected.
+\layout Enumerate
+
+User umount lustre.
+\layout Enumerate
+
+User mount lustre by 'mount -t lustre -o mds_sec=krb5i,oss_sec=krb5p server:/mds
+1/client /mnt/lustre'
+\layout Enumerate
+
+Connections to MDS's are integrity protected, connections to OSS's are privacy
+ protected.
+\layout Enumerate
+
+User umount lustre.
+\layout Enumerate
+
+User mount lustre by 'mount -t lustre -o mds_sec=krb5p,oss_sec=krb5p server:/mds
+1/client /mnt/lustre
+\layout Enumerate
+
+Connections to all MDS's and OSS's are privacy protected.
+\layout Subsection
+
+Startup MDS
+\layout Enumerate
+
+Sysadmin add options into config: lmt --inter_mds_sec krb5p --mds_oss_sec
+ krb5p config.xml
+\layout Enumerate
+
+Sysadmin start mds by: lconf --node mds config.xml.
+\layout Enumerate
+
+Connections between MDS's and MDS's to OSS's are privacy protected.
+\layout Enumerate
+
+Sysadmin stop MDS's.
+\layout Enumerate
+
+Sysadmin start mds again by: lconf --node mds --inter_mds_sec=krb5i --mds_oss_se
+c=krb5p config.xml.
+\layout Enumerate
+
+Connections between MDS's are integrity protected, while MDS's to OSS's
+ are privacy protected.
+\layout Subsection
+
+Deny certain type of connection
+\layout Enumerate
+
+Sysadmin start OSS's by 'lconf --node ost1 --deny_sec=null config.xml'
+\layout Enumerate
+
+Sysadmin start MDS's by 'lconf --node mds1 --mds_oss_sec=null config.xml',
+ setup will fail because OST reject connection from MDS's.
+\layout Enumerate
+
+Sysadmin start MDS's by 'lconf --node mds1 --deny_sec=null --mds_oss_sec=krb5i
+ config.xml', will succeed.
+\layout Enumerate
+
+Client mount by 'mount -t lustre -o mds_sec=null server:/mds1/client /mnt/lustre
+' or 'mount -t lustre -o oss_sec=null server:/mds1/client /mnt/lustre' will
+ fail because either MDS's or OSS's will reject connection.
+\layout Enumerate
+
+Client mount by 'mount -t lustre -o mds_sec=krb5i,oss_sec=krb5i server:/mds1/cli
+ent /mnt/lustre' will succeed.
+\layout Section
+
+Logic Specification
+\layout Standard
+
+With Kerberos, each service provider needs a service principal, and a correspond
+ing service key installed.
+ Usually the principal is bound to certain host for security.
+ For example, currently lustre service principal is 'lustre/hostname@REALM'.
+ While in clustered MDS case, we should use single principal for all MDS's,
+ to minimize the administrator burden.
+ It should be 'lustre@REALM' for all MDS's.
+ Now we should break 'lustre@REALM' into 2 principals: 'mds@REALM' for MDS
+ and 'oss@REALM' for OSS.
+ All MDS's will be installed service key of 'mds@REALM', while all OSS's
+ will be installed service key of 'oss@REALM'.
+\layout Standard
+
+If MDS <=> MDS or MDS <=> OSS security is used, we also need start client
+ gss daemon (lgssd) on MDS's at proper time.
+ This needs to be incorporated into test scripts.
+\layout Standard
+
+The interaction between kernel gss module and lgssd need some modification,
+ which need to be notified the target service type (i.e.
+ mds or oss) to issue the correct gss request.
+\layout Standard
+
+Integrating security flavor setting into MDS's startup procedure and client's
+ mount procedure needs to be integrated into the MDS startup configuration
+ log.
+
+\layout Standard
+
+Additionally the MDS and OSS should have configuration options that provide
+ information on what kind of connections to accept.
+
+\layout Section
+
+State Management
+\layout Standard
+
+MDS nodes need run lgssd if gss is active on any inter-server connections.
+\layout Standard
+
+No disk format change.
+ No special recovery consideration.
+\layout Section
+
+Alternatives
+\layout Standard
+
+None.
+\layout Section
+
+Focus of Inspection
+\layout Itemize
+
+Are there more clean design/divide on those new options?
+\the_end
--- /dev/null
+#LyX 1.3 created this file. For more info see http://www.lyx.org/
+\lyxformat 221
+\textclass article
+\language english
+\inputencoding auto
+\fontscheme times
+\graphics default
+\paperfontsize 12
+\spacing single
+\papersize Default
+\paperpackage a4
+\use_geometry 0
+\use_amsmath 0
+\use_natbib 0
+\use_numerical_citations 0
+\paperorientation portrait
+\secnumdepth 3
+\tocdepth 3
+\paragraph_separation skip
+\defskip medskip
+\quotes_language english
+\quotes_times 2
+\papercolumns 1
+\papersides 1
+\paperpagestyle default
+
+\layout Title
+
+High Level Design of Remote UID/GID Handling
+\layout Author
+
+Peter Braam, Eric Mei
+\layout Date
+
+Jan 27, 2005
+\layout Section
+
+From the ERS (Engineering Requirements Spec, formerly Architecture)
+\layout Itemize
+
+Perform uid/gid translation between remote clients and local user database.
+\layout Itemize
+
+Handling client program calling setuid/setgid/setgroups syscalls to get
+ unusual privilege.
+\layout Itemize
+
+Handling supplementary groups membership.
+\layout Itemize
+
+Various security policies in situations with/without strong authentication
+ like Kerberos V5.
+\layout Paragraph
+
+NOTE:
+\layout Itemize
+
+remote clients may have different user database from that of MDS's.
+\layout Itemize
+
+The remote ACL issues is addressed by a separate module.
+\layout Itemize
+
+Most content of this document has been described in Lustre Book.
+\layout Standard
+
+The architecture prescribes a translation mechanism at the MDS: the MDS
+ will translate a locally found uid/gid, which is obtained through the kerberos
+ principal.
+\layout Section
+
+Functional Specification
+\layout Subsection
+
+Determine local/remote clients
+\layout Itemize
+
+
+\begin_inset Quotes eld
+\end_inset
+
+local
+\begin_inset Quotes erd
+\end_inset
+
+ client is the client node which is supposed to share the same user database
+ with MDS's.
+
+\layout Itemize
+
+
+\begin_inset Quotes eld
+\end_inset
+
+remote
+\begin_inset Quotes erd
+\end_inset
+
+ client is the client node which is supposed to have different user database
+ from MDS's.
+
+\layout Standard
+
+The MDS's will be able to determine that a client node is a local or remote
+ one, upon the client's first connection time to the MDS, and reply back
+ it's decision to client.
+ Later both MDS and client will make different operation decision according
+ to this flag.
+ This remote flag is per-client, not per user.
+ Once MDS made the decision, it will keep unchanged until client leave the
+ cluster membership (umount or so).
+\layout Standard
+
+MDS will do many conversion (mostly uid/gid mapping) for users on remote
+ clients because of the user database mismatch, and due to the nature of
+ this mismatch we have to put some limitation on users of remote clients,
+ compare to local clients.
+ Following sections have the details description.
+\layout Subsection
+
+Mapping uid/gid from clients
+\layout Standard
+
+For local client, obviously we don't need do any uid/gid mapping.
+ For remote clients, we need translate uid/gid in each request into one
+ which lives in local user database; and vice versa: translate uid/gid in
+ reply into the one in remote user database.
+ This translation affects the uid/gid's found in the inode as owner/group,
+ the security context which describes under what uid the MDS is executing
+ and in some cases (chown is a good example) the arguments of calls.
+\layout Standard
+
+Each MDS will have to access a uid-mapping database, which prescribed that:
+ which principal from which nid/netid should be mapped to which local uid.
+ The mapping database must be the same to every MDS to get consistent result.
+ During runtime, when a remote user authenticates with the MDS, the corresponding
+ mapping entry will be read from the on-disk database and cached in the
+ kernel via an upcall.
+ Note the same principal from different clients might be mapped to different
+ local user, according to the mapping database.
+ So on each MDS there's a per-client structure which maintained the uid
+ mapping cache.
+\layout Standard
+
+Each remote client must have nllu/nllg installed.
+ 'nllu' is for
+\begin_inset Quotes eld
+\end_inset
+
+Non Local Lustre User
+\begin_inset Quotes erd
+\end_inset
+
+, while 'nllg' for
+\begin_inset Quotes eld
+\end_inset
+
+Non Local Lustre Group
+\begin_inset Quotes erd
+\end_inset
+
+.
+ When client firstly mount a lustre fileset, it should notify MDS which
+ local uid/gid act as nllu/nllg.
+ MDS will translate those unrecognized uid/gid to this before send reply
+ to client.
+ Thus from the client's perspective, those files which belong to unauthorized
+ users will be shown as belonging to nllu/nllg.
+\layout Subsection
+
+Lustre security description (LSD)
+\layout Standard
+
+There's a security configure database on each MDS, which describes who(uid)
+ from where(nid/netid) have permission to setuid/setgid/setgroups.
+ Later we might add more into it.
+ the database must be the same to every MDS to get consistent result.
+\layout Standard
+
+LSD refers to the in-kernel data structure which describe an user's security
+ property on the MDS.
+ It roughly be defined as:
+\layout LyX-Code
+
+struct lustre_sec_desc {
+\layout LyX-Code
+
+ uid_t uid;
+\layout LyX-Code
+
+ gid_t gid;
+\layout LyX-Code
+
+ supp_grp_t supp_grp;
+\layout LyX-Code
+
+ setxid_desc setxid;
+\layout LyX-Code
+
+ /* more security tags added here */
+\layout LyX-Code
+
+};
+\layout Standard
+
+In the future we'll add more special security tag into it.
+ Each LSD entry correspond to an user in the local user database.
+ the 'setxid_desc' must have the ability to describe setuid/setgid/setgroups
+ permission for different clients respectively.
+\layout Standard
+
+LSD cache is populated via an upcall during runtime.
+ The user-level helper will be feed in uid as a parameter, and found out
+ this uid's principal gid and supplementary groups from local user database,
+ and find setxid permission bits and other security tags from on-disk security
+ database.
+\layout Standard
+
+Each LSD entry have limited expiration time, and will be flushed out when
+ expired.
+ Next request come from this user will result in the LSD be populated again,
+ with the uptodate security settings if changed.
+ System administrator also could choose to flush certain user's LSD forcibly.
+\layout Standard
+
+Every filesystem access request from client need go through checking of
+ LSD.
+ This checking is uid based, for those request coming from remote client,
+ uid will be mapped at first as described above, and then go to LSD.
+\layout Subsection
+
+The MDS security context
+\layout Standard
+
+All kernel-level service threads running on MDS are running as root, waiting
+ request from other nodes, and provide services.
+ But for those request to access filesystem for certain user, those threads
+ must act as the user, running as its identities.
+ Thus such a request comes in, we firstly collect the identity information
+ for this user as above described, include uid, gid, etc., then switch the
+ identity in the process context before really execute the filesystem operation;
+ we also need switch the root directory of process to the root of MDS's
+ backend filesystem.
+ after it finished, we switch back to the original context, prepare to the
+ next service.
+\layout Standard
+
+For some request for special service like llog handling, special interaction
+ between MDSs, which don't represent any certain user, and require keeping
+ the root privilege.
+ In those situation we don't need do such context switch, also user identity
+ preparation.
+\layout Subsection
+
+Remote client cache flushing
+\layout Standard
+
+For a remote client, it should realize that those locally cached file's
+ owner information, e.g.
+ owner, group, is ever translated by server side, some mapping might be
+ stale as time goes on.
+ for example: a user newly authenticated, while some cached file which should
+ be owned by him still shows owner is
+\begin_inset Quotes eld
+\end_inset
+
+nllu
+\begin_inset Quotes erd
+\end_inset
+
+.
+ client must choose the proper time to flush that stale owner information,
+ to give user a consistent view.
+ All attribute locks held by clients must be given a revocation callback
+ when a new user connects.
+\layout Section
+
+Use Cases
+\layout Subsection
+
+Connect rpc from local realm (case 1)
+\layout Enumerate
+
+Alice doing 'mount'
+\layout Enumerate
+
+Alice sends the first ptlrpc request (MDS_CONNECT) without GSS security
+ to MDS;
+\layout Enumerate
+
+mds_handle() will initialize per-client structure, clear the remote flag
+ in it;
+\layout Enumerate
+
+After successful connection done, the MDS send the remote flag back to client
+ for future usage in client side.
+\layout Subsection
+
+Connect rpc from local realm (case 2)
+\layout Enumerate
+
+Alice doing 'mount'
+\layout Enumerate
+
+Alice from a MDS local realm sends the first ptlrpc request (MDS_CONNECT)
+ with GSS security to MDS;
+\layout Enumerate
+
+MDS svcgssd will determine it's from a local realm client;
+\layout Enumerate
+
+mds_handle() will initialize per-client structure, clear the remote flag
+ in it;
+\layout Enumerate
+
+After successful connection done, MDS will send the remote flag back to
+ client for future usage in client side.
+\layout Subsection
+
+Connect rpc from remote realm
+\layout Enumerate
+
+Alice from a MDS remote realm sends the first ptlrpc request (MDS_CONNECT)
+ with GSS security to MDS, along with its nllu/nllg id number;
+\layout Enumerate
+
+MDS svcgssd will determine it's from a remote realm client;
+\layout Enumerate
+
+mds_handle() logic will initialize per-client structure:
+\begin_deeper
+\layout Enumerate
+
+Set the remote flag in it;
+\layout Enumerate
+
+Fill in the nllu/nllg ids obtained from client rpc request;
+\end_deeper
+\layout Enumerate
+
+After successful connection done, the MDS will send the remote flag back
+ to client for future usage in client side.
+\layout Subsection
+
+Filesystem access request
+\layout Enumerate
+
+Alice (from local or remote client) try to access a file in lustre
+\layout Enumerate
+
+If Alice is from remote client, MDS do uid/gid mapping; otherwise do nothing
+\layout Enumerate
+
+MDS obtain LSD item for Alice
+\layout Enumerate
+
+MDS perform permission check, based on LSD policies.
+\layout Enumerate
+
+MDS service process switch to this user's context
+\layout Enumerate
+
+MDS finish the file operation on behalf of Alice.
+\layout Enumerate
+
+MDS service process switch back original context
+\layout Enumerate
+
+If Alice is from remote client, MDS do uid/gid reverse mapping if needed.
+\layout Enumerate
+
+MDS send reply.
+\layout Subsection
+
+Rpc after setuid/setgid/setgroups from local clients
+\layout Enumerate
+
+Alice calls setuid/setgid/setgroups to change her identity to Bob in local
+ client node X;
+\layout Enumerate
+
+Bob (Alice in fact) tries to access a lustre file which belongs to Bob;
+\layout Enumerate
+
+MDS will verify the permission of Bob through local cached LSD configuration;
+\layout Enumerate
+
+MDS turns down or accept the file access request;
+\layout Subsection
+
+Rpc after setuid/setgid/setgroups from remote clients
+\layout Enumerate
+
+Alice calls setuid/setgid/setgroups to change her identity to Bob in remote
+ client node Y;
+\layout Enumerate
+
+Bob (Alice in fact) tries to access a lustre file which belongs to Bob;
+\layout Enumerate
+
+MDS will find Bob is from the remote realm and in fact he is not real Bob;
+\layout Enumerate
+
+MDS turns down the file access request;
+\layout Subsection
+
+Update LSD configuration in MDS
+\layout Enumerate
+
+Lustre system administrator hopes to update current LSD option;
+\layout Enumerate
+
+The sysadmin uses the lsd update utility which will update the on-disk security
+ database, and notify the changes of the LSD configuration to MDS;
+\layout Enumerate
+
+MDS re-fresh the cached LSD info through an upcall.
+\layout Subsection
+
+Revoke a local user
+\layout Enumerate
+
+Bob is able to access lustre filesystem
+\layout Enumerate
+
+Sysadmin remove Bob from the MDS's local user database, and flush in-kernel
+ LSD cache for Bob.
+\layout Enumerate
+
+Bob will not be able to access MDS immediately
+\layout Subsection
+
+Revoke a remote user
+\layout Enumerate
+
+Alice of a remote client is mapped to MDS local user Bob.
+\layout Enumerate
+
+Alice is able to access lustre filesystem
+\layout Enumerate
+
+Sysadmin remove the mapping
+\begin_inset Quotes eld
+\end_inset
+
+Alice->Bob
+\begin_inset Quotes erd
+\end_inset
+
+ from mapping database, and flush in-kernel mapping entry.
+\layout Enumerate
+
+Alice will not be able to access MDS immediately.
+\layout Enumerate
+
+If the mapping
+\begin_inset Quotes eld
+\end_inset
+
+anyone else -> Carol
+\begin_inset Quotes erd
+\end_inset
+
+ exist in the mapping database, Alice could reconnect to MDS and then will
+ be mapped to Carol.
+\layout Subsection
+
+Revoke a remote user (2)
+\layout Enumerate
+
+Alice of a remote client is mapped to MDS local user Bob.
+\layout Enumerate
+
+Alice is able to access lustre filesystem
+\layout Enumerate
+
+Sysadmin remove Bob from the MDS's local user database, and flush in-kernel
+ LSD cache for Bob.
+\layout Enumerate
+
+Alice will not be able to access MDS immediately.
+\layout Enumerate
+
+If the mapping
+\begin_inset Quotes eld
+\end_inset
+
+anyone else -> Carol
+\begin_inset Quotes erd
+\end_inset
+
+ exist in the mapping database, Alice could reconnect to MDS and then will
+ be mapped to Carol.
+\layout Subsection
+
+'ls -l' on remote client
+\layout Enumerate
+
+Suppose on a remote client, Alice's principal group is AliceGrp; Bob's principal
+ groups is BobGrp.
+\layout Enumerate
+
+there's several files on lustre: file_1 belongs to Alice:AliceGrp; file_2
+ belongs to Alice:BobGrp; file_3 belongs to Bob:AliceGrp; file_4 belongs
+ to Bob:BobGrp; file_5 belongs to Bob:nllg;
+\layout Enumerate
+
+Alice do 'ls -l', output like this: file_1 belongs to Alice:AliceGrp; file_2
+ belongs to Alice:nllg; file_3 belongs to nllu:AliceGrp; file_4 belongs
+ to nllu:nllg; file_5 belongs to nllu:nllg;
+\layout Enumerate
+
+Bob just login the client system, also do a 'ls -l', output like this: file_1
+ belongs to Alice:AliceGrp; file_2 belongs to Alice:Bobgrp; file_3 belongs
+ to Bob:AliceGrp; file_4 belongs to Bob:BobGrp; file_5 belongs to Bob:nllg;
+\layout Enumerate
+
+Alice do 'ls -l' again, output is the same as Bob's list.
+\layout Enumerate
+
+Alice logout, then Bob do a 'ls -l' again, output like this: file_1 belongs
+ to nllu:nllg; file_2 belongs to nllu:Bobgrp; file_3 belongs to Bob:nllg;
+ file_4 belongs to Bob:BogGrp; file_5 belongs to Bob:nllg;
+\layout Subsection
+
+Chown on remote client
+\layout Enumerate
+
+Root user on a remote client want to change the owner of a file to Bob,
+ while Bob didn't login(authenticated with lustre) yet.
+\layout Enumerate
+
+MDS can't find the mapping for the destination uid, so return error.
+\layout Enumerate
+
+Bob login at that time.
+\layout Enumerate
+
+Root do the same chown again.
+\layout Enumerate
+
+MDS will grant the request, no matter what the original owner of this file
+ is.
+\layout Subsection
+
+Chgrp on remote client
+\layout Enumerate
+
+Traditional chgrp on remote client is not allowed, since there's no clear
+ group id mapping between local and remote database.
+ so the group id on the remote client is not meaningful on the MDS.
+\layout Section
+
+Logic Specification
+\layout Subsection
+
+Specify nllu/nllg
+\layout Standard
+
+When client do mount, in addition to other parameter, user need supply with
+ the IDs of nllu/nllg on this client, which will be sent to the MDS at
+ connecting time.
+ If no nllu/nllg explicitly supplied, default values will be used.
+\layout Subsection
+
+Determine local or remote client
+\layout Standard
+
+Under GSS protection, user could explicitly supply the remote flag during
+ mount time.
+ MDS make decision as following order:
+\layout Itemize
+
+All permitted connections without GSS security are from local realm clients.
+\layout Itemize
+
+All connections with GSS security, if user supplied remote flag during mount,
+ MDS will grant the flag as requested.
+\layout Itemize
+
+All connections with GSS/local_realm_kerberos are from local realm clients.
+\layout Itemize
+
+All connections with GSS/remote_realm_kerberos are from remote realm clients.
+\layout Standard
+
+Here we made the assumption that: kerberos's local/remote realm == lustre's
+ local/remote realm.
+ Later we might bring in more factors into this decision making.
+\layout Standard
+
+GSS/Kerberos module is responsible to provide the information that the initial
+ connect request whether has strong security; whether from remote kerberos
+ realm.
+\layout Standard
+
+On MDS's, the per-client export structure has a flag to indicate local/remote
+ of this client.
+ Accordingly, each client has a similar flag, which is send back by MDS's
+ after initial connection.
+\layout Subsection
+
+Handle local rpc request
+\layout Standard
+
+For each filesystem access request from client, we will get LSD for this
+ uid at first.
+ We then lookup in the cache, if not found or already invalid, issue a upcall
+ to get it.
+ If finally failed to get LSD(timeout or got an error), we simply deny this
+ request.
+\layout Standard
+
+After obtained LSD, we also check whether the client intend to do setuid/setgid/
+setgroups.
+ If yes, check the permission bits in LSD, if not allow we also deny this
+ request.
+ The intention of setuid/setgid could be detected by compare the uid, gid,
+ fsuid, fsgid sent by client, and the local authorized uid/gid.
+\layout Standard
+
+If setgroups is permitted: for root we'll directly use the supplementary
+ groups array sent by client; for normal user we compare those sent by client
+ with those in LSD, guarantee client only could reduce the array (can't
+ add new ids which is not part of group array in LSD).
+\layout Standard
+
+If setgroups is not permitted, we simply use the supplementary group array
+ provided by LSD.
+\layout Standard
+
+After all security context prepared as above, we switch it into process
+ context, perform the actual filesystem operation.
+ after finished, switch back the original context.
+ send reply out to client.
+\layout Standard
+
+Later an special security policy is needed to allow RAW access by FID without
+ a capability.
+ This is used for analyzing audit logs, finding pathnames from fids (for
+ recovery) etc.
+\layout Subsection
+
+Remote user mapping database
+\layout Standard
+
+There will be a user mapping configuration file on MDS, already defined
+ in
+\begin_inset Quotes eld
+\end_inset
+
+functional specification
+\begin_inset Quotes erd
+\end_inset
+
+.
+ MDS kernel will also maintain a cache of this mapping information.
+ It is populated by upcall to server side gss daemon, along with the gss
+ credential information.
+
+\layout Itemize
+
+The on-disk mapping database only described how user(principal) is mapped
+ to an local uid, and don't need specify the gid mapping.
+\layout Itemize
+
+Both on-disk mapping database and kernel mapping cache should be able to
+ allow map all other remote users to a certain local user.
+\layout Itemize
+
+On the MDS, the per-client structure will maintain this mapping cache.
+ When a user from remote client get authenticated, we check the on-disk
+ mapping database.
+ If no mapping items for this user found, we'll deny this user.
+ otherwise we record the target uid.
+\layout Itemize
+
+When a fs access request come from remote client, it contains the user's
+ uid, gid on the remote client.
+ Here we can establish mapping for uid and target uid.
+ With target uid we can find the target gid from local user database (from
+ LSD), thus we can also establish the mapping for gid and target gid.
+\layout Itemize
+
+With mapping we established above, we now do the mapping: replace the uid/gid
+ in the rpc request with target uid/gid.
+ If it request chown we also check & map the new owner id.
+\layout Itemize
+
+When reply populated and about to send back, we again check the mapping
+ cache, and do the reverse mapping if in the case which return file attributes
+ to clients.
+ For those can't find the matched items, map them to nllu/nllg of this remote
+ client.
+\layout Subsection
+
+Handle remote rpc request
+\layout Standard
+
+The overall process of handle remote rpc request is the same as for local
+ user, except following:
+\layout Itemize
+
+For incoming request, firstly do the uid/gid mapping for the requestor;
+ and do reverse mapping for the reply, as described above.
+\layout Itemize
+
+No setuid/setgid/setgroups intention is permitted, except we explicitly
+ allow setuid-root in setxid database.
+ And so we ignore the supplementary groups sent by client(if any), and simply
+ use the one provided by LSD.
+\layout Itemize
+
+For chown request, we also do translation for the new owner id (already
+ described above) according to the in-kernel mapping cache.
+ It means the root user on remote client can't change owner of a file to
+ a user which is not login yet.
+\layout Itemize
+
+Deny all chgrp request, since the group on remote client has no clear mapping
+ on MDS's local user database (We also could choose allow this when the
+ new group id shows up in the in-kernel mapping cache, but it seems it doesn't
+ make much sense).
+ So we probably need a special tool like
+\begin_inset Quotes eld
+\end_inset
+
+lfs chgrp
+\begin_inset Quotes erd
+\end_inset
+
+ to perform chgrp on remote client, which will send out text name instead
+ of translate to id locally.
+\layout Subsection
+
+Remote client cache flushing
+\layout Standard
+
+Anytime there might be inodes cached and their owner belongs to nllu/nllg.
+ If a new user Alice get authenticated and she happens to be the owner of
+ those inodes, we need to refresh those inodes even if their cache status
+ is correct, otherwise Alice will find her files belong to others.
+ Since we don't know whether a inode with nllu/nllg belongs to Alice or
+ not, we must flush all of them.
+\layout Standard
+
+On MDS, a callback or similar event notification mechanism should be hooked
+ into gss module.
+ When a user authenticated at the first time, we should iterate through
+ all the granted lock corresponding to this client, and revoke them selectively.
+ Strictly speaking we only want to revoke those inodebits lock and the owner/gro
+up of their resource (inode) not show up in the in-kernel mapping database,
+ but here we just flush all the inodebits locks, a cache is quickly re-populated
+ - there are a maximum of 20-100 cached locks on clients at the moment.
+\layout Standard
+
+When Alice logs out of the client system, we also do the similar things:
+ iterate through all the granted lock corresponding to this client, and
+ revoke them selectively.
+ Here we want to revoke those inodebits locks and the owner/group of their
+ resource(inode) is Alice.
+ We also could choose flush all of them like above case.
+\layout Subsection
+
+LSD upcall
+\layout Standard
+
+There is a general upcall-cache code which do upcall into user space, and
+ cache data passed down in kernel, and also implemented timeout invalidation.
+ Kernel LSD could simply be implemented as a instance of it.
+ So it will be quite simple.
+\layout Standard
+
+A user-space tools should provide following functionality:
+\layout Itemize
+
+Accept uid as parameter
+\layout Itemize
+
+Obtain the gid and supplementary group id array which the uid belongs to; if
+ this fails just return an error.
+\layout Itemize
+
+Obtain the setxid permission bits for this user on this NID from the database.
+ If not found a default bitset will be applied: (1) for local client: setuid/set
+gid is off, setgroups for root is off, setgroups for normal user is on;
+ (2) for remote client: all of setuid/setgid/setgroups is off.
+\layout Itemize
+
+Pass all the collected information back to kernel by /proc.
+\layout Standard
+
+Since the upcall could happen concurrently, and admin could modified it
+ at anytime, so a kind of read-write lock need to be done on the database
+ file.
+\layout Subsection
+
+Recovery consideration
+\layout Standard
+
+All the code here should have minimal effect on recovery.
+ After MDS's crash, security context will be established during connection
+ time in recovery; and uid-mapping cache and LSD actually are
+\begin_inset Quotes eld
+\end_inset
+
+adaptive
+\begin_inset Quotes erd
+\end_inset
+
+, they will also be re-populated when handling related user's replay request
+ during/after recovery.
+\layout Section
+
+State Management
+\layout Subsection
+
+configuration states
+\layout Itemize
+
+Client has a remote flag at mount time.
+\layout Itemize
+
+Remote clients must have nllu:nllg installed.
+ it could simply be nobody:nobody.
+\layout Itemize
+
+MDS could have a remote-user mapping database which contains which principal
+ at with client should be mapped to which local user.
+ Without the database no remote client is allowed to connect.
+\layout Itemize
+
+MDS could have a security database which contains setxid permissions along
+ with other security setting for each affected user.
+ No such database then a default setting will be applied.
+\layout Subsection
+
+LSD entry states transition
+\layout Enumerate
+
+NEW: generated and submit to upcall
+\layout Enumerate
+
+READY: ready to serve
+\layout Enumerate
+
+INVALID: expired or error
+\layout Standard
+
+The requestor will initiate a NEW LSD entry; after the upcall successfully fills
+ in data it changes to READY; if a timeout or some error happens (e.g.
+ not found in the user database) during the upcall it changes to INVALID; a READY
+ LSD will change to INVALID when expired, flushed forcibly by the sysadmin,
+ or on MDS shutdown; an INVALID LSD will soon be destroyed.
+\layout Standard
+
+No disk format changed.
+ When a large number of users access lustre from all kinds of local/remote
+ clients at the same time, the MDS will have more CPU and memory overhead,
+ especially for remote users.
+ No special recovery consideration.
+
+\layout Section
+
+Alternatives
+\layout Subsection
+
+NFSv4
+\layout Standard
+
+NFSv4 sends user and groups by name.
+\layout Section
+
+Focus of Inspection
+\layout Itemize
+
+Could this pass HP acceptance test?
+\layout Itemize
+
+Any is not reasonable? Any security hole?
+\layout Itemize
+
+Everything recoverable from MDS/client crash?
+\the_end
--- /dev/null
+#LyX 1.3 created this file. For more info see http://www.lyx.org/
+\lyxformat 221
+\textclass article
+\language english
+\inputencoding auto
+\fontscheme times
+\graphics default
+\paperfontsize 12
+\spacing single
+\papersize Default
+\paperpackage a4
+\use_geometry 0
+\use_amsmath 0
+\use_natbib 0
+\use_numerical_citations 0
+\paperorientation portrait
+\secnumdepth 3
+\tocdepth 3
+\paragraph_separation skip
+\defskip medskip
+\quotes_language english
+\quotes_times 2
+\papercolumns 1
+\papersides 1
+\paperpagestyle default
+
+\layout Title
+
+High Level Design of User Revoke
+\layout Author
+
+Peter Braam, Eric Mei
+\layout Date
+
+Jan 31, 2005
+\layout Section
+
+Requirement
+\layout Itemize
+
+Be able to revoke a user, prevent it from accessing lustre immediately.
+\layout Itemize
+
+Be able to pass sub-test of HP acceptance 4.1.51.
+\layout Itemize
+
+user & mapping databases manipulation API.
+\layout Section
+
+Functional Specification
+\layout Standard
+
+A sub-command
+\begin_inset Quotes eld
+\end_inset
+
+revoke
+\begin_inset Quotes erd
+\end_inset
+
+ will be added into existing tool 'lctl'.
+ When system administrator want to kick somebody off from lustre filesystem
+ (e.g.
+ a certain user has known be malicious or an account be compromised), he
+ could use this functionality on MDS's to prevent the victim user from access
+ lustre filesystem right away.
+ The command format could be:
+\layout LyX-Code
+
+lctl revoke user|all
+\layout Itemize
+
+Here the 'user' format is: uid[@nid[.netid]]
+\layout Itemize
+
+option @nid.netid is only for remote users.
+ The uid is in terms of the local uid, thus 'uid@remote_nid.netid' means remote
+ users on node 'remote_nid.netid' who are mapped to local 'uid'; it is not
+ intended to remove a certain user on a specific node.
+\layout Itemize
+
+Specified uid without nid or netid means match all nid or netid.
+\layout Itemize
+
+'all' means revoke all users.
+\layout Standard
+
+Actually lctl only remove those in-kernel cache for the victim user, usually
+ there's many other configuration work need to be done by using other admin
+ tools:
+\layout Itemize
+
+Kerberos Database: For removing a user from kerberos principal database,
+ sysadmin must use kerberos admin tools.
+ And this change will not take effect right away if the victim user has
+ authenticated with MDS's before the removal (because of client side credential
+ cache).
+\layout Itemize
+
+User Database: For removing a user from user database, sysadmin also must
+ resort to other tools, usually standard unix tools.
+ This change will not take effect right away if this user had ever accessed
+ lustre before the removal (because of in-kernel LSD cache).
+\layout Itemize
+
+User Mapping Database: For removing a user from remote user mapping database,
+ sysadmin need edit the configure file manually.
+ This only affect certain user on certain remote client.
+ This change will not take effect right away if this user had ever accessed
+ lustre before the removal (because of in-kernel uid mapping cache).
+\layout Standard
+
+So when sysadmin actually revoke a user, he usually at first did one or
+ more steps of above according to requirement, then invoke lctl to finally
+ revoke the user.
+ In cases that user database or user mapping database are not centrally
+ managed by e.g.
+ LDAP, sysadmin must remove the user from all configure files on each MDS's,
+ this could be done by using 'pdsh', etc.
+\layout Standard
+
+What above described is the basic requirement.
+ There's an additional one: for user and mapping database, write a C API
+ library (probably later add python support), which can query, add, remove,
+ and enumerate users in each database.
+ 'edit' could be implemented as remove + add.
+\layout Standard
+
+By using this API, we could provide much complete functionality.
+ Sysadmin could do everything about user account within single lctl tools;
+ Kernel upcall helper also could use this API to obtain information from
+ mapping database, etc.
+\layout Section
+
+Use Cases
+\layout Subsection
+
+Revoke Alice's access right on all clients, permanently
+\layout Enumerate
+
+Sysadmin remove Alice from user database on all MDS's.
+\layout Enumerate
+
+Sysadmin invoke 'lctl revoke alice_uid' on all MDS's.
+\layout Enumerate
+
+Alice from local clients will not be able to access lustre.
+\layout Enumerate
+
+Any remote users who are mapped to Alice will not be able to access lustre.
+\layout Subsection
+
+Revoke Alice's access right on remote client remote1
+\layout Enumerate
+
+Suppose alice@remote1 is mapped to local user Bob.
+\layout Enumerate
+
+Sysadmin remove mapping entry of 'alice_uid@remote1 -> bob' from user mapping
+ database.
+\layout Enumerate
+
+Sysadmin invoke 'lctl revoke bob_uid@remote1' on all MDS's.
+\layout Enumerate
+
+Alice will not be able to access lustre from remote1.
+\layout Enumerate
+
+Bob from an local client could still work fine.
+\layout Section
+
+Logic Specification
+\layout Standard
+
+There's several kinds of in-kernel cache for certain user: LSD, gss context,
+ and uid-mapping.
+ In the future we might add consideration of removing OSS access capability.
+\layout Enumerate
+
+LSD: On each MDS, each user (uid) correspond to at most one LSD entry.
+ There's already an existing interface to flush LSD for a certain user:
+ simply write an uid into '/proc/fs/lustre/mds/lsd_flush' (Note this is
+ subject to change).
+ Write in '-1' will flush all LSD entries.
+\layout Enumerate
+
+GSS Context: On each MDS, each user (principal) might correspond to several(even
+ many) gss contexts.
+ The gss module should export a proc entry.
+ When provided uid and remote nid/netid, it should be able to find out the
+ initiating/established gss contexts and destroy them.
+ Providing a special tag will flush all gss contexts.
+\layout Enumerate
+
+UID Mapping: Firstly found out per-client structure for specified nid/netid,
+ then destroy the mapping entries for specified uid.
+ Since this is strongly related to GSS context, we can use the export proc
+ entry for gss context to initiate this flush.
+ Thus when sysadmin trying to flush gss contexts for certain user, we also
+ flush associated uid-mapping.
+\layout Standard
+
+This work should be done after the completion of GSS and remote uid/gid
+ handling implementation.
+\layout Standard
+
+The user and mapping databases manipulation API could be simple not much
+ restriction, and the details is very much related to the actual database
+ structure.
+ we leave the details to the following DLD document.
+\layout Section
+
+State Management
+\layout Standard
+
+Since we'll flush several cache separately, we might have situation that
+ not strictly consistency.
+ For example, after we flushed alice from cache1, someone re-populate it
+ in cache1 while do it on cache2.
+ In fact, the inconsistency between LSD and gss context is perfectly allowed.
+ Only one thing need be sure is: since uid mapping is established after
+ that of gss context, thus we need flush uid mapping at first, and then
+ flush gss context.
+ This could prevent unnecessary error when doing 'revoke' while we don't
+ actually remove it from the mapping database.
+\layout Standard
+
+No serious locking issues, no special recovery consideration.
+\layout Section
+
+Alternatives
+\layout Standard
+
+None.
+\layout Section
+
+Focus of Inspection
+\layout Itemize
+
+Is the lctl interface reasonably reflect the facts?
+\layout Itemize
+
+Could it pass acceptance test?
+\the_end
--- /dev/null
+.Xrefs
+config.log
+config.status
+configure
+Makefile
+.deps
+TAGS
+.*.cmd
+autoMakefile.in
+autoMakefile
+*.ko
+*.mod.c
+.*.o.flags
+.tmp_versions
+.depend
--- /dev/null
+#MODULES := ptlrpcs_gss ptlrpcs_gss_krb5
+MODULES := ptlrpcs_gss
+ptlrpcs_gss-objs := sec_gss.o svcsec_gss.o rawobj.o gss_mech_switch.o \
+ gss_generic_token.o gss_krb5_crypto.o gss_krb5_seal.o \
+ gss_krb5_unseal.o gss_krb5_seqnum.o gss_krb5_mech.o \
+ gss_krb5_wrap.o
+#ptlrpcs_gss_krb5-objs := gss_krb5_mech.o
+
+@INCLUDE_RULES@
--- /dev/null
+# Copyright (C) 2004 Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include $(src)/../../portals/Kernelenv
+
+#obj-y += ptlrpcs_gss.o ptlrpcs_gss_krb5.o
+obj-y += ptlrpcs_gss.o
+ptlrpcs_gss-objs := sec_gss.o svcsec_gss.o rawobj.o gss_mech_switch.o \
+ gss_generic_token.o gss_krb5_crypto.o gss_krb5_seal.o \
+ gss_krb5_unseal.o gss_krb5_seqnum.o gss_krb5_mech.o \
+ gss_krb5_wrap.o
+#ptlrpcs_gss_krb5-objs := gss_krb5_mech.o
--- /dev/null
+# Copyright (C) 2004 Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+if LIBLUSTRE
+noinst_LIBRARIES = libptlrpcs_gss.a
+libptlrpcs_gss_a_SOURCES = sec_gss.c gss_mech_switch.c gss_krb5_mech.c \
+ gss_generic_token.c gss_krb5_crypto.c \
+ gss_krb5_seal.c gss_krb5_unseal.c \
+ gss_krb5_seqnum.c rawobj.c
+
+libptlrpcs_gss_a_CPPFLAGS = $(LLCPPFLAGS)
+libptlrpcs_gss_a_CFLAGS = $(LLCFLAGS)
+endif
+
+if MODULES
+modulefs_DATA = ptlrpcs_gss$(KMODEXT)
+endif
+
+DIST_SOURCES = $(ptlrpcs_gss-objs:.o=.c) gss_internal.h gss_api.h gss_asn1.h \
+ gss_err.h gss_krb5.h
+MOSTLYCLEANFILES = *.o *.ko *.mod.c
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Modifications for Lustre
+ * Copyright 2004, Cluster File Systems, Inc.
+ * All rights reserved
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ * Somewhat simplified version of the gss api.
+ *
+ * Dug Song <dugsong@monkey.org>
+ * Andy Adamson <andros@umich.edu>
+ * Bruce Fields <bfields@umich.edu>
+ * Copyright (c) 2000 The Regents of the University of Michigan
+ *
+ * $Id: gss_api.h,v 1.2 2005/03/31 22:18:24 ericm Exp $
+ */
+
+#ifndef __SEC_GSS_GSS_API_H_
+#define __SEC_GSS_GSS_API_H_
+
+struct gss_api_mech;
+
+/* The mechanism-independent gss-api context: */
+struct gss_ctx {
+ struct gss_api_mech *mech_type;
+ void *internal_ctx_id;
+};
+
+#define GSS_C_NO_BUFFER ((rawobj_t) 0)
+#define GSS_C_NO_CONTEXT ((struct gss_ctx *) 0)
+#define GSS_C_NULL_OID ((rawobj_t) 0)
+
+/*XXX arbitrary length - is this set somewhere? */
+#define GSS_OID_MAX_LEN 32
+
+/* gss-api prototypes; note that these are somewhat simplified versions of
+ * the prototypes specified in RFC 2744. */
+__u32 kgss_import_sec_context(
+ rawobj_t *input_token,
+ struct gss_api_mech *mech,
+ struct gss_ctx **ctx_id);
+__u32 kgss_inquire_context(
+ struct gss_ctx *ctx_id,
+ __u64 *endtime);
+__u32 kgss_get_mic(
+ struct gss_ctx *ctx_id,
+ __u32 qop,
+ rawobj_t *message,
+ rawobj_t *mic_token);
+__u32 kgss_verify_mic(
+ struct gss_ctx *ctx_id,
+ rawobj_t *message,
+ rawobj_t *mic_token,
+ __u32 *qstate);
+__u32 kgss_wrap(
+ struct gss_ctx *ctx_id,
+ __u32 qop,
+ rawobj_buf_t *in_token,
+ rawobj_t *out_token);
+__u32 kgss_unwrap(
+ struct gss_ctx *ctx_id,
+ __u32 qop,
+ rawobj_t *in_token,
+ rawobj_t *out_token);
+__u32 kgss_delete_sec_context(
+ struct gss_ctx **ctx_id);
+
+struct subflavor_desc {
+ __u32 subflavor;
+ __u32 qop;
+ __u32 service;
+ char *name;
+};
+
+/* Each mechanism is described by the following struct: */
+struct gss_api_mech {
+ struct list_head gm_list;
+ struct module *gm_owner;
+ char *gm_name;
+ rawobj_t gm_oid;
+ atomic_t gm_count;
+ struct gss_api_ops *gm_ops;
+ int gm_sf_num;
+ struct subflavor_desc *gm_sfs;
+};
+
+/* and must provide the following operations: */
+struct gss_api_ops {
+ __u32 (*gss_import_sec_context)(
+ rawobj_t *input_token,
+ struct gss_ctx *ctx_id);
+ __u32 (*gss_inquire_context)(
+ struct gss_ctx *ctx_id,
+ __u64 *endtime);
+ __u32 (*gss_get_mic)(
+ struct gss_ctx *ctx_id,
+ __u32 qop,
+ rawobj_t *message,
+ rawobj_t *mic_token);
+ __u32 (*gss_verify_mic)(
+ struct gss_ctx *ctx_id,
+ rawobj_t *message,
+ rawobj_t *mic_token,
+ __u32 *qstate);
+ __u32 (*gss_wrap)(
+ struct gss_ctx *ctx,
+ __u32 qop,
+ rawobj_buf_t *in_token,
+ rawobj_t *out_token);
+ __u32 (*gss_unwrap)(
+ struct gss_ctx *ctx,
+ __u32 qop,
+ rawobj_t *in_token,
+ rawobj_t *out_token);
+ void (*gss_delete_sec_context)(
+ void *internal_ctx_id);
+};
+
+int kgss_mech_register(struct gss_api_mech *mech);
+void kgss_mech_unregister(struct gss_api_mech *mech);
+
+struct gss_api_mech * kgss_OID_to_mech(rawobj_t *);
+struct gss_api_mech * kgss_name_to_mech(char *name);
+struct gss_api_mech * kgss_subflavor_to_mech(__u32 subflavor);
+
+struct gss_api_mech * kgss_mech_get(struct gss_api_mech *);
+void kgss_mech_put(struct gss_api_mech *);
+
+#endif /* __SEC_GSS_GSS_API_H_ */
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Modifications for Lustre
+ * Copyright 2004, Cluster File Systems, Inc.
+ * All rights reserved
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ * minimal asn1 for generic encoding/decoding of gss tokens
+ *
+ * Adapted from MIT Kerberos 5-1.2.1 lib/include/krb5.h,
+ * lib/gssapi/krb5/gssapiP_krb5.h, and others
+ *
+ * Copyright (c) 2000 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@umich.edu>
+ */
+
+/*
+ * Copyright 1995 by the Massachusetts Institute of Technology.
+ * All Rights Reserved.
+ *
+ * Export of this software from the United States of America may
+ * require a specific license from the United States Government.
+ * It is the responsibility of any person or organization contemplating
+ * export to obtain such a license before exporting.
+ *
+ * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
+ * distribute this software and its documentation for any purpose and
+ * without fee is hereby granted, provided that the above copyright
+ * notice appear in all copies and that both that copyright notice and
+ * this permission notice appear in supporting documentation, and that
+ * the name of M.I.T. not be used in advertising or publicity pertaining
+ * to distribution of the software without specific, written prior
+ * permission. Furthermore if you modify this software you must label
+ * your software as modified software and not distribute it in such a
+ * fashion that it might be confused with the original M.I.T. software.
+ * M.I.T. makes no representations about the suitability of
+ * this software for any purpose. It is provided "as is" without express
+ * or implied warranty.
+ *
+ */
+
+#define SIZEOF_INT 4
+
+/* from gssapi_err_generic.h */
+#define G_BAD_SERVICE_NAME (-2045022976L)
+#define G_BAD_STRING_UID (-2045022975L)
+#define G_NOUSER (-2045022974L)
+#define G_VALIDATE_FAILED (-2045022973L)
+#define G_BUFFER_ALLOC (-2045022972L)
+#define G_BAD_MSG_CTX (-2045022971L)
+#define G_WRONG_SIZE (-2045022970L)
+#define G_BAD_USAGE (-2045022969L)
+#define G_UNKNOWN_QOP (-2045022968L)
+#define G_NO_HOSTNAME (-2045022967L)
+#define G_BAD_HOSTNAME (-2045022966L)
+#define G_WRONG_MECH (-2045022965L)
+#define G_BAD_TOK_HEADER (-2045022964L)
+#define G_BAD_DIRECTION (-2045022963L)
+#define G_TOK_TRUNC (-2045022962L)
+#define G_REFLECT (-2045022961L)
+#define G_WRONG_TOKID (-2045022960L)
+
+#define g_OID_equal(o1,o2) \
+ (((o1)->len == (o2)->len) && \
+ (memcmp((o1)->data,(o2)->data,(int) (o1)->len) == 0))
+
+__u32 g_verify_token_header(
+ rawobj_t *mech,
+ int *body_size,
+ unsigned char **buf_in,
+ int toksize);
+
+__u32 g_get_mech_oid(rawobj_t *mech, rawobj_t * in_buf);
+
+int g_token_size(
+ rawobj_t *mech,
+ unsigned int body_size);
+
+void g_make_token_header(
+ rawobj_t *mech,
+ int body_size,
+ unsigned char **buf);
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Modifications for Lustre
+ * Copyright 2004, Cluster File Systems, Inc.
+ * All rights reserved
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ * Adapted from MIT Kerberos 5-1.2.1 include/gssapi/gssapi.h
+ *
+ * Copyright (c) 2002 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@umich.edu>
+ */
+
+/*
+ * Copyright 1993 by OpenVision Technologies, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without fee,
+ * provided that the above copyright notice appears in all copies and
+ * that both that copyright notice and this permission notice appear in
+ * supporting documentation, and that the name of OpenVision not be used
+ * in advertising or publicity pertaining to distribution of the software
+ * without specific, written prior permission. OpenVision makes no
+ * representations about the suitability of this software for any
+ * purpose. It is provided "as is" without express or implied warranty.
+ *
+ * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef __SEC_GSS_GSS_ERR_H_
+#define __SEC_GSS_GSS_ERR_H_
+
+typedef unsigned int OM_uint32;
+
+/*
+ * Flag bits for context-level services.
+ */
+#define GSS_C_DELEG_FLAG 1
+#define GSS_C_MUTUAL_FLAG 2
+#define GSS_C_REPLAY_FLAG 4
+#define GSS_C_SEQUENCE_FLAG 8
+#define GSS_C_CONF_FLAG 16
+#define GSS_C_INTEG_FLAG 32
+#define GSS_C_ANON_FLAG 64
+#define GSS_C_PROT_READY_FLAG 128
+#define GSS_C_TRANS_FLAG 256
+
+/*
+ * Credential usage options
+ */
+#define GSS_C_BOTH 0
+#define GSS_C_INITIATE 1
+#define GSS_C_ACCEPT 2
+
+/*
+ * Status code types for gss_display_status
+ */
+#define GSS_C_GSS_CODE 1
+#define GSS_C_MECH_CODE 2
+
+
+/*
+ * Define the default Quality of Protection for per-message services. Note
+ * that an implementation that offers multiple levels of QOP may either reserve
+ * a value (for example zero, as assumed here) to mean "default protection", or
+ * alternatively may simply equate GSS_C_QOP_DEFAULT to a specific explicit
+ * QOP value. However a value of 0 should always be interpreted by a GSSAPI
+ * implementation as a request for the default protection level.
+ */
+#define GSS_C_QOP_DEFAULT 0
+
+/*
+ * Expiration time of 2^32-1 seconds means infinite lifetime for a
+ * credential or security context
+ */
+#define GSS_C_INDEFINITE ((OM_uint32) 0xfffffffful)
+
+
+/* Major status codes */
+
+#define GSS_S_COMPLETE 0
+
+/*
+ * Some "helper" definitions to make the status code macros obvious.
+ */
+#define GSS_C_CALLING_ERROR_OFFSET 24
+#define GSS_C_ROUTINE_ERROR_OFFSET 16
+#define GSS_C_SUPPLEMENTARY_OFFSET 0
+#define GSS_C_CALLING_ERROR_MASK ((OM_uint32) 0377ul)
+#define GSS_C_ROUTINE_ERROR_MASK ((OM_uint32) 0377ul)
+#define GSS_C_SUPPLEMENTARY_MASK ((OM_uint32) 0177777ul)
+
+/*
+ * The macros that test status codes for error conditions. Note that the
+ * GSS_ERROR() macro has changed slightly from the V1 GSSAPI so that it now
+ * evaluates its argument only once.
+ */
+#define GSS_CALLING_ERROR(x) \
+ ((x) & (GSS_C_CALLING_ERROR_MASK << GSS_C_CALLING_ERROR_OFFSET))
+#define GSS_ROUTINE_ERROR(x) \
+ ((x) & (GSS_C_ROUTINE_ERROR_MASK << GSS_C_ROUTINE_ERROR_OFFSET))
+#define GSS_SUPPLEMENTARY_INFO(x) \
+ ((x) & (GSS_C_SUPPLEMENTARY_MASK << GSS_C_SUPPLEMENTARY_OFFSET))
+#define GSS_ERROR(x) \
+ ((x) & ((GSS_C_CALLING_ERROR_MASK << GSS_C_CALLING_ERROR_OFFSET) | \
+ (GSS_C_ROUTINE_ERROR_MASK << GSS_C_ROUTINE_ERROR_OFFSET)))
+
+/*
+ * Now the actual status code definitions
+ */
+
+/*
+ * Calling errors:
+ */
+#define GSS_S_CALL_INACCESSIBLE_READ \
+ (((OM_uint32) 1ul) << GSS_C_CALLING_ERROR_OFFSET)
+#define GSS_S_CALL_INACCESSIBLE_WRITE \
+ (((OM_uint32) 2ul) << GSS_C_CALLING_ERROR_OFFSET)
+#define GSS_S_CALL_BAD_STRUCTURE \
+ (((OM_uint32) 3ul) << GSS_C_CALLING_ERROR_OFFSET)
+
+/*
+ * Routine errors:
+ */
+#define GSS_S_BAD_MECH (((OM_uint32) 1ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_NAME (((OM_uint32) 2ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_NAMETYPE (((OM_uint32) 3ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_BINDINGS (((OM_uint32) 4ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_STATUS (((OM_uint32) 5ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_SIG (((OM_uint32) 6ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_NO_CRED (((OM_uint32) 7ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_NO_CONTEXT (((OM_uint32) 8ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_DEFECTIVE_TOKEN (((OM_uint32) 9ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_DEFECTIVE_CREDENTIAL \
+ (((OM_uint32) 10ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_CREDENTIALS_EXPIRED \
+ (((OM_uint32) 11ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_CONTEXT_EXPIRED \
+ (((OM_uint32) 12ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_FAILURE (((OM_uint32) 13ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_QOP (((OM_uint32) 14ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_UNAUTHORIZED (((OM_uint32) 15ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_UNAVAILABLE (((OM_uint32) 16ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_DUPLICATE_ELEMENT \
+ (((OM_uint32) 17ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_NAME_NOT_MN \
+ (((OM_uint32) 18ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+
+/*
+ * Supplementary info bits:
+ */
+#define GSS_S_CONTINUE_NEEDED (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 0))
+#define GSS_S_DUPLICATE_TOKEN (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 1))
+#define GSS_S_OLD_TOKEN (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 2))
+#define GSS_S_UNSEQ_TOKEN (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 3))
+#define GSS_S_GAP_TOKEN (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 4))
+
+/* XXXX these are not part of the GSSAPI C bindings! (but should be) */
+
+#define GSS_CALLING_ERROR_FIELD(x) \
+ (((x) >> GSS_C_CALLING_ERROR_OFFSET) & GSS_C_CALLING_ERROR_MASK)
+#define GSS_ROUTINE_ERROR_FIELD(x) \
+ (((x) >> GSS_C_ROUTINE_ERROR_OFFSET) & GSS_C_ROUTINE_ERROR_MASK)
+#define GSS_SUPPLEMENTARY_INFO_FIELD(x) \
+ (((x) >> GSS_C_SUPPLEMENTARY_OFFSET) & GSS_C_SUPPLEMENTARY_MASK)
+
+/* XXXX This is a necessary evil until the spec is fixed */
+#define GSS_S_CRED_UNAVAIL GSS_S_FAILURE
+
+#endif /* __SEC_GSS_GSS_ERR_H_ */
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Modifications for Lustre
+ * Copyright 2004, Cluster File Systems, Inc.
+ * All rights reserved
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ * linux/net/sunrpc/gss_generic_token.c
+ *
+ * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/generic/util_token.c
+ *
+ * Copyright (c) 2000 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@umich.edu>
+ */
+
+/*
+ * Copyright 1993 by OpenVision Technologies, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without fee,
+ * provided that the above copyright notice appears in all copies and
+ * that both that copyright notice and this permission notice appear in
+ * supporting documentation, and that the name of OpenVision not be used
+ * in advertising or publicity pertaining to distribution of the software
+ * without specific, written prior permission. OpenVision makes no
+ * representations about the suitability of this software for any
+ * purpose. It is provided "as is" without express or implied warranty.
+ *
+ * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_SEC
+#ifdef __KERNEL__
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#else
+#include <liblustre.h>
+#endif
+
+#include <libcfs/kp30.h>
+#include <linux/obd.h>
+#include <linux/obd_class.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_idl.h>
+#include <linux/lustre_net.h>
+#include <linux/lustre_import.h>
+#include <linux/lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+#include "gss_krb5.h"
+#include "gss_asn1.h"
+
+
+/* TWRITE_STR from gssapiP_generic.h */
+#define TWRITE_STR(ptr, str, len) \
+ memcpy((ptr), (char *) (str), (len)); \
+ (ptr) += (len);
+
+/* XXXX this code currently makes the assumption that a mech oid will
+ never be longer than 127 bytes. This assumption is not inherent in
+ the interfaces, so the code can be fixed if the OSI namespace
+ balloons unexpectedly. */
+
+/* Each token looks like this:
+
+0x60 tag for APPLICATION 0, SEQUENCE
+ (constructed, definite-length)
+ <length> possible multiple bytes, need to parse/generate
+ 0x06 tag for OBJECT IDENTIFIER
+ <moid_length> compile-time constant string (assume 1 byte)
+ <moid_bytes> compile-time constant string
+ <inner_bytes> the ANY containing the application token
+ bytes 0,1 are the token type
+ bytes 2,n are the token data
+
+For the purposes of this abstraction, the token "header" consists of
+the sequence tag and length octets, the mech OID DER encoding, and the
+first two inner bytes, which indicate the token type. The token
+"body" consists of everything else.
+
+*/
+
+/* Number of bytes needed to DER-encode @length in definite form:
+ * one byte for lengths < 128 (short form), otherwise one
+ * length-of-length byte plus 1-4 magnitude bytes (upper bound
+ * depends on SIZEOF_INT). */
+static int
+der_length_size( int length)
+{
+ if (length < (1<<7))
+ return(1);
+ else if (length < (1<<8))
+ return(2);
+#if (SIZEOF_INT == 2)
+ else
+ return(3);
+#else
+ else if (length < (1<<16))
+ return(3);
+ else if (length < (1<<24))
+ return(4);
+ else
+ return(5);
+#endif
+}
+
+/* DER-encode @length at **buf, advancing *buf past the encoding.
+ * Short form (single byte) for lengths < 128; otherwise long form:
+ * a (0x80 | count) byte followed by the length in big-endian order.
+ * The buffer is assumed large enough (see der_length_size()). */
+static void
+der_write_length(unsigned char **buf, int length)
+{
+ if (length < (1<<7)) {
+ *(*buf)++ = (unsigned char) length;
+ } else {
+ /* 128 + number of magnitude octets == 0x80 | count */
+ *(*buf)++ = (unsigned char) (der_length_size(length)+127);
+#if (SIZEOF_INT > 2)
+ if (length >= (1<<24))
+ *(*buf)++ = (unsigned char) (length>>24);
+ if (length >= (1<<16))
+ *(*buf)++ = (unsigned char) ((length>>16)&0xff);
+#endif
+ if (length >= (1<<8))
+ *(*buf)++ = (unsigned char) ((length>>8)&0xff);
+ *(*buf)++ = (unsigned char) (length&0xff);
+ }
+}
+
+/* returns decoded length, or < 0 on failure. Advances buf and
+ decrements bufsize */
+
+static int
+der_read_length(unsigned char **buf, int *bufsize)
+{
+ unsigned char sf;
+ int ret;
+
+ if (*bufsize < 1)
+ return(-1);
+ sf = *(*buf)++;
+ (*bufsize)--;
+ if (sf & 0x80) {
+ /* long form: low 7 bits give the count of length octets;
+ * they must all still be present in the buffer */
+ if ((sf &= 0x7f) > ((*bufsize)-1))
+ return(-1);
+ /* reject lengths that could not fit in an int */
+ if (sf > SIZEOF_INT)
+ return (-1);
+ ret = 0;
+ for (; sf; sf--) {
+ ret = (ret<<8) + (*(*buf)++);
+ (*bufsize)--;
+ }
+ } else {
+ /* short form: the byte itself is the length */
+ ret = sf;
+ }
+
+ return(ret);
+}
+
+/* returns the length of a token, given the mech oid and the body size */
+
+int
+g_token_size(rawobj_t *mech, unsigned int body_size)
+{
+ /* set body_size to sequence contents size */
+ /* 4 = OID tag (0x06) + 1-byte OID length + 2 token-type bytes */
+ body_size += 4 + (int) mech->len; /* NEED overflow check */
+ return(1 + der_length_size(body_size) + body_size);
+}
+
+//EXPORT_SYMBOL(g_token_size);
+
+/* fills in a buffer with the token header. The buffer is assumed to
+ be the right size. buf is advanced past the token header */
+
+void
+g_make_token_header(rawobj_t *mech, int body_size, unsigned char **buf)
+{
+ *(*buf)++ = 0x60; /* APPLICATION 0, SEQUENCE tag */
+ der_write_length(buf, 4 + mech->len + body_size);
+ *(*buf)++ = 0x06; /* OBJECT IDENTIFIER tag */
+ *(*buf)++ = (unsigned char) mech->len; /* assumes OID <= 127 bytes */
+ TWRITE_STR(*buf, mech->data, ((int) mech->len));
+}
+
+//EXPORT_SYMBOL(g_make_token_header);
+
+/*
+ * Given a buffer containing a token, reads and verifies the token,
+ * leaving buf advanced past the token header, and setting body_size
+ * to the number of remaining bytes. Returns 0 on success,
+ * G_BAD_TOK_HEADER for a variety of errors, and G_WRONG_MECH if the
+ * mechanism in the token does not match the mech argument. buf and
+ * *body_size are left unmodified on error.
+ */
+__u32
+g_verify_token_header(rawobj_t *mech, int *body_size,
+ unsigned char **buf_in, int toksize)
+{
+ unsigned char *buf = *buf_in;
+ int seqsize;
+ rawobj_t toid;
+ int ret = 0;
+
+ /* outer tag: APPLICATION 0, SEQUENCE (constructed, definite) */
+ if ((toksize-=1) < 0)
+ return(G_BAD_TOK_HEADER);
+ if (*buf++ != 0x60)
+ return(G_BAD_TOK_HEADER);
+
+ /* DER length must cover exactly the rest of the token */
+ if ((seqsize = der_read_length(&buf, &toksize)) < 0)
+ return(G_BAD_TOK_HEADER);
+
+ if (seqsize != toksize)
+ return(G_BAD_TOK_HEADER);
+
+ /* mech OID: tag 0x06, one length byte, then the OID bytes */
+ if ((toksize-=1) < 0)
+ return(G_BAD_TOK_HEADER);
+ if (*buf++ != 0x06)
+ return(G_BAD_TOK_HEADER);
+
+ if ((toksize-=1) < 0)
+ return(G_BAD_TOK_HEADER);
+ toid.len = *buf++;
+
+ if ((toksize-=toid.len) < 0)
+ return(G_BAD_TOK_HEADER);
+ toid.data = buf;
+ buf+=toid.len;
+
+ if (! g_OID_equal(&toid, mech))
+ ret = G_WRONG_MECH;
+
+ /* G_WRONG_MECH is not returned immediately because it's more important
+ to return G_BAD_TOK_HEADER if the token header is in fact bad */
+
+ /* two token-type bytes must follow the OID */
+ if ((toksize-=2) < 0)
+ return(G_BAD_TOK_HEADER);
+
+ if (ret)
+ return(ret);
+
+ /* success: commit outputs only now, per the contract above */
+ *buf_in = buf;
+ *body_size = toksize;
+
+ return(0);
+}
+
+//EXPORT_SYMBOL(g_verify_token_header);
+
+/* Given a buffer containing a token, returns a copy of the mech oid in
+ * the parameter mech. */
+/* Given a buffer containing a token, returns a copy of the mech oid in
+ * the parameter mech.  mech->data is OBD_ALLOC'd here; the caller owns
+ * and must free it on success. */
+__u32
+g_get_mech_oid(rawobj_t *mech, rawobj_t * in_buf)
+{
+ unsigned char *buf = in_buf->data;
+ int len = in_buf->len;
+ int ret=0;
+ int seqsize;
+
+ /* outer APPLICATION 0 SEQUENCE tag */
+ if ((len-=1) < 0)
+ return(G_BAD_TOK_HEADER);
+ if (*buf++ != 0x60)
+ return(G_BAD_TOK_HEADER);
+
+ /* NOTE(review): unlike g_verify_token_header(), seqsize is not
+ * compared against the remaining length here -- confirm intended */
+ if ((seqsize = der_read_length(&buf, &len)) < 0)
+ return(G_BAD_TOK_HEADER);
+
+ /* mech OID tag */
+ if ((len-=1) < 0)
+ return(G_BAD_TOK_HEADER);
+ if (*buf++ != 0x06)
+ return(G_BAD_TOK_HEADER);
+
+ if ((len-=1) < 0)
+ return(G_BAD_TOK_HEADER);
+ mech->len = *buf++;
+
+ if ((len-=mech->len) < 0)
+ return(G_BAD_TOK_HEADER);
+ OBD_ALLOC(mech->data, mech->len);
+ if (!mech->data)
+ return(G_BUFFER_ALLOC);
+ memcpy(mech->data, buf, mech->len);
+
+ return ret;
+}
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Modified from NFSv4 project for Lustre
+ * Copyright 2004, Cluster File Systems, Inc.
+ * All rights reserved
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#ifndef __SEC_GSS_GSS_INTERNAL_H_
+#define __SEC_GSS_GSS_INTERNAL_H_
+
+struct ptlrpc_sec;
+struct ptlrpc_cred;
+
+/* Counted byte buffer: the basic currency of the GSS layer. */
+typedef struct rawobj_s {
+ __u32 len; /* number of valid bytes at data */
+ __u8 *data; /* ownership is per-API; see each function */
+} rawobj_t;
+
+int rawobj_alloc(rawobj_t *obj, char *buf, int len);
+void rawobj_free(rawobj_t *obj);
+int rawobj_equal(rawobj_t *a, rawobj_t *b);
+int rawobj_dup(rawobj_t *dest, rawobj_t *src);
+int rawobj_serialize(rawobj_t *obj, __u32 **buf, __u32 *buflen);
+int rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen);
+int rawobj_extract_local(rawobj_t *obj, __u32 **buf, __u32 *buflen);
+
+/* A rawobj carved out of a larger backing buffer. */
+typedef struct rawobj_buf_s {
+ __u32 dataoff; /* offset of payload within buf */
+ __u32 datalen; /* payload length */
+ __u32 buflen; /* total backing buffer length */
+ __u8 *buf;
+} rawobj_buf_t;
+
+#define MAXSEQ 0x80000000 /* maximum legal sequence number, from rfc 2203 */
+
+enum rpc_gss_proc {
+ RPC_GSS_PROC_DATA = 0,
+ RPC_GSS_PROC_INIT = 1,
+ RPC_GSS_PROC_CONTINUE_INIT = 2,
+ RPC_GSS_PROC_DESTROY = 3,
+};
+
+enum rpc_gss_svc {
+ RPC_GSS_SVC_NONE = 1,
+ RPC_GSS_SVC_INTEGRITY = 2,
+ RPC_GSS_SVC_PRIVACY = 3,
+};
+
+/* on-the-wire gss cred: */
+struct rpc_gss_wire_cred {
+ __u32 gc_v; /* version */
+ __u32 gc_proc; /* control procedure */
+ __u32 gc_seq; /* sequence number */
+ __u32 gc_svc; /* service */
+ rawobj_t gc_ctx; /* context handle */
+};
+
+/* on-the-wire gss verifier: */
+struct rpc_gss_wire_verf {
+ __u32 gv_flavor;
+ rawobj_t gv_verf;
+};
+
+struct gss_cl_ctx {
+ atomic_t gc_refcount;
+ __u32 gc_proc;
+ __u32 gc_seq;
+ spinlock_t gc_seq_lock;
+ struct gss_ctx *gc_gss_ctx;
+ rawobj_t gc_wire_ctx;
+ __u32 gc_win;
+};
+
+struct gss_cred {
+ struct ptlrpc_cred gc_base;
+ ptlrpcs_flavor_t gc_flavor;
+ struct gss_cl_ctx *gc_ctx;
+};
+
+/*
+ * This only guaranteed be enough for current krb5 des-cbc-crc . We might
+ * adjust this when new enc type or mech added in.
+ */
+#define GSS_PRIVBUF_PREFIX_LEN (32)
+#define GSS_PRIVBUF_SUFFIX_LEN (32)
+
+/* This is too coarse. We'll let mech determine it */
+#define GSS_MAX_AUTH_PAYLOAD (128)
+
+/* gss_mech_switch.c */
+int init_kerberos_module(void);
+void cleanup_kerberos_module(void);
+
+/* gss_generic_token.c */
+int g_token_size(rawobj_t *mech, unsigned int body_size);
+void g_make_token_header(rawobj_t *mech, int body_size, unsigned char **buf);
+__u32 g_verify_token_header(rawobj_t *mech, int *body_size,
+ unsigned char **buf_in, int toksize);
+
+/* svcsec_gss.c */
+int gss_svc_init(void);
+void gss_svc_exit(void);
+
+#endif /* __SEC_GSS_GSS_INTERNAL_H_ */
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Modifications for Lustre
+ * Copyright 2004, Cluster File Systems, Inc.
+ * All rights reserved
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ * linux/include/linux/sunrpc/gss_krb5_types.h
+ *
+ * Adapted from MIT Kerberos 5-1.2.1 lib/include/krb5.h,
+ * lib/gssapi/krb5/gssapiP_krb5.h, and others
+ *
+ * Copyright (c) 2000 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@umich.edu>
+ * Bruce Fields <bfields@umich.edu>
+ */
+
+/*
+ * Copyright 1995 by the Massachusetts Institute of Technology.
+ * All Rights Reserved.
+ *
+ * Export of this software from the United States of America may
+ * require a specific license from the United States Government.
+ * It is the responsibility of any person or organization contemplating
+ * export to obtain such a license before exporting.
+ *
+ * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
+ * distribute this software and its documentation for any purpose and
+ * without fee is hereby granted, provided that the above copyright
+ * notice appear in all copies and that both that copyright notice and
+ * this permission notice appear in supporting documentation, and that
+ * the name of M.I.T. not be used in advertising or publicity pertaining
+ * to distribution of the software without specific, written prior
+ * permission. Furthermore if you modify this software you must label
+ * your software as modified software and not distribute it in such a
+ * fashion that it might be confused with the original M.I.T. software.
+ * M.I.T. makes no representations about the suitability of
+ * this software for any purpose. It is provided "as is" without express
+ * or implied warranty.
+ *
+ */
+
+extern spinlock_t krb5_seq_lock;
+
+/* Imported kerberos security context (see
+ * gss_import_sec_context_kerberos() for the wire layout). */
+struct krb5_ctx {
+ int initiate; /* 1 = initiating, 0 = accepting */
+ int seed_init;
+ unsigned char seed[16];
+ int signalg; /* SGN_ALG_* signing algorithm */
+ int sealalg; /* SEAL_ALG_* sealing algorithm */
+ struct crypto_tfm *enc; /* encryption key/cipher */
+ struct crypto_tfm *seq; /* sequence-number key/cipher */
+ __s32 endtime; /* context expiry (seconds) */
+ __u32 seq_send; /* next send sequence number */
+ rawobj_t mech_used; /* mech OID, OBD_ALLOC'd */
+};
+
+#define KG_TOK_MIC_MSG 0x0101
+#define KG_TOK_WRAP_MSG 0x0201
+
+enum sgn_alg {
+ SGN_ALG_DES_MAC_MD5 = 0x0000,
+ SGN_ALG_MD2_5 = 0x0001,
+ SGN_ALG_DES_MAC = 0x0002,
+ SGN_ALG_3 = 0x0003, /* not published */
+ SGN_ALG_HMAC_MD5 = 0x0011, /* microsoft w2k; no support */
+ SGN_ALG_HMAC_SHA1_DES3_KD = 0x0004
+};
+enum seal_alg {
+ SEAL_ALG_NONE = 0xffff,
+ SEAL_ALG_DES = 0x0000,
+ SEAL_ALG_1 = 0x0001, /* not published */
+ SEAL_ALG_MICROSOFT_RC4 = 0x0010,/* microsoft w2k; no support */
+ SEAL_ALG_DES3KD = 0x0002
+};
+
+#define KRB5_CKSUM_LENGTH 8
+
+#define CKSUMTYPE_CRC32 0x0001
+#define CKSUMTYPE_RSA_MD4 0x0002
+#define CKSUMTYPE_RSA_MD4_DES 0x0003
+#define CKSUMTYPE_DESCBC 0x0004
+#define CKSUMTYPE_RSA_MD5 0x0007
+#define CKSUMTYPE_RSA_MD5_DES 0x0008
+#define CKSUMTYPE_NIST_SHA 0x0009
+#define CKSUMTYPE_HMAC_SHA1_DES3 0x000c
+
+/* from gssapi_err_krb5.h */
+#define KG_CCACHE_NOMATCH (39756032L)
+#define KG_KEYTAB_NOMATCH (39756033L)
+#define KG_TGT_MISSING (39756034L)
+#define KG_NO_SUBKEY (39756035L)
+#define KG_CONTEXT_ESTABLISHED (39756036L)
+#define KG_BAD_SIGN_TYPE (39756037L)
+#define KG_BAD_LENGTH (39756038L)
+#define KG_CTX_INCOMPLETE (39756039L)
+#define KG_CONTEXT (39756040L)
+#define KG_CRED (39756041L)
+#define KG_ENC_DESC (39756042L)
+#define KG_BAD_SEQ (39756043L)
+#define KG_EMPTY_CCACHE (39756044L)
+#define KG_NO_CTYPES (39756045L)
+
+/* per Kerberos v5 protocol spec crypto types from the wire.
+ * these get mapped to linux kernel crypto routines.
+ */
+#define ENCTYPE_NULL 0x0000
+#define ENCTYPE_DES_CBC_CRC 0x0001 /* DES cbc mode with CRC-32 */
+#define ENCTYPE_DES_CBC_MD4 0x0002 /* DES cbc mode with RSA-MD4 */
+#define ENCTYPE_DES_CBC_MD5 0x0003 /* DES cbc mode with RSA-MD5 */
+#define ENCTYPE_DES_CBC_RAW 0x0004 /* DES cbc mode raw */
+/* XXX deprecated? */
+#define ENCTYPE_DES3_CBC_SHA 0x0005 /* DES-3 cbc mode with NIST-SHA */
+#define ENCTYPE_DES3_CBC_RAW 0x0006 /* DES-3 cbc mode raw */
+#define ENCTYPE_DES_HMAC_SHA1 0x0008
+#define ENCTYPE_DES3_CBC_SHA1 0x0010
+#define ENCTYPE_UNKNOWN 0x01ff
+
+__s32
+make_checksum(__s32 cksumtype,
+ char *header, int hdrlen,
+ rawobj_t *body,
+ rawobj_t *cksum);
+
+__u32
+krb5_make_token(struct krb5_ctx *ctx,
+ int qop_req,
+ rawobj_t *text,
+ rawobj_t *token);
+
+__u32
+krb5_read_token(struct krb5_ctx *ctx,
+ rawobj_t *read_token,
+ rawobj_t *message_buffer,
+ int *qop_state);
+
+__u32
+krb5_encrypt(struct crypto_tfm *tfm,
+ void * iv,
+ void * in,
+ void * out,
+ int length);
+
+__u32
+krb5_decrypt(struct crypto_tfm *tfm,
+ void * iv,
+ void * in,
+ void * out,
+ int length);
+
+__s32
+krb5_make_seq_num(struct crypto_tfm *key,
+ int direction,
+ __s32 seqnum,
+ unsigned char *cksum,
+ unsigned char *buf);
+
+__s32
+krb5_get_seq_num(struct crypto_tfm *key,
+ unsigned char *cksum,
+ unsigned char *buf,
+ int *direction,
+ __s32 *seqnum);
+int
+gss_encrypt_rawobj(struct crypto_tfm *tfm,
+ rawobj_t *inobj,
+ rawobj_t *outobj,
+ int enc);
+__u32
+gss_wrap_kerberos(struct gss_ctx *ctx,
+ __u32 qop,
+ rawobj_buf_t *in_token,
+ rawobj_t *out_token);
+__u32
+gss_unwrap_kerberos(struct gss_ctx *ctx,
+ __u32 qop,
+ rawobj_t *in_token,
+ rawobj_t *out_token);
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Modifications for Lustre
+ * Copyright 2004, Cluster File Systems, Inc.
+ * All rights reserved
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ * linux/net/sunrpc/gss_krb5_crypto.c
+ *
+ * Copyright (c) 2000 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@umich.edu>
+ * Bruce Fields <bfields@umich.edu>
+ */
+
+/*
+ * Copyright (C) 1998 by the FundsXpress, INC.
+ *
+ * All rights reserved.
+ *
+ * Export of this software from the United States of America may require
+ * a specific license from the United States Government. It is the
+ * responsibility of any person or organization contemplating export to
+ * obtain such a license before exporting.
+ *
+ * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
+ * distribute this software and its documentation for any purpose and
+ * without fee is hereby granted, provided that the above copyright
+ * notice appear in all copies and that both that copyright notice and
+ * this permission notice appear in supporting documentation, and that
+ * the name of FundsXpress. not be used in advertising or publicity pertaining
+ * to distribution of the software without specific, written prior
+ * permission. FundsXpress makes no representations about the suitability of
+ * this software for any purpose. It is provided "as is" without express
+ * or implied warranty.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
+ * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_SEC
+#ifdef __KERNEL__
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/crypto.h>
+#else
+#include <liblustre.h>
+#include "../kcrypto/libcrypto.h"
+#endif
+
+#include <libcfs/kp30.h>
+#include <linux/obd.h>
+#include <linux/obd_class.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_idl.h>
+#include <linux/lustre_net.h>
+#include <linux/lustre_import.h>
+#include <linux/lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+#include "gss_krb5.h"
+
+/*
+ * CBC-encrypt @length bytes from @in into @out using @tfm.  @length
+ * must be a multiple of the cipher block size.  @iv may be NULL (zero
+ * IV); at most 16 IV bytes are supported.  Returns 0 on success, else
+ * -EINVAL or the crypto-layer error (carried in an __u32).
+ */
+__u32
+krb5_encrypt(struct crypto_tfm *tfm,
+ void * iv,
+ void * in,
+ void * out,
+ int length)
+{
+ __u32 ret = -EINVAL;
+ struct scatterlist sg[1];
+ __u8 local_iv[16] = {0};
+
+ if (length % crypto_tfm_alg_blocksize(tfm) != 0)
+ goto out;
+
+ if (crypto_tfm_alg_ivsize(tfm) > 16) {
+ /* fixed message typo: "to large" -> "too large" */
+ CERROR("tfm iv size too large %d\n", crypto_tfm_alg_ivsize(tfm));
+ goto out;
+ }
+
+ if (iv)
+ memcpy(local_iv, iv, crypto_tfm_alg_ivsize(tfm));
+
+ /* copy first, then encrypt @out in place, leaving @in untouched */
+ memcpy(out, in, length);
+ sg[0].page = virt_to_page(out);
+ sg[0].offset = offset_in_page(out);
+ sg[0].length = length;
+
+ ret = crypto_cipher_encrypt_iv(tfm, sg, sg, length, local_iv);
+
+out:
+ return(ret);
+}
+
+//EXPORT_SYMBOL(krb5_encrypt);
+
+/*
+ * CBC-decrypt @length bytes from @in into @out using @tfm; mirror of
+ * krb5_encrypt().  @length must be block-aligned, @iv may be NULL
+ * (zero IV), at most 16 IV bytes supported.  Returns 0 or an error
+ * code carried in an __u32.
+ */
+__u32
+krb5_decrypt(struct crypto_tfm *tfm,
+ void * iv,
+ void * in,
+ void * out,
+ int length)
+{
+ __u32 ret = -EINVAL;
+ struct scatterlist sg[1];
+ __u8 local_iv[16] = {0};
+
+ if (length % crypto_tfm_alg_blocksize(tfm) != 0)
+ goto out;
+
+ if (crypto_tfm_alg_ivsize(tfm) > 16) {
+ /* fixed message typo: "to large" -> "too large" */
+ CERROR("tfm iv size too large %d\n", crypto_tfm_alg_ivsize(tfm));
+ goto out;
+ }
+ if (iv)
+ memcpy(local_iv,iv, crypto_tfm_alg_ivsize(tfm));
+
+ /* copy first, then decrypt @out in place, leaving @in untouched */
+ memcpy(out, in, length);
+ sg[0].page = virt_to_page(out);
+ sg[0].offset = offset_in_page(out);
+ sg[0].length = length;
+
+ ret = crypto_cipher_decrypt_iv(tfm, sg, sg, length, local_iv);
+
+out:
+ return(ret);
+}
+
+//EXPORT_SYMBOL(krb5_decrypt);
+
+/* Describe the buffer [ptr, ptr+len) as a single scatterlist entry.
+ * NOTE(review): assumes the buffer does not cross a page boundary --
+ * callers pass small headers/digests; confirm for larger uses. */
+void
+buf_to_sg(struct scatterlist *sg, char *ptr, int len)
+{
+ sg->page = virt_to_page(ptr);
+ sg->offset = offset_in_page(ptr);
+ sg->length = len;
+}
+
+/* checksum the plaintext data and hdrlen bytes of the token header */
+/* checksum the plaintext data and hdrlen bytes of the token header.
+ * On success fills @cksum (data is OBD_ALLOC'd; caller frees) and
+ * returns 0; returns GSS_S_FAILURE on any error.  Only RSA-MD5 is
+ * currently supported. */
+__s32
+make_checksum(__s32 cksumtype,
+ char *header, int hdrlen,
+ rawobj_t *body,
+ rawobj_t *cksum)
+{
+ char *cksumname;
+ struct crypto_tfm *tfm = NULL; /* XXX add to ctx? */
+ struct scatterlist sg[1];
+ __u32 code = GSS_S_FAILURE;
+
+ switch (cksumtype) {
+ case CKSUMTYPE_RSA_MD5:
+ cksumname = "md5";
+ break;
+ default:
+ CERROR("unsupported checksum %d", cksumtype);
+ goto out;
+ }
+ if (!(tfm = crypto_alloc_tfm(cksumname, 0)))
+ goto out;
+ cksum->len = crypto_tfm_alg_digestsize(tfm);
+ OBD_ALLOC(cksum->data, cksum->len);
+ if (!cksum->data) {
+ /* don't hand back a non-zero len paired with NULL data */
+ cksum->len = 0;
+ goto out;
+ }
+
+ /* digest = H(header || body) */
+ crypto_digest_init(tfm);
+ buf_to_sg(sg, header, hdrlen);
+ crypto_digest_update(tfm, sg, 1);
+ if (body->len) {
+ buf_to_sg(sg, body->data, body->len);
+ crypto_digest_update(tfm, sg, 1);
+ }
+
+ crypto_digest_final(tfm, cksum->data);
+ code = 0;
+out:
+ if (tfm)
+ crypto_free_tfm(tfm);
+ return code;
+}
+
+//EXPORT_SYMBOL(make_checksum);
+
+/* Map the virtually-contiguous buffer of @obj onto a scatterlist,
+ * splitting at page boundaries.  @listlen is the capacity of @list;
+ * asserts if more entries would be needed. */
+static
+void obj_to_scatter_list(rawobj_t *obj, struct scatterlist *list,
+ int listlen)
+{
+ __u8 *ptr = obj->data;
+ __u32 size = obj->len;
+ int index = 0;
+
+ while (size) {
+ LASSERT(index++ < listlen);
+ list->page = virt_to_page(ptr);
+ /* was "(int) ptr & ~PAGE_MASK": the cast truncates the
+ * pointer on 64-bit; use offset_in_page() like the rest
+ * of this file */
+ list->offset = offset_in_page(ptr);
+ list->length = (list->offset + size) > PAGE_SIZE ?
+ (PAGE_SIZE - list->offset) : size;
+ ptr += list->length;
+ size -= list->length;
+ list++;
+ }
+}
+
+/* Encrypt (@enc != 0) or decrypt @inobj into @outobj with a zero IV.
+ * @outobj must be at least as large as @inobj; on success its len is
+ * set to the number of bytes produced.  Returns 0 or negative errno /
+ * crypto error. */
+int gss_encrypt_rawobj(struct crypto_tfm *tfm,
+ rawobj_t *inobj, rawobj_t *outobj,
+ int enc)
+{
+ struct scatterlist *src_list, *dst_list;
+ __u8 local_iv[16] = {0};
+ int list_len;
+ __u32 rc;
+ ENTRY;
+
+ LASSERT(outobj->len >= inobj->len);
+
+ /* worst case: one entry per page touched, plus one for the
+ * unaligned start */
+ list_len = ((inobj->len + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1;
+ OBD_ALLOC(src_list, sizeof(*src_list) * list_len * 2);
+ if (!src_list) {
+ /* sizeof expression is size_t: %d was a format mismatch */
+ CERROR("can't alloc %lu\n",
+ (unsigned long) (sizeof(*src_list) * list_len * 2));
+ RETURN(-ENOMEM);
+ }
+ dst_list = src_list + list_len;
+
+ obj_to_scatter_list(inobj, src_list, list_len);
+ obj_to_scatter_list(outobj, dst_list, list_len);
+
+ if (enc)
+ rc = crypto_cipher_encrypt_iv(tfm, dst_list, src_list,
+ inobj->len, local_iv);
+ else
+ rc = crypto_cipher_decrypt_iv(tfm, dst_list, src_list,
+ inobj->len, local_iv);
+
+ if (rc) {
+ CERROR("encrypt error %u\n", rc);
+ GOTO(out_free, rc);
+ }
+
+ outobj->len = inobj->len;
+
+out_free:
+ OBD_FREE(src_list, sizeof(*src_list) * list_len * 2);
+ RETURN(rc);
+}
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Modifications for Lustre
+ * Copyright 2004, Cluster File Systems, Inc.
+ * All rights reserved
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ * linux/net/sunrpc/gss_krb5_mech.c
+ *
+ * Copyright (c) 2001 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@umich.edu>
+ * J. Bruce Fields <bfields@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_SEC
+#ifdef __KERNEL__
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/crypto.h>
+#else
+#include <liblustre.h>
+//#include "../kcrypto/libcrypto.h"
+#endif
+
+#include <libcfs/kp30.h>
+#include <linux/obd.h>
+#include <linux/obd_class.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_idl.h>
+#include <linux/lustre_net.h>
+#include <linux/lustre_import.h>
+#include <linux/lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+#include "gss_krb5.h"
+
+rawobj_t gss_mech_krb5_oid =
+ {9, "\052\206\110\206\367\022\001\002\002"};
+
+/* Copy @len bytes from *ptr into @res, bounds-checked against @end;
+ * advances *ptr past the copied bytes on success.  Returns 0, or -1
+ * on overrun (the q < p test also catches pointer wrap-around from a
+ * huge @len). */
+static inline int
+get_bytes(char **ptr, const char *end, void *res, int len)
+{
+ char *p, *q;
+ p = *ptr;
+ q = p + len;
+ if (q > end || q < p)
+ return -1;
+ memcpy(res, p, len);
+ *ptr = q;
+ return 0;
+}
+
+/* Read a length-prefixed byte string (raw __u32 len, then bytes) from
+ * *ptr into @res, allocating res->data with OBD_ALLOC (caller frees).
+ * Advances *ptr on success; returns 0, or -1 on overrun/alloc failure. */
+static inline int
+get_rawobj(char **ptr, const char *end, rawobj_t *res)
+{
+ char *p, *q;
+ p = *ptr;
+ if (get_bytes(&p, end, &res->len, sizeof(res->len)))
+ return -1;
+ q = p + res->len;
+ if (q > end || q < p)
+ return -1;
+ OBD_ALLOC(res->data, res->len);
+ if (!res->data)
+ return -1;
+ memcpy(res->data, p, res->len);
+ *ptr = q;
+ return 0;
+}
+
+/* Read an (algorithm id, raw key) pair from *p and set up a cipher
+ * tfm in *res.  Only ENCTYPE_DES_CBC_RAW is supported.  The raw key
+ * material is freed before returning in all cases.  Returns 0, or -1
+ * with *res left NULL/untouched on error. */
+static inline int
+get_key(char **p, char *end, struct crypto_tfm **res)
+{
+ rawobj_t key;
+ int alg, alg_mode;
+ char *alg_name;
+
+ if (get_bytes(p, end, &alg, sizeof(alg)))
+ goto out_err;
+ if ((get_rawobj(p, end, &key)))
+ goto out_err;
+
+ switch (alg) {
+ case ENCTYPE_DES_CBC_RAW:
+ alg_name = "des";
+ alg_mode = CRYPTO_TFM_MODE_CBC;
+ break;
+ default:
+ CERROR("unsupported algorithm %d\n", alg);
+ goto out_err_free_key;
+ }
+ if (!(*res = crypto_alloc_tfm(alg_name, alg_mode)))
+ goto out_err_free_key;
+ if (crypto_cipher_setkey(*res, key.data, key.len))
+ goto out_err_free_tfm;
+
+ OBD_FREE(key.data, key.len);
+ return 0;
+
+out_err_free_tfm:
+ crypto_free_tfm(*res);
+ *res = NULL; /* don't hand a freed tfm back to the caller */
+out_err_free_key:
+ OBD_FREE(key.data, key.len);
+out_err:
+ return -1;
+}
+
+/* Deserialize a krb5 security context from @inbuf into a freshly
+ * allocated struct krb5_ctx, stored in ctx_id->internal_ctx_id.
+ * The wire layout is a fixed sequence of fields (see the get_bytes
+ * calls below, in order) followed by the mech OID and two keys; the
+ * buffer must be consumed exactly.  Returns 0 or GSS_S_FAILURE with
+ * everything unwound. */
+static __u32
+gss_import_sec_context_kerberos(rawobj_t *inbuf,
+ struct gss_ctx *ctx_id)
+{
+ char *p = inbuf->data;
+ char *end = inbuf->data + inbuf->len;
+ struct krb5_ctx *ctx;
+
+ OBD_ALLOC(ctx, sizeof(*ctx));
+ if (!ctx)
+ goto out_err;
+
+ /* fixed-size fields, in wire order */
+ if (get_bytes(&p, end, &ctx->initiate, sizeof(ctx->initiate)))
+ goto out_err_free_ctx;
+ if (get_bytes(&p, end, &ctx->seed_init, sizeof(ctx->seed_init)))
+ goto out_err_free_ctx;
+ if (get_bytes(&p, end, ctx->seed, sizeof(ctx->seed)))
+ goto out_err_free_ctx;
+ if (get_bytes(&p, end, &ctx->signalg, sizeof(ctx->signalg)))
+ goto out_err_free_ctx;
+ if (get_bytes(&p, end, &ctx->sealalg, sizeof(ctx->sealalg)))
+ goto out_err_free_ctx;
+ if (get_bytes(&p, end, &ctx->endtime, sizeof(ctx->endtime)))
+ goto out_err_free_ctx;
+ if (get_bytes(&p, end, &ctx->seq_send, sizeof(ctx->seq_send)))
+ goto out_err_free_ctx;
+ /* variable-size fields: mech OID, then encryption and seq keys */
+ if (get_rawobj(&p, end, &ctx->mech_used))
+ goto out_err_free_ctx;
+ if (get_key(&p, end, &ctx->enc))
+ goto out_err_free_mech;
+ if (get_key(&p, end, &ctx->seq))
+ goto out_err_free_key1;
+ /* trailing bytes mean a malformed or mismatched context blob */
+ if (p != end)
+ goto out_err_free_key2;
+
+ ctx_id->internal_ctx_id = ctx;
+ CDEBUG(D_SEC, "Succesfully imported new context.\n");
+ return 0;
+
+/* unwind in reverse order of acquisition */
+out_err_free_key2:
+ crypto_free_tfm(ctx->seq);
+out_err_free_key1:
+ crypto_free_tfm(ctx->enc);
+out_err_free_mech:
+ OBD_FREE(ctx->mech_used.data, ctx->mech_used.len);
+out_err_free_ctx:
+ OBD_FREE(ctx, sizeof(*ctx));
+out_err:
+ return GSS_S_FAILURE;
+}
+
+/* Report the context expiry time (seconds, widened from __s32). */
+static __u32
+gss_inquire_context_kerberos(struct gss_ctx *context_handle,
+ __u64 *endtime)
+{
+ struct krb5_ctx *kctx = context_handle->internal_ctx_id;
+
+ *endtime = (__u64) kctx->endtime;
+ return GSS_S_COMPLETE;
+}
+
+/* Tear down a krb5 context: free both key tfms, the mech OID copy,
+ * and the context itself.  Safe on partially-initialized contexts
+ * (each member is checked before freeing). */
+static void
+gss_delete_sec_context_kerberos(void *internal_ctx)
+{
+ struct krb5_ctx *ctx = internal_ctx;
+
+ if (ctx->seq)
+ crypto_free_tfm(ctx->seq);
+ if (ctx->enc)
+ crypto_free_tfm(ctx->enc);
+ if (ctx->mech_used.data)
+ OBD_FREE(ctx->mech_used.data, ctx->mech_used.len);
+ OBD_FREE(ctx, sizeof(*ctx));
+}
+
+/* XXX the following wrappers have become pointless; kill them. */
+static __u32
+gss_verify_mic_kerberos(struct gss_ctx *ctx,
+ rawobj_t *message,
+ rawobj_t *mic_token,
+ __u32 *qstate)
+{
+ struct krb5_ctx *kctx = ctx->internal_ctx_id;
+ __u32 maj_stat;
+ int qop_state;
+
+ maj_stat = krb5_read_token(kctx, mic_token, message, &qop_state);
+ if (!maj_stat && qop_state)
+ *qstate = qop_state;
+
+ CDEBUG(D_SEC, "returning %d\n", maj_stat);
+ return maj_stat;
+}
+
+/* Produce a MIC token for @message; thin wrapper over
+ * krb5_make_token(). */
+static __u32
+gss_get_mic_kerberos(struct gss_ctx *ctx,
+ __u32 qop,
+ rawobj_t *message,
+ rawobj_t *mic_token)
+{
+ struct krb5_ctx *kctx = ctx->internal_ctx_id;
+ __u32 err;
+
+ err = krb5_make_token(kctx, qop, message, mic_token);
+
+ CDEBUG(D_SEC, "returning %d\n",err);
+ return err;
+}
+
+static struct gss_api_ops gss_kerberos_ops = {
+ .gss_import_sec_context = gss_import_sec_context_kerberos,
+ .gss_inquire_context = gss_inquire_context_kerberos,
+ .gss_get_mic = gss_get_mic_kerberos,
+ .gss_verify_mic = gss_verify_mic_kerberos,
+ .gss_wrap = gss_wrap_kerberos,
+ .gss_unwrap = gss_unwrap_kerberos,
+ .gss_delete_sec_context = gss_delete_sec_context_kerberos,
+};
+
+static struct subflavor_desc gss_kerberos_sfs[] = {
+ {
+ .subflavor = PTLRPC_SEC_GSS_KRB5,
+ .qop = 0,
+ .service = PTLRPC_SEC_TYPE_NONE,
+ .name = "krb5"
+ },
+ {
+ .subflavor = PTLRPC_SEC_GSS_KRB5I,
+ .qop = 0,
+ .service = PTLRPC_SEC_TYPE_AUTH,
+ .name = "krb5i"
+ },
+ {
+ .subflavor = PTLRPC_SEC_GSS_KRB5P,
+ .qop = 0,
+ .service = PTLRPC_SEC_TYPE_PRIV,
+ .name = "krb5p"
+ }
+};
+
+static struct gss_api_mech gss_kerberos_mech = {
+ .gm_name = "krb5",
+ .gm_owner = THIS_MODULE,
+ .gm_ops = &gss_kerberos_ops,
+ .gm_sf_num = 3,
+ .gm_sfs = gss_kerberos_sfs,
+};
+
+/* Register the krb5 mech with the gss mech switch.  Marked /*static*/
+ * because it is currently called directly rather than via module_init
+ * (see the #if 0 block at the end of this file). */
+/*static*/ int __init init_kerberos_module(void)
+{
+ int status;
+
+ status = kgss_mech_register(&gss_kerberos_mech);
+ if (status)
+ CERROR("Failed to register kerberos gss mechanism!\n");
+ return status;
+}
+
+/* Unregister the krb5 mech; counterpart of init_kerberos_module(). */
+/*static*/ void __exit cleanup_kerberos_module(void)
+{
+ kgss_mech_unregister(&gss_kerberos_mech);
+}
+
+/* XXX enable this when module works */
+#if 0
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_DESCRIPTION("GSS Krb5 mechanism for Lustre");
+
+module_init(init_kerberos_module);
+module_exit(cleanup_kerberos_module);
+#endif
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Modifications for Lustre
+ * Copyright 2004, Cluster File Systems, Inc.
+ * All rights reserved
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ * linux/net/sunrpc/gss_krb5_seal.c
+ *
+ * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/krb5/k5seal.c
+ *
+ * Copyright (c) 2000 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@umich.edu>
+ * J. Bruce Fields <bfields@umich.edu>
+ */
+
+/*
+ * Copyright 1993 by OpenVision Technologies, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without fee,
+ * provided that the above copyright notice appears in all copies and
+ * that both that copyright notice and this permission notice appear in
+ * supporting documentation, and that the name of OpenVision not be used
+ * in advertising or publicity pertaining to distribution of the software
+ * without specific, written prior permission. OpenVision makes no
+ * representations about the suitability of this software for any
+ * purpose. It is provided "as is" without express or implied warranty.
+ *
+ * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * Copyright (C) 1998 by the FundsXpress, INC.
+ *
+ * All rights reserved.
+ *
+ * Export of this software from the United States of America may require
+ * a specific license from the United States Government. It is the
+ * responsibility of any person or organization contemplating export to
+ * obtain such a license before exporting.
+ *
+ * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
+ * distribute this software and its documentation for any purpose and
+ * without fee is hereby granted, provided that the above copyright
+ * notice appear in all copies and that both that copyright notice and
+ * this permission notice appear in supporting documentation, and that
+ * the name of FundsXpress. not be used in advertising or publicity pertaining
+ * to distribution of the software without specific, written prior
+ * permission. FundsXpress makes no representations about the suitability of
+ * this software for any purpose. It is provided "as is" without express
+ * or implied warranty.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
+ * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_SEC
+#ifdef __KERNEL__
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/crypto.h>
+#else
+#include <liblustre.h>
+#include "../kcrypto/libcrypto.h"
+#include <netinet/in.h>
+#endif
+
+#include <libcfs/kp30.h>
+#include <linux/obd.h>
+#include <linux/obd_class.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_idl.h>
+#include <linux/lustre_net.h>
+#include <linux/lustre_import.h>
+#include <linux/lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+#include "gss_krb5.h"
+
+spinlock_t krb5_seq_lock = SPIN_LOCK_UNLOCKED;
+
+__u32
+krb5_make_token(struct krb5_ctx *ctx,
+                int qop_req,
+                rawobj_t *text,
+                rawobj_t *token)
+{
+        __s32 checksum_type;
+        rawobj_t md5cksum = {.len = 0, .data = NULL};
+        unsigned char *ptr, *krb5_hdr, *msg_start;
+        __s32 now, seq_send;
+        ENTRY;
+
+        now = get_seconds();
+
+        if (qop_req != 0)  /* only the default QOP is supported */
+                goto out_err;
+
+        switch (ctx->signalg) {
+        case SGN_ALG_DES_MAC_MD5:
+                checksum_type = CKSUMTYPE_RSA_MD5;
+                break;
+        default:
+                CERROR("ctx->signalg %d not supported\n", ctx->signalg);
+                goto out_err;
+        }
+        if (ctx->sealalg != SEAL_ALG_NONE && ctx->sealalg != SEAL_ALG_DES) {
+                CERROR("ctx->sealalg %d not supported\n", ctx->sealalg);
+                goto out_err;
+        }
+
+        token->len = g_token_size(&ctx->mech_used, 22);
+
+        ptr = token->data;
+        g_make_token_header(&ctx->mech_used, 22, &ptr);
+
+        *ptr++ = (unsigned char) ((KG_TOK_MIC_MSG>>8)&0xff);
+        *ptr++ = (unsigned char) (KG_TOK_MIC_MSG&0xff);
+
+        /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */
+        krb5_hdr = ptr - 2;
+        msg_start = krb5_hdr + 24;
+
+        *(__u16 *)(krb5_hdr + 2) = cpu_to_be16(ctx->signalg);
+        memset(krb5_hdr + 4, 0xff, 4);
+
+        if (make_checksum(checksum_type, krb5_hdr, 8, text, &md5cksum))
+                goto out_err;
+
+        switch (ctx->signalg) {
+        case SGN_ALG_DES_MAC_MD5:
+                if (krb5_encrypt(ctx->seq, NULL, md5cksum.data,
+                                 md5cksum.data, md5cksum.len))
+                        goto out_err;
+                memcpy(krb5_hdr + 16,
+                       md5cksum.data + md5cksum.len - KRB5_CKSUM_LENGTH,
+                       KRB5_CKSUM_LENGTH);
+
+                break;
+        default:
+                LBUG();
+        }
+
+        OBD_FREE(md5cksum.data, md5cksum.len); md5cksum.data = NULL; /* else out_err double-frees */
+
+        spin_lock(&krb5_seq_lock);
+        seq_send = ctx->seq_send++;
+        spin_unlock(&krb5_seq_lock);
+
+        if ((krb5_make_seq_num(ctx->seq, ctx->initiate ? 0 : 0xff,
+                               seq_send, krb5_hdr + 16, krb5_hdr + 8)))
+                goto out_err;
+
+        return ((ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE);
+out_err:
+        if (md5cksum.data)
+                OBD_FREE(md5cksum.data, md5cksum.len);
+        return GSS_S_FAILURE;
+}
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Modifications for Lustre
+ * Copyright 2004, Cluster File Systems, Inc.
+ * All rights reserved
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ * linux/net/sunrpc/gss_krb5_seqnum.c
+ *
+ * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/krb5/util_seqnum.c
+ *
+ * Copyright (c) 2000 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@umich.edu>
+ */
+
+/*
+ * Copyright 1993 by OpenVision Technologies, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without fee,
+ * provided that the above copyright notice appears in all copies and
+ * that both that copyright notice and this permission notice appear in
+ * supporting documentation, and that the name of OpenVision not be used
+ * in advertising or publicity pertaining to distribution of the software
+ * without specific, written prior permission. OpenVision makes no
+ * representations about the suitability of this software for any
+ * purpose. It is provided "as is" without express or implied warranty.
+ *
+ * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_SEC
+#ifdef __KERNEL__
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/crypto.h>
+#else
+#include <liblustre.h>
+#include "../kcrypto/libcrypto.h"
+#endif
+
+#include <libcfs/kp30.h>
+#include <linux/obd.h>
+#include <linux/obd_class.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_idl.h>
+#include <linux/lustre_net.h>
+#include <linux/lustre_import.h>
+#include <linux/lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+#include "gss_krb5.h"
+
+/* Build the encrypted 8-byte sequence-number field of a krb5 token.
+ * plain[0..3] carry @seqnum least-significant byte first; plain[4..7]
+ * repeat the @direction byte (0 = initiator, 0xff = acceptor), per
+ * rfc 1964 section 1.2.1.2.  The 8 bytes are encrypted with @key
+ * (passing @cksum through to krb5_encrypt) into @buf.
+ * Returns 0 on success or the krb5_encrypt() error code. */
+__s32
+krb5_make_seq_num(struct crypto_tfm *key,
+                  int direction,
+                  __s32 seqnum,
+                  unsigned char *cksum,
+                  unsigned char *buf)
+{
+        unsigned char plain[8];
+
+        /* sequence number, little-endian */
+        plain[0] = (unsigned char) (seqnum & 0xff);
+        plain[1] = (unsigned char) ((seqnum >> 8) & 0xff);
+        plain[2] = (unsigned char) ((seqnum >> 16) & 0xff);
+        plain[3] = (unsigned char) ((seqnum >> 24) & 0xff);
+
+        /* direction byte replicated four times */
+        plain[4] = direction;
+        plain[5] = direction;
+        plain[6] = direction;
+        plain[7] = direction;
+
+        return krb5_encrypt(key, cksum, plain, buf, 8);
+}
+
+/* Decrypt and validate the 8-byte sequence-number field of a token.
+ * Inverse of krb5_make_seq_num(): decrypts @buf with @key/@cksum,
+ * checks that the four direction bytes agree, and returns the
+ * direction byte and little-endian sequence number to the caller.
+ * Returns 0, a krb5_decrypt() error, or KG_BAD_SEQ on a malformed
+ * direction field. */
+__s32
+krb5_get_seq_num(struct crypto_tfm *key,
+                 unsigned char *cksum,
+                 unsigned char *buf,
+                 int *direction,
+                 __s32 * seqnum)
+{
+        __s32 code;
+        unsigned char plain[8];
+
+        if ((code = krb5_decrypt(key, cksum, buf, plain, 8)))
+                return code;
+
+        /* all four direction bytes must match (rfc 1964, 1.2.1.2) */
+        if ((plain[4] != plain[5]) || (plain[4] != plain[6])
+            || (plain[4] != plain[7]))
+                return (__s32)KG_BAD_SEQ;
+
+        *direction = plain[4];
+
+        *seqnum = ((plain[0]) |
+                   (plain[1] << 8) | (plain[2] << 16) | (plain[3] << 24));
+
+        return (0);
+}
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Modifications for Lustre
+ * Copyright 2004, Cluster File Systems, Inc.
+ * All rights reserved
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ * linux/net/sunrpc/gss_krb5_unseal.c
+ *
+ * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/krb5/k5unseal.c
+ *
+ * Copyright (c) 2000 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson <andros@umich.edu>
+ */
+
+/*
+ * Copyright 1993 by OpenVision Technologies, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without fee,
+ * provided that the above copyright notice appears in all copies and
+ * that both that copyright notice and this permission notice appear in
+ * supporting documentation, and that the name of OpenVision not be used
+ * in advertising or publicity pertaining to distribution of the software
+ * without specific, written prior permission. OpenVision makes no
+ * representations about the suitability of this software for any
+ * purpose. It is provided "as is" without express or implied warranty.
+ *
+ * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * Copyright (C) 1998 by the FundsXpress, INC.
+ *
+ * All rights reserved.
+ *
+ * Export of this software from the United States of America may require
+ * a specific license from the United States Government. It is the
+ * responsibility of any person or organization contemplating export to
+ * obtain such a license before exporting.
+ *
+ * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
+ * distribute this software and its documentation for any purpose and
+ * without fee is hereby granted, provided that the above copyright
+ * notice appear in all copies and that both that copyright notice and
+ * this permission notice appear in supporting documentation, and that
+ * the name of FundsXpress. not be used in advertising or publicity pertaining
+ * to distribution of the software without specific, written prior
+ * permission. FundsXpress makes no representations about the suitability of
+ * this software for any purpose. It is provided "as is" without express
+ * or implied warranty.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
+ * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_SEC
+#ifdef __KERNEL__
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/crypto.h>
+#else
+#include <liblustre.h>
+#include "../kcrypto/libcrypto.h"
+#endif
+
+#include <libcfs/kp30.h>
+#include <linux/obd.h>
+#include <linux/obd_class.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_idl.h>
+#include <linux/lustre_net.h>
+#include <linux/lustre_import.h>
+#include <linux/lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+#include "gss_krb5.h"
+
+
+/* read_token is a mic token, and message_buffer is the data that the mic was
+ * supposedly taken over. */
+
+__u32
+krb5_read_token(struct krb5_ctx *ctx,
+                rawobj_t *read_token,
+                rawobj_t *message_buffer,
+                int *qop_state)
+{
+        int signalg;
+        int sealalg;
+        __s32 checksum_type;
+        rawobj_t md5cksum = {.len = 0, .data = NULL};
+        __s32 now;
+        int direction;
+        __s32 seqnum;
+        unsigned char *ptr = (unsigned char *)read_token->data;
+        int bodysize;
+        __u32 ret = GSS_S_DEFECTIVE_TOKEN;
+        ENTRY;
+
+        if (g_verify_token_header(&ctx->mech_used, &bodysize, &ptr,
+                                  read_token->len))
+                goto out;
+
+        if ((*ptr++ != ((KG_TOK_MIC_MSG>>8)&0xff)) ||
+            (*ptr++ != ( KG_TOK_MIC_MSG &0xff)) )
+                goto out;
+
+        /* XXX sanity-check bodysize?? */
+
+        /* get the sign and seal algorithms */
+
+        signalg = ptr[0] + (ptr[1] << 8);
+        sealalg = ptr[2] + (ptr[3] << 8);
+
+        /* Sanity checks */
+
+        if ((ptr[4] != 0xff) || (ptr[5] != 0xff))
+                goto out;
+
+        if (sealalg != 0xffff)
+                goto out;
+
+        /* there are several mappings of seal algorithms to sign algorithms,
+           but few enough that we can try them all. */
+
+        if ((ctx->sealalg == SEAL_ALG_NONE && signalg > 1) ||
+            (ctx->sealalg == SEAL_ALG_1 && signalg != SGN_ALG_3) ||
+            (ctx->sealalg == SEAL_ALG_DES3KD &&
+             signalg != SGN_ALG_HMAC_SHA1_DES3_KD))
+                goto out;
+
+        /* compute the checksum of the message */
+
+        /* initialize the cksum */
+        switch (signalg) {
+        case SGN_ALG_DES_MAC_MD5:
+                checksum_type = CKSUMTYPE_RSA_MD5;
+                break;
+        default:
+                ret = GSS_S_DEFECTIVE_TOKEN;
+                goto out;
+        }
+
+        switch (signalg) {
+        case SGN_ALG_DES_MAC_MD5:
+                ret = make_checksum(checksum_type, ptr - 2, 8,
+                                    message_buffer, &md5cksum);
+                if (ret)
+                        goto out;
+
+                ret = krb5_encrypt(ctx->seq, NULL, md5cksum.data,
+                                   md5cksum.data, 16);
+                if (ret)
+                        goto out;
+
+                if (memcmp(md5cksum.data + 8, ptr + 14, 8)) {
+                        ret = GSS_S_BAD_SIG;
+                        goto out;
+                }
+                break;
+        default:
+                ret = GSS_S_DEFECTIVE_TOKEN;
+                goto out;
+        }
+
+        /* it got through unscathed. Make sure the context is unexpired */
+
+        if (qop_state)
+                *qop_state = GSS_C_QOP_DEFAULT;
+
+        now = get_seconds();
+
+        ret = GSS_S_CONTEXT_EXPIRED;
+        if (now > ctx->endtime)
+                goto out;
+
+        /* do sequencing checks */
+
+        ret = GSS_S_BAD_SIG;  /* returned (not the raw krb5 code) on failure */
+        if (krb5_get_seq_num(ctx->seq, ptr + 14, ptr + 6, &direction,
+                             &seqnum))
+                goto out;
+
+        if ((ctx->initiate && direction != 0xff) ||
+            (!ctx->initiate && direction != 0))
+                goto out;
+
+        ret = GSS_S_COMPLETE;
+out:
+        if (md5cksum.data)
+                OBD_FREE(md5cksum.data, md5cksum.len);
+        return ret;
+}
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Modified from NFSv4 projects for Lustre
+ * Copyright 2004, Cluster File Systems, Inc.
+ * All rights reserved
+ * Author: Eric Mei <ericm@clusterfs.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_SEC
+#ifdef __KERNEL__
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/crypto.h>
+#include <linux/random.h>
+#else
+#include <liblustre.h>
+#include "../kcrypto/libcrypto.h"
+#include <netinet/in.h>
+#endif
+
+#include <libcfs/kp30.h>
+#include <linux/obd.h>
+#include <linux/obd_class.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_idl.h>
+#include <linux/lustre_net.h>
+#include <linux/lustre_import.h>
+#include <linux/lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+#include "gss_krb5.h"
+#include "gss_asn1.h"
+
+/* Pad msgbuf->datalen up to a multiple of @blocksize, writing the pad
+ * length into each pad byte (PKCS#5-style, except that a full extra
+ * block is NOT added when the data is already aligned).
+ * NOTE(review): the mask arithmetic assumes @blocksize is a power of
+ * two — holds for the DES blocksize of 8 asserted by the caller.
+ * Returns 0 or -EINVAL if the buffer has no room for the padding. */
+static inline
+int add_padding(rawobj_buf_t *msgbuf, int blocksize)
+{
+        int padding;
+
+        padding = (blocksize - (msgbuf->datalen & (blocksize - 1))) &
+                  (blocksize - 1);
+        if (padding == 0)
+                return 0;
+
+        CWARN("add padding %d\n", padding);
+        if (msgbuf->dataoff + msgbuf->datalen + padding > msgbuf->buflen) {
+                CERROR("bufsize %u too small: off %u, len %u, padding %u\n",
+                       msgbuf->buflen, msgbuf->dataoff, msgbuf->datalen,
+                       padding);
+                return -EINVAL;
+        }
+        /* each pad byte holds the pad count */
+        memset(msgbuf->buf + msgbuf->dataoff + msgbuf->datalen,
+               padding, padding);
+        msgbuf->datalen += padding;
+        return 0;
+}
+
+/* Fill the @blocksize bytes immediately BEFORE the data region of
+ * @msgbuf with random data (the krb5 confounder).  The caller must
+ * have reserved at least @blocksize bytes of headroom at dataoff.
+ * Returns 0 or -EINVAL if there is no headroom. */
+static inline
+int generate_confounder(rawobj_buf_t *msgbuf, int blocksize)
+{
+        __u8 *p;
+
+        p = msgbuf->buf + msgbuf->dataoff - blocksize;
+        if (p < msgbuf->buf) {
+                CERROR("buf underflow\n");
+                return -EINVAL;
+        }
+
+        get_random_bytes(p, blocksize);
+        return 0;
+}
+
+/* Wrap @msgbuf into an encrypted, integrity-protected krb5 WRAP token
+ * in @token (rfc 1964 section 1.2.2 layout): gss framing, 8-byte krb5
+ * header, encrypted checksum and sequence number, then the encrypted
+ * confounder | data | padding.  Returns 0 or GSS_S_FAILURE. */
+__u32
+gss_wrap_kerberos(struct gss_ctx *ctx,
+                  __u32 qop,
+                  rawobj_buf_t *msgbuf,
+                  rawobj_t *token)
+{
+        struct krb5_ctx *kctx = ctx->internal_ctx_id;
+        __u32 checksum_type;
+        rawobj_t data_desc, cipher_out, md5cksum;
+        int blocksize;
+        unsigned char *ptr, *krb5_hdr, *msg_start;
+        int head_len, plain_len;
+        __u32 seq_send, major;
+        ENTRY;
+
+        if (qop) {
+                CERROR("not support qop %x yet\n", qop);
+                RETURN(GSS_S_FAILURE);
+        }
+
+        switch (kctx->signalg) {
+        case SGN_ALG_DES_MAC_MD5:
+                checksum_type = CKSUMTYPE_RSA_MD5;
+                break;
+        default:
+                CERROR("not support signalg %x\n", kctx->signalg);
+                RETURN(GSS_S_FAILURE);
+        }
+        if (kctx->sealalg != SEAL_ALG_NONE &&
+            kctx->sealalg != SEAL_ALG_DES) {
+                CERROR("not support sealalg %x\n", kctx->sealalg);
+                RETURN(GSS_S_FAILURE);
+        }
+
+        blocksize = crypto_tfm_alg_blocksize(kctx->enc);
+        LASSERT(blocksize <= 16);
+        LASSERT(blocksize == 8); /* actually must be 8 for now */
+
+        if (add_padding(msgbuf, blocksize))
+                RETURN(GSS_S_FAILURE);
+
+        /* confounder size == blocksize */
+        plain_len = msgbuf->datalen + blocksize;
+
+        /* token bytes needed in front of the (in-place encrypted) data */
+        head_len = g_token_size(&kctx->mech_used, 22 + plain_len) -
+                   msgbuf->datalen;
+
+        LASSERT(token->len >= head_len);
+        ptr = token->data;
+
+        /*
+         * fill in gss header and krb5 header
+         */
+        g_make_token_header(&kctx->mech_used, 22 + plain_len, &ptr);
+        krb5_hdr = ptr;
+        msg_start = krb5_hdr + 24;
+        *ptr++ = (unsigned char) ((KG_TOK_WRAP_MSG >> 8) & 0xff);
+        *ptr++ = (unsigned char) (KG_TOK_WRAP_MSG & 0xff);
+        *(__u16 *)(krb5_hdr + 2) = cpu_to_be16(kctx->signalg);
+        memset(krb5_hdr + 4, 0xff, 4);
+        *(__u16 *)(krb5_hdr + 4) = cpu_to_be16(kctx->sealalg);
+
+        /*
+         * prepend confounder on plain text
+         */
+        if (generate_confounder(msgbuf, blocksize))
+                RETURN(GSS_S_FAILURE);
+
+        /*
+         * compute checksum including confounder
+         */
+        data_desc.data = msgbuf->buf + msgbuf->dataoff - blocksize;
+        data_desc.len = msgbuf->datalen + blocksize;
+
+        if (make_checksum(checksum_type, krb5_hdr, 8, &data_desc, &md5cksum)) {
+                CERROR("checksum error\n");
+                RETURN(GSS_S_FAILURE);
+        }
+
+        major = GSS_S_FAILURE;
+        switch (kctx->signalg) {
+        case SGN_ALG_DES_MAC_MD5:
+                /* checksum is DES-encrypted in place, last 8 bytes kept */
+                if (krb5_encrypt(kctx->seq, NULL, md5cksum.data,
+                                 md5cksum.data, md5cksum.len)) {
+                        rawobj_free(&md5cksum);
+                        RETURN(GSS_S_FAILURE);
+                }
+                memcpy(krb5_hdr + 16,
+                       md5cksum.data + md5cksum.len - KRB5_CKSUM_LENGTH,
+                       KRB5_CKSUM_LENGTH);
+                break;
+        default:
+                LBUG();
+        }
+
+        rawobj_free(&md5cksum);
+
+        /*
+         * fill sequence number in krb5 header
+         */
+        spin_lock(&krb5_seq_lock);
+        seq_send = kctx->seq_send++;
+        spin_unlock(&krb5_seq_lock);
+
+        if (krb5_make_seq_num(kctx->seq, kctx->initiate ? 0 : 0xff,
+                              seq_send, krb5_hdr + 16, krb5_hdr + 8))
+                RETURN(GSS_S_FAILURE);
+
+        /* do encryption of confounder + data + padding, into the token */
+        data_desc.data = msgbuf->buf + msgbuf->dataoff - blocksize;
+        data_desc.len = msgbuf->datalen + blocksize;
+        cipher_out.data = msg_start;
+        cipher_out.len = token->len - (msg_start - token->data);
+        LASSERT(data_desc.len % blocksize == 0);
+        LASSERT(data_desc.len <= cipher_out.len);
+
+        if (gss_encrypt_rawobj(kctx->enc, &data_desc, &cipher_out, 1))
+                RETURN(GSS_S_FAILURE);
+
+        token->len = (msg_start - token->data) + cipher_out.len;
+        RETURN(0);
+}
+
+/* Unwrap a krb5 WRAP token @in_token: verify gss/krb5 framing and
+ * algorithms, decrypt the body, verify the checksum and sequence
+ * number, and copy the plaintext (minus confounder) to @out_token.
+ * NOTE(review): trailing padding is NOT stripped (see FIXME below).
+ * Returns 0, GSS_S_DEFECTIVE_TOKEN, GSS_S_BAD_SIG or GSS_S_FAILURE. */
+__u32
+gss_unwrap_kerberos(struct gss_ctx *ctx,
+                    __u32 qop,
+                    rawobj_t *in_token,
+                    rawobj_t *out_token)
+{
+        struct krb5_ctx *kctx = ctx->internal_ctx_id;
+        int signalg, sealalg;
+        rawobj_t cipher_in, plain_out, md5cksum;
+        unsigned char *ptr, *krb5_hdr, *tmpbuf;
+        int bodysize;
+        int blocksize, seqnum, direction;
+        __u32 checksum_type;
+        __u32 major;
+        ENTRY;
+
+        ptr = in_token->data;
+
+        /*
+         * verify gss header
+         */
+        major = g_verify_token_header(&kctx->mech_used, &bodysize, &ptr,
+                                      in_token->len);
+        if (major) {
+                CERROR("gss token error %d\n", major);
+                RETURN(GSS_S_FAILURE);
+        }
+
+        krb5_hdr = ptr;
+
+        if ((*ptr++ != ((KG_TOK_WRAP_MSG >> 8) & 0xff)) ||
+            (*ptr++ != (KG_TOK_WRAP_MSG & 0xff))) {
+                CERROR("token type not matched\n");
+                RETURN(G_BAD_TOK_HEADER);
+        }
+
+        if (bodysize < 22) {
+                CERROR("body size only %d\n", bodysize);
+                RETURN(G_WRONG_SIZE);
+        }
+
+        /*
+         * extract algorithms
+         */
+        signalg = ptr[0] | (ptr[1] << 8);
+        sealalg = ptr[2] | (ptr[3] << 8);
+
+        if (ptr[4] != 0xFF || ptr[5] != 0xFF) {
+                CERROR("4/5: %d, %d\n", ptr[4], ptr[5]);
+                RETURN(GSS_S_DEFECTIVE_TOKEN);
+        }
+
+        if (sealalg != kctx->sealalg) {
+                CERROR("sealalg %d not matched my %d\n",
+                       sealalg, kctx->sealalg);
+                RETURN(GSS_S_DEFECTIVE_TOKEN);
+        }
+
+        /* seal/sign algorithm pairing check */
+        if ((kctx->sealalg == SEAL_ALG_NONE && signalg > 1) ||
+            (kctx->sealalg == SEAL_ALG_1 && signalg != SGN_ALG_3) ||
+            (kctx->sealalg == SEAL_ALG_DES3KD &&
+             signalg != SGN_ALG_HMAC_SHA1_DES3_KD)) {
+                CERROR("bad sealalg %d\n", sealalg); /* NOTE(review): checks signalg, message prints sealalg */
+                RETURN(GSS_S_DEFECTIVE_TOKEN);
+        }
+
+        /* make bodysize as the actual cipher text size */
+        bodysize -= 22;
+        if (bodysize <= 0) {
+                CERROR("cipher text size %d?\n", bodysize);
+                RETURN(GSS_S_DEFECTIVE_TOKEN);
+        }
+
+        blocksize = crypto_tfm_alg_blocksize(kctx->enc);
+        if (bodysize % blocksize) {
+                CERROR("odd bodysize %d\n", bodysize);
+                RETURN(GSS_S_DEFECTIVE_TOKEN);
+        }
+
+        OBD_ALLOC(tmpbuf, bodysize);
+        if (!tmpbuf) {
+                CERROR("fail alloc %d\n", bodysize);
+                RETURN(GSS_S_FAILURE);
+        }
+
+        /* ciphertext starts after the 24-byte krb5 header */
+        cipher_in.data = krb5_hdr + 24;
+        cipher_in.len = bodysize;
+        plain_out.data = tmpbuf;
+        plain_out.len = bodysize;
+
+        major = GSS_S_DEFECTIVE_TOKEN;
+        if (gss_encrypt_rawobj(kctx->enc, &cipher_in, &plain_out, 0)) {
+                CERROR("error decrypt: 0x%x\n", major);
+                GOTO(out_free, major);
+        }
+        LASSERT(plain_out.len == bodysize);
+
+        /*
+         * verify checksum
+         */
+        switch (signalg) {
+        case SGN_ALG_DES_MAC_MD5:
+                checksum_type = CKSUMTYPE_RSA_MD5;
+                major = make_checksum(checksum_type, krb5_hdr, 8,
+                                      &plain_out, &md5cksum);
+                if (major) {
+                        CERROR("make checksum err: 0x%x\n", major);
+                        GOTO(out_free, major);
+                }
+
+                major = krb5_encrypt(kctx->seq, NULL, md5cksum.data,
+                                     md5cksum.data, md5cksum.len);
+                if (major) {
+                        CERROR("encrypt checksum err: 0x%x\n", major);
+                        rawobj_free(&md5cksum);
+                        GOTO(out_free, major);
+                }
+
+                if (memcmp(md5cksum.data + 8, krb5_hdr + 16, 8)) {
+                        CERROR("checksum mismatch\n");
+                        rawobj_free(&md5cksum);
+                        GOTO(out_free, major = GSS_S_BAD_SIG);
+                }
+                break;
+        default:
+                CERROR("not support signalg %d\n", signalg);
+                GOTO(out_free, major);
+        }
+
+        rawobj_free(&md5cksum);
+
+        /* FIXME add expire checking here */
+
+        major = krb5_get_seq_num(kctx->seq, krb5_hdr + 16,
+                                 krb5_hdr + 8, &direction,
+                                 &seqnum);
+        if (major) {
+                CERROR("get seq number err: 0x%x\n", major);
+                GOTO(out_free, major);
+        }
+
+        /* direction must be the peer's (opposite of ours) */
+        if ((kctx->initiate && direction != 0xff) ||
+            (!kctx->initiate && direction != 0)) {
+                CERROR("flag checking error\n");
+                GOTO(out_free, major = GSS_S_BAD_SIG);
+        }
+
+        /* FIXME how to remove the padding? */
+
+        /*
+         * copy back, skipping the leading confounder block
+         */
+        if (out_token->len < bodysize - blocksize) {
+                CERROR("data size %d while buffer only %d\n",
+                       bodysize - blocksize, out_token->len);
+                GOTO(out_free, major = GSS_S_DEFECTIVE_TOKEN);
+        }
+
+        out_token->len = bodysize - blocksize;
+        memcpy(out_token->data, plain_out.data + blocksize, out_token->len);
+        major = 0;
+out_free:
+        OBD_FREE(tmpbuf, bodysize);
+        RETURN(major);
+}
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Modifications for Lustre
+ * Copyright 2004, Cluster File Systems, Inc.
+ * All rights reserved
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ * linux/net/sunrpc/gss_mech_switch.c
+ *
+ * Copyright (c) 2001 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * J. Bruce Fields <bfields@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_SEC
+#ifdef __KERNEL__
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#else
+#include <liblustre.h>
+#endif
+
+#include <libcfs/kp30.h>
+#include <linux/obd.h>
+#include <linux/obd_class.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_idl.h>
+#include <linux/lustre_net.h>
+#include <linux/lustre_import.h>
+#include <linux/lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+static LIST_HEAD(registered_mechs);
+static spinlock_t registered_mechs_lock = SPIN_LOCK_UNLOCKED;
+
+/* Add @gm to the global list of available gss mechanisms.  Returns 0. */
+int
+kgss_mech_register(struct gss_api_mech *gm)
+{
+        spin_lock(&registered_mechs_lock);
+        list_add(&gm->gm_list, &registered_mechs);
+        spin_unlock(&registered_mechs_lock);
+        CWARN("registered gss mechanism %s\n", gm->gm_name);
+        return 0;
+}
+
+//EXPORT_SYMBOL(kgss_mech_register);
+
+/* Remove @gm from the global mechanism list.  Does not free @gm. */
+void
+kgss_mech_unregister(struct gss_api_mech *gm)
+{
+        spin_lock(&registered_mechs_lock);
+        list_del(&gm->gm_list);
+        spin_unlock(&registered_mechs_lock);
+        CWARN("unregistered gss mechanism %s\n", gm->gm_name);
+// gss_mech_free(gm);
+}
+
+//EXPORT_SYMBOL(gss_mech_unregister);
+
+/* Take a reference on @gm's owning module; returns @gm for chaining. */
+struct gss_api_mech *
+kgss_mech_get(struct gss_api_mech *gm)
+{
+        __module_get(gm->gm_owner);
+        return gm;
+}
+
+//EXPORT_SYMBOL(kgss_mech_get);
+
+/* Look up a registered mechanism by @name.  On success a module
+ * reference is held on the result (drop with kgss_mech_put());
+ * returns NULL if no match or its module cannot be pinned. */
+struct gss_api_mech *
+kgss_name_to_mech(char *name)
+{
+        struct gss_api_mech *pos, *gm = NULL;
+
+        spin_lock(&registered_mechs_lock);
+        list_for_each_entry(pos, &registered_mechs, gm_list) {
+                if (0 == strcmp(name, pos->gm_name)) {
+                        if (!try_module_get(pos->gm_owner))
+                                continue;
+                        gm = pos;
+                        break;
+                }
+        }
+        spin_unlock(&registered_mechs_lock);
+        return gm;
+
+}
+
+//EXPORT_SYMBOL(gss_name_to_mech);
+
+/* Return 1 if @gm advertises @subflavor in its gm_sfs table, else 0. */
+static inline int
+mech_supports_subflavor(struct gss_api_mech *gm, __u32 subflavor)
+{
+        int i;
+
+        for (i = 0; i < gm->gm_sf_num; i++) {
+                if (gm->gm_sfs[i].subflavor == subflavor)
+                        return 1;
+        }
+        return 0;
+}
+
+/* Look up a registered mechanism supporting @subflavor.  On success a
+ * module reference is held on the result (drop with kgss_mech_put());
+ * returns NULL if none found. */
+struct gss_api_mech *
+kgss_subflavor_to_mech(__u32 subflavor)
+{
+        struct gss_api_mech *pos, *gm = NULL;
+
+        spin_lock(&registered_mechs_lock);
+        list_for_each_entry(pos, &registered_mechs, gm_list) {
+                /* pin the module first, drop it again on mismatch */
+                if (!try_module_get(pos->gm_owner))
+                        continue;
+                if (!mech_supports_subflavor(pos, subflavor)) {
+                        module_put(pos->gm_owner);
+                        continue;
+                }
+                gm = pos;
+                break;
+        }
+        spin_unlock(&registered_mechs_lock);
+        return gm;
+}
+
+//EXPORT_SYMBOL(gss_subflavor_to_mech);
+
+/* Drop the module reference taken by kgss_mech_get()/lookup helpers. */
+void
+kgss_mech_put(struct gss_api_mech *gm)
+{
+        module_put(gm->gm_owner);
+}
+
+//EXPORT_SYMBOL(kgss_mech_put);
+
+/* The mech could probably be determined from the token instead, but it's just
+ * as easy for now to pass it in. */
+__u32
+kgss_import_sec_context(rawobj_t *input_token,
+                        struct gss_api_mech *mech,
+                        struct gss_ctx **ctx_id)
+{
+        /* validate @mech before first use (was asserted only after) */
+        LASSERT(mech);
+        LASSERT(mech->gm_ops);
+        LASSERT(mech->gm_ops->gss_import_sec_context);
+        OBD_ALLOC(*ctx_id, sizeof(**ctx_id));
+        if (*ctx_id == NULL)
+                return GSS_S_FAILURE;
+
+        (*ctx_id)->mech_type = kgss_mech_get(mech);
+
+        return mech->gm_ops->gss_import_sec_context(input_token, *ctx_id);
+}
+
+/*
+ * this interface is much simplified, currently we only need endtime.
+ */
+/* Simplified gss_inquire_context: only reports @endtime, delegated to
+ * the mechanism's gss_inquire_context operation. */
+__u32
+kgss_inquire_context(struct gss_ctx *context_handle,
+                     __u64 *endtime)
+{
+        LASSERT(context_handle);
+        LASSERT(context_handle->mech_type);
+        LASSERT(context_handle->mech_type->gm_ops);
+        LASSERT(context_handle->mech_type->gm_ops->gss_inquire_context);
+
+        return context_handle->mech_type->gm_ops
+                ->gss_inquire_context(context_handle,
+                                      endtime);
+}
+
+/* gss_get_mic: compute a mic over message and return mic_token. */
+/* gss_get_mic: compute a mic over message and return mic_token.
+ * Pure dispatch to the context's mechanism operation. */
+__u32
+kgss_get_mic(struct gss_ctx *context_handle,
+             __u32 qop,
+             rawobj_t *message,
+             rawobj_t *mic_token)
+{
+        LASSERT(context_handle);
+        LASSERT(context_handle->mech_type);
+        LASSERT(context_handle->mech_type->gm_ops);
+        LASSERT(context_handle->mech_type->gm_ops->gss_get_mic);
+
+        return context_handle->mech_type->gm_ops
+                ->gss_get_mic(context_handle,
+                              qop,
+                              message,
+                              mic_token);
+}
+
+/* gss_verify_mic: check whether the provided mic_token verifies message. */
+/* gss_verify_mic: check whether the provided mic_token verifies message.
+ * Pure dispatch to the context's mechanism operation. */
+__u32
+kgss_verify_mic(struct gss_ctx *context_handle,
+                rawobj_t *message,
+                rawobj_t *mic_token,
+                __u32 *qstate)
+{
+        LASSERT(context_handle);
+        LASSERT(context_handle->mech_type);
+        LASSERT(context_handle->mech_type->gm_ops);
+        LASSERT(context_handle->mech_type->gm_ops->gss_verify_mic);
+
+        return context_handle->mech_type->gm_ops
+                ->gss_verify_mic(context_handle,
+                                 message,
+                                 mic_token,
+                                 qstate);
+}
+
+/* gss_wrap: encrypt + integrity-protect @inbuf into @outbuf.
+ * Pure dispatch to the context's mechanism operation. */
+__u32
+kgss_wrap(struct gss_ctx *context_handle,
+          __u32 qop,
+          rawobj_buf_t *inbuf,
+          rawobj_t *outbuf)
+{
+        LASSERT(context_handle);
+        LASSERT(context_handle->mech_type);
+        LASSERT(context_handle->mech_type->gm_ops);
+        LASSERT(context_handle->mech_type->gm_ops->gss_wrap);
+
+        return context_handle->mech_type->gm_ops
+                ->gss_wrap(context_handle, qop, inbuf, outbuf);
+}
+
+/* gss_unwrap: decrypt and verify @inbuf into @outbuf.
+ * Pure dispatch to the context's mechanism operation. */
+__u32
+kgss_unwrap(struct gss_ctx *context_handle,
+            __u32 qop,
+            rawobj_t *inbuf,
+            rawobj_t *outbuf)
+{
+        LASSERT(context_handle);
+        LASSERT(context_handle->mech_type);
+        LASSERT(context_handle->mech_type->gm_ops);
+        LASSERT(context_handle->mech_type->gm_ops->gss_unwrap);
+
+        return context_handle->mech_type->gm_ops
+                ->gss_unwrap(context_handle, qop, inbuf, outbuf);
+}
+
+
+/* gss_delete_sec_context: free all resources associated with context_handle.
+ * Note this differs from the RFC 2744-specified prototype in that we don't
+ * bother returning an output token, since it would never be used anyway. */
+
+/* Free all resources of *@context_handle: the mechanism's internal
+ * context (if any), the mechanism module reference, and the handle
+ * itself; *@context_handle is NULLed.  Unlike rfc 2744 no output
+ * token is produced.  Returns GSS_S_COMPLETE or GSS_S_NO_CONTEXT. */
+__u32
+kgss_delete_sec_context(struct gss_ctx **context_handle)
+{
+        struct gss_api_mech *mech;
+
+        CDEBUG(D_SEC, "deleting %p\n", *context_handle);
+
+        if (!*context_handle)
+                return(GSS_S_NO_CONTEXT);
+
+        mech = (*context_handle)->mech_type;
+        if ((*context_handle)->internal_ctx_id != 0) {
+                LASSERT(mech);
+                LASSERT(mech->gm_ops);
+                LASSERT(mech->gm_ops->gss_delete_sec_context);
+                mech->gm_ops->gss_delete_sec_context(
+                        (*context_handle)->internal_ctx_id);
+        }
+        if (mech)
+                kgss_mech_put(mech);
+
+        OBD_FREE(*context_handle, sizeof(**context_handle));
+        *context_handle=NULL;
+        return GSS_S_COMPLETE;
+}
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_SEC
+#ifdef __KERNEL__
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/crypto.h>
+#else
+#include <liblustre.h>
+#endif
+
+#include <libcfs/kp30.h>
+#include <linux/obd.h>
+#include <linux/obd_class.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_idl.h>
+#include <linux/lustre_sec.h>
+
+#include "gss_internal.h"
+
+/* Initialize @obj as an owned copy of @len bytes from @buf (len == 0
+ * yields an empty object with NULL data).  Free with rawobj_free().
+ * Returns 0 or -ENOMEM. */
+int rawobj_alloc(rawobj_t *obj, char *buf, int len)
+{
+        LASSERT(obj);
+        LASSERT(len >= 0);
+
+        obj->len = len;
+        if (len) {
+                OBD_ALLOC(obj->data, len);
+                if (!obj->data)
+                        RETURN(-ENOMEM);
+                memcpy(obj->data, buf, len);
+        } else
+                obj->data = NULL;
+        return 0;
+}
+
+/* Release obj->data (from rawobj_alloc/rawobj_dup) and reset obj to empty. */
+void rawobj_free(rawobj_t *obj)
+{
+ LASSERT(obj);
+
+ if (obj->len) {
+ LASSERT(obj->data);
+ OBD_FREE(obj->data, obj->len);
+ obj->len = 0;
+ obj->data = NULL;
+ } else
+ LASSERT(!obj->data);
+}
+
+/* Return 1 if a and b have identical length and bytes, else 0.
+ * (memcmp with len 0 is fine for two empty objects with NULL data.) */
+int rawobj_equal(rawobj_t *a, rawobj_t *b)
+{
+ LASSERT(a && b);
+
+ return (a->len == b->len &&
+ !memcmp(a->data, b->data, a->len));
+}
+
+/* Deep-copy src into dest.  Returns 0 or -ENOMEM; dest is freed with
+ * rawobj_free(). */
+int rawobj_dup(rawobj_t *dest, rawobj_t *src)
+{
+ LASSERT(src && dest);
+
+ dest->len = src->len;
+ if (dest->len) {
+ OBD_ALLOC(dest->data, dest->len);
+ if (!dest->data)
+ return -ENOMEM;
+ memcpy(dest->data, src->data, dest->len);
+ } else
+ dest->data = NULL;
+ return 0;
+}
+
+/* Wire-encode obj into *buf as le32 length followed by the data, padded
+ * to a 4-byte boundary.  Advances *buf and decrements *buflen by
+ * 4 + size_round4(obj->len).  Returns 0, or -EINVAL if *buflen is too
+ * small.
+ * NOTE(review): the 0-3 padding bytes after obj->data are not zeroed
+ * here, so whatever was in the buffer goes on the wire -- confirm all
+ * callers pre-zero their request buffers. */
+int rawobj_serialize(rawobj_t *obj, __u32 **buf, __u32 *buflen)
+{
+ __u32 len;
+
+ LASSERT(obj);
+ LASSERT(buf);
+ LASSERT(buflen);
+
+ len = size_round4(obj->len);
+
+ if (*buflen < 4 + len) {
+ CERROR("buflen %u < %u\n", *buflen, 4 + len);
+ return -EINVAL;
+ }
+
+ *(*buf)++ = cpu_to_le32(obj->len);
+ memcpy(*buf, obj->data, obj->len);
+ *buf += (len >> 2);
+ *buflen -= (4 + len);
+
+ return 0;
+}
+
+/* Decode one rawobj from *buf: a 4-byte length (little-endian unless
+ * 'local'), then the data.  With 'alloc' the data is copied into a new
+ * buffer, otherwise obj->data points into *buf.  'local' also means the
+ * data is not padded to a 4-byte boundary.  Advances *buf / *buflen.
+ * Returns 0, -EINVAL on short buffer, -ENOMEM on allocation failure. */
+static int __rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen,
+ int alloc, int local)
+{
+ __u32 len;
+
+ if (*buflen < sizeof(__u32)) {
+ CERROR("buflen %u\n", *buflen);
+ return -EINVAL;
+ }
+
+ obj->len = *(*buf)++;
+ if (!local)
+ obj->len = le32_to_cpu(obj->len);
+ *buflen -= sizeof(__u32);
+
+ if (!obj->len) {
+ obj->data = NULL;
+ return 0;
+ }
+
+ len = local ? obj->len : size_round4(obj->len);
+ if (*buflen < len) {
+ CERROR("buflen %u < %u\n", *buflen, len);
+ return -EINVAL;
+ }
+
+ if (!alloc)
+ obj->data = (__u8 *) *buf;
+ else {
+ OBD_ALLOC(obj->data, obj->len);
+ if (!obj->data) {
+ CERROR("fail to alloc %u bytes\n", obj->len);
+ return -ENOMEM;
+ }
+ memcpy(obj->data, *buf, obj->len);
+ }
+
+ /* advance by bytes, not __u32s: len may not be a multiple of 4
+ * in the 'local' case */
+ *((char **)buf) += len;
+ *buflen -= len;
+
+ return 0;
+}
+
+/* Wire decode: le32 length, 4-byte padded data, no copy (data aliases buf). */
+int rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen)
+{
+ return __rawobj_extract(obj, buf, buflen, 0, 0);
+}
+
+/* Local decode: host-order length, unpadded data, no copy. */
+int rawobj_extract_local(rawobj_t *obj, __u32 **buf, __u32 *buflen)
+{
+ return __rawobj_extract(obj, buf, buflen, 0, 1);
+}
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Modifications for Lustre
+ * Copyright 2004, Cluster File Systems, Inc.
+ * All rights reserved
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ * linux/net/sunrpc/auth_gss.c
+ *
+ * RPCSEC_GSS client authentication.
+ *
+ * Copyright (c) 2000 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Dug Song <dugsong@monkey.org>
+ * Andy Adamson <andros@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $Id: sec_gss.c,v 1.2 2005/03/31 22:18:24 ericm Exp $
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_SEC
+#ifdef __KERNEL__
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/random.h>
+/* for rpc_pipefs */
+struct rpc_clnt;
+#include <linux/sunrpc/rpc_pipe_fs.h>
+#else
+#include <liblustre.h>
+#endif
+
+#include <libcfs/kp30.h>
+#include <linux/obd.h>
+#include <linux/obd_class.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_idl.h>
+#include <linux/lustre_net.h>
+#include <linux/lustre_import.h>
+#include <linux/lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+#define GSS_CREDCACHE_EXPIRE (60) /* 1 minute */
+#define GSS_CRED_EXPIRE (8 * 60 * 60) /* 8 hours */
+#define GSS_CRED_SIGN_SIZE (1024)
+#define GSS_CRED_VERIFY_SIZE (56)
+
+#define LUSTRE_PIPEDIR "/lustre"
+
+/**********************************************
+ * gss security init/fini helper *
+ **********************************************/
+
+#define SECINIT_RPC_TIMEOUT (10)
+#define SECFINI_RPC_TIMEOUT (10)
+
+/* Build a SEC_INIT request in buf from a userspace token descriptor
+ * ('token' points to a __u64 size followed by a user pointer to the
+ * token bytes).  Layout: ptlrpcs_wire_hdr, lustre_msg, 6-word gss
+ * header, then le64 token size and the raw token.  Returns the rounded
+ * request length, or -EFAULT/-EINVAL. */
+static int secinit_compose_request(struct obd_import *imp,
+ char *buf, int bufsize,
+ char __user *token)
+{
+ struct ptlrpcs_wire_hdr *hdr;
+ struct lustre_msg *lmsg;
+ char __user *token_buf;
+ __u64 token_size;
+ __u32 lmsg_size, *p;
+ int rc;
+
+ lmsg_size = lustre_msg_size(0, NULL);
+
+ if (copy_from_user(&token_size, token, sizeof(token_size))) {
+ CERROR("read token error\n");
+ return -EFAULT;
+ }
+ if (sizeof(*hdr) + lmsg_size + size_round(token_size) > bufsize) {
+ CERROR("token size "LPU64" too large\n", token_size);
+ return -EINVAL;
+ }
+
+ if (copy_from_user(&token_buf, (token + sizeof(token_size)),
+ sizeof(void*))) {
+ CERROR("read token buf pointer error\n");
+ return -EFAULT;
+ }
+
+ /* security wire hdr */
+ hdr = buf_to_sec_hdr(buf);
+ hdr->flavor = cpu_to_le32(PTLRPC_SEC_GSS);
+ hdr->sectype = cpu_to_le32(PTLRPC_SEC_TYPE_NONE);
+ hdr->msg_len = cpu_to_le32(lmsg_size);
+ hdr->sec_len = cpu_to_le32(7 * 4 + token_size);
+
+ /* lustre message */
+ lmsg = buf_to_lustre_msg(buf);
+ lustre_init_msg(lmsg, 0, NULL, NULL);
+ lmsg->handle = imp->imp_remote_handle;
+ lmsg->type = PTL_RPC_MSG_REQUEST;
+ lmsg->opc = SEC_INIT;
+ lmsg->flags = 0;
+ lmsg->conn_cnt = imp->imp_conn_cnt;
+
+ p = (__u32 *) (buf + sizeof(*hdr) + lmsg_size);
+
+ /* gss hdr */
+ *p++ = cpu_to_le32(PTLRPC_SEC_GSS_VERSION); /* gss version */
+ *p++ = cpu_to_le32(PTLRPC_SEC_GSS_KRB5I); /* subflavor */
+ *p++ = cpu_to_le32(PTLRPC_GSS_PROC_INIT); /* proc */
+ *p++ = cpu_to_le32(0); /* seq */
+ *p++ = cpu_to_le32(PTLRPC_GSS_SVC_NONE); /* service */
+ *p++ = cpu_to_le32(0); /* context handle */
+
+ /* now the token part */
+ /* NOTE(review): truncating cpu_to_le64(token_size) to 32 bits only
+ * produces the size on little-endian hosts; on big-endian this
+ * takes the wrong half.  Looks like it should be
+ * cpu_to_le32((__u32)token_size) -- confirm against the server's
+ * decoder. */
+ *p++ = (__u32)(cpu_to_le64(token_size));
+ LASSERT(((char *)p - buf) + token_size <= bufsize);
+
+ rc = copy_from_user(p, token_buf, token_size);
+ if (rc) {
+ CERROR("can't copy token\n");
+ return -EFAULT;
+ }
+
+ rc = size_round(((char *)p - buf) + token_size);
+ return rc;
+}
+
+/* Parse the SEC_INIT reply in repbuf and copy status/major/minor/seq
+ * plus two length-prefixed objects (context handle and reply token) out
+ * to the userspace buffer.  Returns the number of bytes written to
+ * outbuf, or -EINVAL on a malformed reply.
+ * NOTE(review): copy_to_user() return values are ignored throughout --
+ * a faulting user buffer is silently accepted.
+ * NOTE(review): obj_len/round_len come from the wire and are only
+ * bounded collectively by the sec_len <= outlen check above; each is
+ * not checked against the bytes remaining in repbuf, so a corrupt
+ * reply could cause an overread -- verify. */
+static int secinit_parse_reply(char *repbuf, int replen,
+ char __user *outbuf, int outlen)
+{
+ __u32 *p = (__u32 *)repbuf;
+ __u32 lmsg_len, sec_len, status, major, minor, seq, obj_len, round_len;
+ __u32 effective = 0;
+
+ if (replen <= (4 + 6) * 4) {
+ CERROR("reply size %d too small\n", replen);
+ return -EINVAL;
+ }
+
+ lmsg_len = le32_to_cpu(p[2]);
+ sec_len = le32_to_cpu(p[3]);
+
+ /* sanity checks */
+ if (p[0] != cpu_to_le32(PTLRPC_SEC_GSS) ||
+ p[1] != cpu_to_le32(PTLRPC_SEC_TYPE_NONE)) {
+ CERROR("unexpected reply\n");
+ return -EINVAL;
+ }
+ if (lmsg_len % 8 ||
+ 4 * 4 + lmsg_len + sec_len > replen) {
+ CERROR("unexpected reply\n");
+ return -EINVAL;
+ }
+ if (sec_len > outlen) {
+ CERROR("outbuf too small\n");
+ return -EINVAL;
+ }
+
+ p += 4; /* skip hdr */
+ p += lmsg_len / 4; /* skip lmsg */
+ effective = 0;
+
+ status = le32_to_cpu(*p++);
+ major = le32_to_cpu(*p++);
+ minor = le32_to_cpu(*p++);
+ seq = le32_to_cpu(*p++);
+ effective += 4 * 4;
+
+ copy_to_user(outbuf, &status, 4);
+ outbuf += 4;
+ copy_to_user(outbuf, &major, 4);
+ outbuf += 4;
+ copy_to_user(outbuf, &minor, 4);
+ outbuf += 4;
+ copy_to_user(outbuf, &seq, 4);
+ outbuf += 4;
+
+ /* first object: context handle */
+ obj_len = le32_to_cpu(*p++);
+ round_len = (obj_len + 3) & ~ 3;
+ copy_to_user(outbuf, &obj_len, 4);
+ outbuf += 4;
+ copy_to_user(outbuf, (char *)p, round_len);
+ p += round_len / 4;
+ outbuf += round_len;
+ effective += 4 + round_len;
+
+ /* second object: reply token */
+ obj_len = le32_to_cpu(*p++);
+ round_len = (obj_len + 3) & ~ 3;
+ copy_to_user(outbuf, &obj_len, 4);
+ outbuf += 4;
+ copy_to_user(outbuf, (char *)p, round_len);
+ p += round_len / 4;
+ outbuf += round_len;
+ effective += 4 + round_len;
+
+ return effective;
+}
+
+/* input:
+ * 1. ptr to uuid
+ * 2. ptr to send_token
+ * 3. ptr to output buffer
+ * 4. output buffer size
+ * output:
+ * 1. return code. 0 is success
+ * 2. no meaning
+ * 3. ptr output data
+ * 4. output data size
+ *
+ * return:
+ * < 0: error
+ * = 0: success
+ *
+ * FIXME This interface looks strange, should be reimplemented
+ */
+/* Downcall entry from userspace gssd: 'buffer' holds 4 longs (see the
+ * comment above).  Looks up the named mdc/osc obd, composes a SEC_INIT
+ * raw RPC with the supplied token, sends it synchronously and copies
+ * the parsed reply back to userspace.  Returns 0 or negative errno. */
+static int gss_send_secinit_rpc(__user char *buffer, unsigned long count)
+{
+ struct obd_import *imp;
+ const int reqbuf_size = 1024;
+ const int repbuf_size = 1024;
+ char *reqbuf, *repbuf;
+ struct obd_device *obd;
+ char obdname[64];
+ long inbuf[4], lsize;
+ int rc, reqlen, replen;
+
+ if (count != 4 * sizeof(long)) {
+ CERROR("count %lu\n", count);
+ RETURN(-EINVAL);
+ }
+ if (copy_from_user(inbuf, buffer, count)) {
+ CERROR("Invalid pointer\n");
+ RETURN(-EFAULT);
+ }
+
+ /* take name */
+ if (strncpy_from_user(obdname, (char *)inbuf[0],
+ sizeof(obdname)) <= 0) {
+ CERROR("Invalid obdname pointer\n");
+ RETURN(-EFAULT);
+ }
+
+ obd = class_name2obd(obdname);
+ if (!obd) {
+ CERROR("no such obd %s\n", obdname);
+ RETURN(-EINVAL);
+ }
+ if (strcmp(obd->obd_type->typ_name, "mdc") &&
+ strcmp(obd->obd_type->typ_name, "osc")) {
+ CERROR("%s not a mdc/osc device\n", obdname);
+ RETURN(-EINVAL);
+ }
+
+ imp = class_import_get(obd->u.cli.cl_import);
+
+ OBD_ALLOC(reqbuf, reqbuf_size);
+ /* NOTE(review): repbuf is allocated with reqbuf_size but freed
+ * below with repbuf_size.  Both are 1024 today, but the alloc and
+ * free sizes should use the same constant. */
+ OBD_ALLOC(repbuf, reqbuf_size);
+
+ if (!reqbuf || !repbuf) {
+ CERROR("Can't alloc buffer: %p/%p\n", reqbuf, repbuf);
+ GOTO(out_free, rc = -ENOMEM);
+ }
+
+ /* get token */
+ reqlen = secinit_compose_request(imp, reqbuf, reqbuf_size,
+ (char *)inbuf[1]);
+ if (reqlen < 0)
+ GOTO(out_free, rc = reqlen);
+
+ replen = repbuf_size;
+ rc = ptlrpc_do_rawrpc(imp, reqbuf, reqlen,
+ repbuf, &replen, SECINIT_RPC_TIMEOUT);
+ if (rc)
+ GOTO(out_free, rc);
+
+ if (replen > inbuf[3]) {
+ CERROR("output buffer size %ld too small, need %d\n",
+ inbuf[3], replen);
+ GOTO(out_free, rc = -EINVAL);
+ }
+
+ lsize = secinit_parse_reply(repbuf, replen,
+ (char *)inbuf[2], (int)inbuf[3]);
+ if (lsize < 0)
+ GOTO(out_free, rc = (int)lsize);
+
+ /* report output size in slot 3 and zero the status slot 0;
+ * NOTE(review): copy_to_user results are ignored here */
+ copy_to_user(buffer + 3 * sizeof(long), &lsize, sizeof(lsize));
+ lsize = 0;
+ copy_to_user((char*)buffer, &lsize, sizeof(lsize));
+ rc = 0;
+out_free:
+ class_import_put(imp);
+ if (repbuf)
+ OBD_FREE(repbuf, repbuf_size);
+ if (reqbuf)
+ OBD_FREE(reqbuf, reqbuf_size);
+ RETURN(rc);
+}
+
+/* Send a pre-built SEC_FINI (context destroy) raw RPC; the reply body
+ * is discarded.  Returns 0 or negative errno. */
+static int gss_send_secfini_rpc(struct obd_import *imp,
+ char *reqbuf, int reqlen)
+{
+ const int repbuf_size = 1024;
+ char *repbuf;
+ int replen = repbuf_size;
+ int rc;
+
+ OBD_ALLOC(repbuf, repbuf_size);
+ if (!repbuf) {
+ CERROR("Out of memory\n");
+ return -ENOMEM;
+ }
+
+ rc = ptlrpc_do_rawrpc(imp, reqbuf, reqlen, repbuf, &replen,
+ SECFINI_RPC_TIMEOUT);
+
+ OBD_FREE(repbuf, repbuf_size);
+ return rc;
+}
+
+/**********************************************
+ * structure definitions *
+ **********************************************/
+/* GSS security instance: embeds the generic ptlrpc_sec and pins the gss
+ * mechanism.  In-kernel it also owns the rpc_pipefs upcall pipe
+ * (gs_depipe at gs_pipepath) and the list of in-flight upcall messages
+ * (gs_upcalls), both protected by gs_lock. */
+struct gss_sec {
+ struct ptlrpc_sec gs_base;
+ struct gss_api_mech *gs_mech;
+#ifdef __KERNEL__
+ spinlock_t gs_lock;
+ struct list_head gs_upcalls;
+ char gs_pipepath[64];
+ struct dentry *gs_depipe;
+#endif
+};
+
+/* protects gc_ctx pointers inside all gss_creds */
+static rwlock_t gss_ctx_lock = RW_LOCK_UNLOCKED;
+
+#ifdef __KERNEL__
+
+/* One pending gssd upcall, hashed on gs_upcalls and queued to rpc_pipefs
+ * via gum_base.  Refcounted (gum_refcount); waiters sleep on gum_waitq
+ * until the downcall unhashes the message.  The payload sent upward is
+ * gum_uid/gum_ip/gum_svc/gum_pad (see gss_init_upcall_msg). */
+struct gss_upcall_msg {
+ struct rpc_pipe_msg gum_base;
+ atomic_t gum_refcount;
+ struct list_head gum_list;
+ struct gss_sec *gum_gsec;
+ wait_queue_head_t gum_waitq;
+ char gum_obdname[64];
+ uid_t gum_uid;
+ __u32 gum_ip; /* XXX IPv6? */
+ __u32 gum_svc;
+ __u32 gum_pad;
+};
+
+/**********************************************
+ * rpc_pipe upcall helpers *
+ **********************************************/
+/* Drop one reference on gmsg; frees it on the last put.  The message
+ * must already be unhashed (gum_list empty) when freed. */
+static
+void gss_release_msg(struct gss_upcall_msg *gmsg)
+{
+ ENTRY;
+ LASSERT(atomic_read(&gmsg->gum_refcount) > 0);
+
+ if (!atomic_dec_and_test(&gmsg->gum_refcount)) {
+ CDEBUG(D_SEC, "gmsg %p ref %d\n", gmsg,
+ atomic_read(&gmsg->gum_refcount));
+ EXIT;
+ return;
+ }
+ LASSERT(list_empty(&gmsg->gum_list));
+ OBD_FREE(gmsg, sizeof(*gmsg));
+ EXIT;
+}
+
+/* Remove gmsg from the upcall hash and the rpc_pipe queue, wake all
+ * waiters and drop the hash reference.  Caller must hold
+ * gmsg->gum_gsec->gs_lock; no-op if already unhashed. */
+static void
+gss_unhash_msg_nolock(struct gss_upcall_msg *gmsg)
+{
+ ENTRY;
+ if (list_empty(&gmsg->gum_list)) {
+ EXIT;
+ return;
+ }
+ /* FIXME should not do this. when we in upper upcall queue,
+ * downcall will call unhash_msg, thus later put_msg might
+ * free msg buffer while it's not dequeued XXX */
+ list_del_init(&gmsg->gum_base.list);
+ /* FIXME */
+
+ list_del_init(&gmsg->gum_list);
+ wake_up(&gmsg->gum_waitq);
+ atomic_dec(&gmsg->gum_refcount);
+ CDEBUG(D_SEC, "gmsg %p refcount now %d\n",
+ gmsg, atomic_read(&gmsg->gum_refcount));
+ LASSERT(atomic_read(&gmsg->gum_refcount) > 0);
+ EXIT;
+}
+
+/* Locked wrapper around gss_unhash_msg_nolock(). */
+static void
+gss_unhash_msg(struct gss_upcall_msg *gmsg)
+{
+ struct gss_sec *gsec = gmsg->gum_gsec;
+
+ spin_lock(&gsec->gs_lock);
+ gss_unhash_msg_nolock(gmsg);
+ spin_unlock(&gsec->gs_lock);
+}
+
+/* Find a pending upcall matching (obdname, uid, dest_ip) and take a
+ * reference on it.  Caller must hold gsec->gs_lock.  Returns NULL if
+ * no match. */
+static
+struct gss_upcall_msg * gss_find_upcall(struct gss_sec *gsec,
+ char *obdname,
+ uid_t uid, __u32 dest_ip)
+{
+ struct gss_upcall_msg *gmsg;
+ ENTRY;
+
+ list_for_each_entry(gmsg, &gsec->gs_upcalls, gum_list) {
+ if (gmsg->gum_uid != uid)
+ continue;
+ if (gmsg->gum_ip != dest_ip)
+ continue;
+ if (strcmp(gmsg->gum_obdname, obdname))
+ continue;
+ atomic_inc(&gmsg->gum_refcount);
+ CDEBUG(D_SEC, "found gmsg at %p: obdname %s, uid %d, ref %d\n",
+ gmsg, obdname, uid, atomic_read(&gmsg->gum_refcount));
+ RETURN(gmsg);
+ }
+ RETURN(NULL);
+}
+
+/* Initialize gmsg, hash it on gsec->gs_upcalls and prepare the rpc_pipe
+ * payload (uid/ip/svc/pad words starting at gum_uid).  Caller holds
+ * gsec->gs_lock (needed for the list_add). */
+static void gss_init_upcall_msg(struct gss_upcall_msg *gmsg,
+ struct gss_sec *gsec,
+ char *obdname,
+ uid_t uid, __u32 dest_ip, __u32 svc)
+{
+ struct rpc_pipe_msg *rpcmsg;
+ ENTRY;
+
+ /* 2 refs: 1 for hash, 1 for current user */
+ init_waitqueue_head(&gmsg->gum_waitq);
+ list_add(&gmsg->gum_list, &gsec->gs_upcalls);
+ atomic_set(&gmsg->gum_refcount, 2);
+ gmsg->gum_gsec = gsec;
+ strncpy(gmsg->gum_obdname, obdname, sizeof(gmsg->gum_obdname));
+ gmsg->gum_uid = uid;
+ gmsg->gum_ip = dest_ip;
+ gmsg->gum_svc = svc;
+
+ /* upcall payload relies on the four fields being contiguous */
+ rpcmsg = &gmsg->gum_base;
+ rpcmsg->data = &gmsg->gum_uid;
+ rpcmsg->len = sizeof(gmsg->gum_uid) + sizeof(gmsg->gum_ip) +
+ sizeof(gmsg->gum_svc) + sizeof(gmsg->gum_pad);
+ EXIT;
+}
+#endif /* __KERNEL__ */
+
+/********************************************
+ * gss cred manupulation helpers *
+ ********************************************/
+/* Return 1 if the cred is marked up-to-date AND carries a gss context
+ * (checked under gss_ctx_lock), else 0. */
+static
+int gss_cred_is_uptodate_ctx(struct ptlrpc_cred *cred)
+{
+ struct gss_cred *gcred = container_of(cred, struct gss_cred, gc_base);
+ int res = 0;
+
+ read_lock(&gss_ctx_lock);
+ if ((cred->pc_flags & PTLRPC_CRED_UPTODATE) && gcred->gc_ctx)
+ res = 1;
+ read_unlock(&gss_ctx_lock);
+ return res;
+}
+
+/* Take a reference on ctx; release with gss_put_ctx(). */
+static inline
+struct gss_cl_ctx * gss_get_ctx(struct gss_cl_ctx *ctx)
+{
+ atomic_inc(&ctx->gc_refcount);
+ return ctx;
+}
+
+/* Free a client gss context: drop the mechanism context and the wire
+ * handle, then the ctx itself.  Called only from gss_put_ctx() when the
+ * refcount reaches zero. */
+static
+void gss_destroy_ctx(struct gss_cl_ctx *ctx)
+{
+ ENTRY;
+
+ CDEBUG(D_SEC, "destroy cl_ctx %p\n", ctx);
+ if (ctx->gc_gss_ctx)
+ kgss_delete_sec_context(&ctx->gc_gss_ctx);
+
+ if (ctx->gc_wire_ctx.len > 0) {
+ OBD_FREE(ctx->gc_wire_ctx.data, ctx->gc_wire_ctx.len);
+ ctx->gc_wire_ctx.len = 0;
+ }
+
+ OBD_FREE(ctx, sizeof(*ctx));
+}
+
+/* Drop a reference taken by gss_get_ctx(); destroys ctx on last put. */
+static
+void gss_put_ctx(struct gss_cl_ctx *ctx)
+{
+ if (atomic_dec_and_test(&ctx->gc_refcount))
+ gss_destroy_ctx(ctx);
+}
+
+/* Return a referenced pointer to the cred's current gss context, or
+ * NULL if none is installed.  Caller must gss_put_ctx() the result. */
+static
+struct gss_cl_ctx *gss_cred_get_ctx(struct ptlrpc_cred *cred)
+{
+ struct gss_cred *gcred = container_of(cred, struct gss_cred, gc_base);
+ struct gss_cl_ctx *ctx = NULL;
+
+ read_lock(&gss_ctx_lock);
+ if (gcred->gc_ctx)
+ ctx = gss_get_ctx(gcred->gc_ctx);
+ read_unlock(&gss_ctx_lock);
+ return ctx;
+}
+
+/* Install ctx as the cred's context (consuming the caller's reference),
+ * set the cred's expiry from the gss context and mark it up-to-date.
+ * Any previously installed context is released. */
+static
+void gss_cred_set_ctx(struct ptlrpc_cred *cred, struct gss_cl_ctx *ctx)
+{
+ struct gss_cred *gcred = container_of(cred, struct gss_cred, gc_base);
+ struct gss_cl_ctx *old;
+ __u64 ctx_expiry;
+ ENTRY;
+
+ if (kgss_inquire_context(ctx->gc_gss_ctx, &ctx_expiry)) {
+ CERROR("unable to get expire time\n");
+ ctx_expiry = 1; /* make it expired now */
+ }
+ cred->pc_expire = (unsigned long) ctx_expiry;
+
+ write_lock(&gss_ctx_lock);
+ old = gcred->gc_ctx;
+ gcred->gc_ctx = ctx;
+ cred->pc_flags |= PTLRPC_CRED_UPTODATE;
+ write_unlock(&gss_ctx_lock);
+ if (old)
+ gss_put_ctx(old);
+
+ CWARN("client refreshed gss cred %p(uid %u)\n", cred, cred->pc_uid);
+ EXIT;
+}
+
+/* Copy reslen bytes out of *buf into res, advancing *buf and shrinking
+ * *buflen.  Returns 0, or -EINVAL if fewer than reslen bytes remain. */
+static int
+simple_get_bytes(char **buf, __u32 *buflen, void *res, __u32 reslen)
+{
+ if (*buflen < reslen) {
+ CERROR("buflen %u < %u\n", *buflen, reslen);
+ return -EINVAL;
+ }
+
+ memcpy(res, *buf, reslen);
+ *buf += reslen;
+ *buflen -= reslen;
+ return 0;
+}
+
+/* data passed down:
+ * - uid
+ * - timeout
+ * - gc_win / error
+ * - wire_ctx (rawobj)
+ * - mech_ctx? (rawobj)
+ */
+/* Parse the gssd init downcall in buf into a new gss_cl_ctx (*gc),
+ * filling vcred/dest_ip from the stream.  gc_win == 0 signals a gssd
+ * error; the next int is then stored in *gss_err and 0 is returned with
+ * *gc left NULL.  On success the mech context is imported and *gc is
+ * set.  Returns 0 or negative errno; on error the partial ctx is freed. */
+static
+int gss_parse_init_downcall(struct gss_api_mech *gm, rawobj_t *buf,
+ struct gss_cl_ctx **gc, struct vfs_cred *vcred,
+ __u32 *dest_ip, int *gss_err)
+{
+ char *p = buf->data;
+ __u32 len = buf->len;
+ struct gss_cl_ctx *ctx;
+ rawobj_t tmp_buf;
+ unsigned int timeout;
+ int err = -EIO;
+ ENTRY;
+
+ *gc = NULL;
+
+ OBD_ALLOC(ctx, sizeof(*ctx));
+ if (!ctx)
+ RETURN(-ENOMEM);
+
+ ctx->gc_proc = RPC_GSS_PROC_DATA;
+ ctx->gc_seq = 0;
+ spin_lock_init(&ctx->gc_seq_lock);
+ atomic_set(&ctx->gc_refcount,1);
+
+ if (simple_get_bytes(&p, &len, &vcred->vc_uid, sizeof(vcred->vc_uid)))
+ GOTO(err_free_ctx, err);
+ vcred->vc_pag = vcred->vc_uid; /* FIXME */
+ if (simple_get_bytes(&p, &len, dest_ip, sizeof(*dest_ip)))
+ GOTO(err_free_ctx, err);
+ /* FIXME: discarded timeout for now */
+ if (simple_get_bytes(&p, &len, &timeout, sizeof(timeout)))
+ GOTO(err_free_ctx, err);
+ *gss_err = 0;
+ if (simple_get_bytes(&p, &len, &ctx->gc_win, sizeof(ctx->gc_win)))
+ GOTO(err_free_ctx, err);
+ /* gssd signals an error by passing ctx->gc_win = 0: */
+ if (!ctx->gc_win) {
+ /* in which case the next int is an error code: */
+ if (simple_get_bytes(&p, &len, gss_err, sizeof(*gss_err)))
+ GOTO(err_free_ctx, err);
+ GOTO(err_free_ctx, err = 0);
+ }
+ /* wire handle, then the exported mech context blob */
+ if (rawobj_extract_local(&tmp_buf, (__u32 **) &p, &len))
+ GOTO(err_free_ctx, err);
+ if (rawobj_dup(&ctx->gc_wire_ctx, &tmp_buf)) {
+ GOTO(err_free_ctx, err = -ENOMEM);
+ }
+ if (rawobj_extract_local(&tmp_buf, (__u32 **) &p, &len))
+ GOTO(err_free_wire_ctx, err);
+ if (len) {
+ CERROR("unexpected trailing %u bytes\n", len);
+ GOTO(err_free_wire_ctx, err);
+ }
+ if (kgss_import_sec_context(&tmp_buf, gm, &ctx->gc_gss_ctx))
+ GOTO(err_free_wire_ctx, err);
+
+ *gc = ctx;
+ RETURN(0);
+
+err_free_wire_ctx:
+ if (ctx->gc_wire_ctx.data)
+ OBD_FREE(ctx->gc_wire_ctx.data, ctx->gc_wire_ctx.len);
+err_free_ctx:
+ OBD_FREE(ctx, sizeof(*ctx));
+ CDEBUG(D_SEC, "err_code %d, gss code %d\n", err, *gss_err);
+ return err;
+}
+
+/***************************************
+ * cred APIs *
+ ***************************************/
+#ifdef __KERNEL__
+/* Kernel-side cred refresh: queue (or join) a gssd upcall keyed on
+ * (obdname, uid, dest_ip) and sleep until the downcall wakes us or a
+ * signal arrives.  Returns 0 on success, -EINTR if interrupted, or
+ * other negative errno. */
+static int gss_cred_refresh(struct ptlrpc_cred *cred)
+{
+ struct obd_import *import;
+ struct gss_sec *gsec;
+ struct gss_upcall_msg *gss_msg, *gss_new;
+ struct dentry *dentry;
+ char *obdname, *obdtype;
+ wait_queue_t wait;
+ uid_t uid = cred->pc_uid;
+ ptl_nid_t peer_nid;
+ __u32 dest_ip, svc;
+ int res;
+ ENTRY;
+
+ if (ptlrpcs_cred_is_uptodate(cred))
+ RETURN(0);
+
+ LASSERT(cred->pc_sec);
+ LASSERT(cred->pc_sec->ps_import);
+ LASSERT(cred->pc_sec->ps_import->imp_obd);
+
+ import = cred->pc_sec->ps_import;
+ if (!import->imp_connection) {
+ CERROR("import has no connection set\n");
+ RETURN(-EINVAL);
+ }
+
+ peer_nid = import->imp_connection->c_peer.peer_id.nid;
+ dest_ip = (__u32) (peer_nid & 0xFFFFFFFF);
+
+ /* svc selects mds (0) vs ost (1) for gssd */
+ obdtype = import->imp_obd->obd_type->typ_name;
+ if (!strcmp(obdtype, "mdc"))
+ svc = 0;
+ else if (!strcmp(obdtype, "osc"))
+ svc = 1;
+ else {
+ CERROR("gss on %s?\n", obdtype);
+ RETURN(-EINVAL);
+ }
+
+ gsec = container_of(cred->pc_sec, struct gss_sec, gs_base);
+ obdname = import->imp_obd->obd_name;
+ dentry = gsec->gs_depipe;
+ gss_new = NULL;
+ res = 0;
+
+ CWARN("Initiate gss context %p(%u@%s)\n",
+ container_of(cred, struct gss_cred, gc_base),
+ uid, import->imp_target_uuid.uuid);
+
+again:
+ spin_lock(&gsec->gs_lock);
+ gss_msg = gss_find_upcall(gsec, obdname, uid, dest_ip);
+ if (gss_msg) {
+ /* NOTE(review): if gss_new was allocated on a previous pass
+ * and a matching upcall shows up here, gss_new is never
+ * freed -- memory leak on this path. */
+ spin_unlock(&gsec->gs_lock);
+ GOTO(waiting, res);
+ }
+ if (!gss_new) {
+ /* drop the lock to allocate, then retry the lookup */
+ spin_unlock(&gsec->gs_lock);
+ OBD_ALLOC(gss_new, sizeof(*gss_new));
+ if (!gss_new) {
+ CERROR("fail to alloc memory\n");
+ RETURN(-ENOMEM);
+ }
+ goto again;
+ }
+ /* so far we'v created gss_new */
+ gss_init_upcall_msg(gss_new, gsec, obdname, uid, dest_ip, svc);
+
+ if (gss_cred_is_uptodate_ctx(cred)) {
+ /* someone else had done it for us, simply cancel
+ * our own upcall */
+ CDEBUG(D_SEC, "cred("LPU64"/%u) has been refreshed by someone "
+ "else, simply drop our request\n",
+ cred->pc_pag, cred->pc_uid);
+ gss_unhash_msg_nolock(gss_new);
+ spin_unlock(&gsec->gs_lock);
+ gss_release_msg(gss_new);
+ RETURN(0);
+ }
+
+ /* need to make upcall now */
+ spin_unlock(&gsec->gs_lock);
+ res = rpc_queue_upcall(dentry->d_inode, &gss_new->gum_base);
+ if (res) {
+ CERROR("rpc_queue_upcall failed: %d\n", res);
+ gss_unhash_msg(gss_new);
+ gss_release_msg(gss_new);
+ RETURN(res);
+ }
+ gss_msg = gss_new;
+
+waiting:
+ /* sleep until the downcall's wake_up in gss_unhash_msg_nolock() */
+ init_waitqueue_entry(&wait, current);
+ spin_lock(&gsec->gs_lock);
+ add_wait_queue(&gss_msg->gum_waitq, &wait);
+ set_current_state(TASK_INTERRUPTIBLE);
+ spin_unlock(&gsec->gs_lock);
+
+ schedule();
+
+ remove_wait_queue(&gss_msg->gum_waitq, &wait);
+ if (signal_pending(current)) {
+ CERROR("interrupted gss upcall %p\n", gss_msg);
+ res = -EINTR;
+ }
+ gss_release_msg(gss_msg);
+ RETURN(res);
+}
+#else /* !__KERNEL__ */
+extern int lgss_handle_krb5_upcall(uid_t uid, __u32 dest_ip,
+ char *obd_name,
+ char *buf, int bufsize,
+ int (*callback)(char*, unsigned long));
+
+/* Userspace (liblustre) cred refresh: run the krb5 negotiation inline
+ * via lgss_handle_krb5_upcall (which calls back into
+ * gss_send_secinit_rpc), then parse the resulting downcall blob and
+ * install the new context.  On any failure the cred is marked DEAD. */
+static int gss_cred_refresh(struct ptlrpc_cred *cred)
+{
+ char buf[4096];
+ rawobj_t obj;
+ struct obd_import *imp;
+ struct gss_sec *gsec;
+ struct gss_api_mech *mech;
+ struct gss_cl_ctx *ctx = NULL;
+ struct vfs_cred vcred = { 0 };
+ ptl_nid_t peer_nid;
+ __u32 dest_ip;
+ __u32 subflavor;
+ int rc, gss_err;
+
+ LASSERT(cred);
+ LASSERT(cred->pc_sec);
+ LASSERT(cred->pc_sec->ps_import);
+ LASSERT(cred->pc_sec->ps_import->imp_obd);
+
+ if (ptlrpcs_cred_is_uptodate(cred))
+ RETURN(0);
+
+ imp = cred->pc_sec->ps_import;
+ peer_nid = imp->imp_connection->c_peer.peer_id.nid;
+ dest_ip = (__u32) (peer_nid & 0xFFFFFFFF);
+ subflavor = cred->pc_sec->ps_flavor.subflavor;
+
+ if (subflavor != PTLRPC_SEC_GSS_KRB5I) {
+ CERROR("unknown subflavor %u\n", subflavor);
+ GOTO(err_out, rc = -EINVAL);
+ }
+
+ /* positive rc is the number of downcall bytes written to buf */
+ rc = lgss_handle_krb5_upcall(cred->pc_uid, dest_ip,
+ imp->imp_obd->obd_name,
+ buf, sizeof(buf),
+ gss_send_secinit_rpc);
+ LASSERT(rc != 0);
+ if (rc < 0)
+ goto err_out;
+
+ obj.data = buf;
+ obj.len = rc;
+
+ gsec = container_of(cred->pc_sec, struct gss_sec, gs_base);
+ mech = gsec->gs_mech;
+ LASSERT(mech);
+ rc = gss_parse_init_downcall(mech, &obj, &ctx, &vcred, &dest_ip,
+ &gss_err);
+ if (rc) {
+ CERROR("parse init downcall error %d\n", rc);
+ goto err_out;
+ }
+
+ if (gss_err) {
+ CERROR("cred fresh got gss error %x\n", gss_err);
+ rc = -EINVAL;
+ goto err_out;
+ }
+
+ gss_cred_set_ctx(cred, ctx);
+ LASSERT(gss_cred_is_uptodate_ctx(cred));
+
+ return 0;
+err_out:
+ cred->pc_flags |= PTLRPC_CRED_DEAD;
+ return rc;
+}
+#endif
+
+/* Creds are matched purely on the PAG id. */
+static int gss_cred_match(struct ptlrpc_cred *cred,
+ struct ptlrpc_request *req,
+ struct vfs_cred *vcred)
+{
+ RETURN(cred->pc_pag == vcred->vc_pag);
+}
+
+/* Integrity service: append the gss header (version, subflavor, proc,
+ * seq, service), the wire context handle and a MIC over the request
+ * lustre_msg after it in rq_reqbuf, and fix up sec_len /
+ * rq_reqdata_len.  Returns 0 or negative errno. */
+static int gss_cred_sign(struct ptlrpc_cred *cred,
+ struct ptlrpc_request *req)
+{
+ struct gss_cred *gcred;
+ struct gss_cl_ctx *ctx;
+ rawobj_t lmsg, mic;
+ __u32 *vp, *vpsave, vlen, seclen;
+ __u32 seqnum, major, rc = 0;
+ ENTRY;
+
+ LASSERT(req->rq_reqbuf);
+ LASSERT(req->rq_cred == cred);
+
+ gcred = container_of(cred, struct gss_cred, gc_base);
+ ctx = gss_cred_get_ctx(cred);
+ if (!ctx) {
+ CERROR("cred %p("LPU64"/%u) invalidated?\n",
+ cred, cred->pc_pag, cred->pc_uid);
+ RETURN(-EPERM);
+ }
+
+ lmsg.len = req->rq_reqlen;
+ lmsg.data = (__u8 *) req->rq_reqmsg;
+
+ vp = (__u32 *) (lmsg.data + lmsg.len);
+ vlen = req->rq_reqbuf_len - sizeof(struct ptlrpcs_wire_hdr) -
+ lmsg.len;
+ seclen = vlen;
+
+ /* NOTE(review): rawobj_serialize() below consumes
+ * 4 + size_round4(len) bytes and another 4 are reserved for the
+ * mic length, so the true minimum is 7 * 4 + round -- this check
+ * appears one word short; verify buffer sizing. */
+ if (vlen < 6 * 4 + size_round4(ctx->gc_wire_ctx.len)) {
+ CERROR("vlen %d, need %d\n",
+ vlen, 6 * 4 + size_round4(ctx->gc_wire_ctx.len));
+ rc = -EIO;
+ goto out;
+ }
+
+ spin_lock(&ctx->gc_seq_lock);
+ seqnum = ctx->gc_seq++;
+ spin_unlock(&ctx->gc_seq_lock);
+
+ *vp++ = cpu_to_le32(PTLRPC_SEC_GSS_VERSION); /* version */
+ *vp++ = cpu_to_le32(PTLRPC_SEC_GSS_KRB5I); /* subflavor */
+ *vp++ = cpu_to_le32(ctx->gc_proc); /* proc */
+ *vp++ = cpu_to_le32(seqnum); /* seq */
+ *vp++ = cpu_to_le32(PTLRPC_GSS_SVC_INTEGRITY); /* service */
+ vlen -= 5 * 4;
+
+ if (rawobj_serialize(&ctx->gc_wire_ctx, &vp, &vlen)) {
+ rc = -EIO;
+ goto out;
+ }
+ CDEBUG(D_SEC, "encoded wire_ctx length %d\n", ctx->gc_wire_ctx.len);
+
+ vpsave = vp++; /* reserve for size */
+ vlen -= 4;
+
+ mic.len = vlen;
+ mic.data = (char *) vp;
+
+ CDEBUG(D_SEC, "reqbuf at %p, lmsg at %p, len %d, mic at %p, len %d\n",
+ req->rq_reqbuf, lmsg.data, lmsg.len, mic.data, mic.len);
+ major = kgss_get_mic(ctx->gc_gss_ctx, GSS_C_QOP_DEFAULT, &lmsg, &mic);
+ if (major) {
+ CERROR("gss compute mic error, major %x\n", major);
+ rc = -EACCES;
+ goto out;
+ }
+
+ *vpsave = cpu_to_le32(mic.len);
+
+ seclen = seclen - vlen + mic.len;
+ buf_to_sec_hdr(req->rq_reqbuf)->sec_len = cpu_to_le32(seclen);
+ req->rq_reqdata_len += size_round(seclen);
+ CDEBUG(D_SEC, "msg size %d, checksum size %d, total sec size %d\n",
+ lmsg.len, mic.len, seclen);
+out:
+ gss_put_ctx(ctx);
+ RETURN(rc);
+}
+
+/* Verify the integrity-protected reply: for PROC_DATA check the MIC
+ * over the lustre_msg and point rq_repmsg at it; for PROC_ERR handle
+ * server-reported context expiry (NO_CONTEXT / BAD_SIG) by killing the
+ * cred and asking for a restart with a fresh one.
+ * NOTE(review): 'subflavor' is parsed but never validated. */
+static int gss_cred_verify(struct ptlrpc_cred *cred,
+ struct ptlrpc_request *req)
+{
+ struct gss_cred *gcred;
+ struct gss_cl_ctx *ctx;
+ struct ptlrpcs_wire_hdr *sec_hdr;
+ rawobj_t lmsg, mic;
+ __u32 *vp, vlen, subflavor, proc, seq, svc;
+ __u32 major, minor, rc;
+ ENTRY;
+
+ LASSERT(req->rq_repbuf);
+ LASSERT(req->rq_cred == cred);
+
+ sec_hdr = buf_to_sec_hdr(req->rq_repbuf);
+ vp = (__u32 *) (req->rq_repbuf + sizeof(*sec_hdr) + sec_hdr->msg_len);
+ vlen = sec_hdr->sec_len;
+
+ if (vlen < 7 * 4) {
+ CERROR("reply sec size %u too small\n", vlen);
+ RETURN(-EPROTO);
+ }
+
+ if (*vp++ != cpu_to_le32(PTLRPC_SEC_GSS_VERSION)) {
+ CERROR("reply have different gss version\n");
+ RETURN(-EPROTO);
+ }
+ subflavor = le32_to_cpu(*vp++);
+ proc = le32_to_cpu(*vp++);
+ vlen -= 3 * 4;
+
+ switch (proc) {
+ case PTLRPC_GSS_PROC_DATA:
+ seq = le32_to_cpu(*vp++);
+ svc = le32_to_cpu(*vp++);
+ if (svc != PTLRPC_GSS_SVC_INTEGRITY) {
+ CERROR("Unknown svc %d\n", svc);
+ RETURN(-EPROTO);
+ }
+ if (*vp++ != 0) {
+ CERROR("Unexpected ctx handle\n");
+ RETURN(-EPROTO);
+ }
+ mic.len = le32_to_cpu(*vp++);
+ vlen -= 4 * 4;
+ if (vlen < mic.len) {
+ CERROR("vlen %d, mic.len %d\n", vlen, mic.len);
+ RETURN(-EINVAL);
+ }
+ mic.data = (char *) vp;
+
+ gcred = container_of(cred, struct gss_cred, gc_base);
+ ctx = gss_cred_get_ctx(cred);
+ LASSERT(ctx);
+
+ lmsg.len = sec_hdr->msg_len;
+ lmsg.data = (__u8 *) buf_to_lustre_msg(req->rq_repbuf);
+
+ major = kgss_verify_mic(ctx->gc_gss_ctx, &lmsg, &mic, NULL);
+ if (major != GSS_S_COMPLETE) {
+ CERROR("gss verify mic error: major %x\n", major);
+ GOTO(proc_data_out, rc = -EINVAL);
+ }
+
+ req->rq_repmsg = (struct lustre_msg *) lmsg.data;
+ req->rq_replen = lmsg.len;
+
+ /* here we could check the seq number is the same one
+ * we sent to server. but portals has prevent us from
+ * replay attack, so maybe we don't need check it again.
+ */
+ rc = 0;
+proc_data_out:
+ gss_put_ctx(ctx);
+ break;
+ case PTLRPC_GSS_PROC_ERR:
+ major = le32_to_cpu(*vp++);
+ minor = le32_to_cpu(*vp++);
+ /* server return NO_CONTEXT might be caused by context expire
+ * or server reboot/failover. we refresh the cred transparently
+ * to upper layer.
+ * In some cases, our gss handle is possible to be incidentally
+ * identical to another handle since the handle itself is not
+ * fully random. In krb5 case, the GSS_S_BAD_SIG will be
+ * returned, maybe other gss error for other mechanism. Here we
+ * only consider krb5 mech (FIXME) and try to establish new
+ * context.
+ */
+ if (major == GSS_S_NO_CONTEXT ||
+ major == GSS_S_BAD_SIG) {
+ CWARN("req %p: server report cred %p %s, expired?\n",
+ req, cred, (major == GSS_S_NO_CONTEXT) ?
+ "NO_CONTEXT" : "BAD_SIG");
+
+ ptlrpcs_cred_die(cred);
+ rc = ptlrpcs_req_replace_dead_cred(req);
+ if (!rc)
+ req->rq_ptlrpcs_restart = 1;
+ else
+ CERROR("replace dead cred failed %d\n", rc);
+ } else {
+ CERROR("Unrecognized gss error (%x/%x)\n",
+ major, minor);
+ rc = -EACCES;
+ }
+ break;
+ default:
+ CERROR("unknown gss proc %d\n", proc);
+ rc = -EPROTO;
+ }
+
+ RETURN(rc);
+}
+
+/* Privacy service: write the gss header and wire context handle, then
+ * kgss_wrap() the request lustre_msg (which sits GSS_PRIVBUF_PREFIX_LEN
+ * into its buffer to leave room for in-place wrapping) into the cipher
+ * area, and fix up sec_len / rq_reqdata_len.  Returns 0 or negative
+ * errno. */
+static int gss_cred_seal(struct ptlrpc_cred *cred,
+ struct ptlrpc_request *req)
+{
+ struct gss_cred *gcred;
+ struct gss_cl_ctx *ctx;
+ struct ptlrpcs_wire_hdr *sec_hdr;
+ rawobj_buf_t msg_buf;
+ rawobj_t cipher_buf;
+ __u32 *vp, *vpsave, vlen, seclen;
+ __u32 major, seqnum, rc = 0;
+ ENTRY;
+
+ LASSERT(req->rq_reqbuf);
+ LASSERT(req->rq_cred == cred);
+
+ gcred = container_of(cred, struct gss_cred, gc_base);
+ ctx = gss_cred_get_ctx(cred);
+ if (!ctx) {
+ CERROR("cred %p("LPU64"/%u) invalidated?\n",
+ cred, cred->pc_pag, cred->pc_uid);
+ RETURN(-EPERM);
+ }
+
+ /* sizeof(*sec_hdr) is fine here: sizeof doesn't evaluate sec_hdr */
+ vp = (__u32 *) (req->rq_reqbuf + sizeof(*sec_hdr));
+ vlen = req->rq_reqbuf_len - sizeof(*sec_hdr);
+ seclen = vlen;
+
+ /* NOTE(review): as in gss_cred_sign(), this check appears one
+ * word short of the actual 7 * 4 + round requirement -- verify
+ * buffer sizing. */
+ if (vlen < 6 * 4 + size_round4(ctx->gc_wire_ctx.len)) {
+ CERROR("vlen %d, need %d\n",
+ vlen, 6 * 4 + size_round4(ctx->gc_wire_ctx.len));
+ rc = -EIO;
+ goto out;
+ }
+
+ spin_lock(&ctx->gc_seq_lock);
+ seqnum = ctx->gc_seq++;
+ spin_unlock(&ctx->gc_seq_lock);
+
+ *vp++ = cpu_to_le32(PTLRPC_SEC_GSS_VERSION); /* version */
+ *vp++ = cpu_to_le32(PTLRPC_SEC_GSS_KRB5P); /* subflavor */
+ *vp++ = cpu_to_le32(ctx->gc_proc); /* proc */
+ *vp++ = cpu_to_le32(seqnum); /* seq */
+ *vp++ = cpu_to_le32(PTLRPC_GSS_SVC_PRIVACY); /* service */
+ vlen -= 5 * 4;
+
+ if (rawobj_serialize(&ctx->gc_wire_ctx, &vp, &vlen)) {
+ rc = -EIO;
+ goto out;
+ }
+ CDEBUG(D_SEC, "encoded wire_ctx length %d\n", ctx->gc_wire_ctx.len);
+
+ vpsave = vp++; /* reserve for size */
+ vlen -= 4;
+
+ msg_buf.buf = (__u8 *) req->rq_reqmsg - GSS_PRIVBUF_PREFIX_LEN;
+ msg_buf.buflen = req->rq_reqlen + GSS_PRIVBUF_PREFIX_LEN + GSS_PRIVBUF_SUFFIX_LEN;
+ msg_buf.dataoff = GSS_PRIVBUF_PREFIX_LEN;
+ msg_buf.datalen = req->rq_reqlen;
+
+ cipher_buf.data = (__u8 *) vp;
+ cipher_buf.len = vlen;
+
+ major = kgss_wrap(ctx->gc_gss_ctx, GSS_C_QOP_DEFAULT,
+ &msg_buf, &cipher_buf);
+ if (major) {
+ CERROR("error wrap: major 0x%x\n", major);
+ GOTO(out, rc = -EINVAL);
+ }
+
+ *vpsave = cpu_to_le32(cipher_buf.len);
+
+ seclen = seclen - vlen + cipher_buf.len;
+ sec_hdr = buf_to_sec_hdr(req->rq_reqbuf);
+ sec_hdr->sec_len = cpu_to_le32(seclen);
+ req->rq_reqdata_len += size_round(seclen);
+
+ CDEBUG(D_SEC, "msg size %d, total sec size %d\n",
+ req->rq_reqlen, seclen);
+out:
+ gss_put_ctx(ctx);
+ RETURN(rc);
+}
+
+/* Privacy service reply path: parse the gss header from the sec section
+ * (msg_len must be 0 for wrapped replies), kgss_unwrap() the cipher
+ * text in place and point rq_repmsg at the plain text.  Returns 0 or
+ * negative errno. */
+static int gss_cred_unseal(struct ptlrpc_cred *cred,
+ struct ptlrpc_request *req)
+{
+ struct gss_cred *gcred;
+ struct gss_cl_ctx *ctx;
+ struct ptlrpcs_wire_hdr *sec_hdr;
+ rawobj_t cipher_text, plain_text;
+ __u32 *vp, vlen, subflavor, proc, seq, svc;
+ int rc;
+ ENTRY;
+
+ LASSERT(req->rq_repbuf);
+ LASSERT(req->rq_cred == cred);
+
+ sec_hdr = buf_to_sec_hdr(req->rq_repbuf);
+ if (sec_hdr->msg_len != 0) {
+ CERROR("unexpected msg_len %u\n", sec_hdr->msg_len);
+ RETURN(-EPROTO);
+ }
+
+ vp = (__u32 *) (req->rq_repbuf + sizeof(*sec_hdr));
+ vlen = sec_hdr->sec_len;
+
+ if (vlen < 7 * 4) {
+ CERROR("reply sec size %u too small\n", vlen);
+ RETURN(-EPROTO);
+ }
+
+ if (*vp++ != cpu_to_le32(PTLRPC_SEC_GSS_VERSION)) {
+ CERROR("reply have different gss version\n");
+ RETURN(-EPROTO);
+ }
+ subflavor = le32_to_cpu(*vp++);
+ proc = le32_to_cpu(*vp++);
+ seq = le32_to_cpu(*vp++);
+ svc = le32_to_cpu(*vp++);
+ vlen -= 5 * 4;
+
+ switch (proc) {
+ case PTLRPC_GSS_PROC_DATA:
+ if (svc != PTLRPC_GSS_SVC_PRIVACY) {
+ CERROR("Unknown svc %d\n", svc);
+ RETURN(-EPROTO);
+ }
+ if (*vp++ != 0) {
+ CERROR("Unexpected ctx handle\n");
+ RETURN(-EPROTO);
+ }
+ vlen -= 4;
+
+ cipher_text.len = le32_to_cpu(*vp++);
+ cipher_text.data = (__u8 *) vp;
+ vlen -= 4;
+
+ if (vlen < cipher_text.len) {
+ CERROR("cipher text to be %u while buf only %u\n",
+ cipher_text.len, vlen);
+ RETURN(-EPROTO);
+ }
+
+ /* unwrap in place: plain text shares the cipher buffer */
+ plain_text = cipher_text;
+
+ gcred = container_of(cred, struct gss_cred, gc_base);
+ ctx = gss_cred_get_ctx(cred);
+ LASSERT(ctx);
+
+ rc = kgss_unwrap(ctx->gc_gss_ctx, GSS_C_QOP_DEFAULT,
+ &cipher_text, &plain_text);
+ if (rc) {
+ CERROR("error unwrap: 0x%x\n", rc);
+ GOTO(proc_out, rc = -EINVAL);
+ }
+
+ req->rq_repmsg = (struct lustre_msg *) vp;
+ req->rq_replen = plain_text.len;
+
+ rc = 0;
+proc_out:
+ gss_put_ctx(ctx);
+ break;
+ default:
+ CERROR("unknown gss proc %d\n", proc);
+ rc = -EPROTO;
+ }
+
+ RETURN(rc);
+}
+
+static void destroy_gss_context(struct ptlrpc_cred *cred)
+{
+ struct ptlrpcs_wire_hdr *hdr;
+ struct lustre_msg *lmsg;
+ struct gss_cred *gcred;
+ struct ptlrpc_request req;
+ struct obd_import *imp;
+ __u32 *vp, lmsg_size;
+ ENTRY;
+
+ /* cred's refcount is 0, steal one */
+ atomic_inc(&cred->pc_refcount);
+
+ gcred = container_of(cred, struct gss_cred, gc_base);
+ gcred->gc_ctx->gc_proc = PTLRPC_GSS_PROC_DESTROY;
+ imp = cred->pc_sec->ps_import;
+ LASSERT(imp);
+
+ if (!(cred->pc_flags & PTLRPC_CRED_UPTODATE)) {
+ CWARN("Destroy a dead gss cred %p(%u@%s), don't send rpc\n",
+ gcred, cred->pc_uid, imp->imp_target_uuid.uuid);
+ atomic_dec(&cred->pc_refcount);
+ EXIT;
+ return;
+ }
+
+ CWARN("client destroy gss cred %p(%u@%s)\n",
+ gcred, cred->pc_uid, imp->imp_target_uuid.uuid);
+
+ lmsg_size = lustre_msg_size(0, NULL);
+ req.rq_reqbuf_len = sizeof(*hdr) + lmsg_size +
+ ptlrpcs_est_req_payload(cred->pc_sec, lmsg_size);
+
+ OBD_ALLOC(req.rq_reqbuf, req.rq_reqbuf_len);
+ if (!req.rq_reqbuf) {
+ CERROR("Fail to alloc reqbuf, cancel anyway\n");
+ atomic_dec(&cred->pc_refcount);
+ EXIT;
+ return;
+ }
+
+ /* wire hdr */
+ hdr = buf_to_sec_hdr(req.rq_reqbuf);
+ hdr->flavor = cpu_to_le32(PTLRPC_SEC_GSS);
+ hdr->sectype = cpu_to_le32(PTLRPC_SEC_TYPE_AUTH);
+ hdr->msg_len = cpu_to_le32(lmsg_size);
+ hdr->sec_len = cpu_to_le32(0);
+
+ /* lustre message */
+ lmsg = buf_to_lustre_msg(req.rq_reqbuf);
+ lustre_init_msg(lmsg, 0, NULL, NULL);
+ lmsg->handle = imp->imp_remote_handle;
+ lmsg->type = PTL_RPC_MSG_REQUEST;
+ lmsg->opc = SEC_FINI;
+ lmsg->flags = 0;
+ lmsg->conn_cnt = imp->imp_conn_cnt;
+ /* add this for randomize */
+ get_random_bytes(&lmsg->last_xid, sizeof(lmsg->last_xid));
+ get_random_bytes(&lmsg->transno, sizeof(lmsg->transno));
+
+ vp = (__u32 *) req.rq_reqbuf;
+
+ req.rq_cred = cred;
+ req.rq_reqmsg = buf_to_lustre_msg(req.rq_reqbuf);
+ req.rq_reqlen = lmsg_size;
+ req.rq_reqdata_len = sizeof(*hdr) + lmsg_size;
+
+ if (gss_cred_sign(cred, &req)) {
+ CERROR("failed to sign, cancel anyway\n");
+ atomic_dec(&cred->pc_refcount);
+ goto exit;
+ }
+ atomic_dec(&cred->pc_refcount);
+
+ /* send out */
+ gss_send_secfini_rpc(imp, req.rq_reqbuf, req.rq_reqdata_len);
+exit:
+ OBD_FREE(req.rq_reqbuf, req.rq_reqbuf_len);
+ EXIT;
+}
+
+static void gss_cred_destroy(struct ptlrpc_cred *cred)
+{
+ struct gss_cred *gcred;
+ ENTRY;
+
+ LASSERT(cred);
+ LASSERT(!atomic_read(&cred->pc_refcount));
+
+ gcred = container_of(cred, struct gss_cred, gc_base);
+ if (gcred->gc_ctx) {
+ destroy_gss_context(cred);
+ gss_put_ctx(gcred->gc_ctx);
+ }
+
+ CDEBUG(D_SEC, "GSS_SEC: destroy cred %p\n", gcred);
+
+ OBD_FREE(gcred, sizeof(*gcred));
+ EXIT;
+}
+
+static struct ptlrpc_credops gss_credops = {
+ .refresh = gss_cred_refresh,
+ .match = gss_cred_match,
+ .sign = gss_cred_sign,
+ .verify = gss_cred_verify,
+ .seal = gss_cred_seal,
+ .unseal = gss_cred_unseal,
+ .destroy = gss_cred_destroy,
+};
+
+#ifdef __KERNEL__
+/*******************************************
+ * rpc_pipe APIs *
+ *******************************************/
+static ssize_t
+gss_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
+ char *dst, size_t buflen)
+{
+ char *data = (char *)msg->data + msg->copied;
+ ssize_t mlen = msg->len;
+ ssize_t left;
+ ENTRY;
+
+ if (mlen > buflen)
+ mlen = buflen;
+ left = copy_to_user(dst, data, mlen);
+ if (left < 0) {
+ msg->errno = left;
+ RETURN(left);
+ }
+ mlen -= left;
+ msg->copied += mlen;
+ msg->errno = 0;
+ RETURN(mlen);
+}
+
+static ssize_t
+gss_pipe_downcall(struct file *filp, const char *src, size_t mlen)
+{
+ char *buf;
+ const int bufsize = 1024;
+ rawobj_t obj;
+ struct inode *inode = filp->f_dentry->d_inode;
+ struct rpc_inode *rpci = RPC_I(inode);
+ struct obd_import *import;
+ struct ptlrpc_sec *sec;
+ struct gss_sec *gsec;
+ char *obdname;
+ struct gss_api_mech *mech;
+ struct vfs_cred vcred = { 0 };
+ struct ptlrpc_cred *cred;
+ struct gss_upcall_msg *gss_msg;
+ struct gss_cl_ctx *ctx = NULL;
+ __u32 dest_ip;
+ ssize_t left;
+ int err, gss_err;
+ ENTRY;
+
+ if (mlen > bufsize) {
+ CERROR("mlen %ld > bufsize %d\n", (long)mlen, bufsize);
+ RETURN(-ENOSPC);
+ }
+
+ OBD_ALLOC(buf, bufsize);
+ if (!buf) {
+ CERROR("alloc mem failed\n");
+ RETURN(-ENOMEM);
+ }
+
+ left = copy_from_user(buf, src, mlen);
+ if (left)
+ GOTO(err_free, err = -EFAULT);
+
+ obj.data = buf;
+ obj.len = mlen;
+
+ LASSERT(rpci->private);
+ gsec = (struct gss_sec *)rpci->private;
+ sec = &gsec->gs_base;
+ LASSERT(sec->ps_import);
+ import = class_import_get(sec->ps_import);
+ LASSERT(import->imp_obd);
+ obdname = import->imp_obd->obd_name;
+ mech = gsec->gs_mech;
+
+ err = gss_parse_init_downcall(mech, &obj, &ctx, &vcred, &dest_ip,
+ &gss_err);
+ if (err) {
+ CERROR("parse downcall err %d\n", err);
+ GOTO(err, err);
+ }
+ cred = ptlrpcs_cred_lookup(sec, &vcred);
+ if (!cred) {
+ CWARN("didn't find cred\n");
+ GOTO(err, err);
+ }
+ if (gss_err) {
+ CERROR("got gss err %d, set cred %p dead\n", gss_err, cred);
+ cred->pc_flags |= PTLRPC_CRED_DEAD;
+ } else {
+ CDEBUG(D_SEC, "get initial ctx:\n");
+ gss_cred_set_ctx(cred, ctx);
+ }
+
+ spin_lock(&gsec->gs_lock);
+ gss_msg = gss_find_upcall(gsec, obdname, vcred.vc_uid, dest_ip);
+ if (gss_msg) {
+ gss_unhash_msg_nolock(gss_msg);
+ spin_unlock(&gsec->gs_lock);
+ gss_release_msg(gss_msg);
+ } else
+ spin_unlock(&gsec->gs_lock);
+
+ ptlrpcs_cred_put(cred, 1);
+ class_import_put(import);
+ OBD_FREE(buf, bufsize);
+ RETURN(mlen);
+err:
+ if (ctx)
+ gss_destroy_ctx(ctx);
+ class_import_put(import);
+err_free:
+ OBD_FREE(buf, bufsize);
+ CDEBUG(D_SEC, "gss_pipe_downcall returning %d\n", err);
+ RETURN(err);
+}
+
+static
+void gss_pipe_destroy_msg(struct rpc_pipe_msg *msg)
+{
+ struct gss_upcall_msg *gmsg;
+ static unsigned long ratelimit;
+ ENTRY;
+
+ if (msg->errno >= 0) {
+ EXIT;
+ return;
+ }
+
+ gmsg = container_of(msg, struct gss_upcall_msg, gum_base);
+ CDEBUG(D_SEC, "destroy gmsg %p\n", gmsg);
+ atomic_inc(&gmsg->gum_refcount);
+ gss_unhash_msg(gmsg);
+ if (msg->errno == -ETIMEDOUT || msg->errno == -EPIPE) {
+ unsigned long now = get_seconds();
+ if (time_after(now, ratelimit)) {
+ CWARN("GSS_SEC upcall timed out.\n"
+ "Please check user daemon is running!\n");
+ ratelimit = now + 15;
+ }
+ }
+ gss_release_msg(gmsg);
+ EXIT;
+}
+
+static
+void gss_pipe_release(struct inode *inode)
+{
+ struct rpc_inode *rpci = RPC_I(inode);
+ struct ptlrpc_sec *sec;
+ struct gss_sec *gsec;
+ ENTRY;
+
+ gsec = (struct gss_sec *)rpci->private;
+ sec = &gsec->gs_base;
+ spin_lock(&gsec->gs_lock);
+ while (!list_empty(&gsec->gs_upcalls)) {
+ struct gss_upcall_msg *gmsg;
+
+ gmsg = list_entry(gsec->gs_upcalls.next,
+ struct gss_upcall_msg, gum_list);
+ gmsg->gum_base.errno = -EPIPE;
+ atomic_inc(&gmsg->gum_refcount);
+ gss_unhash_msg_nolock(gmsg);
+ gss_release_msg(gmsg);
+ }
+ spin_unlock(&gsec->gs_lock);
+ EXIT;
+}
+
+static struct rpc_pipe_ops gss_upcall_ops = {
+ .upcall = gss_pipe_upcall,
+ .downcall = gss_pipe_downcall,
+ .destroy_msg = gss_pipe_destroy_msg,
+ .release_pipe = gss_pipe_release,
+};
+#endif /* __KERNEL__ */
+
+/*********************************************
+ * GSS security APIs *
+ *********************************************/
+
+static
+struct ptlrpc_sec* gss_create_sec(ptlrpcs_flavor_t *flavor,
+ const char *pipe_dir,
+ void *pipe_data)
+{
+ struct gss_sec *gsec;
+ struct ptlrpc_sec *sec;
+ char *pos;
+ ENTRY;
+
+ LASSERT(flavor->flavor == PTLRPC_SEC_GSS);
+
+ OBD_ALLOC(gsec, sizeof(*gsec));
+ if (!gsec) {
+ CERROR("can't alloc gsec\n");
+ RETURN(NULL);
+ }
+
+ gsec->gs_mech = kgss_subflavor_to_mech(flavor->subflavor);
+ if (!gsec->gs_mech) {
+ CERROR("subflavor %d not found\n", flavor->subflavor);
+ goto err_free;
+ }
+
+ /* initialize gss sec */
+#ifdef __KERNEL__
+ INIT_LIST_HEAD(&gsec->gs_upcalls);
+ spin_lock_init(&gsec->gs_lock);
+
+ snprintf(gsec->gs_pipepath, sizeof(gsec->gs_pipepath),
+ LUSTRE_PIPEDIR"/%s", pipe_dir);
+ if (IS_ERR(rpc_mkdir(gsec->gs_pipepath, NULL))) {
+ CERROR("can't make pipedir %s\n", gsec->gs_pipepath);
+ goto err_mech_put;
+ }
+
+ snprintf(gsec->gs_pipepath, sizeof(gsec->gs_pipepath),
+ LUSTRE_PIPEDIR"/%s/%s", pipe_dir, gsec->gs_mech->gm_name);
+ gsec->gs_depipe = rpc_mkpipe(gsec->gs_pipepath, gsec,
+ &gss_upcall_ops, RPC_PIPE_WAIT_FOR_OPEN);
+ if (IS_ERR(gsec->gs_depipe)) {
+ CERROR("failed to make rpc_pipe %s: %ld\n",
+ gsec->gs_pipepath, PTR_ERR(gsec->gs_depipe));
+ goto err_rmdir;
+ }
+ CDEBUG(D_SEC, "gss sec %p, pipe path %s\n", gsec, gsec->gs_pipepath);
+#endif
+
+ sec = &gsec->gs_base;
+
+ switch (flavor->subflavor) {
+ case PTLRPC_SEC_GSS_KRB5I:
+ sec->ps_sectype = PTLRPC_SEC_TYPE_AUTH;
+ break;
+ case PTLRPC_SEC_GSS_KRB5P:
+ sec->ps_sectype = PTLRPC_SEC_TYPE_PRIV;
+ break;
+ default:
+ LBUG();
+ }
+
+ sec->ps_expire = GSS_CREDCACHE_EXPIRE;
+ sec->ps_nextgc = get_seconds() + sec->ps_expire;
+ sec->ps_flags = 0;
+
+ CDEBUG(D_SEC, "Create GSS security instance at %p(external %p)\n",
+ gsec, sec);
+ RETURN(sec);
+
+#ifdef __KERNEL__
+err_rmdir:
+ pos = strrchr(gsec->gs_pipepath, '/');
+ LASSERT(pos);
+ *pos = 0;
+ rpc_rmdir(gsec->gs_pipepath);
+err_mech_put:
+#endif
+ kgss_mech_put(gsec->gs_mech);
+err_free:
+ OBD_FREE(gsec, sizeof(*gsec));
+ RETURN(NULL);
+}
+
+static
+void gss_destroy_sec(struct ptlrpc_sec *sec)
+{
+ struct gss_sec *gsec;
+ char *pos;
+ ENTRY;
+
+ gsec = container_of(sec, struct gss_sec, gs_base);
+ CDEBUG(D_SEC, "Destroy GSS security instance at %p\n", gsec);
+
+ LASSERT(gsec->gs_mech);
+ LASSERT(!atomic_read(&sec->ps_refcount));
+ LASSERT(!atomic_read(&sec->ps_credcount));
+#ifdef __KERNEL__
+ rpc_unlink(gsec->gs_pipepath);
+ pos = strrchr(gsec->gs_pipepath, '/');
+ LASSERT(pos);
+ *pos = 0;
+ rpc_rmdir(gsec->gs_pipepath);
+#endif
+
+ kgss_mech_put(gsec->gs_mech);
+ OBD_FREE(gsec, sizeof(*gsec));
+ EXIT;
+}
+
+static
+struct ptlrpc_cred * gss_create_cred(struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req,
+ struct vfs_cred *vcred)
+{
+ struct gss_cred *gcred;
+ struct ptlrpc_cred *cred;
+ ENTRY;
+
+ OBD_ALLOC(gcred, sizeof(*gcred));
+ if (!gcred)
+ RETURN(NULL);
+
+ cred = &gcred->gc_base;
+ INIT_LIST_HEAD(&cred->pc_hash);
+ atomic_set(&cred->pc_refcount, 0);
+ cred->pc_sec = sec;
+ cred->pc_ops = &gss_credops;
+ cred->pc_req = req;
+ cred->pc_expire = get_seconds() + GSS_CRED_EXPIRE;
+ cred->pc_flags = 0;
+ cred->pc_pag = vcred->vc_pag;
+ cred->pc_uid = vcred->vc_uid;
+ CDEBUG(D_SEC, "create a gss cred at %p("LPU64"/%u)\n",
+ cred, vcred->vc_pag, vcred->vc_uid);
+
+ RETURN(cred);
+}
+
+static int gss_estimate_payload(struct ptlrpc_sec *sec, int msgsize)
+{
+ switch (sec->ps_sectype) {
+ case PTLRPC_SEC_TYPE_AUTH:
+ return GSS_MAX_AUTH_PAYLOAD;
+ case PTLRPC_SEC_TYPE_PRIV:
+ return size_round16(GSS_MAX_AUTH_PAYLOAD + msgsize +
+ GSS_PRIVBUF_PREFIX_LEN +
+ GSS_PRIVBUF_SUFFIX_LEN);
+ default:
+ LBUG();
+ return 0;
+ }
+}
+
+static int gss_alloc_reqbuf(struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req,
+ int lmsg_size)
+{
+ int msg_payload, sec_payload;
+ int privacy, rc;
+ ENTRY;
+
+ /* In PRIVACY mode, lustre message is always 0 (already encoded into
+ * security payload).
+ */
+ privacy = sec->ps_sectype == PTLRPC_SEC_TYPE_PRIV;
+ msg_payload = privacy ? 0 : lmsg_size;
+ sec_payload = gss_estimate_payload(sec, lmsg_size);
+
+ rc = sec_alloc_reqbuf(sec, req, msg_payload, sec_payload);
+ if (rc)
+ return rc;
+
+ if (privacy) {
+ int buflen = lmsg_size + GSS_PRIVBUF_PREFIX_LEN +
+ GSS_PRIVBUF_SUFFIX_LEN;
+ char *buf;
+
+ OBD_ALLOC(buf, buflen);
+ if (!buf) {
+ CERROR("Fail to alloc %d\n", buflen);
+ sec_free_reqbuf(sec, req);
+ RETURN(-ENOMEM);
+ }
+ req->rq_reqmsg = (struct lustre_msg *)
+ (buf + GSS_PRIVBUF_PREFIX_LEN);
+ }
+
+ RETURN(0);
+}
+
+static void gss_free_reqbuf(struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req)
+{
+ char *buf;
+ int privacy;
+ ENTRY;
+
+ LASSERT(req->rq_reqmsg);
+ LASSERT(req->rq_reqlen);
+
+ privacy = sec->ps_sectype == PTLRPC_SEC_TYPE_PRIV;
+ if (privacy) {
+ buf = (char *) req->rq_reqmsg - GSS_PRIVBUF_PREFIX_LEN;
+ LASSERT(buf < req->rq_reqbuf ||
+ buf >= req->rq_reqbuf + req->rq_reqbuf_len);
+ OBD_FREE(buf, req->rq_reqlen + GSS_PRIVBUF_PREFIX_LEN +
+ GSS_PRIVBUF_SUFFIX_LEN);
+ req->rq_reqmsg = NULL;
+ }
+
+ sec_free_reqbuf(sec, req);
+}
+
+static struct ptlrpc_secops gss_secops = {
+ .create_sec = gss_create_sec,
+ .destroy_sec = gss_destroy_sec,
+ .create_cred = gss_create_cred,
+ .est_req_payload = gss_estimate_payload,
+ .est_rep_payload = gss_estimate_payload,
+ .alloc_reqbuf = gss_alloc_reqbuf,
+ .free_reqbuf = gss_free_reqbuf,
+};
+
+static struct ptlrpc_sec_type gss_type = {
+ .pst_owner = THIS_MODULE,
+ .pst_name = "GSS_SEC",
+ .pst_inst = ATOMIC_INIT(0),
+ .pst_flavor = {PTLRPC_SEC_GSS, 0},
+ .pst_ops = &gss_secops,
+};
+
+extern int
+(*lustre_secinit_downcall_handler)(char *buffer, unsigned long count);
+
+int __init ptlrpcs_gss_init(void)
+{
+ int rc;
+
+ rc = ptlrpcs_register(&gss_type);
+ if (rc)
+ return rc;
+
+#ifdef __KERNEL__
+ gss_svc_init();
+
+ rc = PTR_ERR(rpc_mkdir(LUSTRE_PIPEDIR, NULL));
+ if (IS_ERR((void *)rc) && rc != -EEXIST) {
+ CERROR("fail to make rpcpipedir for lustre\n");
+ gss_svc_exit();
+ ptlrpcs_unregister(&gss_type);
+ return -1;
+ }
+ rc = 0;
+#else
+#endif
+ rc = init_kerberos_module();
+ if (rc) {
+ ptlrpcs_unregister(&gss_type);
+ }
+
+ lustre_secinit_downcall_handler = gss_send_secinit_rpc;
+
+ return rc;
+}
+
+static void __exit ptlrpcs_gss_exit(void)
+{
+ lustre_secinit_downcall_handler = NULL;
+
+ cleanup_kerberos_module();
+#ifndef __KERNEL__
+#else
+ rpc_rmdir(LUSTRE_PIPEDIR);
+ gss_svc_exit();
+#endif
+ ptlrpcs_unregister(&gss_type);
+}
+
+MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_DESCRIPTION("GSS Security module for Lustre");
+MODULE_LICENSE("GPL");
+
+module_init(ptlrpcs_gss_init);
+module_exit(ptlrpcs_gss_exit);
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Modifications for Lustre
+ * Copyright 2004, Cluster File Systems, Inc.
+ * All rights reserved
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ * Neil Brown <neilb@cse.unsw.edu.au>
+ * J. Bruce Fields <bfields@umich.edu>
+ * Andy Adamson <andros@umich.edu>
+ * Dug Song <dugsong@monkey.org>
+ *
+ * RPCSEC_GSS server authentication.
+ * This implements RPCSEC_GSS as defined in rfc2203 (rpcsec_gss) and rfc2078
+ * (gssapi)
+ *
+ * The RPCSEC_GSS involves three stages:
+ * 1/ context creation
+ * 2/ data exchange
+ * 3/ context destruction
+ *
+ * Context creation is handled largely by upcalls to user-space.
+ * In particular, GSS_Accept_sec_context is handled by an upcall
+ * Data exchange is handled entirely within the kernel
+ * In particular, GSS_GetMIC, GSS_VerifyMIC, GSS_Seal, GSS_Unseal are in-kernel.
+ * Context destruction is handled in-kernel
+ * GSS_Delete_sec_context is in-kernel
+ *
+ * Context creation is initiated by a RPCSEC_GSS_INIT request arriving.
+ * The context handle and gss_token are used as a key into the rpcsec_init cache.
+ * The content of this cache includes some of the outputs of GSS_Accept_sec_context,
+ * being major_status, minor_status, context_handle, reply_token.
+ * These are sent back to the client.
+ * Sequence window management is handled by the kernel. The window size is currently
+ * a compile time constant.
+ *
+ * When user-space is happy that a context is established, it places an entry
+ * in the rpcsec_context cache. The key for this cache is the context_handle.
+ * The content includes:
+ * uid/gidlist - for determining access rights
+ * mechanism type
+ * mechanism specific information, such as a key
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#ifdef __KERNEL__
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/hash.h>
+#else
+#include <liblustre.h>
+#endif
+
+#include <linux/sunrpc/cache.h>
+
+#include <libcfs/kp30.h>
+#include <linux/obd.h>
+#include <linux/obd_class.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_idl.h>
+#include <linux/lustre_net.h>
+#include <linux/lustre_import.h>
+#include <linux/lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+static inline unsigned long hash_mem(char *buf, int length, int bits)
+{
+ unsigned long hash = 0;
+ unsigned long l = 0;
+ int len = 0;
+ unsigned char c;
+ do {
+ if (len == length) {
+ c = (char)len; len = -1;
+ } else
+ c = *buf++;
+ l = (l << 8) | c;
+ len++;
+ if ((len & (BITS_PER_LONG/8-1))==0)
+ hash = hash_long(hash^l, BITS_PER_LONG);
+ } while (len);
+ return hash >> (BITS_PER_LONG - bits);
+}
+
+/* The rpcsec_init cache is used for mapping RPCSEC_GSS_{,CONT_}INIT requests
+ * into replies.
+ *
+ * Key is context handle (\x if empty) and gss_token.
+ * Content is major_status minor_status (integers) context_handle, reply_token.
+ *
+ */
+
+#define RSI_HASHBITS 6
+#define RSI_HASHMAX (1<<RSI_HASHBITS)
+#define RSI_HASHMASK (RSI_HASHMAX-1)
+
+struct rsi {
+ struct cache_head h;
+ rawobj_t in_handle, in_token;
+ rawobj_t out_handle, out_token;
+ int major_status, minor_status;
+};
+
+static struct cache_head *rsi_table[RSI_HASHMAX];
+static struct cache_detail rsi_cache;
+
+static void rsi_free(struct rsi *rsii)
+{
+ rawobj_free(&rsii->in_handle);
+ rawobj_free(&rsii->in_token);
+ rawobj_free(&rsii->out_handle);
+ rawobj_free(&rsii->out_token);
+}
+
+static void rsi_put(struct cache_head *item, struct cache_detail *cd)
+{
+ struct rsi *rsii = container_of(item, struct rsi, h);
+ if (cache_put(item, cd)) {
+ rsi_free(rsii);
+ OBD_FREE(rsii, sizeof(*rsii));
+ }
+}
+
+static inline int rsi_hash(struct rsi *item)
+{
+ return hash_mem(item->in_handle.data, item->in_handle.len, RSI_HASHBITS)
+ ^ hash_mem(item->in_token.data, item->in_token.len, RSI_HASHBITS);
+}
+
+static inline int rsi_match(struct rsi *item, struct rsi *tmp)
+{
+ return (rawobj_equal(&item->in_handle, &tmp->in_handle) &&
+ rawobj_equal(&item->in_token, &tmp->in_token));
+}
+
+static void rsi_request(struct cache_detail *cd,
+ struct cache_head *h,
+ char **bpp, int *blen)
+{
+ struct rsi *rsii = container_of(h, struct rsi, h);
+
+ qword_addhex(bpp, blen, rsii->in_handle.data, rsii->in_handle.len);
+ qword_addhex(bpp, blen, rsii->in_token.data, rsii->in_token.len);
+ (*bpp)[-1] = '\n';
+}
+
+static int
+gssd_reply(struct rsi *item)
+{
+ struct rsi *tmp;
+ struct cache_head **hp, **head;
+ ENTRY;
+
+ head = &rsi_cache.hash_table[rsi_hash(item)];
+ write_lock(&rsi_cache.hash_lock);
+ for (hp = head; *hp != NULL; hp = &tmp->h.next) {
+ tmp = container_of(*hp, struct rsi, h);
+ if (rsi_match(tmp, item)) {
+ cache_get(&tmp->h);
+ clear_bit(CACHE_HASHED, &tmp->h.flags);
+ *hp = tmp->h.next;
+ tmp->h.next = NULL;
+ rsi_cache.entries--;
+ if (test_bit(CACHE_VALID, &tmp->h.flags)) {
+ write_unlock(&rsi_cache.hash_lock);
+ rsi_put(&tmp->h, &rsi_cache);
+ RETURN(-EINVAL);
+ }
+ set_bit(CACHE_HASHED, &item->h.flags);
+ item->h.next = *hp;
+ *hp = &item->h;
+ rsi_cache.entries++;
+ set_bit(CACHE_VALID, &item->h.flags);
+ item->h.last_refresh = get_seconds();
+ write_unlock(&rsi_cache.hash_lock);
+ cache_fresh(&rsi_cache, &tmp->h, 0);
+ rsi_put(&tmp->h, &rsi_cache);
+ RETURN(0);
+ }
+ }
+ write_unlock(&rsi_cache.hash_lock);
+ RETURN(-EINVAL);
+}
+
+/* XXX
+ * here we just wait for its completion or a timeout. it's a
+ * hack but it works, and we'll come up with a real fix if we decide
+ * to still stick with the NFS4 cache code
+ */
+static struct rsi *
+gssd_upcall(struct rsi *item, struct cache_req *chandle)
+{
+ struct rsi *tmp;
+ struct cache_head **hp, **head;
+ unsigned long starttime;
+ ENTRY;
+
+ head = &rsi_cache.hash_table[rsi_hash(item)];
+ read_lock(&rsi_cache.hash_lock);
+ for (hp = head; *hp != NULL; hp = &tmp->h.next) {
+ tmp = container_of(*hp, struct rsi, h);
+ if (rsi_match(tmp, item)) {
+ LBUG();
+ if (!test_bit(CACHE_VALID, &tmp->h.flags)) {
+ CERROR("found rsi without VALID\n");
+ read_unlock(&rsi_cache.hash_lock);
+ return NULL;
+ }
+ *hp = tmp->h.next;
+ tmp->h.next = NULL;
+ rsi_cache.entries--;
+ cache_get(&tmp->h);
+ read_unlock(&rsi_cache.hash_lock);
+ return tmp;
+ }
+ }
+ // cache_get(&item->h);
+ set_bit(CACHE_HASHED, &item->h.flags);
+ item->h.next = *head;
+ *head = &item->h;
+ rsi_cache.entries++;
+ read_unlock(&rsi_cache.hash_lock);
+ cache_get(&item->h);
+
+ cache_check(&rsi_cache, &item->h, chandle);
+ starttime = get_seconds();
+ do {
+ yield();
+ read_lock(&rsi_cache.hash_lock);
+ for (hp = head; *hp != NULL; hp = &tmp->h.next) {
+ tmp = container_of(*hp, struct rsi, h);
+ if (tmp == item)
+ continue;
+ if (rsi_match(tmp, item)) {
+ if (!test_bit(CACHE_VALID, &tmp->h.flags)) {
+ read_unlock(&rsi_cache.hash_lock);
+ return NULL;
+ }
+ cache_get(&tmp->h);
+ clear_bit(CACHE_HASHED, &tmp->h.flags);
+ *hp = tmp->h.next;
+ tmp->h.next = NULL;
+ rsi_cache.entries--;
+ read_unlock(&rsi_cache.hash_lock);
+ return tmp;
+ }
+ }
+ read_unlock(&rsi_cache.hash_lock);
+ } while ((get_seconds() - starttime) <= 5);
+ CERROR("5s timeout while waiting cache refill\n");
+ return NULL;
+}
+
+static int rsi_parse(struct cache_detail *cd,
+ char *mesg, int mlen)
+{
+ /* context token expiry major minor context token */
+ char *buf = mesg;
+ char *ep;
+ int len;
+ struct rsi *rsii;
+ time_t expiry;
+ int status = -EINVAL;
+ ENTRY;
+
+ OBD_ALLOC(rsii, sizeof(*rsii));
+ if (!rsii) {
+ CERROR("failed to alloc rsii\n");
+ RETURN(-ENOMEM);
+ }
+ cache_init(&rsii->h);
+
+ /* handle */
+ len = qword_get(&mesg, buf, mlen);
+ if (len < 0)
+ goto out;
+ status = -ENOMEM;
+ if (rawobj_alloc(&rsii->in_handle, buf, len))
+ goto out;
+
+ /* token */
+ len = qword_get(&mesg, buf, mlen);
+ status = -EINVAL;
+ if (len < 0)
+ goto out;;
+ status = -ENOMEM;
+ if (rawobj_alloc(&rsii->in_token, buf, len))
+ goto out;
+
+ /* expiry */
+ expiry = get_expiry(&mesg);
+ status = -EINVAL;
+ if (expiry == 0)
+ goto out;
+
+ /* major/minor */
+ len = qword_get(&mesg, buf, mlen);
+ if (len < 0)
+ goto out;
+ if (len == 0) {
+ goto out;
+ } else {
+ rsii->major_status = simple_strtoul(buf, &ep, 10);
+ if (*ep)
+ goto out;
+ len = qword_get(&mesg, buf, mlen);
+ if (len <= 0)
+ goto out;
+ rsii->minor_status = simple_strtoul(buf, &ep, 10);
+ if (*ep)
+ goto out;
+
+ /* out_handle */
+ len = qword_get(&mesg, buf, mlen);
+ if (len < 0)
+ goto out;
+ status = -ENOMEM;
+ if (rawobj_alloc(&rsii->out_handle, buf, len))
+ goto out;
+
+ /* out_token */
+ len = qword_get(&mesg, buf, mlen);
+ status = -EINVAL;
+ if (len < 0)
+ goto out;
+ status = -ENOMEM;
+ if (rawobj_alloc(&rsii->out_token, buf, len))
+ goto out;
+ }
+ rsii->h.expiry_time = expiry;
+ status = gssd_reply(rsii);
+out:
+ if (rsii)
+ rsi_put(&rsii->h, &rsi_cache);
+ RETURN(status);
+}
+
+static struct cache_detail rsi_cache = {
+ .hash_size = RSI_HASHMAX,
+ .hash_table = rsi_table,
+ .name = "auth.ptlrpcs.init",
+ .cache_put = rsi_put,
+ .cache_request = rsi_request,
+ .cache_parse = rsi_parse,
+};
+
+/*
+ * The rpcsec_context cache is used to store a context that is
+ * used in data exchange.
+ * The key is a context handle. The content is:
+ * uid, gidlist, mechanism, service-set, mech-specific-data
+ */
+
+#define RSC_HASHBITS 10
+#define RSC_HASHMAX (1<<RSC_HASHBITS)
+#define RSC_HASHMASK (RSC_HASHMAX-1)
+
+#define GSS_SEQ_WIN 128
+
+struct gss_svc_seq_data {
+ /* highest seq number seen so far: */
+ __u32 sd_max;
+ /* for i such that sd_max-GSS_SEQ_WIN < i <= sd_max, the i-th bit of
+ * sd_win is nonzero iff sequence number i has been seen already: */
+ unsigned long sd_win[GSS_SEQ_WIN/BITS_PER_LONG];
+ spinlock_t sd_lock;
+};
+
+struct rsc {
+ struct cache_head h;
+ rawobj_t handle;
+ __u32 remote;
+ struct vfs_cred cred;
+ struct gss_svc_seq_data seqdata;
+ struct gss_ctx *mechctx;
+};
+
+static struct cache_head *rsc_table[RSC_HASHMAX];
+static struct cache_detail rsc_cache;
+
+static void rsc_free(struct rsc *rsci)
+{
+ rawobj_free(&rsci->handle);
+ if (rsci->mechctx)
+ kgss_delete_sec_context(&rsci->mechctx);
+#if 0
+ if (rsci->cred.vc_ginfo)
+ put_group_info(rsci->cred.vc_ginfo);
+#endif
+}
+
+static void rsc_put(struct cache_head *item, struct cache_detail *cd)
+{
+ struct rsc *rsci = container_of(item, struct rsc, h);
+
+ if (cache_put(item, cd)) {
+ rsc_free(rsci);
+ OBD_FREE(rsci, sizeof(*rsci));
+ }
+}
+
+static inline int
+rsc_hash(struct rsc *rsci)
+{
+ return hash_mem(rsci->handle.data, rsci->handle.len, RSC_HASHBITS);
+}
+
+static inline int
+rsc_match(struct rsc *new, struct rsc *tmp)
+{
+ return rawobj_equal(&new->handle, &tmp->handle);
+}
+
+static struct rsc *rsc_lookup(struct rsc *item, int set)
+{
+ struct rsc *tmp = NULL;
+ struct cache_head **hp, **head;
+ head = &rsc_cache.hash_table[rsc_hash(item)];
+ ENTRY;
+
+ if (set)
+ write_lock(&rsc_cache.hash_lock);
+ else
+ read_lock(&rsc_cache.hash_lock);
+ for (hp = head; *hp != NULL; hp = &tmp->h.next) {
+ tmp = container_of(*hp, struct rsc, h);
+ if (!rsc_match(tmp, item))
+ continue;
+ cache_get(&tmp->h);
+ if (!set) {
+ goto out_noset;
+ }
+ *hp = tmp->h.next;
+ tmp->h.next = NULL;
+ clear_bit(CACHE_HASHED, &tmp->h.flags);
+ rsc_put(&tmp->h, &rsc_cache);
+ goto out_set;
+ }
+ /* Didn't find anything */
+ if (!set)
+ goto out_noset;
+ rsc_cache.entries++;
+out_set:
+ set_bit(CACHE_HASHED, &item->h.flags);
+ item->h.next = *head;
+ *head = &item->h;
+ write_unlock(&rsc_cache.hash_lock);
+ cache_fresh(&rsc_cache, &item->h, item->h.expiry_time);
+ cache_get(&item->h);
+ RETURN(item);
+out_noset:
+ read_unlock(&rsc_cache.hash_lock);
+ RETURN(tmp);
+}
+
+static int rsc_parse(struct cache_detail *cd,
+ char *mesg, int mlen)
+{
+ /* contexthandle expiry [ uid gid N <n gids> mechname ...mechdata... ] */
+ char *buf = mesg;
+ int len, rv;
+ struct rsc *rsci, *res = NULL;
+ time_t expiry;
+ int status = -EINVAL;
+
+ OBD_ALLOC(rsci, sizeof(*rsci));
+ if (!rsci) {
+ CERROR("fail to alloc rsci\n");
+ return -ENOMEM;
+ }
+ cache_init(&rsci->h);
+
+ /* context handle */
+ len = qword_get(&mesg, buf, mlen);
+ if (len < 0) goto out;
+ status = -ENOMEM;
+ if (rawobj_alloc(&rsci->handle, buf, len))
+ goto out;
+
+ /* expiry */
+ expiry = get_expiry(&mesg);
+ status = -EINVAL;
+ if (expiry == 0)
+ goto out;
+
+ /* remote flag */
+ rv = get_int(&mesg, &rsci->remote);
+ if (rv) {
+ CERROR("fail to get remote flag\n");
+ goto out;
+ }
+
+ /* uid, or NEGATIVE */
+ rv = get_int(&mesg, &rsci->cred.vc_uid);
+ if (rv == -EINVAL)
+ goto out;
+ if (rv == -ENOENT)
+ set_bit(CACHE_NEGATIVE, &rsci->h.flags);
+ else {
+ int N, i;
+ struct gss_api_mech *gm;
+ rawobj_t tmp_buf;
+ __u64 ctx_expiry;
+
+ /* gid */
+ if (get_int(&mesg, &rsci->cred.vc_gid))
+ goto out;
+
+ /* number of additional gid's */
+ if (get_int(&mesg, &N))
+ goto out;
+ status = -ENOMEM;
+#if 0
+ rsci->cred.vc_ginfo = groups_alloc(N);
+ if (rsci->cred.vc_ginfo == NULL)
+ goto out;
+#endif
+
+ /* gid's */
+ status = -EINVAL;
+ for (i=0; i<N; i++) {
+ gid_t gid;
+ if (get_int(&mesg, &gid))
+ goto out;
+#if 0
+ GROUP_AT(rsci->cred.vc_ginfo, i) = gid;
+#endif
+ }
+
+ /* mech name */
+ len = qword_get(&mesg, buf, mlen);
+ if (len < 0)
+ goto out;
+ gm = kgss_name_to_mech(buf);
+ status = -EOPNOTSUPP;
+ if (!gm)
+ goto out;
+
+ status = -EINVAL;
+ /* mech-specific data: */
+ len = qword_get(&mesg, buf, mlen);
+ if (len < 0) {
+ kgss_mech_put(gm);
+ goto out;
+ }
+ tmp_buf.len = len;
+ tmp_buf.data = buf;
+ if (kgss_import_sec_context(&tmp_buf, gm, &rsci->mechctx)) {
+ kgss_mech_put(gm);
+ goto out;
+ }
+
+ /* currently the expiry time passed down from user-space
+ * is invalid, here we retrieve it from the mech.
+ */
+ if (kgss_inquire_context(rsci->mechctx, &ctx_expiry)) {
+ CERROR("unable to get expire time, drop it\n");
+ set_bit(CACHE_NEGATIVE, &rsci->h.flags);
+ kgss_mech_put(gm);
+ goto out;
+ }
+ expiry = (time_t) ctx_expiry;
+
+ kgss_mech_put(gm);
+ }
+ rsci->h.expiry_time = expiry;
+ spin_lock_init(&rsci->seqdata.sd_lock);
+ res = rsc_lookup(rsci, 1);
+ rsc_put(&res->h, &rsc_cache);
+ status = 0;
+out:
+ if (rsci)
+ rsc_put(&rsci->h, &rsc_cache);
+ return status;
+}
+
+/*
+ * flush all entries with @uid. @uid == -1 will match all.
+ * we only know the uid, maybe netid/nid in the future, in all cases
+ * we must search the whole cache
+ */
+static void rsc_flush(uid_t uid)
+{
+ struct cache_head **ch;
+ struct rsc *rscp;
+ int n;
+ ENTRY;
+
+ write_lock(&rsc_cache.hash_lock);
+ for (n = 0; n < RSC_HASHMAX; n++) {
+ for (ch = &rsc_cache.hash_table[n]; *ch;) {
+ rscp = container_of(*ch, struct rsc, h);
+ if (uid == -1 || rscp->cred.vc_uid == uid) {
+ /* it seems simply set NEGATIVE doesn't work */
+ *ch = (*ch)->next;
+ rscp->h.next = NULL;
+ cache_get(&rscp->h);
+ set_bit(CACHE_NEGATIVE, &rscp->h.flags);
+ clear_bit(CACHE_HASHED, &rscp->h.flags);
+ CWARN("flush rsc %p for uid %u\n",
+ rscp, rscp->cred.vc_uid);
+ rsc_put(&rscp->h, &rsc_cache);
+ rsc_cache.entries--;
+ continue;
+ }
+ ch = &((*ch)->next);
+ }
+ }
+ write_unlock(&rsc_cache.hash_lock);
+ EXIT;
+}
+
+static struct cache_detail rsc_cache = {
+ .hash_size = RSC_HASHMAX,
+ .hash_table = rsc_table,
+ .name = "auth.ptlrpcs.context",
+ .cache_put = rsc_put,
+ .cache_parse = rsc_parse,
+};
+
+static struct rsc *
+gss_svc_searchbyctx(rawobj_t *handle)
+{
+ struct rsc rsci;
+ struct rsc *found;
+
+ rsci.handle = *handle;
+ found = rsc_lookup(&rsci, 0);
+ if (!found)
+ return NULL;
+
+ if (cache_check(&rsc_cache, &found->h, NULL))
+ return NULL;
+
+ return found;
+}
+
+struct gss_svc_data {
+ /* decoded gss client cred: */
+ struct rpc_gss_wire_cred clcred;
+ /* internal used status */
+ unsigned int is_init:1,
+ is_init_continue:1,
+ is_err_notify:1,
+ is_fini:1;
+ int reserve_len;
+};
+
+/* FIXME
+ * another hack: only try to give the svcgssd a chance to handle
+ * upcalls.
+ */
+struct cache_deferred_req* my_defer(struct cache_req *req)
+{
+ yield();
+ return NULL;
+}
+static struct cache_req my_chandle = {my_defer};
+
+/* Implements sequence number algorithm as specified in RFC 2203. */
+static int
+gss_check_seq_num(struct gss_svc_seq_data *sd, __u32 seq_num)
+{
+ int rc = 0;
+
+ spin_lock(&sd->sd_lock);
+ if (seq_num > sd->sd_max) {
+ if (seq_num >= sd->sd_max + GSS_SEQ_WIN) {
+ memset(sd->sd_win, 0, sizeof(sd->sd_win));
+ sd->sd_max = seq_num;
+ } else {
+ while(sd->sd_max < seq_num) {
+ sd->sd_max++;
+ __clear_bit(sd->sd_max % GSS_SEQ_WIN,
+ sd->sd_win);
+ }
+ }
+ __set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win);
+ goto exit;
+ } else if (seq_num + GSS_SEQ_WIN <= sd->sd_max) {
+ rc = 1;
+ goto exit;
+ }
+
+ if (__test_and_set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win))
+ rc = 1;
+exit:
+ spin_unlock(&sd->sd_lock);
+ return rc;
+}
+
+static int
+gss_svc_verify_request(struct ptlrpc_request *req,
+ struct rsc *rsci,
+ struct rpc_gss_wire_cred *gc,
+ __u32 *vp, __u32 vlen)
+{
+ struct ptlrpcs_wire_hdr *sec_hdr;
+ struct gss_ctx *ctx = rsci->mechctx;
+ __u32 maj_stat;
+ rawobj_t msg;
+ rawobj_t mic;
+ ENTRY;
+
+ sec_hdr = (struct ptlrpcs_wire_hdr *) req->rq_reqbuf;
+
+ req->rq_reqmsg = (struct lustre_msg *) (req->rq_reqbuf + sizeof(*sec_hdr));
+ req->rq_reqlen = sec_hdr->msg_len;
+
+ msg.len = sec_hdr->msg_len;
+ msg.data = (__u8 *)req->rq_reqmsg;
+
+ mic.len = le32_to_cpu(*vp++);
+ mic.data = (char *) vp;
+ vlen -= 4;
+
+ if (mic.len > vlen) {
+ CERROR("checksum len %d, while buffer len %d\n",
+ mic.len, vlen);
+ RETURN(GSS_S_CALL_BAD_STRUCTURE);
+ }
+
+ if (mic.len > 256) {
+ CERROR("invalid mic len %d\n", mic.len);
+ RETURN(GSS_S_CALL_BAD_STRUCTURE);
+ }
+
+ maj_stat = kgss_verify_mic(ctx, &msg, &mic, NULL);
+ if (maj_stat != GSS_S_COMPLETE) {
+ CERROR("MIC verification error: major %x\n", maj_stat);
+ RETURN(maj_stat);
+ }
+
+ if (gss_check_seq_num(&rsci->seqdata, gc->gc_seq)) {
+ CERROR("discard request %p with old seq_num %u\n",
+ req, gc->gc_seq);
+ RETURN(GSS_S_DUPLICATE_TOKEN);
+ }
+
+ RETURN(GSS_S_COMPLETE);
+}
+
+static int
+gss_svc_unseal_request(struct ptlrpc_request *req,
+ struct rsc *rsci,
+ struct rpc_gss_wire_cred *gc,
+ __u32 *vp, __u32 vlen)
+{
+ struct ptlrpcs_wire_hdr *sec_hdr;
+ struct gss_ctx *ctx = rsci->mechctx;
+ rawobj_t cipher_text, plain_text;
+ __u32 major;
+ ENTRY;
+
+ sec_hdr = (struct ptlrpcs_wire_hdr *) req->rq_reqbuf;
+
+ if (vlen < 4) {
+ CERROR("vlen only %u\n", vlen);
+ RETURN(GSS_S_CALL_BAD_STRUCTURE);
+ }
+
+ cipher_text.len = le32_to_cpu(*vp++);
+ cipher_text.data = (__u8 *) vp;
+ vlen -= 4;
+
+ if (cipher_text.len > vlen) {
+ CERROR("cipher claimed %u while buf only %u\n",
+ cipher_text.len, vlen);
+ RETURN(GSS_S_CALL_BAD_STRUCTURE);
+ }
+
+ plain_text = cipher_text;
+
+ major = kgss_unwrap(ctx, GSS_C_QOP_DEFAULT, &cipher_text, &plain_text);
+ if (major) {
+ CERROR("unwrap error 0x%x\n", major);
+ RETURN(major);
+ }
+
+ if (gss_check_seq_num(&rsci->seqdata, gc->gc_seq)) {
+ CERROR("discard request %p with old seq_num %u\n",
+ req, gc->gc_seq);
+ RETURN(GSS_S_DUPLICATE_TOKEN);
+ }
+
+ req->rq_reqmsg = (struct lustre_msg *) (vp);
+ req->rq_reqlen = plain_text.len;
+
+ CDEBUG(D_SEC, "msg len %d\n", req->rq_reqlen);
+
+ RETURN(GSS_S_COMPLETE);
+}
+
+static int
+gss_pack_err_notify(struct ptlrpc_request *req,
+ __u32 major, __u32 minor)
+{
+ struct gss_svc_data *svcdata = req->rq_sec_svcdata;
+ __u32 reslen, *resp, *reslenp;
+ char nidstr[PTL_NALFMT_SIZE];
+ const __u32 secdata_len = 7 * 4;
+ int rc;
+ ENTRY;
+
+ OBD_FAIL_RETURN(OBD_FAIL_SVCGSS_ERR_NOTIFY|OBD_FAIL_ONCE, -EINVAL);
+
+ LASSERT(svcdata);
+ svcdata->is_err_notify = 1;
+ svcdata->reserve_len = 7 * 4;
+
+ rc = lustre_pack_reply(req, 0, NULL, NULL);
+ if (rc) {
+ CERROR("could not pack reply, err %d\n", rc);
+ RETURN(rc);
+ }
+
+ LASSERT(req->rq_reply_state);
+ LASSERT(req->rq_reply_state->rs_repbuf);
+ LASSERT(req->rq_reply_state->rs_repbuf_len >= secdata_len);
+ resp = (__u32 *) req->rq_reply_state->rs_repbuf;
+
+ /* header */
+ *resp++ = cpu_to_le32(PTLRPC_SEC_GSS);
+ *resp++ = cpu_to_le32(PTLRPC_SEC_TYPE_NONE);
+ *resp++ = cpu_to_le32(req->rq_replen);
+ reslenp = resp++;
+
+ /* skip lustre msg */
+ resp += req->rq_replen / 4;
+ reslen = svcdata->reserve_len;
+
+	/* gss reply:
+ * version, subflavor, notify, major, minor,
+ * obj1(fake), obj2(fake)
+ */
+ *resp++ = cpu_to_le32(PTLRPC_SEC_GSS_VERSION);
+ *resp++ = cpu_to_le32(PTLRPC_SEC_GSS_KRB5I);
+ *resp++ = cpu_to_le32(PTLRPC_GSS_PROC_ERR);
+ *resp++ = cpu_to_le32(major);
+ *resp++ = cpu_to_le32(minor);
+ *resp++ = 0;
+ *resp++ = 0;
+ reslen -= (4 * 4);
+ /* the actual sec data length */
+ *reslenp = cpu_to_le32(secdata_len);
+
+ req->rq_reply_state->rs_repdata_len += (secdata_len);
+ CWARN("prepare gss error notify(0x%x/0x%x) to %s\n", major, minor,
+ portals_nid2str(req->rq_peer.peer_ni->pni_number,
+ req->rq_peer.peer_id.nid, nidstr));
+ RETURN(0);
+}
+
+static int
+gss_svcsec_handle_init(struct ptlrpc_request *req,
+ struct rpc_gss_wire_cred *gc,
+ __u32 *secdata, __u32 seclen,
+ enum ptlrpcs_error *res)
+{
+ struct gss_svc_data *svcdata = req->rq_sec_svcdata;
+ struct rsc *rsci;
+ struct rsi *rsikey, *rsip;
+ rawobj_t tmpobj;
+ __u32 reslen, *resp, *reslenp;
+ char nidstr[PTL_NALFMT_SIZE];
+ int rc;
+ ENTRY;
+
+ LASSERT(svcdata);
+
+ CWARN("processing gss init(%d) request from %s\n", gc->gc_proc,
+ portals_nid2str(req->rq_peer.peer_ni->pni_number,
+ req->rq_peer.peer_id.nid, nidstr));
+
+ *res = PTLRPCS_BADCRED;
+ OBD_FAIL_RETURN(OBD_FAIL_SVCGSS_INIT_REQ|OBD_FAIL_ONCE, SVC_DROP);
+
+ if (gc->gc_proc == RPC_GSS_PROC_INIT &&
+ gc->gc_ctx.len != 0) {
+ CERROR("proc %d, ctx_len %d: not really init?\n",
+ gc->gc_proc == RPC_GSS_PROC_INIT, gc->gc_ctx.len);
+ RETURN(SVC_DROP);
+ }
+
+ OBD_ALLOC(rsikey, sizeof(*rsikey));
+ if (!rsikey) {
+ CERROR("out of memory\n");
+ RETURN(SVC_DROP);
+ }
+ cache_init(&rsikey->h);
+
+ if (rawobj_dup(&rsikey->in_handle, &gc->gc_ctx)) {
+ CERROR("fail to dup context handle\n");
+ GOTO(out_rsikey, rc = SVC_DROP);
+ }
+ *res = PTLRPCS_BADVERF;
+ if (rawobj_extract(&tmpobj, &secdata, &seclen)) {
+ CERROR("can't extract token\n");
+ GOTO(out_rsikey, rc = SVC_DROP);
+ }
+ if (rawobj_dup(&rsikey->in_token, &tmpobj)) {
+ CERROR("can't duplicate token\n");
+ GOTO(out_rsikey, rc = SVC_DROP);
+ }
+
+ rsip = gssd_upcall(rsikey, &my_chandle);
+ if (!rsip) {
+ CERROR("error in gssd_upcall.\n");
+ GOTO(out_rsikey, rc = SVC_DROP);
+ }
+
+ rsci = gss_svc_searchbyctx(&rsip->out_handle);
+ if (!rsci) {
+ CERROR("rsci still not mature yet?\n");
+ GOTO(out_rsip, rc = SVC_DROP);
+ }
+ CWARN("svcsec create gss context %p(%u@%s)\n",
+ rsci, rsci->cred.vc_uid,
+ portals_nid2str(req->rq_peer.peer_ni->pni_number,
+ req->rq_peer.peer_id.nid, nidstr));
+
+ svcdata->is_init = 1;
+ svcdata->reserve_len = 6 * 4 +
+ size_round4(rsip->out_handle.len) +
+ size_round4(rsip->out_token.len);
+
+ rc = lustre_pack_reply(req, 0, NULL, NULL);
+ if (rc) {
+ CERROR("failed to pack reply, rc = %d\n", rc);
+ GOTO(out, rc = SVC_DROP);
+ }
+
+ /* header */
+ resp = (__u32 *) req->rq_reply_state->rs_repbuf;
+ *resp++ = cpu_to_le32(PTLRPC_SEC_GSS);
+ *resp++ = cpu_to_le32(PTLRPC_SEC_TYPE_NONE);
+ *resp++ = cpu_to_le32(req->rq_replen);
+ reslenp = resp++;
+
+ resp += req->rq_replen / 4;
+ reslen = svcdata->reserve_len;
+
+ /* gss reply:
+ * status, major, minor, seq, out_handle, out_token
+ */
+ *resp++ = cpu_to_le32(PTLRPCS_OK);
+ *resp++ = cpu_to_le32(rsip->major_status);
+ *resp++ = cpu_to_le32(rsip->minor_status);
+ *resp++ = cpu_to_le32(GSS_SEQ_WIN);
+ reslen -= (4 * 4);
+ if (rawobj_serialize(&rsip->out_handle,
+ &resp, &reslen))
+ LBUG();
+ if (rawobj_serialize(&rsip->out_token,
+ &resp, &reslen))
+ LBUG();
+ /* the actual sec data length */
+ *reslenp = cpu_to_le32(svcdata->reserve_len - reslen);
+
+ req->rq_reply_state->rs_repdata_len += le32_to_cpu(*reslenp);
+ CDEBUG(D_SEC, "req %p: msgsize %d, authsize %d, "
+ "total size %d\n", req, req->rq_replen,
+ le32_to_cpu(*reslenp),
+ req->rq_reply_state->rs_repdata_len);
+
+ *res = PTLRPCS_OK;
+
+	/* This is simplified since right now we don't support
+ * INIT_CONTINUE yet.
+ */
+ if (gc->gc_proc == RPC_GSS_PROC_INIT) {
+ struct ptlrpcs_wire_hdr *hdr;
+
+ hdr = buf_to_sec_hdr(req->rq_reqbuf);
+ req->rq_reqmsg = buf_to_lustre_msg(req->rq_reqbuf);
+ req->rq_reqlen = hdr->msg_len;
+
+ rc = SVC_LOGIN;
+ } else
+ rc = SVC_COMPLETE;
+
+out:
+ rsc_put(&rsci->h, &rsc_cache);
+out_rsip:
+ rsi_put(&rsip->h, &rsi_cache);
+out_rsikey:
+ rsi_put(&rsikey->h, &rsi_cache);
+
+ RETURN(rc);
+}
+
+static int
+gss_svcsec_handle_data(struct ptlrpc_request *req,
+ struct rpc_gss_wire_cred *gc,
+ __u32 *secdata, __u32 seclen,
+ enum ptlrpcs_error *res)
+{
+ struct rsc *rsci;
+ char nidstr[PTL_NALFMT_SIZE];
+ __u32 major;
+ int rc;
+ ENTRY;
+
+ *res = PTLRPCS_GSS_CREDPROBLEM;
+
+ rsci = gss_svc_searchbyctx(&gc->gc_ctx);
+ if (!rsci) {
+ CWARN("Invalid gss context handle from %s\n",
+ portals_nid2str(req->rq_peer.peer_ni->pni_number,
+ req->rq_peer.peer_id.nid, nidstr));
+ major = GSS_S_NO_CONTEXT;
+ goto notify_err;
+ }
+
+ switch (gc->gc_svc) {
+ case PTLRPC_GSS_SVC_INTEGRITY:
+ major = gss_svc_verify_request(req, rsci, gc, secdata, seclen);
+ if (major == GSS_S_COMPLETE)
+ break;
+
+ CWARN("fail in verify:0x%x: ctx %p@%s\n", major, rsci,
+ portals_nid2str(req->rq_peer.peer_ni->pni_number,
+ req->rq_peer.peer_id.nid, nidstr));
+ goto notify_err;
+ case PTLRPC_GSS_SVC_PRIVACY:
+ major = gss_svc_unseal_request(req, rsci, gc, secdata, seclen);
+ if (major == GSS_S_COMPLETE)
+ break;
+
+ CWARN("fail in decrypt:0x%x: ctx %p@%s\n", major, rsci,
+ portals_nid2str(req->rq_peer.peer_ni->pni_number,
+ req->rq_peer.peer_id.nid, nidstr));
+ goto notify_err;
+ default:
+ CERROR("unsupported gss service %d\n", gc->gc_svc);
+ GOTO(out, rc = SVC_DROP);
+ }
+
+ req->rq_auth_uid = rsci->cred.vc_uid;
+ req->rq_remote = rsci->remote;
+
+ *res = PTLRPCS_OK;
+ GOTO(out, rc = SVC_OK);
+
+notify_err:
+ if (gss_pack_err_notify(req, major, 0))
+ rc = SVC_DROP;
+ else
+ rc = SVC_COMPLETE;
+out:
+ if (rsci)
+ rsc_put(&rsci->h, &rsc_cache);
+ RETURN(rc);
+}
+
+static int
+gss_svcsec_handle_destroy(struct ptlrpc_request *req,
+ struct rpc_gss_wire_cred *gc,
+ __u32 *secdata, __u32 seclen,
+ enum ptlrpcs_error *res)
+{
+ struct gss_svc_data *svcdata = req->rq_sec_svcdata;
+ struct rsc *rsci;
+ char nidstr[PTL_NALFMT_SIZE];
+ int rc;
+ ENTRY;
+
+ LASSERT(svcdata);
+ *res = PTLRPCS_GSS_CREDPROBLEM;
+
+ rsci = gss_svc_searchbyctx(&gc->gc_ctx);
+ if (!rsci) {
+ CWARN("invalid gss context handle for destroy.\n");
+ RETURN(SVC_DROP);
+ }
+
+ if (gc->gc_svc != PTLRPC_GSS_SVC_INTEGRITY) {
+ CERROR("service %d is not supported in destroy.\n",
+ gc->gc_svc);
+ GOTO(out, rc = SVC_DROP);
+ }
+
+ *res = gss_svc_verify_request(req, rsci, gc, secdata, seclen);
+ if (*res)
+ GOTO(out, rc = SVC_DROP);
+
+ /* compose reply, which is actually nothing */
+ svcdata->is_fini = 1;
+ if (lustre_pack_reply(req, 0, NULL, NULL))
+ GOTO(out, rc = SVC_DROP);
+
+ CWARN("svcsec destroy gss context %p(%u@%s)\n",
+ rsci, rsci->cred.vc_uid,
+ portals_nid2str(req->rq_peer.peer_ni->pni_number,
+ req->rq_peer.peer_id.nid, nidstr));
+
+ set_bit(CACHE_NEGATIVE, &rsci->h.flags);
+ *res = PTLRPCS_OK;
+ rc = SVC_LOGOUT;
+out:
+ rsc_put(&rsci->h, &rsc_cache);
+ RETURN(rc);
+}
+
+/*
+ * let incoming request go through security check:
+ * o context establishment: invoke user space helper
+ * o data exchange: verify/decrypt
+ * o context destruction: mark context invalid
+ *
+ * in most cases, error will result to drop the packet silently.
+ */
+static int
+gss_svcsec_accept(struct ptlrpc_request *req, enum ptlrpcs_error *res)
+{
+ struct gss_svc_data *svcdata;
+ struct rpc_gss_wire_cred *gc;
+ struct ptlrpcs_wire_hdr *sec_hdr;
+ __u32 seclen, *secdata, version, subflavor;
+ int rc;
+ ENTRY;
+
+ CDEBUG(D_SEC, "request %p\n", req);
+ LASSERT(req->rq_reqbuf);
+ LASSERT(req->rq_reqbuf_len);
+
+ *res = PTLRPCS_BADCRED;
+
+ sec_hdr = buf_to_sec_hdr(req->rq_reqbuf);
+ LASSERT(sec_hdr->flavor == PTLRPC_SEC_GSS);
+
+ seclen = req->rq_reqbuf_len - sizeof(*sec_hdr) - sec_hdr->msg_len;
+ secdata = (__u32 *) buf_to_sec_data(req->rq_reqbuf);
+
+ if (sec_hdr->sec_len > seclen) {
+ CERROR("seclen %d, while max buf %d\n",
+ sec_hdr->sec_len, seclen);
+ RETURN(SVC_DROP);
+ }
+
+ if (seclen < 6 * 4) {
+ CERROR("sec size %d too small\n", seclen);
+ RETURN(SVC_DROP);
+ }
+
+ LASSERT(!req->rq_sec_svcdata);
+ OBD_ALLOC(svcdata, sizeof(*svcdata));
+ if (!svcdata) {
+ CERROR("fail to alloc svcdata\n");
+ RETURN(SVC_DROP);
+ }
+ req->rq_sec_svcdata = svcdata;
+ gc = &svcdata->clcred;
+
+ /* Now secdata/seclen is what we want to parse
+ */
+ version = le32_to_cpu(*secdata++); /* version */
+ subflavor = le32_to_cpu(*secdata++); /* subflavor */
+ gc->gc_proc = le32_to_cpu(*secdata++); /* proc */
+ gc->gc_seq = le32_to_cpu(*secdata++); /* seq */
+ gc->gc_svc = le32_to_cpu(*secdata++); /* service */
+ seclen -= 5 * 4;
+
+ CDEBUG(D_SEC, "wire gss_hdr: %u/%u/%u/%u/%u\n",
+ version, subflavor, gc->gc_proc, gc->gc_seq, gc->gc_svc);
+
+ if (version != PTLRPC_SEC_GSS_VERSION) {
+ CERROR("gss version mismatch: %d - %d\n",
+ version, PTLRPC_SEC_GSS_VERSION);
+ GOTO(err_free, rc = SVC_DROP);
+ }
+
+ if (rawobj_extract(&gc->gc_ctx, &secdata, &seclen)) {
+ CERROR("fail to obtain gss context handle\n");
+ GOTO(err_free, rc = SVC_DROP);
+ }
+
+ *res = PTLRPCS_BADVERF;
+ switch(gc->gc_proc) {
+ case RPC_GSS_PROC_INIT:
+ case RPC_GSS_PROC_CONTINUE_INIT:
+ rc = gss_svcsec_handle_init(req, gc, secdata, seclen, res);
+ break;
+ case RPC_GSS_PROC_DATA:
+ rc = gss_svcsec_handle_data(req, gc, secdata, seclen, res);
+ break;
+ case RPC_GSS_PROC_DESTROY:
+ rc = gss_svcsec_handle_destroy(req, gc, secdata, seclen, res);
+ break;
+ default:
+ rc = SVC_DROP;
+ LBUG();
+ }
+
+err_free:
+ if (rc == SVC_DROP && req->rq_sec_svcdata) {
+ OBD_FREE(req->rq_sec_svcdata, sizeof(struct gss_svc_data));
+ req->rq_sec_svcdata = NULL;
+ }
+
+ RETURN(rc);
+}
+
+static int
+gss_svcsec_authorize(struct ptlrpc_request *req)
+{
+ struct ptlrpc_reply_state *rs = req->rq_reply_state;
+ struct gss_svc_data *gsd = (struct gss_svc_data *)req->rq_sec_svcdata;
+ struct rpc_gss_wire_cred *gc = &gsd->clcred;
+ struct rsc *rscp;
+ struct ptlrpcs_wire_hdr *sec_hdr;
+ rawobj_buf_t msg_buf;
+ rawobj_t cipher_buf;
+ __u32 *vp, *vpsave, major, vlen, seclen;
+ rawobj_t lmsg, mic;
+ int ret;
+ ENTRY;
+
+ LASSERT(rs);
+ LASSERT(rs->rs_repbuf);
+ LASSERT(gsd);
+
+ if (gsd->is_init || gsd->is_init_continue ||
+ gsd->is_err_notify || gsd->is_fini) {
+ /* nothing to do in these cases */
+ CDEBUG(D_SEC, "req %p: init/fini/err\n", req);
+ RETURN(0);
+ }
+
+ if (gc->gc_proc != RPC_GSS_PROC_DATA) {
+ CERROR("proc %d not support\n", gc->gc_proc);
+ RETURN(-EINVAL);
+ }
+
+ rscp = gss_svc_searchbyctx(&gc->gc_ctx);
+ if (!rscp) {
+ CERROR("ctx disapeared under us?\n");
+ RETURN(-EINVAL);
+ }
+
+ sec_hdr = (struct ptlrpcs_wire_hdr *) rs->rs_repbuf;
+ switch (gc->gc_svc) {
+ case PTLRPC_GSS_SVC_INTEGRITY:
+ /* prepare various pointers */
+ lmsg.len = req->rq_replen;
+ lmsg.data = (__u8 *) (rs->rs_repbuf + sizeof(*sec_hdr));
+ vp = (__u32 *) (lmsg.data + lmsg.len);
+ vlen = rs->rs_repbuf_len - sizeof(*sec_hdr) - lmsg.len;
+ seclen = vlen;
+
+ sec_hdr->flavor = cpu_to_le32(PTLRPC_SEC_GSS);
+ sec_hdr->sectype = cpu_to_le32(PTLRPC_SEC_TYPE_AUTH);
+ sec_hdr->msg_len = cpu_to_le32(req->rq_replen);
+
+ /* standard gss hdr */
+ LASSERT(vlen >= 7 * 4);
+ *vp++ = cpu_to_le32(PTLRPC_SEC_GSS_VERSION);
+ *vp++ = cpu_to_le32(PTLRPC_SEC_GSS_KRB5I);
+ *vp++ = cpu_to_le32(RPC_GSS_PROC_DATA);
+ *vp++ = cpu_to_le32(gc->gc_seq);
+ *vp++ = cpu_to_le32(PTLRPC_GSS_SVC_INTEGRITY);
+ *vp++ = 0; /* fake ctx handle */
+ vpsave = vp++; /* reserve size */
+ vlen -= 7 * 4;
+
+ mic.len = vlen;
+ mic.data = (char *) vp;
+
+ major = kgss_get_mic(rscp->mechctx, 0, &lmsg, &mic);
+ if (major) {
+ CERROR("fail to get MIC: 0x%x\n", major);
+ GOTO(out, ret = -EINVAL);
+ }
+ *vpsave = cpu_to_le32(mic.len);
+ seclen = seclen - vlen + mic.len;
+ sec_hdr->sec_len = cpu_to_le32(seclen);
+ rs->rs_repdata_len += size_round(seclen);
+ break;
+ case PTLRPC_GSS_SVC_PRIVACY:
+ vp = (__u32 *) (rs->rs_repbuf + sizeof(*sec_hdr));
+ vlen = rs->rs_repbuf_len - sizeof(*sec_hdr);
+ seclen = vlen;
+
+ sec_hdr->flavor = cpu_to_le32(PTLRPC_SEC_GSS);
+ sec_hdr->sectype = cpu_to_le32(PTLRPC_SEC_TYPE_PRIV);
+ sec_hdr->msg_len = cpu_to_le32(0);
+
+ /* standard gss hdr */
+ LASSERT(vlen >= 7 * 4);
+ *vp++ = cpu_to_le32(PTLRPC_SEC_GSS_VERSION);
+ *vp++ = cpu_to_le32(PTLRPC_SEC_GSS_KRB5I);
+ *vp++ = cpu_to_le32(RPC_GSS_PROC_DATA);
+ *vp++ = cpu_to_le32(gc->gc_seq);
+ *vp++ = cpu_to_le32(PTLRPC_GSS_SVC_PRIVACY);
+ *vp++ = 0; /* fake ctx handle */
+ vpsave = vp++; /* reserve size */
+ vlen -= 7 * 4;
+
+ msg_buf.buf = (__u8 *) rs->rs_msg - GSS_PRIVBUF_PREFIX_LEN;
+ msg_buf.buflen = req->rq_replen + GSS_PRIVBUF_PREFIX_LEN +
+ GSS_PRIVBUF_SUFFIX_LEN;
+ msg_buf.dataoff = GSS_PRIVBUF_PREFIX_LEN;
+ msg_buf.datalen = req->rq_replen;
+
+ cipher_buf.data = (__u8 *) vp;
+ cipher_buf.len = vlen;
+
+ major = kgss_wrap(rscp->mechctx, GSS_C_QOP_DEFAULT,
+ &msg_buf, &cipher_buf);
+ if (major) {
+ CERROR("failed to wrap: 0x%x\n", major);
+ GOTO(out, ret = -EINVAL);
+ }
+
+ *vpsave = cpu_to_le32(cipher_buf.len);
+ seclen = seclen - vlen + cipher_buf.len;
+ sec_hdr->sec_len = cpu_to_le32(seclen);
+ rs->rs_repdata_len += size_round(seclen);
+ break;
+ default:
+ CERROR("Unknown service %d\n", gc->gc_svc);
+ GOTO(out, ret = -EINVAL);
+ }
+ ret = 0;
+out:
+ rsc_put(&rscp->h, &rsc_cache);
+
+ RETURN(ret);
+}
+
+static
+void gss_svcsec_cleanup_req(struct ptlrpc_svcsec *svcsec,
+ struct ptlrpc_request *req)
+{
+ struct gss_svc_data *gsd = (struct gss_svc_data *) req->rq_sec_svcdata;
+
+ if (!gsd) {
+ CDEBUG(D_SEC, "no svc_data present. do nothing\n");
+ return;
+ }
+
+ /* gsd->clclred.gc_ctx is NOT allocated, just set pointer
+ * to the incoming packet buffer, so don't need free it
+ */
+ OBD_FREE(gsd, sizeof(*gsd));
+ req->rq_sec_svcdata = NULL;
+ return;
+}
+
+static
+int gss_svcsec_est_payload(struct ptlrpc_svcsec *svcsec,
+ struct ptlrpc_request *req,
+ int msgsize)
+{
+ struct gss_svc_data *svcdata = req->rq_sec_svcdata;
+ ENTRY;
+
+ /* just return the pre-set reserve_len for init/fini/err cases.
+ */
+ LASSERT(svcdata);
+ if (svcdata->is_init) {
+ CDEBUG(D_SEC, "is_init, reserver size %d(%d)\n",
+ size_round(svcdata->reserve_len),
+ svcdata->reserve_len);
+ LASSERT(svcdata->reserve_len);
+ LASSERT(svcdata->reserve_len % 4 == 0);
+ RETURN(size_round(svcdata->reserve_len));
+ } else if (svcdata->is_err_notify) {
+ CDEBUG(D_SEC, "is_err_notify, reserver size %d(%d)\n",
+ size_round(svcdata->reserve_len),
+ svcdata->reserve_len);
+ RETURN(size_round(svcdata->reserve_len));
+ } else if (svcdata->is_fini) {
+ CDEBUG(D_SEC, "is_fini, reserver size 0\n");
+ RETURN(0);
+ } else {
+ if (svcdata->clcred.gc_svc == PTLRPC_GSS_SVC_NONE ||
+ svcdata->clcred.gc_svc == PTLRPC_GSS_SVC_INTEGRITY)
+ RETURN(size_round(GSS_MAX_AUTH_PAYLOAD));
+ else if (svcdata->clcred.gc_svc == PTLRPC_GSS_SVC_PRIVACY)
+ RETURN(size_round16(GSS_MAX_AUTH_PAYLOAD + msgsize +
+ GSS_PRIVBUF_PREFIX_LEN +
+ GSS_PRIVBUF_SUFFIX_LEN));
+ else {
+ CERROR("unknown gss svc %u\n", svcdata->clcred.gc_svc);
+ *((int *)0) = 0;
+ LBUG();
+ }
+ }
+ RETURN(0);
+}
+
+int gss_svcsec_alloc_repbuf(struct ptlrpc_svcsec *svcsec,
+ struct ptlrpc_request *req,
+ int msgsize)
+{
+ struct gss_svc_data *gsd = (struct gss_svc_data *) req->rq_sec_svcdata;
+ struct ptlrpc_reply_state *rs;
+ int msg_payload, sec_payload;
+ int privacy, rc;
+ ENTRY;
+
+ /* determine the security type: none/auth or priv, we have
+ * different pack scheme for them.
+ * init/fini/err will always be treated as none/auth.
+ */
+ LASSERT(gsd);
+ if (!gsd->is_init && !gsd->is_init_continue &&
+ !gsd->is_fini && !gsd->is_err_notify &&
+ gsd->clcred.gc_svc == PTLRPC_GSS_SVC_PRIVACY)
+ privacy = 1;
+ else
+ privacy = 0;
+
+ msg_payload = privacy ? 0 : msgsize;
+ sec_payload = gss_svcsec_est_payload(svcsec, req, msgsize);
+
+ rc = svcsec_alloc_reply_state(req, msg_payload, sec_payload);
+ if (rc)
+ RETURN(rc);
+
+ rs = req->rq_reply_state;
+ LASSERT(rs);
+ rs->rs_msg_len = msgsize;
+
+ if (privacy) {
+ /* we can choose to let msg simply point to the rear of the
+ * buffer, which lead to buffer overlap when doing encryption.
+ * usually it's ok and it indeed passed all existing tests.
+ * but not sure if there will be subtle problems in the future.
+ * so right now we choose to alloc another new buffer. we'll
+ * see how it works.
+ */
+#if 0
+ rs->rs_msg = (struct lustre_msg *)
+ (rs->rs_repbuf + rs->rs_repbuf_len -
+ msgsize - GSS_PRIVBUF_SUFFIX_LEN);
+#endif
+ char *msgbuf;
+
+ msgsize += GSS_PRIVBUF_PREFIX_LEN + GSS_PRIVBUF_SUFFIX_LEN;
+ OBD_ALLOC(msgbuf, msgsize);
+ if (!msgbuf) {
+ CERROR("can't alloc %d\n", msgsize);
+ svcsec_free_reply_state(rs);
+ req->rq_reply_state = NULL;
+ RETURN(-ENOMEM);
+ }
+ rs->rs_msg = (struct lustre_msg *)
+ (msgbuf + GSS_PRIVBUF_PREFIX_LEN);
+ }
+
+ req->rq_repmsg = rs->rs_msg;
+
+ RETURN(0);
+}
+
+static
+void gss_svcsec_free_repbuf(struct ptlrpc_svcsec *svcsec,
+ struct ptlrpc_reply_state *rs)
+{
+ unsigned long p1 = (unsigned long) rs->rs_msg;
+ unsigned long p2 = (unsigned long) rs->rs_buf;
+
+ LASSERT(rs->rs_buf);
+ LASSERT(rs->rs_msg);
+ LASSERT(rs->rs_msg_len);
+
+ if (p1 < p2 || p1 >= p2 + rs->rs_buf_len) {
+ char *start = (char*) rs->rs_msg - GSS_PRIVBUF_PREFIX_LEN;
+ int size = rs->rs_msg_len + GSS_PRIVBUF_PREFIX_LEN +
+ GSS_PRIVBUF_SUFFIX_LEN;
+ OBD_FREE(start, size);
+ }
+
+ svcsec_free_reply_state(rs);
+}
+
+struct ptlrpc_svcsec svcsec_gss = {
+ .pss_owner = THIS_MODULE,
+ .pss_name = "GSS_SVCSEC",
+ .pss_flavor = {PTLRPC_SEC_GSS, 0},
+ .accept = gss_svcsec_accept,
+ .authorize = gss_svcsec_authorize,
+ .alloc_repbuf = gss_svcsec_alloc_repbuf,
+ .free_repbuf = gss_svcsec_free_repbuf,
+ .cleanup_req = gss_svcsec_cleanup_req,
+};
+
+/* XXX hacking */
+void lgss_svc_cache_purge_all(void)
+{
+ cache_purge(&rsi_cache);
+ cache_purge(&rsc_cache);
+}
+EXPORT_SYMBOL(lgss_svc_cache_purge_all);
+
+void lgss_svc_cache_flush(__u32 uid)
+{
+ rsc_flush(uid);
+}
+EXPORT_SYMBOL(lgss_svc_cache_flush);
+
+int gss_svc_init(void)
+{
+ int rc;
+
+ rc = svcsec_register(&svcsec_gss);
+ if (!rc) {
+ cache_register(&rsc_cache);
+ cache_register(&rsi_cache);
+ }
+ return rc;
+}
+
+void gss_svc_exit(void)
+{
+ int rc;
+ if ((rc = cache_unregister(&rsi_cache)))
+ CERROR("unregister rsi cache: %d\n", rc);
+ if ((rc = cache_unregister(&rsc_cache)))
+ CERROR("unregister rsc cache: %d\n", rc);
+ if ((rc = svcsec_unregister(&svcsec_gss)))
+ CERROR("unregister svcsec_gss: %d\n", rc);
+}
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_SEC
+#ifdef __KERNEL__
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#else
+#include <liblustre.h>
+#endif
+
+#include <libcfs/kp30.h>
+#include <linux/obd.h>
+#include <linux/obd_class.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_net.h>
+#include <linux/lustre_import.h>
+#include <linux/lustre_dlm.h>
+#include <linux/lustre_sec.h>
+
+static spinlock_t sectypes_lock = SPIN_LOCK_UNLOCKED;
+static struct ptlrpc_sec_type *sectypes[PTLRPC_SEC_MAX_FLAVORS] = {
+ NULL,
+};
+
+int ptlrpcs_register(struct ptlrpc_sec_type *type)
+{
+ __u32 flavor = type->pst_flavor.flavor;
+
+ LASSERT(type->pst_name);
+ LASSERT(type->pst_ops);
+
+ if (flavor >= PTLRPC_SEC_MAX_FLAVORS)
+ return -EINVAL;
+
+        spin_lock(&sectypes_lock);
+ if (sectypes[flavor]) {
+                spin_unlock(&sectypes_lock);
+ return -EALREADY;
+ }
+ sectypes[flavor] = type;
+ atomic_set(&type->pst_inst, 0);
+        spin_unlock(&sectypes_lock);
+
+ CWARN("Security module %s registered\n", type->pst_name);
+ return 0;
+}
+
+int ptlrpcs_unregister(struct ptlrpc_sec_type *type)
+{
+ __u32 flavor = type->pst_flavor.flavor;
+
+ if (flavor >= PTLRPC_SEC_MAX_FLAVORS)
+ return -EINVAL;
+
+        spin_lock(&sectypes_lock);
+ if (!sectypes[flavor]) {
+                spin_unlock(&sectypes_lock);
+ return -EINVAL;
+ }
+
+ if (sectypes[flavor] != type) {
+ CERROR("invalid unregister\n");
+ return -EINVAL;
+ }
+
+ if (atomic_read(&type->pst_inst)) {
+ CERROR("sec module %s still have instance %d\n",
+ type->pst_name, atomic_read(&type->pst_inst));
+                spin_unlock(&sectypes_lock);
+ return -EINVAL;
+ }
+
+ CDEBUG(D_SEC, "Security module %s unregistered\n", type->pst_name);
+ sectypes[flavor] = NULL;
+        spin_unlock(&sectypes_lock);
+
+ return 0;
+}
+
+static
+struct ptlrpc_sec_type * ptlrpcs_flavor2type(ptlrpcs_flavor_t *flavor)
+{
+ struct ptlrpc_sec_type *type;
+ __u32 major = flavor->flavor;
+
+ if (major >= PTLRPC_SEC_MAX_FLAVORS)
+ return NULL;
+
+        spin_lock(&sectypes_lock);
+ type = sectypes[major];
+ if (type && !try_module_get(type->pst_owner))
+ type = NULL;
+        spin_unlock(&sectypes_lock);
+ return type;
+}
+
+static inline
+void ptlrpcs_type_put(struct ptlrpc_sec_type *type)
+{
+ module_put(type->pst_owner);
+}
+
+/***********************************************
+ * credential cache helpers *
+ ***********************************************/
+
+void ptlrpcs_init_credcache(struct ptlrpc_sec *sec)
+{
+ int i;
+ for (i = 0; i < PTLRPC_CREDCACHE_NR; i++)
+ INIT_LIST_HEAD(&sec->ps_credcache[i]);
+ sec->ps_nextgc = get_seconds() + (sec->ps_expire >> 1);
+}
+
+static void ptlrpcs_cred_destroy(struct ptlrpc_cred *cred)
+{
+ struct ptlrpc_sec *sec = cred->pc_sec;
+
+ LASSERT(cred->pc_sec);
+ LASSERT(atomic_read(&cred->pc_refcount) == 0);
+ LASSERT(list_empty(&cred->pc_hash));
+
+ cred->pc_ops->destroy(cred);
+ atomic_dec(&sec->ps_credcount);
+}
+
+static void ptlrpcs_destroy_credlist(struct list_head *head)
+{
+ struct ptlrpc_cred *cred;
+
+ while (!list_empty(head)) {
+ cred = list_entry(head->next, struct ptlrpc_cred, pc_hash);
+ list_del_init(&cred->pc_hash);
+ ptlrpcs_cred_destroy(cred);
+ }
+}
+
+static
+int ptlrpcs_cred_unlink_expired(struct ptlrpc_cred *cred,
+ struct list_head *freelist)
+{
+ LASSERT(cred->pc_sec);
+
+ if (atomic_read(&cred->pc_refcount) != 0)
+ return 0;
+ if (time_after(cred->pc_expire, get_seconds()))
+ return 0;
+
+ list_del(&cred->pc_hash);
+ list_add(&cred->pc_hash, freelist);
+ CDEBUG(D_SEC, "put cred %p into freelist\n", cred);
+ return 1;
+}
+
+static
+void ptlrpcs_credcache_gc(struct ptlrpc_sec *sec,
+ struct list_head *freelist)
+{
+ struct ptlrpc_cred *cred, *n;
+ int i;
+ ENTRY;
+
+ CDEBUG(D_SEC, "do gc on sec %s\n", sec->ps_type->pst_name);
+ for (i = 0; i < PTLRPC_CREDCACHE_NR; i++) {
+ list_for_each_entry_safe(cred, n, &sec->ps_credcache[i],
+ pc_hash) {
+ ptlrpcs_cred_unlink_expired(cred, freelist);
+ }
+ }
+ sec->ps_nextgc = get_seconds() + sec->ps_expire;
+ EXIT;
+}
+
+static
+int ptlrpcs_flush_credcache(struct ptlrpc_sec *sec, int force)
+{
+ struct ptlrpc_cred *cred, *n;
+ LIST_HEAD(freelist);
+ int i, busy = 0;
+ ENTRY;
+
+ spin_lock(&sec->ps_lock);
+ for (i = 0; i < PTLRPC_CREDCACHE_NR; i++) {
+ list_for_each_entry_safe(cred, n, &sec->ps_credcache[i],
+ pc_hash) {
+ LASSERT(atomic_read(&cred->pc_refcount) >= 0);
+ if (atomic_read(&cred->pc_refcount)) {
+ busy = 1;
+ if (!force)
+ continue;
+ list_del_init(&cred->pc_hash);
+ } else
+ list_move(&cred->pc_hash, &freelist);
+
+ /* don't remove CRED_UPTODATE flag here */
+ cred->pc_flags |= PTLRPC_CRED_DEAD;
+ }
+ }
+ spin_unlock(&sec->ps_lock);
+ ptlrpcs_destroy_credlist(&freelist);
+ RETURN(busy);
+}
+
+/**************************************************
+ * credential APIs *
+ **************************************************/
+
+static inline
+int ptlrpcs_cred_get_hash(__u64 pag)
+{
+ LASSERT((pag & PTLRPC_CREDCACHE_MASK) < PTLRPC_CREDCACHE_NR);
+ return (pag & PTLRPC_CREDCACHE_MASK);
+}
+
+static
+struct ptlrpc_cred * cred_cache_lookup(struct ptlrpc_sec *sec,
+ struct vfs_cred *vcred,
+ struct ptlrpc_request *req,
+ int create)
+{
+ struct ptlrpc_cred *cred, *new = NULL, *n;
+ LIST_HEAD(freelist);
+ int hash, found = 0;
+ ENTRY;
+
+ hash = ptlrpcs_cred_get_hash(vcred->vc_pag);
+
+retry:
+ spin_lock(&sec->ps_lock);
+ /* do gc if expired */
+ if (time_after(get_seconds(), sec->ps_nextgc))
+ ptlrpcs_credcache_gc(sec, &freelist);
+
+ list_for_each_entry_safe(cred, n, &sec->ps_credcache[hash], pc_hash) {
+ if (cred->pc_flags & PTLRPC_CRED_DEAD)
+ continue;
+ if (ptlrpcs_cred_unlink_expired(cred, &freelist))
+ continue;
+ if (cred->pc_ops->match(cred, req, vcred)) {
+ found = 1;
+ break;
+ }
+ }
+
+ if (found) {
+ if (new && new != cred) {
+ /* lost the race, just free it */
+ list_add(&new->pc_hash, &freelist);
+ }
+ list_move(&cred->pc_hash, &sec->ps_credcache[hash]);
+ } else {
+ if (new) {
+ list_add(&new->pc_hash, &sec->ps_credcache[hash]);
+ cred = new;
+ } else if (create) {
+ spin_unlock(&sec->ps_lock);
+ new = sec->ps_type->pst_ops->create_cred(sec, req, vcred);
+ if (new) {
+ atomic_inc(&sec->ps_credcount);
+ goto retry;
+ }
+ } else
+ cred = NULL;
+ }
+
+ /* hold a ref */
+ if (cred)
+ atomic_inc(&cred->pc_refcount);
+
+ spin_unlock(&sec->ps_lock);
+
+ ptlrpcs_destroy_credlist(&freelist);
+ RETURN(cred);
+}
+
+struct ptlrpc_cred * ptlrpcs_cred_lookup(struct ptlrpc_sec *sec,
+ struct vfs_cred *vcred)
+{
+ struct ptlrpc_cred *cred;
+ ENTRY;
+
+ cred = cred_cache_lookup(sec, vcred, NULL, 0);
+ RETURN(cred);
+}
+
+int ptlrpcs_req_get_cred(struct ptlrpc_request *req)
+{
+ struct obd_import *imp = req->rq_import;
+ struct vfs_cred vcred;
+ ENTRY;
+
+ LASSERT(!req->rq_cred);
+ LASSERT(imp);
+ LASSERT(imp->imp_sec);
+
+ /* XXX
+ * for now we simply let PAG == real uid
+ */
+ vcred.vc_pag = (__u64) current->uid;
+ vcred.vc_uid = current->uid;
+
+ req->rq_cred = cred_cache_lookup(imp->imp_sec, &vcred, req, 1);
+
+ if (!req->rq_cred) {
+ CERROR("req %p: fail to get cred from cache\n", req);
+ RETURN(-ENOMEM);
+ }
+
+ RETURN(0);
+}
+
+static void ptlrpcs_sec_destroy(struct ptlrpc_sec *sec);
+
+void ptlrpcs_cred_put(struct ptlrpc_cred *cred, int sync)
+{
+ struct ptlrpc_sec *sec = cred->pc_sec;
+
+ LASSERT(cred);
+ LASSERT(sec);
+ LASSERT(atomic_read(&cred->pc_refcount));
+
+ spin_lock(&sec->ps_lock);
+ if (atomic_dec_and_test(&cred->pc_refcount) &&
+ sync && cred->pc_flags & PTLRPC_CRED_DEAD) {
+ list_del_init(&cred->pc_hash);
+ ptlrpcs_cred_destroy(cred);
+ if (!atomic_read(&sec->ps_credcount) &&
+ !atomic_read(&sec->ps_refcount)) {
+ CWARN("put last cred on a dead sec %p(%s), "
+ "also destroy the sec\n", sec,
+ sec->ps_type->pst_name);
+ spin_unlock(&sec->ps_lock);
+
+ ptlrpcs_sec_destroy(sec);
+ return;
+ }
+ }
+ spin_unlock(&sec->ps_lock);
+}
+
+void ptlrpcs_req_drop_cred(struct ptlrpc_request *req)
+{
+ ENTRY;
+
+ LASSERT(req);
+ LASSERT(req->rq_cred);
+
+ if (req->rq_cred) {
+ /* We'd like to not use 'sync' mode, but might cause
+ * some cred leak. Need more thinking here. FIXME
+ */
+ ptlrpcs_cred_put(req->rq_cred, 1);
+ req->rq_cred = NULL;
+ } else
+ CDEBUG(D_SEC, "req %p have no cred\n", req);
+ EXIT;
+}
+
+/*
+ * request must have a cred. if failed to get new cred,
+ * just restore the old one
+ */
+int ptlrpcs_req_replace_dead_cred(struct ptlrpc_request *req)
+{
+ struct ptlrpc_cred *cred = req->rq_cred;
+ int rc;
+ ENTRY;
+
+ LASSERT(cred);
+ LASSERT(cred->pc_flags & PTLRPC_CRED_DEAD);
+
+ ptlrpcs_cred_get(cred);
+ ptlrpcs_req_drop_cred(req);
+ LASSERT(!req->rq_cred);
+ rc = ptlrpcs_req_get_cred(req);
+ if (!rc) {
+ LASSERT(req->rq_cred);
+ LASSERT(req->rq_cred != cred);
+ ptlrpcs_cred_put(cred, 1);
+ } else {
+ LASSERT(!req->rq_cred);
+ req->rq_cred = cred;
+ }
+ RETURN(rc);
+}
+
+/* Make sure req->rq_cred is usable: a dead cred is replaced first, then
+ * the (possibly new) cred is refreshed.  Returns 0 when the cred ends up
+ * UPTODATE, negative errno otherwise. */
+int ptlrpcs_req_refresh_cred(struct ptlrpc_request *req)
+{
+        struct ptlrpc_cred *cred = req->rq_cred;
+        int rc;
+        ENTRY;
+
+        LASSERT(cred);
+
+        /* fast path: already up to date and not dead */
+        if ((cred->pc_flags & (PTLRPC_CRED_UPTODATE | PTLRPC_CRED_DEAD)) ==
+            PTLRPC_CRED_UPTODATE)
+                RETURN(0);
+
+        if (cred->pc_flags & PTLRPC_CRED_DEAD) {
+                rc = ptlrpcs_req_replace_dead_cred(req);
+                if (!rc) {
+                        LASSERT(cred != req->rq_cred);
+                        CWARN("req %p: replace cred %p => %p\n",
+                              req, cred, req->rq_cred);
+                        cred = req->rq_cred;
+                } else {
+                        LASSERT(cred == req->rq_cred);
+                        CERROR("req %p: failed to replace dead cred %p\n",
+                               req, cred);
+                        /* NOTE(review): the real rc from replace is
+                         * discarded here and reported as -ENOMEM — confirm
+                         * callers don't care about the distinction */
+                        RETURN(-ENOMEM);
+                }
+        }
+
+        rc = ptlrpcs_cred_refresh(cred);
+        /* success is judged by the UPTODATE flag, not only by rc */
+        if (!(cred->pc_flags & PTLRPC_CRED_UPTODATE)) {
+                CERROR("req %p: failed to refresh cred %p, rc %d\n",
+                       req, cred, rc);
+                if (!rc)
+                        rc = -EACCES;
+        }
+        RETURN(rc);
+}
+
+/* Client side: refresh the request's cred, then sign (NONE/AUTH) or seal
+ * (PRIV) the request buffer.  On a resend of an already-wrapped request the
+ * previous security transform is discarded first by resetting
+ * rq_reqdata_len to the pre-wrap length. */
+int ptlrpcs_cli_wrap_request(struct ptlrpc_request *req)
+{
+        struct ptlrpc_cred *cred;
+        int rc;
+        ENTRY;
+
+        LASSERT(req->rq_cred);
+        LASSERT(req->rq_cred->pc_sec);
+        LASSERT(req->rq_cred->pc_ops);
+        LASSERT(req->rq_reqbuf);
+        LASSERT(req->rq_reqbuf_len);
+
+        rc = ptlrpcs_req_refresh_cred(req);
+        if (rc)
+                RETURN(rc);
+
+        CDEBUG(D_SEC, "wrap req %p\n", req);
+        /* refresh may have replaced the cred; re-read after the call */
+        cred = req->rq_cred;
+
+        switch (cred->pc_sec->ps_sectype) {
+        case PTLRPC_SEC_TYPE_NONE:
+        case PTLRPC_SEC_TYPE_AUTH:
+                if (req->rq_req_wrapped) {
+                        CWARN("req %p(o%u,x"LPU64",t"LPU64") "
+                              "already signed, resend?\n", req,
+                              req->rq_reqmsg ? req->rq_reqmsg->opc : -1,
+                              req->rq_xid, req->rq_transno);
+                        req->rq_req_wrapped = 0;
+                        /* reset to hdr + lustre msg, dropping old sig */
+                        req->rq_reqdata_len = sizeof(struct ptlrpcs_wire_hdr) +
+                                              req->rq_reqlen;
+                        LASSERT(req->rq_reqdata_len % 8 == 0);
+                }
+
+                LASSERT(cred->pc_ops->sign);
+                rc = cred->pc_ops->sign(cred, req);
+                if (!rc)
+                        req->rq_req_wrapped = 1;
+                break;
+        case PTLRPC_SEC_TYPE_PRIV:
+                if (req->rq_req_wrapped) {
+                        CWARN("req %p(o%u,x"LPU64",t"LPU64") "
+                              "already encrypted, resend?\n", req,
+                              req->rq_reqmsg ? req->rq_reqmsg->opc : -1,
+                              req->rq_xid, req->rq_transno);
+                        req->rq_req_wrapped = 0;
+                        /* ciphertext is regenerated, keep only the header */
+                        req->rq_reqdata_len = sizeof(struct ptlrpcs_wire_hdr);
+                        LASSERT(req->rq_reqdata_len % 8 == 0);
+                }
+
+                LASSERT(cred->pc_ops->seal);
+                rc = cred->pc_ops->seal(cred, req);
+                if (!rc)
+                        req->rq_req_wrapped = 1;
+                break;
+        default:
+                LBUG();
+        }
+        /* sanity: wrapped data must be 8-byte aligned and fit the buffer */
+        LASSERT(req->rq_reqdata_len);
+        LASSERT(req->rq_reqdata_len % 8 == 0);
+        LASSERT(req->rq_reqdata_len >= sizeof(struct ptlrpcs_wire_hdr));
+        LASSERT(req->rq_reqdata_len <= req->rq_reqbuf_len);
+
+        RETURN(rc);
+}
+
+/* rq_nob_received is the actual received data length.
+ * Validate the wire header of the reply and dispatch to the cred's
+ * verify (NONE/AUTH) or unseal (PRIV) callback. */
+int ptlrpcs_cli_unwrap_reply(struct ptlrpc_request *req)
+{
+        struct ptlrpc_cred *cred = req->rq_cred;
+        struct ptlrpc_sec *sec;
+        struct ptlrpcs_wire_hdr *sec_hdr;
+        int rc;
+        ENTRY;
+
+        LASSERT(cred);
+        LASSERT(cred->pc_sec);
+        LASSERT(cred->pc_ops);
+        LASSERT(req->rq_repbuf);
+
+        if (req->rq_nob_received < sizeof(*sec_hdr)) {
+                CERROR("req %p: reply size only %d\n",
+                       req, req->rq_nob_received);
+                RETURN(-EPROTO);
+        }
+
+        /* convert header in place: the verify/unseal callbacks below rely
+         * on it already being in host byte order */
+        sec_hdr = (struct ptlrpcs_wire_hdr *) req->rq_repbuf;
+        sec_hdr->flavor = le32_to_cpu(sec_hdr->flavor);
+        sec_hdr->sectype = le32_to_cpu(sec_hdr->sectype);
+        sec_hdr->msg_len = le32_to_cpu(sec_hdr->msg_len);
+        sec_hdr->sec_len = le32_to_cpu(sec_hdr->sec_len);
+
+        CDEBUG(D_SEC, "req %p, cred %p, flavor %u, sectype %u\n",
+               req, cred, sec_hdr->flavor, sec_hdr->sectype);
+
+        sec = cred->pc_sec;
+        if (sec_hdr->flavor != sec->ps_flavor.flavor) {
+                CERROR("unmatched flavor %u while expect %u\n",
+                       sec_hdr->flavor, sec->ps_flavor.flavor);
+                RETURN(-EPROTO);
+        }
+
+        /* msg_len/sec_len come from the wire: sum in 64 bit so huge values
+         * can't wrap around and slip past this bound check on 32 bit */
+        if (sizeof(*sec_hdr) + (__u64) sec_hdr->msg_len + sec_hdr->sec_len >
+            req->rq_nob_received) {
+                CERROR("msg %u, sec %u, while only get %d\n",
+                       sec_hdr->msg_len, sec_hdr->sec_len,
+                       req->rq_nob_received);
+                RETURN(-EPROTO);
+        }
+
+        switch (sec_hdr->sectype) {
+        case PTLRPC_SEC_TYPE_NONE:
+        case PTLRPC_SEC_TYPE_AUTH:
+                LASSERT(cred->pc_ops->verify);
+                rc = cred->pc_ops->verify(cred, req);
+                LASSERT(rc || req->rq_repmsg || req->rq_ptlrpcs_restart);
+                break;
+        case PTLRPC_SEC_TYPE_PRIV:
+                LASSERT(cred->pc_ops->unseal);
+                rc = cred->pc_ops->unseal(cred, req);
+                LASSERT(rc || req->rq_repmsg || req->rq_ptlrpcs_restart);
+                break;
+        default:
+                rc = -1;
+                LBUG();
+        }
+        RETURN(rc);
+}
+
+/**************************************************
+ * security APIs *
+ **************************************************/
+
+/* Instantiate a security module of the given flavor for @import.
+ * On success the new sec holds one refcount and one import reference.
+ * Returns NULL on failure. */
+struct ptlrpc_sec * ptlrpcs_sec_create(ptlrpcs_flavor_t *flavor,
+                                       struct obd_import *import,
+                                       const char *pipe_dir,
+                                       void *pipe_data)
+{
+        struct ptlrpc_sec_type *type;
+        struct ptlrpc_sec *sec;
+        ENTRY;
+
+        /* takes a module reference on the type; dropped on failure below
+         * or in ptlrpcs_sec_destroy() */
+        type = ptlrpcs_flavor2type(flavor);
+        if (!type) {
+                CDEBUG(D_SEC, "invalid major flavor %u\n", flavor->flavor);
+                RETURN(NULL);
+        }
+
+        /* NOTE(review): this check assumes create_sec returns NULL on
+         * failure; an implementation returning ERR_PTR would be treated as
+         * success here — confirm all backends return NULL */
+        sec = type->pst_ops->create_sec(flavor, pipe_dir, pipe_data);
+        if (sec) {
+                spin_lock_init(&sec->ps_lock);
+                ptlrpcs_init_credcache(sec);
+                sec->ps_type = type;
+                sec->ps_flavor = *flavor;
+                sec->ps_import = class_import_get(import);
+                atomic_set(&sec->ps_refcount, 1);
+                atomic_set(&sec->ps_credcount, 0);
+                atomic_inc(&type->pst_inst);
+        } else
+                ptlrpcs_type_put(type);
+
+        return sec;
+}
+
+/* Final teardown of a sec: release type-specific state, drop the type's
+ * instance count/module ref and the import reference taken at create. */
+static void ptlrpcs_sec_destroy(struct ptlrpc_sec *sec)
+{
+        struct ptlrpc_sec_type *type = sec->ps_type;
+        /* ps_import is obtained via class_import_get() on a struct
+         * obd_import in ptlrpcs_sec_create(); it was mistakenly declared
+         * here as struct ptlrpc_import * */
+        struct obd_import *imp = sec->ps_import;
+
+        LASSERT(type && type->pst_ops);
+        LASSERT(type->pst_ops->destroy_sec);
+
+        /* destroy_sec frees @sec itself; don't touch it afterwards */
+        type->pst_ops->destroy_sec(sec);
+        atomic_dec(&type->pst_inst);
+        ptlrpcs_type_put(type);
+        class_import_put(imp);
+}
+
+/* Drop one refcount on @sec.  On the last put the cred cache is flushed;
+ * if creds are still held elsewhere, destruction is deferred to the last
+ * cred put (see the dead-sec path in the cred put code above). */
+void ptlrpcs_sec_put(struct ptlrpc_sec *sec)
+{
+        if (atomic_dec_and_test(&sec->ps_refcount)) {
+                ptlrpcs_flush_credcache(sec, 1);
+
+                if (atomic_read(&sec->ps_credcount) == 0) {
+                        ptlrpcs_sec_destroy(sec);
+                } else {
+                        CWARN("sec %p(%s) is no usage while %d cred still "
+                              "holded, destroy delayed\n",
+                              sec, sec->ps_type->pst_name,
+                              atomic_read(&sec->ps_credcount));
+                }
+        }
+}
+
+/* Force-flush all cached creds of @sec (e.g. on import invalidation). */
+void ptlrpcs_sec_invalidate_cache(struct ptlrpc_sec *sec)
+{
+        ptlrpcs_flush_credcache(sec, 1);
+}
+
+/* Default request-buffer allocation: wire header + lustre msg + security
+ * payload, all 8-byte aligned.  Fills in the wire header in little-endian
+ * byte order; sec_len is written later by the sign/seal callback. */
+int sec_alloc_reqbuf(struct ptlrpc_sec *sec,
+                     struct ptlrpc_request *req,
+                     int msgsize, int secsize)
+{
+        struct ptlrpcs_wire_hdr *hdr;
+        ENTRY;
+
+        LASSERT(msgsize % 8 == 0);
+        LASSERT(secsize % 8 == 0);
+
+        req->rq_reqbuf_len = sizeof(*hdr) + msgsize + secsize;
+        OBD_ALLOC(req->rq_reqbuf, req->rq_reqbuf_len);
+        if (!req->rq_reqbuf) {
+                CERROR("can't alloc %d\n", req->rq_reqbuf_len);
+                RETURN(-ENOMEM);
+        }
+
+        hdr = buf_to_sec_hdr(req->rq_reqbuf);
+        hdr->flavor = cpu_to_le32(sec->ps_flavor.flavor);
+        hdr->sectype = cpu_to_le32(sec->ps_sectype);
+        /* bugfix: msg_len must be little-endian like the other header
+         * fields — the server decodes it with le32_to_cpu() */
+        hdr->msg_len = cpu_to_le32(msgsize);
+        /* security length will be filled later */
+
+        /* later reqdata_len will be added on actual security payload */
+        req->rq_reqdata_len = sizeof(*hdr) + msgsize;
+        req->rq_reqmsg = buf_to_lustre_msg(req->rq_reqbuf);
+
+        CDEBUG(D_SEC, "req %p: rqbuf at %p, len %d, msg %d, sec %d\n",
+               req, req->rq_reqbuf, req->rq_reqbuf_len,
+               msgsize, secsize);
+
+        RETURN(0);
+}
+
+/* when complete successfully, req->rq_reqmsg should point to the
+ * right place.
+ *
+ * Dispatch to the sec type's own alloc_reqbuf if provided, otherwise
+ * fall back to the generic sec_alloc_reqbuf() with no security payload.
+ */
+int ptlrpcs_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize)
+{
+        struct ptlrpc_cred *cred = req->rq_cred;
+        struct ptlrpc_sec *sec;
+        struct ptlrpc_secops *ops;
+
+        LASSERT(msgsize % 8 == 0);
+        LASSERT(sizeof(struct ptlrpcs_wire_hdr) % 8 == 0);
+        LASSERT(cred);
+        LASSERT(atomic_read(&cred->pc_refcount));
+        LASSERT(cred->pc_sec);
+        LASSERT(cred->pc_sec->ps_type);
+        LASSERT(cred->pc_sec->ps_type->pst_ops);
+        LASSERT(req->rq_reqbuf == NULL);
+        LASSERT(req->rq_reqmsg == NULL);
+
+        sec = cred->pc_sec;
+        ops = sec->ps_type->pst_ops;
+        if (ops->alloc_reqbuf)
+                return ops->alloc_reqbuf(sec, req, msgsize);
+        else
+                return sec_alloc_reqbuf(sec, req, msgsize, 0);
+}
+
+/* Default request-buffer free, pairing sec_alloc_reqbuf(). */
+void sec_free_reqbuf(struct ptlrpc_sec *sec,
+                     struct ptlrpc_request *req)
+{
+        LASSERT(req->rq_reqbuf);
+        LASSERT(req->rq_reqbuf_len);
+
+        /* sanity check: reqmsg, if set, must live inside reqbuf */
+        if (req->rq_reqmsg) {
+                LASSERT((char *) req->rq_reqmsg >= req->rq_reqbuf &&
+                        (char *) req->rq_reqmsg < req->rq_reqbuf +
+                                                  req->rq_reqbuf_len);
+        }
+
+        OBD_FREE(req->rq_reqbuf, req->rq_reqbuf_len);
+        req->rq_reqbuf = NULL;
+        req->rq_reqmsg = NULL;
+}
+
+/* Free the request buffer via the sec type's hook, or the default. */
+void ptlrpcs_cli_free_reqbuf(struct ptlrpc_request *req)
+{
+        struct ptlrpc_cred *cred = req->rq_cred;
+        struct ptlrpc_sec *sec;
+        struct ptlrpc_secops *ops;
+
+        LASSERT(cred);
+        LASSERT(atomic_read(&cred->pc_refcount));
+        LASSERT(cred->pc_sec);
+        LASSERT(cred->pc_sec->ps_type);
+        LASSERT(cred->pc_sec->ps_type->pst_ops);
+        LASSERT(req->rq_reqbuf);
+
+        sec = cred->pc_sec;
+        ops = sec->ps_type->pst_ops;
+        if (ops->free_reqbuf)
+                ops->free_reqbuf(sec, req);
+        else
+                sec_free_reqbuf(sec, req);
+}
+
+/* Allocate the client-side reply buffer: wire header + (plaintext msg for
+ * NONE/AUTH; 0 for PRIV, where the msg arrives inside the sec payload) +
+ * estimated security payload. */
+int ptlrpcs_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize)
+{
+        struct ptlrpc_cred *cred = req->rq_cred;
+        struct ptlrpc_sec *sec;
+        struct ptlrpc_secops *ops;
+        int msg_payload, sec_payload;
+        ENTRY;
+
+        LASSERT(msgsize % 8 == 0);
+        LASSERT(sizeof(struct ptlrpcs_wire_hdr) % 8 == 0);
+        LASSERT(cred);
+        LASSERT(atomic_read(&cred->pc_refcount));
+        LASSERT(cred->pc_sec);
+        LASSERT(cred->pc_sec->ps_type);
+        LASSERT(cred->pc_sec->ps_type->pst_ops);
+        LASSERT(req->rq_repbuf == NULL);
+
+        sec = cred->pc_sec;
+        ops = sec->ps_type->pst_ops;
+        if (ops->alloc_repbuf)
+                RETURN(ops->alloc_repbuf(sec, req, msgsize));
+
+        /* default allocation scheme */
+        msg_payload = sec->ps_sectype == PTLRPC_SEC_TYPE_PRIV ? 0 : msgsize;
+        sec_payload = size_round(ptlrpcs_est_rep_payload(sec, msgsize));
+
+        req->rq_repbuf_len = sizeof(struct ptlrpcs_wire_hdr) +
+                             msg_payload + sec_payload;
+        OBD_ALLOC(req->rq_repbuf, req->rq_repbuf_len);
+        if (!req->rq_repbuf)
+                RETURN(-ENOMEM);
+
+        CDEBUG(D_SEC, "req %p: repbuf at %p, len %d, msg %d, sec %d\n",
+               req, req->rq_repbuf, req->rq_repbuf_len,
+               msg_payload, sec_payload);
+
+        RETURN(0);
+}
+
+/* Free the client-side reply buffer via the sec type's hook, or default. */
+void ptlrpcs_cli_free_repbuf(struct ptlrpc_request *req)
+{
+        struct ptlrpc_cred *cred = req->rq_cred;
+        struct ptlrpc_sec *sec;
+        struct ptlrpc_secops *ops;
+        ENTRY;
+
+        LASSERT(cred);
+        LASSERT(atomic_read(&cred->pc_refcount));
+        LASSERT(cred->pc_sec);
+        LASSERT(cred->pc_sec->ps_type);
+        LASSERT(cred->pc_sec->ps_type->pst_ops);
+        LASSERT(req->rq_repbuf);
+
+        sec = cred->pc_sec;
+        ops = sec->ps_type->pst_ops;
+        if (ops->free_repbuf)
+                ops->free_repbuf(sec, req);
+        else {
+                OBD_FREE(req->rq_repbuf, req->rq_repbuf_len);
+                req->rq_repbuf = NULL;
+                req->rq_repmsg = NULL;
+        }
+        EXIT;
+}
+
+/* Attach a security module to @imp.  Flavor selection: mdc/osc obds use
+ * the flavor configured in client_obd; everything else (reverse imports,
+ * server-side obds) falls back to the NULL flavor. */
+int ptlrpcs_import_get_sec(struct obd_import *imp)
+{
+        ptlrpcs_flavor_t flavor = {PTLRPC_SEC_NULL, 0};
+        char *pipedir = NULL;
+        ENTRY;
+
+        LASSERT(imp->imp_obd);
+        LASSERT(imp->imp_obd->obd_type);
+
+        /* old sec might be still there in reconnecting */
+        if (imp->imp_sec)
+                RETURN(0);
+
+        /* find actual flavor for client obd. right now server side
+         * obd (reverse imp, etc) will simply use NULL.
+         */
+        if (!strcmp(imp->imp_obd->obd_type->typ_name, "mdc") ||
+            !strcmp(imp->imp_obd->obd_type->typ_name, "osc")) {
+                struct client_obd *cli = &imp->imp_obd->u.cli;
+
+                if (cli->cl_sec_flavor == PTLRPC_SEC_GSS) {
+                        /* NOTE(review): assumes any non-krb5i subflavor is
+                         * krb5p in this message — confirm */
+                        CWARN("select security gss/%s for %s(%s)\n",
+                              cli->cl_sec_subflavor == PTLRPC_SEC_GSS_KRB5I ?
+                              "krb5i" : "krb5p",
+                              imp->imp_obd->obd_type->typ_name,
+                              imp->imp_obd->obd_name);
+                        flavor.flavor = cli->cl_sec_flavor;
+                        flavor.subflavor = cli->cl_sec_subflavor;
+                        pipedir = imp->imp_obd->obd_name;
+                } else if (cli->cl_sec_flavor == PTLRPC_SEC_NULL) {
+                        CWARN("select security null for %s(%s)\n",
+                              imp->imp_obd->obd_type->typ_name,
+                              imp->imp_obd->obd_name);
+                } else {
+                        /* NOTE(review): message says "mdc" but this branch
+                         * is reached for osc too */
+                        CWARN("unknown security flavor for mdc(%s), "
+                              "use 'null'\n", imp->imp_obd->obd_name);
+                }
+        }
+
+        imp->imp_sec = ptlrpcs_sec_create(&flavor, imp, pipedir, imp);
+        if (!imp->imp_sec)
+                RETURN(-EINVAL);
+        else
+                RETURN(0);
+}
+
+/* Detach and release the security module attached to @imp, if any. */
+void ptlrpcs_import_drop_sec(struct obd_import *imp)
+{
+        ENTRY;
+        if (imp->imp_sec) {
+                ptlrpcs_sec_put(imp->imp_sec);
+                imp->imp_sec = NULL;
+        }
+        EXIT;
+}
+
+/* Module init: register the built-in NULL client and server security
+ * flavors.  GSS init is currently disabled (see #if 0 below). */
+int __init ptlrpc_sec_init(void)
+{
+        int rc;
+
+        if ((rc = ptlrpcs_null_init()))
+                return rc;
+
+        /* roll back the client-side registration on failure */
+        if ((rc = svcsec_null_init())) {
+                ptlrpcs_null_exit();
+                return rc;
+        }
+
+#if 0
+#if !defined __KERNEL__ && defined ENABLE_GSS
+        ptlrpcs_gss_init();
+#endif
+#endif
+        return 0;
+}
+
+/* Module exit: unregister in reverse order of registration. */
+static void __exit ptlrpc_sec_exit(void)
+{
+        svcsec_null_exit();
+        ptlrpcs_null_exit();
+}
+
+
+EXPORT_SYMBOL(ptlrpcs_register);
+EXPORT_SYMBOL(ptlrpcs_unregister);
+EXPORT_SYMBOL(ptlrpcs_sec_create);
+EXPORT_SYMBOL(ptlrpcs_sec_put);
+EXPORT_SYMBOL(ptlrpcs_sec_invalidate_cache);
+EXPORT_SYMBOL(ptlrpcs_import_get_sec);
+EXPORT_SYMBOL(ptlrpcs_import_drop_sec);
+EXPORT_SYMBOL(ptlrpcs_cred_lookup);
+EXPORT_SYMBOL(ptlrpcs_cred_put);
+EXPORT_SYMBOL(ptlrpcs_req_get_cred);
+EXPORT_SYMBOL(ptlrpcs_req_drop_cred);
+EXPORT_SYMBOL(ptlrpcs_req_replace_dead_cred);
+EXPORT_SYMBOL(ptlrpcs_req_refresh_cred);
+EXPORT_SYMBOL(ptlrpcs_cli_alloc_reqbuf);
+EXPORT_SYMBOL(ptlrpcs_cli_free_reqbuf);
+EXPORT_SYMBOL(ptlrpcs_cli_alloc_repbuf);
+EXPORT_SYMBOL(ptlrpcs_cli_free_repbuf);
+EXPORT_SYMBOL(ptlrpcs_cli_wrap_request);
+EXPORT_SYMBOL(ptlrpcs_cli_unwrap_reply);
+EXPORT_SYMBOL(sec_alloc_reqbuf);
+EXPORT_SYMBOL(sec_free_reqbuf);
+
+EXPORT_SYMBOL(svcsec_register);
+EXPORT_SYMBOL(svcsec_unregister);
+EXPORT_SYMBOL(svcsec_accept);
+EXPORT_SYMBOL(svcsec_authorize);
+EXPORT_SYMBOL(svcsec_alloc_repbuf);
+EXPORT_SYMBOL(svcsec_cleanup_req);
+EXPORT_SYMBOL(svcsec_get);
+EXPORT_SYMBOL(svcsec_put);
+EXPORT_SYMBOL(svcsec_alloc_reply_state);
+EXPORT_SYMBOL(svcsec_free_reply_state);
+
+MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_DESCRIPTION("Lustre Security Support");
+MODULE_LICENSE("GPL");
+
+module_init(ptlrpc_sec_init);
+module_exit(ptlrpc_sec_exit);
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_SEC
+#ifdef __KERNEL__
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#else
+#include <liblustre.h>
+#endif
+
+#include <libcfs/kp30.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_net.h>
+#include <linux/lustre_sec.h>
+
+/* NULL creds are always up to date; refresh is a no-op. */
+static int null_cred_refresh(struct ptlrpc_cred *cred)
+{
+        ENTRY;
+        LASSERT(cred->pc_flags & PTLRPC_CRED_UPTODATE);
+        RETURN(0);
+}
+
+/* Any NULL cred matches any request/vfs_cred. */
+static int null_cred_match(struct ptlrpc_cred *cred,
+                           struct ptlrpc_request *req,
+                           struct vfs_cred *vcred)
+{
+        ENTRY;
+        RETURN(1);
+}
+
+/* "Sign" a request: NULL flavor carries no security payload, so just
+ * record a zero sec_len in the wire header. */
+static int null_cred_sign(struct ptlrpc_cred *cred,
+                          struct ptlrpc_request *req)
+{
+        struct ptlrpcs_wire_hdr *hdr = buf_to_sec_hdr(req->rq_reqbuf);
+        ENTRY;
+
+        hdr->sec_len = cpu_to_le32(0);
+
+        RETURN(0);
+}
+
+/* "Verify" a reply: header fields were already converted to host order by
+ * ptlrpcs_cli_unwrap_reply(); just check sec_len and locate the msg. */
+static int null_cred_verify(struct ptlrpc_cred *cred,
+                            struct ptlrpc_request *req)
+{
+        struct ptlrpcs_wire_hdr *hdr = buf_to_sec_hdr(req->rq_repbuf);
+
+        if (hdr->sec_len != 0) {
+                CERROR("security payload %u not zero\n", hdr->sec_len);
+                RETURN(-EPROTO);
+        }
+
+        /* lustre msg immediately follows the wire header */
+        req->rq_repmsg = (struct lustre_msg *)(hdr + 1);
+        req->rq_replen = hdr->msg_len;
+        CDEBUG(D_SEC, "set repmsg at %p, len %d\n",
+               req->rq_repmsg, req->rq_replen);
+
+        RETURN(0);
+}
+
+/* Free a NULL cred; only legal once the last reference is gone. */
+static void null_cred_destroy(struct ptlrpc_cred *cred)
+{
+        LASSERT(!atomic_read(&cred->pc_refcount));
+
+        CDEBUG(D_SEC, "NULL_SEC: destroy cred %p\n", cred);
+        OBD_FREE(cred, sizeof(*cred));
+}
+
+static struct ptlrpc_credops null_credops = {
+ .refresh = null_cred_refresh,
+ .match = null_cred_match,
+ .sign = null_cred_sign,
+ .verify = null_cred_verify,
+ .destroy = null_cred_destroy,
+};
+
+static
+/* Create a NULL-flavor sec.  Must return NULL (not ERR_PTR) on failure:
+ * the caller ptlrpcs_sec_create() only checks for a NULL pointer, so an
+ * ERR_PTR would be mistaken for a valid sec. */
+struct ptlrpc_sec* null_create_sec(ptlrpcs_flavor_t *flavor,
+                                   const char *pipe_dir,
+                                   void *pipe_data)
+{
+        struct ptlrpc_sec *sec;
+        ENTRY;
+
+        LASSERT(flavor->flavor == PTLRPC_SEC_NULL);
+
+        OBD_ALLOC(sec, sizeof(*sec));
+        if (!sec)
+                RETURN(NULL);
+
+        sec->ps_sectype = PTLRPC_SEC_TYPE_NONE;
+        sec->ps_expire = (-1UL >> 1);   /* never expire */
+        sec->ps_nextgc = (-1UL >> 1);
+        sec->ps_flags = 0;
+
+        CDEBUG(D_SEC, "Create NULL security module at %p\n", sec);
+        RETURN(sec);
+}
+
+static
+void null_destroy_sec(struct ptlrpc_sec *sec)
+{
+ ENTRY;
+
+ CDEBUG(D_SEC, "Destroy NULL security module at %p\n", sec);
+
+ LASSERT(!atomic_read(&sec->ps_refcount));
+ OBD_FREE(sec, sizeof(*sec));
+ EXIT;
+}
+
+static
+struct ptlrpc_cred* null_create_cred(struct ptlrpc_sec *sec,
+ struct ptlrpc_request *req,
+ struct vfs_cred *vcred)
+{
+ struct ptlrpc_cred *cred;
+ ENTRY;
+
+ OBD_ALLOC(cred, sizeof(*cred));
+ if (!cred)
+ RETURN(NULL);
+
+ INIT_LIST_HEAD(&cred->pc_hash);
+ atomic_set(&cred->pc_refcount, 0);
+ cred->pc_sec = sec;
+ cred->pc_ops = &null_credops;
+ cred->pc_req = req;
+ cred->pc_expire = (-1UL >> 1); /* never expire */
+ cred->pc_flags = PTLRPC_CRED_UPTODATE;
+ cred->pc_pag = vcred->vc_pag;
+ cred->pc_uid = vcred->vc_uid;
+ CDEBUG(D_SEC, "create a null cred at %p("LPU64"/%u)\n",
+ cred, vcred->vc_pag, vcred->vc_uid);
+
+ RETURN(cred);
+}
+
+static struct ptlrpc_secops null_secops = {
+ .create_sec = null_create_sec,
+ .destroy_sec = null_destroy_sec,
+ .create_cred = null_create_cred,
+};
+
+static struct ptlrpc_sec_type null_type = {
+ .pst_owner = THIS_MODULE,
+ .pst_name = "NULL_SEC",
+ .pst_inst = ATOMIC_INIT(0),
+ .pst_flavor = {PTLRPC_SEC_NULL, 0},
+ .pst_ops = &null_secops,
+};
+
+int ptlrpcs_null_init(void)
+{
+ int rc;
+
+ rc = ptlrpcs_register(&null_type);
+ if (rc)
+ CERROR("failed to register NULL security: %d\n", rc);
+
+ return rc;
+}
+
+int ptlrpcs_null_exit(void)
+{
+ int rc;
+
+ rc = ptlrpcs_unregister(&null_type);
+ if (rc)
+ CERROR("cannot unregister NULL security: %d\n", rc);
+
+ return rc;
+}
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_SEC
+#ifdef __KERNEL__
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#else
+#include <liblustre.h>
+#endif
+
+#include <libcfs/kp30.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_idl.h>
+#include <linux/lustre_net.h>
+#include <linux/lustre_sec.h>
+
+static spinlock_t svcsecs_lock = SPIN_LOCK_UNLOCKED;
+static struct ptlrpc_svcsec *svcsecs[PTLRPC_SEC_MAX_FLAVORS] = {
+ NULL,
+};
+
+int svcsec_register(struct ptlrpc_svcsec *sec)
+{
+ __u32 flavor = sec->pss_flavor.flavor;
+
+ if (flavor >= PTLRPC_SEC_MAX_FLAVORS)
+ return -EINVAL;
+
+ spin_lock(&svcsecs_lock);
+ if (svcsecs[flavor]) {
+ spin_unlock(&svcsecs_lock);
+ return -EALREADY;
+ }
+ svcsecs[flavor] = sec;
+ spin_unlock(&svcsecs_lock);
+
+ CDEBUG(D_SEC, "Registered svc security module %s\n", sec->pss_name);
+ return 0;
+}
+
+int svcsec_unregister(struct ptlrpc_svcsec *sec)
+{
+ __u32 flavor = sec->pss_flavor.flavor;
+
+ if (flavor >= PTLRPC_SEC_MAX_FLAVORS)
+ return -EINVAL;
+
+ spin_lock(&svcsecs_lock);
+ if (!svcsecs[flavor]) {
+ spin_unlock(&svcsecs_lock);
+ return -EINVAL;
+ }
+
+ LASSERT(svcsecs[flavor] == sec);
+
+ CDEBUG(D_SEC, "Unregistered svc security module %s\n", sec->pss_name);
+ svcsecs[flavor] = NULL;
+ spin_unlock(&svcsecs_lock);
+
+ return 0;
+}
+
+static
+struct ptlrpc_svcsec * flavor2svcsec(__u32 flavor)
+{
+ struct ptlrpc_svcsec *sec;
+
+ if (flavor >= PTLRPC_SEC_MAX_FLAVORS)
+ return NULL;
+
+ spin_lock(&svcsecs_lock);
+ sec = svcsecs[flavor];
+ if (sec && !try_module_get(sec->pss_owner))
+ sec = NULL;
+ spin_unlock(&svcsecs_lock);
+ return sec;
+}
+
+struct ptlrpc_svcsec * svcsec_get(struct ptlrpc_svcsec *sec)
+{
+ int rc;
+
+ spin_lock(&svcsecs_lock);
+ rc = try_module_get(sec->pss_owner);
+ spin_unlock(&svcsecs_lock);
+ LASSERT(rc);
+ return sec;
+}
+
+void svcsec_put(struct ptlrpc_svcsec *sec)
+{
+ spin_lock(&svcsecs_lock);
+ module_put(sec->pss_owner);
+ spin_unlock(&svcsecs_lock);
+}
+
+/*
+ * common code to allocate reply_state buffer.
+ */
+int svcsec_alloc_reply_state(struct ptlrpc_request *req,
+ int msgsize, int secsize)
+{
+ struct ptlrpc_reply_state *rs;
+ char *buf;
+ int repsize, bufsize;
+ ENTRY;
+
+ LASSERT(msgsize % 8 == 0);
+ LASSERT(secsize % 8 == 0);
+
+ repsize = sizeof(struct ptlrpcs_wire_hdr) + msgsize + secsize;
+ bufsize = repsize + sizeof(struct ptlrpc_reply_state);
+
+ OBD_ALLOC(buf, bufsize);
+ if (!buf) {
+ CERROR("can't alloc %d\n", bufsize);
+ RETURN(-ENOMEM);
+ }
+
+ /* req->rq_repbuf is not used on server side */
+ rs = (struct ptlrpc_reply_state *) (buf + repsize);
+ rs->rs_buf = buf;
+ rs->rs_buf_len = bufsize;
+ rs->rs_repbuf = buf;
+ rs->rs_repbuf_len = repsize;
+ /* current known data length is hdr + msg, security payload
+ * will be added on later.
+ */
+ rs->rs_repdata_len = sizeof(struct ptlrpcs_wire_hdr) + msgsize;
+ req->rq_repmsg = rs->rs_msg = (struct lustre_msg *)
+ (rs->rs_repbuf + sizeof(struct ptlrpcs_wire_hdr));
+
+ req->rq_reply_state = rs;
+
+ CDEBUG(D_SEC, "alloc rs buf at %p, len %d; repbuf at %p, len %d\n",
+ rs->rs_buf, rs->rs_buf_len, rs->rs_repbuf, rs->rs_repbuf_len);
+
+ RETURN(0);
+}
+
+void svcsec_free_reply_state(struct ptlrpc_reply_state *rs)
+{
+ char *p;
+ ENTRY;
+
+ /* for work around memory-alloc debug poison */
+ LASSERT(rs);
+ p = rs->rs_buf;
+ OBD_FREE(p, rs->rs_buf_len);
+ EXIT;
+}
+
+int svcsec_alloc_repbuf(struct ptlrpc_svcsec *svcsec,
+ struct ptlrpc_request *req,
+ int msgsize)
+{
+ LASSERT(svcsec);
+ LASSERT(msgsize % 8 == 0);
+
+ if (svcsec->alloc_repbuf)
+ return svcsec->alloc_repbuf(svcsec, req, msgsize);
+ else
+ return svcsec_alloc_reply_state(req, msgsize, 0);
+}
+
+/* Server side: validate the incoming request's wire header, look up the
+ * svcsec module by flavor and let it accept the request.  Returns an
+ * SVC_* verdict; *res carries the ptlrpcs error to report to the client.
+ * On anything but SVC_DROP, req->rq_svcsec holds a module reference that
+ * must be released later via svcsec_put(). */
+int svcsec_accept(struct ptlrpc_request *req, enum ptlrpcs_error *res)
+{
+        struct ptlrpc_svcsec *sec;
+        struct ptlrpcs_wire_hdr *sec_hdr;
+        int rc;
+        ENTRY;
+
+        LASSERT(req->rq_reqbuf);
+        LASSERT(!req->rq_reqmsg);
+        LASSERT(!req->rq_svcsec);
+
+        *res = PTLRPCS_BADCRED;
+        if (req->rq_reqbuf_len < sizeof(*sec_hdr)) {
+                CERROR("drop too short msg (length: %d)\n", req->rq_reqbuf_len);
+                RETURN(SVC_DROP);
+        }
+
+        /* convert header in place; the flavor's accept callback relies on
+         * host byte order */
+        sec_hdr = (struct ptlrpcs_wire_hdr *) req->rq_reqbuf;
+        sec_hdr->flavor = le32_to_cpu(sec_hdr->flavor);
+        sec_hdr->sectype = le32_to_cpu(sec_hdr->sectype);
+        sec_hdr->msg_len = le32_to_cpu(sec_hdr->msg_len);
+        sec_hdr->sec_len = le32_to_cpu(sec_hdr->sec_len);
+
+        /* sanity check */
+        switch (sec_hdr->sectype) {
+        case PTLRPC_SEC_TYPE_NONE:
+        case PTLRPC_SEC_TYPE_AUTH:
+        case PTLRPC_SEC_TYPE_PRIV:
+                break;
+        default:
+                CERROR("unknown security type %d\n", sec_hdr->sectype);
+                RETURN(SVC_DROP);
+        }
+
+        /* lengths are untrusted wire data: sum in 64 bit so huge values
+         * can't wrap around and bypass this bound check on 32 bit */
+        if (sizeof(*sec_hdr) + (__u64) sec_hdr->msg_len + sec_hdr->sec_len >
+            req->rq_reqbuf_len) {
+                CERROR("received %d, msg %d, sec %d\n",
+                       req->rq_reqbuf_len, sec_hdr->msg_len, sec_hdr->sec_len);
+                RETURN(SVC_DROP);
+        }
+
+        /* takes a module reference on the svcsec */
+        req->rq_svcsec = sec = flavor2svcsec(sec_hdr->flavor);
+        if (!sec) {
+                CERROR("drop msg: unsupported flavor %d\n", sec_hdr->flavor);
+                RETURN(SVC_DROP);
+        }
+        LASSERT(sec->accept);
+
+        rc = sec->accept(req, res);
+
+        switch (rc) {
+        case SVC_DROP:
+                /* dropped: release the reference taken above */
+                svcsec_put(sec);
+                req->rq_svcsec = NULL;
+                break;
+        case SVC_OK:
+        case SVC_LOGIN:
+        case SVC_LOGOUT:
+                LASSERT(req->rq_reqmsg);
+                break;
+        }
+
+        RETURN(rc);
+}
+
+int svcsec_authorize(struct ptlrpc_request *req)
+{
+ LASSERT(req->rq_svcsec);
+ LASSERT(req->rq_svcsec->authorize);
+
+ return (req->rq_svcsec->authorize(req));
+}
+
+void svcsec_cleanup_req(struct ptlrpc_request *req)
+{
+ struct ptlrpc_svcsec *svcsec = req->rq_svcsec;
+ ENTRY;
+
+ LASSERT(svcsec);
+ LASSERT(svcsec->cleanup_req || !req->rq_sec_svcdata);
+
+ if (svcsec->cleanup_req)
+ svcsec->cleanup_req(svcsec, req);
+ EXIT;
+}
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_SEC
+#ifdef __KERNEL__
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#else
+#include <liblustre.h>
+#endif
+
+#include <libcfs/kp30.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_idl.h>
+#include <linux/lustre_net.h>
+#include <linux/lustre_sec.h>
+
+static
+int null_svcsec_accept(struct ptlrpc_request *req, enum ptlrpcs_error *res)
+{
+ struct ptlrpcs_wire_hdr *hdr = buf_to_sec_hdr(req->rq_reqbuf);
+ ENTRY;
+
+ LASSERT(hdr->flavor == PTLRPC_SEC_NULL);
+
+ if (hdr->sec_len != 0) {
+ CERROR("security payload %d not zero\n", hdr->sec_len);
+ *res = PTLRPCS_REJECTEDCRED;
+ RETURN(SVC_DROP);
+ }
+
+ req->rq_reqmsg = (struct lustre_msg *)(hdr + 1);
+ req->rq_reqlen = hdr->msg_len;
+ *res = PTLRPCS_OK;
+ CDEBUG(D_SEC, "req %p: set reqmsg at %p, len %d\n",
+ req, req->rq_reqmsg, req->rq_reqlen);
+ RETURN(SVC_OK);
+}
+
+static
+int null_svcsec_authorize(struct ptlrpc_request *req)
+{
+ struct ptlrpc_reply_state *rs = req->rq_reply_state;
+ struct ptlrpcs_wire_hdr *hdr;
+ ENTRY;
+
+ LASSERT(rs);
+ LASSERT(rs->rs_repbuf_len >= 4 * 4);
+
+ hdr = buf_to_sec_hdr(rs->rs_repbuf);
+ hdr->flavor = cpu_to_le32(PTLRPC_SEC_NULL);
+ hdr->sectype = cpu_to_le32(PTLRPC_SEC_TYPE_AUTH);
+ hdr->msg_len = cpu_to_le32(req->rq_replen);
+ hdr->sec_len = cpu_to_le32(0);
+
+ CDEBUG(D_SEC, "fill in datasize %d\n", rs->rs_repdata_len);
+ RETURN(0);
+}
+
+static struct ptlrpc_svcsec null_svcsec = {
+ .pss_owner = THIS_MODULE,
+ .pss_name = "NULL_SVCSEC",
+ .pss_flavor = {PTLRPC_SEC_NULL, 0},
+ .accept = null_svcsec_accept,
+ .authorize = null_svcsec_authorize,
+};
+
+/* Register the NULL server-side security flavor.
+ * (void) makes this a proper prototype; () declared no parameter info. */
+int svcsec_null_init(void)
+{
+        int rc;
+
+        rc = svcsec_register(&null_svcsec);
+        if (rc)
+                CERROR("failed to register SVCNULL security: %d\n", rc);
+
+        return rc;
+}
+
+/* Unregister the NULL server-side security flavor.
+ * (void) makes this a proper prototype; () declared no parameter info. */
+int svcsec_null_exit(void)
+{
+        int rc;
+
+        rc = svcsec_unregister(&null_svcsec);
+        if (rc)
+                CERROR("cannot unregister SVCNULL security: %d\n", rc);
+
+        return rc;
+}
+
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_LOV
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+
+#include <linux/obd_support.h>
+#include <linux/lustre_lib.h>
+#include <linux/lustre_idl.h>
+#include <linux/obd_class.h>
+#include <linux/lustre_ucache.h>
+
+/* FIXME
+ * current ucache implementation is simply took from group hash code, almost
+ * without any change. it's very simple and have very limited functionality,
+ * and probably it's also only suitable for usage of group hash.
+ */
+
+void upcall_cache_init_entry(struct upcall_cache *cache,
+ struct upcall_cache_entry *entry,
+ __u64 key)
+{
+ UC_CACHE_SET_NEW(entry);
+ INIT_LIST_HEAD(&entry->ue_hash);
+ atomic_set(&entry->ue_refcount, 0);
+ entry->ue_key = key;
+ entry->ue_cache = cache;
+ init_waitqueue_head(&entry->ue_waitq);
+}
+EXPORT_SYMBOL(upcall_cache_init_entry);
+
+static inline struct upcall_cache_entry *
+alloc_entry(struct upcall_cache *cache, __u64 key)
+{
+ LASSERT(cache->alloc_entry);
+ return cache->alloc_entry(cache, key);
+}
+
+static void free_entry(struct upcall_cache_entry *entry)
+{
+ struct upcall_cache *cache = entry->ue_cache;
+
+ LASSERT(cache);
+ LASSERT(cache->free_entry);
+ LASSERT(atomic_read(&entry->ue_refcount) == 0);
+
+ CDEBUG(D_OTHER, "destroy %s entry %p for key "LPU64"\n",
+ cache->uc_name, entry, entry->ue_key);
+
+ list_del(&entry->ue_hash);
+ cache->free_entry(cache, entry);
+}
+
+static inline void get_entry(struct upcall_cache_entry *entry)
+{
+ atomic_inc(&entry->ue_refcount);
+}
+
+static inline void put_entry(struct upcall_cache_entry *entry)
+{
+ if (atomic_dec_and_test(&entry->ue_refcount) &&
+ !UC_CACHE_IS_VALID(entry)) {
+ free_entry(entry);
+ }
+}
+
+static inline int refresh_entry(struct upcall_cache_entry *entry)
+{
+ struct upcall_cache *cache = entry->ue_cache;
+
+ LASSERT(cache);
+ LASSERT(cache->make_upcall);
+
+ return cache->make_upcall(cache, entry);
+}
+
+/* Unhash @entry if it is no longer usable (expired, stuck acquiring, or
+ * otherwise stale).  Returns 0 when the entry is valid and current, 1 when
+ * it was unlinked (and freed, if no references remain).
+ * Caller must hold the cache hash lock. */
+static int check_unlink_entry(struct upcall_cache_entry *entry)
+{
+        if (UC_CACHE_IS_VALID(entry) &&
+            time_before(get_seconds(), entry->ue_expire))
+                return 0;
+
+        if (UC_CACHE_IS_ACQUIRING(entry) &&
+            time_after(get_seconds(), entry->ue_acquire_expire)) {
+                /* upcall timed out; unblock anyone waiting on it */
+                UC_CACHE_SET_EXPIRED(entry);
+                wake_up_all(&entry->ue_waitq);
+        } else if (!UC_CACHE_IS_INVALID(entry)) {
+                UC_CACHE_SET_EXPIRED(entry);
+        }
+
+        list_del_init(&entry->ue_hash);
+        if (!atomic_read(&entry->ue_refcount))
+                free_entry(entry);
+        return 1;
+}
+
+/* XXX
+ * currently always use write_lock
+ */
+static struct upcall_cache_entry *
+__get_entry(struct upcall_cache *cache, unsigned int hash, __u64 key,
+ int create, int async)
+{
+ struct list_head *head;
+ struct upcall_cache_entry *entry, *next, *new = NULL;
+ int found = 0, rc;
+ ENTRY;
+
+ LASSERT(hash < cache->uc_hashsize);
+
+ head = &cache->uc_hashtable[hash];
+
+find_again:
+ write_lock(&cache->uc_hashlock);
+ list_for_each_entry_safe(entry, next, head, ue_hash) {
+ if (check_unlink_entry(entry))
+ continue;
+ if (entry->ue_key == key) {
+ found = 1;
+ break;
+ }
+ }
+
+ if (!found) {
+ if (!create)
+ RETURN(NULL);
+ if (!new) {
+ write_unlock(&cache->uc_hashlock);
+ new = alloc_entry(cache, key);
+ if (!new) {
+ CERROR("fail to alloc entry\n");
+ RETURN(NULL);
+ }
+ goto find_again;
+ } else {
+ list_add(&new->ue_hash, head);
+ entry = new;
+ }
+ } else {
+ if (new) {
+ free_entry(new);
+ new = NULL;
+ }
+ list_move(&entry->ue_hash, head);
+ }
+ get_entry(entry);
+
+ /* as for this moment, we have found matched entry
+ * and hold a ref of it. if it's NEW (we created it),
+ * we must give it a push to refresh
+ */
+ if (UC_CACHE_IS_NEW(entry)) {
+ LASSERT(entry == new);
+ UC_CACHE_SET_ACQUIRING(entry);
+ UC_CACHE_CLEAR_NEW(entry);
+ entry->ue_acquire_expire = get_seconds() +
+ cache->uc_acquire_expire;
+
+ write_unlock(&cache->uc_hashlock);
+ rc = refresh_entry(entry);
+ write_lock(&cache->uc_hashlock);
+ if (rc) {
+ UC_CACHE_CLEAR_ACQUIRING(entry);
+ UC_CACHE_SET_INVALID(entry);
+ }
+ }
+
+ /* caller don't want to wait */
+ if (async) {
+ write_unlock(&cache->uc_hashlock);
+ RETURN(entry);
+ }
+
+ /* someone (and only one) is doing upcall upon
+ * this item, just wait it complete
+ */
+ if (UC_CACHE_IS_ACQUIRING(entry)) {
+ wait_queue_t wait;
+
+ init_waitqueue_entry(&wait, current);
+ add_wait_queue(&entry->ue_waitq, &wait);
+ set_current_state(TASK_INTERRUPTIBLE);
+ write_unlock(&cache->uc_hashlock);
+
+ schedule_timeout(cache->uc_acquire_expire);
+
+ write_lock(&cache->uc_hashlock);
+ remove_wait_queue(&entry->ue_waitq, &wait);
+ if (UC_CACHE_IS_ACQUIRING(entry)) {
+ /* we're interrupted or upcall failed
+ * in the middle
+ */
+ CERROR("entry %p not refreshed: cur %lu, key "LPU64", "
+ "ref %d fl %u, ac %ld, ex %ld\n",
+ entry, get_seconds(), entry->ue_key,
+ atomic_read(&entry->ue_refcount),
+ entry->ue_flags, entry->ue_acquire_expire,
+ entry->ue_expire);
+ put_entry(entry);
+ write_unlock(&cache->uc_hashlock);
+ RETURN(NULL);
+ }
+ /* fall through */
+ }
+
+ /* invalid means error, don't need to try again */
+ if (UC_CACHE_IS_INVALID(entry)) {
+ put_entry(entry);
+ write_unlock(&cache->uc_hashlock);
+ RETURN(NULL);
+ }
+
+ /* check expired
+ * We can't refresh the existing one because some
+ * memory might be shared by multiple processes.
+ */
+ if (check_unlink_entry(entry)) {
+ /* if expired, try again. but if this entry is
+ * created by me and too quickly turn to expired
+ * without any error, should at least give a
+ * chance to use it once.
+ */
+ if (entry != new) {
+ put_entry(entry);
+ write_unlock(&cache->uc_hashlock);
+ new = NULL;
+ goto find_again;
+ }
+ }
+
+ /* Now we know it's good */
+ LASSERT(UC_CACHE_IS_VALID(entry));
+ write_unlock(&cache->uc_hashlock);
+
+ RETURN(entry);
+}
+
+struct upcall_cache_entry *
+upcall_cache_get_entry(struct upcall_cache *cache, __u64 key)
+{
+ unsigned int hash;
+
+ LASSERT(cache->hash);
+
+ hash = cache->hash(cache, key);
+
+ return __get_entry(cache, hash, key, 1, 0);
+}
+EXPORT_SYMBOL(upcall_cache_get_entry);
+
+void upcall_cache_put_entry(struct upcall_cache_entry *entry)
+{
+ struct upcall_cache *cache = entry->ue_cache;
+
+ write_lock(&cache->uc_hashlock);
+ LASSERTF(atomic_read(&entry->ue_refcount) > 0,
+ "entry %p: ref %d\n", entry, atomic_read(&entry->ue_refcount));
+ put_entry(entry);
+ write_unlock(&cache->uc_hashlock);
+}
+EXPORT_SYMBOL(upcall_cache_put_entry);
+
+int upcall_cache_downcall(struct upcall_cache *cache, __u64 key,
+ int err, void *args)
+{
+ struct list_head *head;
+ struct upcall_cache_entry *entry;
+ int found = 0, rc;
+ unsigned int hash;
+ ENTRY;
+
+ hash = cache->hash(cache, key);
+ LASSERT(hash < cache->uc_hashsize);
+
+ head = &cache->uc_hashtable[hash];
+
+ write_lock(&cache->uc_hashlock);
+ list_for_each_entry(entry, head, ue_hash) {
+ if (entry->ue_key == key) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found) {
+ /* not found, which is possible */
+ write_unlock(&cache->uc_hashlock);
+ CWARN("key "LPU64" entry doesn't exist\n", key);
+ RETURN(-EINVAL);
+ }
+
+ if (err < 0) {
+ UC_CACHE_SET_INVALID(entry);
+ GOTO(out, rc = err);
+ }
+
+ if (!UC_CACHE_IS_ACQUIRING(entry) ||
+ UC_CACHE_IS_INVALID(entry) ||
+ UC_CACHE_IS_EXPIRED(entry)) {
+ CWARN("stale entry %p: cur %lu, key "LPU64", ref %d, "
+ "fl %u, ac %ld, ex %ld\n",
+ entry, get_seconds(), entry->ue_key,
+ atomic_read(&entry->ue_refcount), entry->ue_flags,
+ entry->ue_acquire_expire, entry->ue_expire);
+ GOTO(out, rc = -EINVAL);
+ }
+
+ atomic_inc(&entry->ue_refcount);
+ write_unlock(&cache->uc_hashlock);
+ rc = cache->parse_downcall(cache, entry, args);
+ write_lock(&cache->uc_hashlock);
+ atomic_dec(&entry->ue_refcount);
+ if (rc) {
+ UC_CACHE_SET_INVALID(entry);
+ list_del_init(&entry->ue_hash);
+ GOTO(out, rc);
+ }
+ entry->ue_expire = get_seconds() + cache->uc_entry_expire;
+ UC_CACHE_SET_VALID(entry);
+ CDEBUG(D_OTHER, "create ucache entry %p(key "LPU64")\n",
+ entry, entry->ue_key);
+out:
+ wake_up_all(&entry->ue_waitq);
+ write_unlock(&cache->uc_hashlock);
+ RETURN(rc);
+}
+EXPORT_SYMBOL(upcall_cache_downcall);
+
+void upcall_cache_flush_one(struct upcall_cache *cache, __u64 key)
+{
+ struct list_head *head;
+ struct upcall_cache_entry *entry;
+ unsigned int hash;
+ int found = 0;
+ ENTRY;
+
+ hash = cache->hash(cache, key);
+ LASSERT(hash < cache->uc_hashsize);
+
+ head = &cache->uc_hashtable[hash];
+
+ write_lock(&cache->uc_hashlock);
+ list_for_each_entry(entry, head, ue_hash) {
+ if (entry->ue_key == key) {
+ found = 1;
+ break;
+ }
+ }
+
+ if (found) {
+ UC_CACHE_SET_EXPIRED(entry);
+ if (!atomic_read(&entry->ue_refcount))
+ free_entry(entry);
+ }
+ write_unlock(&cache->uc_hashlock);
+}
+EXPORT_SYMBOL(upcall_cache_flush_one);
+
+static void cache_flush(struct upcall_cache *cache, int force, int sync)
+{
+ struct upcall_cache_entry *entry, *next;
+ int i;
+ ENTRY;
+
+ write_lock(&cache->uc_hashlock);
+ for (i = 0; i < cache->uc_hashsize; i++) {
+ list_for_each_entry_safe(entry, next,
+ &cache->uc_hashtable[i], ue_hash) {
+ if (!force && atomic_read(&entry->ue_refcount)) {
+ UC_CACHE_SET_EXPIRED(entry);
+ continue;
+ }
+ LASSERT(!atomic_read(&entry->ue_refcount));
+ free_entry(entry);
+ }
+ }
+ write_unlock(&cache->uc_hashlock);
+ EXIT;
+}
+
+void upcall_cache_flush_idle(struct upcall_cache *cache)
+{
+ cache_flush(cache, 0, 0);
+}
+
+void upcall_cache_flush_all(struct upcall_cache *cache)
+{
+ cache_flush(cache, 1, 0);
+}
+EXPORT_SYMBOL(upcall_cache_flush_idle);
+EXPORT_SYMBOL(upcall_cache_flush_all);
d_add(dentry, inode);
SMFS_POST_HOOK(dir, HOOK_LOOKUP, &msg, rc);
-exit:
post_smfs_dentry(cache_dentry);
post_smfs_dentry(cache_parent);
RETURN(ERR_PTR(rc));
SMFS_POST_HOOK(dentry->d_inode, HOOK_READDIR, &msg, rc);
duplicate_file(filp, sfi->c_file);
-exit:
if (rc > 0)
rc = 0;
--- /dev/null
+!
+! Test that can only be run as root as it uses mknod.
+!
+$mkdir asroot
+$ umask 027
+$ mknod asroot/null c 1 3
+$ acl_mode asroot/null
+crw-r-----
+$ setfacl -m u:joe:rw,u:lisa:- asroot/null
+$ acl_mode asroot/null
+crw-rw----+
+$ setfacl -m u:lisa:r asroot/null
+$ getfacl --omit-header asroot/null
+user::rw-
+user:joe:rw-
+user:lisa:r--
+group::r--
+mask::rw-
+other::---
+
+$ su - lisa -c chmod\ +rw\ /mnt/lustre/asroot/null
+chmod: changing permissions of `/mnt/lustre/asroot/null': Operation not permitted
+$ rm -f asroot/null
+$ mkfifo asroot/fifo
+$ acl_mode asroot/fifo
+prw-r-----
+$ setfacl -m u:joe:- asroot/fifo
+$ getfacl --omit-header asroot/fifo
+user::rw-
+user:joe:---
+group::r--
+mask::r--
+other::---
+
+$ rm asroot/fifo
+$ mknod asroot/block b 1 1
+$ setfacl -m u:joe:- asroot/block
+$ getfacl --omit-header asroot/block
+user::rw-
+user:joe:---
+group::r--
+mask::r--
+other::---
+
+$ rm asroot/block
+$ rmdir asroot
--- /dev/null
+!
+! Test for the patched file utilities.
+!
+$ umask 022
+$ mkdir dir
+$ acl_mode dir
+drwxr-xr-x
+$ touch dir/f
+$ getfacl --omit-header dir/f
+user::rw-
+group::r--
+other::r--
+
+$ umask 027
+$ cp -p dir/f dir/g
+$ getfacl --omit-header dir/g
+user::rw-
+group::r--
+other::r--
+
+$ rm dir/g
+$ cp dir/f dir/g
+$ getfacl --omit-header dir/g
+user::rw-
+group::r--
+other::---
+
+$ setfacl -m u::rwx,u:joe:rwx,g::rwx,o::r-x dir/.
+$ setfacl -dm u::rwx,u:joe:rwx,g::rwx,o::r-x dir/.
+$ acl_mode dir
+drwxrwxr-x+
+$ touch dir/h
+$ getfacl --omit-header --no-effective dir/h
+user::rw-
+user:joe:rwx
+group::rwx
+mask::r--
+other::---
+
+$ mkdir dir/d
+$ getfacl --omit-header --no-effective dir/d
+user::rwx
+user:joe:rwx
+group::rwx
+mask::r-x
+other::---
+default:user::rwx
+default:user:joe:rwx
+default:group::rwx
+default:mask::rwx
+default:other::r-x
+
+$ cp dir/f dir/i
+$ getfacl --omit-header --no-effective dir/i
+user::rw-
+user:joe:rwx
+group::rwx
+mask::r--
+other::---
+
+$ acl_mode dir/f
+-rw-r--r--
+$ cp -p dir/f dir/j
+$ acl_mode dir/j
+-rw-r--r--
+$ rm -r dir
--- /dev/null
+!
+! Pretty comprehensive ACL tests.
+!
+! This must be run on a filesystem with ACL support. Also, you will need
+! two dummy users (lisa and joe) and a dummy group (toolies).
+!
+$ umask 027
+$ touch f
+! Only change a base ACL:
+$ setfacl -m u::r f
+$ setfacl -m u::rw,u:lisa:rw f
+$ acl_mode f
+-rw-rw----+
+$ getfacl --omit-header f
+user::rw-
+user:lisa:rw-
+group::r--
+mask::rw-
+other::---
+
+$ rm f
+$ umask 022
+$ touch f
+$ setfacl -m u:lisa:rw f
+$ acl_mode f
+-rw-rw-r--+
+$ getfacl --omit-header f
+user::rw-
+user:lisa:rw-
+group::r--
+mask::rw-
+other::r--
+
+$ rm f
+$ umask 027
+$ mkdir d
+$ setfacl -m u:lisa:rwx d
+$ acl_mode d
+drwxrwx---+
+$ getfacl --omit-header d
+user::rwx
+user:lisa:rwx
+group::r-x
+mask::rwx
+other::---
+
+$ rmdir d
+$ umask 022
+$ mkdir d
+$ setfacl -m u:lisa:rwx d
+$ acl_mode d
+drwxrwxr-x+
+$ getfacl --omit-header d
+user::rwx
+user:lisa:rwx
+group::r-x
+mask::rwx
+other::r-x
+
+$ rmdir d
+!
+! Multiple users
+!
+$ umask 022
+$ touch f
+$ setfacl -m u:lisa:rw,u:joe:r f
+$ acl_mode f
+-rw-rw-r--+
+$ getfacl --omit-header f
+user::rw-
+user:joe:r--
+user:lisa:rw-
+group::r--
+mask::rw-
+other::r--
+
+!
+! Multiple groups
+!
+$ setfacl -m g:users:rw,g:toolies:r f
+$ acl_mode f
+-rw-rw-r--+
+$ getfacl --omit-header f
+user::rw-
+user:joe:r--
+user:lisa:rw-
+group::r--
+group:users:rw-
+group:toolies:r--
+mask::rw-
+other::r--
+
+!
+! Remove one group
+!
+$ setfacl -x g:users f
+$ acl_mode f
+-rw-rw-r--+
+$ getfacl --omit-header f
+user::rw-
+user:joe:r--
+user:lisa:rw-
+group::r--
+group:toolies:r--
+mask::rw-
+other::r--
+
+!
+! Remove one user
+!
+$ setfacl -x u:joe f
+$ acl_mode f
+-rw-rw-r--+
+$ getfacl --omit-header f
+user::rw-
+user:lisa:rw-
+group::r--
+group:toolies:r--
+mask::rw-
+other::r--
+
+$ rm f
+!
+! Default ACL
+!
+$ umask 027
+$ mkdir d
+$ setfacl -m u:lisa:rwx,u:joe:rw,d:u:lisa:rwx,d:m:rx d
+$ acl_mode d
+drwxrwx---+
+$ getfacl --omit-header d
+user::rwx
+user:joe:rw-
+user:lisa:rwx
+group::r-x
+mask::rwx
+other::---
+default:user::rwx
+default:user:lisa:rwx #effective:r-x
+default:group::r-x
+default:mask::r-x
+default:other::---
+
+!
+! Umask now ignored?
+!
+$ umask 027
+$ touch d/f
+$ acl_mode d/f
+-rw-r-----+
+$ getfacl --omit-header d/f
+user::rw-
+user:lisa:rwx #effective:r--
+group::r-x #effective:r--
+mask::r--
+other::---
+
+$ rm d/f
+$ umask 022
+$ touch d/f
+$ acl_mode d/f
+-rw-r-----+
+$ getfacl --omit-header d/f
+user::rw-
+user:lisa:rwx #effective:r--
+group::r-x #effective:r--
+mask::r--
+other::---
+
+$ rm d/f
+!
+! Default ACL copying
+!
+$ umask 000
+$ mkdir d/d
+$ acl_mode d/d
+drwxr-x---+
+$ getfacl --omit-header d/d
+user::rwx
+user:lisa:rwx #effective:r-x
+group::r-x
+mask::r-x
+other::---
+default:user::rwx
+default:user:lisa:rwx #effective:r-x
+default:group::r-x
+default:mask::r-x
+default:other::---
+
+$ rmdir d/d
+$ umask 022
+$ mkdir d/d
+$ acl_mode d/d
+drwxr-x---+
+$ getfacl --omit-header d/d
+user::rwx
+user:lisa:rwx #effective:r-x
+group::r-x
+mask::r-x
+other::---
+default:user::rwx
+default:user:lisa:rwx #effective:r-x
+default:group::r-x
+default:mask::r-x
+default:other::---
+
+!
+! Add some users and groups
+!
+$ setfacl -nm u:joe:rx,d:u:joe:rx,g:users:rx,g:toolies:rwx d/d
+$ acl_mode d/d
+drwxr-x---+
+$ getfacl --omit-header d/d
+user::rwx
+user:joe:r-x
+user:lisa:rwx #effective:r-x
+group::r-x
+group:users:r-x
+group:toolies:rwx #effective:r-x
+mask::r-x
+other::---
+default:user::rwx
+default:user:joe:r-x
+default:user:lisa:rwx #effective:r-x
+default:group::r-x
+default:mask::r-x
+default:other::---
+
+!
+! symlink in directory with default ACL?
+!
+$ ln -s d d/l
+$ acl_mode d/l
+lrwxrwxrwx
+$ acl_mode -L d/l
+drwxr-x---+
+$ getfacl --omit-header d/l
+user::rwx
+user:joe:r-x
+user:lisa:rwx #effective:r-x
+group::r-x
+group:users:r-x
+group:toolies:rwx #effective:r-x
+mask::r-x
+other::---
+default:user::rwx
+default:user:joe:r-x
+default:user:lisa:rwx #effective:r-x
+default:group::r-x
+default:mask::r-x
+default:other::---
+
+$ rm d/l
+!
+! Does mask manipulation work?
+!
+$ setfacl -m g:toolies:rx,u:lisa:rx d/d
+$ acl_mode d/d
+drwxr-x---+
+$ getfacl --omit-header d/d
+user::rwx
+user:joe:r-x
+user:lisa:r-x
+group::r-x
+group:users:r-x
+group:toolies:r-x
+mask::r-x
+other::---
+default:user::rwx
+default:user:joe:r-x
+default:user:lisa:rwx #effective:r-x
+default:group::r-x
+default:mask::r-x
+default:other::---
+
+$ setfacl -m d:u:lisa:rwx d/d
+$ acl_mode d/d
+drwxr-x---+
+$ getfacl --omit-header d/d
+user::rwx
+user:joe:r-x
+user:lisa:r-x
+group::r-x
+group:users:r-x
+group:toolies:r-x
+mask::r-x
+other::---
+default:user::rwx
+default:user:joe:r-x
+default:user:lisa:rwx
+default:group::r-x
+default:mask::rwx
+default:other::---
+
+$ rmdir d/d
+!
+! Remove the default ACL
+!
+$ setfacl -k d
+$ acl_mode d
+drwxrwx---+
+$ getfacl --omit-header d
+user::rwx
+user:joe:rw-
+user:lisa:rwx
+group::r-x
+mask::rwx
+other::---
+
+!
+! Reset to base entries
+!
+$ setfacl -b d
+$ acl_mode d
+drwxr-x---
+$ getfacl --omit-header d
+user::rwx
+group::r-x
+other::---
+
+!
+! Now, chmod should change the group_obj entry
+!
+$ chmod 775 d
+$ acl_mode d
+drwxrwxr-x
+$ getfacl --omit-header d
+user::rwx
+group::rwx
+other::r-x
+
+$ rmdir d
+$ umask 002
+$ mkdir d
+$ setfacl -m u:joe:rwx,u:lisa:rx,d:u:joe:rwx,d:u:lisa:rx d
+$ acl_mode d
+drwxrwxr-x+
+$ getfacl --omit-header d
+user::rwx
+user:joe:rwx
+user:lisa:r-x
+group::rwx
+mask::rwx
+other::r-x
+default:user::rwx
+default:user:joe:rwx
+default:user:lisa:r-x
+default:group::rwx
+default:mask::rwx
+default:other::r-x
+
+$ chmod 750 d
+$ acl_mode d
+drwxr-x---+
+$ getfacl --omit-header d
+user::rwx
+user:joe:rwx #effective:r-x
+user:lisa:r-x
+group::rwx #effective:r-x
+mask::r-x
+other::---
+default:user::rwx
+default:user:joe:rwx
+default:user:lisa:r-x
+default:group::rwx
+default:mask::rwx
+default:other::r-x
+
+$ chmod 750 d
+$ acl_mode d
+drwxr-x---+
+$ getfacl --omit-header d
+user::rwx
+user:joe:rwx #effective:r-x
+user:lisa:r-x
+group::rwx #effective:r-x
+mask::r-x
+other::---
+default:user::rwx
+default:user:joe:rwx
+default:user:lisa:r-x
+default:group::rwx
+default:mask::rwx
+default:other::r-x
+
+$ rmdir d
--- /dev/null
+#!/bin/sh
+ls -dl $* | awk -- '!/^total/ { print $1; }'
--- /dev/null
+!
+! Test whether ACL permissions work
+!
+$ umask 022
+$ mkdir dir
+$ umask 077
+$ touch dir/file
+$ setfacl -m u:joe:rw,u:lisa:- dir/file
+$ su - lisa -c cat\ /mnt/lustre/dir/file
+cat: /mnt/lustre/dir/file: Permission denied
+$ su - joe -c cat\ /mnt/lustre/dir/file
+$ su - joe -c touch\ /mnt/lustre/dir/file
+$ cat dir/file
+$ setfacl -m g:users:- dir/file
+$ su - nobody -c cat\ /mnt/lustre/dir/file
+cat: /mnt/lustre/dir/file: Permission denied
+$ rm dir/file
+$ rmdir dir
start_mds() {
echo "start mds1 service on `facet_active_host mds1`"
start mds1 --reformat $MDSLCONFARGS || return 94
+ start_lsvcgssd || return 501
}
stop_mds() {
echo "stop mds1 service on `facet_active_host mds1`"
stop mds1 $@ || return 97
+ stop_lsvcgssd
}
start_ost() {
mount_client() {
local MOUNTPATH=$1
+ start_lgssd || return 502
echo "mount lustre on ${MOUNTPATH}....."
zconf_mount `hostname` $MOUNTPATH || return 96
}
local MOUNTPATH=$1
echo "umount lustre on ${MOUNTPATH}....."
zconf_umount `hostname` $MOUNTPATH || return 97
+ stop_lgssd
}
manual_umount_client(){
echo "manual umount lustre on ${MOUNTPATH}...."
do_facet client "umount $MOUNT"
+ stop_lgssd
}
setup() {
#create single point mountpoint
gen_config
+start_krb5_kdc || exit 1
test_0() {
kill -TERM $UMOUNT_PID
echo "waiting for umount to finish"
wait $UMOUNT_PID
+ stop_lgssd
# cleanup client modules
$LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null
[ -d $MOUNT ] || mkdir -p $MOUNT
$LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null
+ start_lgssd || return 1
llmount $mds_HOST://mds1_svc/client_facet $MOUNT && exit 1
# cleanup client modules
$LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null
+ stop_lgssd
# stop_mds is a no-op here, and should not fail
stop_mds || return 2
[ -d $MOUNT ] || mkdir -p $MOUNT
$LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null
- llmount $mds_HOST://wrong_mds1_svc/client_facet $MOUNT && return 1
+ start_lgssd || return 1
+ llmount $mds_HOST://wrong_mds1_svc/client_facet $MOUNT && return 2
# cleanup client modules
$LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null
+ stop_lgssd
- stop_mds || return 2
- stop_ost || return 3
+ stop_mds || return 3
+ stop_ost || return 4
- lsmod | grep -q portals && return 4
+ lsmod | grep -q portals && return 5
return 0
}
[ -d $MOUNT ] || mkdir -p $MOUNT
$LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null
+ start_lgssd || return 1
llmount $mds_HOST://mds1_svc/client_facet $MOUNT || return 1
umount $MOUNT || return 2
# cleanup client modules
$LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null
+ stop_lgssd
stop_mds || return 3
--- /dev/null
+#!/bin/sh
+
+MOUNT=`which mount 2>/dev/null`
+test "x$MOUNT" = "x" && MOUNT="/bin/mount"
+
+OPTIONS=$1
+MNTPATH=$2
+
+test "x$OPTIONS" = "x" || "x$MNTPATH" = "x" &&
+ exit 1
+
+$MOUNT $OPTIONS $MNTPATH > /tmp/gns-log 2>&1
+exit $?
setup() {
gen_config
+ start_krb5_kdc || exit 1
rm -rf logs/*
for i in `seq $NUMOST`; do
wait_for ost$i
start ost$i ${REFORMAT} $OSTLCONFARGS
done
+ start_lsvcgssd || exit 2
+ start_lgssd || exit 3
[ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
for mds in `mds_list`; do
wait_for $mds
for mds in `mds_list`; do
stop $mds ${FORCE} $MDSLCONFARGS || :
done
+ stop_lgssd
+ stop_lsvcgssd
for i in `seq $NUMOST`; do
stop ost$i ${REFORMAT} ${FORCE} $OSTLCONFARGS || :
done
--- /dev/null
+#!/bin/sh
+
+#
+# KDC could be on remote hosts, but we suppose lgssd/lsvcgssd only
+# runs locally.
+#
+
+export KDCHOST=${KDCHOST:-"localhost"}
+export KDCDIR=${KDCDIR:-"/usr/kerberos/sbin"}
+export KRB5DIR=${KRB5DIR:-"/usr/kerberos"}
+export LGSSD=${LGSSD:-"/sbin/lgssd"}
+export SVCGSSD=${SVCGSSD:-"/sbin/lsvcgssd"}
+export PDSH=${PDSH:-"ssh"}
+
+using_krb5_sec() {
+ if [ "x$1" != "xkrb5i" -a "x$1" != "xkrb5p" ]; then
+ echo "n"
+ else
+ echo "y"
+ fi
+}
+
+start_krb5_kdc() {
+ if [ `using_krb5_sec $SECURITY` == 'n' ] ; then
+ return 0
+ fi
+
+ num=`$PDSH $KDCHOST "PATH=\$PATH:$KDCDIR; ps ax | grep krb5kdc | grep -v "grep" | wc -l"`
+ if [ $num -eq 1 ]; then
+ return 0
+ fi
+
+ $PDSH $KDCHOST "PATH=\$PATH:$KDCDIR; krb5kdc"
+ num=`$PDSH $KDCHOST "PATH=\$PATH:$KDCDIR; ps ax | grep krb5kdc | grep -v "grep" | wc -l"`
+ if [ $num -ne 1 ]; then
+ echo "fail to start krb5 KDC, check env KDCHOST and KDCDIR"
+ return 1
+ fi
+ return 0
+}
+
+prepare_krb5_cache() {
+ if [ `using_krb5_sec $SECURITY` == 'n' ] ; then
+ return 0
+ fi
+
+ $KRB5DIR/bin/klist -5 -s
+ invalid=$?
+ if [ $invalid -eq 0 ]; then
+ return 0
+ fi
+
+ echo "***** refresh Kerberos V5 TGT for uid $UID *****"
+ $KRB5DIR/bin/kinit
+ ret=$?
+ return $ret
+}
+
+start_lsvcgssd() {
+ if [ `using_krb5_sec $SECURITY` == 'n' ] ; then
+ return 0
+ fi
+
+ killall -q -9 lsvcgssd || true
+
+ $SVCGSSD
+ num=`ps -o cmd -C "lsvcgssd" | grep lsvcgssd | wc -l`
+ if [ $num -ne 1 ]; then
+ echo "failed to start lsvcgssd"
+ return 1
+ fi
+ return 0
+}
+
+stop_lsvcgssd() {
+ killall -q -9 lsvcgssd || true
+ return 0
+}
+
+start_lgssd() {
+ if [ `using_krb5_sec $SECURITY` == 'n' ] ; then
+ return 0
+ fi
+
+ prepare_krb5_cache || exit 1
+
+ killall -q -9 lgssd || true
+
+ $LGSSD
+ num=`ps -o cmd -C "lgssd" | grep lgssd | wc -l`
+ if [ $num -ne 1 ]; then
+ echo "failed to start lgssd $num"
+ return 1
+ fi
+ return 0
+}
+
+stop_lgssd() {
+ killall -q -9 lgssd || true
+ return 0
+}
--- /dev/null
+#!/bin/sh
+
+KRB5DIR=${KRB5DIR:-"/usr/kerberos"}
+
+$KRB5DIR/bin/klist -5 -s
+invalid=$?
+
+if [ $invalid -eq 0 ]; then
+ exit 0
+fi
+
+echo "***** refresh Kerberos V5 TGT for uid $UID *****"
+$KRB5DIR/bin/kinit
+ret=$?
+exit $ret
LCONF=${LCONF:-lconf}
NAME=${NAME:-local}
LLMOUNT=${LLMOUNT:-llmount}
+SECURITY=${SECURITY:-"null"}
config=$NAME.xml
mkconfig=$NAME.sh
+. krb5_env.sh
+start_krb5_kdc || exit 1
+
if [ "$PORTALS" ]; then
portals_opt="--portals=$PORTALS"
fi
if [ "$LDAPURL" ]; then
conf_opt="--ldapurl $LDAPURL --config $NAME"
else
- sh $mkconfig $config || exit 1
+ sh $mkconfig $config || exit 2
conf_opt="$config"
fi
[ "$NODE" ] && node_opt="--node $NODE"
-${LCONF} $NOMOD $portals_opt $lustre_opt $node_opt ${REFORMAT:---reformat} $@ \
- $conf_opt || exit 2
+# We'd better start lsvcgssd after gss modules loaded.
+# remove this if we don't depend on lsvcgssd in the future
+${LCONF} --nosetup --sec $SECURITY $portals_opt $node_opt $@ $conf_opt || exit 3
+start_lsvcgssd || exit 4
+start_lgssd || exit 5
+
+${LCONF} $NOMOD --sec $SECURITY $portals_opt $lustre_opt $node_opt \
+ ${REFORMAT:---reformat} $@ $conf_opt || exit 6
if [ "$MOUNT2" ]; then
- $LLMOUNT -v `hostname`:/mds1/client $MOUNT2 || exit 3
+ $LLMOUNT -v -o sec=$SECURITY `hostname`:/mds1/client $MOUNT2 || exit 7
fi
config=$NAME.xml
mkconfig=$NAME.sh
+. krb5_env.sh
+
if [ "$PORTALS" ]; then
portals_opt="--portals=$PORTALS"
fi
--dump $TMP/debug $conf_opt
rc=$?
echo "lconf DONE"
+stop_lsvcgssd
+stop_lgssd
+
BUSY=`dmesg | grep -i destruct`
if [ "$BUSY" ]; then
echo "$BUSY" 1>&2
LCONF=${LCONF:-lconf}
NAME=${NAME:-local}
LLMOUNT=${LLMOUNT:-llmount}
+SECURITY=${SECURITY:-"null"}
config=$NAME.xml
mkconfig=$NAME.sh
+. krb5_env.sh
+
+start_krb5_kdc || exit 1
+
if [ "$PORTALS" ]; then
portals_opt="--portals=$PORTALS"
fi
conf_opt="--ldapurl $LDAPURL --config $NAME"
else
if [ ! -f $config -o $mkconfig -nt $config ]; then
- sh $mkconfig $config || exit 1
+ sh $mkconfig $config || exit 2
fi
conf_opt="$config"
fi
[ "$NODE" ] && node_opt="--node $NODE"
-${LCONF} $NOMOD $portals_opt $lustre_opt $node_opt $@ $conf_opt || exit 2
+# We'd better start lsvcgssd after gss modules loaded.
+# remove this if we don't depend on lsvcgssd in the future
+${LCONF} --nosetup --sec $SECURITY $portals_opt $node_opt $@ $conf_opt || exit 3
+start_lsvcgssd || exit 4
+start_lgssd || exit 5
+
+${LCONF} $NOMOD --sec $SECURITY $portals_opt $lustre_opt $node_opt \
+ $@ $conf_opt || exit 6
if [ "$MOUNT2" ]; then
- $LLMOUNT -v `hostname`:/mds1/client $MOUNT2 || exit 3
+ $LLMOUNT -v -o sec=$SECURITY `hostname`:/mds1/client $MOUNT2 || exit 7
fi
# create nodes
${LMC} -m $config --add node --node localhost || exit 10
-${LMC} -m $config --add net --node localhost --nid localhost --nettype tcp || exit 11
+${LMC} -m $config --add net --node localhost --nid `hostname` --nettype tcp || exit 11
# configure mds server
${LMC} -m $config --add lmv --lmv lmv1 || exit 12
setup() {
gen_config
+ start_krb5_kdc || exit 1
start ost --reformat $OSTLCONFARGS
start ost2 --reformat $OSTLCONFARGS
+ start_lsvcgssd || exit 2
+ start_lgssd || exit 3
[ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
for mds in `mds_list`; do
start $mds --reformat $MDSLCONFARGS
done
-
grep " $MOUNT " /proc/mounts || zconf_mount `hostname` $MOUNT
}
for mds in `mds_list`; do
stop $mds ${FORCE} $MDSLCONFARGS
done
+ stop_lgssd
+ stop_lsvcgssd
stop ost2 ${FORCE} --dump cleanup.log
stop ost ${FORCE} --dump cleanup.log
}
for mds in `mds_list`; do
stop $mds ${FORCE} $MDSLCONFARGS
done
+ stop_lgssd
+ stop_lsvcgssd
stop ost2 ${FORCE}
stop ost ${FORCE} --dump cleanup-dual.log
}
setup() {
gen_config
+
+ start_krb5_kdc || exit 1
start ost --reformat $OSTLCONFARGS
PINGER=`cat /proc/fs/lustre/pinger`
fi
start ost2 --reformat $OSTLCONFARGS
+ start_lsvcgssd || exit 2
+ start_lgssd || exit 3
[ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
for mds in `mds_list`; do
start $mds --reformat $MDSLCONFARGS
# Skip these tests
ALWAYS_EXCEPT=""
+if [ `using_krb5_sec $SECURITY` == 'n' ] ; then
+ ALWAYS_EXCEPT="0c $ALWAYS_EXCEPT"
+fi
+
+
gen_config() {
rm -f $XMLCONFIG
for mds in `mds_list`; do
stop $mds ${FORCE} $MDSLCONFARGS
done
+ stop_lgssd
+ stop_lsvcgssd
stop ost2 ${FORCE} --dump cleanup.log
stop ost ${FORCE} --dump cleanup.log
}
setup() {
gen_config
+ start_krb5_kdc || exit 1
start ost --reformat $OSTLCONFARGS
start ost2 --reformat $OSTLCONFARGS
+ start_lsvcgssd || exit 2
+ start_lgssd || exit 3
[ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
for mds in `mds_list`; do
start $mds --reformat $MDSLCONFARGS
}
run_test 0b "ensure object created after recover exists. (3284)"
+test_0c() {
+ # drop gss error notification
+ replay_barrier mds1
+ fail_drop mds1 0x760
+
+ # drop gss init request
+ replay_barrier mds1
+ fail_drop mds1 0x780
+}
+run_test 0c "empty replay with gss init failures"
+
test_1() {
replay_barrier mds1
mcreate $DIR/$tfile
--- /dev/null
+#!/usr/bin/perl
+
+use strict;
+use FileHandle;
+use POSIX qw(geteuid getegid isatty);
+
+my $owner = getpwuid(geteuid());
+my $group = getgrgid(getegid());
+
+my ($OK, $FAILED) = ("ok", "failed");
+if (isatty(fileno(STDOUT))) {
+ $OK = "\033[32m" . $OK . "\033[m";
+ $FAILED = "\033[31m\033[1m" . $FAILED . "\033[m";
+}
+
+my ($prog, $in, $out) = ([], [], []);
+my $line = 0;
+my $prog_line;
+my ($tests, $failed);
+
+for (;;) {
+ my $script = <>; $line++;
+ $script =~ s/\@OWNER\@/$owner/g;
+ $script =~ s/\@GROUP\@/$group/g;
+ next if (defined($script) && $script =~ /^!/);
+ if (!defined($script) || $script =~ s/^\$ ?//) {
+ if (@$prog) {
+ #print "[$prog_line] \$ ", join(' ', @$prog), " -- ";
+ my $p = [ @$prog ];
+ print "[$prog_line] \$ ", join(' ',
+ map { s/\s/\\$&/g; $_ } @$p), " -- ";
+ my $result = exec_test($prog, $in);
+ my $good = 1;
+ my $nmax = (@$out > @$result) ? @$out : @$result;
+ for (my $n=0; $n < $nmax; $n++) {
+ if (!defined($out->[$n]) || !defined($result->[$n]) ||
+ $out->[$n] ne $result->[$n]) {
+ $good = 0;
+ #chomp $out->[$n];
+ #chomp $result->[$n];
+ #print "$out->[$n] != $result->[$n]";
+ }
+ }
+ $tests++;
+ $failed++ unless $good;
+ print $good ? $OK : $FAILED, "\n";
+ if (!$good) {
+ for (my $n=0; $n < $nmax; $n++) {
+ my $l = defined($out->[$n]) ? $out->[$n] : "~";
+ chomp $l;
+ my $r = defined($result->[$n]) ? $result->[$n] : "~";
+ chomp $r;
+ print sprintf("%-37s | %-39s\n", $l, $r);
+ }
+ }
+ }
+ #$prog = [ split /\s+/, $script ] if $script;
+ $prog = [ map { s/\\(.)/$1/g; $_ } split /(?<!\\)\s+/, $script ] if $script;
+ $prog_line = $line;
+ $in = [];
+ $out = [];
+ } elsif ($script =~ s/^> ?//) {
+ push @$in, $script;
+ } else {
+ push @$out, $script;
+ }
+ last unless defined($script);
+}
+my $status = sprintf("%d commands (%d passed, %d failed)",
+ $tests, $tests-$failed, $failed);
+if (isatty(fileno(STDOUT))) {
+ if ($failed) {
+ $status = "\033[31m\033[1m" . $status . "\033[m";
+ } else {
+ $status = "\033[32m" . $status . "\033[m";
+ }
+}
+print $status, "\n";
+exit $failed ? 1 : 0;
+
+sub exec_test($$) {
+ my ($prog, $in) = @_;
+ local (*IN, *IN_DUP, *IN2, *OUT_DUP, *OUT, *OUT2);
+
+ if ($prog->[0] eq "umask") {
+ umask oct $prog->[1];
+ return [];
+ } elsif ($prog->[0] eq "cd") {
+ if (!chdir $prog->[1]) {
+ return [ "chdir: $prog->[1]: $!\n" ];
+ }
+ return [];
+ }
+
+ pipe *IN2, *OUT
+ or die "Can't create pipe for reading: $!";
+ open *IN_DUP, "<&STDIN"
+ or *IN_DUP = undef;
+ open *STDIN, "<&IN2"
+ or die "Can't duplicate pipe for reading: $!";
+ close *IN2;
+
+ open *OUT_DUP, ">&STDOUT"
+ or die "Can't duplicate STDOUT: $!";
+ pipe *IN, *OUT2
+ or die "Can't create pipe for writing: $!";
+ open *STDOUT, ">&OUT2"
+ or die "Can't duplicate pipe for writing: $!";
+ close *OUT2;
+
+ *STDOUT->autoflush();
+ *OUT->autoflush();
+
+ if (fork()) {
+ # Server
+ if (*IN_DUP) {
+ open *STDIN, "<&IN_DUP"
+ or die "Can't duplicate STDIN: $!";
+ close *IN_DUP
+ or die "Can't close STDIN duplicate: $!";
+ }
+ open *STDOUT, ">&OUT_DUP"
+ or die "Can't duplicate STDOUT: $!";
+ close *OUT_DUP
+ or die "Can't close STDOUT duplicate: $!";
+
+ foreach my $line (@$in) {
+ #print "> $line";
+ print OUT $line;
+ }
+ close *OUT
+ or die "Can't close pipe for writing: $!";
+
+ my $result = [];
+ while (<IN>) {
+ #print "< $_";
+ push @$result, $_;
+ }
+ return $result;
+ } else {
+ # Client
+ close IN
+ or die "Can't close read end for input pipe: $!";
+ close OUT
+ or die "Can't close write end for output pipe: $!";
+ close OUT_DUP
+ or die "Can't close STDOUT duplicate: $!";
+ local *ERR_DUP;
+ open ERR_DUP, ">&STDERR"
+ or die "Can't duplicate STDERR: $!";
+ open STDERR, ">&STDOUT"
+ or die "Can't join STDOUT and STDERR: $!";
+
+ #print ERR_DUP "<", join(' ', @$prog), ">\n";
+ exec @$prog;
+ print ERR_DUP $prog->[0], ": $!\n";
+ exit;
+ }
+}
+
--- /dev/null
+#!/bin/bash
+#
+# Run select tests by setting ONLY, or as arguments to the script.
+# Skip specific tests by setting EXCEPT.
+#
+# e.g. ONLY="22 23" or ONLY="`seq 32 39`" or EXCEPT="31"
+set -e
+
+ONLY=${ONLY:-"$*"}
+ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-""}
+[ "$ALWAYS_EXCEPT$EXCEPT" ] && echo "Skipping tests: $ALWAYS_EXCEPT $EXCEPT"
+
+SRCDIR=`dirname $0`
+export PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH
+
+TMP=${TMP:-/tmp}
+FSTYPE=${FSTYPE:-ext3}
+
+CHECKSTAT=${CHECKSTAT:-"checkstat -v"}
+CREATETEST=${CREATETEST:-createtest}
+LFS=${LFS:-lfs}
+LSTRIPE=${LSTRIPE:-"$LFS setstripe"}
+LFIND=${LFIND:-"$LFS find"}
+LVERIFY=${LVERIFY:-ll_dirstripe_verify}
+LCTL=${LCTL:-lctl}
+MCREATE=${MCREATE:-mcreate}
+OPENFILE=${OPENFILE:-openfile}
+OPENUNLINK=${OPENUNLINK:-openunlink}
+TOEXCL=${TOEXCL:-toexcl}
+TRUNCATE=${TRUNCATE:-truncate}
+MUNLINK=${MUNLINK:-munlink}
+SOCKETSERVER=${SOCKETSERVER:-socketserver}
+SOCKETCLIENT=${SOCKETCLIENT:-socketclient}
+IOPENTEST1=${IOPENTEST1:-iopentest1}
+IOPENTEST2=${IOPENTEST2:-iopentest2}
+PTLDEBUG=${PTLDEBUG:-0}
+
+if [ $UID -ne 0 ]; then
+ RUNAS_ID="$UID"
+ RUNAS=""
+else
+ RUNAS_ID=${RUNAS_ID:-500}
+ RUNAS=${RUNAS:-"runas -u $RUNAS_ID"}
+fi
+
+export NAME=${NAME:-local}
+
+SAVE_PWD=$PWD
+
+clean() {
+ echo -n "cln.."
+ sh llmountcleanup.sh > /dev/null || exit 20
+ I_MOUNTED=no
+}
+CLEAN=${CLEAN:-clean}
+
+start() {
+ echo -n "mnt.."
+ sh llrmount.sh > /dev/null || exit 10
+ I_MOUNTED=yes
+ echo "done"
+}
+START=${START:-start}
+
+log() {
+ echo "$*"
+ lctl mark "$*" 2> /dev/null || true
+}
+
+trace() {
+ log "STARTING: $*"
+ strace -o $TMP/$1.strace -ttt $*
+ RC=$?
+ log "FINISHED: $*: rc $RC"
+ return 1
+}
+TRACE=${TRACE:-""}
+
+check_kernel_version() {
+ VERSION_FILE=/proc/fs/lustre/kernel_version
+ WANT_VER=$1
+ [ ! -f $VERSION_FILE ] && echo "can't find kernel version" && return 1
+ GOT_VER=`cat $VERSION_FILE`
+ [ $GOT_VER -ge $WANT_VER ] && return 0
+ log "test needs at least kernel version $WANT_VER, running $GOT_VER"
+ return 1
+}
+
+run_one() {
+ if ! mount | grep -q $DIR; then
+ $START
+ fi
+ echo $PTLDEBUG >/proc/sys/portals/debug
+ log "== test $1: $2"
+ export TESTNAME=test_$1
+ test_$1 || error "test_$1: exit with rc=$?"
+ unset TESTNAME
+ pass
+ cd $SAVE_PWD
+ $CLEAN
+}
+
+build_test_filter() {
+ for O in $ONLY; do
+ eval ONLY_${O}=true
+ done
+ for E in $EXCEPT $ALWAYS_EXCEPT; do
+ eval EXCEPT_${E}=true
+ done
+}
+
+_basetest() {
+ echo $*
+}
+
+basetest() {
+ IFS=abcdefghijklmnopqrstuvwxyz _basetest $1
+}
+
+run_test() {
+ base=`basetest $1`
+ if [ "$ONLY" ]; then
+ testname=ONLY_$1
+ if [ ${!testname}x != x ]; then
+ run_one $1 "$2"
+ return $?
+ fi
+ testname=ONLY_$base
+ if [ ${!testname}x != x ]; then
+ run_one $1 "$2"
+ return $?
+ fi
+ echo -n "."
+ return 0
+ fi
+ testname=EXCEPT_$1
+ if [ ${!testname}x != x ]; then
+ echo "skipping excluded test $1"
+ return 0
+ fi
+ testname=EXCEPT_$base
+ if [ ${!testname}x != x ]; then
+ echo "skipping excluded test $1 (base $base)"
+ return 0
+ fi
+ run_one $1 "$2"
+ return $?
+}
+
+[ "$SANITYLOG" ] && rm -f $SANITYLOG || true
+
+error() {
+ log "FAIL: $@"
+ if [ "$SANITYLOG" ]; then
+ echo "FAIL: $TESTNAME $@" >> $SANITYLOG
+ else
+ exit 1
+ fi
+}
+
+pass() {
+ echo PASS
+}
+
+MOUNT="`mount | awk '/^'$NAME' .* lustre_lite / { print $3 }'`"
+if [ -z "$MOUNT" ]; then
+ sh llmount.sh
+ MOUNT="`mount | awk '/^'$NAME' .* lustre_lite / { print $3 }'`"
+ [ -z "$MOUNT" ] && error "NAME=$NAME not mounted"
+ I_MOUNTED=yes
+fi
+
+[ `echo $MOUNT | wc -w` -gt 1 ] && error "NAME=$NAME mounted more than once"
+
+DIR=${DIR:-$MOUNT}
+[ -z "`echo $DIR | grep $MOUNT`" ] && echo "$DIR not in $MOUNT" && exit 99
+
+rm -rf $DIR/[Rdfs][1-9]*
+build_test_filter
+
+echo preparing for tests involving mounts
+EXT2_DEV=${EXT2_DEV:-/tmp/SANITY.LOOP}
+touch $EXT2_DEV
+mke2fs -j -F $EXT2_DEV 8000 >/dev/null 2>&1
+
+find_free_loop() {
+ local LOOP_DEV=""
+ test -b /dev/loop0 &&
+ base="/dev/loop" || base="/dev/loop/"
+
+ for ((i=0;i<256;i++)); do
+ test -b $base$i || continue
+
+ losetup $base$i >/dev/null 2>&1 || {
+ LOOP_DEV="$base$i"
+ break
+ }
+ done
+ echo $LOOP_DEV
+}
+
+cleanup_loop() {
+ local LOOP_DEV=$1
+ local LOOP_FILE=$2
+ local LOOP_MNTPT=$3
+
+ chmod u-s $LOOP_MNTPT >/dev/null 2>&1
+ umount $LOOP_MNTPT >/dev/null 2>&1
+ losetup -d $LOOP_DEV >/dev/null 2>&1
+ rm -fr $LOOP_FILE >/dev/null 2>&1
+ rm -fr $LOOP_MNTPT >/dev/null 2>&1
+}
+
+setup_loop() {
+	local LOOP_DEV=$1
+	local LOOP_FILE=$2
+
+	dd if=/dev/zero of=$LOOP_FILE bs=1M count=10 2>/dev/null || return $?
+
+	losetup $LOOP_DEV $LOOP_FILE || {
+		rc=$?
+		cleanup_loop $LOOP_DEV $LOOP_FILE $DIR/gns_test_1a
+		return $rc
+	}
+
+	mke2fs -F $LOOP_DEV >/dev/null 2>&1 || {
+		rc=$?
+		cleanup_loop $LOOP_DEV $LOOP_FILE $DIR/gns_test_1a
+		echo "cannot create test ext2 fs on $LOOP_DEV"
+		return $rc
+	}
+	return 0
+}
+
+prep_upcall() {
+	local INJECTION=""
+	local UPCALL=$1
+	local MODE=$2
+	local LOG=$3
+
+	test "x$MODE" = "xDEADLOCK" &&
+	INJECTION="touch \$MNTPATH/file"
+
+	cat > $UPCALL <<- EOF
+#!/bin/sh
+
+MOUNT=\`which mount 2>/dev/null\`
+test "x\$MOUNT" = "x" && MOUNT="/bin/mount"
+
+OPTIONS=\$1
+MNTPATH=\$2
+
+test "x\$OPTIONS" = "x" -o "x\$MNTPATH" = "x" &&
+exit 1
+
+$INJECTION
+\$MOUNT \$OPTIONS \$MNTPATH > $LOG 2>&1
+exit \$?
+EOF
+	chmod +x $UPCALL
+	return $?
+}
+
+check_gns() {
+	local LOG="/tmp/gns-log"
+	local UPCALL_PATH=""
+
+	local UPCALL=$1
+	local OBJECT=$2
+	local TIMOUT=$3
+	local TICK=$4
+
+	rm -fr $LOG >/dev/null 2>&1
+	UPCALL_PATH="/tmp/gns-upcall-$UPCALL.sh"
+
+	echo "generating upcall $UPCALL_PATH"
+	prep_upcall $UPCALL_PATH $UPCALL $LOG || return $?
+	echo "======================== upcall script ==========================="
+	cat $UPCALL_PATH 2>/dev/null || return $?
+	echo "=================================================================="
+
+	echo "$UPCALL_PATH" > /proc/fs/lustre/llite/fs0/gns_upcall || return $?
+	echo "upcall: $(cat /proc/fs/lustre/llite/fs0/gns_upcall)"
+
+	echo -n "mount on open $OBJECT/test_file1: "
+	echo -n "test data" > $OBJECT/test_file1 2>/dev/null || return $?
+
+	local ENTRY="`basename $OBJECT`"
+
+	cat /proc/mounts | grep -q "$ENTRY" || {
+		echo "fail"
+		test -f $LOG && {
+			echo "======================== upcall log ==========================="
+			cat $LOG
+			echo "==============================================================="
+		} || {
+			echo "upcall log file $LOG is not found"
+		}
+		return 1
+	}
+	echo "success"
+
+	local sleep_time=$TIMOUT
+	let sleep_time+=$TICK*2
+	echo -n "waiting for umount ${sleep_time}s (timeout + tick*2): "
+	sleep $sleep_time
+
+	cat /proc/mounts | grep -q "$ENTRY" && {
+		echo "failed"
+		return 2
+	}
+	echo "success"
+	return 0
+}
+
+test_1a() {
+ local LOOP_DEV=$(find_free_loop 2>/dev/null)
+ local UPCALL="/tmp/gns-upcall.sh"
+ local LOOP_FILE="/tmp/gns_loop"
+ local OBJECT=".mntinfo"
+ local TIMOUT=5
+ local TICK=1
+
+ test "x$LOOP_DEV" != "x" && test -b $LOOP_DEV ||
+ error "can't find free loop device"
+
+ echo "preparing loop device $LOOP_DEV <-> $LOOP_FILE..."
+ cleanup_loop $LOOP_DEV $LOOP_FILE $DIR/gns_test_1a
+ setup_loop $LOOP_DEV $LOOP_FILE || error
+
+ echo "setting up GNS timeouts and mount object..."
+ echo "$OBJECT" > /proc/fs/lustre/llite/fs0/gns_object_name || error
+ echo "$TIMOUT" > /proc/fs/lustre/llite/fs0/gns_timeout || error
+ echo "$TICK" > /proc/fs/lustre/llite/fs0/gns_tick || error
+
+ echo ""
+ echo "timeout: $(cat /proc/fs/lustre/llite/fs0/gns_timeout)s"
+ echo "object: $(cat /proc/fs/lustre/llite/fs0/gns_object_name)"
+ echo "tick: $(cat /proc/fs/lustre/llite/fs0/gns_tick)s"
+ echo ""
+
+ echo "preparing mount object at $DIR/gns_test_1a/$OBJECT..."
+ mkdir -p $DIR/gns_test_1a || error
+ echo -n "-t ext2 $LOOP_DEV" > $DIR/gns_test_1a/$OBJECT
+ echo "======================== mount object ==========================="
+ cat $DIR/gns_test_1a/$OBJECT
+ echo ""
+ echo "================================================================="
+ chmod u+s $DIR/gns_test_1a || error
+
+ echo ""
+ echo "testing GNS with GENERIC upcall 2 times on the row"
+ for ((i=0;i<2;i++)); do
+ check_gns GENERIC $DIR/gns_test_1a $TIMOUT $TICK || {
+ cleanup_loop $LOOP_DEV $LOOP_FILE $DIR/gns_test_1a
+ error
+ }
+ done
+
+ echo ""
+ echo "testing GNS with DEADLOCK upcall 2 times on the row"
+ for ((i=0;i<2;i++)); do
+ check_gns DEADLOCK $DIR/gns_test_1a $TIMOUT $TICK || {
+ cleanup_loop $LOOP_DEV $LOOP_FILE $DIR/gns_test_1a
+ error
+ }
+ done
+
+ cleanup_loop $LOOP_DEV $LOOP_FILE $DIR/gns_test_1a
+}
+
+run_test 1a " general GNS test - mounting/umount ===================="
+
+TMPDIR=$OLDTMPDIR
+TMP=$OLDTMP
+HOME=$OLDHOME
+
+log "cleanup: ==========================================================="
+if [ "`mount | grep ^$NAME`" ]; then
+ rm -rf $DIR/[Rdfs][1-9]*
+ if [ "$I_MOUNTED" = "yes" ]; then
+ sh llmountcleanup.sh || error
+ fi
+fi
+
+echo '=========================== finished ==============================='
+[ -f "$SANITYLOG" ] && cat $SANITYLOG && exit 1 || true
SRCDIR=`dirname $0`
export PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH
+export SECURITY=${SECURITY:-"null"}
TMP=${TMP:-/tmp}
FSTYPE=${FSTYPE:-ext3}
IOPENTEST2=${IOPENTEST2:-iopentest2}
PTLDEBUG=${PTLDEBUG:-0}
+. krb5_env.sh
+
if [ $UID -ne 0 ]; then
RUNAS_ID="$UID"
RUNAS=""
RUNAS=${RUNAS:-"runas -u $RUNAS_ID"}
fi
+if [ `using_krb5_sec $SECURITY` == 'y' ] ; then
+ start_krb5_kdc || exit 1
+ if [ $RUNAS_ID -ne $UID ]; then
+ $RUNAS ./krb5_refresh_cache.sh || exit 2
+ fi
+fi
+
export NAME=${NAME:-lmv}
SAVE_PWD=$PWD
SRCDIR=`dirname $0`
export PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH
+export SECURITY=${SECURITY:-"null"}
TMP=${TMP:-/tmp}
FSTYPE=${FSTYPE:-ext3}
IOPENTEST1=${IOPENTEST1:-iopentest1}
IOPENTEST2=${IOPENTEST2:-iopentest2}
+. krb5_env.sh
+
if [ $UID -ne 0 ]; then
RUNAS_ID="$UID"
RUNAS=""
RUNAS=${RUNAS:-"runas -u $RUNAS_ID"}
fi
+if [ `using_krb5_sec $SECURITY` == 'y' ] ; then
+ start_krb5_kdc || exit 1
+ if [ $RUNAS_ID -ne $UID ]; then
+ $RUNAS ./krb5_refresh_cache.sh || exit 2
+ fi
+fi
+
export NAME=${NAME:-local}
SAVE_PWD=$PWD
run_test 1 "test root_squash ============================"
+test_2() {
+ touch $DIR/f2
+
+ #test set/get xattr
+ setfattr -n trusted.name1 -v value1 $DIR/f2 || error
+ [ "`getfattr -n trusted.name1 $DIR/f2 2> /dev/null | \
+ grep "trusted.name1"`" == "trusted.name1=\"value1\"" ] || error
+
+ setfattr -n user.author1 -v author1 $DIR/f2 || error
+ [ "`getfattr -n user.author1 $DIR/f2 2> /dev/null | \
+ grep "user.author1"`" == "user.author1=\"author1\"" ] || error
+
+ # test listxattr
+ setfattr -n trusted.name2 -v value2 $DIR/f2 || error
+ setfattr -n trusted.name3 -v value3 $DIR/f2 || error
+ [ `getfattr -d -m "^trusted" $DIR/f2 2> /dev/null | \
+ grep "trusted" | wc -l` -eq 5 ] || error
+
+
+ setfattr -n user.author2 -v author2 $DIR/f2 || error
+ setfattr -n user.author3 -v author3 $DIR/f2 || error
+ [ `getfattr -d -m "^user" $DIR/f2 2> /dev/null | \
+ grep "user" | wc -l` -eq 3 ] || error
+ #test removexattr
+ setfattr -x trusted.name1 $DIR/f2 2> /dev/null || error
+ getfattr -d -m trusted $DIR/f2 2> /dev/null | \
+ grep "trusted.name1" && error || true
+
+ setfattr -x user.author1 $DIR/f2 2> /dev/null || error
+ getfattr -d -m user $DIR/f2 2> /dev/null | \
+ grep "user.author1" && error || true
+}
+run_test 2 "set/get xattr test (trusted xattr only) ============"
+
+test_3 () {
+ SAVE_UMASK=`umask`
+ umask 022
+ USER1=rpm
+ USER2=vsx2
+ GROUP1=nobody
+ GROUP2=users
+
+ chmod +x runacltest
+ chmod +x acl_mode
+ cd $DIR
+
+ #sed -e "s/joe/$USER1/g;s/lisa/$USER2/g;s/users/$GROUP1/g;s/toolies/$GROUP2/g" $SAVE_PWD/setfacl.test | runacltest ||
+#error "$? setfacl tests failed"
+
+ #sed -e "s/joe/$USER1/g;s/lisa/$USER2/g;s/users/$GROUP1/g;s/toolies/$GROUP2/g" $SAVE_PWD/acl_asroot.test | runacltest || error "$? acl_asroot tests failed"
+
+ #sed -e "s/joe/$USER1/g;s/lisa/$USER2/g;s/users/$GROUP1/g;s/toolies/$GROUP2/g" $SAVE_PWD/acl_perm.test | runacltest || error "$? acl_perm tests failed"
+
+ #sed -e "s/joe/$USER1/g;s/lisa/$USER2/g;s/users/$GROUP1/g;s/toolies/$GROUP2/g" $SAVE_PWD/acl_misc.test | runacltest || error "$? acl_misc tests failed"
+
+ sed -e "s/joe/$USER1/g;s/lisa/$USER2/g;s/users/$GROUP1/g;s/toolies/$GROUP2/g" $SAVE_PWD/acl_fileutil.test | runacltest || error "$? acl_fileutil tests failed"
+
+ umask $SAVE_UMASK
+}
+run_test 3 "==============acl test ============="
+
TMPDIR=$OLDTMPDIR
TMP=$OLDTMP
HOME=$OLDHOME
# bug number for skipped test: 2739
# 51b and 51c depend on kernel
# 65* fixes in b_hd_cray_merge3
-ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"51b 51c 65a 65b 65c 65d 65e 65f"}
+# the new kernel api make 48 not valid anymore
+ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"48 51b 51c 65a 65b 65c 65d 65e 65f"}
# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
[ "$ALWAYS_EXCEPT$EXCEPT" ] && echo "Skipping tests: $ALWAYS_EXCEPT $EXCEPT"
SRCDIR=`dirname $0`
export PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH
+export SECURITY=${SECURITY:-"null"}
TMP=${TMP:-/tmp}
FSTYPE=${FSTYPE:-ext3}
IOPENTEST2=${IOPENTEST2:-iopentest2}
MEMHOG=${MEMHOG:-memhog}
+. krb5_env.sh
+
if [ $UID -ne 0 ]; then
RUNAS_ID="$UID"
RUNAS=""
RUNAS=${RUNAS:-"runas -u $RUNAS_ID"}
fi
+if [ `using_krb5_sec $SECURITY` == 'y' ] ; then
+ start_krb5_kdc || exit 1
+ if [ $RUNAS_ID -ne $UID ]; then
+ $RUNAS ./krb5_refresh_cache.sh || exit 2
+ fi
+fi
+
export NAME=${NAME:-local}
SAVE_PWD=$PWD
}
run_test 18 "mmap sanity check ================================="
+test_19() { # bug 2441
+ touch $DIR1/f2b
+
+ #test set/get xattr
+ setfattr -n trusted.name1 -v value1 $DIR1/f2b || error
+ [ "`getfattr -n trusted.name1 $DIR2/f2b 2> /dev/null | \
+ grep "trusted.name1"`" == "trusted.name1=\"value1\"" ] || error
+
+ setfattr -n user.author1 -v author1 $DIR/f2b || error
+ [ "`getfattr -n user.author1 $DIR/f2b 2> /dev/null | \
+ grep "user.author1"`" == "user.author1=\"author1\"" ] || error
+
+ # test listxattr
+ setfattr -n trusted.name2 -v value2 $DIR2/f2b || error
+ setfattr -n trusted.name3 -v value3 $DIR1/f2b || error
+ [ `getfattr -d -m "^trusted" $DIR2/f2b 2> /dev/null | \
+ grep "trusted" | wc -l` -eq 5 ] || error
+
+ setfattr -n user.author2 -v author2 $DIR/f2b || error
+ setfattr -n user.author3 -v author3 $DIR/f2b || error
+ [ `getfattr -d -m "^user" $DIR/f2b 2> /dev/null | \
+ grep "user" | wc -l` -eq 3 ] || error
+ #test removexattr
+ setfattr -x trusted.name1 $DIR2/f2b 2> /dev/null || error
+ getfattr -d -m trusted $DIR2/f2b 2> /dev/null | \
+ grep "trusted.name1" && error || true
+
+ setfattr -x user.author1 $DIR/f2b 2> /dev/null || error
+ getfattr -d -m user $DIR/f2b 2> /dev/null | \
+ grep "user.author1" && error || true
+}
+run_test 19 "test set/get xattr on multiple mounts ============"
+
log "cleanup: ======================================================"
rm -rf $DIR1/[df][0-9]* $DIR1/lnk || true
--- /dev/null
+!
+! setfacl tests.
+!
+! Run these tests on a filesystem with ACL support.
+!
+$ umask 027
+$ touch g
+$ acl_mode g
+-rw-r-----
+$ setfacl -m m:- g
+$ acl_mode g
+-rw-------+
+$ getfacl g
+# file: g
+# owner: @OWNER@
+# group: @GROUP@
+user::rw-
+group::r-- #effective:---
+mask::---
+other::---
+
+$ setfacl -x m g
+$ getfacl g
+# file: g
+# owner: @OWNER@
+# group: @GROUP@
+user::rw-
+group::r--
+other::---
+
+$ setfacl -m u:joe:rw g
+$ getfacl g
+# file: g
+# owner: @OWNER@
+# group: @GROUP@
+user::rw-
+user:joe:rw-
+group::r--
+mask::rw-
+other::---
+
+$ setfacl -m u::rwx,g::r-x,o:- g
+$ getfacl g
+# file: g
+# owner: @OWNER@
+# group: @GROUP@
+user::rwx
+user:joe:rw-
+group::r-x
+mask::rwx
+other::---
+
+$ setfacl -m u::rwx,g::r-x,o:-,m:- g
+$ getfacl g
+# file: g
+# owner: @OWNER@
+# group: @GROUP@
+user::rwx
+user:joe:rw- #effective:---
+group::r-x #effective:---
+mask::---
+other::---
+
+$ setfacl -m u::rwx,g::r-x,o:-,u:root:-,m:- g
+$ getfacl g
+# file: g
+# owner: @OWNER@
+# group: @GROUP@
+user::rwx
+user:root:---
+user:joe:rw- #effective:---
+group::r-x #effective:---
+mask::---
+other::---
+
+$ setfacl -m u::rwx,g::r-x,o:-,u:root:-,m:- g
+$ getfacl g
+# file: g
+# owner: @OWNER@
+# group: @GROUP@
+user::rwx
+user:root:---
+user:joe:rw- #effective:---
+group::r-x #effective:---
+mask::---
+other::---
+
+$ setfacl -m u::rwx,g::r-x,o:-,u:root:- g
+$ getfacl g
+# file: g
+# owner: @OWNER@
+# group: @GROUP@
+user::rwx
+user:root:---
+user:joe:rw-
+group::r-x
+mask::rwx
+other::---
+
+$ setfacl --test -x u: g
+setfacl: g: Malformed access ACL `user:root:---,user:joe:rw-,group::r-x,mask::rwx,other::---': Missing or wrong entry at entry 1
+$ setfacl --test -x u:x
+setfacl: Option -x: Invalid argument near character 3
+$ setfacl -m d:u:root:rwx g
+setfacl: g: Only directories can have default ACLs
+$ setfacl -x m g
+setfacl: g: Malformed access ACL `user::rwx,user:root:---,user:joe:rw-,group::r-x,other::---': Missing or wrong entry at entry 5
+!setfacl --test -m d:u:joe:rwx setfacl
+!setfacl --test -n -m d:u:joe:rwx setfacl
+$ rm g
+!
+! Check if the mask is properly recalculated
+!
+$ mkdir d
+$ setfacl --test -m u::rwx,u:@OWNER@:rwx,g::r-x,o::--- d
+d: u::rwx,u:@OWNER@:rwx,g::r-x,m::rwx,o::---,*
+$ setfacl --test -m u::rwx,u:@OWNER@:rwx,g::r-x,m::---,o::--- d
+d: u::rwx,u:@OWNER@:rwx,g::r-x,m::---,o::---,*
+$ setfacl --test -d -m u::rwx,u:@OWNER@:rwx,g::r-x,o::--- d
+d: *,d:u::rwx,d:u:@OWNER@:rwx,d:g::r-x,d:m::rwx,d:o::---
+$ setfacl --test -d -m u::rwx,u:@OWNER@:rwx,g::r-x,m::---,o::--- d
+d: *,d:u::rwx,d:u:@OWNER@:rwx,d:g::r-x,d:m::---,d:o::---
+$ rmdir d
export LCTL=${LCTL:-"$LUSTRE/utils/lctl"}
export CHECKSTAT="${CHECKSTAT:-checkstat} "
export FSYTPE=${FSTYPE:-"ext3"}
+ export SECURITY=${SECURITY:-"null"}
# Paths on remote nodes, if different
export RLUSTRE=${RLUSTRE:-$LUSTRE}
# echo "CONFIG=`canonical_path $CONFIG`" > $LUSTRE/tests/CONFIG
}
+. krb5_env.sh
+
# Facet functions
start() {
facet=$1
active=`facet_active $facet`
do_facet $facet $LCONF --select ${facet}_svc=${active}_facet \
--node ${active}_facet --ptldebug $PTLDEBUG --subsystem $SUBSYSTEM \
- $@ $XMLCONFIG
+ --sec $SECURITY $@ $XMLCONFIG
}
stop() {
do_node $client mkdir $mnt 2> /dev/null || :
if [ -x /sbin/mount.lustre ] ; then
- do_node $client mount -t lustre -o nettype=$NETTYPE `facet_active_host mds1`:/mds1_svc/client_facet $mnt || return 1
+ do_node $client mount -t lustre -o sec=$SECURITY,nettype=$NETTYPE \
+ `facet_active_host mds1`:/mds1_svc/client_facet $mnt || return 2
else
# this is so cheating
do_node $client $LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null || return 2
- do_node $client $LLMOUNT `facet_active_host mds1`:/mds1_svc/client_facet $mnt -o nettype=$NETTYPE|| return 4
+ do_node $client $LLMOUNT `facet_active_host mds1`:/mds1_svc/client_facet $mnt \
+ -o sec=$SECURITY,nettype=$NETTYPE|| return 4
fi
[ -d /r ] && $LCTL modules > /r/tmp/ogdb-`hostname`
df $MOUNT || error "post-failover df: $?"
}
+fail_drop() {
+ local facet=$1
+ local failcode=$2
+ facet_failover $facet
+ do_facet mds "echo $failcode > /proc/sys/lustre/fail_loc"
+ cat /proc/sys/lustre/fail_loc
+ df $MOUNT || error "post-failover df: $?"
+ do_facet mds "echo 0 > /proc/sys/lustre/fail_loc"
+}
+
fail_abort() {
local facet=$1
stop $facet --force --failover --nomod
llmount
mount.lustre
wiretest
+lsd_upcall
.*.cmd
.*.d
if UTILS
rootsbin_SCRIPTS = mount.lustre
-sbin_PROGRAMS = lctl obdio obdbarrier lload wirecheck wiretest llmount l_getgroups
+sbin_PROGRAMS = lctl obdio obdbarrier lload wirecheck wiretest llmount lsd_upcall
bin_PROGRAMS = lfs
lib_LIBRARIES = liblustreapi.a
sbin_SCRIPTS = $(sbin_scripts)
lfs_SOURCES = lfs.c
llmount_SOURCES = llmount.c
llmount_LDADD = $(LIBREADLINE) -lptlctl
-l_getgroups_SOURCES = l_getgroups.c
+lsd_upcall_SOURCES = lsd_upcall.c
EXTRA_DIST = $(bin_scripts) $(sbin_scripts)
quit""" % (type, name, uuid)
self.run(cmds)
+ def set_security(self, name, key, value):
+ cmds = """
+ cfg_device %s
+ set_security %s %s
+ quit""" % (name, key, value)
+ self.run(cmds)
+
def setup(self, name, setup = ""):
cmds = """
cfg_device %s
self.dev_dir = dev_dir
self.name = name
+    # FIXME: ignore failure to load the gss module, since we may not
+    # need it at all.
def load(self):
"""Load module"""
log ('loading module:', self.name, 'srcdir',
if self.src_dir:
module = kmod_find(self.src_dir, self.dev_dir,
self.name)
- if not module:
+ if not module and self.name != 'ptlrpcs_gss':
panic('module not found:', self.name)
(rc, out) = run('/sbin/insmod', module)
if rc:
- raise CommandError('insmod', out, rc)
+ if self.name == 'ptlrpcs_gss':
+ print "Warning: not support gss security!"
+ else:
+ raise CommandError('insmod', out, rc)
else:
(rc, out) = run('/sbin/modprobe', self.name)
if rc:
- raise CommandError('modprobe', out, rc)
+ if self.name == 'ptlrpcs_gss':
+ print "Warning: not support gss security!"
+ else:
+ raise CommandError('modprobe', out, rc)
def cleanup(self):
"""Unload module"""
def add_module(self, manager):
manager.add_lustre_module('lvfs', 'lvfs')
manager.add_lustre_module('obdclass', 'obdclass')
+ manager.add_lustre_module('sec', 'ptlrpcs')
manager.add_lustre_module('ptlrpc', 'ptlrpc')
+ manager.add_lustre_module('sec/gss', 'ptlrpcs_gss')
def prepare(self):
return
self.info("mds", realdev, mountfsoptions, self.fstype, self.size,
self.format, master_name, profile_name, self.obdtype)
- lctl.newdev("mds", self.name, self.uuid,
- setup = "%s %s %s %s %s %s" %(realdev,
+ lctl.attach("mds", self.name, self.uuid)
+ if config.mds_mds_sec:
+ lctl.set_security(self.name, "mds_mds_sec", config.mds_mds_sec)
+ if config.mds_ost_sec:
+ lctl.set_security(self.name, "mds_ost_sec", config.mds_ost_sec)
+
+ lctl.setup(self.name, setup = "%s %s %s %s %s %s" %(realdev,
self.fstype, profile_name, mountfsoptions,
master_name, self.obdtype))
if development_mode():
- procentry = "/proc/fs/lustre/mds/grp_hash_upcall"
- upcall = os.path.abspath(os.path.dirname(sys.argv[0]) + "/l_getgroups")
+ procentry = "/proc/fs/lustre/mds/lsd_upcall"
+ upcall = os.path.abspath(os.path.dirname(sys.argv[0]) + "/lsd_upcall")
if not (os.access(procentry, os.R_OK) and os.access(upcall, os.R_OK)):
- print "MDS Warning: failed to set group-hash upcall"
+ print "MDS Warning: failed to set lsd cache upcall"
else:
run("echo ", upcall, " > ", procentry)
self.clientoptions = string.replace(self.clientoptions, "async",
"lasync")
- cmd = "mount -t lustre_lite -o osc=%s,mdc=%s%s %s %s" % \
- (self.vosc.get_name(), vmdc_name, self.clientoptions,
+ if not config.sec:
+ config.sec = "null"
+ cmd = "mount -t lustre_lite -o osc=%s,mdc=%s,sec=%s%s %s %s" % \
+ (self.vosc.get_name(), vmdc_name, config.sec, self.clientoptions,
config.config, self.path)
run("mkdir", self.path)
ret, val = run(cmd)
('config', "Cluster config name used for LDAP query", PARAM),
('select', "service=nodeA,service2=nodeB ", PARAMLIST),
('node', "Load config for <nodename>", PARAM),
+ ('sec', "security flavor <null|krb5i|krb5p> of client", PARAM),
+ ('mds_mds_sec', "security flavor <null|krb5i|krb5p> of inter mds's", PARAM),
+ ('mds_ost_sec', "security flavor <null|krb5i|krb5p> of mds's-ost's", PARAM),
('cleanup,d', "Cleans up config. (Shutdown)"),
('force,f', "Forced unmounting and/or obd detach during cleanup",
FLAG, 0),
{"deactivate", jt_obd_deactivate, 0, "deactivate an import\n"},
{"recover", jt_obd_recover, 0, "usage: recover [<connection UUID>]"},
{"lookup", jt_obd_mdc_lookup, 0, "usage: lookup <directory> <file>"},
- {"finish_gns", jt_obd_finish_gns, 0, "usage: finish_gns <directory>"},
{"notransno", jt_obd_no_transno, 0,
"disable sending of committed-transno updates\n"},
{"readonly", jt_obd_set_readonly, 0,
"usage: add_conn <conn_uuid> [priority]\n"},
{"del_conn ", jt_lcfg_del_conn, 0,
"usage: del_conn <conn_uuid> \n"},
+ {"set_security", jt_lcfg_set_security, 0,
+ "usage: set_security key value\n"},
{"lsync", jt_obd_reint_sync, 0,
"usage: lsync\n"},
{"cache_on", jt_obd_cache_on, 0,
#define _GNU_SOURCE
#include <getopt.h>
#include <sys/utsname.h>
+#include <pwd.h>
+#include <grp.h>
#include "obdctl.h"
#include <portals/ptlctl.h>
lmd->lmd_local_nid = PTL_NID_ANY;
lmd->lmd_port = 988; /* XXX define LUSTRE_DEFAULT_PORT */
lmd->lmd_nal = SOCKNAL;
+ lmd->lmd_nllu = 99;
+ lmd->lmd_nllg = 99;
+ strncpy(lmd->lmd_security, "null", sizeof(lmd->lmd_security));
return 0;
}
printf("mds: %s\n", lmd->lmd_mds);
printf("profile: %s\n", lmd->lmd_profile);
+ printf("sec_flavor: %s\n", lmd->lmd_security);
printf("server_nid: "LPX64"\n", lmd->lmd_server_nid);
printf("local_nid: "LPX64"\n", lmd->lmd_local_nid);
printf("nal: %d\n", lmd->lmd_nal);
return(0);
}
+/*
+ * All we do here is guarantee that the result is exactly
+ * what the user intended, with no ambiguity. Perhaps a
+ * simpler library call could do the same job for us?
+ */
+static int parse_u32(char *str, uint32_t *res)
+{
+	long id;
+	char *endptr = NULL;
+
+	id = strtol(str, &endptr, 0);
+	if (endptr == str || *endptr != 0)
+		return -1;
+
+	if (id < 0 || id == LONG_MAX)
+		return -1;
+
+	if ((uint32_t)id != (unsigned long)id)
+		return -1;
+
+	*res = (uint32_t) id;
+	return 0;
+}
+
+static int parse_nllu(struct lustre_mount_data *lmd, char *str_nllu)
+{
+ struct passwd *pass;
+
+ if (parse_u32(str_nllu, &lmd->lmd_nllu) == 0)
+ return 0;
+
+ pass = getpwnam(str_nllu);
+ if (pass == NULL)
+ return -1;
+
+ lmd->lmd_nllu = pass->pw_uid;
+ return 0;
+}
+
+static int parse_nllg(struct lustre_mount_data *lmd, char *str_nllg)
+{
+ struct group *grp;
+
+ if (parse_u32(str_nllg, &lmd->lmd_nllg) == 0)
+ return 0;
+
+ grp = getgrnam(str_nllg);
+ if (grp == NULL)
+ return -1;
+
+ lmd->lmd_nllg = grp->gr_gid;
+ return 0;
+}
+
int parse_options(char * options, struct lustre_mount_data *lmd)
{
ptl_nid_t nid = 0, cluster_id = 0;
lmd->lmd_server_nid = nid;
} else if (!strcmp(opt, "port")) {
lmd->lmd_port = val;
+ } else if (!strcmp(opt, "sec")) {
+ strncpy(lmd->lmd_security, opteq + 1,
+ sizeof(lmd->lmd_security));
+ } else if (!strcmp(opt, "nllu")) {
+ if (parse_nllu(lmd, opteq + 1)) {
+ fprintf(stderr, "%s: "
+ "can't parse user: %s\n",
+ progname, opteq + 1);
+ return (-1);
+ }
+ } else if (!strcmp(opt, "nllg")) {
+ if (parse_nllg(lmd, opteq + 1)) {
+ fprintf(stderr, "%s: "
+ "can't parse group: %s\n",
+ progname, opteq + 1);
+ return (-1);
+ }
}
} else {
val = 1;
LIBLUSTRE_MOUNT_POINT=${LIBLUSTRE_MOUNT_POINT:-"/mnt/lustre"}
LIBLUSTRE_MOUNT_TARGET=${LIBLUSTRE_MOUNT_TARGET:-"TARGET_NOT_SET"}
+LIBLUSTRE_SECURITY=${LIBLUSTRE_SECURITY:-"null"}
LIBLUSTRE_DUMPFILE=${LIBLUSTRE_DUMPFILE:-"/tmp/DUMP_FILE"}
LD_PRELOAD=${LD_PRELOAD:-"/usr/lib/liblustre.so"}
export LIBLUSTRE_MOUNT_POINT
export LIBLUSTRE_MOUNT_TARGET
+export LIBLUSTRE_SECURITY
export LIBLUSTRE_DUMPFILE
export LD_PRELOAD
#include <pwd.h>
#include <grp.h>
+#include <liblustre.h>
+#include <linux/lustre_idl.h>
+#include <linux/obd.h>
+#include <linux/lustre_mds.h>
+
/*
* return:
* 0: fail to insert (found identical)
return 1;
}
-int get_groups_local(uid_t uid, int *ngroups, gid_t **groups)
+int get_groups_local(uid_t uid, gid_t *gid, int *ngroups, gid_t **groups)
{
int maxgroups;
int i, size = 0;
if (!pw)
return -errno;
+ *gid = pw->pw_gid;
+
while ((gr = getgrent())) {
if (!gr->gr_mem)
continue;
int main (int argc, char **argv)
{
+ char *pathname = "/proc/fs/lustre/mds/lsd_downcall";
int fd, rc;
- struct {
- uint32_t err;
- uint32_t uid;
- uint32_t ngroups;
- gid_t *groups;
- } ioc_data;
- char *pathname = "/proc/fs/lustre/mds/group_info";
+ struct lsd_downcall_args ioc_data;
if (argc != 2) {
printf("bad parameter\n");
return rc;
}
- ioc_data.err = get_groups_local(ioc_data.uid, &ioc_data.ngroups, &ioc_data.groups);
+ ioc_data.err = get_groups_local(ioc_data.uid, &ioc_data.gid,
+ &ioc_data.ngroups, &ioc_data.groups);
+
+ /* FIXME get these from config file */
+ ioc_data.allow_setuid = 1;
+ ioc_data.allow_setgid = 1;
+ ioc_data.allow_setgrp = 1;
rc = write(fd, &ioc_data, sizeof(ioc_data));
return (rc != sizeof(ioc_data));
return rc;
}
+int jt_lcfg_set_security(int argc, char **argv)
+{
+        struct lustre_cfg lcfg;
+        int rc;
+
+        if (argc != 3)
+                return CMD_HELP;
+
+        if (lcfg_devname == NULL) {
+                fprintf(stderr, "%s: please use 'cfg_device name' to set the "
+                        "device name for config commands.\n",
+                        jt_cmdname(argv[0]));
+                return -EINVAL;
+        }
+
+        LCFG_INIT(lcfg, LCFG_SET_SECURITY, lcfg_devname);
+
+        /* currently only used to set on mds */
+        if (strcmp(argv[1], "mds_mds_sec") && strcmp(argv[1], "mds_ost_sec")) {
+                fprintf(stderr, "%s: invalid security key %s\n",
+                        jt_cmdname(argv[0]), argv[1]);
+                return -EINVAL;
+        }
+        if (strcmp(argv[2], "null") && strcmp(argv[2], "krb5") && strcmp(argv[2], "krb5i") && strcmp(argv[2], "krb5p")) {
+                fprintf(stderr, "%s: invalid security value %s\n",
+                        jt_cmdname(argv[0]), argv[2]);
+                return -EINVAL;
+        }
+
+        /* security key and value */
+        lcfg.lcfg_inllen1 = strlen(argv[1]) + 1;
+        lcfg.lcfg_inlbuf1 = argv[1];
+        lcfg.lcfg_inllen2 = strlen(argv[2]) + 1;
+        lcfg.lcfg_inlbuf2 = argv[2];
+
+        rc = lcfg_ioctl(argv[0], OBD_DEV_ID, &lcfg);
+        if (rc < 0) {
+                fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]),
+                        strerror(rc = errno));
+        }
+
+        return rc;
+}
return rc;
}
-int jt_obd_finish_gns(int argc, char **argv)
-{
- char *mtpt;
- int rc, fd;
- struct obd_ioctl_data data;
-
- if (argc != 2)
- return CMD_HELP;
-
- mtpt = argv[1];
-
- fd = open(mtpt, O_RDONLY);
- if (fd < 0) {
- fprintf(stderr, "open \"%s\" failed: %s\n", mtpt,
- strerror(errno));
- return -1;
- }
-
- IOC_INIT(data);
- IOC_PACK(argv[0], data);
- rc = ioctl(fd, IOC_MDC_FINISH_GNS, buf);
- if (rc < 0) {
- fprintf(stderr, "error: %s(%s) ioctl error: %s\n",
- jt_cmdname(argv[0]), mtpt, strerror(rc = errno));
- }
- close(fd);
-
- return rc;
-}
-
int jt_obd_close_uuid(int argc, char **argv)
{
int rc, nal;
int jt_obd_deactivate(int argc, char **argv);
int jt_obd_recover(int argc, char **argv);
int jt_obd_mdc_lookup(int argc, char **argv);
-int jt_obd_finish_gns(int argc, char **argv);
int jt_get_version(int argc, char **argv);
int jt_obd_close_uuid(int argc, char **argv);
int jt_cfg_record(int argc, char **argv);
int jt_lcfg_set_lustre_upcall(int argc, char **argv);
int jt_lcfg_add_conn(int argc, char **argv);
int jt_lcfg_del_conn(int argc, char **argv);
+int jt_lcfg_set_security(int argc, char **argv);
int obd_add_uuid(char *uuid, ptl_nid_t nid, int nal);