Whamcloud - gitweb
land lustre part of b_hd_sec on HEAD.
author    ericm <ericm>    Thu, 31 Mar 2005 22:18:52 +0000 (22:18 +0000)
committer ericm <ericm>    Thu, 31 Mar 2005 22:18:52 +0000 (22:18 +0000)
166 files changed:
ldiskfs/kernel_patches/patches/ext3-wantedi-2.6-suse.patch
lustre/Makefile.in
lustre/autoMakefile.am
lustre/autoconf/lustre-core.m4
lustre/cobd/cache_obd.c
lustre/include/liblustre.h
lustre/include/linux/Makefile.am
lustre/include/linux/lustre_acl.h [new file with mode: 0644]
lustre/include/linux/lustre_cfg.h
lustre/include/linux/lustre_compat25.h
lustre/include/linux/lustre_export.h
lustre/include/linux/lustre_idl.h
lustre/include/linux/lustre_import.h
lustre/include/linux/lustre_lite.h
lustre/include/linux/lustre_mds.h
lustre/include/linux/lustre_net.h
lustre/include/linux/lustre_sec.h [new file with mode: 0644]
lustre/include/linux/lustre_smfs.h
lustre/include/linux/lustre_ucache.h [new file with mode: 0644]
lustre/include/linux/lvfs.h
lustre/include/linux/obd.h
lustre/include/linux/obd_class.h
lustre/include/linux/obd_support.h
lustre/kernel_patches/patches/dcache-mds-num-2.6.7.patch
lustre/kernel_patches/patches/export-vanilla-2.6.patch [new file with mode: 0644]
lustre/kernel_patches/patches/ext3-wantedi-2.6-suse.patch
lustre/kernel_patches/patches/header_guards-vanilla-2.6.patch [new file with mode: 0644]
lustre/kernel_patches/patches/iopen-2.6-vanilla.patch
lustre/kernel_patches/patches/linux-2.6.7-CITI_NFS4_ALL-7-lsec.patch [new file with mode: 0644]
lustre/kernel_patches/patches/vfs-dcache_locking-vanilla-2.6.patch [new file with mode: 0644]
lustre/kernel_patches/patches/vfs-dcache_lustre_invalid-vanilla-2.6.patch [new file with mode: 0644]
lustre/kernel_patches/patches/vfs-do_truncate.patch [new file with mode: 0644]
lustre/kernel_patches/patches/vfs-gns_export_doumount.patch [new file with mode: 0644]
lustre/kernel_patches/patches/vfs-intent_api-vanilla-2.6.patch [new file with mode: 0644]
lustre/kernel_patches/patches/vfs-lookup_last-vanilla-2.6.patch [new file with mode: 0644]
lustre/kernel_patches/patches/vfs-pdirops-2.6.7.patch
lustre/kernel_patches/patches/vfs-raw_ops-vanilla-2.6.patch [new file with mode: 0644]
lustre/kernel_patches/patches/vfs_fmode_exec-2.6.patch [new file with mode: 0644]
lustre/kernel_patches/patches/vfs_gns-2.6-vanilla.patch [new file with mode: 0644]
lustre/kernel_patches/patches/vfs_intent-2.6-vanilla.patch
lustre/kernel_patches/series/2.6-vanilla.series
lustre/ldlm/ldlm_lib.c
lustre/ldlm/ldlm_lock.c
lustre/liblustre/Makefile.am
lustre/liblustre/dir.c
lustre/liblustre/file.c
lustre/liblustre/genlib.sh
lustre/liblustre/namei.c
lustre/liblustre/super.c
lustre/llite/dcache.c
lustre/llite/dir.c
lustre/llite/file.c
lustre/llite/llite_gns.c
lustre/llite/llite_internal.h
lustre/llite/llite_lib.c
lustre/llite/llite_nfs.c
lustre/llite/lproc_llite.c
lustre/llite/namei.c
lustre/llite/special.c
lustre/llite/super.c
lustre/llite/super25.c
lustre/llite/symlink.c
lustre/lmv/lmv_intent.c
lustre/lmv/lmv_obd.c
lustre/lmv/lmv_objmgr.c
lustre/lov/lov_obd.c
lustre/lvfs/lvfs_reint.c
lustre/mdc/autoMakefile.am
lustre/mdc/mdc_locks.c
lustre/mdc/mdc_request.c
lustre/mds/Makefile.in
lustre/mds/handler.c
lustre/mds/lproc_mds.c
lustre/mds/mds_fs.c
lustre/mds/mds_groups.c [deleted file]
lustre/mds/mds_internal.h
lustre/mds/mds_lib.c
lustre/mds/mds_lmv.c
lustre/mds/mds_lov.c
lustre/mds/mds_lsd.c [new file with mode: 0644]
lustre/mds/mds_open.c
lustre/mds/mds_reint.c
lustre/obdclass/class_obd.c
lustre/obdclass/genops.c
lustre/obdfilter/filter_log.c
lustre/osc/osc_lib.c
lustre/osc/osc_request.c
lustre/ost/ost_handler.c
lustre/ptlrpc/autoMakefile.am
lustre/ptlrpc/client.c
lustre/ptlrpc/events.c
lustre/ptlrpc/import.c
lustre/ptlrpc/lproc_ptlrpc.c
lustre/ptlrpc/niobuf.c
lustre/ptlrpc/pack_generic.c
lustre/ptlrpc/ptlrpc_internal.h
lustre/ptlrpc/ptlrpc_module.c
lustre/ptlrpc/service.c
lustre/sec/.cvsignore [new file with mode: 0644]
lustre/sec/Makefile.in [new file with mode: 0644]
lustre/sec/Makefile.mk [new file with mode: 0644]
lustre/sec/autoMakefile.am [new file with mode: 0644]
lustre/sec/doc/oss_gss_HLD.lyx [new file with mode: 0644]
lustre/sec/doc/remote_ugid_HLD.lyx [new file with mode: 0644]
lustre/sec/doc/revoke_user_HLD.lyx [new file with mode: 0644]
lustre/sec/gss/.cvsignore [new file with mode: 0644]
lustre/sec/gss/Makefile.in [new file with mode: 0644]
lustre/sec/gss/Makefile.mk [new file with mode: 0644]
lustre/sec/gss/autoMakefile.am [new file with mode: 0644]
lustre/sec/gss/gss_api.h [new file with mode: 0644]
lustre/sec/gss/gss_asn1.h [new file with mode: 0644]
lustre/sec/gss/gss_err.h [new file with mode: 0644]
lustre/sec/gss/gss_generic_token.c [new file with mode: 0644]
lustre/sec/gss/gss_internal.h [new file with mode: 0644]
lustre/sec/gss/gss_krb5.h [new file with mode: 0644]
lustre/sec/gss/gss_krb5_crypto.c [new file with mode: 0644]
lustre/sec/gss/gss_krb5_mech.c [new file with mode: 0644]
lustre/sec/gss/gss_krb5_seal.c [new file with mode: 0644]
lustre/sec/gss/gss_krb5_seqnum.c [new file with mode: 0644]
lustre/sec/gss/gss_krb5_unseal.c [new file with mode: 0644]
lustre/sec/gss/gss_krb5_wrap.c [new file with mode: 0644]
lustre/sec/gss/gss_mech_switch.c [new file with mode: 0644]
lustre/sec/gss/rawobj.c [new file with mode: 0644]
lustre/sec/gss/sec_gss.c [new file with mode: 0644]
lustre/sec/gss/svcsec_gss.c [new file with mode: 0644]
lustre/sec/sec.c [new file with mode: 0644]
lustre/sec/sec_null.c [new file with mode: 0644]
lustre/sec/svcsec.c [new file with mode: 0644]
lustre/sec/svcsec_null.c [new file with mode: 0644]
lustre/sec/upcall_cache.c [new file with mode: 0644]
lustre/smfs/dir.c
lustre/tests/acl_asroot.test [new file with mode: 0644]
lustre/tests/acl_fileutil.test [new file with mode: 0644]
lustre/tests/acl_misc.test [new file with mode: 0644]
lustre/tests/acl_mode [new file with mode: 0755]
lustre/tests/acl_perm.test [new file with mode: 0644]
lustre/tests/conf-sanity.sh
lustre/tests/gns-upcall.sh [new file with mode: 0755]
lustre/tests/insanity.sh
lustre/tests/krb5_env.sh [new file with mode: 0755]
lustre/tests/krb5_refresh_cache.sh [new file with mode: 0755]
lustre/tests/llmount.sh
lustre/tests/llmountcleanup.sh
lustre/tests/llrmount.sh
lustre/tests/lmv.sh
lustre/tests/recovery-small.sh
lustre/tests/replay-dual.sh
lustre/tests/replay-single.sh
lustre/tests/runacltest [new file with mode: 0755]
lustre/tests/sanity-gns.sh [new file with mode: 0644]
lustre/tests/sanity-lmv.sh
lustre/tests/sanity-sec.sh
lustre/tests/sanity.sh
lustre/tests/sanityN.sh
lustre/tests/setfacl.test [new file with mode: 0644]
lustre/tests/test-framework.sh
lustre/utils/.cvsignore
lustre/utils/Makefile.am
lustre/utils/lconf
lustre/utils/lctl.c
lustre/utils/llmount.c
lustre/utils/lrun
lustre/utils/lsd_upcall.c [moved from lustre/utils/l_getgroups.c with 84% similarity]
lustre/utils/lustre_cfg.c
lustre/utils/obd.c
lustre/utils/obdctl.h

diff --git a/ldiskfs/kernel_patches/patches/ext3-wantedi-2.6-suse.patch b/ldiskfs/kernel_patches/patches/ext3-wantedi-2.6-suse.patch
index a4867a5..4fd69a5 100644 (file)
@@ -5,10 +5,10 @@
  include/linux/ext3_fs.h |    5 ++++-
  5 files changed, 85 insertions(+), 6 deletions(-)
 
-Index: uml-2.6.3/fs/ext3/ialloc.c
+Index: linux-2.6.7/fs/ext3/ialloc.c
 ===================================================================
---- uml-2.6.3.orig/fs/ext3/ialloc.c    2004-02-20 15:00:48.000000000 +0800
-+++ uml-2.6.3/fs/ext3/ialloc.c 2004-02-21 00:24:45.202693776 +0800
+--- linux-2.6.7.orig/fs/ext3/ialloc.c  2005-03-24 00:27:43.282608616 +0800
++++ linux-2.6.7/fs/ext3/ialloc.c       2005-03-24 00:27:43.888516504 +0800
 @@ -420,7 +420,8 @@
   * For other inodes, search forward from the parent directory's block
   * group to find a free inode.
@@ -58,11 +58,19 @@ Index: uml-2.6.3/fs/ext3/ialloc.c
        if (S_ISDIR(mode)) {
                if (test_opt (sb, OLDALLOC))
                        group = find_group_dir(sb, dir);
-Index: uml-2.6.3/fs/ext3/ioctl.c
+Index: linux-2.6.7/fs/ext3/ioctl.c
 ===================================================================
---- uml-2.6.3.orig/fs/ext3/ioctl.c     2004-01-09 14:59:26.000000000 +0800
-+++ uml-2.6.3/fs/ext3/ioctl.c  2004-02-21 00:21:04.541239416 +0800
-@@ -24,6 +24,31 @@
+--- linux-2.6.7.orig/fs/ext3/ioctl.c   2004-06-16 13:19:13.000000000 +0800
++++ linux-2.6.7/fs/ext3/ioctl.c        2005-03-24 00:31:16.113253440 +0800
+@@ -9,6 +9,7 @@
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
++#include <linux/namei.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/time.h>
+@@ -24,6 +25,31 @@
        ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg);
  
        switch (cmd) {
@@ -93,12 +101,12 @@ Index: uml-2.6.3/fs/ext3/ioctl.c
 +      }
        case EXT3_IOC_GETFLAGS:
                flags = ei->i_flags & EXT3_FL_USER_VISIBLE;
-               return put_user(flags, (int *) arg);
-Index: uml-2.6.3/fs/ext3/namei.c
+               return put_user(flags, (int __user *) arg);
+Index: linux-2.6.7/fs/ext3/namei.c
 ===================================================================
---- uml-2.6.3.orig/fs/ext3/namei.c     2004-02-20 15:01:27.000000000 +0800
-+++ uml-2.6.3/fs/ext3/namei.c  2004-02-21 00:21:04.611228776 +0800
-@@ -1617,6 +1617,19 @@
+--- linux-2.6.7.orig/fs/ext3/namei.c   2005-03-24 00:27:43.536570008 +0800
++++ linux-2.6.7/fs/ext3/namei.c        2005-03-24 00:27:43.893515744 +0800
+@@ -1939,6 +1939,19 @@
        return err;
  }
  
@@ -118,7 +126,7 @@ Index: uml-2.6.3/fs/ext3/namei.c
  /*
   * By the time this is called, we already have created
   * the directory cache entry for the new file, but it
-@@ -1640,7 +1653,7 @@
+@@ -1963,7 +1976,7 @@
        if (IS_DIRSYNC(dir))
                handle->h_sync = 1;
  
@@ -127,7 +135,7 @@ Index: uml-2.6.3/fs/ext3/namei.c
        err = PTR_ERR(inode);
        if (!IS_ERR(inode)) {
                inode->i_op = &ext3_file_inode_operations;
-@@ -1670,7 +1683,7 @@
+@@ -1994,7 +2007,7 @@
        if (IS_DIRSYNC(dir))
                handle->h_sync = 1;
  
@@ -136,7 +144,7 @@ Index: uml-2.6.3/fs/ext3/namei.c
        err = PTR_ERR(inode);
        if (!IS_ERR(inode)) {
                init_special_inode(inode, inode->i_mode, rdev);
-@@ -1702,7 +1715,7 @@
+@@ -2027,7 +2040,7 @@
        if (IS_DIRSYNC(dir))
                handle->h_sync = 1;
  
@@ -145,7 +153,7 @@ Index: uml-2.6.3/fs/ext3/namei.c
        err = PTR_ERR(inode);
        if (IS_ERR(inode))
                goto out_stop;
-@@ -2094,7 +2107,7 @@
+@@ -2439,7 +2452,7 @@
        if (IS_DIRSYNC(dir))
                handle->h_sync = 1;
  
@@ -154,10 +162,10 @@ Index: uml-2.6.3/fs/ext3/namei.c
        err = PTR_ERR(inode);
        if (IS_ERR(inode))
                goto out_stop;
-Index: uml-2.6.3/include/linux/ext3_fs.h
+Index: linux-2.6.7/include/linux/ext3_fs.h
 ===================================================================
---- uml-2.6.3.orig/include/linux/ext3_fs.h     2004-01-09 14:59:44.000000000 +0800
-+++ uml-2.6.3/include/linux/ext3_fs.h  2004-02-21 00:21:04.613228472 +0800
+--- linux-2.6.7.orig/include/linux/ext3_fs.h   2005-03-24 00:27:43.542569096 +0800
++++ linux-2.6.7/include/linux/ext3_fs.h        2005-03-24 00:27:43.893515744 +0800
 @@ -203,6 +203,7 @@
  #define       EXT3_IOC_SETFLAGS               _IOW('f', 2, long)
  #define       EXT3_IOC_GETVERSION             _IOR('f', 3, long)
@@ -166,7 +174,7 @@ Index: uml-2.6.3/include/linux/ext3_fs.h
  #define       EXT3_IOC_GETVERSION_OLD         _IOR('v', 1, long)
  #define       EXT3_IOC_SETVERSION_OLD         _IOW('v', 2, long)
  #ifdef CONFIG_JBD_DEBUG
-@@ -707,7 +708,8 @@
+@@ -708,7 +709,8 @@
                          dx_hash_info *hinfo);
  
  /* ialloc.c */
@@ -176,7 +184,7 @@ Index: uml-2.6.3/include/linux/ext3_fs.h
  extern void ext3_free_inode (handle_t *, struct inode *);
  extern struct inode * ext3_orphan_get (struct super_block *, unsigned long);
  extern unsigned long ext3_count_free_inodes (struct super_block *);
-@@ -792,4 +794,5 @@
+@@ -793,4 +795,5 @@
  
  #endif        /* __KERNEL__ */
  
diff --git a/lustre/Makefile.in b/lustre/Makefile.in
index 1907eb1..1a5db43 100644 (file)
@@ -2,6 +2,7 @@
 
 subdir-m += lvfs
 subdir-m += obdclass
+subdir-m += sec
 subdir-m += lov
 subdir-m += lmv
 subdir-m += ptlrpc
diff --git a/lustre/autoMakefile.am b/lustre/autoMakefile.am
index a8197e1..24f80d0 100644 (file)
@@ -5,7 +5,7 @@
 
 AUTOMAKE_OPTIONS = foreign
 
-SUBDIRS = include ldiskfs lvfs obdclass lov ldlm ptlrpc      \
+SUBDIRS = include ldiskfs lvfs obdclass lov ldlm sec ptlrpc      \
        obdecho osc mdc lmv  mds obdfilter ost llite cobd ptlbd smfs snapfs \
        cmobd liblustre doc utils tests conf scripts autoconf
 
diff --git a/lustre/autoconf/lustre-core.m4 b/lustre/autoconf/lustre-core.m4
index c19132d..dae4f44 100644 (file)
@@ -317,6 +317,23 @@ AC_DEFINE_UNQUOTED(OBD_MAX_IOCTL_BUFFER, $OBD_BUFFER_SIZE, [IOCTL Buffer Size])
 ])
 
 #
+# LC_CONFIG_GSS
+#
+# whether build-in gss/krb5 capability
+#
+AC_DEFUN([LC_CONFIG_GSS],
+[AC_MSG_CHECKING([whether to enable gss/krb5 support])
+AC_ARG_ENABLE([gss],
+       AC_HELP_STRING([--enable-gss],
+                       [enable gss/krb5 support]),
+       [],[enable_gss='yes'])
+AC_MSG_RESULT([$enable_gss])
+if test x$enable_gss != xno ; then
+  AC_DEFINE(ENABLE_GSS, 1, Support GSS/krb5)
+fi
+])
+
+#
 # LC_CONFIG_SNAPFS
 #
 # Whether snapfs is desired
@@ -353,6 +370,7 @@ AC_MSG_RESULT([$enable_smfs])
 AC_DEFUN([LC_PROG_LINUX],
 [LC_CONFIG_BACKINGFS
 LC_CONFIG_PINGER
+LC_CONFIG_GSS
 LC_CONFIG_SNAPFS
 LC_CONFIG_SMFS
 
@@ -423,6 +441,7 @@ AM_CONDITIONAL(USE_QUILT, test x$QUILT != xno)
 AM_CONDITIONAL(MPITESTS, test x$enable_mpitests = xyes, Build MPI Tests)
 AM_CONDITIONAL(SNAPFS, test x$enable_snapfs = xyes)
 AM_CONDITIONAL(SMFS, test x$enable_smfs = xyes)
+AM_CONDITIONAL(GSS, test x$enable_gss = xyes)
 AM_CONDITIONAL(LIBLUSTRE, test x$enable_liblustre = xyes)
 AM_CONDITIONAL(MPITESTS, test x$enable_mpitests = xyes, Build MPI Tests)
 ])
@@ -450,7 +469,6 @@ lustre/ldiskfs/Makefile
 lustre/ldiskfs/autoMakefile
 lustre/ldlm/Makefile
 lustre/liblustre/Makefile
-lustre/liblustre/tests/Makefile
 lustre/llite/Makefile
 lustre/llite/autoMakefile
 lustre/lmv/Makefile
@@ -479,6 +497,10 @@ lustre/ptlrpc/Makefile
 lustre/ptlrpc/autoMakefile
 lustre/scripts/Makefile
 lustre/scripts/version_tag.pl
+lustre/sec/Makefile
+lustre/sec/autoMakefile
+lustre/sec/gss/Makefile
+lustre/sec/gss/autoMakefile
 lustre/smfs/Makefile
 lustre/smfs/autoMakefile
 lustre/snapfs/Makefile
diff --git a/lustre/cobd/cache_obd.c b/lustre/cobd/cache_obd.c
index dd446bd..8a28304 100644 (file)
@@ -351,7 +351,7 @@ static int cobd_precleanup(struct obd_device *obd, int flags)
 }
 
 static int cobd_getattr(struct obd_export *exp, struct obdo *oa,
-                        struct lov_stripe_md *lsm)
+                        struct lov_stripe_md *ea)
 {
         struct obd_device *obd = class_exp2obd(exp);
         struct obd_export *cobd_exp;
@@ -362,7 +362,7 @@ static int cobd_getattr(struct obd_export *exp, struct obdo *oa,
                 return -EINVAL;
         }
         cobd_exp = cobd_get_exp(obd);
-        return obd_getattr(cobd_exp, oa, lsm);
+        return obd_getattr(cobd_exp, oa, ea);
 }
 
 static int cobd_getattr_async(struct obd_export *exp,
@@ -870,8 +870,8 @@ static int  cobd_import_event(struct obd_device *obd,
 }
 
 static int cobd_md_getattr(struct obd_export *exp, struct lustre_id *id,
-                           __u64 valid, unsigned int ea_size,
-                           struct ptlrpc_request **request)
+                          __u64 valid, const char *ea_name, int ea_namelen,
+                           unsigned int ea_size, struct ptlrpc_request **request)
 {
         struct obd_device *obd = class_exp2obd(exp);
         struct obd_export *cobd_exp;
@@ -882,7 +882,7 @@ static int cobd_md_getattr(struct obd_export *exp, struct lustre_id *id,
                 return -EINVAL;
         }
         cobd_exp = cobd_get_exp(obd);
-        return md_getattr(cobd_exp, id, valid, ea_size, request);
+        return md_getattr(cobd_exp, id, valid, NULL, 0, ea_size, request);
 }
 
 static int cobd_md_req2lustre_md (struct obd_export *mdc_exp, 
diff --git a/lustre/include/liblustre.h b/lustre/include/liblustre.h
index 8f925e6..c99e6a5 100644 (file)
@@ -197,16 +197,17 @@ struct module {
         int count;
 };
 
-static inline void MODULE_AUTHOR(char *name)
-{
-        printf("%s\n", name);
-}
-#define MODULE_DESCRIPTION(name) MODULE_AUTHOR(name)
-#define MODULE_LICENSE(name) MODULE_AUTHOR(name)
+#define MODULE_AUTHOR(name)
+#define MODULE_DESCRIPTION(name)
+#define MODULE_LICENSE(name)
+
+#define module_init(init)
+#define module_exit(exit)
 
 #define THIS_MODULE NULL
 #define __init
 #define __exit
+#define __user
 
 /* devices */
 
@@ -275,6 +276,14 @@ static inline void spin_unlock_bh(spinlock_t *l) {}
 static inline void spin_lock_irqsave(spinlock_t *a, unsigned long b) {}
 static inline void spin_unlock_irqrestore(spinlock_t *a, unsigned long b) {}
 
+typedef struct { } rwlock_t;
+#define rwlock_init(x) do {} while(0)
+#define RW_LOCK_UNLOCKED (rwlock_t) {}
+#define read_lock(l)
+#define read_unlock(l)
+#define write_lock(l)
+#define write_unlock(l)
+
 #define min(x,y) ((x)<(y) ? (x) : (y))
 #define max(x,y) ((x)>(y) ? (x) : (y))
 
@@ -287,6 +296,10 @@ static inline void spin_unlock_irqrestore(spinlock_t *a, unsigned long b) {}
        ({ type __x = (x); type __y = (y); __x > __y ? __x: __y; })
 #endif
 
+#define container_of(ptr, type, member) ({                      \
+        const typeof( ((type *)0)->member ) *__mptr = (ptr);    \
+        (type *)( (char *)__mptr - offsetof(type,member) );})
+
 /* registering symbols */
 
 #define ERESTARTSYS ERESTART
@@ -313,6 +326,12 @@ static inline int copy_to_user(void *a,void *b, int c)
         return 0;
 }
 
+static inline long strncpy_from_user(char *dest, const char *src, long n)
+{
+        char *s;
+        s = strncpy(dest, src, n);
+        return strnlen(s, n);
+}
 
 /* slabs */
 typedef struct {
@@ -427,7 +446,7 @@ static inline struct page* __grab_cache_page(unsigned long index)
 #define ATTR_ATTR_FLAG  0x0400
 #define ATTR_RAW        0x0800  /* file system, not vfs will massage attrs */
 #define ATTR_FROM_OPEN  0x1000  /* called from open path, ie O_TRUNC */
-#define ATTR_CTIME_SET  0x2000
+/* ATTR_CTIME_SET has been defined in lustre_idl.h */
 
 struct iattr {
         unsigned int    ia_valid;
@@ -457,25 +476,28 @@ struct iattr {
 
 #define INTENT_MAGIC 0x19620323
 
-struct lustre_intent_data {
-        int       it_disposition;
-        int       it_status;
-        __u64     it_lock_handle;
-        void     *it_data;
-        int       it_lock_mode;
-        int it_int_flags;
-};
 struct lookup_intent {
         int     it_magic;
         void    (*it_op_release)(struct lookup_intent *);
         int     it_op;
         int     it_flags;
         int     it_create_mode;
-        union {
-                struct lustre_intent_data lustre;
-        } d;
+       union {
+               void *fs_data; /* FS-specific intent data */
+       } d;
 };
 
+struct lustre_intent_data {
+        int     it_disposition;
+        int     it_status;
+        __u64   it_lock_handle;
+        void    *it_data;
+        int     it_lock_mode;
+        int     it_int_flags;
+};
+
+#define LUSTRE_IT(it) ((struct lustre_intent_data *)((it)->d.fs_data))
+
 static inline void intent_init(struct lookup_intent *it, int op, int flags)
 {
         memset(it, 0, sizeof(*it));
@@ -543,6 +565,8 @@ struct task_struct {
         struct signal pending;
         char comm[32];
         int pid;
+        uid_t uid;
+        gid_t gid;
         int fsuid;
         int fsgid;
         int max_groups;
@@ -625,6 +649,14 @@ static inline int schedule_timeout(signed long t)
 #define time_after(a, b) ((long)(b) - (long)(a) < 0)
 #define time_before(a, b) time_after(b,a)
 
+static inline unsigned long get_seconds(void)
+{
+        struct timeval tv;
+
+        gettimeofday(&tv, NULL);
+        return (tv.tv_sec + tv.tv_usec / 1000000);
+}
+
 struct timer_list {
         struct list_head tl_list;
         void (*function)(unsigned long unused);
diff --git a/lustre/include/linux/Makefile.am b/lustre/include/linux/Makefile.am
index d187775..fc1017d 100644 (file)
@@ -15,4 +15,5 @@ EXTRA_DIST = lprocfs_status.h lustre_debug.h lustre_ha.h lustre_lib.h \
   lustre_export.h lustre_log.h obd_echo.h obd_ptlbd.h obd_trace.h \
   lustre_compat25.h lustre_fsfilt.h lustre_import.h lustre_mds.h obd.h \
   lvfs.h lvfs_linux.h lustre_cfg.h lustre_lite.h  lustre_idl.h lustre_smfs.h \
-  lustre_cmobd.h obd_lmv.h lustre_snap.h
+  lustre_cmobd.h obd_lmv.h lustre_snap.h lustre_sec.h lustre_ucache.h \
+  lustre_acl.h
diff --git a/lustre/include/linux/lustre_acl.h b/lustre/include/linux/lustre_acl.h
new file mode 100644 (file)
index 0000000..2267997
--- /dev/null
@@ -0,0 +1,36 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- 
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (C) 2002, 2003 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */ 
+
+#ifndef _LUSTRE_ACL_H_
+#define _LUSTRE_ACL_H_
+
+#include <linux/xattr_acl.h>
+
+/*
+* the value of LL_ACL_MAX_ENTRIES and LL_ACL_NOT_CACHED should be 
+* kept step with related definition in ext3 (EXT3_ACL_MAX_ENTRIES and
+* EXT3_ACL_NOT_CACHED)
+*/
+#define LL_ACL_MAX_ENTRIES      32      // EXT3_ACL_MAX_ENTRIES
+#define LL_ACL_NOT_CACHED       ((void *)-1) //EXT3_ACL_NOT_CACHED
+
+#endif
diff --git a/lustre/include/linux/lustre_cfg.h b/lustre/include/linux/lustre_cfg.h
index fe446e5..3f2038f 100644 (file)
@@ -40,6 +40,7 @@ enum lcfg_command_type {
         LCFG_LOV_DEL_OBD    = 0x00cf00c,
         LCFG_ADD_CONN       = 0x00cf00d,
         LCFG_DEL_CONN       = 0x00cf00e,
+        LCFG_SET_SECURITY   = 0x00cf00f,
 };
 
 struct lustre_cfg {
@@ -279,6 +280,9 @@ struct lustre_mount_data {
         uint32_t lmd_nal;
         uint32_t lmd_server_ipaddr;
         uint32_t lmd_port;
+        uint32_t lmd_nllu;
+        uint32_t lmd_nllg;
+        char     lmd_security[16];
         char     lmd_mds[64];
         char     lmd_profile[64];
 };
diff --git a/lustre/include/linux/lustre_compat25.h b/lustre/include/linux/lustre_compat25.h
index 711f282..03a88a4 100644 (file)
@@ -99,6 +99,16 @@ static inline int cleanup_group_info(void)
 
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)
 
+/* New (actually old) intent naming */
+#define lookup_intent open_intent
+
+/* And internals */
+#define it_flags flags
+#define it_op op
+#define it_magic magic
+#define it_op_release op_release
+#define it_create_mode create_mode
+
 /*
  * OBD need working random driver, thus all our
  * initialization routines must be called after device
diff --git a/lustre/include/linux/lustre_export.h b/lustre/include/linux/lustre_export.h
index 525110d..2e4e760 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/lustre_dlm.h>
 
 struct mds_client_data;
+struct mds_idmap_table;
 
 struct mds_export_data {
         struct list_head        med_open_head;
@@ -21,6 +22,12 @@ struct mds_export_data {
         struct mds_client_data *med_mcd;
         loff_t                  med_off;
         int                     med_idx;
+        unsigned int            med_local:1;
+        __u32                   med_nllu;
+        __u32                   med_nllg;
+        /* simple idmapping */
+        spinlock_t              med_idmap_lock;
+        struct mds_idmap_table *med_idmap;
 };
 
 struct osc_creator {
diff --git a/lustre/include/linux/lustre_idl.h b/lustre/include/linux/lustre_idl.h
index 198f89c..184572f 100644 (file)
@@ -361,6 +361,7 @@ struct lov_mds_md_v0 {            /* LOV EA mds/wire data (little-endian) */
 #define OBD_MD_FLUID    (0x0000000000000200LL)    /* user ID */
 #define OBD_MD_FLGID    (0x0000000000000400LL)    /* group ID */
 #define OBD_MD_FLFLAGS  (0x0000000000000800LL)    /* flags word */
+#define OBD_MD_FLEA     (0x0000000000001000LL)    /* extended attributes */
 #define OBD_MD_FLNLINK  (0x0000000000002000LL)    /* link count */
 #define OBD_MD_FLGENER  (0x0000000000004000LL)    /* generation number */
 #define OBD_MD_FLINLINE (0x0000000000008000LL)    /* inline data */
@@ -380,12 +381,15 @@ struct lov_mds_md_v0 {            /* LOV EA mds/wire data (little-endian) */
 #define OBD_MD_FLDIREA  (0x0000000020000000LL)    /* dir's extended attribute data */
 #define OBD_MD_REINT    (0x0000000040000000LL)    /* reintegrate oa */
 #define OBD_MD_FID      (0x0000000080000000LL)    /* lustre_id data */
+#define OBD_MD_FLEALIST (0x0000000100000000LL)    /* list extended attributes */
+#define OBD_MD_FLACL_ACCESS (0x0000000200000000LL) /*access acl*/
 
 #define OBD_MD_FLNOTOBD (~(OBD_MD_FLBLOCKS | OBD_MD_LINKNAME |          \
                            OBD_MD_FLEASIZE | OBD_MD_FLHANDLE |          \
                            OBD_MD_FLCKSUM | OBD_MD_FLQOS |              \
                            OBD_MD_FLOSCOPQ | OBD_MD_FLCOOKIE |          \
-                           OBD_MD_MDS))
+                           OBD_MD_FLEA | OBD_MD_FLEALIST |              \
+                           OBD_MD_FLACL_ACCESS | OBD_MD_MDS))
 
 static inline struct lustre_handle *obdo_handle(struct obdo *oa)
 {
@@ -487,10 +491,6 @@ extern void lustre_swab_ost_lvb(struct ost_lvb *);
 
 /* 
  * security descriptor in mds request
- *
- * note gid & cap might need be removed later:
- *  - cap should be obtained on mds
- *  - gid is actually not used.
  */
 struct mds_req_sec_desc {
         __u32           rsd_uid;
@@ -635,6 +635,7 @@ struct lustre_md {
         struct mds_body *body;
         struct lov_stripe_md *lsm;
         struct mea *mea;
+        struct posix_acl *acl_access;
 };
 
 struct mdc_op_data {
@@ -666,11 +667,21 @@ struct mds_rec_setattr {
         __u64            sa_ctime;
 };
 
-/* Remove this once we declare it in include/linux/fs.h (v21 kernel patch?) */
-#ifndef ATTR_CTIME_SET
-#define ATTR_CTIME_SET 0x2000
+/* XXX Following ATTR_XXX should go to vfs patch...  */
+#ifdef ATTR_CTIME_SET
+#error "ATTR_CTIME_SET has been defined somewhere else"
+#endif
+#ifdef ATTR_EA
+#error "ATTR_EA has been defined somewhere else"
+#endif
+#ifdef ATTR_EA_RM
+#error "ATTR_EA_RM has been defined somewhere else"
 #endif
 
+#define ATTR_CTIME_SET  0x00002000
+#define ATTR_EA         0x00040000
+#define ATTR_EA_RM      0x00080000
+
 extern void lustre_swab_mds_rec_setattr (struct mds_rec_setattr *sa);
 
 #ifndef FMODE_READ
@@ -1116,4 +1127,13 @@ static inline struct lustre_id *obdo_id(struct obdo *oa)
         return (struct lustre_id *)raw_id;
 }
 
+/* security negotiate */
+typedef enum {
+        SEC_INIT                = 600,
+        SEC_INIT_CONTINUE       = 601,
+        SEC_FINI                = 602,
+        SEC_LAST_OPC
+} sec_cmd_t;
+#define SEC_FIRST_OPC SEC_INIT
+
 #endif
diff --git a/lustre/include/linux/lustre_import.h b/lustre/include/linux/lustre_import.h
index e7230d0..d3c182c 100644 (file)
@@ -46,6 +46,8 @@ enum obd_import_event {
         IMP_EVENT_ACTIVE     = 0x808004,
 };
 
+struct ptlrpc_sec;
+
 struct obd_import_conn {
         struct list_head          oic_item;
         struct ptlrpc_connection *oic_conn;
@@ -53,7 +55,6 @@ struct obd_import_conn {
         unsigned long             oic_last_attempt; /* in jiffies */
 };
 
-
 struct obd_import {
         struct portals_handle     imp_handle;
         atomic_t                  imp_refcount;
@@ -70,7 +71,11 @@ struct obd_import {
         struct list_head          imp_sending_list;
         struct list_head          imp_delayed_list;
 
+        /* list of ongoing raw rpcs (only used by gss) */
+        struct list_head          imp_rawrpc_list;
+
         struct obd_device        *imp_obd;
+        struct ptlrpc_sec        *imp_sec;
         wait_queue_head_t         imp_recovery_waitq;
         __u64                     imp_last_replay_transno;
         atomic_t                  imp_inflight;
diff --git a/lustre/include/linux/lustre_lite.h b/lustre/include/linux/lustre_lite.h
index bd8341b..866d429 100644 (file)
@@ -36,6 +36,8 @@
 #include <linux/rbtree.h>
 #include <linux/lustre_compat25.h>
 #include <linux/pagemap.h>
+#include <linux/namei.h>
+
 
 /* careful, this is easy to screw up */
 #define PAGE_CACHE_MAXBYTES ((__u64)(~0UL) << PAGE_CACHE_SHIFT)
@@ -45,7 +47,7 @@
 static inline struct lookup_intent *ll_nd2it(struct nameidata *nd)
 {
 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
-        return &nd->intent;
+        return &nd->intent.open;
 #else
         return nd->intent;
 #endif
@@ -96,6 +98,7 @@ struct ll_inode_info {
 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
         struct inode            lli_vfs_inode;
 #endif
+        struct posix_acl       *lli_acl_access;
 };
 
 // FIXME: replace the name of this with LL_I to conform to kernel stuff
@@ -140,8 +143,19 @@ enum {
 
          LPROC_LL_DIRECT_READ,
          LPROC_LL_DIRECT_WRITE,
-         LPROC_LL_FILE_OPCODES
+         LPROC_LL_SETXATTR,
+         LPROC_LL_GETXATTR,
+         LPROC_LL_FILE_OPCODES,
+};
+
+struct lustre_intent_data {
+        int     it_disposition;
+        int     it_status;
+        __u64   it_lock_handle;
+        void    *it_data;
+        int     it_lock_mode;
 };
+#define LUSTRE_IT(it) ((struct lustre_intent_data *)((it)->d.fs_data))
 
 static inline void
 ll_inode2id(struct lustre_id *id, struct inode *inode)
diff --git a/lustre/include/linux/lustre_mds.h b/lustre/include/linux/lustre_mds.h
index d918380..da6aafe 100644 (file)
@@ -38,6 +38,7 @@
 #include <linux/lustre_dlm.h>
 #include <linux/lustre_log.h>
 #include <linux/lustre_export.h>
+#include <linux/lustre_ucache.h>
 
 struct ldlm_lock_desc;
 struct mds_obd;
@@ -62,8 +63,10 @@ struct mds_update_record {
         char               *ur_tgt;
         int                 ur_eadatalen;
         void               *ur_eadata;
-        int                 ur_cookielen;
-        struct llog_cookie *ur_logcookies;
+        int                 ur_ea2datalen;
+        void               *ur_ea2data;
+        int                 ur_cookielen;       /* obsolete? */
+        struct llog_cookie *ur_logcookies;      /* obsolete? */
         struct iattr        ur_iattr;
         struct lvfs_ucred   ur_uc;
         __u64               ur_rdev;
@@ -130,6 +133,19 @@ struct mds_client_data {
         __u8 mcd_padding[MDS_LR_CLIENT_SIZE - 64];
 };
 
+/* simple uid/gid mapping hash table */
+struct mds_idmap_item {
+        struct list_head        hash;
+        __u32                   id1;
+        __u32                   id2;
+};
+
+#define MDS_IDMAP_HASHSIZE      (32)
+struct mds_idmap_table {
+        struct list_head uidmap[MDS_IDMAP_HASHSIZE];
+        struct list_head gidmap[MDS_IDMAP_HASHSIZE];
+};
+
 /* file data for open files on MDS */
 struct mds_file_data {
         struct portals_handle mfd_handle; /* must be first */
@@ -166,6 +182,32 @@ struct mds_grp_hash {
         unsigned int            gh_allow_setgroups:1;
 };
 
+/* lustre security descriptor */
+struct lustre_sec_desc {
+        uid_t                   lsd_uid;
+        gid_t                   lsd_gid;
+        struct group_info      *lsd_ginfo;
+        unsigned int            lsd_allow_setuid:1,
+                                lsd_allow_setgid:1,
+                                lsd_allow_setgrp:1;
+};
+
+struct lsd_cache_entry {
+        struct upcall_cache_entry     base;
+        struct lustre_sec_desc        lsd;
+};
+
+struct lsd_downcall_args {
+        int     err;
+        uid_t   uid;
+        gid_t   gid;
+        __u32   ngroups;
+        gid_t  *groups;
+        __u32   allow_setuid;
+        __u32   allow_setgid;
+        __u32   allow_setgrp;
+};
+
 /* mds/mds_reint.c  */
 int mds_reint_rec(struct mds_update_record *r, int offset,
                   struct ptlrpc_request *req, struct lustre_handle *);
@@ -224,8 +266,8 @@ int mdc_req2lustre_md(struct obd_export *exp_lmv, struct ptlrpc_request *req,
                       struct lustre_md *md);
 int mdc_getstatus(struct obd_export *exp, struct lustre_id *rootid);
 int mdc_getattr(struct obd_export *exp, struct lustre_id *id,
-                __u64 valid, unsigned int ea_size,
-                struct ptlrpc_request **request);
+                __u64 valid, const char *ea_name, int ea_namelen,
+                unsigned int ea_size, struct ptlrpc_request **request);
 int mdc_getattr_lock(struct obd_export *exp, struct lustre_id *id,
                      char *filename, int namelen, __u64 valid,
                      unsigned int ea_size, struct ptlrpc_request **request);
diff --git a/lustre/include/linux/lustre_net.h b/lustre/include/linux/lustre_net.h
index d938260..019e1de 100644 (file)
@@ -241,6 +241,9 @@ struct ptlrpc_cb_id {
         void    *cbid_arg;                      /* additional arg */
 };
 
+struct ptlrpc_cred;
+struct ptlrpc_svcsec;
+
 #define RS_MAX_LOCKS 4
 #define RS_DEBUG     1
 
@@ -259,7 +262,15 @@ struct ptlrpc_reply_state {
         unsigned int          rs_handled:1;     /* been handled yet? */
         unsigned int          rs_on_net:1;      /* reply_out_callback pending? */
 
-        int                   rs_size;
+        struct ptlrpc_svcsec *rs_svcsec;
+        char                 *rs_buf;           /* backend buffer */
+        int                   rs_buf_len;       /* backend buffer length */
+        char                 *rs_repbuf;        /* will be sent on wire */
+        int                   rs_repbuf_len;    /* max on-wire data length */
+        int                   rs_repdata_len;   /* actual on-wire data length */
+        struct lustre_msg    *rs_msg;           /* lustre msg pointer */
+        int                   rs_msg_len;       /* length of lustre msg */
+
         __u64                 rs_transno;
         __u64                 rs_xid;
         struct obd_export    *rs_export;
@@ -271,9 +282,6 @@ struct ptlrpc_reply_state {
         struct lustre_handle  rs_locks[RS_MAX_LOCKS];
         ldlm_mode_t           rs_modes[RS_MAX_LOCKS];
         struct llog_create_locks *rs_llog_locks;
-
-        /* last member: variable sized reply message */
-        struct lustre_msg     rs_msg;
 };
 
 struct ptlrpc_request {
@@ -285,7 +293,8 @@ struct ptlrpc_request {
         unsigned int rq_intr:1, rq_replied:1, rq_err:1,
                 rq_timedout:1, rq_resend:1, rq_restart:1, rq_replay:1,
                 rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1,
-                rq_no_delay:1, rq_net_err:1;
+                rq_no_delay:1, rq_net_err:1, rq_req_wrapped:1,
+                rq_ptlrpcs_restart:1;
         int rq_phase;
         /* client-side refcount for SENT race */
         atomic_t rq_refcount;
@@ -306,6 +315,20 @@ struct ptlrpc_request {
         __u64 rq_xid;
         struct list_head rq_replay_list;
 
+        struct ptlrpc_cred   *rq_cred;        /* client side credit */
+        struct ptlrpc_svcsec *rq_svcsec;      /* server side security */
+        /* XXX temporarily put here XXX */
+        void                 *rq_sec_svcdata; /* server security data */
+        unsigned int          rq_remote;      /* from remote client */
+        uid_t                 rq_auth_uid;
+
+        char *rq_reqbuf;       /* backend request buffer */
+        int   rq_reqbuf_len;   /* backend request buffer length */
+        int   rq_reqdata_len;  /* actual request data length */
+        char *rq_repbuf;       /* backend reply buffer */
+        int   rq_repbuf_len;   /* backend reply buffer length */
+        int   rq_repdata_len;  /* actual reply data length, not used yet */
+
 #if SWAB_PARANOIA
         __u32 rq_req_swab_mask;
         __u32 rq_rep_swab_mask;
@@ -574,6 +597,8 @@ int ptlrpc_error(struct ptlrpc_request *req);
 void ptlrpc_resend_req(struct ptlrpc_request *request);
 int ptl_send_rpc(struct ptlrpc_request *request);
 int ptlrpc_register_rqbd (struct ptlrpc_request_buffer_desc *rqbd);
+int ptlrpc_do_rawrpc(struct obd_import *imp, char *reqbuf, int reqlen,
+                     char *repbuf, int *replenp, int timeout);
 
 /* ptlrpc/client.c */
 void ptlrpc_init_client(int req_portal, int rep_portal, char *name,
diff --git a/lustre/include/linux/lustre_sec.h b/lustre/include/linux/lustre_sec.h
new file mode 100644 (file)
index 0000000..e1e866c
--- /dev/null
@@ -0,0 +1,360 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef __LINUX_SEC_H_
+#define __LINUX_SEC_H_
+
+/* forward declaration */
+struct obd_import;
+struct ptlrpc_request;
+struct ptlrpc_cred;
+struct ptlrpc_credops;
+struct ptlrpc_sec;
+struct ptlrpc_secops;
+
+#define PTLRPC_SEC_MAX_FLAVORS   (4)
+
+typedef struct ptlrpcs_flavor_s {
+        __u32   flavor;
+        __u32   subflavor;
+} ptlrpcs_flavor_t;
+
+enum ptlrpcs_security_type {
+        PTLRPC_SEC_TYPE_NONE    = 0,    /* no security */
+        PTLRPC_SEC_TYPE_AUTH    = 1,    /* authentication */
+        PTLRPC_SEC_TYPE_PRIV    = 2,    /* privacy */
+};
+
+/*
+ * This header is prepended at any on-wire ptlrpc packets
+ */
+struct ptlrpcs_wire_hdr {
+        __u32   flavor;
+        __u32   sectype;
+        __u32   msg_len;
+        __u32   sec_len;
+};
+
+static inline
+struct ptlrpcs_wire_hdr *buf_to_sec_hdr(void *buf)
+{
+        return (struct ptlrpcs_wire_hdr *) buf;
+}
+
+static inline
+struct lustre_msg *buf_to_lustre_msg(void *buf)
+{
+        return (struct lustre_msg *)
+               ((char *) buf + sizeof(struct ptlrpcs_wire_hdr));
+}
+
+static inline
+__u8 *buf_to_sec_data(void *buf)
+{
+        struct ptlrpcs_wire_hdr *hdr = buf_to_sec_hdr(buf);
+        return (__u8 *) (buf + sizeof(*hdr) + hdr->msg_len);
+}
+
+enum ptlrpcs_flavors {
+        PTLRPC_SEC_NULL = 0,
+        PTLRPC_SEC_GSS  = 1,
+};
+
+#define PTLRPC_SEC_GSS_VERSION (1)
+
+enum ptlrpcs_gss_subflavors {
+        PTLRPC_SEC_GSS_KRB5  = 0,
+        PTLRPC_SEC_GSS_KRB5I = 1,
+        PTLRPC_SEC_GSS_KRB5P = 2,
+};
+
+enum ptlrpcs_gss_proc {
+        PTLRPC_GSS_PROC_DATA =          0,
+        PTLRPC_GSS_PROC_INIT =          1,
+        PTLRPC_GSS_PROC_CONTINUE_INIT = 2,
+        PTLRPC_GSS_PROC_DESTROY =       3,
+        PTLRPC_GSS_PROC_ERR =           4,
+};
+                                                                                                                        
+enum ptlrpcs_gss_svc {
+        PTLRPC_GSS_SVC_NONE =           1,
+        PTLRPC_GSS_SVC_INTEGRITY =      2,
+        PTLRPC_GSS_SVC_PRIVACY =        3,
+};
+
+enum ptlrpcs_error {
+        PTLRPCS_OK =                    0,
+        PTLRPCS_BADCRED =               1,
+        PTLRPCS_REJECTEDCRED =          2,
+        PTLRPCS_BADVERF =               3,
+        PTLRPCS_REJECTEDVERF =          4,
+        PTLRPCS_TOOWEAK =               5,
+        /* GSS errors */
+        PTLRPCS_GSS_CREDPROBLEM =       13,
+        PTLRPCS_GSS_CTXPROBLEM =        14,
+};
+
+struct vfs_cred {
+        __u64   vc_pag;
+        uid_t   vc_uid;
+        gid_t   vc_gid;
+        struct group_info *vc_ginfo;
+};
+
+struct ptlrpc_credops {
+        int     (*refresh)(struct ptlrpc_cred *cred);
+        int     (*match)  (struct ptlrpc_cred *cred,
+                           struct ptlrpc_request *req,
+                           struct vfs_cred *vcred);
+        int     (*sign)   (struct ptlrpc_cred *cred, struct ptlrpc_request *req);
+        int     (*verify) (struct ptlrpc_cred *cred, struct ptlrpc_request *req);
+        int     (*seal)   (struct ptlrpc_cred *cred, struct ptlrpc_request *req);
+        int     (*unseal) (struct ptlrpc_cred *cred, struct ptlrpc_request *req);
+        void    (*destroy)(struct ptlrpc_cred *cred);
+};
+
+#define PTLRPC_CRED_UPTODATE    0x00000001
+#define PTLRPC_CRED_DEAD        0x00000002
+
+struct ptlrpc_cred {
+        struct list_head        pc_hash;   /* linked into hash table */
+        atomic_t                pc_refcount;
+        struct ptlrpc_sec      *pc_sec;
+        struct ptlrpc_credops  *pc_ops;
+        struct ptlrpc_request  *pc_req;
+        unsigned long           pc_expire;
+        int                     pc_flags;
+        /* XXX maybe should not be here */
+        __u64                   pc_pag;
+        uid_t                   pc_uid;
+};
+
+struct ptlrpc_secops {
+        struct ptlrpc_sec *   (*create_sec)    (ptlrpcs_flavor_t *flavor,
+                                                const char *pipe_dir,
+                                                void *pipe_data);
+        void                  (*destroy_sec)   (struct ptlrpc_sec *sec);
+        struct ptlrpc_cred *  (*create_cred)   (struct ptlrpc_sec *sec,
+                                                struct ptlrpc_request *req,
+                                                struct vfs_cred *vcred);
+        /* buffer manipulation */
+        int                   (*alloc_reqbuf)  (struct ptlrpc_sec *sec,
+                                                struct ptlrpc_request *req,
+                                                int lustre_msg_size);
+        int                   (*alloc_repbuf)  (struct ptlrpc_sec *sec,
+                                                struct ptlrpc_request *req,
+                                                int lustre_msg_size);
+        void                  (*free_reqbuf)   (struct ptlrpc_sec *sec,
+                                                struct ptlrpc_request *req);
+        void                  (*free_repbuf)   (struct ptlrpc_sec *sec,
+                                                struct ptlrpc_request *req);
+        /* security payload size estimation */
+        int                   (*est_req_payload)(struct ptlrpc_sec *sec,
+                                                 int msgsize);
+        int                   (*est_rep_payload)(struct ptlrpc_sec *sec,
+                                                 int msgsize);
+};
+
+struct ptlrpc_sec_type {
+        struct module          *pst_owner;
+        char                   *pst_name;
+        atomic_t                pst_inst;       /* instance, debug only */
+        ptlrpcs_flavor_t        pst_flavor;
+        struct ptlrpc_secops   *pst_ops;
+};
+
+#define PTLRPC_CREDCACHE_NR     8
+#define PTLRPC_CREDCACHE_MASK   (PTLRPC_CREDCACHE_NR - 1)
+
+struct ptlrpc_sec {
+        struct ptlrpc_sec_type *ps_type;
+        struct list_head        ps_credcache[PTLRPC_CREDCACHE_NR];
+        spinlock_t              ps_lock;        /* protect cred cache */
+        __u32                   ps_sectype;
+        ptlrpcs_flavor_t        ps_flavor;
+        atomic_t                ps_refcount;
+        atomic_t                ps_credcount;
+        struct obd_import      *ps_import;
+        /* actual security model need initialize following fields */
+        unsigned long           ps_expire;      /* cache expire interval */
+        unsigned long           ps_nextgc;      /* next gc time */
+        unsigned int            ps_flags;
+};
+
+/* sec.c */
+int  ptlrpcs_register(struct ptlrpc_sec_type *type);
+int  ptlrpcs_unregister(struct ptlrpc_sec_type *type);
+
+struct ptlrpc_sec * ptlrpcs_sec_create(ptlrpcs_flavor_t *flavor,
+                                       struct obd_import *import,
+                                       const char *pipe_dir,
+                                       void *pipe_data);
+void ptlrpcs_sec_put(struct ptlrpc_sec *sec);
+void ptlrpcs_sec_invalidate_cache(struct ptlrpc_sec *sec);
+
+struct ptlrpc_cred * ptlrpcs_cred_lookup(struct ptlrpc_sec *sec,
+                                         struct vfs_cred *vcred);
+void ptlrpcs_cred_put(struct ptlrpc_cred *cred, int sync);
+
+static inline void ptlrpcs_cred_get(struct ptlrpc_cred *cred)
+{
+        LASSERT(atomic_read(&cred->pc_refcount));
+        atomic_inc(&cred->pc_refcount);
+}
+
+static inline int ptlrpcs_cred_is_uptodate(struct ptlrpc_cred *cred)
+{
+        LASSERT(cred);
+        LASSERT(atomic_read(&cred->pc_refcount));
+        return (cred->pc_flags & PTLRPC_CRED_UPTODATE);
+}
+static inline int ptlrpcs_cred_refresh(struct ptlrpc_cred *cred)
+{
+        LASSERT(cred);
+        LASSERT(atomic_read(&cred->pc_refcount));
+        LASSERT(cred->pc_ops);
+        LASSERT(cred->pc_ops->refresh);
+        return cred->pc_ops->refresh(cred);
+}
+static inline void ptlrpcs_cred_die(struct ptlrpc_cred *cred)
+{
+        LASSERT(atomic_read(&cred->pc_refcount));
+        LASSERT(cred->pc_sec);
+        if (!(cred->pc_flags & PTLRPC_CRED_DEAD)) {
+                spin_lock(&cred->pc_sec->ps_lock);
+                cred->pc_flags |= PTLRPC_CRED_DEAD;
+                cred->pc_flags &= ~PTLRPC_CRED_UPTODATE;
+                list_del_init(&cred->pc_hash);
+                spin_unlock(&cred->pc_sec->ps_lock);
+        }
+}
+static inline int ptlrpcs_cred_is_dead(struct ptlrpc_cred *cred)
+{
+        return(cred->pc_flags & PTLRPC_CRED_DEAD);
+}
+
+static inline int ptlrpcs_est_req_payload(struct ptlrpc_sec *sec,
+                                          int datasize)
+{
+        struct ptlrpc_secops *ops;
+
+        LASSERT(sec);
+        LASSERT(sec->ps_type);
+        LASSERT(sec->ps_type->pst_ops);
+
+        ops = sec->ps_type->pst_ops;
+        if (ops->est_req_payload)
+                return ops->est_req_payload(sec, datasize);
+        else
+                return 0;
+}
+
+static inline int ptlrpcs_est_rep_payload(struct ptlrpc_sec *sec,
+                                          int datasize)
+{
+        struct ptlrpc_secops *ops;
+
+        LASSERT(sec);
+        LASSERT(sec->ps_type);
+        LASSERT(sec->ps_type->pst_ops);
+
+        ops = sec->ps_type->pst_ops;
+        if (ops->est_rep_payload)
+                return ops->est_rep_payload(sec, datasize);
+        else
+                return 0;
+}
+
+int ptlrpcs_cli_wrap_request(struct ptlrpc_request *req);
+int ptlrpcs_cli_unwrap_reply(struct ptlrpc_request *req);
+int ptlrpcs_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize);
+int ptlrpcs_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize);
+void ptlrpcs_cli_free_reqbuf(struct ptlrpc_request *req);
+void ptlrpcs_cli_free_repbuf(struct ptlrpc_request *req);
+
+/* higher interface */
+int  ptlrpcs_import_get_sec(struct obd_import *imp);
+void ptlrpcs_import_drop_sec(struct obd_import *imp);
+int  ptlrpcs_req_get_cred(struct ptlrpc_request *req);
+void ptlrpcs_req_drop_cred(struct ptlrpc_request *req);
+int  ptlrpcs_req_replace_dead_cred(struct ptlrpc_request *req);
+int  ptlrpcs_req_refresh_cred(struct ptlrpc_request *req);
+
+/* internal helpers */
+int sec_alloc_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req,
+                     int msgsize, int secsize);
+void sec_free_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req);
+
+/* sec_null.c */
+int ptlrpcs_null_init(void);
+int ptlrpcs_null_exit(void);
+
+/**********************************************************
+ * Server side stuff
+ **********************************************************/
+
+struct ptlrpc_reply_state;
+
+struct ptlrpc_svcsec {
+        struct module           *pss_owner;
+        char                    *pss_name;
+        ptlrpcs_flavor_t         pss_flavor;
+        int                      pss_sec_size;
+
+        int                    (*accept)      (struct ptlrpc_request *req,
+                                               enum ptlrpcs_error *res);
+        int                    (*authorize)   (struct ptlrpc_request *req);
+        int                    (*alloc_repbuf)(struct ptlrpc_svcsec *svcsec,
+                                               struct ptlrpc_request *req,
+                                               int msgsize);
+        void                   (*free_repbuf) (struct ptlrpc_svcsec *svcsec,
+                                               struct ptlrpc_reply_state *rs);
+        void                   (*cleanup_req) (struct ptlrpc_svcsec *svcsec,
+                                               struct ptlrpc_request *req);
+};
+
+#define SVC_OK          1
+#define SVC_COMPLETE    2
+#define SVC_DROP        3
+#define SVC_LOGIN       4
+#define SVC_LOGOUT      5
+
+int svcsec_register(struct ptlrpc_svcsec *ss);
+int svcsec_unregister(struct ptlrpc_svcsec *ss);
+int svcsec_accept(struct ptlrpc_request *req, enum ptlrpcs_error *res);
+int svcsec_authorize(struct ptlrpc_request *req);
+int svcsec_alloc_repbuf(struct ptlrpc_svcsec *svcsec,
+                        struct ptlrpc_request *req, int msgsize);
+void svcsec_cleanup_req(struct ptlrpc_request *req);
+
+struct ptlrpc_svcsec * svcsec_get(struct ptlrpc_svcsec *sec);
+void svcsec_put(struct ptlrpc_svcsec *sec);
+
+/* internal helpers */
+int svcsec_alloc_reply_state(struct ptlrpc_request *req,
+                             int msgsize, int secsize);
+void svcsec_free_reply_state(struct ptlrpc_reply_state *rs);
+
+/* svcsec_null.c */
+int svcsec_null_init(void);
+int svcsec_null_exit(void);
+
+#endif /* __LINUX_SEC_H_ */
diff --git a/lustre/include/linux/lustre_smfs.h b/lustre/include/linux/lustre_smfs.h
index ee3e43a..7f83f04 100644 (file)
@@ -26,6 +26,7 @@
 #ifndef __LUSTRE_SMFS_H
 #define __LUSTRE_SMFS_H
 
+#include <linux/namei.h>
 struct snap_inode_info {
        int sn_flags;           /*the flags indicated inode type */
        int sn_gen;             /*the inode generation*/
diff --git a/lustre/include/linux/lustre_ucache.h b/lustre/include/linux/lustre_ucache.h
new file mode 100644 (file)
index 0000000..68e37db
--- /dev/null
@@ -0,0 +1,79 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+
+#ifndef _UPCALL_CACHE_H
+#define _UPCALL_CACHE_H
+
+#define UC_CACHE_NEW            0x01
+#define UC_CACHE_ACQUIRING      0x02
+#define UC_CACHE_INVALID        0x04
+#define UC_CACHE_EXPIRED        0x08
+
+#define UC_CACHE_IS_NEW(i)          ((i)->ue_flags & UC_CACHE_NEW)
+#define UC_CACHE_IS_INVALID(i)      ((i)->ue_flags & UC_CACHE_INVALID)
+#define UC_CACHE_IS_ACQUIRING(i)    ((i)->ue_flags & UC_CACHE_ACQUIRING)
+#define UC_CACHE_IS_EXPIRED(i)      ((i)->ue_flags & UC_CACHE_EXPIRED)
+#define UC_CACHE_IS_VALID(i)        ((i)->ue_flags == 0)
+
+#define UC_CACHE_SET_NEW(i)         (i)->ue_flags |= UC_CACHE_NEW
+#define UC_CACHE_SET_INVALID(i)     (i)->ue_flags |= UC_CACHE_INVALID
+#define UC_CACHE_SET_ACQUIRING(i)   (i)->ue_flags |= UC_CACHE_ACQUIRING
+#define UC_CACHE_SET_EXPIRED(i)     (i)->ue_flags |= UC_CACHE_EXPIRED
+#define UC_CACHE_SET_VALID(i)       (i)->ue_flags = 0
+
+#define UC_CACHE_CLEAR_NEW(i)       (i)->ue_flags &= ~UC_CACHE_NEW
+#define UC_CACHE_CLEAR_ACQUIRING(i) (i)->ue_flags &= ~UC_CACHE_ACQUIRING
+#define UC_CACHE_CLEAR_INVALID(i)   (i)->ue_flags &= ~UC_CACHE_INVALID
+#define UC_CACHE_CLEAR_EXPIRED(i)   (i)->ue_flags &= ~UC_CACHE_EXPIRED
+
+struct upcall_cache;
+
+struct upcall_cache_entry {
+        struct list_head        ue_hash;
+        atomic_t                ue_refcount;
+        __u64                   ue_key;
+        struct upcall_cache    *ue_cache;
+        int                     ue_flags;
+        wait_queue_head_t       ue_waitq;
+        unsigned long           ue_acquire_expire;
+        unsigned long           ue_expire;
+};
+
+#define UC_CACHE_UPCALL_MAXPATH (1024)
+
+struct upcall_cache {
+        struct list_head       *uc_hashtable;
+        int                     uc_hashsize;
+        rwlock_t                uc_hashlock;
+
+        char                   *uc_name;
+        char                    uc_upcall[UC_CACHE_UPCALL_MAXPATH];
+        unsigned long           uc_acquire_expire;
+        unsigned long           uc_entry_expire;
+
+        /* functions */
+        unsigned int                (*hash)(struct upcall_cache *, __u64);
+        struct upcall_cache_entry*  (*alloc_entry)(struct upcall_cache *, __u64);
+        void                        (*free_entry)(struct upcall_cache *,
+                                                  struct upcall_cache_entry *);
+        int                         (*make_upcall)(struct upcall_cache *,
+                                                   struct upcall_cache_entry *);
+        int                         (*parse_downcall)(struct upcall_cache *,
+                                                      struct upcall_cache_entry *,
+                                                      void *args);
+};
+
+void upcall_cache_init_entry(struct upcall_cache *cache,
+                             struct upcall_cache_entry *entry,
+                             __u64 key);
+struct upcall_cache_entry *
+upcall_cache_get_entry(struct upcall_cache *cache, __u64 key);
+void upcall_cache_put_entry(struct upcall_cache_entry *entry);
+int upcall_cache_downcall(struct upcall_cache *cache, __u64 key,
+                          int err, void *args);
+void upcall_cache_flush_one(struct upcall_cache *cache, __u64 key);
+void upcall_cache_flush_idle(struct upcall_cache *cache);
+void upcall_cache_flush_all(struct upcall_cache *cache);
+
+#endif /* _UPCALL_CACHE_H */
diff --git a/lustre/include/linux/lvfs.h b/lustre/include/linux/lvfs.h
index 5e3cbd0..96898fd 100644 (file)
@@ -1,3 +1,6 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */ 
 #ifndef __LVFS_H__
 #define __LVFS_H__
 
@@ -6,6 +9,8 @@
 #define LL_ID_NAMELEN (16 + 1 + 8 + 1)
 
 #if defined __KERNEL__
+#include <linux/dcache.h>
+#include <linux/namei.h>
 #include <linux/lustre_compat25.h>
 #include <linux/lvfs_linux.h>
 #endif 
@@ -18,13 +23,13 @@ struct mds_grp_hash_entry;
 
 /* simple.c */
 struct lvfs_ucred {
-        struct mds_grp_hash_entry *luc_ghash;
-        struct group_info *luc_ginfo;
+        struct lustre_sec_desc *luc_lsd;
+        struct group_info      *luc_ginfo;
         __u32 luc_fsuid;
         __u32 luc_fsgid;
         __u32 luc_cap;
         __u32 luc_uid;
-       __u32 luc_umask;
+        __u32 luc_umask;
 };
 
 struct lvfs_callback_ops {
@@ -100,11 +105,11 @@ ll_lookup_one_len(const char *name, struct dentry *dparent, int namelen)
 {
         struct dentry *dchild;
 #ifdef S_PDIROPS
-       struct qstr qstr;
-       void *lock;
-       qstr.name = name;
-       qstr.len = namelen;
-       lock = lock_dir(dparent->d_inode, &qstr);
+        struct qstr qstr;
+        void *lock;
+        qstr.name = name;
+        qstr.len = namelen;
+        lock = lock_dir(dparent->d_inode, &qstr);
 #else
         down(&dparent->d_inode->i_sem);
 #endif
@@ -112,7 +117,7 @@ ll_lookup_one_len(const char *name, struct dentry *dparent, int namelen)
         dchild = lookup_one_len(name, dparent, namelen);
 
 #ifdef S_PDIROPS
-       unlock_dir(dparent->d_inode, lock);
+        unlock_dir(dparent->d_inode, lock);
 #else
         up(&dparent->d_inode->i_sem);
 #endif
@@ -125,6 +130,18 @@ static inline void ll_sleep(int t)
         schedule_timeout(t * HZ);
         set_current_state(TASK_RUNNING);
 }
+
+static inline struct dentry *
+ll_d_lookup(const char *name,
+           struct dentry *dparent, int len)
+{
+       struct qstr qstr;
+
+       qstr.len = len;
+       qstr.name = name;
+       qstr.hash = full_name_hash(name, len);
+       return d_lookup(dparent, &qstr);
+}
 #endif
 
 static inline int ll_id2str(char *str, __u64 id, __u32 generation)
diff --git a/lustre/include/linux/obd.h b/lustre/include/linux/obd.h
index 29c77c7..a7f8b5f 100644 (file)
@@ -20,7 +20,6 @@
 #define IOC_MDC_LOOKUP       _IOWR(IOC_MDC_TYPE, 20, struct obd_device *)
 /* Moved to lustre_user.h
 #define IOC_MDC_GETSTRIPE    _IOWR(IOC_MDC_TYPE, 21, struct lov_mds_md *) */
-#define IOC_MDC_FINISH_GNS   _IOWR(IOC_MDC_TYPE, 22, struct obd_device *)
 #define IOC_MDC_MAX_NR       50
 
 #ifdef __KERNEL__
@@ -275,6 +274,12 @@ struct client_obd {
         int                      cl_max_mds_cookiesize;
         kdev_t                   cl_sandev;
 
+        /* security flavors */
+        __u32                    cl_sec_flavor;
+        __u32                    cl_sec_subflavor;
+        __u32                    cl_nllu; /* non lustre local user */
+        __u32                    cl_nllg; /* non lustre local group */
+
         //struct llog_canceld_ctxt *cl_llcd; /* it's included by obd_llog_ctxt */
         void                    *cl_llcd_offset;
 
@@ -386,6 +391,10 @@ struct mds_obd {
         struct dentry                   *mds_id_dir;
         int                              mds_obd_type;
         struct dentry                   *mds_unnamed_dir; /* for mdt_obd_create only */
+
+        /* security related */
+        char                            *mds_mds_sec;
+        char                            *mds_ost_sec;
 };
 
 struct echo_obd {
@@ -850,8 +859,8 @@ struct md_ops {
                          void *, int, ldlm_completion_callback,
                          ldlm_blocking_callback, void *);
         int (*m_getattr)(struct obd_export *, struct lustre_id *,
-                         __u64, unsigned int,
-                         struct ptlrpc_request **);
+                         __u64, const char *, int,
+                         unsigned int, struct ptlrpc_request **);
         int (*m_getattr_lock)(struct obd_export *, struct lustre_id *,
                               char *, int, __u64,
                               unsigned int, struct ptlrpc_request **);
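
A minimal, illustrative sketch of how a client setup path might initialize the new security fields added to struct client_obd above; the defaults used here (flavor 0, uid/gid 99 for the non-Lustre-local identities) are assumptions for the example, not values taken from this change.

        /* Illustrative only -- the defaults are assumptions, not from the patch. */
        static void example_init_client_sec(struct client_obd *cli)
        {
                cli->cl_sec_flavor    = 0;      /* assumed: plain/null flavor */
                cli->cl_sec_subflavor = 0;
                cli->cl_nllu = 99;              /* non lustre local user, e.g. "nobody" */
                cli->cl_nllg = 99;              /* non lustre local group */
        }
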
index faba9a6..6bb4dca 100644 (file)
@@ -1245,14 +1245,14 @@ static inline int md_delete_inode(struct obd_export *exp,
 }
 
 static inline int md_getattr(struct obd_export *exp, struct lustre_id *id,
-                             __u64 valid, unsigned int ea_size,
-                             struct ptlrpc_request **request)
+                             __u64 valid, const char *ea_name, int ea_namelen,
+                             unsigned int ea_size, struct ptlrpc_request **request)
 {
         int rc;
         ENTRY;
         EXP_CHECK_MD_OP(exp, getattr);
         MD_COUNTER_INCREMENT(exp->exp_obd, getattr);
-        rc = MDP(exp->exp_obd, getattr)(exp, id, valid, ea_size, request);
+        rc = MDP(exp->exp_obd, getattr)(exp, id, valid, ea_name, ea_namelen, ea_size, request);
         RETURN(rc);
 }
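
With the widened signature above, a getattr that also requests a named EA looks roughly like the sketch below (illustrative only; the EA name is a placeholder and whether ea_namelen counts the trailing NUL is an assumption here, not confirmed by this diff).

        /* Illustrative only: fetch attributes plus one named EA. */
        static int example_getattr_with_ea(struct obd_export *exp,
                                           struct lustre_id *id, __u64 valid)
        {
                struct ptlrpc_request *req = NULL;
                const char *ea_name = "system.posix_acl_access";  /* placeholder */
                int rc;

                rc = md_getattr(exp, id, valid, ea_name,
                                strlen(ea_name) + 1,    /* assumed: length includes NUL */
                                0 /* ea_size */, &req);
                if (req != NULL)
                        ptlrpc_req_finished(req);
                return rc;
        }
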
 
index 8783209..64db5f7 100644 (file)
@@ -145,6 +145,14 @@ extern wait_queue_head_t obd_race_waitq;
 #define OBD_FAIL_TGT_REPLY_NET           0x700
 #define OBD_FAIL_TGT_CONN_RACE           0x701
 
+#define OBD_FAIL_SVCSEC_ACCEPT_BEG       0x750
+#define OBD_FAIL_SVCSEC_ACCEPT_END       0x751
+#define OBD_FAIL_SVCSEC_WRAP_BEG         0x752
+#define OBD_FAIL_SVCSEC_WRAP_END         0x753
+#define OBD_FAIL_SVCGSS_ERR_NOTIFY       0x760
+#define OBD_FAIL_SVCGSS_INIT_REQ         0x780
+#define OBD_FAIL_SVCGSS_INIT_REP         0x781
+
 /* preparation for a more advanced failure testbed (not functional yet) */
 #define OBD_FAIL_MASK_SYS    0x0000FF00
 #define OBD_FAIL_MASK_LOC    (0x000000FF | OBD_FAIL_MASK_SYS)
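
The new OBD_FAIL_SVCSEC / OBD_FAIL_SVCGSS identifiers above are failure-injection points for the security request path; a hedged sketch of how such a point is typically consulted is shown below (OBD_FAIL_CHECK is the usual Lustre idiom, assumed here rather than taken from this diff).

        /* Illustrative only: bail out early when this fail point is armed. */
        static int example_svcsec_accept(void)
        {
                if (OBD_FAIL_CHECK(OBD_FAIL_SVCSEC_ACCEPT_BEG))
                        return -EINVAL;         /* simulated security failure */

                /* ... normal per-request security acceptance work ... */
                return 0;
        }
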
index d86d1b6..466235d 100644 (file)
@@ -1,8 +1,8 @@
 Index: linux-2.6.7/include/linux/dcache.h
 ===================================================================
---- linux-2.6.7.orig/include/linux/dcache.h    2004-08-30 17:20:57.000000000 +0800
-+++ linux-2.6.7/include/linux/dcache.h 2004-08-30 17:39:12.000000000 +0800
-@@ -94,6 +94,9 @@
+--- linux-2.6.7.orig/include/linux/dcache.h    2005-03-23 23:28:49.669799416 +0800
++++ linux-2.6.7/include/linux/dcache.h 2005-03-23 23:38:25.648237384 +0800
+@@ -86,6 +86,9 @@
        spinlock_t d_lock;              /* per dentry lock */
        struct inode *d_inode;          /* Where the name belongs to - NULL is
                                         * negative */
@@ -12,11 +12,12 @@ Index: linux-2.6.7/include/linux/dcache.h
        /*
         * The next three fields are touched by __d_lookup.  Place them here
         * so they all fit in a 16-byte range, with 16-byte alignment.
-@@ -166,6 +169,7 @@
+@@ -158,6 +161,8 @@
  #define DCACHE_UNHASHED               0x0010  
- #define DCACHE_LUSTRE_INVALID     0x0020  /* Lustre invalidated */
+ #define DCACHE_LUSTRE_INVALID 0x0020  /* invalidated by Lustre */
  
 +#define DCACHE_CROSS_REF       0x0040  /* entry points to inode on another MDS */
++
  extern spinlock_t dcache_lock;
  
+ /**
diff --git a/lustre/kernel_patches/patches/export-vanilla-2.6.patch b/lustre/kernel_patches/patches/export-vanilla-2.6.patch
new file mode 100644 (file)
index 0000000..c18a380
--- /dev/null
@@ -0,0 +1,94 @@
+Index: linux-2.6.7/mm/truncate.c
+===================================================================
+--- linux-2.6.7.orig/mm/truncate.c     2004-06-16 13:20:04.000000000 +0800
++++ linux-2.6.7/mm/truncate.c  2005-03-23 23:30:30.676444072 +0800
+@@ -42,7 +42,7 @@
+  * its lock, b) when a concurrent invalidate_inode_pages got there first and
+  * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
+  */
+-static void
++void
+ truncate_complete_page(struct address_space *mapping, struct page *page)
+ {
+       if (page->mapping != mapping)
+@@ -58,6 +58,8 @@
+       page_cache_release(page);       /* pagecache ref */
+ }
++EXPORT_SYMBOL(truncate_complete_page);
++
+ /*
+  * This is for invalidate_inode_pages().  That function can be called at
+  * any time, and is not supposed to throw away dirty pages.  But pages can
+Index: linux-2.6.7/fs/super.c
+===================================================================
+--- linux-2.6.7.orig/fs/super.c        2004-06-16 13:19:22.000000000 +0800
++++ linux-2.6.7/fs/super.c     2005-03-23 23:30:30.648448328 +0800
+@@ -804,6 +804,8 @@
+       return (struct vfsmount *)sb;
+ }
++EXPORT_SYMBOL(do_kern_mount);
++
+ struct vfsmount *kern_mount(struct file_system_type *type)
+ {
+       return do_kern_mount(type->name, 0, type->name, NULL);
+Index: linux-2.6.7/fs/jbd/journal.c
+===================================================================
+--- linux-2.6.7.orig/fs/jbd/journal.c  2004-06-16 13:18:59.000000000 +0800
++++ linux-2.6.7/fs/jbd/journal.c       2005-03-23 23:30:30.647448480 +0800
+@@ -71,6 +71,7 @@
+ EXPORT_SYMBOL(journal_errno);
+ EXPORT_SYMBOL(journal_ack_err);
+ EXPORT_SYMBOL(journal_clear_err);
++EXPORT_SYMBOL(log_start_commit);
+ EXPORT_SYMBOL(log_wait_commit);
+ EXPORT_SYMBOL(journal_start_commit);
+ EXPORT_SYMBOL(journal_wipe);
+Index: linux-2.6.7/kernel/exit.c
+===================================================================
+--- linux-2.6.7.orig/kernel/exit.c     2004-06-16 13:19:52.000000000 +0800
++++ linux-2.6.7/kernel/exit.c  2005-03-23 23:34:17.539955576 +0800
+@@ -256,6 +256,8 @@
+       write_unlock_irq(&tasklist_lock);
+ }
++EXPORT_SYMBOL(reparent_to_init);
++
+ void __set_special_pids(pid_t session, pid_t pgrp)
+ {
+       struct task_struct *curr = current;
+@@ -435,6 +437,7 @@
+ {
+       __exit_files(tsk);
+ }
++EXPORT_SYMBOL(exit_files);
+ static inline void __put_fs_struct(struct fs_struct *fs)
+ {
+Index: linux-2.6.7/include/linux/fs.h
+===================================================================
+--- linux-2.6.7.orig/include/linux/fs.h        2005-03-23 23:30:08.535809960 +0800
++++ linux-2.6.7/include/linux/fs.h     2005-03-23 23:30:30.675444224 +0800
+@@ -1133,6 +1133,7 @@
+ extern struct vfsmount *kern_mount(struct file_system_type *);
+ extern int may_umount_tree(struct vfsmount *);
+ extern int may_umount(struct vfsmount *);
++struct vfsmount *do_kern_mount(const char *type, int flags, const char *name, void *data);
+ extern long do_mount(char *, char *, char *, unsigned long, void *);
+ extern int vfs_statfs(struct super_block *, struct kstatfs *);
+Index: linux-2.6.7/include/linux/mm.h
+===================================================================
+--- linux-2.6.7.orig/include/linux/mm.h        2004-06-16 13:18:56.000000000 +0800
++++ linux-2.6.7/include/linux/mm.h     2005-03-23 23:30:30.673444528 +0800
+@@ -653,6 +653,9 @@
+ extern unsigned long do_brk(unsigned long, unsigned long);
++/* truncate.c */
++extern void truncate_complete_page(struct address_space *mapping,struct page *);
++
+ /* filemap.c */
+ extern unsigned long page_unuse(struct page *);
+ extern void truncate_inode_pages(struct address_space *, loff_t);
index a4867a5..4fd69a5 100644 (file)
@@ -5,10 +5,10 @@
  include/linux/ext3_fs.h |    5 ++++-
  5 files changed, 85 insertions(+), 6 deletions(-)
 
-Index: uml-2.6.3/fs/ext3/ialloc.c
+Index: linux-2.6.7/fs/ext3/ialloc.c
 ===================================================================
---- uml-2.6.3.orig/fs/ext3/ialloc.c    2004-02-20 15:00:48.000000000 +0800
-+++ uml-2.6.3/fs/ext3/ialloc.c 2004-02-21 00:24:45.202693776 +0800
+--- linux-2.6.7.orig/fs/ext3/ialloc.c  2005-03-24 00:27:43.282608616 +0800
++++ linux-2.6.7/fs/ext3/ialloc.c       2005-03-24 00:27:43.888516504 +0800
 @@ -420,7 +420,8 @@
   * For other inodes, search forward from the parent directory's block
   * group to find a free inode.
@@ -58,11 +58,19 @@ Index: uml-2.6.3/fs/ext3/ialloc.c
        if (S_ISDIR(mode)) {
                if (test_opt (sb, OLDALLOC))
                        group = find_group_dir(sb, dir);
-Index: uml-2.6.3/fs/ext3/ioctl.c
+Index: linux-2.6.7/fs/ext3/ioctl.c
 ===================================================================
---- uml-2.6.3.orig/fs/ext3/ioctl.c     2004-01-09 14:59:26.000000000 +0800
-+++ uml-2.6.3/fs/ext3/ioctl.c  2004-02-21 00:21:04.541239416 +0800
-@@ -24,6 +24,31 @@
+--- linux-2.6.7.orig/fs/ext3/ioctl.c   2004-06-16 13:19:13.000000000 +0800
++++ linux-2.6.7/fs/ext3/ioctl.c        2005-03-24 00:31:16.113253440 +0800
+@@ -9,6 +9,7 @@
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
++#include <linux/namei.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/time.h>
+@@ -24,6 +25,31 @@
        ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg);
  
        switch (cmd) {
@@ -93,12 +101,12 @@ Index: uml-2.6.3/fs/ext3/ioctl.c
 +      }
        case EXT3_IOC_GETFLAGS:
                flags = ei->i_flags & EXT3_FL_USER_VISIBLE;
-               return put_user(flags, (int *) arg);
-Index: uml-2.6.3/fs/ext3/namei.c
+               return put_user(flags, (int __user *) arg);
+Index: linux-2.6.7/fs/ext3/namei.c
 ===================================================================
---- uml-2.6.3.orig/fs/ext3/namei.c     2004-02-20 15:01:27.000000000 +0800
-+++ uml-2.6.3/fs/ext3/namei.c  2004-02-21 00:21:04.611228776 +0800
-@@ -1617,6 +1617,19 @@
+--- linux-2.6.7.orig/fs/ext3/namei.c   2005-03-24 00:27:43.536570008 +0800
++++ linux-2.6.7/fs/ext3/namei.c        2005-03-24 00:27:43.893515744 +0800
+@@ -1939,6 +1939,19 @@
        return err;
  }
  
@@ -118,7 +126,7 @@ Index: uml-2.6.3/fs/ext3/namei.c
  /*
   * By the time this is called, we already have created
   * the directory cache entry for the new file, but it
-@@ -1640,7 +1653,7 @@
+@@ -1963,7 +1976,7 @@
        if (IS_DIRSYNC(dir))
                handle->h_sync = 1;
  
@@ -127,7 +135,7 @@ Index: uml-2.6.3/fs/ext3/namei.c
        err = PTR_ERR(inode);
        if (!IS_ERR(inode)) {
                inode->i_op = &ext3_file_inode_operations;
-@@ -1670,7 +1683,7 @@
+@@ -1994,7 +2007,7 @@
        if (IS_DIRSYNC(dir))
                handle->h_sync = 1;
  
@@ -136,7 +144,7 @@ Index: uml-2.6.3/fs/ext3/namei.c
        err = PTR_ERR(inode);
        if (!IS_ERR(inode)) {
                init_special_inode(inode, inode->i_mode, rdev);
-@@ -1702,7 +1715,7 @@
+@@ -2027,7 +2040,7 @@
        if (IS_DIRSYNC(dir))
                handle->h_sync = 1;
  
@@ -145,7 +153,7 @@ Index: uml-2.6.3/fs/ext3/namei.c
        err = PTR_ERR(inode);
        if (IS_ERR(inode))
                goto out_stop;
-@@ -2094,7 +2107,7 @@
+@@ -2439,7 +2452,7 @@
        if (IS_DIRSYNC(dir))
                handle->h_sync = 1;
  
@@ -154,10 +162,10 @@ Index: uml-2.6.3/fs/ext3/namei.c
        err = PTR_ERR(inode);
        if (IS_ERR(inode))
                goto out_stop;
-Index: uml-2.6.3/include/linux/ext3_fs.h
+Index: linux-2.6.7/include/linux/ext3_fs.h
 ===================================================================
---- uml-2.6.3.orig/include/linux/ext3_fs.h     2004-01-09 14:59:44.000000000 +0800
-+++ uml-2.6.3/include/linux/ext3_fs.h  2004-02-21 00:21:04.613228472 +0800
+--- linux-2.6.7.orig/include/linux/ext3_fs.h   2005-03-24 00:27:43.542569096 +0800
++++ linux-2.6.7/include/linux/ext3_fs.h        2005-03-24 00:27:43.893515744 +0800
 @@ -203,6 +203,7 @@
  #define       EXT3_IOC_SETFLAGS               _IOW('f', 2, long)
  #define       EXT3_IOC_GETVERSION             _IOR('f', 3, long)
@@ -166,7 +174,7 @@ Index: uml-2.6.3/include/linux/ext3_fs.h
  #define       EXT3_IOC_GETVERSION_OLD         _IOR('v', 1, long)
  #define       EXT3_IOC_SETVERSION_OLD         _IOW('v', 2, long)
  #ifdef CONFIG_JBD_DEBUG
-@@ -707,7 +708,8 @@
+@@ -708,7 +709,8 @@
                          dx_hash_info *hinfo);
  
  /* ialloc.c */
@@ -176,7 +184,7 @@ Index: uml-2.6.3/include/linux/ext3_fs.h
  extern void ext3_free_inode (handle_t *, struct inode *);
  extern struct inode * ext3_orphan_get (struct super_block *, unsigned long);
  extern unsigned long ext3_count_free_inodes (struct super_block *);
-@@ -792,4 +794,5 @@
+@@ -793,4 +795,5 @@
  
  #endif        /* __KERNEL__ */
  
diff --git a/lustre/kernel_patches/patches/header_guards-vanilla-2.6.patch b/lustre/kernel_patches/patches/header_guards-vanilla-2.6.patch
new file mode 100644 (file)
index 0000000..e8b6abb
--- /dev/null
@@ -0,0 +1,45 @@
+%diffstat
+ blockgroup_lock.h |    4 +++-
+ percpu_counter.h  |    4 ++++
+ 2 files changed, 7 insertions(+), 1 deletion(-)
+
+%patch
+Index: linux-2.6.6/include/linux/percpu_counter.h
+===================================================================
+--- linux-2.6.6.orig/include/linux/percpu_counter.h    2004-04-04 11:37:23.000000000 +0800
++++ linux-2.6.6/include/linux/percpu_counter.h 2004-05-22 16:08:16.000000000 +0800
+@@ -3,6 +3,8 @@
+  *
+  * WARNING: these things are HUGE.  4 kbytes per counter on 32-way P4.
+  */
++#ifndef _LINUX_PERCPU_COUNTER_H
++#define _LINUX_PERCPU_COUNTER_H
+ #include <linux/config.h>
+ #include <linux/spinlock.h>
+@@ -101,3 +103,5 @@ static inline void percpu_counter_dec(st
+ {
+       percpu_counter_mod(fbc, -1);
+ }
++
++#endif /* _LINUX_PERCPU_COUNTER_H */
+Index: linux-2.6.6/include/linux/blockgroup_lock.h
+===================================================================
+--- linux-2.6.6.orig/include/linux/blockgroup_lock.h   2004-04-04 11:36:26.000000000 +0800
++++ linux-2.6.6/include/linux/blockgroup_lock.h        2004-05-22 16:08:45.000000000 +0800
+@@ -3,6 +3,8 @@
+  *
+  * Simple hashed spinlocking.
+  */
++#ifndef _LINUX_BLOCKGROUP_LOCK_H
++#define _LINUX_BLOCKGROUP_LOCK_H
+ #include <linux/config.h>
+ #include <linux/spinlock.h>
+@@ -55,4 +57,4 @@ static inline void bgl_lock_init(struct 
+ #define sb_bgl_lock(sb, block_group) \
+       (&(sb)->s_blockgroup_lock.locks[(block_group) & (NR_BG_LOCKS-1)].lock)
+-
++#endif
+
index cb504d9..88e0843 100644 (file)
@@ -159,7 +159,7 @@ Index: linux-stage/fs/ext3/iopen.c
 +      list_add(&dentry->d_alias, &inode->i_dentry);   /* d_instantiate */
 +      dentry->d_inode = inode;
 +
-+      __d_rehash(dentry, 0);                          /* d_rehash */
++      __d_rehash(dentry);                             /* d_rehash */
 +      spin_unlock(&dcache_lock);
 +
 +      return NULL;
@@ -222,7 +222,7 @@ Index: linux-stage/fs/ext3/iopen.c
 +      /* Move the goal to the de hash queue */
 +      goal->d_flags &= ~ DCACHE_DISCONNECTED;
 +      security_d_instantiate(goal, inode);
-+      __d_rehash(dentry, 0);
++      __d_rehash(dentry);
 +      __d_move(goal, dentry);
 +      spin_unlock(&dcache_lock);
 +      iput(inode);
@@ -235,7 +235,7 @@ Index: linux-stage/fs/ext3/iopen.c
 +      dentry->d_inode = inode;
 +do_rehash:
 +      if (rehash)
-+              __d_rehash(dentry, 0);                  /* d_rehash */
++              __d_rehash(dentry);                     /* d_rehash */
 +      spin_unlock(&dcache_lock);
 +
 +      return NULL;
diff --git a/lustre/kernel_patches/patches/linux-2.6.7-CITI_NFS4_ALL-7-lsec.patch b/lustre/kernel_patches/patches/linux-2.6.7-CITI_NFS4_ALL-7-lsec.patch
new file mode 100644 (file)
index 0000000..f754546
--- /dev/null
@@ -0,0 +1,16246 @@
+--- linux-2.6.7/Documentation/filesystems/00-INDEX.lsec        2004-06-15 23:20:26.000000000 -0600
++++ linux-2.6.7/Documentation/filesystems/00-INDEX     2005-03-23 14:28:24.576313528 -0700
+@@ -28,6 +28,8 @@ jfs.txt
+       - info and mount options for the JFS filesystem.
+ ncpfs.txt
+       - info on Novell Netware(tm) filesystem using NCP protocol.
++nfs4.txt
++      - info and mount options for the nfs4 filesystem.
+ ntfs.txt
+       - info and mount options for the NTFS filesystem (Windows NT).
+ proc.txt
+--- linux-2.6.7/Documentation/filesystems/nfs4.txt.lsec        2005-03-23 14:28:24.576313528 -0700
++++ linux-2.6.7/Documentation/filesystems/nfs4.txt     2005-03-23 14:28:24.576313528 -0700
+@@ -0,0 +1,20 @@
++NFS version 4
++=============
++
++NFS version 4 is specified by RFC3530.  Compared to earlier NFS versions,
++it provides enhanced security and better client caching, among other features.
++
++In addition to basic file operations, the NFS client supports locking, kerberos
++(basic authentication and integrity), and reboot recovery.
++
++As of this writing (July 2004), patches to nfs-utils and util-linux are required
++for NFSv4 support; see http://www.citi.umich.edu/projects/nfsv4/linux/ for
++patches and instructions.
++
++The kernel treats NFS version 4 as a separate filesystem type, nfs4, so it is
++mounted using "mount -tnfs4 server:/path /mntpoint", not by mounting the nfs
++filesystem with -onfsver=4.
++
++Mount options:
++
++XXX?
+--- linux-2.6.7/fs/locks.c.lsec        2004-06-15 23:20:03.000000000 -0600
++++ linux-2.6.7/fs/locks.c     2005-03-23 14:28:22.425640480 -0700
+@@ -317,7 +317,7 @@ static int flock_to_posix_lock(struct fi
+       if (l->l_len == 0)
+               fl->fl_end = OFFSET_MAX;
+       
+-      fl->fl_owner = current->files;
++      fl->fl_owner = 0;
+       fl->fl_pid = current->tgid;
+       fl->fl_file = filp;
+       fl->fl_flags = FL_POSIX;
+@@ -357,7 +357,7 @@ static int flock64_to_posix_lock(struct 
+       if (l->l_len == 0)
+               fl->fl_end = OFFSET_MAX;
+       
+-      fl->fl_owner = current->files;
++      fl->fl_owner = 0;
+       fl->fl_pid = current->tgid;
+       fl->fl_file = filp;
+       fl->fl_flags = FL_POSIX;
+@@ -920,7 +920,7 @@ int posix_lock_file(struct file *filp, s
+  */
+ int locks_mandatory_locked(struct inode *inode)
+ {
+-      fl_owner_t owner = current->files;
++      unsigned int pid = current->tgid;
+       struct file_lock *fl;
+       /*
+@@ -930,7 +930,9 @@ int locks_mandatory_locked(struct inode 
+       for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
+               if (!IS_POSIX(fl))
+                       continue;
+-              if (fl->fl_owner != owner)
++              if (fl->fl_owner != 0)
++                      break;
++              if (fl->fl_pid != pid)
+                       break;
+       }
+       unlock_kernel();
+@@ -958,7 +960,7 @@ int locks_mandatory_area(int read_write,
+       int error;
+       locks_init_lock(&fl);
+-      fl.fl_owner = current->files;
++      fl.fl_owner = 0;
+       fl.fl_pid = current->tgid;
+       fl.fl_file = filp;
+       fl.fl_flags = FL_POSIX | FL_ACCESS;
+@@ -1684,7 +1686,7 @@ void locks_remove_posix(struct file *fil
+       lock_kernel();
+       while (*before != NULL) {
+               struct file_lock *fl = *before;
+-              if (IS_POSIX(fl) && (fl->fl_owner == owner)) {
++              if (IS_POSIX(fl) && posix_same_owner(fl, &lock)) {
+                       locks_delete_lock(before);
+                       continue;
+               }
+@@ -1982,18 +1984,6 @@ int lock_may_write(struct inode *inode, 
+ EXPORT_SYMBOL(lock_may_write);
+-static inline void __steal_locks(struct file *file, fl_owner_t from)
+-{
+-      struct inode *inode = file->f_dentry->d_inode;
+-      struct file_lock *fl = inode->i_flock;
+-
+-      while (fl) {
+-              if (fl->fl_file == file && fl->fl_owner == from)
+-                      fl->fl_owner = current->files;
+-              fl = fl->fl_next;
+-      }
+-}
+-
+ /* When getting ready for executing a binary, we make sure that current
+  * has a files_struct on its own. Before dropping the old files_struct,
+  * we take over ownership of all locks for all file descriptors we own.
+@@ -2002,31 +1992,6 @@ static inline void __steal_locks(struct 
+  */
+ void steal_locks(fl_owner_t from)
+ {
+-      struct files_struct *files = current->files;
+-      int i, j;
+-
+-      if (from == files)
+-              return;
+-
+-      lock_kernel();
+-      j = 0;
+-      for (;;) {
+-              unsigned long set;
+-              i = j * __NFDBITS;
+-              if (i >= files->max_fdset || i >= files->max_fds)
+-                      break;
+-              set = files->open_fds->fds_bits[j++];
+-              while (set) {
+-                      if (set & 1) {
+-                              struct file *file = files->fd[i];
+-                              if (file)
+-                                      __steal_locks(file, from);
+-                      }
+-                      i++;
+-                      set >>= 1;
+-              }
+-      }
+-      unlock_kernel();
+ }
+ EXPORT_SYMBOL(steal_locks);
+--- linux-2.6.7/fs/hostfs/hostfs_kern.c.lsec   2005-03-23 14:25:58.982447160 -0700
++++ linux-2.6.7/fs/hostfs/hostfs_kern.c        2005-03-23 14:33:11.946626600 -0700
+@@ -290,7 +290,6 @@ static void hostfs_delete_inode(struct i
+ {
+       if(HOSTFS_I(inode)->fd != -1) {
+               close_file(&HOSTFS_I(inode)->fd);
+-              printk("Closing host fd in .delete_inode\n");
+               HOSTFS_I(inode)->fd = -1;
+       }
+       clear_inode(inode);
+@@ -303,7 +302,6 @@ static void hostfs_destroy_inode(struct 
+       if(HOSTFS_I(inode)->fd != -1) {
+               close_file(&HOSTFS_I(inode)->fd);
+-              printk("Closing host fd in .destroy_inode\n");
+       }
+       kfree(HOSTFS_I(inode));
+--- linux-2.6.7/fs/open.c.lsec 2005-03-23 14:26:01.774022776 -0700
++++ linux-2.6.7/fs/open.c      2005-03-23 14:28:23.226518728 -0700
+@@ -1025,7 +1025,7 @@ int filp_close(struct file *filp, fl_own
+       }
+       dnotify_flush(filp, id);
+-      locks_remove_posix(filp, id);
++      locks_remove_posix(filp, 0);
+       fput(filp);
+       return retval;
+ }
+--- linux-2.6.7/fs/nfsd/export.c.lsec  2004-06-15 23:19:36.000000000 -0600
++++ linux-2.6.7/fs/nfsd/export.c       2005-03-23 14:28:24.686296808 -0700
+@@ -255,7 +255,7 @@ static inline void svc_expkey_update(str
+       new->ek_export = item->ek_export;
+ }
+-static DefineSimpleCacheLookup(svc_expkey,0) /* no inplace updates */
++static DefineSimpleCacheLookup(svc_expkey)
+ #define       EXPORT_HASHBITS         8
+ #define       EXPORT_HASHMAX          (1<< EXPORT_HASHBITS)
+@@ -487,8 +487,72 @@ static inline void svc_export_update(str
+       new->ex_fsid = item->ex_fsid;
+ }
+-static DefineSimpleCacheLookup(svc_export,1) /* allow inplace updates */
++struct svc_export *
++svc_export_lookup(struct svc_export *item, int set)
++{
++      struct svc_export *tmp, *new = NULL;
++      struct cache_head **hp, **head;
++      head = &svc_export_cache.hash_table[svc_export_hash(item)];
++retry:
++      if (set||new)
++              write_lock(&svc_export_cache.hash_lock);
++      else
++              read_lock(&svc_export_cache.hash_lock);
++      for(hp=head; *hp != NULL; hp = &tmp->h.next) {
++              tmp = container_of(*hp, struct svc_export, h);
++              if (svc_export_match(item, tmp)) { /* found a match */
++                      cache_get(&tmp->h);
++                      if (set) {
++                              if (test_bit(CACHE_NEGATIVE,  &item->h.flags))
++                                       set_bit(CACHE_NEGATIVE, &tmp->h.flags);
++                              else {
++                                      clear_bit(CACHE_NEGATIVE, &tmp->h.flags);
++                                      svc_export_update(tmp, item);
++                              }
++                      }
++                      if (set||new)
++                              write_unlock(&svc_export_cache.hash_lock);
++                      else
++                              read_unlock(&svc_export_cache.hash_lock);
++                      if (set)
++                              cache_fresh(&svc_export_cache, &tmp->h,
++                                              item->h.expiry_time);
++                      if (new)
++                              svc_export_put(&new->h, &svc_export_cache);
++                      return tmp;
++              }
++      }
++      /* Didn't find anything */
++      if (new) {
++              svc_export_init(new, item);
++              new->h.next = *head;
++              *head = &new->h;
++              set_bit(CACHE_HASHED, &new->h.flags);
++              svc_export_cache.entries++;
++              if (set) {
++                      tmp = new;
++                      if (test_bit(CACHE_NEGATIVE, &item->h.flags))
++                              set_bit(CACHE_NEGATIVE, &tmp->h.flags);
++                      else
++                              svc_export_update(tmp, item);
++              }
++      }
++      if (set||new)
++              write_unlock(&svc_export_cache.hash_lock);
++      else
++              read_unlock(&svc_export_cache.hash_lock);
++      if (new && set)
++              cache_fresh(&svc_export_cache, &new->h, item->h.expiry_time);
++      if (new)
++              return new;
++      new = kmalloc(sizeof(*new), GFP_KERNEL);
++      if (new) {
++              cache_init(&new->h);
++              goto retry;
++      }
++      return NULL;
++}
+ struct svc_expkey *
+ exp_find_key(svc_client *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp)
+--- linux-2.6.7/fs/nfsd/nfs4callback.c.lsec    2005-03-23 14:28:24.578313224 -0700
++++ linux-2.6.7/fs/nfsd/nfs4callback.c 2005-03-23 14:28:24.578313224 -0700
+@@ -0,0 +1,631 @@
++/*
++ *  linux/fs/nfsd/nfs4callback.c
++ *
++ *  Copyright (c) 2001 The Regents of the University of Michigan.
++ *  All rights reserved.
++ *
++ *  Kendrick Smith <kmsmith@umich.edu>
++ *  Andy Adamson <andros@umich.edu>
++ *
++ *  Redistribution and use in source and binary forms, with or without
++ *  modification, are permitted provided that the following conditions
++ *  are met:
++ *
++ *  1. Redistributions of source code must retain the above copyright
++ *     notice, this list of conditions and the following disclaimer.
++ *  2. Redistributions in binary form must reproduce the above copyright
++ *     notice, this list of conditions and the following disclaimer in the
++ *     documentation and/or other materials provided with the distribution.
++ *  3. Neither the name of the University nor the names of its
++ *     contributors may be used to endorse or promote products derived
++ *     from this software without specific prior written permission.
++ *
++ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
++
++#include <linux/config.h>
++#include <linux/module.h>
++#include <linux/list.h>
++#include <linux/inet.h>
++#include <linux/errno.h>
++#include <linux/sunrpc/xdr.h>
++#include <linux/sunrpc/svc.h>
++#include <linux/sunrpc/clnt.h>
++#include <linux/nfsd/nfsd.h>
++#include <linux/nfsd/state.h>
++#include <linux/sunrpc/sched.h>
++#include <linux/nfs4.h>
++
++#define NFSDDBG_FACILITY                NFSDDBG_PROC
++
++#define NFSPROC4_CB_NULL 0
++#define NFSPROC4_CB_COMPOUND 1
++
++/* forward declarations */
++static void nfs4_cb_null(struct rpc_task *task);
++
++/* Index of predefined Linux callback client operations */
++
++enum {
++        NFSPROC4_CLNT_CB_NULL = 0,
++        NFSPROC4_CLNT_CB_GETATTR,
++        NFSPROC4_CLNT_CB_RECALL,
++};
++
++enum nfs_cb_opnum4 {
++      OP_CB_GETATTR           = 3,
++      OP_CB_RECALL            = 4,
++      OP_CB_ILLEGAL           = 10044
++};
++
++
++#define NFS4_MAXTAGLEN                20
++
++#define cb_compound_enc_hdr_sz                4
++#define cb_compound_dec_hdr_sz                (3 + (NFS4_MAXTAGLEN >> 2))
++#define op_enc_sz                     1
++#define op_dec_sz                     2
++#define enc_nfs4_fh_sz                        (1 + (NFS4_FHSIZE >> 2))
++#define enc_stateid_sz                        16
++
++#define NFS4_enc_cb_getattr_sz                (cb_compound_enc_hdr_sz +       \
++                                      op_enc_sz  +                    \
++                                      enc_nfs4_fh_sz + 4)
++
++#define NFS4_dec_cb_getattr_sz                (cb_compound_dec_hdr_sz +       \
++                                      op_dec_sz          +            \
++                                      11)
++
++#define NFS4_enc_cb_recall_sz         (cb_compound_enc_hdr_sz +       \
++                                      1 + enc_stateid_sz +            \
++                                      enc_nfs4_fh_sz)
++
++#define NFS4_dec_cb_recall_sz         (cb_compound_dec_hdr_sz  +      \
++                                       op_dec_sz)
++
++/*
++* Generic encode routines from fs/nfs/nfs4xdr.c
++*/
++static inline u32 *
++xdr_writemem(u32 *p, const void *ptr, int nbytes)
++{
++      int tmp = XDR_QUADLEN(nbytes);
++      if (!tmp)
++              return p;
++      p[tmp-1] = 0;
++      memcpy(p, ptr, nbytes);
++      return p + tmp;
++}
++
++#define WRITE32(n)               *p++ = htonl(n)
++#define WRITEMEM(ptr,nbytes)     do {                           \
++        p = xdr_writemem(p, ptr, nbytes);                       \
++} while (0)
++#define RESERVE_SPACE(nbytes)   do {                            \
++      p = xdr_reserve_space(xdr, nbytes);                     \
++      if (!p) dprintk("NFSD: RESERVE_SPACE(%d) failed in function %s\n", (int) (nbytes), __FUNCTION__); \
++      BUG_ON(!p);                                             \
++} while (0)
++
++/*
++ * Generic decode routines from fs/nfs/nfs4xdr.c
++ */
++#define DECODE_TAIL                             \
++      status = 0;                             \
++out:                                            \
++      return status;                          \
++xdr_error:                                      \
++      dprintk("NFSD: xdr error! (%s:%d)\n", __FILE__, __LINE__); \
++      status = -EIO;                          \
++      goto out
++
++#define READ32(x)         (x) = ntohl(*p++)
++#define READ64(x)         do {                  \
++      (x) = (u64)ntohl(*p++) << 32;           \
++      (x) |= ntohl(*p++);                     \
++} while (0)
++#define READTIME(x)       do {                  \
++      p++;                                    \
++      (x.tv_sec) = ntohl(*p++);               \
++      (x.tv_nsec) = ntohl(*p++);              \
++} while (0)
++#define READ_BUF(nbytes)  do { \
++      p = xdr_inline_decode(xdr, nbytes); \
++      if (!p) { \
++              dprintk("NFSD: %s: reply buffer overflowed in line %d.", \
++                        __FUNCTION__, __LINE__); \
++              return -EIO; \
++      } \
++} while (0)
++
++struct nfs4_cb_compound_hdr {
++      int    status;
++      u32    ident;
++      u32    nops;
++      u32    taglen;
++      char * tag;
++};
++
++struct nfs4_cb_getattr {
++      struct nfs_fh   fh;
++      u32             bm0;
++      u32             bm1;
++      __u64           change_attr;
++      __u64           size;
++      struct timespec mtime;
++};
++
++struct nfs4_cb_recall {
++      nfs4_stateid  stateid;
++      int           trunc;
++      struct nfs_fh fh;
++};
++
++static struct {
++        int stat;
++        int errno;
++} nfs_cb_errtbl[] = {
++      { NFS4_OK,              0               },
++      { NFS4ERR_PERM,         EPERM           },
++      { NFS4ERR_NOENT,        ENOENT          },
++      { NFS4ERR_IO,           EIO             },
++      { NFS4ERR_NXIO,         ENXIO           },
++      { NFS4ERR_ACCESS,       EACCES          },
++      { NFS4ERR_EXIST,        EEXIST          },
++      { NFS4ERR_XDEV,         EXDEV           },
++      { NFS4ERR_NOTDIR,       ENOTDIR         },
++      { NFS4ERR_ISDIR,        EISDIR          },
++      { NFS4ERR_INVAL,        EINVAL          },
++      { NFS4ERR_FBIG,         EFBIG           },
++      { NFS4ERR_NOSPC,        ENOSPC          },
++      { NFS4ERR_ROFS,         EROFS           },
++      { NFS4ERR_MLINK,        EMLINK          },
++      { NFS4ERR_NAMETOOLONG,  ENAMETOOLONG    },
++      { NFS4ERR_NOTEMPTY,     ENOTEMPTY       },
++      { NFS4ERR_DQUOT,        EDQUOT          },
++      { NFS4ERR_STALE,        ESTALE          },
++      { NFS4ERR_BADHANDLE,    EBADHANDLE      },
++      { NFS4ERR_BAD_COOKIE,   EBADCOOKIE      },
++      { NFS4ERR_NOTSUPP,      ENOTSUPP        },
++      { NFS4ERR_TOOSMALL,     ETOOSMALL       },
++      { NFS4ERR_SERVERFAULT,  ESERVERFAULT    },
++      { NFS4ERR_BADTYPE,      EBADTYPE        },
++      { NFS4ERR_LOCKED,       EAGAIN          },
++      { NFS4ERR_RESOURCE,     EREMOTEIO       },
++      { NFS4ERR_SYMLINK,      ELOOP           },
++      { NFS4ERR_OP_ILLEGAL,   EOPNOTSUPP      },
++      { NFS4ERR_DEADLOCK,     EDEADLK         },
++      { -1,                   EIO             }
++};
++
++static int
++nfs_cb_stat_to_errno(int stat)
++{
++        int i;
++        for (i = 0; nfs_cb_errtbl[i].stat != -1; i++) {
++                if (nfs_cb_errtbl[i].stat == stat)
++                        return nfs_cb_errtbl[i].errno;
++        }
++        /* If we cannot translate the error, the recovery routines should
++         * handle it.
++         * Note: remaining NFSv4 error codes have values > 10000, so should
++         * not conflict with native Linux error codes.
++         */
++        return stat;
++}
++
++/*
++ * XDR encode
++ */
++
++static int
++encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr)
++{
++      u32 * p;
++
++      RESERVE_SPACE(16);
++      WRITE32(0);            /* tag length is always 0 */
++      WRITE32(NFS4_MINOR_VERSION);
++      WRITE32(hdr->ident);
++      WRITE32(hdr->nops);
++      return 0;
++}
++
++static int
++encode_cb_getattr(struct xdr_stream *xdr, struct nfs4_cb_getattr *cb_get)
++{
++        u32 *p;
++      int len = cb_get->fh.size;
++
++      RESERVE_SPACE(20 + len);
++      WRITE32(OP_CB_GETATTR);
++      WRITE32(len);
++      WRITEMEM(cb_get->fh.data, len);
++      WRITE32(2);
++      WRITE32(cb_get->bm0);
++      WRITE32(cb_get->bm1);
++      return 0;
++}
++
++static int
++encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
++{
++      u32 *p;
++      int len = cb_rec->fh.size;
++
++        RESERVE_SPACE(8+sizeof(cb_rec->stateid.data));
++        WRITE32(OP_CB_RECALL);
++        WRITEMEM(cb_rec->stateid.data, sizeof(cb_rec->stateid.data));
++      WRITE32(cb_rec->trunc);
++      WRITE32(len);
++      WRITEMEM(cb_rec->fh.data, len);
++      return 0;
++}
++
++static int
++nfs4_xdr_enc_cb_getattr(struct rpc_rqst *req, u32 *p, struct nfs4_cb_getattr *args)
++{
++      struct xdr_stream xdr;
++      struct nfs4_cb_compound_hdr hdr = {
++              .nops   = 1,
++      };
++
++        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
++        encode_cb_compound_hdr(&xdr, &hdr);
++        return (encode_cb_getattr(&xdr, args));
++}
++
++static int
++nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, u32 *p, struct nfs4_cb_recall *args)
++{
++      struct xdr_stream xdr;
++      struct nfs4_cb_compound_hdr hdr = {
++              .nops   = 1,
++      };
++
++        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
++        encode_cb_compound_hdr(&xdr, &hdr);
++        return (encode_cb_recall(&xdr, args));
++}
++
++
++static int
++decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){
++        u32 *p;
++
++        READ_BUF(8);
++        READ32(hdr->status);
++        READ32(hdr->taglen);
++        READ_BUF(hdr->taglen + 4);
++        hdr->tag = (char *)p;
++        p += XDR_QUADLEN(hdr->taglen);
++        READ32(hdr->nops);
++        return 0;
++}
++
++static int
++decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
++{
++      u32 *p;
++      u32 op;
++      int32_t nfserr;
++
++      READ_BUF(8);
++      READ32(op);
++      if (op != expected) {
++              dprintk("NFSD: decode_cb_op_hdr: Callback server returned operation"
++                      " %d but we issued a request for %d\n",
++                      op, expected);
++              return -EIO;
++      }
++      READ32(nfserr);
++      if (nfserr != NFS_OK)
++              return -nfs_cb_stat_to_errno(nfserr);
++      return 0;
++}
++
++static int
++decode_cb_getattr(struct xdr_stream *xdr, struct nfs4_cb_getattr *cb_get)
++{
++      int status;
++      u32 bmlen,
++              attrlen =0,
++              bmval0 =0,
++              bmval1 =0,
++              len = 0;
++      u32 *p;
++
++      status = decode_cb_op_hdr(xdr, OP_CB_GETATTR);
++      if (status)
++              return status;
++      READ_BUF(4);
++      READ32(bmlen);
++      if( (bmlen < 1) || (bmlen > 2))
++              goto xdr_error;
++      READ_BUF((bmlen << 2) + 4);
++      READ32(bmval0);
++      if (bmval0 & ~(FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE))
++              goto out_bad_bitmap;
++      if (bmlen == 2) {
++              READ32(bmval1);
++              if (bmval1 & ~ FATTR4_WORD1_TIME_MODIFY)
++                      goto out_bad_bitmap;
++      }
++      READ32(attrlen);
++      if (bmval0 & FATTR4_WORD0_CHANGE) {
++              READ_BUF(8);
++              len += 8;
++              READ64(cb_get->change_attr);
++              dprintk("decode_cb_getattr: changeid=%Ld\n",
++                                   (long long)cb_get->change_attr);
++      }
++      if (bmval0 & FATTR4_WORD0_SIZE) {
++              READ_BUF(8);
++              len += 8;
++              READ64(cb_get->size);
++              dprintk("decode_cb_getattr: size=%Ld\n",
++                                   (long long)cb_get->size);
++      }
++      if (bmval1 & FATTR4_WORD1_TIME_MODIFY) {
++              READ_BUF(12);
++              len += 12;
++              READTIME(cb_get->mtime);
++              dprintk("decode_cb_gatattr: mtime=%ld\n",
++                                    (long)cb_get->mtime.tv_sec);
++      }
++      if (len != attrlen)
++              goto xdr_error;
++
++      DECODE_TAIL;
++
++out_bad_bitmap:
++        dprintk("NFSD: %s Callback server returned bad attribute bitmap\n",
++               __FUNCTION__);
++        return -EIO;
++
++}
++
++static int
++nfs4_xdr_dec_cb_getattr(struct rpc_rqst *rqstp, u32 *p, struct nfs4_cb_getattr *res)
++{
++      struct xdr_stream xdr;
++      struct nfs4_cb_compound_hdr hdr;
++      int status;
++
++      xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
++      status = decode_cb_compound_hdr(&xdr, &hdr);
++      if (status)
++              goto out;
++      status = decode_cb_getattr(&xdr, res);
++out:
++      return status;
++}
++
++static int
++nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, u32 *p)
++{
++      struct xdr_stream xdr;
++      struct nfs4_cb_compound_hdr hdr;
++      int status;
++
++      xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
++      status = decode_cb_compound_hdr(&xdr, &hdr);
++      if (status)
++              goto out;
++      status = decode_cb_op_hdr(&xdr, OP_CB_RECALL);
++out:
++      return status;
++}
++
++static int
++nfs4_xdr_enc_null(struct rpc_rqst *req, u32 *p)
++{
++      struct xdr_stream xdrs, *xdr = &xdrs;
++
++      xdr_init_encode(&xdrs, &req->rq_snd_buf, p);
++        RESERVE_SPACE(0);
++      return 0;
++}
++
++static int
++nfs4_xdr_dec_null(struct rpc_rqst *req, u32 *p)
++{
++      return 0;
++}
++
++/*
++ * RPC procedure tables
++ */
++#ifndef MAX
++# define MAX(a, b)      (((a) > (b))? (a) : (b))
++#endif
++
++#define PROC(proc, argtype, restype)                                  \
++[NFSPROC4_CLNT_##proc] = {                                            \
++        .p_proc   = NFSPROC4_CB_COMPOUND,                             \
++        .p_encode = (kxdrproc_t) nfs4_xdr_##argtype,                  \
++        .p_decode = (kxdrproc_t) nfs4_xdr_##restype,                  \
++        .p_bufsiz = MAX(NFS4_##argtype##_sz,NFS4_##restype##_sz) << 2,  \
++}
++
++struct rpc_procinfo     nfs4_cb_procedures[] = {
++  PROC(CB_GETATTR,      enc_cb_getattr,     dec_cb_getattr),
++  PROC(CB_RECALL,       enc_cb_recall,      dec_cb_recall),
++};
++
++struct rpc_version              nfs_cb_version4 = {
++        .number                 = 1,
++        .nrprocs                = sizeof(nfs4_cb_procedures)/sizeof(nfs4_cb_procedures[0]),
++        .procs                  = nfs4_cb_procedures
++};
++
++static struct rpc_version *   nfs_cb_version[] = {
++      NULL,
++      &nfs_cb_version4,
++};
++
++struct rpc_procinfo  nfs4_cb_null_proc= {
++      .p_proc = NFSPROC4_CB_NULL,
++      .p_encode = (kxdrproc_t)nfs4_xdr_enc_null,
++        .p_decode = (kxdrproc_t) nfs4_xdr_dec_null,
++        .p_bufsiz = 0,
++};
++
++/*
++ * Use the SETCLIENTID credential
++ */
++struct rpc_cred *
++nfsd4_lookupcred(struct nfs4_client *clp, int taskflags)
++{
++        struct auth_cred acred;
++      struct rpc_clnt *clnt = clp->cl_callback.cb_client;
++        struct rpc_cred *ret = NULL;
++
++      if (!clnt)
++              goto out;
++        get_group_info(clp->cl_cred.cr_group_info);
++        acred.uid = clp->cl_cred.cr_uid;
++        acred.gid = clp->cl_cred.cr_gid;
++        acred.group_info = clp->cl_cred.cr_group_info;
++
++        dprintk("NFSD:     looking up %s cred\n",
++                clnt->cl_auth->au_ops->au_name);
++        ret = rpcauth_lookup_credcache(clnt->cl_auth, &acred, taskflags);
++        put_group_info(clp->cl_cred.cr_group_info);
++out:
++        return ret;
++}
++
++/*
++ * Set up the callback client and put a NFSPROC4_CB_NULL on the wire...
++ */
++void
++nfsd4_probe_callback(struct nfs4_client *clp)
++{
++      struct sockaddr_in      addr;
++      struct nfs4_callback    *cb = &clp->cl_callback;
++      struct rpc_timeout      timeparms;
++      struct rpc_xprt *       xprt;
++      struct rpc_program *    program = &cb->cb_program;
++      struct rpc_stat *       stat = &cb->cb_stat;
++      struct rpc_clnt *       clnt;
++        struct rpc_message msg = {
++                .rpc_proc       = &nfs4_cb_null_proc,
++                .rpc_argp       = clp,
++        };
++      char                    hostname[32];
++      int status;
++
++      dprintk("NFSD: probe_callback. cb_parsed %d cb_set %d 1\n",
++               cb->cb_parsed, cb->cb_set);
++      if (!cb->cb_parsed || cb->cb_set)
++              goto out_err;
++
++      /* Currently, we only support tcp for the callback channel */
++      if (cb->cb_netid.len !=3 || memcmp((char *)cb->cb_netid.data, "tcp", 3))
++              goto out_err;
++
++      /* Initialize address */
++      memset(&addr, 0, sizeof(addr));
++      addr.sin_family = AF_INET;
++      addr.sin_port = htons(cb->cb_port);
++      addr.sin_addr.s_addr = htonl(cb->cb_addr);
++
++      /* Initialize timeout */
++      timeparms.to_initval = HZ;
++      timeparms.to_retries = 5;
++      timeparms.to_maxval = NFSD_LEASE_TIME*HZ;
++      timeparms.to_exponential = 1;
++
++      /* Create RPC transport */
++      if (!(xprt = xprt_create_proto(IPPROTO_TCP, &addr, &timeparms))) {
++              dprintk("NFSD: couldn't create callback transport!\n");
++              goto out_err;
++      }
++
++      /* Initialize rpc_program */
++      program->name = "nfs4_cb";
++      program->number = cb->cb_prog;
++      program->nrvers = sizeof(nfs_cb_version)/sizeof(nfs_cb_version[0]);
++      program->version = nfs_cb_version;
++      program->stats = stat;
++
++      /* Initialize rpc_stat */
++      memset(stat, 0, sizeof(struct rpc_stat));
++      stat->program = program;
++
++      /* Create RPC client
++       *
++       * XXX AUTH_UNIX only - need AUTH_GSS....
++       */
++      sprintf(hostname, "%u.%u.%u.%u", NIPQUAD(addr.sin_addr.s_addr));
++      if (!(clnt = rpc_create_client(xprt, hostname, program, 1, RPC_AUTH_UNIX))) {
++              dprintk("NFSD: couldn't create callback client\n");
++              goto out_xprt;
++      }
++      clnt->cl_intr = 1;
++      clnt->cl_softrtry = 1;
++      clnt->cl_chatty = 1;
++      cb->cb_client = clnt;
++
++      /* Kick rpciod, put the call on the wire. */
++
++      if (rpciod_up() != 0) {
++              dprintk("nfsd: couldn't start rpciod for callbacks!\n");
++              goto out_clnt;
++      }
++
++      /* the task holds a reference to the nfs4_client struct */
++      atomic_inc(&clp->cl_count);
++
++      msg.rpc_cred = nfsd4_lookupcred(clp,0);
++      status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, nfs4_cb_null, 0);
++
++      if (status != 0) {
++              dprintk("NFSD: asynchronous NFSPROC4_CB_NULL failed!\n");
++              goto out_rpciod;
++      }
++      return;
++
++out_rpciod:
++      rpciod_down();
++out_clnt:
++      rpc_shutdown_client(clnt);
++      goto out_err;
++out_xprt:
++      xprt_destroy(xprt);
++out_err:
++      dprintk("NFSD: warning: no callback path to client %.*s\n",
++              clp->cl_name.len, clp->cl_name.data);
++      cb->cb_client = NULL;
++}
++
++static void
++nfs4_cb_null(struct rpc_task *task)
++{
++      struct nfs4_client *clp = (struct nfs4_client *)task->tk_msg.rpc_argp;
++      struct nfs4_callback *cb = &clp->cl_callback;
++      u32 addr = htonl(cb->cb_addr);
++
++      dprintk("NFSD: nfs4_cb_null task->tk_status %d\n", task->tk_status);
++
++      if (task->tk_status < 0) {
++              dprintk("NFSD: callback establishment to client %.*s failed\n",
++                      clp->cl_name.len, clp->cl_name.data);
++              goto out;
++      }
++      cb->cb_set = 1;
++      dprintk("NFSD: callback set to client %u.%u.%u.%u\n", NIPQUAD(addr));
++out:
++      put_nfs4_client(clp);
++}
+--- linux-2.6.7/fs/nfsd/nfs4xdr.c.lsec 2004-06-15 23:19:52.000000000 -0600
++++ linux-2.6.7/fs/nfsd/nfs4xdr.c      2005-03-23 14:28:23.924412632 -0700
+@@ -55,6 +55,8 @@
+ #include <linux/nfsd/state.h>
+ #include <linux/nfsd/xdr4.h>
+ #include <linux/nfsd_idmap.h>
++#include <linux/nfs4.h>
++#include <linux/nfs4_acl.h>
+ #define NFSDDBG_FACILITY              NFSDDBG_XDR
+@@ -287,27 +289,40 @@ u32 *read_buf(struct nfsd4_compoundargs 
+       return p;
+ }
+-char *savemem(struct nfsd4_compoundargs *argp, u32 *p, int nbytes)
++static int
++defer_free(struct nfsd4_compoundargs *argp,
++              void (*release)(const void *), void *p)
+ {
+       struct tmpbuf *tb;
++
++      tb = kmalloc(sizeof(*tb), GFP_KERNEL);
++      if (!tb)
++              return -ENOMEM;
++      tb->buf = p;
++      tb->release = release;
++      tb->next = argp->to_free;
++      argp->to_free = tb;
++      return 0;
++}
++
++char *savemem(struct nfsd4_compoundargs *argp, u32 *p, int nbytes)
++{
++      void *new = NULL;
+       if (p == argp->tmp) {
+-              p = kmalloc(nbytes, GFP_KERNEL);
+-              if (!p) return NULL;
++              new = kmalloc(nbytes, GFP_KERNEL);
++              if (!new) return NULL;
++              p = new;
+               memcpy(p, argp->tmp, nbytes);
+       } else {
+               if (p != argp->tmpp)
+                       BUG();
+               argp->tmpp = NULL;
+       }
+-      tb = kmalloc(sizeof(*tb), GFP_KERNEL);
+-      if (!tb) {
+-              kfree(p);
++      if (defer_free(argp, kfree, p)) {
++              kfree(new);
+               return NULL;
+-      }
+-      tb->buf = p;
+-      tb->next = argp->to_free;
+-      argp->to_free = tb;
+-      return (char*)p;
++      } else
++              return (char *)p;
+ }
+@@ -335,7 +350,8 @@ nfsd4_decode_bitmap(struct nfsd4_compoun
+ }
+ static int
+-nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *iattr)
++nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *iattr,
++    struct nfs4_acl **acl)
+ {
+       int expected_len, len = 0;
+       u32 dummy32;
+@@ -364,6 +380,51 @@ nfsd4_decode_fattr(struct nfsd4_compound
+               READ64(iattr->ia_size);
+               iattr->ia_valid |= ATTR_SIZE;
+       }
++      if (bmval[0] & FATTR4_WORD0_ACL) {
++              int nace, i;
++              struct nfs4_ace ace;
++
++              READ_BUF(4); len += 4;
++              READ32(nace);
++
++              *acl = nfs4_acl_new();
++              if (*acl == NULL) {
++                      status = -ENOMEM;
++                      goto out_nfserr;
++              }
++              defer_free(argp, (void (*)(const void *))nfs4_acl_free, *acl);
++
++              for (i = 0; i < nace; i++) {
++                      READ_BUF(16); len += 16;
++                      READ32(ace.type);
++                      READ32(ace.flag);
++                      READ32(ace.access_mask);
++                      READ32(dummy32);
++                      READ_BUF(dummy32);
++                      len += XDR_QUADLEN(dummy32) << 2;
++                      READMEM(buf, dummy32);
++                      if (check_utf8(buf, dummy32))
++                              return nfserr_inval;
++                      ace.whotype = nfs4_acl_get_whotype(buf, dummy32);
++                      status = 0;
++                      if (ace.whotype != NFS4_ACL_WHO_NAMED)
++                              ace.who = 0;
++                      else if (ace.flag & NFS4_ACE_IDENTIFIER_GROUP)
++                              status = nfsd_map_name_to_gid(argp->rqstp,
++                                              buf, dummy32, &ace.who);
++                      else
++                              status = nfsd_map_name_to_uid(argp->rqstp,
++                                              buf, dummy32, &ace.who);
++                      if (status)
++                              goto out_nfserr;
++                      if (nfs4_acl_add_ace(*acl, ace.type, ace.flag,
++                               ace.access_mask, ace.whotype, ace.who) != 0) {
++                              status = -ENOMEM;
++                              goto out_nfserr;
++                      }
++              }
++      } else
++              *acl = NULL;
+       if (bmval[1] & FATTR4_WORD1_MODE) {
+               READ_BUF(4);
+               len += 4;
+@@ -549,7 +610,7 @@ nfsd4_decode_create(struct nfsd4_compoun
+       if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval)))
+               return status;
+-      if ((status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr)))
++      if ((status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, &create->cr_acl)))
+               goto out;
+       DECODE_TAIL;
+@@ -698,7 +759,7 @@ nfsd4_decode_open(struct nfsd4_compounda
+               switch (open->op_createmode) {
+               case NFS4_CREATE_UNCHECKED:
+               case NFS4_CREATE_GUARDED:
+-                      if ((status = nfsd4_decode_fattr(argp, open->op_bmval, &open->op_iattr)))
++                      if ((status = nfsd4_decode_fattr(argp, open->op_bmval, &open->op_iattr, &open->op_acl)))
+                               goto out;
+                       break;
+               case NFS4_CREATE_EXCLUSIVE:
+@@ -875,7 +936,7 @@ nfsd4_decode_setattr(struct nfsd4_compou
+       READ_BUF(sizeof(stateid_t));
+       READ32(setattr->sa_stateid.si_generation);
+       COPYMEM(&setattr->sa_stateid.si_opaque, sizeof(stateid_opaque_t));
+-      if ((status = nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr)))
++      if ((status = nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr, &setattr->sa_acl)))
+               goto out;
+       DECODE_TAIL;
+@@ -1288,32 +1349,24 @@ static u32 nfs4_ftypes[16] = {
+         NF4SOCK, NF4BAD,  NF4LNK, NF4BAD,
+ };
+-static inline int
+-xdr_padding(int l)
+-{
+-       return 3 - ((l - 1) & 3); /* smallest i>=0 such that (l+i)%4 = 0 */
+-}
+-
+ static int
+-nfsd4_encode_name(struct svc_rqst *rqstp, int group, uid_t id,
++nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group,
+                       u32 **p, int *buflen)
+ {
+       int status;
+-      u32 len;
+       if (*buflen < (XDR_QUADLEN(IDMAP_NAMESZ) << 2) + 4)
+               return nfserr_resource;
+-      if (group)
++      if (whotype != NFS4_ACL_WHO_NAMED)
++              status = nfs4_acl_write_who(whotype, (u8 *)(*p + 1));
++      else if (group)
+               status = nfsd_map_gid_to_name(rqstp, id, (u8 *)(*p + 1));
+       else
+               status = nfsd_map_uid_to_name(rqstp, id, (u8 *)(*p + 1));
+       if (status < 0)
+               return nfserrno(status);
+-      len = (unsigned)status;
+-      *(*p)++ = htonl(len);
+-      memset((u8 *)*p + len, 0, xdr_padding(len));
+-      *p += XDR_QUADLEN(len);
+-      *buflen -= (XDR_QUADLEN(len) << 2) + 4;
++      *p = xdr_encode_opaque(*p, NULL, status);
++      *buflen -= (XDR_QUADLEN(status) << 2) + 4;
+       BUG_ON(*buflen < 0);
+       return 0;
+ }
+@@ -1321,13 +1374,20 @@ nfsd4_encode_name(struct svc_rqst *rqstp
+ static inline int
+ nfsd4_encode_user(struct svc_rqst *rqstp, uid_t uid, u32 **p, int *buflen)
+ {
+-      return nfsd4_encode_name(rqstp, uid, 0, p, buflen);
++      return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, uid, 0, p, buflen);
+ }
+ static inline int
+ nfsd4_encode_group(struct svc_rqst *rqstp, uid_t gid, u32 **p, int *buflen)
+ {
+-      return nfsd4_encode_name(rqstp, gid, 1, p, buflen);
++      return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, gid, 1, p, buflen);
++}
++
++static inline int
++nfsd4_encode_aclname(struct svc_rqst *rqstp, int whotype, uid_t id, int group,
++              u32 **p, int *buflen)
++{
++      return nfsd4_encode_name(rqstp, whotype, id, group, p, buflen);
+ }
+@@ -1354,6 +1414,8 @@ nfsd4_encode_fattr(struct svc_fh *fhp, s
+       u64 dummy64;
+       u32 *p = buffer;
+       int status;
++      int aclsupport = 0;
++      struct nfs4_acl *acl = NULL;
+       BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1);
+       BUG_ON(bmval0 & ~NFSD_SUPPORTED_ATTRS_WORD0);
+@@ -1376,6 +1438,17 @@ nfsd4_encode_fattr(struct svc_fh *fhp, s
+                       goto out;
+               fhp = &tempfh;
+       }
++      if (bmval0 & (FATTR4_WORD0_ACL | FATTR4_WORD0_ACLSUPPORT
++                      | FATTR4_WORD0_SUPPORTED_ATTRS)) {
++              status = nfsd4_get_nfs4_acl(rqstp, dentry, &acl);
++              aclsupport = (status == 0);
++              if (bmval0 & FATTR4_WORD0_ACL) {
++                      if (status == -EOPNOTSUPP)
++                              bmval0 &= ~FATTR4_WORD0_ACL;
++                      else if (status != 0)
++                              goto out_nfserr;
++              }
++      }
+       if ((buflen -= 16) < 0)
+               goto out_resource;
+@@ -1388,7 +1461,9 @@ nfsd4_encode_fattr(struct svc_fh *fhp, s
+               if ((buflen -= 12) < 0)
+                       goto out_resource;
+               WRITE32(2);
+-              WRITE32(NFSD_SUPPORTED_ATTRS_WORD0);
++              WRITE32(aclsupport ?
++                      NFSD_SUPPORTED_ATTRS_WORD0 :
++                      NFSD_SUPPORTED_ATTRS_WORD0 & ~FATTR4_WORD0_ACL);
+               WRITE32(NFSD_SUPPORTED_ATTRS_WORD1);
+       }
+       if (bmval0 & FATTR4_WORD0_TYPE) {
+@@ -1459,10 +1534,44 @@ nfsd4_encode_fattr(struct svc_fh *fhp, s
+                       goto out_resource;
+               WRITE32(0);
+       }
++      if (bmval0 & FATTR4_WORD0_ACL) {
++              struct nfs4_ace *ace;
++              struct list_head *h;
++
++              if (acl == NULL) {
++                      if ((buflen -= 4) < 0)
++                              goto out_resource;
++
++                      WRITE32(0);
++                      goto out_acl;
++              }
++              if ((buflen -= 4) < 0)
++                      goto out_resource;
++              WRITE32(acl->naces);
++
++              list_for_each(h, &acl->ace_head) {
++                      ace = list_entry(h, struct nfs4_ace, l_ace);
++
++                      if ((buflen -= 4*3) < 0)
++                              goto out_resource;
++                      WRITE32(ace->type);
++                      WRITE32(ace->flag);
++                      WRITE32(ace->access_mask & NFS4_ACE_MASK_ALL);
++                      status = nfsd4_encode_aclname(rqstp, ace->whotype,
++                              ace->who, ace->flag & NFS4_ACE_IDENTIFIER_GROUP,
++                              &p, &buflen);
++                      if (status == nfserr_resource)
++                              goto out_resource;
++                      if (status)
++                              goto out;
++              }
++      }
++out_acl:
+       if (bmval0 & FATTR4_WORD0_ACLSUPPORT) {
+               if ((buflen -= 4) < 0)
+                       goto out_resource;
+-              WRITE32(0);
++              WRITE32(aclsupport ?
++                      ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL : 0);
+       }
+       if (bmval0 & FATTR4_WORD0_CANSETTIME) {
+               if ((buflen -= 4) < 0)
+@@ -1645,6 +1754,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, s
+       status = nfs_ok;
+ out:
++      nfs4_acl_free(acl);
+       if (fhp == &tempfh)
+               fh_put(&tempfh);
+       return status;
+@@ -2471,6 +2581,24 @@ nfs4svc_encode_voidres(struct svc_rqst *
+         return xdr_ressize_check(rqstp, p);
+ }
++void nfsd4_release_compoundargs(struct nfsd4_compoundargs *args)
++{
++      if (args->ops != args->iops) {
++              kfree(args->ops);
++              args->ops = args->iops;
++      }
++      if (args->tmpp) {
++              kfree(args->tmpp);
++              args->tmpp = NULL;
++      }
++      while (args->to_free) {
++              struct tmpbuf *tb = args->to_free;
++              args->to_free = tb->next;
++              tb->release(tb->buf);
++              kfree(tb);
++      }
++}
++
+ int
+ nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, u32 *p, struct nfsd4_compoundargs *args)
+ {
+@@ -2487,20 +2615,7 @@ nfs4svc_decode_compoundargs(struct svc_r
+       status = nfsd4_decode_compound(args);
+       if (status) {
+-              if (args->ops != args->iops) {
+-                      kfree(args->ops);
+-                      args->ops = args->iops;
+-              }
+-              if (args->tmpp) {
+-                      kfree(args->tmpp);
+-                      args->tmpp = NULL;
+-              }
+-              while (args->to_free) {
+-                      struct tmpbuf *tb = args->to_free;
+-                      args->to_free = tb->next;
+-                      kfree(tb->buf);
+-                      kfree(tb);
+-              }
++              nfsd4_release_compoundargs(args);
+       }
+       return !status;
+ }
+--- linux-2.6.7/fs/nfsd/nfs4proc.c.lsec        2004-06-15 23:20:26.000000000 -0600
++++ linux-2.6.7/fs/nfsd/nfs4proc.c     2005-03-23 14:28:24.080388920 -0700
+@@ -52,6 +52,7 @@
+ #include <linux/nfs4.h>
+ #include <linux/nfsd/state.h>
+ #include <linux/nfsd/xdr4.h>
++#include <linux/nfs4_acl.h>
+ #define NFSDDBG_FACILITY              NFSDDBG_PROC
+@@ -135,9 +136,11 @@ do_open_fhandle(struct svc_rqst *rqstp, 
+ {
+       int status;
+-      dprintk("NFSD: do_open_fhandle\n");
++      /* Only reclaims from previously confirmed clients are valid */
++      if ((status = nfs4_check_open_reclaim(&open->op_clientid)))
++              return status;
+-      /* we don't know the target directory, and therefore can not
++      /* We don't know the target directory, and therefore can not
+       * set the change info
+       */
+@@ -172,8 +175,7 @@ nfsd4_open(struct svc_rqst *rqstp, struc
+       if (nfs4_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
+               return nfserr_grace;
+-      if (nfs4_in_no_grace() &&
+-                         open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
++      if (!nfs4_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
+               return nfserr_no_grace;
+       /* This check required by spec. */
+@@ -318,7 +320,7 @@ nfsd4_commit(struct svc_rqst *rqstp, str
+       return status;
+ }
+-static inline int
++static int
+ nfsd4_create(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_create *create)
+ {
+       struct svc_fh resfh;
+@@ -435,7 +437,7 @@ nfsd4_link(struct svc_rqst *rqstp, struc
+       return status;
+ }
+-static inline int
++static int
+ nfsd4_lookupp(struct svc_rqst *rqstp, struct svc_fh *current_fh)
+ {
+       struct svc_fh tmp_fh;
+@@ -619,7 +621,7 @@ nfsd4_setattr(struct svc_rqst *rqstp, st
+               status = nfserr_bad_stateid;
+               if (ZERO_STATEID(&setattr->sa_stateid) || ONE_STATEID(&setattr->sa_stateid)) {
+                       dprintk("NFSD: nfsd4_setattr: magic stateid!\n");
+-                      return status;
++                      goto out;
+               }
+               nfs4_lock_state();
+@@ -627,17 +629,25 @@ nfsd4_setattr(struct svc_rqst *rqstp, st
+                                               &setattr->sa_stateid, 
+                                               CHECK_FH | RDWR_STATE, &stp))) {
+                       dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n");
+-                      goto out;
++                      goto out_unlock;
+               }
+               status = nfserr_openmode;
+               if (!access_bits_permit_write(stp->st_access_bmap)) {
+                       dprintk("NFSD: nfsd4_setattr: not opened for write!\n");
+-                      goto out;
++                      goto out_unlock;
+               }
+               nfs4_unlock_state();
+       }
+-      return (nfsd_setattr(rqstp, current_fh, &setattr->sa_iattr, 0, (time_t)0));
++      status = nfs_ok;
++      if (setattr->sa_acl != NULL)
++              status = nfsd4_set_nfs4_acl(rqstp, current_fh, setattr->sa_acl);
++      if (status)
++              goto out;
++      status = nfsd_setattr(rqstp, current_fh, &setattr->sa_iattr,
++                              0, (time_t)0);
+ out:
++      return status;
++out_unlock:
+       nfs4_unlock_state();
+       return status;
+ }
+@@ -773,13 +783,20 @@ nfsd4_proc_compound(struct svc_rqst *rqs
+                   struct nfsd4_compoundres *resp)
+ {
+       struct nfsd4_op *op;
+-      struct svc_fh   current_fh;
+-      struct svc_fh   save_fh;
++      struct svc_fh   *current_fh = NULL;
++      struct svc_fh   *save_fh = NULL;
+       int             slack_space;    /* in words, not bytes! */
+       int             status;
+-      fh_init(&current_fh, NFS4_FHSIZE);
+-      fh_init(&save_fh, NFS4_FHSIZE);
++      status = nfserr_resource;
++      current_fh = kmalloc(sizeof(*current_fh), GFP_KERNEL);
++      if (current_fh == NULL)
++              goto out;
++      fh_init(current_fh, NFS4_FHSIZE);
++      save_fh = kmalloc(sizeof(*save_fh), GFP_KERNEL);
++      if (save_fh == NULL)
++              goto out;
++      fh_init(save_fh, NFS4_FHSIZE);
+       resp->xbuf = &rqstp->rq_res;
+       resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len;
+@@ -831,7 +848,7 @@ nfsd4_proc_compound(struct svc_rqst *rqs
+               * SETATTR NOFILEHANDLE error handled in nfsd4_setattr
+               * due to required returned bitmap argument
+               */
+-              if ((!current_fh.fh_dentry) &&
++              if ((!current_fh->fh_dentry) &&
+                  !((op->opnum == OP_PUTFH) || (op->opnum == OP_PUTROOTFH) ||
+                  (op->opnum == OP_SETCLIENTID) ||
+                  (op->opnum == OP_SETCLIENTID_CONFIRM) ||
+@@ -843,105 +860,105 @@ nfsd4_proc_compound(struct svc_rqst *rqs
+               }
+               switch (op->opnum) {
+               case OP_ACCESS:
+-                      op->status = nfsd4_access(rqstp, &current_fh, &op->u.access);
++                      op->status = nfsd4_access(rqstp, current_fh, &op->u.access);
+                       break;
+               case OP_CLOSE:
+-                      op->status = nfsd4_close(rqstp, &current_fh, &op->u.close);
++                      op->status = nfsd4_close(rqstp, current_fh, &op->u.close);
+                       if (op->u.close.cl_stateowner)
+                               op->replay =
+                                       &op->u.close.cl_stateowner->so_replay;
+                       break;
+               case OP_COMMIT:
+-                      op->status = nfsd4_commit(rqstp, &current_fh, &op->u.commit);
++                      op->status = nfsd4_commit(rqstp, current_fh, &op->u.commit);
+                       break;
+               case OP_CREATE:
+-                      op->status = nfsd4_create(rqstp, &current_fh, &op->u.create);
++                      op->status = nfsd4_create(rqstp, current_fh, &op->u.create);
+                       break;
+               case OP_GETATTR:
+-                      op->status = nfsd4_getattr(rqstp, &current_fh, &op->u.getattr);
++                      op->status = nfsd4_getattr(rqstp, current_fh, &op->u.getattr);
+                       break;
+               case OP_GETFH:
+-                      op->status = nfsd4_getfh(&current_fh, &op->u.getfh);
++                      op->status = nfsd4_getfh(current_fh, &op->u.getfh);
+                       break;
+               case OP_LINK:
+-                      op->status = nfsd4_link(rqstp, &current_fh, &save_fh, &op->u.link);
++                      op->status = nfsd4_link(rqstp, current_fh, save_fh, &op->u.link);
+                       break;
+               case OP_LOCK:
+-                      op->status = nfsd4_lock(rqstp, &current_fh, &op->u.lock);
++                      op->status = nfsd4_lock(rqstp, current_fh, &op->u.lock);
+                       if (op->u.lock.lk_stateowner)
+                               op->replay =
+                                       &op->u.lock.lk_stateowner->so_replay;
+                       break;
+               case OP_LOCKT:
+-                      op->status = nfsd4_lockt(rqstp, &current_fh, &op->u.lockt);
++                      op->status = nfsd4_lockt(rqstp, current_fh, &op->u.lockt);
+                       break;
+               case OP_LOCKU:
+-                      op->status = nfsd4_locku(rqstp, &current_fh, &op->u.locku);
++                      op->status = nfsd4_locku(rqstp, current_fh, &op->u.locku);
+                       if (op->u.locku.lu_stateowner)
+                               op->replay =
+                                       &op->u.locku.lu_stateowner->so_replay;
+                       break;
+               case OP_LOOKUP:
+-                      op->status = nfsd4_lookup(rqstp, &current_fh, &op->u.lookup);
++                      op->status = nfsd4_lookup(rqstp, current_fh, &op->u.lookup);
+                       break;
+               case OP_LOOKUPP:
+-                      op->status = nfsd4_lookupp(rqstp, &current_fh);
++                      op->status = nfsd4_lookupp(rqstp, current_fh);
+                       break;
+               case OP_NVERIFY:
+-                      op->status = nfsd4_verify(rqstp, &current_fh, &op->u.nverify);
++                      op->status = nfsd4_verify(rqstp, current_fh, &op->u.nverify);
+                       if (op->status == nfserr_not_same)
+                               op->status = nfs_ok;
+                       break;
+               case OP_OPEN:
+-                      op->status = nfsd4_open(rqstp, &current_fh, &op->u.open);
++                      op->status = nfsd4_open(rqstp, current_fh, &op->u.open);
+                       if (op->u.open.op_stateowner)
+                               op->replay =
+                                       &op->u.open.op_stateowner->so_replay;
+                       break;
+               case OP_OPEN_CONFIRM:
+-                      op->status = nfsd4_open_confirm(rqstp, &current_fh, &op->u.open_confirm);
++                      op->status = nfsd4_open_confirm(rqstp, current_fh, &op->u.open_confirm);
+                       if (op->u.open_confirm.oc_stateowner)
+                               op->replay =
+                                       &op->u.open_confirm.oc_stateowner->so_replay;
+                       break;
+               case OP_OPEN_DOWNGRADE:
+-                      op->status = nfsd4_open_downgrade(rqstp, &current_fh, &op->u.open_downgrade);
++                      op->status = nfsd4_open_downgrade(rqstp, current_fh, &op->u.open_downgrade);
+                       if (op->u.open_downgrade.od_stateowner)
+                               op->replay =
+                                       &op->u.open_downgrade.od_stateowner->so_replay;
+                       break;
+               case OP_PUTFH:
+-                      op->status = nfsd4_putfh(rqstp, &current_fh, &op->u.putfh);
++                      op->status = nfsd4_putfh(rqstp, current_fh, &op->u.putfh);
+                       break;
+               case OP_PUTROOTFH:
+-                      op->status = nfsd4_putrootfh(rqstp, &current_fh);
++                      op->status = nfsd4_putrootfh(rqstp, current_fh);
+                       break;
+               case OP_READ:
+-                      op->status = nfsd4_read(rqstp, &current_fh, &op->u.read);
++                      op->status = nfsd4_read(rqstp, current_fh, &op->u.read);
+                       break;
+               case OP_READDIR:
+-                      op->status = nfsd4_readdir(rqstp, &current_fh, &op->u.readdir);
++                      op->status = nfsd4_readdir(rqstp, current_fh, &op->u.readdir);
+                       break;
+               case OP_READLINK:
+-                      op->status = nfsd4_readlink(rqstp, &current_fh, &op->u.readlink);
++                      op->status = nfsd4_readlink(rqstp, current_fh, &op->u.readlink);
+                       break;
+               case OP_REMOVE:
+-                      op->status = nfsd4_remove(rqstp, &current_fh, &op->u.remove);
++                      op->status = nfsd4_remove(rqstp, current_fh, &op->u.remove);
+                       break;
+               case OP_RENAME:
+-                      op->status = nfsd4_rename(rqstp, &current_fh, &save_fh, &op->u.rename);
++                      op->status = nfsd4_rename(rqstp, current_fh, save_fh, &op->u.rename);
+                       break;
+               case OP_RENEW:
+                       op->status = nfsd4_renew(&op->u.renew);
+                       break;
+               case OP_RESTOREFH:
+-                      op->status = nfsd4_restorefh(&current_fh, &save_fh);
++                      op->status = nfsd4_restorefh(current_fh, save_fh);
+                       break;
+               case OP_SAVEFH:
+-                      op->status = nfsd4_savefh(&current_fh, &save_fh);
++                      op->status = nfsd4_savefh(current_fh, save_fh);
+                       break;
+               case OP_SETATTR:
+-                      op->status = nfsd4_setattr(rqstp, &current_fh, &op->u.setattr);
++                      op->status = nfsd4_setattr(rqstp, current_fh, &op->u.setattr);
+                       break;
+               case OP_SETCLIENTID:
+                       op->status = nfsd4_setclientid(rqstp, &op->u.setclientid);
+@@ -950,12 +967,12 @@ nfsd4_proc_compound(struct svc_rqst *rqs
+                       op->status = nfsd4_setclientid_confirm(rqstp, &op->u.setclientid_confirm);
+                       break;
+               case OP_VERIFY:
+-                      op->status = nfsd4_verify(rqstp, &current_fh, &op->u.verify);
++                      op->status = nfsd4_verify(rqstp, current_fh, &op->u.verify);
+                       if (op->status == nfserr_same)
+                               op->status = nfs_ok;
+                       break;
+               case OP_WRITE:
+-                      op->status = nfsd4_write(rqstp, &current_fh, &op->u.write);
++                      op->status = nfsd4_write(rqstp, current_fh, &op->u.write);
+                       break;
+               case OP_RELEASE_LOCKOWNER:
+                       op->status = nfsd4_release_lockowner(rqstp, &op->u.release_lockowner);
+@@ -976,22 +993,13 @@ encode_op:
+       }
+ out:
+-      if (args->ops != args->iops) {
+-              kfree(args->ops);
+-              args->ops = args->iops;
+-      }
+-      if (args->tmpp) {
+-              kfree(args->tmpp);
+-              args->tmpp = NULL;
+-      }
+-      while (args->to_free) {
+-              struct tmpbuf *tb = args->to_free;
+-              args->to_free = tb->next;
+-              kfree(tb->buf);
+-              kfree(tb);
+-      }
+-      fh_put(&current_fh);
+-      fh_put(&save_fh);
++      nfsd4_release_compoundargs(args);
++      if (current_fh)
++              fh_put(current_fh);
++      kfree(current_fh);
++      if (save_fh)
++              fh_put(save_fh);
++      kfree(save_fh);
+       return status;
+ }
+--- linux-2.6.7/fs/nfsd/nfs4state.c.lsec       2004-06-15 23:19:43.000000000 -0600
++++ linux-2.6.7/fs/nfsd/nfs4state.c    2005-03-23 14:28:24.028396824 -0700
+@@ -51,6 +51,9 @@
+ #define NFSDDBG_FACILITY                NFSDDBG_PROC
+ /* Globals */
++static time_t lease_time = 90;     /* default lease time */
++static time_t old_lease_time = 90; /* past incarnation lease time */
++static u32 nfs4_reclaim_init = 0;
+ time_t boot_time;
+ static time_t grace_end = 0;
+ static u32 current_clientid = 1;
+@@ -82,7 +85,7 @@ struct nfs4_stateid * find_stateid(state
+  *    protects clientid_hashtbl[], clientstr_hashtbl[],
+  *    unconfstr_hashtbl[], uncofid_hashtbl[].
+  */
+-static struct semaphore client_sema;
++static DECLARE_MUTEX(client_sema);
+ void
+ nfs4_lock_state(void)
+@@ -131,8 +134,11 @@ static void release_file(struct nfs4_fil
+       ((id) & CLIENT_HASH_MASK)
+ #define clientstr_hashval(name, namelen) \
+       (opaque_hashval((name), (namelen)) & CLIENT_HASH_MASK)
+-
+-/* conf_id_hashtbl[], and conf_str_hashtbl[] hold confirmed
++/*
++ * reclaim_str_hashtbl[] holds known client info from previous reset/reboot
++ * used in reboot/reset lease grace period processing
++ *
++ * conf_id_hashtbl[], and conf_str_hashtbl[] hold confirmed
+  * setclientid_confirmed info. 
+  *
+  * unconf_str_hastbl[] and unconf_id_hashtbl[] hold unconfirmed 
+@@ -144,6 +150,8 @@ static void release_file(struct nfs4_fil
+  * close_lru holds (open) stateowner queue ordered by nfs4_stateowner.so_time
+  * for last close replay.
+  */
++static struct list_head       reclaim_str_hashtbl[CLIENT_HASH_SIZE];
++static int reclaim_str_hashtbl_size;
+ static struct list_head       conf_id_hashtbl[CLIENT_HASH_SIZE];
+ static struct list_head       conf_str_hashtbl[CLIENT_HASH_SIZE];
+ static struct list_head       unconf_str_hashtbl[CLIENT_HASH_SIZE];
+@@ -208,12 +216,20 @@ free_client(struct nfs4_client *clp)
+       kfree(clp);
+ }
+-static void
++void
++put_nfs4_client(struct nfs4_client *clp)
++{
++      if (atomic_dec_and_test(&clp->cl_count))
++              free_client(clp);
++}
++
++void
+ expire_client(struct nfs4_client *clp)
+ {
+       struct nfs4_stateowner *sop;
+-      dprintk("NFSD: expire_client\n");
++      dprintk("NFSD: expire_client cl_count %d\n",
++                  atomic_read(&clp->cl_count));
+       list_del(&clp->cl_idhash);
+       list_del(&clp->cl_strhash);
+       list_del(&clp->cl_lru);
+@@ -221,7 +237,7 @@ expire_client(struct nfs4_client *clp)
+               sop = list_entry(clp->cl_perclient.next, struct nfs4_stateowner, so_perclient);
+               release_stateowner(sop);
+       }
+-      free_client(clp);
++      put_nfs4_client(clp);
+ }
+ static struct nfs4_client *
+@@ -230,6 +246,7 @@ create_client(struct xdr_netobj name) {
+       if(!(clp = alloc_client(name)))
+               goto out;
++      atomic_set(&clp->cl_count, 1);
+       INIT_LIST_HEAD(&clp->cl_idhash);
+       INIT_LIST_HEAD(&clp->cl_strhash);
+       INIT_LIST_HEAD(&clp->cl_perclient);
+@@ -339,6 +356,99 @@ move_to_confirmed(struct nfs4_client *cl
+       renew_client(clp);
+ }
++
++/* a helper function for parse_callback */
++static int
++parse_octet(unsigned int *lenp, char **addrp)
++{
++      unsigned int len = *lenp;
++      char *p = *addrp;
++      int n = -1;
++      char c;
++
++      for (;;) {
++              if (!len)
++                      break;
++              len--;
++              c = *p++;
++              if (c == '.')
++                      break;
++              if ((c < '0') || (c > '9')) {
++                      n = -1;
++                      break;
++              }
++              if (n < 0)
++                      n = 0;
++              n = (n * 10) + (c - '0');
++              if (n > 255) {
++                      n = -1;
++                      break;
++              }
++      }
++      *lenp = len;
++      *addrp = p;
++      return n;
++}
++
++/* parse and set the setclientid ipv4 callback address */
++int
++parse_ipv4(unsigned int addr_len, char *addr_val, unsigned int *cbaddrp, unsigned short *cbportp)
++{
++      int temp = 0;
++      u32 cbaddr = 0;
++      u16 cbport = 0;
++      u32 addrlen = addr_len;
++      char *addr = addr_val;
++      int i, shift;
++
++      /* ipaddress */
++      shift = 24;
++      for(i = 4; i > 0  ; i--) {
++              if ((temp = parse_octet(&addrlen, &addr)) < 0) {
++                      return 0;
++              }
++              cbaddr |= (temp << shift);
++              if(shift > 0)
++                      shift -= 8;
++      }
++      *cbaddrp = cbaddr;
++
++      /* port */
++      shift = 8;
++      for(i = 2; i > 0  ; i--) {
++              if ((temp = parse_octet(&addrlen, &addr)) < 0) {
++                      return 0;
++              }
++              cbport |= (temp << shift);
++              if(shift > 0)
++                      shift -= 8;
++      }
++      *cbportp = cbport;
++      return 1;
++}
++
++void
++gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se)
++{
++      struct nfs4_callback *cb = &clp->cl_callback;
++
++      if( !(parse_ipv4(se->se_callback_addr_len, se->se_callback_addr_val,
++                       &cb->cb_addr, &cb->cb_port))) {
++              printk(KERN_INFO "NFSD: BAD callback address; this client "
++                      "(clientid %08x/%08x) will not receive delegations\n",
++                      clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id);
++
++              cb->cb_parsed = 0;
++              return;
++      }
++      cb->cb_netid.len = se->se_callback_netid_len;
++      cb->cb_netid.data = se->se_callback_netid_val;
++      cb->cb_prog = se->se_callback_prog;
++      cb->cb_ident = se->se_callback_ident;
++      cb->cb_parsed = 1;
++}
++
+ /*
+  * RFC 3010 has a complex implmentation description of processing a 
+  * SETCLIENTID request consisting of 5 bullets, labeled as 
+@@ -450,6 +560,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp
+               copy_cred(&new->cl_cred,&rqstp->rq_cred);
+               gen_clid(new);
+               gen_confirm(new);
++              gen_callback(new, setclid);
+               add_to_unconfirmed(new, strhashval);
+       } else if (cmp_verf(&conf->cl_verifier, &clverifier)) {
+               /*
+@@ -477,6 +588,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp
+               copy_cred(&new->cl_cred,&rqstp->rq_cred);
+               copy_clid(new, conf);
+               gen_confirm(new);
++              gen_callback(new, setclid);
+               add_to_unconfirmed(new,strhashval);
+       } else if (!unconf) {
+               /*
+@@ -494,6 +606,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp
+               copy_cred(&new->cl_cred,&rqstp->rq_cred);
+               gen_clid(new);
+               gen_confirm(new);
++              gen_callback(new, setclid);
+               add_to_unconfirmed(new, strhashval);
+       } else if (!cmp_verf(&conf->cl_confirm, &unconf->cl_confirm)) {
+               /*      
+@@ -519,6 +632,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp
+               copy_cred(&new->cl_cred,&rqstp->rq_cred);
+               gen_clid(new);
+               gen_confirm(new);
++              gen_callback(new, setclid);
+               add_to_unconfirmed(new, strhashval);
+       } else {
+               /* No cases hit !!! */
+@@ -529,7 +643,6 @@ nfsd4_setclientid(struct svc_rqst *rqstp
+       setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
+       setclid->se_clientid.cl_id = new->cl_clientid.cl_id;
+       memcpy(setclid->se_confirm.data, new->cl_confirm.data, sizeof(setclid->se_confirm.data));
+-      printk(KERN_INFO "NFSD: this client will not receive delegations\n");
+       status = nfs_ok;
+ out:
+       nfs4_unlock_state();
+@@ -575,7 +688,7 @@ nfsd4_setclientid_confirm(struct svc_rqs
+                * not been found.
+                */
+               if (clp->cl_addr != ip_addr) { 
+-                      printk("NFSD: setclientid: string in use by client"
++                      dprintk("NFSD: setclientid: string in use by client"
+                       "(clientid %08x/%08x)\n",
+                       clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id);
+                       goto out;
+@@ -588,7 +701,7 @@ nfsd4_setclientid_confirm(struct svc_rqs
+                       continue;
+               status = nfserr_inval;
+               if (clp->cl_addr != ip_addr) { 
+-                      printk("NFSD: setclientid: string in use by client"
++                      dprintk("NFSD: setclientid: string in use by client"
+                       "(clientid %08x/%08x)\n",
+                       clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id);
+                       goto out;
+@@ -610,6 +723,7 @@ nfsd4_setclientid_confirm(struct svc_rqs
+                       status = nfserr_clid_inuse;
+               else {
+                       expire_client(conf);
++                      clp = unconf;
+                       move_to_confirmed(unconf, idhashval);
+                       status = nfs_ok;
+               }
+@@ -627,6 +741,7 @@ nfsd4_setclientid_confirm(struct svc_rqs
+               if (!cmp_creds(&conf->cl_cred,&rqstp->rq_cred)) {
+                       status = nfserr_clid_inuse;
+               } else {
++                      clp = conf;
+                       status = nfs_ok;
+               }
+               goto out;
+@@ -641,6 +756,7 @@ nfsd4_setclientid_confirm(struct svc_rqs
+                       status = nfserr_clid_inuse;
+               } else {
+                       status = nfs_ok;
++                      clp = unconf;
+                       move_to_confirmed(unconf, idhashval);
+               }
+               goto out;
+@@ -660,7 +776,9 @@ nfsd4_setclientid_confirm(struct svc_rqs
+       status = nfserr_inval;
+       goto out;
+ out:
+-      /* XXX if status == nfs_ok, probe callback path */
++      if (!status)
++              nfsd4_probe_callback(clp);
++
+       nfs4_unlock_state();
+       return status;
+ }
+@@ -1510,10 +1628,12 @@ nfs4_preprocess_seqid_op(struct svc_fh *
+       status = nfserr_bad_stateid;
+-      /* for new lock stateowners, check that the lock->v.new.open_stateid
+-       * refers to an open stateowner, and that the lockclid
+-       * (nfs4_lock->v.new.clientid) is the same as the
+-       * open_stateid->st_stateowner->so_client->clientid
++      /* for new lock stateowners: 
++       * check that the lock->v.new.open_stateid
++       * refers to an open stateowner
++       * 
++       * check that the lockclid (nfs4_lock->v.new.clientid) is the same 
++       * as the open_stateid->st_stateowner->so_client->clientid
+        */
+       if (lockclid) {
+               struct nfs4_stateowner *sop = stp->st_stateowner;
+@@ -1599,6 +1719,17 @@ check_replay:
+ }
+ /*
++ * eventually, this will perform an upcall to the 'state daemon' as well as
++ * set the cl_first_state field.
++ */
++void
++first_state(struct nfs4_client *clp)
++{
++      if (!clp->cl_first_state)
++              clp->cl_first_state = get_seconds();
++}
++
++/*
+  * nfs4_unlock_state(); called in encode
+  */
+ int
+@@ -1635,6 +1766,7 @@ nfsd4_open_confirm(struct svc_rqst *rqst
+                        stp->st_stateid.si_fileid,
+                        stp->st_stateid.si_generation);
+       status = nfs_ok;
++      first_state(sop->so_client);
+ out:
+       return status;
+ }
+@@ -1850,6 +1982,21 @@ nfs4_set_lock_denied(struct file_lock *f
+               deny->ld_type = NFS4_WRITE_LT;
+ }
++static struct nfs4_stateowner *
++find_lockstateowner(struct xdr_netobj *owner, clientid_t *clid)
++{
++      struct nfs4_stateowner *local = NULL;
++      int i;
++
++      for (i = 0; i < LOCK_HASH_SIZE; i++) {
++              list_for_each_entry(local, &lock_ownerid_hashtbl[i], so_idhash) {
++                      if(!cmp_owner_str(local, owner, clid))
++                              continue;
++                      return local;
++              }
++      }
++      return NULL;
++}
+ static int
+ find_lockstateowner_str(unsigned int hashval, struct xdr_netobj *owner, clientid_t *clid, struct nfs4_stateowner **op) {
+@@ -1969,7 +2116,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struc
+       if (nfs4_in_grace() && !lock->lk_reclaim)
+               return nfserr_grace;
+-      if (nfs4_in_no_grace() && lock->lk_reclaim)
++      if (!nfs4_in_grace() && lock->lk_reclaim)
+               return nfserr_no_grace;
+       if (check_lock_length(lock->lk_offset, lock->lk_length))
+@@ -1992,7 +2139,11 @@ nfsd4_lock(struct svc_rqst *rqstp, struc
+                       printk("NFSD: nfsd4_lock: clientid is stale!\n");
+                       goto out;
+               }
+-              /* does the clientid in the lock owner own the open stateid? */
++
++              /* is the new lock seqid presented by the client zero? */
++              status = nfserr_bad_seqid;
++              if (lock->v.new.lock_seqid != 0)
++                      goto out;
+               /* validate and update open stateid and open seqid */
+               status = nfs4_preprocess_seqid_op(current_fh, 
+@@ -2011,15 +2162,15 @@ nfsd4_lock(struct svc_rqst *rqstp, struc
+               strhashval = lock_ownerstr_hashval(fp->fi_inode, 
+                               open_sop->so_client->cl_clientid.cl_id, 
+                               lock->v.new.owner);
+-
+               /* 
+                * If we already have this lock owner, the client is in 
+                * error (or our bookeeping is wrong!) 
+                * for asking for a 'new lock'.
+                */
+               status = nfserr_bad_stateid;
+-              if (find_lockstateowner_str(strhashval, &lock->v.new.owner,
+-                                      &lock->v.new.clientid, &lock_sop))
++              lock_sop = find_lockstateowner(&lock->v.new.owner,
++                                              &lock->v.new.clientid);
++              if (lock_sop)
+                       goto out;
+               status = nfserr_resource;
+               if (!(lock->lk_stateowner = alloc_init_lock_stateowner(strhashval, open_sop->so_client, open_stp, lock)))
+@@ -2315,7 +2466,7 @@ nfsd4_release_lockowner(struct svc_rqst 
+       clientid_t *clid = &rlockowner->rl_clientid;
+       struct nfs4_stateowner *local = NULL;
+       struct xdr_netobj *owner = &rlockowner->rl_owner;
+-      int status, i;
++      int status;
+       dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n",
+               clid->cl_boot, clid->cl_id);
+@@ -2330,34 +2481,136 @@ nfsd4_release_lockowner(struct svc_rqst 
+       nfs4_lock_state();
+-      /* find the lockowner */
+         status = nfs_ok;
+-      for (i=0; i < LOCK_HASH_SIZE; i++)
+-              list_for_each_entry(local, &lock_ownerstr_hashtbl[i], so_strhash)
+-                      if(cmp_owner_str(local, owner, clid)) {
+-                              struct nfs4_stateid *stp;
+-
+-                              /* check for any locks held by any stateid
+-                               * associated with the (lock) stateowner */
+-                              status = nfserr_locks_held;
+-                              list_for_each_entry(stp, &local->so_perfilestate,
+-                                                  st_perfilestate) {
+-                                      if(stp->st_vfs_set) {
+-                                              if (check_for_locks(&stp->st_vfs_file,
+-                                                                  local))
+-                                                      goto out;
+-                                      }
+-                              }
+-                              /* no locks held by (lock) stateowner */
+-                              status = nfs_ok;
+-                              release_stateowner(local);
+-                              goto out;
++      local = find_lockstateowner(owner, clid);
++      if (local) {
++              struct nfs4_stateid *stp;
++
++              /* check for any locks held by any stateid
++               * associated with the (lock) stateowner */
++              status = nfserr_locks_held;
++              list_for_each_entry(stp, &local->so_perfilestate,
++                              st_perfilestate) {
++                      if(stp->st_vfs_set) {
++                              if (check_for_locks(&stp->st_vfs_file, local))
++                                      goto out;
+                       }
++              }
++              /* no locks held by (lock) stateowner */
++              status = nfs_ok;
++              release_stateowner(local);
++      }
+ out:
+       nfs4_unlock_state();
+       return status;
+ }
++static inline struct nfs4_client_reclaim *
++alloc_reclaim(int namelen)
++{
++      struct nfs4_client_reclaim *crp = NULL;
++
++      crp = kmalloc(sizeof(struct nfs4_client_reclaim), GFP_KERNEL);
++      if (!crp)
++              return NULL;
++      crp->cr_name.data = kmalloc(namelen, GFP_KERNEL);
++      if (!crp->cr_name.data) {
++              kfree(crp);
++              return NULL;
++      }
++      return crp;
++}
++
++/*
++ * failure => all reset bets are off, nfserr_no_grace...
++ */
++static int
++nfs4_client_to_reclaim(struct nfs4_client *clp)
++{
++      unsigned int strhashval;
++      struct nfs4_client_reclaim *crp = NULL;
++
++      crp = alloc_reclaim(clp->cl_name.len);
++      if (!crp)
++              return 0;
++      strhashval = clientstr_hashval(clp->cl_name.data, clp->cl_name.len);
++      INIT_LIST_HEAD(&crp->cr_strhash);
++      list_add(&crp->cr_strhash, &reclaim_str_hashtbl[strhashval]);
++      memcpy(crp->cr_name.data, clp->cl_name.data, clp->cl_name.len);
++      crp->cr_name.len = clp->cl_name.len;
++      crp->cr_first_state = clp->cl_first_state;
++      crp->cr_expired = 0;
++      return 1;
++}
++
++static void
++nfs4_release_reclaim(void)
++{
++      struct nfs4_client_reclaim *crp = NULL;
++      int i;
++
++      BUG_ON(!nfs4_reclaim_init);
++      for (i = 0; i < CLIENT_HASH_SIZE; i++) {
++              while (!list_empty(&reclaim_str_hashtbl[i])) {
++                      crp = list_entry(reclaim_str_hashtbl[i].next,
++                                      struct nfs4_client_reclaim, cr_strhash);
++                      list_del(&crp->cr_strhash);
++                      kfree(crp->cr_name.data);
++                      kfree(crp);
++                      reclaim_str_hashtbl_size--;
++              }
++      }
++      BUG_ON(reclaim_str_hashtbl_size);
++}
++
++/*
++ * called from OPEN, CLAIM_PREVIOUS with a new clientid. */
++struct nfs4_client_reclaim *
++nfs4_find_reclaim_client(clientid_t *clid)
++{
++      unsigned int idhashval = clientid_hashval(clid->cl_id);
++      unsigned int strhashval;
++      struct nfs4_client *clp, *client = NULL;
++      struct nfs4_client_reclaim *crp = NULL;
++
++      /* find clientid in conf_id_hashtbl */
++      list_for_each_entry(clp, &conf_id_hashtbl[idhashval], cl_idhash) {
++              if (cmp_clid(&clp->cl_clientid, clid)) {
++                      client = clp;
++                      break;
++              }
++      }
++      if (!client)
++              return NULL;
++
++      /* find clp->cl_name in reclaim_str_hashtbl */
++      strhashval = clientstr_hashval(client->cl_name.data,
++                                    client->cl_name.len);
++      list_for_each_entry(crp, &reclaim_str_hashtbl[strhashval], cr_strhash) {
++              if(cmp_name(&crp->cr_name, &client->cl_name)) {
++                      return crp;
++              }
++      }
++      return NULL;
++}
++
++/*
++ * Called from OPEN. Look for clientid in reclaim list.
++ */
++int
++nfs4_check_open_reclaim(clientid_t *clid)
++{
++      struct nfs4_client_reclaim *crp;
++
++      if ((crp = nfs4_find_reclaim_client(clid)) == NULL)
++              return nfserr_reclaim_bad;
++      if (crp->cr_expired)
++              return nfserr_no_grace;
++      return nfs_ok;
++}
++
++
+ /* 
+  * Start and stop routines
+  */
+@@ -2366,10 +2619,16 @@ void 
+ nfs4_state_init(void)
+ {
+       int i;
+-      time_t start = get_seconds();
++      time_t grace_time;
+       if (nfs4_init)
+               return;
++      if (!nfs4_reclaim_init) {
++              for (i = 0; i < CLIENT_HASH_SIZE; i++)
++                      INIT_LIST_HEAD(&reclaim_str_hashtbl[i]);
++              reclaim_str_hashtbl_size = 0;
++              nfs4_reclaim_init = 1;
++      }
+       for (i = 0; i < CLIENT_HASH_SIZE; i++) {
+               INIT_LIST_HEAD(&conf_id_hashtbl[i]);
+               INIT_LIST_HEAD(&conf_str_hashtbl[i]);
+@@ -2396,27 +2655,36 @@ nfs4_state_init(void)
+       INIT_LIST_HEAD(&close_lru);
+       INIT_LIST_HEAD(&client_lru);
+-      init_MUTEX(&client_sema);
+-      boot_time = start;
+-      grace_end = start + NFSD_LEASE_TIME;
++      boot_time = get_seconds();
++      grace_time = max(old_lease_time, lease_time);
++      if (reclaim_str_hashtbl_size == 0)
++              grace_time = 0;
++      if (grace_time)
++              printk("NFSD: starting %ld-second grace period\n", grace_time);
++      grace_end = boot_time + grace_time;
+       INIT_WORK(&laundromat_work,laundromat_main, NULL);
+       schedule_delayed_work(&laundromat_work, NFSD_LEASE_TIME*HZ);
+       nfs4_init = 1;
+-
+ }
+ int
+ nfs4_in_grace(void)
+ {
+-      return time_before(get_seconds(), (unsigned long)grace_end);
++      return get_seconds() < grace_end;
+ }
+-int
+-nfs4_in_no_grace(void)
++void
++set_no_grace(void)
+ {
+-      return (grace_end < get_seconds());
++      printk("NFSD: ERROR in reboot recovery.  State reclaims will fail.\n");
++      grace_end = get_seconds();
+ }
++time_t
++nfs4_lease_time(void)
++{
++      return lease_time;
++}
+ static void
+ __nfs4_state_shutdown(void)
+@@ -2454,6 +2722,61 @@ void
+ nfs4_state_shutdown(void)
+ {
+       nfs4_lock_state();
++      nfs4_release_reclaim();
+       __nfs4_state_shutdown();
+       nfs4_unlock_state();
+ }
++
++/*
++ * Called when leasetime is changed.
++ *
++ * if nfsd is not started, simply set the global lease.
++ *
++ * if nfsd(s) are running, lease change requires nfsv4 state to be reset.
++ * e.g: boot_time is reset, existing nfs4_client structs are
++ * used to fill reclaim_str_hashtbl, then all state (except for the
++ * reclaim_str_hashtbl) is re-initialized.
++ *
++ * if the old lease time is greater than the new lease time, the grace
++ * period needs to be set to the old lease time to allow clients to reclaim
++ * their state. XXX - we may want to set the grace period == lease time
++ * after an initial grace period == old lease time
++ *
++ * if an error occurs in this process, the new lease is set, but the server
++ * will not honor OPEN or LOCK reclaims, and will return nfserr_no_grace
++ * which means OPEN/LOCK/READ/WRITE will fail during grace period.
++ *
++ * clients will attempt to reset all state with SETCLIENTID/CONFIRM, and
++ * OPEN and LOCK reclaims.
++ */
++void
++nfs4_reset_lease(time_t leasetime)
++{
++      struct nfs4_client *clp;
++      int i;
++
++      printk("NFSD: New leasetime %ld\n",leasetime);
++      if (!nfs4_init)
++              return;
++      nfs4_lock_state();
++      old_lease_time = lease_time;
++      lease_time = leasetime;
++
++      nfs4_release_reclaim();
++
++      /* populate reclaim_str_hashtbl with current confirmed nfs4_clientid */
++      for (i = 0; i < CLIENT_HASH_SIZE; i++) {
++              list_for_each_entry(clp, &conf_id_hashtbl[i], cl_idhash) {
++                      if (!nfs4_client_to_reclaim(clp)) {
++                              nfs4_release_reclaim();
++                              goto init_state;
++                      }
++                      reclaim_str_hashtbl_size++;
++              }
++      }
++init_state:
++      __nfs4_state_shutdown();
++      nfs4_state_init();
++      nfs4_unlock_state();
++}
++
+--- linux-2.6.7/fs/nfsd/vfs.c.lsec     2004-06-15 23:19:13.000000000 -0600
++++ linux-2.6.7/fs/nfsd/vfs.c  2005-03-23 14:28:24.520322040 -0700
+@@ -44,6 +44,16 @@
+ #include <linux/nfsd/nfsfh.h>
+ #include <linux/quotaops.h>
+ #include <linux/dnotify.h>
++#ifdef CONFIG_NFSD_V4
++#include <linux/posix_acl.h>
++#include <linux/posix_acl_xattr.h>
++#include <linux/xattr_acl.h>
++#include <linux/xattr.h>
++#include <linux/nfs4.h>
++#include <linux/nfs4_acl.h>
++#include <linux/nfsd_idmap.h>
++#include <linux/security.h>
++#endif /* CONFIG_NFSD_V4 */
+ #include <asm/uaccess.h>
+@@ -344,6 +354,177 @@ out_nfserr:
+       goto out;
+ }
++#if defined(CONFIG_NFSD_V4)
++
++static int
++set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key)
++{
++      int len;
++      size_t buflen;
++      char *buf = NULL;
++      int error = 0;
++      struct inode *inode = dentry->d_inode;
++
++      buflen = posix_acl_xattr_size(pacl->a_count);
++      buf = kmalloc(buflen, GFP_KERNEL);
++      error = -ENOMEM;
++      if (buf == NULL)
++              goto out;
++
++      len = posix_acl_to_xattr(pacl, buf, buflen);
++      if (len < 0) {
++              error = len;
++              goto out;
++      }
++
++      error = -EOPNOTSUPP;
++      if (inode->i_op && inode->i_op->setxattr) {
++              down(&inode->i_sem);
++              security_inode_setxattr(dentry, key, buf, len, 0);
++              error = inode->i_op->setxattr(dentry, key, buf, len, 0);
++              if (!error)
++                      security_inode_post_setxattr(dentry, key, buf, len, 0);
++              up(&inode->i_sem);
++      }
++out:
++      kfree(buf);
++      return (error);
++}
++
++int
++nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
++    struct nfs4_acl *acl)
++{
++      int error;
++      struct dentry *dentry;
++      struct inode *inode;
++      struct posix_acl *pacl = NULL, *dpacl = NULL;
++      unsigned int flags = 0;
++
++      /* Get inode */
++      error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, MAY_SATTR);
++      if (error)
++              goto out;
++
++      dentry = fhp->fh_dentry;
++      inode = dentry->d_inode;
++      if (S_ISDIR(inode->i_mode))
++              flags = NFS4_ACL_DIR;
++
++      error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags);
++      if (error < 0)
++              goto out_nfserr;
++
++      if (pacl) {
++              error = set_nfsv4_acl_one(dentry, pacl, XATTR_NAME_ACL_ACCESS);
++              if (error < 0)
++                      goto out_nfserr;
++      }
++
++      if (dpacl) {
++              error = set_nfsv4_acl_one(dentry, dpacl, XATTR_NAME_ACL_DEFAULT);
++              if (error < 0)
++                      goto out_nfserr;
++      }
++
++      error = nfs_ok;
++
++out:
++      posix_acl_release(pacl);
++      posix_acl_release(dpacl);
++      return (error);
++out_nfserr:
++      error = nfserrno(error);
++      goto out;
++}
++
++static struct posix_acl *
++_get_posix_acl(struct dentry *dentry, char *key)
++{
++      struct inode *inode = dentry->d_inode;
++      char *buf = NULL;
++      int buflen, error = 0;
++      struct posix_acl *pacl = NULL;
++
++      down(&inode->i_sem);
++
++      buflen = inode->i_op->getxattr(dentry, key, NULL, 0);
++      if (buflen <= 0) {
++              error = buflen < 0 ? buflen : -ENODATA;
++              goto out_sem;
++      }
++
++      buf = kmalloc(buflen, GFP_KERNEL);
++      if (buf == NULL) {
++              error = -ENOMEM;
++              goto out_sem;
++      }
++
++      error = -EOPNOTSUPP;
++      if (inode->i_op && inode->i_op->getxattr) {
++              error = security_inode_getxattr(dentry, key);
++              if (error)
++                      goto out_sem;
++              error = inode->i_op->getxattr(dentry, key, buf, buflen);
++      }
++      if (error < 0)
++              goto out_sem;
++
++      error = 0;
++      up(&inode->i_sem);
++
++      pacl = posix_acl_from_xattr(buf, buflen);
++ out:
++      kfree(buf);
++      return pacl;
++ out_sem:
++      up(&inode->i_sem);
++      pacl = ERR_PTR(error);
++      goto out;
++}
++
++int
++nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_acl **acl)
++{
++      struct inode *inode = dentry->d_inode;
++      int error = 0;
++      struct posix_acl *pacl = NULL, *dpacl = NULL;
++      unsigned int flags = 0;
++
++      pacl = _get_posix_acl(dentry, XATTR_NAME_ACL_ACCESS);
++      if (IS_ERR(pacl) && PTR_ERR(pacl) == -ENODATA)
++              pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
++      if (IS_ERR(pacl)) {
++              error = PTR_ERR(pacl);
++              pacl = NULL;
++              goto out;
++      }
++
++      if (S_ISDIR(inode->i_mode)) {
++              dpacl = _get_posix_acl(dentry, XATTR_NAME_ACL_DEFAULT);
++              if (IS_ERR(dpacl) && PTR_ERR(dpacl) == -ENODATA)
++                      dpacl = NULL;
++              else if (IS_ERR(dpacl)) {
++                      error = PTR_ERR(dpacl);
++                      dpacl = NULL;
++                      goto out;
++              }
++              flags = NFS4_ACL_DIR;
++      }
++
++      *acl = nfs4_acl_posix_to_nfsv4(pacl, dpacl, flags);
++      if (IS_ERR(*acl)) {
++              error = PTR_ERR(*acl);
++              *acl = NULL;
++      }
++ out:
++      posix_acl_release(pacl);
++      posix_acl_release(dpacl);
++      return error;
++}
++
++#endif /* defined(CONFIG_NFSD_V4) */
++
+ #ifdef CONFIG_NFSD_V3
+ /*
+  * Check server access rights to a file system object
+--- linux-2.6.7/fs/nfsd/nfs4idmap.c.lsec       2004-06-15 23:19:43.000000000 -0600
++++ linux-2.6.7/fs/nfsd/nfs4idmap.c    2005-03-23 14:28:24.687296656 -0700
+@@ -78,9 +78,9 @@ struct ent {
+ #define DefineSimpleCacheLookupMap(STRUCT, FUNC)                      \
+         DefineCacheLookup(struct STRUCT, h, FUNC##_lookup,            \
+-        (struct STRUCT *item, int set), /*no setup */,                        \
++        (struct STRUCT *item, int set),                       \
+       & FUNC##_cache, FUNC##_hash(item), FUNC##_match(item, tmp),     \
+-      STRUCT##_init(new, item), STRUCT##_update(tmp, item), 0)
++      STRUCT##_init(new, item), STRUCT##_update(tmp, item))
+ /* Common entry handling */
+--- linux-2.6.7/fs/nfsd/nfs4acl.c.lsec 2005-03-23 14:28:24.463330704 -0700
++++ linux-2.6.7/fs/nfsd/nfs4acl.c      2005-03-23 14:28:24.463330704 -0700
+@@ -0,0 +1,974 @@
++/*
++ *  fs/nfs4acl/acl.c
++ *
++ *  Common NFSv4 ACL handling code.
++ *
++ *  Copyright (c) 2002, 2003 The Regents of the University of Michigan.
++ *  All rights reserved.
++ *
++ *  Marius Aamodt Eriksen <marius@umich.edu>
++ *  Jeff Sedlak <jsedlak@umich.edu>
++ *  J. Bruce Fields <bfields@umich.edu>
++ *
++ *  Redistribution and use in source and binary forms, with or without
++ *  modification, are permitted provided that the following conditions
++ *  are met:
++ *
++ *  1. Redistributions of source code must retain the above copyright
++ *     notice, this list of conditions and the following disclaimer.
++ *  2. Redistributions in binary form must reproduce the above copyright
++ *     notice, this list of conditions and the following disclaimer in the
++ *     documentation and/or other materials provided with the distribution.
++ *  3. Neither the name of the University nor the names of its
++ *     contributors may be used to endorse or promote products derived
++ *     from this software without specific prior written permission.
++ *
++ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
++
++#include <linux/string.h>
++#include <linux/slab.h>
++#include <linux/list.h>
++#include <linux/types.h>
++#include <linux/fs.h>
++#include <linux/module.h>
++#include <linux/nfs_fs.h>
++#include <linux/posix_acl.h>
++#include <linux/nfs4.h>
++#include <linux/nfs4_acl.h>
++
++
++/* mode bit translations: */
++#define NFS4_READ_MODE (NFS4_ACE_READ_DATA | NFS4_ACE_READ_NAMED_ATTRS)
++#define NFS4_WRITE_MODE (NFS4_ACE_WRITE_DATA | NFS4_ACE_WRITE_NAMED_ATTRS | NFS4_ACE_APPEND_DATA)
++#define NFS4_EXECUTE_MODE NFS4_ACE_EXECUTE
++#define NFS4_ANYONE_MODE (NFS4_ACE_READ_ATTRIBUTES | NFS4_ACE_READ_ACL | NFS4_ACE_SYNCHRONIZE)
++#define NFS4_OWNER_MODE (NFS4_ACE_WRITE_ATTRIBUTES | NFS4_ACE_WRITE_ACL)
++
++/* flags used to simulate posix default ACLs */
++#define NFS4_INHERITANCE_FLAGS (NFS4_ACE_FILE_INHERIT_ACE \
++              | NFS4_ACE_DIRECTORY_INHERIT_ACE | NFS4_ACE_INHERIT_ONLY_ACE)
++
++#define MASK_EQUAL(mask1, mask2) \
++      ( ((mask1) & NFS4_ACE_MASK_ALL) == ((mask2) & NFS4_ACE_MASK_ALL) )
++
++static u32
++mask_from_posix(unsigned short perm, unsigned int flags)
++{
++      int mask = NFS4_ANYONE_MODE;
++
++      if (flags & NFS4_ACL_OWNER)
++              mask |= NFS4_OWNER_MODE;
++      if (perm & ACL_READ)
++              mask |= NFS4_READ_MODE;
++      if (perm & ACL_WRITE)
++              mask |= NFS4_WRITE_MODE;
++      if ((perm & ACL_WRITE) && (flags & NFS4_ACL_DIR))
++              mask |= NFS4_ACE_DELETE_CHILD;
++      if (perm & ACL_EXECUTE)
++              mask |= NFS4_EXECUTE_MODE;
++      return mask;
++}
++
++static u32
++deny_mask(u32 allow_mask, unsigned int flags)
++{
++      u32 ret = ~allow_mask & ~NFS4_ACE_DELETE;
++      if (!(flags & NFS4_ACL_DIR))
++              ret &= ~NFS4_ACE_DELETE_CHILD;
++      return ret;
++}
++
++static int
++mode_from_nfs4(u32 perm, unsigned short *mode, unsigned int flags)
++{
++      u32 ignore = 0;
++
++      if (!(flags & NFS4_ACL_DIR))
++              ignore |= NFS4_ACE_DELETE_CHILD; /* ignore it */
++      perm |= ignore;
++      *mode = 0;
++      if ((perm & NFS4_READ_MODE) == NFS4_READ_MODE)
++              *mode |= ACL_READ;
++      if ((perm & NFS4_WRITE_MODE) == NFS4_WRITE_MODE)
++              *mode |= ACL_WRITE;
++      if ((perm & NFS4_EXECUTE_MODE) == NFS4_EXECUTE_MODE)
++              *mode |= ACL_EXECUTE;
++      if (!MASK_EQUAL(perm, ignore|mask_from_posix(*mode, flags)))
++              return -EINVAL;
++      return 0;
++}
++
++struct ace_container {
++      struct nfs4_ace  *ace;
++      struct list_head  ace_l;
++};
++
++static short ace2type(struct nfs4_ace *);
++static int _posix_to_nfsv4_one(struct posix_acl *, struct nfs4_acl *, unsigned int);
++static struct posix_acl *_nfsv4_to_posix_one(struct nfs4_acl *, unsigned int);
++int nfs4_acl_add_ace(struct nfs4_acl *, u32, u32, u32, int, uid_t);
++int nfs4_acl_split(struct nfs4_acl *, struct nfs4_acl *);
++
++struct nfs4_acl *
++nfs4_acl_posix_to_nfsv4(struct posix_acl *pacl, struct posix_acl *dpacl,
++                      unsigned int flags)
++{
++      struct nfs4_acl *acl;
++      int error = -EINVAL;
++
++      if ((pacl != NULL &&
++              (posix_acl_valid(pacl) < 0 || pacl->a_count == 0)) ||
++          (dpacl != NULL &&
++              (posix_acl_valid(dpacl) < 0 || dpacl->a_count == 0)))
++              goto out_err;
++
++      acl = nfs4_acl_new();
++      if (acl == NULL) {
++              error = -ENOMEM;
++              goto out_err;
++      }
++
++      if (pacl != NULL) {
++              error = _posix_to_nfsv4_one(pacl, acl,
++                                              flags & ~NFS4_ACL_TYPE_DEFAULT);
++              if (error < 0)
++                      goto out_acl;
++      }
++
++      if (dpacl != NULL) {
++              error = _posix_to_nfsv4_one(dpacl, acl,
++                                              flags | NFS4_ACL_TYPE_DEFAULT);
++              if (error < 0)
++                      goto out_acl;
++      }
++
++      return acl;
++
++out_acl:
++      nfs4_acl_free(acl);
++out_err:
++      acl = ERR_PTR(error);
++
++      return acl;
++}
++
++static int
++nfs4_acl_add_pair(struct nfs4_acl *acl, int eflag, u32 mask, int whotype,
++              uid_t owner, unsigned int flags)
++{
++      int error;
++
++      error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE,
++                               eflag, mask, whotype, owner);
++      if (error < 0)
++              return error;
++      error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE,
++                              eflag, deny_mask(mask, flags), whotype, owner);
++      return error;
++}
++
++/* We assume the acl has been verified with posix_acl_valid. */
++static int
++_posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl,
++                                              unsigned int flags)
++{
++      struct posix_acl_entry *pa, *pe, *group_owner_entry;
++      int error = -EINVAL;
++      u32 mask, mask_mask;
++      int eflag = ((flags & NFS4_ACL_TYPE_DEFAULT) ?
++                                      NFS4_INHERITANCE_FLAGS : 0);
++
++      BUG_ON(pacl->a_count < 3);
++      pe = pacl->a_entries + pacl->a_count;
++      pa = pe - 2; /* if mask entry exists, it's second from the last. */
++      if (pa->e_tag == ACL_MASK)
++              mask_mask = deny_mask(mask_from_posix(pa->e_perm, flags), flags);
++      else
++              mask_mask = 0;
++
++      pa = pacl->a_entries;
++      BUG_ON(pa->e_tag != ACL_USER_OBJ);
++      mask = mask_from_posix(pa->e_perm, flags | NFS4_ACL_OWNER);
++      error = nfs4_acl_add_pair(acl, eflag, mask, NFS4_ACL_WHO_OWNER, 0, flags);
++      if (error < 0)
++              goto out;
++      pa++;
++
++      while (pa->e_tag == ACL_USER) {
++              mask = mask_from_posix(pa->e_perm, flags);
++              error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE,
++                              eflag,  mask_mask, NFS4_ACL_WHO_NAMED, pa->e_id);
++              if (error < 0)
++                      goto out;
++
++              error = nfs4_acl_add_pair(acl, eflag, mask,
++                              NFS4_ACL_WHO_NAMED, pa->e_id, flags);
++              if (error < 0)
++                      goto out;
++              pa++;
++      }
++
++      /* In the case of groups, we apply allow ACEs first, then deny ACEs,
++       * since a user can be in more than one group.  */
++
++      /* allow ACEs */
++
++      if (pacl->a_count > 3) {
++              BUG_ON(pa->e_tag != ACL_GROUP_OBJ);
++              error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE,
++                              NFS4_ACE_IDENTIFIER_GROUP | eflag, mask_mask,
++                              NFS4_ACL_WHO_GROUP, 0);
++              if (error < 0)
++                      goto out;
++      }
++      group_owner_entry = pa;
++      mask = mask_from_posix(pa->e_perm, flags);
++      error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE,
++                      NFS4_ACE_IDENTIFIER_GROUP | eflag, mask,
++                      NFS4_ACL_WHO_GROUP, 0);
++      if (error < 0)
++              goto out;
++      pa++;
++
++      while (pa->e_tag == ACL_GROUP) {
++              mask = mask_from_posix(pa->e_perm, flags);
++              error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE,
++                              NFS4_ACE_IDENTIFIER_GROUP | eflag, mask_mask,
++                              NFS4_ACL_WHO_NAMED, pa->e_id);
++              if (error < 0)
++                      goto out;
++
++              error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE,
++                              NFS4_ACE_IDENTIFIER_GROUP | eflag, mask,
++                              NFS4_ACL_WHO_NAMED, pa->e_id);
++              if (error < 0)
++                      goto out;
++              pa++;
++      }
++
++      /* deny ACEs */
++
++      pa = group_owner_entry;
++      mask = mask_from_posix(pa->e_perm, flags);
++      error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE,
++                      NFS4_ACE_IDENTIFIER_GROUP | eflag,
++                      deny_mask(mask, flags), NFS4_ACL_WHO_GROUP, 0);
++      if (error < 0)
++              goto out;
++      pa++;
++      while (pa->e_tag == ACL_GROUP) {
++              mask = mask_from_posix(pa->e_perm, flags);
++              error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE,
++                              NFS4_ACE_IDENTIFIER_GROUP | eflag,
++                              deny_mask(mask, flags), NFS4_ACL_WHO_NAMED, pa->e_id);
++              if (error < 0)
++                      goto out;
++              pa++;
++      }
++
++      if (pa->e_tag == ACL_MASK)
++              pa++;
++      BUG_ON(pa->e_tag != ACL_OTHER);
++      mask = mask_from_posix(pa->e_perm, flags);
++      error = nfs4_acl_add_pair(acl, eflag, mask, NFS4_ACL_WHO_EVERYONE, 0, flags);
++
++out:
++      return error;
++}
++
++static void
++sort_pacl_range(struct posix_acl *pacl, int start, int end) {
++      int sorted = 0, i;
++      struct posix_acl_entry tmp;
++
++      /* We just do a bubble sort; easy to do in place, and we're not
++       * expecting ACLs to be long enough to justify anything more. */
++      while (!sorted) {
++              sorted = 1;
++              for (i = start; i < end; i++) {
++                      if (pacl->a_entries[i].e_id
++                                      > pacl->a_entries[i+1].e_id) {
++                              sorted = 0;
++                              tmp = pacl->a_entries[i];
++                              pacl->a_entries[i] = pacl->a_entries[i+1];
++                              pacl->a_entries[i+1] = tmp;
++                      }
++              }
++      }
++}
++
++static void
++sort_pacl(struct posix_acl *pacl)
++{
++      /* posix_acl_valid requires that users and groups be in order
++       * by uid/gid. */
++      int i, j;
++
++      if (pacl->a_count <= 4)
++              return; /* no users or groups */
++      i = 1;
++      while (pacl->a_entries[i].e_tag == ACL_USER)
++              i++;
++      sort_pacl_range(pacl, 1, i-1);
++
++      BUG_ON(pacl->a_entries[i].e_tag != ACL_GROUP_OBJ);
++      j = ++i;
++      while (pacl->a_entries[j].e_tag == ACL_GROUP)
++              j++;
++      sort_pacl_range(pacl, i, j-1);
++      return;
++}
++
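++/* Fill the next POSIX ACL entry from an NFSv4 allow ace and advance the
++ * cursor; fails if the POSIX ACL is already full. */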
++static int
++write_pace(struct nfs4_ace *ace, struct posix_acl *pacl,
++              struct posix_acl_entry **pace, short tag, unsigned int flags)
++{
++      struct posix_acl_entry *this = *pace;
++
++      if (*pace == pacl->a_entries + pacl->a_count)
++              return -EINVAL; /* fell off the end */
++      (*pace)++;
++      this->e_tag = tag;
++      if (tag == ACL_USER_OBJ)
++              flags |= NFS4_ACL_OWNER;
++      if (mode_from_nfs4(ace->access_mask, &this->e_perm, flags))
++              return -EINVAL;
++      this->e_id = (tag == ACL_USER || tag == ACL_GROUP ?
++                      ace->who : ACL_UNDEFINED_ID);
++      return 0;
++}
++
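++/* Advance the cursor *p and return the ace it now points at, or NULL once
++ * the end of the list is reached. */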
++static struct nfs4_ace *
++get_next_v4_ace(struct list_head **p, struct list_head *head)
++{
++      struct nfs4_ace *ace;
++
++      *p = (*p)->next;
++      if (*p == head)
++              return NULL;
++      ace = list_entry(*p, struct nfs4_ace, l_ace);
++
++      return ace;
++}
++
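++/* Convert an NFSv4 ACL into a POSIX access ACL (*pacl) and default ACL
++ * (*dpacl): the inheritable aces are split off first, then each half is
++ * translated separately. */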
++int
++nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, struct posix_acl **pacl,
++              struct posix_acl **dpacl, unsigned int flags)
++{
++      struct nfs4_acl *dacl;
++      int error = -ENOMEM;
++
++      *pacl = NULL;
++      *dpacl = NULL;
++
++      dacl = nfs4_acl_new();
++      if (dacl == NULL)
++              goto out;
++
++      error = nfs4_acl_split(acl, dacl);
++      if (error < 0)
++              goto out_acl;
++
++      if (pacl != NULL) {
++              if (acl->naces == 0) {
++                      error = -ENODATA;
++                      goto try_dpacl;
++              }
++
++              *pacl = _nfsv4_to_posix_one(acl, flags);
++              if (IS_ERR(*pacl)) {
++                      error = PTR_ERR(*pacl);
++                      *pacl = NULL;
++                      goto out_acl;
++              }
++      }
++
++try_dpacl:
++      if (dpacl != NULL) {
++              if (dacl->naces == 0) {
++                      if (pacl == NULL || *pacl == NULL)
++                              error = -ENODATA;
++                      goto out_acl;
++              }
++
++              error = 0;
++              *dpacl = _nfsv4_to_posix_one(dacl, flags);
++              if (IS_ERR(*dpacl)) {
++                      error = PTR_ERR(*dpacl);
++                      *dpacl = NULL;
++                      goto out_acl;
++              }
++      }
++
++out_acl:
++      if (error && pacl) {
++              posix_acl_release(*pacl);
++              *pacl = NULL;
++      }
++      nfs4_acl_free(dacl);
++out:
++      return error;
++}
++
++static int
++same_who(struct nfs4_ace *a, struct nfs4_ace *b)
++{
++      return a->whotype == b->whotype &&
++              (a->whotype != NFS4_ACL_WHO_NAMED || a->who == b->who);
++}
++
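++/* True when the deny ace exactly complements the allow ace: same who and
++ * flags, with a deny mask matching deny_mask() of the allow mask
++ * (DELETE_CHILD is ignored on non-directories). */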
++static int
++complementary_ace_pair(struct nfs4_ace *allow, struct nfs4_ace *deny,
++              unsigned int flags)
++{
++      int ignore = 0;
++      if (!(flags & NFS4_ACL_DIR))
++              ignore |= NFS4_ACE_DELETE_CHILD;
++      return MASK_EQUAL(ignore|deny_mask(allow->access_mask, flags),
++                        ignore|deny->access_mask) &&
++              allow->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE &&
++              deny->type == NFS4_ACE_ACCESS_DENIED_ACE_TYPE &&
++              allow->flag == deny->flag &&
++              same_who(allow, deny);
++}
++
++static inline int
++user_obj_from_v4(struct nfs4_acl *n4acl, struct list_head **p,
++              struct posix_acl *pacl, struct posix_acl_entry **pace,
++              unsigned int flags)
++{
++      int error = -EINVAL;
++      struct nfs4_ace *ace, *ace2;
++
++      ace = get_next_v4_ace(p, &n4acl->ace_head);
++      if (ace == NULL)
++              goto out;
++      if (ace2type(ace) != ACL_USER_OBJ)
++              goto out;
++      error = write_pace(ace, pacl, pace, ACL_USER_OBJ, flags);
++      if (error < 0)
++              goto out;
++      error = -EINVAL;
++      ace2 = get_next_v4_ace(p, &n4acl->ace_head);
++      if (ace2 == NULL)
++              goto out;
++      if (!complementary_ace_pair(ace, ace2, flags))
++              goto out;
++      error = 0;
++out:
++      return error;
++}
++
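++/* Named-user entries arrive as triples: a deny ace carrying the mask, an
++ * allow ace, and a deny ace complementary to the allow. */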
++static inline int
++users_from_v4(struct nfs4_acl *n4acl, struct list_head **p,
++              struct nfs4_ace **mask_ace,
++              struct posix_acl *pacl, struct posix_acl_entry **pace,
++              unsigned int flags)
++{
++      int error = -EINVAL;
++      struct nfs4_ace *ace, *ace2;
++
++      ace = get_next_v4_ace(p, &n4acl->ace_head);
++      if (ace == NULL)
++              goto out;
++      while (ace2type(ace) == ACL_USER) {
++              if (ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE)
++                      goto out;
++              if (*mask_ace &&
++                      !MASK_EQUAL(ace->access_mask, (*mask_ace)->access_mask))
++                      goto out;
++              *mask_ace = ace;
++              ace = get_next_v4_ace(p, &n4acl->ace_head);
++              if (ace == NULL)
++                      goto out;
++              if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE)
++                      goto out;
++              error = write_pace(ace, pacl, pace, ACL_USER, flags);
++              if (error < 0)
++                      goto out;
++              error = -EINVAL;
++              ace2 = get_next_v4_ace(p, &n4acl->ace_head);
++              if (ace2 == NULL)
++                      goto out;
++              if (!complementary_ace_pair(ace, ace2, flags))
++                      goto out;
++              if ((*mask_ace)->flag != ace2->flag ||
++                              !same_who(*mask_ace, ace2))
++                      goto out;
++              ace = get_next_v4_ace(p, &n4acl->ace_head);
++              if (ace == NULL)
++                      goto out;
++      }
++      error = 0;
++out:
++      return error;
++}
++
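++/* Parse the group owner and named groups: their mask and allow aces come
++ * first, followed by the matching deny aces; a temporary list pairs each
++ * allow ace with its complementary deny. */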
++static inline int
++group_obj_and_groups_from_v4(struct nfs4_acl *n4acl, struct list_head **p,
++              struct nfs4_ace **mask_ace,
++              struct posix_acl *pacl, struct posix_acl_entry **pace,
++              unsigned int flags)
++{
++      int error = -EINVAL;
++      struct nfs4_ace *ace, *ace2;
++      struct ace_container *ac;
++      struct list_head group_l;
++
++      INIT_LIST_HEAD(&group_l);
++      ace = list_entry(*p, struct nfs4_ace, l_ace);
++
++      /* group owner (mask and allow aces) */
++
++      if (pacl->a_count != 3) {
++              /* then the group owner should be preceded by mask */
++              if (ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE)
++                      goto out;
++              if (*mask_ace &&
++                      !MASK_EQUAL(ace->access_mask, (*mask_ace)->access_mask))
++                      goto out;
++              *mask_ace = ace;
++              ace = get_next_v4_ace(p, &n4acl->ace_head);
++              if (ace == NULL)
++                      goto out;
++
++              if ((*mask_ace)->flag != ace->flag || !same_who(*mask_ace, ace))
++                      goto out;
++      }
++
++      if (ace2type(ace) != ACL_GROUP_OBJ)
++              goto out;
++
++      ac = kmalloc(sizeof(*ac), GFP_KERNEL);
++      error = -ENOMEM;
++      if (ac == NULL)
++              goto out;
++      ac->ace = ace;
++      list_add_tail(&ac->ace_l, &group_l);
++
++      error = -EINVAL;
++      if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE)
++              goto out;
++
++      error = write_pace(ace, pacl, pace, ACL_GROUP_OBJ, flags);
++      if (error < 0)
++              goto out;
++
++      error = -EINVAL;
++      ace = get_next_v4_ace(p, &n4acl->ace_head);
++      if (ace == NULL)
++              goto out;
++
++      /* groups (mask and allow aces) */
++
++      while (ace2type(ace) == ACL_GROUP) {
++              if (*mask_ace == NULL)
++                      goto out;
++
++              if (ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE ||
++                      !MASK_EQUAL(ace->access_mask, (*mask_ace)->access_mask))
++                      goto out;
++              *mask_ace = ace;
++
++              ace = get_next_v4_ace(p, &n4acl->ace_head);
++              if (ace == NULL)
++                      goto out;
++              ac = kmalloc(sizeof(*ac), GFP_KERNEL);
++              error = -ENOMEM;
++              if (ac == NULL)
++                      goto out;
++              error = -EINVAL;
++              if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE ||
++                              !same_who(ace, *mask_ace))
++                      goto out;
++
++              ac->ace = ace;
++              list_add_tail(&ac->ace_l, &group_l);
++
++              error = write_pace(ace, pacl, pace, ACL_GROUP, flags);
++              if (error < 0)
++                      goto out;
++              error = -EINVAL;
++              ace = get_next_v4_ace(p, &n4acl->ace_head);
++              if (ace == NULL)
++                      goto out;
++      }
++
++      /* group owner (deny ace) */
++
++      if (ace2type(ace) != ACL_GROUP_OBJ)
++              goto out;
++      ac = list_entry(group_l.next, struct ace_container, ace_l);
++      ace2 = ac->ace;
++      if (!complementary_ace_pair(ace2, ace, flags))
++              goto out;
++      list_del(group_l.next);
++      kfree(ac);
++
++      /* groups (deny aces) */
++
++      while (!list_empty(&group_l)) {
++              ace = get_next_v4_ace(p, &n4acl->ace_head);
++              if (ace == NULL)
++                      goto out;
++              if (ace2type(ace) != ACL_GROUP)
++                      goto out;
++              ac = list_entry(group_l.next, struct ace_container, ace_l);
++              ace2 = ac->ace;
++              if (!complementary_ace_pair(ace2, ace, flags))
++                      goto out;
++              list_del(group_l.next);
++              kfree(ac);
++      }
++
++      ace = get_next_v4_ace(p, &n4acl->ace_head);
++      if (ace == NULL)
++              goto out;
++      if (ace2type(ace) != ACL_OTHER)
++              goto out;
++      error = 0;
++out:
++      while (!list_empty(&group_l)) {
++              ac = list_entry(group_l.next, struct ace_container, ace_l);
++              list_del(group_l.next);
++              kfree(ac);
++      }
++      return error;
++}
++
++static inline int
++mask_from_v4(struct nfs4_acl *n4acl, struct list_head **p,
++              struct nfs4_ace **mask_ace,
++              struct posix_acl *pacl, struct posix_acl_entry **pace,
++              unsigned int flags)
++{
++      int error = -EINVAL;
++      struct nfs4_ace *ace;
++
++      ace = list_entry(*p, struct nfs4_ace, l_ace);
++      if (pacl->a_count != 3) {
++              if (*mask_ace == NULL)
++                      goto out;
++              (*mask_ace)->access_mask = deny_mask((*mask_ace)->access_mask, flags);
++              write_pace(*mask_ace, pacl, pace, ACL_MASK, flags);
++      }
++      error = 0;
++out:
++      return error;
++}
++
++static inline int
++other_from_v4(struct nfs4_acl *n4acl, struct list_head **p,
++              struct posix_acl *pacl, struct posix_acl_entry **pace,
++              unsigned int flags)
++{
++      int error = -EINVAL;
++      struct nfs4_ace *ace, *ace2;
++
++      ace = list_entry(*p, struct nfs4_ace, l_ace);
++      if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE)
++              goto out;
++      error = write_pace(ace, pacl, pace, ACL_OTHER, flags);
++      if (error < 0)
++              goto out;
++      error = -EINVAL;
++      ace2 = get_next_v4_ace(p, &n4acl->ace_head);
++      if (ace2 == NULL)
++              goto out;
++      if (!complementary_ace_pair(ace, ace2, flags))
++              goto out;
++      error = 0;
++out:
++      return error;
++}
++
++static int
++calculate_posix_ace_count(struct nfs4_acl *n4acl)
++{
++      if (n4acl->naces == 6) /* owner, owner group, and other only */
++              return 3;
++      else { /* Otherwise there must be a mask entry. */
++              /* Also, the remaining entries are for named users and
++               * groups, and come in threes (mask, allow, deny): */
++              if (n4acl->naces < 7)
++                      return -1;
++              if ((n4acl->naces - 7) % 3)
++                      return -1;
++              return 4 + (n4acl->naces - 7)/3;
++      }
++}
++
++
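++/* Translate one NFSv4 ACL, which must follow the canonical layout produced
++ * by the POSIX-to-NFSv4 mapping, back into a POSIX ACL; any deviation from
++ * that layout is rejected with -EINVAL. */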
++static struct posix_acl *
++_nfsv4_to_posix_one(struct nfs4_acl *n4acl, unsigned int flags)
++{
++      struct posix_acl *pacl;
++      int error = -EINVAL, nace = 0;
++      struct list_head *p;
++      struct nfs4_ace *mask_ace = NULL;
++      struct posix_acl_entry *pace;
++
++      nace = calculate_posix_ace_count(n4acl);
++      if (nace < 0)
++              goto out_err;
++
++      pacl = posix_acl_alloc(nace, GFP_KERNEL);
++      error = -ENOMEM;
++      if (pacl == NULL)
++              goto out_err;
++
++      pace = &pacl->a_entries[0];
++      p = &n4acl->ace_head;
++
++      error = user_obj_from_v4(n4acl, &p, pacl, &pace, flags);
++      if (error)
++              goto out_acl;
++
++      error = users_from_v4(n4acl, &p, &mask_ace, pacl, &pace, flags);
++      if (error)
++              goto out_acl;
++
++      error = group_obj_and_groups_from_v4(n4acl, &p, &mask_ace, pacl, &pace,
++                                              flags);
++      if (error)
++              goto out_acl;
++
++      error = mask_from_v4(n4acl, &p, &mask_ace, pacl, &pace, flags);
++      if (error)
++              goto out_acl;
++      error = other_from_v4(n4acl, &p, pacl, &pace, flags);
++      if (error)
++              goto out_acl;
++
++      error = -EINVAL;
++      if (p->next != &n4acl->ace_head)
++              goto out_acl;
++      if (pace != pacl->a_entries + pacl->a_count)
++              goto out_acl;
++
++      sort_pacl(pacl);
++
++      return pacl;
++out_acl:
++      posix_acl_release(pacl);
++out_err:
++      pacl = ERR_PTR(error);
++      return pacl;
++}
++
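++/* Move every ace carrying the full set of inheritance flags from acl to
++ * dacl; those aces form the default (inheritable) ACL. */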
++int
++nfs4_acl_split(struct nfs4_acl *acl, struct nfs4_acl *dacl)
++{
++      struct list_head *h, *n;
++      struct nfs4_ace *ace;
++      int error = 0;
++
++      list_for_each_safe(h, n, &acl->ace_head) {
++              ace = list_entry(h, struct nfs4_ace, l_ace);
++
++              if ((ace->flag & NFS4_INHERITANCE_FLAGS)
++                              != NFS4_INHERITANCE_FLAGS)
++                      continue;
++
++              error = nfs4_acl_add_ace(dacl, ace->type, ace->flag,
++                              ace->access_mask, ace->whotype, ace->who);
++              if (error < 0)
++                      goto out;
++
++              list_del(h);
++              kfree(ace);
++              acl->naces--;
++      }
++
++out:
++      return error;
++}
++
++static short
++ace2type(struct nfs4_ace *ace)
++{
++      switch (ace->whotype) {
++              case NFS4_ACL_WHO_NAMED:
++                      return (ace->flag & NFS4_ACE_IDENTIFIER_GROUP ?
++                                      ACL_GROUP : ACL_USER);
++              case NFS4_ACL_WHO_OWNER:
++                      return ACL_USER_OBJ;
++              case NFS4_ACL_WHO_GROUP:
++                      return ACL_GROUP_OBJ;
++              case NFS4_ACL_WHO_EVERYONE:
++                      return ACL_OTHER;
++      }
++      BUG();
++      return -1;
++}
++
++EXPORT_SYMBOL(nfs4_acl_posix_to_nfsv4);
++EXPORT_SYMBOL(nfs4_acl_nfsv4_to_posix);
++
++struct nfs4_acl *
++nfs4_acl_new(void)
++{
++      struct nfs4_acl *acl;
++
++      if ((acl = kmalloc(sizeof(*acl), GFP_KERNEL)) == NULL)
++              return NULL;
++
++      acl->naces = 0;
++      INIT_LIST_HEAD(&acl->ace_head);
++
++      return acl;
++}
++
++void
++nfs4_acl_free(struct nfs4_acl *acl)
++{
++      struct list_head *h;
++      struct nfs4_ace *ace;
++
++      if (!acl)
++              return;
++
++      while (!list_empty(&acl->ace_head)) {
++              h = acl->ace_head.next;
++              list_del(h);
++              ace = list_entry(h, struct nfs4_ace, l_ace);
++              kfree(ace);
++      }
++
++      kfree(acl);
++
++      return;
++}
++
++int
++nfs4_acl_add_ace(struct nfs4_acl *acl, u32 type, u32 flag, u32 access_mask,
++              int whotype, uid_t who)
++{
++      struct nfs4_ace *ace;
++
++      if ((ace = kmalloc(sizeof(*ace), GFP_KERNEL)) == NULL)
++              return -1;
++
++      ace->type = type;
++      ace->flag = flag;
++      ace->access_mask = access_mask;
++      ace->whotype = whotype;
++      ace->who = who;
++
++      list_add_tail(&ace->l_ace, &acl->ace_head);
++      acl->naces++;
++
++      return 0;
++}
++
++static struct {
++      char *string;
++      int   stringlen;
++      int type;
++} s2t_map[] = {
++      {
++              .string    = "OWNER@",
++              .stringlen = sizeof("OWNER@") - 1,
++              .type      = NFS4_ACL_WHO_OWNER,
++      },
++      {
++              .string    = "GROUP@",
++              .stringlen = sizeof("GROUP@") - 1,
++              .type      = NFS4_ACL_WHO_GROUP,
++      },
++      {
++              .string    = "EVERYONE@",
++              .stringlen = sizeof("EVERYONE@") - 1,
++              .type      = NFS4_ACL_WHO_EVERYONE,
++      },
++};
++
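++/* Map an on-the-wire who string to a whotype; anything other than the
++ * special OWNER@/GROUP@/EVERYONE@ strings is a named user or group. */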
++int
++nfs4_acl_get_whotype(char *p, u32 len)
++{
++      int i;
++
++      for (i=0; i < sizeof(s2t_map) / sizeof(*s2t_map); i++) {
++              if (s2t_map[i].stringlen == len &&
++                              0 == memcmp(s2t_map[i].string, p, len))
++                      return s2t_map[i].type;
++      }
++      return NFS4_ACL_WHO_NAMED;
++}
++
++int
++nfs4_acl_write_who(int who, char *p)
++{
++      int i;
++
++      for (i=0; i < sizeof(s2t_map) / sizeof(*s2t_map); i++) {
++              if (s2t_map[i].type == who) {
++                      memcpy(p, s2t_map[i].string, s2t_map[i].stringlen);
++                      return s2t_map[i].stringlen;
++              }
++      }
++      BUG();
++      return -1;
++}
++
++static inline int
++match_who(struct nfs4_ace *ace, uid_t owner, gid_t group, uid_t who)
++{
++      switch (ace->whotype) {
++              case NFS4_ACL_WHO_NAMED:
++                      return who == ace->who;
++              case NFS4_ACL_WHO_OWNER:
++                      return who == owner;
++              case NFS4_ACL_WHO_GROUP:
++                      return who == group;
++              case NFS4_ACL_WHO_EVERYONE:
++                      return 1;
++              default:
++                      return 0;
++      }
++}
++
++/* 0 = granted, -EACCES = denied; mask is an nfsv4 mask, not mode bits */
++int
++nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group,
++                      uid_t who, u32 mask)
++{
++      struct nfs4_ace *ace;
++      u32 allowed = 0;
++
++      list_for_each_entry(ace, &acl->ace_head, l_ace) {
++              if (!match_who(ace, owner, group, who))
++                      continue;
++              switch (ace->type) {
++                      case NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE:
++                              allowed |= ace->access_mask;
++                              if ((allowed & mask) == mask)
++                                      return 0;
++                              break;
++                      case NFS4_ACE_ACCESS_DENIED_ACE_TYPE:
++                              if (ace->access_mask & mask)
++                                      return -EACCES;
++                              break;
++              }
++      }
++      return -EACCES;
++}
++
++EXPORT_SYMBOL(nfs4_acl_new);
++EXPORT_SYMBOL(nfs4_acl_free);
++EXPORT_SYMBOL(nfs4_acl_add_ace);
++EXPORT_SYMBOL(nfs4_acl_get_whotype);
++EXPORT_SYMBOL(nfs4_acl_write_who);
++EXPORT_SYMBOL(nfs4_acl_permission);
+--- linux-2.6.7/fs/nfsd/Makefile.lsec  2004-06-15 23:19:13.000000000 -0600
++++ linux-2.6.7/fs/nfsd/Makefile       2005-03-23 14:28:24.461331008 -0700
+@@ -7,5 +7,6 @@ obj-$(CONFIG_NFSD)     += nfsd.o
+ nfsd-y                        := nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
+                          export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o
+ nfsd-$(CONFIG_NFSD_V3)        += nfs3proc.o nfs3xdr.o
+-nfsd-$(CONFIG_NFSD_V4)        += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o
++nfsd-$(CONFIG_NFSD_V4)        += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
++                         nfs4acl.o nfs4callback.o
+ nfsd-objs             := $(nfsd-y)
+--- linux-2.6.7/fs/nfsd/nfsctl.c.lsec  2004-06-15 23:19:01.000000000 -0600
++++ linux-2.6.7/fs/nfsd/nfsctl.c       2005-03-23 14:28:24.132381016 -0700
+@@ -36,7 +36,7 @@
+ #include <asm/uaccess.h>
+ /*
+- *    We have a single directory with 8 nodes in it.
++ *    We have a single directory with 9 nodes in it.
+  */
+ enum {
+       NFSD_Root = 1,
+@@ -50,6 +50,7 @@ enum {
+       NFSD_List,
+       NFSD_Fh,
+       NFSD_Threads,
++      NFSD_Leasetime,
+ };
+ /*
+@@ -64,6 +65,7 @@ static ssize_t write_getfd(struct file *
+ static ssize_t write_getfs(struct file *file, char *buf, size_t size);
+ static ssize_t write_filehandle(struct file *file, char *buf, size_t size);
+ static ssize_t write_threads(struct file *file, char *buf, size_t size);
++static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
+ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
+       [NFSD_Svc] = write_svc,
+@@ -75,6 +77,7 @@ static ssize_t (*write_op[])(struct file
+       [NFSD_Getfs] = write_getfs,
+       [NFSD_Fh] = write_filehandle,
+       [NFSD_Threads] = write_threads,
++      [NFSD_Leasetime] = write_leasetime,
+ };
+ /* an argresp is stored in an allocated page and holds the 
+@@ -393,6 +396,29 @@ static ssize_t write_threads(struct file
+       return strlen(buf);
+ }
++extern time_t nfs4_lease_time(void);
++
++static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
++{
++      /* If a new lease time was written, validate it (10..3600 seconds)
++       * and call nfs4_reset_lease(); then write the current lease time
++       * in seconds back as the reply.
++       */
++      char *mesg = buf;
++      int rv;
++
++      if (size > 0) {
++              int lease;
++              rv = get_int(&mesg, &lease);
++              if (rv)
++                      return rv;
++              if (lease < 10 || lease > 3600)
++                      return -EINVAL;
++              nfs4_reset_lease(lease);
++      }
++      sprintf(buf, "%ld\n", nfs4_lease_time());
++      return strlen(buf);
++}
++
+ /*----------------------------------------------------------------------------*/
+ /*
+  *    populating the filesystem.
+@@ -411,6 +437,7 @@ static int nfsd_fill_super(struct super_
+               [NFSD_List] = {"exports", &exports_operations, S_IRUGO},
+               [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR},
+               [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR},
++              [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
+               /* last one */ {""}
+       };
+       return simple_fill_super(sb, 0x6e667364, nfsd_files);
+--- linux-2.6.7/fs/nfs/callback_proc.c.lsec    2005-03-23 14:28:22.485631360 -0700
++++ linux-2.6.7/fs/nfs/callback_proc.c 2005-03-23 14:28:22.485631360 -0700
+@@ -0,0 +1,85 @@
++/*
++ * linux/fs/nfs/callback_proc.c
++ *
++ * Copyright (C) 2004 Trond Myklebust
++ *
++ * NFSv4 callback procedures
++ */
++#include <linux/config.h>
++#include <linux/nfs4.h>
++#include <linux/nfs_fs.h>
++#include "callback.h"
++#include "delegation.h"
++
++#define NFSDBG_FACILITY NFSDBG_CALLBACK
++ 
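++/* CB_GETATTR: report the size and change/time attributes of a file for
++ * which this client holds a write delegation. */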
++unsigned nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res)
++{
++      struct nfs4_client *clp;
++      struct nfs_delegation *delegation;
++      struct nfs_inode *nfsi;
++      struct inode *inode;
++      
++      res->bitmap[0] = res->bitmap[1] = 0;
++      res->status = htonl(NFS4ERR_BADHANDLE);
++      clp = nfs4_find_client(&args->addr->sin_addr);
++      if (clp == NULL)
++              goto out;
++      inode = nfs_delegation_find_inode(clp, &args->fh);
++      if (inode == NULL)
++              goto out_putclient;
++      nfsi = NFS_I(inode);
++      down_read(&nfsi->rwsem);
++      delegation = nfsi->delegation;
++      if (delegation == NULL || (delegation->type & FMODE_WRITE) == 0)
++              goto out_iput;
++      res->size = i_size_read(inode);
++      res->change_attr = NFS_CHANGE_ATTR(inode);
++      res->ctime = inode->i_ctime;
++      res->mtime = inode->i_mtime;
++      res->bitmap[0] = (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) &
++              args->bitmap[0];
++      res->bitmap[1] = (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) &
++              args->bitmap[1];
++      res->status = 0;
++out_iput:
++      up_read(&nfsi->rwsem);
++      iput(inode);
++out_putclient:
++      nfs4_put_client(clp);
++out:
++      dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(res->status));
++      return res->status;
++}
++
++unsigned nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
++{
++      struct nfs4_client *clp;
++      struct inode *inode;
++      unsigned res;
++      
++      res = htonl(NFS4ERR_BADHANDLE);
++      clp = nfs4_find_client(&args->addr->sin_addr);
++      if (clp == NULL)
++              goto out;
++      inode = nfs_delegation_find_inode(clp, &args->fh);
++      if (inode == NULL)
++              goto out_putclient;
++      /* Set up a helper thread to actually return the delegation */
++      switch(nfs_async_inode_return_delegation(inode, &args->stateid)) {
++              case 0:
++                      res = 0;
++                      break;
++              case -ENOENT:
++                      res = htonl(NFS4ERR_BAD_STATEID);
++                      break;
++              default:
++                      res = htonl(NFS4ERR_RESOURCE);
++      }
++      iput(inode);
++out_putclient:
++      nfs4_put_client(clp);
++out:
++      dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(res));
++      return res;
++}
+--- linux-2.6.7/fs/nfs/delegation.c.lsec       2005-03-23 14:28:22.546622088 -0700
++++ linux-2.6.7/fs/nfs/delegation.c    2005-03-23 14:28:22.545622240 -0700
+@@ -0,0 +1,320 @@
++/*
++ * linux/fs/nfs/delegation.c
++ *
++ * Copyright (C) 2004 Trond Myklebust
++ *
++ * NFS file delegation management
++ *
++ */
++#include <linux/config.h>
++#include <linux/completion.h>
++#include <linux/module.h>
++#include <linux/sched.h>
++#include <linux/spinlock.h>
++
++#include <linux/nfs4.h>
++#include <linux/nfs_fs.h>
++#include <linux/nfs_xdr.h>
++
++#include "delegation.h"
++
++static struct nfs_delegation *nfs_alloc_delegation(void)
++{
++      return (struct nfs_delegation *)kmalloc(sizeof(struct nfs_delegation), GFP_KERNEL);
++}
++
++static void nfs_free_delegation(struct nfs_delegation *delegation)
++{
++      if (delegation->cred)
++              put_rpccred(delegation->cred);
++      kfree(delegation);
++}
++
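++/* Re-establish open state on the server for every open context that still
++ * relies on the delegation; the scan restarts after i_lock is dropped for
++ * each recall. */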
++static void nfs_delegation_claim_opens(struct inode *inode)
++{
++      struct nfs_inode *nfsi = NFS_I(inode);
++      struct nfs_open_context *ctx;
++      struct nfs4_state *state;
++
++again:
++      spin_lock(&inode->i_lock);
++      list_for_each_entry(ctx, &nfsi->open_files, list) {
++              state = ctx->state;
++              if (state == NULL)
++                      continue;
++              if (!test_bit(NFS_DELEGATED_STATE, &state->flags))
++                      continue;
++              get_nfs_open_context(ctx);
++              spin_unlock(&inode->i_lock);
++              if (nfs4_open_delegation_recall(ctx->dentry, state) < 0) {
++                      put_nfs_open_context(ctx);
++                      return;
++              }
++              put_nfs_open_context(ctx);
++              goto again;
++      }
++      spin_unlock(&inode->i_lock);
++}
++
++/*
++ * Refresh an existing delegation on an inode with the state returned
++ * by a reclaim open
++ */
++void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res)
++{
++      struct nfs_delegation *delegation = NFS_I(inode)->delegation;
++
++      if (delegation == NULL)
++              return;
++      memcpy(delegation->stateid.data, res->delegation.data,
++                      sizeof(delegation->stateid.data));
++      delegation->type = res->delegation_type;
++      delegation->maxsize = res->maxsize;
++      put_rpccred(delegation->cred);
++      delegation->cred = get_rpccred(cred);
++      delegation->flags &= ~NFS_DELEGATION_NEED_RECLAIM;
++      NFS_I(inode)->delegation_state = delegation->type;
++      wmb();
++}
++
++/*
++ * Set up a delegation on an inode
++ */
++int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res)
++{
++      struct nfs4_client *clp = NFS_SERVER(inode)->nfs4_state;
++      struct nfs_inode *nfsi = NFS_I(inode);
++      struct nfs_delegation *delegation;
++      int status = 0;
++
++      delegation = nfs_alloc_delegation();
++      if (delegation == NULL)
++              return -ENOMEM;
++      memcpy(delegation->stateid.data, res->delegation.data,
++                      sizeof(delegation->stateid.data));
++      delegation->type = res->delegation_type;
++      delegation->maxsize = res->maxsize;
++      delegation->cred = get_rpccred(cred);
++      delegation->inode = inode;
++
++      spin_lock(&clp->cl_lock);
++      if (nfsi->delegation == NULL) {
++              list_add(&delegation->super_list, &clp->cl_delegations);
++              nfsi->delegation = delegation;
++              nfsi->delegation_state = delegation->type;
++              delegation = NULL;
++      } else {
++              if (memcmp(&delegation->stateid, &nfsi->delegation->stateid,
++                                      sizeof(delegation->stateid)) != 0 ||
++                              delegation->type != nfsi->delegation->type) {
++                      printk("%s: server %u.%u.%u.%u handed out a duplicate delegation!\n",
++                                      __FUNCTION__, NIPQUAD(clp->cl_addr));
++                      status = -EIO;
++              }
++      }
++      spin_unlock(&clp->cl_lock);
++      if (delegation != NULL)
++              nfs_free_delegation(delegation);
++      return status;
++}
++
++static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation)
++{
++      int res = 0;
++
++      __nfs_revalidate_inode(NFS_SERVER(inode), inode);
++
++      res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid);
++      nfs_free_delegation(delegation);
++      return res;
++}
++
++/* Sync all data to disk upon delegation return */
++static void nfs_msync_inode(struct inode *inode)
++{
++      down(&inode->i_sem);
++      filemap_fdatawrite(inode->i_mapping);
++      nfs_wb_all(inode);
++      filemap_fdatawait(inode->i_mapping);
++      up(&inode->i_sem);
++}
++
++/*
++ * Basic procedure for returning a delegation to the server
++ */
++int nfs_inode_return_delegation(struct inode *inode)
++{
++      struct nfs4_client *clp = NFS_SERVER(inode)->nfs4_state;
++      struct nfs_inode *nfsi = NFS_I(inode);
++      struct nfs_delegation *delegation;
++      int res = 0;
++
++      nfs_msync_inode(inode);
++      down_read(&clp->cl_sem);
++      /* Guard against new delegated open calls */
++      down_write(&nfsi->rwsem);
++      spin_lock(&clp->cl_lock);
++      delegation = nfsi->delegation;
++      if (delegation != NULL) {
++              list_del_init(&delegation->super_list);
++              nfsi->delegation = NULL;
++              nfsi->delegation_state = 0;
++      }
++      spin_unlock(&clp->cl_lock);
++      nfs_delegation_claim_opens(inode);
++      up_write(&nfsi->rwsem);
++      up_read(&clp->cl_sem);
++      nfs_msync_inode(inode);
++
++      if (delegation != NULL)
++              res = nfs_do_return_delegation(inode, delegation);
++      return res;
++}
++
++/*
++ * Return all delegations associated to a super block
++ */
++void nfs_return_all_delegations(struct super_block *sb)
++{
++      struct nfs4_client *clp = NFS_SB(sb)->nfs4_state;
++      struct nfs_delegation *delegation;
++      struct inode *inode;
++
++      if (clp == NULL)
++              return;
++restart:
++      spin_lock(&clp->cl_lock);
++      list_for_each_entry(delegation, &clp->cl_delegations, super_list) {
++              if (delegation->inode->i_sb != sb)
++                      continue;
++              inode = igrab(delegation->inode);
++              if (inode == NULL)
++                      continue;
++              spin_unlock(&clp->cl_lock);
++              nfs_inode_return_delegation(inode);
++              iput(inode);
++              goto restart;
++      }
++      spin_unlock(&clp->cl_lock);
++}
++
++struct recall_threadargs {
++      struct inode *inode;
++      struct nfs4_client *clp;
++      const nfs4_stateid *stateid;
++
++      struct completion started;
++      int result;
++};
++
++static int recall_thread(void *data)
++{
++      struct recall_threadargs *args = (struct recall_threadargs *)data;
++      struct inode *inode = igrab(args->inode);
++      struct nfs4_client *clp = NFS_SERVER(inode)->nfs4_state;
++      struct nfs_inode *nfsi = NFS_I(inode);
++      struct nfs_delegation *delegation;
++
++      daemonize("nfsv4-delegreturn");
++
++      nfs_msync_inode(inode);
++      down_read(&clp->cl_sem);
++      down_write(&nfsi->rwsem);
++      spin_lock(&clp->cl_lock);
++      delegation = nfsi->delegation;
++      if (delegation != NULL && memcmp(delegation->stateid.data,
++                              args->stateid->data,
++                              sizeof(delegation->stateid.data)) == 0) {
++              list_del_init(&delegation->super_list);
++              nfsi->delegation = NULL;
++              nfsi->delegation_state = 0;
++              args->result = 0;
++      } else {
++              delegation = NULL;
++              args->result = -ENOENT;
++      }
++      spin_unlock(&clp->cl_lock);
++      complete(&args->started);
++      nfs_delegation_claim_opens(inode);
++      up_write(&nfsi->rwsem);
++      up_read(&clp->cl_sem);
++      nfs_msync_inode(inode);
++
++      if (delegation != NULL)
++              nfs_do_return_delegation(inode, delegation);
++      iput(inode);
++      module_put_and_exit(0);
++}
++
++/*
++ * Asynchronous delegation recall!
++ */
++int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid)
++{
++      struct recall_threadargs data = {
++              .inode = inode,
++              .stateid = stateid,
++      };
++      int status;
++
++      init_completion(&data.started);
++      __module_get(THIS_MODULE);
++      status = kernel_thread(recall_thread, &data, CLONE_KERNEL);
++      if (status < 0)
++              goto out_module_put;
++      wait_for_completion(&data.started);
++      return data.result;
++out_module_put:
++      module_put(THIS_MODULE);
++      return status;
++}
++
++/*
++ * Retrieve the inode associated with a delegation
++ */
++struct inode *nfs_delegation_find_inode(struct nfs4_client *clp, const struct nfs_fh *fhandle)
++{
++      struct nfs_delegation *delegation;
++      struct inode *res = NULL;
++      spin_lock(&clp->cl_lock);
++      list_for_each_entry(delegation, &clp->cl_delegations, super_list) {
++              if (nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) {
++                      res = igrab(delegation->inode);
++                      break;
++              }
++      }
++      spin_unlock(&clp->cl_lock);
++      return res;
++}
++
++/*
++ * Mark all delegations as needing to be reclaimed
++ */
++void nfs_delegation_mark_reclaim(struct nfs4_client *clp)
++{
++      struct nfs_delegation *delegation;
++      spin_lock(&clp->cl_lock);
++      list_for_each_entry(delegation, &clp->cl_delegations, super_list)
++              delegation->flags |= NFS_DELEGATION_NEED_RECLAIM;
++      spin_unlock(&clp->cl_lock);
++}
++
++/*
++ * Reap all unclaimed delegations after reboot recovery is done
++ */
++void nfs_delegation_reap_unclaimed(struct nfs4_client *clp)
++{
++      struct nfs_delegation *delegation, *n;
++      LIST_HEAD(head);
++      spin_lock(&clp->cl_lock);
++      list_for_each_entry_safe(delegation, n, &clp->cl_delegations, super_list) {
++              if ((delegation->flags & NFS_DELEGATION_NEED_RECLAIM) == 0)
++                      continue;
++              list_move(&delegation->super_list, &head);
++              NFS_I(delegation->inode)->delegation = NULL;
++              NFS_I(delegation->inode)->delegation_state = 0;
++      }
++      spin_unlock(&clp->cl_lock);
++      while(!list_empty(&head)) {
++              delegation = list_entry(head.next, struct nfs_delegation, super_list);
++              list_del(&delegation->super_list);
++              nfs_free_delegation(delegation);
++      }
++}
+--- linux-2.6.7/fs/nfs/delegation.h.lsec       2005-03-23 14:28:22.546622088 -0700
++++ linux-2.6.7/fs/nfs/delegation.h    2005-03-23 14:28:22.546622088 -0700
+@@ -0,0 +1,56 @@
++/*
++ * linux/fs/nfs/delegation.h
++ *
++ * Copyright (c) Trond Myklebust
++ *
++ * Definitions pertaining to NFS delegated files
++ */
++#ifndef FS_NFS_DELEGATION_H
++#define FS_NFS_DELEGATION_H
++
++#if defined(CONFIG_NFS_V4)
++/*
++ * NFSv4 delegation
++ */
++struct nfs_delegation {
++      struct list_head super_list;
++      struct rpc_cred *cred;
++      struct inode *inode;
++      nfs4_stateid stateid;
++      int type;
++#define NFS_DELEGATION_NEED_RECLAIM 1
++      long flags;
++      loff_t maxsize;
++};
++
++int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
++void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
++int nfs_inode_return_delegation(struct inode *inode);
++int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid);
++
++struct inode *nfs_delegation_find_inode(struct nfs4_client *clp, const struct nfs_fh *fhandle);
++void nfs_return_all_delegations(struct super_block *sb);
++
++void nfs_delegation_mark_reclaim(struct nfs4_client *clp);
++void nfs_delegation_reap_unclaimed(struct nfs4_client *clp);
++
++/* NFSv4 delegation-related procedures */
++int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid);
++int nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state);
++
++static inline int nfs_have_delegation(struct inode *inode, int flags)
++{
++      flags &= FMODE_READ|FMODE_WRITE;
++      rmb();
++      if ((NFS_I(inode)->delegation_state & flags) == flags)
++              return 1;
++      return 0;
++}
++#else
++static inline int nfs_have_delegation(struct inode *inode, int flags)
++{
++      return 0;
++}
++#endif
++
++#endif
+--- linux-2.6.7/fs/nfs/nfs3proc.c.lsec 2004-06-15 23:19:23.000000000 -0600
++++ linux-2.6.7/fs/nfs/nfs3proc.c      2005-03-23 14:28:22.820580440 -0700
+@@ -68,18 +68,6 @@ nfs3_async_handle_jukebox(struct rpc_tas
+       return 1;
+ }
+-static struct rpc_cred *
+-nfs_cred(struct inode *inode, struct file *filp)
+-{
+-      struct rpc_cred *cred = NULL;
+-
+-      if (filp)
+-              cred = (struct rpc_cred *)filp->private_data;
+-      if (!cred)
+-              cred = NFS_I(inode)->mm_cred;
+-      return cred;
+-}
+-
+ /*
+  * Bare-bones access to getattr: this is for nfs_read_super.
+  */
+@@ -164,8 +152,7 @@ nfs3_proc_lookup(struct inode *dir, stru
+       return status;
+ }
+-static int
+-nfs3_proc_access(struct inode *inode, struct rpc_cred *cred, int mode)
++static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
+ {
+       struct nfs_fattr        fattr;
+       struct nfs3_accessargs  arg = {
+@@ -178,9 +165,10 @@ nfs3_proc_access(struct inode *inode, st
+               .rpc_proc       = &nfs3_procedures[NFS3PROC_ACCESS],
+               .rpc_argp       = &arg,
+               .rpc_resp       = &res,
+-              .rpc_cred       = cred
++              .rpc_cred       = entry->cred
+       };
+-      int     status;
++      int mode = entry->mask;
++      int status;
+       dprintk("NFS call  access\n");
+       fattr.valid = 0;
+@@ -200,10 +188,16 @@ nfs3_proc_access(struct inode *inode, st
+       }
+       status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+       nfs_refresh_inode(inode, &fattr);
+-      dprintk("NFS reply access\n");
+-
+-      if (status == 0 && (arg.access & res.access) != arg.access)
+-              status = -EACCES;
++      if (status == 0) {
++              entry->mask = 0;
++              if (res.access & NFS3_ACCESS_READ)
++                      entry->mask |= MAY_READ;
++              if (res.access & (NFS3_ACCESS_MODIFY | NFS3_ACCESS_EXTEND | NFS3_ACCESS_DELETE))
++                      entry->mask |= MAY_WRITE;
++              if (res.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE))
++                      entry->mask |= MAY_EXEC;
++      }
++      dprintk("NFS reply access, status = %d\n", status);
+       return status;
+ }
+@@ -227,8 +221,7 @@ nfs3_proc_readlink(struct inode *inode, 
+       return status;
+ }
+-static int
+-nfs3_proc_read(struct nfs_read_data *rdata, struct file *filp)
++static int nfs3_proc_read(struct nfs_read_data *rdata)
+ {
+       int                     flags = rdata->flags;
+       struct inode *          inode = rdata->inode;
+@@ -237,13 +230,13 @@ nfs3_proc_read(struct nfs_read_data *rda
+               .rpc_proc       = &nfs3_procedures[NFS3PROC_READ],
+               .rpc_argp       = &rdata->args,
+               .rpc_resp       = &rdata->res,
++              .rpc_cred       = rdata->cred,
+       };
+       int                     status;
+       dprintk("NFS call  read %d @ %Ld\n", rdata->args.count,
+                       (long long) rdata->args.offset);
+       fattr->valid = 0;
+-      msg.rpc_cred = nfs_cred(inode, filp);
+       status = rpc_call_sync(NFS_CLIENT(inode), &msg, flags);
+       if (status >= 0)
+               nfs_refresh_inode(inode, fattr);
+@@ -251,8 +244,7 @@ nfs3_proc_read(struct nfs_read_data *rda
+       return status;
+ }
+-static int
+-nfs3_proc_write(struct nfs_write_data *wdata, struct file *filp)
++static int nfs3_proc_write(struct nfs_write_data *wdata)
+ {
+       int                     rpcflags = wdata->flags;
+       struct inode *          inode = wdata->inode;
+@@ -261,13 +253,13 @@ nfs3_proc_write(struct nfs_write_data *w
+               .rpc_proc       = &nfs3_procedures[NFS3PROC_WRITE],
+               .rpc_argp       = &wdata->args,
+               .rpc_resp       = &wdata->res,
++              .rpc_cred       = wdata->cred,
+       };
+       int                     status;
+       dprintk("NFS call  write %d @ %Ld\n", wdata->args.count,
+                       (long long) wdata->args.offset);
+       fattr->valid = 0;
+-      msg.rpc_cred = nfs_cred(inode, filp);
+       status = rpc_call_sync(NFS_CLIENT(inode), &msg, rpcflags);
+       if (status >= 0)
+               nfs_refresh_inode(inode, fattr);
+@@ -275,8 +267,7 @@ nfs3_proc_write(struct nfs_write_data *w
+       return status < 0? status : wdata->res.count;
+ }
+-static int
+-nfs3_proc_commit(struct nfs_write_data *cdata, struct file *filp)
++static int nfs3_proc_commit(struct nfs_write_data *cdata)
+ {
+       struct inode *          inode = cdata->inode;
+       struct nfs_fattr *      fattr = cdata->res.fattr;
+@@ -284,13 +275,13 @@ nfs3_proc_commit(struct nfs_write_data *
+               .rpc_proc       = &nfs3_procedures[NFS3PROC_COMMIT],
+               .rpc_argp       = &cdata->args,
+               .rpc_resp       = &cdata->res,
++              .rpc_cred       = cdata->cred,
+       };
+       int                     status;
+       dprintk("NFS call  commit %d @ %Ld\n", cdata->args.count,
+                       (long long) cdata->args.offset);
+       fattr->valid = 0;
+-      msg.rpc_cred = nfs_cred(inode, filp);
+       status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+       if (status >= 0)
+               nfs_refresh_inode(inode, fattr);
+@@ -534,6 +525,8 @@ nfs3_proc_symlink(struct inode *dir, str
+       };
+       int                     status;
++      if (path->len > NFS3_MAXPATHLEN)
++              return -ENAMETOOLONG;
+       dprintk("NFS call  symlink %s -> %s\n", name->name, path->name);
+       dir_attr.valid = 0;
+       fattr->valid = 0;
+@@ -832,27 +825,6 @@ nfs3_proc_commit_setup(struct nfs_write_
+       rpc_call_setup(task, &msg, 0);
+ }
+-/*
+- * Set up the nfspage struct with the right credentials
+- */
+-void
+-nfs3_request_init(struct nfs_page *req, struct file *filp)
+-{
+-      req->wb_cred = get_rpccred(nfs_cred(req->wb_inode, filp));
+-}
+-
+-static int
+-nfs3_request_compatible(struct nfs_page *req, struct file *filp, struct page *page)
+-{
+-      if (req->wb_file != filp)
+-              return 0;
+-      if (req->wb_page != page)
+-              return 0;
+-      if (req->wb_cred != nfs_file_cred(filp))
+-              return 0;
+-      return 1;
+-}
+-
+ static int
+ nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
+ {
+@@ -863,6 +835,7 @@ struct nfs_rpc_ops nfs_v3_clientops = {
+       .version        = 3,                    /* protocol version */
+       .dentry_ops     = &nfs_dentry_operations,
+       .dir_inode_ops  = &nfs_dir_inode_operations,
++      .file_inode_ops = &nfs_file_inode_operations,
+       .getroot        = nfs3_proc_get_root,
+       .getattr        = nfs3_proc_getattr,
+       .setattr        = nfs3_proc_setattr,
+@@ -892,7 +865,5 @@ struct nfs_rpc_ops nfs_v3_clientops = {
+       .commit_setup   = nfs3_proc_commit_setup,
+       .file_open      = nfs_open,
+       .file_release   = nfs_release,
+-      .request_init   = nfs3_request_init,
+-      .request_compatible = nfs3_request_compatible,
+       .lock           = nfs3_proc_lock,
+ };
+--- linux-2.6.7/fs/nfs/proc.c.lsec     2004-06-15 23:20:03.000000000 -0600
++++ linux-2.6.7/fs/nfs/proc.c  2005-03-23 14:28:23.058544264 -0700
+@@ -49,18 +49,6 @@
+ extern struct rpc_procinfo nfs_procedures[];
+-static struct rpc_cred *
+-nfs_cred(struct inode *inode, struct file *filp)
+-{
+-      struct rpc_cred *cred = NULL;
+-
+-      if (filp)
+-              cred = (struct rpc_cred *)filp->private_data;
+-      if (!cred)
+-              cred = NFS_I(inode)->mm_cred;
+-      return cred;
+-}
+-
+ /*
+  * Bare-bones access to getattr: this is for nfs_read_super.
+  */
+@@ -167,8 +155,7 @@ nfs_proc_readlink(struct inode *inode, s
+       return status;
+ }
+-static int
+-nfs_proc_read(struct nfs_read_data *rdata, struct file *filp)
++static int nfs_proc_read(struct nfs_read_data *rdata)
+ {
+       int                     flags = rdata->flags;
+       struct inode *          inode = rdata->inode;
+@@ -177,15 +164,14 @@ nfs_proc_read(struct nfs_read_data *rdat
+               .rpc_proc       = &nfs_procedures[NFSPROC_READ],
+               .rpc_argp       = &rdata->args,
+               .rpc_resp       = &rdata->res,
++              .rpc_cred       = rdata->cred,
+       };
+       int                     status;
+       dprintk("NFS call  read %d @ %Ld\n", rdata->args.count,
+                       (long long) rdata->args.offset);
+       fattr->valid = 0;
+-      msg.rpc_cred = nfs_cred(inode, filp);
+       status = rpc_call_sync(NFS_CLIENT(inode), &msg, flags);
+-
+       if (status >= 0) {
+               nfs_refresh_inode(inode, fattr);
+               /* Emulate the eof flag, which isn't normally needed in NFSv2
+@@ -198,8 +184,7 @@ nfs_proc_read(struct nfs_read_data *rdat
+       return status;
+ }
+-static int
+-nfs_proc_write(struct nfs_write_data *wdata, struct file *filp)
++static int nfs_proc_write(struct nfs_write_data *wdata)
+ {
+       int                     flags = wdata->flags;
+       struct inode *          inode = wdata->inode;
+@@ -208,13 +193,13 @@ nfs_proc_write(struct nfs_write_data *wd
+               .rpc_proc       = &nfs_procedures[NFSPROC_WRITE],
+               .rpc_argp       = &wdata->args,
+               .rpc_resp       = &wdata->res,
++              .rpc_cred       = wdata->cred,
+       };
+       int                     status;
+       dprintk("NFS call  write %d @ %Ld\n", wdata->args.count,
+                       (long long) wdata->args.offset);
+       fattr->valid = 0;
+-      msg.rpc_cred = nfs_cred(inode, filp);
+       status = rpc_call_sync(NFS_CLIENT(inode), &msg, flags);
+       if (status >= 0) {
+               nfs_refresh_inode(inode, fattr);
+@@ -400,6 +385,8 @@ nfs_proc_symlink(struct inode *dir, stru
+       };
+       int                     status;
++      if (path->len > NFS2_MAXPATHLEN)
++              return -ENAMETOOLONG;
+       dprintk("NFS call  symlink %s -> %s\n", name->name, path->name);
+       fattr->valid = 0;
+       status = rpc_call(NFS_CLIENT(dir), NFSPROC_SYMLINK, &arg, NULL, 0);
+@@ -619,27 +606,6 @@ nfs_proc_commit_setup(struct nfs_write_d
+       BUG();
+ }
+-/*
+- * Set up the nfspage struct with the right credentials
+- */
+-static void
+-nfs_request_init(struct nfs_page *req, struct file *filp)
+-{
+-      req->wb_cred = get_rpccred(nfs_cred(req->wb_inode, filp));
+-}
+-
+-static int
+-nfs_request_compatible(struct nfs_page *req, struct file *filp, struct page *page)
+-{
+-      if (req->wb_file != filp)
+-              return 0;
+-      if (req->wb_page != page)
+-              return 0;
+-      if (req->wb_cred != nfs_file_cred(filp))
+-              return 0;
+-      return 1;
+-}
+-
+ static int
+ nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
+ {
+@@ -651,6 +617,7 @@ struct nfs_rpc_ops nfs_v2_clientops = {
+       .version        = 2,                   /* protocol version */
+       .dentry_ops     = &nfs_dentry_operations,
+       .dir_inode_ops  = &nfs_dir_inode_operations,
++      .file_inode_ops = &nfs_file_inode_operations,
+       .getroot        = nfs_proc_get_root,
+       .getattr        = nfs_proc_getattr,
+       .setattr        = nfs_proc_setattr,
+@@ -680,7 +647,5 @@ struct nfs_rpc_ops nfs_v2_clientops = {
+       .commit_setup   = nfs_proc_commit_setup,
+       .file_open      = nfs_open,
+       .file_release   = nfs_release,
+-      .request_init   = nfs_request_init,
+-      .request_compatible = nfs_request_compatible,
+       .lock           = nfs_proc_lock,
+ };
+--- linux-2.6.7/fs/nfs/file.c.lsec     2004-06-15 23:19:37.000000000 -0600
++++ linux-2.6.7/fs/nfs/file.c  2005-03-23 14:28:22.760589560 -0700
+@@ -31,6 +31,8 @@
+ #include <asm/uaccess.h>
+ #include <asm/system.h>
++#include "delegation.h"
++
+ #define NFSDBG_FACILITY               NFSDBG_FILE
+ static long nfs_file_fcntl(int fd, unsigned int cmd,
+@@ -66,6 +68,19 @@ struct inode_operations nfs_file_inode_o
+       .setattr        = nfs_setattr,
+ };
++#ifdef CONFIG_NFS_V4
++
++struct inode_operations nfs4_file_inode_operations = {
++      .permission     = nfs_permission,
++      .getattr        = nfs_getattr,
++      .setattr        = nfs_setattr,
++      .getxattr       = nfs_getxattr,
++      .setxattr       = nfs_setxattr,
++      .listxattr      = nfs_listxattr,
++};
++
++#endif /* CONFIG_NFS_V4 */
++
+ /* Hack for future NFS swap support */
+ #ifndef IS_SWAPFILE
+ # define IS_SWAPFILE(inode)   (0)
+@@ -127,6 +142,7 @@ nfs_file_release(struct inode *inode, st
+ static int
+ nfs_file_flush(struct file *file)
+ {
++      struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data;
+       struct inode    *inode = file->f_dentry->d_inode;
+       int             status;
+@@ -138,9 +154,9 @@ nfs_file_flush(struct file *file)
+       /* Ensure that data+attribute caches are up to date after close() */
+       status = nfs_wb_all(inode);
+       if (!status) {
+-              status = file->f_error;
+-              file->f_error = 0;
+-              if (!status)
++              status = ctx->error;
++              ctx->error = 0;
++              if (!status && !nfs_have_delegation(inode, FMODE_READ))
+                       __nfs_revalidate_inode(NFS_SERVER(inode), inode);
+       }
+       unlock_kernel();
+@@ -211,6 +227,7 @@ nfs_file_mmap(struct file * file, struct
+ static int
+ nfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+ {
++      struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data;
+       struct inode *inode = dentry->d_inode;
+       int status;
+@@ -219,8 +236,8 @@ nfs_fsync(struct file *file, struct dent
+       lock_kernel();
+       status = nfs_wb_all(inode);
+       if (!status) {
+-              status = file->f_error;
+-              file->f_error = 0;
++              status = ctx->error;
++              ctx->error = 0;
+       }
+       unlock_kernel();
+       return status;
+@@ -302,6 +319,90 @@ out_swapfile:
+       goto out;
+ }
++static int do_getlk(struct file *filp, int cmd, struct file_lock *fl)
++{
++      struct inode *inode = filp->f_mapping->host;
++      int status;
++
++      lock_kernel();
++      status = NFS_PROTO(inode)->lock(filp, cmd, fl);
++      unlock_kernel();
++      return status;
++}
++
++static int do_unlk(struct file *filp, int cmd, struct file_lock *fl)
++{
++      struct inode *inode = filp->f_mapping->host;
++      sigset_t oldset;
++      int status;
++
++      rpc_clnt_sigmask(NFS_CLIENT(inode), &oldset);
++      /*
++       * Flush all pending writes before doing anything
++       * with locks..
++       */
++      filemap_fdatawrite(filp->f_mapping);
++      down(&inode->i_sem);
++      nfs_wb_all(inode);
++      up(&inode->i_sem);
++      filemap_fdatawait(filp->f_mapping);
++
++      /* NOTE: special case
++       *      If we're signalled while cleaning up locks on process exit, we
++       *      still need to complete the unlock.
++       */
++      lock_kernel();
++      status = NFS_PROTO(inode)->lock(filp, cmd, fl);
++      rpc_clnt_sigunmask(NFS_CLIENT(inode), &oldset);
++      unlock_kernel();
++      return status;
++}
++
++static int do_setlk(struct file *filp, int cmd, struct file_lock *fl)
++{
++      struct inode *inode = filp->f_mapping->host;
++      int status;
++
++      /*
++       * Flush all pending writes before doing anything
++       * with locks..
++       */
++      status = filemap_fdatawrite(filp->f_mapping);
++      if (status == 0) {
++              down(&inode->i_sem);
++              status = nfs_wb_all(inode);
++              up(&inode->i_sem);
++              if (status == 0)
++                      status = filemap_fdatawait(filp->f_mapping);
++      }
++      if (status < 0)
++              return status;
++
++      lock_kernel();
++      status = NFS_PROTO(inode)->lock(filp, cmd, fl);
++      /* If we were signalled we still need to ensure that
++       * we clean up any state on the server. We therefore
++       * record the lock call as having succeeded in order to
++       * ensure that locks_remove_posix() cleans it out when
++       * the process exits.
++       */
++      if (status == -EINTR || status == -ERESTARTSYS)
++              posix_lock_file(filp, fl);
++      unlock_kernel();
++      if (status < 0)
++              return status;
++      /*
++       * Make sure we clear the cache whenever we try to get the lock.
++       * This makes locking act as a cache coherency point.
++       */
++      filemap_fdatawrite(filp->f_mapping);
++      down(&inode->i_sem);
++      nfs_wb_all(inode);      /* we may have slept */
++      up(&inode->i_sem);
++      filemap_fdatawait(filp->f_mapping);
++      nfs_zap_caches(inode);
++      return 0;
++}
++
+ /*
+  * Lock a (portion of) a file
+  */
+@@ -309,8 +410,6 @@ int
+ nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
+ {
+       struct inode * inode = filp->f_mapping->host;
+-      int     status = 0;
+-      int     status2;
+       dprintk("NFS: nfs_lock(f=%s/%ld, t=%x, fl=%x, r=%Ld:%Ld)\n",
+                       inode->i_sb->s_id, inode->i_ino,
+@@ -328,8 +427,8 @@ nfs_lock(struct file *filp, int cmd, str
+               /* Fake OK code if mounted without NLM support */
+               if (NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM) {
+                       if (IS_GETLK(cmd))
+-                              status = LOCK_USE_CLNT;
+-                      goto out_ok;
++                              return LOCK_USE_CLNT;
++                      return 0;
+               }
+       }
+@@ -340,45 +439,12 @@ nfs_lock(struct file *filp, int cmd, str
+        * Not sure whether that would be unique, though, or whether
+        * that would break in other places.
+        */
+-      if (!fl->fl_owner || !(fl->fl_flags & FL_POSIX))
++      if (!(fl->fl_flags & FL_POSIX))
+               return -ENOLCK;
+-      /*
+-       * Flush all pending writes before doing anything
+-       * with locks..
+-       */
+-      status = filemap_fdatawrite(filp->f_mapping);
+-      down(&inode->i_sem);
+-      status2 = nfs_wb_all(inode);
+-      if (!status)
+-              status = status2;
+-      up(&inode->i_sem);
+-      status2 = filemap_fdatawait(filp->f_mapping);
+-      if (!status)
+-              status = status2;
+-      if (status < 0)
+-              return status;
+-
+-      lock_kernel();
+-      status = NFS_PROTO(inode)->lock(filp, cmd, fl);
+-      unlock_kernel();
+-      if (status < 0)
+-              return status;
+-      
+-      status = 0;
+-
+-      /*
+-       * Make sure we clear the cache whenever we try to get the lock.
+-       * This makes locking act as a cache coherency point.
+-       */
+- out_ok:
+-      if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
+-              filemap_fdatawrite(filp->f_mapping);
+-              down(&inode->i_sem);
+-              nfs_wb_all(inode);      /* we may have slept */
+-              up(&inode->i_sem);
+-              filemap_fdatawait(filp->f_mapping);
+-              nfs_zap_caches(inode);
+-      }
+-      return status;
++      if (IS_GETLK(cmd))
++              return do_getlk(filp, cmd, fl);
++      if (fl->fl_type == F_UNLCK)
++              return do_unlk(filp, cmd, fl);
++      return do_setlk(filp, cmd, fl);
+ }
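
The fs/nfs/file.c hunks above split nfs_lock() into do_getlk()/do_unlk()/do_setlk(), keyed on the fcntl command and the lock type. As a rough standalone illustration of that dispatch (not part of the patch; it assumes only the standard F_GETLK/F_SETLK/F_SETLKW/F_UNLCK constants and ignores the NLM/NONLM special cases), a minimal userspace model might look like:

        /* Sketch only: models the dispatch added above, not the kernel code. */
        #include <fcntl.h>
        #include <stdio.h>

        static const char *classify_lock(int cmd, short fl_type)
        {
                if (cmd == F_GETLK)
                        return "do_getlk";      /* query only, no page flush needed */
                if (fl_type == F_UNLCK)
                        return "do_unlk";       /* must complete even if signalled */
                return "do_setlk";              /* flush dirty pages, lock, flush again */
        }

        int main(void)
        {
                printf("%s\n", classify_lock(F_GETLK, F_RDLCK));   /* do_getlk */
                printf("%s\n", classify_lock(F_SETLK, F_UNLCK));   /* do_unlk  */
                printf("%s\n", classify_lock(F_SETLKW, F_WRLCK));  /* do_setlk */
                return 0;
        }

Splitting the paths lets the unlock case finish its cleanup even after a signal, while the set-lock case keeps the flush-before/flush-after behaviour that makes locking act as a cache-coherency point, as the comments in the hunk note.
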
+--- linux-2.6.7/fs/nfs/write.c.lsec    2004-06-15 23:19:43.000000000 -0600
++++ linux-2.6.7/fs/nfs/write.c 2005-03-23 14:28:23.225518880 -0700
+@@ -63,6 +63,8 @@
+ #include <linux/smp_lock.h>
+ #include <linux/mempool.h>
++#include "delegation.h"
++
+ #define NFSDBG_FACILITY               NFSDBG_PAGECACHE
+ #define MIN_POOL_WRITE                (32)
+@@ -71,7 +73,8 @@
+ /*
+  * Local function declarations
+  */
+-static struct nfs_page * nfs_update_request(struct file*, struct inode *,
++static struct nfs_page * nfs_update_request(struct nfs_open_context*,
++                                          struct inode *,
+                                           struct page *,
+                                           unsigned int, unsigned int);
+ static void nfs_writeback_done_partial(struct nfs_write_data *, int);
+@@ -173,7 +176,7 @@ static void nfs_mark_uptodate(struct pag
+  * Write a page synchronously.
+  * Offset is the data offset within the page.
+  */
+-static int nfs_writepage_sync(struct file *file, struct inode *inode,
++static int nfs_writepage_sync(struct nfs_open_context *ctx, struct inode *inode,
+               struct page *page, unsigned int offset, unsigned int count,
+               int how)
+ {
+@@ -187,9 +190,10 @@ static int nfs_writepage_sync(struct fil
+       memset(wdata, 0, sizeof(*wdata));
+       wdata->flags = how;
++      wdata->cred = ctx->cred;
+       wdata->inode = inode;
+       wdata->args.fh = NFS_FH(inode);
+-      wdata->args.lockowner = current->files;
++      wdata->args.context = ctx;
+       wdata->args.pages = &page;
+       wdata->args.stable = NFS_FILE_SYNC;
+       wdata->args.pgbase = offset;
+@@ -208,7 +212,7 @@ static int nfs_writepage_sync(struct fil
+                       wdata->args.count = count;
+               wdata->args.offset = page_offset(page) + wdata->args.pgbase;
+-              result = NFS_PROTO(inode)->write(wdata, file);
++              result = NFS_PROTO(inode)->write(wdata);
+               if (result < 0) {
+                       /* Must mark the page invalid after I/O error */
+@@ -241,13 +245,14 @@ io_error:
+       return written ? written : result;
+ }
+-static int nfs_writepage_async(struct file *file, struct inode *inode,
+-              struct page *page, unsigned int offset, unsigned int count)
++static int nfs_writepage_async(struct nfs_open_context *ctx,
++              struct inode *inode, struct page *page,
++              unsigned int offset, unsigned int count)
+ {
+       struct nfs_page *req;
+       int             status;
+-      req = nfs_update_request(file, inode, page, offset, count);
++      req = nfs_update_request(ctx, inode, page, offset, count);
+       status = (IS_ERR(req)) ? PTR_ERR(req) : 0;
+       if (status < 0)
+               goto out;
+@@ -274,6 +279,7 @@ static int wb_priority(struct writeback_
+  */
+ int nfs_writepage(struct page *page, struct writeback_control *wbc)
+ {
++      struct nfs_open_context *ctx;
+       struct inode *inode = page->mapping->host;
+       unsigned long end_index;
+       unsigned offset = PAGE_CACHE_SIZE;
+@@ -308,16 +314,21 @@ int nfs_writepage(struct page *page, str
+       if (page->index >= end_index+1 || !offset)
+               goto out;
+ do_it:
++      ctx = nfs_find_open_context(inode, FMODE_WRITE);
++      if (ctx == NULL) {
++              err = -EBADF;
++              goto out;
++      }
+       lock_kernel();
+       if (!IS_SYNC(inode) && inode_referenced) {
+-              err = nfs_writepage_async(NULL, inode, page, 0, offset);
++              err = nfs_writepage_async(ctx, inode, page, 0, offset);
+               if (err >= 0) {
+                       err = 0;
+                       if (wbc->for_reclaim)
+                               nfs_flush_inode(inode, 0, 0, FLUSH_STABLE);
+               }
+       } else {
+-              err = nfs_writepage_sync(NULL, inode, page, 0,
++              err = nfs_writepage_sync(ctx, inode, page, 0,
+                                               offset, priority);
+               if (err >= 0) {
+                       if (err != offset)
+@@ -326,6 +337,7 @@ do_it:
+               }
+       }
+       unlock_kernel();
++      put_nfs_open_context(ctx);
+ out:
+       unlock_page(page);
+       if (inode_referenced)
+@@ -374,8 +386,7 @@ out:
+ /*
+  * Insert a write request into an inode
+  */
+-static inline int
+-nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
++static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
+ {
+       struct nfs_inode *nfsi = NFS_I(inode);
+       int error;
+@@ -387,6 +398,8 @@ nfs_inode_add_request(struct inode *inod
+       if (!nfsi->npages) {
+               igrab(inode);
+               nfs_begin_data_update(inode);
++              if (nfs_have_delegation(inode, FMODE_WRITE))
++                      nfsi->change_attr++;
+       }
+       nfsi->npages++;
+       req->wb_count++;
+@@ -404,7 +417,7 @@ nfs_inode_remove_request(struct nfs_page
+       BUG_ON (!NFS_WBACK_BUSY(req));
+       spin_lock(&nfs_wreq_lock);
+-      inode = req->wb_inode;
++      inode = req->wb_context->dentry->d_inode;
+       nfsi = NFS_I(inode);
+       radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);
+       nfsi->npages--;
+@@ -450,7 +463,7 @@ nfs_find_request(struct inode *inode, un
+ static void
+ nfs_mark_request_dirty(struct nfs_page *req)
+ {
+-      struct inode *inode = req->wb_inode;
++      struct inode *inode = req->wb_context->dentry->d_inode;
+       struct nfs_inode *nfsi = NFS_I(inode);
+       spin_lock(&nfs_wreq_lock);
+@@ -467,7 +480,7 @@ nfs_mark_request_dirty(struct nfs_page *
+ static inline int
+ nfs_dirty_request(struct nfs_page *req)
+ {
+-      struct nfs_inode *nfsi = NFS_I(req->wb_inode);
++      struct nfs_inode *nfsi = NFS_I(req->wb_context->dentry->d_inode);
+       return !list_empty(&req->wb_list) && req->wb_list_head == &nfsi->dirty;
+ }
+@@ -478,7 +491,7 @@ nfs_dirty_request(struct nfs_page *req)
+ static void
+ nfs_mark_request_commit(struct nfs_page *req)
+ {
+-      struct inode *inode = req->wb_inode;
++      struct inode *inode = req->wb_context->dentry->d_inode;
+       struct nfs_inode *nfsi = NFS_I(inode);
+       spin_lock(&nfs_wreq_lock);
+@@ -619,9 +632,9 @@ static int nfs_wait_on_write_congestion(
+  *
+  * Note: Should always be called with the Page Lock held!
+  */
+-static struct nfs_page *
+-nfs_update_request(struct file* file, struct inode *inode, struct page *page,
+-                 unsigned int offset, unsigned int bytes)
++static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
++              struct inode *inode, struct page *page,
++              unsigned int offset, unsigned int bytes)
+ {
+       struct nfs_server *server = NFS_SERVER(inode);
+       struct nfs_page         *req, *new = NULL;
+@@ -668,13 +681,9 @@ nfs_update_request(struct file* file, st
+               }
+               spin_unlock(&nfs_wreq_lock);
+-              new = nfs_create_request(file, inode, page, offset, bytes);
++              new = nfs_create_request(ctx, inode, page, offset, bytes);
+               if (IS_ERR(new))
+                       return new;
+-              if (file) {
+-                      new->wb_file = file;
+-                      get_file(file);
+-              }
+       }
+       /* We have a request for our page.
+@@ -684,7 +693,7 @@ nfs_update_request(struct file* file, st
+        * request.
+        */
+       rqend = req->wb_offset + req->wb_bytes;
+-      if (req->wb_file != file
++      if (req->wb_context != ctx
+           || req->wb_page != page
+           || !nfs_dirty_request(req)
+           || offset > rqend || end < req->wb_offset) {
+@@ -705,9 +714,9 @@ nfs_update_request(struct file* file, st
+       return req;
+ }
+-int
+-nfs_flush_incompatible(struct file *file, struct page *page)
++int nfs_flush_incompatible(struct file *file, struct page *page)
+ {
++      struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data;
+       struct inode    *inode = page->mapping->host;
+       struct nfs_page *req;
+       int             status = 0;
+@@ -721,7 +730,7 @@ nfs_flush_incompatible(struct file *file
+        */
+       req = nfs_find_request(inode, page->index);
+       if (req) {
+-              if (!NFS_PROTO(inode)->request_compatible(req, file, page))
++              if (req->wb_page != page || ctx != req->wb_context)
+                       status = nfs_wb_page(inode, page);
+               nfs_release_request(req);
+       }
+@@ -737,6 +746,7 @@ nfs_flush_incompatible(struct file *file
+ int nfs_updatepage(struct file *file, struct page *page,
+               unsigned int offset, unsigned int count)
+ {
++      struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data;
+       struct dentry   *dentry = file->f_dentry;
+       struct inode    *inode = page->mapping->host;
+       struct nfs_page *req;
+@@ -747,7 +757,7 @@ int nfs_updatepage(struct file *file, st
+               count, (long long)(page_offset(page) +offset));
+       if (IS_SYNC(inode)) {
+-              status = nfs_writepage_sync(file, inode, page, offset, count, 0);
++              status = nfs_writepage_sync(ctx, inode, page, offset, count, 0);
+               if (status > 0) {
+                       if (offset == 0 && status == PAGE_CACHE_SIZE)
+                               SetPageUptodate(page);
+@@ -784,7 +794,7 @@ int nfs_updatepage(struct file *file, st
+        * it out now.
+        */
+       do {
+-              req = nfs_update_request(file, inode, page, offset, count);
++              req = nfs_update_request(ctx, inode, page, offset, count);
+               status = (IS_ERR(req)) ? PTR_ERR(req) : 0;
+               if (status != -EBUSY)
+                       break;
+@@ -860,16 +870,15 @@ static void nfs_write_rpcsetup(struct nf
+        * NB: take care not to mess about with data->commit et al. */
+       data->req = req;
+-      data->inode = inode = req->wb_inode;
+-      data->cred = req->wb_cred;
++      data->inode = inode = req->wb_context->dentry->d_inode;
++      data->cred = req->wb_context->cred;
+       data->args.fh     = NFS_FH(inode);
+       data->args.offset = req_offset(req) + offset;
+       data->args.pgbase = req->wb_pgbase + offset;
+       data->args.pages  = data->pagevec;
+       data->args.count  = count;
+-      data->args.lockowner = req->wb_lockowner;
+-      data->args.state  = req->wb_state;
++      data->args.context = req->wb_context;
+       data->res.fattr   = &data->fattr;
+       data->res.count   = count;
+@@ -1029,7 +1038,7 @@ nfs_flush_list(struct list_head *head, i
+       while (!list_empty(head)) {
+               pages += nfs_coalesce_requests(head, &one_request, wpages);
+               req = nfs_list_entry(one_request.next);
+-              error = nfs_flush_one(&one_request, req->wb_inode, how);
++              error = nfs_flush_one(&one_request, req->wb_context->dentry->d_inode, how);
+               if (error < 0)
+                       break;
+       }
+@@ -1054,16 +1063,15 @@ static void nfs_writeback_done_partial(s
+       struct page             *page = req->wb_page;
+       dprintk("NFS: write (%s/%Ld %d@%Ld)",
+-              req->wb_inode->i_sb->s_id,
+-              (long long)NFS_FILEID(req->wb_inode),
++              req->wb_context->dentry->d_inode->i_sb->s_id,
++              (long long)NFS_FILEID(req->wb_context->dentry->d_inode),
+               req->wb_bytes,
+               (long long)req_offset(req));
+       if (status < 0) {
+               ClearPageUptodate(page);
+               SetPageError(page);
+-              if (req->wb_file)
+-                      req->wb_file->f_error = status;
++              req->wb_context->error = status;
+               dprintk(", error = %d\n", status);
+       } else {
+ #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
+@@ -1104,16 +1112,15 @@ static void nfs_writeback_done_full(stru
+               page = req->wb_page;
+               dprintk("NFS: write (%s/%Ld %d@%Ld)",
+-                      req->wb_inode->i_sb->s_id,
+-                      (long long)NFS_FILEID(req->wb_inode),
++                      req->wb_context->dentry->d_inode->i_sb->s_id,
++                      (long long)NFS_FILEID(req->wb_context->dentry->d_inode),
+                       req->wb_bytes,
+                       (long long)req_offset(req));
+               if (status < 0) {
+                       ClearPageUptodate(page);
+                       SetPageError(page);
+-                      if (req->wb_file)
+-                              req->wb_file->f_error = status;
++                      req->wb_context->error = status;
+                       end_page_writeback(page);
+                       nfs_inode_remove_request(req);
+                       dprintk(", error = %d\n", status);
+@@ -1232,7 +1239,7 @@ static void nfs_commit_rpcsetup(struct l
+       list_splice_init(head, &data->pages);
+       first = nfs_list_entry(data->pages.next);
+       last = nfs_list_entry(data->pages.prev);
+-      inode = first->wb_inode;
++      inode = first->wb_context->dentry->d_inode;
+       /*
+        * Determine the offset range of requests in the COMMIT call.
+@@ -1246,7 +1253,7 @@ static void nfs_commit_rpcsetup(struct l
+               len = 0;
+       data->inode       = inode;
+-      data->cred        = first->wb_cred;
++      data->cred        = first->wb_context->cred;
+       data->args.fh     = NFS_FH(data->inode);
+       data->args.offset = start;
+@@ -1313,13 +1320,12 @@ nfs_commit_done(struct rpc_task *task)
+               nfs_list_remove_request(req);
+               dprintk("NFS: commit (%s/%Ld %d@%Ld)",
+-                      req->wb_inode->i_sb->s_id,
+-                      (long long)NFS_FILEID(req->wb_inode),
++                      req->wb_context->dentry->d_inode->i_sb->s_id,
++                      (long long)NFS_FILEID(req->wb_context->dentry->d_inode),
+                       req->wb_bytes,
+                       (long long)req_offset(req));
+               if (task->tk_status < 0) {
+-                      if (req->wb_file)
+-                              req->wb_file->f_error = task->tk_status;
++                      req->wb_context->error = task->tk_status;
+                       nfs_inode_remove_request(req);
+                       dprintk(", error = %d\n", task->tk_status);
+                       goto next;
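
The fs/nfs/write.c changes above stop carrying a struct file (and its f_error, credential and lock owner) in each nfs_page and instead pin a shared nfs_open_context, taken with get_nfs_open_context() and dropped with put_nfs_open_context(). A minimal sketch of that refcounting pattern, with invented names and fields, is:

        /* Sketch only: the struct and helpers below are invented for
         * illustration; they are not the kernel's nfs_open_context. */
        #include <stdio.h>
        #include <stdlib.h>

        struct open_ctx {
                int refcount;
                int error;              /* plays the role of ctx->error above */
        };

        static struct open_ctx *ctx_get(struct open_ctx *ctx)
        {
                ctx->refcount++;        /* one holder per outstanding request */
                return ctx;
        }

        static void ctx_put(struct open_ctx *ctx)
        {
                if (--ctx->refcount == 0)
                        free(ctx);
        }

        int main(void)
        {
                struct open_ctx *ctx = calloc(1, sizeof(*ctx));

                if (!ctx)
                        return 1;
                ctx->refcount = 1;      /* reference held by the open file */
                ctx_get(ctx);           /* a write request takes a reference */
                ctx->error = -5;        /* e.g. -EIO reported by async writeback */
                printf("pending error %d\n", ctx->error);
                ctx_put(ctx);           /* request completes */
                ctx_put(ctx);           /* file closed; context freed */
                return 0;
        }

The shared object gives asynchronous writeback a place to record errors and a credential to use without holding a reference on the struct file itself, which is why the old get_file()/fput() pair disappears from nfs_update_request() and nfs_clear_request() above.
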
+--- linux-2.6.7/fs/nfs/nfs4xdr.c.lsec  2004-06-15 23:20:26.000000000 -0600
++++ linux-2.6.7/fs/nfs/nfs4xdr.c       2005-03-23 14:28:23.056544568 -0700
+@@ -84,9 +84,13 @@ static int nfs_stat_to_errno(int);
+                               ((3+NFS4_FHSIZE) >> 2))
+ #define encode_getattr_maxsz    (op_encode_hdr_maxsz + 3)
+ #define nfs4_name_maxsz               (1 + ((3 + NFS4_MAXNAMLEN) >> 2))
++#define nfs4_path_maxsz               (1 + ((3 + NFS4_MAXPATHLEN) >> 2))
+ #define nfs4_fattr_bitmap_maxsz (36 + 2 * nfs4_name_maxsz)
+ #define decode_getattr_maxsz    (op_decode_hdr_maxsz + 3 + \
+                                 nfs4_fattr_bitmap_maxsz)
++#define encode_setattr_maxsz  (op_decode_hdr_maxsz + 4 + \
++                              nfs4_fattr_bitmap_maxsz)
++#define decode_setattr_maxsz  (op_decode_hdr_maxsz + 3)
+ #define encode_savefh_maxsz     (op_encode_hdr_maxsz)
+ #define decode_savefh_maxsz     (op_decode_hdr_maxsz)
+ #define encode_fsinfo_maxsz   (op_encode_hdr_maxsz + 2)
+@@ -118,10 +122,17 @@ static int nfs_stat_to_errno(int);
+ #define encode_link_maxsz     (op_encode_hdr_maxsz + \
+                               nfs4_name_maxsz)
+ #define decode_link_maxsz     (op_decode_hdr_maxsz + 5)
++#define encode_symlink_maxsz  (op_encode_hdr_maxsz + \
++                              1 + nfs4_name_maxsz + \
++                              nfs4_path_maxsz + \
++                              nfs4_fattr_bitmap_maxsz)
++#define decode_symlink_maxsz  (op_decode_hdr_maxsz + 8)
+ #define encode_create_maxsz   (op_encode_hdr_maxsz + \
+-                              2 + 2 * nfs4_name_maxsz + \
++                              2 + nfs4_name_maxsz + \
+                               nfs4_fattr_bitmap_maxsz)
+ #define decode_create_maxsz   (op_decode_hdr_maxsz + 8)
++#define encode_delegreturn_maxsz (op_encode_hdr_maxsz + 4)
++#define decode_delegreturn_maxsz (op_decode_hdr_maxsz)
+ #define NFS4_enc_compound_sz  (1024)  /* XXX: large enough? */
+ #define NFS4_dec_compound_sz  (1024)  /* XXX: large enough? */
+ #define NFS4_enc_read_sz      (compound_encode_hdr_maxsz + \
+@@ -172,16 +183,14 @@ static int nfs_stat_to_errno(int);
+ #define NFS4_dec_open_confirm_sz        (compound_decode_hdr_maxsz + \
+                                         decode_putfh_maxsz + \
+                                         op_decode_hdr_maxsz + 4)
+-#define NFS4_enc_open_reclaim_sz      (compound_encode_hdr_maxsz + \
++#define NFS4_enc_open_noattr_sz       (compound_encode_hdr_maxsz + \
+                                       encode_putfh_maxsz + \
+                                       op_encode_hdr_maxsz + \
+-                                      11 + \
+-                                      encode_getattr_maxsz)
+-#define NFS4_dec_open_reclaim_sz      (compound_decode_hdr_maxsz + \
++                                      11)
++#define NFS4_dec_open_noattr_sz       (compound_decode_hdr_maxsz + \
+                                       decode_putfh_maxsz + \
+                                       op_decode_hdr_maxsz + \
+-                                      4 + 5 + 2 + 3 + \
+-                                      decode_getattr_maxsz)
++                                      4 + 5 + 2 + 3)
+ #define NFS4_enc_open_downgrade_sz \
+                               (compound_encode_hdr_maxsz + \
+                                 encode_putfh_maxsz + \
+@@ -313,6 +322,16 @@ static int nfs_stat_to_errno(int);
+                               decode_savefh_maxsz + \
+                               decode_putfh_maxsz + \
+                               decode_link_maxsz)
++#define NFS4_enc_symlink_sz   (compound_encode_hdr_maxsz + \
++                              encode_putfh_maxsz + \
++                              encode_symlink_maxsz + \
++                              encode_getattr_maxsz + \
++                              encode_getfh_maxsz)
++#define NFS4_dec_symlink_sz   (compound_decode_hdr_maxsz + \
++                              decode_putfh_maxsz + \
++                              decode_symlink_maxsz + \
++                              decode_getattr_maxsz + \
++                              decode_getfh_maxsz)
+ #define NFS4_enc_create_sz    (compound_encode_hdr_maxsz + \
+                               encode_putfh_maxsz + \
+                               encode_create_maxsz + \
+@@ -339,6 +358,33 @@ static int nfs_stat_to_errno(int);
+                               encode_getattr_maxsz)
+ #define NFS4_dec_server_caps_sz (compound_decode_hdr_maxsz + \
+                               decode_getattr_maxsz)
++#define NFS4_enc_delegreturn_sz       (compound_encode_hdr_maxsz + \
++                              encode_putfh_maxsz + \
++                              encode_delegreturn_maxsz)
++#define NFS4_dec_delegreturn_sz (compound_decode_hdr_maxsz + \
++                              decode_delegreturn_maxsz)
++#define username_maxsz                (1 + ((IDMAP_NAMESZ + 3) >> 2))
++/* XXX: fix ACL bounds */
++#define ace_maxsz             (3 + username_maxsz)
++#define NFS_ACL_MAX_ENTRIES   32
++#define acl_maxentries                ((NFS_ACL_MAX_ENTRIES - 3) * 3 + 6)
++#define acl_maxsz             (1 + acl_maxentries * ace_maxsz)
++#define NFS4_enc_getacl_sz    compound_encode_hdr_maxsz + \
++                              encode_putfh_maxsz + \
++                              encode_getattr_maxsz
++#define username_maxsz                (1 + ((IDMAP_NAMESZ + 3) >> 2))
++#define ace_maxsz             (3 + username_maxsz)
++#define acl_maxentries                ((NFS_ACL_MAX_ENTRIES - 3) * 3 + 6)
++#define acl_maxsz             (1 + acl_maxentries * ace_maxsz)
++#define NFS4_dec_getacl_sz    (compound_decode_hdr_maxsz + \
++                              decode_putfh_maxsz + \
++                              op_decode_hdr_maxsz + 3 + 1 + acl_maxsz)
++#define NFS4_enc_setacl_sz    (compound_encode_hdr_maxsz + \
++                              encode_putfh_maxsz + \
++                              op_encode_hdr_maxsz + 4 + 1 + acl_maxsz)
++#define NFS4_dec_setacl_sz    (compound_decode_hdr_maxsz + \
++                              decode_putfh_maxsz + \
++                              decode_setattr_maxsz)
+ static struct {
+       unsigned int    mode;
+@@ -388,6 +434,15 @@ struct compound_hdr {
+       BUG_ON(!p);                                             \
+ } while (0)
++static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
++{
++      uint32_t *p;
++
++      p = xdr_reserve_space(xdr, 4 + len);
++      BUG_ON(p == NULL);
++      xdr_encode_opaque(p, str, len);
++}
++
+ static int encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
+ {
+       uint32_t *p;
+@@ -402,6 +457,15 @@ static int encode_compound_hdr(struct xd
+       return 0;
+ }
++static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf)
++{
++      uint32_t *p;
++
++      p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE);
++      BUG_ON(p == NULL);
++      xdr_encode_opaque_fixed(p, verf->data, NFS4_VERIFIER_SIZE);
++}
++
+ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server)
+ {
+       char owner_name[IDMAP_NAMESZ];
+@@ -420,7 +484,7 @@ static int encode_attrs(struct xdr_strea
+        * In the worst-case, this would be
+        *   12(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime)
+        *          = 36 bytes, plus any contribution from variable-length fields
+-       *            such as owner/group/acl's.
++       *            such as owner/group.
+        */
+       len = 16;
+@@ -742,19 +806,12 @@ static int encode_lookup(struct xdr_stre
+       return 0;
+ }
+-static int encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg)
++static void encode_share_access(struct xdr_stream *xdr, int open_flags)
+ {
+-      int status;
+       uint32_t *p;
+- /*
+- * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4,
+- * owner 4, opentype 4 = 36
+- */
+-      RESERVE_SPACE(36);
+-      WRITE32(OP_OPEN);
+-      WRITE32(arg->seqid);
+-      switch (arg->share_access) {
++      RESERVE_SPACE(8);
++      switch (open_flags & (FMODE_READ|FMODE_WRITE)) {
+               case FMODE_READ:
+                       WRITE32(NFS4_SHARE_ACCESS_READ);
+                       break;
+@@ -767,84 +824,135 @@ static int encode_open(struct xdr_stream
+               default:
+                       BUG();
+       }
+-      WRITE32(0);                  /* for linux, share_deny = 0 always */
++      WRITE32(0);             /* for linux, share_deny = 0 always */
++}
++
++static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg)
++{
++      uint32_t *p;
++ /*
++ * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4,
++ * owner 4 = 32
++ */
++      RESERVE_SPACE(8);
++      WRITE32(OP_OPEN);
++      WRITE32(arg->seqid);
++      encode_share_access(xdr, arg->open_flags);
++      RESERVE_SPACE(16);
+       WRITE64(arg->clientid);
+       WRITE32(4);
+       WRITE32(arg->id);
+-      WRITE32(arg->opentype);
++}
+-      if (arg->opentype == NFS4_OPEN_CREATE) {
+-              if (arg->createmode == NFS4_CREATE_EXCLUSIVE) {
+-                      RESERVE_SPACE(12);
+-                      WRITE32(arg->createmode);
+-                      WRITEMEM(arg->u.verifier.data, sizeof(arg->u.verifier.data));
+-              }
+-              else if (arg->u.attrs) {
+-                      RESERVE_SPACE(4);
+-                      WRITE32(arg->createmode);
+-                      if ((status = encode_attrs(xdr, arg->u.attrs, arg->server)))
+-                              return status;
+-              }
+-              else {
+-                      RESERVE_SPACE(12);
+-                      WRITE32(arg->createmode);
+-                      WRITE32(0);
+-                      WRITE32(0);
+-              }
++static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
++{
++      uint32_t *p;
++
++      RESERVE_SPACE(4);
++      switch(arg->open_flags & O_EXCL) {
++              case 0:
++                      WRITE32(NFS4_CREATE_UNCHECKED);
++                      encode_attrs(xdr, arg->u.attrs, arg->server);
++                      break;
++              default:
++                      WRITE32(NFS4_CREATE_EXCLUSIVE);
++                      encode_nfs4_verifier(xdr, &arg->u.verifier);
+       }
++}
+-      RESERVE_SPACE(8 + arg->name->len);
+-      WRITE32(NFS4_OPEN_CLAIM_NULL);
+-      WRITE32(arg->name->len);
+-      WRITEMEM(arg->name->name, arg->name->len);
++static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *arg)
++{
++      uint32_t *p;
+-      return 0;
++      RESERVE_SPACE(4);
++      switch (arg->open_flags & O_CREAT) {
++              case 0:
++                      WRITE32(NFS4_OPEN_NOCREATE);
++                      break;
++              default:
++                      BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL);
++                      WRITE32(NFS4_OPEN_CREATE);
++                      encode_createmode(xdr, arg);
++      }
+ }
+-static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg)
++static inline void encode_delegation_type(struct xdr_stream *xdr, int delegation_type)
+ {
+       uint32_t *p;
+-      RESERVE_SPACE(8+sizeof(arg->stateid.data));
+-      WRITE32(OP_OPEN_CONFIRM);
+-      WRITEMEM(arg->stateid.data, sizeof(arg->stateid.data));
+-      WRITE32(arg->seqid);
++      RESERVE_SPACE(4);
++      switch (delegation_type) {
++              case 0:
++                      WRITE32(NFS4_OPEN_DELEGATE_NONE);
++                      break;
++              case FMODE_READ:
++                      WRITE32(NFS4_OPEN_DELEGATE_READ);
++                      break;
++              case FMODE_WRITE|FMODE_READ:
++                      WRITE32(NFS4_OPEN_DELEGATE_WRITE);
++                      break;
++              default:
++                      BUG();
++      }
++}
+-      return 0;
++static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *name)
++{
++      uint32_t *p;
++
++      RESERVE_SPACE(4);
++      WRITE32(NFS4_OPEN_CLAIM_NULL);
++      encode_string(xdr, name->len, name->name);
+ }
++static inline void encode_claim_previous(struct xdr_stream *xdr, int type)
++{
++      uint32_t *p;
++
++      RESERVE_SPACE(4);
++      WRITE32(NFS4_OPEN_CLAIM_PREVIOUS);
++      encode_delegation_type(xdr, type);
++}
+-static int encode_open_reclaim(struct xdr_stream *xdr, const struct nfs_open_reclaimargs *arg)
++static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struct qstr *name, const nfs4_stateid *stateid)
+ {
+       uint32_t *p;
+- /*
+- * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4,
+- * owner 4, opentype 4, claim 4, delegation_type 4 = 44
+- */
+-      RESERVE_SPACE(44);
+-      WRITE32(OP_OPEN);
+-      WRITE32(arg->seqid);
+-      switch (arg->share_access) {
+-              case FMODE_READ:
+-                      WRITE32(NFS4_SHARE_ACCESS_READ);
++      RESERVE_SPACE(4+sizeof(stateid->data));
++      WRITE32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
++      WRITEMEM(stateid->data, sizeof(stateid->data));
++      encode_string(xdr, name->len, name->name);
++}
++
++static int encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg)
++{
++      encode_openhdr(xdr, arg);
++      encode_opentype(xdr, arg);
++      switch (arg->claim) {
++              case NFS4_OPEN_CLAIM_NULL:
++                      encode_claim_null(xdr, arg->name);
+                       break;
+-              case FMODE_WRITE:
+-                      WRITE32(NFS4_SHARE_ACCESS_WRITE);
++              case NFS4_OPEN_CLAIM_PREVIOUS:
++                      encode_claim_previous(xdr, arg->u.delegation_type);
+                       break;
+-              case FMODE_READ|FMODE_WRITE:
+-                      WRITE32(NFS4_SHARE_ACCESS_BOTH);
++              case NFS4_OPEN_CLAIM_DELEGATE_CUR:
++                      encode_claim_delegate_cur(xdr, arg->name, &arg->u.delegation);
+                       break;
+               default:
+                       BUG();
+       }
+-      WRITE32(0);                  /* for linux, share_deny = 0 always */
+-      WRITE64(arg->clientid);
+-      WRITE32(4);
+-      WRITE32(arg->id);
+-      WRITE32(NFS4_OPEN_NOCREATE);
+-      WRITE32(NFS4_OPEN_CLAIM_PREVIOUS);
+-      WRITE32(NFS4_OPEN_DELEGATE_NONE);
++      return 0;
++}
++
++static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg)
++{
++      uint32_t *p;
++
++      RESERVE_SPACE(8+sizeof(arg->stateid.data));
++      WRITE32(OP_OPEN_CONFIRM);
++      WRITEMEM(arg->stateid.data, sizeof(arg->stateid.data));
++      WRITE32(arg->seqid);
++
+       return 0;
+ }
+@@ -852,14 +960,11 @@ static int encode_open_downgrade(struct 
+ {
+       uint32_t *p;
+-      RESERVE_SPACE(16+sizeof(arg->stateid.data));
++      RESERVE_SPACE(8+sizeof(arg->stateid.data));
+       WRITE32(OP_OPEN_DOWNGRADE);
+       WRITEMEM(arg->stateid.data, sizeof(arg->stateid.data));
+       WRITE32(arg->seqid);
+-      WRITE32(arg->share_access);
+-      /* No deny modes */
+-      WRITE32(0);
+-
++      encode_share_access(xdr, arg->open_flags);
+       return 0;
+ }
+@@ -887,15 +992,15 @@ static int encode_putrootfh(struct xdr_s
+         return 0;
+ }
+-static void encode_stateid(struct xdr_stream *xdr, struct nfs4_state *state, fl_owner_t lockowner)
++static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx)
+ {
+       extern nfs4_stateid zero_stateid;
+       nfs4_stateid stateid;
+       uint32_t *p;
+       RESERVE_SPACE(16);
+-      if (state != NULL) {
+-              nfs4_copy_stateid(&stateid, state, lockowner);
++      if (ctx->state != NULL) {
++              nfs4_copy_stateid(&stateid, ctx->state, ctx->pid);
+               WRITEMEM(stateid.data, sizeof(stateid.data));
+       } else
+               WRITEMEM(zero_stateid.data, sizeof(zero_stateid.data));
+@@ -908,7 +1013,7 @@ static int encode_read(struct xdr_stream
+       RESERVE_SPACE(4);
+       WRITE32(OP_READ);
+-      encode_stateid(xdr, args->state, args->lockowner);
++      encode_stateid(xdr, args->context);
+       RESERVE_SPACE(12);
+       WRITE64(args->offset);
+@@ -1003,6 +1108,45 @@ static int encode_renew(struct xdr_strea
+       return 0;
+ }
++extern nfs4_stateid zero_stateid;
++
++static int
++encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg)
++{
++      uint32_t *p;
++      uint32_t *q = (uint32_t *)arg->acl;
++      uint32_t *end = (uint32_t *)(arg->acl + arg->acl_len);
++      uint32_t tmp;
++      int naces, i;
++
++      RESERVE_SPACE(4+sizeof(zero_stateid.data));
++      WRITE32(OP_SETATTR);
++      WRITEMEM(zero_stateid.data, sizeof(zero_stateid.data));
++      RESERVE_SPACE(4*4);
++      WRITE32(1);
++      WRITE32(FATTR4_WORD0_ACL);
++      WRITE32(arg->acl_len);
++      if (q + 1 > end)
++              return -EINVAL;
++      naces = ntohl(*q++);
++      WRITE32(naces);
++      for (i = 0; i < naces; i++) {
++              if (q + 4 > end)
++                      return -EINVAL;
++              RESERVE_SPACE(3*4);
++              memcpy(p, q, 3*4); /* type, flag, access_mask, length */
++              q += 3;
++              tmp = ntohl(*q++); /* length */
++              if (tmp > XDR_MAX_NETOBJ)
++                      return -EINVAL;
++              if (q + XDR_QUADLEN(tmp) > end)
++                      return -EINVAL;
++              RESERVE_SPACE((XDR_QUADLEN(tmp) << 2) + 4);
++              p = xdr_encode_opaque(p, q, tmp);
++      }
++      return 0;
++}
++
+ static int
+ encode_savefh(struct xdr_stream *xdr)
+ {
+@@ -1031,26 +1175,18 @@ static int encode_setattr(struct xdr_str
+ static int encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid)
+ {
+-      uint32_t total_len;
+-      uint32_t len1, len2, len3;
+       uint32_t *p;
+-      len1 = strlen(setclientid->sc_name);
+-      len2 = strlen(setclientid->sc_netid);
+-      len3 = strlen(setclientid->sc_uaddr);
+-      total_len = XDR_QUADLEN(len1) + XDR_QUADLEN(len2) + XDR_QUADLEN(len3);
+-      total_len = (total_len << 2) + 24 + sizeof(setclientid->sc_verifier.data);
+-
+-      RESERVE_SPACE(total_len);
++      RESERVE_SPACE(4 + sizeof(setclientid->sc_verifier->data));
+       WRITE32(OP_SETCLIENTID);
+-      WRITEMEM(setclientid->sc_verifier.data, sizeof(setclientid->sc_verifier.data));
+-      WRITE32(len1);
+-      WRITEMEM(setclientid->sc_name, len1);
++      WRITEMEM(setclientid->sc_verifier->data, sizeof(setclientid->sc_verifier->data));
++
++      encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name);
++      RESERVE_SPACE(4);
+       WRITE32(setclientid->sc_prog);
+-      WRITE32(len2);
+-      WRITEMEM(setclientid->sc_netid, len2);
+-      WRITE32(len3);
+-      WRITEMEM(setclientid->sc_uaddr, len3);
++      encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid);
++      encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
++      RESERVE_SPACE(4);
+       WRITE32(setclientid->sc_cb_ident);
+       return 0;
+@@ -1075,7 +1211,7 @@ static int encode_write(struct xdr_strea
+       RESERVE_SPACE(4);
+       WRITE32(OP_WRITE);
+-      encode_stateid(xdr, args->state, args->lockowner);
++      encode_stateid(xdr, args->context);
+       RESERVE_SPACE(16);
+       WRITE64(args->offset);
+@@ -1086,6 +1222,18 @@ static int encode_write(struct xdr_strea
+       return 0;
+ }
++
++static int encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid)
++{
++      uint32_t *p;
++
++      RESERVE_SPACE(20);
++
++      WRITE32(OP_DELEGRETURN);
++      WRITEMEM(stateid->data, sizeof(stateid->data));
++      return 0;
++
++}
+ /*
+  * END OF "GENERIC" ENCODE ROUTINES.
+  */
+@@ -1244,6 +1392,14 @@ out:
+ }
+ /*
++ * Encode SYMLINK request
++ */
++static int nfs4_xdr_enc_symlink(struct rpc_rqst *req, uint32_t *p, const struct nfs4_create_arg *args)
++{
++      return nfs4_xdr_enc_create(req, p, args);
++}
++
++/*
+  * Encode GETATTR request
+  */
+ static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, uint32_t *p, const struct nfs4_getattr_arg *args)
+@@ -1331,13 +1487,13 @@ out:
+ }
+ /*
+- * Encode an OPEN request
++ * Encode an OPEN request with no attributes.
+  */
+-static int nfs4_xdr_enc_open_reclaim(struct rpc_rqst *req, uint32_t *p, struct nfs_open_reclaimargs *args)
++static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, uint32_t *p, struct nfs_openargs *args)
+ {
+       struct xdr_stream xdr;
+       struct compound_hdr hdr = {
+-              .nops   = 3,
++              .nops   = 2,
+       };
+       int status;
+@@ -1346,10 +1502,7 @@ static int nfs4_xdr_enc_open_reclaim(str
+       status = encode_putfh(&xdr, args->fh);
+       if (status)
+               goto out;
+-      status = encode_open_reclaim(&xdr, args);
+-      if (status)
+-              goto out;
+-      status = encode_getfattr(&xdr, args->bitmask);
++      status = encode_open(&xdr, args);
+ out:
+       return status;
+ }
+@@ -1538,6 +1691,52 @@ out:
+ }
+ /*
++ * Encode an SETACL request
++ */
++static int
++nfs4_xdr_enc_setacl(struct rpc_rqst *req, uint32_t *p, struct nfs_setaclargs *args)
++
++{
++        struct xdr_stream xdr;
++        struct compound_hdr hdr = {
++                .nops   = 2,
++        };
++        int status;
++
++        xdr_init_encode(&xdr, &req->rq_snd_buf, p);
++        encode_compound_hdr(&xdr, &hdr);
++        status = encode_putfh(&xdr, args->fh);
++        if(status)
++                goto out;
++        status = encode_setacl(&xdr, args);
++out:
++        return status;
++}
++
++/*
++ * Encode a GETACL request
++ */
++static int
++nfs4_xdr_enc_getacl(struct rpc_rqst *req, uint32_t *p,struct nfs_fh *fhandle)
++{
++      struct xdr_stream xdr;
++      struct compound_hdr hdr = {
++              .nops   = 2,
++      };
++      int status;
++
++      xdr_init_encode(&xdr, &req->rq_snd_buf, p);
++      encode_compound_hdr(&xdr, &hdr);
++      status = encode_putfh(&xdr, fhandle);
++      if (status)
++              goto out;
++      status = encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0);
++out:
++      return status;
++
++}
++
++/*
+  * Encode a WRITE request
+  */
+ static int nfs4_xdr_enc_write(struct rpc_rqst *req, uint32_t *p, struct nfs_writeargs *args)
+@@ -1716,6 +1915,24 @@ static int nfs4_xdr_enc_setclientid_conf
+ }
+ /*
++ * DELEGRETURN request
++ */
++static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, uint32_t *p, const struct nfs4_delegreturnargs *args)
++{
++      struct xdr_stream xdr;
++      struct compound_hdr hdr = {
++              .nops = 2,
++      };
++      int status;
++
++      xdr_init_encode(&xdr, &req->rq_snd_buf, p);
++      encode_compound_hdr(&xdr, &hdr);
++      if ((status = encode_putfh(&xdr, args->fhandle)) == 0)
++              status = encode_delegreturn(&xdr, args->stateid);
++      return status;
++}
++
++/*
+  * START OF "GENERIC" DECODE ROUTINES.
+  *   These may look a little ugly since they are imported from a "generic"
+  * set of XDR encode/decode routines which are intended to be shared by
+@@ -1749,6 +1966,17 @@ static int nfs4_xdr_enc_setclientid_conf
+       } \
+ } while (0)
++static int decode_opaque_inline(struct xdr_stream *xdr, uint32_t *len, char **string)
++{
++      uint32_t *p;
++
++      READ_BUF(4);
++      READ32(*len);
++      READ_BUF(*len);
++      *string = (char *)p;
++      return 0;
++}
++
+ static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
+ {
+       uint32_t *p;
+@@ -1785,6 +2013,17 @@ static int decode_op_hdr(struct xdr_stre
+       return 0;
+ }
++/* Dummy routine */
++static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs4_client *clp)
++{
++      uint32_t *p;
++      uint32_t strlen;
++      char *str;
++
++      READ_BUF(12);
++      return decode_opaque_inline(xdr, &strlen, &str);
++}
++
+ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
+ {
+       uint32_t bmlen, *p;
+@@ -2717,10 +2956,56 @@ static int decode_lookup(struct xdr_stre
+       return decode_op_hdr(xdr, OP_LOOKUP);
+ }
++/* This is too sick! */
++static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
++{
++        uint32_t *p;
++      uint32_t limit_type, nblocks, blocksize;
++
++      READ_BUF(12);
++      READ32(limit_type);
++      switch (limit_type) {
++              case 1:
++                      READ64(*maxsize);
++                      break;
++              case 2:
++                      READ32(nblocks);
++                      READ32(blocksize);
++                      *maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
++      }
++      return 0;
++}
++
++static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
++{
++        uint32_t *p;
++        uint32_t delegation_type;
++
++      READ_BUF(4);
++      READ32(delegation_type);
++      if (delegation_type == NFS4_OPEN_DELEGATE_NONE) {
++              res->delegation_type = 0;
++              return 0;
++      }
++      READ_BUF(20);
++      COPYMEM(res->delegation.data, sizeof(res->delegation.data));
++      READ32(res->do_recall);
++      switch (delegation_type) {
++              case NFS4_OPEN_DELEGATE_READ:
++                      res->delegation_type = FMODE_READ;
++                      break;
++              case NFS4_OPEN_DELEGATE_WRITE:
++                      res->delegation_type = FMODE_WRITE|FMODE_READ;
++                      if (decode_space_limit(xdr, &res->maxsize) < 0)
++                              return -EIO;
++      }
++      return decode_ace(xdr, NULL, res->server->nfs4_state);
++}
++
+ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
+ {
+         uint32_t *p;
+-        uint32_t bmlen, delegation_type;
++        uint32_t bmlen;
+         int status;
+         status = decode_op_hdr(xdr, OP_OPEN);
+@@ -2737,11 +3022,9 @@ static int decode_open(struct xdr_stream
+         if (bmlen > 10)
+                 goto xdr_error;
+-        READ_BUF((bmlen << 2) + 4);
++        READ_BUF(bmlen << 2);
+         p += bmlen;
+-        READ32(delegation_type);
+-        if (delegation_type == NFS4_OPEN_DELEGATE_NONE)
+-              return 0;
++      return decode_delegation(xdr, res);
+ xdr_error:
+       printk(KERN_NOTICE "%s: xdr error!\n", __FUNCTION__);
+       return -EIO;
+@@ -2967,6 +3250,72 @@ static int decode_renew(struct xdr_strea
+       return decode_op_hdr(xdr, OP_RENEW);
+ }
++static int decode_attr_acl(struct xdr_stream *xdr, uint32_t *bitmap,
++                              struct nfs_getaclres *res)
++{
++      uint32_t *p;
++
++      if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U)))
++              return -EIO;
++      if (likely(bitmap[0] & FATTR4_WORD0_ACL)) {
++              ssize_t size = res->acl_len;
++              uint32_t nace, tmp;
++              u32 *start;
++              int i;
++
++              res->acl_len = 0;
++              READ_BUF(4);
++              start = p;
++              READ32(nace);
++              res->acl_len += 4;
++
++              for (i = 0; i < nace; i++) {
++                      READ_BUF(4*4);
++                      res->acl_len += 4*4;
++                      p += 3;
++                      READ32(tmp); /* namelen */
++                      READ_BUF(tmp);
++                      if (tmp > XDR_MAX_NETOBJ) {
++                              printk(KERN_WARNING "%s: name too long (%u)!\n",
++                                      __FUNCTION__, tmp);
++                              return -EIO;
++                      }
++                      res->acl_len += XDR_QUADLEN(tmp) << 2;
++              }
++              if (size && res->acl_len > size)
++                      return -ERANGE;
++              if (size == 0 && res->acl_len <= XATTR_SIZE_MAX)
++                      res->acl = kmalloc(res->acl_len, GFP_KERNEL);
++              if (res->acl)
++                      memcpy(res->acl, start, res->acl_len);
++      }
++      return 0;
++}
++
++static int decode_getacl(struct xdr_stream *xdr, struct nfs_getaclres *res)
++{
++      uint32_t *savep;
++      uint32_t attrlen,
++               bitmap[2] = {0};
++      int status;
++
++      if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
++              goto xdr_error;
++      if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
++              goto xdr_error;
++      if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
++              goto xdr_error;
++
++      if ((status = decode_attr_acl(xdr, bitmap, res)) != 0)
++              goto xdr_error;
++
++      status = verify_attr_len(xdr, savep, attrlen);
++xdr_error:
++      if (status != 0)
++              printk(KERN_NOTICE "%s: xdr error %d!\n", __FUNCTION__, -status);
++      return status;
++}
++
+ static int
+ decode_savefh(struct xdr_stream *xdr)
+ {
+@@ -3048,6 +3397,11 @@ static int decode_write(struct xdr_strea
+       return 0;
+ }
++static int decode_delegreturn(struct xdr_stream *xdr)
++{
++      return decode_op_hdr(xdr, OP_DELEGRETURN);
++}
++
+ /*
+  * Decode OPEN_DOWNGRADE response
+  */
+@@ -3222,6 +3576,14 @@ out:
+ }
+ /*
++ * Decode SYMLINK response
++ */
++static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_create_res *res)
++{
++      return nfs4_xdr_dec_create(rqstp, p, res);
++}
++
++/*
+  * Decode GETATTR response
+  */
+ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_getattr_res *res)
+@@ -3243,6 +3605,50 @@ out:
+ }
++/*
++ * Decode SETACL response
++ */
++static int
++nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, uint32_t *p, void *res)
++{
++      struct xdr_stream xdr;
++      struct compound_hdr hdr;
++      int status;
++
++      xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
++      status = decode_compound_hdr(&xdr, &hdr);
++      if (status)
++              goto out;
++      status = decode_putfh(&xdr);
++      if (status)
++              goto out;
++      status = decode_setattr(&xdr, res);
++out:
++      return status;
++}
++
++/*
++ * Decode GETACL response
++ */
++static int
++nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_getaclres *res)
++{
++      struct xdr_stream xdr;
++      struct compound_hdr hdr;
++      int status;
++
++      xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
++      status = decode_compound_hdr(&xdr, &hdr);
++      if (status)
++              goto out;
++      status = decode_putfh(&xdr);
++      if (status)
++              goto out;
++      status = decode_getacl(&xdr, res);
++
++out:
++      return status;
++}
+ /*
+  * Decode CLOSE response
+@@ -3314,9 +3720,9 @@ out:
+ }
+ /*
+- * Decode OPEN_RECLAIM response
++ * Decode OPEN response
+  */
+-static int nfs4_xdr_dec_open_reclaim(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_openres *res)
++static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_openres *res)
+ {
+         struct xdr_stream xdr;
+         struct compound_hdr hdr;
+@@ -3330,9 +3736,6 @@ static int nfs4_xdr_dec_open_reclaim(str
+         if (status)
+                 goto out;
+         status = decode_open(&xdr, res);
+-        if (status)
+-                goto out;
+-      status = decode_getfattr(&xdr, res->f_attr, res->server);
+ out:
+         return status;
+ }
+@@ -3665,6 +4068,25 @@ static int nfs4_xdr_dec_setclientid_conf
+       return status;
+ }
++/*
++ * DELEGRETURN request
++ */
++static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, uint32_t *p, void *dummy)
++{
++      struct xdr_stream xdr;
++      struct compound_hdr hdr;
++      int status;
++
++      xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
++      status = decode_compound_hdr(&xdr, &hdr);
++      if (status == 0) {
++              status = decode_putfh(&xdr);
++              if (status == 0)
++                      status = decode_delegreturn(&xdr);
++      }
++      return status;
++}
++
+ uint32_t *nfs4_decode_dirent(uint32_t *p, struct nfs_entry *entry, int plus)
+ {
+       uint32_t len;
+@@ -3756,7 +4178,7 @@ nfs_stat_to_errno(int stat)
+               if (nfs_errtbl[i].stat == stat)
+                       return nfs_errtbl[i].errno;
+       }
+-      if (stat < 0) {
++      if (stat <= 10000 || stat > 10100) {
+               /* The server is looney tunes. */
+               return ESERVERFAULT;
+       }
+@@ -3786,7 +4208,7 @@ struct rpc_procinfo      nfs4_procedures[] = 
+   PROC(COMMIT,                enc_commit,     dec_commit),
+   PROC(OPEN,          enc_open,       dec_open),
+   PROC(OPEN_CONFIRM,  enc_open_confirm,       dec_open_confirm),
+-  PROC(OPEN_RECLAIM,  enc_open_reclaim,       dec_open_reclaim),
++  PROC(OPEN_NOATTR,   enc_open_noattr,        dec_open_noattr),
+   PROC(OPEN_DOWNGRADE,        enc_open_downgrade,     dec_open_downgrade),
+   PROC(CLOSE,         enc_close,      dec_close),
+   PROC(SETATTR,               enc_setattr,    dec_setattr),
+@@ -3804,12 +4226,16 @@ struct rpc_procinfo    nfs4_procedures[] = 
+   PROC(REMOVE,                enc_remove,     dec_remove),
+   PROC(RENAME,                enc_rename,     dec_rename),
+   PROC(LINK,          enc_link,       dec_link),
++  PROC(SYMLINK,               enc_symlink,    dec_symlink),
+   PROC(CREATE,                enc_create,     dec_create),
+   PROC(PATHCONF,      enc_pathconf,   dec_pathconf),
+   PROC(STATFS,                enc_statfs,     dec_statfs),
+   PROC(READLINK,      enc_readlink,   dec_readlink),
+   PROC(READDIR,               enc_readdir,    dec_readdir),
+   PROC(SERVER_CAPS,   enc_server_caps, dec_server_caps),
++  PROC(DELEGRETURN,   enc_delegreturn, dec_delegreturn),
++  PROC(GETACL,                enc_getacl,     dec_getacl),
++  PROC(SETACL,                enc_setacl,     dec_setacl),
+ };
+ struct rpc_version            nfs_version4 = {
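
The encode_string() helper added to fs/nfs/nfs4xdr.c above, like the xdr_encode_opaque() calls in encode_setacl(), relies on the standard XDR rule for variable-length opaques: a 4-byte big-endian length word followed by the data, zero-padded to a 4-byte boundary (RFC 1832). A self-contained model of that rule, with names invented for the example rather than taken from the kernel's xdr layer, is:

        /* Sketch only: standalone model of XDR opaque encoding. */
        #include <arpa/inet.h>
        #include <stdint.h>
        #include <stdio.h>
        #include <string.h>

        static size_t xdr_opaque_sketch(uint8_t *buf, const void *data, uint32_t len)
        {
                uint32_t be_len = htonl(len);
                size_t padded = (len + 3) & ~3u;        /* round up to 4-byte words */

                memcpy(buf, &be_len, 4);                /* length word */
                memcpy(buf + 4, data, len);             /* payload */
                memset(buf + 4 + len, 0, padded - len); /* zero padding */
                return 4 + padded;
        }

        int main(void)
        {
                uint8_t buf[64];
                size_t n = xdr_opaque_sketch(buf, "lustre", 6);

                printf("encoded %zu bytes\n", n);       /* 12: 4 + 6 data + 2 pad */
                return 0;
        }

The (len + 3) & ~3 rounding is the same quantity the kernel expresses in 4-byte words with XDR_QUADLEN(), which is why the size macros above count quadlens rather than bytes.
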
+--- linux-2.6.7/fs/nfs/pagelist.c.lsec 2004-06-15 23:20:03.000000000 -0600
++++ linux-2.6.7/fs/nfs/pagelist.c      2005-03-23 14:28:23.057544416 -0700
+@@ -36,7 +36,6 @@ nfs_page_alloc(void)
+       if (p) {
+               memset(p, 0, sizeof(*p));
+               INIT_LIST_HEAD(&p->wb_list);
+-              init_waitqueue_head(&p->wb_wait);
+       }
+       return p;
+ }
+@@ -62,7 +61,7 @@ nfs_page_free(struct nfs_page *p)
+  * User should ensure it is safe to sleep in this function.
+  */
+ struct nfs_page *
+-nfs_create_request(struct file *file, struct inode *inode,
++nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
+                  struct page *page,
+                  unsigned int offset, unsigned int count)
+ {
+@@ -94,33 +93,38 @@ nfs_create_request(struct file *file, st
+       req->wb_offset  = offset;
+       req->wb_pgbase  = offset;
+       req->wb_bytes   = count;
+-      req->wb_inode   = inode;
+       req->wb_count   = 1;
+-      server->rpc_ops->request_init(req, file);
++      req->wb_context = get_nfs_open_context(ctx);
+       return req;
+ }
+ /**
++ * nfs_unlock_request - Unlock request and wake up sleepers.
++ * @req:
++ */
++void nfs_unlock_request(struct nfs_page *req)
++{
++      if (!NFS_WBACK_BUSY(req)) {
++              printk(KERN_ERR "NFS: Invalid unlock attempted\n");
++              BUG();
++      }
++      smp_mb__before_clear_bit();
++      clear_bit(PG_BUSY, &req->wb_flags);
++      smp_mb__after_clear_bit();
++      wake_up_all(&req->wb_context->waitq);
++      nfs_release_request(req);
++}
++
++/**
+  * nfs_clear_request - Free up all resources allocated to the request
+  * @req:
+  *
+- * Release all resources associated with a write request after it
++ * Release page resources associated with a write request after it
+  * has completed.
+  */
+ void nfs_clear_request(struct nfs_page *req)
+ {
+-      if (req->wb_state)
+-              req->wb_state = NULL;
+-      /* Release struct file or cached credential */
+-      if (req->wb_file) {
+-              fput(req->wb_file);
+-              req->wb_file = NULL;
+-      }
+-      if (req->wb_cred) {
+-              put_rpccred(req->wb_cred);
+-              req->wb_cred = NULL;
+-      }
+       if (req->wb_page) {
+               page_cache_release(req->wb_page);
+               req->wb_page = NULL;
+@@ -151,6 +155,7 @@ nfs_release_request(struct nfs_page *req
+       /* Release struct file or cached credential */
+       nfs_clear_request(req);
++      put_nfs_open_context(req->wb_context);
+       nfs_page_free(req);
+ }
+@@ -194,12 +199,12 @@ nfs_list_add_request(struct nfs_page *re
+ int
+ nfs_wait_on_request(struct nfs_page *req)
+ {
+-      struct inode    *inode = req->wb_inode;
++      struct inode    *inode = req->wb_context->dentry->d_inode;
+         struct rpc_clnt       *clnt = NFS_CLIENT(inode);
+       if (!NFS_WBACK_BUSY(req))
+               return 0;
+-      return nfs_wait_event(clnt, req->wb_wait, !NFS_WBACK_BUSY(req));
++      return nfs_wait_event(clnt, req->wb_context->waitq, !NFS_WBACK_BUSY(req));
+ }
+ /**
+@@ -224,7 +229,11 @@ nfs_coalesce_requests(struct list_head *
+               req = nfs_list_entry(head->next);
+               if (prev) {
+-                      if (req->wb_cred != prev->wb_cred)
++                      if (req->wb_context->cred != prev->wb_context->cred)
++                              break;
++                      if (req->wb_context->pid != prev->wb_context->pid)
++                              break;
++                      if (req->wb_context->state != prev->wb_context->state)
+                               break;
+                       if (req->wb_index != (prev->wb_index + 1))
+                               break;
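
The fs/nfs/pagelist.c hunk above tightens nfs_coalesce_requests(): neighbouring page requests may be merged into one RPC only when they share the same credential, lock owner (pid) and open state, and their page indexes are consecutive. A small standalone sketch of that predicate (struct and field names invented for illustration) is:

        /* Sketch only: models the coalescing rule enforced above. */
        #include <stdbool.h>
        #include <stdio.h>

        struct page_req {
                unsigned long index;    /* page index within the file */
                int cred_id;            /* stands in for wb_context->cred */
                int owner_pid;          /* stands in for wb_context->pid */
                int state_id;           /* stands in for wb_context->state */
        };

        static bool can_coalesce(const struct page_req *prev, const struct page_req *req)
        {
                return prev->cred_id == req->cred_id &&
                       prev->owner_pid == req->owner_pid &&
                       prev->state_id == req->state_id &&
                       req->index == prev->index + 1;
        }

        int main(void)
        {
                struct page_req a = { .index = 7, .cred_id = 1, .owner_pid = 42, .state_id = 3 };
                struct page_req b = { .index = 8, .cred_id = 1, .owner_pid = 42, .state_id = 3 };
                struct page_req c = { .index = 9, .cred_id = 1, .owner_pid = 99, .state_id = 3 };

                printf("a+b: %d, b+c: %d\n", can_coalesce(&a, &b), can_coalesce(&b, &c));
                return 0;
        }
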
+--- linux-2.6.7/fs/nfs/nfs4proc.c.lsec 2004-06-15 23:19:44.000000000 -0600
++++ linux-2.6.7/fs/nfs/nfs4proc.c      2005-03-23 14:32:35.532162440 -0700
+@@ -47,12 +47,16 @@
+ #include <linux/smp_lock.h>
+ #include <linux/namei.h>
++#include "delegation.h"
++
+ #define NFSDBG_FACILITY               NFSDBG_PROC
+-#define NFS4_POLL_RETRY_TIME  (15*HZ)
++#define NFS4_POLL_RETRY_MIN   (1*HZ)
++#define NFS4_POLL_RETRY_MAX   (15*HZ)
+ static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
+ static int nfs4_async_handle_error(struct rpc_task *, struct nfs_server *);
++static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry);
+ extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus);
+ extern struct rpc_procinfo nfs4_procedures[];
+@@ -189,53 +193,296 @@ static void update_changeattr(struct ino
+  *    reclaim state on the server after a reboot.
+  *    Assumes caller is holding the sp->so_sem
+  */
+-int
+-nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state)
++static int _nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state)
+ {
+       struct inode *inode = state->inode;
+       struct nfs_server *server = NFS_SERVER(inode);
+-      struct nfs_fattr fattr = {
+-              .valid = 0,
+-      };
+-      struct nfs_open_reclaimargs o_arg = {
++      struct nfs_delegation *delegation = NFS_I(inode)->delegation;
++      struct nfs_openargs o_arg = {
+               .fh = NFS_FH(inode),
+               .seqid = sp->so_seqid,
+               .id = sp->so_id,
+-              .share_access = state->state,
++              .open_flags = state->state,
+               .clientid = server->nfs4_state->cl_clientid,
+               .claim = NFS4_OPEN_CLAIM_PREVIOUS,
+               .bitmask = server->attr_bitmask,
+       };
+       struct nfs_openres o_res = {
+-              .f_attr = &fattr,
+               .server = server,       /* Grrr */
+       };
+       struct rpc_message msg = {
+-              .rpc_proc       = &nfs4_procedures[NFSPROC4_CLNT_OPEN_RECLAIM],
++              .rpc_proc       = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR],
+               .rpc_argp       = &o_arg,
+               .rpc_resp       = &o_res,
+               .rpc_cred       = sp->so_cred,
+       };
+       int status;
++      if (delegation != NULL) {
++              if (!(delegation->flags & NFS_DELEGATION_NEED_RECLAIM)) {
++                      memcpy(&state->stateid, &delegation->stateid,
++                                      sizeof(state->stateid));
++                      set_bit(NFS_DELEGATED_STATE, &state->flags);
++                      return 0;
++              }
++              o_arg.u.delegation_type = delegation->type;
++      }
+       status = rpc_call_sync(server->client, &msg, 0);
+       nfs4_increment_seqid(status, sp);
+-      if (status == 0)
++      if (status == 0) {
+               memcpy(&state->stateid, &o_res.stateid, sizeof(state->stateid));
+-      /* Update the inode attributes */
+-      nfs_refresh_inode(inode, &fattr);
++              if (o_res.delegation_type != 0) {
++                      nfs_inode_reclaim_delegation(inode, sp->so_cred, &o_res);
++                      /* Did the server issue an immediate delegation recall? */
++                      if (o_res.do_recall)
++                              nfs_async_inode_return_delegation(inode, &o_res.stateid);
++              }
++      }
++      clear_bit(NFS_DELEGATED_STATE, &state->flags);
++      /* Ensure we update the inode attributes */
++      NFS_CACHEINV(inode);
+       return status;
+ }
++int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state)
++{
++      struct nfs_server *server = NFS_SERVER(state->inode);
++      struct nfs4_exception exception = { };
++      int err;
++      do {
++              err = _nfs4_open_reclaim(sp, state);
++              switch (err) {
++                      case 0:
++                      case -NFS4ERR_STALE_CLIENTID:
++                      case -NFS4ERR_STALE_STATEID:
++                      case -NFS4ERR_EXPIRED:
++                              return err;
++              }
++              err = nfs4_handle_exception(server, err, &exception);
++      } while (exception.retry);
++      return err;
++}
++
++static int _nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state)
++{
++      struct nfs4_state_owner  *sp  = state->owner;
++      struct inode *inode = dentry->d_inode;
++      struct nfs_server *server = NFS_SERVER(inode);
++      struct dentry *parent = dget_parent(dentry);
++      struct nfs_openargs arg = {
++              .fh = NFS_FH(parent->d_inode),
++              .clientid = server->nfs4_state->cl_clientid,
++              .name = &dentry->d_name,
++              .id = sp->so_id,
++              .server = server,
++              .bitmask = server->attr_bitmask,
++              .claim = NFS4_OPEN_CLAIM_DELEGATE_CUR,
++      };
++      struct nfs_openres res = {
++              .server = server,
++      };
++      struct  rpc_message msg = {
++              .rpc_proc       = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR],
++              .rpc_argp       = &arg,
++              .rpc_resp       = &res,
++              .rpc_cred       = sp->so_cred,
++      };
++      int status = 0;
++
++      down(&sp->so_sema);
++      if (!test_bit(NFS_DELEGATED_STATE, &state->flags))
++              goto out;
++      if (state->state == 0)
++              goto out;
++      arg.seqid = sp->so_seqid;
++      arg.open_flags = state->state;
++      memcpy(arg.u.delegation.data, state->stateid.data, sizeof(arg.u.delegation.data));
++      status = rpc_call_sync(server->client, &msg, 0);
++      nfs4_increment_seqid(status, sp);
++      if (status >= 0) {
++              memcpy(state->stateid.data, res.stateid.data,
++                              sizeof(state->stateid.data));
++              clear_bit(NFS_DELEGATED_STATE, &state->flags);
++      }
++out:
++      up(&sp->so_sema);
++      dput(parent);
++      return status;
++}
++
++int nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state)
++{
++      struct nfs4_exception exception = { };
++      struct nfs_server *server = NFS_SERVER(dentry->d_inode);
++      int err;
++      do {
++              err = _nfs4_open_delegation_recall(dentry, state);
++              switch (err) {
++                      case 0:
++                              return err;
++                      case -NFS4ERR_STALE_CLIENTID:
++                      case -NFS4ERR_STALE_STATEID:
++                      case -NFS4ERR_EXPIRED:
++                              /* Don't recall a delegation if it was lost */
++                              nfs4_schedule_state_recovery(server->nfs4_state);
++                              return err;
++              }
++              err = nfs4_handle_exception(server, err, &exception);
++      } while (exception.retry);
++      return err;
++}
++
++static int _nfs4_proc_open_confirm(struct rpc_clnt *clnt, const struct nfs_fh *fh, struct nfs4_state_owner *sp, nfs4_stateid *stateid)
++{
++      struct nfs_open_confirmargs arg = {
++              .fh             = fh,
++              .seqid          = sp->so_seqid,
++              .stateid        = *stateid,
++      };
++      struct nfs_open_confirmres res;
++      struct  rpc_message msg = {
++              .rpc_proc       = &nfs4_procedures[NFSPROC4_CLNT_OPEN_CONFIRM],
++              .rpc_argp       = &arg,
++              .rpc_resp       = &res,
++              .rpc_cred       = sp->so_cred,
++      };
++      int status;
++
++      status = rpc_call_sync(clnt, &msg, 0);
++      nfs4_increment_seqid(status, sp);
++      if (status >= 0)
++              memcpy(stateid, &res.stateid, sizeof(*stateid));
++      return status;
++}
++
++static int _nfs4_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
++{
++      struct nfs_access_entry cache;
++      int status;
++
++      status = nfs_access_get_cached(inode, cred, &cache);
++      if (status == 0)
++              goto out;
++
++      /* Be clever: ask server to check for all possible rights */
++      cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ;
++      cache.cred = cred;
++      cache.jiffies = jiffies;
++      status = _nfs4_proc_access(inode, &cache);
++      if (status != 0)
++              return status;
++      nfs_access_add_cache(inode, &cache);
++out:
++      if ((cache.mask & mask) == mask)
++              return 0;
++      return -EACCES;
++}
++
++/*
++ * Returns an nfs4_state + an extra reference to the inode
++ */
++int _nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred *cred, struct nfs4_state **res)
++{
++      struct nfs_delegation *delegation;
++      struct nfs_server *server = NFS_SERVER(inode);
++      struct nfs4_client *clp = server->nfs4_state;
++      struct nfs_inode *nfsi = NFS_I(inode);
++      struct nfs4_state_owner *sp = NULL;
++      struct nfs4_state *state = NULL;
++      int open_flags = flags & (FMODE_READ|FMODE_WRITE);
++      int mask = 0;
++      int err;
++
++      /* Protect against reboot recovery - NOTE ORDER! */
++      down_read(&clp->cl_sem);
++      /* Protect against delegation recall */
++      down_read(&nfsi->rwsem);
++      delegation = NFS_I(inode)->delegation;
++      err = -ENOENT;
++      if (delegation == NULL || (delegation->type & open_flags) != open_flags)
++              goto out_err;
++      err = -ENOMEM;
++      if (!(sp = nfs4_get_state_owner(server, cred))) {
++              dprintk("%s: nfs4_get_state_owner failed!\n", __FUNCTION__);
++              goto out_err;
++      }
++      down(&sp->so_sema);
++      state = nfs4_get_open_state(inode, sp);
++      if (state == NULL)
++              goto out_err;
++
++      err = -ENOENT;
++      if ((state->state & open_flags) == open_flags) {
++              spin_lock(&inode->i_lock);
++              if (open_flags & FMODE_READ)
++                      state->nreaders++;
++              if (open_flags & FMODE_WRITE)
++                      state->nwriters++;
++              spin_unlock(&inode->i_lock);
++              goto out_ok;
++      } else if (state->state != 0)
++              goto out_err;
++
++      lock_kernel();
++      err = _nfs4_do_access(inode, cred, mask);
++      unlock_kernel();
++      if (err != 0)
++              goto out_err;
++      spin_lock(&inode->i_lock);
++      memcpy(state->stateid.data, delegation->stateid.data,
++                      sizeof(state->stateid.data));
++      state->state |= open_flags;
++      if (open_flags & FMODE_READ)
++              state->nreaders++;
++      if (open_flags & FMODE_WRITE)
++              state->nwriters++;
++      set_bit(NFS_DELEGATED_STATE, &state->flags);
++      spin_unlock(&inode->i_lock);
++out_ok:
++      up(&sp->so_sema);
++      nfs4_put_state_owner(sp);
++      up_read(&nfsi->rwsem);
++      up_read(&clp->cl_sem);
++      igrab(inode);
++      *res = state;
++      return 0; 
++out_err:
++      if (sp != NULL) {
++              if (state != NULL)
++                      nfs4_put_open_state(state);
++              up(&sp->so_sema);
++              nfs4_put_state_owner(sp);
++      }
++      up_read(&nfsi->rwsem);
++      up_read(&clp->cl_sem);
++      return err;
++}
++
++static struct nfs4_state *nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred *cred)
++{
++      struct nfs4_exception exception = { };
++      struct nfs4_state *res;
++      int err;
++
++      do {
++              err = _nfs4_open_delegated(inode, flags, cred, &res);
++              if (err == 0)
++                      break;
++              res = ERR_PTR(nfs4_handle_exception(NFS_SERVER(inode),
++                                      err, &exception));
++      } while (exception.retry);
++      return res;
++}
++
+ /*
+  * Returns an nfs4_state + an referenced inode
+  */
+-struct nfs4_state *
+-nfs4_do_open(struct inode *dir, struct qstr *name, int flags, struct iattr *sattr, struct rpc_cred *cred)
++static int _nfs4_do_open(struct inode *dir, struct qstr *name, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res)
+ {
+       struct nfs4_state_owner  *sp;
+       struct nfs4_state     *state = NULL;
+       struct nfs_server       *server = NFS_SERVER(dir);
++      struct nfs4_client *clp = server->nfs4_state;
+       struct inode *inode = NULL;
+       int                     status;
+       struct nfs_fattr        f_attr = {
+@@ -243,12 +490,11 @@ nfs4_do_open(struct inode *dir, struct q
+       };
+       struct nfs_openargs o_arg = {
+               .fh             = NFS_FH(dir),
+-              .share_access   = flags & (FMODE_READ|FMODE_WRITE),
+-              .opentype       = (flags & O_CREAT) ? NFS4_OPEN_CREATE : NFS4_OPEN_NOCREATE,
+-              .createmode     = (flags & O_EXCL) ? NFS4_CREATE_EXCLUSIVE : NFS4_CREATE_UNCHECKED,
++              .open_flags     = flags,
+               .name           = name,
+               .server         = server,
+               .bitmask = server->attr_bitmask,
++              .claim = NFS4_OPEN_CLAIM_NULL,
+       };
+       struct nfs_openres o_res = {
+               .f_attr         = &f_attr,
+@@ -261,60 +507,44 @@ nfs4_do_open(struct inode *dir, struct q
+               .rpc_cred       = cred,
+       };
+-retry:
++      /* Protect against reboot recovery conflicts */
++      down_read(&clp->cl_sem);
+       status = -ENOMEM;
+-      if (!(sp = nfs4_get_state_owner(NFS_SERVER(dir), cred))) {
++      if (!(sp = nfs4_get_state_owner(server, cred))) {
+               dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n");
+-              goto out;
++              goto out_err;
+       }
+-      if (o_arg.createmode & NFS4_CREATE_EXCLUSIVE){
++      if (flags & O_EXCL) {
+               u32 *p = (u32 *) o_arg.u.verifier.data;
+               p[0] = jiffies;
+               p[1] = current->pid;
+-      } else if (o_arg.createmode == NFS4_CREATE_UNCHECKED) {
++      } else
+               o_arg.u.attrs = sattr;
+-      }
+       /* Serialization for the sequence id */
+       down(&sp->so_sema);
+       o_arg.seqid = sp->so_seqid;
+       o_arg.id = sp->so_id;
+-      o_arg.clientid = NFS_SERVER(dir)->nfs4_state->cl_clientid,
++      o_arg.clientid = clp->cl_clientid;
+       status = rpc_call_sync(server->client, &msg, 0);
+       nfs4_increment_seqid(status, sp);
+       if (status)
+-              goto out_up;
++              goto out_err;
+       update_changeattr(dir, &o_res.cinfo);
++      if(o_res.rflags & NFS4_OPEN_RESULT_CONFIRM) {
++              status = _nfs4_proc_open_confirm(server->client, &o_res.fh, sp, &o_res.stateid);
++              if (status)
++                      goto out_err;
++      }
+       status = -ENOMEM;
+       inode = nfs_fhget(dir->i_sb, &o_res.fh, &f_attr);
+       if (!inode)
+-              goto out_up;
++              goto out_err;
+       state = nfs4_get_open_state(inode, sp);
+       if (!state)
+-              goto out_up;
+-
+-      if(o_res.rflags & NFS4_OPEN_RESULT_CONFIRM) {
+-              struct nfs_open_confirmargs oc_arg = {
+-                      .fh             = &o_res.fh,
+-                      .seqid          = sp->so_seqid,
+-              };
+-              struct nfs_open_confirmres oc_res;
+-              struct  rpc_message msg = {
+-                      .rpc_proc       = &nfs4_procedures[NFSPROC4_CLNT_OPEN_CONFIRM],
+-                      .rpc_argp       = &oc_arg,
+-                      .rpc_resp       = &oc_res,
+-                      .rpc_cred       = cred,
+-              };
+-
+-              memcpy(&oc_arg.stateid, &o_res.stateid, sizeof(oc_arg.stateid));
+-              status = rpc_call_sync(server->client, &msg, 0);
+-              nfs4_increment_seqid(status, sp);
+-              if (status)
+-                      goto out_up;
+-              memcpy(&state->stateid, &oc_res.stateid, sizeof(state->stateid));
+-      } else
+-              memcpy(&state->stateid, &o_res.stateid, sizeof(state->stateid));
++              goto out_err;
++      memcpy(&state->stateid, &o_res.stateid, sizeof(state->stateid));
+       spin_lock(&inode->i_lock);
+       if (flags & FMODE_READ)
+               state->nreaders++;
+@@ -322,47 +552,62 @@ retry:
+               state->nwriters++;
+       state->state |= flags & (FMODE_READ|FMODE_WRITE);
+       spin_unlock(&inode->i_lock);
+-
++      if (o_res.delegation_type != 0)
++              nfs_inode_set_delegation(inode, cred, &o_res);
+       up(&sp->so_sema);
+       nfs4_put_state_owner(sp);
+-      return state;
+-
+-out_up:
+-      up(&sp->so_sema);
+-      nfs4_put_state_owner(sp);
+-      if (state) {
+-              nfs4_put_open_state(state);
+-              state = NULL;
+-      }
+-      if (inode) {
++      up_read(&clp->cl_sem);
++      *res = state;
++      return 0;
++out_err:
++      if (sp != NULL) {
++              if (state != NULL)
++                      nfs4_put_open_state(state);
++              up(&sp->so_sema);
++              nfs4_put_state_owner(sp);
++      }
++      /* Note: clp->cl_sem must be released before nfs4_put_open_state()! */
++      up_read(&clp->cl_sem);
++      if (inode != NULL)
+               iput(inode);
+-              inode = NULL;
+-      }
+-      /* NOTE: BAD_SEQID means the server and client disagree about the
+-       * book-keeping w.r.t. state-changing operations
+-       * (OPEN/CLOSE/LOCK/LOCKU...)
+-       * It is actually a sign of a bug on the client or on the server.
+-       *
+-       * If we receive a BAD_SEQID error in the particular case of
+-       * doing an OPEN, we assume that nfs4_increment_seqid() will
+-       * have unhashed the old state_owner for us, and that we can
+-       * therefore safely retry using a new one. We should still warn
+-       * the user though...
+-       */
+-      if (status == -NFS4ERR_BAD_SEQID) {
+-              printk(KERN_WARNING "NFS: v4 server returned a bad sequence-id error!\n");
+-              goto retry;
+-      }
+-      status = nfs4_handle_error(server, status);
+-      if (!status)
+-              goto retry;
+-      BUG_ON(status < -1000 || status > 0);
+-out:
+-      return ERR_PTR(status);
++      *res = NULL;
++      return status;
+ }
+-int
+-nfs4_do_setattr(struct nfs_server *server, struct nfs_fattr *fattr,
++
++struct nfs4_state *nfs4_do_open(struct inode *dir, struct qstr *name, int flags, struct iattr *sattr, struct rpc_cred *cred)
++{
++      struct nfs4_exception exception = { };
++      struct nfs4_state *res;
++      int status;
++
++      do {
++              status = _nfs4_do_open(dir, name, flags, sattr, cred, &res);
++              if (status == 0)
++                      break;
++              /* NOTE: BAD_SEQID means the server and client disagree about the
++               * book-keeping w.r.t. state-changing operations
++               * (OPEN/CLOSE/LOCK/LOCKU...)
++               * It is actually a sign of a bug on the client or on the server.
++               *
++               * If we receive a BAD_SEQID error in the particular case of
++               * doing an OPEN, we assume that nfs4_increment_seqid() will
++               * have unhashed the old state_owner for us, and that we can
++               * therefore safely retry using a new one. We should still warn
++               * the user though...
++               */
++              if (status == -NFS4ERR_BAD_SEQID) {
++                      printk(KERN_WARNING "NFS: v4 server returned a bad sequence-id error!\n");
++                      exception.retry = 1;
++                      continue;
++              }
++              res = ERR_PTR(nfs4_handle_exception(NFS_SERVER(dir),
++                                      status, &exception));
++      } while (exception.retry);
++      return res;
++}
++
++static int _nfs4_do_setattr(struct nfs_server *server, struct nfs_fattr *fattr,
+                 struct nfs_fh *fhandle, struct iattr *sattr,
+                 struct nfs4_state *state)
+ {
+@@ -381,9 +626,7 @@ nfs4_do_setattr(struct nfs_server *serve
+                 .rpc_argp       = &arg,
+                 .rpc_resp       = &res,
+         };
+-      int status;
+-retry:
+         fattr->valid = 0;
+       if (sattr->ia_valid & ATTR_SIZE)
+@@ -391,13 +634,22 @@ retry:
+       else
+               memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid));
+-      status = rpc_call_sync(server->client, &msg, 0);
+-      if (status) {
+-              status = nfs4_handle_error(server, status);
+-              if (!status)
+-                      goto retry;
+-      }
+-      return status;
++      return rpc_call_sync(server->client, &msg, 0);
++}
++
++int nfs4_do_setattr(struct nfs_server *server, struct nfs_fattr *fattr,
++                struct nfs_fh *fhandle, struct iattr *sattr,
++                struct nfs4_state *state)
++{
++      struct nfs4_exception exception = { };
++      int err;
++      do {
++              err = nfs4_handle_exception(server,
++                              _nfs4_do_setattr(server, fattr, fhandle, sattr,
++                                      state),
++                              &exception);
++      } while (exception.retry);
++      return err;
+ }
+ /* 
+@@ -411,8 +663,7 @@ retry:
+  *
+  * NOTE: Caller must be holding the sp->so_owner semaphore!
+  */
+-int
+-nfs4_do_close(struct inode *inode, struct nfs4_state *state) 
++static int _nfs4_do_close(struct inode *inode, struct nfs4_state *state) 
+ {
+       struct nfs4_state_owner *sp = state->owner;
+       int status = 0;
+@@ -426,6 +677,8 @@ nfs4_do_close(struct inode *inode, struc
+               .rpc_resp       = &res,
+       };
++      if (test_bit(NFS_DELEGATED_STATE, &state->flags))
++              return 0;
+       memcpy(&arg.stateid, &state->stateid, sizeof(arg.stateid));
+       /* Serialization for the sequence id */
+       arg.seqid = sp->so_seqid,
+@@ -441,15 +694,34 @@ nfs4_do_close(struct inode *inode, struc
+       return status;
+ }
+-int
+-nfs4_do_downgrade(struct inode *inode, struct nfs4_state *state, mode_t mode) 
++int nfs4_do_close(struct inode *inode, struct nfs4_state *state) 
++{
++      struct nfs_server *server = NFS_SERVER(state->inode);
++      struct nfs4_exception exception = { };
++      int err;
++      do {
++              err = _nfs4_do_close(inode, state);
++              switch (err) {
++                      case -NFS4ERR_STALE_STATEID:
++                      case -NFS4ERR_EXPIRED:
++                              nfs4_schedule_state_recovery(server->nfs4_state);
++                      case 0:
++                              state->state = 0;
++                              return 0;
++              }
++              err = nfs4_handle_exception(server, err, &exception);
++      } while (exception.retry);
++      return err;
++}
++
++static int _nfs4_do_downgrade(struct inode *inode, struct nfs4_state *state, mode_t mode) 
+ {
+       struct nfs4_state_owner *sp = state->owner;
+       int status = 0;
+       struct nfs_closeargs arg = {
+               .fh             = NFS_FH(inode),
+               .seqid          = sp->so_seqid,
+-              .share_access   = mode,
++              .open_flags     = mode,
+       };
+       struct nfs_closeres res;
+       struct rpc_message msg = {
+@@ -458,6 +730,8 @@ nfs4_do_downgrade(struct inode *inode, s
+               .rpc_resp       = &res,
+       };
++      if (test_bit(NFS_DELEGATED_STATE, &state->flags))
++              return 0;
+       memcpy(&arg.stateid, &state->stateid, sizeof(arg.stateid));
+       status = rpc_call_sync(NFS_SERVER(inode)->client, &msg, 0);
+       nfs4_increment_seqid(status, sp);
+@@ -467,6 +741,26 @@ nfs4_do_downgrade(struct inode *inode, s
+       return status;
+ }
++int nfs4_do_downgrade(struct inode *inode, struct nfs4_state *state, mode_t mode) 
++{
++      struct nfs_server *server = NFS_SERVER(state->inode);
++      struct nfs4_exception exception = { };
++      int err;
++      do {
++              err = _nfs4_do_downgrade(inode, state, mode);
++              switch (err) {
++                      case -NFS4ERR_STALE_STATEID:
++                      case -NFS4ERR_EXPIRED:
++                              nfs4_schedule_state_recovery(server->nfs4_state);
++                      case 0:
++                              state->state = mode;
++                              return 0;
++              }
++              err = nfs4_handle_exception(server, err, &exception);
++      } while (exception.retry);
++      return err;
++}
++
+ struct inode *
+ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+ {
+@@ -500,7 +794,9 @@ nfs4_open_revalidate(struct inode *dir, 
+       struct inode *inode;
+       cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0);
+-      state = nfs4_do_open(dir, &dentry->d_name, openflags, NULL, cred);
++      state = nfs4_open_delegated(dentry->d_inode, openflags, cred);
++      if (IS_ERR(state))
++              state = nfs4_do_open(dir, &dentry->d_name, openflags, NULL, cred);
+       put_rpccred(cred);
+       if (state == ERR_PTR(-ENOENT) && dentry->d_inode == 0)
+               return 1;
+@@ -518,7 +814,7 @@ nfs4_open_revalidate(struct inode *dir, 
+ }
+-static int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
++static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
+ {
+       struct nfs4_server_caps_res res = {};
+       struct rpc_message msg = {
+@@ -542,7 +838,19 @@ static int nfs4_server_capabilities(stru
+       return status;
+ }
+-static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
++static int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
++{
++      struct nfs4_exception exception = { };
++      int err;
++      do {
++              err = nfs4_handle_exception(server,
++                              _nfs4_server_capabilities(server, fhandle),
++                              &exception);
++      } while (exception.retry);
++      return err;
++}
++
++static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
+               struct nfs_fsinfo *info)
+ {
+       struct nfs_fattr *      fattr = info->fattr;
+@@ -563,6 +871,19 @@ static int nfs4_lookup_root(struct nfs_s
+       return rpc_call_sync(server->client, &msg, 0);
+ }
++static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
++              struct nfs_fsinfo *info)
++{
++      struct nfs4_exception exception = { };
++      int err;
++      do {
++              err = nfs4_handle_exception(server,
++                              _nfs4_lookup_root(server, fhandle, info),
++                              &exception);
++      } while (exception.retry);
++      return err;
++}
++
+ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
+               struct nfs_fsinfo *info)
+ {
+@@ -597,6 +918,8 @@ static int nfs4_proc_get_root(struct nfs
+       p = server->mnt_path;
+       for (;;) {
++              struct nfs4_exception exception = { };
++
+               while (*p == '/')
+                       p++;
+               if (!*p)
+@@ -606,9 +929,13 @@ static int nfs4_proc_get_root(struct nfs
+                       p++;
+               q.len = p - q.name;
+-              fattr->valid = 0;
+-              status = rpc_call_sync(server->client, &msg, 0);
+-              if (!status)
++              do {
++                      fattr->valid = 0;
++                      status = nfs4_handle_exception(server,
++                                      rpc_call_sync(server->client, &msg, 0),
++                                      &exception);
++              } while (exception.retry);
++              if (status == 0)
+                       continue;
+               if (status == -ENOENT) {
+                       printk(KERN_NOTICE "NFS: mount path %s does not exist!\n", server->mnt_path);
+@@ -621,10 +948,10 @@ static int nfs4_proc_get_root(struct nfs
+       if (status == 0)
+               status = nfs4_do_fsinfo(server, fhandle, info);
+ out:
+-      return nfs4_map_errors(status);
++      return status;
+ }
+-static int nfs4_proc_getattr(struct inode *inode, struct nfs_fattr *fattr)
++static int _nfs4_proc_getattr(struct inode *inode, struct nfs_fattr *fattr)
+ {
+       struct nfs_server *server = NFS_SERVER(inode);
+       struct nfs4_getattr_arg args = {
+@@ -642,8 +969,19 @@ static int nfs4_proc_getattr(struct inod
+       };
+       
+       fattr->valid = 0;
++      return rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
++}
+-      return nfs4_map_errors(rpc_call_sync(NFS_CLIENT(inode), &msg, 0));
++static int nfs4_proc_getattr(struct inode *inode, struct nfs_fattr *fattr)
++{
++      struct nfs4_exception exception = { };
++      int err;
++      do {
++              err = nfs4_handle_exception(NFS_SERVER(inode),
++                              _nfs4_proc_getattr(inode, fattr),
++                              &exception);
++      } while (exception.retry);
++      return err;
+ }
+ /* 
+@@ -678,9 +1016,13 @@ nfs4_proc_setattr(struct dentry *dentry,
+       if (size_change) {
+               struct rpc_cred *cred = rpcauth_lookupcred(NFS_SERVER(inode)->client->cl_auth, 0);
+               state = nfs4_find_state(inode, cred, FMODE_WRITE);
+-              if (!state) {
+-                      state = nfs4_do_open(dentry->d_parent->d_inode, 
+-                              &dentry->d_name, FMODE_WRITE, NULL, cred);
++              if (state == NULL) {
++                      state = nfs4_open_delegated(dentry->d_inode,
++                                      FMODE_WRITE, cred);
++                      if (IS_ERR(state))
++                              state = nfs4_do_open(dentry->d_parent->d_inode,
++                                              &dentry->d_name, FMODE_WRITE,
++                                              NULL, cred);
+                       need_iput = 1;
+               }
+               put_rpccred(cred);
+@@ -705,7 +1047,7 @@ out:
+       return status;
+ }
+-static int nfs4_proc_lookup(struct inode *dir, struct qstr *name,
++static int _nfs4_proc_lookup(struct inode *dir, struct qstr *name,
+               struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+ {
+       int                    status;
+@@ -731,12 +1073,23 @@ static int nfs4_proc_lookup(struct inode
+       dprintk("NFS call  lookup %s\n", name->name);
+       status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+       dprintk("NFS reply lookup: %d\n", status);
+-      return nfs4_map_errors(status);
++      return status;
+ }
+-static int nfs4_proc_access(struct inode *inode, struct rpc_cred *cred, int mode)
++static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
++{
++      struct nfs4_exception exception = { };
++      int err;
++      do {
++              err = nfs4_handle_exception(NFS_SERVER(dir),
++                              _nfs4_proc_lookup(dir, name, fhandle, fattr),
++                              &exception);
++      } while (exception.retry);
++      return err;
++}
++
++static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry)
+ {
+-      int                     status;
+       struct nfs4_accessargs args = {
+               .fh = NFS_FH(inode),
+       };
+@@ -745,8 +1098,10 @@ static int nfs4_proc_access(struct inode
+               .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS],
+               .rpc_argp = &args,
+               .rpc_resp = &res,
+-              .rpc_cred = cred,
++              .rpc_cred = entry->cred,
+       };
++      int mode = entry->mask;
++      int status;
+       /*
+        * Determine which access bits we want to ask for...
+@@ -758,8 +1113,7 @@ static int nfs4_proc_access(struct inode
+                       args.access |= NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE;
+               if (mode & MAY_EXEC)
+                       args.access |= NFS4_ACCESS_LOOKUP;
+-      }
+-      else {
++      } else {
+               if (mode & MAY_WRITE)
+                       args.access |= NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND;
+               if (mode & MAY_EXEC)
+@@ -767,13 +1121,27 @@ static int nfs4_proc_access(struct inode
+       }
+       status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+       if (!status) {
+-              if (args.access != res.supported) {
+-                      printk(KERN_NOTICE "NFS: server didn't support all access bits!\n");
+-                      status = -ENOTSUPP;
+-              } else if ((args.access & res.access) != args.access)
+-                      status = -EACCES;
++              entry->mask = 0;
++              if (res.access & NFS4_ACCESS_READ)
++                      entry->mask |= MAY_READ;
++              if (res.access & (NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE))
++                      entry->mask |= MAY_WRITE;
++              if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE))
++                      entry->mask |= MAY_EXEC;
+       }
+-      return nfs4_map_errors(status);
++      return status;
++}
++
++static int nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry)
++{
++      struct nfs4_exception exception = { };
++      int err;
++      do {
++              err = nfs4_handle_exception(NFS_SERVER(inode),
++                              _nfs4_proc_access(inode, entry),
++                              &exception);
++      } while (exception.retry);
++      return err;
+ }
+ /*
+@@ -800,7 +1168,7 @@ static int nfs4_proc_access(struct inode
+  * Both of these changes to the XDR layer would in fact be quite
+  * minor, but I decided to leave them for a subsequent patch.
+  */
+-static int nfs4_proc_readlink(struct inode *inode, struct page *page)
++static int _nfs4_proc_readlink(struct inode *inode, struct page *page)
+ {
+       struct nfs4_readlink args = {
+               .fh       = NFS_FH(inode),
+@@ -813,11 +1181,22 @@ static int nfs4_proc_readlink(struct ino
+               .rpc_resp = NULL,
+       };
+-      return nfs4_map_errors(rpc_call_sync(NFS_CLIENT(inode), &msg, 0));
++      return rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+ }
+-static int
+-nfs4_proc_read(struct nfs_read_data *rdata, struct file *filp)
++static int nfs4_proc_readlink(struct inode *inode, struct page *page)
++{
++      struct nfs4_exception exception = { };
++      int err;
++      do {
++              err = nfs4_handle_exception(NFS_SERVER(inode),
++                              _nfs4_proc_readlink(inode, page),
++                              &exception);
++      } while (exception.retry);
++      return err;
++}
++
++static int _nfs4_proc_read(struct nfs_read_data *rdata)
+ {
+       int flags = rdata->flags;
+       struct inode *inode = rdata->inode;
+@@ -827,6 +1206,7 @@ nfs4_proc_read(struct nfs_read_data *rda
+               .rpc_proc       = &nfs4_procedures[NFSPROC4_CLNT_READ],
+               .rpc_argp       = &rdata->args,
+               .rpc_resp       = &rdata->res,
++              .rpc_cred       = rdata->cred,
+       };
+       unsigned long timestamp = jiffies;
+       int status;
+@@ -834,29 +1214,27 @@ nfs4_proc_read(struct nfs_read_data *rda
+       dprintk("NFS call  read %d @ %Ld\n", rdata->args.count,
+                       (long long) rdata->args.offset);
+-      /*
+-       * Try first to use O_RDONLY, then O_RDWR stateid.
+-       */
+-      if (filp) {
+-              struct nfs4_state *state;
+-              state = (struct nfs4_state *)filp->private_data;
+-              rdata->args.state = state;
+-              msg.rpc_cred = state->owner->so_cred;
+-      } else {
+-              rdata->args.state = NULL;
+-              msg.rpc_cred = NFS_I(inode)->mm_cred;
+-      }
+-
+       fattr->valid = 0;
+       status = rpc_call_sync(server->client, &msg, flags);
+       if (!status)
+               renew_lease(server, timestamp);
+       dprintk("NFS reply read: %d\n", status);
+-      return nfs4_map_errors(status);
++      return status;
+ }
+-static int
+-nfs4_proc_write(struct nfs_write_data *wdata, struct file *filp)
++static int nfs4_proc_read(struct nfs_read_data *rdata)
++{
++      struct nfs4_exception exception = { };
++      int err;
++      do {
++              err = nfs4_handle_exception(NFS_SERVER(rdata->inode),
++                              _nfs4_proc_read(rdata),
++                              &exception);
++      } while (exception.retry);
++      return err;
++}
++
++static int _nfs4_proc_write(struct nfs_write_data *wdata)
+ {
+       int rpcflags = wdata->flags;
+       struct inode *inode = wdata->inode;
+@@ -866,33 +1244,32 @@ nfs4_proc_write(struct nfs_write_data *w
+               .rpc_proc       = &nfs4_procedures[NFSPROC4_CLNT_WRITE],
+               .rpc_argp       = &wdata->args,
+               .rpc_resp       = &wdata->res,
++              .rpc_cred       = wdata->cred,
+       };
+       int status;
+       dprintk("NFS call  write %d @ %Ld\n", wdata->args.count,
+                       (long long) wdata->args.offset);
+-      /*
+-       * Try first to use O_WRONLY, then O_RDWR stateid.
+-       */
+-      if (filp) {
+-              struct nfs4_state *state;
+-              state = (struct nfs4_state *)filp->private_data;
+-              wdata->args.state = state;
+-              msg.rpc_cred = state->owner->so_cred;
+-      } else {
+-              wdata->args.state = NULL;
+-              msg.rpc_cred = NFS_I(inode)->mm_cred;
+-      }
+-
+       fattr->valid = 0;
+       status = rpc_call_sync(server->client, &msg, rpcflags);
+       dprintk("NFS reply write: %d\n", status);
+-      return nfs4_map_errors(status);
++      return status;
+ }
+-static int
+-nfs4_proc_commit(struct nfs_write_data *cdata, struct file *filp)
++static int nfs4_proc_write(struct nfs_write_data *wdata)
++{
++      struct nfs4_exception exception = { };
++      int err;
++      do {
++              err = nfs4_handle_exception(NFS_SERVER(wdata->inode),
++                              _nfs4_proc_write(wdata),
++                              &exception);
++      } while (exception.retry);
++      return err;
++}
++
++static int _nfs4_proc_commit(struct nfs_write_data *cdata)
+ {
+       struct inode *inode = cdata->inode;
+       struct nfs_fattr *fattr = cdata->res.fattr;
+@@ -901,24 +1278,29 @@ nfs4_proc_commit(struct nfs_write_data *
+               .rpc_proc       = &nfs4_procedures[NFSPROC4_CLNT_COMMIT],
+               .rpc_argp       = &cdata->args,
+               .rpc_resp       = &cdata->res,
++              .rpc_cred       = cdata->cred,
+       };
+       int status;
+       dprintk("NFS call  commit %d @ %Ld\n", cdata->args.count,
+                       (long long) cdata->args.offset);
+-      /*
+-       * Try first to use O_WRONLY, then O_RDWR stateid.
+-       */
+-      if (filp)
+-              msg.rpc_cred = ((struct nfs4_state *)filp->private_data)->owner->so_cred;
+-      else
+-              msg.rpc_cred = NFS_I(inode)->mm_cred;
+-
+       fattr->valid = 0;
+       status = rpc_call_sync(server->client, &msg, 0);
+       dprintk("NFS reply commit: %d\n", status);
+-      return nfs4_map_errors(status);
++      return status;
++}
++
++static int nfs4_proc_commit(struct nfs_write_data *cdata)
++{
++      struct nfs4_exception exception = { };
++      int err;
++      do {
++              err = nfs4_handle_exception(NFS_SERVER(cdata->inode),
++                              _nfs4_proc_commit(cdata),
++                              &exception);
++      } while (exception.retry);
++      return err;
+ }
+ /*
+@@ -965,7 +1347,7 @@ nfs4_proc_create(struct inode *dir, stru
+       return inode;
+ }
+-static int nfs4_proc_remove(struct inode *dir, struct qstr *name)
++static int _nfs4_proc_remove(struct inode *dir, struct qstr *name)
+ {
+       struct nfs4_remove_arg args = {
+               .fh = NFS_FH(dir),
+@@ -982,7 +1364,19 @@ static int nfs4_proc_remove(struct inode
+       status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+       if (status == 0)
+               update_changeattr(dir, &res);
+-      return nfs4_map_errors(status);
++      return status;
++}
++
++static int nfs4_proc_remove(struct inode *dir, struct qstr *name)
++{
++      struct nfs4_exception exception = { };
++      int err;
++      do {
++              err = nfs4_handle_exception(NFS_SERVER(dir),
++                              _nfs4_proc_remove(dir, name),
++                              &exception);
++      } while (exception.retry);
++      return err;
+ }
+ struct unlink_desc {
+@@ -1023,7 +1417,7 @@ static int nfs4_proc_unlink_done(struct 
+       return 0;
+ }
+-static int nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
++static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
+               struct inode *new_dir, struct qstr *new_name)
+ {
+       struct nfs4_rename_arg arg = {
+@@ -1046,10 +1440,24 @@ static int nfs4_proc_rename(struct inode
+               update_changeattr(old_dir, &res.old_cinfo);
+               update_changeattr(new_dir, &res.new_cinfo);
+       }
+-      return nfs4_map_errors(status);
++      return status;
+ }
+-static int nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
++static int nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
++              struct inode *new_dir, struct qstr *new_name)
++{
++      struct nfs4_exception exception = { };
++      int err;
++      do {
++              err = nfs4_handle_exception(NFS_SERVER(old_dir),
++                              _nfs4_proc_rename(old_dir, old_name,
++                                      new_dir, new_name),
++                              &exception);
++      } while (exception.retry);
++      return err;
++}
++
++static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
+ {
+       struct nfs4_link_arg arg = {
+               .fh     = NFS_FH(inode),
+@@ -1068,10 +1476,22 @@ static int nfs4_proc_link(struct inode *
+       if (!status)
+               update_changeattr(dir, &cinfo);
+-      return nfs4_map_errors(status);
++      return status;
++}
++
++static int nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
++{
++      struct nfs4_exception exception = { };
++      int err;
++      do {
++              err = nfs4_handle_exception(NFS_SERVER(inode),
++                              _nfs4_proc_link(inode, dir, name),
++                              &exception);
++      } while (exception.retry);
++      return err;
+ }
+-static int nfs4_proc_symlink(struct inode *dir, struct qstr *name,
++static int _nfs4_proc_symlink(struct inode *dir, struct qstr *name,
+               struct qstr *path, struct iattr *sattr, struct nfs_fh *fhandle,
+               struct nfs_fattr *fattr)
+ {
+@@ -1090,22 +1510,39 @@ static int nfs4_proc_symlink(struct inod
+               .fattr = fattr,
+       };
+       struct rpc_message msg = {
+-              .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE],
++              .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK],
+               .rpc_argp = &arg,
+               .rpc_resp = &res,
+       };
+       int                     status;
++      if (path->len > NFS4_MAXPATHLEN)
++              return -ENAMETOOLONG;
+       arg.u.symlink = path;
+       fattr->valid = 0;
+       
+       status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+       if (!status)
+               update_changeattr(dir, &res.dir_cinfo);
+-      return nfs4_map_errors(status);
++      return status;
+ }
+-static int nfs4_proc_mkdir(struct inode *dir, struct qstr *name,
++static int nfs4_proc_symlink(struct inode *dir, struct qstr *name,
++              struct qstr *path, struct iattr *sattr, struct nfs_fh *fhandle,
++              struct nfs_fattr *fattr)
++{
++      struct nfs4_exception exception = { };
++      int err;
++      do {
++              err = nfs4_handle_exception(NFS_SERVER(dir),
++                              _nfs4_proc_symlink(dir, name, path, sattr,
++                                      fhandle, fattr),
++                              &exception);
++      } while (exception.retry);
++      return err;
++}
++
++static int _nfs4_proc_mkdir(struct inode *dir, struct qstr *name,
+               struct iattr *sattr, struct nfs_fh *fhandle,
+               struct nfs_fattr *fattr)
+ {
+@@ -1135,10 +1572,25 @@ static int nfs4_proc_mkdir(struct inode 
+       status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+       if (!status)
+               update_changeattr(dir, &res.dir_cinfo);
+-      return nfs4_map_errors(status);
++      return status;
+ }
+-static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
++static int nfs4_proc_mkdir(struct inode *dir, struct qstr *name,
++              struct iattr *sattr, struct nfs_fh *fhandle,
++              struct nfs_fattr *fattr)
++{
++      struct nfs4_exception exception = { };
++      int err;
++      do {
++              err = nfs4_handle_exception(NFS_SERVER(dir),
++                              _nfs4_proc_mkdir(dir, name, sattr,
++                                      fhandle, fattr),
++                              &exception);
++      } while (exception.retry);
++      return err;
++}
++
++static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
+                   u64 cookie, struct page *page, unsigned int count, int plus)
+ {
+       struct inode            *dir = dentry->d_inode;
+@@ -1164,10 +1616,24 @@ static int nfs4_proc_readdir(struct dent
+       if (status == 0)
+               memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE);
+       unlock_kernel();
+-      return nfs4_map_errors(status);
++      return status;
+ }
+-static int nfs4_proc_mknod(struct inode *dir, struct qstr *name,
++static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
++                  u64 cookie, struct page *page, unsigned int count, int plus)
++{
++      struct nfs4_exception exception = { };
++      int err;
++      do {
++              err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode),
++                              _nfs4_proc_readdir(dentry, cred, cookie,
++                                      page, count, plus),
++                              &exception);
++      } while (exception.retry);
++      return err;
++}
++
++static int _nfs4_proc_mknod(struct inode *dir, struct qstr *name,
+               struct iattr *sattr, dev_t rdev, struct nfs_fh *fh,
+               struct nfs_fattr *fattr)
+ {
+@@ -1214,10 +1680,25 @@ static int nfs4_proc_mknod(struct inode 
+       status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+       if (!status)
+               update_changeattr(dir, &res.dir_cinfo);
+-      return nfs4_map_errors(status);
++      return status;
++}
++
++static int nfs4_proc_mknod(struct inode *dir, struct qstr *name,
++              struct iattr *sattr, dev_t rdev, struct nfs_fh *fh,
++              struct nfs_fattr *fattr)
++{
++      struct nfs4_exception exception = { };
++      int err;
++      do {
++              err = nfs4_handle_exception(NFS_SERVER(dir),
++                              _nfs4_proc_mknod(dir, name, sattr, rdev,
++                                      fh, fattr),
++                              &exception);
++      } while (exception.retry);
++      return err;
+ }
+-static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
++static int _nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
+                struct nfs_fsstat *fsstat)
+ {
+       struct nfs4_statfs_arg args = {
+@@ -1231,10 +1712,22 @@ static int nfs4_proc_statfs(struct nfs_s
+       };
+       fsstat->fattr->valid = 0;
+-      return nfs4_map_errors(rpc_call_sync(server->client, &msg, 0));
++      return rpc_call_sync(server->client, &msg, 0);
+ }
+-static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
++static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsstat *fsstat)
++{
++      struct nfs4_exception exception = { };
++      int err;
++      do {
++              err = nfs4_handle_exception(server,
++                              _nfs4_proc_statfs(server, fhandle, fsstat),
++                              &exception);
++      } while (exception.retry);
++      return err;
++}
++
++static int _nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
+               struct nfs_fsinfo *fsinfo)
+ {
+       struct nfs4_fsinfo_arg args = {
+@@ -1247,16 +1740,29 @@ static int nfs4_do_fsinfo(struct nfs_ser
+               .rpc_resp = fsinfo,
+       };
+-      return nfs4_map_errors(rpc_call_sync(server->client, &msg, 0));
++      return rpc_call_sync(server->client, &msg, 0);
++}
++
++static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo)
++{
++      struct nfs4_exception exception = { };
++      int err;
++
++      do {
++              err = nfs4_handle_exception(server,
++                              _nfs4_do_fsinfo(server, fhandle, fsinfo),
++                              &exception);
++      } while (exception.retry);
++      return err;
+ }
+ static int nfs4_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo)
+ {
+       fsinfo->fattr->valid = 0;
+-      return nfs4_map_errors(nfs4_do_fsinfo(server, fhandle, fsinfo));
++      return nfs4_do_fsinfo(server, fhandle, fsinfo);
+ }
+-static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
++static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
+               struct nfs_pathconf *pathconf)
+ {
+       struct nfs4_pathconf_arg args = {
+@@ -1276,7 +1782,21 @@ static int nfs4_proc_pathconf(struct nfs
+       }
+       pathconf->fattr->valid = 0;
+-      return nfs4_map_errors(rpc_call_sync(server->client, &msg, 0));
++      return rpc_call_sync(server->client, &msg, 0);
++}
++
++static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
++              struct nfs_pathconf *pathconf)
++{
++      struct nfs4_exception exception = { };
++      int err;
++
++      do {
++              err = nfs4_handle_exception(server,
++                              _nfs4_proc_pathconf(server, fhandle, pathconf),
++                              &exception);
++      } while (exception.retry);
++      return err;
+ }
+ static void
+@@ -1467,8 +1987,10 @@ static int
+ nfs4_proc_file_open(struct inode *inode, struct file *filp)
+ {
+       struct dentry *dentry = filp->f_dentry;
+-      struct nfs4_state *state;
++      struct nfs_open_context *ctx;
++      struct nfs4_state *state = NULL;
+       struct rpc_cred *cred;
++      int status = -ENOMEM;
+       dprintk("nfs4_proc_file_open: starting on (%.*s/%.*s)\n",
+                              (int)dentry->d_parent->d_name.len,
+@@ -1478,21 +2000,28 @@ nfs4_proc_file_open(struct inode *inode,
+       /* Find our open stateid */
+       cred = rpcauth_lookupcred(NFS_SERVER(inode)->client->cl_auth, 0);
+-      state = nfs4_find_state(inode, cred, filp->f_mode);
++      if (unlikely(cred == NULL))
++              return -ENOMEM;
++      ctx = alloc_nfs_open_context(dentry, cred);
+       put_rpccred(cred);
+-      if (state == NULL) {
+-              printk(KERN_WARNING "NFS: v4 raced in function %s\n", __FUNCTION__);
+-              return -EIO; /* ERACE actually */
+-      }
++      if (unlikely(ctx == NULL))
++              return -ENOMEM;
++      status = -EIO; /* ERACE actually */
++      state = nfs4_find_state(inode, cred, filp->f_mode);
++      if (unlikely(state == NULL))
++              goto no_state;
++      ctx->state = state;
+       nfs4_close_state(state, filp->f_mode);
+-      if (filp->f_mode & FMODE_WRITE) {
+-              lock_kernel();
+-              nfs_set_mmcred(inode, state->owner->so_cred);
++      ctx->mode = filp->f_mode;
++      nfs_file_set_open_context(filp, ctx);
++      put_nfs_open_context(ctx);
++      if (filp->f_mode & FMODE_WRITE)
+               nfs_begin_data_update(inode);
+-              unlock_kernel();
+-      }
+-      filp->private_data = state;
+       return 0;
++no_state:
++      printk(KERN_WARNING "NFS: v4 raced in function %s\n", __FUNCTION__);
++      put_nfs_open_context(ctx);
++      return status;
+ }
+ /*
+@@ -1501,35 +2030,148 @@ nfs4_proc_file_open(struct inode *inode,
+ static int
+ nfs4_proc_file_release(struct inode *inode, struct file *filp)
+ {
+-      struct nfs4_state *state = (struct nfs4_state *)filp->private_data;
+-
+-      if (state)
+-              nfs4_close_state(state, filp->f_mode);
+-      if (filp->f_mode & FMODE_WRITE) {
+-              lock_kernel();
++      if (filp->f_mode & FMODE_WRITE)
+               nfs_end_data_update(inode);
+-              unlock_kernel();
+-      }
++      nfs_file_clear_open_context(filp);
+       return 0;
+ }
+-/*
+- * Set up the nfspage struct with the right state info and credentials
+- */
++static ssize_t
++nfs4_read_acl_attr(struct inode *inode, char *buf, ssize_t buflen)
++{
++      struct nfs_inode *nfsi = NFS_I(inode);
++      int ret;
++
++      spin_lock(&inode->i_lock);
++      if (buf == NULL && nfsi->acl_len)
++              goto out_len;
++      ret = -ENOENT;
++      if (nfsi->acl_len == 0)
++              goto out;
++      ret = -ERANGE; /* see getxattr(2) man page */
++      if (nfsi->acl_len > buflen)
++              goto out;
++      memcpy(buf, nfsi->acl, nfsi->acl_len);
++out_len:
++      ret = nfsi->acl_len;
++out:
++      spin_unlock(&inode->i_lock);
++      return ret;
++}
++
+ static void
+-nfs4_request_init(struct nfs_page *req, struct file *filp)
++nfs4_set_acl_attr(struct inode *inode, char *buf, ssize_t buflen)
+ {
+-      struct nfs4_state *state;
++      struct nfs_inode *nfsi = NFS_I(inode);
+-      if (!filp) {
+-              req->wb_cred = get_rpccred(NFS_I(req->wb_inode)->mm_cred);
+-              req->wb_state = NULL;
+-              return;
++      spin_lock(&inode->i_lock);
++      kfree(nfsi->acl);
++      nfsi->acl = buf;
++      nfsi->acl_len = buflen;
++      spin_unlock(&inode->i_lock);
++}
++
++static int
++nfs4_write_acl_attr(struct inode *inode, const char *buf, ssize_t buflen)
++{
++      void *abuf = NULL;
++
++      if (buflen > PAGE_SIZE)
++              goto out_nomem;
++      abuf = kmalloc(buflen, GFP_KERNEL);
++      if (abuf == NULL)
++              goto out_nomem;
++      memcpy(abuf, buf, buflen);
++      nfs4_set_acl_attr(inode, abuf, buflen);
++      return 0;
++out_nomem:
++      nfs4_set_acl_attr(inode, NULL, 0);
++      return -ENOMEM;
++}
++
++void
++nfs4_zap_acl_attr(struct inode *inode)
++{
++      nfs4_set_acl_attr(inode, NULL, 0);
++}
++
++static int
++nfs4_server_supports_acls(struct nfs_server *server)
++{
++      return (server->caps & NFS_CAP_ACLS)
++              && (server->acl_bitmask & ACL4_SUPPORT_ALLOW_ACL)
++              && (server->acl_bitmask & ACL4_SUPPORT_DENY_ACL);
++}
++
++ssize_t
++nfs4_proc_get_acl(struct inode *inode, void *buf, ssize_t buflen)
++{
++      struct nfs_server *server = NFS_SERVER(inode);
++      struct nfs_getaclres res = {
++              .acl = buf,
++              .acl_len = buflen,
++              .server = server,
++      };
++      struct rpc_message msg = {
++              .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETACL],
++              .rpc_argp = NFS_FH(inode),
++              .rpc_resp = &res,
++      };
++      int ret;
++
++      if (!nfs4_server_supports_acls(server))
++              return -EOPNOTSUPP;
++      lock_kernel();
++      ret = nfs_revalidate_inode(NFS_SERVER(inode), inode);
++      if (ret < 0)
++              goto out;
++      ret = nfs4_read_acl_attr(inode, buf, buflen);
++      if (ret == -ENOENT) {
++              ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
++              if (ret == 0) {
++                      nfs4_write_acl_attr(inode, res.acl, res.acl_len);
++                      ret = res.acl_len;
++              }
++              if (res.acl != buf) {
++                      /* xdr decode allocated the memory: */
++                      kfree(res.acl);
++              }
+       }
+-      state = (struct nfs4_state *)filp->private_data;
+-      req->wb_state = state;
+-      req->wb_cred = get_rpccred(state->owner->so_cred);
+-      req->wb_lockowner = current->files;
++out:
++      unlock_kernel();
++      return ret;
++}
++
++int
++nfs4_proc_set_acl(struct inode *inode, const void *buf, ssize_t buflen)
++{
++      struct nfs_server *server = NFS_SERVER(inode);
++      struct nfs_setaclargs arg = {
++              .fh             = NFS_FH(inode),
++              .server         = server,
++              .acl            = buf,
++              .acl_len        = buflen,
++      };
++      struct rpc_message msg = {
++              .rpc_proc       = &nfs4_procedures[NFSPROC4_CLNT_SETACL],
++              .rpc_argp       = &arg,
++              .rpc_resp       = NULL,
++      };
++      int ret;
++
++      if (!nfs4_server_supports_acls(server))
++              return -EOPNOTSUPP;
++
++      /* XXX: should check for buflen too large? */
++
++      lock_kernel();
++      ret = rpc_call_sync(NFS_SERVER(inode)->client, &msg, 0);
++      unlock_kernel();
++
++      if (ret == 0)
++              nfs4_write_acl_attr(inode, buf, buflen);
++
++      return ret;
+ }
+ static int
+@@ -1545,11 +2187,13 @@ nfs4_async_handle_error(struct rpc_task 
+               case -NFS4ERR_EXPIRED:
+                       rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL, NULL);
+                       nfs4_schedule_state_recovery(clp);
++                      if (test_bit(NFS4CLNT_OK, &clp->cl_state))
++                              rpc_wake_up_task(task);
+                       task->tk_status = 0;
+                       return -EAGAIN;
+               case -NFS4ERR_GRACE:
+               case -NFS4ERR_DELAY:
+-                      rpc_delay(task, NFS4_POLL_RETRY_TIME);
++                      rpc_delay(task, NFS4_POLL_RETRY_MAX);
+                       task->tk_status = 0;
+                       return -EAGAIN;
+               case -NFS4ERR_OLD_STATEID:
+@@ -1560,12 +2204,11 @@ nfs4_async_handle_error(struct rpc_task 
+       return 0;
+ }
+-int
+-nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs4_client *clp)
++int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs4_client *clp)
+ {
+       DEFINE_WAIT(wait);
+       sigset_t oldset;
+-      int interruptible, res;
++      int interruptible, res = 0;
+       might_sleep();
+@@ -1573,101 +2216,85 @@ nfs4_wait_clnt_recover(struct rpc_clnt *
+       interruptible = TASK_UNINTERRUPTIBLE;
+       if (clnt->cl_intr)
+               interruptible = TASK_INTERRUPTIBLE;
+-      do {
+-              res = 0;
+-              prepare_to_wait(&clp->cl_waitq, &wait, interruptible);
+-              nfs4_schedule_state_recovery(clp);
+-              if (test_bit(NFS4CLNT_OK, &clp->cl_state) &&
+-                              !test_bit(NFS4CLNT_SETUP_STATE, &clp->cl_state))
+-                      break;
+-              if (clnt->cl_intr && signalled()) {
+-                      res = -ERESTARTSYS;
+-                      break;
+-              }
++      prepare_to_wait(&clp->cl_waitq, &wait, interruptible);
++      nfs4_schedule_state_recovery(clp);
++      if (clnt->cl_intr && signalled())
++              res = -ERESTARTSYS;
++      else if (!test_bit(NFS4CLNT_OK, &clp->cl_state))
+               schedule();
+-      } while(!test_bit(NFS4CLNT_OK, &clp->cl_state));
+       finish_wait(&clp->cl_waitq, &wait);
+       rpc_clnt_sigunmask(clnt, &oldset);
+       return res;
+ }
+-static int
+-nfs4_delay(struct rpc_clnt *clnt)
++static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
+ {
+       sigset_t oldset;
+       int res = 0;
+       might_sleep();
++      if (*timeout <= 0)
++              *timeout = NFS4_POLL_RETRY_MIN;
++      if (*timeout > NFS4_POLL_RETRY_MAX)
++              *timeout = NFS4_POLL_RETRY_MAX;
+       rpc_clnt_sigmask(clnt, &oldset);
+       if (clnt->cl_intr) {
+               set_current_state(TASK_INTERRUPTIBLE);
+-              schedule_timeout(NFS4_POLL_RETRY_TIME);
++              schedule_timeout(*timeout);
+               if (signalled())
+                       res = -ERESTARTSYS;
+       } else {
+               set_current_state(TASK_UNINTERRUPTIBLE);
+-              schedule_timeout(NFS4_POLL_RETRY_TIME);
++              schedule_timeout(*timeout);
+       }
+       rpc_clnt_sigunmask(clnt, &oldset);
++      *timeout <<= 1;
+       return res;
+ }
+ /* This is the error handling routine for processes that are allowed
+  * to sleep.
+  */
+-int
+-nfs4_handle_error(struct nfs_server *server, int errorcode)
++int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
+ {
+       struct nfs4_client *clp = server->nfs4_state;
+       int ret = errorcode;
++      exception->retry = 0;
+       switch(errorcode) {
++              case 0:
++                      return 0;
+               case -NFS4ERR_STALE_CLIENTID:
+               case -NFS4ERR_STALE_STATEID:
+               case -NFS4ERR_EXPIRED:
+                       ret = nfs4_wait_clnt_recover(server->client, clp);
++                      if (ret == 0)
++                              exception->retry = 1;
+                       break;
+               case -NFS4ERR_GRACE:
+               case -NFS4ERR_DELAY:
+-                      ret = nfs4_delay(server->client);
++                      ret = nfs4_delay(server->client, &exception->timeout);
++                      if (ret == 0)
++                              exception->retry = 1;
+                       break;
+               case -NFS4ERR_OLD_STATEID:
+-                      ret = 0;
++                      if (ret == 0)
++                              exception->retry = 1;
+       }
+       /* We failed to handle the error */
+       return nfs4_map_errors(ret);
+ }
+-
+-static int
+-nfs4_request_compatible(struct nfs_page *req, struct file *filp, struct page *page)
+-{
+-      struct nfs4_state *state = NULL;
+-      struct rpc_cred *cred = NULL;
+-
+-      if (req->wb_file != filp)
+-              return 0;
+-      if (req->wb_page != page)
+-              return 0;
+-      state = (struct nfs4_state *)filp->private_data;
+-      if (req->wb_state != state)
+-              return 0;
+-      if (req->wb_lockowner != current->files)
+-              return 0;
+-      cred = state->owner->so_cred;
+-      if (req->wb_cred != cred)
+-              return 0;
+-      return 1;
+-}
+-
+-int
+-nfs4_proc_setclientid(struct nfs4_client *clp,
+-              u32 program, unsigned short port)
++int nfs4_proc_setclientid(struct nfs4_client *clp, u32 program, unsigned short port)
+ {
+-      u32 *p;
+-      struct nfs4_setclientid setclientid;
+-      struct timespec tv;
++      static nfs4_verifier sc_verifier;
++      static int initialized;
++      
++      struct nfs4_setclientid setclientid = {
++              .sc_verifier = &sc_verifier,
++              .sc_prog = program,
++      };
+       struct rpc_message msg = {
+               .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID],
+               .rpc_argp = &setclientid,
+@@ -1675,15 +2302,24 @@ nfs4_proc_setclientid(struct nfs4_client
+               .rpc_cred = clp->cl_cred,
+       };
+-      tv = CURRENT_TIME;
+-      p = (u32*)setclientid.sc_verifier.data;
+-      *p++ = (u32)tv.tv_sec;
+-      *p = (u32)tv.tv_nsec;
+-      setclientid.sc_name = clp->cl_ipaddr;
+-      sprintf(setclientid.sc_netid, "tcp");
+-      sprintf(setclientid.sc_uaddr, "%s.%d.%d", clp->cl_ipaddr, port >> 8, port & 255);
+-      setclientid.sc_prog = htonl(program);
+-      setclientid.sc_cb_ident = 0;
++      if (!initialized) {
++              struct timespec boot_time;
++              u32 *p;
++
++              initialized = 1;
++              boot_time = CURRENT_TIME;
++              p = (u32*)sc_verifier.data;
++              *p++ = htonl((u32)boot_time.tv_sec);
++              *p = htonl((u32)boot_time.tv_nsec);
++      }
++      setclientid.sc_name_len = scnprintf(setclientid.sc_name,
++                      sizeof(setclientid.sc_name), "%s/%u.%u.%u.%u",
++                      clp->cl_ipaddr, NIPQUAD(clp->cl_addr.s_addr));
++      setclientid.sc_netid_len = scnprintf(setclientid.sc_netid,
++                      sizeof(setclientid.sc_netid), "tcp");
++      setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr,
++                      sizeof(setclientid.sc_uaddr), "%s.%d.%d",
++                      clp->cl_ipaddr, port >> 8, port & 255);
+       return rpc_call_sync(clp->cl_rpcclient, &msg, 0);
+ }
+@@ -1712,6 +2348,40 @@ nfs4_proc_setclientid_confirm(struct nfs
+       return status;
+ }
++static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid)
++{
++      struct nfs4_delegreturnargs args = {
++              .fhandle = NFS_FH(inode),
++              .stateid = stateid,
++      };
++      struct rpc_message msg = {
++              .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DELEGRETURN],
++              .rpc_argp = &args,
++              .rpc_cred = cred,
++      };
++
++      return rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
++}
++
++int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid)
++{
++      struct nfs_server *server = NFS_SERVER(inode);
++      struct nfs4_exception exception = { };
++      int err;
++      do {
++              err = _nfs4_proc_delegreturn(inode, cred, stateid);
++              switch (err) {
++                      case -NFS4ERR_STALE_STATEID:
++                      case -NFS4ERR_EXPIRED:
++                              nfs4_schedule_state_recovery(server->nfs4_state);
++                      case 0:
++                              return 0;
++              }
++              err = nfs4_handle_exception(server, err, &exception);
++      } while (exception.retry);
++      return err;
++}
++
+ #define NFS4_LOCK_MINTIMEOUT (1 * HZ)
+ #define NFS4_LOCK_MAXTIMEOUT (30 * HZ)
+@@ -1753,8 +2423,7 @@ nfs4_lck_length(struct file_lock *reques
+       return request->fl_end - request->fl_start + 1;
+ }
+-int
+-nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request)
++static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request)
+ {
+       struct inode *inode = state->inode;
+       struct nfs_server *server = NFS_SERVER(inode);
+@@ -1778,9 +2447,10 @@ nfs4_proc_getlk(struct nfs4_state *state
+       struct nfs4_lock_state *lsp;
+       int status;
++      down_read(&clp->cl_sem);
+       nlo.clientid = clp->cl_clientid;
+       down(&state->lock_sema);
+-      lsp = nfs4_find_lock_state(state, request->fl_owner);
++      lsp = nfs4_find_lock_state(state, request->fl_pid);
+       if (lsp)
+               nlo.id = lsp->ls_id; 
+       else {
+@@ -1811,14 +2481,28 @@ nfs4_proc_getlk(struct nfs4_state *state
+       if (lsp)
+               nfs4_put_lock_state(lsp);
+       up(&state->lock_sema);
+-      return nfs4_map_errors(status);
++      up_read(&clp->cl_sem);
++      return status;
+ }
+-int
+-nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request)
++static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request)
++{
++      struct nfs4_exception exception = { };
++      int err;
++
++      do {
++              err = nfs4_handle_exception(NFS_SERVER(state->inode),
++                              _nfs4_proc_getlk(state, cmd, request),
++                              &exception);
++      } while (exception.retry);
++      return err;
++}
++
++static int _nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request)
+ {
+       struct inode *inode = state->inode;
+       struct nfs_server *server = NFS_SERVER(inode);
++      struct nfs4_client *clp = server->nfs4_state;
+       struct nfs_lockargs arg = {
+               .fh = NFS_FH(inode),
+               .type = nfs4_lck_type(cmd, request),
+@@ -1838,29 +2522,46 @@ nfs4_proc_unlck(struct nfs4_state *state
+       struct nfs_locku_opargs luargs;
+       int status = 0;
+                       
++      down_read(&clp->cl_sem);
+       down(&state->lock_sema);
+-      lsp = nfs4_find_lock_state(state, request->fl_owner);
++      lsp = nfs4_find_lock_state(state, request->fl_pid);
+       if (!lsp)
+               goto out;
+-      luargs.seqid = lsp->ls_seqid;
+-      memcpy(&luargs.stateid, &lsp->ls_stateid, sizeof(luargs.stateid));
+-      arg.u.locku = &luargs;
+-      status = rpc_call_sync(server->client, &msg, 0);
+-      nfs4_increment_lock_seqid(status, lsp);
++      /* We might have lost the locks! */
++      if ((lsp->flags & NFS_LOCK_INITIALIZED) != 0) {
++              luargs.seqid = lsp->ls_seqid;
++              memcpy(&luargs.stateid, &lsp->ls_stateid, sizeof(luargs.stateid));
++              arg.u.locku = &luargs;
++              status = rpc_call_sync(server->client, &msg, 0);
++              nfs4_increment_lock_seqid(status, lsp);
++      }
+       if (status == 0) {
+               memcpy(&lsp->ls_stateid,  &res.u.stateid, 
+                               sizeof(lsp->ls_stateid));
+-              nfs4_notify_unlck(inode, request, lsp);
++              nfs4_notify_unlck(state, request, lsp);
+       }
+       nfs4_put_lock_state(lsp);
+ out:
+       up(&state->lock_sema);
+-      return nfs4_map_errors(status);
++      up_read(&clp->cl_sem);
++      return status;
+ }
+-static int
+-nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
++static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request)
++{
++      struct nfs4_exception exception = { };
++      int err;
++
++      do {
++              err = nfs4_handle_exception(NFS_SERVER(state->inode),
++                              _nfs4_proc_unlck(state, cmd, request),
++                              &exception);
++      } while (exception.retry);
++      return err;
++}
++
++static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *request, int reclaim)
+ {
+       struct inode *inode = state->inode;
+       struct nfs_server *server = NFS_SERVER(inode);
+@@ -1881,23 +2582,22 @@ nfs4_proc_setlk(struct nfs4_state *state
+               .rpc_cred       = state->owner->so_cred,
+       };
+       struct nfs_lock_opargs largs = {
++              .reclaim = reclaim,
+               .new_lock_owner = 0,
+       };
+       int status;
+-      down(&state->lock_sema);
+-      lsp = nfs4_find_lock_state(state, request->fl_owner);
+-      if (lsp == NULL) {
++      lsp = nfs4_get_lock_state(state, request->fl_pid);
++      if (lsp == NULL)
++              return -ENOMEM;
++      if (!(lsp->flags & NFS_LOCK_INITIALIZED)) {
+               struct nfs4_state_owner *owner = state->owner;
+               struct nfs_open_to_lock otl = {
+                       .lock_owner = {
+                               .clientid = server->nfs4_state->cl_clientid,
+                       },
+               };
+-              status = -ENOMEM;
+-              lsp = nfs4_alloc_lock_state(state, request->fl_owner);
+-              if (!lsp)
+-                      goto out;
++
+               otl.lock_seqid = lsp->ls_seqid;
+               otl.lock_owner.id = lsp->ls_id;
+               memcpy(&otl.open_stateid, &state->stateid, sizeof(otl.open_stateid));
+@@ -1926,25 +2626,60 @@ nfs4_proc_setlk(struct nfs4_state *state
+       /* save the returned stateid. */
+       if (status == 0) {
+               memcpy(&lsp->ls_stateid, &res.u.stateid, sizeof(nfs4_stateid));
+-              nfs4_notify_setlk(inode, request, lsp);
++              if (!reclaim)
++                      nfs4_notify_setlk(state, request, lsp);
+       } else if (status == -NFS4ERR_DENIED)
+               status = -EAGAIN;
+       nfs4_put_lock_state(lsp);
+-out:
++      return status;
++}
++
++int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request)
++{
++#ifdef F_SETLK64
++      return _nfs4_do_setlk(state, F_SETLK64, request, 1);
++#else
++      return _nfs4_do_setlk(state, F_SETLK, request, 1);
++#endif
++}
++
++static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
++{
++      struct nfs4_client *clp = state->owner->so_client;
++      int status;
++
++      down_read(&clp->cl_sem);
++      down(&state->lock_sema);
++      status = _nfs4_do_setlk(state, cmd, request, 0);
+       up(&state->lock_sema);
+-      return nfs4_map_errors(status);
++      up_read(&clp->cl_sem);
++      return status;
++}
++
++static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
++{
++      struct nfs4_exception exception = { };
++      int err;
++
++      do {
++              err = nfs4_handle_exception(NFS_SERVER(state->inode),
++                              _nfs4_proc_setlk(state, cmd, request),
++                              &exception);
++      } while (exception.retry);
++      return err;
+ }
+ static int
+ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
+ {
++      struct nfs_open_context *ctx;
+       struct nfs4_state *state;
+       unsigned long timeout = NFS4_LOCK_MINTIMEOUT;
+       int status;
+       /* verify open state */
+-      state = (struct nfs4_state *)filp->private_data;
+-      BUG_ON(!state);
++      ctx = (struct nfs_open_context *)filp->private_data;
++      state = ctx->state;
+       if (request->fl_start < 0 || request->fl_end < 0)
+               return -EINVAL;
+@@ -1975,6 +2710,7 @@ struct nfs_rpc_ops       nfs_v4_clientops = {
+       .version        = 4,                    /* protocol version */
+       .dentry_ops     = &nfs4_dentry_operations,
+       .dir_inode_ops  = &nfs4_dir_inode_operations,
++      .file_inode_ops = &nfs4_file_inode_operations,
+       .getroot        = nfs4_proc_get_root,
+       .getattr        = nfs4_proc_getattr,
+       .setattr        = nfs4_proc_setattr,
+@@ -2004,8 +2740,6 @@ struct nfs_rpc_ops       nfs_v4_clientops = {
+       .commit_setup   = nfs4_proc_commit_setup,
+       .file_open      = nfs4_proc_file_open,
+       .file_release   = nfs4_proc_file_release,
+-      .request_init   = nfs4_request_init,
+-      .request_compatible = nfs4_request_compatible,
+       .lock           = nfs4_proc_lock,
+ };
+--- linux-2.6.7/fs/nfs/callback.h.lsec 2005-03-23 14:28:22.484631512 -0700
++++ linux-2.6.7/fs/nfs/callback.h      2005-03-23 14:28:22.484631512 -0700
+@@ -0,0 +1,70 @@
++/*
++ * linux/fs/nfs/callback.h
++ *
++ * Copyright (C) 2004 Trond Myklebust
++ *
++ * NFSv4 callback definitions
++ */
++#ifndef __LINUX_FS_NFS_CALLBACK_H
++#define __LINUX_FS_NFS_CALLBACK_H
++
++#define NFS4_CALLBACK 0x40000000
++#define NFS4_CALLBACK_XDRSIZE 2048
++#define NFS4_CALLBACK_BUFSIZE (1024 + NFS4_CALLBACK_XDRSIZE)
++
++enum nfs4_callback_procnum {
++      CB_NULL = 0,
++      CB_COMPOUND = 1,
++};
++
++enum nfs4_callback_opnum {
++      OP_CB_GETATTR = 3,
++      OP_CB_RECALL  = 4,
++      OP_CB_ILLEGAL = 10044,
++};
++
++struct cb_compound_hdr_arg {
++      int taglen;
++      const char *tag;
++      unsigned int callback_ident;
++      unsigned nops;
++};
++
++struct cb_compound_hdr_res {
++      uint32_t *status;
++      int taglen;
++      const char *tag;
++      uint32_t *nops;
++};
++
++struct cb_getattrargs {
++      struct sockaddr_in *addr;
++      struct nfs_fh fh;
++      uint32_t bitmap[2];
++};
++
++struct cb_getattrres {
++      uint32_t status;
++      uint32_t bitmap[2];
++      uint64_t size;
++      uint64_t change_attr;
++      struct timespec ctime;
++      struct timespec mtime;
++};
++
++struct cb_recallargs {
++      struct sockaddr_in *addr;
++      struct nfs_fh fh;
++      nfs4_stateid stateid;
++      uint32_t truncate;
++};
++
++extern unsigned nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res);
++extern unsigned nfs4_callback_recall(struct cb_recallargs *args, void *dummy);
++
++extern int nfs_callback_up(void);
++extern int nfs_callback_down(void);
++
++extern unsigned short nfs_callback_tcpport;
++
++#endif /* __LINUX_FS_NFS_CALLBACK_H */
+--- linux-2.6.7/fs/nfs/direct.c.lsec   2004-06-15 23:19:53.000000000 -0600
++++ linux-2.6.7/fs/nfs/direct.c        2005-03-23 14:28:22.702598376 -0700
+@@ -110,7 +110,7 @@ nfs_free_user_pages(struct page **pages,
+  * nfs_direct_read_seg - Read in one iov segment.  Generate separate
+  *                        read RPCs for each "rsize" bytes.
+  * @inode: target inode
+- * @file: target file (may be NULL)
++ * @ctx: target file open context
+  * user_addr: starting address of this segment of user's buffer
+  * count: size of this segment
+  * file_offset: offset in file to begin the operation
+@@ -118,7 +118,7 @@ nfs_free_user_pages(struct page **pages,
+  * nr_pages: size of pages array
+  */
+ static int
+-nfs_direct_read_seg(struct inode *inode, struct file *file,
++nfs_direct_read_seg(struct inode *inode, struct nfs_open_context *ctx,
+               unsigned long user_addr, size_t count, loff_t file_offset,
+               struct page **pages, int nr_pages)
+ {
+@@ -127,9 +127,10 @@ nfs_direct_read_seg(struct inode *inode,
+       int curpage = 0;
+       struct nfs_read_data    rdata = {
+               .inode          = inode,
++              .cred           = ctx->cred,
+               .args           = {
+                       .fh             = NFS_FH(inode),
+-                      .lockowner      = current->files,
++                      .context        = ctx,
+               },
+               .res            = {
+                       .fattr          = &rdata.fattr,
+@@ -151,7 +152,7 @@ nfs_direct_read_seg(struct inode *inode,
+                       user_addr + tot_bytes, rdata.args.pgbase, curpage);
+               lock_kernel();
+-              result = NFS_PROTO(inode)->read(&rdata, file);
++              result = NFS_PROTO(inode)->read(&rdata);
+               unlock_kernel();
+               if (result <= 0) {
+@@ -183,7 +184,7 @@ nfs_direct_read_seg(struct inode *inode,
+  * nfs_direct_read - For each iov segment, map the user's buffer
+  *                   then generate read RPCs.
+  * @inode: target inode
+- * @file: target file (may be NULL)
++ * @ctx: target file open context
+  * @iov: array of vectors that define I/O buffer
+  * file_offset: offset in file to begin the operation
+  * nr_segs: size of iovec array
+@@ -193,7 +194,7 @@ nfs_direct_read_seg(struct inode *inode,
+  * server.
+  */
+ static ssize_t
+-nfs_direct_read(struct inode *inode, struct file *file,
++nfs_direct_read(struct inode *inode, struct nfs_open_context *ctx,
+               const struct iovec *iov, loff_t file_offset,
+               unsigned long nr_segs)
+ {
+@@ -216,7 +217,7 @@ nfs_direct_read(struct inode *inode, str
+                         return page_count;
+                 }
+-              result = nfs_direct_read_seg(inode, file, user_addr, size,
++              result = nfs_direct_read_seg(inode, ctx, user_addr, size,
+                               file_offset, pages, page_count);
+               nfs_free_user_pages(pages, page_count, 1);
+@@ -239,7 +240,7 @@ nfs_direct_read(struct inode *inode, str
+  * nfs_direct_write_seg - Write out one iov segment.  Generate separate
+  *                        write RPCs for each "wsize" bytes, then commit.
+  * @inode: target inode
+- * @file: target file (may be NULL)
++ * @ctx: target file open context
+  * user_addr: starting address of this segment of user's buffer
+  * count: size of this segment
+  * file_offset: offset in file to begin the operation
+@@ -247,7 +248,7 @@ nfs_direct_read(struct inode *inode, str
+  * nr_pages: size of pages array
+  */
+ static int
+-nfs_direct_write_seg(struct inode *inode, struct file *file,
++nfs_direct_write_seg(struct inode *inode, struct nfs_open_context *ctx,
+               unsigned long user_addr, size_t count, loff_t file_offset,
+               struct page **pages, int nr_pages)
+ {
+@@ -257,9 +258,10 @@ nfs_direct_write_seg(struct inode *inode
+       struct nfs_writeverf first_verf;
+       struct nfs_write_data   wdata = {
+               .inode          = inode,
++              .cred           = ctx->cred,
+               .args           = {
+                       .fh             = NFS_FH(inode),
+-                      .lockowner      = current->files,
++                      .context        = ctx,
+               },
+               .res            = {
+                       .fattr          = &wdata.fattr,
+@@ -290,7 +292,7 @@ retry:
+                       user_addr + tot_bytes, wdata.args.pgbase, curpage);
+               lock_kernel();
+-              result = NFS_PROTO(inode)->write(&wdata, file);
++              result = NFS_PROTO(inode)->write(&wdata);
+               unlock_kernel();
+               if (result <= 0) {
+@@ -325,7 +327,7 @@ retry:
+               wdata.args.offset = file_offset;
+               lock_kernel();
+-              result = NFS_PROTO(inode)->commit(&wdata, file);
++              result = NFS_PROTO(inode)->commit(&wdata);
+               unlock_kernel();
+               if (result < 0 || memcmp(&first_verf.verifier,
+@@ -349,7 +351,7 @@ sync_retry:
+  * nfs_direct_write - For each iov segment, map the user's buffer
+  *                    then generate write and commit RPCs.
+  * @inode: target inode
+- * @file: target file (may be NULL)
++ * @ctx: target file open context
+  * @iov: array of vectors that define I/O buffer
+  * file_offset: offset in file to begin the operation
+  * nr_segs: size of iovec array
+@@ -358,8 +360,7 @@ sync_retry:
+  * that non-direct readers might access, so they will pick up these
+  * writes immediately.
+  */
+-static ssize_t
+-nfs_direct_write(struct inode *inode, struct file *file,
++static int nfs_direct_write(struct inode *inode, struct nfs_open_context *ctx,
+               const struct iovec *iov, loff_t file_offset,
+               unsigned long nr_segs)
+ {
+@@ -382,7 +383,7 @@ nfs_direct_write(struct inode *inode, st
+                         return page_count;
+                 }
+-              result = nfs_direct_write_seg(inode, file, user_addr, size,
++              result = nfs_direct_write_seg(inode, ctx, user_addr, size,
+                               file_offset, pages, page_count);
+               nfs_free_user_pages(pages, page_count, 0);
+@@ -414,6 +415,7 @@ nfs_direct_IO(int rw, struct kiocb *iocb
+ {
+       ssize_t result = -EINVAL;
+       struct file *file = iocb->ki_filp;
++      struct nfs_open_context *ctx;
+       struct dentry *dentry = file->f_dentry;
+       struct inode *inode = dentry->d_inode;
+@@ -423,19 +425,20 @@ nfs_direct_IO(int rw, struct kiocb *iocb
+       if (!is_sync_kiocb(iocb))
+               return result;
++      ctx = (struct nfs_open_context *)file->private_data;
+       switch (rw) {
+       case READ:
+               dprintk("NFS: direct_IO(read) (%s) off/no(%Lu/%lu)\n",
+                               dentry->d_name.name, file_offset, nr_segs);
+-              result = nfs_direct_read(inode, file, iov,
++              result = nfs_direct_read(inode, ctx, iov,
+                                               file_offset, nr_segs);
+               break;
+       case WRITE:
+               dprintk("NFS: direct_IO(write) (%s) off/no(%Lu/%lu)\n",
+                               dentry->d_name.name, file_offset, nr_segs);
+-              result = nfs_direct_write(inode, file, iov,
++              result = nfs_direct_write(inode, ctx, iov,
+                                               file_offset, nr_segs);
+               break;
+       default:
+@@ -471,6 +474,8 @@ nfs_file_direct_read(struct kiocb *iocb,
+       ssize_t retval = -EINVAL;
+       loff_t *ppos = &iocb->ki_pos;
+       struct file *file = iocb->ki_filp;
++      struct nfs_open_context *ctx =
++                      (struct nfs_open_context *) file->private_data;
+       struct dentry *dentry = file->f_dentry;
+       struct address_space *mapping = file->f_mapping;
+       struct inode *inode = mapping->host;
+@@ -502,7 +507,7 @@ nfs_file_direct_read(struct kiocb *iocb,
+                       goto out;
+       }
+-      retval = nfs_direct_read(inode, file, &iov, pos, 1);
++      retval = nfs_direct_read(inode, ctx, &iov, pos, 1);
+       if (retval > 0)
+               *ppos = pos + retval;
+@@ -542,6 +547,8 @@ nfs_file_direct_write(struct kiocb *iocb
+       loff_t *ppos = &iocb->ki_pos;
+       unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
+       struct file *file = iocb->ki_filp;
++      struct nfs_open_context *ctx =
++                      (struct nfs_open_context *) file->private_data;
+       struct dentry *dentry = file->f_dentry;
+       struct address_space *mapping = file->f_mapping;
+       struct inode *inode = mapping->host;
+@@ -589,7 +596,7 @@ nfs_file_direct_write(struct kiocb *iocb
+                       goto out;
+       }
+-      retval = nfs_direct_write(inode, file, &iov, pos, 1);
++      retval = nfs_direct_write(inode, ctx, &iov, pos, 1);
+       if (mapping->nrpages)
+               invalidate_inode_pages2(mapping);
+       if (retval > 0)
+--- linux-2.6.7/fs/nfs/nfs4state.c.lsec        2004-06-15 23:18:47.000000000 -0600
++++ linux-2.6.7/fs/nfs/nfs4state.c     2005-03-23 14:28:22.939562352 -0700
+@@ -40,11 +40,15 @@
+ #include <linux/config.h>
+ #include <linux/slab.h>
++#include <linux/smp_lock.h>
+ #include <linux/nfs_fs.h>
+ #include <linux/nfs_idmap.h>
+ #include <linux/workqueue.h>
+ #include <linux/bitops.h>
++#include "callback.h"
++#include "delegation.h"
++
+ #define OPENOWNER_POOL_SIZE   8
+ static spinlock_t             state_spinlock = SPIN_LOCK_UNLOCKED;
+@@ -93,21 +97,26 @@ nfs4_alloc_client(struct in_addr *addr)
+ {
+       struct nfs4_client *clp;
+-      if ((clp = kmalloc(sizeof(*clp), GFP_KERNEL))) {
+-              memset(clp, 0, sizeof(*clp));
+-              memcpy(&clp->cl_addr, addr, sizeof(clp->cl_addr));
+-              init_rwsem(&clp->cl_sem);
+-              INIT_LIST_HEAD(&clp->cl_state_owners);
+-              INIT_LIST_HEAD(&clp->cl_unused);
+-              spin_lock_init(&clp->cl_lock);
+-              atomic_set(&clp->cl_count, 1);
+-              INIT_WORK(&clp->cl_recoverd, nfs4_recover_state, clp);
+-              INIT_WORK(&clp->cl_renewd, nfs4_renew_state, clp);
+-              INIT_LIST_HEAD(&clp->cl_superblocks);
+-              init_waitqueue_head(&clp->cl_waitq);
+-              rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS4 client");
+-              clp->cl_state = 1 << NFS4CLNT_NEW;
++      if (nfs_callback_up() < 0)
++              return NULL;
++      if ((clp = kmalloc(sizeof(*clp), GFP_KERNEL)) == NULL) {
++              nfs_callback_down();
++              return NULL;
+       }
++      memset(clp, 0, sizeof(*clp));
++      memcpy(&clp->cl_addr, addr, sizeof(clp->cl_addr));
++      init_rwsem(&clp->cl_sem);
++      INIT_LIST_HEAD(&clp->cl_delegations);
++      INIT_LIST_HEAD(&clp->cl_state_owners);
++      INIT_LIST_HEAD(&clp->cl_unused);
++      spin_lock_init(&clp->cl_lock);
++      atomic_set(&clp->cl_count, 1);
++      INIT_WORK(&clp->cl_recoverd, nfs4_recover_state, clp);
++      INIT_WORK(&clp->cl_renewd, nfs4_renew_state, clp);
++      INIT_LIST_HEAD(&clp->cl_superblocks);
++      init_waitqueue_head(&clp->cl_waitq);
++      rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS4 client");
++      clp->cl_state = 1 << NFS4CLNT_OK;
+       return clp;
+ }
+@@ -130,25 +139,52 @@ nfs4_free_client(struct nfs4_client *clp
+       if (clp->cl_rpcclient)
+               rpc_shutdown_client(clp->cl_rpcclient);
+       kfree(clp);
++      nfs_callback_down();
++}
++
++static struct nfs4_client *__nfs4_find_client(struct in_addr *addr)
++{
++      struct nfs4_client *clp;
++      list_for_each_entry(clp, &nfs4_clientid_list, cl_servers) {
++              if (memcmp(&clp->cl_addr, addr, sizeof(clp->cl_addr)) == 0) {
++                      atomic_inc(&clp->cl_count);
++                      return clp;
++              }
++      }
++      return NULL;
++}
++
++struct nfs4_client *nfs4_find_client(struct in_addr *addr)
++{
++      struct nfs4_client *clp;
++      spin_lock(&state_spinlock);
++      clp = __nfs4_find_client(addr);
++      spin_unlock(&state_spinlock);
++      return clp;
+ }
+ struct nfs4_client *
+ nfs4_get_client(struct in_addr *addr)
+ {
+-      struct nfs4_client *new, *clp = NULL;
++      struct nfs4_client *clp, *new = NULL;
+-      new = nfs4_alloc_client(addr);
+       spin_lock(&state_spinlock);
+-      list_for_each_entry(clp, &nfs4_clientid_list, cl_servers) {
+-              if (memcmp(&clp->cl_addr, addr, sizeof(clp->cl_addr)) == 0)
+-                      goto found;
++      for (;;) {
++              clp = __nfs4_find_client(addr);
++              if (clp != NULL)
++                      break;
++              clp = new;
++              if (clp != NULL) {
++                      list_add(&clp->cl_servers, &nfs4_clientid_list);
++                      new = NULL;
++                      break;
++              }
++              spin_unlock(&state_spinlock);
++              new = nfs4_alloc_client(addr);
++              spin_lock(&state_spinlock);
++              if (new == NULL)
++                      break;
+       }
+-      if (new)
+-              list_add(&new->cl_servers, &nfs4_clientid_list);
+-      spin_unlock(&state_spinlock);
+-      return new;
+-found:
+-      atomic_inc(&clp->cl_count);
+       spin_unlock(&state_spinlock);
+       if (new)
+               nfs4_free_client(new);
+@@ -169,6 +205,16 @@ nfs4_put_client(struct nfs4_client *clp)
+       nfs4_free_client(clp);
+ }
++int nfs4_init_client(struct nfs4_client *clp)
++{
++      int status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, nfs_callback_tcpport);
++      if (status == 0)
++              status = nfs4_proc_setclientid_confirm(clp);
++      if (status == 0)
++              nfs4_schedule_state_renewal(clp);
++      return status;
++}
++
+ u32
+ nfs4_alloc_lockowner_id(struct nfs4_client *clp)
+ {
+@@ -185,7 +231,6 @@ nfs4_client_grab_unused(struct nfs4_clie
+               atomic_inc(&sp->so_count);
+               sp->so_cred = cred;
+               list_move(&sp->so_list, &clp->cl_state_owners);
+-              sp->so_generation = clp->cl_generation;
+               clp->cl_nunused--;
+       }
+       return sp;
+@@ -224,6 +269,7 @@ nfs4_alloc_state_owner(void)
+       init_MUTEX(&sp->so_sema);
+       sp->so_seqid = 0;                 /* arbitrary */
+       INIT_LIST_HEAD(&sp->so_states);
++      INIT_LIST_HEAD(&sp->so_delegations);
+       atomic_set(&sp->so_count, 1);
+       return sp;
+ }
+@@ -237,8 +283,11 @@ nfs4_unhash_state_owner(struct nfs4_stat
+       spin_unlock(&clp->cl_lock);
+ }
+-struct nfs4_state_owner *
+-nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred)
++/*
++ * Note: must be called with clp->cl_sem held in order to prevent races
++ *       with reboot recovery!
++ */
++struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred)
+ {
+       struct nfs4_client *clp = server->nfs4_state;
+       struct nfs4_state_owner *sp, *new;
+@@ -254,23 +303,23 @@ nfs4_get_state_owner(struct nfs_server *
+               new->so_client = clp;
+               new->so_id = nfs4_alloc_lockowner_id(clp);
+               new->so_cred = cred;
+-              new->so_generation = clp->cl_generation;
+               sp = new;
+               new = NULL;
+       }
+       spin_unlock(&clp->cl_lock);
+       if (new)
+               kfree(new);
+-      if (sp) {
+-              if (!test_bit(NFS4CLNT_OK, &clp->cl_state))
+-                      nfs4_wait_clnt_recover(server->client, clp);
+-      } else
+-              put_rpccred(cred);
+-      return sp;
++      if (sp != NULL)
++              return sp;
++      put_rpccred(cred);
++      return NULL;
+ }
+-void
+-nfs4_put_state_owner(struct nfs4_state_owner *sp)
++/*
++ * Must be called with clp->cl_sem held in order to avoid races
++ * with state recovery...
++ */
++void nfs4_put_state_owner(struct nfs4_state_owner *sp)
+ {
+       struct nfs4_client *clp = sp->so_client;
+       struct rpc_cred *cred = sp->so_cred;
+@@ -330,8 +379,6 @@ __nfs4_find_state(struct inode *inode, s
+                       continue;
+               if ((state->state & mode) != mode)
+                       continue;
+-              /* Add the state to the head of the inode's list */
+-              list_move(&state->inode_states, &nfsi->open_states);
+               atomic_inc(&state->count);
+               if (mode & FMODE_READ)
+                       state->nreaders++;
+@@ -353,8 +400,6 @@ __nfs4_find_state_byowner(struct inode *
+               if (state->nreaders == 0 && state->nwriters == 0)
+                       continue;
+               if (state->owner == owner) {
+-                      /* Add the state to the head of the inode's list */
+-                      list_move(&state->inode_states, &nfsi->open_states);
+                       atomic_inc(&state->count);
+                       return state;
+               }
+@@ -411,51 +456,40 @@ out:
+       return state;
+ }
+-static void
+-__nfs4_put_open_state(struct nfs4_state *state)
++/*
++ * Beware! Caller must be holding exactly one
++ * reference to clp->cl_sem and owner->so_sema!
++ */
++void nfs4_put_open_state(struct nfs4_state *state)
+ {
+       struct inode *inode = state->inode;
+       struct nfs4_state_owner *owner = state->owner;
+-      int status = 0;
+-      if (!atomic_dec_and_lock(&state->count, &inode->i_lock)) {
+-              up(&owner->so_sema);
++      if (!atomic_dec_and_lock(&state->count, &inode->i_lock))
+               return;
+-      }
+       if (!list_empty(&state->inode_states))
+               list_del(&state->inode_states);
+       spin_unlock(&inode->i_lock);
+       list_del(&state->open_states);
+-      if (state->state != 0) {
+-              do {
+-                      status = nfs4_do_close(inode, state);
+-                      if (!status)
+-                              break;
+-                      up(&owner->so_sema);
+-                      status = nfs4_handle_error(NFS_SERVER(inode), status);
+-                      down(&owner->so_sema);
+-              } while (!status);
+-      }
+-      up(&owner->so_sema);
++      BUG_ON (state->state != 0);
+       nfs4_free_open_state(state);
+       nfs4_put_state_owner(owner);
+ }
+-void
+-nfs4_put_open_state(struct nfs4_state *state)
+-{
+-      down(&state->owner->so_sema);
+-      __nfs4_put_open_state(state);
+-}
+-
+-void
+-nfs4_close_state(struct nfs4_state *state, mode_t mode)
++/*
++ * Beware! Caller must be holding no references to clp->cl_sem!
++ * or owner->so_sema!
++ */
++void nfs4_close_state(struct nfs4_state *state, mode_t mode)
+ {
+       struct inode *inode = state->inode;
+       struct nfs4_state_owner *owner = state->owner;
++      struct nfs4_client *clp = owner->so_client;
+       int newstate;
+       int status = 0;
++      atomic_inc(&owner->so_count);
++      down_read(&clp->cl_sem);
+       down(&owner->so_sema);
+       /* Protect against nfs4_find_state() */
+       spin_lock(&inode->i_lock);
+@@ -466,29 +500,24 @@ nfs4_close_state(struct nfs4_state *stat
+       if (state->nwriters == 0 && state->nreaders == 0)
+               list_del_init(&state->inode_states);
+       spin_unlock(&inode->i_lock);
+-      do {
+-              newstate = 0;
+-              if (state->state == 0)
+-                      break;
++      newstate = 0;
++      if (state->state != 0) {
+               if (state->nreaders)
+                       newstate |= FMODE_READ;
+               if (state->nwriters)
+                       newstate |= FMODE_WRITE;
+               if (state->state == newstate)
+-                      break;
++                      goto out;
+               if (newstate != 0)
+                       status = nfs4_do_downgrade(inode, state, newstate);
+               else
+                       status = nfs4_do_close(inode, state);
+-              if (!status) {
+-                      state->state = newstate;
+-                      break;
+-              }
+-              up(&owner->so_sema);
+-              status = nfs4_handle_error(NFS_SERVER(inode), status);
+-              down(&owner->so_sema);
+-      } while (!status);
+-      __nfs4_put_open_state(state);
++      }
++out:
++      nfs4_put_open_state(state);
++      up(&owner->so_sema);
++      nfs4_put_state_owner(owner);
++      up_read(&clp->cl_sem);
+ }
+ /*
+@@ -496,11 +525,11 @@ nfs4_close_state(struct nfs4_state *stat
+  * that is compatible with current->files
+  */
+ static struct nfs4_lock_state *
+-__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
++__nfs4_find_lock_state(struct nfs4_state *state, unsigned int pid)
+ {
+       struct nfs4_lock_state *pos;
+       list_for_each_entry(pos, &state->lock_states, ls_locks) {
+-              if (pos->ls_owner != fl_owner)
++              if (pos->ls_pid != pid)
+                       continue;
+               atomic_inc(&pos->ls_count);
+               return pos;
+@@ -509,23 +538,16 @@ __nfs4_find_lock_state(struct nfs4_state
+ }
+ struct nfs4_lock_state *
+-nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
++nfs4_find_lock_state(struct nfs4_state *state, unsigned int pid)
+ {
+       struct nfs4_lock_state *lsp;
+       read_lock(&state->state_lock);
+-      lsp = __nfs4_find_lock_state(state, fl_owner);
++      lsp = __nfs4_find_lock_state(state, pid);
+       read_unlock(&state->state_lock);
+       return lsp;
+ }
+-/*
+- * Return a compatible lock_state. If no initialized lock_state structure
+- * exists, return an uninitialized one.
+- *
+- * The caller must be holding state->lock_sema
+- */
+-struct nfs4_lock_state *
+-nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
++static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, unsigned int pid)
+ {
+       struct nfs4_lock_state *lsp;
+       struct nfs4_client *clp = state->owner->so_client;
+@@ -533,12 +555,12 @@ nfs4_alloc_lock_state(struct nfs4_state 
+       lsp = kmalloc(sizeof(*lsp), GFP_KERNEL);
+       if (lsp == NULL)
+               return NULL;
++      lsp->flags = 0;
+       lsp->ls_seqid = 0;      /* arbitrary */
+       lsp->ls_id = -1; 
+       memset(lsp->ls_stateid.data, 0, sizeof(lsp->ls_stateid.data));
+       atomic_set(&lsp->ls_count, 1);
+-      lsp->ls_owner = fl_owner;
+-      lsp->ls_parent = state;
++      lsp->ls_pid = pid;
+       INIT_LIST_HEAD(&lsp->ls_locks);
+       spin_lock(&clp->cl_lock);
+       lsp->ls_id = nfs4_alloc_lockowner_id(clp);
+@@ -547,16 +569,32 @@ nfs4_alloc_lock_state(struct nfs4_state 
+ }
+ /*
++ * Return a compatible lock_state. If no initialized lock_state structure
++ * exists, return an uninitialized one.
++ *
++ * The caller must be holding state->lock_sema and clp->cl_sem
++ */
++struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, unsigned int pid)
++{
++      struct nfs4_lock_state * lsp;
++      
++      lsp = nfs4_find_lock_state(state, pid);
++      if (lsp == NULL)
++              lsp = nfs4_alloc_lock_state(state, pid);
++      return lsp;
++}
++
++/*
+  * Byte-range lock aware utility to initialize the stateid of read/write
+  * requests.
+  */
+ void
+-nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner)
++nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, unsigned int pid)
+ {
+       if (test_bit(LK_STATE_IN_USE, &state->flags)) {
+               struct nfs4_lock_state *lsp;
+-              lsp = nfs4_find_lock_state(state, fl_owner);
++              lsp = nfs4_find_lock_state(state, pid);
+               if (lsp) {
+                       memcpy(dst, &lsp->ls_stateid, sizeof(*dst));
+                       nfs4_put_lock_state(lsp);
+@@ -567,13 +605,14 @@ nfs4_copy_stateid(nfs4_stateid *dst, str
+ }
+ /*
+-* Called with state->lock_sema held.
++* Called with state->lock_sema and clp->cl_sem held.
+ */
+-void
+-nfs4_increment_lock_seqid(int status, struct nfs4_lock_state *lsp)
++void nfs4_increment_lock_seqid(int status, struct nfs4_lock_state *lsp)
+ {
+-      if (status == NFS_OK || seqid_mutating_err(-status))
++      if (status == NFS_OK || seqid_mutating_err(-status)) {
+               lsp->ls_seqid++;
++              lsp->flags |= NFS_LOCK_INITIALIZED;
++      }
+ }
+ /* 
+@@ -598,12 +637,11 @@ nfs4_check_unlock(struct file_lock *fl, 
+  * Post an initialized lock_state on the state->lock_states list.
+  */
+ void
+-nfs4_notify_setlk(struct inode *inode, struct file_lock *request, struct nfs4_lock_state *lsp)
++nfs4_notify_setlk(struct nfs4_state *state, struct file_lock *request, struct nfs4_lock_state *lsp)
+ {
+-      struct nfs4_state *state = lsp->ls_parent;
+-
+       if (!list_empty(&lsp->ls_locks))
+               return;
++      atomic_inc(&lsp->ls_count);
+       write_lock(&state->state_lock);
+       list_add(&lsp->ls_locks, &state->lock_states);
+       set_bit(LK_STATE_IN_USE, &state->flags);
+@@ -620,15 +658,15 @@ nfs4_notify_setlk(struct inode *inode, s
+  *
+  */
+ void
+-nfs4_notify_unlck(struct inode *inode, struct file_lock *request, struct nfs4_lock_state *lsp)
++nfs4_notify_unlck(struct nfs4_state *state, struct file_lock *request, struct nfs4_lock_state *lsp)
+ {
+-      struct nfs4_state *state = lsp->ls_parent;
++      struct inode *inode = state->inode;
+       struct file_lock *fl;
+       for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
+               if (!(fl->fl_flags & FL_POSIX))
+                       continue;
+-              if (fl->fl_owner != lsp->ls_owner)
++              if (fl->fl_pid != lsp->ls_pid)
+                       continue;
+               /* Exit if we find at least one lock which is not consumed */
+               if (nfs4_check_unlock(fl,request) == 0)
+@@ -640,6 +678,7 @@ nfs4_notify_unlck(struct inode *inode, s
+       if (list_empty(&state->lock_states))
+               clear_bit(LK_STATE_IN_USE, &state->flags);
+       write_unlock(&state->state_lock);
++      nfs4_put_lock_state(lsp);
+ }
+ /*
+@@ -651,20 +690,18 @@ nfs4_put_lock_state(struct nfs4_lock_sta
+ {
+       if (!atomic_dec_and_test(&lsp->ls_count))
+               return;
+-      if (!list_empty(&lsp->ls_locks))
+-              return;
++      BUG_ON (!list_empty(&lsp->ls_locks));
+       kfree(lsp);
+ }
+ /*
+-* Called with sp->so_sema held.
++* Called with sp->so_sema and clp->cl_sem held.
+ *
+ * Increment the seqid if the OPEN/OPEN_DOWNGRADE/CLOSE succeeded, or
+ * failed with a seqid incrementing error -
+ * see comments nfs_fs.h:seqid_mutating_error()
+ */
+-void
+-nfs4_increment_seqid(int status, struct nfs4_state_owner *sp)
++void nfs4_increment_seqid(int status, struct nfs4_state_owner *sp)
+ {
+       if (status == NFS_OK || seqid_mutating_err(-status))
+               sp->so_seqid++;
+@@ -693,21 +730,14 @@ nfs4_recover_state(void *data)
+       init_completion(&args.complete);
+-      down_read(&clp->cl_sem);
+-      if (test_and_set_bit(NFS4CLNT_SETUP_STATE, &clp->cl_state))
+-              goto out_failed;
+       if (kernel_thread(reclaimer, &args, CLONE_KERNEL) < 0)
+               goto out_failed_clear;
+       wait_for_completion(&args.complete);
+       return;
+ out_failed_clear:
+-      smp_mb__before_clear_bit();
+-      clear_bit(NFS4CLNT_SETUP_STATE, &clp->cl_state);
+-      smp_mb__after_clear_bit();
++      set_bit(NFS4CLNT_OK, &clp->cl_state);
+       wake_up_all(&clp->cl_waitq);
+       rpc_wake_up(&clp->cl_rpcwaitq);
+-out_failed:
+-      up_read(&clp->cl_sem);
+ }
+ /*
+@@ -718,24 +748,66 @@ nfs4_schedule_state_recovery(struct nfs4
+ {
+       if (!clp)
+               return;
+-      smp_mb__before_clear_bit();
+-      clear_bit(NFS4CLNT_OK, &clp->cl_state);
+-      smp_mb__after_clear_bit();
+-      schedule_work(&clp->cl_recoverd);
++      if (test_and_clear_bit(NFS4CLNT_OK, &clp->cl_state))
++              schedule_work(&clp->cl_recoverd);
+ }
+-static int
+-nfs4_reclaim_open_state(struct nfs4_state_owner *sp)
++static int nfs4_reclaim_locks(struct nfs4_state *state)
++{
++      struct inode *inode = state->inode;
++      struct file_lock *fl;
++      int status = 0;
++
++      for (fl = inode->i_flock; fl != 0; fl = fl->fl_next) {
++              if (!(fl->fl_flags & FL_POSIX))
++                      continue;
++              if (((struct nfs_open_context *)fl->fl_file->private_data)->state != state)
++                      continue;
++              status = nfs4_lock_reclaim(state, fl);
++              if (status >= 0)
++                      continue;
++              switch (status) {
++                      default:
++                              printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n",
++                                              __FUNCTION__, status);
++                      case -NFS4ERR_EXPIRED:
++                      case -NFS4ERR_NO_GRACE:
++                      case -NFS4ERR_RECLAIM_BAD:
++                      case -NFS4ERR_RECLAIM_CONFLICT:
++                              /* kill_proc(fl->fl_pid, SIGLOST, 1); */
++                              break;
++                      case -NFS4ERR_STALE_CLIENTID:
++                              goto out_err;
++              }
++      }
++      return 0;
++out_err:
++      return status;
++}
++
++static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp)
+ {
+       struct nfs4_state *state;
++      struct nfs4_lock_state *lock;
+       int status = 0;
+       list_for_each_entry(state, &sp->so_states, open_states) {
+               if (state->state == 0)
+                       continue;
+               status = nfs4_open_reclaim(sp, state);
+-              if (status >= 0)
++              list_for_each_entry(lock, &state->lock_states, ls_locks)
++                      lock->flags &= ~NFS_LOCK_INITIALIZED;
++              if (status >= 0) {
++                      status = nfs4_reclaim_locks(state);
++                      if (status < 0)
++                              goto out_err;
++                      list_for_each_entry(lock, &state->lock_states, ls_locks) {
++                              if (!(lock->flags & NFS_LOCK_INITIALIZED))
++                                      printk("%s: Lock reclaim failed!\n",
++                                                      __FUNCTION__);
++                      }
+                       continue;
++              }
+               switch (status) {
+                       default:
+                               printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n",
+@@ -762,75 +834,55 @@ out_err:
+       return status;
+ }
+-static int
+-reclaimer(void *ptr)
++static int reclaimer(void *ptr)
+ {
+       struct reclaimer_args *args = (struct reclaimer_args *)ptr;
+       struct nfs4_client *clp = args->clp;
+       struct nfs4_state_owner *sp;
+-      int generation;
+       int status;
+       daemonize("%u.%u.%u.%u-reclaim", NIPQUAD(clp->cl_addr));
+       allow_signal(SIGKILL);
++      atomic_inc(&clp->cl_count);
+       complete(&args->complete);
++      /* Ensure exclusive access to NFSv4 state */
++      lock_kernel();
++      down_write(&clp->cl_sem);
+       /* Are there any NFS mounts out there? */
+       if (list_empty(&clp->cl_superblocks))
+               goto out;
+-      if (!test_bit(NFS4CLNT_NEW, &clp->cl_state)) {
+-              status = nfs4_proc_renew(clp);
+-              if (status == 0) {
+-                      set_bit(NFS4CLNT_OK, &clp->cl_state);
+-                      goto out;
+-              }
+-      }
+-      status = nfs4_proc_setclientid(clp, 0, 0);
+-      if (status)
+-              goto out_error;
+-      status = nfs4_proc_setclientid_confirm(clp);
++restart_loop:
++      status = nfs4_proc_renew(clp);
++      if (status == 0)
++              goto out;
++      status = nfs4_init_client(clp);
+       if (status)
+               goto out_error;
+-      generation = ++(clp->cl_generation);
+-      clear_bit(NFS4CLNT_NEW, &clp->cl_state);
+-      set_bit(NFS4CLNT_OK, &clp->cl_state);
+-      up_read(&clp->cl_sem);
+-      nfs4_schedule_state_renewal(clp);
+-restart_loop:
+-      spin_lock(&clp->cl_lock);
++      /* Mark all delagations for reclaim */
++      nfs_delegation_mark_reclaim(clp);
++      /* Note: list is protected by exclusive lock on cl->cl_sem */
+       list_for_each_entry(sp, &clp->cl_state_owners, so_list) {
+-              if (sp->so_generation - generation >= 0)
+-                      continue;
+-              atomic_inc(&sp->so_count);
+-              spin_unlock(&clp->cl_lock);
+-              down(&sp->so_sema);
+-              if (sp->so_generation - generation < 0) {
+-                      smp_rmb();
+-                      sp->so_generation = clp->cl_generation;
+-                      status = nfs4_reclaim_open_state(sp);
+-              }
+-              up(&sp->so_sema);
+-              nfs4_put_state_owner(sp);
++              status = nfs4_reclaim_open_state(sp);
+               if (status < 0) {
+                       if (status == -NFS4ERR_STALE_CLIENTID)
+-                              nfs4_schedule_state_recovery(clp);
+-                      goto out;
++                              goto restart_loop;
++                      goto out_error;
+               }
+-              goto restart_loop;
+       }
+-      spin_unlock(&clp->cl_lock);
++      nfs_delegation_reap_unclaimed(clp);
+ out:
+-      smp_mb__before_clear_bit();
+-      clear_bit(NFS4CLNT_SETUP_STATE, &clp->cl_state);
+-      smp_mb__after_clear_bit();
++      set_bit(NFS4CLNT_OK, &clp->cl_state);
++      up_write(&clp->cl_sem);
++      unlock_kernel();
+       wake_up_all(&clp->cl_waitq);
+       rpc_wake_up(&clp->cl_rpcwaitq);
++      nfs4_put_client(clp);
+       return 0;
+ out_error:
+-      printk(KERN_WARNING "Error: state recovery failed on NFSv4 server %u.%u.%u.%u\n",
+-                              NIPQUAD(clp->cl_addr.s_addr));
+-      up_read(&clp->cl_sem);
++      printk(KERN_WARNING "Error: state recovery failed on NFSv4 server %u.%u.%u.%u with error %d\n",
++                              NIPQUAD(clp->cl_addr.s_addr), -status);
+       goto out;
+ }
+--- linux-2.6.7/fs/nfs/inode.c.lsec    2004-06-15 23:19:44.000000000 -0600
++++ linux-2.6.7/fs/nfs/inode.c 2005-03-23 14:28:22.818580744 -0700
+@@ -39,6 +39,8 @@
+ #include <asm/system.h>
+ #include <asm/uaccess.h>
++#include "delegation.h"
++
+ #define NFSDBG_FACILITY               NFSDBG_VFS
+ #define NFS_PARANOIA 1
+@@ -123,8 +125,9 @@ nfs_delete_inode(struct inode * inode)
+ {
+       dprintk("NFS: delete_inode(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino);
++      nfs_wb_all(inode);
+       /*
+-       * The following can never actually happen...
++       * The following should never happen...
+        */
+       if (nfs_have_writebacks(inode)) {
+               printk(KERN_ERR "nfs_delete_inode: inode %ld has pending RPC requests\n", inode->i_ino);
+@@ -133,18 +136,15 @@ nfs_delete_inode(struct inode * inode)
+       clear_inode(inode);
+ }
+-/*
+- * For the moment, the only task for the NFS clear_inode method is to
+- * release the mmap credential
+- */
+ static void
+ nfs_clear_inode(struct inode *inode)
+ {
+       struct nfs_inode *nfsi = NFS_I(inode);
+-      struct rpc_cred *cred = nfsi->mm_cred;
++      struct rpc_cred *cred;
+-      if (cred)
+-              put_rpccred(cred);
++      nfs4_zap_acl_attr(inode);
++      nfs_wb_all(inode);
++      BUG_ON (!list_empty(&nfsi->open_files));
+       cred = nfsi->cache_access.cred;
+       if (cred)
+               put_rpccred(cred);
+@@ -704,7 +704,7 @@ nfs_fhget(struct super_block *sb, struct
+               /* Why so? Because we want revalidate for devices/FIFOs, and
+                * that's precisely what we have in nfs_file_inode_operations.
+                */
+-              inode->i_op = &nfs_file_inode_operations;
++              inode->i_op = NFS_SB(sb)->rpc_ops->file_inode_ops;
+               if (S_ISREG(inode->i_mode)) {
+                       inode->i_fop = &nfs_file_operations;
+                       inode->i_data.a_ops = &nfs_file_aops;
+@@ -859,53 +859,114 @@ int nfs_getattr(struct vfsmount *mnt, st
+       return err;
+ }
++struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, struct rpc_cred *cred)
++{
++      struct nfs_open_context *ctx;
++
++      ctx = (struct nfs_open_context *)kmalloc(sizeof(*ctx), GFP_KERNEL);
++      if (ctx != NULL) {
++              atomic_set(&ctx->count, 1);
++              ctx->dentry = dget(dentry);
++              ctx->cred = get_rpccred(cred);
++              ctx->state = NULL;
++              ctx->pid = current->tgid;
++              ctx->error = 0;
++              init_waitqueue_head(&ctx->waitq);
++      }
++      return ctx;
++}
++
++struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
++{
++      if (ctx != NULL)
++              atomic_inc(&ctx->count);
++      return ctx;
++}
++
++void put_nfs_open_context(struct nfs_open_context *ctx)
++{
++      if (atomic_dec_and_test(&ctx->count)) {
++              if (ctx->state != NULL)
++                      nfs4_close_state(ctx->state, ctx->mode);
++              if (ctx->cred != NULL)
++                      put_rpccred(ctx->cred);
++              dput(ctx->dentry);
++              kfree(ctx);
++      }
++}
++
+ /*
+  * Ensure that mmap has a recent RPC credential for use when writing out
+  * shared pages
+  */
+-void
+-nfs_set_mmcred(struct inode *inode, struct rpc_cred *cred)
++void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
++{
++      struct inode *inode = filp->f_dentry->d_inode;
++      struct nfs_inode *nfsi = NFS_I(inode);
++
++      filp->private_data = get_nfs_open_context(ctx);
++      spin_lock(&inode->i_lock);
++      list_add(&ctx->list, &nfsi->open_files);
++      spin_unlock(&inode->i_lock);
++}
++
++struct nfs_open_context *nfs_find_open_context(struct inode *inode, int mode)
++{
++      struct nfs_inode *nfsi = NFS_I(inode);
++      struct nfs_open_context *pos, *ctx = NULL;
++
++      spin_lock(&inode->i_lock);
++      list_for_each_entry(pos, &nfsi->open_files, list) {
++              if ((pos->mode & mode) == mode) {
++                      ctx = get_nfs_open_context(pos);
++                      break;
++              }
++      }
++      spin_unlock(&inode->i_lock);
++      return ctx;
++}
++
++void nfs_file_clear_open_context(struct file *filp)
+ {
+-      struct rpc_cred **p = &NFS_I(inode)->mm_cred,
+-                      *oldcred = *p;
++      struct inode *inode = filp->f_dentry->d_inode;
++      struct nfs_open_context *ctx = (struct nfs_open_context *)filp->private_data;
+-      *p = get_rpccred(cred);
+-      if (oldcred)
+-              put_rpccred(oldcred);
++      if (ctx) {
++              filp->private_data = NULL;
++              spin_lock(&inode->i_lock);
++              list_del(&ctx->list);
++              spin_unlock(&inode->i_lock);
++              put_nfs_open_context(ctx);
++      }
+ }
+ /*
+- * These are probably going to contain hooks for
+- * allocating and releasing RPC credentials for
+- * the file. I'll have to think about Tronds patch
+- * a bit more..
++ * These allocate and release file read/write context information.
+  */
+ int nfs_open(struct inode *inode, struct file *filp)
+ {
+-      struct rpc_auth *auth;
++      struct nfs_open_context *ctx;
+       struct rpc_cred *cred;
+-      auth = NFS_CLIENT(inode)->cl_auth;
+-      cred = rpcauth_lookupcred(auth, 0);
+-      filp->private_data = cred;
+-      if ((filp->f_mode & FMODE_WRITE) != 0) {
+-              nfs_set_mmcred(inode, cred);
++      if ((cred = rpcauth_lookupcred(NFS_CLIENT(inode)->cl_auth, 0)) == NULL)
++              return -ENOMEM;
++      ctx = alloc_nfs_open_context(filp->f_dentry, cred);
++      put_rpccred(cred);
++      if (ctx == NULL)
++              return -ENOMEM;
++      ctx->mode = filp->f_mode;
++      nfs_file_set_open_context(filp, ctx);
++      put_nfs_open_context(ctx);
++      if ((filp->f_mode & FMODE_WRITE) != 0)
+               nfs_begin_data_update(inode);
+-      }
+       return 0;
+ }
+ int nfs_release(struct inode *inode, struct file *filp)
+ {
+-      struct rpc_cred *cred;
+-
+-      lock_kernel();
+       if ((filp->f_mode & FMODE_WRITE) != 0)
+               nfs_end_data_update(inode);
+-      cred = nfs_file_cred(filp);
+-      if (cred)
+-              put_rpccred(cred);
+-      unlock_kernel();
++      nfs_file_clear_open_context(filp);
+       return 0;
+ }
+@@ -1002,6 +1063,30 @@ out:
+       return status;
+ }
++int nfs_attribute_timeout(struct inode *inode)
++{
++      struct nfs_inode *nfsi = NFS_I(inode);
++
++      if (nfs_have_delegation(inode, FMODE_READ))
++              return 0;
++      return time_after(jiffies, nfsi->read_cache_jiffies+nfsi->attrtimeo);
++}
++
++/**
++ * nfs_revalidate_inode - Revalidate the inode attributes
++ * @server - pointer to nfs_server struct
++ * @inode - pointer to inode struct
++ *
++ * Updates inode attribute information by retrieving the data from the server.
++ */
++int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
++{
++      if (!(NFS_FLAGS(inode) & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))
++                      && !nfs_attribute_timeout(inode))
++              return NFS_STALE(inode) ? -ESTALE : 0;
++      return __nfs_revalidate_inode(server, inode);
++}
++
+ /**
+  * nfs_begin_data_update
+  * @inode - pointer to inode
+@@ -1023,11 +1108,13 @@ void nfs_end_data_update(struct inode *i
+ {
+       struct nfs_inode *nfsi = NFS_I(inode);
+-      /* Mark the attribute cache for revalidation */
+-      nfsi->flags |= NFS_INO_INVALID_ATTR;
+-      /* Directories and symlinks: invalidate page cache too */
+-      if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+-              nfsi->flags |= NFS_INO_INVALID_DATA;
++      if (!nfs_have_delegation(inode, FMODE_READ)) {
++              /* Mark the attribute cache for revalidation */
++              nfsi->flags |= NFS_INO_INVALID_ATTR;
++              /* Directories and symlinks: invalidate page cache too */
++              if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
++                      nfsi->flags |= NFS_INO_INVALID_DATA;
++      }
+       nfsi->cache_change_attribute ++;
+       atomic_dec(&nfsi->data_updates);
+ }
+@@ -1068,6 +1155,10 @@ int nfs_refresh_inode(struct inode *inod
+       loff_t cur_size, new_isize;
+       int data_unstable;
++      /* Do we hold a delegation? */
++      if (nfs_have_delegation(inode, FMODE_READ))
++              return 0;
++
+       /* Are we in the process of updating data on the server? */
+       data_unstable = nfs_caches_unstable(inode);
+@@ -1240,6 +1331,7 @@ static int nfs_update_inode(struct inode
+       inode->i_nlink = fattr->nlink;
+       inode->i_uid = fattr->uid;
+       inode->i_gid = fattr->gid;
++      nfs4_zap_acl_attr(inode);
+       if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) {
+               /*
+@@ -1265,7 +1357,8 @@ static int nfs_update_inode(struct inode
+       if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
+                               || S_ISLNK(inode->i_mode)))
+               invalid &= ~NFS_INO_INVALID_DATA;
+-      nfsi->flags |= invalid;
++      if (!nfs_have_delegation(inode, FMODE_READ))
++              nfsi->flags |= invalid;
+       return 0;
+  out_changed:
+@@ -1400,6 +1493,52 @@ static struct file_system_type nfs_fs_ty
+ #ifdef CONFIG_NFS_V4
++#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
++
++int
++nfs_setxattr(struct dentry *dentry, const char *key, const void *buf,
++              size_t buflen, int flags)
++{
++      struct inode *inode = dentry->d_inode;
++
++      if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0)
++              return -EINVAL;
++
++        if (!S_ISREG(inode->i_mode) &&
++            (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
++                return -EPERM;
++
++      return nfs4_proc_set_acl(inode, buf, buflen);
++}
++
++/* The getxattr man page suggests returning -ENODATA for unknown attributes,
++ * and that's what we'll do for e.g. user attributes that haven't been set.
++ * But we'll follow ext2/ext3's lead by returning -EOPNOTSUPP for unsupported
++ * attributes in kernel-managed attribute namespaces. */
++ssize_t
++nfs_getxattr(struct dentry *dentry, const char *key, void *buf,
++              size_t buflen)
++{
++      struct inode *inode = dentry->d_inode;
++
++      if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0)
++              return -EOPNOTSUPP;
++
++      return nfs4_proc_get_acl(inode, buf, buflen);
++}
++
++ssize_t
++nfs_listxattr(struct dentry *dentry, char *buf, size_t buflen)
++{
++      ssize_t len = strlen(XATTR_NAME_NFSV4_ACL) + 1;
++
++      if (buf && buflen < len)
++              return -ERANGE;
++      if (buf)
++              memcpy(buf, XATTR_NAME_NFSV4_ACL, len);
++      return len;
++}
++
+ static void nfs4_clear_inode(struct inode *);
+ static struct super_operations nfs4_sops = { 
+@@ -1423,6 +1562,12 @@ static void nfs4_clear_inode(struct inod
+ {
+       struct nfs_inode *nfsi = NFS_I(inode);
++      /* If we are holding a delegation, return it! */
++      if (nfsi->delegation != NULL)
++              nfs_inode_return_delegation(inode);
++      /* First call standard NFS clear_inode() code */
++      nfs_clear_inode(inode);
++      /* Now clear out any remaining state */
+       while (!list_empty(&nfsi->open_states)) {
+               struct nfs4_state *state;
+               
+@@ -1437,8 +1582,6 @@ static void nfs4_clear_inode(struct inod
+               BUG_ON(atomic_read(&state->count) != 1);
+               nfs4_close_state(state, state->state);
+       }
+-      /* Now call standard NFS clear_inode() code */
+-      nfs_clear_inode(inode);
+ }
+@@ -1536,8 +1679,19 @@ static int nfs4_fill_super(struct super_
+               memcpy(clp->cl_ipaddr, server->ip_addr, sizeof(clp->cl_ipaddr));
+               nfs_idmap_new(clp);
+       }
+-      if (list_empty(&clp->cl_superblocks))
+-              clear_bit(NFS4CLNT_OK, &clp->cl_state);
++      /* Fire up rpciod if not yet running */
++      if (rpciod_up() != 0) {
++              printk(KERN_WARNING "NFS: couldn't start rpciod!\n");
++              goto out_fail;
++      }
++
++      if (list_empty(&clp->cl_superblocks)) {
++              err = nfs4_init_client(clp);
++              if (err != 0) {
++                      up_write(&clp->cl_sem);
++                      goto out_rpciod;
++              }
++      }
+       list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks);
+       clnt = rpc_clone_client(clp->cl_rpcclient);
+       if (!IS_ERR(clnt))
+@@ -1567,17 +1721,10 @@ static int nfs4_fill_super(struct super_
+               }
+       }
+-      /* Fire up rpciod if not yet running */
+-      if (rpciod_up() != 0) {
+-              printk(KERN_WARNING "NFS: couldn't start rpciod!\n");
+-              goto out_shutdown;
+-      }
+-
+       sb->s_op = &nfs4_sops;
+       err = nfs_sb_init(sb, authflavour);
+       if (err == 0)
+               return 0;
+-      rpciod_down();
+ out_shutdown:
+       rpc_shutdown_client(server->client);
+ out_remove_list:
+@@ -1585,6 +1732,8 @@ out_remove_list:
+       list_del_init(&server->nfs4_siblings);
+       up_write(&server->nfs4_state->cl_sem);
+       destroy_nfsv4_state(server);
++out_rpciod:
++      rpciod_down();
+ out_fail:
+       if (clp)
+               nfs4_put_client(clp);
+@@ -1709,22 +1858,31 @@ out_free:
+       return s;
+ }
++static void nfs4_kill_super(struct super_block *sb)
++{
++      nfs_return_all_delegations(sb);
++      nfs_kill_super(sb);
++}
++
+ static struct file_system_type nfs4_fs_type = {
+       .owner          = THIS_MODULE,
+       .name           = "nfs4",
+       .get_sb         = nfs4_get_sb,
+-      .kill_sb        = nfs_kill_super,
++      .kill_sb        = nfs4_kill_super,
+       .fs_flags       = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+ };
+-#define nfs4_zero_state(nfsi) \
++#define nfs4_init_once(nfsi) \
+       do { \
+               INIT_LIST_HEAD(&(nfsi)->open_states); \
++              nfsi->delegation = NULL; \
++              nfsi->delegation_state = 0; \
++              init_rwsem(&nfsi->rwsem); \
+       } while(0)
+ #define register_nfs4fs() register_filesystem(&nfs4_fs_type)
+ #define unregister_nfs4fs() unregister_filesystem(&nfs4_fs_type)
+ #else
+-#define nfs4_zero_state(nfsi) \
++#define nfs4_init_once(nfsi) \
+       do { } while (0)
+ #define register_nfs4fs() (0)
+ #define unregister_nfs4fs()
+@@ -1746,8 +1904,8 @@ static struct inode *nfs_alloc_inode(str
+       if (!nfsi)
+               return NULL;
+       nfsi->flags = 0;
+-      nfsi->mm_cred = NULL;
+-      nfs4_zero_state(nfsi);
++      nfsi->acl_len = 0;
++      nfsi->acl = NULL;
+       return &nfsi->vfs_inode;
+ }
+@@ -1765,12 +1923,14 @@ static void init_once(void * foo, kmem_c
+               inode_init_once(&nfsi->vfs_inode);
+               INIT_LIST_HEAD(&nfsi->dirty);
+               INIT_LIST_HEAD(&nfsi->commit);
++              INIT_LIST_HEAD(&nfsi->open_files);
+               INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
+               atomic_set(&nfsi->data_updates, 0);
+               nfsi->ndirty = 0;
+               nfsi->ncommit = 0;
+               nfsi->npages = 0;
+               init_waitqueue_head(&nfsi->nfs_i_wait);
++              nfs4_init_once(nfsi);
+       }
+ }
+  
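The inode.c hunks above replace the single per-inode mm_cred with a reference-counted nfs_open_context carrying the dentry, credential and open mode of each opener. A purely illustrative sketch of the intended lifecycle, using only the helpers added above (this snippet is not part of the patch; it assumes the usual <linux/nfs_fs.h> declarations):

/* Illustrative only: bind a context at open time, then find one later
 * for I/O that has no struct file (e.g. readahead or writeback). */
static int example_open_context_usage(struct inode *inode, struct file *filp)
{
	struct nfs_open_context *ctx, *found;
	struct rpc_cred *cred;

	cred = rpcauth_lookupcred(NFS_CLIENT(inode)->cl_auth, 0);
	if (cred == NULL)
		return -ENOMEM;
	ctx = alloc_nfs_open_context(filp->f_dentry, cred);
	put_rpccred(cred);		/* context holds its own reference */
	if (ctx == NULL)
		return -ENOMEM;
	ctx->mode = filp->f_mode;
	nfs_file_set_open_context(filp, ctx);	/* filp takes a reference */
	put_nfs_open_context(ctx);

	/* later, without a struct file: pick any context opened for read */
	found = nfs_find_open_context(inode, FMODE_READ);
	if (found != NULL)
		put_nfs_open_context(found);
	return 0;
}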
+--- linux-2.6.7/fs/nfs/dir.c.lsec      2004-06-15 23:19:23.000000000 -0600
++++ linux-2.6.7/fs/nfs/dir.c   2005-03-23 14:28:22.701598528 -0700
+@@ -32,6 +32,8 @@
+ #include <linux/smp_lock.h>
+ #include <linux/namei.h>
++#include "delegation.h"
++
+ #define NFS_PARANOIA 1
+ /* #define NFS_DEBUG_VERBOSE 1 */
+@@ -88,6 +90,9 @@ struct inode_operations nfs4_dir_inode_o
+       .permission     = nfs_permission,
+       .getattr        = nfs_getattr,
+       .setattr        = nfs_setattr,
++      .getxattr       = nfs_getxattr,
++      .setxattr       = nfs_setxattr,
++      .listxattr      = nfs_listxattr,
+ };
+ #endif /* CONFIG_NFS_V4 */
+@@ -850,22 +855,22 @@ static int nfs_open_revalidate(struct de
+       unsigned long verifier;
+       int openflags, ret = 0;
+-      /* NFS only supports OPEN for regular files */
+-      if (inode && !S_ISREG(inode->i_mode))
+-              goto no_open;
+       parent = dget_parent(dentry);
+       dir = parent->d_inode;
+       if (!is_atomic_open(dir, nd))
+               goto no_open;
++      /* We can't create new files in nfs_open_revalidate(), so we
++       * optimize away revalidation of negative dentries.
++       */
++      if (inode == NULL)
++              goto out;
++      /* NFS only supports OPEN on regular files */
++      if (!S_ISREG(inode->i_mode))
++              goto no_open;
+       openflags = nd->intent.open.flags;
+-      if (openflags & O_CREAT) {
+-              /* If this is a negative dentry, just drop it */
+-              if (!inode)
+-                      goto out;
+-              /* If this is exclusive open, just revalidate */
+-              if (openflags & O_EXCL)
+-                      goto no_open;
+-      }
++      /* We cannot do exclusive creation on a positive dentry */
++      if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
++              goto no_open;
+       /* We can't create new files, or truncate existing ones here */
+       openflags &= ~(O_CREAT|O_TRUNC);
+@@ -887,6 +892,8 @@ out:
+       return ret;
+ no_open:
+       dput(parent);
++      if (inode != NULL && nfs_have_delegation(inode, FMODE_READ))
++              return 1;
+       return nfs_lookup_revalidate(dentry, nd);
+ }
+ #endif /* CONFIG_NFSV4 */
+@@ -1299,19 +1306,6 @@ nfs_symlink(struct inode *dir, struct de
+       dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s)\n", dir->i_sb->s_id,
+               dir->i_ino, dentry->d_name.name, symname);
+-      error = -ENAMETOOLONG;
+-      switch (NFS_PROTO(dir)->version) {
+-              case 2:
+-                      if (strlen(symname) > NFS2_MAXPATHLEN)
+-                              goto out;
+-                      break;
+-              case 3:
+-                      if (strlen(symname) > NFS3_MAXPATHLEN)
+-                              goto out;
+-              default:
+-                      break;
+-      }
+-
+ #ifdef NFS_PARANOIA
+ if (dentry->d_inode)
+ printk("nfs_proc_symlink: %s/%s not negative!\n",
+@@ -1341,8 +1335,6 @@ dentry->d_parent->d_name.name, dentry->d
+               d_drop(dentry);
+       }
+       unlock_kernel();
+-
+-out:
+       return error;
+ }
+@@ -1498,10 +1490,56 @@ out:
+       return error;
+ }
+-int
+-nfs_permission(struct inode *inode, int mask, struct nameidata *nd)
++int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
++{
++      struct nfs_access_entry *cache = &NFS_I(inode)->cache_access;
++
++      if (cache->cred != cred
++                      || time_after(jiffies, cache->jiffies + NFS_ATTRTIMEO(inode))
++                      || (NFS_FLAGS(inode) & NFS_INO_INVALID_ATTR))
++              return -ENOENT;
++      memcpy(res, cache, sizeof(*res));
++      return 0;
++}
++
++void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
++{
++      struct nfs_access_entry *cache = &NFS_I(inode)->cache_access;
++
++      if (cache->cred != set->cred) {
++              if (cache->cred)
++                      put_rpccred(cache->cred);
++              cache->cred = get_rpccred(set->cred);
++      }
++      cache->jiffies = set->jiffies;
++      cache->mask = set->mask;
++}
++
++static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
++{
++      struct nfs_access_entry cache;
++      int status;
++
++      status = nfs_access_get_cached(inode, cred, &cache);
++      if (status == 0)
++              goto out;
++
++      /* Be clever: ask server to check for all possible rights */
++      cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ;
++      cache.cred = cred;
++      cache.jiffies = jiffies;
++      status = NFS_PROTO(inode)->access(inode, &cache);
++      if (status != 0)
++              return status;
++      nfs_access_add_cache(inode, &cache);
++out:
++      if ((cache.mask & mask) == mask)
++              return 0;
++      return -EACCES;
++}
++
++int nfs_permission(struct inode *inode, int mask, struct nameidata *nd)
+ {
+-      struct nfs_access_cache *cache = &NFS_I(inode)->cache_access;
+       struct rpc_cred *cred;
+       int mode = inode->i_mode;
+       int res;
+@@ -1542,24 +1580,7 @@ nfs_permission(struct inode *inode, int 
+               goto out_notsup;
+       cred = rpcauth_lookupcred(NFS_CLIENT(inode)->cl_auth, 0);
+-      if (cache->cred == cred
+-          && time_before(jiffies, cache->jiffies + NFS_ATTRTIMEO(inode))
+-          && !(NFS_FLAGS(inode) & NFS_INO_INVALID_ATTR)) {
+-              if (!(res = cache->err)) {
+-                      /* Is the mask a subset of an accepted mask? */
+-                      if ((cache->mask & mask) == mask)
+-                              goto out;
+-              } else {
+-                      /* ...or is it a superset of a rejected mask? */
+-                      if ((cache->mask & mask) == cache->mask)
+-                              goto out;
+-              }
+-      }
+-
+-      res = NFS_PROTO(inode)->access(inode, cred, mask);
+-      if (!res || res == -EACCES)
+-              goto add_cache;
+-out:
++      res = nfs_do_access(inode, cred, mask);
+       put_rpccred(cred);
+       unlock_kernel();
+       return res;
+@@ -1568,15 +1589,6 @@ out_notsup:
+       res = vfs_permission(inode, mask);
+       unlock_kernel();
+       return res;
+-add_cache:
+-      cache->jiffies = jiffies;
+-      if (cache->cred)
+-              put_rpccred(cache->cred);
+-      cache->cred = cred;
+-      cache->mask = mask;
+-      cache->err = res;
+-      unlock_kernel();
+-      return res;
+ }
+ /*
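The dir.c hunks above make nfs_permission() ask the server for MAY_READ|MAY_WRITE|MAY_EXEC in a single ACCESS call and cache the granted mask per credential. A hedged sketch of reading that cache with the exported helper (illustrative only; a real caller goes through nfs_permission(), since nfs_do_access() is static):

/* Illustrative only: answer a MAY_READ check from the per-inode access
 * cache when it is still valid for this credential. */
static int example_may_read_cached(struct inode *inode, struct rpc_cred *cred)
{
	struct nfs_access_entry cache;

	if (nfs_access_get_cached(inode, cred, &cache) == 0)
		return (cache.mask & MAY_READ) == MAY_READ ? 0 : -EACCES;
	/* cache miss or stale: nfs_permission() would issue the ACCESS RPC
	 * and repopulate the cache via nfs_access_add_cache() */
	return -ENOENT;
}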
+--- linux-2.6.7/fs/nfs/unlink.c.lsec   2004-06-15 23:20:04.000000000 -0600
++++ linux-2.6.7/fs/nfs/unlink.c        2005-03-23 14:28:23.170527240 -0700
+@@ -215,7 +215,6 @@ nfs_complete_unlink(struct dentry *dentr
+       spin_lock(&dentry->d_lock);
+       dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
+       spin_unlock(&dentry->d_lock);
+-      if (data->task.tk_rpcwait == &nfs_delete_queue)
+-              rpc_wake_up_task(&data->task);
++      rpc_wake_up_task(&data->task);
+       nfs_put_unlinkdata(data);
+ }
+--- linux-2.6.7/fs/nfs/callback_xdr.c.lsec     2005-03-23 14:28:22.545622240 -0700
++++ linux-2.6.7/fs/nfs/callback_xdr.c  2005-03-23 14:28:22.544622392 -0700
+@@ -0,0 +1,481 @@
++/*
++ * linux/fs/nfs/callback_xdr.c
++ *
++ * Copyright (C) 2004 Trond Myklebust
++ *
++ * NFSv4 callback encode/decode procedures
++ */
++#include <linux/config.h>
++#include <linux/kernel.h>
++#include <linux/sunrpc/svc.h>
++#include <linux/nfs4.h>
++#include <linux/nfs_fs.h>
++#include "callback.h"
++
++#define CB_OP_TAGLEN_MAXSZ    (512)
++#define CB_OP_HDR_RES_MAXSZ   (2 + CB_OP_TAGLEN_MAXSZ)
++#define CB_OP_GETATTR_BITMAP_MAXSZ    (4)
++#define CB_OP_GETATTR_RES_MAXSZ       (CB_OP_HDR_RES_MAXSZ + \
++                              CB_OP_GETATTR_BITMAP_MAXSZ + \
++                              2 + 2 + 3 + 3)
++#define CB_OP_RECALL_RES_MAXSZ        (CB_OP_HDR_RES_MAXSZ)
++
++#define NFSDBG_FACILITY NFSDBG_CALLBACK
++
++typedef unsigned (*callback_process_op_t)(void *, void *);
++typedef unsigned (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *);
++typedef unsigned (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *);
++
++
++struct callback_op {
++      callback_process_op_t process_op;
++      callback_decode_arg_t decode_args;
++      callback_encode_res_t encode_res;
++      long res_maxsize;
++};
++
++static struct callback_op callback_ops[];
++
++static int nfs4_callback_null(struct svc_rqst *rqstp, void *argp, void *resp)
++{
++      return htonl(NFS4_OK);
++}
++
++static int nfs4_decode_void(struct svc_rqst *rqstp, uint32_t *p, void *dummy)
++{
++      return xdr_argsize_check(rqstp, p);
++}
++
++static int nfs4_encode_void(struct svc_rqst *rqstp, uint32_t *p, void *dummy)
++{
++      return xdr_ressize_check(rqstp, p);
++}
++
++static uint32_t *read_buf(struct xdr_stream *xdr, int nbytes)
++{
++      uint32_t *p;
++
++      p = xdr_inline_decode(xdr, nbytes);
++      if (unlikely(p == NULL))
++              printk(KERN_WARNING "NFSv4 callback reply buffer overflowed!\n");
++      return p;
++}
++
++static unsigned decode_string(struct xdr_stream *xdr, unsigned int *len, const char **str)
++{
++      uint32_t *p;
++
++      p = read_buf(xdr, 4);
++      if (unlikely(p == NULL))
++              return htonl(NFS4ERR_RESOURCE);
++      *len = ntohl(*p);
++
++      if (*len != 0) {
++              p = read_buf(xdr, *len);
++              if (unlikely(p == NULL))
++                      return htonl(NFS4ERR_RESOURCE);
++              *str = (const char *)p;
++      } else
++              *str = NULL;
++
++      return 0;
++}
++
++static unsigned decode_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
++{
++      uint32_t *p;
++
++      p = read_buf(xdr, 4);
++      if (unlikely(p == NULL))
++              return htonl(NFS4ERR_RESOURCE);
++      fh->size = ntohl(*p);
++      if (fh->size > NFS4_FHSIZE)
++              return htonl(NFS4ERR_BADHANDLE);
++      p = read_buf(xdr, fh->size);
++      if (unlikely(p == NULL))
++              return htonl(NFS4ERR_RESOURCE);
++      memcpy(&fh->data[0], p, fh->size);
++      memset(&fh->data[fh->size], 0, sizeof(fh->data) - fh->size);
++      return 0;
++}
++
++static unsigned decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
++{
++      uint32_t *p;
++      unsigned int attrlen;
++
++      p = read_buf(xdr, 4);
++      if (unlikely(p == NULL))
++              return htonl(NFS4ERR_RESOURCE);
++      attrlen = ntohl(*p);
++      p = read_buf(xdr, attrlen << 2);
++      if (unlikely(p == NULL))
++              return htonl(NFS4ERR_RESOURCE);
++      if (likely(attrlen > 0))
++              bitmap[0] = ntohl(*p++);
++      if (attrlen > 1)
++              bitmap[1] = ntohl(*p);
++      return 0;
++}
++
++static unsigned decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
++{
++      uint32_t *p;
++
++      p = read_buf(xdr, 16);
++      if (unlikely(p == NULL))
++              return htonl(NFS4ERR_RESOURCE);
++      memcpy(stateid->data, p, 16);
++      return 0;
++}
++
++static unsigned decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr)
++{
++      uint32_t *p;
++      unsigned int minor_version;
++      unsigned status;
++
++      status = decode_string(xdr, &hdr->taglen, &hdr->tag);
++      if (unlikely(status != 0))
++              return status;
++      /* We do not like overly long tags! */
++      if (hdr->taglen > CB_OP_TAGLEN_MAXSZ-12 || hdr->taglen < 0) {
++              printk("NFSv4 CALLBACK %s: client sent tag of length %u\n",
++                              __FUNCTION__, hdr->taglen);
++              return htonl(NFS4ERR_RESOURCE);
++      }
++      p = read_buf(xdr, 12);
++      if (unlikely(p == NULL))
++              return htonl(NFS4ERR_RESOURCE);
++      minor_version = ntohl(*p++);
++      /* Check minor version is zero. */
++      if (minor_version != 0) {
++              printk(KERN_WARNING "%s: NFSv4 server callback with illegal minor version %u!\n",
++                              __FUNCTION__, minor_version);
++              return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
++      }
++      hdr->callback_ident = ntohl(*p++);
++      hdr->nops = ntohl(*p);
++      return 0;
++}
++
++static unsigned decode_op_hdr(struct xdr_stream *xdr, unsigned int *op)
++{
++      uint32_t *p;
++      p = read_buf(xdr, 4);
++      if (unlikely(p == NULL))
++              return htonl(NFS4ERR_RESOURCE);
++      *op = ntohl(*p);
++      return 0;
++}
++
++static unsigned decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_getattrargs *args)
++{
++      unsigned status;
++
++      status = decode_fh(xdr, &args->fh);
++      if (unlikely(status != 0))
++              goto out;
++      args->addr = &rqstp->rq_addr;
++      status = decode_bitmap(xdr, args->bitmap);
++out:
++      dprintk("%s: exit with status = %d\n", __FUNCTION__, status);
++      return status;
++}
++
++static unsigned decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_recallargs *args)
++{
++      uint32_t *p;
++      unsigned status;
++
++      args->addr = &rqstp->rq_addr;
++      status = decode_stateid(xdr, &args->stateid);
++      if (unlikely(status != 0))
++              goto out;
++      p = read_buf(xdr, 4);
++      if (unlikely(p == NULL)) {
++              status = htonl(NFS4ERR_RESOURCE);
++              goto out;
++      }
++      args->truncate = ntohl(*p);
++      status = decode_fh(xdr, &args->fh);
++out:
++      dprintk("%s: exit with status = %d\n", __FUNCTION__, status);
++      return status;
++}
++
++static unsigned encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
++{
++      uint32_t *p;
++
++      p = xdr_reserve_space(xdr, 4 + len);
++      if (unlikely(p == NULL))
++              return htonl(NFS4ERR_RESOURCE);
++      xdr_encode_opaque(p, str, len);
++      return 0;
++}
++
++#define CB_SUPPORTED_ATTR0 (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE)
++#define CB_SUPPORTED_ATTR1 (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY)
++static unsigned encode_attr_bitmap(struct xdr_stream *xdr, const uint32_t *bitmap, uint32_t **savep)
++{
++      uint32_t bm[2];
++      uint32_t *p;
++
++      bm[0] = htonl(bitmap[0] & CB_SUPPORTED_ATTR0);
++      bm[1] = htonl(bitmap[1] & CB_SUPPORTED_ATTR1);
++      if (bm[1] != 0) {
++              p = xdr_reserve_space(xdr, 16);
++              if (unlikely(p == NULL))
++                      return htonl(NFS4ERR_RESOURCE);
++              *p++ = htonl(2);
++              *p++ = bm[0];
++              *p++ = bm[1];
++      } else if (bm[0] != 0) {
++              p = xdr_reserve_space(xdr, 12);
++              if (unlikely(p == NULL))
++                      return htonl(NFS4ERR_RESOURCE);
++              *p++ = htonl(1);
++              *p++ = bm[0];
++      } else {
++              p = xdr_reserve_space(xdr, 8);
++              if (unlikely(p == NULL))
++                      return htonl(NFS4ERR_RESOURCE);
++              *p++ = htonl(0);
++      }
++      *savep = p;
++      return 0;
++}
++
++static unsigned encode_attr_change(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t change)
++{
++      uint32_t *p;
++
++      if (!(bitmap[0] & FATTR4_WORD0_CHANGE))
++              return 0;
++      p = xdr_reserve_space(xdr, 8);
++      if (unlikely(p == 0))
++              return htonl(NFS4ERR_RESOURCE);
++      p = xdr_encode_hyper(p, change);
++      return 0;
++}
++
++static unsigned encode_attr_size(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t size)
++{
++      uint32_t *p;
++
++      if (!(bitmap[0] & FATTR4_WORD0_SIZE))
++              return 0;
++      p = xdr_reserve_space(xdr, 8);
++      if (unlikely(p == 0))
++              return htonl(NFS4ERR_RESOURCE);
++      p = xdr_encode_hyper(p, size);
++      return 0;
++}
++
++static unsigned encode_attr_time(struct xdr_stream *xdr, const struct timespec *time)
++{
++      uint32_t *p;
++
++      p = xdr_reserve_space(xdr, 12);
++      if (unlikely(p == 0))
++              return htonl(NFS4ERR_RESOURCE);
++      p = xdr_encode_hyper(p, time->tv_sec);
++      *p = htonl(time->tv_nsec);
++      return 0;
++}
++
++static unsigned encode_attr_ctime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time)
++{
++      if (!(bitmap[1] & FATTR4_WORD1_TIME_METADATA))
++              return 0;
++      return encode_attr_time(xdr,time);
++}
++
++static unsigned encode_attr_mtime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time)
++{
++      if (!(bitmap[1] & FATTR4_WORD1_TIME_MODIFY))
++              return 0;
++      return encode_attr_time(xdr,time);
++}
++
++static unsigned encode_compound_hdr_res(struct xdr_stream *xdr, struct cb_compound_hdr_res *hdr)
++{
++      unsigned status;
++
++      hdr->status = xdr_reserve_space(xdr, 4);
++      if (unlikely(hdr->status == NULL))
++              return htonl(NFS4ERR_RESOURCE);
++      status = encode_string(xdr, hdr->taglen, hdr->tag);
++      if (unlikely(status != 0))
++              return status;
++      hdr->nops = xdr_reserve_space(xdr, 4);
++      if (unlikely(hdr->nops == NULL))
++              return htonl(NFS4ERR_RESOURCE);
++      return 0;
++}
++
++static unsigned encode_op_hdr(struct xdr_stream *xdr, uint32_t op, uint32_t res)
++{
++      uint32_t *p;
++      
++      p = xdr_reserve_space(xdr, 8);
++      if (unlikely(p == NULL))
++              return htonl(NFS4ERR_RESOURCE);
++      *p++ = htonl(op);
++      *p = htonl(res);
++      return 0;
++}
++
++static unsigned encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr, const struct cb_getattrres *res)
++{
++      uint32_t *savep;
++      unsigned status = res->status;
++      
++      if (unlikely(status != 0))
++              goto out;
++      status = encode_attr_bitmap(xdr, res->bitmap, &savep);
++      if (unlikely(status != 0))
++              goto out;
++      status = encode_attr_change(xdr, res->bitmap, res->change_attr);
++      if (unlikely(status != 0))
++              goto out;
++      status = encode_attr_size(xdr, res->bitmap, res->size);
++      if (unlikely(status != 0))
++              goto out;
++      status = encode_attr_ctime(xdr, res->bitmap, &res->ctime);
++      if (unlikely(status != 0))
++              goto out;
++      status = encode_attr_mtime(xdr, res->bitmap, &res->mtime);
++      *savep = htonl((unsigned int)((char *)xdr->p - (char *)(savep+1)));
++out:
++      dprintk("%s: exit with status = %d\n", __FUNCTION__, status);
++      return status;
++}
++
++static unsigned process_op(struct svc_rqst *rqstp,
++              struct xdr_stream *xdr_in, void *argp,
++              struct xdr_stream *xdr_out, void *resp)
++{
++      struct callback_op *op;
++      unsigned int op_nr;
++      unsigned int status = 0;
++      long maxlen;
++      unsigned res;
++
++      dprintk("%s: start\n", __FUNCTION__);
++      status = decode_op_hdr(xdr_in, &op_nr);
++      if (unlikely(status != 0)) {
++              op_nr = OP_CB_ILLEGAL;
++              op = &callback_ops[0];
++      } else if (unlikely(op_nr != OP_CB_GETATTR && op_nr != OP_CB_RECALL)) {
++              op_nr = OP_CB_ILLEGAL;
++              op = &callback_ops[0];
++              status = htonl(NFS4ERR_OP_ILLEGAL);
++      } else
++              op = &callback_ops[op_nr];
++
++      maxlen = xdr_out->end - xdr_out->p;
++      if (maxlen > 0 && maxlen < PAGE_SIZE) {
++              if (likely(status == 0 && op->decode_args != NULL))
++                      status = op->decode_args(rqstp, xdr_in, argp);
++              if (likely(status == 0 && op->process_op != NULL))
++                      status = op->process_op(argp, resp);
++      } else
++              status = htonl(NFS4ERR_RESOURCE);
++
++      res = encode_op_hdr(xdr_out, op_nr, status);
++      if (status == 0)
++              status = res;
++      if (op->encode_res != NULL && status == 0)
++              status = op->encode_res(rqstp, xdr_out, resp);
++      dprintk("%s: done, status = %d\n", __FUNCTION__, status);
++      return status;
++}
++
++/*
++ * Decode, process and encode a COMPOUND
++ */
++static int nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *resp)
++{
++      struct cb_compound_hdr_arg hdr_arg;
++      struct cb_compound_hdr_res hdr_res;
++      struct xdr_stream xdr_in, xdr_out;
++      uint32_t *p;
++      unsigned int status;
++      unsigned int nops = 1;
++
++      dprintk("%s: start\n", __FUNCTION__);
++
++      xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base);
++
++      p = (uint32_t*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len);
++      rqstp->rq_res.head[0].iov_len = PAGE_SIZE;
++      xdr_init_encode(&xdr_out, &rqstp->rq_res, p);
++
++      decode_compound_hdr_arg(&xdr_in, &hdr_arg);
++      hdr_res.taglen = hdr_arg.taglen;
++      hdr_res.tag = hdr_arg.tag;
++      encode_compound_hdr_res(&xdr_out, &hdr_res);
++
++      for (;;) {
++              status = process_op(rqstp, &xdr_in, argp, &xdr_out, resp);
++              if (status != 0)
++                      break;
++              if (nops == hdr_arg.nops)
++                      break;
++              nops++;
++      }
++      *hdr_res.status = status;
++      *hdr_res.nops = htonl(nops);
++      dprintk("%s: done, status = %u\n", __FUNCTION__, status);
++      return rpc_success;
++}
++
++/*
++ * Define NFS4 callback COMPOUND ops.
++ */
++static struct callback_op callback_ops[] = {
++      [0] = {
++              .res_maxsize = CB_OP_HDR_RES_MAXSZ,
++      },
++      [OP_CB_GETATTR] = {
++              .process_op = (callback_process_op_t)nfs4_callback_getattr,
++              .decode_args = (callback_decode_arg_t)decode_getattr_args,
++              .encode_res = (callback_encode_res_t)encode_getattr_res,
++              .res_maxsize = CB_OP_GETATTR_RES_MAXSZ,
++      },
++      [OP_CB_RECALL] = {
++              .process_op = (callback_process_op_t)nfs4_callback_recall,
++              .decode_args = (callback_decode_arg_t)decode_recall_args,
++              .res_maxsize = CB_OP_RECALL_RES_MAXSZ,
++      }
++};
++
++/*
++ * Define NFS4 callback procedures
++ */
++static struct svc_procedure nfs4_callback_procedures1[] = {
++      [CB_NULL] = {
++              .pc_func = nfs4_callback_null,
++              .pc_decode = (kxdrproc_t)nfs4_decode_void,
++              .pc_encode = (kxdrproc_t)nfs4_encode_void,
++              .pc_xdrressize = 1,
++      },
++      [CB_COMPOUND] = {
++              .pc_func = nfs4_callback_compound,
++              .pc_encode = (kxdrproc_t)nfs4_encode_void,
++              .pc_argsize = 256,
++              .pc_ressize = 256,
++              .pc_xdrressize = NFS4_CALLBACK_BUFSIZE,
++      }
++};
++
++struct svc_version nfs4_callback_version1 = {
++      .vs_vers = 1,
++      .vs_nproc = ARRAY_SIZE(nfs4_callback_procedures1),
++      .vs_proc = nfs4_callback_procedures1,
++      .vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
++      .vs_dispatch = NULL,
++};
++
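callback_xdr.c dispatches each COMPOUND op through the callback_ops table, with per-op decode_args/encode_res helpers built on read_buf() and xdr_reserve_space(). A hypothetical decode/encode pair for a new op (not part of the patch, shown only to illustrate the pattern the file uses):

/* Hypothetical example following the helpers above. */
struct cb_example_args {
	uint32_t value;
};

static unsigned decode_example_args(struct svc_rqst *rqstp,
		struct xdr_stream *xdr, struct cb_example_args *args)
{
	uint32_t *p = read_buf(xdr, 4);

	if (unlikely(p == NULL))
		return htonl(NFS4ERR_RESOURCE);
	args->value = ntohl(*p);
	return 0;
}

static unsigned encode_example_res(struct svc_rqst *rqstp,
		struct xdr_stream *xdr, const struct cb_example_args *res)
{
	uint32_t *p = xdr_reserve_space(xdr, 4);

	if (unlikely(p == NULL))
		return htonl(NFS4ERR_RESOURCE);
	*p = htonl(res->value);
	return 0;
}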
+--- linux-2.6.7/fs/nfs/callback.c.lsec 2005-03-23 14:28:22.484631512 -0700
++++ linux-2.6.7/fs/nfs/callback.c      2005-03-23 14:28:22.483631664 -0700
+@@ -0,0 +1,325 @@
++/*
++ * linux/fs/nfs/callback.c
++ *
++ * Copyright (C) 2004 Trond Myklebust
++ *
++ * NFSv4 callback handling
++ */
++
++#include <linux/config.h>
++#include <linux/completion.h>
++#include <linux/ip.h>
++#include <linux/module.h>
++#include <linux/smp_lock.h>
++#include <linux/sunrpc/svc.h>
++#include <linux/sunrpc/svcsock.h>
++#include <linux/nfs_fs.h>
++#include "callback.h"
++
++#define NFSDBG_FACILITY NFSDBG_CALLBACK
++
++struct nfs_callback_data {
++      unsigned int users;
++      struct svc_serv *serv;
++      pid_t pid;
++      struct completion started;
++      struct completion stopped;
++};
++
++static struct nfs_callback_data nfs_callback_info;
++static DECLARE_MUTEX(nfs_callback_sema);
++static struct svc_program nfs4_callback_program;
++
++unsigned short nfs_callback_tcpport;
++
++/*
++ * This is the callback kernel thread.
++ */
++static void nfs_callback_svc(struct svc_rqst *rqstp)
++{
++      struct svc_serv *serv = rqstp->rq_server;
++      int err;
++
++      __module_get(THIS_MODULE);
++      lock_kernel();
++
++      nfs_callback_info.pid = current->pid;
++      daemonize("nfsv4-svc");
++      /* Process request with signals blocked, but allow SIGKILL.  */
++      allow_signal(SIGKILL);
++
++      complete(&nfs_callback_info.started);
++
++      while (nfs_callback_info.users != 0 || !signalled()) {
++              /*
++               * Listen for a request on the socket
++               */
++              err = svc_recv(serv, rqstp, MAX_SCHEDULE_TIMEOUT);
++              if (err == -EAGAIN || err == -EINTR)
++                      continue;
++              if (err < 0) {
++                      printk(KERN_WARNING
++                                      "%s: terminating on error %d\n",
++                                      __FUNCTION__, -err);
++                      break;
++              }
++              dprintk("%s: request from %u.%u.%u.%u\n", __FUNCTION__,
++                              NIPQUAD(rqstp->rq_addr.sin_addr.s_addr));
++              svc_process(serv, rqstp);
++      }
++
++      nfs_callback_info.pid = 0;
++      complete(&nfs_callback_info.stopped);
++      unlock_kernel();
++      module_put_and_exit(0);
++}
++
++/*
++ * Bring up the server process if it is not already up.
++ */
++int nfs_callback_up(void)
++{
++      struct svc_serv *serv;
++      struct svc_sock *svsk;
++      int ret = 0;
++
++      lock_kernel();
++      down(&nfs_callback_sema);
++      if (nfs_callback_info.users++ || nfs_callback_info.pid != 0)
++              goto out;
++      init_completion(&nfs_callback_info.started);
++      init_completion(&nfs_callback_info.stopped);
++      serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE);
++      ret = -ENOMEM;
++      if (!serv)
++              goto out_err;
++      /* FIXME: We don't want to register this socket with the portmapper */
++      ret = svc_makesock(serv, IPPROTO_TCP, 0);
++      if (ret < 0)
++              goto out_destroy;
++      if (!list_empty(&serv->sv_permsocks)) {
++              svsk = list_entry(serv->sv_permsocks.next,
++                              struct svc_sock, sk_list);
++              nfs_callback_tcpport = ntohs(inet_sk(svsk->sk_sk)->sport);
++              dprintk ("Callback port = 0x%x\n", nfs_callback_tcpport);
++      } else
++              BUG();
++      ret = svc_create_thread(nfs_callback_svc, serv);
++      if (ret < 0)
++              goto out_destroy;
++      nfs_callback_info.serv = serv;
++      wait_for_completion(&nfs_callback_info.started);
++out:
++      up(&nfs_callback_sema);
++      unlock_kernel();
++      return ret;
++out_destroy:
++      svc_destroy(serv);
++out_err:
++      nfs_callback_info.users--;
++      goto out;
++}
++
++/*
++ * Kill the server process once the last user has gone away.
++ */
++int nfs_callback_down(void)
++{
++      int ret = 0;
++
++      lock_kernel();
++      down(&nfs_callback_sema);
++      if (--nfs_callback_info.users || nfs_callback_info.pid == 0)
++              goto out;
++      kill_proc(nfs_callback_info.pid, SIGKILL, 1);
++      wait_for_completion(&nfs_callback_info.stopped);
++out:
++      up(&nfs_callback_sema);
++      unlock_kernel();
++      return ret;
++}
++
++/*
++ * AUTH_NULL authentication
++ */
++static int nfs_callback_null_accept(struct svc_rqst *rqstp, u32 *authp)
++{
++      struct iovec    *argv = &rqstp->rq_arg.head[0];
++      struct iovec    *resv = &rqstp->rq_res.head[0];
++
++      if (argv->iov_len < 3*4)
++              return SVC_GARBAGE;
++
++      if (svc_getu32(argv) != 0) {
++              dprintk("svc: bad null cred\n");
++              *authp = rpc_autherr_badcred;
++              return SVC_DENIED;
++      }
++      if (svc_getu32(argv) != RPC_AUTH_NULL || svc_getu32(argv) != 0) {
++              dprintk("svc: bad null verf\n");
++               *authp = rpc_autherr_badverf;
++               return SVC_DENIED;
++      }
++
++      /* Signal that mapping to nobody uid/gid is required */
++      rqstp->rq_cred.cr_uid = (uid_t) -1;
++      rqstp->rq_cred.cr_gid = (gid_t) -1;
++      rqstp->rq_cred.cr_group_info = groups_alloc(0);
++      if (rqstp->rq_cred.cr_group_info == NULL)
++              return SVC_DROP; /* kmalloc failure - client must retry */
++
++      /* Put NULL verifier */
++      svc_putu32(resv, RPC_AUTH_NULL);
++      svc_putu32(resv, 0);
++      dprintk("%s: success, returning %d!\n", __FUNCTION__, SVC_OK);
++      return SVC_OK;
++}
++
++static int nfs_callback_null_release(struct svc_rqst *rqstp)
++{
++      if (rqstp->rq_cred.cr_group_info)
++              put_group_info(rqstp->rq_cred.cr_group_info);
++      rqstp->rq_cred.cr_group_info = NULL;
++      return 0; /* don't drop */
++}
++
++static struct auth_ops nfs_callback_auth_null = {
++      .name = "null",
++      .flavour = RPC_AUTH_NULL,
++      .accept = nfs_callback_null_accept,
++      .release = nfs_callback_null_release,
++};
++
++/*
++ * AUTH_SYS authentication
++ */
++static int nfs_callback_unix_accept(struct svc_rqst *rqstp, u32 *authp)
++{
++      struct iovec    *argv = &rqstp->rq_arg.head[0];
++      struct iovec    *resv = &rqstp->rq_res.head[0];
++      struct svc_cred *cred = &rqstp->rq_cred;
++      u32 slen, i;
++      int len = argv->iov_len;
++
++      dprintk("%s: start\n", __FUNCTION__);
++      cred->cr_group_info = NULL;
++      rqstp->rq_client = NULL;
++      if ((len -= 3*4) < 0)
++              return SVC_GARBAGE;
++
++      /* Get length, time stamp and machine name */
++      svc_getu32(argv);
++      svc_getu32(argv);
++      slen = XDR_QUADLEN(ntohl(svc_getu32(argv)));
++      if (slen > 64 || (len -= (slen + 3)*4) < 0)
++              goto badcred;
++      argv->iov_base = (void*)((u32*)argv->iov_base + slen);
++      argv->iov_len -= slen*4;
++
++      cred->cr_uid = ntohl(svc_getu32(argv));
++      cred->cr_gid = ntohl(svc_getu32(argv));
++      slen = ntohl(svc_getu32(argv));
++      if (slen > 16 || (len -= (slen + 2)*4) < 0)
++              goto badcred;
++      cred->cr_group_info = groups_alloc(slen);
++      if (cred->cr_group_info == NULL)
++              return SVC_DROP;
++      for (i = 0; i < slen; i++)
++              GROUP_AT(cred->cr_group_info, i) = ntohl(svc_getu32(argv));
++
++      if (svc_getu32(argv) != RPC_AUTH_NULL || svc_getu32(argv) != 0) {
++              *authp = rpc_autherr_badverf;
++              return SVC_DENIED;
++      }
++      /* Put NULL verifier */
++      svc_putu32(resv, RPC_AUTH_NULL);
++      svc_putu32(resv, 0);
++      dprintk("%s: success, returning %d!\n", __FUNCTION__, SVC_OK);
++      return SVC_OK;
++badcred:
++      *authp = rpc_autherr_badcred;
++      return SVC_DENIED;
++}
++
++static int nfs_callback_unix_release(struct svc_rqst *rqstp)
++{
++      if (rqstp->rq_cred.cr_group_info)
++              put_group_info(rqstp->rq_cred.cr_group_info);
++      rqstp->rq_cred.cr_group_info = NULL;
++      return 0;
++}
++
++static struct auth_ops nfs_callback_auth_unix = {
++      .name = "unix",
++      .flavour = RPC_AUTH_UNIX,
++      .accept = nfs_callback_unix_accept,
++      .release = nfs_callback_unix_release,
++};
++
++/*
++ * Hook the authentication protocol
++ */
++static int nfs_callback_auth(struct svc_rqst *rqstp, u32 *authp)
++{
++      struct in_addr *addr = &rqstp->rq_addr.sin_addr;
++      struct nfs4_client *clp;
++      struct iovec *argv = &rqstp->rq_arg.head[0];
++      int flavour;
++      int retval;
++
++      /* Don't talk to strangers */
++      clp = nfs4_find_client(addr);
++      if (clp == NULL)
++              return SVC_DROP;
++      dprintk("%s: %u.%u.%u.%u NFSv4 callback!\n", __FUNCTION__, NIPQUAD(addr));
++      nfs4_put_client(clp);
++      flavour = ntohl(svc_getu32(argv));
++      switch(flavour) {
++              case RPC_AUTH_NULL:
++                      if (rqstp->rq_proc != CB_NULL) {
++                              *authp = rpc_autherr_tooweak;
++                              retval = SVC_DENIED;
++                              break;
++                      }
++                      rqstp->rq_authop = &nfs_callback_auth_null;
++                      retval = nfs_callback_null_accept(rqstp, authp);
++                      break;
++              case RPC_AUTH_UNIX:
++                      /* Eat the authentication flavour */
++                      rqstp->rq_authop = &nfs_callback_auth_unix;
++                      retval = nfs_callback_unix_accept(rqstp, authp);
++                      break;
++              default:
++                      /* FIXME: need to add RPCSEC_GSS upcalls */
++#if 0
++                      svc_ungetu32(argv);
++                      retval = svc_authenticate(rqstp, authp);
++#else
++                      *authp = rpc_autherr_rejectedcred;
++                      retval = SVC_DENIED;
++#endif
++      }
++      dprintk("%s: flavour %d returning error %d\n", __FUNCTION__, flavour, retval);
++      return retval;
++}
++
++/*
++ * Define NFS4 callback program
++ */
++extern struct svc_version nfs4_callback_version1;
++
++static struct svc_version *nfs4_callback_version[] = {
++      [1] = &nfs4_callback_version1,
++};
++
++static struct svc_stat nfs4_callback_stats;
++
++static struct svc_program nfs4_callback_program = {
++      .pg_prog = NFS4_CALLBACK,                       /* RPC service number */
++      .pg_nvers = ARRAY_SIZE(nfs4_callback_version),  /* Number of entries */
++      .pg_vers = nfs4_callback_version,               /* version table */
++      .pg_name = "NFSv4 callback",                    /* service name */
++      .pg_class = "nfs",                              /* authentication class */
++      .pg_stats = &nfs4_callback_stats,
++      .pg_authenticate = nfs_callback_auth,
++};
+--- linux-2.6.7/fs/nfs/read.c.lsec     2004-06-15 23:18:37.000000000 -0600
++++ linux-2.6.7/fs/nfs/read.c  2005-03-23 14:28:23.114535752 -0700
+@@ -91,8 +91,8 @@ int nfs_return_empty_page(struct page *p
+ /*
+  * Read a page synchronously.
+  */
+-static int
+-nfs_readpage_sync(struct file *file, struct inode *inode, struct page *page)
++static int nfs_readpage_sync(struct nfs_open_context *ctx, struct inode *inode,
++              struct page *page)
+ {
+       unsigned int    rsize = NFS_SERVER(inode)->rsize;
+       unsigned int    count = PAGE_CACHE_SIZE;
+@@ -105,10 +105,11 @@ nfs_readpage_sync(struct file *file, str
+       memset(rdata, 0, sizeof(*rdata));
+       rdata->flags = (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0);
++      rdata->cred = ctx->cred;
+       rdata->inode = inode;
+       INIT_LIST_HEAD(&rdata->pages);
+       rdata->args.fh = NFS_FH(inode);
+-      rdata->args.lockowner = current->files;
++      rdata->args.context = ctx;
+       rdata->args.pages = &page;
+       rdata->args.pgbase = 0UL;
+       rdata->args.count = rsize;
+@@ -134,7 +135,7 @@ nfs_readpage_sync(struct file *file, str
+                       rdata->args.count);
+               lock_kernel();
+-              result = NFS_PROTO(inode)->read(rdata, file);
++              result = NFS_PROTO(inode)->read(rdata);
+               unlock_kernel();
+               /*
+@@ -169,8 +170,8 @@ io_error:
+       return result;
+ }
+-static int
+-nfs_readpage_async(struct file *file, struct inode *inode, struct page *page)
++static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
++              struct page *page)
+ {
+       LIST_HEAD(one_request);
+       struct nfs_page *new;
+@@ -179,7 +180,7 @@ nfs_readpage_async(struct file *file, st
+       len = nfs_page_length(inode, page);
+       if (len == 0)
+               return nfs_return_empty_page(page);
+-      new = nfs_create_request(file, inode, page, 0, len);
++      new = nfs_create_request(ctx, inode, page, 0, len);
+       if (IS_ERR(new)) {
+               unlock_page(page);
+               return PTR_ERR(new);
+@@ -202,8 +203,8 @@ static void nfs_readpage_release(struct 
+       nfs_unlock_request(req);
+       dprintk("NFS: read done (%s/%Ld %d@%Ld)\n",
+-                      req->wb_inode->i_sb->s_id,
+-                      (long long)NFS_FILEID(req->wb_inode),
++                      req->wb_context->dentry->d_inode->i_sb->s_id,
++                      (long long)NFS_FILEID(req->wb_context->dentry->d_inode),
+                       req->wb_bytes,
+                       (long long)req_offset(req));
+ }
+@@ -217,16 +218,15 @@ static void nfs_read_rpcsetup(struct nfs
+       struct inode            *inode;
+       data->req         = req;
+-      data->inode       = inode = req->wb_inode;
+-      data->cred        = req->wb_cred;
++      data->inode       = inode = req->wb_context->dentry->d_inode;
++      data->cred        = req->wb_context->cred;
+       data->args.fh     = NFS_FH(inode);
+       data->args.offset = req_offset(req) + offset;
+       data->args.pgbase = req->wb_pgbase + offset;
+       data->args.pages  = data->pagevec;
+       data->args.count  = count;
+-      data->args.lockowner = req->wb_lockowner;
+-      data->args.state  = req->wb_state;
++      data->args.context = req->wb_context;
+       data->res.fattr   = &data->fattr;
+       data->res.count   = count;
+@@ -396,7 +396,7 @@ nfs_pagein_list(struct list_head *head, 
+       while (!list_empty(head)) {
+               pages += nfs_coalesce_requests(head, &one_request, rpages);
+               req = nfs_list_entry(one_request.next);
+-              error = nfs_pagein_one(&one_request, req->wb_inode);
++              error = nfs_pagein_one(&one_request, req->wb_context->dentry->d_inode);
+               if (error < 0)
+                       break;
+       }
+@@ -500,9 +500,9 @@ void nfs_readpage_result(struct rpc_task
+  *  - The error flag is set for this page. This happens only when a
+  *    previous async read operation failed.
+  */
+-int
+-nfs_readpage(struct file *file, struct page *page)
++int nfs_readpage(struct file *file, struct page *page)
+ {
++      struct nfs_open_context *ctx;
+       struct inode *inode = page->mapping->host;
+       int             error;
+@@ -519,25 +519,33 @@ nfs_readpage(struct file *file, struct p
+       if (error)
+               goto out_error;
++      if (file == NULL) {
++              ctx = nfs_find_open_context(inode, FMODE_READ);
++              if (ctx == NULL)
++                      return -EBADF;
++      } else
++              ctx = get_nfs_open_context((struct nfs_open_context *)
++                              file->private_data);
+       if (!IS_SYNC(inode)) {
+-              error = nfs_readpage_async(file, inode, page);
++              error = nfs_readpage_async(ctx, inode, page);
+               goto out;
+       }
+-      error = nfs_readpage_sync(file, inode, page);
++      error = nfs_readpage_sync(ctx, inode, page);
+       if (error < 0 && IS_SWAPFILE(inode))
+               printk("Aiee.. nfs swap-in of page failed!\n");
+ out:
++      put_nfs_open_context(ctx);
+       return error;
+ out_error:
+       unlock_page(page);
+-      goto out;
++      return error;
+ }
+ struct nfs_readdesc {
+       struct list_head *head;
+-      struct file *filp;
++      struct nfs_open_context *ctx;
+ };
+ static int
+@@ -552,7 +560,7 @@ readpage_async_filler(void *data, struct
+       len = nfs_page_length(inode, page);
+       if (len == 0)
+               return nfs_return_empty_page(page);
+-      new = nfs_create_request(desc->filp, inode, page, 0, len);
++      new = nfs_create_request(desc->ctx, inode, page, 0, len);
+       if (IS_ERR(new)) {
+                       SetPageError(page);
+                       unlock_page(page);
+@@ -565,13 +573,11 @@ readpage_async_filler(void *data, struct
+       return 0;
+ }
+-int
+-nfs_readpages(struct file *filp, struct address_space *mapping,
++int nfs_readpages(struct file *filp, struct address_space *mapping,
+               struct list_head *pages, unsigned nr_pages)
+ {
+       LIST_HEAD(head);
+       struct nfs_readdesc desc = {
+-              .filp           = filp,
+               .head           = &head,
+       };
+       struct inode *inode = mapping->host;
+@@ -583,12 +589,20 @@ nfs_readpages(struct file *filp, struct 
+                       (long long)NFS_FILEID(inode),
+                       nr_pages);
++      if (filp == NULL) {
++              desc.ctx = nfs_find_open_context(inode, FMODE_READ);
++              if (desc.ctx == NULL)
++                      return -EBADF;
++      } else
++              desc.ctx = get_nfs_open_context((struct nfs_open_context *)
++                              filp->private_data);
+       ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
+       if (!list_empty(&head)) {
+               int err = nfs_pagein_list(&head, server->rpages);
+               if (!ret)
+                       ret = err;
+       }
++      put_nfs_open_context(desc.ctx);
+       return ret;
+ }
+--- linux-2.6.7/fs/nfs/Makefile.lsec   2004-06-15 23:19:01.000000000 -0600
++++ linux-2.6.7/fs/nfs/Makefile        2005-03-23 14:28:22.819580592 -0700
+@@ -9,6 +9,7 @@ nfs-y                  := dir.o file.o inode.o nfs2xdr
+ nfs-$(CONFIG_ROOT_NFS)        += nfsroot.o mount_clnt.o      
+ nfs-$(CONFIG_NFS_V3)  += nfs3proc.o nfs3xdr.o
+ nfs-$(CONFIG_NFS_V4)  += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
+-                         idmap.o
++                         delegation.o idmap.o \
++                         callback.o callback_xdr.o callback_proc.o
+ nfs-$(CONFIG_NFS_DIRECTIO) += direct.o
+ nfs-objs              := $(nfs-y)
+--- linux-2.6.7/fs/Kconfig.lsec        2004-06-15 23:19:36.000000000 -0600
++++ linux-2.6.7/fs/Kconfig     2005-03-23 14:28:23.871420688 -0700
+@@ -322,7 +322,7 @@ config FS_POSIX_ACL
+ #     Never use this symbol for ifdefs.
+ #
+       bool
+-      depends on EXT2_FS_POSIX_ACL || EXT3_FS_POSIX_ACL || JFS_POSIX_ACL || REISERFS_FS_POSIX_ACL
++      depends on EXT2_FS_POSIX_ACL || EXT3_FS_POSIX_ACL || JFS_POSIX_ACL || REISERFS_FS_POSIX_ACL || NFS_V4
+       default y
+ config XFS_FS
+@@ -1443,6 +1443,7 @@ config NFSD_V3
+ config NFSD_V4
+       bool "Provide NFSv4 server support (EXPERIMENTAL)"
+       depends on NFSD_V3 && EXPERIMENTAL
++      select NFSD_TCP
+       help
+         If you would like to include the NFSv4 server as well as the NFSv2
+         and NFSv3 servers, say Y here.  This feature is experimental, and
+@@ -1450,11 +1451,13 @@ config NFSD_V4
+         If unsure, say N.
+ config NFSD_TCP
+-      bool "Provide NFS server over TCP support (EXPERIMENTAL)"
+-      depends on NFSD && EXPERIMENTAL
++      bool "Provide NFS server over TCP support"
++      depends on NFSD
++      default y
+       help
+-        Enable NFS service over TCP connections.  This the officially
+-        still experimental, but seems to work well.
++        If you want your NFS server to support TCP connections, say Y here.
++        TCP connections usually perform better than the default UDP when
++        the network is lossy or congested.  If unsure, say Y.
+ config ROOT_NFS
+       bool "Root file system on NFS"
+@@ -1505,6 +1508,22 @@ config RPCSEC_GSS_KRB5
+         If unsure, say N.
++config RPCSEC_GSS_SPKM3
++      tristate "Secure RPC: SPKM3 mechanism (EXPERIMENTAL)"
++      depends on SUNRPC && EXPERIMENTAL
++      select SUNRPC_GSS
++      select CRYPTO
++      select CRYPTO_MD5
++      select CRYPTO_DES
++      help
++        Provides for secure RPC calls by means of a gss-api
++        mechanism based on the SPKM3 public-key mechanism.
++
++        Note: Requires an auxiliary userspace daemon which may be found on
++              http://www.citi.umich.edu/projects/nfsv4/
++
++        If unsure, say N.
++
+ config SMB_FS
+       tristate "SMB file system support (to mount Windows shares etc.)"
+       depends on INET
+--- linux-2.6.7/include/linux/fs.h.lsec        2005-03-23 14:26:03.300790672 -0700
++++ linux-2.6.7/include/linux/fs.h     2005-03-23 14:28:23.280510520 -0700
+@@ -632,7 +632,7 @@ struct file_lock {
+       struct file_lock *fl_next;      /* singly linked list for this inode  */
+       struct list_head fl_link;       /* doubly linked list of all locks */
+       struct list_head fl_block;      /* circular list of blocked processes */
+-      fl_owner_t fl_owner;
++      fl_owner_t fl_owner;            /* 0 if lock owned by a local process */
+       unsigned int fl_pid;
+       wait_queue_head_t fl_wait;
+       struct file *fl_file;
+--- linux-2.6.7/include/linux/nfs4.h.lsec      2004-06-15 23:19:22.000000000 -0600
++++ linux-2.6.7/include/linux/nfs4.h   2005-03-23 14:28:23.335502160 -0700
+@@ -13,8 +13,12 @@
+ #ifndef _LINUX_NFS4_H
+ #define _LINUX_NFS4_H
++#include <linux/types.h>
++#include <linux/list.h>
++
+ #define NFS4_VERIFIER_SIZE    8
+ #define NFS4_FHSIZE           128
++#define NFS4_MAXPATHLEN               PATH_MAX
+ #define NFS4_MAXNAMLEN                NAME_MAX
+ #define NFS4_ACCESS_READ        0x0001
+@@ -52,6 +56,60 @@
+ #define ACL4_SUPPORT_AUDIT_ACL 0x04
+ #define ACL4_SUPPORT_ALARM_ACL 0x08
++#define NFS4_ACE_FILE_INHERIT_ACE             0x00000001
++#define NFS4_ACE_DIRECTORY_INHERIT_ACE        0x00000002
++#define NFS4_ACE_NO_PROPAGATE_INHERIT_ACE     0x00000004
++#define NFS4_ACE_INHERIT_ONLY_ACE             0x00000008
++#define NFS4_ACE_SUCCESSFUL_ACCESS_ACE_FLAG   0x00000010
++#define NFS4_ACE_FAILED_ACCESS_ACE_FLAG       0x00000020
++#define NFS4_ACE_IDENTIFIER_GROUP             0x00000040
++#define NFS4_ACE_OWNER                        0x00000080
++#define NFS4_ACE_GROUP                        0x00000100
++#define NFS4_ACE_EVERYONE                     0x00000200
++
++#define NFS4_ACE_READ_DATA                    0x00000001
++#define NFS4_ACE_LIST_DIRECTORY               0x00000001
++#define NFS4_ACE_WRITE_DATA                   0x00000002
++#define NFS4_ACE_ADD_FILE                     0x00000002
++#define NFS4_ACE_APPEND_DATA                  0x00000004
++#define NFS4_ACE_ADD_SUBDIRECTORY             0x00000004
++#define NFS4_ACE_READ_NAMED_ATTRS             0x00000008
++#define NFS4_ACE_WRITE_NAMED_ATTRS            0x00000010
++#define NFS4_ACE_EXECUTE                      0x00000020
++#define NFS4_ACE_DELETE_CHILD                 0x00000040
++#define NFS4_ACE_READ_ATTRIBUTES              0x00000080
++#define NFS4_ACE_WRITE_ATTRIBUTES             0x00000100
++#define NFS4_ACE_DELETE                       0x00010000
++#define NFS4_ACE_READ_ACL                     0x00020000
++#define NFS4_ACE_WRITE_ACL                    0x00040000
++#define NFS4_ACE_WRITE_OWNER                  0x00080000
++#define NFS4_ACE_SYNCHRONIZE                  0x00100000
++#define NFS4_ACE_GENERIC_READ                 0x00120081
++#define NFS4_ACE_GENERIC_WRITE                0x00160106
++#define NFS4_ACE_GENERIC_EXECUTE              0x001200A0
++#define NFS4_ACE_MASK_ALL                     0x001F01FF
++
++enum nfs4_acl_whotype {
++      NFS4_ACL_WHO_NAMED = 0,
++      NFS4_ACL_WHO_OWNER,
++      NFS4_ACL_WHO_GROUP,
++      NFS4_ACL_WHO_EVERYONE,
++};
++
++struct nfs4_ace {
++      uint32_t        type;
++      uint32_t        flag;
++      uint32_t        access_mask;
++      int             whotype;
++      uid_t           who;
++      struct list_head l_ace;
++};
++
++struct nfs4_acl {
++      uint32_t        naces;
++      struct list_head ace_head;
++};
++
+ typedef struct { char data[NFS4_VERIFIER_SIZE]; } nfs4_verifier;
+ typedef struct { char data[16]; } nfs4_stateid;
+@@ -297,7 +355,7 @@ enum {
+       NFSPROC4_CLNT_COMMIT,
+       NFSPROC4_CLNT_OPEN,
+       NFSPROC4_CLNT_OPEN_CONFIRM,
+-      NFSPROC4_CLNT_OPEN_RECLAIM,
++      NFSPROC4_CLNT_OPEN_NOATTR,
+       NFSPROC4_CLNT_OPEN_DOWNGRADE,
+       NFSPROC4_CLNT_CLOSE,
+       NFSPROC4_CLNT_SETATTR,
+@@ -315,12 +373,16 @@ enum {
+       NFSPROC4_CLNT_REMOVE,
+       NFSPROC4_CLNT_RENAME,
+       NFSPROC4_CLNT_LINK,
++      NFSPROC4_CLNT_SYMLINK,
+       NFSPROC4_CLNT_CREATE,
+       NFSPROC4_CLNT_PATHCONF,
+       NFSPROC4_CLNT_STATFS,
+       NFSPROC4_CLNT_READLINK,
+       NFSPROC4_CLNT_READDIR,
+       NFSPROC4_CLNT_SERVER_CAPS,
++      NFSPROC4_CLNT_DELEGRETURN,
++      NFSPROC4_CLNT_GETACL,
++      NFSPROC4_CLNT_SETACL,
+ };
+ #endif
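The nfs4_ace and nfs4_acl structures added above chain ACEs on an ordinary list_head, so ACL consumers can walk them with the standard list iterators; the allocation and conversion helpers appear in the new nfs4_acl.h further down in this patch. A minimal sketch, using only the definitions visible in this hunk, of scanning an ACL for an entry that grants EVERYONE@ read access (the helper name is hypothetical and ACE type checking is omitted):

#include <linux/list.h>
#include <linux/nfs4.h>

/* Illustrative only: returns non-zero if some ACE grants READ_DATA
 * to the EVERYONE@ principal. */
static int demo_everyone_can_read(struct nfs4_acl *acl)
{
	struct nfs4_ace *ace;

	list_for_each_entry(ace, &acl->ace_head, l_ace) {
		if (ace->whotype == NFS4_ACL_WHO_EVERYONE &&
		    (ace->access_mask & NFS4_ACE_READ_DATA))
			return 1;
	}
	return 0;
}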
+--- linux-2.6.7/include/linux/nfs_page.h.lsec  2004-06-15 23:18:57.000000000 -0600
++++ linux-2.6.7/include/linux/nfs_page.h       2005-03-23 14:28:23.392493496 -0700
+@@ -29,14 +29,9 @@
+ struct nfs_page {
+       struct list_head        wb_list,        /* Defines state of page: */
+                               *wb_list_head;  /*      read/write/commit */
+-      struct file             *wb_file;
+-      fl_owner_t              wb_lockowner;
+-      struct inode            *wb_inode;
+-      struct rpc_cred         *wb_cred;
+-      struct nfs4_state       *wb_state;
+       struct page             *wb_page;       /* page to read in/write out */
++      struct nfs_open_context *wb_context;    /* File state context info */
+       atomic_t                wb_complete;    /* i/os we're waiting for */
+-      wait_queue_head_t       wb_wait;        /* wait queue */
+       unsigned long           wb_index;       /* Offset >> PAGE_CACHE_SHIFT */
+       unsigned int            wb_offset,      /* Offset & ~PAGE_CACHE_MASK */
+                               wb_pgbase,      /* Start of page data */
+@@ -50,9 +45,11 @@ struct nfs_page {
+ #define NFS_NEED_COMMIT(req)  (test_bit(PG_NEED_COMMIT,&(req)->wb_flags))
+ #define NFS_NEED_RESCHED(req) (test_bit(PG_NEED_RESCHED,&(req)->wb_flags))
+-extern        struct nfs_page *nfs_create_request(struct file *, struct inode *,
+-                                          struct page *,
+-                                          unsigned int, unsigned int);
++extern        struct nfs_page *nfs_create_request(struct nfs_open_context *ctx,
++                                          struct inode *inode,
++                                          struct page *page,
++                                          unsigned int offset,
++                                          unsigned int count);
+ extern        void nfs_clear_request(struct nfs_page *req);
+ extern        void nfs_release_request(struct nfs_page *req);
+@@ -64,6 +61,7 @@ extern       int nfs_scan_list(struct list_hea
+ extern        int nfs_coalesce_requests(struct list_head *, struct list_head *,
+                                 unsigned int);
+ extern  int nfs_wait_on_request(struct nfs_page *);
++extern        void nfs_unlock_request(struct nfs_page *req);
+ extern        spinlock_t nfs_wreq_lock;
+@@ -90,19 +88,6 @@ nfs_lock_request(struct nfs_page *req)
+       return 1;
+ }
+-static inline void
+-nfs_unlock_request(struct nfs_page *req)
+-{
+-      if (!NFS_WBACK_BUSY(req)) {
+-              printk(KERN_ERR "NFS: Invalid unlock attempted\n");
+-              BUG();
+-      }
+-      smp_mb__before_clear_bit();
+-      clear_bit(PG_BUSY, &req->wb_flags);
+-      smp_mb__after_clear_bit();
+-      wake_up_all(&req->wb_wait);
+-      nfs_release_request(req);
+-}
+ /**
+  * nfs_list_remove_request - Remove a request from its wb_list
+--- linux-2.6.7/include/linux/sunrpc/svc.h.lsec        2004-06-15 23:19:35.000000000 -0600
++++ linux-2.6.7/include/linux/sunrpc/svc.h     2005-03-23 14:28:23.541470848 -0700
+@@ -87,6 +87,14 @@ static inline u32 svc_getu32(struct iove
+       iov->iov_len -= sizeof(u32);
+       return val;
+ }
++
++static inline void svc_ungetu32(struct iovec *iov)
++{
++      u32 *vp = (u32 *)iov->iov_base;
++      iov->iov_base = (void *)(vp - 1);
++      iov->iov_len += sizeof(*vp);
++}
++
+ static inline void svc_putu32(struct iovec *iov, u32 val)
+ {
+       u32 *vp = iov->iov_base + iov->iov_len;
+@@ -243,6 +251,8 @@ struct svc_program {
+       char *                  pg_name;        /* service name */
+       char *                  pg_class;       /* class name: services sharing authentication */
+       struct svc_stat *       pg_stats;       /* rpc statistics */
++      /* Override authentication. NULL means use default */
++      int                     (*pg_authenticate)(struct svc_rqst *, u32 *);
+ };
+ /*
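svc_ungetu32() added above is the inverse of svc_getu32(): it backs the argument iovec up by one 32-bit word. Together with the new per-program pg_authenticate hook (NULL means the default RPC authentication path), a service can peek at the start of the argument stream without disturbing it for the normal decoder. A rough sketch; the assumption that the argument stream is reached via &rqstp->rq_arg.head[0] is ours, not the patch's:

#include <linux/sunrpc/svc.h>

/* Illustrative only: read the next on-wire word in host order and
 * immediately push it back so later decoding sees an untouched stream. */
static u32 demo_peek_u32(struct svc_rqst *rqstp)
{
	struct iovec *argv = &rqstp->rq_arg.head[0];
	u32 val = ntohl(svc_getu32(argv));

	svc_ungetu32(argv);
	return val;
}

A pg_authenticate routine built on such a helper would be installed by pointing the new pg_authenticate field of struct svc_program at it.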
+--- linux-2.6.7/include/linux/sunrpc/gss_spkm3.h.lsec  2005-03-23 14:28:24.186372808 -0700
++++ linux-2.6.7/include/linux/sunrpc/gss_spkm3.h       2005-03-23 14:28:24.185372960 -0700
+@@ -0,0 +1,61 @@
++/*
++ *  linux/include/linux/sunrpc/gss_spkm3.h
++ *
++ *  Copyright (c) 2000 The Regents of the University of Michigan.
++ *  All rights reserved.
++ *
++ *  Andy Adamson   <andros@umich.edu>
++ */
++
++#include <linux/sunrpc/auth_gss.h>
++#include <linux/sunrpc/gss_err.h>
++#include <linux/sunrpc/gss_asn1.h>
++
++struct spkm3_ctx {
++      struct xdr_netobj       ctx_id; /* per message context id */
++      int                     qop;         /* negotiated qop */
++      struct xdr_netobj       mech_used;
++      unsigned int            ret_flags ;
++      unsigned int            req_flags ;
++      struct xdr_netobj       share_key;
++      int                     conf_alg;
++      struct crypto_tfm*      derived_conf_key;
++      int                     intg_alg;
++      struct crypto_tfm*      derived_integ_key;
++      int                     keyestb_alg;   /* alg used to get share_key */
++      int                     owf_alg;   /* one way function */
++};
++
++/* from openssl/objects.h */
++/* XXX need SEAL_ALG_NONE */
++#define NID_md5               4
++#define NID_dhKeyAgreement    28
++#define NID_des_cbc           31
++#define NID_sha1              64
++#define NID_cast5_cbc         108
++
++/* SPKM InnerContext Token types */
++
++#define SPKM_ERROR_TOK        3
++#define SPKM_MIC_TOK  4
++#define SPKM_WRAP_TOK 5
++#define SPKM_DEL_TOK  6
++
++u32 spkm3_make_token(struct spkm3_ctx *ctx, int qop_req, struct xdr_buf * text, struct xdr_netobj * token, int toktype);
++
++u32 spkm3_read_token(struct spkm3_ctx *ctx, struct xdr_netobj *read_token, struct xdr_buf *message_buffer, int *qop_state, int toktype);
++
++#define CKSUMTYPE_RSA_MD5            0x0007
++
++s32 make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body,
++                   struct xdr_netobj *cksum);
++void asn1_bitstring_len(struct xdr_netobj *in, int *enclen, int *zerobits);
++int decode_asn1_bitstring(struct xdr_netobj *out, char *in, int enclen,
++                   int explen);
++void spkm3_mic_header(unsigned char **hdrbuf, unsigned int *hdrlen,
++                   unsigned char *ctxhdr, int elen, int zbit);
++void spkm3_make_mic_token(unsigned  char **tokp, int toklen,
++                   struct xdr_netobj *mic_hdr,
++                   struct xdr_netobj *md5cksum, int md5elen, int md5zbit);
++u32 spkm3_verify_mic_token(unsigned char **tokp, int *mic_hdrlen,
++                   unsigned char **cksum);
+--- linux-2.6.7/include/linux/sunrpc/sched.h.lsec      2004-06-15 23:19:42.000000000 -0600
++++ linux-2.6.7/include/linux/sunrpc/sched.h   2005-03-23 14:28:23.540471000 -0700
+@@ -11,7 +11,9 @@
+ #include <linux/timer.h>
+ #include <linux/sunrpc/types.h>
++#include <linux/spinlock.h>
+ #include <linux/wait.h>
++#include <linux/workqueue.h>
+ #include <linux/sunrpc/xdr.h>
+ /*
+@@ -25,11 +27,18 @@ struct rpc_message {
+       struct rpc_cred *       rpc_cred;       /* Credentials */
+ };
++struct rpc_wait_queue;
++struct rpc_wait {
++      struct list_head        list;           /* wait queue links */
++      struct list_head        links;          /* Links to related tasks */
++      wait_queue_head_t       waitq;          /* sync: sleep on this q */
++      struct rpc_wait_queue * rpc_waitq;      /* RPC wait queue we're on */
++};
++
+ /*
+  * This is the RPC task struct
+  */
+ struct rpc_task {
+-      struct list_head        tk_list;        /* wait queue links */
+ #ifdef RPC_DEBUG
+       unsigned long           tk_magic;       /* 0xf00baa */
+ #endif
+@@ -37,7 +46,6 @@ struct rpc_task {
+       struct rpc_clnt *       tk_client;      /* RPC client */
+       struct rpc_rqst *       tk_rqstp;       /* RPC request */
+       int                     tk_status;      /* result of last operation */
+-      struct rpc_wait_queue * tk_rpcwait;     /* RPC wait queue we're on */
+       /*
+        * RPC call state
+@@ -70,13 +78,18 @@ struct rpc_task {
+        * you have a pathological interest in kernel oopses.
+        */
+       struct timer_list       tk_timer;       /* kernel timer */
+-      wait_queue_head_t       tk_wait;        /* sync: sleep on this q */
+       unsigned long           tk_timeout;     /* timeout for rpc_sleep() */
+       unsigned short          tk_flags;       /* misc flags */
+       unsigned char           tk_active   : 1;/* Task has been activated */
+       unsigned char           tk_priority : 2;/* Task priority */
+       unsigned long           tk_runstate;    /* Task run status */
+-      struct list_head        tk_links;       /* links to related tasks */
++      struct workqueue_struct *tk_workqueue;  /* Normally rpciod, but could
++                                               * be any workqueue
++                                               */
++      union {
++              struct work_struct      tk_work;        /* Async task work queue */
++              struct rpc_wait         tk_wait;        /* RPC wait */
++      } u;
+ #ifdef RPC_DEBUG
+       unsigned short          tk_pid;         /* debugging aid */
+ #endif
+@@ -87,11 +100,11 @@ struct rpc_task {
+ /* support walking a list of tasks on a wait queue */
+ #define       task_for_each(task, pos, head) \
+       list_for_each(pos, head) \
+-              if ((task=list_entry(pos, struct rpc_task, tk_list)),1)
++              if ((task=list_entry(pos, struct rpc_task, u.tk_wait.list)),1)
+ #define       task_for_first(task, head) \
+       if (!list_empty(head) &&  \
+-          ((task=list_entry((head)->next, struct rpc_task, tk_list)),1))
++          ((task=list_entry((head)->next, struct rpc_task, u.tk_wait.list)),1))
+ /* .. and walking list of all tasks */
+ #define       alltask_for_each(task, pos, head) \
+@@ -124,22 +137,24 @@ typedef void                     (*rpc_action)(struct rpc_
+ #define RPC_DO_CALLBACK(t)    ((t)->tk_callback != NULL)
+ #define RPC_IS_SOFT(t)                ((t)->tk_flags & RPC_TASK_SOFT)
+-#define RPC_TASK_SLEEPING     0
+-#define RPC_TASK_RUNNING      1
+-#define RPC_IS_SLEEPING(t)    (test_bit(RPC_TASK_SLEEPING, &(t)->tk_runstate))
+-#define RPC_IS_RUNNING(t)     (test_bit(RPC_TASK_RUNNING, &(t)->tk_runstate))
++#define RPC_TASK_RUNNING      0
++#define RPC_TASK_QUEUED               1
++#define RPC_IS_RUNNING(t)     (test_bit(RPC_TASK_RUNNING, &(t)->tk_runstate))
+ #define rpc_set_running(t)    (set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate))
+-#define rpc_clear_running(t)  (clear_bit(RPC_TASK_RUNNING, &(t)->tk_runstate))
+-
+-#define rpc_set_sleeping(t)   (set_bit(RPC_TASK_SLEEPING, &(t)->tk_runstate))
+-
+-#define rpc_clear_sleeping(t) \
++#define rpc_test_and_set_running(t) \
++                              (test_and_set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate))
++#define rpc_clear_running(t)  \
+       do { \
+               smp_mb__before_clear_bit(); \
+-              clear_bit(RPC_TASK_SLEEPING, &(t)->tk_runstate); \
++              clear_bit(RPC_TASK_RUNNING, &(t)->tk_runstate); \
+               smp_mb__after_clear_bit(); \
+-      } while(0)
++      } while (0)
++
++#define RPC_IS_QUEUED(t)      (test_bit(RPC_TASK_QUEUED, &(t)->tk_runstate))
++#define rpc_set_queued(t)     (set_bit(RPC_TASK_QUEUED, &(t)->tk_runstate))
++#define rpc_test_and_clear_queued(t) \
++              (test_and_clear_bit(RPC_TASK_QUEUED, &(t)->tk_runstate))
+ /*
+  * Task priorities.
+@@ -155,6 +170,7 @@ typedef void                       (*rpc_action)(struct rpc_
+  * RPC synchronization objects
+  */
+ struct rpc_wait_queue {
++      spinlock_t              lock;
+       struct list_head        tasks[RPC_NR_PRIORITY]; /* task queue for each priority level */
+       unsigned long           cookie;                 /* cookie of last task serviced */
+       unsigned char           maxpriority;            /* maximum priority (0 if queue is not a priority queue) */
+@@ -175,6 +191,7 @@ struct rpc_wait_queue {
+ #ifndef RPC_DEBUG
+ # define RPC_WAITQ_INIT(var,qname) { \
++              .lock = SPIN_LOCK_UNLOCKED, \
+               .tasks = { \
+                       [0] = LIST_HEAD_INIT(var.tasks[0]), \
+                       [1] = LIST_HEAD_INIT(var.tasks[1]), \
+@@ -183,6 +200,7 @@ struct rpc_wait_queue {
+       }
+ #else
+ # define RPC_WAITQ_INIT(var,qname) { \
++              .lock = SPIN_LOCK_UNLOCKED, \
+               .tasks = { \
+                       [0] = LIST_HEAD_INIT(var.tasks[0]), \
+                       [1] = LIST_HEAD_INIT(var.tasks[1]), \
+@@ -207,13 +225,10 @@ void             rpc_killall_tasks(struct rpc_clnt 
+ int           rpc_execute(struct rpc_task *);
+ void          rpc_run_child(struct rpc_task *parent, struct rpc_task *child,
+                                       rpc_action action);
+-int           rpc_add_wait_queue(struct rpc_wait_queue *, struct rpc_task *);
+-void          rpc_remove_wait_queue(struct rpc_task *);
+ void          rpc_init_priority_wait_queue(struct rpc_wait_queue *, const char *);
+ void          rpc_init_wait_queue(struct rpc_wait_queue *, const char *);
+ void          rpc_sleep_on(struct rpc_wait_queue *, struct rpc_task *,
+                                       rpc_action action, rpc_action timer);
+-void          rpc_add_timer(struct rpc_task *, rpc_action);
+ void          rpc_wake_up_task(struct rpc_task *);
+ void          rpc_wake_up(struct rpc_wait_queue *);
+ struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *);
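The scheduler rework above removes the per-task wait_queue_head and folds queue membership into the tk_wait/tk_work union, with each rpc_wait_queue now protected by its own spinlock and task state tracked through the RPC_TASK_RUNNING/RPC_TASK_QUEUED bits. Code that merely uses a wait queue still relies only on the declarations left in this header; a sketch, where names other than the declared functions are hypothetical:

#include <linux/sunrpc/sched.h>

static struct rpc_wait_queue demo_waitq;

static void demo_init(void)
{
	rpc_init_wait_queue(&demo_waitq, "demo");
}

/* Park a task; under the new scheme this presumably sets RPC_TASK_QUEUED
 * and links u.tk_wait.list onto the queue under demo_waitq.lock. */
static void demo_pause(struct rpc_task *task)
{
	rpc_sleep_on(&demo_waitq, task, NULL, NULL);
}

static void demo_resume(void)
{
	rpc_wake_up(&demo_waitq);	/* wake everything parked above */
}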
+--- linux-2.6.7/include/linux/sunrpc/gss_api.h.lsec    2004-06-15 23:20:03.000000000 -0600
++++ linux-2.6.7/include/linux/sunrpc/gss_api.h 2005-03-23 14:28:24.688296504 -0700
+@@ -47,6 +47,18 @@ u32 gss_verify_mic(
+               struct xdr_buf          *message,
+               struct xdr_netobj       *mic_token,
+               u32                     *qstate);
++u32 gss_wrap(
++              struct gss_ctx          *ctx_id,
++              u32                     qop,
++              int                     offset,
++              struct xdr_buf          *outbuf,
++              struct page             **inpages);
++u32 gss_unwrap(
++              struct gss_ctx          *ctx_id,
++              u32                     *qop,
++              int                     offset,
++              struct xdr_buf          *inbuf,
++              int                     *out_offset);
+ u32 gss_delete_sec_context(
+               struct gss_ctx          **ctx_id);
+@@ -93,6 +105,18 @@ struct gss_api_ops {
+                       struct xdr_buf          *message,
+                       struct xdr_netobj       *mic_token,
+                       u32                     *qstate);
++      u32 (*gss_wrap)(
++                      struct gss_ctx          *ctx_id,
++                      u32                     qop,
++                      int                     offset,
++                      struct xdr_buf          *outbuf,
++                      struct page             **inpages);
++      u32 (*gss_unwrap)(
++                      struct gss_ctx          *ctx_id,
++                      u32                     *qop,
++                      int                     offset,
++                      struct xdr_buf          *buf,
++                      int                     *out_offset);
+       void (*gss_delete_sec_context)(
+                       void                    *internal_ctx_id);
+ };
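gss_wrap() and gss_unwrap() extend the mechanism-neutral GSS interface from integrity (gss_get_mic/gss_verify_mic) to privacy: the caller passes an xdr_buf plus the offset of the payload to protect, and the mechanism rewrites the buffer, presumably using the scratch pages that the xprt.h hunk below adds to struct rpc_rqst. A hedged sketch of the calling convention, built only from the prototypes added here (GSS_S_COMPLETE comes from gss_err.h):

#include <linux/errno.h>
#include <linux/sunrpc/gss_api.h>
#include <linux/sunrpc/gss_err.h>

/* Illustrative only: privacy-protect an outgoing buffer, then undo the
 * protection on the matching reply. */
static int demo_priv_roundtrip(struct gss_ctx *ctx, int data_offset,
			       struct xdr_buf *snd_buf, struct page **pages,
			       struct xdr_buf *rcv_buf)
{
	u32 qop = 0;
	int out_offset;
	u32 maj;

	maj = gss_wrap(ctx, 0 /* qop */, data_offset, snd_buf, pages);
	if (maj != GSS_S_COMPLETE)
		return -EACCES;

	/* ... transmit snd_buf and receive the reply into rcv_buf ... */

	maj = gss_unwrap(ctx, &qop, data_offset, rcv_buf, &out_offset);
	if (maj != GSS_S_COMPLETE)
		return -EACCES;
	/* out_offset now locates the decrypted payload within rcv_buf. */
	return 0;
}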
+--- linux-2.6.7/include/linux/sunrpc/xprt.h.lsec       2004-06-15 23:19:43.000000000 -0600
++++ linux-2.6.7/include/linux/sunrpc/xprt.h    2005-03-23 14:28:24.783282064 -0700
+@@ -95,7 +95,10 @@ struct rpc_rqst {
+       int                     rq_cong;        /* has incremented xprt->cong */
+       int                     rq_received;    /* receive completed */
+       u32                     rq_seqno;       /* gss seq no. used on req. */
+-
++      int                     rq_enc_pages_num;
++      struct page             **rq_enc_pages; /* scratch pages for use by
++                                                 gss privacy code */
++      void (*rq_release_snd_buf)(struct rpc_rqst *); /* release rq_enc_pages */
+       struct list_head        rq_list;
+       struct xdr_buf          rq_private_buf;         /* The receive buffer
+--- linux-2.6.7/include/linux/sunrpc/gss_krb5.h.lsec   2004-06-15 23:19:29.000000000 -0600
++++ linux-2.6.7/include/linux/sunrpc/gss_krb5.h        2005-03-23 14:28:24.840273400 -0700
+@@ -53,6 +53,8 @@ struct krb5_ctx {
+       struct xdr_netobj       mech_used;
+ };
++extern spinlock_t krb5_seq_lock;
++
+ #define KG_TOK_MIC_MSG    0x0101
+ #define KG_TOK_WRAP_MSG   0x0201
+@@ -116,18 +118,25 @@ enum seal_alg {
+ s32
+ make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body,
+-                 struct xdr_netobj *cksum);
++              int body_offset, struct xdr_netobj *cksum);
+ u32
+ krb5_make_token(struct krb5_ctx *context_handle, int qop_req,
+       struct xdr_buf *input_message_buffer,
+-      struct xdr_netobj *output_message_buffer, int toktype);
++      struct xdr_netobj *output_message_buffer);
+ u32
+ krb5_read_token(struct krb5_ctx *context_handle,
+         struct xdr_netobj *input_token_buffer,
+-        struct xdr_buf *message_buffer,
+-        int *qop_state, int toktype);
++        struct xdr_buf *message_buffer, int *qop_state);
++
++u32
++gss_wrap_kerberos(struct gss_ctx *ctx_id, u32 qop, int offset,
++              struct xdr_buf *outbuf, struct page **pages);
++
++u32
++gss_unwrap_kerberos(struct gss_ctx *ctx_id, u32 *qop, int offset,
++              struct xdr_buf *buf, int *out_offset);
+ u32
+ krb5_encrypt(struct crypto_tfm * key,
+@@ -137,6 +146,13 @@ u32
+ krb5_decrypt(struct crypto_tfm * key,
+            void *iv, void *in, void *out, int length); 
++int
++gss_encrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *outbuf, int offset,
++              struct page **pages);
++
++int
++gss_decrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *inbuf, int offset);
++
+ s32
+ krb5_make_seq_num(struct crypto_tfm * key,
+               int direction,
+--- linux-2.6.7/include/linux/sunrpc/gss_asn1.h.lsec   2004-06-15 23:20:04.000000000 -0600
++++ linux-2.6.7/include/linux/sunrpc/gss_asn1.h        2005-03-23 14:28:23.706445768 -0700
+@@ -69,7 +69,6 @@ u32 g_verify_token_header(
+      struct xdr_netobj *mech,
+      int *body_size,
+      unsigned char **buf_in,
+-     int tok_type,
+      int toksize);
+ u32 g_get_mech_oid(struct xdr_netobj *mech, struct xdr_netobj * in_buf);
+--- linux-2.6.7/include/linux/sunrpc/cache.h.lsec      2004-06-15 23:19:28.000000000 -0600
++++ linux-2.6.7/include/linux/sunrpc/cache.h   2005-03-23 14:28:24.349348032 -0700
+@@ -128,20 +128,17 @@ struct cache_deferred_req {
+  * just like a template in C++, this macro does cache lookup
+  * for us.
+  * The function is passed some sort of HANDLE from which a cache_detail
+- * structure can be determined (via SETUP, DETAIL), a template
++ * structure can be determined (via DETAIL), a template
+  * cache entry (type RTN*), and a "set" flag.  Using the HASHFN and the 
+  * TEST, the function will try to find a matching cache entry in the cache.
+  * If "set" == 0 :
+  *    If an entry is found, it is returned
+  *    If no entry is found, a new non-VALID entry is created.
+- * If "set" == 1 and INPLACE == 0 :
++ * If "set" == 1:
+  *    If no entry is found a new one is inserted with data from "template"
+  *    If a non-CACHE_VALID entry is found, it is updated from template using UPDATE
+  *    If a CACHE_VALID entry is found, a new entry is swapped in with data
+  *       from "template"
+- * If set == 1, and INPLACE == 1 :
+- *    As above, except that if a CACHE_VALID entry is found, we UPDATE in place
+- *       instead of swapping in a new entry.
+  *
+  * If the passed handle has the CACHE_NEGATIVE flag set, then UPDATE is not
+  * run but insteead CACHE_NEGATIVE is set in any new item.
+@@ -153,21 +150,18 @@ struct cache_deferred_req {
+  * MEMBER is the member of the cache which is cache_head, which must be first
+  * FNAME is the name for the function 
+  * ARGS are arguments to function and must contain RTN *item, int set.  May
+- *   also contain something to be usedby SETUP or DETAIL to find cache_detail.
+- * SETUP  locates the cache detail and makes it available as...
+- * DETAIL identifies the cache detail, possibly set up by SETUP
++ *   also contain something to be used by DETAIL to find cache_detail.
++ * DETAIL identifies the cache detail
+  * HASHFN returns a hash value of the cache entry "item"
+  * TEST  tests if "tmp" matches "item"
+  * INIT copies key information from "item" to "new"
+  * UPDATE copies content information from "item" to "tmp"
+- * INPLACE is true if updates can happen inplace rather than allocating a new structure
+  */
+-#define DefineCacheLookup(RTN,MEMBER,FNAME,ARGS,SETUP,DETAIL,HASHFN,TEST,INIT,UPDATE,INPLACE) \
++#define DefineCacheLookup(RTN,MEMBER,FNAME,ARGS,DETAIL,HASHFN,TEST,INIT,UPDATE)       \
+ RTN *FNAME ARGS                                                                               \
+ {                                                                                     \
+       RTN *tmp, *new=NULL;                                                            \
+       struct cache_head **hp, **head;                                                 \
+-      SETUP;                                                                          \
+       head = &(DETAIL)->hash_table[HASHFN];                                           \
+  retry:                                                                                       \
+       if (set||new) write_lock(&(DETAIL)->hash_lock);                                 \
+@@ -176,14 +170,14 @@ RTN *FNAME ARGS                                                                          \
+               tmp = container_of(*hp, RTN, MEMBER);                                   \
+               if (TEST) { /* found a match */                                         \
+                                                                                       \
+-                      if (set && !INPLACE && test_bit(CACHE_VALID, &tmp->MEMBER.flags) && !new) \
++                      if (set && test_bit(CACHE_VALID, &tmp->MEMBER.flags) && !new) \
+                               break;                                                  \
+                                                                                       \
+                       if (new)                                                        \
+                               {INIT;}                                                 \
+                       cache_get(&tmp->MEMBER);                                        \
+                       if (set) {                                                      \
+-                              if (!INPLACE && test_bit(CACHE_VALID, &tmp->MEMBER.flags))\
++                              if (test_bit(CACHE_VALID, &tmp->MEMBER.flags))\
+                               { /* need to swap in new */                             \
+                                       RTN *t2;                                        \
+                                                                                       \
+@@ -205,7 +199,7 @@ RTN *FNAME ARGS                                                                            \
+                       else read_unlock(&(DETAIL)->hash_lock);                         \
+                       if (set)                                                        \
+                               cache_fresh(DETAIL, &tmp->MEMBER, item->MEMBER.expiry_time); \
+-                      if (set && !INPLACE && new) cache_fresh(DETAIL, &new->MEMBER, 0);       \
++                      if (set && new) cache_fresh(DETAIL, &new->MEMBER, 0);   \
+                       if (new) (DETAIL)->cache_put(&new->MEMBER, DETAIL);             \
+                       return tmp;                                                     \
+               }                                                                       \
+@@ -233,16 +227,15 @@ RTN *FNAME ARGS                                                                          \
+       new = kmalloc(sizeof(*new), GFP_KERNEL);                                        \
+       if (new) {                                                                      \
+               cache_init(&new->MEMBER);                                               \
+-              cache_get(&new->MEMBER);                                                \
+               goto retry;                                                             \
+       }                                                                               \
+       return NULL;                                                                    \
+ }
+-#define DefineSimpleCacheLookup(STRUCT,INPLACE)       \
+-      DefineCacheLookup(struct STRUCT, h, STRUCT##_lookup, (struct STRUCT *item, int set), /*no setup */,     \
++#define DefineSimpleCacheLookup(STRUCT)       \
++      DefineCacheLookup(struct STRUCT, h, STRUCT##_lookup, (struct STRUCT *item, int set),    \
+                         & STRUCT##_cache, STRUCT##_hash(item), STRUCT##_match(item, tmp),\
+-                        STRUCT##_init(new, item), STRUCT##_update(tmp, item),INPLACE)
++                        STRUCT##_init(new, item), STRUCT##_update(tmp, item))
+ #define cache_for_each(pos, detail, index, member)                                            \
+       for (({read_lock(&(detail)->hash_lock); index = (detail)->hash_size;}) ;                \
+--- linux-2.6.7/include/linux/sunrpc/xdr.h.lsec        2004-06-15 23:20:26.000000000 -0600
++++ linux-2.6.7/include/linux/sunrpc/xdr.h     2005-03-23 14:28:24.783282064 -0700
+@@ -192,6 +192,7 @@ extern void xdr_write_pages(struct xdr_s
+ extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, uint32_t *p);
+ extern uint32_t *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes);
+ extern void xdr_read_pages(struct xdr_stream *xdr, unsigned int len);
++extern void truncate_xdr_buf(struct xdr_buf *xdr, int len);
+ #endif /* __KERNEL__ */
+--- linux-2.6.7/include/linux/nfsd/state.h.lsec        2004-06-15 23:18:56.000000000 -0600
++++ linux-2.6.7/include/linux/nfsd/state.h     2005-03-23 14:28:24.081388768 -0700
+@@ -38,6 +38,7 @@
+ #define _NFSD4_STATE_H
+ #include <linux/list.h>
++#include <linux/sunrpc/clnt.h>
+ #define NFS4_OPAQUE_LIMIT 1024
+ typedef struct {
+@@ -65,6 +66,22 @@ extern stateid_t onestateid;
+ #define ZERO_STATEID(stateid)       (!memcmp((stateid), &zerostateid, sizeof(stateid_t)))
+ #define ONE_STATEID(stateid)        (!memcmp((stateid), &onestateid, sizeof(stateid_t)))
++/* client delegation callback info */
++struct nfs4_callback {
++      /* SETCLIENTID info */
++      u32                     cb_parsed;  /* addr parsed */
++      u32                     cb_addr;
++      unsigned short          cb_port;
++      u32                     cb_prog;
++      u32                     cb_ident;
++      struct xdr_netobj       cb_netid;
++      /* RPC client info */
++      u32                     cb_set;     /* successful CB_NULL call */
++      struct rpc_program      cb_program;
++      struct rpc_stat         cb_stat;
++      struct rpc_clnt *       cb_client;
++};
++
+ /*
+  * struct nfs4_client - one per client.  Clientids live here.
+  *    o Each nfs4_client is hashed by clientid.
+@@ -87,6 +104,21 @@ struct nfs4_client {
+       struct svc_cred         cl_cred;        /* setclientid principal */
+       clientid_t              cl_clientid;    /* generated by server */
+       nfs4_verifier           cl_confirm;     /* generated by server */
++      struct nfs4_callback    cl_callback;    /* callback info */
++      time_t                  cl_first_state; /* first state acquisition */
++      atomic_t                cl_count;       /* ref count */
++};
++
++/* struct nfs4_client_reclaim
++ * one per old client. Populates reset_str_hashtbl. Filled from conf_id_hashtbl
++ * upon lease reset, or from upcall to state_daemon (to read in state
++ * from non-volatile storage) upon reboot.
++ */
++struct nfs4_client_reclaim {
++      struct list_head        cr_strhash;     /* hash by cr_name */
++      struct xdr_netobj       cr_name;        /* id generated by client */
++      time_t                  cr_first_state; /* first state acquisition */
++      u32                     cr_expired;     /* boolean: lease expired? */
+ };
+ static inline void
+@@ -216,5 +248,8 @@ extern int nfs4_share_conflict(struct sv
+ extern void nfs4_lock_state(void);
+ extern void nfs4_unlock_state(void);
+ extern int nfs4_in_grace(void);
+-extern int nfs4_in_no_grace(void);
++extern int nfs4_check_open_reclaim(clientid_t *clid);
++extern void nfsd4_probe_callback(struct nfs4_client *clp);
++extern void expire_client(struct nfs4_client *clp);
++extern void put_nfs4_client(struct nfs4_client *clp);
+ #endif   /* NFSD4_STATE_H */
+--- linux-2.6.7/include/linux/nfsd/nfsd.h.lsec 2004-06-15 23:20:04.000000000 -0600
++++ linux-2.6.7/include/linux/nfsd/nfsd.h      2005-03-23 14:28:24.133380864 -0700
+@@ -76,6 +76,11 @@ int         nfsd_lookup(struct svc_rqst *, stru
+                               const char *, int, struct svc_fh *);
+ int           nfsd_setattr(struct svc_rqst *, struct svc_fh *,
+                               struct iattr *, int, time_t);
++#ifdef CONFIG_NFSD_V4
++int             nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *,
++                    struct nfs4_acl *);
++int             nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **);
++#endif /* CONFIG_NFSD_V4 */
+ int           nfsd_create(struct svc_rqst *, struct svc_fh *,
+                               char *name, int len, struct iattr *attrs,
+                               int type, dev_t rdev, struct svc_fh *res);
+@@ -126,9 +131,13 @@ int               nfsd_permission(struct svc_export *
+ #ifdef CONFIG_NFSD_V4
+ void nfs4_state_init(void);
+ void nfs4_state_shutdown(void);
++time_t nfs4_lease_time(void);
++void nfs4_reset_lease(time_t leasetime);
+ #else
+ void static inline nfs4_state_init(void){}
+ void static inline nfs4_state_shutdown(void){}
++time_t static inline nfs4_lease_time(void){return 0;}
++void static inline nfs4_reset_lease(time_t leasetime){}
+ #endif
+ /*
+@@ -249,12 +258,11 @@ static inline int is_fsid(struct svc_fh 
+ #define       COMPOUND_SLACK_SPACE            140    /* OP_GETFH */
+ #define COMPOUND_ERR_SLACK_SPACE      12     /* OP_SETATTR */
+-#define NFSD_LEASE_TIME                       60  /* seconds */
++#define NFSD_LEASE_TIME                 (nfs4_lease_time())
+ #define NFSD_LAUNDROMAT_MINTIMEOUT      10   /* seconds */
+ /*
+  * The following attributes are currently not supported by the NFSv4 server:
+- *    ACL           (will be supported in a forthcoming patch)
+  *    ARCHIVE       (deprecated anyway)
+  *    FS_LOCATIONS  (will be supported eventually)
+  *    HIDDEN        (unlikely to be supported any time soon)
+@@ -274,7 +282,7 @@ static inline int is_fsid(struct svc_fh 
+  | FATTR4_WORD0_FILEHANDLE      | FATTR4_WORD0_FILEID       | FATTR4_WORD0_FILES_AVAIL      \
+  | FATTR4_WORD0_FILES_FREE      | FATTR4_WORD0_FILES_TOTAL  | FATTR4_WORD0_HOMOGENEOUS      \
+  | FATTR4_WORD0_MAXFILESIZE     | FATTR4_WORD0_MAXLINK      | FATTR4_WORD0_MAXNAME          \
+- | FATTR4_WORD0_MAXREAD         | FATTR4_WORD0_MAXWRITE)
++ | FATTR4_WORD0_MAXREAD         | FATTR4_WORD0_MAXWRITE     | FATTR4_WORD0_ACL)
+ #define NFSD_SUPPORTED_ATTRS_WORD1                                                          \
+ (FATTR4_WORD1_MODE              | FATTR4_WORD1_NO_TRUNC     | FATTR4_WORD1_NUMLINKS         \
+@@ -289,7 +297,8 @@ static inline int is_fsid(struct svc_fh 
+ (FATTR4_WORD1_TIME_ACCESS_SET   | FATTR4_WORD1_TIME_MODIFY_SET)
+ /* These are the only attrs allowed in CREATE/OPEN/SETATTR. */
+-#define NFSD_WRITEABLE_ATTRS_WORD0                            FATTR4_WORD0_SIZE
++#define NFSD_WRITEABLE_ATTRS_WORD0                                                          \
++(FATTR4_WORD0_SIZE              | FATTR4_WORD0_ACL                                         )
+ #define NFSD_WRITEABLE_ATTRS_WORD1                                                          \
+ (FATTR4_WORD1_MODE              | FATTR4_WORD1_OWNER         | FATTR4_WORD1_OWNER_GROUP     \
+  | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_MODIFY_SET)
+--- linux-2.6.7/include/linux/nfsd/xdr4.h.lsec 2004-06-15 23:18:59.000000000 -0600
++++ linux-2.6.7/include/linux/nfsd/xdr4.h      2005-03-23 14:28:24.082388616 -0700
+@@ -39,6 +39,8 @@
+ #ifndef _LINUX_NFSD_XDR4_H
+ #define _LINUX_NFSD_XDR4_H
++#include <linux/nfs4.h>
++
+ #define NFSD4_MAX_TAGLEN      128
+ #define XDR_LEN(n)                     (((n) + 3) & ~3)
+@@ -95,6 +97,7 @@ struct nfsd4_create {
+       u32             cr_bmval[2];        /* request */
+       struct iattr    cr_iattr;           /* request */
+       struct nfsd4_change_info  cr_cinfo; /* response */
++      struct nfs4_acl *cr_acl;
+ };
+ #define cr_linklen    u.link.namelen
+ #define cr_linkname   u.link.name
+@@ -216,7 +219,7 @@ struct nfsd4_open {
+       u32             op_rflags;          /* response */
+       int             op_truncate;        /* used during processing */
+       struct nfs4_stateowner *op_stateowner; /* used during processing */
+-
++      struct nfs4_acl *op_acl;
+ };
+ #define op_iattr      u.iattr
+ #define op_verf               u.verf
+@@ -291,6 +294,7 @@ struct nfsd4_setattr {
+       stateid_t       sa_stateid;         /* request */
+       u32             sa_bmval[2];        /* request */
+       struct iattr    sa_iattr;           /* request */
++      struct nfs4_acl *sa_acl;
+ };
+ struct nfsd4_setclientid {
+@@ -378,6 +382,7 @@ struct nfsd4_compoundargs {
+       u32 *                           tmpp;
+       struct tmpbuf {
+               struct tmpbuf *next;
++              void (*release)(const void *);
+               void *buf;
+       }                               *to_free;
+@@ -449,6 +454,7 @@ extern int nfsd4_locku(struct svc_rqst *
+ extern int
+ nfsd4_release_lockowner(struct svc_rqst *rqstp,
+               struct nfsd4_release_lockowner *rlockowner);
++extern void nfsd4_release_compoundargs(struct nfsd4_compoundargs *);
+ #endif
+ /*
+--- linux-2.6.7/include/linux/nfs_fs.h.lsec    2004-06-15 23:19:13.000000000 -0600
++++ linux-2.6.7/include/linux/nfs_fs.h 2005-03-23 14:28:23.338501704 -0700
+@@ -28,6 +28,7 @@
+ #include <linux/nfs3.h>
+ #include <linux/nfs4.h>
+ #include <linux/nfs_xdr.h>
++#include <linux/rwsem.h>
+ #include <linux/workqueue.h>
+ /*
+@@ -75,15 +76,33 @@
+ #ifdef __KERNEL__
+ /*
+- * NFSv3 Access mode cache
++ * NFSv3/v4 Access mode cache entry
+  */
+-struct nfs_access_cache {
++struct nfs_access_entry {
+       unsigned long           jiffies;
+       struct rpc_cred *       cred;
+       int                     mask;
+-      int                     err;
+ };
++struct nfs4_state;
++struct nfs_open_context {
++      atomic_t count;
++      struct dentry *dentry;
++      struct rpc_cred *cred;
++      struct nfs4_state *state;
++      unsigned int pid;
++      int mode;
++      int error;
++
++      struct list_head list;
++      wait_queue_head_t waitq;
++};
++
++/*
++ * NFSv4 delegation
++ */
++struct nfs_delegation;
++
+ /*
+  * nfs fs inode data in memory
+  */
+@@ -137,7 +156,7 @@ struct nfs_inode {
+        */
+       atomic_t                data_updates;
+-      struct nfs_access_cache cache_access;
++      struct nfs_access_entry cache_access;
+       /*
+        * This is the cookie verifier used for NFSv3 readdir
+@@ -156,16 +175,20 @@ struct nfs_inode {
+                               ncommit,
+                               npages;
+-      /* Credentials for shared mmap */
+-      struct rpc_cred         *mm_cred;
++      /* Open contexts for shared mmap writes */
++      struct list_head        open_files;
+       wait_queue_head_t       nfs_i_wait;
+ #ifdef CONFIG_NFS_V4
+         /* NFSv4 state */
+       struct list_head        open_states;
++      struct nfs_delegation   *delegation;
++      int                      delegation_state;
++      struct rw_semaphore     rwsem;
+ #endif /* CONFIG_NFS_V4*/
+-
++      void                    *acl;
++      ssize_t                 acl_len;
+       struct inode            vfs_inode;
+ };
+@@ -259,6 +282,18 @@ static inline int nfs_verify_change_attr
+               && chattr == NFS_I(inode)->cache_change_attribute;
+ }
++/**
++ * nfs_compare_fh - compare two filehandles for equality
++ * @fh1 - pointer to first filehandle
++ * @fh2 - pointer to second filehandle
++ */
++static inline int nfs_compare_fh(const struct nfs_fh *fh1, const struct nfs_fh *fh2)
++{
++      if (fh1->size == fh2->size)
++              return memcmp(fh1->data, fh2->data, fh1->size);
++      return (fh1->size > fh2->size) ? 1 : -1;
++}
++
+ /*
+  * linux/fs/nfs/inode.c
+  */
+@@ -268,9 +303,12 @@ extern struct inode *nfs_fhget(struct su
+ extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *);
+ extern int nfs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
+ extern int nfs_permission(struct inode *, int, struct nameidata *);
+-extern void nfs_set_mmcred(struct inode *, struct rpc_cred *);
++extern int nfs_access_get_cached(struct inode *, struct rpc_cred *, struct nfs_access_entry *);
++extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *);
+ extern int nfs_open(struct inode *, struct file *);
+ extern int nfs_release(struct inode *, struct file *);
++extern int nfs_attribute_timeout(struct inode *inode);
++extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode);
+ extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *);
+ extern int nfs_setattr(struct dentry *, struct iattr *);
+ extern void nfs_begin_attr_update(struct inode *);
+@@ -278,6 +316,12 @@ extern void nfs_end_attr_update(struct i
+ extern void nfs_begin_data_update(struct inode *);
+ extern void nfs_end_data_update(struct inode *);
+ extern void nfs_end_data_update_defer(struct inode *);
++extern struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, struct rpc_cred *cred);
++extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx);
++extern void put_nfs_open_context(struct nfs_open_context *ctx);
++extern void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx);
++extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, int mode);
++extern void nfs_file_clear_open_context(struct file *filp);
+ /* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. */
+ extern u32 root_nfs_parse_addr(char *name); /*__init*/
+@@ -289,16 +333,15 @@ extern struct inode_operations nfs_file_
+ extern struct file_operations nfs_file_operations;
+ extern struct address_space_operations nfs_file_aops;
+-static __inline__ struct rpc_cred *
+-nfs_file_cred(struct file *file)
++static inline struct rpc_cred *nfs_file_cred(struct file *file)
+ {
+-      struct rpc_cred *cred = NULL;
+-      if (file)
+-              cred = (struct rpc_cred *)file->private_data;
+-#ifdef RPC_DEBUG
+-      BUG_ON(cred && cred->cr_magic != RPCAUTH_CRED_MAGIC);
+-#endif
+-      return cred;
++      if (file != NULL) {
++              struct nfs_open_context *ctx;
++
++              ctx = (struct nfs_open_context*)file->private_data;
++              return ctx->cred;
++      }
++      return NULL;
+ }
+ /*
+@@ -418,28 +461,6 @@ extern int  nfsroot_mount(struct sockadd
+  * inline functions
+  */
+-static inline int nfs_attribute_timeout(struct inode *inode)
+-{
+-      struct nfs_inode *nfsi = NFS_I(inode);
+-
+-      return time_after(jiffies, nfsi->read_cache_jiffies+nfsi->attrtimeo);
+-}
+-
+-/**
+- * nfs_revalidate_inode - Revalidate the inode attributes
+- * @server - pointer to nfs_server struct
+- * @inode - pointer to inode struct
+- *
+- * Updates inode attribute information by retrieving the data from the server.
+- */
+-static inline int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
+-{
+-      if (!(NFS_FLAGS(inode) & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))
+-                      && !nfs_attribute_timeout(inode))
+-              return NFS_STALE(inode) ? -ESTALE : 0;
+-      return __nfs_revalidate_inode(server, inode);
+-}
+-
+ static inline loff_t
+ nfs_size_to_loff_t(__u64 size)
+ {
+@@ -507,8 +528,6 @@ struct idmap;
+ enum nfs4_client_state {
+       NFS4CLNT_OK  = 0,
+-      NFS4CLNT_NEW,
+-      NFS4CLNT_SETUP_STATE,
+ };
+ /*
+@@ -520,7 +539,6 @@ struct nfs4_client {
+       u64                     cl_clientid;    /* constant */
+       nfs4_verifier           cl_confirm;
+       unsigned long           cl_state;
+-      long                    cl_generation;
+       u32                     cl_lockowner_id;
+@@ -530,6 +548,7 @@ struct nfs4_client {
+        */
+       struct rw_semaphore     cl_sem;
++      struct list_head        cl_delegations;
+       struct list_head        cl_state_owners;
+       struct list_head        cl_unused;
+       int                     cl_nunused;
+@@ -573,12 +592,11 @@ struct nfs4_state_owner {
+       u32                  so_id;      /* 32-bit identifier, unique */
+       struct semaphore     so_sema;
+       u32                  so_seqid;   /* protected by so_sema */
+-      unsigned int         so_flags;   /* protected by so_sema */
+       atomic_t             so_count;
+-      long                 so_generation;
+       struct rpc_cred      *so_cred;   /* Associated cred */
+       struct list_head     so_states;
++      struct list_head     so_delegations;
+ };
+ /*
+@@ -593,10 +611,13 @@ struct nfs4_state_owner {
+  * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN)
+  */
++/* bits for nfs4_lock_state->flags */
++
+ struct nfs4_lock_state {
+       struct list_head        ls_locks;       /* Other lock stateids */
+-      fl_owner_t              ls_owner;       /* POSIX lock owner */
+-      struct nfs4_state *     ls_parent;      /* Parent nfs4_state */
++      unsigned int            ls_pid;         /* pid of owner process */
++#define NFS_LOCK_INITIALIZED 1
++      int                     flags;
+       u32                     ls_seqid;
+       u32                     ls_id;
+       nfs4_stateid            ls_stateid;
+@@ -606,6 +627,7 @@ struct nfs4_lock_state {
+ /* bits for nfs4_state->flags */
+ enum {
+       LK_STATE_IN_USE,
++      NFS_DELEGATED_STATE,
+ };
+ struct nfs4_state {
+@@ -629,8 +651,19 @@ struct nfs4_state {
+ };
++struct nfs4_exception {
++      long timeout;
++      int retry;
++};
++
+ extern struct dentry_operations nfs4_dentry_operations;
+ extern struct inode_operations nfs4_dir_inode_operations;
++extern struct inode_operations nfs4_file_inode_operations;
++
++/* inode.c */
++extern ssize_t nfs_getxattr(struct dentry *, const char *, void *, size_t);
++extern int nfs_setxattr(struct dentry *, const char *, const void *, size_t, int);
++extern ssize_t nfs_listxattr(struct dentry *, char *, size_t);
+ /* nfs4proc.c */
+ extern int nfs4_proc_setclientid(struct nfs4_client *, u32, unsigned short);
+@@ -639,10 +672,15 @@ extern int nfs4_open_reclaim(struct nfs4
+ extern int nfs4_proc_async_renew(struct nfs4_client *);
+ extern int nfs4_proc_renew(struct nfs4_client *);
+ extern int nfs4_do_close(struct inode *, struct nfs4_state *);
+-int nfs4_do_downgrade(struct inode *inode, struct nfs4_state *state, mode_t mode);
++extern int nfs4_do_downgrade(struct inode *inode, struct nfs4_state *state, mode_t mode);
+ extern int nfs4_wait_clnt_recover(struct rpc_clnt *, struct nfs4_client *);
+ extern struct inode *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
+ extern int nfs4_open_revalidate(struct inode *, struct dentry *, int);
++extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception *);
++extern int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request);
++extern ssize_t nfs4_proc_get_acl(struct inode *, void *buf, ssize_t buflen);
++extern int nfs4_proc_set_acl(struct inode *, const void *buf, ssize_t buflen);
++extern void nfs4_zap_acl_attr(struct inode *inode);
+ /* nfs4renewd.c */
+ extern void nfs4_schedule_state_renewal(struct nfs4_client *);
+@@ -654,6 +692,8 @@ extern void init_nfsv4_state(struct nfs_
+ extern void destroy_nfsv4_state(struct nfs_server *);
+ extern struct nfs4_client *nfs4_get_client(struct in_addr *);
+ extern void nfs4_put_client(struct nfs4_client *clp);
++extern int nfs4_init_client(struct nfs4_client *clp);
++extern struct nfs4_client *nfs4_find_client(struct in_addr *);
+ extern u32 nfs4_alloc_lockowner_id(struct nfs4_client *);
+ extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
+@@ -663,15 +703,14 @@ extern void nfs4_put_open_state(struct n
+ extern void nfs4_close_state(struct nfs4_state *, mode_t);
+ extern struct nfs4_state *nfs4_find_state(struct inode *, struct rpc_cred *, mode_t mode);
+ extern void nfs4_increment_seqid(int status, struct nfs4_state_owner *sp);
+-extern int nfs4_handle_error(struct nfs_server *, int);
+ extern void nfs4_schedule_state_recovery(struct nfs4_client *);
+-extern struct nfs4_lock_state *nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t);
+-extern struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t);
++extern struct nfs4_lock_state *nfs4_find_lock_state(struct nfs4_state *state, unsigned int pid);
++extern struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, unsigned int pid);
+ extern void nfs4_put_lock_state(struct nfs4_lock_state *state);
+ extern void nfs4_increment_lock_seqid(int status, struct nfs4_lock_state *ls);
+-extern void nfs4_notify_setlk(struct inode *, struct file_lock *, struct nfs4_lock_state *);
+-extern void nfs4_notify_unlck(struct inode *, struct file_lock *, struct nfs4_lock_state *);
+-extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t);
++extern void nfs4_notify_setlk(struct nfs4_state *, struct file_lock *, struct nfs4_lock_state *);
++extern void nfs4_notify_unlck(struct nfs4_state *, struct file_lock *, struct nfs4_lock_state *);
++extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, unsigned int pid);
+@@ -681,6 +720,7 @@ struct nfs4_mount_data;
+ #define destroy_nfsv4_state(server)       do { } while (0)
+ #define nfs4_put_state_owner(inode, owner) do { } while (0)
+ #define nfs4_put_open_state(state) do { } while (0)
++#define nfs4_close_state(a, b) do { } while (0)
+ #define nfs4_renewd_prepare_shutdown(server) do { } while (0)
+ #endif
+@@ -697,6 +737,7 @@ struct nfs4_mount_data;
+ #define NFSDBG_XDR            0x0020
+ #define NFSDBG_FILE           0x0040
+ #define NFSDBG_ROOT           0x0080
++#define NFSDBG_CALLBACK               0x0100
+ #define NFSDBG_ALL            0xFFFF
+ #ifdef __KERNEL__
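The nfs_open_context introduced above replaces the old per-file rpc_cred (and the shared-mmap mm_cred): every open file now hangs a refcounted context off the inode's open_files list, carrying the credential, owning dentry, NFSv4 state, and owner pid that read and write requests reference instead of a bare credential. A sketch of the expected life cycle, using only the helpers declared in this hunk; the reference-counting detail in the comment is an assumption:

/* Illustrative only: attach an open context the way an NFS ->open()
 * would be expected to, given the declarations above. */
static int demo_attach_context(struct file *filp, struct rpc_cred *cred)
{
	struct nfs_open_context *ctx;

	ctx = alloc_nfs_open_context(filp->f_dentry, cred);
	if (ctx == NULL)
		return -ENOMEM;
	ctx->mode = filp->f_mode;
	nfs_file_set_open_context(filp, ctx);
	/* Assumption: the file now holds its own reference, so the
	 * allocation reference can be dropped here. */
	put_nfs_open_context(ctx);
	return 0;
}

The matching ->release() would call nfs_file_clear_open_context() to drop the file's reference.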
+--- linux-2.6.7/include/linux/nfs4_acl.h.lsec  2005-03-23 14:28:24.519322192 -0700
++++ linux-2.6.7/include/linux/nfs4_acl.h       2005-03-23 14:28:24.518322344 -0700
+@@ -0,0 +1,59 @@
++/*
++ *  include/linux/nfs4_acl.h
++ *
++ *  Common NFSv4 ACL handling definitions.
++ *
++ *  Copyright (c) 2002 The Regents of the University of Michigan.
++ *  All rights reserved.
++ *
++ *  Marius Aamodt Eriksen <marius@umich.edu>
++ *
++ *  Redistribution and use in source and binary forms, with or without
++ *  modification, are permitted provided that the following conditions
++ *  are met:
++ *
++ *  1. Redistributions of source code must retain the above copyright
++ *     notice, this list of conditions and the following disclaimer.
++ *  2. Redistributions in binary form must reproduce the above copyright
++ *     notice, this list of conditions and the following disclaimer in the
++ *     documentation and/or other materials provided with the distribution.
++ *  3. Neither the name of the University nor the names of its
++ *     contributors may be used to endorse or promote products derived
++ *     from this software without specific prior written permission.
++ *
++ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
++
++#ifndef LINUX_NFS4_ACL_H
++#define LINUX_NFS4_ACL_H
++
++#include <linux/posix_acl.h>
++
++struct nfs4_acl *nfs4_acl_new(void);
++void nfs4_acl_free(struct nfs4_acl *);
++int nfs4_acl_add_ace(struct nfs4_acl *, u32, u32, u32, int, uid_t);
++int nfs4_acl_get_whotype(char *, u32);
++int nfs4_acl_write_who(int who, char *p);
++int nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group,
++                                      uid_t who, u32 mask);
++
++#define NFS4_ACL_TYPE_DEFAULT 0x01
++#define NFS4_ACL_DIR          0x02
++#define NFS4_ACL_OWNER                0x04
++
++struct nfs4_acl *nfs4_acl_posix_to_nfsv4(struct posix_acl *,
++                              struct posix_acl *, unsigned int flags);
++int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *, struct posix_acl **,
++                              struct posix_acl **, unsigned int flags);
++
++#endif /* LINUX_NFS4_ACL_H */
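The helpers declared here give the NFSv4 server code a common ACL representation (nfsd.h above wires nfsd4_get_nfs4_acl/nfsd4_set_nfs4_acl to it). A hedged sketch of constructing a one-entry ACL and testing it: the argument order of nfs4_acl_add_ace() -- type, flag, access mask, whotype, who -- is read off the prototype and is an assumption, ACE type 0 is the RFC 3530 ALLOW type, and nfs4_acl_permission() is assumed to return 0 when access is granted:

#include <linux/errno.h>
#include <linux/nfs4.h>
#include <linux/nfs4_acl.h>

/* Illustrative only. */
static int demo_build_and_check_acl(uid_t owner, gid_t group)
{
	struct nfs4_acl *acl;
	int err;

	acl = nfs4_acl_new();
	if (acl == NULL)
		return -ENOMEM;

	/* Single ACE: allow EVERYONE@ generic read access. */
	err = nfs4_acl_add_ace(acl, 0 /* ALLOW */, 0 /* no flags */,
			       NFS4_ACE_GENERIC_READ,
			       NFS4_ACL_WHO_EVERYONE, 0 /* who unused */);
	if (err == 0)
		err = nfs4_acl_permission(acl, owner, group, owner,
					  NFS4_ACE_READ_DATA);

	nfs4_acl_free(acl);
	return err;
}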
+--- linux-2.6.7/include/linux/nfs_xdr.h.lsec   2004-06-15 23:19:52.000000000 -0600
++++ linux-2.6.7/include/linux/nfs_xdr.h        2005-03-23 14:28:23.539471152 -0700
+@@ -99,20 +99,21 @@ struct nfs4_change_info {
+  * Arguments to the open call.
+  */
+ struct nfs_openargs {
+-      struct nfs_fh *         fh;
++      const struct nfs_fh *   fh;
+       __u32                   seqid;
+-      __u32                   share_access;
++      int                     open_flags;
+       __u64                   clientid;
+       __u32                   id;
+-      __u32                   opentype;
+-      __u32                   createmode;
+       union {
+               struct iattr *  attrs;    /* UNCHECKED, GUARDED */
+               nfs4_verifier   verifier; /* EXCLUSIVE */
++              nfs4_stateid    delegation;             /* CLAIM_DELEGATE_CUR */
++              int             delegation_type;        /* CLAIM_PREVIOUS */
+       } u;
+       const struct qstr *     name;
+       const struct nfs_server *server;         /* Needed for ID mapping */
+       const u32 *             bitmask;
++      __u32                   claim;
+ };
+ struct nfs_openres {
+@@ -122,13 +123,17 @@ struct nfs_openres {
+       __u32                   rflags;
+       struct nfs_fattr *      f_attr;
+       const struct nfs_server *server;
++      int                     delegation_type;
++      nfs4_stateid            delegation;
++      __u32                   do_recall;
++      __u64                   maxsize;
+ };
+ /*
+  * Arguments to the open_confirm call.
+  */
+ struct nfs_open_confirmargs {
+-      struct nfs_fh *         fh;
++      const struct nfs_fh *   fh;
+       nfs4_stateid            stateid;
+       __u32                   seqid;
+ };
+@@ -138,26 +143,13 @@ struct nfs_open_confirmres {
+ };
+ /*
+- * Arguments to the open_reclaim call.
+- */
+-struct nfs_open_reclaimargs {
+-      struct nfs_fh *         fh;
+-      __u64                   clientid;
+-      __u32                   seqid;
+-      __u32                   id;
+-      __u32                   share_access;
+-      __u32                   claim;
+-      const __u32 *           bitmask;
+-};
+-
+-/*
+  * Arguments to the close call.
+  */
+ struct nfs_closeargs {
+       struct nfs_fh *         fh;
+       nfs4_stateid            stateid;
+       __u32                   seqid;
+-      __u32                   share_access;
++      int                     open_flags;
+ };
+ struct nfs_closeres {
+@@ -224,6 +216,11 @@ struct nfs_lockres {
+       const struct nfs_server *       server;
+ };
++struct nfs4_delegreturnargs {
++      const struct nfs_fh *fhandle;
++      const nfs4_stateid *stateid;
++};
++
+ /*
+  * Arguments to the read call.
+  */
+@@ -235,8 +232,7 @@ struct nfs_lockres {
+ struct nfs_readargs {
+       struct nfs_fh *         fh;
+-      fl_owner_t              lockowner;
+-      struct nfs4_state *     state;
++      struct nfs_open_context *context;
+       __u64                   offset;
+       __u32                   count;
+       unsigned int            pgbase;
+@@ -259,8 +255,7 @@ struct nfs_readres {
+ struct nfs_writeargs {
+       struct nfs_fh *         fh;
+-      fl_owner_t              lockowner;
+-      struct nfs4_state *     state;
++      struct nfs_open_context *context;
+       __u64                   offset;
+       __u32                   count;
+       enum nfs3_stable_how    stable;
+@@ -331,6 +326,19 @@ struct nfs_setattrargs {
+       const u32 *                     bitmask;
+ };
++struct nfs_setaclargs {
++      struct nfs_fh *                 fh;
++      const char *                    acl;
++      ssize_t                         acl_len;
++      const struct nfs_server *       server; /* Needed for name mapping */
++};
++
++struct nfs_getaclres {
++      char *                          acl;
++      ssize_t                         acl_len;
++      const struct nfs_server  *      server; /* Needed for name mapping */
++};
++
+ struct nfs_setattrres {
+       struct nfs_fattr *              fattr;
+       const struct nfs_server *       server;
+@@ -597,13 +605,15 @@ struct nfs4_rename_res {
+ };
+ struct nfs4_setclientid {
+-      nfs4_verifier                   sc_verifier;      /* request */
+-      char *                          sc_name;          /* request */
++      const nfs4_verifier *           sc_verifier;      /* request */
++      unsigned int                    sc_name_len;
++      char                            sc_name[32];      /* request */
+       u32                             sc_prog;          /* request */
++      unsigned int                    sc_netid_len;
+       char                            sc_netid[4];      /* request */
++      unsigned int                    sc_uaddr_len;
+       char                            sc_uaddr[24];     /* request */
+       u32                             sc_cb_ident;      /* request */
+-      struct nfs4_client *            sc_state;         /* response */
+ };
+ struct nfs4_statfs_arg {
+@@ -657,6 +667,8 @@ struct nfs_write_data {
+       void (*complete) (struct nfs_write_data *, int);
+ };
++struct nfs_access_entry;
++
+ /*
+  * RPC procedure vector for NFSv2/NFSv3 demuxing
+  */
+@@ -664,6 +676,7 @@ struct nfs_rpc_ops {
+       int     version;                /* Protocol version */
+       struct dentry_operations *dentry_ops;
+       struct inode_operations *dir_inode_ops;
++      struct inode_operations *file_inode_ops;
+       int     (*getroot) (struct nfs_server *, struct nfs_fh *,
+                           struct nfs_fsinfo *);
+@@ -672,11 +685,11 @@ struct nfs_rpc_ops {
+                           struct iattr *);
+       int     (*lookup)  (struct inode *, struct qstr *,
+                           struct nfs_fh *, struct nfs_fattr *);
+-      int     (*access)  (struct inode *, struct rpc_cred *, int);
++      int     (*access)  (struct inode *, struct nfs_access_entry *);
+       int     (*readlink)(struct inode *, struct page *);
+-      int     (*read)    (struct nfs_read_data *, struct file *);
+-      int     (*write)   (struct nfs_write_data *, struct file *);
+-      int     (*commit)  (struct nfs_write_data *, struct file *);
++      int     (*read)    (struct nfs_read_data *);
++      int     (*write)   (struct nfs_write_data *);
++      int     (*commit)  (struct nfs_write_data *);
+       struct inode *  (*create)  (struct inode *, struct qstr *,
+                           struct iattr *, int);
+       int     (*remove)  (struct inode *, struct qstr *);
+@@ -708,8 +721,6 @@ struct nfs_rpc_ops {
+       void    (*commit_setup) (struct nfs_write_data *, int how);
+       int     (*file_open)   (struct inode *, struct file *);
+       int     (*file_release) (struct inode *, struct file *);
+-      void    (*request_init)(struct nfs_page *, struct file *);
+-      int     (*request_compatible)(struct nfs_page *, struct file *, struct page *);
+       int     (*lock)(struct file *, int, struct file_lock *);
+ };
+--- linux-2.6.7/arch/s390/defconfig.lsec       2004-06-15 23:19:52.000000000 -0600
++++ linux-2.6.7/arch/s390/defconfig    2005-03-23 14:28:23.869420992 -0700
+@@ -422,7 +422,7 @@ CONFIG_NFS_V3=y
+ CONFIG_NFSD=y
+ CONFIG_NFSD_V3=y
+ # CONFIG_NFSD_V4 is not set
+-# CONFIG_NFSD_TCP is not set
++CONFIG_NFSD_TCP=y
+ CONFIG_LOCKD=y
+ CONFIG_LOCKD_V4=y
+ CONFIG_EXPORTFS=y
+--- linux-2.6.7/arch/ia64/defconfig.lsec       2004-06-15 23:18:57.000000000 -0600
++++ linux-2.6.7/arch/ia64/defconfig    2005-03-23 14:28:23.816429048 -0700
+@@ -987,7 +987,7 @@ CONFIG_NFS_DIRECTIO=y
+ CONFIG_NFSD=y
+ CONFIG_NFSD_V3=y
+ # CONFIG_NFSD_V4 is not set
+-# CONFIG_NFSD_TCP is not set
++CONFIG_NFSD_TCP=y
+ CONFIG_LOCKD=y
+ CONFIG_LOCKD_V4=y
+ CONFIG_EXPORTFS=y
+--- linux-2.6.7/arch/ppc/defconfig.lsec        2004-06-15 23:19:52.000000000 -0600
++++ linux-2.6.7/arch/ppc/defconfig     2005-03-23 14:28:23.817428896 -0700
+@@ -1230,7 +1230,7 @@ CONFIG_NFS_V3=y
+ CONFIG_NFSD=y
+ CONFIG_NFSD_V3=y
+ # CONFIG_NFSD_V4 is not set
+-# CONFIG_NFSD_TCP is not set
++CONFIG_NFSD_TCP=y
+ CONFIG_LOCKD=y
+ CONFIG_LOCKD_V4=y
+ CONFIG_EXPORTFS=y
+--- linux-2.6.7/arch/i386/defconfig.lsec       2004-06-15 23:19:42.000000000 -0600
++++ linux-2.6.7/arch/i386/defconfig    2005-03-23 14:28:23.763437104 -0700
+@@ -1148,7 +1148,7 @@ CONFIG_NFS_FS=y
+ # CONFIG_NFS_DIRECTIO is not set
+ CONFIG_NFSD=y
+ # CONFIG_NFSD_V3 is not set
+-# CONFIG_NFSD_TCP is not set
++CONFIG_NFSD_TCP=y
+ CONFIG_LOCKD=y
+ CONFIG_EXPORTFS=y
+ CONFIG_SUNRPC=y
+--- linux-2.6.7/arch/alpha/defconfig.lsec      2004-06-15 23:19:23.000000000 -0600
++++ linux-2.6.7/arch/alpha/defconfig   2005-03-23 14:28:23.762437256 -0700
+@@ -791,7 +791,7 @@ CONFIG_NFS_V3=y
+ CONFIG_NFSD=m
+ CONFIG_NFSD_V3=y
+ # CONFIG_NFSD_V4 is not set
+-# CONFIG_NFSD_TCP is not set
++CONFIG_NFSD_TCP=y
+ CONFIG_LOCKD=m
+ CONFIG_LOCKD_V4=y
+ CONFIG_EXPORTFS=m
+--- linux-2.6.7/net/sunrpc/svcauth_unix.c.lsec 2004-06-15 23:19:37.000000000 -0600
++++ linux-2.6.7/net/sunrpc/svcauth_unix.c      2005-03-23 14:28:24.295356240 -0700
+@@ -55,12 +55,10 @@ struct auth_domain *unix_domain_find(cha
+       if (new == NULL)
+               return NULL;
+       cache_init(&new->h.h);
+-      atomic_inc(&new->h.h.refcnt);
+       new->h.name = strdup(name);
+       new->h.flavour = RPC_AUTH_UNIX;
+       new->addr_changes = 0;
+       new->h.h.expiry_time = NEVER;
+-      new->h.h.flags = 0;
+       rv = auth_domain_lookup(&new->h, 2);
+       if (rv == &new->h) {
+@@ -262,7 +260,7 @@ struct cache_detail ip_map_cache = {
+       .cache_show     = ip_map_show,
+ };
+-static DefineSimpleCacheLookup(ip_map, 0)
++static DefineSimpleCacheLookup(ip_map)
+ int auth_unix_add_addr(struct in_addr addr, struct auth_domain *dom)
+@@ -318,7 +316,8 @@ struct auth_domain *auth_unix_lookup(str
+               return NULL;
+       if ((ipm->m_client->addr_changes - ipm->m_add_change) >0) {
+-              set_bit(CACHE_NEGATIVE, &ipm->h.flags);
++              if (test_and_set_bit(CACHE_NEGATIVE, &ipm->h.flags) == 0)
++                      auth_domain_put(&ipm->m_client->h);
+               rv = NULL;
+       } else {
+               rv = &ipm->m_client->h;
+@@ -405,6 +404,9 @@ svcauth_null_release(struct svc_rqst *rq
+       if (rqstp->rq_client)
+               auth_domain_put(rqstp->rq_client);
+       rqstp->rq_client = NULL;
++      if (rqstp->rq_cred.cr_group_info)
++              put_group_info(rqstp->rq_cred.cr_group_info);
++      rqstp->rq_cred.cr_group_info = NULL;
+       return 0; /* don't drop */
+ }
+--- linux-2.6.7/net/sunrpc/xprt.c.lsec 2004-06-15 23:19:42.000000000 -0600
++++ linux-2.6.7/net/sunrpc/xprt.c      2005-03-23 14:28:23.706445768 -0700
+@@ -1099,7 +1099,7 @@ xprt_write_space(struct sock *sk)
+               goto out;
+       spin_lock_bh(&xprt->sock_lock);
+-      if (xprt->snd_task && xprt->snd_task->tk_rpcwait == &xprt->pending)
++      if (xprt->snd_task)
+               rpc_wake_up_task(xprt->snd_task);
+       spin_unlock_bh(&xprt->sock_lock);
+ out:
+@@ -1357,6 +1357,7 @@ xprt_request_init(struct rpc_task *task,
+       req->rq_task    = task;
+       req->rq_xprt    = xprt;
+       req->rq_xid     = xprt_alloc_xid(xprt);
++      req->rq_release_snd_buf = NULL;
+       dprintk("RPC: %4d reserved req %p xid %08x\n", task->tk_pid,
+                       req, req->rq_xid);
+ }
+@@ -1382,6 +1383,8 @@ xprt_release(struct rpc_task *task)
+               mod_timer(&xprt->timer, xprt->last_used + XPRT_IDLE_TIMEOUT);
+       spin_unlock_bh(&xprt->sock_lock);
+       task->tk_rqstp = NULL;
++      if (req->rq_release_snd_buf)
++              req->rq_release_snd_buf(req);
+       memset(req, 0, sizeof(*req));   /* mark unused */
+       dprintk("RPC: %4d release request %p\n", task->tk_pid, req);
+--- linux-2.6.7/net/sunrpc/sched.c.lsec        2004-06-15 23:19:35.000000000 -0600
++++ linux-2.6.7/net/sunrpc/sched.c     2005-03-23 14:28:23.651454128 -0700
+@@ -41,13 +41,7 @@ static mempool_t    *rpc_buffer_mempool;
+ static void                   __rpc_default_timer(struct rpc_task *task);
+ static void                   rpciod_killall(void);
+-
+-/*
+- * When an asynchronous RPC task is activated within a bottom half
+- * handler, or while executing another RPC task, it is put on
+- * schedq, and rpciod is woken up.
+- */
+-static RPC_WAITQ(schedq, "schedq");
++static void                   rpc_async_schedule(void *);
+ /*
+  * RPC tasks that create another task (e.g. for contacting the portmapper)
+@@ -68,26 +62,18 @@ static LIST_HEAD(all_tasks);
+ /*
+  * rpciod-related stuff
+  */
+-static DECLARE_WAIT_QUEUE_HEAD(rpciod_idle);
+-static DECLARE_COMPLETION(rpciod_killer);
+ static DECLARE_MUTEX(rpciod_sema);
+ static unsigned int           rpciod_users;
+-static pid_t                  rpciod_pid;
+-static int                    rpc_inhibit;
++static struct workqueue_struct *rpciod_workqueue;
+ /*
+- * Spinlock for wait queues. Access to the latter also has to be
+- * interrupt-safe in order to allow timers to wake up sleeping tasks.
+- */
+-static spinlock_t rpc_queue_lock = SPIN_LOCK_UNLOCKED;
+-/*
+  * Spinlock for other critical sections of code.
+  */
+ static spinlock_t rpc_sched_lock = SPIN_LOCK_UNLOCKED;
+ /*
+  * Disable the timer for a given RPC task. Should be called with
+- * rpc_queue_lock and bh_disabled in order to avoid races within
++ * queue->lock and bh_disabled in order to avoid races within
+  * rpc_run_timer().
+  */
+ static inline void
+@@ -105,16 +91,13 @@ __rpc_disable_timer(struct rpc_task *tas
+  * without calling del_timer_sync(). The latter could cause a
+  * deadlock if called while we're holding spinlocks...
+  */
+-static void
+-rpc_run_timer(struct rpc_task *task)
++static void rpc_run_timer(struct rpc_task *task)
+ {
+       void (*callback)(struct rpc_task *);
+-      spin_lock_bh(&rpc_queue_lock);
+       callback = task->tk_timeout_fn;
+       task->tk_timeout_fn = NULL;
+-      spin_unlock_bh(&rpc_queue_lock);
+-      if (callback) {
++      if (callback && RPC_IS_QUEUED(task)) {
+               dprintk("RPC: %4d running timer\n", task->tk_pid);
+               callback(task);
+       }
+@@ -140,19 +123,8 @@ __rpc_add_timer(struct rpc_task *task, r
+ }
+ /*
+- * Set up a timer for an already sleeping task.
+- */
+-void rpc_add_timer(struct rpc_task *task, rpc_action timer)
+-{
+-      spin_lock_bh(&rpc_queue_lock);
+-      if (!RPC_IS_RUNNING(task))
+-              __rpc_add_timer(task, timer);
+-      spin_unlock_bh(&rpc_queue_lock);
+-}
+-
+-/*
+  * Delete any timer for the current task. Because we use del_timer_sync(),
+- * this function should never be called while holding rpc_queue_lock.
++ * this function should never be called while holding queue->lock.
+  */
+ static inline void
+ rpc_delete_timer(struct rpc_task *task)
+@@ -169,16 +141,17 @@ static void __rpc_add_wait_queue_priorit
+       struct list_head *q;
+       struct rpc_task *t;
++      INIT_LIST_HEAD(&task->u.tk_wait.links);
+       q = &queue->tasks[task->tk_priority];
+       if (unlikely(task->tk_priority > queue->maxpriority))
+               q = &queue->tasks[queue->maxpriority];
+-      list_for_each_entry(t, q, tk_list) {
++      list_for_each_entry(t, q, u.tk_wait.list) {
+               if (t->tk_cookie == task->tk_cookie) {
+-                      list_add_tail(&task->tk_list, &t->tk_links);
++                      list_add_tail(&task->u.tk_wait.list, &t->u.tk_wait.links);
+                       return;
+               }
+       }
+-      list_add_tail(&task->tk_list, q);
++      list_add_tail(&task->u.tk_wait.list, q);
+ }
+ /*
+@@ -189,37 +162,21 @@ static void __rpc_add_wait_queue_priorit
+  * improve overall performance.
+  * Everyone else gets appended to the queue to ensure proper FIFO behavior.
+  */
+-static int __rpc_add_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task)
++static void __rpc_add_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task)
+ {
+-      if (task->tk_rpcwait == queue)
+-              return 0;
++      BUG_ON (RPC_IS_QUEUED(task));
+-      if (task->tk_rpcwait) {
+-              printk(KERN_WARNING "RPC: doubly enqueued task!\n");
+-              return -EWOULDBLOCK;
+-      }
+       if (RPC_IS_PRIORITY(queue))
+               __rpc_add_wait_queue_priority(queue, task);
+       else if (RPC_IS_SWAPPER(task))
+-              list_add(&task->tk_list, &queue->tasks[0]);
++              list_add(&task->u.tk_wait.list, &queue->tasks[0]);
+       else
+-              list_add_tail(&task->tk_list, &queue->tasks[0]);
+-      task->tk_rpcwait = queue;
++              list_add_tail(&task->u.tk_wait.list, &queue->tasks[0]);
++      task->u.tk_wait.rpc_waitq = queue;
++      rpc_set_queued(task);
+       dprintk("RPC: %4d added to queue %p \"%s\"\n",
+                               task->tk_pid, queue, rpc_qname(queue));
+-
+-      return 0;
+-}
+-
+-int rpc_add_wait_queue(struct rpc_wait_queue *q, struct rpc_task *task)
+-{
+-      int             result;
+-
+-      spin_lock_bh(&rpc_queue_lock);
+-      result = __rpc_add_wait_queue(q, task);
+-      spin_unlock_bh(&rpc_queue_lock);
+-      return result;
+ }
+ /*
+@@ -229,12 +186,12 @@ static void __rpc_remove_wait_queue_prio
+ {
+       struct rpc_task *t;
+-      if (!list_empty(&task->tk_links)) {
+-              t = list_entry(task->tk_links.next, struct rpc_task, tk_list);
+-              list_move(&t->tk_list, &task->tk_list);
+-              list_splice_init(&task->tk_links, &t->tk_links);
++      if (!list_empty(&task->u.tk_wait.links)) {
++              t = list_entry(task->u.tk_wait.links.next, struct rpc_task, u.tk_wait.list);
++              list_move(&t->u.tk_wait.list, &task->u.tk_wait.list);
++              list_splice_init(&task->u.tk_wait.links, &t->u.tk_wait.links);
+       }
+-      list_del(&task->tk_list);
++      list_del(&task->u.tk_wait.list);
+ }
+ /*
+@@ -243,31 +200,17 @@ static void __rpc_remove_wait_queue_prio
+  */
+ static void __rpc_remove_wait_queue(struct rpc_task *task)
+ {
+-      struct rpc_wait_queue *queue = task->tk_rpcwait;
+-
+-      if (!queue)
+-              return;
++      struct rpc_wait_queue *queue;
++      queue = task->u.tk_wait.rpc_waitq;
+       if (RPC_IS_PRIORITY(queue))
+               __rpc_remove_wait_queue_priority(task);
+       else
+-              list_del(&task->tk_list);
+-      task->tk_rpcwait = NULL;
+-
++              list_del(&task->u.tk_wait.list);
+       dprintk("RPC: %4d removed from queue %p \"%s\"\n",
+                               task->tk_pid, queue, rpc_qname(queue));
+ }
+-void
+-rpc_remove_wait_queue(struct rpc_task *task)
+-{
+-      if (!task->tk_rpcwait)
+-              return;
+-      spin_lock_bh(&rpc_queue_lock);
+-      __rpc_remove_wait_queue(task);
+-      spin_unlock_bh(&rpc_queue_lock);
+-}
+-
+ static inline void rpc_set_waitqueue_priority(struct rpc_wait_queue *queue, int priority)
+ {
+       queue->priority = priority;
+@@ -290,6 +233,7 @@ static void __rpc_init_priority_wait_que
+ {
+       int i;
++      spin_lock_init(&queue->lock);
+       for (i = 0; i < ARRAY_SIZE(queue->tasks); i++)
+               INIT_LIST_HEAD(&queue->tasks[i]);
+       queue->maxpriority = maxprio;
+@@ -316,34 +260,27 @@ EXPORT_SYMBOL(rpc_init_wait_queue);
+  * Note: If the task is ASYNC, this must be called with 
+  * the spinlock held to protect the wait queue operation.
+  */
+-static inline void
+-rpc_make_runnable(struct rpc_task *task)
++static void rpc_make_runnable(struct rpc_task *task)
+ {
+-      if (task->tk_timeout_fn) {
+-              printk(KERN_ERR "RPC: task w/ running timer in rpc_make_runnable!!\n");
++      if (rpc_test_and_set_running(task))
+               return;
+-      }
+-      rpc_set_running(task);
++      BUG_ON(task->tk_timeout_fn);
+       if (RPC_IS_ASYNC(task)) {
+-              if (RPC_IS_SLEEPING(task)) {
+-                      int status;
+-                      status = __rpc_add_wait_queue(&schedq, task);
+-                      if (status < 0) {
+-                              printk(KERN_WARNING "RPC: failed to add task to queue: error: %d!\n", status);
+-                              task->tk_status = status;
+-                              return;
+-                      }
+-                      rpc_clear_sleeping(task);
+-                      wake_up(&rpciod_idle);
++              int status;
++
++              INIT_WORK(&task->u.tk_work, rpc_async_schedule, (void *)task);
++              status = queue_work(task->tk_workqueue, &task->u.tk_work);
++              if (status < 0) {
++                      printk(KERN_WARNING "RPC: failed to add task to queue: error: %d!\n", status);
++                      task->tk_status = status;
++                      return;
+               }
+-      } else {
+-              rpc_clear_sleeping(task);
+-              wake_up(&task->tk_wait);
+-      }
++      } else
++              wake_up(&task->u.tk_wait.waitq);
+ }
+ /*
+- * Place a newly initialized task on the schedq.
++ * Place a newly initialized task on the workqueue.
+  */
+ static inline void
+ rpc_schedule_run(struct rpc_task *task)
+@@ -352,33 +289,18 @@ rpc_schedule_run(struct rpc_task *task)
+       if (RPC_IS_ACTIVATED(task))
+               return;
+       task->tk_active = 1;
+-      rpc_set_sleeping(task);
+       rpc_make_runnable(task);
+ }
+ /*
+- *    For other people who may need to wake the I/O daemon
+- *    but should (for now) know nothing about its innards
+- */
+-void rpciod_wake_up(void)
+-{
+-      if(rpciod_pid==0)
+-              printk(KERN_ERR "rpciod: wot no daemon?\n");
+-      wake_up(&rpciod_idle);
+-}
+-
+-/*
+  * Prepare for sleeping on a wait queue.
+  * By always appending tasks to the list we ensure FIFO behavior.
+  * NB: An RPC task will only receive interrupt-driven events as long
+  * as it's on a wait queue.
+  */
+-static void
+-__rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task,
++static void __rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task,
+                       rpc_action action, rpc_action timer)
+ {
+-      int status;
+-
+       dprintk("RPC: %4d sleep_on(queue \"%s\" time %ld)\n", task->tk_pid,
+                               rpc_qname(q), jiffies);
+@@ -388,49 +310,36 @@ __rpc_sleep_on(struct rpc_wait_queue *q,
+       }
+       /* Mark the task as being activated if so needed */
+-      if (!RPC_IS_ACTIVATED(task)) {
++      if (!RPC_IS_ACTIVATED(task))
+               task->tk_active = 1;
+-              rpc_set_sleeping(task);
+-      }
+-      status = __rpc_add_wait_queue(q, task);
+-      if (status) {
+-              printk(KERN_WARNING "RPC: failed to add task to queue: error: %d!\n", status);
+-              task->tk_status = status;
+-      } else {
+-              rpc_clear_running(task);
+-              if (task->tk_callback) {
+-                      dprintk(KERN_ERR "RPC: %4d overwrites an active callback\n", task->tk_pid);
+-                      BUG();
+-              }
+-              task->tk_callback = action;
+-              __rpc_add_timer(task, timer);
+-      }
++      __rpc_add_wait_queue(q, task);
++
++      BUG_ON(task->tk_callback != NULL);
++      task->tk_callback = action;
++      __rpc_add_timer(task, timer);
+ }
+-void
+-rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task,
++void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task,
+                               rpc_action action, rpc_action timer)
+ {
+       /*
+        * Protect the queue operations.
+        */
+-      spin_lock_bh(&rpc_queue_lock);
++      spin_lock_bh(&q->lock);
+       __rpc_sleep_on(q, task, action, timer);
+-      spin_unlock_bh(&rpc_queue_lock);
++      spin_unlock_bh(&q->lock);
+ }
+ /**
+- * __rpc_wake_up_task - wake up a single rpc_task
++ * __rpc_do_wake_up_task - wake up a single rpc_task
+  * @task: task to be woken up
+  *
+- * Caller must hold rpc_queue_lock
++ * Caller must hold queue->lock, and have cleared the task queued flag.
+  */
+-static void
+-__rpc_wake_up_task(struct rpc_task *task)
++static void __rpc_do_wake_up_task(struct rpc_task *task)
+ {
+-      dprintk("RPC: %4d __rpc_wake_up_task (now %ld inh %d)\n",
+-                                      task->tk_pid, jiffies, rpc_inhibit);
++      dprintk("RPC: %4d __rpc_wake_up_task (now %ld)\n", task->tk_pid, jiffies);
+ #ifdef RPC_DEBUG
+       if (task->tk_magic != 0xf00baa) {
+@@ -445,12 +354,9 @@ __rpc_wake_up_task(struct rpc_task *task
+               printk(KERN_ERR "RPC: Inactive task (%p) being woken up!\n", task);
+               return;
+       }
+-      if (RPC_IS_RUNNING(task))
+-              return;
+       __rpc_disable_timer(task);
+-      if (task->tk_rpcwait != &schedq)
+-              __rpc_remove_wait_queue(task);
++      __rpc_remove_wait_queue(task);
+       rpc_make_runnable(task);
+@@ -458,6 +364,15 @@ __rpc_wake_up_task(struct rpc_task *task
+ }
+ /*
++ * Wake up the specified task
++ */
++static void __rpc_wake_up_task(struct rpc_task *task)
++{
++      if (rpc_test_and_clear_queued(task))
++              __rpc_do_wake_up_task(task);
++}
++
++/*
+  * Default timeout handler if none specified by user
+  */
+ static void
+@@ -471,14 +386,15 @@ __rpc_default_timer(struct rpc_task *tas
+ /*
+  * Wake up the specified task
+  */
+-void
+-rpc_wake_up_task(struct rpc_task *task)
++void rpc_wake_up_task(struct rpc_task *task)
+ {
+-      if (RPC_IS_RUNNING(task))
+-              return;
+-      spin_lock_bh(&rpc_queue_lock);
+-      __rpc_wake_up_task(task);
+-      spin_unlock_bh(&rpc_queue_lock);
++      if (rpc_test_and_clear_queued(task)) {
++              struct rpc_wait_queue *queue = task->u.tk_wait.rpc_waitq;
++
++              spin_lock_bh(&queue->lock);
++              __rpc_do_wake_up_task(task);
++              spin_unlock_bh(&queue->lock);
++      }
+ }
+ /*
+@@ -494,11 +410,11 @@ static struct rpc_task * __rpc_wake_up_n
+        */
+       q = &queue->tasks[queue->priority];
+       if (!list_empty(q)) {
+-              task = list_entry(q->next, struct rpc_task, tk_list);
++              task = list_entry(q->next, struct rpc_task, u.tk_wait.list);
+               if (queue->cookie == task->tk_cookie) {
+                       if (--queue->nr)
+                               goto out;
+-                      list_move_tail(&task->tk_list, q);
++                      list_move_tail(&task->u.tk_wait.list, q);
+               }
+               /*
+                * Check if we need to switch queues.
+@@ -516,7 +432,7 @@ static struct rpc_task * __rpc_wake_up_n
+               else
+                       q = q - 1;
+               if (!list_empty(q)) {
+-                      task = list_entry(q->next, struct rpc_task, tk_list);
++                      task = list_entry(q->next, struct rpc_task, u.tk_wait.list);
+                       goto new_queue;
+               }
+       } while (q != &queue->tasks[queue->priority]);
+@@ -541,14 +457,14 @@ struct rpc_task * rpc_wake_up_next(struc
+       struct rpc_task *task = NULL;
+       dprintk("RPC:      wake_up_next(%p \"%s\")\n", queue, rpc_qname(queue));
+-      spin_lock_bh(&rpc_queue_lock);
++      spin_lock_bh(&queue->lock);
+       if (RPC_IS_PRIORITY(queue))
+               task = __rpc_wake_up_next_priority(queue);
+       else {
+               task_for_first(task, &queue->tasks[0])
+                       __rpc_wake_up_task(task);
+       }
+-      spin_unlock_bh(&rpc_queue_lock);
++      spin_unlock_bh(&queue->lock);
+       return task;
+ }
+@@ -557,25 +473,25 @@ struct rpc_task * rpc_wake_up_next(struc
+  * rpc_wake_up - wake up all rpc_tasks
+  * @queue: rpc_wait_queue on which the tasks are sleeping
+  *
+- * Grabs rpc_queue_lock
++ * Grabs queue->lock
+  */
+ void rpc_wake_up(struct rpc_wait_queue *queue)
+ {
+       struct rpc_task *task;
+       struct list_head *head;
+-      spin_lock_bh(&rpc_queue_lock);
++      spin_lock_bh(&queue->lock);
+       head = &queue->tasks[queue->maxpriority];
+       for (;;) {
+               while (!list_empty(head)) {
+-                      task = list_entry(head->next, struct rpc_task, tk_list);
++                      task = list_entry(head->next, struct rpc_task, u.tk_wait.list);
+                       __rpc_wake_up_task(task);
+               }
+               if (head == &queue->tasks[0])
+                       break;
+               head--;
+       }
+-      spin_unlock_bh(&rpc_queue_lock);
++      spin_unlock_bh(&queue->lock);
+ }
+ /**
+@@ -583,18 +499,18 @@ void rpc_wake_up(struct rpc_wait_queue *
+  * @queue: rpc_wait_queue on which the tasks are sleeping
+  * @status: status value to set
+  *
+- * Grabs rpc_queue_lock
++ * Grabs queue->lock
+  */
+ void rpc_wake_up_status(struct rpc_wait_queue *queue, int status)
+ {
+       struct list_head *head;
+       struct rpc_task *task;
+-      spin_lock_bh(&rpc_queue_lock);
++      spin_lock_bh(&queue->lock);
+       head = &queue->tasks[queue->maxpriority];
+       for (;;) {
+               while (!list_empty(head)) {
+-                      task = list_entry(head->next, struct rpc_task, tk_list);
++                      task = list_entry(head->next, struct rpc_task, u.tk_wait.list);
+                       task->tk_status = status;
+                       __rpc_wake_up_task(task);
+               }
+@@ -602,7 +518,7 @@ void rpc_wake_up_status(struct rpc_wait_
+                       break;
+               head--;
+       }
+-      spin_unlock_bh(&rpc_queue_lock);
++      spin_unlock_bh(&queue->lock);
+ }
+ /*
+@@ -626,18 +542,14 @@ __rpc_atrun(struct rpc_task *task)
+ /*
+  * This is the RPC `scheduler' (or rather, the finite state machine).
+  */
+-static int
+-__rpc_execute(struct rpc_task *task)
++static int __rpc_execute(struct rpc_task *task)
+ {
+       int             status = 0;
+       dprintk("RPC: %4d rpc_execute flgs %x\n",
+                               task->tk_pid, task->tk_flags);
+-      if (!RPC_IS_RUNNING(task)) {
+-              printk(KERN_WARNING "RPC: rpc_execute called for sleeping task!!\n");
+-              return 0;
+-      }
++      BUG_ON(RPC_IS_QUEUED(task));
+  restarted:
+       while (1) {
+@@ -657,7 +569,9 @@ __rpc_execute(struct rpc_task *task)
+                        */
+                       save_callback=task->tk_callback;
+                       task->tk_callback=NULL;
++                      lock_kernel();
+                       save_callback(task);
++                      unlock_kernel();
+               }
+               /*
+@@ -665,43 +579,41 @@ __rpc_execute(struct rpc_task *task)
+                * tk_action may be NULL when the task has been killed
+                * by someone else.
+                */
+-              if (RPC_IS_RUNNING(task)) {
++              if (!RPC_IS_QUEUED(task)) {
+                       /*
+                        * Garbage collection of pending timers...
+                        */
+                       rpc_delete_timer(task);
+                       if (!task->tk_action)
+                               break;
++                      lock_kernel();
+                       task->tk_action(task);
+-                      /* micro-optimization to avoid spinlock */
+-                      if (RPC_IS_RUNNING(task))
+-                              continue;
++                      unlock_kernel();
+               }
+               /*
+-               * Check whether task is sleeping.
++               * Lockless check for whether task is sleeping or not.
+                */
+-              spin_lock_bh(&rpc_queue_lock);
+-              if (!RPC_IS_RUNNING(task)) {
+-                      rpc_set_sleeping(task);
+-                      if (RPC_IS_ASYNC(task)) {
+-                              spin_unlock_bh(&rpc_queue_lock);
++              if (!RPC_IS_QUEUED(task))
++                      continue;
++              if (RPC_IS_ASYNC(task)) {
++                      rpc_clear_running(task);
++                      /* Careful! we may have raced... */
++                      if (RPC_IS_QUEUED(task))
+                               return 0;
+-                      }
++                      if (rpc_test_and_set_running(task))
++                              return 0;
++                      continue;
+               }
+-              spin_unlock_bh(&rpc_queue_lock);
+-              if (!RPC_IS_SLEEPING(task))
+-                      continue;
++              init_waitqueue_head(&task->u.tk_wait.waitq);
++              rpc_clear_running(task);
+               /* sync task: sleep here */
+               dprintk("RPC: %4d sync task going to sleep\n", task->tk_pid);
+-              if (current->pid == rpciod_pid)
+-                      printk(KERN_ERR "RPC: rpciod waiting on sync task!\n");
+-
+               if (!task->tk_client->cl_intr) {
+-                      __wait_event(task->tk_wait, !RPC_IS_SLEEPING(task));
++                      __wait_event(task->u.tk_wait.waitq, RPC_IS_RUNNING(task));
+               } else {
+-                      __wait_event_interruptible(task->tk_wait, !RPC_IS_SLEEPING(task), status);
++                      __wait_event_interruptible(task->u.tk_wait.waitq, RPC_IS_RUNNING(task), status);
+                       /*
+                        * When a sync task receives a signal, it exits with
+                        * -ERESTARTSYS. In order to catch any callbacks that
+@@ -719,7 +631,9 @@ __rpc_execute(struct rpc_task *task)
+       }
+       if (task->tk_exit) {
++              lock_kernel();
+               task->tk_exit(task);
++              unlock_kernel();
+               /* If tk_action is non-null, the user wants us to restart */
+               if (task->tk_action) {
+                       if (!RPC_ASSASSINATED(task)) {
+@@ -738,7 +652,6 @@ __rpc_execute(struct rpc_task *task)
+       /* Release all resources associated with the task */
+       rpc_release_task(task);
+-
+       return status;
+ }
+@@ -754,57 +667,16 @@ __rpc_execute(struct rpc_task *task)
+ int
+ rpc_execute(struct rpc_task *task)
+ {
+-      int status = -EIO;
+-      if (rpc_inhibit) {
+-              printk(KERN_INFO "RPC: execution inhibited!\n");
+-              goto out_release;
+-      }
+-
+-      status = -EWOULDBLOCK;
+-      if (task->tk_active) {
+-              printk(KERN_ERR "RPC: active task was run twice!\n");
+-              goto out_err;
+-      }
++      BUG_ON(task->tk_active);
+       task->tk_active = 1;
+       rpc_set_running(task);
+       return __rpc_execute(task);
+- out_release:
+-      rpc_release_task(task);
+- out_err:
+-      return status;
+ }
+-/*
+- * This is our own little scheduler for async RPC tasks.
+- */
+-static void
+-__rpc_schedule(void)
++static void rpc_async_schedule(void *arg)
+ {
+-      struct rpc_task *task;
+-      int             count = 0;
+-
+-      dprintk("RPC:      rpc_schedule enter\n");
+-      while (1) {
+-
+-              task_for_first(task, &schedq.tasks[0]) {
+-                      __rpc_remove_wait_queue(task);
+-                      spin_unlock_bh(&rpc_queue_lock);
+-
+-                      __rpc_execute(task);
+-                      spin_lock_bh(&rpc_queue_lock);
+-              } else {
+-                      break;
+-              }
+-
+-              if (++count >= 200 || need_resched()) {
+-                      count = 0;
+-                      spin_unlock_bh(&rpc_queue_lock);
+-                      schedule();
+-                      spin_lock_bh(&rpc_queue_lock);
+-              }
+-      }
+-      dprintk("RPC:      rpc_schedule leave\n");
++      __rpc_execute((struct rpc_task *)arg);
+ }
+ /*
+@@ -862,7 +734,6 @@ void rpc_init_task(struct rpc_task *task
+       task->tk_client = clnt;
+       task->tk_flags  = flags;
+       task->tk_exit   = callback;
+-      init_waitqueue_head(&task->tk_wait);
+       if (current->uid != current->fsuid || current->gid != current->fsgid)
+               task->tk_flags |= RPC_TASK_SETUID;
+@@ -873,7 +744,9 @@ void rpc_init_task(struct rpc_task *task
+       task->tk_priority = RPC_PRIORITY_NORMAL;
+       task->tk_cookie = (unsigned long)current;
+-      INIT_LIST_HEAD(&task->tk_links);
++
++      /* Initialize workqueue for async tasks */
++      task->tk_workqueue = rpciod_workqueue;
+       /* Add to global list of all tasks */
+       spin_lock(&rpc_sched_lock);
+@@ -942,8 +815,7 @@ cleanup:
+       goto out;
+ }
+-void
+-rpc_release_task(struct rpc_task *task)
++void rpc_release_task(struct rpc_task *task)
+ {
+       dprintk("RPC: %4d release task\n", task->tk_pid);
+@@ -961,19 +833,9 @@ rpc_release_task(struct rpc_task *task)
+       list_del(&task->tk_task);
+       spin_unlock(&rpc_sched_lock);
+-      /* Protect the execution below. */
+-      spin_lock_bh(&rpc_queue_lock);
+-
+-      /* Disable timer to prevent zombie wakeup */
+-      __rpc_disable_timer(task);
+-
+-      /* Remove from any wait queue we're still on */
+-      __rpc_remove_wait_queue(task);
+-
++      BUG_ON (rpc_test_and_clear_queued(task));
+       task->tk_active = 0;
+-      spin_unlock_bh(&rpc_queue_lock);
+-
+       /* Synchronously delete any running timer */
+       rpc_delete_timer(task);
+@@ -1003,10 +865,9 @@ rpc_release_task(struct rpc_task *task)
+  * queue 'childq'. If so returns a pointer to the parent.
+  * Upon failure returns NULL.
+  *
+- * Caller must hold rpc_queue_lock
++ * Caller must hold childq.lock
+  */
+-static inline struct rpc_task *
+-rpc_find_parent(struct rpc_task *child)
++static inline struct rpc_task *rpc_find_parent(struct rpc_task *child)
+ {
+       struct rpc_task *task, *parent;
+       struct list_head *le;
+@@ -1019,17 +880,16 @@ rpc_find_parent(struct rpc_task *child)
+       return NULL;
+ }
+-static void
+-rpc_child_exit(struct rpc_task *child)
++static void rpc_child_exit(struct rpc_task *child)
+ {
+       struct rpc_task *parent;
+-      spin_lock_bh(&rpc_queue_lock);
++      spin_lock_bh(&childq.lock);
+       if ((parent = rpc_find_parent(child)) != NULL) {
+               parent->tk_status = child->tk_status;
+               __rpc_wake_up_task(parent);
+       }
+-      spin_unlock_bh(&rpc_queue_lock);
++      spin_unlock_bh(&childq.lock);
+ }
+ /*
+@@ -1052,22 +912,20 @@ fail:
+       return NULL;
+ }
+-void
+-rpc_run_child(struct rpc_task *task, struct rpc_task *child, rpc_action func)
++void rpc_run_child(struct rpc_task *task, struct rpc_task *child, rpc_action func)
+ {
+-      spin_lock_bh(&rpc_queue_lock);
++      spin_lock_bh(&childq.lock);
+       /* N.B. Is it possible for the child to have already finished? */
+       __rpc_sleep_on(&childq, task, func, NULL);
+       rpc_schedule_run(child);
+-      spin_unlock_bh(&rpc_queue_lock);
++      spin_unlock_bh(&childq.lock);
+ }
+ /*
+  * Kill all tasks for the given client.
+  * XXX: kill their descendants as well?
+  */
+-void
+-rpc_killall_tasks(struct rpc_clnt *clnt)
++void rpc_killall_tasks(struct rpc_clnt *clnt)
+ {
+       struct rpc_task *rovr;
+       struct list_head *le;
+@@ -1089,93 +947,14 @@ rpc_killall_tasks(struct rpc_clnt *clnt)
+ static DECLARE_MUTEX_LOCKED(rpciod_running);
+-static inline int
+-rpciod_task_pending(void)
+-{
+-      return !list_empty(&schedq.tasks[0]);
+-}
+-
+-
+-/*
+- * This is the rpciod kernel thread
+- */
+-static int
+-rpciod(void *ptr)
+-{
+-      int             rounds = 0;
+-
+-      lock_kernel();
+-      /*
+-       * Let our maker know we're running ...
+-       */
+-      rpciod_pid = current->pid;
+-      up(&rpciod_running);
+-
+-      daemonize("rpciod");
+-      allow_signal(SIGKILL);
+-
+-      dprintk("RPC: rpciod starting (pid %d)\n", rpciod_pid);
+-      spin_lock_bh(&rpc_queue_lock);
+-      while (rpciod_users) {
+-              DEFINE_WAIT(wait);
+-              if (signalled()) {
+-                      spin_unlock_bh(&rpc_queue_lock);
+-                      rpciod_killall();
+-                      flush_signals(current);
+-                      spin_lock_bh(&rpc_queue_lock);
+-              }
+-              __rpc_schedule();
+-              if (current->flags & PF_FREEZE) {
+-                      spin_unlock_bh(&rpc_queue_lock);
+-                      refrigerator(PF_FREEZE);
+-                      spin_lock_bh(&rpc_queue_lock);
+-              }
+-
+-              if (++rounds >= 64) {   /* safeguard */
+-                      spin_unlock_bh(&rpc_queue_lock);
+-                      schedule();
+-                      rounds = 0;
+-                      spin_lock_bh(&rpc_queue_lock);
+-              }
+-
+-              dprintk("RPC: rpciod back to sleep\n");
+-              prepare_to_wait(&rpciod_idle, &wait, TASK_INTERRUPTIBLE);
+-              if (!rpciod_task_pending() && !signalled()) {
+-                      spin_unlock_bh(&rpc_queue_lock);
+-                      schedule();
+-                      rounds = 0;
+-                      spin_lock_bh(&rpc_queue_lock);
+-              }
+-              finish_wait(&rpciod_idle, &wait);
+-              dprintk("RPC: switch to rpciod\n");
+-      }
+-      spin_unlock_bh(&rpc_queue_lock);
+-
+-      dprintk("RPC: rpciod shutdown commences\n");
+-      if (!list_empty(&all_tasks)) {
+-              printk(KERN_ERR "rpciod: active tasks at shutdown?!\n");
+-              rpciod_killall();
+-      }
+-
+-      dprintk("RPC: rpciod exiting\n");
+-      unlock_kernel();
+-
+-      rpciod_pid = 0;
+-      complete_and_exit(&rpciod_killer, 0);
+-      return 0;
+-}
+-
+-static void
+-rpciod_killall(void)
++static void rpciod_killall(void)
+ {
+       unsigned long flags;
+       while (!list_empty(&all_tasks)) {
+               clear_thread_flag(TIF_SIGPENDING);
+               rpc_killall_tasks(NULL);
+-              spin_lock_bh(&rpc_queue_lock);
+-              __rpc_schedule();
+-              spin_unlock_bh(&rpc_queue_lock);
++              flush_workqueue(rpciod_workqueue);
+               if (!list_empty(&all_tasks)) {
+                       dprintk("rpciod_killall: waiting for tasks to exit\n");
+                       yield();
+@@ -1193,28 +972,30 @@ rpciod_killall(void)
+ int
+ rpciod_up(void)
+ {
++      struct workqueue_struct *wq;
+       int error = 0;
+       down(&rpciod_sema);
+-      dprintk("rpciod_up: pid %d, users %d\n", rpciod_pid, rpciod_users);
++      dprintk("rpciod_up: users %d\n", rpciod_users);
+       rpciod_users++;
+-      if (rpciod_pid)
++      if (rpciod_workqueue)
+               goto out;
+       /*
+        * If there's no pid, we should be the first user.
+        */
+       if (rpciod_users > 1)
+-              printk(KERN_WARNING "rpciod_up: no pid, %d users??\n", rpciod_users);
++              printk(KERN_WARNING "rpciod_up: no workqueue, %d users??\n", rpciod_users);
+       /*
+        * Create the rpciod thread and wait for it to start.
+        */
+-      error = kernel_thread(rpciod, NULL, 0);
+-      if (error < 0) {
+-              printk(KERN_WARNING "rpciod_up: create thread failed, error=%d\n", error);
++      error = -ENOMEM;
++      wq = create_workqueue("rpciod");
++      if (wq == NULL) {
++              printk(KERN_WARNING "rpciod_up: create workqueue failed, error=%d\n", error);
+               rpciod_users--;
+               goto out;
+       }
+-      down(&rpciod_running);
++      rpciod_workqueue = wq;
+       error = 0;
+ out:
+       up(&rpciod_sema);
+@@ -1225,20 +1006,21 @@ void
+ rpciod_down(void)
+ {
+       down(&rpciod_sema);
+-      dprintk("rpciod_down pid %d sema %d\n", rpciod_pid, rpciod_users);
++      dprintk("rpciod_down sema %d\n", rpciod_users);
+       if (rpciod_users) {
+               if (--rpciod_users)
+                       goto out;
+       } else
+-              printk(KERN_WARNING "rpciod_down: pid=%d, no users??\n", rpciod_pid);
++              printk(KERN_WARNING "rpciod_down: no users??\n");
+-      if (!rpciod_pid) {
++      if (!rpciod_workqueue) {
+               dprintk("rpciod_down: Nothing to do!\n");
+               goto out;
+       }
++      rpciod_killall();
+-      kill_proc(rpciod_pid, SIGKILL, 1);
+-      wait_for_completion(&rpciod_killer);
++      destroy_workqueue(rpciod_workqueue);
++      rpciod_workqueue = NULL;
+  out:
+       up(&rpciod_sema);
+ }
+@@ -1256,7 +1038,12 @@ void rpc_show_tasks(void)
+       }
+       printk("-pid- proc flgs status -client- -prog- --rqstp- -timeout "
+               "-rpcwait -action- --exit--\n");
+-      alltask_for_each(t, le, &all_tasks)
++      alltask_for_each(t, le, &all_tasks) {
++              const char *rpc_waitq = "none";
++
++              if (RPC_IS_QUEUED(t))
++                      rpc_waitq = rpc_qname(t->u.tk_wait.rpc_waitq);
++
+               printk("%05d %04d %04x %06d %8p %6d %8p %08ld %8s %8p %8p\n",
+                       t->tk_pid,
+                       (t->tk_msg.rpc_proc ? t->tk_msg.rpc_proc->p_proc : -1),
+@@ -1264,8 +1051,9 @@ void rpc_show_tasks(void)
+                       t->tk_client,
+                       (t->tk_client ? t->tk_client->cl_prog : 0),
+                       t->tk_rqstp, t->tk_timeout,
+-                      rpc_qname(t->tk_rpcwait),
++                      rpc_waitq,
+                       t->tk_action, t->tk_exit);
++      }
+       spin_unlock(&rpc_sched_lock);
+ }
+ #endif
+--- linux-2.6.7/net/sunrpc/svcsock.c.lsec      2004-06-15 23:18:57.000000000 -0600
++++ linux-2.6.7/net/sunrpc/svcsock.c   2005-03-23 14:28:24.029396672 -0700
+@@ -414,7 +414,6 @@ svc_sendto(struct svc_rqst *rqstp, struc
+       }
+       /* send tail */
+       if (xdr->tail[0].iov_len) {
+-              /* The tail *will* be in respages[0]; */
+               result = sock->ops->sendpage(sock, rqstp->rq_respages[rqstp->rq_restailpage], 
+                                            ((unsigned long)xdr->tail[0].iov_base)& (PAGE_SIZE-1),
+                                            xdr->tail[0].iov_len, 0);
+--- linux-2.6.7/net/sunrpc/clnt.c.lsec 2004-06-15 23:19:13.000000000 -0600
++++ linux-2.6.7/net/sunrpc/clnt.c      2005-03-23 14:28:23.595462640 -0700
+@@ -351,7 +351,9 @@ int rpc_call_sync(struct rpc_clnt *clnt,
+       rpc_clnt_sigmask(clnt, &oldset);                
+       /* Create/initialize a new RPC task */
+-      rpc_init_task(task, clnt, NULL, flags);
++      task = rpc_new_task(clnt, NULL, flags);
++      if (task == NULL)
++              return -ENOMEM;
+       rpc_call_setup(task, msg, 0);
+       /* Set up the call info struct and execute the task */
+@@ -620,8 +622,14 @@ call_encode(struct rpc_task *task)
+               rpc_exit(task, -EIO);
+               return;
+       }
+-      if (encode && (status = rpcauth_wrap_req(task, encode, req, p,
+-                                               task->tk_msg.rpc_argp)) < 0) {
++      if (encode == NULL)
++              return;
++
++      status = rpcauth_wrap_req(task, encode, req, p, task->tk_msg.rpc_argp);
++      if (status == -EAGAIN) {
++              printk("XXXJBF: out of memeory?  Should retry here!!!\n");
++      }
++      if (status < 0) {
+               printk(KERN_WARNING "%s: can't encode arguments: %d\n",
+                               clnt->cl_protname, -status);
+               rpc_exit(task, status);
+--- linux-2.6.7/net/sunrpc/sunrpc_syms.c.lsec  2004-06-15 23:19:52.000000000 -0600
++++ linux-2.6.7/net/sunrpc/sunrpc_syms.c       2005-03-23 14:32:35.589153776 -0700
+@@ -58,6 +58,8 @@ EXPORT_SYMBOL(rpc_unlink);
+ EXPORT_SYMBOL(rpc_wake_up);
+ EXPORT_SYMBOL(rpc_queue_upcall);
+ EXPORT_SYMBOL(rpc_mkpipe);
++EXPORT_SYMBOL(rpc_mkdir);
++EXPORT_SYMBOL(rpc_rmdir);
+ /* Client transport */
+ EXPORT_SYMBOL(xprt_create_proto);
+@@ -89,6 +91,7 @@ EXPORT_SYMBOL(svc_makesock);
+ EXPORT_SYMBOL(svc_reserve);
+ EXPORT_SYMBOL(svc_auth_register);
+ EXPORT_SYMBOL(auth_domain_lookup);
++EXPORT_SYMBOL(svc_authenticate);
+ /* RPC statistics */
+ #ifdef CONFIG_PROC_FS
+--- linux-2.6.7/net/sunrpc/pmap_clnt.c.lsec    2004-06-15 23:19:23.000000000 -0600
++++ linux-2.6.7/net/sunrpc/pmap_clnt.c 2005-03-23 14:28:24.134380712 -0700
+@@ -183,8 +183,10 @@ rpc_register(u32 prog, u32 vers, int pro
+       map.pm_prot = prot;
+       map.pm_port = port;
++      rpciod_up();
+       error = rpc_call(pmap_clnt, port? PMAP_SET : PMAP_UNSET,
+                                       &map, okay, 0);
++      rpciod_down();
+       if (error < 0) {
+               printk(KERN_WARNING
+--- linux-2.6.7/net/sunrpc/auth_gss/gss_krb5_unseal.c.lsec     2004-06-15 23:19:44.000000000 -0600
++++ linux-2.6.7/net/sunrpc/auth_gss/gss_krb5_unseal.c  2005-03-23 14:28:23.761437408 -0700
+@@ -68,20 +68,13 @@
+ #endif
+-/* message_buffer is an input if toktype is MIC and an output if it is WRAP:
+- * If toktype is MIC: read_token is a mic token, and message_buffer is the
+- *   data that the mic was supposedly taken over.
+- * If toktype is WRAP: read_token is a wrap token, and message_buffer is used
+- *   to return the decrypted data.
+- */
++/* read_token is a mic token, and message_buffer is the data that the mic was
++ * supposedly taken over. */
+-/* XXX will need to change prototype and/or just split into a separate function
+- * when we add privacy (because read_token will be in pages too). */
+ u32
+ krb5_read_token(struct krb5_ctx *ctx,
+               struct xdr_netobj *read_token,
+-              struct xdr_buf *message_buffer,
+-              int *qop_state, int toktype)
++              struct xdr_buf *message_buffer, int *qop_state)
+ {
+       int                     signalg;
+       int                     sealalg;
+@@ -96,20 +89,16 @@ krb5_read_token(struct krb5_ctx *ctx,
+       dprintk("RPC:      krb5_read_token\n");
+-      if (g_verify_token_header(&ctx->mech_used, &bodysize, &ptr, toktype,
++      if (g_verify_token_header(&ctx->mech_used, &bodysize, &ptr,
+                                       read_token->len))
+               goto out;
+-      if ((*ptr++ != ((toktype>>8)&0xff)) || (*ptr++ != (toktype&0xff)))
++      if ((*ptr++ != ((KG_TOK_MIC_MSG>>8)&0xff)) ||
++          (*ptr++ != ( KG_TOK_MIC_MSG    &0xff))   )
+               goto out;
+       /* XXX sanity-check bodysize?? */
+-      if (toktype == KG_TOK_WRAP_MSG) {
+-              /* XXX gone */
+-              goto out;
+-      }
+-
+       /* get the sign and seal algorithms */
+       signalg = ptr[0] + (ptr[1] << 8);
+@@ -120,14 +109,7 @@ krb5_read_token(struct krb5_ctx *ctx,
+       if ((ptr[4] != 0xff) || (ptr[5] != 0xff))
+               goto out;
+-      if (((toktype != KG_TOK_WRAP_MSG) && (sealalg != 0xffff)) ||
+-          ((toktype == KG_TOK_WRAP_MSG) && (sealalg == 0xffff)))
+-              goto out;
+-
+-      /* in the current spec, there is only one valid seal algorithm per
+-         key type, so a simple comparison is ok */
+-
+-      if ((toktype == KG_TOK_WRAP_MSG) && !(sealalg == ctx->sealalg))
++      if (sealalg != 0xffff)
+               goto out;
+       /* there are several mappings of seal algorithms to sign algorithms,
+@@ -154,7 +136,7 @@ krb5_read_token(struct krb5_ctx *ctx,
+       switch (signalg) {
+       case SGN_ALG_DES_MAC_MD5:
+               ret = make_checksum(checksum_type, ptr - 2, 8,
+-                                       message_buffer, &md5cksum);
++                                       message_buffer, 0, &md5cksum);
+               if (ret)
+                       goto out;
+--- linux-2.6.7/net/sunrpc/auth_gss/auth_gss.c.lsec    2004-06-15 23:19:22.000000000 -0600
++++ linux-2.6.7/net/sunrpc/auth_gss/auth_gss.c 2005-03-23 14:28:24.185372960 -0700
+@@ -45,6 +45,7 @@
+ #include <linux/socket.h>
+ #include <linux/in.h>
+ #include <linux/sched.h>
++#include <linux/pagemap.h>
+ #include <linux/sunrpc/clnt.h>
+ #include <linux/sunrpc/auth.h>
+ #include <linux/sunrpc/auth_gss.h>
+@@ -397,7 +398,7 @@ retry:
+               spin_unlock(&gss_auth->lock);
+       }
+       gss_release_msg(gss_msg);
+-      dprintk("RPC: %4u gss_upcall for uid %u result %d", task->tk_pid,
++      dprintk("RPC: %4u gss_upcall for uid %u result %d\n", task->tk_pid,
+                       uid, res);
+       return res;
+ out_sleep:
+@@ -740,6 +741,8 @@ gss_marshal(struct rpc_task *task, u32 *
+       maj_stat = gss_get_mic(ctx->gc_gss_ctx,
+                              GSS_C_QOP_DEFAULT, 
+                              &verf_buf, &mic);
++      if (maj_stat == GSS_S_CONTEXT_EXPIRED)
++              cred->cr_flags |= RPCAUTH_CRED_DEAD;
+       if(maj_stat != 0){
+               printk("gss_marshal: gss_get_mic FAILED (%d)\n", maj_stat);
+               goto out_put_ctx;
+@@ -779,6 +782,7 @@ gss_validate(struct rpc_task *task, u32 
+       struct xdr_netobj mic;
+       u32             flav,len;
+       u32             service;
++      u32             maj_stat;
+       dprintk("RPC: %4u gss_validate\n", task->tk_pid);
+@@ -794,8 +798,11 @@ gss_validate(struct rpc_task *task, u32 
+       mic.data = (u8 *)p;
+       mic.len = len;
+-      if (gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic, &qop_state))
+-               goto out_bad;
++      maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic, &qop_state);
++      if (maj_stat == GSS_S_CONTEXT_EXPIRED)
++              cred->cr_flags |= RPCAUTH_CRED_DEAD;
++      if (maj_stat)
++              goto out_bad;
+        service = gss_pseudoflavor_to_service(ctx->gc_gss_ctx->mech_type,
+                                       gss_cred->gc_flavor);
+        switch (service) {
+@@ -807,6 +814,11 @@ gss_validate(struct rpc_task *task, u32 
+              /* verifier data, flavor, length, length, sequence number: */
+              task->tk_auth->au_rslack = XDR_QUADLEN(len) + 4;
+              break;
++       case RPC_GSS_SVC_PRIVACY:
++             /* XXXJBF: Ugh. Going for a wild overestimate.
++              * Need some info from krb5 layer? */
++             task->tk_auth->au_rslack = XDR_QUADLEN(len) + 32;
++             break;
+        default:
+              goto out_bad;
+        }
+@@ -821,11 +833,11 @@ out_bad:
+ }
+ static inline int
+-gss_wrap_req_integ(struct gss_cl_ctx *ctx,
+-                      kxdrproc_t encode, void *rqstp, u32 *p, void *obj)
++gss_wrap_req_integ(struct rpc_cred *cred, kxdrproc_t encode,
++                      struct rpc_rqst *rqstp, u32 *p, void *obj)
+ {
+-      struct rpc_rqst *req = (struct rpc_rqst *)rqstp;
+-      struct xdr_buf  *snd_buf = &req->rq_snd_buf;
++      struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
++      struct xdr_buf  *snd_buf = &rqstp->rq_snd_buf;
+       struct xdr_buf  integ_buf;
+       u32             *integ_len = NULL;
+       struct xdr_netobj mic;
+@@ -836,7 +848,7 @@ gss_wrap_req_integ(struct gss_cl_ctx *ct
+       integ_len = p++;
+       offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base;
+-      *p++ = htonl(req->rq_seqno);
++      *p++ = htonl(rqstp->rq_seqno);
+       status = encode(rqstp, p, obj);
+       if (status)
+@@ -848,7 +860,7 @@ gss_wrap_req_integ(struct gss_cl_ctx *ct
+       *integ_len = htonl(integ_buf.len);
+       /* guess whether we're in the head or the tail: */
+-      if (snd_buf->page_len || snd_buf->tail[0].iov_len) 
++      if (snd_buf->page_len || snd_buf->tail[0].iov_len)
+               iov = snd_buf->tail;
+       else
+               iov = snd_buf->head;
+@@ -857,6 +869,8 @@ gss_wrap_req_integ(struct gss_cl_ctx *ct
+       maj_stat = gss_get_mic(ctx->gc_gss_ctx,
+                       GSS_C_QOP_DEFAULT, &integ_buf, &mic);
++      if (maj_stat == GSS_S_CONTEXT_EXPIRED)
++              cred->cr_flags |= RPCAUTH_CRED_DEAD;
+       status = -EIO; /* XXX? */
+       if (maj_stat)
+               return status;
+@@ -868,6 +882,113 @@ gss_wrap_req_integ(struct gss_cl_ctx *ct
+       return 0;
+ }
++static void
++priv_release_snd_buf(struct rpc_rqst *rqstp)
++{
++      int i;
++
++      for (i=0; i < rqstp->rq_enc_pages_num; i++)
++              __free_page(rqstp->rq_enc_pages[i]);
++      kfree(rqstp->rq_enc_pages);
++}
++
++static int
++alloc_enc_pages(struct rpc_rqst *rqstp)
++{
++      struct xdr_buf *snd_buf = &rqstp->rq_snd_buf;
++      int first, last, i;
++
++      if (snd_buf->page_len == 0) {
++              rqstp->rq_enc_pages_num = 0;
++              return 0;
++      }
++
++      first = snd_buf->page_base >> PAGE_CACHE_SHIFT;
++      last = (snd_buf->page_base + snd_buf->page_len - 1) >> PAGE_CACHE_SHIFT;
++      rqstp->rq_enc_pages_num = last - first + 1 + 1;
++      rqstp->rq_enc_pages
++              = kmalloc(rqstp->rq_enc_pages_num * sizeof(struct page *),
++                              GFP_NOFS);
++      if (!rqstp->rq_enc_pages)
++              goto out;
++      for (i=0; i < rqstp->rq_enc_pages_num; i++) {
++              rqstp->rq_enc_pages[i] = alloc_page(GFP_NOFS);
++              if (rqstp->rq_enc_pages[i] == NULL)
++                      goto out_free;
++      }
++      rqstp->rq_release_snd_buf = priv_release_snd_buf;
++      return 0;
++out_free:
++      for (i--; i >= 0; i--) {
++              __free_page(rqstp->rq_enc_pages[i]);
++      }
++out:
++      return -EAGAIN;
++}
++
++static inline int
++gss_wrap_req_priv(struct rpc_cred *cred, kxdrproc_t encode,
++              struct rpc_rqst *rqstp, u32 *p, void *obj)
++{
++      struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
++      struct xdr_buf  *snd_buf = &rqstp->rq_snd_buf;
++      u32             offset;
++      u32             maj_stat;
++      int             status;
++      u32             *opaque_len;
++      struct page     **inpages;
++      int             first;
++      int             pad;
++      struct iovec    *iov;
++      char            *tmp;
++
++      opaque_len = p++;
++      offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base;
++      *p++ = htonl(rqstp->rq_seqno);
++
++      status = encode(rqstp, p, obj);
++      if (status)
++              return status;
++
++      status = alloc_enc_pages(rqstp);
++      if (status)
++              return status;
++      /* XXXJBF: Oops!  Do we need rq_enc_pages really any more?? */
++      first = snd_buf->page_base >> PAGE_CACHE_SHIFT;
++      inpages = snd_buf->pages + first;
++      snd_buf->pages = rqstp->rq_enc_pages;
++      snd_buf->page_base -= first << PAGE_CACHE_SHIFT;
++      /* XXX?: tail needs to be separate if we want to be able to expand
++       * the head (since it's often put right after the head).  But is
++       * expanding the head safe in any case? */
++      if (snd_buf->page_len || snd_buf->tail[0].iov_len) {
++              tmp = page_address(rqstp->rq_enc_pages[rqstp->rq_enc_pages_num - 1]);
++              memcpy(tmp, snd_buf->tail[0].iov_base, snd_buf->tail[0].iov_len);
++              snd_buf->tail[0].iov_base = tmp;
++      }
++      maj_stat = gss_wrap(ctx->gc_gss_ctx, GSS_C_QOP_DEFAULT, offset,
++                              snd_buf, inpages);
++      if (maj_stat == GSS_S_CONTEXT_EXPIRED)
++              cred->cr_flags |= RPCAUTH_CRED_DEAD;
++        status = -EIO; /* XXX? */
++      if (maj_stat)
++              return status;
++
++      *opaque_len = htonl(snd_buf->len - offset);
++      /* guess whether we're in the head or the tail: */
++      if (snd_buf->page_len || snd_buf->tail[0].iov_len)
++              iov = snd_buf->tail;
++      else
++              iov = snd_buf->head;
++      p = iov->iov_base + iov->iov_len;
++      pad = 3 - ((snd_buf->len - offset - 1) & 3);
++      memset(p, 0, pad);
++      iov->iov_len += pad;
++      snd_buf->len += pad;
++
++      return 0;
++}
++
+ static int
+ gss_wrap_req(struct rpc_task *task,
+            kxdrproc_t encode, void *rqstp, u32 *p, void *obj)
+@@ -894,9 +1015,11 @@ gss_wrap_req(struct rpc_task *task,
+                       status = encode(rqstp, p, obj);
+                       goto out;
+               case RPC_GSS_SVC_INTEGRITY:
+-                      status = gss_wrap_req_integ(ctx, encode, rqstp, p, obj);
++                      status = gss_wrap_req_integ(cred, encode, rqstp, p, obj);
+                       goto out;
+               case RPC_GSS_SVC_PRIVACY:
++                      status = gss_wrap_req_priv(cred, encode, rqstp, p, obj);
++                      goto out;
+               default:
+                       goto out;
+       }
+@@ -907,11 +1030,10 @@ out:
+ }
+ static inline int
+-gss_unwrap_resp_integ(struct gss_cl_ctx *ctx,
+-              kxdrproc_t decode, void *rqstp, u32 **p, void *obj)
++gss_unwrap_resp_integ(struct rpc_cred *cred, struct rpc_rqst *rqstp, u32 **p)
+ {
+-      struct rpc_rqst *req = (struct rpc_rqst *)rqstp;
+-      struct xdr_buf  *rcv_buf = &req->rq_rcv_buf;
++      struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
++      struct xdr_buf  *rcv_buf = &rqstp->rq_rcv_buf;
+       struct xdr_buf integ_buf;
+       struct xdr_netobj mic;
+       u32 data_offset, mic_offset;
+@@ -926,7 +1048,7 @@ gss_unwrap_resp_integ(struct gss_cl_ctx 
+       mic_offset = integ_len + data_offset;
+       if (mic_offset > rcv_buf->len)
+               return status;
+-      if (ntohl(*(*p)++) != req->rq_seqno)
++      if (ntohl(*(*p)++) != rqstp->rq_seqno)
+               return status;
+       if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset,
+@@ -938,11 +1060,44 @@ gss_unwrap_resp_integ(struct gss_cl_ctx 
+       maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf,
+                       &mic, NULL);
++      if (maj_stat == GSS_S_CONTEXT_EXPIRED)
++              cred->cr_flags |= RPCAUTH_CRED_DEAD;
+       if (maj_stat != GSS_S_COMPLETE)
+               return status;
+       return 0;
+ }
++static inline int
++gss_unwrap_resp_priv(struct rpc_cred *cred, struct rpc_rqst *rqstp, u32 **p)
++{
++      struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
++      struct xdr_buf  *rcv_buf = &rqstp->rq_rcv_buf;
++      u32 offset, out_offset;
++      u32 opaque_len;
++      u32 maj_stat;
++      int status = -EIO;
++
++      opaque_len = ntohl(*(*p)++);
++      offset = (u8 *)(*p) - (u8 *)rcv_buf->head[0].iov_base;
++      if (offset + opaque_len > rcv_buf->len)
++              return status;
++      /* remove padding: */
++      rcv_buf->len = offset + opaque_len;
++
++      maj_stat = gss_unwrap(ctx->gc_gss_ctx, GSS_C_QOP_DEFAULT,
++                      offset, rcv_buf, &out_offset);
++      if (maj_stat == GSS_S_CONTEXT_EXPIRED)
++              cred->cr_flags |= RPCAUTH_CRED_DEAD;
++      if (maj_stat != GSS_S_COMPLETE)
++              return status;
++      *p = (u32 *)(rcv_buf->head[0].iov_base + out_offset);
++      if (ntohl(*(*p)++) != rqstp->rq_seqno)
++              return status;
++
++      return 0;
++}
++
++
+ static int
+ gss_unwrap_resp(struct rpc_task *task,
+               kxdrproc_t decode, void *rqstp, u32 *p, void *obj)
+@@ -962,12 +1117,16 @@ gss_unwrap_resp(struct rpc_task *task,
+               case RPC_GSS_SVC_NONE:
+                       goto out_decode;
+               case RPC_GSS_SVC_INTEGRITY:
+-                      status = gss_unwrap_resp_integ(ctx, decode, 
+-                                                      rqstp, &p, obj);
++                      status = gss_unwrap_resp_integ(cred, rqstp, &p);
+                       if (status)
+                               goto out;
+                       break;
+               case RPC_GSS_SVC_PRIVACY:
++                      status = gss_unwrap_resp_priv(cred, rqstp, &p);
++                      if (status)
++                              goto out;
++                      break;
++
+               default:
+                       goto out;
+       }
+--- linux-2.6.7/net/sunrpc/auth_gss/gss_spkm3_mech.c.lsec      2005-03-23 14:28:24.187372656 -0700
++++ linux-2.6.7/net/sunrpc/auth_gss/gss_spkm3_mech.c   2005-03-23 14:28:24.186372808 -0700
+@@ -0,0 +1,296 @@
++/*
++ *  linux/net/sunrpc/gss_spkm3_mech.c
++ *
++ *  Copyright (c) 2003 The Regents of the University of Michigan.
++ *  All rights reserved.
++ *
++ *  Andy Adamson <andros@umich.edu>
++ *  J. Bruce Fields <bfields@umich.edu>
++ *
++ *  Redistribution and use in source and binary forms, with or without
++ *  modification, are permitted provided that the following conditions
++ *  are met:
++ *
++ *  1. Redistributions of source code must retain the above copyright
++ *     notice, this list of conditions and the following disclaimer.
++ *  2. Redistributions in binary form must reproduce the above copyright
++ *     notice, this list of conditions and the following disclaimer in the
++ *     documentation and/or other materials provided with the distribution.
++ *  3. Neither the name of the University nor the names of its
++ *     contributors may be used to endorse or promote products derived
++ *     from this software without specific prior written permission.
++ *
++ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ *
++ */
++
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/types.h>
++#include <linux/slab.h>
++#include <linux/sunrpc/auth.h>
++#include <linux/in.h>
++#include <linux/sunrpc/svcauth_gss.h>
++#include <linux/sunrpc/gss_spkm3.h>
++#include <linux/sunrpc/xdr.h>
++#include <linux/crypto.h>
++
++#ifdef RPC_DEBUG
++# define RPCDBG_FACILITY      RPCDBG_AUTH
++#endif
++
++struct xdr_netobj gss_mech_spkm3_oid =
++   {7, "\053\006\001\005\005\001\003"};
++
++static inline int
++get_bytes(char **ptr, const char *end, void *res, int len)
++{
++      char *p, *q;
++      p = *ptr;
++      q = p + len;
++      if (q > end || q < p)
++              return -1;
++      memcpy(res, p, len);
++      *ptr = q;
++      return 0;
++}
++
++static inline int
++get_netobj(char **ptr, const char *end, struct xdr_netobj *res)
++{
++      char *p, *q;
++      p = *ptr;
++      if (get_bytes(&p, end, &res->len, sizeof(res->len)))
++              return -1;
++      q = p + res->len;
++      if(res->len == 0)
++              goto out_nocopy;
++      if (q > end || q < p)
++              return -1;
++      if (!(res->data = kmalloc(res->len, GFP_KERNEL)))
++              return -1;
++      memcpy(res->data, p, res->len);
++out_nocopy:
++      *ptr = q;
++      return 0;
++}
++
++static inline int
++get_key(char **p, char *end, struct crypto_tfm **res, int *resalg)
++{
++      struct xdr_netobj       key = {
++              .len = 0,
++              .data = NULL,
++      };
++      int                     alg_mode,setkey = 0;
++      char                    *alg_name;
++
++      if (get_bytes(p, end, resalg, sizeof(int)))
++              goto out_err;
++      if ((get_netobj(p, end, &key)))
++              goto out_err;
++
++      switch (*resalg) {
++              case NID_des_cbc:
++                      alg_name = "des";
++                      alg_mode = CRYPTO_TFM_MODE_CBC;
++                      setkey = 1;
++                      break;
++              case NID_md5:
++                      if (key.len == 0) {
++                              dprintk("RPC: SPKM3 get_key: NID_md5 zero Key length\n");
++                      }
++                      alg_name = "md5";
++                      alg_mode = 0;
++                      setkey = 0;
++                      break;
++              case NID_cast5_cbc:
++                      dprintk("RPC: SPKM3 get_key: case cast5_cbc, UNSUPPORTED \n");
++                      goto out_err;
++                      break;
++              default:
++                      dprintk("RPC: SPKM3 get_key: unsupported algorithm %d", *resalg);
++                      goto out_err_free_key;
++      }
++      if (!(*res = crypto_alloc_tfm(alg_name, alg_mode)))
++              goto out_err_free_key;
++      if (setkey) {
++              if (crypto_cipher_setkey(*res, key.data, key.len))
++                      goto out_err_free_tfm;
++      }
++
++      if(key.len > 0)
++              kfree(key.data);
++      return 0;
++
++out_err_free_tfm:
++      crypto_free_tfm(*res);
++out_err_free_key:
++      if(key.len > 0)
++              kfree(key.data);
++out_err:
++      return -1;
++}
++
++static u32
++gss_import_sec_context_spkm3(struct xdr_netobj *inbuf,
++                              struct gss_ctx *ctx_id)
++{
++      char    *p = inbuf->data;
++      char    *end = inbuf->data + inbuf->len;
++      struct  spkm3_ctx *ctx;
++
++      if (!(ctx = kmalloc(sizeof(*ctx), GFP_KERNEL)))
++              goto out_err;
++      memset(ctx, 0, sizeof(*ctx));
++
++      if (get_netobj(&p, end, &ctx->ctx_id))
++              goto out_err_free_ctx;
++
++      if (get_bytes(&p, end, &ctx->qop, sizeof(ctx->qop)))
++              goto out_err_free_ctx_id;
++
++      if (get_netobj(&p, end, &ctx->mech_used))
++              goto out_err_free_mech;
++
++      if (get_bytes(&p, end, &ctx->ret_flags, sizeof(ctx->ret_flags)))
++              goto out_err_free_mech;
++
++      if (get_bytes(&p, end, &ctx->req_flags, sizeof(ctx->req_flags)))
++              goto out_err_free_mech;
++
++      if (get_netobj(&p, end, &ctx->share_key))
++              goto out_err_free_s_key;
++
++      if (get_key(&p, end, &ctx->derived_conf_key, &ctx->conf_alg)) {
++              dprintk("RPC: SPKM3 confidentiality key will be NULL\n");
++      }
++
++      if (get_key(&p, end, &ctx->derived_integ_key, &ctx->intg_alg)) {
++              dprintk("RPC: SPKM3 integrity key will be NULL\n");
++      }
++
++      if (get_bytes(&p, end, &ctx->owf_alg, sizeof(ctx->owf_alg)))
++              goto out_err_free_s_key;
++
++      if (get_bytes(&p, end, &ctx->owf_alg, sizeof(ctx->owf_alg)))
++              goto out_err_free_s_key;
++
++      if (p != end)
++              goto out_err_free_s_key;
++
++      ctx_id->internal_ctx_id = ctx;
++
++      dprintk("Successfully imported new spkm context.\n");
++      return 0;
++
++out_err_free_s_key:
++      kfree(ctx->share_key.data);
++out_err_free_mech:
++      kfree(ctx->mech_used.data);
++out_err_free_ctx_id:
++      kfree(ctx->ctx_id.data);
++out_err_free_ctx:
++      kfree(ctx);
++out_err:
++      return GSS_S_FAILURE;
++}
++
++void
++gss_delete_sec_context_spkm3(void *internal_ctx) {
++      struct spkm3_ctx *sctx = internal_ctx;
++
++      if(sctx->derived_integ_key)
++              crypto_free_tfm(sctx->derived_integ_key);
++      if(sctx->derived_conf_key)
++              crypto_free_tfm(sctx->derived_conf_key);
++      if(sctx->share_key.data)
++              kfree(sctx->share_key.data);
++      if(sctx->mech_used.data)
++              kfree(sctx->mech_used.data);
++      kfree(sctx);
++}
++
++u32
++gss_verify_mic_spkm3(struct gss_ctx           *ctx,
++                      struct xdr_buf          *signbuf,
++                      struct xdr_netobj       *checksum,
++                      u32             *qstate) {
++      u32 maj_stat = 0;
++      int qop_state = 0;
++      struct spkm3_ctx *sctx = ctx->internal_ctx_id;
++
++      dprintk("RPC: gss_verify_mic_spkm3 calling spkm3_read_token\n");
++      maj_stat = spkm3_read_token(sctx, checksum, signbuf, &qop_state,
++                                 SPKM_MIC_TOK);
++
++      if (!maj_stat && qop_state)
++          *qstate = qop_state;
++
++      dprintk("RPC: gss_verify_mic_spkm3 returning %d\n", maj_stat);
++      return maj_stat;
++}
++
++u32
++gss_get_mic_spkm3(struct gss_ctx      *ctx,
++                   u32                qop,
++                   struct xdr_buf     *message_buffer,
++                   struct xdr_netobj  *message_token) {
++      u32 err = 0;
++      struct spkm3_ctx *sctx = ctx->internal_ctx_id;
++
++      dprintk("RPC: gss_get_mic_spkm3\n");
++
++      err = spkm3_make_token(sctx, qop, message_buffer,
++                            message_token, SPKM_MIC_TOK);
++      return err;
++}
++
++static struct gss_api_ops gss_spkm3_ops = {
++      .gss_import_sec_context = gss_import_sec_context_spkm3,
++      .gss_get_mic            = gss_get_mic_spkm3,
++      .gss_verify_mic         = gss_verify_mic_spkm3,
++      .gss_delete_sec_context = gss_delete_sec_context_spkm3,
++};
++
++static struct pf_desc gss_spkm3_pfs[] = {
++      {RPC_AUTH_GSS_SPKM, 0, RPC_GSS_SVC_NONE, "spkm3"},
++      {RPC_AUTH_GSS_SPKMI, 0, RPC_GSS_SVC_INTEGRITY, "spkm3i"},
++};
++
++static struct gss_api_mech gss_spkm3_mech = {
++      .gm_name        = "spkm3",
++      .gm_owner       = THIS_MODULE,
++      .gm_ops         = &gss_spkm3_ops,
++      .gm_pf_num      = ARRAY_SIZE(gss_spkm3_pfs),
++      .gm_pfs         = gss_spkm3_pfs,
++};
++
++static int __init init_spkm3_module(void)
++{
++      int status;
++
++      status = gss_mech_register(&gss_spkm3_mech);
++      if (status)
++              printk("Failed to register spkm3 gss mechanism!\n");
++      return 0;
++}
++
++static void __exit cleanup_spkm3_module(void)
++{
++      gss_mech_unregister(&gss_spkm3_mech);
++}
++
++MODULE_LICENSE("GPL");
++module_init(init_spkm3_module);
++module_exit(cleanup_spkm3_module);
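The get_bytes()/get_netobj() helpers in the new gss_spkm3_mech.c above all follow the same bounded-cursor parsing discipline: compute the proposed end pointer, reject both overrun and pointer wrap-around, and only then copy and advance. A minimal user-space sketch of that pattern (names here are illustrative, not part of the patch):

/* Illustrative, user-space sketch of the bounded-cursor parse used by
 * get_bytes()/get_netobj() above; get_bytes_demo is a hypothetical name. */
#include <stdio.h>
#include <string.h>

static int get_bytes_demo(const char **ptr, const char *end, void *res, int len)
{
        const char *p = *ptr, *q = p + len;
        if (q > end || q < p)           /* reject overrun and wrap-around */
                return -1;
        memcpy(res, p, len);
        *ptr = q;                       /* advance the cursor only on success */
        return 0;
}

int main(void)
{
        unsigned char buf[8] = {1, 0, 0, 0, 'a', 'b', 'c', 'd'};
        const char *p = (const char *)buf, *end = p + sizeof(buf);
        int v;

        if (get_bytes_demo(&p, end, &v, sizeof(v)) == 0)
                printf("first field read, %ld bytes left\n", (long)(end - p));
        return 0;
}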
+--- linux-2.6.7/net/sunrpc/auth_gss/gss_krb5_crypto.c.lsec     2004-06-15 23:18:55.000000000 -0600
++++ linux-2.6.7/net/sunrpc/auth_gss/gss_krb5_crypto.c  2005-03-23 14:28:24.840273400 -0700
+@@ -139,17 +139,91 @@ buf_to_sg(struct scatterlist *sg, char *
+       sg->length = len;
+ }
++static int
++process_xdr_buf(struct xdr_buf *buf, int offset, int len,
++              int (*actor)(struct scatterlist *, void *), void *data)
++{
++      int i, page_len, thislen, page_offset, ret = 0;
++      struct scatterlist      sg[1];
++
++      if (offset >= buf->head[0].iov_len) {
++              offset -= buf->head[0].iov_len;
++      } else {
++              thislen = buf->head[0].iov_len - offset;
++              if (thislen > len)
++                      thislen = len;
++              buf_to_sg(sg, buf->head[0].iov_base + offset, thislen);
++              ret = actor(sg, data);
++              if (ret)
++                      goto out;
++              offset = 0;
++              len -= thislen;
++      }
++      if (len == 0)
++              goto out;
++
++      if (offset >= buf->page_len) {
++              offset -= buf->page_len;
++      } else {
++              page_len = buf->page_len - offset;
++              if (page_len > len)
++                      page_len = len;
++              len -= page_len;
++              page_offset = (offset + buf->page_base) & (PAGE_CACHE_SIZE - 1);
++              i = (offset + buf->page_base) >> PAGE_CACHE_SHIFT;
++              thislen = PAGE_CACHE_SIZE - page_offset;
++              do {
++                      if (thislen > page_len)
++                              thislen = page_len;
++                      sg->page = buf->pages[i];
++                      sg->offset = page_offset;
++                      sg->length = thislen;
++                      ret = actor(sg, data);
++                      if (ret)
++                              goto out;
++                      page_len -= thislen;
++                      i++;
++                      page_offset = 0;
++                      thislen = PAGE_CACHE_SIZE;
++              } while (page_len != 0);
++              offset = 0;
++      }
++      if (len == 0)
++              goto out;
++
++      if (offset < buf->tail[0].iov_len) {
++              thislen = buf->tail[0].iov_len - offset;
++              if (thislen > len)
++                      thislen = len;
++              buf_to_sg(sg, buf->tail[0].iov_base + offset, thislen);
++              ret = actor(sg, data);
++              len -= thislen;
++      }
++      if (len != 0)
++              ret = -EINVAL;
++out:
++      return ret;
++}
++
++static int
++checksummer(struct scatterlist *sg, void *data)
++{
++      struct crypto_tfm *tfm = (struct crypto_tfm *)data;
++
++      crypto_digest_update(tfm, sg, 1);
++
++      return 0;
++}
++
+ /* checksum the plaintext data and hdrlen bytes of the token header */
+ s32
+ make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body,
+-                 struct xdr_netobj *cksum)
++                 int body_offset, struct xdr_netobj *cksum)
+ {
+       char                            *cksumname;
+       struct crypto_tfm               *tfm = NULL; /* XXX add to ctx? */
+       struct scatterlist              sg[1];
+       u32                             code = GSS_S_FAILURE;
+-      int                             len, thislen, offset;
+-      int                             i;
+       switch (cksumtype) {
+               case CKSUMTYPE_RSA_MD5:
+@@ -169,35 +243,8 @@ make_checksum(s32 cksumtype, char *heade
+       crypto_digest_init(tfm);
+       buf_to_sg(sg, header, hdrlen);
+       crypto_digest_update(tfm, sg, 1);
+-      if (body->head[0].iov_len) {
+-              buf_to_sg(sg, body->head[0].iov_base, body->head[0].iov_len);
+-              crypto_digest_update(tfm, sg, 1);
+-      }
+-
+-      len = body->page_len;
+-      if (len != 0) {
+-              offset = body->page_base & (PAGE_CACHE_SIZE - 1);
+-              i = body->page_base >> PAGE_CACHE_SHIFT;
+-              thislen = PAGE_CACHE_SIZE - offset;
+-              do {
+-                      if (thislen > len)
+-                              thislen = len;
+-                      sg->page = body->pages[i];
+-                      sg->offset = offset;
+-                      sg->length = thislen;
+-                      kmap(sg->page); /* XXX kmap_atomic? */
+-                      crypto_digest_update(tfm, sg, 1);
+-                      kunmap(sg->page);
+-                      len -= thislen;
+-                      i++;
+-                      offset = 0;
+-                      thislen = PAGE_CACHE_SIZE;
+-              } while(len != 0);
+-      }
+-      if (body->tail[0].iov_len) {
+-              buf_to_sg(sg, body->tail[0].iov_base, body->tail[0].iov_len);
+-              crypto_digest_update(tfm, sg, 1);
+-      }
++      process_xdr_buf(body, body_offset, body->len - body_offset,
++                      checksummer, tfm);
+       crypto_digest_final(tfm, cksum->data);
+       code = 0;
+ out:
+@@ -207,3 +254,154 @@ out:
+ }
+ EXPORT_SYMBOL(make_checksum);
++
++struct encryptor_desc {
++      u8 iv[8]; /* XXX hard-coded blocksize */
++      struct crypto_tfm *tfm;
++      int pos;
++      struct xdr_buf *outbuf;
++      struct page **pages;
++      struct scatterlist infrags[4];
++      struct scatterlist outfrags[4];
++      int fragno;
++      int fraglen;
++};
++
++static int
++encryptor(struct scatterlist *sg, void *data)
++{
++      struct encryptor_desc *desc = data;
++      struct xdr_buf *outbuf = desc->outbuf;
++      struct page *in_page;
++      int thislen = desc->fraglen + sg->length;
++      int fraglen, ret;
++      int page_pos;
++
++      /* Worst case is 4 fragments: head, end of page 1, start
++       * of page 2, tail.  Anything more is a bug. */
++      BUG_ON(desc->fragno > 3);
++      desc->infrags[desc->fragno] = *sg;
++      desc->outfrags[desc->fragno] = *sg;
++
++      page_pos = desc->pos - outbuf->head[0].iov_len;
++      if (page_pos >= 0 && page_pos < outbuf->page_len) {
++              /* pages are not in place: */
++              int i = (page_pos + outbuf->page_base) >> PAGE_CACHE_SHIFT;
++              in_page = desc->pages[i];
++      } else {
++              in_page = sg->page;
++      }
++      desc->infrags[desc->fragno].page = in_page;
++      desc->fragno++;
++      desc->fraglen += sg->length;
++      desc->pos += sg->length;
++
++      fraglen = thislen & 7; /* XXX hardcoded blocksize */
++      thislen -= fraglen;
++
++      if (thislen == 0)
++              return 0;
++
++      ret = crypto_cipher_encrypt_iv(desc->tfm, desc->outfrags, desc->infrags,
++                                      thislen, desc->iv);
++      if (ret)
++              return ret;
++      if (fraglen) {
++              desc->outfrags[0].page = sg->page;
++              desc->outfrags[0].offset = sg->offset + sg->length - fraglen;
++              desc->outfrags[0].length = fraglen;
++              desc->infrags[0] = desc->outfrags[0];
++              desc->infrags[0].page = in_page;
++              desc->fragno = 1;
++              desc->fraglen = fraglen;
++      } else {
++              desc->fragno = 0;
++              desc->fraglen = 0;
++      }
++      return 0;
++}
++
++int
++gss_encrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *buf, int offset,
++              struct page **pages)
++{
++      int ret;
++      struct encryptor_desc desc;
++
++      BUG_ON((buf->len - offset) % crypto_tfm_alg_blocksize(tfm) != 0);
++
++      memset(desc.iv, 0, sizeof(desc.iv));
++      desc.tfm = tfm;
++      desc.pos = offset;
++      desc.outbuf = buf;
++      desc.pages = pages;
++      desc.fragno = 0;
++      desc.fraglen = 0;
++
++      ret = process_xdr_buf(buf, offset, buf->len - offset, encryptor, &desc);
++      return ret;
++}
++
++EXPORT_SYMBOL(gss_encrypt_xdr_buf);
++
++struct decryptor_desc {
++      u8 iv[8]; /* XXX hard-coded blocksize */
++      struct crypto_tfm *tfm;
++      struct scatterlist frags[4];
++      int fragno;
++      int fraglen;
++};
++
++static int
++decryptor(struct scatterlist *sg, void *data)
++{
++      struct decryptor_desc *desc = data;
++      int thislen = desc->fraglen + sg->length;
++      int fraglen, ret;
++
++      /* Worst case is 4 fragments: head, end of page 1, start
++       * of page 2, tail.  Anything more is a bug. */
++      BUG_ON(desc->fragno > 3);
++      desc->frags[desc->fragno] = *sg;
++      desc->fragno++;
++      desc->fraglen += sg->length;
++
++      fraglen = thislen & 7; /* XXX hardcoded blocksize */
++      thislen -= fraglen;
++
++      if (thislen == 0)
++              return 0;
++
++      ret = crypto_cipher_decrypt_iv(desc->tfm, desc->frags, desc->frags,
++                                      thislen, desc->iv);
++      if (ret)
++              return ret;
++      if (fraglen) {
++              desc->frags[0].page = sg->page;
++              desc->frags[0].offset = sg->offset + sg->length - fraglen;
++              desc->frags[0].length = fraglen;
++              desc->fragno = 1;
++              desc->fraglen = fraglen;
++      } else {
++              desc->fragno = 0;
++              desc->fraglen = 0;
++      }
++      return 0;
++}
++
++int
++gss_decrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *buf, int offset)
++{
++      struct decryptor_desc desc;
++
++      /* XXXJBF: */
++      BUG_ON((buf->len - offset) % crypto_tfm_alg_blocksize(tfm) != 0);
++
++      memset(desc.iv, 0, sizeof(desc.iv));
++      desc.tfm = tfm;
++      desc.fragno = 0;
++      desc.fraglen = 0;
++      return process_xdr_buf(buf, offset, buf->len - offset, decryptor, &desc);
++}
++
++EXPORT_SYMBOL(gss_decrypt_xdr_buf);
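The central addition to gss_krb5_crypto.c above is process_xdr_buf(), which walks an xdr_buf's head iovec, page vector, and tail iovec in order and hands each contiguous piece to an actor callback (checksummer, encryptor, or decryptor). A simplified user-space sketch of the same walk-and-callback idea, assuming a toy two-segment buffer with no page vector:

/* Hypothetical, user-space sketch of the process_xdr_buf() idea: skip
 * 'offset' bytes, then feed up to 'len' bytes to the actor segment by
 * segment. Not the kernel interface; only the control flow is mirrored. */
#include <stdio.h>
#include <string.h>

struct seg { const char *base; size_t len; };
struct demo_buf { struct seg head, tail; };

typedef int (*actor_t)(const char *data, size_t len, void *priv);

static int walk_buf(const struct demo_buf *b, size_t offset, size_t len,
                    actor_t actor, void *priv)
{
        const struct seg *segs[2] = { &b->head, &b->tail };
        for (int i = 0; i < 2 && len; i++) {
                if (offset >= segs[i]->len) {       /* skip this whole segment */
                        offset -= segs[i]->len;
                        continue;
                }
                size_t thislen = segs[i]->len - offset;
                if (thislen > len)
                        thislen = len;
                int ret = actor(segs[i]->base + offset, thislen, priv);
                if (ret)
                        return ret;
                offset = 0;
                len -= thislen;
        }
        return len ? -1 : 0;                        /* -1: ran out of data */
}

static int count_bytes(const char *data, size_t len, void *priv)
{
        (void)data;
        *(size_t *)priv += len;
        return 0;
}

int main(void)
{
        struct demo_buf b = { { "hello ", 6 }, { "world", 5 } };
        size_t n = 0;
        if (walk_buf(&b, 2, 7, count_bytes, &n) == 0)
                printf("visited %zu bytes\n", n);   /* prints 7 */
        return 0;
}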
+--- linux-2.6.7/net/sunrpc/auth_gss/gss_spkm3_seal.c.lsec      2005-03-23 14:28:24.239364752 -0700
++++ linux-2.6.7/net/sunrpc/auth_gss/gss_spkm3_seal.c   2005-03-23 14:28:24.238364904 -0700
+@@ -0,0 +1,132 @@
++/*
++ *  linux/net/sunrpc/gss_spkm3_seal.c
++ *
++ *  Copyright (c) 2003 The Regents of the University of Michigan.
++ *  All rights reserved.
++ *
++ *  Andy Adamson <andros@umich.edu>
++ *
++ *  Redistribution and use in source and binary forms, with or without
++ *  modification, are permitted provided that the following conditions
++ *  are met:
++ *
++ *  1. Redistributions of source code must retain the above copyright
++ *     notice, this list of conditions and the following disclaimer.
++ *  2. Redistributions in binary form must reproduce the above copyright
++ *     notice, this list of conditions and the following disclaimer in the
++ *     documentation and/or other materials provided with the distribution.
++ *  3. Neither the name of the University nor the names of its
++ *     contributors may be used to endorse or promote products derived
++ *     from this software without specific prior written permission.
++ *
++ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ *
++ */
++
++#include <linux/types.h>
++#include <linux/slab.h>
++#include <linux/jiffies.h>
++#include <linux/sunrpc/gss_spkm3.h>
++#include <linux/random.h>
++#include <linux/crypto.h>
++
++#ifdef RPC_DEBUG
++# define RPCDBG_FACILITY        RPCDBG_AUTH
++#endif
++
++/*
++ * spkm3_make_token()
++ *
++ * Only SPKM_MIC_TOK with md5 intg-alg is supported
++ */
++
++u32
++spkm3_make_token(struct spkm3_ctx *ctx, int qop_req,
++                 struct xdr_buf * text, struct xdr_netobj * token,
++                 int toktype)
++{
++      s32                     checksum_type;
++      char                    tokhdrbuf[25];
++      struct xdr_netobj       md5cksum = {.len = 0, .data = NULL};
++      struct xdr_netobj       mic_hdr = {.len = 0, .data = tokhdrbuf};
++      int                     tmsglen, tokenlen = 0;
++      unsigned char           *ptr;
++      s32                     now;
++      int                     ctxelen = 0, ctxzbit = 0;
++      int                     md5elen = 0, md5zbit = 0;
++
++      dprintk("RPC: spkm3_make_token\n");
++
++      now = jiffies;
++      if (qop_req != 0)
++              goto out_err;
++
++      if (ctx->ctx_id.len != 16) {
++              dprintk("RPC: spkm3_make_token BAD ctx_id.len %d\n",
++                      ctx->ctx_id.len);
++              goto out_err;
++      }
++
++      switch (ctx->intg_alg) {
++              case NID_md5:
++                      checksum_type = CKSUMTYPE_RSA_MD5;
++                      break;
++              default:
++                      dprintk("RPC: gss_spkm3_seal: ctx->signalg %d not"
++                              " supported\n", ctx->intg_alg);
++                      goto out_err;
++      }
++      /* XXX since we don't support WRAP, perhaps we don't care... */
++      if (ctx->conf_alg != NID_cast5_cbc) {
++              dprintk("RPC: gss_spkm3_seal: ctx->sealalg %d not supported\n",
++                      ctx->conf_alg);
++              goto out_err;
++      }
++
++      if (toktype == SPKM_MIC_TOK) {
++              tmsglen = 0;
++              /* Calculate checksum over the mic-header */
++              asn1_bitstring_len(&ctx->ctx_id, &ctxelen, &ctxzbit);
++              spkm3_mic_header(&mic_hdr.data, &mic_hdr.len, ctx->ctx_id.data,
++                                       ctxelen, ctxzbit);
++
++              if (make_checksum(checksum_type, mic_hdr.data, mic_hdr.len,
++                                           text, &md5cksum))
++                      goto out_err;
++
++              asn1_bitstring_len(&md5cksum, &md5elen, &md5zbit);
++              tokenlen = 10 + ctxelen + 1 + 2 + md5elen + 1;
++
++              /* Create token header using generic routines */
++              token->len = g_token_size(&ctx->mech_used, tokenlen + tmsglen);
++
++              ptr = token->data;
++              g_make_token_header(&ctx->mech_used, tokenlen + tmsglen, &ptr);
++
++              spkm3_make_mic_token(&ptr, tokenlen, &mic_hdr, &md5cksum, md5elen, md5zbit);
++      } else if (toktype == SPKM_WRAP_TOK) { /* Not Supported */
++              dprintk("RPC: gss_spkm3_seal: SPKM_WRAP_TOK not supported\n");
++              goto out_err;
++      }
++      kfree(md5cksum.data);
++
++      /* XXX need to implement sequence numbers, and ctx->expired */
++
++      return  GSS_S_COMPLETE;
++out_err:
++      if (md5cksum.data)
++              kfree(md5cksum.data);
++      token->data = 0;
++      token->len = 0;
++      return GSS_S_FAILURE;
++}
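spkm3_make_token() above digests the fixed MIC header first and only then the message body (via make_checksum()); binding the header and payload into one checksum is the point of that ordering. A toy user-space illustration of the header-then-body digest pattern; the additive checksum here is a stand-in for MD5, not the real algorithm:

/* Toy sketch only: incremental "digest" over header then body, mirroring
 * the make_checksum(header, hdrlen, body, ...) call order used above. */
#include <stdio.h>
#include <string.h>

struct toy_digest { unsigned int sum; };

static void toy_update(struct toy_digest *d, const unsigned char *p, size_t n)
{
        while (n--)
                d->sum += *p++;
}

int main(void)
{
        const unsigned char hdr[8] = "SPKMHDR";
        const unsigned char body[] = "payload bytes";
        struct toy_digest d = { 0 };

        toy_update(&d, hdr, sizeof(hdr));           /* token header first */
        toy_update(&d, body, sizeof(body) - 1);     /* then the message body */
        printf("toy checksum: %u\n", d.sum);
        return 0;
}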
+--- linux-2.6.7/net/sunrpc/auth_gss/svcauth_gss.c.lsec 2004-06-15 23:19:22.000000000 -0600
++++ linux-2.6.7/net/sunrpc/auth_gss/svcauth_gss.c      2005-03-23 14:28:24.405339520 -0700
+@@ -37,6 +37,7 @@
+  *
+  */
++#include <asm/bitops.h>
+ #include <linux/types.h>
+ #include <linux/module.h>
+ #include <linux/pagemap.h>
+@@ -78,7 +79,6 @@ struct rsi {
+ static struct cache_head *rsi_table[RSI_HASHMAX];
+ static struct cache_detail rsi_cache;
+-static struct rsi *rsi_lookup(struct rsi *item, int set);
+ static void rsi_free(struct rsi *rsii)
+ {
+@@ -125,38 +125,6 @@ static inline int dup_netobj(struct xdr_
+       return dup_to_netobj(dst, src->data, src->len);
+ }
+-static inline void rsi_init(struct rsi *new, struct rsi *item)
+-{
+-      new->out_handle.data = NULL;
+-      new->out_handle.len = 0;
+-      new->out_token.data = NULL;
+-      new->out_token.len = 0;
+-      new->in_handle.len = item->in_handle.len;
+-      item->in_handle.len = 0;
+-      new->in_token.len = item->in_token.len;
+-      item->in_token.len = 0;
+-      new->in_handle.data = item->in_handle.data;
+-      item->in_handle.data = NULL;
+-      new->in_token.data = item->in_token.data;
+-      item->in_token.data = NULL;
+-}
+-
+-static inline void rsi_update(struct rsi *new, struct rsi *item)
+-{
+-      BUG_ON(new->out_handle.data || new->out_token.data);
+-      new->out_handle.len = item->out_handle.len;
+-      item->out_handle.len = 0;
+-      new->out_token.len = item->out_token.len;
+-      item->out_token.len = 0;
+-      new->out_handle.data = item->out_handle.data;
+-      item->out_handle.data = NULL;
+-      new->out_token.data = item->out_token.data;
+-      item->out_token.data = NULL;
+-
+-      new->major_status = item->major_status;
+-      new->minor_status = item->minor_status;
+-}
+-
+ static void rsi_request(struct cache_detail *cd,
+                        struct cache_head *h,
+                        char **bpp, int *blen)
+@@ -168,6 +136,75 @@ static void rsi_request(struct cache_det
+       (*bpp)[-1] = '\n';
+ }
++static inline int
++gssd_reply(struct rsi *item)
++{
++      struct rsi *tmp;
++      struct cache_head **hp, **head;
++
++      head = &rsi_cache.hash_table[rsi_hash(item)];
++      write_lock(&rsi_cache.hash_lock);
++      for (hp = head; *hp != NULL; hp = &tmp->h.next) {
++              tmp = container_of(*hp, struct rsi, h);
++              if (rsi_match(tmp, item)) {
++                      cache_get(&tmp->h);
++                      clear_bit(CACHE_HASHED, &tmp->h.flags);
++                      *hp = tmp->h.next;
++                      tmp->h.next = NULL;
++                      rsi_cache.entries--;
++                      if (test_bit(CACHE_VALID, &tmp->h.flags)) {
++                              write_unlock(&rsi_cache.hash_lock);
++                              rsi_put(&tmp->h, &rsi_cache);
++                              return -EINVAL;
++                      }
++                      set_bit(CACHE_HASHED, &item->h.flags);
++                      item->h.next = *hp;
++                      *hp = &item->h;
++                      rsi_cache.entries++;
++                      set_bit(CACHE_VALID, &item->h.flags);
++                      item->h.last_refresh = get_seconds();
++                      write_unlock(&rsi_cache.hash_lock);
++                      cache_fresh(&rsi_cache, &tmp->h, 0);
++                      rsi_put(&tmp->h, &rsi_cache);
++                      return 0;
++              }
++      }
++      write_unlock(&rsi_cache.hash_lock);
++      return -EINVAL;
++}
++
++static inline struct rsi *
++gssd_upcall(struct rsi *item, struct svc_rqst *rqstp)
++{
++      struct rsi *tmp;
++      struct cache_head **hp, **head;
++
++      head = &rsi_cache.hash_table[rsi_hash(item)];
++      read_lock(&rsi_cache.hash_lock);
++      for (hp = head; *hp != NULL; hp = &tmp->h.next) {
++              tmp = container_of(*hp, struct rsi, h);
++              if (rsi_match(tmp, item)) {
++                      if (!test_bit(CACHE_VALID, &tmp->h.flags)) {
++                              read_unlock(&rsi_cache.hash_lock);
++                              return NULL;
++                      }
++                      *hp = tmp->h.next;
++                      tmp->h.next = NULL;
++                      rsi_cache.entries--;
++                      read_unlock(&rsi_cache.hash_lock);
++                      return tmp;
++              }
++      }
++      cache_get(&item->h);
++      item->h.next = *head;
++      *head = &item->h;
++      rsi_cache.entries++;
++      read_unlock(&rsi_cache.hash_lock);
++      cache_get(&item->h);
++      if (cache_check(&rsi_cache, &item->h, &rqstp->rq_chandle))
++              return NULL;
++      return item;
++}
+ static int rsi_parse(struct cache_detail *cd,
+                     char *mesg, int mlen)
+@@ -176,17 +213,22 @@ static int rsi_parse(struct cache_detail
+       char *buf = mesg;
+       char *ep;
+       int len;
+-      struct rsi rsii, *rsip = NULL;
++      struct rsi *rsii;
+       time_t expiry;
+       int status = -EINVAL;
+-      memset(&rsii, 0, sizeof(rsii));
++      rsii = kmalloc(sizeof(*rsii), GFP_KERNEL);
++      if (!rsii)
++              return -ENOMEM;
++      memset(rsii, 0, sizeof(*rsii));
++      cache_init(&rsii->h);
++
+       /* handle */
+       len = qword_get(&mesg, buf, mlen);
+       if (len < 0)
+               goto out;
+       status = -ENOMEM;
+-      if (dup_to_netobj(&rsii.in_handle, buf, len))
++      if (dup_to_netobj(&rsii->in_handle, buf, len))
+               goto out;
+       /* token */
+@@ -195,10 +237,9 @@ static int rsi_parse(struct cache_detail
+       if (len < 0)
+               goto out;
+       status = -ENOMEM;
+-      if (dup_to_netobj(&rsii.in_token, buf, len))
++      if (dup_to_netobj(&rsii->in_token, buf, len))
+               goto out;
+-      rsii.h.flags = 0;
+       /* expiry */
+       expiry = get_expiry(&mesg);
+       status = -EINVAL;
+@@ -212,13 +253,13 @@ static int rsi_parse(struct cache_detail
+       if (len == 0) {
+               goto out;
+       } else {
+-              rsii.major_status = simple_strtoul(buf, &ep, 10);
++              rsii->major_status = simple_strtoul(buf, &ep, 10);
+               if (*ep)
+                       goto out;
+               len = qword_get(&mesg, buf, mlen);
+               if (len <= 0)
+                       goto out;
+-              rsii.minor_status = simple_strtoul(buf, &ep, 10);
++              rsii->minor_status = simple_strtoul(buf, &ep, 10);
+               if (*ep)
+                       goto out;
+@@ -227,7 +268,7 @@ static int rsi_parse(struct cache_detail
+               if (len < 0)
+                       goto out;
+               status = -ENOMEM;
+-              if (dup_to_netobj(&rsii.out_handle, buf, len))
++              if (dup_to_netobj(&rsii->out_handle, buf, len))
+                       goto out;
+               /* out_token */
+@@ -236,16 +277,14 @@ static int rsi_parse(struct cache_detail
+               if (len < 0)
+                       goto out;
+               status = -ENOMEM;
+-              if (dup_to_netobj(&rsii.out_token, buf, len))
++              if (dup_to_netobj(&rsii->out_token, buf, len))
+                       goto out;
+       }
+-      rsii.h.expiry_time = expiry;
+-      rsip = rsi_lookup(&rsii, 1);
+-      status = 0;
++      rsii->h.expiry_time = expiry;
++      status = gssd_reply(rsii);
+ out:
+-      rsi_free(&rsii);
+-      if (rsip)
+-              rsi_put(&rsip->h, &rsi_cache);
++      if (rsii)
++              rsi_put(&rsii->h, &rsi_cache);
+       return status;
+ }
+@@ -258,8 +297,6 @@ static struct cache_detail rsi_cache = {
+       .cache_parse    = rsi_parse,
+ };
+-static DefineSimpleCacheLookup(rsi, 0)
+-
+ /*
+  * The rpcsec_context cache is used to store a context that is
+  * used in data exchange.
+@@ -292,7 +329,6 @@ struct rsc {
+ static struct cache_head *rsc_table[RSC_HASHMAX];
+ static struct cache_detail rsc_cache;
+-static struct rsc *rsc_lookup(struct rsc *item, int set);
+ static void rsc_free(struct rsc *rsci)
+ {
+@@ -325,26 +361,44 @@ rsc_match(struct rsc *new, struct rsc *t
+       return netobj_equal(&new->handle, &tmp->handle);
+ }
+-static inline void
+-rsc_init(struct rsc *new, struct rsc *tmp)
++static struct rsc *rsc_lookup(struct rsc *item, int set)
+ {
+-      new->handle.len = tmp->handle.len;
+-      tmp->handle.len = 0;
+-      new->handle.data = tmp->handle.data;
+-      tmp->handle.data = NULL;
+-      new->mechctx = NULL;
+-      new->cred.cr_group_info = NULL;
+-}
+-
+-static inline void
+-rsc_update(struct rsc *new, struct rsc *tmp)
+-{
+-      new->mechctx = tmp->mechctx;
+-      tmp->mechctx = NULL;
+-      memset(&new->seqdata, 0, sizeof(new->seqdata));
+-      spin_lock_init(&new->seqdata.sd_lock);
+-      new->cred = tmp->cred;
+-      tmp->cred.cr_group_info = NULL;
++      struct rsc *tmp = NULL;
++      struct cache_head **hp, **head;
++      head = &rsc_cache.hash_table[rsc_hash(item)];
++
++      if (set)
++              write_lock(&rsc_cache.hash_lock);
++      else
++              read_lock(&rsc_cache.hash_lock);
++      for (hp = head; *hp != NULL; hp = &tmp->h.next) {
++              tmp = container_of(*hp, struct rsc, h);
++              if (!rsc_match(tmp, item))
++                      continue;
++              cache_get(&tmp->h);
++              if (!set)
++                      goto out_noset;
++              *hp = tmp->h.next;
++              tmp->h.next = NULL;
++              clear_bit(CACHE_HASHED, &tmp->h.flags);
++              rsc_put(&tmp->h, &rsc_cache);
++              goto out_set;
++      }
++      /* Didn't find anything */
++      if (!set)
++              goto out_noset;
++      rsc_cache.entries++;
++out_set:
++      set_bit(CACHE_HASHED, &item->h.flags);
++      item->h.next = *head;
++      *head = &item->h;
++      write_unlock(&rsc_cache.hash_lock);
++      cache_fresh(&rsc_cache, &item->h, item->h.expiry_time);
++      cache_get(&item->h);
++      return item;
++out_noset:
++      read_unlock(&rsc_cache.hash_lock);
++      return tmp;
+ }
+ static int rsc_parse(struct cache_detail *cd,
+@@ -353,19 +407,22 @@ static int rsc_parse(struct cache_detail
+       /* contexthandle expiry [ uid gid N <n gids> mechname ...mechdata... ] */
+       char *buf = mesg;
+       int len, rv;
+-      struct rsc rsci, *rscp = NULL;
++      struct rsc *rsci, *res = NULL;
+       time_t expiry;
+       int status = -EINVAL;
+-      memset(&rsci, 0, sizeof(rsci));
++      rsci = kmalloc(sizeof(*rsci), GFP_KERNEL);
++      if (!rsci)
++              return -ENOMEM;
++      memset(rsci, 0, sizeof(*rsci));
++      cache_init(&rsci->h);
+       /* context handle */
+       len = qword_get(&mesg, buf, mlen);
+       if (len < 0) goto out;
+       status = -ENOMEM;
+-      if (dup_to_netobj(&rsci.handle, buf, len))
++      if (dup_to_netobj(&rsci->handle, buf, len))
+               goto out;
+-      rsci.h.flags = 0;
+       /* expiry */
+       expiry = get_expiry(&mesg);
+       status = -EINVAL;
+@@ -373,26 +430,26 @@ static int rsc_parse(struct cache_detail
+               goto out;
+       /* uid, or NEGATIVE */
+-      rv = get_int(&mesg, &rsci.cred.cr_uid);
++      rv = get_int(&mesg, &rsci->cred.cr_uid);
+       if (rv == -EINVAL)
+               goto out;
+       if (rv == -ENOENT)
+-              set_bit(CACHE_NEGATIVE, &rsci.h.flags);
++              set_bit(CACHE_NEGATIVE, &rsci->h.flags);
+       else {
+               int N, i;
+               struct gss_api_mech *gm;
+               struct xdr_netobj tmp_buf;
+               /* gid */
+-              if (get_int(&mesg, &rsci.cred.cr_gid))
++              if (get_int(&mesg, &rsci->cred.cr_gid))
+                       goto out;
+               /* number of additional gid's */
+               if (get_int(&mesg, &N))
+                       goto out;
+               status = -ENOMEM;
+-              rsci.cred.cr_group_info = groups_alloc(N);
+-              if (rsci.cred.cr_group_info == NULL)
++              rsci->cred.cr_group_info = groups_alloc(N);
++              if (rsci->cred.cr_group_info == NULL)
+                       goto out;
+               /* gid's */
+@@ -401,7 +458,7 @@ static int rsc_parse(struct cache_detail
+                       gid_t gid;
+                       if (get_int(&mesg, &gid))
+                               goto out;
+-                      GROUP_AT(rsci.cred.cr_group_info, i) = gid;
++                      GROUP_AT(rsci->cred.cr_group_info, i) = gid;
+               }
+               /* mech name */
+@@ -422,19 +479,21 @@ static int rsc_parse(struct cache_detail
+               }
+               tmp_buf.len = len;
+               tmp_buf.data = buf;
+-              if (gss_import_sec_context(&tmp_buf, gm, &rsci.mechctx)) {
++              if (gss_import_sec_context(&tmp_buf, gm, &rsci->mechctx)) {
+                       gss_mech_put(gm);
+                       goto out;
+               }
+               gss_mech_put(gm);
+       }
+-      rsci.h.expiry_time = expiry;
+-      rscp = rsc_lookup(&rsci, 1);
++      rsci->h.expiry_time = expiry;
++      spin_lock_init(&rsci->seqdata.sd_lock);
++      res = rsc_lookup(rsci, 1);
++      rsc_put(&res->h, &rsc_cache);
++      rsci = NULL;
+       status = 0;
+ out:
+-      rsc_free(&rsci);
+-      if (rscp)
+-              rsc_put(&rscp->h, &rsc_cache);
++      if (rsci)
++              rsc_put(&rsci->h, &rsc_cache);
+       return status;
+ }
+@@ -446,19 +505,14 @@ static struct cache_detail rsc_cache = {
+       .cache_parse    = rsc_parse,
+ };
+-static DefineSimpleCacheLookup(rsc, 0);
+-
+ struct rsc *
+ gss_svc_searchbyctx(struct xdr_netobj *handle)
+ {
+       struct rsc rsci;
+       struct rsc *found;
+-      memset(&rsci, 0, sizeof(rsci));
+-      if (dup_to_netobj(&rsci.handle, handle->data, handle->len))
+-              return NULL;
++      rsci.handle = *handle;
+       found = rsc_lookup(&rsci, 0);
+-      rsc_free(&rsci);
+       if (!found)
+               return NULL;
+       if (cache_check(&rsc_cache, &found->h, NULL))
+@@ -643,7 +697,6 @@ svcauth_gss_register_pseudoflavor(u32 ps
+       if (!new)
+               goto out;
+       cache_init(&new->h.h);
+-      atomic_inc(&new->h.h.refcnt);
+       new->h.name = kmalloc(strlen(name) + 1, GFP_KERNEL);
+       if (!new->h.name)
+               goto out_free_dom;
+@@ -651,7 +704,6 @@ svcauth_gss_register_pseudoflavor(u32 ps
+       new->h.flavour = RPC_AUTH_GSS;
+       new->pseudoflavor = pseudoflavor;
+       new->h.h.expiry_time = NEVER;
+-      new->h.h.flags = 0;
+       test = auth_domain_lookup(&new->h, 1);
+       if (test == &new->h) {
+@@ -723,6 +775,45 @@ out:
+       return stat;
+ }
++static int
++unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx)
++{
++      int stat = -EINVAL;
++      int out_offset;
++      u32 * lenp;
++      u32 priv_len, maj_stat;
++      int saved_len;
++
++      lenp = buf->head[0].iov_base;
++      priv_len = ntohl(svc_getu32(&buf->head[0]));
++      if (priv_len > buf->len) /* XXXJBF: wrong check */
++              goto out;
++      /* XXXJBF: bizarre hack: to handle revisits (and not decrypt
++       * twice), the first time through we write an offset
++       * telling us where to skip to find the already-decrypted data */
++      if (rqstp->rq_deferred) {
++              buf->head[0].iov_base += priv_len;
++              buf->head[0].iov_len -= priv_len;
++              return 0;
++      }
++      saved_len = buf->len; /* XXX HACK */
++      buf->len = priv_len;
++      maj_stat = gss_unwrap(ctx, GSS_C_QOP_DEFAULT, 0, buf, &out_offset);
++      buf->len = saved_len;
++      buf->head[0].iov_base += out_offset;
++      buf->head[0].iov_len -= out_offset;
++      BUG_ON(buf->head[0].iov_len <= 0);
++      if (maj_stat != GSS_S_COMPLETE)
++              goto out;
++      if (ntohl(svc_getu32(&buf->head[0])) != seq)
++              goto out;
++      /* XXXJBF: see "bizarre hack", above. */
++      *lenp = htonl(out_offset + 4);
++      stat = 0;
++out:
++      return stat;
++}
++
+ struct gss_svc_data {
+       /* decoded gss client cred: */
+       struct rpc_gss_wire_cred        clcred;
+@@ -750,7 +841,7 @@ svcauth_gss_accept(struct svc_rqst *rqst
+       struct gss_svc_data *svcdata = rqstp->rq_auth_data;
+       struct rpc_gss_wire_cred *gc;
+       struct rsc      *rsci = NULL;
+-      struct rsi      *rsip, rsikey;
++      struct rsi      *rsip, *rsikey = NULL;
+       u32             *rpcstart;
+       u32             *reject_stat = resv->iov_base + resv->iov_len;
+       int             ret;
+@@ -843,30 +934,23 @@ svcauth_gss_accept(struct svc_rqst *rqst
+               *authp = rpc_autherr_badcred;
+               if (gc->gc_proc == RPC_GSS_PROC_INIT && gc->gc_ctx.len != 0)
+                       goto auth_err;
+-              memset(&rsikey, 0, sizeof(rsikey));
+-              if (dup_netobj(&rsikey.in_handle, &gc->gc_ctx))
++              rsikey = kmalloc(sizeof(*rsikey), GFP_KERNEL);
++              if (!rsikey)
++                      goto drop;
++              memset(rsikey, 0, sizeof(*rsikey));
++              cache_init(&rsikey->h);
++              if (dup_netobj(&rsikey->in_handle, &gc->gc_ctx))
+                       goto drop;
+               *authp = rpc_autherr_badverf;
+-              if (svc_safe_getnetobj(argv, &tmpobj)) {
+-                      kfree(rsikey.in_handle.data);
++              if (svc_safe_getnetobj(argv, &tmpobj))
+                       goto auth_err;
+-              }
+-              if (dup_netobj(&rsikey.in_token, &tmpobj)) {
+-                      kfree(rsikey.in_handle.data);
++              if (dup_netobj(&rsikey->in_token, &tmpobj))
+                       goto drop;
+-              }
+-              rsip = rsi_lookup(&rsikey, 0);
+-              rsi_free(&rsikey);
+-              if (!rsip) {
+-                      goto drop;
+-              }
+-              switch(cache_check(&rsi_cache, &rsip->h, &rqstp->rq_chandle)) {
+-              case -EAGAIN:
++              rsip = gssd_upcall(rsikey, rqstp);
++              if (!rsip)
+                       goto drop;
+-              case -ENOENT:
+-                      goto drop;
+-              case 0:
++              else {
+                       rsci = gss_svc_searchbyctx(&rsip->out_handle);
+                       if (!rsci) {
+                               goto drop;
+@@ -921,7 +1005,16 @@ svcauth_gss_accept(struct svc_rqst *rqst
+                       svc_putu32(resv, 0);
+                       break;
+               case RPC_GSS_SVC_PRIVACY:
+-                      /* currently unsupported */
++                      if (unwrap_priv_data(rqstp, &rqstp->rq_arg,
++                                      gc->gc_seq, rsci->mechctx))
++                              goto auth_err;
++                      svcdata->rsci = rsci;
++                      cache_get(&rsci->h);
++                      /* placeholders for length and seq. number: */
++                      svcdata->body_start = resv->iov_base + resv->iov_len;
++                      svc_putu32(resv, 0);
++                      svc_putu32(resv, 0);
++                      break;
+               default:
+                       goto auth_err;
+               }
+@@ -939,13 +1032,15 @@ complete:
+ drop:
+       ret = SVC_DROP;
+ out:
++      if (rsikey)
++              rsi_put(&rsikey->h, &rsi_cache);
+       if (rsci)
+               rsc_put(&rsci->h, &rsc_cache);
+       return ret;
+ }
+-static int
+-svcauth_gss_release(struct svc_rqst *rqstp)
++static inline int
++svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp)
+ {
+       struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data;
+       struct rpc_gss_wire_cred *gc = &gsd->clcred;
+@@ -957,6 +1052,156 @@ svcauth_gss_release(struct svc_rqst *rqs
+       int integ_offset, integ_len;
+       int stat = -EINVAL;
++      p = gsd->body_start;
++      gsd->body_start = 0;
++      /* move accept_stat to right place: */
++      memcpy(p, p + 2, 4);
++      /* Don't wrap in failure case: */
++      /* Counting on not getting here if call was not even accepted! */
++      if (*p != rpc_success) {
++              resbuf->head[0].iov_len -= 2 * 4;
++              goto out;
++      }
++      p++;
++      integ_offset = (u8 *)(p + 1) - (u8 *)resbuf->head[0].iov_base;
++      integ_len = resbuf->len - integ_offset;
++      BUG_ON(integ_len % 4);
++      *p++ = htonl(integ_len);
++      *p++ = htonl(gc->gc_seq);
++      if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset,
++                              integ_len))
++              BUG();
++      if (resbuf->page_len == 0
++                      && resbuf->tail[0].iov_len + RPC_MAX_AUTH_SIZE
++                      < PAGE_SIZE) {
++              BUG_ON(resbuf->tail[0].iov_len);
++              /* Use head for everything */
++              resv = &resbuf->head[0];
++      } else if (resbuf->tail[0].iov_base == NULL) {
++              /* copied from nfsd4_encode_read */
++              svc_take_page(rqstp);
++              resbuf->tail[0].iov_base = page_address(rqstp
++                              ->rq_respages[rqstp->rq_resused-1]);
++              rqstp->rq_restailpage = rqstp->rq_resused-1;
++              resbuf->tail[0].iov_len = 0;
++              resv = &resbuf->tail[0];
++      } else {
++              resv = &resbuf->tail[0];
++      }
++      mic.data = (u8 *)resv->iov_base + resv->iov_len + 4;
++      if (gss_get_mic(gsd->rsci->mechctx, 0, &integ_buf, &mic))
++              goto out_err;
++      svc_putu32(resv, htonl(mic.len));
++      memset(mic.data + mic.len, 0,
++                      round_up_to_quad(mic.len) - mic.len);
++      resv->iov_len += XDR_QUADLEN(mic.len) << 2;
++      /* not strictly required: */
++      resbuf->len += XDR_QUADLEN(mic.len) << 2;
++      BUG_ON(resv->iov_len > PAGE_SIZE);
++out:
++      stat = 0;
++out_err:
++      return stat;
++}
++
++/* XXXJBF: Look for chances to share code with client */
++/* XXXJBF: Do we need to preallocate these pages somehow?  E.g. see
++ * buffer size calculations in svcsock.c */
++/* XXXJBF: how does reference counting on pages work? */
++static struct page **
++svc_alloc_enc_pages(struct xdr_buf *buf)
++{
++      struct page **ret;
++      int last, i;
++
++      if (buf->page_len == 0)
++              return NULL;
++      BUG_ON(buf->page_base >> PAGE_CACHE_SHIFT);
++      last = (buf->page_base + buf->page_len - 1) >> PAGE_CACHE_SHIFT;
++      ret = kmalloc((last + 1) * sizeof(struct page *), GFP_KERNEL);
++      if (!ret)
++              goto out;
++      for (i = 0; i<= last; i++) {
++              ret[i] = alloc_page(GFP_KERNEL);
++              if (ret[i] == NULL)
++                      goto out_free;
++      }
++out:
++      return ret;
++out_free:
++      for (i--; i >= 0; i--) {
++              __free_page(ret[i]);
++      }
++      return NULL;
++}
++
++static inline int
++svcauth_gss_wrap_resp_priv(struct svc_rqst *rqstp)
++{
++      struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data;
++      struct rpc_gss_wire_cred *gc = &gsd->clcred;
++      struct xdr_buf *resbuf = &rqstp->rq_res;
++      struct page **inpages;
++      u32 *p;
++      int offset, *len;
++      int pad;
++      int stat = -EINVAL;
++
++      p = gsd->body_start;
++      gsd->body_start = 0;
++      /* move accept_stat to right place: */
++      memcpy(p, p + 2, 4);
++      /* Don't wrap in failure case: */
++      /* Counting on not getting here if call was not even accepted! */
++      if (*p != rpc_success) {
++              resbuf->head[0].iov_len -= 2 * 4;
++              goto out;
++      }
++      p++;
++      len = p++;
++      offset = (u8 *)p - (u8 *)resbuf->head[0].iov_base;
++      *p++ = htonl(gc->gc_seq);
++      stat = -ENOMEM;
++      inpages = resbuf->pages;
++      /* XXXJBF: huge memory leaks here: allocated pages probably aren't
++       * freed, and neither is memory used to hold page array. */
++      resbuf->pages = svc_alloc_enc_pages(resbuf);
++      if (resbuf->page_len && !resbuf->pages)
++              goto out_err; /* XXX sleep and retry? Reserve ahead of time
++                              and BUG_ON? */
++      if (resbuf->tail[0].iov_len == 0 || resbuf->tail[0].iov_base == NULL) {
++              /* copied from nfsd4_encode_read */
++              {int i = svc_take_page(rqstp); BUG_ON(i); }
++              resbuf->tail[0].iov_base = page_address(rqstp
++                              ->rq_respages[rqstp->rq_resused-1]);
++              rqstp->rq_restailpage = rqstp->rq_resused-1;
++              resbuf->tail[0].iov_len = 0;
++      }
++      /* XXX: Will svc code attempt to free stuff in xdr_buf->pages?
++       * Or can we leave it in any old state on error?? */
++      stat = -EINVAL;
++      if (gss_wrap(gsd->rsci->mechctx, GSS_C_QOP_DEFAULT, offset,
++                              resbuf, inpages))
++              goto out_err;
++      *len = htonl(resbuf->len - offset);
++      pad = 3 - ((resbuf->len - offset - 1)&3);
++      p = (u32 *)(resbuf->tail[0].iov_base + resbuf->tail[0].iov_len);
++      memset(p, 0, pad);
++      resbuf->tail[0].iov_len += pad;
++out:
++      return 0;
++out_err:
++      return stat;
++}
++
++static int
++svcauth_gss_release(struct svc_rqst *rqstp)
++{
++      struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data;
++      struct rpc_gss_wire_cred *gc = &gsd->clcred;
++      struct xdr_buf *resbuf = &rqstp->rq_res;
++      int stat = -EINVAL;
++
+       if (gc->gc_proc != RPC_GSS_PROC_DATA)
+               goto out;
+       /* Release can be called twice, but we only wrap once. */
+@@ -969,55 +1214,15 @@ svcauth_gss_release(struct svc_rqst *rqs
+       case RPC_GSS_SVC_NONE:
+               break;
+       case RPC_GSS_SVC_INTEGRITY:
+-              p = gsd->body_start;
+-              gsd->body_start = 0;
+-              /* move accept_stat to right place: */
+-              memcpy(p, p + 2, 4);
+-              /* don't wrap in failure case: */
+-              /* Note: counting on not getting here if call was not even
+-               * accepted! */
+-              if (*p != rpc_success) {
+-                      resbuf->head[0].iov_len -= 2 * 4;
+-                      goto out;
+-              }
+-              p++;
+-              integ_offset = (u8 *)(p + 1) - (u8 *)resbuf->head[0].iov_base;
+-              integ_len = resbuf->len - integ_offset;
+-              BUG_ON(integ_len % 4);
+-              *p++ = htonl(integ_len);
+-              *p++ = htonl(gc->gc_seq);
+-              if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset,
+-                                      integ_len))
+-                      BUG();
+-              if (resbuf->page_len == 0
+-                      && resbuf->tail[0].iov_len + RPC_MAX_AUTH_SIZE
+-                              < PAGE_SIZE) {
+-                      BUG_ON(resbuf->tail[0].iov_len);
+-                      /* Use head for everything */
+-                      resv = &resbuf->head[0];
+-              } else if (resbuf->tail[0].iov_base == NULL) {
+-                      /* copied from nfsd4_encode_read */
+-                      svc_take_page(rqstp);
+-                      resbuf->tail[0].iov_base = page_address(rqstp
+-                                      ->rq_respages[rqstp->rq_resused-1]);
+-                      rqstp->rq_restailpage = rqstp->rq_resused-1;
+-                      resbuf->tail[0].iov_len = 0;
+-                      resv = &resbuf->tail[0];
+-              } else {
+-                      resv = &resbuf->tail[0];
+-              }
+-              mic.data = (u8 *)resv->iov_base + resv->iov_len + 4;
+-              if (gss_get_mic(gsd->rsci->mechctx, 0, &integ_buf, &mic))
++              stat = svcauth_gss_wrap_resp_integ(rqstp);
++              if (stat)
+                       goto out_err;
+-              svc_putu32(resv, htonl(mic.len));
+-              memset(mic.data + mic.len, 0,
+-                              round_up_to_quad(mic.len) - mic.len);
+-              resv->iov_len += XDR_QUADLEN(mic.len) << 2;
+-              /* not strictly required: */
+-              resbuf->len += XDR_QUADLEN(mic.len) << 2;
+-              BUG_ON(resv->iov_len > PAGE_SIZE);
+               break;
+       case RPC_GSS_SVC_PRIVACY:
++              stat = svcauth_gss_wrap_resp_priv(rqstp);
++              if (stat)
++                      goto out_err;
++              break;
+       default:
+               goto out_err;
+       }
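Both gss_wrap_req_priv() on the client side and svcauth_gss_wrap_resp_priv() above pad the wrapped region out to the next 4-byte XDR boundary with pad = 3 - ((len - 1) & 3), where len is measured from the start of the wrapped data. A standalone, user-space check of that arithmetic:

/* Quick demo of the XDR round-up used above: a non-empty length is padded
 * to the next multiple of 4; lengths already aligned get pad 0. */
#include <stdio.h>

int main(void)
{
        for (unsigned int len = 1; len <= 8; len++) {
                unsigned int pad = 3 - ((len - 1) & 3);
                printf("len %u -> pad %u -> total %u\n", len, pad, len + pad);
        }
        return 0;
}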
+--- linux-2.6.7/net/sunrpc/auth_gss/gss_krb5_wrap.c.lsec       2005-03-23 14:28:24.900264280 -0700
++++ linux-2.6.7/net/sunrpc/auth_gss/gss_krb5_wrap.c    2005-03-23 14:28:24.900264280 -0700
+@@ -0,0 +1,337 @@
++#include <linux/types.h>
++#include <linux/slab.h>
++#include <linux/jiffies.h>
++#include <linux/sunrpc/gss_krb5.h>
++#include <linux/random.h>
++#include <linux/pagemap.h>
++#include <asm/scatterlist.h>
++#include <linux/crypto.h>
++
++#ifdef RPC_DEBUG
++# define RPCDBG_FACILITY      RPCDBG_AUTH
++#endif
++
++static inline int
++gss_krb5_padding(int blocksize, int length)
++{
++      /* Most of the code is block-size independent but currently we
++       * use only 8: */
++      BUG_ON(blocksize != 8);
++      return 8 - (length & 7);
++}
++
++static inline void
++gss_krb5_add_padding(struct xdr_buf *buf, int offset, int blocksize)
++{
++      int padding = gss_krb5_padding(blocksize, buf->len - offset);
++      char *p;
++      struct iovec *iov;
++
++      if (buf->page_len || buf->tail[0].iov_len)
++              iov = &buf->tail[0];
++      else
++              iov = &buf->head[0];
++      p = iov->iov_base + iov->iov_len;
++      iov->iov_len += padding;
++      buf->len += padding;
++      memset(p, padding, padding);
++}
++
++static inline int
++gss_krb5_remove_padding(struct xdr_buf *buf, int blocksize)
++{
++      u8 *ptr;
++      u8 pad;
++      int len = buf->len;
++
++      if (len <= buf->head[0].iov_len) {
++              pad = *(u8 *)(buf->head[0].iov_base + len - 1);
++              goto out;
++      } else
++              len -= buf->head[0].iov_len;
++      if (len <= buf->page_len) {
++              int last = (buf->page_base + len - 1)
++                                      >>PAGE_CACHE_SHIFT;
++              int offset = (buf->page_base + len - 1)
++                                      & (PAGE_CACHE_SIZE - 1);
++              ptr = kmap_atomic(buf->pages[last], KM_SKB_SUNRPC_DATA);
++              pad = *(ptr + offset);
++              kunmap_atomic(ptr, KM_SKB_SUNRPC_DATA);
++              goto out;
++      } else
++              len -= buf->page_len;
++      BUG_ON(len > buf->tail[0].iov_len);
++      pad = *(u8 *)(buf->tail[0].iov_base + len - 1);
++out:
++      if (pad > blocksize)
++              return -EINVAL;
++      buf->len -= pad;
++      return 0;
++}
++
++static inline void
++make_confounder(char *p, int blocksize)
++{
++      /* XXX?  Is this OK to do on every packet? */
++      get_random_bytes(p, blocksize);
++}
++
++/* Assumptions: the head and tail of inbuf are ours to play with.
++ * The pages, however, may be real pages in the page cache and we replace
++ * them with scratch pages from **pages before writing to them. */
++/* XXX: obviously the above should be documentation of wrap interface,
++ * and shouldn't be in this kerberos-specific file. */
++
++/* XXX factor out common code with seal/unseal. */
++
++u32
++gss_wrap_kerberos(struct gss_ctx *ctx, u32 qop, int offset,
++              struct xdr_buf *buf, struct page **pages)
++{
++      struct krb5_ctx         *kctx = ctx->internal_ctx_id;
++      s32                     checksum_type;
++      struct xdr_netobj       md5cksum = {.len = 0, .data = NULL};
++      int                     blocksize = 0, plainlen;
++      unsigned char           *ptr, *krb5_hdr, *msg_start;
++      s32                     now;
++      int                     headlen;
++      struct page             **tmp_pages;
++      u32                     seq_send;
++
++      dprintk("RPC:     gss_wrap_kerberos\n");
++
++      now = get_seconds();
++
++      if (qop != 0)
++              goto out_err;
++
++      switch (kctx->signalg) {
++              case SGN_ALG_DES_MAC_MD5:
++                      checksum_type = CKSUMTYPE_RSA_MD5;
++                      break;
++              default:
++                      dprintk("RPC:      gss_krb5_seal: kctx->signalg %d not"
++                              " supported\n", kctx->signalg);
++                      goto out_err;
++      }
++      if (kctx->sealalg != SEAL_ALG_NONE && kctx->sealalg != SEAL_ALG_DES) {
++              dprintk("RPC:      gss_krb5_seal: kctx->sealalg %d not supported\n",
++                      kctx->sealalg);
++              goto out_err;
++      }
++
++      blocksize = crypto_tfm_alg_blocksize(kctx->enc);
++      gss_krb5_add_padding(buf, offset, blocksize);
++      BUG_ON((buf->len - offset) % blocksize);
++      plainlen = blocksize + buf->len - offset;
++
++      headlen = g_token_size(&kctx->mech_used, 22 + plainlen) -
++                                              (buf->len - offset);
++
++      ptr = buf->head[0].iov_base + offset;
++      /* shift data to make room for header. */
++      /* XXX Would be cleverer to encrypt while copying. */
++      /* XXX bounds checking, slack, etc. */
++      memmove(ptr + headlen, ptr, buf->head[0].iov_len - offset);
++      buf->head[0].iov_len += headlen;
++      buf->len += headlen;
++      BUG_ON((buf->len - offset - headlen) % blocksize);
++
++      g_make_token_header(&kctx->mech_used, 22 + plainlen, &ptr);
++
++
++      *ptr++ = (unsigned char) ((KG_TOK_WRAP_MSG>>8)&0xff);
++      *ptr++ = (unsigned char) (KG_TOK_WRAP_MSG&0xff);
++
++      /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */
++      krb5_hdr = ptr - 2;
++      msg_start = krb5_hdr + 24;
++      /* XXXJBF: */ BUG_ON(buf->head[0].iov_base + offset + headlen != msg_start + blocksize);
++
++      *(u16 *)(krb5_hdr + 2) = htons(kctx->signalg);
++      memset(krb5_hdr + 4, 0xff, 4);
++      *(u16 *)(krb5_hdr + 4) = htons(kctx->sealalg);
++
++      make_confounder(msg_start, blocksize);
++
++      /* XXXJBF: UGH!: */
++      tmp_pages = buf->pages;
++      buf->pages = pages;
++      if (make_checksum(checksum_type, krb5_hdr, 8, buf,
++                              offset + headlen - blocksize, &md5cksum))
++              goto out_err;
++      buf->pages = tmp_pages;
++
++      switch (kctx->signalg) {
++      case SGN_ALG_DES_MAC_MD5:
++              if (krb5_encrypt(kctx->seq, NULL, md5cksum.data,
++                                md5cksum.data, md5cksum.len))
++                      goto out_err;
++              memcpy(krb5_hdr + 16,
++                     md5cksum.data + md5cksum.len - KRB5_CKSUM_LENGTH,
++                     KRB5_CKSUM_LENGTH);
++
++              dprintk("RPC:      make_seal_token: cksum data: \n");
++              print_hexl((u32 *) (krb5_hdr + 16), KRB5_CKSUM_LENGTH, 0);
++              break;
++      default:
++              BUG();
++      }
++
++      kfree(md5cksum.data);
++
++      spin_lock(&krb5_seq_lock);
++      seq_send = kctx->seq_send++;
++      spin_unlock(&krb5_seq_lock);
++
++      /* XXX would probably be more efficient to compute checksum
++       * and encrypt at the same time: */
++      if ((krb5_make_seq_num(kctx->seq, kctx->initiate ? 0 : 0xff,
++                             seq_send, krb5_hdr + 16, krb5_hdr + 8)))
++              goto out_err;
++
++      if (gss_encrypt_xdr_buf(kctx->enc, buf, offset + headlen - blocksize,
++                                                                      pages))
++              goto out_err;
++
++      return ((kctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE);
++out_err:
++      if (md5cksum.data) kfree(md5cksum.data);
++      return GSS_S_FAILURE;
++}
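The header that gss_wrap_kerberos() assembles at krb5_hdr follows the WRAP token of RFC 1964, section 1.2.1; reading the offsets off the code above gives signalg at +2, sealalg at +4, the 0xff filler at +6, the sequence number at +8 and the checksum at +16, with the confounder and the padded message starting at +24. The struct below is only an illustration of that layout (the code works on raw byte offsets, not a struct):

    /* Illustrative layout of the header built at krb5_hdr by
     * gss_wrap_kerberos(); offsets read directly off the code above. */
    struct krb5_wrap_token_hdr {
            unsigned char tok_id[2];     /* the two KG_TOK_WRAP_MSG bytes           */
            unsigned char sgn_alg[2];    /* htons(kctx->signalg), DES-MAC-MD5 here  */
            unsigned char seal_alg[2];   /* htons(kctx->sealalg)                    */
            unsigned char filler[2];     /* 0xff 0xff                               */
            unsigned char snd_seq[8];    /* sequence number, written at +8          */
            unsigned char sgn_cksum[8];  /* last 8 bytes of the encrypted MD5 cksum */
    };
    /* msg_start = krb5_hdr + 24: one cipher block of random confounder,
     * then the padded message, all encrypted with kctx->enc. */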
++
++u32
++gss_unwrap_kerberos(struct gss_ctx *ctx, u32 *qop, int offset,
++                      struct xdr_buf *buf, int *out_offset)
++{
++      struct krb5_ctx         *kctx = ctx->internal_ctx_id;
++      int                     signalg;
++      int                     sealalg;
++      s32                     checksum_type;
++      struct xdr_netobj       md5cksum = {.len = 0, .data = NULL};
++      s32                     now;
++      int                     direction;
++      s32                     seqnum;
++      unsigned char           *ptr;
++      int                     bodysize;
++      u32                     ret = GSS_S_DEFECTIVE_TOKEN;
++      u8                      *data_start;
++      int                     blocksize;
++
++      dprintk("RPC:      gss_unwrap_kerberos\n");
++
++      ptr = (u8 *)buf->head[0].iov_base + offset;
++      if (g_verify_token_header(&kctx->mech_used, &bodysize, &ptr,
++                                      buf->len - offset))
++              goto out;
++
++      if ((*ptr++ != ((KG_TOK_WRAP_MSG>>8)&0xff)) ||
++          (*ptr++ !=  (KG_TOK_WRAP_MSG    &0xff))   )
++              goto out;
++
++      /* XXX sanity-check bodysize?? */
++
++      /* get the sign and seal algorithms */
++
++      signalg = ptr[0] + (ptr[1] << 8);
++      sealalg = ptr[2] + (ptr[3] << 8);
++
++      /* Sanity checks */
++
++      if ((ptr[4] != 0xff) || (ptr[5] != 0xff))
++              goto out;
++
++      if (sealalg == 0xffff)
++              goto out;
++
++      /* in the current spec, there is only one valid seal algorithm per
++         key type, so a simple comparison is ok */
++
++      if (sealalg != kctx->sealalg)
++              goto out;
++
++      /* there are several mappings of seal algorithms to sign algorithms,
++         but few enough that we can try them all. */
++
++      if ((kctx->sealalg == SEAL_ALG_NONE && signalg > 1) ||
++          (kctx->sealalg == SEAL_ALG_1 && signalg != SGN_ALG_3) ||
++          (kctx->sealalg == SEAL_ALG_DES3KD &&
++           signalg != SGN_ALG_HMAC_SHA1_DES3_KD))
++              goto out;
++
++      if (gss_decrypt_xdr_buf(kctx->enc, buf,
++                      ptr + 22 - (unsigned char *)buf->head[0].iov_base))
++              goto out;
++
++      /* compute the checksum of the message */
++
++      /* initialize the cksum */
++      switch (signalg) {
++      case SGN_ALG_DES_MAC_MD5:
++              checksum_type = CKSUMTYPE_RSA_MD5;
++              break;
++      default:
++              ret = GSS_S_DEFECTIVE_TOKEN;
++              goto out;
++      }
++
++      switch (signalg) {
++      case SGN_ALG_DES_MAC_MD5:
++              ret = make_checksum(checksum_type, ptr - 2, 8, buf,
++                       ptr + 22 - (unsigned char *)buf->head[0].iov_base, &md5cksum);
++              if (ret)
++                      goto out;
++
++              ret = krb5_encrypt(kctx->seq, NULL, md5cksum.data,
++                                 md5cksum.data, md5cksum.len);
++              if (ret)
++                      goto out;
++
++              if (memcmp(md5cksum.data + 8, ptr + 14, 8)) {
++                      ret = GSS_S_BAD_SIG;
++                      goto out;
++              }
++              break;
++      default:
++              ret = GSS_S_DEFECTIVE_TOKEN;
++              goto out;
++      }
++
++      /* it got through unscathed.  Make sure the context is unexpired */
++
++      if (qop)
++              *qop = GSS_C_QOP_DEFAULT;
++
++      now = get_seconds();
++
++      ret = GSS_S_CONTEXT_EXPIRED;
++      if (now > kctx->endtime)
++              goto out;
++
++      /* do sequencing checks */
++
++      ret = GSS_S_BAD_SIG;
++      if ((ret = krb5_get_seq_num(kctx->seq, ptr + 14, ptr + 6, &direction,
++                                  &seqnum)))
++              goto out;
++
++      if ((kctx->initiate && direction != 0xff) ||
++          (!kctx->initiate && direction != 0))
++              goto out;
++
++      /* Copy the data back to the right position.  XXX: Would probably be
++       * better to copy and encrypt at the same time. */
++
++      blocksize = crypto_tfm_alg_blocksize(kctx->enc);
++      data_start = ptr + 22 + blocksize;
++      *out_offset = data_start - (u8 *)buf->head[0].iov_base;
++
++      ret = GSS_S_DEFECTIVE_TOKEN;
++      if (gss_krb5_remove_padding(buf, blocksize))
++              goto out;
++
++      ret = GSS_S_COMPLETE;
++out:
++      if (md5cksum.data) kfree(md5cksum.data);
++      return ret;
++}
+--- linux-2.6.7/net/sunrpc/auth_gss/gss_mech_switch.c.lsec     2004-06-15 23:19:37.000000000 -0600
++++ linux-2.6.7/net/sunrpc/auth_gss/gss_mech_switch.c  2005-03-23 14:28:24.782282216 -0700
+@@ -279,6 +279,29 @@ gss_verify_mic(struct gss_ctx             *context_
+                                qstate);
+ }
++u32
++gss_wrap(struct gss_ctx       *ctx_id,
++       u32            qop,
++       int            offset,
++       struct xdr_buf *buf,
++       struct page    **inpages)
++{
++      return ctx_id->mech_type->gm_ops
++              ->gss_wrap(ctx_id, qop, offset, buf, inpages);
++}
++
++u32
++gss_unwrap(struct gss_ctx     *ctx_id,
++         u32                  *qop,
++         int                  offset,
++         struct xdr_buf       *buf,
++         int                  *out_offset)
++{
++      return ctx_id->mech_type->gm_ops
++              ->gss_unwrap(ctx_id, qop, offset, buf, out_offset);
++}
++
++
+ /* gss_delete_sec_context: free all resources associated with context_handle.
+  * Note this differs from the RFC 2744-specified prototype in that we don't
+  * bother returning an output token, since it would never be used anyway. */
+--- linux-2.6.7/net/sunrpc/auth_gss/gss_krb5_mech.c.lsec       2004-06-15 23:19:42.000000000 -0600
++++ linux-2.6.7/net/sunrpc/auth_gss/gss_krb5_mech.c    2005-03-23 14:28:24.841273248 -0700
+@@ -182,6 +182,7 @@ gss_delete_sec_context_kerberos(void *in
+       kfree(kctx);
+ }
++/* XXX the following wrappers have become pointless; kill them. */
+ static u32
+ gss_verify_mic_kerberos(struct gss_ctx                *ctx,
+                       struct xdr_buf          *message,
+@@ -191,8 +192,7 @@ gss_verify_mic_kerberos(struct gss_ctx             
+       int qop_state;
+       struct krb5_ctx *kctx = ctx->internal_ctx_id;
+-      maj_stat = krb5_read_token(kctx, mic_token, message, &qop_state,
+-                                 KG_TOK_MIC_MSG);
++      maj_stat = krb5_read_token(kctx, mic_token, message, &qop_state);
+       if (!maj_stat && qop_state)
+           *qstate = qop_state;
+@@ -208,7 +208,7 @@ gss_get_mic_kerberos(struct gss_ctx        *ctx
+       u32 err = 0;
+       struct krb5_ctx *kctx = ctx->internal_ctx_id;
+-      err = krb5_make_token(kctx, qop, message, mic_token, KG_TOK_MIC_MSG);
++      err = krb5_make_token(kctx, qop, message, mic_token);
+       dprintk("RPC:      gss_get_mic_kerberos returning %d\n",err);
+@@ -219,6 +219,8 @@ static struct gss_api_ops gss_kerberos_o
+       .gss_import_sec_context = gss_import_sec_context_kerberos,
+       .gss_get_mic            = gss_get_mic_kerberos,
+       .gss_verify_mic         = gss_verify_mic_kerberos,
++      .gss_wrap               = gss_wrap_kerberos,
++      .gss_unwrap             = gss_unwrap_kerberos,
+       .gss_delete_sec_context = gss_delete_sec_context_kerberos,
+ };
+@@ -233,6 +235,11 @@ static struct pf_desc gss_kerberos_pfs[]
+               .service = RPC_GSS_SVC_INTEGRITY,
+               .name = "krb5i",
+       },
++      [2] = {
++              .pseudoflavor = RPC_AUTH_GSS_KRB5P,
++              .service = RPC_GSS_SVC_PRIVACY,
++              .name = "krb5p",
++      },
+ };
+ static struct gss_api_mech gss_kerberos_mech = {
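With .gss_wrap/.gss_unwrap wired into gss_kerberos_ops and the krb5p pseudoflavor (RPC_GSS_SVC_PRIVACY) registered above, privacy can be driven through the generic entry points added in gss_mech_switch.c. A hedged usage sketch (the helper below is illustrative and not part of the patch):

    /* Round trip through the new mech-switch entry points.  'ctx' is an
     * established gss context; 'pages' supplies scratch pages for the
     * in-place encryption (see the comment atop gss_wrap_kerberos()). */
    static u32 privacy_round_trip(struct gss_ctx *ctx, int offset,
                                  struct xdr_buf *buf, struct page **pages)
    {
            u32 qop = 0;
            int data_offset;
            u32 maj;

            maj = gss_wrap(ctx, 0, offset, buf, pages);
            if (maj != GSS_S_COMPLETE)
                    return maj;

            maj = gss_unwrap(ctx, &qop, offset, buf, &data_offset);
            if (maj != GSS_S_COMPLETE)
                    return maj;

            /* cleartext now starts at buf->head[0].iov_base + data_offset,
             * past the krb5 token header and confounder */
            return GSS_S_COMPLETE;
    }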
+--- linux-2.6.7/net/sunrpc/auth_gss/gss_krb5_seal.c.lsec       2004-06-15 23:18:37.000000000 -0600
++++ linux-2.6.7/net/sunrpc/auth_gss/gss_krb5_seal.c    2005-03-23 14:28:24.898264584 -0700
+@@ -70,24 +70,17 @@
+ # define RPCDBG_FACILITY        RPCDBG_AUTH
+ #endif
+-static inline int
+-gss_krb5_padding(int blocksize, int length) {
+-      /* Most of the code is block-size independent but in practice we
+-       * use only 8: */
+-      BUG_ON(blocksize != 8);
+-      return 8 - (length & 7);
+-}
++spinlock_t krb5_seq_lock = SPIN_LOCK_UNLOCKED;
+ u32
+ krb5_make_token(struct krb5_ctx *ctx, int qop_req,
+-                 struct xdr_buf *text, struct xdr_netobj *token,
+-                 int toktype)
++                 struct xdr_buf *text, struct xdr_netobj *token)
+ {
+       s32                     checksum_type;
+       struct xdr_netobj       md5cksum = {.len = 0, .data = NULL};
+-      int                     blocksize = 0, tmsglen;
+       unsigned char           *ptr, *krb5_hdr, *msg_start;
+       s32                     now;
++      u32                     seq_send;
+       dprintk("RPC:     gss_krb5_seal\n");
+@@ -111,21 +104,13 @@ krb5_make_token(struct krb5_ctx *ctx, in
+               goto out_err;
+       }
+-      if (toktype == KG_TOK_WRAP_MSG) {
+-              blocksize = crypto_tfm_alg_blocksize(ctx->enc);
+-              tmsglen = blocksize + text->len
+-                      + gss_krb5_padding(blocksize, blocksize + text->len);
+-      } else {
+-              tmsglen = 0;
+-      }
+-
+-      token->len = g_token_size(&ctx->mech_used, 22 + tmsglen);
++      token->len = g_token_size(&ctx->mech_used, 22);
+       ptr = token->data;
+-      g_make_token_header(&ctx->mech_used, 22 + tmsglen, &ptr);
++      g_make_token_header(&ctx->mech_used, 22, &ptr);
+-      *ptr++ = (unsigned char) ((toktype>>8)&0xff);
+-      *ptr++ = (unsigned char) (toktype&0xff);
++      *ptr++ = (unsigned char) ((KG_TOK_MIC_MSG>>8)&0xff);
++      *ptr++ = (unsigned char) (KG_TOK_MIC_MSG&0xff);
+       /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */
+       krb5_hdr = ptr - 2;
+@@ -133,17 +118,9 @@ krb5_make_token(struct krb5_ctx *ctx, in
+       *(u16 *)(krb5_hdr + 2) = htons(ctx->signalg);
+       memset(krb5_hdr + 4, 0xff, 4);
+-      if (toktype == KG_TOK_WRAP_MSG)
+-              *(u16 *)(krb5_hdr + 4) = htons(ctx->sealalg);
+-      if (toktype == KG_TOK_WRAP_MSG) {
+-              /* XXX removing support for now */
+-              goto out_err;
+-      } else { /* Sign only.  */
+-              if (make_checksum(checksum_type, krb5_hdr, 8, text,
+-                                     &md5cksum))
++      if (make_checksum(checksum_type, krb5_hdr, 8, text, 0, &md5cksum))
+                       goto out_err;
+-      }
+       switch (ctx->signalg) {
+       case SGN_ALG_DES_MAC_MD5:
+@@ -163,12 +140,14 @@ krb5_make_token(struct krb5_ctx *ctx, in
+       kfree(md5cksum.data);
++      spin_lock(&krb5_seq_lock);
++      seq_send = ctx->seq_send++;
++      spin_unlock(&krb5_seq_lock);
++
+       if ((krb5_make_seq_num(ctx->seq, ctx->initiate ? 0 : 0xff,
+-                             ctx->seq_send, krb5_hdr + 16, krb5_hdr + 8)))
++                             seq_send, krb5_hdr + 16, krb5_hdr + 8)))
+               goto out_err;
+-      ctx->seq_send++;
+-
+       return ((ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE);
+ out_err:
+       if (md5cksum.data) kfree(md5cksum.data);
+--- linux-2.6.7/net/sunrpc/auth_gss/gss_spkm3_token.c.lsec     2005-03-23 14:28:24.240364600 -0700
++++ linux-2.6.7/net/sunrpc/auth_gss/gss_spkm3_token.c  2005-03-23 14:28:24.239364752 -0700
+@@ -0,0 +1,266 @@
++/*
++ *  linux/net/sunrpc/gss_spkm3_token.c
++ *
++ *  Copyright (c) 2003 The Regents of the University of Michigan.
++ *  All rights reserved.
++ *
++ *  Andy Adamson <andros@umich.edu>
++ *
++ *  Redistribution and use in source and binary forms, with or without
++ *  modification, are permitted provided that the following conditions
++ *  are met:
++ *
++ *  1. Redistributions of source code must retain the above copyright
++ *     notice, this list of conditions and the following disclaimer.
++ *  2. Redistributions in binary form must reproduce the above copyright
++ *     notice, this list of conditions and the following disclaimer in the
++ *     documentation and/or other materials provided with the distribution.
++ *  3. Neither the name of the University nor the names of its
++ *     contributors may be used to endorse or promote products derived
++ *     from this software without specific prior written permission.
++ *
++ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ *
++ */
++
++#include <linux/types.h>
++#include <linux/slab.h>
++#include <linux/jiffies.h>
++#include <linux/sunrpc/gss_spkm3.h>
++#include <linux/random.h>
++#include <linux/crypto.h>
++
++#ifdef RPC_DEBUG
++# define RPCDBG_FACILITY        RPCDBG_AUTH
++#endif
++
++/*
++ * asn1_bitstring_len()
++ *
++ * calculate the asn1 bitstring length of the xdr_netobject
++ */
++void
++asn1_bitstring_len(struct xdr_netobj *in, int *enclen, int *zerobits)
++{
++      int i, zbit = 0,elen = in->len;
++      char *ptr;
++
++      ptr = &in->data[in->len -1];
++
++      /* count trailing 0's */
++      for(i = in->len; i > 0; i--) {
++              if (*ptr == 0) {
++                      ptr--;
++                      elen--;
++              } else
++                      break;
++      }
++
++      /* count number of 0 bits in final octet */
++      ptr = &in->data[elen - 1];
++      for(i = 0; i < 8; i++) {
++              short mask = 0x01;
++
++              if (!((mask << i) & *ptr))
++                      zbit++;
++              else
++                      break;
++      }
++      *enclen = elen;
++      *zerobits = zbit;
++}
++
++/*
++ * decode_asn1_bitstring()
++ *
++ * decode a bitstring into a buffer of the expected length.
++ * enclen = bit string length
++ * explen = expected length (define in rfc)
++ */
++int
++decode_asn1_bitstring(struct xdr_netobj *out, char *in, int enclen, int explen)
++{
++      if (!(out->data = kmalloc(explen,GFP_KERNEL)))
++              return 0;
++      out->len = explen;
++      memset(out->data, 0, explen);
++      memcpy(out->data, in, enclen);
++      return 1;
++}
++
++/*
++ * SPKMInnerContextToken choice SPKM_MIC asn1 token layout
++ *
++ * contextid is always 16 bytes plain data. max asn1 bitstring len = 17.
++ *
++ * tokenlen = pos[0] to end of token (max pos[45] with MD5 cksum)
++ *
++ * pos  value
++ * ----------
++ * [0]        a4  SPKM-MIC tag
++ * [1]        ??  innertoken length  (max 44)
++ *
++ *
++ * tok_hdr piece of checksum data starts here
++ *
++ * the maximum mic-header len = 9 + 17 = 26
++ *    mic-header
++ *    ----------
++ * [2]        30      SEQUENCE tag
++ * [3]        ??      mic-header length: (max 23) = TokenID + ContextID
++ *
++ *            TokenID  - all fields constant and can be hardcoded
++ *            -------
++ * [4]          02    Type 2
++ * [5]          02    Length 2
++ * [6][7] 01 01       TokenID (SPKM_MIC_TOK)
++ *
++ *            ContextID  - encoded length not constant, calculated
++ *            ---------
++ * [8]        03      Type 3
++ * [9]        ??      encoded length
++ * [10]       ??      ctxzbit
++ * [11]               contextid
++ *
++ * mic_header piece of checksum data ends here.
++ *
++ *    int-cksum - encoded length not constant, calculated
++ *    ---------
++ * [??]       03      Type 3
++ * [??]       ??      encoded length
++ * [??]       ??      md5zbit
++ * [??]               int-cksum (NID_md5 = 16)
++ *
++ * maximum SPKM-MIC innercontext token length =
++ *     10 + encoded contextid_size(17 max) + 2 + encoded
++ *       cksum_size (17 max for NID_md5) = 46
++ */
++
++/*
++ * spkm3_mic_header()
++ *
++ * Prepare the SPKM_MIC_TOK mic-header for check-sum calculation
++ * elen: 16 byte context id asn1 bitstring encoded length
++ */
++void
++spkm3_mic_header(unsigned char **hdrbuf, unsigned int *hdrlen, unsigned char *ctxdata, int elen, int zbit)
++{
++      char *hptr = *hdrbuf;
++      char *top = *hdrbuf;
++
++      *(u8 *)hptr++ = 0x30;
++      *(u8 *)hptr++ = elen + 7;  /* on the wire header length */
++
++      /* tokenid */
++      *(u8 *)hptr++ = 0x02;
++      *(u8 *)hptr++ = 0x02;
++      *(u8 *)hptr++ = 0x01;
++      *(u8 *)hptr++ = 0x01;
++
++      /* contextid */
++      *(u8 *)hptr++ = 0x03;
++      *(u8 *)hptr++ = elen + 1; /* add 1 to include zbit */
++      *(u8 *)hptr++ = zbit;
++      memcpy(hptr, ctxdata, elen);
++      hptr += elen;
++      *hdrlen = hptr - top;
++}
++
++/*
++ * spkm3_mic_innercontext_token()
++ *
++ * *tokp points to the beginning of the SPKM_MIC token  described
++ * in rfc 2025, section 3.2.1:
++ *
++ */
++void
++spkm3_make_mic_token(unsigned char **tokp, int toklen, struct xdr_netobj *mic_hdr, struct xdr_netobj *md5cksum, int md5elen, int md5zbit)
++{
++      unsigned char *ict = *tokp;
++
++      *(u8 *)ict++ = 0xa4;
++      *(u8 *)ict++ = toklen - 2;
++      memcpy(ict, mic_hdr->data, mic_hdr->len);
++      ict += mic_hdr->len;
++
++      *(u8 *)ict++ = 0x03;
++      *(u8 *)ict++ = md5elen + 1; /* add 1 to include zbit */
++      *(u8 *)ict++ = md5zbit;
++      memcpy(ict, md5cksum->data, md5elen);
++}
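Working the layout comment above through the common case of a 16-byte contextid with no trailing zero octets and an MD5 checksum gives the following byte budget, a quick check of the 46-byte maximum quoted in that comment:

    /* elen = 16, zbit = 0, md5elen = 16, md5zbit = 0
     *
     *   mic-header (spkm3_mic_header):      2 + 4 + 2 + 1 + 16 = 25 bytes
     *       SEQ tag/len, TokenID, ctx tag/len, zbit, contextid
     *   int-cksum:                          2 + 1 + 16         = 19 bytes
     *       cksum tag/len, zbit, MD5 digest
     *   innertoken (spkm3_make_mic_token):  2 + 25 + 19        = 46 bytes
     *       SPKM-MIC tag/len plus the two pieces above
     *
     * which matches "maximum SPKM-MIC innercontext token length ... = 46". */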
++
++u32
++spkm3_verify_mic_token(unsigned char **tokp, int *mic_hdrlen, unsigned char **cksum)
++{
++      struct xdr_netobj       spkm3_ctx_id = {.len =0, .data = NULL};
++      unsigned char           *ptr = *tokp;
++      int                     ctxelen;
++      u32                     ret = GSS_S_DEFECTIVE_TOKEN;
++
++      /* spkm3 innercontext token preamble */
++      if ((ptr[0] != 0xa4) || (ptr[2] != 0x30)) {
++              dprintk("RPC: BAD SPKM ictoken preamble\n");
++              goto out;
++      }
++
++      *mic_hdrlen = ptr[3];
++
++      /* token type */
++      if ((ptr[4] != 0x02) || (ptr[5] != 0x02)) {
++              dprintk("RPC: BAD asn1 SPKM3 token type\n");
++              goto out;
++      }
++
++      /* only support SPKM_MIC_TOK */
++      if((ptr[6] != 0x01) || (ptr[7] != 0x01)) {
++              dprintk("RPC: ERROR unsupported SPKM3 token \n");
++              goto out;
++      }
++
++      /* contextid */
++      if (ptr[8] != 0x03) {
++              dprintk("RPC: BAD SPKM3 asn1 context-id type\n");
++              goto out;
++      }
++
++      ctxelen = ptr[9];
++      if (ctxelen > 17) {  /* length includes asn1 zbit octet */
++              dprintk("RPC: BAD SPKM3 contextid len %d\n", ctxelen);
++              goto out;
++      }
++
++      /* ignore ptr[10] */
++
++      if(!decode_asn1_bitstring(&spkm3_ctx_id, &ptr[11], ctxelen - 1, 16))
++              goto out;
++
++      /*
++      * in the current implementation: the optional int-alg is not present
++      * so the default int-alg (md5) is used; the optional snd-seq field is
++      * also not present
++      */
++
++      if (*mic_hdrlen != 6 + ctxelen) {
++              dprintk("RPC: BAD SPKM_ MIC_TOK header len %d: we only support default int-alg (should be absent) and do not support snd-seq\n", *mic_hdrlen);
++              goto out;
++      }
++      /* checksum */
++        *cksum = (&ptr[10] + ctxelen); /* ctxelen includes ptr[10] */
++
++      ret = GSS_S_COMPLETE;
++out:
++      if (spkm3_ctx_id.data)
++              kfree(spkm3_ctx_id.data);
++      return ret;
++}
++
+--- linux-2.6.7/net/sunrpc/auth_gss/gss_generic_token.c.lsec   2004-06-15 23:19:10.000000000 -0600
++++ linux-2.6.7/net/sunrpc/auth_gss/gss_generic_token.c        2005-03-23 14:28:23.707445616 -0700
+@@ -179,7 +179,7 @@ EXPORT_SYMBOL(g_make_token_header);
+  */
+ u32
+ g_verify_token_header(struct xdr_netobj *mech, int *body_size,
+-                    unsigned char **buf_in, int tok_type, int toksize)
++                    unsigned char **buf_in, int toksize)
+ {
+       unsigned char *buf = *buf_in;
+       int seqsize;
+--- linux-2.6.7/net/sunrpc/auth_gss/gss_spkm3_unseal.c.lsec    2005-03-23 14:28:24.240364600 -0700
++++ linux-2.6.7/net/sunrpc/auth_gss/gss_spkm3_unseal.c 2005-03-23 14:28:24.240364600 -0700
+@@ -0,0 +1,128 @@
++/*
++ *  linux/net/sunrpc/gss_spkm3_unseal.c
++ *
++ *  Copyright (c) 2003 The Regents of the University of Michigan.
++ *  All rights reserved.
++ *
++ *  Andy Adamson <andros@umich.edu>
++ *
++ *  Redistribution and use in source and binary forms, with or without
++ *  modification, are permitted provided that the following conditions
++ *  are met:
++ *
++ *  1. Redistributions of source code must retain the above copyright
++ *     notice, this list of conditions and the following disclaimer.
++ *  2. Redistributions in binary form must reproduce the above copyright
++ *     notice, this list of conditions and the following disclaimer in the
++ *     documentation and/or other materials provided with the distribution.
++ *  3. Neither the name of the University nor the names of its
++ *     contributors may be used to endorse or promote products derived
++ *     from this software without specific prior written permission.
++ *
++ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ *
++ */
++
++#include <linux/types.h>
++#include <linux/slab.h>
++#include <linux/jiffies.h>
++#include <linux/sunrpc/gss_spkm3.h>
++#include <linux/crypto.h>
++
++#ifdef RPC_DEBUG
++# define RPCDBG_FACILITY        RPCDBG_AUTH
++#endif
++
++/*
++ * spkm3_read_token()
++ *
++ * only SPKM_MIC_TOK with md5 intg-alg is supported
++ */
++u32
++spkm3_read_token(struct spkm3_ctx *ctx,
++              struct xdr_netobj *read_token,    /* checksum */
++              struct xdr_buf *message_buffer, /* signbuf */
++              int *qop_state, int toktype)
++{
++      s32                     code;
++      struct xdr_netobj       wire_cksum = {.len =0, .data = NULL};
++      struct xdr_netobj       md5cksum = {.len = 0, .data = NULL};
++      unsigned char           *ptr = (unsigned char *)read_token->data;
++      unsigned char           *cksum;
++      int                     bodysize, md5elen;
++      int                     mic_hdrlen;
++      u32                     ret = GSS_S_DEFECTIVE_TOKEN;
++
++      dprintk("RPC: spkm3_read_token read_token->len %d\n", read_token->len);
++
++      if (g_verify_token_header((struct xdr_netobj *) &ctx->mech_used,
++                                      &bodysize, &ptr, read_token->len))
++              goto out;
++
++      /* decode the token */
++
++      if (toktype == SPKM_MIC_TOK) {
++
++              if ((ret = spkm3_verify_mic_token(&ptr, &mic_hdrlen, &cksum)))
++                      goto out;
++
++              if (*cksum++ != 0x03) {
++                      dprintk("RPC: spkm3_read_token BAD checksum type\n");
++                      goto out;
++              }
++              md5elen = *cksum++;
++              cksum++;        /* move past the zbit */
++
++              if(!decode_asn1_bitstring(&wire_cksum, cksum, md5elen - 1, 16))
++                      goto out;
++
++              /* HARD CODED FOR MD5 */
++
++              /* compute the checksum of the message.
++              *  ptr + 2 = start of header piece of checksum
++              *  mic_hdrlen + 2 = length of header piece of checksum
++              */
++              ret = GSS_S_DEFECTIVE_TOKEN;
++              code = make_checksum(CKSUMTYPE_RSA_MD5, ptr + 2,
++                                      mic_hdrlen + 2,
++                                      message_buffer, &md5cksum);
++
++              if (code)
++                      goto out;
++
++              dprintk("RPC: spkm3_read_token: digest wire_cksum.len %d:\n",
++                      wire_cksum.len);
++              dprintk("          md5cksum.data\n");
++              print_hexl((u32 *) md5cksum.data, 16, 0);
++              dprintk("          cksum.data:\n");
++              print_hexl((u32 *) wire_cksum.data, wire_cksum.len, 0);
++
++              ret = GSS_S_BAD_SIG;
++              code = memcmp(md5cksum.data, wire_cksum.data, wire_cksum.len);
++              if (code)
++                      goto out;
++
++      } else {
++              dprintk("RPC: BAD or UNSUPPORTED SPKM3 token type: %d\n",toktype);
++              goto out;
++      }
++
++      /* XXX: need to add expiration and sequencing */
++      ret = GSS_S_COMPLETE;
++out:
++      if (md5cksum.data)
++              kfree(md5cksum.data);
++      if (wire_cksum.data)
++              kfree(wire_cksum.data);
++      return ret;
++}
+--- linux-2.6.7/net/sunrpc/auth_gss/Makefile.lsec      2004-06-15 23:19:22.000000000 -0600
++++ linux-2.6.7/net/sunrpc/auth_gss/Makefile   2005-03-23 14:28:24.294356392 -0700
+@@ -10,5 +10,9 @@ auth_rpcgss-objs := auth_gss.o gss_gener
+ obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o
+ rpcsec_gss_krb5-objs := gss_krb5_mech.o gss_krb5_seal.o gss_krb5_unseal.o \
+-      gss_krb5_seqnum.o
++      gss_krb5_seqnum.o gss_krb5_wrap.o
++obj-$(CONFIG_RPCSEC_GSS_SPKM3) += rpcsec_gss_spkm3.o
++
++rpcsec_gss_spkm3-objs := gss_spkm3_mech.o gss_spkm3_seal.o gss_spkm3_unseal.o \
++      gss_spkm3_token.o
+--- linux-2.6.7/net/sunrpc/cache.c.lsec        2004-06-15 23:19:36.000000000 -0600
++++ linux-2.6.7/net/sunrpc/cache.c     2005-03-23 14:28:24.406339368 -0700
+@@ -38,7 +38,7 @@ void cache_init(struct cache_head *h)
+       time_t now = get_seconds();
+       h->next = NULL;
+       h->flags = 0;
+-      atomic_set(&h->refcnt, 0);
++      atomic_set(&h->refcnt, 1);
+       h->expiry_time = now + CACHE_NEW_EXPIRY;
+       h->last_refresh = now;
+ }
+--- linux-2.6.7/net/sunrpc/svc.c.lsec  2004-06-15 23:20:03.000000000 -0600
++++ linux-2.6.7/net/sunrpc/svc.c       2005-03-23 14:28:23.652453976 -0700
+@@ -263,6 +263,7 @@ svc_process(struct svc_serv *serv, struc
+       u32                     *statp;
+       u32                     dir, prog, vers, proc,
+                               auth_stat, rpc_stat;
++      int                     auth_res;
+       rpc_stat = rpc_success;
+@@ -304,12 +305,17 @@ svc_process(struct svc_serv *serv, struc
+       rqstp->rq_vers = vers = ntohl(svc_getu32(argv));        /* version number */
+       rqstp->rq_proc = proc = ntohl(svc_getu32(argv));        /* procedure number */
++      progp = serv->sv_program;
+       /*
+        * Decode auth data, and add verifier to reply buffer.
+        * We do this before anything else in order to get a decent
+        * auth verifier.
+        */
+-      switch (svc_authenticate(rqstp, &auth_stat)) {
++      if (progp->pg_authenticate != NULL)
++              auth_res = progp->pg_authenticate(rqstp, &auth_stat);
++      else
++              auth_res = svc_authenticate(rqstp, &auth_stat);
++      switch (auth_res) {
+       case SVC_OK:
+               break;
+       case SVC_GARBAGE:
+@@ -326,7 +332,6 @@ svc_process(struct svc_serv *serv, struc
+               goto sendit;
+       }
+               
+-      progp = serv->sv_program;
+       if (prog != progp->pg_prog)
+               goto err_bad_prog;
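The svc.c change above lets an RPC program supply its own authentication hook: svc_process() now prefers progp->pg_authenticate over the generic svc_authenticate(). The field itself is presumably added to struct svc_program elsewhere in this patch; a hedged sketch of a service using it (all names below are illustrative):

    /* Hypothetical per-program authenticator; like svc_authenticate() it
     * returns SVC_OK / SVC_DENIED / SVC_DROP and sets *authp on failure. */
    static int example_svc_authenticate(struct svc_rqst *rqstp, u32 *authp)
    {
            *authp = rpc_auth_ok;
            /* program-specific credential checks would go here */
            return SVC_OK;
    }

    static struct svc_program example_program = {
            .pg_prog         = 0x20000000,          /* illustrative program number */
            .pg_name         = "example",
            /* ... .pg_nvers, .pg_vers, .pg_stats as usual ... */
            .pg_authenticate = example_svc_authenticate,
    };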
+--- linux-2.6.7/net/sunrpc/svcauth.c.lsec      2004-06-15 23:19:44.000000000 -0600
++++ linux-2.6.7/net/sunrpc/svcauth.c   2005-03-23 14:28:24.407339216 -0700
+@@ -156,25 +156,47 @@ static inline int auth_domain_match(stru
+ {
+       return strcmp(tmp->name, item->name) == 0;
+ }
+-DefineCacheLookup(struct auth_domain,
+-                h,
+-                auth_domain_lookup,
+-                (struct auth_domain *item, int set),
+-                /* no setup */,
+-                &auth_domain_cache,
+-                auth_domain_hash(item),
+-                auth_domain_match(tmp, item),
+-                kfree(new); if(!set) {
+-                      if (new)
+-                              write_unlock(&auth_domain_cache.hash_lock);
+-                      else
+-                              read_unlock(&auth_domain_cache.hash_lock);
+-                      return NULL;
+-                }
+-                new=item; atomic_inc(&new->h.refcnt),
+-                /* no update */,
+-                0 /* no inplace updates */
+-                )
++
++struct auth_domain *
++auth_domain_lookup(struct auth_domain *item, int set)
++{
++      struct auth_domain *tmp = NULL;
++      struct cache_head **hp, **head;
++      head = &auth_domain_cache.hash_table[auth_domain_hash(item)];
++
++      if (set)
++              write_lock(&auth_domain_cache.hash_lock);
++      else
++              read_lock(&auth_domain_cache.hash_lock);
++      for (hp=head; *hp != NULL; hp = &tmp->h.next) {
++              tmp = container_of(*hp, struct auth_domain, h);
++              if (!auth_domain_match(tmp, item))
++                      continue;
++              cache_get(&tmp->h);
++              if (!set)
++                      goto out_noset;
++              *hp = tmp->h.next;
++              tmp->h.next = NULL;
++              clear_bit(CACHE_HASHED, &tmp->h.flags);
++              auth_domain_drop(&tmp->h, &auth_domain_cache);
++              goto out_set;
++      }
++      /* Didn't find anything */
++      if (!set)
++              goto out_noset;
++      auth_domain_cache.entries++;
++out_set:
++      set_bit(CACHE_HASHED, &item->h.flags);
++      item->h.next = *head;
++      *head = &item->h;
++      write_unlock(&auth_domain_cache.hash_lock);
++      cache_fresh(&auth_domain_cache, &item->h, item->h.expiry_time);
++      cache_get(&item->h);
++      return item;
++out_noset:
++      read_unlock(&auth_domain_cache.hash_lock);
++      return tmp;
++}
+ struct auth_domain *auth_domain_find(char *name)
+ {
diff --git a/lustre/kernel_patches/patches/vfs-dcache_locking-vanilla-2.6.patch b/lustre/kernel_patches/patches/vfs-dcache_locking-vanilla-2.6.patch
new file mode 100644 (file)
index 0000000..f99ff70
--- /dev/null
@@ -0,0 +1,85 @@
+Introduce __d_rehash() and __d_move(), lockless variants of d_rehash() and
+d_move() for callers that already hold dcache_lock.
+
+ fs/dcache.c            |   22 ++++++++++++++++++----
+ include/linux/dcache.h |    2 ++
+ 2 files changed, 20 insertions(+), 4 deletions(-)
+
+Index: linus-2.6.7-bk5/fs/dcache.c
+===================================================================
+--- linus-2.6.7-bk5.orig/fs/dcache.c   2004-06-24 10:39:11.232154728 +0300
++++ linus-2.6.7-bk5/fs/dcache.c        2004-06-24 10:56:01.043640048 +0300
+@@ -1115,16 +1115,23 @@
+  * Adds a dentry to the hash according to its name.
+  */
+  
+-void d_rehash(struct dentry * entry)
++void __d_rehash(struct dentry * entry)
+ {
+       struct hlist_head *list = d_hash(entry->d_parent, entry->d_name.hash);
+-      spin_lock(&dcache_lock);
+       spin_lock(&entry->d_lock);
+       entry->d_flags &= ~DCACHE_UNHASHED;
+       spin_unlock(&entry->d_lock);
+       entry->d_bucket = list;
+       hlist_add_head_rcu(&entry->d_hash, list);
++}
++
++EXPORT_SYMBOL(__d_rehash);
++ 
++void d_rehash(struct dentry * entry)
++{
++      spin_lock(&dcache_lock);
++      __d_rehash(entry);
+       spin_unlock(&dcache_lock);
+ }
+@@ -1200,12 +1207,11 @@
+  * dcache entries should not be moved in this way.
+  */
+-void d_move(struct dentry * dentry, struct dentry * target)
++void __d_move(struct dentry * dentry, struct dentry * target)
+ {
+       if (!dentry->d_inode)
+               printk(KERN_WARNING "VFS: moving negative dcache entry\n");
+-      spin_lock(&dcache_lock);
+       write_seqlock(&rename_lock);
+       /*
+        * XXXX: do we really need to take target->d_lock?
+@@ -1257,6 +1263,14 @@
+       spin_unlock(&target->d_lock);
+       spin_unlock(&dentry->d_lock);
+       write_sequnlock(&rename_lock);
++}
++
++EXPORT_SYMBOL(__d_move);
++
++void d_move(struct dentry *dentry, struct dentry *target)
++{
++      spin_lock(&dcache_lock);
++      __d_move(dentry, target);
+       spin_unlock(&dcache_lock);
+ }
+Index: linus-2.6.7-bk5/include/linux/dcache.h
+===================================================================
+--- linus-2.6.7-bk5.orig/include/linux/dcache.h        2004-06-24 10:39:29.534372368 +0300
++++ linus-2.6.7-bk5/include/linux/dcache.h     2004-06-24 10:53:10.319594048 +0300
+@@ -227,6 +227,7 @@
+  * This adds the entry to the hash queues.
+  */
+ extern void d_rehash(struct dentry *);
++extern void __d_rehash(struct dentry *);
+ /**
+  * d_add - add dentry to hash queues
+@@ -245,6 +246,7 @@
+ /* used for rename() and baskets */
+ extern void d_move(struct dentry *, struct dentry *);
++extern void __d_move(struct dentry *, struct dentry *);
+ /* appendix may either be NULL or be used for transname suffixes */
+ extern struct dentry * d_lookup(struct dentry *, struct qstr *);
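The split above keeps d_rehash()/d_move() behaviour unchanged while letting a caller that already holds dcache_lock batch several dcache updates under one acquisition; Lustre's llite code is the intended consumer. A hedged sketch of the calling pattern (the helper is illustrative only):

    /* Rehash one dentry and rename another under a single dcache_lock hold,
     * using the lockless variants exported above. */
    static void relink_dentries(struct dentry *de,
                                struct dentry *old, struct dentry *new)
    {
            spin_lock(&dcache_lock);
            __d_rehash(de);          /* assumes dcache_lock is held */
            __d_move(old, new);      /* likewise */
            spin_unlock(&dcache_lock);
    }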
diff --git a/lustre/kernel_patches/patches/vfs-dcache_lustre_invalid-vanilla-2.6.patch b/lustre/kernel_patches/patches/vfs-dcache_lustre_invalid-vanilla-2.6.patch
new file mode 100644 (file)
index 0000000..f83b663
--- /dev/null
@@ -0,0 +1,37 @@
+%diffstat
+ fs/dcache.c            |    7 +++++++
+ include/linux/dcache.h |    1 +
+ 2 files changed, 8 insertions(+)
+
+%patch
+Index: linux-2.6.6/fs/dcache.c
+===================================================================
+--- linux-2.6.6.orig/fs/dcache.c       2004-05-22 02:11:17.000000000 +0800
++++ linux-2.6.6/fs/dcache.c    2004-05-22 02:14:46.000000000 +0800
+@@ -217,6 +217,13 @@ int d_invalidate(struct dentry * dentry)
+               spin_unlock(&dcache_lock);
+               return 0;
+       }
++
++      /* network invalidation by Lustre */
++      if (dentry->d_flags & DCACHE_LUSTRE_INVALID) {
++              spin_unlock(&dcache_lock);
++              return 0;
++      }
++
+       /*
+        * Check whether to do a partial shrink_dcache
+        * to get rid of unused child entries.
+Index: linux-2.6.6/include/linux/dcache.h
+===================================================================
+--- linux-2.6.6.orig/include/linux/dcache.h    2004-05-22 02:10:01.000000000 +0800
++++ linux-2.6.6/include/linux/dcache.h 2004-05-22 02:15:17.000000000 +0800
+@@ -153,6 +153,7 @@ d_iput:            no              no              yes
+ #define DCACHE_REFERENCED     0x0008  /* Recently used, don't discard. */
+ #define DCACHE_UNHASHED               0x0010  
++#define DCACHE_LUSTRE_INVALID 0x0020  /* invalidated by Lustre */
+ extern spinlock_t dcache_lock;
+
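With this flag a dentry that the client has learned is stale (typically because the server cancelled the lock covering it) can be marked without being unhashed, and d_invalidate() then bails out early and reports success for it. A hedged sketch of the client side setting the flag (illustrative; the real logic lives in llite/dcache.c):

    /* Mark a dentry as invalidated by the distributed lock protocol. */
    static void example_mark_dentry_invalid(struct dentry *dentry)
    {
            spin_lock(&dcache_lock);
            spin_lock(&dentry->d_lock);
            dentry->d_flags |= DCACHE_LUSTRE_INVALID;
            spin_unlock(&dentry->d_lock);
            spin_unlock(&dcache_lock);
    }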
diff --git a/lustre/kernel_patches/patches/vfs-do_truncate.patch b/lustre/kernel_patches/patches/vfs-do_truncate.patch
new file mode 100644 (file)
index 0000000..1cfd57b
--- /dev/null
@@ -0,0 +1,87 @@
+Index: linux-2.6.6/fs/namei.c
+===================================================================
+--- linux-2.6.6.orig/fs/namei.c        2004-05-30 23:17:06.267030976 +0300
++++ linux-2.6.6/fs/namei.c     2004-05-30 23:23:15.642877312 +0300
+@@ -1270,7 +1270,7 @@
+               if (!error) {
+                       DQUOT_INIT(inode);
+                       
+-                      error = do_truncate(dentry, 0);
++                      error = do_truncate(dentry, 0, 1);
+               }
+               put_write_access(inode);
+               if (error)
+Index: linux-2.6.6/fs/open.c
+===================================================================
+--- linux-2.6.6.orig/fs/open.c 2004-05-30 20:05:26.857206992 +0300
++++ linux-2.6.6/fs/open.c      2004-05-30 23:24:38.908219056 +0300
+@@ -189,7 +189,7 @@
+       return error;
+ }
+-int do_truncate(struct dentry *dentry, loff_t length)
++int do_truncate(struct dentry *dentry, loff_t length, int called_from_open)
+ {
+       int err;
+       struct iattr newattrs;
+@@ -202,6 +202,8 @@
+       newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
+       down(&dentry->d_inode->i_sem);
+       down_write(&dentry->d_inode->i_alloc_sem);
++      if (called_from_open)
++              newattrs.ia_valid |= ATTR_FROM_OPEN;
+       err = notify_change(dentry, &newattrs);
+       up_write(&dentry->d_inode->i_alloc_sem);
+       up(&dentry->d_inode->i_sem);
+@@ -259,7 +261,7 @@
+       error = locks_verify_truncate(inode, NULL, length);
+       if (!error) {
+               DQUOT_INIT(inode);
+-              error = do_truncate(nd.dentry, length);
++              error = do_truncate(nd.dentry, length, 0);
+       }
+       put_write_access(inode);
+@@ -311,7 +313,7 @@
+       error = locks_verify_truncate(inode, file, length);
+       if (!error)
+-              error = do_truncate(dentry, length);
++              error = do_truncate(dentry, length, 0);
+ out_putf:
+       fput(file);
+ out:
+Index: linux-2.6.6/fs/exec.c
+===================================================================
+--- linux-2.6.6.orig/fs/exec.c 2004-05-30 20:05:26.862206232 +0300
++++ linux-2.6.6/fs/exec.c      2004-05-30 23:23:15.648876400 +0300
+@@ -1395,7 +1395,7 @@
+               goto close_fail;
+       if (!file->f_op->write)
+               goto close_fail;
+-      if (do_truncate(file->f_dentry, 0) != 0)
++      if (do_truncate(file->f_dentry, 0, 0) != 0)
+               goto close_fail;
+       retval = binfmt->core_dump(signr, regs, file);
+Index: linux-2.6.6/include/linux/fs.h
+===================================================================
+--- linux-2.6.6.orig/include/linux/fs.h        2004-05-30 23:20:11.979798344 +0300
++++ linux-2.6.6/include/linux/fs.h     2004-05-30 23:25:29.167578472 +0300
+@@ -249,6 +249,7 @@
+ #define ATTR_ATTR_FLAG        1024
+ #define ATTR_KILL_SUID        2048
+ #define ATTR_KILL_SGID        4096
++#define ATTR_FROM_OPEN        16384   /* called from open path, ie O_TRUNC */
+ /*
+  * This is the Inode Attributes structure, used for notify_change().  It
+@@ -1189,7 +1190,7 @@
+ /* fs/open.c */
+-extern int do_truncate(struct dentry *, loff_t start);
++extern int do_truncate(struct dentry *, loff_t start, int called_from_open);
+ extern struct file *filp_open(const char *, int, int);
+ extern struct file * dentry_open(struct dentry *, struct vfsmount *, int);
+ extern struct file * dentry_open_it(struct dentry *, struct vfsmount *, int, struct lookup_intent *);
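The extra argument lets a filesystem tell an O_TRUNC-on-open apart from an explicit truncate(): when do_truncate() is reached from open_namei() the attribute change is tagged with ATTR_FROM_OPEN, and the setattr method can branch on that. A hedged sketch of a consumer (illustrative, not Lustre's actual handler):

    /* Hypothetical setattr method showing how the new flag is consumed. */
    static int example_setattr(struct dentry *dentry, struct iattr *attr)
    {
            if ((attr->ia_valid & ATTR_SIZE) &&
                (attr->ia_valid & ATTR_FROM_OPEN)) {
                    /* size change came from open(O_TRUNC); the open path may
                     * already have done part of the work, so handle it here */
                    attr->ia_valid &= ~ATTR_FROM_OPEN;
            }
            return inode_setattr(dentry->d_inode, attr);
    }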
diff --git a/lustre/kernel_patches/patches/vfs-gns_export_doumount.patch b/lustre/kernel_patches/patches/vfs-gns_export_doumount.patch
new file mode 100644 (file)
index 0000000..36ae7b4
--- /dev/null
@@ -0,0 +1,34 @@
+Index: linux-2.6.7/fs/namespace.c
+===================================================================
+--- linux-2.6.7.orig/fs/namespace.c    2004-11-21 00:25:13.000000000 +0200
++++ linux-2.6.7/fs/namespace.c 2004-11-21 00:25:15.000000000 +0200
+@@ -360,7 +360,7 @@
+       }
+ }
+-static int do_umount(struct vfsmount *mnt, int flags)
++int do_umount(struct vfsmount *mnt, int flags)
+ {
+       struct super_block * sb = mnt->mnt_sb;
+       int retval;
+@@ -434,6 +434,8 @@
+       return retval;
+ }
++EXPORT_SYMBOL(do_umount);
++
+ /*
+  * Now umount can handle mount points as well as block devices.
+  * This is important for filesystems which use unnamed block devices.
+Index: linux-2.6.7/include/linux/mount.h
+===================================================================
+--- linux-2.6.7.orig/include/linux/mount.h     2004-11-21 00:25:13.000000000 +0200
++++ linux-2.6.7/include/linux/mount.h  2005-01-11 15:28:26.627030408 +0200
+@@ -56,6 +56,7 @@
+ extern struct vfsmount *alloc_vfsmnt(const char *name);
+ extern struct vfsmount *do_kern_mount(const char *fstype, int flags,
+                                     const char *name, void *data);
++extern int do_umount(struct vfsmount *mnt, int flags);
+ extern spinlock_t vfsmount_lock;
+ #endif
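Exporting do_umount() lets a module unmount a vfsmount it set up itself; in this tree the consumer is Lustre's GNS code (llite_gns.c), which mounts filesystems on the client's behalf and later needs to take them down again. A minimal hedged sketch of such a caller (illustrative):

    /* Drop a mount the module created earlier.  Flag values follow umount2():
     * 0 for a normal unmount, MNT_DETACH for a lazy one. */
    static int example_umount_point(struct vfsmount *mnt)
    {
            int rc;

            rc = do_umount(mnt, 0);                  /* may return -EBUSY */
            if (rc == -EBUSY)
                    rc = do_umount(mnt, MNT_DETACH); /* fall back to lazy detach */
            return rc;
    }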
diff --git a/lustre/kernel_patches/patches/vfs-intent_api-vanilla-2.6.patch b/lustre/kernel_patches/patches/vfs-intent_api-vanilla-2.6.patch
new file mode 100644 (file)
index 0000000..49c2938
--- /dev/null
@@ -0,0 +1,555 @@
+Index: linus-2.6.7-bk-latest/include/linux/namei.h
+===================================================================
+--- linus-2.6.7-bk-latest.orig/include/linux/namei.h   2004-07-07 10:56:34.232378296 +0300
++++ linus-2.6.7-bk-latest/include/linux/namei.h        2004-07-07 11:41:48.569736296 +0300
+@@ -2,13 +2,40 @@
+ #define _LINUX_NAMEI_H
+ #include <linux/linkage.h>
++#include <linux/string.h>
+ struct vfsmount;
++/* intent opcodes */
++#define IT_OPEN               (1)
++#define IT_CREAT      (1<<1)
++#define IT_READDIR    (1<<2)
++#define IT_GETATTR    (1<<3)
++#define IT_LOOKUP     (1<<4)
++#define IT_UNLINK     (1<<5)
++#define IT_TRUNC      (1<<6)
++#define IT_GETXATTR   (1<<7)
++
++#define INTENT_MAGIC 0x19620323
++
+ struct open_intent {
++      int     magic;
++      int     op;
++      void    (*op_release)(struct open_intent *);
+       int     flags;
+       int     create_mode;
++      union {
++              void *fs_data; /* FS-specific intent data */
++      } d;
+ };
++static inline void intent_init(struct open_intent *it, int op)
++{
++      memset(it, 0, sizeof(*it));
++      it->magic = INTENT_MAGIC;
++      it->op = op;
++}
++
++ 
+ struct nameidata {
+       struct dentry   *dentry;
+@@ -53,14 +76,22 @@
+ #define LOOKUP_ACCESS         (0x0400)
+ extern int FASTCALL(__user_walk(const char __user *, unsigned, struct nameidata *));
++extern int FASTCALL(__user_walk_it(const char __user *, unsigned, struct nameidata *));
+ #define user_path_walk(name,nd) \
+       __user_walk(name, LOOKUP_FOLLOW, nd)
++#define user_path_walk_it(name,nd) \
++      __user_walk_it(name, LOOKUP_FOLLOW, nd)
+ #define user_path_walk_link(name,nd) \
+       __user_walk(name, 0, nd)
++#define user_path_walk_link_it(name,nd) \
++      __user_walk_it(name, 0, nd)
+ extern int FASTCALL(path_lookup(const char *, unsigned, struct nameidata *));
++extern int FASTCALL(path_lookup_it(const char *, unsigned, struct nameidata *));
+ extern int FASTCALL(path_walk(const char *, struct nameidata *));
++extern int FASTCALL(path_walk_it(const char *, struct nameidata *));
+ extern int FASTCALL(link_path_walk(const char *, struct nameidata *));
+ extern void path_release(struct nameidata *);
++extern void intent_release(struct open_intent *);
+ extern struct dentry * lookup_one_len(const char *, struct dentry *, int);
+ extern struct dentry * lookup_hash(struct qstr *, struct dentry *);
+Index: linus-2.6.7-bk-latest/include/linux/fs.h
+===================================================================
+--- linus-2.6.7-bk-latest.orig/include/linux/fs.h      2004-07-07 10:56:33.720456120 +0300
++++ linus-2.6.7-bk-latest/include/linux/fs.h   2004-07-07 11:38:42.864967712 +0300
+@@ -583,6 +583,7 @@
+       spinlock_t              f_ep_lock;
+ #endif /* #ifdef CONFIG_EPOLL */
+       struct address_space    *f_mapping;
++      struct open_intent      *f_it;
+ };
+ extern spinlock_t files_lock;
+ #define file_list_lock() spin_lock(&files_lock);
+@@ -1201,6 +1202,7 @@
+ extern int do_truncate(struct dentry *, loff_t start);
+ extern struct file *filp_open(const char *, int, int);
+ extern struct file * dentry_open(struct dentry *, struct vfsmount *, int);
++extern struct file * dentry_open_it(struct dentry *, struct vfsmount *, int, struct open_intent *);
+ extern int filp_close(struct file *, fl_owner_t id);
+ extern char * getname(const char __user *);
+Index: linus-2.6.7-bk-latest/fs/namei.c
+===================================================================
+--- linus-2.6.7-bk-latest.orig/fs/namei.c      2004-07-07 10:56:13.455536856 +0300
++++ linus-2.6.7-bk-latest/fs/namei.c   2004-07-07 11:38:42.866967408 +0300
+@@ -272,8 +272,19 @@
+       return 0;
+ }
++void intent_release(struct open_intent *it)
++{
++      if (!it)
++              return;
++      if (it->magic != INTENT_MAGIC)
++              return;
++      if (it->op_release)
++              it->op_release(it);
++}
++
+ void path_release(struct nameidata *nd)
+ {
++      intent_release(&nd->intent.open);
+       dput(nd->dentry);
+       mntput(nd->mnt);
+ }
+@@ -790,8 +801,14 @@
+       return err;
+ }
++int fastcall path_walk_it(const char * name, struct nameidata *nd)
++{
++      current->total_link_count = 0;
++      return link_path_walk(name, nd);
++}
+ int fastcall path_walk(const char * name, struct nameidata *nd)
+ {
++      intent_init(&nd->intent.open, IT_LOOKUP);
+       current->total_link_count = 0;
+       return link_path_walk(name, nd);
+ }
+@@ -800,7 +817,7 @@
+ /* returns 1 if everything is done */
+ static int __emul_lookup_dentry(const char *name, struct nameidata *nd)
+ {
+-      if (path_walk(name, nd))
++      if (path_walk_it(name, nd))
+               return 0;               /* something went wrong... */
+       if (!nd->dentry->d_inode || S_ISDIR(nd->dentry->d_inode->i_mode)) {
+@@ -878,7 +895,18 @@
+       return 1;
+ }
+-int fastcall path_lookup(const char *name, unsigned int flags, struct nameidata *nd)
++static inline int it_mode_from_lookup_flags(int flags)
++{
++      int mode = IT_LOOKUP;
++
++      if (flags & LOOKUP_OPEN)
++              mode = IT_OPEN;
++      if (flags & LOOKUP_CREATE)
++              mode |= IT_CREAT;
++      return mode;
++}
++
++int fastcall path_lookup_it(const char *name, unsigned int flags, struct nameidata *nd)
+ {
+       int retval;
+@@ -914,6 +942,12 @@
+       return retval;
+ }
++int fastcall path_lookup(const char *name, unsigned int flags, struct nameidata *nd)
++{
++      intent_init(&nd->intent.open, it_mode_from_lookup_flags(flags));
++      return path_lookup_it(name, flags, nd);
++}
++
+ /*
+  * Restricted form of lookup. Doesn't follow links, single-component only,
+  * needs parent already locked. Doesn't follow mounts.
+@@ -964,7 +998,7 @@
+ }
+ /* SMP-safe */
+-struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
++struct dentry * lookup_one_len_it(const char * name, struct dentry * base, int len, struct nameidata *nd)
+ {
+       unsigned long hash;
+       struct qstr this;
+@@ -984,11 +1018,16 @@
+       }
+       this.hash = end_name_hash(hash);
+-      return lookup_hash(&this, base);
++      return __lookup_hash(&this, base, nd);
+ access:
+       return ERR_PTR(-EACCES);
+ }
++struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
++{
++      return lookup_one_len_it(name, base, len, NULL);
++}
++
+ /*
+  *    namei()
+  *
+@@ -1000,18 +1039,24 @@
+  * that namei follows links, while lnamei does not.
+  * SMP-safe
+  */
+-int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd)
++int fastcall __user_walk_it(const char __user *name, unsigned flags, struct nameidata *nd)
+ {
+       char *tmp = getname(name);
+       int err = PTR_ERR(tmp);
+       if (!IS_ERR(tmp)) {
+-              err = path_lookup(tmp, flags, nd);
++              err = path_lookup_it(tmp, flags, nd);
+               putname(tmp);
+       }
+       return err;
+ }
++int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd)
++{
++      intent_init(&nd->intent.open, it_mode_from_lookup_flags(flags));
++      return __user_walk_it(name, flags, nd);
++}
++
+ /*
+  * It's inline, so penalty for filesystems that don't use sticky bit is
+  * minimal.
+@@ -1296,7 +1341,7 @@
+        * The simplest case - just a plain lookup.
+        */
+       if (!(flag & O_CREAT)) {
+-              error = path_lookup(pathname, lookup_flags(flag)|LOOKUP_OPEN, nd);
++              error = path_lookup_it(pathname, lookup_flags(flag), nd);
+               if (error)
+                       return error;
+               goto ok;
+@@ -1305,7 +1350,8 @@
+       /*
+        * Create - we need to know the parent.
+        */
+-      error = path_lookup(pathname, LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE, nd);
++      nd->intent.open.op |= IT_CREAT;
++      error = path_lookup_it(pathname, LOOKUP_PARENT, nd);
+       if (error)
+               return error;
+@@ -2214,6 +2260,7 @@
+ static int __vfs_follow_link(struct nameidata *nd, const char *link)
+ {
+       int res = 0;
++      struct open_intent it = nd->intent.open;
+       char *name;
+       if (IS_ERR(link))
+               goto fail;
+@@ -2224,6 +2271,10 @@
+                       /* weird __emul_prefix() stuff did it */
+                       goto out;
+       }
++      intent_release(&nd->intent.open);
++      intent_init(&nd->intent.open, it.op);
++      nd->intent.open.flags = it.flags;
++      nd->intent.open.create_mode = it.create_mode;
+       res = link_path_walk(link, nd);
+ out:
+       if (nd->depth || res || nd->last_type!=LAST_NORM)
+@@ -2322,6 +2372,7 @@
+       return res;
+ }
++
+ int page_symlink(struct inode *inode, const char *symname, int len)
+ {
+       struct address_space *mapping = inode->i_mapping;
+@@ -2385,8 +2436,10 @@
+ EXPORT_SYMBOL(page_symlink);
+ EXPORT_SYMBOL(page_symlink_inode_operations);
+ EXPORT_SYMBOL(path_lookup);
++EXPORT_SYMBOL(path_lookup_it);
+ EXPORT_SYMBOL(path_release);
+ EXPORT_SYMBOL(path_walk);
++EXPORT_SYMBOL(path_walk_it);
+ EXPORT_SYMBOL(permission);
+ EXPORT_SYMBOL(unlock_rename);
+ EXPORT_SYMBOL(vfs_create);
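The pattern the remaining hunks apply to fs/open.c and fs/stat.c is always the same: initialize the intent with the operation the caller has in mind, walk the path with the *_it() variant, and rely on path_release() (which now calls intent_release()) to clean up. A condensed sketch of that calling convention (error handling trimmed; mirrors the vfs_stat() hunk below):

    /* Typical caller of the intent-aware lookup API added above. */
    static int example_getattr_path(const char *name, struct kstat *stat)
    {
            struct nameidata nd;
            int error;

            intent_init(&nd.intent.open, IT_GETATTR);   /* declare the intent */
            error = path_lookup_it(name, LOOKUP_FOLLOW, &nd);
            if (error)
                    return error;

            error = vfs_getattr(nd.mnt, nd.dentry, stat);
            path_release(&nd);    /* drops dentry/mnt and releases the intent */
            return error;
    }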
+Index: linus-2.6.7-bk-latest/fs/open.c
+===================================================================
+--- linus-2.6.7-bk-latest.orig/fs/open.c       2004-07-07 10:56:13.610513296 +0300
++++ linus-2.6.7-bk-latest/fs/open.c    2004-07-07 11:38:42.867967256 +0300
+@@ -216,11 +216,12 @@
+       struct inode * inode;
+       int error;
++      intent_init(&nd.intent.open, IT_GETATTR);
+       error = -EINVAL;
+       if (length < 0) /* sorry, but loff_t says... */
+               goto out;
+-      error = user_path_walk(path, &nd);
++      error = user_path_walk_it(path, &nd);
+       if (error)
+               goto out;
+       inode = nd.dentry->d_inode;
+@@ -475,6 +476,7 @@
+       kernel_cap_t old_cap;
+       int res;
++      intent_init(&nd.intent.open, IT_GETATTR);
+       if (mode & ~S_IRWXO)    /* where's F_OK, X_OK, W_OK, R_OK? */
+               return -EINVAL;
+@@ -498,7 +500,7 @@
+       else
+               current->cap_effective = current->cap_permitted;
+-      res = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd);
++      res = __user_walk_it(filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd);
+       if (!res) {
+               res = permission(nd.dentry->d_inode, mode, &nd);
+               /* SuS v2 requires we report a read only fs too */
+@@ -520,7 +522,8 @@
+       struct nameidata nd;
+       int error;
+-      error = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd);
++      intent_init(&nd.intent.open, IT_GETATTR);
++      error = __user_walk_it(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd);
+       if (error)
+               goto out;
+@@ -571,7 +574,8 @@
+       struct nameidata nd;
+       int error;
+-      error = __user_walk(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd);
++      intent_init(&nd.intent.open, IT_GETATTR);
++      error = __user_walk_it(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd);
+       if (error)
+               goto out;
+@@ -754,6 +758,7 @@
+ {
+       int namei_flags, error;
+       struct nameidata nd;
++      intent_init(&nd.intent.open, IT_OPEN);
+       namei_flags = flags;
+       if ((namei_flags+1) & O_ACCMODE)
+@@ -763,14 +768,14 @@
+       error = open_namei(filename, namei_flags, mode, &nd);
+       if (!error)
+-              return dentry_open(nd.dentry, nd.mnt, flags);
++              return dentry_open_it(nd.dentry, nd.mnt, flags, &nd.intent.open);
+       return ERR_PTR(error);
+ }
+ EXPORT_SYMBOL(filp_open);
+-struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
++struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, int flags, struct open_intent *it)
+ {
+       struct file * f;
+       struct inode *inode;
+@@ -782,6 +787,7 @@
+               goto cleanup_dentry;
+       f->f_flags = flags;
+       f->f_mode = (flags+1) & O_ACCMODE;
++      f->f_it = it;
+       inode = dentry->d_inode;
+       if (f->f_mode & FMODE_WRITE) {
+               error = get_write_access(inode);
+@@ -800,6 +806,7 @@
+               error = f->f_op->open(inode,f);
+               if (error)
+                       goto cleanup_all;
++              intent_release(it);
+       }
+       f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
+@@ -825,11 +832,20 @@
+ cleanup_file:
+       put_filp(f);
+ cleanup_dentry:
++      intent_release(it);
+       dput(dentry);
+       mntput(mnt);
+       return ERR_PTR(error);
+ }
++struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
++{
++      struct open_intent it;
++      intent_init(&it, IT_LOOKUP);
++
++      return dentry_open_it(dentry, mnt, flags, &it);
++}
++
+ EXPORT_SYMBOL(dentry_open);
+ /*
+Index: linus-2.6.7-bk-latest/fs/stat.c
+===================================================================
+--- linus-2.6.7-bk-latest.orig/fs/stat.c       2004-07-07 10:56:13.635509496 +0300
++++ linus-2.6.7-bk-latest/fs/stat.c    2004-07-07 11:38:42.868967104 +0300
+@@ -59,15 +59,15 @@
+       }
+       return 0;
+ }
+-
+ EXPORT_SYMBOL(vfs_getattr);
+ int vfs_stat(char __user *name, struct kstat *stat)
+ {
+       struct nameidata nd;
+       int error;
++      intent_init(&nd.intent.open, IT_GETATTR);
+-      error = user_path_walk(name, &nd);
++      error = user_path_walk_it(name, &nd);
+       if (!error) {
+               error = vfs_getattr(nd.mnt, nd.dentry, stat);
+               path_release(&nd);
+@@ -81,8 +81,9 @@
+ {
+       struct nameidata nd;
+       int error;
++      intent_init(&nd.intent.open, IT_GETATTR);
+-      error = user_path_walk_link(name, &nd);
++      error = user_path_walk_link_it(name, &nd);
+       if (!error) {
+               error = vfs_getattr(nd.mnt, nd.dentry, stat);
+               path_release(&nd);
+@@ -96,9 +97,12 @@
+ {
+       struct file *f = fget(fd);
+       int error = -EBADF;
++      struct nameidata nd;
++      intent_init(&nd.intent.open, IT_GETATTR);
+       if (f) {
+               error = vfs_getattr(f->f_vfsmnt, f->f_dentry, stat);
++              intent_release(&nd.intent.open);
+               fput(f);
+       }
+       return error;
+Index: linus-2.6.7-bk-latest/fs/namespace.c
+===================================================================
+--- linus-2.6.7-bk-latest.orig/fs/namespace.c  2004-07-07 10:56:13.605514056 +0300
++++ linus-2.6.7-bk-latest/fs/namespace.c       2004-07-07 11:38:42.868967104 +0300
+@@ -117,6 +117,7 @@
+ static void detach_mnt(struct vfsmount *mnt, struct nameidata *old_nd)
+ {
++      memset(old_nd, 0, sizeof(*old_nd));
+       old_nd->dentry = mnt->mnt_mountpoint;
+       old_nd->mnt = mnt->mnt_parent;
+       mnt->mnt_parent = mnt;
+Index: linus-2.6.7-bk-latest/fs/exec.c
+===================================================================
+--- linus-2.6.7-bk-latest.orig/fs/exec.c       2004-07-07 10:56:13.395545976 +0300
++++ linus-2.6.7-bk-latest/fs/exec.c    2004-07-07 11:38:42.869966952 +0300
+@@ -121,8 +121,9 @@
+       struct nameidata nd;
+       int error;
++      intent_init(&nd.intent.open, IT_OPEN);
+       nd.intent.open.flags = FMODE_READ;
+-      error = __user_walk(library, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd);
++      error = user_path_walk_it(library, &nd);
+       if (error)
+               goto out;
+@@ -134,7 +135,7 @@
+       if (error)
+               goto exit;
+-      file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
++      file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &nd.intent.open);
+       error = PTR_ERR(file);
+       if (IS_ERR(file))
+               goto out;
+@@ -474,8 +475,9 @@
+       int err;
+       struct file *file;
++      intent_init(&nd.intent.open, IT_OPEN);
+       nd.intent.open.flags = FMODE_READ;
+-      err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd);
++      err = path_lookup_it(name, LOOKUP_FOLLOW, &nd);
+       file = ERR_PTR(err);
+       if (!err) {
+@@ -488,7 +490,7 @@
+                               err = -EACCES;
+                       file = ERR_PTR(err);
+                       if (!err) {
+-                              file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
++                              file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &nd.intent.open);
+                               if (!IS_ERR(file)) {
+                                       err = deny_write_access(file);
+                                       if (err) {
+Index: linus-2.6.7-bk-latest/fs/xattr.c
+===================================================================
+--- linus-2.6.7-bk-latest.orig/fs/xattr.c      2004-07-07 10:56:13.643508280 +0300
++++ linus-2.6.7-bk-latest/fs/xattr.c   2004-07-07 11:38:42.870966800 +0300
+@@ -161,7 +161,8 @@
+       struct nameidata nd;
+       ssize_t error;
+-      error = user_path_walk(path, &nd);
++      intent_init(&nd.intent.open, IT_GETXATTR);
++      error = user_path_walk_it(path, &nd);
+       if (error)
+               return error;
+       error = getxattr(nd.dentry, name, value, size);
+@@ -176,7 +177,8 @@
+       struct nameidata nd;
+       ssize_t error;
+-      error = user_path_walk_link(path, &nd);
++      intent_init(&nd.intent.open, IT_GETXATTR);
++      error = user_path_walk_link_it(path, &nd);
+       if (error)
+               return error;
+       error = getxattr(nd.dentry, name, value, size);
+@@ -242,7 +244,8 @@
+       struct nameidata nd;
+       ssize_t error;
+-      error = user_path_walk(path, &nd);
++      intent_init(&nd.intent.open, IT_GETXATTR);
++      error = user_path_walk_it(path, &nd);
+       if (error)
+               return error;
+       error = listxattr(nd.dentry, list, size);
+@@ -256,7 +259,8 @@
+       struct nameidata nd;
+       ssize_t error;
+-      error = user_path_walk_link(path, &nd);
++      intent_init(&nd.intent.open, IT_GETXATTR);
++      error = user_path_walk_link_it(path, &nd);
+       if (error)
+               return error;
+       error = listxattr(nd.dentry, list, size);
+
+--- linux-2.6.7.orig/include/linux/mount.h     2004-06-16 13:18:57.000000000 +0800
++++ linux-2.6.7/include/linux/mount.h  2004-09-06 21:05:29.000000000 +0800
+@@ -31,6 +31,8 @@
+       int mnt_flags;
+       char *mnt_devname;              /* Name of device e.g. /dev/dsk/hda1 */
+       struct list_head mnt_list;
++        struct list_head mnt_lustre_list; /* GNS mount list */
++        unsigned long mnt_last_used;      /* for GNS auto-umount (jiffies) */
+ };
+ static inline struct vfsmount *mntget(struct vfsmount *mnt)
diff --git a/lustre/kernel_patches/patches/vfs-lookup_last-vanilla-2.6.patch b/lustre/kernel_patches/patches/vfs-lookup_last-vanilla-2.6.patch
new file mode 100644 (file)
index 0000000..76ccd7b
--- /dev/null
@@ -0,0 +1,77 @@
+Index: linus-2.6.7/fs/namei.c
+===================================================================
+--- linus-2.6.7.orig/fs/namei.c        2005-03-05 20:24:52.000000000 +0200
++++ linus-2.6.7/fs/namei.c     2005-03-28 17:11:20.486991680 +0300
+@@ -676,8 +676,11 @@
+                       goto out_dput;
+               if (inode->i_op->follow_link) {
++                      int saved_flags = nd->flags;
+                       mntget(next.mnt);
++                      nd->flags |= LOOKUP_LINK_NOTLAST;
+                       err = do_follow_link(next.dentry, nd);
++                      nd->flags = saved_flags;
+                       dput(next.dentry);
+                       mntput(next.mnt);
+                       if (err)
+@@ -723,7 +726,9 @@
+                       if (err < 0)
+                               break;
+               }
++              nd->flags |= LOOKUP_LAST;
+               err = do_lookup(nd, &this, &next);
++              nd->flags &= ~LOOKUP_LAST;
+               if (err)
+                       break;
+               follow_mount(&next.mnt, &next.dentry);
+@@ -769,10 +774,14 @@
+                */
+               if (nd->dentry && nd->dentry->d_sb &&
+                   (nd->dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
+-                      err = -ESTALE;
++                      nd->flags |= LOOKUP_LAST;
++                      err = !nd->dentry->d_op->d_revalidate(nd->dentry, nd);
++                        nd->flags &= ~LOOKUP_LAST;
+                       /* Note: we do not d_invalidate() */
+-                      if (!nd->dentry->d_op->d_revalidate(nd->dentry, nd))
++                      if (err) {
++                              err = -ESTALE;
+                               break;
++                      }
+               }
+ return_base:
+               return 0;
+@@ -1344,7 +1353,9 @@
+       dir = nd->dentry;
+       nd->flags &= ~LOOKUP_PARENT;
+       down(&dir->d_inode->i_sem);
++      nd->flags |= LOOKUP_LAST;
+       dentry = __lookup_hash(&nd->last, nd->dentry, nd);
++      nd->flags &= ~LOOKUP_LAST;
+ do_last:
+       error = PTR_ERR(dentry);
+@@ -1449,7 +1460,9 @@
+       }
+       dir = nd->dentry;
+       down(&dir->d_inode->i_sem);
++      nd->flags |= LOOKUP_LAST;
+       dentry = __lookup_hash(&nd->last, nd->dentry, nd);
++      nd->flags &= ~LOOKUP_LAST;
+       putname(nd->last.name);
+       goto do_last;
+ }
+Index: linus-2.6.7/include/linux/namei.h
+===================================================================
+--- linus-2.6.7.orig/include/linux/namei.h     2005-03-05 20:24:52.000000000 +0200
++++ linus-2.6.7/include/linux/namei.h  2005-03-05 20:24:52.000000000 +0200
+@@ -68,6 +68,9 @@
+ #define LOOKUP_CONTINUE                4
+ #define LOOKUP_PARENT         16
+ #define LOOKUP_NOALT          32
++#define LOOKUP_LAST           64
++#define LOOKUP_LINK_NOTLAST   128
++
+ /*
+  * Intent data
+  */
index 9f95068..0621750 100644 (file)
@@ -46,14 +46,12 @@ Index: linux-2.6.7/fs/namei.c
  /* In order to reduce some races, while at the same time doing additional
   * checking and hopefully speeding things up, we copy filenames to the
   * kernel data space before using them..
-@@ -362,10 +394,11 @@
+@@ -362,8 +394,9 @@
+ {
        struct dentry * result;
        struct inode *dir = parent->d_inode;
-       int counter = 0;
 +      void *lock;
  
- again:
-       counter++;
 -      down(&dir->i_sem);
 +      lock = lock_dir(dir, name);
        /*
@@ -149,10 +147,10 @@ Index: linux-2.6.7/fs/namei.c
  out2:
                path_release(&nd);
  out:
-@@ -1765,14 +1798,14 @@
-                       goto exit1;
-       }
-  
+@@ -1735,14 +1735,14 @@
+                       error = -EBUSY;
+                       goto exit1;
+       }
 -      down(&nd.dentry->d_inode->i_sem);
 +      nd.lock = lock_dir(nd.dentry->d_inode, &nd.last);
        dentry = lookup_hash(&nd.last, nd.dentry);
@@ -166,10 +164,10 @@ Index: linux-2.6.7/fs/namei.c
  exit1:
        path_release(&nd);
  exit:
-@@ -1842,7 +1875,7 @@
-               if (error != -EOPNOTSUPP)
-                       goto exit1;
-       }
+@@ -1808,7 +1808,7 @@
+       error = -EISDIR;
+       if (nd.last_type != LAST_NORM)
+               goto exit1;
 -      down(&nd.dentry->d_inode->i_sem);
 +      nd.lock = lock_dir(nd.dentry->d_inode, &nd.last);
        dentry = lookup_hash(&nd.last, nd.dentry);
@@ -257,8 +255,8 @@ Index: linux-2.6.7/include/linux/namei.h
 @@ -52,6 +52,7 @@
        unsigned int    flags;
        int             last_type;
-       struct lookup_intent intent;
 +      void *lock;
- };
  
- /*
+       /* Intent data */
+       union {
+               struct open_intent open;
diff --git a/lustre/kernel_patches/patches/vfs-raw_ops-vanilla-2.6.patch b/lustre/kernel_patches/patches/vfs-raw_ops-vanilla-2.6.patch
new file mode 100644 (file)
index 0000000..21d4e12
--- /dev/null
@@ -0,0 +1,235 @@
+Index: linus-2.6.7/fs/namei.c
+===================================================================
+--- linus-2.6.7.orig/fs/namei.c        2005-03-05 20:24:52.000000000 +0200
++++ linus-2.6.7/fs/namei.c     2005-03-23 13:37:48.563339840 +0200
+@@ -758,14 +758,20 @@
+ lookup_parent:
+               nd->last = this;
+               nd->last_type = LAST_NORM;
+-              if (this.name[0] != '.')
+-                      goto return_base;
+-              if (this.len == 1)
+-                      nd->last_type = LAST_DOT;
+-              else if (this.len == 2 && this.name[1] == '.')
+-                      nd->last_type = LAST_DOTDOT;
+-              else
+-                      goto return_base;
++              if (this.name[0] == '.') {
++                      if (this.len == 1)
++                              nd->last_type = LAST_DOT;
++                      else if (this.len == 2 && this.name[1] == '.')
++                              nd->last_type = LAST_DOTDOT;
++              }
++
++              if ((nd->last_type == LAST_NORM) && inode->i_op &&
++                  inode->i_op->endparentlookup) {
++                      err = inode->i_op->endparentlookup(nd);
++                      if (err)
++                              break;
++              }
++              goto return_base;
+ return_reval:
+               /*
+                * We bypassed the ordinary revalidation routines.
+@@ -1535,9 +1541,16 @@
+       if (IS_ERR(tmp))
+               return PTR_ERR(tmp);
+-      error = path_lookup(tmp, LOOKUP_PARENT, &nd);
++      intent_init(&nd.intent.open, IT_MKNOD);
++      nd.intent.open.create_mode = mode;
++      nd.intent.open.create.dev = dev;
++
++      error = path_lookup_it(tmp, LOOKUP_PARENT, &nd);
+       if (error)
+               goto out;
++      if (nd.intent.open.flags & IT_STATUS_RAW)
++              goto out2;
++
+       dentry = lookup_create(&nd, 0);
+       error = PTR_ERR(dentry);
+@@ -1564,6 +1577,7 @@
+               dput(dentry);
+       }
+       up(&nd.dentry->d_inode->i_sem);
++out2:
+       path_release(&nd);
+ out:
+       putname(tmp);
+@@ -1606,9 +1620,13 @@
+               struct dentry *dentry;
+               struct nameidata nd;
+-              error = path_lookup(tmp, LOOKUP_PARENT, &nd);
++              intent_init(&nd.intent.open, IT_MKDIR);
++              nd.intent.open.create_mode = mode;
++              error = path_lookup_it(tmp, LOOKUP_PARENT, &nd);
+               if (error)
+                       goto out;
++              if (nd.intent.open.flags & IT_STATUS_RAW)
++                      goto out2;
+               dentry = lookup_create(&nd, 1);
+               error = PTR_ERR(dentry);
+               if (!IS_ERR(dentry)) {
+@@ -1618,6 +1636,7 @@
+                       dput(dentry);
+               }
+               up(&nd.dentry->d_inode->i_sem);
++out2:
+               path_release(&nd);
+ out:
+               putname(tmp);
+@@ -1703,9 +1722,12 @@
+       if(IS_ERR(name))
+               return PTR_ERR(name);
+-      error = path_lookup(name, LOOKUP_PARENT, &nd);
++      intent_init(&nd.intent.open, IT_RMDIR);
++      error = path_lookup_it(name, LOOKUP_PARENT, &nd);
+       if (error)
+               goto exit;
++      if (nd.intent.open.flags & IT_STATUS_RAW)
++              goto exit1;
+       switch(nd.last_type) {
+               case LAST_DOTDOT:
+@@ -1781,9 +1803,13 @@
+       if(IS_ERR(name))
+               return PTR_ERR(name);
+-      error = path_lookup(name, LOOKUP_PARENT, &nd);
++      intent_init(&nd.intent.open, IT_UNLINK);
++      error = path_lookup_it(name, LOOKUP_PARENT, &nd);
+       if (error)
+               goto exit;
++      if (nd.intent.open.flags & IT_STATUS_RAW)
++              goto exit1;
++
+       error = -EISDIR;
+       if (nd.last_type != LAST_NORM)
+               goto exit1;
+@@ -1855,9 +1881,13 @@
+               struct dentry *dentry;
+               struct nameidata nd;
+-              error = path_lookup(to, LOOKUP_PARENT, &nd);
++              intent_init(&nd.intent.open, IT_SYMLINK);
++              nd.intent.open.create.link = from;
++              error = path_lookup_it(to, LOOKUP_PARENT, &nd);
+               if (error)
+                       goto out;
++              if (nd.intent.open.flags & IT_STATUS_RAW)
++                      goto out2;
+               dentry = lookup_create(&nd, 0);
+               error = PTR_ERR(dentry);
+               if (!IS_ERR(dentry)) {
+@@ -1865,6 +1895,7 @@
+                       dput(dentry);
+               }
+               up(&nd.dentry->d_inode->i_sem);
++out2:
+               path_release(&nd);
+ out:
+               putname(to);
+@@ -1936,9 +1967,13 @@
+       error = __user_walk(oldname, 0, &old_nd);
+       if (error)
+               goto exit;
+-      error = path_lookup(to, LOOKUP_PARENT, &nd);
++      intent_init(&nd.intent.open, IT_LINK);
++      nd.intent.open.create.source_nd = &old_nd;
++      error = path_lookup_it(to, LOOKUP_PARENT, &nd);
+       if (error)
+               goto out;
++      if (nd.intent.open.flags & IT_STATUS_RAW)
++              goto out_release;
+       error = -EXDEV;
+       if (old_nd.mnt != nd.mnt)
+               goto out_release;
+@@ -2119,9 +2154,18 @@
+       if (error)
+               goto exit;
+-      error = path_lookup(newname, LOOKUP_PARENT, &newnd);
++      error = -EBUSY;
++      if (oldnd.last_type != LAST_NORM)
++              goto exit1;
++
++      intent_init(&newnd.intent.open, IT_RENAME);
++      newnd.intent.open.create.source_nd = &oldnd;
++      error = path_lookup_it(newname, LOOKUP_PARENT, &newnd);
+       if (error)
+               goto exit1;
++      if (newnd.intent.open.flags & IT_STATUS_RAW) {
++              goto exit2;
++      }
+       error = -EXDEV;
+       if (oldnd.mnt != newnd.mnt)
+@@ -2129,8 +2173,6 @@
+       old_dir = oldnd.dentry;
+       error = -EBUSY;
+-      if (oldnd.last_type != LAST_NORM)
+-              goto exit2;
+       new_dir = newnd.dentry;
+       if (newnd.last_type != LAST_NORM)
+@@ -2238,6 +2280,7 @@
+       intent_init(&nd->intent.open, it.op);
+       nd->intent.open.flags = it.flags;
+       nd->intent.open.create_mode = it.create_mode;
++      nd->intent.open.create = it.create;
+       res = link_path_walk(link, nd);
+ out:
+       if (current->link_count || res || nd->last_type!=LAST_NORM)
+Index: linus-2.6.7/include/linux/namei.h
+===================================================================
+--- linus-2.6.7.orig/include/linux/namei.h     2005-03-05 20:24:52.000000000 +0200
++++ linus-2.6.7/include/linux/namei.h  2005-03-23 13:34:56.632477304 +0200
+@@ -15,9 +15,19 @@
+ #define IT_UNLINK     (1<<5)
+ #define IT_TRUNC      (1<<6)
+ #define IT_GETXATTR   (1<<7)
++#define IT_RMDIR      (1<<8)
++#define IT_LINK               (1<<9)
++#define IT_RENAME     (1<<10)
++#define IT_MKDIR      (1<<11)
++#define IT_MKNOD      (1<<12)
++#define IT_SYMLINK    (1<<13)
++#define IT_CHDIR      (1<<14)
+ #define INTENT_MAGIC 0x19620323
+-
++#define IT_STATUS_RAW (1<<10)  /* Setting this in it_flags on exit from lookup
++                                  means everything was already done and the
++                                  return value from lookup is in fact the
++                                  status of the already-performed operation */
+ struct open_intent {
+       int     magic;
+       int     op;
+@@ -25,6 +35,11 @@
+       int     flags;
+       int     create_mode;
+       union {
++              unsigned        dev;    /* For mknod */
++              char    *link;  /* For symlink */
++              struct nameidata *source_nd; /* For link/rename */
++      } create;
++      union {
+               void *fs_data; /* FS-specific intent data */
+       } d;
+ };
+Index: linus-2.6.7/include/linux/fs.h
+===================================================================
+--- linus-2.6.7.orig/include/linux/fs.h        2005-03-05 20:24:52.000000000 +0200
++++ linus-2.6.7/include/linux/fs.h     2005-03-23 13:35:08.796628072 +0200
+@@ -909,6 +909,7 @@
+       ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
+       ssize_t (*listxattr) (struct dentry *, char *, size_t);
+       int (*removexattr) (struct dentry *, const char *);
++      int (*endparentlookup) (struct nameidata *);
+ };
+ struct seq_file;
diff --git a/lustre/kernel_patches/patches/vfs_fmode_exec-2.6.patch b/lustre/kernel_patches/patches/vfs_fmode_exec-2.6.patch
new file mode 100644 (file)
index 0000000..9c3a5d6
--- /dev/null
@@ -0,0 +1,34 @@
+ fs/exec.c          |    4 ++--
+ include/linux/fs.h |    1 +
+ 2 files changed, 3 insertions(+), 2 deletions(-)
+
+--- linus-2.6.7-bk-latest/include/linux/fs.h.orig      2004-07-07 12:33:21.246507224 +0300
++++ linus-2.6.7-bk-latest/include/linux/fs.h   2004-07-07 12:33:55.069365368 +0300
+@@ -74,6 +74,7 @@ extern int leases_enable, dir_notify_ena
+ #define FMODE_READ 1
+ #define FMODE_WRITE 2
++#define FMODE_EXEC 4
+ #define RW_MASK               1
+ #define RWA_MASK      2
+--- linus-2.6.7-bk-latest/fs/exec.c.orig       2004-07-07 12:33:05.466906088 +0300
++++ linus-2.6.7-bk-latest/fs/exec.c    2004-07-07 12:33:38.127940856 +0300
+@@ -122,7 +122,7 @@ asmlinkage long sys_uselib(const char __
+       int error;
+       intent_init(&nd.intent.open, IT_OPEN);
+-      nd.intent.open.flags = FMODE_READ;
++      nd.intent.open.flags = FMODE_READ|FMODE_EXEC;
+       error = user_path_walk_it(library, &nd);
+       if (error)
+               goto out;
+@@ -476,7 +476,7 @@ struct file *open_exec(const char *name)
+       struct file *file;
+       intent_init(&nd.intent.open, IT_OPEN);
+-      nd.intent.open.flags = FMODE_READ;
++      nd.intent.open.flags = FMODE_READ|FMODE_EXEC;
+       err = path_lookup_it(name, LOOKUP_FOLLOW, &nd);
+       file = ERR_PTR(err);
diff --git a/lustre/kernel_patches/patches/vfs_gns-2.6-vanilla.patch b/lustre/kernel_patches/patches/vfs_gns-2.6-vanilla.patch
new file mode 100644 (file)
index 0000000..0cb55e6
--- /dev/null
@@ -0,0 +1,55 @@
+diff -rupN linux-2.6.7/fs/namei.c linux-2.6.7.new/fs/namei.c
+--- linux-2.6.7/fs/namei.c     2005-03-29 18:54:13.000000000 +0300
++++ linux-2.6.7.new/fs/namei.c 2005-03-31 14:42:01.605302456 +0300
+@@ -422,6 +422,16 @@ static struct dentry * real_lookup(struc
+                               result = dentry;
+               }
+               unlock_dir(dir, lock);
++              if (!IS_ERR(result)) {
++                      spin_lock(&result->d_lock);
++                      if (result->d_flags & DCACHE_GNS_PENDING) {
++                              spin_unlock(&result->d_lock);
++                              if (result->d_op && result->d_op->d_revalidate)
++                                      result->d_op->d_revalidate(result, nd);
++                      } else {
++                              spin_unlock(&result->d_lock);
++                      }
++              }
+               return result;
+       }
+diff -rupN linux-2.6.7/fs/namespace.c linux-2.6.7.new/fs/namespace.c
+--- linux-2.6.7/fs/namespace.c 2005-03-29 18:54:13.000000000 +0300
++++ linux-2.6.7.new/fs/namespace.c     2005-03-30 17:51:39.000000000 +0300
+@@ -60,6 +60,7 @@ struct vfsmount *alloc_vfsmnt(const char
+               INIT_LIST_HEAD(&mnt->mnt_child);
+               INIT_LIST_HEAD(&mnt->mnt_mounts);
+               INIT_LIST_HEAD(&mnt->mnt_list);
++              INIT_LIST_HEAD(&mnt->mnt_lustre_list);
+               if (name) {
+                       int size = strlen(name)+1;
+                       char *newname = kmalloc(size, GFP_KERNEL);
+@@ -173,6 +174,9 @@ void __mntput(struct vfsmount *mnt)
+ {
+       struct super_block *sb = mnt->mnt_sb;
+       dput(mnt->mnt_root);
++        spin_lock(&dcache_lock);
++        list_del(&mnt->mnt_lustre_list);
++        spin_unlock(&dcache_lock);
+       free_vfsmnt(mnt);
+       deactivate_super(sb);
+ }
+diff -rupN linux-2.6.7/include/linux/dcache.h linux-2.6.7.new/include/linux/dcache.h
+--- linux-2.6.7/include/linux/dcache.h 2005-03-29 18:54:13.000000000 +0300
++++ linux-2.6.7.new/include/linux/dcache.h     2005-03-31 14:35:51.589553400 +0300
+@@ -167,7 +167,9 @@ d_iput:            no              no              no       yes
+ #define DCACHE_UNHASHED               0x0010  
+ #define DCACHE_LUSTRE_INVALID 0x0020  /* invalidated by Lustre */
+-#define DCACHE_CROSS_REF       0x0040  /* entry points to inode on another MDS */
++#define DCACHE_CROSS_REF        0x0040  /* entry points to inode on another MDS */
++#define DCACHE_GNS_PENDING      0x0080  /* entry is GNS pending mount point */
++#define DCACHE_GNS_MOUNTING     0x0100  /* entry is GNS mount in progress */
+ extern spinlock_t dcache_lock;
index 6cfae66..5598314 100644 (file)
@@ -634,9 +634,9 @@ Index: linux-2.6.7/include/linux/dcache.h
        int nr_unused;
 Index: linux-2.6.7/include/linux/fs.h
 ===================================================================
---- linux-2.6.7.orig/include/linux/fs.h        2004-08-26 17:12:41.000000000 +0400
-+++ linux-2.6.7/include/linux/fs.h     2005-01-18 11:27:18.092496832 +0300
-@@ -74,6 +74,7 @@
+--- linux-2.6.7.old/include/linux/fs.h 2005-01-31 14:27:16.000000000 +0800
++++ linux-2.6.7/include/linux/fs.h     2005-01-31 14:32:19.000000000 +0800
+@@ -74,6 +74,7 @@ extern int leases_enable, dir_notify_ena
  
  #define FMODE_READ 1
  #define FMODE_WRITE 2
@@ -644,16 +644,21 @@ Index: linux-2.6.7/include/linux/fs.h
  
  #define RW_MASK               1
  #define RWA_MASK      2
-@@ -250,6 +251,8 @@
+@@ -250,6 +251,13 @@ typedef void (dio_iodone_t)(struct inode
  #define ATTR_ATTR_FLAG        1024
  #define ATTR_KILL_SUID        2048
  #define ATTR_KILL_SGID        4096
 +#define ATTR_RAW              8192    /* file system, not vfs will massage attrs */
 +#define ATTR_FROM_OPEN        16384    /* called from open path, ie O_TRUNC */
++
++#define ATTR_CTIME_SET 0x2000
++/* ea support  */
++#define ATTR_EA 0x40000
++#define ATTR_EA_RM 0x80000
  
  /*
   * This is the Inode Attributes structure, used for notify_change().  It
-@@ -446,6 +449,7 @@
+@@ -446,6 +454,7 @@ struct inode {
        struct block_device     *i_bdev;
        struct cdev             *i_cdev;
        int                     i_cindex;
@@ -661,7 +666,7 @@ Index: linux-2.6.7/include/linux/fs.h
  
        unsigned long           i_dnotify_mask; /* Directory notify events */
        struct dnotify_struct   *i_dnotify; /* for directory notifications */
-@@ -579,6 +583,7 @@
+@@ -579,6 +588,7 @@ struct file {
        spinlock_t              f_ep_lock;
  #endif /* #ifdef CONFIG_EPOLL */
        struct address_space    *f_mapping;
@@ -669,7 +674,7 @@ Index: linux-2.6.7/include/linux/fs.h
  };
  extern spinlock_t files_lock;
  #define file_list_lock() spin_lock(&files_lock);
-@@ -903,7 +908,9 @@
+@@ -903,7 +913,9 @@ struct inode_operations {
        void (*truncate) (struct inode *);
        int (*permission) (struct inode *, int, struct nameidata *);
        int (*setattr) (struct dentry *, struct iattr *);
@@ -679,7 +684,7 @@ Index: linux-2.6.7/include/linux/fs.h
        int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
        ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
        ssize_t (*listxattr) (struct dentry *, char *, size_t);
-@@ -943,6 +950,7 @@
+@@ -943,6 +955,7 @@ struct super_operations {
        int (*remount_fs) (struct super_block *, int *, char *);
        void (*clear_inode) (struct inode *);
        void (*umount_begin) (struct super_block *);
@@ -687,7 +692,7 @@ Index: linux-2.6.7/include/linux/fs.h
  
        int (*show_options)(struct seq_file *, struct vfsmount *);
  };
-@@ -1131,6 +1139,7 @@
+@@ -1131,6 +1144,7 @@ extern int unregister_filesystem(struct 
  extern struct vfsmount *kern_mount(struct file_system_type *);
  extern int may_umount_tree(struct vfsmount *);
  extern int may_umount(struct vfsmount *);
@@ -695,14 +700,14 @@ Index: linux-2.6.7/include/linux/fs.h
  extern long do_mount(char *, char *, char *, unsigned long, void *);
  
  extern int vfs_statfs(struct super_block *, struct kstatfs *);
-@@ -1195,6 +1204,7 @@
+@@ -1195,6 +1209,7 @@ static inline int break_lease(struct ino
  extern int do_truncate(struct dentry *, loff_t start);
  extern struct file *filp_open(const char *, int, int);
  extern struct file * dentry_open(struct dentry *, struct vfsmount *, int);
 +extern struct file * dentry_open_it(struct dentry *, struct vfsmount *, int, struct lookup_intent *);
  extern int filp_close(struct file *, fl_owner_t id);
  extern char * getname(const char __user *);
+
 Index: linux-2.6.7/include/linux/namei.h
 ===================================================================
 --- linux-2.6.7.orig/include/linux/namei.h     2003-07-24 15:52:31.000000000 +0400
index 8e88ec9..b5f5e74 100644 (file)
@@ -1,20 +1,22 @@
 uml-2.6.7-01-bb2.patch
 lustre_version.patch
-vfs_intent-2.6-vanilla.patch 
-vfs_nointent-2.6-vanilla.patch
-vfs_races-2.6-vanilla.patch
-vfs-wantedi-misc-2.6-suse.patch 
-nfs-cifs-intent-2.6-vanilla.patch 
-iopen-misc-2.6-suse.patch 
-export-truncate-2.6-suse.patch 
-export_symbols-2.6-suse.patch 
-dev_read_only-2.6-suse.patch 
-export-2.6-suse.patch
-header-guards-2.6-suse.patch
+vfs-dcache_locking-vanilla-2.6.patch
+vfs-dcache_lustre_invalid-vanilla-2.6.patch 
+vfs-intent_api-vanilla-2.6.patch 
+vfs-lookup_last-vanilla-2.6.patch
+vfs-raw_ops-vanilla-2.6.patch 
+export-vanilla-2.6.patch 
+header_guards-vanilla-2.6.patch 
+vfs-do_truncate.patch
+vfs_fmode_exec-2.6.patch
+vfs-gns_export_doumount.patch
 ext3-super-ntohl.patch
-lookup_bdev_init_intent.patch
-dcache-mds-num-2.6.7.patch
+dcache-mds-num-2.6.7.patch 
 dynamic-locks-2.6.7.patch
 vfs-pdirops-2.6.7.patch
 dcache-fid-2.6.7.patch
-jbd-buffer-release-2.6.7.patch 
+vfs-wantedi-misc-2.6-suse.patch
+jbd-buffer-release-2.6.7.patch
+dev_read_only-2.6-suse.patch
+vfs_gns-2.6-vanilla.patch
+linux-2.6.7-CITI_NFS4_ALL-7-lsec.patch
index 70bd9b7..d1b8914 100644 (file)
@@ -35,6 +35,8 @@
 #include <linux/lustre_mgmt.h>
 #include <linux/lustre_dlm.h>
 #include <linux/lustre_net.h>
+#include <linux/lustre_sec.h>
+
 /* @priority: if non-zero, move the selected to the list head
  * @nocreate: if non-zero, only search in existed connections
  */
@@ -344,6 +346,7 @@ err:
 int client_obd_cleanup(struct obd_device *obddev, int flags)
 {
         struct client_obd *cli = &obddev->u.cli;
+        ENTRY;
 
         if (!cli->cl_import)
                 RETURN(-EINVAL);
@@ -354,7 +357,14 @@ int client_obd_cleanup(struct obd_device *obddev, int flags)
                 dereg_f(cli->cl_mgmtcli_obd, obddev);
                 inter_module_put("mgmtcli_deregister_for_events");
         }
+
+        /* Here we try to drop the security structure after destroying the
+         * import, to avoid the issue of "sleep in spinlock".
+         */
+        class_import_get(cli->cl_import);
         class_destroy_import(cli->cl_import);
+        ptlrpcs_import_drop_sec(cli->cl_import);
+        class_import_put(cli->cl_import);
         cli->cl_import = NULL;
 
         ldlm_put_ref(flags & OBD_OPT_FORCE);
@@ -390,6 +400,10 @@ int client_connect_import(struct lustre_handle *dlm_handle,
         if (obd->obd_namespace == NULL)
                 GOTO(out_disco, rc = -ENOMEM);
 
+        rc = ptlrpcs_import_get_sec(imp);
+        if (rc != 0)
+                GOTO(out_ldlm, rc);
+
         imp->imp_dlm_handle = *dlm_handle;
         rc = ptlrpc_init_import(imp);
         if (rc != 0) 
@@ -721,15 +735,38 @@ int target_handle_connect(struct ptlrpc_request *req)
         memcpy(&conn, lustre_msg_buf(req->rq_reqmsg, 2, sizeof conn),
                sizeof conn);
 
-        if (export->exp_imp_reverse != NULL)
+        if (export->exp_imp_reverse != NULL) {
+                /* same logic as client_obd_cleanup */
+                class_import_get(export->exp_imp_reverse);
                 class_destroy_import(export->exp_imp_reverse);
+                ptlrpcs_import_drop_sec(export->exp_imp_reverse);
+                class_import_put(export->exp_imp_reverse);
+        }
+
+        /* For the rest, we return -ENOTCONN in case of errors
+         * in order to let the client initialize the connection again.
+         */
         revimp = export->exp_imp_reverse = class_new_import();
+        if (!revimp) {
+                CERROR("fail to alloc new reverse import.\n");
+                GOTO(out, rc = -ENOTCONN);
+        }
+
         revimp->imp_connection = ptlrpc_connection_addref(export->exp_connection);
         revimp->imp_client = &export->exp_obd->obd_ldlm_client;
         revimp->imp_remote_handle = conn;
         revimp->imp_obd = target;
         revimp->imp_dlm_fake = 1;
         revimp->imp_state = LUSTRE_IMP_FULL;
+
+        rc = ptlrpcs_import_get_sec(revimp);
+        if (rc) {
+                CERROR("reverse import can not get sec: %d\n", rc);
+                class_destroy_import(revimp);
+                export->exp_imp_reverse = NULL;
+                GOTO(out, rc = -ENOTCONN);
+        }
+
         class_import_put(revimp);
 
         rc = obd_connect_post(export, connect_flags);
@@ -759,8 +796,10 @@ void target_destroy_export(struct obd_export *exp)
 {
         /* exports created from last_rcvd data, and "fake"
            exports created by lctl don't have an import */
-        if (exp->exp_imp_reverse != NULL)
+        if (exp->exp_imp_reverse != NULL) {
+                ptlrpcs_import_drop_sec(exp->exp_imp_reverse);
                 class_destroy_import(exp->exp_imp_reverse);
+        }
 
         /* We cancel locks at disconnect time, but this will catch any locks
          * granted in a race with recovery-induced disconnect. */
@@ -789,8 +828,9 @@ ptlrpc_clone_req( struct ptlrpc_request *orig_req)
 
         memcpy(copy_req, orig_req, sizeof *copy_req);
         memcpy(copy_reqmsg, orig_req->rq_reqmsg, orig_req->rq_reqlen);
-        /* the copied req takes over the reply state */
+        /* the copied req takes over the reply state and security data */
         orig_req->rq_reply_state = NULL;
+        orig_req->rq_sec_svcdata = NULL;
 
         copy_req->rq_reqmsg = copy_reqmsg;
         class_export_get(copy_req->rq_export);
@@ -800,6 +840,9 @@ ptlrpc_clone_req( struct ptlrpc_request *orig_req)
 }
 void ptlrpc_free_clone( struct ptlrpc_request *req) 
 {
+        if (req->rq_svcsec)
+                svcsec_cleanup_req(req);
+
         class_export_put(req->rq_export);
         list_del(&req->rq_list);
         OBD_FREE(req->rq_reqmsg, req->rq_reqlen);
@@ -810,6 +853,9 @@ void ptlrpc_free_clone( struct ptlrpc_request *req)
 
 static void target_release_saved_req(struct ptlrpc_request *req)
 {
+        if (req->rq_svcsec)
+                svcsec_cleanup_req(req);
+
         class_export_put(req->rq_export);
         OBD_FREE(req->rq_reqmsg, req->rq_reqlen);
         OBD_FREE(req, sizeof *req);
index 4b58aea..9f863b4 100644 (file)
@@ -25,6 +25,8 @@
 
 #ifdef __KERNEL__
 # include <linux/slab.h>
+# include <linux/dcache.h>
+# include <linux/namei.h>
 # include <linux/module.h>
 # include <linux/lustre_dlm.h>
 #else
index 90a35c4..860d2ec 100644 (file)
@@ -1,5 +1,7 @@
 ## Liblustre excecutables & libraries Makefile
-SUBDIRS = . tests
+
+# FIXME: we disable building any executables for the moment.
+#SUBDIRS = . tests
 
 AM_CPPFLAGS = $(HAVE_EFENCE) -I$(SYSIO)/include -D_LARGEFILE64_SOURCE=1 \
               $(LLCPPFLAGS) -I$(top_srcdir)/portals/unals
@@ -13,6 +15,7 @@ LUSTRE_LIBS = liblutils.a libllite.a \
               $(top_builddir)/lustre/osc/libosc.a \
               $(top_builddir)/lustre/mdc/libmdc.a \
               $(top_builddir)/lustre/ptlrpc/libptlrpc.a \
+              $(top_builddir)/lustre/sec/libptlrpcs.a \
               $(top_builddir)/lustre/obdclass/liblustreclass.a \
               $(top_builddir)/lustre/lvfs/liblvfs.a
 
index 0200da9..7e1d7dd 100644 (file)
@@ -74,7 +74,7 @@ static int llu_dir_do_readpage(struct inode *inode, struct page *page)
                                  &data, &lockh, NULL, 0,
                                  ldlm_completion_ast, llu_mdc_blocking_ast,
                                  inode);
-                request = (struct ptlrpc_request *)it.d.lustre.it_data;
+                request = (struct ptlrpc_request *)LUSTRE_IT(&it)->it_data;
                 if (request)
                         ptlrpc_req_finished(request);
                 if (rc < 0) {
index 00a0b82..e393198 100644 (file)
@@ -90,7 +90,7 @@ void obdo_refresh_inode(struct inode *dst,
 
 static int llu_local_open(struct llu_inode_info *lli, struct lookup_intent *it)
 {
-        struct ptlrpc_request *req = it->d.lustre.it_data;
+        struct ptlrpc_request *req = LUSTRE_IT(it)->it_data;
         struct ll_file_data *fd;
         struct mds_body *body;
         ENTRY;
@@ -114,7 +114,7 @@ static int llu_local_open(struct llu_inode_info *lli, struct lookup_intent *it)
         fd->fd_mds_och.och_magic = OBD_CLIENT_HANDLE_MAGIC;
         lli->lli_file_data = fd;
 
-        mdc_set_open_replay_data(NULL, &fd->fd_mds_och, it->d.lustre.it_data);
+        mdc_set_open_replay_data(NULL, &fd->fd_mds_och, LUSTRE_IT(it)->it_data);
 
         RETURN(0);
 }
@@ -139,9 +139,8 @@ int llu_iop_open(struct pnode *pnode, int flags, mode_t mode)
         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu\n", lli->lli_st_ino);
         LL_GET_INTENT(inode, it);
 
-        if (!it->d.lustre.it_disposition) {
+        if (!LUSTRE_IT(it)->it_disposition)
                 LBUG();
-        }
 
         rc = it_open_error(DISP_OPEN_OPEN, it);
         if (rc)
@@ -168,7 +167,7 @@ int llu_iop_open(struct pnode *pnode, int flags, mode_t mode)
         lli->lli_open_flags = flags & ~(O_CREAT | O_EXCL | O_TRUNC);
 
  out_release:
-        request = it->d.lustre.it_data;
+        request = LUSTRE_IT(it)->it_data;
         ptlrpc_req_finished(request);
 
         it->it_op_release(it);
index 1324cf9..04e27fe 100755 (executable)
@@ -82,6 +82,7 @@ build_obj_list ../obdecho libobdecho.a
 build_obj_list ../osc libosc.a
 build_obj_list ../mdc libmdc.a
 build_obj_list ../ptlrpc libptlrpc.a
+build_obj_list ../sec libptlrpcs.a
 build_obj_list ../obdclass liblustreclass.a
 build_obj_list ../lvfs liblvfs.a
 
index 280c1dd..0949b5d 100644 (file)
@@ -46,15 +46,15 @@ static void ll_intent_drop_lock(struct lookup_intent *it)
 {
         struct lustre_handle *handle;
 
-        if (it->it_op && it->d.lustre.it_lock_mode) {
-                handle = (struct lustre_handle *)&it->d.lustre.it_lock_handle;
+        if (it->it_op && LUSTRE_IT(it)->it_lock_mode) {
+                handle = (struct lustre_handle *)&LUSTRE_IT(it)->it_lock_handle;
                 CDEBUG(D_DLMTRACE, "releasing lock with cookie "LPX64
                        " from it %p\n", handle->cookie, it);
-                ldlm_lock_decref(handle, it->d.lustre.it_lock_mode);
+                ldlm_lock_decref(handle, LUSTRE_IT(it)->it_lock_mode);
 
                 /* bug 494: intent_release may be called multiple times, from
                  * this thread and we don't want to double-decref this lock */
-                it->d.lustre.it_lock_mode = 0;
+                LUSTRE_IT(it)->it_lock_mode = 0;
         }
 }
 
@@ -65,8 +65,8 @@ static void ll_intent_release(struct lookup_intent *it)
         ll_intent_drop_lock(it);
         it->it_magic = 0;
         it->it_op_release = 0;
-        it->d.lustre.it_disposition = 0;
-        it->d.lustre.it_data = NULL;
+        LUSTRE_IT(it)->it_disposition = 0;
+        LUSTRE_IT(it)->it_data = NULL;
         EXIT;
 }
 
@@ -107,7 +107,7 @@ void llu_lookup_finish_locks(struct lookup_intent *it, struct pnode *pnode)
                 CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%lu)\n",
                        inode, llu_i2info(inode)->lli_st_ino,
                        llu_i2info(inode)->lli_st_generation);
-                mdc_set_lock_data(NULL, &it->d.lustre.it_lock_handle, inode);
+                mdc_set_lock_data(NULL, &LUSTRE_IT(it)->it_lock_handle, inode);
         }
 
         /* drop lookup/getattr locks */
index 211be83..1962920 100644 (file)
@@ -181,8 +181,8 @@ void obdo_to_inode(struct inode *dst, struct obdo *src, obd_valid valid)
         valid &= src->o_valid;
 
         if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME))
-                CDEBUG(D_INODE, "valid %x, cur time %lu/%lu, new %lu/%lu\n",
-                       src->o_valid, 
+                CDEBUG(D_INODE, "valid %llx, cur time %lu/%lu, new %lu/%lu\n",
+                       (unsigned long long)src->o_valid, 
                        LTIME_S(lli->lli_st_mtime), LTIME_S(lli->lli_st_ctime),
                        (long)src->o_mtime, (long)src->o_ctime);
 
@@ -221,8 +221,8 @@ void obdo_from_inode(struct obdo *dst, struct inode *src, obd_valid valid)
         obd_valid newvalid = 0;
 
         if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME))
-                CDEBUG(D_INODE, "valid %x, new time %lu/%lu\n",
-                       valid, LTIME_S(lli->lli_st_mtime), 
+                CDEBUG(D_INODE, "valid %llx, new time %lu/%lu\n",
+                       (unsigned long long)valid, LTIME_S(lli->lli_st_mtime), 
                        LTIME_S(lli->lli_st_ctime));
 
         if (valid & OBD_MD_FLATIME) {
@@ -438,7 +438,8 @@ static int llu_inode_revalidate(struct inode *inode)
                         valid |= OBD_MD_FLEASIZE;
                 }
                 ll_inode2id(&id, inode);
-                rc = mdc_getattr(sbi->ll_md_exp, &id, valid, ealen, &req);
+                rc = mdc_getattr(sbi->ll_md_exp, &id, valid, NULL, 0,
+                                 ealen, &req);
                 if (rc) {
                         CERROR("failure %d inode %lu\n", rc, lli->lli_st_ino);
                         RETURN(-abs(rc));
@@ -869,7 +870,7 @@ static int llu_readlink_internal(struct inode *inode,
 
         ll_inode2id(&id, inode);
         rc = mdc_getattr(sbi->ll_md_exp, &id,
-                         OBD_MD_LINKNAME, symlen, request);
+                         OBD_MD_LINKNAME, NULL, 0, symlen, request);
         if (rc) {
                 CERROR("inode %lu: rc = %d\n", lli->lli_st_ino, rc);
                 RETURN(rc);
@@ -1355,7 +1356,8 @@ struct inode *llu_iget(struct filesys *fs, struct lustre_md *md)
         if ((md->body->valid &
              (OBD_MD_FLGENER | OBD_MD_FLID | OBD_MD_FLTYPE)) !=
             (OBD_MD_FLGENER | OBD_MD_FLID | OBD_MD_FLTYPE)) {
-                CERROR("bad md body valid mask 0x%x\n", md->body->valid);
+                CERROR("bad md body valid mask 0x%llx\n", 
+                      (unsigned long long)md->body->valid);
                 LBUG();
                 return ERR_PTR(-EPERM);
         }
@@ -1522,7 +1524,8 @@ llu_fsswop_mount(const char *source,
 
         /* fetch attr of root inode */
         err = mdc_getattr(sbi->ll_md_exp, &rootid,
-                          OBD_MD_FLNOTOBD|OBD_MD_FLBLOCKS, 0, &request);
+                          OBD_MD_FLNOTOBD|OBD_MD_FLBLOCKS, NULL, 0,
+                          0, &request);
         if (err) {
                 CERROR("mdc_getattr failed for root: rc = %d\n", err);
                 GOTO(out_lov, err);
index 2d5a7c8..b8a6d0a 100644 (file)
@@ -40,11 +40,15 @@ static void ll_release(struct dentry *de)
         struct ll_dentry_data *lld;
         ENTRY;
         LASSERT(de != NULL);
+
+        CDEBUG(D_DENTRY, "releasing dentry %p\n", de);
+
         lld = ll_d2d(de);
-        LASSERT(lld != NULL);
-        LASSERT(lld->lld_cwd_count == 0);
-        LASSERT(lld->lld_mnt_count == 0);
-        OBD_FREE(de->d_fsdata, sizeof(struct ll_dentry_data));
+        if (lld) { /* Root dentry does not have ll_dentry_data */
+                LASSERT(lld->lld_cwd_count == 0);
+                LASSERT(lld->lld_mnt_count == 0);
+                OBD_FREE(de->d_fsdata, sizeof(struct ll_dentry_data));
+        }
 
         EXIT;
 }
@@ -82,16 +86,17 @@ void ll_set_dd(struct dentry *de)
 void ll_intent_drop_lock(struct lookup_intent *it)
 {
         struct lustre_handle *handle;
+        struct lustre_intent_data *itdata = LUSTRE_IT(it);
 
-        if (it->it_op && it->d.lustre.it_lock_mode) {
-                handle = (struct lustre_handle *)&it->d.lustre.it_lock_handle;
+        if (it->it_op && itdata && itdata->it_lock_mode) {
+                handle = (struct lustre_handle *)&itdata->it_lock_handle;
                 CDEBUG(D_DLMTRACE, "releasing lock with cookie "LPX64
                        " from it %p\n", handle->cookie, it);
-                ldlm_lock_decref(handle, it->d.lustre.it_lock_mode);
+                ldlm_lock_decref(handle, itdata->it_lock_mode);
 
                 /* bug 494: intent_release may be called multiple times, from
                  * this thread and we don't want to double-decref this lock */
-                it->d.lustre.it_lock_mode = 0;
+                itdata->it_lock_mode = 0;
         }
 }
 
@@ -102,11 +107,19 @@ void ll_intent_release(struct lookup_intent *it)
         ll_intent_drop_lock(it);
         it->it_magic = 0;
         it->it_op_release = 0;
-        it->d.lustre.it_disposition = 0;
-        it->d.lustre.it_data = NULL;
+        ll_intent_free(it);
         EXIT;
 }
 
+void ll_intent_free(struct lookup_intent *it)
+{
+        if (it->d.fs_data) {
+                OBD_SLAB_FREE(it->d.fs_data, ll_intent_slab,
+                               sizeof(struct lustre_intent_data));
+                it->d.fs_data = NULL;
+        }
+}
+
 void ll_unhash_aliases(struct inode *inode)
 {
         struct list_head *tmp, *head;
@@ -180,11 +193,11 @@ void ll_lookup_finish_locks(struct lookup_intent *it, struct dentry *dentry)
         LASSERT(it != NULL);
         LASSERT(dentry != NULL);
 
-        if (it->d.lustre.it_lock_mode && dentry->d_inode != NULL) {
+        if (LUSTRE_IT(it)->it_lock_mode && dentry->d_inode != NULL) {
                 struct inode *inode = dentry->d_inode;
                 CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)\n",
                        inode, inode->i_ino, inode->i_generation);
-                mdc_set_lock_data(NULL, &it->d.lustre.it_lock_handle, inode);
+                mdc_set_lock_data(NULL, &LUSTRE_IT(it)->it_lock_handle, inode);
         }
 
         /* drop lookup or getattr locks immediately */
@@ -206,7 +219,7 @@ void ll_lookup_finish_locks(struct lookup_intent *it, struct dentry *dentry)
 void ll_frob_intent(struct lookup_intent **itp, struct lookup_intent *deft)
 {
         struct lookup_intent *it = *itp;
-        
+
 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
         if (it) {
                 LASSERTF(it->it_magic == INTENT_MAGIC, "bad intent magic: %x\n",
@@ -217,7 +230,34 @@ void ll_frob_intent(struct lookup_intent **itp, struct lookup_intent *deft)
         if (!it || it->it_op == IT_GETXATTR)
                 it = *itp = deft;
 
+        if (it->d.fs_data)
+                return;
+
+        if (ll_intent_alloc(it)) {
+                CERROR("Failed to allocate memory for lustre specific intent "
+                       "data\n");
+                /* XXX: we cannot return status just yet */
+                LBUG();
+        }
+}
+
+int ll_intent_alloc(struct lookup_intent *it)
+{
+        if (it->d.fs_data) {
+                CERROR("Intent alloc on already allocated intent\n");
+                return 0;
+        }
+        OBD_SLAB_ALLOC(it->d.fs_data, ll_intent_slab, SLAB_KERNEL,
+                        sizeof(struct lustre_intent_data));
+        if (!it->d.fs_data) {
+                CERROR("Failed to allocate memory for lustre specific intent "
+                       "data\n");
+                return -ENOMEM;
+        }
+
         it->it_op_release = ll_intent_release;
+
+        return 0;
 }
 
 int ll_revalidate_it(struct dentry *de, int flags, struct nameidata *nd,
@@ -229,16 +269,38 @@ int ll_revalidate_it(struct dentry *de, int flags, struct nameidata *nd,
         struct obd_export *exp;
         struct lustre_id pid;
         struct lustre_id cid;
-        int rc;
+        int orig_it, rc = 0;
         ENTRY;
 
-        CDEBUG(D_VFSTRACE, "VFS Op:name=%s, intent=%s\n", de->d_name.name,
-               LL_IT2STR(it));
+        spin_lock(&de->d_lock);
+
+        if ((de->d_flags & DCACHE_GNS_PENDING) &&
+            !(de->d_flags & DCACHE_GNS_MOUNTING))
+        {
+                spin_unlock(&de->d_lock);
+                        
+                if (nd) {
+                        int err = ll_gns_mount_object(de, nd->mnt);
+                        if (err)
+                                CERROR("can't mount %s, err = %d\n",
+                                       de->d_name.name, err);
+                }
+                RETURN(1);
+        }
+        spin_unlock(&de->d_lock);
+
+        CDEBUG(D_VFSTRACE, "VFS Op:name=%s (%p), intent=%s\n", de->d_name.name,
+               de, LL_IT2STR(it));
 
         /* Cached negative dentries are unsafe for now - look them up again */
         if (de->d_inode == NULL)
                 RETURN(0);
 
+        /* Root of the tree is always valid, attributes would be fixed in
+          ll_inode_revalidate_it */
+        if (de->d_sb->s_root == de)
+                RETURN(1);
+
         CDEBUG(D_INODE, "revalidate 0x%p: %*s -> %lu/%lu\n",
                de, de->d_name.len, de->d_name.name,
                (unsigned long) de->d_inode->i_ino,
@@ -262,11 +324,17 @@ int ll_revalidate_it(struct dentry *de, int flags, struct nameidata *nd,
         if (nd != NULL)
                 nd->mnt->mnt_last_used = jiffies;
 
+        orig_it = it ? it->it_op : IT_OPEN;
         ll_frob_intent(&it, &lookup_it);
         LASSERT(it != NULL);
 
         if (it->it_op == IT_GETATTR) { /* We need to check for LOOKUP lock as
                                           well */
+                rc = ll_intent_alloc(&lookup_it);
+                if (rc)
+                        LBUG(); /* Can't think of a better idea just yet */
+
+
                 rc = md_intent_lock(exp, &pid, de->d_name.name,
                                     de->d_name.len, NULL, 0, &cid, &lookup_it,
                                     flags, &req, ll_mdc_blocking_ast);
@@ -274,11 +342,15 @@ int ll_revalidate_it(struct dentry *de, int flags, struct nameidata *nd,
                    UPDATE lock */
                 if (!rc) {
                         it = &lookup_it;
+                        if (!req) {
+                                ll_intent_free(it);
+                                goto do_lookup;
+                        }
                         GOTO(out, rc);
                 }
                 if (it_disposition(&lookup_it, DISP_LOOKUP_NEG)) {
-                        ll_intent_release(&lookup_it);
                         it = &lookup_it;
+                        ll_intent_free(it);
                         GOTO(out, rc = 0);
                 }
 
@@ -286,6 +358,8 @@ int ll_revalidate_it(struct dentry *de, int flags, struct nameidata *nd,
                         ptlrpc_req_finished(req);
                 req = NULL;
                 ll_lookup_finish_locks(&lookup_it, de);
+                /* XXX: on 2.6 ll_lookup_finish_locks does not call ll_intent_release */
+                ll_intent_release(&lookup_it);
         }
 
         rc = md_intent_lock(exp, &pid, de->d_name.name, de->d_name.len,
@@ -294,17 +368,20 @@ int ll_revalidate_it(struct dentry *de, int flags, struct nameidata *nd,
         
         /* If req is NULL, then mdc_intent_lock only tried to do a lock match;
          * if all was well, it will return 1 if it found locks, 0 otherwise. */
-        if (req == NULL && rc >= 0)
+        if (req == NULL && rc >= 0) {
+                if (!rc)
+                        goto do_lookup;
                 GOTO(out, rc);
+        }
 
         if (rc < 0) {
                 if (rc != -ESTALE) {
                         CDEBUG(D_INFO, "ll_intent_lock(): rc %d : it->it_status "
-                               "%d\n", rc, it->d.lustre.it_status);
+                               "%d\n", rc, LUSTRE_IT(it)->it_status);
                 }
                 GOTO(out, rc = 0);
         }
-
+revalidate_finish:
         rc = revalidate_it_finish(req, 1, it, de);
         if (rc != 0) {
                 ll_intent_release(it);
@@ -316,14 +393,21 @@ int ll_revalidate_it(struct dentry *de, int flags, struct nameidata *nd,
            dentry */
         spin_lock(&dcache_lock);
         hlist_del_init(&de->d_hash);
-        __d_rehash(de, 0);
+        __d_rehash(de);
         spin_unlock(&dcache_lock);
 
         GOTO(out, rc);
 out:
         if (req != NULL && rc == 1)
                 ptlrpc_req_finished(req);
+
         if (rc == 0) {
+                if (it == &lookup_it) {
+                        ll_intent_release(it);
+                        if (req) /* Special case: We did lookup and it failed,
+                                    need to free request */
+                                ptlrpc_req_finished(req);
+                }
                 ll_unhash_aliases(de->d_inode);
                 return rc;
         }
@@ -334,13 +418,37 @@ out:
                atomic_read(&de->d_count));
         ll_lookup_finish_locks(it, de);
         de->d_flags &= ~DCACHE_LUSTRE_INVALID;
-        if (!((de->d_inode->i_mode & S_ISUID) &&S_ISDIR(de->d_inode->i_mode)) ||
-            !(flags & LOOKUP_CONTINUE || (it->it_op & (IT_CHDIR | IT_OPEN))))
+        if (it == &lookup_it)
+                ll_intent_release(it);
+    
+        if (!((de->d_inode->i_mode & S_ISUID) && S_ISDIR(de->d_inode->i_mode)) ||
+            !(flags & LOOKUP_CONTINUE || (orig_it & (IT_CHDIR | IT_OPEN))))
                 return rc;
 
-        if (nd)
-                (void)ll_dir_process_mount_object(de, nd->mnt);
+        if (nd && !(de->d_flags & DCACHE_GNS_MOUNTING)) {
+                int err = ll_gns_mount_object(de, nd->mnt);
+                if (err)
+                        CERROR("can't mount %s, err = %d\n",
+                               de->d_name.name, err);
+        }
         return rc;
+do_lookup:
+        it = &lookup_it;
+        if (ll_intent_alloc(it))
+                LBUG();
+// We did that already, right?  ll_inode2id(&pid, de->d_parent->d_inode);
+        rc = md_intent_lock(exp, &pid, de->d_name.name,
+                            de->d_name.len, NULL, 0, NULL,
+                            it, 0, &req, ll_mdc_blocking_ast);
+        if (rc >= 0) {
+                struct mds_body *mds_body = lustre_msg_buf(req->rq_repmsg, 1, sizeof(*mds_body));
+
+                /* See if we got same inode, if not - return error */
+                if (id_equal_stc(&cid, &mds_body->id1))
+                        goto revalidate_finish;
+        }
+
+        GOTO(out, rc = 0);
 }
 
 /*static*/ void ll_pin(struct dentry *de, struct vfsmount *mnt, int flag)
@@ -433,7 +541,7 @@ static int ll_revalidate_nd(struct dentry *dentry, struct nameidata *nd)
         ENTRY;
 
         if (nd && nd->flags & LOOKUP_LAST && !(nd->flags & LOOKUP_LINK_NOTLAST))
-                rc = ll_revalidate_it(dentry, nd->flags, nd, &nd->intent);
+                rc = ll_revalidate_it(dentry, nd->flags, nd, &nd->intent.open);
         else
                 rc = ll_revalidate_it(dentry, 0, nd, NULL);
 
@@ -462,14 +570,18 @@ static void ll_dentry_iput(struct dentry *dentry, struct inode *inode)
         struct ll_sb_info *sbi = ll_i2sbi(inode);
         struct lustre_id parent, child;
 
-        LASSERT(dentry->d_parent && dentry->d_parent->d_inode);
-        ll_inode2id(&parent, dentry->d_parent->d_inode);
-        ll_inode2id(&child, inode);
-        md_change_cbdata_name(sbi->ll_md_exp, &parent,
-                              (char *)dentry->d_name.name, 
-                              dentry->d_name.len, &child, 
-                              null_if_equal, inode);
+        if (dentry->d_parent != dentry) {
+                /* Do not do this for root of the tree */
+                LASSERT(dentry->d_parent && dentry->d_parent->d_inode);
+                ll_inode2id(&parent, dentry->d_parent->d_inode);
+                ll_inode2id(&child, inode);
+                md_change_cbdata_name(sbi->ll_md_exp, &parent,
+                                      (char *)dentry->d_name.name,
+                                      dentry->d_name.len, &child,
+                                      null_if_equal, inode);
+        }
         iput(inode);
+
 }
 #endif
 
index b13bd1a..fa9a335 100644 (file)
@@ -232,12 +232,18 @@ static struct page *ll_get_dir_page(struct inode *dir, unsigned long n)
 
                 ll_prepare_mdc_data(op_data, dir, NULL, NULL, 0, 0);
 
+                rc = ll_intent_alloc(&it);
+                if (rc)
+                        return ERR_PTR(rc);
+
                 rc = md_enqueue(ll_i2sbi(dir)->ll_md_exp, LDLM_IBITS, &it,
                                 LCK_PR, op_data, &lockh, NULL, 0,
                                 ldlm_completion_ast, ll_mdc_blocking_ast, dir);
                 OBD_FREE(op_data, sizeof(*op_data));
 
-                request = (struct ptlrpc_request *)it.d.lustre.it_data;
+                request = (struct ptlrpc_request *)LUSTRE_IT(&it)->it_data;
+                ll_intent_free(&it);
+
                 if (request)
                         ptlrpc_req_finished(request);
                 if (rc < 0) {
@@ -479,8 +485,6 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file,
         }
         case LL_IOC_MDC_MKDIRSTRIPE:
                 RETURN(ll_mkdir_stripe(inode, arg));
-        case IOC_MDC_FINISH_GNS:
-                RETURN(ll_finish_gns(sbi));
         case LL_IOC_LOV_SETSTRIPE: {
                 struct ptlrpc_request *request = NULL;
                 struct mdc_op_data *op_data;
@@ -527,7 +531,7 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file,
                 valid |= OBD_MD_FLDIREA;
 
                 ll_inode2id(&id, inode);
-                rc = md_getattr(sbi->ll_md_exp, &id, valid,
+                rc = md_getattr(sbi->ll_md_exp, &id, valid, NULL, 0,
                                 obd_size_diskmd(sbi->ll_dt_exp, NULL),
                                 &request);
                 if (rc < 0) {
index 1f4a49a..e13260c 100644 (file)
 #include <linux/lustre_lite.h>
 #include <linux/pagemap.h>
 #include <linux/file.h>
+#include <linux/lustre_acl.h>
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
 #include <linux/lustre_compat25.h>
 #endif
 #include "llite_internal.h"
 #include <linux/obd_lov.h>
 
+#define XATTR_NAME_MAX  255
 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
                 struct file *file)
 {
@@ -144,9 +146,10 @@ static int ll_intent_file_open(struct file *file, void *lmm,
                         ll_mdc_blocking_ast, NULL);
         OBD_FREE(op_data, sizeof(*op_data));
         if (rc == 0) {
-                if (itp->d.lustre.it_lock_mode)
-                        memcpy(&itp->d.lustre.it_lock_handle,
+                if (LUSTRE_IT(itp)->it_lock_mode)
+                        memcpy(&LUSTRE_IT(itp)->it_lock_handle,
                                &lockh, sizeof(lockh));
+
         } else if (rc < 0) {
                 CERROR("lock enqueue: err: %d\n", rc);
         }
@@ -156,7 +159,7 @@ static int ll_intent_file_open(struct file *file, void *lmm,
 
 int ll_local_open(struct file *file, struct lookup_intent *it)
 {
-        struct ptlrpc_request *req = it->d.lustre.it_data;
+        struct ptlrpc_request *req = LUSTRE_IT(it)->it_data;
         struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
         struct obd_export *md_exp = ll_i2mdexp(file->f_dentry->d_inode);
         struct ll_file_data *fd;
@@ -189,8 +192,8 @@ int ll_local_open(struct file *file, struct lookup_intent *it)
 
         lli->lli_io_epoch = body->io_epoch;
 
-        mdc_set_open_replay_data(md_exp, &fd->fd_mds_och, it->d.lustre.it_data);
-
+        mdc_set_open_replay_data(md_exp, &fd->fd_mds_och, LUSTRE_IT(it)->it_data);
+        
         RETURN(0);
 }
 
@@ -228,13 +231,17 @@ int ll_file_open(struct inode *inode, struct file *file)
 
         it = file->f_it;
 
-        if (!it || !it->d.lustre.it_disposition) {
+        if (!it || !LUSTRE_IT(it) || !LUSTRE_IT(it)->it_disposition) {
                 it = &oit;
+                rc = ll_intent_alloc(it);
+                if (rc)
+                        GOTO(out, rc);
                 rc = ll_intent_file_open(file, NULL, 0, it);
                 if (rc)
                         GOTO(out, rc);
         }
 
+
         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_OPEN);
         /* mdc_intent_lock() didn't get a request ref if there was an open
          * error, so don't do cleanup on the request here (bug 3430) */
@@ -260,7 +267,9 @@ int ll_file_open(struct inode *inode, struct file *file)
         file->f_flags &= ~O_LOV_DELAY_CREATE;
         GOTO(out, rc);
  out:
-        req = it->d.lustre.it_data;
+        req = LUSTRE_IT(it)->it_data;
+        ll_intent_release(it);
+
         ptlrpc_req_finished(req);
         if (rc == 0)
                 ll_open_complete(inode);
@@ -1010,13 +1019,18 @@ static int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
         f->f_dentry = file->f_dentry;
         f->f_vfsmnt = file->f_vfsmnt;
 
+        rc = ll_intent_alloc(&oit);
+        if (rc)
+                GOTO(out, rc);
+
         rc = ll_intent_file_open(f, lum, lum_size, &oit);
         if (rc)
                 GOTO(out, rc);
         if (it_disposition(&oit, DISP_LOOKUP_NEG))
                 GOTO(out, -ENOENT);
-        req = oit.d.lustre.it_data;
-        rc = oit.d.lustre.it_status;
+        
+        req = LUSTRE_IT(&oit)->it_data;
+        rc = LUSTRE_IT(&oit)->it_status;
 
         if (rc < 0)
                 GOTO(out, rc);
@@ -1034,6 +1048,7 @@ static int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
         rc = ll_file_release(f->f_dentry->d_inode, f);
         EXIT;
  out:
+        ll_intent_release(&oit);
         if (f)
                 put_filp(f);
         up(&lli->lli_open_sem);
@@ -1438,7 +1453,7 @@ int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
         RETURN(rc);
 }
 
-int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
+int ll_inode_revalidate_it(struct dentry *dentry)
 {
         struct lookup_intent oit = { .it_op = IT_GETATTR };
         struct inode *inode = dentry->d_inode;
@@ -1448,7 +1463,6 @@ int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
         struct ll_sb_info *sbi;
         struct lustre_id id;
         int rc;
-        
         ENTRY;
 
         if (!inode) {
@@ -1462,14 +1476,18 @@ int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
         lli = ll_i2info(inode);
         LASSERT(id_fid(&id) != 0);
 
-        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), name=%s, intent=%s\n",
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), name=%s(%p)\n",
                inode->i_ino, inode->i_generation, inode, dentry->d_name.name,
-               LL_IT2STR(it));
+               dentry);
 
 #if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,5,0))
         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_REVALIDATE);
 #endif
 
+        rc = ll_intent_alloc(&oit);
+        if (rc)
+                RETURN(-ENOMEM);
+
         rc = md_intent_lock(sbi->ll_md_exp, &id, NULL, 0, NULL, 0, &id,
                             &oit, 0, &req, ll_mdc_blocking_ast);
         if (rc < 0)
@@ -1477,7 +1495,6 @@ int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
 
         rc = revalidate_it_finish(req, 1, &oit, dentry);
         if (rc) {
-                ll_intent_release(&oit);
                 GOTO(out, rc);
         }
 
@@ -1494,19 +1511,19 @@ int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
         rc = ll_glimpse_size(inode);
         EXIT;
 out:
+        ll_intent_release(&oit);
         if (req)
                 ptlrpc_req_finished(req);
         return rc;
 }
 
 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
-int ll_getattr(struct vfsmount *mnt, struct dentry *de,
-               struct lookup_intent *it, struct kstat *stat)
+int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
 {
         int res = 0;
         struct inode *inode = de->d_inode;
 
-        res = ll_inode_revalidate_it(de, it);
+        res = ll_inode_revalidate_it(de);
         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_GETATTR);
 
         if (res)
@@ -1529,6 +1546,237 @@ int ll_getattr(struct vfsmount *mnt, struct dentry *de,
 }
 #endif
 
+static
+int ll_setxattr_internal(struct inode *inode, const char *name,
+                         const void *value, size_t size, int flags, 
+                         __u64 valid)
+{
+        struct ll_sb_info *sbi = ll_i2sbi(inode);
+        struct ptlrpc_request *request = NULL;
+        struct mdc_op_data op_data;
+        struct iattr attr;
+        int rc = 0;
+        ENTRY;
+
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu\n", inode->i_ino);
+        lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_SETXATTR);
+
+        memset(&attr, 0x0, sizeof(attr));
+        attr.ia_valid |= valid;
+        attr.ia_attr_flags = flags;
+
+        ll_prepare_mdc_data(&op_data, inode, NULL, NULL, 0, 0);
+
+        rc = md_setattr(sbi->ll_md_exp, &op_data, &attr,
+                        (void*) name, strnlen(name, XATTR_NAME_MAX)+1, 
+                        (void*) value, size, &request);
+        if (rc) {
+                CERROR("md_setattr fails: rc = %d\n", rc);
+                GOTO(out, rc);
+        }
+
+ out:
+        ptlrpc_req_finished(request);
+        RETURN(rc);
+}
+
+int ll_setxattr(struct dentry *dentry, const char *name, const void *value,
+                size_t size, int flags)
+{
+        int rc, error;
+        struct posix_acl *acl;
+        struct ll_inode_info *lli;
+        ENTRY;
+
+        rc = ll_setxattr_internal(dentry->d_inode, name, value, size, 
+                                  flags, ATTR_EA);
+        
+        /* update inode's acl info */
+        if (rc == 0 && strcmp(name, XATTR_NAME_ACL_ACCESS) == 0) {
+                if (value) {
+                        acl = posix_acl_from_xattr(value, size);
+                        if (IS_ERR(acl)) {
+                                CERROR("convert from xattr to acl error: %ld",
+                                        PTR_ERR(acl));
+                                GOTO(out, rc);
+                        } else if (acl) {
+                                error = posix_acl_valid(acl);
+                                if (error) {
+                                        CERROR("acl valid error: %d", error);
+                                        posix_acl_release(acl);
+                                        GOTO(out, rc);
+                                }
+                        }
+                } else {
+                        acl = NULL;
+                }
+                                        
+                lli = ll_i2info(dentry->d_inode);
+                spin_lock(&lli->lli_lock);
+                if (lli->lli_acl_access != NULL)
+                        posix_acl_release(lli->lli_acl_access);
+                lli->lli_acl_access = acl;
+                spin_unlock(&lli->lli_lock);
+        }
+        EXIT;
+out:
+        return(rc);
+}
+
+int ll_removexattr(struct dentry *dentry, const char *name)
+{
+        return ll_setxattr_internal(dentry->d_inode, name, NULL, 0, 0,
+                                    ATTR_EA_RM);
+}
+
+static
+int ll_getxattr_internal(struct inode *inode, const char *name, int namelen,
+                         void *value, size_t size, __u64 valid)
+{
+        struct ptlrpc_request *request = NULL;
+        struct ll_sb_info *sbi = ll_i2sbi(inode);
+        struct lustre_id id;
+        struct mds_body *body;
+        void *ea_data; 
+        int rc, ea_size;
+        ENTRY;
+
+        lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_GETXATTR);
+
+        ll_inode2id(&id, inode);
+        rc = md_getattr(sbi->ll_md_exp, &id, valid, name, namelen,
+                         size, &request);
+        if (rc) {
+                if (rc != -ENODATA && rc != -EOPNOTSUPP)
+                        CERROR("md_getattr fails: rc = %d\n", rc);
+                GOTO(out, rc);
+        }
+
+        body = lustre_msg_buf(request->rq_repmsg, 0, sizeof(*body));
+        LASSERT(body != NULL);
+        LASSERT_REPSWABBED(request, 0);
+
+        ea_size = body->eadatasize;
+        LASSERT(ea_size <= request->rq_repmsg->buflens[0]);
+
+        if (size == 0) 
+                GOTO(out, rc = ea_size);
+
+        ea_data = lustre_msg_buf(request->rq_repmsg, 1, ea_size);
+        LASSERT(ea_data != NULL);
+        LASSERT_REPSWABBED(request, 1);
+
+        if (value)
+                memcpy(value, ea_data, ea_size);
+        rc = ea_size;
+ out:
+        ptlrpc_req_finished(request);
+        RETURN(rc);
+}
+
+int ll_getxattr(struct dentry *dentry, const char *name, void *value,
+                size_t size)
+{
+        return ll_getxattr_internal(dentry->d_inode, name, strlen(name) + 1, 
+                                    value, size, OBD_MD_FLEA);
+}
+
+int ll_listxattr(struct dentry *dentry, char *list, size_t size)
+{
+        return ll_getxattr_internal(dentry->d_inode, NULL, 0, list, size,
+                                    OBD_MD_FLEALIST);
+}
+
+int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
+{
+        struct lookup_intent it = { .it_op = IT_GETATTR };
+        int mode = inode->i_mode;
+        struct dentry de;
+        struct ll_sb_info *sbi;
+        struct lustre_id id;
+        struct ptlrpc_request *req = NULL;
+        int rc;
+        ENTRY;
+
+        sbi = ll_i2sbi(inode);
+        ll_inode2id(&id, inode);
+
+        /* Nobody gets write access to a read-only fs */
+        if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
+            (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
+                return -EROFS;
+        /* Nobody gets write access to an immutable file */
+        if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
+                return -EACCES;
+        if (current->fsuid == inode->i_uid) {
+                mode >>= 6;
+        } else if (1) {
+                struct ll_inode_info *lli = ll_i2info(inode);
+                struct posix_acl *acl;
+
+                /* The access ACL cannot grant access if the group class
+                   permission bits don't contain all requested permissions. */
+                if (((mode >> 3) & mask & S_IRWXO) != mask)
+                        goto check_groups;
+
+                if (ll_intent_alloc(&it))
+                        return -EACCES;
+
+                de.d_inode = inode;
+                rc = md_intent_lock(sbi->ll_md_exp, &id, NULL, 0, NULL, 0, &id,
+                                    &it, 0, &req, ll_mdc_blocking_ast);
+                if (rc < 0) {
+                        ll_intent_free(&it);
+                        GOTO(out, rc);
+                }
+
+                rc = revalidate_it_finish(req, 1, &it, &de);
+                if (rc) {
+                        ll_intent_release(&it);
+                        GOTO(out, rc);
+                }
+
+                ll_lookup_finish_locks(&it, &de);
+                ll_intent_free(&it);
+
+                spin_lock(&lli->lli_lock);
+                acl = posix_acl_dup(ll_i2info(inode)->lli_acl_access);
+                spin_unlock(&lli->lli_lock);
+
+                if (!acl)
+                        goto check_groups;
+
+                rc = posix_acl_permission(inode, acl, mask);
+                posix_acl_release(acl);
+                if (rc == -EACCES)
+                        goto check_capabilities;
+                GOTO(out, rc);
+        } else {
+check_groups:
+                if (in_group_p(inode->i_gid))
+                        mode >>= 3;
+        }
+        if ((mode & mask & S_IRWXO) == mask)
+                GOTO(out, rc = 0);
+
+check_capabilities:
+        rc = -EACCES; 
+        /* Allowed to override Discretionary Access Control? */
+        if (!(mask & MAY_EXEC) ||
+            (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
+                if (capable(CAP_DAC_OVERRIDE))
+                        GOTO(out, rc = 0);
+       /* Read and search granted if capable(CAP_DAC_READ_SEARCH) */
+        if (capable(CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
+            (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
+                GOTO(out, rc = 0);
+out:
+        if (req)
+                ptlrpc_req_finished(req);
+
+        return rc;
+}
+
 struct file_operations ll_file_operations = {
         .read           = ll_file_read,
         .write          = ll_file_write,
@@ -1545,13 +1793,17 @@ struct file_operations ll_file_operations = {
 };
 
 struct inode_operations ll_file_inode_operations = {
-        .setattr_raw    = ll_setattr_raw,
         .setattr        = ll_setattr,
         .truncate       = ll_truncate,
 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
-        .getattr_it     = ll_getattr,
+        .getattr        = ll_getattr,
 #else
         .revalidate_it  = ll_inode_revalidate_it,
 #endif
+        .setxattr       = ll_setxattr,
+        .getxattr       = ll_getxattr,
+        .listxattr      = ll_listxattr,
+        .removexattr    = ll_removexattr,
+        .permission     = ll_inode_permission,
 };
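
The file.c hunks wire ll_setxattr(), ll_getxattr(), ll_listxattr(), ll_removexattr() and ll_inode_permission() into ll_file_inode_operations, so extended attributes (including the POSIX access ACL kept under XATTR_NAME_ACL_ACCESS) are forwarded to the MDS via md_setattr()/md_getattr(). A small user-space sketch that would exercise these entry points through the standard xattr syscalls is below; the path and attribute name are made up for illustration.

/*
 * Hedged user-space sketch: exercises the new xattr entry points through the
 * standard syscalls. "/mnt/lustre/somefile" and "user.demo" are made-up names.
 */
#include <stdio.h>
#include <sys/types.h>
#include <sys/xattr.h>

int main(void)
{
        const char *path = "/mnt/lustre/somefile";      /* hypothetical file */
        char value[64];
        char list[256];
        ssize_t len;

        /* reaches ll_setxattr() -> ll_setxattr_internal() -> md_setattr() */
        if (setxattr(path, "user.demo", "hello", 5, 0) < 0)
                perror("setxattr");

        /* reaches ll_getxattr() -> ll_getxattr_internal() -> md_getattr() */
        len = getxattr(path, "user.demo", value, sizeof(value));
        if (len >= 0)
                printf("user.demo = %.*s\n", (int)len, value);

        /* reaches ll_listxattr(), the same getattr path with OBD_MD_FLEALIST */
        len = listxattr(path, list, sizeof(list));
        if (len < 0)
                perror("listxattr");

        /* reaches ll_removexattr(), i.e. a setattr carrying ATTR_EA_RM */
        if (removexattr(path, "user.demo") < 0)
                perror("removexattr");
        return 0;
}
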
 
index f53eeac..d3ae81c 100644 (file)
 #include <linux/lustre_lite.h>
 #include "llite_internal.h"
 
-/* After roughly how long should we remove an inactive mount? */
-#define GNS_MOUNT_TIMEOUT 120
-/* How often should the GNS timer look for mounts to cleanup? */
-#define GNS_TICK 30
+static struct list_head gns_sbi_list = LIST_HEAD_INIT(gns_sbi_list);
+static spinlock_t gns_lock = SPIN_LOCK_UNLOCKED;
+static struct ptlrpc_thread gns_thread;
+static struct ll_gns_ctl gns_ctl;
 
-int ll_finish_gns(struct ll_sb_info *sbi)
+/*
+ * waits until the passed dentry becomes a mountpoint or the timeout and attempts
+ * are exhausted. Returns 1 if the dentry became a mountpoint and 0 otherwise.
+ */
+static int
+ll_gns_wait_for_mount(struct dentry *dentry,
+                      int timeout, int tries)
 {
-        down(&sbi->ll_gns_sem);
-        if (sbi->ll_gns_state != LL_GNS_STATE_MOUNTING) {
-                up(&sbi->ll_gns_sem);
-                CERROR("FINISH_GNS called on mount which was not expecting "
-                       "completion.\n");
-                return -EINVAL;
-        }
-
-        sbi->ll_gns_state = LL_GNS_STATE_FINISHED;
-        up(&sbi->ll_gns_sem);
-        complete(&sbi->ll_gns_completion);
-
-        return 0;
-}
+        struct l_wait_info lwi;
+        struct ll_sb_info *sbi;
+        int rc;
+        ENTRY;
 
-/* Pass exactly one (1) page in; when this function returns "page" will point
- * somewhere into the middle of the page. */
-int fill_page_with_path(struct dentry *dentry, struct vfsmount *mnt,
-                        char **pagep)
-{
-        char *path = *pagep, *p;
-
-        path[PAGE_SIZE - 1] = '\0';
-        p = path + PAGE_SIZE - 1;
-
-        while (1) {
-                if (p - path < dentry->d_name.len + 1)
-                        return -ENAMETOOLONG;
-                if (dentry->d_name.name[0] != '/') {
-                        p -= dentry->d_name.len;
-                        memcpy(p, dentry->d_name.name, dentry->d_name.len);
-                        p--;
-                        *p = '/';
-                }
+        LASSERT(dentry != NULL);
+        LASSERT(!IS_ERR(dentry));
+        sbi = ll_s2sbi(dentry->d_sb);
+        
+        for (; !d_mountpoint(dentry) && tries > 0; tries--) {
+                lwi = LWI_TIMEOUT(timeout * HZ, NULL, NULL);
+                l_wait_event(sbi->ll_gns_waitq, d_mountpoint(dentry), &lwi);
+        }
 
-                dentry = dentry->d_parent;
-                if (dentry->d_parent == dentry) {
-                        if (mnt->mnt_parent == mnt)
-                                break; /* finished walking up */
-                        mnt = mntget(mnt);
-                        dget(dentry);
-                        while (dentry->d_parent == dentry &&
-                               follow_up(&mnt, &dentry))
-                                ;
-                        mntput(mnt);
-                        dput(dentry);
-                }
+        if ((rc = d_mountpoint(dentry) ? 1 : 0)) {
+                spin_lock(&sbi->ll_gns_lock);
+                LASSERT(sbi->ll_gns_state == LL_GNS_MOUNTING);
+                sbi->ll_gns_state = LL_GNS_FINISHED;
+                spin_unlock(&sbi->ll_gns_lock);
         }
-        *pagep = p;
-        return 0;
+
+        complete(&sbi->ll_gns_mount_finished);
+        RETURN(rc);
 }
 
-int ll_dir_process_mount_object(struct dentry *dentry, struct vfsmount *mnt)
+/*
+ * tries to mount the mount object under the passed @dentry. On success,
+ * @dentry will become a mount point and 0 will be returned. An error code
+ * will be returned otherwise.
+ */
+int ll_gns_mount_object(struct dentry *dentry,
+                        struct vfsmount *mnt)
 {
-        struct ll_sb_info *sbi;
+        struct ll_dentry_data *lld = dentry->d_fsdata;
+        char *p, *path, *pathpage, *argv[4];
         struct file *mntinfo_fd = NULL;
-        struct page *datapage = NULL, *pathpage;
         struct address_space *mapping;
-        struct ll_dentry_data *lld = dentry->d_fsdata;
-        struct dentry *dchild, *tmp_dentry;
-        struct vfsmount *tmp_mnt;
-        char *p, *path, *argv[4];
-        int stage = 0, rc = 0;
+        int cleanup_phase = 0, rc = 0;
+        struct ll_sb_info *sbi;
+        struct dentry *dchild;
+        struct page *datapage;
+        filler_t *filler;
         ENTRY;
 
         if (mnt == NULL) {
-                CERROR("suid directory found, but no vfsmount available.\n");
-                RETURN(-1);
+                CERROR("suid directory found, but no "
+                       "vfsmount available.\n");
+                RETURN(-EINVAL);
         }
 
+        CDEBUG(D_INODE, "mounting dentry %p\n", dentry);
+
         LASSERT(dentry->d_inode != NULL);
         LASSERT(S_ISDIR(dentry->d_inode->i_mode));
         LASSERT(lld != NULL);
+        
         sbi = ll_i2sbi(dentry->d_inode);
         LASSERT(sbi != NULL);
 
-        down(&sbi->ll_gns_sem);
-        if (sbi->ll_gns_state == LL_GNS_STATE_MOUNTING) {
-                up(&sbi->ll_gns_sem);
-                wait_for_completion(&sbi->ll_gns_completion);
+        /* another thread is in the middle of mounting some entry */
+        spin_lock(&sbi->ll_gns_lock);
+        if (sbi->ll_gns_state == LL_GNS_MOUNTING) {
+                spin_unlock(&sbi->ll_gns_lock);
+
+                wait_for_completion(&sbi->ll_gns_mount_finished);
                 if (d_mountpoint(dentry))
                         RETURN(0);
-                RETURN(-1);
         }
-        if (sbi->ll_gns_state == LL_GNS_STATE_FINISHED) {
+
+        /* another thread mounted it already */
+        if (sbi->ll_gns_state == LL_GNS_FINISHED) {
+                spin_unlock(&sbi->ll_gns_lock);
+
                 /* we lost a race; just return */
-                up(&sbi->ll_gns_sem);
                 if (d_mountpoint(dentry))
                         RETURN(0);
-                RETURN(-1);
         }
-        LASSERT(sbi->ll_gns_state == LL_GNS_STATE_IDLE);
-        sbi->ll_gns_state = LL_GNS_STATE_MOUNTING;
+        LASSERT(sbi->ll_gns_state == LL_GNS_IDLE);
+
+        spin_lock(&dentry->d_lock);
+        dentry->d_flags |= DCACHE_GNS_MOUNTING;
+        spin_unlock(&dentry->d_lock);
+        
+        /* mounting started */
+        sbi->ll_gns_state = LL_GNS_MOUNTING;
+        spin_unlock(&sbi->ll_gns_lock);
+
+        /* we need to build an absolute pathname to pass to mount */
+        pathpage = (char *)__get_free_page(GFP_KERNEL);
+        if (!pathpage)
+                GOTO(cleanup, rc = -ENOMEM);
+        cleanup_phase = 1;
+
+        /* getting @dentry path stored in @pathpage. */
+        path = d_path(dentry, mnt, pathpage, PAGE_SIZE);
+        if (IS_ERR(path)) {
+                CERROR("can't build mount object path, err %d\n",
+                       (int)PTR_ERR(dchild));
+                GOTO(cleanup, rc = PTR_ERR(dchild));
+        }
+
+        /* synchronizing with a possible /proc/fs/... write */
+        down(&sbi->ll_gns_sem);
+        
+        /*
+         * the mount object name is taken from the sbi, where it is set at mount
+         * time or via a /proc/fs/... tunable. It may be ".mntinfo" or similar.
+         */
+        dchild = ll_d_lookup(sbi->ll_gns_oname, dentry,
+                             strlen(sbi->ll_gns_oname));
         up(&sbi->ll_gns_sem);
 
-        /* We need to build an absolute pathname to pass to mount */
-        pathpage = alloc_pages(GFP_HIGHUSER, 0);
-        if (pathpage == NULL)
-                GOTO(cleanup, rc = -ENOMEM);
-        path = kmap(pathpage);
-        LASSERT(path != NULL);
-        stage = 1;
-        fill_page_with_path(dentry, mnt, &path);
-
-        dchild = lookup_one_len(".mntinfo", dentry, strlen(".mntinfo"));
-        if (dchild == NULL || IS_ERR(dchild)) {
-                CERROR("Directory %*s is setuid, but without a mount object.\n",
-                       dentry->d_name.len, dentry->d_name.name);
-                GOTO(cleanup, rc = -1);
+        if (!dchild)
+                GOTO(cleanup, rc = -ENOENT);
+        
+        if (IS_ERR(dchild)) {
+                CERROR("can't find mount object %*s/%*s err = %d.\n",
+                       (int)dentry->d_name.len, dentry->d_name.name,
+                       (int)dchild->d_name.len, dchild->d_name.name,
+                       (int)PTR_ERR(dchild));
+                GOTO(cleanup, rc = PTR_ERR(dchild));
         }
 
         mntget(mnt);
 
+        /* ok, the mount object is found; open it. */
         mntinfo_fd = dentry_open(dchild, mnt, 0);
         if (IS_ERR(mntinfo_fd)) {
+                CERROR("can't open mount object %*s/%*s err = %d.\n",
+                       (int)dentry->d_name.len, dentry->d_name.name,
+                       (int)dchild->d_name.len, dchild->d_name.name,
+                       (int)PTR_ERR(mntinfo_fd));
                 dput(dchild);
                 mntput(mnt);
                 GOTO(cleanup, rc = PTR_ERR(mntinfo_fd));
         }
-        stage = 2;
+        cleanup_phase = 2;
 
         if (mntinfo_fd->f_dentry->d_inode->i_size > PAGE_SIZE) {
-                CERROR("Mount object file is too big (%Ld)\n",
+                CERROR("mount object %*s/%*s is too big (%Ld)\n",
+                       (int)dentry->d_name.len, dentry->d_name.name,
+                       (int)dchild->d_name.len, dchild->d_name.name,
                        mntinfo_fd->f_dentry->d_inode->i_size);
-                GOTO(cleanup, rc = -1);
+                GOTO(cleanup, rc = -EFBIG);
         }
+
+        /* read data from mount object. */
         mapping = mntinfo_fd->f_dentry->d_inode->i_mapping;
-        datapage = read_cache_page(mapping, 0,
-                                   (filler_t *)mapping->a_ops->readpage,
+        filler = (filler_t *)mapping->a_ops->readpage;
+        datapage = read_cache_page(mapping, 0, filler,
                                    mntinfo_fd);
-        if (IS_ERR(datapage))
+        if (IS_ERR(datapage)) {
+                CERROR("can't read data from mount object %*s/%*s\n",
+                       (int)dentry->d_name.len, dentry->d_name.name,
+                       (int)dchild->d_name.len, dchild->d_name.name);
                 GOTO(cleanup, rc = PTR_ERR(datapage));
+        }
 
         p = kmap(datapage);
         LASSERT(p != NULL);
-        stage = 3;
-
         p[PAGE_SIZE - 1] = '\0';
+        cleanup_phase = 3;
 
         fput(mntinfo_fd);
         mntinfo_fd = NULL;
 
-        argv[0] = "/usr/lib/lustre/gns-upcall.sh";
+        /* synchronizing with a possible /proc/fs/... write */
+        down(&sbi->ll_gns_sem);
+
+        /*
+         * the upcall is initialized at mount time or via a /proc/fs/... tunable
+         * and may be /usr/lib/lustre/gns-upcall.sh
+         */
+        argv[0] = sbi->ll_gns_upcall;
         argv[1] = p;
         argv[2] = path;
         argv[3] = NULL;
-        rc = USERMODEHELPER(argv[0], argv, NULL);
+        
+        up(&sbi->ll_gns_sem);
 
-        if (rc != 0) {
-                CERROR("GNS mount failed: %d\n", rc);
+        rc = USERMODEHELPER(argv[0], argv, NULL);
+        if (rc) {
+                CERROR("failed to call GNS upcall %s, err = %d\n",
+                       sbi->ll_gns_upcall, rc);
                 GOTO(cleanup, rc);
         }
 
-        wait_for_completion(&sbi->ll_gns_completion);
-        LASSERT(sbi->ll_gns_state == LL_GNS_STATE_FINISHED);
-
-        if (d_mountpoint(dentry)) {
-                /* successful follow_down will mntput and dput */
-                tmp_mnt = mntget(mnt);
-                tmp_dentry = dget(dentry);
-                rc = follow_down(&tmp_mnt, &tmp_dentry);
-                if (rc == 1) {
-                        struct ll_sb_info *sbi = ll_s2sbi(dentry->d_sb);
+        /*
+         * wait for mount completion. This is actually not needed, because
+         * USERMODEHELPER() returns only when the usermode process finishes.
+         * But we do this just in case the USERMODEHELPER() semantics change,
+         * or the usermode upcall program starts mounting in the background
+         * and returns instantly. --umka
+         */
+        if (ll_gns_wait_for_mount(dentry, 1, GNS_WAIT_ATTEMPTS)) {
+                struct dentry *rdentry;
+                struct vfsmount *rmnt;
+                
+                /* mount is successful */
+                LASSERT(sbi->ll_gns_state == LL_GNS_FINISHED);
+
+                rmnt = mntget(mnt);
+                rdentry = dget(dentry);
+                
+                if (follow_down(&rmnt, &rdentry)) {
+                        /* 
+                         * register the new mount in the GNS mounts list and
+                         * thus make it accessible to the GNS control thread.
+                         */
                         spin_lock(&dcache_lock);
-                        LASSERT(list_empty(&tmp_mnt->mnt_lustre_list));
-                        list_add_tail(&tmp_mnt->mnt_lustre_list,
+                        LASSERT(list_empty(&rmnt->mnt_lustre_list));
+                        list_add_tail(&rmnt->mnt_lustre_list,
                                       &sbi->ll_mnt_list);
                         spin_unlock(&dcache_lock);
-
-                        tmp_mnt->mnt_last_used = jiffies;
-
-                        mntput(tmp_mnt);
-                        dput(tmp_dentry);
-                        rc = 0;
+                        rmnt->mnt_last_used = jiffies;
+                        mntput(rmnt);
+                        dput(rdentry);
                 } else {
                         mntput(mnt);
                         dput(dentry);
                 }
+                spin_lock(&dentry->d_lock);
+                dentry->d_flags &= ~DCACHE_GNS_PENDING;
+                spin_unlock(&dentry->d_lock);
         } else {
-                CERROR("Woke up from GNS mount, but no mountpoint in place.\n");
-                rc = -1;
+                CERROR("usermode upcall %s failed to mount %s\n",
+                       sbi->ll_gns_upcall, path);
+                rc = -ETIME;
         }
 
         EXIT;
 cleanup:
-        switch (stage) {
+        switch (cleanup_phase) {
         case 3:
                 kunmap(datapage);
                 page_cache_release(datapage);
@@ -231,82 +284,87 @@ cleanup:
                 if (mntinfo_fd != NULL)
                         fput(mntinfo_fd);
         case 1:
-                kunmap(pathpage);
-                __free_pages(pathpage, 0);
+                free_page((unsigned long)pathpage);
         case 0:
-                down(&sbi->ll_gns_sem);
-                sbi->ll_gns_state = LL_GNS_STATE_IDLE;
-                up(&sbi->ll_gns_sem);
+                spin_lock(&sbi->ll_gns_lock);
+                sbi->ll_gns_state = LL_GNS_IDLE;
+                spin_unlock(&sbi->ll_gns_lock);
+
+                spin_lock(&dentry->d_lock);
+                dentry->d_flags &= ~DCACHE_GNS_MOUNTING;
+                spin_unlock(&dentry->d_lock);
         }
         return rc;
 }
 
-/* If timeout == 1, only remove the mounts which are properly aged.
- *
- * If timeout == 0, we are unmounting -- remove them all. */
-int ll_gns_umount_all(struct ll_sb_info *sbi, int timeout)
+/* tries to umount passed @mnt. */
+int ll_gns_umount_object(struct vfsmount *mnt)
 {
-        struct list_head kill_list = LIST_HEAD_INIT(kill_list);
-        struct page *page = NULL;
-        char *kpage = NULL, *path;
-        int rc;
+        int rc = 0;
         ENTRY;
-
-        if (timeout == 0) {
-                page = alloc_pages(GFP_HIGHUSER, 0);
-                if (page == NULL)
-                        RETURN(-ENOMEM);
-                kpage = kmap(page);
-                LASSERT(kpage != NULL);
+        
+        CDEBUG(D_INODE, "unmounting mnt %p\n", mnt);
+        rc = do_umount(mnt, 0);
+        if (rc) {
+                CDEBUG(D_INODE, "can't umount 0x%p, err = %d\n",
+                       mnt, rc);
         }
+        
+        RETURN(rc);
+}
+
+int ll_gns_check_mounts(struct ll_sb_info *sbi, int flags)
+{
+        struct list_head check_list = LIST_HEAD_INIT(check_list);
+        struct vfsmount *mnt;
+        unsigned long pass;
+        ENTRY;
 
         spin_lock(&dcache_lock);
-        list_splice_init(&sbi->ll_mnt_list, &kill_list);
-
-        /* Walk the list in reverse order, and put them on the front of the
-         * sbi list each iteration; this avoids list-ordering problems if we
-         * race with another gns-mounting thread */
-        while (!list_empty(&kill_list)) {
-                struct vfsmount *mnt =
-                        list_entry(kill_list.prev, struct vfsmount,
-                                   mnt_lustre_list);
+        list_splice_init(&sbi->ll_mnt_list, &check_list);
+
+        /*
+         * walk the list in reverse order, and put them on the front of the sbi
+         * list each iteration; this avoids list-ordering problems if we race
+         * with another gns-mounting thread.
+         */
+        while (!list_empty(&check_list)) {
+                mnt = list_entry(check_list.prev,
+                                 struct vfsmount,
+                                 mnt_lustre_list);
+
                 mntget(mnt);
+
                 list_del_init(&mnt->mnt_lustre_list);
-                list_add(&mnt->mnt_lustre_list, &sbi->ll_mnt_list);
 
-                if (timeout &&
-                    jiffies - mnt->mnt_last_used < GNS_MOUNT_TIMEOUT * HZ) {
+                list_add(&mnt->mnt_lustre_list,
+                         &sbi->ll_mnt_list);
+
+                /* check for timeout if needed */
+                pass = jiffies - mnt->mnt_last_used;
+                
+                if (flags == LL_GNS_CHECK &&
+                    pass < sbi->ll_gns_timeout * HZ)
+                {
                         mntput(mnt);
                         continue;
                 }
                 spin_unlock(&dcache_lock);
 
-                CDEBUG(D_INODE, "unmounting mnt %p from sbi %p\n", mnt, sbi);
+                /* umounting @mnt */
+                ll_gns_umount_object(mnt);
 
-                rc = do_umount(mnt, 0);
-                if (rc != 0 && page != NULL) {
-                        int rc2;
-                        path = kpage;
-                        rc2 = fill_page_with_path(mnt->mnt_root, mnt, &path);
-                        CERROR("GNS umount(%s): %d\n", rc2 == 0 ? path : "",
-                               rc);
-                }
                 mntput(mnt);
                 spin_lock(&dcache_lock);
         }
         spin_unlock(&dcache_lock);
-
-        if (page != NULL) {
-                kunmap(page);
-                __free_pages(page, 0);
-        }
         RETURN(0);
 }
 
-static struct list_head gns_sbi_list = LIST_HEAD_INIT(gns_sbi_list);
-static spinlock_t gns_lock = SPIN_LOCK_UNLOCKED;
-static struct ptlrpc_thread gns_thread;
-
+/*
+ * GNS timer callback function. It restarts the GNS timer and wakes up the GNS
+ * control thread to process the mounts list.
+ */
 void ll_gns_timer_callback(unsigned long data)
 {
         struct ll_sb_info *sbi = (void *)data;
@@ -316,27 +374,35 @@ void ll_gns_timer_callback(unsigned long data)
         if (list_empty(&sbi->ll_gns_sbi_head))
                 list_add(&sbi->ll_gns_sbi_head, &gns_sbi_list);
         spin_unlock(&gns_lock);
+        
         wake_up(&gns_thread.t_ctl_waitq);
-        mod_timer(&sbi->ll_gns_timer, jiffies + GNS_TICK * HZ);
+        mod_timer(&sbi->ll_gns_timer,
+                  jiffies + sbi->ll_gns_tick * HZ);
 }
 
-static int gns_check_event(void)
+/* this function checks whether anything new has appeared in the GNS list. */
+static int inline ll_gns_check_event(void)
 {
         int rc;
+        
         spin_lock(&gns_lock);
         rc = !list_empty(&gns_sbi_list);
         spin_unlock(&gns_lock);
+
         return rc;
 }
 
-static int inline gns_check_stopping(void)
+/* should we stop the GNS control thread? */
+static int inline ll_gns_check_stop(void)
 {
         mb();
         return (gns_thread.t_flags & SVC_STOPPING) ? 1 : 0;
 }
 
+/* GNS control thread function. */
 static int ll_gns_thread_main(void *arg)
 {
+        struct ll_gns_ctl *ctl = arg;
         unsigned long flags;
         ENTRY;
 
@@ -345,42 +411,57 @@ static int ll_gns_thread_main(void *arg)
                 snprintf(name, sizeof(name) - 1, "ll_gns");
                 kportal_daemonize(name);
         }
+        
         SIGNAL_MASK_LOCK(current, flags);
         sigfillset(&current->blocked);
         RECALC_SIGPENDING;
         SIGNAL_MASK_UNLOCK(current, flags);
 
+        /*
+         * let the starting function know that we are ready and control may
+         * be returned.
+         */
         gns_thread.t_flags = SVC_RUNNING;
-        wake_up(&gns_thread.t_ctl_waitq);
+        complete(&ctl->gc_starting);
 
-        while (!gns_check_stopping()) {
+        while (!ll_gns_check_stop()) {
                 struct l_wait_info lwi = { 0 };
 
-                l_wait_event(gns_thread.t_ctl_waitq, gns_check_event() ||
-                             gns_check_stopping(), &lwi);
-
+                l_wait_event(gns_thread.t_ctl_waitq,
+                             (ll_gns_check_event() ||
+                              ll_gns_check_stop()), &lwi);
+                
                 spin_lock(&gns_lock);
                 while (!list_empty(&gns_sbi_list)) {
-                        struct ll_sb_info *sbi =
-                                list_entry(gns_sbi_list.prev, struct ll_sb_info,
-                                           ll_gns_sbi_head);
+                        struct ll_sb_info *sbi;
+
+                        sbi = list_entry(gns_sbi_list.prev,
+                                         struct ll_sb_info,
+                                         ll_gns_sbi_head);
+                        
                         list_del_init(&sbi->ll_gns_sbi_head);
                         spin_unlock(&gns_lock);
-                        ll_gns_umount_all(sbi, 1);
+                        ll_gns_check_mounts(sbi, LL_GNS_CHECK);
                         spin_lock(&gns_lock);
                 }
                 spin_unlock(&gns_lock);
         }
 
+        /* 
+         * let the stop function know that the thread is stopped and it may
+         * return.
+         */
+        EXIT;
         gns_thread.t_flags = SVC_STOPPED;
-        wake_up(&gns_thread.t_ctl_waitq);
 
-        RETURN(0);
+        /* this is an SMP-safe way to finish the thread. */
+        complete_and_exit(&ctl->gc_finishing, 0);
 }
 
 void ll_gns_add_timer(struct ll_sb_info *sbi)
 {
-        mod_timer(&sbi->ll_gns_timer, jiffies + GNS_TICK * HZ);
+        mod_timer(&sbi->ll_gns_timer,
+                  jiffies + sbi->ll_gns_tick * HZ);
 }
 
 void ll_gns_del_timer(struct ll_sb_info *sbi)
@@ -388,32 +469,40 @@ void ll_gns_del_timer(struct ll_sb_info *sbi)
         del_timer(&sbi->ll_gns_timer);
 }
 
+/*
+ * starts the GNS control thread and waits for a signal that it is up and
+ * work may continue.
+ */
 int ll_gns_start_thread(void)
 {
-        struct l_wait_info lwi = { 0 };
         int rc;
+        ENTRY;
 
         LASSERT(gns_thread.t_flags == 0);
-
+        init_completion(&gns_ctl.gc_starting);
+        init_completion(&gns_ctl.gc_finishing);
         init_waitqueue_head(&gns_thread.t_ctl_waitq);
-        rc = kernel_thread(ll_gns_thread_main, NULL, CLONE_VM | CLONE_FILES);
+        
+        rc = kernel_thread(ll_gns_thread_main, &gns_ctl,
+                           (CLONE_VM | CLONE_FILES));
         if (rc < 0) {
-                CERROR("cannot start thread: %d\n", rc);
-                return rc;
+                CERROR("cannot start GNS control thread, "
+                       "err = %d\n", rc);
+                RETURN(rc);
         }
-        l_wait_event(gns_thread.t_ctl_waitq, gns_thread.t_flags & SVC_RUNNING,
-                     &lwi);
-        return 0;
+        wait_for_completion(&gns_ctl.gc_starting);
+        LASSERT(gns_thread.t_flags == SVC_RUNNING);
+        RETURN(0);
 }
 
+/* stops the GNS control thread and waits until it has actually stopped. */
 void ll_gns_stop_thread(void)
 {
-        struct l_wait_info lwi = { 0 };
-
+        ENTRY;
         gns_thread.t_flags = SVC_STOPPING;
-
         wake_up(&gns_thread.t_ctl_waitq);
-        l_wait_event(gns_thread.t_ctl_waitq, gns_thread.t_flags & SVC_STOPPED,
-                     &lwi);
+        wait_for_completion(&gns_ctl.gc_finishing);
+        LASSERT(gns_thread.t_flags == SVC_STOPPED);
         gns_thread.t_flags = 0;
+        EXIT;
 }
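
Taken together, the llite_gns.c changes replace the one-shot GNS mount with an upcall-driven flow: ll_gns_mount_object() reads the per-sb mount object (ll_gns_oname), runs the configured upcall (ll_gns_upcall) with its contents and the directory path, waits for the mountpoint to appear, and registers the new vfsmount so the control thread can later unmount it once it has been idle longer than ll_gns_timeout. A hedged user-space model of just that aging logic is sketched below; every name in it is invented for illustration and none of it is kernel code.

/*
 * Hedged user-space model of the GNS aging logic only: entries idle longer
 * than the timeout are reaped on each tick of the control thread.
 */
#include <stdio.h>
#include <time.h>

#define DEMO_TIMEOUT 120        /* mirrors GNS_MOUNT_TIMEOUT */
#define DEMO_TICK      1        /* mirrors GNS_TICK_TIMEOUT: scan this often */

struct demo_mount {
        const char *path;
        time_t      last_used;
        int         mounted;
};

/* one scan, as ll_gns_check_mounts() would do on each timer wakeup */
static void demo_check_mounts(struct demo_mount *m, int n, time_t now)
{
        int i;

        for (i = 0; i < n; i++) {
                if (!m[i].mounted)
                        continue;
                if (now - m[i].last_used < DEMO_TIMEOUT)
                        continue;               /* still fresh, keep it */
                printf("reaping idle mount %s\n", m[i].path);
                m[i].mounted = 0;               /* stands in for do_umount() */
        }
}

int main(void)
{
        struct demo_mount mounts[] = {
                { "/gns/a", time(NULL) - 300, 1 },      /* idle, reaped */
                { "/gns/b", time(NULL) - 10,  1 },      /* recent, kept */
        };

        demo_check_mounts(mounts, 2, time(NULL));
        return 0;
}
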
index 2dd8aae..ec99d29 100644 (file)
@@ -34,6 +34,15 @@ struct ll_ra_info {
         unsigned long             ra_stats[_NR_RA_STAT];
 };
 
+/* after roughly how long should we remove an inactive mount? */
+#define GNS_MOUNT_TIMEOUT 120
+
+/* how often should the GNS timer look for mounts to cleanup? */
+#define GNS_TICK_TIMEOUT  1
+
+/* how many times GNS will wait one second for the mount to appear */
+#define GNS_WAIT_ATTEMPTS 10
+
 struct ll_sb_info {
         /* this protects pglist and max_r_a_pages.  It isn't safe to grab from
          * interrupt contexts. */
@@ -78,16 +87,36 @@ struct ll_sb_info {
         struct list_head          ll_mnt_list;
 
         struct semaphore          ll_gns_sem;
+        spinlock_t                ll_gns_lock;
         wait_queue_head_t         ll_gns_waitq;
-        struct completion         ll_gns_completion;
         int                       ll_gns_state;
         struct timer_list         ll_gns_timer;
         struct list_head          ll_gns_sbi_head;
+
+        unsigned long             ll_gns_tick;
+        unsigned long             ll_gns_timeout;
+        struct completion         ll_gns_mount_finished;
+
+        /* path to upcall */
+        char                      ll_gns_upcall[PATH_MAX];
+
+        /* mount object entry name */
+        char                      ll_gns_oname[PATH_MAX];
+};
+
+struct ll_gns_ctl {
+        struct completion gc_starting;
+        struct completion gc_finishing;
 };
 
-#define LL_GNS_STATE_IDLE     1100
-#define LL_GNS_STATE_MOUNTING 1101
-#define LL_GNS_STATE_FINISHED 1102
+/* mounting states */
+#define LL_GNS_IDLE               (1 << 0)
+#define LL_GNS_MOUNTING           (1 << 1)
+#define LL_GNS_FINISHED           (1 << 2)
+
+/* mounts checking flags */
+#define LL_GNS_UMOUNT             (1 << 0)
+#define LL_GNS_CHECK              (1 << 1)
 
 struct ll_readahead_state {
         spinlock_t      ras_lock;
@@ -98,6 +127,7 @@ struct ll_readahead_state {
 };
 
 extern kmem_cache_t *ll_file_data_slab;
+extern kmem_cache_t *ll_intent_slab;
 struct lustre_handle;
 struct ll_file_data {
         struct obd_client_handle fd_mds_och;
@@ -192,7 +222,13 @@ void ll_truncate(struct inode *inode);
 /* llite/file.c */
 extern struct file_operations ll_file_operations;
 extern struct inode_operations ll_file_inode_operations;
-extern int ll_inode_revalidate_it(struct dentry *, struct lookup_intent *);
+extern int ll_inode_revalidate_it(struct dentry *);
+extern int ll_setxattr(struct dentry *, const char *, const void *,
+                       size_t, int);
+extern int ll_getxattr(struct dentry *, const char *, void *, size_t);
+extern int ll_listxattr(struct dentry *, char *, size_t);
+extern int ll_removexattr(struct dentry *, const char *);
+extern int ll_inode_permission(struct inode *, int, struct nameidata *);
 int ll_refresh_lsm(struct inode *inode, struct lov_stripe_md *lsm);
 int ll_extent_lock(struct ll_file_data *, struct inode *,
                    struct lov_stripe_md *, int mode, ldlm_policy_data_t *,
@@ -208,8 +244,7 @@ int ll_local_open(struct file *file, struct lookup_intent *it);
 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
                  struct file *file);
 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
-int ll_getattr(struct vfsmount *mnt, struct dentry *de,
-               struct lookup_intent *it, struct kstat *stat);
+int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat);
 #endif
 void ll_stime_record(struct ll_sb_info *sbi, struct timeval *start,
                      struct obd_service_time *stime);
@@ -217,6 +252,8 @@ void ll_stime_record(struct ll_sb_info *sbi, struct timeval *start,
 /* llite/dcache.c */
 void ll_intent_drop_lock(struct lookup_intent *);
 void ll_intent_release(struct lookup_intent *);
+int ll_intent_alloc(struct lookup_intent *);
+void ll_intent_free(struct lookup_intent *it);
 extern void ll_set_dd(struct dentry *de);
 void ll_unhash_aliases(struct inode *);
 void ll_frob_intent(struct lookup_intent **itp, struct lookup_intent *deft);
@@ -226,22 +263,26 @@ int revalidate_it_finish(struct ptlrpc_request *request, int offset,
 
 
 /* llite/llite_gns.c */
-int ll_finish_gns(struct ll_sb_info *sbi);
-int fill_page_with_path(struct dentry *, struct vfsmount *, char **pagep);
-int ll_dir_process_mount_object(struct dentry *, struct vfsmount *);
-int ll_gns_umount_all(struct ll_sb_info *sbi, int timeout);
+int ll_gns_start_thread(void);
+void ll_gns_stop_thread(void);
+
+int ll_gns_mount_object(struct dentry *dentry,
+                        struct vfsmount *mnt);
+int ll_gns_umount_object(struct vfsmount *mnt);
+
+int ll_gns_check_mounts(struct ll_sb_info *sbi,
+                        int flags);
+
 void ll_gns_timer_callback(unsigned long data);
 void ll_gns_add_timer(struct ll_sb_info *sbi);
 void ll_gns_del_timer(struct ll_sb_info *sbi);
-int ll_gns_start_thread(void);
-void ll_gns_stop_thread(void);
 
 /* llite/llite_lib.c */
 extern struct super_operations lustre_super_operations;
 
 char *ll_read_opt(const char *opt, char *data);
 int ll_set_opt(const char *opt, char *data, int fl);
-void ll_options(char *options, char **ost, char **mds, int *flags);
+void ll_options(char *options, char **ost, char **mds, char **sec, int *flags);
 void ll_lli_init(struct ll_inode_info *lli);
 int ll_fill_super(struct super_block *sb, void *data, int silent);
 int lustre_fill_super(struct super_block *sb, void *data, int silent);
@@ -335,7 +376,6 @@ int ll_get_fid(struct obd_export *exp, struct lustre_id *idp,
 #if  (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
 #define    ll_s2sbi(sb)        ((struct ll_sb_info *)((sb)->s_fs_info))
 #define    ll_set_sbi(sb, sbi) ((sb)->s_fs_info = sbi)
-void __d_rehash(struct dentry * entry, int lock);
 static inline __u64 ll_ts2u64(struct timespec *time)
 {
         __u64 t = time->tv_sec;
index f0443a1..338a597 100644 (file)
 #include <linux/lustre_ha.h>
 #include <linux/lustre_dlm.h>
 #include <linux/lprocfs_status.h>
+#include <linux/lustre_acl.h>
 #include "llite_internal.h"
 
 kmem_cache_t *ll_file_data_slab;
+kmem_cache_t *ll_intent_slab;
 
 extern struct address_space_operations ll_aops;
 extern struct address_space_operations ll_dir_aops;
@@ -63,13 +65,28 @@ struct ll_sb_info *lustre_init_sbi(struct super_block *sb)
         INIT_LIST_HEAD(&sbi->ll_conn_chain);
         INIT_HLIST_HEAD(&sbi->ll_orphan_dentry_list);
         INIT_LIST_HEAD(&sbi->ll_mnt_list);
+       
         sema_init(&sbi->ll_gns_sem, 1);
-        init_completion(&sbi->ll_gns_completion);
-        sbi->ll_gns_state = LL_GNS_STATE_IDLE;
+        spin_lock_init(&sbi->ll_gns_lock);
+        INIT_LIST_HEAD(&sbi->ll_gns_sbi_head);
+        init_waitqueue_head(&sbi->ll_gns_waitq);
+        init_completion(&sbi->ll_gns_mount_finished);
+
+        /* this may later be reset via /proc/fs/... */
+        memcpy(sbi->ll_gns_oname, ".mntinfo", strlen(".mntinfo"));
+        sbi->ll_gns_oname[strlen(sbi->ll_gns_oname) - 1] = '\0';
+        
+        /* this may later be reset via /proc/fs/... */
+        memset(sbi->ll_gns_upcall, 0, sizeof(sbi->ll_gns_upcall));
+
+        /* default values, may be changed via /proc/fs/... */
+        sbi->ll_gns_state = LL_GNS_IDLE;
+        sbi->ll_gns_tick = GNS_TICK_TIMEOUT;
+        sbi->ll_gns_timeout = GNS_MOUNT_TIMEOUT;
+
         sbi->ll_gns_timer.data = (unsigned long)sbi;
         sbi->ll_gns_timer.function = ll_gns_timer_callback;
         init_timer(&sbi->ll_gns_timer);
-        INIT_LIST_HEAD(&sbi->ll_gns_sbi_head);
 
         ll_set_sbi(sb, sbi);
 
@@ -104,7 +121,10 @@ int lustre_init_dt_desc(struct ll_sb_info *sbi)
         RETURN(rc);
 }
 
-int lustre_common_fill_super(struct super_block *sb, char *lmv, char *lov)
+extern struct dentry_operations ll_d_ops;
+
+int lustre_common_fill_super(struct super_block *sb, char *lmv, char *lov,
+                             char *security, __u32 *nllu)
 {
         struct ll_sb_info *sbi = ll_s2sbi(sb);
         struct ptlrpc_request *request = NULL;
@@ -124,6 +144,25 @@ int lustre_common_fill_super(struct super_block *sb, char *lmv, char *lov)
                 RETURN(-EINVAL);
         }
 
+        if (security == NULL)
+                security = "null";
+
+        err = obd_set_info(obd->obd_self_export, strlen("sec"), "sec",
+                           strlen(security), security);
+        if (err) {
+                CERROR("LMV %s: failed to set security %s, err %d\n",
+                        lmv, security, err);
+                RETURN(err);
+        }
+
+        err = obd_set_info(obd->obd_self_export, strlen("nllu"), "nllu",
+                           sizeof(__u32) * 2, nllu);
+        if (err) {
+                CERROR("LMV %s: failed to set NLLU, err %d\n",
+                        lmv, err);
+                RETURN(err);
+        }
+
         if (proc_lustre_fs_root) {
                 err = lprocfs_register_mountpoint(proc_lustre_fs_root, sb,
                                                   lov, lmv);
@@ -199,7 +238,7 @@ int lustre_common_fill_super(struct super_block *sb, char *lmv, char *lov)
         /* make root inode
          * XXX: move this to after cbd setup? */
         err = md_getattr(sbi->ll_md_exp, &sbi->ll_rootid,
-                         (OBD_MD_FLNOTOBD | OBD_MD_FLBLOCKS | OBD_MD_FID),
+                         (OBD_MD_FLNOTOBD | OBD_MD_FLBLOCKS | OBD_MD_FID), NULL, 0,
                          0, &request);
         if (err) {
                 CERROR("md_getattr failed for root: rc = %d\n", err);
@@ -241,6 +280,7 @@ int lustre_common_fill_super(struct super_block *sb, char *lmv, char *lov)
 #endif
 
         sb->s_root = d_alloc_root(root);
+        sb->s_root->d_op = &ll_d_ops;
 
 #ifdef S_PDIROPS
         CWARN("Enabling PDIROPS\n");
@@ -327,7 +367,7 @@ int ll_set_opt(const char *opt, char *data, int fl)
                 RETURN(fl);
 }
 
-void ll_options(char *options, char **lov, char **lmv, int *flags)
+void ll_options(char *options, char **lov, char **lmv, char **sec, int *flags)
 {
         char *this_char;
 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
@@ -352,6 +392,8 @@ void ll_options(char *options, char **lov, char **lmv, int *flags)
                         continue;
                 if (!*lmv && (*lmv = ll_read_opt("mdc", this_char)))
                         continue;
+                if (!*sec && (*sec = ll_read_opt("sec", this_char)))
+                        continue;
                 if (!(*flags & LL_SBI_NOLCK) &&
                     ((*flags) = (*flags) |
                                 ll_set_opt("nolock", this_char,
@@ -378,6 +420,8 @@ int ll_fill_super(struct super_block *sb, void *data, int silent)
         struct ll_sb_info *sbi;
         char *lov = NULL;
         char *lmv = NULL;
+        char *sec = NULL;
+        __u32 nllu[2] = { 99, 99 };
         int err;
         ENTRY;
 
@@ -388,7 +432,7 @@ int ll_fill_super(struct super_block *sb, void *data, int silent)
                 RETURN(-ENOMEM);
 
         sbi->ll_flags |= LL_SBI_READAHEAD;
-        ll_options(data, &lov, &lmv, &sbi->ll_flags);
+        ll_options(data, &lov, &lmv, &sec, &sbi->ll_flags);
 
         if (!lov) {
                 CERROR("no osc\n");
@@ -400,12 +444,14 @@ int ll_fill_super(struct super_block *sb, void *data, int silent)
                 GOTO(out, err = -EINVAL);
         }
 
-        err = lustre_common_fill_super(sb, lmv, lov);
+        err = lustre_common_fill_super(sb, lmv, lov, sec, nllu);
         EXIT;
 out:
         if (err)
                 lustre_free_sbi(sb);
 
+        if (sec)
+                OBD_FREE(sec, strlen(sec) + 1);
         if (lmv)
                 OBD_FREE(lmv, strlen(lmv) + 1);
         if (lov)
@@ -426,8 +472,7 @@ static int lustre_process_log(struct lustre_mount_data *lmd, char *profile,
         class_uuid_t uuid;
         struct obd_uuid lmv_uuid;
         struct llog_ctxt *ctxt;
-        int rc = 0;
-        int err;
+        int rc, err = 0;
         ENTRY;
 
         if (lmd_bad_magic(lmd))
@@ -440,9 +485,9 @@ static int lustre_process_log(struct lustre_mount_data *lmd, char *profile,
                 PCFG_INIT(pcfg, NAL_CMD_REGISTER_MYNID);
                 pcfg.pcfg_nal = lmd->lmd_nal;
                 pcfg.pcfg_nid = lmd->lmd_local_nid;
-                err = libcfs_nal_cmd(&pcfg);
-                if (err <0)
-                        GOTO(out, err);
+                rc = libcfs_nal_cmd(&pcfg);
+                if (rc < 0)
+                        GOTO(out, rc);
         }
 
         if (lmd->lmd_nal == SOCKNAL ||
@@ -455,9 +500,9 @@ static int lustre_process_log(struct lustre_mount_data *lmd, char *profile,
                 pcfg.pcfg_nid     = lmd->lmd_server_nid;
                 pcfg.pcfg_id      = lmd->lmd_server_ipaddr;
                 pcfg.pcfg_misc    = lmd->lmd_port;
-                err = libcfs_nal_cmd(&pcfg);
-                if (err <0)
-                        GOTO(out, err);
+                rc = libcfs_nal_cmd(&pcfg);
+                if (rc < 0)
+                        GOTO(out, rc);
         }
 
         LCFG_INIT(lcfg, LCFG_ADD_UUID, name);
@@ -465,9 +510,9 @@ static int lustre_process_log(struct lustre_mount_data *lmd, char *profile,
         lcfg.lcfg_inllen1 = strlen(peer) + 1;
         lcfg.lcfg_inlbuf1 = peer;
         lcfg.lcfg_nal = lmd->lmd_nal;
-        err = class_process_config(&lcfg);
-        if (err < 0)
-                GOTO(out_del_conn, err);
+        rc = class_process_config(&lcfg);
+        if (rc < 0)
+                GOTO(out_del_conn, rc);
 
         LCFG_INIT(lcfg, LCFG_ATTACH, name);
         lcfg.lcfg_inlbuf1 = "mdc";
@@ -475,33 +520,38 @@ static int lustre_process_log(struct lustre_mount_data *lmd, char *profile,
         lcfg.lcfg_inlbuf2 = lmv_uuid.uuid;
         lcfg.lcfg_inllen2 = strlen(lcfg.lcfg_inlbuf2) + 1;
         err = class_process_config(&lcfg);
-        if (err < 0)
-                GOTO(out_del_uuid, err);
+        if (rc < 0)
+                GOTO(out_del_uuid, rc);
 
         LCFG_INIT(lcfg, LCFG_SETUP, name);
         lcfg.lcfg_inlbuf1 = lmd->lmd_mds;
         lcfg.lcfg_inllen1 = strlen(lcfg.lcfg_inlbuf1) + 1;
         lcfg.lcfg_inlbuf2 = peer;
         lcfg.lcfg_inllen2 = strlen(lcfg.lcfg_inlbuf2) + 1;
-        err = class_process_config(&lcfg);
-        if (err < 0)
-                GOTO(out_detach, err);
+        rc = class_process_config(&lcfg);
+        if (rc < 0)
+                GOTO(out_detach, rc);
 
         obd = class_name2obd(name);
         if (obd == NULL)
-                GOTO(out_cleanup, err = -EINVAL);
+                GOTO(out_cleanup, rc = -EINVAL);
+
+        rc = obd_set_info(obd->obd_self_export, strlen("sec"), "sec",
+                          strlen(lmd->lmd_security), lmd->lmd_security);
+        if (rc)
+                GOTO(out_cleanup, rc);
 
         /* Disable initial recovery on this import */
-        err = obd_set_info(obd->obd_self_export,
-                           strlen("initial_recov"), "initial_recov",
-                           sizeof(allow_recov), &allow_recov);
-        if (err)
-                GOTO(out_cleanup, err);
+        rc = obd_set_info(obd->obd_self_export,
+                          strlen("initial_recov"), "initial_recov",
+                          sizeof(allow_recov), &allow_recov);
+        if (rc)
+                GOTO(out_cleanup, rc);
 
-        err = obd_connect(&md_conn, obd, &lmv_uuid, 0);
-        if (err) {
-                CERROR("cannot connect to %s: rc = %d\n", lmd->lmd_mds, err);
-                GOTO(out_cleanup, err);
+        rc = obd_connect(&md_conn, obd, &lmv_uuid, 0);
+        if (rc) {
+                CERROR("cannot connect to %s: rc = %d\n", lmd->lmd_mds, rc);
+                GOTO(out_cleanup, rc);
         }
 
         exp = class_conn2export(&md_conn);
@@ -511,7 +561,7 @@ static int lustre_process_log(struct lustre_mount_data *lmd, char *profile,
         if (rc)
                 CERROR("class_config_process_llog failed: rc = %d\n", rc);
 
-        err = obd_disconnect(exp, 0);
+        rc = obd_disconnect(exp, 0);
         
         EXIT;
 out_cleanup:
@@ -538,12 +588,16 @@ out_del_conn:
             lmd->lmd_nal == IIBNAL ||
             lmd->lmd_nal == VIBNAL ||
             lmd->lmd_nal == RANAL) {
+                int err2;
+
                 PCFG_INIT(pcfg, NAL_CMD_DEL_PEER);
                 pcfg.pcfg_nal     = lmd->lmd_nal;
                 pcfg.pcfg_nid     = lmd->lmd_server_nid;
                 pcfg.pcfg_flags   = 1;          /* single_share */
-                err = libcfs_nal_cmd(&pcfg);
-                if (err <0)
+                err2 = libcfs_nal_cmd(&pcfg);
+                if (err2 && !err)
+                        err = err2;
+                if (err < 0)
                         GOTO(out, err);
         }
 out:
@@ -580,6 +634,7 @@ int lustre_fill_super(struct super_block *sb, void *data, int silent)
                         CERROR("no mds name\n");
                         GOTO(out_free, err = -EINVAL);
                 }
+                lmd->lmd_security[sizeof(lmd->lmd_security) - 1] = 0;
 
                 OBD_ALLOC(sbi->ll_lmd, sizeof(*sbi->ll_lmd));
                 if (sbi->ll_lmd == NULL)
@@ -631,7 +686,8 @@ int lustre_fill_super(struct super_block *sb, void *data, int silent)
                 GOTO(out_free, err = -EINVAL);
         }
 
-        err = lustre_common_fill_super(sb, lmv, lov);
+        err = lustre_common_fill_super(sb, lmv, lov, lmd->lmd_security,
+                                       &lmd->lmd_nllu);
 
         if (err)
                 GOTO(out_free, err);
@@ -957,7 +1013,7 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr)
         /* If only OST attributes being set on objects, don't do MDS RPC.
          * In that case, we need to check permissions and update the local
          * inode ourselves so we can call obdo_from_inode() always. */
-        if (ia_valid & (lsm ? ~(ATTR_SIZE | ATTR_FROM_OPEN | ATTR_RAW) : ~0)) {
+        if (ia_valid & (lsm ? ~(ATTR_SIZE | ATTR_FROM_OPEN /*| ATTR_RAW*/) : ~0)) {
                 struct lustre_md md;
 
                 OBD_ALLOC(op_data, sizeof(*op_data));
@@ -1094,8 +1150,8 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr)
 
 int ll_setattr(struct dentry *de, struct iattr *attr)
 {
-        LBUG(); /* code is unused, but leave this in case of VFS changes */
-        RETURN(-ENOSYS);
+        LASSERT(de->d_inode);
+        return ll_setattr_raw(de->d_inode, attr);
 }
 
 int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs,
@@ -1184,10 +1240,12 @@ void ll_update_inode(struct inode *inode, struct lustre_md *md)
         struct lov_stripe_md *lsm = md->lsm;
         struct mds_body *body = md->body;
         struct mea *mea = md->mea;
+        struct posix_acl *ll_acl_access = md->acl_access;
         ENTRY;
 
         LASSERT((lsm != NULL) == ((body->valid & OBD_MD_FLEASIZE) != 0));
         LASSERT((mea != NULL) == ((body->valid & OBD_MD_FLDIREA) != 0));
+
         if (lsm != NULL) {
                 LASSERT(lsm->lsm_object_gr > 0);
                 if (lli->lli_smd == NULL) {
@@ -1250,6 +1308,14 @@ void ll_update_inode(struct inode *inode, struct lustre_md *md)
        if (body->valid & OBD_MD_FLGENER)
                id_gen(&lli->lli_id) = id_gen(&body->id1);
 
+        spin_lock(&lli->lli_lock);
+        if (ll_acl_access != NULL) {
+                if (lli->lli_acl_access != NULL)
+                        posix_acl_release(lli->lli_acl_access);
+                lli->lli_acl_access = ll_acl_access;
+        }
+        spin_unlock(&lli->lli_lock);
+
         if (body->valid & OBD_MD_FLID)
                 inode->i_ino = id_ino(&body->id1);
         if (body->valid & OBD_MD_FLGENER)
@@ -1415,7 +1481,7 @@ int ll_iocontrol(struct inode *inode, struct file *file,
                 struct mds_body *body;
 
                 ll_inode2id(&id, inode);
-                rc = md_getattr(sbi->ll_md_exp, &id, valid, 0, &req);
+                rc = md_getattr(sbi->ll_md_exp, &id, valid, NULL, 0, 0, &req);
                 if (rc) {
                         CERROR("failure %d inode %lu\n", rc, inode->i_ino);
                         RETURN(-abs(rc));
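The ll_update_inode() hunk above now caches the access ACL returned by the MDS (md->acl_access) in lli->lli_acl_access under lli->lli_lock, and the directory/special/symlink inode_operations below gain .permission hooks. A minimal sketch of how such a hook could consume the cached ACL; the real ll_inode_permission added by this commit is not in these hunks, so the helper name and the -EAGAIN fallback are illustrative assumptions:

/*
 * Illustrative only: consume the ACL cached by ll_update_inode().
 * Returning -EAGAIN follows the kernel's check_acl convention of
 * "no ACL decision, fall back to the regular mode bits".
 */
static int example_check_acl(struct inode *inode, int mask)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct posix_acl *acl;
        int rc = -EAGAIN;

        spin_lock(&lli->lli_lock);
        acl = posix_acl_dup(lli->lli_acl_access);
        spin_unlock(&lli->lli_lock);

        if (acl != NULL) {
                rc = posix_acl_permission(inode, acl, mask);
                posix_acl_release(acl);
        }
        return rc;
}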
index e94b605..2d35405 100644 (file)
@@ -61,7 +61,8 @@ static struct inode *search_inode_for_lustre(struct super_block *sb,
         id_ino(&id) = (__u64)ino;
         id_gen(&id) = generation;
 
-        rc = md_getattr(sbi->ll_md_exp, &id, valid, eadatalen, &req);
+        rc = md_getattr(sbi->ll_md_exp, &id, valid, NULL, 0,
+                        eadatalen, &req);
         if (rc) {
                 CERROR("failure %d inode %lu\n", rc, ino);
                 return ERR_PTR(rc);
index 665e9d7..815c1ac 100644 (file)
@@ -35,7 +35,6 @@ struct file_operations llite_dump_pgcache_fops;
 struct file_operations ll_ra_stats_fops;
 struct file_operations llite_wait_times_fops;
 
-
 #ifndef LPROCFS
 int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
                                 struct super_block *sb, char *osc, char *mdc)
@@ -263,6 +262,126 @@ static int ll_wr_max_read_ahead_mb(struct file *file, const char *buffer,
         return count;
 }
 
+static int ll_rd_gns_upcall(char *page, char **start, off_t off,
+                            int count, int *eof, void *data)
+{
+        struct super_block *sb = (struct super_block *)data;
+        struct ll_sb_info *sbi = ll_s2sbi(sb);
+        int len;
+
+        down(&sbi->ll_gns_sem);
+        len = snprintf(page, count, "%s\n", sbi->ll_gns_upcall);
+        up(&sbi->ll_gns_sem);
+
+        return len;
+}
+
+static int ll_wr_gns_upcall(struct file *file, const char *buffer,
+                            unsigned long count, void *data)
+{
+        struct super_block *sb = (struct super_block *)data;
+        struct ll_sb_info *sbi = ll_s2sbi(sb);
+
+        down(&sbi->ll_gns_sem);
+        snprintf(sbi->ll_gns_upcall, count, "%s", buffer);
+        up(&sbi->ll_gns_sem);
+
+        return count;
+}
+
+static int ll_rd_gns_object_name(char *page, char **start, off_t off,
+                                 int count, int *eof, void *data)
+{
+        struct super_block *sb = (struct super_block *)data;
+        struct ll_sb_info *sbi = ll_s2sbi(sb);
+        int len;
+
+        down(&sbi->ll_gns_sem);
+        len = snprintf(page, count, "%s\n", sbi->ll_gns_oname);
+        up(&sbi->ll_gns_sem);
+
+        return len;
+}
+
+static int ll_wr_gns_object_name(struct file *file, const char *buffer,
+                                 unsigned long count, void *data)
+{
+        struct super_block *sb = (struct super_block *)data;
+        struct ll_sb_info *sbi = ll_s2sbi(sb);
+
+        down(&sbi->ll_gns_sem);
+        snprintf(sbi->ll_gns_oname, count, "%s", buffer);
+        up(&sbi->ll_gns_sem);
+
+        return count;
+}
+
+static int ll_rd_gns_timeout(char *page, char **start, off_t off,
+                             int count, int *eof, void *data)
+{
+        struct super_block *sb = (struct super_block *)data;
+        struct ll_sb_info *sbi = ll_s2sbi(sb);
+        int len;
+
+        down(&sbi->ll_gns_sem);
+        len = snprintf(page, count, "%lu\n",
+                       (unsigned long)sbi->ll_gns_timeout);
+        up(&sbi->ll_gns_sem);
+
+        return len;
+}
+
+static int ll_wr_gns_timeout(struct file *file, const char *buffer,
+                             unsigned long count, void *data)
+{
+        struct super_block *sb = (struct super_block *)data;
+        struct ll_sb_info *sbi = ll_s2sbi(sb);
+        int val, rc;
+
+        rc = lprocfs_write_helper(buffer, count, &val);
+        if (rc)
+                return rc;
+
+        down(&sbi->ll_gns_sem);
+        sbi->ll_gns_timeout = val;
+        up(&sbi->ll_gns_sem);
+
+        return count;
+}
+
+static int ll_rd_gns_tick(char *page, char **start, off_t off,
+                          int count, int *eof, void *data)
+{
+        struct super_block *sb = (struct super_block *)data;
+        struct ll_sb_info *sbi = ll_s2sbi(sb);
+        int len;
+
+        down(&sbi->ll_gns_sem);
+        len = snprintf(page, count, "%lu\n",
+                       (unsigned long)sbi->ll_gns_tick);
+        up(&sbi->ll_gns_sem);
+
+        return len;
+}
+
+static int ll_wr_gns_tick(struct file *file, const char *buffer,
+                          unsigned long count, void *data)
+{
+        struct super_block *sb = (struct super_block *)data;
+        struct ll_sb_info *sbi = ll_s2sbi(sb);
+        int val, rc;
+
+        rc = lprocfs_write_helper(buffer, count, &val);
+        if (rc)
+                return rc;
+
+        down(&sbi->ll_gns_sem);
+        if (val < sbi->ll_gns_timeout)
+                sbi->ll_gns_tick = val;
+        up(&sbi->ll_gns_sem);
+
+        return count;
+}
 static struct lprocfs_vars lprocfs_obd_vars[] = {
         { "uuid",         ll_rd_sb_uuid,          0, 0 },
         //{ "mntpt_path",   ll_rd_path,             0, 0 },
@@ -278,6 +397,19 @@ static struct lprocfs_vars lprocfs_obd_vars[] = {
         { "config_update", 0, ll_wr_config_update, 0 },
         { "max_read_ahead_mb", ll_rd_max_read_ahead_mb,
                                ll_wr_max_read_ahead_mb, 0 },
+
+        { "gns_upcall", ll_rd_gns_upcall,
+          ll_wr_gns_upcall, 0 },
+        
+        { "gns_timeout", ll_rd_gns_timeout,
+          ll_wr_gns_timeout, 0 },
+        
+        { "gns_tick", ll_rd_gns_tick,
+          ll_wr_gns_tick, 0 },
+        
+        { "gns_object_name", ll_rd_gns_object_name,
+          ll_wr_gns_object_name, 0 },
+        
         { 0 }
 };
 
@@ -329,7 +461,8 @@ struct llite_file_opcode {
                                    "direct_read" },
         { LPROC_LL_DIRECT_WRITE,   LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_PAGES,
                                    "direct_write" },
-
+        { LPROC_LL_SETXATTR,       LPROCFS_TYPE_REGS, "setxattr" },
+        { LPROC_LL_GETXATTR,       LPROCFS_TYPE_REGS, "getxattr" },
 };
 
 int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
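The four gns_* handlers above are standard lprocfs read/write pairs serialized by ll_gns_sem. A hypothetical user-space snippet showing how an administrator would drive them; the /proc path is an assumption, since the actual directory name comes from lprocfs_register_mountpoint():

/* Illustration only: tune GNS from user space (the proc path is assumed). */
#include <stdio.h>

int main(void)
{
        /* time out idle GNS mounts after 60s, let the scanner tick every 5s */
        FILE *f = fopen("/proc/fs/lustre/llite/fs0/gns_timeout", "w");
        if (f == NULL)
                return 1;
        fprintf(f, "60\n");
        fclose(f);

        f = fopen("/proc/fs/lustre/llite/fs0/gns_tick", "w");
        if (f == NULL)
                return 1;
        fprintf(f, "5\n");
        fclose(f);
        return 0;
}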
index 0b16f62..d291096 100644 (file)
@@ -254,7 +254,7 @@ struct dentry *ll_find_alias(struct inode *inode, struct dentry *de)
                         list_del_init(&dentry->d_lru);
 
                 hlist_del_init(&dentry->d_hash);
-                __d_rehash(dentry, 0); /* avoid taking dcache_lock inside */
+                __d_rehash(dentry); /* avoid taking dcache_lock inside */
                 spin_unlock(&dcache_lock);
                 atomic_inc(&dentry->d_count);
                 iput(inode);
@@ -294,7 +294,7 @@ static int lookup_it_finish(struct ptlrpc_request *request, int offset,
                 CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)\n",
                        inode, inode->i_ino, inode->i_generation);
                 
-                mdc_set_lock_data(NULL, &it->d.lustre.it_lock_handle, inode);
+                mdc_set_lock_data(NULL, &LUSTRE_IT(it)->it_lock_handle, inode);
                 
                 /* If this is a stat, get the authoritative file size */
                 if (it->it_op == IT_GETATTR && S_ISREG(inode->i_mode) &&
@@ -329,20 +329,17 @@ static int lookup_it_finish(struct ptlrpc_request *request, int offset,
 }
 
 static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry,
-                                   struct nameidata *nd, struct lookup_intent *it,
-                                   int flags)
+                                   struct nameidata *nd, int flags)
 {
         struct dentry *save = dentry, *retval;
+        struct lookup_intent *it = flags ? &nd->intent.open : NULL;
         struct lustre_id pid;
         struct it_cb_data icbd;
         struct ptlrpc_request *req = NULL;
         struct lookup_intent lookup_it = { .it_op = IT_LOOKUP };
-        int rc;
+        int rc, orig_it;
         ENTRY;
 
-        if (dentry->d_name.len > EXT3_NAME_LEN)
-                RETURN(ERR_PTR(-ENAMETOOLONG));
-
         CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p),intent=%s\n",
                dentry->d_name.name, parent->i_ino, parent->i_generation,
                parent, LL_IT2STR(it));
@@ -353,6 +350,7 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry,
         if (nd != NULL)
                 nd->mnt->mnt_last_used = jiffies;
 
+        orig_it = it ? it->it_op : IT_OPEN;
         ll_frob_intent(&it, &lookup_it);
 
         icbd.icbd_childp = &dentry;
@@ -376,8 +374,12 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry,
         if (nd &&
             dentry->d_inode != NULL && dentry->d_inode->i_mode & S_ISUID &&
             S_ISDIR(dentry->d_inode->i_mode) &&
-            (flags & LOOKUP_CONTINUE || (it->it_op & (IT_CHDIR | IT_OPEN))))
-                ll_dir_process_mount_object(dentry, nd->mnt);
+            ((flags & LOOKUP_CONTINUE) || (orig_it & (IT_CHDIR | IT_OPEN))))
+        {
+                spin_lock(&dentry->d_lock);
+                dentry->d_flags |= DCACHE_GNS_PENDING;
+                spin_unlock(&dentry->d_lock);
+        }
 
         if (dentry == save)
                 GOTO(out, retval = NULL);
@@ -386,6 +388,8 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry,
  out:
         if (req)
                 ptlrpc_req_finished(req);
+        if (it == &lookup_it)
+                ll_intent_release(it);
         if (dentry->d_inode)
                 CDEBUG(D_INODE, "lookup 0x%p in %lu/%lu: %*s -> %lu/%lu\n",
                        dentry,
@@ -411,9 +415,9 @@ static struct dentry *ll_lookup_nd(struct inode *parent, struct dentry *dentry,
         ENTRY;
 
         if (nd && nd->flags & LOOKUP_LAST && !(nd->flags & LOOKUP_LINK_NOTLAST))
-                de = ll_lookup_it(parent, dentry, nd, &nd->intent, nd->flags);
+                de = ll_lookup_it(parent, dentry, nd, nd->flags);
         else
-                de = ll_lookup_it(parent, dentry, nd, NULL, 0);
+                de = ll_lookup_it(parent, dentry, nd, 0);
 
         RETURN(de);
 }
@@ -431,9 +435,10 @@ static struct inode *ll_create_node(struct inode *dir, const char *name,
         int rc;
         ENTRY;
 
-        LASSERT(it && it->d.lustre.it_disposition);
 
-        request = it->d.lustre.it_data;
+        LASSERT(it && LUSTRE_IT(it)->it_disposition);
+
+        request = LUSTRE_IT(it)->it_data;
         rc = ll_prep_inode(sbi->ll_dt_exp, sbi->ll_md_exp,
                            &inode, request, 1, dir->i_sb);
         if (rc)
@@ -446,7 +451,7 @@ static struct inode *ll_create_node(struct inode *dir, const char *name,
          * stuff it in the lock. */
         CDEBUG(D_DLMTRACE, "setting l_ast_data to inode %p (%lu/%u)\n",
                inode, inode->i_ino, inode->i_generation);
-        mdc_set_lock_data(NULL, &it->d.lustre.it_lock_handle, inode);
+        mdc_set_lock_data(NULL, &LUSTRE_IT(it)->it_lock_handle, inode);
         EXIT;
  out:
         ptlrpc_req_finished(request);
@@ -471,7 +476,7 @@ static int ll_create_it(struct inode *dir, struct dentry *dentry, int mode,
                         struct lookup_intent *it)
 {
         struct inode *inode;
-        struct ptlrpc_request *request = it->d.lustre.it_data;
+        struct ptlrpc_request *request = LUSTRE_IT(it)->it_data;
         struct obd_export *md_exp = ll_i2mdexp(dir); 
         int rc = 0;
         ENTRY;
@@ -497,7 +502,7 @@ static int ll_create_it(struct inode *dir, struct dentry *dentry, int mode,
 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
 static int ll_create_nd(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd)
 {
-        return ll_create_it(dir, dentry, mode, &nd->intent);
+        return ll_create_it(dir, dentry, mode, &nd->intent.open);
 }
 #endif
 
@@ -533,9 +538,6 @@ static int ll_mknod_raw(struct nameidata *nd, int mode, dev_t rdev)
         CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p)\n",
                name, dir->i_ino, dir->i_generation, dir);
 
-        if (dir->i_nlink >= EXT3_LINK_MAX)
-                RETURN(err);
-
         mode &= ~current->fs->umask;
 
         switch (mode & S_IFMT) {
@@ -582,9 +584,6 @@ static int ll_mknod(struct inode *dir, struct dentry *child,
         CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p)\n",
                name, dir->i_ino, dir->i_generation, dir);
 
-        if (dir->i_nlink >= EXT3_LINK_MAX)
-                RETURN(err);
-
         mode &= ~current->fs->umask;
 
         switch (mode & S_IFMT) {
@@ -640,14 +639,12 @@ static int ll_symlink_raw(struct nameidata *nd, const char *tgt)
 
         CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p),target=%s\n",
                name, dir->i_ino, dir->i_generation, dir, tgt);
-
-        if (dir->i_nlink >= EXT3_LINK_MAX)
-                RETURN(err);
-
+
         OBD_ALLOC(op_data, sizeof(*op_data));
         if (op_data == NULL)
                 RETURN(-ENOMEM);
         ll_prepare_mdc_data(op_data, dir, NULL, name, len, 0);
+        LASSERT(tgt);
         err = md_create(sbi->ll_md_exp, op_data,
                         tgt, strlen(tgt) + 1, S_IFLNK | S_IRWXUGO,
                         current->fsuid, current->fsgid, 0, &request);
@@ -883,17 +880,53 @@ static int ll_rename_raw(struct nameidata *oldnd, struct nameidata *newnd)
         RETURN(err);
 }
 
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+#define LLITE_IT_RAWOPS (IT_MKNOD|IT_MKDIR|IT_SYMLINK|IT_LINK|IT_UNLINK|IT_RMDIR|IT_RENAME)
+static int ll_rawop_from_intent(struct nameidata *nd)
+{
+        int error = 0;
+
+        if (!nd || !(nd->intent.open.op & LLITE_IT_RAWOPS))
+                return 0;
+
+        switch (nd->intent.open.op) {
+        case IT_MKNOD:
+                error = ll_mknod_raw(nd, nd->intent.open.create_mode,
+                                     nd->intent.open.create.dev);
+                break;
+        case IT_MKDIR:
+                error = ll_mkdir_raw(nd, nd->intent.open.create_mode);
+                break;
+        case IT_RMDIR:
+                error = ll_rmdir_raw(nd);
+                break;
+        case IT_UNLINK:
+                error = ll_unlink_raw(nd);
+                break;
+        case IT_SYMLINK:
+                LASSERT(nd->intent.open.create.link);
+                error = ll_symlink_raw(nd, nd->intent.open.create.link);
+                break;
+        case IT_LINK:
+                error = ll_link_raw(nd->intent.open.create.source_nd, nd);
+                break;
+        case IT_RENAME:
+                LASSERT(nd->intent.open.create.source_nd);
+                error = ll_rename_raw(nd->intent.open.create.source_nd, nd);
+                break;
+        default:
+                LBUG();
+        }
+        if (error != -EOPNOTSUPP)
+                nd->intent.open.flags |= IT_STATUS_RAW;
+
+        return error;
+}
+#endif
+
 struct inode_operations ll_dir_inode_operations = {
-        .link_raw           = ll_link_raw,
-        .unlink_raw         = ll_unlink_raw,
-        .symlink_raw        = ll_symlink_raw,
-        .mkdir_raw          = ll_mkdir_raw,
-        .rmdir_raw          = ll_rmdir_raw,
-        .mknod_raw          = ll_mknod_raw,
         .mknod              = ll_mknod,
-        .rename_raw         = ll_rename_raw,
         .setattr            = ll_setattr,
-        .setattr_raw        = ll_setattr_raw,
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
         .create_it          = ll_create_it,
         .lookup_it          = ll_lookup_it,
@@ -901,6 +934,12 @@ struct inode_operations ll_dir_inode_operations = {
 #else
         .lookup             = ll_lookup_nd,
         .create             = ll_create_nd,
-        .getattr_it         = ll_getattr,
+        .getattr            = ll_getattr,
+        .endparentlookup    = ll_rawop_from_intent,
 #endif
+        .setxattr           = ll_setxattr,
+        .getxattr           = ll_getxattr,
+        .listxattr          = ll_listxattr,
+        .removexattr        = ll_removexattr,
+        .permission         = ll_inode_permission,
 };
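From here on, every it->d.lustre.it_* access becomes LUSTRE_IT(it)->it_*, and code that builds a lookup_intent on the stack (see the lmv hunks below) now OBD_ALLOCs a struct lustre_intent_data into it.d.fs_data and OBD_FREEs it when done. A minimal sketch of what that accessor layer presumably looks like; the real definitions come from the lustre_lite.h/intent changes elsewhere in this commit, so the exact field types are assumptions drawn from how the fields are used in these hunks:

/*
 * Sketch only -- not the authoritative definition.  Field names match
 * their uses in the hunks (it_disposition, it_status, it_lock_mode,
 * it_lock_handle, it_data); their types are assumed.
 */
struct lustre_intent_data {
        int     it_disposition;
        int     it_status;
        int     it_lock_mode;
        __u64   it_lock_handle;
        void   *it_data;
};

/* The intent's filesystem-private pointer (it->d.fs_data) now carries a
 * dynamically allocated lustre_intent_data instead of the old in-place
 * d.lustre member. */
#define LUSTRE_IT(it) ((struct lustre_intent_data *)((it)->d.fs_data))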
index ae0d11f..befc716 100644 (file)
@@ -320,7 +320,7 @@ static int ll_special_open(struct inode *inode, struct file *filp)
                 rc = err;
         }
 
-        req = it->d.lustre.it_data;
+        req = LUSTRE_IT(it)->it_data;
         if (req)
                 ptlrpc_req_finished(req);
 
@@ -338,13 +338,18 @@ static int ll_special_file_release(struct inode *inode, struct file *filp)
 }
 
 struct inode_operations ll_special_inode_operations = {
-        .setattr_raw    = ll_setattr_raw,
         .setattr        = ll_setattr,
 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
-        .getattr_it     = ll_getattr,
+        .getattr        = ll_getattr,
 #else
         .revalidate_it  = ll_inode_revalidate_it,
 #endif
+        .setxattr       = ll_setxattr,
+        .getxattr       = ll_getxattr,
+        .listxattr      = ll_listxattr,
+        .removexattr    = ll_removexattr,
+        .permission     = ll_inode_permission,
+
 };
 
 struct file_operations ll_special_chr_inode_fops = {
index fcb89b0..f267dfc 100644 (file)
@@ -64,8 +64,7 @@ static struct super_block *lustre_read_super(struct super_block *sb,
 static void ll_umount_lustre(struct super_block *sb)
 {
         struct ll_sb_info *sbi = ll_s2sbi(sb);
-
-        ll_gns_umount_all(sbi, 0);
+        ll_gns_check_all(sbi, LL_GNS_UMOUNT);
 }
 
 static struct file_system_type lustre_lite_fs_type = {
@@ -108,6 +107,16 @@ static int __init init_lustre_lite(void)
         if (ll_file_data_slab == NULL)
                 return -ENOMEM;
 
+        ll_intent_slab = kmem_cache_create("lustre_intent_data",
+                                              sizeof(struct lustre_intent_data),
+                                              0, SLAB_HWCACHE_ALIGN, NULL,
+                                              NULL);
+        if (ll_intent_slab == NULL) {
+                kmem_cache_destroy(ll_file_data_slab);
+                return -ENOMEM;
+        }
+
+
         proc_lustre_fs_root = proc_lustre_root ? proc_mkdir("llite", proc_lustre_root) : NULL;
 
         rc = register_filesystem(&lustre_lite_fs_type);
@@ -146,6 +155,8 @@ static void __exit exit_lustre_lite(void)
 
         LASSERTF(kmem_cache_destroy(ll_file_data_slab) == 0,
                  "couldn't destroy ll_file_data slab\n");
+        LASSERTF(kmem_cache_destroy(ll_intent_slab) == 0,
+                 "couldn't destroy ll_intent_slab slab\n");
 
         if (proc_lustre_fs_root) {
                 lprocfs_remove(proc_lustre_fs_root);
index 8ebcc4b..22c165a 100644 (file)
@@ -114,7 +114,7 @@ struct file_system_type lustre_lite_fs_type = {
         .name         = "lustre_lite",
         .get_sb       = ll_get_sb,
         .kill_sb      = kill_anon_super,
-        .fs_flags     = FS_BINARY_MOUNTDATA,
+        .fs_flags     = FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
 
 struct file_system_type lustre_fs_type = {
@@ -122,7 +122,7 @@ struct file_system_type lustre_fs_type = {
         .name         = "lustre",
         .get_sb       = lustre_get_sb,
         .kill_sb      = kill_anon_super,
-        .fs_flags     = FS_BINARY_MOUNTDATA,
+        .fs_flags     = FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
 
 static int __init init_lustre_lite(void)
@@ -143,6 +143,16 @@ static int __init init_lustre_lite(void)
                 rc = -ENOMEM;
                 goto out;
         }
+        ll_intent_slab = kmem_cache_create("lustre_intent_data",
+                                              sizeof(struct lustre_intent_data),
+                                              0, SLAB_HWCACHE_ALIGN, NULL,
+                                              NULL);
+        if (ll_intent_slab == NULL) {
+                kmem_cache_destroy(ll_file_data_slab);
+                ll_destroy_inodecache();
+                return -ENOMEM;
+        }
+
 
         proc_lustre_fs_root = proc_lustre_root ?
                               proc_mkdir("llite", proc_lustre_root) : NULL;
@@ -178,9 +188,13 @@ static void __exit exit_lustre_lite(void)
         unregister_filesystem(&lustre_fs_type);
         unregister_filesystem(&lustre_lite_fs_type);
         ll_destroy_inodecache();
+
+        ll_gns_stop_thread();
         
         LASSERTF(kmem_cache_destroy(ll_file_data_slab) == 0,
                  "couldn't destroy ll_file_data slab\n");
+        LASSERTF(kmem_cache_destroy(ll_intent_slab) == 0,
+                 "couldn't destroy ll_intent_slab slab\n");
         if (proc_lustre_fs_root) {
                 lprocfs_remove(proc_lustre_fs_root);
                 proc_lustre_fs_root = NULL;
index f913d8a..6061f74 100644 (file)
@@ -49,8 +49,9 @@ static int ll_readlink_internal(struct inode *inode,
         }
 
         ll_inode2id(&id, inode);
-        rc = md_getattr(sbi->ll_md_exp, &id, OBD_MD_LINKNAME, symlen,
+        rc = md_getattr(sbi->ll_md_exp, &id, OBD_MD_LINKNAME, NULL, 0, symlen,
                         request);
+
         if (rc) {
                 if (rc != -ENOENT)
                         CERROR("inode %lu: rc = %d\n", inode->i_ino, rc);
@@ -152,11 +153,14 @@ static int ll_follow_link(struct dentry *dentry, struct nameidata *nd)
 struct inode_operations ll_fast_symlink_inode_operations = {
         .readlink       = ll_readlink,
         .setattr        = ll_setattr,
-        .setattr_raw    = ll_setattr_raw,
         .follow_link    = ll_follow_link,
+        .setxattr       = ll_setxattr,
+        .getxattr       = ll_getxattr,
+        .listxattr      = ll_listxattr,
+        .removexattr    = ll_removexattr,
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
         .revalidate_it  = ll_inode_revalidate_it
 #else 
-        .getattr_it     = ll_getattr
+        .getattr        = ll_getattr
 #endif
 };
index ce3d0f0..205d4a7 100644 (file)
 #include <linux/lprocfs_status.h>
 #include <linux/lustre_fsfilt.h>
 #include <linux/obd_lmv.h>
+#include <linux/namei.h>
+#include <linux/lustre_lite.h>
 #include "lmv_internal.h"
 
 
 static inline void lmv_drop_intent_lock(struct lookup_intent *it)
 {
-        if (it->d.lustre.it_lock_mode != 0)
-                ldlm_lock_decref((void *)&it->d.lustre.it_lock_handle,
-                                 it->d.lustre.it_lock_mode);
+        if (LUSTRE_IT(it)->it_lock_mode != 0)
+                ldlm_lock_decref((void *)&LUSTRE_IT(it)->it_lock_handle,
+                                 LUSTRE_IT(it)->it_lock_mode);
 }
 
 int lmv_handle_remote_inode(struct obd_export *exp, void *lmm,
@@ -89,17 +91,17 @@ int lmv_handle_remote_inode(struct obd_export *exp, void *lmm,
                 }
 
                 /* we got LOOKUP lock, but we really need attrs */
-                pmode = it->d.lustre.it_lock_mode;
+                pmode = LUSTRE_IT(it)->it_lock_mode;
                 if (pmode) {
-                        memcpy(&plock, &it->d.lustre.it_lock_handle,
+                        memcpy(&plock, &LUSTRE_IT(it)->it_lock_handle,
                                sizeof(plock));
-                        it->d.lustre.it_lock_mode = 0;
+                        LUSTRE_IT(it)->it_lock_mode = 0;
                 }
 
                 LASSERT((body->valid & OBD_MD_FID) != 0);
                 
                 nid = body->id1;
-                it->d.lustre.it_disposition &= ~DISP_ENQ_COMPLETE;
+                LUSTRE_IT(it)->it_disposition &= ~DISP_ENQ_COMPLETE;
                 rc = md_intent_lock(lmv->tgts[id_group(&nid)].ltd_exp, &nid, NULL,
                                     0, lmm, lmmsize, NULL, it, flags, &req, cb_blocking);
 
@@ -110,9 +112,9 @@ int lmv_handle_remote_inode(struct obd_export *exp, void *lmm,
                  */
                 if (rc == 0) {
                         lmv_drop_intent_lock(it);
-                        memcpy(&it->d.lustre.it_lock_handle, &plock,
+                        memcpy(&LUSTRE_IT(it)->it_lock_handle, &plock,
                                sizeof(plock));
-                        it->d.lustre.it_lock_mode = pmode;
+                        LUSTRE_IT(it)->it_lock_mode = pmode;
                 } else if (pmode)
                         ldlm_lock_decref(&plock, pmode);
 
@@ -194,7 +196,7 @@ repeat:
          * nothing is found, do not access body->id1 as it is zero and thus
          * pointless.
          */
-        if (it->d.lustre.it_disposition & DISP_LOOKUP_NEG)
+        if (LUSTRE_IT(it)->it_disposition & DISP_LOOKUP_NEG)
                 RETURN(0);
 
         /* caller may use attrs MDS returns on IT_OPEN lock request so, we have
@@ -317,7 +319,7 @@ int lmv_intent_getattr(struct obd_export *exp, struct lustre_id *pid,
          * nothing is found, do not access body->id1 as it is zero and thus
          * pointless.
          */
-        if (it->d.lustre.it_disposition & DISP_LOOKUP_NEG)
+        if (LUSTRE_IT(it)->it_disposition & DISP_LOOKUP_NEG)
                 RETURN(0);
                 
         body = lustre_msg_buf((*reqp)->rq_repmsg, 1, sizeof(*body));
@@ -406,11 +408,13 @@ int lmv_lookup_slaves(struct obd_export *exp, struct ptlrpc_request **reqp)
                 /* is obj valid? */
                 memset(&it, 0, sizeof(it));
                 it.it_op = IT_GETATTR;
+                OBD_ALLOC(it.d.fs_data, sizeof(struct lustre_intent_data));
+
                 rc = md_intent_lock(lmv->tgts[id_group(&id)].ltd_exp, &id,
                                     NULL, 0, NULL, 0, &id, &it, 0, &req,
                                     lmv_dirobj_blocking_ast);
                 
-                lockh = (struct lustre_handle *)&it.d.lustre.it_lock_handle;
+                lockh = (struct lustre_handle *)&LUSTRE_IT(&it)->it_lock_handle;
                 if (rc > 0 && req == NULL) {
                         /* nice, this slave is valid */
                         LASSERT(req == NULL);
@@ -418,10 +422,11 @@ int lmv_lookup_slaves(struct obd_export *exp, struct ptlrpc_request **reqp)
                         goto release_lock;
                 }
 
-                if (rc < 0)
+                if (rc < 0) {
+                        OBD_FREE(it.d.fs_data, sizeof(struct lustre_intent_data));
                         /* error during lookup */
                         GOTO(cleanup, rc);
-                
+                }
                 lock = ldlm_handle2lock(lockh);
                 LASSERT(lock);
 
@@ -442,8 +447,9 @@ int lmv_lookup_slaves(struct obd_export *exp, struct ptlrpc_request **reqp)
 release_lock:
                 lmv_update_body_from_obj(body, obj->objs + i);
 
-                if (it.d.lustre.it_lock_mode)
-                        ldlm_lock_decref(lockh, it.d.lustre.it_lock_mode);
+                if (LUSTRE_IT(&it)->it_lock_mode)
+                        ldlm_lock_decref(lockh, LUSTRE_IT(&it)->it_lock_mode);
+                OBD_FREE(it.d.fs_data, sizeof(struct lustre_intent_data));
         }
 
         EXIT;
@@ -655,8 +661,10 @@ int lmv_revalidate_slaves(struct obd_export *exp, struct ptlrpc_request **reqp,
 
                 memset(&it, 0, sizeof(it));
                 it.it_op = IT_GETATTR;
+
                 cb = lmv_dirobj_blocking_ast;
 
+                OBD_ALLOC(it.d.fs_data, sizeof(struct lustre_intent_data));
                 if (id_equal_fid(&id, &obj->id)) {
                         if (master_valid) {
                                 /* lmv_intent_getattr() already checked
@@ -678,11 +686,12 @@ int lmv_revalidate_slaves(struct obd_export *exp, struct ptlrpc_request **reqp,
                         cb = cb_blocking;
                 }
 
+
                 /* is obj valid? */
                 rc = md_intent_lock(lmv->tgts[id_group(&id)].ltd_exp,
                                     &id, NULL, 0, NULL, 0, &id, &it, 0, 
                                     &req, cb);
-                lockh = (struct lustre_handle *) &it.d.lustre.it_lock_handle;
+                lockh = (struct lustre_handle *) &LUSTRE_IT(&it)->it_lock_handle;
                 if (rc > 0 && req == NULL) {
                         /* nice, this slave is valid */
                         LASSERT(req == NULL);
@@ -690,17 +699,18 @@ int lmv_revalidate_slaves(struct obd_export *exp, struct ptlrpc_request **reqp,
                         goto release_lock;
                 }
 
-                if (rc < 0)
+                if (rc < 0) {
+                        OBD_FREE(it.d.fs_data, sizeof(struct lustre_intent_data));
                         /* error during revalidation */
                         GOTO(cleanup, rc);
-
+                }
                 if (master) {
                         LASSERT(master_valid == 0);
                         /* save lock on master to be returned to the caller */
                         CDEBUG(D_OTHER, "no lock on master yet\n");
                         memcpy(&master_lockh, lockh, sizeof(master_lockh));
-                        master_lock_mode = it.d.lustre.it_lock_mode;
-                        it.d.lustre.it_lock_mode = 0;
+                        master_lock_mode = LUSTRE_IT(&it)->it_lock_mode;
+                        LUSTRE_IT(&it)->it_lock_mode = 0;
                 } else {
                         /* this is slave. we want to control it */
                         lock = ldlm_handle2lock(lockh);
@@ -726,14 +736,15 @@ update:
                 
                 CDEBUG(D_OTHER, "fresh: %lu\n",
                        (unsigned long)obj->objs[i].size);
-
+
                 if (req)
                         ptlrpc_req_finished(req);
 release_lock:
                 size += obj->objs[i].size;
 
-                if (it.d.lustre.it_lock_mode)
-                        ldlm_lock_decref(lockh, it.d.lustre.it_lock_mode);
+                if (LUSTRE_IT(&it)->it_lock_mode)
+                        ldlm_lock_decref(lockh, LUSTRE_IT(&it)->it_lock_mode);
+                OBD_FREE(it.d.fs_data, sizeof(struct lustre_intent_data));
         }
 
         if (*reqp) {
@@ -757,16 +768,16 @@ release_lock:
 //                        body->mds = id_group(&obj->id);
                 }
                 if (master_valid == 0) {
-                        memcpy(&oit->d.lustre.it_lock_handle,
+                        memcpy(&LUSTRE_IT(oit)->it_lock_handle,
                                &master_lockh, sizeof(master_lockh));
-                        oit->d.lustre.it_lock_mode = master_lock_mode;
+                        LUSTRE_IT(oit)->it_lock_mode = master_lock_mode;
                 }
                 rc = 0;
         } else {
                 /* it seems all the attrs are fresh and we did no request */
                 CDEBUG(D_OTHER, "all the attrs were fresh\n");
                 if (master_valid == 0)
-                        oit->d.lustre.it_lock_mode = master_lock_mode;
+                        LUSTRE_IT(oit)->it_lock_mode = master_lock_mode;
                 rc = 1;
         }
 
index 7aac1f0..86b1f97 100644 (file)
@@ -31,6 +31,7 @@
 #include <linux/pagemap.h>
 #include <asm/div64.h>
 #include <linux/seq_file.h>
+#include <linux/namei.h>
 #else
 #include <liblustre.h>
 #endif
@@ -47,6 +48,7 @@
 #include <linux/lprocfs_status.h>
 #include <linux/lustre_fsfilt.h>
 #include <linux/obd_lmv.h>
+#include <linux/lustre_lite.h>
 #include "lmv_internal.h"
 
 /* object cache. */
@@ -675,8 +677,8 @@ static int lmv_getstatus(struct obd_export *exp, struct lustre_id *id)
 }
 
 static int lmv_getattr(struct obd_export *exp, struct lustre_id *id,
-                       __u64 valid, unsigned int ea_size,
-                       struct ptlrpc_request **request)
+                       __u64 valid, const char *ea_name, int ea_namelen,
+                       unsigned int ea_size, struct ptlrpc_request **request)
 {
         struct obd_device *obd = exp->exp_obd;
         struct lmv_obd *lmv = &obd->u.lmv;
@@ -690,8 +692,9 @@ static int lmv_getattr(struct obd_export *exp, struct lustre_id *id,
 
         LASSERT(i < lmv->desc.ld_tgt_count);
 
+
         rc = md_getattr(lmv->tgts[i].ltd_exp, id, valid,
-                        ea_size, request);
+                        ea_name, ea_namelen, ea_size, request);
         if (rc)
                 RETURN(rc);
         
@@ -860,7 +863,7 @@ int lmv_get_mea_and_update_object(struct obd_export *exp,
 
         /* time to update mea of parent id */
         rc = md_getattr(lmv->tgts[id_group(id)].ltd_exp,
-                        id, valid, mealen, &req);
+                        id, valid, NULL, 0, mealen, &req);
         if (rc) {
                 CERROR("md_getattr() failed, error %d\n", rc);
                 GOTO(cleanup, rc);
@@ -994,17 +997,17 @@ int lmv_enqueue_slaves(struct obd_export *exp, int locktype,
                                 cb_compl, cb_blocking, cb_data);
                 
                 CDEBUG(D_OTHER, "take lock on slave "DLID4" -> %d/%d\n",
-                       OLID4(&mea->mea_ids[i]), rc, it->d.lustre.it_status);
+                       OLID4(&mea->mea_ids[i]), rc, LUSTRE_IT(it)->it_status);
                 if (rc)
                         GOTO(cleanup, rc);
-                if (it->d.lustre.it_data) {
+                if (LUSTRE_IT(it)->it_data) {
                         struct ptlrpc_request *req;
-                        req = (struct ptlrpc_request *)it->d.lustre.it_data;
+                        req = (struct ptlrpc_request *) LUSTRE_IT(it)->it_data;
                         ptlrpc_req_finished(req);
                 }
                 
-                if (it->d.lustre.it_status)
-                        GOTO(cleanup, rc = it->d.lustre.it_status);
+                if (LUSTRE_IT(it)->it_status)
+                        GOTO(cleanup, rc = LUSTRE_IT(it)->it_status);
         }
         
         OBD_FREE(data2, sizeof(*data2));
@@ -1827,7 +1830,46 @@ int lmv_set_info(struct obd_export *exp, obd_count keylen,
                 lmv_set_timeouts(obd);
                 RETURN(0);
         }
-        
+
+        /* XXX: perhaps this broadcast-to-all-targets path should be the default. */
+        if ((keylen == strlen("sec") && strcmp(key, "sec") == 0) ||
+            (keylen == strlen("nllu") && strcmp(key, "nllu") == 0)) {
+                struct lmv_tgt_desc *tgt;
+                struct obd_export *exp;
+                int rc = 0, err, i;
+
+                spin_lock(&lmv->lmv_lock);
+                for (i = 0, tgt = lmv->tgts; i < lmv->desc.ld_tgt_count;
+                     i++, tgt++) {
+                        exp = tgt->ltd_exp;
+                        /* during setup time the connections to mdc
+                         * might not have been established yet.
+                         */
+                        if (exp == NULL) {
+                                struct obd_device *tgt_obd;
+
+                                tgt_obd = class_find_client_obd(&tgt->uuid,
+                                                                LUSTRE_MDC_NAME,
+                                                                &obd->obd_uuid);
+                                if (!tgt_obd) {
+                                        CERROR("can't set info %s, "
+                                               "device %s not attached?\n",
+                                                (char *) key, tgt->uuid.uuid);
+                                        rc = -EINVAL;
+                                        continue;
+                                }
+                                exp = tgt_obd->obd_self_export;
+                        }
+
+                        err = obd_set_info(exp, keylen, key, vallen, val);
+                        if (!rc)
+                                rc = err;
+                }
+                spin_unlock(&lmv->lmv_lock);
+
+                RETURN(rc);
+        }
+
         RETURN(-EINVAL);
 }
 
index 447320c..4c2ef10 100644 (file)
@@ -310,7 +310,7 @@ lmv_create_obj(struct obd_export *exp, struct lustre_id *id, struct mea *mea)
                 valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
 
                 rc = md_getattr(lmv->tgts[id_group(id)].ltd_exp,
-                                id, valid, mealen, &req);
+                                id, valid, NULL, 0, mealen, &req);
                 if (rc) {
                         CERROR("md_getattr() failed, error %d\n", rc);
                         GOTO(cleanup, obj = ERR_PTR(rc));
index 18aa9e4..7246a1d 100644 (file)
@@ -3116,6 +3116,41 @@ static int lov_set_info(struct obd_export *exp, obd_count keylen,
         } else if (KEY_IS("unlinked") || KEY_IS("unrecovery")) {
                 if (vallen != 0)
                         RETURN(-EINVAL);
+        } else if (KEY_IS("sec")) {
+                struct lov_tgt_desc *tgt;
+                struct obd_export *exp;
+                int rc = 0, err, i;
+
+                spin_lock(&lov->lov_lock);
+                for (i = 0, tgt = lov->tgts; i < lov->desc.ld_tgt_count;
+                     i++, tgt++) {
+                        exp = tgt->ltd_exp;
+                        /* during setup time the connections to osc
+                         * might not have been established yet.
+                         */
+                        if (exp == NULL) {
+                                struct obd_device *tgt_obd;
+
+                                tgt_obd = class_find_client_obd(&tgt->uuid,
+                                                                LUSTRE_OSC_NAME,
+                                                                &obddev->obd_uuid);
+                                if (!tgt_obd) {
+                                        CERROR("can't set security flavor, "
+                                               "device %s not attached?\n",
+                                                tgt->uuid.uuid);
+                                        rc = -EINVAL;
+                                        continue;
+                                }
+                                exp = tgt_obd->obd_self_export;
+                        }
+
+                        err = obd_set_info(exp, keylen, key, vallen, val);
+                        if (!rc)
+                                rc = err;
+                }
+                spin_unlock(&lov->lov_lock);
+
+                RETURN(rc);
         } else {
                 RETURN(-EINVAL);
         }
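Both the lmv and lov set_info hunks above fan the "sec" key (and, for lmv, "nllu") out to every target export, so the flavor chosen at mount time reaches each MDC/OSC, falling back to the target's self-export when the connection is not yet established. The call shape used to push the flavor down is the one llite already issues in the lustre_process_log() hunk near the top; a hedged restatement with an invented helper name:

/* Illustration only: push a mount-time security flavor to one export. */
static int example_push_sec_flavor(struct obd_export *exp, char *flavor)
{
        /* keylen/vallen follow the strlen() convention used in this patch */
        return obd_set_info(exp, strlen("sec"), "sec",
                            strlen(flavor), flavor);
}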
index 0bf6444..fbcf400 100644 (file)
@@ -139,7 +139,7 @@ static int lvfs_reint_create(struct super_block *sb, struct reint_record *r_rec)
                 handle = fsfilt->fs_start(dir, FSFILT_OP_SYMLINK, NULL, 0);
                 if (IS_ERR(handle))
                         GOTO(cleanup, rc = PTR_ERR(handle));
-                rc = ll_vfs_symlink(dir, dentry, new_path);
+                rc = ll_vfs_symlink(dir, dentry, new_path, S_IALLUGO);
                 break;
         }
         case S_IFCHR:
index e46e120..4a71d24 100644 (file)
@@ -5,7 +5,7 @@
 
 if LIBLUSTRE
 noinst_LIBRARIES = libmdc.a
-libmdc_a_SOURCES = mdc_request.c mdc_reint.c mdc_lib.c mdc_internal.h mdc_locks.c
+libmdc_a_SOURCES = #mdc_request.c mdc_reint.c mdc_lib.c mdc_internal.h mdc_locks.c
 libmdc_a_CPPFLAGS = $(LLCPPFLAGS)
 libmdc_a_CFLAGS = $(LLCFLAGS)
 endif
index 2478afc..e3bda59 100644 (file)
 #include <linux/lustre_mds.h>
 #include <linux/lustre_dlm.h>
 #include <linux/lprocfs_status.h>
+#include <linux/lustre_acl.h>
+#include <linux/lustre_lite.h>
 #include "mdc_internal.h"
 
 int it_disposition(struct lookup_intent *it, int flag)
 {
-        return it->d.lustre.it_disposition & flag;
+        return LUSTRE_IT(it)->it_disposition & flag;
 }
 EXPORT_SYMBOL(it_disposition);
 
 void it_set_disposition(struct lookup_intent *it, int flag)
 {
-        it->d.lustre.it_disposition |= flag;
+        LUSTRE_IT(it)->it_disposition |= flag;
 }
 EXPORT_SYMBOL(it_set_disposition);
 
@@ -88,33 +90,33 @@ int it_open_error(int phase, struct lookup_intent *it)
 {
         if (it_disposition(it, DISP_OPEN_OPEN)) {
                 if (phase == DISP_OPEN_OPEN)
-                        return it->d.lustre.it_status;
+                        return LUSTRE_IT(it)->it_status;
                 else
                         return 0;
         }
 
         if (it_disposition(it, DISP_OPEN_CREATE)) {
                 if (phase == DISP_OPEN_CREATE)
-                        return it->d.lustre.it_status;
+                        return LUSTRE_IT(it)->it_status;
                 else
                         return 0;
         }
 
         if (it_disposition(it, DISP_LOOKUP_EXECD)) {
                 if (phase == DISP_LOOKUP_EXECD)
-                        return it->d.lustre.it_status;
+                        return LUSTRE_IT(it)->it_status;
                 else
                         return 0;
         }
 
         if (it_disposition(it, DISP_IT_EXECD)) {
                 if (phase == DISP_IT_EXECD)
-                        return it->d.lustre.it_status;
+                        return LUSTRE_IT(it)->it_status;
                 else
                         return 0;
         }
-        CERROR("it disp: %X, status: %d\n", it->d.lustre.it_disposition,
-               it->d.lustre.it_status);
+        CERROR("it disp: %X, status: %d\n", LUSTRE_IT(it)->it_disposition,
+               LUSTRE_IT(it)->it_status);
         LBUG();
         return 0;
 }
@@ -199,10 +201,9 @@ int mdc_enqueue(struct obd_export *exp,
         int reqsize[6] = {[MDS_REQ_SECDESC_OFF] = 0,
                           [MDS_REQ_INTENT_LOCKREQ_OFF] = sizeof(*lockreq),
                           [MDS_REQ_INTENT_IT_OFF] = sizeof(*lit)};
-        int repsize[4] = {sizeof(struct ldlm_reply),
+        int repsize[5] = {sizeof(struct ldlm_reply),
                           sizeof(struct mds_body),
-                          obddev->u.cli.cl_max_mds_easize,
-                          obddev->u.cli.cl_max_mds_cookiesize};
+                          obddev->u.cli.cl_max_mds_easize};
         int req_buffers = 3, reply_buffers = 0;
         int rc, flags = LDLM_FL_HAS_INTENT;
         void *eadata;
@@ -240,10 +241,13 @@ int mdc_enqueue(struct obd_export *exp,
                               it->it_create_mode, 0, it->it_flags,
                               lmm, lmmsize);
                 /* get ready for the reply */
-                reply_buffers = 3;
-                req->rq_replen = lustre_msg_size(3, repsize);
+                repsize[3] = 4;
+                repsize[4] = xattr_acl_size(LL_ACL_MAX_ENTRIES);
+                reply_buffers = 5;
+                req->rq_replen = lustre_msg_size(5, repsize);
         } else if (it->it_op & (IT_GETATTR | IT_LOOKUP | IT_CHDIR)) {
-                __u64 valid = data->valid | OBD_MD_FLNOTOBD | OBD_MD_FLEASIZE;
+                __u64 valid = data->valid | OBD_MD_FLNOTOBD | OBD_MD_FLEASIZE |
+                            OBD_MD_FLACL_ACCESS;
 
                 reqsize[req_buffers++] = sizeof(struct mds_body);
                 reqsize[req_buffers++] = data->namelen + 1;
@@ -267,8 +271,10 @@ int mdc_enqueue(struct obd_export *exp,
                                  valid, it->it_flags, data);
                 
                 /* get ready for the reply */
-                reply_buffers = 3;
-                req->rq_replen = lustre_msg_size(3, repsize);
+                repsize[3] = 4;
+                repsize[4] = xattr_acl_size(LL_ACL_MAX_ENTRIES);
+                reply_buffers = 5;
+                req->rq_replen = lustre_msg_size(5, repsize);
         } else if (it->it_op == IT_READDIR) {
                 policy.l_inodebits.bits = MDS_INODELOCK_UPDATE;
                 req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_DLM_VERSION,
@@ -353,12 +359,12 @@ int mdc_enqueue(struct obd_export *exp,
         LASSERT(dlm_rep != NULL);           /* checked by ldlm_cli_enqueue() */
         LASSERT_REPSWABBED(req, 0);         /* swabbed by ldlm_cli_enqueue() */
 
-        it->d.lustre.it_disposition = (int) dlm_rep->lock_policy_res1;
-        it->d.lustre.it_status = (int) dlm_rep->lock_policy_res2;
-        it->d.lustre.it_lock_mode = lock_mode;
-        it->d.lustre.it_data = req;
+        LUSTRE_IT(it)->it_disposition = (int) dlm_rep->lock_policy_res1;
+        LUSTRE_IT(it)->it_status = (int) dlm_rep->lock_policy_res2;
+        LUSTRE_IT(it)->it_lock_mode = lock_mode;
+        LUSTRE_IT(it)->it_data = req;
 
-        if (it->d.lustre.it_status < 0 && req->rq_replay) {
+        if (LUSTRE_IT(it)->it_status < 0 && req->rq_replay) {
                 LASSERT(req->rq_transno == 0);
                 /* Don't hold error requests for replay. */
                 spin_lock(&req->rq_lock);
@@ -367,10 +373,11 @@ int mdc_enqueue(struct obd_export *exp,
         }
 
         DEBUG_REQ(D_RPCTRACE, req, "disposition: %x, status: %d",
-                  it->d.lustre.it_disposition, it->d.lustre.it_status);
+                  LUSTRE_IT(it)->it_disposition, LUSTRE_IT(it)->it_status);
 
         /* We know what to expect, so we do any byte flipping required here */
-        LASSERT(reply_buffers == 4 || reply_buffers == 3 || reply_buffers == 1);
+        LASSERT(reply_buffers == 5 || reply_buffers == 4 || 
+                reply_buffers == 3 || reply_buffers == 1);
         if (reply_buffers >= 3) {
                 struct mds_body *body;
 
@@ -427,15 +434,15 @@ EXPORT_SYMBOL(mdc_enqueue);
  * ll_create/ll_open gets called.
  *
  * The server will return to us, in it_disposition, an indication of
- * exactly what d.lustre.it_status refers to.
+ * exactly what LUSTRE_IT(it)->it_status refers to.
  *
- * If DISP_OPEN_OPEN is set, then d.lustre.it_status refers to the open() call,
+ * If DISP_OPEN_OPEN is set, LUSTRE_IT(it)->it_status refers to the open() call,
  * otherwise if DISP_OPEN_CREATE is set, then it status is the
  * creation failure mode.  In either case, one of DISP_LOOKUP_NEG or
  * DISP_LOOKUP_POS will be set, indicating whether the child lookup
  * was successful.
  *
- * Else, if DISP_LOOKUP_EXECD then d.lustre.it_status is the rc of the
+ * Else, if DISP_LOOKUP_EXECD then LUSTRE_IT(it)->it_status is the rc of the
  * child lookup.
  */
 int mdc_intent_lock(struct obd_export *exp, struct lustre_id *pid, 
@@ -486,9 +493,9 @@ int mdc_intent_lock(struct obd_export *exp, struct lustre_id *pid,
                                              &lockh);
                 }
                 if (rc) {
-                        memcpy(&it->d.lustre.it_lock_handle, &lockh,
+                        memcpy(&LUSTRE_IT(it)->it_lock_handle, &lockh,
                                sizeof(lockh));
-                        it->d.lustre.it_lock_mode = mode;
+                        LUSTRE_IT(it)->it_lock_mode = mode;
                 }
 
                 /* Only return failure if it was not GETATTR by cid (from
@@ -524,9 +531,9 @@ int mdc_intent_lock(struct obd_export *exp, struct lustre_id *pid,
                 if (rc < 0)
                         RETURN(rc);
                 
-                memcpy(&it->d.lustre.it_lock_handle, &lockh, sizeof(lockh));
+                memcpy(&LUSTRE_IT(it)->it_lock_handle, &lockh, sizeof(lockh));
         }
-        request = *reqp = it->d.lustre.it_data;
+        request = *reqp = LUSTRE_IT(it)->it_data;
         LASSERT(request != NULL);
         
         /* If we're doing an IT_OPEN which did not result in an actual
@@ -538,7 +545,7 @@ int mdc_intent_lock(struct obd_export *exp, struct lustre_id *pid,
          * 3440) */
         if (it->it_op & IT_OPEN) {
                 if (!it_disposition(it, DISP_OPEN_OPEN) ||
-                    it->d.lustre.it_status != 0) {
+                    LUSTRE_IT(it)->it_status != 0) {
                         unsigned long irqflags;
 
                         spin_lock_irqsave(&request->rq_lock, irqflags);
@@ -549,8 +556,8 @@ int mdc_intent_lock(struct obd_export *exp, struct lustre_id *pid,
         if (!it_disposition(it, DISP_IT_EXECD)) {
                 /* The server failed before it even started executing the
                  * intent, i.e. because it couldn't unpack the request. */
-                LASSERT(it->d.lustre.it_status != 0);
-                RETURN(it->d.lustre.it_status);
+                LASSERT(LUSTRE_IT(it)->it_status != 0);
+                RETURN(LUSTRE_IT(it)->it_status);
         }
         rc = it_open_error(DISP_IT_EXECD, it);
         if (rc)
@@ -620,15 +627,15 @@ int mdc_intent_lock(struct obd_export *exp, struct lustre_id *pid,
                 if (ldlm_lock_match(NULL, LDLM_FL_BLOCK_GRANTED, NULL,
                                     LDLM_IBITS, &policy, LCK_NL, &old_lock)) {
                         ldlm_lock_decref_and_cancel(&lockh,
-                                                    it->d.lustre.it_lock_mode);
+                                                    LUSTRE_IT(it)->it_lock_mode);
                         memcpy(&lockh, &old_lock, sizeof(old_lock));
-                        memcpy(&it->d.lustre.it_lock_handle, &lockh,
+                        memcpy(&LUSTRE_IT(it)->it_lock_handle, &lockh,
                                sizeof(lockh));
                 }
         }
         CDEBUG(D_DENTRY, "D_IT dentry %*s intent: %s status %d disp %x rc %d\n",
-               len, name, ldlm_it2str(it->it_op), it->d.lustre.it_status,
-               it->d.lustre.it_disposition, rc);
+               len, name, ldlm_it2str(it->it_op), LUSTRE_IT(it)->it_status,
+               LUSTRE_IT(it)->it_disposition, rc);
 
         RETURN(rc);
 }
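With the ACL additions, mdc_enqueue() now reserves up to five reply buffers for IT_OPEN and IT_GETATTR/IT_LOOKUP/IT_CHDIR intents instead of three. A sketch of the reply layout these hunks assume; the enum names are invented for readability, only the order and sizes come from the repsize[] setup above:

/* Illustrative reply-buffer layout for intent RPCs after this change. */
enum {
        REP_OFF_LDLM   = 0,  /* struct ldlm_reply                         */
        REP_OFF_BODY   = 1,  /* struct mds_body                           */
        REP_OFF_EADATA = 2,  /* LOV/LMV EA, up to cl_max_mds_easize bytes */
        REP_OFF_ACL_SZ = 3,  /* __u32 size of the ACL xattr (4 bytes)     */
        REP_OFF_ACL    = 4,  /* xattr_acl_size(LL_ACL_MAX_ENTRIES) bytes  */
};

mdc_req2lustre_md() in the next file computes acl_off as offset + 2 (EA present) or offset + 1 and reads the size there and the ACL data one buffer later, which is consistent with this layout.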
index bd7af6a..6cf6e08 100644 (file)
@@ -36,7 +36,9 @@
 #include <linux/obd_class.h>
 #include <linux/lustre_mds.h>
 #include <linux/lustre_dlm.h>
+#include <linux/lustre_sec.h>
 #include <linux/lprocfs_status.h>
+#include <linux/lustre_acl.h>
 #include "mdc_internal.h"
 
 #define REQUEST_MINOR 244
@@ -159,10 +161,10 @@ int mdc_getstatus(struct obd_export *exp, struct lustre_id *rootid)
 int mdc_getattr_common(struct obd_export *exp, unsigned int ea_size,
                        struct ptlrpc_request *req)
 {
-        struct mds_body *body;
+        struct mds_body *body, *reqbody;
         void            *eadata;
         int              rc;
-        int              repsize[2] = {sizeof(*body), 0};
+        int              repsize[4] = {sizeof(*body)};
         int              bufcount = 1;
         ENTRY;
 
@@ -173,6 +175,14 @@ int mdc_getattr_common(struct obd_export *exp, unsigned int ea_size,
                 CDEBUG(D_INODE, "reserved %u bytes for MD/symlink in packet\n",
                        ea_size);
         }
+
+        reqbody = lustre_msg_buf(req->rq_reqmsg, 1, sizeof(*reqbody));
+
+        if (reqbody->valid & OBD_MD_FLACL_ACCESS) {
+                repsize[bufcount++] = 4;
+                repsize[bufcount++] = xattr_acl_size(LL_ACL_MAX_ENTRIES);
+        }
+
         req->rq_replen = lustre_msg_size(bufcount, repsize);
 
         mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
@@ -191,25 +201,32 @@ int mdc_getattr_common(struct obd_export *exp, unsigned int ea_size,
         CDEBUG(D_NET, "mode: %o\n", body->mode);
 
         LASSERT_REPSWAB (req, 1);
-        if (body->eadatasize != 0) {
+
+        /* Skip the check if getxattr/listxattr are called with no buffers */
+        if ((reqbody->valid & (OBD_MD_FLEA | OBD_MD_FLEALIST)) &&
+            (reqbody->eadatasize != 0)) {
+                if (body->eadatasize != 0) {
                 /* reply indicates presence of eadata; check it's there... */
-                eadata = lustre_msg_buf (req->rq_repmsg, 1, body->eadatasize);
-                if (eadata == NULL) {
-                        CERROR ("Missing/short eadata\n");
-                        RETURN (-EPROTO);
-                }
-        }
+                        eadata = lustre_msg_buf (req->rq_repmsg, 1,
+                                                 body->eadatasize);
+                        if (eadata == NULL) {
+                                CERROR ("Missing/short eadata\n");
+                                RETURN (-EPROTO);
+                        }
+                }
+        }
 
         RETURN (0);
 }
 
 int mdc_getattr(struct obd_export *exp, struct lustre_id *id,
-                __u64 valid, unsigned int ea_size,
-                struct ptlrpc_request **request)
+                __u64 valid, const char *ea_name, int ea_namelen,
+                unsigned int ea_size, struct ptlrpc_request **request)
 {
         struct ptlrpc_request *req;
         struct mds_body *body;
-        int size[2] = {0, sizeof(*body)};
+        int bufcount = 2;
+        int size[3] = {0, sizeof(*body)};
         int rc;
         ENTRY;
 
@@ -218,8 +235,14 @@ int mdc_getattr(struct obd_export *exp, struct lustre_id *id,
          */
         size[0] = mdc_get_secdesc_size();
 
+        LASSERT((ea_name != NULL) == (ea_namelen != 0));
+        if (valid & (OBD_MD_FLEA | OBD_MD_FLEALIST)) {
+                size[bufcount] = ea_namelen;
+                bufcount++;
+        }
+
         req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION,
-                              MDS_GETATTR, 2, size, NULL);
+                              MDS_GETATTR, bufcount, size, NULL);
         if (!req)
                 GOTO(out, rc = -ENOMEM);
 
@@ -230,6 +253,13 @@ int mdc_getattr(struct obd_export *exp, struct lustre_id *id,
         body->valid = valid;
         body->eadatasize = ea_size;
 
+
+        if (valid & OBD_MD_FLEA) {
+                LASSERT(strnlen(ea_name, ea_namelen) == (ea_namelen - 1));
+                memcpy(lustre_msg_buf(req->rq_reqmsg, 2, ea_namelen),
+                       ea_name, ea_namelen);
+        }
+
         rc = mdc_getattr_common(exp, ea_size, req);
         if (rc != 0) {
                 ptlrpc_req_finished (req);
@@ -304,6 +334,9 @@ int mdc_req2lustre_md(struct obd_export *exp_lmv, struct ptlrpc_request *req,
                       unsigned int offset, struct obd_export *exp_lov, 
                       struct lustre_md *md)
 {
+        void *buf;
+        int size, acl_off;
+        struct posix_acl *acl;
         int rc = 0;
         ENTRY;
 
@@ -378,8 +411,38 @@ int mdc_req2lustre_md(struct obd_export *exp_lmv, struct ptlrpc_request *req,
                 CERROR("Detected invalid mea, which supports neither "
                        "the old nor the new format.\n");
         } else {
-                LASSERT(0);
+                LASSERT(S_ISCHR(md->body->mode) ||
+                        S_ISBLK(md->body->mode) ||
+                        S_ISFIFO(md->body->mode) ||
+                        S_ISLNK(md->body->mode) ||
+                        S_ISSOCK(md->body->mode));
         }
+
+        acl_off = (md->body->valid & OBD_MD_FLEASIZE) ? (offset + 2) :
+                  (offset + 1);
+
+        if (md->body->valid & OBD_MD_FLACL_ACCESS) {
+                size = le32_to_cpu(*(__u32 *) lustre_msg_buf(req->rq_repmsg, 
+                                   acl_off, 4));
+                buf = lustre_msg_buf(req->rq_repmsg, acl_off + 1, size);
+
+                acl = posix_acl_from_xattr(buf, size);
+                if (IS_ERR(acl)) {
+                        rc = PTR_ERR(acl);
+                        CERROR("converting xattr to acl failed: %d\n", rc);
+                        RETURN(rc);
+                } else if (acl) {
+                        rc = posix_acl_valid(acl);
+                        if (rc) {
+                                CERROR("acl validation failed: %d\n", rc);
+                                posix_acl_release(acl);
+                                RETURN(rc);
+                        }
+                }
+
+                md->acl_access = acl;
+        }
+
         RETURN(rc);
 }
 
@@ -844,7 +907,38 @@ int mdc_set_info(struct obd_export *exp, obd_count keylen,
                 imp->imp_server_timeout = 1;
                 CDEBUG(D_OTHER, "%s: timeout / 2\n", exp->exp_obd->obd_name);
                 RETURN(0);
+        } else if (keylen == strlen("sec") && memcmp(key, "sec", keylen) == 0) {
+                struct client_obd *cli = &exp->exp_obd->u.cli;
+
+                if (vallen == strlen("null") &&
+                    memcmp(val, "null", vallen) == 0) {
+                        cli->cl_sec_flavor = PTLRPC_SEC_NULL;
+                        cli->cl_sec_subflavor = 0;
+                        RETURN(0);
+                }
+                if (vallen == strlen("krb5i") &&
+                    memcmp(val, "krb5i", vallen) == 0) {
+                        cli->cl_sec_flavor = PTLRPC_SEC_GSS;
+                        cli->cl_sec_subflavor = PTLRPC_SEC_GSS_KRB5I;
+                        RETURN(0);
+                }
+                if (vallen == strlen("krb5p") &&
+                    memcmp(val, "krb5p", vallen) == 0) {
+                        cli->cl_sec_flavor = PTLRPC_SEC_GSS;
+                        cli->cl_sec_subflavor = PTLRPC_SEC_GSS_KRB5P;
+                        RETURN(0);
+                }
+                CERROR("unrecognized security type %s\n", (char*) val);
+                rc = -EINVAL;
+        } else if (keylen == strlen("nllu") && memcmp(key, "nllu", keylen) == 0) {
+                struct client_obd *cli = &exp->exp_obd->u.cli;
+
+                LASSERT(vallen == sizeof(__u32) * 2);
+                cli->cl_nllu = ((__u32 *) val)[0];
+                cli->cl_nllg = ((__u32 *) val)[1];
+                RETURN(0);
         }
+
         RETURN(rc);
 }
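
mdc_req2lustre_md() above relies on a simple two-buffer convention for the access ACL: buffer acl_off carries a little-endian 32-bit length and buffer acl_off + 1 carries the raw xattr blob, which is converted with the stock kernel helpers. The function below is a sketch of just that decode step, assuming the same environment the patch builds in (lustre_msg_buf() and the posix_acl_* helpers as used in the hunks); it is not part of the patch.

static int unpack_access_acl(struct ptlrpc_request *req, int acl_off,
                             struct posix_acl **aclp)
{
        struct posix_acl *acl;
        __u32 *sizep, size;
        void *buf;
        int rc;

        sizep = lustre_msg_buf(req->rq_repmsg, acl_off, 4);
        if (sizep == NULL)
                return -EPROTO;         /* size word must be present */

        size = le32_to_cpu(*sizep);
        buf = lustre_msg_buf(req->rq_repmsg, acl_off + 1, size);
        if (buf == NULL)
                return -EPROTO;         /* blob shorter than advertised */

        acl = posix_acl_from_xattr(buf, size);
        if (IS_ERR(acl))
                return PTR_ERR(acl);

        if (acl) {
                rc = posix_acl_valid(acl);
                if (rc) {
                        posix_acl_release(acl);
                        return rc;
                }
        }

        *aclp = acl;
        return 0;
}
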
 
index df14327..98f9e75 100644 (file)
@@ -1,5 +1,5 @@
 MODULES := mds
 mds-objs := mds_log.o mds_unlink_open.o mds_lov.o handler.o mds_reint.o
-mds-objs += mds_fs.o lproc_mds.o mds_open.o mds_lib.o mds_lmv.o mds_groups.o
+mds-objs += mds_fs.o lproc_mds.o mds_open.o mds_lib.o mds_lmv.o mds_lsd.o
 
 @INCLUDE_RULES@
index d328fd1..2b2c223 100644 (file)
@@ -39,6 +39,7 @@
 #include <linux/random.h>
 #include <linux/fs.h>
 #include <linux/jbd.h>
+#include <linux/namei.h>
 #include <linux/ext3_fs.h>
 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
 # include <linux/smp_lock.h>
@@ -55,6 +56,7 @@
 #include <linux/lprocfs_status.h>
 #include <linux/lustre_commit_confd.h>
 
+#include <linux/lustre_acl.h>
 #include "mds_internal.h"
 
 static int mds_intent_policy(struct ldlm_namespace *ns,
@@ -720,33 +722,150 @@ int mds_pack_md(struct obd_device *obd, struct lustre_msg *msg, int offset,
 
         RETURN(rc);
 }
+int mds_pack_link(struct dentry *dentry, struct ptlrpc_request *req,
+                  struct mds_body *repbody, int reply_off)
+{
+        struct inode *inode = dentry->d_inode;
+        char *symname;
+        int len, rc;
+        ENTRY;
+
+        symname = lustre_msg_buf(req->rq_repmsg, reply_off + 1,0);
+        LASSERT(symname != NULL);
+        len = req->rq_repmsg->buflens[reply_off + 1];
+        
+        rc = inode->i_op->readlink(dentry, symname, len);
+        if (rc < 0) {
+                CERROR("readlink failed: %d\n", rc);
+        } else if (rc != len - 1) {
+                CERROR ("Unexpected readlink rc %d: expecting %d\n",
+                        rc, len - 1);
+                rc = -EINVAL;
+        } else {
+                CDEBUG(D_INODE, "read symlink dest %s\n", symname);
+                repbody->valid |= OBD_MD_LINKNAME;
+                repbody->eadatasize = rc + 1;
+                symname[rc] = 0;        /* NULL terminate */
+                rc = 0;
+        }
+
+        RETURN(rc);
+}
+
+int mds_pack_ea(struct dentry *dentry, struct ptlrpc_request *req,
+                struct mds_body *repbody, int req_off, int reply_off)
+{
+        struct inode *inode = dentry->d_inode;
+        char *ea_name;
+        void *value = NULL;
+        int len, rc;
+        ENTRY;
+
+        ea_name = lustre_msg_string(req->rq_reqmsg, req_off + 1, 0);
+        len = req->rq_repmsg->buflens[reply_off + 1];
+        if (len != 0)
+                value = lustre_msg_buf(req->rq_repmsg, reply_off + 1, len);
+
+        rc = -EOPNOTSUPP;
+        if (inode->i_op && inode->i_op->getxattr) 
+                rc = inode->i_op->getxattr(dentry, ea_name, value, len);
 
+        if (rc < 0) {
+                if (rc != -ENODATA && rc != -EOPNOTSUPP)
+                        CERROR("getxattr failed: %d\n", rc);
+        } else {
+                repbody->valid |= OBD_MD_FLEA;
+                repbody->eadatasize = rc;
+                rc = 0;
+        }
+
+        RETURN(rc);        
+}
+
+int mds_pack_ealist(struct dentry *dentry, struct ptlrpc_request *req,
+                    struct mds_body *repbody, int reply_off)
+{
+        struct inode *inode = dentry->d_inode;        
+        void *value = NULL;
+        int len, rc;
+        ENTRY;
+
+        len = req->rq_repmsg->buflens[reply_off + 1];
+        if (len != 0)
+                value = lustre_msg_buf(req->rq_repmsg, reply_off + 1, len);
+
+        rc = -EOPNOTSUPP;
+        if (inode->i_op && inode->i_op->listxattr)
+                rc = inode->i_op->listxattr(dentry, value, len);
+
+        if (rc < 0) {
+                CERROR("listxattr failed: %d\n", rc);
+        } else {
+                repbody->valid |= OBD_MD_FLEALIST;
+                repbody->eadatasize = rc;
+                rc = 0;
+        }
+        RETURN(rc);
+}
+
+int mds_pack_acl(struct obd_device *obd, struct lustre_msg *repmsg, int offset,
+                 struct mds_body *body, struct inode *inode)
+{
+        struct dentry de = { .d_inode = inode };
+        void *buf;
+        __u32 buflen, *sizep;
+        int size;
+        ENTRY;
+
+        if (!inode->i_op->getxattr)
+                RETURN(0);
+
+        buflen = repmsg->buflens[offset + 1];
+        buf = lustre_msg_buf(repmsg, offset + 1, buflen);
+
+        size = inode->i_op->getxattr(&de, XATTR_NAME_ACL_ACCESS, buf, buflen);
+        if (size == -ENODATA)
+                RETURN(0);
+        if (size < 0)
+                RETURN(size);
+        LASSERT(size);
+
+        sizep = lustre_msg_buf(repmsg, offset, 4);
+        if (!sizep) {
+                CERROR("can't locate returned acl size buf\n");
+                RETURN(-EPROTO);
+        }
+
+        *sizep = cpu_to_le32(size);
+        body->valid |= OBD_MD_FLACL_ACCESS;
+
+        RETURN(0);
+}
+
+/* 
+ * we only take care of fsuid/fsgid.
+ */
 void mds_squash_root(struct mds_obd *mds, struct mds_req_sec_desc *rsd,
                      ptl_nid_t *peernid)
 {
-        if (!mds->mds_squash_uid ||
-            (rsd->rsd_uid && rsd->rsd_fsuid))
+        if (!mds->mds_squash_uid || rsd->rsd_fsuid)
                 return;
 
         if (*peernid == mds->mds_nosquash_nid)
                 return;
 
-        CDEBUG(D_OTHER, "squash req from 0x%llx, (%d:%d/%x)=>(%d:%d/%x)\n",
+        CDEBUG(D_SEC, "squash req from 0x%llx, (%d:%d/%x)=>(%d:%d/%x)\n",
                 *peernid, rsd->rsd_fsuid, rsd->rsd_fsgid, rsd->rsd_cap,
                 mds->mds_squash_uid, mds->mds_squash_gid,
                 (rsd->rsd_cap & ~CAP_FS_MASK));
 
-        rsd->rsd_uid = mds->mds_squash_uid;
         rsd->rsd_fsuid = mds->mds_squash_uid;
         rsd->rsd_fsgid = mds->mds_squash_gid;
-
-        /* XXX should we remove all capabilities? */
         rsd->rsd_cap &= ~CAP_FS_MASK;
 }
 
 static int mds_getattr_internal(struct obd_device *obd, struct dentry *dentry,
-                                struct ptlrpc_request *req, struct mds_body *reqbody,
-                                int reply_off)
+                                struct ptlrpc_request *req, int req_off,
+                                struct mds_body *reqbody, int reply_off)
 {
         struct inode *inode = dentry->d_inode;
         struct mds_body *body;
@@ -782,30 +901,22 @@ static int mds_getattr_internal(struct obd_device *obd, struct dentry *dentry,
                                         OBD_MD_FLATIME | OBD_MD_FLMTIME);
         } else if (S_ISLNK(inode->i_mode) &&
                    (reqbody->valid & OBD_MD_LINKNAME) != 0) {
-                int len = req->rq_repmsg->buflens[reply_off + 1];
-                char *symname = lustre_msg_buf(req->rq_repmsg, reply_off + 1, 0);
-
-                LASSERT(symname != NULL);       /* caller prepped reply */
-
-                if (!inode->i_op->readlink) {
-                        rc = -ENOSYS;
-                } else {
-                        rc = inode->i_op->readlink(dentry, symname, len);
-                        if (rc < 0) {
-                                CERROR("readlink failed: %d\n", rc);
-                        } else if (rc != len - 1) {
-                                CERROR("Unexpected readlink rc %d: expecting %d\n",
-                                        rc, len - 1);
-                                rc = -EINVAL;
-                        } else {
-                                CDEBUG(D_INODE, "read symlink dest %s\n", symname);
-                                body->valid |= OBD_MD_LINKNAME;
-                                body->eadatasize = rc + 1;
-                                symname[rc] = 0;
-                                rc = 0;
-                        }
-                }
+                rc = mds_pack_link(dentry, req, body, reply_off);
+        } else if (reqbody->valid & OBD_MD_FLEA) {
+                rc = mds_pack_ea(dentry, req, body, req_off, reply_off);
+        } else if (reqbody->valid & OBD_MD_FLEALIST) {
+                rc = mds_pack_ealist(dentry, req, body, reply_off);
         }
+        
+        if (reqbody->valid & OBD_MD_FLACL_ACCESS) {
+                int inc = (reqbody->valid & OBD_MD_FLEASIZE) ? 2 : 1;
+                rc = mds_pack_acl(obd, req->rq_repmsg, reply_off + inc, 
+                                  body, inode);
+        }                
+
+        /* do reverse uid/gid mapping if needed */
+        if (rc == 0 && req->rq_remote)
+                mds_reverse_map_ugid(req, body);
 
         RETURN(rc);
 }
@@ -834,13 +945,13 @@ out:
         return rc;
 }
 
-static int mds_getattr_pack_msg(struct ptlrpc_request *req, 
-                               struct inode *inode,
+static int mds_getattr_pack_msg(struct ptlrpc_request *req, struct dentry *de,
                                 int offset)
 {
+        struct inode *inode = de->d_inode;
         struct mds_obd *mds = mds_req2mds(req);
         struct mds_body *body;
-        int rc = 0, size[2] = {sizeof(*body)}, bufcount = 1;
+        int rc = 0, size[4] = {sizeof(*body)}, bufcount = 1;
         ENTRY;
 
         body = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*body));
@@ -853,8 +964,6 @@ static int mds_getattr_pack_msg(struct ptlrpc_request *req,
                 down(&inode->i_sem);
                 rc = fsfilt_get_md(req->rq_export->exp_obd, inode, NULL, 0);
                 up(&inode->i_sem);
-                CDEBUG(D_INODE, "got %d bytes MD data for inode %lu\n",
-                       rc, inode->i_ino);
                 if (rc < 0) {
                         if (rc != -ENODATA)
                                 CERROR("error getting inode %lu MD: rc = %d\n",
@@ -876,6 +985,42 @@ static int mds_getattr_pack_msg(struct ptlrpc_request *req,
                 bufcount++;
                 CDEBUG(D_INODE, "symlink size: %Lu, reply space: %d\n",
                        inode->i_size + 1, body->eadatasize);
+        } else if ((body->valid & OBD_MD_FLEA)) {
+                char *ea_name = lustre_msg_string(req->rq_reqmsg, 
+                                                  offset + 1, 0);
+                rc = -EOPNOTSUPP;
+                if (inode->i_op && inode->i_op->getxattr) 
+                        rc = inode->i_op->getxattr(de, ea_name, NULL, 0);
+                
+                if (rc < 0) {
+                        if (rc != -ENODATA)
+                                CERROR("error getting inode %lu EA: rc = %d\n",
+                                       inode->i_ino, rc);
+                        size[bufcount] = 0;
+                } else {
+                        size[bufcount] = min_t(int, body->eadatasize, rc);
+                }
+                bufcount++;
+        } else if (body->valid & OBD_MD_FLEALIST) {
+                rc = -EOPNOTSUPP;
+                if (inode->i_op && inode->i_op->listxattr)
+                        rc = inode->i_op->listxattr(de, NULL, 0);
+
+                if (rc < 0) {
+                        if (rc != -ENODATA)
+                                CERROR("error getting inode %lu EA: rc = %d\n",
+                                       inode->i_ino, rc);
+                        size[bufcount] = 0;
+                } else {
+                        size[bufcount] = min_t(int, body->eadatasize, rc);
+                }
+                bufcount++;
+        }
+        
+        /* may co-exist with OBD_MD_FLEASIZE */
+        if (body->valid & OBD_MD_FLACL_ACCESS) {
+                size[bufcount++] = 4;
+                size[bufcount++] = xattr_acl_size(LL_ACL_MAX_ENTRIES);
         }
 
         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_GETATTR_PACK)) {
@@ -935,7 +1080,7 @@ static int mds_getattr_lock(struct ptlrpc_request *req, int offset,
         struct mds_req_sec_desc *rsd;
         struct mds_body *body;
         struct dentry *dparent = NULL, *dchild = NULL;
-        struct lvfs_ucred uc;
+        struct lvfs_ucred uc = {NULL, NULL,};
         struct lustre_handle parent_lockh[2] = {{0}, {0}};
         unsigned int namesize;
         int rc = 0, cleanup_phase = 0, resent_req = 0, update_mode, reply_offset;
@@ -950,7 +1095,6 @@ static int mds_getattr_lock(struct ptlrpc_request *req, int offset,
                 CERROR("Can't unpack security desc\n");
                 RETURN(-EFAULT);
         }
-        mds_squash_root(mds, rsd, &req->rq_peer.peer_id.nid); 
 
         /* swab now, before anyone looks inside the request. */
         body = lustre_swab_reqbuf(req, offset, sizeof(*body),
@@ -981,7 +1125,7 @@ static int mds_getattr_lock(struct ptlrpc_request *req, int offset,
                 reply_offset = 0;
         }
 
-        rc = mds_init_ucred(&uc, rsd);
+        rc = mds_init_ucred(&uc, req, rsd);
         if (rc) {
                 CERROR("can't init ucred\n");
                 GOTO(cleanup, rc);
@@ -1084,18 +1228,30 @@ static int mds_getattr_lock(struct ptlrpc_request *req, int offset,
                          id_fid(&body->id1), (unsigned long)id_group(&body->id1),
                          child_lockh->cookie);
 
-                dparent = mds_id2dentry(obd, &body->id1, NULL);
-                LASSERT(dparent);
-
-                dchild = ll_lookup_one_len(name, dparent, namesize - 1);
-                if (IS_ERR(dchild)) {
-                        DEBUG_REQ(D_ERROR, req, "resent, not enqueuing new locks");
-                        CDEBUG(D_ERROR, "lock against [%lu:%lu]/%*s\n",
-                               (unsigned long) id_ino(&body->id1),
-                               (unsigned long) id_gen(&body->id1),
-                               namesize - 1, name);
+                if (name) {
+                        /* usual named request */
+                        dparent = mds_id2dentry(obd, &body->id1, NULL);
+                        LASSERT(!IS_ERR(dparent));
+                        dchild = ll_lookup_one_len(name, dparent, namesize - 1);
+                        if (IS_ERR(dchild)) {
+                                DEBUG_REQ(D_ERROR, req, "resent, not enqueuing new locks");
+                                CDEBUG(D_ERROR, "lock against [%lu:%lu]/%*s\n",
+                                                (unsigned long) id_ino(&body->id1),
+                                                (unsigned long) id_gen(&body->id1),
+                                                namesize - 1, name);
+                        }
+                        LASSERT(!IS_ERR(dchild));
+                } else {
+                        /* client wants to get attr. by id */
+                        dchild = mds_id2dentry(obd, &body->id1, NULL);
+                        if (IS_ERR(dchild)) {
+                                DEBUG_REQ(D_ERROR, req, "resent, not enqueuing new locks");
+                                CDEBUG(D_ERROR, "lock against [%lu:%lu]\n",
+                                                (unsigned long) id_ino(&body->id1),
+                                                (unsigned long) id_gen(&body->id1));
+                        }
+                        LASSERT(!IS_ERR(dchild));
                 }
-                LASSERT(!IS_ERR(dchild));
                 LDLM_LOCK_PUT(granted_lock);
         }
 
@@ -1117,14 +1273,14 @@ static int mds_getattr_lock(struct ptlrpc_request *req, int offset,
                 if (dchild->d_flags & DCACHE_CROSS_REF)
                         rc = mds_getattr_pack_msg_cf(req, dchild, offset);
                 else
-                        rc = mds_getattr_pack_msg(req, dchild->d_inode, offset);
+                        rc = mds_getattr_pack_msg(req, dchild, offset);
                 if (rc != 0) {
                         CERROR ("mds_getattr_pack_msg: %d\n", rc);
                         GOTO (cleanup, rc);
                 }
         }
 
-        rc = mds_getattr_internal(obd, dchild, req, body, reply_offset);
+        rc = mds_getattr_internal(obd, dchild, req, offset, body, reply_offset);        
         GOTO(cleanup, rc); /* returns the lock to the client */
 
  cleanup:
@@ -1145,6 +1301,7 @@ static int mds_getattr_lock(struct ptlrpc_request *req, int offset,
                 l_dput(dchild);
         case 1:
                 pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
+        default:
                 mds_exit_ucred(&uc);
         }
         return rc;
@@ -1157,7 +1314,7 @@ static int mds_getattr(struct ptlrpc_request *req, int offset)
         struct dentry *de;
         struct mds_req_sec_desc *rsd;
         struct mds_body *body;
-        struct lvfs_ucred uc;
+        struct lvfs_ucred uc = {NULL, NULL,};
         int rc = 0;
         ENTRY;
 
@@ -1176,8 +1333,9 @@ static int mds_getattr(struct ptlrpc_request *req, int offset)
 
         MD_COUNTER_INCREMENT(obd, getattr);
 
-        rc = mds_init_ucred(&uc, rsd);
+        rc = mds_init_ucred(&uc, req, rsd);
         if (rc) {
+                mds_exit_ucred(&uc);
                 CERROR("can't init ucred\n");
                 RETURN(rc);
         }
@@ -1189,14 +1347,13 @@ static int mds_getattr(struct ptlrpc_request *req, int offset)
                 GOTO(out_pop, rc);
         }
 
-        rc = mds_getattr_pack_msg(req, de->d_inode, offset);
+        rc = mds_getattr_pack_msg(req, de, offset);
         if (rc != 0) {
                 CERROR("mds_getattr_pack_msg: %d\n", rc);
                 GOTO(out_pop, rc);
         }
 
-        req->rq_status = mds_getattr_internal(obd, de, req, body, 0);
-
+        req->rq_status = mds_getattr_internal(obd, de, req, offset, body, 0);
         l_dput(de);
 
         EXIT;
@@ -1306,7 +1463,6 @@ out:
 static int mds_readpage(struct ptlrpc_request *req, int offset)
 {
         struct obd_device *obd = req->rq_export->exp_obd;
-        struct mds_obd *mds = &obd->u.mds;
         struct vfsmount *mnt;
         struct dentry *de;
         struct file *file;
@@ -1314,7 +1470,7 @@ static int mds_readpage(struct ptlrpc_request *req, int offset)
         struct mds_body *body, *repbody;
         struct lvfs_run_ctxt saved;
         int rc, size = sizeof(*repbody);
-        struct lvfs_ucred uc;
+        struct lvfs_ucred uc = {NULL, NULL,};
         ENTRY;
 
         rc = lustre_pack_reply(req, 1, &size, NULL);
@@ -1328,7 +1484,6 @@ static int mds_readpage(struct ptlrpc_request *req, int offset)
                 CERROR("Can't unpack security desc\n");
                 GOTO (out, rc = -EFAULT);
         }
-        mds_squash_root(mds, rsd, &req->rq_peer.peer_id.nid); 
 
         body = lustre_swab_reqbuf(req, offset, sizeof(*body),
                                   lustre_swab_mds_body);
@@ -1337,7 +1492,7 @@ static int mds_readpage(struct ptlrpc_request *req, int offset)
                 GOTO (out, rc = -EFAULT);
         }
 
-        rc = mds_init_ucred(&uc, rsd);
+        rc = mds_init_ucred(&uc, req, rsd);
         if (rc) {
                 CERROR("can't init ucred\n");
                 GOTO(out, rc);
@@ -1384,8 +1539,8 @@ out_file:
         filp_close(file, 0);
 out_pop:
         pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
-        mds_exit_ucred(&uc);
 out:
+        mds_exit_ucred(&uc);
         req->rq_status = rc;
         return 0;
 }
@@ -1479,7 +1634,6 @@ EXPORT_SYMBOL(mds_read_mid);
 int mds_reint(struct ptlrpc_request *req, int offset,
               struct lustre_handle *lockh)
 {
-        struct mds_obd *mds = &req->rq_export->exp_obd->u.mds;
         struct mds_update_record *rec;
         struct mds_req_sec_desc *rsd;
         int rc;
@@ -1494,7 +1648,6 @@ int mds_reint(struct ptlrpc_request *req, int offset,
                 CERROR("Can't unpack security desc\n");
                 GOTO(out, rc = -EFAULT);
         }
-        mds_squash_root(mds, rsd, &req->rq_peer.peer_id.nid); 
 
         rc = mds_update_unpack(req, offset, rec);
         if (rc || OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_UNPACK)) {
@@ -1502,7 +1655,7 @@ int mds_reint(struct ptlrpc_request *req, int offset,
                 GOTO(out, req->rq_status = -EINVAL);
         }
 
-        rc = mds_init_ucred(&rec->ur_uc, rsd);
+        rc = mds_init_ucred(&rec->ur_uc, req, rsd);
         if (rc) {
                 CERROR("can't init ucred\n");
                 GOTO(out, rc);
@@ -1510,11 +1663,27 @@ int mds_reint(struct ptlrpc_request *req, int offset,
 
         /* rc will be used to interrupt a for loop over multiple records */
         rc = mds_reint_rec(rec, offset, req, lockh);
-        mds_exit_ucred(&rec->ur_uc);
-        EXIT;
+
+        /* do reverse uid/gid mapping if needed */
+        if (rc == 0 && req->rq_remote &&
+            (rec->ur_opcode == REINT_SETATTR ||
+             rec->ur_opcode == REINT_OPEN)) {
+                struct mds_body *body;
+                int bodyoff;
+
+                if (rec->ur_opcode == REINT_SETATTR)
+                        bodyoff = 0;
+                else /* open */
+                        bodyoff = (offset == 3 ? 1 : 0);
+                body = lustre_msg_buf(req->rq_repmsg, bodyoff, sizeof(*body));
+                LASSERT(body);
+
+                mds_reverse_map_ugid(req, body);
+        }
  out:
+        mds_exit_ucred(&rec->ur_uc);
         OBD_FREE(rec, sizeof(*rec));
-        return rc;
+        RETURN(rc);
 }
 
 static int mds_filter_recovery_request(struct ptlrpc_request *req,
@@ -1655,7 +1824,7 @@ static int mdt_obj_create(struct ptlrpc_request *req)
          * this only serve to inter-mds request, don't need check group database
          * here. --ericm.
          */
-        uc.luc_ghash = NULL;
+        uc.luc_lsd = NULL;
         uc.luc_ginfo = NULL;
         uc.luc_uid = body->oa.o_uid;
         uc.luc_fsuid = body->oa.o_uid;
@@ -1891,7 +2060,6 @@ cleanup:
 
         l_dput(new);
         pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc);
-        mds_put_group_entry(mds, uc.luc_ghash);
         return rc;
 }
 
@@ -2008,6 +2176,38 @@ static int mdt_set_info(struct ptlrpc_request *req)
         RETURN(-EINVAL);
 }
 
+static int mds_init_export_data(struct ptlrpc_request *req)
+{
+        struct mds_export_data *med = &req->rq_export->u.eu_mds_data;
+        __u32 *nllu;
+
+        nllu = lustre_msg_buf(req->rq_reqmsg, 4, sizeof(__u32) * 2);
+        if (nllu == NULL) {
+                CERROR("failed to extract nllu, using 99:99\n");
+                med->med_nllu = 99;
+                med->med_nllg = 99;
+        } else {
+                if (lustre_msg_swabbed(req->rq_reqmsg)) {
+                        __swab32s(&nllu[0]);
+                        __swab32s(&nllu[1]);
+                }
+                med->med_nllu = nllu[0];
+                med->med_nllg = nllu[1];
+        }
+
+        if (req->rq_remote) {
+                CWARN("exp %p, peer "LPX64": set as remote\n",
+                       req->rq_export, req->rq_peer.peer_id.nid);
+                med->med_local = 0;
+        } else
+                med->med_local = 1;
+
+        LASSERT(med->med_idmap == NULL);
+        spin_lock_init(&med->med_idmap_lock);
+
+        return 0;
+}
+
 static int mds_msg_check_version(struct lustre_msg *msg)
 {
         int rc;
@@ -2066,6 +2266,11 @@ static int mds_msg_check_version(struct lustre_msg *msg)
                         CERROR("bad opc %u version %08x, expecting %08x\n",
                                msg->opc, msg->version, LUSTRE_OBD_VERSION);
                 break;
+        case SEC_INIT:
+        case SEC_INIT_CONTINUE:
+        case SEC_FINI:
+                rc = 0;
+                break;
         default:
                 CERROR("MDS unknown opcode %d\n", msg->opc);
                 rc = -ENOTSUPP;
@@ -2093,6 +2298,13 @@ int mds_handle(struct ptlrpc_request *req)
                 RETURN(rc);
         }
 
+        /* Security opc should NOT trigger any recovery events */
+        if (req->rq_reqmsg->opc == SEC_INIT ||
+            req->rq_reqmsg->opc == SEC_INIT_CONTINUE ||
+            req->rq_reqmsg->opc == SEC_FINI) {
+                GOTO(out, rc = 0);
+        }
+
         LASSERT(current->journal_info == NULL);
         /* XXX identical to OST */
         if (req->rq_reqmsg->opc != MDS_CONNECT) {
@@ -2148,9 +2360,11 @@ int mds_handle(struct ptlrpc_request *req)
                 DEBUG_REQ(D_INODE, req, "connect");
                 OBD_FAIL_RETURN(OBD_FAIL_MDS_CONNECT_NET, 0);
                 rc = target_handle_connect(req);
-                if (!rc)
+                if (!rc) {
                         /* Now that we have an export, set mds. */
                         mds = mds_req2mds(req);
+                        mds_init_export_data(req);
+                }
                 break;
 
         case MDS_DISCONNECT:
@@ -2690,10 +2904,8 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf)
         /*
          * here we use "iopen_nopriv" hardcoded, because it affects MDS utility
          * and the rest of options are passed by mount options. Probably this
-         * should be moved to somewhere else like startup scripts or lconf.
-         */
-        sprintf(options, "iopen_nopriv");
-
+         * should be moved to somewhere else like startup scripts or lconf. */
+        sprintf(options, "iopen_nopriv,acl,user_xattr");
         if (lcfg->lcfg_inllen4 > 0 && lcfg->lcfg_inlbuf4)
                 sprintf(options + strlen(options), ",%s",
                         lcfg->lcfg_inlbuf4);
@@ -3002,6 +3214,7 @@ static int mds_precleanup(struct obd_device *obd, int flags)
         RETURN(rc);
 }
 
+extern void lgss_svc_cache_purge_all(void);
 static int mds_cleanup(struct obd_device *obd, int flags)
 {
         struct mds_obd *mds = &obd->u.mds;
@@ -3046,9 +3259,65 @@ static int mds_cleanup(struct obd_device *obd, int flags)
         dev_clear_rdonly(2);
         fsfilt_put_ops(obd->obd_fsops);
 
+#ifdef ENABLE_GSS
+        /* XXX */
+        lgss_svc_cache_purge_all();
+#endif
         RETURN(0);
 }
 
+static int set_security(const char *value, char **sec)
+{
+        int rc = 0;
+
+        if (!strcmp(value, "null"))
+                *sec = "null";
+        else if (!strcmp(value, "krb5i"))
+                *sec = "krb5i";
+        else if (!strcmp(value, "krb5p"))
+                *sec = "krb5p";
+        else {
+                CERROR("Unrecognized value, forcing use of null\n");
+                rc = -EINVAL;
+        }
+
+        return rc;
+}
+
+static int mds_process_config(struct obd_device *obd, obd_count len, void *buf)
+{
+        struct lustre_cfg *lcfg = buf;
+        struct mds_obd *mds = &obd->u.mds;
+        int rc = 0;
+        ENTRY;
+
+        switch(lcfg->lcfg_command) {
+        case LCFG_SET_SECURITY: {
+                if (!lcfg->lcfg_inllen1 || !lcfg->lcfg_inllen2)
+                        GOTO(out, rc = -EINVAL);
+
+                if (!strcmp(lcfg->lcfg_inlbuf1, "mds_mds_sec"))
+                        rc = set_security(lcfg->lcfg_inlbuf2,
+                                          &mds->mds_mds_sec);
+                else if (!strcmp(lcfg->lcfg_inlbuf1, "mds_ost_sec"))
+                        rc = set_security(lcfg->lcfg_inlbuf2,
+                                          &mds->mds_ost_sec);
+                else {
+                        CERROR("Unrecognized key\n");
+                        rc = -EINVAL;
+                }
+                break;
+        }
+        default: {
+                CERROR("Unknown command: %d\n", lcfg->lcfg_command);
+                GOTO(out, rc = -EINVAL);
+
+        }
+        }
+out:
+        RETURN(rc);
+}
+
 static void fixup_handle_for_resent_req(struct ptlrpc_request *req,
                                         int offset,
                                         struct ldlm_lock *new_lock,
@@ -3126,10 +3395,11 @@ static int mds_intent_policy(struct ldlm_namespace *ns,
         struct lustre_handle lockh[2] = {{0}, {0}};
         struct ldlm_lock *new_lock = NULL;
         int getattr_part = MDS_INODELOCK_UPDATE;
-        int rc, repsize[4] = { sizeof(struct ldlm_reply),
-                               sizeof(struct mds_body),
-                               mds->mds_max_mdsize,
-                               mds->mds_max_cookiesize };
+        int rc, reply_buffers;
+        int repsize[5] = {sizeof(struct ldlm_reply),
+                          sizeof(struct mds_body),
+                          mds->mds_max_mdsize};
+
         int offset = MDS_REQ_INTENT_REC_OFF; 
         ENTRY;
 
@@ -3153,7 +3423,14 @@ static int mds_intent_policy(struct ldlm_namespace *ns,
 
         LDLM_DEBUG(lock, "intent policy, opc: %s", ldlm_it2str(it->opc));
 
-        rc = lustre_pack_reply(req, 3, repsize, NULL);
+        reply_buffers = 3;
+        if (it->opc & (IT_OPEN | IT_GETATTR | IT_LOOKUP | IT_CHDIR)) {
+                reply_buffers = 5;
+                repsize[3] = 4;
+                repsize[4] = xattr_acl_size(LL_ACL_MAX_ENTRIES);
+        }
+
+        rc = lustre_pack_reply(req, reply_buffers, repsize, NULL);
         if (rc)
                 RETURN(req->rq_status = rc);
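
With the change above, attribute-returning intents always reserve two extra reply buffers for the access ACL. The comment below is only an illustrative summary of the layout the repsize[] initialization implies, not text from the patch.

/*
 * Reply buffer layout for attribute-returning intents (sketch):
 *
 *   [0] struct ldlm_reply                      lock reply
 *   [1] struct mds_body                        attributes
 *   [2] mds->mds_max_mdsize                    LOV EA / mea, if any
 *   [3] 4 bytes                                le32 length of the access ACL
 *   [4] xattr_acl_size(LL_ACL_MAX_ENTRIES)     the ACL xattr blob itself
 */
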
 
@@ -3488,6 +3765,7 @@ static struct obd_ops mds_obd_ops = {
         .o_setup           = mds_setup,
         .o_precleanup      = mds_precleanup,
         .o_cleanup         = mds_cleanup,
+        .o_process_config  = mds_process_config,
         .o_postrecov       = mds_postrecov,
         .o_statfs          = mds_obd_statfs,
         .o_iocontrol       = mds_iocontrol,
@@ -3514,7 +3792,7 @@ static int __init mds_init(void)
 {
         struct lprocfs_static_vars lvars;
 
-        mds_group_hash_init();
+        mds_init_lsd_cache();
 
         lprocfs_init_multi_vars(0, &lvars);
         class_register_type(&mds_obd_ops, NULL, lvars.module_vars,
@@ -3528,7 +3806,7 @@ static int __init mds_init(void)
 
 static void /*__exit*/ mds_exit(void)
 {
-        mds_group_hash_cleanup();
+        mds_cleanup_lsd_cache();
 
         class_unregister_type(LUSTRE_MDS_NAME);
         class_unregister_type(LUSTRE_MDT_NAME);
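
Both the client side (mdc_set_info() with the "sec" key, earlier in this commit) and the MDS config path above accept the same three flavor strings. A table-driven variant of that mapping is sketched below; the PTLRPC_SEC_* constants are the ones used in the hunks, while the table and helper are only illustrative.

static const struct {
        const char *name;
        __u32       flavor;
        __u32       subflavor;
} sec_flavors[] = {
        { "null",  PTLRPC_SEC_NULL, 0 },
        { "krb5i", PTLRPC_SEC_GSS,  PTLRPC_SEC_GSS_KRB5I },
        { "krb5p", PTLRPC_SEC_GSS,  PTLRPC_SEC_GSS_KRB5P },
};

static int lookup_sec_flavor(const char *val, __u32 *flavor, __u32 *subflavor)
{
        int i;

        for (i = 0; i < sizeof(sec_flavors) / sizeof(sec_flavors[0]); i++) {
                if (strcmp(val, sec_flavors[i].name) == 0) {
                        *flavor = sec_flavors[i].flavor;
                        *subflavor = sec_flavors[i].subflavor;
                        return 0;
                }
        }
        return -EINVAL;  /* unknown flavor, caller decides the fallback */
}
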
index 3912499..085c840 100644 (file)
@@ -142,17 +142,13 @@ struct lprocfs_vars lprocfs_mds_obd_vars[] = {
 };
 
 /*
- * group hash proc entries handler
+ * LSD proc entry handlers
  */
-static int lprocfs_wr_group_info(struct file *file, const char *buffer,
-                                 unsigned long count, void *data)
+static int lprocfs_wr_lsd_downcall(struct file *file, const char *buffer,
+                                   unsigned long count, void *data)
 {
-        struct {
-                int             err;
-                uid_t           uid;
-                uint32_t        ngroups;
-                gid_t          *groups;
-        } param;
+        struct upcall_cache *cache = __mds_get_global_lsd_cache();
+        struct lsd_downcall_args param;
         gid_t   gids_local[NGROUPS_SMALL];
         gid_t  *gids = NULL;
 
@@ -164,9 +160,16 @@ static int lprocfs_wr_group_info(struct file *file, const char *buffer,
                 CERROR("broken downcall\n");
                 return count;
         }
+
+        if (param.err) {
+                CERROR("LSD downcall indicates error %d\n", param.err);
+                goto do_downcall;
+        }
+
         if (param.ngroups > NGROUPS_MAX) {
                 CERROR("%d groups?\n", param.ngroups);
-                return count;
+                param.err = -EINVAL;
+                goto do_downcall;
         }
 
         if (param.ngroups <= NGROUPS_SMALL)
@@ -176,132 +179,119 @@ static int lprocfs_wr_group_info(struct file *file, const char *buffer,
                 if (!gids) {
                         CERROR("fail to alloc memory for %d gids\n",
                                 param.ngroups);
-                        return count;
+                        param.err = -ENOMEM;
+                        goto do_downcall;
                 }
         }
         if (copy_from_user(gids, param.groups,
                            param.ngroups * sizeof(gid_t))) {
                 CERROR("broken downcall\n");
-                goto out;
+                param.err = -EFAULT;
+                goto do_downcall;
         }
 
-        mds_handle_group_downcall(param.err, param.uid,
-                                  param.ngroups, gids);
+        param.groups = gids;
+
+do_downcall:
+        upcall_cache_downcall(cache, (__u64) param.uid, param.err, &param);
 
-out:
         if (gids && gids != gids_local)
                 OBD_FREE(gids, param.ngroups * sizeof(gid_t));
         return count;
 }
 
-static int lprocfs_rd_expire(char *page, char **start, off_t off, int count,
-                             int *eof, void *data)
+static int lprocfs_rd_lsd_expire(char *page, char **start, off_t off, int count,
+                                 int *eof, void *data)
 {
-        struct mds_grp_hash *hash = __mds_get_global_group_hash();
+        struct upcall_cache *cache = __mds_get_global_lsd_cache();
 
         *eof = 1;
-        return snprintf(page, count, "%d\n", hash->gh_entry_expire);
+        return snprintf(page, count, "%lu\n", cache->uc_entry_expire);
 }
-
-static int lprocfs_wr_expire(struct file *file, const char *buffer,
-                             unsigned long count, void *data)
+static int lprocfs_wr_lsd_expire(struct file *file, const char *buffer,
+                                 unsigned long count, void *data)
 {
-        struct mds_grp_hash *hash = __mds_get_global_group_hash();
+        struct upcall_cache *cache = __mds_get_global_lsd_cache();
         char buf[32];
 
         if (copy_from_user(buf, buffer, min(count, 32UL)))
                 return count;
         buf[31] = 0;
-        sscanf(buf, "%d", &hash->gh_entry_expire);
+        sscanf(buf, "%lu", &cache->uc_entry_expire);
         return count;
 }
 
-static int lprocfs_rd_ac_expire(char *page, char **start, off_t off, int count,
-                                int *eof, void *data)
+static int lprocfs_rd_lsd_ac_expire(char *page, char **start, off_t off,
+                                    int count, int *eof, void *data)
 {
-        struct mds_grp_hash *hash = __mds_get_global_group_hash();
+        struct upcall_cache *cache = __mds_get_global_lsd_cache();
 
         *eof = 1;
-        return snprintf(page, count, "%d\n", hash->gh_acquire_expire);
+        return snprintf(page, count, "%lu\n", cache->uc_acquire_expire);
 }
-
-static int lprocfs_wr_ac_expire(struct file *file, const char *buffer,
-                                unsigned long count, void *data)
+static int lprocfs_wr_lsd_ac_expire(struct file *file, const char *buffer,
+                                    unsigned long count, void *data)
 {
-        struct mds_grp_hash *hash = __mds_get_global_group_hash();
+        struct upcall_cache *cache = __mds_get_global_lsd_cache();
         char buf[32];
 
         if (copy_from_user(buf, buffer, min(count, 32UL)))
                 return count;
         buf[31] = 0;
-        sscanf(buf, "%d", &hash->gh_acquire_expire);
+        sscanf(buf, "%lu", &cache->uc_acquire_expire);
         return count;
 }
 
-static int lprocfs_rd_hash_upcall(char *page, char **start, off_t off, int count,
-                                int *eof, void *data)
+static int lprocfs_rd_lsd_upcall(char *page, char **start, off_t off, int count,
+                                 int *eof, void *data)
 {
-        struct mds_grp_hash *hash = __mds_get_global_group_hash();
+        struct upcall_cache *cache = __mds_get_global_lsd_cache();
 
         *eof = 1;
-        return snprintf(page, count, "%s\n", hash->gh_upcall);
+        return snprintf(page, count, "%s\n", cache->uc_upcall);
 }
-
-static int lprocfs_wr_hash_upcall(struct file *file, const char *buffer,
-                                  unsigned long count, void *data)
+static int lprocfs_wr_lsd_upcall(struct file *file, const char *buffer,
+                                 unsigned long count, void *data)
 {
-        struct mds_grp_hash *hash = __mds_get_global_group_hash();
+        struct upcall_cache *cache = __mds_get_global_lsd_cache();
 
-        if (count < MDSGRP_UPCALL_MAXPATH) {
-                sscanf(buffer, "%1024s", hash->gh_upcall);
-                hash->gh_upcall[MDSGRP_UPCALL_MAXPATH-1] = 0;
+        if (count < UC_CACHE_UPCALL_MAXPATH) {
+                sscanf(buffer, "%1024s", cache->uc_upcall);
+                cache->uc_upcall[UC_CACHE_UPCALL_MAXPATH - 1] = 0;
         }
         return count;
 }
 
-static int lprocfs_wr_hash_flush(struct file *file, const char *buffer,
-                                  unsigned long count, void *data)
-{
-        mds_group_hash_flush_idle();
-        return count;
-}
-
-static int lprocfs_rd_allow_setgroups(char *page, char **start, off_t off,
-                                      int count, int *eof, void *data)
-{
-        struct mds_grp_hash *hash = __mds_get_global_group_hash();
-
-        *eof = 1;
-        return snprintf(page, count, "%d\n", hash->gh_allow_setgroups);
-}
-
-static int lprocfs_wr_allow_setgroups(struct file *file, const char *buffer,
-                                      unsigned long count, void *data)
+extern void lgss_svc_cache_flush(__u32 uid);
+static int lprocfs_wr_lsd_flush(struct file *file, const char *buffer,
+                                unsigned long count, void *data)
 {
-        struct mds_grp_hash *hash = __mds_get_global_group_hash();
-        char buf[8];
-        int val;
+        char buf[32];
+        __u32 uid;
 
-        if (copy_from_user(buf, buffer, min(count, 8UL)))
+        if (copy_from_user(buf, buffer, min(count, 32UL)))
                 return count;
-        buf[7] = 0;
-        sscanf(buf, "%d", &val);
-        hash->gh_allow_setgroups = (val != 0);
+        buf[31] = 0;
+        sscanf(buf, "%u", &uid);
+
+        mds_flush_lsd(uid);
+#ifdef ENABLE_GSS
+        lgss_svc_cache_flush(uid);
+#endif
         return count;
 }
 
 struct lprocfs_vars lprocfs_mds_module_vars[] = {
-        { "num_refs", lprocfs_rd_numrefs, 0, 0 },
-        { "grp_hash_expire_interval",lprocfs_rd_expire,
-          lprocfs_wr_expire, 0},
-        { "grp_hash_acquire_expire", lprocfs_rd_ac_expire,
-          lprocfs_wr_ac_expire, 0},
-        { "grp_hash_upcall", lprocfs_rd_hash_upcall,
-          lprocfs_wr_hash_upcall, 0},
-        { "grp_hash_flush", 0, lprocfs_wr_hash_flush, 0},
-        { "group_info", 0, lprocfs_wr_group_info, 0 },
-        { "allow_setgroups", lprocfs_rd_allow_setgroups,
-          lprocfs_wr_allow_setgroups, 0},
+        { "num_refs",                   lprocfs_rd_numrefs, 0, 0 },
+        /* LSD stuff */
+        { "lsd_expire_interval",        lprocfs_rd_lsd_expire,
+                                        lprocfs_wr_lsd_expire, 0},
+        { "lsd_acquire_expire",         lprocfs_rd_lsd_ac_expire,
+                                        lprocfs_wr_lsd_ac_expire, 0},
+        { "lsd_upcall",                 lprocfs_rd_lsd_upcall,
+                                        lprocfs_wr_lsd_upcall, 0},
+        { "lsd_flush",                  0, lprocfs_wr_lsd_flush, 0},
+        { "lsd_downcall",               0, lprocfs_wr_lsd_downcall, 0},
         { 0 }
 };
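
The lsd_downcall handler above reads a struct of (err, uid, ngroups, groups pointer) from the write buffer and then copies the gid array from the user pointer inside it. The program below is a userspace sketch of how an l_getgroups-style helper could answer the upcall; the struct layout and the /proc path are assumptions inferred from the handler, not definitions taken from the headers or tools.

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <grp.h>
#include <pwd.h>
#include <sys/types.h>

/* assumed layout, mirroring how the proc handler uses 'param' */
struct lsd_downcall_args {
        int       err;
        uid_t     uid;
        unsigned  ngroups;
        gid_t    *groups;
};

int main(int argc, char *argv[])
{
        struct lsd_downcall_args args = { 0 };
        gid_t groups[64];
        int ngroups = sizeof(groups) / sizeof(groups[0]);
        struct passwd *pw;
        int fd;

        if (argc < 2)
                return 1;

        args.uid = (uid_t)atoi(argv[1]);
        pw = getpwuid(args.uid);
        if (pw == NULL) {
                args.err = -1;          /* tell the MDS the lookup failed */
        } else {
                getgrouplist(pw->pw_name, pw->pw_gid, groups, &ngroups);
                args.ngroups = ngroups;
                args.groups = groups;
        }

        /* path assumed from the lprocfs entry names registered above */
        fd = open("/proc/fs/lustre/mds/lsd_downcall", O_WRONLY);
        if (fd < 0)
                return 1;
        write(fd, &args, sizeof(args));
        close(fd);
        return 0;
}
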
 
index ed41934..b52bf4a 100644 (file)
@@ -131,6 +131,8 @@ int mds_client_free(struct obd_export *exp, int clear_client)
         struct lvfs_run_ctxt saved;
         int rc;
 
+        mds_idmap_cleanup(med);
+
         if (!med->med_mcd)
                 RETURN(0);
 
diff --git a/lustre/mds/mds_groups.c b/lustre/mds/mds_groups.c
deleted file mode 100644 (file)
index 7da07f7..0000000
+++ /dev/null
@@ -1,451 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  Copyright (c) 2004 Cluster File Systems, Inc.
- *
- *   This file is part of Lustre, http://www.lustre.org.
- *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#define DEBUG_SUBSYSTEM S_MDS
-
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/kmod.h>
-#include <linux/string.h>
-#include <linux/stat.h>
-#include <linux/errno.h>
-#include <linux/version.h>
-#include <linux/unistd.h>
-
-#include <asm/system.h>
-#include <asm/uaccess.h>
-
-#include <linux/fs.h>
-#include <linux/stat.h>
-#include <asm/uaccess.h>
-#include <linux/slab.h>
-#include <asm/segment.h>
-
-#include <libcfs/list.h>
-#include <linux/obd_support.h>
-#include <linux/lustre_lib.h>
-#include <linux/lustre_mds.h>
-#include "mds_internal.h"
-
-#define GRP_HASH_NEW              0x1
-#define GRP_HASH_ACQUIRING        0x2
-#define GRP_HASH_INVALID          0x4
-#define GRP_HASH_EXPIRED          0x8
-
-#define GRP_IS_NEW(i)          ((i)->ge_flags & GRP_HASH_NEW)
-#define GRP_IS_INVALID(i)      ((i)->ge_flags & GRP_HASH_INVALID)
-#define GRP_IS_ACQUIRING(i)    ((i)->ge_flags & GRP_HASH_ACQUIRING)
-#define GRP_IS_EXPIRED(i)      ((i)->ge_flags & GRP_HASH_EXPIRED)
-#define GRP_IS_VALID(i)        ((i)->ge_flags == 0)
-
-#define GRP_SET_NEW(i)         (i)->ge_flags |= GRP_HASH_NEW
-#define GRP_SET_INVALID(i)     (i)->ge_flags |= GRP_HASH_INVALID
-#define GRP_SET_ACQUIRING(i)   (i)->ge_flags |= GRP_HASH_ACQUIRING
-#define GRP_SET_EXPIRED(i)     (i)->ge_flags |= GRP_HASH_EXPIRED
-#define GRP_SET_VALID(i)       (i)->ge_flags = 0
-
-#define GRP_CLEAR_NEW(i)       (i)->ge_flags &= ~GRP_HASH_NEW
-#define GRP_CLEAR_ACQUIRING(i) (i)->ge_flags &= ~GRP_HASH_ACQUIRING
-#define GRP_CLEAR_INVALID(i)   (i)->ge_flags &= ~GRP_HASH_INVALID
-#define GRP_CLEAR_EXPIRED(i)   (i)->ge_flags &= ~GRP_HASH_EXPIRED
-
-/* 
- * We need share hash table among the groups of MDSs (which server as the same
- * lustre file system), maybe MDT? but there's lprocfs problems of putting this
- * in MDT. so we make it global to the module. which brings the limitation that
- * one node couldn't running multiple MDS which server as different Lustre FS.
- * but which maybe not meaningful.
- */
-static struct mds_grp_hash _group_hash;
-
-struct mds_grp_hash *__mds_get_global_group_hash()
-{
-        return &_group_hash;
-}
-
-static struct mds_grp_hash_entry *alloc_entry(uid_t uid)
-{
-        struct mds_grp_hash_entry *entry;
-
-        OBD_ALLOC(entry, sizeof(*entry));
-        if (!entry)
-                return NULL;
-
-        GRP_SET_NEW(entry);
-        INIT_LIST_HEAD(&entry->ge_hash);
-        entry->ge_uid = uid;
-        atomic_set(&entry->ge_refcount, 0);
-        init_waitqueue_head(&entry->ge_waitq);
-        return entry;
-}
-
-/* protected by hash lock */
-static void free_entry(struct mds_grp_hash_entry *entry)
-{
-        if (entry->ge_group_info)
-                groups_free(entry->ge_group_info);
-        list_del(&entry->ge_hash);
-        CDEBUG(D_OTHER, "destroy mds_grp_entry %p for uid %d\n",
-               entry, entry->ge_uid);
-        OBD_FREE(entry, sizeof(*entry));
-}
-
-static inline void get_entry(struct mds_grp_hash_entry *entry)
-{
-        atomic_inc(&entry->ge_refcount);
-}
-static inline void put_entry(struct mds_grp_hash_entry *entry)
-{
-        if (atomic_dec_and_test(&entry->ge_refcount) &&
-            (GRP_IS_INVALID(entry) || GRP_IS_EXPIRED(entry))) {
-                free_entry(entry);
-        }
-}
-static int check_unlink_entry(struct mds_grp_hash_entry *entry)
-{
-        if (GRP_IS_VALID(entry) &&
-            time_before(jiffies, entry->ge_expire))
-                return 0;
-
-        if (GRP_IS_ACQUIRING(entry) &&
-            time_after(jiffies, entry->ge_acquire_expire)) {
-                GRP_SET_EXPIRED(entry);
-                wake_up_all(&entry->ge_waitq);
-        } else if (!GRP_IS_INVALID(entry)) {
-                GRP_SET_EXPIRED(entry);
-        }
-
-        list_del_init(&entry->ge_hash);
-        if (!atomic_read(&entry->ge_refcount))
-                free_entry(entry);
-        return 1;
-}
-
-static int refresh_entry(struct mds_grp_hash *hash,
-                         struct mds_grp_hash_entry *entry)
-{
-        char *argv[4];
-        char *envp[3];
-        char uidstr[16];
-        int rc;
-        ENTRY;
-
-        snprintf(uidstr, 16, "%d", entry->ge_uid);
-
-        argv[0] = hash->gh_upcall;
-        argv[1] = uidstr;
-        argv[2] = NULL;
-                                                                                                                        
-        envp[0] = "HOME=/";
-        envp[1] = "PATH=/sbin:/usr/sbin";
-        envp[2] = NULL;
-
-        rc = USERMODEHELPER(argv[0], argv, envp);
-        if (rc < 0) {
-                CERROR("Error invoking getgroups upcall %s %s: %d; check "
-                       "/proc/fs/lustre/mds/grp_hash_upcall\n",
-                       argv[0], argv[1], rc);
-        } else {
-                CWARN("Invoked upcall %s %s\n",
-                        argv[0], argv[1]);
-        }
-        RETURN(rc);
-}
-
-struct mds_grp_hash_entry *mds_get_group_entry(struct mds_obd *mds, uid_t uid)
-{
-        struct mds_grp_hash_entry *entry = NULL, *new = NULL, *next;
-        struct mds_grp_hash *hash = &_group_hash;
-        struct list_head *head;
-        wait_queue_t wait;
-        int rc, found;
-        ENTRY;
-
-        head = &hash->gh_table[MDSGRP_HASH_INDEX(uid)];
-        
-find_again:
-        found = 0;
-        spin_lock(&hash->gh_lock);
-        list_for_each_entry_safe(entry, next, head, ge_hash) {
-                /* check invalid & expired items */
-                if (check_unlink_entry(entry))
-                        continue;
-                if (entry->ge_uid == uid) {
-                        found = 1;
-                        break;
-                }
-        }
-
-        if (!found) { /* didn't found */
-                if (!new) {
-                        spin_unlock(&hash->gh_lock);
-                        new = alloc_entry(uid);
-                        if (!new) {
-                                CERROR("fail to alloc entry\n");
-                                RETURN(NULL);
-                        }
-                        goto find_again;
-                } else {
-                        list_add(&new->ge_hash, head);
-                        entry = new;
-                }
-        } else {
-                if (new) {
-                        free_entry(new);
-                        new = NULL;
-                }
-                list_move(&entry->ge_hash, head);
-        }
-        get_entry(entry);
-
-        /* acquire for new one */
-        if (GRP_IS_NEW(entry)) {
-                GRP_SET_ACQUIRING(entry);
-                GRP_CLEAR_NEW(entry);
-                entry->ge_acquire_expire = jiffies +
-                        hash->gh_acquire_expire * HZ;
-                spin_unlock(&hash->gh_lock);
-
-                rc = refresh_entry(hash, entry);
-
-                spin_lock(&hash->gh_lock);
-                if (rc) {
-                        GRP_CLEAR_ACQUIRING(entry);
-                        GRP_SET_INVALID(entry);
-                }
-                /* fall through */
-        }
-        
-        /*
-         * someone (and only one) is doing upcall upon this item, just wait it
-         * complete
-         */
-        if (GRP_IS_ACQUIRING(entry)) {
-                init_waitqueue_entry(&wait, current);
-                add_wait_queue(&entry->ge_waitq, &wait);
-                set_current_state(TASK_INTERRUPTIBLE);
-                spin_unlock(&hash->gh_lock);
-
-                schedule_timeout(hash->gh_acquire_expire * HZ);
-
-                spin_lock(&hash->gh_lock);
-                remove_wait_queue(&entry->ge_waitq, &wait);
-                if (GRP_IS_ACQUIRING(entry)) {
-                        /* we're interrupted or upcall failed
-                         * in the middle
-                         */
-                        put_entry(entry);
-                        spin_unlock(&hash->gh_lock);
-                        RETURN(NULL);
-                }
-                /* fall through */
-        }
-
-        /* invalid means error, don't need to try again */
-        if (GRP_IS_INVALID(entry)) {
-                put_entry(entry);
-                spin_unlock(&hash->gh_lock);
-                RETURN(NULL);
-        }
-
-        /*
-         * check expired. We can't refresh the existed one because some memory
-         * might be shared by multiple processes.
-         */
-        if (check_unlink_entry(entry)) {
-                /*
-                 * if expired, try again. but if this entry is created by me but
-                 * too quickly turn to expired without any error, should at
-                 * least give a chance to use it once.
-                 */
-                if (entry != new) {
-                        put_entry(entry);
-                        spin_unlock(&hash->gh_lock);
-                        new = NULL;
-                        goto find_again;
-                }
-        }
-        
-        /* Now we know it's good */
-        spin_unlock(&hash->gh_lock);
-        RETURN(entry);
-}
-
-void mds_put_group_entry(struct mds_obd *mds, struct mds_grp_hash_entry *entry)
-{
-        struct mds_grp_hash *hash = &_group_hash;
-        ENTRY;
-
-        if (!entry) {
-                EXIT;
-                return;
-        }
-
-        spin_lock(&hash->gh_lock);
-        LASSERT(atomic_read(&entry->ge_refcount) > 0);
-        put_entry(entry);
-        spin_unlock(&hash->gh_lock);
-        EXIT;
-}
-
-static int entry_set_group_info(struct mds_grp_hash_entry *entry,
-                                __u32 ngroups, gid_t *groups)
-{
-        struct group_info *ginfo;
-        ENTRY;
-
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,4)
-        if (ngroups > NGROUPS)
-                ngroups = NGROUPS;
-#endif
-
-        if (ngroups > NGROUPS_MAX) {
-                CERROR("too many (%d) supp groups\n", ngroups); 
-                RETURN(-EINVAL);
-        }
-
-        ginfo = groups_alloc(ngroups);
-        if (!ginfo) {
-                CERROR("can't alloc group_info for %d groups\n", ngroups);
-                RETURN(-ENOMEM);
-        }
-        groups_from_buffer(ginfo, groups);
-
-        entry->ge_group_info = ginfo;
-        RETURN(0);
-}
-
-int mds_handle_group_downcall(int err, uid_t uid, __u32 ngroups, gid_t *groups)
-{
-        struct mds_grp_hash *hash = &_group_hash;
-        struct mds_grp_hash_entry *entry = NULL;
-        struct list_head *head;
-        int found = 0, rc = 0;
-        ENTRY;
-
-        LASSERT(hash);
-
-        head = &hash->gh_table[MDSGRP_HASH_INDEX(uid)];
-
-        spin_lock(&hash->gh_lock);
-        list_for_each_entry(entry, head, ge_hash) {
-                if (entry->ge_uid == uid) {
-                        found = 1;
-                        break;
-                }
-        }
-        if (!found) {
-                /* haven't found, it's possible */
-                spin_unlock(&hash->gh_lock);
-                RETURN(-EINVAL);
-        }
-        if (err) {
-                GRP_SET_INVALID(entry);
-                GOTO(out, rc = -EINVAL);
-        }
-
-        if (!GRP_IS_ACQUIRING(entry) ||
-            GRP_IS_INVALID(entry) ||
-            GRP_IS_EXPIRED(entry)) {
-                CERROR("found a stale entry %p(uid %d) in ioctl\n",
-                        entry, entry->ge_uid);
-                GOTO(out, rc = -EINVAL);
-        }
-
-        atomic_inc(&entry->ge_refcount);
-        spin_unlock(&hash->gh_lock);
-        rc = entry_set_group_info(entry, ngroups, groups);
-        spin_lock(&hash->gh_lock);
-        atomic_dec(&entry->ge_refcount);
-        if (rc) {
-                GRP_SET_INVALID(entry);
-                list_del_init(&entry->ge_hash);
-                GOTO(out, rc);
-        }
-        entry->ge_acquisition_time = LTIME_S(CURRENT_TIME);
-        entry->ge_expire = jiffies + hash->gh_entry_expire * HZ;
-        GRP_SET_VALID(entry);
-        CDEBUG(D_OTHER, "created mds_grp_entry %p for uid %d\n",
-               entry, entry->ge_uid);
-out:
-        wake_up_all(&entry->ge_waitq);
-        spin_unlock(&hash->gh_lock);
-        RETURN(rc);
-}
-
-static void mds_flush_group_hash(struct mds_grp_hash *hash, int force)
-{
-        struct mds_grp_hash_entry *entry, *next;
-        int i;
-        ENTRY;
-
-        spin_lock(&hash->gh_lock);
-        for (i = 0; i < MDSGRP_HASH_SIZE; i++) {
-                list_for_each_entry_safe(entry, next,
-                                         &hash->gh_table[i], ge_hash) {
-                        if (!force && atomic_read(&entry->ge_refcount)) {
-                                GRP_SET_EXPIRED(entry);
-                                continue;
-                        }
-                        LASSERT(!atomic_read(&entry->ge_refcount));
-                        free_entry(entry);
-                }
-        }
-        spin_unlock(&hash->gh_lock);
-        EXIT;
-}
-
-void mds_group_hash_flush_idle()
-{
-        mds_flush_group_hash(&_group_hash, 0);
-}
-
-int mds_allow_setgroups(void)
-{
-        return _group_hash.gh_allow_setgroups;
-}
-
-int mds_group_hash_init()
-{
-        struct mds_grp_hash *hash;
-        int i;
-        ENTRY;
-
-        hash = &_group_hash;
-
-        spin_lock_init(&hash->gh_lock);
-        for (i = 0; i < MDSGRP_HASH_SIZE; i++)
-                INIT_LIST_HEAD(&hash->gh_table[i]);
-        /* set default value, proc tunable */
-        sprintf(hash->gh_upcall, "%s", "/sbin/l_getgroups");
-        hash->gh_entry_expire = 5 * 60;
-        hash->gh_acquire_expire = 5;
-        hash->gh_allow_setgroups = 0;
-
-        RETURN(0);
-}
-
-void mds_group_hash_cleanup()
-{
-        mds_flush_group_hash(&_group_hash, 1);
-}
index be2fcfc..d68b78e 100644 (file)
@@ -97,7 +97,15 @@ int mds_lock_new_child(struct obd_device *obd, struct inode *inode,
 void groups_from_buffer(struct group_info *ginfo, __u32 *gids);
 int mds_update_unpack(struct ptlrpc_request *, int offset,
                       struct mds_update_record *);
-int mds_init_ucred(struct lvfs_ucred *ucred, struct mds_req_sec_desc *rsd);
+int mds_idmap_set(struct mds_export_data *med, __u32 id1, __u32 id2,
+                  int is_uid_mapping);
+__u32 mds_idmap_get(struct mds_export_data *med, __u32 id,
+                    int is_uid_mapping);
+void mds_idmap_cleanup(struct mds_export_data *med);
+void mds_reverse_map_ugid(struct ptlrpc_request *req,
+                          struct mds_body *body);
+int mds_init_ucred(struct lvfs_ucred *ucred, struct ptlrpc_request *req,
+                   struct mds_req_sec_desc *rsd);
 void mds_exit_ucred(struct lvfs_ucred *ucred);
 
 /* mds/mds_unlink_open.c */
@@ -205,7 +213,14 @@ int mds_get_md(struct obd_device *, struct inode *, void *md,
 
 int mds_pack_md(struct obd_device *, struct lustre_msg *, int offset,
                 struct mds_body *, struct inode *, int lock);
-
+int mds_pack_link(struct dentry *dentry, struct ptlrpc_request *req,
+                  struct mds_body *repbody, int reply_off);
+int mds_pack_ea(struct dentry *dentry, struct ptlrpc_request *req,
+                struct mds_body *repbody, int req_off, int reply_off);
+int mds_pack_ealist(struct dentry *dentry, struct ptlrpc_request *req,
+                    struct mds_body *repbody, int reply_off);
+int mds_pack_acl(struct obd_device *, struct lustre_msg *, int offset,
+                 struct mds_body *, struct inode *);
 int mds_pack_inode2id(struct obd_device *, struct lustre_id *,
                       struct inode *, int);
 
@@ -238,19 +253,12 @@ int mds_lock_and_check_slave(int, struct ptlrpc_request *, struct lustre_handle
 int mds_convert_mea_ea(struct obd_device *, struct inode *, struct lov_mds_md *, int);
 int mds_is_dir_empty(struct obd_device *, struct dentry *);
 
-/* mds_groups.c */
-int mds_group_hash_init(void);
-void mds_group_hash_cleanup(void);
-void mds_group_hash_flush_idle(void);
-int mds_allow_setgroups(void);
-
-extern char mds_getgroups_upcall[PATH_MAX];
-extern int mds_grp_hash_entry_expire;
-extern int mds_grp_hash_acquire_expire;
-
-struct mds_grp_hash *__mds_get_global_group_hash(void);
-struct mds_grp_hash_entry * mds_get_group_entry(struct mds_obd *mds, uid_t uid);
-void mds_put_group_entry(struct mds_obd *mds, struct mds_grp_hash_entry *entry);
-int mds_handle_group_downcall(int err, uid_t uid, __u32 ngroups, gid_t *groups);
+/* mds_lsd.c */
+struct upcall_cache *__mds_get_global_lsd_cache(void);
+int mds_init_lsd_cache(void);
+void mds_cleanup_lsd_cache(void);
+struct lustre_sec_desc * mds_get_lsd(__u32 uid);
+void mds_put_lsd(struct lustre_sec_desc *lsd);
+void mds_flush_lsd(__u32 id);
 
 #endif /* _MDS_INTERNAL_H */
index c298512..6706841 100644 (file)
@@ -286,11 +286,11 @@ static int mds_setattr_unpack(struct ptlrpc_request *req, int offset,
         }
 
         if (req->rq_reqmsg->bufcount > offset + 2) {
-                r->ur_logcookies = lustre_msg_buf(req->rq_reqmsg, offset + 2, 0);
-                if (r->ur_eadata == NULL)
+                r->ur_ea2data = lustre_msg_buf(req->rq_reqmsg, offset + 2, 0);
+                if (r->ur_ea2data == NULL)
                         RETURN (-EFAULT);
 
-                r->ur_cookielen = req->rq_reqmsg->buflens[offset + 2];
+                r->ur_ea2datalen = req->rq_reqmsg->buflens[offset + 2];
         }
 
         RETURN(0);
@@ -504,6 +504,195 @@ int mds_update_unpack(struct ptlrpc_request *req, int offset,
         RETURN(rc);
 }
 
+static
+struct mds_idmap_table *__get_idmap_table(struct mds_export_data *med,
+                                          int create)
+{
+        struct mds_idmap_table *new;
+        int i;
+
+        if (!create || med->med_idmap)
+                return med->med_idmap;
+
+        spin_unlock(&med->med_idmap_lock);
+        OBD_ALLOC(new, sizeof(*new));
+        spin_lock(&med->med_idmap_lock);
+
+        if (!new) {
+                CERROR("fail to alloc %d\n", sizeof(*new));
+                return NULL;
+        }
+
+        if (med->med_idmap) {
+                OBD_FREE(new, sizeof(*new));
+                return med->med_idmap;
+        }
+
+        for (i = 0; i < MDS_IDMAP_HASHSIZE; i++) {
+                INIT_LIST_HEAD(&new->uidmap[i]);
+                INIT_LIST_HEAD(&new->gidmap[i]);
+        }
+
+        CDEBUG(D_SEC, "allocate idmap table for med %p\n", med);
+        med->med_idmap = new;
+        return new;
+}
+
+static void __flush_mapping_table(struct list_head *table)
+{
+        struct mds_idmap_item *item;
+        int i;
+
+        for (i = 0; i < MDS_IDMAP_HASHSIZE; i++) {
+                while (!list_empty(&table[i])) {
+                        item = list_entry(table[i].next, struct mds_idmap_item,
+                                          hash);
+                        list_del(&item->hash);
+                        OBD_FREE(item, sizeof(*item));
+                }
+        }
+}
+
+void mds_idmap_cleanup(struct mds_export_data *med)
+{
+        ENTRY;
+
+        if (!med->med_idmap) {
+                EXIT;
+                return;
+        }
+
+        spin_lock(&med->med_idmap_lock);
+        __flush_mapping_table(med->med_idmap->uidmap);
+        __flush_mapping_table(med->med_idmap->gidmap);
+        OBD_FREE(med->med_idmap, sizeof(struct mds_idmap_table));
+        spin_unlock(&med->med_idmap_lock);
+}
+
+static inline int idmap_hash(__u32 id)
+{
+        return (id & (MDS_IDMAP_HASHSIZE - 1));
+}
+
+static
+int __idmap_set_item(struct mds_export_data *med,
+                     struct list_head *table,
+                     __u32 id1, __u32 id2)
+{
+        struct list_head *head;
+        struct mds_idmap_item *item, *new = NULL;
+        int found = 0;
+
+        head = table + idmap_hash(id1);
+again:
+        list_for_each_entry(item, head, hash) {
+                if (item->id1 == id1) {
+                        found = 1;
+                        break;
+                }
+        }
+
+        if (!found) {
+                if (new == NULL) {
+                        spin_unlock(&med->med_idmap_lock);
+                        OBD_ALLOC(new, sizeof(*new));
+                        spin_lock(&med->med_idmap_lock);
+                        if (!new) {
+                                CERROR("fail to alloc %d\n", sizeof(*new));
+                                return -ENOMEM;
+                        }
+                        goto again;
+                }
+                new->id1 = id1;
+                new->id2 = id2;
+                list_add(&new->hash, head);
+        } else {
+                if (new)
+                        OBD_FREE(new, sizeof(*new));
+                if (item->id2 != id2) {
+                        CWARN("mapping changed: %u ==> (%u -> %u)\n",
+                               id1, item->id2, id2);
+                        item->id2 = id2;
+                }
+                list_move(&item->hash, head);
+        }
+
+        return 0;
+}
+
+int mds_idmap_set(struct mds_export_data *med, __u32 id1, __u32 id2,
+                  int is_uid_mapping)
+{
+        struct mds_idmap_table *idmap;
+        int rc;
+        ENTRY;
+
+        spin_lock(&med->med_idmap_lock);
+
+        idmap = __get_idmap_table(med, 1);
+        if (!idmap)
+                GOTO(out, rc = -ENOMEM);
+
+        if (is_uid_mapping)
+                rc = __idmap_set_item(med, idmap->uidmap, id1, id2);
+        else
+                rc = __idmap_set_item(med, idmap->gidmap, id1, id2);
+
+out:
+        spin_unlock(&med->med_idmap_lock);
+        RETURN(rc);
+}
+
+__u32 mds_idmap_get(struct mds_export_data *med, __u32 id,
+                    int is_uid_mapping)
+{
+        struct mds_idmap_table *idmap;
+        struct list_head *table;
+        struct list_head *head;
+        struct mds_idmap_item *item;
+        int found = 0;
+        __u32 res;
+
+        spin_lock(&med->med_idmap_lock);
+        idmap = __get_idmap_table(med, 0);
+        if (!idmap)
+                goto nllu;
+
+        table = is_uid_mapping ? idmap->uidmap : idmap->gidmap;
+        head = table + idmap_hash(id);
+
+        list_for_each_entry(item, head, hash) {
+                if (item->id1 == id) {
+                        found = 1;
+                        break;
+                }
+        }
+        if (!found)
+                goto nllu;
+
+        res = item->id2;
+out:
+        spin_unlock(&med->med_idmap_lock);
+        return res;
+nllu:
+        res = is_uid_mapping ? med->med_nllu : med->med_nllg;
+        goto out;
+}
+
+void mds_reverse_map_ugid(struct ptlrpc_request *req,
+                          struct mds_body *body)
+{
+        struct mds_export_data *med = &req->rq_export->u.eu_mds_data;
+
+        LASSERT(req->rq_remote);
+
+        if (body->valid & OBD_MD_FLUID)
+                body->uid = mds_idmap_get(med, body->uid, 1);
+
+        if (body->valid & OBD_MD_FLGID)
+                body->gid = mds_idmap_get(med, body->gid, 0);
+}
+
 static inline void drop_ucred_ginfo(struct lvfs_ucred *ucred)
 {
         if (ucred->luc_ginfo) {
@@ -512,43 +701,164 @@ static inline void drop_ucred_ginfo(struct lvfs_ucred *ucred)
         }
 }
 
+static inline void drop_ucred_lsd(struct lvfs_ucred *ucred)
+{
+        if (ucred->luc_lsd) {
+                mds_put_lsd(ucred->luc_lsd);
+                ucred->luc_lsd = NULL;
+        }
+}
+
 /*
+ * the heart of the uid/gid handling and security checking.
+ *
  * root could set any group_info if we allowed setgroups, while
  * normal user only could 'reduce' their group members -- which
  * is somewhat expensive.
  */
-int mds_init_ucred(struct lvfs_ucred *ucred, struct mds_req_sec_desc *rsd)
+int mds_init_ucred(struct lvfs_ucred *ucred,
+                   struct ptlrpc_request *req,
+                   struct mds_req_sec_desc *rsd)
 {
+        struct mds_obd *mds = &req->rq_export->exp_obd->u.mds;
+        struct mds_export_data *med = &req->rq_export->u.eu_mds_data;
+        struct lustre_sec_desc *lsd;
+        ptl_nid_t peernid = req->rq_peer.peer_id.nid;
         struct group_info *gnew;
-
+        unsigned int setuid, setgid, strong_sec;
         ENTRY;
+
         LASSERT(ucred);
         LASSERT(rsd);
+        LASSERT(rsd->rsd_ngroups <= LUSTRE_MAX_GROUPS);
+
+        strong_sec = (req->rq_auth_uid != -1);
+        LASSERT(!(req->rq_remote && !strong_sec));
+
+        /* sanity check & set local/remote flag */
+        if (req->rq_remote) {
+                if (med->med_local) {
+                        CWARN("exp %p: client on nid "LPX64" was local, "
+                              "set to remote\n", req->rq_export, peernid);
+                        med->med_local = 0;
+                }
+        } else {
+                if (!med->med_local) {
+                        CWARN("exp %p: client on nid "LPX64" was remote, "
+                              "set to local\n", req->rq_export, peernid);
+                        med->med_local = 1;
+                }
+        }
+
+        setuid = (rsd->rsd_fsuid != rsd->rsd_uid);
+        setgid = (rsd->rsd_fsgid != rsd->rsd_gid);
+
+        /* deny setuid/setgid for remote client */
+        if ((setuid || setgid) && !med->med_local) {
+                CWARN("deny setxid (%u/%u) from remote client "LPX64"\n",
+                      setuid, setgid, peernid);
+                RETURN(-EPERM);
+        }
+
+        /* take care of uid/gid mapping for client in remote realm */
+        if (req->rq_remote) {
+                /* record the uid mapping here */
+                mds_idmap_set(med, req->rq_auth_uid, rsd->rsd_uid, 1);
+
+                /* now we act as the authenticated user */
+                rsd->rsd_uid = rsd->rsd_fsuid = req->rq_auth_uid;
+        } else if (strong_sec && req->rq_auth_uid != rsd->rsd_uid) {
+                /* if strong authentication is used on this request, we
+                 * expect the uid which the client claimed to be true.
+                 *
+                 * FIXME root's machine credential in krb5 will be interpreted
+                 * as "nobody", which is not good for mds-mds and mds-ost
+                 * connections.
+                 */
+                CWARN("nid "LPX64": UID %u was authenticated while client "
+                      "claimed %u, set %u by force\n",
+                      peernid, req->rq_auth_uid, rsd->rsd_uid,
+                      req->rq_auth_uid);
+                rsd->rsd_uid = req->rq_auth_uid;
+        }
+
+        /* now the lsd comes into play */
+        ucred->luc_ginfo = NULL;
+        ucred->luc_lsd = lsd = mds_get_lsd(rsd->rsd_uid);
+
+        if (lsd) {
+                if (req->rq_remote) {
+                        /* record the gid mapping here */
+                        mds_idmap_set(med, lsd->lsd_gid, rsd->rsd_gid, 0);
+                        /* now we act as the authenticated group */
+                        rsd->rsd_gid = rsd->rsd_fsgid = lsd->lsd_gid;
+                } else if (rsd->rsd_gid != lsd->lsd_gid) {
+                        /* verify that the gid the client declared is correct */
+                        CWARN("GID: %u while client declare %u, "
+                              "set %u by force\n",
+                              lsd->lsd_gid, rsd->rsd_gid,
+                              lsd->lsd_gid);
+                        rsd->rsd_gid = lsd->lsd_gid;
+                }
+
+                if (lsd->lsd_ginfo) {
+                        ucred->luc_ginfo = lsd->lsd_ginfo;
+                        get_group_info(ucred->luc_ginfo);
+                }
+
+                /* check permission of setuid */
+                if (setuid) {
+                        if (!lsd->lsd_allow_setuid) {
+                                CWARN("mds blocked setuid attempt: %u -> %u\n",
+                                      rsd->rsd_uid, rsd->rsd_fsuid);
+                                RETURN(-EPERM);
+                        }
+                }
+
+                /* check permission of setgid */
+                if (setgid) {
+                        if (!lsd->lsd_allow_setgid) {
+                                CWARN("mds blocked setgid attempt: %u -> %u\n",
+                                      rsd->rsd_gid, rsd->rsd_fsgid);
+                                RETURN(-EPERM);
+                        }
+                }
+        } else {
+                /* failed to get the lsd; for now we simply deny any access
+                 * if strong authentication is used,
+                 */
+                if (strong_sec) {
+                        CWARN("mds deny access without LSD\n");
+                        RETURN(-EPERM);
+                }
+
+                /* and otherwise deny setuid/setgid attempt */
+                if (setuid || setgid) {
+                        CWARN("mds deny setuid/setgid without LSD\n");
+                        RETURN(-EPERM);
+                }
+        }
 
+        /* NOTE: we have already obtained the supplementary groups;
+         * they are retained across root_squash. could that be a
+         * security problem??
+         */
+        mds_squash_root(mds, rsd, &peernid); 
+
+        /* remove privilege for non-root user */
+        if (rsd->rsd_fsuid)
+                rsd->rsd_cap &= ~CAP_FS_MASK;
+
+        /* by now every field in rsd has been validated */
         ucred->luc_fsuid = rsd->rsd_fsuid;
         ucred->luc_fsgid = rsd->rsd_fsgid;
         ucred->luc_cap = rsd->rsd_cap;
         ucred->luc_uid = rsd->rsd_uid;
-        ucred->luc_ghash = mds_get_group_entry(NULL, rsd->rsd_uid);
-        ucred->luc_ginfo = NULL;
 
-        if (ucred->luc_ghash && ucred->luc_ghash->ge_group_info) {
-                ucred->luc_ginfo = ucred->luc_ghash->ge_group_info;
-                get_group_info(ucred->luc_ginfo);
-        }
-
-        /* everything is done if we don't allow set groups */
-        if (!mds_allow_setgroups())
+        /* everything is done if we don't allow setgroups */
+        if (!lsd || !lsd->lsd_allow_setgrp)
                 RETURN(0);
 
-        if (rsd->rsd_ngroups > LUSTRE_MAX_GROUPS) {
-                CERROR("client provide too many groups: %d\n",
-                rsd->rsd_ngroups);
-                drop_ucred_ginfo(ucred);
-                mds_put_group_entry(NULL, ucred->luc_ghash);
-                RETURN(-EFAULT);
-        }
-
         if (ucred->luc_uid == 0) {
                 if (rsd->rsd_ngroups == 0) {
                         drop_ucred_ginfo(ucred);
@@ -559,12 +869,11 @@ int mds_init_ucred(struct lvfs_ucred *ucred, struct mds_req_sec_desc *rsd)
                 if (!gnew) {
                         CERROR("out of memory\n");
                         drop_ucred_ginfo(ucred);
-                        mds_put_group_entry(NULL, ucred->luc_ghash);
+                        drop_ucred_lsd(ucred);
                         RETURN(-ENOMEM);
                 }
                 groups_from_buffer(gnew, rsd->rsd_groups);
-                /* can't rely on client to sort them */
-                groups_sort(gnew);
+                groups_sort(gnew); /* can't rely on client */
 
                 drop_ucred_ginfo(ucred);
                 ucred->luc_ginfo = gnew;
@@ -586,7 +895,7 @@ int mds_init_ucred(struct lvfs_ucred *ucred, struct mds_req_sec_desc *rsd)
                 if (!gnew) {
                         CERROR("out of memory\n");
                         drop_ucred_ginfo(ucred);
-                        mds_put_group_entry(NULL, ucred->luc_ghash);
+                        drop_ucred_lsd(ucred);
                         RETURN(-ENOMEM);
                 }
 
@@ -594,8 +903,8 @@ int mds_init_ucred(struct lvfs_ucred *ucred, struct mds_req_sec_desc *rsd)
                 while (cur < rsd->rsd_ngroups) {
                         if (groups_search(ginfo, rsd->rsd_groups[cur])) {
                                 GROUP_AT(gnew, set) = rsd->rsd_groups[cur];
-                               set++;
-                       }
+                                set++;
+                        }
                         cur++;
                 }
                 gnew->ngroups = set;
@@ -609,11 +918,7 @@ int mds_init_ucred(struct lvfs_ucred *ucred, struct mds_req_sec_desc *rsd)
 void mds_exit_ucred(struct lvfs_ucred *ucred)
 {
         ENTRY;
-
-        if (ucred->luc_ginfo)
-                put_group_info(ucred->luc_ginfo);
-        if (ucred->luc_ghash)
-                mds_put_group_entry(NULL, ucred->luc_ghash);
-
+        drop_ucred_ginfo(ucred);
+        drop_ucred_lsd(ucred);
         EXIT;
 }
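
Taken together, the mds_lib.c hunks above swap the old group-hash lookup for an LSD lookup plus a per-export idmap. A rough, hedged sketch of how a request handler is expected to string the new entry points together (handle_request() and do_mds_work() are placeholder names, not functions from this patch):

/* Sketch only: intended call order around the new ucred/idmap helpers. */
static int handle_request(struct ptlrpc_request *req,
                          struct mds_req_sec_desc *rsd,
                          struct mds_body *repbody)
{
        struct lvfs_ucred uc = { 0 };
        int rc;

        /* maps uid/gid for remote clients and pins the LSD entry */
        rc = mds_init_ucred(&uc, req, rsd);
        if (rc)
                return rc;

        rc = do_mds_work(req, &uc, repbody);   /* placeholder for real work */

        /* translate server-side ids back into the client's id space */
        if (req->rq_remote)
                mds_reverse_map_ugid(req, repbody);

        /* drops the group_info and the LSD reference */
        mds_exit_ucred(&uc);
        return rc;
}

The mds_lock_and_check_slave() hunk further down shows the updated mds_init_ucred() call in its real context.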
index 0287b11..6685b2b 100644 (file)
 #define DEBUG_SUBSYSTEM S_MDS
 
 #include <linux/module.h>
+#include <linux/dcache.h>
+#include <linux/namei.h>
+#include <linux/obd_support.h>
+#include <linux/obd_class.h>
+#include <linux/obd.h>
+#include <linux/lustre_lib.h>
 #include <linux/lustre_mds.h>
 #include <linux/lustre_idl.h>
 #include <linux/obd_class.h>
 #include <linux/obd_lov.h>
 #include <linux/lustre_lib.h>
 #include <linux/lustre_fsfilt.h>
+#include <linux/lustre_lite.h>
 
 #include "mds_internal.h"
 
-
 /*
  * TODO:
  *   - magic in mea struct
@@ -110,6 +116,13 @@ int mds_md_connect(struct obd_device *obd, char *md_name)
         if (rc)
                 GOTO(err_reg, rc);
 
+        if (mds->mds_mds_sec) {
+                rc = obd_set_info(mds->mds_md_exp, strlen("sec"), "sec",
+                                  strlen(mds->mds_mds_sec), mds->mds_mds_sec);
+                if (rc)
+                        GOTO(err_reg, rc);
+        }
+
         mds->mds_md_connected = 1;
         up(&mds->mds_md_sem);
        RETURN(0);
@@ -952,10 +965,13 @@ int mds_lock_slave_objs(struct obd_device *obd, struct dentry *dentry,
         op_data->mea1 = mea;
         it.it_op = IT_UNLINK;
 
+        OBD_ALLOC(it.d.fs_data, sizeof(struct lustre_intent_data));
+
         rc = md_enqueue(mds->mds_md_exp, LDLM_IBITS, &it, LCK_EX,
                         op_data, *rlockh, NULL, 0, ldlm_completion_ast,
                         mds_blocking_ast, NULL);
         OBD_FREE(op_data, sizeof(*op_data));
+        OBD_FREE(it.d.fs_data, sizeof(struct lustre_intent_data));
         EXIT;
 cleanup:
         OBD_FREE(mea, mea_size);
@@ -1133,7 +1149,6 @@ int mds_lock_and_check_slave(int offset, struct ptlrpc_request *req,
                 CERROR("Can't unpack security desc\n");
                 GOTO(cleanup, rc = -EFAULT);
         }
-        mds_squash_root(&obd->u.mds, rsd, &req->rq_peer.peer_id.nid); 
 
         body = lustre_swab_reqbuf(req, offset, sizeof(*body),
                                   lustre_swab_mds_body);
@@ -1162,7 +1177,7 @@ int mds_lock_and_check_slave(int offset, struct ptlrpc_request *req,
        if (!S_ISDIR(dentry->d_inode->i_mode))
                GOTO(cleanup, rc = 0);
 
-        rc = mds_init_ucred(&uc, rsd);
+        rc = mds_init_ucred(&uc, req, rsd);
         if (rc) {
                 CERROR("can't init ucred\n");
                 GOTO(cleanup, rc);
index a0f3880..bd3ed48 100644 (file)
@@ -242,6 +242,16 @@ int mds_dt_connect(struct obd_device *obd, char * lov_name)
                 RETURN(-ENOTCONN);
         }
 
+        if (mds->mds_ost_sec) {
+                rc = obd_set_info(mds->mds_dt_obd->obd_self_export,
+                                  strlen("sec"), "sec",
+                                  strlen(mds->mds_ost_sec), mds->mds_ost_sec);
+                if (rc) {
+                        mds->mds_dt_obd = ERR_PTR(rc);
+                        RETURN(rc);
+                }
+        }
+
         CDEBUG(D_HA, "obd: %s osc: %s lov_name: %s\n",
                obd->obd_name, mds->mds_dt_obd->obd_name, lov_name);
 
diff --git a/lustre/mds/mds_lsd.c b/lustre/mds/mds_lsd.c
new file mode 100644 (file)
index 0000000..fbc3de3
--- /dev/null
@@ -0,0 +1,240 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2004 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_MDS
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/kmod.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/version.h>
+#include <linux/unistd.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <asm/uaccess.h>
+#include <linux/slab.h>
+#include <asm/segment.h>
+
+#include <libcfs/list.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_lib.h>
+#include <linux/lustre_mds.h>
+#include <linux/lustre_ucache.h>
+
+#include "mds_internal.h"
+
+/*
+ * We need to share the hash table among the group of MDSs that serve the same
+ * Lustre file system -- maybe in the MDT? -- but there are lprocfs problems
+ * with putting it in the MDT, so we make it global to the module.  This brings
+ * the limitation that one node cannot run multiple MDSs serving different
+ * Lustre file systems, which is probably not a meaningful restriction.
+ */
+
+
+#define MDS_LSD_HASHSIZE        (256)
+static struct upcall_cache _lsd_cache;
+static struct list_head _lsd_hashtable[MDS_LSD_HASHSIZE];
+
+struct upcall_cache *__mds_get_global_lsd_cache()
+{
+        return &_lsd_cache;
+}
+
+static unsigned int lsd_hash(struct upcall_cache *cache, __u64 key)
+{
+        LASSERT(cache == &_lsd_cache);
+        return ((__u32) key) & (MDS_LSD_HASHSIZE - 1);
+}
+
+static struct upcall_cache_entry *
+lsd_alloc_entry(struct upcall_cache *cache, __u64 key)
+{
+        struct lsd_cache_entry *entry;
+        ENTRY;
+
+        OBD_ALLOC(entry, sizeof(*entry));
+        if (!entry) {
+                CERROR("failed to alloc entry\n");
+                RETURN(NULL);
+        }
+        upcall_cache_init_entry(cache, &entry->base, key);
+
+        RETURN(&entry->base);
+}
+
+static void lsd_free_entry(struct upcall_cache *cache,
+                           struct upcall_cache_entry *entry)
+{
+        struct lsd_cache_entry *lentry;
+
+        lentry = container_of(entry, struct lsd_cache_entry, base);
+        if (lentry->lsd.lsd_ginfo)
+                put_group_info(lentry->lsd.lsd_ginfo);
+        OBD_FREE(lentry, sizeof(*lentry));
+}
+
+
+static int lsd_make_upcall(struct upcall_cache *cache,
+                           struct upcall_cache_entry *entry)
+{
+        char *argv[4];
+        char *envp[3];
+        char uidstr[16];
+        int rc;
+        ENTRY;
+
+        snprintf(uidstr, 16, "%u", (__u32) entry->ue_key);
+
+        argv[0] = cache->uc_upcall;
+        argv[1] = uidstr;
+        argv[2] = NULL;
+
+        envp[0] = "HOME=/";
+        envp[1] = "PATH=/sbin:/usr/sbin";
+        envp[2] = NULL;
+
+        rc = USERMODEHELPER(argv[0], argv, envp);
+        if (rc < 0) {
+                CERROR("Error invoking lsd upcall %s %s: %d; check "
+                       "/proc/fs/lustre/mds/lsd_upcall\n",
+                       argv[0], argv[1], rc);
+        } else {
+                CWARN("Invoked upcall %s %s\n",
+                        argv[0], argv[1]);
+        }
+        RETURN(rc);
+}
+
+static int lsd_parse_downcall(struct upcall_cache *cache,
+                              struct upcall_cache_entry *entry,
+                              void *args)
+{
+        struct lustre_sec_desc *lsd;
+        struct lsd_cache_entry *lentry;
+        struct lsd_downcall_args *lsd_args;
+        struct group_info *ginfo;
+        ENTRY;
+
+        LASSERT(args);
+
+        lentry = container_of(entry, struct lsd_cache_entry, base);
+        lsd = &lentry->lsd;
+        lsd_args = (struct lsd_downcall_args *) args;
+        LASSERT(lsd_args->err == 0);
+        LASSERT(lsd_args->ngroups <= NGROUPS_MAX);
+
+        ginfo = groups_alloc(lsd_args->ngroups);
+        if (!ginfo) {
+                CERROR("can't alloc group_info for %d groups\n",
+                        lsd_args->ngroups);
+                RETURN(-ENOMEM);
+        }
+        groups_from_buffer(ginfo, lsd_args->groups);
+        groups_sort(ginfo);
+
+        lsd->lsd_uid = lsd_args->uid;
+        lsd->lsd_gid = lsd_args->gid;
+        lsd->lsd_ginfo = ginfo;
+        lsd->lsd_allow_setuid = lsd_args->allow_setuid;
+        lsd->lsd_allow_setgid = lsd_args->allow_setgid;
+        lsd->lsd_allow_setgrp = lsd_args->allow_setgrp;
+
+        CWARN("LSD: uid %u gid %u ngroups %u, perm (%d/%d/%d)\n",
+              lsd->lsd_uid, lsd->lsd_gid, ginfo->ngroups,
+              lsd->lsd_allow_setuid, lsd->lsd_allow_setgid,
+              lsd->lsd_allow_setgrp);
+        RETURN(0);
+}
+
+struct lustre_sec_desc * mds_get_lsd(__u32 uid)
+{
+        struct upcall_cache *cache = &_lsd_cache;
+        struct upcall_cache_entry *entry;
+        struct lsd_cache_entry *lentry;
+
+        entry = upcall_cache_get_entry(cache, (__u64) uid);
+        if (!entry)
+                return NULL;
+
+        lentry = container_of(entry, struct lsd_cache_entry, base);
+        return &lentry->lsd;
+}
+
+void mds_put_lsd(struct lustre_sec_desc *lsd)
+{
+        struct lsd_cache_entry *lentry;
+
+        LASSERT(lsd);
+
+        lentry = container_of(lsd, struct lsd_cache_entry, lsd);
+        upcall_cache_put_entry(&lentry->base);
+}
+
+int mds_init_lsd_cache()
+{
+        struct upcall_cache *cache = &_lsd_cache;
+        int i;
+        ENTRY;
+
+        cache->uc_hashtable = _lsd_hashtable;
+        cache->uc_hashsize = MDS_LSD_HASHSIZE;
+        cache->uc_hashlock = RW_LOCK_UNLOCKED;
+        for (i = 0; i < cache->uc_hashsize; i++)
+                INIT_LIST_HEAD(&cache->uc_hashtable[i]);
+        cache->uc_name = "LSD_CACHE";
+
+        /* set default value, proc tunable */
+        sprintf(cache->uc_upcall, "%s", "/sbin/lsd_upcall");
+        cache->uc_entry_expire = 5 * 60;
+        cache->uc_acquire_expire = 5;
+
+        cache->hash = lsd_hash;
+        cache->alloc_entry = lsd_alloc_entry;
+        cache->free_entry = lsd_free_entry;
+        cache->make_upcall = lsd_make_upcall;
+        cache->parse_downcall = lsd_parse_downcall;
+
+        RETURN(0);
+}
+
+void mds_flush_lsd(__u32 id)
+{
+        struct upcall_cache *cache = &_lsd_cache;
+
+        if (id == -1)
+                upcall_cache_flush_idle(cache);
+        else
+                upcall_cache_flush_one(cache, (__u64) id);
+}
+
+void mds_cleanup_lsd_cache()
+{
+        upcall_cache_flush_all(&_lsd_cache);
+}
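
The LSD cache above is filled by a user-space helper (/sbin/lsd_upcall by default, a proc tunable) whose answer comes back as a struct lsd_downcall_args. A minimal, illustrative consumer of the cache -- check_setuid_allowed() is a hypothetical caller, not part of this patch:

/* Illustrative only: fetch an LSD, inspect it, release the reference. */
static int check_setuid_allowed(__u32 uid)
{
        struct lustre_sec_desc *lsd;
        int rc;

        /* may trigger the upcall and wait for the matching downcall */
        lsd = mds_get_lsd(uid);
        if (lsd == NULL)
                return -EPERM;          /* no LSD: deny, as mds_init_ucred() does */

        rc = lsd->lsd_allow_setuid ? 0 : -EPERM;
        mds_put_lsd(lsd);               /* drop the upcall-cache reference */
        return rc;
}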
index 9d947a1..8b66569 100644 (file)
@@ -111,7 +111,7 @@ static void mds_mfd_destroy(struct mds_file_data *mfd)
         mds_mfd_put(mfd);
 }
 
-
+#ifdef IFILTERDATA_ACTUALLY_USED
 /* Caller must hold mds->mds_epoch_sem */
 static int mds_alloc_filterdata(struct inode *inode)
 {
@@ -131,6 +131,7 @@ static void mds_free_filterdata(struct inode *inode)
         inode->i_filterdata = NULL;
         iput(inode);
 }
+#endif /*IFILTERDATA_ACTUALLY_USED*/
 
 /* Write access to a file: executors cause a negative count,
  * writers a positive count.  The semaphore is needed to perform
@@ -155,7 +156,7 @@ static int mds_get_write_access(struct mds_obd *mds, struct inode *inode,
                 RETURN(-ETXTBSY);
         }
 
-
+#ifdef IFILTERDATA_ACTUALLY_USED
         if (MDS_FILTERDATA(inode) && MDS_FILTERDATA(inode)->io_epoch != 0) {
                 CDEBUG(D_INODE, "continuing MDS epoch "LPU64" for ino %lu/%u\n",
                        MDS_FILTERDATA(inode)->io_epoch, inode->i_ino,
@@ -169,14 +170,17 @@ static int mds_get_write_access(struct mds_obd *mds, struct inode *inode,
                 rc = -ENOMEM;
                 goto out;
         }
+#endif /*IFILTERDATA_ACTUALLY_USED*/
         if (epoch > mds->mds_io_epoch)
                 mds->mds_io_epoch = epoch;
         else
                 mds->mds_io_epoch++;
+#ifdef IFILTERDATA_ACTUALLY_USED
         MDS_FILTERDATA(inode)->io_epoch = mds->mds_io_epoch;
         CDEBUG(D_INODE, "starting MDS epoch "LPU64" for ino %lu/%u\n",
                mds->mds_io_epoch, inode->i_ino, inode->i_generation);
  out:
+#endif /*IFILTERDATA_ACTUALLY_USED*/
         if (rc == 0)
                 atomic_inc(&inode->i_writecount);
         up(&mds->mds_epoch_sem);
@@ -201,7 +205,9 @@ static int mds_put_write_access(struct mds_obd *mds, struct inode *inode,
         if (!unlinking && !(body->valid & OBD_MD_FLSIZE))
                 GOTO(out, rc = EAGAIN);
 #endif
+#ifdef IFILTERDATA_ACTUALLY_USED
         mds_free_filterdata(inode);
+#endif
  out:
         up(&mds->mds_epoch_sem);
         return rc;
@@ -257,7 +263,9 @@ static struct mds_file_data *mds_dentry_open(struct dentry *dentry,
                 error = mds_get_write_access(mds, dentry->d_inode, 0);
                 if (error)
                         GOTO(cleanup_mfd, error);
+#ifdef IFILTERDATA_ACTUALLY_USED
                 body->io_epoch = MDS_FILTERDATA(dentry->d_inode)->io_epoch;
+#endif /*IFILTERDATA_ACTUALLY_USED*/
         } else if (flags & FMODE_EXEC) {
                 error = mds_deny_write_access(mds, dentry->d_inode);
                 if (error)
@@ -666,6 +674,13 @@ static int mds_finish_open(struct ptlrpc_request *req, struct dentry *dchild,
                         }
                 }
         }
+        rc = mds_pack_acl(obd, req->rq_repmsg, 3, body, dchild->d_inode);
+        if (rc < 0) {
+                CERROR("mds_pack_acl: rc = %d\n", rc);
+                up(&dchild->d_inode->i_sem);
+                RETURN(rc);
+        }
+
         /* If the inode has no EA data, then MDSs hold size, mtime */
         if (S_ISREG(dchild->d_inode->i_mode) &&
             !(body->valid & OBD_MD_FLEASIZE)) {
index 8753866..a4e7a9b 100644 (file)
@@ -32,6 +32,7 @@
 
 #include <linux/fs.h>
 #include <linux/jbd.h>
+#include <linux/namei.h>
 #include <linux/ext3_fs.h>
 #include <linux/obd_support.h>
 #include <linux/obd_class.h>
@@ -42,6 +43,7 @@
 #include <linux/lustre_dlm.h>
 #include <linux/lustre_log.h>
 #include <linux/lustre_fsfilt.h>
+#include <linux/lustre_lite.h>
 #include "mds_internal.h"
 
 struct mds_logcancel_data {
@@ -191,6 +193,10 @@ out_commit:
  * chown_common and inode_setattr
  * utimes and inode_setattr
  */
+#ifndef ATTR_RAW
+/* Just in case some clients still know about ATTR_RAW */
+#define ATTR_RAW 8192
+#endif
 int mds_fix_attr(struct inode *inode, struct mds_update_record *rec)
 {
         time_t now = LTIME_S(CURRENT_TIME);
@@ -200,6 +206,7 @@ int mds_fix_attr(struct inode *inode, struct mds_update_record *rec)
         ENTRY;
 
         /* only fix up attrs if the client VFS didn't already */
+
         if (!(ia_valid & ATTR_RAW))
                 RETURN(0);
 
@@ -296,10 +303,10 @@ void mds_steal_ack_locks(struct ptlrpc_request *req)
                 if (oldrep->rs_xid != req->rq_xid)
                         continue;
 
-                if (oldrep->rs_msg.opc != req->rq_reqmsg->opc)
+                if (oldrep->rs_msg->opc != req->rq_reqmsg->opc)
                         CERROR ("Resent req xid "LPX64" has mismatched opc: "
                                 "new %d old %d\n", req->rq_xid,
-                                req->rq_reqmsg->opc, oldrep->rs_msg.opc);
+                                req->rq_reqmsg->opc, oldrep->rs_msg->opc);
 
                 svc = oldrep->rs_srv_ni->sni_service;
                 spin_lock (&svc->srv_lock);
@@ -308,7 +315,7 @@ void mds_steal_ack_locks(struct ptlrpc_request *req)
 
                 CWARN("Stealing %d locks from rs %p x"LPD64".t"LPD64
                       " o%d NID %s\n", oldrep->rs_nlocks, oldrep,
-                      oldrep->rs_xid, oldrep->rs_transno, oldrep->rs_msg.opc,
+                      oldrep->rs_xid, oldrep->rs_transno, oldrep->rs_msg->opc,
                       ptlrpc_peernid2str(&exp->exp_connection->c_peer, str));
 
                 for (i = 0; i < oldrep->rs_nlocks; i++)
@@ -444,25 +451,40 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset,
         else                                            /* setattr */
                 rc = fsfilt_setattr(obd, de, handle, &rec->ur_iattr, 0);
 
-        if (rc == 0 && (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) &&
-            rec->ur_eadata != NULL) {
-                struct lov_stripe_md *lsm = NULL;
-
-                rc = ll_permission(inode, MAY_WRITE, NULL);
-                if (rc < 0)
-                        GOTO(cleanup, rc);
+        if (rc == 0) {
+                if (rec->ur_iattr.ia_valid & ATTR_EA) {
+                        int flags = (int)rec->ur_iattr.ia_attr_flags;
+
+                        rc = -EOPNOTSUPP;
+                        if (inode->i_op && inode->i_op->setxattr) 
+                                rc = inode->i_op->setxattr(de, rec->ur_eadata,
+                                       rec->ur_ea2data, rec->ur_ea2datalen,
+                                       flags);
+                } else if (rec->ur_iattr.ia_valid & ATTR_EA_RM) {
+                        rc = -EOPNOTSUPP;
+                        if (inode->i_op && inode->i_op->removexattr) 
+                                rc = inode->i_op->removexattr(de,
+                                                    rec->ur_eadata);
+                } else if ((S_ISREG(inode->i_mode) ||
+                           S_ISDIR(inode->i_mode)) && rec->ur_eadata != NULL) {
+                         struct lov_stripe_md *lsm = NULL;
+
+                        rc = ll_permission(inode, MAY_WRITE, NULL);
+                        if (rc < 0)
+                                GOTO(cleanup, rc);
 
-                rc = obd_iocontrol(OBD_IOC_LOV_SETSTRIPE, mds->mds_dt_exp,
-                                   0, &lsm, rec->ur_eadata);
-                if (rc)
-                        GOTO(cleanup, rc);
+                        rc = obd_iocontrol(OBD_IOC_LOV_SETSTRIPE, mds->mds_dt_exp,
+                                           0, &lsm, rec->ur_eadata);
+                        if (rc)
+                                GOTO(cleanup, rc);
 
-                obd_free_memmd(mds->mds_dt_exp, &lsm);
+                        obd_free_memmd(mds->mds_dt_exp, &lsm);
 
-                rc = fsfilt_set_md(obd, inode, handle, rec->ur_eadata,
-                                   rec->ur_eadatalen);
-                if (rc)
-                        GOTO(cleanup, rc);
+                        rc = fsfilt_set_md(obd, inode, handle, rec->ur_eadata,
+                                           rec->ur_eadatalen);
+                        if (rc)
+                                GOTO(cleanup, rc);
+                }    
         }
 
         body = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*body));
@@ -476,6 +498,10 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset,
         if (rec->ur_iattr.ia_valid & (ATTR_ATIME | ATTR_ATIME_SET))
                 body->valid |= OBD_MD_FLATIME;
 
+        /* The logcookie should be of no use anymore; why has nobody removed
+         * the following code block?
+         */
+        LASSERT(rec->ur_cookielen == 0);
         if (rc == 0 && rec->ur_cookielen && !IS_ERR(mds->mds_dt_obd)) {
                 OBD_ALLOC(mlcd, sizeof(*mlcd) + rec->ur_cookielen +
                           rec->ur_eadatalen);
@@ -2845,24 +2871,31 @@ static int mds_check_for_rename(struct obd_device *obd,
                 mds_pack_dentry2id(obd, &op_data->id1, dentry, 1);
 
                 it.it_op = IT_UNLINK;
+                OBD_ALLOC(it.d.fs_data, sizeof(struct lustre_intent_data));
+                if (!it.d.fs_data)
+                        RETURN(-ENOMEM);
                 rc = md_enqueue(mds->mds_md_exp, LDLM_IBITS, &it, LCK_EX,
                                 op_data, rlockh, NULL, 0, ldlm_completion_ast,
                                 mds_blocking_ast, NULL);
                 OBD_FREE(op_data, sizeof(*op_data));
 
-                if (rc)
-                        RETURN(rc);
 
+                if (rc) {
+                        OBD_FREE(it.d.fs_data,
+                                 sizeof(struct lustre_intent_data));
+                        RETURN(rc);
+                }
                 if (rlockh->cookie != 0)
                         ldlm_lock_decref(rlockh, LCK_EX);
                 
-                if (it.d.lustre.it_data) {
-                        req = (struct ptlrpc_request *)it.d.lustre.it_data;
+                if (LUSTRE_IT(&it)->it_data) {
+                        req = (struct ptlrpc_request *)LUSTRE_IT(&it)->it_data;
                         ptlrpc_req_finished(req);
                 }
 
-                if (it.d.lustre.it_status)
-                        rc = it.d.lustre.it_status;
+                if (LUSTRE_IT(&it)->it_status)
+                        rc = LUSTRE_IT(&it)->it_status;
+                OBD_FREE(it.d.fs_data, sizeof(struct lustre_intent_data));
                 OBD_FREE(rlockh, handle_size);
         }
         RETURN(rc);
index 21090cf..90a61e1 100644 (file)
@@ -459,12 +459,37 @@ int obd_proc_read_pinger(char *page, char **start, off_t off, int count,
                        );
 }
 
+#if ENABLE_GSS
+/* FIXME move this stuff to a proper place */
+int (*lustre_secinit_downcall_handler)(const char *buffer,
+                                       long count) = NULL;
+EXPORT_SYMBOL(lustre_secinit_downcall_handler);
+
+int obd_proc_write_secinit(struct file *file, const char *buffer,
+                           unsigned long count, void *data)
+{
+        int rc = 0;
+
+        if (lustre_secinit_downcall_handler) {
+                rc = (*lustre_secinit_downcall_handler)((char *)buffer, count);
+                if (rc) {
+                        LASSERT(rc < 0);
+                        return rc;
+                }
+        }
+        return (int)count;
+}
+#endif
+
 /* Root for /proc/fs/lustre */
 struct proc_dir_entry *proc_lustre_root = NULL;
 struct lprocfs_vars lprocfs_base[] = {
         { "version", obd_proc_read_version, NULL, NULL },
         { "kernel_version", obd_proc_read_kernel_version, NULL, NULL },
         { "pinger", obd_proc_read_pinger, NULL, NULL },
+#if ENABLE_GSS
+        { "secinit", NULL, obd_proc_write_secinit, NULL },
+#endif
         { 0 }
 };
 
index 26c96e6..2bd9b91 100644 (file)
@@ -548,6 +548,7 @@ void class_import_put(struct obd_import *import)
         if (import->imp_connection)
                 ptlrpc_put_connection_superhack(import->imp_connection);
 
+        LASSERT(!import->imp_sec);
         while (!list_empty(&import->imp_conn_list)) {
                 struct obd_import_conn *imp_conn;
 
@@ -575,6 +576,7 @@ struct obd_import *class_new_import(void)
         INIT_LIST_HEAD(&imp->imp_replay_list);
         INIT_LIST_HEAD(&imp->imp_sending_list);
         INIT_LIST_HEAD(&imp->imp_delayed_list);
+        INIT_LIST_HEAD(&imp->imp_rawrpc_list);
         spin_lock_init(&imp->imp_lock);
         imp->imp_conn_cnt = 0;
         imp->imp_max_transno = 0;
index 0237fc0..9d7afe9 100644 (file)
@@ -43,11 +43,14 @@ int filter_log_sz_change(struct llog_handle *cathandle,
                          struct inode *inode)
 {
         struct llog_size_change_rec *lsc;
-        int rc;
+#ifdef IFILTERDATA_ACTUALLY_USED
         struct ost_filterdata *ofd;
+#endif
+        int rc;
         ENTRY;
 
         down(&inode->i_sem);
+#ifdef IFILTERDATA_ACTUALLY_USED
         ofd = inode->i_filterdata;
         
         if (ofd && ofd->ofd_epoch >= io_epoch) {
@@ -68,6 +71,7 @@ int filter_log_sz_change(struct llog_handle *cathandle,
                 inode->i_filterdata = ofd;
                 ofd->ofd_epoch = io_epoch;
         }
+#endif
         /* the decision to write a record is now made, unlock */
         up(&inode->i_sem);
 
@@ -88,7 +92,9 @@ int filter_log_sz_change(struct llog_handle *cathandle,
                 rc = 0;
         }
 
-        out:
+#ifdef IFILTERDATA_ACTUALLY_USED
+out:
+#endif
         RETURN(rc);
 }
 struct obd_llogs * filter_grab_llog_for_group(struct obd_device *,
index 79b4b6b..cdf2ae6 100644 (file)
@@ -26,6 +26,8 @@
 
 #ifdef __KERNEL__
 # include <linux/module.h>
+# include <linux/dcache.h>
+# include <linux/namei.h>
 # include <linux/obd.h>
 # include <linux/obd_ost.h>
 # include <linux/lustre_net.h>
index f6f1a6c..b41258e 100644 (file)
@@ -51,6 +51,7 @@
 #include <linux/lustre_dlm.h>
 #include <libcfs/kp30.h>
 #include <linux/lustre_net.h>
+#include <linux/lustre_sec.h>
 #include <lustre/lustre_user.h>
 #include <linux/obd_ost.h>
 #include <linux/obd_lov.h>
@@ -2880,6 +2881,31 @@ static int osc_set_info(struct obd_export *exp, obd_count keylen,
                 RETURN(0);
         }
 
+        if (keylen == strlen("sec") && memcmp(key, "sec", keylen) == 0) {
+                struct client_obd *cli = &exp->exp_obd->u.cli;
+
+                if (vallen == strlen("null") &&
+                    memcmp(val, "null", vallen) == 0) {
+                        cli->cl_sec_flavor = PTLRPC_SEC_NULL;
+                        cli->cl_sec_subflavor = 0;
+                        RETURN(0);
+                }
+                if (vallen == strlen("krb5i") &&
+                    memcmp(val, "krb5i", vallen) == 0) {
+                        cli->cl_sec_flavor = PTLRPC_SEC_GSS;
+                        cli->cl_sec_subflavor = PTLRPC_SEC_GSS_KRB5I;
+                        RETURN(0);
+                }
+                if (vallen == strlen("krb5p") &&
+                    memcmp(val, "krb5p", vallen) == 0) {
+                        cli->cl_sec_flavor = PTLRPC_SEC_GSS;
+                        cli->cl_sec_subflavor = PTLRPC_SEC_GSS_KRB5P;
+                        RETURN(0);
+                }
+                CERROR("unrecognized security type %s\n", (char*) val);
+                RETURN(-EINVAL);
+        }
+
         if (keylen < strlen("mds_conn") ||
             memcmp(key, "mds_conn", strlen("mds_conn")) != 0)
                 RETURN(-EINVAL);
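
The OSC now recognizes a "sec" key whose value picks the flavor ("null", "krb5i" or "krb5p"); the MDS hunks earlier (mds_md_connect() and mds_dt_connect()) drive this through obd_set_info(). A one-function usage sketch, where the export argument is simply whatever export the caller holds:

/* Sketch: select the krb5i flavor on an export before connecting. */
static int select_krb5i(struct obd_export *exp)
{
        return obd_set_info(exp, strlen("sec"), "sec",
                            strlen("krb5i"), "krb5i");
}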
index c615015..d18919a 100644 (file)
@@ -1005,6 +1005,11 @@ int ost_msg_check_version(struct lustre_msg *msg)
                         CERROR("bad opc %u version %08x, expecting %08x\n",
                                msg->opc, msg->version, LUSTRE_LOG_VERSION);
                 break;
+        case SEC_INIT:
+        case SEC_INIT_CONTINUE:
+        case SEC_FINI:
+                rc = 0;
+                break;
         default:
                 CERROR("OST unexpected opcode %d\n", msg->opc);
                 rc = -ENOTSUPP;
@@ -1029,6 +1034,13 @@ int ost_handle(struct ptlrpc_request *req)
                 RETURN(rc);
         }
 
+        /* Security opc should NOT trigger any recovery events */
+        if (req->rq_reqmsg->opc == SEC_INIT ||
+            req->rq_reqmsg->opc == SEC_INIT_CONTINUE ||
+            req->rq_reqmsg->opc == SEC_FINI) {
+                GOTO(out, rc = 0);
+        }
+
         /* XXX identical to MDS */
         if (req->rq_reqmsg->opc != OST_CONNECT) {
                 struct obd_device *obd;
index 63b30e9..f2105e8 100644 (file)
@@ -16,7 +16,7 @@ LDLM_COMM_SOURCES= $(top_srcdir)/lustre/ldlm/l_lock.c         \
 
 COMMON_SOURCES =  client.c recover.c connection.c niobuf.c pack_generic.c   \
     events.c ptlrpc_module.c service.c pinger.c recov_thread.c llog_net.c   \
-    llog_client.c llog_server.c import.c ptlrpcd.c pers.c                  \
+    llog_client.c llog_server.c import.c ptlrpcd.c pers.c                  \
     ptlrpc_internal.h $(LDLM_COMM_SOURCES)
 
 if LIBLUSTRE
index 2d03035..1f6127e 100644 (file)
@@ -32,6 +32,7 @@
 #include <linux/lustre_lib.h>
 #include <linux/lustre_ha.h>
 #include <linux/lustre_import.h>
+#include <linux/lustre_sec.h>
 
 #include "ptlrpc_internal.h"
 
@@ -181,6 +182,9 @@ void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc)
         EXIT;
 }
 
+/* FIXME prep_req should now return an error code rather than NULL, but
+ * it is called everywhere :(
+ */
 struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, __u32 version,
                                        int opcode, int count, int *lengths,
                                        char **bufs)
@@ -197,11 +201,25 @@ struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, __u32 version,
                 RETURN(NULL);
         }
 
+        request->rq_import = class_import_get(imp);
+
+        rc = ptlrpcs_req_get_cred(request);
+        if (rc) {
+                CDEBUG(D_SEC, "failed to get credential\n");
+                GOTO(out_free, rc);
+        }
+
+        /* try to refresh, but we proceed even if it fails */
+        rc = ptlrpcs_cred_refresh(request->rq_cred);
+        if (!ptlrpcs_cred_is_uptodate(request->rq_cred)) {
+                CERROR("req %p: failed to refresh cred %p, rc %d, continue\n",
+                       request, request->rq_cred, rc);
+        }
+
         rc = lustre_pack_request(request, count, lengths, bufs);
         if (rc) {
                 CERROR("cannot pack request %d\n", rc);
-                OBD_FREE(request, sizeof(*request));
-                RETURN(NULL);
+                GOTO(out_cred, rc);
         }
         request->rq_reqmsg->version |= version;
 
@@ -212,7 +230,6 @@ struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, __u32 version,
 
         request->rq_send_state = LUSTRE_IMP_FULL;
         request->rq_type = PTL_RPC_MSG_REQUEST;
-        request->rq_import = class_import_get(imp);
 
         request->rq_req_cbid.cbid_fn  = request_out_callback;
         request->rq_req_cbid.cbid_arg = request;
@@ -237,6 +254,12 @@ struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, __u32 version,
         request->rq_reqmsg->opc = opcode;
         request->rq_reqmsg->flags = 0;
         RETURN(request);
+out_cred:
+        ptlrpcs_req_drop_cred(request);
+out_free:
+        class_import_put(imp);
+        OBD_FREE(request, sizeof(*request));
+        RETURN(NULL);
 }
 
 struct ptlrpc_request_set *ptlrpc_prep_set(void)
@@ -469,8 +492,22 @@ static int after_reply(struct ptlrpc_request *req)
         /* Clear reply swab mask; this is a new reply in sender's byte order */
         req->rq_rep_swab_mask = 0;
 #endif
-        LASSERT (req->rq_nob_received <= req->rq_replen);
-        rc = lustre_unpack_msg(req->rq_repmsg, req->rq_nob_received);
+        LASSERT (req->rq_nob_received <= req->rq_repbuf_len);
+        rc = ptlrpcs_cli_unwrap_reply(req);
+        if (rc) {
+                CERROR("verify reply error: %d\n", rc);
+                RETURN(rc);
+        }
+        /* unwrap_reply may request that the rpc be resent */
+        if (req->rq_ptlrpcs_restart) {
+                req->rq_resend = 1;
+                RETURN(0);
+        }
+
+        /* unwrap_reply sets rq_replen to the actual received
+         * lustre_msg length
+         */
+        rc = lustre_unpack_msg(req->rq_repmsg, req->rq_replen);
         if (rc) {
                 CERROR("unpack_rep failed: %d\n", rc);
                 RETURN(-EPROTO);
@@ -696,8 +733,10 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set)
 
                                 req->rq_waiting = 0;
                                 if (req->rq_resend) {
-                                        lustre_msg_add_flags(req->rq_reqmsg,
-                                                             MSG_RESENT);
+                                        if (!req->rq_ptlrpcs_restart)
+                                                lustre_msg_add_flags(
+                                                        req->rq_reqmsg,
+                                                        MSG_RESENT);
                                         if (req->rq_bulk) {
                                                 __u64 old_xid = req->rq_xid;
 
@@ -1022,6 +1061,7 @@ static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked)
         LASSERTF(request->rq_rqbd == NULL, "req %p\n",request);/* client-side */
         LASSERTF(list_empty(&request->rq_list), "req %p\n", request);
         LASSERTF(list_empty(&request->rq_set_chain), "req %p\n", request);
+        LASSERT(request->rq_cred);
 
         /* We must take it off the imp_replay_list first.  Otherwise, we'll set
          * request->rq_reqmsg to NULL while osc_close is dereferencing it. */
@@ -1042,14 +1082,11 @@ static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked)
                 LBUG();
         }
 
-        if (request->rq_repmsg != NULL) {
-                OBD_FREE(request->rq_repmsg, request->rq_replen);
-                request->rq_repmsg = NULL;
-        }
-        if (request->rq_reqmsg != NULL) {
-                OBD_FREE(request->rq_reqmsg, request->rq_reqlen);
-                request->rq_reqmsg = NULL;
-        }
+        if (request->rq_repbuf != NULL)
+                ptlrpcs_cli_free_repbuf(request);
+        if (request->rq_reqbuf != NULL)
+                ptlrpcs_cli_free_reqbuf(request);
+
         if (request->rq_export != NULL) {
                 class_export_put(request->rq_export);
                 request->rq_export = NULL;
@@ -1061,6 +1098,7 @@ static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked)
         if (request->rq_bulk != NULL)
                 ptlrpc_free_bulk(request->rq_bulk);
 
+        ptlrpcs_req_drop_cred(request);
         OBD_FREE(request, sizeof(*request));
         EXIT;
 }
@@ -1399,7 +1437,8 @@ restart:
         }
 
         if (req->rq_resend) {
-                lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT);
+                if (!req->rq_ptlrpcs_restart)
+                        lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT);
 
                 if (req->rq_bulk != NULL)
                         ptlrpc_unregister_bulk (req);
@@ -1537,8 +1576,8 @@ static int ptlrpc_replay_interpret(struct ptlrpc_request *req,
         /* Clear reply swab mask; this is a new reply in sender's byte order */
         req->rq_rep_swab_mask = 0;
 #endif
-        LASSERT (req->rq_nob_received <= req->rq_replen);
-        rc = lustre_unpack_msg(req->rq_repmsg, req->rq_nob_received);
+        LASSERT (req->rq_nob_received <= req->rq_repbuf_len);
+        rc = lustre_unpack_msg(req->rq_repmsg, req->rq_replen);
         if (rc) {
                 CERROR("unpack_rep failed: %d\n", rc);
                 GOTO(out, rc = -EPROTO);
@@ -1657,6 +1696,18 @@ void ptlrpc_abort_inflight(struct obd_import *imp)
                 spin_unlock (&req->rq_lock);
         }
 
+        list_for_each_safe(tmp, n, &imp->imp_rawrpc_list) {
+                struct ptlrpc_request *req =
+                        list_entry(tmp, struct ptlrpc_request, rq_list);
+
+                DEBUG_REQ(D_HA, req, "aborting raw rpc");
+
+                spin_lock(&req->rq_lock);
+                req->rq_err = 1;
+                ptlrpc_wake_client_req(req);
+                spin_unlock(&req->rq_lock);
+        }
+
         /* Last chance to free reqs left on the replay list, but we
          * will still leak reqs that haven't comitted.  */
         if (imp->imp_replayable)
index 270351d..37a7f94 100644 (file)
@@ -88,9 +88,9 @@ void reply_in_callback(ptl_event_t *ev)
         LASSERT (ev->type == PTL_EVENT_PUT_END ||
                  ev->type == PTL_EVENT_UNLINK);
         LASSERT (ev->unlinked);
-        LASSERT (ev->md.start == req->rq_repmsg);
+        LASSERT (ev->md.start == req->rq_repbuf);
         LASSERT (ev->offset == 0);
-        LASSERT (ev->mlength <= req->rq_replen);
+        LASSERT (ev->mlength <= req->rq_repbuf_len);
         
         DEBUG_REQ((ev->ni_fail_type == PTL_NI_OK) ? D_NET : D_ERROR, req,
                   "type %d, status %d", ev->type, ev->ni_fail_type);
@@ -207,10 +207,10 @@ void request_in_callback(ptl_event_t *ev)
          * flags are reset and scalars are zero.  We only set the message
          * size to non-zero if this was a successful receive. */
         req->rq_xid = ev->match_bits;
-        req->rq_reqmsg = ev->md.start + ev->offset;
+        req->rq_reqbuf = ev->md.start + ev->offset;
         if (ev->type == PTL_EVENT_PUT_END &&
             ev->ni_fail_type == PTL_NI_OK)
-                req->rq_reqlen = ev->mlength;
+                req->rq_reqbuf_len = ev->mlength;
         do_gettimeofday(&req->rq_arrival_time);
         req->rq_peer.peer_id = ev->initiator;
         req->rq_peer.peer_ni = rqbd->rqbd_srv_ni->sni_ni;
index d2ccb41..122f878 100644 (file)
@@ -36,6 +36,7 @@
 #include <linux/lustre_export.h>
 #include <linux/obd.h>
 #include <linux/obd_class.h>
+#include <linux/lustre_sec.h>
 
 #include "ptlrpc_internal.h"
 
@@ -273,10 +274,15 @@ static int import_select_connection(struct obd_import *imp)
                 list_add_tail(&tmp->oic_item, &imp->imp_conn_list);
         }
 
-        /* switch connection, don't mind if it's same as the current one */
-        if (imp->imp_connection)
-                ptlrpc_put_connection(imp->imp_connection);
-        imp->imp_connection = ptlrpc_connection_addref(imp_conn->oic_conn);
+        /* switch connection if we chose a new one */
+        if (imp->imp_connection != imp_conn->oic_conn) {
+                if (imp->imp_connection) {
+                        ptlrpcs_sec_invalidate_cache(imp->imp_sec);
+                        ptlrpc_put_connection(imp->imp_connection);
+                }
+                imp->imp_connection =
+                        ptlrpc_connection_addref(imp_conn->oic_conn);
+        }
 
         dlmexp =  class_conn2export(&imp->imp_dlm_handle);
         LASSERT(dlmexp != NULL);
@@ -304,13 +310,15 @@ int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid)
         __u64 committed_before_reconnect = 0;
         struct ptlrpc_request *request;
         int size[] = {sizeof(imp->imp_target_uuid),
-                                 sizeof(obd->obd_uuid),
-                                 sizeof(imp->imp_dlm_handle),
-                                 sizeof(unsigned long)};
+                      sizeof(obd->obd_uuid),
+                      sizeof(imp->imp_dlm_handle),
+                      sizeof(unsigned long),
+                      sizeof(__u32) * 2};
         char *tmp[] = {imp->imp_target_uuid.uuid,
                        obd->obd_uuid.uuid,
                        (char *)&imp->imp_dlm_handle,
-                       (char *)&imp->imp_connect_flags}; /* XXX: make this portable! */
+                       (char *)&imp->imp_connect_flags, /* XXX: make this portable! */
+                       (char*) &obd->u.cli.cl_nllu};
         struct ptlrpc_connect_async_args *aa;
         unsigned long flags;
 
@@ -356,8 +364,10 @@ int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid)
         if (rc)
                 GOTO(out, rc);
 
+        LASSERT(imp->imp_sec);
+
         request = ptlrpc_prep_req(imp, LUSTRE_OBD_VERSION,
-                                  imp->imp_connect_op, 4, size, tmp);
+                                  imp->imp_connect_op, 5, size, tmp);
         if (!request)
                 GOTO(out, rc = -ENOMEM);
 
index 7054f99..370fe17 100644 (file)
@@ -78,6 +78,9 @@ struct ll_rpc_opcode {
         { PTLBD_DISCONNECT, "ptlbd_disconnect" },
         { OBD_PING,         "obd_ping" },
         { OBD_LOG_CANCEL,   "llog_origin_handle_cancel"},
+        { SEC_INIT,         "sec_init"},
+        { SEC_INIT_CONTINUE,"sec_init_continue"},
+        { SEC_FINI,         "sec_fini"},
 };
 
 const char* ll_opcode2str(__u32 opcode)
index 6de6be6..d03f2ed 100644 (file)
 #ifndef __KERNEL__
 #include <liblustre.h>
 #endif
+#include <linux/obd_class.h>
 #include <linux/obd_support.h>
 #include <linux/lustre_net.h>
 #include <linux/lustre_lib.h>
 #include <linux/obd.h>
+#include <linux/lustre_sec.h>
 #include "ptlrpc_internal.h"
 
 static int ptl_send_buf (ptl_handle_md_t *mdh, void *base, int len, 
@@ -311,14 +313,15 @@ int ptlrpc_send_reply (struct ptlrpc_request *req, int may_be_difficult)
         int                        rc;
 
         /* We must already have a reply buffer (only ptlrpc_error() may be
-         * called without one).  We must also have a request buffer which
+         * called without one).  We usually also have a request buffer which
          * is either the actual (swabbed) incoming request, or a saved copy
-         * if this is a req saved in target_queue_final_reply(). */
-        LASSERT (req->rq_reqmsg != NULL);
+         * if this is a req saved in target_queue_final_reply().  But this
+         * may not hold, since some security handling may skip setting the
+         * reqmsg and prepare the reply below the normal ptlrpc layer */
         LASSERT (rs != NULL);
         LASSERT (req->rq_repmsg != NULL);
         LASSERT (may_be_difficult || !rs->rs_difficult);
-        LASSERT (req->rq_repmsg == &rs->rs_msg);
+        LASSERT (req->rq_repmsg == rs->rs_msg);
         LASSERT (rs->rs_cb_id.cbid_fn == reply_out_callback);
         LASSERT (rs->rs_cb_id.cbid_arg == rs);
 
@@ -328,7 +331,7 @@ int ptlrpc_send_reply (struct ptlrpc_request *req, int may_be_difficult)
 
         req->rq_repmsg->type   = req->rq_type;
         req->rq_repmsg->status = req->rq_status;
-        req->rq_repmsg->opc    = req->rq_reqmsg->opc;
+        req->rq_repmsg->opc    = req->rq_reqmsg ? req->rq_reqmsg->opc : 0;
 
         if (req->rq_export == NULL) 
                 conn = ptlrpc_get_connection(&req->rq_peer, NULL);
@@ -337,10 +340,17 @@ int ptlrpc_send_reply (struct ptlrpc_request *req, int may_be_difficult)
 
         atomic_inc (&svc->srv_outstanding_replies);
 
-        rc = ptl_send_buf (&rs->rs_md_h, req->rq_repmsg, req->rq_replen,
+        rc = svcsec_authorize(req);
+        if (rc) {
+                CERROR("Error wrapping reply message "LPX64"\n", req->rq_xid);
+                goto out;
+        }
+
+        rc = ptl_send_buf (&rs->rs_md_h, rs->rs_repbuf, rs->rs_repdata_len,
                            rs->rs_difficult ? PTL_ACK_REQ : PTL_NOACK_REQ,
                            &rs->rs_cb_id, conn,
                            svc->srv_rep_portal, req->rq_xid);
+out:
         if (rc != 0) {
                 atomic_dec (&svc->srv_outstanding_replies);
 
@@ -405,12 +415,21 @@ int ptl_send_rpc(struct ptlrpc_request *request)
         request->rq_reqmsg->handle = request->rq_import->imp_remote_handle;
         request->rq_reqmsg->type = PTL_RPC_MSG_REQUEST;
         request->rq_reqmsg->conn_cnt = request->rq_import->imp_conn_cnt;
-                
+
+        /* wrap_request might need to refresh the gss cred; if this is
+         * called from ptlrpcd then the whole daemon thread will block
+         * waiting on the gss negotiation rpc. FIXME
+         */
+        rc = ptlrpcs_cli_wrap_request(request);
+        if (rc)
+                GOTO(cleanup_bulk, rc);
+
         LASSERT (request->rq_replen != 0);
-        if (request->rq_repmsg == NULL)
-                OBD_ALLOC(request->rq_repmsg, request->rq_replen);
-        if (request->rq_repmsg == NULL)
-                GOTO(cleanup_bulk, rc = -ENOMEM);
+        if (request->rq_repbuf == NULL) {
+                rc = ptlrpcs_cli_alloc_repbuf(request, request->rq_replen);
+                if (rc)
+                        GOTO(cleanup_bulk, rc);
+        }
 
         rc = PtlMEAttach(connection->c_peer.peer_ni->pni_ni_h,
                          request->rq_reply_portal, /* XXX FIXME bug 249 */
@@ -431,11 +450,12 @@ int ptl_send_rpc(struct ptlrpc_request *request)
         request->rq_timedout = 0;
         request->rq_net_err = 0;
         request->rq_resend = 0;
+        request->rq_ptlrpcs_restart = 0;
         request->rq_restart = 0;
         spin_unlock_irqrestore (&request->rq_lock, flags);
 
-        reply_md.start     = request->rq_repmsg;
-        reply_md.length    = request->rq_replen;
+        reply_md.start     = request->rq_repbuf;
+        reply_md.length    = request->rq_repbuf_len;
         reply_md.threshold = 1;
         reply_md.options   = PTLRPC_MD_OPTIONS | PTL_MD_OP_PUT;
         reply_md.user_ptr  = &request->rq_reply_cbid;
@@ -460,7 +480,7 @@ int ptl_send_rpc(struct ptlrpc_request *request)
         request->rq_sent = LTIME_S(CURRENT_TIME);
         ptlrpc_pinger_sending_on_import(request->rq_import);
         rc = ptl_send_buf(&request->rq_req_md_h, 
-                          request->rq_reqmsg, request->rq_reqlen,
+                          request->rq_reqbuf, request->rq_reqdata_len,
                           PTL_NOACK_REQ, &request->rq_req_cbid, 
                           connection,
                           request->rq_request_portal,
@@ -482,8 +502,7 @@ int ptl_send_rpc(struct ptlrpc_request *request)
         LASSERT (!request->rq_receiving_reply);
 
  cleanup_repmsg:
-        OBD_FREE(request->rq_repmsg, request->rq_replen);
-        request->rq_repmsg = NULL;
+        ptlrpcs_cli_free_repbuf(request);
 
  cleanup_bulk:
         if (request->rq_bulk != NULL)
@@ -537,3 +556,163 @@ int ptlrpc_register_rqbd (struct ptlrpc_request_buffer_desc *rqbd)
         
         return (-ENOMEM);
 }
+
+static int rawrpc_timedout(void *data)
+{
+        struct ptlrpc_request *req = (struct ptlrpc_request *) data;
+        unsigned long flags;
+
+        spin_lock_irqsave(&req->rq_lock, flags);
+        if (!req->rq_replied)
+                req->rq_timedout = 1;
+        spin_unlock_irqrestore(&req->rq_lock, flags);
+
+        return 1;
+}
+
+/* to make things as simple as possible */
+static int rawrpc_check_reply(struct ptlrpc_request *req)
+{
+        unsigned long flags;
+        int rc;
+
+        spin_lock_irqsave (&req->rq_lock, flags);
+        rc = req->rq_replied || req->rq_net_err || req->rq_err ||
+             req->rq_resend || req->rq_restart;
+        spin_unlock_irqrestore(&req->rq_lock, flags);
+        return rc;
+}
+
+/*
+ * Construct a fake ptlrpc_request to do the work, in order to
+ * use the existing callback/wakeup facilities
+ */
+int ptlrpc_do_rawrpc(struct obd_import *imp,
+                     char *reqbuf, int reqlen,
+                     char *repbuf, int *replenp,
+                     int timeout)
+{
+        struct ptlrpc_connection *conn;
+        struct ptlrpc_request request; /* just a fake one */
+        ptl_handle_me_t reply_me_h;
+        ptl_md_t reply_md, req_md;
+        struct l_wait_info lwi;
+        unsigned long irq_flags;
+        int rc;
+        ENTRY;
+
+        LASSERT(imp);
+        class_import_get(imp);
+        if (imp->imp_state == LUSTRE_IMP_CLOSED) {
+                CWARN("raw rpc on closed imp(=>%s)? send anyway\n",
+                       imp->imp_target_uuid.uuid);
+        }
+
+        conn = imp->imp_connection;
+
+        /* initialize request */
+        memset(&request, 0, sizeof(request));
+        request.rq_req_cbid.cbid_fn = request_out_callback;
+        request.rq_req_cbid.cbid_arg = &request;
+        request.rq_reply_cbid.cbid_fn  = reply_in_callback;
+        request.rq_reply_cbid.cbid_arg = &request;
+        request.rq_reqbuf = reqbuf;
+        request.rq_reqbuf_len = reqlen;
+        request.rq_repbuf = repbuf;
+        request.rq_repbuf_len = *replenp;
+        request.rq_set = NULL;
+        spin_lock_init(&request.rq_lock);
+        init_waitqueue_head(&request.rq_reply_waitq);
+        atomic_set(&request.rq_refcount, 1000000); /* never dropped */
+        request.rq_xid = ptlrpc_next_xid();
+
+        /* add into sending list */
+        spin_lock_irqsave(&imp->imp_lock, irq_flags);
+        list_add_tail(&request.rq_list, &imp->imp_rawrpc_list);
+        spin_unlock_irqrestore(&imp->imp_lock, irq_flags);
+
+        /* prepare reply buffer */
+        rc = PtlMEAttach(conn->c_peer.peer_ni->pni_ni_h,
+                         imp->imp_client->cli_reply_portal,
+                         conn->c_peer.peer_id, request.rq_xid, 0, PTL_UNLINK,
+                         PTL_INS_AFTER, &reply_me_h);
+        if (rc != PTL_OK) {
+                CERROR("PtlMEAttach failed: %d\n", rc);
+                LASSERT (rc == PTL_NO_SPACE);
+                GOTO(cleanup_imp, rc = -ENOMEM);
+        }
+
+        spin_lock_irqsave(&request.rq_lock, irq_flags);
+        request.rq_receiving_reply = 1;
+        spin_unlock_irqrestore(&request.rq_lock, irq_flags);
+
+        reply_md.start          = repbuf;
+        reply_md.length         = *replenp;
+        reply_md.threshold      = 1;
+        reply_md.options        = PTLRPC_MD_OPTIONS | PTL_MD_OP_PUT;
+        reply_md.user_ptr       = &request.rq_reply_cbid;
+        reply_md.eq_handle      = conn->c_peer.peer_ni->pni_eq_h;
+
+        rc = PtlMDAttach(reply_me_h, reply_md, PTL_UNLINK,
+                         &request.rq_reply_md_h);
+        if (rc != PTL_OK) {
+                CERROR("PtlMDAttach failed: %d\n", rc);
+                LASSERT (rc == PTL_NO_SPACE);
+                GOTO(cleanup_me, rc = -ENOMEM);
+        }
+
+        /* prepare request buffer */
+        req_md.start            = reqbuf;
+        req_md.length           = reqlen;
+        req_md.threshold        = 1;
+        req_md.options          = PTLRPC_MD_OPTIONS;
+        req_md.user_ptr         = &request.rq_req_cbid;
+        req_md.eq_handle        = conn->c_peer.peer_ni->pni_eq_h;
+
+        rc = PtlMDBind(conn->c_peer.peer_ni->pni_ni_h,
+                       req_md, PTL_UNLINK, &request.rq_req_md_h);
+        if (rc != PTL_OK) {
+                CERROR("PtlMDBind failed %d\n", rc);
+                LASSERT (rc == PTL_NO_SPACE);
+                GOTO(cleanup_me, rc = -ENOMEM);
+        }
+
+        rc = PtlPut(request.rq_req_md_h, PTL_NOACK_REQ, conn->c_peer.peer_id,
+                    imp->imp_client->cli_request_portal,
+                    0, request.rq_xid, 0, 0);
+        if (rc != PTL_OK) {
+                CERROR("PtlPut failed %d\n", rc);
+                GOTO(cleanup_md, rc);
+        }
+
+        lwi = LWI_TIMEOUT(timeout * HZ, rawrpc_timedout, &request);
+        l_wait_event(request.rq_reply_waitq,
+                     rawrpc_check_reply(&request), &lwi);
+
+        ptlrpc_unregister_reply(&request);
+
+        if (request.rq_err || request.rq_resend || request.rq_intr ||
+            request.rq_timedout || !request.rq_replied) {
+                CERROR("secinit rpc error: err %d, resend %d, "
+                       "intr %d, timeout %d, replied %d\n",
+                        request.rq_err, request.rq_resend, request.rq_intr,
+                        request.rq_timedout, request.rq_replied);
+                rc = -EINVAL;
+        } else {
+                *replenp = request.rq_nob_received;
+                rc = 0;
+        }
+        GOTO(cleanup_imp, rc);
+
+cleanup_md:
+        PtlMDUnlink(request.rq_req_md_h);
+cleanup_me:
+        PtlMEUnlink(reply_me_h);
+cleanup_imp:
+        spin_lock_irqsave(&imp->imp_lock, irq_flags);
+        list_del_init(&request.rq_list);
+        spin_unlock_irqrestore(&imp->imp_lock, irq_flags);
+
+        class_import_put(imp);
+        RETURN(rc);
+}
index 920ea49..db9a38d 100644 (file)
@@ -33,6 +33,7 @@
 #include <linux/obd_support.h>
 #include <linux/obd_class.h>
 #include <linux/lustre_net.h>
+#include <linux/lustre_sec.h>
 #include <linux/fcntl.h>
 
 
@@ -76,14 +77,15 @@ void lustre_init_msg (struct lustre_msg *msg, int count, int *lens, char **bufs)
 int lustre_pack_request (struct ptlrpc_request *req,
                          int count, int *lens, char **bufs)
 {
+        int rc;
         ENTRY;
 
-        req->rq_reqlen = lustre_msg_size (count, lens);
-        OBD_ALLOC(req->rq_reqmsg, req->rq_reqlen);
-        if (req->rq_reqmsg == NULL)
-                RETURN(-ENOMEM);
+        req->rq_reqlen = lustre_msg_size(count, lens);
+        rc = ptlrpcs_cli_alloc_reqbuf(req, req->rq_reqlen);
+        if (rc)
+                RETURN(rc);
 
-        lustre_init_msg (req->rq_reqmsg, count, lens, bufs);
+        lustre_init_msg(req->rq_reqmsg, count, lens, bufs);
         RETURN (0);
 }
 
@@ -117,29 +119,29 @@ int lustre_pack_reply (struct ptlrpc_request *req,
                        int count, int *lens, char **bufs)
 {
         struct ptlrpc_reply_state *rs;
-        int                        msg_len;
-        int                        size;
+        int                        rc;
         ENTRY;
 
-        LASSERT (req->rq_reply_state == NULL);
-
-        msg_len = lustre_msg_size (count, lens);
-        size = offsetof (struct ptlrpc_reply_state, rs_msg) + msg_len;
-        OBD_ALLOC (rs, size);
-        if (rs == NULL)
-                RETURN (-ENOMEM);
-
+        LASSERT(req->rq_reply_state == NULL);
+        LASSERT(req->rq_svcsec);
+        LASSERT(req->rq_repmsg == NULL);
+
+        req->rq_replen = lustre_msg_size(count, lens);
+        rc = svcsec_alloc_repbuf(req->rq_svcsec, req, req->rq_replen);
+        if (rc)
+                RETURN(rc);
+        LASSERT(req->rq_reply_state);
+        LASSERT(req->rq_repmsg == req->rq_reply_state->rs_msg);
+
+        rs = req->rq_reply_state;
+        rs->rs_svcsec = svcsec_get(req->rq_svcsec);
         rs->rs_cb_id.cbid_fn = reply_out_callback;
         rs->rs_cb_id.cbid_arg = rs;
         rs->rs_srv_ni = req->rq_rqbd->rqbd_srv_ni;
-        rs->rs_size = size;
         INIT_LIST_HEAD(&rs->rs_exp_list);
         INIT_LIST_HEAD(&rs->rs_obd_list);
 
-        req->rq_replen = msg_len;
-        req->rq_reply_state = rs;
-        req->rq_repmsg = &rs->rs_msg;
-        lustre_init_msg (&rs->rs_msg, count, lens, bufs);
+        lustre_init_msg(rs->rs_msg, count, lens, bufs);
 
         PTLRPC_RS_DEBUG_LRU_ADD(rs);
 
@@ -148,6 +150,8 @@ int lustre_pack_reply (struct ptlrpc_request *req,
 
 void lustre_free_reply_state (struct ptlrpc_reply_state *rs)
 {
+        struct ptlrpc_svcsec *svcsec = rs->rs_svcsec;
+
         PTLRPC_RS_DEBUG_LRU_DEL(rs);
 
         LASSERT (!rs->rs_difficult || rs->rs_handled);
@@ -157,8 +161,14 @@ void lustre_free_reply_state (struct ptlrpc_reply_state *rs)
         LASSERT (rs->rs_nlocks == 0);
         LASSERT (list_empty(&rs->rs_exp_list));
         LASSERT (list_empty(&rs->rs_obd_list));
+        LASSERT (svcsec);
+
+        if (svcsec->free_repbuf)
+                svcsec->free_repbuf(svcsec, rs);
+        else
+                svcsec_free_reply_state(rs);
 
-        OBD_FREE (rs, rs->rs_size);
+        svcsec_put(svcsec);
 }
 
 /* This returns the size of the buffer that is required to hold a lustre_msg
@@ -618,6 +628,11 @@ struct mds_req_sec_desc *lustre_swab_mds_secdesc(struct ptlrpc_request *req,
                 __swab32s(&rsd->rsd_ngroups);
         }
 
+        if (rsd->rsd_ngroups > LUSTRE_MAX_GROUPS) {
+                CERROR("%u groups is not allowed\n", rsd->rsd_ngroups);
+                return NULL;
+        }
+
         if (m->buflens[offset] !=
             sizeof(*rsd) + rsd->rsd_ngroups * sizeof(__u32)) {
                 CERROR("bufflen %u while contains %u groups\n",
index e49b5f9..12a3c20 100644 (file)
@@ -85,6 +85,14 @@ static inline int opcode_offset(__u32 opc) {
                         (LDLM_LAST_OPC - LDLM_FIRST_OPC) +
                         (MDS_LAST_OPC - MDS_FIRST_OPC) +
                         (OST_LAST_OPC - OST_FIRST_OPC));
+        } else if (opc < SEC_LAST_OPC) {
+                /* Security negotiate */
+                return (opc - SEC_FIRST_OPC +
+                        (PTLBD_LAST_OPC - PTLBD_FIRST_OPC) +
+                        (LDLM_LAST_OPC - LDLM_FIRST_OPC) +
+                        (MDS_LAST_OPC - MDS_FIRST_OPC) +
+                        (OST_LAST_OPC - OST_FIRST_OPC) +
+                        (OBD_LAST_OPC - OBD_FIRST_OPC));
         } else {
                 /* Unknown Opcode */
                 return -1;
@@ -95,7 +103,8 @@ static inline int opcode_offset(__u32 opc) {
                             (LDLM_LAST_OPC - LDLM_FIRST_OPC)   + \
                             (MDS_LAST_OPC - MDS_FIRST_OPC)     + \
                             (OST_LAST_OPC - OST_FIRST_OPC)     + \
-                            (OBD_LAST_OPC - OBD_FIRST_OPC))
+                            (OBD_LAST_OPC - OBD_FIRST_OPC)     + \
+                            (SEC_LAST_OPC - SEC_FIRST_OPC))
 
 enum {
         PTLRPC_REQWAIT_CNTR = 0,
index 5cbdf4f..c42c47c 100644 (file)
@@ -92,6 +92,7 @@ EXPORT_SYMBOL(ptlrpc_reply);
 EXPORT_SYMBOL(ptlrpc_error);
 EXPORT_SYMBOL(ptlrpc_resend_req);
 EXPORT_SYMBOL(ptl_send_rpc);
+EXPORT_SYMBOL(ptlrpc_do_rawrpc);
 
 /* client.c */
 EXPORT_SYMBOL(ptlrpc_init_client);
index fa924fc..30217ab 100644 (file)
@@ -28,6 +28,7 @@
 #include <linux/obd_support.h>
 #include <linux/obd_class.h>
 #include <linux/lustre_net.h>
+#include <linux/lustre_sec.h>
 #include <linux/lustre_log.h>
 #include <portals/types.h>
 #include "ptlrpc_internal.h"
@@ -42,6 +43,12 @@ static spinlock_t ptlrpc_all_services_lock = SPIN_LOCK_UNLOCKED;
 static void
 ptlrpc_free_server_req (struct ptlrpc_request *req)
 {
+        if (req->rq_svcsec) {
+                svcsec_cleanup_req(req);
+                svcsec_put(req->rq_svcsec);
+                req->rq_svcsec = NULL;
+        }
+
         /* The last request to be received into a request buffer uses space
          * in the request buffer descriptor, otherwise requests are
          * allocated dynamically in the incoming reply event handler */
@@ -408,7 +415,8 @@ ptlrpc_server_handle_request (struct ptlrpc_service *svc)
         struct timeval         work_start;
         struct timeval         work_end;
         long                   timediff;
-        int                    rc;
+        enum ptlrpcs_error     sec_err;
+        int                    secrc, rc;
         ENTRY;
 
         spin_lock_irqsave (&svc->srv_lock, flags);
@@ -445,12 +453,32 @@ ptlrpc_server_handle_request (struct ptlrpc_service *svc)
         /* Clear request swab mask; this is a new request */
         request->rq_req_swab_mask = 0;
 #endif
-        rc = lustre_unpack_msg (request->rq_reqmsg, request->rq_reqlen);
+
+        /* go through security check/transform */
+        request->rq_auth_uid = -1;
+        secrc = svcsec_accept(request, &sec_err);
+        switch(secrc) {
+        case SVC_OK:
+                CDEBUG(D_SEC, "request accepted ok\n");
+                break;
+        case SVC_COMPLETE:
+                target_send_reply(request, 0, OBD_FAIL_MDS_ALL_REPLY_NET);
+                goto put_conn;
+        case SVC_DROP:
+                goto out;
+        case SVC_LOGIN:
+        case SVC_LOGOUT:
+                break;
+        default:
+                LBUG();
+        }
+
+        rc = lustre_unpack_msg(request->rq_reqmsg, request->rq_reqlen);
         if (rc != 0) {
                 CERROR ("error unpacking request: ptl %d from %s"
                         " xid "LPU64"\n", svc->srv_req_portal,
                         ptlrpc_peernid2str(&request->rq_peer, str),
-                       request->rq_xid);
+                        request->rq_xid);
                 goto out;
         }
 
@@ -530,11 +558,12 @@ put_conn:
 
         CDEBUG((timediff / 1000000 > (long)obd_timeout) ? D_ERROR : D_HA,
                "request "LPU64" opc %u from NID %s processed in %ldus "
-               "(%ldus total)\n", request->rq_xid, request->rq_reqmsg->opc,
+               "(%ldus total)\n", request->rq_xid,
+               request->rq_reqmsg ? request->rq_reqmsg->opc : 0,
                ptlrpc_peernid2str(&request->rq_peer, str),
                timediff, timeval_sub(&work_end, &request->rq_arrival_time));
 
-        if (svc->srv_stats != NULL) {
+        if (svc->srv_stats != NULL && request->rq_reqmsg != NULL) {
                 int opc = opcode_offset(request->rq_reqmsg->opc);
                 if (opc > 0) {
                         LASSERT(opc < LUSTRE_MAX_OPCODES);
@@ -612,7 +641,7 @@ ptlrpc_server_handle_reply (struct ptlrpc_service *svc)
                       " o%d NID %s\n",
                       rs, 
                       rs->rs_xid, rs->rs_transno,
-                      rs->rs_msg.opc, 
+                      rs->rs_msg->opc, 
                       ptlrpc_peernid2str(&exp->exp_connection->c_peer, str));
 #endif
         }
diff --git a/lustre/sec/.cvsignore b/lustre/sec/.cvsignore
new file mode 100644 (file)
index 0000000..d5103fa
--- /dev/null
@@ -0,0 +1,15 @@
+.Xrefs
+config.log
+config.status
+configure
+Makefile
+.deps
+TAGS
+.*.cmd
+autoMakefile.in
+autoMakefile
+*.ko
+*.mod.c
+.*.o.flags
+.tmp_versions
+.depend
diff --git a/lustre/sec/Makefile.in b/lustre/sec/Makefile.in
new file mode 100644 (file)
index 0000000..224d66b
--- /dev/null
@@ -0,0 +1,6 @@
+MODULES := ptlrpcs
+ptlrpcs-objs := sec.o sec_null.o svcsec.o svcsec_null.o upcall_cache.o
+
+@GSS_TRUE@subdir-m += gss
+
+@INCLUDE_RULES@
diff --git a/lustre/sec/Makefile.mk b/lustre/sec/Makefile.mk
new file mode 100644 (file)
index 0000000..7dcc93c
--- /dev/null
@@ -0,0 +1,10 @@
+# Copyright (C) 2004  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include $(src)/../portals/Kernelenv
+
+obj-y += ptlrpcs.o
+ptlrpcs-objs := sec.o sec_null.o svcsec.o svcsec_null.o upcall_cache.o
+
diff --git a/lustre/sec/autoMakefile.am b/lustre/sec/autoMakefile.am
new file mode 100644 (file)
index 0000000..7422341
--- /dev/null
@@ -0,0 +1,22 @@
+# Copyright (C) 2004  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+if GSS
+SUBDIRS = . gss #kcrypto 
+endif
+
+if LIBLUSTRE
+noinst_LIBRARIES = libptlrpcs.a
+libptlrpcs_a_SOURCES = sec.c sec_null.c svcsec.c svcsec_null.c
+libptlrpcs_a_CPPFLAGS = $(LLCPPFLAGS)
+libptlrpcs_a_CFLAGS = $(LLCFLAGS)
+endif
+
+if MODULES
+modulefs_DATA = ptlrpcs$(KMODEXT)
+endif
+
+DIST_SOURCES = $(ptlrpcs-objs:.o=.c)
+MOSTLYCLEANFILES = *.o *.ko *.mod.c
diff --git a/lustre/sec/doc/oss_gss_HLD.lyx b/lustre/sec/doc/oss_gss_HLD.lyx
new file mode 100644 (file)
index 0000000..515eb2b
--- /dev/null
@@ -0,0 +1,258 @@
+#LyX 1.3 created this file. For more info see http://www.lyx.org/
+\lyxformat 221
+\textclass article
+\language english
+\inputencoding auto
+\fontscheme times
+\graphics default
+\paperfontsize 12
+\spacing single 
+\papersize Default
+\paperpackage a4
+\use_geometry 0
+\use_amsmath 0
+\use_natbib 0
+\use_numerical_citations 0
+\paperorientation portrait
+\secnumdepth 3
+\tocdepth 3
+\paragraph_separation skip
+\defskip medskip
+\quotes_language english
+\quotes_times 2
+\papercolumns 1
+\papersides 1
+\paperpagestyle default
+
+\layout Title
+
+High Level Design of Client-OSS Connection
+\layout Author
+
+Peter Braam, Eric Mei
+\layout Date
+
+Feb 13, 2005
+\layout Section
+
+Requirements
+\layout Itemize
+
+Establish gss connections between clients and OSS.
+\layout Itemize
+
+Establish gss connections between servers.
+\layout Section
+
+Functional Specification
+\layout Standard
+
+In a Lustre system, there are several kinds of connections, and security
+ options can be chosen separately for each:
+\layout Itemize
+
+between client and MDS's
+\layout Itemize
+
+between client and OSS's
+\layout Itemize
+
+between MDS's
+\layout Itemize
+
+between MDS's and OSS's
+\layout Standard
+
+Currently we are able to establish secure connections between the client
+ and MDS's simply by adding a mount parameter 'sec=sec_flavor', where sec_flavor
+ can be 'krb5i' or 'krb5p' at the moment.
+ Now secure connections between the client and OSS's must also become an
+ option, to prepare for the coming object security features.
+ So the original mount option 'sec' will be broken into 2 options: 'mds_sec'
+ and 'oss_sec'.
+\layout Itemize
+
+mount.lustre should be able to recognize options 'mds_sec=sec_flavor' and
+ 'oss_sec=sec_flavor'.
+\layout Itemize
+
+lmt should be able to add 'mds_sec' and 'oss_sec' into the xml file so that
+ they are recognizable by lconf.
+ And lconf should be able to write this info into the config log with the
+ option --write-conf.
+\layout Standard
+
+Usually we consider MDS and OSS to be trusted nodes, but networks are normally
+ not secure.
+ So MDS <=> MDS and MDS <=> OSS connections must be secure in most cases.
+ We should therefore also provide security on connections between servers.
+\layout Standard
+
+For inter-MDS and MDS-to-OSS connections, we provide options for lconf and
+ lmt, just like in the client <=> OSS case:
+\layout Itemize
+
+lconf should be able to recognize options '--inter_mds_sec=sec_flavor' and
+ '--mds_oss_sec=sec_flavor'.
+\layout Itemize
+
+lmt should be able to add 'inter_mds_sec' and 'mds_oss_sec' into the xml
+ file so that they are recognizable by lconf.
+\layout Standard
+
+Servers will have options to accept only certain types of connections.
+ When setting up an OSS/MDS via lconf, the option '--deny_sec=sec_flavor[,sec_flavor...]'
+ should be recognized and passed on to the OSS/MDS kernel.
+ Currently sec_flavor can be 'null', 'krb5i', or 'krb5p'.
+\layout Standard
+
+Privacy-protected connections to the OSS servers may only be needed from
+ the MDS, since there will be no secret transfer between OSS and client.
+ And if in the future we support mixed security types in a single security
+ context, then the integrity type might be enough for most cases.
+ But we provide the flexibility here anyway.
+\layout Section
+
+Use Cases
+\layout Subsection
+
+Mount lustre at client
+\layout Enumerate
+
+Sysadmin add options into config: lmt --mds_sec krb5p --oss_sec krb5i config.xml.
+ And setup OSS/MDS ready.
+\layout Enumerate
+
+User mount lustre by 'mount -t lustre server:/mds1/client /mnt/lustre'
+\layout Enumerate
+
+Connections to MDS's are privacy protected, connections to OSS's are integrity
+ protected.
+\layout Enumerate
+
+User umount lustre.
+\layout Enumerate
+
+User mount lustre by 'mount -t lustre -o mds_sec=krb5i,oss_sec=krb5p server:/mds
+1/client /mnt/lustre'
+\layout Enumerate
+
+Connections to MDS's are integrity protected, connections to OSS's are privacy
+ protected.
+\layout Enumerate
+
+User umount lustre.
+\layout Enumerate
+
+User mount lustre by 'mount -t lustre -o mds_sec=krb5p,oss_sec=krb5p server:/mds
+1/client /mnt/lustre'
+\layout Enumerate
+
+Connections to all MDS's and OSS's are privacy protected.
+\layout Subsection
+
+Startup MDS
+\layout Enumerate
+
+Sysadmin add options into config: lmt --inter_mds_sec krb5p --mds_oss_sec
+ krb5p config.xml
+\layout Enumerate
+
+Sysadmin start mds by: lconf --node mds config.xml.
+\layout Enumerate
+
+Connections between MDS's and MDS's to OSS's are privacy protected.
+\layout Enumerate
+
+Sysadmin stop MDS's.
+\layout Enumerate
+
+Sysadmin start mds again by: lconf --node mds --inter_mds_sec=krb5i --mds_oss_se
+c=krb5p config.xml.
+\layout Enumerate
+
+Connections between MDS's are integrity protected, while MDS's to OSS's
+ are privacy protected.
+\layout Subsection
+
+Deny certain type of connection
+\layout Enumerate
+
+Sysadmin start OSS's by 'lconf --node ost1 --deny_sec=null config.xml'
+\layout Enumerate
+
+Sysadmin start MDS's by 'lconf --node mds1 --mds_oss_sec=null config.xml',
+ setup will fail because OST reject connection from MDS's.
+\layout Enumerate
+
+Sysadmin start MDS's by 'lconf --node mds1 --deny_sec=null --mds_oss_sec=krb5i
+ config.xml', will succeed.
+\layout Enumerate
+
+Client mount by 'mount -t lustre -o mds_sec=null server:/mds1/client /mnt/lustre
+' or 'mount -t lustre -o oss_sec=null server:/mds1/client /mnt/lustre' will
+ fail because either MDS's or OSS's will reject connection.
+\layout Enumerate
+
+Client mount by 'mount -t lustre -o mds_sec=krb5i,oss_sec=krb5i server:/mds1/cli
+ent /mnt/lustre' will succeed.
+\layout Section
+
+Logic Specification
+\layout Standard
+
+With Kerberos, each service provider needs a service principal and a corresponding
+ service key installed.
+ Usually the principal is bound to a certain host for security.
+ For example, currently the lustre service principal is 'lustre/hostname@REALM'.
+ In the clustered MDS case, however, we should use a single principal for
+ all MDS's, to minimize the administrative burden.
+ It should be 'lustre@REALM' for all MDS's.
+ Now we should break 'lustre@REALM' into 2 principals: 'mds@REALM' for MDS
+ and 'oss@REALM' for OSS.
+ All MDS's will have the service key of 'mds@REALM' installed, while all
+ OSS's will have the service key of 'oss@REALM' installed.
+\layout Standard
+
+If MDS <=> MDS or MDS <=> OSS security is used, we also need to start the
+ client gss daemon (lgssd) on the MDS's at the proper time.
+ This needs to be incorporated into the test scripts.
+\layout Standard
+
+The interaction between the kernel gss module and lgssd needs some modification:
+ lgssd must be notified of the target service type (i.e.
+ mds or oss) in order to issue the correct gss request.
+\layout Standard
+
+The security flavor settings for the MDS startup procedure and the client
+ mount procedure need to be integrated into the MDS startup configuration
+ log.
+\layout Standard
+
+Additionally the MDS and OSS should have configuration options that provide
+ information on what kind of connections to accept.
+\layout Section
+
+State Management
+\layout Standard
+
+MDS nodes need to run lgssd if gss is active on any inter-server connection.
+\layout Standard
+
+No disk format change.
+ No special recovery consideration.
+\layout Section
+
+Alternatives
+\layout Standard
+
+None.
+\layout Section
+
+Focus of Inspection
+\layout Itemize
+
+Is there a cleaner design or division of these new options?
+\the_end
diff --git a/lustre/sec/doc/remote_ugid_HLD.lyx b/lustre/sec/doc/remote_ugid_HLD.lyx
new file mode 100644 (file)
index 0000000..c799534
--- /dev/null
@@ -0,0 +1,884 @@
+#LyX 1.3 created this file. For more info see http://www.lyx.org/
+\lyxformat 221
+\textclass article
+\language english
+\inputencoding auto
+\fontscheme times
+\graphics default
+\paperfontsize 12
+\spacing single 
+\papersize Default
+\paperpackage a4
+\use_geometry 0
+\use_amsmath 0
+\use_natbib 0
+\use_numerical_citations 0
+\paperorientation portrait
+\secnumdepth 3
+\tocdepth 3
+\paragraph_separation skip
+\defskip medskip
+\quotes_language english
+\quotes_times 2
+\papercolumns 1
+\papersides 1
+\paperpagestyle default
+
+\layout Title
+
+High Level Design of Remote UID/GID Handling
+\layout Author
+
+Peter Braam, Eric Mei
+\layout Date
+
+Jan 27, 2005
+\layout Section
+
+From the ERS (Engineering Requirements Spec, formerly Architecture)
+\layout Itemize
+
+Perform uid/gid translation between remote clients and local user database.
+\layout Itemize
+
+Handle client programs calling setuid/setgid/setgroups syscalls to obtain
+ unusual privileges.
+\layout Itemize
+
+Handle supplementary group membership.
+\layout Itemize
+
+Various security policies in situations with/without strong authentication
+ like Kerberos V5.
+\layout Paragraph
+
+NOTE:
+\layout Itemize
+
+Remote clients may have a different user database from that of the MDS's.
+\layout Itemize
+
+The remote ACL issue is addressed by a separate module.
+\layout Itemize
+
+Most of the content of this document has already been described in the Lustre
+ Book.
+\layout Standard
+
+The architecture prescribes a translation mechanism at the MDS: the MDS
+ will translate a locally found uid/gid, which is obtained through the kerberos
+ principal.
+\layout Section
+
+Functional Specification
+\layout Subsection
+
+Determine local/remote clients
+\layout Itemize
+
+
+\begin_inset Quotes eld
+\end_inset 
+
+local
+\begin_inset Quotes erd
+\end_inset 
+
+ client is the client node which is supposed to share the same user database
+ with MDS's.
+\layout Itemize
+
+
+\begin_inset Quotes eld
+\end_inset 
+
+remote
+\begin_inset Quotes erd
+\end_inset 
+
+ client is the client node which is supposed to have different user database
+ from MDS's.
+\layout Standard
+
+The MDS's will be able to determine whether a client node is a local or remote
+ one at the client's first connection to the MDS, and reply its decision
+ back to the client.
+ Later both MDS and client will make different operational decisions according
+ to this flag.
+ This remote flag is per-client, not per-user.
+ Once the MDS has made the decision, it stays unchanged until the client
+ leaves the cluster membership (umount or so).
+\layout Standard
+
+The MDS will do many conversions (mostly uid/gid mapping) for users on remote
+ clients because of the user database mismatch, and due to the nature of
+ this mismatch we have to put some limitations on users of remote clients,
+ compared to local clients.
+ The following sections give the detailed description.
+\layout Subsection
+
+Mapping uid/gid from clients
+\layout Standard
+
+For local clients, obviously we don't need to do any uid/gid mapping.
+ For remote clients, we need to translate the uid/gid in each request into
+ one which lives in the local user database, and vice versa: translate the
+ uid/gid in a reply into the one in the remote user database.
+ This translation affects the uid/gid's found in the inode as owner/group,
+ the security context which describes under what uid the MDS is executing,
+ and in some cases (chown is a good example) the arguments of calls.
+\layout Standard
+
+Each MDS will have to access a uid-mapping database, which prescribes which
+ principal from which nid/netid should be mapped to which local uid.
+ The mapping database must be the same on every MDS to get consistent results.
+ At runtime, when a remote user authenticates with the MDS, the corresponding
+ mapping entry will be read from the on-disk database and cached in the
+ kernel via an upcall.
+ Note that the same principal from different clients might be mapped to
+ different local users, according to the mapping database.
+ So on each MDS there's a per-client structure which maintains the uid
+ mapping cache.
+\layout Standard
+
+Each remote client must have nllu/nllg installed.
+ 'nllu' is for 
+\begin_inset Quotes eld
+\end_inset 
+
+Non Local Lustre User
+\begin_inset Quotes erd
+\end_inset 
+
+, while 'nllg' for 
+\begin_inset Quotes eld
+\end_inset 
+
+Non Local Lustre Group
+\begin_inset Quotes erd
+\end_inset 
+
+.
+ When a client first mounts a lustre fileset, it should notify the MDS which
+ local uid/gid act as nllu/nllg.
+ The MDS will translate unrecognized uid/gid's to these before sending a
+ reply to the client.
+ Thus from the client's point of view, files which belong to unauthorized
+ users will be shown as belonging to nllu/nllg.
+\layout Subsection
+
+Lustre security description (LSD)
+\layout Standard
+
+There's a security configuration database on each MDS, which describes who
+ (uid) from where (nid/netid) has permission to setuid/setgid/setgroups.
+ Later we might add more to it.
+ The database must be the same on every MDS to get consistent results.
+\layout Standard
+
+LSD refers to the in-kernel data structure which describes a user's security
+ properties on the MDS.
+ It can roughly be defined as:
+\layout LyX-Code
+
+struct lustre_sec_desc {
+\layout LyX-Code
+
+    uid_t            uid;
+\layout LyX-Code
+
+    gid_t            gid;
+\layout LyX-Code
+
+    supp_grp_t       supp_grp;
+\layout LyX-Code
+
+    setxid_desc      setxid;
+\layout LyX-Code
+
+    /* more security tags added here */
+\layout LyX-Code
+
+};
+\layout Standard
+
+In the future we'll add more special security tags to it.
+ Each LSD entry corresponds to a user in the local user database.
+ The 'setxid_desc' must be able to describe setuid/setgid/setgroups
+ permissions for different clients respectively.
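+\layout Standard
+
+As a purely illustrative sketch (the names and layout below are hypothetical,
+ not the actual on-disk or in-kernel format), 'setxid_desc' could be a small
+ per-client table of permission bits:
+\layout LyX-Code
+
+#define LSD_PERM_SETUID    0x01
+\layout LyX-Code
+
+#define LSD_PERM_SETGID    0x02
+\layout LyX-Code
+
+#define LSD_PERM_SETGRP    0x04
+\layout LyX-Code
+
+typedef struct {
+\layout LyX-Code
+
+        __u32   nentries;          /* number of per-client entries */
+\layout LyX-Code
+
+        struct {
+\layout LyX-Code
+
+                __u64   nid;       /* client nid, or a wildcard    */
+\layout LyX-Code
+
+                __u32   perms;     /* OR'ed LSD_PERM_* bits        */
+\layout LyX-Code
+
+        } *entries;
+\layout LyX-Code
+
+} setxid_desc;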
+\layout Standard
+
+The LSD cache is populated via an upcall at runtime.
+ The user-level helper is fed a uid as a parameter, finds out this uid's
+ principal gid and supplementary groups from the local user database, and
+ finds the setxid permission bits and other security tags from the on-disk
+ security database.
+\layout Standard
+
+Each LSD entry has a limited lifetime, and will be flushed out when it expires.
+ The next request coming from this user will result in the LSD being populated
+ again, with the up-to-date security settings if they changed.
+ The system administrator can also choose to flush a certain user's LSD
+ forcibly.
+\layout Standard
+
+Every filesystem access request from a client needs to go through the LSD
+ check.
+ This checking is uid based; for requests coming from a remote client, the
+ uid will be mapped first as described above, and then checked against the LSD.
+\layout Subsection
+
+The MDS security context 
+\layout Standard
+
+All kernel-level service threads running on the MDS run as root, waiting
+ for requests from other nodes and providing services.
+ But for requests that access the filesystem on behalf of a certain user,
+ those threads must act as that user, running with its identity.
+ Thus when such a request comes in, we first collect the identity information
+ for this user as described above, including uid, gid, etc., then switch the
+ identity in the process context before actually executing the filesystem
+ operation; we also need to switch the root directory of the process to the
+ root of the MDS's backend filesystem.
+ After it has finished, we switch back to the original context, ready for
+ the next request.
+\layout Standard
+
+Some requests are for special services like llog handling or special interactions
+ between MDS's; they don't represent any particular user, and require keeping
+ the root privilege.
+ In those situations we don't need to do such a context switch, nor the user
+ identity preparation.
+\layout Subsection
+
+Remote client cache flushing
+\layout Standard
+
+A remote client should realize that the ownership information of locally
+ cached files, e.g.
+ owner and group, has been translated by the server side, and some mappings
+ might become stale as time goes on.
+ For example: a user is newly authenticated, while some cached file which should
+ be owned by him still shows the owner as 
+\begin_inset Quotes eld
+\end_inset 
+
+nllu
+\begin_inset Quotes erd
+\end_inset 
+
+.
+ The client must choose the proper time to flush that stale ownership information,
+ to give users a consistent view.
+ All attribute locks held by clients must be given a revocation callback
+ when a new user connects.
+\layout Section
+
+Use Cases
+\layout Subsection
+
+Connect rpc from local realm (case 1)
+\layout Enumerate
+
+Alice doing 'mount'
+\layout Enumerate
+
+Alice sends the first ptlrpc request (MDS_CONNECT) without GSS security
+ to MDS;
+\layout Enumerate
+
+mds_handle() will initialize per-client structure, clear the remote flag
+ in it;
+\layout Enumerate
+
+After the connection is successfully established, the MDS sends the remote
+ flag back to the client for future use on the client side.
+\layout Subsection
+
+Connect rpc from local realm (case 2)
+\layout Enumerate
+
+Alice doing 'mount'
+\layout Enumerate
+
+Alice from a MDS local realm sends the first ptlrpc request (MDS_CONNECT)
+ with GSS security to MDS;
+\layout Enumerate
+
+MDS svcgssd will determine it's from a local realm client;
+\layout Enumerate
+
+mds_handle() will initialize per-client structure, clear the remote flag
+ in it;
+\layout Enumerate
+
+After the connection is successfully established, the MDS will send the remote
+ flag back to the client for future use on the client side.
+\layout Subsection
+
+Connect rpc from remote realm
+\layout Enumerate
+
+Alice from a MDS remote realm sends the first ptlrpc request (MDS_CONNECT)
+ with GSS security to MDS, along with its nllu/nllg id number;
+\layout Enumerate
+
+MDS svcgssd will determine it's from a remote realm client;
+\layout Enumerate
+
+mds_handle() logic will initialize per-client structure:
+\begin_deeper 
+\layout Enumerate
+
+Set the remote flag in it;
+\layout Enumerate
+
+Fill in the nllu/nllg ids obtained from client rpc request;
+\end_deeper 
+\layout Enumerate
+
+After the connection is successfully established, the MDS will send the remote
+ flag back to the client for future use on the client side.
+\layout Subsection
+
+Filesystem access request
+\layout Enumerate
+
+Alice (from a local or remote client) tries to access a file in lustre
+\layout Enumerate
+
+If Alice is from a remote client, the MDS does uid/gid mapping; otherwise nothing
+\layout Enumerate
+
+MDS obtains the LSD item for Alice
+\layout Enumerate
+
+MDS performs the permission check, based on LSD policies.
+\layout Enumerate
+
+MDS service process switches to this user's context
+\layout Enumerate
+
+MDS finishes the file operation on behalf of Alice.
+\layout Enumerate
+
+MDS service process switches back to the original context
+\layout Enumerate
+
+If Alice is from a remote client, the MDS does reverse uid/gid mapping if needed.
+\layout Enumerate
+
+MDS sends the reply.
+\layout Subsection
+
+Rpc after setuid/setgid/setgroups from local clients
+\layout Enumerate
+
+Alice calls setuid/setgid/setgroups to change her identity to Bob in local
+ client node X;
+\layout Enumerate
+
+Bob (Alice in fact) tries to access a lustre file which belongs to Bob;
+\layout Enumerate
+
+MDS will verify the permission of Bob through local cached LSD configuration;
+\layout Enumerate
+
+MDS turns down or accepts the file access request;
+\layout Subsection
+
+Rpc after setuid/setgid/setgroups from remote clients
+\layout Enumerate
+
+Alice calls setuid/setgid/setgroups to change her identity to Bob in remote
+ client node Y;
+\layout Enumerate
+
+Bob (Alice in fact) tries to access a lustre file which belongs to Bob;
+\layout Enumerate
+
+MDS will find Bob is from the remote realm and in fact he is not real Bob;
+\layout Enumerate
+
+MDS turns down the file access request;
+\layout Subsection
+
+Update LSD configuration in MDS
+\layout Enumerate
+
+The Lustre system administrator wants to update the current LSD settings;
+\layout Enumerate
+
+The sysadmin uses the lsd update utility, which will update the on-disk security
+ database and notify the MDS of the changes to the LSD configuration;
+\layout Enumerate
+
+MDS refreshes the cached LSD info through an upcall.
+\layout Subsection
+
+Revoke a local user
+\layout Enumerate
+
+Bob is able to access lustre filesystem
+\layout Enumerate
+
+Sysadmin remove Bob from the MDS's local user database, and flush in-kernel
+ LSD cache for Bob.
+\layout Enumerate
+
+Bob will not be able to access MDS immediately
+\layout Subsection
+
+Revoke a remote user
+\layout Enumerate
+
+Alice of a remote client is mapped to MDS local user Bob.
+\layout Enumerate
+
+Alice is able to access lustre filesystem
+\layout Enumerate
+
+Sysadmin remove the mapping 
+\begin_inset Quotes eld
+\end_inset 
+
+Alice->Bob
+\begin_inset Quotes erd
+\end_inset 
+
+ from mapping database, and flush in-kernel mapping entry.
+\layout Enumerate
+
+Alice will not be able to access MDS immediately.
+\layout Enumerate
+
+If the mapping 
+\begin_inset Quotes eld
+\end_inset 
+
+anyone else -> Carol
+\begin_inset Quotes erd
+\end_inset 
+
+ exist in the mapping database, Alice could reconnect to MDS and then will
+ be mapped to Carol.
+\layout Subsection
+
+Revoke a remote user (2)
+\layout Enumerate
+
+Alice of a remote client is mapped to MDS local user Bob.
+\layout Enumerate
+
+Alice is able to access lustre filesystem
+\layout Enumerate
+
+Sysadmin remove Bob from the MDS's local user database, and flush in-kernel
+ LSD cache for Bob.
+\layout Enumerate
+
+Alice will not be able to access MDS immediately.
+\layout Enumerate
+
+If the mapping 
+\begin_inset Quotes eld
+\end_inset 
+
+anyone else -> Carol
+\begin_inset Quotes erd
+\end_inset 
+
+ exist in the mapping database, Alice could reconnect to MDS and then will
+ be mapped to Carol.
+\layout Subsection
+
+'ls -l' on remote client
+\layout Enumerate
+
+Suppose on a remote client, Alice's principal group is AliceGrp and Bob's
+ principal group is BobGrp.
+\layout Enumerate
+
+There are several files on lustre: file_1 belongs to Alice:AliceGrp; file_2
+ belongs to Alice:BobGrp; file_3 belongs to Bob:AliceGrp; file_4 belongs
+ to Bob:BobGrp; file_5 belongs to Bob:nllg;
+\layout Enumerate
+
+Alice does 'ls -l'; the output is like this: file_1 belongs to Alice:AliceGrp;
+ file_2 belongs to Alice:nllg; file_3 belongs to nllu:AliceGrp; file_4 belongs
+ to nllu:nllg; file_5 belongs to nllu:nllg;
+\layout Enumerate
+
+Bob has just logged into the client system and also does 'ls -l'; the output
+ is like this: file_1 belongs to Alice:AliceGrp; file_2 belongs to Alice:BobGrp;
+ file_3 belongs to Bob:AliceGrp; file_4 belongs to Bob:BobGrp; file_5 belongs
+ to Bob:nllg;
+\layout Enumerate
+
+Alice does 'ls -l' again; the output is the same as Bob's list.
+\layout Enumerate
+
+Alice logs out, then Bob does 'ls -l' again; the output is like this: file_1
+ belongs to nllu:nllg; file_2 belongs to nllu:BobGrp; file_3 belongs to Bob:nllg;
+ file_4 belongs to Bob:BobGrp; file_5 belongs to Bob:nllg;
+\layout Subsection
+
+Chown on remote client
+\layout Enumerate
+
+The root user on a remote client wants to change the owner of a file to
+ Bob, while Bob has not logged in (authenticated with lustre) yet.
+\layout Enumerate
+
+The MDS can't find a mapping for the destination uid, so it returns an error.
+\layout Enumerate
+
+Bob logs in at that time.
+\layout Enumerate
+
+Root does the same chown again.
+\layout Enumerate
+
+MDS will grant the request, no matter what the original owner of this file
+ is.
+\layout Subsection
+
+Chgrp on remote client
+\layout Enumerate
+
+Traditional chgrp on a remote client is not allowed, since there's no clear
+ group id mapping between the local and remote databases,
+ so the group id on the remote client is not meaningful on the MDS.
+\layout Section
+
+Logic Specification
+\layout Subsection
+
+Specify nllu/nllg
+\layout Standard
+
+When the client mounts, in addition to other parameters, the user needs to
+ supply the IDs of nllu/nllg on this client, which will be sent to the MDS
+ at connection time.
+ If no nllu/nllg is explicitly supplied, default values will be used.
+\layout Subsection
+
+Determine local or remote client
+\layout Standard
+
+Under GSS protection, the user can explicitly supply the remote flag at
+ mount time.
+ The MDS makes the decision in the following order:
+\layout Itemize
+
+All permitted connections without GSS security are from local realm clients.
+\layout Itemize
+
+For all connections with GSS security, if the user supplied the remote flag
+ during mount, the MDS will grant the flag as requested.
+\layout Itemize
+
+All connections with GSS/local_realm_kerberos are from local realm clients.
+\layout Itemize
+
+All connections with GSS/remote_realm_kerberos are from remote realm clients.
+\layout Standard
+
+Here we make the assumption that kerberos's local/remote realm == lustre's
+ local/remote realm.
+ Later we might bring more factors into this decision making.
+\layout Standard
+
+The GSS/Kerberos module is responsible for providing the information about
+ whether the initial connect request has strong security and whether it
+ comes from a remote kerberos realm.
+\layout Standard
+
+On the MDS's, the per-client export structure has a flag to indicate whether
+ this client is local or remote.
+ Accordingly, each client has a similar flag, which is sent back by the MDS
+ after the initial connection.
+\layout Subsection
+
+Handle local rpc request
+\layout Standard
+
+For each filesystem access request from a client, we first get the LSD for
+ this uid.
+ We look it up in the cache; if it is not found or is already invalid, we
+ issue an upcall to get it.
+ If we finally fail to get the LSD (timeout or an error), we simply deny
+ the request.
+\layout Standard
+
+After obtaining the LSD, we also check whether the client intends to do
+ setuid/setgid/setgroups.
+ If yes, we check the permission bits in the LSD; if it is not allowed we
+ also deny the request.
+ The intention of setuid/setgid can be detected by comparing the uid, gid,
+ fsuid, and fsgid sent by the client with the local authorized uid/gid.
+\layout Standard
+
+If setgroups is permitted: for root we'll directly use the supplementary
+ groups array sent by the client; for a normal user we compare the groups
+ sent by the client with those in the LSD, guaranteeing that the client can
+ only reduce the array (it can't add new ids which are not part of the group
+ array in the LSD).
+\layout Standard
+
+If setgroups is not permitted, we simply use the supplementary group array
+ provided by LSD.
+\layout Standard
+
+After the security context has been prepared as above, we switch it into
+ the process context and perform the actual filesystem operation.
+ After it has finished, we switch back to the original context and send the
+ reply out to the client.
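+\layout Standard
+
+A minimal sketch of the setgroups reduction check described above is given
+ below; the field names (nsuppgrps, suppgrps) are hypothetical, not the
+ actual LSD layout:
+\layout LyX-Code
+
+/* return 1 if every group sent by the client is already in the LSD,
+\layout LyX-Code
+
+ * i.e. the client only reduced the supplementary group array */
+\layout LyX-Code
+
+static int lsd_groups_allowed(struct lustre_sec_desc *lsd,
+\layout LyX-Code
+
+                              __u32 *cli_grps, int ncli)
+\layout LyX-Code
+
+{
+\layout LyX-Code
+
+        int i, j;
+\layout LyX-Code
+
+        for (i = 0; i < ncli; i++) {
+\layout LyX-Code
+
+                for (j = 0; j < lsd->nsuppgrps; j++)
+\layout LyX-Code
+
+                        if (cli_grps[i] == lsd->suppgrps[j])
+\layout LyX-Code
+
+                                break;
+\layout LyX-Code
+
+                if (j == lsd->nsuppgrps)
+\layout LyX-Code
+
+                        return 0;   /* tried to add a new group */
+\layout LyX-Code
+
+        }
+\layout LyX-Code
+
+        return 1;
+\layout LyX-Code
+
+}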
+\layout Standard
+
+Later a special security policy is needed to allow RAW access by FID without
+ a capability.
+ This is used for analyzing audit logs, finding pathnames from fids (for
+ recovery), etc.
+\layout Subsection
+
+Remote user mapping database
+\layout Standard
+
+There will be a user mapping configuration file on MDS, already defined
+ in 
+\begin_inset Quotes eld
+\end_inset 
+
+functional specification
+\begin_inset Quotes erd
+\end_inset 
+
+.
+ The MDS kernel will also maintain a cache of this mapping information.
+ It is populated by an upcall to the server-side gss daemon, along with the
+ gss credential information.
+\layout Itemize
+
+The on-disk mapping database only describes how a user (principal) is mapped
+ to a local uid, and doesn't need to specify the gid mapping.
+\layout Itemize
+
+Both the on-disk mapping database and the kernel mapping cache should be
+ able to map all other remote users to a certain local user.
+\layout Itemize
+
+On the MDS, the per-client structure will maintain this mapping cache.
+ When a user from a remote client gets authenticated, we check the on-disk
+ mapping database.
+ If no mapping item for this user is found, we deny the user; otherwise we
+ record the target uid.
+\layout Itemize
+
+When an fs access request comes from a remote client, it contains the user's
+ uid and gid on the remote client.
+ Here we can establish the mapping between the uid and the target uid.
+ With the target uid we can find the target gid from the local user database
+ (from the LSD), and thus we can also establish the mapping between the gid
+ and the target gid.
+\layout Itemize
+
+With the mappings established above, we now do the translation: replace the
+ uid/gid in the rpc request with the target uid/gid.
+ If the request is a chown we also check & map the new owner id.
+\layout Itemize
+
+When the reply is populated and about to be sent back, we again check the
+ mapping cache, and do the reverse mapping in the cases which return file
+ attributes to clients.
+ For those where no matching item is found, map them to the nllu/nllg of
+ this remote client.
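+\layout Standard
+
+For illustration only (the names below are hypothetical, not the actual
+ implementation), an entry in the per-client mapping cache might look roughly
+ like:
+\layout LyX-Code
+
+struct mds_idmap_entry {
+\layout LyX-Code
+
+        struct list_head  hash_link;  /* chained in a per-export hash    */
+\layout LyX-Code
+
+        uid_t             rmt_uid;    /* uid on the remote client        */
+\layout LyX-Code
+
+        uid_t             lcl_uid;    /* mapped uid in the local database */
+\layout LyX-Code
+
+        gid_t             rmt_gid;    /* gid on the remote client        */
+\layout LyX-Code
+
+        gid_t             lcl_gid;    /* gid obtained from the local LSD */
+\layout LyX-Code
+
+};
+\layout Standard
+
+Request handling then substitutes lcl_uid/lcl_gid for rmt_uid/rmt_gid on
+ the way in, and does the reverse substitution (falling back to nllu/nllg)
+ when a reply carries file attributes back out.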
+\layout Subsection
+
+Handle remote rpc request
+\layout Standard
+
+The overall process of handling a remote rpc request is the same as for a
+ local user, except for the following:
+\layout Itemize
+
+For an incoming request, first do the uid/gid mapping for the requestor,
+ and do the reverse mapping for the reply, as described above.
+\layout Itemize
+
+No setuid/setgid/setgroups intention is permitted, unless we explicitly
+ allow setuid-root in the setxid database.
+ So we ignore the supplementary groups sent by the client (if any), and simply
+ use the ones provided by the LSD.
+\layout Itemize
+
+For a chown request, we also translate the new owner id (already described
+ above) according to the in-kernel mapping cache.
+ This means the root user on a remote client can't change the owner of a
+ file to a user who has not logged in yet.
+\layout Itemize
+
+Deny all chgrp requests, since a group on a remote client has no clear mapping
+ to the MDS's local user database (we could also choose to allow this when
+ the new group id shows up in the in-kernel mapping cache, but that doesn't
+ seem to make much sense).
+ So we probably need a special tool like 
+\begin_inset Quotes eld
+\end_inset 
+
+lfs chgrp
+\begin_inset Quotes erd
+\end_inset 
+
+ to perform chgrp on a remote client, which will send out the text name
+ instead of translating to an id locally.
+\layout Subsection
+
+Remote client cache flushing
+\layout Standard
+
+At any time there might be cached inodes whose owner is nllu/nllg.
+ If a new user Alice gets authenticated and she happens to be the owner of
+ those inodes, we need to refresh those inodes even if their cache status
+ is correct, otherwise Alice will find her files belonging to others.
+ Since we don't know whether an inode with nllu/nllg belongs to Alice or
+ not, we must flush all of them.
+\layout Standard
+
+On the MDS, a callback or similar event notification mechanism should be
+ hooked into the gss module.
+ When a user is authenticated for the first time, we should iterate through
+ all the granted locks corresponding to this client, and revoke them selectively.
+ Strictly speaking we only want to revoke those inodebits locks whose resource
+ (inode) owner/group does not show up in the in-kernel mapping database,
+ but here we just flush all the inodebits locks; the cache is quickly re-populated
+ - there are a maximum of 20-100 cached locks on clients at the moment.
+\layout Standard
+
+When Alice logs out of the client system, we do similar things: iterate
+ through all the granted locks corresponding to this client, and revoke
+ them selectively.
+ Here we want to revoke those inodebits locks whose resource (inode) owner/group
+ is Alice.
+ We could also choose to flush all of them as in the above case.
+\layout Subsection
+
+LSD upcall
+\layout Standard
+
+There is general upcall-cache code which does the upcall into user space,
+ caches the data passed down into the kernel, and also implements timeout
+ invalidation.
+ The kernel LSD can simply be implemented as an instance of it, so it will
+ be quite simple.
+\layout Standard
+
+A user-space tool should provide the following functionality:
+\layout Itemize
+
+Accept uid as parameter
+\layout Itemize
+
+Obtain the gid and the supplementary group id array which the uid belongs
+ to; if this fails just return an error.
+\layout Itemize
+
+Obtain the setxid permission bits for this user on this NID from the database.
+ If not found, a default bitset will be applied: (1) for a local client:
+ setuid/setgid is off, setgroups for root is off, setgroups for a normal
+ user is on; (2) for a remote client: all of setuid/setgid/setgroups are off.
+\layout Itemize
+
+Pass all the collected information back to the kernel via /proc.
+\layout Standard
+
+Since the upcalls could happen concurrently, and the admin could modify
+ the database at any time, a kind of read-write lock needs to be taken
+ on the database file.
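+\layout Standard
+
+A minimal sketch of such a helper is shown below, using the standard
+ getpwuid() and getgrouplist() libc calls.
+ The setxid lookup and the exact /proc record format are left as comments
+ because they are defined by the kernel LSD code and the security database,
+ not by this sketch.
+\layout LyX-Code
+
+/* LSD upcall helper sketch; the /proc record format is an assumption */
+\layout LyX-Code
+
+#include <stdlib.h>
+\layout LyX-Code
+
+#include <pwd.h>
+\layout LyX-Code
+
+#include <grp.h>
+\layout LyX-Code
+
+int main(int argc, char *argv[])
+\layout LyX-Code
+
+{
+\layout LyX-Code
+
+        struct passwd *pw = getpwuid(atoi(argv[1]));
+\layout LyX-Code
+
+        gid_t groups[64];
+\layout LyX-Code
+
+        int ngroups = 64;
+\layout LyX-Code
+
+        if (pw == NULL)
+\layout LyX-Code
+
+                return 1;   /* not in the user database: report an error */
+\layout LyX-Code
+
+        getgrouplist(pw->pw_name, pw->pw_gid, groups, &ngroups);
+\layout LyX-Code
+
+        /* look up the setxid bits for this uid/NID, or apply the defaults */
+\layout LyX-Code
+
+        /* write uid, gid, groups and bits back to the kernel via /proc */
+\layout LyX-Code
+
+        return 0;
+\layout LyX-Code
+
+}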
+\layout Subsection
+
+Recovery consideration
+\layout Standard
+
+All the code here should have minimal effect on recovery.
+ After an MDS crash, the security context will be established at connection
+ time during recovery; and the uid-mapping cache and LSD actually are 
+\begin_inset Quotes eld
+\end_inset 
+
+adaptive
+\begin_inset Quotes erd
+\end_inset 
+
+, they will also be re-populated when handling the related user's replay
+ requests during/after recovery.
+\layout Section
+
+State Management
+\layout Subsection
+
+Configuration states
+\layout Itemize
+
+Client has a remote flag at mount time.
+\layout Itemize
+
+Remote clients must have nllu:nllg installed.
+ It could simply be nobody:nobody.
+\layout Itemize
+
+The MDS could have a remote-user mapping database which describes which
+ principal at which client should be mapped to which local user.
+ Without this database no remote client is allowed to connect.
+\layout Itemize
+
+The MDS could have a security database which contains setxid permissions
+ along with other security settings for each affected user.
+ If there is no such database, a default setting will be applied.
+\layout Subsection
+
+LSD entry state transitions
+\layout Enumerate
+
+NEW: generated and submitted to the upcall
+\layout Enumerate
+
+READY: ready to serve
+\layout Enumerate
+
+INVALID: expired or error
+\layout Standard
+
+A requestor will initiate a NEW LSD entry; after the upcall successfully
+ fills in the data it changes to READY; if a timeout or some error happens
+ (e.g.
+ not found in the user database) during the upcall it changes to INVALID;
+ a READY LSD changes to INVALID when it expires, is flushed forcibly by
+ the sysadmin, or the MDS shuts down; an INVALID LSD will soon be destroyed.
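+\layout Standard
+
+The states above map naturally onto a small enum; the sketch below is
+ illustrative only, the real LSD structure lives in the MDS code and may
+ differ.
+\layout LyX-Code
+
+/* illustrative only; these are not the real MDS identifiers */
+\layout LyX-Code
+
+enum lsd_entry_state {
+\layout LyX-Code
+
+        LSD_NEW,      /* created by a requestor, upcall submitted */
+\layout LyX-Code
+
+        LSD_READY,    /* upcall filled in the data, entry can serve */
+\layout LyX-Code
+
+        LSD_INVALID   /* expired, flushed, errored, or MDS shutdown */
+\layout LyX-Code
+
+};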
+\layout Standard
+
+No disk format is changed.
+ When a large number of users access lustre from all kinds of local/remote
+ clients at the same time, the MDS will have more CPU and memory overhead,
+ especially for remote users.
+ There is no special recovery consideration.
+\layout Section
+
+Alternatives
+\layout Subsection
+
+NFSv4
+\layout Standard
+
+NFSv4 sends users and groups by name.
+\layout Section
+
+Focus of Inspection
+\layout Itemize
+
+Could this pass the HP acceptance test?
+\layout Itemize
+
+Is anything not reasonable? Are there any security holes?
+\layout Itemize
+
+Is everything recoverable from an MDS/client crash?
+\the_end
diff --git a/lustre/sec/doc/revoke_user_HLD.lyx b/lustre/sec/doc/revoke_user_HLD.lyx
new file mode 100644 (file)
index 0000000..f454dc5
--- /dev/null
@@ -0,0 +1,244 @@
+#LyX 1.3 created this file. For more info see http://www.lyx.org/
+\lyxformat 221
+\textclass article
+\language english
+\inputencoding auto
+\fontscheme times
+\graphics default
+\paperfontsize 12
+\spacing single 
+\papersize Default
+\paperpackage a4
+\use_geometry 0
+\use_amsmath 0
+\use_natbib 0
+\use_numerical_citations 0
+\paperorientation portrait
+\secnumdepth 3
+\tocdepth 3
+\paragraph_separation skip
+\defskip medskip
+\quotes_language english
+\quotes_times 2
+\papercolumns 1
+\papersides 1
+\paperpagestyle default
+
+\layout Title
+
+High Level Design of User Revoke
+\layout Author
+
+Peter Braam, Eric Mei
+\layout Date
+
+Jan 31, 2005
+\layout Section
+
+Requirement
+\layout Itemize
+
+Be able to revoke a user, preventing them from accessing lustre immediately.
+\layout Itemize
+
+Be able to pass the HP acceptance sub-test 4.1.51.
+\layout Itemize
+
+Provide a user & mapping database manipulation API.
+\layout Section
+
+Functional Specification
+\layout Standard
+
+A sub-command 
+\begin_inset Quotes eld
+\end_inset 
+
+revoke
+\begin_inset Quotes erd
+\end_inset 
+
+ will be added into the existing tool 'lctl'.
+ When the system administrator wants to kick somebody off the lustre filesystem
+ (e.g.
+ a certain user is known to be malicious or an account has been compromised),
+ he could use this functionality on the MDS's to prevent the victim user
+ from accessing the lustre filesystem right away.
+ The command format could be:
+\layout LyX-Code
+
+lctl revoke user|all
+\layout Itemize
+
+Here the 'user' format is: uid[@nid[.netid]]
+\layout Itemize
+
+The optional @nid.netid part is only for remote users.
+ The uid is in terms of the local uid, thus 'uid@remote_nid.netid' means
+ the remote users on node 'remote_nid.netid' who are mapped to the local
+ 'uid'; it is not intended to remove a certain user on a specific node.
+\layout Itemize
+
+A uid specified without a nid or netid matches all nids and netids (see
+ the examples after this list).
+\layout Itemize
+
+'all' means revoke all users.
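+\layout Standard
+
+For illustration, assuming a local uid 500 and a remote client whose
+ nid/netid is written here as 'cli1.tcp' (both values are made up), typical
+ invocations would be:
+\layout LyX-Code
+
+lctl revoke 500             # local uid 500, on every nid/netid
+\layout LyX-Code
+
+lctl revoke 500@cli1.tcp    # only remote users mapped to uid 500 on that node
+\layout LyX-Code
+
+lctl revoke all             # revoke all users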
+\layout Standard
+
+Actually lctl only removes the in-kernel caches for the victim user; usually
+ there is much other configuration work that needs to be done using other
+ admin tools:
+\layout Itemize
+
+Kerberos Database: To remove a user from the kerberos principal database,
+ the sysadmin must use the kerberos admin tools.
+ This change will not take effect right away if the victim user has
+ authenticated with the MDS's before the removal (because of the client
+ side credential cache).
+\layout Itemize
+
+User Database: To remove a user from the user database, the sysadmin must
+ also resort to other tools, usually standard unix tools.
+ This change will not take effect right away if this user had ever accessed
+ lustre before the removal (because of the in-kernel LSD cache).
+\layout Itemize
+
+User Mapping Database: To remove a user from the remote user mapping database,
+ the sysadmin needs to edit the configuration file manually.
+ This only affects a certain user on a certain remote client.
+ This change will not take effect right away if this user had ever accessed
+ lustre before the removal (because of the in-kernel uid mapping cache).
+\layout Standard
+
+So when the sysadmin actually revokes a user, he usually first performs
+ one or more of the steps above as required, then invokes lctl to finally
+ revoke the user.
+ In case the user database or user mapping database is not centrally
+ managed by e.g.
+ LDAP, the sysadmin must remove the user from the configuration files on
+ each of the MDS's; this could be done by using 'pdsh', etc.
+\layout Standard
+
+What is described above is the basic requirement.
+ There is an additional one: for the user and mapping databases, write a
+ C API library (probably adding python support later), which can query,
+ add, remove, and enumerate users in each database.
+ 'edit' could be implemented as remove + add.
+\layout Standard
+
+By using this API, we could provide much more complete functionality.
+ The sysadmin could manage everything about user accounts within the single
+ lctl tool; the kernel upcall helper could also use this API to obtain
+ information from the mapping database, etc.
+ A sketch of what such an API might look like is given below.
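+\layout Standard
+
+The header name and function signatures in the sketch below are invented
+ here purely for illustration; the real API is left to the DLD.
+\layout LyX-Code
+
+/* hypothetical lustre_userdb.h - the real API is left to the DLD */
+\layout LyX-Code
+
+struct udb_entry;   /* opaque record, defined by the DLD */
+\layout LyX-Code
+
+int udb_query(uid_t uid, struct udb_entry *entry);
+\layout LyX-Code
+
+int udb_add(const struct udb_entry *entry);
+\layout LyX-Code
+
+int udb_remove(uid_t uid);
+\layout LyX-Code
+
+int udb_enumerate(int (*cb)(const struct udb_entry *entry, void *arg), void *arg);
+\layout LyX-Code
+
+/* 'edit' is simply udb_remove() followed by udb_add() */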
+\layout Section
+
+Use Cases
+\layout Subsection
+
+Revoke Alice's access right on all clients, permanently
+\layout Enumerate
+
+Sysadmin removes Alice from the user database on all MDS's.
+\layout Enumerate
+
+Sysadmin invokes 'lctl revoke alice_uid' on all MDS's.
+\layout Enumerate
+
+Alice from local clients will not be able to access lustre.
+\layout Enumerate
+
+Any remote users who are mapped to Alice will not be able to access lustre.
+\layout Subsection
+
+Revoke Alice's access right on remote client remote1
+\layout Enumerate
+
+Suppose alice@remote1 is mapped to local user Bob.
+\layout Enumerate
+
+Sysadmin removes the mapping entry 'alice_uid@remote1 -> bob' from the user
+ mapping database.
+\layout Enumerate
+
+Sysadmin invokes 'lctl revoke bob_uid@remote1' on all MDS's.
+\layout Enumerate
+
+Alice will not be able to access lustre from remote1.
+\layout Enumerate
+
+Bob on a local client can still work fine.
+\layout Section
+
+Logic Specification
+\layout Standard
+
+There are several kinds of in-kernel caches for a given user: LSD, gss
+ context, and uid-mapping.
+ In the future we might also consider removing the OSS access capability.
+ A sketch of the flushing steps is given after this list.
+\layout Enumerate
+
+LSD: On each MDS, each user (uid) corresponds to at most one LSD entry.
+ There is already an existing interface to flush the LSD for a certain user:
+ simply write a uid into '/proc/fs/lustre/mds/lsd_flush' (note this is
+ subject to change).
+ Writing '-1' will flush all LSD entries.
+\layout Enumerate
+
+GSS Context: On each MDS, each user (principal) might correspond to several
+ (even many) gss contexts.
+ The gss module should export a proc entry.
+ When provided with a uid and a remote nid/netid, it should be able to find
+ the initiating/established gss contexts and destroy them.
+ Providing a special tag will flush all gss contexts.
+\layout Enumerate
+
+UID Mapping: First find the per-client structure for the specified nid/netid,
+ then destroy the mapping entries for the specified uid.
+ Since this is strongly related to the gss context, we can use the proc
+ entry exported for gss contexts to initiate this flush.
+ Thus when the sysadmin tries to flush the gss contexts for a certain user,
+ we also flush the associated uid-mapping.
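+\layout Standard
+
+As a concrete illustration of step 1, flushing the LSD entry of one user
+ from user space could look like the snippet below.
+ The /proc path is the one noted above and is subject to change; the
+ gss-context proc entry is not shown because its name is not defined yet.
+\layout LyX-Code
+
+/* flush the LSD entry for one uid; the path is subject to change */
+\layout LyX-Code
+
+#include <stdio.h>
+\layout LyX-Code
+
+int lsd_flush(unsigned int uid)
+\layout LyX-Code
+
+{
+\layout LyX-Code
+
+        FILE *f = fopen("/proc/fs/lustre/mds/lsd_flush", "w");
+\layout LyX-Code
+
+        if (f == NULL)
+\layout LyX-Code
+
+                return -1;
+\layout LyX-Code
+
+        fprintf(f, "%u", uid);   /* writing "-1" instead flushes all entries */
+\layout LyX-Code
+
+        return fclose(f);
+\layout LyX-Code
+
+}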
+\layout Standard
+
+This work should be done after the completion of the GSS and remote uid/gid
+ handling implementations.
+\layout Standard
+
+The user and mapping database manipulation API could be simple, without
+ many restrictions, and the details depend very much on the actual database
+ structure.
+ We leave the details to the following DLD document.
+\layout Section
+
+State Management
+\layout Standard
+
+Since we'll flush several caches separately, we might have situations that
+ are not strictly consistent.
+ For example, after we flush alice from cache1, someone might re-populate
+ cache1 while we are flushing cache2.
+ In fact, inconsistency between the LSD and the gss context is perfectly
+ acceptable.
+ Only one thing needs to be ensured: since the uid mapping is established
+ after the gss context, we need to flush the uid mapping first, and then
+ flush the gss context.
+ This prevents unnecessary errors when doing a 'revoke' while the user has
+ not actually been removed from the mapping database.
+\layout Standard
+
+There are no serious locking issues and no special recovery considerations.
+\layout Section
+
+Alternatives
+\layout Standard
+
+None.
+\layout Section
+
+Focus of Inspection
+\layout Itemize
+
+Does the lctl interface reasonably reflect the facts?
+\layout Itemize
+
+Could it pass the acceptance test?
+\the_end
diff --git a/lustre/sec/gss/.cvsignore b/lustre/sec/gss/.cvsignore
new file mode 100644 (file)
index 0000000..d5103fa
--- /dev/null
@@ -0,0 +1,15 @@
+.Xrefs
+config.log
+config.status
+configure
+Makefile
+.deps
+TAGS
+.*.cmd
+autoMakefile.in
+autoMakefile
+*.ko
+*.mod.c
+.*.o.flags
+.tmp_versions
+.depend
diff --git a/lustre/sec/gss/Makefile.in b/lustre/sec/gss/Makefile.in
new file mode 100644 (file)
index 0000000..ccfd0d3
--- /dev/null
@@ -0,0 +1,9 @@
+#MODULES := ptlrpcs_gss ptlrpcs_gss_krb5
+MODULES := ptlrpcs_gss
+ptlrpcs_gss-objs := sec_gss.o svcsec_gss.o rawobj.o gss_mech_switch.o \
+                    gss_generic_token.o gss_krb5_crypto.o gss_krb5_seal.o \
+                    gss_krb5_unseal.o gss_krb5_seqnum.o gss_krb5_mech.o \
+                    gss_krb5_wrap.o
+#ptlrpcs_gss_krb5-objs := gss_krb5_mech.o
+
+@INCLUDE_RULES@
diff --git a/lustre/sec/gss/Makefile.mk b/lustre/sec/gss/Makefile.mk
new file mode 100644 (file)
index 0000000..08de7a4
--- /dev/null
@@ -0,0 +1,14 @@
+# Copyright (C) 2004  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include $(src)/../../portals/Kernelenv
+
+#obj-y += ptlrpcs_gss.o ptlrpcs_gss_krb5.o
+obj-y += ptlrpcs_gss.o
+ptlrpcs_gss-objs := sec_gss.o svcsec_gss.o rawobj.o gss_mech_switch.o \
+                    gss_generic_token.o gss_krb5_crypto.o gss_krb5_seal.o \
+                    gss_krb5_unseal.o gss_krb5_seqnum.o gss_krb5_mech.o \
+                   gss_krb5_wrap.o
+#ptlrpcs_gss_krb5-objs := gss_krb5_mech.o
diff --git a/lustre/sec/gss/autoMakefile.am b/lustre/sec/gss/autoMakefile.am
new file mode 100644 (file)
index 0000000..f729d06
--- /dev/null
@@ -0,0 +1,23 @@
+# Copyright (C) 2004  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+if LIBLUSTRE
+noinst_LIBRARIES = libptlrpcs_gss.a
+libptlrpcs_gss_a_SOURCES = sec_gss.c gss_mech_switch.c gss_krb5_mech.c \
+                           gss_generic_token.c gss_krb5_crypto.c       \
+                           gss_krb5_seal.c gss_krb5_unseal.c           \
+                           gss_krb5_seqnum.c rawobj.c
+
+libptlrpcs_gss_a_CPPFLAGS = $(LLCPPFLAGS)
+libptlrpcs_gss_a_CFLAGS = $(LLCFLAGS)
+endif
+
+if MODULES
+modulefs_DATA = ptlrpcs_gss$(KMODEXT)
+endif
+
+DIST_SOURCES = $(ptlrpcs_gss-objs:.o=.c) gss_internal.h gss_api.h gss_asn1.h \
+               gss_err.h gss_krb5.h
+MOSTLYCLEANFILES = *.o *.ko *.mod.c
diff --git a/lustre/sec/gss/gss_api.h b/lustre/sec/gss/gss_api.h
new file mode 100644 (file)
index 0000000..06557d4
--- /dev/null
@@ -0,0 +1,132 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Modifications for Lustre
+ * Copyright 2004, Cluster File Systems, Inc.
+ * All rights reserved
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ * Somewhat simplified version of the gss api.
+ *
+ * Dug Song <dugsong@monkey.org>
+ * Andy Adamson <andros@umich.edu>
+ * Bruce Fields <bfields@umich.edu>
+ * Copyright (c) 2000 The Regents of the University of Michigan
+ *
+ * $Id: gss_api.h,v 1.2 2005/03/31 22:18:24 ericm Exp $
+ */
+
+#ifndef __SEC_GSS_GSS_API_H_
+#define __SEC_GSS_GSS_API_H_
+
+struct gss_api_mech;
+
+/* The mechanism-independent gss-api context: */
+struct gss_ctx {
+        struct gss_api_mech        *mech_type;
+        void                       *internal_ctx_id;
+};
+
+#define GSS_C_NO_BUFFER                ((rawobj_t) 0)
+#define GSS_C_NO_CONTEXT        ((struct gss_ctx *) 0)
+#define GSS_C_NULL_OID                ((rawobj_t) 0)
+
+/*XXX  arbitrary length - is this set somewhere? */
+#define GSS_OID_MAX_LEN 32
+
+/* gss-api prototypes; note that these are somewhat simplified versions of
+ * the prototypes specified in RFC 2744. */
+__u32 kgss_import_sec_context(
+                rawobj_t                *input_token,
+                struct gss_api_mech     *mech,
+                struct gss_ctx         **ctx_id);
+__u32 kgss_inquire_context(
+                struct gss_ctx         *ctx_id,
+                __u64                  *endtime);
+__u32 kgss_get_mic(
+                struct gss_ctx          *ctx_id,
+                __u32                    qop,
+                rawobj_t                *message,
+                rawobj_t                *mic_token);
+__u32 kgss_verify_mic(
+                struct gss_ctx          *ctx_id,
+                rawobj_t                *message,
+                rawobj_t                *mic_token,
+                __u32                   *qstate);
+__u32 kgss_wrap(
+                struct gss_ctx          *ctx_id,
+                __u32                    qop,
+                rawobj_buf_t            *in_token,
+                rawobj_t                *out_token);
+__u32 kgss_unwrap(
+                struct gss_ctx          *ctx_id,
+                __u32                    qop,
+                rawobj_t                *in_token,
+                rawobj_t                *out_token);
+__u32 kgss_delete_sec_context(
+                struct gss_ctx         **ctx_id);
+
+struct subflavor_desc {
+        __u32           subflavor;
+        __u32           qop;
+        __u32           service;
+        char           *name;
+};
+
+/* Each mechanism is described by the following struct: */
+struct gss_api_mech {
+        struct list_head        gm_list;
+        struct module          *gm_owner;
+        char                   *gm_name;
+        rawobj_t                gm_oid;
+        atomic_t                gm_count;
+        struct gss_api_ops     *gm_ops;
+        int                     gm_sf_num;
+        struct subflavor_desc  *gm_sfs;
+};
+
+/* and must provide the following operations: */
+struct gss_api_ops {
+        __u32 (*gss_import_sec_context)(
+                        rawobj_t               *input_token,
+                        struct gss_ctx         *ctx_id);
+        __u32 (*gss_inquire_context)(
+                        struct gss_ctx         *ctx_id,
+                        __u64                  *endtime);
+        __u32 (*gss_get_mic)(
+                        struct gss_ctx         *ctx_id,
+                        __u32                   qop, 
+                        rawobj_t               *message,
+                        rawobj_t               *mic_token);
+        __u32 (*gss_verify_mic)(
+                        struct gss_ctx         *ctx_id,
+                        rawobj_t               *message,
+                        rawobj_t               *mic_token,
+                        __u32                  *qstate);
+        __u32 (*gss_wrap)(
+                        struct gss_ctx         *ctx,
+                        __u32                   qop,
+                        rawobj_buf_t           *in_token,
+                        rawobj_t               *out_token);
+        __u32 (*gss_unwrap)(
+                        struct gss_ctx         *ctx,
+                        __u32                   qop,
+                        rawobj_t               *in_token,
+                        rawobj_t               *out_token);
+        void (*gss_delete_sec_context)(
+                        void                   *internal_ctx_id);
+};
+
+int kgss_mech_register(struct gss_api_mech *mech);
+void kgss_mech_unregister(struct gss_api_mech *mech);
+
+struct gss_api_mech * kgss_OID_to_mech(rawobj_t *);
+struct gss_api_mech * kgss_name_to_mech(char *name);
+struct gss_api_mech * kgss_subflavor_to_mech(__u32 subflavor);
+
+struct gss_api_mech * kgss_mech_get(struct gss_api_mech *);
+void kgss_mech_put(struct gss_api_mech *);
+
+#endif /* __SEC_GSS_GSS_API_H_ */
diff --git a/lustre/sec/gss/gss_asn1.h b/lustre/sec/gss/gss_asn1.h
new file mode 100644 (file)
index 0000000..cd44f6d
--- /dev/null
@@ -0,0 +1,87 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Modifications for Lustre
+ * Copyright 2004, Cluster File Systems, Inc.
+ * All rights reserved
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  minimal asn1 for generic encoding/decoding of gss tokens
+ *
+ *  Adapted from MIT Kerberos 5-1.2.1 lib/include/krb5.h,
+ *  lib/gssapi/krb5/gssapiP_krb5.h, and others
+ *
+ *  Copyright (c) 2000 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson   <andros@umich.edu>
+ */
+
+/*
+ * Copyright 1995 by the Massachusetts Institute of Technology.
+ * All Rights Reserved.
+ *
+ * Export of this software from the United States of America may
+ *   require a specific license from the United States Government.
+ *   It is the responsibility of any person or organization contemplating
+ *   export to obtain such a license before exporting.
+ *
+ * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
+ * distribute this software and its documentation for any purpose and
+ * without fee is hereby granted, provided that the above copyright
+ * notice appear in all copies and that both that copyright notice and
+ * this permission notice appear in supporting documentation, and that
+ * the name of M.I.T. not be used in advertising or publicity pertaining
+ * to distribution of the software without specific, written prior
+ * permission.  Furthermore if you modify this software you must label
+ * your software as modified software and not distribute it in such a
+ * fashion that it might be confused with the original M.I.T. software.
+ * M.I.T. makes no representations about the suitability of
+ * this software for any purpose.  It is provided "as is" without express
+ * or implied warranty.
+ *
+ */
+
+#define SIZEOF_INT 4
+
+/* from gssapi_err_generic.h */
+#define G_BAD_SERVICE_NAME                       (-2045022976L)
+#define G_BAD_STRING_UID                         (-2045022975L)
+#define G_NOUSER                                 (-2045022974L)
+#define G_VALIDATE_FAILED                        (-2045022973L)
+#define G_BUFFER_ALLOC                           (-2045022972L)
+#define G_BAD_MSG_CTX                            (-2045022971L)
+#define G_WRONG_SIZE                             (-2045022970L)
+#define G_BAD_USAGE                              (-2045022969L)
+#define G_UNKNOWN_QOP                            (-2045022968L)
+#define G_NO_HOSTNAME                            (-2045022967L)
+#define G_BAD_HOSTNAME                           (-2045022966L)
+#define G_WRONG_MECH                             (-2045022965L)
+#define G_BAD_TOK_HEADER                         (-2045022964L)
+#define G_BAD_DIRECTION                          (-2045022963L)
+#define G_TOK_TRUNC                              (-2045022962L)
+#define G_REFLECT                                (-2045022961L)
+#define G_WRONG_TOKID                            (-2045022960L)
+
+#define g_OID_equal(o1,o2) \
+   (((o1)->len == (o2)->len) && \
+    (memcmp((o1)->data,(o2)->data,(int) (o1)->len) == 0))
+
+__u32 g_verify_token_header(
+     rawobj_t *mech,
+     int *body_size,
+     unsigned char **buf_in,
+     int toksize);
+
+__u32 g_get_mech_oid(rawobj_t *mech, rawobj_t * in_buf);
+
+int g_token_size(
+     rawobj_t *mech,
+     unsigned int body_size);
+
+void g_make_token_header(
+     rawobj_t *mech,
+     int body_size,
+     unsigned char **buf);
diff --git a/lustre/sec/gss/gss_err.h b/lustre/sec/gss/gss_err.h
new file mode 100644 (file)
index 0000000..c893983
--- /dev/null
@@ -0,0 +1,181 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Modifications for Lustre
+ * Copyright 2004, Cluster File Systems, Inc.
+ * All rights reserved
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  Adapted from MIT Kerberos 5-1.2.1 include/gssapi/gssapi.h
+ *
+ *  Copyright (c) 2002 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson   <andros@umich.edu>
+ */
+
+/*
+ * Copyright 1993 by OpenVision Technologies, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without fee,
+ * provided that the above copyright notice appears in all copies and
+ * that both that copyright notice and this permission notice appear in
+ * supporting documentation, and that the name of OpenVision not be used
+ * in advertising or publicity pertaining to distribution of the software
+ * without specific, written prior permission. OpenVision makes no
+ * representations about the suitability of this software for any
+ * purpose.  It is provided "as is" without express or implied warranty.
+ *
+ * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef __SEC_GSS_GSS_ERR_H_
+#define __SEC_GSS_GSS_ERR_H_
+
+typedef unsigned int OM_uint32;
+
+/*
+ * Flag bits for context-level services.
+ */
+#define GSS_C_DELEG_FLAG 1
+#define GSS_C_MUTUAL_FLAG 2
+#define GSS_C_REPLAY_FLAG 4
+#define GSS_C_SEQUENCE_FLAG 8
+#define GSS_C_CONF_FLAG 16
+#define GSS_C_INTEG_FLAG 32
+#define        GSS_C_ANON_FLAG 64
+#define GSS_C_PROT_READY_FLAG 128
+#define GSS_C_TRANS_FLAG 256
+
+/*
+ * Credential usage options
+ */
+#define GSS_C_BOTH 0
+#define GSS_C_INITIATE 1
+#define GSS_C_ACCEPT 2
+
+/*
+ * Status code types for gss_display_status
+ */
+#define GSS_C_GSS_CODE 1
+#define GSS_C_MECH_CODE 2
+
+
+/*
+ * Define the default Quality of Protection for per-message services.  Note
+ * that an implementation that offers multiple levels of QOP may either reserve
+ * a value (for example zero, as assumed here) to mean "default protection", or
+ * alternatively may simply equate GSS_C_QOP_DEFAULT to a specific explicit
+ * QOP value.  However a value of 0 should always be interpreted by a GSSAPI
+ * implementation as a request for the default protection level.
+ */
+#define GSS_C_QOP_DEFAULT 0
+
+/*
+ * Expiration time of 2^32-1 seconds means infinite lifetime for a
+ * credential or security context
+ */
+#define GSS_C_INDEFINITE ((OM_uint32) 0xfffffffful)
+
+
+/* Major status codes */
+
+#define GSS_S_COMPLETE 0
+
+/*
+ * Some "helper" definitions to make the status code macros obvious.
+ */
+#define GSS_C_CALLING_ERROR_OFFSET 24
+#define GSS_C_ROUTINE_ERROR_OFFSET 16
+#define GSS_C_SUPPLEMENTARY_OFFSET 0
+#define GSS_C_CALLING_ERROR_MASK ((OM_uint32) 0377ul)
+#define GSS_C_ROUTINE_ERROR_MASK ((OM_uint32) 0377ul)
+#define GSS_C_SUPPLEMENTARY_MASK ((OM_uint32) 0177777ul)
+
+/*
+ * The macros that test status codes for error conditions.  Note that the
+ * GSS_ERROR() macro has changed slightly from the V1 GSSAPI so that it now
+ * evaluates its argument only once.
+ */
+#define GSS_CALLING_ERROR(x) \
+  ((x) & (GSS_C_CALLING_ERROR_MASK << GSS_C_CALLING_ERROR_OFFSET))
+#define GSS_ROUTINE_ERROR(x) \
+  ((x) & (GSS_C_ROUTINE_ERROR_MASK << GSS_C_ROUTINE_ERROR_OFFSET))
+#define GSS_SUPPLEMENTARY_INFO(x) \
+  ((x) & (GSS_C_SUPPLEMENTARY_MASK << GSS_C_SUPPLEMENTARY_OFFSET))
+#define GSS_ERROR(x) \
+  ((x) & ((GSS_C_CALLING_ERROR_MASK << GSS_C_CALLING_ERROR_OFFSET) | \
+         (GSS_C_ROUTINE_ERROR_MASK << GSS_C_ROUTINE_ERROR_OFFSET)))
+
+/*
+ * Now the actual status code definitions
+ */
+
+/*
+ * Calling errors:
+ */
+#define GSS_S_CALL_INACCESSIBLE_READ \
+                             (((OM_uint32) 1ul) << GSS_C_CALLING_ERROR_OFFSET)
+#define GSS_S_CALL_INACCESSIBLE_WRITE \
+                             (((OM_uint32) 2ul) << GSS_C_CALLING_ERROR_OFFSET)
+#define GSS_S_CALL_BAD_STRUCTURE \
+                             (((OM_uint32) 3ul) << GSS_C_CALLING_ERROR_OFFSET)
+
+/*
+ * Routine errors:
+ */
+#define GSS_S_BAD_MECH (((OM_uint32) 1ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_NAME (((OM_uint32) 2ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_NAMETYPE (((OM_uint32) 3ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_BINDINGS (((OM_uint32) 4ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_STATUS (((OM_uint32) 5ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_SIG (((OM_uint32) 6ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_NO_CRED (((OM_uint32) 7ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_NO_CONTEXT (((OM_uint32) 8ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_DEFECTIVE_TOKEN (((OM_uint32) 9ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_DEFECTIVE_CREDENTIAL \
+     (((OM_uint32) 10ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_CREDENTIALS_EXPIRED \
+     (((OM_uint32) 11ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_CONTEXT_EXPIRED \
+     (((OM_uint32) 12ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_FAILURE (((OM_uint32) 13ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_BAD_QOP (((OM_uint32) 14ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_UNAUTHORIZED (((OM_uint32) 15ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_UNAVAILABLE (((OM_uint32) 16ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_DUPLICATE_ELEMENT \
+     (((OM_uint32) 17ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+#define GSS_S_NAME_NOT_MN \
+     (((OM_uint32) 18ul) << GSS_C_ROUTINE_ERROR_OFFSET)
+
+/*
+ * Supplementary info bits:
+ */
+#define GSS_S_CONTINUE_NEEDED (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 0))
+#define GSS_S_DUPLICATE_TOKEN (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 1))
+#define GSS_S_OLD_TOKEN (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 2))
+#define GSS_S_UNSEQ_TOKEN (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 3))
+#define GSS_S_GAP_TOKEN (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 4))
+
+/* XXXX these are not part of the GSSAPI C bindings!  (but should be) */
+
+#define GSS_CALLING_ERROR_FIELD(x) \
+   (((x) >> GSS_C_CALLING_ERROR_OFFSET) & GSS_C_CALLING_ERROR_MASK)
+#define GSS_ROUTINE_ERROR_FIELD(x) \
+   (((x) >> GSS_C_ROUTINE_ERROR_OFFSET) & GSS_C_ROUTINE_ERROR_MASK)
+#define GSS_SUPPLEMENTARY_INFO_FIELD(x) \
+   (((x) >> GSS_C_SUPPLEMENTARY_OFFSET) & GSS_C_SUPPLEMENTARY_MASK)
+
+/* XXXX This is a necessary evil until the spec is fixed */
+#define GSS_S_CRED_UNAVAIL GSS_S_FAILURE
+
+#endif /* __SEC_GSS_GSS_ERR_H_ */
diff --git a/lustre/sec/gss/gss_generic_token.c b/lustre/sec/gss/gss_generic_token.c
new file mode 100644 (file)
index 0000000..c48653a
--- /dev/null
@@ -0,0 +1,295 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Modifications for Lustre
+ * Copyright 2004, Cluster File Systems, Inc.
+ * All rights reserved
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  linux/net/sunrpc/gss_generic_token.c
+ *
+ *  Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/generic/util_token.c
+ *
+ *  Copyright (c) 2000 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson   <andros@umich.edu>
+ */
+
+/*
+ * Copyright 1993 by OpenVision Technologies, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without fee,
+ * provided that the above copyright notice appears in all copies and
+ * that both that copyright notice and this permission notice appear in
+ * supporting documentation, and that the name of OpenVision not be used
+ * in advertising or publicity pertaining to distribution of the software
+ * without specific, written prior permission. OpenVision makes no
+ * representations about the suitability of this software for any
+ * purpose.  It is provided "as is" without express or implied warranty.
+ *
+ * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_SEC
+#ifdef __KERNEL__
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#else
+#include <liblustre.h>
+#endif
+
+#include <libcfs/kp30.h>
+#include <linux/obd.h>
+#include <linux/obd_class.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_idl.h>
+#include <linux/lustre_net.h>
+#include <linux/lustre_import.h>
+#include <linux/lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+#include "gss_krb5.h"
+#include "gss_asn1.h"
+
+
+/* TWRITE_STR from gssapiP_generic.h */
+#define TWRITE_STR(ptr, str, len) \
+        memcpy((ptr), (char *) (str), (len)); \
+        (ptr) += (len);
+
+/* XXXX this code currently makes the assumption that a mech oid will
+   never be longer than 127 bytes.  This assumption is not inherent in
+   the interfaces, so the code can be fixed if the OSI namespace
+   balloons unexpectedly. */
+
+/* Each token looks like this:
+
+0x60                                tag for APPLICATION 0, SEQUENCE
+                                        (constructed, definite-length)
+        <length>                possible multiple bytes, need to parse/generate
+        0x06                        tag for OBJECT IDENTIFIER
+                <moid_length>        compile-time constant string (assume 1 byte)
+                <moid_bytes>        compile-time constant string
+        <inner_bytes>                the ANY containing the application token
+                                        bytes 0,1 are the token type
+                                        bytes 2,n are the token data
+
+For the purposes of this abstraction, the token "header" consists of
+the sequence tag and length octets, the mech OID DER encoding, and the
+first two inner bytes, which indicate the token type.  The token
+"body" consists of everything else.
+
+*/
+
+static int
+der_length_size( int length)
+{
+        if (length < (1<<7))
+                return(1);
+        else if (length < (1<<8))
+                return(2);
+#if (SIZEOF_INT == 2)
+        else
+                return(3);
+#else
+        else if (length < (1<<16))
+                return(3);
+        else if (length < (1<<24))
+                return(4);
+        else
+                return(5);
+#endif
+}
+
+static void
+der_write_length(unsigned char **buf, int length)
+{
+        if (length < (1<<7)) {
+                *(*buf)++ = (unsigned char) length;
+        } else {
+                *(*buf)++ = (unsigned char) (der_length_size(length)+127);
+#if (SIZEOF_INT > 2)
+                if (length >= (1<<24))
+                        *(*buf)++ = (unsigned char) (length>>24);
+                if (length >= (1<<16))
+                        *(*buf)++ = (unsigned char) ((length>>16)&0xff);
+#endif
+                if (length >= (1<<8))
+                        *(*buf)++ = (unsigned char) ((length>>8)&0xff);
+                *(*buf)++ = (unsigned char) (length&0xff);
+        }
+}
+
+/* returns decoded length, or < 0 on failure.  Advances buf and
+   decrements bufsize */
+
+static int
+der_read_length(unsigned char **buf, int *bufsize)
+{
+        unsigned char sf;
+        int ret;
+
+        if (*bufsize < 1)
+                return(-1);
+        sf = *(*buf)++;
+        (*bufsize)--;
+        if (sf & 0x80) {
+                if ((sf &= 0x7f) > ((*bufsize)-1))
+                        return(-1);
+                if (sf > SIZEOF_INT)
+                        return (-1);
+                ret = 0;
+                for (; sf; sf--) {
+                        ret = (ret<<8) + (*(*buf)++);
+                        (*bufsize)--;
+                }
+        } else {
+                ret = sf;
+        }
+
+        return(ret);
+}
+
+/* returns the length of a token, given the mech oid and the body size */
+
+int
+g_token_size(rawobj_t *mech, unsigned int body_size)
+{
+        /* set body_size to sequence contents size */
+        body_size += 4 + (int) mech->len;         /* NEED overflow check */
+        return(1 + der_length_size(body_size) + body_size);
+}
+
+//EXPORT_SYMBOL(g_token_size);
+
+/* fills in a buffer with the token header.  The buffer is assumed to
+   be the right size.  buf is advanced past the token header */
+
+void
+g_make_token_header(rawobj_t *mech, int body_size, unsigned char **buf)
+{
+        *(*buf)++ = 0x60;
+        der_write_length(buf, 4 + mech->len + body_size);
+        *(*buf)++ = 0x06;
+        *(*buf)++ = (unsigned char) mech->len;
+        TWRITE_STR(*buf, mech->data, ((int) mech->len));
+}
+
+//EXPORT_SYMBOL(g_make_token_header);
+
+/*
+ * Given a buffer containing a token, reads and verifies the token,
+ * leaving buf advanced past the token header, and setting body_size
+ * to the number of remaining bytes.  Returns 0 on success,
+ * G_BAD_TOK_HEADER for a variety of errors, and G_WRONG_MECH if the
+ * mechanism in the token does not match the mech argument.  buf and
+ * *body_size are left unmodified on error.
+ */
+__u32
+g_verify_token_header(rawobj_t *mech, int *body_size,
+                      unsigned char **buf_in, int toksize)
+{
+        unsigned char *buf = *buf_in;
+        int seqsize;
+        rawobj_t toid;
+        int ret = 0;
+
+        if ((toksize-=1) < 0)
+                return(G_BAD_TOK_HEADER);
+        if (*buf++ != 0x60)
+                return(G_BAD_TOK_HEADER);
+
+        if ((seqsize = der_read_length(&buf, &toksize)) < 0)
+                return(G_BAD_TOK_HEADER);
+
+        if (seqsize != toksize)
+                return(G_BAD_TOK_HEADER);
+
+        if ((toksize-=1) < 0)
+                return(G_BAD_TOK_HEADER);
+        if (*buf++ != 0x06)
+                return(G_BAD_TOK_HEADER);
+        if ((toksize-=1) < 0)
+                return(G_BAD_TOK_HEADER);
+        toid.len = *buf++;
+
+        if ((toksize-=toid.len) < 0)
+                return(G_BAD_TOK_HEADER);
+        toid.data = buf;
+        buf+=toid.len;
+
+        if (! g_OID_equal(&toid, mech)) 
+                ret = G_WRONG_MECH;
+   /* G_WRONG_MECH is not returned immediately because it's more important
+      to return G_BAD_TOK_HEADER if the token header is in fact bad */
+
+        if ((toksize-=2) < 0)
+                return(G_BAD_TOK_HEADER);
+
+        if (ret)
+                return(ret);
+
+        if (!ret) {
+                *buf_in = buf;
+                *body_size = toksize;
+        }
+
+        return(ret);
+}
+
+//EXPORT_SYMBOL(g_verify_token_header);
+
+/* Given a buffer containing a token, returns a copy of the mech oid in
+ * the parameter mech. */
+__u32
+g_get_mech_oid(rawobj_t *mech, rawobj_t * in_buf)
+{
+        unsigned char *buf = in_buf->data;
+        int len = in_buf->len;
+        int ret=0;
+        int seqsize;
+
+        if ((len-=1) < 0)
+                return(G_BAD_TOK_HEADER);
+        if (*buf++ != 0x60)
+                return(G_BAD_TOK_HEADER);
+
+        if ((seqsize = der_read_length(&buf, &len)) < 0)
+                return(G_BAD_TOK_HEADER);
+
+        if ((len-=1) < 0)
+                return(G_BAD_TOK_HEADER);
+        if (*buf++ != 0x06)
+                return(G_BAD_TOK_HEADER);
+
+        if ((len-=1) < 0)
+                return(G_BAD_TOK_HEADER);
+        mech->len = *buf++;
+
+        if ((len-=mech->len) < 0)
+                return(G_BAD_TOK_HEADER);
+        OBD_ALLOC(mech->data, mech->len);
+        if (!mech->data) 
+                return(G_BUFFER_ALLOC);
+        memcpy(mech->data, buf, mech->len);
+
+        return ret;
+}
diff --git a/lustre/sec/gss/gss_internal.h b/lustre/sec/gss/gss_internal.h
new file mode 100644 (file)
index 0000000..9b1b76a
--- /dev/null
@@ -0,0 +1,106 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Modified from NFSv4 project for Lustre
+ * Copyright 2004, Cluster File Systems, Inc.
+ * All rights reserved
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+#ifndef __SEC_GSS_GSS_INTERNAL_H_
+#define __SEC_GSS_GSS_INTERNAL_H_
+
+struct ptlrpc_sec;
+struct ptlrpc_cred;
+
+typedef struct rawobj_s {
+        __u32           len;
+        __u8           *data;
+} rawobj_t;
+
+int rawobj_alloc(rawobj_t *obj, char *buf, int len);
+void rawobj_free(rawobj_t *obj);
+int rawobj_equal(rawobj_t *a, rawobj_t *b);
+int rawobj_dup(rawobj_t *dest, rawobj_t *src);
+int rawobj_serialize(rawobj_t *obj, __u32 **buf, __u32 *buflen);
+int rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen);
+int rawobj_extract_local(rawobj_t *obj, __u32 **buf, __u32 *buflen);
+
+typedef struct rawobj_buf_s {
+        __u32           dataoff;
+        __u32           datalen;
+        __u32           buflen;
+        __u8           *buf;
+} rawobj_buf_t;
+
+#define MAXSEQ 0x80000000 /* maximum legal sequence number, from rfc 2203 */
+
+enum rpc_gss_proc {
+        RPC_GSS_PROC_DATA =             0,
+        RPC_GSS_PROC_INIT =             1,
+        RPC_GSS_PROC_CONTINUE_INIT =    2,
+        RPC_GSS_PROC_DESTROY =          3,
+};
+
+enum rpc_gss_svc {
+        RPC_GSS_SVC_NONE =              1,
+        RPC_GSS_SVC_INTEGRITY =         2,
+        RPC_GSS_SVC_PRIVACY =           3,
+};
+
+/* on-the-wire gss cred: */
+struct rpc_gss_wire_cred {
+        __u32                   gc_v;           /* version */
+        __u32                   gc_proc;        /* control procedure */
+        __u32                   gc_seq;         /* sequence number */
+        __u32                   gc_svc;         /* service */
+        rawobj_t                gc_ctx;         /* context handle */
+};
+
+/* on-the-wire gss verifier: */
+struct rpc_gss_wire_verf {
+        __u32                   gv_flavor;
+        rawobj_t                gv_verf;
+};
+
+struct gss_cl_ctx {
+        atomic_t                gc_refcount;
+        __u32                   gc_proc;
+        __u32                   gc_seq;
+        spinlock_t              gc_seq_lock;
+        struct gss_ctx         *gc_gss_ctx;
+        rawobj_t                gc_wire_ctx;
+        __u32                   gc_win;
+};
+
+struct gss_cred {
+        struct ptlrpc_cred      gc_base;
+        ptlrpcs_flavor_t        gc_flavor;
+        struct gss_cl_ctx      *gc_ctx;
+};
+
+/*
+ * This is only guaranteed to be enough for the current krb5 des-cbc-crc.
+ * We might adjust this when a new enc type or mech is added.
+ */
+#define GSS_PRIVBUF_PREFIX_LEN         (32)
+#define GSS_PRIVBUF_SUFFIX_LEN         (32)
+
+/* This is too coarse. We'll let mech determine it */
+#define GSS_MAX_AUTH_PAYLOAD    (128)
+
+/* gss_mech_switch.c */
+int init_kerberos_module(void);
+void cleanup_kerberos_module(void);
+
+/* gss_generic_token.c */
+int g_token_size(rawobj_t *mech, unsigned int body_size);
+void g_make_token_header(rawobj_t *mech, int body_size, unsigned char **buf);
+__u32 g_verify_token_header(rawobj_t *mech, int *body_size,
+                            unsigned char **buf_in, int toksize);
+
+/* svcsec_gss.c */
+int gss_svc_init(void);
+void gss_svc_exit(void);
+
+#endif /* __SEC_GSS_GSS_INTERNAL_H_ */
diff --git a/lustre/sec/gss/gss_krb5.h b/lustre/sec/gss/gss_krb5.h
new file mode 100644 (file)
index 0000000..f00e2c4
--- /dev/null
@@ -0,0 +1,183 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Modifications for Lustre
+ * Copyright 2004, Cluster File Systems, Inc.
+ * All rights reserved
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  linux/include/linux/sunrpc/gss_krb5_types.h
+ *
+ *  Adapted from MIT Kerberos 5-1.2.1 lib/include/krb5.h,
+ *  lib/gssapi/krb5/gssapiP_krb5.h, and others
+ *
+ *  Copyright (c) 2000 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson   <andros@umich.edu>
+ *  Bruce Fields   <bfields@umich.edu>
+ */
+
+/*
+ * Copyright 1995 by the Massachusetts Institute of Technology.
+ * All Rights Reserved.
+ *
+ * Export of this software from the United States of America may
+ *   require a specific license from the United States Government.
+ *   It is the responsibility of any person or organization contemplating
+ *   export to obtain such a license before exporting.
+ *
+ * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
+ * distribute this software and its documentation for any purpose and
+ * without fee is hereby granted, provided that the above copyright
+ * notice appear in all copies and that both that copyright notice and
+ * this permission notice appear in supporting documentation, and that
+ * the name of M.I.T. not be used in advertising or publicity pertaining
+ * to distribution of the software without specific, written prior
+ * permission.  Furthermore if you modify this software you must label
+ * your software as modified software and not distribute it in such a
+ * fashion that it might be confused with the original M.I.T. software.
+ * M.I.T. makes no representations about the suitability of
+ * this software for any purpose.  It is provided "as is" without express
+ * or implied warranty.
+ *
+ */
+
+extern spinlock_t krb5_seq_lock;
+
+struct krb5_ctx {
+        int                     initiate; /* 1 = initiating, 0 = accepting */
+        int                     seed_init;
+        unsigned char           seed[16];
+        int                     signalg;
+        int                     sealalg;
+        struct crypto_tfm      *enc;
+        struct crypto_tfm      *seq;
+        __s32                   endtime;
+        __u32                   seq_send;
+        rawobj_t                mech_used;
+};
+
+#define KG_TOK_MIC_MSG    0x0101
+#define KG_TOK_WRAP_MSG   0x0201
+
+enum sgn_alg {
+        SGN_ALG_DES_MAC_MD5 = 0x0000,
+        SGN_ALG_MD2_5 = 0x0001,
+        SGN_ALG_DES_MAC = 0x0002,
+        SGN_ALG_3 = 0x0003,                /* not published */
+        SGN_ALG_HMAC_MD5 = 0x0011,        /* microsoft w2k; no support */
+        SGN_ALG_HMAC_SHA1_DES3_KD = 0x0004
+};
+enum seal_alg {
+        SEAL_ALG_NONE = 0xffff,
+        SEAL_ALG_DES = 0x0000,
+        SEAL_ALG_1 = 0x0001,                /* not published */
+        SEAL_ALG_MICROSOFT_RC4 = 0x0010,/* microsoft w2k; no support */
+        SEAL_ALG_DES3KD = 0x0002
+};
+
+#define KRB5_CKSUM_LENGTH 8
+
+#define CKSUMTYPE_CRC32                        0x0001
+#define CKSUMTYPE_RSA_MD4                0x0002
+#define CKSUMTYPE_RSA_MD4_DES                0x0003
+#define CKSUMTYPE_DESCBC                0x0004
+#define CKSUMTYPE_RSA_MD5                0x0007
+#define CKSUMTYPE_RSA_MD5_DES                0x0008
+#define CKSUMTYPE_NIST_SHA                0x0009
+#define CKSUMTYPE_HMAC_SHA1_DES3        0x000c
+
+/* from gssapi_err_krb5.h */
+#define KG_CCACHE_NOMATCH                        (39756032L)
+#define KG_KEYTAB_NOMATCH                        (39756033L)
+#define KG_TGT_MISSING                           (39756034L)
+#define KG_NO_SUBKEY                             (39756035L)
+#define KG_CONTEXT_ESTABLISHED                   (39756036L)
+#define KG_BAD_SIGN_TYPE                         (39756037L)
+#define KG_BAD_LENGTH                            (39756038L)
+#define KG_CTX_INCOMPLETE                        (39756039L)
+#define KG_CONTEXT                               (39756040L)
+#define KG_CRED                                  (39756041L)
+#define KG_ENC_DESC                              (39756042L)
+#define KG_BAD_SEQ                               (39756043L)
+#define KG_EMPTY_CCACHE                          (39756044L)
+#define KG_NO_CTYPES                             (39756045L)
+
+/* per Kerberos v5 protocol spec crypto types from the wire. 
+ * these get mapped to linux kernel crypto routines.  
+ */
+#define ENCTYPE_NULL            0x0000
+#define ENCTYPE_DES_CBC_CRC     0x0001        /* DES cbc mode with CRC-32 */
+#define ENCTYPE_DES_CBC_MD4     0x0002        /* DES cbc mode with RSA-MD4 */
+#define ENCTYPE_DES_CBC_MD5     0x0003        /* DES cbc mode with RSA-MD5 */
+#define ENCTYPE_DES_CBC_RAW     0x0004        /* DES cbc mode raw */
+/* XXX deprecated? */
+#define ENCTYPE_DES3_CBC_SHA    0x0005        /* DES-3 cbc mode with NIST-SHA */
+#define ENCTYPE_DES3_CBC_RAW    0x0006        /* DES-3 cbc mode raw */
+#define ENCTYPE_DES_HMAC_SHA1   0x0008
+#define ENCTYPE_DES3_CBC_SHA1   0x0010
+#define ENCTYPE_UNKNOWN         0x01ff
+
+__s32
+make_checksum(__s32 cksumtype,
+              char *header, int hdrlen,
+              rawobj_t *body,
+              rawobj_t *cksum);
+
+__u32
+krb5_make_token(struct krb5_ctx *ctx,
+                int qop_req,
+                rawobj_t *text,
+                rawobj_t *token);
+
+__u32
+krb5_read_token(struct krb5_ctx *ctx,
+                rawobj_t *read_token,
+                rawobj_t *message_buffer,
+                int *qop_state);
+
+__u32
+krb5_encrypt(struct crypto_tfm *tfm,
+             void * iv,
+             void * in,
+             void * out,
+             int length);
+
+__u32
+krb5_decrypt(struct crypto_tfm *tfm,
+             void * iv,
+             void * in,
+             void * out,
+             int length);
+
+__s32
+krb5_make_seq_num(struct crypto_tfm *key,
+                  int direction,
+                  __s32 seqnum,
+                  unsigned char *cksum,
+                  unsigned char *buf);
+
+__s32
+krb5_get_seq_num(struct crypto_tfm *key,
+                 unsigned char *cksum,
+                 unsigned char *buf,
+                 int *direction,
+                 __s32 *seqnum);
+int
+gss_encrypt_rawobj(struct crypto_tfm *tfm,
+                   rawobj_t *inobj,
+                   rawobj_t *outobj,
+                   int enc);
+__u32
+gss_wrap_kerberos(struct gss_ctx    *ctx,
+                  __u32              qop,
+                  rawobj_buf_t      *in_token,
+                  rawobj_t          *out_token);
+__u32
+gss_unwrap_kerberos(struct gss_ctx  *ctx,
+                    __u32            qop,
+                    rawobj_t        *in_token,
+                    rawobj_t        *out_token);
diff --git a/lustre/sec/gss/gss_krb5_crypto.c b/lustre/sec/gss/gss_krb5_crypto.c
new file mode 100644 (file)
index 0000000..a0358fe
--- /dev/null
@@ -0,0 +1,256 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Modifications for Lustre
+ * Copyright 2004, Cluster File Systems, Inc.
+ * All rights reserved
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  linux/net/sunrpc/gss_krb5_crypto.c
+ *
+ *  Copyright (c) 2000 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson   <andros@umich.edu>
+ *  Bruce Fields   <bfields@umich.edu>
+ */
+
+/*
+ * Copyright (C) 1998 by the FundsXpress, INC.
+ *
+ * All rights reserved.
+ *
+ * Export of this software from the United States of America may require
+ * a specific license from the United States Government.  It is the
+ * responsibility of any person or organization contemplating export to
+ * obtain such a license before exporting.
+ *
+ * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
+ * distribute this software and its documentation for any purpose and
+ * without fee is hereby granted, provided that the above copyright
+ * notice appear in all copies and that both that copyright notice and
+ * this permission notice appear in supporting documentation, and that
+ * the name of FundsXpress. not be used in advertising or publicity pertaining
+ * to distribution of the software without specific, written prior
+ * permission.  FundsXpress makes no representations about the suitability of
+ * this software for any purpose.  It is provided "as is" without express
+ * or implied warranty.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
+ * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_SEC
+#ifdef __KERNEL__
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/crypto.h>
+#else
+#include <liblustre.h>
+#include "../kcrypto/libcrypto.h"
+#endif
+
+#include <libcfs/kp30.h>
+#include <linux/obd.h>
+#include <linux/obd_class.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_idl.h>
+#include <linux/lustre_net.h>
+#include <linux/lustre_import.h>
+#include <linux/lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+#include "gss_krb5.h"
+
+__u32
+krb5_encrypt(struct crypto_tfm *tfm,
+             void * iv,
+             void * in,
+             void * out,
+             int length)
+{
+        __u32 ret = -EINVAL;
+        struct scatterlist sg[1];
+        __u8 local_iv[16] = {0};
+
+        if (length % crypto_tfm_alg_blocksize(tfm) != 0)
+                goto out;
+
+        if (crypto_tfm_alg_ivsize(tfm) > 16) {
+                CERROR("tfm iv size to large %d\n", crypto_tfm_alg_ivsize(tfm));
+                goto out;
+        }
+
+        if (iv)
+                memcpy(local_iv, iv, crypto_tfm_alg_ivsize(tfm));
+
+        memcpy(out, in, length);
+        sg[0].page = virt_to_page(out);
+        sg[0].offset = offset_in_page(out);
+        sg[0].length = length;
+
+        ret = crypto_cipher_encrypt_iv(tfm, sg, sg, length, local_iv);
+
+out:
+        return(ret);
+}
+
+//EXPORT_SYMBOL(krb5_encrypt);
+
+__u32
+krb5_decrypt(struct crypto_tfm *tfm,
+             void * iv,
+             void * in,
+             void * out,
+             int length)
+{
+        __u32 ret = -EINVAL;
+        struct scatterlist sg[1];
+        __u8 local_iv[16] = {0};
+
+        if (length % crypto_tfm_alg_blocksize(tfm) != 0)
+                goto out;
+
+        if (crypto_tfm_alg_ivsize(tfm) > 16) {
+                CERROR("tfm iv size to large %d\n", crypto_tfm_alg_ivsize(tfm));
+                goto out;
+        }
+        if (iv)
+                memcpy(local_iv,iv, crypto_tfm_alg_ivsize(tfm));
+
+        memcpy(out, in, length);
+        sg[0].page = virt_to_page(out);
+        sg[0].offset = offset_in_page(out);
+        sg[0].length = length;
+
+        ret = crypto_cipher_decrypt_iv(tfm, sg, sg, length, local_iv);
+
+out:
+        return(ret);
+}
+
+//EXPORT_SYMBOL(krb5_decrypt);
+
+void
+buf_to_sg(struct scatterlist *sg, char *ptr, int len)
+{
+        sg->page = virt_to_page(ptr);
+        sg->offset = offset_in_page(ptr);
+        sg->length = len;
+}
+
+/* checksum the plaintext data and hdrlen bytes of the token header */
+__s32
+make_checksum(__s32 cksumtype,
+              char *header, int hdrlen,
+              rawobj_t *body,
+              rawobj_t *cksum)
+{
+        char                           *cksumname;
+        struct crypto_tfm              *tfm = NULL; /* XXX add to ctx? */
+        struct scatterlist              sg[1];
+        __u32                           code = GSS_S_FAILURE;
+
+        switch (cksumtype) {
+                case CKSUMTYPE_RSA_MD5:
+                        cksumname = "md5";
+                        break;
+                default:
+                        CERROR("unsupported checksum %d", cksumtype);
+                        goto out;
+        }
+        if (!(tfm = crypto_alloc_tfm(cksumname, 0)))
+                goto out;
+        cksum->len = crypto_tfm_alg_digestsize(tfm);
+        OBD_ALLOC(cksum->data, cksum->len);
+        if (!cksum->data)
+                goto out;
+
+        crypto_digest_init(tfm);
+        buf_to_sg(sg, header, hdrlen);
+        crypto_digest_update(tfm, sg, 1);
+        if (body->len) {
+                buf_to_sg(sg, body->data, body->len);
+                crypto_digest_update(tfm, sg, 1);
+        }
+
+        crypto_digest_final(tfm, cksum->data);
+        code = 0;
+out:
+        if (tfm)
+                crypto_free_tfm(tfm);
+        return code;
+}
+
+//EXPORT_SYMBOL(make_checksum);
+
+static
+void obj_to_scatter_list(rawobj_t *obj, struct scatterlist *list,
+                         int listlen)
+{
+        __u8   *ptr = obj->data;
+        __u32   size = obj->len;
+        int index = 0;
+
+        while (size) {
+                LASSERT(index++ < listlen);
+                list->page = virt_to_page(ptr);
+                list->offset = (int) ptr & (~PAGE_MASK);
+                list->length = (list->offset + size) > PAGE_SIZE ?
+                                (PAGE_SIZE - list->offset) : size;
+                ptr += list->length;
+                size -= list->length;
+                list++;
+        }
+}
+
+int gss_encrypt_rawobj(struct crypto_tfm *tfm,
+                       rawobj_t *inobj, rawobj_t *outobj,
+                       int enc)
+{
+        struct scatterlist *src_list, *dst_list;
+        __u8 local_iv[16] = {0};
+        int list_len;
+        __u32 rc;
+        ENTRY;
+
+        LASSERT(outobj->len >= inobj->len);
+
+        list_len = ((inobj->len + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1;
+        OBD_ALLOC(src_list, sizeof(*src_list) * list_len * 2);
+        if (!src_list) {
+                CERROR("can't alloc %d\n", sizeof(*src_list) * list_len * 2);
+                RETURN(-ENOMEM);
+        }
+        dst_list = src_list + list_len;
+
+        obj_to_scatter_list(inobj, src_list, list_len);
+        obj_to_scatter_list(outobj, dst_list, list_len);
+
+        if (enc)
+                rc = crypto_cipher_encrypt_iv(tfm, dst_list, src_list,
+                                              inobj->len, local_iv);
+        else
+                rc = crypto_cipher_decrypt_iv(tfm, dst_list, src_list,
+                                              inobj->len, local_iv);
+
+        if (rc) {
+                CERROR("encrypt error %u\n", rc);
+                GOTO(out_free, rc);
+        }
+
+        outobj->len = inobj->len;
+
+out_free:
+        OBD_FREE(src_list, sizeof(*src_list) * list_len * 2);
+        RETURN(rc);
+}
diff --git a/lustre/sec/gss/gss_krb5_mech.c b/lustre/sec/gss/gss_krb5_mech.c
new file mode 100644 (file)
index 0000000..8dcca46
--- /dev/null
@@ -0,0 +1,316 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Modifications for Lustre
+ * Copyright 2004, Cluster File Systems, Inc.
+ * All rights reserved
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  linux/net/sunrpc/gss_krb5_mech.c
+ *
+ *  Copyright (c) 2001 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson <andros@umich.edu>
+ *  J. Bruce Fields <bfields@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_SEC
+#ifdef __KERNEL__
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/crypto.h>
+#else
+#include <liblustre.h>
+//#include "../kcrypto/libcrypto.h"
+#endif
+
+#include <libcfs/kp30.h>
+#include <linux/obd.h>
+#include <linux/obd_class.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_idl.h>
+#include <linux/lustre_net.h>
+#include <linux/lustre_import.h>
+#include <linux/lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+#include "gss_krb5.h"
+
+rawobj_t gss_mech_krb5_oid =
+   {9, "\052\206\110\206\367\022\001\002\002"};
+
+static inline int
+get_bytes(char **ptr, const char *end, void *res, int len)
+{
+        char *p, *q;
+        p = *ptr;
+        q = p + len;
+        if (q > end || q < p)
+                return -1;
+        memcpy(res, p, len);
+        *ptr = q;
+        return 0;
+}
+
+static inline int
+get_rawobj(char **ptr, const char *end, rawobj_t *res)
+{
+        char *p, *q;
+        p = *ptr;
+        if (get_bytes(&p, end, &res->len, sizeof(res->len)))
+                return -1;
+        q = p + res->len;
+        if (q > end || q < p)
+                return -1;
+        OBD_ALLOC(res->data, res->len);
+        if (!res->data)
+                return -1;
+        memcpy(res->data, p, res->len);
+        *ptr = q;
+        return 0;
+}
+
+static inline int
+get_key(char **p, char *end, struct crypto_tfm **res)
+{
+        rawobj_t                key;
+        int                     alg, alg_mode;
+        char                   *alg_name;
+
+        if (get_bytes(p, end, &alg, sizeof(alg)))
+                goto out_err;
+        if ((get_rawobj(p, end, &key)))
+                goto out_err;
+
+        switch (alg) {
+                case ENCTYPE_DES_CBC_RAW:
+                        alg_name = "des";
+                        alg_mode = CRYPTO_TFM_MODE_CBC;
+                        break;
+                default:
+                        CERROR("unsupported algorithm %d\n", alg);
+                        goto out_err_free_key;
+        }
+        if (!(*res = crypto_alloc_tfm(alg_name, alg_mode)))
+                goto out_err_free_key;
+        if (crypto_cipher_setkey(*res, key.data, key.len))
+                goto out_err_free_tfm;
+
+        OBD_FREE(key.data, key.len);
+        return 0;
+
+out_err_free_tfm:
+        crypto_free_tfm(*res);
+out_err_free_key:
+        OBD_FREE(key.data, key.len);
+out_err:
+        return -1;
+}
+
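+/*
+ * The context blob is a flat byte stream consumed in order below: initiate
+ * flag, seed_init, seed[], signalg, sealalg, endtime, seq_send, the mech OID
+ * as a length-prefixed rawobj, then two keys (enc, seq), each encoded as an
+ * enctype followed by a length-prefixed key rawobj.
+ */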
+static __u32
+gss_import_sec_context_kerberos(rawobj_t *inbuf,
+                                struct gss_ctx *ctx_id)
+{
+        char            *p = inbuf->data;
+        char            *end = inbuf->data + inbuf->len;
+        struct krb5_ctx *ctx;
+
+        OBD_ALLOC(ctx, sizeof(*ctx));
+        if (!ctx)
+                goto out_err;
+
+        if (get_bytes(&p, end, &ctx->initiate, sizeof(ctx->initiate)))
+                goto out_err_free_ctx;
+        if (get_bytes(&p, end, &ctx->seed_init, sizeof(ctx->seed_init)))
+                goto out_err_free_ctx;
+        if (get_bytes(&p, end, ctx->seed, sizeof(ctx->seed)))
+                goto out_err_free_ctx;
+        if (get_bytes(&p, end, &ctx->signalg, sizeof(ctx->signalg)))
+                goto out_err_free_ctx;
+        if (get_bytes(&p, end, &ctx->sealalg, sizeof(ctx->sealalg)))
+                goto out_err_free_ctx;
+        if (get_bytes(&p, end, &ctx->endtime, sizeof(ctx->endtime)))
+                goto out_err_free_ctx;
+        if (get_bytes(&p, end, &ctx->seq_send, sizeof(ctx->seq_send)))
+                goto out_err_free_ctx;
+        if (get_rawobj(&p, end, &ctx->mech_used))
+                goto out_err_free_ctx;
+        if (get_key(&p, end, &ctx->enc))
+                goto out_err_free_mech;
+        if (get_key(&p, end, &ctx->seq))
+                goto out_err_free_key1;
+        if (p != end)
+                goto out_err_free_key2;
+
+        ctx_id->internal_ctx_id = ctx;
+        CDEBUG(D_SEC, "Succesfully imported new context.\n");
+        return 0;
+
+out_err_free_key2:
+        crypto_free_tfm(ctx->seq);
+out_err_free_key1:
+        crypto_free_tfm(ctx->enc);
+out_err_free_mech:
+        OBD_FREE(ctx->mech_used.data, ctx->mech_used.len);
+out_err_free_ctx:
+        OBD_FREE(ctx, sizeof(*ctx));
+out_err:
+        return GSS_S_FAILURE;
+}
+
+static __u32
+gss_inquire_context_kerberos(struct gss_ctx    *context_handle,
+                             __u64             *endtime)
+{
+        struct krb5_ctx *kctx = context_handle->internal_ctx_id;
+
+        *endtime = (__u64) kctx->endtime;
+        return GSS_S_COMPLETE;
+}
+
+static void
+gss_delete_sec_context_kerberos(void *internal_ctx)
+{
+        struct krb5_ctx *ctx = internal_ctx;
+
+        if (ctx->seq)
+                crypto_free_tfm(ctx->seq);
+        if (ctx->enc)
+                crypto_free_tfm(ctx->enc);
+        if (ctx->mech_used.data)
+                OBD_FREE(ctx->mech_used.data, ctx->mech_used.len);
+        OBD_FREE(ctx, sizeof(*ctx));
+}
+
+/* XXX the following wrappers have become pointless; kill them. */
+static __u32
+gss_verify_mic_kerberos(struct gss_ctx *ctx,
+                        rawobj_t       *message,
+                        rawobj_t       *mic_token,
+                        __u32          *qstate)
+{
+        struct krb5_ctx *kctx = ctx->internal_ctx_id;
+        __u32 maj_stat;
+        int qop_state;
+
+        maj_stat = krb5_read_token(kctx, mic_token, message, &qop_state);
+        if (!maj_stat && qop_state)
+                *qstate = qop_state;
+
+        CDEBUG(D_SEC, "returning %d\n", maj_stat);
+        return maj_stat;
+}
+
+static __u32
+gss_get_mic_kerberos(struct gss_ctx    *ctx,
+                     __u32              qop,
+                     rawobj_t          *message,
+                     rawobj_t          *mic_token)
+{
+        struct krb5_ctx *kctx = ctx->internal_ctx_id;
+        __u32 err;
+
+        err = krb5_make_token(kctx, qop, message, mic_token);
+
+        CDEBUG(D_SEC, "returning %d\n",err);
+        return err;
+}
+
+static struct gss_api_ops gss_kerberos_ops = {
+        .gss_import_sec_context     = gss_import_sec_context_kerberos,
+        .gss_inquire_context        = gss_inquire_context_kerberos,
+        .gss_get_mic                = gss_get_mic_kerberos,
+        .gss_verify_mic             = gss_verify_mic_kerberos,
+        .gss_wrap                   = gss_wrap_kerberos,
+        .gss_unwrap                 = gss_unwrap_kerberos,
+        .gss_delete_sec_context     = gss_delete_sec_context_kerberos,
+};
+
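+/*
+ * Sub-flavors exported by this mechanism: krb5 offers authentication only
+ * (no per-message protection), krb5i adds per-message integrity, and krb5p
+ * adds per-message privacy, mapped to the generic PTLRPC_SEC_TYPE_* levels.
+ */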
+static struct subflavor_desc gss_kerberos_sfs[] = {
+        {
+                .subflavor      = PTLRPC_SEC_GSS_KRB5,
+                .qop            = 0,
+                .service        = PTLRPC_SEC_TYPE_NONE,
+                .name           = "krb5"
+        },
+        {
+                .subflavor      = PTLRPC_SEC_GSS_KRB5I,
+                .qop            = 0,
+                .service        = PTLRPC_SEC_TYPE_AUTH,
+                .name           = "krb5i"
+        },
+        {
+                .subflavor      = PTLRPC_SEC_GSS_KRB5P,
+                .qop            = 0,
+                .service        = PTLRPC_SEC_TYPE_PRIV,
+                .name           = "krb5p"
+        }
+};
+
+static struct gss_api_mech gss_kerberos_mech = {
+        .gm_name        = "krb5",
+        .gm_owner       = THIS_MODULE,
+        .gm_ops         = &gss_kerberos_ops,
+        .gm_sf_num      = 3,
+        .gm_sfs         = gss_kerberos_sfs,
+};
+
+/*static*/ int __init init_kerberos_module(void)
+{
+        int status;
+
+        status = kgss_mech_register(&gss_kerberos_mech);
+        if (status)
+                CERROR("Failed to register kerberos gss mechanism!\n");
+        return status;
+}
+
+/*static*/ void __exit cleanup_kerberos_module(void)
+{
+        kgss_mech_unregister(&gss_kerberos_mech);
+}
+
+/* XXX enable this when module works */
+#if 0
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_DESCRIPTION("GSS Krb5 mechanism for Lustre");
+
+module_init(init_kerberos_module);
+module_exit(cleanup_kerberos_module);
+#endif
diff --git a/lustre/sec/gss/gss_krb5_seal.c b/lustre/sec/gss/gss_krb5_seal.c
new file mode 100644 (file)
index 0000000..3037a54
--- /dev/null
@@ -0,0 +1,178 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Modifications for Lustre
+ * Copyright 2004, Cluster File Systems, Inc.
+ * All rights reserved
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  linux/net/sunrpc/gss_krb5_seal.c
+ *
+ *  Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/krb5/k5seal.c
+ *
+ *  Copyright (c) 2000 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson        <andros@umich.edu>
+ *  J. Bruce Fields     <bfields@umich.edu>
+ */
+
+/*
+ * Copyright 1993 by OpenVision Technologies, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without fee,
+ * provided that the above copyright notice appears in all copies and
+ * that both that copyright notice and this permission notice appear in
+ * supporting documentation, and that the name of OpenVision not be used
+ * in advertising or publicity pertaining to distribution of the software
+ * without specific, written prior permission. OpenVision makes no
+ * representations about the suitability of this software for any
+ * purpose.  It is provided "as is" without express or implied warranty.
+ *
+ * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * Copyright (C) 1998 by the FundsXpress, INC.
+ *
+ * All rights reserved.
+ *
+ * Export of this software from the United States of America may require
+ * a specific license from the United States Government.  It is the
+ * responsibility of any person or organization contemplating export to
+ * obtain such a license before exporting.
+ *
+ * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
+ * distribute this software and its documentation for any purpose and
+ * without fee is hereby granted, provided that the above copyright
+ * notice appear in all copies and that both that copyright notice and
+ * this permission notice appear in supporting documentation, and that
+ * the name of FundsXpress. not be used in advertising or publicity pertaining
+ * to distribution of the software without specific, written prior
+ * permission.  FundsXpress makes no representations about the suitability of
+ * this software for any purpose.  It is provided "as is" without express
+ * or implied warranty.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
+ * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_SEC
+#ifdef __KERNEL__
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/crypto.h>
+#else
+#include <liblustre.h>
+#include "../kcrypto/libcrypto.h"
+#include <netinet/in.h>
+#endif
+
+#include <libcfs/kp30.h>
+#include <linux/obd.h>
+#include <linux/obd_class.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_idl.h>
+#include <linux/lustre_net.h>
+#include <linux/lustre_import.h>
+#include <linux/lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+#include "gss_krb5.h"
+
+spinlock_t krb5_seq_lock = SPIN_LOCK_UNLOCKED;
+
+__u32
+krb5_make_token(struct krb5_ctx *ctx,
+                int qop_req,
+                rawobj_t *text,
+                rawobj_t *token)
+{
+        __s32                   checksum_type;
+        rawobj_t                md5cksum = {.len = 0, .data = NULL};
+        unsigned char          *ptr, *krb5_hdr, *msg_start;
+        __s32                   now, seq_send;
+        ENTRY;
+
+        now = get_seconds();
+
+        if (qop_req != 0)
+                goto out_err;
+
+        switch (ctx->signalg) {
+                case SGN_ALG_DES_MAC_MD5:
+                        checksum_type = CKSUMTYPE_RSA_MD5;
+                        break;
+                default:
+                        CERROR("ctx->signalg %d not supported\n", ctx->signalg);
+                        goto out_err;
+        }
+        if (ctx->sealalg != SEAL_ALG_NONE && ctx->sealalg != SEAL_ALG_DES) {
+                CERROR("ctx->sealalg %d not supported\n", ctx->sealalg);
+                goto out_err;
+        }
+
+        token->len = g_token_size(&ctx->mech_used, 22);
+
+        ptr = token->data;
+        g_make_token_header(&ctx->mech_used, 22, &ptr);
+
+        *ptr++ = (unsigned char) ((KG_TOK_MIC_MSG>>8)&0xff);
+        *ptr++ = (unsigned char) (KG_TOK_MIC_MSG&0xff);
+
+        /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */
+        krb5_hdr = ptr - 2;
+        msg_start = krb5_hdr + 24;
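+        /*
+         * krb5_hdr now points at the 24-byte header of RFC 1964 sec 1.2.1:
+         *   +0  TOK_ID            +2  SGN_ALG
+         *   +4  filler (4 x 0xff) +8  SND_SEQ (encrypted sequence number)
+         *   +16 SGN_CKSUM (8 bytes of the encrypted checksum)
+         * The fields at +8 and +16 are filled in below.
+         */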
+
+        *(__u16 *)(krb5_hdr + 2) = cpu_to_be16(ctx->signalg);
+        memset(krb5_hdr + 4, 0xff, 4);
+
+        if (make_checksum(checksum_type, krb5_hdr, 8, text, &md5cksum))
+                goto out_err;
+
+        switch (ctx->signalg) {
+        case SGN_ALG_DES_MAC_MD5:
+                if (krb5_encrypt(ctx->seq, NULL, md5cksum.data,
+                                 md5cksum.data, md5cksum.len))
+                        goto out_err;
+                memcpy(krb5_hdr + 16,
+                       md5cksum.data + md5cksum.len - KRB5_CKSUM_LENGTH,
+                       KRB5_CKSUM_LENGTH);
+
+                break;
+        default:
+                LBUG();
+        }
+
+        OBD_FREE(md5cksum.data, md5cksum.len);
+
+        spin_lock(&krb5_seq_lock);
+        seq_send = ctx->seq_send++;
+        spin_unlock(&krb5_seq_lock);
+
+        if ((krb5_make_seq_num(ctx->seq, ctx->initiate ? 0 : 0xff,
+                               seq_send, krb5_hdr + 16, krb5_hdr + 8)))
+                goto out_err;
+
+        return ((ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE);
+out_err:
+        if (md5cksum.data)
+                OBD_FREE(md5cksum.data, md5cksum.len);
+        return GSS_S_FAILURE;
+}
diff --git a/lustre/sec/gss/gss_krb5_seqnum.c b/lustre/sec/gss/gss_krb5_seqnum.c
new file mode 100644 (file)
index 0000000..c80fc0f
--- /dev/null
@@ -0,0 +1,116 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Modifications for Lustre
+ * Copyright 2004, Cluster File Systems, Inc.
+ * All rights reserved
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  linux/net/sunrpc/gss_krb5_seqnum.c
+ *
+ *  Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/krb5/util_seqnum.c
+ *
+ *  Copyright (c) 2000 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson   <andros@umich.edu>
+ */
+
+/*
+ * Copyright 1993 by OpenVision Technologies, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without fee,
+ * provided that the above copyright notice appears in all copies and
+ * that both that copyright notice and this permission notice appear in
+ * supporting documentation, and that the name of OpenVision not be used
+ * in advertising or publicity pertaining to distribution of the software
+ * without specific, written prior permission. OpenVision makes no
+ * representations about the suitability of this software for any
+ * purpose.  It is provided "as is" without express or implied warranty.
+ *
+ * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_SEC
+#ifdef __KERNEL__
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/crypto.h>
+#else
+#include <liblustre.h>
+#include "../kcrypto/libcrypto.h"
+#endif
+
+#include <libcfs/kp30.h>
+#include <linux/obd.h>
+#include <linux/obd_class.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_idl.h>
+#include <linux/lustre_net.h>
+#include <linux/lustre_import.h>
+#include <linux/lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+#include "gss_krb5.h"
+
+__s32
+krb5_make_seq_num(struct crypto_tfm *key,
+                  int direction,
+                  __s32 seqnum,
+                  unsigned char *cksum,
+                  unsigned char *buf)
+{
+        unsigned char plain[8];
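+        /* 8-byte plaintext: bytes 0-3 carry the sequence number least
+         * significant byte first, bytes 4-7 repeat the direction flag
+         * (0 = initiator, 0xff = acceptor); the block is then encrypted
+         * with 'key', with 'cksum' passed as the IV. */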
+
+        plain[0] = (unsigned char) (seqnum & 0xff);
+        plain[1] = (unsigned char) ((seqnum >> 8) & 0xff);
+        plain[2] = (unsigned char) ((seqnum >> 16) & 0xff);
+        plain[3] = (unsigned char) ((seqnum >> 24) & 0xff);
+
+        plain[4] = direction;
+        plain[5] = direction;
+        plain[6] = direction;
+        plain[7] = direction;
+
+        return krb5_encrypt(key, cksum, plain, buf, 8);
+}
+
+__s32
+krb5_get_seq_num(struct crypto_tfm *key,
+                 unsigned char *cksum,
+                 unsigned char *buf,
+                 int *direction,
+                 __s32 * seqnum)
+{
+        __s32 code;
+        unsigned char plain[8];
+
+        if ((code = krb5_decrypt(key, cksum, buf, plain, 8)))
+                return code;
+
+        if ((plain[4] != plain[5]) || (plain[4] != plain[6])
+                                   || (plain[4] != plain[7]))
+                return (__s32)KG_BAD_SEQ;
+
+        *direction = plain[4];
+
+        *seqnum = ((plain[0]) |
+                   (plain[1] << 8) | (plain[2] << 16) | (plain[3] << 24));
+
+        return (0);
+}
diff --git a/lustre/sec/gss/gss_krb5_unseal.c b/lustre/sec/gss/gss_krb5_unseal.c
new file mode 100644 (file)
index 0000000..ba6e058
--- /dev/null
@@ -0,0 +1,212 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Modifications for Lustre
+ * Copyright 2004, Cluster File Systems, Inc.
+ * All rights reserved
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  linux/net/sunrpc/gss_krb5_unseal.c
+ *
+ *  Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/krb5/k5unseal.c
+ *
+ *  Copyright (c) 2000 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Andy Adamson   <andros@umich.edu>
+ */
+
+/*
+ * Copyright 1993 by OpenVision Technologies, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without fee,
+ * provided that the above copyright notice appears in all copies and
+ * that both that copyright notice and this permission notice appear in
+ * supporting documentation, and that the name of OpenVision not be used
+ * in advertising or publicity pertaining to distribution of the software
+ * without specific, written prior permission. OpenVision makes no
+ * representations about the suitability of this software for any
+ * purpose.  It is provided "as is" without express or implied warranty.
+ *
+ * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
+ * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * Copyright (C) 1998 by the FundsXpress, INC.
+ *
+ * All rights reserved.
+ *
+ * Export of this software from the United States of America may require
+ * a specific license from the United States Government.  It is the
+ * responsibility of any person or organization contemplating export to
+ * obtain such a license before exporting.
+ *
+ * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
+ * distribute this software and its documentation for any purpose and
+ * without fee is hereby granted, provided that the above copyright
+ * notice appear in all copies and that both that copyright notice and
+ * this permission notice appear in supporting documentation, and that
+ * the name of FundsXpress. not be used in advertising or publicity pertaining
+ * to distribution of the software without specific, written prior
+ * permission.  FundsXpress makes no representations about the suitability of
+ * this software for any purpose.  It is provided "as is" without express
+ * or implied warranty.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
+ * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_SEC
+#ifdef __KERNEL__
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/crypto.h>
+#else
+#include <liblustre.h>
+#include "../kcrypto/libcrypto.h"
+#endif
+
+#include <libcfs/kp30.h>
+#include <linux/obd.h>
+#include <linux/obd_class.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_idl.h>
+#include <linux/lustre_net.h>
+#include <linux/lustre_import.h>
+#include <linux/lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+#include "gss_krb5.h"
+
+
+/* read_token is a mic token, and message_buffer is the data that the mic was
+ * supposedly taken over. */
+
+__u32
+krb5_read_token(struct krb5_ctx *ctx,
+                rawobj_t *read_token,
+                rawobj_t *message_buffer,
+                int *qop_state)
+{
+        int                     signalg;
+        int                     sealalg;
+        __s32                   checksum_type;
+        rawobj_t                md5cksum = {.len = 0, .data = NULL};
+        __s32                   now;
+        int                     direction;
+        __s32                   seqnum;
+        unsigned char          *ptr = (unsigned char *)read_token->data;
+        int                     bodysize;
+        __u32                   ret = GSS_S_DEFECTIVE_TOKEN;
+        ENTRY;
+
+        if (g_verify_token_header(&ctx->mech_used, &bodysize, &ptr,
+                                  read_token->len))
+                goto out;
+
+        if ((*ptr++ != ((KG_TOK_MIC_MSG>>8)&0xff)) ||
+            (*ptr++ != ( KG_TOK_MIC_MSG    &0xff))   )
+                goto out;
+
+        /* XXX sanity-check bodysize?? */
+
+        /* get the sign and seal algorithms */
+
+        signalg = ptr[0] + (ptr[1] << 8);
+        sealalg = ptr[2] + (ptr[3] << 8);
+
+        /* Sanity checks */
+
+        if ((ptr[4] != 0xff) || (ptr[5] != 0xff))
+                goto out;
+
+        if (sealalg != 0xffff)
+                goto out;
+
+        /* there are several mappings of seal algorithms to sign algorithms,
+           but few enough that we can try them all. */
+
+        if ((ctx->sealalg == SEAL_ALG_NONE && signalg > 1) ||
+            (ctx->sealalg == SEAL_ALG_1 && signalg != SGN_ALG_3) ||
+            (ctx->sealalg == SEAL_ALG_DES3KD &&
+             signalg != SGN_ALG_HMAC_SHA1_DES3_KD))
+                goto out;
+
+        /* compute the checksum of the message */
+
+        /* initialize the cksum */
+        switch (signalg) {
+        case SGN_ALG_DES_MAC_MD5:
+                checksum_type = CKSUMTYPE_RSA_MD5;
+                break;
+        default:
+                ret = GSS_S_DEFECTIVE_TOKEN;
+                goto out;
+        }
+
+        switch (signalg) {
+        case SGN_ALG_DES_MAC_MD5:
+                ret = make_checksum(checksum_type, ptr - 2, 8,
+                                    message_buffer, &md5cksum);
+                if (ret)
+                        goto out;
+
+                ret = krb5_encrypt(ctx->seq, NULL, md5cksum.data,
+                                   md5cksum.data, 16);
+                if (ret)
+                        goto out;
+
+                if (memcmp(md5cksum.data + 8, ptr + 14, 8)) {
+                        ret = GSS_S_BAD_SIG;
+                        goto out;
+                }
+                break;
+        default:
+                ret = GSS_S_DEFECTIVE_TOKEN;
+                goto out;
+        }
+
+        /* it got through unscathed.  Make sure the context is unexpired */
+
+        if (qop_state)
+                *qop_state = GSS_C_QOP_DEFAULT;
+
+        now = get_seconds();
+
+        ret = GSS_S_CONTEXT_EXPIRED;
+        if (now > ctx->endtime)
+                goto out;
+
+        /* do sequencing checks */
+
+        ret = GSS_S_BAD_SIG;
+        if ((ret = krb5_get_seq_num(ctx->seq, ptr + 14, ptr + 6, &direction,
+                                    &seqnum)))
+                goto out;
+
+        if ((ctx->initiate && direction != 0xff) ||
+            (!ctx->initiate && direction != 0))
+                goto out;
+
+        ret = GSS_S_COMPLETE;
+out:
+        if (md5cksum.data)
+                OBD_FREE(md5cksum.data, md5cksum.len);
+        return ret;
+}
diff --git a/lustre/sec/gss/gss_krb5_wrap.c b/lustre/sec/gss/gss_krb5_wrap.c
new file mode 100644 (file)
index 0000000..1099156
--- /dev/null
@@ -0,0 +1,381 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *   Modified from NFSv4 projects for Lustre
+ *   Copyright 2004, Cluster File Systems, Inc.
+ *   All rights reserved
+ *   Author: Eric Mei <ericm@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_SEC
+#ifdef __KERNEL__
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/crypto.h>
+#include <linux/random.h>
+#else
+#include <liblustre.h>
+#include "../kcrypto/libcrypto.h"
+#include <netinet/in.h>
+#endif
+
+#include <libcfs/kp30.h>
+#include <linux/obd.h>
+#include <linux/obd_class.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_idl.h>
+#include <linux/lustre_net.h>
+#include <linux/lustre_import.h>
+#include <linux/lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+#include "gss_krb5.h"
+#include "gss_asn1.h"
+
+static inline
+int add_padding(rawobj_buf_t *msgbuf, int blocksize)
+{
+        int padding;
+
+        padding = (blocksize - (msgbuf->datalen & (blocksize - 1))) &
+                  (blocksize - 1);
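+        /* e.g. with blocksize 8: datalen 13 -> padding = (8 - 5) & 7 = 3,
+         * datalen 16 -> padding = 0; each pad byte written below holds the
+         * pad length, PKCS#5-style. */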
+        if (padding == 0)
+                return 0;
+
+        CWARN("add padding %d\n", padding);
+        if (msgbuf->dataoff + msgbuf->datalen + padding > msgbuf->buflen) {
+                CERROR("bufsize %u too small: off %u, len %u, padding %u\n",
+                        msgbuf->buflen, msgbuf->dataoff, msgbuf->datalen,
+                        padding);
+                return -EINVAL;
+        }
+        memset(msgbuf->buf + msgbuf->dataoff + msgbuf->datalen,
+               padding, padding);
+        msgbuf->datalen += padding;
+        return 0;
+}
+
+static inline
+int generate_confounder(rawobj_buf_t *msgbuf, int blocksize)
+{
+        __u8 *p;
+
+        p = msgbuf->buf + msgbuf->dataoff - blocksize;
+        if (p < msgbuf->buf) {
+                CERROR("buf underflow\n");
+                return -EINVAL;
+        }
+
+        get_random_bytes(p, blocksize);
+        return 0;
+}
+
+__u32
+gss_wrap_kerberos(struct gss_ctx    *ctx,
+                  __u32              qop,
+                  rawobj_buf_t      *msgbuf,
+                  rawobj_t          *token)
+{
+        struct krb5_ctx        *kctx = ctx->internal_ctx_id;
+        __u32                   checksum_type;
+        rawobj_t                data_desc, cipher_out, md5cksum;
+        int                     blocksize;
+        unsigned char          *ptr, *krb5_hdr, *msg_start;
+        int                     head_len, plain_len;
+        __u32                   seq_send, major;
+        ENTRY;
+
+        if (qop) {
+                CERROR("not support qop %x yet\n", qop);
+                RETURN(GSS_S_FAILURE);
+        }
+
+        switch (kctx->signalg) {
+        case SGN_ALG_DES_MAC_MD5:
+                checksum_type = CKSUMTYPE_RSA_MD5;
+                break;
+        default:
+                CERROR("not support signalg %x\n", kctx->signalg);
+                RETURN(GSS_S_FAILURE);
+        }
+        if (kctx->sealalg != SEAL_ALG_NONE &&
+            kctx->sealalg != SEAL_ALG_DES) {
+                CERROR("not support sealalg %x\n", kctx->sealalg);
+                RETURN(GSS_S_FAILURE);
+        }
+
+        blocksize = crypto_tfm_alg_blocksize(kctx->enc);
+        LASSERT(blocksize <= 16);
+        LASSERT(blocksize == 8); /* actually must be 8 for now */
+
+        if (add_padding(msgbuf, blocksize))
+                RETURN(GSS_S_FAILURE);
+
+        /* confounder size == blocksize */
+        plain_len = msgbuf->datalen + blocksize;
+
+        head_len = g_token_size(&kctx->mech_used, 22 + plain_len) -
+                   msgbuf->datalen;
+
+        LASSERT(token->len >= head_len);
+        ptr = token->data;
+
+        /*
+         * fill in gss header and krb5 header
+         */
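+        /*
+         * Wrap token header per RFC 1964 sec 1.2.2 (24 bytes at krb5_hdr):
+         *   +0  TOK_ID            +2  SGN_ALG      +4  SEAL_ALG
+         *   +6  filler (0xffff)   +8  SND_SEQ      +16 SGN_CKSUM
+         * The confounder followed by the padded message is encrypted
+         * starting at krb5_hdr + 24 (msg_start).
+         */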
+        g_make_token_header(&kctx->mech_used, 22 + plain_len, &ptr);
+        krb5_hdr = ptr;
+        msg_start = krb5_hdr + 24;
+        *ptr++ = (unsigned char) ((KG_TOK_WRAP_MSG >> 8) & 0xff);
+        *ptr++ = (unsigned char) (KG_TOK_WRAP_MSG & 0xff);
+        *(__u16 *)(krb5_hdr + 2) = cpu_to_be16(kctx->signalg);
+        memset(krb5_hdr + 4, 0xff, 4);
+        *(__u16 *)(krb5_hdr + 4) = cpu_to_be16(kctx->sealalg);
+
+        /*
+         * prepend confounder on plain text
+         */
+        if (generate_confounder(msgbuf, blocksize))
+                RETURN(GSS_S_FAILURE);
+
+        /*
+         * compute checksum including confounder
+         */
+        data_desc.data = msgbuf->buf + msgbuf->dataoff - blocksize;
+        data_desc.len = msgbuf->datalen + blocksize;
+
+        if (make_checksum(checksum_type, krb5_hdr, 8, &data_desc, &md5cksum)) {
+                CERROR("checksum error\n");
+                RETURN(GSS_S_FAILURE);
+        }
+
+        major = GSS_S_FAILURE;
+        switch (kctx->signalg) {
+        case SGN_ALG_DES_MAC_MD5:
+                if (krb5_encrypt(kctx->seq, NULL, md5cksum.data,
+                                 md5cksum.data, md5cksum.len)) {
+                        rawobj_free(&md5cksum);
+                        RETURN(GSS_S_FAILURE);
+                }
+                memcpy(krb5_hdr + 16,
+                       md5cksum.data + md5cksum.len - KRB5_CKSUM_LENGTH,
+                       KRB5_CKSUM_LENGTH);
+                break;
+        default:
+                LBUG();
+        }
+
+        rawobj_free(&md5cksum);
+
+        /*
+         * fill sequence number in krb5 header
+         */
+        spin_lock(&krb5_seq_lock);
+        seq_send = kctx->seq_send++;
+        spin_unlock(&krb5_seq_lock);
+
+        if (krb5_make_seq_num(kctx->seq, kctx->initiate ? 0 : 0xff,
+                               seq_send, krb5_hdr + 16, krb5_hdr + 8))
+                RETURN(GSS_S_FAILURE);
+
+        /* do encryption */
+        data_desc.data = msgbuf->buf + msgbuf->dataoff - blocksize;
+        data_desc.len = msgbuf->datalen + blocksize;
+        cipher_out.data = msg_start;
+        cipher_out.len = token->len - (msg_start - token->data);
+        LASSERT(data_desc.len % blocksize == 0);
+        LASSERT(data_desc.len <= cipher_out.len);
+
+        if (gss_encrypt_rawobj(kctx->enc, &data_desc, &cipher_out, 1))
+                RETURN(GSS_S_FAILURE);
+
+        token->len = (msg_start - token->data) + cipher_out.len;
+        RETURN(0);
+}
+
+__u32
+gss_unwrap_kerberos(struct gss_ctx  *ctx,
+                    __u32            qop,
+                    rawobj_t        *in_token,
+                    rawobj_t        *out_token)
+{
+        struct krb5_ctx        *kctx = ctx->internal_ctx_id;
+        int                     signalg, sealalg;
+        rawobj_t                cipher_in, plain_out, md5cksum;
+        unsigned char          *ptr, *krb5_hdr, *tmpbuf;
+        int                     bodysize;
+        int                     blocksize, seqnum, direction;
+        __u32                   checksum_type;
+        __u32                   major;
+        ENTRY;
+
+        ptr = in_token->data;
+
+        /*
+         * verify gss header
+         */
+        major = g_verify_token_header(&kctx->mech_used, &bodysize, &ptr,
+                                      in_token->len);
+        if (major) {
+                CERROR("gss token error %d\n", major);
+                RETURN(GSS_S_FAILURE);
+        }
+
+        krb5_hdr = ptr;
+
+        if ((*ptr++ != ((KG_TOK_WRAP_MSG >> 8) & 0xff)) ||
+            (*ptr++ !=  (KG_TOK_WRAP_MSG & 0xff))) {
+                CERROR("token type not matched\n");
+                RETURN(G_BAD_TOK_HEADER);
+        }
+
+        if (bodysize < 22) {
+                CERROR("body size only %d\n", bodysize);
+                RETURN(G_WRONG_SIZE);
+        }
+
+        /*
+         * extract algorithms
+         */
+        signalg = ptr[0] | (ptr[1] << 8);
+        sealalg = ptr[2] | (ptr[3] << 8);
+
+        if (ptr[4] != 0xFF || ptr[5] != 0xFF) {
+                CERROR("4/5: %d, %d\n", ptr[4], ptr[5]);
+                RETURN(GSS_S_DEFECTIVE_TOKEN);
+        }
+
+        if (sealalg != kctx->sealalg) {
+                CERROR("sealalg %d not matched my %d\n",
+                        sealalg, kctx->sealalg);
+                RETURN(GSS_S_DEFECTIVE_TOKEN);
+        }
+
+        if ((kctx->sealalg == SEAL_ALG_NONE && signalg > 1) ||
+            (kctx->sealalg == SEAL_ALG_1 && signalg != SGN_ALG_3) ||
+            (kctx->sealalg == SEAL_ALG_DES3KD &&
+             signalg != SGN_ALG_HMAC_SHA1_DES3_KD)) {
+                CERROR("bad sealalg %d\n", sealalg);
+                RETURN(GSS_S_DEFECTIVE_TOKEN);
+        }
+
+        /* make bodysize as the actual cipher text size */
+        bodysize -= 22;
+        if (bodysize <= 0) {
+                CERROR("cipher text size %d?\n", bodysize);
+                RETURN(GSS_S_DEFECTIVE_TOKEN);
+        }
+
+        blocksize = crypto_tfm_alg_blocksize(kctx->enc);
+        if (bodysize % blocksize) {
+                CERROR("odd bodysize %d\n", bodysize);
+                RETURN(GSS_S_DEFECTIVE_TOKEN);
+        }
+
+        OBD_ALLOC(tmpbuf, bodysize);
+        if (!tmpbuf) {
+                CERROR("fail alloc %d\n", bodysize);
+                RETURN(GSS_S_FAILURE);
+        }
+
+        cipher_in.data = krb5_hdr + 24;
+        cipher_in.len = bodysize;
+        plain_out.data = tmpbuf;
+        plain_out.len = bodysize;
+
+        major = GSS_S_DEFECTIVE_TOKEN;
+        if (gss_encrypt_rawobj(kctx->enc, &cipher_in, &plain_out, 0)) {
+                CERROR("error decrypt: 0x%x\n", major);
+                GOTO(out_free, major);
+        }
+        LASSERT(plain_out.len == bodysize);
+
+        /*
+         * verify checksum
+         */
+        switch (signalg) {
+        case SGN_ALG_DES_MAC_MD5:
+                checksum_type = CKSUMTYPE_RSA_MD5;
+                major = make_checksum(checksum_type, krb5_hdr, 8,
+                                      &plain_out, &md5cksum);
+                if (major) {
+                        CERROR("make checksum err: 0x%x\n", major);
+                        GOTO(out_free, major);
+                }
+
+                major = krb5_encrypt(kctx->seq, NULL, md5cksum.data,
+                                     md5cksum.data, md5cksum.len);
+                if (major) {
+                        CERROR("encrypt checksum err: 0x%x\n", major);
+                        rawobj_free(&md5cksum);
+                        GOTO(out_free, major);
+                }
+
+                if (memcmp(md5cksum.data + 8, krb5_hdr + 16, 8)) {
+                        CERROR("checksum mismatch\n");
+                        rawobj_free(&md5cksum);
+                        GOTO(out_free, major = GSS_S_BAD_SIG);
+                }
+                break;
+        default:
+                CERROR("not support signalg %d\n", signalg);
+                GOTO(out_free, major);
+        }
+
+        rawobj_free(&md5cksum);
+
+        /* FIXME add expire checking here */
+
+        major = krb5_get_seq_num(kctx->seq, krb5_hdr + 16,
+                                 krb5_hdr + 8, &direction,
+                                 &seqnum);
+        if (major) {
+                CERROR("get seq number err: 0x%x\n", major);
+                GOTO(out_free, major);
+        }
+
+        if ((kctx->initiate && direction != 0xff) ||
+            (!kctx->initiate && direction != 0)) {
+                CERROR("flag checking error\n");
+                GOTO(out_free, major = GSS_S_BAD_SIG);
+        }
+
+        /* FIXME how to remove the padding? */
+
+        /*
+         * copy back
+         */
+        if (out_token->len < bodysize - blocksize) {
+                CERROR("data size %d while buffer only %d\n",
+                        bodysize - blocksize, out_token->len);
+                GOTO(out_free, major = GSS_S_DEFECTIVE_TOKEN);
+        }
+
+        out_token->len = bodysize - blocksize;
+        memcpy(out_token->data, plain_out.data + blocksize, out_token->len);
+        major = 0;
+out_free:
+        OBD_FREE(tmpbuf, bodysize);
+        RETURN(major);
+}
diff --git a/lustre/sec/gss/gss_mech_switch.c b/lustre/sec/gss/gss_mech_switch.c
new file mode 100644 (file)
index 0000000..f4d1d7f
--- /dev/null
@@ -0,0 +1,302 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Modifications for Lustre
+ * Copyright 2004, Cluster File Systems, Inc.
+ * All rights reserved
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ *  linux/net/sunrpc/gss_mech_switch.c
+ *
+ *  Copyright (c) 2001 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  J. Bruce Fields   <bfields@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_SEC
+#ifdef __KERNEL__
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#else
+#include <liblustre.h>
+#endif
+
+#include <libcfs/kp30.h>
+#include <linux/obd.h>
+#include <linux/obd_class.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_idl.h>
+#include <linux/lustre_net.h>
+#include <linux/lustre_import.h>
+#include <linux/lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
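+/*
+ * GSS mechanisms register themselves on this list at module init time;
+ * consumers look them up by name (kgss_name_to_mech) or by PTLRPC
+ * sub-flavor (kgss_subflavor_to_mech), taking a module reference that is
+ * dropped later via kgss_mech_put().
+ */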
+static LIST_HEAD(registered_mechs);
+static spinlock_t registered_mechs_lock = SPIN_LOCK_UNLOCKED;
+
+int
+kgss_mech_register(struct gss_api_mech *gm)
+{
+        spin_lock(&registered_mechs_lock);
+        list_add(&gm->gm_list, &registered_mechs);
+        spin_unlock(&registered_mechs_lock);
+        CWARN("registered gss mechanism %s\n", gm->gm_name);
+        return 0;
+}
+
+//EXPORT_SYMBOL(kgss_mech_register);
+
+void
+kgss_mech_unregister(struct gss_api_mech *gm)
+{
+        spin_lock(&registered_mechs_lock);
+        list_del(&gm->gm_list);
+        spin_unlock(&registered_mechs_lock);
+        CWARN("unregistered gss mechanism %s\n", gm->gm_name);
+//        gss_mech_free(gm);
+}
+
+//EXPORT_SYMBOL(gss_mech_unregister);
+
+struct gss_api_mech *
+kgss_mech_get(struct gss_api_mech *gm)
+{
+        __module_get(gm->gm_owner);
+        return gm;
+}
+
+//EXPORT_SYMBOL(kgss_mech_get);
+
+struct gss_api_mech *
+kgss_name_to_mech(char *name)
+{
+        struct gss_api_mech *pos, *gm = NULL;
+
+        spin_lock(&registered_mechs_lock);
+        list_for_each_entry(pos, &registered_mechs, gm_list) {
+                if (0 == strcmp(name, pos->gm_name)) {
+                        if (!try_module_get(pos->gm_owner))
+                                continue;
+                        gm = pos;
+                        break;
+                }
+        }
+        spin_unlock(&registered_mechs_lock);
+        return gm;
+
+}
+
+//EXPORT_SYMBOL(gss_name_to_mech);
+
+static inline int
+mech_supports_subflavor(struct gss_api_mech *gm, __u32 subflavor)
+{
+        int i;
+
+        for (i = 0; i < gm->gm_sf_num; i++) {
+                if (gm->gm_sfs[i].subflavor == subflavor)
+                        return 1;
+        }
+        return 0;
+}
+
+struct gss_api_mech *
+kgss_subflavor_to_mech(__u32 subflavor)
+{
+        struct gss_api_mech *pos, *gm = NULL;
+
+        spin_lock(&registered_mechs_lock);
+        list_for_each_entry(pos, &registered_mechs, gm_list) {
+                if (!try_module_get(pos->gm_owner))
+                        continue;
+                if (!mech_supports_subflavor(pos, subflavor)) {
+                        module_put(pos->gm_owner);
+                        continue;
+                }
+                gm = pos;
+                break;
+        }
+        spin_unlock(&registered_mechs_lock);
+        return gm;
+}
+
+//EXPORT_SYMBOL(gss_subflavor_to_mech);
+
+void
+kgss_mech_put(struct gss_api_mech *gm)
+{
+        module_put(gm->gm_owner);
+}
+
+//EXPORT_SYMBOL(kgss_mech_put);
+
+/* The mech could probably be determined from the token instead, but it's just
+ * as easy for now to pass it in. */
+__u32
+kgss_import_sec_context(rawobj_t                *input_token,
+                        struct gss_api_mech     *mech,
+                        struct gss_ctx         **ctx_id)
+{
+        OBD_ALLOC(*ctx_id, sizeof(**ctx_id));
+        if (*ctx_id == NULL)
+                return GSS_S_FAILURE;
+
+        (*ctx_id)->mech_type = kgss_mech_get(mech);
+
+        LASSERT(mech);
+        LASSERT(mech->gm_ops);
+        LASSERT(mech->gm_ops->gss_import_sec_context);
+        return mech->gm_ops->gss_import_sec_context(input_token, *ctx_id);
+}
+
+/*
+ * this interface is much simplified, currently we only need endtime.
+ */
+__u32
+kgss_inquire_context(struct gss_ctx    *context_handle,
+                     __u64             *endtime)
+{
+        LASSERT(context_handle);
+        LASSERT(context_handle->mech_type);
+        LASSERT(context_handle->mech_type->gm_ops);
+        LASSERT(context_handle->mech_type->gm_ops->gss_inquire_context);
+
+        return context_handle->mech_type->gm_ops
+                ->gss_inquire_context(context_handle,
+                                      endtime);
+}
+
+/* gss_get_mic: compute a mic over message and return mic_token. */
+__u32
+kgss_get_mic(struct gss_ctx     *context_handle,
+             __u32               qop,
+             rawobj_t           *message,
+             rawobj_t           *mic_token)
+{
+        LASSERT(context_handle);
+        LASSERT(context_handle->mech_type);
+        LASSERT(context_handle->mech_type->gm_ops);
+        LASSERT(context_handle->mech_type->gm_ops->gss_get_mic);
+
+        return context_handle->mech_type->gm_ops
+                ->gss_get_mic(context_handle,
+                              qop,
+                              message,
+                              mic_token);
+}
+
+/* gss_verify_mic: check whether the provided mic_token verifies message. */
+__u32
+kgss_verify_mic(struct gss_ctx  *context_handle,
+                rawobj_t        *message,
+                rawobj_t        *mic_token,
+                __u32           *qstate)
+{
+        LASSERT(context_handle);
+        LASSERT(context_handle->mech_type);
+        LASSERT(context_handle->mech_type->gm_ops);
+        LASSERT(context_handle->mech_type->gm_ops->gss_verify_mic);
+
+        return context_handle->mech_type->gm_ops
+                ->gss_verify_mic(context_handle,
+                                 message,
+                                 mic_token,
+                                 qstate);
+}
+
+__u32
+kgss_wrap(struct gss_ctx        *context_handle,
+          __u32                  qop,
+          rawobj_buf_t          *inbuf,
+          rawobj_t              *outbuf)
+{
+        LASSERT(context_handle);
+        LASSERT(context_handle->mech_type);
+        LASSERT(context_handle->mech_type->gm_ops);
+        LASSERT(context_handle->mech_type->gm_ops->gss_wrap);
+
+        return context_handle->mech_type->gm_ops
+                ->gss_wrap(context_handle, qop, inbuf, outbuf);
+}
+
+__u32
+kgss_unwrap(struct gss_ctx        *context_handle,
+            __u32                  qop,
+            rawobj_t              *inbuf,
+            rawobj_t              *outbuf)
+{
+        LASSERT(context_handle);
+        LASSERT(context_handle->mech_type);
+        LASSERT(context_handle->mech_type->gm_ops);
+        LASSERT(context_handle->mech_type->gm_ops->gss_unwrap);
+
+        return context_handle->mech_type->gm_ops
+                ->gss_unwrap(context_handle, qop, inbuf, outbuf);
+}
+
+
+/* gss_delete_sec_context: free all resources associated with context_handle.
+ * Note this differs from the RFC 2744-specified prototype in that we don't
+ * bother returning an output token, since it would never be used anyway. */
+
+__u32
+kgss_delete_sec_context(struct gss_ctx  **context_handle)
+{
+        struct gss_api_mech *mech;
+
+        CDEBUG(D_SEC, "deleting %p\n", *context_handle);
+
+        if (!*context_handle)
+                return(GSS_S_NO_CONTEXT);
+
+        mech = (*context_handle)->mech_type;
+        if ((*context_handle)->internal_ctx_id != 0) {
+                LASSERT(mech);
+                LASSERT(mech->gm_ops);
+                LASSERT(mech->gm_ops->gss_delete_sec_context);
+                mech->gm_ops->gss_delete_sec_context(
+                                        (*context_handle)->internal_ctx_id);
+        }
+        if (mech)
+                kgss_mech_put(mech);
+
+        OBD_FREE(*context_handle, sizeof(**context_handle));
+        *context_handle = NULL;
+        return GSS_S_COMPLETE;
+}
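+
+/*
+ * A minimal, hypothetical sketch of how a caller is expected to drive the
+ * switch layer above; error checking omitted and the token/message object
+ * names are illustrative only:
+ *
+ *      struct gss_api_mech *mech = kgss_name_to_mech("krb5");
+ *      struct gss_ctx *ctx;
+ *      rawobj_t msg, mic;
+ *      __u32 qstate;
+ *
+ *      kgss_import_sec_context(&input_token, mech, &ctx);
+ *      kgss_get_mic(ctx, 0, &msg, &mic);
+ *      kgss_verify_mic(ctx, &msg, &mic, &qstate);
+ *      kgss_delete_sec_context(&ctx);
+ *      kgss_mech_put(mech);
+ */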
diff --git a/lustre/sec/gss/rawobj.c b/lustre/sec/gss/rawobj.c
new file mode 100644 (file)
index 0000000..6c6edc4
--- /dev/null
@@ -0,0 +1,170 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_SEC
+#ifdef __KERNEL__
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/crypto.h>
+#else
+#include <liblustre.h>
+#endif
+
+#include <libcfs/kp30.h>
+#include <linux/obd.h>
+#include <linux/obd_class.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_idl.h>
+#include <linux/lustre_sec.h>
+
+#include "gss_internal.h"
+
+int rawobj_alloc(rawobj_t *obj, char *buf, int len)
+{
+        LASSERT(obj);
+        LASSERT(len >= 0);
+
+        obj->len = len;
+        if (len) {
+                OBD_ALLOC(obj->data, len);
+                if (!obj->data)
+                        RETURN(-ENOMEM);
+                memcpy(obj->data, buf, len);
+        } else
+                obj->data = NULL;
+        return 0;
+}
+
+void rawobj_free(rawobj_t *obj)
+{
+        LASSERT(obj);
+
+        if (obj->len) {
+                LASSERT(obj->data);
+                OBD_FREE(obj->data, obj->len);
+                obj->len = 0;
+                obj->data = NULL;
+        } else
+                LASSERT(!obj->data);
+}
+
+int rawobj_equal(rawobj_t *a, rawobj_t *b)
+{
+        LASSERT(a && b);
+
+        return (a->len == b->len &&
+                !memcmp(a->data, b->data, a->len));
+}
+
+int rawobj_dup(rawobj_t *dest, rawobj_t *src)
+{
+        LASSERT(src && dest);
+
+        dest->len = src->len;
+        if (dest->len) {
+                OBD_ALLOC(dest->data, dest->len);
+                if (!dest->data)
+                        return -ENOMEM;
+                memcpy(dest->data, src->data, dest->len);
+        } else
+                dest->data = NULL;
+        return 0;
+}
+
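+/*
+ * On-the-wire form produced by rawobj_serialize() and consumed by
+ * rawobj_extract(): a 32-bit little-endian length followed by the data,
+ * padded out to a 4-byte boundary.  rawobj_extract_local() reads the same
+ * layout in host byte order and without the padding.
+ */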
+int rawobj_serialize(rawobj_t *obj, __u32 **buf, __u32 *buflen)
+{
+        __u32 len;
+
+        LASSERT(obj);
+        LASSERT(buf);
+        LASSERT(buflen);
+
+        len = size_round4(obj->len);
+
+        if (*buflen < 4 + len) {
+                CERROR("buflen %u <  %u\n", *buflen, 4 + len);
+                return -EINVAL;
+        }
+
+        *(*buf)++ = cpu_to_le32(obj->len);
+        memcpy(*buf, obj->data, obj->len);
+        *buf += (len >> 2);
+        *buflen -= (4 + len);
+
+        return 0;
+}
+
+static int __rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen,
+                       int alloc, int local)
+{
+        __u32 len;
+
+        if (*buflen < sizeof(__u32)) {
+               CERROR("buflen %u\n", *buflen);
+                return -EINVAL;
+        }
+
+        obj->len = *(*buf)++;
+        if (!local)
+                obj->len = le32_to_cpu(obj->len);
+        *buflen -= sizeof(__u32);
+
+        if (!obj->len) {
+                obj->data = NULL;
+                return 0;
+        }
+
+        len = local ? obj->len : size_round4(obj->len);
+        if (*buflen < len) {
+               CERROR("buflen %u < %u\n", *buflen, len);
+                return -EINVAL;
+        }
+
+        if (!alloc)
+                obj->data = (__u8 *) *buf;
+        else {
+                OBD_ALLOC(obj->data, obj->len);
+                if (!obj->data) {
+                        CERROR("failed to alloc %u bytes\n", obj->len);
+                        return -ENOMEM;
+                }
+                memcpy(obj->data, *buf, obj->len);
+        }
+
+        *((char **)buf) += len;
+        *buflen -= len;
+
+        return 0;
+}
+
+int rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen)
+{
+        return __rawobj_extract(obj, buf, buflen, 0, 0);
+}
+
+int rawobj_extract_local(rawobj_t *obj, __u32 **buf, __u32 *buflen)
+{
+        return __rawobj_extract(obj, buf, buflen, 0, 1);
+}
diff --git a/lustre/sec/gss/sec_gss.c b/lustre/sec/gss/sec_gss.c
new file mode 100644 (file)
index 0000000..db89a71
--- /dev/null
@@ -0,0 +1,1799 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Modifications for Lustre
+ * Copyright 2004, Cluster File Systems, Inc.
+ * All rights reserved
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ * linux/net/sunrpc/auth_gss.c
+ *
+ * RPCSEC_GSS client authentication.
+ *
+ *  Copyright (c) 2000 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Dug Song       <dugsong@monkey.org>
+ *  Andy Adamson   <andros@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $Id: sec_gss.c,v 1.2 2005/03/31 22:18:24 ericm Exp $
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_SEC
+#ifdef __KERNEL__
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/random.h>
+/* for rpc_pipefs */
+struct rpc_clnt;
+#include <linux/sunrpc/rpc_pipe_fs.h>
+#else
+#include <liblustre.h>
+#endif
+
+#include <libcfs/kp30.h>
+#include <linux/obd.h>
+#include <linux/obd_class.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_idl.h>
+#include <linux/lustre_net.h>
+#include <linux/lustre_import.h>
+#include <linux/lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+#define GSS_CREDCACHE_EXPIRE    (60)               /* 1 minute */
+#define GSS_CRED_EXPIRE         (8 * 60 * 60)      /* 8 hours */
+#define GSS_CRED_SIGN_SIZE      (1024)
+#define GSS_CRED_VERIFY_SIZE    (56)
+
+#define LUSTRE_PIPEDIR          "/lustre"
+
+/**********************************************
+ * gss security init/fini helper              *
+ **********************************************/
+
+#define SECINIT_RPC_TIMEOUT     (10)
+#define SECFINI_RPC_TIMEOUT     (10)
+
+static int secinit_compose_request(struct obd_import *imp,
+                                   char *buf, int bufsize,
+                                   char __user *token)
+{
+        struct ptlrpcs_wire_hdr *hdr;
+        struct lustre_msg       *lmsg;
+        char __user             *token_buf;
+        __u64                    token_size;
+        __u32                    lmsg_size, *p;
+        int rc;
+
+        lmsg_size = lustre_msg_size(0, NULL);
+
+        if (copy_from_user(&token_size, token, sizeof(token_size))) {
+                CERROR("read token error\n");
+                return -EFAULT;
+        }
+        if (sizeof(*hdr) + lmsg_size + size_round(token_size) > bufsize) {
+                CERROR("token size "LPU64" too large\n", token_size);
+                return -EINVAL;
+        }
+
+        if (copy_from_user(&token_buf, (token + sizeof(token_size)),
+                           sizeof(void*))) {
+                CERROR("read token buf pointer error\n");
+                return -EFAULT;
+        }
+
+        /* security wire hdr */
+        hdr = buf_to_sec_hdr(buf);
+        hdr->flavor  = cpu_to_le32(PTLRPC_SEC_GSS);
+        hdr->sectype = cpu_to_le32(PTLRPC_SEC_TYPE_NONE);
+        hdr->msg_len = cpu_to_le32(lmsg_size);
+        hdr->sec_len = cpu_to_le32(7 * 4 + token_size);
+
+        /* lustre message */
+        lmsg = buf_to_lustre_msg(buf);
+        lustre_init_msg(lmsg, 0, NULL, NULL);
+        lmsg->handle   = imp->imp_remote_handle;
+        lmsg->type     = PTL_RPC_MSG_REQUEST;
+        lmsg->opc      = SEC_INIT;
+        lmsg->flags    = 0;
+        lmsg->conn_cnt = imp->imp_conn_cnt;
+
+        p = (__u32 *) (buf + sizeof(*hdr) + lmsg_size);
+
+        /* gss hdr */
+        *p++ = cpu_to_le32(PTLRPC_SEC_GSS_VERSION);     /* gss version */
+        *p++ = cpu_to_le32(PTLRPC_SEC_GSS_KRB5I);       /* subflavor */
+        *p++ = cpu_to_le32(PTLRPC_GSS_PROC_INIT);       /* proc */
+        *p++ = cpu_to_le32(0);                          /* seq */
+        *p++ = cpu_to_le32(PTLRPC_GSS_SVC_NONE);        /* service */
+        *p++ = cpu_to_le32(0);                          /* context handle */
+
+        /* now the token part */
+        *p++ = (__u32)(cpu_to_le64(token_size));
+        LASSERT(((char *)p - buf) + token_size <= bufsize);
+
+        rc = copy_from_user(p, token_buf, token_size);
+        if (rc) {
+                CERROR("can't copy token\n");
+                return -EFAULT;
+        }
+
+        rc = size_round(((char *)p - buf) + token_size);
+        return rc;
+}
+
+static int secinit_parse_reply(char *repbuf, int replen,
+                               char __user *outbuf, int outlen)
+{
+        __u32 *p = (__u32 *)repbuf;
+        __u32 lmsg_len, sec_len, status, major, minor, seq, obj_len, round_len;
+        __u32 effective = 0;
+
+        if (replen <= (4 + 6) * 4) {
+                CERROR("reply size %d too small\n", replen);
+                return -EINVAL;
+        }
+
+        lmsg_len = le32_to_cpu(p[2]);
+        sec_len = le32_to_cpu(p[3]);
+
+        /* sanity checks */
+        if (p[0] != cpu_to_le32(PTLRPC_SEC_GSS) ||
+            p[1] != cpu_to_le32(PTLRPC_SEC_TYPE_NONE)) {
+                CERROR("unexpected reply\n");
+                return -EINVAL;
+        }
+        if (lmsg_len % 8 ||
+            4 * 4 + lmsg_len + sec_len > replen) {
+                CERROR("unexpected reply\n");
+                return -EINVAL;
+        }
+        if (sec_len > outlen) {
+                CERROR("outbuf too small\n");
+                return -EINVAL;
+        }
+
+        p += 4;                 /* skip hdr */
+        p += lmsg_len / 4;      /* skip lmsg */
+        effective = 0;
+
+        status = le32_to_cpu(*p++);
+        major = le32_to_cpu(*p++);
+        minor = le32_to_cpu(*p++);
+        seq = le32_to_cpu(*p++);
+        effective += 4 * 4;
+
+        copy_to_user(outbuf, &status, 4);
+        outbuf += 4;
+        copy_to_user(outbuf, &major, 4);
+        outbuf += 4;
+        copy_to_user(outbuf, &minor, 4);
+        outbuf += 4;
+        copy_to_user(outbuf, &seq, 4);
+        outbuf += 4;
+
+        obj_len = le32_to_cpu(*p++);
+        round_len = (obj_len + 3) & ~ 3;
+        copy_to_user(outbuf, &obj_len, 4);
+        outbuf += 4;
+        copy_to_user(outbuf, (char *)p, round_len);
+        p += round_len / 4;
+        outbuf += round_len;
+        effective += 4 + round_len;
+
+        obj_len = le32_to_cpu(*p++);
+        round_len = (obj_len + 3) & ~ 3;
+        copy_to_user(outbuf, &obj_len, 4);
+        outbuf += 4;
+        copy_to_user(outbuf, (char *)p, round_len);
+        p += round_len / 4;
+        outbuf += round_len;
+        effective += 4 + round_len;
+
+        return effective;
+}
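For reference, the reply that secinit_parse_reply() walks above has the following shape (summarized from the code; shown only as an aid):

/*
 *   flavor | sectype | msg_len | sec_len      (4 LE words, sanity checked)
 *   lustre_msg                                (msg_len bytes, skipped here)
 *   status | major | minor | seq              (copied out to user space)
 *   obj_len | data padded to 4 bytes          (first reply object)
 *   obj_len | data padded to 4 bytes          (second reply object)
 */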
+
+/* input:
+ *   1. ptr to the obd name
+ *   2. ptr to the send token (a size followed by a pointer to the token data)
+ *   3. ptr to the output buffer
+ *   4. output buffer size
+ * output:
+ *   1. return code, 0 on success
+ *   2. unused
+ *   3. output data, written through the buffer pointer above
+ *   4. output data size
+ *
+ * return:
+ *   < 0: error
+ *   = 0: success
+ *
+ * FIXME: this interface looks strange and should be reimplemented
+ */
+static int gss_send_secinit_rpc(__user char *buffer, unsigned long count)
+{
+        struct obd_import *imp;
+        const int reqbuf_size = 1024;
+        const int repbuf_size = 1024;
+        char *reqbuf, *repbuf;
+        struct obd_device *obd;
+        char obdname[64];
+        long inbuf[4], lsize;
+        int rc, reqlen, replen;
+
+        if (count != 4 * sizeof(long)) {
+                CERROR("count %lu\n", count);
+                RETURN(-EINVAL);
+        }
+        if (copy_from_user(inbuf, buffer, count)) {
+                CERROR("Invalid pointer\n");
+                RETURN(-EFAULT);
+        }
+
+        /* take name */
+        if (strncpy_from_user(obdname, (char *)inbuf[0],
+                              sizeof(obdname)) <= 0) {
+                CERROR("Invalid obdname pointer\n");
+                RETURN(-EFAULT);
+        }
+
+        obd = class_name2obd(obdname);
+        if (!obd) {
+                CERROR("no such obd %s\n", obdname);
+                RETURN(-EINVAL);
+        }
+        if (strcmp(obd->obd_type->typ_name, "mdc") &&
+            strcmp(obd->obd_type->typ_name, "osc")) {
+                CERROR("%s not a mdc/osc device\n", obdname);
+                RETURN(-EINVAL);
+        }
+
+        imp = class_import_get(obd->u.cli.cl_import);
+
+        OBD_ALLOC(reqbuf, reqbuf_size);
+        OBD_ALLOC(repbuf, repbuf_size);
+
+        if (!reqbuf || !repbuf) {
+                CERROR("Can't alloc buffer: %p/%p\n", reqbuf, repbuf);
+                GOTO(out_free, rc = -ENOMEM);
+        }
+
+        /* get token */
+        reqlen = secinit_compose_request(imp, reqbuf, reqbuf_size,
+                                         (char *)inbuf[1]);
+        if (reqlen < 0)
+                GOTO(out_free, rc = reqlen);
+
+        replen = repbuf_size;
+        rc = ptlrpc_do_rawrpc(imp, reqbuf, reqlen,
+                              repbuf, &replen, SECINIT_RPC_TIMEOUT);
+        if (rc)
+                GOTO(out_free, rc);
+
+        if (replen > inbuf[3]) {
+                CERROR("output buffer size %ld too small, need %d\n",
+                        inbuf[3], replen);
+                GOTO(out_free, rc = -EINVAL);
+        }
+
+        lsize = secinit_parse_reply(repbuf, replen,
+                                    (char *)inbuf[2], (int)inbuf[3]);
+        if (lsize < 0)
+                GOTO(out_free, rc = (int)lsize);
+
+        copy_to_user(buffer + 3 * sizeof(long), &lsize, sizeof(lsize));
+        lsize = 0;
+        copy_to_user((char*)buffer, &lsize, sizeof(lsize));
+        rc = 0;
+out_free:
+        class_import_put(imp);
+        if (repbuf)
+                OBD_FREE(repbuf, repbuf_size);
+        if (reqbuf)
+                OBD_FREE(reqbuf, reqbuf_size);
+        RETURN(rc);
+}
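The four-long argument block consumed by gss_send_secinit_rpc() is filled in by the caller in user space (presumably the gss daemon). A hypothetical caller-side sketch, with invented names and only the layout taken from the code above:

#include <stdint.h>

struct secinit_token_arg {              /* what inbuf[1] points at */
        uint64_t  token_size;           /* read first via copy_from_user() */
        void     *token_buf;            /* then the pointer to the raw gss token */
};

static void demo_fill_secinit_args(long args[4], const char *obdname,
                                   struct secinit_token_arg *tok,
                                   void *outbuf, long outlen)
{
        args[0] = (long) obdname;       /* name of the target mdc/osc obd */
        args[1] = (long) tok;           /* send token descriptor */
        args[2] = (long) outbuf;        /* reply token is written here */
        args[3] = outlen;               /* replaced by the reply size on success */
}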
+
+static int gss_send_secfini_rpc(struct obd_import *imp,
+                                char *reqbuf, int reqlen)
+{
+        const int repbuf_size = 1024;
+        char *repbuf;
+        int replen = repbuf_size;
+        int rc;
+
+        OBD_ALLOC(repbuf, repbuf_size);
+        if (!repbuf) {
+                CERROR("Out of memory\n");
+                return -ENOMEM;
+        }
+
+        rc = ptlrpc_do_rawrpc(imp, reqbuf, reqlen, repbuf, &replen,
+                              SECFINI_RPC_TIMEOUT);
+
+        OBD_FREE(repbuf, repbuf_size);
+        return rc;
+}
+
+/**********************************************
+ * structure definitions                      *
+ **********************************************/
+struct gss_sec {
+        struct ptlrpc_sec       gs_base;
+        struct gss_api_mech    *gs_mech;
+#ifdef __KERNEL__
+        spinlock_t              gs_lock;
+        struct list_head        gs_upcalls;
+        char                    gs_pipepath[64];
+        struct dentry          *gs_depipe;
+#endif
+};
+
+static rwlock_t gss_ctx_lock = RW_LOCK_UNLOCKED;
+
+#ifdef __KERNEL__
+
+struct gss_upcall_msg {
+        struct rpc_pipe_msg             gum_base;
+        atomic_t                        gum_refcount;
+        struct list_head                gum_list;
+        struct gss_sec                 *gum_gsec;
+        wait_queue_head_t               gum_waitq;
+        char                            gum_obdname[64];
+        uid_t                           gum_uid;
+        __u32                           gum_ip; /* XXX IPv6? */
+        __u32                           gum_svc;
+        __u32                           gum_pad;
+};
+
+/**********************************************
+ * rpc_pipe upcall helpers                    *
+ **********************************************/
+static
+void gss_release_msg(struct gss_upcall_msg *gmsg)
+{
+        ENTRY;
+        LASSERT(atomic_read(&gmsg->gum_refcount) > 0);
+
+        if (!atomic_dec_and_test(&gmsg->gum_refcount)) {
+                CDEBUG(D_SEC, "gmsg %p ref %d\n", gmsg,
+                       atomic_read(&gmsg->gum_refcount));
+                EXIT;
+                return;
+        }
+        LASSERT(list_empty(&gmsg->gum_list));
+        OBD_FREE(gmsg, sizeof(*gmsg));
+        EXIT;
+}
+
+static void
+gss_unhash_msg_nolock(struct gss_upcall_msg *gmsg)
+{
+        ENTRY;
+        if (list_empty(&gmsg->gum_list)) {
+                EXIT;
+                return;
+        }
+        /* FIXME we should not do this: while we are still in the upper
+         * upcall queue a downcall will call unhash_msg, so a later put_msg
+         * might free the msg buffer before it has been dequeued XXX */
+        list_del_init(&gmsg->gum_base.list);
+        /* FIXME */
+
+        list_del_init(&gmsg->gum_list);
+        wake_up(&gmsg->gum_waitq);
+        atomic_dec(&gmsg->gum_refcount);
+        CDEBUG(D_SEC, "gmsg %p refcount now %d\n",
+               gmsg, atomic_read(&gmsg->gum_refcount));
+        LASSERT(atomic_read(&gmsg->gum_refcount) > 0);
+        EXIT;
+}
+
+static void
+gss_unhash_msg(struct gss_upcall_msg *gmsg)
+{
+        struct gss_sec *gsec = gmsg->gum_gsec;
+
+        spin_lock(&gsec->gs_lock);
+        gss_unhash_msg_nolock(gmsg);
+        spin_unlock(&gsec->gs_lock);
+}
+
+static
+struct gss_upcall_msg * gss_find_upcall(struct gss_sec *gsec,
+                                        char *obdname,
+                                        uid_t uid, __u32 dest_ip)
+{
+        struct gss_upcall_msg *gmsg;
+        ENTRY;
+
+        list_for_each_entry(gmsg, &gsec->gs_upcalls, gum_list) {
+                if (gmsg->gum_uid != uid)
+                        continue;
+                if (gmsg->gum_ip != dest_ip)
+                        continue;
+                if (strcmp(gmsg->gum_obdname, obdname))
+                        continue;
+                atomic_inc(&gmsg->gum_refcount);
+                CDEBUG(D_SEC, "found gmsg at %p: obdname %s, uid %d, ref %d\n",
+                       gmsg, obdname, uid, atomic_read(&gmsg->gum_refcount));
+                RETURN(gmsg);
+        }
+        RETURN(NULL);
+}
+
+static void gss_init_upcall_msg(struct gss_upcall_msg *gmsg,
+                                struct gss_sec *gsec,
+                                char *obdname,
+                                uid_t uid, __u32 dest_ip, __u32 svc)
+{
+        struct rpc_pipe_msg *rpcmsg;
+        ENTRY;
+
+        /* 2 refs: 1 for hash, 1 for current user */
+        init_waitqueue_head(&gmsg->gum_waitq);
+        list_add(&gmsg->gum_list, &gsec->gs_upcalls);
+        atomic_set(&gmsg->gum_refcount, 2);
+        gmsg->gum_gsec = gsec;
+        strncpy(gmsg->gum_obdname, obdname, sizeof(gmsg->gum_obdname));
+        gmsg->gum_uid = uid;
+        gmsg->gum_ip = dest_ip;
+        gmsg->gum_svc = svc;
+
+        rpcmsg = &gmsg->gum_base;
+        rpcmsg->data = &gmsg->gum_uid;
+        rpcmsg->len = sizeof(gmsg->gum_uid) + sizeof(gmsg->gum_ip) +
+                      sizeof(gmsg->gum_svc) + sizeof(gmsg->gum_pad);
+        EXIT;
+}
+#endif /* __KERNEL__ */
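The upcall body queued on the rpc_pipe above is simply the four 32-bit words starting at gum_uid. A daemon-side view of that record might look like the sketch below (the struct name is invented; the field order follows struct gss_upcall_msg):

#include <stdint.h>

struct demo_gss_upcall_body {
        uint32_t uid;   /* user the context is requested for */
        uint32_t ip;    /* peer IPv4 address (low 32 bits of the nid) */
        uint32_t svc;   /* 0 for mdc, 1 for osc, see gss_cred_refresh() below */
        uint32_t pad;
};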
+
+/********************************************
+ * gss cred manipulation helpers            *
+ ********************************************/
+static
+int gss_cred_is_uptodate_ctx(struct ptlrpc_cred *cred)
+{
+        struct gss_cred *gcred = container_of(cred, struct gss_cred, gc_base);
+        int res = 0;
+
+        read_lock(&gss_ctx_lock);
+        if ((cred->pc_flags & PTLRPC_CRED_UPTODATE) && gcred->gc_ctx)
+                res = 1;
+        read_unlock(&gss_ctx_lock);
+        return res;
+}
+
+static inline
+struct gss_cl_ctx * gss_get_ctx(struct gss_cl_ctx *ctx)
+{
+        atomic_inc(&ctx->gc_refcount);
+        return ctx;
+}
+
+static
+void gss_destroy_ctx(struct gss_cl_ctx *ctx)
+{
+        ENTRY;
+
+        CDEBUG(D_SEC, "destroy cl_ctx %p\n", ctx);
+        if (ctx->gc_gss_ctx)
+                kgss_delete_sec_context(&ctx->gc_gss_ctx);
+
+        if (ctx->gc_wire_ctx.len > 0) {
+                OBD_FREE(ctx->gc_wire_ctx.data, ctx->gc_wire_ctx.len);
+                ctx->gc_wire_ctx.len = 0;
+        }
+
+        OBD_FREE(ctx, sizeof(*ctx));
+}
+
+static
+void gss_put_ctx(struct gss_cl_ctx *ctx)
+{
+        if (atomic_dec_and_test(&ctx->gc_refcount))
+                gss_destroy_ctx(ctx);
+}
+
+static
+struct gss_cl_ctx *gss_cred_get_ctx(struct ptlrpc_cred *cred)
+{
+        struct gss_cred *gcred = container_of(cred, struct gss_cred, gc_base);
+        struct gss_cl_ctx *ctx = NULL;
+
+        read_lock(&gss_ctx_lock);
+        if (gcred->gc_ctx)
+                ctx = gss_get_ctx(gcred->gc_ctx);
+        read_unlock(&gss_ctx_lock);
+        return ctx;
+}
+
+static
+void gss_cred_set_ctx(struct ptlrpc_cred *cred, struct gss_cl_ctx *ctx)
+{
+        struct gss_cred *gcred = container_of(cred, struct gss_cred, gc_base);
+        struct gss_cl_ctx *old;
+        __u64 ctx_expiry;
+        ENTRY;
+
+        if (kgss_inquire_context(ctx->gc_gss_ctx, &ctx_expiry)) {
+                CERROR("unable to get expire time\n");
+                ctx_expiry = 1; /* make it expired now */
+        }
+        cred->pc_expire = (unsigned long) ctx_expiry;
+
+        write_lock(&gss_ctx_lock);
+        old = gcred->gc_ctx;
+        gcred->gc_ctx = ctx;
+        cred->pc_flags |= PTLRPC_CRED_UPTODATE;
+        write_unlock(&gss_ctx_lock);
+        if (old)
+                gss_put_ctx(old);
+
+        CWARN("client refreshed gss cred %p(uid %u)\n", cred, cred->pc_uid);
+        EXIT;
+}
+
+static int
+simple_get_bytes(char **buf, __u32 *buflen, void *res, __u32 reslen)
+{
+        if (*buflen < reslen) {
+                CERROR("buflen %u < %u\n", *buflen, reslen);
+                return -EINVAL;
+        }
+
+        memcpy(res, *buf, reslen);
+        *buf += reslen;
+        *buflen -= reslen;
+        return 0;
+}
+
+/* data passed down:
+ *  - uid
+ *  - dest_ip
+ *  - timeout
+ *  - gc_win / error
+ *  - wire_ctx (rawobj)
+ *  - mech_ctx (rawobj)
+ */
+static
+int gss_parse_init_downcall(struct gss_api_mech *gm, rawobj_t *buf,
+                            struct gss_cl_ctx **gc, struct vfs_cred *vcred,
+                            __u32 *dest_ip, int *gss_err)
+{
+        char *p = buf->data;
+        __u32 len = buf->len;
+        struct gss_cl_ctx *ctx;
+        rawobj_t tmp_buf;
+        unsigned int timeout;
+        int err = -EIO;
+        ENTRY;
+
+        *gc = NULL;
+
+        OBD_ALLOC(ctx, sizeof(*ctx));
+        if (!ctx)
+                RETURN(-ENOMEM);
+
+        ctx->gc_proc = RPC_GSS_PROC_DATA;
+        ctx->gc_seq = 0;
+        spin_lock_init(&ctx->gc_seq_lock);
+        atomic_set(&ctx->gc_refcount,1);
+
+        if (simple_get_bytes(&p, &len, &vcred->vc_uid, sizeof(vcred->vc_uid)))
+                GOTO(err_free_ctx, err);
+        vcred->vc_pag = vcred->vc_uid; /* FIXME */
+        if (simple_get_bytes(&p, &len, dest_ip, sizeof(*dest_ip)))
+                GOTO(err_free_ctx, err);
+        /* FIXME: discarded timeout for now */
+        if (simple_get_bytes(&p, &len, &timeout, sizeof(timeout)))
+                GOTO(err_free_ctx, err);
+        *gss_err = 0;
+        if (simple_get_bytes(&p, &len, &ctx->gc_win, sizeof(ctx->gc_win)))
+                GOTO(err_free_ctx, err);
+        /* gssd signals an error by passing ctx->gc_win = 0: */
+        if (!ctx->gc_win) {
+                /* in which case the next int is an error code: */
+                if (simple_get_bytes(&p, &len, gss_err, sizeof(*gss_err)))
+                        GOTO(err_free_ctx, err);
+                GOTO(err_free_ctx, err = 0);
+        }
+        if (rawobj_extract_local(&tmp_buf, (__u32 **) &p, &len))
+                GOTO(err_free_ctx, err);
+        if (rawobj_dup(&ctx->gc_wire_ctx, &tmp_buf)) {
+                GOTO(err_free_ctx, err = -ENOMEM);
+        }
+        if (rawobj_extract_local(&tmp_buf, (__u32 **) &p, &len))
+                GOTO(err_free_wire_ctx, err);
+        if (len) {
+                CERROR("unexpected trailing %u bytes\n", len);
+                GOTO(err_free_wire_ctx, err);
+        }
+        if (kgss_import_sec_context(&tmp_buf, gm, &ctx->gc_gss_ctx))
+                GOTO(err_free_wire_ctx, err);
+
+        *gc = ctx;
+        RETURN(0);
+
+err_free_wire_ctx:
+        if (ctx->gc_wire_ctx.data)
+                OBD_FREE(ctx->gc_wire_ctx.data, ctx->gc_wire_ctx.len);
+err_free_ctx:
+        OBD_FREE(ctx, sizeof(*ctx));
+        CDEBUG(D_SEC, "err_code %d, gss code %d\n", err, *gss_err);
+        return err;
+}
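For reference, a sketch of the buffer gss_parse_init_downcall() expects back from user space, assuming host-endian 32-bit integers (including the uid) and the unpadded "local" rawobj layout. The encoder below is illustrative only, not part of the patch:

#include <stdint.h>
#include <string.h>

/* uid, dest_ip, timeout, gc_win, then (wire_ctx, mech_ctx) as "len + data"
 * objects.  A gc_win of 0 would instead be followed by a single error code. */
static size_t demo_encode_init_downcall(char *p, uint32_t uid, uint32_t ip,
                                        uint32_t timeout, uint32_t win,
                                        const void *wire, uint32_t wire_len,
                                        const void *mech, uint32_t mech_len)
{
        char *start = p;

        memcpy(p, &uid, 4);        p += 4;
        memcpy(p, &ip, 4);         p += 4;
        memcpy(p, &timeout, 4);    p += 4;
        memcpy(p, &win, 4);        p += 4;
        memcpy(p, &wire_len, 4);   p += 4;
        memcpy(p, wire, wire_len); p += wire_len;
        memcpy(p, &mech_len, 4);   p += 4;
        memcpy(p, mech, mech_len); p += mech_len;
        return (size_t)(p - start);
}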
+
+/***************************************
+ * cred APIs                           *
+ ***************************************/
+#ifdef __KERNEL__
+static int gss_cred_refresh(struct ptlrpc_cred *cred)
+{
+        struct obd_import          *import;
+        struct gss_sec             *gsec;
+        struct gss_upcall_msg      *gss_msg, *gss_new;
+        struct dentry              *dentry;
+        char                       *obdname, *obdtype;
+        wait_queue_t                wait;
+        uid_t                       uid = cred->pc_uid;
+        ptl_nid_t                   peer_nid;
+        __u32                       dest_ip, svc;
+        int                         res;
+        ENTRY;
+
+        if (ptlrpcs_cred_is_uptodate(cred))
+                RETURN(0);
+
+        LASSERT(cred->pc_sec);
+        LASSERT(cred->pc_sec->ps_import);
+        LASSERT(cred->pc_sec->ps_import->imp_obd);
+
+        import = cred->pc_sec->ps_import;
+        if (!import->imp_connection) {
+                CERROR("import has no connection set\n");
+                RETURN(-EINVAL);
+        }
+
+        peer_nid = import->imp_connection->c_peer.peer_id.nid;
+        dest_ip = (__u32) (peer_nid & 0xFFFFFFFF);
+
+        obdtype = import->imp_obd->obd_type->typ_name;
+        if (!strcmp(obdtype, "mdc"))
+                svc = 0;
+        else if (!strcmp(obdtype, "osc"))
+                svc = 1;
+        else {
+                CERROR("gss on %s?\n", obdtype);
+                RETURN(-EINVAL);
+        }
+
+        gsec = container_of(cred->pc_sec, struct gss_sec, gs_base);
+        obdname = import->imp_obd->obd_name;
+        dentry = gsec->gs_depipe;
+        gss_new = NULL;
+        res = 0;
+
+        CWARN("Initiate gss context %p(%u@%s)\n",
+               container_of(cred, struct gss_cred, gc_base),
+               uid, import->imp_target_uuid.uuid);
+
+again:
+        spin_lock(&gsec->gs_lock);
+        gss_msg = gss_find_upcall(gsec, obdname, uid, dest_ip);
+        if (gss_msg) {
+                spin_unlock(&gsec->gs_lock);
+                GOTO(waiting, res);
+        }
+        if (!gss_new) {
+                spin_unlock(&gsec->gs_lock);
+                OBD_ALLOC(gss_new, sizeof(*gss_new));
+                if (!gss_new) {
+                        CERROR("fail to alloc memory\n");
+                        RETURN(-ENOMEM);
+                }
+                goto again;
+        }
+        /* so far we've created gss_new */
+        gss_init_upcall_msg(gss_new, gsec, obdname, uid, dest_ip, svc);
+
+        if (gss_cred_is_uptodate_ctx(cred)) {
+                /* someone else had done it for us, simply cancel
+                 * our own upcall */
+                CDEBUG(D_SEC, "cred("LPU64"/%u) has been refreshed by someone "
+                       "else, simply drop our request\n",
+                       cred->pc_pag, cred->pc_uid);
+                gss_unhash_msg_nolock(gss_new);
+                spin_unlock(&gsec->gs_lock);
+                gss_release_msg(gss_new);
+                RETURN(0);
+        }
+
+        /* need to make upcall now */
+        spin_unlock(&gsec->gs_lock);
+        res = rpc_queue_upcall(dentry->d_inode, &gss_new->gum_base);
+        if (res) {
+                CERROR("rpc_queue_upcall failed: %d\n", res);
+                gss_unhash_msg(gss_new);
+                gss_release_msg(gss_new);
+                RETURN(res);
+        }
+        gss_msg = gss_new;
+
+waiting:
+        init_waitqueue_entry(&wait, current);
+        spin_lock(&gsec->gs_lock);
+        add_wait_queue(&gss_msg->gum_waitq, &wait);
+        set_current_state(TASK_INTERRUPTIBLE);
+        spin_unlock(&gsec->gs_lock);
+
+        schedule();
+
+        remove_wait_queue(&gss_msg->gum_waitq, &wait);
+        if (signal_pending(current)) {
+                CERROR("interrupted gss upcall %p\n", gss_msg);
+                res = -EINTR;
+        }
+        gss_release_msg(gss_msg);
+        RETURN(res);
+}
+#else /* !__KERNEL__ */
+extern int lgss_handle_krb5_upcall(uid_t uid, __u32 dest_ip,
+                                   char *obd_name,
+                                   char *buf, int bufsize,
+                                   int (*callback)(char*, unsigned long));
+
+static int gss_cred_refresh(struct ptlrpc_cred *cred)
+{
+        char                    buf[4096];
+        rawobj_t                obj;
+        struct obd_import      *imp;
+        struct gss_sec         *gsec;
+        struct gss_api_mech    *mech;
+        struct gss_cl_ctx      *ctx = NULL;
+        struct vfs_cred         vcred = { 0 };
+        ptl_nid_t               peer_nid;
+        __u32                   dest_ip;
+        __u32                   subflavor;
+        int                     rc, gss_err;
+
+        LASSERT(cred);
+        LASSERT(cred->pc_sec);
+        LASSERT(cred->pc_sec->ps_import);
+        LASSERT(cred->pc_sec->ps_import->imp_obd);
+
+        if (ptlrpcs_cred_is_uptodate(cred))
+                RETURN(0);
+
+        imp = cred->pc_sec->ps_import;
+        peer_nid = imp->imp_connection->c_peer.peer_id.nid;
+        dest_ip = (__u32) (peer_nid & 0xFFFFFFFF);
+        subflavor = cred->pc_sec->ps_flavor.subflavor;
+
+        if (subflavor != PTLRPC_SEC_GSS_KRB5I) {
+                CERROR("unknown subflavor %u\n", subflavor);
+                GOTO(err_out, rc = -EINVAL);
+        }
+
+        rc = lgss_handle_krb5_upcall(cred->pc_uid, dest_ip,
+                                     imp->imp_obd->obd_name,
+                                     buf, sizeof(buf),
+                                     gss_send_secinit_rpc);
+        LASSERT(rc != 0);
+        if (rc < 0)
+                goto err_out;
+
+        obj.data = buf;
+        obj.len = rc;
+
+        gsec = container_of(cred->pc_sec, struct gss_sec, gs_base);
+        mech = gsec->gs_mech;
+        LASSERT(mech);
+        rc = gss_parse_init_downcall(mech, &obj, &ctx, &vcred, &dest_ip,
+                                     &gss_err);
+        if (rc) {
+                CERROR("parse init downcall error %d\n", rc);
+                goto err_out;
+        }
+
+        if (gss_err) {
+                CERROR("cred fresh got gss error %x\n", gss_err);
+                rc = -EINVAL;
+                goto err_out;
+        }
+
+        gss_cred_set_ctx(cred, ctx);
+        LASSERT(gss_cred_is_uptodate_ctx(cred));
+
+        return 0;
+err_out:
+        cred->pc_flags |= PTLRPC_CRED_DEAD;
+        return rc;
+}
+#endif
+
+static int gss_cred_match(struct ptlrpc_cred *cred,
+                          struct ptlrpc_request *req,
+                          struct vfs_cred *vcred)
+{
+        RETURN(cred->pc_pag == vcred->vc_pag);
+}
+
+static int gss_cred_sign(struct ptlrpc_cred *cred,
+                         struct ptlrpc_request *req)
+{
+        struct gss_cred         *gcred;
+        struct gss_cl_ctx       *ctx;
+        rawobj_t                 lmsg, mic;
+        __u32                   *vp, *vpsave, vlen, seclen;
+        __u32                    seqnum, major, rc = 0;
+        ENTRY;
+
+        LASSERT(req->rq_reqbuf);
+        LASSERT(req->rq_cred == cred);
+
+        gcred = container_of(cred, struct gss_cred, gc_base);
+        ctx = gss_cred_get_ctx(cred);
+        if (!ctx) {
+                CERROR("cred %p("LPU64"/%u) invalidated?\n",
+                        cred, cred->pc_pag, cred->pc_uid);
+                RETURN(-EPERM);
+        }
+
+        lmsg.len = req->rq_reqlen;
+        lmsg.data = (__u8 *) req->rq_reqmsg;
+
+        vp = (__u32 *) (lmsg.data + lmsg.len);
+        vlen = req->rq_reqbuf_len - sizeof(struct ptlrpcs_wire_hdr) -
+               lmsg.len;
+        seclen = vlen;
+
+        if (vlen < 6 * 4 + size_round4(ctx->gc_wire_ctx.len)) {
+                CERROR("vlen %d, need %d\n",
+                        vlen, 6 * 4 + size_round4(ctx->gc_wire_ctx.len));
+                rc = -EIO;
+                goto out;
+        }
+
+        spin_lock(&ctx->gc_seq_lock);
+        seqnum = ctx->gc_seq++;
+        spin_unlock(&ctx->gc_seq_lock);
+
+        *vp++ = cpu_to_le32(PTLRPC_SEC_GSS_VERSION);    /* version */
+        *vp++ = cpu_to_le32(PTLRPC_SEC_GSS_KRB5I);      /* subflavor */
+        *vp++ = cpu_to_le32(ctx->gc_proc);              /* proc */
+        *vp++ = cpu_to_le32(seqnum);                    /* seq */
+        *vp++ = cpu_to_le32(PTLRPC_GSS_SVC_INTEGRITY);  /* service */
+        vlen -= 5 * 4;
+
+        if (rawobj_serialize(&ctx->gc_wire_ctx, &vp, &vlen)) {
+                rc = -EIO;
+                goto out;
+        }
+        CDEBUG(D_SEC, "encoded wire_ctx length %d\n", ctx->gc_wire_ctx.len);
+
+        vpsave = vp++;  /* reserve for size */
+        vlen -= 4;
+
+        mic.len = vlen;
+        mic.data = (char *) vp;
+
+        CDEBUG(D_SEC, "reqbuf at %p, lmsg at %p, len %d, mic at %p, len %d\n",
+               req->rq_reqbuf, lmsg.data, lmsg.len, mic.data, mic.len);
+        major = kgss_get_mic(ctx->gc_gss_ctx, GSS_C_QOP_DEFAULT, &lmsg, &mic);
+        if (major) {
+                CERROR("gss compute mic error, major %x\n", major);
+                rc = -EACCES;
+                goto out;
+        }
+
+        *vpsave = cpu_to_le32(mic.len);
+
+        seclen = seclen - vlen + mic.len;
+        buf_to_sec_hdr(req->rq_reqbuf)->sec_len = cpu_to_le32(seclen);
+        req->rq_reqdata_len += size_round(seclen);
+        CDEBUG(D_SEC, "msg size %d, checksum size %d, total sec size %d\n",
+               lmsg.len, mic.len, seclen);
+out:
+        gss_put_ctx(ctx);
+        RETURN(rc);
+}
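For reference, a reading of the request buffer gss_cred_sign() produces for the integrity (krb5i) service; this summarizes the code above and is not normative:

/*
 *   ptlrpcs_wire_hdr  { flavor, sectype, msg_len, sec_len }
 *   lustre_msg        (msg_len bytes -- the payload covered by the MIC)
 *   version | subflavor | proc | seq | svc      (5 LE words)
 *   wire_ctx          (LE length word + data, padded to 4 bytes)
 *   mic               (LE length word + checksum from kgss_get_mic())
 */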
+
+static int gss_cred_verify(struct ptlrpc_cred *cred,
+                           struct ptlrpc_request *req)
+{
+        struct gss_cred        *gcred;
+        struct gss_cl_ctx      *ctx;
+        struct ptlrpcs_wire_hdr *sec_hdr;
+        rawobj_t                lmsg, mic;
+        __u32                   *vp, vlen, subflavor, proc, seq, svc;
+        __u32                   major, minor, rc;
+        ENTRY;
+
+        LASSERT(req->rq_repbuf);
+        LASSERT(req->rq_cred == cred);
+
+        sec_hdr = buf_to_sec_hdr(req->rq_repbuf);
+        vp = (__u32 *) (req->rq_repbuf + sizeof(*sec_hdr) + sec_hdr->msg_len);
+        vlen = sec_hdr->sec_len;
+
+        if (vlen < 7 * 4) {
+                CERROR("reply sec size %u too small\n", vlen);
+                RETURN(-EPROTO);
+        }
+
+        if (*vp++ != cpu_to_le32(PTLRPC_SEC_GSS_VERSION)) {
+                CERROR("reply have different gss version\n");
+                RETURN(-EPROTO);
+        }
+        subflavor = le32_to_cpu(*vp++);
+        proc = le32_to_cpu(*vp++);
+        vlen -= 3 * 4;
+
+        switch (proc) {
+        case PTLRPC_GSS_PROC_DATA:
+                seq = le32_to_cpu(*vp++);
+                svc = le32_to_cpu(*vp++);
+                if (svc != PTLRPC_GSS_SVC_INTEGRITY) {
+                        CERROR("Unknown svc %d\n", svc);
+                        RETURN(-EPROTO);
+                }
+                if (*vp++ != 0) {
+                        CERROR("Unexpected ctx handle\n");
+                        RETURN(-EPROTO);
+                }
+                mic.len = le32_to_cpu(*vp++);
+                vlen -= 4 * 4;
+                if (vlen < mic.len) {
+                        CERROR("vlen %d, mic.len %d\n", vlen, mic.len);
+                        RETURN(-EINVAL);
+                }
+                mic.data = (char *) vp;
+
+                gcred = container_of(cred, struct gss_cred, gc_base);
+                ctx = gss_cred_get_ctx(cred);
+                LASSERT(ctx);
+
+                lmsg.len = sec_hdr->msg_len;
+                lmsg.data = (__u8 *) buf_to_lustre_msg(req->rq_repbuf);
+
+                major = kgss_verify_mic(ctx->gc_gss_ctx, &lmsg, &mic, NULL);
+                if (major != GSS_S_COMPLETE) {
+                        CERROR("gss verify mic error: major %x\n", major);
+                        GOTO(proc_data_out, rc = -EINVAL);
+                }
+
+                req->rq_repmsg = (struct lustre_msg *) lmsg.data;
+                req->rq_replen = lmsg.len;
+
+                /* here we could check that the seq number matches the one
+                 * we sent to the server, but portals already protects us
+                 * against replay attacks, so we may not need to check it
+                 * again.
+                 */
+                rc = 0;
+proc_data_out:
+                gss_put_ctx(ctx);
+                break;
+        case PTLRPC_GSS_PROC_ERR:
+                major = le32_to_cpu(*vp++);
+                minor = le32_to_cpu(*vp++);
+                /* a server returning NO_CONTEXT may be caused by context
+                 * expiry or by a server reboot/failover; we refresh the cred
+                 * transparently to the upper layer.
+                 * In some cases our gss handle may happen to be identical to
+                 * another handle, since the handle itself is not fully
+                 * random. For krb5 this shows up as GSS_S_BAD_SIG (other
+                 * mechanisms may return other gss errors). Here we only
+                 * consider the krb5 mech (FIXME) and try to establish a new
+                 * context.
+                 */
+                if (major == GSS_S_NO_CONTEXT ||
+                    major == GSS_S_BAD_SIG) {
+                        CWARN("req %p: server report cred %p %s, expired?\n",
+                               req, cred, (major == GSS_S_NO_CONTEXT) ?
+                                           "NO_CONTEXT" : "BAD_SIG");
+
+                        ptlrpcs_cred_die(cred);
+                        rc = ptlrpcs_req_replace_dead_cred(req);
+                        if (!rc)
+                                req->rq_ptlrpcs_restart = 1;
+                        else
+                                CERROR("replace dead cred failed %d\n", rc);
+                } else {
+                        CERROR("Unrecognized gss error (%x/%x)\n",
+                                major, minor);
+                        rc = -EACCES;
+                }
+                break;
+        default:
+                CERROR("unknown gss proc %d\n", proc);
+                rc = -EPROTO;
+        }
+
+        RETURN(rc);
+}
+
+static int gss_cred_seal(struct ptlrpc_cred *cred,
+                         struct ptlrpc_request *req)
+{
+        struct gss_cred         *gcred;
+        struct gss_cl_ctx       *ctx;
+        struct ptlrpcs_wire_hdr *sec_hdr;
+        rawobj_buf_t             msg_buf;
+        rawobj_t                 cipher_buf;
+        __u32                   *vp, *vpsave, vlen, seclen;
+        __u32                    major, seqnum, rc = 0;
+        ENTRY;
+
+        LASSERT(req->rq_reqbuf);
+        LASSERT(req->rq_cred == cred);
+
+        gcred = container_of(cred, struct gss_cred, gc_base);
+        ctx = gss_cred_get_ctx(cred);
+        if (!ctx) {
+                CERROR("cred %p("LPU64"/%u) invalidated?\n",
+                        cred, cred->pc_pag, cred->pc_uid);
+                RETURN(-EPERM);
+        }
+
+        vp = (__u32 *) (req->rq_reqbuf + sizeof(*sec_hdr));
+        vlen = req->rq_reqbuf_len - sizeof(*sec_hdr);
+        seclen = vlen;
+
+        if (vlen < 6 * 4 + size_round4(ctx->gc_wire_ctx.len)) {
+                CERROR("vlen %d, need %d\n",
+                        vlen, 6 * 4 + size_round4(ctx->gc_wire_ctx.len));
+                rc = -EIO;
+                goto out;
+        }
+
+        spin_lock(&ctx->gc_seq_lock);
+        seqnum = ctx->gc_seq++;
+        spin_unlock(&ctx->gc_seq_lock);
+
+        *vp++ = cpu_to_le32(PTLRPC_SEC_GSS_VERSION);    /* version */
+        *vp++ = cpu_to_le32(PTLRPC_SEC_GSS_KRB5P);      /* subflavor */
+        *vp++ = cpu_to_le32(ctx->gc_proc);              /* proc */
+        *vp++ = cpu_to_le32(seqnum);                    /* seq */
+        *vp++ = cpu_to_le32(PTLRPC_GSS_SVC_PRIVACY);    /* service */
+        vlen -= 5 * 4;
+
+        if (rawobj_serialize(&ctx->gc_wire_ctx, &vp, &vlen)) {
+                rc = -EIO;
+                goto out;
+        }
+        CDEBUG(D_SEC, "encoded wire_ctx length %d\n", ctx->gc_wire_ctx.len);
+
+        vpsave = vp++;  /* reserve for size */
+        vlen -= 4;
+
+        msg_buf.buf = (__u8 *) req->rq_reqmsg - GSS_PRIVBUF_PREFIX_LEN;
+        msg_buf.buflen = req->rq_reqlen + GSS_PRIVBUF_PREFIX_LEN + GSS_PRIVBUF_SUFFIX_LEN;
+        msg_buf.dataoff = GSS_PRIVBUF_PREFIX_LEN;
+        msg_buf.datalen = req->rq_reqlen;
+
+        cipher_buf.data = (__u8 *) vp;
+        cipher_buf.len = vlen;
+
+        major = kgss_wrap(ctx->gc_gss_ctx, GSS_C_QOP_DEFAULT,
+                          &msg_buf, &cipher_buf);
+        if (major) {
+                CERROR("error wrap: major 0x%x\n", major);
+                GOTO(out, rc = -EINVAL);
+        }
+
+        *vpsave = cpu_to_le32(cipher_buf.len);
+
+        seclen = seclen - vlen + cipher_buf.len;
+        sec_hdr = buf_to_sec_hdr(req->rq_reqbuf);
+        sec_hdr->sec_len = cpu_to_le32(seclen);
+        req->rq_reqdata_len += size_round(seclen);
+
+        CDEBUG(D_SEC, "msg size %d, total sec size %d\n",
+               req->rq_reqlen, seclen);
+out:
+        gss_put_ctx(ctx);
+        RETURN(rc);
+}
+
+static int gss_cred_unseal(struct ptlrpc_cred *cred,
+                           struct ptlrpc_request *req)
+{
+        struct gss_cred        *gcred;
+        struct gss_cl_ctx      *ctx;
+        struct ptlrpcs_wire_hdr *sec_hdr;
+        rawobj_t                cipher_text, plain_text;
+        __u32                   *vp, vlen, subflavor, proc, seq, svc;
+        int                     rc;
+        ENTRY;
+
+        LASSERT(req->rq_repbuf);
+        LASSERT(req->rq_cred == cred);
+
+        sec_hdr = buf_to_sec_hdr(req->rq_repbuf);
+        if (sec_hdr->msg_len != 0) {
+                CERROR("unexpected msg_len %u\n", sec_hdr->msg_len);
+                RETURN(-EPROTO);
+        }
+
+        vp = (__u32 *) (req->rq_repbuf + sizeof(*sec_hdr));
+        vlen = sec_hdr->sec_len;
+
+        if (vlen < 7 * 4) {
+                CERROR("reply sec size %u too small\n", vlen);
+                RETURN(-EPROTO);
+        }
+
+        if (*vp++ != cpu_to_le32(PTLRPC_SEC_GSS_VERSION)) {
+                CERROR("reply have different gss version\n");
+                RETURN(-EPROTO);
+        }
+        subflavor = le32_to_cpu(*vp++);
+        proc = le32_to_cpu(*vp++);
+        seq = le32_to_cpu(*vp++);
+        svc = le32_to_cpu(*vp++);
+        vlen -= 5 * 4;
+
+        switch (proc) {
+        case PTLRPC_GSS_PROC_DATA:
+                if (svc != PTLRPC_GSS_SVC_PRIVACY) {
+                        CERROR("Unknown svc %d\n", svc);
+                        RETURN(-EPROTO);
+                }
+                if (*vp++ != 0) {
+                        CERROR("Unexpected ctx handle\n");
+                        RETURN(-EPROTO);
+                }
+                vlen -= 4;
+
+                cipher_text.len = le32_to_cpu(*vp++);
+                cipher_text.data = (__u8 *) vp;
+                vlen -= 4;
+
+                if (vlen < cipher_text.len) {
+                        CERROR("cipher text is %u but only %u left in buf\n",
+                                cipher_text.len, vlen);
+                        RETURN(-EPROTO);
+                }
+
+                plain_text = cipher_text;
+
+                gcred = container_of(cred, struct gss_cred, gc_base);
+                ctx = gss_cred_get_ctx(cred);
+                LASSERT(ctx);
+
+                rc = kgss_unwrap(ctx->gc_gss_ctx, GSS_C_QOP_DEFAULT,
+                                 &cipher_text, &plain_text);
+                if (rc) {
+                        CERROR("error unwrap: 0x%x\n", rc);
+                        GOTO(proc_out, rc = -EINVAL);
+                }
+
+                req->rq_repmsg = (struct lustre_msg *) vp;
+                req->rq_replen = plain_text.len;
+
+                rc = 0;
+proc_out:
+                gss_put_ctx(ctx);
+                break;
+        default:
+                CERROR("unknown gss proc %d\n", proc);
+                rc = -EPROTO;
+        }
+
+        RETURN(rc);
+}
+
+static void destroy_gss_context(struct ptlrpc_cred *cred)
+{
+        struct ptlrpcs_wire_hdr *hdr;
+        struct lustre_msg       *lmsg;
+        struct gss_cred         *gcred;
+        struct ptlrpc_request    req;
+        struct obd_import       *imp;
+        __u32                   *vp, lmsg_size;
+        ENTRY;
+
+        /* cred's refcount is 0, steal one */
+        atomic_inc(&cred->pc_refcount);
+
+        gcred = container_of(cred, struct gss_cred, gc_base);
+        gcred->gc_ctx->gc_proc = PTLRPC_GSS_PROC_DESTROY;
+        imp = cred->pc_sec->ps_import;
+        LASSERT(imp);
+
+        if (!(cred->pc_flags & PTLRPC_CRED_UPTODATE)) {
+                CWARN("Destroy a dead gss cred %p(%u@%s), don't send rpc\n",
+                       gcred, cred->pc_uid, imp->imp_target_uuid.uuid);
+                atomic_dec(&cred->pc_refcount);
+                EXIT;
+                return;
+        }
+
+        CWARN("client destroy gss cred %p(%u@%s)\n",
+               gcred, cred->pc_uid, imp->imp_target_uuid.uuid);
+
+        lmsg_size = lustre_msg_size(0, NULL);
+        req.rq_reqbuf_len = sizeof(*hdr) + lmsg_size +
+                            ptlrpcs_est_req_payload(cred->pc_sec, lmsg_size);
+
+        OBD_ALLOC(req.rq_reqbuf, req.rq_reqbuf_len);
+        if (!req.rq_reqbuf) {
+                CERROR("Fail to alloc reqbuf, cancel anyway\n");
+                atomic_dec(&cred->pc_refcount);
+                EXIT;
+                return;
+        }
+
+        /* wire hdr */
+        hdr = buf_to_sec_hdr(req.rq_reqbuf);
+        hdr->flavor  = cpu_to_le32(PTLRPC_SEC_GSS);
+        hdr->sectype = cpu_to_le32(PTLRPC_SEC_TYPE_AUTH);
+        hdr->msg_len = cpu_to_le32(lmsg_size);
+        hdr->sec_len = cpu_to_le32(0);
+
+        /* lustre message */
+        lmsg = buf_to_lustre_msg(req.rq_reqbuf);
+        lustre_init_msg(lmsg, 0, NULL, NULL);
+        lmsg->handle   = imp->imp_remote_handle;
+        lmsg->type     = PTL_RPC_MSG_REQUEST;
+        lmsg->opc      = SEC_FINI;
+        lmsg->flags    = 0;
+        lmsg->conn_cnt = imp->imp_conn_cnt;
+        /* add some randomness to these fields */
+        get_random_bytes(&lmsg->last_xid, sizeof(lmsg->last_xid));
+        get_random_bytes(&lmsg->transno, sizeof(lmsg->transno));
+
+        vp = (__u32 *) req.rq_reqbuf;
+
+        req.rq_cred = cred;
+        req.rq_reqmsg = buf_to_lustre_msg(req.rq_reqbuf);
+        req.rq_reqlen = lmsg_size;
+        req.rq_reqdata_len = sizeof(*hdr) + lmsg_size;
+
+        if (gss_cred_sign(cred, &req)) {
+                CERROR("failed to sign, cancel anyway\n");
+                atomic_dec(&cred->pc_refcount);
+                goto exit;
+        }
+        atomic_dec(&cred->pc_refcount);
+
+        /* send out */
+        gss_send_secfini_rpc(imp, req.rq_reqbuf, req.rq_reqdata_len);
+exit:
+        OBD_FREE(req.rq_reqbuf, req.rq_reqbuf_len);
+        EXIT;
+}
+
+static void gss_cred_destroy(struct ptlrpc_cred *cred)
+{
+        struct gss_cred *gcred;
+        ENTRY;
+
+        LASSERT(cred);
+        LASSERT(!atomic_read(&cred->pc_refcount));
+
+        gcred = container_of(cred, struct gss_cred, gc_base);
+        if (gcred->gc_ctx) {
+                destroy_gss_context(cred);
+                gss_put_ctx(gcred->gc_ctx);
+        }
+
+        CDEBUG(D_SEC, "GSS_SEC: destroy cred %p\n", gcred);
+
+        OBD_FREE(gcred, sizeof(*gcred));
+        EXIT;
+}
+
+static struct ptlrpc_credops gss_credops = {
+        .refresh        = gss_cred_refresh,
+        .match          = gss_cred_match,
+        .sign           = gss_cred_sign,
+        .verify         = gss_cred_verify,
+        .seal           = gss_cred_seal,
+        .unseal         = gss_cred_unseal,
+        .destroy        = gss_cred_destroy,
+};
+
+#ifdef __KERNEL__
+/*******************************************
+ * rpc_pipe APIs                           *
+ *******************************************/
+static ssize_t
+gss_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
+                char *dst, size_t buflen)
+{
+        char *data = (char *)msg->data + msg->copied;
+        ssize_t mlen = msg->len;
+        ssize_t left;
+        ENTRY;
+
+        if (mlen > buflen)
+                mlen = buflen;
+        left = copy_to_user(dst, data, mlen);
+        if (left < 0) {
+                msg->errno = left;
+                RETURN(left);
+        }
+        mlen -= left;
+        msg->copied += mlen;
+        msg->errno = 0;
+        RETURN(mlen);
+}
+
+static ssize_t
+gss_pipe_downcall(struct file *filp, const char *src, size_t mlen)
+{
+        char *buf;
+        const int bufsize = 1024;
+        rawobj_t obj;
+        struct inode *inode = filp->f_dentry->d_inode;
+        struct rpc_inode *rpci = RPC_I(inode);
+        struct obd_import *import;
+        struct ptlrpc_sec *sec;
+        struct gss_sec *gsec;
+        char *obdname;
+        struct gss_api_mech *mech;
+        struct vfs_cred vcred = { 0 };
+        struct ptlrpc_cred *cred;
+        struct gss_upcall_msg *gss_msg;
+        struct gss_cl_ctx *ctx = NULL;
+        __u32  dest_ip;
+        ssize_t left;
+        int err, gss_err;
+        ENTRY;
+
+        if (mlen > bufsize) {
+                CERROR("mlen %ld > bufsize %d\n", (long)mlen, bufsize);
+                RETURN(-ENOSPC);
+        }
+
+        OBD_ALLOC(buf, bufsize);
+        if (!buf) {
+                CERROR("alloc mem failed\n");
+                RETURN(-ENOMEM);
+        }
+
+        left = copy_from_user(buf, src, mlen);
+        if (left)
+                GOTO(err_free, err = -EFAULT);
+
+        obj.data = buf;
+        obj.len = mlen;
+
+        LASSERT(rpci->private);
+        gsec = (struct gss_sec *)rpci->private;
+        sec = &gsec->gs_base;
+        LASSERT(sec->ps_import);
+        import = class_import_get(sec->ps_import);
+        LASSERT(import->imp_obd);
+        obdname = import->imp_obd->obd_name;
+        mech = gsec->gs_mech;
+
+        err = gss_parse_init_downcall(mech, &obj, &ctx, &vcred, &dest_ip,
+                                      &gss_err);
+        if (err) {
+                CERROR("parse downcall err %d\n", err);
+                GOTO(err, err);
+        }
+        cred = ptlrpcs_cred_lookup(sec, &vcred);
+        if (!cred) {
+                CWARN("didn't find cred\n");
+                GOTO(err, err);
+        }
+        if (gss_err) {
+                CERROR("got gss err %d, set cred %p dead\n", gss_err, cred);
+                cred->pc_flags |= PTLRPC_CRED_DEAD;
+        } else {
+                CDEBUG(D_SEC, "get initial ctx:\n");
+                gss_cred_set_ctx(cred, ctx);
+        }
+
+        spin_lock(&gsec->gs_lock);
+        gss_msg = gss_find_upcall(gsec, obdname, vcred.vc_uid, dest_ip);
+        if (gss_msg) {
+                gss_unhash_msg_nolock(gss_msg);
+                spin_unlock(&gsec->gs_lock);
+                gss_release_msg(gss_msg);
+        } else
+                spin_unlock(&gsec->gs_lock);
+
+        ptlrpcs_cred_put(cred, 1);
+        class_import_put(import);
+        OBD_FREE(buf, bufsize);
+        RETURN(mlen);
+err:
+        if (ctx)
+                gss_destroy_ctx(ctx);
+        class_import_put(import);
+err_free:
+        OBD_FREE(buf, bufsize);
+        CDEBUG(D_SEC, "gss_pipe_downcall returning %d\n", err);
+        RETURN(err);
+}
+
+static
+void gss_pipe_destroy_msg(struct rpc_pipe_msg *msg)
+{
+        struct gss_upcall_msg *gmsg;
+        static unsigned long ratelimit;
+        ENTRY;
+
+        if (msg->errno >= 0) {
+                EXIT;
+                return;
+        }
+
+        gmsg = container_of(msg, struct gss_upcall_msg, gum_base);
+        CDEBUG(D_SEC, "destroy gmsg %p\n", gmsg);
+        atomic_inc(&gmsg->gum_refcount);
+        gss_unhash_msg(gmsg);
+        if (msg->errno == -ETIMEDOUT || msg->errno == -EPIPE) {
+                unsigned long now = get_seconds();
+                if (time_after(now, ratelimit)) {
+                        CWARN("GSS_SEC upcall timed out.\n"
+                              "Please check that the user daemon is running!\n");
+                        ratelimit = now + 15;
+                }
+        }
+        gss_release_msg(gmsg);
+        EXIT;
+}
+
+static
+void gss_pipe_release(struct inode *inode)
+{
+        struct rpc_inode *rpci = RPC_I(inode);
+        struct ptlrpc_sec *sec;
+        struct gss_sec *gsec;
+        ENTRY;
+
+        gsec = (struct gss_sec *)rpci->private;
+        sec = &gsec->gs_base;
+        spin_lock(&gsec->gs_lock);
+        while (!list_empty(&gsec->gs_upcalls)) {
+                struct gss_upcall_msg *gmsg;
+
+                gmsg = list_entry(gsec->gs_upcalls.next,
+                                  struct gss_upcall_msg, gum_list);
+                gmsg->gum_base.errno = -EPIPE;
+                atomic_inc(&gmsg->gum_refcount);
+                gss_unhash_msg_nolock(gmsg);
+                gss_release_msg(gmsg);
+        }
+        spin_unlock(&gsec->gs_lock);
+        EXIT;
+}
+
+static struct rpc_pipe_ops gss_upcall_ops = {
+        .upcall         = gss_pipe_upcall,
+        .downcall       = gss_pipe_downcall,
+        .destroy_msg    = gss_pipe_destroy_msg,
+        .release_pipe   = gss_pipe_release,
+};
+#endif /* __KERNEL__ */
+
+/*********************************************
+ * GSS security APIs                         *
+ *********************************************/
+
+static
+struct ptlrpc_sec* gss_create_sec(ptlrpcs_flavor_t *flavor,
+                                  const char *pipe_dir,
+                                  void *pipe_data)
+{
+        struct gss_sec *gsec;
+        struct ptlrpc_sec *sec;
+        char *pos;
+        ENTRY;
+
+        LASSERT(flavor->flavor == PTLRPC_SEC_GSS);
+
+        OBD_ALLOC(gsec, sizeof(*gsec));
+        if (!gsec) {
+                CERROR("can't alloc gsec\n");
+                RETURN(NULL);
+        }
+
+        gsec->gs_mech = kgss_subflavor_to_mech(flavor->subflavor);
+        if (!gsec->gs_mech) {
+                CERROR("subflavor %d not found\n", flavor->subflavor);
+                goto err_free;
+        }
+
+        /* initialize gss sec */
+#ifdef __KERNEL__
+        INIT_LIST_HEAD(&gsec->gs_upcalls);
+        spin_lock_init(&gsec->gs_lock);
+
+        snprintf(gsec->gs_pipepath, sizeof(gsec->gs_pipepath),
+                 LUSTRE_PIPEDIR"/%s", pipe_dir);
+        if (IS_ERR(rpc_mkdir(gsec->gs_pipepath, NULL))) {
+                CERROR("can't make pipedir %s\n", gsec->gs_pipepath);
+                goto err_mech_put;
+        }
+
+        snprintf(gsec->gs_pipepath, sizeof(gsec->gs_pipepath),
+                 LUSTRE_PIPEDIR"/%s/%s", pipe_dir, gsec->gs_mech->gm_name); 
+        gsec->gs_depipe = rpc_mkpipe(gsec->gs_pipepath, gsec,
+                                     &gss_upcall_ops, RPC_PIPE_WAIT_FOR_OPEN);
+        if (IS_ERR(gsec->gs_depipe)) {
+                CERROR("failed to make rpc_pipe %s: %ld\n",
+                        gsec->gs_pipepath, PTR_ERR(gsec->gs_depipe));
+                goto err_rmdir;
+        }
+        CDEBUG(D_SEC, "gss sec %p, pipe path %s\n", gsec, gsec->gs_pipepath);
+#endif
+
+        sec = &gsec->gs_base;
+
+        switch (flavor->subflavor) {
+        case PTLRPC_SEC_GSS_KRB5I:
+                sec->ps_sectype = PTLRPC_SEC_TYPE_AUTH;
+                break;
+        case PTLRPC_SEC_GSS_KRB5P:
+                sec->ps_sectype = PTLRPC_SEC_TYPE_PRIV;
+                break;
+        default:
+                LBUG();
+        }
+
+        sec->ps_expire = GSS_CREDCACHE_EXPIRE;
+        sec->ps_nextgc = get_seconds() + sec->ps_expire;
+        sec->ps_flags = 0;
+
+        CDEBUG(D_SEC, "Create GSS security instance at %p(external %p)\n",
+               gsec, sec);
+        RETURN(sec);
+
+#ifdef __KERNEL__
+err_rmdir:
+        pos = strrchr(gsec->gs_pipepath, '/');
+        LASSERT(pos);
+        *pos = 0;
+        rpc_rmdir(gsec->gs_pipepath);
+err_mech_put:
+#endif
+        kgss_mech_put(gsec->gs_mech);
+err_free:
+        OBD_FREE(gsec, sizeof(*gsec));
+        RETURN(NULL);
+}
+
+static
+void gss_destroy_sec(struct ptlrpc_sec *sec)
+{
+        struct gss_sec *gsec;
+        char *pos;
+        ENTRY;
+
+        gsec = container_of(sec, struct gss_sec, gs_base);
+        CDEBUG(D_SEC, "Destroy GSS security instance at %p\n", gsec);
+
+        LASSERT(gsec->gs_mech);
+        LASSERT(!atomic_read(&sec->ps_refcount));
+        LASSERT(!atomic_read(&sec->ps_credcount));
+#ifdef __KERNEL__
+        rpc_unlink(gsec->gs_pipepath);
+        pos = strrchr(gsec->gs_pipepath, '/');
+        LASSERT(pos);
+        *pos = 0;
+        rpc_rmdir(gsec->gs_pipepath);
+#endif
+
+        kgss_mech_put(gsec->gs_mech);
+        OBD_FREE(gsec, sizeof(*gsec));
+        EXIT;
+}
+
+static
+struct ptlrpc_cred * gss_create_cred(struct ptlrpc_sec *sec,
+                                     struct ptlrpc_request *req,
+                                     struct vfs_cred *vcred)
+{
+        struct gss_cred *gcred;
+        struct ptlrpc_cred *cred;
+        ENTRY;
+
+        OBD_ALLOC(gcred, sizeof(*gcred));
+        if (!gcred)
+                RETURN(NULL);
+
+        cred = &gcred->gc_base;
+        INIT_LIST_HEAD(&cred->pc_hash);
+        atomic_set(&cred->pc_refcount, 0);
+        cred->pc_sec = sec;
+        cred->pc_ops = &gss_credops;
+        cred->pc_req = req;
+        cred->pc_expire = get_seconds() + GSS_CRED_EXPIRE;
+        cred->pc_flags = 0;
+        cred->pc_pag = vcred->vc_pag;
+        cred->pc_uid = vcred->vc_uid;
+        CDEBUG(D_SEC, "create a gss cred at %p("LPU64"/%u)\n",
+               cred, vcred->vc_pag, vcred->vc_uid);
+
+        RETURN(cred);
+}
+
+static int gss_estimate_payload(struct ptlrpc_sec *sec, int msgsize)
+{
+        switch (sec->ps_sectype) {
+        case PTLRPC_SEC_TYPE_AUTH:
+                return GSS_MAX_AUTH_PAYLOAD;
+        case PTLRPC_SEC_TYPE_PRIV:
+                return size_round16(GSS_MAX_AUTH_PAYLOAD + msgsize +
+                                    GSS_PRIVBUF_PREFIX_LEN +
+                                    GSS_PRIVBUF_SUFFIX_LEN);
+        default:
+                LBUG();
+                return 0;
+        }
+}
+
+static int gss_alloc_reqbuf(struct ptlrpc_sec *sec,
+                            struct ptlrpc_request *req,
+                            int lmsg_size)
+{
+        int msg_payload, sec_payload;
+        int privacy, rc;
+        ENTRY;
+
+        /* In PRIVACY mode the lustre message part is always 0 bytes: the
+         * message is carried, encrypted, inside the security payload instead.
+         */
+        privacy = sec->ps_sectype == PTLRPC_SEC_TYPE_PRIV;
+        msg_payload = privacy ? 0 : lmsg_size;
+        sec_payload = gss_estimate_payload(sec, lmsg_size);
+
+        rc = sec_alloc_reqbuf(sec, req, msg_payload, sec_payload);
+        if (rc)
+                return rc;
+
+        if (privacy) {
+                int buflen = lmsg_size + GSS_PRIVBUF_PREFIX_LEN +
+                             GSS_PRIVBUF_SUFFIX_LEN;
+                char *buf;
+
+                OBD_ALLOC(buf, buflen);
+                if (!buf) {
+                        CERROR("Fail to alloc %d\n", buflen);
+                        sec_free_reqbuf(sec, req);
+                        RETURN(-ENOMEM);
+                }
+                req->rq_reqmsg = (struct lustre_msg *)
+                                        (buf + GSS_PRIVBUF_PREFIX_LEN);
+        }
+
+        RETURN(0);
+}
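A sketch of the separate cleartext buffer set up above for privacy mode, as read from gss_alloc_reqbuf() and gss_cred_seal():

/*
 *   [ GSS_PRIVBUF_PREFIX_LEN | lustre_msg (rq_reqlen bytes) | GSS_PRIVBUF_SUFFIX_LEN ]
 *                              ^ rq_reqmsg
 *
 * gss_cred_seal() hands this buffer to kgss_wrap() and writes the resulting
 * cipher text into the security section of rq_reqbuf, so the plaintext message
 * never needs to sit in rq_reqbuf itself.
 */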
+
+static void gss_free_reqbuf(struct ptlrpc_sec *sec,
+                            struct ptlrpc_request *req)
+{
+        char *buf;
+        int privacy;
+        ENTRY;
+
+        LASSERT(req->rq_reqmsg);
+        LASSERT(req->rq_reqlen);
+
+        privacy = sec->ps_sectype == PTLRPC_SEC_TYPE_PRIV;
+        if (privacy) {
+                buf = (char *) req->rq_reqmsg - GSS_PRIVBUF_PREFIX_LEN;
+                LASSERT(buf < req->rq_reqbuf ||
+                        buf >= req->rq_reqbuf + req->rq_reqbuf_len);
+                OBD_FREE(buf, req->rq_reqlen + GSS_PRIVBUF_PREFIX_LEN +
+                              GSS_PRIVBUF_SUFFIX_LEN);
+                req->rq_reqmsg = NULL;
+        }
+
+        sec_free_reqbuf(sec, req);
+}
+
+static struct ptlrpc_secops gss_secops = {
+        .create_sec             = gss_create_sec,
+        .destroy_sec            = gss_destroy_sec,
+        .create_cred            = gss_create_cred,
+        .est_req_payload        = gss_estimate_payload,
+        .est_rep_payload        = gss_estimate_payload,
+        .alloc_reqbuf           = gss_alloc_reqbuf,
+        .free_reqbuf            = gss_free_reqbuf,
+};
+
+static struct ptlrpc_sec_type gss_type = {
+        .pst_owner      = THIS_MODULE,
+        .pst_name       = "GSS_SEC",
+        .pst_inst       = ATOMIC_INIT(0),
+        .pst_flavor     = {PTLRPC_SEC_GSS, 0},
+        .pst_ops        = &gss_secops,
+};
+
+extern int
+(*lustre_secinit_downcall_handler)(char *buffer, unsigned long count);
+
+int __init ptlrpcs_gss_init(void)
+{
+        int rc;
+
+        rc = ptlrpcs_register(&gss_type);
+        if (rc)
+                return rc;
+
+#ifdef __KERNEL__
+        gss_svc_init();
+
+        rc = PTR_ERR(rpc_mkdir(LUSTRE_PIPEDIR, NULL));
+        if (IS_ERR((void *)rc) && rc != -EEXIST) {
+                CERROR("fail to make rpcpipedir for lustre\n");
+                gss_svc_exit();
+                ptlrpcs_unregister(&gss_type);
+                return -1;
+        }
+        rc = 0;
+#else
+#endif
+        rc = init_kerberos_module();
+        if (rc) {
+                ptlrpcs_unregister(&gss_type);
+        }
+
+        lustre_secinit_downcall_handler = gss_send_secinit_rpc;
+
+        return rc;
+}
+
+static void __exit ptlrpcs_gss_exit(void)
+{
+        lustre_secinit_downcall_handler = NULL;
+
+        cleanup_kerberos_module();
+#ifndef __KERNEL__
+#else
+        rpc_rmdir(LUSTRE_PIPEDIR);
+        gss_svc_exit();
+#endif
+        ptlrpcs_unregister(&gss_type);
+}
+
+MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_DESCRIPTION("GSS Security module for Lustre");
+MODULE_LICENSE("GPL");
+
+module_init(ptlrpcs_gss_init);
+module_exit(ptlrpcs_gss_exit);
diff --git a/lustre/sec/gss/svcsec_gss.c b/lustre/sec/gss/svcsec_gss.c
new file mode 100644 (file)
index 0000000..1ac060e
--- /dev/null
@@ -0,0 +1,1534 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Modifications for Lustre
+ * Copyright 2004, Cluster File Systems, Inc.
+ * All rights reserved
+ * Author: Eric Mei <ericm@clusterfs.com>
+ */
+
+/*
+ * Neil Brown <neilb@cse.unsw.edu.au>
+ * J. Bruce Fields <bfields@umich.edu>
+ * Andy Adamson <andros@umich.edu>
+ * Dug Song <dugsong@monkey.org>
+ *
+ * RPCSEC_GSS server authentication.
+ * This implements RPCSEC_GSS as defined in rfc2203 (rpcsec_gss) and rfc2078
+ * (gssapi)
+ *
+ * The RPCSEC_GSS involves three stages:
+ *  1/ context creation
+ *  2/ data exchange
+ *  3/ context destruction
+ *
+ * Context creation is handled largely by upcalls to user-space.
+ *  In particular, GSS_Accept_sec_context is handled by an upcall
+ * Data exchange is handled entirely within the kernel
+ *  In particular, GSS_GetMIC, GSS_VerifyMIC, GSS_Seal, GSS_Unseal are in-kernel.
+ * Context destruction is handled in-kernel
+ *  GSS_Delete_sec_context is in-kernel
+ *
+ * Context creation is initiated by a RPCSEC_GSS_INIT request arriving.
+ * The context handle and gss_token are used as a key into the rpcsec_init cache.
+ * The content of this cache includes some of the outputs of GSS_Accept_sec_context,
+ * being major_status, minor_status, context_handle, reply_token.
+ * These are sent back to the client.
+ * Sequence window management is handled by the kernel.  The window size is
+ * currently a compile time constant.
+ *
+ * When user-space is happy that a context is established, it places an entry
+ * in the rpcsec_context cache. The key for this cache is the context_handle.
+ * The content includes:
+ *   uid/gidlist - for determining access rights
+ *   mechanism type
+ *   mechanism specific information, such as a key
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#ifdef __KERNEL__
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/hash.h>
+#else
+#include <liblustre.h>
+#endif
+
+#include <linux/sunrpc/cache.h>
+
+#include <libcfs/kp30.h>
+#include <linux/obd.h>
+#include <linux/obd_class.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_idl.h>
+#include <linux/lustre_net.h>
+#include <linux/lustre_import.h>
+#include <linux/lustre_sec.h>
+
+#include "gss_err.h"
+#include "gss_internal.h"
+#include "gss_api.h"
+
+static inline unsigned long hash_mem(char *buf, int length, int bits)
+{
+        unsigned long hash = 0;
+        unsigned long l = 0;
+        int len = 0;
+        unsigned char c;
+        do {
+                if (len == length) {
+                        c = (char)len; len = -1;
+                } else
+                        c = *buf++;
+                l = (l << 8) | c;
+                len++;
+                if ((len & (BITS_PER_LONG/8-1))==0)
+                        hash = hash_long(hash^l, BITS_PER_LONG);
+        } while (len);
+        return hash >> (BITS_PER_LONG - bits);
+}
+
+/* The rpcsec_init cache is used for mapping RPCSEC_GSS_{,CONT_}INIT requests
+ * into replies.
+ *
+ * Key is context handle (\x if empty) and gss_token.
+ * Content is major_status minor_status (integers) context_handle, reply_token.
+ *
+ */
+
+#define RSI_HASHBITS    6
+#define RSI_HASHMAX     (1<<RSI_HASHBITS)
+#define RSI_HASHMASK    (RSI_HASHMAX-1)
+
+struct rsi {
+        struct cache_head       h;
+        rawobj_t                in_handle, in_token;
+        rawobj_t                out_handle, out_token;
+        int                     major_status, minor_status;
+};
+
+static struct cache_head *rsi_table[RSI_HASHMAX];
+static struct cache_detail rsi_cache;
+
+static void rsi_free(struct rsi *rsii)
+{
+        rawobj_free(&rsii->in_handle);
+        rawobj_free(&rsii->in_token);
+        rawobj_free(&rsii->out_handle);
+        rawobj_free(&rsii->out_token);
+}
+
+static void rsi_put(struct cache_head *item, struct cache_detail *cd)
+{
+        struct rsi *rsii = container_of(item, struct rsi, h);
+        if (cache_put(item, cd)) {
+                rsi_free(rsii);
+                OBD_FREE(rsii, sizeof(*rsii));
+        }
+}
+
+static inline int rsi_hash(struct rsi *item)
+{
+        return hash_mem(item->in_handle.data, item->in_handle.len, RSI_HASHBITS)
+              ^ hash_mem(item->in_token.data, item->in_token.len, RSI_HASHBITS);
+}
+
+static inline int rsi_match(struct rsi *item, struct rsi *tmp)
+{
+        return (rawobj_equal(&item->in_handle, &tmp->in_handle) &&
+                rawobj_equal(&item->in_token, &tmp->in_token));
+}
+
+static void rsi_request(struct cache_detail *cd,
+                        struct cache_head *h,
+                        char **bpp, int *blen)
+{
+        struct rsi *rsii = container_of(h, struct rsi, h);
+
+        qword_addhex(bpp, blen, rsii->in_handle.data, rsii->in_handle.len);
+        qword_addhex(bpp, blen, rsii->in_token.data, rsii->in_token.len);
+        (*bpp)[-1] = '\n';
+}
+
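+/* Called from rsi_parse() when gssd writes a downcall reply: find the
+ * pending rsi entry (placed in the cache by gssd_upcall()) that matches
+ * the new item's in_handle/in_token, swap the new item into its place in
+ * the hash chain and mark it CACHE_VALID so the waiting thread picks it
+ * up.
+ */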
+static int
+gssd_reply(struct rsi *item)
+{
+        struct rsi *tmp;
+        struct cache_head **hp, **head;
+        ENTRY;
+
+        head = &rsi_cache.hash_table[rsi_hash(item)];
+        write_lock(&rsi_cache.hash_lock);
+        for (hp = head; *hp != NULL; hp = &tmp->h.next) {
+                tmp = container_of(*hp, struct rsi, h);
+                if (rsi_match(tmp, item)) {
+                        cache_get(&tmp->h);
+                        clear_bit(CACHE_HASHED, &tmp->h.flags);
+                        *hp = tmp->h.next;
+                        tmp->h.next = NULL;
+                        rsi_cache.entries--;
+                        if (test_bit(CACHE_VALID, &tmp->h.flags)) {
+                                write_unlock(&rsi_cache.hash_lock);
+                                rsi_put(&tmp->h, &rsi_cache);
+                                RETURN(-EINVAL);
+                        }
+                        set_bit(CACHE_HASHED, &item->h.flags);
+                        item->h.next = *hp;
+                        *hp = &item->h;
+                        rsi_cache.entries++;
+                        set_bit(CACHE_VALID, &item->h.flags);
+                        item->h.last_refresh = get_seconds();
+                        write_unlock(&rsi_cache.hash_lock);
+                        cache_fresh(&rsi_cache, &tmp->h, 0);
+                        rsi_put(&tmp->h, &rsi_cache);
+                        RETURN(0);
+                }
+        }
+        write_unlock(&rsi_cache.hash_lock);
+        RETURN(-EINVAL);
+}
+
+/* XXX
+ * here we just wait for the upcall to complete or time out. it's a
+ * hack but it works, and we'll come up with a real fix if we decide
+ * to stick with the NFS4 cache code
+ */
+static struct rsi *
+gssd_upcall(struct rsi *item, struct cache_req *chandle)
+{
+        struct rsi *tmp;
+        struct cache_head **hp, **head;
+        unsigned long starttime;
+        ENTRY;
+
+        head = &rsi_cache.hash_table[rsi_hash(item)];
+        read_lock(&rsi_cache.hash_lock);
+        for (hp = head; *hp != NULL; hp = &tmp->h.next) {
+                tmp = container_of(*hp, struct rsi, h);
+                if (rsi_match(tmp, item)) {
+                        LBUG();
+                        if (!test_bit(CACHE_VALID, &tmp->h.flags)) {
+                                CERROR("found rsi without VALID\n");
+                                read_unlock(&rsi_cache.hash_lock);
+                                return NULL;
+                        }
+                        *hp = tmp->h.next;
+                        tmp->h.next = NULL;
+                        rsi_cache.entries--;
+                        cache_get(&tmp->h);
+                        read_unlock(&rsi_cache.hash_lock);
+                        return tmp;
+                }
+        }
+        // cache_get(&item->h);
+        set_bit(CACHE_HASHED, &item->h.flags);
+        item->h.next = *head;
+        *head = &item->h;
+        rsi_cache.entries++;
+        read_unlock(&rsi_cache.hash_lock);
+        cache_get(&item->h);
+
+        cache_check(&rsi_cache, &item->h, chandle);
+        starttime = get_seconds();
+        do {
+                yield();
+                read_lock(&rsi_cache.hash_lock);
+                for (hp = head; *hp != NULL; hp = &tmp->h.next) {
+                        tmp = container_of(*hp, struct rsi, h);
+                        if (tmp == item)
+                                continue;
+                        if (rsi_match(tmp, item)) {
+                                if (!test_bit(CACHE_VALID, &tmp->h.flags)) {
+                                        read_unlock(&rsi_cache.hash_lock);
+                                        return NULL;
+                                }
+                                cache_get(&tmp->h);
+                                clear_bit(CACHE_HASHED, &tmp->h.flags);
+                                *hp = tmp->h.next;
+                                tmp->h.next = NULL;
+                                rsi_cache.entries--;
+                                read_unlock(&rsi_cache.hash_lock);
+                                return tmp;
+                        }
+                }
+                read_unlock(&rsi_cache.hash_lock);
+        } while ((get_seconds() - starttime) <= 5);
+        CERROR("5s timeout while waiting cache refill\n");
+        return NULL;
+}
+
+static int rsi_parse(struct cache_detail *cd,
+                     char *mesg, int mlen)
+{
+        /* context token expiry major minor context token */
+        char *buf = mesg;
+        char *ep;
+        int len;
+        struct rsi *rsii;
+        time_t expiry;
+        int status = -EINVAL;
+        ENTRY;
+
+        OBD_ALLOC(rsii, sizeof(*rsii));
+        if (!rsii) {
+                CERROR("failed to alloc rsii\n");
+                RETURN(-ENOMEM);
+        }
+        cache_init(&rsii->h);
+
+        /* handle */
+        len = qword_get(&mesg, buf, mlen);
+        if (len < 0)
+                goto out;
+        status = -ENOMEM;
+        if (rawobj_alloc(&rsii->in_handle, buf, len))
+                goto out;
+
+        /* token */
+        len = qword_get(&mesg, buf, mlen);
+        status = -EINVAL;
+        if (len < 0)
+                goto out;
+        status = -ENOMEM;
+        if (rawobj_alloc(&rsii->in_token, buf, len))
+                goto out;
+
+        /* expiry */
+        expiry = get_expiry(&mesg);
+        status = -EINVAL;
+        if (expiry == 0)
+                goto out;
+
+        /* major/minor */
+        len = qword_get(&mesg, buf, mlen);
+        if (len < 0)
+                goto out;
+        if (len == 0) {
+                goto out;
+        } else {
+                rsii->major_status = simple_strtoul(buf, &ep, 10);
+                if (*ep)
+                        goto out;
+                len = qword_get(&mesg, buf, mlen);
+                if (len <= 0)
+                        goto out;
+                rsii->minor_status = simple_strtoul(buf, &ep, 10);
+                if (*ep)
+                        goto out;
+
+                /* out_handle */
+                len = qword_get(&mesg, buf, mlen);
+                if (len < 0)
+                        goto out;
+                status = -ENOMEM;
+                if (rawobj_alloc(&rsii->out_handle, buf, len))
+                        goto out;
+
+                /* out_token */
+                len = qword_get(&mesg, buf, mlen);
+                status = -EINVAL;
+                if (len < 0)
+                        goto out;
+                status = -ENOMEM;
+                if (rawobj_alloc(&rsii->out_token, buf, len))
+                        goto out;
+        }
+        rsii->h.expiry_time = expiry;
+        status = gssd_reply(rsii);
+out:
+        if (rsii)
+                rsi_put(&rsii->h, &rsi_cache);
+        RETURN(status);
+}
+
+static struct cache_detail rsi_cache = {
+        .hash_size      = RSI_HASHMAX,
+        .hash_table     = rsi_table,
+        .name           = "auth.ptlrpcs.init",
+        .cache_put      = rsi_put,
+        .cache_request  = rsi_request,
+        .cache_parse    = rsi_parse,
+};
+
+/*
+ * The rpcsec_context cache is used to store a context that is
+ * used in data exchange.
+ * The key is a context handle. The content is:
+ *  uid, gidlist, mechanism, service-set, mech-specific-data
+ */
+
+#define RSC_HASHBITS    10
+#define RSC_HASHMAX     (1<<RSC_HASHBITS)
+#define RSC_HASHMASK    (RSC_HASHMAX-1)
+
+#define GSS_SEQ_WIN     128
+
+struct gss_svc_seq_data {
+        /* highest seq number seen so far: */
+        __u32                   sd_max;
+        /* for i such that sd_max-GSS_SEQ_WIN < i <= sd_max, the i-th bit of
+         * sd_win is nonzero iff sequence number i has been seen already: */
+        unsigned long           sd_win[GSS_SEQ_WIN/BITS_PER_LONG];
+        spinlock_t              sd_lock;
+};
+
+struct rsc {
+        struct cache_head       h;
+        rawobj_t                handle;
+        __u32                   remote;
+        struct vfs_cred         cred;
+        struct gss_svc_seq_data seqdata;
+        struct gss_ctx         *mechctx;
+};
+
+static struct cache_head *rsc_table[RSC_HASHMAX];
+static struct cache_detail rsc_cache;
+
+static void rsc_free(struct rsc *rsci)
+{
+        rawobj_free(&rsci->handle);
+        if (rsci->mechctx)
+                kgss_delete_sec_context(&rsci->mechctx);
+#if 0
+        if (rsci->cred.vc_ginfo)
+                put_group_info(rsci->cred.vc_ginfo);
+#endif
+}
+
+static void rsc_put(struct cache_head *item, struct cache_detail *cd)
+{
+        struct rsc *rsci = container_of(item, struct rsc, h);
+
+        if (cache_put(item, cd)) {
+                rsc_free(rsci);
+                OBD_FREE(rsci, sizeof(*rsci));
+        }
+}
+
+static inline int
+rsc_hash(struct rsc *rsci)
+{
+        return hash_mem(rsci->handle.data, rsci->handle.len, RSC_HASHBITS);
+}
+
+static inline int
+rsc_match(struct rsc *new, struct rsc *tmp)
+{
+        return rawobj_equal(&new->handle, &tmp->handle);
+}
+
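+/* Look up a server-side context by handle.  With set == 0 this is a plain
+ * lookup that takes a reference on a matching entry; with set != 0 the new
+ * item is inserted into the cache, replacing (and dropping) any existing
+ * entry with the same handle.
+ */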
+static struct rsc *rsc_lookup(struct rsc *item, int set)
+{
+        struct rsc *tmp = NULL;
+        struct cache_head **hp, **head;
+        head = &rsc_cache.hash_table[rsc_hash(item)];
+        ENTRY;
+
+        if (set)
+                write_lock(&rsc_cache.hash_lock);
+        else
+                read_lock(&rsc_cache.hash_lock);
+        for (hp = head; *hp != NULL; hp = &tmp->h.next) {
+                tmp = container_of(*hp, struct rsc, h);
+                if (!rsc_match(tmp, item))
+                        continue;
+                cache_get(&tmp->h);
+                if (!set) {
+                        goto out_noset;
+                }
+                *hp = tmp->h.next;
+                tmp->h.next = NULL;
+                clear_bit(CACHE_HASHED, &tmp->h.flags);
+                rsc_put(&tmp->h, &rsc_cache);
+                goto out_set;
+        }
+        /* Didn't find anything */
+        if (!set) {
+                /* don't return the last non-matching chain entry */
+                tmp = NULL;
+                goto out_noset;
+        }
+        rsc_cache.entries++;
+out_set:
+        set_bit(CACHE_HASHED, &item->h.flags);
+        item->h.next = *head;
+        *head = &item->h;
+        write_unlock(&rsc_cache.hash_lock);
+        cache_fresh(&rsc_cache, &item->h, item->h.expiry_time);
+        cache_get(&item->h);
+        RETURN(item);
+out_noset:
+        read_unlock(&rsc_cache.hash_lock);
+        RETURN(tmp);
+}
+
+static int rsc_parse(struct cache_detail *cd,
+                     char *mesg, int mlen)
+{
+        /* contexthandle expiry [ uid gid N <n gids> mechname ...mechdata... ] */
+        char *buf = mesg;
+        int len, rv;
+        struct rsc *rsci, *res = NULL;
+        time_t expiry;
+        int status = -EINVAL;
+
+        OBD_ALLOC(rsci, sizeof(*rsci));
+        if (!rsci) {
+                CERROR("fail to alloc rsci\n");
+                return -ENOMEM;
+        }
+        cache_init(&rsci->h);
+
+        /* context handle */
+        len = qword_get(&mesg, buf, mlen);
+        if (len < 0) goto out;
+        status = -ENOMEM;
+        if (rawobj_alloc(&rsci->handle, buf, len))
+                goto out;
+
+        /* expiry */
+        expiry = get_expiry(&mesg);
+        status = -EINVAL;
+        if (expiry == 0)
+                goto out;
+
+        /* remote flag */
+        rv = get_int(&mesg, &rsci->remote);
+        if (rv) {
+                CERROR("fail to get remote flag\n");
+                goto out;
+        }
+
+        /* uid, or NEGATIVE */
+        rv = get_int(&mesg, &rsci->cred.vc_uid);
+        if (rv == -EINVAL)
+                goto out;
+        if (rv == -ENOENT)
+                set_bit(CACHE_NEGATIVE, &rsci->h.flags);
+        else {
+                int N, i;
+                struct gss_api_mech *gm;
+                rawobj_t tmp_buf;
+                __u64 ctx_expiry;
+
+                /* gid */
+                if (get_int(&mesg, &rsci->cred.vc_gid))
+                        goto out;
+
+                /* number of additional gid's */
+                if (get_int(&mesg, &N))
+                        goto out;
+                status = -ENOMEM;
+#if 0
+                rsci->cred.vc_ginfo = groups_alloc(N);
+                if (rsci->cred.vc_ginfo == NULL)
+                        goto out;
+#endif
+
+                /* gid's */
+                status = -EINVAL;
+                for (i=0; i<N; i++) {
+                        gid_t gid;
+                        if (get_int(&mesg, &gid))
+                                goto out;
+#if 0
+                        GROUP_AT(rsci->cred.vc_ginfo, i) = gid;
+#endif
+                }
+
+                /* mech name */
+                len = qword_get(&mesg, buf, mlen);
+                if (len < 0)
+                        goto out;
+                gm = kgss_name_to_mech(buf);
+                status = -EOPNOTSUPP;
+                if (!gm)
+                        goto out;
+
+                status = -EINVAL;
+                /* mech-specific data: */
+                len = qword_get(&mesg, buf, mlen);
+                if (len < 0) {
+                        kgss_mech_put(gm);
+                        goto out;
+                }
+                tmp_buf.len = len;
+                tmp_buf.data = buf;
+                if (kgss_import_sec_context(&tmp_buf, gm, &rsci->mechctx)) {
+                        kgss_mech_put(gm);
+                        goto out;
+                }
+
+                /* currently the expiry time passed down from user-space
+                 * is invalid, so we retrieve it from the mech instead.
+                 */
+                if (kgss_inquire_context(rsci->mechctx, &ctx_expiry)) {
+                        CERROR("unable to get expire time, drop it\n");
+                        set_bit(CACHE_NEGATIVE, &rsci->h.flags);
+                        kgss_mech_put(gm);
+                        goto out;
+                }
+                expiry = (time_t) ctx_expiry;
+
+                kgss_mech_put(gm);
+        }
+        rsci->h.expiry_time = expiry;
+        spin_lock_init(&rsci->seqdata.sd_lock);
+        res = rsc_lookup(rsci, 1);
+        rsc_put(&res->h, &rsc_cache);
+        status = 0;
+out:
+        if (rsci)
+                rsc_put(&rsci->h, &rsc_cache);
+        return status;
+}
+
+/*
+ * flush all entries with @uid. @uid == -1 will match all.
+ * we only know the uid, maybe netid/nid in the future, in all cases
+ * we must search the whole cache
+ */
+static void rsc_flush(uid_t uid)
+{
+        struct cache_head **ch;
+        struct rsc *rscp;
+        int n;
+        ENTRY;
+
+        write_lock(&rsc_cache.hash_lock);
+        for (n = 0; n < RSC_HASHMAX; n++) {
+                for (ch = &rsc_cache.hash_table[n]; *ch;) {
+                        rscp = container_of(*ch, struct rsc, h);
+                        if (uid == -1 || rscp->cred.vc_uid == uid) {
+                                /* it seems simply setting NEGATIVE doesn't work */
+                                *ch = (*ch)->next;
+                                rscp->h.next = NULL;
+                                cache_get(&rscp->h);
+                                set_bit(CACHE_NEGATIVE, &rscp->h.flags);
+                                clear_bit(CACHE_HASHED, &rscp->h.flags);
+                                CWARN("flush rsc %p for uid %u\n",
+                                       rscp, rscp->cred.vc_uid);
+                                rsc_put(&rscp->h, &rsc_cache);
+                                rsc_cache.entries--;
+                                continue;
+                        }
+                        ch = &((*ch)->next);
+                }
+        }
+        write_unlock(&rsc_cache.hash_lock);
+        EXIT;
+}
+
+static struct cache_detail rsc_cache = {
+        .hash_size      = RSC_HASHMAX,
+        .hash_table     = rsc_table,
+        .name           = "auth.ptlrpcs.context",
+        .cache_put      = rsc_put,
+        .cache_parse    = rsc_parse,
+};
+
+static struct rsc *
+gss_svc_searchbyctx(rawobj_t *handle)
+{
+        struct rsc rsci;
+        struct rsc *found;
+
+        rsci.handle = *handle;
+        found = rsc_lookup(&rsci, 0);
+        if (!found)
+                return NULL;
+
+        if (cache_check(&rsc_cache, &found->h, NULL))
+                return NULL;
+
+        return found;
+}
+
+struct gss_svc_data {
+        /* decoded gss client cred: */
+        struct rpc_gss_wire_cred        clcred;
+        /* internal used status */
+        unsigned int                    is_init:1,
+                                        is_init_continue:1,
+                                        is_err_notify:1,
+                                        is_fini:1;
+        int                             reserve_len;
+};
+
+/* FIXME
+ * another hack: we never actually defer the request, just yield so
+ * svcgssd gets a chance to handle the upcall.
+ */
+struct cache_deferred_req* my_defer(struct cache_req *req)
+{
+        yield();
+        return NULL;
+}
+static struct cache_req my_chandle = {my_defer};
+
+/* Implements sequence number algorithm as specified in RFC 2203. */
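+/* A request is rejected (return 1) if its sequence number has fallen
+ * behind the GSS_SEQ_WIN-wide window ending at sd_max, or if its bit in
+ * sd_win is already set (a replay).  Larger sequence numbers slide the
+ * window forward, clearing the bits they pass over.  For example, with
+ * sd_max == 200 and GSS_SEQ_WIN == 128: 72 and below are rejected, 73..200
+ * are each accepted once, and 201 and above advance the window.
+ */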
+static int
+gss_check_seq_num(struct gss_svc_seq_data *sd, __u32 seq_num)
+{
+        int rc = 0;
+
+        spin_lock(&sd->sd_lock);
+        if (seq_num > sd->sd_max) {
+                if (seq_num >= sd->sd_max + GSS_SEQ_WIN) {
+                        memset(sd->sd_win, 0, sizeof(sd->sd_win));
+                        sd->sd_max = seq_num;
+                } else {
+                        while(sd->sd_max < seq_num) {
+                                sd->sd_max++;
+                                __clear_bit(sd->sd_max % GSS_SEQ_WIN,
+                                            sd->sd_win);
+                        }
+                }
+                __set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win);
+                goto exit;
+        } else if (seq_num + GSS_SEQ_WIN <= sd->sd_max) {
+                rc = 1;
+                goto exit;
+        }
+
+        if (__test_and_set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win))
+                rc = 1;
+exit:
+        spin_unlock(&sd->sd_lock);
+        return rc;
+}
+
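+/* INTEGRITY service: the request buffer carries the cleartext lustre
+ * message followed by a MIC; verify the MIC over the message and reject
+ * replayed sequence numbers.
+ */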
+static int
+gss_svc_verify_request(struct ptlrpc_request *req,
+                       struct rsc *rsci,
+                       struct rpc_gss_wire_cred *gc,
+                       __u32 *vp, __u32 vlen)
+{
+        struct ptlrpcs_wire_hdr *sec_hdr;
+        struct gss_ctx *ctx = rsci->mechctx;
+        __u32 maj_stat;
+        rawobj_t msg;
+        rawobj_t mic;
+        ENTRY;
+
+        sec_hdr = (struct ptlrpcs_wire_hdr *) req->rq_reqbuf;
+
+        req->rq_reqmsg = (struct lustre_msg *) (req->rq_reqbuf + sizeof(*sec_hdr));
+        req->rq_reqlen = sec_hdr->msg_len;
+
+        msg.len = sec_hdr->msg_len;
+        msg.data = (__u8 *)req->rq_reqmsg;
+
+        mic.len = le32_to_cpu(*vp++);
+        mic.data = (char *) vp;
+        vlen -= 4;
+
+        if (mic.len > vlen) {
+                CERROR("checksum len %d, while buffer len %d\n",
+                        mic.len, vlen);
+                RETURN(GSS_S_CALL_BAD_STRUCTURE);
+        }
+
+        if (mic.len > 256) {
+                CERROR("invalid mic len %d\n", mic.len);
+                RETURN(GSS_S_CALL_BAD_STRUCTURE);
+        }
+
+        maj_stat = kgss_verify_mic(ctx, &msg, &mic, NULL);
+        if (maj_stat != GSS_S_COMPLETE) {
+                CERROR("MIC verification error: major %x\n", maj_stat);
+                RETURN(maj_stat);
+        }
+
+        if (gss_check_seq_num(&rsci->seqdata, gc->gc_seq)) {
+                CERROR("discard request %p with old seq_num %u\n",
+                        req, gc->gc_seq);
+                RETURN(GSS_S_DUPLICATE_TOKEN);
+        }
+
+        RETURN(GSS_S_COMPLETE);
+}
+
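+/* PRIVACY service: the security payload carries the encrypted lustre
+ * message; unwrap it (the plaintext lands back in the request buffer)
+ * and reject replayed sequence numbers.
+ */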
+static int
+gss_svc_unseal_request(struct ptlrpc_request *req,
+                       struct rsc *rsci,
+                       struct rpc_gss_wire_cred *gc,
+                       __u32 *vp, __u32 vlen)
+{
+        struct ptlrpcs_wire_hdr *sec_hdr;
+        struct gss_ctx *ctx = rsci->mechctx;
+        rawobj_t cipher_text, plain_text;
+        __u32 major;
+        ENTRY;
+
+        sec_hdr = (struct ptlrpcs_wire_hdr *) req->rq_reqbuf;
+
+        if (vlen < 4) {
+                CERROR("vlen only %u\n", vlen);
+                RETURN(GSS_S_CALL_BAD_STRUCTURE);
+        }
+
+        cipher_text.len = le32_to_cpu(*vp++);
+        cipher_text.data = (__u8 *) vp;
+        vlen -= 4;
+
+        if (cipher_text.len > vlen) {
+                CERROR("cipher claimed %u while buf only %u\n",
+                        cipher_text.len, vlen);
+                RETURN(GSS_S_CALL_BAD_STRUCTURE);
+        }
+
+        plain_text = cipher_text;
+
+        major = kgss_unwrap(ctx, GSS_C_QOP_DEFAULT, &cipher_text, &plain_text);
+        if (major) {
+                CERROR("unwrap error 0x%x\n", major);
+                RETURN(major);
+        }
+
+        if (gss_check_seq_num(&rsci->seqdata, gc->gc_seq)) {
+                CERROR("discard request %p with old seq_num %u\n",
+                        req, gc->gc_seq);
+                RETURN(GSS_S_DUPLICATE_TOKEN);
+        }
+
+        req->rq_reqmsg = (struct lustre_msg *) (vp);
+        req->rq_reqlen = plain_text.len;
+
+        CDEBUG(D_SEC, "msg len %d\n", req->rq_reqlen);
+
+        RETURN(GSS_S_COMPLETE);
+}
+
+static int
+gss_pack_err_notify(struct ptlrpc_request *req,
+                    __u32 major, __u32 minor)
+{
+        struct gss_svc_data *svcdata = req->rq_sec_svcdata;
+        __u32 reslen, *resp, *reslenp;
+        char  nidstr[PTL_NALFMT_SIZE];
+        const __u32 secdata_len = 7 * 4;
+        int rc;
+        ENTRY;
+
+        OBD_FAIL_RETURN(OBD_FAIL_SVCGSS_ERR_NOTIFY|OBD_FAIL_ONCE, -EINVAL);
+
+        LASSERT(svcdata);
+        svcdata->is_err_notify = 1;
+        svcdata->reserve_len = 7 * 4;
+
+        rc = lustre_pack_reply(req, 0, NULL, NULL);
+        if (rc) {
+                CERROR("could not pack reply, err %d\n", rc);
+                RETURN(rc);
+        }
+
+        LASSERT(req->rq_reply_state);
+        LASSERT(req->rq_reply_state->rs_repbuf);
+        LASSERT(req->rq_reply_state->rs_repbuf_len >= secdata_len);
+        resp = (__u32 *) req->rq_reply_state->rs_repbuf;
+
+        /* header */
+        *resp++ = cpu_to_le32(PTLRPC_SEC_GSS);
+        *resp++ = cpu_to_le32(PTLRPC_SEC_TYPE_NONE);
+        *resp++ = cpu_to_le32(req->rq_replen);
+        reslenp = resp++;
+
+        /* skip lustre msg */
+        resp += req->rq_replen / 4;
+        reslen = svcdata->reserve_len;
+
+        /* gss reply:
+         * version, subflavor, notify, major, minor,
+         * obj1(fake), obj2(fake)
+         */
+        *resp++ = cpu_to_le32(PTLRPC_SEC_GSS_VERSION);
+        *resp++ = cpu_to_le32(PTLRPC_SEC_GSS_KRB5I);
+        *resp++ = cpu_to_le32(PTLRPC_GSS_PROC_ERR);
+        *resp++ = cpu_to_le32(major);
+        *resp++ = cpu_to_le32(minor);
+        *resp++ = 0;
+        *resp++ = 0;
+        reslen -= (4 * 4);
+        /* the actual sec data length */
+        *reslenp = cpu_to_le32(secdata_len);
+
+        req->rq_reply_state->rs_repdata_len += (secdata_len);
+        CWARN("prepare gss error notify(0x%x/0x%x) to %s\n", major, minor,
+               portals_nid2str(req->rq_peer.peer_ni->pni_number,
+                               req->rq_peer.peer_id.nid, nidstr));
+        RETURN(0);
+}
+
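+/* Handle RPCSEC_GSS_INIT: copy the client's context handle and token into
+ * an rsi entry, upcall to the userspace gssd (which runs
+ * GSS_Accept_sec_context), then build a reply carrying the major/minor
+ * status, the new context handle and the output token.
+ */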
+static int
+gss_svcsec_handle_init(struct ptlrpc_request *req,
+                       struct rpc_gss_wire_cred *gc,
+                       __u32 *secdata, __u32 seclen,
+                       enum ptlrpcs_error *res)
+{
+        struct gss_svc_data *svcdata = req->rq_sec_svcdata;
+        struct rsc          *rsci;
+        struct rsi          *rsikey, *rsip;
+        rawobj_t             tmpobj;
+        __u32 reslen,       *resp, *reslenp;
+        char                 nidstr[PTL_NALFMT_SIZE];
+        int                  rc;
+        ENTRY;
+
+        LASSERT(svcdata);
+
+        CWARN("processing gss init(%d) request from %s\n", gc->gc_proc,
+               portals_nid2str(req->rq_peer.peer_ni->pni_number,
+                               req->rq_peer.peer_id.nid, nidstr));
+
+        *res = PTLRPCS_BADCRED;
+        OBD_FAIL_RETURN(OBD_FAIL_SVCGSS_INIT_REQ|OBD_FAIL_ONCE, SVC_DROP);
+
+        if (gc->gc_proc == RPC_GSS_PROC_INIT &&
+            gc->gc_ctx.len != 0) {
+                CERROR("proc %d, ctx_len %d: not really init?\n",
+                       gc->gc_proc, gc->gc_ctx.len);
+                RETURN(SVC_DROP);
+        }
+
+        OBD_ALLOC(rsikey, sizeof(*rsikey));
+        if (!rsikey) {
+                CERROR("out of memory\n");
+                RETURN(SVC_DROP);
+        }
+        cache_init(&rsikey->h);
+
+        if (rawobj_dup(&rsikey->in_handle, &gc->gc_ctx)) {
+                CERROR("fail to dup context handle\n");
+                GOTO(out_rsikey, rc = SVC_DROP);
+        }
+        *res = PTLRPCS_BADVERF;
+        if (rawobj_extract(&tmpobj, &secdata, &seclen)) {
+                CERROR("can't extract token\n");
+                GOTO(out_rsikey, rc = SVC_DROP);
+        }
+        if (rawobj_dup(&rsikey->in_token, &tmpobj)) {
+                CERROR("can't duplicate token\n");
+                GOTO(out_rsikey, rc = SVC_DROP);
+        }
+
+        rsip = gssd_upcall(rsikey, &my_chandle);
+        if (!rsip) {
+                CERROR("error in gssd_upcall.\n");
+                GOTO(out_rsikey, rc = SVC_DROP);
+        }
+
+        rsci = gss_svc_searchbyctx(&rsip->out_handle);
+        if (!rsci) {
+                CERROR("rsci still not mature yet?\n");
+                GOTO(out_rsip, rc = SVC_DROP);
+        }
+        CWARN("svcsec create gss context %p(%u@%s)\n",
+               rsci, rsci->cred.vc_uid,
+               portals_nid2str(req->rq_peer.peer_ni->pni_number,
+                               req->rq_peer.peer_id.nid, nidstr));
+
+        svcdata->is_init = 1;
+        svcdata->reserve_len = 6 * 4 +
+                size_round4(rsip->out_handle.len) +
+                size_round4(rsip->out_token.len);
+
+        rc = lustre_pack_reply(req, 0, NULL, NULL);
+        if (rc) {
+                CERROR("failed to pack reply, rc = %d\n", rc);
+                GOTO(out, rc = SVC_DROP);
+        }
+
+        /* header */
+        resp = (__u32 *) req->rq_reply_state->rs_repbuf;
+        *resp++ = cpu_to_le32(PTLRPC_SEC_GSS);
+        *resp++ = cpu_to_le32(PTLRPC_SEC_TYPE_NONE);
+        *resp++ = cpu_to_le32(req->rq_replen);
+        reslenp = resp++;
+
+        resp += req->rq_replen / 4;
+        reslen = svcdata->reserve_len;
+
+        /* gss reply:
+         * status, major, minor, seq, out_handle, out_token
+         */
+        *resp++ = cpu_to_le32(PTLRPCS_OK);
+        *resp++ = cpu_to_le32(rsip->major_status);
+        *resp++ = cpu_to_le32(rsip->minor_status);
+        *resp++ = cpu_to_le32(GSS_SEQ_WIN);
+        reslen -= (4 * 4);
+        if (rawobj_serialize(&rsip->out_handle,
+                             &resp, &reslen))
+                LBUG();
+        if (rawobj_serialize(&rsip->out_token,
+                             &resp, &reslen))
+                LBUG();
+        /* the actual sec data length */
+        *reslenp = cpu_to_le32(svcdata->reserve_len - reslen);
+
+        req->rq_reply_state->rs_repdata_len += le32_to_cpu(*reslenp);
+        CDEBUG(D_SEC, "req %p: msgsize %d, authsize %d, "
+               "total size %d\n", req, req->rq_replen,
+               le32_to_cpu(*reslenp),
+               req->rq_reply_state->rs_repdata_len);
+
+        *res = PTLRPCS_OK;
+
+        /* This is simplified since right now we don't support
+         * INIT_CONTINUE yet.
+         */
+        if (gc->gc_proc == RPC_GSS_PROC_INIT) {
+                struct ptlrpcs_wire_hdr *hdr;
+
+                hdr = buf_to_sec_hdr(req->rq_reqbuf);
+                req->rq_reqmsg = buf_to_lustre_msg(req->rq_reqbuf);
+                req->rq_reqlen = hdr->msg_len;
+
+                rc = SVC_LOGIN;
+        } else
+                rc = SVC_COMPLETE;
+
+out:
+        rsc_put(&rsci->h, &rsc_cache);
+out_rsip:
+        rsi_put(&rsip->h, &rsi_cache);
+out_rsikey:
+        rsi_put(&rsikey->h, &rsi_cache);
+
+        RETURN(rc);
+}
+
+static int
+gss_svcsec_handle_data(struct ptlrpc_request *req,
+                       struct rpc_gss_wire_cred *gc,
+                       __u32 *secdata, __u32 seclen,
+                       enum ptlrpcs_error *res)
+{
+        struct rsc          *rsci;
+        char                 nidstr[PTL_NALFMT_SIZE];
+        __u32                major;
+        int                  rc;
+        ENTRY;
+
+        *res = PTLRPCS_GSS_CREDPROBLEM;
+
+        rsci = gss_svc_searchbyctx(&gc->gc_ctx);
+        if (!rsci) {
+                CWARN("Invalid gss context handle from %s\n",
+                       portals_nid2str(req->rq_peer.peer_ni->pni_number,
+                                       req->rq_peer.peer_id.nid, nidstr));
+                major = GSS_S_NO_CONTEXT;
+                goto notify_err;
+        }
+
+        switch (gc->gc_svc) {
+        case PTLRPC_GSS_SVC_INTEGRITY:
+                major = gss_svc_verify_request(req, rsci, gc, secdata, seclen);
+                if (major == GSS_S_COMPLETE)
+                        break;
+
+                CWARN("fail in verify:0x%x: ctx %p@%s\n", major, rsci,
+                       portals_nid2str(req->rq_peer.peer_ni->pni_number,
+                                       req->rq_peer.peer_id.nid, nidstr));
+                goto notify_err;
+        case PTLRPC_GSS_SVC_PRIVACY:
+                major = gss_svc_unseal_request(req, rsci, gc, secdata, seclen);
+                if (major == GSS_S_COMPLETE)
+                        break;
+
+                CWARN("fail in decrypt:0x%x: ctx %p@%s\n", major, rsci,
+                       portals_nid2str(req->rq_peer.peer_ni->pni_number,
+                                       req->rq_peer.peer_id.nid, nidstr));
+                goto notify_err;
+        default:
+                CERROR("unsupported gss service %d\n", gc->gc_svc);
+                GOTO(out, rc = SVC_DROP);
+        }
+
+        req->rq_auth_uid = rsci->cred.vc_uid;
+        req->rq_remote = rsci->remote;
+
+        *res = PTLRPCS_OK;
+        GOTO(out, rc = SVC_OK);
+
+notify_err:
+        if (gss_pack_err_notify(req, major, 0))
+                rc = SVC_DROP;
+        else
+                rc = SVC_COMPLETE;
+out:
+        if (rsci)
+                rsc_put(&rsci->h, &rsc_cache);
+        RETURN(rc);
+}
+
+static int
+gss_svcsec_handle_destroy(struct ptlrpc_request *req,
+                          struct rpc_gss_wire_cred *gc,
+                          __u32 *secdata, __u32 seclen,
+                          enum ptlrpcs_error *res)
+{
+        struct gss_svc_data *svcdata = req->rq_sec_svcdata;
+        struct rsc          *rsci;
+        char                 nidstr[PTL_NALFMT_SIZE];
+        int                  rc;
+        ENTRY;
+
+        LASSERT(svcdata);
+        *res = PTLRPCS_GSS_CREDPROBLEM;
+
+        rsci = gss_svc_searchbyctx(&gc->gc_ctx);
+        if (!rsci) {
+                CWARN("invalid gss context handle for destroy.\n");
+                RETURN(SVC_DROP);
+        }
+
+        if (gc->gc_svc != PTLRPC_GSS_SVC_INTEGRITY) {
+                CERROR("service %d is not supported in destroy.\n",
+                        gc->gc_svc);
+                GOTO(out, rc = SVC_DROP);
+        }
+
+        *res = gss_svc_verify_request(req, rsci, gc, secdata, seclen);
+        if (*res)
+                GOTO(out, rc = SVC_DROP);
+
+        /* compose reply, which is actually nothing */
+        svcdata->is_fini = 1;
+        if (lustre_pack_reply(req, 0, NULL, NULL))
+                GOTO(out, rc = SVC_DROP);
+
+        CWARN("svcsec destroy gss context %p(%u@%s)\n",
+               rsci, rsci->cred.vc_uid,
+               portals_nid2str(req->rq_peer.peer_ni->pni_number,
+                               req->rq_peer.peer_id.nid, nidstr));
+
+        set_bit(CACHE_NEGATIVE, &rsci->h.flags);
+        *res = PTLRPCS_OK;
+        rc = SVC_LOGOUT;
+out:
+        rsc_put(&rsci->h, &rsc_cache);
+        RETURN(rc);
+}
+
+/*
+ * let an incoming request go through the security check:
+ *  o context establishment: invoke the user space helper
+ *  o data exchange: verify/decrypt
+ *  o context destruction: mark the context invalid
+ *
+ * in most cases an error results in dropping the packet silently.
+ */
+static int
+gss_svcsec_accept(struct ptlrpc_request *req, enum ptlrpcs_error *res)
+{
+        struct gss_svc_data *svcdata;
+        struct rpc_gss_wire_cred *gc;
+        struct ptlrpcs_wire_hdr *sec_hdr;
+        __u32 seclen, *secdata, version, subflavor;
+        int rc;
+        ENTRY;
+
+        CDEBUG(D_SEC, "request %p\n", req);
+        LASSERT(req->rq_reqbuf);
+        LASSERT(req->rq_reqbuf_len);
+
+        *res = PTLRPCS_BADCRED;
+
+        sec_hdr = buf_to_sec_hdr(req->rq_reqbuf);
+        LASSERT(sec_hdr->flavor == PTLRPC_SEC_GSS);
+
+        seclen = req->rq_reqbuf_len - sizeof(*sec_hdr) - sec_hdr->msg_len;
+        secdata = (__u32 *) buf_to_sec_data(req->rq_reqbuf);
+
+        if (sec_hdr->sec_len > seclen) {
+                CERROR("seclen %d, while max buf %d\n",
+                        sec_hdr->sec_len, seclen);
+                RETURN(SVC_DROP);
+        }
+
+        if (seclen < 6 * 4) {
+                CERROR("sec size %d too small\n", seclen);
+                RETURN(SVC_DROP);
+        }
+
+        LASSERT(!req->rq_sec_svcdata);
+        OBD_ALLOC(svcdata, sizeof(*svcdata));
+        if (!svcdata) {
+                CERROR("fail to alloc svcdata\n");
+                RETURN(SVC_DROP);
+        }
+        req->rq_sec_svcdata = svcdata;
+        gc = &svcdata->clcred;
+
+        /* Now secdata/seclen covers the security payload we want to parse:
+         * five __u32s (version, subflavor, proc, seq, svc), the context
+         * handle as a length-prefixed rawobj, then per-proc data.
+         */
+        version = le32_to_cpu(*secdata++);      /* version */
+        subflavor = le32_to_cpu(*secdata++);    /* subflavor */
+        gc->gc_proc = le32_to_cpu(*secdata++);  /* proc */
+        gc->gc_seq = le32_to_cpu(*secdata++);   /* seq */
+        gc->gc_svc = le32_to_cpu(*secdata++);   /* service */
+        seclen -= 5 * 4;
+
+        CDEBUG(D_SEC, "wire gss_hdr: %u/%u/%u/%u/%u\n",
+               version, subflavor, gc->gc_proc, gc->gc_seq, gc->gc_svc);
+
+        if (version != PTLRPC_SEC_GSS_VERSION) {
+                CERROR("gss version mismatch: %d - %d\n",
+                        version, PTLRPC_SEC_GSS_VERSION);
+                GOTO(err_free, rc = SVC_DROP);
+        }
+
+        if (rawobj_extract(&gc->gc_ctx, &secdata, &seclen)) {
+                CERROR("fail to obtain gss context handle\n");
+                GOTO(err_free, rc = SVC_DROP);
+        }
+
+        *res = PTLRPCS_BADVERF;
+        switch(gc->gc_proc) {
+        case RPC_GSS_PROC_INIT:
+        case RPC_GSS_PROC_CONTINUE_INIT:
+                rc = gss_svcsec_handle_init(req, gc, secdata, seclen, res);
+                break;
+        case RPC_GSS_PROC_DATA:
+                rc = gss_svcsec_handle_data(req, gc, secdata, seclen, res);
+                break;
+        case RPC_GSS_PROC_DESTROY:
+                rc = gss_svcsec_handle_destroy(req, gc, secdata, seclen, res);
+                break;
+        default:
+                rc = SVC_DROP;
+                LBUG();
+        }
+
+err_free:
+        if (rc == SVC_DROP && req->rq_sec_svcdata) {
+                OBD_FREE(req->rq_sec_svcdata, sizeof(struct gss_svc_data));
+                req->rq_sec_svcdata = NULL;
+        }
+
+        RETURN(rc);
+}
+
+static int
+gss_svcsec_authorize(struct ptlrpc_request *req)
+{
+        struct ptlrpc_reply_state *rs = req->rq_reply_state;
+        struct gss_svc_data *gsd = (struct gss_svc_data *)req->rq_sec_svcdata;
+        struct rpc_gss_wire_cred  *gc = &gsd->clcred;
+        struct rsc                *rscp;
+        struct ptlrpcs_wire_hdr   *sec_hdr;
+        rawobj_buf_t               msg_buf;
+        rawobj_t                   cipher_buf;
+        __u32                     *vp, *vpsave, major, vlen, seclen;
+        rawobj_t                   lmsg, mic;
+        int                        ret;
+        ENTRY;
+
+        LASSERT(rs);
+        LASSERT(rs->rs_repbuf);
+        LASSERT(gsd);
+
+        if (gsd->is_init || gsd->is_init_continue ||
+            gsd->is_err_notify || gsd->is_fini) {
+                /* nothing to do in these cases */
+                CDEBUG(D_SEC, "req %p: init/fini/err\n", req);
+                RETURN(0);
+        }
+
+        if (gc->gc_proc != RPC_GSS_PROC_DATA) {
+                CERROR("proc %d not supported\n", gc->gc_proc);
+                RETURN(-EINVAL);
+        }
+
+        rscp = gss_svc_searchbyctx(&gc->gc_ctx);
+        if (!rscp) {
+                CERROR("ctx disappeared under us?\n");
+                RETURN(-EINVAL);
+        }
+
+        sec_hdr = (struct ptlrpcs_wire_hdr *) rs->rs_repbuf;
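+        /* reply layout: ptlrpcs_wire_hdr, then for INTEGRITY the lustre
+         * message followed by a gss header and a MIC over the message, or
+         * for PRIVACY a gss header followed by the wrapped message.
+         */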
+        switch (gc->gc_svc) {
+        case  PTLRPC_GSS_SVC_INTEGRITY:
+                /* prepare various pointers */
+                lmsg.len = req->rq_replen;
+                lmsg.data = (__u8 *) (rs->rs_repbuf + sizeof(*sec_hdr));
+                vp = (__u32 *) (lmsg.data + lmsg.len);
+                vlen = rs->rs_repbuf_len - sizeof(*sec_hdr) - lmsg.len;
+                seclen = vlen;
+
+                sec_hdr->flavor = cpu_to_le32(PTLRPC_SEC_GSS);
+                sec_hdr->sectype = cpu_to_le32(PTLRPC_SEC_TYPE_AUTH);
+                sec_hdr->msg_len = cpu_to_le32(req->rq_replen);
+
+                /* standard gss hdr */
+                LASSERT(vlen >= 7 * 4);
+                *vp++ = cpu_to_le32(PTLRPC_SEC_GSS_VERSION);
+                *vp++ = cpu_to_le32(PTLRPC_SEC_GSS_KRB5I);
+                *vp++ = cpu_to_le32(RPC_GSS_PROC_DATA);
+                *vp++ = cpu_to_le32(gc->gc_seq);
+                *vp++ = cpu_to_le32(PTLRPC_GSS_SVC_INTEGRITY);
+                *vp++ = 0;      /* fake ctx handle */
+                vpsave = vp++;  /* reserve size */
+                vlen -= 7 * 4;
+
+                mic.len = vlen;
+                mic.data = (char *) vp;
+
+                major = kgss_get_mic(rscp->mechctx, 0, &lmsg, &mic);
+                if (major) {
+                        CERROR("fail to get MIC: 0x%x\n", major);
+                        GOTO(out, ret = -EINVAL);
+                }
+                *vpsave = cpu_to_le32(mic.len);
+                seclen = seclen - vlen + mic.len;
+                sec_hdr->sec_len = cpu_to_le32(seclen);
+                rs->rs_repdata_len += size_round(seclen);
+                break;
+        case  PTLRPC_GSS_SVC_PRIVACY:
+                vp = (__u32 *) (rs->rs_repbuf + sizeof(*sec_hdr));
+                vlen = rs->rs_repbuf_len - sizeof(*sec_hdr);
+                seclen = vlen;
+
+                sec_hdr->flavor = cpu_to_le32(PTLRPC_SEC_GSS);
+                sec_hdr->sectype = cpu_to_le32(PTLRPC_SEC_TYPE_PRIV);
+                sec_hdr->msg_len = cpu_to_le32(0);
+
+                /* standard gss hdr */
+                LASSERT(vlen >= 7 * 4);
+                *vp++ = cpu_to_le32(PTLRPC_SEC_GSS_VERSION);
+                *vp++ = cpu_to_le32(PTLRPC_SEC_GSS_KRB5I);
+                *vp++ = cpu_to_le32(RPC_GSS_PROC_DATA);
+                *vp++ = cpu_to_le32(gc->gc_seq);
+                *vp++ = cpu_to_le32(PTLRPC_GSS_SVC_PRIVACY);
+                *vp++ = 0;      /* fake ctx handle */
+                vpsave = vp++;  /* reserve size */
+                vlen -= 7 * 4;
+
+                msg_buf.buf = (__u8 *) rs->rs_msg - GSS_PRIVBUF_PREFIX_LEN;
+                msg_buf.buflen = req->rq_replen + GSS_PRIVBUF_PREFIX_LEN +
+                                 GSS_PRIVBUF_SUFFIX_LEN;
+                msg_buf.dataoff = GSS_PRIVBUF_PREFIX_LEN;
+                msg_buf.datalen = req->rq_replen;
+
+                cipher_buf.data = (__u8 *) vp;
+                cipher_buf.len = vlen;
+
+                major = kgss_wrap(rscp->mechctx, GSS_C_QOP_DEFAULT,
+                                &msg_buf, &cipher_buf);
+                if (major) {
+                        CERROR("failed to wrap: 0x%x\n", major);
+                        GOTO(out, ret = -EINVAL);
+                }
+
+                *vpsave = cpu_to_le32(cipher_buf.len);
+                seclen = seclen - vlen + cipher_buf.len;
+                sec_hdr->sec_len = cpu_to_le32(seclen);
+                rs->rs_repdata_len += size_round(seclen);
+                break;
+        default:
+                CERROR("Unknown service %d\n", gc->gc_svc);
+                GOTO(out, ret = -EINVAL);
+        }
+        ret = 0;
+out:
+        rsc_put(&rscp->h, &rsc_cache);
+
+        RETURN(ret);
+}
+
+static
+void gss_svcsec_cleanup_req(struct ptlrpc_svcsec *svcsec,
+                            struct ptlrpc_request *req)
+{
+        struct gss_svc_data *gsd = (struct gss_svc_data *) req->rq_sec_svcdata;
+
+        if (!gsd) {
+                CDEBUG(D_SEC, "no svc_data present. do nothing\n");
+                return;
+        }
+
+        /* gsd->clcred.gc_ctx is NOT allocated; it just points into the
+         * incoming packet buffer, so there is no need to free it
+         */
+        OBD_FREE(gsd, sizeof(*gsd));
+        req->rq_sec_svcdata = NULL;
+        return;
+}
+
+static
+int gss_svcsec_est_payload(struct ptlrpc_svcsec *svcsec,
+                           struct ptlrpc_request *req,
+                           int msgsize)
+{
+        struct gss_svc_data *svcdata = req->rq_sec_svcdata;
+        ENTRY;
+
+        /* just return the pre-set reserve_len for init/fini/err cases.
+         */
+        LASSERT(svcdata);
+        if (svcdata->is_init) {
+                CDEBUG(D_SEC, "is_init, reserve size %d(%d)\n",
+                       size_round(svcdata->reserve_len),
+                       svcdata->reserve_len);
+                LASSERT(svcdata->reserve_len);
+                LASSERT(svcdata->reserve_len % 4 == 0);
+                RETURN(size_round(svcdata->reserve_len));
+        } else if (svcdata->is_err_notify) {
+                CDEBUG(D_SEC, "is_err_notify, reserve size %d(%d)\n",
+                       size_round(svcdata->reserve_len),
+                       svcdata->reserve_len);
+                RETURN(size_round(svcdata->reserve_len));
+        } else if (svcdata->is_fini) {
+                CDEBUG(D_SEC, "is_fini, reserve size 0\n");
+                RETURN(0);
+        } else {
+                if (svcdata->clcred.gc_svc == PTLRPC_GSS_SVC_NONE ||
+                    svcdata->clcred.gc_svc == PTLRPC_GSS_SVC_INTEGRITY)
+                        RETURN(size_round(GSS_MAX_AUTH_PAYLOAD));
+                else if (svcdata->clcred.gc_svc == PTLRPC_GSS_SVC_PRIVACY)
+                        RETURN(size_round16(GSS_MAX_AUTH_PAYLOAD + msgsize +
+                                            GSS_PRIVBUF_PREFIX_LEN +
+                                            GSS_PRIVBUF_SUFFIX_LEN));
+                else {
+                        CERROR("unknown gss svc %u\n", svcdata->clcred.gc_svc);
+                        *((int *)0) = 0;
+                        LBUG();
+                }
+        }
+        RETURN(0);
+}
+
+int gss_svcsec_alloc_repbuf(struct ptlrpc_svcsec *svcsec,
+                            struct ptlrpc_request *req,
+                            int msgsize)
+{
+        struct gss_svc_data *gsd = (struct gss_svc_data *) req->rq_sec_svcdata;
+        struct ptlrpc_reply_state *rs;
+        int msg_payload, sec_payload;
+        int privacy, rc;
+        ENTRY;
+
+        /* determine the security type: none/auth or priv, since we use
+         * different packing schemes for them.
+         * init/fini/err is always treated as none/auth.
+         */
+        LASSERT(gsd);
+        if (!gsd->is_init && !gsd->is_init_continue &&
+            !gsd->is_fini && !gsd->is_err_notify &&
+            gsd->clcred.gc_svc == PTLRPC_GSS_SVC_PRIVACY)
+                privacy = 1;
+        else
+                privacy = 0;
+
+        msg_payload = privacy ? 0 : msgsize;
+        sec_payload = gss_svcsec_est_payload(svcsec, req, msgsize);
+
+        rc = svcsec_alloc_reply_state(req, msg_payload, sec_payload);
+        if (rc)
+                RETURN(rc);
+
+        rs = req->rq_reply_state;
+        LASSERT(rs);
+        rs->rs_msg_len = msgsize;
+
+        if (privacy) {
+                /* we could let msg simply point to the rear of the buffer,
+                 * which leads to buffer overlap when doing encryption.
+                 * usually that's ok and it indeed passed all existing tests,
+                 * but we're not sure there won't be subtle problems in the
+                 * future, so for now we allocate a separate buffer and see
+                 * how it works.
+                 */
+#if 0
+                rs->rs_msg = (struct lustre_msg *)
+                             (rs->rs_repbuf + rs->rs_repbuf_len -
+                              msgsize - GSS_PRIVBUF_SUFFIX_LEN);
+#endif
+                char *msgbuf;
+
+                msgsize += GSS_PRIVBUF_PREFIX_LEN + GSS_PRIVBUF_SUFFIX_LEN;
+                OBD_ALLOC(msgbuf, msgsize);
+                if (!msgbuf) {
+                        CERROR("can't alloc %d\n", msgsize);
+                        svcsec_free_reply_state(rs);
+                        req->rq_reply_state = NULL;
+                        RETURN(-ENOMEM);
+                }
+                rs->rs_msg = (struct lustre_msg *)
+                                (msgbuf + GSS_PRIVBUF_PREFIX_LEN);
+        }
+
+        req->rq_repmsg = rs->rs_msg;
+
+        RETURN(0);
+}
+
+static
+void gss_svcsec_free_repbuf(struct ptlrpc_svcsec *svcsec,
+                            struct ptlrpc_reply_state *rs)
+{
+        unsigned long p1 = (unsigned long) rs->rs_msg;
+        unsigned long p2 = (unsigned long) rs->rs_buf;
+
+        LASSERT(rs->rs_buf);
+        LASSERT(rs->rs_msg);
+        LASSERT(rs->rs_msg_len);
+
+        if (p1 < p2 || p1 >= p2 + rs->rs_buf_len) {
+                char *start = (char*) rs->rs_msg - GSS_PRIVBUF_PREFIX_LEN;
+                int size = rs->rs_msg_len + GSS_PRIVBUF_PREFIX_LEN +
+                           GSS_PRIVBUF_SUFFIX_LEN;
+                OBD_FREE(start, size);
+        }
+
+        svcsec_free_reply_state(rs);
+}
+
+struct ptlrpc_svcsec svcsec_gss = {
+        .pss_owner              = THIS_MODULE,
+        .pss_name               = "GSS_SVCSEC",
+        .pss_flavor             = {PTLRPC_SEC_GSS, 0},
+        .accept                 = gss_svcsec_accept,
+        .authorize              = gss_svcsec_authorize,
+        .alloc_repbuf           = gss_svcsec_alloc_repbuf,
+        .free_repbuf            = gss_svcsec_free_repbuf,
+        .cleanup_req            = gss_svcsec_cleanup_req,
+};
+
+/* XXX hacking */
+void lgss_svc_cache_purge_all(void)
+{
+        cache_purge(&rsi_cache);
+        cache_purge(&rsc_cache);
+}
+EXPORT_SYMBOL(lgss_svc_cache_purge_all);
+
+void lgss_svc_cache_flush(__u32 uid)
+{
+        rsc_flush(uid);
+}
+EXPORT_SYMBOL(lgss_svc_cache_flush);
+
+int gss_svc_init(void)
+{
+        int rc;
+
+        rc = svcsec_register(&svcsec_gss);
+        if (!rc) {
+                cache_register(&rsc_cache);
+                cache_register(&rsi_cache);
+        }
+        return rc;
+}
+
+void gss_svc_exit(void)
+{
+        int rc;
+        if ((rc = cache_unregister(&rsi_cache)))
+                CERROR("unregister rsi cache: %d\n", rc);
+        if ((rc = cache_unregister(&rsc_cache)))
+                CERROR("unregister rsc cache: %d\n", rc);
+        if ((rc = svcsec_unregister(&svcsec_gss)))
+                CERROR("unregister svcsec_gss: %d\n", rc);
+}
diff --git a/lustre/sec/sec.c b/lustre/sec/sec.c
new file mode 100644 (file)
index 0000000..9dd5d4f
--- /dev/null
@@ -0,0 +1,932 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_SEC
+#ifdef __KERNEL__
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#else
+#include <liblustre.h>
+#endif
+
+#include <libcfs/kp30.h>
+#include <linux/obd.h>
+#include <linux/obd_class.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_net.h>
+#include <linux/lustre_import.h>
+#include <linux/lustre_dlm.h>
+#include <linux/lustre_sec.h>
+
+static spinlock_t sectypes_lock = SPIN_LOCK_UNLOCKED;
+static struct ptlrpc_sec_type *sectypes[PTLRPC_SEC_MAX_FLAVORS] = {
+        NULL,
+};
+
+int ptlrpcs_register(struct ptlrpc_sec_type *type)
+{
+        __u32 flavor = type->pst_flavor.flavor;
+
+        LASSERT(type->pst_name);
+        LASSERT(type->pst_ops);
+
+        if (flavor >= PTLRPC_SEC_MAX_FLAVORS)
+                return -EINVAL;
+
+        spin_lock(&sectypes_lock);
+        if (sectypes[flavor]) {
+                spin_unlock(&sectypes_lock);
+                return -EALREADY;
+        }
+        sectypes[flavor] = type;
+        atomic_set(&type->pst_inst, 0);
+        spin_unlock(&sectypes_lock);
+
+        CWARN("Security module %s registered\n", type->pst_name);
+        return 0;
+}
+
+int ptlrpcs_unregister(struct ptlrpc_sec_type *type)
+{
+        __u32 flavor = type->pst_flavor.flavor;
+
+        if (flavor >= PTLRPC_SEC_MAX_FLAVORS)
+                return -EINVAL;
+
+        spin_lock(&sectypes_lock);
+        if (!sectypes[flavor]) {
+                spin_unlock(&sectypes_lock);
+                return -EINVAL;
+        }
+
+        if (sectypes[flavor] != type) {
+                CERROR("invalid unregister\n");
+                spin_unlock(&sectypes_lock);
+                return -EINVAL;
+        }
+
+        if (atomic_read(&type->pst_inst)) {
+                CERROR("sec module %s still have instance %d\n",
+                       type->pst_name, atomic_read(&type->pst_inst));
+                spin_unlock(&sectypes_lock);
+                return -EINVAL;
+        }
+
+        CDEBUG(D_SEC, "Security module %s unregistered\n", type->pst_name);
+        sectypes[flavor] = NULL;
+        spin_unlock(&sectypes_lock);
+
+        return 0;
+}
+
+static
+struct ptlrpc_sec_type * ptlrpcs_flavor2type(ptlrpcs_flavor_t *flavor)
+{
+        struct ptlrpc_sec_type *type;
+        __u32 major = flavor->flavor;
+
+        if (major >= PTLRPC_SEC_MAX_FLAVORS)
+                return NULL;
+
+        spin_lock(&sectypes_lock);
+        type = sectypes[major];
+        if (type && !try_module_get(type->pst_owner))
+                type = NULL;
+        spin_unlock(&sectypes_lock);
+        return type;
+}
+
+static inline
+void ptlrpcs_type_put(struct ptlrpc_sec_type *type)
+{
+        module_put(type->pst_owner);
+}
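+
+/*
+ * A flavor module is expected to fill in a ptlrpc_sec_type and call
+ * ptlrpcs_register() from its module init (and ptlrpcs_unregister() on
+ * exit); the null flavor in sec_null.c does exactly this, and a gss
+ * flavor presumably follows the same pattern.
+ */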
+
+/***********************************************
+ * credential cache helpers                    *
+ ***********************************************/
+
+void ptlrpcs_init_credcache(struct ptlrpc_sec *sec)
+{
+        int i;
+        for (i = 0; i < PTLRPC_CREDCACHE_NR; i++)
+                INIT_LIST_HEAD(&sec->ps_credcache[i]);
+        sec->ps_nextgc = get_seconds() + (sec->ps_expire >> 1);
+}
+
+static void ptlrpcs_cred_destroy(struct ptlrpc_cred *cred)
+{
+        struct ptlrpc_sec *sec = cred->pc_sec;
+
+        LASSERT(cred->pc_sec);
+        LASSERT(atomic_read(&cred->pc_refcount) == 0);
+        LASSERT(list_empty(&cred->pc_hash));
+
+        cred->pc_ops->destroy(cred);
+        atomic_dec(&sec->ps_credcount);
+}
+
+static void ptlrpcs_destroy_credlist(struct list_head *head)
+{
+        struct ptlrpc_cred *cred;
+
+        while (!list_empty(head)) {
+                cred = list_entry(head->next, struct ptlrpc_cred, pc_hash);
+                list_del_init(&cred->pc_hash);
+                ptlrpcs_cred_destroy(cred);
+        }
+}
+
+static
+int ptlrpcs_cred_unlink_expired(struct ptlrpc_cred *cred,
+                                struct list_head *freelist)
+{
+        LASSERT(cred->pc_sec);
+
+        if (atomic_read(&cred->pc_refcount) != 0)
+                return 0;
+        if (time_after(cred->pc_expire, get_seconds()))
+                return 0;
+
+        list_del(&cred->pc_hash);
+        list_add(&cred->pc_hash, freelist);
+        CDEBUG(D_SEC, "put cred %p into freelist\n", cred);
+        return 1;
+}
+
+static
+void ptlrpcs_credcache_gc(struct ptlrpc_sec *sec,
+                          struct list_head *freelist)
+{
+        struct ptlrpc_cred *cred, *n;
+        int i;
+        ENTRY;
+
+        CDEBUG(D_SEC, "do gc on sec %s\n", sec->ps_type->pst_name);
+        for (i = 0; i < PTLRPC_CREDCACHE_NR; i++) {
+                list_for_each_entry_safe(cred, n, &sec->ps_credcache[i],
+                                         pc_hash) {
+                        ptlrpcs_cred_unlink_expired(cred, freelist);
+                }
+        }
+        sec->ps_nextgc = get_seconds() + sec->ps_expire;
+        EXIT;
+}
+
+static
+int ptlrpcs_flush_credcache(struct ptlrpc_sec *sec, int force)
+{
+        struct ptlrpc_cred *cred, *n;
+        LIST_HEAD(freelist);
+        int i, busy = 0;
+        ENTRY;
+
+        spin_lock(&sec->ps_lock);
+        for (i = 0; i < PTLRPC_CREDCACHE_NR; i++) {
+                list_for_each_entry_safe(cred, n, &sec->ps_credcache[i],
+                                         pc_hash) {
+                        LASSERT(atomic_read(&cred->pc_refcount) >= 0);
+                        if (atomic_read(&cred->pc_refcount)) {
+                                busy = 1;
+                                if (!force)
+                                        continue;
+                                list_del_init(&cred->pc_hash);
+                        } else
+                                list_move(&cred->pc_hash, &freelist);
+
+                        /* don't remove CRED_UPTODATE flag here */
+                        cred->pc_flags |= PTLRPC_CRED_DEAD;
+                }
+        }
+        spin_unlock(&sec->ps_lock);
+        ptlrpcs_destroy_credlist(&freelist);
+        RETURN(busy);
+}
+
+/**************************************************
+ * credential APIs                                *
+ **************************************************/
+
+static inline
+int ptlrpcs_cred_get_hash(__u64 pag)
+{
+        LASSERT((pag & PTLRPC_CREDCACHE_MASK) < PTLRPC_CREDCACHE_NR);
+        return (pag & PTLRPC_CREDCACHE_MASK);
+}
+
+static
+struct ptlrpc_cred * cred_cache_lookup(struct ptlrpc_sec *sec,
+                                       struct vfs_cred *vcred,
+                                       struct ptlrpc_request *req,
+                                       int create)
+{
+        struct ptlrpc_cred *cred, *new = NULL, *n;
+        LIST_HEAD(freelist);
+        int hash, found = 0;
+        ENTRY;
+
+        hash = ptlrpcs_cred_get_hash(vcred->vc_pag);
+
+retry:
+        spin_lock(&sec->ps_lock);
+        /* do gc if expired */
+        if (time_after(get_seconds(), sec->ps_nextgc))
+                ptlrpcs_credcache_gc(sec, &freelist);
+
+        list_for_each_entry_safe(cred, n, &sec->ps_credcache[hash], pc_hash) {
+                if (cred->pc_flags & PTLRPC_CRED_DEAD)
+                        continue;
+                if (ptlrpcs_cred_unlink_expired(cred, &freelist))
+                        continue;
+                if (cred->pc_ops->match(cred, req, vcred)) {
+                        found = 1;
+                        break;
+                }
+        }
+
+        if (found) {
+                if (new && new != cred) {
+                        /* lost the race, just free it */
+                        list_add(&new->pc_hash, &freelist);
+                }
+                list_move(&cred->pc_hash, &sec->ps_credcache[hash]);
+        } else {
+                if (new) {
+                        list_add(&new->pc_hash, &sec->ps_credcache[hash]);
+                        cred = new;
+                } else if (create) {
+                        spin_unlock(&sec->ps_lock);
+                        new = sec->ps_type->pst_ops->create_cred(sec, req, vcred);
+                        if (new) {
+                                atomic_inc(&sec->ps_credcount);
+                                goto retry;
+                        }
+                        /* creation failed and the lock is already
+                         * dropped, clean up and bail out */
+                        ptlrpcs_destroy_credlist(&freelist);
+                        RETURN(NULL);
+                } else
+                        cred = NULL;
+        }
+
+        /* hold a ref */
+        if (cred)
+                atomic_inc(&cred->pc_refcount);
+
+        spin_unlock(&sec->ps_lock);
+
+        ptlrpcs_destroy_credlist(&freelist);
+        RETURN(cred);
+}
+
+struct ptlrpc_cred * ptlrpcs_cred_lookup(struct ptlrpc_sec *sec,
+                                         struct vfs_cred *vcred)
+{
+        struct ptlrpc_cred *cred;
+        ENTRY;
+
+        cred = cred_cache_lookup(sec, vcred, NULL, 0);
+        RETURN(cred);
+}
+
+int ptlrpcs_req_get_cred(struct ptlrpc_request *req)
+{
+        struct obd_import *imp = req->rq_import;
+        struct vfs_cred vcred;
+        ENTRY;
+
+        LASSERT(!req->rq_cred);
+        LASSERT(imp);
+        LASSERT(imp->imp_sec);
+
+        /* XXX
+         * for now we simply let PAG == real uid
+         */
+        vcred.vc_pag = (__u64) current->uid;
+        vcred.vc_uid = current->uid;
+
+        req->rq_cred = cred_cache_lookup(imp->imp_sec, &vcred, req, 1);
+
+        if (!req->rq_cred) {
+                CERROR("req %p: fail to get cred from cache\n", req);
+                RETURN(-ENOMEM);
+        }
+
+        RETURN(0);
+}
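+
+/*
+ * Rough sketch of the expected call sequence from the ptlrpc client code
+ * (the real call sites live outside this file):
+ *
+ *   ptlrpcs_req_get_cred(req);
+ *   ptlrpcs_cli_alloc_reqbuf(req, msgsize);
+ *   ... fill req->rq_reqmsg ...
+ *   ptlrpcs_cli_alloc_repbuf(req, repsize);
+ *   ptlrpcs_cli_wrap_request(req);
+ *   ... send the request, receive the reply into req->rq_repbuf ...
+ *   ptlrpcs_cli_unwrap_reply(req);
+ *   ... consume req->rq_repmsg ...
+ *   ptlrpcs_cli_free_repbuf(req);
+ *   ptlrpcs_cli_free_reqbuf(req);
+ *   ptlrpcs_req_drop_cred(req);
+ */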
+
+static void ptlrpcs_sec_destroy(struct ptlrpc_sec *sec);
+
+void ptlrpcs_cred_put(struct ptlrpc_cred *cred, int sync)
+{
+        struct ptlrpc_sec *sec = cred->pc_sec;
+
+        LASSERT(cred);
+        LASSERT(sec);
+        LASSERT(atomic_read(&cred->pc_refcount));
+
+        spin_lock(&sec->ps_lock);
+        if (atomic_dec_and_test(&cred->pc_refcount) &&
+            sync && cred->pc_flags & PTLRPC_CRED_DEAD) {
+                list_del_init(&cred->pc_hash);
+                ptlrpcs_cred_destroy(cred);
+                if (!atomic_read(&sec->ps_credcount) &&
+                    !atomic_read(&sec->ps_refcount)) {
+                        CWARN("put last cred on a dead sec %p(%s), "
+                              "also destroy the sec\n", sec,
+                               sec->ps_type->pst_name);
+                        spin_unlock(&sec->ps_lock);
+
+                        ptlrpcs_sec_destroy(sec);
+                        return;
+                }
+        }
+        spin_unlock(&sec->ps_lock);
+}
+
+void ptlrpcs_req_drop_cred(struct ptlrpc_request *req)
+{
+        ENTRY;
+
+        LASSERT(req);
+        LASSERT(req->rq_cred);
+
+        if (req->rq_cred) {
+                /* We'd like to avoid 'sync' mode, but that might cause
+                 * cred leaks. Needs more thought here. FIXME
+                 */
+                ptlrpcs_cred_put(req->rq_cred, 1);
+                req->rq_cred = NULL;
+        } else
+                CDEBUG(D_SEC, "req %p have no cred\n", req);
+        EXIT;
+}
+
+/*
+ * The request must have a cred. If we fail to get a new cred,
+ * just restore the old one.
+ */
+int ptlrpcs_req_replace_dead_cred(struct ptlrpc_request *req)
+{
+        struct ptlrpc_cred *cred = req->rq_cred;
+        int rc;
+        ENTRY;
+
+        LASSERT(cred);
+        LASSERT(cred->pc_flags & PTLRPC_CRED_DEAD);
+
+        ptlrpcs_cred_get(cred);
+        ptlrpcs_req_drop_cred(req);
+        LASSERT(!req->rq_cred);
+        rc = ptlrpcs_req_get_cred(req);
+        if (!rc) {
+                LASSERT(req->rq_cred);
+                LASSERT(req->rq_cred != cred);
+                ptlrpcs_cred_put(cred, 1);
+        } else {
+                LASSERT(!req->rq_cred);
+                req->rq_cred = cred;
+        }
+        RETURN(rc);
+}
+
+int ptlrpcs_req_refresh_cred(struct ptlrpc_request *req)
+{
+        struct ptlrpc_cred *cred = req->rq_cred;
+        int rc;
+        ENTRY;
+
+        LASSERT(cred);
+
+        if ((cred->pc_flags & (PTLRPC_CRED_UPTODATE | PTLRPC_CRED_DEAD)) ==
+            PTLRPC_CRED_UPTODATE)
+                RETURN(0);
+
+        if (cred->pc_flags & PTLRPC_CRED_DEAD) {
+                rc = ptlrpcs_req_replace_dead_cred(req);
+                if (!rc) {
+                        LASSERT(cred != req->rq_cred);
+                        CWARN("req %p: replace cred %p => %p\n",
+                               req, cred, req->rq_cred);
+                        cred = req->rq_cred;
+                } else {
+                        LASSERT(cred == req->rq_cred);
+                        CERROR("req %p: failed to replace dead cred %p\n",
+                                req, cred);
+                        RETURN(-ENOMEM);
+                }
+        }
+
+        rc = ptlrpcs_cred_refresh(cred);
+        if (!(cred->pc_flags & PTLRPC_CRED_UPTODATE)) {
+                CERROR("req %p: failed to refresh cred %p, rc %d\n",
+                        req, cred, rc);
+                if (!rc)
+                        rc = -EACCES;
+        }
+        RETURN(rc);
+}
+
+int ptlrpcs_cli_wrap_request(struct ptlrpc_request *req)
+{
+        struct ptlrpc_cred     *cred;
+        int rc;
+        ENTRY;
+
+        LASSERT(req->rq_cred);
+        LASSERT(req->rq_cred->pc_sec);
+        LASSERT(req->rq_cred->pc_ops);
+        LASSERT(req->rq_reqbuf);
+        LASSERT(req->rq_reqbuf_len);
+
+        rc = ptlrpcs_req_refresh_cred(req);
+        if (rc)
+                RETURN(rc);
+
+        CDEBUG(D_SEC, "wrap req %p\n", req);
+        cred = req->rq_cred;
+
+        switch (cred->pc_sec->ps_sectype) {
+        case PTLRPC_SEC_TYPE_NONE:
+        case PTLRPC_SEC_TYPE_AUTH:
+                if (req->rq_req_wrapped) {
+                        CWARN("req %p(o%u,x"LPU64",t"LPU64") "
+                              "already signed, resend?\n", req,
+                               req->rq_reqmsg ? req->rq_reqmsg->opc : -1,
+                               req->rq_xid, req->rq_transno);
+                        req->rq_req_wrapped = 0;
+                        req->rq_reqdata_len = sizeof(struct ptlrpcs_wire_hdr) +
+                                              req->rq_reqlen;
+                        LASSERT(req->rq_reqdata_len % 8 == 0);
+                }
+
+                LASSERT(cred->pc_ops->sign);
+                rc = cred->pc_ops->sign(cred, req);
+                if (!rc)
+                        req->rq_req_wrapped = 1;
+                break;
+        case PTLRPC_SEC_TYPE_PRIV:
+                if (req->rq_req_wrapped) {
+                        CWARN("req %p(o%u,x"LPU64",t"LPU64") "
+                              "already encrypted, resend?\n", req,
+                               req->rq_reqmsg ? req->rq_reqmsg->opc : -1,
+                               req->rq_xid, req->rq_transno);
+                        req->rq_req_wrapped = 0;
+                        req->rq_reqdata_len = sizeof(struct ptlrpcs_wire_hdr);
+                        LASSERT(req->rq_reqdata_len % 8 == 0);
+                }
+
+                LASSERT(cred->pc_ops->seal);
+                rc = cred->pc_ops->seal(cred, req);
+                if (!rc)
+                        req->rq_req_wrapped = 1;
+                break;
+        default:
+                LBUG();
+        }
+        LASSERT(req->rq_reqdata_len);
+        LASSERT(req->rq_reqdata_len % 8 == 0);
+        LASSERT(req->rq_reqdata_len >= sizeof(struct ptlrpcs_wire_hdr));
+        LASSERT(req->rq_reqdata_len <= req->rq_reqbuf_len);
+
+        RETURN(rc);
+}
+
+/* rq_nob_received is the actual received data length */
+int ptlrpcs_cli_unwrap_reply(struct ptlrpc_request *req)
+{
+        struct ptlrpc_cred *cred = req->rq_cred;
+        struct ptlrpc_sec *sec;
+        struct ptlrpcs_wire_hdr *sec_hdr;
+        int rc;
+        ENTRY;
+
+        LASSERT(cred);
+        LASSERT(cred->pc_sec);
+        LASSERT(cred->pc_ops);
+        LASSERT(req->rq_repbuf);
+        
+        if (req->rq_nob_received < sizeof(*sec_hdr)) {
+                CERROR("req %p: reply size only %d\n",
+                        req, req->rq_nob_received);
+                RETURN(-EPROTO);
+        }
+
+        sec_hdr = (struct ptlrpcs_wire_hdr *) req->rq_repbuf;
+        sec_hdr->flavor = le32_to_cpu(sec_hdr->flavor);
+        sec_hdr->sectype = le32_to_cpu(sec_hdr->sectype);
+        sec_hdr->msg_len = le32_to_cpu(sec_hdr->msg_len);
+        sec_hdr->sec_len = le32_to_cpu(sec_hdr->sec_len);
+
+        CDEBUG(D_SEC, "req %p, cred %p, flavor %u, sectype %u\n",
+               req, cred, sec_hdr->flavor, sec_hdr->sectype);
+
+        sec = cred->pc_sec;
+        if (sec_hdr->flavor != sec->ps_flavor.flavor) {
+                CERROR("unmatched flavor %u while expect %u\n",
+                       sec_hdr->flavor, sec->ps_flavor.flavor);
+                RETURN(-EPROTO);
+        }
+
+        if (sizeof(*sec_hdr) + sec_hdr->msg_len + sec_hdr->sec_len >
+            req->rq_nob_received) {
+                CERROR("msg %u, sec %u, while only get %d\n",
+                        sec_hdr->msg_len, sec_hdr->sec_len,
+                        req->rq_nob_received);
+                RETURN(-EPROTO);
+        }
+
+        switch (sec_hdr->sectype) {
+        case PTLRPC_SEC_TYPE_NONE:
+        case PTLRPC_SEC_TYPE_AUTH:
+                LASSERT(cred->pc_ops->verify);
+                rc = cred->pc_ops->verify(cred, req);
+                LASSERT(rc || req->rq_repmsg || req->rq_ptlrpcs_restart);
+                break;
+        case PTLRPC_SEC_TYPE_PRIV:
+                LASSERT(cred->pc_ops->unseal);
+                rc = cred->pc_ops->unseal(cred, req);
+                LASSERT(rc || req->rq_repmsg || req->rq_ptlrpcs_restart);
+                break;
+        default:
+                rc = -1;
+                LBUG();
+        }
+        RETURN(rc);
+}
+
+/**************************************************
+ * security APIs                                  *
+ **************************************************/
+
+struct ptlrpc_sec * ptlrpcs_sec_create(ptlrpcs_flavor_t *flavor,
+                                       struct obd_import *import,
+                                       const char *pipe_dir,
+                                       void *pipe_data)
+{
+        struct ptlrpc_sec_type *type;
+        struct ptlrpc_sec *sec;
+        ENTRY;
+
+        type = ptlrpcs_flavor2type(flavor);
+        if (!type) {
+                CDEBUG(D_SEC, "invalid major flavor %u\n", flavor->flavor);
+                RETURN(NULL);
+        }
+
+        sec = type->pst_ops->create_sec(flavor, pipe_dir, pipe_data);
+        if (sec) {
+                spin_lock_init(&sec->ps_lock);
+                ptlrpcs_init_credcache(sec);
+                sec->ps_type = type;
+                sec->ps_flavor = *flavor;
+                sec->ps_import = class_import_get(import);
+                atomic_set(&sec->ps_refcount, 1);
+                atomic_set(&sec->ps_credcount, 0);
+                atomic_inc(&type->pst_inst);
+        } else
+                ptlrpcs_type_put(type);
+
+        return sec;
+}
+
+static void ptlrpcs_sec_destroy(struct ptlrpc_sec *sec)
+{
+        struct ptlrpc_sec_type *type = sec->ps_type;
+        struct obd_import *imp = sec->ps_import;
+
+        LASSERT(type && type->pst_ops);
+        LASSERT(type->pst_ops->destroy_sec);
+
+        type->pst_ops->destroy_sec(sec);
+        atomic_dec(&type->pst_inst);
+        ptlrpcs_type_put(type);
+        class_import_put(imp);
+}
+
+void ptlrpcs_sec_put(struct ptlrpc_sec *sec)
+{
+        if (atomic_dec_and_test(&sec->ps_refcount)) {
+                ptlrpcs_flush_credcache(sec, 1);
+
+                if (atomic_read(&sec->ps_credcount) == 0) {
+                        ptlrpcs_sec_destroy(sec);
+                } else {
+                        CWARN("sec %p(%s) is no usage while %d cred still "
+                              "holded, destroy delayed\n",
+                               sec, sec->ps_type->pst_name,
+                               atomic_read(&sec->ps_credcount));
+                }
+        }
+}
+
+void ptlrpcs_sec_invalidate_cache(struct ptlrpc_sec *sec)
+{
+        ptlrpcs_flush_credcache(sec, 1);
+}
+
+int sec_alloc_reqbuf(struct ptlrpc_sec *sec,
+                     struct ptlrpc_request *req,
+                     int msgsize, int secsize)
+{
+        struct ptlrpcs_wire_hdr *hdr;
+        ENTRY;
+
+        LASSERT(msgsize % 8 == 0);
+        LASSERT(secsize % 8 == 0);
+
+        req->rq_reqbuf_len = sizeof(*hdr) + msgsize + secsize;
+        OBD_ALLOC(req->rq_reqbuf, req->rq_reqbuf_len);
+        if (!req->rq_reqbuf) {
+                CERROR("can't alloc %d\n", req->rq_reqbuf_len);
+                RETURN(-ENOMEM);
+        }
+
+        hdr = buf_to_sec_hdr(req->rq_reqbuf);
+        hdr->flavor = cpu_to_le32(sec->ps_flavor.flavor);
+        hdr->sectype = cpu_to_le32(sec->ps_sectype);
+        hdr->msg_len = msgsize;
+        /* security length will be filled later */
+
+        /* the security payload length is added to reqdata_len later */
+        req->rq_reqdata_len = sizeof(*hdr) + msgsize;
+        req->rq_reqmsg = buf_to_lustre_msg(req->rq_reqbuf);
+
+        CDEBUG(D_SEC, "req %p: rqbuf at %p, len %d, msg %d, sec %d\n",
+               req, req->rq_reqbuf, req->rq_reqbuf_len,
+               msgsize, secsize);
+
+        RETURN(0);
+}
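+
+/*
+ * Resulting request buffer layout (a sketch of what the code above builds):
+ *
+ *   | ptlrpcs_wire_hdr | lustre_msg (msgsize) | sec payload (secsize) |
+ *
+ * hdr->sec_len and the security payload itself are expected to be filled
+ * in later by the flavor-specific sign/seal hook.
+ */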
+
+/* when complete successfully, req->rq_reqmsg should point to the
+ * right place.
+ */
+int ptlrpcs_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize)
+{
+        struct ptlrpc_cred *cred = req->rq_cred;
+        struct ptlrpc_sec *sec;
+        struct ptlrpc_secops *ops;
+
+        LASSERT(msgsize % 8 == 0);
+        LASSERT(sizeof(struct ptlrpcs_wire_hdr) % 8 == 0);
+        LASSERT(cred);
+        LASSERT(atomic_read(&cred->pc_refcount));
+        LASSERT(cred->pc_sec);
+        LASSERT(cred->pc_sec->ps_type);
+        LASSERT(cred->pc_sec->ps_type->pst_ops);
+        LASSERT(req->rq_reqbuf == NULL);
+        LASSERT(req->rq_reqmsg == NULL);
+
+        sec = cred->pc_sec;
+        ops = sec->ps_type->pst_ops;
+        if (ops->alloc_reqbuf)
+                return ops->alloc_reqbuf(sec, req, msgsize);
+        else
+                return sec_alloc_reqbuf(sec, req, msgsize, 0);
+}
+
+void sec_free_reqbuf(struct ptlrpc_sec *sec,
+                     struct ptlrpc_request *req)
+{
+        LASSERT(req->rq_reqbuf);
+        LASSERT(req->rq_reqbuf_len);
+
+        /* sanity check */
+        if (req->rq_reqmsg) {
+                LASSERT((char *) req->rq_reqmsg >= req->rq_reqbuf &&
+                        (char *) req->rq_reqmsg < req->rq_reqbuf +
+                                                  req->rq_reqbuf_len);
+        }
+
+        OBD_FREE(req->rq_reqbuf, req->rq_reqbuf_len);
+        req->rq_reqbuf = NULL;
+        req->rq_reqmsg = NULL;
+}
+
+void ptlrpcs_cli_free_reqbuf(struct ptlrpc_request *req)
+{
+        struct ptlrpc_cred *cred = req->rq_cred;
+        struct ptlrpc_sec *sec;
+        struct ptlrpc_secops *ops;
+
+        LASSERT(cred);
+        LASSERT(atomic_read(&cred->pc_refcount));
+        LASSERT(cred->pc_sec);
+        LASSERT(cred->pc_sec->ps_type);
+        LASSERT(cred->pc_sec->ps_type->pst_ops);
+        LASSERT(req->rq_reqbuf);
+
+        sec = cred->pc_sec;
+        ops = sec->ps_type->pst_ops;
+        if (ops->free_reqbuf)
+                ops->free_reqbuf(sec, req);
+        else
+                sec_free_reqbuf(sec, req);
+}
+
+int ptlrpcs_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize)
+{
+        struct ptlrpc_cred *cred = req->rq_cred;
+        struct ptlrpc_sec *sec;
+        struct ptlrpc_secops *ops;
+        int msg_payload, sec_payload;
+        ENTRY;
+
+        LASSERT(msgsize % 8 == 0);
+        LASSERT(sizeof(struct ptlrpcs_wire_hdr) % 8 == 0);
+        LASSERT(cred);
+        LASSERT(atomic_read(&cred->pc_refcount));
+        LASSERT(cred->pc_sec);
+        LASSERT(cred->pc_sec->ps_type);
+        LASSERT(cred->pc_sec->ps_type->pst_ops);
+        LASSERT(req->rq_repbuf == NULL);
+
+        sec = cred->pc_sec;
+        ops = sec->ps_type->pst_ops;
+        if (ops->alloc_repbuf)
+                RETURN(ops->alloc_repbuf(sec, req, msgsize));
+
+        /* default allocation scheme */
+        msg_payload = sec->ps_sectype == PTLRPC_SEC_TYPE_PRIV ? 0 : msgsize;
+        sec_payload = size_round(ptlrpcs_est_rep_payload(sec, msgsize));
+
+        req->rq_repbuf_len = sizeof(struct ptlrpcs_wire_hdr) +
+                             msg_payload + sec_payload;
+        OBD_ALLOC(req->rq_repbuf, req->rq_repbuf_len);
+        if (!req->rq_repbuf)
+                RETURN(-ENOMEM);
+
+        CDEBUG(D_SEC, "req %p: repbuf at %p, len %d, msg %d, sec %d\n",
+               req, req->rq_repbuf, req->rq_repbuf_len,
+               msg_payload, sec_payload);
+
+        RETURN(0);
+}
+
+void ptlrpcs_cli_free_repbuf(struct ptlrpc_request *req)
+{
+        struct ptlrpc_cred *cred = req->rq_cred;
+        struct ptlrpc_sec *sec;
+        struct ptlrpc_secops *ops;
+        ENTRY;
+
+        LASSERT(cred);
+        LASSERT(atomic_read(&cred->pc_refcount));
+        LASSERT(cred->pc_sec);
+        LASSERT(cred->pc_sec->ps_type);
+        LASSERT(cred->pc_sec->ps_type->pst_ops);
+        LASSERT(req->rq_repbuf);
+
+        sec = cred->pc_sec;
+        ops = sec->ps_type->pst_ops;
+        if (ops->free_repbuf)
+                ops->free_repbuf(sec, req);
+        else {
+                OBD_FREE(req->rq_repbuf, req->rq_repbuf_len);
+                req->rq_repbuf = NULL;
+                req->rq_repmsg = NULL;
+        }
+        EXIT;
+}
+
+int ptlrpcs_import_get_sec(struct obd_import *imp)
+{
+        ptlrpcs_flavor_t flavor = {PTLRPC_SEC_NULL, 0};
+        char *pipedir = NULL;
+        ENTRY;
+
+        LASSERT(imp->imp_obd);
+        LASSERT(imp->imp_obd->obd_type);
+
+        /* the old sec might still be there when reconnecting */
+        if (imp->imp_sec)
+                RETURN(0);
+
+        /* find the actual flavor for a client obd. for now, server side
+         * obds (reverse imports, etc.) simply use NULL.
+         */
+        if (!strcmp(imp->imp_obd->obd_type->typ_name, "mdc") ||
+            !strcmp(imp->imp_obd->obd_type->typ_name, "osc")) {
+                struct client_obd *cli = &imp->imp_obd->u.cli;
+
+                if (cli->cl_sec_flavor == PTLRPC_SEC_GSS) {
+                        CWARN("select security gss/%s for %s(%s)\n",
+                               cli->cl_sec_subflavor == PTLRPC_SEC_GSS_KRB5I ?
+                               "krb5i" : "krb5p",
+                               imp->imp_obd->obd_type->typ_name,
+                               imp->imp_obd->obd_name);
+                        flavor.flavor = cli->cl_sec_flavor;
+                        flavor.subflavor = cli->cl_sec_subflavor;
+                        pipedir = imp->imp_obd->obd_name;
+                } else if (cli->cl_sec_flavor == PTLRPC_SEC_NULL) {
+                        CWARN("select security null for %s(%s)\n",
+                               imp->imp_obd->obd_type->typ_name,
+                               imp->imp_obd->obd_name);
+                } else {
+                        CWARN("unknown security flavor for %s(%s), "
+                              "use 'null'\n",
+                               imp->imp_obd->obd_type->typ_name,
+                               imp->imp_obd->obd_name);
+                }
+        }
+
+        imp->imp_sec = ptlrpcs_sec_create(&flavor, imp, pipedir, imp);
+        if (!imp->imp_sec)
+                RETURN(-EINVAL);
+        else
+                RETURN(0);
+}
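+
+/*
+ * Note: cl_sec_flavor/cl_sec_subflavor are presumably set on the client
+ * obd by configuration before the import connects; when they are left
+ * unset, the import silently falls back to the null flavor chosen above.
+ */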
+
+void ptlrpcs_import_drop_sec(struct obd_import *imp)
+{
+        ENTRY;
+        if (imp->imp_sec) {
+                ptlrpcs_sec_put(imp->imp_sec);
+                imp->imp_sec = NULL;
+        }
+        EXIT;
+}
+
+int __init ptlrpc_sec_init(void)
+{
+        int rc;
+
+        if ((rc = ptlrpcs_null_init()))
+                return rc;
+
+        if ((rc = svcsec_null_init())) {
+                ptlrpcs_null_exit();
+                return rc;
+        }
+
+#if 0
+#if !defined __KERNEL__ && defined ENABLE_GSS
+        ptlrpcs_gss_init();
+#endif
+#endif
+        return 0;
+}
+
+static void __exit ptlrpc_sec_exit(void)
+{
+        svcsec_null_exit();
+        ptlrpcs_null_exit();
+}
+
+
+EXPORT_SYMBOL(ptlrpcs_register);
+EXPORT_SYMBOL(ptlrpcs_unregister);
+EXPORT_SYMBOL(ptlrpcs_sec_create);
+EXPORT_SYMBOL(ptlrpcs_sec_put);
+EXPORT_SYMBOL(ptlrpcs_sec_invalidate_cache);
+EXPORT_SYMBOL(ptlrpcs_import_get_sec);
+EXPORT_SYMBOL(ptlrpcs_import_drop_sec);
+EXPORT_SYMBOL(ptlrpcs_cred_lookup);
+EXPORT_SYMBOL(ptlrpcs_cred_put);
+EXPORT_SYMBOL(ptlrpcs_req_get_cred);
+EXPORT_SYMBOL(ptlrpcs_req_drop_cred);
+EXPORT_SYMBOL(ptlrpcs_req_replace_dead_cred);
+EXPORT_SYMBOL(ptlrpcs_req_refresh_cred);
+EXPORT_SYMBOL(ptlrpcs_cli_alloc_reqbuf);
+EXPORT_SYMBOL(ptlrpcs_cli_free_reqbuf);
+EXPORT_SYMBOL(ptlrpcs_cli_alloc_repbuf);
+EXPORT_SYMBOL(ptlrpcs_cli_free_repbuf);
+EXPORT_SYMBOL(ptlrpcs_cli_wrap_request);
+EXPORT_SYMBOL(ptlrpcs_cli_unwrap_reply);
+EXPORT_SYMBOL(sec_alloc_reqbuf);
+EXPORT_SYMBOL(sec_free_reqbuf);
+
+EXPORT_SYMBOL(svcsec_register);
+EXPORT_SYMBOL(svcsec_unregister);
+EXPORT_SYMBOL(svcsec_accept);
+EXPORT_SYMBOL(svcsec_authorize);
+EXPORT_SYMBOL(svcsec_alloc_repbuf);
+EXPORT_SYMBOL(svcsec_cleanup_req);
+EXPORT_SYMBOL(svcsec_get);
+EXPORT_SYMBOL(svcsec_put);
+EXPORT_SYMBOL(svcsec_alloc_reply_state);
+EXPORT_SYMBOL(svcsec_free_reply_state);
+
+MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_DESCRIPTION("Lustre Security Support");
+MODULE_LICENSE("GPL");
+
+module_init(ptlrpc_sec_init);
+module_exit(ptlrpc_sec_exit);
diff --git a/lustre/sec/sec_null.c b/lustre/sec/sec_null.c
new file mode 100644 (file)
index 0000000..3d9d908
--- /dev/null
@@ -0,0 +1,195 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_SEC
+#ifdef __KERNEL__
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#else
+#include <liblustre.h>
+#endif
+
+#include <libcfs/kp30.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_net.h>
+#include <linux/lustre_sec.h>
+
+static int null_cred_refresh(struct ptlrpc_cred *cred)
+{
+        ENTRY;
+        LASSERT(cred->pc_flags & PTLRPC_CRED_UPTODATE);
+        RETURN(0);
+}
+
+static int null_cred_match(struct ptlrpc_cred *cred,
+                           struct ptlrpc_request *req,
+                           struct vfs_cred *vcred)
+{
+        ENTRY;
+        RETURN(1);
+}
+
+static int null_cred_sign(struct ptlrpc_cred *cred,
+                          struct ptlrpc_request *req)
+{
+        struct ptlrpcs_wire_hdr *hdr = buf_to_sec_hdr(req->rq_reqbuf);
+        ENTRY;
+
+        hdr->sec_len = cpu_to_le32(0);
+
+        RETURN(0);
+}
+
+static int null_cred_verify(struct ptlrpc_cred *cred,
+                            struct ptlrpc_request *req)
+{
+        struct ptlrpcs_wire_hdr *hdr = buf_to_sec_hdr(req->rq_repbuf);
+
+        if (hdr->sec_len != 0) {
+                CERROR("security payload %u not zero\n", hdr->sec_len);
+                RETURN(-EPROTO);
+        }
+
+        req->rq_repmsg = (struct lustre_msg *)(hdr + 1);
+        req->rq_replen = hdr->msg_len;
+        CDEBUG(D_SEC, "set repmsg at %p, len %d\n",
+               req->rq_repmsg, req->rq_replen);
+
+        RETURN(0);
+}
+
+static void null_cred_destroy(struct ptlrpc_cred *cred)
+{
+        LASSERT(!atomic_read(&cred->pc_refcount));
+
+        CDEBUG(D_SEC, "NULL_SEC: destroy cred %p\n", cred);
+        OBD_FREE(cred, sizeof(*cred));
+}
+
+static struct ptlrpc_credops null_credops = {
+        .refresh        = null_cred_refresh,
+        .match          = null_cred_match,
+        .sign           = null_cred_sign,
+        .verify         = null_cred_verify,
+        .destroy        = null_cred_destroy,
+};
+
+static
+struct ptlrpc_sec* null_create_sec(ptlrpcs_flavor_t *flavor,
+                                   const char *pipe_dir,
+                                   void *pipe_data)
+{
+        struct ptlrpc_sec *sec;
+        ENTRY;
+
+        LASSERT(flavor->flavor == PTLRPC_SEC_NULL);
+
+        OBD_ALLOC(sec, sizeof(*sec));
+        if (!sec)
+                RETURN(NULL); /* callers check for NULL, not ERR_PTR */
+
+        sec->ps_sectype = PTLRPC_SEC_TYPE_NONE;
+        sec->ps_expire = (-1UL >> 1); /* never expire */
+        sec->ps_nextgc = (-1UL >> 1);
+        sec->ps_flags = 0;
+
+        CDEBUG(D_SEC, "Create NULL security module at %p\n", sec);
+        RETURN(sec);
+}
+
+static
+void null_destroy_sec(struct ptlrpc_sec *sec)
+{
+        ENTRY;
+
+        CDEBUG(D_SEC, "Destroy NULL security module at %p\n", sec);
+
+        LASSERT(!atomic_read(&sec->ps_refcount));
+        OBD_FREE(sec, sizeof(*sec));
+        EXIT;
+}
+
+static
+struct ptlrpc_cred* null_create_cred(struct ptlrpc_sec *sec,
+                                     struct ptlrpc_request *req,
+                                     struct vfs_cred *vcred)
+{
+        struct ptlrpc_cred *cred;
+        ENTRY;
+
+        OBD_ALLOC(cred, sizeof(*cred));
+        if (!cred)
+                RETURN(NULL);
+
+        INIT_LIST_HEAD(&cred->pc_hash);
+        atomic_set(&cred->pc_refcount, 0);
+        cred->pc_sec = sec;
+        cred->pc_ops = &null_credops;
+        cred->pc_req = req;
+        cred->pc_expire = (-1UL >> 1); /* never expire */
+        cred->pc_flags = PTLRPC_CRED_UPTODATE;
+        cred->pc_pag = vcred->vc_pag;
+        cred->pc_uid = vcred->vc_uid;
+        CDEBUG(D_SEC, "create a null cred at %p("LPU64"/%u)\n",
+               cred, vcred->vc_pag, vcred->vc_uid);
+
+        RETURN(cred);
+}
+
+static struct ptlrpc_secops null_secops = {
+        .create_sec     = null_create_sec,
+        .destroy_sec    = null_destroy_sec,
+        .create_cred    = null_create_cred,
+};
+
+static struct ptlrpc_sec_type null_type = {
+        .pst_owner      = THIS_MODULE,
+        .pst_name       = "NULL_SEC",
+        .pst_inst       = ATOMIC_INIT(0),
+        .pst_flavor     = {PTLRPC_SEC_NULL, 0},
+        .pst_ops        = &null_secops,
+};
+
+int ptlrpcs_null_init(void)
+{
+        int rc;
+
+        rc = ptlrpcs_register(&null_type);
+        if (rc)
+                CERROR("failed to register NULL security: %d\n", rc);
+
+        return rc;
+}
+
+int ptlrpcs_null_exit(void)
+{
+        int rc;
+
+        rc = ptlrpcs_unregister(&null_type);
+        if (rc)
+                CERROR("cannot unregister NULL security: %d\n", rc);
+
+        return rc;
+}
diff --git a/lustre/sec/svcsec.c b/lustre/sec/svcsec.c
new file mode 100644 (file)
index 0000000..b6792c1
--- /dev/null
@@ -0,0 +1,273 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_SEC
+#ifdef __KERNEL__
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#else
+#include <liblustre.h>
+#endif
+
+#include <libcfs/kp30.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_idl.h>
+#include <linux/lustre_net.h>
+#include <linux/lustre_sec.h>
+
+static spinlock_t svcsecs_lock = SPIN_LOCK_UNLOCKED;
+static struct ptlrpc_svcsec *svcsecs[PTLRPC_SEC_MAX_FLAVORS] = {
+        NULL,
+};
+
+int svcsec_register(struct ptlrpc_svcsec *sec)
+{
+        __u32 flavor = sec->pss_flavor.flavor;
+
+        if (flavor >= PTLRPC_SEC_MAX_FLAVORS)
+                return -EINVAL;
+
+        spin_lock(&svcsecs_lock);
+        if (svcsecs[flavor]) {
+                spin_unlock(&svcsecs_lock);
+                return -EALREADY;
+        }
+        svcsecs[flavor] = sec;
+        spin_unlock(&svcsecs_lock);
+
+        CDEBUG(D_SEC, "Registered svc security module %s\n", sec->pss_name);
+        return 0;
+}
+
+int svcsec_unregister(struct ptlrpc_svcsec *sec)
+{
+        __u32 flavor = sec->pss_flavor.flavor;
+
+        if (flavor >= PTLRPC_SEC_MAX_FLAVORS)
+                return -EINVAL;
+
+        spin_lock(&svcsecs_lock);
+        if (!svcsecs[flavor]) {
+                spin_unlock(&svcsecs_lock);
+                return -EINVAL;
+        }
+
+        LASSERT(svcsecs[flavor] == sec);
+
+        CDEBUG(D_SEC, "Unregistered svc security module %s\n", sec->pss_name);
+        svcsecs[flavor] = NULL;
+        spin_unlock(&svcsecs_lock);
+
+        return 0;
+}
+
+static
+struct ptlrpc_svcsec * flavor2svcsec(__u32 flavor)
+{
+        struct ptlrpc_svcsec *sec;
+
+        if (flavor >= PTLRPC_SEC_MAX_FLAVORS)
+                return NULL;
+
+        spin_lock(&svcsecs_lock);
+        sec = svcsecs[flavor];
+        if (sec && !try_module_get(sec->pss_owner))
+                sec = NULL;
+        spin_unlock(&svcsecs_lock);
+        return sec;
+}
+
+struct ptlrpc_svcsec * svcsec_get(struct ptlrpc_svcsec *sec)
+{
+        int rc;
+
+        spin_lock(&svcsecs_lock);
+        rc = try_module_get(sec->pss_owner);
+        spin_unlock(&svcsecs_lock);
+        LASSERT(rc);
+        return sec;
+}
+
+void svcsec_put(struct ptlrpc_svcsec *sec)
+{
+        spin_lock(&svcsecs_lock);
+        module_put(sec->pss_owner);
+        spin_unlock(&svcsecs_lock);
+}
+
+/*
+ * common code to allocate reply_state buffer.
+ */
+int svcsec_alloc_reply_state(struct ptlrpc_request *req,
+                             int msgsize, int secsize)
+{
+        struct ptlrpc_reply_state *rs;
+        char *buf;
+        int repsize, bufsize;
+        ENTRY;
+
+        LASSERT(msgsize % 8 == 0);
+        LASSERT(secsize % 8 == 0);
+
+        repsize = sizeof(struct ptlrpcs_wire_hdr) + msgsize + secsize;
+        bufsize = repsize + sizeof(struct ptlrpc_reply_state);
+
+        OBD_ALLOC(buf, bufsize);
+        if (!buf) {
+                CERROR("can't alloc %d\n", bufsize);
+                RETURN(-ENOMEM);
+        }
+
+        /* req->rq_repbuf is not used on server side */
+        rs = (struct ptlrpc_reply_state *) (buf + repsize);
+        rs->rs_buf = buf;
+        rs->rs_buf_len = bufsize;
+        rs->rs_repbuf = buf;
+        rs->rs_repbuf_len = repsize;
+        /* the currently known data length is hdr + msg; the security
+         * payload will be added later.
+         */
+        rs->rs_repdata_len = sizeof(struct ptlrpcs_wire_hdr) + msgsize;
+        req->rq_repmsg = rs->rs_msg = (struct lustre_msg *)
+                         (rs->rs_repbuf + sizeof(struct ptlrpcs_wire_hdr));
+
+        req->rq_reply_state = rs;
+
+        CDEBUG(D_SEC, "alloc rs buf at %p, len %d; repbuf at %p, len %d\n",
+               rs->rs_buf, rs->rs_buf_len, rs->rs_repbuf, rs->rs_repbuf_len);
+
+        RETURN(0);
+}
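+
+/*
+ * Layout of the single allocation made above (sketch):
+ *
+ *   | wire_hdr | lustre_msg | sec payload | ptlrpc_reply_state |
+ *
+ * rs_buf/rs_repbuf point at the start of the buffer; the reply_state
+ * struct itself lives at the tail, after the wire data.
+ */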
+
+void svcsec_free_reply_state(struct ptlrpc_reply_state *rs)
+{
+        char *p;
+        ENTRY;
+
+        /* work around memory-alloc debug poisoning */
+        LASSERT(rs);
+        p = rs->rs_buf;
+        OBD_FREE(p, rs->rs_buf_len);
+        EXIT;
+}
+
+int svcsec_alloc_repbuf(struct ptlrpc_svcsec *svcsec,
+                        struct ptlrpc_request *req,
+                        int msgsize)
+{
+        LASSERT(svcsec);
+        LASSERT(msgsize % 8 == 0);
+
+        if (svcsec->alloc_repbuf)
+                return svcsec->alloc_repbuf(svcsec, req, msgsize);
+        else
+                return svcsec_alloc_reply_state(req, msgsize, 0);
+}
+
+int svcsec_accept(struct ptlrpc_request *req, enum ptlrpcs_error *res)
+{
+        struct ptlrpc_svcsec           *sec;
+        struct ptlrpcs_wire_hdr        *sec_hdr;
+        int                             rc;
+        ENTRY;
+
+        LASSERT(req->rq_reqbuf);
+        LASSERT(!req->rq_reqmsg);
+        LASSERT(!req->rq_svcsec);
+
+        *res = PTLRPCS_BADCRED;
+        if (req->rq_reqbuf_len < sizeof(*sec_hdr)) {
+                CERROR("drop too short msg (length: %d)\n", req->rq_reqbuf_len);
+                RETURN(SVC_DROP);
+        }
+
+        sec_hdr = (struct ptlrpcs_wire_hdr *) req->rq_reqbuf;
+        sec_hdr->flavor = le32_to_cpu(sec_hdr->flavor);
+        sec_hdr->sectype = le32_to_cpu(sec_hdr->sectype);
+        sec_hdr->msg_len = le32_to_cpu(sec_hdr->msg_len);
+        sec_hdr->sec_len = le32_to_cpu(sec_hdr->sec_len);
+
+        /* sanity check */
+        switch (sec_hdr->sectype) {
+        case PTLRPC_SEC_TYPE_NONE:
+        case PTLRPC_SEC_TYPE_AUTH:
+        case PTLRPC_SEC_TYPE_PRIV:
+                break;
+        default:
+                CERROR("unknown security type %d\n", sec_hdr->sectype);
+                RETURN(SVC_DROP);
+        }
+
+        if (sizeof(*sec_hdr) + sec_hdr->msg_len + sec_hdr->sec_len >
+            req->rq_reqbuf_len) {
+                CERROR("received %d, msg %d, sec %d\n",
+                        req->rq_reqbuf_len, sec_hdr->msg_len, sec_hdr->sec_len);
+                RETURN(SVC_DROP);
+        }
+
+        req->rq_svcsec = sec = flavor2svcsec(sec_hdr->flavor);
+        if (!sec) {
+                CERROR("drop msg: unsupported flavor %d\n", sec_hdr->flavor);
+                RETURN(SVC_DROP);
+        }
+        LASSERT(sec->accept);
+
+        rc = sec->accept(req, res);
+
+        switch (rc) {
+        case SVC_DROP:
+                svcsec_put(sec);
+                req->rq_svcsec = NULL;
+                break;
+        case SVC_OK:
+        case SVC_LOGIN:
+        case SVC_LOGOUT:
+                LASSERT(req->rq_reqmsg);
+                break;
+        }
+
+        RETURN(rc);
+}
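+
+/*
+ * The return value above follows the svcsec convention used throughout
+ * this patch: SVC_DROP presumably means the caller should discard the
+ * request, while SVC_OK/SVC_LOGIN/SVC_LOGOUT mean the request was
+ * unwrapped successfully and req->rq_reqmsg is valid.
+ */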
+
+int svcsec_authorize(struct ptlrpc_request *req)
+{
+        LASSERT(req->rq_svcsec);
+        LASSERT(req->rq_svcsec->authorize);
+
+        return (req->rq_svcsec->authorize(req));
+}
+
+void svcsec_cleanup_req(struct ptlrpc_request *req)
+{
+        struct ptlrpc_svcsec *svcsec = req->rq_svcsec;
+        ENTRY;
+
+        LASSERT(svcsec);
+        LASSERT(svcsec->cleanup_req || !req->rq_sec_svcdata);
+
+        if (svcsec->cleanup_req)
+                svcsec->cleanup_req(svcsec, req);
+        EXIT;
+}
diff --git a/lustre/sec/svcsec_null.c b/lustre/sec/svcsec_null.c
new file mode 100644 (file)
index 0000000..5e7eed8
--- /dev/null
@@ -0,0 +1,111 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_SEC
+#ifdef __KERNEL__
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#else
+#include <liblustre.h>
+#endif
+
+#include <libcfs/kp30.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_idl.h>
+#include <linux/lustre_net.h>
+#include <linux/lustre_sec.h>
+
+static
+int null_svcsec_accept(struct ptlrpc_request *req, enum ptlrpcs_error *res)
+{
+        struct ptlrpcs_wire_hdr *hdr = buf_to_sec_hdr(req->rq_reqbuf);
+        ENTRY;
+
+        LASSERT(hdr->flavor == PTLRPC_SEC_NULL);
+
+        if (hdr->sec_len != 0) {
+                CERROR("security payload %d not zero\n", hdr->sec_len);
+                *res = PTLRPCS_REJECTEDCRED;
+                RETURN(SVC_DROP);
+        }
+
+        req->rq_reqmsg = (struct lustre_msg *)(hdr + 1);
+        req->rq_reqlen = hdr->msg_len;
+        *res = PTLRPCS_OK;
+        CDEBUG(D_SEC, "req %p: set reqmsg at %p, len %d\n",
+               req, req->rq_reqmsg, req->rq_reqlen);
+        RETURN(SVC_OK);
+}
+
+static
+int null_svcsec_authorize(struct ptlrpc_request *req)
+{
+        struct ptlrpc_reply_state *rs = req->rq_reply_state;
+        struct ptlrpcs_wire_hdr *hdr;
+        ENTRY;
+
+        LASSERT(rs);
+        LASSERT(rs->rs_repbuf_len >= 4 * 4);
+
+        hdr = buf_to_sec_hdr(rs->rs_repbuf);
+        hdr->flavor = cpu_to_le32(PTLRPC_SEC_NULL);
+        hdr->sectype = cpu_to_le32(PTLRPC_SEC_TYPE_AUTH);
+        hdr->msg_len = cpu_to_le32(req->rq_replen);
+        hdr->sec_len = cpu_to_le32(0);
+
+        CDEBUG(D_SEC, "fill in datasize %d\n", rs->rs_repdata_len);
+        RETURN(0);
+}
+
+static struct ptlrpc_svcsec null_svcsec = {
+        .pss_owner      = THIS_MODULE,
+        .pss_name       = "NULL_SVCSEC",
+        .pss_flavor     = {PTLRPC_SEC_NULL, 0},
+        .accept         = null_svcsec_accept,
+        .authorize      = null_svcsec_authorize,
+};
+
+int svcsec_null_init()
+{
+        int rc;
+
+        rc = svcsec_register(&null_svcsec);
+        if (rc)
+                CERROR("failed to register SVCNULL security: %d\n", rc);
+
+        return rc;
+}
+
+int svcsec_null_exit()
+{
+        int rc;
+
+        rc = svcsec_unregister(&null_svcsec);
+        if (rc)
+                CERROR("cannot unregister SVCNULL security: %d\n", rc);
+
+        return rc;
+}
+
diff --git a/lustre/sec/upcall_cache.c b/lustre/sec/upcall_cache.c
new file mode 100644 (file)
index 0000000..49e9522
--- /dev/null
@@ -0,0 +1,414 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_LOV
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+
+#include <linux/obd_support.h>
+#include <linux/lustre_lib.h>
+#include <linux/lustre_idl.h>
+#include <linux/obd_class.h>
+#include <linux/lustre_ucache.h>
+
+/* FIXME
+ * The current ucache implementation is simply taken from the group hash
+ * code, almost without any change. It is very simple and has very limited
+ * functionality, and is probably only suitable for the group hash usage.
+ */
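+
+/*
+ * Rough lifecycle of a cache entry, assuming a userspace helper answers
+ * the upcall:
+ *
+ *   upcall_cache_get_entry()  - allocates a NEW entry, marks it ACQUIRING,
+ *                               calls cache->make_upcall() and waits
+ *   <userspace helper runs>   - resolves the key and writes the result back
+ *   upcall_cache_downcall()   - parses the result, marks the entry VALID
+ *                               and wakes up any waiters
+ *   upcall_cache_put_entry()  - drops the reference when the caller is done
+ */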
+
+void upcall_cache_init_entry(struct upcall_cache *cache,
+                             struct upcall_cache_entry *entry,
+                             __u64 key)
+{
+        UC_CACHE_SET_NEW(entry);
+        INIT_LIST_HEAD(&entry->ue_hash);
+        atomic_set(&entry->ue_refcount, 0);
+        entry->ue_key = key;
+        entry->ue_cache = cache;
+        init_waitqueue_head(&entry->ue_waitq);
+}
+EXPORT_SYMBOL(upcall_cache_init_entry);
+
+static inline struct upcall_cache_entry *
+alloc_entry(struct upcall_cache *cache, __u64 key)
+{
+        LASSERT(cache->alloc_entry);
+        return cache->alloc_entry(cache, key);
+}
+
+static void free_entry(struct upcall_cache_entry *entry)
+{
+        struct upcall_cache *cache = entry->ue_cache;
+
+        LASSERT(cache);
+        LASSERT(cache->free_entry);
+        LASSERT(atomic_read(&entry->ue_refcount) == 0);
+
+        CDEBUG(D_OTHER, "destroy %s entry %p for key "LPU64"\n",
+               cache->uc_name, entry, entry->ue_key);
+
+        list_del(&entry->ue_hash);
+        cache->free_entry(cache, entry);
+}
+
+static inline void get_entry(struct upcall_cache_entry *entry)
+{
+        atomic_inc(&entry->ue_refcount);
+}
+
+static inline void put_entry(struct upcall_cache_entry *entry)
+{
+        if (atomic_dec_and_test(&entry->ue_refcount) &&
+            !UC_CACHE_IS_VALID(entry)) {
+                free_entry(entry);
+        }
+}
+
+static inline int refresh_entry(struct upcall_cache_entry *entry)
+{
+        struct upcall_cache *cache = entry->ue_cache;
+
+        LASSERT(cache);
+        LASSERT(cache->make_upcall);
+
+        return cache->make_upcall(cache, entry);
+}
+
+static int check_unlink_entry(struct upcall_cache_entry *entry)
+{
+        if (UC_CACHE_IS_VALID(entry) &&
+            time_before(get_seconds(), entry->ue_expire))
+                return 0;
+
+        if (UC_CACHE_IS_ACQUIRING(entry) &&
+            time_after(get_seconds(), entry->ue_acquire_expire)) {
+                UC_CACHE_SET_EXPIRED(entry);
+                wake_up_all(&entry->ue_waitq);
+        } else if (!UC_CACHE_IS_INVALID(entry)) {
+                UC_CACHE_SET_EXPIRED(entry);
+        }
+
+        list_del_init(&entry->ue_hash);
+        if (!atomic_read(&entry->ue_refcount))
+                free_entry(entry);
+        return 1;
+}
+
+/* XXX
+ * currently always use write_lock
+ */
+static struct upcall_cache_entry *
+__get_entry(struct upcall_cache *cache, unsigned int hash, __u64 key,
+            int create, int async)
+{
+        struct list_head *head;
+        struct upcall_cache_entry *entry, *next, *new = NULL;
+        int found = 0, rc;
+        ENTRY;
+
+        LASSERT(hash < cache->uc_hashsize);
+
+        head = &cache->uc_hashtable[hash];
+
+find_again:
+        write_lock(&cache->uc_hashlock);
+        list_for_each_entry_safe(entry, next, head, ue_hash) {
+                if (check_unlink_entry(entry))
+                        continue;
+                if (entry->ue_key == key) {
+                        found = 1;
+                        break;
+                }
+        }
+
+        if (!found) {
+                if (!create)
+                        RETURN(NULL);
+                if (!new) {
+                        write_unlock(&cache->uc_hashlock);
+                        new = alloc_entry(cache, key);
+                        if (!new) {
+                                CERROR("fail to alloc entry\n");
+                                RETURN(NULL);
+                        }
+                        goto find_again;
+                } else {
+                        list_add(&new->ue_hash, head);
+                        entry = new;
+                }
+        } else {
+                if (new) {
+                        free_entry(new);
+                        new = NULL;
+                }
+                list_move(&entry->ue_hash, head);
+        }
+        get_entry(entry);
+
+        /* at this point we have found a matching entry and hold a
+         * reference on it. if it's NEW (we created it), we must give
+         * it a push to refresh
+         */
+        if (UC_CACHE_IS_NEW(entry)) {
+                LASSERT(entry == new);
+                UC_CACHE_SET_ACQUIRING(entry);
+                UC_CACHE_CLEAR_NEW(entry);
+                entry->ue_acquire_expire = get_seconds() +
+                                           cache->uc_acquire_expire;
+
+                write_unlock(&cache->uc_hashlock);
+                rc = refresh_entry(entry);
+                write_lock(&cache->uc_hashlock);
+                if (rc) {
+                        UC_CACHE_CLEAR_ACQUIRING(entry);
+                        UC_CACHE_SET_INVALID(entry);
+                }
+        }
+
+        /* caller doesn't want to wait */
+        if (async) {
+                write_unlock(&cache->uc_hashlock);
+                RETURN(entry);
+        }
+
+        /* someone (and only one) is doing the upcall for
+         * this item, just wait for it to complete
+         */
+        if (UC_CACHE_IS_ACQUIRING(entry)) {
+                wait_queue_t wait;
+
+                init_waitqueue_entry(&wait, current);
+                add_wait_queue(&entry->ue_waitq, &wait);
+                set_current_state(TASK_INTERRUPTIBLE);
+                write_unlock(&cache->uc_hashlock);
+
+                schedule_timeout(cache->uc_acquire_expire);
+
+                write_lock(&cache->uc_hashlock);
+                remove_wait_queue(&entry->ue_waitq, &wait);
+                if (UC_CACHE_IS_ACQUIRING(entry)) {
+                        /* we were interrupted or the upcall failed
+                         * in the middle
+                         */
+                        CERROR("entry %p not refreshed: cur %lu, key "LPU64", "
+                               "ref %d fl %u, ac %ld, ex %ld\n",
+                               entry, get_seconds(), entry->ue_key,
+                               atomic_read(&entry->ue_refcount),
+                               entry->ue_flags, entry->ue_acquire_expire,
+                               entry->ue_expire);
+                        put_entry(entry);
+                        write_unlock(&cache->uc_hashlock);
+                        RETURN(NULL);
+                }
+                /* fall through */
+        }
+
+        /* invalid means error, don't need to try again */
+        if (UC_CACHE_IS_INVALID(entry)) {
+                put_entry(entry);
+                write_unlock(&cache->uc_hashlock);
+                RETURN(NULL);
+        }
+
+        /* check expired
+         * We can't refresh the existing one in place because its
+         * memory might be shared by multiple processes.
+         */
+        if (check_unlink_entry(entry)) {
+                /* If expired, try again. But if this entry was
+                 * created by us and expired too quickly without
+                 * any error, we should at least give it a chance
+                 * to be used once.
+                 */
+                if (entry != new) {
+                        put_entry(entry);
+                        write_unlock(&cache->uc_hashlock);
+                        new = NULL;
+                        goto find_again;
+                }
+        }
+        
+        /* Now we know it's good */
+        LASSERT(UC_CACHE_IS_VALID(entry));
+        write_unlock(&cache->uc_hashlock);
+
+        RETURN(entry);
+}
+
+struct upcall_cache_entry *
+upcall_cache_get_entry(struct upcall_cache *cache, __u64 key)
+{
+        unsigned int hash;
+
+        LASSERT(cache->hash);
+
+        hash = cache->hash(cache, key);
+
+        return __get_entry(cache, hash, key, 1, 0);
+}
+EXPORT_SYMBOL(upcall_cache_get_entry);
+
+void upcall_cache_put_entry(struct upcall_cache_entry *entry)
+{
+        struct upcall_cache *cache = entry->ue_cache;
+
+        write_lock(&cache->uc_hashlock);
+        LASSERTF(atomic_read(&entry->ue_refcount) > 0,
+                 "entry %p: ref %d\n", entry, atomic_read(&entry->ue_refcount));
+        put_entry(entry);
+        write_unlock(&cache->uc_hashlock);
+}
+EXPORT_SYMBOL(upcall_cache_put_entry);
+
+int upcall_cache_downcall(struct upcall_cache *cache, __u64 key,
+                          int err, void *args)
+{
+        struct list_head *head;
+        struct upcall_cache_entry *entry;
+        int found = 0, rc;
+        unsigned int hash;
+        ENTRY;
+
+        hash = cache->hash(cache, key);
+        LASSERT(hash < cache->uc_hashsize);
+
+        head = &cache->uc_hashtable[hash];
+
+        write_lock(&cache->uc_hashlock);
+        list_for_each_entry(entry, head, ue_hash) {
+                if (entry->ue_key == key) {
+                        found = 1;
+                        break;
+                }
+        }
+        if (!found) {
+                /* not found, which is possible */
+                write_unlock(&cache->uc_hashlock);
+                CWARN("entry for key "LPU64" not found\n", key);
+                RETURN(-EINVAL);
+        }
+
+        if (err < 0) {
+                UC_CACHE_SET_INVALID(entry);
+                GOTO(out, rc = err);
+        }
+
+        if (!UC_CACHE_IS_ACQUIRING(entry) ||
+            UC_CACHE_IS_INVALID(entry) ||
+            UC_CACHE_IS_EXPIRED(entry)) {
+                CWARN("stale entry %p: cur %lu, key "LPU64", ref %d, "
+                      "fl %u, ac %ld, ex %ld\n",
+                       entry, get_seconds(), entry->ue_key,
+                       atomic_read(&entry->ue_refcount), entry->ue_flags,
+                       entry->ue_acquire_expire, entry->ue_expire);
+                GOTO(out, rc = -EINVAL);
+        }
+
+        atomic_inc(&entry->ue_refcount);
+        write_unlock(&cache->uc_hashlock);
+        rc = cache->parse_downcall(cache, entry, args);
+        write_lock(&cache->uc_hashlock);
+        atomic_dec(&entry->ue_refcount);
+        if (rc) {
+                UC_CACHE_SET_INVALID(entry);
+                list_del_init(&entry->ue_hash);
+                GOTO(out, rc);
+        }
+        entry->ue_expire = get_seconds() + cache->uc_entry_expire;
+        UC_CACHE_SET_VALID(entry);
+        CDEBUG(D_OTHER, "create ucache entry %p(key "LPU64")\n",
+               entry, entry->ue_key);
+out:
+        wake_up_all(&entry->ue_waitq);
+        write_unlock(&cache->uc_hashlock);
+        RETURN(rc);
+}
+EXPORT_SYMBOL(upcall_cache_downcall);
+
+void upcall_cache_flush_one(struct upcall_cache *cache, __u64 key)
+{
+        struct list_head *head;
+        struct upcall_cache_entry *entry;
+        unsigned int hash;
+        int found = 0;
+        ENTRY;
+
+        hash = cache->hash(cache, key);
+        LASSERT(hash < cache->uc_hashsize);
+
+        head = &cache->uc_hashtable[hash];
+
+        write_lock(&cache->uc_hashlock);
+        list_for_each_entry(entry, head, ue_hash) {
+                if (entry->ue_key == key) {
+                        found = 1;
+                        break;
+                }
+        }
+
+        if (found) {
+                UC_CACHE_SET_EXPIRED(entry);
+                if (!atomic_read(&entry->ue_refcount))
+                        free_entry(entry);
+        }
+        write_unlock(&cache->uc_hashlock);
+}
+EXPORT_SYMBOL(upcall_cache_flush_one);
+
+static void cache_flush(struct upcall_cache *cache, int force, int sync)
+{
+        struct upcall_cache_entry *entry, *next;
+        int i;
+        ENTRY;
+
+        write_lock(&cache->uc_hashlock);
+        for (i = 0; i < cache->uc_hashsize; i++) {
+                list_for_each_entry_safe(entry, next,
+                                         &cache->uc_hashtable[i], ue_hash) {
+                        if (!force && atomic_read(&entry->ue_refcount)) {
+                                UC_CACHE_SET_EXPIRED(entry);
+                                continue;
+                        }
+                        LASSERT(!atomic_read(&entry->ue_refcount));
+                        free_entry(entry);
+                }
+        }
+        write_unlock(&cache->uc_hashlock);
+        EXIT;
+}
+
+void upcall_cache_flush_idle(struct upcall_cache *cache)
+{
+        cache_flush(cache, 0, 0);
+}
+
+void upcall_cache_flush_all(struct upcall_cache *cache)
+{
+        cache_flush(cache, 1, 0);
+}
+EXPORT_SYMBOL(upcall_cache_flush_idle);
+EXPORT_SYMBOL(upcall_cache_flush_all);
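The upcall cache added above is the kernel half of the new user-identity (LSD) lookup on the MDS: a cache miss leaves an entry in the ACQUIRING state, and waiters are released only once userspace answers through upcall_cache_downcall(). How the userspace helper gets wired up can be seen in the lconf hunk further down; the sketch below condenses that into a couple of shell lines. The /proc entry and the lsd_upcall helper name are taken from the lconf and utils changes in this patch, while the install path is assumed, so treat this as illustrative rather than part of the patch.

    # register the LSD upcall helper on the MDS (sketch, paths assumed)
    upcall=/usr/sbin/lsd_upcall                 # built from lustre/utils/lsd_upcall.c
    procentry=/proc/fs/lustre/mds/lsd_upcall
    if [ -x "$upcall" -a -w "$procentry" ]; then
        echo "$upcall" > "$procentry"
    fi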
index 3d77ef7..423aab8 100644 (file)
@@ -169,7 +169,6 @@ static struct dentry *smfs_lookup(struct inode *dir, struct dentry *dentry,
                 d_add(dentry, inode);
         
         SMFS_POST_HOOK(dir, HOOK_LOOKUP, &msg, rc);
-exit:        
         post_smfs_dentry(cache_dentry);
         post_smfs_dentry(cache_parent);
         RETURN(ERR_PTR(rc));
@@ -688,7 +687,6 @@ static int smfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
         SMFS_POST_HOOK(dentry->d_inode, HOOK_READDIR, &msg, rc);
         duplicate_file(filp, sfi->c_file);
 
-exit:
         if (rc > 0)
                 rc = 0;
 
diff --git a/lustre/tests/acl_asroot.test b/lustre/tests/acl_asroot.test
new file mode 100644 (file)
index 0000000..af0fed3
--- /dev/null
@@ -0,0 +1,46 @@
+!
+! Test that can only be run as root as it uses mknod.
+!
+$ mkdir asroot
+$ umask 027
+$ mknod asroot/null c 1 3
+$ acl_mode asroot/null
+crw-r-----
+$ setfacl -m u:joe:rw,u:lisa:- asroot/null
+$ acl_mode asroot/null
+crw-rw----+
+$ setfacl -m u:lisa:r asroot/null
+$ getfacl --omit-header asroot/null
+user::rw-
+user:joe:rw-
+user:lisa:r--
+group::r--
+mask::rw-
+other::---
+
+$ su - lisa -c chmod\ +rw\ /mnt/lustre/asroot/null
+chmod: changing permissions of `/mnt/lustre/asroot/null': Operation not permitted
+$ rm -f asroot/null
+$ mkfifo asroot/fifo
+$ acl_mode asroot/fifo
+prw-r-----
+$ setfacl -m u:joe:- asroot/fifo
+$ getfacl --omit-header asroot/fifo
+user::rw-
+user:joe:---
+group::r--
+mask::r--
+other::---
+
+$ rm asroot/fifo
+$ mknod asroot/block b 1 1
+$ setfacl -m u:joe:- asroot/block
+$ getfacl --omit-header asroot/block
+user::rw-
+user:joe:---
+group::r--
+mask::r--
+other::---
+
+$ rm asroot/block
+$ rmdir asroot
diff --git a/lustre/tests/acl_fileutil.test b/lustre/tests/acl_fileutil.test
new file mode 100644 (file)
index 0000000..9760bf4
--- /dev/null
@@ -0,0 +1,66 @@
+!
+! Test for the patched file utilities.
+!
+$ umask 022
+$ mkdir dir
+$ acl_mode dir
+drwxr-xr-x
+$ touch dir/f
+$ getfacl --omit-header dir/f
+user::rw-
+group::r--
+other::r--
+
+$ umask 027
+$ cp -p dir/f dir/g
+$ getfacl --omit-header dir/g
+user::rw-
+group::r--
+other::r--
+
+$ rm dir/g
+$ cp dir/f dir/g
+$ getfacl --omit-header dir/g
+user::rw-
+group::r--
+other::---
+
+$ setfacl -m u::rwx,u:joe:rwx,g::rwx,o::r-x dir/.
+$ setfacl -dm u::rwx,u:joe:rwx,g::rwx,o::r-x dir/.
+$ acl_mode dir
+drwxrwxr-x+
+$ touch dir/h
+$ getfacl --omit-header --no-effective dir/h
+user::rw-
+user:joe:rwx
+group::rwx
+mask::r--
+other::---
+
+$ mkdir dir/d
+$ getfacl --omit-header --no-effective dir/d
+user::rwx
+user:joe:rwx
+group::rwx
+mask::r-x
+other::---
+default:user::rwx
+default:user:joe:rwx
+default:group::rwx
+default:mask::rwx
+default:other::r-x
+
+$ cp dir/f dir/i
+$ getfacl --omit-header --no-effective dir/i
+user::rw-
+user:joe:rwx
+group::rwx
+mask::r--
+other::---
+
+$ acl_mode dir/f
+-rw-r--r--
+$ cp -p dir/f dir/j
+$ acl_mode dir/j
+-rw-r--r--
+$ rm -r dir
diff --git a/lustre/tests/acl_misc.test b/lustre/tests/acl_misc.test
new file mode 100644 (file)
index 0000000..def2929
--- /dev/null
@@ -0,0 +1,386 @@
+!
+! Pretty comprehensive ACL tests.
+!
+! This must be run on a filesystem with ACL support. Also, you will need
+! two dummy users (lisa and joe) and a dummy group (toolies).
+!
+$ umask 027
+$ touch f
+! Only change a base ACL:
+$ setfacl -m u::r f
+$ setfacl -m u::rw,u:lisa:rw f
+$ acl_mode f
+-rw-rw----+
+$ getfacl --omit-header f
+user::rw-
+user:lisa:rw-
+group::r--
+mask::rw-
+other::---
+
+$ rm f
+$ umask 022
+$ touch f
+$ setfacl -m u:lisa:rw f
+$ acl_mode f
+-rw-rw-r--+
+$ getfacl --omit-header f
+user::rw-
+user:lisa:rw-
+group::r--
+mask::rw-
+other::r--
+
+$ rm f
+$ umask 027
+$ mkdir d
+$ setfacl -m u:lisa:rwx d
+$ acl_mode d
+drwxrwx---+
+$ getfacl --omit-header d
+user::rwx
+user:lisa:rwx
+group::r-x
+mask::rwx
+other::---
+
+$ rmdir d
+$ umask 022
+$ mkdir d
+$ setfacl -m u:lisa:rwx d
+$ acl_mode d
+drwxrwxr-x+
+$ getfacl --omit-header d
+user::rwx
+user:lisa:rwx
+group::r-x
+mask::rwx
+other::r-x
+
+$ rmdir d
+!
+! Multiple users
+!
+$ umask 022
+$ touch f
+$ setfacl -m u:lisa:rw,u:joe:r f
+$ acl_mode f
+-rw-rw-r--+
+$ getfacl --omit-header f
+user::rw-
+user:joe:r--
+user:lisa:rw-
+group::r--
+mask::rw-
+other::r--
+
+!
+! Multiple groups
+!
+$ setfacl -m g:users:rw,g:toolies:r f
+$ acl_mode f
+-rw-rw-r--+
+$ getfacl --omit-header f
+user::rw-
+user:joe:r--
+user:lisa:rw-
+group::r--
+group:users:rw-
+group:toolies:r--
+mask::rw-
+other::r--
+
+!
+! Remove one group
+!
+$ setfacl -x g:users f
+$ acl_mode f
+-rw-rw-r--+
+$ getfacl --omit-header f
+user::rw-
+user:joe:r--
+user:lisa:rw-
+group::r--
+group:toolies:r--
+mask::rw-
+other::r--
+
+!
+! Remove one user
+!
+$ setfacl -x u:joe f
+$ acl_mode f
+-rw-rw-r--+
+$ getfacl --omit-header f
+user::rw-
+user:lisa:rw-
+group::r--
+group:toolies:r--
+mask::rw-
+other::r--
+
+$ rm f
+!
+! Default ACL
+!
+$ umask 027
+$ mkdir d
+$ setfacl -m u:lisa:rwx,u:joe:rw,d:u:lisa:rwx,d:m:rx d
+$ acl_mode d
+drwxrwx---+
+$ getfacl --omit-header d
+user::rwx
+user:joe:rw-
+user:lisa:rwx
+group::r-x
+mask::rwx
+other::---
+default:user::rwx
+default:user:lisa:rwx  #effective:r-x
+default:group::r-x
+default:mask::r-x
+default:other::---
+
+!
+! Umask now ignored?
+!
+$ umask 027
+$ touch d/f
+$ acl_mode d/f
+-rw-r-----+
+$ getfacl --omit-header d/f
+user::rw-
+user:lisa:rwx  #effective:r--
+group::r-x     #effective:r--
+mask::r--
+other::---
+
+$ rm d/f
+$ umask 022
+$ touch d/f
+$ acl_mode d/f
+-rw-r-----+
+$ getfacl --omit-header d/f
+user::rw-
+user:lisa:rwx  #effective:r--
+group::r-x     #effective:r--
+mask::r--
+other::---
+
+$ rm d/f
+!
+! Default ACL copying
+!
+$ umask 000
+$ mkdir d/d
+$ acl_mode d/d
+drwxr-x---+
+$ getfacl --omit-header d/d
+user::rwx
+user:lisa:rwx  #effective:r-x
+group::r-x
+mask::r-x
+other::---
+default:user::rwx
+default:user:lisa:rwx  #effective:r-x
+default:group::r-x
+default:mask::r-x
+default:other::---
+
+$ rmdir d/d
+$ umask 022
+$ mkdir d/d
+$ acl_mode d/d
+drwxr-x---+
+$ getfacl --omit-header d/d
+user::rwx
+user:lisa:rwx  #effective:r-x
+group::r-x
+mask::r-x
+other::---
+default:user::rwx
+default:user:lisa:rwx  #effective:r-x
+default:group::r-x
+default:mask::r-x
+default:other::---
+
+!
+! Add some users and groups
+!
+$ setfacl -nm u:joe:rx,d:u:joe:rx,g:users:rx,g:toolies:rwx d/d
+$ acl_mode d/d
+drwxr-x---+
+$ getfacl --omit-header d/d
+user::rwx
+user:joe:r-x
+user:lisa:rwx  #effective:r-x
+group::r-x
+group:users:r-x
+group:toolies:rwx      #effective:r-x
+mask::r-x
+other::---
+default:user::rwx
+default:user:joe:r-x
+default:user:lisa:rwx  #effective:r-x
+default:group::r-x
+default:mask::r-x
+default:other::---
+
+!
+! symlink in directory with default ACL?
+!
+$ ln -s d d/l
+$ acl_mode d/l
+lrwxrwxrwx
+$ acl_mode -L d/l
+drwxr-x---+
+$ getfacl --omit-header d/l
+user::rwx
+user:joe:r-x
+user:lisa:rwx  #effective:r-x
+group::r-x
+group:users:r-x
+group:toolies:rwx      #effective:r-x
+mask::r-x
+other::---
+default:user::rwx
+default:user:joe:r-x
+default:user:lisa:rwx  #effective:r-x
+default:group::r-x
+default:mask::r-x
+default:other::---
+
+$ rm d/l
+!
+! Does mask manipulation work?
+!
+$ setfacl -m g:toolies:rx,u:lisa:rx d/d
+$ acl_mode d/d
+drwxr-x---+
+$ getfacl --omit-header d/d
+user::rwx
+user:joe:r-x
+user:lisa:r-x
+group::r-x
+group:users:r-x
+group:toolies:r-x
+mask::r-x
+other::---
+default:user::rwx
+default:user:joe:r-x
+default:user:lisa:rwx  #effective:r-x
+default:group::r-x
+default:mask::r-x
+default:other::---
+
+$ setfacl -m d:u:lisa:rwx d/d
+$ acl_mode d/d
+drwxr-x---+
+$ getfacl --omit-header d/d
+user::rwx
+user:joe:r-x
+user:lisa:r-x
+group::r-x
+group:users:r-x
+group:toolies:r-x
+mask::r-x
+other::---
+default:user::rwx
+default:user:joe:r-x
+default:user:lisa:rwx
+default:group::r-x
+default:mask::rwx
+default:other::---
+
+$ rmdir d/d
+!
+! Remove the default ACL
+!
+$ setfacl -k d
+$ acl_mode d
+drwxrwx---+
+$ getfacl --omit-header d
+user::rwx
+user:joe:rw-
+user:lisa:rwx
+group::r-x
+mask::rwx
+other::---
+
+!
+! Reset to base entries
+!
+$ setfacl -b d
+$ acl_mode d
+drwxr-x---
+$ getfacl --omit-header d
+user::rwx
+group::r-x
+other::---
+
+!
+! Now, chmod should change the group_obj entry
+!
+$ chmod 775 d
+$ acl_mode d
+drwxrwxr-x
+$ getfacl --omit-header d
+user::rwx
+group::rwx
+other::r-x
+
+$ rmdir d
+$ umask 002
+$ mkdir d
+$ setfacl -m u:joe:rwx,u:lisa:rx,d:u:joe:rwx,d:u:lisa:rx d
+$ acl_mode d
+drwxrwxr-x+
+$ getfacl --omit-header d
+user::rwx
+user:joe:rwx
+user:lisa:r-x
+group::rwx
+mask::rwx
+other::r-x
+default:user::rwx
+default:user:joe:rwx
+default:user:lisa:r-x
+default:group::rwx
+default:mask::rwx
+default:other::r-x
+
+$ chmod 750 d
+$ acl_mode d
+drwxr-x---+
+$ getfacl --omit-header d
+user::rwx
+user:joe:rwx   #effective:r-x
+user:lisa:r-x
+group::rwx     #effective:r-x
+mask::r-x
+other::---
+default:user::rwx
+default:user:joe:rwx
+default:user:lisa:r-x
+default:group::rwx
+default:mask::rwx
+default:other::r-x
+
+$ chmod 750 d
+$ acl_mode d
+drwxr-x---+
+$ getfacl --omit-header d
+user::rwx
+user:joe:rwx   #effective:r-x
+user:lisa:r-x
+group::rwx     #effective:r-x
+mask::r-x
+other::---
+default:user::rwx
+default:user:joe:rwx
+default:user:lisa:r-x
+default:group::rwx
+default:mask::rwx
+default:other::r-x
+
+$ rmdir d
diff --git a/lustre/tests/acl_mode b/lustre/tests/acl_mode
new file mode 100755 (executable)
index 0000000..af4b5eb
--- /dev/null
@@ -0,0 +1,2 @@
+#!/bin/sh
+ls -dl $* | awk -- '!/^total/ { print $1; }'
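acl_mode is only a helper for the .test files: it prints the mode column of ls -dl for each argument, which the tests compare against strings such as -rw-rw----+ above. For example (output illustrative):

    $ ./acl_mode /etc/passwd
    -rw-r--r--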
diff --git a/lustre/tests/acl_perm.test b/lustre/tests/acl_perm.test
new file mode 100644 (file)
index 0000000..0e79724
--- /dev/null
@@ -0,0 +1,18 @@
+!
+! Test whether ACL permissions work
+!
+$ umask 022
+$ mkdir dir
+$ umask 077
+$ touch dir/file
+$ setfacl -m u:joe:rw,u:lisa:- dir/file
+$ su - lisa -c cat\ /mnt/lustre/dir/file
+cat: /mnt/lustre/dir/file: Permission denied
+$ su - joe -c cat\ /mnt/lustre/dir/file
+$ su - joe -c touch\ /mnt/lustre/dir/file
+$ cat dir/file
+$ setfacl -m g:users:-  dir/file
+$ su - nobody -c cat\ /mnt/lustre/dir/file
+cat: /mnt/lustre/dir/file: Permission denied
+$ rm dir/file
+$ rmdir dir
index 246f482..acab312 100644 (file)
@@ -45,10 +45,12 @@ gen_second_config() {
 start_mds() {
        echo "start mds1 service on `facet_active_host mds1`"
        start mds1 --reformat $MDSLCONFARGS  || return 94
+       start_lsvcgssd || return 501
 }
 stop_mds() {
        echo "stop mds1 service on `facet_active_host mds1`"
        stop mds1 $@  || return 97
+       stop_lsvcgssd
 }
 
 start_ost() {
@@ -63,6 +65,7 @@ stop_ost() {
 
 mount_client() {
        local MOUNTPATH=$1
+       start_lgssd || return 502
        echo "mount lustre on ${MOUNTPATH}....."
        zconf_mount `hostname`  $MOUNTPATH  || return 96
 }
@@ -71,11 +74,13 @@ umount_client() {
        local MOUNTPATH=$1
        echo "umount lustre on ${MOUNTPATH}....."
        zconf_umount `hostname`  $MOUNTPATH || return 97
+       stop_lgssd
 }
 
 manual_umount_client(){
        echo "manual umount lustre on ${MOUNTPATH}...."
        do_facet  client "umount $MOUNT"
+       stop_lgssd
 }
 
 setup() {
@@ -115,6 +120,7 @@ build_test_filter
 #create single point mountpoint
 
 gen_config
+start_krb5_kdc || exit 1
 
 
 test_0() {
@@ -189,6 +195,7 @@ test_5() {
        kill -TERM $UMOUNT_PID
        echo "waiting for umount to finish"
        wait $UMOUNT_PID
+       stop_lgssd
 
        # cleanup client modules
        $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null
@@ -209,10 +216,12 @@ test_5b() {
 
        [ -d $MOUNT ] || mkdir -p $MOUNT
        $LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null
+       start_lgssd || return 1
        llmount $mds_HOST://mds1_svc/client_facet $MOUNT  && exit 1
 
        # cleanup client modules
        $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null
+       stop_lgssd
        
        # stop_mds is a no-op here, and should not fail
        stop_mds || return 2
@@ -230,15 +239,17 @@ test_5c() {
 
        [ -d $MOUNT ] || mkdir -p $MOUNT
        $LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null
-        llmount $mds_HOST://wrong_mds1_svc/client_facet $MOUNT  && return 1
+       start_lgssd || return 1
+        llmount $mds_HOST://wrong_mds1_svc/client_facet $MOUNT  && return 2
 
        # cleanup client modules
        $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null
+       stop_lgssd
        
-       stop_mds || return 2
-       stop_ost || return 3
+       stop_mds || return 3
+       stop_ost || return 4
 
-       lsmod | grep -q portals && return 4
+       lsmod | grep -q portals && return 5
        return 0
 
 }
@@ -251,11 +262,13 @@ test_5d() {
 
        [ -d $MOUNT ] || mkdir -p $MOUNT
        $LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null
+       start_lgssd || return 1
        llmount $mds_HOST://mds1_svc/client_facet $MOUNT  || return 1
 
        umount $MOUNT || return 2
        # cleanup client modules
        $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null
+       stop_lgssd
 
        stop_mds || return 3
 
diff --git a/lustre/tests/gns-upcall.sh b/lustre/tests/gns-upcall.sh
new file mode 100755 (executable)
index 0000000..ed4c6ca
--- /dev/null
@@ -0,0 +1,13 @@
+#!/bin/sh
+
+MOUNT=`which mount 2>/dev/null`
+test "x$MOUNT" = "x" && MOUNT="/bin/mount"
+
+OPTIONS=$1
+MNTPATH=$2
+
+test "x$OPTIONS" = "x" || "x$MNTPATH" = "x" && 
+    exit 1
+
+$MOUNT $OPTIONS $MNTPATH > /tmp/gns-log 2>&1
+exit $?
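The GNS upcall is handed two arguments, the mount options and the mount point, and simply runs mount with them, logging to /tmp/gns-log. A client registers the script through procfs; the sketch below mirrors what check_gns() in sanity-gns.sh (added later in this change) does, with the llite instance name fs0 and the script location assumed:

    echo /usr/lib/lustre/tests/gns-upcall.sh > /proc/fs/lustre/llite/fs0/gns_upcall
    cat /proc/fs/lustre/llite/fs0/gns_upcall     # confirm the registered path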
index 0c3dc9a..1b5a28c 100755 (executable)
@@ -143,11 +143,14 @@ gen_config() {
 setup() {
     gen_config
 
+    start_krb5_kdc || exit 1
     rm -rf logs/*
     for i in `seq $NUMOST`; do
        wait_for ost$i
        start ost$i ${REFORMAT} $OSTLCONFARGS 
     done
+    start_lsvcgssd || exit 2
+    start_lgssd || exit 3
     [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
     for mds in `mds_list`; do
        wait_for $mds
@@ -164,6 +167,8 @@ cleanup() {
     for mds in `mds_list`; do
        stop $mds ${FORCE} $MDSLCONFARGS || :
     done
+    stop_lgssd
+    stop_lsvcgssd
     for i in `seq $NUMOST`; do
        stop ost$i ${REFORMAT} ${FORCE} $OSTLCONFARGS  || :
     done
diff --git a/lustre/tests/krb5_env.sh b/lustre/tests/krb5_env.sh
new file mode 100755 (executable)
index 0000000..07e9f8e
--- /dev/null
@@ -0,0 +1,101 @@
+#!/bin/sh
+
+#
+# The KDC could be on a remote host, but we assume lgssd/lsvcgssd
+# run only locally.
+#
+
+export KDCHOST=${KDCHOST:-"localhost"}
+export KDCDIR=${KDCDIR:-"/usr/kerberos/sbin"}
+export KRB5DIR=${KRB5DIR:-"/usr/kerberos"}
+export LGSSD=${LGSSD:-"/sbin/lgssd"}
+export SVCGSSD=${SVCGSSD:-"/sbin/lsvcgssd"}
+export PDSH=${PDSH:-"ssh"}
+
+using_krb5_sec() {
+    if [ "x$1" != "xkrb5i" -a "x$1" != "xkrb5p" ]; then
+        echo "n"
+    else
+        echo "y"
+    fi
+}
+
+start_krb5_kdc() {
+    if [ `using_krb5_sec $SECURITY` == 'n' ] ; then
+        return 0
+    fi
+
+    num=`$PDSH $KDCHOST "PATH=\$PATH:$KDCDIR; ps ax | grep krb5kdc | grep -v "grep" | wc -l"`
+    if [ $num -eq 1 ]; then
+        return 0
+    fi
+
+    $PDSH $KDCHOST "PATH=\$PATH:$KDCDIR; krb5kdc"
+    num=`$PDSH $KDCHOST "PATH=\$PATH:$KDCDIR; ps ax | grep krb5kdc | grep -v "grep" | wc -l"`
+    if [ $num -ne 1 ]; then
+        echo "fail to start krb5 KDC, check env KDCHOST and KDCDIR"
+        return 1
+    fi
+    return 0
+}
+
+prepare_krb5_cache() {
+    if [ `using_krb5_sec $SECURITY` == 'n' ] ; then
+        return 0
+    fi
+
+    $KRB5DIR/bin/klist -5 -s
+    invalid=$?
+    if [ $invalid -eq 0 ]; then
+        return 0
+    fi
+
+    echo "***** refresh Kerberos V5 TGT for uid $UID *****"
+    $KRB5DIR/bin/kinit
+    ret=$?
+    return $ret
+}
+
+start_lsvcgssd() {
+    if [ `using_krb5_sec $SECURITY` == 'n' ] ; then
+        return 0
+    fi
+
+    killall -q -9 lsvcgssd || true
+
+    $SVCGSSD
+    num=`ps -o cmd -C "lsvcgssd" | grep lsvcgssd | wc -l`
+    if [ $num -ne 1 ]; then
+        echo "failed to start lsvcgssd"
+        return 1
+    fi
+    return 0
+}
+
+stop_lsvcgssd() {
+    killall -q -9 lsvcgssd || true
+    return 0
+}
+
+start_lgssd() {
+    if [ `using_krb5_sec $SECURITY` == 'n' ] ; then
+        return 0
+    fi
+
+    prepare_krb5_cache || exit 1
+
+    killall -q -9 lgssd || true
+
+    $LGSSD
+    num=`ps -o cmd -C "lgssd" | grep lgssd | wc -l`
+    if [ $num -ne 1 ]; then
+        echo "failed to start lgssd $num"
+        return 1
+    fi
+    return 0
+}
+
+stop_lgssd() {
+    killall -q -9 lgssd || true
+    return 0
+}
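The intended calling sequence is that a test script sources krb5_env.sh and brackets its setup and teardown with these helpers, exactly as the llmount.sh and llmountcleanup.sh changes below do. A condensed sketch, with SECURITY and the configuration step left as placeholders:

    . krb5_env.sh
    SECURITY=${SECURITY:-krb5i}
    start_krb5_kdc  || exit 1
    start_lsvcgssd  || exit 2       # server-side gss daemon
    start_lgssd     || exit 3       # client-side gss daemon (needs a TGT)
    # ... lconf --sec $SECURITY / mount -o sec=$SECURITY goes here ...
    stop_lgssd
    stop_lsvcgssd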
diff --git a/lustre/tests/krb5_refresh_cache.sh b/lustre/tests/krb5_refresh_cache.sh
new file mode 100755 (executable)
index 0000000..b356306
--- /dev/null
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+KRB5DIR=${KRB5DIR:-"/usr/kerberos"}
+
+$KRB5DIR/bin/klist -5 -s
+invalid=$?
+
+if [ $invalid -eq 0 ]; then
+    exit 0
+fi
+
+echo "***** refresh Kerberos V5 TGT for uid $UID *****"
+$KRB5DIR/bin/kinit
+ret=$?
+exit $ret
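This is the standalone counterpart of prepare_krb5_cache(); the sec-related test scripts below run it under runas so that the unprivileged test UID also holds a valid TGT, roughly:

    runas -u 500 ./krb5_refresh_cache.sh || exit 2    # UID 500 matches the RUNAS_ID default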
index 5a8c205..17ce6f5 100755 (executable)
@@ -6,10 +6,14 @@ export PATH=`dirname $0`/../utils:$PATH
 LCONF=${LCONF:-lconf}
 NAME=${NAME:-local}
 LLMOUNT=${LLMOUNT:-llmount}
+SECURITY=${SECURITY:-"null"}
 
 config=$NAME.xml
 mkconfig=$NAME.sh
 
+. krb5_env.sh
+start_krb5_kdc || exit 1
+
 if [ "$PORTALS" ]; then
     portals_opt="--portals=$PORTALS"
 fi
@@ -21,16 +25,22 @@ fi
 if [ "$LDAPURL" ]; then
     conf_opt="--ldapurl $LDAPURL --config $NAME"
 else
-    sh $mkconfig $config || exit 1
+    sh $mkconfig $config || exit 2
     conf_opt="$config"
 fi    
 
 [ "$NODE" ] && node_opt="--node $NODE"
 
-${LCONF} $NOMOD $portals_opt $lustre_opt $node_opt ${REFORMAT:---reformat} $@ \
-       $conf_opt  || exit 2
+# lsvcgssd had better be started after the gss modules are loaded.
+# Remove this once we no longer depend on lsvcgssd.
+${LCONF} --nosetup --sec $SECURITY $portals_opt $node_opt $@ $conf_opt || exit 3
+start_lsvcgssd || exit 4
+start_lgssd || exit 5
+
+${LCONF} $NOMOD --sec $SECURITY $portals_opt $lustre_opt $node_opt \
+         ${REFORMAT:---reformat} $@ $conf_opt  || exit 6
 
 if [ "$MOUNT2" ]; then
-       $LLMOUNT -v `hostname`:/mds1/client $MOUNT2 || exit 3
+       $LLMOUNT -v -o sec=$SECURITY `hostname`:/mds1/client $MOUNT2 || exit 7
 fi
 
index 05ac8a4..ea054ee 100755 (executable)
@@ -9,6 +9,8 @@ TMP=${TMP:-/tmp}
 config=$NAME.xml
 mkconfig=$NAME.sh
 
+. krb5_env.sh
+
 if [ "$PORTALS" ]; then
   portals_opt="--portals=$PORTALS"
 fi
@@ -36,6 +38,9 @@ ${LCONF} $NOMOD $portals_opt $lustre_opt $node_opt --cleanup $@ \
     --dump $TMP/debug $conf_opt
 rc=$?
 echo "lconf DONE"
+stop_lsvcgssd
+stop_lgssd
+
 BUSY=`dmesg | grep -i destruct`
 if [ "$BUSY" ]; then
        echo "$BUSY" 1>&2
index 9a5cbfa..3de7fcf 100755 (executable)
@@ -5,10 +5,15 @@ export PATH=`dirname $0`/../utils:$PATH
 LCONF=${LCONF:-lconf}
 NAME=${NAME:-local}
 LLMOUNT=${LLMOUNT:-llmount}
+SECURITY=${SECURITY:-"null"}
 
 config=$NAME.xml
 mkconfig=$NAME.sh
 
+. krb5_env.sh
+
+start_krb5_kdc || exit 1
+
 if [ "$PORTALS" ]; then
   portals_opt="--portals=$PORTALS"
 fi
@@ -21,16 +26,23 @@ if [ "$LDAPURL" ]; then
     conf_opt="--ldapurl $LDAPURL --config $NAME"
 else
     if [ ! -f $config -o $mkconfig -nt $config ]; then
-       sh $mkconfig $config || exit 1
+       sh $mkconfig $config || exit 2
     fi
     conf_opt="$config"
 fi    
 
 [ "$NODE" ] && node_opt="--node $NODE"
 
-${LCONF} $NOMOD $portals_opt $lustre_opt $node_opt $@ $conf_opt || exit 2
+# lsvcgssd had better be started after the gss modules are loaded.
+# Remove this once we no longer depend on lsvcgssd.
+${LCONF} --nosetup --sec $SECURITY $portals_opt $node_opt $@ $conf_opt || exit 3
+start_lsvcgssd || exit 4
+start_lgssd || exit 5
+
+${LCONF} $NOMOD --sec $SECURITY $portals_opt $lustre_opt $node_opt \
+         $@ $conf_opt  || exit 6
 
 if [ "$MOUNT2" ]; then
-       $LLMOUNT -v `hostname`:/mds1/client $MOUNT2 || exit 3
+       $LLMOUNT -v -o sec=$SECURITY `hostname`:/mds1/client $MOUNT2 || exit 7
 fi
 
index f2ebf52..b2270f4 100755 (executable)
@@ -34,7 +34,7 @@ rm -f $config
 
 # create nodes
 ${LMC} -m $config --add node --node localhost || exit 10
-${LMC} -m $config --add net --node localhost --nid localhost --nettype tcp || exit 11
+${LMC} -m $config --add net --node localhost --nid `hostname` --nettype tcp || exit 11
 
 # configure mds server
 ${LMC} -m $config --add lmv --lmv lmv1 || exit 12
index 3007a0b..6e9d31d 100755 (executable)
@@ -50,13 +50,15 @@ gen_config() {
 
 setup() {
     gen_config
+    start_krb5_kdc || exit 1
     start ost --reformat $OSTLCONFARGS 
     start ost2 --reformat $OSTLCONFARGS 
+    start_lsvcgssd || exit 2
+    start_lgssd || exit 3
     [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
     for mds in `mds_list`; do
        start $mds --reformat $MDSLCONFARGS
     done
-
     grep " $MOUNT " /proc/mounts || zconf_mount `hostname`  $MOUNT
 }
 
@@ -65,6 +67,8 @@ cleanup() {
     for mds in `mds_list`; do
        stop $mds ${FORCE} $MDSLCONFARGS
     done
+    stop_lgssd
+    stop_lsvcgssd
     stop ost2 ${FORCE} --dump cleanup.log
     stop ost ${FORCE} --dump cleanup.log
 }
index e6e06e1..8848b78 100755 (executable)
@@ -54,6 +54,8 @@ cleanup() {
     for mds in `mds_list`; do
        stop $mds ${FORCE} $MDSLCONFARGS
     done
+    stop_lgssd
+    stop_lsvcgssd
     stop ost2 ${FORCE}
     stop ost ${FORCE}  --dump cleanup-dual.log
 }
@@ -66,6 +68,8 @@ fi
 
 setup() {
     gen_config
+
+    start_krb5_kdc || exit 1
     start ost --reformat $OSTLCONFARGS 
     PINGER=`cat /proc/fs/lustre/pinger`
 
@@ -76,6 +80,8 @@ setup() {
     fi
 
     start ost2 --reformat $OSTLCONFARGS 
+    start_lsvcgssd || exit 2
+    start_lgssd || exit 3
     [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
     for mds in `mds_list`; do
        start $mds --reformat $MDSLCONFARGS
index 1f3e2d6..ef0e09c 100755 (executable)
@@ -20,6 +20,11 @@ assert_env MDSCOUNT
 # Skip these tests
 ALWAYS_EXCEPT=""
 
+if [ `using_krb5_sec $SECURITY` == 'n' ] ; then
+    ALWAYS_EXCEPT="0c $ALWAYS_EXCEPT"
+fi
+
+
 gen_config() {
     rm -f $XMLCONFIG
 
@@ -60,6 +65,8 @@ cleanup() {
     for mds in `mds_list`; do
        stop $mds ${FORCE} $MDSLCONFARGS
     done
+    stop_lgssd
+    stop_lsvcgssd
     stop ost2 ${FORCE} --dump cleanup.log
     stop ost ${FORCE} --dump cleanup.log
 }
@@ -76,8 +83,11 @@ CLEANUP=${CLEANUP:-"cleanup"}
 setup() {
     gen_config
 
+    start_krb5_kdc || exit 1
     start ost --reformat $OSTLCONFARGS 
     start ost2 --reformat $OSTLCONFARGS 
+    start_lsvcgssd || exit 2
+    start_lgssd || exit 3
     [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
     for mds in `mds_list`; do
        start $mds --reformat $MDSLCONFARGS
@@ -108,6 +118,17 @@ test_0b() {
 }
 run_test 0b "ensure object created after recover exists. (3284)"
 
+test_0c() {
+    # drop gss error notification
+    replay_barrier mds1
+    fail_drop mds1 0x760
+
+    # drop gss init request
+    replay_barrier mds1
+    fail_drop mds1 0x780
+}
+run_test 0c "empty replay with gss init failures"
+
 test_1() {
     replay_barrier mds1
     mcreate $DIR/$tfile
diff --git a/lustre/tests/runacltest b/lustre/tests/runacltest
new file mode 100755 (executable)
index 0000000..1b7287a
--- /dev/null
@@ -0,0 +1,160 @@
+#!/usr/bin/perl
+
+use strict;
+use FileHandle;
+use POSIX qw(geteuid getegid isatty);
+
+my $owner = getpwuid(geteuid());
+my $group = getgrgid(getegid());
+
+my ($OK, $FAILED) = ("ok", "failed");
+if (isatty(fileno(STDOUT))) {
+       $OK = "\033[32m" . $OK . "\033[m";
+       $FAILED = "\033[31m\033[1m" . $FAILED . "\033[m";
+}
+
+my ($prog, $in, $out) = ([], [], []);
+my $line = 0;
+my $prog_line;
+my ($tests, $failed);
+
+for (;;) {
+  my $script = <>; $line++;
+  $script =~ s/\@OWNER\@/$owner/g;
+  $script =~ s/\@GROUP\@/$group/g;
+  next if (defined($script) && $script =~ /^!/);
+  if (!defined($script) || $script =~ s/^\$ ?//) {
+    if (@$prog) {
+       #print "[$prog_line] \$ ", join(' ', @$prog), " -- ";
+       my $p = [ @$prog ];
+       print "[$prog_line] \$ ", join(' ',
+             map { s/\s/\\$&/g; $_ } @$p), " -- ";
+       my $result = exec_test($prog, $in);
+       my $good = 1;
+       my $nmax = (@$out > @$result) ? @$out : @$result;
+       for (my $n=0; $n < $nmax; $n++) {
+        if (!defined($out->[$n]) || !defined($result->[$n]) ||
+            $out->[$n] ne $result->[$n]) {
+                $good = 0;
+                #chomp $out->[$n];
+                #chomp $result->[$n];
+                #print "$out->[$n] != $result->[$n]";
+        }
+       }
+       $tests++;
+       $failed++ unless $good;
+       print $good ? $OK : $FAILED, "\n";
+       if (!$good) {
+         for (my $n=0; $n < $nmax; $n++) {
+          my $l = defined($out->[$n]) ? $out->[$n] : "~";
+          chomp $l;
+          my $r = defined($result->[$n]) ? $result->[$n] : "~";
+          chomp $r;
+          print sprintf("%-37s | %-39s\n", $l, $r);
+         }
+       }
+    }
+    #$prog = [ split /\s+/, $script ] if $script;
+    $prog = [ map { s/\\(.)/$1/g; $_ } split /(?<!\\)\s+/, $script ] if $script;
+    $prog_line = $line;
+    $in = [];
+    $out = [];
+  } elsif ($script =~ s/^> ?//) {
+    push @$in, $script;
+  } else {
+    push @$out, $script;
+  }
+  last unless defined($script);
+}
+my $status = sprintf("%d commands (%d passed, %d failed)",
+       $tests, $tests-$failed, $failed);
+if (isatty(fileno(STDOUT))) {
+       if ($failed) {
+               $status = "\033[31m\033[1m" . $status . "\033[m";
+       } else {
+               $status = "\033[32m" . $status . "\033[m";
+       }
+}
+print $status, "\n";
+exit $failed ? 1 : 0;
+
+sub exec_test($$) {
+  my ($prog, $in) = @_;
+  local (*IN, *IN_DUP, *IN2, *OUT_DUP, *OUT, *OUT2);
+
+  if ($prog->[0] eq "umask") {
+    umask oct $prog->[1];
+    return [];
+  } elsif ($prog->[0] eq "cd") {
+    if (!chdir $prog->[1]) {
+      return [ "chdir: $prog->[1]: $!\n" ];
+    }
+    return [];
+  }
+
+  pipe *IN2, *OUT
+    or die "Can't create pipe for reading: $!";
+  open *IN_DUP, "<&STDIN"
+    or *IN_DUP = undef;
+  open *STDIN, "<&IN2"
+    or die "Can't duplicate pipe for reading: $!";
+  close *IN2;
+
+  open *OUT_DUP, ">&STDOUT"
+    or die "Can't duplicate STDOUT: $!";
+  pipe *IN, *OUT2
+    or die "Can't create pipe for writing: $!";
+  open *STDOUT, ">&OUT2"
+    or die "Can't duplicate pipe for writing: $!";
+  close *OUT2;
+
+  *STDOUT->autoflush();
+  *OUT->autoflush();
+
+  if (fork()) {
+    # Server
+    if (*IN_DUP) {
+      open *STDIN, "<&IN_DUP"
+        or die "Can't duplicate STDIN: $!";
+      close *IN_DUP
+        or die "Can't close STDIN duplicate: $!";
+    }
+    open *STDOUT, ">&OUT_DUP"
+      or die "Can't duplicate STDOUT: $!";
+    close *OUT_DUP
+      or die "Can't close STDOUT duplicate: $!";
+
+    foreach my $line (@$in) {
+      #print "> $line";
+      print OUT $line;
+    }
+    close *OUT
+      or die "Can't close pipe for writing: $!";
+
+    my $result = [];
+    while (<IN>) {
+      #print "< $_";
+      push @$result, $_;
+    }
+    return $result;
+  } else {
+    # Client
+    close IN
+      or die "Can't close read end for input pipe: $!";
+    close OUT
+      or die "Can't close write end for output pipe: $!";
+    close OUT_DUP
+      or die "Can't close STDOUT duplicate: $!";
+    local *ERR_DUP;
+    open ERR_DUP, ">&STDERR"
+      or die "Can't duplicate STDERR: $!";
+    open STDERR, ">&STDOUT"
+      or die "Can't join STDOUT and STDERR: $!";
+
+    #print ERR_DUP "<", join(' ', @$prog), ">\n";
+    exec @$prog;
+    print ERR_DUP $prog->[0], ": $!\n";
+    exit;
+  }
+}
+
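runacltest reads a small test language rather than plain shell: lines starting with "$ " are commands, "> " lines are fed to the command's stdin, "!" lines are comments, any other lines up to the next command are the expected output, and @OWNER@/@GROUP@ are replaced with the invoking user and group. A minimal fragment in that format (expected output illustrative):

    ! create a file and check its mode
    $ umask 022
    $ touch x
    $ acl_mode x
    -rw-r--r--
    $ rm x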
diff --git a/lustre/tests/sanity-gns.sh b/lustre/tests/sanity-gns.sh
new file mode 100644 (file)
index 0000000..74e5657
--- /dev/null
@@ -0,0 +1,387 @@
+#!/bin/bash
+#
+# Run select tests by setting ONLY, or as arguments to the script.
+# Skip specific tests by setting EXCEPT.
+#
+# e.g. ONLY="22 23" or ONLY="`seq 32 39`" or EXCEPT="31"
+set -e
+
+ONLY=${ONLY:-"$*"}
+ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-""}
+[ "$ALWAYS_EXCEPT$EXCEPT" ] && echo "Skipping tests: $ALWAYS_EXCEPT $EXCEPT"
+
+SRCDIR=`dirname $0`
+export PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH
+
+TMP=${TMP:-/tmp}
+FSTYPE=${FSTYPE:-ext3}
+
+CHECKSTAT=${CHECKSTAT:-"checkstat -v"}
+CREATETEST=${CREATETEST:-createtest}
+LFS=${LFS:-lfs}
+LSTRIPE=${LSTRIPE:-"$LFS setstripe"}
+LFIND=${LFIND:-"$LFS find"}
+LVERIFY=${LVERIFY:-ll_dirstripe_verify}
+LCTL=${LCTL:-lctl}
+MCREATE=${MCREATE:-mcreate}
+OPENFILE=${OPENFILE:-openfile}
+OPENUNLINK=${OPENUNLINK:-openunlink}
+TOEXCL=${TOEXCL:-toexcl}
+TRUNCATE=${TRUNCATE:-truncate}
+MUNLINK=${MUNLINK:-munlink}
+SOCKETSERVER=${SOCKETSERVER:-socketserver}
+SOCKETCLIENT=${SOCKETCLIENT:-socketclient}
+IOPENTEST1=${IOPENTEST1:-iopentest1}
+IOPENTEST2=${IOPENTEST2:-iopentest2}
+PTLDEBUG=${PTLDEBUG:-0}
+
+if [ $UID -ne 0 ]; then
+       RUNAS_ID="$UID"
+       RUNAS=""
+else
+       RUNAS_ID=${RUNAS_ID:-500}
+       RUNAS=${RUNAS:-"runas -u $RUNAS_ID"}
+fi
+
+export NAME=${NAME:-local}
+
+SAVE_PWD=$PWD
+
+clean() {
+       echo -n "cln.."
+       sh llmountcleanup.sh > /dev/null || exit 20
+       I_MOUNTED=no
+}
+CLEAN=${CLEAN:-clean}
+
+start() {
+       echo -n "mnt.."
+       sh llrmount.sh > /dev/null || exit 10
+       I_MOUNTED=yes
+       echo "done"
+}
+START=${START:-start}
+
+log() {
+       echo "$*"
+       lctl mark "$*" 2> /dev/null || true
+}
+
+trace() {
+       log "STARTING: $*"
+       strace -o $TMP/$1.strace -ttt $*
+       RC=$?
+       log "FINISHED: $*: rc $RC"
+       return 1
+}
+TRACE=${TRACE:-""}
+
+check_kernel_version() {
+       VERSION_FILE=/proc/fs/lustre/kernel_version
+       WANT_VER=$1
+       [ ! -f $VERSION_FILE ] && echo "can't find kernel version" && return 1
+       GOT_VER=`cat $VERSION_FILE`
+       [ $GOT_VER -ge $WANT_VER ] && return 0
+       log "test needs at least kernel version $WANT_VER, running $GOT_VER"
+       return 1
+}
+
+run_one() {
+       if ! mount | grep -q $DIR; then
+               $START
+       fi
+       echo $PTLDEBUG >/proc/sys/portals/debug 
+       log "== test $1: $2"
+       export TESTNAME=test_$1
+       test_$1 || error "test_$1: exit with rc=$?"
+       unset TESTNAME
+       pass
+       cd $SAVE_PWD
+       $CLEAN
+}
+
+build_test_filter() {
+        for O in $ONLY; do
+            eval ONLY_${O}=true
+        done
+        for E in $EXCEPT $ALWAYS_EXCEPT; do
+            eval EXCEPT_${E}=true
+        done
+}
+
+_basetest() {
+    echo $*
+}
+
+basetest() {
+    IFS=abcdefghijklmnopqrstuvwxyz _basetest $1
+}
+
+run_test() {
+         base=`basetest $1`
+         if [ "$ONLY" ]; then
+                 testname=ONLY_$1
+                 if [ ${!testname}x != x ]; then
+                       run_one $1 "$2"
+                       return $?
+                 fi
+                 testname=ONLY_$base
+                 if [ ${!testname}x != x ]; then
+                         run_one $1 "$2"
+                         return $?
+                 fi
+                 echo -n "."
+                 return 0
+       fi
+        testname=EXCEPT_$1
+        if [ ${!testname}x != x ]; then
+                 echo "skipping excluded test $1"
+                 return 0
+        fi
+        testname=EXCEPT_$base
+        if [ ${!testname}x != x ]; then
+                 echo "skipping excluded test $1 (base $base)"
+                 return 0
+        fi
+        run_one $1 "$2"
+       return $?
+}
+
+[ "$SANITYLOG" ] && rm -f $SANITYLOG || true
+
+error() { 
+       log "FAIL: $@"
+       if [ "$SANITYLOG" ]; then
+               echo "FAIL: $TESTNAME $@" >> $SANITYLOG
+       else
+               exit 1
+       fi
+}
+
+pass() { 
+       echo PASS
+}
+
+MOUNT="`mount | awk '/^'$NAME' .* lustre_lite / { print $3 }'`"
+if [ -z "$MOUNT" ]; then
+       sh llmount.sh
+       MOUNT="`mount | awk '/^'$NAME' .* lustre_lite / { print $3 }'`"
+       [ -z "$MOUNT" ] && error "NAME=$NAME not mounted"
+       I_MOUNTED=yes
+fi
+
+[ `echo $MOUNT | wc -w` -gt 1 ] && error "NAME=$NAME mounted more than once"
+
+DIR=${DIR:-$MOUNT}
+[ -z "`echo $DIR | grep $MOUNT`" ] && echo "$DIR not in $MOUNT" && exit 99
+
+rm -rf $DIR/[Rdfs][1-9]*
+build_test_filter
+
+echo preparing for tests involving mounts
+EXT2_DEV=${EXT2_DEV:-/tmp/SANITY.LOOP}
+touch $EXT2_DEV
+mke2fs -j -F $EXT2_DEV 8000 >/dev/null 2>&1
+
+find_free_loop() {
+    local LOOP_DEV=""
+    test -b /dev/loop0 && 
+       base="/dev/loop" || base="/dev/loop/"
+
+    for ((i=0;i<256;i++)); do
+       test -b $base$i || continue
+       
+       losetup $base$i >/dev/null 2>&1 || {
+           LOOP_DEV="$base$i"
+           break
+       }
+    done
+    echo $LOOP_DEV
+}
+
+cleanup_loop() {
+    local LOOP_DEV=$1
+    local LOOP_FILE=$2
+    local LOOP_MNTPT=$3
+    
+    chmod u-s $LOOP_MNTPT >/dev/null 2>&1
+    umount $LOOP_MNTPT >/dev/null 2>&1
+    losetup -d $LOOP_DEV >/dev/null 2>&1
+    rm -fr $LOOP_FILE >/dev/null 2>&1
+    rm -fr $LOOP_MNTPT >/dev/null 2>&1
+}
+
+setup_loop() {
+    local LOOP_DEV=$1
+    local LOOP_FILE=$2
+    
+    dd if=/dev/zero of=$LOOP_FILE bs=1M count=10 2>/dev/null || return $?
+
+    losetup $LOOP_DEV $LOOP_FILE || {
+       rc=$?
+	cleanup_loop $LOOP_DEV $LOOP_FILE $DIR/gns_test_1a
+       return $rc
+    }
+    
+    mke2fs -F $LOOP_DEV >/dev/null 2>&1 || {
+       rc=$?
+	cleanup_loop $LOOP_DEV $LOOP_FILE $DIR/gns_test_1a
+	echo "cannot create test ext2 fs on $LOOP_DEV"
+	return $rc
+    }
+    return 0
+}
+
+prep_upcall() {
+    local INJECTION=""
+    local UPCALL=$1
+    local MODE=$2
+    local LOG=$3
+
+    test "x$MODE" = "xDEADLOCK" &&
+    INJECTION="touch \$MNTPATH/file"
+    
+    cat > $UPCALL <<- EOF
+#!/bin/sh
+
+MOUNT=\`which mount 2>/dev/null\`
+test "x\$MOUNT" = "x" && MOUNT="/bin/mount"
+
+OPTIONS=\$1
+MNTPATH=\$2
+
+test "x\$OPTIONS" = "x" || "x\$MNTPATH" = "x" &&
+exit 1
+
+$INJECTION
+\$MOUNT \$OPTIONS \$MNTPATH > $LOG 2>&1
+exit \$?
+EOF
+    chmod +x $UPCALL
+    return $?
+}
+
+check_gns() {
+    local LOG="/tmp/gns-log"
+    local UPCALL_PATH=""
+    
+    local UPCALL=$1
+    local OBJECT=$2
+    local TIMOUT=$3
+    local TICK=$4
+    
+    rm -fr $LOG >/dev/null 2>&1
+    UPCALL_PATH="/tmp/gns-upcall-$UPCALL.sh"
+    
+    echo "generating upcall $UPCALL_PATH"
+    prep_upcall $UPCALL_PATH $UPCALL $LOG || return $?
+    echo "======================== upcall script ==========================="
+    cat $UPCALL_PATH 2>/dev/null || return $?
+    echo "=================================================================="
+   
+    echo "$UPCALL_PATH" > /proc/fs/lustre/llite/fs0/gns_upcall || return $?
+    echo "upcall:  $(cat /proc/fs/lustre/llite/fs0/gns_upcall)"
+
+    echo -n "mount on open $OBJECT/test_file1: "
+    echo -n "test data" > $OBJECT/test_file1 >/dev/null 2>&1 || return $?
+
+    local ENTRY="`basename $OBJECT`"
+    
+    cat /proc/mounts | grep -q "$ENTRY" || {
+       echo "fail"
+       test -f $LOG && {
+           echo "======================== upcall log ==========================="
+           cat $LOG
+           echo "==============================================================="
+       } || {
+           echo "upcall log file $LOG is not found"
+       }
+       return 1
+    }
+    echo "success"
+
+    local sleep_time=$TIMOUT
+    let sleep_time+=$TICK*2
+    echo -n "waiting for umount ${sleep_time}s (timeout + tick*2): "
+    sleep $sleep_time
+
+    cat /proc/mounts | grep -q "$ENTRY" && {
+       echo "failed"
+       return 2
+    }
+    echo "success"
+    return 0
+}
+
+test_1a() {
+    local LOOP_DEV=$(find_free_loop 2>/dev/null)
+    local UPCALL="/tmp/gns-upcall.sh"
+    local LOOP_FILE="/tmp/gns_loop"
+    local OBJECT=".mntinfo"
+    local TIMOUT=5
+    local TICK=1
+
+    test "x$LOOP_DEV" != "x" && test -b $LOOP_DEV ||
+       error "can't find free loop device"
+
+    echo "preparing loop device $LOOP_DEV <-> $LOOP_FILE..."
+    cleanup_loop $LOOP_DEV $LOOP_FILE $DIR/gns_test_1a
+    setup_loop $LOOP_DEV $LOOP_FILE || error
+
+    echo "setting up GNS timeouts and mount object..."
+    echo "$OBJECT" > /proc/fs/lustre/llite/fs0/gns_object_name || error
+    echo "$TIMOUT" > /proc/fs/lustre/llite/fs0/gns_timeout || error
+    echo "$TICK" > /proc/fs/lustre/llite/fs0/gns_tick || error
+
+    echo ""
+    echo "timeout: $(cat /proc/fs/lustre/llite/fs0/gns_timeout)s"
+    echo "object:  $(cat /proc/fs/lustre/llite/fs0/gns_object_name)"
+    echo "tick:    $(cat /proc/fs/lustre/llite/fs0/gns_tick)s"
+    echo ""
+
+    echo "preparing mount object at $DIR/gns_test_1a/$OBJECT..."
+    mkdir -p $DIR/gns_test_1a || error
+    echo -n "-t ext2 $LOOP_DEV" > $DIR/gns_test_1a/$OBJECT
+    echo "======================== mount object ==========================="
+    cat $DIR/gns_test_1a/$OBJECT
+    echo ""
+    echo "================================================================="
+    chmod u+s $DIR/gns_test_1a || error
+
+    echo ""
+    echo "testing GNS with GENERIC upcall 2 times on the row"
+    for ((i=0;i<2;i++)); do
+       check_gns GENERIC $DIR/gns_test_1a $TIMOUT $TICK || {
+           cleanup_loop $LOOP_DEV $LOOP_FILE $DIR/gns_test_1a
+           error
+       }
+    done
+    
+    echo ""
+    echo "testing GNS with DEADLOCK upcall 2 times on the row"
+    for ((i=0;i<2;i++)); do
+       check_gns DEADLOCK $DIR/gns_test_1a $TIMOUT $TICK || {
+           cleanup_loop $LOOP_DEV $LOOP_FILE $DIR/gns_test_1a
+           error
+       }
+    done
+    
+    cleanup_loop $LOOP_DEV $LOOP_FILE $DIR/gns_test_1a
+}
+
+run_test 1a " general GNS test - mounting/umount ===================="
+
+TMPDIR=$OLDTMPDIR
+TMP=$OLDTMP
+HOME=$OLDHOME
+
+log "cleanup: ==========================================================="
+if [ "`mount | grep ^$NAME`" ]; then
+       rm -rf $DIR/[Rdfs][1-9]*
+       if [ "$I_MOUNTED" = "yes" ]; then
+               sh llmountcleanup.sh || error
+       fi
+fi
+
+echo '=========================== finished ==============================='
+[ -f "$SANITYLOG" ] && cat $SANITYLOG && exit 1 || true
index a7d79cc..8e0a86e 100644 (file)
@@ -18,6 +18,7 @@ ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-""}
 
 SRCDIR=`dirname $0`
 export PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH
+export SECURITY=${SECURITY:-"null"}
 
 TMP=${TMP:-/tmp}
 FSTYPE=${FSTYPE:-ext3}
@@ -41,6 +42,8 @@ IOPENTEST1=${IOPENTEST1:-iopentest1}
 IOPENTEST2=${IOPENTEST2:-iopentest2}
 PTLDEBUG=${PTLDEBUG:-0}
 
+. krb5_env.sh
+
 if [ $UID -ne 0 ]; then
        RUNAS_ID="$UID"
        RUNAS=""
@@ -49,6 +52,13 @@ else
        RUNAS=${RUNAS:-"runas -u $RUNAS_ID"}
 fi
 
+if [ `using_krb5_sec $SECURITY` == 'y' ] ; then
+    start_krb5_kdc || exit 1
+    if [ $RUNAS_ID -ne $UID ]; then
+        $RUNAS ./krb5_refresh_cache.sh || exit 2
+    fi
+fi
+
 export NAME=${NAME:-lmv}
 
 SAVE_PWD=$PWD
index 09431e4..d8a5598 100644 (file)
@@ -14,6 +14,7 @@ ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-""}
 
 SRCDIR=`dirname $0`
 export PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH
+export SECURITY=${SECURITY:-"null"}
 
 TMP=${TMP:-/tmp}
 FSTYPE=${FSTYPE:-ext3}
@@ -36,6 +37,8 @@ SOCKETCLIENT=${SOCKETCLIENT:-socketclient}
 IOPENTEST1=${IOPENTEST1:-iopentest1}
 IOPENTEST2=${IOPENTEST2:-iopentest2}
 
+. krb5_env.sh
+
 if [ $UID -ne 0 ]; then
        RUNAS_ID="$UID"
        RUNAS=""
@@ -44,6 +47,13 @@ else
        RUNAS=${RUNAS:-"runas -u $RUNAS_ID"}
 fi
 
+if [ `using_krb5_sec $SECURITY` == 'y' ] ; then
+    start_krb5_kdc || exit 1
+    if [ $RUNAS_ID -ne $UID ]; then
+        $RUNAS ./krb5_refresh_cache.sh || exit 2
+    fi
+fi
+
 export NAME=${NAME:-local}
 
 SAVE_PWD=$PWD
@@ -256,6 +266,67 @@ EOF
 
 run_test 1 "test root_squash ============================"
 
+test_2() {
+        touch $DIR/f2
+                                                                                                                             
+        #test set/get xattr
+        setfattr -n trusted.name1 -v value1 $DIR/f2 || error
+        [ "`getfattr -n trusted.name1 $DIR/f2 2> /dev/null | \
+        grep "trusted.name1"`" == "trusted.name1=\"value1\"" ] || error
+                                                                                                                             
+        setfattr -n user.author1 -v author1 $DIR/f2 || error
+        [ "`getfattr -n user.author1 $DIR/f2 2> /dev/null | \
+        grep "user.author1"`" == "user.author1=\"author1\"" ] || error
+
+        # test listxattr
+        setfattr -n trusted.name2 -v value2 $DIR/f2 || error
+        setfattr -n trusted.name3 -v value3 $DIR/f2 || error
+        [ `getfattr -d -m "^trusted" $DIR/f2 2> /dev/null | \
+        grep "trusted" | wc -l` -eq 5 ] || error
+
+                                                                                                                             
+        setfattr -n user.author2 -v author2 $DIR/f2 || error
+        setfattr -n user.author3 -v author3 $DIR/f2 || error
+        [ `getfattr -d -m "^user" $DIR/f2 2> /dev/null | \
+        grep "user" | wc -l` -eq 3 ] || error
+        #test removexattr
+        setfattr -x trusted.name1 $DIR/f2 2> /dev/null || error
+        getfattr -d -m trusted $DIR/f2 2> /dev/null | \
+        grep "trusted.name1" && error || true
+
+        setfattr -x user.author1 $DIR/f2 2> /dev/null || error
+        getfattr -d -m user $DIR/f2 2> /dev/null | \
+        grep "user.author1" && error || true
+}
+run_test 2 "set/get xattr test (trusted xattr only) ============"
+
+test_3 () {
+        SAVE_UMASK=`umask`
+        umask 022
+        USER1=rpm
+        USER2=vsx2
+        GROUP1=nobody
+        GROUP2=users
+
+        chmod +x runacltest
+        chmod +x acl_mode
+        cd $DIR
+
+       #sed -e "s/joe/$USER1/g;s/lisa/$USER2/g;s/users/$GROUP1/g;s/toolies/$GROUP2/g" $SAVE_PWD/setfacl.test | runacltest ||
+#error "$? setfacl tests failed"
+
+        #sed -e "s/joe/$USER1/g;s/lisa/$USER2/g;s/users/$GROUP1/g;s/toolies/$GROUP2/g" $SAVE_PWD/acl_asroot.test | runacltest || error "$? acl_asroot tests failed"
+
+        #sed -e "s/joe/$USER1/g;s/lisa/$USER2/g;s/users/$GROUP1/g;s/toolies/$GROUP2/g" $SAVE_PWD/acl_perm.test | runacltest || error "$? acl_perm tests failed"
+
+        #sed -e "s/joe/$USER1/g;s/lisa/$USER2/g;s/users/$GROUP1/g;s/toolies/$GROUP2/g" $SAVE_PWD/acl_misc.test | runacltest || error "$? acl_misc tests failed"
+
+        sed -e "s/joe/$USER1/g;s/lisa/$USER2/g;s/users/$GROUP1/g;s/toolies/$GROUP2/g" $SAVE_PWD/acl_fileutil.test | runacltest || error "$? acl_fileutil tests failed"
+
+        umask $SAVE_UMASK
+}
+run_test 3 "==============acl test ============="
+
 TMPDIR=$OLDTMPDIR
 TMP=$OLDTMP
 HOME=$OLDHOME
index acefc28..ded1e08 100644 (file)
@@ -10,13 +10,15 @@ ONLY=${ONLY:-"$*"}
 # bug number for skipped test: 2739
 # 51b and 51c depend on kernel
 # 65* fixes in b_hd_cray_merge3
-ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"51b 51c 65a 65b 65c 65d 65e 65f"}
+# the new kernel api make 48 not valid anymore
+ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"48 51b 51c 65a 65b 65c 65d 65e 65f"}
 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
 
 [ "$ALWAYS_EXCEPT$EXCEPT" ] && echo "Skipping tests: $ALWAYS_EXCEPT $EXCEPT"
 
 SRCDIR=`dirname $0`
 export PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH
+export SECURITY=${SECURITY:-"null"}
 
 TMP=${TMP:-/tmp}
 FSTYPE=${FSTYPE:-ext3}
@@ -40,6 +42,8 @@ IOPENTEST1=${IOPENTEST1:-iopentest1}
 IOPENTEST2=${IOPENTEST2:-iopentest2}
 MEMHOG=${MEMHOG:-memhog}
 
+. krb5_env.sh
+
 if [ $UID -ne 0 ]; then
        RUNAS_ID="$UID"
        RUNAS=""
@@ -48,6 +52,13 @@ else
        RUNAS=${RUNAS:-"runas -u $RUNAS_ID"}
 fi
 
+if [ `using_krb5_sec $SECURITY` == 'y' ] ; then
+    start_krb5_kdc || exit 1
+    if [ $RUNAS_ID -ne $UID ]; then
+        $RUNAS ./krb5_refresh_cache.sh || exit 2
+    fi
+fi
+
 export NAME=${NAME:-local}
 
 SAVE_PWD=$PWD
index 8ef4207..faecfc4 100644 (file)
@@ -366,6 +366,39 @@ test_18() {
 }
 run_test 18 "mmap sanity check ================================="
 
+test_19() {     # bug 2441
+        touch $DIR1/f2b
+                                                                                                                             
+        #test set/get xattr
+        setfattr -n trusted.name1 -v value1 $DIR1/f2b || error
+        [ "`getfattr -n trusted.name1 $DIR2/f2b 2> /dev/null | \
+        grep "trusted.name1"`" == "trusted.name1=\"value1\"" ] || error
+                                                                                                                             
+        setfattr -n user.author1 -v author1 $DIR/f2b || error
+        [ "`getfattr -n user.author1 $DIR/f2b 2> /dev/null | \
+        grep "user.author1"`" == "user.author1=\"author1\"" ] || error
+
+        # test listxattr
+        setfattr -n trusted.name2 -v value2 $DIR2/f2b || error
+        setfattr -n trusted.name3 -v value3 $DIR1/f2b || error
+        [ `getfattr -d -m "^trusted" $DIR2/f2b 2> /dev/null | \
+        grep "trusted" | wc -l` -eq 5 ] || error
+                                                                                                                             
+        setfattr -n user.author2 -v author2 $DIR/f2b || error
+        setfattr -n user.author3 -v author3 $DIR/f2b || error
+        [ `getfattr -d -m "^user" $DIR/f2b 2> /dev/null | \
+        grep "user" | wc -l` -eq 3 ] || error
+        #test removexattr
+        setfattr -x trusted.name1 $DIR2/f2b 2> /dev/null || error
+        getfattr -d -m trusted $DIR2/f2b 2> /dev/null | \
+        grep "trusted.name1" && error || true
+
+        setfattr -x user.author1 $DIR/f2b 2> /dev/null || error
+        getfattr -d -m user $DIR/f2b 2> /dev/null | \
+        grep "user.author1" && error || true
+}
+run_test 19 "test set/get xattr on multiple mounts ============"
+
 
 log "cleanup: ======================================================"
 rm -rf $DIR1/[df][0-9]* $DIR1/lnk || true
diff --git a/lustre/tests/setfacl.test b/lustre/tests/setfacl.test
new file mode 100644 (file)
index 0000000..af19462
--- /dev/null
@@ -0,0 +1,123 @@
+!
+! setfacl tests.
+!
+! Run these tests on a filesystem with ACL support.
+!
+$ umask 027
+$ touch g
+$ acl_mode g
+-rw-r-----
+$ setfacl -m m:- g
+$ acl_mode g
+-rw-------+
+$ getfacl g
+# file: g
+# owner: @OWNER@
+# group: @GROUP@
+user::rw-
+group::r--     #effective:---
+mask::---
+other::---
+
+$ setfacl -x m g
+$ getfacl g
+# file: g
+# owner: @OWNER@
+# group: @GROUP@
+user::rw-
+group::r--
+other::---
+
+$ setfacl -m u:joe:rw g
+$ getfacl g
+# file: g
+# owner: @OWNER@
+# group: @GROUP@
+user::rw-
+user:joe:rw-
+group::r--
+mask::rw-
+other::---
+
+$ setfacl -m u::rwx,g::r-x,o:- g
+$ getfacl g
+# file: g
+# owner: @OWNER@
+# group: @GROUP@
+user::rwx
+user:joe:rw-
+group::r-x
+mask::rwx
+other::---
+
+$ setfacl -m u::rwx,g::r-x,o:-,m:- g
+$ getfacl g
+# file: g
+# owner: @OWNER@
+# group: @GROUP@
+user::rwx
+user:joe:rw-   #effective:---
+group::r-x     #effective:---
+mask::---
+other::---
+
+$ setfacl -m u::rwx,g::r-x,o:-,u:root:-,m:- g
+$ getfacl g
+# file: g
+# owner: @OWNER@
+# group: @GROUP@
+user::rwx
+user:root:---
+user:joe:rw-   #effective:---
+group::r-x     #effective:---
+mask::---
+other::---
+
+$ setfacl -m u::rwx,g::r-x,o:-,u:root:-,m:- g
+$ getfacl g
+# file: g
+# owner: @OWNER@
+# group: @GROUP@
+user::rwx
+user:root:---
+user:joe:rw-   #effective:---
+group::r-x     #effective:---
+mask::---
+other::---
+
+$ setfacl -m u::rwx,g::r-x,o:-,u:root:- g
+$ getfacl g
+# file: g
+# owner: @OWNER@
+# group: @GROUP@
+user::rwx
+user:root:---
+user:joe:rw-
+group::r-x
+mask::rwx
+other::---
+
+$ setfacl --test -x u: g
+setfacl: g: Malformed access ACL `user:root:---,user:joe:rw-,group::r-x,mask::rwx,other::---': Missing or wrong entry at entry 1
+$ setfacl --test -x u:x
+setfacl: Option -x: Invalid argument near character 3
+$ setfacl -m d:u:root:rwx g
+setfacl: g: Only directories can have default ACLs
+$ setfacl -x m g
+setfacl: g: Malformed access ACL `user::rwx,user:root:---,user:joe:rw-,group::r-x,other::---': Missing or wrong entry at entry 5
+!setfacl --test -m d:u:joe:rwx setfacl
+!setfacl --test -n -m d:u:joe:rwx setfacl
+$ rm g
+!
+! Check if the mask is properly recalculated
+!
+$ mkdir d
+$ setfacl --test -m u::rwx,u:@OWNER@:rwx,g::r-x,o::--- d
+d: u::rwx,u:@OWNER@:rwx,g::r-x,m::rwx,o::---,*
+$ setfacl --test -m u::rwx,u:@OWNER@:rwx,g::r-x,m::---,o::--- d
+d: u::rwx,u:@OWNER@:rwx,g::r-x,m::---,o::---,*
+$ setfacl --test -d -m u::rwx,u:@OWNER@:rwx,g::r-x,o::--- d
+d: *,d:u::rwx,d:u:@OWNER@:rwx,d:g::r-x,d:m::rwx,d:o::---
+$ setfacl --test -d -m u::rwx,u:@OWNER@:rwx,g::r-x,m::---,o::--- d
+d: *,d:u::rwx,d:u:@OWNER@:rwx,d:g::r-x,d:m::---,d:o::---
+$ rmdir d
index c0f8ccd..395184d 100644 (file)
@@ -39,6 +39,7 @@ init_test_env() {
     export LCTL=${LCTL:-"$LUSTRE/utils/lctl"}
     export CHECKSTAT="${CHECKSTAT:-checkstat} "
     export FSYTPE=${FSTYPE:-"ext3"}
+    export SECURITY=${SECURITY:-"null"}
 
     # Paths on remote nodes, if different 
     export RLUSTRE=${RLUSTRE:-$LUSTRE}
@@ -63,6 +64,8 @@ init_test_env() {
 #    echo "CONFIG=`canonical_path $CONFIG`"  > $LUSTRE/tests/CONFIG
 }
 
+. krb5_env.sh
+
 # Facet functions
 start() {
     facet=$1
@@ -70,7 +73,7 @@ start() {
     active=`facet_active $facet`
     do_facet $facet $LCONF --select ${facet}_svc=${active}_facet \
         --node ${active}_facet  --ptldebug $PTLDEBUG --subsystem $SUBSYSTEM \
-        $@ $XMLCONFIG
+        --sec $SECURITY $@ $XMLCONFIG
 }
 
 stop() {
@@ -89,11 +92,13 @@ zconf_mount() {
     do_node $client mkdir $mnt 2> /dev/null || :
 
     if [ -x /sbin/mount.lustre ] ; then
-       do_node $client mount -t lustre -o nettype=$NETTYPE `facet_active_host mds1`:/mds1_svc/client_facet $mnt || return 1
+       do_node $client mount -t lustre -o sec=$SECURITY,nettype=$NETTYPE \
+                `facet_active_host mds1`:/mds1_svc/client_facet $mnt || return 2
     else
        # this is so cheating
        do_node $client $LCONF --nosetup --node client_facet $XMLCONFIG  > /dev/null || return 2
-       do_node $client $LLMOUNT `facet_active_host mds1`:/mds1_svc/client_facet $mnt -o nettype=$NETTYPE|| return 4
+       do_node $client $LLMOUNT `facet_active_host mds1`:/mds1_svc/client_facet $mnt \
+               -o sec=$SECURITY,nettype=$NETTYPE|| return 4
     fi
 
     [ -d /r ] && $LCTL modules > /r/tmp/ogdb-`hostname`
@@ -180,6 +185,16 @@ fail() {
     df $MOUNT || error "post-failover df: $?"
 }
 
+fail_drop() {
+    local facet=$1
+    local failcode=$2
+    facet_failover $facet
+    do_facet mds "echo $failcode > /proc/sys/lustre/fail_loc"
+    cat /proc/sys/lustre/fail_loc
+    df $MOUNT || error "post-failover df: $?"
+    do_facet mds "echo 0 > /proc/sys/lustre/fail_loc"
+}
+
 fail_abort() {
     local facet=$1
     stop $facet --force --failover --nomod
index 59147ac..5c5ce2e 100644 (file)
@@ -18,5 +18,6 @@ lfs
 llmount
 mount.lustre
 wiretest
+lsd_upcall
 .*.cmd
 .*.d
index 23eb876..62707cc 100644 (file)
@@ -13,7 +13,7 @@ bin_scripts = lfind lstripe
 
 if UTILS
 rootsbin_SCRIPTS = mount.lustre
-sbin_PROGRAMS = lctl obdio obdbarrier lload wirecheck wiretest llmount l_getgroups
+sbin_PROGRAMS = lctl obdio obdbarrier lload wirecheck wiretest llmount lsd_upcall
 bin_PROGRAMS = lfs
 lib_LIBRARIES = liblustreapi.a
 sbin_SCRIPTS = $(sbin_scripts)
@@ -33,7 +33,7 @@ obdbarrier_SOURCES = obdbarrier.c obdiolib.c obdiolib.h
 lfs_SOURCES = lfs.c 
 llmount_SOURCES = llmount.c 
 llmount_LDADD = $(LIBREADLINE) -lptlctl
-l_getgroups_SOURCES = l_getgroups.c
+lsd_upcall_SOURCES = lsd_upcall.c
 
 EXTRA_DIST = $(bin_scripts) $(sbin_scripts)
 
index f704c77..d42ae9d 100755 (executable)
@@ -587,6 +587,13 @@ class LCTLInterface:
   quit""" % (type, name, uuid)
         self.run(cmds)
         
+    def set_security(self, name, key, value):
+        cmds = """
+  cfg_device %s
+  set_security %s %s
+  quit""" % (name, key, value)
+        self.run(cmds)
+
     def setup(self, name, setup = ""):
         cmds = """
   cfg_device %s
@@ -1177,6 +1184,8 @@ class kmod:
         self.dev_dir = dev_dir
         self.name = name
 
+    # FIXME: we ignore a failure to load the gss module, because we
+    # might not need it at all.
     def load(self):
         """Load module"""
         log ('loading module:', self.name, 'srcdir',
@@ -1184,15 +1193,21 @@ class kmod:
         if self.src_dir:
             module = kmod_find(self.src_dir, self.dev_dir,
                                self.name)
-            if not module:
+            if not module and self.name != 'ptlrpcs_gss':
                 panic('module not found:', self.name)
             (rc, out)  = run('/sbin/insmod', module)
             if rc:
-                raise CommandError('insmod', out, rc)
+                if self.name == 'ptlrpcs_gss':
+                    print "Warning: gss security is not supported!"
+                else:
+                    raise CommandError('insmod', out, rc)
         else:
             (rc, out) = run('/sbin/modprobe', self.name)
             if rc:
-                raise CommandError('modprobe', out, rc)
+                if self.name == 'ptlrpcs_gss':
+                    print "Warning: gss security is not supported!"
+                else:
+                    raise CommandError('modprobe', out, rc)
 
     def cleanup(self):
        """Unload module"""
@@ -1545,7 +1560,9 @@ class LDLM(Module):
     def add_module(self, manager):
         manager.add_lustre_module('lvfs', 'lvfs')
         manager.add_lustre_module('obdclass', 'obdclass')
+        manager.add_lustre_module('sec', 'ptlrpcs')
         manager.add_lustre_module('ptlrpc', 'ptlrpc')
+        manager.add_lustre_module('sec/gss', 'ptlrpcs_gss')
 
     def prepare(self):
         return
@@ -1892,16 +1909,21 @@ class MDSDEV(Module):
            self.info("mds", realdev, mountfsoptions, self.fstype, self.size, 
                      self.format, master_name, profile_name, self.obdtype)
            
-           lctl.newdev("mds", self.name, self.uuid,
-                       setup = "%s %s %s %s %s %s" %(realdev, 
+            lctl.attach("mds", self.name, self.uuid)
+            if config.mds_mds_sec:
+                lctl.set_security(self.name, "mds_mds_sec", config.mds_mds_sec)
+            if config.mds_ost_sec:
+                lctl.set_security(self.name, "mds_ost_sec", config.mds_ost_sec)
+
+            lctl.setup(self.name, setup = "%s %s %s %s %s %s" %(realdev, 
                            self.fstype, profile_name, mountfsoptions,
                             master_name, self.obdtype))
 
             if development_mode():
-                procentry = "/proc/fs/lustre/mds/grp_hash_upcall"
-                upcall = os.path.abspath(os.path.dirname(sys.argv[0]) + "/l_getgroups")
+                procentry = "/proc/fs/lustre/mds/lsd_upcall"
+                upcall = os.path.abspath(os.path.dirname(sys.argv[0]) + "/lsd_upcall")
                 if not (os.access(procentry, os.R_OK) and os.access(upcall, os.R_OK)):
-                    print "MDS Warning: failed to set group-hash upcall"
+                    print "MDS Warning: failed to set lsd cache upcall"
                 else:
                     run("echo ", upcall, " > ", procentry)
 
@@ -2686,8 +2708,10 @@ class Mountpoint(Module):
             self.clientoptions = string.replace(self.clientoptions, "async", 
                                                "lasync")
 
-        cmd = "mount -t lustre_lite -o osc=%s,mdc=%s%s %s %s" % \
-              (self.vosc.get_name(), vmdc_name, self.clientoptions, 
+        if not config.sec:
+            config.sec = "null"
+        cmd = "mount -t lustre_lite -o osc=%s,mdc=%s,sec=%s%s %s %s" % \
+              (self.vosc.get_name(), vmdc_name, config.sec, self.clientoptions,
               config.config, self.path)
         run("mkdir", self.path)
         ret, val = run(cmd)
@@ -3483,6 +3507,9 @@ lconf_options = [
     ('config', "Cluster config name used for LDAP query", PARAM),
     ('select', "service=nodeA,service2=nodeB ", PARAMLIST),
     ('node',   "Load config for <nodename>", PARAM),
+    ('sec',    "security flavor <null|krb5i|krb5p> for client connections", PARAM),
+    ('mds_mds_sec', "security flavor <null|krb5i|krb5p> for MDS-MDS connections", PARAM),
+    ('mds_ost_sec', "security flavor <null|krb5i|krb5p> for MDS-OST connections", PARAM),
     ('cleanup,d', "Cleans up config. (Shutdown)"),
     ('force,f', "Forced unmounting and/or obd detach during cleanup",
                FLAG, 0),
index cd70a94..27d2b5f 100644 (file)
@@ -224,7 +224,6 @@ command_t cmdlist[] = {
         {"deactivate", jt_obd_deactivate, 0, "deactivate an import\n"},
         {"recover", jt_obd_recover, 0, "usage: recover [<connection UUID>]"},
         {"lookup", jt_obd_mdc_lookup, 0, "usage: lookup <directory> <file>"},
-        {"finish_gns", jt_obd_finish_gns, 0, "usage: finish_gns <directory>"},
         {"notransno", jt_obd_no_transno, 0,
          "disable sending of committed-transno updates\n"},
         {"readonly", jt_obd_set_readonly, 0,
@@ -246,6 +245,8 @@ command_t cmdlist[] = {
          "usage: add_conn <conn_uuid> [priority]\n"},
         {"del_conn ", jt_lcfg_del_conn, 0,
          "usage: del_conn <conn_uuid> \n"},
+        {"set_security", jt_lcfg_set_security, 0,
+         "usage: set_security key value\n"},
         {"lsync", jt_obd_reint_sync, 0,
          "usage: lsync\n"},  
         {"cache_on", jt_obd_cache_on, 0,
index 8ab5705..27b39cd 100644 (file)
@@ -33,6 +33,8 @@
 #define _GNU_SOURCE
 #include <getopt.h>
 #include <sys/utsname.h>
+#include <pwd.h>
+#include <grp.h>
 
 #include "obdctl.h"
 #include <portals/ptlctl.h>
@@ -117,6 +119,9 @@ init_options(struct lustre_mount_data *lmd)
         lmd->lmd_local_nid = PTL_NID_ANY;
         lmd->lmd_port = 988;    /* XXX define LUSTRE_DEFAULT_PORT */
         lmd->lmd_nal = SOCKNAL;
+        lmd->lmd_nllu = 99;
+        lmd->lmd_nllg = 99;
+        strncpy(lmd->lmd_security, "null", sizeof(lmd->lmd_security));
         return 0;
 }
 
@@ -127,6 +132,7 @@ print_options(struct lustre_mount_data *lmd)
 
         printf("mds:             %s\n", lmd->lmd_mds);
         printf("profile:         %s\n", lmd->lmd_profile);
+        printf("sec_flavor:      %s\n", lmd->lmd_security);
         printf("server_nid:      "LPX64"\n", lmd->lmd_server_nid);
         printf("local_nid:       "LPX64"\n", lmd->lmd_local_nid);
         printf("nal:             %d\n", lmd->lmd_nal);
@@ -199,6 +205,60 @@ static int parse_route(char *opteq, char *opttgts)
         return(0);
 }
 
+/*
+ * All we do here is guarantee that the result is exactly what the
+ * user intended, with no ambiguity.  Perhaps a simpler library call
+ * could do the same job for us?  (See the strtoul() sketch after this
+ * hunk.)
+ */
+static int parse_u32(char *str, uint32_t *res)
+{
+        unsigned long id;
+        char *endptr = NULL;
+
+        id = strtol(str, &endptr, 0);
+        if (endptr && *endptr != 0)
+                return -1;
+
+        if (id == LONG_MAX || id == LONG_MIN)
+                return -1;
+
+        if ((uint32_t)id != id)
+                return -1;
+
+        *res = (uint32_t) id;
+        return 0;
+}
+
+static int parse_nllu(struct lustre_mount_data *lmd, char *str_nllu)
+{
+        struct passwd *pass;
+
+        if (parse_u32(str_nllu, &lmd->lmd_nllu) == 0)
+                return 0;
+
+        pass = getpwnam(str_nllu);
+        if (pass == NULL)
+                return -1;
+
+        lmd->lmd_nllu = pass->pw_uid;
+        return 0;
+}
+
+static int parse_nllg(struct lustre_mount_data *lmd, char *str_nllg)
+{
+        struct group *grp;
+
+        if (parse_u32(str_nllg, &lmd->lmd_nllg) == 0)
+                return 0;
+
+        grp = getgrnam(str_nllg);
+        if (grp == NULL)
+                return -1;
+
+        lmd->lmd_nllg = grp->gr_gid;
+        return 0;
+}
+
 int parse_options(char * options, struct lustre_mount_data *lmd)
 {
         ptl_nid_t nid = 0, cluster_id = 0;
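
The comment at the top of these new helpers asks whether a simpler library call could do the strict parse. One standard alternative is strtoul(3) plus an errno check; the sketch below is illustrative only (the name parse_u32_strtoul is hypothetical) and is not part of this patch.

    #include <errno.h>
    #include <stdint.h>
    #include <stdlib.h>

    /* Strict string-to-uint32 parse: reject empty input, a leading '-'
     * (which strtoul() would silently wrap around), trailing junk,
     * overflow, and values that do not fit in 32 bits. */
    static int parse_u32_strtoul(const char *str, uint32_t *res)
    {
            char *endptr = NULL;
            unsigned long val;

            if (str == NULL || *str == '-')
                    return -1;

            errno = 0;
            val = strtoul(str, &endptr, 0);
            if (errno != 0 || endptr == str || *endptr != '\0')
                    return -1;
            if (val != (uint32_t)val)
                    return -1;

            *res = (uint32_t)val;
            return 0;
    }
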
@@ -247,6 +307,23 @@ int parse_options(char * options, struct lustre_mount_data *lmd)
                                 lmd->lmd_server_nid = nid;
                         } else if (!strcmp(opt, "port")) {
                                 lmd->lmd_port = val;
+                        } else if (!strcmp(opt, "sec")) {
+                                strncpy(lmd->lmd_security, opteq + 1,
+                                        sizeof(lmd->lmd_security));
+                        } else if (!strcmp(opt, "nllu")) {
+                                if (parse_nllu(lmd, opteq + 1)) {
+                                        fprintf(stderr, "%s: "
+                                                "can't parse user: %s\n",
+                                                progname, opteq + 1);
+                                        return (-1);
+                                }
+                        } else if (!strcmp(opt, "nllg")) {
+                                if (parse_nllg(lmd, opteq + 1)) {
+                                        fprintf(stderr, "%s: "
+                                                "can't parse group: %s\n",
+                                                progname, opteq + 1);
+                                        return (-1);
+                                }
                         }
                 } else {
                         val = 1;
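
One caveat worth noting for the "sec" branch above: strncpy() leaves the destination without a NUL terminator when the source is as long as the buffer. A defensive variant might look like the sketch below; it is illustrative only, copy_sec_flavor is a hypothetical helper, and lmd_security is assumed to be a fixed-size char array, as the sizeof() usage implies.

    #include <stdio.h>
    #include <string.h>

    /* Bounded copy that always NUL-terminates and rejects over-long
     * flavors, unlike a bare strncpy() into a buffer the source might
     * exactly fill. */
    static int copy_sec_flavor(char *dst, size_t dst_size, const char *flavor)
    {
            if (dst_size == 0 || strlen(flavor) >= dst_size) {
                    fprintf(stderr, "security flavor '%s' too long\n", flavor);
                    return -1;
            }
            strcpy(dst, flavor);    /* length already checked above */
            return 0;
    }

    /* usage, mirroring the "sec" option branch:
     *      copy_sec_flavor(lmd->lmd_security, sizeof(lmd->lmd_security),
     *                      opteq + 1);
     */
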
index 56d3d04..6106634 100755 (executable)
@@ -2,11 +2,13 @@
 
 LIBLUSTRE_MOUNT_POINT=${LIBLUSTRE_MOUNT_POINT:-"/mnt/lustre"}
 LIBLUSTRE_MOUNT_TARGET=${LIBLUSTRE_MOUNT_TARGET:-"TARGET_NOT_SET"}
+LIBLUSTRE_SECURITY=${LIBLUSTRE_SECURITY:-"null"}
 LIBLUSTRE_DUMPFILE=${LIBLUSTRE_DUMPFILE:-"/tmp/DUMP_FILE"}
 LD_PRELOAD=${LD_PRELOAD:-"/usr/lib/liblustre.so"}
 
 export LIBLUSTRE_MOUNT_POINT
 export LIBLUSTRE_MOUNT_TARGET
+export LIBLUSTRE_SECURITY
 export LIBLUSTRE_DUMPFILE
 export LD_PRELOAD
 
similarity index 84%
rename from lustre/utils/l_getgroups.c
rename to lustre/utils/lsd_upcall.c
index 2f9b7d0..8b55d45 100644 (file)
 #include <pwd.h>
 #include <grp.h>
 
+#include <liblustre.h>
+#include <linux/lustre_idl.h>
+#include <linux/obd.h>
+#include <linux/lustre_mds.h>
+
 /*
  * return:
  *  0:      fail to insert (found identical)
@@ -55,7 +60,7 @@ int insert_sort(gid_t *groups, int size, gid_t grp)
         return 1;
 }
 
-int get_groups_local(uid_t uid, int *ngroups, gid_t **groups)
+int get_groups_local(uid_t uid, gid_t *gid, int *ngroups, gid_t **groups)
 {
         int     maxgroups;
         int     i, size = 0;
@@ -73,6 +78,8 @@ int get_groups_local(uid_t uid, int *ngroups, gid_t **groups)
         if (!pw)
                 return -errno;
 
+        *gid = pw->pw_gid;
+
         while ((gr = getgrent())) {
                 if (!gr->gr_mem)
                         continue;
@@ -92,14 +99,9 @@ int get_groups_local(uid_t uid, int *ngroups, gid_t **groups)
 
 int main (int argc, char **argv)
 {
+        char   *pathname = "/proc/fs/lustre/mds/lsd_downcall";
         int     fd, rc;
-        struct {
-                uint32_t err;
-                uint32_t uid;
-                uint32_t ngroups;
-                gid_t   *groups;
-        } ioc_data;
-        char    *pathname = "/proc/fs/lustre/mds/group_info";
+        struct lsd_downcall_args ioc_data;
 
         if (argc != 2) {
                 printf("bad parameter\n");
@@ -115,7 +117,13 @@ int main (int argc, char **argv)
                 return rc;
         }
 
-        ioc_data.err = get_groups_local(ioc_data.uid, &ioc_data.ngroups, &ioc_data.groups);
+        ioc_data.err = get_groups_local(ioc_data.uid, &ioc_data.gid,
+                                        &ioc_data.ngroups, &ioc_data.groups);
+
+        /* FIXME: get these from a config file */
+        ioc_data.allow_setuid = 1;
+        ioc_data.allow_setgid = 1;
+        ioc_data.allow_setgrp = 1;
 
         rc = write(fd, &ioc_data, sizeof(ioc_data));
         return (rc != sizeof(ioc_data));
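
For orientation, the sketch below shows the full round trip the renamed helper implements: the MDS invokes the registered upcall with a uid, the helper resolves the primary gid and supplementary groups, fills in the setuid/setgid/setgrp policy, and writes the result to /proc/fs/lustre/mds/lsd_downcall. It is a standalone illustration, not this patch's code: the struct only mirrors the fields the hunk above touches, and the real lsd_downcall_args definition in lustre_mds.h may differ.

    #include <errno.h>
    #include <fcntl.h>
    #include <grp.h>
    #include <limits.h>
    #include <pwd.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    /* Field names follow the usage in the hunk above; the real layout
     * lives in <linux/lustre_mds.h> and may differ. */
    struct lsd_downcall_sketch {
            uint32_t err;
            uint32_t uid;
            uint32_t gid;
            uint32_t ngroups;
            gid_t   *groups;        /* kernel presumably copies via this pointer */
            uint32_t allow_setuid;
            uint32_t allow_setgid;
            uint32_t allow_setgrp;
    };

    int main(int argc, char **argv)
    {
            struct lsd_downcall_sketch dc = { 0 };
            gid_t groups[NGROUPS_MAX];
            int ngroups = NGROUPS_MAX;
            struct passwd *pw;
            int fd, rc;

            if (argc != 2) {
                    fprintf(stderr, "usage: %s <uid>\n", argv[0]);
                    return 1;
            }
            dc.uid = (uint32_t)strtoul(argv[1], NULL, 0);

            pw = getpwuid(dc.uid);
            if (pw == NULL) {
                    dc.err = errno ? errno : ENOENT;
            } else {
                    dc.gid = pw->pw_gid;
                    /* gather supplementary groups in one library call */
                    if (getgrouplist(pw->pw_name, pw->pw_gid,
                                     groups, &ngroups) < 0) {
                            dc.err = E2BIG;
                            ngroups = 0;
                    }
                    dc.ngroups = (uint32_t)ngroups;
                    dc.groups = groups;
            }

            /* policy knobs; the patch above hardcodes these pending a config file */
            dc.allow_setuid = 1;
            dc.allow_setgid = 1;
            dc.allow_setgrp = 1;

            fd = open("/proc/fs/lustre/mds/lsd_downcall", O_WRONLY);
            if (fd < 0) {
                    perror("open /proc/fs/lustre/mds/lsd_downcall");
                    return 1;
            }
            rc = write(fd, &dc, sizeof(dc));
            close(fd);
            return rc != sizeof(dc);
    }
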
index 9565aaa..b0af4a6 100644 (file)
@@ -706,3 +706,46 @@ int jt_lcfg_del_conn(int argc, char **argv)
         return rc;
 }
 
+int jt_lcfg_set_security(int argc, char **argv)
+{
+        struct lustre_cfg lcfg;
+        int rc;
+
+        if (argc != 3)
+                return CMD_HELP;
+
+        if (lcfg_devname == NULL) {
+                fprintf(stderr, "%s: please use 'cfg_device name' to set the "
+                        "device name for config commands.\n",
+                        jt_cmdname(argv[0]));
+                return -EINVAL;
+        }
+
+        LCFG_INIT(lcfg, LCFG_SET_SECURITY, lcfg_devname);
+
+        /* currently security flavors can only be set on an mds device */
+        if (strcmp(argv[1], "mds_mds_sec") && strcmp(argv[1], "mds_ost_sec")) {
+                fprintf(stderr, "%s: invalid security key %s\n",
+                        jt_cmdname(argv[0]), argv[1]);
+                return -EINVAL;
+        }
+        if (strcmp(argv[2], "null") && strcmp(argv[2], "krb5")) {
+                fprintf(stderr, "%s: invalid security value %s\n",
+                        jt_cmdname(argv[0]), argv[2]);
+                return -EINVAL;
+        }
+
+        /* security key and value */
+        lcfg.lcfg_inllen1 = strlen(argv[1]) + 1;
+        lcfg.lcfg_inlbuf1 = argv[1];
+        lcfg.lcfg_inllen2 = strlen(argv[2]) + 1;
+        lcfg.lcfg_inlbuf2 = argv[2];
+
+        rc = lcfg_ioctl(argv[0], OBD_DEV_ID, &lcfg);
+        if (rc < 0) {
+                fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]),
+                        strerror(rc = errno));
+        }
+
+        return rc;
+}
index 3c40db2..962e26a 100644 (file)
@@ -2005,36 +2005,6 @@ int jt_obd_mdc_lookup(int argc, char **argv)
         return rc;
 }
 
-int jt_obd_finish_gns(int argc, char **argv)
-{
-        char *mtpt;
-        int rc, fd;
-        struct obd_ioctl_data data;
-
-        if (argc != 2)
-                return CMD_HELP;
-
-        mtpt = argv[1];
-
-        fd = open(mtpt, O_RDONLY);
-        if (fd < 0) {
-                fprintf(stderr, "open \"%s\" failed: %s\n", mtpt,
-                        strerror(errno));
-                return -1;
-        }
-
-        IOC_INIT(data);
-        IOC_PACK(argv[0], data);
-        rc = ioctl(fd, IOC_MDC_FINISH_GNS, buf);
-        if (rc < 0) {
-                fprintf(stderr, "error: %s(%s) ioctl error: %s\n",
-                        jt_cmdname(argv[0]), mtpt, strerror(rc = errno));
-        }
-        close(fd);
-
-        return rc;
-}
-
 int jt_obd_close_uuid(int argc, char **argv)
 {
         int rc, nal;
index 415b752..e4b47da 100644 (file)
@@ -74,7 +74,6 @@ int jt_obd_activate(int argc, char **argv);
 int jt_obd_deactivate(int argc, char **argv);
 int jt_obd_recover(int argc, char **argv);
 int jt_obd_mdc_lookup(int argc, char **argv);
-int jt_obd_finish_gns(int argc, char **argv);
 int jt_get_version(int argc, char **argv);
 int jt_obd_close_uuid(int argc, char **argv);
 int jt_cfg_record(int argc, char **argv);
@@ -115,6 +114,7 @@ int jt_lcfg_set_timeout(int argc, char **argv);
 int jt_lcfg_set_lustre_upcall(int argc, char **argv);
 int jt_lcfg_add_conn(int argc, char **argv);
 int jt_lcfg_del_conn(int argc, char **argv);
+int jt_lcfg_set_security(int argc, char **argv);
 
 int obd_add_uuid(char *uuid, ptl_nid_t nid, int nal);