From 89f9a5bced24ecb7c84040a1ed88dcef4384f7c6 Mon Sep 17 00:00:00 2001 From: ericm Date: Thu, 31 Mar 2005 22:18:52 +0000 Subject: [PATCH] land lustre part of b_hd_sec on HEAD. --- .../patches/ext3-wantedi-2.6-suse.patch | 50 +- lustre/Makefile.in | 1 + lustre/autoMakefile.am | 2 +- lustre/autoconf/lustre-core.m4 | 24 +- lustre/cobd/cache_obd.c | 10 +- lustre/include/liblustre.h | 68 +- lustre/include/linux/Makefile.am | 3 +- lustre/include/linux/lustre_acl.h | 36 + lustre/include/linux/lustre_cfg.h | 4 + lustre/include/linux/lustre_compat25.h | 10 + lustre/include/linux/lustre_export.h | 7 + lustre/include/linux/lustre_idl.h | 36 +- lustre/include/linux/lustre_import.h | 7 +- lustre/include/linux/lustre_lite.h | 18 +- lustre/include/linux/lustre_mds.h | 50 +- lustre/include/linux/lustre_net.h | 35 +- lustre/include/linux/lustre_sec.h | 360 + lustre/include/linux/lustre_smfs.h | 1 + lustre/include/linux/lustre_ucache.h | 79 + lustre/include/linux/lvfs.h | 35 +- lustre/include/linux/obd.h | 15 +- lustre/include/linux/obd_class.h | 6 +- lustre/include/linux/obd_support.h | 8 + .../patches/dcache-mds-num-2.6.7.patch | 13 +- .../patches/export-vanilla-2.6.patch | 94 + .../patches/ext3-wantedi-2.6-suse.patch | 50 +- .../patches/header_guards-vanilla-2.6.patch | 45 + .../kernel_patches/patches/iopen-2.6-vanilla.patch | 6 +- .../patches/linux-2.6.7-CITI_NFS4_ALL-7-lsec.patch | 16246 +++++++++++++++++++ .../patches/vfs-dcache_locking-vanilla-2.6.patch | 85 + .../vfs-dcache_lustre_invalid-vanilla-2.6.patch | 37 + .../kernel_patches/patches/vfs-do_truncate.patch | 87 + .../patches/vfs-gns_export_doumount.patch | 34 + .../patches/vfs-intent_api-vanilla-2.6.patch | 555 + .../patches/vfs-lookup_last-vanilla-2.6.patch | 77 + .../kernel_patches/patches/vfs-pdirops-2.6.7.patch | 28 +- .../patches/vfs-raw_ops-vanilla-2.6.patch | 235 + .../patches/vfs_fmode_exec-2.6.patch | 34 + .../patches/vfs_gns-2.6-vanilla.patch | 55 + .../patches/vfs_intent-2.6-vanilla.patch | 27 +- 
lustre/kernel_patches/series/2.6-vanilla.series | 30 +- lustre/ldlm/ldlm_lib.c | 52 +- lustre/ldlm/ldlm_lock.c | 2 + lustre/liblustre/Makefile.am | 5 +- lustre/liblustre/dir.c | 2 +- lustre/liblustre/file.c | 9 +- lustre/liblustre/genlib.sh | 1 + lustre/liblustre/namei.c | 14 +- lustre/liblustre/super.c | 19 +- lustre/llite/dcache.c | 178 +- lustre/llite/dir.c | 12 +- lustre/llite/file.c | 290 +- lustre/llite/llite_gns.c | 483 +- lustre/llite/llite_internal.h | 70 +- lustre/llite/llite_lib.c | 150 +- lustre/llite/llite_nfs.c | 3 +- lustre/llite/lproc_llite.c | 137 +- lustre/llite/namei.c | 111 +- lustre/llite/special.c | 11 +- lustre/llite/super.c | 15 +- lustre/llite/super25.c | 18 +- lustre/llite/symlink.c | 10 +- lustre/lmv/lmv_intent.c | 65 +- lustre/lmv/lmv_obd.c | 62 +- lustre/lmv/lmv_objmgr.c | 2 +- lustre/lov/lov_obd.c | 35 + lustre/lvfs/lvfs_reint.c | 2 +- lustre/mdc/autoMakefile.am | 2 +- lustre/mdc/mdc_locks.c | 81 +- lustre/mdc/mdc_request.c | 122 +- lustre/mds/Makefile.in | 2 +- lustre/mds/handler.c | 442 +- lustre/mds/lproc_mds.c | 150 +- lustre/mds/mds_fs.c | 2 + lustre/mds/mds_groups.c | 451 - lustre/mds/mds_internal.h | 40 +- lustre/mds/mds_lib.c | 373 +- lustre/mds/mds_lmv.c | 21 +- lustre/mds/mds_lov.c | 10 + lustre/mds/mds_lsd.c | 240 + lustre/mds/mds_open.c | 19 +- lustre/mds/mds_reint.c | 83 +- lustre/obdclass/class_obd.c | 25 + lustre/obdclass/genops.c | 2 + lustre/obdfilter/filter_log.c | 10 +- lustre/osc/osc_lib.c | 2 + lustre/osc/osc_request.c | 26 + lustre/ost/ost_handler.c | 12 + lustre/ptlrpc/autoMakefile.am | 2 +- lustre/ptlrpc/client.c | 87 +- lustre/ptlrpc/events.c | 8 +- lustre/ptlrpc/import.c | 28 +- lustre/ptlrpc/lproc_ptlrpc.c | 3 + lustre/ptlrpc/niobuf.c | 211 +- lustre/ptlrpc/pack_generic.c | 57 +- lustre/ptlrpc/ptlrpc_internal.h | 11 +- lustre/ptlrpc/ptlrpc_module.c | 1 + lustre/ptlrpc/service.c | 41 +- lustre/sec/.cvsignore | 15 + lustre/sec/Makefile.in | 6 + lustre/sec/Makefile.mk | 10 + lustre/sec/autoMakefile.am | 22 + 
lustre/sec/doc/oss_gss_HLD.lyx | 258 + lustre/sec/doc/remote_ugid_HLD.lyx | 884 + lustre/sec/doc/revoke_user_HLD.lyx | 244 + lustre/sec/gss/.cvsignore | 15 + lustre/sec/gss/Makefile.in | 9 + lustre/sec/gss/Makefile.mk | 14 + lustre/sec/gss/autoMakefile.am | 23 + lustre/sec/gss/gss_api.h | 132 + lustre/sec/gss/gss_asn1.h | 87 + lustre/sec/gss/gss_err.h | 181 + lustre/sec/gss/gss_generic_token.c | 295 + lustre/sec/gss/gss_internal.h | 106 + lustre/sec/gss/gss_krb5.h | 183 + lustre/sec/gss/gss_krb5_crypto.c | 256 + lustre/sec/gss/gss_krb5_mech.c | 316 + lustre/sec/gss/gss_krb5_seal.c | 178 + lustre/sec/gss/gss_krb5_seqnum.c | 116 + lustre/sec/gss/gss_krb5_unseal.c | 212 + lustre/sec/gss/gss_krb5_wrap.c | 381 + lustre/sec/gss/gss_mech_switch.c | 302 + lustre/sec/gss/rawobj.c | 170 + lustre/sec/gss/sec_gss.c | 1799 ++ lustre/sec/gss/svcsec_gss.c | 1534 ++ lustre/sec/sec.c | 932 ++ lustre/sec/sec_null.c | 195 + lustre/sec/svcsec.c | 273 + lustre/sec/svcsec_null.c | 111 + lustre/sec/upcall_cache.c | 414 + lustre/smfs/dir.c | 2 - lustre/tests/acl_asroot.test | 46 + lustre/tests/acl_fileutil.test | 66 + lustre/tests/acl_misc.test | 386 + lustre/tests/acl_mode | 2 + lustre/tests/acl_perm.test | 18 + lustre/tests/conf-sanity.sh | 21 +- lustre/tests/gns-upcall.sh | 13 + lustre/tests/insanity.sh | 5 + lustre/tests/krb5_env.sh | 101 + lustre/tests/krb5_refresh_cache.sh | 15 + lustre/tests/llmount.sh | 18 +- lustre/tests/llmountcleanup.sh | 5 + lustre/tests/llrmount.sh | 18 +- lustre/tests/lmv.sh | 2 +- lustre/tests/recovery-small.sh | 6 +- lustre/tests/replay-dual.sh | 6 + lustre/tests/replay-single.sh | 21 + lustre/tests/runacltest | 160 + lustre/tests/sanity-gns.sh | 387 + lustre/tests/sanity-lmv.sh | 10 + lustre/tests/sanity-sec.sh | 71 + lustre/tests/sanity.sh | 13 +- lustre/tests/sanityN.sh | 33 + lustre/tests/setfacl.test | 123 + lustre/tests/test-framework.sh | 21 +- lustre/utils/.cvsignore | 1 + lustre/utils/Makefile.am | 4 +- lustre/utils/lconf | 47 +- 
lustre/utils/lctl.c | 3 +- lustre/utils/llmount.c | 77 + lustre/utils/lrun | 2 + lustre/utils/{l_getgroups.c => lsd_upcall.c} | 26 +- lustre/utils/lustre_cfg.c | 43 + lustre/utils/obd.c | 30 - lustre/utils/obdctl.h | 2 +- 166 files changed, 32931 insertions(+), 1471 deletions(-) create mode 100644 lustre/include/linux/lustre_acl.h create mode 100644 lustre/include/linux/lustre_sec.h create mode 100644 lustre/include/linux/lustre_ucache.h create mode 100644 lustre/kernel_patches/patches/export-vanilla-2.6.patch create mode 100644 lustre/kernel_patches/patches/header_guards-vanilla-2.6.patch create mode 100644 lustre/kernel_patches/patches/linux-2.6.7-CITI_NFS4_ALL-7-lsec.patch create mode 100644 lustre/kernel_patches/patches/vfs-dcache_locking-vanilla-2.6.patch create mode 100644 lustre/kernel_patches/patches/vfs-dcache_lustre_invalid-vanilla-2.6.patch create mode 100644 lustre/kernel_patches/patches/vfs-do_truncate.patch create mode 100644 lustre/kernel_patches/patches/vfs-gns_export_doumount.patch create mode 100644 lustre/kernel_patches/patches/vfs-intent_api-vanilla-2.6.patch create mode 100644 lustre/kernel_patches/patches/vfs-lookup_last-vanilla-2.6.patch create mode 100644 lustre/kernel_patches/patches/vfs-raw_ops-vanilla-2.6.patch create mode 100644 lustre/kernel_patches/patches/vfs_fmode_exec-2.6.patch create mode 100644 lustre/kernel_patches/patches/vfs_gns-2.6-vanilla.patch delete mode 100644 lustre/mds/mds_groups.c create mode 100644 lustre/mds/mds_lsd.c create mode 100644 lustre/sec/.cvsignore create mode 100644 lustre/sec/Makefile.in create mode 100644 lustre/sec/Makefile.mk create mode 100644 lustre/sec/autoMakefile.am create mode 100644 lustre/sec/doc/oss_gss_HLD.lyx create mode 100644 lustre/sec/doc/remote_ugid_HLD.lyx create mode 100644 lustre/sec/doc/revoke_user_HLD.lyx create mode 100644 lustre/sec/gss/.cvsignore create mode 100644 lustre/sec/gss/Makefile.in create mode 100644 lustre/sec/gss/Makefile.mk create mode 100644 
lustre/sec/gss/autoMakefile.am create mode 100644 lustre/sec/gss/gss_api.h create mode 100644 lustre/sec/gss/gss_asn1.h create mode 100644 lustre/sec/gss/gss_err.h create mode 100644 lustre/sec/gss/gss_generic_token.c create mode 100644 lustre/sec/gss/gss_internal.h create mode 100644 lustre/sec/gss/gss_krb5.h create mode 100644 lustre/sec/gss/gss_krb5_crypto.c create mode 100644 lustre/sec/gss/gss_krb5_mech.c create mode 100644 lustre/sec/gss/gss_krb5_seal.c create mode 100644 lustre/sec/gss/gss_krb5_seqnum.c create mode 100644 lustre/sec/gss/gss_krb5_unseal.c create mode 100644 lustre/sec/gss/gss_krb5_wrap.c create mode 100644 lustre/sec/gss/gss_mech_switch.c create mode 100644 lustre/sec/gss/rawobj.c create mode 100644 lustre/sec/gss/sec_gss.c create mode 100644 lustre/sec/gss/svcsec_gss.c create mode 100644 lustre/sec/sec.c create mode 100644 lustre/sec/sec_null.c create mode 100644 lustre/sec/svcsec.c create mode 100644 lustre/sec/svcsec_null.c create mode 100644 lustre/sec/upcall_cache.c create mode 100644 lustre/tests/acl_asroot.test create mode 100644 lustre/tests/acl_fileutil.test create mode 100644 lustre/tests/acl_misc.test create mode 100755 lustre/tests/acl_mode create mode 100644 lustre/tests/acl_perm.test create mode 100755 lustre/tests/gns-upcall.sh create mode 100755 lustre/tests/krb5_env.sh create mode 100755 lustre/tests/krb5_refresh_cache.sh create mode 100755 lustre/tests/runacltest create mode 100644 lustre/tests/sanity-gns.sh create mode 100644 lustre/tests/setfacl.test rename lustre/utils/{l_getgroups.c => lsd_upcall.c} (84%) diff --git a/ldiskfs/kernel_patches/patches/ext3-wantedi-2.6-suse.patch b/ldiskfs/kernel_patches/patches/ext3-wantedi-2.6-suse.patch index a4867a5..4fd69a5 100644 --- a/ldiskfs/kernel_patches/patches/ext3-wantedi-2.6-suse.patch +++ b/ldiskfs/kernel_patches/patches/ext3-wantedi-2.6-suse.patch @@ -5,10 +5,10 @@ include/linux/ext3_fs.h | 5 ++++- 5 files changed, 85 insertions(+), 6 deletions(-) -Index: 
uml-2.6.3/fs/ext3/ialloc.c +Index: linux-2.6.7/fs/ext3/ialloc.c =================================================================== ---- uml-2.6.3.orig/fs/ext3/ialloc.c 2004-02-20 15:00:48.000000000 +0800 -+++ uml-2.6.3/fs/ext3/ialloc.c 2004-02-21 00:24:45.202693776 +0800 +--- linux-2.6.7.orig/fs/ext3/ialloc.c 2005-03-24 00:27:43.282608616 +0800 ++++ linux-2.6.7/fs/ext3/ialloc.c 2005-03-24 00:27:43.888516504 +0800 @@ -420,7 +420,8 @@ * For other inodes, search forward from the parent directory's block * group to find a free inode. @@ -58,11 +58,19 @@ Index: uml-2.6.3/fs/ext3/ialloc.c if (S_ISDIR(mode)) { if (test_opt (sb, OLDALLOC)) group = find_group_dir(sb, dir); -Index: uml-2.6.3/fs/ext3/ioctl.c +Index: linux-2.6.7/fs/ext3/ioctl.c =================================================================== ---- uml-2.6.3.orig/fs/ext3/ioctl.c 2004-01-09 14:59:26.000000000 +0800 -+++ uml-2.6.3/fs/ext3/ioctl.c 2004-02-21 00:21:04.541239416 +0800 -@@ -24,6 +24,31 @@ +--- linux-2.6.7.orig/fs/ext3/ioctl.c 2004-06-16 13:19:13.000000000 +0800 ++++ linux-2.6.7/fs/ext3/ioctl.c 2005-03-24 00:31:16.113253440 +0800 +@@ -9,6 +9,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -24,6 +25,31 @@ ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg); switch (cmd) { @@ -93,12 +101,12 @@ Index: uml-2.6.3/fs/ext3/ioctl.c + } case EXT3_IOC_GETFLAGS: flags = ei->i_flags & EXT3_FL_USER_VISIBLE; - return put_user(flags, (int *) arg); -Index: uml-2.6.3/fs/ext3/namei.c + return put_user(flags, (int __user *) arg); +Index: linux-2.6.7/fs/ext3/namei.c =================================================================== ---- uml-2.6.3.orig/fs/ext3/namei.c 2004-02-20 15:01:27.000000000 +0800 -+++ uml-2.6.3/fs/ext3/namei.c 2004-02-21 00:21:04.611228776 +0800 -@@ -1617,6 +1617,19 @@ +--- linux-2.6.7.orig/fs/ext3/namei.c 2005-03-24 00:27:43.536570008 +0800 ++++ linux-2.6.7/fs/ext3/namei.c 2005-03-24 00:27:43.893515744 +0800 +@@ -1939,6 +1939,19 @@ return err; } @@ -118,7 +126,7 @@ 
Index: uml-2.6.3/fs/ext3/namei.c /* * By the time this is called, we already have created * the directory cache entry for the new file, but it -@@ -1640,7 +1653,7 @@ +@@ -1963,7 +1976,7 @@ if (IS_DIRSYNC(dir)) handle->h_sync = 1; @@ -127,7 +135,7 @@ Index: uml-2.6.3/fs/ext3/namei.c err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext3_file_inode_operations; -@@ -1670,7 +1683,7 @@ +@@ -1994,7 +2007,7 @@ if (IS_DIRSYNC(dir)) handle->h_sync = 1; @@ -136,7 +144,7 @@ Index: uml-2.6.3/fs/ext3/namei.c err = PTR_ERR(inode); if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); -@@ -1702,7 +1715,7 @@ +@@ -2027,7 +2040,7 @@ if (IS_DIRSYNC(dir)) handle->h_sync = 1; @@ -145,7 +153,7 @@ Index: uml-2.6.3/fs/ext3/namei.c err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; -@@ -2094,7 +2107,7 @@ +@@ -2439,7 +2452,7 @@ if (IS_DIRSYNC(dir)) handle->h_sync = 1; @@ -154,10 +162,10 @@ Index: uml-2.6.3/fs/ext3/namei.c err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; -Index: uml-2.6.3/include/linux/ext3_fs.h +Index: linux-2.6.7/include/linux/ext3_fs.h =================================================================== ---- uml-2.6.3.orig/include/linux/ext3_fs.h 2004-01-09 14:59:44.000000000 +0800 -+++ uml-2.6.3/include/linux/ext3_fs.h 2004-02-21 00:21:04.613228472 +0800 +--- linux-2.6.7.orig/include/linux/ext3_fs.h 2005-03-24 00:27:43.542569096 +0800 ++++ linux-2.6.7/include/linux/ext3_fs.h 2005-03-24 00:27:43.893515744 +0800 @@ -203,6 +203,7 @@ #define EXT3_IOC_SETFLAGS _IOW('f', 2, long) #define EXT3_IOC_GETVERSION _IOR('f', 3, long) @@ -166,7 +174,7 @@ Index: uml-2.6.3/include/linux/ext3_fs.h #define EXT3_IOC_GETVERSION_OLD _IOR('v', 1, long) #define EXT3_IOC_SETVERSION_OLD _IOW('v', 2, long) #ifdef CONFIG_JBD_DEBUG -@@ -707,7 +708,8 @@ +@@ -708,7 +709,8 @@ dx_hash_info *hinfo); /* ialloc.c */ @@ -176,7 +184,7 @@ Index: uml-2.6.3/include/linux/ext3_fs.h extern void ext3_free_inode (handle_t *, struct inode *); extern struct inode * ext3_orphan_get 
(struct super_block *, unsigned long); extern unsigned long ext3_count_free_inodes (struct super_block *); -@@ -792,4 +794,5 @@ +@@ -793,4 +795,5 @@ #endif /* __KERNEL__ */ diff --git a/lustre/Makefile.in b/lustre/Makefile.in index 1907eb1..1a5db43 100644 --- a/lustre/Makefile.in +++ b/lustre/Makefile.in @@ -2,6 +2,7 @@ subdir-m += lvfs subdir-m += obdclass +subdir-m += sec subdir-m += lov subdir-m += lmv subdir-m += ptlrpc diff --git a/lustre/autoMakefile.am b/lustre/autoMakefile.am index a8197e1..24f80d0 100644 --- a/lustre/autoMakefile.am +++ b/lustre/autoMakefile.am @@ -5,7 +5,7 @@ AUTOMAKE_OPTIONS = foreign -SUBDIRS = include ldiskfs lvfs obdclass lov ldlm ptlrpc \ +SUBDIRS = include ldiskfs lvfs obdclass lov ldlm sec ptlrpc \ obdecho osc mdc lmv mds obdfilter ost llite cobd ptlbd smfs snapfs \ cmobd liblustre doc utils tests conf scripts autoconf diff --git a/lustre/autoconf/lustre-core.m4 b/lustre/autoconf/lustre-core.m4 index c19132d..dae4f44 100644 --- a/lustre/autoconf/lustre-core.m4 +++ b/lustre/autoconf/lustre-core.m4 @@ -317,6 +317,23 @@ AC_DEFINE_UNQUOTED(OBD_MAX_IOCTL_BUFFER, $OBD_BUFFER_SIZE, [IOCTL Buffer Size]) ]) # +# LC_CONFIG_GSS +# +# whether build-in gss/krb5 capability +# +AC_DEFUN([LC_CONFIG_GSS], +[AC_MSG_CHECKING([whether to enable gss/krb5 support]) +AC_ARG_ENABLE([gss], + AC_HELP_STRING([--enable-gss], + [enable gss/krb5 support]), + [],[enable_gss='yes']) +AC_MSG_RESULT([$enable_gss]) +if test x$enable_gss != xno ; then + AC_DEFINE(ENABLE_GSS, 1, Support GSS/krb5) +fi +]) + +# # LC_CONFIG_SNAPFS # # Whether snapfs is desired @@ -353,6 +370,7 @@ AC_MSG_RESULT([$enable_smfs]) AC_DEFUN([LC_PROG_LINUX], [LC_CONFIG_BACKINGFS LC_CONFIG_PINGER +LC_CONFIG_GSS LC_CONFIG_SNAPFS LC_CONFIG_SMFS @@ -423,6 +441,7 @@ AM_CONDITIONAL(USE_QUILT, test x$QUILT != xno) AM_CONDITIONAL(MPITESTS, test x$enable_mpitests = xyes, Build MPI Tests) AM_CONDITIONAL(SNAPFS, test x$enable_snapfs = xyes) AM_CONDITIONAL(SMFS, test x$enable_smfs = xyes) 
+AM_CONDITIONAL(GSS, test x$enable_gss = xyes) AM_CONDITIONAL(LIBLUSTRE, test x$enable_liblustre = xyes) AM_CONDITIONAL(MPITESTS, test x$enable_mpitests = xyes, Build MPI Tests) ]) @@ -450,7 +469,6 @@ lustre/ldiskfs/Makefile lustre/ldiskfs/autoMakefile lustre/ldlm/Makefile lustre/liblustre/Makefile -lustre/liblustre/tests/Makefile lustre/llite/Makefile lustre/llite/autoMakefile lustre/lmv/Makefile @@ -479,6 +497,10 @@ lustre/ptlrpc/Makefile lustre/ptlrpc/autoMakefile lustre/scripts/Makefile lustre/scripts/version_tag.pl +lustre/sec/Makefile +lustre/sec/autoMakefile +lustre/sec/gss/Makefile +lustre/sec/gss/autoMakefile lustre/smfs/Makefile lustre/smfs/autoMakefile lustre/snapfs/Makefile diff --git a/lustre/cobd/cache_obd.c b/lustre/cobd/cache_obd.c index dd446bd..8a28304 100644 --- a/lustre/cobd/cache_obd.c +++ b/lustre/cobd/cache_obd.c @@ -351,7 +351,7 @@ static int cobd_precleanup(struct obd_device *obd, int flags) } static int cobd_getattr(struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *lsm) + struct lov_stripe_md *ea) { struct obd_device *obd = class_exp2obd(exp); struct obd_export *cobd_exp; @@ -362,7 +362,7 @@ static int cobd_getattr(struct obd_export *exp, struct obdo *oa, return -EINVAL; } cobd_exp = cobd_get_exp(obd); - return obd_getattr(cobd_exp, oa, lsm); + return obd_getattr(cobd_exp, oa, ea); } static int cobd_getattr_async(struct obd_export *exp, @@ -870,8 +870,8 @@ static int cobd_import_event(struct obd_device *obd, } static int cobd_md_getattr(struct obd_export *exp, struct lustre_id *id, - __u64 valid, unsigned int ea_size, - struct ptlrpc_request **request) + __u64 valid, const char *ea_name, int ea_namelen, + unsigned int ea_size, struct ptlrpc_request **request) { struct obd_device *obd = class_exp2obd(exp); struct obd_export *cobd_exp; @@ -882,7 +882,7 @@ static int cobd_md_getattr(struct obd_export *exp, struct lustre_id *id, return -EINVAL; } cobd_exp = cobd_get_exp(obd); - return md_getattr(cobd_exp, id, valid, ea_size, 
request); + return md_getattr(cobd_exp, id, valid, NULL, 0, ea_size, request); } static int cobd_md_req2lustre_md (struct obd_export *mdc_exp, diff --git a/lustre/include/liblustre.h b/lustre/include/liblustre.h index 8f925e6..c99e6a5 100644 --- a/lustre/include/liblustre.h +++ b/lustre/include/liblustre.h @@ -197,16 +197,17 @@ struct module { int count; }; -static inline void MODULE_AUTHOR(char *name) -{ - printf("%s\n", name); -} -#define MODULE_DESCRIPTION(name) MODULE_AUTHOR(name) -#define MODULE_LICENSE(name) MODULE_AUTHOR(name) +#define MODULE_AUTHOR(name) +#define MODULE_DESCRIPTION(name) +#define MODULE_LICENSE(name) + +#define module_init(init) +#define module_exit(exit) #define THIS_MODULE NULL #define __init #define __exit +#define __user /* devices */ @@ -275,6 +276,14 @@ static inline void spin_unlock_bh(spinlock_t *l) {} static inline void spin_lock_irqsave(spinlock_t *a, unsigned long b) {} static inline void spin_unlock_irqrestore(spinlock_t *a, unsigned long b) {} +typedef struct { } rwlock_t; +#define rwlock_init(x) do {} while(0) +#define RW_LOCK_UNLOCKED (rwlock_t) {} +#define read_lock(l) +#define read_unlock(l) +#define write_lock(l) +#define write_unlock(l) + #define min(x,y) ((x)<(y) ? (x) : (y)) #define max(x,y) ((x)>(y) ? (x) : (y)) @@ -287,6 +296,10 @@ static inline void spin_unlock_irqrestore(spinlock_t *a, unsigned long b) {} ({ type __x = (x); type __y = (y); __x > __y ? 
__x: __y; }) #endif +#define container_of(ptr, type, member) ({ \ + const typeof( ((type *)0)->member ) *__mptr = (ptr); \ + (type *)( (char *)__mptr - offsetof(type,member) );}) + /* registering symbols */ #define ERESTARTSYS ERESTART @@ -313,6 +326,12 @@ static inline int copy_to_user(void *a,void *b, int c) return 0; } +static inline long strncpy_from_user(char *dest, const char *src, long n) +{ + char *s; + s = strncpy(dest, src, n); + return strnlen(s, n); +} /* slabs */ typedef struct { @@ -427,7 +446,7 @@ static inline struct page* __grab_cache_page(unsigned long index) #define ATTR_ATTR_FLAG 0x0400 #define ATTR_RAW 0x0800 /* file system, not vfs will massage attrs */ #define ATTR_FROM_OPEN 0x1000 /* called from open path, ie O_TRUNC */ -#define ATTR_CTIME_SET 0x2000 +/* ATTR_CTIME_SET has been defined in lustre_idl.h */ struct iattr { unsigned int ia_valid; @@ -457,25 +476,28 @@ struct iattr { #define INTENT_MAGIC 0x19620323 -struct lustre_intent_data { - int it_disposition; - int it_status; - __u64 it_lock_handle; - void *it_data; - int it_lock_mode; - int it_int_flags; -}; struct lookup_intent { int it_magic; void (*it_op_release)(struct lookup_intent *); int it_op; int it_flags; int it_create_mode; - union { - struct lustre_intent_data lustre; - } d; + union { + void *fs_data; /* FS-specific intent data */ + } d; }; +struct lustre_intent_data { + int it_disposition; + int it_status; + __u64 it_lock_handle; + void *it_data; + int it_lock_mode; + int it_int_flags; +}; + +#define LUSTRE_IT(it) ((struct lustre_intent_data *)((it)->d.fs_data)) + static inline void intent_init(struct lookup_intent *it, int op, int flags) { memset(it, 0, sizeof(*it)); @@ -543,6 +565,8 @@ struct task_struct { struct signal pending; char comm[32]; int pid; + uid_t uid; + gid_t gid; int fsuid; int fsgid; int max_groups; @@ -625,6 +649,14 @@ static inline int schedule_timeout(signed long t) #define time_after(a, b) ((long)(b) - (long)(a) < 0) #define time_before(a, b) 
time_after(b,a) +static inline unsigned long get_seconds(void) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + return (tv.tv_sec + tv.tv_usec / 1000000); +} + struct timer_list { struct list_head tl_list; void (*function)(unsigned long unused); diff --git a/lustre/include/linux/Makefile.am b/lustre/include/linux/Makefile.am index d187775..fc1017df 100644 --- a/lustre/include/linux/Makefile.am +++ b/lustre/include/linux/Makefile.am @@ -15,4 +15,5 @@ EXTRA_DIST = lprocfs_status.h lustre_debug.h lustre_ha.h lustre_lib.h \ lustre_export.h lustre_log.h obd_echo.h obd_ptlbd.h obd_trace.h \ lustre_compat25.h lustre_fsfilt.h lustre_import.h lustre_mds.h obd.h \ lvfs.h lvfs_linux.h lustre_cfg.h lustre_lite.h lustre_idl.h lustre_smfs.h \ - lustre_cmobd.h obd_lmv.h lustre_snap.h + lustre_cmobd.h obd_lmv.h lustre_snap.h lustre_sec.h lustre_ucache.h \ + lustre_acl.h diff --git a/lustre/include/linux/lustre_acl.h b/lustre/include/linux/lustre_acl.h new file mode 100644 index 0000000..2267997 --- /dev/null +++ b/lustre/include/linux/lustre_acl.h @@ -0,0 +1,36 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002, 2003 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#ifndef _LUSTRE_ACL_H_ +#define _LUSTRE_ACL_H_ + +#include + +/* +* the value of LL_ACL_MAX_ENTRIES and LL_ACL_NOT_CACHED should be +* kept step with related definition in ext3 (EXT3_ACL_MAX_ENTRIES and +* EXT3_ACL_NOT_CACHED) +*/ +#define LL_ACL_MAX_ENTRIES 32 // EXT3_ACL_MAX_ENTRIES +#define LL_ACL_NOT_CACHED ((void *)-1) //EXT3_ACL_NOT_CACHED + +#endif diff --git a/lustre/include/linux/lustre_cfg.h b/lustre/include/linux/lustre_cfg.h index fe446e5..3f2038f 100644 --- a/lustre/include/linux/lustre_cfg.h +++ b/lustre/include/linux/lustre_cfg.h @@ -40,6 +40,7 @@ enum lcfg_command_type { LCFG_LOV_DEL_OBD = 0x00cf00c, LCFG_ADD_CONN = 0x00cf00d, LCFG_DEL_CONN = 0x00cf00e, + LCFG_SET_SECURITY = 0x00cf00f, }; struct lustre_cfg { @@ -279,6 +280,9 @@ struct lustre_mount_data { uint32_t lmd_nal; uint32_t lmd_server_ipaddr; uint32_t lmd_port; + uint32_t lmd_nllu; + uint32_t lmd_nllg; + char lmd_security[16]; char lmd_mds[64]; char lmd_profile[64]; }; diff --git a/lustre/include/linux/lustre_compat25.h b/lustre/include/linux/lustre_compat25.h index 711f282..03a88a4 100644 --- a/lustre/include/linux/lustre_compat25.h +++ b/lustre/include/linux/lustre_compat25.h @@ -99,6 +99,16 @@ static inline int cleanup_group_info(void) #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0) +/* New (actually old) intent naming */ +#define lookup_intent open_intent + +/* And internals */ +#define it_flags flags +#define it_op op +#define it_magic magic +#define it_op_release op_release +#define it_create_mode create_mode + /* * OBD need working random driver, thus all our * initialization routines must be called after device diff --git a/lustre/include/linux/lustre_export.h b/lustre/include/linux/lustre_export.h index 525110d..2e4e760 100644 --- a/lustre/include/linux/lustre_export.h +++ b/lustre/include/linux/lustre_export.h @@ -14,6 +14,7 @@ #include struct mds_client_data; +struct mds_idmap_table; struct mds_export_data { struct list_head med_open_head; @@ -21,6 +22,12 @@ struct 
mds_export_data { struct mds_client_data *med_mcd; loff_t med_off; int med_idx; + unsigned int med_local:1; + __u32 med_nllu; + __u32 med_nllg; + /* simple idmapping */ + spinlock_t med_idmap_lock; + struct mds_idmap_table *med_idmap; }; struct osc_creator { diff --git a/lustre/include/linux/lustre_idl.h b/lustre/include/linux/lustre_idl.h index 198f89c..184572f 100644 --- a/lustre/include/linux/lustre_idl.h +++ b/lustre/include/linux/lustre_idl.h @@ -361,6 +361,7 @@ struct lov_mds_md_v0 { /* LOV EA mds/wire data (little-endian) */ #define OBD_MD_FLUID (0x0000000000000200LL) /* user ID */ #define OBD_MD_FLGID (0x0000000000000400LL) /* group ID */ #define OBD_MD_FLFLAGS (0x0000000000000800LL) /* flags word */ +#define OBD_MD_FLEA (0x0000000000001000LL) /* extended attributes */ #define OBD_MD_FLNLINK (0x0000000000002000LL) /* link count */ #define OBD_MD_FLGENER (0x0000000000004000LL) /* generation number */ #define OBD_MD_FLINLINE (0x0000000000008000LL) /* inline data */ @@ -380,12 +381,15 @@ struct lov_mds_md_v0 { /* LOV EA mds/wire data (little-endian) */ #define OBD_MD_FLDIREA (0x0000000020000000LL) /* dir's extended attribute data */ #define OBD_MD_REINT (0x0000000040000000LL) /* reintegrate oa */ #define OBD_MD_FID (0x0000000080000000LL) /* lustre_id data */ +#define OBD_MD_FLEALIST (0x0000000100000000LL) /* list extended attributes */ +#define OBD_MD_FLACL_ACCESS (0x0000000200000000LL) /*access acl*/ #define OBD_MD_FLNOTOBD (~(OBD_MD_FLBLOCKS | OBD_MD_LINKNAME | \ OBD_MD_FLEASIZE | OBD_MD_FLHANDLE | \ OBD_MD_FLCKSUM | OBD_MD_FLQOS | \ OBD_MD_FLOSCOPQ | OBD_MD_FLCOOKIE | \ - OBD_MD_MDS)) + OBD_MD_FLEA | OBD_MD_FLEALIST | \ + OBD_MD_FLACL_ACCESS | OBD_MD_MDS)) static inline struct lustre_handle *obdo_handle(struct obdo *oa) { @@ -487,10 +491,6 @@ extern void lustre_swab_ost_lvb(struct ost_lvb *); /* * security descriptor in mds request - * - * note gid & cap might need be removed later: - * - cap should be obtained on mds - * - gid is actually not used. 
*/ struct mds_req_sec_desc { __u32 rsd_uid; @@ -635,6 +635,7 @@ struct lustre_md { struct mds_body *body; struct lov_stripe_md *lsm; struct mea *mea; + struct posix_acl *acl_access; }; struct mdc_op_data { @@ -666,11 +667,21 @@ struct mds_rec_setattr { __u64 sa_ctime; }; -/* Remove this once we declare it in include/linux/fs.h (v21 kernel patch?) */ -#ifndef ATTR_CTIME_SET -#define ATTR_CTIME_SET 0x2000 +/* XXX Following ATTR_XXX should go to vfs patch... */ +#ifdef ATTR_CTIME_SET +#error "ATTR_CTIME_SET has been defined somewhere else" +#endif +#ifdef ATTR_EA +#error "ATTR_EA has been defined somewhere else" +#endif +#ifdef ATTR_EA_RM +#error "ATTR_EA_RM has been defined somewhere else" #endif +#define ATTR_CTIME_SET 0x00002000 +#define ATTR_EA 0x00040000 +#define ATTR_EA_RM 0x00080000 + extern void lustre_swab_mds_rec_setattr (struct mds_rec_setattr *sa); #ifndef FMODE_READ @@ -1116,4 +1127,13 @@ static inline struct lustre_id *obdo_id(struct obdo *oa) return (struct lustre_id *)raw_id; } +/* security negotiate */ +typedef enum { + SEC_INIT = 600, + SEC_INIT_CONTINUE = 601, + SEC_FINI = 602, + SEC_LAST_OPC +} sec_cmd_t; +#define SEC_FIRST_OPC SEC_INIT + #endif diff --git a/lustre/include/linux/lustre_import.h b/lustre/include/linux/lustre_import.h index e7230d0..d3c182c 100644 --- a/lustre/include/linux/lustre_import.h +++ b/lustre/include/linux/lustre_import.h @@ -46,6 +46,8 @@ enum obd_import_event { IMP_EVENT_ACTIVE = 0x808004, }; +struct ptlrpc_sec; + struct obd_import_conn { struct list_head oic_item; struct ptlrpc_connection *oic_conn; @@ -53,7 +55,6 @@ struct obd_import_conn { unsigned long oic_last_attempt; /* in jiffies */ }; - struct obd_import { struct portals_handle imp_handle; atomic_t imp_refcount; @@ -70,7 +71,11 @@ struct obd_import { struct list_head imp_sending_list; struct list_head imp_delayed_list; + /* list of ongoing raw rpcs (only used by gss) */ + struct list_head imp_rawrpc_list; + struct obd_device *imp_obd; + struct ptlrpc_sec 
*imp_sec; wait_queue_head_t imp_recovery_waitq; __u64 imp_last_replay_transno; atomic_t imp_inflight; diff --git a/lustre/include/linux/lustre_lite.h b/lustre/include/linux/lustre_lite.h index bd8341b..866d429 100644 --- a/lustre/include/linux/lustre_lite.h +++ b/lustre/include/linux/lustre_lite.h @@ -36,6 +36,8 @@ #include #include #include +#include + /* careful, this is easy to screw up */ #define PAGE_CACHE_MAXBYTES ((__u64)(~0UL) << PAGE_CACHE_SHIFT) @@ -45,7 +47,7 @@ static inline struct lookup_intent *ll_nd2it(struct nameidata *nd) { #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) - return &nd->intent; + return &nd->intent.open; #else return nd->intent; #endif @@ -96,6 +98,7 @@ struct ll_inode_info { #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) struct inode lli_vfs_inode; #endif + struct posix_acl *lli_acl_access; }; // FIXME: replace the name of this with LL_I to conform to kernel stuff @@ -140,8 +143,19 @@ enum { LPROC_LL_DIRECT_READ, LPROC_LL_DIRECT_WRITE, - LPROC_LL_FILE_OPCODES + LPROC_LL_SETXATTR, + LPROC_LL_GETXATTR, + LPROC_LL_FILE_OPCODES, +}; + +struct lustre_intent_data { + int it_disposition; + int it_status; + __u64 it_lock_handle; + void *it_data; + int it_lock_mode; }; +#define LUSTRE_IT(it) ((struct lustre_intent_data *)((it)->d.fs_data)) static inline void ll_inode2id(struct lustre_id *id, struct inode *inode) diff --git a/lustre/include/linux/lustre_mds.h b/lustre/include/linux/lustre_mds.h index d918380..da6aafe 100644 --- a/lustre/include/linux/lustre_mds.h +++ b/lustre/include/linux/lustre_mds.h @@ -38,6 +38,7 @@ #include #include #include +#include struct ldlm_lock_desc; struct mds_obd; @@ -62,8 +63,10 @@ struct mds_update_record { char *ur_tgt; int ur_eadatalen; void *ur_eadata; - int ur_cookielen; - struct llog_cookie *ur_logcookies; + int ur_ea2datalen; + void *ur_ea2data; + int ur_cookielen; /* obsolete? */ + struct llog_cookie *ur_logcookies; /* obsolete? 
*/ struct iattr ur_iattr; struct lvfs_ucred ur_uc; __u64 ur_rdev; @@ -130,6 +133,19 @@ struct mds_client_data { __u8 mcd_padding[MDS_LR_CLIENT_SIZE - 64]; }; +/* simple uid/gid mapping hash table */ +struct mds_idmap_item { + struct list_head hash; + __u32 id1; + __u32 id2; +}; + +#define MDS_IDMAP_HASHSIZE (32) +struct mds_idmap_table { + struct list_head uidmap[MDS_IDMAP_HASHSIZE]; + struct list_head gidmap[MDS_IDMAP_HASHSIZE]; +}; + /* file data for open files on MDS */ struct mds_file_data { struct portals_handle mfd_handle; /* must be first */ @@ -166,6 +182,32 @@ struct mds_grp_hash { unsigned int gh_allow_setgroups:1; }; +/* lustre security descriptor */ +struct lustre_sec_desc { + uid_t lsd_uid; + gid_t lsd_gid; + struct group_info *lsd_ginfo; + unsigned int lsd_allow_setuid:1, + lsd_allow_setgid:1, + lsd_allow_setgrp:1; +}; + +struct lsd_cache_entry { + struct upcall_cache_entry base; + struct lustre_sec_desc lsd; +}; + +struct lsd_downcall_args { + int err; + uid_t uid; + gid_t gid; + __u32 ngroups; + gid_t *groups; + __u32 allow_setuid; + __u32 allow_setgid; + __u32 allow_setgrp; +}; + /* mds/mds_reint.c */ int mds_reint_rec(struct mds_update_record *r, int offset, struct ptlrpc_request *req, struct lustre_handle *); @@ -224,8 +266,8 @@ int mdc_req2lustre_md(struct obd_export *exp_lmv, struct ptlrpc_request *req, struct lustre_md *md); int mdc_getstatus(struct obd_export *exp, struct lustre_id *rootid); int mdc_getattr(struct obd_export *exp, struct lustre_id *id, - __u64 valid, unsigned int ea_size, - struct ptlrpc_request **request); + __u64 valid, const char *ea_name, int ea_namelen, + unsigned int ea_size, struct ptlrpc_request **request); int mdc_getattr_lock(struct obd_export *exp, struct lustre_id *id, char *filename, int namelen, __u64 valid, unsigned int ea_size, struct ptlrpc_request **request); diff --git a/lustre/include/linux/lustre_net.h b/lustre/include/linux/lustre_net.h index d938260..019e1de 100644 --- 
a/lustre/include/linux/lustre_net.h +++ b/lustre/include/linux/lustre_net.h @@ -241,6 +241,9 @@ struct ptlrpc_cb_id { void *cbid_arg; /* additional arg */ }; +struct ptlrpc_cred; +struct ptlrpc_svcsec; + #define RS_MAX_LOCKS 4 #define RS_DEBUG 1 @@ -259,7 +262,15 @@ struct ptlrpc_reply_state { unsigned int rs_handled:1; /* been handled yet? */ unsigned int rs_on_net:1; /* reply_out_callback pending? */ - int rs_size; + struct ptlrpc_svcsec *rs_svcsec; + char *rs_buf; /* backend buffer */ + int rs_buf_len; /* backend buffer length */ + char *rs_repbuf; /* will be sent on wire */ + int rs_repbuf_len; /* max on-wire data length */ + int rs_repdata_len; /* actual on-wire data length */ + struct lustre_msg *rs_msg; /* lustre msg pointer */ + int rs_msg_len; /* length of lustre msg */ + __u64 rs_transno; __u64 rs_xid; struct obd_export *rs_export; @@ -271,9 +282,6 @@ struct ptlrpc_reply_state { struct lustre_handle rs_locks[RS_MAX_LOCKS]; ldlm_mode_t rs_modes[RS_MAX_LOCKS]; struct llog_create_locks *rs_llog_locks; - - /* last member: variable sized reply message */ - struct lustre_msg rs_msg; }; struct ptlrpc_request { @@ -285,7 +293,8 @@ struct ptlrpc_request { unsigned int rq_intr:1, rq_replied:1, rq_err:1, rq_timedout:1, rq_resend:1, rq_restart:1, rq_replay:1, rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1, - rq_no_delay:1, rq_net_err:1; + rq_no_delay:1, rq_net_err:1, rq_req_wrapped:1, + rq_ptlrpcs_restart:1; int rq_phase; /* client-side refcount for SENT race */ atomic_t rq_refcount; @@ -306,6 +315,20 @@ struct ptlrpc_request { __u64 rq_xid; struct list_head rq_replay_list; + struct ptlrpc_cred *rq_cred; /* client side credit */ + struct ptlrpc_svcsec *rq_svcsec; /* server side security */ + /* XXX temporarily put here XXX */ + void *rq_sec_svcdata; /* server security data */ + unsigned int rq_remote; /* from remote client */ + uid_t rq_auth_uid; + + char *rq_reqbuf; /* backend request buffer */ + int rq_reqbuf_len; /* backend request buffer length */ + int 
rq_reqdata_len; /* actual request data length */ + char *rq_repbuf; /* backend reply buffer */ + int rq_repbuf_len; /* backend reply buffer length */ + int rq_repdata_len; /* actual reply data length, not used yet */ + #if SWAB_PARANOIA __u32 rq_req_swab_mask; __u32 rq_rep_swab_mask; @@ -574,6 +597,8 @@ int ptlrpc_error(struct ptlrpc_request *req); void ptlrpc_resend_req(struct ptlrpc_request *request); int ptl_send_rpc(struct ptlrpc_request *request); int ptlrpc_register_rqbd (struct ptlrpc_request_buffer_desc *rqbd); +int ptlrpc_do_rawrpc(struct obd_import *imp, char *reqbuf, int reqlen, + char *repbuf, int *replenp, int timeout); /* ptlrpc/client.c */ void ptlrpc_init_client(int req_portal, int rep_portal, char *name, diff --git a/lustre/include/linux/lustre_sec.h b/lustre/include/linux/lustre_sec.h new file mode 100644 index 0000000..e1e866c --- /dev/null +++ b/lustre/include/linux/lustre_sec.h @@ -0,0 +1,360 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#ifndef __LINUX_SEC_H_ +#define __LINUX_SEC_H_ + +/* forward declaration */ +struct obd_import; +struct ptlrpc_request; +struct ptlrpc_cred; +struct ptlrpc_credops; +struct ptlrpc_sec; +struct ptlrpc_secops; + +#define PTLRPC_SEC_MAX_FLAVORS (4) + +typedef struct ptlrpcs_flavor_s { + __u32 flavor; + __u32 subflavor; +} ptlrpcs_flavor_t; + +enum ptlrpcs_security_type { + PTLRPC_SEC_TYPE_NONE = 0, /* no security */ + PTLRPC_SEC_TYPE_AUTH = 1, /* authentication */ + PTLRPC_SEC_TYPE_PRIV = 2, /* privacy */ +}; + +/* + * This header is prepended at any on-wire ptlrpc packets + */ +struct ptlrpcs_wire_hdr { + __u32 flavor; + __u32 sectype; + __u32 msg_len; + __u32 sec_len; +}; + +static inline +struct ptlrpcs_wire_hdr *buf_to_sec_hdr(void *buf) +{ + return (struct ptlrpcs_wire_hdr *) buf; +} + +static inline +struct lustre_msg *buf_to_lustre_msg(void *buf) +{ + return (struct lustre_msg *) + ((char *) buf + sizeof(struct ptlrpcs_wire_hdr)); +} + +static inline +__u8 *buf_to_sec_data(void *buf) +{ + struct ptlrpcs_wire_hdr *hdr = buf_to_sec_hdr(buf); + return (__u8 *) (buf + sizeof(*hdr) + hdr->msg_len); +} + +enum ptlrpcs_flavors { + PTLRPC_SEC_NULL = 0, + PTLRPC_SEC_GSS = 1, +}; + +#define PTLRPC_SEC_GSS_VERSION (1) + +enum ptlrpcs_gss_subflavors { + PTLRPC_SEC_GSS_KRB5 = 0, + PTLRPC_SEC_GSS_KRB5I = 1, + PTLRPC_SEC_GSS_KRB5P = 2, +}; + +enum ptlrpcs_gss_proc { + PTLRPC_GSS_PROC_DATA = 0, + PTLRPC_GSS_PROC_INIT = 1, + PTLRPC_GSS_PROC_CONTINUE_INIT = 2, + PTLRPC_GSS_PROC_DESTROY = 3, + PTLRPC_GSS_PROC_ERR = 4, +}; + +enum ptlrpcs_gss_svc { + PTLRPC_GSS_SVC_NONE = 1, + PTLRPC_GSS_SVC_INTEGRITY = 2, + PTLRPC_GSS_SVC_PRIVACY = 3, +}; + +enum ptlrpcs_error { + PTLRPCS_OK = 0, + PTLRPCS_BADCRED = 1, + PTLRPCS_REJECTEDCRED = 2, + PTLRPCS_BADVERF = 3, + PTLRPCS_REJECTEDVERF = 4, + PTLRPCS_TOOWEAK = 5, + /* GSS errors */ + PTLRPCS_GSS_CREDPROBLEM = 13, + PTLRPCS_GSS_CTXPROBLEM = 14, +}; + +struct vfs_cred { + __u64 vc_pag; + uid_t vc_uid; + gid_t vc_gid; + struct 
group_info *vc_ginfo; +}; + +struct ptlrpc_credops { + int (*refresh)(struct ptlrpc_cred *cred); + int (*match) (struct ptlrpc_cred *cred, + struct ptlrpc_request *req, + struct vfs_cred *vcred); + int (*sign) (struct ptlrpc_cred *cred, struct ptlrpc_request *req); + int (*verify) (struct ptlrpc_cred *cred, struct ptlrpc_request *req); + int (*seal) (struct ptlrpc_cred *cred, struct ptlrpc_request *req); + int (*unseal) (struct ptlrpc_cred *cred, struct ptlrpc_request *req); + void (*destroy)(struct ptlrpc_cred *cred); +}; + +#define PTLRPC_CRED_UPTODATE 0x00000001 +#define PTLRPC_CRED_DEAD 0x00000002 + +struct ptlrpc_cred { + struct list_head pc_hash; /* linked into hash table */ + atomic_t pc_refcount; + struct ptlrpc_sec *pc_sec; + struct ptlrpc_credops *pc_ops; + struct ptlrpc_request *pc_req; + unsigned long pc_expire; + int pc_flags; + /* XXX maybe should not be here */ + __u64 pc_pag; + uid_t pc_uid; +}; + +struct ptlrpc_secops { + struct ptlrpc_sec * (*create_sec) (ptlrpcs_flavor_t *flavor, + const char *pipe_dir, + void *pipe_data); + void (*destroy_sec) (struct ptlrpc_sec *sec); + struct ptlrpc_cred * (*create_cred) (struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + struct vfs_cred *vcred); + /* buffer manipulation */ + int (*alloc_reqbuf) (struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int lustre_msg_size); + int (*alloc_repbuf) (struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int lustre_msg_size); + void (*free_reqbuf) (struct ptlrpc_sec *sec, + struct ptlrpc_request *req); + void (*free_repbuf) (struct ptlrpc_sec *sec, + struct ptlrpc_request *req); + /* security payload size estimation */ + int (*est_req_payload)(struct ptlrpc_sec *sec, + int msgsize); + int (*est_rep_payload)(struct ptlrpc_sec *sec, + int msgsize); +}; + +struct ptlrpc_sec_type { + struct module *pst_owner; + char *pst_name; + atomic_t pst_inst; /* instance, debug only */ + ptlrpcs_flavor_t pst_flavor; + struct ptlrpc_secops *pst_ops; +}; + +#define 
PTLRPC_CREDCACHE_NR 8 +#define PTLRPC_CREDCACHE_MASK (PTLRPC_CREDCACHE_NR - 1) + +struct ptlrpc_sec { + struct ptlrpc_sec_type *ps_type; + struct list_head ps_credcache[PTLRPC_CREDCACHE_NR]; + spinlock_t ps_lock; /* protect cred cache */ + __u32 ps_sectype; + ptlrpcs_flavor_t ps_flavor; + atomic_t ps_refcount; + atomic_t ps_credcount; + struct obd_import *ps_import; + /* actual security model need initialize following fields */ + unsigned long ps_expire; /* cache expire interval */ + unsigned long ps_nextgc; /* next gc time */ + unsigned int ps_flags; +}; + +/* sec.c */ +int ptlrpcs_register(struct ptlrpc_sec_type *type); +int ptlrpcs_unregister(struct ptlrpc_sec_type *type); + +struct ptlrpc_sec * ptlrpcs_sec_create(ptlrpcs_flavor_t *flavor, + struct obd_import *import, + const char *pipe_dir, + void *pipe_data); +void ptlrpcs_sec_put(struct ptlrpc_sec *sec); +void ptlrpcs_sec_invalidate_cache(struct ptlrpc_sec *sec); + +struct ptlrpc_cred * ptlrpcs_cred_lookup(struct ptlrpc_sec *sec, + struct vfs_cred *vcred); +void ptlrpcs_cred_put(struct ptlrpc_cred *cred, int sync); + +static inline void ptlrpcs_cred_get(struct ptlrpc_cred *cred) +{ + LASSERT(atomic_read(&cred->pc_refcount)); + atomic_inc(&cred->pc_refcount); +} + +static inline int ptlrpcs_cred_is_uptodate(struct ptlrpc_cred *cred) +{ + LASSERT(cred); + LASSERT(atomic_read(&cred->pc_refcount)); + return (cred->pc_flags & PTLRPC_CRED_UPTODATE); +} +static inline int ptlrpcs_cred_refresh(struct ptlrpc_cred *cred) +{ + LASSERT(cred); + LASSERT(atomic_read(&cred->pc_refcount)); + LASSERT(cred->pc_ops); + LASSERT(cred->pc_ops->refresh); + return cred->pc_ops->refresh(cred); +} +static inline void ptlrpcs_cred_die(struct ptlrpc_cred *cred) +{ + LASSERT(atomic_read(&cred->pc_refcount)); + LASSERT(cred->pc_sec); + if (!(cred->pc_flags & PTLRPC_CRED_DEAD)) { + spin_lock(&cred->pc_sec->ps_lock); + cred->pc_flags |= PTLRPC_CRED_DEAD; + cred->pc_flags &= ~PTLRPC_CRED_UPTODATE; + list_del_init(&cred->pc_hash); + 
spin_unlock(&cred->pc_sec->ps_lock); + } +} +static inline int ptlrpcs_cred_is_dead(struct ptlrpc_cred *cred) +{ + return(cred->pc_flags & PTLRPC_CRED_DEAD); +} + +static inline int ptlrpcs_est_req_payload(struct ptlrpc_sec *sec, + int datasize) +{ + struct ptlrpc_secops *ops; + + LASSERT(sec); + LASSERT(sec->ps_type); + LASSERT(sec->ps_type->pst_ops); + + ops = sec->ps_type->pst_ops; + if (ops->est_req_payload) + return ops->est_req_payload(sec, datasize); + else + return 0; +} + +static inline int ptlrpcs_est_rep_payload(struct ptlrpc_sec *sec, + int datasize) +{ + struct ptlrpc_secops *ops; + + LASSERT(sec); + LASSERT(sec->ps_type); + LASSERT(sec->ps_type->pst_ops); + + ops = sec->ps_type->pst_ops; + if (ops->est_rep_payload) + return ops->est_rep_payload(sec, datasize); + else + return 0; +} + +int ptlrpcs_cli_wrap_request(struct ptlrpc_request *req); +int ptlrpcs_cli_unwrap_reply(struct ptlrpc_request *req); +int ptlrpcs_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize); +int ptlrpcs_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize); +void ptlrpcs_cli_free_reqbuf(struct ptlrpc_request *req); +void ptlrpcs_cli_free_repbuf(struct ptlrpc_request *req); + +/* higher interface */ +int ptlrpcs_import_get_sec(struct obd_import *imp); +void ptlrpcs_import_drop_sec(struct obd_import *imp); +int ptlrpcs_req_get_cred(struct ptlrpc_request *req); +void ptlrpcs_req_drop_cred(struct ptlrpc_request *req); +int ptlrpcs_req_replace_dead_cred(struct ptlrpc_request *req); +int ptlrpcs_req_refresh_cred(struct ptlrpc_request *req); + +/* internal helpers */ +int sec_alloc_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req, + int msgsize, int secsize); +void sec_free_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req); + +/* sec_null.c */ +int ptlrpcs_null_init(void); +int ptlrpcs_null_exit(void); + +/********************************************************** + * Server side stuff + **********************************************************/ + +struct 
ptlrpc_reply_state; + +struct ptlrpc_svcsec { + struct module *pss_owner; + char *pss_name; + ptlrpcs_flavor_t pss_flavor; + int pss_sec_size; + + int (*accept) (struct ptlrpc_request *req, + enum ptlrpcs_error *res); + int (*authorize) (struct ptlrpc_request *req); + int (*alloc_repbuf)(struct ptlrpc_svcsec *svcsec, + struct ptlrpc_request *req, + int msgsize); + void (*free_repbuf) (struct ptlrpc_svcsec *svcsec, + struct ptlrpc_reply_state *rs); + void (*cleanup_req) (struct ptlrpc_svcsec *svcsec, + struct ptlrpc_request *req); +}; + +#define SVC_OK 1 +#define SVC_COMPLETE 2 +#define SVC_DROP 3 +#define SVC_LOGIN 4 +#define SVC_LOGOUT 5 + +int svcsec_register(struct ptlrpc_svcsec *ss); +int svcsec_unregister(struct ptlrpc_svcsec *ss); +int svcsec_accept(struct ptlrpc_request *req, enum ptlrpcs_error *res); +int svcsec_authorize(struct ptlrpc_request *req); +int svcsec_alloc_repbuf(struct ptlrpc_svcsec *svcsec, + struct ptlrpc_request *req, int msgsize); +void svcsec_cleanup_req(struct ptlrpc_request *req); + +struct ptlrpc_svcsec * svcsec_get(struct ptlrpc_svcsec *sec); +void svcsec_put(struct ptlrpc_svcsec *sec); + +/* internal helpers */ +int svcsec_alloc_reply_state(struct ptlrpc_request *req, + int msgsize, int secsize); +void svcsec_free_reply_state(struct ptlrpc_reply_state *rs); + +/* svcsec_null.c */ +int svcsec_null_init(void); +int svcsec_null_exit(void); + +#endif /* __LINUX_SEC_H_ */ diff --git a/lustre/include/linux/lustre_smfs.h b/lustre/include/linux/lustre_smfs.h index ee3e43a..7f83f04 100644 --- a/lustre/include/linux/lustre_smfs.h +++ b/lustre/include/linux/lustre_smfs.h @@ -26,6 +26,7 @@ #ifndef __LUSTRE_SMFS_H #define __LUSTRE_SMFS_H +#include struct snap_inode_info { int sn_flags; /*the flags indicated inode type */ int sn_gen; /*the inode generation*/ diff --git a/lustre/include/linux/lustre_ucache.h b/lustre/include/linux/lustre_ucache.h new file mode 100644 index 0000000..68e37db --- /dev/null +++ b/lustre/include/linux/lustre_ucache.h @@ 
-0,0 +1,79 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ + +#ifndef _UPCALL_CACHE_H +#define _UPCALL_CACHE_H + +#define UC_CACHE_NEW 0x01 +#define UC_CACHE_ACQUIRING 0x02 +#define UC_CACHE_INVALID 0x04 +#define UC_CACHE_EXPIRED 0x08 + +#define UC_CACHE_IS_NEW(i) ((i)->ue_flags & UC_CACHE_NEW) +#define UC_CACHE_IS_INVALID(i) ((i)->ue_flags & UC_CACHE_INVALID) +#define UC_CACHE_IS_ACQUIRING(i) ((i)->ue_flags & UC_CACHE_ACQUIRING) +#define UC_CACHE_IS_EXPIRED(i) ((i)->ue_flags & UC_CACHE_EXPIRED) +#define UC_CACHE_IS_VALID(i) ((i)->ue_flags == 0) + +#define UC_CACHE_SET_NEW(i) (i)->ue_flags |= UC_CACHE_NEW +#define UC_CACHE_SET_INVALID(i) (i)->ue_flags |= UC_CACHE_INVALID +#define UC_CACHE_SET_ACQUIRING(i) (i)->ue_flags |= UC_CACHE_ACQUIRING +#define UC_CACHE_SET_EXPIRED(i) (i)->ue_flags |= UC_CACHE_EXPIRED +#define UC_CACHE_SET_VALID(i) (i)->ue_flags = 0 + +#define UC_CACHE_CLEAR_NEW(i) (i)->ue_flags &= ~UC_CACHE_NEW +#define UC_CACHE_CLEAR_ACQUIRING(i) (i)->ue_flags &= ~UC_CACHE_ACQUIRING +#define UC_CACHE_CLEAR_INVALID(i) (i)->ue_flags &= ~UC_CACHE_INVALID +#define UC_CACHE_CLEAR_EXPIRED(i) (i)->ue_flags &= ~UC_CACHE_EXPIRED + +struct upcall_cache; + +struct upcall_cache_entry { + struct list_head ue_hash; + atomic_t ue_refcount; + __u64 ue_key; + struct upcall_cache *ue_cache; + int ue_flags; + wait_queue_head_t ue_waitq; + unsigned long ue_acquire_expire; + unsigned long ue_expire; +}; + +#define UC_CACHE_UPCALL_MAXPATH (1024) + +struct upcall_cache { + struct list_head *uc_hashtable; + int uc_hashsize; + rwlock_t uc_hashlock; + + char *uc_name; + char uc_upcall[UC_CACHE_UPCALL_MAXPATH]; + unsigned long uc_acquire_expire; + unsigned long uc_entry_expire; + + /* functions */ + unsigned int (*hash)(struct upcall_cache *, __u64); + struct upcall_cache_entry* (*alloc_entry)(struct upcall_cache *, __u64); + void (*free_entry)(struct upcall_cache *, + struct upcall_cache_entry *); + int 
(*make_upcall)(struct upcall_cache *, + struct upcall_cache_entry *); + int (*parse_downcall)(struct upcall_cache *, + struct upcall_cache_entry *, + void *args); +}; + +void upcall_cache_init_entry(struct upcall_cache *cache, + struct upcall_cache_entry *entry, + __u64 key); +struct upcall_cache_entry * +upcall_cache_get_entry(struct upcall_cache *cache, __u64 key); +void upcall_cache_put_entry(struct upcall_cache_entry *entry); +int upcall_cache_downcall(struct upcall_cache *cache, __u64 key, + int err, void *args); +void upcall_cache_flush_one(struct upcall_cache *cache, __u64 key); +void upcall_cache_flush_idle(struct upcall_cache *cache); +void upcall_cache_flush_all(struct upcall_cache *cache); + +#endif /* _UPCALL_CACHE_H */ diff --git a/lustre/include/linux/lvfs.h b/lustre/include/linux/lvfs.h index 5e3cbd0..96898fd 100644 --- a/lustre/include/linux/lvfs.h +++ b/lustre/include/linux/lvfs.h @@ -1,3 +1,6 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ #ifndef __LVFS_H__ #define __LVFS_H__ @@ -6,6 +9,8 @@ #define LL_ID_NAMELEN (16 + 1 + 8 + 1) #if defined __KERNEL__ +#include +#include #include #include #endif @@ -18,13 +23,13 @@ struct mds_grp_hash_entry; /* simple.c */ struct lvfs_ucred { - struct mds_grp_hash_entry *luc_ghash; - struct group_info *luc_ginfo; + struct lustre_sec_desc *luc_lsd; + struct group_info *luc_ginfo; __u32 luc_fsuid; __u32 luc_fsgid; __u32 luc_cap; __u32 luc_uid; - __u32 luc_umask; + __u32 luc_umask; }; struct lvfs_callback_ops { @@ -100,11 +105,11 @@ ll_lookup_one_len(const char *name, struct dentry *dparent, int namelen) { struct dentry *dchild; #ifdef S_PDIROPS - struct qstr qstr; - void *lock; - qstr.name = name; - qstr.len = namelen; - lock = lock_dir(dparent->d_inode, &qstr); + struct qstr qstr; + void *lock; + qstr.name = name; + qstr.len = namelen; + lock = lock_dir(dparent->d_inode, &qstr); #else down(&dparent->d_inode->i_sem); #endif @@ -112,7 +117,7 @@ 
ll_lookup_one_len(const char *name, struct dentry *dparent, int namelen) dchild = lookup_one_len(name, dparent, namelen); #ifdef S_PDIROPS - unlock_dir(dparent->d_inode, lock); + unlock_dir(dparent->d_inode, lock); #else up(&dparent->d_inode->i_sem); #endif @@ -125,6 +130,18 @@ static inline void ll_sleep(int t) schedule_timeout(t * HZ); set_current_state(TASK_RUNNING); } + +static inline struct dentry * +ll_d_lookup(const char *name, + struct dentry *dparent, int len) +{ + struct qstr qstr; + + qstr.len = len; + qstr.name = name; + qstr.hash = full_name_hash(name, len); + return d_lookup(dparent, &qstr); +} #endif static inline int ll_id2str(char *str, __u64 id, __u32 generation) diff --git a/lustre/include/linux/obd.h b/lustre/include/linux/obd.h index 29c77c7..a7f8b5f 100644 --- a/lustre/include/linux/obd.h +++ b/lustre/include/linux/obd.h @@ -20,7 +20,6 @@ #define IOC_MDC_LOOKUP _IOWR(IOC_MDC_TYPE, 20, struct obd_device *) /* Moved to lustre_user.h #define IOC_MDC_GETSTRIPE _IOWR(IOC_MDC_TYPE, 21, struct lov_mds_md *) */ -#define IOC_MDC_FINISH_GNS _IOWR(IOC_MDC_TYPE, 22, struct obd_device *) #define IOC_MDC_MAX_NR 50 #ifdef __KERNEL__ @@ -275,6 +274,12 @@ struct client_obd { int cl_max_mds_cookiesize; kdev_t cl_sandev; + /* security flavors */ + __u32 cl_sec_flavor; + __u32 cl_sec_subflavor; + __u32 cl_nllu; /* non lustre local user */ + __u32 cl_nllg; /* non lustre local group */ + //struct llog_canceld_ctxt *cl_llcd; /* it's included by obd_llog_ctxt */ void *cl_llcd_offset; @@ -386,6 +391,10 @@ struct mds_obd { struct dentry *mds_id_dir; int mds_obd_type; struct dentry *mds_unnamed_dir; /* for mdt_obd_create only */ + + /* security related */ + char *mds_mds_sec; + char *mds_ost_sec; }; struct echo_obd { @@ -850,8 +859,8 @@ struct md_ops { void *, int, ldlm_completion_callback, ldlm_blocking_callback, void *); int (*m_getattr)(struct obd_export *, struct lustre_id *, - __u64, unsigned int, - struct ptlrpc_request **); + __u64, const char *, int, + unsigned 
int, struct ptlrpc_request **); int (*m_getattr_lock)(struct obd_export *, struct lustre_id *, char *, int, __u64, unsigned int, struct ptlrpc_request **); diff --git a/lustre/include/linux/obd_class.h b/lustre/include/linux/obd_class.h index faba9a6..6bb4dca 100644 --- a/lustre/include/linux/obd_class.h +++ b/lustre/include/linux/obd_class.h @@ -1245,14 +1245,14 @@ static inline int md_delete_inode(struct obd_export *exp, } static inline int md_getattr(struct obd_export *exp, struct lustre_id *id, - __u64 valid, unsigned int ea_size, - struct ptlrpc_request **request) + __u64 valid, const char *ea_name, int ea_namelen, + unsigned int ea_size, struct ptlrpc_request **request) { int rc; ENTRY; EXP_CHECK_MD_OP(exp, getattr); MD_COUNTER_INCREMENT(exp->exp_obd, getattr); - rc = MDP(exp->exp_obd, getattr)(exp, id, valid, ea_size, request); + rc = MDP(exp->exp_obd, getattr)(exp, id, valid, ea_name, ea_namelen, ea_size, request); RETURN(rc); } diff --git a/lustre/include/linux/obd_support.h b/lustre/include/linux/obd_support.h index 8783209..64db5f7 100644 --- a/lustre/include/linux/obd_support.h +++ b/lustre/include/linux/obd_support.h @@ -145,6 +145,14 @@ extern wait_queue_head_t obd_race_waitq; #define OBD_FAIL_TGT_REPLY_NET 0x700 #define OBD_FAIL_TGT_CONN_RACE 0x701 +#define OBD_FAIL_SVCSEC_ACCEPT_BEG 0x750 +#define OBD_FAIL_SVCSEC_ACCEPT_END 0x751 +#define OBD_FAIL_SVCSEC_WRAP_BEG 0x752 +#define OBD_FAIL_SVCSEC_WRAP_END 0x753 +#define OBD_FAIL_SVCGSS_ERR_NOTIFY 0x760 +#define OBD_FAIL_SVCGSS_INIT_REQ 0x780 +#define OBD_FAIL_SVCGSS_INIT_REP 0x781 + /* preparation for a more advanced failure testbed (not functional yet) */ #define OBD_FAIL_MASK_SYS 0x0000FF00 #define OBD_FAIL_MASK_LOC (0x000000FF | OBD_FAIL_MASK_SYS) diff --git a/lustre/kernel_patches/patches/dcache-mds-num-2.6.7.patch b/lustre/kernel_patches/patches/dcache-mds-num-2.6.7.patch index d86d1b6..466235d 100644 --- a/lustre/kernel_patches/patches/dcache-mds-num-2.6.7.patch +++ 
b/lustre/kernel_patches/patches/dcache-mds-num-2.6.7.patch @@ -1,8 +1,8 @@ Index: linux-2.6.7/include/linux/dcache.h =================================================================== ---- linux-2.6.7.orig/include/linux/dcache.h 2004-08-30 17:20:57.000000000 +0800 -+++ linux-2.6.7/include/linux/dcache.h 2004-08-30 17:39:12.000000000 +0800 -@@ -94,6 +94,9 @@ +--- linux-2.6.7.orig/include/linux/dcache.h 2005-03-23 23:28:49.669799416 +0800 ++++ linux-2.6.7/include/linux/dcache.h 2005-03-23 23:38:25.648237384 +0800 +@@ -86,6 +86,9 @@ spinlock_t d_lock; /* per dentry lock */ struct inode *d_inode; /* Where the name belongs to - NULL is * negative */ @@ -12,11 +12,12 @@ Index: linux-2.6.7/include/linux/dcache.h /* * The next three fields are touched by __d_lookup. Place them here * so they all fit in a 16-byte range, with 16-byte alignment. -@@ -166,6 +169,7 @@ +@@ -158,6 +161,8 @@ #define DCACHE_UNHASHED 0x0010 - #define DCACHE_LUSTRE_INVALID 0x0020 /* Lustre invalidated */ + #define DCACHE_LUSTRE_INVALID 0x0020 /* invalidated by Lustre */ +#define DCACHE_CROSS_REF 0x0040 /* entry points to inode on another MDS */ - ++ extern spinlock_t dcache_lock; + /** diff --git a/lustre/kernel_patches/patches/export-vanilla-2.6.patch b/lustre/kernel_patches/patches/export-vanilla-2.6.patch new file mode 100644 index 0000000..c18a380 --- /dev/null +++ b/lustre/kernel_patches/patches/export-vanilla-2.6.patch @@ -0,0 +1,94 @@ +Index: linux-2.6.7/mm/truncate.c +=================================================================== +--- linux-2.6.7.orig/mm/truncate.c 2004-06-16 13:20:04.000000000 +0800 ++++ linux-2.6.7/mm/truncate.c 2005-03-23 23:30:30.676444072 +0800 +@@ -42,7 +42,7 @@ + * its lock, b) when a concurrent invalidate_inode_pages got there first and + * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. 
+ */ +-static void ++void + truncate_complete_page(struct address_space *mapping, struct page *page) + { + if (page->mapping != mapping) +@@ -58,6 +58,8 @@ + page_cache_release(page); /* pagecache ref */ + } + ++EXPORT_SYMBOL(truncate_complete_page); ++ + /* + * This is for invalidate_inode_pages(). That function can be called at + * any time, and is not supposed to throw away dirty pages. But pages can +Index: linux-2.6.7/fs/super.c +=================================================================== +--- linux-2.6.7.orig/fs/super.c 2004-06-16 13:19:22.000000000 +0800 ++++ linux-2.6.7/fs/super.c 2005-03-23 23:30:30.648448328 +0800 +@@ -804,6 +804,8 @@ + return (struct vfsmount *)sb; + } + ++EXPORT_SYMBOL(do_kern_mount); ++ + struct vfsmount *kern_mount(struct file_system_type *type) + { + return do_kern_mount(type->name, 0, type->name, NULL); +Index: linux-2.6.7/fs/jbd/journal.c +=================================================================== +--- linux-2.6.7.orig/fs/jbd/journal.c 2004-06-16 13:18:59.000000000 +0800 ++++ linux-2.6.7/fs/jbd/journal.c 2005-03-23 23:30:30.647448480 +0800 +@@ -71,6 +71,7 @@ + EXPORT_SYMBOL(journal_errno); + EXPORT_SYMBOL(journal_ack_err); + EXPORT_SYMBOL(journal_clear_err); ++EXPORT_SYMBOL(log_start_commit); + EXPORT_SYMBOL(log_wait_commit); + EXPORT_SYMBOL(journal_start_commit); + EXPORT_SYMBOL(journal_wipe); +Index: linux-2.6.7/kernel/exit.c +=================================================================== +--- linux-2.6.7.orig/kernel/exit.c 2004-06-16 13:19:52.000000000 +0800 ++++ linux-2.6.7/kernel/exit.c 2005-03-23 23:34:17.539955576 +0800 +@@ -256,6 +256,8 @@ + write_unlock_irq(&tasklist_lock); + } + ++EXPORT_SYMBOL(reparent_to_init); ++ + void __set_special_pids(pid_t session, pid_t pgrp) + { + struct task_struct *curr = current; +@@ -435,6 +437,7 @@ + { + __exit_files(tsk); + } ++EXPORT_SYMBOL(exit_files); + + static inline void __put_fs_struct(struct fs_struct *fs) + { +Index: linux-2.6.7/include/linux/fs.h 
+=================================================================== +--- linux-2.6.7.orig/include/linux/fs.h 2005-03-23 23:30:08.535809960 +0800 ++++ linux-2.6.7/include/linux/fs.h 2005-03-23 23:30:30.675444224 +0800 +@@ -1133,6 +1133,7 @@ + extern struct vfsmount *kern_mount(struct file_system_type *); + extern int may_umount_tree(struct vfsmount *); + extern int may_umount(struct vfsmount *); ++struct vfsmount *do_kern_mount(const char *type, int flags, const char *name, void *data); + extern long do_mount(char *, char *, char *, unsigned long, void *); + + extern int vfs_statfs(struct super_block *, struct kstatfs *); +Index: linux-2.6.7/include/linux/mm.h +=================================================================== +--- linux-2.6.7.orig/include/linux/mm.h 2004-06-16 13:18:56.000000000 +0800 ++++ linux-2.6.7/include/linux/mm.h 2005-03-23 23:30:30.673444528 +0800 +@@ -653,6 +653,9 @@ + + extern unsigned long do_brk(unsigned long, unsigned long); + ++/* truncate.c */ ++extern void truncate_complete_page(struct address_space *mapping,struct page *); ++ + /* filemap.c */ + extern unsigned long page_unuse(struct page *); + extern void truncate_inode_pages(struct address_space *, loff_t); diff --git a/lustre/kernel_patches/patches/ext3-wantedi-2.6-suse.patch b/lustre/kernel_patches/patches/ext3-wantedi-2.6-suse.patch index a4867a5..4fd69a5 100644 --- a/lustre/kernel_patches/patches/ext3-wantedi-2.6-suse.patch +++ b/lustre/kernel_patches/patches/ext3-wantedi-2.6-suse.patch @@ -5,10 +5,10 @@ include/linux/ext3_fs.h | 5 ++++- 5 files changed, 85 insertions(+), 6 deletions(-) -Index: uml-2.6.3/fs/ext3/ialloc.c +Index: linux-2.6.7/fs/ext3/ialloc.c =================================================================== ---- uml-2.6.3.orig/fs/ext3/ialloc.c 2004-02-20 15:00:48.000000000 +0800 -+++ uml-2.6.3/fs/ext3/ialloc.c 2004-02-21 00:24:45.202693776 +0800 +--- linux-2.6.7.orig/fs/ext3/ialloc.c 2005-03-24 00:27:43.282608616 +0800 ++++ linux-2.6.7/fs/ext3/ialloc.c 
2005-03-24 00:27:43.888516504 +0800 @@ -420,7 +420,8 @@ * For other inodes, search forward from the parent directory's block * group to find a free inode. @@ -58,11 +58,19 @@ Index: uml-2.6.3/fs/ext3/ialloc.c if (S_ISDIR(mode)) { if (test_opt (sb, OLDALLOC)) group = find_group_dir(sb, dir); -Index: uml-2.6.3/fs/ext3/ioctl.c +Index: linux-2.6.7/fs/ext3/ioctl.c =================================================================== ---- uml-2.6.3.orig/fs/ext3/ioctl.c 2004-01-09 14:59:26.000000000 +0800 -+++ uml-2.6.3/fs/ext3/ioctl.c 2004-02-21 00:21:04.541239416 +0800 -@@ -24,6 +24,31 @@ +--- linux-2.6.7.orig/fs/ext3/ioctl.c 2004-06-16 13:19:13.000000000 +0800 ++++ linux-2.6.7/fs/ext3/ioctl.c 2005-03-24 00:31:16.113253440 +0800 +@@ -9,6 +9,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -24,6 +25,31 @@ ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg); switch (cmd) { @@ -93,12 +101,12 @@ Index: uml-2.6.3/fs/ext3/ioctl.c + } case EXT3_IOC_GETFLAGS: flags = ei->i_flags & EXT3_FL_USER_VISIBLE; - return put_user(flags, (int *) arg); -Index: uml-2.6.3/fs/ext3/namei.c + return put_user(flags, (int __user *) arg); +Index: linux-2.6.7/fs/ext3/namei.c =================================================================== ---- uml-2.6.3.orig/fs/ext3/namei.c 2004-02-20 15:01:27.000000000 +0800 -+++ uml-2.6.3/fs/ext3/namei.c 2004-02-21 00:21:04.611228776 +0800 -@@ -1617,6 +1617,19 @@ +--- linux-2.6.7.orig/fs/ext3/namei.c 2005-03-24 00:27:43.536570008 +0800 ++++ linux-2.6.7/fs/ext3/namei.c 2005-03-24 00:27:43.893515744 +0800 +@@ -1939,6 +1939,19 @@ return err; } @@ -118,7 +126,7 @@ Index: uml-2.6.3/fs/ext3/namei.c /* * By the time this is called, we already have created * the directory cache entry for the new file, but it -@@ -1640,7 +1653,7 @@ +@@ -1963,7 +1976,7 @@ if (IS_DIRSYNC(dir)) handle->h_sync = 1; @@ -127,7 +135,7 @@ Index: uml-2.6.3/fs/ext3/namei.c err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext3_file_inode_operations; -@@ -1670,7 
+1683,7 @@ +@@ -1994,7 +2007,7 @@ if (IS_DIRSYNC(dir)) handle->h_sync = 1; @@ -136,7 +144,7 @@ Index: uml-2.6.3/fs/ext3/namei.c err = PTR_ERR(inode); if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); -@@ -1702,7 +1715,7 @@ +@@ -2027,7 +2040,7 @@ if (IS_DIRSYNC(dir)) handle->h_sync = 1; @@ -145,7 +153,7 @@ Index: uml-2.6.3/fs/ext3/namei.c err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; -@@ -2094,7 +2107,7 @@ +@@ -2439,7 +2452,7 @@ if (IS_DIRSYNC(dir)) handle->h_sync = 1; @@ -154,10 +162,10 @@ Index: uml-2.6.3/fs/ext3/namei.c err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; -Index: uml-2.6.3/include/linux/ext3_fs.h +Index: linux-2.6.7/include/linux/ext3_fs.h =================================================================== ---- uml-2.6.3.orig/include/linux/ext3_fs.h 2004-01-09 14:59:44.000000000 +0800 -+++ uml-2.6.3/include/linux/ext3_fs.h 2004-02-21 00:21:04.613228472 +0800 +--- linux-2.6.7.orig/include/linux/ext3_fs.h 2005-03-24 00:27:43.542569096 +0800 ++++ linux-2.6.7/include/linux/ext3_fs.h 2005-03-24 00:27:43.893515744 +0800 @@ -203,6 +203,7 @@ #define EXT3_IOC_SETFLAGS _IOW('f', 2, long) #define EXT3_IOC_GETVERSION _IOR('f', 3, long) @@ -166,7 +174,7 @@ Index: uml-2.6.3/include/linux/ext3_fs.h #define EXT3_IOC_GETVERSION_OLD _IOR('v', 1, long) #define EXT3_IOC_SETVERSION_OLD _IOW('v', 2, long) #ifdef CONFIG_JBD_DEBUG -@@ -707,7 +708,8 @@ +@@ -708,7 +709,8 @@ dx_hash_info *hinfo); /* ialloc.c */ @@ -176,7 +184,7 @@ Index: uml-2.6.3/include/linux/ext3_fs.h extern void ext3_free_inode (handle_t *, struct inode *); extern struct inode * ext3_orphan_get (struct super_block *, unsigned long); extern unsigned long ext3_count_free_inodes (struct super_block *); -@@ -792,4 +794,5 @@ +@@ -793,4 +795,5 @@ #endif /* __KERNEL__ */ diff --git a/lustre/kernel_patches/patches/header_guards-vanilla-2.6.patch b/lustre/kernel_patches/patches/header_guards-vanilla-2.6.patch new file mode 100644 index 0000000..e8b6abb --- /dev/null +++ 
b/lustre/kernel_patches/patches/header_guards-vanilla-2.6.patch @@ -0,0 +1,45 @@ +%diffstat + blockgroup_lock.h | 4 +++- + percpu_counter.h | 4 ++++ + 2 files changed, 7 insertions(+), 1 deletion(-) + +%patch +Index: linux-2.6.6/include/linux/percpu_counter.h +=================================================================== +--- linux-2.6.6.orig/include/linux/percpu_counter.h 2004-04-04 11:37:23.000000000 +0800 ++++ linux-2.6.6/include/linux/percpu_counter.h 2004-05-22 16:08:16.000000000 +0800 +@@ -3,6 +3,8 @@ + * + * WARNING: these things are HUGE. 4 kbytes per counter on 32-way P4. + */ ++#ifndef _LINUX_PERCPU_COUNTER_H ++#define _LINUX_PERCPU_COUNTER_H + + #include + #include +@@ -101,3 +103,5 @@ static inline void percpu_counter_dec(st + { + percpu_counter_mod(fbc, -1); + } ++ ++#endif /* _LINUX_PERCPU_COUNTER_H */ +Index: linux-2.6.6/include/linux/blockgroup_lock.h +=================================================================== +--- linux-2.6.6.orig/include/linux/blockgroup_lock.h 2004-04-04 11:36:26.000000000 +0800 ++++ linux-2.6.6/include/linux/blockgroup_lock.h 2004-05-22 16:08:45.000000000 +0800 +@@ -3,6 +3,8 @@ + * + * Simple hashed spinlocking. 
+ */ ++#ifndef _LINUX_BLOCKGROUP_LOCK_H ++#define _LINUX_BLOCKGROUP_LOCK_H + + #include + #include +@@ -55,4 +57,4 @@ static inline void bgl_lock_init(struct + #define sb_bgl_lock(sb, block_group) \ + (&(sb)->s_blockgroup_lock.locks[(block_group) & (NR_BG_LOCKS-1)].lock) + +- ++#endif + diff --git a/lustre/kernel_patches/patches/iopen-2.6-vanilla.patch b/lustre/kernel_patches/patches/iopen-2.6-vanilla.patch index cb504d9..88e0843 100644 --- a/lustre/kernel_patches/patches/iopen-2.6-vanilla.patch +++ b/lustre/kernel_patches/patches/iopen-2.6-vanilla.patch @@ -159,7 +159,7 @@ Index: linux-stage/fs/ext3/iopen.c + list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ + dentry->d_inode = inode; + -+ __d_rehash(dentry, 0); /* d_rehash */ ++ __d_rehash(dentry); /* d_rehash */ + spin_unlock(&dcache_lock); + + return NULL; @@ -222,7 +222,7 @@ Index: linux-stage/fs/ext3/iopen.c + /* Move the goal to the de hash queue */ + goal->d_flags &= ~ DCACHE_DISCONNECTED; + security_d_instantiate(goal, inode); -+ __d_rehash(dentry, 0); ++ __d_rehash(dentry); + __d_move(goal, dentry); + spin_unlock(&dcache_lock); + iput(inode); @@ -235,7 +235,7 @@ Index: linux-stage/fs/ext3/iopen.c + dentry->d_inode = inode; +do_rehash: + if (rehash) -+ __d_rehash(dentry, 0); /* d_rehash */ ++ __d_rehash(dentry); /* d_rehash */ + spin_unlock(&dcache_lock); + + return NULL; diff --git a/lustre/kernel_patches/patches/linux-2.6.7-CITI_NFS4_ALL-7-lsec.patch b/lustre/kernel_patches/patches/linux-2.6.7-CITI_NFS4_ALL-7-lsec.patch new file mode 100644 index 0000000..f754546 --- /dev/null +++ b/lustre/kernel_patches/patches/linux-2.6.7-CITI_NFS4_ALL-7-lsec.patch @@ -0,0 +1,16246 @@ +--- linux-2.6.7/Documentation/filesystems/00-INDEX.lsec 2004-06-15 23:20:26.000000000 -0600 ++++ linux-2.6.7/Documentation/filesystems/00-INDEX 2005-03-23 14:28:24.576313528 -0700 +@@ -28,6 +28,8 @@ jfs.txt + - info and mount options for the JFS filesystem. 
+ ncpfs.txt + - info on Novell Netware(tm) filesystem using NCP protocol. ++nfs4.txt ++ - info and mount options for the nfs4 filesystem. + ntfs.txt + - info and mount options for the NTFS filesystem (Windows NT). + proc.txt +--- linux-2.6.7/Documentation/filesystems/nfs4.txt.lsec 2005-03-23 14:28:24.576313528 -0700 ++++ linux-2.6.7/Documentation/filesystems/nfs4.txt 2005-03-23 14:28:24.576313528 -0700 +@@ -0,0 +1,20 @@ ++NFS version 4 ++============= ++ ++NFS version 4 is specified by RFC3530. Compared to earlier NFS versions, ++it provides enhanced security and better client caching, among other features. ++ ++In addition to basic file operations, the NFS client supports locking, kerberos ++(basic authentication and integrity), and reboot recovery. ++ ++As this writing (July 2004), patches to nfs-utils and util-linux are required ++for NFSv4 support; see http://www.citi.umich.edu/projects/nfsv4/linux/ for ++patches and instructions. ++ ++The kernel treats NFS version 4 as a separate filesystem type, nfs4, so it is ++mounted using "mount -tnfs4 server:/path /mntpoint", not by mounting the nfs ++filesystem with -onfsver=4. ++ ++Mount options: ++ ++XXX? 
+--- linux-2.6.7/fs/locks.c.lsec 2004-06-15 23:20:03.000000000 -0600 ++++ linux-2.6.7/fs/locks.c 2005-03-23 14:28:22.425640480 -0700 +@@ -317,7 +317,7 @@ static int flock_to_posix_lock(struct fi + if (l->l_len == 0) + fl->fl_end = OFFSET_MAX; + +- fl->fl_owner = current->files; ++ fl->fl_owner = 0; + fl->fl_pid = current->tgid; + fl->fl_file = filp; + fl->fl_flags = FL_POSIX; +@@ -357,7 +357,7 @@ static int flock64_to_posix_lock(struct + if (l->l_len == 0) + fl->fl_end = OFFSET_MAX; + +- fl->fl_owner = current->files; ++ fl->fl_owner = 0; + fl->fl_pid = current->tgid; + fl->fl_file = filp; + fl->fl_flags = FL_POSIX; +@@ -920,7 +920,7 @@ int posix_lock_file(struct file *filp, s + */ + int locks_mandatory_locked(struct inode *inode) + { +- fl_owner_t owner = current->files; ++ unsigned int pid = current->tgid; + struct file_lock *fl; + + /* +@@ -930,7 +930,9 @@ int locks_mandatory_locked(struct inode + for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { + if (!IS_POSIX(fl)) + continue; +- if (fl->fl_owner != owner) ++ if (fl->fl_owner != 0) ++ break; ++ if (fl->fl_pid != pid) + break; + } + unlock_kernel(); +@@ -958,7 +960,7 @@ int locks_mandatory_area(int read_write, + int error; + + locks_init_lock(&fl); +- fl.fl_owner = current->files; ++ fl.fl_owner = 0; + fl.fl_pid = current->tgid; + fl.fl_file = filp; + fl.fl_flags = FL_POSIX | FL_ACCESS; +@@ -1684,7 +1686,7 @@ void locks_remove_posix(struct file *fil + lock_kernel(); + while (*before != NULL) { + struct file_lock *fl = *before; +- if (IS_POSIX(fl) && (fl->fl_owner == owner)) { ++ if (IS_POSIX(fl) && posix_same_owner(fl, &lock)) { + locks_delete_lock(before); + continue; + } +@@ -1982,18 +1984,6 @@ int lock_may_write(struct inode *inode, + + EXPORT_SYMBOL(lock_may_write); + +-static inline void __steal_locks(struct file *file, fl_owner_t from) +-{ +- struct inode *inode = file->f_dentry->d_inode; +- struct file_lock *fl = inode->i_flock; +- +- while (fl) { +- if (fl->fl_file == file && fl->fl_owner == 
from) +- fl->fl_owner = current->files; +- fl = fl->fl_next; +- } +-} +- + /* When getting ready for executing a binary, we make sure that current + * has a files_struct on its own. Before dropping the old files_struct, + * we take over ownership of all locks for all file descriptors we own. +@@ -2002,31 +1992,6 @@ static inline void __steal_locks(struct + */ + void steal_locks(fl_owner_t from) + { +- struct files_struct *files = current->files; +- int i, j; +- +- if (from == files) +- return; +- +- lock_kernel(); +- j = 0; +- for (;;) { +- unsigned long set; +- i = j * __NFDBITS; +- if (i >= files->max_fdset || i >= files->max_fds) +- break; +- set = files->open_fds->fds_bits[j++]; +- while (set) { +- if (set & 1) { +- struct file *file = files->fd[i]; +- if (file) +- __steal_locks(file, from); +- } +- i++; +- set >>= 1; +- } +- } +- unlock_kernel(); + } + EXPORT_SYMBOL(steal_locks); + +--- linux-2.6.7/fs/hostfs/hostfs_kern.c.lsec 2005-03-23 14:25:58.982447160 -0700 ++++ linux-2.6.7/fs/hostfs/hostfs_kern.c 2005-03-23 14:33:11.946626600 -0700 +@@ -290,7 +290,6 @@ static void hostfs_delete_inode(struct i + { + if(HOSTFS_I(inode)->fd != -1) { + close_file(&HOSTFS_I(inode)->fd); +- printk("Closing host fd in .delete_inode\n"); + HOSTFS_I(inode)->fd = -1; + } + clear_inode(inode); +@@ -303,7 +302,6 @@ static void hostfs_destroy_inode(struct + + if(HOSTFS_I(inode)->fd != -1) { + close_file(&HOSTFS_I(inode)->fd); +- printk("Closing host fd in .destroy_inode\n"); + } + + kfree(HOSTFS_I(inode)); +--- linux-2.6.7/fs/open.c.lsec 2005-03-23 14:26:01.774022776 -0700 ++++ linux-2.6.7/fs/open.c 2005-03-23 14:28:23.226518728 -0700 +@@ -1025,7 +1025,7 @@ int filp_close(struct file *filp, fl_own + } + + dnotify_flush(filp, id); +- locks_remove_posix(filp, id); ++ locks_remove_posix(filp, 0); + fput(filp); + return retval; + } +--- linux-2.6.7/fs/nfsd/export.c.lsec 2004-06-15 23:19:36.000000000 -0600 ++++ linux-2.6.7/fs/nfsd/export.c 2005-03-23 14:28:24.686296808 -0700 +@@ -255,7 
+255,7 @@ static inline void svc_expkey_update(str + new->ek_export = item->ek_export; + } + +-static DefineSimpleCacheLookup(svc_expkey,0) /* no inplace updates */ ++static DefineSimpleCacheLookup(svc_expkey) + + #define EXPORT_HASHBITS 8 + #define EXPORT_HASHMAX (1<< EXPORT_HASHBITS) +@@ -487,8 +487,72 @@ static inline void svc_export_update(str + new->ex_fsid = item->ex_fsid; + } + +-static DefineSimpleCacheLookup(svc_export,1) /* allow inplace updates */ ++struct svc_export * ++svc_export_lookup(struct svc_export *item, int set) ++{ ++ struct svc_export *tmp, *new = NULL; ++ struct cache_head **hp, **head; + ++ head = &svc_export_cache.hash_table[svc_export_hash(item)]; ++retry: ++ if (set||new) ++ write_lock(&svc_export_cache.hash_lock); ++ else ++ read_lock(&svc_export_cache.hash_lock); ++ for(hp=head; *hp != NULL; hp = &tmp->h.next) { ++ tmp = container_of(*hp, struct svc_export, h); ++ if (svc_export_match(item, tmp)) { /* found a match */ ++ cache_get(&tmp->h); ++ if (set) { ++ if (test_bit(CACHE_NEGATIVE, &item->h.flags)) ++ set_bit(CACHE_NEGATIVE, &tmp->h.flags); ++ else { ++ clear_bit(CACHE_NEGATIVE, &tmp->h.flags); ++ svc_export_update(tmp, item); ++ } ++ } ++ if (set||new) ++ write_unlock(&svc_export_cache.hash_lock); ++ else ++ read_unlock(&svc_export_cache.hash_lock); ++ if (set) ++ cache_fresh(&svc_export_cache, &tmp->h, ++ item->h.expiry_time); ++ if (new) ++ svc_export_put(&new->h, &svc_export_cache); ++ return tmp; ++ } ++ } ++ /* Didn't find anything */ ++ if (new) { ++ svc_export_init(new, item); ++ new->h.next = *head; ++ *head = &new->h; ++ set_bit(CACHE_HASHED, &new->h.flags); ++ svc_export_cache.entries++; ++ if (set) { ++ tmp = new; ++ if (test_bit(CACHE_NEGATIVE, &item->h.flags)) ++ set_bit(CACHE_NEGATIVE, &tmp->h.flags); ++ else ++ svc_export_update(tmp, item); ++ } ++ } ++ if (set||new) ++ write_unlock(&svc_export_cache.hash_lock); ++ else ++ read_unlock(&svc_export_cache.hash_lock); ++ if (new && set) ++ cache_fresh(&svc_export_cache, 
&new->h, item->h.expiry_time); ++ if (new) ++ return new; ++ new = kmalloc(sizeof(*new), GFP_KERNEL); ++ if (new) { ++ cache_init(&new->h); ++ goto retry; ++ } ++ return NULL; ++} + + struct svc_expkey * + exp_find_key(svc_client *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp) +--- linux-2.6.7/fs/nfsd/nfs4callback.c.lsec 2005-03-23 14:28:24.578313224 -0700 ++++ linux-2.6.7/fs/nfsd/nfs4callback.c 2005-03-23 14:28:24.578313224 -0700 +@@ -0,0 +1,631 @@ ++/* ++ * linux/fs/nfsd/nfs4callback.c ++ * ++ * Copyright (c) 2001 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Kendrick Smith ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define NFSDDBG_FACILITY NFSDDBG_PROC ++ ++#define NFSPROC4_CB_NULL 0 ++#define NFSPROC4_CB_COMPOUND 1 ++ ++/* forward declarations */ ++static void nfs4_cb_null(struct rpc_task *task); ++ ++/* Index of predefined Linux callback client operations */ ++ ++enum { ++ NFSPROC4_CLNT_CB_NULL = 0, ++ NFSPROC4_CLNT_CB_GETATTR, ++ NFSPROC4_CLNT_CB_RECALL, ++}; ++ ++enum nfs_cb_opnum4 { ++ OP_CB_GETATTR = 3, ++ OP_CB_RECALL = 4, ++ OP_CB_ILLEGAL = 10044 ++}; ++ ++ ++#define NFS4_MAXTAGLEN 20 ++ ++#define cb_compound_enc_hdr_sz 4 ++#define cb_compound_dec_hdr_sz (3 + (NFS4_MAXTAGLEN >> 2)) ++#define op_enc_sz 1 ++#define op_dec_sz 2 ++#define enc_nfs4_fh_sz (1 + (NFS4_FHSIZE >> 2)) ++#define enc_stateid_sz 16 ++ ++#define NFS4_enc_cb_getattr_sz (cb_compound_enc_hdr_sz + \ ++ op_enc_sz + \ ++ enc_nfs4_fh_sz + 4) ++ ++#define NFS4_dec_cb_getattr_sz (cb_compound_dec_hdr_sz + \ ++ op_dec_sz + \ ++ 11) ++ ++#define NFS4_enc_cb_recall_sz (cb_compound_enc_hdr_sz + \ ++ 1 + enc_stateid_sz + \ ++ enc_nfs4_fh_sz) ++ ++#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \ ++ op_dec_sz) ++ ++/* ++* Generic encode routines from fs/nfs/nfs4xdr.c ++*/ ++static inline u32 * ++xdr_writemem(u32 *p, const void *ptr, int nbytes) ++{ ++ int tmp = XDR_QUADLEN(nbytes); ++ if (!tmp) ++ return p; ++ p[tmp-1] = 0; ++ 
memcpy(p, ptr, nbytes); ++ return p + tmp; ++} ++ ++#define WRITE32(n) *p++ = htonl(n) ++#define WRITEMEM(ptr,nbytes) do { \ ++ p = xdr_writemem(p, ptr, nbytes); \ ++} while (0) ++#define RESERVE_SPACE(nbytes) do { \ ++ p = xdr_reserve_space(xdr, nbytes); \ ++ if (!p) dprintk("NFSD: RESERVE_SPACE(%d) failed in function %s\n", (int) (nbytes), __FUNCTION__); \ ++ BUG_ON(!p); \ ++} while (0) ++ ++/* ++ * Generic decode routines from fs/nfs/nfs4xdr.c ++ */ ++#define DECODE_TAIL \ ++ status = 0; \ ++out: \ ++ return status; \ ++xdr_error: \ ++ dprintk("NFSD: xdr error! (%s:%d)\n", __FILE__, __LINE__); \ ++ status = -EIO; \ ++ goto out ++ ++#define READ32(x) (x) = ntohl(*p++) ++#define READ64(x) do { \ ++ (x) = (u64)ntohl(*p++) << 32; \ ++ (x) |= ntohl(*p++); \ ++} while (0) ++#define READTIME(x) do { \ ++ p++; \ ++ (x.tv_sec) = ntohl(*p++); \ ++ (x.tv_nsec) = ntohl(*p++); \ ++} while (0) ++#define READ_BUF(nbytes) do { \ ++ p = xdr_inline_decode(xdr, nbytes); \ ++ if (!p) { \ ++ dprintk("NFSD: %s: reply buffer overflowed in line %d.", \ ++ __FUNCTION__, __LINE__); \ ++ return -EIO; \ ++ } \ ++} while (0) ++ ++struct nfs4_cb_compound_hdr { ++ int status; ++ u32 ident; ++ u32 nops; ++ u32 taglen; ++ char * tag; ++}; ++ ++struct nfs4_cb_getattr { ++ struct nfs_fh fh; ++ u32 bm0; ++ u32 bm1; ++ __u64 change_attr; ++ __u64 size; ++ struct timespec mtime; ++}; ++ ++struct nfs4_cb_recall { ++ nfs4_stateid stateid; ++ int trunc; ++ struct nfs_fh fh; ++}; ++ ++static struct { ++ int stat; ++ int errno; ++} nfs_cb_errtbl[] = { ++ { NFS4_OK, 0 }, ++ { NFS4ERR_PERM, EPERM }, ++ { NFS4ERR_NOENT, ENOENT }, ++ { NFS4ERR_IO, EIO }, ++ { NFS4ERR_NXIO, ENXIO }, ++ { NFS4ERR_ACCESS, EACCES }, ++ { NFS4ERR_EXIST, EEXIST }, ++ { NFS4ERR_XDEV, EXDEV }, ++ { NFS4ERR_NOTDIR, ENOTDIR }, ++ { NFS4ERR_ISDIR, EISDIR }, ++ { NFS4ERR_INVAL, EINVAL }, ++ { NFS4ERR_FBIG, EFBIG }, ++ { NFS4ERR_NOSPC, ENOSPC }, ++ { NFS4ERR_ROFS, EROFS }, ++ { NFS4ERR_MLINK, EMLINK }, ++ { NFS4ERR_NAMETOOLONG, 
ENAMETOOLONG }, ++ { NFS4ERR_NOTEMPTY, ENOTEMPTY }, ++ { NFS4ERR_DQUOT, EDQUOT }, ++ { NFS4ERR_STALE, ESTALE }, ++ { NFS4ERR_BADHANDLE, EBADHANDLE }, ++ { NFS4ERR_BAD_COOKIE, EBADCOOKIE }, ++ { NFS4ERR_NOTSUPP, ENOTSUPP }, ++ { NFS4ERR_TOOSMALL, ETOOSMALL }, ++ { NFS4ERR_SERVERFAULT, ESERVERFAULT }, ++ { NFS4ERR_BADTYPE, EBADTYPE }, ++ { NFS4ERR_LOCKED, EAGAIN }, ++ { NFS4ERR_RESOURCE, EREMOTEIO }, ++ { NFS4ERR_SYMLINK, ELOOP }, ++ { NFS4ERR_OP_ILLEGAL, EOPNOTSUPP }, ++ { NFS4ERR_DEADLOCK, EDEADLK }, ++ { -1, EIO } ++}; ++ ++static int ++nfs_cb_stat_to_errno(int stat) ++{ ++ int i; ++ for (i = 0; nfs_cb_errtbl[i].stat != -1; i++) { ++ if (nfs_cb_errtbl[i].stat == stat) ++ return nfs_cb_errtbl[i].errno; ++ } ++ /* If we cannot translate the error, the recovery routines should ++ * handle it. ++ * Note: remaining NFSv4 error codes have values > 10000, so should ++ * not conflict with native Linux error codes. ++ */ ++ return stat; ++} ++ ++/* ++ * XDR encode ++ */ ++ ++static int ++encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) ++{ ++ u32 * p; ++ ++ RESERVE_SPACE(16); ++ WRITE32(0); /* tag length is always 0 */ ++ WRITE32(NFS4_MINOR_VERSION); ++ WRITE32(hdr->ident); ++ WRITE32(hdr->nops); ++ return 0; ++} ++ ++static int ++encode_cb_getattr(struct xdr_stream *xdr, struct nfs4_cb_getattr *cb_get) ++{ ++ u32 *p; ++ int len = cb_get->fh.size; ++ ++ RESERVE_SPACE(20 + len); ++ WRITE32(OP_CB_GETATTR); ++ WRITE32(len); ++ WRITEMEM(cb_get->fh.data, len); ++ WRITE32(2); ++ WRITE32(cb_get->bm0); ++ WRITE32(cb_get->bm1); ++ return 0; ++} ++ ++static int ++encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec) ++{ ++ u32 *p; ++ int len = cb_rec->fh.size; ++ ++ RESERVE_SPACE(8+sizeof(cb_rec->stateid.data)); ++ WRITE32(OP_CB_RECALL); ++ WRITEMEM(cb_rec->stateid.data, sizeof(cb_rec->stateid.data)); ++ WRITE32(cb_rec->trunc); ++ WRITE32(len); ++ WRITEMEM(cb_rec->fh.data, len); ++ return 0; ++} ++ ++static int 
++nfs4_xdr_enc_cb_getattr(struct rpc_rqst *req, u32 *p, struct nfs4_cb_getattr *args) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_cb_compound_hdr hdr = { ++ .nops = 1, ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_cb_compound_hdr(&xdr, &hdr); ++ return (encode_cb_getattr(&xdr, args)); ++} ++ ++static int ++nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, u32 *p, struct nfs4_cb_recall *args) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_cb_compound_hdr hdr = { ++ .nops = 1, ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_cb_compound_hdr(&xdr, &hdr); ++ return (encode_cb_recall(&xdr, args)); ++} ++ ++ ++static int ++decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){ ++ u32 *p; ++ ++ READ_BUF(8); ++ READ32(hdr->status); ++ READ32(hdr->taglen); ++ READ_BUF(hdr->taglen + 4); ++ hdr->tag = (char *)p; ++ p += XDR_QUADLEN(hdr->taglen); ++ READ32(hdr->nops); ++ return 0; ++} ++ ++static int ++decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) ++{ ++ u32 *p; ++ u32 op; ++ int32_t nfserr; ++ ++ READ_BUF(8); ++ READ32(op); ++ if (op != expected) { ++ dprintk("NFSD: decode_cb_op_hdr: Callback server returned operation" ++ " %d but we issued a request for %d\n", ++ op, expected); ++ return -EIO; ++ } ++ READ32(nfserr); ++ if (nfserr != NFS_OK) ++ return -nfs_cb_stat_to_errno(nfserr); ++ return 0; ++} ++ ++static int ++decode_cb_getattr(struct xdr_stream *xdr, struct nfs4_cb_getattr *cb_get) ++{ ++ int status; ++ u32 bmlen, ++ attrlen =0, ++ bmval0 =0, ++ bmval1 =0, ++ len = 0; ++ u32 *p; ++ ++ status = decode_cb_op_hdr(xdr, OP_CB_GETATTR); ++ if (status) ++ return status; ++ READ_BUF(4); ++ READ32(bmlen); ++ if( (bmlen < 1) || (bmlen > 2)) ++ goto xdr_error; ++ READ_BUF((bmlen << 2) + 4); ++ READ32(bmval0); ++ if (bmval0 & ~(FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) ++ goto out_bad_bitmap; ++ if (bmlen == 2) { ++ READ32(bmval1); ++ if (bmval1 & ~ FATTR4_WORD1_TIME_MODIFY) ++ goto out_bad_bitmap; ++ 
} ++ READ32(attrlen); ++ if (bmval0 & FATTR4_WORD0_CHANGE) { ++ READ_BUF(8); ++ len += 8; ++ READ64(cb_get->change_attr); ++ dprintk("decode_cb_getattr: changeid=%Ld\n", ++ (long long)cb_get->change_attr); ++ } ++ if (bmval0 & FATTR4_WORD0_SIZE) { ++ READ_BUF(8); ++ len += 8; ++ READ64(cb_get->size); ++ dprintk("decode_cb_getattr: size=%Ld\n", ++ (long long)cb_get->size); ++ } ++ if (bmval1 & FATTR4_WORD1_TIME_MODIFY) { ++ READ_BUF(12); ++ len += 12; ++ READTIME(cb_get->mtime); ++ dprintk("decode_cb_gatattr: mtime=%ld\n", ++ (long)cb_get->mtime.tv_sec); ++ } ++ if (len != attrlen) ++ goto xdr_error; ++ ++ DECODE_TAIL; ++ ++out_bad_bitmap: ++ dprintk("NFSD: %s Callback server returned bad attribute bitmap\n", ++ __FUNCTION__); ++ return -EIO; ++ ++} ++ ++static int ++nfs4_xdr_dec_cb_getattr(struct rpc_rqst *rqstp, u32 *p, struct nfs4_cb_getattr *res) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_cb_compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_cb_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_cb_getattr(&xdr, res); ++out: ++ return status; ++} ++ ++static int ++nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, u32 *p) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_cb_compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_cb_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_cb_op_hdr(&xdr, OP_CB_RECALL); ++out: ++ return status; ++} ++ ++static int ++nfs4_xdr_enc_null(struct rpc_rqst *req, u32 *p) ++{ ++ struct xdr_stream xdrs, *xdr = &xdrs; ++ ++ xdr_init_encode(&xdrs, &req->rq_snd_buf, p); ++ RESERVE_SPACE(0); ++ return 0; ++} ++ ++static int ++nfs4_xdr_dec_null(struct rpc_rqst *req, u32 *p) ++{ ++ return 0; ++} ++ ++/* ++ * RPC procedure tables ++ */ ++#ifndef MAX ++# define MAX(a, b) (((a) > (b))? 
(a) : (b)) ++#endif ++ ++#define PROC(proc, argtype, restype) \ ++[NFSPROC4_CLNT_##proc] = { \ ++ .p_proc = NFSPROC4_CB_COMPOUND, \ ++ .p_encode = (kxdrproc_t) nfs4_xdr_##argtype, \ ++ .p_decode = (kxdrproc_t) nfs4_xdr_##restype, \ ++ .p_bufsiz = MAX(NFS4_##argtype##_sz,NFS4_##restype##_sz) << 2, \ ++} ++ ++struct rpc_procinfo nfs4_cb_procedures[] = { ++ PROC(CB_GETATTR, enc_cb_getattr, dec_cb_getattr), ++ PROC(CB_RECALL, enc_cb_recall, dec_cb_recall), ++}; ++ ++struct rpc_version nfs_cb_version4 = { ++ .number = 1, ++ .nrprocs = sizeof(nfs4_cb_procedures)/sizeof(nfs4_cb_procedures[0]), ++ .procs = nfs4_cb_procedures ++}; ++ ++static struct rpc_version * nfs_cb_version[] = { ++ NULL, ++ &nfs_cb_version4, ++}; ++ ++struct rpc_procinfo nfs4_cb_null_proc= { ++ .p_proc = NFSPROC4_CB_NULL, ++ .p_encode = (kxdrproc_t)nfs4_xdr_enc_null, ++ .p_decode = (kxdrproc_t) nfs4_xdr_dec_null, ++ .p_bufsiz = 0, ++}; ++ ++/* ++ * Use the SETCLIENTID credential ++ */ ++struct rpc_cred * ++nfsd4_lookupcred(struct nfs4_client *clp, int taskflags) ++{ ++ struct auth_cred acred; ++ struct rpc_clnt *clnt = clp->cl_callback.cb_client; ++ struct rpc_cred *ret = NULL; ++ ++ if (!clnt) ++ goto out; ++ get_group_info(clp->cl_cred.cr_group_info); ++ acred.uid = clp->cl_cred.cr_uid; ++ acred.gid = clp->cl_cred.cr_gid; ++ acred.group_info = clp->cl_cred.cr_group_info; ++ ++ dprintk("NFSD: looking up %s cred\n", ++ clnt->cl_auth->au_ops->au_name); ++ ret = rpcauth_lookup_credcache(clnt->cl_auth, &acred, taskflags); ++ put_group_info(clp->cl_cred.cr_group_info); ++out: ++ return ret; ++} ++ ++/* ++ * Set up the callback client and put a NFSPROC4_CB_NULL on the wire... 
++ */ ++void ++nfsd4_probe_callback(struct nfs4_client *clp) ++{ ++ struct sockaddr_in addr; ++ struct nfs4_callback *cb = &clp->cl_callback; ++ struct rpc_timeout timeparms; ++ struct rpc_xprt * xprt; ++ struct rpc_program * program = &cb->cb_program; ++ struct rpc_stat * stat = &cb->cb_stat; ++ struct rpc_clnt * clnt; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_cb_null_proc, ++ .rpc_argp = clp, ++ }; ++ char hostname[32]; ++ int status; ++ ++ dprintk("NFSD: probe_callback. cb_parsed %d cb_set %d 1\n", ++ cb->cb_parsed, cb->cb_set); ++ if (!cb->cb_parsed || cb->cb_set) ++ goto out_err; ++ ++ /* Currently, we only support tcp for the callback channel */ ++ if (cb->cb_netid.len !=3 || memcmp((char *)cb->cb_netid.data, "tcp", 3)) ++ goto out_err; ++ ++ /* Initialize address */ ++ memset(&addr, 0, sizeof(addr)); ++ addr.sin_family = AF_INET; ++ addr.sin_port = htons(cb->cb_port); ++ addr.sin_addr.s_addr = htonl(cb->cb_addr); ++ ++ /* Initialize timeout */ ++ timeparms.to_initval = HZ; ++ timeparms.to_retries = 5; ++ timeparms.to_maxval = NFSD_LEASE_TIME*HZ; ++ timeparms.to_exponential = 1; ++ ++ /* Create RPC transport */ ++ if (!(xprt = xprt_create_proto(IPPROTO_TCP, &addr, &timeparms))) { ++ dprintk("NFSD: couldn't create callback transport!\n"); ++ goto out_err; ++ } ++ ++ /* Initialize rpc_program */ ++ program->name = "nfs4_cb"; ++ program->number = cb->cb_prog; ++ program->nrvers = sizeof(nfs_cb_version)/sizeof(nfs_cb_version[0]); ++ program->version = nfs_cb_version; ++ program->stats = stat; ++ ++ /* Initialize rpc_stat */ ++ memset(stat, 0, sizeof(struct rpc_stat)); ++ stat->program = program; ++ ++ /* Create RPC client ++ * ++ * XXX AUTH_UNIX only - need AUTH_GSS.... 
++ */ ++ sprintf(hostname, "%u.%u.%u.%u", NIPQUAD(addr.sin_addr.s_addr)); ++ if (!(clnt = rpc_create_client(xprt, hostname, program, 1, RPC_AUTH_UNIX))) { ++ dprintk("NFSD: couldn't create callback client\n"); ++ goto out_xprt; ++ } ++ clnt->cl_intr = 1; ++ clnt->cl_softrtry = 1; ++ clnt->cl_chatty = 1; ++ cb->cb_client = clnt; ++ ++ /* Kick rpciod, put the call on the wire. */ ++ ++ if (rpciod_up() != 0) { ++ dprintk("nfsd: couldn't start rpciod for callbacks!\n"); ++ goto out_clnt; ++ } ++ ++ /* the task holds a reference to the nfs4_client struct */ ++ atomic_inc(&clp->cl_count); ++ ++ msg.rpc_cred = nfsd4_lookupcred(clp,0); ++ status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, nfs4_cb_null, 0); ++ ++ if (status != 0) { ++ dprintk("NFSD: asynchronous NFSPROC4_CB_NULL failed!\n"); ++ goto out_rpciod; ++ } ++ return; ++ ++out_rpciod: ++ rpciod_down(); ++out_clnt: ++ rpc_shutdown_client(clnt); ++ goto out_err; ++out_xprt: ++ xprt_destroy(xprt); ++out_err: ++ dprintk("NFSD: warning: no callback path to client %.*s\n", ++ clp->cl_name.len, clp->cl_name.data); ++ cb->cb_client = NULL; ++} ++ ++static void ++nfs4_cb_null(struct rpc_task *task) ++{ ++ struct nfs4_client *clp = (struct nfs4_client *)task->tk_msg.rpc_argp; ++ struct nfs4_callback *cb = &clp->cl_callback; ++ u32 addr = htonl(cb->cb_addr); ++ ++ dprintk("NFSD: nfs4_cb_null task->tk_status %d\n", task->tk_status); ++ ++ if (task->tk_status < 0) { ++ dprintk("NFSD: callback establishment to client %.*s failed\n", ++ clp->cl_name.len, clp->cl_name.data); ++ goto out; ++ } ++ cb->cb_set = 1; ++ dprintk("NFSD: callback set to client %u.%u.%u.%u\n", NIPQUAD(addr)); ++out: ++ put_nfs4_client(clp); ++} +--- linux-2.6.7/fs/nfsd/nfs4xdr.c.lsec 2004-06-15 23:19:52.000000000 -0600 ++++ linux-2.6.7/fs/nfsd/nfs4xdr.c 2005-03-23 14:28:23.924412632 -0700 +@@ -55,6 +55,8 @@ + #include + #include + #include ++#include ++#include + + #define NFSDDBG_FACILITY NFSDDBG_XDR + +@@ -287,27 +289,40 @@ u32 *read_buf(struct 
nfsd4_compoundargs + return p; + } + +-char *savemem(struct nfsd4_compoundargs *argp, u32 *p, int nbytes) ++static int ++defer_free(struct nfsd4_compoundargs *argp, ++ void (*release)(const void *), void *p) + { + struct tmpbuf *tb; ++ ++ tb = kmalloc(sizeof(*tb), GFP_KERNEL); ++ if (!tb) ++ return -ENOMEM; ++ tb->buf = p; ++ tb->release = release; ++ tb->next = argp->to_free; ++ argp->to_free = tb; ++ return 0; ++} ++ ++char *savemem(struct nfsd4_compoundargs *argp, u32 *p, int nbytes) ++{ ++ void *new = NULL; + if (p == argp->tmp) { +- p = kmalloc(nbytes, GFP_KERNEL); +- if (!p) return NULL; ++ new = kmalloc(nbytes, GFP_KERNEL); ++ if (!new) return NULL; ++ p = new; + memcpy(p, argp->tmp, nbytes); + } else { + if (p != argp->tmpp) + BUG(); + argp->tmpp = NULL; + } +- tb = kmalloc(sizeof(*tb), GFP_KERNEL); +- if (!tb) { +- kfree(p); ++ if (defer_free(argp, kfree, p)) { ++ kfree(new); + return NULL; +- } +- tb->buf = p; +- tb->next = argp->to_free; +- argp->to_free = tb; +- return (char*)p; ++ } else ++ return (char *)p; + } + + +@@ -335,7 +350,8 @@ nfsd4_decode_bitmap(struct nfsd4_compoun + } + + static int +-nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *iattr) ++nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *iattr, ++ struct nfs4_acl **acl) + { + int expected_len, len = 0; + u32 dummy32; +@@ -364,6 +380,51 @@ nfsd4_decode_fattr(struct nfsd4_compound + READ64(iattr->ia_size); + iattr->ia_valid |= ATTR_SIZE; + } ++ if (bmval[0] & FATTR4_WORD0_ACL) { ++ int nace, i; ++ struct nfs4_ace ace; ++ ++ READ_BUF(4); len += 4; ++ READ32(nace); ++ ++ *acl = nfs4_acl_new(); ++ if (*acl == NULL) { ++ status = -ENOMEM; ++ goto out_nfserr; ++ } ++ defer_free(argp, (void (*)(const void *))nfs4_acl_free, *acl); ++ ++ for (i = 0; i < nace; i++) { ++ READ_BUF(16); len += 16; ++ READ32(ace.type); ++ READ32(ace.flag); ++ READ32(ace.access_mask); ++ READ32(dummy32); ++ READ_BUF(dummy32); ++ len += XDR_QUADLEN(dummy32) << 2; 
++ READMEM(buf, dummy32); ++ if (check_utf8(buf, dummy32)) ++ return nfserr_inval; ++ ace.whotype = nfs4_acl_get_whotype(buf, dummy32); ++ status = 0; ++ if (ace.whotype != NFS4_ACL_WHO_NAMED) ++ ace.who = 0; ++ else if (ace.flag & NFS4_ACE_IDENTIFIER_GROUP) ++ status = nfsd_map_name_to_gid(argp->rqstp, ++ buf, dummy32, &ace.who); ++ else ++ status = nfsd_map_name_to_uid(argp->rqstp, ++ buf, dummy32, &ace.who); ++ if (status) ++ goto out_nfserr; ++ if (nfs4_acl_add_ace(*acl, ace.type, ace.flag, ++ ace.access_mask, ace.whotype, ace.who) != 0) { ++ status = -ENOMEM; ++ goto out_nfserr; ++ } ++ } ++ } else ++ *acl = NULL; + if (bmval[1] & FATTR4_WORD1_MODE) { + READ_BUF(4); + len += 4; +@@ -549,7 +610,7 @@ nfsd4_decode_create(struct nfsd4_compoun + if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval))) + return status; + +- if ((status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr))) ++ if ((status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, &create->cr_acl))) + goto out; + + DECODE_TAIL; +@@ -698,7 +759,7 @@ nfsd4_decode_open(struct nfsd4_compounda + switch (open->op_createmode) { + case NFS4_CREATE_UNCHECKED: + case NFS4_CREATE_GUARDED: +- if ((status = nfsd4_decode_fattr(argp, open->op_bmval, &open->op_iattr))) ++ if ((status = nfsd4_decode_fattr(argp, open->op_bmval, &open->op_iattr, &open->op_acl))) + goto out; + break; + case NFS4_CREATE_EXCLUSIVE: +@@ -875,7 +936,7 @@ nfsd4_decode_setattr(struct nfsd4_compou + READ_BUF(sizeof(stateid_t)); + READ32(setattr->sa_stateid.si_generation); + COPYMEM(&setattr->sa_stateid.si_opaque, sizeof(stateid_opaque_t)); +- if ((status = nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr))) ++ if ((status = nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr, &setattr->sa_acl))) + goto out; + + DECODE_TAIL; +@@ -1288,32 +1349,24 @@ static u32 nfs4_ftypes[16] = { + NF4SOCK, NF4BAD, NF4LNK, NF4BAD, + }; + +-static inline int +-xdr_padding(int l) 
+-{ +- return 3 - ((l - 1) & 3); /* smallest i>=0 such that (l+i)%4 = 0 */ +-} +- + static int +-nfsd4_encode_name(struct svc_rqst *rqstp, int group, uid_t id, ++nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group, + u32 **p, int *buflen) + { + int status; +- u32 len; + + if (*buflen < (XDR_QUADLEN(IDMAP_NAMESZ) << 2) + 4) + return nfserr_resource; +- if (group) ++ if (whotype != NFS4_ACL_WHO_NAMED) ++ status = nfs4_acl_write_who(whotype, (u8 *)(*p + 1)); ++ else if (group) + status = nfsd_map_gid_to_name(rqstp, id, (u8 *)(*p + 1)); + else + status = nfsd_map_uid_to_name(rqstp, id, (u8 *)(*p + 1)); + if (status < 0) + return nfserrno(status); +- len = (unsigned)status; +- *(*p)++ = htonl(len); +- memset((u8 *)*p + len, 0, xdr_padding(len)); +- *p += XDR_QUADLEN(len); +- *buflen -= (XDR_QUADLEN(len) << 2) + 4; ++ *p = xdr_encode_opaque(*p, NULL, status); ++ *buflen -= (XDR_QUADLEN(status) << 2) + 4; + BUG_ON(*buflen < 0); + return 0; + } +@@ -1321,13 +1374,20 @@ nfsd4_encode_name(struct svc_rqst *rqstp + static inline int + nfsd4_encode_user(struct svc_rqst *rqstp, uid_t uid, u32 **p, int *buflen) + { +- return nfsd4_encode_name(rqstp, uid, 0, p, buflen); ++ return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, uid, 0, p, buflen); + } + + static inline int + nfsd4_encode_group(struct svc_rqst *rqstp, uid_t gid, u32 **p, int *buflen) + { +- return nfsd4_encode_name(rqstp, gid, 1, p, buflen); ++ return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, gid, 1, p, buflen); ++} ++ ++static inline int ++nfsd4_encode_aclname(struct svc_rqst *rqstp, int whotype, uid_t id, int group, ++ u32 **p, int *buflen) ++{ ++ return nfsd4_encode_name(rqstp, whotype, id, group, p, buflen); + } + + +@@ -1354,6 +1414,8 @@ nfsd4_encode_fattr(struct svc_fh *fhp, s + u64 dummy64; + u32 *p = buffer; + int status; ++ int aclsupport = 0; ++ struct nfs4_acl *acl = NULL; + + BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1); + BUG_ON(bmval0 & ~NFSD_SUPPORTED_ATTRS_WORD0); +@@ -1376,6 
+1438,17 @@ nfsd4_encode_fattr(struct svc_fh *fhp, s + goto out; + fhp = &tempfh; + } ++ if (bmval0 & (FATTR4_WORD0_ACL | FATTR4_WORD0_ACLSUPPORT ++ | FATTR4_WORD0_SUPPORTED_ATTRS)) { ++ status = nfsd4_get_nfs4_acl(rqstp, dentry, &acl); ++ aclsupport = (status == 0); ++ if (bmval0 & FATTR4_WORD0_ACL) { ++ if (status == -EOPNOTSUPP) ++ bmval0 &= ~FATTR4_WORD0_ACL; ++ else if (status != 0) ++ goto out_nfserr; ++ } ++ } + if ((buflen -= 16) < 0) + goto out_resource; + +@@ -1388,7 +1461,9 @@ nfsd4_encode_fattr(struct svc_fh *fhp, s + if ((buflen -= 12) < 0) + goto out_resource; + WRITE32(2); +- WRITE32(NFSD_SUPPORTED_ATTRS_WORD0); ++ WRITE32(aclsupport ? ++ NFSD_SUPPORTED_ATTRS_WORD0 : ++ NFSD_SUPPORTED_ATTRS_WORD0 & ~FATTR4_WORD0_ACL); + WRITE32(NFSD_SUPPORTED_ATTRS_WORD1); + } + if (bmval0 & FATTR4_WORD0_TYPE) { +@@ -1459,10 +1534,44 @@ nfsd4_encode_fattr(struct svc_fh *fhp, s + goto out_resource; + WRITE32(0); + } ++ if (bmval0 & FATTR4_WORD0_ACL) { ++ struct nfs4_ace *ace; ++ struct list_head *h; ++ ++ if (acl == NULL) { ++ if ((buflen -= 4) < 0) ++ goto out_resource; ++ ++ WRITE32(0); ++ goto out_acl; ++ } ++ if ((buflen -= 4) < 0) ++ goto out_resource; ++ WRITE32(acl->naces); ++ ++ list_for_each(h, &acl->ace_head) { ++ ace = list_entry(h, struct nfs4_ace, l_ace); ++ ++ if ((buflen -= 4*3) < 0) ++ goto out_resource; ++ WRITE32(ace->type); ++ WRITE32(ace->flag); ++ WRITE32(ace->access_mask & NFS4_ACE_MASK_ALL); ++ status = nfsd4_encode_aclname(rqstp, ace->whotype, ++ ace->who, ace->flag & NFS4_ACE_IDENTIFIER_GROUP, ++ &p, &buflen); ++ if (status == nfserr_resource) ++ goto out_resource; ++ if (status) ++ goto out; ++ } ++ } ++out_acl: + if (bmval0 & FATTR4_WORD0_ACLSUPPORT) { + if ((buflen -= 4) < 0) + goto out_resource; +- WRITE32(0); ++ WRITE32(aclsupport ? 
++ ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL : 0); + } + if (bmval0 & FATTR4_WORD0_CANSETTIME) { + if ((buflen -= 4) < 0) +@@ -1645,6 +1754,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, s + status = nfs_ok; + + out: ++ nfs4_acl_free(acl); + if (fhp == &tempfh) + fh_put(&tempfh); + return status; +@@ -2471,6 +2581,24 @@ nfs4svc_encode_voidres(struct svc_rqst * + return xdr_ressize_check(rqstp, p); + } + ++void nfsd4_release_compoundargs(struct nfsd4_compoundargs *args) ++{ ++ if (args->ops != args->iops) { ++ kfree(args->ops); ++ args->ops = args->iops; ++ } ++ if (args->tmpp) { ++ kfree(args->tmpp); ++ args->tmpp = NULL; ++ } ++ while (args->to_free) { ++ struct tmpbuf *tb = args->to_free; ++ args->to_free = tb->next; ++ tb->release(tb->buf); ++ kfree(tb); ++ } ++} ++ + int + nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, u32 *p, struct nfsd4_compoundargs *args) + { +@@ -2487,20 +2615,7 @@ nfs4svc_decode_compoundargs(struct svc_r + + status = nfsd4_decode_compound(args); + if (status) { +- if (args->ops != args->iops) { +- kfree(args->ops); +- args->ops = args->iops; +- } +- if (args->tmpp) { +- kfree(args->tmpp); +- args->tmpp = NULL; +- } +- while (args->to_free) { +- struct tmpbuf *tb = args->to_free; +- args->to_free = tb->next; +- kfree(tb->buf); +- kfree(tb); +- } ++ nfsd4_release_compoundargs(args); + } + return !status; + } +--- linux-2.6.7/fs/nfsd/nfs4proc.c.lsec 2004-06-15 23:20:26.000000000 -0600 ++++ linux-2.6.7/fs/nfsd/nfs4proc.c 2005-03-23 14:28:24.080388920 -0700 +@@ -52,6 +52,7 @@ + #include + #include + #include ++#include + + #define NFSDDBG_FACILITY NFSDDBG_PROC + +@@ -135,9 +136,11 @@ do_open_fhandle(struct svc_rqst *rqstp, + { + int status; + +- dprintk("NFSD: do_open_fhandle\n"); ++ /* Only reclaims from previously confirmed clients are valid */ ++ if ((status = nfs4_check_open_reclaim(&open->op_clientid))) ++ return status; + +- /* we don't know the target directory, and therefore can not ++ /* We don't know the target directory, and 
therefore can not + * set the change info + */ + +@@ -172,8 +175,7 @@ nfsd4_open(struct svc_rqst *rqstp, struc + if (nfs4_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) + return nfserr_grace; + +- if (nfs4_in_no_grace() && +- open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) ++ if (!nfs4_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) + return nfserr_no_grace; + + /* This check required by spec. */ +@@ -318,7 +320,7 @@ nfsd4_commit(struct svc_rqst *rqstp, str + return status; + } + +-static inline int ++static int + nfsd4_create(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_create *create) + { + struct svc_fh resfh; +@@ -435,7 +437,7 @@ nfsd4_link(struct svc_rqst *rqstp, struc + return status; + } + +-static inline int ++static int + nfsd4_lookupp(struct svc_rqst *rqstp, struct svc_fh *current_fh) + { + struct svc_fh tmp_fh; +@@ -619,7 +621,7 @@ nfsd4_setattr(struct svc_rqst *rqstp, st + status = nfserr_bad_stateid; + if (ZERO_STATEID(&setattr->sa_stateid) || ONE_STATEID(&setattr->sa_stateid)) { + dprintk("NFSD: nfsd4_setattr: magic stateid!\n"); +- return status; ++ goto out; + } + + nfs4_lock_state(); +@@ -627,17 +629,25 @@ nfsd4_setattr(struct svc_rqst *rqstp, st + &setattr->sa_stateid, + CHECK_FH | RDWR_STATE, &stp))) { + dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); +- goto out; ++ goto out_unlock; + } + status = nfserr_openmode; + if (!access_bits_permit_write(stp->st_access_bmap)) { + dprintk("NFSD: nfsd4_setattr: not opened for write!\n"); +- goto out; ++ goto out_unlock; + } + nfs4_unlock_state(); + } +- return (nfsd_setattr(rqstp, current_fh, &setattr->sa_iattr, 0, (time_t)0)); ++ status = nfs_ok; ++ if (setattr->sa_acl != NULL) ++ status = nfsd4_set_nfs4_acl(rqstp, current_fh, setattr->sa_acl); ++ if (status) ++ goto out; ++ status = nfsd_setattr(rqstp, current_fh, &setattr->sa_iattr, ++ 0, (time_t)0); + out: ++ return status; ++out_unlock: + nfs4_unlock_state(); + return status; + } +@@ 
-773,13 +783,20 @@ nfsd4_proc_compound(struct svc_rqst *rqs + struct nfsd4_compoundres *resp) + { + struct nfsd4_op *op; +- struct svc_fh current_fh; +- struct svc_fh save_fh; ++ struct svc_fh *current_fh = NULL; ++ struct svc_fh *save_fh = NULL; + int slack_space; /* in words, not bytes! */ + int status; + +- fh_init(¤t_fh, NFS4_FHSIZE); +- fh_init(&save_fh, NFS4_FHSIZE); ++ status = nfserr_resource; ++ current_fh = kmalloc(sizeof(*current_fh), GFP_KERNEL); ++ if (current_fh == NULL) ++ goto out; ++ fh_init(current_fh, NFS4_FHSIZE); ++ save_fh = kmalloc(sizeof(*save_fh), GFP_KERNEL); ++ if (save_fh == NULL) ++ goto out; ++ fh_init(save_fh, NFS4_FHSIZE); + + resp->xbuf = &rqstp->rq_res; + resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len; +@@ -831,7 +848,7 @@ nfsd4_proc_compound(struct svc_rqst *rqs + * SETATTR NOFILEHANDLE error handled in nfsd4_setattr + * due to required returned bitmap argument + */ +- if ((!current_fh.fh_dentry) && ++ if ((!current_fh->fh_dentry) && + !((op->opnum == OP_PUTFH) || (op->opnum == OP_PUTROOTFH) || + (op->opnum == OP_SETCLIENTID) || + (op->opnum == OP_SETCLIENTID_CONFIRM) || +@@ -843,105 +860,105 @@ nfsd4_proc_compound(struct svc_rqst *rqs + } + switch (op->opnum) { + case OP_ACCESS: +- op->status = nfsd4_access(rqstp, ¤t_fh, &op->u.access); ++ op->status = nfsd4_access(rqstp, current_fh, &op->u.access); + break; + case OP_CLOSE: +- op->status = nfsd4_close(rqstp, ¤t_fh, &op->u.close); ++ op->status = nfsd4_close(rqstp, current_fh, &op->u.close); + if (op->u.close.cl_stateowner) + op->replay = + &op->u.close.cl_stateowner->so_replay; + break; + case OP_COMMIT: +- op->status = nfsd4_commit(rqstp, ¤t_fh, &op->u.commit); ++ op->status = nfsd4_commit(rqstp, current_fh, &op->u.commit); + break; + case OP_CREATE: +- op->status = nfsd4_create(rqstp, ¤t_fh, &op->u.create); ++ op->status = nfsd4_create(rqstp, current_fh, &op->u.create); + break; + case OP_GETATTR: +- op->status = nfsd4_getattr(rqstp, ¤t_fh, 
&op->u.getattr); ++ op->status = nfsd4_getattr(rqstp, current_fh, &op->u.getattr); + break; + case OP_GETFH: +- op->status = nfsd4_getfh(¤t_fh, &op->u.getfh); ++ op->status = nfsd4_getfh(current_fh, &op->u.getfh); + break; + case OP_LINK: +- op->status = nfsd4_link(rqstp, ¤t_fh, &save_fh, &op->u.link); ++ op->status = nfsd4_link(rqstp, current_fh, save_fh, &op->u.link); + break; + case OP_LOCK: +- op->status = nfsd4_lock(rqstp, ¤t_fh, &op->u.lock); ++ op->status = nfsd4_lock(rqstp, current_fh, &op->u.lock); + if (op->u.lock.lk_stateowner) + op->replay = + &op->u.lock.lk_stateowner->so_replay; + break; + case OP_LOCKT: +- op->status = nfsd4_lockt(rqstp, ¤t_fh, &op->u.lockt); ++ op->status = nfsd4_lockt(rqstp, current_fh, &op->u.lockt); + break; + case OP_LOCKU: +- op->status = nfsd4_locku(rqstp, ¤t_fh, &op->u.locku); ++ op->status = nfsd4_locku(rqstp, current_fh, &op->u.locku); + if (op->u.locku.lu_stateowner) + op->replay = + &op->u.locku.lu_stateowner->so_replay; + break; + case OP_LOOKUP: +- op->status = nfsd4_lookup(rqstp, ¤t_fh, &op->u.lookup); ++ op->status = nfsd4_lookup(rqstp, current_fh, &op->u.lookup); + break; + case OP_LOOKUPP: +- op->status = nfsd4_lookupp(rqstp, ¤t_fh); ++ op->status = nfsd4_lookupp(rqstp, current_fh); + break; + case OP_NVERIFY: +- op->status = nfsd4_verify(rqstp, ¤t_fh, &op->u.nverify); ++ op->status = nfsd4_verify(rqstp, current_fh, &op->u.nverify); + if (op->status == nfserr_not_same) + op->status = nfs_ok; + break; + case OP_OPEN: +- op->status = nfsd4_open(rqstp, ¤t_fh, &op->u.open); ++ op->status = nfsd4_open(rqstp, current_fh, &op->u.open); + if (op->u.open.op_stateowner) + op->replay = + &op->u.open.op_stateowner->so_replay; + break; + case OP_OPEN_CONFIRM: +- op->status = nfsd4_open_confirm(rqstp, ¤t_fh, &op->u.open_confirm); ++ op->status = nfsd4_open_confirm(rqstp, current_fh, &op->u.open_confirm); + if (op->u.open_confirm.oc_stateowner) + op->replay = + &op->u.open_confirm.oc_stateowner->so_replay; + break; + case 
OP_OPEN_DOWNGRADE: +- op->status = nfsd4_open_downgrade(rqstp, ¤t_fh, &op->u.open_downgrade); ++ op->status = nfsd4_open_downgrade(rqstp, current_fh, &op->u.open_downgrade); + if (op->u.open_downgrade.od_stateowner) + op->replay = + &op->u.open_downgrade.od_stateowner->so_replay; + break; + case OP_PUTFH: +- op->status = nfsd4_putfh(rqstp, ¤t_fh, &op->u.putfh); ++ op->status = nfsd4_putfh(rqstp, current_fh, &op->u.putfh); + break; + case OP_PUTROOTFH: +- op->status = nfsd4_putrootfh(rqstp, ¤t_fh); ++ op->status = nfsd4_putrootfh(rqstp, current_fh); + break; + case OP_READ: +- op->status = nfsd4_read(rqstp, ¤t_fh, &op->u.read); ++ op->status = nfsd4_read(rqstp, current_fh, &op->u.read); + break; + case OP_READDIR: +- op->status = nfsd4_readdir(rqstp, ¤t_fh, &op->u.readdir); ++ op->status = nfsd4_readdir(rqstp, current_fh, &op->u.readdir); + break; + case OP_READLINK: +- op->status = nfsd4_readlink(rqstp, ¤t_fh, &op->u.readlink); ++ op->status = nfsd4_readlink(rqstp, current_fh, &op->u.readlink); + break; + case OP_REMOVE: +- op->status = nfsd4_remove(rqstp, ¤t_fh, &op->u.remove); ++ op->status = nfsd4_remove(rqstp, current_fh, &op->u.remove); + break; + case OP_RENAME: +- op->status = nfsd4_rename(rqstp, ¤t_fh, &save_fh, &op->u.rename); ++ op->status = nfsd4_rename(rqstp, current_fh, save_fh, &op->u.rename); + break; + case OP_RENEW: + op->status = nfsd4_renew(&op->u.renew); + break; + case OP_RESTOREFH: +- op->status = nfsd4_restorefh(¤t_fh, &save_fh); ++ op->status = nfsd4_restorefh(current_fh, save_fh); + break; + case OP_SAVEFH: +- op->status = nfsd4_savefh(¤t_fh, &save_fh); ++ op->status = nfsd4_savefh(current_fh, save_fh); + break; + case OP_SETATTR: +- op->status = nfsd4_setattr(rqstp, ¤t_fh, &op->u.setattr); ++ op->status = nfsd4_setattr(rqstp, current_fh, &op->u.setattr); + break; + case OP_SETCLIENTID: + op->status = nfsd4_setclientid(rqstp, &op->u.setclientid); +@@ -950,12 +967,12 @@ nfsd4_proc_compound(struct svc_rqst *rqs + op->status = 
nfsd4_setclientid_confirm(rqstp, &op->u.setclientid_confirm); + break; + case OP_VERIFY: +- op->status = nfsd4_verify(rqstp, ¤t_fh, &op->u.verify); ++ op->status = nfsd4_verify(rqstp, current_fh, &op->u.verify); + if (op->status == nfserr_same) + op->status = nfs_ok; + break; + case OP_WRITE: +- op->status = nfsd4_write(rqstp, ¤t_fh, &op->u.write); ++ op->status = nfsd4_write(rqstp, current_fh, &op->u.write); + break; + case OP_RELEASE_LOCKOWNER: + op->status = nfsd4_release_lockowner(rqstp, &op->u.release_lockowner); +@@ -976,22 +993,13 @@ encode_op: + } + + out: +- if (args->ops != args->iops) { +- kfree(args->ops); +- args->ops = args->iops; +- } +- if (args->tmpp) { +- kfree(args->tmpp); +- args->tmpp = NULL; +- } +- while (args->to_free) { +- struct tmpbuf *tb = args->to_free; +- args->to_free = tb->next; +- kfree(tb->buf); +- kfree(tb); +- } +- fh_put(¤t_fh); +- fh_put(&save_fh); ++ nfsd4_release_compoundargs(args); ++ if (current_fh) ++ fh_put(current_fh); ++ kfree(current_fh); ++ if (save_fh) ++ fh_put(save_fh); ++ kfree(save_fh); + return status; + } + +--- linux-2.6.7/fs/nfsd/nfs4state.c.lsec 2004-06-15 23:19:43.000000000 -0600 ++++ linux-2.6.7/fs/nfsd/nfs4state.c 2005-03-23 14:28:24.028396824 -0700 +@@ -51,6 +51,9 @@ + #define NFSDDBG_FACILITY NFSDDBG_PROC + + /* Globals */ ++static time_t lease_time = 90; /* default lease time */ ++static time_t old_lease_time = 90; /* past incarnation lease time */ ++static u32 nfs4_reclaim_init = 0; + time_t boot_time; + static time_t grace_end = 0; + static u32 current_clientid = 1; +@@ -82,7 +85,7 @@ struct nfs4_stateid * find_stateid(state + * protects clientid_hashtbl[], clientstr_hashtbl[], + * unconfstr_hashtbl[], uncofid_hashtbl[]. 
+ */ +-static struct semaphore client_sema; ++static DECLARE_MUTEX(client_sema); + + void + nfs4_lock_state(void) +@@ -131,8 +134,11 @@ static void release_file(struct nfs4_fil + ((id) & CLIENT_HASH_MASK) + #define clientstr_hashval(name, namelen) \ + (opaque_hashval((name), (namelen)) & CLIENT_HASH_MASK) +- +-/* conf_id_hashtbl[], and conf_str_hashtbl[] hold confirmed ++/* ++ * reclaim_str_hashtbl[] holds known client info from previous reset/reboot ++ * used in reboot/reset lease grace period processing ++ * ++ * conf_id_hashtbl[], and conf_str_hashtbl[] hold confirmed + * setclientid_confirmed info. + * + * unconf_str_hastbl[] and unconf_id_hashtbl[] hold unconfirmed +@@ -144,6 +150,8 @@ static void release_file(struct nfs4_fil + * close_lru holds (open) stateowner queue ordered by nfs4_stateowner.so_time + * for last close replay. + */ ++static struct list_head reclaim_str_hashtbl[CLIENT_HASH_SIZE]; ++static int reclaim_str_hashtbl_size; + static struct list_head conf_id_hashtbl[CLIENT_HASH_SIZE]; + static struct list_head conf_str_hashtbl[CLIENT_HASH_SIZE]; + static struct list_head unconf_str_hashtbl[CLIENT_HASH_SIZE]; +@@ -208,12 +216,20 @@ free_client(struct nfs4_client *clp) + kfree(clp); + } + +-static void ++void ++put_nfs4_client(struct nfs4_client *clp) ++{ ++ if (atomic_dec_and_test(&clp->cl_count)) ++ free_client(clp); ++} ++ ++void + expire_client(struct nfs4_client *clp) + { + struct nfs4_stateowner *sop; + +- dprintk("NFSD: expire_client\n"); ++ dprintk("NFSD: expire_client cl_count %d\n", ++ atomic_read(&clp->cl_count)); + list_del(&clp->cl_idhash); + list_del(&clp->cl_strhash); + list_del(&clp->cl_lru); +@@ -221,7 +237,7 @@ expire_client(struct nfs4_client *clp) + sop = list_entry(clp->cl_perclient.next, struct nfs4_stateowner, so_perclient); + release_stateowner(sop); + } +- free_client(clp); ++ put_nfs4_client(clp); + } + + static struct nfs4_client * +@@ -230,6 +246,7 @@ create_client(struct xdr_netobj name) { + + if(!(clp = 
alloc_client(name))) + goto out; ++ atomic_set(&clp->cl_count, 1); + INIT_LIST_HEAD(&clp->cl_idhash); + INIT_LIST_HEAD(&clp->cl_strhash); + INIT_LIST_HEAD(&clp->cl_perclient); +@@ -339,6 +356,99 @@ move_to_confirmed(struct nfs4_client *cl + renew_client(clp); + } + ++ ++/* a helper function for parse_callback */ ++static int ++parse_octet(unsigned int *lenp, char **addrp) ++{ ++ unsigned int len = *lenp; ++ char *p = *addrp; ++ int n = -1; ++ char c; ++ ++ for (;;) { ++ if (!len) ++ break; ++ len--; ++ c = *p++; ++ if (c == '.') ++ break; ++ if ((c < '0') || (c > '9')) { ++ n = -1; ++ break; ++ } ++ if (n < 0) ++ n = 0; ++ n = (n * 10) + (c - '0'); ++ if (n > 255) { ++ n = -1; ++ break; ++ } ++ } ++ *lenp = len; ++ *addrp = p; ++ return n; ++} ++ ++/* parse and set the setclientid ipv4 callback address */ ++int ++parse_ipv4(unsigned int addr_len, char *addr_val, unsigned int *cbaddrp, unsigned short *cbportp) ++{ ++ int temp = 0; ++ u32 cbaddr = 0; ++ u16 cbport = 0; ++ u32 addrlen = addr_len; ++ char *addr = addr_val; ++ int i, shift; ++ ++ /* ipaddress */ ++ shift = 24; ++ for(i = 4; i > 0 ; i--) { ++ if ((temp = parse_octet(&addrlen, &addr)) < 0) { ++ return 0; ++ } ++ cbaddr |= (temp << shift); ++ if(shift > 0) ++ shift -= 8; ++ } ++ *cbaddrp = cbaddr; ++ ++ /* port */ ++ shift = 8; ++ for(i = 2; i > 0 ; i--) { ++ if ((temp = parse_octet(&addrlen, &addr)) < 0) { ++ return 0; ++ } ++ cbport |= (temp << shift); ++ if(shift > 0) ++ shift -= 8; ++ } ++ *cbportp = cbport; ++ return 1; ++} ++ ++void ++gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se) ++{ ++ struct nfs4_callback *cb = &clp->cl_callback; ++ ++ if( !(parse_ipv4(se->se_callback_addr_len, se->se_callback_addr_val, ++ &cb->cb_addr, &cb->cb_port))) { ++ printk(KERN_INFO "NFSD: BAD callback address. 
client will not receive delegations\n"); ++ printk(KERN_INFO "NFSD: this client (clientid %08x/%08x) " ++ "will not receive delegations\n", ++ clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); ++ ++ cb->cb_parsed = 0; ++ return; ++ } ++ cb->cb_netid.len = se->se_callback_netid_len; ++ cb->cb_netid.data = se->se_callback_netid_val; ++ cb->cb_prog = se->se_callback_prog; ++ cb->cb_ident = se->se_callback_ident; ++ cb->cb_parsed = 1; ++} ++ + /* + * RFC 3010 has a complex implmentation description of processing a + * SETCLIENTID request consisting of 5 bullets, labeled as +@@ -450,6 +560,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp + copy_cred(&new->cl_cred,&rqstp->rq_cred); + gen_clid(new); + gen_confirm(new); ++ gen_callback(new, setclid); + add_to_unconfirmed(new, strhashval); + } else if (cmp_verf(&conf->cl_verifier, &clverifier)) { + /* +@@ -477,6 +588,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp + copy_cred(&new->cl_cred,&rqstp->rq_cred); + copy_clid(new, conf); + gen_confirm(new); ++ gen_callback(new, setclid); + add_to_unconfirmed(new,strhashval); + } else if (!unconf) { + /* +@@ -494,6 +606,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp + copy_cred(&new->cl_cred,&rqstp->rq_cred); + gen_clid(new); + gen_confirm(new); ++ gen_callback(new, setclid); + add_to_unconfirmed(new, strhashval); + } else if (!cmp_verf(&conf->cl_confirm, &unconf->cl_confirm)) { + /* +@@ -519,6 +632,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp + copy_cred(&new->cl_cred,&rqstp->rq_cred); + gen_clid(new); + gen_confirm(new); ++ gen_callback(new, setclid); + add_to_unconfirmed(new, strhashval); + } else { + /* No cases hit !!! 
*/ +@@ -529,7 +643,6 @@ nfsd4_setclientid(struct svc_rqst *rqstp + setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot; + setclid->se_clientid.cl_id = new->cl_clientid.cl_id; + memcpy(setclid->se_confirm.data, new->cl_confirm.data, sizeof(setclid->se_confirm.data)); +- printk(KERN_INFO "NFSD: this client will not receive delegations\n"); + status = nfs_ok; + out: + nfs4_unlock_state(); +@@ -575,7 +688,7 @@ nfsd4_setclientid_confirm(struct svc_rqs + * not been found. + */ + if (clp->cl_addr != ip_addr) { +- printk("NFSD: setclientid: string in use by client" ++ dprintk("NFSD: setclientid: string in use by client" + "(clientid %08x/%08x)\n", + clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); + goto out; +@@ -588,7 +701,7 @@ nfsd4_setclientid_confirm(struct svc_rqs + continue; + status = nfserr_inval; + if (clp->cl_addr != ip_addr) { +- printk("NFSD: setclientid: string in use by client" ++ dprintk("NFSD: setclientid: string in use by client" + "(clientid %08x/%08x)\n", + clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); + goto out; +@@ -610,6 +723,7 @@ nfsd4_setclientid_confirm(struct svc_rqs + status = nfserr_clid_inuse; + else { + expire_client(conf); ++ clp = unconf; + move_to_confirmed(unconf, idhashval); + status = nfs_ok; + } +@@ -627,6 +741,7 @@ nfsd4_setclientid_confirm(struct svc_rqs + if (!cmp_creds(&conf->cl_cred,&rqstp->rq_cred)) { + status = nfserr_clid_inuse; + } else { ++ clp = conf; + status = nfs_ok; + } + goto out; +@@ -641,6 +756,7 @@ nfsd4_setclientid_confirm(struct svc_rqs + status = nfserr_clid_inuse; + } else { + status = nfs_ok; ++ clp = unconf; + move_to_confirmed(unconf, idhashval); + } + goto out; +@@ -660,7 +776,9 @@ nfsd4_setclientid_confirm(struct svc_rqs + status = nfserr_inval; + goto out; + out: +- /* XXX if status == nfs_ok, probe callback path */ ++ if (!status) ++ nfsd4_probe_callback(clp); ++ + nfs4_unlock_state(); + return status; + } +@@ -1510,10 +1628,12 @@ nfs4_preprocess_seqid_op(struct svc_fh * + + status = 
nfserr_bad_stateid; + +- /* for new lock stateowners, check that the lock->v.new.open_stateid +- * refers to an open stateowner, and that the lockclid +- * (nfs4_lock->v.new.clientid) is the same as the +- * open_stateid->st_stateowner->so_client->clientid ++ /* for new lock stateowners: ++ * check that the lock->v.new.open_stateid ++ * refers to an open stateowner ++ * ++ * check that the lockclid (nfs4_lock->v.new.clientid) is the same ++ * as the open_stateid->st_stateowner->so_client->clientid + */ + if (lockclid) { + struct nfs4_stateowner *sop = stp->st_stateowner; +@@ -1599,6 +1719,17 @@ check_replay: + } + + /* ++ * eventually, this will perform an upcall to the 'state daemon' as well as ++ * set the cl_first_state field. ++ */ ++void ++first_state(struct nfs4_client *clp) ++{ ++ if (!clp->cl_first_state) ++ clp->cl_first_state = get_seconds(); ++} ++ ++/* + * nfs4_unlock_state(); called in encode + */ + int +@@ -1635,6 +1766,7 @@ nfsd4_open_confirm(struct svc_rqst *rqst + stp->st_stateid.si_fileid, + stp->st_stateid.si_generation); + status = nfs_ok; ++ first_state(sop->so_client); + out: + return status; + } +@@ -1850,6 +1982,21 @@ nfs4_set_lock_denied(struct file_lock *f + deny->ld_type = NFS4_WRITE_LT; + } + ++static struct nfs4_stateowner * ++find_lockstateowner(struct xdr_netobj *owner, clientid_t *clid) ++{ ++ struct nfs4_stateowner *local = NULL; ++ int i; ++ ++ for (i = 0; i < LOCK_HASH_SIZE; i++) { ++ list_for_each_entry(local, &lock_ownerid_hashtbl[i], so_idhash) { ++ if(!cmp_owner_str(local, owner, clid)) ++ continue; ++ return local; ++ } ++ } ++ return NULL; ++} + + static int + find_lockstateowner_str(unsigned int hashval, struct xdr_netobj *owner, clientid_t *clid, struct nfs4_stateowner **op) { +@@ -1969,7 +2116,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struc + + if (nfs4_in_grace() && !lock->lk_reclaim) + return nfserr_grace; +- if (nfs4_in_no_grace() && lock->lk_reclaim) ++ if (!nfs4_in_grace() && lock->lk_reclaim) + return nfserr_no_grace; 
+ + if (check_lock_length(lock->lk_offset, lock->lk_length)) +@@ -1992,7 +2139,11 @@ nfsd4_lock(struct svc_rqst *rqstp, struc + printk("NFSD: nfsd4_lock: clientid is stale!\n"); + goto out; + } +- /* does the clientid in the lock owner own the open stateid? */ ++ ++ /* is the new lock seqid presented by the client zero? */ ++ status = nfserr_bad_seqid; ++ if (lock->v.new.lock_seqid != 0) ++ goto out; + + /* validate and update open stateid and open seqid */ + status = nfs4_preprocess_seqid_op(current_fh, +@@ -2011,15 +2162,15 @@ nfsd4_lock(struct svc_rqst *rqstp, struc + strhashval = lock_ownerstr_hashval(fp->fi_inode, + open_sop->so_client->cl_clientid.cl_id, + lock->v.new.owner); +- + /* + * If we already have this lock owner, the client is in + * error (or our bookeeping is wrong!) + * for asking for a 'new lock'. + */ + status = nfserr_bad_stateid; +- if (find_lockstateowner_str(strhashval, &lock->v.new.owner, +- &lock->v.new.clientid, &lock_sop)) ++ lock_sop = find_lockstateowner(&lock->v.new.owner, ++ &lock->v.new.clientid); ++ if (lock_sop) + goto out; + status = nfserr_resource; + if (!(lock->lk_stateowner = alloc_init_lock_stateowner(strhashval, open_sop->so_client, open_stp, lock))) +@@ -2315,7 +2466,7 @@ nfsd4_release_lockowner(struct svc_rqst + clientid_t *clid = &rlockowner->rl_clientid; + struct nfs4_stateowner *local = NULL; + struct xdr_netobj *owner = &rlockowner->rl_owner; +- int status, i; ++ int status; + + dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", + clid->cl_boot, clid->cl_id); +@@ -2330,34 +2481,136 @@ nfsd4_release_lockowner(struct svc_rqst + + nfs4_lock_state(); + +- /* find the lockowner */ + status = nfs_ok; +- for (i=0; i < LOCK_HASH_SIZE; i++) +- list_for_each_entry(local, &lock_ownerstr_hashtbl[i], so_strhash) +- if(cmp_owner_str(local, owner, clid)) { +- struct nfs4_stateid *stp; +- +- /* check for any locks held by any stateid +- * associated with the (lock) stateowner */ +- status = nfserr_locks_held; +- 
list_for_each_entry(stp, &local->so_perfilestate, +- st_perfilestate) { +- if(stp->st_vfs_set) { +- if (check_for_locks(&stp->st_vfs_file, +- local)) +- goto out; +- } +- } +- /* no locks held by (lock) stateowner */ +- status = nfs_ok; +- release_stateowner(local); +- goto out; ++ local = find_lockstateowner(owner, clid); ++ if (local) { ++ struct nfs4_stateid *stp; ++ ++ /* check for any locks held by any stateid ++ * associated with the (lock) stateowner */ ++ status = nfserr_locks_held; ++ list_for_each_entry(stp, &local->so_perfilestate, ++ st_perfilestate) { ++ if(stp->st_vfs_set) { ++ if (check_for_locks(&stp->st_vfs_file, local)) ++ goto out; + } ++ } ++ /* no locks held by (lock) stateowner */ ++ status = nfs_ok; ++ release_stateowner(local); ++ } + out: + nfs4_unlock_state(); + return status; + } + ++static inline struct nfs4_client_reclaim * ++alloc_reclaim(int namelen) ++{ ++ struct nfs4_client_reclaim *crp = NULL; ++ ++ crp = kmalloc(sizeof(struct nfs4_client_reclaim), GFP_KERNEL); ++ if (!crp) ++ return NULL; ++ crp->cr_name.data = kmalloc(namelen, GFP_KERNEL); ++ if (!crp->cr_name.data) { ++ kfree(crp); ++ return NULL; ++ } ++ return crp; ++} ++ ++/* ++ * failure => all reset bets are off, nfserr_no_grace... 
++ */ ++static int ++nfs4_client_to_reclaim(struct nfs4_client *clp) ++{ ++ unsigned int strhashval; ++ struct nfs4_client_reclaim *crp = NULL; ++ ++ crp = alloc_reclaim(clp->cl_name.len); ++ if (!crp) ++ return 0; ++ strhashval = clientstr_hashval(clp->cl_name.data, clp->cl_name.len); ++ INIT_LIST_HEAD(&crp->cr_strhash); ++ list_add(&crp->cr_strhash, &reclaim_str_hashtbl[strhashval]); ++ memcpy(crp->cr_name.data, clp->cl_name.data, clp->cl_name.len); ++ crp->cr_name.len = clp->cl_name.len; ++ crp->cr_first_state = clp->cl_first_state; ++ crp->cr_expired = 0; ++ return 1; ++} ++ ++static void ++nfs4_release_reclaim(void) ++{ ++ struct nfs4_client_reclaim *crp = NULL; ++ int i; ++ ++ BUG_ON(!nfs4_reclaim_init); ++ for (i = 0; i < CLIENT_HASH_SIZE; i++) { ++ while (!list_empty(&reclaim_str_hashtbl[i])) { ++ crp = list_entry(reclaim_str_hashtbl[i].next, ++ struct nfs4_client_reclaim, cr_strhash); ++ list_del(&crp->cr_strhash); ++ kfree(crp->cr_name.data); ++ kfree(crp); ++ reclaim_str_hashtbl_size--; ++ } ++ } ++ BUG_ON(reclaim_str_hashtbl_size); ++} ++ ++/* ++ * called from OPEN, CLAIM_PREVIOUS with a new clientid. */ ++struct nfs4_client_reclaim * ++nfs4_find_reclaim_client(clientid_t *clid) ++{ ++ unsigned int idhashval = clientid_hashval(clid->cl_id); ++ unsigned int strhashval; ++ struct nfs4_client *clp, *client = NULL; ++ struct nfs4_client_reclaim *crp = NULL; ++ ++ ++ /* find clientid in conf_id_hashtbl */ ++ list_for_each_entry(clp, &conf_id_hashtbl[idhashval], cl_idhash) { ++ if (cmp_clid(&clp->cl_clientid, clid)) { ++ client = clp; ++ break; ++ } ++ } ++ if (!client) ++ return NULL; ++ ++ /* find clp->cl_name in reclaim_str_hashtbl */ ++ strhashval = clientstr_hashval(client->cl_name.data, ++ client->cl_name.len); ++ list_for_each_entry(crp, &reclaim_str_hashtbl[strhashval], cr_strhash) { ++ if(cmp_name(&crp->cr_name, &client->cl_name)) { ++ return crp; ++ } ++ } ++ return NULL; ++} ++ ++/* ++* Called from OPEN. Look for clientid in reclaim list. 
++*/ ++int ++nfs4_check_open_reclaim(clientid_t *clid) ++{ ++ struct nfs4_client_reclaim *crp; ++ ++ if ((crp = nfs4_find_reclaim_client(clid)) == NULL) ++ return nfserr_reclaim_bad; ++ if (crp->cr_expired) ++ return nfserr_no_grace; ++ return nfs_ok; ++} ++ ++ + /* + * Start and stop routines + */ +@@ -2366,10 +2619,16 @@ void + nfs4_state_init(void) + { + int i; +- time_t start = get_seconds(); ++ time_t grace_time; + + if (nfs4_init) + return; ++ if (!nfs4_reclaim_init) { ++ for (i = 0; i < CLIENT_HASH_SIZE; i++) ++ INIT_LIST_HEAD(&reclaim_str_hashtbl[i]); ++ reclaim_str_hashtbl_size = 0; ++ nfs4_reclaim_init = 1; ++ } + for (i = 0; i < CLIENT_HASH_SIZE; i++) { + INIT_LIST_HEAD(&conf_id_hashtbl[i]); + INIT_LIST_HEAD(&conf_str_hashtbl[i]); +@@ -2396,27 +2655,36 @@ nfs4_state_init(void) + + INIT_LIST_HEAD(&close_lru); + INIT_LIST_HEAD(&client_lru); +- init_MUTEX(&client_sema); +- boot_time = start; +- grace_end = start + NFSD_LEASE_TIME; ++ boot_time = get_seconds(); ++ grace_time = max(old_lease_time, lease_time); ++ if (reclaim_str_hashtbl_size == 0) ++ grace_time = 0; ++ if (grace_time) ++ printk("NFSD: starting %ld-second grace period\n", grace_time); ++ grace_end = boot_time + grace_time; + INIT_WORK(&laundromat_work,laundromat_main, NULL); + schedule_delayed_work(&laundromat_work, NFSD_LEASE_TIME*HZ); + nfs4_init = 1; +- + } + + int + nfs4_in_grace(void) + { +- return time_before(get_seconds(), (unsigned long)grace_end); ++ return get_seconds() < grace_end; + } + +-int +-nfs4_in_no_grace(void) ++void ++set_no_grace(void) + { +- return (grace_end < get_seconds()); ++ printk("NFSD: ERROR in reboot recovery. 
State reclaims will fail.\n"); ++ grace_end = get_seconds(); + } + ++time_t ++nfs4_lease_time(void) ++{ ++ return lease_time; ++} + + static void + __nfs4_state_shutdown(void) +@@ -2454,6 +2722,61 @@ void + nfs4_state_shutdown(void) + { + nfs4_lock_state(); ++ nfs4_release_reclaim(); + __nfs4_state_shutdown(); + nfs4_unlock_state(); + } ++ ++/* ++ * Called when leasetime is changed. ++ * ++ * if nfsd is not started, simply set the global lease. ++ * ++ * if nfsd(s) are running, lease change requires nfsv4 state to be reset. ++ * e.g: boot_time is reset, existing nfs4_client structs are ++ * used to fill reclaim_str_hashtbl, then all state (except for the ++ * reclaim_str_hashtbl) is re-initialized. ++ * ++ * if the old lease time is greater than the new lease time, the grace ++ * period needs to be set to the old lease time to allow clients to reclaim ++ * their state. XXX - we may want to set the grace period == lease time ++ * after an initial grace period == old lease time ++ * ++ * if an error occurs in this process, the new lease is set, but the server ++ * will not honor OPEN or LOCK reclaims, and will return nfserr_no_grace ++ * which means OPEN/LOCK/READ/WRITE will fail during grace period. ++ * ++ * clients will attempt to reset all state with SETCLIENTID/CONFIRM, and ++ * OPEN and LOCK reclaims. 
++ */ ++void ++nfs4_reset_lease(time_t leasetime) ++{ ++ struct nfs4_client *clp; ++ int i; ++ ++ printk("NFSD: New leasetime %ld\n",leasetime); ++ if (!nfs4_init) ++ return; ++ nfs4_lock_state(); ++ old_lease_time = lease_time; ++ lease_time = leasetime; ++ ++ nfs4_release_reclaim(); ++ ++ /* populate reclaim_str_hashtbl with current confirmed nfs4_clientid */ ++ for (i = 0; i < CLIENT_HASH_SIZE; i++) { ++ list_for_each_entry(clp, &conf_id_hashtbl[i], cl_idhash) { ++ if (!nfs4_client_to_reclaim(clp)) { ++ nfs4_release_reclaim(); ++ goto init_state; ++ } ++ reclaim_str_hashtbl_size++; ++ } ++ } ++init_state: ++ __nfs4_state_shutdown(); ++ nfs4_state_init(); ++ nfs4_unlock_state(); ++} ++ +--- linux-2.6.7/fs/nfsd/vfs.c.lsec 2004-06-15 23:19:13.000000000 -0600 ++++ linux-2.6.7/fs/nfsd/vfs.c 2005-03-23 14:28:24.520322040 -0700 +@@ -44,6 +44,16 @@ + #include + #include + #include ++#ifdef CONFIG_NFSD_V4 ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#endif /* CONFIG_NFSD_V4 */ + + #include + +@@ -344,6 +354,177 @@ out_nfserr: + goto out; + } + ++#if defined(CONFIG_NFSD_V4) ++ ++static int ++set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key) ++{ ++ int len; ++ size_t buflen; ++ char *buf = NULL; ++ int error = 0; ++ struct inode *inode = dentry->d_inode; ++ ++ buflen = posix_acl_xattr_size(pacl->a_count); ++ buf = kmalloc(buflen, GFP_KERNEL); ++ error = -ENOMEM; ++ if (buf == NULL) ++ goto out; ++ ++ len = posix_acl_to_xattr(pacl, buf, buflen); ++ if (len < 0) { ++ error = len; ++ goto out; ++ } ++ ++ error = -EOPNOTSUPP; ++ if (inode->i_op && inode->i_op->setxattr) { ++ down(&inode->i_sem); ++ security_inode_setxattr(dentry, key, buf, len, 0); ++ error = inode->i_op->setxattr(dentry, key, buf, len, 0); ++ if (!error) ++ security_inode_post_setxattr(dentry, key, buf, len, 0); ++ up(&inode->i_sem); ++ } ++out: ++ kfree(buf); ++ return (error); ++} ++ ++int ++nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, 
struct svc_fh *fhp, ++ struct nfs4_acl *acl) ++{ ++ int error; ++ struct dentry *dentry; ++ struct inode *inode; ++ struct posix_acl *pacl = NULL, *dpacl = NULL; ++ unsigned int flags = 0; ++ ++ /* Get inode */ ++ error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, MAY_SATTR); ++ if (error) ++ goto out; ++ ++ dentry = fhp->fh_dentry; ++ inode = dentry->d_inode; ++ if (S_ISDIR(inode->i_mode)) ++ flags = NFS4_ACL_DIR; ++ ++ error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags); ++ if (error < 0) ++ goto out_nfserr; ++ ++ if (pacl) { ++ error = set_nfsv4_acl_one(dentry, pacl, XATTR_NAME_ACL_ACCESS); ++ if (error < 0) ++ goto out_nfserr; ++ } ++ ++ if (dpacl) { ++ error = set_nfsv4_acl_one(dentry, dpacl, XATTR_NAME_ACL_DEFAULT); ++ if (error < 0) ++ goto out_nfserr; ++ } ++ ++ error = nfs_ok; ++ ++out: ++ posix_acl_release(pacl); ++ posix_acl_release(dpacl); ++ return (error); ++out_nfserr: ++ error = nfserrno(error); ++ goto out; ++} ++ ++static struct posix_acl * ++_get_posix_acl(struct dentry *dentry, char *key) ++{ ++ struct inode *inode = dentry->d_inode; ++ char *buf = NULL; ++ int buflen, error = 0; ++ struct posix_acl *pacl = NULL; ++ ++ down(&inode->i_sem); ++ ++ buflen = inode->i_op->getxattr(dentry, key, NULL, 0); ++ if (buflen <= 0) { ++ error = buflen < 0 ? 
buflen : -ENODATA; ++ goto out_sem; ++ } ++ ++ buf = kmalloc(buflen, GFP_KERNEL); ++ if (buf == NULL) { ++ error = -ENOMEM; ++ goto out_sem; ++ } ++ ++ error = -EOPNOTSUPP; ++ if (inode->i_op && inode->i_op->getxattr) { ++ error = security_inode_getxattr(dentry, key); ++ if (error) ++ goto out_sem; ++ error = inode->i_op->getxattr(dentry, key, buf, buflen); ++ } ++ if (error < 0) ++ goto out_sem; ++ ++ error = 0; ++ up(&inode->i_sem); ++ ++ pacl = posix_acl_from_xattr(buf, buflen); ++ out: ++ kfree(buf); ++ return pacl; ++ out_sem: ++ up(&inode->i_sem); ++ pacl = ERR_PTR(error); ++ goto out; ++} ++ ++int ++nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_acl **acl) ++{ ++ struct inode *inode = dentry->d_inode; ++ int error = 0; ++ struct posix_acl *pacl = NULL, *dpacl = NULL; ++ unsigned int flags = 0; ++ ++ pacl = _get_posix_acl(dentry, XATTR_NAME_ACL_ACCESS); ++ if (IS_ERR(pacl) && PTR_ERR(pacl) == -ENODATA) ++ pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); ++ if (IS_ERR(pacl)) { ++ error = PTR_ERR(pacl); ++ pacl = NULL; ++ goto out; ++ } ++ ++ if (S_ISDIR(inode->i_mode)) { ++ dpacl = _get_posix_acl(dentry, XATTR_NAME_ACL_DEFAULT); ++ if (IS_ERR(dpacl) && PTR_ERR(dpacl) == -ENODATA) ++ dpacl = NULL; ++ else if (IS_ERR(dpacl)) { ++ error = PTR_ERR(dpacl); ++ dpacl = NULL; ++ goto out; ++ } ++ flags = NFS4_ACL_DIR; ++ } ++ ++ *acl = nfs4_acl_posix_to_nfsv4(pacl, dpacl, flags); ++ if (IS_ERR(*acl)) { ++ error = PTR_ERR(*acl); ++ *acl = NULL; ++ } ++ out: ++ posix_acl_release(pacl); ++ posix_acl_release(dpacl); ++ return error; ++} ++ ++#endif /* defined(CONFIG_NFS_V4) */ ++ + #ifdef CONFIG_NFSD_V3 + /* + * Check server access rights to a file system object +--- linux-2.6.7/fs/nfsd/nfs4idmap.c.lsec 2004-06-15 23:19:43.000000000 -0600 ++++ linux-2.6.7/fs/nfsd/nfs4idmap.c 2005-03-23 14:28:24.687296656 -0700 +@@ -78,9 +78,9 @@ struct ent { + + #define DefineSimpleCacheLookupMap(STRUCT, FUNC) \ + DefineCacheLookup(struct STRUCT, h, 
FUNC##_lookup, \ +- (struct STRUCT *item, int set), /*no setup */, \ ++ (struct STRUCT *item, int set), \ + & FUNC##_cache, FUNC##_hash(item), FUNC##_match(item, tmp), \ +- STRUCT##_init(new, item), STRUCT##_update(tmp, item), 0) ++ STRUCT##_init(new, item), STRUCT##_update(tmp, item)) + + /* Common entry handling */ + +--- linux-2.6.7/fs/nfsd/nfs4acl.c.lsec 2005-03-23 14:28:24.463330704 -0700 ++++ linux-2.6.7/fs/nfsd/nfs4acl.c 2005-03-23 14:28:24.463330704 -0700 +@@ -0,0 +1,974 @@ ++/* ++ * fs/nfs4acl/acl.c ++ * ++ * Common NFSv4 ACL handling code. ++ * ++ * Copyright (c) 2002, 2003 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Marius Aamodt Eriksen ++ * Jeff Sedlak ++ * J. Bruce Fields ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++/* mode bit translations: */ ++#define NFS4_READ_MODE (NFS4_ACE_READ_DATA | NFS4_ACE_READ_NAMED_ATTRS) ++#define NFS4_WRITE_MODE (NFS4_ACE_WRITE_DATA | NFS4_ACE_WRITE_NAMED_ATTRS | NFS4_ACE_APPEND_DATA) ++#define NFS4_EXECUTE_MODE NFS4_ACE_EXECUTE ++#define NFS4_ANYONE_MODE (NFS4_ACE_READ_ATTRIBUTES | NFS4_ACE_READ_ACL | NFS4_ACE_SYNCHRONIZE) ++#define NFS4_OWNER_MODE (NFS4_ACE_WRITE_ATTRIBUTES | NFS4_ACE_WRITE_ACL) ++ ++/* flags used to simulate posix default ACLs */ ++#define NFS4_INHERITANCE_FLAGS (NFS4_ACE_FILE_INHERIT_ACE \ ++ | NFS4_ACE_DIRECTORY_INHERIT_ACE | NFS4_ACE_INHERIT_ONLY_ACE) ++ ++#define MASK_EQUAL(mask1, mask2) \ ++ ( ((mask1) & NFS4_ACE_MASK_ALL) == ((mask2) & NFS4_ACE_MASK_ALL) ) ++ ++static u32 ++mask_from_posix(unsigned short perm, unsigned int flags) ++{ ++ int mask = NFS4_ANYONE_MODE; ++ ++ if (flags & NFS4_ACL_OWNER) ++ mask |= NFS4_OWNER_MODE; ++ if (perm & ACL_READ) ++ mask |= NFS4_READ_MODE; ++ if (perm & ACL_WRITE) ++ mask |= NFS4_WRITE_MODE; ++ if ((perm & ACL_WRITE) && (flags & NFS4_ACL_DIR)) ++ mask |= NFS4_ACE_DELETE_CHILD; ++ if (perm & ACL_EXECUTE) ++ mask |= NFS4_EXECUTE_MODE; ++ return mask; ++} ++ ++static u32 ++deny_mask(u32 allow_mask, unsigned int flags) ++{ ++ u32 ret = ~allow_mask & ~NFS4_ACE_DELETE; ++ if (!(flags & NFS4_ACL_DIR)) ++ ret &= ~NFS4_ACE_DELETE_CHILD; 
++ return ret; ++} ++ ++static int ++mode_from_nfs4(u32 perm, unsigned short *mode, unsigned int flags) ++{ ++ u32 ignore = 0; ++ ++ if (!(flags & NFS4_ACL_DIR)) ++ ignore |= NFS4_ACE_DELETE_CHILD; /* ignore it */ ++ perm |= ignore; ++ *mode = 0; ++ if ((perm & NFS4_READ_MODE) == NFS4_READ_MODE) ++ *mode |= ACL_READ; ++ if ((perm & NFS4_WRITE_MODE) == NFS4_WRITE_MODE) ++ *mode |= ACL_WRITE; ++ if ((perm & NFS4_EXECUTE_MODE) == NFS4_EXECUTE_MODE) ++ *mode |= ACL_EXECUTE; ++ if (!MASK_EQUAL(perm, ignore|mask_from_posix(*mode, flags))) ++ return -EINVAL; ++ return 0; ++} ++ ++struct ace_container { ++ struct nfs4_ace *ace; ++ struct list_head ace_l; ++}; ++ ++static short ace2type(struct nfs4_ace *); ++static int _posix_to_nfsv4_one(struct posix_acl *, struct nfs4_acl *, unsigned int); ++static struct posix_acl *_nfsv4_to_posix_one(struct nfs4_acl *, unsigned int); ++int nfs4_acl_add_ace(struct nfs4_acl *, u32, u32, u32, int, uid_t); ++int nfs4_acl_split(struct nfs4_acl *, struct nfs4_acl *); ++ ++struct nfs4_acl * ++nfs4_acl_posix_to_nfsv4(struct posix_acl *pacl, struct posix_acl *dpacl, ++ unsigned int flags) ++{ ++ struct nfs4_acl *acl; ++ int error = -EINVAL; ++ ++ if ((pacl != NULL && ++ (posix_acl_valid(pacl) < 0 || pacl->a_count == 0)) || ++ (dpacl != NULL && ++ (posix_acl_valid(dpacl) < 0 || dpacl->a_count == 0))) ++ goto out_err; ++ ++ acl = nfs4_acl_new(); ++ if (acl == NULL) { ++ error = -ENOMEM; ++ goto out_err; ++ } ++ ++ if (pacl != NULL) { ++ error = _posix_to_nfsv4_one(pacl, acl, ++ flags & ~NFS4_ACL_TYPE_DEFAULT); ++ if (error < 0) ++ goto out_acl; ++ } ++ ++ if (dpacl != NULL) { ++ error = _posix_to_nfsv4_one(dpacl, acl, ++ flags | NFS4_ACL_TYPE_DEFAULT); ++ if (error < 0) ++ goto out_acl; ++ } ++ ++ return acl; ++ ++out_acl: ++ nfs4_acl_free(acl); ++out_err: ++ acl = ERR_PTR(error); ++ ++ return acl; ++} ++ ++static int ++nfs4_acl_add_pair(struct nfs4_acl *acl, int eflag, u32 mask, int whotype, ++ uid_t owner, unsigned int flags) ++{ ++ int error; 
++ ++ error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE, ++ eflag, mask, whotype, owner); ++ if (error < 0) ++ return error; ++ error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE, ++ eflag, deny_mask(mask, flags), whotype, owner); ++ return error; ++} ++ ++/* We assume the acl has been verified with posix_acl_valid. */ ++static int ++_posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl, ++ unsigned int flags) ++{ ++ struct posix_acl_entry *pa, *pe, *group_owner_entry; ++ int error = -EINVAL; ++ u32 mask, mask_mask; ++ int eflag = ((flags & NFS4_ACL_TYPE_DEFAULT) ? ++ NFS4_INHERITANCE_FLAGS : 0); ++ ++ BUG_ON(pacl->a_count < 3); ++ pe = pacl->a_entries + pacl->a_count; ++ pa = pe - 2; /* if mask entry exists, it's second from the last. */ ++ if (pa->e_tag == ACL_MASK) ++ mask_mask = deny_mask(mask_from_posix(pa->e_perm, flags), flags); ++ else ++ mask_mask = 0; ++ ++ pa = pacl->a_entries; ++ BUG_ON(pa->e_tag != ACL_USER_OBJ); ++ mask = mask_from_posix(pa->e_perm, flags | NFS4_ACL_OWNER); ++ error = nfs4_acl_add_pair(acl, eflag, mask, NFS4_ACL_WHO_OWNER, 0, flags); ++ if (error < 0) ++ goto out; ++ pa++; ++ ++ while (pa->e_tag == ACL_USER) { ++ mask = mask_from_posix(pa->e_perm, flags); ++ error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE, ++ eflag, mask_mask, NFS4_ACL_WHO_NAMED, pa->e_id); ++ if (error < 0) ++ goto out; ++ ++ ++ error = nfs4_acl_add_pair(acl, eflag, mask, ++ NFS4_ACL_WHO_NAMED, pa->e_id, flags); ++ if (error < 0) ++ goto out; ++ pa++; ++ } ++ ++ /* In the case of groups, we apply allow ACEs first, then deny ACEs, ++ * since a user can be in more than one group. 
*/ ++ ++ /* allow ACEs */ ++ ++ if (pacl->a_count > 3) { ++ BUG_ON(pa->e_tag != ACL_GROUP_OBJ); ++ error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE, ++ NFS4_ACE_IDENTIFIER_GROUP | eflag, mask_mask, ++ NFS4_ACL_WHO_GROUP, 0); ++ if (error < 0) ++ goto out; ++ } ++ group_owner_entry = pa; ++ mask = mask_from_posix(pa->e_perm, flags); ++ error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE, ++ NFS4_ACE_IDENTIFIER_GROUP | eflag, mask, ++ NFS4_ACL_WHO_GROUP, 0); ++ if (error < 0) ++ goto out; ++ pa++; ++ ++ while (pa->e_tag == ACL_GROUP) { ++ mask = mask_from_posix(pa->e_perm, flags); ++ error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE, ++ NFS4_ACE_IDENTIFIER_GROUP | eflag, mask_mask, ++ NFS4_ACL_WHO_NAMED, pa->e_id); ++ if (error < 0) ++ goto out; ++ ++ error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE, ++ NFS4_ACE_IDENTIFIER_GROUP | eflag, mask, ++ NFS4_ACL_WHO_NAMED, pa->e_id); ++ if (error < 0) ++ goto out; ++ pa++; ++ } ++ ++ /* deny ACEs */ ++ ++ pa = group_owner_entry; ++ mask = mask_from_posix(pa->e_perm, flags); ++ error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE, ++ NFS4_ACE_IDENTIFIER_GROUP | eflag, ++ deny_mask(mask, flags), NFS4_ACL_WHO_GROUP, 0); ++ if (error < 0) ++ goto out; ++ pa++; ++ while (pa->e_tag == ACL_GROUP) { ++ mask = mask_from_posix(pa->e_perm, flags); ++ error = nfs4_acl_add_ace(acl, NFS4_ACE_ACCESS_DENIED_ACE_TYPE, ++ NFS4_ACE_IDENTIFIER_GROUP | eflag, ++ deny_mask(mask, flags), NFS4_ACL_WHO_NAMED, pa->e_id); ++ if (error < 0) ++ goto out; ++ pa++; ++ } ++ ++ if (pa->e_tag == ACL_MASK) ++ pa++; ++ BUG_ON(pa->e_tag != ACL_OTHER); ++ mask = mask_from_posix(pa->e_perm, flags); ++ error = nfs4_acl_add_pair(acl, eflag, mask, NFS4_ACL_WHO_EVERYONE, 0, flags); ++ ++out: ++ return error; ++} ++ ++static void ++sort_pacl_range(struct posix_acl *pacl, int start, int end) { ++ int sorted = 0, i; ++ struct posix_acl_entry tmp; ++ ++ /* We just do a bubble sort; easy to do in place, and we're 
not ++ * expecting acl's to be long enough to justify anything more. */ ++ while (!sorted) { ++ sorted = 1; ++ for (i = start; i < end; i++) { ++ if (pacl->a_entries[i].e_id ++ > pacl->a_entries[i+1].e_id) { ++ sorted = 0; ++ tmp = pacl->a_entries[i]; ++ pacl->a_entries[i] = pacl->a_entries[i+1]; ++ pacl->a_entries[i+1] = tmp; ++ } ++ } ++ } ++} ++ ++static void ++sort_pacl(struct posix_acl *pacl) ++{ ++ /* posix_acl_valid requires that users and groups be in order ++ * by uid/gid. */ ++ int i, j; ++ ++ if (pacl->a_count <= 4) ++ return; /* no users or groups */ ++ i = 1; ++ while (pacl->a_entries[i].e_tag == ACL_USER) ++ i++; ++ sort_pacl_range(pacl, 1, i-1); ++ ++ BUG_ON(pacl->a_entries[i].e_tag != ACL_GROUP_OBJ); ++ j = i++; ++ while (pacl->a_entries[j].e_tag == ACL_GROUP) ++ j++; ++ sort_pacl_range(pacl, i, j-1); ++ return; ++} ++ ++static int ++write_pace(struct nfs4_ace *ace, struct posix_acl *pacl, ++ struct posix_acl_entry **pace, short tag, unsigned int flags) ++{ ++ struct posix_acl_entry *this = *pace; ++ ++ if (*pace == pacl->a_entries + pacl->a_count) ++ return -EINVAL; /* fell off the end */ ++ (*pace)++; ++ this->e_tag = tag; ++ if (tag == ACL_USER_OBJ) ++ flags |= NFS4_ACL_OWNER; ++ if (mode_from_nfs4(ace->access_mask, &this->e_perm, flags)) ++ return -EINVAL; ++ this->e_id = (tag == ACL_USER || tag == ACL_GROUP ? 
++ ace->who : ACL_UNDEFINED_ID); ++ return 0; ++} ++ ++static struct nfs4_ace * ++get_next_v4_ace(struct list_head **p, struct list_head *head) ++{ ++ struct nfs4_ace *ace; ++ ++ *p = (*p)->next; ++ if (*p == head) ++ return NULL; ++ ace = list_entry(*p, struct nfs4_ace, l_ace); ++ ++ return ace; ++} ++ ++int ++nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, struct posix_acl **pacl, ++ struct posix_acl **dpacl, unsigned int flags) ++{ ++ struct nfs4_acl *dacl; ++ int error = -ENOMEM; ++ ++ *pacl = NULL; ++ *dpacl = NULL; ++ ++ dacl = nfs4_acl_new(); ++ if (dacl == NULL) ++ goto out; ++ ++ error = nfs4_acl_split(acl, dacl); ++ if (error < 0) ++ goto out_acl; ++ ++ if (pacl != NULL) { ++ if (acl->naces == 0) { ++ error = -ENODATA; ++ goto try_dpacl; ++ } ++ ++ *pacl = _nfsv4_to_posix_one(acl, flags); ++ if (IS_ERR(*pacl)) { ++ error = PTR_ERR(*pacl); ++ *pacl = NULL; ++ goto out_acl; ++ } ++ } ++ ++try_dpacl: ++ if (dpacl != NULL) { ++ if (dacl->naces == 0) { ++ if (pacl == NULL || *pacl == NULL) ++ error = -ENODATA; ++ goto out_acl; ++ } ++ ++ error = 0; ++ *dpacl = _nfsv4_to_posix_one(dacl, flags); ++ if (IS_ERR(*dpacl)) { ++ error = PTR_ERR(*dpacl); ++ *dpacl = NULL; ++ goto out_acl; ++ } ++ } ++ ++out_acl: ++ if (error && pacl) { ++ posix_acl_release(*pacl); ++ *pacl = NULL; ++ } ++ nfs4_acl_free(dacl); ++out: ++ return error; ++} ++ ++static int ++same_who(struct nfs4_ace *a, struct nfs4_ace *b) ++{ ++ return a->whotype == b->whotype && ++ (a->whotype != NFS4_ACL_WHO_NAMED || a->who == b->who); ++} ++ ++static int ++complementary_ace_pair(struct nfs4_ace *allow, struct nfs4_ace *deny, ++ unsigned int flags) ++{ ++ int ignore = 0; ++ if (!(flags & NFS4_ACL_DIR)) ++ ignore |= NFS4_ACE_DELETE_CHILD; ++ return MASK_EQUAL(ignore|deny_mask(allow->access_mask, flags), ++ ignore|deny->access_mask) && ++ allow->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE && ++ deny->type == NFS4_ACE_ACCESS_DENIED_ACE_TYPE && ++ allow->flag == deny->flag && ++ same_who(allow, deny); ++} ++ 
++static inline int ++user_obj_from_v4(struct nfs4_acl *n4acl, struct list_head **p, ++ struct posix_acl *pacl, struct posix_acl_entry **pace, ++ unsigned int flags) ++{ ++ int error = -EINVAL; ++ struct nfs4_ace *ace, *ace2; ++ ++ ace = get_next_v4_ace(p, &n4acl->ace_head); ++ if (ace == NULL) ++ goto out; ++ if (ace2type(ace) != ACL_USER_OBJ) ++ goto out; ++ error = write_pace(ace, pacl, pace, ACL_USER_OBJ, flags); ++ if (error < 0) ++ goto out; ++ error = -EINVAL; ++ ace2 = get_next_v4_ace(p, &n4acl->ace_head); ++ if (ace2 == NULL) ++ goto out; ++ if (!complementary_ace_pair(ace, ace2, flags)) ++ goto out; ++ error = 0; ++out: ++ return error; ++} ++ ++static inline int ++users_from_v4(struct nfs4_acl *n4acl, struct list_head **p, ++ struct nfs4_ace **mask_ace, ++ struct posix_acl *pacl, struct posix_acl_entry **pace, ++ unsigned int flags) ++{ ++ int error = -EINVAL; ++ struct nfs4_ace *ace, *ace2; ++ ++ ace = get_next_v4_ace(p, &n4acl->ace_head); ++ if (ace == NULL) ++ goto out; ++ while (ace2type(ace) == ACL_USER) { ++ if (ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE) ++ goto out; ++ if (*mask_ace && ++ !MASK_EQUAL(ace->access_mask, (*mask_ace)->access_mask)) ++ goto out; ++ *mask_ace = ace; ++ ace = get_next_v4_ace(p, &n4acl->ace_head); ++ if (ace == NULL) ++ goto out; ++ if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) ++ goto out; ++ error = write_pace(ace, pacl, pace, ACL_USER, flags); ++ if (error < 0) ++ goto out; ++ error = -EINVAL; ++ ace2 = get_next_v4_ace(p, &n4acl->ace_head); ++ if (ace2 == NULL) ++ goto out; ++ if (!complementary_ace_pair(ace, ace2, flags)) ++ goto out; ++ if ((*mask_ace)->flag != ace2->flag || ++ !same_who(*mask_ace, ace2)) ++ goto out; ++ ace = get_next_v4_ace(p, &n4acl->ace_head); ++ if (ace == NULL) ++ goto out; ++ } ++ error = 0; ++out: ++ return error; ++} ++ ++static inline int ++group_obj_and_groups_from_v4(struct nfs4_acl *n4acl, struct list_head **p, ++ struct nfs4_ace **mask_ace, ++ struct posix_acl *pacl, struct 
posix_acl_entry **pace, ++ unsigned int flags) ++{ ++ int error = -EINVAL; ++ struct nfs4_ace *ace, *ace2; ++ struct ace_container *ac; ++ struct list_head group_l; ++ ++ INIT_LIST_HEAD(&group_l); ++ ace = list_entry(*p, struct nfs4_ace, l_ace); ++ ++ /* group owner (mask and allow aces) */ ++ ++ if (pacl->a_count != 3) { ++ /* then the group owner should be preceded by mask */ ++ if (ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE) ++ goto out; ++ if (*mask_ace && ++ !MASK_EQUAL(ace->access_mask, (*mask_ace)->access_mask)) ++ goto out; ++ *mask_ace = ace; ++ ace = get_next_v4_ace(p, &n4acl->ace_head); ++ if (ace == NULL) ++ goto out; ++ ++ if ((*mask_ace)->flag != ace->flag || !same_who(*mask_ace, ace)) ++ goto out; ++ } ++ ++ if (ace2type(ace) != ACL_GROUP_OBJ) ++ goto out; ++ ++ ac = kmalloc(sizeof(*ac), GFP_KERNEL); ++ error = -ENOMEM; ++ if (ac == NULL) ++ goto out; ++ ac->ace = ace; ++ list_add_tail(&ac->ace_l, &group_l); ++ ++ error = -EINVAL; ++ if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) ++ goto out; ++ ++ error = write_pace(ace, pacl, pace, ACL_GROUP_OBJ, flags); ++ if (error < 0) ++ goto out; ++ ++ error = -EINVAL; ++ ace = get_next_v4_ace(p, &n4acl->ace_head); ++ if (ace == NULL) ++ goto out; ++ ++ /* groups (mask and allow aces) */ ++ ++ while (ace2type(ace) == ACL_GROUP) { ++ if (*mask_ace == NULL) ++ goto out; ++ ++ if (ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE || ++ !MASK_EQUAL(ace->access_mask, (*mask_ace)->access_mask)) ++ goto out; ++ *mask_ace = ace; ++ ++ ace = get_next_v4_ace(p, &n4acl->ace_head); ++ if (ace == NULL) ++ goto out; ++ ac = kmalloc(sizeof(*ac), GFP_KERNEL); ++ error = -ENOMEM; ++ if (ac == NULL) ++ goto out; ++ error = -EINVAL; ++ if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE || ++ !same_who(ace, *mask_ace)) ++ goto out; ++ ++ ac->ace = ace; ++ list_add_tail(&ac->ace_l, &group_l); ++ ++ error = write_pace(ace, pacl, pace, ACL_GROUP, flags); ++ if (error < 0) ++ goto out; ++ error = -EINVAL; ++ ace = get_next_v4_ace(p, 
&n4acl->ace_head); ++ if (ace == NULL) ++ goto out; ++ } ++ ++ /* group owner (deny ace) */ ++ ++ if (ace2type(ace) != ACL_GROUP_OBJ) ++ goto out; ++ ac = list_entry(group_l.next, struct ace_container, ace_l); ++ ace2 = ac->ace; ++ if (!complementary_ace_pair(ace2, ace, flags)) ++ goto out; ++ list_del(group_l.next); ++ kfree(ac); ++ ++ /* groups (deny aces) */ ++ ++ while (!list_empty(&group_l)) { ++ ace = get_next_v4_ace(p, &n4acl->ace_head); ++ if (ace == NULL) ++ goto out; ++ if (ace2type(ace) != ACL_GROUP) ++ goto out; ++ ac = list_entry(group_l.next, struct ace_container, ace_l); ++ ace2 = ac->ace; ++ if (!complementary_ace_pair(ace2, ace, flags)) ++ goto out; ++ list_del(group_l.next); ++ kfree(ac); ++ } ++ ++ ace = get_next_v4_ace(p, &n4acl->ace_head); ++ if (ace == NULL) ++ goto out; ++ if (ace2type(ace) != ACL_OTHER) ++ goto out; ++ error = 0; ++out: ++ while (!list_empty(&group_l)) { ++ ac = list_entry(group_l.next, struct ace_container, ace_l); ++ list_del(group_l.next); ++ kfree(ac); ++ } ++ return error; ++} ++ ++static inline int ++mask_from_v4(struct nfs4_acl *n4acl, struct list_head **p, ++ struct nfs4_ace **mask_ace, ++ struct posix_acl *pacl, struct posix_acl_entry **pace, ++ unsigned int flags) ++{ ++ int error = -EINVAL; ++ struct nfs4_ace *ace; ++ ++ ace = list_entry(*p, struct nfs4_ace, l_ace); ++ if (pacl->a_count != 3) { ++ if (*mask_ace == NULL) ++ goto out; ++ (*mask_ace)->access_mask = deny_mask((*mask_ace)->access_mask, flags); ++ write_pace(*mask_ace, pacl, pace, ACL_MASK, flags); ++ } ++ error = 0; ++out: ++ return error; ++} ++ ++static inline int ++other_from_v4(struct nfs4_acl *n4acl, struct list_head **p, ++ struct posix_acl *pacl, struct posix_acl_entry **pace, ++ unsigned int flags) ++{ ++ int error = -EINVAL; ++ struct nfs4_ace *ace, *ace2; ++ ++ ace = list_entry(*p, struct nfs4_ace, l_ace); ++ if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) ++ goto out; ++ error = write_pace(ace, pacl, pace, ACL_OTHER, flags); ++ if (error 
< 0) ++ goto out; ++ error = -EINVAL; ++ ace2 = get_next_v4_ace(p, &n4acl->ace_head); ++ if (ace2 == NULL) ++ goto out; ++ if (!complementary_ace_pair(ace, ace2, flags)) ++ goto out; ++ error = 0; ++out: ++ return error; ++} ++ ++static int ++calculate_posix_ace_count(struct nfs4_acl *n4acl) ++{ ++ if (n4acl->naces == 6) /* owner, owner group, and other only */ ++ return 3; ++ else { /* Otherwise there must be a mask entry. */ ++ /* Also, the remaining entries are for named users and ++ * groups, and come in threes (mask, allow, deny): */ ++ if (n4acl->naces < 7) ++ return -1; ++ if ((n4acl->naces - 7) % 3) ++ return -1; ++ return 4 + (n4acl->naces - 7)/3; ++ } ++} ++ ++ ++static struct posix_acl * ++_nfsv4_to_posix_one(struct nfs4_acl *n4acl, unsigned int flags) ++{ ++ struct posix_acl *pacl; ++ int error = -EINVAL, nace = 0; ++ struct list_head *p; ++ struct nfs4_ace *mask_ace = NULL; ++ struct posix_acl_entry *pace; ++ ++ nace = calculate_posix_ace_count(n4acl); ++ if (nace < 0) ++ goto out_err; ++ ++ pacl = posix_acl_alloc(nace, GFP_KERNEL); ++ error = -ENOMEM; ++ if (pacl == NULL) ++ goto out_err; ++ ++ pace = &pacl->a_entries[0]; ++ p = &n4acl->ace_head; ++ ++ error = user_obj_from_v4(n4acl, &p, pacl, &pace, flags); ++ if (error) ++ goto out_acl; ++ ++ error = users_from_v4(n4acl, &p, &mask_ace, pacl, &pace, flags); ++ if (error) ++ goto out_acl; ++ ++ error = group_obj_and_groups_from_v4(n4acl, &p, &mask_ace, pacl, &pace, ++ flags); ++ if (error) ++ goto out_acl; ++ ++ error = mask_from_v4(n4acl, &p, &mask_ace, pacl, &pace, flags); ++ if (error) ++ goto out_acl; ++ error = other_from_v4(n4acl, &p, pacl, &pace, flags); ++ if (error) ++ goto out_acl; ++ ++ error = -EINVAL; ++ if (p->next != &n4acl->ace_head) ++ goto out_acl; ++ if (pace != pacl->a_entries + pacl->a_count) ++ goto out_acl; ++ ++ sort_pacl(pacl); ++ ++ return pacl; ++out_acl: ++ posix_acl_release(pacl); ++out_err: ++ pacl = ERR_PTR(error); ++ return pacl; ++} ++ ++int ++nfs4_acl_split(struct 
nfs4_acl *acl, struct nfs4_acl *dacl) ++{ ++ struct list_head *h, *n; ++ struct nfs4_ace *ace; ++ int error = 0; ++ ++ list_for_each_safe(h, n, &acl->ace_head) { ++ ace = list_entry(h, struct nfs4_ace, l_ace); ++ ++ if ((ace->flag & NFS4_INHERITANCE_FLAGS) ++ != NFS4_INHERITANCE_FLAGS) ++ continue; ++ ++ error = nfs4_acl_add_ace(dacl, ace->type, ace->flag, ++ ace->access_mask, ace->whotype, ace->who); ++ if (error < 0) ++ goto out; ++ ++ list_del(h); ++ kfree(ace); ++ acl->naces--; ++ } ++ ++out: ++ return error; ++} ++ ++static short ++ace2type(struct nfs4_ace *ace) ++{ ++ switch (ace->whotype) { ++ case NFS4_ACL_WHO_NAMED: ++ return (ace->flag & NFS4_ACE_IDENTIFIER_GROUP ? ++ ACL_GROUP : ACL_USER); ++ case NFS4_ACL_WHO_OWNER: ++ return ACL_USER_OBJ; ++ case NFS4_ACL_WHO_GROUP: ++ return ACL_GROUP_OBJ; ++ case NFS4_ACL_WHO_EVERYONE: ++ return ACL_OTHER; ++ } ++ BUG(); ++ return -1; ++} ++ ++EXPORT_SYMBOL(nfs4_acl_posix_to_nfsv4); ++EXPORT_SYMBOL(nfs4_acl_nfsv4_to_posix); ++ ++struct nfs4_acl * ++nfs4_acl_new(void) ++{ ++ struct nfs4_acl *acl; ++ ++ if ((acl = kmalloc(sizeof(*acl), GFP_KERNEL)) == NULL) ++ return NULL; ++ ++ acl->naces = 0; ++ INIT_LIST_HEAD(&acl->ace_head); ++ ++ return acl; ++} ++ ++void ++nfs4_acl_free(struct nfs4_acl *acl) ++{ ++ struct list_head *h; ++ struct nfs4_ace *ace; ++ ++ if (!acl) ++ return; ++ ++ while (!list_empty(&acl->ace_head)) { ++ h = acl->ace_head.next; ++ list_del(h); ++ ace = list_entry(h, struct nfs4_ace, l_ace); ++ kfree(ace); ++ } ++ ++ kfree(acl); ++ ++ return; ++} ++ ++int ++nfs4_acl_add_ace(struct nfs4_acl *acl, u32 type, u32 flag, u32 access_mask, ++ int whotype, uid_t who) ++{ ++ struct nfs4_ace *ace; ++ ++ if ((ace = kmalloc(sizeof(*ace), GFP_KERNEL)) == NULL) ++ return -1; ++ ++ ace->type = type; ++ ace->flag = flag; ++ ace->access_mask = access_mask; ++ ace->whotype = whotype; ++ ace->who = who; ++ ++ list_add_tail(&ace->l_ace, &acl->ace_head); ++ acl->naces++; ++ ++ return 0; ++} ++ ++static struct { ++ 
char *string; ++ int stringlen; ++ int type; ++} s2t_map[] = { ++ { ++ .string = "OWNER@", ++ .stringlen = sizeof("OWNER@") - 1, ++ .type = NFS4_ACL_WHO_OWNER, ++ }, ++ { ++ .string = "GROUP@", ++ .stringlen = sizeof("GROUP@") - 1, ++ .type = NFS4_ACL_WHO_GROUP, ++ }, ++ { ++ .string = "EVERYONE@", ++ .stringlen = sizeof("EVERYONE@") - 1, ++ .type = NFS4_ACL_WHO_EVERYONE, ++ }, ++}; ++ ++int ++nfs4_acl_get_whotype(char *p, u32 len) ++{ ++ int i; ++ ++ for (i=0; i < sizeof(s2t_map) / sizeof(*s2t_map); i++) { ++ if (s2t_map[i].stringlen == len && ++ 0 == memcmp(s2t_map[i].string, p, len)) ++ return s2t_map[i].type; ++ } ++ return NFS4_ACL_WHO_NAMED; ++} ++ ++int ++nfs4_acl_write_who(int who, char *p) ++{ ++ int i; ++ ++ for (i=0; i < sizeof(s2t_map) / sizeof(*s2t_map); i++) { ++ if (s2t_map[i].type == who) { ++ memcpy(p, s2t_map[i].string, s2t_map[i].stringlen); ++ return s2t_map[i].stringlen; ++ } ++ } ++ BUG(); ++ return -1; ++} ++ ++static inline int ++match_who(struct nfs4_ace *ace, uid_t owner, gid_t group, uid_t who) ++{ ++ switch (ace->whotype) { ++ case NFS4_ACL_WHO_NAMED: ++ return who == ace->who; ++ case NFS4_ACL_WHO_OWNER: ++ return who == owner; ++ case NFS4_ACL_WHO_GROUP: ++ return who == group; ++ case NFS4_ACL_WHO_EVERYONE: ++ return 1; ++ default: ++ return 0; ++ } ++} ++ ++/* 0 = granted, -EACCES = denied; mask is an nfsv4 mask, not mode bits */ ++int ++nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group, ++ uid_t who, u32 mask) ++{ ++ struct nfs4_ace *ace; ++ u32 allowed = 0; ++ ++ list_for_each_entry(ace, &acl->ace_head, l_ace) { ++ if (!match_who(ace, owner, group, who)) ++ continue; ++ switch (ace->type) { ++ case NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE: ++ allowed |= ace->access_mask; ++ if ((allowed & mask) == mask) ++ return 0; ++ break; ++ case NFS4_ACE_ACCESS_DENIED_ACE_TYPE: ++ if (ace->access_mask & mask) ++ return -EACCES; ++ break; ++ } ++ } ++ return -EACCES; ++} ++ ++EXPORT_SYMBOL(nfs4_acl_new); ++EXPORT_SYMBOL(nfs4_acl_free); 
++EXPORT_SYMBOL(nfs4_acl_add_ace); ++EXPORT_SYMBOL(nfs4_acl_get_whotype); ++EXPORT_SYMBOL(nfs4_acl_write_who); ++EXPORT_SYMBOL(nfs4_acl_permission); +--- linux-2.6.7/fs/nfsd/Makefile.lsec 2004-06-15 23:19:13.000000000 -0600 ++++ linux-2.6.7/fs/nfsd/Makefile 2005-03-23 14:28:24.461331008 -0700 +@@ -7,5 +7,6 @@ obj-$(CONFIG_NFSD) += nfsd.o + nfsd-y := nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \ + export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o + nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o +-nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o ++nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ ++ nfs4acl.o nfs4callback.o + nfsd-objs := $(nfsd-y) +--- linux-2.6.7/fs/nfsd/nfsctl.c.lsec 2004-06-15 23:19:01.000000000 -0600 ++++ linux-2.6.7/fs/nfsd/nfsctl.c 2005-03-23 14:28:24.132381016 -0700 +@@ -36,7 +36,7 @@ + #include + + /* +- * We have a single directory with 8 nodes in it. ++ * We have a single directory with 9 nodes in it. + */ + enum { + NFSD_Root = 1, +@@ -50,6 +50,7 @@ enum { + NFSD_List, + NFSD_Fh, + NFSD_Threads, ++ NFSD_Leasetime, + }; + + /* +@@ -64,6 +65,7 @@ static ssize_t write_getfd(struct file * + static ssize_t write_getfs(struct file *file, char *buf, size_t size); + static ssize_t write_filehandle(struct file *file, char *buf, size_t size); + static ssize_t write_threads(struct file *file, char *buf, size_t size); ++static ssize_t write_leasetime(struct file *file, char *buf, size_t size); + + static ssize_t (*write_op[])(struct file *, char *, size_t) = { + [NFSD_Svc] = write_svc, +@@ -75,6 +77,7 @@ static ssize_t (*write_op[])(struct file + [NFSD_Getfs] = write_getfs, + [NFSD_Fh] = write_filehandle, + [NFSD_Threads] = write_threads, ++ [NFSD_Leasetime] = write_leasetime, + }; + + /* an argresp is stored in an allocated page and holds the +@@ -393,6 +396,29 @@ static ssize_t write_threads(struct file + return strlen(buf); + } + ++extern time_t nfs4_lease_time(void); ++ ++static ssize_t 
write_leasetime(struct file *file, char *buf, size_t size) ++{ ++ /* if size > 10 seconds, call ++ * nfs4_reset_lease() then write out the new lease (seconds) as reply ++ */ ++ char *mesg = buf; ++ int rv; ++ ++ if (size > 0) { ++ int lease; ++ rv = get_int(&mesg, &lease); ++ if (rv) ++ return rv; ++ if (lease < 10 || lease > 3600) ++ return -EINVAL; ++ nfs4_reset_lease(lease); ++ } ++ sprintf(buf, "%ld\n", nfs4_lease_time()); ++ return strlen(buf); ++} ++ + /*----------------------------------------------------------------------------*/ + /* + * populating the filesystem. +@@ -411,6 +437,7 @@ static int nfsd_fill_super(struct super_ + [NFSD_List] = {"exports", &exports_operations, S_IRUGO}, + [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR}, ++ [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, + /* last one */ {""} + }; + return simple_fill_super(sb, 0x6e667364, nfsd_files); +--- linux-2.6.7/fs/nfs/callback_proc.c.lsec 2005-03-23 14:28:22.485631360 -0700 ++++ linux-2.6.7/fs/nfs/callback_proc.c 2005-03-23 14:28:22.485631360 -0700 +@@ -0,0 +1,85 @@ ++/* ++ * linux/fs/nfs/callback_proc.c ++ * ++ * Copyright (C) 2004 Trond Myklebust ++ * ++ * NFSv4 callback procedures ++ */ ++#include ++#include ++#include ++#include "callback.h" ++#include "delegation.h" ++ ++#define NFSDBG_FACILITY NFSDBG_CALLBACK ++ ++unsigned nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res) ++{ ++ struct nfs4_client *clp; ++ struct nfs_delegation *delegation; ++ struct nfs_inode *nfsi; ++ struct inode *inode; ++ ++ res->bitmap[0] = res->bitmap[1] = 0; ++ res->status = htonl(NFS4ERR_BADHANDLE); ++ clp = nfs4_find_client(&args->addr->sin_addr); ++ if (clp == NULL) ++ goto out; ++ inode = nfs_delegation_find_inode(clp, &args->fh); ++ if (inode == NULL) ++ goto out_putclient; ++ nfsi = NFS_I(inode); ++ down_read(&nfsi->rwsem); ++ delegation = nfsi->delegation; ++ if 
(delegation == NULL || (delegation->type & FMODE_WRITE) == 0) ++ goto out_iput; ++ res->size = i_size_read(inode); ++ res->change_attr = NFS_CHANGE_ATTR(inode); ++ res->ctime = inode->i_ctime; ++ res->mtime = inode->i_mtime; ++ res->bitmap[0] = (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) & ++ args->bitmap[0]; ++ res->bitmap[1] = (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) & ++ args->bitmap[1]; ++ res->status = 0; ++out_iput: ++ up_read(&nfsi->rwsem); ++ iput(inode); ++out_putclient: ++ nfs4_put_client(clp); ++out: ++ dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(res->status)); ++ return res->status; ++} ++ ++unsigned nfs4_callback_recall(struct cb_recallargs *args, void *dummy) ++{ ++ struct nfs4_client *clp; ++ struct inode *inode; ++ unsigned res; ++ ++ res = htonl(NFS4ERR_BADHANDLE); ++ clp = nfs4_find_client(&args->addr->sin_addr); ++ if (clp == NULL) ++ goto out; ++ inode = nfs_delegation_find_inode(clp, &args->fh); ++ if (inode == NULL) ++ goto out_putclient; ++ /* Set up a helper thread to actually return the delegation */ ++ switch(nfs_async_inode_return_delegation(inode, &args->stateid)) { ++ case 0: ++ res = 0; ++ break; ++ case -ENOENT: ++ res = htonl(NFS4ERR_BAD_STATEID); ++ break; ++ default: ++ res = htonl(NFS4ERR_RESOURCE); ++ } ++ iput(inode); ++out_putclient: ++ nfs4_put_client(clp); ++out: ++ dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(res)); ++ return res; ++} +--- linux-2.6.7/fs/nfs/delegation.c.lsec 2005-03-23 14:28:22.546622088 -0700 ++++ linux-2.6.7/fs/nfs/delegation.c 2005-03-23 14:28:22.545622240 -0700 +@@ -0,0 +1,320 @@ ++/* ++ * linux/fs/nfs/delegation.c ++ * ++ * Copyright (C) 2004 Trond Myklebust ++ * ++ * NFS file delegation management ++ * ++ */ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++#include "delegation.h" ++ ++static struct nfs_delegation *nfs_alloc_delegation(void) ++{ ++ return (struct nfs_delegation *)kmalloc(sizeof(struct nfs_delegation), 
GFP_KERNEL); ++} ++ ++static void nfs_free_delegation(struct nfs_delegation *delegation) ++{ ++ if (delegation->cred) ++ put_rpccred(delegation->cred); ++ kfree(delegation); ++} ++ ++static void nfs_delegation_claim_opens(struct inode *inode) ++{ ++ struct nfs_inode *nfsi = NFS_I(inode); ++ struct nfs_open_context *ctx; ++ struct nfs4_state *state; ++ ++again: ++ spin_lock(&inode->i_lock); ++ list_for_each_entry(ctx, &nfsi->open_files, list) { ++ state = ctx->state; ++ if (state == NULL) ++ continue; ++ if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) ++ continue; ++ get_nfs_open_context(ctx); ++ spin_unlock(&inode->i_lock); ++ if (nfs4_open_delegation_recall(ctx->dentry, state) < 0) ++ return; ++ put_nfs_open_context(ctx); ++ goto again; ++ } ++ spin_unlock(&inode->i_lock); ++} ++ ++/* ++ * Set up a delegation on an inode ++ */ ++void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) ++{ ++ struct nfs_delegation *delegation = NFS_I(inode)->delegation; ++ ++ if (delegation == NULL) ++ return; ++ memcpy(delegation->stateid.data, res->delegation.data, ++ sizeof(delegation->stateid.data)); ++ delegation->type = res->delegation_type; ++ delegation->maxsize = res->maxsize; ++ put_rpccred(cred); ++ delegation->cred = get_rpccred(cred); ++ delegation->flags &= ~NFS_DELEGATION_NEED_RECLAIM; ++ NFS_I(inode)->delegation_state = delegation->type; ++ wmb(); ++} ++ ++/* ++ * Set up a delegation on an inode ++ */ ++int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) ++{ ++ struct nfs4_client *clp = NFS_SERVER(inode)->nfs4_state; ++ struct nfs_inode *nfsi = NFS_I(inode); ++ struct nfs_delegation *delegation; ++ int status = 0; ++ ++ delegation = nfs_alloc_delegation(); ++ if (delegation == NULL) ++ return -ENOMEM; ++ memcpy(delegation->stateid.data, res->delegation.data, ++ sizeof(delegation->stateid.data)); ++ delegation->type = res->delegation_type; ++ delegation->maxsize = 
res->maxsize; ++ delegation->cred = get_rpccred(cred); ++ delegation->inode = inode; ++ ++ spin_lock(&clp->cl_lock); ++ if (nfsi->delegation == NULL) { ++ list_add(&delegation->super_list, &clp->cl_delegations); ++ nfsi->delegation = delegation; ++ nfsi->delegation_state = delegation->type; ++ delegation = NULL; ++ } else { ++ if (memcmp(&delegation->stateid, &nfsi->delegation->stateid, ++ sizeof(delegation->stateid)) != 0 || ++ delegation->type != nfsi->delegation->type) { ++ printk("%s: server %u.%u.%u.%u, handed out a duplicate delegation!\n", ++ __FUNCTION__, NIPQUAD(clp->cl_addr)); ++ status = -EIO; ++ } ++ } ++ spin_unlock(&clp->cl_lock); ++ if (delegation != NULL) ++ kfree(delegation); ++ return status; ++} ++ ++static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation) ++{ ++ int res = 0; ++ ++ __nfs_revalidate_inode(NFS_SERVER(inode), inode); ++ ++ res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid); ++ nfs_free_delegation(delegation); ++ return res; ++} ++ ++/* Sync all data to disk upon delegation return */ ++static void nfs_msync_inode(struct inode *inode) ++{ ++ down(&inode->i_sem); ++ filemap_fdatawrite(inode->i_mapping); ++ nfs_wb_all(inode); ++ filemap_fdatawait(inode->i_mapping); ++ up(&inode->i_sem); ++} ++ ++/* ++ * Basic procedure for returning a delegation to the server ++ */ ++int nfs_inode_return_delegation(struct inode *inode) ++{ ++ struct nfs4_client *clp = NFS_SERVER(inode)->nfs4_state; ++ struct nfs_inode *nfsi = NFS_I(inode); ++ struct nfs_delegation *delegation; ++ int res = 0; ++ ++ nfs_msync_inode(inode); ++ down_read(&clp->cl_sem); ++ /* Guard against new delegated open calls */ ++ down_write(&nfsi->rwsem); ++ spin_lock(&clp->cl_lock); ++ delegation = nfsi->delegation; ++ if (delegation != NULL) { ++ list_del_init(&delegation->super_list); ++ nfsi->delegation = NULL; ++ nfsi->delegation_state = 0; ++ } ++ spin_unlock(&clp->cl_lock); ++ nfs_delegation_claim_opens(inode); ++ 
up_write(&nfsi->rwsem); ++ up_read(&clp->cl_sem); ++ nfs_msync_inode(inode); ++ ++ if (delegation != NULL) ++ res = nfs_do_return_delegation(inode, delegation); ++ return res; ++} ++ ++/* ++ * Return all delegations associated to a super block ++ */ ++void nfs_return_all_delegations(struct super_block *sb) ++{ ++ struct nfs4_client *clp = NFS_SB(sb)->nfs4_state; ++ struct nfs_delegation *delegation; ++ struct inode *inode; ++ ++ if (clp == NULL) ++ return; ++restart: ++ spin_lock(&clp->cl_lock); ++ list_for_each_entry(delegation, &clp->cl_delegations, super_list) { ++ if (delegation->inode->i_sb != sb) ++ continue; ++ inode = igrab(delegation->inode); ++ if (inode == NULL) ++ continue; ++ spin_unlock(&clp->cl_lock); ++ nfs_inode_return_delegation(inode); ++ iput(inode); ++ goto restart; ++ } ++ spin_unlock(&clp->cl_lock); ++} ++ ++struct recall_threadargs { ++ struct inode *inode; ++ struct nfs4_client *clp; ++ const nfs4_stateid *stateid; ++ ++ struct completion started; ++ int result; ++}; ++ ++static int recall_thread(void *data) ++{ ++ struct recall_threadargs *args = (struct recall_threadargs *)data; ++ struct inode *inode = igrab(args->inode); ++ struct nfs4_client *clp = NFS_SERVER(inode)->nfs4_state; ++ struct nfs_inode *nfsi = NFS_I(inode); ++ struct nfs_delegation *delegation; ++ ++ daemonize("nfsv4-delegreturn"); ++ ++ nfs_msync_inode(inode); ++ down_read(&clp->cl_sem); ++ down_write(&nfsi->rwsem); ++ spin_lock(&clp->cl_lock); ++ delegation = nfsi->delegation; ++ if (delegation != NULL && memcmp(delegation->stateid.data, ++ args->stateid->data, ++ sizeof(delegation->stateid.data)) == 0) { ++ list_del_init(&delegation->super_list); ++ nfsi->delegation = NULL; ++ nfsi->delegation_state = 0; ++ args->result = 0; ++ } else { ++ delegation = NULL; ++ args->result = -ENOENT; ++ } ++ spin_unlock(&clp->cl_lock); ++ complete(&args->started); ++ nfs_delegation_claim_opens(inode); ++ up_write(&nfsi->rwsem); ++ up_read(&clp->cl_sem); ++ nfs_msync_inode(inode); ++ ++ 
if (delegation != NULL) ++ nfs_do_return_delegation(inode, delegation); ++ iput(inode); ++ module_put_and_exit(0); ++} ++ ++/* ++ * Asynchronous delegation recall! ++ */ ++int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid) ++{ ++ struct recall_threadargs data = { ++ .inode = inode, ++ .stateid = stateid, ++ }; ++ int status; ++ ++ init_completion(&data.started); ++ __module_get(THIS_MODULE); ++ status = kernel_thread(recall_thread, &data, CLONE_KERNEL); ++ if (status < 0) ++ goto out_module_put; ++ wait_for_completion(&data.started); ++ return data.result; ++out_module_put: ++ module_put(THIS_MODULE); ++ return status; ++} ++ ++/* ++ * Retrieve the inode associated with a delegation ++ */ ++struct inode *nfs_delegation_find_inode(struct nfs4_client *clp, const struct nfs_fh *fhandle) ++{ ++ struct nfs_delegation *delegation; ++ struct inode *res = NULL; ++ spin_lock(&clp->cl_lock); ++ list_for_each_entry(delegation, &clp->cl_delegations, super_list) { ++ if (nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) { ++ res = igrab(delegation->inode); ++ break; ++ } ++ } ++ spin_unlock(&clp->cl_lock); ++ return res; ++} ++ ++/* ++ * Mark all delegations as needing to be reclaimed ++ */ ++void nfs_delegation_mark_reclaim(struct nfs4_client *clp) ++{ ++ struct nfs_delegation *delegation; ++ spin_lock(&clp->cl_lock); ++ list_for_each_entry(delegation, &clp->cl_delegations, super_list) ++ delegation->flags |= NFS_DELEGATION_NEED_RECLAIM; ++ spin_unlock(&clp->cl_lock); ++} ++ ++/* ++ * Reap all unclaimed delegations after reboot recovery is done ++ */ ++void nfs_delegation_reap_unclaimed(struct nfs4_client *clp) ++{ ++ struct nfs_delegation *delegation, *n; ++ LIST_HEAD(head); ++ spin_lock(&clp->cl_lock); ++ list_for_each_entry_safe(delegation, n, &clp->cl_delegations, super_list) { ++ if ((delegation->flags & NFS_DELEGATION_NEED_RECLAIM) == 0) ++ continue; ++ list_move(&delegation->super_list, &head); ++ 
NFS_I(delegation->inode)->delegation = NULL; ++ NFS_I(delegation->inode)->delegation_state = 0; ++ } ++ spin_unlock(&clp->cl_lock); ++ while(!list_empty(&head)) { ++ delegation = list_entry(head.next, struct nfs_delegation, super_list); ++ list_del(&delegation->super_list); ++ nfs_free_delegation(delegation); ++ } ++} +--- linux-2.6.7/fs/nfs/delegation.h.lsec 2005-03-23 14:28:22.546622088 -0700 ++++ linux-2.6.7/fs/nfs/delegation.h 2005-03-23 14:28:22.546622088 -0700 +@@ -0,0 +1,56 @@ ++/* ++ * linux/fs/nfs/delegation.h ++ * ++ * Copyright (c) Trond Myklebust ++ * ++ * Definitions pertaining to NFS delegated files ++ */ ++#ifndef FS_NFS_DELEGATION_H ++#define FS_NFS_DELEGATION_H ++ ++#if defined(CONFIG_NFS_V4) ++/* ++ * NFSv4 delegation ++ */ ++struct nfs_delegation { ++ struct list_head super_list; ++ struct rpc_cred *cred; ++ struct inode *inode; ++ nfs4_stateid stateid; ++ int type; ++#define NFS_DELEGATION_NEED_RECLAIM 1 ++ long flags; ++ loff_t maxsize; ++}; ++ ++int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); ++void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); ++int nfs_inode_return_delegation(struct inode *inode); ++int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); ++ ++struct inode *nfs_delegation_find_inode(struct nfs4_client *clp, const struct nfs_fh *fhandle); ++void nfs_return_all_delegations(struct super_block *sb); ++ ++void nfs_delegation_mark_reclaim(struct nfs4_client *clp); ++void nfs_delegation_reap_unclaimed(struct nfs4_client *clp); ++ ++/* NFSv4 delegation-related procedures */ ++int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid); ++int nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state); ++ ++static inline int nfs_have_delegation(struct inode *inode, int flags) ++{ ++ flags &= FMODE_READ|FMODE_WRITE; ++ rmb(); ++ if 
((NFS_I(inode)->delegation_state & flags) == flags) ++ return 1; ++ return 0; ++} ++#else ++static inline int nfs_have_delegation(struct inode *inode, int flags) ++{ ++ return 0; ++} ++#endif ++ ++#endif +--- linux-2.6.7/fs/nfs/nfs3proc.c.lsec 2004-06-15 23:19:23.000000000 -0600 ++++ linux-2.6.7/fs/nfs/nfs3proc.c 2005-03-23 14:28:22.820580440 -0700 +@@ -68,18 +68,6 @@ nfs3_async_handle_jukebox(struct rpc_tas + return 1; + } + +-static struct rpc_cred * +-nfs_cred(struct inode *inode, struct file *filp) +-{ +- struct rpc_cred *cred = NULL; +- +- if (filp) +- cred = (struct rpc_cred *)filp->private_data; +- if (!cred) +- cred = NFS_I(inode)->mm_cred; +- return cred; +-} +- + /* + * Bare-bones access to getattr: this is for nfs_read_super. + */ +@@ -164,8 +152,7 @@ nfs3_proc_lookup(struct inode *dir, stru + return status; + } + +-static int +-nfs3_proc_access(struct inode *inode, struct rpc_cred *cred, int mode) ++static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) + { + struct nfs_fattr fattr; + struct nfs3_accessargs arg = { +@@ -178,9 +165,10 @@ nfs3_proc_access(struct inode *inode, st + .rpc_proc = &nfs3_procedures[NFS3PROC_ACCESS], + .rpc_argp = &arg, + .rpc_resp = &res, +- .rpc_cred = cred ++ .rpc_cred = entry->cred + }; +- int status; ++ int mode = entry->mask; ++ int status; + + dprintk("NFS call access\n"); + fattr.valid = 0; +@@ -200,10 +188,16 @@ nfs3_proc_access(struct inode *inode, st + } + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + nfs_refresh_inode(inode, &fattr); +- dprintk("NFS reply access\n"); +- +- if (status == 0 && (arg.access & res.access) != arg.access) +- status = -EACCES; ++ if (status == 0) { ++ entry->mask = 0; ++ if (res.access & NFS3_ACCESS_READ) ++ entry->mask |= MAY_READ; ++ if (res.access & (NFS3_ACCESS_MODIFY | NFS3_ACCESS_EXTEND | NFS3_ACCESS_DELETE)) ++ entry->mask |= MAY_WRITE; ++ if (res.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE)) ++ entry->mask |= MAY_EXEC; ++ } ++ dprintk("NFS reply 
access, status = %d\n", status); + return status; + } + +@@ -227,8 +221,7 @@ nfs3_proc_readlink(struct inode *inode, + return status; + } + +-static int +-nfs3_proc_read(struct nfs_read_data *rdata, struct file *filp) ++static int nfs3_proc_read(struct nfs_read_data *rdata) + { + int flags = rdata->flags; + struct inode * inode = rdata->inode; +@@ -237,13 +230,13 @@ nfs3_proc_read(struct nfs_read_data *rda + .rpc_proc = &nfs3_procedures[NFS3PROC_READ], + .rpc_argp = &rdata->args, + .rpc_resp = &rdata->res, ++ .rpc_cred = rdata->cred, + }; + int status; + + dprintk("NFS call read %d @ %Ld\n", rdata->args.count, + (long long) rdata->args.offset); + fattr->valid = 0; +- msg.rpc_cred = nfs_cred(inode, filp); + status = rpc_call_sync(NFS_CLIENT(inode), &msg, flags); + if (status >= 0) + nfs_refresh_inode(inode, fattr); +@@ -251,8 +244,7 @@ nfs3_proc_read(struct nfs_read_data *rda + return status; + } + +-static int +-nfs3_proc_write(struct nfs_write_data *wdata, struct file *filp) ++static int nfs3_proc_write(struct nfs_write_data *wdata) + { + int rpcflags = wdata->flags; + struct inode * inode = wdata->inode; +@@ -261,13 +253,13 @@ nfs3_proc_write(struct nfs_write_data *w + .rpc_proc = &nfs3_procedures[NFS3PROC_WRITE], + .rpc_argp = &wdata->args, + .rpc_resp = &wdata->res, ++ .rpc_cred = wdata->cred, + }; + int status; + + dprintk("NFS call write %d @ %Ld\n", wdata->args.count, + (long long) wdata->args.offset); + fattr->valid = 0; +- msg.rpc_cred = nfs_cred(inode, filp); + status = rpc_call_sync(NFS_CLIENT(inode), &msg, rpcflags); + if (status >= 0) + nfs_refresh_inode(inode, fattr); +@@ -275,8 +267,7 @@ nfs3_proc_write(struct nfs_write_data *w + return status < 0? 
status : wdata->res.count; + } + +-static int +-nfs3_proc_commit(struct nfs_write_data *cdata, struct file *filp) ++static int nfs3_proc_commit(struct nfs_write_data *cdata) + { + struct inode * inode = cdata->inode; + struct nfs_fattr * fattr = cdata->res.fattr; +@@ -284,13 +275,13 @@ nfs3_proc_commit(struct nfs_write_data * + .rpc_proc = &nfs3_procedures[NFS3PROC_COMMIT], + .rpc_argp = &cdata->args, + .rpc_resp = &cdata->res, ++ .rpc_cred = cdata->cred, + }; + int status; + + dprintk("NFS call commit %d @ %Ld\n", cdata->args.count, + (long long) cdata->args.offset); + fattr->valid = 0; +- msg.rpc_cred = nfs_cred(inode, filp); + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + if (status >= 0) + nfs_refresh_inode(inode, fattr); +@@ -534,6 +525,8 @@ nfs3_proc_symlink(struct inode *dir, str + }; + int status; + ++ if (path->len > NFS3_MAXPATHLEN) ++ return -ENAMETOOLONG; + dprintk("NFS call symlink %s -> %s\n", name->name, path->name); + dir_attr.valid = 0; + fattr->valid = 0; +@@ -832,27 +825,6 @@ nfs3_proc_commit_setup(struct nfs_write_ + rpc_call_setup(task, &msg, 0); + } + +-/* +- * Set up the nfspage struct with the right credentials +- */ +-void +-nfs3_request_init(struct nfs_page *req, struct file *filp) +-{ +- req->wb_cred = get_rpccred(nfs_cred(req->wb_inode, filp)); +-} +- +-static int +-nfs3_request_compatible(struct nfs_page *req, struct file *filp, struct page *page) +-{ +- if (req->wb_file != filp) +- return 0; +- if (req->wb_page != page) +- return 0; +- if (req->wb_cred != nfs_file_cred(filp)) +- return 0; +- return 1; +-} +- + static int + nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl) + { +@@ -863,6 +835,7 @@ struct nfs_rpc_ops nfs_v3_clientops = { + .version = 3, /* protocol version */ + .dentry_ops = &nfs_dentry_operations, + .dir_inode_ops = &nfs_dir_inode_operations, ++ .file_inode_ops = &nfs_file_inode_operations, + .getroot = nfs3_proc_get_root, + .getattr = nfs3_proc_getattr, + .setattr = nfs3_proc_setattr, +@@ -892,7 
+865,5 @@ struct nfs_rpc_ops nfs_v3_clientops = { + .commit_setup = nfs3_proc_commit_setup, + .file_open = nfs_open, + .file_release = nfs_release, +- .request_init = nfs3_request_init, +- .request_compatible = nfs3_request_compatible, + .lock = nfs3_proc_lock, + }; +--- linux-2.6.7/fs/nfs/proc.c.lsec 2004-06-15 23:20:03.000000000 -0600 ++++ linux-2.6.7/fs/nfs/proc.c 2005-03-23 14:28:23.058544264 -0700 +@@ -49,18 +49,6 @@ + + extern struct rpc_procinfo nfs_procedures[]; + +-static struct rpc_cred * +-nfs_cred(struct inode *inode, struct file *filp) +-{ +- struct rpc_cred *cred = NULL; +- +- if (filp) +- cred = (struct rpc_cred *)filp->private_data; +- if (!cred) +- cred = NFS_I(inode)->mm_cred; +- return cred; +-} +- + /* + * Bare-bones access to getattr: this is for nfs_read_super. + */ +@@ -167,8 +155,7 @@ nfs_proc_readlink(struct inode *inode, s + return status; + } + +-static int +-nfs_proc_read(struct nfs_read_data *rdata, struct file *filp) ++static int nfs_proc_read(struct nfs_read_data *rdata) + { + int flags = rdata->flags; + struct inode * inode = rdata->inode; +@@ -177,15 +164,14 @@ nfs_proc_read(struct nfs_read_dat + .rpc_proc = &nfs_procedures[NFSPROC_READ], + .rpc_argp = &rdata->args, + .rpc_resp = &rdata->res, ++ .rpc_cred = rdata->cred, + }; + int status; + + dprintk("NFS call read %d @ %Ld\n", rdata->args.count, + (long long) rdata->args.offset); + fattr->valid = 0; +- msg.rpc_cred = nfs_cred(inode, filp); + status = rpc_call_sync(NFS_CLIENT(inode), &msg, flags); +- + if (status >= 0) { + nfs_refresh_inode(inode, fattr); + /* Emulate the eof flag, which isn't normally needed in NFSv2 +@@ -198,8 +184,7 @@ nfs_proc_read(struct nfs_read_dat + return status; + } + +-static int +-nfs_proc_write(struct nfs_write_data *wdata, struct file *filp) ++static int nfs_proc_write(struct nfs_write_data *wdata) + { + int flags = wdata->flags; + struct inode * inode = wdata->inode; +@@ -208,13 +193,13 @@ nfs_proc_write(struct nfs_write_data *wd + 
.rpc_proc = &nfs_procedures[NFSPROC_WRITE], + .rpc_argp = &wdata->args, + .rpc_resp = &wdata->res, ++ .rpc_cred = wdata->cred, + }; + int status; + + dprintk("NFS call write %d @ %Ld\n", wdata->args.count, + (long long) wdata->args.offset); + fattr->valid = 0; +- msg.rpc_cred = nfs_cred(inode, filp); + status = rpc_call_sync(NFS_CLIENT(inode), &msg, flags); + if (status >= 0) { + nfs_refresh_inode(inode, fattr); +@@ -400,6 +385,8 @@ nfs_proc_symlink(struct inode *dir, stru + }; + int status; + ++ if (path->len > NFS2_MAXPATHLEN) ++ return -ENAMETOOLONG; + dprintk("NFS call symlink %s -> %s\n", name->name, path->name); + fattr->valid = 0; + status = rpc_call(NFS_CLIENT(dir), NFSPROC_SYMLINK, &arg, NULL, 0); +@@ -619,27 +606,6 @@ nfs_proc_commit_setup(struct nfs_write_d + BUG(); + } + +-/* +- * Set up the nfspage struct with the right credentials +- */ +-static void +-nfs_request_init(struct nfs_page *req, struct file *filp) +-{ +- req->wb_cred = get_rpccred(nfs_cred(req->wb_inode, filp)); +-} +- +-static int +-nfs_request_compatible(struct nfs_page *req, struct file *filp, struct page *page) +-{ +- if (req->wb_file != filp) +- return 0; +- if (req->wb_page != page) +- return 0; +- if (req->wb_cred != nfs_file_cred(filp)) +- return 0; +- return 1; +-} +- + static int + nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl) + { +@@ -651,6 +617,7 @@ struct nfs_rpc_ops nfs_v2_clientops = { + .version = 2, /* protocol version */ + .dentry_ops = &nfs_dentry_operations, + .dir_inode_ops = &nfs_dir_inode_operations, ++ .file_inode_ops = &nfs_file_inode_operations, + .getroot = nfs_proc_get_root, + .getattr = nfs_proc_getattr, + .setattr = nfs_proc_setattr, +@@ -680,7 +647,5 @@ struct nfs_rpc_ops nfs_v2_clientops = { + .commit_setup = nfs_proc_commit_setup, + .file_open = nfs_open, + .file_release = nfs_release, +- .request_init = nfs_request_init, +- .request_compatible = nfs_request_compatible, + .lock = nfs_proc_lock, + }; +--- linux-2.6.7/fs/nfs/file.c.lsec 
2004-06-15 23:19:37.000000000 -0600 ++++ linux-2.6.7/fs/nfs/file.c 2005-03-23 14:28:22.760589560 -0700 +@@ -31,6 +31,8 @@ + #include + #include + ++#include "delegation.h" ++ + #define NFSDBG_FACILITY NFSDBG_FILE + + static long nfs_file_fcntl(int fd, unsigned int cmd, +@@ -66,6 +68,19 @@ struct inode_operations nfs_file_inode_o + .setattr = nfs_setattr, + }; + ++#ifdef CONFIG_NFS_V4 ++ ++struct inode_operations nfs4_file_inode_operations = { ++ .permission = nfs_permission, ++ .getattr = nfs_getattr, ++ .setattr = nfs_setattr, ++ .getxattr = nfs_getxattr, ++ .setxattr = nfs_setxattr, ++ .listxattr = nfs_listxattr, ++}; ++ ++#endif /* CONFIG_NFS_V4 */ ++ + /* Hack for future NFS swap support */ + #ifndef IS_SWAPFILE + # define IS_SWAPFILE(inode) (0) +@@ -127,6 +142,7 @@ nfs_file_release(struct inode *inode, st + static int + nfs_file_flush(struct file *file) + { ++ struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data; + struct inode *inode = file->f_dentry->d_inode; + int status; + +@@ -138,9 +154,9 @@ nfs_file_flush(struct file *file) + /* Ensure that data+attribute caches are up to date after close() */ + status = nfs_wb_all(inode); + if (!status) { +- status = file->f_error; +- file->f_error = 0; +- if (!status) ++ status = ctx->error; ++ ctx->error = 0; ++ if (!status && !nfs_have_delegation(inode, FMODE_READ)) + __nfs_revalidate_inode(NFS_SERVER(inode), inode); + } + unlock_kernel(); +@@ -211,6 +227,7 @@ nfs_file_mmap(struct file * file, struct + static int + nfs_fsync(struct file *file, struct dentry *dentry, int datasync) + { ++ struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data; + struct inode *inode = dentry->d_inode; + int status; + +@@ -219,8 +236,8 @@ nfs_fsync(struct file *file, struct dent + lock_kernel(); + status = nfs_wb_all(inode); + if (!status) { +- status = file->f_error; +- file->f_error = 0; ++ status = ctx->error; ++ ctx->error = 0; + } + unlock_kernel(); + return status; +@@ -302,6 +319,90 
@@ out_swapfile: + goto out; + } + ++static int do_getlk(struct file *filp, int cmd, struct file_lock *fl) ++{ ++ struct inode *inode = filp->f_mapping->host; ++ int status; ++ ++ lock_kernel(); ++ status = NFS_PROTO(inode)->lock(filp, cmd, fl); ++ unlock_kernel(); ++ return status; ++} ++ ++static int do_unlk(struct file *filp, int cmd, struct file_lock *fl) ++{ ++ struct inode *inode = filp->f_mapping->host; ++ sigset_t oldset; ++ int status; ++ ++ rpc_clnt_sigmask(NFS_CLIENT(inode), &oldset); ++ /* ++ * Flush all pending writes before doing anything ++ * with locks.. ++ */ ++ filemap_fdatawrite(filp->f_mapping); ++ down(&inode->i_sem); ++ nfs_wb_all(inode); ++ up(&inode->i_sem); ++ filemap_fdatawait(filp->f_mapping); ++ ++ /* NOTE: special case ++ * If we're signalled while cleaning up locks on process exit, we ++ * still need to complete the unlock. ++ */ ++ lock_kernel(); ++ status = NFS_PROTO(inode)->lock(filp, cmd, fl); ++ rpc_clnt_sigunmask(NFS_CLIENT(inode), &oldset); ++ return status; ++} ++ ++static int do_setlk(struct file *filp, int cmd, struct file_lock *fl) ++{ ++ struct inode *inode = filp->f_mapping->host; ++ int status; ++ ++ /* ++ * Flush all pending writes before doing anything ++ * with locks.. ++ */ ++ status = filemap_fdatawrite(filp->f_mapping); ++ if (status == 0) { ++ down(&inode->i_sem); ++ status = nfs_wb_all(inode); ++ up(&inode->i_sem); ++ if (status == 0) ++ status = filemap_fdatawait(filp->f_mapping); ++ } ++ if (status < 0) ++ return status; ++ ++ lock_kernel(); ++ status = NFS_PROTO(inode)->lock(filp, cmd, fl); ++ /* If we were signalled we still need to ensure that ++ * we clean up any state on the server. We therefore ++ * record the lock call as having succeeded in order to ++ * ensure that locks_remove_posix() cleans it out when ++ * the process exits. 
++ */ ++ if (status == -EINTR || status == -ERESTARTSYS) ++ posix_lock_file(filp, fl); ++ unlock_kernel(); ++ if (status < 0) ++ return status; ++ /* ++ * Make sure we clear the cache whenever we try to get the lock. ++ * This makes locking act as a cache coherency point. ++ */ ++ filemap_fdatawrite(filp->f_mapping); ++ down(&inode->i_sem); ++ nfs_wb_all(inode); /* we may have slept */ ++ up(&inode->i_sem); ++ filemap_fdatawait(filp->f_mapping); ++ nfs_zap_caches(inode); ++ return 0; ++} ++ + /* + * Lock a (portion of) a file + */ +@@ -309,8 +410,6 @@ int + nfs_lock(struct file *filp, int cmd, struct file_lock *fl) + { + struct inode * inode = filp->f_mapping->host; +- int status = 0; +- int status2; + + dprintk("NFS: nfs_lock(f=%s/%ld, t=%x, fl=%x, r=%Ld:%Ld)\n", + inode->i_sb->s_id, inode->i_ino, +@@ -328,8 +427,8 @@ nfs_lock(struct file *filp, int cmd, str + /* Fake OK code if mounted without NLM support */ + if (NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM) { + if (IS_GETLK(cmd)) +- status = LOCK_USE_CLNT; +- goto out_ok; ++ return LOCK_USE_CLNT; ++ return 0; + } + } + +@@ -340,45 +439,12 @@ nfs_lock(struct file *filp, int cmd, str + * Not sure whether that would be unique, though, or whether + * that would break in other places. + */ +- if (!fl->fl_owner || !(fl->fl_flags & FL_POSIX)) ++ if (!(fl->fl_flags & FL_POSIX)) + return -ENOLCK; + +- /* +- * Flush all pending writes before doing anything +- * with locks.. +- */ +- status = filemap_fdatawrite(filp->f_mapping); +- down(&inode->i_sem); +- status2 = nfs_wb_all(inode); +- if (!status) +- status = status2; +- up(&inode->i_sem); +- status2 = filemap_fdatawait(filp->f_mapping); +- if (!status) +- status = status2; +- if (status < 0) +- return status; +- +- lock_kernel(); +- status = NFS_PROTO(inode)->lock(filp, cmd, fl); +- unlock_kernel(); +- if (status < 0) +- return status; +- +- status = 0; +- +- /* +- * Make sure we clear the cache whenever we try to get the lock. 
+- * This makes locking act as a cache coherency point. +- */ +- out_ok: +- if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { +- filemap_fdatawrite(filp->f_mapping); +- down(&inode->i_sem); +- nfs_wb_all(inode); /* we may have slept */ +- up(&inode->i_sem); +- filemap_fdatawait(filp->f_mapping); +- nfs_zap_caches(inode); +- } +- return status; ++ if (IS_GETLK(cmd)) ++ return do_getlk(filp, cmd, fl); ++ if (fl->fl_type == F_UNLCK) ++ return do_unlk(filp, cmd, fl); ++ return do_setlk(filp, cmd, fl); + } +--- linux-2.6.7/fs/nfs/write.c.lsec 2004-06-15 23:19:43.000000000 -0600 ++++ linux-2.6.7/fs/nfs/write.c 2005-03-23 14:28:23.225518880 -0700 +@@ -63,6 +63,8 @@ + #include + #include + ++#include "delegation.h" ++ + #define NFSDBG_FACILITY NFSDBG_PAGECACHE + + #define MIN_POOL_WRITE (32) +@@ -71,7 +73,8 @@ + /* + * Local function declarations + */ +-static struct nfs_page * nfs_update_request(struct file*, struct inode *, ++static struct nfs_page * nfs_update_request(struct nfs_open_context*, ++ struct inode *, + struct page *, + unsigned int, unsigned int); + static void nfs_writeback_done_partial(struct nfs_write_data *, int); +@@ -173,7 +176,7 @@ static void nfs_mark_uptodate(struct pag + * Write a page synchronously. + * Offset is the data offset within the page. 
+ */ +-static int nfs_writepage_sync(struct file *file, struct inode *inode, ++static int nfs_writepage_sync(struct nfs_open_context *ctx, struct inode *inode, + struct page *page, unsigned int offset, unsigned int count, + int how) + { +@@ -187,9 +190,10 @@ static int nfs_writepage_sync(struct fil + + memset(wdata, 0, sizeof(*wdata)); + wdata->flags = how; ++ wdata->cred = ctx->cred; + wdata->inode = inode; + wdata->args.fh = NFS_FH(inode); +- wdata->args.lockowner = current->files; ++ wdata->args.context = ctx; + wdata->args.pages = &page; + wdata->args.stable = NFS_FILE_SYNC; + wdata->args.pgbase = offset; +@@ -208,7 +212,7 @@ static int nfs_writepage_sync(struct fil + wdata->args.count = count; + wdata->args.offset = page_offset(page) + wdata->args.pgbase; + +- result = NFS_PROTO(inode)->write(wdata, file); ++ result = NFS_PROTO(inode)->write(wdata); + + if (result < 0) { + /* Must mark the page invalid after I/O error */ +@@ -241,13 +245,14 @@ io_error: + return written ? written : result; + } + +-static int nfs_writepage_async(struct file *file, struct inode *inode, +- struct page *page, unsigned int offset, unsigned int count) ++static int nfs_writepage_async(struct nfs_open_context *ctx, ++ struct inode *inode, struct page *page, ++ unsigned int offset, unsigned int count) + { + struct nfs_page *req; + int status; + +- req = nfs_update_request(file, inode, page, offset, count); ++ req = nfs_update_request(ctx, inode, page, offset, count); + status = (IS_ERR(req)) ? 
PTR_ERR(req) : 0; + if (status < 0) + goto out; +@@ -274,6 +279,7 @@ static int wb_priority(struct writeback_ + */ + int nfs_writepage(struct page *page, struct writeback_control *wbc) + { ++ struct nfs_open_context *ctx; + struct inode *inode = page->mapping->host; + unsigned long end_index; + unsigned offset = PAGE_CACHE_SIZE; +@@ -308,16 +314,21 @@ int nfs_writepage(struct page *page, str + if (page->index >= end_index+1 || !offset) + goto out; + do_it: ++ ctx = nfs_find_open_context(inode, FMODE_WRITE); ++ if (ctx == NULL) { ++ err = -EBADF; ++ goto out; ++ } + lock_kernel(); + if (!IS_SYNC(inode) && inode_referenced) { +- err = nfs_writepage_async(NULL, inode, page, 0, offset); ++ err = nfs_writepage_async(ctx, inode, page, 0, offset); + if (err >= 0) { + err = 0; + if (wbc->for_reclaim) + nfs_flush_inode(inode, 0, 0, FLUSH_STABLE); + } + } else { +- err = nfs_writepage_sync(NULL, inode, page, 0, ++ err = nfs_writepage_sync(ctx, inode, page, 0, + offset, priority); + if (err >= 0) { + if (err != offset) +@@ -326,6 +337,7 @@ do_it: + } + } + unlock_kernel(); ++ put_nfs_open_context(ctx); + out: + unlock_page(page); + if (inode_referenced) +@@ -374,8 +386,7 @@ out: + /* + * Insert a write request into an inode + */ +-static inline int +-nfs_inode_add_request(struct inode *inode, struct nfs_page *req) ++static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) + { + struct nfs_inode *nfsi = NFS_I(inode); + int error; +@@ -387,6 +398,8 @@ nfs_inode_add_request(struct inode *inod + if (!nfsi->npages) { + igrab(inode); + nfs_begin_data_update(inode); ++ if (nfs_have_delegation(inode, FMODE_WRITE)) ++ nfsi->change_attr++; + } + nfsi->npages++; + req->wb_count++; +@@ -404,7 +417,7 @@ nfs_inode_remove_request(struct nfs_page + + BUG_ON (!NFS_WBACK_BUSY(req)); + spin_lock(&nfs_wreq_lock); +- inode = req->wb_inode; ++ inode = req->wb_context->dentry->d_inode; + nfsi = NFS_I(inode); + radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index); + 
nfsi->npages--; +@@ -450,7 +463,7 @@ nfs_find_request(struct inode *inode, un + static void + nfs_mark_request_dirty(struct nfs_page *req) + { +- struct inode *inode = req->wb_inode; ++ struct inode *inode = req->wb_context->dentry->d_inode; + struct nfs_inode *nfsi = NFS_I(inode); + + spin_lock(&nfs_wreq_lock); +@@ -467,7 +480,7 @@ nfs_mark_request_dirty(struct nfs_page * + static inline int + nfs_dirty_request(struct nfs_page *req) + { +- struct nfs_inode *nfsi = NFS_I(req->wb_inode); ++ struct nfs_inode *nfsi = NFS_I(req->wb_context->dentry->d_inode); + return !list_empty(&req->wb_list) && req->wb_list_head == &nfsi->dirty; + } + +@@ -478,7 +491,7 @@ nfs_dirty_request(struct nfs_page *req) + static void + nfs_mark_request_commit(struct nfs_page *req) + { +- struct inode *inode = req->wb_inode; ++ struct inode *inode = req->wb_context->dentry->d_inode; + struct nfs_inode *nfsi = NFS_I(inode); + + spin_lock(&nfs_wreq_lock); +@@ -619,9 +632,9 @@ static int nfs_wait_on_write_congestion( + * + * Note: Should always be called with the Page Lock held! + */ +-static struct nfs_page * +-nfs_update_request(struct file* file, struct inode *inode, struct page *page, +- unsigned int offset, unsigned int bytes) ++static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx, ++ struct inode *inode, struct page *page, ++ unsigned int offset, unsigned int bytes) + { + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_page *req, *new = NULL; +@@ -668,13 +681,9 @@ nfs_update_request(struct file* file, st + } + spin_unlock(&nfs_wreq_lock); + +- new = nfs_create_request(file, inode, page, offset, bytes); ++ new = nfs_create_request(ctx, inode, page, offset, bytes); + if (IS_ERR(new)) + return new; +- if (file) { +- new->wb_file = file; +- get_file(file); +- } + } + + /* We have a request for our page. +@@ -684,7 +693,7 @@ nfs_update_request(struct file* file, st + * request. 
+ */ + rqend = req->wb_offset + req->wb_bytes; +- if (req->wb_file != file ++ if (req->wb_context != ctx + || req->wb_page != page + || !nfs_dirty_request(req) + || offset > rqend || end < req->wb_offset) { +@@ -705,9 +714,9 @@ nfs_update_request(struct file* file, st + return req; + } + +-int +-nfs_flush_incompatible(struct file *file, struct page *page) ++int nfs_flush_incompatible(struct file *file, struct page *page) + { ++ struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data; + struct inode *inode = page->mapping->host; + struct nfs_page *req; + int status = 0; +@@ -721,7 +730,7 @@ nfs_flush_incompatible(struct file *file + */ + req = nfs_find_request(inode, page->index); + if (req) { +- if (!NFS_PROTO(inode)->request_compatible(req, file, page)) ++ if (req->wb_page != page || ctx != req->wb_context) + status = nfs_wb_page(inode, page); + nfs_release_request(req); + } +@@ -737,6 +746,7 @@ nfs_flush_incompatible(struct file *file + int nfs_updatepage(struct file *file, struct page *page, + unsigned int offset, unsigned int count) + { ++ struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data; + struct dentry *dentry = file->f_dentry; + struct inode *inode = page->mapping->host; + struct nfs_page *req; +@@ -747,7 +757,7 @@ int nfs_updatepage(struct file *file, st + count, (long long)(page_offset(page) +offset)); + + if (IS_SYNC(inode)) { +- status = nfs_writepage_sync(file, inode, page, offset, count, 0); ++ status = nfs_writepage_sync(ctx, inode, page, offset, count, 0); + if (status > 0) { + if (offset == 0 && status == PAGE_CACHE_SIZE) + SetPageUptodate(page); +@@ -784,7 +794,7 @@ int nfs_updatepage(struct file *file, st + * it out now. + */ + do { +- req = nfs_update_request(file, inode, page, offset, count); ++ req = nfs_update_request(ctx, inode, page, offset, count); + status = (IS_ERR(req)) ? 
PTR_ERR(req) : 0; + if (status != -EBUSY) + break; +@@ -860,16 +870,15 @@ static void nfs_write_rpcsetup(struct nf + * NB: take care not to mess about with data->commit et al. */ + + data->req = req; +- data->inode = inode = req->wb_inode; +- data->cred = req->wb_cred; ++ data->inode = inode = req->wb_context->dentry->d_inode; ++ data->cred = req->wb_context->cred; + + data->args.fh = NFS_FH(inode); + data->args.offset = req_offset(req) + offset; + data->args.pgbase = req->wb_pgbase + offset; + data->args.pages = data->pagevec; + data->args.count = count; +- data->args.lockowner = req->wb_lockowner; +- data->args.state = req->wb_state; ++ data->args.context = req->wb_context; + + data->res.fattr = &data->fattr; + data->res.count = count; +@@ -1029,7 +1038,7 @@ nfs_flush_list(struct list_head *head, i + while (!list_empty(head)) { + pages += nfs_coalesce_requests(head, &one_request, wpages); + req = nfs_list_entry(one_request.next); +- error = nfs_flush_one(&one_request, req->wb_inode, how); ++ error = nfs_flush_one(&one_request, req->wb_context->dentry->d_inode, how); + if (error < 0) + break; + } +@@ -1054,16 +1063,15 @@ static void nfs_writeback_done_partial(s + struct page *page = req->wb_page; + + dprintk("NFS: write (%s/%Ld %d@%Ld)", +- req->wb_inode->i_sb->s_id, +- (long long)NFS_FILEID(req->wb_inode), ++ req->wb_context->dentry->d_inode->i_sb->s_id, ++ (long long)NFS_FILEID(req->wb_context->dentry->d_inode), + req->wb_bytes, + (long long)req_offset(req)); + + if (status < 0) { + ClearPageUptodate(page); + SetPageError(page); +- if (req->wb_file) +- req->wb_file->f_error = status; ++ req->wb_context->error = status; + dprintk(", error = %d\n", status); + } else { + #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) +@@ -1104,16 +1112,15 @@ static void nfs_writeback_done_full(stru + page = req->wb_page; + + dprintk("NFS: write (%s/%Ld %d@%Ld)", +- req->wb_inode->i_sb->s_id, +- (long long)NFS_FILEID(req->wb_inode), ++ 
req->wb_context->dentry->d_inode->i_sb->s_id, ++ (long long)NFS_FILEID(req->wb_context->dentry->d_inode), + req->wb_bytes, + (long long)req_offset(req)); + + if (status < 0) { + ClearPageUptodate(page); + SetPageError(page); +- if (req->wb_file) +- req->wb_file->f_error = status; ++ req->wb_context->error = status; + end_page_writeback(page); + nfs_inode_remove_request(req); + dprintk(", error = %d\n", status); +@@ -1232,7 +1239,7 @@ static void nfs_commit_rpcsetup(struct l + list_splice_init(head, &data->pages); + first = nfs_list_entry(data->pages.next); + last = nfs_list_entry(data->pages.prev); +- inode = first->wb_inode; ++ inode = first->wb_context->dentry->d_inode; + + /* + * Determine the offset range of requests in the COMMIT call. +@@ -1246,7 +1253,7 @@ static void nfs_commit_rpcsetup(struct l + len = 0; + + data->inode = inode; +- data->cred = first->wb_cred; ++ data->cred = first->wb_context->cred; + + data->args.fh = NFS_FH(data->inode); + data->args.offset = start; +@@ -1313,13 +1320,12 @@ nfs_commit_done(struct rpc_task *task) + nfs_list_remove_request(req); + + dprintk("NFS: commit (%s/%Ld %d@%Ld)", +- req->wb_inode->i_sb->s_id, +- (long long)NFS_FILEID(req->wb_inode), ++ req->wb_context->dentry->d_inode->i_sb->s_id, ++ (long long)NFS_FILEID(req->wb_context->dentry->d_inode), + req->wb_bytes, + (long long)req_offset(req)); + if (task->tk_status < 0) { +- if (req->wb_file) +- req->wb_file->f_error = task->tk_status; ++ req->wb_context->error = task->tk_status; + nfs_inode_remove_request(req); + dprintk(", error = %d\n", task->tk_status); + goto next; +--- linux-2.6.7/fs/nfs/nfs4xdr.c.lsec 2004-06-15 23:20:26.000000000 -0600 ++++ linux-2.6.7/fs/nfs/nfs4xdr.c 2005-03-23 14:28:23.056544568 -0700 +@@ -84,9 +84,13 @@ static int nfs_stat_to_errno(int); + ((3+NFS4_FHSIZE) >> 2)) + #define encode_getattr_maxsz (op_encode_hdr_maxsz + 3) + #define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2)) ++#define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2)) + 
#define nfs4_fattr_bitmap_maxsz (36 + 2 * nfs4_name_maxsz) + #define decode_getattr_maxsz (op_decode_hdr_maxsz + 3 + \ + nfs4_fattr_bitmap_maxsz) ++#define encode_setattr_maxsz (op_decode_hdr_maxsz + 4 + \ ++ nfs4_fattr_bitmap_maxsz) ++#define decode_setattr_maxsz (op_decode_hdr_maxsz + 3) + #define encode_savefh_maxsz (op_encode_hdr_maxsz) + #define decode_savefh_maxsz (op_decode_hdr_maxsz) + #define encode_fsinfo_maxsz (op_encode_hdr_maxsz + 2) +@@ -118,10 +122,17 @@ static int nfs_stat_to_errno(int); + #define encode_link_maxsz (op_encode_hdr_maxsz + \ + nfs4_name_maxsz) + #define decode_link_maxsz (op_decode_hdr_maxsz + 5) ++#define encode_symlink_maxsz (op_encode_hdr_maxsz + \ ++ 1 + nfs4_name_maxsz + \ ++ nfs4_path_maxsz + \ ++ nfs4_fattr_bitmap_maxsz) ++#define decode_symlink_maxsz (op_decode_hdr_maxsz + 8) + #define encode_create_maxsz (op_encode_hdr_maxsz + \ +- 2 + 2 * nfs4_name_maxsz + \ ++ 2 + nfs4_name_maxsz + \ + nfs4_fattr_bitmap_maxsz) + #define decode_create_maxsz (op_decode_hdr_maxsz + 8) ++#define encode_delegreturn_maxsz (op_encode_hdr_maxsz + 4) ++#define decode_delegreturn_maxsz (op_decode_hdr_maxsz) + #define NFS4_enc_compound_sz (1024) /* XXX: large enough? */ + #define NFS4_dec_compound_sz (1024) /* XXX: large enough? 
*/ + #define NFS4_enc_read_sz (compound_encode_hdr_maxsz + \ +@@ -172,16 +183,14 @@ static int nfs_stat_to_errno(int); + #define NFS4_dec_open_confirm_sz (compound_decode_hdr_maxsz + \ + decode_putfh_maxsz + \ + op_decode_hdr_maxsz + 4) +-#define NFS4_enc_open_reclaim_sz (compound_encode_hdr_maxsz + \ ++#define NFS4_enc_open_noattr_sz (compound_encode_hdr_maxsz + \ + encode_putfh_maxsz + \ + op_encode_hdr_maxsz + \ +- 11 + \ +- encode_getattr_maxsz) +-#define NFS4_dec_open_reclaim_sz (compound_decode_hdr_maxsz + \ ++ 11) ++#define NFS4_dec_open_noattr_sz (compound_decode_hdr_maxsz + \ + decode_putfh_maxsz + \ + op_decode_hdr_maxsz + \ +- 4 + 5 + 2 + 3 + \ +- decode_getattr_maxsz) ++ 4 + 5 + 2 + 3) + #define NFS4_enc_open_downgrade_sz \ + (compound_encode_hdr_maxsz + \ + encode_putfh_maxsz + \ +@@ -313,6 +322,16 @@ static int nfs_stat_to_errno(int); + decode_savefh_maxsz + \ + decode_putfh_maxsz + \ + decode_link_maxsz) ++#define NFS4_enc_symlink_sz (compound_encode_hdr_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_symlink_maxsz + \ ++ encode_getattr_maxsz + \ ++ encode_getfh_maxsz) ++#define NFS4_dec_symlink_sz (compound_decode_hdr_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_symlink_maxsz + \ ++ decode_getattr_maxsz + \ ++ decode_getfh_maxsz) + #define NFS4_enc_create_sz (compound_encode_hdr_maxsz + \ + encode_putfh_maxsz + \ + encode_create_maxsz + \ +@@ -339,6 +358,33 @@ static int nfs_stat_to_errno(int); + encode_getattr_maxsz) + #define NFS4_dec_server_caps_sz (compound_decode_hdr_maxsz + \ + decode_getattr_maxsz) ++#define NFS4_enc_delegreturn_sz (compound_encode_hdr_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_delegreturn_maxsz) ++#define NFS4_dec_delegreturn_sz (compound_decode_hdr_maxsz + \ ++ decode_delegreturn_maxsz) ++#define username_maxsz (1 + ((IDMAP_NAMESZ + 3) >> 2)) ++/* XXX: fix ACL bounds */ ++#define ace_maxsz (3 + username_maxsz) ++#define NFS_ACL_MAX_ENTRIES 32 ++#define acl_maxentries ((NFS_ACL_MAX_ENTRIES - 3) * 3 + 6) ++#define acl_maxsz 
(1 + acl_maxentries * ace_maxsz) ++#define NFS4_enc_getacl_sz compound_encode_hdr_maxsz + \ ++ encode_putfh_maxsz + \ ++ encode_getattr_maxsz ++#define username_maxsz (1 + ((IDMAP_NAMESZ + 3) >> 2)) ++#define ace_maxsz (3 + username_maxsz) ++#define acl_maxentries ((NFS_ACL_MAX_ENTRIES - 3) * 3 + 6) ++#define acl_maxsz (1 + acl_maxentries * ace_maxsz) ++#define NFS4_dec_getacl_sz (compound_decode_hdr_maxsz + \ ++ decode_putfh_maxsz + \ ++ op_decode_hdr_maxsz + 3 + 1 + acl_maxsz) ++#define NFS4_enc_setacl_sz (compound_encode_hdr_maxsz + \ ++ encode_putfh_maxsz + \ ++ op_encode_hdr_maxsz + 4 + 1 + acl_maxsz) ++#define NFS4_dec_setacl_sz (compound_decode_hdr_maxsz + \ ++ decode_putfh_maxsz + \ ++ decode_setattr_maxsz) + + static struct { + unsigned int mode; +@@ -388,6 +434,15 @@ struct compound_hdr { + BUG_ON(!p); \ + } while (0) + ++static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) ++{ ++ uint32_t *p; ++ ++ p = xdr_reserve_space(xdr, 4 + len); ++ BUG_ON(p == NULL); ++ xdr_encode_opaque(p, str, len); ++} ++ + static int encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) + { + uint32_t *p; +@@ -402,6 +457,15 @@ static int encode_compound_hdr(struct xd + return 0; + } + ++static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf) ++{ ++ uint32_t *p; ++ ++ p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); ++ BUG_ON(p == NULL); ++ xdr_encode_opaque_fixed(p, verf->data, NFS4_VERIFIER_SIZE); ++} ++ + static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server) + { + char owner_name[IDMAP_NAMESZ]; +@@ -420,7 +484,7 @@ static int encode_attrs(struct xdr_strea + * In the worst-case, this would be + * 12(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime) + * = 36 bytes, plus any contribution from variable-length fields +- * such as owner/group/acl's. ++ * such as owner/group. 
+ */ + len = 16; + +@@ -742,19 +806,12 @@ static int encode_lookup(struct xdr_stre + return 0; + } + +-static int encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg) ++static void encode_share_access(struct xdr_stream *xdr, int open_flags) + { +- int status; + uint32_t *p; + +- /* +- * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4, +- * owner 4, opentype 4 = 36 +- */ +- RESERVE_SPACE(36); +- WRITE32(OP_OPEN); +- WRITE32(arg->seqid); +- switch (arg->share_access) { ++ RESERVE_SPACE(8); ++ switch (open_flags & (FMODE_READ|FMODE_WRITE)) { + case FMODE_READ: + WRITE32(NFS4_SHARE_ACCESS_READ); + break; +@@ -767,84 +824,135 @@ static int encode_open(struct xdr_stream + default: + BUG(); + } +- WRITE32(0); /* for linux, share_deny = 0 always */ ++ WRITE32(0); /* for linux, share_deny = 0 always */ ++} ++ ++static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg) ++{ ++ uint32_t *p; ++ /* ++ * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4, ++ * owner 4 = 32 ++ */ ++ RESERVE_SPACE(8); ++ WRITE32(OP_OPEN); ++ WRITE32(arg->seqid); ++ encode_share_access(xdr, arg->open_flags); ++ RESERVE_SPACE(16); + WRITE64(arg->clientid); + WRITE32(4); + WRITE32(arg->id); +- WRITE32(arg->opentype); ++} + +- if (arg->opentype == NFS4_OPEN_CREATE) { +- if (arg->createmode == NFS4_CREATE_EXCLUSIVE) { +- RESERVE_SPACE(12); +- WRITE32(arg->createmode); +- WRITEMEM(arg->u.verifier.data, sizeof(arg->u.verifier.data)); +- } +- else if (arg->u.attrs) { +- RESERVE_SPACE(4); +- WRITE32(arg->createmode); +- if ((status = encode_attrs(xdr, arg->u.attrs, arg->server))) +- return status; +- } +- else { +- RESERVE_SPACE(12); +- WRITE32(arg->createmode); +- WRITE32(0); +- WRITE32(0); +- } ++static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg) ++{ ++ uint32_t *p; ++ ++ RESERVE_SPACE(4); ++ switch(arg->open_flags & O_EXCL) { ++ case 0: ++ WRITE32(NFS4_CREATE_UNCHECKED); 
++ encode_attrs(xdr, arg->u.attrs, arg->server); ++ break; ++ default: ++ WRITE32(NFS4_CREATE_EXCLUSIVE); ++ encode_nfs4_verifier(xdr, &arg->u.verifier); + } ++} + +- RESERVE_SPACE(8 + arg->name->len); +- WRITE32(NFS4_OPEN_CLAIM_NULL); +- WRITE32(arg->name->len); +- WRITEMEM(arg->name->name, arg->name->len); ++static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *arg) ++{ ++ uint32_t *p; + +- return 0; ++ RESERVE_SPACE(4); ++ switch (arg->open_flags & O_CREAT) { ++ case 0: ++ WRITE32(NFS4_OPEN_NOCREATE); ++ break; ++ default: ++ BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL); ++ WRITE32(NFS4_OPEN_CREATE); ++ encode_createmode(xdr, arg); ++ } + } + +-static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg) ++static inline void encode_delegation_type(struct xdr_stream *xdr, int delegation_type) + { + uint32_t *p; + +- RESERVE_SPACE(8+sizeof(arg->stateid.data)); +- WRITE32(OP_OPEN_CONFIRM); +- WRITEMEM(arg->stateid.data, sizeof(arg->stateid.data)); +- WRITE32(arg->seqid); ++ RESERVE_SPACE(4); ++ switch (delegation_type) { ++ case 0: ++ WRITE32(NFS4_OPEN_DELEGATE_NONE); ++ break; ++ case FMODE_READ: ++ WRITE32(NFS4_OPEN_DELEGATE_READ); ++ break; ++ case FMODE_WRITE|FMODE_READ: ++ WRITE32(NFS4_OPEN_DELEGATE_WRITE); ++ break; ++ default: ++ BUG(); ++ } ++} + +- return 0; ++static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *name) ++{ ++ uint32_t *p; ++ ++ RESERVE_SPACE(4); ++ WRITE32(NFS4_OPEN_CLAIM_NULL); ++ encode_string(xdr, name->len, name->name); + } + ++static inline void encode_claim_previous(struct xdr_stream *xdr, int type) ++{ ++ uint32_t *p; ++ ++ RESERVE_SPACE(4); ++ WRITE32(NFS4_OPEN_CLAIM_PREVIOUS); ++ encode_delegation_type(xdr, type); ++} + +-static int encode_open_reclaim(struct xdr_stream *xdr, const struct nfs_open_reclaimargs *arg) ++static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struct qstr *name, const nfs4_stateid *stateid) + { + 
uint32_t *p; + +- /* +- * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4, +- * owner 4, opentype 4, claim 4, delegation_type 4 = 44 +- */ +- RESERVE_SPACE(44); +- WRITE32(OP_OPEN); +- WRITE32(arg->seqid); +- switch (arg->share_access) { +- case FMODE_READ: +- WRITE32(NFS4_SHARE_ACCESS_READ); ++ RESERVE_SPACE(4+sizeof(stateid->data)); ++ WRITE32(NFS4_OPEN_CLAIM_DELEGATE_CUR); ++ WRITEMEM(stateid->data, sizeof(stateid->data)); ++ encode_string(xdr, name->len, name->name); ++} ++ ++static int encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg) ++{ ++ encode_openhdr(xdr, arg); ++ encode_opentype(xdr, arg); ++ switch (arg->claim) { ++ case NFS4_OPEN_CLAIM_NULL: ++ encode_claim_null(xdr, arg->name); + break; +- case FMODE_WRITE: +- WRITE32(NFS4_SHARE_ACCESS_WRITE); ++ case NFS4_OPEN_CLAIM_PREVIOUS: ++ encode_claim_previous(xdr, arg->u.delegation_type); + break; +- case FMODE_READ|FMODE_WRITE: +- WRITE32(NFS4_SHARE_ACCESS_BOTH); ++ case NFS4_OPEN_CLAIM_DELEGATE_CUR: ++ encode_claim_delegate_cur(xdr, arg->name, &arg->u.delegation); + break; + default: + BUG(); + } +- WRITE32(0); /* for linux, share_deny = 0 always */ +- WRITE64(arg->clientid); +- WRITE32(4); +- WRITE32(arg->id); +- WRITE32(NFS4_OPEN_NOCREATE); +- WRITE32(NFS4_OPEN_CLAIM_PREVIOUS); +- WRITE32(NFS4_OPEN_DELEGATE_NONE); ++ return 0; ++} ++ ++static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg) ++{ ++ uint32_t *p; ++ ++ RESERVE_SPACE(8+sizeof(arg->stateid.data)); ++ WRITE32(OP_OPEN_CONFIRM); ++ WRITEMEM(arg->stateid.data, sizeof(arg->stateid.data)); ++ WRITE32(arg->seqid); ++ + return 0; + } + +@@ -852,14 +960,11 @@ static int encode_open_downgrade(struct + { + uint32_t *p; + +- RESERVE_SPACE(16+sizeof(arg->stateid.data)); ++ RESERVE_SPACE(8+sizeof(arg->stateid.data)); + WRITE32(OP_OPEN_DOWNGRADE); + WRITEMEM(arg->stateid.data, sizeof(arg->stateid.data)); + WRITE32(arg->seqid); +- WRITE32(arg->share_access); +- /* No deny modes 
*/ +- WRITE32(0); +- ++ encode_share_access(xdr, arg->open_flags); + return 0; + } + +@@ -887,15 +992,15 @@ static int encode_putrootfh(struct xdr_s + return 0; + } + +-static void encode_stateid(struct xdr_stream *xdr, struct nfs4_state *state, fl_owner_t lockowner) ++static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx) + { + extern nfs4_stateid zero_stateid; + nfs4_stateid stateid; + uint32_t *p; + + RESERVE_SPACE(16); +- if (state != NULL) { +- nfs4_copy_stateid(&stateid, state, lockowner); ++ if (ctx->state != NULL) { ++ nfs4_copy_stateid(&stateid, ctx->state, ctx->pid); + WRITEMEM(stateid.data, sizeof(stateid.data)); + } else + WRITEMEM(zero_stateid.data, sizeof(zero_stateid.data)); +@@ -908,7 +1013,7 @@ static int encode_read(struct xdr_stream + RESERVE_SPACE(4); + WRITE32(OP_READ); + +- encode_stateid(xdr, args->state, args->lockowner); ++ encode_stateid(xdr, args->context); + + RESERVE_SPACE(12); + WRITE64(args->offset); +@@ -1003,6 +1108,45 @@ static int encode_renew(struct xdr_strea + return 0; + } + ++extern nfs4_stateid zero_stateid; ++ ++static int ++encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg) ++{ ++ uint32_t *p; ++ uint32_t *q = (uint32_t *)arg->acl; ++ uint32_t *end = (uint32_t *)(arg->acl + arg->acl_len); ++ uint32_t tmp; ++ int naces, i; ++ ++ RESERVE_SPACE(4+sizeof(zero_stateid.data)); ++ WRITE32(OP_SETATTR); ++ WRITEMEM(zero_stateid.data, sizeof(zero_stateid.data)); ++ RESERVE_SPACE(4*4); ++ WRITE32(1); ++ WRITE32(FATTR4_WORD0_ACL); ++ WRITE32(arg->acl_len); ++ if (q + 1 > end) ++ return -EINVAL; ++ naces = ntohl(*q++); ++ WRITE32(naces); ++ for (i = 0; i < naces; i++) { ++ if (q + 4 > end) ++ return -EINVAL; ++ RESERVE_SPACE(3*4); ++ memcpy(p, q, 3*4); /* type, flag, access_mask, length */ ++ q += 3; ++ tmp = ntohl(*q++); /* length */ ++ if (tmp > XDR_MAX_NETOBJ) ++ return -EINVAL; ++ if (q + XDR_QUADLEN(tmp) > end) ++ return -EINVAL; ++ RESERVE_SPACE((XDR_QUADLEN(tmp) << 2) + 4); ++ p = 
xdr_encode_opaque(p, q, tmp); ++ } ++ return 0; ++} ++ + static int + encode_savefh(struct xdr_stream *xdr) + { +@@ -1031,26 +1175,18 @@ static int encode_setattr(struct xdr_str + + static int encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid) + { +- uint32_t total_len; +- uint32_t len1, len2, len3; + uint32_t *p; + +- len1 = strlen(setclientid->sc_name); +- len2 = strlen(setclientid->sc_netid); +- len3 = strlen(setclientid->sc_uaddr); +- total_len = XDR_QUADLEN(len1) + XDR_QUADLEN(len2) + XDR_QUADLEN(len3); +- total_len = (total_len << 2) + 24 + sizeof(setclientid->sc_verifier.data); +- +- RESERVE_SPACE(total_len); ++ RESERVE_SPACE(4 + sizeof(setclientid->sc_verifier->data)); + WRITE32(OP_SETCLIENTID); +- WRITEMEM(setclientid->sc_verifier.data, sizeof(setclientid->sc_verifier.data)); +- WRITE32(len1); +- WRITEMEM(setclientid->sc_name, len1); ++ WRITEMEM(setclientid->sc_verifier->data, sizeof(setclientid->sc_verifier->data)); ++ ++ encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name); ++ RESERVE_SPACE(4); + WRITE32(setclientid->sc_prog); +- WRITE32(len2); +- WRITEMEM(setclientid->sc_netid, len2); +- WRITE32(len3); +- WRITEMEM(setclientid->sc_uaddr, len3); ++ encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid); ++ encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr); ++ RESERVE_SPACE(4); + WRITE32(setclientid->sc_cb_ident); + + return 0; +@@ -1075,7 +1211,7 @@ static int encode_write(struct xdr_strea + RESERVE_SPACE(4); + WRITE32(OP_WRITE); + +- encode_stateid(xdr, args->state, args->lockowner); ++ encode_stateid(xdr, args->context); + + RESERVE_SPACE(16); + WRITE64(args->offset); +@@ -1086,6 +1222,18 @@ static int encode_write(struct xdr_strea + + return 0; + } ++ ++static int encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid) ++{ ++ uint32_t *p; ++ ++ RESERVE_SPACE(20); ++ ++ WRITE32(OP_DELEGRETURN); ++ WRITEMEM(stateid->data, sizeof(stateid->data)); ++ return 
0; ++ ++} + /* + * END OF "GENERIC" ENCODE ROUTINES. + */ +@@ -1244,6 +1392,14 @@ out: + } + + /* ++ * Encode SYMLINK request ++ */ ++static int nfs4_xdr_enc_symlink(struct rpc_rqst *req, uint32_t *p, const struct nfs4_create_arg *args) ++{ ++ return nfs4_xdr_enc_create(req, p, args); ++} ++ ++/* + * Encode GETATTR request + */ + static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, uint32_t *p, const struct nfs4_getattr_arg *args) +@@ -1331,13 +1487,13 @@ out: + } + + /* +- * Encode an OPEN request ++ * Encode an OPEN request with no attributes. + */ +-static int nfs4_xdr_enc_open_reclaim(struct rpc_rqst *req, uint32_t *p, struct nfs_open_reclaimargs *args) ++static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, uint32_t *p, struct nfs_openargs *args) + { + struct xdr_stream xdr; + struct compound_hdr hdr = { +- .nops = 3, ++ .nops = 2, + }; + int status; + +@@ -1346,10 +1502,7 @@ static int nfs4_xdr_enc_open_reclaim(str + status = encode_putfh(&xdr, args->fh); + if (status) + goto out; +- status = encode_open_reclaim(&xdr, args); +- if (status) +- goto out; +- status = encode_getfattr(&xdr, args->bitmask); ++ status = encode_open(&xdr, args); + out: + return status; + } +@@ -1538,6 +1691,52 @@ out: + } + + /* ++ * Encode an SETACL request ++ */ ++static int ++nfs4_xdr_enc_setacl(struct rpc_rqst *req, uint32_t *p, struct nfs_setaclargs *args) ++ ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .nops = 2, ++ }; ++ int status; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, &hdr); ++ status = encode_putfh(&xdr, args->fh); ++ if(status) ++ goto out; ++ status = encode_setacl(&xdr, args); ++out: ++ return status; ++} ++ ++/* ++ * Encode a GETACL request ++ */ ++static int ++nfs4_xdr_enc_getacl(struct rpc_rqst *req, uint32_t *p,struct nfs_fh *fhandle) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .nops = 2, ++ }; ++ int status; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ 
encode_compound_hdr(&xdr, &hdr); ++ status = encode_putfh(&xdr, fhandle); ++ if (status) ++ goto out; ++ status = encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0); ++out: ++ return status; ++ ++} ++ ++/* + * Encode a WRITE request + */ + static int nfs4_xdr_enc_write(struct rpc_rqst *req, uint32_t *p, struct nfs_writeargs *args) +@@ -1716,6 +1915,24 @@ static int nfs4_xdr_enc_setclientid_conf + } + + /* ++ * DELEGRETURN request ++ */ ++static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, uint32_t *p, const struct nfs4_delegreturnargs *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .nops = 2, ++ }; ++ int status; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, &hdr); ++ if ((status = encode_putfh(&xdr, args->fhandle)) == 0) ++ status = encode_delegreturn(&xdr, args->stateid); ++ return status; ++} ++ ++/* + * START OF "GENERIC" DECODE ROUTINES. + * These may look a little ugly since they are imported from a "generic" + * set of XDR encode/decode routines which are intended to be shared by +@@ -1749,6 +1966,17 @@ static int nfs4_xdr_enc_setclientid_conf + } \ + } while (0) + ++static int decode_opaque_inline(struct xdr_stream *xdr, uint32_t *len, char **string) ++{ ++ uint32_t *p; ++ ++ READ_BUF(4); ++ READ32(*len); ++ READ_BUF(*len); ++ *string = (char *)p; ++ return 0; ++} ++ + static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) + { + uint32_t *p; +@@ -1785,6 +2013,17 @@ static int decode_op_hdr(struct xdr_stre + return 0; + } + ++/* Dummy routine */ ++static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs4_client *clp) ++{ ++ uint32_t *p; ++ uint32_t strlen; ++ char *str; ++ ++ READ_BUF(12); ++ return decode_opaque_inline(xdr, &strlen, &str); ++} ++ + static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) + { + uint32_t bmlen, *p; +@@ -2717,10 +2956,56 @@ static int decode_lookup(struct xdr_stre + return decode_op_hdr(xdr, OP_LOOKUP); + } + ++/* This 
is too sick! */ ++static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize) ++{ ++ uint32_t *p; ++ uint32_t limit_type, nblocks, blocksize; ++ ++ READ_BUF(12); ++ READ32(limit_type); ++ switch (limit_type) { ++ case 1: ++ READ64(*maxsize); ++ break; ++ case 2: ++ READ32(nblocks); ++ READ32(blocksize); ++ *maxsize = (uint64_t)nblocks * (uint64_t)blocksize; ++ } ++ return 0; ++} ++ ++static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) ++{ ++ uint32_t *p; ++ uint32_t delegation_type; ++ ++ READ_BUF(4); ++ READ32(delegation_type); ++ if (delegation_type == NFS4_OPEN_DELEGATE_NONE) { ++ res->delegation_type = 0; ++ return 0; ++ } ++ READ_BUF(20); ++ COPYMEM(res->delegation.data, sizeof(res->delegation.data)); ++ READ32(res->do_recall); ++ switch (delegation_type) { ++ case NFS4_OPEN_DELEGATE_READ: ++ res->delegation_type = FMODE_READ; ++ break; ++ case NFS4_OPEN_DELEGATE_WRITE: ++ res->delegation_type = FMODE_WRITE|FMODE_READ; ++ if (decode_space_limit(xdr, &res->maxsize) < 0) ++ return -EIO; ++ } ++ return decode_ace(xdr, NULL, res->server->nfs4_state); ++} ++ + static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) + { + uint32_t *p; +- uint32_t bmlen, delegation_type; ++ uint32_t bmlen; + int status; + + status = decode_op_hdr(xdr, OP_OPEN); +@@ -2737,11 +3022,9 @@ static int decode_open(struct xdr_stream + if (bmlen > 10) + goto xdr_error; + +- READ_BUF((bmlen << 2) + 4); ++ READ_BUF(bmlen << 2); + p += bmlen; +- READ32(delegation_type); +- if (delegation_type == NFS4_OPEN_DELEGATE_NONE) +- return 0; ++ return decode_delegation(xdr, res); + xdr_error: + printk(KERN_NOTICE "%s: xdr error!\n", __FUNCTION__); + return -EIO; +@@ -2967,6 +3250,72 @@ static int decode_renew(struct xdr_strea + return decode_op_hdr(xdr, OP_RENEW); + } + ++static int decode_attr_acl(struct xdr_stream *xdr, uint32_t *bitmap, ++ struct nfs_getaclres *res) ++{ ++ uint32_t *p; ++ ++ if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U))) ++ return 
-EIO; ++ if (likely(bitmap[0] & FATTR4_WORD0_ACL)) { ++ ssize_t size = res->acl_len; ++ uint32_t nace, tmp; ++ u32 *start; ++ int i; ++ ++ res->acl_len = 0; ++ READ_BUF(4); ++ start = p; ++ READ32(nace); ++ res->acl_len += 4; ++ ++ for (i = 0; i < nace; i++) { ++ READ_BUF(4*4); ++ res->acl_len += 4*4; ++ p += 3; ++ READ32(tmp); /* namelen */ ++ READ_BUF(tmp); ++ if (tmp > XDR_MAX_NETOBJ) { ++ printk(KERN_WARNING "%s: name too long (%u)!\n", ++ __FUNCTION__, tmp); ++ return -EIO; ++ } ++ res->acl_len += XDR_QUADLEN(tmp) << 2; ++ } ++ if (size && res->acl_len > size) ++ return -ERANGE; ++ if (size == 0 && res->acl_len <= XATTR_SIZE_MAX) ++ res->acl = kmalloc(res->acl_len, GFP_KERNEL); ++ if (res->acl) ++ memcpy(res->acl, start, res->acl_len); ++ } ++ return 0; ++} ++ ++static int decode_getacl(struct xdr_stream *xdr, struct nfs_getaclres *res) ++{ ++ uint32_t *savep; ++ uint32_t attrlen, ++ bitmap[2] = {0}; ++ int status; ++ ++ if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) ++ goto xdr_error; ++ if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) ++ goto xdr_error; ++ if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) ++ goto xdr_error; ++ ++ if ((status = decode_attr_acl(xdr, bitmap, res)) != 0) ++ goto xdr_error; ++ ++ status = verify_attr_len(xdr, savep, attrlen); ++xdr_error: ++ if (status != 0) ++ printk(KERN_NOTICE "%s: xdr error %d!\n", __FUNCTION__, -status); ++ return status; ++} ++ + static int + decode_savefh(struct xdr_stream *xdr) + { +@@ -3048,6 +3397,11 @@ static int decode_write(struct xdr_strea + return 0; + } + ++static int decode_delegreturn(struct xdr_stream *xdr) ++{ ++ return decode_op_hdr(xdr, OP_DELEGRETURN); ++} ++ + /* + * Decode OPEN_DOWNGRADE response + */ +@@ -3222,6 +3576,14 @@ out: + } + + /* ++ * Decode SYMLINK response ++ */ ++static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_create_res *res) ++{ ++ return nfs4_xdr_dec_create(rqstp, p, res); ++} ++ ++/* + * Decode GETATTR response + */ 
+ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_getattr_res *res) +@@ -3243,6 +3605,50 @@ out: + + } + ++/* ++ * Decode SETACL response ++ */ ++static int ++nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, uint32_t *p, void *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_setattr(&xdr, res); ++out: ++ return status; ++} ++ ++/* ++ * Decode GETACL response ++ */ ++static int ++nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_getaclres *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_getacl(&xdr, res); ++ ++out: ++ return status; ++} + + /* + * Decode CLOSE response +@@ -3314,9 +3720,9 @@ out: + } + + /* +- * Decode OPEN_RECLAIM response ++ * Decode OPEN response + */ +-static int nfs4_xdr_dec_open_reclaim(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_openres *res) ++static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_openres *res) + { + struct xdr_stream xdr; + struct compound_hdr hdr; +@@ -3330,9 +3736,6 @@ static int nfs4_xdr_dec_open_reclaim(str + if (status) + goto out; + status = decode_open(&xdr, res); +- if (status) +- goto out; +- status = decode_getfattr(&xdr, res->f_attr, res->server); + out: + return status; + } +@@ -3665,6 +4068,25 @@ static int nfs4_xdr_dec_setclientid_conf + return status; + } + ++/* ++ * DELEGRETURN request ++ */ ++static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, uint32_t *p, void *dummy) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ 
xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status == 0) { ++ status = decode_putfh(&xdr); ++ if (status == 0) ++ status = decode_delegreturn(&xdr); ++ } ++ return status; ++} ++ + uint32_t *nfs4_decode_dirent(uint32_t *p, struct nfs_entry *entry, int plus) + { + uint32_t len; +@@ -3756,7 +4178,7 @@ nfs_stat_to_errno(int stat) + if (nfs_errtbl[i].stat == stat) + return nfs_errtbl[i].errno; + } +- if (stat < 0) { ++ if (stat <= 10000 || stat > 10100) { + /* The server is looney tunes. */ + return ESERVERFAULT; + } +@@ -3786,7 +4208,7 @@ struct rpc_procinfo nfs4_procedures[] = + PROC(COMMIT, enc_commit, dec_commit), + PROC(OPEN, enc_open, dec_open), + PROC(OPEN_CONFIRM, enc_open_confirm, dec_open_confirm), +- PROC(OPEN_RECLAIM, enc_open_reclaim, dec_open_reclaim), ++ PROC(OPEN_NOATTR, enc_open_noattr, dec_open_noattr), + PROC(OPEN_DOWNGRADE, enc_open_downgrade, dec_open_downgrade), + PROC(CLOSE, enc_close, dec_close), + PROC(SETATTR, enc_setattr, dec_setattr), +@@ -3804,12 +4226,16 @@ struct rpc_procinfo nfs4_procedures[] = + PROC(REMOVE, enc_remove, dec_remove), + PROC(RENAME, enc_rename, dec_rename), + PROC(LINK, enc_link, dec_link), ++ PROC(SYMLINK, enc_symlink, dec_symlink), + PROC(CREATE, enc_create, dec_create), + PROC(PATHCONF, enc_pathconf, dec_pathconf), + PROC(STATFS, enc_statfs, dec_statfs), + PROC(READLINK, enc_readlink, dec_readlink), + PROC(READDIR, enc_readdir, dec_readdir), + PROC(SERVER_CAPS, enc_server_caps, dec_server_caps), ++ PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn), ++ PROC(GETACL, enc_getacl, dec_getacl), ++ PROC(SETACL, enc_setacl, dec_setacl), + }; + + struct rpc_version nfs_version4 = { +--- linux-2.6.7/fs/nfs/pagelist.c.lsec 2004-06-15 23:20:03.000000000 -0600 ++++ linux-2.6.7/fs/nfs/pagelist.c 2005-03-23 14:28:23.057544416 -0700 +@@ -36,7 +36,6 @@ nfs_page_alloc(void) + if (p) { + memset(p, 0, sizeof(*p)); + INIT_LIST_HEAD(&p->wb_list); +- init_waitqueue_head(&p->wb_wait); + 
} + return p; + } +@@ -62,7 +61,7 @@ nfs_page_free(struct nfs_page *p) + * User should ensure it is safe to sleep in this function. + */ + struct nfs_page * +-nfs_create_request(struct file *file, struct inode *inode, ++nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, + struct page *page, + unsigned int offset, unsigned int count) + { +@@ -94,33 +93,38 @@ nfs_create_request(struct file *file, st + req->wb_offset = offset; + req->wb_pgbase = offset; + req->wb_bytes = count; +- req->wb_inode = inode; + req->wb_count = 1; +- server->rpc_ops->request_init(req, file); ++ req->wb_context = get_nfs_open_context(ctx); + + return req; + } + + /** ++ * nfs_unlock_request - Unlock request and wake up sleepers. ++ * @req: ++ */ ++void nfs_unlock_request(struct nfs_page *req) ++{ ++ if (!NFS_WBACK_BUSY(req)) { ++ printk(KERN_ERR "NFS: Invalid unlock attempted\n"); ++ BUG(); ++ } ++ smp_mb__before_clear_bit(); ++ clear_bit(PG_BUSY, &req->wb_flags); ++ smp_mb__after_clear_bit(); ++ wake_up_all(&req->wb_context->waitq); ++ nfs_release_request(req); ++} ++ ++/** + * nfs_clear_request - Free up all resources allocated to the request + * @req: + * +- * Release all resources associated with a write request after it ++ * Release page resources associated with a write request after it + * has completed. 
+ */ + void nfs_clear_request(struct nfs_page *req) + { +- if (req->wb_state) +- req->wb_state = NULL; +- /* Release struct file or cached credential */ +- if (req->wb_file) { +- fput(req->wb_file); +- req->wb_file = NULL; +- } +- if (req->wb_cred) { +- put_rpccred(req->wb_cred); +- req->wb_cred = NULL; +- } + if (req->wb_page) { + page_cache_release(req->wb_page); + req->wb_page = NULL; +@@ -151,6 +155,7 @@ nfs_release_request(struct nfs_page *req + + /* Release struct file or cached credential */ + nfs_clear_request(req); ++ put_nfs_open_context(req->wb_context); + nfs_page_free(req); + } + +@@ -194,12 +199,12 @@ nfs_list_add_request(struct nfs_page *re + int + nfs_wait_on_request(struct nfs_page *req) + { +- struct inode *inode = req->wb_inode; ++ struct inode *inode = req->wb_context->dentry->d_inode; + struct rpc_clnt *clnt = NFS_CLIENT(inode); + + if (!NFS_WBACK_BUSY(req)) + return 0; +- return nfs_wait_event(clnt, req->wb_wait, !NFS_WBACK_BUSY(req)); ++ return nfs_wait_event(clnt, req->wb_context->waitq, !NFS_WBACK_BUSY(req)); + } + + /** +@@ -224,7 +229,11 @@ nfs_coalesce_requests(struct list_head * + + req = nfs_list_entry(head->next); + if (prev) { +- if (req->wb_cred != prev->wb_cred) ++ if (req->wb_context->cred != prev->wb_context->cred) ++ break; ++ if (req->wb_context->pid != prev->wb_context->pid) ++ break; ++ if (req->wb_context->state != prev->wb_context->state) + break; + if (req->wb_index != (prev->wb_index + 1)) + break; +--- linux-2.6.7/fs/nfs/nfs4proc.c.lsec 2004-06-15 23:19:44.000000000 -0600 ++++ linux-2.6.7/fs/nfs/nfs4proc.c 2005-03-23 14:32:35.532162440 -0700 +@@ -47,12 +47,16 @@ + #include + #include + ++#include "delegation.h" ++ + #define NFSDBG_FACILITY NFSDBG_PROC + +-#define NFS4_POLL_RETRY_TIME (15*HZ) ++#define NFS4_POLL_RETRY_MIN (1*HZ) ++#define NFS4_POLL_RETRY_MAX (15*HZ) + + static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); + static int nfs4_async_handle_error(struct rpc_task *, struct 
nfs_server *); ++static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry); + extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus); + extern struct rpc_procinfo nfs4_procedures[]; + +@@ -189,53 +193,296 @@ static void update_changeattr(struct ino + * reclaim state on the server after a reboot. + * Assumes caller is holding the sp->so_sem + */ +-int +-nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state) ++static int _nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state) + { + struct inode *inode = state->inode; + struct nfs_server *server = NFS_SERVER(inode); +- struct nfs_fattr fattr = { +- .valid = 0, +- }; +- struct nfs_open_reclaimargs o_arg = { ++ struct nfs_delegation *delegation = NFS_I(inode)->delegation; ++ struct nfs_openargs o_arg = { + .fh = NFS_FH(inode), + .seqid = sp->so_seqid, + .id = sp->so_id, +- .share_access = state->state, ++ .open_flags = state->state, + .clientid = server->nfs4_state->cl_clientid, + .claim = NFS4_OPEN_CLAIM_PREVIOUS, + .bitmask = server->attr_bitmask, + }; + struct nfs_openres o_res = { +- .f_attr = &fattr, + .server = server, /* Grrr */ + }; + struct rpc_message msg = { +- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_RECLAIM], ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR], + .rpc_argp = &o_arg, + .rpc_resp = &o_res, + .rpc_cred = sp->so_cred, + }; + int status; + ++ if (delegation != NULL) { ++ if (!(delegation->flags & NFS_DELEGATION_NEED_RECLAIM)) { ++ memcpy(&state->stateid, &delegation->stateid, ++ sizeof(state->stateid)); ++ set_bit(NFS_DELEGATED_STATE, &state->flags); ++ return 0; ++ } ++ o_arg.u.delegation_type = delegation->type; ++ } + status = rpc_call_sync(server->client, &msg, 0); + nfs4_increment_seqid(status, sp); +- if (status == 0) ++ if (status == 0) { + memcpy(&state->stateid, &o_res.stateid, sizeof(state->stateid)); +- /* Update the inode attributes */ +- nfs_refresh_inode(inode, &fattr); ++ if 
(o_res.delegation_type != 0) { ++ nfs_inode_reclaim_delegation(inode, sp->so_cred, &o_res); ++ /* Did the server issue an immediate delegation recall? */ ++ if (o_res.do_recall) ++ nfs_async_inode_return_delegation(inode, &o_res.stateid); ++ } ++ } ++ clear_bit(NFS_DELEGATED_STATE, &state->flags); ++ /* Ensure we update the inode attributes */ ++ NFS_CACHEINV(inode); + return status; + } + ++int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state) ++{ ++ struct nfs_server *server = NFS_SERVER(state->inode); ++ struct nfs4_exception exception = { }; ++ int err; ++ do { ++ err = _nfs4_open_reclaim(sp, state); ++ switch (err) { ++ case 0: ++ case -NFS4ERR_STALE_CLIENTID: ++ case -NFS4ERR_STALE_STATEID: ++ case -NFS4ERR_EXPIRED: ++ return err; ++ } ++ err = nfs4_handle_exception(server, err, &exception); ++ } while (exception.retry); ++ return err; ++} ++ ++static int _nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state) ++{ ++ struct nfs4_state_owner *sp = state->owner; ++ struct inode *inode = dentry->d_inode; ++ struct nfs_server *server = NFS_SERVER(inode); ++ struct dentry *parent = dget_parent(dentry); ++ struct nfs_openargs arg = { ++ .fh = NFS_FH(parent->d_inode), ++ .clientid = server->nfs4_state->cl_clientid, ++ .name = &dentry->d_name, ++ .id = sp->so_id, ++ .server = server, ++ .bitmask = server->attr_bitmask, ++ .claim = NFS4_OPEN_CLAIM_DELEGATE_CUR, ++ }; ++ struct nfs_openres res = { ++ .server = server, ++ }; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR], ++ .rpc_argp = &arg, ++ .rpc_resp = &res, ++ .rpc_cred = sp->so_cred, ++ }; ++ int status = 0; ++ ++ down(&sp->so_sema); ++ if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) ++ goto out; ++ if (state->state == 0) ++ goto out; ++ arg.seqid = sp->so_seqid; ++ arg.open_flags = state->state; ++ memcpy(arg.u.delegation.data, state->stateid.data, sizeof(arg.u.delegation.data)); ++ status = rpc_call_sync(server->client, 
&msg, 0); ++ nfs4_increment_seqid(status, sp); ++ if (status >= 0) { ++ memcpy(state->stateid.data, res.stateid.data, ++ sizeof(state->stateid.data)); ++ clear_bit(NFS_DELEGATED_STATE, &state->flags); ++ } ++out: ++ up(&sp->so_sema); ++ dput(parent); ++ return status; ++} ++ ++int nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state) ++{ ++ struct nfs4_exception exception = { }; ++ struct nfs_server *server = NFS_SERVER(dentry->d_inode); ++ int err; ++ do { ++ err = _nfs4_open_delegation_recall(dentry, state); ++ switch (err) { ++ case 0: ++ return err; ++ case -NFS4ERR_STALE_CLIENTID: ++ case -NFS4ERR_STALE_STATEID: ++ case -NFS4ERR_EXPIRED: ++ /* Don't recall a delegation if it was lost */ ++ nfs4_schedule_state_recovery(server->nfs4_state); ++ return err; ++ } ++ err = nfs4_handle_exception(server, err, &exception); ++ } while (exception.retry); ++ return err; ++} ++ ++static int _nfs4_proc_open_confirm(struct rpc_clnt *clnt, const struct nfs_fh *fh, struct nfs4_state_owner *sp, nfs4_stateid *stateid) ++{ ++ struct nfs_open_confirmargs arg = { ++ .fh = fh, ++ .seqid = sp->so_seqid, ++ .stateid = *stateid, ++ }; ++ struct nfs_open_confirmres res; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_CONFIRM], ++ .rpc_argp = &arg, ++ .rpc_resp = &res, ++ .rpc_cred = sp->so_cred, ++ }; ++ int status; ++ ++ status = rpc_call_sync(clnt, &msg, 0); ++ nfs4_increment_seqid(status, sp); ++ if (status >= 0) ++ memcpy(stateid, &res.stateid, sizeof(*stateid)); ++ return status; ++} ++ ++static int _nfs4_do_access(struct inode *inode, struct rpc_cred *cred, int mask) ++{ ++ struct nfs_access_entry cache; ++ int status; ++ ++ status = nfs_access_get_cached(inode, cred, &cache); ++ if (status == 0) ++ goto out; ++ ++ /* Be clever: ask server to check for all possible rights */ ++ cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ; ++ cache.cred = cred; ++ cache.jiffies = jiffies; ++ status = _nfs4_proc_access(inode, &cache); ++ if 
(status != 0) ++ return status; ++ nfs_access_add_cache(inode, &cache); ++out: ++ if ((cache.mask & mask) == mask) ++ return 0; ++ return -EACCES; ++} ++ ++/* ++ * Returns an nfs4_state + an extra reference to the inode ++ */ ++int _nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred *cred, struct nfs4_state **res) ++{ ++ struct nfs_delegation *delegation; ++ struct nfs_server *server = NFS_SERVER(inode); ++ struct nfs4_client *clp = server->nfs4_state; ++ struct nfs_inode *nfsi = NFS_I(inode); ++ struct nfs4_state_owner *sp = NULL; ++ struct nfs4_state *state = NULL; ++ int open_flags = flags & (FMODE_READ|FMODE_WRITE); ++ int mask = 0; ++ int err; ++ ++ /* Protect against reboot recovery - NOTE ORDER! */ ++ down_read(&clp->cl_sem); ++ /* Protect against delegation recall */ ++ down_read(&nfsi->rwsem); ++ delegation = NFS_I(inode)->delegation; ++ err = -ENOENT; ++ if (delegation == NULL || (delegation->type & open_flags) != open_flags) ++ goto out_err; ++ err = -ENOMEM; ++ if (!(sp = nfs4_get_state_owner(server, cred))) { ++ dprintk("%s: nfs4_get_state_owner failed!\n", __FUNCTION__); ++ goto out_err; ++ } ++ down(&sp->so_sema); ++ state = nfs4_get_open_state(inode, sp); ++ if (state == NULL) ++ goto out_err; ++ ++ err = -ENOENT; ++ if ((state->state & open_flags) == open_flags) { ++ spin_lock(&inode->i_lock); ++ if (open_flags & FMODE_READ) ++ state->nreaders++; ++ if (open_flags & FMODE_WRITE) ++ state->nwriters++; ++ spin_unlock(&inode->i_lock); ++ goto out_ok; ++ } else if (state->state != 0) ++ goto out_err; ++ ++ lock_kernel(); ++ err = _nfs4_do_access(inode, cred, mask); ++ unlock_kernel(); ++ if (err != 0) ++ goto out_err; ++ spin_lock(&inode->i_lock); ++ memcpy(state->stateid.data, delegation->stateid.data, ++ sizeof(state->stateid.data)); ++ state->state |= open_flags; ++ if (open_flags & FMODE_READ) ++ state->nreaders++; ++ if (open_flags & FMODE_WRITE) ++ state->nwriters++; ++ set_bit(NFS_DELEGATED_STATE, &state->flags); ++ 
spin_unlock(&inode->i_lock); ++out_ok: ++ up(&sp->so_sema); ++ nfs4_put_state_owner(sp); ++ up_read(&nfsi->rwsem); ++ up_read(&clp->cl_sem); ++ igrab(inode); ++ *res = state; ++ return 0; ++out_err: ++ if (sp != NULL) { ++ if (state != NULL) ++ nfs4_put_open_state(state); ++ up(&sp->so_sema); ++ nfs4_put_state_owner(sp); ++ } ++ up_read(&nfsi->rwsem); ++ up_read(&clp->cl_sem); ++ return err; ++} ++ ++static struct nfs4_state *nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred *cred) ++{ ++ struct nfs4_exception exception = { }; ++ struct nfs4_state *res; ++ int err; ++ ++ do { ++ err = _nfs4_open_delegated(inode, flags, cred, &res); ++ if (err == 0) ++ break; ++ res = ERR_PTR(nfs4_handle_exception(NFS_SERVER(inode), ++ err, &exception)); ++ } while (exception.retry); ++ return res; ++} ++ + /* + * Returns an nfs4_state + an referenced inode + */ +-struct nfs4_state * +-nfs4_do_open(struct inode *dir, struct qstr *name, int flags, struct iattr *sattr, struct rpc_cred *cred) ++static int _nfs4_do_open(struct inode *dir, struct qstr *name, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res) + { + struct nfs4_state_owner *sp; + struct nfs4_state *state = NULL; + struct nfs_server *server = NFS_SERVER(dir); ++ struct nfs4_client *clp = server->nfs4_state; + struct inode *inode = NULL; + int status; + struct nfs_fattr f_attr = { +@@ -243,12 +490,11 @@ nfs4_do_open(struct inode *dir, struct q + }; + struct nfs_openargs o_arg = { + .fh = NFS_FH(dir), +- .share_access = flags & (FMODE_READ|FMODE_WRITE), +- .opentype = (flags & O_CREAT) ? NFS4_OPEN_CREATE : NFS4_OPEN_NOCREATE, +- .createmode = (flags & O_EXCL) ? 
NFS4_CREATE_EXCLUSIVE : NFS4_CREATE_UNCHECKED, ++ .open_flags = flags, + .name = name, + .server = server, + .bitmask = server->attr_bitmask, ++ .claim = NFS4_OPEN_CLAIM_NULL, + }; + struct nfs_openres o_res = { + .f_attr = &f_attr, +@@ -261,60 +507,44 @@ nfs4_do_open(struct inode *dir, struct q + .rpc_cred = cred, + }; + +-retry: ++ /* Protect against reboot recovery conflicts */ ++ down_read(&clp->cl_sem); + status = -ENOMEM; +- if (!(sp = nfs4_get_state_owner(NFS_SERVER(dir), cred))) { ++ if (!(sp = nfs4_get_state_owner(server, cred))) { + dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n"); +- goto out; ++ goto out_err; + } +- if (o_arg.createmode & NFS4_CREATE_EXCLUSIVE){ ++ if (flags & O_EXCL) { + u32 *p = (u32 *) o_arg.u.verifier.data; + p[0] = jiffies; + p[1] = current->pid; +- } else if (o_arg.createmode == NFS4_CREATE_UNCHECKED) { ++ } else + o_arg.u.attrs = sattr; +- } + /* Serialization for the sequence id */ + down(&sp->so_sema); + o_arg.seqid = sp->so_seqid; + o_arg.id = sp->so_id; +- o_arg.clientid = NFS_SERVER(dir)->nfs4_state->cl_clientid, ++ o_arg.clientid = clp->cl_clientid; + + status = rpc_call_sync(server->client, &msg, 0); + nfs4_increment_seqid(status, sp); + if (status) +- goto out_up; ++ goto out_err; + update_changeattr(dir, &o_res.cinfo); ++ if(o_res.rflags & NFS4_OPEN_RESULT_CONFIRM) { ++ status = _nfs4_proc_open_confirm(server->client, &o_res.fh, sp, &o_res.stateid); ++ if (status) ++ goto out_err; ++ } + + status = -ENOMEM; + inode = nfs_fhget(dir->i_sb, &o_res.fh, &f_attr); + if (!inode) +- goto out_up; ++ goto out_err; + state = nfs4_get_open_state(inode, sp); + if (!state) +- goto out_up; +- +- if(o_res.rflags & NFS4_OPEN_RESULT_CONFIRM) { +- struct nfs_open_confirmargs oc_arg = { +- .fh = &o_res.fh, +- .seqid = sp->so_seqid, +- }; +- struct nfs_open_confirmres oc_res; +- struct rpc_message msg = { +- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_CONFIRM], +- .rpc_argp = &oc_arg, +- .rpc_resp = &oc_res, +- .rpc_cred = cred, 
+- }; +- +- memcpy(&oc_arg.stateid, &o_res.stateid, sizeof(oc_arg.stateid)); +- status = rpc_call_sync(server->client, &msg, 0); +- nfs4_increment_seqid(status, sp); +- if (status) +- goto out_up; +- memcpy(&state->stateid, &oc_res.stateid, sizeof(state->stateid)); +- } else +- memcpy(&state->stateid, &o_res.stateid, sizeof(state->stateid)); ++ goto out_err; ++ memcpy(&state->stateid, &o_res.stateid, sizeof(state->stateid)); + spin_lock(&inode->i_lock); + if (flags & FMODE_READ) + state->nreaders++; +@@ -322,47 +552,62 @@ retry: + state->nwriters++; + state->state |= flags & (FMODE_READ|FMODE_WRITE); + spin_unlock(&inode->i_lock); +- ++ if (o_res.delegation_type != 0) ++ nfs_inode_set_delegation(inode, cred, &o_res); + up(&sp->so_sema); + nfs4_put_state_owner(sp); +- return state; +- +-out_up: +- up(&sp->so_sema); +- nfs4_put_state_owner(sp); +- if (state) { +- nfs4_put_open_state(state); +- state = NULL; +- } +- if (inode) { ++ up_read(&clp->cl_sem); ++ *res = state; ++ return 0; ++out_err: ++ if (sp != NULL) { ++ if (state != NULL) ++ nfs4_put_open_state(state); ++ up(&sp->so_sema); ++ nfs4_put_state_owner(sp); ++ } ++ /* Note: clp->cl_sem must be released before nfs4_put_open_state()! */ ++ up_read(&clp->cl_sem); ++ if (inode != NULL) + iput(inode); +- inode = NULL; +- } +- /* NOTE: BAD_SEQID means the server and client disagree about the +- * book-keeping w.r.t. state-changing operations +- * (OPEN/CLOSE/LOCK/LOCKU...) +- * It is actually a sign of a bug on the client or on the server. +- * +- * If we receive a BAD_SEQID error in the particular case of +- * doing an OPEN, we assume that nfs4_increment_seqid() will +- * have unhashed the old state_owner for us, and that we can +- * therefore safely retry using a new one. We should still warn +- * the user though... 
+- */ +- if (status == -NFS4ERR_BAD_SEQID) { +- printk(KERN_WARNING "NFS: v4 server returned a bad sequence-id error!\n"); +- goto retry; +- } +- status = nfs4_handle_error(server, status); +- if (!status) +- goto retry; +- BUG_ON(status < -1000 || status > 0); +-out: +- return ERR_PTR(status); ++ *res = NULL; ++ return status; + } + +-int +-nfs4_do_setattr(struct nfs_server *server, struct nfs_fattr *fattr, ++ ++struct nfs4_state *nfs4_do_open(struct inode *dir, struct qstr *name, int flags, struct iattr *sattr, struct rpc_cred *cred) ++{ ++ struct nfs4_exception exception = { }; ++ struct nfs4_state *res; ++ int status; ++ ++ do { ++ status = _nfs4_do_open(dir, name, flags, sattr, cred, &res); ++ if (status == 0) ++ break; ++ /* NOTE: BAD_SEQID means the server and client disagree about the ++ * book-keeping w.r.t. state-changing operations ++ * (OPEN/CLOSE/LOCK/LOCKU...) ++ * It is actually a sign of a bug on the client or on the server. ++ * ++ * If we receive a BAD_SEQID error in the particular case of ++ * doing an OPEN, we assume that nfs4_increment_seqid() will ++ * have unhashed the old state_owner for us, and that we can ++ * therefore safely retry using a new one. We should still warn ++ * the user though... 
++ */ ++ if (status == -NFS4ERR_BAD_SEQID) { ++ printk(KERN_WARNING "NFS: v4 server returned a bad sequence-id error!\n"); ++ exception.retry = 1; ++ continue; ++ } ++ res = ERR_PTR(nfs4_handle_exception(NFS_SERVER(dir), ++ status, &exception)); ++ } while (exception.retry); ++ return res; ++} ++ ++static int _nfs4_do_setattr(struct nfs_server *server, struct nfs_fattr *fattr, + struct nfs_fh *fhandle, struct iattr *sattr, + struct nfs4_state *state) + { +@@ -381,9 +626,7 @@ nfs4_do_setattr(struct nfs_server *serve + .rpc_argp = &arg, + .rpc_resp = &res, + }; +- int status; + +-retry: + fattr->valid = 0; + + if (sattr->ia_valid & ATTR_SIZE) +@@ -391,13 +634,22 @@ retry: + else + memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); + +- status = rpc_call_sync(server->client, &msg, 0); +- if (status) { +- status = nfs4_handle_error(server, status); +- if (!status) +- goto retry; +- } +- return status; ++ return rpc_call_sync(server->client, &msg, 0); ++} ++ ++int nfs4_do_setattr(struct nfs_server *server, struct nfs_fattr *fattr, ++ struct nfs_fh *fhandle, struct iattr *sattr, ++ struct nfs4_state *state) ++{ ++ struct nfs4_exception exception = { }; ++ int err; ++ do { ++ err = nfs4_handle_exception(server, ++ _nfs4_do_setattr(server, fattr, fhandle, sattr, ++ state), ++ &exception); ++ } while (exception.retry); ++ return err; + } + + /* +@@ -411,8 +663,7 @@ retry: + * + * NOTE: Caller must be holding the sp->so_owner semaphore! 
+ */ +-int +-nfs4_do_close(struct inode *inode, struct nfs4_state *state) ++static int _nfs4_do_close(struct inode *inode, struct nfs4_state *state) + { + struct nfs4_state_owner *sp = state->owner; + int status = 0; +@@ -426,6 +677,8 @@ nfs4_do_close(struct inode *inode, struc + .rpc_resp = &res, + }; + ++ if (test_bit(NFS_DELEGATED_STATE, &state->flags)) ++ return 0; + memcpy(&arg.stateid, &state->stateid, sizeof(arg.stateid)); + /* Serialization for the sequence id */ + arg.seqid = sp->so_seqid, +@@ -441,15 +694,34 @@ nfs4_do_close(struct inode *inode, struc + return status; + } + +-int +-nfs4_do_downgrade(struct inode *inode, struct nfs4_state *state, mode_t mode) ++int nfs4_do_close(struct inode *inode, struct nfs4_state *state) ++{ ++ struct nfs_server *server = NFS_SERVER(state->inode); ++ struct nfs4_exception exception = { }; ++ int err; ++ do { ++ err = _nfs4_do_close(inode, state); ++ switch (err) { ++ case -NFS4ERR_STALE_STATEID: ++ case -NFS4ERR_EXPIRED: ++ nfs4_schedule_state_recovery(server->nfs4_state); ++ case 0: ++ state->state = 0; ++ return 0; ++ } ++ err = nfs4_handle_exception(server, err, &exception); ++ } while (exception.retry); ++ return err; ++} ++ ++static int _nfs4_do_downgrade(struct inode *inode, struct nfs4_state *state, mode_t mode) + { + struct nfs4_state_owner *sp = state->owner; + int status = 0; + struct nfs_closeargs arg = { + .fh = NFS_FH(inode), + .seqid = sp->so_seqid, +- .share_access = mode, ++ .open_flags = mode, + }; + struct nfs_closeres res; + struct rpc_message msg = { +@@ -458,6 +730,8 @@ nfs4_do_downgrade(struct inode *inode, s + .rpc_resp = &res, + }; + ++ if (test_bit(NFS_DELEGATED_STATE, &state->flags)) ++ return 0; + memcpy(&arg.stateid, &state->stateid, sizeof(arg.stateid)); + status = rpc_call_sync(NFS_SERVER(inode)->client, &msg, 0); + nfs4_increment_seqid(status, sp); +@@ -467,6 +741,26 @@ nfs4_do_downgrade(struct inode *inode, s + return status; + } + ++int nfs4_do_downgrade(struct inode *inode, struct 
nfs4_state *state, mode_t mode) ++{ ++ struct nfs_server *server = NFS_SERVER(state->inode); ++ struct nfs4_exception exception = { }; ++ int err; ++ do { ++ err = _nfs4_do_downgrade(inode, state, mode); ++ switch (err) { ++ case -NFS4ERR_STALE_STATEID: ++ case -NFS4ERR_EXPIRED: ++ nfs4_schedule_state_recovery(server->nfs4_state); ++ case 0: ++ state->state = mode; ++ return 0; ++ } ++ err = nfs4_handle_exception(server, err, &exception); ++ } while (exception.retry); ++ return err; ++} ++ + struct inode * + nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) + { +@@ -500,7 +794,9 @@ nfs4_open_revalidate(struct inode *dir, + struct inode *inode; + + cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0); +- state = nfs4_do_open(dir, &dentry->d_name, openflags, NULL, cred); ++ state = nfs4_open_delegated(dentry->d_inode, openflags, cred); ++ if (IS_ERR(state)) ++ state = nfs4_do_open(dir, &dentry->d_name, openflags, NULL, cred); + put_rpccred(cred); + if (state == ERR_PTR(-ENOENT) && dentry->d_inode == 0) + return 1; +@@ -518,7 +814,7 @@ nfs4_open_revalidate(struct inode *dir, + } + + +-static int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) ++static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) + { + struct nfs4_server_caps_res res = {}; + struct rpc_message msg = { +@@ -542,7 +838,19 @@ static int nfs4_server_capabilities(stru + return status; + } + +-static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, ++static int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) ++{ ++ struct nfs4_exception exception = { }; ++ int err; ++ do { ++ err = nfs4_handle_exception(server, ++ _nfs4_server_capabilities(server, fhandle), ++ &exception); ++ } while (exception.retry); ++ return err; ++} ++ ++static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) + { + struct nfs_fattr * 
fattr = info->fattr; +@@ -563,6 +871,19 @@ static int nfs4_lookup_root(struct nfs_s + return rpc_call_sync(server->client, &msg, 0); + } + ++static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, ++ struct nfs_fsinfo *info) ++{ ++ struct nfs4_exception exception = { }; ++ int err; ++ do { ++ err = nfs4_handle_exception(server, ++ _nfs4_lookup_root(server, fhandle, info), ++ &exception); ++ } while (exception.retry); ++ return err; ++} ++ + static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) + { +@@ -597,6 +918,8 @@ static int nfs4_proc_get_root(struct nfs + + p = server->mnt_path; + for (;;) { ++ struct nfs4_exception exception = { }; ++ + while (*p == '/') + p++; + if (!*p) +@@ -606,9 +929,13 @@ static int nfs4_proc_get_root(struct nfs + p++; + q.len = p - q.name; + +- fattr->valid = 0; +- status = rpc_call_sync(server->client, &msg, 0); +- if (!status) ++ do { ++ fattr->valid = 0; ++ status = nfs4_handle_exception(server, ++ rpc_call_sync(server->client, &msg, 0), ++ &exception); ++ } while (exception.retry); ++ if (status == 0) + continue; + if (status == -ENOENT) { + printk(KERN_NOTICE "NFS: mount path %s does not exist!\n", server->mnt_path); +@@ -621,10 +948,10 @@ static int nfs4_proc_get_root(struct nfs + if (status == 0) + status = nfs4_do_fsinfo(server, fhandle, info); + out: +- return nfs4_map_errors(status); ++ return status; + } + +-static int nfs4_proc_getattr(struct inode *inode, struct nfs_fattr *fattr) ++static int _nfs4_proc_getattr(struct inode *inode, struct nfs_fattr *fattr) + { + struct nfs_server *server = NFS_SERVER(inode); + struct nfs4_getattr_arg args = { +@@ -642,8 +969,19 @@ static int nfs4_proc_getattr(struct inod + }; + + fattr->valid = 0; ++ return rpc_call_sync(NFS_CLIENT(inode), &msg, 0); ++} + +- return nfs4_map_errors(rpc_call_sync(NFS_CLIENT(inode), &msg, 0)); ++static int nfs4_proc_getattr(struct inode *inode, struct nfs_fattr *fattr) ++{ ++ struct 
nfs4_exception exception = { }; ++ int err; ++ do { ++ err = nfs4_handle_exception(NFS_SERVER(inode), ++ _nfs4_proc_getattr(inode, fattr), ++ &exception); ++ } while (exception.retry); ++ return err; + } + + /* +@@ -678,9 +1016,13 @@ nfs4_proc_setattr(struct dentry *dentry, + if (size_change) { + struct rpc_cred *cred = rpcauth_lookupcred(NFS_SERVER(inode)->client->cl_auth, 0); + state = nfs4_find_state(inode, cred, FMODE_WRITE); +- if (!state) { +- state = nfs4_do_open(dentry->d_parent->d_inode, +- &dentry->d_name, FMODE_WRITE, NULL, cred); ++ if (state == NULL) { ++ state = nfs4_open_delegated(dentry->d_inode, ++ FMODE_WRITE, cred); ++ if (IS_ERR(state)) ++ state = nfs4_do_open(dentry->d_parent->d_inode, ++ &dentry->d_name, FMODE_WRITE, ++ NULL, cred); + need_iput = 1; + } + put_rpccred(cred); +@@ -705,7 +1047,7 @@ out: + return status; + } + +-static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, ++static int _nfs4_proc_lookup(struct inode *dir, struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) + { + int status; +@@ -731,12 +1073,23 @@ static int nfs4_proc_lookup(struct inode + dprintk("NFS call lookup %s\n", name->name); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + dprintk("NFS reply lookup: %d\n", status); +- return nfs4_map_errors(status); ++ return status; + } + +-static int nfs4_proc_access(struct inode *inode, struct rpc_cred *cred, int mode) ++static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) ++{ ++ struct nfs4_exception exception = { }; ++ int err; ++ do { ++ err = nfs4_handle_exception(NFS_SERVER(dir), ++ _nfs4_proc_lookup(dir, name, fhandle, fattr), ++ &exception); ++ } while (exception.retry); ++ return err; ++} ++ ++static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) + { +- int status; + struct nfs4_accessargs args = { + .fh = NFS_FH(inode), + }; +@@ -745,8 +1098,10 @@ static int nfs4_proc_access(struct inode + 
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS], + .rpc_argp = &args, + .rpc_resp = &res, +- .rpc_cred = cred, ++ .rpc_cred = entry->cred, + }; ++ int mode = entry->mask; ++ int status; + + /* + * Determine which access bits we want to ask for... +@@ -758,8 +1113,7 @@ static int nfs4_proc_access(struct inode + args.access |= NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE; + if (mode & MAY_EXEC) + args.access |= NFS4_ACCESS_LOOKUP; +- } +- else { ++ } else { + if (mode & MAY_WRITE) + args.access |= NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND; + if (mode & MAY_EXEC) +@@ -767,13 +1121,27 @@ static int nfs4_proc_access(struct inode + } + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + if (!status) { +- if (args.access != res.supported) { +- printk(KERN_NOTICE "NFS: server didn't support all access bits!\n"); +- status = -ENOTSUPP; +- } else if ((args.access & res.access) != args.access) +- status = -EACCES; ++ entry->mask = 0; ++ if (res.access & NFS4_ACCESS_READ) ++ entry->mask |= MAY_READ; ++ if (res.access & (NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE)) ++ entry->mask |= MAY_WRITE; ++ if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE)) ++ entry->mask |= MAY_EXEC; + } +- return nfs4_map_errors(status); ++ return status; ++} ++ ++static int nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) ++{ ++ struct nfs4_exception exception = { }; ++ int err; ++ do { ++ err = nfs4_handle_exception(NFS_SERVER(inode), ++ _nfs4_proc_access(inode, entry), ++ &exception); ++ } while (exception.retry); ++ return err; + } + + /* +@@ -800,7 +1168,7 @@ static int nfs4_proc_access(struct inode + * Both of these changes to the XDR layer would in fact be quite + * minor, but I decided to leave them for a subsequent patch. 
+ */ +-static int nfs4_proc_readlink(struct inode *inode, struct page *page) ++static int _nfs4_proc_readlink(struct inode *inode, struct page *page) + { + struct nfs4_readlink args = { + .fh = NFS_FH(inode), +@@ -813,11 +1181,22 @@ static int nfs4_proc_readlink(struct ino + .rpc_resp = NULL, + }; + +- return nfs4_map_errors(rpc_call_sync(NFS_CLIENT(inode), &msg, 0)); ++ return rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + } + +-static int +-nfs4_proc_read(struct nfs_read_data *rdata, struct file *filp) ++static int nfs4_proc_readlink(struct inode *inode, struct page *page) ++{ ++ struct nfs4_exception exception = { }; ++ int err; ++ do { ++ err = nfs4_handle_exception(NFS_SERVER(inode), ++ _nfs4_proc_readlink(inode, page), ++ &exception); ++ } while (exception.retry); ++ return err; ++} ++ ++static int _nfs4_proc_read(struct nfs_read_data *rdata) + { + int flags = rdata->flags; + struct inode *inode = rdata->inode; +@@ -827,6 +1206,7 @@ nfs4_proc_read(struct nfs_read_data *rda + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ], + .rpc_argp = &rdata->args, + .rpc_resp = &rdata->res, ++ .rpc_cred = rdata->cred, + }; + unsigned long timestamp = jiffies; + int status; +@@ -834,29 +1214,27 @@ nfs4_proc_read(struct nfs_read_data *rda + dprintk("NFS call read %d @ %Ld\n", rdata->args.count, + (long long) rdata->args.offset); + +- /* +- * Try first to use O_RDONLY, then O_RDWR stateid. 
+- */ +- if (filp) { +- struct nfs4_state *state; +- state = (struct nfs4_state *)filp->private_data; +- rdata->args.state = state; +- msg.rpc_cred = state->owner->so_cred; +- } else { +- rdata->args.state = NULL; +- msg.rpc_cred = NFS_I(inode)->mm_cred; +- } +- + fattr->valid = 0; + status = rpc_call_sync(server->client, &msg, flags); + if (!status) + renew_lease(server, timestamp); + dprintk("NFS reply read: %d\n", status); +- return nfs4_map_errors(status); ++ return status; + } + +-static int +-nfs4_proc_write(struct nfs_write_data *wdata, struct file *filp) ++static int nfs4_proc_read(struct nfs_read_data *rdata) ++{ ++ struct nfs4_exception exception = { }; ++ int err; ++ do { ++ err = nfs4_handle_exception(NFS_SERVER(rdata->inode), ++ _nfs4_proc_read(rdata), ++ &exception); ++ } while (exception.retry); ++ return err; ++} ++ ++static int _nfs4_proc_write(struct nfs_write_data *wdata) + { + int rpcflags = wdata->flags; + struct inode *inode = wdata->inode; +@@ -866,33 +1244,32 @@ nfs4_proc_write(struct nfs_write_data *w + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE], + .rpc_argp = &wdata->args, + .rpc_resp = &wdata->res, ++ .rpc_cred = wdata->cred, + }; + int status; + + dprintk("NFS call write %d @ %Ld\n", wdata->args.count, + (long long) wdata->args.offset); + +- /* +- * Try first to use O_WRONLY, then O_RDWR stateid. 
+- */ +- if (filp) { +- struct nfs4_state *state; +- state = (struct nfs4_state *)filp->private_data; +- wdata->args.state = state; +- msg.rpc_cred = state->owner->so_cred; +- } else { +- wdata->args.state = NULL; +- msg.rpc_cred = NFS_I(inode)->mm_cred; +- } +- + fattr->valid = 0; + status = rpc_call_sync(server->client, &msg, rpcflags); + dprintk("NFS reply write: %d\n", status); +- return nfs4_map_errors(status); ++ return status; + } + +-static int +-nfs4_proc_commit(struct nfs_write_data *cdata, struct file *filp) ++static int nfs4_proc_write(struct nfs_write_data *wdata) ++{ ++ struct nfs4_exception exception = { }; ++ int err; ++ do { ++ err = nfs4_handle_exception(NFS_SERVER(wdata->inode), ++ _nfs4_proc_write(wdata), ++ &exception); ++ } while (exception.retry); ++ return err; ++} ++ ++static int _nfs4_proc_commit(struct nfs_write_data *cdata) + { + struct inode *inode = cdata->inode; + struct nfs_fattr *fattr = cdata->res.fattr; +@@ -901,24 +1278,29 @@ nfs4_proc_commit(struct nfs_write_data * + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT], + .rpc_argp = &cdata->args, + .rpc_resp = &cdata->res, ++ .rpc_cred = cdata->cred, + }; + int status; + + dprintk("NFS call commit %d @ %Ld\n", cdata->args.count, + (long long) cdata->args.offset); + +- /* +- * Try first to use O_WRONLY, then O_RDWR stateid. 
+- */ +- if (filp) +- msg.rpc_cred = ((struct nfs4_state *)filp->private_data)->owner->so_cred; +- else +- msg.rpc_cred = NFS_I(inode)->mm_cred; +- + fattr->valid = 0; + status = rpc_call_sync(server->client, &msg, 0); + dprintk("NFS reply commit: %d\n", status); +- return nfs4_map_errors(status); ++ return status; ++} ++ ++static int nfs4_proc_commit(struct nfs_write_data *cdata) ++{ ++ struct nfs4_exception exception = { }; ++ int err; ++ do { ++ err = nfs4_handle_exception(NFS_SERVER(cdata->inode), ++ _nfs4_proc_commit(cdata), ++ &exception); ++ } while (exception.retry); ++ return err; + } + + /* +@@ -965,7 +1347,7 @@ nfs4_proc_create(struct inode *dir, stru + return inode; + } + +-static int nfs4_proc_remove(struct inode *dir, struct qstr *name) ++static int _nfs4_proc_remove(struct inode *dir, struct qstr *name) + { + struct nfs4_remove_arg args = { + .fh = NFS_FH(dir), +@@ -982,7 +1364,19 @@ static int nfs4_proc_remove(struct inode + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + if (status == 0) + update_changeattr(dir, &res); +- return nfs4_map_errors(status); ++ return status; ++} ++ ++static int nfs4_proc_remove(struct inode *dir, struct qstr *name) ++{ ++ struct nfs4_exception exception = { }; ++ int err; ++ do { ++ err = nfs4_handle_exception(NFS_SERVER(dir), ++ _nfs4_proc_remove(dir, name), ++ &exception); ++ } while (exception.retry); ++ return err; + } + + struct unlink_desc { +@@ -1023,7 +1417,7 @@ static int nfs4_proc_unlink_done(struct + return 0; + } + +-static int nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, ++static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, + struct inode *new_dir, struct qstr *new_name) + { + struct nfs4_rename_arg arg = { +@@ -1046,10 +1440,24 @@ static int nfs4_proc_rename(struct inode + update_changeattr(old_dir, &res.old_cinfo); + update_changeattr(new_dir, &res.new_cinfo); + } +- return nfs4_map_errors(status); ++ return status; + } + +-static int nfs4_proc_link(struct 
inode *inode, struct inode *dir, struct qstr *name) ++static int nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, ++ struct inode *new_dir, struct qstr *new_name) ++{ ++ struct nfs4_exception exception = { }; ++ int err; ++ do { ++ err = nfs4_handle_exception(NFS_SERVER(old_dir), ++ _nfs4_proc_rename(old_dir, old_name, ++ new_dir, new_name), ++ &exception); ++ } while (exception.retry); ++ return err; ++} ++ ++static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) + { + struct nfs4_link_arg arg = { + .fh = NFS_FH(inode), +@@ -1068,10 +1476,22 @@ static int nfs4_proc_link(struct inode * + if (!status) + update_changeattr(dir, &cinfo); + +- return nfs4_map_errors(status); ++ return status; ++} ++ ++static int nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) ++{ ++ struct nfs4_exception exception = { }; ++ int err; ++ do { ++ err = nfs4_handle_exception(NFS_SERVER(inode), ++ _nfs4_proc_link(inode, dir, name), ++ &exception); ++ } while (exception.retry); ++ return err; + } + +-static int nfs4_proc_symlink(struct inode *dir, struct qstr *name, ++static int _nfs4_proc_symlink(struct inode *dir, struct qstr *name, + struct qstr *path, struct iattr *sattr, struct nfs_fh *fhandle, + struct nfs_fattr *fattr) + { +@@ -1090,22 +1510,39 @@ static int nfs4_proc_symlink(struct inod + .fattr = fattr, + }; + struct rpc_message msg = { +- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE], ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK], + .rpc_argp = &arg, + .rpc_resp = &res, + }; + int status; + ++ if (path->len > NFS4_MAXPATHLEN) ++ return -ENAMETOOLONG; + arg.u.symlink = path; + fattr->valid = 0; + + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + if (!status) + update_changeattr(dir, &res.dir_cinfo); +- return nfs4_map_errors(status); ++ return status; + } + +-static int nfs4_proc_mkdir(struct inode *dir, struct qstr *name, ++static int nfs4_proc_symlink(struct inode *dir, struct qstr *name, ++ 
struct qstr *path, struct iattr *sattr, struct nfs_fh *fhandle, ++ struct nfs_fattr *fattr) ++{ ++ struct nfs4_exception exception = { }; ++ int err; ++ do { ++ err = nfs4_handle_exception(NFS_SERVER(dir), ++ _nfs4_proc_symlink(dir, name, path, sattr, ++ fhandle, fattr), ++ &exception); ++ } while (exception.retry); ++ return err; ++} ++ ++static int _nfs4_proc_mkdir(struct inode *dir, struct qstr *name, + struct iattr *sattr, struct nfs_fh *fhandle, + struct nfs_fattr *fattr) + { +@@ -1135,10 +1572,25 @@ static int nfs4_proc_mkdir(struct inode + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + if (!status) + update_changeattr(dir, &res.dir_cinfo); +- return nfs4_map_errors(status); ++ return status; + } + +-static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, ++static int nfs4_proc_mkdir(struct inode *dir, struct qstr *name, ++ struct iattr *sattr, struct nfs_fh *fhandle, ++ struct nfs_fattr *fattr) ++{ ++ struct nfs4_exception exception = { }; ++ int err; ++ do { ++ err = nfs4_handle_exception(NFS_SERVER(dir), ++ _nfs4_proc_mkdir(dir, name, sattr, ++ fhandle, fattr), ++ &exception); ++ } while (exception.retry); ++ return err; ++} ++ ++static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, + u64 cookie, struct page *page, unsigned int count, int plus) + { + struct inode *dir = dentry->d_inode; +@@ -1164,10 +1616,24 @@ static int nfs4_proc_readdir(struct dent + if (status == 0) + memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE); + unlock_kernel(); +- return nfs4_map_errors(status); ++ return status; + } + +-static int nfs4_proc_mknod(struct inode *dir, struct qstr *name, ++static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, ++ u64 cookie, struct page *page, unsigned int count, int plus) ++{ ++ struct nfs4_exception exception = { }; ++ int err; ++ do { ++ err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode), ++ _nfs4_proc_readdir(dentry, cred, cookie, ++ page, count, plus), ++ 
&exception); ++ } while (exception.retry); ++ return err; ++} ++ ++static int _nfs4_proc_mknod(struct inode *dir, struct qstr *name, + struct iattr *sattr, dev_t rdev, struct nfs_fh *fh, + struct nfs_fattr *fattr) + { +@@ -1214,10 +1680,25 @@ static int nfs4_proc_mknod(struct inode + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + if (!status) + update_changeattr(dir, &res.dir_cinfo); +- return nfs4_map_errors(status); ++ return status; ++} ++ ++static int nfs4_proc_mknod(struct inode *dir, struct qstr *name, ++ struct iattr *sattr, dev_t rdev, struct nfs_fh *fh, ++ struct nfs_fattr *fattr) ++{ ++ struct nfs4_exception exception = { }; ++ int err; ++ do { ++ err = nfs4_handle_exception(NFS_SERVER(dir), ++ _nfs4_proc_mknod(dir, name, sattr, rdev, ++ fh, fattr), ++ &exception); ++ } while (exception.retry); ++ return err; + } + +-static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, ++static int _nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsstat *fsstat) + { + struct nfs4_statfs_arg args = { +@@ -1231,10 +1712,22 @@ static int nfs4_proc_statfs(struct nfs_s + }; + + fsstat->fattr->valid = 0; +- return nfs4_map_errors(rpc_call_sync(server->client, &msg, 0)); ++ return rpc_call_sync(server->client, &msg, 0); + } + +-static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, ++static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsstat *fsstat) ++{ ++ struct nfs4_exception exception = { }; ++ int err; ++ do { ++ err = nfs4_handle_exception(server, ++ _nfs4_proc_statfs(server, fhandle, fsstat), ++ &exception); ++ } while (exception.retry); ++ return err; ++} ++ ++static int _nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *fsinfo) + { + struct nfs4_fsinfo_arg args = { +@@ -1247,16 +1740,29 @@ static int nfs4_do_fsinfo(struct nfs_ser + .rpc_resp = fsinfo, + }; + +- return 
nfs4_map_errors(rpc_call_sync(server->client, &msg, 0)); ++ return rpc_call_sync(server->client, &msg, 0); ++} ++ ++static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo) ++{ ++ struct nfs4_exception exception = { }; ++ int err; ++ ++ do { ++ err = nfs4_handle_exception(server, ++ _nfs4_do_fsinfo(server, fhandle, fsinfo), ++ &exception); ++ } while (exception.retry); ++ return err; + } + + static int nfs4_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo) + { + fsinfo->fattr->valid = 0; +- return nfs4_map_errors(nfs4_do_fsinfo(server, fhandle, fsinfo)); ++ return nfs4_do_fsinfo(server, fhandle, fsinfo); + } + +-static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, ++static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_pathconf *pathconf) + { + struct nfs4_pathconf_arg args = { +@@ -1276,7 +1782,21 @@ static int nfs4_proc_pathconf(struct nfs + } + + pathconf->fattr->valid = 0; +- return nfs4_map_errors(rpc_call_sync(server->client, &msg, 0)); ++ return rpc_call_sync(server->client, &msg, 0); ++} ++ ++static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, ++ struct nfs_pathconf *pathconf) ++{ ++ struct nfs4_exception exception = { }; ++ int err; ++ ++ do { ++ err = nfs4_handle_exception(server, ++ _nfs4_proc_pathconf(server, fhandle, pathconf), ++ &exception); ++ } while (exception.retry); ++ return err; + } + + static void +@@ -1467,8 +1987,10 @@ static int + nfs4_proc_file_open(struct inode *inode, struct file *filp) + { + struct dentry *dentry = filp->f_dentry; +- struct nfs4_state *state; ++ struct nfs_open_context *ctx; ++ struct nfs4_state *state = NULL; + struct rpc_cred *cred; ++ int status = -ENOMEM; + + dprintk("nfs4_proc_file_open: starting on (%.*s/%.*s)\n", + (int)dentry->d_parent->d_name.len, +@@ -1478,21 +2000,28 @@ nfs4_proc_file_open(struct inode *inode, + + /* Find our 
open stateid */ + cred = rpcauth_lookupcred(NFS_SERVER(inode)->client->cl_auth, 0); +- state = nfs4_find_state(inode, cred, filp->f_mode); ++ if (unlikely(cred == NULL)) ++ return -ENOMEM; ++ ctx = alloc_nfs_open_context(dentry, cred); + put_rpccred(cred); +- if (state == NULL) { +- printk(KERN_WARNING "NFS: v4 raced in function %s\n", __FUNCTION__); +- return -EIO; /* ERACE actually */ +- } ++ if (unlikely(ctx == NULL)) ++ return -ENOMEM; ++ status = -EIO; /* ERACE actually */ ++ state = nfs4_find_state(inode, cred, filp->f_mode); ++ if (unlikely(state == NULL)) ++ goto no_state; ++ ctx->state = state; + nfs4_close_state(state, filp->f_mode); +- if (filp->f_mode & FMODE_WRITE) { +- lock_kernel(); +- nfs_set_mmcred(inode, state->owner->so_cred); ++ ctx->mode = filp->f_mode; ++ nfs_file_set_open_context(filp, ctx); ++ put_nfs_open_context(ctx); ++ if (filp->f_mode & FMODE_WRITE) + nfs_begin_data_update(inode); +- unlock_kernel(); +- } +- filp->private_data = state; + return 0; ++no_state: ++ printk(KERN_WARNING "NFS: v4 raced in function %s\n", __FUNCTION__); ++ put_nfs_open_context(ctx); ++ return status; + } + + /* +@@ -1501,35 +2030,148 @@ nfs4_proc_file_open(struct inode *inode, + static int + nfs4_proc_file_release(struct inode *inode, struct file *filp) + { +- struct nfs4_state *state = (struct nfs4_state *)filp->private_data; +- +- if (state) +- nfs4_close_state(state, filp->f_mode); +- if (filp->f_mode & FMODE_WRITE) { +- lock_kernel(); ++ if (filp->f_mode & FMODE_WRITE) + nfs_end_data_update(inode); +- unlock_kernel(); +- } ++ nfs_file_clear_open_context(filp); + return 0; + } + +-/* +- * Set up the nfspage struct with the right state info and credentials +- */ ++static ssize_t ++nfs4_read_acl_attr(struct inode *inode, char *buf, ssize_t buflen) ++{ ++ struct nfs_inode *nfsi = NFS_I(inode); ++ int ret; ++ ++ spin_lock(&inode->i_lock); ++ if (buf == NULL && nfsi->acl_len) ++ goto out_len; ++ ret = -ENOENT; ++ if (nfsi->acl_len == 0) ++ goto out; ++ ret = 
-ERANGE; /* see getxattr(2) man page */ ++ if (nfsi->acl_len > buflen) ++ goto out; ++ memcpy(buf, nfsi->acl, nfsi->acl_len); ++out_len: ++ ret = nfsi->acl_len; ++out: ++ spin_unlock(&inode->i_lock); ++ return ret; ++} ++ + static void +-nfs4_request_init(struct nfs_page *req, struct file *filp) ++nfs4_set_acl_attr(struct inode *inode, char *buf, ssize_t buflen) + { +- struct nfs4_state *state; ++ struct nfs_inode *nfsi = NFS_I(inode); + +- if (!filp) { +- req->wb_cred = get_rpccred(NFS_I(req->wb_inode)->mm_cred); +- req->wb_state = NULL; +- return; ++ spin_lock(&inode->i_lock); ++ kfree(nfsi->acl); ++ nfsi->acl = buf; ++ nfsi->acl_len = buflen; ++ spin_unlock(&inode->i_lock); ++} ++ ++static int ++nfs4_write_acl_attr(struct inode *inode, const char *buf, ssize_t buflen) ++{ ++ void *abuf = NULL; ++ ++ if (buflen > PAGE_SIZE) ++ goto out_nomem; ++ abuf = kmalloc(buflen, GFP_KERNEL); ++ if (abuf == NULL) ++ goto out_nomem; ++ memcpy(abuf, buf, buflen); ++ nfs4_set_acl_attr(inode, abuf, buflen); ++ return 0; ++out_nomem: ++ nfs4_set_acl_attr(inode, NULL, 0); ++ return -ENOMEM; ++} ++ ++void ++nfs4_zap_acl_attr(struct inode *inode) ++{ ++ nfs4_set_acl_attr(inode, NULL, 0); ++} ++ ++static int ++nfs4_server_supports_acls(struct nfs_server *server) ++{ ++ return (server->caps & NFS_CAP_ACLS) ++ && (server->acl_bitmask & ACL4_SUPPORT_ALLOW_ACL) ++ && (server->acl_bitmask & ACL4_SUPPORT_DENY_ACL); ++} ++ ++ssize_t ++nfs4_proc_get_acl(struct inode *inode, void *buf, ssize_t buflen) ++{ ++ struct nfs_server *server = NFS_SERVER(inode); ++ struct nfs_getaclres res = { ++ .acl = buf, ++ .acl_len = buflen, ++ .server = server, ++ }; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETACL], ++ .rpc_argp = NFS_FH(inode), ++ .rpc_resp = &res, ++ }; ++ int ret; ++ ++ if (!nfs4_server_supports_acls(server)) ++ return -EOPNOTSUPP; ++ lock_kernel(); ++ ret = nfs_revalidate_inode(NFS_SERVER(inode), inode); ++ if (ret < 0) ++ goto out; ++ ret = 
nfs4_read_acl_attr(inode, buf, buflen); ++ if (ret == -ENOENT) { ++ ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); ++ if (ret == 0) { ++ nfs4_write_acl_attr(inode, res.acl, res.acl_len); ++ ret = res.acl_len; ++ } ++ if (res.acl != buf) { ++ /* xdr decode allocated the memory: */ ++ kfree(res.acl); ++ } + } +- state = (struct nfs4_state *)filp->private_data; +- req->wb_state = state; +- req->wb_cred = get_rpccred(state->owner->so_cred); +- req->wb_lockowner = current->files; ++out: ++ unlock_kernel(); ++ return ret; ++} ++ ++int ++nfs4_proc_set_acl(struct inode *inode, const void *buf, ssize_t buflen) ++{ ++ struct nfs_server *server = NFS_SERVER(inode); ++ struct nfs_setaclargs arg = { ++ .fh = NFS_FH(inode), ++ .server = server, ++ .acl = buf, ++ .acl_len = buflen, ++ }; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETACL], ++ .rpc_argp = &arg, ++ .rpc_resp = NULL, ++ }; ++ int ret; ++ ++ if (!nfs4_server_supports_acls(server)) ++ return -EOPNOTSUPP; ++ ++ /* XXX: should check for buflen too large? 
*/ ++ ++ lock_kernel(); ++ ret = rpc_call_sync(NFS_SERVER(inode)->client, &msg, 0); ++ unlock_kernel(); ++ ++ if (ret == 0) ++ nfs4_write_acl_attr(inode, buf, buflen); ++ ++ return ret; + } + + static int +@@ -1545,11 +2187,13 @@ nfs4_async_handle_error(struct rpc_task + case -NFS4ERR_EXPIRED: + rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL, NULL); + nfs4_schedule_state_recovery(clp); ++ if (test_bit(NFS4CLNT_OK, &clp->cl_state)) ++ rpc_wake_up_task(task); + task->tk_status = 0; + return -EAGAIN; + case -NFS4ERR_GRACE: + case -NFS4ERR_DELAY: +- rpc_delay(task, NFS4_POLL_RETRY_TIME); ++ rpc_delay(task, NFS4_POLL_RETRY_MAX); + task->tk_status = 0; + return -EAGAIN; + case -NFS4ERR_OLD_STATEID: +@@ -1560,12 +2204,11 @@ nfs4_async_handle_error(struct rpc_task + return 0; + } + +-int +-nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs4_client *clp) ++int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs4_client *clp) + { + DEFINE_WAIT(wait); + sigset_t oldset; +- int interruptible, res; ++ int interruptible, res = 0; + + might_sleep(); + +@@ -1573,101 +2216,85 @@ nfs4_wait_clnt_recover(struct rpc_clnt * + interruptible = TASK_UNINTERRUPTIBLE; + if (clnt->cl_intr) + interruptible = TASK_INTERRUPTIBLE; +- do { +- res = 0; +- prepare_to_wait(&clp->cl_waitq, &wait, interruptible); +- nfs4_schedule_state_recovery(clp); +- if (test_bit(NFS4CLNT_OK, &clp->cl_state) && +- !test_bit(NFS4CLNT_SETUP_STATE, &clp->cl_state)) +- break; +- if (clnt->cl_intr && signalled()) { +- res = -ERESTARTSYS; +- break; +- } ++ prepare_to_wait(&clp->cl_waitq, &wait, interruptible); ++ nfs4_schedule_state_recovery(clp); ++ if (clnt->cl_intr && signalled()) ++ res = -ERESTARTSYS; ++ else if (!test_bit(NFS4CLNT_OK, &clp->cl_state)) + schedule(); +- } while(!test_bit(NFS4CLNT_OK, &clp->cl_state)); + finish_wait(&clp->cl_waitq, &wait); + rpc_clnt_sigunmask(clnt, &oldset); + return res; + } + +-static int +-nfs4_delay(struct rpc_clnt *clnt) ++static int nfs4_delay(struct rpc_clnt *clnt, long 
*timeout) + { + sigset_t oldset; + int res = 0; + + might_sleep(); + ++ if (*timeout <= 0) ++ *timeout = NFS4_POLL_RETRY_MIN; ++ if (*timeout > NFS4_POLL_RETRY_MAX) ++ *timeout = NFS4_POLL_RETRY_MAX; + rpc_clnt_sigmask(clnt, &oldset); + if (clnt->cl_intr) { + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(NFS4_POLL_RETRY_TIME); ++ schedule_timeout(*timeout); + if (signalled()) + res = -ERESTARTSYS; + } else { + set_current_state(TASK_UNINTERRUPTIBLE); +- schedule_timeout(NFS4_POLL_RETRY_TIME); ++ schedule_timeout(*timeout); + } + rpc_clnt_sigunmask(clnt, &oldset); ++ *timeout <<= 1; + return res; + } + + /* This is the error handling routine for processes that are allowed + * to sleep. + */ +-int +-nfs4_handle_error(struct nfs_server *server, int errorcode) ++int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception) + { + struct nfs4_client *clp = server->nfs4_state; + int ret = errorcode; + ++ exception->retry = 0; + switch(errorcode) { ++ case 0: ++ return 0; + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_EXPIRED: + ret = nfs4_wait_clnt_recover(server->client, clp); ++ if (ret == 0) ++ exception->retry = 1; + break; + case -NFS4ERR_GRACE: + case -NFS4ERR_DELAY: +- ret = nfs4_delay(server->client); ++ ret = nfs4_delay(server->client, &exception->timeout); ++ if (ret == 0) ++ exception->retry = 1; + break; + case -NFS4ERR_OLD_STATEID: +- ret = 0; ++ if (ret == 0) ++ exception->retry = 1; + } + /* We failed to handle the error */ + return nfs4_map_errors(ret); + } + +- +-static int +-nfs4_request_compatible(struct nfs_page *req, struct file *filp, struct page *page) +-{ +- struct nfs4_state *state = NULL; +- struct rpc_cred *cred = NULL; +- +- if (req->wb_file != filp) +- return 0; +- if (req->wb_page != page) +- return 0; +- state = (struct nfs4_state *)filp->private_data; +- if (req->wb_state != state) +- return 0; +- if (req->wb_lockowner != current->files) +- return 0; +- 
cred = state->owner->so_cred; +- if (req->wb_cred != cred) +- return 0; +- return 1; +-} +- +-int +-nfs4_proc_setclientid(struct nfs4_client *clp, +- u32 program, unsigned short port) ++int nfs4_proc_setclientid(struct nfs4_client *clp, u32 program, unsigned short port) + { +- u32 *p; +- struct nfs4_setclientid setclientid; +- struct timespec tv; ++ static nfs4_verifier sc_verifier; ++ static int initialized; ++ ++ struct nfs4_setclientid setclientid = { ++ .sc_verifier = &sc_verifier, ++ .sc_prog = program, ++ }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID], + .rpc_argp = &setclientid, +@@ -1675,15 +2302,24 @@ nfs4_proc_setclientid(struct nfs4_client + .rpc_cred = clp->cl_cred, + }; + +- tv = CURRENT_TIME; +- p = (u32*)setclientid.sc_verifier.data; +- *p++ = (u32)tv.tv_sec; +- *p = (u32)tv.tv_nsec; +- setclientid.sc_name = clp->cl_ipaddr; +- sprintf(setclientid.sc_netid, "tcp"); +- sprintf(setclientid.sc_uaddr, "%s.%d.%d", clp->cl_ipaddr, port >> 8, port & 255); +- setclientid.sc_prog = htonl(program); +- setclientid.sc_cb_ident = 0; ++ if (!initialized) { ++ struct timespec boot_time; ++ u32 *p; ++ ++ initialized = 1; ++ boot_time = CURRENT_TIME; ++ p = (u32*)sc_verifier.data; ++ *p++ = htonl((u32)boot_time.tv_sec); ++ *p = htonl((u32)boot_time.tv_nsec); ++ } ++ setclientid.sc_name_len = scnprintf(setclientid.sc_name, ++ sizeof(setclientid.sc_name), "%s/%u.%u.%u.%u", ++ clp->cl_ipaddr, NIPQUAD(clp->cl_addr.s_addr)); ++ setclientid.sc_netid_len = scnprintf(setclientid.sc_netid, ++ sizeof(setclientid.sc_netid), "tcp"); ++ setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr, ++ sizeof(setclientid.sc_uaddr), "%s.%d.%d", ++ clp->cl_ipaddr, port >> 8, port & 255); + + return rpc_call_sync(clp->cl_rpcclient, &msg, 0); + } +@@ -1712,6 +2348,40 @@ nfs4_proc_setclientid_confirm(struct nfs + return status; + } + ++static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid) ++{ ++ 
struct nfs4_delegreturnargs args = { ++ .fhandle = NFS_FH(inode), ++ .stateid = stateid, ++ }; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DELEGRETURN], ++ .rpc_argp = &args, ++ .rpc_cred = cred, ++ }; ++ ++ return rpc_call_sync(NFS_CLIENT(inode), &msg, 0); ++} ++ ++int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid) ++{ ++ struct nfs_server *server = NFS_SERVER(inode); ++ struct nfs4_exception exception = { }; ++ int err; ++ do { ++ err = _nfs4_proc_delegreturn(inode, cred, stateid); ++ switch (err) { ++ case -NFS4ERR_STALE_STATEID: ++ case -NFS4ERR_EXPIRED: ++ nfs4_schedule_state_recovery(server->nfs4_state); ++ case 0: ++ return 0; ++ } ++ err = nfs4_handle_exception(server, err, &exception); ++ } while (exception.retry); ++ return err; ++} ++ + #define NFS4_LOCK_MINTIMEOUT (1 * HZ) + #define NFS4_LOCK_MAXTIMEOUT (30 * HZ) + +@@ -1753,8 +2423,7 @@ nfs4_lck_length(struct file_lock *reques + return request->fl_end - request->fl_start + 1; + } + +-int +-nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request) ++static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request) + { + struct inode *inode = state->inode; + struct nfs_server *server = NFS_SERVER(inode); +@@ -1778,9 +2447,10 @@ nfs4_proc_getlk(struct nfs4_state *state + struct nfs4_lock_state *lsp; + int status; + ++ down_read(&clp->cl_sem); + nlo.clientid = clp->cl_clientid; + down(&state->lock_sema); +- lsp = nfs4_find_lock_state(state, request->fl_owner); ++ lsp = nfs4_find_lock_state(state, request->fl_pid); + if (lsp) + nlo.id = lsp->ls_id; + else { +@@ -1811,14 +2481,28 @@ nfs4_proc_getlk(struct nfs4_state *state + if (lsp) + nfs4_put_lock_state(lsp); + up(&state->lock_sema); +- return nfs4_map_errors(status); ++ up_read(&clp->cl_sem); ++ return status; + } + +-int +-nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request) ++static int 
nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request) ++{ ++ struct nfs4_exception exception = { }; ++ int err; ++ ++ do { ++ err = nfs4_handle_exception(NFS_SERVER(state->inode), ++ _nfs4_proc_getlk(state, cmd, request), ++ &exception); ++ } while (exception.retry); ++ return err; ++} ++ ++static int _nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request) + { + struct inode *inode = state->inode; + struct nfs_server *server = NFS_SERVER(inode); ++ struct nfs4_client *clp = server->nfs4_state; + struct nfs_lockargs arg = { + .fh = NFS_FH(inode), + .type = nfs4_lck_type(cmd, request), +@@ -1838,29 +2522,46 @@ nfs4_proc_unlck(struct nfs4_state *state + struct nfs_locku_opargs luargs; + int status = 0; + ++ down_read(&clp->cl_sem); + down(&state->lock_sema); +- lsp = nfs4_find_lock_state(state, request->fl_owner); ++ lsp = nfs4_find_lock_state(state, request->fl_pid); + if (!lsp) + goto out; +- luargs.seqid = lsp->ls_seqid; +- memcpy(&luargs.stateid, &lsp->ls_stateid, sizeof(luargs.stateid)); +- arg.u.locku = &luargs; +- status = rpc_call_sync(server->client, &msg, 0); +- nfs4_increment_lock_seqid(status, lsp); ++ /* We might have lost the locks! 
*/ ++ if ((lsp->flags & NFS_LOCK_INITIALIZED) != 0) { ++ luargs.seqid = lsp->ls_seqid; ++ memcpy(&luargs.stateid, &lsp->ls_stateid, sizeof(luargs.stateid)); ++ arg.u.locku = &luargs; ++ status = rpc_call_sync(server->client, &msg, 0); ++ nfs4_increment_lock_seqid(status, lsp); ++ } + + if (status == 0) { + memcpy(&lsp->ls_stateid, &res.u.stateid, + sizeof(lsp->ls_stateid)); +- nfs4_notify_unlck(inode, request, lsp); ++ nfs4_notify_unlck(state, request, lsp); + } + nfs4_put_lock_state(lsp); + out: + up(&state->lock_sema); +- return nfs4_map_errors(status); ++ up_read(&clp->cl_sem); ++ return status; + } + +-static int +-nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) ++static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request) ++{ ++ struct nfs4_exception exception = { }; ++ int err; ++ ++ do { ++ err = nfs4_handle_exception(NFS_SERVER(state->inode), ++ _nfs4_proc_unlck(state, cmd, request), ++ &exception); ++ } while (exception.retry); ++ return err; ++} ++ ++static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *request, int reclaim) + { + struct inode *inode = state->inode; + struct nfs_server *server = NFS_SERVER(inode); +@@ -1881,23 +2582,22 @@ nfs4_proc_setlk(struct nfs4_state *state + .rpc_cred = state->owner->so_cred, + }; + struct nfs_lock_opargs largs = { ++ .reclaim = reclaim, + .new_lock_owner = 0, + }; + int status; + +- down(&state->lock_sema); +- lsp = nfs4_find_lock_state(state, request->fl_owner); +- if (lsp == NULL) { ++ lsp = nfs4_get_lock_state(state, request->fl_pid); ++ if (lsp == NULL) ++ return -ENOMEM; ++ if (!(lsp->flags & NFS_LOCK_INITIALIZED)) { + struct nfs4_state_owner *owner = state->owner; + struct nfs_open_to_lock otl = { + .lock_owner = { + .clientid = server->nfs4_state->cl_clientid, + }, + }; +- status = -ENOMEM; +- lsp = nfs4_alloc_lock_state(state, request->fl_owner); +- if (!lsp) +- goto out; ++ + otl.lock_seqid = lsp->ls_seqid; + 
otl.lock_owner.id = lsp->ls_id; + memcpy(&otl.open_stateid, &state->stateid, sizeof(otl.open_stateid)); +@@ -1926,25 +2626,60 @@ nfs4_proc_setlk(struct nfs4_state *state + /* save the returned stateid. */ + if (status == 0) { + memcpy(&lsp->ls_stateid, &res.u.stateid, sizeof(nfs4_stateid)); +- nfs4_notify_setlk(inode, request, lsp); ++ if (!reclaim) ++ nfs4_notify_setlk(state, request, lsp); + } else if (status == -NFS4ERR_DENIED) + status = -EAGAIN; + nfs4_put_lock_state(lsp); +-out: ++ return status; ++} ++ ++int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request) ++{ ++#ifdef F_SETLK64 ++ return _nfs4_do_setlk(state, F_SETLK64, request, 1); ++#else ++ return _nfs4_do_setlk(state, F_SETLK, request, 1); ++#endif ++} ++ ++static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) ++{ ++ struct nfs4_client *clp = state->owner->so_client; ++ int status; ++ ++ down_read(&clp->cl_sem); ++ down(&state->lock_sema); ++ status = _nfs4_do_setlk(state, cmd, request, 0); + up(&state->lock_sema); +- return nfs4_map_errors(status); ++ up_read(&clp->cl_sem); ++ return status; ++} ++ ++static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) ++{ ++ struct nfs4_exception exception = { }; ++ int err; ++ ++ do { ++ err = nfs4_handle_exception(NFS_SERVER(state->inode), ++ _nfs4_proc_setlk(state, cmd, request), ++ &exception); ++ } while (exception.retry); ++ return err; + } + + static int + nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) + { ++ struct nfs_open_context *ctx; + struct nfs4_state *state; + unsigned long timeout = NFS4_LOCK_MINTIMEOUT; + int status; + + /* verify open state */ +- state = (struct nfs4_state *)filp->private_data; +- BUG_ON(!state); ++ ctx = (struct nfs_open_context *)filp->private_data; ++ state = ctx->state; + + if (request->fl_start < 0 || request->fl_end < 0) + return -EINVAL; +@@ -1975,6 +2710,7 @@ struct nfs_rpc_ops nfs_v4_clientops = { + .version = 
4, /* protocol version */ + .dentry_ops = &nfs4_dentry_operations, + .dir_inode_ops = &nfs4_dir_inode_operations, ++ .file_inode_ops = &nfs4_file_inode_operations, + .getroot = nfs4_proc_get_root, + .getattr = nfs4_proc_getattr, + .setattr = nfs4_proc_setattr, +@@ -2004,8 +2740,6 @@ struct nfs_rpc_ops nfs_v4_clientops = { + .commit_setup = nfs4_proc_commit_setup, + .file_open = nfs4_proc_file_open, + .file_release = nfs4_proc_file_release, +- .request_init = nfs4_request_init, +- .request_compatible = nfs4_request_compatible, + .lock = nfs4_proc_lock, + }; + +--- linux-2.6.7/fs/nfs/callback.h.lsec 2005-03-23 14:28:22.484631512 -0700 ++++ linux-2.6.7/fs/nfs/callback.h 2005-03-23 14:28:22.484631512 -0700 +@@ -0,0 +1,70 @@ ++/* ++ * linux/fs/nfs/callback.h ++ * ++ * Copyright (C) 2004 Trond Myklebust ++ * ++ * NFSv4 callback definitions ++ */ ++#ifndef __LINUX_FS_NFS_CALLBACK_H ++#define __LINUX_FS_NFS_CALLBACK_H ++ ++#define NFS4_CALLBACK 0x40000000 ++#define NFS4_CALLBACK_XDRSIZE 2048 ++#define NFS4_CALLBACK_BUFSIZE (1024 + NFS4_CALLBACK_XDRSIZE) ++ ++enum nfs4_callback_procnum { ++ CB_NULL = 0, ++ CB_COMPOUND = 1, ++}; ++ ++enum nfs4_callback_opnum { ++ OP_CB_GETATTR = 3, ++ OP_CB_RECALL = 4, ++ OP_CB_ILLEGAL = 10044, ++}; ++ ++struct cb_compound_hdr_arg { ++ int taglen; ++ const char *tag; ++ unsigned int callback_ident; ++ unsigned nops; ++}; ++ ++struct cb_compound_hdr_res { ++ uint32_t *status; ++ int taglen; ++ const char *tag; ++ uint32_t *nops; ++}; ++ ++struct cb_getattrargs { ++ struct sockaddr_in *addr; ++ struct nfs_fh fh; ++ uint32_t bitmap[2]; ++}; ++ ++struct cb_getattrres { ++ uint32_t status; ++ uint32_t bitmap[2]; ++ uint64_t size; ++ uint64_t change_attr; ++ struct timespec ctime; ++ struct timespec mtime; ++}; ++ ++struct cb_recallargs { ++ struct sockaddr_in *addr; ++ struct nfs_fh fh; ++ nfs4_stateid stateid; ++ uint32_t truncate; ++}; ++ ++extern unsigned nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); ++extern 
unsigned nfs4_callback_recall(struct cb_recallargs *args, void *dummy); ++ ++extern int nfs_callback_up(void); ++extern int nfs_callback_down(void); ++ ++extern unsigned short nfs_callback_tcpport; ++ ++#endif /* __LINUX_FS_NFS_CALLBACK_H */ +--- linux-2.6.7/fs/nfs/direct.c.lsec 2004-06-15 23:19:53.000000000 -0600 ++++ linux-2.6.7/fs/nfs/direct.c 2005-03-23 14:28:22.702598376 -0700 +@@ -110,7 +110,7 @@ nfs_free_user_pages(struct page **pages, + * nfs_direct_read_seg - Read in one iov segment. Generate separate + * read RPCs for each "rsize" bytes. + * @inode: target inode +- * @file: target file (may be NULL) ++ * @ctx: target file open context + * user_addr: starting address of this segment of user's buffer + * count: size of this segment + * file_offset: offset in file to begin the operation +@@ -118,7 +118,7 @@ nfs_free_user_pages(struct page **pages, + * nr_pages: size of pages array + */ + static int +-nfs_direct_read_seg(struct inode *inode, struct file *file, ++nfs_direct_read_seg(struct inode *inode, struct nfs_open_context *ctx, + unsigned long user_addr, size_t count, loff_t file_offset, + struct page **pages, int nr_pages) + { +@@ -127,9 +127,10 @@ nfs_direct_read_seg(struct inode *inode, + int curpage = 0; + struct nfs_read_data rdata = { + .inode = inode, ++ .cred = ctx->cred, + .args = { + .fh = NFS_FH(inode), +- .lockowner = current->files, ++ .context = ctx, + }, + .res = { + .fattr = &rdata.fattr, +@@ -151,7 +152,7 @@ nfs_direct_read_seg(struct inode *inode, + user_addr + tot_bytes, rdata.args.pgbase, curpage); + + lock_kernel(); +- result = NFS_PROTO(inode)->read(&rdata, file); ++ result = NFS_PROTO(inode)->read(&rdata); + unlock_kernel(); + + if (result <= 0) { +@@ -183,7 +184,7 @@ nfs_direct_read_seg(struct inode *inode, + * nfs_direct_read - For each iov segment, map the user's buffer + * then generate read RPCs. 
+ * @inode: target inode +- * @file: target file (may be NULL) ++ * @ctx: target file open context + * @iov: array of vectors that define I/O buffer + * file_offset: offset in file to begin the operation + * nr_segs: size of iovec array +@@ -193,7 +194,7 @@ nfs_direct_read_seg(struct inode *inode, + * server. + */ + static ssize_t +-nfs_direct_read(struct inode *inode, struct file *file, ++nfs_direct_read(struct inode *inode, struct nfs_open_context *ctx, + const struct iovec *iov, loff_t file_offset, + unsigned long nr_segs) + { +@@ -216,7 +217,7 @@ nfs_direct_read(struct inode *inode, str + return page_count; + } + +- result = nfs_direct_read_seg(inode, file, user_addr, size, ++ result = nfs_direct_read_seg(inode, ctx, user_addr, size, + file_offset, pages, page_count); + + nfs_free_user_pages(pages, page_count, 1); +@@ -239,7 +240,7 @@ nfs_direct_read(struct inode *inode, str + * nfs_direct_write_seg - Write out one iov segment. Generate separate + * write RPCs for each "wsize" bytes, then commit. 
+ * @inode: target inode +- * @file: target file (may be NULL) ++ * @ctx: target file open context + * user_addr: starting address of this segment of user's buffer + * count: size of this segment + * file_offset: offset in file to begin the operation +@@ -247,7 +248,7 @@ nfs_direct_read(struct inode *inode, str + * nr_pages: size of pages array + */ + static int +-nfs_direct_write_seg(struct inode *inode, struct file *file, ++nfs_direct_write_seg(struct inode *inode, struct nfs_open_context *ctx, + unsigned long user_addr, size_t count, loff_t file_offset, + struct page **pages, int nr_pages) + { +@@ -257,9 +258,10 @@ nfs_direct_write_seg(struct inode *inode + struct nfs_writeverf first_verf; + struct nfs_write_data wdata = { + .inode = inode, ++ .cred = ctx->cred, + .args = { + .fh = NFS_FH(inode), +- .lockowner = current->files, ++ .context = ctx, + }, + .res = { + .fattr = &wdata.fattr, +@@ -290,7 +292,7 @@ retry: + user_addr + tot_bytes, wdata.args.pgbase, curpage); + + lock_kernel(); +- result = NFS_PROTO(inode)->write(&wdata, file); ++ result = NFS_PROTO(inode)->write(&wdata); + unlock_kernel(); + + if (result <= 0) { +@@ -325,7 +327,7 @@ retry: + wdata.args.offset = file_offset; + + lock_kernel(); +- result = NFS_PROTO(inode)->commit(&wdata, file); ++ result = NFS_PROTO(inode)->commit(&wdata); + unlock_kernel(); + + if (result < 0 || memcmp(&first_verf.verifier, +@@ -349,7 +351,7 @@ sync_retry: + * nfs_direct_write - For each iov segment, map the user's buffer + * then generate write and commit RPCs. + * @inode: target inode +- * @file: target file (may be NULL) ++ * @ctx: target file open context + * @iov: array of vectors that define I/O buffer + * file_offset: offset in file to begin the operation + * nr_segs: size of iovec array +@@ -358,8 +360,7 @@ sync_retry: + * that non-direct readers might access, so they will pick up these + * writes immediately. 
+ */ +-static ssize_t +-nfs_direct_write(struct inode *inode, struct file *file, ++static int nfs_direct_write(struct inode *inode, struct nfs_open_context *ctx, + const struct iovec *iov, loff_t file_offset, + unsigned long nr_segs) + { +@@ -382,7 +383,7 @@ nfs_direct_write(struct inode *inode, st + return page_count; + } + +- result = nfs_direct_write_seg(inode, file, user_addr, size, ++ result = nfs_direct_write_seg(inode, ctx, user_addr, size, + file_offset, pages, page_count); + nfs_free_user_pages(pages, page_count, 0); + +@@ -414,6 +415,7 @@ nfs_direct_IO(int rw, struct kiocb *iocb + { + ssize_t result = -EINVAL; + struct file *file = iocb->ki_filp; ++ struct nfs_open_context *ctx; + struct dentry *dentry = file->f_dentry; + struct inode *inode = dentry->d_inode; + +@@ -423,19 +425,20 @@ nfs_direct_IO(int rw, struct kiocb *iocb + if (!is_sync_kiocb(iocb)) + return result; + ++ ctx = (struct nfs_open_context *)file->private_data; + switch (rw) { + case READ: + dprintk("NFS: direct_IO(read) (%s) off/no(%Lu/%lu)\n", + dentry->d_name.name, file_offset, nr_segs); + +- result = nfs_direct_read(inode, file, iov, ++ result = nfs_direct_read(inode, ctx, iov, + file_offset, nr_segs); + break; + case WRITE: + dprintk("NFS: direct_IO(write) (%s) off/no(%Lu/%lu)\n", + dentry->d_name.name, file_offset, nr_segs); + +- result = nfs_direct_write(inode, file, iov, ++ result = nfs_direct_write(inode, ctx, iov, + file_offset, nr_segs); + break; + default: +@@ -471,6 +474,8 @@ nfs_file_direct_read(struct kiocb *iocb, + ssize_t retval = -EINVAL; + loff_t *ppos = &iocb->ki_pos; + struct file *file = iocb->ki_filp; ++ struct nfs_open_context *ctx = ++ (struct nfs_open_context *) file->private_data; + struct dentry *dentry = file->f_dentry; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; +@@ -502,7 +507,7 @@ nfs_file_direct_read(struct kiocb *iocb, + goto out; + } + +- retval = nfs_direct_read(inode, file, &iov, pos, 1); ++ retval = 
nfs_direct_read(inode, ctx, &iov, pos, 1); + if (retval > 0) + *ppos = pos + retval; + +@@ -542,6 +547,8 @@ nfs_file_direct_write(struct kiocb *iocb + loff_t *ppos = &iocb->ki_pos; + unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur; + struct file *file = iocb->ki_filp; ++ struct nfs_open_context *ctx = ++ (struct nfs_open_context *) file->private_data; + struct dentry *dentry = file->f_dentry; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; +@@ -589,7 +596,7 @@ nfs_file_direct_write(struct kiocb *iocb + goto out; + } + +- retval = nfs_direct_write(inode, file, &iov, pos, 1); ++ retval = nfs_direct_write(inode, ctx, &iov, pos, 1); + if (mapping->nrpages) + invalidate_inode_pages2(mapping); + if (retval > 0) +--- linux-2.6.7/fs/nfs/nfs4state.c.lsec 2004-06-15 23:18:47.000000000 -0600 ++++ linux-2.6.7/fs/nfs/nfs4state.c 2005-03-23 14:28:22.939562352 -0700 +@@ -40,11 +40,15 @@ + + #include + #include ++#include + #include + #include + #include + #include + ++#include "callback.h" ++#include "delegation.h" ++ + #define OPENOWNER_POOL_SIZE 8 + + static spinlock_t state_spinlock = SPIN_LOCK_UNLOCKED; +@@ -93,21 +97,26 @@ nfs4_alloc_client(struct in_addr *addr) + { + struct nfs4_client *clp; + +- if ((clp = kmalloc(sizeof(*clp), GFP_KERNEL))) { +- memset(clp, 0, sizeof(*clp)); +- memcpy(&clp->cl_addr, addr, sizeof(clp->cl_addr)); +- init_rwsem(&clp->cl_sem); +- INIT_LIST_HEAD(&clp->cl_state_owners); +- INIT_LIST_HEAD(&clp->cl_unused); +- spin_lock_init(&clp->cl_lock); +- atomic_set(&clp->cl_count, 1); +- INIT_WORK(&clp->cl_recoverd, nfs4_recover_state, clp); +- INIT_WORK(&clp->cl_renewd, nfs4_renew_state, clp); +- INIT_LIST_HEAD(&clp->cl_superblocks); +- init_waitqueue_head(&clp->cl_waitq); +- rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS4 client"); +- clp->cl_state = 1 << NFS4CLNT_NEW; ++ if (nfs_callback_up() < 0) ++ return NULL; ++ if ((clp = kmalloc(sizeof(*clp), GFP_KERNEL)) == NULL) { ++ nfs_callback_down(); ++ return 
NULL; + } ++ memset(clp, 0, sizeof(*clp)); ++ memcpy(&clp->cl_addr, addr, sizeof(clp->cl_addr)); ++ init_rwsem(&clp->cl_sem); ++ INIT_LIST_HEAD(&clp->cl_delegations); ++ INIT_LIST_HEAD(&clp->cl_state_owners); ++ INIT_LIST_HEAD(&clp->cl_unused); ++ spin_lock_init(&clp->cl_lock); ++ atomic_set(&clp->cl_count, 1); ++ INIT_WORK(&clp->cl_recoverd, nfs4_recover_state, clp); ++ INIT_WORK(&clp->cl_renewd, nfs4_renew_state, clp); ++ INIT_LIST_HEAD(&clp->cl_superblocks); ++ init_waitqueue_head(&clp->cl_waitq); ++ rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS4 client"); ++ clp->cl_state = 1 << NFS4CLNT_OK; + return clp; + } + +@@ -130,25 +139,52 @@ nfs4_free_client(struct nfs4_client *clp + if (clp->cl_rpcclient) + rpc_shutdown_client(clp->cl_rpcclient); + kfree(clp); ++ nfs_callback_down(); ++} ++ ++static struct nfs4_client *__nfs4_find_client(struct in_addr *addr) ++{ ++ struct nfs4_client *clp; ++ list_for_each_entry(clp, &nfs4_clientid_list, cl_servers) { ++ if (memcmp(&clp->cl_addr, addr, sizeof(clp->cl_addr)) == 0) { ++ atomic_inc(&clp->cl_count); ++ return clp; ++ } ++ } ++ return NULL; ++} ++ ++struct nfs4_client *nfs4_find_client(struct in_addr *addr) ++{ ++ struct nfs4_client *clp; ++ spin_lock(&state_spinlock); ++ clp = __nfs4_find_client(addr); ++ spin_unlock(&state_spinlock); ++ return clp; + } + + struct nfs4_client * + nfs4_get_client(struct in_addr *addr) + { +- struct nfs4_client *new, *clp = NULL; ++ struct nfs4_client *clp, *new = NULL; + +- new = nfs4_alloc_client(addr); + spin_lock(&state_spinlock); +- list_for_each_entry(clp, &nfs4_clientid_list, cl_servers) { +- if (memcmp(&clp->cl_addr, addr, sizeof(clp->cl_addr)) == 0) +- goto found; ++ for (;;) { ++ clp = __nfs4_find_client(addr); ++ if (clp != NULL) ++ break; ++ clp = new; ++ if (clp != NULL) { ++ list_add(&clp->cl_servers, &nfs4_clientid_list); ++ new = NULL; ++ break; ++ } ++ spin_unlock(&state_spinlock); ++ new = nfs4_alloc_client(addr); ++ spin_lock(&state_spinlock); ++ if (new == NULL) ++ break; 
+ } +- if (new) +- list_add(&new->cl_servers, &nfs4_clientid_list); +- spin_unlock(&state_spinlock); +- return new; +-found: +- atomic_inc(&clp->cl_count); + spin_unlock(&state_spinlock); + if (new) + nfs4_free_client(new); +@@ -169,6 +205,16 @@ nfs4_put_client(struct nfs4_client *clp) + nfs4_free_client(clp); + } + ++int nfs4_init_client(struct nfs4_client *clp) ++{ ++ int status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, nfs_callback_tcpport); ++ if (status == 0) ++ status = nfs4_proc_setclientid_confirm(clp); ++ if (status == 0) ++ nfs4_schedule_state_renewal(clp); ++ return status; ++} ++ + u32 + nfs4_alloc_lockowner_id(struct nfs4_client *clp) + { +@@ -185,7 +231,6 @@ nfs4_client_grab_unused(struct nfs4_clie + atomic_inc(&sp->so_count); + sp->so_cred = cred; + list_move(&sp->so_list, &clp->cl_state_owners); +- sp->so_generation = clp->cl_generation; + clp->cl_nunused--; + } + return sp; +@@ -224,6 +269,7 @@ nfs4_alloc_state_owner(void) + init_MUTEX(&sp->so_sema); + sp->so_seqid = 0; /* arbitrary */ + INIT_LIST_HEAD(&sp->so_states); ++ INIT_LIST_HEAD(&sp->so_delegations); + atomic_set(&sp->so_count, 1); + return sp; + } +@@ -237,8 +283,11 @@ nfs4_unhash_state_owner(struct nfs4_stat + spin_unlock(&clp->cl_lock); + } + +-struct nfs4_state_owner * +-nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred) ++/* ++ * Note: must be called with clp->cl_sem held in order to prevent races ++ * with reboot recovery! 
++ */ ++struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred) + { + struct nfs4_client *clp = server->nfs4_state; + struct nfs4_state_owner *sp, *new; +@@ -254,23 +303,23 @@ nfs4_get_state_owner(struct nfs_server * + new->so_client = clp; + new->so_id = nfs4_alloc_lockowner_id(clp); + new->so_cred = cred; +- new->so_generation = clp->cl_generation; + sp = new; + new = NULL; + } + spin_unlock(&clp->cl_lock); + if (new) + kfree(new); +- if (sp) { +- if (!test_bit(NFS4CLNT_OK, &clp->cl_state)) +- nfs4_wait_clnt_recover(server->client, clp); +- } else +- put_rpccred(cred); +- return sp; ++ if (sp != NULL) ++ return sp; ++ put_rpccred(cred); ++ return NULL; + } + +-void +-nfs4_put_state_owner(struct nfs4_state_owner *sp) ++/* ++ * Must be called with clp->cl_sem held in order to avoid races ++ * with state recovery... ++ */ ++void nfs4_put_state_owner(struct nfs4_state_owner *sp) + { + struct nfs4_client *clp = sp->so_client; + struct rpc_cred *cred = sp->so_cred; +@@ -330,8 +379,6 @@ __nfs4_find_state(struct inode *inode, s + continue; + if ((state->state & mode) != mode) + continue; +- /* Add the state to the head of the inode's list */ +- list_move(&state->inode_states, &nfsi->open_states); + atomic_inc(&state->count); + if (mode & FMODE_READ) + state->nreaders++; +@@ -353,8 +400,6 @@ __nfs4_find_state_byowner(struct inode * + if (state->nreaders == 0 && state->nwriters == 0) + continue; + if (state->owner == owner) { +- /* Add the state to the head of the inode's list */ +- list_move(&state->inode_states, &nfsi->open_states); + atomic_inc(&state->count); + return state; + } +@@ -411,51 +456,40 @@ out: + return state; + } + +-static void +-__nfs4_put_open_state(struct nfs4_state *state) ++/* ++ * Beware! Caller must be holding exactly one ++ * reference to clp->cl_sem and owner->so_sema! 
++ */ ++void nfs4_put_open_state(struct nfs4_state *state) + { + struct inode *inode = state->inode; + struct nfs4_state_owner *owner = state->owner; +- int status = 0; + +- if (!atomic_dec_and_lock(&state->count, &inode->i_lock)) { +- up(&owner->so_sema); ++ if (!atomic_dec_and_lock(&state->count, &inode->i_lock)) + return; +- } + if (!list_empty(&state->inode_states)) + list_del(&state->inode_states); + spin_unlock(&inode->i_lock); + list_del(&state->open_states); +- if (state->state != 0) { +- do { +- status = nfs4_do_close(inode, state); +- if (!status) +- break; +- up(&owner->so_sema); +- status = nfs4_handle_error(NFS_SERVER(inode), status); +- down(&owner->so_sema); +- } while (!status); +- } +- up(&owner->so_sema); ++ BUG_ON (state->state != 0); + nfs4_free_open_state(state); + nfs4_put_state_owner(owner); + } + +-void +-nfs4_put_open_state(struct nfs4_state *state) +-{ +- down(&state->owner->so_sema); +- __nfs4_put_open_state(state); +-} +- +-void +-nfs4_close_state(struct nfs4_state *state, mode_t mode) ++/* ++ * Beware! Caller must be holding no references to clp->cl_sem! ++ * of owner->so_sema! 
++ */ ++void nfs4_close_state(struct nfs4_state *state, mode_t mode) + { + struct inode *inode = state->inode; + struct nfs4_state_owner *owner = state->owner; ++ struct nfs4_client *clp = owner->so_client; + int newstate; + int status = 0; + ++ atomic_inc(&owner->so_count); ++ down_read(&clp->cl_sem); + down(&owner->so_sema); + /* Protect against nfs4_find_state() */ + spin_lock(&inode->i_lock); +@@ -466,29 +500,24 @@ nfs4_close_state(struct nfs4_state *stat + if (state->nwriters == 0 && state->nreaders == 0) + list_del_init(&state->inode_states); + spin_unlock(&inode->i_lock); +- do { +- newstate = 0; +- if (state->state == 0) +- break; ++ newstate = 0; ++ if (state->state != 0) { + if (state->nreaders) + newstate |= FMODE_READ; + if (state->nwriters) + newstate |= FMODE_WRITE; + if (state->state == newstate) +- break; ++ goto out; + if (newstate != 0) + status = nfs4_do_downgrade(inode, state, newstate); + else + status = nfs4_do_close(inode, state); +- if (!status) { +- state->state = newstate; +- break; +- } +- up(&owner->so_sema); +- status = nfs4_handle_error(NFS_SERVER(inode), status); +- down(&owner->so_sema); +- } while (!status); +- __nfs4_put_open_state(state); ++ } ++out: ++ nfs4_put_open_state(state); ++ up(&owner->so_sema); ++ nfs4_put_state_owner(owner); ++ up_read(&clp->cl_sem); + } + + /* +@@ -496,11 +525,11 @@ nfs4_close_state(struct nfs4_state *stat + * that is compatible with current->files + */ + static struct nfs4_lock_state * +-__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) ++__nfs4_find_lock_state(struct nfs4_state *state, unsigned int pid) + { + struct nfs4_lock_state *pos; + list_for_each_entry(pos, &state->lock_states, ls_locks) { +- if (pos->ls_owner != fl_owner) ++ if (pos->ls_pid != pid) + continue; + atomic_inc(&pos->ls_count); + return pos; +@@ -509,23 +538,16 @@ __nfs4_find_lock_state(struct nfs4_state + } + + struct nfs4_lock_state * +-nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) 
++nfs4_find_lock_state(struct nfs4_state *state, unsigned int pid) + { + struct nfs4_lock_state *lsp; + read_lock(&state->state_lock); +- lsp = __nfs4_find_lock_state(state, fl_owner); ++ lsp = __nfs4_find_lock_state(state, pid); + read_unlock(&state->state_lock); + return lsp; + } + +-/* +- * Return a compatible lock_state. If no initialized lock_state structure +- * exists, return an uninitialized one. +- * +- * The caller must be holding state->lock_sema +- */ +-struct nfs4_lock_state * +-nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) ++static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, unsigned int pid) + { + struct nfs4_lock_state *lsp; + struct nfs4_client *clp = state->owner->so_client; +@@ -533,12 +555,12 @@ nfs4_alloc_lock_state(struct nfs4_state + lsp = kmalloc(sizeof(*lsp), GFP_KERNEL); + if (lsp == NULL) + return NULL; ++ lsp->flags = 0; + lsp->ls_seqid = 0; /* arbitrary */ + lsp->ls_id = -1; + memset(lsp->ls_stateid.data, 0, sizeof(lsp->ls_stateid.data)); + atomic_set(&lsp->ls_count, 1); +- lsp->ls_owner = fl_owner; +- lsp->ls_parent = state; ++ lsp->ls_pid = pid; + INIT_LIST_HEAD(&lsp->ls_locks); + spin_lock(&clp->cl_lock); + lsp->ls_id = nfs4_alloc_lockowner_id(clp); +@@ -547,16 +569,32 @@ nfs4_alloc_lock_state(struct nfs4_state + } + + /* ++ * Return a compatible lock_state. If no initialized lock_state structure ++ * exists, return an uninitialized one. ++ * ++ * The caller must be holding state->lock_sema and clp->cl_sem ++ */ ++struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, unsigned int pid) ++{ ++ struct nfs4_lock_state * lsp; ++ ++ lsp = nfs4_find_lock_state(state, pid); ++ if (lsp == NULL) ++ lsp = nfs4_alloc_lock_state(state, pid); ++ return lsp; ++} ++ ++/* + * Byte-range lock aware utility to initialize the stateid of read/write + * requests. 
+ */ + void +-nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner) ++nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, unsigned int pid) + { + if (test_bit(LK_STATE_IN_USE, &state->flags)) { + struct nfs4_lock_state *lsp; + +- lsp = nfs4_find_lock_state(state, fl_owner); ++ lsp = nfs4_find_lock_state(state, pid); + if (lsp) { + memcpy(dst, &lsp->ls_stateid, sizeof(*dst)); + nfs4_put_lock_state(lsp); +@@ -567,13 +605,14 @@ nfs4_copy_stateid(nfs4_stateid *dst, str + } + + /* +-* Called with state->lock_sema held. ++* Called with state->lock_sema and clp->cl_sem held. + */ +-void +-nfs4_increment_lock_seqid(int status, struct nfs4_lock_state *lsp) ++void nfs4_increment_lock_seqid(int status, struct nfs4_lock_state *lsp) + { +- if (status == NFS_OK || seqid_mutating_err(-status)) ++ if (status == NFS_OK || seqid_mutating_err(-status)) { + lsp->ls_seqid++; ++ lsp->flags |= NFS_LOCK_INITIALIZED; ++ } + } + + /* +@@ -598,12 +637,11 @@ nfs4_check_unlock(struct file_lock *fl, + * Post an initialized lock_state on the state->lock_states list. 
+ */ + void +-nfs4_notify_setlk(struct inode *inode, struct file_lock *request, struct nfs4_lock_state *lsp) ++nfs4_notify_setlk(struct nfs4_state *state, struct file_lock *request, struct nfs4_lock_state *lsp) + { +- struct nfs4_state *state = lsp->ls_parent; +- + if (!list_empty(&lsp->ls_locks)) + return; ++ atomic_inc(&lsp->ls_count); + write_lock(&state->state_lock); + list_add(&lsp->ls_locks, &state->lock_states); + set_bit(LK_STATE_IN_USE, &state->flags); +@@ -620,15 +658,15 @@ nfs4_notify_setlk(struct inode *inode, s + * + */ + void +-nfs4_notify_unlck(struct inode *inode, struct file_lock *request, struct nfs4_lock_state *lsp) ++nfs4_notify_unlck(struct nfs4_state *state, struct file_lock *request, struct nfs4_lock_state *lsp) + { +- struct nfs4_state *state = lsp->ls_parent; ++ struct inode *inode = state->inode; + struct file_lock *fl; + + for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { + if (!(fl->fl_flags & FL_POSIX)) + continue; +- if (fl->fl_owner != lsp->ls_owner) ++ if (fl->fl_pid != lsp->ls_pid) + continue; + /* Exit if we find at least one lock which is not consumed */ + if (nfs4_check_unlock(fl,request) == 0) +@@ -640,6 +678,7 @@ nfs4_notify_unlck(struct inode *inode, s + if (list_empty(&state->lock_states)) + clear_bit(LK_STATE_IN_USE, &state->flags); + write_unlock(&state->state_lock); ++ nfs4_put_lock_state(lsp); + } + + /* +@@ -651,20 +690,18 @@ nfs4_put_lock_state(struct nfs4_lock_sta + { + if (!atomic_dec_and_test(&lsp->ls_count)) + return; +- if (!list_empty(&lsp->ls_locks)) +- return; ++ BUG_ON (!list_empty(&lsp->ls_locks)); + kfree(lsp); + } + + /* +-* Called with sp->so_sema held. ++* Called with sp->so_sema and clp->cl_sem held. 
+ * + * Increment the seqid if the OPEN/OPEN_DOWNGRADE/CLOSE succeeded, or + * failed with a seqid incrementing error - + * see comments nfs_fs.h:seqid_mutating_error() + */ +-void +-nfs4_increment_seqid(int status, struct nfs4_state_owner *sp) ++void nfs4_increment_seqid(int status, struct nfs4_state_owner *sp) + { + if (status == NFS_OK || seqid_mutating_err(-status)) + sp->so_seqid++; +@@ -693,21 +730,14 @@ nfs4_recover_state(void *data) + + init_completion(&args.complete); + +- down_read(&clp->cl_sem); +- if (test_and_set_bit(NFS4CLNT_SETUP_STATE, &clp->cl_state)) +- goto out_failed; + if (kernel_thread(reclaimer, &args, CLONE_KERNEL) < 0) + goto out_failed_clear; + wait_for_completion(&args.complete); + return; + out_failed_clear: +- smp_mb__before_clear_bit(); +- clear_bit(NFS4CLNT_SETUP_STATE, &clp->cl_state); +- smp_mb__after_clear_bit(); ++ set_bit(NFS4CLNT_OK, &clp->cl_state); + wake_up_all(&clp->cl_waitq); + rpc_wake_up(&clp->cl_rpcwaitq); +-out_failed: +- up_read(&clp->cl_sem); + } + + /* +@@ -718,24 +748,66 @@ nfs4_schedule_state_recovery(struct nfs4 + { + if (!clp) + return; +- smp_mb__before_clear_bit(); +- clear_bit(NFS4CLNT_OK, &clp->cl_state); +- smp_mb__after_clear_bit(); +- schedule_work(&clp->cl_recoverd); ++ if (test_and_clear_bit(NFS4CLNT_OK, &clp->cl_state)) ++ schedule_work(&clp->cl_recoverd); + } + +-static int +-nfs4_reclaim_open_state(struct nfs4_state_owner *sp) ++static int nfs4_reclaim_locks(struct nfs4_state *state) ++{ ++ struct inode *inode = state->inode; ++ struct file_lock *fl; ++ int status = 0; ++ ++ for (fl = inode->i_flock; fl != 0; fl = fl->fl_next) { ++ if (!(fl->fl_flags & FL_POSIX)) ++ continue; ++ if (((struct nfs_open_context *)fl->fl_file->private_data)->state != state) ++ continue; ++ status = nfs4_lock_reclaim(state, fl); ++ if (status >= 0) ++ continue; ++ switch (status) { ++ default: ++ printk(KERN_ERR "%s: unhandled error %d. 
Zeroing state\n", ++ __FUNCTION__, status); ++ case -NFS4ERR_EXPIRED: ++ case -NFS4ERR_NO_GRACE: ++ case -NFS4ERR_RECLAIM_BAD: ++ case -NFS4ERR_RECLAIM_CONFLICT: ++ /* kill_proc(fl->fl_pid, SIGLOST, 1); */ ++ break; ++ case -NFS4ERR_STALE_CLIENTID: ++ goto out_err; ++ } ++ } ++ return 0; ++out_err: ++ return status; ++} ++ ++static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp) + { + struct nfs4_state *state; ++ struct nfs4_lock_state *lock; + int status = 0; + + list_for_each_entry(state, &sp->so_states, open_states) { + if (state->state == 0) + continue; + status = nfs4_open_reclaim(sp, state); +- if (status >= 0) ++ list_for_each_entry(lock, &state->lock_states, ls_locks) ++ lock->flags &= ~NFS_LOCK_INITIALIZED; ++ if (status >= 0) { ++ status = nfs4_reclaim_locks(state); ++ if (status < 0) ++ goto out_err; ++ list_for_each_entry(lock, &state->lock_states, ls_locks) { ++ if (!(lock->flags & NFS_LOCK_INITIALIZED)) ++ printk("%s: Lock reclaim failed!\n", ++ __FUNCTION__); ++ } + continue; ++ } + switch (status) { + default: + printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n", +@@ -762,75 +834,55 @@ out_err: + return status; + } + +-static int +-reclaimer(void *ptr) ++static int reclaimer(void *ptr) + { + struct reclaimer_args *args = (struct reclaimer_args *)ptr; + struct nfs4_client *clp = args->clp; + struct nfs4_state_owner *sp; +- int generation; + int status; + + daemonize("%u.%u.%u.%u-reclaim", NIPQUAD(clp->cl_addr)); + allow_signal(SIGKILL); + ++ atomic_inc(&clp->cl_count); + complete(&args->complete); + ++ /* Ensure exclusive access to NFSv4 state */ ++ lock_kernel(); ++ down_write(&clp->cl_sem); + /* Are there any NFS mounts out there? 
*/ + if (list_empty(&clp->cl_superblocks)) + goto out; +- if (!test_bit(NFS4CLNT_NEW, &clp->cl_state)) { +- status = nfs4_proc_renew(clp); +- if (status == 0) { +- set_bit(NFS4CLNT_OK, &clp->cl_state); +- goto out; +- } +- } +- status = nfs4_proc_setclientid(clp, 0, 0); +- if (status) +- goto out_error; +- status = nfs4_proc_setclientid_confirm(clp); ++restart_loop: ++ status = nfs4_proc_renew(clp); ++ if (status == 0) ++ goto out; ++ status = nfs4_init_client(clp); + if (status) + goto out_error; +- generation = ++(clp->cl_generation); +- clear_bit(NFS4CLNT_NEW, &clp->cl_state); +- set_bit(NFS4CLNT_OK, &clp->cl_state); +- up_read(&clp->cl_sem); +- nfs4_schedule_state_renewal(clp); +-restart_loop: +- spin_lock(&clp->cl_lock); ++ /* Mark all delagations for reclaim */ ++ nfs_delegation_mark_reclaim(clp); ++ /* Note: list is protected by exclusive lock on cl->cl_sem */ + list_for_each_entry(sp, &clp->cl_state_owners, so_list) { +- if (sp->so_generation - generation >= 0) +- continue; +- atomic_inc(&sp->so_count); +- spin_unlock(&clp->cl_lock); +- down(&sp->so_sema); +- if (sp->so_generation - generation < 0) { +- smp_rmb(); +- sp->so_generation = clp->cl_generation; +- status = nfs4_reclaim_open_state(sp); +- } +- up(&sp->so_sema); +- nfs4_put_state_owner(sp); ++ status = nfs4_reclaim_open_state(sp); + if (status < 0) { + if (status == -NFS4ERR_STALE_CLIENTID) +- nfs4_schedule_state_recovery(clp); +- goto out; ++ goto restart_loop; ++ goto out_error; + } +- goto restart_loop; + } +- spin_unlock(&clp->cl_lock); ++ nfs_delegation_reap_unclaimed(clp); + out: +- smp_mb__before_clear_bit(); +- clear_bit(NFS4CLNT_SETUP_STATE, &clp->cl_state); +- smp_mb__after_clear_bit(); ++ set_bit(NFS4CLNT_OK, &clp->cl_state); ++ up_write(&clp->cl_sem); ++ unlock_kernel(); + wake_up_all(&clp->cl_waitq); + rpc_wake_up(&clp->cl_rpcwaitq); ++ nfs4_put_client(clp); + return 0; + out_error: +- printk(KERN_WARNING "Error: state recovery failed on NFSv4 server %u.%u.%u.%u\n", +- 
NIPQUAD(clp->cl_addr.s_addr)); +- up_read(&clp->cl_sem); ++ printk(KERN_WARNING "Error: state recovery failed on NFSv4 server %u.%u.%u.%u with error %d\n", ++ NIPQUAD(clp->cl_addr.s_addr), -status); + goto out; + } + +--- linux-2.6.7/fs/nfs/inode.c.lsec 2004-06-15 23:19:44.000000000 -0600 ++++ linux-2.6.7/fs/nfs/inode.c 2005-03-23 14:28:22.818580744 -0700 +@@ -39,6 +39,8 @@ + #include + #include + ++#include "delegation.h" ++ + #define NFSDBG_FACILITY NFSDBG_VFS + #define NFS_PARANOIA 1 + +@@ -123,8 +125,9 @@ nfs_delete_inode(struct inode * inode) + { + dprintk("NFS: delete_inode(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino); + ++ nfs_wb_all(inode); + /* +- * The following can never actually happen... ++ * The following should never happen... + */ + if (nfs_have_writebacks(inode)) { + printk(KERN_ERR "nfs_delete_inode: inode %ld has pending RPC requests\n", inode->i_ino); +@@ -133,18 +136,15 @@ nfs_delete_inode(struct inode * inode) + clear_inode(inode); + } + +-/* +- * For the moment, the only task for the NFS clear_inode method is to +- * release the mmap credential +- */ + static void + nfs_clear_inode(struct inode *inode) + { + struct nfs_inode *nfsi = NFS_I(inode); +- struct rpc_cred *cred = nfsi->mm_cred; ++ struct rpc_cred *cred; + +- if (cred) +- put_rpccred(cred); ++ nfs4_zap_acl_attr(inode); ++ nfs_wb_all(inode); ++ BUG_ON (!list_empty(&nfsi->open_files)); + cred = nfsi->cache_access.cred; + if (cred) + put_rpccred(cred); +@@ -704,7 +704,7 @@ nfs_fhget(struct super_block *sb, struct + /* Why so? Because we want revalidate for devices/FIFOs, and + * that's precisely what we have in nfs_file_inode_operations. 
+ */ +- inode->i_op = &nfs_file_inode_operations; ++ inode->i_op = NFS_SB(sb)->rpc_ops->file_inode_ops; + if (S_ISREG(inode->i_mode)) { + inode->i_fop = &nfs_file_operations; + inode->i_data.a_ops = &nfs_file_aops; +@@ -859,53 +859,114 @@ int nfs_getattr(struct vfsmount *mnt, st + return err; + } + ++struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, struct rpc_cred *cred) ++{ ++ struct nfs_open_context *ctx; ++ ++ ctx = (struct nfs_open_context *)kmalloc(sizeof(*ctx), GFP_KERNEL); ++ if (ctx != NULL) { ++ atomic_set(&ctx->count, 1); ++ ctx->dentry = dget(dentry); ++ ctx->cred = get_rpccred(cred); ++ ctx->state = NULL; ++ ctx->pid = current->tgid; ++ ctx->error = 0; ++ init_waitqueue_head(&ctx->waitq); ++ } ++ return ctx; ++} ++ ++struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) ++{ ++ if (ctx != NULL) ++ atomic_inc(&ctx->count); ++ return ctx; ++} ++ ++void put_nfs_open_context(struct nfs_open_context *ctx) ++{ ++ if (atomic_dec_and_test(&ctx->count)) { ++ if (ctx->state != NULL) ++ nfs4_close_state(ctx->state, ctx->mode); ++ if (ctx->cred != NULL) ++ put_rpccred(ctx->cred); ++ dput(ctx->dentry); ++ kfree(ctx); ++ } ++} ++ + /* + * Ensure that mmap has a recent RPC credential for use when writing out + * shared pages + */ +-void +-nfs_set_mmcred(struct inode *inode, struct rpc_cred *cred) ++void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) ++{ ++ struct inode *inode = filp->f_dentry->d_inode; ++ struct nfs_inode *nfsi = NFS_I(inode); ++ ++ filp->private_data = get_nfs_open_context(ctx); ++ spin_lock(&inode->i_lock); ++ list_add(&ctx->list, &nfsi->open_files); ++ spin_unlock(&inode->i_lock); ++} ++ ++struct nfs_open_context *nfs_find_open_context(struct inode *inode, int mode) ++{ ++ struct nfs_inode *nfsi = NFS_I(inode); ++ struct nfs_open_context *pos, *ctx = NULL; ++ ++ spin_lock(&inode->i_lock); ++ list_for_each_entry(pos, &nfsi->open_files, list) { ++ if ((pos->mode & mode) == mode) { 
++ ctx = get_nfs_open_context(pos); ++ break; ++ } ++ } ++ spin_unlock(&inode->i_lock); ++ return ctx; ++} ++ ++void nfs_file_clear_open_context(struct file *filp) + { +- struct rpc_cred **p = &NFS_I(inode)->mm_cred, +- *oldcred = *p; ++ struct inode *inode = filp->f_dentry->d_inode; ++ struct nfs_open_context *ctx = (struct nfs_open_context *)filp->private_data; + +- *p = get_rpccred(cred); +- if (oldcred) +- put_rpccred(oldcred); ++ if (ctx) { ++ filp->private_data = NULL; ++ spin_lock(&inode->i_lock); ++ list_del(&ctx->list); ++ spin_unlock(&inode->i_lock); ++ put_nfs_open_context(ctx); ++ } + } + + /* +- * These are probably going to contain hooks for +- * allocating and releasing RPC credentials for +- * the file. I'll have to think about Tronds patch +- * a bit more.. ++ * These allocate and release file read/write context information. + */ + int nfs_open(struct inode *inode, struct file *filp) + { +- struct rpc_auth *auth; ++ struct nfs_open_context *ctx; + struct rpc_cred *cred; + +- auth = NFS_CLIENT(inode)->cl_auth; +- cred = rpcauth_lookupcred(auth, 0); +- filp->private_data = cred; +- if ((filp->f_mode & FMODE_WRITE) != 0) { +- nfs_set_mmcred(inode, cred); ++ if ((cred = rpcauth_lookupcred(NFS_CLIENT(inode)->cl_auth, 0)) == NULL) ++ return -ENOMEM; ++ ctx = alloc_nfs_open_context(filp->f_dentry, cred); ++ put_rpccred(cred); ++ if (ctx == NULL) ++ return -ENOMEM; ++ ctx->mode = filp->f_mode; ++ nfs_file_set_open_context(filp, ctx); ++ put_nfs_open_context(ctx); ++ if ((filp->f_mode & FMODE_WRITE) != 0) + nfs_begin_data_update(inode); +- } + return 0; + } + + int nfs_release(struct inode *inode, struct file *filp) + { +- struct rpc_cred *cred; +- +- lock_kernel(); + if ((filp->f_mode & FMODE_WRITE) != 0) + nfs_end_data_update(inode); +- cred = nfs_file_cred(filp); +- if (cred) +- put_rpccred(cred); +- unlock_kernel(); ++ nfs_file_clear_open_context(filp); + return 0; + } + +@@ -1002,6 +1063,30 @@ out: + return status; + } + ++int 
nfs_attribute_timeout(struct inode *inode) ++{ ++ struct nfs_inode *nfsi = NFS_I(inode); ++ ++ if (nfs_have_delegation(inode, FMODE_READ)) ++ return 0; ++ return time_after(jiffies, nfsi->read_cache_jiffies+nfsi->attrtimeo); ++} ++ ++/** ++ * nfs_revalidate_inode - Revalidate the inode attributes ++ * @server - pointer to nfs_server struct ++ * @inode - pointer to inode struct ++ * ++ * Updates inode attribute information by retrieving the data from the server. ++ */ ++int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) ++{ ++ if (!(NFS_FLAGS(inode) & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA)) ++ && !nfs_attribute_timeout(inode)) ++ return NFS_STALE(inode) ? -ESTALE : 0; ++ return __nfs_revalidate_inode(server, inode); ++} ++ + /** + * nfs_begin_data_update + * @inode - pointer to inode +@@ -1023,11 +1108,13 @@ void nfs_end_data_update(struct inode *i + { + struct nfs_inode *nfsi = NFS_I(inode); + +- /* Mark the attribute cache for revalidation */ +- nfsi->flags |= NFS_INO_INVALID_ATTR; +- /* Directories and symlinks: invalidate page cache too */ +- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) +- nfsi->flags |= NFS_INO_INVALID_DATA; ++ if (!nfs_have_delegation(inode, FMODE_READ)) { ++ /* Mark the attribute cache for revalidation */ ++ nfsi->flags |= NFS_INO_INVALID_ATTR; ++ /* Directories and symlinks: invalidate page cache too */ ++ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) ++ nfsi->flags |= NFS_INO_INVALID_DATA; ++ } + nfsi->cache_change_attribute ++; + atomic_dec(&nfsi->data_updates); + } +@@ -1068,6 +1155,10 @@ int nfs_refresh_inode(struct inode *inod + loff_t cur_size, new_isize; + int data_unstable; + ++ /* Do we hold a delegation? */ ++ if (nfs_have_delegation(inode, FMODE_READ)) ++ return 0; ++ + /* Are we in the process of updating data on the server? 
*/ + data_unstable = nfs_caches_unstable(inode); + +@@ -1240,6 +1331,7 @@ static int nfs_update_inode(struct inode + inode->i_nlink = fattr->nlink; + inode->i_uid = fattr->uid; + inode->i_gid = fattr->gid; ++ nfs4_zap_acl_attr(inode); + + if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) { + /* +@@ -1265,7 +1357,8 @@ static int nfs_update_inode(struct inode + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) + || S_ISLNK(inode->i_mode))) + invalid &= ~NFS_INO_INVALID_DATA; +- nfsi->flags |= invalid; ++ if (!nfs_have_delegation(inode, FMODE_READ)) ++ nfsi->flags |= invalid; + + return 0; + out_changed: +@@ -1400,6 +1493,52 @@ static struct file_system_type nfs_fs_ty + + #ifdef CONFIG_NFS_V4 + ++#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" ++ ++int ++nfs_setxattr(struct dentry *dentry, const char *key, const void *buf, ++ size_t buflen, int flags) ++{ ++ struct inode *inode = dentry->d_inode; ++ ++ if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0) ++ return -EINVAL; ++ ++ if (!S_ISREG(inode->i_mode) && ++ (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX)) ++ return -EPERM; ++ ++ return nfs4_proc_set_acl(inode, buf, buflen); ++} ++ ++/* The getxattr man page suggests returning -ENODATA for unknown attributes, ++ * and that's what we'll do for e.g. user attributes that haven't been set. ++ * But we'll follow ext2/ext3's lead by returning -EOPNOTSUPP for unsupported ++ * attributes in kernel-managed attribute namespaces. 
*/ ++ssize_t ++nfs_getxattr(struct dentry *dentry, const char *key, void *buf, ++ size_t buflen) ++{ ++ struct inode *inode = dentry->d_inode; ++ ++ if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0) ++ return -EOPNOTSUPP; ++ ++ return nfs4_proc_get_acl(inode, buf, buflen); ++} ++ ++ssize_t ++nfs_listxattr(struct dentry *dentry, char *buf, size_t buflen) ++{ ++ ssize_t len = strlen(XATTR_NAME_NFSV4_ACL) + 1; ++ ++ if (buf && buflen < len) ++ return -ERANGE; ++ if (buf) ++ memcpy(buf, XATTR_NAME_NFSV4_ACL, len); ++ return len; ++} ++ + static void nfs4_clear_inode(struct inode *); + + static struct super_operations nfs4_sops = { +@@ -1423,6 +1562,12 @@ static void nfs4_clear_inode(struct inod + { + struct nfs_inode *nfsi = NFS_I(inode); + ++ /* If we are holding a delegation, return it! */ ++ if (nfsi->delegation != NULL) ++ nfs_inode_return_delegation(inode); ++ /* First call standard NFS clear_inode() code */ ++ nfs_clear_inode(inode); ++ /* Now clear out any remaining state */ + while (!list_empty(&nfsi->open_states)) { + struct nfs4_state *state; + +@@ -1437,8 +1582,6 @@ static void nfs4_clear_inode(struct inod + BUG_ON(atomic_read(&state->count) != 1); + nfs4_close_state(state, state->state); + } +- /* Now call standard NFS clear_inode() code */ +- nfs_clear_inode(inode); + } + + +@@ -1536,8 +1679,19 @@ static int nfs4_fill_super(struct super_ + memcpy(clp->cl_ipaddr, server->ip_addr, sizeof(clp->cl_ipaddr)); + nfs_idmap_new(clp); + } +- if (list_empty(&clp->cl_superblocks)) +- clear_bit(NFS4CLNT_OK, &clp->cl_state); ++ /* Fire up rpciod if not yet running */ ++ if (rpciod_up() != 0) { ++ printk(KERN_WARNING "NFS: couldn't start rpciod!\n"); ++ goto out_fail; ++ } ++ ++ if (list_empty(&clp->cl_superblocks)) { ++ err = nfs4_init_client(clp); ++ if (err != 0) { ++ up_write(&clp->cl_sem); ++ goto out_rpciod; ++ } ++ } + list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks); + clnt = rpc_clone_client(clp->cl_rpcclient); + if (!IS_ERR(clnt)) +@@ -1567,17 +1721,10 @@ 
static int nfs4_fill_super(struct super_ + } + } + +- /* Fire up rpciod if not yet running */ +- if (rpciod_up() != 0) { +- printk(KERN_WARNING "NFS: couldn't start rpciod!\n"); +- goto out_shutdown; +- } +- + sb->s_op = &nfs4_sops; + err = nfs_sb_init(sb, authflavour); + if (err == 0) + return 0; +- rpciod_down(); + out_shutdown: + rpc_shutdown_client(server->client); + out_remove_list: +@@ -1585,6 +1732,8 @@ out_remove_list: + list_del_init(&server->nfs4_siblings); + up_write(&server->nfs4_state->cl_sem); + destroy_nfsv4_state(server); ++out_rpciod: ++ rpciod_down(); + out_fail: + if (clp) + nfs4_put_client(clp); +@@ -1709,22 +1858,31 @@ out_free: + return s; + } + ++static void nfs4_kill_super(struct super_block *sb) ++{ ++ nfs_return_all_delegations(sb); ++ nfs_kill_super(sb); ++} ++ + static struct file_system_type nfs4_fs_type = { + .owner = THIS_MODULE, + .name = "nfs4", + .get_sb = nfs4_get_sb, +- .kill_sb = nfs_kill_super, ++ .kill_sb = nfs4_kill_super, + .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, + }; + +-#define nfs4_zero_state(nfsi) \ ++#define nfs4_init_once(nfsi) \ + do { \ + INIT_LIST_HEAD(&(nfsi)->open_states); \ ++ nfsi->delegation = NULL; \ ++ nfsi->delegation_state = 0; \ ++ init_rwsem(&nfsi->rwsem); \ + } while(0) + #define register_nfs4fs() register_filesystem(&nfs4_fs_type) + #define unregister_nfs4fs() unregister_filesystem(&nfs4_fs_type) + #else +-#define nfs4_zero_state(nfsi) \ ++#define nfs4_init_once(nfsi) \ + do { } while (0) + #define register_nfs4fs() (0) + #define unregister_nfs4fs() +@@ -1746,8 +1904,8 @@ static struct inode *nfs_alloc_inode(str + if (!nfsi) + return NULL; + nfsi->flags = 0; +- nfsi->mm_cred = NULL; +- nfs4_zero_state(nfsi); ++ nfsi->acl_len = 0; ++ nfsi->acl = NULL; + return &nfsi->vfs_inode; + } + +@@ -1765,12 +1923,14 @@ static void init_once(void * foo, kmem_c + inode_init_once(&nfsi->vfs_inode); + INIT_LIST_HEAD(&nfsi->dirty); + INIT_LIST_HEAD(&nfsi->commit); ++ 
INIT_LIST_HEAD(&nfsi->open_files); + INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); + atomic_set(&nfsi->data_updates, 0); + nfsi->ndirty = 0; + nfsi->ncommit = 0; + nfsi->npages = 0; + init_waitqueue_head(&nfsi->nfs_i_wait); ++ nfs4_init_once(nfsi); + } + } + +--- linux-2.6.7/fs/nfs/dir.c.lsec 2004-06-15 23:19:23.000000000 -0600 ++++ linux-2.6.7/fs/nfs/dir.c 2005-03-23 14:28:22.701598528 -0700 +@@ -32,6 +32,8 @@ + #include + #include + ++#include "delegation.h" ++ + #define NFS_PARANOIA 1 + /* #define NFS_DEBUG_VERBOSE 1 */ + +@@ -88,6 +90,9 @@ struct inode_operations nfs4_dir_inode_o + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, ++ .getxattr = nfs_getxattr, ++ .setxattr = nfs_setxattr, ++ .listxattr = nfs_listxattr, + }; + + #endif /* CONFIG_NFS_V4 */ +@@ -850,22 +855,22 @@ static int nfs_open_revalidate(struct de + unsigned long verifier; + int openflags, ret = 0; + +- /* NFS only supports OPEN for regular files */ +- if (inode && !S_ISREG(inode->i_mode)) +- goto no_open; + parent = dget_parent(dentry); + dir = parent->d_inode; + if (!is_atomic_open(dir, nd)) + goto no_open; ++ /* We can't create new files in nfs_open_revalidate(), so we ++ * optimize away revalidation of negative dentries. 
++ */ ++ if (inode == NULL) ++ goto out; ++ /* NFS only supports OPEN on regular files */ ++ if (!S_ISREG(inode->i_mode)) ++ goto no_open; + openflags = nd->intent.open.flags; +- if (openflags & O_CREAT) { +- /* If this is a negative dentry, just drop it */ +- if (!inode) +- goto out; +- /* If this is exclusive open, just revalidate */ +- if (openflags & O_EXCL) +- goto no_open; +- } ++ /* We cannot do exclusive creation on a positive dentry */ ++ if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) ++ goto no_open; + /* We can't create new files, or truncate existing ones here */ + openflags &= ~(O_CREAT|O_TRUNC); + +@@ -887,6 +892,8 @@ out: + return ret; + no_open: + dput(parent); ++ if (inode != NULL && nfs_have_delegation(inode, FMODE_READ)) ++ return 1; + return nfs_lookup_revalidate(dentry, nd); + } + #endif /* CONFIG_NFSV4 */ +@@ -1299,19 +1306,6 @@ nfs_symlink(struct inode *dir, struct de + dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s)\n", dir->i_sb->s_id, + dir->i_ino, dentry->d_name.name, symname); + +- error = -ENAMETOOLONG; +- switch (NFS_PROTO(dir)->version) { +- case 2: +- if (strlen(symname) > NFS2_MAXPATHLEN) +- goto out; +- break; +- case 3: +- if (strlen(symname) > NFS3_MAXPATHLEN) +- goto out; +- default: +- break; +- } +- + #ifdef NFS_PARANOIA + if (dentry->d_inode) + printk("nfs_proc_symlink: %s/%s not negative!\n", +@@ -1341,8 +1335,6 @@ dentry->d_parent->d_name.name, dentry->d + d_drop(dentry); + } + unlock_kernel(); +- +-out: + return error; + } + +@@ -1498,10 +1490,56 @@ out: + return error; + } + +-int +-nfs_permission(struct inode *inode, int mask, struct nameidata *nd) ++int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res) ++{ ++ struct nfs_access_entry *cache = &NFS_I(inode)->cache_access; ++ ++ if (cache->cred != cred ++ || time_after(jiffies, cache->jiffies + NFS_ATTRTIMEO(inode)) ++ || (NFS_FLAGS(inode) & NFS_INO_INVALID_ATTR)) ++ return -ENOENT; ++ memcpy(res, cache, sizeof(*res)); ++ 
return 0; ++} ++ ++void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) ++{ ++ struct nfs_access_entry *cache = &NFS_I(inode)->cache_access; ++ ++ if (cache->cred != set->cred) { ++ if (cache->cred) ++ put_rpccred(cache->cred); ++ cache->cred = get_rpccred(set->cred); ++ } ++ cache->jiffies = set->jiffies; ++ cache->mask = set->mask; ++} ++ ++static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) ++{ ++ struct nfs_access_entry cache; ++ int status; ++ ++ status = nfs_access_get_cached(inode, cred, &cache); ++ if (status == 0) ++ goto out; ++ ++ /* Be clever: ask server to check for all possible rights */ ++ cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ; ++ cache.cred = cred; ++ cache.jiffies = jiffies; ++ status = NFS_PROTO(inode)->access(inode, &cache); ++ if (status != 0) ++ return status; ++ nfs_access_add_cache(inode, &cache); ++out: ++ if ((cache.mask & mask) == mask) ++ return 0; ++ return -EACCES; ++} ++ ++int nfs_permission(struct inode *inode, int mask, struct nameidata *nd) + { +- struct nfs_access_cache *cache = &NFS_I(inode)->cache_access; + struct rpc_cred *cred; + int mode = inode->i_mode; + int res; +@@ -1542,24 +1580,7 @@ nfs_permission(struct inode *inode, int + goto out_notsup; + + cred = rpcauth_lookupcred(NFS_CLIENT(inode)->cl_auth, 0); +- if (cache->cred == cred +- && time_before(jiffies, cache->jiffies + NFS_ATTRTIMEO(inode)) +- && !(NFS_FLAGS(inode) & NFS_INO_INVALID_ATTR)) { +- if (!(res = cache->err)) { +- /* Is the mask a subset of an accepted mask? */ +- if ((cache->mask & mask) == mask) +- goto out; +- } else { +- /* ...or is it a superset of a rejected mask? 
*/ +- if ((cache->mask & mask) == cache->mask) +- goto out; +- } +- } +- +- res = NFS_PROTO(inode)->access(inode, cred, mask); +- if (!res || res == -EACCES) +- goto add_cache; +-out: ++ res = nfs_do_access(inode, cred, mask); + put_rpccred(cred); + unlock_kernel(); + return res; +@@ -1568,15 +1589,6 @@ out_notsup: + res = vfs_permission(inode, mask); + unlock_kernel(); + return res; +-add_cache: +- cache->jiffies = jiffies; +- if (cache->cred) +- put_rpccred(cache->cred); +- cache->cred = cred; +- cache->mask = mask; +- cache->err = res; +- unlock_kernel(); +- return res; + } + + /* +--- linux-2.6.7/fs/nfs/unlink.c.lsec 2004-06-15 23:20:04.000000000 -0600 ++++ linux-2.6.7/fs/nfs/unlink.c 2005-03-23 14:28:23.170527240 -0700 +@@ -215,7 +215,6 @@ nfs_complete_unlink(struct dentry *dentr + spin_lock(&dentry->d_lock); + dentry->d_flags &= ~DCACHE_NFSFS_RENAMED; + spin_unlock(&dentry->d_lock); +- if (data->task.tk_rpcwait == &nfs_delete_queue) +- rpc_wake_up_task(&data->task); ++ rpc_wake_up_task(&data->task); + nfs_put_unlinkdata(data); + } +--- linux-2.6.7/fs/nfs/callback_xdr.c.lsec 2005-03-23 14:28:22.545622240 -0700 ++++ linux-2.6.7/fs/nfs/callback_xdr.c 2005-03-23 14:28:22.544622392 -0700 +@@ -0,0 +1,481 @@ ++/* ++ * linux/fs/nfs/callback_xdr.c ++ * ++ * Copyright (C) 2004 Trond Myklebust ++ * ++ * NFSv4 callback encode/decode procedures ++ */ ++#include ++#include ++#include ++#include ++#include ++#include "callback.h" ++ ++#define CB_OP_TAGLEN_MAXSZ (512) ++#define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ) ++#define CB_OP_GETATTR_BITMAP_MAXSZ (4) ++#define CB_OP_GETATTR_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ ++ CB_OP_GETATTR_BITMAP_MAXSZ + \ ++ 2 + 2 + 3 + 3) ++#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) ++ ++#define NFSDBG_FACILITY NFSDBG_CALLBACK ++ ++typedef unsigned (*callback_process_op_t)(void *, void *); ++typedef unsigned (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *); ++typedef unsigned (*callback_encode_res_t)(struct 
svc_rqst *, struct xdr_stream *, void *); ++ ++ ++struct callback_op { ++ callback_process_op_t process_op; ++ callback_decode_arg_t decode_args; ++ callback_encode_res_t encode_res; ++ long res_maxsize; ++}; ++ ++static struct callback_op callback_ops[]; ++ ++static int nfs4_callback_null(struct svc_rqst *rqstp, void *argp, void *resp) ++{ ++ return htonl(NFS4_OK); ++} ++ ++static int nfs4_decode_void(struct svc_rqst *rqstp, uint32_t *p, void *dummy) ++{ ++ return xdr_argsize_check(rqstp, p); ++} ++ ++static int nfs4_encode_void(struct svc_rqst *rqstp, uint32_t *p, void *dummy) ++{ ++ return xdr_ressize_check(rqstp, p); ++} ++ ++static uint32_t *read_buf(struct xdr_stream *xdr, int nbytes) ++{ ++ uint32_t *p; ++ ++ p = xdr_inline_decode(xdr, nbytes); ++ if (unlikely(p == NULL)) ++ printk(KERN_WARNING "NFSv4 callback reply buffer overflowed!\n"); ++ return p; ++} ++ ++static unsigned decode_string(struct xdr_stream *xdr, unsigned int *len, const char **str) ++{ ++ uint32_t *p; ++ ++ p = read_buf(xdr, 4); ++ if (unlikely(p == NULL)) ++ return htonl(NFS4ERR_RESOURCE); ++ *len = ntohl(*p); ++ ++ if (*len != 0) { ++ p = read_buf(xdr, *len); ++ if (unlikely(p == NULL)) ++ return htonl(NFS4ERR_RESOURCE); ++ *str = (const char *)p; ++ } else ++ *str = NULL; ++ ++ return 0; ++} ++ ++static unsigned decode_fh(struct xdr_stream *xdr, struct nfs_fh *fh) ++{ ++ uint32_t *p; ++ ++ p = read_buf(xdr, 4); ++ if (unlikely(p == NULL)) ++ return htonl(NFS4ERR_RESOURCE); ++ fh->size = ntohl(*p); ++ if (fh->size > NFS4_FHSIZE) ++ return htonl(NFS4ERR_BADHANDLE); ++ p = read_buf(xdr, fh->size); ++ if (unlikely(p == NULL)) ++ return htonl(NFS4ERR_RESOURCE); ++ memcpy(&fh->data[0], p, fh->size); ++ memset(&fh->data[fh->size], 0, sizeof(fh->data) - fh->size); ++ return 0; ++} ++ ++static unsigned decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) ++{ ++ uint32_t *p; ++ unsigned int attrlen; ++ ++ p = read_buf(xdr, 4); ++ if (unlikely(p == NULL)) ++ return htonl(NFS4ERR_RESOURCE); ++ 
attrlen = ntohl(*p); ++ p = read_buf(xdr, attrlen << 2); ++ if (unlikely(p == NULL)) ++ return htonl(NFS4ERR_RESOURCE); ++ if (likely(attrlen > 0)) ++ bitmap[0] = ntohl(*p++); ++ if (attrlen > 1) ++ bitmap[1] = ntohl(*p); ++ return 0; ++} ++ ++static unsigned decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) ++{ ++ uint32_t *p; ++ ++ p = read_buf(xdr, 16); ++ if (unlikely(p == NULL)) ++ return htonl(NFS4ERR_RESOURCE); ++ memcpy(stateid->data, p, 16); ++ return 0; ++} ++ ++static unsigned decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr) ++{ ++ uint32_t *p; ++ unsigned int minor_version; ++ unsigned status; ++ ++ status = decode_string(xdr, &hdr->taglen, &hdr->tag); ++ if (unlikely(status != 0)) ++ return status; ++ /* We do not like overly long tags! */ ++ if (hdr->taglen > CB_OP_TAGLEN_MAXSZ-12 || hdr->taglen < 0) { ++ printk("NFSv4 CALLBACK %s: client sent tag of length %u\n", ++ __FUNCTION__, hdr->taglen); ++ return htonl(NFS4ERR_RESOURCE); ++ } ++ p = read_buf(xdr, 12); ++ if (unlikely(p == NULL)) ++ return htonl(NFS4ERR_RESOURCE); ++ minor_version = ntohl(*p++); ++ /* Check minor version is zero. 
*/ ++ if (minor_version != 0) { ++ printk(KERN_WARNING "%s: NFSv4 server callback with illegal minor version %u!\n", ++ __FUNCTION__, minor_version); ++ return htonl(NFS4ERR_MINOR_VERS_MISMATCH); ++ } ++ hdr->callback_ident = ntohl(*p++); ++ hdr->nops = ntohl(*p); ++ return 0; ++} ++ ++static unsigned decode_op_hdr(struct xdr_stream *xdr, unsigned int *op) ++{ ++ uint32_t *p; ++ p = read_buf(xdr, 4); ++ if (unlikely(p == NULL)) ++ return htonl(NFS4ERR_RESOURCE); ++ *op = ntohl(*p); ++ return 0; ++} ++ ++static unsigned decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_getattrargs *args) ++{ ++ unsigned status; ++ ++ status = decode_fh(xdr, &args->fh); ++ if (unlikely(status != 0)) ++ goto out; ++ args->addr = &rqstp->rq_addr; ++ status = decode_bitmap(xdr, args->bitmap); ++out: ++ dprintk("%s: exit with status = %d\n", __FUNCTION__, status); ++ return status; ++} ++ ++static unsigned decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_recallargs *args) ++{ ++ uint32_t *p; ++ unsigned status; ++ ++ args->addr = &rqstp->rq_addr; ++ status = decode_stateid(xdr, &args->stateid); ++ if (unlikely(status != 0)) ++ goto out; ++ p = read_buf(xdr, 4); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_RESOURCE); ++ goto out; ++ } ++ args->truncate = ntohl(*p); ++ status = decode_fh(xdr, &args->fh); ++out: ++ dprintk("%s: exit with status = %d\n", __FUNCTION__, status); ++ return 0; ++} ++ ++static unsigned encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) ++{ ++ uint32_t *p; ++ ++ p = xdr_reserve_space(xdr, 4 + len); ++ if (unlikely(p == NULL)) ++ return htonl(NFS4ERR_RESOURCE); ++ xdr_encode_opaque(p, str, len); ++ return 0; ++} ++ ++#define CB_SUPPORTED_ATTR0 (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) ++#define CB_SUPPORTED_ATTR1 (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) ++static unsigned encode_attr_bitmap(struct xdr_stream *xdr, const uint32_t *bitmap, uint32_t **savep) ++{ ++ 
uint32_t bm[2]; ++ uint32_t *p; ++ ++ bm[0] = htonl(bitmap[0] & CB_SUPPORTED_ATTR0); ++ bm[1] = htonl(bitmap[1] & CB_SUPPORTED_ATTR1); ++ if (bm[1] != 0) { ++ p = xdr_reserve_space(xdr, 16); ++ if (unlikely(p == NULL)) ++ return htonl(NFS4ERR_RESOURCE); ++ *p++ = htonl(2); ++ *p++ = bm[0]; ++ *p++ = bm[1]; ++ } else if (bm[0] != 0) { ++ p = xdr_reserve_space(xdr, 12); ++ if (unlikely(p == NULL)) ++ return htonl(NFS4ERR_RESOURCE); ++ *p++ = htonl(1); ++ *p++ = bm[0]; ++ } else { ++ p = xdr_reserve_space(xdr, 8); ++ if (unlikely(p == NULL)) ++ return htonl(NFS4ERR_RESOURCE); ++ *p++ = htonl(0); ++ } ++ *savep = p; ++ return 0; ++} ++ ++static unsigned encode_attr_change(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t change) ++{ ++ uint32_t *p; ++ ++ if (!(bitmap[0] & FATTR4_WORD0_CHANGE)) ++ return 0; ++ p = xdr_reserve_space(xdr, 8); ++ if (unlikely(p == 0)) ++ return htonl(NFS4ERR_RESOURCE); ++ p = xdr_encode_hyper(p, change); ++ return 0; ++} ++ ++static unsigned encode_attr_size(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t size) ++{ ++ uint32_t *p; ++ ++ if (!(bitmap[0] & FATTR4_WORD0_SIZE)) ++ return 0; ++ p = xdr_reserve_space(xdr, 8); ++ if (unlikely(p == 0)) ++ return htonl(NFS4ERR_RESOURCE); ++ p = xdr_encode_hyper(p, size); ++ return 0; ++} ++ ++static unsigned encode_attr_time(struct xdr_stream *xdr, const struct timespec *time) ++{ ++ uint32_t *p; ++ ++ p = xdr_reserve_space(xdr, 12); ++ if (unlikely(p == 0)) ++ return htonl(NFS4ERR_RESOURCE); ++ p = xdr_encode_hyper(p, time->tv_sec); ++ *p = htonl(time->tv_nsec); ++ return 0; ++} ++ ++static unsigned encode_attr_ctime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time) ++{ ++ if (!(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) ++ return 0; ++ return encode_attr_time(xdr,time); ++} ++ ++static unsigned encode_attr_mtime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time) ++{ ++ if (!(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) ++ return 0; ++ 
return encode_attr_time(xdr,time); ++} ++ ++static unsigned encode_compound_hdr_res(struct xdr_stream *xdr, struct cb_compound_hdr_res *hdr) ++{ ++ unsigned status; ++ ++ hdr->status = xdr_reserve_space(xdr, 4); ++ if (unlikely(hdr->status == NULL)) ++ return htonl(NFS4ERR_RESOURCE); ++ status = encode_string(xdr, hdr->taglen, hdr->tag); ++ if (unlikely(status != 0)) ++ return status; ++ hdr->nops = xdr_reserve_space(xdr, 4); ++ if (unlikely(hdr->nops == NULL)) ++ return htonl(NFS4ERR_RESOURCE); ++ return 0; ++} ++ ++static unsigned encode_op_hdr(struct xdr_stream *xdr, uint32_t op, uint32_t res) ++{ ++ uint32_t *p; ++ ++ p = xdr_reserve_space(xdr, 8); ++ if (unlikely(p == NULL)) ++ return htonl(NFS4ERR_RESOURCE); ++ *p++ = htonl(op); ++ *p = htonl(res); ++ return 0; ++} ++ ++static unsigned encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr, const struct cb_getattrres *res) ++{ ++ uint32_t *savep; ++ unsigned status = res->status; ++ ++ if (unlikely(status != 0)) ++ goto out; ++ status = encode_attr_bitmap(xdr, res->bitmap, &savep); ++ if (unlikely(status != 0)) ++ goto out; ++ status = encode_attr_change(xdr, res->bitmap, res->change_attr); ++ if (unlikely(status != 0)) ++ goto out; ++ status = encode_attr_size(xdr, res->bitmap, res->size); ++ if (unlikely(status != 0)) ++ goto out; ++ status = encode_attr_ctime(xdr, res->bitmap, &res->ctime); ++ if (unlikely(status != 0)) ++ goto out; ++ status = encode_attr_mtime(xdr, res->bitmap, &res->mtime); ++ *savep = htonl((unsigned int)((char *)xdr->p - (char *)(savep+1))); ++out: ++ dprintk("%s: exit with status = %d\n", __FUNCTION__, status); ++ return status; ++} ++ ++static unsigned process_op(struct svc_rqst *rqstp, ++ struct xdr_stream *xdr_in, void *argp, ++ struct xdr_stream *xdr_out, void *resp) ++{ ++ struct callback_op *op; ++ unsigned int op_nr; ++ unsigned int status = 0; ++ long maxlen; ++ unsigned res; ++ ++ dprintk("%s: start\n", __FUNCTION__); ++ status = decode_op_hdr(xdr_in, &op_nr); ++ 
if (unlikely(status != 0)) { ++ op_nr = OP_CB_ILLEGAL; ++ op = &callback_ops[0]; ++ } else if (unlikely(op_nr != OP_CB_GETATTR && op_nr != OP_CB_RECALL)) { ++ op_nr = OP_CB_ILLEGAL; ++ op = &callback_ops[0]; ++ status = htonl(NFS4ERR_OP_ILLEGAL); ++ } else ++ op = &callback_ops[op_nr]; ++ ++ maxlen = xdr_out->end - xdr_out->p; ++ if (maxlen > 0 && maxlen < PAGE_SIZE) { ++ if (likely(status == 0 && op->decode_args != NULL)) ++ status = op->decode_args(rqstp, xdr_in, argp); ++ if (likely(status == 0 && op->process_op != NULL)) ++ status = op->process_op(argp, resp); ++ } else ++ status = htonl(NFS4ERR_RESOURCE); ++ ++ res = encode_op_hdr(xdr_out, op_nr, status); ++ if (status == 0) ++ status = res; ++ if (op->encode_res != NULL && status == 0) ++ status = op->encode_res(rqstp, xdr_out, resp); ++ dprintk("%s: done, status = %d\n", __FUNCTION__, status); ++ return status; ++} ++ ++/* ++ * Decode, process and encode a COMPOUND ++ */ ++static int nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *resp) ++{ ++ struct cb_compound_hdr_arg hdr_arg; ++ struct cb_compound_hdr_res hdr_res; ++ struct xdr_stream xdr_in, xdr_out; ++ uint32_t *p; ++ unsigned int status; ++ unsigned int nops = 1; ++ ++ dprintk("%s: start\n", __FUNCTION__); ++ ++ xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base); ++ ++ p = (uint32_t*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len); ++ rqstp->rq_res.head[0].iov_len = PAGE_SIZE; ++ xdr_init_encode(&xdr_out, &rqstp->rq_res, p); ++ ++ decode_compound_hdr_arg(&xdr_in, &hdr_arg); ++ hdr_res.taglen = hdr_arg.taglen; ++ hdr_res.tag = hdr_arg.tag; ++ encode_compound_hdr_res(&xdr_out, &hdr_res); ++ ++ for (;;) { ++ status = process_op(rqstp, &xdr_in, argp, &xdr_out, resp); ++ if (status != 0) ++ break; ++ if (nops == hdr_arg.nops) ++ break; ++ nops++; ++ } ++ *hdr_res.status = status; ++ *hdr_res.nops = htonl(nops); ++ dprintk("%s: done, status = %u\n", __FUNCTION__, status); ++ return rpc_success; 
++} ++ ++/* ++ * Define NFS4 callback COMPOUND ops. ++ */ ++static struct callback_op callback_ops[] = { ++ [0] = { ++ .res_maxsize = CB_OP_HDR_RES_MAXSZ, ++ }, ++ [OP_CB_GETATTR] = { ++ .process_op = (callback_process_op_t)nfs4_callback_getattr, ++ .decode_args = (callback_decode_arg_t)decode_getattr_args, ++ .encode_res = (callback_encode_res_t)encode_getattr_res, ++ .res_maxsize = CB_OP_GETATTR_RES_MAXSZ, ++ }, ++ [OP_CB_RECALL] = { ++ .process_op = (callback_process_op_t)nfs4_callback_recall, ++ .decode_args = (callback_decode_arg_t)decode_recall_args, ++ .res_maxsize = CB_OP_RECALL_RES_MAXSZ, ++ } ++}; ++ ++/* ++ * Define NFS4 callback procedures ++ */ ++static struct svc_procedure nfs4_callback_procedures1[] = { ++ [CB_NULL] = { ++ .pc_func = nfs4_callback_null, ++ .pc_decode = (kxdrproc_t)nfs4_decode_void, ++ .pc_encode = (kxdrproc_t)nfs4_encode_void, ++ .pc_xdrressize = 1, ++ }, ++ [CB_COMPOUND] = { ++ .pc_func = nfs4_callback_compound, ++ .pc_encode = (kxdrproc_t)nfs4_encode_void, ++ .pc_argsize = 256, ++ .pc_ressize = 256, ++ .pc_xdrressize = NFS4_CALLBACK_BUFSIZE, ++ } ++}; ++ ++struct svc_version nfs4_callback_version1 = { ++ .vs_vers = 1, ++ .vs_nproc = ARRAY_SIZE(nfs4_callback_procedures1), ++ .vs_proc = nfs4_callback_procedures1, ++ .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, ++ .vs_dispatch = NULL, ++}; ++ +--- linux-2.6.7/fs/nfs/callback.c.lsec 2005-03-23 14:28:22.484631512 -0700 ++++ linux-2.6.7/fs/nfs/callback.c 2005-03-23 14:28:22.483631664 -0700 +@@ -0,0 +1,325 @@ ++/* ++ * linux/fs/nfs/callback.c ++ * ++ * Copyright (C) 2004 Trond Myklebust ++ * ++ * NFSv4 callback handling ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "callback.h" ++ ++#define NFSDBG_FACILITY NFSDBG_CALLBACK ++ ++struct nfs_callback_data { ++ unsigned int users; ++ struct svc_serv *serv; ++ pid_t pid; ++ struct completion started; ++ struct completion stopped; ++}; ++ ++static struct nfs_callback_data nfs_callback_info; 
++static DECLARE_MUTEX(nfs_callback_sema); ++static struct svc_program nfs4_callback_program; ++ ++unsigned short nfs_callback_tcpport; ++ ++/* ++ * This is the callback kernel thread. ++ */ ++static void nfs_callback_svc(struct svc_rqst *rqstp) ++{ ++ struct svc_serv *serv = rqstp->rq_server; ++ int err; ++ ++ __module_get(THIS_MODULE); ++ lock_kernel(); ++ ++ nfs_callback_info.pid = current->pid; ++ daemonize("nfsv4-svc"); ++ /* Process request with signals blocked, but allow SIGKILL. */ ++ allow_signal(SIGKILL); ++ ++ complete(&nfs_callback_info.started); ++ ++ while (nfs_callback_info.users != 0 || !signalled()) { ++ /* ++ * Listen for a request on the socket ++ */ ++ err = svc_recv(serv, rqstp, MAX_SCHEDULE_TIMEOUT); ++ if (err == -EAGAIN || err == -EINTR) ++ continue; ++ if (err < 0) { ++ printk(KERN_WARNING ++ "%s: terminating on error %d\n", ++ __FUNCTION__, -err); ++ break; ++ } ++ dprintk("%s: request from %u.%u.%u.%u\n", __FUNCTION__, ++ NIPQUAD(rqstp->rq_addr.sin_addr.s_addr)); ++ svc_process(serv, rqstp); ++ } ++ ++ nfs_callback_info.pid = 0; ++ complete(&nfs_callback_info.stopped); ++ unlock_kernel(); ++ module_put_and_exit(0); ++} ++ ++/* ++ * Bring up the server process if it is not already up. 
++ */ ++int nfs_callback_up(void) ++{ ++ struct svc_serv *serv; ++ struct svc_sock *svsk; ++ int ret = 0; ++ ++ lock_kernel(); ++ down(&nfs_callback_sema); ++ if (nfs_callback_info.users++ || nfs_callback_info.pid != 0) ++ goto out; ++ init_completion(&nfs_callback_info.started); ++ init_completion(&nfs_callback_info.stopped); ++ serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE); ++ ret = -ENOMEM; ++ if (!serv) ++ goto out_err; ++ /* FIXME: We don't want to register this socket with the portmapper */ ++ ret = svc_makesock(serv, IPPROTO_TCP, 0); ++ if (ret < 0) ++ goto out_destroy; ++ if (!list_empty(&serv->sv_permsocks)) { ++ svsk = list_entry(serv->sv_permsocks.next, ++ struct svc_sock, sk_list); ++ nfs_callback_tcpport = ntohs(inet_sk(svsk->sk_sk)->sport); ++ dprintk ("Callback port = 0x%x\n", nfs_callback_tcpport); ++ } else ++ BUG(); ++ ret = svc_create_thread(nfs_callback_svc, serv); ++ if (ret < 0) ++ goto out_destroy; ++ nfs_callback_info.serv = serv; ++ wait_for_completion(&nfs_callback_info.started); ++out: ++ up(&nfs_callback_sema); ++ unlock_kernel(); ++ return ret; ++out_destroy: ++ svc_destroy(serv); ++out_err: ++ nfs_callback_info.users--; ++ goto out; ++} ++ ++/* ++ * Kill the server process if it is not already up. 
++ */ ++int nfs_callback_down(void) ++{ ++ int ret = 0; ++ ++ lock_kernel(); ++ down(&nfs_callback_sema); ++ if (--nfs_callback_info.users || nfs_callback_info.pid == 0) ++ goto out; ++ kill_proc(nfs_callback_info.pid, SIGKILL, 1); ++ wait_for_completion(&nfs_callback_info.stopped); ++out: ++ up(&nfs_callback_sema); ++ unlock_kernel(); ++ return ret; ++} ++ ++/* ++ * AUTH_NULL authentication ++ */ ++static int nfs_callback_null_accept(struct svc_rqst *rqstp, u32 *authp) ++{ ++ struct iovec *argv = &rqstp->rq_arg.head[0]; ++ struct iovec *resv = &rqstp->rq_res.head[0]; ++ ++ if (argv->iov_len < 3*4) ++ return SVC_GARBAGE; ++ ++ if (svc_getu32(argv) != 0) { ++ dprintk("svc: bad null cred\n"); ++ *authp = rpc_autherr_badcred; ++ return SVC_DENIED; ++ } ++ if (svc_getu32(argv) != RPC_AUTH_NULL || svc_getu32(argv) != 0) { ++ dprintk("svc: bad null verf\n"); ++ *authp = rpc_autherr_badverf; ++ return SVC_DENIED; ++ } ++ ++ /* Signal that mapping to nobody uid/gid is required */ ++ rqstp->rq_cred.cr_uid = (uid_t) -1; ++ rqstp->rq_cred.cr_gid = (gid_t) -1; ++ rqstp->rq_cred.cr_group_info = groups_alloc(0); ++ if (rqstp->rq_cred.cr_group_info == NULL) ++ return SVC_DROP; /* kmalloc failure - client must retry */ ++ ++ /* Put NULL verifier */ ++ svc_putu32(resv, RPC_AUTH_NULL); ++ svc_putu32(resv, 0); ++ dprintk("%s: success, returning %d!\n", __FUNCTION__, SVC_OK); ++ return SVC_OK; ++} ++ ++static int nfs_callback_null_release(struct svc_rqst *rqstp) ++{ ++ if (rqstp->rq_cred.cr_group_info) ++ put_group_info(rqstp->rq_cred.cr_group_info); ++ rqstp->rq_cred.cr_group_info = NULL; ++ return 0; /* don't drop */ ++} ++ ++static struct auth_ops nfs_callback_auth_null = { ++ .name = "null", ++ .flavour = RPC_AUTH_NULL, ++ .accept = nfs_callback_null_accept, ++ .release = nfs_callback_null_release, ++}; ++ ++/* ++ * AUTH_SYS authentication ++ */ ++static int nfs_callback_unix_accept(struct svc_rqst *rqstp, u32 *authp) ++{ ++ struct iovec *argv = &rqstp->rq_arg.head[0]; ++ struct 
iovec *resv = &rqstp->rq_res.head[0]; ++ struct svc_cred *cred = &rqstp->rq_cred; ++ u32 slen, i; ++ int len = argv->iov_len; ++ ++ dprintk("%s: start\n", __FUNCTION__); ++ cred->cr_group_info = NULL; ++ rqstp->rq_client = NULL; ++ if ((len -= 3*4) < 0) ++ return SVC_GARBAGE; ++ ++ /* Get length, time stamp and machine name */ ++ svc_getu32(argv); ++ svc_getu32(argv); ++ slen = XDR_QUADLEN(ntohl(svc_getu32(argv))); ++ if (slen > 64 || (len -= (slen + 3)*4) < 0) ++ goto badcred; ++ argv->iov_base = (void*)((u32*)argv->iov_base + slen); ++ argv->iov_len -= slen*4; ++ ++ cred->cr_uid = ntohl(svc_getu32(argv)); ++ cred->cr_gid = ntohl(svc_getu32(argv)); ++ slen = ntohl(svc_getu32(argv)); ++ if (slen > 16 || (len -= (slen + 2)*4) < 0) ++ goto badcred; ++ cred->cr_group_info = groups_alloc(slen); ++ if (cred->cr_group_info == NULL) ++ return SVC_DROP; ++ for (i = 0; i < slen; i++) ++ GROUP_AT(cred->cr_group_info, i) = ntohl(svc_getu32(argv)); ++ ++ if (svc_getu32(argv) != RPC_AUTH_NULL || svc_getu32(argv) != 0) { ++ *authp = rpc_autherr_badverf; ++ return SVC_DENIED; ++ } ++ /* Put NULL verifier */ ++ svc_putu32(resv, RPC_AUTH_NULL); ++ svc_putu32(resv, 0); ++ dprintk("%s: success, returning %d!\n", __FUNCTION__, SVC_OK); ++ return SVC_OK; ++badcred: ++ *authp = rpc_autherr_badcred; ++ return SVC_DENIED; ++} ++ ++static int nfs_callback_unix_release(struct svc_rqst *rqstp) ++{ ++ if (rqstp->rq_cred.cr_group_info) ++ put_group_info(rqstp->rq_cred.cr_group_info); ++ rqstp->rq_cred.cr_group_info = NULL; ++ return 0; ++} ++ ++static struct auth_ops nfs_callback_auth_unix = { ++ .name = "unix", ++ .flavour = RPC_AUTH_UNIX, ++ .accept = nfs_callback_unix_accept, ++ .release = nfs_callback_unix_release, ++}; ++ ++/* ++ * Hook the authentication protocol ++ */ ++static int nfs_callback_auth(struct svc_rqst *rqstp, u32 *authp) ++{ ++ struct in_addr *addr = &rqstp->rq_addr.sin_addr; ++ struct nfs4_client *clp; ++ struct iovec *argv = &rqstp->rq_arg.head[0]; ++ int flavour; ++ int 
retval; ++ ++ /* Don't talk to strangers */ ++ clp = nfs4_find_client(addr); ++ if (clp == NULL) ++ return SVC_DROP; ++ dprintk("%s: %u.%u.%u.%u NFSv4 callback!\n", __FUNCTION__, NIPQUAD(addr)); ++ nfs4_put_client(clp); ++ flavour = ntohl(svc_getu32(argv)); ++ switch(flavour) { ++ case RPC_AUTH_NULL: ++ if (rqstp->rq_proc != CB_NULL) { ++ *authp = rpc_autherr_tooweak; ++ retval = SVC_DENIED; ++ break; ++ } ++ rqstp->rq_authop = &nfs_callback_auth_null; ++ retval = nfs_callback_null_accept(rqstp, authp); ++ break; ++ case RPC_AUTH_UNIX: ++ /* Eat the authentication flavour */ ++ rqstp->rq_authop = &nfs_callback_auth_unix; ++ retval = nfs_callback_unix_accept(rqstp, authp); ++ break; ++ default: ++ /* FIXME: need to add RPCSEC_GSS upcalls */ ++#if 0 ++ svc_ungetu32(argv); ++ retval = svc_authenticate(rqstp, authp); ++#else ++ *authp = rpc_autherr_rejectedcred; ++ retval = SVC_DENIED; ++#endif ++ } ++ dprintk("%s: flavour %d returning error %d\n", __FUNCTION__, flavour, retval); ++ return retval; ++} ++ ++/* ++ * Define NFS4 callback program ++ */ ++extern struct svc_version nfs4_callback_version1; ++ ++static struct svc_version *nfs4_callback_version[] = { ++ [1] = &nfs4_callback_version1, ++}; ++ ++static struct svc_stat nfs4_callback_stats; ++ ++static struct svc_program nfs4_callback_program = { ++ .pg_prog = NFS4_CALLBACK, /* RPC service number */ ++ .pg_nvers = ARRAY_SIZE(nfs4_callback_version), /* Number of entries */ ++ .pg_vers = nfs4_callback_version, /* version table */ ++ .pg_name = "NFSv4 callback", /* service name */ ++ .pg_class = "nfs", /* authentication class */ ++ .pg_stats = &nfs4_callback_stats, ++ .pg_authenticate = nfs_callback_auth, ++}; +--- linux-2.6.7/fs/nfs/read.c.lsec 2004-06-15 23:18:37.000000000 -0600 ++++ linux-2.6.7/fs/nfs/read.c 2005-03-23 14:28:23.114535752 -0700 +@@ -91,8 +91,8 @@ int nfs_return_empty_page(struct page *p + /* + * Read a page synchronously. 
+ */ +-static int +-nfs_readpage_sync(struct file *file, struct inode *inode, struct page *page) ++static int nfs_readpage_sync(struct nfs_open_context *ctx, struct inode *inode, ++ struct page *page) + { + unsigned int rsize = NFS_SERVER(inode)->rsize; + unsigned int count = PAGE_CACHE_SIZE; +@@ -105,10 +105,11 @@ nfs_readpage_sync(struct file *file, str + + memset(rdata, 0, sizeof(*rdata)); + rdata->flags = (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0); ++ rdata->cred = ctx->cred; + rdata->inode = inode; + INIT_LIST_HEAD(&rdata->pages); + rdata->args.fh = NFS_FH(inode); +- rdata->args.lockowner = current->files; ++ rdata->args.context = ctx; + rdata->args.pages = &page; + rdata->args.pgbase = 0UL; + rdata->args.count = rsize; +@@ -134,7 +135,7 @@ nfs_readpage_sync(struct file *file, str + rdata->args.count); + + lock_kernel(); +- result = NFS_PROTO(inode)->read(rdata, file); ++ result = NFS_PROTO(inode)->read(rdata); + unlock_kernel(); + + /* +@@ -169,8 +170,8 @@ io_error: + return result; + } + +-static int +-nfs_readpage_async(struct file *file, struct inode *inode, struct page *page) ++static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, ++ struct page *page) + { + LIST_HEAD(one_request); + struct nfs_page *new; +@@ -179,7 +180,7 @@ nfs_readpage_async(struct file *file, st + len = nfs_page_length(inode, page); + if (len == 0) + return nfs_return_empty_page(page); +- new = nfs_create_request(file, inode, page, 0, len); ++ new = nfs_create_request(ctx, inode, page, 0, len); + if (IS_ERR(new)) { + unlock_page(page); + return PTR_ERR(new); +@@ -202,8 +203,8 @@ static void nfs_readpage_release(struct + nfs_unlock_request(req); + + dprintk("NFS: read done (%s/%Ld %d@%Ld)\n", +- req->wb_inode->i_sb->s_id, +- (long long)NFS_FILEID(req->wb_inode), ++ req->wb_context->dentry->d_inode->i_sb->s_id, ++ (long long)NFS_FILEID(req->wb_context->dentry->d_inode), + req->wb_bytes, + (long long)req_offset(req)); + } +@@ -217,16 +218,15 @@ static void 
nfs_read_rpcsetup(struct nfs + struct inode *inode; + + data->req = req; +- data->inode = inode = req->wb_inode; +- data->cred = req->wb_cred; ++ data->inode = inode = req->wb_context->dentry->d_inode; ++ data->cred = req->wb_context->cred; + + data->args.fh = NFS_FH(inode); + data->args.offset = req_offset(req) + offset; + data->args.pgbase = req->wb_pgbase + offset; + data->args.pages = data->pagevec; + data->args.count = count; +- data->args.lockowner = req->wb_lockowner; +- data->args.state = req->wb_state; ++ data->args.context = req->wb_context; + + data->res.fattr = &data->fattr; + data->res.count = count; +@@ -396,7 +396,7 @@ nfs_pagein_list(struct list_head *head, + while (!list_empty(head)) { + pages += nfs_coalesce_requests(head, &one_request, rpages); + req = nfs_list_entry(one_request.next); +- error = nfs_pagein_one(&one_request, req->wb_inode); ++ error = nfs_pagein_one(&one_request, req->wb_context->dentry->d_inode); + if (error < 0) + break; + } +@@ -500,9 +500,9 @@ void nfs_readpage_result(struct rpc_task + * - The error flag is set for this page. This happens only when a + * previous async read operation failed. + */ +-int +-nfs_readpage(struct file *file, struct page *page) ++int nfs_readpage(struct file *file, struct page *page) + { ++ struct nfs_open_context *ctx; + struct inode *inode = page->mapping->host; + int error; + +@@ -519,25 +519,33 @@ nfs_readpage(struct file *file, struct p + if (error) + goto out_error; + ++ if (file == NULL) { ++ ctx = nfs_find_open_context(inode, FMODE_READ); ++ if (ctx == NULL) ++ return -EBADF; ++ } else ++ ctx = get_nfs_open_context((struct nfs_open_context *) ++ file->private_data); + if (!IS_SYNC(inode)) { +- error = nfs_readpage_async(file, inode, page); ++ error = nfs_readpage_async(ctx, inode, page); + goto out; + } + +- error = nfs_readpage_sync(file, inode, page); ++ error = nfs_readpage_sync(ctx, inode, page); + if (error < 0 && IS_SWAPFILE(inode)) + printk("Aiee.. 
nfs swap-in of page failed!\n"); + out: ++ put_nfs_open_context(ctx); + return error; + + out_error: + unlock_page(page); +- goto out; ++ return error; + } + + struct nfs_readdesc { + struct list_head *head; +- struct file *filp; ++ struct nfs_open_context *ctx; + }; + + static int +@@ -552,7 +560,7 @@ readpage_async_filler(void *data, struct + len = nfs_page_length(inode, page); + if (len == 0) + return nfs_return_empty_page(page); +- new = nfs_create_request(desc->filp, inode, page, 0, len); ++ new = nfs_create_request(desc->ctx, inode, page, 0, len); + if (IS_ERR(new)) { + SetPageError(page); + unlock_page(page); +@@ -565,13 +573,11 @@ readpage_async_filler(void *data, struct + return 0; + } + +-int +-nfs_readpages(struct file *filp, struct address_space *mapping, ++int nfs_readpages(struct file *filp, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) + { + LIST_HEAD(head); + struct nfs_readdesc desc = { +- .filp = filp, + .head = &head, + }; + struct inode *inode = mapping->host; +@@ -583,12 +589,20 @@ nfs_readpages(struct file *filp, struct + (long long)NFS_FILEID(inode), + nr_pages); + ++ if (filp == NULL) { ++ desc.ctx = nfs_find_open_context(inode, FMODE_READ); ++ if (desc.ctx == NULL) ++ return -EBADF; ++ } else ++ desc.ctx = get_nfs_open_context((struct nfs_open_context *) ++ filp->private_data); + ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); + if (!list_empty(&head)) { + int err = nfs_pagein_list(&head, server->rpages); + if (!ret) + ret = err; + } ++ put_nfs_open_context(desc.ctx); + return ret; + } + +--- linux-2.6.7/fs/nfs/Makefile.lsec 2004-06-15 23:19:01.000000000 -0600 ++++ linux-2.6.7/fs/nfs/Makefile 2005-03-23 14:28:22.819580592 -0700 +@@ -9,6 +9,7 @@ nfs-y := dir.o file.o inode.o nfs2xdr + nfs-$(CONFIG_ROOT_NFS) += nfsroot.o mount_clnt.o + nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o + nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \ +- idmap.o ++ delegation.o idmap.o \ 
++ callback.o callback_xdr.o callback_proc.o + nfs-$(CONFIG_NFS_DIRECTIO) += direct.o + nfs-objs := $(nfs-y) +--- linux-2.6.7/fs/Kconfig.lsec 2004-06-15 23:19:36.000000000 -0600 ++++ linux-2.6.7/fs/Kconfig 2005-03-23 14:28:23.871420688 -0700 +@@ -322,7 +322,7 @@ config FS_POSIX_ACL + # Never use this symbol for ifdefs. + # + bool +- depends on EXT2_FS_POSIX_ACL || EXT3_FS_POSIX_ACL || JFS_POSIX_ACL || REISERFS_FS_POSIX_ACL ++ depends on EXT2_FS_POSIX_ACL || EXT3_FS_POSIX_ACL || JFS_POSIX_ACL || REISERFS_FS_POSIX_ACL || NFS_V4 + default y + + config XFS_FS +@@ -1443,6 +1443,7 @@ config NFSD_V3 + config NFSD_V4 + bool "Provide NFSv4 server support (EXPERIMENTAL)" + depends on NFSD_V3 && EXPERIMENTAL ++ select NFSD_TCP + help + If you would like to include the NFSv4 server as well as the NFSv2 + and NFSv3 servers, say Y here. This feature is experimental, and +@@ -1450,11 +1451,13 @@ config NFSD_V4 + If unsure, say N. + + config NFSD_TCP +- bool "Provide NFS server over TCP support (EXPERIMENTAL)" +- depends on NFSD && EXPERIMENTAL ++ bool "Provide NFS server over TCP support" ++ depends on NFSD ++ default y + help +- Enable NFS service over TCP connections. This the officially +- still experimental, but seems to work well. ++ If you want your NFS server to support TCP connections, say Y here. ++ TCP connections usually perform better than the default UDP when ++ the network is lossy or congested. If unsure, say Y. + + config ROOT_NFS + bool "Root file system on NFS" +@@ -1505,6 +1508,22 @@ config RPCSEC_GSS_KRB5 + + If unsure, say N. + ++config RPCSEC_GSS_SPKM3 ++ tristate "Secure RPC: SPKM3 mechanism (EXPERIMENTAL)" ++ depends on SUNRPC && EXPERIMENTAL ++ select SUNRPC_GSS ++ select CRYPTO ++ select CRYPTO_MD5 ++ select CRYPTO_DES ++ help ++ Provides for secure RPC calls by means of a gss-api ++ mechanism based on the SPKM3 public-key mechanism. 
++ ++ Note: Requires an auxiliary userspace daemon which may be found on ++ http://www.citi.umich.edu/projects/nfsv4/ ++ ++ If unsure, say N. ++ + config SMB_FS + tristate "SMB file system support (to mount Windows shares etc.)" + depends on INET +--- linux-2.6.7/include/linux/fs.h.lsec 2005-03-23 14:26:03.300790672 -0700 ++++ linux-2.6.7/include/linux/fs.h 2005-03-23 14:28:23.280510520 -0700 +@@ -632,7 +632,7 @@ struct file_lock { + struct file_lock *fl_next; /* singly linked list for this inode */ + struct list_head fl_link; /* doubly linked list of all locks */ + struct list_head fl_block; /* circular list of blocked processes */ +- fl_owner_t fl_owner; ++ fl_owner_t fl_owner; /* 0 if lock owned by a local process */ + unsigned int fl_pid; + wait_queue_head_t fl_wait; + struct file *fl_file; +--- linux-2.6.7/include/linux/nfs4.h.lsec 2004-06-15 23:19:22.000000000 -0600 ++++ linux-2.6.7/include/linux/nfs4.h 2005-03-23 14:28:23.335502160 -0700 +@@ -13,8 +13,12 @@ + #ifndef _LINUX_NFS4_H + #define _LINUX_NFS4_H + ++#include ++#include ++ + #define NFS4_VERIFIER_SIZE 8 + #define NFS4_FHSIZE 128 ++#define NFS4_MAXPATHLEN PATH_MAX + #define NFS4_MAXNAMLEN NAME_MAX + + #define NFS4_ACCESS_READ 0x0001 +@@ -52,6 +56,60 @@ + #define ACL4_SUPPORT_AUDIT_ACL 0x04 + #define ACL4_SUPPORT_ALARM_ACL 0x08 + ++#define NFS4_ACE_FILE_INHERIT_ACE 0x00000001 ++#define NFS4_ACE_DIRECTORY_INHERIT_ACE 0x00000002 ++#define NFS4_ACE_NO_PROPAGATE_INHERIT_ACE 0x00000004 ++#define NFS4_ACE_INHERIT_ONLY_ACE 0x00000008 ++#define NFS4_ACE_SUCCESSFUL_ACCESS_ACE_FLAG 0x00000010 ++#define NFS4_ACE_FAILED_ACCESS_ACE_FLAG 0x00000020 ++#define NFS4_ACE_IDENTIFIER_GROUP 0x00000040 ++#define NFS4_ACE_OWNER 0x00000080 ++#define NFS4_ACE_GROUP 0x00000100 ++#define NFS4_ACE_EVERYONE 0x00000200 ++ ++#define NFS4_ACE_READ_DATA 0x00000001 ++#define NFS4_ACE_LIST_DIRECTORY 0x00000001 ++#define NFS4_ACE_WRITE_DATA 0x00000002 ++#define NFS4_ACE_ADD_FILE 0x00000002 ++#define NFS4_ACE_APPEND_DATA 0x00000004 
++#define NFS4_ACE_ADD_SUBDIRECTORY 0x00000004 ++#define NFS4_ACE_READ_NAMED_ATTRS 0x00000008 ++#define NFS4_ACE_WRITE_NAMED_ATTRS 0x00000010 ++#define NFS4_ACE_EXECUTE 0x00000020 ++#define NFS4_ACE_DELETE_CHILD 0x00000040 ++#define NFS4_ACE_READ_ATTRIBUTES 0x00000080 ++#define NFS4_ACE_WRITE_ATTRIBUTES 0x00000100 ++#define NFS4_ACE_DELETE 0x00010000 ++#define NFS4_ACE_READ_ACL 0x00020000 ++#define NFS4_ACE_WRITE_ACL 0x00040000 ++#define NFS4_ACE_WRITE_OWNER 0x00080000 ++#define NFS4_ACE_SYNCHRONIZE 0x00100000 ++#define NFS4_ACE_GENERIC_READ 0x00120081 ++#define NFS4_ACE_GENERIC_WRITE 0x00160106 ++#define NFS4_ACE_GENERIC_EXECUTE 0x001200A0 ++#define NFS4_ACE_MASK_ALL 0x001F01FF ++ ++enum nfs4_acl_whotype { ++ NFS4_ACL_WHO_NAMED = 0, ++ NFS4_ACL_WHO_OWNER, ++ NFS4_ACL_WHO_GROUP, ++ NFS4_ACL_WHO_EVERYONE, ++}; ++ ++struct nfs4_ace { ++ uint32_t type; ++ uint32_t flag; ++ uint32_t access_mask; ++ int whotype; ++ uid_t who; ++ struct list_head l_ace; ++}; ++ ++struct nfs4_acl { ++ uint32_t naces; ++ struct list_head ace_head; ++}; ++ + typedef struct { char data[NFS4_VERIFIER_SIZE]; } nfs4_verifier; + typedef struct { char data[16]; } nfs4_stateid; + +@@ -297,7 +355,7 @@ enum { + NFSPROC4_CLNT_COMMIT, + NFSPROC4_CLNT_OPEN, + NFSPROC4_CLNT_OPEN_CONFIRM, +- NFSPROC4_CLNT_OPEN_RECLAIM, ++ NFSPROC4_CLNT_OPEN_NOATTR, + NFSPROC4_CLNT_OPEN_DOWNGRADE, + NFSPROC4_CLNT_CLOSE, + NFSPROC4_CLNT_SETATTR, +@@ -315,12 +373,16 @@ enum { + NFSPROC4_CLNT_REMOVE, + NFSPROC4_CLNT_RENAME, + NFSPROC4_CLNT_LINK, ++ NFSPROC4_CLNT_SYMLINK, + NFSPROC4_CLNT_CREATE, + NFSPROC4_CLNT_PATHCONF, + NFSPROC4_CLNT_STATFS, + NFSPROC4_CLNT_READLINK, + NFSPROC4_CLNT_READDIR, + NFSPROC4_CLNT_SERVER_CAPS, ++ NFSPROC4_CLNT_DELEGRETURN, ++ NFSPROC4_CLNT_GETACL, ++ NFSPROC4_CLNT_SETACL, + }; + + #endif +--- linux-2.6.7/include/linux/nfs_page.h.lsec 2004-06-15 23:18:57.000000000 -0600 ++++ linux-2.6.7/include/linux/nfs_page.h 2005-03-23 14:28:23.392493496 -0700 +@@ -29,14 +29,9 @@ + struct nfs_page { + struct 
list_head wb_list, /* Defines state of page: */ + *wb_list_head; /* read/write/commit */ +- struct file *wb_file; +- fl_owner_t wb_lockowner; +- struct inode *wb_inode; +- struct rpc_cred *wb_cred; +- struct nfs4_state *wb_state; + struct page *wb_page; /* page to read in/write out */ ++ struct nfs_open_context *wb_context; /* File state context info */ + atomic_t wb_complete; /* i/os we're waiting for */ +- wait_queue_head_t wb_wait; /* wait queue */ + unsigned long wb_index; /* Offset >> PAGE_CACHE_SHIFT */ + unsigned int wb_offset, /* Offset & ~PAGE_CACHE_MASK */ + wb_pgbase, /* Start of page data */ +@@ -50,9 +45,11 @@ struct nfs_page { + #define NFS_NEED_COMMIT(req) (test_bit(PG_NEED_COMMIT,&(req)->wb_flags)) + #define NFS_NEED_RESCHED(req) (test_bit(PG_NEED_RESCHED,&(req)->wb_flags)) + +-extern struct nfs_page *nfs_create_request(struct file *, struct inode *, +- struct page *, +- unsigned int, unsigned int); ++extern struct nfs_page *nfs_create_request(struct nfs_open_context *ctx, ++ struct inode *inode, ++ struct page *page, ++ unsigned int offset, ++ unsigned int count); + extern void nfs_clear_request(struct nfs_page *req); + extern void nfs_release_request(struct nfs_page *req); + +@@ -64,6 +61,7 @@ extern int nfs_scan_list(struct list_hea + extern int nfs_coalesce_requests(struct list_head *, struct list_head *, + unsigned int); + extern int nfs_wait_on_request(struct nfs_page *); ++extern void nfs_unlock_request(struct nfs_page *req); + + extern spinlock_t nfs_wreq_lock; + +@@ -90,19 +88,6 @@ nfs_lock_request(struct nfs_page *req) + return 1; + } + +-static inline void +-nfs_unlock_request(struct nfs_page *req) +-{ +- if (!NFS_WBACK_BUSY(req)) { +- printk(KERN_ERR "NFS: Invalid unlock attempted\n"); +- BUG(); +- } +- smp_mb__before_clear_bit(); +- clear_bit(PG_BUSY, &req->wb_flags); +- smp_mb__after_clear_bit(); +- wake_up_all(&req->wb_wait); +- nfs_release_request(req); +-} + + /** + * nfs_list_remove_request - Remove a request from its wb_list +--- 
linux-2.6.7/include/linux/sunrpc/svc.h.lsec 2004-06-15 23:19:35.000000000 -0600 ++++ linux-2.6.7/include/linux/sunrpc/svc.h 2005-03-23 14:28:23.541470848 -0700 +@@ -87,6 +87,14 @@ static inline u32 svc_getu32(struct iove + iov->iov_len -= sizeof(u32); + return val; + } ++ ++static inline void svc_ungetu32(struct iovec *iov) ++{ ++ u32 *vp = (u32 *)iov->iov_base; ++ iov->iov_base = (void *)(vp - 1); ++ iov->iov_len += sizeof(*vp); ++} ++ + static inline void svc_putu32(struct iovec *iov, u32 val) + { + u32 *vp = iov->iov_base + iov->iov_len; +@@ -243,6 +251,8 @@ struct svc_program { + char * pg_name; /* service name */ + char * pg_class; /* class name: services sharing authentication */ + struct svc_stat * pg_stats; /* rpc statistics */ ++ /* Override authentication. NULL means use default */ ++ int (*pg_authenticate)(struct svc_rqst *, u32 *); + }; + + /* +--- linux-2.6.7/include/linux/sunrpc/gss_spkm3.h.lsec 2005-03-23 14:28:24.186372808 -0700 ++++ linux-2.6.7/include/linux/sunrpc/gss_spkm3.h 2005-03-23 14:28:24.185372960 -0700 +@@ -0,0 +1,61 @@ ++/* ++ * linux/include/linux/sunrpc/gss_spkm3.h ++ * ++ * Copyright (c) 2000 The Regents of the University of Michigan. ++ * All rights reserved. 
++ * ++ * Andy Adamson ++ */ ++ ++#include ++#include ++#include ++ ++struct spkm3_ctx { ++ struct xdr_netobj ctx_id; /* per message context id */ ++ int qop; /* negotiated qop */ ++ struct xdr_netobj mech_used; ++ unsigned int ret_flags ; ++ unsigned int req_flags ; ++ struct xdr_netobj share_key; ++ int conf_alg; ++ struct crypto_tfm* derived_conf_key; ++ int intg_alg; ++ struct crypto_tfm* derived_integ_key; ++ int keyestb_alg; /* alg used to get share_key */ ++ int owf_alg; /* one way function */ ++}; ++ ++/* from openssl/objects.h */ ++/* XXX need SEAL_ALG_NONE */ ++#define NID_md5 4 ++#define NID_dhKeyAgreement 28 ++#define NID_des_cbc 31 ++#define NID_sha1 64 ++#define NID_cast5_cbc 108 ++ ++/* SPKM InnerContext Token types */ ++ ++#define SPKM_ERROR_TOK 3 ++#define SPKM_MIC_TOK 4 ++#define SPKM_WRAP_TOK 5 ++#define SPKM_DEL_TOK 6 ++ ++u32 spkm3_make_token(struct spkm3_ctx *ctx, int qop_req, struct xdr_buf * text, struct xdr_netobj * token, int toktype); ++ ++u32 spkm3_read_token(struct spkm3_ctx *ctx, struct xdr_netobj *read_token, struct xdr_buf *message_buffer, int *qop_state, int toktype); ++ ++#define CKSUMTYPE_RSA_MD5 0x0007 ++ ++s32 make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body, ++ struct xdr_netobj *cksum); ++void asn1_bitstring_len(struct xdr_netobj *in, int *enclen, int *zerobits); ++int decode_asn1_bitstring(struct xdr_netobj *out, char *in, int enclen, ++ int explen); ++void spkm3_mic_header(unsigned char **hdrbuf, unsigned int *hdrlen, ++ unsigned char *ctxhdr, int elen, int zbit); ++void spkm3_make_mic_token(unsigned char **tokp, int toklen, ++ struct xdr_netobj *mic_hdr, ++ struct xdr_netobj *md5cksum, int md5elen, int md5zbit); ++u32 spkm3_verify_mic_token(unsigned char **tokp, int *mic_hdrlen, ++ unsigned char **cksum); +--- linux-2.6.7/include/linux/sunrpc/sched.h.lsec 2004-06-15 23:19:42.000000000 -0600 ++++ linux-2.6.7/include/linux/sunrpc/sched.h 2005-03-23 14:28:23.540471000 -0700 +@@ -11,7 +11,9 @@ + + 
#include + #include ++#include + #include ++#include + #include + + /* +@@ -25,11 +27,18 @@ struct rpc_message { + struct rpc_cred * rpc_cred; /* Credentials */ + }; + ++struct rpc_wait_queue; ++struct rpc_wait { ++ struct list_head list; /* wait queue links */ ++ struct list_head links; /* Links to related tasks */ ++ wait_queue_head_t waitq; /* sync: sleep on this q */ ++ struct rpc_wait_queue * rpc_waitq; /* RPC wait queue we're on */ ++}; ++ + /* + * This is the RPC task struct + */ + struct rpc_task { +- struct list_head tk_list; /* wait queue links */ + #ifdef RPC_DEBUG + unsigned long tk_magic; /* 0xf00baa */ + #endif +@@ -37,7 +46,6 @@ struct rpc_task { + struct rpc_clnt * tk_client; /* RPC client */ + struct rpc_rqst * tk_rqstp; /* RPC request */ + int tk_status; /* result of last operation */ +- struct rpc_wait_queue * tk_rpcwait; /* RPC wait queue we're on */ + + /* + * RPC call state +@@ -70,13 +78,18 @@ struct rpc_task { + * you have a pathological interest in kernel oopses. 
+ */ + struct timer_list tk_timer; /* kernel timer */ +- wait_queue_head_t tk_wait; /* sync: sleep on this q */ + unsigned long tk_timeout; /* timeout for rpc_sleep() */ + unsigned short tk_flags; /* misc flags */ + unsigned char tk_active : 1;/* Task has been activated */ + unsigned char tk_priority : 2;/* Task priority */ + unsigned long tk_runstate; /* Task run status */ +- struct list_head tk_links; /* links to related tasks */ ++ struct workqueue_struct *tk_workqueue; /* Normally rpciod, but could ++ * be any workqueue ++ */ ++ union { ++ struct work_struct tk_work; /* Async task work queue */ ++ struct rpc_wait tk_wait; /* RPC wait */ ++ } u; + #ifdef RPC_DEBUG + unsigned short tk_pid; /* debugging aid */ + #endif +@@ -87,11 +100,11 @@ struct rpc_task { + /* support walking a list of tasks on a wait queue */ + #define task_for_each(task, pos, head) \ + list_for_each(pos, head) \ +- if ((task=list_entry(pos, struct rpc_task, tk_list)),1) ++ if ((task=list_entry(pos, struct rpc_task, u.tk_wait.list)),1) + + #define task_for_first(task, head) \ + if (!list_empty(head) && \ +- ((task=list_entry((head)->next, struct rpc_task, tk_list)),1)) ++ ((task=list_entry((head)->next, struct rpc_task, u.tk_wait.list)),1)) + + /* .. 
and walking list of all tasks */ + #define alltask_for_each(task, pos, head) \ +@@ -124,22 +137,24 @@ typedef void (*rpc_action)(struct rpc_ + #define RPC_DO_CALLBACK(t) ((t)->tk_callback != NULL) + #define RPC_IS_SOFT(t) ((t)->tk_flags & RPC_TASK_SOFT) + +-#define RPC_TASK_SLEEPING 0 +-#define RPC_TASK_RUNNING 1 +-#define RPC_IS_SLEEPING(t) (test_bit(RPC_TASK_SLEEPING, &(t)->tk_runstate)) +-#define RPC_IS_RUNNING(t) (test_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)) ++#define RPC_TASK_RUNNING 0 ++#define RPC_TASK_QUEUED 1 + ++#define RPC_IS_RUNNING(t) (test_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)) + #define rpc_set_running(t) (set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)) +-#define rpc_clear_running(t) (clear_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)) +- +-#define rpc_set_sleeping(t) (set_bit(RPC_TASK_SLEEPING, &(t)->tk_runstate)) +- +-#define rpc_clear_sleeping(t) \ ++#define rpc_test_and_set_running(t) \ ++ (test_and_set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)) ++#define rpc_clear_running(t) \ + do { \ + smp_mb__before_clear_bit(); \ +- clear_bit(RPC_TASK_SLEEPING, &(t)->tk_runstate); \ ++ clear_bit(RPC_TASK_RUNNING, &(t)->tk_runstate); \ + smp_mb__after_clear_bit(); \ +- } while(0) ++ } while (0) ++ ++#define RPC_IS_QUEUED(t) (test_bit(RPC_TASK_QUEUED, &(t)->tk_runstate)) ++#define rpc_set_queued(t) (set_bit(RPC_TASK_QUEUED, &(t)->tk_runstate)) ++#define rpc_test_and_clear_queued(t) \ ++ (test_and_clear_bit(RPC_TASK_QUEUED, &(t)->tk_runstate)) + + /* + * Task priorities. 
+@@ -155,6 +170,7 @@ typedef void (*rpc_action)(struct rpc_ + * RPC synchronization objects + */ + struct rpc_wait_queue { ++ spinlock_t lock; + struct list_head tasks[RPC_NR_PRIORITY]; /* task queue for each priority level */ + unsigned long cookie; /* cookie of last task serviced */ + unsigned char maxpriority; /* maximum priority (0 if queue is not a priority queue) */ +@@ -175,6 +191,7 @@ struct rpc_wait_queue { + + #ifndef RPC_DEBUG + # define RPC_WAITQ_INIT(var,qname) { \ ++ .lock = SPIN_LOCK_UNLOCKED, \ + .tasks = { \ + [0] = LIST_HEAD_INIT(var.tasks[0]), \ + [1] = LIST_HEAD_INIT(var.tasks[1]), \ +@@ -183,6 +200,7 @@ struct rpc_wait_queue { + } + #else + # define RPC_WAITQ_INIT(var,qname) { \ ++ .lock = SPIN_LOCK_UNLOCKED, \ + .tasks = { \ + [0] = LIST_HEAD_INIT(var.tasks[0]), \ + [1] = LIST_HEAD_INIT(var.tasks[1]), \ +@@ -207,13 +225,10 @@ void rpc_killall_tasks(struct rpc_clnt + int rpc_execute(struct rpc_task *); + void rpc_run_child(struct rpc_task *parent, struct rpc_task *child, + rpc_action action); +-int rpc_add_wait_queue(struct rpc_wait_queue *, struct rpc_task *); +-void rpc_remove_wait_queue(struct rpc_task *); + void rpc_init_priority_wait_queue(struct rpc_wait_queue *, const char *); + void rpc_init_wait_queue(struct rpc_wait_queue *, const char *); + void rpc_sleep_on(struct rpc_wait_queue *, struct rpc_task *, + rpc_action action, rpc_action timer); +-void rpc_add_timer(struct rpc_task *, rpc_action); + void rpc_wake_up_task(struct rpc_task *); + void rpc_wake_up(struct rpc_wait_queue *); + struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *); +--- linux-2.6.7/include/linux/sunrpc/gss_api.h.lsec 2004-06-15 23:20:03.000000000 -0600 ++++ linux-2.6.7/include/linux/sunrpc/gss_api.h 2005-03-23 14:28:24.688296504 -0700 +@@ -47,6 +47,18 @@ u32 gss_verify_mic( + struct xdr_buf *message, + struct xdr_netobj *mic_token, + u32 *qstate); ++u32 gss_wrap( ++ struct gss_ctx *ctx_id, ++ u32 qop, ++ int offset, ++ struct xdr_buf *outbuf, ++ struct page 
**inpages); ++u32 gss_unwrap( ++ struct gss_ctx *ctx_id, ++ u32 *qop, ++ int offset, ++ struct xdr_buf *inbuf, ++ int *out_offset); + u32 gss_delete_sec_context( + struct gss_ctx **ctx_id); + +@@ -93,6 +105,18 @@ struct gss_api_ops { + struct xdr_buf *message, + struct xdr_netobj *mic_token, + u32 *qstate); ++ u32 (*gss_wrap)( ++ struct gss_ctx *ctx_id, ++ u32 qop, ++ int offset, ++ struct xdr_buf *outbuf, ++ struct page **inpages); ++ u32 (*gss_unwrap)( ++ struct gss_ctx *ctx_id, ++ u32 *qop, ++ int offset, ++ struct xdr_buf *buf, ++ int *out_offset); + void (*gss_delete_sec_context)( + void *internal_ctx_id); + }; +--- linux-2.6.7/include/linux/sunrpc/xprt.h.lsec 2004-06-15 23:19:43.000000000 -0600 ++++ linux-2.6.7/include/linux/sunrpc/xprt.h 2005-03-23 14:28:24.783282064 -0700 +@@ -95,7 +95,10 @@ struct rpc_rqst { + int rq_cong; /* has incremented xprt->cong */ + int rq_received; /* receive completed */ + u32 rq_seqno; /* gss seq no. used on req. */ +- ++ int rq_enc_pages_num; ++ struct page **rq_enc_pages; /* scratch pages for use by ++ gss privacy code */ ++ void (*rq_release_snd_buf)(struct rpc_rqst *); /* release rq_enc_pages */ + struct list_head rq_list; + + struct xdr_buf rq_private_buf; /* The receive buffer +--- linux-2.6.7/include/linux/sunrpc/gss_krb5.h.lsec 2004-06-15 23:19:29.000000000 -0600 ++++ linux-2.6.7/include/linux/sunrpc/gss_krb5.h 2005-03-23 14:28:24.840273400 -0700 +@@ -53,6 +53,8 @@ struct krb5_ctx { + struct xdr_netobj mech_used; + }; + ++extern spinlock_t krb5_seq_lock; ++ + #define KG_TOK_MIC_MSG 0x0101 + #define KG_TOK_WRAP_MSG 0x0201 + +@@ -116,18 +118,25 @@ enum seal_alg { + + s32 + make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body, +- struct xdr_netobj *cksum); ++ int body_offset, struct xdr_netobj *cksum); + + u32 + krb5_make_token(struct krb5_ctx *context_handle, int qop_req, + struct xdr_buf *input_message_buffer, +- struct xdr_netobj *output_message_buffer, int toktype); ++ struct xdr_netobj 
*output_message_buffer); + + u32 + krb5_read_token(struct krb5_ctx *context_handle, + struct xdr_netobj *input_token_buffer, +- struct xdr_buf *message_buffer, +- int *qop_state, int toktype); ++ struct xdr_buf *message_buffer, int *qop_state); ++ ++u32 ++gss_wrap_kerberos(struct gss_ctx *ctx_id, u32 qop, int offset, ++ struct xdr_buf *outbuf, struct page **pages); ++ ++u32 ++gss_unwrap_kerberos(struct gss_ctx *ctx_id, u32 *qop, int offset, ++ struct xdr_buf *buf, int *out_offset); + + u32 + krb5_encrypt(struct crypto_tfm * key, +@@ -137,6 +146,13 @@ u32 + krb5_decrypt(struct crypto_tfm * key, + void *iv, void *in, void *out, int length); + ++int ++gss_encrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *outbuf, int offset, ++ struct page **pages); ++ ++int ++gss_decrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *inbuf, int offset); ++ + s32 + krb5_make_seq_num(struct crypto_tfm * key, + int direction, +--- linux-2.6.7/include/linux/sunrpc/gss_asn1.h.lsec 2004-06-15 23:20:04.000000000 -0600 ++++ linux-2.6.7/include/linux/sunrpc/gss_asn1.h 2005-03-23 14:28:23.706445768 -0700 +@@ -69,7 +69,6 @@ u32 g_verify_token_header( + struct xdr_netobj *mech, + int *body_size, + unsigned char **buf_in, +- int tok_type, + int toksize); + + u32 g_get_mech_oid(struct xdr_netobj *mech, struct xdr_netobj * in_buf); +--- linux-2.6.7/include/linux/sunrpc/cache.h.lsec 2004-06-15 23:19:28.000000000 -0600 ++++ linux-2.6.7/include/linux/sunrpc/cache.h 2005-03-23 14:28:24.349348032 -0700 +@@ -128,20 +128,17 @@ struct cache_deferred_req { + * just like a template in C++, this macro does cache lookup + * for us. + * The function is passed some sort of HANDLE from which a cache_detail +- * structure can be determined (via SETUP, DETAIL), a template ++ * structure can be determined (via DETAIL), a template + * cache entry (type RTN*), and a "set" flag. Using the HASHFN and the + * TEST, the function will try to find a matching cache entry in the cache. 
+ * If "set" == 0 : + * If an entry is found, it is returned + * If no entry is found, a new non-VALID entry is created. +- * If "set" == 1 and INPLACE == 0 : ++ * If "set" == 1: + * If no entry is found a new one is inserted with data from "template" + * If a non-CACHE_VALID entry is found, it is updated from template using UPDATE + * If a CACHE_VALID entry is found, a new entry is swapped in with data + * from "template" +- * If set == 1, and INPLACE == 1 : +- * As above, except that if a CACHE_VALID entry is found, we UPDATE in place +- * instead of swapping in a new entry. + * + * If the passed handle has the CACHE_NEGATIVE flag set, then UPDATE is not + * run but insteead CACHE_NEGATIVE is set in any new item. +@@ -153,21 +150,18 @@ struct cache_deferred_req { + * MEMBER is the member of the cache which is cache_head, which must be first + * FNAME is the name for the function + * ARGS are arguments to function and must contain RTN *item, int set. May +- * also contain something to be usedby SETUP or DETAIL to find cache_detail. +- * SETUP locates the cache detail and makes it available as... +- * DETAIL identifies the cache detail, possibly set up by SETUP ++ * also contain something to be used by DETAIL to find cache_detail. 
++ * DETAIL identifies the cache detail + * HASHFN returns a hash value of the cache entry "item" + * TEST tests if "tmp" matches "item" + * INIT copies key information from "item" to "new" + * UPDATE copies content information from "item" to "tmp" +- * INPLACE is true if updates can happen inplace rather than allocating a new structure + */ +-#define DefineCacheLookup(RTN,MEMBER,FNAME,ARGS,SETUP,DETAIL,HASHFN,TEST,INIT,UPDATE,INPLACE) \ ++#define DefineCacheLookup(RTN,MEMBER,FNAME,ARGS,DETAIL,HASHFN,TEST,INIT,UPDATE) \ + RTN *FNAME ARGS \ + { \ + RTN *tmp, *new=NULL; \ + struct cache_head **hp, **head; \ +- SETUP; \ + head = &(DETAIL)->hash_table[HASHFN]; \ + retry: \ + if (set||new) write_lock(&(DETAIL)->hash_lock); \ +@@ -176,14 +170,14 @@ RTN *FNAME ARGS \ + tmp = container_of(*hp, RTN, MEMBER); \ + if (TEST) { /* found a match */ \ + \ +- if (set && !INPLACE && test_bit(CACHE_VALID, &tmp->MEMBER.flags) && !new) \ ++ if (set && test_bit(CACHE_VALID, &tmp->MEMBER.flags) && !new) \ + break; \ + \ + if (new) \ + {INIT;} \ + cache_get(&tmp->MEMBER); \ + if (set) { \ +- if (!INPLACE && test_bit(CACHE_VALID, &tmp->MEMBER.flags))\ ++ if (test_bit(CACHE_VALID, &tmp->MEMBER.flags))\ + { /* need to swap in new */ \ + RTN *t2; \ + \ +@@ -205,7 +199,7 @@ RTN *FNAME ARGS \ + else read_unlock(&(DETAIL)->hash_lock); \ + if (set) \ + cache_fresh(DETAIL, &tmp->MEMBER, item->MEMBER.expiry_time); \ +- if (set && !INPLACE && new) cache_fresh(DETAIL, &new->MEMBER, 0); \ ++ if (set && new) cache_fresh(DETAIL, &new->MEMBER, 0); \ + if (new) (DETAIL)->cache_put(&new->MEMBER, DETAIL); \ + return tmp; \ + } \ +@@ -233,16 +227,15 @@ RTN *FNAME ARGS \ + new = kmalloc(sizeof(*new), GFP_KERNEL); \ + if (new) { \ + cache_init(&new->MEMBER); \ +- cache_get(&new->MEMBER); \ + goto retry; \ + } \ + return NULL; \ + } + +-#define DefineSimpleCacheLookup(STRUCT,INPLACE) \ +- DefineCacheLookup(struct STRUCT, h, STRUCT##_lookup, (struct STRUCT *item, int set), /*no setup */, \ ++#define 
DefineSimpleCacheLookup(STRUCT) \ ++ DefineCacheLookup(struct STRUCT, h, STRUCT##_lookup, (struct STRUCT *item, int set), \ + & STRUCT##_cache, STRUCT##_hash(item), STRUCT##_match(item, tmp),\ +- STRUCT##_init(new, item), STRUCT##_update(tmp, item),INPLACE) ++ STRUCT##_init(new, item), STRUCT##_update(tmp, item)) + + #define cache_for_each(pos, detail, index, member) \ + for (({read_lock(&(detail)->hash_lock); index = (detail)->hash_size;}) ; \ +--- linux-2.6.7/include/linux/sunrpc/xdr.h.lsec 2004-06-15 23:20:26.000000000 -0600 ++++ linux-2.6.7/include/linux/sunrpc/xdr.h 2005-03-23 14:28:24.783282064 -0700 +@@ -192,6 +192,7 @@ extern void xdr_write_pages(struct xdr_s + extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, uint32_t *p); + extern uint32_t *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes); + extern void xdr_read_pages(struct xdr_stream *xdr, unsigned int len); ++extern void truncate_xdr_buf(struct xdr_buf *xdr, int len); + + #endif /* __KERNEL__ */ + +--- linux-2.6.7/include/linux/nfsd/state.h.lsec 2004-06-15 23:18:56.000000000 -0600 ++++ linux-2.6.7/include/linux/nfsd/state.h 2005-03-23 14:28:24.081388768 -0700 +@@ -38,6 +38,7 @@ + #define _NFSD4_STATE_H + + #include ++#include + + #define NFS4_OPAQUE_LIMIT 1024 + typedef struct { +@@ -65,6 +66,22 @@ extern stateid_t onestateid; + #define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t))) + #define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) + ++/* client delegation callback info */ ++struct nfs4_callback { ++ /* SETCLIENTID info */ ++ u32 cb_parsed; /* addr parsed */ ++ u32 cb_addr; ++ unsigned short cb_port; ++ u32 cb_prog; ++ u32 cb_ident; ++ struct xdr_netobj cb_netid; ++ /* RPC client info */ ++ u32 cb_set; /* successful CB_NULL call */ ++ struct rpc_program cb_program; ++ struct rpc_stat cb_stat; ++ struct rpc_clnt * cb_client; ++}; ++ + /* + * struct nfs4_client - one per client. Clientids live here. 
+ * o Each nfs4_client is hashed by clientid. +@@ -87,6 +104,21 @@ struct nfs4_client { + struct svc_cred cl_cred; /* setclientid principal */ + clientid_t cl_clientid; /* generated by server */ + nfs4_verifier cl_confirm; /* generated by server */ ++ struct nfs4_callback cl_callback; /* callback info */ ++ time_t cl_first_state; /* first state aquisition*/ ++ atomic_t cl_count; /* ref count */ ++}; ++ ++/* struct nfs4_client_reset ++ * one per old client. Populates reset_str_hashtbl. Filled from conf_id_hashtbl ++ * upon lease reset, or from upcall to state_daemon (to read in state ++ * from non-volitile storage) upon reboot. ++ */ ++struct nfs4_client_reclaim { ++ struct list_head cr_strhash; /* hash by cr_name */ ++ struct xdr_netobj cr_name; /* id generated by client */ ++ time_t cr_first_state; /* first state aquisition */ ++ u32 cr_expired; /* boolean: lease expired? */ + }; + + static inline void +@@ -216,5 +248,8 @@ extern int nfs4_share_conflict(struct sv + extern void nfs4_lock_state(void); + extern void nfs4_unlock_state(void); + extern int nfs4_in_grace(void); +-extern int nfs4_in_no_grace(void); ++extern int nfs4_check_open_reclaim(clientid_t *clid); ++extern void nfsd4_probe_callback(struct nfs4_client *clp); ++extern void expire_client(struct nfs4_client *clp); ++extern void put_nfs4_client(struct nfs4_client *clp); + #endif /* NFSD4_STATE_H */ +--- linux-2.6.7/include/linux/nfsd/nfsd.h.lsec 2004-06-15 23:20:04.000000000 -0600 ++++ linux-2.6.7/include/linux/nfsd/nfsd.h 2005-03-23 14:28:24.133380864 -0700 +@@ -76,6 +76,11 @@ int nfsd_lookup(struct svc_rqst *, stru + const char *, int, struct svc_fh *); + int nfsd_setattr(struct svc_rqst *, struct svc_fh *, + struct iattr *, int, time_t); ++#ifdef CONFIG_NFSD_V4 ++int nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *, ++ struct nfs4_acl *); ++int nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **); ++#endif /* CONFIG_NFSD_V4 */ + int nfsd_create(struct svc_rqst *, struct 
svc_fh *, + char *name, int len, struct iattr *attrs, + int type, dev_t rdev, struct svc_fh *res); +@@ -126,9 +131,13 @@ int nfsd_permission(struct svc_export * + #ifdef CONFIG_NFSD_V4 + void nfs4_state_init(void); + void nfs4_state_shutdown(void); ++time_t nfs4_lease_time(void); ++void nfs4_reset_lease(time_t leasetime); + #else + void static inline nfs4_state_init(void){} + void static inline nfs4_state_shutdown(void){} ++time_t static inline nfs4_lease_time(void){return 0;} ++void static inline nfs4_reset_lease(time_t leasetime){} + #endif + + /* +@@ -249,12 +258,11 @@ static inline int is_fsid(struct svc_fh + #define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */ + #define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */ + +-#define NFSD_LEASE_TIME 60 /* seconds */ ++#define NFSD_LEASE_TIME (nfs4_lease_time()) + #define NFSD_LAUNDROMAT_MINTIMEOUT 10 /* seconds */ + + /* + * The following attributes are currently not supported by the NFSv4 server: +- * ACL (will be supported in a forthcoming patch) + * ARCHIVE (deprecated anyway) + * FS_LOCATIONS (will be supported eventually) + * HIDDEN (unlikely to be supported any time soon) +@@ -274,7 +282,7 @@ static inline int is_fsid(struct svc_fh + | FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FILEID | FATTR4_WORD0_FILES_AVAIL \ + | FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_HOMOGENEOUS \ + | FATTR4_WORD0_MAXFILESIZE | FATTR4_WORD0_MAXLINK | FATTR4_WORD0_MAXNAME \ +- | FATTR4_WORD0_MAXREAD | FATTR4_WORD0_MAXWRITE) ++ | FATTR4_WORD0_MAXREAD | FATTR4_WORD0_MAXWRITE | FATTR4_WORD0_ACL) + + #define NFSD_SUPPORTED_ATTRS_WORD1 \ + (FATTR4_WORD1_MODE | FATTR4_WORD1_NO_TRUNC | FATTR4_WORD1_NUMLINKS \ +@@ -289,7 +297,8 @@ static inline int is_fsid(struct svc_fh + (FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) + + /* These are the only attrs allowed in CREATE/OPEN/SETATTR. 
*/ +-#define NFSD_WRITEABLE_ATTRS_WORD0 FATTR4_WORD0_SIZE ++#define NFSD_WRITEABLE_ATTRS_WORD0 \ ++(FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL ) + #define NFSD_WRITEABLE_ATTRS_WORD1 \ + (FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \ + | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_MODIFY_SET) +--- linux-2.6.7/include/linux/nfsd/xdr4.h.lsec 2004-06-15 23:18:59.000000000 -0600 ++++ linux-2.6.7/include/linux/nfsd/xdr4.h 2005-03-23 14:28:24.082388616 -0700 +@@ -39,6 +39,8 @@ + #ifndef _LINUX_NFSD_XDR4_H + #define _LINUX_NFSD_XDR4_H + ++#include ++ + #define NFSD4_MAX_TAGLEN 128 + #define XDR_LEN(n) (((n) + 3) & ~3) + +@@ -95,6 +97,7 @@ struct nfsd4_create { + u32 cr_bmval[2]; /* request */ + struct iattr cr_iattr; /* request */ + struct nfsd4_change_info cr_cinfo; /* response */ ++ struct nfs4_acl *cr_acl; + }; + #define cr_linklen u.link.namelen + #define cr_linkname u.link.name +@@ -216,7 +219,7 @@ struct nfsd4_open { + u32 op_rflags; /* response */ + int op_truncate; /* used during processing */ + struct nfs4_stateowner *op_stateowner; /* used during processing */ +- ++ struct nfs4_acl *op_acl; + }; + #define op_iattr u.iattr + #define op_verf u.verf +@@ -291,6 +294,7 @@ struct nfsd4_setattr { + stateid_t sa_stateid; /* request */ + u32 sa_bmval[2]; /* request */ + struct iattr sa_iattr; /* request */ ++ struct nfs4_acl *sa_acl; + }; + + struct nfsd4_setclientid { +@@ -378,6 +382,7 @@ struct nfsd4_compoundargs { + u32 * tmpp; + struct tmpbuf { + struct tmpbuf *next; ++ void (*release)(const void *); + void *buf; + } *to_free; + +@@ -449,6 +454,7 @@ extern int nfsd4_locku(struct svc_rqst * + extern int + nfsd4_release_lockowner(struct svc_rqst *rqstp, + struct nfsd4_release_lockowner *rlockowner); ++extern void nfsd4_release_compoundargs(struct nfsd4_compoundargs *); + #endif + + /* +--- linux-2.6.7/include/linux/nfs_fs.h.lsec 2004-06-15 23:19:13.000000000 -0600 ++++ linux-2.6.7/include/linux/nfs_fs.h 2005-03-23 
14:28:23.338501704 -0700 +@@ -28,6 +28,7 @@ + #include + #include + #include ++#include + #include + + /* +@@ -75,15 +76,33 @@ + #ifdef __KERNEL__ + + /* +- * NFSv3 Access mode cache ++ * NFSv3/v4 Access mode cache entry + */ +-struct nfs_access_cache { ++struct nfs_access_entry { + unsigned long jiffies; + struct rpc_cred * cred; + int mask; +- int err; + }; + ++struct nfs4_state; ++struct nfs_open_context { ++ atomic_t count; ++ struct dentry *dentry; ++ struct rpc_cred *cred; ++ struct nfs4_state *state; ++ unsigned int pid; ++ int mode; ++ int error; ++ ++ struct list_head list; ++ wait_queue_head_t waitq; ++}; ++ ++/* ++ * NFSv4 delegation ++ */ ++struct nfs_delegation; ++ + /* + * nfs fs inode data in memory + */ +@@ -137,7 +156,7 @@ struct nfs_inode { + */ + atomic_t data_updates; + +- struct nfs_access_cache cache_access; ++ struct nfs_access_entry cache_access; + + /* + * This is the cookie verifier used for NFSv3 readdir +@@ -156,16 +175,20 @@ struct nfs_inode { + ncommit, + npages; + +- /* Credentials for shared mmap */ +- struct rpc_cred *mm_cred; ++ /* Open contexts for shared mmap writes */ ++ struct list_head open_files; + + wait_queue_head_t nfs_i_wait; + + #ifdef CONFIG_NFS_V4 + /* NFSv4 state */ + struct list_head open_states; ++ struct nfs_delegation *delegation; ++ int delegation_state; ++ struct rw_semaphore rwsem; + #endif /* CONFIG_NFS_V4*/ +- ++ void *acl; ++ ssize_t acl_len; + struct inode vfs_inode; + }; + +@@ -259,6 +282,18 @@ static inline int nfs_verify_change_attr + && chattr == NFS_I(inode)->cache_change_attribute; + } + ++/** ++ * nfs_compare_fh - compare two filehandles for equality ++ * @fh1 - pointer to first filehandle ++ * @fh2 - pointer to second filehandle ++ */ ++static inline int nfs_compare_fh(const struct nfs_fh *fh1, const struct nfs_fh *fh2) ++{ ++ if (fh1->size == fh2->size) ++ return memcmp(fh1->data, fh2->data, fh1->size); ++ return (fh1->size > fh2->size) ? 
1 : -1; ++} ++ + /* + * linux/fs/nfs/inode.c + */ +@@ -268,9 +303,12 @@ extern struct inode *nfs_fhget(struct su + extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *); + extern int nfs_getattr(struct vfsmount *, struct dentry *, struct kstat *); + extern int nfs_permission(struct inode *, int, struct nameidata *); +-extern void nfs_set_mmcred(struct inode *, struct rpc_cred *); ++extern int nfs_access_get_cached(struct inode *, struct rpc_cred *, struct nfs_access_entry *); ++extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *); + extern int nfs_open(struct inode *, struct file *); + extern int nfs_release(struct inode *, struct file *); ++extern int nfs_attribute_timeout(struct inode *inode); ++extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode); + extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *); + extern int nfs_setattr(struct dentry *, struct iattr *); + extern void nfs_begin_attr_update(struct inode *); +@@ -278,6 +316,12 @@ extern void nfs_end_attr_update(struct i + extern void nfs_begin_data_update(struct inode *); + extern void nfs_end_data_update(struct inode *); + extern void nfs_end_data_update_defer(struct inode *); ++extern struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, struct rpc_cred *cred); ++extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx); ++extern void put_nfs_open_context(struct nfs_open_context *ctx); ++extern void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx); ++extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, int mode); ++extern void nfs_file_clear_open_context(struct file *filp); + + /* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. 
*/ + extern u32 root_nfs_parse_addr(char *name); /*__init*/ +@@ -289,16 +333,15 @@ extern struct inode_operations nfs_file_ + extern struct file_operations nfs_file_operations; + extern struct address_space_operations nfs_file_aops; + +-static __inline__ struct rpc_cred * +-nfs_file_cred(struct file *file) ++static inline struct rpc_cred *nfs_file_cred(struct file *file) + { +- struct rpc_cred *cred = NULL; +- if (file) +- cred = (struct rpc_cred *)file->private_data; +-#ifdef RPC_DEBUG +- BUG_ON(cred && cred->cr_magic != RPCAUTH_CRED_MAGIC); +-#endif +- return cred; ++ if (file != NULL) { ++ struct nfs_open_context *ctx; ++ ++ ctx = (struct nfs_open_context*)file->private_data; ++ return ctx->cred; ++ } ++ return NULL; + } + + /* +@@ -418,28 +461,6 @@ extern int nfsroot_mount(struct sockadd + * inline functions + */ + +-static inline int nfs_attribute_timeout(struct inode *inode) +-{ +- struct nfs_inode *nfsi = NFS_I(inode); +- +- return time_after(jiffies, nfsi->read_cache_jiffies+nfsi->attrtimeo); +-} +- +-/** +- * nfs_revalidate_inode - Revalidate the inode attributes +- * @server - pointer to nfs_server struct +- * @inode - pointer to inode struct +- * +- * Updates inode attribute information by retrieving the data from the server. +- */ +-static inline int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) +-{ +- if (!(NFS_FLAGS(inode) & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA)) +- && !nfs_attribute_timeout(inode)) +- return NFS_STALE(inode) ? 
-ESTALE : 0; +- return __nfs_revalidate_inode(server, inode); +-} +- + static inline loff_t + nfs_size_to_loff_t(__u64 size) + { +@@ -507,8 +528,6 @@ struct idmap; + + enum nfs4_client_state { + NFS4CLNT_OK = 0, +- NFS4CLNT_NEW, +- NFS4CLNT_SETUP_STATE, + }; + + /* +@@ -520,7 +539,6 @@ struct nfs4_client { + u64 cl_clientid; /* constant */ + nfs4_verifier cl_confirm; + unsigned long cl_state; +- long cl_generation; + + u32 cl_lockowner_id; + +@@ -530,6 +548,7 @@ struct nfs4_client { + */ + struct rw_semaphore cl_sem; + ++ struct list_head cl_delegations; + struct list_head cl_state_owners; + struct list_head cl_unused; + int cl_nunused; +@@ -573,12 +592,11 @@ struct nfs4_state_owner { + u32 so_id; /* 32-bit identifier, unique */ + struct semaphore so_sema; + u32 so_seqid; /* protected by so_sema */ +- unsigned int so_flags; /* protected by so_sema */ + atomic_t so_count; +- long so_generation; + + struct rpc_cred *so_cred; /* Associated cred */ + struct list_head so_states; ++ struct list_head so_delegations; + }; + + /* +@@ -593,10 +611,13 @@ struct nfs4_state_owner { + * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN) + */ + ++/* bits for nfs4_lock_state->flags */ ++ + struct nfs4_lock_state { + struct list_head ls_locks; /* Other lock stateids */ +- fl_owner_t ls_owner; /* POSIX lock owner */ +- struct nfs4_state * ls_parent; /* Parent nfs4_state */ ++ unsigned int ls_pid; /* pid of owner process */ ++#define NFS_LOCK_INITIALIZED 1 ++ int flags; + u32 ls_seqid; + u32 ls_id; + nfs4_stateid ls_stateid; +@@ -606,6 +627,7 @@ struct nfs4_lock_state { + /* bits for nfs4_state->flags */ + enum { + LK_STATE_IN_USE, ++ NFS_DELEGATED_STATE, + }; + + struct nfs4_state { +@@ -629,8 +651,19 @@ struct nfs4_state { + }; + + ++struct nfs4_exception { ++ long timeout; ++ int retry; ++}; ++ + extern struct dentry_operations nfs4_dentry_operations; + extern struct inode_operations nfs4_dir_inode_operations; ++extern struct inode_operations 
nfs4_file_inode_operations; ++ ++/* inode.c */ ++extern ssize_t nfs_getxattr(struct dentry *, const char *, void *, size_t); ++extern int nfs_setxattr(struct dentry *, const char *, const void *, size_t, int); ++extern ssize_t nfs_listxattr(struct dentry *, char *, size_t); + + /* nfs4proc.c */ + extern int nfs4_proc_setclientid(struct nfs4_client *, u32, unsigned short); +@@ -639,10 +672,15 @@ extern int nfs4_open_reclaim(struct nfs4 + extern int nfs4_proc_async_renew(struct nfs4_client *); + extern int nfs4_proc_renew(struct nfs4_client *); + extern int nfs4_do_close(struct inode *, struct nfs4_state *); +-int nfs4_do_downgrade(struct inode *inode, struct nfs4_state *state, mode_t mode); ++extern int nfs4_do_downgrade(struct inode *inode, struct nfs4_state *state, mode_t mode); + extern int nfs4_wait_clnt_recover(struct rpc_clnt *, struct nfs4_client *); + extern struct inode *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); + extern int nfs4_open_revalidate(struct inode *, struct dentry *, int); ++extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception *); ++extern int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request); ++extern ssize_t nfs4_proc_get_acl(struct inode *, void *buf, ssize_t buflen); ++extern int nfs4_proc_set_acl(struct inode *, const void *buf, ssize_t buflen); ++extern void nfs4_zap_acl_attr(struct inode *inode); + + /* nfs4renewd.c */ + extern void nfs4_schedule_state_renewal(struct nfs4_client *); +@@ -654,6 +692,8 @@ extern void init_nfsv4_state(struct nfs_ + extern void destroy_nfsv4_state(struct nfs_server *); + extern struct nfs4_client *nfs4_get_client(struct in_addr *); + extern void nfs4_put_client(struct nfs4_client *clp); ++extern int nfs4_init_client(struct nfs4_client *clp); ++extern struct nfs4_client *nfs4_find_client(struct in_addr *); + extern u32 nfs4_alloc_lockowner_id(struct nfs4_client *); + + extern struct nfs4_state_owner * nfs4_get_state_owner(struct 
nfs_server *, struct rpc_cred *); +@@ -663,15 +703,14 @@ extern void nfs4_put_open_state(struct n + extern void nfs4_close_state(struct nfs4_state *, mode_t); + extern struct nfs4_state *nfs4_find_state(struct inode *, struct rpc_cred *, mode_t mode); + extern void nfs4_increment_seqid(int status, struct nfs4_state_owner *sp); +-extern int nfs4_handle_error(struct nfs_server *, int); + extern void nfs4_schedule_state_recovery(struct nfs4_client *); +-extern struct nfs4_lock_state *nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t); +-extern struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t); ++extern struct nfs4_lock_state *nfs4_find_lock_state(struct nfs4_state *state, unsigned int pid); ++extern struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, unsigned int pid); + extern void nfs4_put_lock_state(struct nfs4_lock_state *state); + extern void nfs4_increment_lock_seqid(int status, struct nfs4_lock_state *ls); +-extern void nfs4_notify_setlk(struct inode *, struct file_lock *, struct nfs4_lock_state *); +-extern void nfs4_notify_unlck(struct inode *, struct file_lock *, struct nfs4_lock_state *); +-extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); ++extern void nfs4_notify_setlk(struct nfs4_state *, struct file_lock *, struct nfs4_lock_state *); ++extern void nfs4_notify_unlck(struct nfs4_state *, struct file_lock *, struct nfs4_lock_state *); ++extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, unsigned int pid); + + + +@@ -681,6 +720,7 @@ struct nfs4_mount_data; + #define destroy_nfsv4_state(server) do { } while (0) + #define nfs4_put_state_owner(inode, owner) do { } while (0) + #define nfs4_put_open_state(state) do { } while (0) ++#define nfs4_close_state(a, b) do { } while (0) + #define nfs4_renewd_prepare_shutdown(server) do { } while (0) + #endif + +@@ -697,6 +737,7 @@ struct nfs4_mount_data; + #define NFSDBG_XDR 0x0020 + #define NFSDBG_FILE 0x0040 + 
#define NFSDBG_ROOT 0x0080 ++#define NFSDBG_CALLBACK 0x0100 + #define NFSDBG_ALL 0xFFFF + + #ifdef __KERNEL__ +--- linux-2.6.7/include/linux/nfs4_acl.h.lsec 2005-03-23 14:28:24.519322192 -0700 ++++ linux-2.6.7/include/linux/nfs4_acl.h 2005-03-23 14:28:24.518322344 -0700 +@@ -0,0 +1,59 @@ ++/* ++ * include/linux/nfs4_acl.c ++ * ++ * Common NFSv4 ACL handling definitions. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Marius Aamodt Eriksen ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++ ++#ifndef LINUX_NFS4_ACL_H ++#define LINUX_NFS4_ACL_H ++ ++#include ++ ++struct nfs4_acl *nfs4_acl_new(void); ++void nfs4_acl_free(struct nfs4_acl *); ++int nfs4_acl_add_ace(struct nfs4_acl *, u32, u32, u32, int, uid_t); ++int nfs4_acl_get_whotype(char *, u32); ++int nfs4_acl_write_who(int who, char *p); ++int nfs4_acl_permission(struct nfs4_acl *acl, uid_t owner, gid_t group, ++ uid_t who, u32 mask); ++ ++#define NFS4_ACL_TYPE_DEFAULT 0x01 ++#define NFS4_ACL_DIR 0x02 ++#define NFS4_ACL_OWNER 0x04 ++ ++struct nfs4_acl *nfs4_acl_posix_to_nfsv4(struct posix_acl *, ++ struct posix_acl *, unsigned int flags); ++int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *, struct posix_acl **, ++ struct posix_acl **, unsigned int flags); ++ ++#endif /* LINUX_NFS4_ACL_H */ +--- linux-2.6.7/include/linux/nfs_xdr.h.lsec 2004-06-15 23:19:52.000000000 -0600 ++++ linux-2.6.7/include/linux/nfs_xdr.h 2005-03-23 14:28:23.539471152 -0700 +@@ -99,20 +99,21 @@ struct nfs4_change_info { + * Arguments to the open call. + */ + struct nfs_openargs { +- struct nfs_fh * fh; ++ const struct nfs_fh * fh; + __u32 seqid; +- __u32 share_access; ++ int open_flags; + __u64 clientid; + __u32 id; +- __u32 opentype; +- __u32 createmode; + union { + struct iattr * attrs; /* UNCHECKED, GUARDED */ + nfs4_verifier verifier; /* EXCLUSIVE */ ++ nfs4_stateid delegation; /* CLAIM_DELEGATE_CUR */ ++ int delegation_type; /* CLAIM_PREVIOUS */ + } u; + const struct qstr * name; + const struct nfs_server *server; /* Needed for ID mapping */ + const u32 * bitmask; ++ __u32 claim; + }; + + struct nfs_openres { +@@ -122,13 +123,17 @@ struct nfs_openres { + __u32 rflags; + struct nfs_fattr * f_attr; + const struct nfs_server *server; ++ int delegation_type; ++ nfs4_stateid delegation; ++ __u32 do_recall; ++ __u64 maxsize; + }; + + /* + * Arguments to the open_confirm call. 
+ */ + struct nfs_open_confirmargs { +- struct nfs_fh * fh; ++ const struct nfs_fh * fh; + nfs4_stateid stateid; + __u32 seqid; + }; +@@ -138,26 +143,13 @@ struct nfs_open_confirmres { + }; + + /* +- * Arguments to the open_reclaim call. +- */ +-struct nfs_open_reclaimargs { +- struct nfs_fh * fh; +- __u64 clientid; +- __u32 seqid; +- __u32 id; +- __u32 share_access; +- __u32 claim; +- const __u32 * bitmask; +-}; +- +-/* + * Arguments to the close call. + */ + struct nfs_closeargs { + struct nfs_fh * fh; + nfs4_stateid stateid; + __u32 seqid; +- __u32 share_access; ++ int open_flags; + }; + + struct nfs_closeres { +@@ -224,6 +216,11 @@ struct nfs_lockres { + const struct nfs_server * server; + }; + ++struct nfs4_delegreturnargs { ++ const struct nfs_fh *fhandle; ++ const nfs4_stateid *stateid; ++}; ++ + /* + * Arguments to the read call. + */ +@@ -235,8 +232,7 @@ struct nfs_lockres { + + struct nfs_readargs { + struct nfs_fh * fh; +- fl_owner_t lockowner; +- struct nfs4_state * state; ++ struct nfs_open_context *context; + __u64 offset; + __u32 count; + unsigned int pgbase; +@@ -259,8 +255,7 @@ struct nfs_readres { + + struct nfs_writeargs { + struct nfs_fh * fh; +- fl_owner_t lockowner; +- struct nfs4_state * state; ++ struct nfs_open_context *context; + __u64 offset; + __u32 count; + enum nfs3_stable_how stable; +@@ -331,6 +326,19 @@ struct nfs_setattrargs { + const u32 * bitmask; + }; + ++struct nfs_setaclargs { ++ struct nfs_fh * fh; ++ const char * acl; ++ ssize_t acl_len; ++ const struct nfs_server * server; /* Needed for name mapping */ ++}; ++ ++struct nfs_getaclres { ++ char * acl; ++ ssize_t acl_len; ++ const struct nfs_server * server; /* Needed for name mapping */ ++}; ++ + struct nfs_setattrres { + struct nfs_fattr * fattr; + const struct nfs_server * server; +@@ -597,13 +605,15 @@ struct nfs4_rename_res { + }; + + struct nfs4_setclientid { +- nfs4_verifier sc_verifier; /* request */ +- char * sc_name; /* request */ ++ const nfs4_verifier * 
sc_verifier; /* request */ ++ unsigned int sc_name_len; ++ char sc_name[32]; /* request */ + u32 sc_prog; /* request */ ++ unsigned int sc_netid_len; + char sc_netid[4]; /* request */ ++ unsigned int sc_uaddr_len; + char sc_uaddr[24]; /* request */ + u32 sc_cb_ident; /* request */ +- struct nfs4_client * sc_state; /* response */ + }; + + struct nfs4_statfs_arg { +@@ -657,6 +667,8 @@ struct nfs_write_data { + void (*complete) (struct nfs_write_data *, int); + }; + ++struct nfs_access_entry; ++ + /* + * RPC procedure vector for NFSv2/NFSv3 demuxing + */ +@@ -664,6 +676,7 @@ struct nfs_rpc_ops { + int version; /* Protocol version */ + struct dentry_operations *dentry_ops; + struct inode_operations *dir_inode_ops; ++ struct inode_operations *file_inode_ops; + + int (*getroot) (struct nfs_server *, struct nfs_fh *, + struct nfs_fsinfo *); +@@ -672,11 +685,11 @@ struct nfs_rpc_ops { + struct iattr *); + int (*lookup) (struct inode *, struct qstr *, + struct nfs_fh *, struct nfs_fattr *); +- int (*access) (struct inode *, struct rpc_cred *, int); ++ int (*access) (struct inode *, struct nfs_access_entry *); + int (*readlink)(struct inode *, struct page *); +- int (*read) (struct nfs_read_data *, struct file *); +- int (*write) (struct nfs_write_data *, struct file *); +- int (*commit) (struct nfs_write_data *, struct file *); ++ int (*read) (struct nfs_read_data *); ++ int (*write) (struct nfs_write_data *); ++ int (*commit) (struct nfs_write_data *); + struct inode * (*create) (struct inode *, struct qstr *, + struct iattr *, int); + int (*remove) (struct inode *, struct qstr *); +@@ -708,8 +721,6 @@ struct nfs_rpc_ops { + void (*commit_setup) (struct nfs_write_data *, int how); + int (*file_open) (struct inode *, struct file *); + int (*file_release) (struct inode *, struct file *); +- void (*request_init)(struct nfs_page *, struct file *); +- int (*request_compatible)(struct nfs_page *, struct file *, struct page *); + int (*lock)(struct file *, int, struct file_lock 
*); + }; + +--- linux-2.6.7/arch/s390/defconfig.lsec 2004-06-15 23:19:52.000000000 -0600 ++++ linux-2.6.7/arch/s390/defconfig 2005-03-23 14:28:23.869420992 -0700 +@@ -422,7 +422,7 @@ CONFIG_NFS_V3=y + CONFIG_NFSD=y + CONFIG_NFSD_V3=y + # CONFIG_NFSD_V4 is not set +-# CONFIG_NFSD_TCP is not set ++CONFIG_NFSD_TCP=y + CONFIG_LOCKD=y + CONFIG_LOCKD_V4=y + CONFIG_EXPORTFS=y +--- linux-2.6.7/arch/ia64/defconfig.lsec 2004-06-15 23:18:57.000000000 -0600 ++++ linux-2.6.7/arch/ia64/defconfig 2005-03-23 14:28:23.816429048 -0700 +@@ -987,7 +987,7 @@ CONFIG_NFS_DIRECTIO=y + CONFIG_NFSD=y + CONFIG_NFSD_V3=y + # CONFIG_NFSD_V4 is not set +-# CONFIG_NFSD_TCP is not set ++CONFIG_NFSD_TCP=y + CONFIG_LOCKD=y + CONFIG_LOCKD_V4=y + CONFIG_EXPORTFS=y +--- linux-2.6.7/arch/ppc/defconfig.lsec 2004-06-15 23:19:52.000000000 -0600 ++++ linux-2.6.7/arch/ppc/defconfig 2005-03-23 14:28:23.817428896 -0700 +@@ -1230,7 +1230,7 @@ CONFIG_NFS_V3=y + CONFIG_NFSD=y + CONFIG_NFSD_V3=y + # CONFIG_NFSD_V4 is not set +-# CONFIG_NFSD_TCP is not set ++CONFIG_NFSD_TCP=y + CONFIG_LOCKD=y + CONFIG_LOCKD_V4=y + CONFIG_EXPORTFS=y +--- linux-2.6.7/arch/i386/defconfig.lsec 2004-06-15 23:19:42.000000000 -0600 ++++ linux-2.6.7/arch/i386/defconfig 2005-03-23 14:28:23.763437104 -0700 +@@ -1148,7 +1148,7 @@ CONFIG_NFS_FS=y + # CONFIG_NFS_DIRECTIO is not set + CONFIG_NFSD=y + # CONFIG_NFSD_V3 is not set +-# CONFIG_NFSD_TCP is not set ++CONFIG_NFSD_TCP=y + CONFIG_LOCKD=y + CONFIG_EXPORTFS=y + CONFIG_SUNRPC=y +--- linux-2.6.7/arch/alpha/defconfig.lsec 2004-06-15 23:19:23.000000000 -0600 ++++ linux-2.6.7/arch/alpha/defconfig 2005-03-23 14:28:23.762437256 -0700 +@@ -791,7 +791,7 @@ CONFIG_NFS_V3=y + CONFIG_NFSD=m + CONFIG_NFSD_V3=y + # CONFIG_NFSD_V4 is not set +-# CONFIG_NFSD_TCP is not set ++CONFIG_NFSD_TCP=y + CONFIG_LOCKD=m + CONFIG_LOCKD_V4=y + CONFIG_EXPORTFS=m +--- linux-2.6.7/net/sunrpc/svcauth_unix.c.lsec 2004-06-15 23:19:37.000000000 -0600 ++++ linux-2.6.7/net/sunrpc/svcauth_unix.c 2005-03-23 14:28:24.295356240 
-0700 +@@ -55,12 +55,10 @@ struct auth_domain *unix_domain_find(cha + if (new == NULL) + return NULL; + cache_init(&new->h.h); +- atomic_inc(&new->h.h.refcnt); + new->h.name = strdup(name); + new->h.flavour = RPC_AUTH_UNIX; + new->addr_changes = 0; + new->h.h.expiry_time = NEVER; +- new->h.h.flags = 0; + + rv = auth_domain_lookup(&new->h, 2); + if (rv == &new->h) { +@@ -262,7 +260,7 @@ struct cache_detail ip_map_cache = { + .cache_show = ip_map_show, + }; + +-static DefineSimpleCacheLookup(ip_map, 0) ++static DefineSimpleCacheLookup(ip_map) + + + int auth_unix_add_addr(struct in_addr addr, struct auth_domain *dom) +@@ -318,7 +316,8 @@ struct auth_domain *auth_unix_lookup(str + return NULL; + + if ((ipm->m_client->addr_changes - ipm->m_add_change) >0) { +- set_bit(CACHE_NEGATIVE, &ipm->h.flags); ++ if (test_and_set_bit(CACHE_NEGATIVE, &ipm->h.flags) == 0) ++ auth_domain_put(&ipm->m_client->h); + rv = NULL; + } else { + rv = &ipm->m_client->h; +@@ -405,6 +404,9 @@ svcauth_null_release(struct svc_rqst *rq + if (rqstp->rq_client) + auth_domain_put(rqstp->rq_client); + rqstp->rq_client = NULL; ++ if (rqstp->rq_cred.cr_group_info) ++ put_group_info(rqstp->rq_cred.cr_group_info); ++ rqstp->rq_cred.cr_group_info = NULL; + + return 0; /* don't drop */ + } +--- linux-2.6.7/net/sunrpc/xprt.c.lsec 2004-06-15 23:19:42.000000000 -0600 ++++ linux-2.6.7/net/sunrpc/xprt.c 2005-03-23 14:28:23.706445768 -0700 +@@ -1099,7 +1099,7 @@ xprt_write_space(struct sock *sk) + goto out; + + spin_lock_bh(&xprt->sock_lock); +- if (xprt->snd_task && xprt->snd_task->tk_rpcwait == &xprt->pending) ++ if (xprt->snd_task) + rpc_wake_up_task(xprt->snd_task); + spin_unlock_bh(&xprt->sock_lock); + out: +@@ -1357,6 +1357,7 @@ xprt_request_init(struct rpc_task *task, + req->rq_task = task; + req->rq_xprt = xprt; + req->rq_xid = xprt_alloc_xid(xprt); ++ req->rq_release_snd_buf = NULL; + dprintk("RPC: %4d reserved req %p xid %08x\n", task->tk_pid, + req, req->rq_xid); + } +@@ -1382,6 +1383,8 @@ 
xprt_release(struct rpc_task *task) + mod_timer(&xprt->timer, xprt->last_used + XPRT_IDLE_TIMEOUT); + spin_unlock_bh(&xprt->sock_lock); + task->tk_rqstp = NULL; ++ if (req->rq_release_snd_buf) ++ req->rq_release_snd_buf(req); + memset(req, 0, sizeof(*req)); /* mark unused */ + + dprintk("RPC: %4d release request %p\n", task->tk_pid, req); +--- linux-2.6.7/net/sunrpc/sched.c.lsec 2004-06-15 23:19:35.000000000 -0600 ++++ linux-2.6.7/net/sunrpc/sched.c 2005-03-23 14:28:23.651454128 -0700 +@@ -41,13 +41,7 @@ static mempool_t *rpc_buffer_mempool; + + static void __rpc_default_timer(struct rpc_task *task); + static void rpciod_killall(void); +- +-/* +- * When an asynchronous RPC task is activated within a bottom half +- * handler, or while executing another RPC task, it is put on +- * schedq, and rpciod is woken up. +- */ +-static RPC_WAITQ(schedq, "schedq"); ++static void rpc_async_schedule(void *); + + /* + * RPC tasks that create another task (e.g. for contacting the portmapper) +@@ -68,26 +62,18 @@ static LIST_HEAD(all_tasks); + /* + * rpciod-related stuff + */ +-static DECLARE_WAIT_QUEUE_HEAD(rpciod_idle); +-static DECLARE_COMPLETION(rpciod_killer); + static DECLARE_MUTEX(rpciod_sema); + static unsigned int rpciod_users; +-static pid_t rpciod_pid; +-static int rpc_inhibit; ++static struct workqueue_struct *rpciod_workqueue; + + /* +- * Spinlock for wait queues. Access to the latter also has to be +- * interrupt-safe in order to allow timers to wake up sleeping tasks. +- */ +-static spinlock_t rpc_queue_lock = SPIN_LOCK_UNLOCKED; +-/* + * Spinlock for other critical sections of code. + */ + static spinlock_t rpc_sched_lock = SPIN_LOCK_UNLOCKED; + + /* + * Disable the timer for a given RPC task. Should be called with +- * rpc_queue_lock and bh_disabled in order to avoid races within ++ * queue->lock and bh_disabled in order to avoid races within + * rpc_run_timer(). 
+ */ + static inline void +@@ -105,16 +91,13 @@ __rpc_disable_timer(struct rpc_task *tas + * without calling del_timer_sync(). The latter could cause a + * deadlock if called while we're holding spinlocks... + */ +-static void +-rpc_run_timer(struct rpc_task *task) ++static void rpc_run_timer(struct rpc_task *task) + { + void (*callback)(struct rpc_task *); + +- spin_lock_bh(&rpc_queue_lock); + callback = task->tk_timeout_fn; + task->tk_timeout_fn = NULL; +- spin_unlock_bh(&rpc_queue_lock); +- if (callback) { ++ if (callback && RPC_IS_QUEUED(task)) { + dprintk("RPC: %4d running timer\n", task->tk_pid); + callback(task); + } +@@ -140,19 +123,8 @@ __rpc_add_timer(struct rpc_task *task, r + } + + /* +- * Set up a timer for an already sleeping task. +- */ +-void rpc_add_timer(struct rpc_task *task, rpc_action timer) +-{ +- spin_lock_bh(&rpc_queue_lock); +- if (!RPC_IS_RUNNING(task)) +- __rpc_add_timer(task, timer); +- spin_unlock_bh(&rpc_queue_lock); +-} +- +-/* + * Delete any timer for the current task. Because we use del_timer_sync(), +- * this function should never be called while holding rpc_queue_lock. ++ * this function should never be called while holding queue->lock. + */ + static inline void + rpc_delete_timer(struct rpc_task *task) +@@ -169,16 +141,17 @@ static void __rpc_add_wait_queue_priorit + struct list_head *q; + struct rpc_task *t; + ++ INIT_LIST_HEAD(&task->u.tk_wait.links); + q = &queue->tasks[task->tk_priority]; + if (unlikely(task->tk_priority > queue->maxpriority)) + q = &queue->tasks[queue->maxpriority]; +- list_for_each_entry(t, q, tk_list) { ++ list_for_each_entry(t, q, u.tk_wait.list) { + if (t->tk_cookie == task->tk_cookie) { +- list_add_tail(&task->tk_list, &t->tk_links); ++ list_add_tail(&task->u.tk_wait.list, &t->u.tk_wait.links); + return; + } + } +- list_add_tail(&task->tk_list, q); ++ list_add_tail(&task->u.tk_wait.list, q); + } + + /* +@@ -189,37 +162,21 @@ static void __rpc_add_wait_queue_priorit + * improve overall performance. 
+ * Everyone else gets appended to the queue to ensure proper FIFO behavior. + */ +-static int __rpc_add_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task) ++static void __rpc_add_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task) + { +- if (task->tk_rpcwait == queue) +- return 0; ++ BUG_ON (RPC_IS_QUEUED(task)); + +- if (task->tk_rpcwait) { +- printk(KERN_WARNING "RPC: doubly enqueued task!\n"); +- return -EWOULDBLOCK; +- } + if (RPC_IS_PRIORITY(queue)) + __rpc_add_wait_queue_priority(queue, task); + else if (RPC_IS_SWAPPER(task)) +- list_add(&task->tk_list, &queue->tasks[0]); ++ list_add(&task->u.tk_wait.list, &queue->tasks[0]); + else +- list_add_tail(&task->tk_list, &queue->tasks[0]); +- task->tk_rpcwait = queue; ++ list_add_tail(&task->u.tk_wait.list, &queue->tasks[0]); ++ task->u.tk_wait.rpc_waitq = queue; ++ rpc_set_queued(task); + + dprintk("RPC: %4d added to queue %p \"%s\"\n", + task->tk_pid, queue, rpc_qname(queue)); +- +- return 0; +-} +- +-int rpc_add_wait_queue(struct rpc_wait_queue *q, struct rpc_task *task) +-{ +- int result; +- +- spin_lock_bh(&rpc_queue_lock); +- result = __rpc_add_wait_queue(q, task); +- spin_unlock_bh(&rpc_queue_lock); +- return result; + } + + /* +@@ -229,12 +186,12 @@ static void __rpc_remove_wait_queue_prio + { + struct rpc_task *t; + +- if (!list_empty(&task->tk_links)) { +- t = list_entry(task->tk_links.next, struct rpc_task, tk_list); +- list_move(&t->tk_list, &task->tk_list); +- list_splice_init(&task->tk_links, &t->tk_links); ++ if (!list_empty(&task->u.tk_wait.links)) { ++ t = list_entry(task->u.tk_wait.links.next, struct rpc_task, u.tk_wait.list); ++ list_move(&t->u.tk_wait.list, &task->u.tk_wait.list); ++ list_splice_init(&task->u.tk_wait.links, &t->u.tk_wait.links); + } +- list_del(&task->tk_list); ++ list_del(&task->u.tk_wait.list); + } + + /* +@@ -243,31 +200,17 @@ static void __rpc_remove_wait_queue_prio + */ + static void __rpc_remove_wait_queue(struct rpc_task *task) + { +- struct 
rpc_wait_queue *queue = task->tk_rpcwait; +- +- if (!queue) +- return; ++ struct rpc_wait_queue *queue; ++ queue = task->u.tk_wait.rpc_waitq; + + if (RPC_IS_PRIORITY(queue)) + __rpc_remove_wait_queue_priority(task); + else +- list_del(&task->tk_list); +- task->tk_rpcwait = NULL; +- ++ list_del(&task->u.tk_wait.list); + dprintk("RPC: %4d removed from queue %p \"%s\"\n", + task->tk_pid, queue, rpc_qname(queue)); + } + +-void +-rpc_remove_wait_queue(struct rpc_task *task) +-{ +- if (!task->tk_rpcwait) +- return; +- spin_lock_bh(&rpc_queue_lock); +- __rpc_remove_wait_queue(task); +- spin_unlock_bh(&rpc_queue_lock); +-} +- + static inline void rpc_set_waitqueue_priority(struct rpc_wait_queue *queue, int priority) + { + queue->priority = priority; +@@ -290,6 +233,7 @@ static void __rpc_init_priority_wait_que + { + int i; + ++ spin_lock_init(&queue->lock); + for (i = 0; i < ARRAY_SIZE(queue->tasks); i++) + INIT_LIST_HEAD(&queue->tasks[i]); + queue->maxpriority = maxprio; +@@ -316,34 +260,27 @@ EXPORT_SYMBOL(rpc_init_wait_queue); + * Note: If the task is ASYNC, this must be called with + * the spinlock held to protect the wait queue operation. 
+ */ +-static inline void +-rpc_make_runnable(struct rpc_task *task) ++static void rpc_make_runnable(struct rpc_task *task) + { +- if (task->tk_timeout_fn) { +- printk(KERN_ERR "RPC: task w/ running timer in rpc_make_runnable!!\n"); ++ if (rpc_test_and_set_running(task)) + return; +- } +- rpc_set_running(task); ++ BUG_ON(task->tk_timeout_fn); + if (RPC_IS_ASYNC(task)) { +- if (RPC_IS_SLEEPING(task)) { +- int status; +- status = __rpc_add_wait_queue(&schedq, task); +- if (status < 0) { +- printk(KERN_WARNING "RPC: failed to add task to queue: error: %d!\n", status); +- task->tk_status = status; +- return; +- } +- rpc_clear_sleeping(task); +- wake_up(&rpciod_idle); ++ int status; ++ ++ INIT_WORK(&task->u.tk_work, rpc_async_schedule, (void *)task); ++ status = queue_work(task->tk_workqueue, &task->u.tk_work); ++ if (status < 0) { ++ printk(KERN_WARNING "RPC: failed to add task to queue: error: %d!\n", status); ++ task->tk_status = status; ++ return; + } +- } else { +- rpc_clear_sleeping(task); +- wake_up(&task->tk_wait); +- } ++ } else ++ wake_up(&task->u.tk_wait.waitq); + } + + /* +- * Place a newly initialized task on the schedq. ++ * Place a newly initialized task on the workqueue. + */ + static inline void + rpc_schedule_run(struct rpc_task *task) +@@ -352,33 +289,18 @@ rpc_schedule_run(struct rpc_task *task) + if (RPC_IS_ACTIVATED(task)) + return; + task->tk_active = 1; +- rpc_set_sleeping(task); + rpc_make_runnable(task); + } + + /* +- * For other people who may need to wake the I/O daemon +- * but should (for now) know nothing about its innards +- */ +-void rpciod_wake_up(void) +-{ +- if(rpciod_pid==0) +- printk(KERN_ERR "rpciod: wot no daemon?\n"); +- wake_up(&rpciod_idle); +-} +- +-/* + * Prepare for sleeping on a wait queue. + * By always appending tasks to the list we ensure FIFO behavior. + * NB: An RPC task will only receive interrupt-driven events as long + * as it's on a wait queue. 
+ */ +-static void +-__rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, ++static void __rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, + rpc_action action, rpc_action timer) + { +- int status; +- + dprintk("RPC: %4d sleep_on(queue \"%s\" time %ld)\n", task->tk_pid, + rpc_qname(q), jiffies); + +@@ -388,49 +310,36 @@ __rpc_sleep_on(struct rpc_wait_queue *q, + } + + /* Mark the task as being activated if so needed */ +- if (!RPC_IS_ACTIVATED(task)) { ++ if (!RPC_IS_ACTIVATED(task)) + task->tk_active = 1; +- rpc_set_sleeping(task); +- } + +- status = __rpc_add_wait_queue(q, task); +- if (status) { +- printk(KERN_WARNING "RPC: failed to add task to queue: error: %d!\n", status); +- task->tk_status = status; +- } else { +- rpc_clear_running(task); +- if (task->tk_callback) { +- dprintk(KERN_ERR "RPC: %4d overwrites an active callback\n", task->tk_pid); +- BUG(); +- } +- task->tk_callback = action; +- __rpc_add_timer(task, timer); +- } ++ __rpc_add_wait_queue(q, task); ++ ++ BUG_ON(task->tk_callback != NULL); ++ task->tk_callback = action; ++ __rpc_add_timer(task, timer); + } + +-void +-rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, ++void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, + rpc_action action, rpc_action timer) + { + /* + * Protect the queue operations. + */ +- spin_lock_bh(&rpc_queue_lock); ++ spin_lock_bh(&q->lock); + __rpc_sleep_on(q, task, action, timer); +- spin_unlock_bh(&rpc_queue_lock); ++ spin_unlock_bh(&q->lock); + } + + /** +- * __rpc_wake_up_task - wake up a single rpc_task ++ * __rpc_do_wake_up_task - wake up a single rpc_task + * @task: task to be woken up + * +- * Caller must hold rpc_queue_lock ++ * Caller must hold queue->lock, and have cleared the task queued flag. 
+ */ +-static void +-__rpc_wake_up_task(struct rpc_task *task) ++static void __rpc_do_wake_up_task(struct rpc_task *task) + { +- dprintk("RPC: %4d __rpc_wake_up_task (now %ld inh %d)\n", +- task->tk_pid, jiffies, rpc_inhibit); ++ dprintk("RPC: %4d __rpc_wake_up_task (now %ld)\n", task->tk_pid, jiffies); + + #ifdef RPC_DEBUG + if (task->tk_magic != 0xf00baa) { +@@ -445,12 +354,9 @@ __rpc_wake_up_task(struct rpc_task *task + printk(KERN_ERR "RPC: Inactive task (%p) being woken up!\n", task); + return; + } +- if (RPC_IS_RUNNING(task)) +- return; + + __rpc_disable_timer(task); +- if (task->tk_rpcwait != &schedq) +- __rpc_remove_wait_queue(task); ++ __rpc_remove_wait_queue(task); + + rpc_make_runnable(task); + +@@ -458,6 +364,15 @@ __rpc_wake_up_task(struct rpc_task *task + } + + /* ++ * Wake up the specified task ++ */ ++static void __rpc_wake_up_task(struct rpc_task *task) ++{ ++ if (rpc_test_and_clear_queued(task)) ++ __rpc_do_wake_up_task(task); ++} ++ ++/* + * Default timeout handler if none specified by user + */ + static void +@@ -471,14 +386,15 @@ __rpc_default_timer(struct rpc_task *tas + /* + * Wake up the specified task + */ +-void +-rpc_wake_up_task(struct rpc_task *task) ++void rpc_wake_up_task(struct rpc_task *task) + { +- if (RPC_IS_RUNNING(task)) +- return; +- spin_lock_bh(&rpc_queue_lock); +- __rpc_wake_up_task(task); +- spin_unlock_bh(&rpc_queue_lock); ++ if (rpc_test_and_clear_queued(task)) { ++ struct rpc_wait_queue *queue = task->u.tk_wait.rpc_waitq; ++ ++ spin_lock_bh(&queue->lock); ++ __rpc_do_wake_up_task(task); ++ spin_unlock_bh(&queue->lock); ++ } + } + + /* +@@ -494,11 +410,11 @@ static struct rpc_task * __rpc_wake_up_n + */ + q = &queue->tasks[queue->priority]; + if (!list_empty(q)) { +- task = list_entry(q->next, struct rpc_task, tk_list); ++ task = list_entry(q->next, struct rpc_task, u.tk_wait.list); + if (queue->cookie == task->tk_cookie) { + if (--queue->nr) + goto out; +- list_move_tail(&task->tk_list, q); ++ 
list_move_tail(&task->u.tk_wait.list, q); + } + /* + * Check if we need to switch queues. +@@ -516,7 +432,7 @@ static struct rpc_task * __rpc_wake_up_n + else + q = q - 1; + if (!list_empty(q)) { +- task = list_entry(q->next, struct rpc_task, tk_list); ++ task = list_entry(q->next, struct rpc_task, u.tk_wait.list); + goto new_queue; + } + } while (q != &queue->tasks[queue->priority]); +@@ -541,14 +457,14 @@ struct rpc_task * rpc_wake_up_next(struc + struct rpc_task *task = NULL; + + dprintk("RPC: wake_up_next(%p \"%s\")\n", queue, rpc_qname(queue)); +- spin_lock_bh(&rpc_queue_lock); ++ spin_lock_bh(&queue->lock); + if (RPC_IS_PRIORITY(queue)) + task = __rpc_wake_up_next_priority(queue); + else { + task_for_first(task, &queue->tasks[0]) + __rpc_wake_up_task(task); + } +- spin_unlock_bh(&rpc_queue_lock); ++ spin_unlock_bh(&queue->lock); + + return task; + } +@@ -557,25 +473,25 @@ struct rpc_task * rpc_wake_up_next(struc + * rpc_wake_up - wake up all rpc_tasks + * @queue: rpc_wait_queue on which the tasks are sleeping + * +- * Grabs rpc_queue_lock ++ * Grabs queue->lock + */ + void rpc_wake_up(struct rpc_wait_queue *queue) + { + struct rpc_task *task; + + struct list_head *head; +- spin_lock_bh(&rpc_queue_lock); ++ spin_lock_bh(&queue->lock); + head = &queue->tasks[queue->maxpriority]; + for (;;) { + while (!list_empty(head)) { +- task = list_entry(head->next, struct rpc_task, tk_list); ++ task = list_entry(head->next, struct rpc_task, u.tk_wait.list); + __rpc_wake_up_task(task); + } + if (head == &queue->tasks[0]) + break; + head--; + } +- spin_unlock_bh(&rpc_queue_lock); ++ spin_unlock_bh(&queue->lock); + } + + /** +@@ -583,18 +499,18 @@ void rpc_wake_up(struct rpc_wait_queue * + * @queue: rpc_wait_queue on which the tasks are sleeping + * @status: status value to set + * +- * Grabs rpc_queue_lock ++ * Grabs queue->lock + */ + void rpc_wake_up_status(struct rpc_wait_queue *queue, int status) + { + struct list_head *head; + struct rpc_task *task; + +- 
spin_lock_bh(&rpc_queue_lock); ++ spin_lock_bh(&queue->lock); + head = &queue->tasks[queue->maxpriority]; + for (;;) { + while (!list_empty(head)) { +- task = list_entry(head->next, struct rpc_task, tk_list); ++ task = list_entry(head->next, struct rpc_task, u.tk_wait.list); + task->tk_status = status; + __rpc_wake_up_task(task); + } +@@ -602,7 +518,7 @@ void rpc_wake_up_status(struct rpc_wait_ + break; + head--; + } +- spin_unlock_bh(&rpc_queue_lock); ++ spin_unlock_bh(&queue->lock); + } + + /* +@@ -626,18 +542,14 @@ __rpc_atrun(struct rpc_task *task) + /* + * This is the RPC `scheduler' (or rather, the finite state machine). + */ +-static int +-__rpc_execute(struct rpc_task *task) ++static int __rpc_execute(struct rpc_task *task) + { + int status = 0; + + dprintk("RPC: %4d rpc_execute flgs %x\n", + task->tk_pid, task->tk_flags); + +- if (!RPC_IS_RUNNING(task)) { +- printk(KERN_WARNING "RPC: rpc_execute called for sleeping task!!\n"); +- return 0; +- } ++ BUG_ON(RPC_IS_QUEUED(task)); + + restarted: + while (1) { +@@ -657,7 +569,9 @@ __rpc_execute(struct rpc_task *task) + */ + save_callback=task->tk_callback; + task->tk_callback=NULL; ++ lock_kernel(); + save_callback(task); ++ unlock_kernel(); + } + + /* +@@ -665,43 +579,41 @@ __rpc_execute(struct rpc_task *task) + * tk_action may be NULL when the task has been killed + * by someone else. + */ +- if (RPC_IS_RUNNING(task)) { ++ if (!RPC_IS_QUEUED(task)) { + /* + * Garbage collection of pending timers... + */ + rpc_delete_timer(task); + if (!task->tk_action) + break; ++ lock_kernel(); + task->tk_action(task); +- /* micro-optimization to avoid spinlock */ +- if (RPC_IS_RUNNING(task)) +- continue; ++ unlock_kernel(); + } + + /* +- * Check whether task is sleeping. ++ * Lockless check for whether task is sleeping or not. 
+ */ +- spin_lock_bh(&rpc_queue_lock); +- if (!RPC_IS_RUNNING(task)) { +- rpc_set_sleeping(task); +- if (RPC_IS_ASYNC(task)) { +- spin_unlock_bh(&rpc_queue_lock); ++ if (!RPC_IS_QUEUED(task)) ++ continue; ++ if (RPC_IS_ASYNC(task)) { ++ rpc_clear_running(task); ++ /* Careful! we may have raced... */ ++ if (RPC_IS_QUEUED(task)) + return 0; +- } ++ if (rpc_test_and_set_running(task)) ++ return 0; ++ continue; + } +- spin_unlock_bh(&rpc_queue_lock); + +- if (!RPC_IS_SLEEPING(task)) +- continue; ++ init_waitqueue_head(&task->u.tk_wait.waitq); ++ rpc_clear_running(task); + /* sync task: sleep here */ + dprintk("RPC: %4d sync task going to sleep\n", task->tk_pid); +- if (current->pid == rpciod_pid) +- printk(KERN_ERR "RPC: rpciod waiting on sync task!\n"); +- + if (!task->tk_client->cl_intr) { +- __wait_event(task->tk_wait, !RPC_IS_SLEEPING(task)); ++ __wait_event(task->u.tk_wait.waitq, RPC_IS_RUNNING(task)); + } else { +- __wait_event_interruptible(task->tk_wait, !RPC_IS_SLEEPING(task), status); ++ __wait_event_interruptible(task->u.tk_wait.waitq, RPC_IS_RUNNING(task), status); + /* + * When a sync task receives a signal, it exits with + * -ERESTARTSYS. 
In order to catch any callbacks that +@@ -719,7 +631,9 @@ __rpc_execute(struct rpc_task *task) + } + + if (task->tk_exit) { ++ lock_kernel(); + task->tk_exit(task); ++ unlock_kernel(); + /* If tk_action is non-null, the user wants us to restart */ + if (task->tk_action) { + if (!RPC_ASSASSINATED(task)) { +@@ -738,7 +652,6 @@ __rpc_execute(struct rpc_task *task) + + /* Release all resources associated with the task */ + rpc_release_task(task); +- + return status; + } + +@@ -754,57 +667,16 @@ __rpc_execute(struct rpc_task *task) + int + rpc_execute(struct rpc_task *task) + { +- int status = -EIO; +- if (rpc_inhibit) { +- printk(KERN_INFO "RPC: execution inhibited!\n"); +- goto out_release; +- } +- +- status = -EWOULDBLOCK; +- if (task->tk_active) { +- printk(KERN_ERR "RPC: active task was run twice!\n"); +- goto out_err; +- } ++ BUG_ON(task->tk_active); + + task->tk_active = 1; + rpc_set_running(task); + return __rpc_execute(task); +- out_release: +- rpc_release_task(task); +- out_err: +- return status; + } + +-/* +- * This is our own little scheduler for async RPC tasks. 
+- */ +-static void +-__rpc_schedule(void) ++static void rpc_async_schedule(void *arg) + { +- struct rpc_task *task; +- int count = 0; +- +- dprintk("RPC: rpc_schedule enter\n"); +- while (1) { +- +- task_for_first(task, &schedq.tasks[0]) { +- __rpc_remove_wait_queue(task); +- spin_unlock_bh(&rpc_queue_lock); +- +- __rpc_execute(task); +- spin_lock_bh(&rpc_queue_lock); +- } else { +- break; +- } +- +- if (++count >= 200 || need_resched()) { +- count = 0; +- spin_unlock_bh(&rpc_queue_lock); +- schedule(); +- spin_lock_bh(&rpc_queue_lock); +- } +- } +- dprintk("RPC: rpc_schedule leave\n"); ++ __rpc_execute((struct rpc_task *)arg); + } + + /* +@@ -862,7 +734,6 @@ void rpc_init_task(struct rpc_task *task + task->tk_client = clnt; + task->tk_flags = flags; + task->tk_exit = callback; +- init_waitqueue_head(&task->tk_wait); + if (current->uid != current->fsuid || current->gid != current->fsgid) + task->tk_flags |= RPC_TASK_SETUID; + +@@ -873,7 +744,9 @@ void rpc_init_task(struct rpc_task *task + + task->tk_priority = RPC_PRIORITY_NORMAL; + task->tk_cookie = (unsigned long)current; +- INIT_LIST_HEAD(&task->tk_links); ++ ++ /* Initialize workqueue for async tasks */ ++ task->tk_workqueue = rpciod_workqueue; + + /* Add to global list of all tasks */ + spin_lock(&rpc_sched_lock); +@@ -942,8 +815,7 @@ cleanup: + goto out; + } + +-void +-rpc_release_task(struct rpc_task *task) ++void rpc_release_task(struct rpc_task *task) + { + dprintk("RPC: %4d release task\n", task->tk_pid); + +@@ -961,19 +833,9 @@ rpc_release_task(struct rpc_task *task) + list_del(&task->tk_task); + spin_unlock(&rpc_sched_lock); + +- /* Protect the execution below. 
*/ +- spin_lock_bh(&rpc_queue_lock); +- +- /* Disable timer to prevent zombie wakeup */ +- __rpc_disable_timer(task); +- +- /* Remove from any wait queue we're still on */ +- __rpc_remove_wait_queue(task); +- ++ BUG_ON (rpc_test_and_clear_queued(task)); + task->tk_active = 0; + +- spin_unlock_bh(&rpc_queue_lock); +- + /* Synchronously delete any running timer */ + rpc_delete_timer(task); + +@@ -1003,10 +865,9 @@ rpc_release_task(struct rpc_task *task) + * queue 'childq'. If so returns a pointer to the parent. + * Upon failure returns NULL. + * +- * Caller must hold rpc_queue_lock ++ * Caller must hold childq.lock + */ +-static inline struct rpc_task * +-rpc_find_parent(struct rpc_task *child) ++static inline struct rpc_task *rpc_find_parent(struct rpc_task *child) + { + struct rpc_task *task, *parent; + struct list_head *le; +@@ -1019,17 +880,16 @@ rpc_find_parent(struct rpc_task *child) + return NULL; + } + +-static void +-rpc_child_exit(struct rpc_task *child) ++static void rpc_child_exit(struct rpc_task *child) + { + struct rpc_task *parent; + +- spin_lock_bh(&rpc_queue_lock); ++ spin_lock_bh(&childq.lock); + if ((parent = rpc_find_parent(child)) != NULL) { + parent->tk_status = child->tk_status; + __rpc_wake_up_task(parent); + } +- spin_unlock_bh(&rpc_queue_lock); ++ spin_unlock_bh(&childq.lock); + } + + /* +@@ -1052,22 +912,20 @@ fail: + return NULL; + } + +-void +-rpc_run_child(struct rpc_task *task, struct rpc_task *child, rpc_action func) ++void rpc_run_child(struct rpc_task *task, struct rpc_task *child, rpc_action func) + { +- spin_lock_bh(&rpc_queue_lock); ++ spin_lock_bh(&childq.lock); + /* N.B. Is it possible for the child to have already finished? */ + __rpc_sleep_on(&childq, task, func, NULL); + rpc_schedule_run(child); +- spin_unlock_bh(&rpc_queue_lock); ++ spin_unlock_bh(&childq.lock); + } + + /* + * Kill all tasks for the given client. + * XXX: kill their descendants as well? 
+ */ +-void +-rpc_killall_tasks(struct rpc_clnt *clnt) ++void rpc_killall_tasks(struct rpc_clnt *clnt) + { + struct rpc_task *rovr; + struct list_head *le; +@@ -1089,93 +947,14 @@ rpc_killall_tasks(struct rpc_clnt *clnt) + + static DECLARE_MUTEX_LOCKED(rpciod_running); + +-static inline int +-rpciod_task_pending(void) +-{ +- return !list_empty(&schedq.tasks[0]); +-} +- +- +-/* +- * This is the rpciod kernel thread +- */ +-static int +-rpciod(void *ptr) +-{ +- int rounds = 0; +- +- lock_kernel(); +- /* +- * Let our maker know we're running ... +- */ +- rpciod_pid = current->pid; +- up(&rpciod_running); +- +- daemonize("rpciod"); +- allow_signal(SIGKILL); +- +- dprintk("RPC: rpciod starting (pid %d)\n", rpciod_pid); +- spin_lock_bh(&rpc_queue_lock); +- while (rpciod_users) { +- DEFINE_WAIT(wait); +- if (signalled()) { +- spin_unlock_bh(&rpc_queue_lock); +- rpciod_killall(); +- flush_signals(current); +- spin_lock_bh(&rpc_queue_lock); +- } +- __rpc_schedule(); +- if (current->flags & PF_FREEZE) { +- spin_unlock_bh(&rpc_queue_lock); +- refrigerator(PF_FREEZE); +- spin_lock_bh(&rpc_queue_lock); +- } +- +- if (++rounds >= 64) { /* safeguard */ +- spin_unlock_bh(&rpc_queue_lock); +- schedule(); +- rounds = 0; +- spin_lock_bh(&rpc_queue_lock); +- } +- +- dprintk("RPC: rpciod back to sleep\n"); +- prepare_to_wait(&rpciod_idle, &wait, TASK_INTERRUPTIBLE); +- if (!rpciod_task_pending() && !signalled()) { +- spin_unlock_bh(&rpc_queue_lock); +- schedule(); +- rounds = 0; +- spin_lock_bh(&rpc_queue_lock); +- } +- finish_wait(&rpciod_idle, &wait); +- dprintk("RPC: switch to rpciod\n"); +- } +- spin_unlock_bh(&rpc_queue_lock); +- +- dprintk("RPC: rpciod shutdown commences\n"); +- if (!list_empty(&all_tasks)) { +- printk(KERN_ERR "rpciod: active tasks at shutdown?!\n"); +- rpciod_killall(); +- } +- +- dprintk("RPC: rpciod exiting\n"); +- unlock_kernel(); +- +- rpciod_pid = 0; +- complete_and_exit(&rpciod_killer, 0); +- return 0; +-} +- +-static void +-rpciod_killall(void) ++static 
void rpciod_killall(void) + { + unsigned long flags; + + while (!list_empty(&all_tasks)) { + clear_thread_flag(TIF_SIGPENDING); + rpc_killall_tasks(NULL); +- spin_lock_bh(&rpc_queue_lock); +- __rpc_schedule(); +- spin_unlock_bh(&rpc_queue_lock); ++ flush_workqueue(rpciod_workqueue); + if (!list_empty(&all_tasks)) { + dprintk("rpciod_killall: waiting for tasks to exit\n"); + yield(); +@@ -1193,28 +972,30 @@ rpciod_killall(void) + int + rpciod_up(void) + { ++ struct workqueue_struct *wq; + int error = 0; + + down(&rpciod_sema); +- dprintk("rpciod_up: pid %d, users %d\n", rpciod_pid, rpciod_users); ++ dprintk("rpciod_up: users %d\n", rpciod_users); + rpciod_users++; +- if (rpciod_pid) ++ if (rpciod_workqueue) + goto out; + /* + * If there's no pid, we should be the first user. + */ + if (rpciod_users > 1) +- printk(KERN_WARNING "rpciod_up: no pid, %d users??\n", rpciod_users); ++ printk(KERN_WARNING "rpciod_up: no workqueue, %d users??\n", rpciod_users); + /* + * Create the rpciod thread and wait for it to start. 
+ */ +- error = kernel_thread(rpciod, NULL, 0); +- if (error < 0) { +- printk(KERN_WARNING "rpciod_up: create thread failed, error=%d\n", error); ++ error = -ENOMEM; ++ wq = create_workqueue("rpciod"); ++ if (wq == NULL) { ++ printk(KERN_WARNING "rpciod_up: create workqueue failed, error=%d\n", error); + rpciod_users--; + goto out; + } +- down(&rpciod_running); ++ rpciod_workqueue = wq; + error = 0; + out: + up(&rpciod_sema); +@@ -1225,20 +1006,21 @@ void + rpciod_down(void) + { + down(&rpciod_sema); +- dprintk("rpciod_down pid %d sema %d\n", rpciod_pid, rpciod_users); ++ dprintk("rpciod_down sema %d\n", rpciod_users); + if (rpciod_users) { + if (--rpciod_users) + goto out; + } else +- printk(KERN_WARNING "rpciod_down: pid=%d, no users??\n", rpciod_pid); ++ printk(KERN_WARNING "rpciod_down: no users??\n"); + +- if (!rpciod_pid) { ++ if (!rpciod_workqueue) { + dprintk("rpciod_down: Nothing to do!\n"); + goto out; + } ++ rpciod_killall(); + +- kill_proc(rpciod_pid, SIGKILL, 1); +- wait_for_completion(&rpciod_killer); ++ destroy_workqueue(rpciod_workqueue); ++ rpciod_workqueue = NULL; + out: + up(&rpciod_sema); + } +@@ -1256,7 +1038,12 @@ void rpc_show_tasks(void) + } + printk("-pid- proc flgs status -client- -prog- --rqstp- -timeout " + "-rpcwait -action- --exit--\n"); +- alltask_for_each(t, le, &all_tasks) ++ alltask_for_each(t, le, &all_tasks) { ++ const char *rpc_waitq = "none"; ++ ++ if (RPC_IS_QUEUED(t)) ++ rpc_waitq = rpc_qname(t->u.tk_wait.rpc_waitq); ++ + printk("%05d %04d %04x %06d %8p %6d %8p %08ld %8s %8p %8p\n", + t->tk_pid, + (t->tk_msg.rpc_proc ? t->tk_msg.rpc_proc->p_proc : -1), +@@ -1264,8 +1051,9 @@ void rpc_show_tasks(void) + t->tk_client, + (t->tk_client ? 
t->tk_client->cl_prog : 0), + t->tk_rqstp, t->tk_timeout, +- rpc_qname(t->tk_rpcwait), ++ rpc_waitq, + t->tk_action, t->tk_exit); ++ } + spin_unlock(&rpc_sched_lock); + } + #endif +--- linux-2.6.7/net/sunrpc/svcsock.c.lsec 2004-06-15 23:18:57.000000000 -0600 ++++ linux-2.6.7/net/sunrpc/svcsock.c 2005-03-23 14:28:24.029396672 -0700 +@@ -414,7 +414,6 @@ svc_sendto(struct svc_rqst *rqstp, struc + } + /* send tail */ + if (xdr->tail[0].iov_len) { +- /* The tail *will* be in respages[0]; */ + result = sock->ops->sendpage(sock, rqstp->rq_respages[rqstp->rq_restailpage], + ((unsigned long)xdr->tail[0].iov_base)& (PAGE_SIZE-1), + xdr->tail[0].iov_len, 0); +--- linux-2.6.7/net/sunrpc/clnt.c.lsec 2004-06-15 23:19:13.000000000 -0600 ++++ linux-2.6.7/net/sunrpc/clnt.c 2005-03-23 14:28:23.595462640 -0700 +@@ -351,7 +351,9 @@ int rpc_call_sync(struct rpc_clnt *clnt, + rpc_clnt_sigmask(clnt, &oldset); + + /* Create/initialize a new RPC task */ +- rpc_init_task(task, clnt, NULL, flags); ++ task = rpc_new_task(clnt, NULL, flags); ++ if (task == NULL) ++ return -ENOMEM; + rpc_call_setup(task, msg, 0); + + /* Set up the call info struct and execute the task */ +@@ -620,8 +622,14 @@ call_encode(struct rpc_task *task) + rpc_exit(task, -EIO); + return; + } +- if (encode && (status = rpcauth_wrap_req(task, encode, req, p, +- task->tk_msg.rpc_argp)) < 0) { ++ if (encode == NULL) ++ return; ++ ++ status = rpcauth_wrap_req(task, encode, req, p, task->tk_msg.rpc_argp); ++ if (status == -EAGAIN) { ++ printk("XXXJBF: out of memeory? 
Should retry here!!!\n"); ++ } ++ if (status < 0) { + printk(KERN_WARNING "%s: can't encode arguments: %d\n", + clnt->cl_protname, -status); + rpc_exit(task, status); +--- linux-2.6.7/net/sunrpc/sunrpc_syms.c.lsec 2004-06-15 23:19:52.000000000 -0600 ++++ linux-2.6.7/net/sunrpc/sunrpc_syms.c 2005-03-23 14:32:35.589153776 -0700 +@@ -58,6 +58,8 @@ EXPORT_SYMBOL(rpc_unlink); + EXPORT_SYMBOL(rpc_wake_up); + EXPORT_SYMBOL(rpc_queue_upcall); + EXPORT_SYMBOL(rpc_mkpipe); ++EXPORT_SYMBOL(rpc_mkdir); ++EXPORT_SYMBOL(rpc_rmdir); + + /* Client transport */ + EXPORT_SYMBOL(xprt_create_proto); +@@ -89,6 +91,7 @@ EXPORT_SYMBOL(svc_makesock); + EXPORT_SYMBOL(svc_reserve); + EXPORT_SYMBOL(svc_auth_register); + EXPORT_SYMBOL(auth_domain_lookup); ++EXPORT_SYMBOL(svc_authenticate); + + /* RPC statistics */ + #ifdef CONFIG_PROC_FS +--- linux-2.6.7/net/sunrpc/pmap_clnt.c.lsec 2004-06-15 23:19:23.000000000 -0600 ++++ linux-2.6.7/net/sunrpc/pmap_clnt.c 2005-03-23 14:28:24.134380712 -0700 +@@ -183,8 +183,10 @@ rpc_register(u32 prog, u32 vers, int pro + map.pm_prot = prot; + map.pm_port = port; + ++ rpciod_up(); + error = rpc_call(pmap_clnt, port? PMAP_SET : PMAP_UNSET, + &map, okay, 0); ++ rpciod_down(); + + if (error < 0) { + printk(KERN_WARNING +--- linux-2.6.7/net/sunrpc/auth_gss/gss_krb5_unseal.c.lsec 2004-06-15 23:19:44.000000000 -0600 ++++ linux-2.6.7/net/sunrpc/auth_gss/gss_krb5_unseal.c 2005-03-23 14:28:23.761437408 -0700 +@@ -68,20 +68,13 @@ + #endif + + +-/* message_buffer is an input if toktype is MIC and an output if it is WRAP: +- * If toktype is MIC: read_token is a mic token, and message_buffer is the +- * data that the mic was supposedly taken over. +- * If toktype is WRAP: read_token is a wrap token, and message_buffer is used +- * to return the decrypted data. +- */ ++/* read_token is a mic token, and message_buffer is the data that the mic was ++ * supposedly taken over. 
*/ + +-/* XXX will need to change prototype and/or just split into a separate function +- * when we add privacy (because read_token will be in pages too). */ + u32 + krb5_read_token(struct krb5_ctx *ctx, + struct xdr_netobj *read_token, +- struct xdr_buf *message_buffer, +- int *qop_state, int toktype) ++ struct xdr_buf *message_buffer, int *qop_state) + { + int signalg; + int sealalg; +@@ -96,20 +89,16 @@ krb5_read_token(struct krb5_ctx *ctx, + + dprintk("RPC: krb5_read_token\n"); + +- if (g_verify_token_header(&ctx->mech_used, &bodysize, &ptr, toktype, ++ if (g_verify_token_header(&ctx->mech_used, &bodysize, &ptr, + read_token->len)) + goto out; + +- if ((*ptr++ != ((toktype>>8)&0xff)) || (*ptr++ != (toktype&0xff))) ++ if ((*ptr++ != ((KG_TOK_MIC_MSG>>8)&0xff)) || ++ (*ptr++ != ( KG_TOK_MIC_MSG &0xff)) ) + goto out; + + /* XXX sanity-check bodysize?? */ + +- if (toktype == KG_TOK_WRAP_MSG) { +- /* XXX gone */ +- goto out; +- } +- + /* get the sign and seal algorithms */ + + signalg = ptr[0] + (ptr[1] << 8); +@@ -120,14 +109,7 @@ krb5_read_token(struct krb5_ctx *ctx, + if ((ptr[4] != 0xff) || (ptr[5] != 0xff)) + goto out; + +- if (((toktype != KG_TOK_WRAP_MSG) && (sealalg != 0xffff)) || +- ((toktype == KG_TOK_WRAP_MSG) && (sealalg == 0xffff))) +- goto out; +- +- /* in the current spec, there is only one valid seal algorithm per +- key type, so a simple comparison is ok */ +- +- if ((toktype == KG_TOK_WRAP_MSG) && !(sealalg == ctx->sealalg)) ++ if (sealalg != 0xffff) + goto out; + + /* there are several mappings of seal algorithms to sign algorithms, +@@ -154,7 +136,7 @@ krb5_read_token(struct krb5_ctx *ctx, + switch (signalg) { + case SGN_ALG_DES_MAC_MD5: + ret = make_checksum(checksum_type, ptr - 2, 8, +- message_buffer, &md5cksum); ++ message_buffer, 0, &md5cksum); + if (ret) + goto out; + +--- linux-2.6.7/net/sunrpc/auth_gss/auth_gss.c.lsec 2004-06-15 23:19:22.000000000 -0600 ++++ linux-2.6.7/net/sunrpc/auth_gss/auth_gss.c 2005-03-23 14:28:24.185372960 -0700 
+@@ -45,6 +45,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -397,7 +398,7 @@ retry: + spin_unlock(&gss_auth->lock); + } + gss_release_msg(gss_msg); +- dprintk("RPC: %4u gss_upcall for uid %u result %d", task->tk_pid, ++ dprintk("RPC: %4u gss_upcall for uid %u result %d\n", task->tk_pid, + uid, res); + return res; + out_sleep: +@@ -740,6 +741,8 @@ gss_marshal(struct rpc_task *task, u32 * + maj_stat = gss_get_mic(ctx->gc_gss_ctx, + GSS_C_QOP_DEFAULT, + &verf_buf, &mic); ++ if (maj_stat == GSS_S_CONTEXT_EXPIRED) ++ cred->cr_flags |= RPCAUTH_CRED_DEAD; + if(maj_stat != 0){ + printk("gss_marshal: gss_get_mic FAILED (%d)\n", maj_stat); + goto out_put_ctx; +@@ -779,6 +782,7 @@ gss_validate(struct rpc_task *task, u32 + struct xdr_netobj mic; + u32 flav,len; + u32 service; ++ u32 maj_stat; + + dprintk("RPC: %4u gss_validate\n", task->tk_pid); + +@@ -794,8 +798,11 @@ gss_validate(struct rpc_task *task, u32 + mic.data = (u8 *)p; + mic.len = len; + +- if (gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic, &qop_state)) +- goto out_bad; ++ maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic, &qop_state); ++ if (maj_stat == GSS_S_CONTEXT_EXPIRED) ++ cred->cr_flags |= RPCAUTH_CRED_DEAD; ++ if (maj_stat) ++ goto out_bad; + service = gss_pseudoflavor_to_service(ctx->gc_gss_ctx->mech_type, + gss_cred->gc_flavor); + switch (service) { +@@ -807,6 +814,11 @@ gss_validate(struct rpc_task *task, u32 + /* verifier data, flavor, length, length, sequence number: */ + task->tk_auth->au_rslack = XDR_QUADLEN(len) + 4; + break; ++ case RPC_GSS_SVC_PRIVACY: ++ /* XXXJBF: Ugh. Going for a wild overestimate. ++ * Need some info from krb5 layer? 
*/ ++ task->tk_auth->au_rslack = XDR_QUADLEN(len) + 32; ++ break; + default: + goto out_bad; + } +@@ -821,11 +833,11 @@ out_bad: + } + + static inline int +-gss_wrap_req_integ(struct gss_cl_ctx *ctx, +- kxdrproc_t encode, void *rqstp, u32 *p, void *obj) ++gss_wrap_req_integ(struct rpc_cred *cred, kxdrproc_t encode, ++ struct rpc_rqst *rqstp, u32 *p, void *obj) + { +- struct rpc_rqst *req = (struct rpc_rqst *)rqstp; +- struct xdr_buf *snd_buf = &req->rq_snd_buf; ++ struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); ++ struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; + struct xdr_buf integ_buf; + u32 *integ_len = NULL; + struct xdr_netobj mic; +@@ -836,7 +848,7 @@ gss_wrap_req_integ(struct gss_cl_ctx *ct + + integ_len = p++; + offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base; +- *p++ = htonl(req->rq_seqno); ++ *p++ = htonl(rqstp->rq_seqno); + + status = encode(rqstp, p, obj); + if (status) +@@ -848,7 +860,7 @@ gss_wrap_req_integ(struct gss_cl_ctx *ct + *integ_len = htonl(integ_buf.len); + + /* guess whether we're in the head or the tail: */ +- if (snd_buf->page_len || snd_buf->tail[0].iov_len) ++ if (snd_buf->page_len || snd_buf->tail[0].iov_len) + iov = snd_buf->tail; + else + iov = snd_buf->head; +@@ -857,6 +869,8 @@ gss_wrap_req_integ(struct gss_cl_ctx *ct + + maj_stat = gss_get_mic(ctx->gc_gss_ctx, + GSS_C_QOP_DEFAULT, &integ_buf, &mic); ++ if (maj_stat == GSS_S_CONTEXT_EXPIRED) ++ cred->cr_flags |= RPCAUTH_CRED_DEAD; + status = -EIO; /* XXX? 
*/ + if (maj_stat) + return status; +@@ -868,6 +882,113 @@ gss_wrap_req_integ(struct gss_cl_ctx *ct + return 0; + } + ++static void ++priv_release_snd_buf(struct rpc_rqst *rqstp) ++{ ++ int i; ++ ++ for (i=0; i < rqstp->rq_enc_pages_num; i++) ++ __free_page(rqstp->rq_enc_pages[i]); ++ kfree(rqstp->rq_enc_pages); ++} ++ ++static int ++alloc_enc_pages(struct rpc_rqst *rqstp) ++{ ++ struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; ++ int first, last, i; ++ ++ if (snd_buf->page_len == 0) { ++ rqstp->rq_enc_pages_num = 0; ++ return 0; ++ } ++ ++ first = snd_buf->page_base >> PAGE_CACHE_SHIFT; ++ last = (snd_buf->page_base + snd_buf->page_len - 1) >> PAGE_CACHE_SHIFT; ++ rqstp->rq_enc_pages_num = last - first + 1 + 1; ++ rqstp->rq_enc_pages ++ = kmalloc(rqstp->rq_enc_pages_num * sizeof(struct page *), ++ GFP_NOFS); ++ if (!rqstp->rq_enc_pages) ++ goto out; ++ for (i=0; i < rqstp->rq_enc_pages_num; i++) { ++ rqstp->rq_enc_pages[i] = alloc_page(GFP_NOFS); ++ if (rqstp->rq_enc_pages[i] == NULL) ++ goto out_free; ++ } ++ rqstp->rq_release_snd_buf = priv_release_snd_buf; ++ return 0; ++out_free: ++ for (i--; i >= 0; i--) { ++ __free_page(rqstp->rq_enc_pages[i]); ++ } ++out: ++ return -EAGAIN; ++} ++ ++static inline int ++gss_wrap_req_priv(struct rpc_cred *cred, kxdrproc_t encode, ++ struct rpc_rqst *rqstp, u32 *p, void *obj) ++{ ++ struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); ++ struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; ++ u32 offset; ++ u32 maj_stat; ++ int status; ++ u32 *opaque_len; ++ struct page **inpages; ++ int first; ++ int pad; ++ struct iovec *iov; ++ char *tmp; ++ ++ opaque_len = p++; ++ offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base; ++ *p++ = htonl(rqstp->rq_seqno); ++ ++ status = encode(rqstp, p, obj); ++ if (status) ++ return status; ++ ++ status = alloc_enc_pages(rqstp); ++ if (status) ++ return status; ++ /* XXXJBF: Oops! Do we need rq_enc_pages really any more?? 
*/ ++ first = snd_buf->page_base >> PAGE_CACHE_SHIFT; ++ inpages = snd_buf->pages + first; ++ snd_buf->pages = rqstp->rq_enc_pages; ++ snd_buf->page_base -= first << PAGE_CACHE_SHIFT; ++ /* XXX?: tail needs to be separate if we want to be able to expand ++ * the head (since it's often put right after the head). But is ++ * expanding the head safe in any case? */ ++ if (snd_buf->page_len || snd_buf->tail[0].iov_len) { ++ tmp = page_address(rqstp->rq_enc_pages[rqstp->rq_enc_pages_num - 1]); ++ memcpy(tmp, snd_buf->tail[0].iov_base, snd_buf->tail[0].iov_len); ++ snd_buf->tail[0].iov_base = tmp; ++ } ++ maj_stat = gss_wrap(ctx->gc_gss_ctx, GSS_C_QOP_DEFAULT, offset, ++ snd_buf, inpages); ++ if (maj_stat == GSS_S_CONTEXT_EXPIRED) ++ cred->cr_flags |= RPCAUTH_CRED_DEAD; ++ status = -EIO; /* XXX? */ ++ if (maj_stat) ++ return status; ++ ++ *opaque_len = htonl(snd_buf->len - offset); ++ /* guess whether we're in the head or the tail: */ ++ if (snd_buf->page_len || snd_buf->tail[0].iov_len) ++ iov = snd_buf->tail; ++ else ++ iov = snd_buf->head; ++ p = iov->iov_base + iov->iov_len; ++ pad = 3 - ((snd_buf->len - offset - 1) & 3); ++ memset(p, 0, pad); ++ iov->iov_len += pad; ++ snd_buf->len += pad; ++ ++ return 0; ++} ++ + static int + gss_wrap_req(struct rpc_task *task, + kxdrproc_t encode, void *rqstp, u32 *p, void *obj) +@@ -894,9 +1015,11 @@ gss_wrap_req(struct rpc_task *task, + status = encode(rqstp, p, obj); + goto out; + case RPC_GSS_SVC_INTEGRITY: +- status = gss_wrap_req_integ(ctx, encode, rqstp, p, obj); ++ status = gss_wrap_req_integ(cred, encode, rqstp, p, obj); + goto out; + case RPC_GSS_SVC_PRIVACY: ++ status = gss_wrap_req_priv(cred, encode, rqstp, p, obj); ++ goto out; + default: + goto out; + } +@@ -907,11 +1030,10 @@ out: + } + + static inline int +-gss_unwrap_resp_integ(struct gss_cl_ctx *ctx, +- kxdrproc_t decode, void *rqstp, u32 **p, void *obj) ++gss_unwrap_resp_integ(struct rpc_cred *cred, struct rpc_rqst *rqstp, u32 **p) + { +- struct rpc_rqst *req = 
(struct rpc_rqst *)rqstp; +- struct xdr_buf *rcv_buf = &req->rq_rcv_buf; ++ struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); ++ struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf; + struct xdr_buf integ_buf; + struct xdr_netobj mic; + u32 data_offset, mic_offset; +@@ -926,7 +1048,7 @@ gss_unwrap_resp_integ(struct gss_cl_ctx + mic_offset = integ_len + data_offset; + if (mic_offset > rcv_buf->len) + return status; +- if (ntohl(*(*p)++) != req->rq_seqno) ++ if (ntohl(*(*p)++) != rqstp->rq_seqno) + return status; + + if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset, +@@ -938,11 +1060,44 @@ gss_unwrap_resp_integ(struct gss_cl_ctx + + maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, + &mic, NULL); ++ if (maj_stat == GSS_S_CONTEXT_EXPIRED) ++ cred->cr_flags |= RPCAUTH_CRED_DEAD; + if (maj_stat != GSS_S_COMPLETE) + return status; + return 0; + } + ++static inline int ++gss_unwrap_resp_priv(struct rpc_cred *cred, struct rpc_rqst *rqstp, u32 **p) ++{ ++ struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); ++ struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf; ++ u32 offset, out_offset; ++ u32 opaque_len; ++ u32 maj_stat; ++ int status = -EIO; ++ ++ opaque_len = ntohl(*(*p)++); ++ offset = (u8 *)(*p) - (u8 *)rcv_buf->head[0].iov_base; ++ if (offset + opaque_len > rcv_buf->len) ++ return status; ++ /* remove padding: */ ++ rcv_buf->len = offset + opaque_len; ++ ++ maj_stat = gss_unwrap(ctx->gc_gss_ctx, GSS_C_QOP_DEFAULT, ++ offset, rcv_buf, &out_offset); ++ if (maj_stat == GSS_S_CONTEXT_EXPIRED) ++ cred->cr_flags |= RPCAUTH_CRED_DEAD; ++ if (maj_stat != GSS_S_COMPLETE) ++ return status; ++ *p = (u32 *)(rcv_buf->head[0].iov_base + out_offset); ++ if (ntohl(*(*p)++) != rqstp->rq_seqno) ++ return status; ++ ++ return 0; ++} ++ ++ + static int + gss_unwrap_resp(struct rpc_task *task, + kxdrproc_t decode, void *rqstp, u32 *p, void *obj) +@@ -962,12 +1117,16 @@ gss_unwrap_resp(struct rpc_task *task, + case RPC_GSS_SVC_NONE: + goto out_decode; + case RPC_GSS_SVC_INTEGRITY: +- status = 
gss_unwrap_resp_integ(ctx, decode, +- rqstp, &p, obj); ++ status = gss_unwrap_resp_integ(cred, rqstp, &p); + if (status) + goto out; + break; + case RPC_GSS_SVC_PRIVACY: ++ status = gss_unwrap_resp_priv(cred, rqstp, &p); ++ if (status) ++ goto out; ++ break; ++ + default: + goto out; + } +--- linux-2.6.7/net/sunrpc/auth_gss/gss_spkm3_mech.c.lsec 2005-03-23 14:28:24.187372656 -0700 ++++ linux-2.6.7/net/sunrpc/auth_gss/gss_spkm3_mech.c 2005-03-23 14:28:24.186372808 -0700 +@@ -0,0 +1,296 @@ ++/* ++ * linux/net/sunrpc/gss_spkm3_mech.c ++ * ++ * Copyright (c) 2003 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * J. Bruce Fields ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef RPC_DEBUG ++# define RPCDBG_FACILITY RPCDBG_AUTH ++#endif ++ ++struct xdr_netobj gss_mech_spkm3_oid = ++ {7, "\053\006\001\005\005\001\003"}; ++ ++static inline int ++get_bytes(char **ptr, const char *end, void *res, int len) ++{ ++ char *p, *q; ++ p = *ptr; ++ q = p + len; ++ if (q > end || q < p) ++ return -1; ++ memcpy(res, p, len); ++ *ptr = q; ++ return 0; ++} ++ ++static inline int ++get_netobj(char **ptr, const char *end, struct xdr_netobj *res) ++{ ++ char *p, *q; ++ p = *ptr; ++ if (get_bytes(&p, end, &res->len, sizeof(res->len))) ++ return -1; ++ q = p + res->len; ++ if(res->len == 0) ++ goto out_nocopy; ++ if (q > end || q < p) ++ return -1; ++ if (!(res->data = kmalloc(res->len, GFP_KERNEL))) ++ return -1; ++ memcpy(res->data, p, res->len); ++out_nocopy: ++ *ptr = q; ++ return 0; ++} ++ ++static inline int ++get_key(char **p, char *end, struct crypto_tfm **res, int *resalg) ++{ ++ struct xdr_netobj key = { ++ .len = 0, ++ .data = NULL, ++ }; ++ int alg_mode,setkey = 0; ++ char *alg_name; ++ ++ if (get_bytes(p, end, resalg, sizeof(int))) ++ goto out_err; ++ if ((get_netobj(p, end, &key))) ++ goto out_err; ++ ++ switch (*resalg) { ++ case NID_des_cbc: ++ alg_name = "des"; ++ alg_mode = CRYPTO_TFM_MODE_CBC; ++ setkey = 1; ++ break; ++ case NID_md5: ++ if (key.len == 0) { ++ dprintk("RPC: 
SPKM3 get_key: NID_md5 zero Key length\n"); ++ } ++ alg_name = "md5"; ++ alg_mode = 0; ++ setkey = 0; ++ break; ++ case NID_cast5_cbc: ++ dprintk("RPC: SPKM3 get_key: case cast5_cbc, UNSUPPORTED \n"); ++ goto out_err; ++ break; ++ default: ++ dprintk("RPC: SPKM3 get_key: unsupported algorithm %d", *resalg); ++ goto out_err_free_key; ++ } ++ if (!(*res = crypto_alloc_tfm(alg_name, alg_mode))) ++ goto out_err_free_key; ++ if (setkey) { ++ if (crypto_cipher_setkey(*res, key.data, key.len)) ++ goto out_err_free_tfm; ++ } ++ ++ if(key.len > 0) ++ kfree(key.data); ++ return 0; ++ ++out_err_free_tfm: ++ crypto_free_tfm(*res); ++out_err_free_key: ++ if(key.len > 0) ++ kfree(key.data); ++out_err: ++ return -1; ++} ++ ++static u32 ++gss_import_sec_context_spkm3(struct xdr_netobj *inbuf, ++ struct gss_ctx *ctx_id) ++{ ++ char *p = inbuf->data; ++ char *end = inbuf->data + inbuf->len; ++ struct spkm3_ctx *ctx; ++ ++ if (!(ctx = kmalloc(sizeof(*ctx), GFP_KERNEL))) ++ goto out_err; ++ memset(ctx, 0, sizeof(*ctx)); ++ ++ if (get_netobj(&p, end, &ctx->ctx_id)) ++ goto out_err_free_ctx; ++ ++ if (get_bytes(&p, end, &ctx->qop, sizeof(ctx->qop))) ++ goto out_err_free_ctx_id; ++ ++ if (get_netobj(&p, end, &ctx->mech_used)) ++ goto out_err_free_mech; ++ ++ if (get_bytes(&p, end, &ctx->ret_flags, sizeof(ctx->ret_flags))) ++ goto out_err_free_mech; ++ ++ if (get_bytes(&p, end, &ctx->req_flags, sizeof(ctx->req_flags))) ++ goto out_err_free_mech; ++ ++ if (get_netobj(&p, end, &ctx->share_key)) ++ goto out_err_free_s_key; ++ ++ if (get_key(&p, end, &ctx->derived_conf_key, &ctx->conf_alg)) { ++ dprintk("RPC: SPKM3 confidentiality key will be NULL\n"); ++ } ++ ++ if (get_key(&p, end, &ctx->derived_integ_key, &ctx->intg_alg)) { ++ dprintk("RPC: SPKM3 integrity key will be NULL\n"); ++ } ++ ++ if (get_bytes(&p, end, &ctx->owf_alg, sizeof(ctx->owf_alg))) ++ goto out_err_free_s_key; ++ ++ if (get_bytes(&p, end, &ctx->owf_alg, sizeof(ctx->owf_alg))) ++ goto out_err_free_s_key; ++ ++ if (p != end) 
++ goto out_err_free_s_key; ++ ++ ctx_id->internal_ctx_id = ctx; ++ ++ dprintk("Succesfully imported new spkm context.\n"); ++ return 0; ++ ++out_err_free_s_key: ++ kfree(ctx->share_key.data); ++out_err_free_mech: ++ kfree(ctx->mech_used.data); ++out_err_free_ctx_id: ++ kfree(ctx->ctx_id.data); ++out_err_free_ctx: ++ kfree(ctx); ++out_err: ++ return GSS_S_FAILURE; ++} ++ ++void ++gss_delete_sec_context_spkm3(void *internal_ctx) { ++ struct spkm3_ctx *sctx = internal_ctx; ++ ++ if(sctx->derived_integ_key) ++ crypto_free_tfm(sctx->derived_integ_key); ++ if(sctx->derived_conf_key) ++ crypto_free_tfm(sctx->derived_conf_key); ++ if(sctx->share_key.data) ++ kfree(sctx->share_key.data); ++ if(sctx->mech_used.data) ++ kfree(sctx->mech_used.data); ++ kfree(sctx); ++} ++ ++u32 ++gss_verify_mic_spkm3(struct gss_ctx *ctx, ++ struct xdr_buf *signbuf, ++ struct xdr_netobj *checksum, ++ u32 *qstate) { ++ u32 maj_stat = 0; ++ int qop_state = 0; ++ struct spkm3_ctx *sctx = ctx->internal_ctx_id; ++ ++ dprintk("RPC: gss_verify_mic_spkm3 calling spkm3_read_token\n"); ++ maj_stat = spkm3_read_token(sctx, checksum, signbuf, &qop_state, ++ SPKM_MIC_TOK); ++ ++ if (!maj_stat && qop_state) ++ *qstate = qop_state; ++ ++ dprintk("RPC: gss_verify_mic_spkm3 returning %d\n", maj_stat); ++ return maj_stat; ++} ++ ++u32 ++gss_get_mic_spkm3(struct gss_ctx *ctx, ++ u32 qop, ++ struct xdr_buf *message_buffer, ++ struct xdr_netobj *message_token) { ++ u32 err = 0; ++ struct spkm3_ctx *sctx = ctx->internal_ctx_id; ++ ++ dprintk("RPC: gss_get_mic_spkm3\n"); ++ ++ err = spkm3_make_token(sctx, qop, message_buffer, ++ message_token, SPKM_MIC_TOK); ++ return err; ++} ++ ++static struct gss_api_ops gss_spkm3_ops = { ++ .gss_import_sec_context = gss_import_sec_context_spkm3, ++ .gss_get_mic = gss_get_mic_spkm3, ++ .gss_verify_mic = gss_verify_mic_spkm3, ++ .gss_delete_sec_context = gss_delete_sec_context_spkm3, ++}; ++ ++static struct pf_desc gss_spkm3_pfs[] = { ++ {RPC_AUTH_GSS_SPKM, 0, RPC_GSS_SVC_NONE, 
"spkm3"}, ++ {RPC_AUTH_GSS_SPKMI, 0, RPC_GSS_SVC_INTEGRITY, "spkm3i"}, ++}; ++ ++static struct gss_api_mech gss_spkm3_mech = { ++ .gm_name = "spkm3", ++ .gm_owner = THIS_MODULE, ++ .gm_ops = &gss_spkm3_ops, ++ .gm_pf_num = ARRAY_SIZE(gss_spkm3_pfs), ++ .gm_pfs = gss_spkm3_pfs, ++}; ++ ++static int __init init_spkm3_module(void) ++{ ++ int status; ++ ++ status = gss_mech_register(&gss_spkm3_mech); ++ if (status) ++ printk("Failed to register spkm3 gss mechanism!\n"); ++ return 0; ++} ++ ++static void __exit cleanup_spkm3_module(void) ++{ ++ gss_mech_unregister(&gss_spkm3_mech); ++} ++ ++MODULE_LICENSE("GPL"); ++module_init(init_spkm3_module); ++module_exit(cleanup_spkm3_module); +--- linux-2.6.7/net/sunrpc/auth_gss/gss_krb5_crypto.c.lsec 2004-06-15 23:18:55.000000000 -0600 ++++ linux-2.6.7/net/sunrpc/auth_gss/gss_krb5_crypto.c 2005-03-23 14:28:24.840273400 -0700 +@@ -139,17 +139,91 @@ buf_to_sg(struct scatterlist *sg, char * + sg->length = len; + } + ++static int ++process_xdr_buf(struct xdr_buf *buf, int offset, int len, ++ int (*actor)(struct scatterlist *, void *), void *data) ++{ ++ int i, page_len, thislen, page_offset, ret = 0; ++ struct scatterlist sg[1]; ++ ++ if (offset >= buf->head[0].iov_len) { ++ offset -= buf->head[0].iov_len; ++ } else { ++ thislen = buf->head[0].iov_len - offset; ++ if (thislen > len) ++ thislen = len; ++ buf_to_sg(sg, buf->head[0].iov_base + offset, thislen); ++ ret = actor(sg, data); ++ if (ret) ++ goto out; ++ offset = 0; ++ len -= thislen; ++ } ++ if (len == 0) ++ goto out; ++ ++ if (offset >= buf->page_len) { ++ offset -= buf->page_len; ++ } else { ++ page_len = buf->page_len - offset; ++ if (page_len > len) ++ page_len = len; ++ len -= page_len; ++ page_offset = (offset + buf->page_base) & (PAGE_CACHE_SIZE - 1); ++ i = (offset + buf->page_base) >> PAGE_CACHE_SHIFT; ++ thislen = PAGE_CACHE_SIZE - page_offset; ++ do { ++ if (thislen > page_len) ++ thislen = page_len; ++ sg->page = buf->pages[i]; ++ sg->offset = page_offset; ++ 
sg->length = thislen; ++ ret = actor(sg, data); ++ if (ret) ++ goto out; ++ page_len -= thislen; ++ i++; ++ page_offset = 0; ++ thislen = PAGE_CACHE_SIZE; ++ } while (page_len != 0); ++ offset = 0; ++ } ++ if (len == 0) ++ goto out; ++ ++ if (offset < buf->tail[0].iov_len) { ++ thislen = buf->tail[0].iov_len - offset; ++ if (thislen > len) ++ thislen = len; ++ buf_to_sg(sg, buf->tail[0].iov_base + offset, thislen); ++ ret = actor(sg, data); ++ len -= thislen; ++ } ++ if (len != 0) ++ ret = -EINVAL; ++out: ++ return ret; ++} ++ ++static int ++checksummer(struct scatterlist *sg, void *data) ++{ ++ struct crypto_tfm *tfm = (struct crypto_tfm *)data; ++ ++ crypto_digest_update(tfm, sg, 1); ++ ++ return 0; ++} ++ + /* checksum the plaintext data and hdrlen bytes of the token header */ + s32 + make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body, +- struct xdr_netobj *cksum) ++ int body_offset, struct xdr_netobj *cksum) + { + char *cksumname; + struct crypto_tfm *tfm = NULL; /* XXX add to ctx? */ + struct scatterlist sg[1]; + u32 code = GSS_S_FAILURE; +- int len, thislen, offset; +- int i; + + switch (cksumtype) { + case CKSUMTYPE_RSA_MD5: +@@ -169,35 +243,8 @@ make_checksum(s32 cksumtype, char *heade + crypto_digest_init(tfm); + buf_to_sg(sg, header, hdrlen); + crypto_digest_update(tfm, sg, 1); +- if (body->head[0].iov_len) { +- buf_to_sg(sg, body->head[0].iov_base, body->head[0].iov_len); +- crypto_digest_update(tfm, sg, 1); +- } +- +- len = body->page_len; +- if (len != 0) { +- offset = body->page_base & (PAGE_CACHE_SIZE - 1); +- i = body->page_base >> PAGE_CACHE_SHIFT; +- thislen = PAGE_CACHE_SIZE - offset; +- do { +- if (thislen > len) +- thislen = len; +- sg->page = body->pages[i]; +- sg->offset = offset; +- sg->length = thislen; +- kmap(sg->page); /* XXX kmap_atomic? 
*/ +- crypto_digest_update(tfm, sg, 1); +- kunmap(sg->page); +- len -= thislen; +- i++; +- offset = 0; +- thislen = PAGE_CACHE_SIZE; +- } while(len != 0); +- } +- if (body->tail[0].iov_len) { +- buf_to_sg(sg, body->tail[0].iov_base, body->tail[0].iov_len); +- crypto_digest_update(tfm, sg, 1); +- } ++ process_xdr_buf(body, body_offset, body->len - body_offset, ++ checksummer, tfm); + crypto_digest_final(tfm, cksum->data); + code = 0; + out: +@@ -207,3 +254,154 @@ out: + } + + EXPORT_SYMBOL(make_checksum); ++ ++struct encryptor_desc { ++ u8 iv[8]; /* XXX hard-coded blocksize */ ++ struct crypto_tfm *tfm; ++ int pos; ++ struct xdr_buf *outbuf; ++ struct page **pages; ++ struct scatterlist infrags[4]; ++ struct scatterlist outfrags[4]; ++ int fragno; ++ int fraglen; ++}; ++ ++static int ++encryptor(struct scatterlist *sg, void *data) ++{ ++ struct encryptor_desc *desc = data; ++ struct xdr_buf *outbuf = desc->outbuf; ++ struct page *in_page; ++ int thislen = desc->fraglen + sg->length; ++ int fraglen, ret; ++ int page_pos; ++ ++ /* Worst case is 4 fragments: head, end of page 1, start ++ * of page 2, tail. Anything more is a bug. 
*/ ++ BUG_ON(desc->fragno > 3); ++ desc->infrags[desc->fragno] = *sg; ++ desc->outfrags[desc->fragno] = *sg; ++ ++ page_pos = desc->pos - outbuf->head[0].iov_len; ++ if (page_pos >= 0 && page_pos < outbuf->page_len) { ++ /* pages are not in place: */ ++ int i = (page_pos + outbuf->page_base) >> PAGE_CACHE_SHIFT; ++ in_page = desc->pages[i]; ++ } else { ++ in_page = sg->page; ++ } ++ desc->infrags[desc->fragno].page = in_page; ++ desc->fragno++; ++ desc->fraglen += sg->length; ++ desc->pos += sg->length; ++ ++ fraglen = thislen & 7; /* XXX hardcoded blocksize */ ++ thislen -= fraglen; ++ ++ if (thislen == 0) ++ return 0; ++ ++ ret = crypto_cipher_encrypt_iv(desc->tfm, desc->outfrags, desc->infrags, ++ thislen, desc->iv); ++ if (ret) ++ return ret; ++ if (fraglen) { ++ desc->outfrags[0].page = sg->page; ++ desc->outfrags[0].offset = sg->offset + sg->length - fraglen; ++ desc->outfrags[0].length = fraglen; ++ desc->infrags[0] = desc->outfrags[0]; ++ desc->infrags[0].page = in_page; ++ desc->fragno = 1; ++ desc->fraglen = fraglen; ++ } else { ++ desc->fragno = 0; ++ desc->fraglen = 0; ++ } ++ return 0; ++} ++ ++int ++gss_encrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *buf, int offset, ++ struct page **pages) ++{ ++ int ret; ++ struct encryptor_desc desc; ++ ++ BUG_ON((buf->len - offset) % crypto_tfm_alg_blocksize(tfm) != 0); ++ ++ memset(desc.iv, 0, sizeof(desc.iv)); ++ desc.tfm = tfm; ++ desc.pos = offset; ++ desc.outbuf = buf; ++ desc.pages = pages; ++ desc.fragno = 0; ++ desc.fraglen = 0; ++ ++ ret = process_xdr_buf(buf, offset, buf->len - offset, encryptor, &desc); ++ return ret; ++} ++ ++EXPORT_SYMBOL(gss_encrypt_xdr_buf); ++ ++struct decryptor_desc { ++ u8 iv[8]; /* XXX hard-coded blocksize */ ++ struct crypto_tfm *tfm; ++ struct scatterlist frags[4]; ++ int fragno; ++ int fraglen; ++}; ++ ++static int ++decryptor(struct scatterlist *sg, void *data) ++{ ++ struct decryptor_desc *desc = data; ++ int thislen = desc->fraglen + sg->length; ++ int fraglen, 
ret; ++ ++ /* Worst case is 4 fragments: head, end of page 1, start ++ * of page 2, tail. Anything more is a bug. */ ++ BUG_ON(desc->fragno > 3); ++ desc->frags[desc->fragno] = *sg; ++ desc->fragno++; ++ desc->fraglen += sg->length; ++ ++ fraglen = thislen & 7; /* XXX hardcoded blocksize */ ++ thislen -= fraglen; ++ ++ if (thislen == 0) ++ return 0; ++ ++ ret = crypto_cipher_decrypt_iv(desc->tfm, desc->frags, desc->frags, ++ thislen, desc->iv); ++ if (ret) ++ return ret; ++ if (fraglen) { ++ desc->frags[0].page = sg->page; ++ desc->frags[0].offset = sg->offset + sg->length - fraglen; ++ desc->frags[0].length = fraglen; ++ desc->fragno = 1; ++ desc->fraglen = fraglen; ++ } else { ++ desc->fragno = 0; ++ desc->fraglen = 0; ++ } ++ return 0; ++} ++ ++int ++gss_decrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *buf, int offset) ++{ ++ struct decryptor_desc desc; ++ ++ /* XXXJBF: */ ++ BUG_ON((buf->len - offset) % crypto_tfm_alg_blocksize(tfm) != 0); ++ ++ memset(desc.iv, 0, sizeof(desc.iv)); ++ desc.tfm = tfm; ++ desc.fragno = 0; ++ desc.fraglen = 0; ++ return process_xdr_buf(buf, offset, buf->len - offset, decryptor, &desc); ++} ++ ++EXPORT_SYMBOL(gss_decrypt_xdr_buf); +--- linux-2.6.7/net/sunrpc/auth_gss/gss_spkm3_seal.c.lsec 2005-03-23 14:28:24.239364752 -0700 ++++ linux-2.6.7/net/sunrpc/auth_gss/gss_spkm3_seal.c 2005-03-23 14:28:24.238364904 -0700 +@@ -0,0 +1,132 @@ ++/* ++ * linux/net/sunrpc/gss_spkm3_seal.c ++ * ++ * Copyright (c) 2003 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. 
Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef RPC_DEBUG ++# define RPCDBG_FACILITY RPCDBG_AUTH ++#endif ++ ++/* ++ * spkm3_make_token() ++ * ++ * Only SPKM_MIC_TOK with md5 intg-alg is supported ++ */ ++ ++u32 ++spkm3_make_token(struct spkm3_ctx *ctx, int qop_req, ++ struct xdr_buf * text, struct xdr_netobj * token, ++ int toktype) ++{ ++ s32 checksum_type; ++ char tokhdrbuf[25]; ++ struct xdr_netobj md5cksum = {.len = 0, .data = NULL}; ++ struct xdr_netobj mic_hdr = {.len = 0, .data = tokhdrbuf}; ++ int tmsglen, tokenlen = 0; ++ unsigned char *ptr; ++ s32 now; ++ int ctxelen = 0, ctxzbit = 0; ++ int md5elen = 0, md5zbit = 0; ++ ++ dprintk("RPC: spkm3_make_token\n"); ++ ++ now = jiffies; ++ if (qop_req != 0) ++ goto out_err; ++ ++ if (ctx->ctx_id.len != 16) { ++ dprintk("RPC: spkm3_make_token BAD ctx_id.len %d\n", ++ ctx->ctx_id.len); ++ goto out_err; ++ } ++ ++ switch (ctx->intg_alg) { ++ case NID_md5: ++ checksum_type = CKSUMTYPE_RSA_MD5; ++ break; ++ default: ++ dprintk("RPC: gss_spkm3_seal: ctx->signalg %d not" ++ " supported\n", ctx->intg_alg); ++ goto out_err; ++ } ++ /* XXX since we don't support WRAP, perhaps we don't care... 
*/ ++ if (ctx->conf_alg != NID_cast5_cbc) { ++ dprintk("RPC: gss_spkm3_seal: ctx->sealalg %d not supported\n", ++ ctx->conf_alg); ++ goto out_err; ++ } ++ ++ if (toktype == SPKM_MIC_TOK) { ++ tmsglen = 0; ++ /* Calculate checksum over the mic-header */ ++ asn1_bitstring_len(&ctx->ctx_id, &ctxelen, &ctxzbit); ++ spkm3_mic_header(&mic_hdr.data, &mic_hdr.len, ctx->ctx_id.data, ++ ctxelen, ctxzbit); ++ ++ if (make_checksum(checksum_type, mic_hdr.data, mic_hdr.len, ++ text, &md5cksum)) ++ goto out_err; ++ ++ asn1_bitstring_len(&md5cksum, &md5elen, &md5zbit); ++ tokenlen = 10 + ctxelen + 1 + 2 + md5elen + 1; ++ ++ /* Create token header using generic routines */ ++ token->len = g_token_size(&ctx->mech_used, tokenlen + tmsglen); ++ ++ ptr = token->data; ++ g_make_token_header(&ctx->mech_used, tokenlen + tmsglen, &ptr); ++ ++ spkm3_make_mic_token(&ptr, tokenlen, &mic_hdr, &md5cksum, md5elen, md5zbit); ++ } else if (toktype == SPKM_WRAP_TOK) { /* Not Supported */ ++ dprintk("RPC: gss_spkm3_seal: SPKM_WRAP_TOK not supported\n"); ++ goto out_err; ++ } ++ kfree(md5cksum.data); ++ ++ /* XXX need to implement sequence numbers, and ctx->expired */ ++ ++ return GSS_S_COMPLETE; ++out_err: ++ if (md5cksum.data) ++ kfree(md5cksum.data); ++ token->data = 0; ++ token->len = 0; ++ return GSS_S_FAILURE; ++} +--- linux-2.6.7/net/sunrpc/auth_gss/svcauth_gss.c.lsec 2004-06-15 23:19:22.000000000 -0600 ++++ linux-2.6.7/net/sunrpc/auth_gss/svcauth_gss.c 2005-03-23 14:28:24.405339520 -0700 +@@ -37,6 +37,7 @@ + * + */ + ++#include + #include + #include + #include +@@ -78,7 +79,6 @@ struct rsi { + + static struct cache_head *rsi_table[RSI_HASHMAX]; + static struct cache_detail rsi_cache; +-static struct rsi *rsi_lookup(struct rsi *item, int set); + + static void rsi_free(struct rsi *rsii) + { +@@ -125,38 +125,6 @@ static inline int dup_netobj(struct xdr_ + return dup_to_netobj(dst, src->data, src->len); + } + +-static inline void rsi_init(struct rsi *new, struct rsi *item) +-{ +- 
new->out_handle.data = NULL; +- new->out_handle.len = 0; +- new->out_token.data = NULL; +- new->out_token.len = 0; +- new->in_handle.len = item->in_handle.len; +- item->in_handle.len = 0; +- new->in_token.len = item->in_token.len; +- item->in_token.len = 0; +- new->in_handle.data = item->in_handle.data; +- item->in_handle.data = NULL; +- new->in_token.data = item->in_token.data; +- item->in_token.data = NULL; +-} +- +-static inline void rsi_update(struct rsi *new, struct rsi *item) +-{ +- BUG_ON(new->out_handle.data || new->out_token.data); +- new->out_handle.len = item->out_handle.len; +- item->out_handle.len = 0; +- new->out_token.len = item->out_token.len; +- item->out_token.len = 0; +- new->out_handle.data = item->out_handle.data; +- item->out_handle.data = NULL; +- new->out_token.data = item->out_token.data; +- item->out_token.data = NULL; +- +- new->major_status = item->major_status; +- new->minor_status = item->minor_status; +-} +- + static void rsi_request(struct cache_detail *cd, + struct cache_head *h, + char **bpp, int *blen) +@@ -168,6 +136,75 @@ static void rsi_request(struct cache_det + (*bpp)[-1] = '\n'; + } + ++static inline int ++gssd_reply(struct rsi *item) ++{ ++ struct rsi *tmp; ++ struct cache_head **hp, **head; ++ ++ head = &rsi_cache.hash_table[rsi_hash(item)]; ++ write_lock(&rsi_cache.hash_lock); ++ for (hp = head; *hp != NULL; hp = &tmp->h.next) { ++ tmp = container_of(*hp, struct rsi, h); ++ if (rsi_match(tmp, item)) { ++ cache_get(&tmp->h); ++ clear_bit(CACHE_HASHED, &tmp->h.flags); ++ *hp = tmp->h.next; ++ tmp->h.next = NULL; ++ rsi_cache.entries--; ++ if (test_bit(CACHE_VALID, &tmp->h.flags)) { ++ write_unlock(&rsi_cache.hash_lock); ++ rsi_put(&tmp->h, &rsi_cache); ++ return -EINVAL; ++ } ++ set_bit(CACHE_HASHED, &item->h.flags); ++ item->h.next = *hp; ++ *hp = &item->h; ++ rsi_cache.entries++; ++ set_bit(CACHE_VALID, &item->h.flags); ++ item->h.last_refresh = get_seconds(); ++ write_unlock(&rsi_cache.hash_lock); ++ 
cache_fresh(&rsi_cache, &tmp->h, 0); ++ rsi_put(&tmp->h, &rsi_cache); ++ return 0; ++ } ++ } ++ write_unlock(&rsi_cache.hash_lock); ++ return -EINVAL; ++} ++ ++static inline struct rsi * ++gssd_upcall(struct rsi *item, struct svc_rqst *rqstp) ++{ ++ struct rsi *tmp; ++ struct cache_head **hp, **head; ++ ++ head = &rsi_cache.hash_table[rsi_hash(item)]; ++ read_lock(&rsi_cache.hash_lock); ++ for (hp = head; *hp != NULL; hp = &tmp->h.next) { ++ tmp = container_of(*hp, struct rsi, h); ++ if (rsi_match(tmp, item)) { ++ if (!test_bit(CACHE_VALID, &tmp->h.flags)) { ++ read_unlock(&rsi_cache.hash_lock); ++ return NULL; ++ } ++ *hp = tmp->h.next; ++ tmp->h.next = NULL; ++ rsi_cache.entries--; ++ read_unlock(&rsi_cache.hash_lock); ++ return tmp; ++ } ++ } ++ cache_get(&item->h); ++ item->h.next = *head; ++ *head = &item->h; ++ rsi_cache.entries++; ++ read_unlock(&rsi_cache.hash_lock); ++ cache_get(&item->h); ++ if (cache_check(&rsi_cache, &item->h, &rqstp->rq_chandle)) ++ return NULL; ++ return item; ++} + + static int rsi_parse(struct cache_detail *cd, + char *mesg, int mlen) +@@ -176,17 +213,22 @@ static int rsi_parse(struct cache_detail + char *buf = mesg; + char *ep; + int len; +- struct rsi rsii, *rsip = NULL; ++ struct rsi *rsii; + time_t expiry; + int status = -EINVAL; + +- memset(&rsii, 0, sizeof(rsii)); ++ rsii = kmalloc(sizeof(*rsii), GFP_KERNEL); ++ if (!rsii) ++ return -ENOMEM; ++ memset(rsii, 0, sizeof(*rsii)); ++ cache_init(&rsii->h); ++ + /* handle */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + status = -ENOMEM; +- if (dup_to_netobj(&rsii.in_handle, buf, len)) ++ if (dup_to_netobj(&rsii->in_handle, buf, len)) + goto out; + + /* token */ +@@ -195,10 +237,9 @@ static int rsi_parse(struct cache_detail + if (len < 0) + goto out; + status = -ENOMEM; +- if (dup_to_netobj(&rsii.in_token, buf, len)) ++ if (dup_to_netobj(&rsii->in_token, buf, len)) + goto out; + +- rsii.h.flags = 0; + /* expiry */ + expiry = get_expiry(&mesg); + status = -EINVAL; 
+@@ -212,13 +253,13 @@ static int rsi_parse(struct cache_detail + if (len == 0) { + goto out; + } else { +- rsii.major_status = simple_strtoul(buf, &ep, 10); ++ rsii->major_status = simple_strtoul(buf, &ep, 10); + if (*ep) + goto out; + len = qword_get(&mesg, buf, mlen); + if (len <= 0) + goto out; +- rsii.minor_status = simple_strtoul(buf, &ep, 10); ++ rsii->minor_status = simple_strtoul(buf, &ep, 10); + if (*ep) + goto out; + +@@ -227,7 +268,7 @@ static int rsi_parse(struct cache_detail + if (len < 0) + goto out; + status = -ENOMEM; +- if (dup_to_netobj(&rsii.out_handle, buf, len)) ++ if (dup_to_netobj(&rsii->out_handle, buf, len)) + goto out; + + /* out_token */ +@@ -236,16 +277,14 @@ static int rsi_parse(struct cache_detail + if (len < 0) + goto out; + status = -ENOMEM; +- if (dup_to_netobj(&rsii.out_token, buf, len)) ++ if (dup_to_netobj(&rsii->out_token, buf, len)) + goto out; + } +- rsii.h.expiry_time = expiry; +- rsip = rsi_lookup(&rsii, 1); +- status = 0; ++ rsii->h.expiry_time = expiry; ++ status = gssd_reply(rsii); + out: +- rsi_free(&rsii); +- if (rsip) +- rsi_put(&rsip->h, &rsi_cache); ++ if (rsii) ++ rsi_put(&rsii->h, &rsi_cache); + return status; + } + +@@ -258,8 +297,6 @@ static struct cache_detail rsi_cache = { + .cache_parse = rsi_parse, + }; + +-static DefineSimpleCacheLookup(rsi, 0) +- + /* + * The rpcsec_context cache is used to store a context that is + * used in data exchange. 
+@@ -292,7 +329,6 @@ struct rsc { + + static struct cache_head *rsc_table[RSC_HASHMAX]; + static struct cache_detail rsc_cache; +-static struct rsc *rsc_lookup(struct rsc *item, int set); + + static void rsc_free(struct rsc *rsci) + { +@@ -325,26 +361,44 @@ rsc_match(struct rsc *new, struct rsc *t + return netobj_equal(&new->handle, &tmp->handle); + } + +-static inline void +-rsc_init(struct rsc *new, struct rsc *tmp) ++static struct rsc *rsc_lookup(struct rsc *item, int set) + { +- new->handle.len = tmp->handle.len; +- tmp->handle.len = 0; +- new->handle.data = tmp->handle.data; +- tmp->handle.data = NULL; +- new->mechctx = NULL; +- new->cred.cr_group_info = NULL; +-} +- +-static inline void +-rsc_update(struct rsc *new, struct rsc *tmp) +-{ +- new->mechctx = tmp->mechctx; +- tmp->mechctx = NULL; +- memset(&new->seqdata, 0, sizeof(new->seqdata)); +- spin_lock_init(&new->seqdata.sd_lock); +- new->cred = tmp->cred; +- tmp->cred.cr_group_info = NULL; ++ struct rsc *tmp = NULL; ++ struct cache_head **hp, **head; ++ head = &rsc_cache.hash_table[rsc_hash(item)]; ++ ++ if (set) ++ write_lock(&rsc_cache.hash_lock); ++ else ++ read_lock(&rsc_cache.hash_lock); ++ for (hp = head; *hp != NULL; hp = &tmp->h.next) { ++ tmp = container_of(*hp, struct rsc, h); ++ if (!rsc_match(tmp, item)) ++ continue; ++ cache_get(&tmp->h); ++ if (!set) ++ goto out_noset; ++ *hp = tmp->h.next; ++ tmp->h.next = NULL; ++ clear_bit(CACHE_HASHED, &tmp->h.flags); ++ rsc_put(&tmp->h, &rsc_cache); ++ goto out_set; ++ } ++ /* Didn't find anything */ ++ if (!set) ++ goto out_noset; ++ rsc_cache.entries++; ++out_set: ++ set_bit(CACHE_HASHED, &item->h.flags); ++ item->h.next = *head; ++ *head = &item->h; ++ write_unlock(&rsc_cache.hash_lock); ++ cache_fresh(&rsc_cache, &item->h, item->h.expiry_time); ++ cache_get(&item->h); ++ return item; ++out_noset: ++ read_unlock(&rsc_cache.hash_lock); ++ return tmp; + } + + static int rsc_parse(struct cache_detail *cd, +@@ -353,19 +407,22 @@ static int 
rsc_parse(struct cache_detail + /* contexthandle expiry [ uid gid N mechname ...mechdata... ] */ + char *buf = mesg; + int len, rv; +- struct rsc rsci, *rscp = NULL; ++ struct rsc *rsci, *res = NULL; + time_t expiry; + int status = -EINVAL; + +- memset(&rsci, 0, sizeof(rsci)); ++ rsci = kmalloc(sizeof(*rsci), GFP_KERNEL); ++ if (!rsci) ++ return -ENOMEM; ++ memset(rsci, 0, sizeof(*rsci)); ++ cache_init(&rsci->h); + /* context handle */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) goto out; + status = -ENOMEM; +- if (dup_to_netobj(&rsci.handle, buf, len)) ++ if (dup_to_netobj(&rsci->handle, buf, len)) + goto out; + +- rsci.h.flags = 0; + /* expiry */ + expiry = get_expiry(&mesg); + status = -EINVAL; +@@ -373,26 +430,26 @@ static int rsc_parse(struct cache_detail + goto out; + + /* uid, or NEGATIVE */ +- rv = get_int(&mesg, &rsci.cred.cr_uid); ++ rv = get_int(&mesg, &rsci->cred.cr_uid); + if (rv == -EINVAL) + goto out; + if (rv == -ENOENT) +- set_bit(CACHE_NEGATIVE, &rsci.h.flags); ++ set_bit(CACHE_NEGATIVE, &rsci->h.flags); + else { + int N, i; + struct gss_api_mech *gm; + struct xdr_netobj tmp_buf; + + /* gid */ +- if (get_int(&mesg, &rsci.cred.cr_gid)) ++ if (get_int(&mesg, &rsci->cred.cr_gid)) + goto out; + + /* number of additional gid's */ + if (get_int(&mesg, &N)) + goto out; + status = -ENOMEM; +- rsci.cred.cr_group_info = groups_alloc(N); +- if (rsci.cred.cr_group_info == NULL) ++ rsci->cred.cr_group_info = groups_alloc(N); ++ if (rsci->cred.cr_group_info == NULL) + goto out; + + /* gid's */ +@@ -401,7 +458,7 @@ static int rsc_parse(struct cache_detail + gid_t gid; + if (get_int(&mesg, &gid)) + goto out; +- GROUP_AT(rsci.cred.cr_group_info, i) = gid; ++ GROUP_AT(rsci->cred.cr_group_info, i) = gid; + } + + /* mech name */ +@@ -422,19 +479,21 @@ static int rsc_parse(struct cache_detail + } + tmp_buf.len = len; + tmp_buf.data = buf; +- if (gss_import_sec_context(&tmp_buf, gm, &rsci.mechctx)) { ++ if (gss_import_sec_context(&tmp_buf, gm, &rsci->mechctx)) 
{ + gss_mech_put(gm); + goto out; + } + gss_mech_put(gm); + } +- rsci.h.expiry_time = expiry; +- rscp = rsc_lookup(&rsci, 1); ++ rsci->h.expiry_time = expiry; ++ spin_lock_init(&rsci->seqdata.sd_lock); ++ res = rsc_lookup(rsci, 1); ++ rsc_put(&res->h, &rsc_cache); ++ rsci = NULL; + status = 0; + out: +- rsc_free(&rsci); +- if (rscp) +- rsc_put(&rscp->h, &rsc_cache); ++ if (rsci) ++ rsc_put(&rsci->h, &rsc_cache); + return status; + } + +@@ -446,19 +505,14 @@ static struct cache_detail rsc_cache = { + .cache_parse = rsc_parse, + }; + +-static DefineSimpleCacheLookup(rsc, 0); +- + struct rsc * + gss_svc_searchbyctx(struct xdr_netobj *handle) + { + struct rsc rsci; + struct rsc *found; + +- memset(&rsci, 0, sizeof(rsci)); +- if (dup_to_netobj(&rsci.handle, handle->data, handle->len)) +- return NULL; ++ rsci.handle = *handle; + found = rsc_lookup(&rsci, 0); +- rsc_free(&rsci); + if (!found) + return NULL; + if (cache_check(&rsc_cache, &found->h, NULL)) +@@ -643,7 +697,6 @@ svcauth_gss_register_pseudoflavor(u32 ps + if (!new) + goto out; + cache_init(&new->h.h); +- atomic_inc(&new->h.h.refcnt); + new->h.name = kmalloc(strlen(name) + 1, GFP_KERNEL); + if (!new->h.name) + goto out_free_dom; +@@ -651,7 +704,6 @@ svcauth_gss_register_pseudoflavor(u32 ps + new->h.flavour = RPC_AUTH_GSS; + new->pseudoflavor = pseudoflavor; + new->h.h.expiry_time = NEVER; +- new->h.h.flags = 0; + + test = auth_domain_lookup(&new->h, 1); + if (test == &new->h) { +@@ -723,6 +775,45 @@ out: + return stat; + } + ++static int ++unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx) ++{ ++ int stat = -EINVAL; ++ int out_offset; ++ u32 * lenp; ++ u32 priv_len, maj_stat; ++ int saved_len; ++ ++ lenp = buf->head[0].iov_base; ++ priv_len = ntohl(svc_getu32(&buf->head[0])); ++ if (priv_len > buf->len) /* XXXJBF: wrong check */ ++ goto out; ++ /* XXXJBF: bizarre hack: to handle revisits (and not decrypt ++ * twice), the first time through we write an offset ++ * telling 
us where to skip to find the already-decrypted data */ ++ if (rqstp->rq_deferred) { ++ buf->head[0].iov_base += priv_len; ++ buf->head[0].iov_len -= priv_len; ++ return 0; ++ } ++ saved_len = buf->len; /* XXX HACK */ ++ buf->len = priv_len; ++ maj_stat = gss_unwrap(ctx, GSS_C_QOP_DEFAULT, 0, buf, &out_offset); ++ buf->len = saved_len; ++ buf->head[0].iov_base += out_offset; ++ buf->head[0].iov_len -= out_offset; ++ BUG_ON(buf->head[0].iov_len <= 0); ++ if (maj_stat != GSS_S_COMPLETE) ++ goto out; ++ if (ntohl(svc_getu32(&buf->head[0])) != seq) ++ goto out; ++ /* XXXJBF: see "bizarre hack", above. */ ++ *lenp = htonl(out_offset + 4); ++ stat = 0; ++out: ++ return stat; ++} ++ + struct gss_svc_data { + /* decoded gss client cred: */ + struct rpc_gss_wire_cred clcred; +@@ -750,7 +841,7 @@ svcauth_gss_accept(struct svc_rqst *rqst + struct gss_svc_data *svcdata = rqstp->rq_auth_data; + struct rpc_gss_wire_cred *gc; + struct rsc *rsci = NULL; +- struct rsi *rsip, rsikey; ++ struct rsi *rsip, *rsikey = NULL; + u32 *rpcstart; + u32 *reject_stat = resv->iov_base + resv->iov_len; + int ret; +@@ -843,30 +934,23 @@ svcauth_gss_accept(struct svc_rqst *rqst + *authp = rpc_autherr_badcred; + if (gc->gc_proc == RPC_GSS_PROC_INIT && gc->gc_ctx.len != 0) + goto auth_err; +- memset(&rsikey, 0, sizeof(rsikey)); +- if (dup_netobj(&rsikey.in_handle, &gc->gc_ctx)) ++ rsikey = kmalloc(sizeof(*rsikey), GFP_KERNEL); ++ if (!rsikey) ++ goto drop; ++ memset(rsikey, 0, sizeof(*rsikey)); ++ cache_init(&rsikey->h); ++ if (dup_netobj(&rsikey->in_handle, &gc->gc_ctx)) + goto drop; + *authp = rpc_autherr_badverf; +- if (svc_safe_getnetobj(argv, &tmpobj)) { +- kfree(rsikey.in_handle.data); ++ if (svc_safe_getnetobj(argv, &tmpobj)) + goto auth_err; +- } +- if (dup_netobj(&rsikey.in_token, &tmpobj)) { +- kfree(rsikey.in_handle.data); ++ if (dup_netobj(&rsikey->in_token, &tmpobj)) + goto drop; +- } + +- rsip = rsi_lookup(&rsikey, 0); +- rsi_free(&rsikey); +- if (!rsip) { +- goto drop; +- } +- 
switch(cache_check(&rsi_cache, &rsip->h, &rqstp->rq_chandle)) { +- case -EAGAIN: ++ rsip = gssd_upcall(rsikey, rqstp); ++ if (!rsip) + goto drop; +- case -ENOENT: +- goto drop; +- case 0: ++ else { + rsci = gss_svc_searchbyctx(&rsip->out_handle); + if (!rsci) { + goto drop; +@@ -921,7 +1005,16 @@ svcauth_gss_accept(struct svc_rqst *rqst + svc_putu32(resv, 0); + break; + case RPC_GSS_SVC_PRIVACY: +- /* currently unsupported */ ++ if (unwrap_priv_data(rqstp, &rqstp->rq_arg, ++ gc->gc_seq, rsci->mechctx)) ++ goto auth_err; ++ svcdata->rsci = rsci; ++ cache_get(&rsci->h); ++ /* placeholders for length and seq. number: */ ++ svcdata->body_start = resv->iov_base + resv->iov_len; ++ svc_putu32(resv, 0); ++ svc_putu32(resv, 0); ++ break; + default: + goto auth_err; + } +@@ -939,13 +1032,15 @@ complete: + drop: + ret = SVC_DROP; + out: ++ if (rsikey) ++ rsi_put(&rsikey->h, &rsi_cache); + if (rsci) + rsc_put(&rsci->h, &rsc_cache); + return ret; + } + +-static int +-svcauth_gss_release(struct svc_rqst *rqstp) ++static inline int ++svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp) + { + struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data; + struct rpc_gss_wire_cred *gc = &gsd->clcred; +@@ -957,6 +1052,156 @@ svcauth_gss_release(struct svc_rqst *rqs + int integ_offset, integ_len; + int stat = -EINVAL; + ++ p = gsd->body_start; ++ gsd->body_start = 0; ++ /* move accept_stat to right place: */ ++ memcpy(p, p + 2, 4); ++ /* Don't wrap in failure case: */ ++ /* Counting on not getting here if call was not even accepted! 
*/ ++ if (*p != rpc_success) { ++ resbuf->head[0].iov_len -= 2 * 4; ++ goto out; ++ } ++ p++; ++ integ_offset = (u8 *)(p + 1) - (u8 *)resbuf->head[0].iov_base; ++ integ_len = resbuf->len - integ_offset; ++ BUG_ON(integ_len % 4); ++ *p++ = htonl(integ_len); ++ *p++ = htonl(gc->gc_seq); ++ if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset, ++ integ_len)) ++ BUG(); ++ if (resbuf->page_len == 0 ++ && resbuf->tail[0].iov_len + RPC_MAX_AUTH_SIZE ++ < PAGE_SIZE) { ++ BUG_ON(resbuf->tail[0].iov_len); ++ /* Use head for everything */ ++ resv = &resbuf->head[0]; ++ } else if (resbuf->tail[0].iov_base == NULL) { ++ /* copied from nfsd4_encode_read */ ++ svc_take_page(rqstp); ++ resbuf->tail[0].iov_base = page_address(rqstp ++ ->rq_respages[rqstp->rq_resused-1]); ++ rqstp->rq_restailpage = rqstp->rq_resused-1; ++ resbuf->tail[0].iov_len = 0; ++ resv = &resbuf->tail[0]; ++ } else { ++ resv = &resbuf->tail[0]; ++ } ++ mic.data = (u8 *)resv->iov_base + resv->iov_len + 4; ++ if (gss_get_mic(gsd->rsci->mechctx, 0, &integ_buf, &mic)) ++ goto out_err; ++ svc_putu32(resv, htonl(mic.len)); ++ memset(mic.data + mic.len, 0, ++ round_up_to_quad(mic.len) - mic.len); ++ resv->iov_len += XDR_QUADLEN(mic.len) << 2; ++ /* not strictly required: */ ++ resbuf->len += XDR_QUADLEN(mic.len) << 2; ++ BUG_ON(resv->iov_len > PAGE_SIZE); ++out: ++ stat = 0; ++out_err: ++ return stat; ++} ++ ++/* XXXJBF: Look for chances to share code with client */ ++/* XXXJBF: Do we need to preallocate these pages somehow? E.g. see ++ * buffer size calculations in svcsock.c */ ++/* XXXJBF: how does reference counting on pages work? 
*/ ++static struct page ** ++svc_alloc_enc_pages(struct xdr_buf *buf) ++{ ++ struct page **ret; ++ int last, i; ++ ++ if (buf->page_len == 0) ++ return NULL; ++ BUG_ON(buf->page_base >> PAGE_CACHE_SHIFT); ++ last = (buf->page_base + buf->page_len - 1) >> PAGE_CACHE_SHIFT; ++ ret = kmalloc((last + 1) * sizeof(struct page *), GFP_KERNEL); ++ if (!ret) ++ goto out; ++ for (i = 0; i<= last; i++) { ++ ret[i] = alloc_page(GFP_KERNEL); ++ if (ret[i] == NULL) ++ goto out_free; ++ } ++out: ++ return ret; ++out_free: ++ for (i--; i >= 0; i--) { ++ __free_page(ret[i]); ++ } ++ return NULL; ++} ++ ++static inline int ++svcauth_gss_wrap_resp_priv(struct svc_rqst *rqstp) ++{ ++ struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data; ++ struct rpc_gss_wire_cred *gc = &gsd->clcred; ++ struct xdr_buf *resbuf = &rqstp->rq_res; ++ struct page **inpages; ++ u32 *p; ++ int offset, *len; ++ int pad; ++ int stat = -EINVAL; ++ ++ p = gsd->body_start; ++ gsd->body_start = 0; ++ /* move accept_stat to right place: */ ++ memcpy(p, p + 2, 4); ++ /* Don't wrap in failure case: */ ++ /* Counting on not getting here if call was not even accepted! */ ++ if (*p != rpc_success) { ++ resbuf->head[0].iov_len -= 2 * 4; ++ goto out; ++ } ++ p++; ++ len = p++; ++ offset = (u8 *)p - (u8 *)resbuf->head[0].iov_base; ++ *p++ = htonl(gc->gc_seq); ++ stat = -ENOMEM; ++ inpages = resbuf->pages; ++ /* XXXJBF: huge memory leaks here: allocated pages probably aren't ++ * freed, and neither is memory used to hold page array. */ ++ resbuf->pages = svc_alloc_enc_pages(resbuf); ++ if (resbuf->page_len && !resbuf->pages) ++ goto out_err; /* XXX sleep and retry? Reserve ahead of time ++ and BUG_ON? 
*/ ++ if (resbuf->tail[0].iov_len == 0 || resbuf->tail[0].iov_base == NULL) { ++ /* copied from nfsd4_encode_read */ ++ {int i = svc_take_page(rqstp); BUG_ON(i); } ++ resbuf->tail[0].iov_base = page_address(rqstp ++ ->rq_respages[rqstp->rq_resused-1]); ++ rqstp->rq_restailpage = rqstp->rq_resused-1; ++ resbuf->tail[0].iov_len = 0; ++ } ++ /* XXX: Will svc code attempt to free stuff in xdr_buf->pages? ++ * Or can we leave it in any old state on error?? */ ++ stat = -EINVAL; ++ if (gss_wrap(gsd->rsci->mechctx, GSS_C_QOP_DEFAULT, offset, ++ resbuf, inpages)) ++ goto out_err; ++ *len = htonl(resbuf->len - offset); ++ pad = 3 - ((resbuf->len - offset - 1)&3); ++ p = (u32 *)(resbuf->tail[0].iov_base + resbuf->tail[0].iov_len); ++ memset(p, 0, pad); ++ resbuf->tail[0].iov_len += pad; ++out: ++ return 0; ++out_err: ++ return stat; ++} ++ ++static int ++svcauth_gss_release(struct svc_rqst *rqstp) ++{ ++ struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data; ++ struct rpc_gss_wire_cred *gc = &gsd->clcred; ++ struct xdr_buf *resbuf = &rqstp->rq_res; ++ int stat = -EINVAL; ++ + if (gc->gc_proc != RPC_GSS_PROC_DATA) + goto out; + /* Release can be called twice, but we only wrap once. */ +@@ -969,55 +1214,15 @@ svcauth_gss_release(struct svc_rqst *rqs + case RPC_GSS_SVC_NONE: + break; + case RPC_GSS_SVC_INTEGRITY: +- p = gsd->body_start; +- gsd->body_start = 0; +- /* move accept_stat to right place: */ +- memcpy(p, p + 2, 4); +- /* don't wrap in failure case: */ +- /* Note: counting on not getting here if call was not even +- * accepted! 
*/ +- if (*p != rpc_success) { +- resbuf->head[0].iov_len -= 2 * 4; +- goto out; +- } +- p++; +- integ_offset = (u8 *)(p + 1) - (u8 *)resbuf->head[0].iov_base; +- integ_len = resbuf->len - integ_offset; +- BUG_ON(integ_len % 4); +- *p++ = htonl(integ_len); +- *p++ = htonl(gc->gc_seq); +- if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset, +- integ_len)) +- BUG(); +- if (resbuf->page_len == 0 +- && resbuf->tail[0].iov_len + RPC_MAX_AUTH_SIZE +- < PAGE_SIZE) { +- BUG_ON(resbuf->tail[0].iov_len); +- /* Use head for everything */ +- resv = &resbuf->head[0]; +- } else if (resbuf->tail[0].iov_base == NULL) { +- /* copied from nfsd4_encode_read */ +- svc_take_page(rqstp); +- resbuf->tail[0].iov_base = page_address(rqstp +- ->rq_respages[rqstp->rq_resused-1]); +- rqstp->rq_restailpage = rqstp->rq_resused-1; +- resbuf->tail[0].iov_len = 0; +- resv = &resbuf->tail[0]; +- } else { +- resv = &resbuf->tail[0]; +- } +- mic.data = (u8 *)resv->iov_base + resv->iov_len + 4; +- if (gss_get_mic(gsd->rsci->mechctx, 0, &integ_buf, &mic)) ++ stat = svcauth_gss_wrap_resp_integ(rqstp); ++ if (stat) + goto out_err; +- svc_putu32(resv, htonl(mic.len)); +- memset(mic.data + mic.len, 0, +- round_up_to_quad(mic.len) - mic.len); +- resv->iov_len += XDR_QUADLEN(mic.len) << 2; +- /* not strictly required: */ +- resbuf->len += XDR_QUADLEN(mic.len) << 2; +- BUG_ON(resv->iov_len > PAGE_SIZE); + break; + case RPC_GSS_SVC_PRIVACY: ++ stat = svcauth_gss_wrap_resp_priv(rqstp); ++ if (stat) ++ goto out_err; ++ break; + default: + goto out_err; + } +--- linux-2.6.7/net/sunrpc/auth_gss/gss_krb5_wrap.c.lsec 2005-03-23 14:28:24.900264280 -0700 ++++ linux-2.6.7/net/sunrpc/auth_gss/gss_krb5_wrap.c 2005-03-23 14:28:24.900264280 -0700 +@@ -0,0 +1,337 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef RPC_DEBUG ++# define RPCDBG_FACILITY RPCDBG_AUTH ++#endif ++ ++static inline int ++gss_krb5_padding(int blocksize, int length) ++{ ++ /* Most of the code is 
block-size independent but currently we ++ * use only 8: */ ++ BUG_ON(blocksize != 8); ++ return 8 - (length & 7); ++} ++ ++static inline void ++gss_krb5_add_padding(struct xdr_buf *buf, int offset, int blocksize) ++{ ++ int padding = gss_krb5_padding(blocksize, buf->len - offset); ++ char *p; ++ struct iovec *iov; ++ ++ if (buf->page_len || buf->tail[0].iov_len) ++ iov = &buf->tail[0]; ++ else ++ iov = &buf->head[0]; ++ p = iov->iov_base + iov->iov_len; ++ iov->iov_len += padding; ++ buf->len += padding; ++ memset(p, padding, padding); ++} ++ ++static inline int ++gss_krb5_remove_padding(struct xdr_buf *buf, int blocksize) ++{ ++ u8 *ptr; ++ u8 pad; ++ int len = buf->len; ++ ++ if (len <= buf->head[0].iov_len) { ++ pad = *(u8 *)(buf->head[0].iov_base + len - 1); ++ goto out; ++ } else ++ len -= buf->head[0].iov_len; ++ if (len <= buf->page_len) { ++ int last = (buf->page_base + len - 1) ++ >>PAGE_CACHE_SHIFT; ++ int offset = (buf->page_base + len - 1) ++ & (PAGE_CACHE_SIZE - 1); ++ ptr = kmap_atomic(buf->pages[last], KM_SKB_SUNRPC_DATA); ++ pad = *(ptr + offset); ++ kunmap_atomic(ptr, KM_SKB_SUNRPC_DATA); ++ goto out; ++ } else ++ len -= buf->page_len; ++ BUG_ON(len > buf->tail[0].iov_len); ++ pad = *(u8 *)(buf->tail[0].iov_base + len - 1); ++out: ++ if (pad > blocksize) ++ return -EINVAL; ++ buf->len -= pad; ++ return 0; ++} ++ ++static inline void ++make_confounder(char *p, int blocksize) ++{ ++ /* XXX? Is this OK to do on every packet? */ ++ get_random_bytes(p, blocksize); ++} ++ ++/* Assumptions: the head and tail of inbuf are ours to play with. ++ * The pages, however, may be real pages in the page cache and we replace ++ * them with scratch pages from **pages before writing to them. */ ++/* XXX: obviously the above should be documentation of wrap interface, ++ * and shouldn't be in this kerberos-specific file. */ ++ ++/* XXX factor out common code with seal/unseal. 
*/ ++ ++u32 ++gss_wrap_kerberos(struct gss_ctx *ctx, u32 qop, int offset, ++ struct xdr_buf *buf, struct page **pages) ++{ ++ struct krb5_ctx *kctx = ctx->internal_ctx_id; ++ s32 checksum_type; ++ struct xdr_netobj md5cksum = {.len = 0, .data = NULL}; ++ int blocksize = 0, plainlen; ++ unsigned char *ptr, *krb5_hdr, *msg_start; ++ s32 now; ++ int headlen; ++ struct page **tmp_pages; ++ u32 seq_send; ++ ++ dprintk("RPC: gss_wrap_kerberos\n"); ++ ++ now = get_seconds(); ++ ++ if (qop != 0) ++ goto out_err; ++ ++ switch (kctx->signalg) { ++ case SGN_ALG_DES_MAC_MD5: ++ checksum_type = CKSUMTYPE_RSA_MD5; ++ break; ++ default: ++ dprintk("RPC: gss_krb5_seal: kctx->signalg %d not" ++ " supported\n", kctx->signalg); ++ goto out_err; ++ } ++ if (kctx->sealalg != SEAL_ALG_NONE && kctx->sealalg != SEAL_ALG_DES) { ++ dprintk("RPC: gss_krb5_seal: kctx->sealalg %d not supported\n", ++ kctx->sealalg); ++ goto out_err; ++ } ++ ++ blocksize = crypto_tfm_alg_blocksize(kctx->enc); ++ gss_krb5_add_padding(buf, offset, blocksize); ++ BUG_ON((buf->len - offset) % blocksize); ++ plainlen = blocksize + buf->len - offset; ++ ++ headlen = g_token_size(&kctx->mech_used, 22 + plainlen) - ++ (buf->len - offset); ++ ++ ptr = buf->head[0].iov_base + offset; ++ /* shift data to make room for header. */ ++ /* XXX Would be cleverer to encrypt while copying. */ ++ /* XXX bounds checking, slack, etc. 
*/ ++ memmove(ptr + headlen, ptr, buf->head[0].iov_len - offset); ++ buf->head[0].iov_len += headlen; ++ buf->len += headlen; ++ BUG_ON((buf->len - offset - headlen) % blocksize); ++ ++ g_make_token_header(&kctx->mech_used, 22 + plainlen, &ptr); ++ ++ ++ *ptr++ = (unsigned char) ((KG_TOK_WRAP_MSG>>8)&0xff); ++ *ptr++ = (unsigned char) (KG_TOK_WRAP_MSG&0xff); ++ ++ /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */ ++ krb5_hdr = ptr - 2; ++ msg_start = krb5_hdr + 24; ++ /* XXXJBF: */ BUG_ON(buf->head[0].iov_base + offset + headlen != msg_start + blocksize); ++ ++ *(u16 *)(krb5_hdr + 2) = htons(kctx->signalg); ++ memset(krb5_hdr + 4, 0xff, 4); ++ *(u16 *)(krb5_hdr + 4) = htons(kctx->sealalg); ++ ++ make_confounder(msg_start, blocksize); ++ ++ /* XXXJBF: UGH!: */ ++ tmp_pages = buf->pages; ++ buf->pages = pages; ++ if (make_checksum(checksum_type, krb5_hdr, 8, buf, ++ offset + headlen - blocksize, &md5cksum)) ++ goto out_err; ++ buf->pages = tmp_pages; ++ ++ switch (kctx->signalg) { ++ case SGN_ALG_DES_MAC_MD5: ++ if (krb5_encrypt(kctx->seq, NULL, md5cksum.data, ++ md5cksum.data, md5cksum.len)) ++ goto out_err; ++ memcpy(krb5_hdr + 16, ++ md5cksum.data + md5cksum.len - KRB5_CKSUM_LENGTH, ++ KRB5_CKSUM_LENGTH); ++ ++ dprintk("RPC: make_seal_token: cksum data: \n"); ++ print_hexl((u32 *) (krb5_hdr + 16), KRB5_CKSUM_LENGTH, 0); ++ break; ++ default: ++ BUG(); ++ } ++ ++ kfree(md5cksum.data); ++ ++ spin_lock(&krb5_seq_lock); ++ seq_send = kctx->seq_send++; ++ spin_unlock(&krb5_seq_lock); ++ ++ /* XXX would probably be more efficient to compute checksum ++ * and encrypt at the same time: */ ++ if ((krb5_make_seq_num(kctx->seq, kctx->initiate ? 0 : 0xff, ++ seq_send, krb5_hdr + 16, krb5_hdr + 8))) ++ goto out_err; ++ ++ if (gss_encrypt_xdr_buf(kctx->enc, buf, offset + headlen - blocksize, ++ pages)) ++ goto out_err; ++ ++ return ((kctx->endtime < now) ? 
GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE); ++out_err: ++ if (md5cksum.data) kfree(md5cksum.data); ++ return GSS_S_FAILURE; ++} ++ ++u32 ++gss_unwrap_kerberos(struct gss_ctx *ctx, u32 *qop, int offset, ++ struct xdr_buf *buf, int *out_offset) ++{ ++ struct krb5_ctx *kctx = ctx->internal_ctx_id; ++ int signalg; ++ int sealalg; ++ s32 checksum_type; ++ struct xdr_netobj md5cksum = {.len = 0, .data = NULL}; ++ s32 now; ++ int direction; ++ s32 seqnum; ++ unsigned char *ptr; ++ int bodysize; ++ u32 ret = GSS_S_DEFECTIVE_TOKEN; ++ u8 *data_start; ++ int blocksize; ++ ++ dprintk("RPC: gss_unwrap_kerberos\n"); ++ ++ ptr = (u8 *)buf->head[0].iov_base + offset; ++ if (g_verify_token_header(&kctx->mech_used, &bodysize, &ptr, ++ buf->len - offset)) ++ goto out; ++ ++ if ((*ptr++ != ((KG_TOK_WRAP_MSG>>8)&0xff)) || ++ (*ptr++ != (KG_TOK_WRAP_MSG &0xff)) ) ++ goto out; ++ ++ /* XXX sanity-check bodysize?? */ ++ ++ /* get the sign and seal algorithms */ ++ ++ signalg = ptr[0] + (ptr[1] << 8); ++ sealalg = ptr[2] + (ptr[3] << 8); ++ ++ /* Sanity checks */ ++ ++ if ((ptr[4] != 0xff) || (ptr[5] != 0xff)) ++ goto out; ++ ++ if (sealalg == 0xffff) ++ goto out; ++ ++ /* in the current spec, there is only one valid seal algorithm per ++ key type, so a simple comparison is ok */ ++ ++ if (sealalg != kctx->sealalg) ++ goto out; ++ ++ /* there are several mappings of seal algorithms to sign algorithms, ++ but few enough that we can try them all. 
*/ ++ ++ if ((kctx->sealalg == SEAL_ALG_NONE && signalg > 1) || ++ (kctx->sealalg == SEAL_ALG_1 && signalg != SGN_ALG_3) || ++ (kctx->sealalg == SEAL_ALG_DES3KD && ++ signalg != SGN_ALG_HMAC_SHA1_DES3_KD)) ++ goto out; ++ ++ if (gss_decrypt_xdr_buf(kctx->enc, buf, ++ ptr + 22 - (unsigned char *)buf->head[0].iov_base)) ++ goto out; ++ ++ /* compute the checksum of the message */ ++ ++ /* initialize the the cksum */ ++ switch (signalg) { ++ case SGN_ALG_DES_MAC_MD5: ++ checksum_type = CKSUMTYPE_RSA_MD5; ++ break; ++ default: ++ ret = GSS_S_DEFECTIVE_TOKEN; ++ goto out; ++ } ++ ++ switch (signalg) { ++ case SGN_ALG_DES_MAC_MD5: ++ ret = make_checksum(checksum_type, ptr - 2, 8, buf, ++ ptr + 22 - (unsigned char *)buf->head[0].iov_base, &md5cksum); ++ if (ret) ++ goto out; ++ ++ ret = krb5_encrypt(kctx->seq, NULL, md5cksum.data, ++ md5cksum.data, md5cksum.len); ++ if (ret) ++ goto out; ++ ++ if (memcmp(md5cksum.data + 8, ptr + 14, 8)) { ++ ret = GSS_S_BAD_SIG; ++ goto out; ++ } ++ break; ++ default: ++ ret = GSS_S_DEFECTIVE_TOKEN; ++ goto out; ++ } ++ ++ /* it got through unscathed. Make sure the context is unexpired */ ++ ++ if (qop) ++ *qop = GSS_C_QOP_DEFAULT; ++ ++ now = get_seconds(); ++ ++ ret = GSS_S_CONTEXT_EXPIRED; ++ if (now > kctx->endtime) ++ goto out; ++ ++ /* do sequencing checks */ ++ ++ ret = GSS_S_BAD_SIG; ++ if ((ret = krb5_get_seq_num(kctx->seq, ptr + 14, ptr + 6, &direction, ++ &seqnum))) ++ goto out; ++ ++ if ((kctx->initiate && direction != 0xff) || ++ (!kctx->initiate && direction != 0)) ++ goto out; ++ ++ /* Copy the data back to the right position. XXX: Would probably be ++ * better to copy and encrypt at the same time. 
*/ ++ ++ blocksize = crypto_tfm_alg_blocksize(kctx->enc); ++ data_start = ptr + 22 + blocksize; ++ *out_offset = data_start - (u8 *)buf->head[0].iov_base; ++ ++ ret = GSS_S_DEFECTIVE_TOKEN; ++ if (gss_krb5_remove_padding(buf, blocksize)) ++ goto out; ++ ++ ret = GSS_S_COMPLETE; ++out: ++ if (md5cksum.data) kfree(md5cksum.data); ++ return ret; ++} +--- linux-2.6.7/net/sunrpc/auth_gss/gss_mech_switch.c.lsec 2004-06-15 23:19:37.000000000 -0600 ++++ linux-2.6.7/net/sunrpc/auth_gss/gss_mech_switch.c 2005-03-23 14:28:24.782282216 -0700 +@@ -279,6 +279,29 @@ gss_verify_mic(struct gss_ctx *context_ + qstate); + } + ++u32 ++gss_wrap(struct gss_ctx *ctx_id, ++ u32 qop, ++ int offset, ++ struct xdr_buf *buf, ++ struct page **inpages) ++{ ++ return ctx_id->mech_type->gm_ops ++ ->gss_wrap(ctx_id, qop, offset, buf, inpages); ++} ++ ++u32 ++gss_unwrap(struct gss_ctx *ctx_id, ++ u32 *qop, ++ int offset, ++ struct xdr_buf *buf, ++ int *out_offset) ++{ ++ return ctx_id->mech_type->gm_ops ++ ->gss_unwrap(ctx_id, qop, offset, buf, out_offset); ++} ++ ++ + /* gss_delete_sec_context: free all resources associated with context_handle. + * Note this differs from the RFC 2744-specified prototype in that we don't + * bother returning an output token, since it would never be used anyway. */ +--- linux-2.6.7/net/sunrpc/auth_gss/gss_krb5_mech.c.lsec 2004-06-15 23:19:42.000000000 -0600 ++++ linux-2.6.7/net/sunrpc/auth_gss/gss_krb5_mech.c 2005-03-23 14:28:24.841273248 -0700 +@@ -182,6 +182,7 @@ gss_delete_sec_context_kerberos(void *in + kfree(kctx); + } + ++/* XXX the following wrappers have become pointless; kill them. 
*/ + static u32 + gss_verify_mic_kerberos(struct gss_ctx *ctx, + struct xdr_buf *message, +@@ -191,8 +192,7 @@ gss_verify_mic_kerberos(struct gss_ctx + int qop_state; + struct krb5_ctx *kctx = ctx->internal_ctx_id; + +- maj_stat = krb5_read_token(kctx, mic_token, message, &qop_state, +- KG_TOK_MIC_MSG); ++ maj_stat = krb5_read_token(kctx, mic_token, message, &qop_state); + if (!maj_stat && qop_state) + *qstate = qop_state; + +@@ -208,7 +208,7 @@ gss_get_mic_kerberos(struct gss_ctx *ctx + u32 err = 0; + struct krb5_ctx *kctx = ctx->internal_ctx_id; + +- err = krb5_make_token(kctx, qop, message, mic_token, KG_TOK_MIC_MSG); ++ err = krb5_make_token(kctx, qop, message, mic_token); + + dprintk("RPC: gss_get_mic_kerberos returning %d\n",err); + +@@ -219,6 +219,8 @@ static struct gss_api_ops gss_kerberos_o + .gss_import_sec_context = gss_import_sec_context_kerberos, + .gss_get_mic = gss_get_mic_kerberos, + .gss_verify_mic = gss_verify_mic_kerberos, ++ .gss_wrap = gss_wrap_kerberos, ++ .gss_unwrap = gss_unwrap_kerberos, + .gss_delete_sec_context = gss_delete_sec_context_kerberos, + }; + +@@ -233,6 +235,11 @@ static struct pf_desc gss_kerberos_pfs[] + .service = RPC_GSS_SVC_INTEGRITY, + .name = "krb5i", + }, ++ [2] = { ++ .pseudoflavor = RPC_AUTH_GSS_KRB5P, ++ .service = RPC_GSS_SVC_PRIVACY, ++ .name = "krb5p", ++ }, + }; + + static struct gss_api_mech gss_kerberos_mech = { +--- linux-2.6.7/net/sunrpc/auth_gss/gss_krb5_seal.c.lsec 2004-06-15 23:18:37.000000000 -0600 ++++ linux-2.6.7/net/sunrpc/auth_gss/gss_krb5_seal.c 2005-03-23 14:28:24.898264584 -0700 +@@ -70,24 +70,17 @@ + # define RPCDBG_FACILITY RPCDBG_AUTH + #endif + +-static inline int +-gss_krb5_padding(int blocksize, int length) { +- /* Most of the code is block-size independent but in practice we +- * use only 8: */ +- BUG_ON(blocksize != 8); +- return 8 - (length & 7); +-} ++spinlock_t krb5_seq_lock = SPIN_LOCK_UNLOCKED; + + u32 + krb5_make_token(struct krb5_ctx *ctx, int qop_req, +- struct xdr_buf *text, struct 
xdr_netobj *token, +- int toktype) ++ struct xdr_buf *text, struct xdr_netobj *token) + { + s32 checksum_type; + struct xdr_netobj md5cksum = {.len = 0, .data = NULL}; +- int blocksize = 0, tmsglen; + unsigned char *ptr, *krb5_hdr, *msg_start; + s32 now; ++ u32 seq_send; + + dprintk("RPC: gss_krb5_seal\n"); + +@@ -111,21 +104,13 @@ krb5_make_token(struct krb5_ctx *ctx, in + goto out_err; + } + +- if (toktype == KG_TOK_WRAP_MSG) { +- blocksize = crypto_tfm_alg_blocksize(ctx->enc); +- tmsglen = blocksize + text->len +- + gss_krb5_padding(blocksize, blocksize + text->len); +- } else { +- tmsglen = 0; +- } +- +- token->len = g_token_size(&ctx->mech_used, 22 + tmsglen); ++ token->len = g_token_size(&ctx->mech_used, 22); + + ptr = token->data; +- g_make_token_header(&ctx->mech_used, 22 + tmsglen, &ptr); ++ g_make_token_header(&ctx->mech_used, 22, &ptr); + +- *ptr++ = (unsigned char) ((toktype>>8)&0xff); +- *ptr++ = (unsigned char) (toktype&0xff); ++ *ptr++ = (unsigned char) ((KG_TOK_MIC_MSG>>8)&0xff); ++ *ptr++ = (unsigned char) (KG_TOK_MIC_MSG&0xff); + + /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */ + krb5_hdr = ptr - 2; +@@ -133,17 +118,9 @@ krb5_make_token(struct krb5_ctx *ctx, in + + *(u16 *)(krb5_hdr + 2) = htons(ctx->signalg); + memset(krb5_hdr + 4, 0xff, 4); +- if (toktype == KG_TOK_WRAP_MSG) +- *(u16 *)(krb5_hdr + 4) = htons(ctx->sealalg); + +- if (toktype == KG_TOK_WRAP_MSG) { +- /* XXX removing support for now */ +- goto out_err; +- } else { /* Sign only. */ +- if (make_checksum(checksum_type, krb5_hdr, 8, text, +- &md5cksum)) ++ if (make_checksum(checksum_type, krb5_hdr, 8, text, 0, &md5cksum)) + goto out_err; +- } + + switch (ctx->signalg) { + case SGN_ALG_DES_MAC_MD5: +@@ -163,12 +140,14 @@ krb5_make_token(struct krb5_ctx *ctx, in + + kfree(md5cksum.data); + ++ spin_lock(&krb5_seq_lock); ++ seq_send = ctx->seq_send++; ++ spin_unlock(&krb5_seq_lock); ++ + if ((krb5_make_seq_num(ctx->seq, ctx->initiate ? 
0 : 0xff, +- ctx->seq_send, krb5_hdr + 16, krb5_hdr + 8))) ++ seq_send, krb5_hdr + 16, krb5_hdr + 8))) + goto out_err; + +- ctx->seq_send++; +- + return ((ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE); + out_err: + if (md5cksum.data) kfree(md5cksum.data); +--- linux-2.6.7/net/sunrpc/auth_gss/gss_spkm3_token.c.lsec 2005-03-23 14:28:24.240364600 -0700 ++++ linux-2.6.7/net/sunrpc/auth_gss/gss_spkm3_token.c 2005-03-23 14:28:24.239364752 -0700 +@@ -0,0 +1,266 @@ ++/* ++ * linux/net/sunrpc/gss_spkm3_token.c ++ * ++ * Copyright (c) 2003 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef RPC_DEBUG ++# define RPCDBG_FACILITY RPCDBG_AUTH ++#endif ++ ++/* ++ * asn1_bitstring_len() ++ * ++ * calculate the asn1 bitstring length of the xdr_netobject ++ */ ++void ++asn1_bitstring_len(struct xdr_netobj *in, int *enclen, int *zerobits) ++{ ++ int i, zbit = 0,elen = in->len; ++ char *ptr; ++ ++ ptr = &in->data[in->len -1]; ++ ++ /* count trailing 0's */ ++ for(i = in->len; i > 0; i--) { ++ if (*ptr == 0) { ++ ptr--; ++ elen--; ++ } else ++ break; ++ } ++ ++ /* count number of 0 bits in final octet */ ++ ptr = &in->data[elen - 1]; ++ for(i = 0; i < 8; i++) { ++ short mask = 0x01; ++ ++ if (!((mask << i) & *ptr)) ++ zbit++; ++ else ++ break; ++ } ++ *enclen = elen; ++ *zerobits = zbit; ++} ++ ++/* ++ * decode_asn1_bitstring() ++ * ++ * decode a bitstring into a buffer of the expected length. ++ * enclen = bit string length ++ * explen = expected length (define in rfc) ++ */ ++int ++decode_asn1_bitstring(struct xdr_netobj *out, char *in, int enclen, int explen) ++{ ++ if (!(out->data = kmalloc(explen,GFP_KERNEL))) ++ return 0; ++ out->len = explen; ++ memset(out->data, 0, explen); ++ memcpy(out->data, in, enclen); ++ return 1; ++} ++ ++/* ++ * SPKMInnerContextToken choice SPKM_MIC asn1 token layout ++ * ++ * contextid is always 16 bytes plain data. max asn1 bitstring len = 17. 
++ * ++ * tokenlen = pos[0] to end of token (max pos[45] with MD5 cksum) ++ * ++ * pos value ++ * ---------- ++ * [0] a4 SPKM-MIC tag ++ * [1] ?? innertoken length (max 44) ++ * ++ * ++ * tok_hdr piece of checksum data starts here ++ * ++ * the maximum mic-header len = 9 + 17 = 26 ++ * mic-header ++ * ---------- ++ * [2] 30 SEQUENCE tag ++ * [3] ?? mic-header length: (max 23) = TokenID + ContextID ++ * ++ * TokenID - all fields constant and can be hardcoded ++ * ------- ++ * [4] 02 Type 2 ++ * [5] 02 Length 2 ++ * [6][7] 01 01 TokenID (SPKM_MIC_TOK) ++ * ++ * ContextID - encoded length not constant, calculated ++ * --------- ++ * [8] 03 Type 3 ++ * [9] ?? encoded length ++ * [10] ?? ctxzbit ++ * [11] contextid ++ * ++ * mic_header piece of checksum data ends here. ++ * ++ * int-cksum - encoded length not constant, calculated ++ * --------- ++ * [??] 03 Type 3 ++ * [??] ?? encoded length ++ * [??] ?? md5zbit ++ * [??] int-cksum (NID_md5 = 16) ++ * ++ * maximum SPKM-MIC innercontext token length = ++ * 10 + encoded contextid_size(17 max) + 2 + encoded ++ * cksum_size (17 maxfor NID_md5) = 46 ++ */ ++ ++/* ++ * spkm3_mic_header() ++ * ++ * Prepare the SPKM_MIC_TOK mic-header for check-sum calculation ++ * elen: 16 byte context id asn1 bitstring encoded length ++ */ ++void ++spkm3_mic_header(unsigned char **hdrbuf, unsigned int *hdrlen, unsigned char *ctxdata, int elen, int zbit) ++{ ++ char *hptr = *hdrbuf; ++ char *top = *hdrbuf; ++ ++ *(u8 *)hptr++ = 0x30; ++ *(u8 *)hptr++ = elen + 7; /* on the wire header length */ ++ ++ /* tokenid */ ++ *(u8 *)hptr++ = 0x02; ++ *(u8 *)hptr++ = 0x02; ++ *(u8 *)hptr++ = 0x01; ++ *(u8 *)hptr++ = 0x01; ++ ++ /* coniextid */ ++ *(u8 *)hptr++ = 0x03; ++ *(u8 *)hptr++ = elen + 1; /* add 1 to include zbit */ ++ *(u8 *)hptr++ = zbit; ++ memcpy(hptr, ctxdata, elen); ++ hptr += elen; ++ *hdrlen = hptr - top; ++} ++ ++/* ++ * spkm3_mic_innercontext_token() ++ * ++ * *tokp points to the beginning of the SPKM_MIC token described ++ * in rfc 
2025, section 3.2.1: ++ * ++ */ ++void ++spkm3_make_mic_token(unsigned char **tokp, int toklen, struct xdr_netobj *mic_hdr, struct xdr_netobj *md5cksum, int md5elen, int md5zbit) ++{ ++ unsigned char *ict = *tokp; ++ ++ *(u8 *)ict++ = 0xa4; ++ *(u8 *)ict++ = toklen - 2; ++ memcpy(ict, mic_hdr->data, mic_hdr->len); ++ ict += mic_hdr->len; ++ ++ *(u8 *)ict++ = 0x03; ++ *(u8 *)ict++ = md5elen + 1; /* add 1 to include zbit */ ++ *(u8 *)ict++ = md5zbit; ++ memcpy(ict, md5cksum->data, md5elen); ++} ++ ++u32 ++spkm3_verify_mic_token(unsigned char **tokp, int *mic_hdrlen, unsigned char **cksum) ++{ ++ struct xdr_netobj spkm3_ctx_id = {.len =0, .data = NULL}; ++ unsigned char *ptr = *tokp; ++ int ctxelen; ++ u32 ret = GSS_S_DEFECTIVE_TOKEN; ++ ++ /* spkm3 innercontext token preamble */ ++ if ((ptr[0] != 0xa4) || (ptr[2] != 0x30)) { ++ dprintk("RPC: BAD SPKM ictoken preamble\n"); ++ goto out; ++ } ++ ++ *mic_hdrlen = ptr[3]; ++ ++ /* token type */ ++ if ((ptr[4] != 0x02) || (ptr[5] != 0x02)) { ++ dprintk("RPC: BAD asn1 SPKM3 token type\n"); ++ goto out; ++ } ++ ++ /* only support SPKM_MIC_TOK */ ++ if((ptr[6] != 0x01) || (ptr[7] != 0x01)) { ++ dprintk("RPC: ERROR unsupported SPKM3 token \n"); ++ goto out; ++ } ++ ++ /* contextid */ ++ if (ptr[8] != 0x03) { ++ dprintk("RPC: BAD SPKM3 asn1 context-id type\n"); ++ goto out; ++ } ++ ++ ctxelen = ptr[9]; ++ if (ctxelen > 17) { /* length includes asn1 zbit octet */ ++ dprintk("RPC: BAD SPKM3 contextid len %d\n", ctxelen); ++ goto out; ++ } ++ ++ /* ignore ptr[10] */ ++ ++ if(!decode_asn1_bitstring(&spkm3_ctx_id, &ptr[11], ctxelen - 1, 16)) ++ goto out; ++ ++ /* ++ * in the current implementation: the optional int-alg is not present ++ * so the default int-alg (md5) is used the optional snd-seq field is ++ * also not present ++ */ ++ ++ if (*mic_hdrlen != 6 + ctxelen) { ++ dprintk("RPC: BAD SPKM_ MIC_TOK header len %d: we only support default int-alg (should be absent) and do not support snd-seq\n", *mic_hdrlen); ++ goto out; ++ } 
++ /* checksum */ ++ *cksum = (&ptr[10] + ctxelen); /* ctxelen includes ptr[10] */ ++ ++ ret = GSS_S_COMPLETE; ++out: ++ if (spkm3_ctx_id.data) ++ kfree(spkm3_ctx_id.data); ++ return ret; ++} ++ +--- linux-2.6.7/net/sunrpc/auth_gss/gss_generic_token.c.lsec 2004-06-15 23:19:10.000000000 -0600 ++++ linux-2.6.7/net/sunrpc/auth_gss/gss_generic_token.c 2005-03-23 14:28:23.707445616 -0700 +@@ -179,7 +179,7 @@ EXPORT_SYMBOL(g_make_token_header); + */ + u32 + g_verify_token_header(struct xdr_netobj *mech, int *body_size, +- unsigned char **buf_in, int tok_type, int toksize) ++ unsigned char **buf_in, int toksize) + { + unsigned char *buf = *buf_in; + int seqsize; +--- linux-2.6.7/net/sunrpc/auth_gss/gss_spkm3_unseal.c.lsec 2005-03-23 14:28:24.240364600 -0700 ++++ linux-2.6.7/net/sunrpc/auth_gss/gss_spkm3_unseal.c 2005-03-23 14:28:24.240364600 -0700 +@@ -0,0 +1,128 @@ ++/* ++ * linux/net/sunrpc/gss_spkm3_unseal.c ++ * ++ * Copyright (c) 2003 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef RPC_DEBUG ++# define RPCDBG_FACILITY RPCDBG_AUTH ++#endif ++ ++/* ++ * spkm3_read_token() ++ * ++ * only SPKM_MIC_TOK with md5 intg-alg is supported ++ */ ++u32 ++spkm3_read_token(struct spkm3_ctx *ctx, ++ struct xdr_netobj *read_token, /* checksum */ ++ struct xdr_buf *message_buffer, /* signbuf */ ++ int *qop_state, int toktype) ++{ ++ s32 code; ++ struct xdr_netobj wire_cksum = {.len =0, .data = NULL}; ++ struct xdr_netobj md5cksum = {.len = 0, .data = NULL}; ++ unsigned char *ptr = (unsigned char *)read_token->data; ++ unsigned char *cksum; ++ int bodysize, md5elen; ++ int mic_hdrlen; ++ u32 ret = GSS_S_DEFECTIVE_TOKEN; ++ ++ dprintk("RPC: spkm3_read_token read_token->len %d\n", read_token->len); ++ ++ if (g_verify_token_header((struct xdr_netobj *) &ctx->mech_used, ++ &bodysize, &ptr, read_token->len)) ++ goto out; ++ ++ /* decode the token */ ++ ++ if (toktype == SPKM_MIC_TOK) { ++ ++ if ((ret = spkm3_verify_mic_token(&ptr, &mic_hdrlen, &cksum))) ++ goto out; ++ ++ if (*cksum++ != 0x03) { ++ dprintk("RPC: spkm3_read_token BAD checksum type\n"); ++ goto out; ++ } ++ md5elen = *cksum++; ++ cksum++; /* move past the zbit */ ++ ++ if(!decode_asn1_bitstring(&wire_cksum, cksum, md5elen - 1, 16)) ++ goto out; ++ ++ /* HARD CODED FOR MD5 */ ++ ++ /* compute the checksum of the message. 
++ * ptr + 2 = start of header piece of checksum ++ * mic_hdrlen + 2 = length of header piece of checksum ++ */ ++ ret = GSS_S_DEFECTIVE_TOKEN; ++ code = make_checksum(CKSUMTYPE_RSA_MD5, ptr + 2, ++ mic_hdrlen + 2, ++ message_buffer, &md5cksum); ++ ++ if (code) ++ goto out; ++ ++ dprintk("RPC: spkm3_read_token: digest wire_cksum.len %d:\n", ++ wire_cksum.len); ++ dprintk(" md5cksum.data\n"); ++ print_hexl((u32 *) md5cksum.data, 16, 0); ++ dprintk(" cksum.data:\n"); ++ print_hexl((u32 *) wire_cksum.data, wire_cksum.len, 0); ++ ++ ret = GSS_S_BAD_SIG; ++ code = memcmp(md5cksum.data, wire_cksum.data, wire_cksum.len); ++ if (code) ++ goto out; ++ ++ } else { ++ dprintk("RPC: BAD or UNSUPPORTED SPKM3 token type: %d\n",toktype); ++ goto out; ++ } ++ ++ /* XXX: need to add expiration and sequencing */ ++ ret = GSS_S_COMPLETE; ++out: ++ if (md5cksum.data) ++ kfree(md5cksum.data); ++ if (wire_cksum.data) ++ kfree(wire_cksum.data); ++ return ret; ++} +--- linux-2.6.7/net/sunrpc/auth_gss/Makefile.lsec 2004-06-15 23:19:22.000000000 -0600 ++++ linux-2.6.7/net/sunrpc/auth_gss/Makefile 2005-03-23 14:28:24.294356392 -0700 +@@ -10,5 +10,9 @@ auth_rpcgss-objs := auth_gss.o gss_gener + obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o + + rpcsec_gss_krb5-objs := gss_krb5_mech.o gss_krb5_seal.o gss_krb5_unseal.o \ +- gss_krb5_seqnum.o ++ gss_krb5_seqnum.o gss_krb5_wrap.o + ++obj-$(CONFIG_RPCSEC_GSS_SPKM3) += rpcsec_gss_spkm3.o ++ ++rpcsec_gss_spkm3-objs := gss_spkm3_mech.o gss_spkm3_seal.o gss_spkm3_unseal.o \ ++ gss_spkm3_token.o +--- linux-2.6.7/net/sunrpc/cache.c.lsec 2004-06-15 23:19:36.000000000 -0600 ++++ linux-2.6.7/net/sunrpc/cache.c 2005-03-23 14:28:24.406339368 -0700 +@@ -38,7 +38,7 @@ void cache_init(struct cache_head *h) + time_t now = get_seconds(); + h->next = NULL; + h->flags = 0; +- atomic_set(&h->refcnt, 0); ++ atomic_set(&h->refcnt, 1); + h->expiry_time = now + CACHE_NEW_EXPIRY; + h->last_refresh = now; + } +--- linux-2.6.7/net/sunrpc/svc.c.lsec 2004-06-15 
23:20:03.000000000 -0600 ++++ linux-2.6.7/net/sunrpc/svc.c 2005-03-23 14:28:23.652453976 -0700 +@@ -263,6 +263,7 @@ svc_process(struct svc_serv *serv, struc + u32 *statp; + u32 dir, prog, vers, proc, + auth_stat, rpc_stat; ++ int auth_res; + + rpc_stat = rpc_success; + +@@ -304,12 +305,17 @@ svc_process(struct svc_serv *serv, struc + rqstp->rq_vers = vers = ntohl(svc_getu32(argv)); /* version number */ + rqstp->rq_proc = proc = ntohl(svc_getu32(argv)); /* procedure number */ + ++ progp = serv->sv_program; + /* + * Decode auth data, and add verifier to reply buffer. + * We do this before anything else in order to get a decent + * auth verifier. + */ +- switch (svc_authenticate(rqstp, &auth_stat)) { ++ if (progp->pg_authenticate != NULL) ++ auth_res = progp->pg_authenticate(rqstp, &auth_stat); ++ else ++ auth_res = svc_authenticate(rqstp, &auth_stat); ++ switch (auth_res) { + case SVC_OK: + break; + case SVC_GARBAGE: +@@ -326,7 +332,6 @@ svc_process(struct svc_serv *serv, struc + goto sendit; + } + +- progp = serv->sv_program; + if (prog != progp->pg_prog) + goto err_bad_prog; + +--- linux-2.6.7/net/sunrpc/svcauth.c.lsec 2004-06-15 23:19:44.000000000 -0600 ++++ linux-2.6.7/net/sunrpc/svcauth.c 2005-03-23 14:28:24.407339216 -0700 +@@ -156,25 +156,47 @@ static inline int auth_domain_match(stru + { + return strcmp(tmp->name, item->name) == 0; + } +-DefineCacheLookup(struct auth_domain, +- h, +- auth_domain_lookup, +- (struct auth_domain *item, int set), +- /* no setup */, +- &auth_domain_cache, +- auth_domain_hash(item), +- auth_domain_match(tmp, item), +- kfree(new); if(!set) { +- if (new) +- write_unlock(&auth_domain_cache.hash_lock); +- else +- read_unlock(&auth_domain_cache.hash_lock); +- return NULL; +- } +- new=item; atomic_inc(&new->h.refcnt), +- /* no update */, +- 0 /* no inplace updates */ +- ) ++ ++struct auth_domain * ++auth_domain_lookup(struct auth_domain *item, int set) ++{ ++ struct auth_domain *tmp = NULL; ++ struct cache_head **hp, **head; ++ head = 
&auth_domain_cache.hash_table[auth_domain_hash(item)]; ++ ++ if (set) ++ write_lock(&auth_domain_cache.hash_lock); ++ else ++ read_lock(&auth_domain_cache.hash_lock); ++ for (hp=head; *hp != NULL; hp = &tmp->h.next) { ++ tmp = container_of(*hp, struct auth_domain, h); ++ if (!auth_domain_match(tmp, item)) ++ continue; ++ cache_get(&tmp->h); ++ if (!set) ++ goto out_noset; ++ *hp = tmp->h.next; ++ tmp->h.next = NULL; ++ clear_bit(CACHE_HASHED, &tmp->h.flags); ++ auth_domain_drop(&tmp->h, &auth_domain_cache); ++ goto out_set; ++ } ++ /* Didn't find anything */ ++ if (!set) ++ goto out_noset; ++ auth_domain_cache.entries++; ++out_set: ++ set_bit(CACHE_HASHED, &item->h.flags); ++ item->h.next = *head; ++ *head = &item->h; ++ write_unlock(&auth_domain_cache.hash_lock); ++ cache_fresh(&auth_domain_cache, &item->h, item->h.expiry_time); ++ cache_get(&item->h); ++ return item; ++out_noset: ++ read_unlock(&auth_domain_cache.hash_lock); ++ return tmp; ++} + + struct auth_domain *auth_domain_find(char *name) + { diff --git a/lustre/kernel_patches/patches/vfs-dcache_locking-vanilla-2.6.patch b/lustre/kernel_patches/patches/vfs-dcache_locking-vanilla-2.6.patch new file mode 100644 index 0000000..f99ff70 --- /dev/null +++ b/lustre/kernel_patches/patches/vfs-dcache_locking-vanilla-2.6.patch @@ -0,0 +1,85 @@ +Introduce lock-free versions of d_rehash and d_move. + + fs/dcache.c | 22 ++++++++++++++++++---- + include/linux/dcache.h | 2 ++ + 2 files changed, 20 insertions(+), 4 deletions(-) + +Index: linus-2.6.7-bk5/fs/dcache.c +=================================================================== +--- linus-2.6.7-bk5.orig/fs/dcache.c 2004-06-24 10:39:11.232154728 +0300 ++++ linus-2.6.7-bk5/fs/dcache.c 2004-06-24 10:56:01.043640048 +0300 +@@ -1115,16 +1115,23 @@ + * Adds a dentry to the hash according to its name. 
+ */ + +-void d_rehash(struct dentry * entry) ++void __d_rehash(struct dentry * entry) + { + struct hlist_head *list = d_hash(entry->d_parent, entry->d_name.hash); + +- spin_lock(&dcache_lock); + spin_lock(&entry->d_lock); + entry->d_flags &= ~DCACHE_UNHASHED; + spin_unlock(&entry->d_lock); + entry->d_bucket = list; + hlist_add_head_rcu(&entry->d_hash, list); ++} ++ ++EXPORT_SYMBOL(__d_rehash); ++ ++void d_rehash(struct dentry * entry) ++{ ++ spin_lock(&dcache_lock); ++ __d_rehash(entry); + spin_unlock(&dcache_lock); + } + +@@ -1200,12 +1207,11 @@ + * dcache entries should not be moved in this way. + */ + +-void d_move(struct dentry * dentry, struct dentry * target) ++void __d_move(struct dentry * dentry, struct dentry * target) + { + if (!dentry->d_inode) + printk(KERN_WARNING "VFS: moving negative dcache entry\n"); + +- spin_lock(&dcache_lock); + write_seqlock(&rename_lock); + /* + * XXXX: do we really need to take target->d_lock? +@@ -1257,6 +1263,14 @@ + spin_unlock(&target->d_lock); + spin_unlock(&dentry->d_lock); + write_sequnlock(&rename_lock); ++} ++ ++EXPORT_SYMBOL(__d_move); ++ ++void d_move(struct dentry *dentry, struct dentry *target) ++{ ++ spin_lock(&dcache_lock); ++ __d_move(dentry, target); + spin_unlock(&dcache_lock); + } + +Index: linus-2.6.7-bk5/include/linux/dcache.h +=================================================================== +--- linus-2.6.7-bk5.orig/include/linux/dcache.h 2004-06-24 10:39:29.534372368 +0300 ++++ linus-2.6.7-bk5/include/linux/dcache.h 2004-06-24 10:53:10.319594048 +0300 +@@ -227,6 +227,7 @@ + * This adds the entry to the hash queues. 
+ */ + extern void d_rehash(struct dentry *); ++extern void __d_rehash(struct dentry *); + + /** + * d_add - add dentry to hash queues +@@ -245,6 +246,7 @@ + + /* used for rename() and baskets */ + extern void d_move(struct dentry *, struct dentry *); ++extern void __d_move(struct dentry *, struct dentry *); + + /* appendix may either be NULL or be used for transname suffixes */ + extern struct dentry * d_lookup(struct dentry *, struct qstr *); diff --git a/lustre/kernel_patches/patches/vfs-dcache_lustre_invalid-vanilla-2.6.patch b/lustre/kernel_patches/patches/vfs-dcache_lustre_invalid-vanilla-2.6.patch new file mode 100644 index 0000000..f83b663 --- /dev/null +++ b/lustre/kernel_patches/patches/vfs-dcache_lustre_invalid-vanilla-2.6.patch @@ -0,0 +1,37 @@ +%diffstat + fs/dcache.c | 7 +++++++ + include/linux/dcache.h | 1 + + 2 files changed, 8 insertions(+) + +%patch +Index: linux-2.6.6/fs/dcache.c +=================================================================== +--- linux-2.6.6.orig/fs/dcache.c 2004-05-22 02:11:17.000000000 +0800 ++++ linux-2.6.6/fs/dcache.c 2004-05-22 02:14:46.000000000 +0800 +@@ -217,6 +217,13 @@ int d_invalidate(struct dentry * dentry) + spin_unlock(&dcache_lock); + return 0; + } ++ ++ /* network invalidation by Lustre */ ++ if (dentry->d_flags & DCACHE_LUSTRE_INVALID) { ++ spin_unlock(&dcache_lock); ++ return 0; ++ } ++ + /* + * Check whether to do a partial shrink_dcache + * to get rid of unused child entries. +Index: linux-2.6.6/include/linux/dcache.h +=================================================================== +--- linux-2.6.6.orig/include/linux/dcache.h 2004-05-22 02:10:01.000000000 +0800 ++++ linux-2.6.6/include/linux/dcache.h 2004-05-22 02:15:17.000000000 +0800 +@@ -153,6 +153,7 @@ d_iput: no no yes + + #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. 
*/ + #define DCACHE_UNHASHED 0x0010 ++#define DCACHE_LUSTRE_INVALID 0x0020 /* invalidated by Lustre */ + + extern spinlock_t dcache_lock; + + diff --git a/lustre/kernel_patches/patches/vfs-do_truncate.patch b/lustre/kernel_patches/patches/vfs-do_truncate.patch new file mode 100644 index 0000000..1cfd57b --- /dev/null +++ b/lustre/kernel_patches/patches/vfs-do_truncate.patch @@ -0,0 +1,87 @@ +Index: linux-2.6.6/fs/namei.c +=================================================================== +--- linux-2.6.6.orig/fs/namei.c 2004-05-30 23:17:06.267030976 +0300 ++++ linux-2.6.6/fs/namei.c 2004-05-30 23:23:15.642877312 +0300 +@@ -1270,7 +1270,7 @@ + if (!error) { + DQUOT_INIT(inode); + +- error = do_truncate(dentry, 0); ++ error = do_truncate(dentry, 0, 1); + } + put_write_access(inode); + if (error) +Index: linux-2.6.6/fs/open.c +=================================================================== +--- linux-2.6.6.orig/fs/open.c 2004-05-30 20:05:26.857206992 +0300 ++++ linux-2.6.6/fs/open.c 2004-05-30 23:24:38.908219056 +0300 +@@ -189,7 +189,7 @@ + return error; + } + +-int do_truncate(struct dentry *dentry, loff_t length) ++int do_truncate(struct dentry *dentry, loff_t length, int called_from_open) + { + int err; + struct iattr newattrs; +@@ -202,6 +202,8 @@ + newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; + down(&dentry->d_inode->i_sem); + down_write(&dentry->d_inode->i_alloc_sem); ++ if (called_from_open) ++ newattrs.ia_valid |= ATTR_FROM_OPEN; + err = notify_change(dentry, &newattrs); + up_write(&dentry->d_inode->i_alloc_sem); + up(&dentry->d_inode->i_sem); +@@ -259,7 +261,7 @@ + error = locks_verify_truncate(inode, NULL, length); + if (!error) { + DQUOT_INIT(inode); +- error = do_truncate(nd.dentry, length); ++ error = do_truncate(nd.dentry, length, 0); + } + put_write_access(inode); + +@@ -311,7 +313,7 @@ + + error = locks_verify_truncate(inode, file, length); + if (!error) +- error = do_truncate(dentry, length); ++ error = do_truncate(dentry, length, 0); + out_putf: 
+ fput(file); + out: +Index: linux-2.6.6/fs/exec.c +=================================================================== +--- linux-2.6.6.orig/fs/exec.c 2004-05-30 20:05:26.862206232 +0300 ++++ linux-2.6.6/fs/exec.c 2004-05-30 23:23:15.648876400 +0300 +@@ -1395,7 +1395,7 @@ + goto close_fail; + if (!file->f_op->write) + goto close_fail; +- if (do_truncate(file->f_dentry, 0) != 0) ++ if (do_truncate(file->f_dentry, 0, 0) != 0) + goto close_fail; + + retval = binfmt->core_dump(signr, regs, file); +Index: linux-2.6.6/include/linux/fs.h +=================================================================== +--- linux-2.6.6.orig/include/linux/fs.h 2004-05-30 23:20:11.979798344 +0300 ++++ linux-2.6.6/include/linux/fs.h 2004-05-30 23:25:29.167578472 +0300 +@@ -249,6 +249,7 @@ + #define ATTR_ATTR_FLAG 1024 + #define ATTR_KILL_SUID 2048 + #define ATTR_KILL_SGID 4096 ++#define ATTR_FROM_OPEN 16384 /* called from open path, ie O_TRUNC */ + + /* + * This is the Inode Attributes structure, used for notify_change(). 
It +@@ -1189,7 +1190,7 @@ + + /* fs/open.c */ + +-extern int do_truncate(struct dentry *, loff_t start); ++extern int do_truncate(struct dentry *, loff_t start, int called_from_open); + extern struct file *filp_open(const char *, int, int); + extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); + extern struct file * dentry_open_it(struct dentry *, struct vfsmount *, int, struct lookup_intent *); diff --git a/lustre/kernel_patches/patches/vfs-gns_export_doumount.patch b/lustre/kernel_patches/patches/vfs-gns_export_doumount.patch new file mode 100644 index 0000000..36ae7b4 --- /dev/null +++ b/lustre/kernel_patches/patches/vfs-gns_export_doumount.patch @@ -0,0 +1,34 @@ +Index: linux-2.6.7/fs/namespace.c +=================================================================== +--- linux-2.6.7.orig/fs/namespace.c 2004-11-21 00:25:13.000000000 +0200 ++++ linux-2.6.7/fs/namespace.c 2004-11-21 00:25:15.000000000 +0200 +@@ -360,7 +360,7 @@ + } + } + +-static int do_umount(struct vfsmount *mnt, int flags) ++int do_umount(struct vfsmount *mnt, int flags) + { + struct super_block * sb = mnt->mnt_sb; + int retval; +@@ -434,6 +434,8 @@ + return retval; + } + ++EXPORT_SYMBOL(do_umount); ++ + /* + * Now umount can handle mount points as well as block devices. + * This is important for filesystems which use unnamed block devices. 
+Index: linux-2.6.7/include/linux/mount.h +=================================================================== +--- linux-2.6.7.orig/include/linux/mount.h 2004-11-21 00:25:13.000000000 +0200 ++++ linux-2.6.7/include/linux/mount.h 2005-01-11 15:28:26.627030408 +0200 +@@ -56,6 +56,7 @@ + extern struct vfsmount *alloc_vfsmnt(const char *name); + extern struct vfsmount *do_kern_mount(const char *fstype, int flags, + const char *name, void *data); ++extern int do_umount(struct vfsmount *mnt, int flags); + extern spinlock_t vfsmount_lock; + + #endif diff --git a/lustre/kernel_patches/patches/vfs-intent_api-vanilla-2.6.patch b/lustre/kernel_patches/patches/vfs-intent_api-vanilla-2.6.patch new file mode 100644 index 0000000..49c2938 --- /dev/null +++ b/lustre/kernel_patches/patches/vfs-intent_api-vanilla-2.6.patch @@ -0,0 +1,555 @@ +Index: linus-2.6.7-bk-latest/include/linux/namei.h +=================================================================== +--- linus-2.6.7-bk-latest.orig/include/linux/namei.h 2004-07-07 10:56:34.232378296 +0300 ++++ linus-2.6.7-bk-latest/include/linux/namei.h 2004-07-07 11:41:48.569736296 +0300 +@@ -2,13 +2,40 @@ + #define _LINUX_NAMEI_H + + #include ++#include + + struct vfsmount; + ++/* intent opcodes */ ++#define IT_OPEN (1) ++#define IT_CREAT (1<<1) ++#define IT_READDIR (1<<2) ++#define IT_GETATTR (1<<3) ++#define IT_LOOKUP (1<<4) ++#define IT_UNLINK (1<<5) ++#define IT_TRUNC (1<<6) ++#define IT_GETXATTR (1<<7) ++ ++#define INTENT_MAGIC 0x19620323 ++ + struct open_intent { ++ int magic; ++ int op; ++ void (*op_release)(struct open_intent *); + int flags; + int create_mode; ++ union { ++ void *fs_data; /* FS-specific intent data */ ++ } d; + }; + ++static inline void intent_init(struct open_intent *it, int op) ++{ ++ memset(it, 0, sizeof(*it)); ++ it->magic = INTENT_MAGIC; ++ it->op = op; ++} ++ ++ + struct nameidata { + struct dentry *dentry; +@@ -53,14 +76,22 @@ + #define LOOKUP_ACCESS (0x0400) + + extern int FASTCALL(__user_walk(const char 
__user *, unsigned, struct nameidata *)); ++extern int FASTCALL(__user_walk_it(const char __user *, unsigned, struct nameidata *)); + #define user_path_walk(name,nd) \ + __user_walk(name, LOOKUP_FOLLOW, nd) ++#define user_path_walk_it(name,nd) \ ++ __user_walk_it(name, LOOKUP_FOLLOW, nd) + #define user_path_walk_link(name,nd) \ + __user_walk(name, 0, nd) ++#define user_path_walk_link_it(name,nd) \ ++ __user_walk_it(name, 0, nd) + extern int FASTCALL(path_lookup(const char *, unsigned, struct nameidata *)); ++extern int FASTCALL(path_lookup_it(const char *, unsigned, struct nameidata *)); + extern int FASTCALL(path_walk(const char *, struct nameidata *)); ++extern int FASTCALL(path_walk_it(const char *, struct nameidata *)); + extern int FASTCALL(link_path_walk(const char *, struct nameidata *)); + extern void path_release(struct nameidata *); ++extern void intent_release(struct open_intent *); + + extern struct dentry * lookup_one_len(const char *, struct dentry *, int); + extern struct dentry * lookup_hash(struct qstr *, struct dentry *); +Index: linus-2.6.7-bk-latest/include/linux/fs.h +=================================================================== +--- linus-2.6.7-bk-latest.orig/include/linux/fs.h 2004-07-07 10:56:33.720456120 +0300 ++++ linus-2.6.7-bk-latest/include/linux/fs.h 2004-07-07 11:38:42.864967712 +0300 +@@ -583,6 +583,7 @@ + spinlock_t f_ep_lock; + #endif /* #ifdef CONFIG_EPOLL */ + struct address_space *f_mapping; ++ struct open_intent *f_it; + }; + extern spinlock_t files_lock; + #define file_list_lock() spin_lock(&files_lock); +@@ -1201,6 +1202,7 @@ + extern int do_truncate(struct dentry *, loff_t start); + extern struct file *filp_open(const char *, int, int); + extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); ++extern struct file * dentry_open_it(struct dentry *, struct vfsmount *, int, struct open_intent *); + extern int filp_close(struct file *, fl_owner_t id); + extern char * getname(const char __user *); + 
+Index: linus-2.6.7-bk-latest/fs/namei.c +=================================================================== +--- linus-2.6.7-bk-latest.orig/fs/namei.c 2004-07-07 10:56:13.455536856 +0300 ++++ linus-2.6.7-bk-latest/fs/namei.c 2004-07-07 11:38:42.866967408 +0300 +@@ -272,8 +272,19 @@ + return 0; + } + ++void intent_release(struct open_intent *it) ++{ ++ if (!it) ++ return; ++ if (it->magic != INTENT_MAGIC) ++ return; ++ if (it->op_release) ++ it->op_release(it); ++} ++ + void path_release(struct nameidata *nd) + { ++ intent_release(&nd->intent.open); + dput(nd->dentry); + mntput(nd->mnt); + } +@@ -790,8 +801,14 @@ + return err; + } + ++int fastcall path_walk_it(const char * name, struct nameidata *nd) ++{ ++ current->total_link_count = 0; ++ return link_path_walk(name, nd); ++} + int fastcall path_walk(const char * name, struct nameidata *nd) + { ++ intent_init(&nd->intent.open, IT_LOOKUP); + current->total_link_count = 0; + return link_path_walk(name, nd); + } +@@ -800,7 +817,7 @@ + /* returns 1 if everything is done */ + static int __emul_lookup_dentry(const char *name, struct nameidata *nd) + { +- if (path_walk(name, nd)) ++ if (path_walk_it(name, nd)) + return 0; /* something went wrong... 
*/ + + if (!nd->dentry->d_inode || S_ISDIR(nd->dentry->d_inode->i_mode)) { +@@ -878,7 +895,18 @@ + return 1; + } + +-int fastcall path_lookup(const char *name, unsigned int flags, struct nameidata *nd) ++static inline int it_mode_from_lookup_flags(int flags) ++{ ++ int mode = IT_LOOKUP; ++ ++ if (flags & LOOKUP_OPEN) ++ mode = IT_OPEN; ++ if (flags & LOOKUP_CREATE) ++ mode |= IT_CREAT; ++ return mode; ++} ++ ++int fastcall path_lookup_it(const char *name, unsigned int flags, struct nameidata *nd) + { + int retval; + +@@ -914,6 +942,12 @@ + return retval; + } + ++int fastcall path_lookup(const char *name, unsigned int flags, struct nameidata *nd) ++{ ++ intent_init(&nd->intent.open, it_mode_from_lookup_flags(flags)); ++ return path_lookup_it(name, flags, nd); ++} ++ + /* + * Restricted form of lookup. Doesn't follow links, single-component only, + * needs parent already locked. Doesn't follow mounts. +@@ -964,7 +998,7 @@ + } + + /* SMP-safe */ +-struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) ++struct dentry * lookup_one_len_it(const char * name, struct dentry * base, int len, struct nameidata *nd) + { + unsigned long hash; + struct qstr this; +@@ -984,11 +1018,16 @@ + } + this.hash = end_name_hash(hash); + +- return lookup_hash(&this, base); ++ return __lookup_hash(&this, base, nd); + access: + return ERR_PTR(-EACCES); + } + ++struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) ++{ ++ return lookup_one_len_it(name, base, len, NULL); ++} ++ + /* + * namei() + * +@@ -1000,18 +1039,24 @@ + * that namei follows links, while lnamei does not. 
+ * SMP-safe + */ +-int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd) ++int fastcall __user_walk_it(const char __user *name, unsigned flags, struct nameidata *nd) + { + char *tmp = getname(name); + int err = PTR_ERR(tmp); + + if (!IS_ERR(tmp)) { +- err = path_lookup(tmp, flags, nd); ++ err = path_lookup_it(tmp, flags, nd); + putname(tmp); + } + return err; + } + ++int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd) ++{ ++ intent_init(&nd->intent.open, it_mode_from_lookup_flags(flags)); ++ return __user_walk_it(name, flags, nd); ++} ++ + /* + * It's inline, so penalty for filesystems that don't use sticky bit is + * minimal. +@@ -1296,7 +1341,7 @@ + * The simplest case - just a plain lookup. + */ + if (!(flag & O_CREAT)) { +- error = path_lookup(pathname, lookup_flags(flag)|LOOKUP_OPEN, nd); ++ error = path_lookup_it(pathname, lookup_flags(flag), nd); + if (error) + return error; + goto ok; +@@ -1305,7 +1350,8 @@ + /* + * Create - we need to know the parent. 
+ */ +- error = path_lookup(pathname, LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE, nd); ++ nd->intent.open.op |= IT_CREAT; ++ error = path_lookup_it(pathname, LOOKUP_PARENT, nd); + if (error) + return error; + +@@ -2214,6 +2260,7 @@ + static int __vfs_follow_link(struct nameidata *nd, const char *link) + { + int res = 0; ++ struct open_intent it = nd->intent.open; + char *name; + if (IS_ERR(link)) + goto fail; +@@ -2224,6 +2271,10 @@ + /* weird __emul_prefix() stuff did it */ + goto out; + } ++ intent_release(&nd->intent.open); ++ intent_init(&nd->intent.open, it.op); ++ nd->intent.open.flags = it.flags; ++ nd->intent.open.create_mode = it.create_mode; + res = link_path_walk(link, nd); + out: + if (nd->depth || res || nd->last_type!=LAST_NORM) +@@ -2322,6 +2372,7 @@ + return res; + } + ++ + int page_symlink(struct inode *inode, const char *symname, int len) + { + struct address_space *mapping = inode->i_mapping; +@@ -2385,8 +2436,10 @@ + EXPORT_SYMBOL(page_symlink); + EXPORT_SYMBOL(page_symlink_inode_operations); + EXPORT_SYMBOL(path_lookup); ++EXPORT_SYMBOL(path_lookup_it); + EXPORT_SYMBOL(path_release); + EXPORT_SYMBOL(path_walk); ++EXPORT_SYMBOL(path_walk_it); + EXPORT_SYMBOL(permission); + EXPORT_SYMBOL(unlock_rename); + EXPORT_SYMBOL(vfs_create); +Index: linus-2.6.7-bk-latest/fs/open.c +=================================================================== +--- linus-2.6.7-bk-latest.orig/fs/open.c 2004-07-07 10:56:13.610513296 +0300 ++++ linus-2.6.7-bk-latest/fs/open.c 2004-07-07 11:38:42.867967256 +0300 +@@ -216,11 +216,12 @@ + struct inode * inode; + int error; + ++ intent_init(&nd.intent.open, IT_GETATTR); + error = -EINVAL; + if (length < 0) /* sorry, but loff_t says... 
*/ + goto out; + +- error = user_path_walk(path, &nd); ++ error = user_path_walk_it(path, &nd); + if (error) + goto out; + inode = nd.dentry->d_inode; +@@ -475,6 +476,7 @@ + kernel_cap_t old_cap; + int res; + ++ intent_init(&nd.intent.open, IT_GETATTR); + if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ + return -EINVAL; + +@@ -498,7 +500,7 @@ + else + current->cap_effective = current->cap_permitted; + +- res = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd); ++ res = __user_walk_it(filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd); + if (!res) { + res = permission(nd.dentry->d_inode, mode, &nd); + /* SuS v2 requires we report a read only fs too */ +@@ -520,7 +522,8 @@ + struct nameidata nd; + int error; + +- error = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd); ++ intent_init(&nd.intent.open, IT_GETATTR); ++ error = __user_walk_it(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd); + if (error) + goto out; + +@@ -571,7 +574,8 @@ + struct nameidata nd; + int error; + +- error = __user_walk(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); ++ intent_init(&nd.intent.open, IT_GETATTR); ++ error = __user_walk_it(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); + if (error) + goto out; + +@@ -754,6 +758,7 @@ + { + int namei_flags, error; + struct nameidata nd; ++ intent_init(&nd.intent.open, IT_OPEN); + + namei_flags = flags; + if ((namei_flags+1) & O_ACCMODE) +@@ -763,14 +768,14 @@ + + error = open_namei(filename, namei_flags, mode, &nd); + if (!error) +- return dentry_open(nd.dentry, nd.mnt, flags); ++ return dentry_open_it(nd.dentry, nd.mnt, flags, &nd.intent.open); + + return ERR_PTR(error); + } + + EXPORT_SYMBOL(filp_open); + +-struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) ++struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, int flags, struct open_intent *it) + { + struct file * f; + struct inode *inode; +@@ -782,6 +787,7 @@ + goto cleanup_dentry; + 
f->f_flags = flags; + f->f_mode = (flags+1) & O_ACCMODE; ++ f->f_it = it; + inode = dentry->d_inode; + if (f->f_mode & FMODE_WRITE) { + error = get_write_access(inode); +@@ -800,6 +806,7 @@ + error = f->f_op->open(inode,f); + if (error) + goto cleanup_all; ++ intent_release(it); + } + f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); + +@@ -825,11 +832,20 @@ + cleanup_file: + put_filp(f); + cleanup_dentry: ++ intent_release(it); + dput(dentry); + mntput(mnt); + return ERR_PTR(error); + } + ++struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) ++{ ++ struct open_intent it; ++ intent_init(&it, IT_LOOKUP); ++ ++ return dentry_open_it(dentry, mnt, flags, &it); ++} ++ + EXPORT_SYMBOL(dentry_open); + + /* +Index: linus-2.6.7-bk-latest/fs/stat.c +=================================================================== +--- linus-2.6.7-bk-latest.orig/fs/stat.c 2004-07-07 10:56:13.635509496 +0300 ++++ linus-2.6.7-bk-latest/fs/stat.c 2004-07-07 11:38:42.868967104 +0300 +@@ -59,15 +59,15 @@ + } + return 0; + } +- + EXPORT_SYMBOL(vfs_getattr); + + int vfs_stat(char __user *name, struct kstat *stat) + { + struct nameidata nd; + int error; ++ intent_init(&nd.intent.open, IT_GETATTR); + +- error = user_path_walk(name, &nd); ++ error = user_path_walk_it(name, &nd); + if (!error) { + error = vfs_getattr(nd.mnt, nd.dentry, stat); + path_release(&nd); +@@ -81,8 +81,9 @@ + { + struct nameidata nd; + int error; ++ intent_init(&nd.intent.open, IT_GETATTR); + +- error = user_path_walk_link(name, &nd); ++ error = user_path_walk_link_it(name, &nd); + if (!error) { + error = vfs_getattr(nd.mnt, nd.dentry, stat); + path_release(&nd); +@@ -96,9 +97,12 @@ + { + struct file *f = fget(fd); + int error = -EBADF; ++ struct nameidata nd; ++ intent_init(&nd.intent.open, IT_GETATTR); + + if (f) { + error = vfs_getattr(f->f_vfsmnt, f->f_dentry, stat); ++ intent_release(&nd.intent.open); + fput(f); + } + return error; +Index: linus-2.6.7-bk-latest/fs/namespace.c 
+=================================================================== +--- linus-2.6.7-bk-latest.orig/fs/namespace.c 2004-07-07 10:56:13.605514056 +0300 ++++ linus-2.6.7-bk-latest/fs/namespace.c 2004-07-07 11:38:42.868967104 +0300 +@@ -117,6 +117,7 @@ + + static void detach_mnt(struct vfsmount *mnt, struct nameidata *old_nd) + { ++ memset(old_nd, 0, sizeof(*old_nd)); + old_nd->dentry = mnt->mnt_mountpoint; + old_nd->mnt = mnt->mnt_parent; + mnt->mnt_parent = mnt; +Index: linus-2.6.7-bk-latest/fs/exec.c +=================================================================== +--- linus-2.6.7-bk-latest.orig/fs/exec.c 2004-07-07 10:56:13.395545976 +0300 ++++ linus-2.6.7-bk-latest/fs/exec.c 2004-07-07 11:38:42.869966952 +0300 +@@ -121,8 +121,9 @@ + struct nameidata nd; + int error; + ++ intent_init(&nd.intent.open, IT_OPEN); + nd.intent.open.flags = FMODE_READ; +- error = __user_walk(library, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd); ++ error = user_path_walk_it(library, &nd); + if (error) + goto out; + +@@ -134,7 +135,7 @@ + if (error) + goto exit; + +- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); ++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &nd.intent.open); + error = PTR_ERR(file); + if (IS_ERR(file)) + goto out; +@@ -474,8 +475,9 @@ + int err; + struct file *file; + ++ intent_init(&nd.intent.open, IT_OPEN); + nd.intent.open.flags = FMODE_READ; +- err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd); ++ err = path_lookup_it(name, LOOKUP_FOLLOW, &nd); + file = ERR_PTR(err); + + if (!err) { +@@ -488,7 +490,7 @@ + err = -EACCES; + file = ERR_PTR(err); + if (!err) { +- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); ++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &nd.intent.open); + if (!IS_ERR(file)) { + err = deny_write_access(file); + if (err) { +Index: linus-2.6.7-bk-latest/fs/xattr.c +=================================================================== +--- linus-2.6.7-bk-latest.orig/fs/xattr.c 2004-07-07 10:56:13.643508280 +0300 ++++ 
linus-2.6.7-bk-latest/fs/xattr.c 2004-07-07 11:38:42.870966800 +0300 +@@ -161,7 +161,8 @@ + struct nameidata nd; + ssize_t error; + +- error = user_path_walk(path, &nd); ++ intent_init(&nd.intent.open, IT_GETXATTR); ++ error = user_path_walk_it(path, &nd); + if (error) + return error; + error = getxattr(nd.dentry, name, value, size); +@@ -176,7 +177,8 @@ + struct nameidata nd; + ssize_t error; + +- error = user_path_walk_link(path, &nd); ++ intent_init(&nd.intent.open, IT_GETXATTR); ++ error = user_path_walk_link_it(path, &nd); + if (error) + return error; + error = getxattr(nd.dentry, name, value, size); +@@ -242,7 +244,8 @@ + struct nameidata nd; + ssize_t error; + +- error = user_path_walk(path, &nd); ++ intent_init(&nd.intent.open, IT_GETXATTR); ++ error = user_path_walk_it(path, &nd); + if (error) + return error; + error = listxattr(nd.dentry, list, size); +@@ -256,7 +259,8 @@ + struct nameidata nd; + ssize_t error; + +- error = user_path_walk_link(path, &nd); ++ intent_init(&nd.intent.open, IT_GETXATTR); ++ error = user_path_walk_link_it(path, &nd); + if (error) + return error; + error = listxattr(nd.dentry, list, size); + +--- linux-2.6.7.orig/include/linux/mount.h 2004-06-16 13:18:57.000000000 +0800 ++++ linux-2.6.7/include/linux/mount.h 2004-09-06 21:05:29.000000000 +0800 +@@ -31,6 +31,8 @@ + int mnt_flags; + char *mnt_devname; /* Name of device e.g. 
/dev/dsk/hda1 */ + struct list_head mnt_list; ++ struct list_head mnt_lustre_list; /* GNS mount list */ ++ unsigned long mnt_last_used; /* for GNS auto-umount (jiffies) */ + }; + + static inline struct vfsmount *mntget(struct vfsmount *mnt) diff --git a/lustre/kernel_patches/patches/vfs-lookup_last-vanilla-2.6.patch b/lustre/kernel_patches/patches/vfs-lookup_last-vanilla-2.6.patch new file mode 100644 index 0000000..76ccd7b --- /dev/null +++ b/lustre/kernel_patches/patches/vfs-lookup_last-vanilla-2.6.patch @@ -0,0 +1,77 @@ +Index: linus-2.6.7/fs/namei.c +=================================================================== +--- linus-2.6.7.orig/fs/namei.c 2005-03-05 20:24:52.000000000 +0200 ++++ linus-2.6.7/fs/namei.c 2005-03-28 17:11:20.486991680 +0300 +@@ -676,8 +676,11 @@ + goto out_dput; + + if (inode->i_op->follow_link) { ++ int saved_flags = nd->flags; + mntget(next.mnt); ++ nd->flags |= LOOKUP_LINK_NOTLAST; + err = do_follow_link(next.dentry, nd); ++ nd->flags = saved_flags; + dput(next.dentry); + mntput(next.mnt); + if (err) +@@ -723,7 +726,9 @@ + if (err < 0) + break; + } ++ nd->flags |= LOOKUP_LAST; + err = do_lookup(nd, &this, &next); ++ nd->flags &= ~LOOKUP_LAST; + if (err) + break; + follow_mount(&next.mnt, &next.dentry); +@@ -769,10 +774,14 @@ + */ + if (nd->dentry && nd->dentry->d_sb && + (nd->dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) { +- err = -ESTALE; ++ nd->flags |= LOOKUP_LAST; ++ err = !nd->dentry->d_op->d_revalidate(nd->dentry, nd); ++ nd->flags &= ~LOOKUP_LAST; + /* Note: we do not d_invalidate() */ +- if (!nd->dentry->d_op->d_revalidate(nd->dentry, nd)) ++ if (err) { ++ err = -ESTALE; + break; ++ } + } + return_base: + return 0; +@@ -1344,7 +1353,9 @@ + dir = nd->dentry; + nd->flags &= ~LOOKUP_PARENT; + down(&dir->d_inode->i_sem); ++ nd->flags |= LOOKUP_LAST; + dentry = __lookup_hash(&nd->last, nd->dentry, nd); ++ nd->flags &= ~LOOKUP_LAST; + + do_last: + error = PTR_ERR(dentry); +@@ -1449,7 +1460,9 @@ + } + dir = nd->dentry; + 
down(&dir->d_inode->i_sem); ++ nd->flags |= LOOKUP_LAST; + dentry = __lookup_hash(&nd->last, nd->dentry, nd); ++ nd->flags &= ~LOOKUP_LAST; + putname(nd->last.name); + goto do_last; + } +Index: linus-2.6.7/include/linux/namei.h +=================================================================== +--- linus-2.6.7.orig/include/linux/namei.h 2005-03-05 20:24:52.000000000 +0200 ++++ linus-2.6.7/include/linux/namei.h 2005-03-05 20:24:52.000000000 +0200 +@@ -68,6 +68,9 @@ + #define LOOKUP_CONTINUE 4 + #define LOOKUP_PARENT 16 + #define LOOKUP_NOALT 32 ++#define LOOKUP_LAST 64 ++#define LOOKUP_LINK_NOTLAST 128 ++ + /* + * Intent data + */ diff --git a/lustre/kernel_patches/patches/vfs-pdirops-2.6.7.patch b/lustre/kernel_patches/patches/vfs-pdirops-2.6.7.patch index 9f95068..0621750 100644 --- a/lustre/kernel_patches/patches/vfs-pdirops-2.6.7.patch +++ b/lustre/kernel_patches/patches/vfs-pdirops-2.6.7.patch @@ -46,14 +46,12 @@ Index: linux-2.6.7/fs/namei.c /* In order to reduce some races, while at the same time doing additional * checking and hopefully speeding things up, we copy filenames to the * kernel data space before using them.. 
-@@ -362,10 +394,11 @@ +@@ -362,8 +394,9 @@ + { struct dentry * result; struct inode *dir = parent->d_inode; - int counter = 0; + void *lock; - again: - counter++; - down(&dir->i_sem); + lock = lock_dir(dir, name); /* @@ -149,10 +147,10 @@ Index: linux-2.6.7/fs/namei.c out2: path_release(&nd); out: -@@ -1765,14 +1798,14 @@ - goto exit1; - } - +@@ -1735,14 +1735,14 @@ + error = -EBUSY; + goto exit1; + } - down(&nd.dentry->d_inode->i_sem); + nd.lock = lock_dir(nd.dentry->d_inode, &nd.last); dentry = lookup_hash(&nd.last, nd.dentry); @@ -166,10 +164,10 @@ Index: linux-2.6.7/fs/namei.c exit1: path_release(&nd); exit: -@@ -1842,7 +1875,7 @@ - if (error != -EOPNOTSUPP) - goto exit1; - } +@@ -1808,7 +1808,7 @@ + error = -EISDIR; + if (nd.last_type != LAST_NORM) + goto exit1; - down(&nd.dentry->d_inode->i_sem); + nd.lock = lock_dir(nd.dentry->d_inode, &nd.last); dentry = lookup_hash(&nd.last, nd.dentry); @@ -257,8 +255,8 @@ Index: linux-2.6.7/include/linux/namei.h @@ -52,6 +52,7 @@ unsigned int flags; int last_type; - struct lookup_intent intent; + void *lock; - }; - /* + /* Intent data */ + union { + struct open_intent open; diff --git a/lustre/kernel_patches/patches/vfs-raw_ops-vanilla-2.6.patch b/lustre/kernel_patches/patches/vfs-raw_ops-vanilla-2.6.patch new file mode 100644 index 0000000..21d4e12 --- /dev/null +++ b/lustre/kernel_patches/patches/vfs-raw_ops-vanilla-2.6.patch @@ -0,0 +1,235 @@ +Index: linus-2.6.7/fs/namei.c +=================================================================== +--- linus-2.6.7.orig/fs/namei.c 2005-03-05 20:24:52.000000000 +0200 ++++ linus-2.6.7/fs/namei.c 2005-03-23 13:37:48.563339840 +0200 +@@ -758,14 +758,20 @@ + lookup_parent: + nd->last = this; + nd->last_type = LAST_NORM; +- if (this.name[0] != '.') +- goto return_base; +- if (this.len == 1) +- nd->last_type = LAST_DOT; +- else if (this.len == 2 && this.name[1] == '.') +- nd->last_type = LAST_DOTDOT; +- else +- goto return_base; ++ if (this.name[0] == '.') { ++ if (this.len == 1) ++ 
nd->last_type = LAST_DOT; ++ else if (this.len == 2 && this.name[1] == '.') ++ nd->last_type = LAST_DOTDOT; ++ } ++ ++ if ((nd->last_type == LAST_NORM) && inode->i_op && ++ inode->i_op->endparentlookup) { ++ err = inode->i_op->endparentlookup(nd); ++ if (err) ++ break; ++ } ++ goto return_base; + return_reval: + /* + * We bypassed the ordinary revalidation routines. +@@ -1535,9 +1541,16 @@ + if (IS_ERR(tmp)) + return PTR_ERR(tmp); + +- error = path_lookup(tmp, LOOKUP_PARENT, &nd); ++ intent_init(&nd.intent.open, IT_MKNOD); ++ nd.intent.open.create_mode = mode; ++ nd.intent.open.create.dev = dev; ++ ++ error = path_lookup_it(tmp, LOOKUP_PARENT, &nd); + if (error) + goto out; ++ if (nd.intent.open.flags & IT_STATUS_RAW) ++ goto out2; ++ + dentry = lookup_create(&nd, 0); + error = PTR_ERR(dentry); + +@@ -1564,6 +1577,7 @@ + dput(dentry); + } + up(&nd.dentry->d_inode->i_sem); ++out2: + path_release(&nd); + out: + putname(tmp); +@@ -1606,9 +1620,13 @@ + struct dentry *dentry; + struct nameidata nd; + +- error = path_lookup(tmp, LOOKUP_PARENT, &nd); ++ intent_init(&nd.intent.open, IT_MKDIR); ++ nd.intent.open.create_mode = mode; ++ error = path_lookup_it(tmp, LOOKUP_PARENT, &nd); + if (error) + goto out; ++ if (nd.intent.open.flags & IT_STATUS_RAW) ++ goto out2; + dentry = lookup_create(&nd, 1); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { +@@ -1618,6 +1636,7 @@ + dput(dentry); + } + up(&nd.dentry->d_inode->i_sem); ++out2: + path_release(&nd); + out: + putname(tmp); +@@ -1703,9 +1722,12 @@ + if(IS_ERR(name)) + return PTR_ERR(name); + +- error = path_lookup(name, LOOKUP_PARENT, &nd); ++ intent_init(&nd.intent.open, IT_RMDIR); ++ error = path_lookup_it(name, LOOKUP_PARENT, &nd); + if (error) + goto exit; ++ if (nd.intent.open.flags & IT_STATUS_RAW) ++ goto exit1; + + switch(nd.last_type) { + case LAST_DOTDOT: +@@ -1781,9 +1803,13 @@ + if(IS_ERR(name)) + return PTR_ERR(name); + +- error = path_lookup(name, LOOKUP_PARENT, &nd); ++ intent_init(&nd.intent.open, 
IT_UNLINK); ++ error = path_lookup_it(name, LOOKUP_PARENT, &nd); + if (error) + goto exit; ++ if (nd.intent.open.flags & IT_STATUS_RAW) ++ goto exit1; ++ + error = -EISDIR; + if (nd.last_type != LAST_NORM) + goto exit1; +@@ -1855,9 +1881,13 @@ + struct dentry *dentry; + struct nameidata nd; + +- error = path_lookup(to, LOOKUP_PARENT, &nd); ++ intent_init(&nd.intent.open, IT_SYMLINK); ++ nd.intent.open.create.link = from; ++ error = path_lookup_it(to, LOOKUP_PARENT, &nd); + if (error) + goto out; ++ if (nd.intent.open.flags & IT_STATUS_RAW) ++ goto out2; + dentry = lookup_create(&nd, 0); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { +@@ -1865,6 +1895,7 @@ + dput(dentry); + } + up(&nd.dentry->d_inode->i_sem); ++out2: + path_release(&nd); + out: + putname(to); +@@ -1936,9 +1967,13 @@ + error = __user_walk(oldname, 0, &old_nd); + if (error) + goto exit; +- error = path_lookup(to, LOOKUP_PARENT, &nd); ++ intent_init(&nd.intent.open, IT_LINK); ++ nd.intent.open.create.source_nd = &old_nd; ++ error = path_lookup_it(to, LOOKUP_PARENT, &nd); + if (error) + goto out; ++ if (nd.intent.open.flags & IT_STATUS_RAW) ++ goto out_release; + error = -EXDEV; + if (old_nd.mnt != nd.mnt) + goto out_release; +@@ -2119,9 +2154,18 @@ + if (error) + goto exit; + +- error = path_lookup(newname, LOOKUP_PARENT, &newnd); ++ error = -EBUSY; ++ if (oldnd.last_type != LAST_NORM) ++ goto exit1; ++ ++ intent_init(&newnd.intent.open, IT_RENAME); ++ newnd.intent.open.create.source_nd = &oldnd; ++ error = path_lookup_it(newname, LOOKUP_PARENT, &newnd); + if (error) + goto exit1; ++ if (newnd.intent.open.flags & IT_STATUS_RAW) { ++ goto exit2; ++ } + + error = -EXDEV; + if (oldnd.mnt != newnd.mnt) +@@ -2129,8 +2173,6 @@ + + old_dir = oldnd.dentry; + error = -EBUSY; +- if (oldnd.last_type != LAST_NORM) +- goto exit2; + + new_dir = newnd.dentry; + if (newnd.last_type != LAST_NORM) +@@ -2238,6 +2280,7 @@ + intent_init(&nd->intent.open, it.op); + nd->intent.open.flags = it.flags; + 
nd->intent.open.create_mode = it.create_mode; ++ nd->intent.open.create = it.create; + res = link_path_walk(link, nd); + out: + if (current->link_count || res || nd->last_type!=LAST_NORM) +Index: linus-2.6.7/include/linux/namei.h +=================================================================== +--- linus-2.6.7.orig/include/linux/namei.h 2005-03-05 20:24:52.000000000 +0200 ++++ linus-2.6.7/include/linux/namei.h 2005-03-23 13:34:56.632477304 +0200 +@@ -15,9 +15,19 @@ + #define IT_UNLINK (1<<5) + #define IT_TRUNC (1<<6) + #define IT_GETXATTR (1<<7) ++#define IT_RMDIR (1<<8) ++#define IT_LINK (1<<9) ++#define IT_RENAME (1<<10) ++#define IT_MKDIR (1<<11) ++#define IT_MKNOD (1<<12) ++#define IT_SYMLINK (1<<13) ++#define IT_CHDIR (1<<14) + + #define INTENT_MAGIC 0x19620323 +- ++#define IT_STATUS_RAW (1<<10) /* Setting this in it_flags on exit from lookup ++ means everything was done already and return ++ value from lookup is in fact status of ++ already performed operation */ + struct open_intent { + int magic; + int op; +@@ -25,6 +35,11 @@ + int flags; + int create_mode; + union { ++ unsigned dev; /* For mknod */ ++ char *link; /* For symlink */ ++ struct nameidata *source_nd; /* For link/rename */ ++ } create; ++ union { + void *fs_data; /* FS-specific intent data */ + } d; + }; +Index: linus-2.6.7/include/linux/fs.h +=================================================================== +--- linus-2.6.7.orig/include/linux/fs.h 2005-03-05 20:24:52.000000000 +0200 ++++ linus-2.6.7/include/linux/fs.h 2005-03-23 13:35:08.796628072 +0200 +@@ -909,6 +909,7 @@ + ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); + ssize_t (*listxattr) (struct dentry *, char *, size_t); + int (*removexattr) (struct dentry *, const char *); ++ int (*endparentlookup) (struct nameidata *); + }; + + struct seq_file; diff --git a/lustre/kernel_patches/patches/vfs_fmode_exec-2.6.patch b/lustre/kernel_patches/patches/vfs_fmode_exec-2.6.patch new file mode 100644 index 
0000000..9c3a5d6 --- /dev/null +++ b/lustre/kernel_patches/patches/vfs_fmode_exec-2.6.patch @@ -0,0 +1,34 @@ + fs/exec.c | 4 ++-- + include/linux/fs.h | 1 + + 2 files changed, 3 insertions(+), 2 deletions(-) + +--- linus-2.6.7-bk-latest/include/linux/fs.h.orig 2004-07-07 12:33:21.246507224 +0300 ++++ linus-2.6.7-bk-latest/include/linux/fs.h 2004-07-07 12:33:55.069365368 +0300 +@@ -74,6 +74,7 @@ extern int leases_enable, dir_notify_ena + + #define FMODE_READ 1 + #define FMODE_WRITE 2 ++#define FMODE_EXEC 4 + + #define RW_MASK 1 + #define RWA_MASK 2 +--- linus-2.6.7-bk-latest/fs/exec.c.orig 2004-07-07 12:33:05.466906088 +0300 ++++ linus-2.6.7-bk-latest/fs/exec.c 2004-07-07 12:33:38.127940856 +0300 +@@ -122,7 +122,7 @@ asmlinkage long sys_uselib(const char __ + int error; + + intent_init(&nd.intent.open, IT_OPEN); +- nd.intent.open.flags = FMODE_READ; ++ nd.intent.open.flags = FMODE_READ|FMODE_EXEC; + error = user_path_walk_it(library, &nd); + if (error) + goto out; +@@ -476,7 +476,7 @@ struct file *open_exec(const char *name) + struct file *file; + + intent_init(&nd.intent.open, IT_OPEN); +- nd.intent.open.flags = FMODE_READ; ++ nd.intent.open.flags = FMODE_READ|FMODE_EXEC; + err = path_lookup_it(name, LOOKUP_FOLLOW, &nd); + file = ERR_PTR(err); + diff --git a/lustre/kernel_patches/patches/vfs_gns-2.6-vanilla.patch b/lustre/kernel_patches/patches/vfs_gns-2.6-vanilla.patch new file mode 100644 index 0000000..0cb55e6 --- /dev/null +++ b/lustre/kernel_patches/patches/vfs_gns-2.6-vanilla.patch @@ -0,0 +1,55 @@ +diff -rupN linux-2.6.7/fs/namei.c linux-2.6.7.new/fs/namei.c +--- linux-2.6.7/fs/namei.c 2005-03-29 18:54:13.000000000 +0300 ++++ linux-2.6.7.new/fs/namei.c 2005-03-31 14:42:01.605302456 +0300 +@@ -422,6 +422,16 @@ static struct dentry * real_lookup(struc + result = dentry; + } + unlock_dir(dir, lock); ++ if (!IS_ERR(result)) { ++ spin_lock(&result->d_lock); ++ if (result->d_flags & DCACHE_GNS_PENDING) { ++ spin_unlock(&result->d_lock); ++ if (result->d_op && 
result->d_op->d_revalidate) ++ result->d_op->d_revalidate(result, nd); ++ } else { ++ spin_unlock(&result->d_lock); ++ } ++ } + return result; + } + +diff -rupN linux-2.6.7/fs/namespace.c linux-2.6.7.new/fs/namespace.c +--- linux-2.6.7/fs/namespace.c 2005-03-29 18:54:13.000000000 +0300 ++++ linux-2.6.7.new/fs/namespace.c 2005-03-30 17:51:39.000000000 +0300 +@@ -60,6 +60,7 @@ struct vfsmount *alloc_vfsmnt(const char + INIT_LIST_HEAD(&mnt->mnt_child); + INIT_LIST_HEAD(&mnt->mnt_mounts); + INIT_LIST_HEAD(&mnt->mnt_list); ++ INIT_LIST_HEAD(&mnt->mnt_lustre_list); + if (name) { + int size = strlen(name)+1; + char *newname = kmalloc(size, GFP_KERNEL); +@@ -173,6 +174,9 @@ void __mntput(struct vfsmount *mnt) + { + struct super_block *sb = mnt->mnt_sb; + dput(mnt->mnt_root); ++ spin_lock(&dcache_lock); ++ list_del(&mnt->mnt_lustre_list); ++ spin_unlock(&dcache_lock); + free_vfsmnt(mnt); + deactivate_super(sb); + } +diff -rupN linux-2.6.7/include/linux/dcache.h linux-2.6.7.new/include/linux/dcache.h +--- linux-2.6.7/include/linux/dcache.h 2005-03-29 18:54:13.000000000 +0300 ++++ linux-2.6.7.new/include/linux/dcache.h 2005-03-31 14:35:51.589553400 +0300 +@@ -167,7 +167,9 @@ d_iput: no no no yes + #define DCACHE_UNHASHED 0x0010 + #define DCACHE_LUSTRE_INVALID 0x0020 /* invalidated by Lustre */ + +-#define DCACHE_CROSS_REF 0x0040 /* entry points to inode on another MDS */ ++#define DCACHE_CROSS_REF 0x0040 /* entry points to inode on another MDS */ ++#define DCACHE_GNS_PENDING 0x0080 /* entry is GNS pending mount point */ ++#define DCACHE_GNS_MOUNTING 0x0100 /* entry is GNS mount in progress */ + + extern spinlock_t dcache_lock; + diff --git a/lustre/kernel_patches/patches/vfs_intent-2.6-vanilla.patch b/lustre/kernel_patches/patches/vfs_intent-2.6-vanilla.patch index 6cfae66..5598314 100644 --- a/lustre/kernel_patches/patches/vfs_intent-2.6-vanilla.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.6-vanilla.patch @@ -634,9 +634,9 @@ Index: 
linux-2.6.7/include/linux/dcache.h int nr_unused; Index: linux-2.6.7/include/linux/fs.h =================================================================== ---- linux-2.6.7.orig/include/linux/fs.h 2004-08-26 17:12:41.000000000 +0400 -+++ linux-2.6.7/include/linux/fs.h 2005-01-18 11:27:18.092496832 +0300 -@@ -74,6 +74,7 @@ +--- linux-2.6.7.old/include/linux/fs.h 2005-01-31 14:27:16.000000000 +0800 ++++ linux-2.6.7/include/linux/fs.h 2005-01-31 14:32:19.000000000 +0800 +@@ -74,6 +74,7 @@ extern int leases_enable, dir_notify_ena #define FMODE_READ 1 #define FMODE_WRITE 2 @@ -644,16 +644,21 @@ Index: linux-2.6.7/include/linux/fs.h #define RW_MASK 1 #define RWA_MASK 2 -@@ -250,6 +251,8 @@ +@@ -250,6 +251,13 @@ typedef void (dio_iodone_t)(struct inode #define ATTR_ATTR_FLAG 1024 #define ATTR_KILL_SUID 2048 #define ATTR_KILL_SGID 4096 +#define ATTR_RAW 8192 /* file system, not vfs will massage attrs */ +#define ATTR_FROM_OPEN 16384 /* called from open path, ie O_TRUNC */ ++ ++#define ATTR_CTIME_SET 0x2000 ++/* ea support */ ++#define ATTR_EA 0x40000 ++#define ATTR_EA_RM 0x80000 /* * This is the Inode Attributes structure, used for notify_change(). 
It -@@ -446,6 +449,7 @@ +@@ -446,6 +454,7 @@ struct inode { struct block_device *i_bdev; struct cdev *i_cdev; int i_cindex; @@ -661,7 +666,7 @@ Index: linux-2.6.7/include/linux/fs.h unsigned long i_dnotify_mask; /* Directory notify events */ struct dnotify_struct *i_dnotify; /* for directory notifications */ -@@ -579,6 +583,7 @@ +@@ -579,6 +588,7 @@ struct file { spinlock_t f_ep_lock; #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; @@ -669,7 +674,7 @@ Index: linux-2.6.7/include/linux/fs.h }; extern spinlock_t files_lock; #define file_list_lock() spin_lock(&files_lock); -@@ -903,7 +908,9 @@ +@@ -903,7 +913,9 @@ struct inode_operations { void (*truncate) (struct inode *); int (*permission) (struct inode *, int, struct nameidata *); int (*setattr) (struct dentry *, struct iattr *); @@ -679,7 +684,7 @@ Index: linux-2.6.7/include/linux/fs.h int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); ssize_t (*listxattr) (struct dentry *, char *, size_t); -@@ -943,6 +950,7 @@ +@@ -943,6 +955,7 @@ struct super_operations { int (*remount_fs) (struct super_block *, int *, char *); void (*clear_inode) (struct inode *); void (*umount_begin) (struct super_block *); @@ -687,7 +692,7 @@ Index: linux-2.6.7/include/linux/fs.h int (*show_options)(struct seq_file *, struct vfsmount *); }; -@@ -1131,6 +1139,7 @@ +@@ -1131,6 +1144,7 @@ extern int unregister_filesystem(struct extern struct vfsmount *kern_mount(struct file_system_type *); extern int may_umount_tree(struct vfsmount *); extern int may_umount(struct vfsmount *); @@ -695,14 +700,14 @@ Index: linux-2.6.7/include/linux/fs.h extern long do_mount(char *, char *, char *, unsigned long, void *); extern int vfs_statfs(struct super_block *, struct kstatfs *); -@@ -1195,6 +1204,7 @@ +@@ -1195,6 +1209,7 @@ static inline int break_lease(struct ino extern int do_truncate(struct dentry *, loff_t start); extern struct file *filp_open(const 
char *, int, int); extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); +extern struct file * dentry_open_it(struct dentry *, struct vfsmount *, int, struct lookup_intent *); extern int filp_close(struct file *, fl_owner_t id); extern char * getname(const char __user *); - + Index: linux-2.6.7/include/linux/namei.h =================================================================== --- linux-2.6.7.orig/include/linux/namei.h 2003-07-24 15:52:31.000000000 +0400 diff --git a/lustre/kernel_patches/series/2.6-vanilla.series b/lustre/kernel_patches/series/2.6-vanilla.series index 8e88ec9..b5f5e74d 100644 --- a/lustre/kernel_patches/series/2.6-vanilla.series +++ b/lustre/kernel_patches/series/2.6-vanilla.series @@ -1,20 +1,22 @@ uml-2.6.7-01-bb2.patch lustre_version.patch -vfs_intent-2.6-vanilla.patch -vfs_nointent-2.6-vanilla.patch -vfs_races-2.6-vanilla.patch -vfs-wantedi-misc-2.6-suse.patch -nfs-cifs-intent-2.6-vanilla.patch -iopen-misc-2.6-suse.patch -export-truncate-2.6-suse.patch -export_symbols-2.6-suse.patch -dev_read_only-2.6-suse.patch -export-2.6-suse.patch -header-guards-2.6-suse.patch +vfs-dcache_locking-vanilla-2.6.patch +vfs-dcache_lustre_invalid-vanilla-2.6.patch +vfs-intent_api-vanilla-2.6.patch +vfs-lookup_last-vanilla-2.6.patch +vfs-raw_ops-vanilla-2.6.patch +export-vanilla-2.6.patch +header_guards-vanilla-2.6.patch +vfs-do_truncate.patch +vfs_fmode_exec-2.6.patch +vfs-gns_export_doumount.patch ext3-super-ntohl.patch -lookup_bdev_init_intent.patch -dcache-mds-num-2.6.7.patch +dcache-mds-num-2.6.7.patch dynamic-locks-2.6.7.patch vfs-pdirops-2.6.7.patch dcache-fid-2.6.7.patch -jbd-buffer-release-2.6.7.patch +vfs-wantedi-misc-2.6-suse.patch +jbd-buffer-release-2.6.7.patch +dev_read_only-2.6-suse.patch +vfs_gns-2.6-vanilla.patch +linux-2.6.7-CITI_NFS4_ALL-7-lsec.patch diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 70bd9b7..d1b8914 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -35,6 +35,8 @@ 
#include #include #include +#include + /* @priority: if non-zero, move the selected to the list head * @nocreate: if non-zero, only search in existed connections */ @@ -344,6 +346,7 @@ err: int client_obd_cleanup(struct obd_device *obddev, int flags) { struct client_obd *cli = &obddev->u.cli; + ENTRY; if (!cli->cl_import) RETURN(-EINVAL); @@ -354,7 +357,14 @@ int client_obd_cleanup(struct obd_device *obddev, int flags) dereg_f(cli->cl_mgmtcli_obd, obddev); inter_module_put("mgmtcli_deregister_for_events"); } + + /* Here we try to drop the security structure after destroy import, + * to avoid issue of "sleep in spinlock". + */ + class_import_get(cli->cl_import); class_destroy_import(cli->cl_import); + ptlrpcs_import_drop_sec(cli->cl_import); + class_import_put(cli->cl_import); cli->cl_import = NULL; ldlm_put_ref(flags & OBD_OPT_FORCE); @@ -390,6 +400,10 @@ int client_connect_import(struct lustre_handle *dlm_handle, if (obd->obd_namespace == NULL) GOTO(out_disco, rc = -ENOMEM); + rc = ptlrpcs_import_get_sec(imp); + if (rc != 0) + GOTO(out_ldlm, rc); + imp->imp_dlm_handle = *dlm_handle; rc = ptlrpc_init_import(imp); if (rc != 0) @@ -721,15 +735,38 @@ int target_handle_connect(struct ptlrpc_request *req) memcpy(&conn, lustre_msg_buf(req->rq_reqmsg, 2, sizeof conn), sizeof conn); - if (export->exp_imp_reverse != NULL) + if (export->exp_imp_reverse != NULL) { + /* same logic as client_obd_cleanup */ + class_import_get(export->exp_imp_reverse); class_destroy_import(export->exp_imp_reverse); + ptlrpcs_import_drop_sec(export->exp_imp_reverse); + class_import_put(export->exp_imp_reverse); + } + + /* for the rest part, we return -ENOTCONN in case of errors + * in order to let client initialize connection again. 
+ */ revimp = export->exp_imp_reverse = class_new_import(); + if (!revimp) { + CERROR("fail to alloc new reverse import.\n"); + GOTO(out, rc = -ENOTCONN); + } + revimp->imp_connection = ptlrpc_connection_addref(export->exp_connection); revimp->imp_client = &export->exp_obd->obd_ldlm_client; revimp->imp_remote_handle = conn; revimp->imp_obd = target; revimp->imp_dlm_fake = 1; revimp->imp_state = LUSTRE_IMP_FULL; + + rc = ptlrpcs_import_get_sec(revimp); + if (rc) { + CERROR("reverse import can not get sec: %d\n", rc); + class_destroy_import(revimp); + export->exp_imp_reverse = NULL; + GOTO(out, rc = -ENOTCONN); + } + class_import_put(revimp); rc = obd_connect_post(export, connect_flags); @@ -759,8 +796,10 @@ void target_destroy_export(struct obd_export *exp) { /* exports created from last_rcvd data, and "fake" exports created by lctl don't have an import */ - if (exp->exp_imp_reverse != NULL) + if (exp->exp_imp_reverse != NULL) { + ptlrpcs_import_drop_sec(exp->exp_imp_reverse); class_destroy_import(exp->exp_imp_reverse); + } /* We cancel locks at disconnect time, but this will catch any locks * granted in a race with recovery-induced disconnect. 
*/ @@ -789,8 +828,9 @@ ptlrpc_clone_req( struct ptlrpc_request *orig_req) memcpy(copy_req, orig_req, sizeof *copy_req); memcpy(copy_reqmsg, orig_req->rq_reqmsg, orig_req->rq_reqlen); - /* the copied req takes over the reply state */ + /* the copied req takes over the reply state and security data */ orig_req->rq_reply_state = NULL; + orig_req->rq_sec_svcdata = NULL; copy_req->rq_reqmsg = copy_reqmsg; class_export_get(copy_req->rq_export); @@ -800,6 +840,9 @@ ptlrpc_clone_req( struct ptlrpc_request *orig_req) } void ptlrpc_free_clone( struct ptlrpc_request *req) { + if (req->rq_svcsec) + svcsec_cleanup_req(req); + class_export_put(req->rq_export); list_del(&req->rq_list); OBD_FREE(req->rq_reqmsg, req->rq_reqlen); @@ -810,6 +853,9 @@ void ptlrpc_free_clone( struct ptlrpc_request *req) static void target_release_saved_req(struct ptlrpc_request *req) { + if (req->rq_svcsec) + svcsec_cleanup_req(req); + class_export_put(req->rq_export); OBD_FREE(req->rq_reqmsg, req->rq_reqlen); OBD_FREE(req, sizeof *req); diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c index 4b58aea..9f863b4 100644 --- a/lustre/ldlm/ldlm_lock.c +++ b/lustre/ldlm/ldlm_lock.c @@ -25,6 +25,8 @@ #ifdef __KERNEL__ # include +# include +# include # include # include #else diff --git a/lustre/liblustre/Makefile.am b/lustre/liblustre/Makefile.am index 90a35c4..860d2ec 100644 --- a/lustre/liblustre/Makefile.am +++ b/lustre/liblustre/Makefile.am @@ -1,5 +1,7 @@ ## Liblustre excecutables & libraries Makefile -SUBDIRS = . tests + +# FIXME: we disable building any executables for this moment. +#SUBDIRS = . 
tests AM_CPPFLAGS = $(HAVE_EFENCE) -I$(SYSIO)/include -D_LARGEFILE64_SOURCE=1 \ $(LLCPPFLAGS) -I$(top_srcdir)/portals/unals @@ -13,6 +15,7 @@ LUSTRE_LIBS = liblutils.a libllite.a \ $(top_builddir)/lustre/osc/libosc.a \ $(top_builddir)/lustre/mdc/libmdc.a \ $(top_builddir)/lustre/ptlrpc/libptlrpc.a \ + $(top_builddir)/lustre/sec/libptlrpcs.a \ $(top_builddir)/lustre/obdclass/liblustreclass.a \ $(top_builddir)/lustre/lvfs/liblvfs.a diff --git a/lustre/liblustre/dir.c b/lustre/liblustre/dir.c index 0200da9..7e1d7dd 100644 --- a/lustre/liblustre/dir.c +++ b/lustre/liblustre/dir.c @@ -74,7 +74,7 @@ static int llu_dir_do_readpage(struct inode *inode, struct page *page) &data, &lockh, NULL, 0, ldlm_completion_ast, llu_mdc_blocking_ast, inode); - request = (struct ptlrpc_request *)it.d.lustre.it_data; + request = (struct ptlrpc_request *)LUSTRE_IT(&it)->it_data; if (request) ptlrpc_req_finished(request); if (rc < 0) { diff --git a/lustre/liblustre/file.c b/lustre/liblustre/file.c index 00a0b82..e393198 100644 --- a/lustre/liblustre/file.c +++ b/lustre/liblustre/file.c @@ -90,7 +90,7 @@ void obdo_refresh_inode(struct inode *dst, static int llu_local_open(struct llu_inode_info *lli, struct lookup_intent *it) { - struct ptlrpc_request *req = it->d.lustre.it_data; + struct ptlrpc_request *req = LUSTRE_IT(it)->it_data; struct ll_file_data *fd; struct mds_body *body; ENTRY; @@ -114,7 +114,7 @@ static int llu_local_open(struct llu_inode_info *lli, struct lookup_intent *it) fd->fd_mds_och.och_magic = OBD_CLIENT_HANDLE_MAGIC; lli->lli_file_data = fd; - mdc_set_open_replay_data(NULL, &fd->fd_mds_och, it->d.lustre.it_data); + mdc_set_open_replay_data(NULL, &fd->fd_mds_och, LUSTRE_IT(it)->it_data); RETURN(0); } @@ -139,9 +139,8 @@ int llu_iop_open(struct pnode *pnode, int flags, mode_t mode) CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu\n", lli->lli_st_ino); LL_GET_INTENT(inode, it); - if (!it->d.lustre.it_disposition) { + if (!LUSTRE_IT(it)->it_disposition) LBUG(); - } rc = 
it_open_error(DISP_OPEN_OPEN, it); if (rc) @@ -168,7 +167,7 @@ int llu_iop_open(struct pnode *pnode, int flags, mode_t mode) lli->lli_open_flags = flags & ~(O_CREAT | O_EXCL | O_TRUNC); out_release: - request = it->d.lustre.it_data; + request = LUSTRE_IT(it)->it_data; ptlrpc_req_finished(request); it->it_op_release(it); diff --git a/lustre/liblustre/genlib.sh b/lustre/liblustre/genlib.sh index 1324cf9..04e27fe2 100755 --- a/lustre/liblustre/genlib.sh +++ b/lustre/liblustre/genlib.sh @@ -82,6 +82,7 @@ build_obj_list ../obdecho libobdecho.a build_obj_list ../osc libosc.a build_obj_list ../mdc libmdc.a build_obj_list ../ptlrpc libptlrpc.a +build_obj_list ../sec libptlrpcs.a build_obj_list ../obdclass liblustreclass.a build_obj_list ../lvfs liblvfs.a diff --git a/lustre/liblustre/namei.c b/lustre/liblustre/namei.c index 280c1dd..0949b5d 100644 --- a/lustre/liblustre/namei.c +++ b/lustre/liblustre/namei.c @@ -46,15 +46,15 @@ static void ll_intent_drop_lock(struct lookup_intent *it) { struct lustre_handle *handle; - if (it->it_op && it->d.lustre.it_lock_mode) { - handle = (struct lustre_handle *)&it->d.lustre.it_lock_handle; + if (it->it_op && LUSTRE_IT(it)->it_lock_mode) { + handle = (struct lustre_handle *)&LUSTRE_IT(it)->it_lock_handle; CDEBUG(D_DLMTRACE, "releasing lock with cookie "LPX64 " from it %p\n", handle->cookie, it); - ldlm_lock_decref(handle, it->d.lustre.it_lock_mode); + ldlm_lock_decref(handle, LUSTRE_IT(it)->it_lock_mode); /* bug 494: intent_release may be called multiple times, from * this thread and we don't want to double-decref this lock */ - it->d.lustre.it_lock_mode = 0; + LUSTRE_IT(it)->it_lock_mode = 0; } } @@ -65,8 +65,8 @@ static void ll_intent_release(struct lookup_intent *it) ll_intent_drop_lock(it); it->it_magic = 0; it->it_op_release = 0; - it->d.lustre.it_disposition = 0; - it->d.lustre.it_data = NULL; + LUSTRE_IT(it)->it_disposition = 0; + LUSTRE_IT(it)->it_data = NULL; EXIT; } @@ -107,7 +107,7 @@ void llu_lookup_finish_locks(struct 
lookup_intent *it, struct pnode *pnode) CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%lu)\n", inode, llu_i2info(inode)->lli_st_ino, llu_i2info(inode)->lli_st_generation); - mdc_set_lock_data(NULL, &it->d.lustre.it_lock_handle, inode); + mdc_set_lock_data(NULL, &LUSTRE_IT(it)->it_lock_handle, inode); } /* drop lookup/getattr locks */ diff --git a/lustre/liblustre/super.c b/lustre/liblustre/super.c index 211be83..1962920 100644 --- a/lustre/liblustre/super.c +++ b/lustre/liblustre/super.c @@ -181,8 +181,8 @@ void obdo_to_inode(struct inode *dst, struct obdo *src, obd_valid valid) valid &= src->o_valid; if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME)) - CDEBUG(D_INODE, "valid %x, cur time %lu/%lu, new %lu/%lu\n", - src->o_valid, + CDEBUG(D_INODE, "valid %llx, cur time %lu/%lu, new %lu/%lu\n", + (unsigned long long)src->o_valid, LTIME_S(lli->lli_st_mtime), LTIME_S(lli->lli_st_ctime), (long)src->o_mtime, (long)src->o_ctime); @@ -221,8 +221,8 @@ void obdo_from_inode(struct obdo *dst, struct inode *src, obd_valid valid) obd_valid newvalid = 0; if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME)) - CDEBUG(D_INODE, "valid %x, new time %lu/%lu\n", - valid, LTIME_S(lli->lli_st_mtime), + CDEBUG(D_INODE, "valid %llx, new time %lu/%lu\n", + (unsigned long long)valid, LTIME_S(lli->lli_st_mtime), LTIME_S(lli->lli_st_ctime)); if (valid & OBD_MD_FLATIME) { @@ -438,7 +438,8 @@ static int llu_inode_revalidate(struct inode *inode) valid |= OBD_MD_FLEASIZE; } ll_inode2id(&id, inode); - rc = mdc_getattr(sbi->ll_md_exp, &id, valid, ealen, &req); + rc = mdc_getattr(sbi->ll_md_exp, &id, valid, NULL, 0, + ealen, &req); if (rc) { CERROR("failure %d inode %lu\n", rc, lli->lli_st_ino); RETURN(-abs(rc)); @@ -869,7 +870,7 @@ static int llu_readlink_internal(struct inode *inode, ll_inode2id(&id, inode); rc = mdc_getattr(sbi->ll_md_exp, &id, - OBD_MD_LINKNAME, symlen, request); + OBD_MD_LINKNAME, NULL, 0, symlen, request); if (rc) { CERROR("inode %lu: rc = %d\n", lli->lli_st_ino, rc); RETURN(rc); @@ 
-1355,7 +1356,8 @@ struct inode *llu_iget(struct filesys *fs, struct lustre_md *md) if ((md->body->valid & (OBD_MD_FLGENER | OBD_MD_FLID | OBD_MD_FLTYPE)) != (OBD_MD_FLGENER | OBD_MD_FLID | OBD_MD_FLTYPE)) { - CERROR("bad md body valid mask 0x%x\n", md->body->valid); + CERROR("bad md body valid mask 0x%llx\n", + (unsigned long long)md->body->valid); LBUG(); return ERR_PTR(-EPERM); } @@ -1522,7 +1524,8 @@ llu_fsswop_mount(const char *source, /* fetch attr of root inode */ err = mdc_getattr(sbi->ll_md_exp, &rootid, - OBD_MD_FLNOTOBD|OBD_MD_FLBLOCKS, 0, &request); + OBD_MD_FLNOTOBD|OBD_MD_FLBLOCKS, NULL, 0, + 0, &request); if (err) { CERROR("mdc_getattr failed for root: rc = %d\n", err); GOTO(out_lov, err); diff --git a/lustre/llite/dcache.c b/lustre/llite/dcache.c index 2d5a7c8..b8a6d0a 100644 --- a/lustre/llite/dcache.c +++ b/lustre/llite/dcache.c @@ -40,11 +40,15 @@ static void ll_release(struct dentry *de) struct ll_dentry_data *lld; ENTRY; LASSERT(de != NULL); + + CDEBUG(D_DENTRY, "releasing dentry %p\n", de); + lld = ll_d2d(de); - LASSERT(lld != NULL); - LASSERT(lld->lld_cwd_count == 0); - LASSERT(lld->lld_mnt_count == 0); - OBD_FREE(de->d_fsdata, sizeof(struct ll_dentry_data)); + if (lld) { /* Root dentry does not have ll_dentry_data */ + LASSERT(lld->lld_cwd_count == 0); + LASSERT(lld->lld_mnt_count == 0); + OBD_FREE(de->d_fsdata, sizeof(struct ll_dentry_data)); + } EXIT; } @@ -82,16 +86,17 @@ void ll_set_dd(struct dentry *de) void ll_intent_drop_lock(struct lookup_intent *it) { struct lustre_handle *handle; + struct lustre_intent_data *itdata = LUSTRE_IT(it); - if (it->it_op && it->d.lustre.it_lock_mode) { - handle = (struct lustre_handle *)&it->d.lustre.it_lock_handle; + if (it->it_op && itdata && itdata->it_lock_mode) { + handle = (struct lustre_handle *)&itdata->it_lock_handle; CDEBUG(D_DLMTRACE, "releasing lock with cookie "LPX64 " from it %p\n", handle->cookie, it); - ldlm_lock_decref(handle, it->d.lustre.it_lock_mode); + ldlm_lock_decref(handle, 
itdata->it_lock_mode); /* bug 494: intent_release may be called multiple times, from * this thread and we don't want to double-decref this lock */ - it->d.lustre.it_lock_mode = 0; + itdata->it_lock_mode = 0; } } @@ -102,11 +107,19 @@ void ll_intent_release(struct lookup_intent *it) ll_intent_drop_lock(it); it->it_magic = 0; it->it_op_release = 0; - it->d.lustre.it_disposition = 0; - it->d.lustre.it_data = NULL; + ll_intent_free(it); EXIT; } +void ll_intent_free(struct lookup_intent *it) +{ + if (it->d.fs_data) { + OBD_SLAB_FREE(it->d.fs_data, ll_intent_slab, + sizeof(struct lustre_intent_data)); + it->d.fs_data = NULL; + } +} + void ll_unhash_aliases(struct inode *inode) { struct list_head *tmp, *head; @@ -180,11 +193,11 @@ void ll_lookup_finish_locks(struct lookup_intent *it, struct dentry *dentry) LASSERT(it != NULL); LASSERT(dentry != NULL); - if (it->d.lustre.it_lock_mode && dentry->d_inode != NULL) { + if (LUSTRE_IT(it)->it_lock_mode && dentry->d_inode != NULL) { struct inode *inode = dentry->d_inode; CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)\n", inode, inode->i_ino, inode->i_generation); - mdc_set_lock_data(NULL, &it->d.lustre.it_lock_handle, inode); + mdc_set_lock_data(NULL, &LUSTRE_IT(it)->it_lock_handle, inode); } /* drop lookup or getattr locks immediately */ @@ -206,7 +219,7 @@ void ll_lookup_finish_locks(struct lookup_intent *it, struct dentry *dentry) void ll_frob_intent(struct lookup_intent **itp, struct lookup_intent *deft) { struct lookup_intent *it = *itp; - + #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) if (it) { LASSERTF(it->it_magic == INTENT_MAGIC, "bad intent magic: %x\n", @@ -217,7 +230,34 @@ void ll_frob_intent(struct lookup_intent **itp, struct lookup_intent *deft) if (!it || it->it_op == IT_GETXATTR) it = *itp = deft; + if (it->d.fs_data) + return; + + if (ll_intent_alloc(it)) { + CERROR("Failed to allocate memory for lustre specific intent " + "data\n"); + /* XXX: we cannot return status just yet */ + LBUG(); + } +} + 
+int ll_intent_alloc(struct lookup_intent *it) +{ + if (it->d.fs_data) { + CERROR("Intent alloc on already allocated intent\n"); + return 0; + } + OBD_SLAB_ALLOC(it->d.fs_data, ll_intent_slab, SLAB_KERNEL, + sizeof(struct lustre_intent_data)); + if (!it->d.fs_data) { + CERROR("Failed to allocate memory for lustre specific intent " + "data\n"); + return -ENOMEM; + } + it->it_op_release = ll_intent_release; + + return 0; } int ll_revalidate_it(struct dentry *de, int flags, struct nameidata *nd, @@ -229,16 +269,38 @@ int ll_revalidate_it(struct dentry *de, int flags, struct nameidata *nd, struct obd_export *exp; struct lustre_id pid; struct lustre_id cid; - int rc; + int orig_it, rc = 0; ENTRY; - CDEBUG(D_VFSTRACE, "VFS Op:name=%s, intent=%s\n", de->d_name.name, - LL_IT2STR(it)); + spin_lock(&de->d_lock); + + if ((de->d_flags & DCACHE_GNS_PENDING) && + !(de->d_flags & DCACHE_GNS_MOUNTING)) + { + spin_unlock(&de->d_lock); + + if (nd) { + int err = ll_gns_mount_object(de, nd->mnt); + if (err) + CERROR("can't mount %s, err = %d\n", + de->d_name.name, err); + } + RETURN(1); + } + spin_unlock(&de->d_lock); + + CDEBUG(D_VFSTRACE, "VFS Op:name=%s (%p), intent=%s\n", de->d_name.name, + de, LL_IT2STR(it)); /* Cached negative dentries are unsafe for now - look them up again */ if (de->d_inode == NULL) RETURN(0); + /* Root of the tree is always valid, attributes would be fixed in + ll_inode_revalidate_it */ + if (de->d_sb->s_root == de) + RETURN(1); + CDEBUG(D_INODE, "revalidate 0x%p: %*s -> %lu/%lu\n", de, de->d_name.len, de->d_name.name, (unsigned long) de->d_inode->i_ino, @@ -262,11 +324,17 @@ int ll_revalidate_it(struct dentry *de, int flags, struct nameidata *nd, if (nd != NULL) nd->mnt->mnt_last_used = jiffies; + orig_it = it ? 
it->it_op : IT_OPEN; ll_frob_intent(&it, &lookup_it); LASSERT(it != NULL); if (it->it_op == IT_GETATTR) { /* We need to check for LOOKUP lock as well */ + rc = ll_intent_alloc(&lookup_it); + if (rc) + LBUG(); /* Can't think of better idea just yet */ + + rc = md_intent_lock(exp, &pid, de->d_name.name, de->d_name.len, NULL, 0, &cid, &lookup_it, flags, &req, ll_mdc_blocking_ast); @@ -274,11 +342,15 @@ int ll_revalidate_it(struct dentry *de, int flags, struct nameidata *nd, UPDATE lock */ if (!rc) { it = &lookup_it; + if (!req) { + ll_intent_free(it); + goto do_lookup; + } GOTO(out, rc); } if (it_disposition(&lookup_it, DISP_LOOKUP_NEG)) { - ll_intent_release(&lookup_it); it = &lookup_it; + ll_intent_free(it); GOTO(out, rc = 0); } @@ -286,6 +358,8 @@ int ll_revalidate_it(struct dentry *de, int flags, struct nameidata *nd, ptlrpc_req_finished(req); req = NULL; ll_lookup_finish_locks(&lookup_it, de); + /* XXX: on 2.6 ll_lookup_finish_locks does not call ll_intent_release */ + ll_intent_release(&lookup_it); } rc = md_intent_lock(exp, &pid, de->d_name.name, de->d_name.len, @@ -294,17 +368,20 @@ int ll_revalidate_it(struct dentry *de, int flags, struct nameidata *nd, /* If req is NULL, then mdc_intent_lock only tried to do a lock match; * if all was well, it will return 1 if it found locks, 0 otherwise. 
*/ - if (req == NULL && rc >= 0) + if (req == NULL && rc >= 0) { + if (!rc) + goto do_lookup; GOTO(out, rc); + } if (rc < 0) { if (rc != -ESTALE) { CDEBUG(D_INFO, "ll_intent_lock(): rc %d : it->it_status " - "%d\n", rc, it->d.lustre.it_status); + "%d\n", rc, LUSTRE_IT(it)->it_status); } GOTO(out, rc = 0); } - +revalidate_finish: rc = revalidate_it_finish(req, 1, it, de); if (rc != 0) { ll_intent_release(it); @@ -316,14 +393,21 @@ int ll_revalidate_it(struct dentry *de, int flags, struct nameidata *nd, dentry */ spin_lock(&dcache_lock); hlist_del_init(&de->d_hash); - __d_rehash(de, 0); + __d_rehash(de); spin_unlock(&dcache_lock); GOTO(out, rc); out: if (req != NULL && rc == 1) ptlrpc_req_finished(req); + if (rc == 0) { + if (it == &lookup_it) { + ll_intent_release(it); + if (req) /* Special case: We did lookup and it failed, + need to free request */ + ptlrpc_req_finished(req); + } ll_unhash_aliases(de->d_inode); return rc; } @@ -334,13 +418,37 @@ out: atomic_read(&de->d_count)); ll_lookup_finish_locks(it, de); de->d_flags &= ~DCACHE_LUSTRE_INVALID; - if (!((de->d_inode->i_mode & S_ISUID) &&S_ISDIR(de->d_inode->i_mode)) || - !(flags & LOOKUP_CONTINUE || (it->it_op & (IT_CHDIR | IT_OPEN)))) + if (it == &lookup_it) + ll_intent_release(it); + + if (!((de->d_inode->i_mode & S_ISUID) && S_ISDIR(de->d_inode->i_mode)) || + !(flags & LOOKUP_CONTINUE || (orig_it & (IT_CHDIR | IT_OPEN)))) return rc; - if (nd) - (void)ll_dir_process_mount_object(de, nd->mnt); + if (nd && !(de->d_flags & DCACHE_GNS_MOUNTING)) { + int err = ll_gns_mount_object(de, nd->mnt); + if (err) + CERROR("can't mount %s, err = %d\n", + de->d_name.name, err); + } return rc; +do_lookup: + it = &lookup_it; + if (ll_intent_alloc(it)) + LBUG(); +// We did that already, right? 
ll_inode2id(&pid, de->d_parent->d_inode); + rc = md_intent_lock(exp, &pid, de->d_name.name, + de->d_name.len, NULL, 0, NULL, + it, 0, &req, ll_mdc_blocking_ast); + if (rc >= 0) { + struct mds_body *mds_body = lustre_msg_buf(req->rq_repmsg, 1, sizeof(*mds_body)); + + /* See if we got same inode, if not - return error */ + if (id_equal_stc(&cid, &mds_body->id1)) + goto revalidate_finish; + } + + GOTO(out, rc = 0); } /*static*/ void ll_pin(struct dentry *de, struct vfsmount *mnt, int flag) @@ -433,7 +541,7 @@ static int ll_revalidate_nd(struct dentry *dentry, struct nameidata *nd) ENTRY; if (nd && nd->flags & LOOKUP_LAST && !(nd->flags & LOOKUP_LINK_NOTLAST)) - rc = ll_revalidate_it(dentry, nd->flags, nd, &nd->intent); + rc = ll_revalidate_it(dentry, nd->flags, nd, &nd->intent.open); else rc = ll_revalidate_it(dentry, 0, nd, NULL); @@ -462,14 +570,18 @@ static void ll_dentry_iput(struct dentry *dentry, struct inode *inode) struct ll_sb_info *sbi = ll_i2sbi(inode); struct lustre_id parent, child; - LASSERT(dentry->d_parent && dentry->d_parent->d_inode); - ll_inode2id(&parent, dentry->d_parent->d_inode); - ll_inode2id(&child, inode); - md_change_cbdata_name(sbi->ll_md_exp, &parent, - (char *)dentry->d_name.name, - dentry->d_name.len, &child, - null_if_equal, inode); + if (dentry->d_parent != dentry) { + /* Do not do this for root of the tree */ + LASSERT(dentry->d_parent && dentry->d_parent->d_inode); + ll_inode2id(&parent, dentry->d_parent->d_inode); + ll_inode2id(&child, inode); + md_change_cbdata_name(sbi->ll_md_exp, &parent, + (char *)dentry->d_name.name, + dentry->d_name.len, &child, + null_if_equal, inode); + } iput(inode); + } #endif diff --git a/lustre/llite/dir.c b/lustre/llite/dir.c index b13bd1a..fa9a335 100644 --- a/lustre/llite/dir.c +++ b/lustre/llite/dir.c @@ -232,12 +232,18 @@ static struct page *ll_get_dir_page(struct inode *dir, unsigned long n) ll_prepare_mdc_data(op_data, dir, NULL, NULL, 0, 0); + rc = ll_intent_alloc(&it); + if (rc) + return 
ERR_PTR(rc); + rc = md_enqueue(ll_i2sbi(dir)->ll_md_exp, LDLM_IBITS, &it, LCK_PR, op_data, &lockh, NULL, 0, ldlm_completion_ast, ll_mdc_blocking_ast, dir); OBD_FREE(op_data, sizeof(*op_data)); - request = (struct ptlrpc_request *)it.d.lustre.it_data; + request = (struct ptlrpc_request *)LUSTRE_IT(&it)->it_data; + ll_intent_free(&it); + if (request) ptlrpc_req_finished(request); if (rc < 0) { @@ -479,8 +485,6 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file, } case LL_IOC_MDC_MKDIRSTRIPE: RETURN(ll_mkdir_stripe(inode, arg)); - case IOC_MDC_FINISH_GNS: - RETURN(ll_finish_gns(sbi)); case LL_IOC_LOV_SETSTRIPE: { struct ptlrpc_request *request = NULL; struct mdc_op_data *op_data; @@ -527,7 +531,7 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file, valid |= OBD_MD_FLDIREA; ll_inode2id(&id, inode); - rc = md_getattr(sbi->ll_md_exp, &id, valid, + rc = md_getattr(sbi->ll_md_exp, &id, valid, NULL, 0, obd_size_diskmd(sbi->ll_dt_exp, NULL), &request); if (rc < 0) { diff --git a/lustre/llite/file.c b/lustre/llite/file.c index 1f4a49a..e13260c 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -27,12 +27,14 @@ #include #include #include +#include #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) #include #endif #include "llite_internal.h" #include +#define XATTR_NAME_MAX 255 int ll_md_close(struct obd_export *md_exp, struct inode *inode, struct file *file) { @@ -144,9 +146,10 @@ static int ll_intent_file_open(struct file *file, void *lmm, ll_mdc_blocking_ast, NULL); OBD_FREE(op_data, sizeof(*op_data)); if (rc == 0) { - if (itp->d.lustre.it_lock_mode) - memcpy(&itp->d.lustre.it_lock_handle, + if (LUSTRE_IT(itp)->it_lock_mode) + memcpy(&LUSTRE_IT(itp)->it_lock_handle, &lockh, sizeof(lockh)); + } else if (rc < 0) { CERROR("lock enqueue: err: %d\n", rc); } @@ -156,7 +159,7 @@ static int ll_intent_file_open(struct file *file, void *lmm, int ll_local_open(struct file *file, struct lookup_intent *it) { - struct ptlrpc_request *req = 
it->d.lustre.it_data; + struct ptlrpc_request *req = LUSTRE_IT(it)->it_data; struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode); struct obd_export *md_exp = ll_i2mdexp(file->f_dentry->d_inode); struct ll_file_data *fd; @@ -189,8 +192,8 @@ int ll_local_open(struct file *file, struct lookup_intent *it) lli->lli_io_epoch = body->io_epoch; - mdc_set_open_replay_data(md_exp, &fd->fd_mds_och, it->d.lustre.it_data); - + mdc_set_open_replay_data(md_exp, &fd->fd_mds_och, LUSTRE_IT(it)->it_data); + RETURN(0); } @@ -228,13 +231,17 @@ int ll_file_open(struct inode *inode, struct file *file) it = file->f_it; - if (!it || !it->d.lustre.it_disposition) { + if (!it || !LUSTRE_IT(it) || !LUSTRE_IT(it)->it_disposition) { it = &oit; + rc = ll_intent_alloc(it); + if (rc) + GOTO(out, rc); rc = ll_intent_file_open(file, NULL, 0, it); if (rc) GOTO(out, rc); } + lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_OPEN); /* mdc_intent_lock() didn't get a request ref if there was an open * error, so don't do cleanup on the request here (bug 3430) */ @@ -260,7 +267,9 @@ int ll_file_open(struct inode *inode, struct file *file) file->f_flags &= ~O_LOV_DELAY_CREATE; GOTO(out, rc); out: - req = it->d.lustre.it_data; + req = LUSTRE_IT(it)->it_data; + ll_intent_release(it); + ptlrpc_req_finished(req); if (rc == 0) ll_open_complete(inode); @@ -1010,13 +1019,18 @@ static int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file, f->f_dentry = file->f_dentry; f->f_vfsmnt = file->f_vfsmnt; + rc = ll_intent_alloc(&oit); + if (rc) + GOTO(out, rc); + rc = ll_intent_file_open(f, lum, lum_size, &oit); if (rc) GOTO(out, rc); if (it_disposition(&oit, DISP_LOOKUP_NEG)) GOTO(out, -ENOENT); - req = oit.d.lustre.it_data; - rc = oit.d.lustre.it_status; + + req = LUSTRE_IT(&oit)->it_data; + rc = LUSTRE_IT(&oit)->it_status; if (rc < 0) GOTO(out, rc); @@ -1034,6 +1048,7 @@ static int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file, rc = ll_file_release(f->f_dentry->d_inode, 
f); EXIT; out: + ll_intent_release(&oit); if (f) put_filp(f); up(&lli->lli_open_sem); @@ -1438,7 +1453,7 @@ int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock) RETURN(rc); } -int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it) +int ll_inode_revalidate_it(struct dentry *dentry) { struct lookup_intent oit = { .it_op = IT_GETATTR }; struct inode *inode = dentry->d_inode; @@ -1448,7 +1463,6 @@ int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it) struct ll_sb_info *sbi; struct lustre_id id; int rc; - ENTRY; if (!inode) { @@ -1462,14 +1476,18 @@ int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it) lli = ll_i2info(inode); LASSERT(id_fid(&id) != 0); - CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), name=%s, intent=%s\n", + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), name=%s(%p)\n", inode->i_ino, inode->i_generation, inode, dentry->d_name.name, - LL_IT2STR(it)); + dentry); #if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,5,0)) lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_REVALIDATE); #endif + rc = ll_intent_alloc(&oit); + if (rc) + RETURN(-ENOMEM); + rc = md_intent_lock(sbi->ll_md_exp, &id, NULL, 0, NULL, 0, &id, &oit, 0, &req, ll_mdc_blocking_ast); if (rc < 0) @@ -1477,7 +1495,6 @@ int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it) rc = revalidate_it_finish(req, 1, &oit, dentry); if (rc) { - ll_intent_release(&oit); GOTO(out, rc); } @@ -1494,19 +1511,19 @@ int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it) rc = ll_glimpse_size(inode); EXIT; out: + ll_intent_release(&oit); if (req) ptlrpc_req_finished(req); return rc; } #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) -int ll_getattr(struct vfsmount *mnt, struct dentry *de, - struct lookup_intent *it, struct kstat *stat) +int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat) { int res = 0; struct inode *inode = de->d_inode; - res = 
ll_inode_revalidate_it(de, it); + res = ll_inode_revalidate_it(de); lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_GETATTR); if (res) @@ -1529,6 +1546,237 @@ int ll_getattr(struct vfsmount *mnt, struct dentry *de, } #endif +static +int ll_setxattr_internal(struct inode *inode, const char *name, + const void *value, size_t size, int flags, + __u64 valid) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ptlrpc_request *request = NULL; + struct mdc_op_data op_data; + struct iattr attr; + int rc = 0; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu\n", inode->i_ino); + lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_SETXATTR); + + memset(&attr, 0x0, sizeof(attr)); + attr.ia_valid |= valid; + attr.ia_attr_flags = flags; + + ll_prepare_mdc_data(&op_data, inode, NULL, NULL, 0, 0); + + rc = md_setattr(sbi->ll_md_exp, &op_data, &attr, + (void*) name, strnlen(name, XATTR_NAME_MAX)+1, + (void*) value, size, &request); + if (rc) { + CERROR("md_setattr fails: rc = %d\n", rc); + GOTO(out, rc); + } + + out: + ptlrpc_req_finished(request); + RETURN(rc); +} + +int ll_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + int rc, error; + struct posix_acl *acl; + struct ll_inode_info *lli; + ENTRY; + + rc = ll_setxattr_internal(dentry->d_inode, name, value, size, + flags, ATTR_EA); + + /* update inode's acl info */ + if (rc == 0 && strcmp(name, XATTR_NAME_ACL_ACCESS) == 0) { + if (value) { + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) { + CERROR("convert from xattr to acl error: %ld", + PTR_ERR(acl)); + GOTO(out, rc); + } else if (acl) { + error = posix_acl_valid(acl); + if (error) { + CERROR("acl valid error: %d", error); + posix_acl_release(acl); + GOTO(out, rc); + } + } + } else { + acl = NULL; + } + + lli = ll_i2info(dentry->d_inode); + spin_lock(&lli->lli_lock); + if (lli->lli_acl_access != NULL) + posix_acl_release(lli->lli_acl_access); + lli->lli_acl_access = acl; + 
spin_unlock(&lli->lli_lock); + } + EXIT; +out: + return(rc); +} + +int ll_removexattr(struct dentry *dentry, const char *name) +{ + return ll_setxattr_internal(dentry->d_inode, name, NULL, 0, 0, + ATTR_EA_RM); +} + +static +int ll_getxattr_internal(struct inode *inode, const char *name, int namelen, + void *value, size_t size, __u64 valid) +{ + struct ptlrpc_request *request = NULL; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct lustre_id id; + struct mds_body *body; + void *ea_data; + int rc, ea_size; + ENTRY; + + lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_GETXATTR); + + ll_inode2id(&id, inode); + rc = md_getattr(sbi->ll_md_exp, &id, valid, name, namelen, + size, &request); + if (rc) { + if (rc != -ENODATA && rc != -EOPNOTSUPP) + CERROR("md_getattr fails: rc = %d\n", rc); + GOTO(out, rc); + } + + body = lustre_msg_buf(request->rq_repmsg, 0, sizeof(*body)); + LASSERT(body != NULL); + LASSERT_REPSWABBED(request, 0); + + ea_size = body->eadatasize; + LASSERT(ea_size <= request->rq_repmsg->buflens[0]); + + if (size == 0) + GOTO(out, rc = ea_size); + + ea_data = lustre_msg_buf(request->rq_repmsg, 1, ea_size); + LASSERT(ea_data != NULL); + LASSERT_REPSWABBED(request, 1); + + if (value) + memcpy(value, ea_data, ea_size); + rc = ea_size; + out: + ptlrpc_req_finished(request); + RETURN(rc); +} + +int ll_getxattr(struct dentry *dentry, const char *name, void *value, + size_t size) +{ + return ll_getxattr_internal(dentry->d_inode, name, strlen(name) + 1, + value, size, OBD_MD_FLEA); +} + +int ll_listxattr(struct dentry *dentry, char *list, size_t size) +{ + return ll_getxattr_internal(dentry->d_inode, NULL, 0, list, size, + OBD_MD_FLEALIST); +} + +int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd) +{ + struct lookup_intent it = { .it_op = IT_GETATTR }; + int mode = inode->i_mode; + struct dentry de; + struct ll_sb_info *sbi; + struct lustre_id id; + struct ptlrpc_request *req = NULL; + int rc; + ENTRY; + + sbi = ll_i2sbi(inode); 
+ ll_inode2id(&id, inode); + + /* Nobody gets write access to a read-only fs */ + if ((mask & MAY_WRITE) && IS_RDONLY(inode) && + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) + return -EROFS; + /* Nobody gets write access to an immutable file */ + if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode)) + return -EACCES; + if (current->fsuid == inode->i_uid) { + mode >>= 6; + } else if (1) { + struct ll_inode_info *lli = ll_i2info(inode); + struct posix_acl *acl; + + /* The access ACL cannot grant access if the group class + permission bits don't contain all requested permissions. */ + if (((mode >> 3) & mask & S_IRWXO) != mask) + goto check_groups; + + if (ll_intent_alloc(&it)) + return -EACCES; + + de.d_inode = inode; + rc = md_intent_lock(sbi->ll_md_exp, &id, NULL, 0, NULL, 0, &id, + &it, 0, &req, ll_mdc_blocking_ast); + if (rc < 0) { + ll_intent_free(&it); + GOTO(out, rc); + } + + rc = revalidate_it_finish(req, 1, &it, &de); + if (rc) { + ll_intent_release(&it); + GOTO(out, rc); + } + + ll_lookup_finish_locks(&it, &de); + ll_intent_free(&it); + + spin_lock(&lli->lli_lock); + acl = posix_acl_dup(ll_i2info(inode)->lli_acl_access); + spin_unlock(&lli->lli_lock); + + if (!acl) + goto check_groups; + + rc = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + if (rc == -EACCES) + goto check_capabilities; + GOTO(out, rc); + } else { +check_groups: + if (in_group_p(inode->i_gid)) + mode >>= 3; + } + if ((mode & mask & S_IRWXO) == mask) + GOTO(out, rc = 0); + +check_capabilities: + rc = -EACCES; + /* Allowed to override Discretionary Access Control? 
*/ + if (!(mask & MAY_EXEC) || + (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode)) + if (capable(CAP_DAC_OVERRIDE)) + GOTO(out, rc = 0); + /* Read and search granted if capable(CAP_DAC_READ_SEARCH) */ + if (capable(CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) || + (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE)))) + GOTO(out, rc = 0); +out: + if (req) + ptlrpc_req_finished(req); + + return rc; +} + struct file_operations ll_file_operations = { .read = ll_file_read, .write = ll_file_write, @@ -1545,13 +1793,17 @@ struct file_operations ll_file_operations = { }; struct inode_operations ll_file_inode_operations = { - .setattr_raw = ll_setattr_raw, .setattr = ll_setattr, .truncate = ll_truncate, #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) - .getattr_it = ll_getattr, + .getattr = ll_getattr, #else .revalidate_it = ll_inode_revalidate_it, #endif + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .listxattr = ll_listxattr, + .removexattr = ll_removexattr, + .permission = ll_inode_permission, }; diff --git a/lustre/llite/llite_gns.c b/lustre/llite/llite_gns.c index f53eeac..d3ae81c 100644 --- a/lustre/llite/llite_gns.c +++ b/lustre/llite/llite_gns.c @@ -31,199 +31,252 @@ #include #include "llite_internal.h" -/* After roughly how long should we remove an inactive mount? */ -#define GNS_MOUNT_TIMEOUT 120 -/* How often should the GNS timer look for mounts to cleanup? */ -#define GNS_TICK 30 +static struct list_head gns_sbi_list = LIST_HEAD_INIT(gns_sbi_list); +static spinlock_t gns_lock = SPIN_LOCK_UNLOCKED; +static struct ptlrpc_thread gns_thread; +static struct ll_gns_ctl gns_ctl; -int ll_finish_gns(struct ll_sb_info *sbi) +/* + * waits until passed dentry gets mountpoint or timeout and attempts are + * exhausted. Returns 1 if dentry became mountpoint and 0 otherwise. 
+ */ +static int +ll_gns_wait_for_mount(struct dentry *dentry, + int timeout, int tries) { - down(&sbi->ll_gns_sem); - if (sbi->ll_gns_state != LL_GNS_STATE_MOUNTING) { - up(&sbi->ll_gns_sem); - CERROR("FINISH_GNS called on mount which was not expecting " - "completion.\n"); - return -EINVAL; - } - - sbi->ll_gns_state = LL_GNS_STATE_FINISHED; - up(&sbi->ll_gns_sem); - complete(&sbi->ll_gns_completion); - - return 0; -} + struct l_wait_info lwi; + struct ll_sb_info *sbi; + int rc; + ENTRY; -/* Pass exactly one (1) page in; when this function returns "page" will point - * somewhere into the middle of the page. */ -int fill_page_with_path(struct dentry *dentry, struct vfsmount *mnt, - char **pagep) -{ - char *path = *pagep, *p; - - path[PAGE_SIZE - 1] = '\0'; - p = path + PAGE_SIZE - 1; - - while (1) { - if (p - path < dentry->d_name.len + 1) - return -ENAMETOOLONG; - if (dentry->d_name.name[0] != '/') { - p -= dentry->d_name.len; - memcpy(p, dentry->d_name.name, dentry->d_name.len); - p--; - *p = '/'; - } + LASSERT(dentry != NULL); + LASSERT(!IS_ERR(dentry)); + sbi = ll_s2sbi(dentry->d_sb); + + for (; !d_mountpoint(dentry) && tries > 0; tries--) { + lwi = LWI_TIMEOUT(timeout * HZ, NULL, NULL); + l_wait_event(sbi->ll_gns_waitq, d_mountpoint(dentry), &lwi); + } - dentry = dentry->d_parent; - if (dentry->d_parent == dentry) { - if (mnt->mnt_parent == mnt) - break; /* finished walking up */ - mnt = mntget(mnt); - dget(dentry); - while (dentry->d_parent == dentry && - follow_up(&mnt, &dentry)) - ; - mntput(mnt); - dput(dentry); - } + if ((rc = d_mountpoint(dentry) ? 1 : 0)) { + spin_lock(&sbi->ll_gns_lock); + LASSERT(sbi->ll_gns_state == LL_GNS_MOUNTING); + sbi->ll_gns_state = LL_GNS_FINISHED; + spin_unlock(&sbi->ll_gns_lock); } - *pagep = p; - return 0; + + complete(&sbi->ll_gns_mount_finished); + RETURN(rc); } -int ll_dir_process_mount_object(struct dentry *dentry, struct vfsmount *mnt) +/* + * tries to mount the mount object under passed @dentry. 
In the case of success + * @dentry will become mount point and 0 will be retuned. Error code will be + * returned otherwise. + */ +int ll_gns_mount_object(struct dentry *dentry, + struct vfsmount *mnt) { - struct ll_sb_info *sbi; + struct ll_dentry_data *lld = dentry->d_fsdata; + char *p, *path, *pathpage, *argv[4]; struct file *mntinfo_fd = NULL; - struct page *datapage = NULL, *pathpage; struct address_space *mapping; - struct ll_dentry_data *lld = dentry->d_fsdata; - struct dentry *dchild, *tmp_dentry; - struct vfsmount *tmp_mnt; - char *p, *path, *argv[4]; - int stage = 0, rc = 0; + int cleanup_phase = 0, rc = 0; + struct ll_sb_info *sbi; + struct dentry *dchild; + struct page *datapage; + filler_t *filler; ENTRY; if (mnt == NULL) { - CERROR("suid directory found, but no vfsmount available.\n"); - RETURN(-1); + CERROR("suid directory found, but no " + "vfsmount available.\n"); + RETURN(-EINVAL); } + CDEBUG(D_INODE, "mounting dentry %p\n", dentry); + LASSERT(dentry->d_inode != NULL); LASSERT(S_ISDIR(dentry->d_inode->i_mode)); LASSERT(lld != NULL); + sbi = ll_i2sbi(dentry->d_inode); LASSERT(sbi != NULL); - down(&sbi->ll_gns_sem); - if (sbi->ll_gns_state == LL_GNS_STATE_MOUNTING) { - up(&sbi->ll_gns_sem); - wait_for_completion(&sbi->ll_gns_completion); + /* another thead is in progress of mouning some entry */ + spin_lock(&sbi->ll_gns_lock); + if (sbi->ll_gns_state == LL_GNS_MOUNTING) { + spin_unlock(&sbi->ll_gns_lock); + + wait_for_completion(&sbi->ll_gns_mount_finished); if (d_mountpoint(dentry)) RETURN(0); - RETURN(-1); } - if (sbi->ll_gns_state == LL_GNS_STATE_FINISHED) { + + /* another thread mounted it already */ + if (sbi->ll_gns_state == LL_GNS_FINISHED) { + spin_unlock(&sbi->ll_gns_lock); + /* we lost a race; just return */ - up(&sbi->ll_gns_sem); if (d_mountpoint(dentry)) RETURN(0); - RETURN(-1); } - LASSERT(sbi->ll_gns_state == LL_GNS_STATE_IDLE); - sbi->ll_gns_state = LL_GNS_STATE_MOUNTING; + LASSERT(sbi->ll_gns_state == LL_GNS_IDLE); + + 
spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_GNS_MOUNTING; + spin_unlock(&dentry->d_lock); + + /* mounting started */ + sbi->ll_gns_state = LL_GNS_MOUNTING; + spin_unlock(&sbi->ll_gns_lock); + + /* we need to build an absolute pathname to pass to mount */ + pathpage = (char *)__get_free_page(GFP_KERNEL); + if (!pathpage) + GOTO(cleanup, rc = -ENOMEM); + cleanup_phase = 1; + + /* getting @dentry path stored in @pathpage. */ + path = d_path(dentry, mnt, pathpage, PAGE_SIZE); + if (IS_ERR(path)) { + CERROR("can't build mount object path, err %d\n", + (int)PTR_ERR(dchild)); + GOTO(cleanup, rc = PTR_ERR(dchild)); + } + + /* sychronizing with possible /proc/fs/...write */ + down(&sbi->ll_gns_sem); + + /* + * mount object name is taken from sbi, where it is set in mount time or + * via /proc/fs... tunable. It may be ".mntinfo" or so. + */ + dchild = ll_d_lookup(sbi->ll_gns_oname, dentry, + strlen(sbi->ll_gns_oname)); up(&sbi->ll_gns_sem); - /* We need to build an absolute pathname to pass to mount */ - pathpage = alloc_pages(GFP_HIGHUSER, 0); - if (pathpage == NULL) - GOTO(cleanup, rc = -ENOMEM); - path = kmap(pathpage); - LASSERT(path != NULL); - stage = 1; - fill_page_with_path(dentry, mnt, &path); - - dchild = lookup_one_len(".mntinfo", dentry, strlen(".mntinfo")); - if (dchild == NULL || IS_ERR(dchild)) { - CERROR("Directory %*s is setuid, but without a mount object.\n", - dentry->d_name.len, dentry->d_name.name); - GOTO(cleanup, rc = -1); + if (!dchild) + GOTO(cleanup, rc = -ENOENT); + + if (IS_ERR(dchild)) { + CERROR("can't find mount object %*s/%*s err = %d.\n", + (int)dentry->d_name.len, dentry->d_name.name, + (int)dchild->d_name.len, dchild->d_name.name, + (int)PTR_ERR(dchild)); + GOTO(cleanup, rc = PTR_ERR(dchild)); } mntget(mnt); + /* ok, mount object if found, opening it. 
*/ mntinfo_fd = dentry_open(dchild, mnt, 0); if (IS_ERR(mntinfo_fd)) { + CERROR("can't open mount object %*s/%*s err = %d.\n", + (int)dentry->d_name.len, dentry->d_name.name, + (int)dchild->d_name.len, dchild->d_name.name, + (int)PTR_ERR(mntinfo_fd)); dput(dchild); mntput(mnt); GOTO(cleanup, rc = PTR_ERR(mntinfo_fd)); } - stage = 2; + cleanup_phase = 2; if (mntinfo_fd->f_dentry->d_inode->i_size > PAGE_SIZE) { - CERROR("Mount object file is too big (%Ld)\n", + CERROR("mount object %*s/%*s is too big (%Ld)\n", + (int)dentry->d_name.len, dentry->d_name.name, + (int)dchild->d_name.len, dchild->d_name.name, mntinfo_fd->f_dentry->d_inode->i_size); - GOTO(cleanup, rc = -1); + GOTO(cleanup, rc = -EFBIG); } + + /* read data from mount object. */ mapping = mntinfo_fd->f_dentry->d_inode->i_mapping; - datapage = read_cache_page(mapping, 0, - (filler_t *)mapping->a_ops->readpage, + filler = (filler_t *)mapping->a_ops->readpage; + datapage = read_cache_page(mapping, 0, filler, mntinfo_fd); - if (IS_ERR(datapage)) + if (IS_ERR(datapage)) { + CERROR("can't read data from mount object %*s/%*s\n", + (int)dentry->d_name.len, dentry->d_name.name, + (int)dchild->d_name.len, dchild->d_name.name); GOTO(cleanup, rc = PTR_ERR(datapage)); + } p = kmap(datapage); LASSERT(p != NULL); - stage = 3; - p[PAGE_SIZE - 1] = '\0'; + cleanup_phase = 3; fput(mntinfo_fd); mntinfo_fd = NULL; - argv[0] = "/usr/lib/lustre/gns-upcall.sh"; + /* sychronizing with possible /proc/fs/...write */ + down(&sbi->ll_gns_sem); + + /* + * upcall is initialized in mount time or via /proc/fs/... 
tuneable and + * may be /usr/lib/lustre/gns-upcall.sh + */ + argv[0] = sbi->ll_gns_upcall; argv[1] = p; argv[2] = path; argv[3] = NULL; - rc = USERMODEHELPER(argv[0], argv, NULL); + + up(&sbi->ll_gns_sem); - if (rc != 0) { - CERROR("GNS mount failed: %d\n", rc); + rc = USERMODEHELPER(argv[0], argv, NULL); + if (rc) { + CERROR("failed to call GNS upcall %s, err = %d\n", + sbi->ll_gns_upcall, rc); GOTO(cleanup, rc); } - wait_for_completion(&sbi->ll_gns_completion); - LASSERT(sbi->ll_gns_state == LL_GNS_STATE_FINISHED); - - if (d_mountpoint(dentry)) { - /* successful follow_down will mntput and dput */ - tmp_mnt = mntget(mnt); - tmp_dentry = dget(dentry); - rc = follow_down(&tmp_mnt, &tmp_dentry); - if (rc == 1) { - struct ll_sb_info *sbi = ll_s2sbi(dentry->d_sb); + /* + * wait for mount completion. This is actually not need, because + * USERMODEHELPER() returns only when usermode process finishes. But we + * doing this just for case USERMODEHELPER() semanthics will be changed + * or usermode upcall program will start mounting in backgound and + * return instantly. --umka + */ + if (ll_gns_wait_for_mount(dentry, 1, GNS_WAIT_ATTEMPTS)) { + struct dentry *rdentry; + struct vfsmount *rmnt; + + /* mount is successful */ + LASSERT(sbi->ll_gns_state == LL_GNS_FINISHED); + + rmnt = mntget(mnt); + rdentry = dget(dentry); + + if (follow_down(&rmnt, &rdentry)) { + /* + * registering new mount in GNS mounts list and thus + * make it accessible from GNS control thread. 
+ */ spin_lock(&dcache_lock); - LASSERT(list_empty(&tmp_mnt->mnt_lustre_list)); - list_add_tail(&tmp_mnt->mnt_lustre_list, + LASSERT(list_empty(&rmnt->mnt_lustre_list)); + list_add_tail(&rmnt->mnt_lustre_list, &sbi->ll_mnt_list); spin_unlock(&dcache_lock); - - tmp_mnt->mnt_last_used = jiffies; - - mntput(tmp_mnt); - dput(tmp_dentry); - rc = 0; + rmnt->mnt_last_used = jiffies; + mntput(rmnt); + dput(rdentry); } else { mntput(mnt); dput(dentry); } + spin_lock(&dentry->d_lock); + dentry->d_flags &= ~DCACHE_GNS_PENDING; + spin_unlock(&dentry->d_lock); } else { - CERROR("Woke up from GNS mount, but no mountpoint in place.\n"); - rc = -1; + CERROR("usermode upcall %s failed to mount %s\n", + sbi->ll_gns_upcall, path); + rc = -ETIME; } EXIT; cleanup: - switch (stage) { + switch (cleanup_phase) { case 3: kunmap(datapage); page_cache_release(datapage); @@ -231,82 +284,87 @@ cleanup: if (mntinfo_fd != NULL) fput(mntinfo_fd); case 1: - kunmap(pathpage); - __free_pages(pathpage, 0); + free_page((unsigned long)pathpage); case 0: - down(&sbi->ll_gns_sem); - sbi->ll_gns_state = LL_GNS_STATE_IDLE; - up(&sbi->ll_gns_sem); + spin_lock(&sbi->ll_gns_lock); + sbi->ll_gns_state = LL_GNS_IDLE; + spin_unlock(&sbi->ll_gns_lock); + + spin_lock(&dentry->d_lock); + dentry->d_flags &= ~DCACHE_GNS_MOUNTING; + spin_unlock(&dentry->d_lock); } return rc; } -/* If timeout == 1, only remove the mounts which are properly aged. - * - * If timeout == 0, we are unmounting -- remove them all. */ -int ll_gns_umount_all(struct ll_sb_info *sbi, int timeout) +/* tries to umount passed @mnt. 
*/ +int ll_gns_umount_object(struct vfsmount *mnt) { - struct list_head kill_list = LIST_HEAD_INIT(kill_list); - struct page *page = NULL; - char *kpage = NULL, *path; - int rc; + int rc = 0; ENTRY; - - if (timeout == 0) { - page = alloc_pages(GFP_HIGHUSER, 0); - if (page == NULL) - RETURN(-ENOMEM); - kpage = kmap(page); - LASSERT(kpage != NULL); + + CDEBUG(D_INODE, "unmounting mnt %p\n", mnt); + rc = do_umount(mnt, 0); + if (rc) { + CDEBUG(D_INODE, "can't umount 0x%p, err = %d\n", + mnt, rc); } + + RETURN(rc); +} + +int ll_gns_check_mounts(struct ll_sb_info *sbi, int flags) +{ + struct list_head check_list = LIST_HEAD_INIT(check_list); + struct vfsmount *mnt; + unsigned long pass; + ENTRY; spin_lock(&dcache_lock); - list_splice_init(&sbi->ll_mnt_list, &kill_list); - - /* Walk the list in reverse order, and put them on the front of the - * sbi list each iteration; this avoids list-ordering problems if we - * race with another gns-mounting thread */ - while (!list_empty(&kill_list)) { - struct vfsmount *mnt = - list_entry(kill_list.prev, struct vfsmount, - mnt_lustre_list); + list_splice_init(&sbi->ll_mnt_list, &check_list); + + /* + * walk the list in reverse order, and put them on the front of the sbi + * list each iteration; this avoids list-ordering problems if we race + * with another gns-mounting thread. 
+ */ + while (!list_empty(&check_list)) { + mnt = list_entry(check_list.prev, + struct vfsmount, + mnt_lustre_list); + mntget(mnt); + list_del_init(&mnt->mnt_lustre_list); - list_add(&mnt->mnt_lustre_list, &sbi->ll_mnt_list); - if (timeout && - jiffies - mnt->mnt_last_used < GNS_MOUNT_TIMEOUT * HZ) { + list_add(&mnt->mnt_lustre_list, + &sbi->ll_mnt_list); + + /* check for timeout if needed */ + pass = jiffies - mnt->mnt_last_used; + + if (flags == LL_GNS_CHECK && + pass < sbi->ll_gns_timeout * HZ) + { mntput(mnt); continue; } spin_unlock(&dcache_lock); - CDEBUG(D_INODE, "unmounting mnt %p from sbi %p\n", mnt, sbi); + /* umounting @mnt */ + ll_gns_umount_object(mnt); - rc = do_umount(mnt, 0); - if (rc != 0 && page != NULL) { - int rc2; - path = kpage; - rc2 = fill_page_with_path(mnt->mnt_root, mnt, &path); - CERROR("GNS umount(%s): %d\n", rc2 == 0 ? path : "", - rc); - } mntput(mnt); spin_lock(&dcache_lock); } spin_unlock(&dcache_lock); - - if (page != NULL) { - kunmap(page); - __free_pages(page, 0); - } RETURN(0); } -static struct list_head gns_sbi_list = LIST_HEAD_INIT(gns_sbi_list); -static spinlock_t gns_lock = SPIN_LOCK_UNLOCKED; -static struct ptlrpc_thread gns_thread; - +/* + * GNS timer callback function. It restarts gns timer and wakes up GNS cvontrol + * thread to process mounts list. + */ void ll_gns_timer_callback(unsigned long data) { struct ll_sb_info *sbi = (void *)data; @@ -316,27 +374,35 @@ void ll_gns_timer_callback(unsigned long data) if (list_empty(&sbi->ll_gns_sbi_head)) list_add(&sbi->ll_gns_sbi_head, &gns_sbi_list); spin_unlock(&gns_lock); + wake_up(&gns_thread.t_ctl_waitq); - mod_timer(&sbi->ll_gns_timer, jiffies + GNS_TICK * HZ); + mod_timer(&sbi->ll_gns_timer, + jiffies + sbi->ll_gns_tick * HZ); } -static int gns_check_event(void) +/* this function checks if something new happened to exist in gns list. 
*/ +static int inline ll_gns_check_event(void) { int rc; + spin_lock(&gns_lock); rc = !list_empty(&gns_sbi_list); spin_unlock(&gns_lock); + return rc; } -static int inline gns_check_stopping(void) +/* should we staop GNS control thread? */ +static int inline ll_gns_check_stop(void) { mb(); return (gns_thread.t_flags & SVC_STOPPING) ? 1 : 0; } +/* GNS control thread function. */ static int ll_gns_thread_main(void *arg) { + struct ll_gns_ctl *ctl = arg; unsigned long flags; ENTRY; @@ -345,42 +411,57 @@ static int ll_gns_thread_main(void *arg) snprintf(name, sizeof(name) - 1, "ll_gns"); kportal_daemonize(name); } + SIGNAL_MASK_LOCK(current, flags); sigfillset(¤t->blocked); RECALC_SIGPENDING; SIGNAL_MASK_UNLOCK(current, flags); + /* + * letting starting function know, that we are ready and control may be + * returned. + */ gns_thread.t_flags = SVC_RUNNING; - wake_up(&gns_thread.t_ctl_waitq); + complete(&ctl->gc_starting); - while (!gns_check_stopping()) { + while (!ll_gns_check_stop()) { struct l_wait_info lwi = { 0 }; - l_wait_event(gns_thread.t_ctl_waitq, gns_check_event() || - gns_check_stopping(), &lwi); - + l_wait_event(gns_thread.t_ctl_waitq, + (ll_gns_check_event() || + ll_gns_check_stop()), &lwi); + spin_lock(&gns_lock); while (!list_empty(&gns_sbi_list)) { - struct ll_sb_info *sbi = - list_entry(gns_sbi_list.prev, struct ll_sb_info, - ll_gns_sbi_head); + struct ll_sb_info *sbi; + + sbi = list_entry(gns_sbi_list.prev, + struct ll_sb_info, + ll_gns_sbi_head); + list_del_init(&sbi->ll_gns_sbi_head); spin_unlock(&gns_lock); - ll_gns_umount_all(sbi, 1); + ll_gns_check_mounts(sbi, LL_GNS_CHECK); spin_lock(&gns_lock); } spin_unlock(&gns_lock); } + /* + * letting know stop function know that thread is stoped and it may + * return. + */ + EXIT; gns_thread.t_flags = SVC_STOPPED; - wake_up(&gns_thread.t_ctl_waitq); - RETURN(0); + /* this is SMP-safe way to finish thread. 
*/ + complete_and_exit(&ctl->gc_finishing, 0); } void ll_gns_add_timer(struct ll_sb_info *sbi) { - mod_timer(&sbi->ll_gns_timer, jiffies + GNS_TICK * HZ); + mod_timer(&sbi->ll_gns_timer, + jiffies + sbi->ll_gns_tick * HZ); } void ll_gns_del_timer(struct ll_sb_info *sbi) @@ -388,32 +469,40 @@ void ll_gns_del_timer(struct ll_sb_info *sbi) del_timer(&sbi->ll_gns_timer); } +/* + * starts GNS control thread and waits for a signal it is up and work may be + * continued. + */ int ll_gns_start_thread(void) { - struct l_wait_info lwi = { 0 }; int rc; + ENTRY; LASSERT(gns_thread.t_flags == 0); - + init_completion(&gns_ctl.gc_starting); + init_completion(&gns_ctl.gc_finishing); init_waitqueue_head(&gns_thread.t_ctl_waitq); - rc = kernel_thread(ll_gns_thread_main, NULL, CLONE_VM | CLONE_FILES); + + rc = kernel_thread(ll_gns_thread_main, &gns_ctl, + (CLONE_VM | CLONE_FILES)); if (rc < 0) { - CERROR("cannot start thread: %d\n", rc); - return rc; + CERROR("cannot start GNS control thread, " + "err = %d\n", rc); + RETURN(rc); } - l_wait_event(gns_thread.t_ctl_waitq, gns_thread.t_flags & SVC_RUNNING, - &lwi); - return 0; + wait_for_completion(&gns_ctl.gc_starting); + LASSERT(gns_thread.t_flags == SVC_RUNNING); + RETURN(0); } +/* stops GNS control thread and waits its actual stop. */ void ll_gns_stop_thread(void) { - struct l_wait_info lwi = { 0 }; - + ENTRY; gns_thread.t_flags = SVC_STOPPING; - wake_up(&gns_thread.t_ctl_waitq); - l_wait_event(gns_thread.t_ctl_waitq, gns_thread.t_flags & SVC_STOPPED, - &lwi); + wait_for_completion(&gns_ctl.gc_finishing); + LASSERT(gns_thread.t_flags == SVC_STOPPED); gns_thread.t_flags = 0; + EXIT; } diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h index 2dd8aae..ec99d29 100644 --- a/lustre/llite/llite_internal.h +++ b/lustre/llite/llite_internal.h @@ -34,6 +34,15 @@ struct ll_ra_info { unsigned long ra_stats[_NR_RA_STAT]; }; +/* after roughly how long should we remove an inactive mount? 
*/ +#define GNS_MOUNT_TIMEOUT 120 + +/* how often should the GNS timer look for mounts to cleanup? */ +#define GNS_TICK_TIMEOUT 1 + +/* how many times GNS will try to wait for 1 second for mount */ +#define GNS_WAIT_ATTEMPTS 10 + struct ll_sb_info { /* this protects pglist and max_r_a_pages. It isn't safe to grab from * interrupt contexts. */ @@ -78,16 +87,36 @@ struct ll_sb_info { struct list_head ll_mnt_list; struct semaphore ll_gns_sem; + spinlock_t ll_gns_lock; wait_queue_head_t ll_gns_waitq; - struct completion ll_gns_completion; int ll_gns_state; struct timer_list ll_gns_timer; struct list_head ll_gns_sbi_head; + + unsigned long ll_gns_tick; + unsigned long ll_gns_timeout; + struct completion ll_gns_mount_finished; + + /* path to upcall */ + char ll_gns_upcall[PATH_MAX]; + + /* mount object entry name */ + char ll_gns_oname[PATH_MAX]; +}; + +struct ll_gns_ctl { + struct completion gc_starting; + struct completion gc_finishing; }; -#define LL_GNS_STATE_IDLE 1100 -#define LL_GNS_STATE_MOUNTING 1101 -#define LL_GNS_STATE_FINISHED 1102 +/* mounting states */ +#define LL_GNS_IDLE (1 << 0) +#define LL_GNS_MOUNTING (1 << 1) +#define LL_GNS_FINISHED (1 << 2) + +/* mounts checking flags */ +#define LL_GNS_UMOUNT (1 << 0) +#define LL_GNS_CHECK (1 << 1) struct ll_readahead_state { spinlock_t ras_lock; @@ -98,6 +127,7 @@ struct ll_readahead_state { }; extern kmem_cache_t *ll_file_data_slab; +extern kmem_cache_t *ll_intent_slab; struct lustre_handle; struct ll_file_data { struct obd_client_handle fd_mds_och; @@ -192,7 +222,13 @@ void ll_truncate(struct inode *inode); /* llite/file.c */ extern struct file_operations ll_file_operations; extern struct inode_operations ll_file_inode_operations; -extern int ll_inode_revalidate_it(struct dentry *, struct lookup_intent *); +extern int ll_inode_revalidate_it(struct dentry *); +extern int ll_setxattr(struct dentry *, const char *, const void *, + size_t, int); +extern int ll_getxattr(struct dentry *, const char *, void *, size_t); 
+extern int ll_listxattr(struct dentry *, char *, size_t); +extern int ll_removexattr(struct dentry *, const char *); +extern int ll_inode_permission(struct inode *, int, struct nameidata *); int ll_refresh_lsm(struct inode *inode, struct lov_stripe_md *lsm); int ll_extent_lock(struct ll_file_data *, struct inode *, struct lov_stripe_md *, int mode, ldlm_policy_data_t *, @@ -208,8 +244,7 @@ int ll_local_open(struct file *file, struct lookup_intent *it); int ll_md_close(struct obd_export *md_exp, struct inode *inode, struct file *file); #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) -int ll_getattr(struct vfsmount *mnt, struct dentry *de, - struct lookup_intent *it, struct kstat *stat); +int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat); #endif void ll_stime_record(struct ll_sb_info *sbi, struct timeval *start, struct obd_service_time *stime); @@ -217,6 +252,8 @@ void ll_stime_record(struct ll_sb_info *sbi, struct timeval *start, /* llite/dcache.c */ void ll_intent_drop_lock(struct lookup_intent *); void ll_intent_release(struct lookup_intent *); +int ll_intent_alloc(struct lookup_intent *); +void ll_intent_free(struct lookup_intent *it); extern void ll_set_dd(struct dentry *de); void ll_unhash_aliases(struct inode *); void ll_frob_intent(struct lookup_intent **itp, struct lookup_intent *deft); @@ -226,22 +263,26 @@ int revalidate_it_finish(struct ptlrpc_request *request, int offset, /* llite/llite_gns.c */ -int ll_finish_gns(struct ll_sb_info *sbi); -int fill_page_with_path(struct dentry *, struct vfsmount *, char **pagep); -int ll_dir_process_mount_object(struct dentry *, struct vfsmount *); -int ll_gns_umount_all(struct ll_sb_info *sbi, int timeout); +int ll_gns_start_thread(void); +void ll_gns_stop_thread(void); + +int ll_gns_mount_object(struct dentry *dentry, + struct vfsmount *mnt); +int ll_gns_umount_object(struct vfsmount *mnt); + +int ll_gns_check_mounts(struct ll_sb_info *sbi, + int flags); + void ll_gns_timer_callback(unsigned 
long data); void ll_gns_add_timer(struct ll_sb_info *sbi); void ll_gns_del_timer(struct ll_sb_info *sbi); -int ll_gns_start_thread(void); -void ll_gns_stop_thread(void); /* llite/llite_lib.c */ extern struct super_operations lustre_super_operations; char *ll_read_opt(const char *opt, char *data); int ll_set_opt(const char *opt, char *data, int fl); -void ll_options(char *options, char **ost, char **mds, int *flags); +void ll_options(char *options, char **ost, char **mds, char **sec, int *flags); void ll_lli_init(struct ll_inode_info *lli); int ll_fill_super(struct super_block *sb, void *data, int silent); int lustre_fill_super(struct super_block *sb, void *data, int silent); @@ -335,7 +376,6 @@ int ll_get_fid(struct obd_export *exp, struct lustre_id *idp, #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) #define ll_s2sbi(sb) ((struct ll_sb_info *)((sb)->s_fs_info)) #define ll_set_sbi(sb, sbi) ((sb)->s_fs_info = sbi) -void __d_rehash(struct dentry * entry, int lock); static inline __u64 ll_ts2u64(struct timespec *time) { __u64 t = time->tv_sec; diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index f0443a1..338a597 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -32,9 +32,11 @@ #include #include #include +#include #include "llite_internal.h" kmem_cache_t *ll_file_data_slab; +kmem_cache_t *ll_intent_slab; extern struct address_space_operations ll_aops; extern struct address_space_operations ll_dir_aops; @@ -63,13 +65,28 @@ struct ll_sb_info *lustre_init_sbi(struct super_block *sb) INIT_LIST_HEAD(&sbi->ll_conn_chain); INIT_HLIST_HEAD(&sbi->ll_orphan_dentry_list); INIT_LIST_HEAD(&sbi->ll_mnt_list); + sema_init(&sbi->ll_gns_sem, 1); - init_completion(&sbi->ll_gns_completion); - sbi->ll_gns_state = LL_GNS_STATE_IDLE; + spin_lock_init(&sbi->ll_gns_lock); + INIT_LIST_HEAD(&sbi->ll_gns_sbi_head); + init_waitqueue_head(&sbi->ll_gns_waitq); + init_completion(&sbi->ll_gns_mount_finished); + + /* this later may be reset via /proc/fs/... 
*/ + memcpy(sbi->ll_gns_oname, ".mntinfo", strlen(".mntinfo")); + sbi->ll_gns_oname[strlen(sbi->ll_gns_oname) - 1] = '\0'; + + /* this later may be reset via /proc/fs/... */ + memset(sbi->ll_gns_upcall, 0, sizeof(sbi->ll_gns_upcall)); + + /* default values, may be changed via /proc/fs/... */ + sbi->ll_gns_state = LL_GNS_IDLE; + sbi->ll_gns_tick = GNS_TICK_TIMEOUT; + sbi->ll_gns_timeout = GNS_MOUNT_TIMEOUT; + sbi->ll_gns_timer.data = (unsigned long)sbi; sbi->ll_gns_timer.function = ll_gns_timer_callback; init_timer(&sbi->ll_gns_timer); - INIT_LIST_HEAD(&sbi->ll_gns_sbi_head); ll_set_sbi(sb, sbi); @@ -104,7 +121,10 @@ int lustre_init_dt_desc(struct ll_sb_info *sbi) RETURN(rc); } -int lustre_common_fill_super(struct super_block *sb, char *lmv, char *lov) +extern struct dentry_operations ll_d_ops; + +int lustre_common_fill_super(struct super_block *sb, char *lmv, char *lov, + char *security, __u32 *nllu) { struct ll_sb_info *sbi = ll_s2sbi(sb); struct ptlrpc_request *request = NULL; @@ -124,6 +144,25 @@ int lustre_common_fill_super(struct super_block *sb, char *lmv, char *lov) RETURN(-EINVAL); } + if (security == NULL) + security = "null"; + + err = obd_set_info(obd->obd_self_export, strlen("sec"), "sec", + strlen(security), security); + if (err) { + CERROR("LMV %s: failed to set security %s, err %d\n", + lmv, security, err); + RETURN(err); + } + + err = obd_set_info(obd->obd_self_export, strlen("nllu"), "nllu", + sizeof(__u32) * 2, nllu); + if (err) { + CERROR("LMV %s: failed to set NLLU, err %d\n", + lmv, err); + RETURN(err); + } + if (proc_lustre_fs_root) { err = lprocfs_register_mountpoint(proc_lustre_fs_root, sb, lov, lmv); @@ -199,7 +238,7 @@ int lustre_common_fill_super(struct super_block *sb, char *lmv, char *lov) /* make root inode * XXX: move this to after cbd setup? 
*/ err = md_getattr(sbi->ll_md_exp, &sbi->ll_rootid, - (OBD_MD_FLNOTOBD | OBD_MD_FLBLOCKS | OBD_MD_FID), + (OBD_MD_FLNOTOBD | OBD_MD_FLBLOCKS | OBD_MD_FID), NULL, 0, 0, &request); if (err) { CERROR("md_getattr failed for root: rc = %d\n", err); @@ -241,6 +280,7 @@ int lustre_common_fill_super(struct super_block *sb, char *lmv, char *lov) #endif sb->s_root = d_alloc_root(root); + sb->s_root->d_op = &ll_d_ops; #ifdef S_PDIROPS CWARN("Enabling PDIROPS\n"); @@ -327,7 +367,7 @@ int ll_set_opt(const char *opt, char *data, int fl) RETURN(fl); } -void ll_options(char *options, char **lov, char **lmv, int *flags) +void ll_options(char *options, char **lov, char **lmv, char **sec, int *flags) { char *this_char; #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) @@ -352,6 +392,8 @@ void ll_options(char *options, char **lov, char **lmv, int *flags) continue; if (!*lmv && (*lmv = ll_read_opt("mdc", this_char))) continue; + if (!*sec && (*sec = ll_read_opt("sec", this_char))) + continue; if (!(*flags & LL_SBI_NOLCK) && ((*flags) = (*flags) | ll_set_opt("nolock", this_char, @@ -378,6 +420,8 @@ int ll_fill_super(struct super_block *sb, void *data, int silent) struct ll_sb_info *sbi; char *lov = NULL; char *lmv = NULL; + char *sec = NULL; + __u32 nllu[2] = { 99, 99 }; int err; ENTRY; @@ -388,7 +432,7 @@ int ll_fill_super(struct super_block *sb, void *data, int silent) RETURN(-ENOMEM); sbi->ll_flags |= LL_SBI_READAHEAD; - ll_options(data, &lov, &lmv, &sbi->ll_flags); + ll_options(data, &lov, &lmv, &sec, &sbi->ll_flags); if (!lov) { CERROR("no osc\n"); @@ -400,12 +444,14 @@ int ll_fill_super(struct super_block *sb, void *data, int silent) GOTO(out, err = -EINVAL); } - err = lustre_common_fill_super(sb, lmv, lov); + err = lustre_common_fill_super(sb, lmv, lov, sec, nllu); EXIT; out: if (err) lustre_free_sbi(sb); + if (sec) + OBD_FREE(sec, strlen(sec) + 1); if (lmv) OBD_FREE(lmv, strlen(lmv) + 1); if (lov) @@ -426,8 +472,7 @@ static int lustre_process_log(struct lustre_mount_data *lmd, 
char *profile, class_uuid_t uuid; struct obd_uuid lmv_uuid; struct llog_ctxt *ctxt; - int rc = 0; - int err; + int rc, err = 0; ENTRY; if (lmd_bad_magic(lmd)) @@ -440,9 +485,9 @@ static int lustre_process_log(struct lustre_mount_data *lmd, char *profile, PCFG_INIT(pcfg, NAL_CMD_REGISTER_MYNID); pcfg.pcfg_nal = lmd->lmd_nal; pcfg.pcfg_nid = lmd->lmd_local_nid; - err = libcfs_nal_cmd(&pcfg); - if (err <0) - GOTO(out, err); + rc = libcfs_nal_cmd(&pcfg); + if (rc < 0) + GOTO(out, rc); } if (lmd->lmd_nal == SOCKNAL || @@ -455,9 +500,9 @@ static int lustre_process_log(struct lustre_mount_data *lmd, char *profile, pcfg.pcfg_nid = lmd->lmd_server_nid; pcfg.pcfg_id = lmd->lmd_server_ipaddr; pcfg.pcfg_misc = lmd->lmd_port; - err = libcfs_nal_cmd(&pcfg); - if (err <0) - GOTO(out, err); + rc = libcfs_nal_cmd(&pcfg); + if (rc < 0) + GOTO(out, rc); } LCFG_INIT(lcfg, LCFG_ADD_UUID, name); @@ -465,9 +510,9 @@ static int lustre_process_log(struct lustre_mount_data *lmd, char *profile, lcfg.lcfg_inllen1 = strlen(peer) + 1; lcfg.lcfg_inlbuf1 = peer; lcfg.lcfg_nal = lmd->lmd_nal; - err = class_process_config(&lcfg); - if (err < 0) - GOTO(out_del_conn, err); + rc = class_process_config(&lcfg); + if (rc < 0) + GOTO(out_del_conn, rc); LCFG_INIT(lcfg, LCFG_ATTACH, name); lcfg.lcfg_inlbuf1 = "mdc"; @@ -475,33 +520,38 @@ static int lustre_process_log(struct lustre_mount_data *lmd, char *profile, lcfg.lcfg_inlbuf2 = lmv_uuid.uuid; lcfg.lcfg_inllen2 = strlen(lcfg.lcfg_inlbuf2) + 1; err = class_process_config(&lcfg); - if (err < 0) - GOTO(out_del_uuid, err); + if (rc < 0) + GOTO(out_del_uuid, rc); LCFG_INIT(lcfg, LCFG_SETUP, name); lcfg.lcfg_inlbuf1 = lmd->lmd_mds; lcfg.lcfg_inllen1 = strlen(lcfg.lcfg_inlbuf1) + 1; lcfg.lcfg_inlbuf2 = peer; lcfg.lcfg_inllen2 = strlen(lcfg.lcfg_inlbuf2) + 1; - err = class_process_config(&lcfg); - if (err < 0) - GOTO(out_detach, err); + rc = class_process_config(&lcfg); + if (rc < 0) + GOTO(out_detach, rc); obd = class_name2obd(name); if (obd == NULL) - 
GOTO(out_cleanup, err = -EINVAL); + GOTO(out_cleanup, rc = -EINVAL); + + rc = obd_set_info(obd->obd_self_export, strlen("sec"), "sec", + strlen(lmd->lmd_security), lmd->lmd_security); + if (rc) + GOTO(out_cleanup, rc); /* Disable initial recovery on this import */ - err = obd_set_info(obd->obd_self_export, - strlen("initial_recov"), "initial_recov", - sizeof(allow_recov), &allow_recov); - if (err) - GOTO(out_cleanup, err); + rc = obd_set_info(obd->obd_self_export, + strlen("initial_recov"), "initial_recov", + sizeof(allow_recov), &allow_recov); + if (rc) + GOTO(out_cleanup, rc); - err = obd_connect(&md_conn, obd, &lmv_uuid, 0); - if (err) { - CERROR("cannot connect to %s: rc = %d\n", lmd->lmd_mds, err); - GOTO(out_cleanup, err); + rc = obd_connect(&md_conn, obd, &lmv_uuid, 0); + if (rc) { + CERROR("cannot connect to %s: rc = %d\n", lmd->lmd_mds, rc); + GOTO(out_cleanup, rc); } exp = class_conn2export(&md_conn); @@ -511,7 +561,7 @@ static int lustre_process_log(struct lustre_mount_data *lmd, char *profile, if (rc) CERROR("class_config_process_llog failed: rc = %d\n", rc); - err = obd_disconnect(exp, 0); + rc = obd_disconnect(exp, 0); EXIT; out_cleanup: @@ -538,12 +588,16 @@ out_del_conn: lmd->lmd_nal == IIBNAL || lmd->lmd_nal == VIBNAL || lmd->lmd_nal == RANAL) { + int err2; + PCFG_INIT(pcfg, NAL_CMD_DEL_PEER); pcfg.pcfg_nal = lmd->lmd_nal; pcfg.pcfg_nid = lmd->lmd_server_nid; pcfg.pcfg_flags = 1; /* single_share */ - err = libcfs_nal_cmd(&pcfg); - if (err <0) + err2 = libcfs_nal_cmd(&pcfg); + if (err2 && !err) + err = err2; + if (err < 0) GOTO(out, err); } out: @@ -580,6 +634,7 @@ int lustre_fill_super(struct super_block *sb, void *data, int silent) CERROR("no mds name\n"); GOTO(out_free, err = -EINVAL); } + lmd->lmd_security[sizeof(lmd->lmd_security) - 1] = 0; OBD_ALLOC(sbi->ll_lmd, sizeof(*sbi->ll_lmd)); if (sbi->ll_lmd == NULL) @@ -631,7 +686,8 @@ int lustre_fill_super(struct super_block *sb, void *data, int silent) GOTO(out_free, err = -EINVAL); } - err = 
lustre_common_fill_super(sb, lmv, lov); + err = lustre_common_fill_super(sb, lmv, lov, lmd->lmd_security, + &lmd->lmd_nllu); if (err) GOTO(out_free, err); @@ -957,7 +1013,7 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr) /* If only OST attributes being set on objects, don't do MDS RPC. * In that case, we need to check permissions and update the local * inode ourselves so we can call obdo_from_inode() always. */ - if (ia_valid & (lsm ? ~(ATTR_SIZE | ATTR_FROM_OPEN | ATTR_RAW) : ~0)) { + if (ia_valid & (lsm ? ~(ATTR_SIZE | ATTR_FROM_OPEN /*| ATTR_RAW*/) : ~0)) { struct lustre_md md; OBD_ALLOC(op_data, sizeof(*op_data)); @@ -1094,8 +1150,8 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr) int ll_setattr(struct dentry *de, struct iattr *attr) { - LBUG(); /* code is unused, but leave this in case of VFS changes */ - RETURN(-ENOSYS); + LASSERT(de->d_inode); + return ll_setattr_raw(de->d_inode, attr); } int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs, @@ -1184,10 +1240,12 @@ void ll_update_inode(struct inode *inode, struct lustre_md *md) struct lov_stripe_md *lsm = md->lsm; struct mds_body *body = md->body; struct mea *mea = md->mea; + struct posix_acl *ll_acl_access = md->acl_access; ENTRY; LASSERT((lsm != NULL) == ((body->valid & OBD_MD_FLEASIZE) != 0)); LASSERT((mea != NULL) == ((body->valid & OBD_MD_FLDIREA) != 0)); + if (lsm != NULL) { LASSERT(lsm->lsm_object_gr > 0); if (lli->lli_smd == NULL) { @@ -1250,6 +1308,14 @@ void ll_update_inode(struct inode *inode, struct lustre_md *md) if (body->valid & OBD_MD_FLGENER) id_gen(&lli->lli_id) = id_gen(&body->id1); + spin_lock(&lli->lli_lock); + if (ll_acl_access != NULL) { + if (lli->lli_acl_access != NULL) + posix_acl_release(lli->lli_acl_access); + lli->lli_acl_access = ll_acl_access; + } + spin_unlock(&lli->lli_lock); + if (body->valid & OBD_MD_FLID) inode->i_ino = id_ino(&body->id1); if (body->valid & OBD_MD_FLGENER) @@ -1415,7 +1481,7 @@ int ll_iocontrol(struct inode 
*inode, struct file *file, struct mds_body *body; ll_inode2id(&id, inode); - rc = md_getattr(sbi->ll_md_exp, &id, valid, 0, &req); + rc = md_getattr(sbi->ll_md_exp, &id, valid, NULL, 0, 0, &req); if (rc) { CERROR("failure %d inode %lu\n", rc, inode->i_ino); RETURN(-abs(rc)); diff --git a/lustre/llite/llite_nfs.c b/lustre/llite/llite_nfs.c index e94b605..2d35405 100644 --- a/lustre/llite/llite_nfs.c +++ b/lustre/llite/llite_nfs.c @@ -61,7 +61,8 @@ static struct inode *search_inode_for_lustre(struct super_block *sb, id_ino(&id) = (__u64)ino; id_gen(&id) = generation; - rc = md_getattr(sbi->ll_md_exp, &id, valid, eadatalen, &req); + rc = md_getattr(sbi->ll_md_exp, &id, valid, NULL, 0, + eadatalen, &req); if (rc) { CERROR("failure %d inode %lu\n", rc, ino); return ERR_PTR(rc); diff --git a/lustre/llite/lproc_llite.c b/lustre/llite/lproc_llite.c index 665e9d7..815c1ac 100644 --- a/lustre/llite/lproc_llite.c +++ b/lustre/llite/lproc_llite.c @@ -35,7 +35,6 @@ struct file_operations llite_dump_pgcache_fops; struct file_operations ll_ra_stats_fops; struct file_operations llite_wait_times_fops; - #ifndef LPROCFS int lprocfs_register_mountpoint(struct proc_dir_entry *parent, struct super_block *sb, char *osc, char *mdc) @@ -263,6 +262,126 @@ static int ll_wr_max_read_ahead_mb(struct file *file, const char *buffer, return count; } +static int ll_rd_gns_upcall(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct super_block *sb = (struct super_block *)data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int len; + + down(&sbi->ll_gns_sem); + len = snprintf(page, count, "%s\n", sbi->ll_gns_upcall); + up(&sbi->ll_gns_sem); + + return len; +} + +static int ll_wr_gns_upcall(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct super_block *sb = (struct super_block *)data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + down(&sbi->ll_gns_sem); + snprintf(sbi->ll_gns_upcall, count, "%s", buffer); + up(&sbi->ll_gns_sem); + + 
return count; +} + +static int ll_rd_gns_object_name(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct super_block *sb = (struct super_block *)data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int len; + + down(&sbi->ll_gns_sem); + len = snprintf(page, count, "%s\n", sbi->ll_gns_oname); + up(&sbi->ll_gns_sem); + + return len; +} + +static int ll_wr_gns_object_name(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct super_block *sb = (struct super_block *)data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + down(&sbi->ll_gns_sem); + snprintf(sbi->ll_gns_oname, count, "%s", buffer); + up(&sbi->ll_gns_sem); + + return count; +} + +static int ll_rd_gns_timeout(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct super_block *sb = (struct super_block *)data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int len; + + down(&sbi->ll_gns_sem); + len = snprintf(page, count, "%lu\n", + (unsigned long)sbi->ll_gns_timeout); + up(&sbi->ll_gns_sem); + + return len; +} + +static int ll_wr_gns_timeout(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct super_block *sb = (struct super_block *)data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + down(&sbi->ll_gns_sem); + sbi->ll_gns_timeout = val; + up(&sbi->ll_gns_sem); + + return count; +} + +static int ll_rd_gns_tick(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct super_block *sb = (struct super_block *)data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + int len; + + down(&sbi->ll_gns_sem); + len = snprintf(page, count, "%lu\n", + (unsigned long)sbi->ll_gns_tick); + up(&sbi->ll_gns_sem); + + return len; +} + +static int ll_wr_gns_tick(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct super_block *sb = (struct super_block *)data; + struct ll_sb_info 
*sbi = ll_s2sbi(sb); + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + down(&sbi->ll_gns_sem); + if (sbi->ll_gns_tick < sbi->ll_gns_timeout) + sbi->ll_gns_tick = val; + up(&sbi->ll_gns_sem); + + return count; +} static struct lprocfs_vars lprocfs_obd_vars[] = { { "uuid", ll_rd_sb_uuid, 0, 0 }, //{ "mntpt_path", ll_rd_path, 0, 0 }, @@ -278,6 +397,19 @@ static struct lprocfs_vars lprocfs_obd_vars[] = { { "config_update", 0, ll_wr_config_update, 0 }, { "max_read_ahead_mb", ll_rd_max_read_ahead_mb, ll_wr_max_read_ahead_mb, 0 }, + + { "gns_upcall", ll_rd_gns_upcall, + ll_wr_gns_upcall, 0 }, + + { "gns_timeout", ll_rd_gns_timeout, + ll_wr_gns_timeout, 0 }, + + { "gns_tick", ll_rd_gns_tick, + ll_wr_gns_tick, 0 }, + + { "gns_object_name", ll_rd_gns_object_name, + ll_wr_gns_object_name, 0 }, + { 0 } }; @@ -329,7 +461,8 @@ struct llite_file_opcode { "direct_read" }, { LPROC_LL_DIRECT_WRITE, LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_PAGES, "direct_write" }, - + { LPROC_LL_SETXATTR, LPROCFS_TYPE_REGS, "setxattr" }, + { LPROC_LL_GETXATTR, LPROCFS_TYPE_REGS, "getxattr" }, }; int lprocfs_register_mountpoint(struct proc_dir_entry *parent, diff --git a/lustre/llite/namei.c b/lustre/llite/namei.c index 0b16f62..d291096 100644 --- a/lustre/llite/namei.c +++ b/lustre/llite/namei.c @@ -254,7 +254,7 @@ struct dentry *ll_find_alias(struct inode *inode, struct dentry *de) list_del_init(&dentry->d_lru); hlist_del_init(&dentry->d_hash); - __d_rehash(dentry, 0); /* avoid taking dcache_lock inside */ + __d_rehash(dentry); /* avoid taking dcache_lock inside */ spin_unlock(&dcache_lock); atomic_inc(&dentry->d_count); iput(inode); @@ -294,7 +294,7 @@ static int lookup_it_finish(struct ptlrpc_request *request, int offset, CDEBUG(D_DLMTRACE, "setting l_data to inode %p (%lu/%u)\n", inode, inode->i_ino, inode->i_generation); - mdc_set_lock_data(NULL, &it->d.lustre.it_lock_handle, inode); + mdc_set_lock_data(NULL, &LUSTRE_IT(it)->it_lock_handle, inode); /* If 
this is a stat, get the authoritative file size */ if (it->it_op == IT_GETATTR && S_ISREG(inode->i_mode) && @@ -329,20 +329,17 @@ static int lookup_it_finish(struct ptlrpc_request *request, int offset, } static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, - struct nameidata *nd, struct lookup_intent *it, - int flags) + struct nameidata *nd, int flags) { struct dentry *save = dentry, *retval; + struct lookup_intent *it = flags ? &nd->intent.open : NULL; struct lustre_id pid; struct it_cb_data icbd; struct ptlrpc_request *req = NULL; struct lookup_intent lookup_it = { .it_op = IT_LOOKUP }; - int rc; + int rc, orig_it; ENTRY; - if (dentry->d_name.len > EXT3_NAME_LEN) - RETURN(ERR_PTR(-ENAMETOOLONG)); - CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p),intent=%s\n", dentry->d_name.name, parent->i_ino, parent->i_generation, parent, LL_IT2STR(it)); @@ -353,6 +350,7 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, if (nd != NULL) nd->mnt->mnt_last_used = jiffies; + orig_it = it ? 
it->it_op : IT_OPEN; ll_frob_intent(&it, &lookup_it); icbd.icbd_childp = &dentry; @@ -376,8 +374,12 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, if (nd && dentry->d_inode != NULL && dentry->d_inode->i_mode & S_ISUID && S_ISDIR(dentry->d_inode->i_mode) && - (flags & LOOKUP_CONTINUE || (it->it_op & (IT_CHDIR | IT_OPEN)))) - ll_dir_process_mount_object(dentry, nd->mnt); + ((flags & LOOKUP_CONTINUE) || (orig_it & (IT_CHDIR | IT_OPEN)))) + { + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_GNS_PENDING; + spin_unlock(&dentry->d_lock); + } if (dentry == save) GOTO(out, retval = NULL); @@ -386,6 +388,8 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, out: if (req) ptlrpc_req_finished(req); + if (it == &lookup_it) + ll_intent_release(it); if (dentry->d_inode) CDEBUG(D_INODE, "lookup 0x%p in %lu/%lu: %*s -> %lu/%lu\n", dentry, @@ -411,9 +415,9 @@ static struct dentry *ll_lookup_nd(struct inode *parent, struct dentry *dentry, ENTRY; if (nd && nd->flags & LOOKUP_LAST && !(nd->flags & LOOKUP_LINK_NOTLAST)) - de = ll_lookup_it(parent, dentry, nd, &nd->intent, nd->flags); + de = ll_lookup_it(parent, dentry, nd, nd->flags); else - de = ll_lookup_it(parent, dentry, nd, NULL, 0); + de = ll_lookup_it(parent, dentry, nd, 0); RETURN(de); } @@ -431,9 +435,10 @@ static struct inode *ll_create_node(struct inode *dir, const char *name, int rc; ENTRY; - LASSERT(it && it->d.lustre.it_disposition); - request = it->d.lustre.it_data; + LASSERT(it && LUSTRE_IT(it)->it_disposition); + + request = LUSTRE_IT(it)->it_data; rc = ll_prep_inode(sbi->ll_dt_exp, sbi->ll_md_exp, &inode, request, 1, dir->i_sb); if (rc) @@ -446,7 +451,7 @@ static struct inode *ll_create_node(struct inode *dir, const char *name, * stuff it in the lock. 
*/ CDEBUG(D_DLMTRACE, "setting l_ast_data to inode %p (%lu/%u)\n", inode, inode->i_ino, inode->i_generation); - mdc_set_lock_data(NULL, &it->d.lustre.it_lock_handle, inode); + mdc_set_lock_data(NULL, &LUSTRE_IT(it)->it_lock_handle, inode); EXIT; out: ptlrpc_req_finished(request); @@ -471,7 +476,7 @@ static int ll_create_it(struct inode *dir, struct dentry *dentry, int mode, struct lookup_intent *it) { struct inode *inode; - struct ptlrpc_request *request = it->d.lustre.it_data; + struct ptlrpc_request *request = LUSTRE_IT(it)->it_data; struct obd_export *md_exp = ll_i2mdexp(dir); int rc = 0; ENTRY; @@ -497,7 +502,7 @@ static int ll_create_it(struct inode *dir, struct dentry *dentry, int mode, #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) static int ll_create_nd(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) { - return ll_create_it(dir, dentry, mode, &nd->intent); + return ll_create_it(dir, dentry, mode, &nd->intent.open); } #endif @@ -533,9 +538,6 @@ static int ll_mknod_raw(struct nameidata *nd, int mode, dev_t rdev) CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p)\n", name, dir->i_ino, dir->i_generation, dir); - if (dir->i_nlink >= EXT3_LINK_MAX) - RETURN(err); - mode &= ~current->fs->umask; switch (mode & S_IFMT) { @@ -582,9 +584,6 @@ static int ll_mknod(struct inode *dir, struct dentry *child, CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p)\n", name, dir->i_ino, dir->i_generation, dir); - if (dir->i_nlink >= EXT3_LINK_MAX) - RETURN(err); - mode &= ~current->fs->umask; switch (mode & S_IFMT) { @@ -640,14 +639,12 @@ static int ll_symlink_raw(struct nameidata *nd, const char *tgt) CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p),target=%s\n", name, dir->i_ino, dir->i_generation, dir, tgt); - - if (dir->i_nlink >= EXT3_LINK_MAX) - RETURN(err); - + OBD_ALLOC(op_data, sizeof(*op_data)); if (op_data == NULL) RETURN(-ENOMEM); ll_prepare_mdc_data(op_data, dir, NULL, name, len, 0); + LASSERT(tgt); err = md_create(sbi->ll_md_exp, op_data, 
tgt, strlen(tgt) + 1, S_IFLNK | S_IRWXUGO, current->fsuid, current->fsgid, 0, &request); @@ -883,17 +880,53 @@ static int ll_rename_raw(struct nameidata *oldnd, struct nameidata *newnd) RETURN(err); } +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) +#define LLITE_IT_RAWOPS (IT_MKNOD|IT_MKDIR|IT_SYMLINK|IT_LINK|IT_UNLINK|IT_RMDIR|IT_RENAME) +static int ll_rawop_from_intent(struct nameidata *nd) +{ + int error = 0; + + if (!nd || !(nd->intent.open.op & LLITE_IT_RAWOPS)) + return 0; + + switch (nd->intent.open.op) { + case IT_MKNOD: + error = ll_mknod_raw(nd, nd->intent.open.create_mode, + nd->intent.open.create.dev); + break; + case IT_MKDIR: + error = ll_mkdir_raw(nd, nd->intent.open.create_mode); + break; + case IT_RMDIR: + error = ll_rmdir_raw(nd); + break; + case IT_UNLINK: + error = ll_unlink_raw(nd); + break; + case IT_SYMLINK: + LASSERT(nd->intent.open.create.link); + error = ll_symlink_raw(nd, nd->intent.open.create.link); + break; + case IT_LINK: + error = ll_link_raw(nd->intent.open.create.source_nd, nd); + break; + case IT_RENAME: + LASSERT(nd->intent.open.create.source_nd); + error = ll_rename_raw(nd->intent.open.create.source_nd, nd); + break; + default: + LBUG(); + } + if (error != -EOPNOTSUPP) + nd->intent.open.flags |= IT_STATUS_RAW; + + return error; +} +#endif + struct inode_operations ll_dir_inode_operations = { - .link_raw = ll_link_raw, - .unlink_raw = ll_unlink_raw, - .symlink_raw = ll_symlink_raw, - .mkdir_raw = ll_mkdir_raw, - .rmdir_raw = ll_rmdir_raw, - .mknod_raw = ll_mknod_raw, .mknod = ll_mknod, - .rename_raw = ll_rename_raw, .setattr = ll_setattr, - .setattr_raw = ll_setattr_raw, #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) .create_it = ll_create_it, .lookup_it = ll_lookup_it, @@ -901,6 +934,12 @@ struct inode_operations ll_dir_inode_operations = { #else .lookup = ll_lookup_nd, .create = ll_create_nd, - .getattr_it = ll_getattr, + .getattr = ll_getattr, + .endparentlookup = ll_rawop_from_intent, #endif + .setxattr = ll_setxattr, + 
.getxattr = ll_getxattr, + .listxattr = ll_listxattr, + .removexattr = ll_removexattr, + .permission = ll_inode_permission, }; diff --git a/lustre/llite/special.c b/lustre/llite/special.c index ae0d11f..befc716 100644 --- a/lustre/llite/special.c +++ b/lustre/llite/special.c @@ -320,7 +320,7 @@ static int ll_special_open(struct inode *inode, struct file *filp) rc = err; } - req = it->d.lustre.it_data; + req = LUSTRE_IT(it)->it_data; if (req) ptlrpc_req_finished(req); @@ -338,13 +338,18 @@ static int ll_special_file_release(struct inode *inode, struct file *filp) } struct inode_operations ll_special_inode_operations = { - .setattr_raw = ll_setattr_raw, .setattr = ll_setattr, #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) - .getattr_it = ll_getattr, + .getattr = ll_getattr, #else .revalidate_it = ll_inode_revalidate_it, #endif + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .listxattr = ll_listxattr, + .removexattr = ll_removexattr, + .permission = ll_inode_permission, + }; struct file_operations ll_special_chr_inode_fops = { diff --git a/lustre/llite/super.c b/lustre/llite/super.c index fcb89b0..f267dfc 100644 --- a/lustre/llite/super.c +++ b/lustre/llite/super.c @@ -64,8 +64,7 @@ static struct super_block *lustre_read_super(struct super_block *sb, static void ll_umount_lustre(struct super_block *sb) { struct ll_sb_info *sbi = ll_s2sbi(sb); - - ll_gns_umount_all(sbi, 0); + ll_gns_check_all(sbi, LL_GNS_UMOUNT); } static struct file_system_type lustre_lite_fs_type = { @@ -108,6 +107,16 @@ static int __init init_lustre_lite(void) if (ll_file_data_slab == NULL) return -ENOMEM; + ll_intent_slab = kmem_cache_create("lustre_intent_data", + sizeof(struct lustre_intent_data), + 0, SLAB_HWCACHE_ALIGN, NULL, + NULL); + if (ll_intent_slab == NULL) { + kmem_cache_destroy(ll_file_data_slab); + return -ENOMEM; + } + + proc_lustre_fs_root = proc_lustre_root ? 
proc_mkdir("llite", proc_lustre_root) : NULL; rc = register_filesystem(&lustre_lite_fs_type); @@ -146,6 +155,8 @@ static void __exit exit_lustre_lite(void) LASSERTF(kmem_cache_destroy(ll_file_data_slab) == 0, "couldn't destroy ll_file_data slab\n"); + LASSERTF(kmem_cache_destroy(ll_intent_slab) == 0, + "couldn't destroy ll_intent_slab slab\n"); if (proc_lustre_fs_root) { lprocfs_remove(proc_lustre_fs_root); diff --git a/lustre/llite/super25.c b/lustre/llite/super25.c index 8ebcc4b..22c165a 100644 --- a/lustre/llite/super25.c +++ b/lustre/llite/super25.c @@ -114,7 +114,7 @@ struct file_system_type lustre_lite_fs_type = { .name = "lustre_lite", .get_sb = ll_get_sb, .kill_sb = kill_anon_super, - .fs_flags = FS_BINARY_MOUNTDATA, + .fs_flags = FS_REVAL_DOT|FS_BINARY_MOUNTDATA, }; struct file_system_type lustre_fs_type = { @@ -122,7 +122,7 @@ struct file_system_type lustre_fs_type = { .name = "lustre", .get_sb = lustre_get_sb, .kill_sb = kill_anon_super, - .fs_flags = FS_BINARY_MOUNTDATA, + .fs_flags = FS_REVAL_DOT|FS_BINARY_MOUNTDATA, }; static int __init init_lustre_lite(void) @@ -143,6 +143,16 @@ static int __init init_lustre_lite(void) rc = -ENOMEM; goto out; } + ll_intent_slab = kmem_cache_create("lustre_intent_data", + sizeof(struct lustre_intent_data), + 0, SLAB_HWCACHE_ALIGN, NULL, + NULL); + if (ll_intent_slab == NULL) { + kmem_cache_destroy(ll_file_data_slab); + ll_destroy_inodecache(); + return -ENOMEM; + } + proc_lustre_fs_root = proc_lustre_root ? 
proc_mkdir("llite", proc_lustre_root) : NULL; @@ -178,9 +188,13 @@ static void __exit exit_lustre_lite(void) unregister_filesystem(&lustre_fs_type); unregister_filesystem(&lustre_lite_fs_type); ll_destroy_inodecache(); + + ll_gns_stop_thread(); LASSERTF(kmem_cache_destroy(ll_file_data_slab) == 0, "couldn't destroy ll_file_data slab\n"); + LASSERTF(kmem_cache_destroy(ll_intent_slab) == 0, + "couldn't destroy ll_intent_slab slab\n"); if (proc_lustre_fs_root) { lprocfs_remove(proc_lustre_fs_root); proc_lustre_fs_root = NULL; diff --git a/lustre/llite/symlink.c b/lustre/llite/symlink.c index f913d8a..6061f74 100644 --- a/lustre/llite/symlink.c +++ b/lustre/llite/symlink.c @@ -49,8 +49,9 @@ static int ll_readlink_internal(struct inode *inode, } ll_inode2id(&id, inode); - rc = md_getattr(sbi->ll_md_exp, &id, OBD_MD_LINKNAME, symlen, + rc = md_getattr(sbi->ll_md_exp, &id, OBD_MD_LINKNAME, NULL, 0, symlen, request); + if (rc) { if (rc != -ENOENT) CERROR("inode %lu: rc = %d\n", inode->i_ino, rc); @@ -152,11 +153,14 @@ static int ll_follow_link(struct dentry *dentry, struct nameidata *nd) struct inode_operations ll_fast_symlink_inode_operations = { .readlink = ll_readlink, .setattr = ll_setattr, - .setattr_raw = ll_setattr_raw, .follow_link = ll_follow_link, + .setxattr = ll_setxattr, + .getxattr = ll_getxattr, + .listxattr = ll_listxattr, + .removexattr = ll_removexattr, #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) .revalidate_it = ll_inode_revalidate_it #else - .getattr_it = ll_getattr + .getattr = ll_getattr #endif }; diff --git a/lustre/lmv/lmv_intent.c b/lustre/lmv/lmv_intent.c index ce3d0f0..205d4a7 100644 --- a/lustre/lmv/lmv_intent.c +++ b/lustre/lmv/lmv_intent.c @@ -46,14 +46,16 @@ #include #include #include +#include +#include #include "lmv_internal.h" static inline void lmv_drop_intent_lock(struct lookup_intent *it) { - if (it->d.lustre.it_lock_mode != 0) - ldlm_lock_decref((void *)&it->d.lustre.it_lock_handle, - it->d.lustre.it_lock_mode); + if 
(LUSTRE_IT(it)->it_lock_mode != 0) + ldlm_lock_decref((void *)&LUSTRE_IT(it)->it_lock_handle, + LUSTRE_IT(it)->it_lock_mode); } int lmv_handle_remote_inode(struct obd_export *exp, void *lmm, @@ -89,17 +91,17 @@ int lmv_handle_remote_inode(struct obd_export *exp, void *lmm, } /* we got LOOKUP lock, but we really need attrs */ - pmode = it->d.lustre.it_lock_mode; + pmode = LUSTRE_IT(it)->it_lock_mode; if (pmode) { - memcpy(&plock, &it->d.lustre.it_lock_handle, + memcpy(&plock, &LUSTRE_IT(it)->it_lock_handle, sizeof(plock)); - it->d.lustre.it_lock_mode = 0; + LUSTRE_IT(it)->it_lock_mode = 0; } LASSERT((body->valid & OBD_MD_FID) != 0); nid = body->id1; - it->d.lustre.it_disposition &= ~DISP_ENQ_COMPLETE; + LUSTRE_IT(it)->it_disposition &= ~DISP_ENQ_COMPLETE; rc = md_intent_lock(lmv->tgts[id_group(&nid)].ltd_exp, &nid, NULL, 0, lmm, lmmsize, NULL, it, flags, &req, cb_blocking); @@ -110,9 +112,9 @@ int lmv_handle_remote_inode(struct obd_export *exp, void *lmm, */ if (rc == 0) { lmv_drop_intent_lock(it); - memcpy(&it->d.lustre.it_lock_handle, &plock, + memcpy(&LUSTRE_IT(it)->it_lock_handle, &plock, sizeof(plock)); - it->d.lustre.it_lock_mode = pmode; + LUSTRE_IT(it)->it_lock_mode = pmode; } else if (pmode) ldlm_lock_decref(&plock, pmode); @@ -194,7 +196,7 @@ repeat: * nothing is found, do not access body->id1 as it is zero and thus * pointless. */ - if (it->d.lustre.it_disposition & DISP_LOOKUP_NEG) + if (LUSTRE_IT(it)->it_disposition & DISP_LOOKUP_NEG) RETURN(0); /* caller may use attrs MDS returns on IT_OPEN lock request so, we have @@ -317,7 +319,7 @@ int lmv_intent_getattr(struct obd_export *exp, struct lustre_id *pid, * nothing is found, do not access body->id1 as it is zero and thus * pointless. 
*/ - if (it->d.lustre.it_disposition & DISP_LOOKUP_NEG) + if (LUSTRE_IT(it)->it_disposition & DISP_LOOKUP_NEG) RETURN(0); body = lustre_msg_buf((*reqp)->rq_repmsg, 1, sizeof(*body)); @@ -406,11 +408,13 @@ int lmv_lookup_slaves(struct obd_export *exp, struct ptlrpc_request **reqp) /* is obj valid? */ memset(&it, 0, sizeof(it)); it.it_op = IT_GETATTR; + OBD_ALLOC(it.d.fs_data, sizeof(struct lustre_intent_data)); + rc = md_intent_lock(lmv->tgts[id_group(&id)].ltd_exp, &id, NULL, 0, NULL, 0, &id, &it, 0, &req, lmv_dirobj_blocking_ast); - lockh = (struct lustre_handle *)&it.d.lustre.it_lock_handle; + lockh = (struct lustre_handle *)&LUSTRE_IT(&it)->it_lock_handle; if (rc > 0 && req == NULL) { /* nice, this slave is valid */ LASSERT(req == NULL); @@ -418,10 +422,11 @@ int lmv_lookup_slaves(struct obd_export *exp, struct ptlrpc_request **reqp) goto release_lock; } - if (rc < 0) + if (rc < 0) { + OBD_FREE(it.d.fs_data, sizeof(struct lustre_intent_data)); /* error during lookup */ GOTO(cleanup, rc); - + } lock = ldlm_handle2lock(lockh); LASSERT(lock); @@ -442,8 +447,9 @@ int lmv_lookup_slaves(struct obd_export *exp, struct ptlrpc_request **reqp) release_lock: lmv_update_body_from_obj(body, obj->objs + i); - if (it.d.lustre.it_lock_mode) - ldlm_lock_decref(lockh, it.d.lustre.it_lock_mode); + if (LUSTRE_IT(&it)->it_lock_mode) + ldlm_lock_decref(lockh, LUSTRE_IT(&it)->it_lock_mode); + OBD_FREE(it.d.fs_data, sizeof(struct lustre_intent_data)); } EXIT; @@ -655,8 +661,10 @@ int lmv_revalidate_slaves(struct obd_export *exp, struct ptlrpc_request **reqp, memset(&it, 0, sizeof(it)); it.it_op = IT_GETATTR; + cb = lmv_dirobj_blocking_ast; + OBD_ALLOC(it.d.fs_data, sizeof(struct lustre_intent_data)); if (id_equal_fid(&id, &obj->id)) { if (master_valid) { /* lmv_intent_getattr() already checked @@ -678,11 +686,12 @@ int lmv_revalidate_slaves(struct obd_export *exp, struct ptlrpc_request **reqp, cb = cb_blocking; } + /* is obj valid? 
*/ rc = md_intent_lock(lmv->tgts[id_group(&id)].ltd_exp, &id, NULL, 0, NULL, 0, &id, &it, 0, &req, cb); - lockh = (struct lustre_handle *) &it.d.lustre.it_lock_handle; + lockh = (struct lustre_handle *) &LUSTRE_IT(&it)->it_lock_handle; if (rc > 0 && req == NULL) { /* nice, this slave is valid */ LASSERT(req == NULL); @@ -690,17 +699,18 @@ int lmv_revalidate_slaves(struct obd_export *exp, struct ptlrpc_request **reqp, goto release_lock; } - if (rc < 0) + if (rc < 0) { + OBD_FREE(it.d.fs_data, sizeof(struct lustre_intent_data)); /* error during revalidation */ GOTO(cleanup, rc); - + } if (master) { LASSERT(master_valid == 0); /* save lock on master to be returned to the caller */ CDEBUG(D_OTHER, "no lock on master yet\n"); memcpy(&master_lockh, lockh, sizeof(master_lockh)); - master_lock_mode = it.d.lustre.it_lock_mode; - it.d.lustre.it_lock_mode = 0; + master_lock_mode = LUSTRE_IT(&it)->it_lock_mode; + LUSTRE_IT(&it)->it_lock_mode = 0; } else { /* this is slave. we want to control it */ lock = ldlm_handle2lock(lockh); @@ -726,14 +736,15 @@ update: CDEBUG(D_OTHER, "fresh: %lu\n", (unsigned long)obj->objs[i].size); - + if (req) ptlrpc_req_finished(req); release_lock: size += obj->objs[i].size; - if (it.d.lustre.it_lock_mode) - ldlm_lock_decref(lockh, it.d.lustre.it_lock_mode); + if (LUSTRE_IT(&it)->it_lock_mode) + ldlm_lock_decref(lockh, LUSTRE_IT(&it)->it_lock_mode); + OBD_FREE(it.d.fs_data, sizeof(struct lustre_intent_data)); } if (*reqp) { @@ -757,16 +768,16 @@ release_lock: // body->mds = id_group(&obj->id); } if (master_valid == 0) { - memcpy(&oit->d.lustre.it_lock_handle, + memcpy(&LUSTRE_IT(oit)->it_lock_handle, &master_lockh, sizeof(master_lockh)); - oit->d.lustre.it_lock_mode = master_lock_mode; + LUSTRE_IT(oit)->it_lock_mode = master_lock_mode; } rc = 0; } else { /* it seems all the attrs are fresh and we did no request */ CDEBUG(D_OTHER, "all the attrs were fresh\n"); if (master_valid == 0) - oit->d.lustre.it_lock_mode = master_lock_mode; + 
LUSTRE_IT(oit)->it_lock_mode = master_lock_mode; rc = 1; } diff --git a/lustre/lmv/lmv_obd.c b/lustre/lmv/lmv_obd.c index 7aac1f0..86b1f97 100644 --- a/lustre/lmv/lmv_obd.c +++ b/lustre/lmv/lmv_obd.c @@ -31,6 +31,7 @@ #include #include #include +#include #else #include #endif @@ -47,6 +48,7 @@ #include #include #include +#include #include "lmv_internal.h" /* object cache. */ @@ -675,8 +677,8 @@ static int lmv_getstatus(struct obd_export *exp, struct lustre_id *id) } static int lmv_getattr(struct obd_export *exp, struct lustre_id *id, - __u64 valid, unsigned int ea_size, - struct ptlrpc_request **request) + __u64 valid, const char *ea_name, int ea_namelen, + unsigned int ea_size, struct ptlrpc_request **request) { struct obd_device *obd = exp->exp_obd; struct lmv_obd *lmv = &obd->u.lmv; @@ -690,8 +692,9 @@ static int lmv_getattr(struct obd_export *exp, struct lustre_id *id, LASSERT(i < lmv->desc.ld_tgt_count); + rc = md_getattr(lmv->tgts[i].ltd_exp, id, valid, - ea_size, request); + ea_name, ea_namelen, ea_size, request); if (rc) RETURN(rc); @@ -860,7 +863,7 @@ int lmv_get_mea_and_update_object(struct obd_export *exp, /* time to update mea of parent id */ rc = md_getattr(lmv->tgts[id_group(id)].ltd_exp, - id, valid, mealen, &req); + id, valid, NULL, 0, mealen, &req); if (rc) { CERROR("md_getattr() failed, error %d\n", rc); GOTO(cleanup, rc); @@ -994,17 +997,17 @@ int lmv_enqueue_slaves(struct obd_export *exp, int locktype, cb_compl, cb_blocking, cb_data); CDEBUG(D_OTHER, "take lock on slave "DLID4" -> %d/%d\n", - OLID4(&mea->mea_ids[i]), rc, it->d.lustre.it_status); + OLID4(&mea->mea_ids[i]), rc, LUSTRE_IT(it)->it_status); if (rc) GOTO(cleanup, rc); - if (it->d.lustre.it_data) { + if (LUSTRE_IT(it)->it_data) { struct ptlrpc_request *req; - req = (struct ptlrpc_request *)it->d.lustre.it_data; + req = (struct ptlrpc_request *) LUSTRE_IT(it)->it_data; ptlrpc_req_finished(req); } - if (it->d.lustre.it_status) - GOTO(cleanup, rc = it->d.lustre.it_status); + if 
(LUSTRE_IT(it)->it_status) + GOTO(cleanup, rc = LUSTRE_IT(it)->it_status); } OBD_FREE(data2, sizeof(*data2)); @@ -1827,7 +1830,46 @@ int lmv_set_info(struct obd_export *exp, obd_count keylen, lmv_set_timeouts(obd); RETURN(0); } - + + /* maybe this could be default */ + if ((keylen == strlen("sec") && strcmp(key, "sec") == 0) || + (keylen == strlen("nllu") && strcmp(key, "nllu") == 0)) { + struct lmv_tgt_desc *tgt; + struct obd_export *exp; + int rc = 0, err, i; + + spin_lock(&lmv->lmv_lock); + for (i = 0, tgt = lmv->tgts; i < lmv->desc.ld_tgt_count; + i++, tgt++) { + exp = tgt->ltd_exp; + /* during setup time the connections to mdc might + * haven't been established. + */ + if (exp == NULL) { + struct obd_device *tgt_obd; + + tgt_obd = class_find_client_obd(&tgt->uuid, + LUSTRE_MDC_NAME, + &obd->obd_uuid); + if (!tgt_obd) { + CERROR("can't set info %s, " + "device %s not attached?\n", + (char *) key, tgt->uuid.uuid); + rc = -EINVAL; + continue; + } + exp = tgt_obd->obd_self_export; + } + + err = obd_set_info(exp, keylen, key, vallen, val); + if (!rc) + rc = err; + } + spin_unlock(&lmv->lmv_lock); + + RETURN(rc); + } + RETURN(-EINVAL); } diff --git a/lustre/lmv/lmv_objmgr.c b/lustre/lmv/lmv_objmgr.c index 447320c..4c2ef10 100644 --- a/lustre/lmv/lmv_objmgr.c +++ b/lustre/lmv/lmv_objmgr.c @@ -310,7 +310,7 @@ lmv_create_obj(struct obd_export *exp, struct lustre_id *id, struct mea *mea) valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA; rc = md_getattr(lmv->tgts[id_group(id)].ltd_exp, - id, valid, mealen, &req); + id, valid, NULL, 0, mealen, &req); if (rc) { CERROR("md_getattr() failed, error %d\n", rc); GOTO(cleanup, obj = ERR_PTR(rc)); diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c index 18aa9e4..7246a1d 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -3116,6 +3116,41 @@ static int lov_set_info(struct obd_export *exp, obd_count keylen, } else if (KEY_IS("unlinked") || KEY_IS("unrecovery")) { if (vallen != 0) RETURN(-EINVAL); + } else if 
(KEY_IS("sec")) { + struct lov_tgt_desc *tgt; + struct obd_export *exp; + int rc = 0, err, i; + + spin_lock(&lov->lov_lock); + for (i = 0, tgt = lov->tgts; i < lov->desc.ld_tgt_count; + i++, tgt++) { + exp = tgt->ltd_exp; + /* during setup time the connections to osc might + * haven't been established. + */ + if (exp == NULL) { + struct obd_device *tgt_obd; + + tgt_obd = class_find_client_obd(&tgt->uuid, + LUSTRE_OSC_NAME, + &obddev->obd_uuid); + if (!tgt_obd) { + CERROR("can't set security flavor, " + "device %s not attached?\n", + tgt->uuid.uuid); + rc = -EINVAL; + continue; + } + exp = tgt_obd->obd_self_export; + } + + err = obd_set_info(exp, keylen, key, vallen, val); + if (!rc) + rc = err; + } + spin_unlock(&lov->lov_lock); + + RETURN(rc); } else { RETURN(-EINVAL); } diff --git a/lustre/lvfs/lvfs_reint.c b/lustre/lvfs/lvfs_reint.c index 0bf6444..fbcf400 100644 --- a/lustre/lvfs/lvfs_reint.c +++ b/lustre/lvfs/lvfs_reint.c @@ -139,7 +139,7 @@ static int lvfs_reint_create(struct super_block *sb, struct reint_record *r_rec) handle = fsfilt->fs_start(dir, FSFILT_OP_SYMLINK, NULL, 0); if (IS_ERR(handle)) GOTO(cleanup, rc = PTR_ERR(handle)); - rc = ll_vfs_symlink(dir, dentry, new_path); + rc = ll_vfs_symlink(dir, dentry, new_path, S_IALLUGO); break; } case S_IFCHR: diff --git a/lustre/mdc/autoMakefile.am b/lustre/mdc/autoMakefile.am index e46e120..4a71d24 100644 --- a/lustre/mdc/autoMakefile.am +++ b/lustre/mdc/autoMakefile.am @@ -5,7 +5,7 @@ if LIBLUSTRE noinst_LIBRARIES = libmdc.a -libmdc_a_SOURCES = mdc_request.c mdc_reint.c mdc_lib.c mdc_internal.h mdc_locks.c +libmdc_a_SOURCES = #mdc_request.c mdc_reint.c mdc_lib.c mdc_internal.h mdc_locks.c libmdc_a_CPPFLAGS = $(LLCPPFLAGS) libmdc_a_CFLAGS = $(LLCFLAGS) endif diff --git a/lustre/mdc/mdc_locks.c b/lustre/mdc/mdc_locks.c index 2478afc..e3bda59 100644 --- a/lustre/mdc/mdc_locks.c +++ b/lustre/mdc/mdc_locks.c @@ -37,17 +37,19 @@ #include #include #include +#include +#include #include "mdc_internal.h" int 
it_disposition(struct lookup_intent *it, int flag) { - return it->d.lustre.it_disposition & flag; + return LUSTRE_IT(it)->it_disposition & flag; } EXPORT_SYMBOL(it_disposition); void it_set_disposition(struct lookup_intent *it, int flag) { - it->d.lustre.it_disposition |= flag; + LUSTRE_IT(it)->it_disposition |= flag; } EXPORT_SYMBOL(it_set_disposition); @@ -88,33 +90,33 @@ int it_open_error(int phase, struct lookup_intent *it) { if (it_disposition(it, DISP_OPEN_OPEN)) { if (phase == DISP_OPEN_OPEN) - return it->d.lustre.it_status; + return LUSTRE_IT(it)->it_status; else return 0; } if (it_disposition(it, DISP_OPEN_CREATE)) { if (phase == DISP_OPEN_CREATE) - return it->d.lustre.it_status; + return LUSTRE_IT(it)->it_status; else return 0; } if (it_disposition(it, DISP_LOOKUP_EXECD)) { if (phase == DISP_LOOKUP_EXECD) - return it->d.lustre.it_status; + return LUSTRE_IT(it)->it_status; else return 0; } if (it_disposition(it, DISP_IT_EXECD)) { if (phase == DISP_IT_EXECD) - return it->d.lustre.it_status; + return LUSTRE_IT(it)->it_status; else return 0; } - CERROR("it disp: %X, status: %d\n", it->d.lustre.it_disposition, - it->d.lustre.it_status); + CERROR("it disp: %X, status: %d\n", LUSTRE_IT(it)->it_disposition, + LUSTRE_IT(it)->it_status); LBUG(); return 0; } @@ -199,10 +201,9 @@ int mdc_enqueue(struct obd_export *exp, int reqsize[6] = {[MDS_REQ_SECDESC_OFF] = 0, [MDS_REQ_INTENT_LOCKREQ_OFF] = sizeof(*lockreq), [MDS_REQ_INTENT_IT_OFF] = sizeof(*lit)}; - int repsize[4] = {sizeof(struct ldlm_reply), + int repsize[5] = {sizeof(struct ldlm_reply), sizeof(struct mds_body), - obddev->u.cli.cl_max_mds_easize, - obddev->u.cli.cl_max_mds_cookiesize}; + obddev->u.cli.cl_max_mds_easize}; int req_buffers = 3, reply_buffers = 0; int rc, flags = LDLM_FL_HAS_INTENT; void *eadata; @@ -240,10 +241,13 @@ int mdc_enqueue(struct obd_export *exp, it->it_create_mode, 0, it->it_flags, lmm, lmmsize); /* get ready for the reply */ - reply_buffers = 3; - req->rq_replen = lustre_msg_size(3, 
repsize); + repsize[3] = 4; + repsize[4] = xattr_acl_size(LL_ACL_MAX_ENTRIES); + reply_buffers = 5; + req->rq_replen = lustre_msg_size(5, repsize); } else if (it->it_op & (IT_GETATTR | IT_LOOKUP | IT_CHDIR)) { - __u64 valid = data->valid | OBD_MD_FLNOTOBD | OBD_MD_FLEASIZE; + __u64 valid = data->valid | OBD_MD_FLNOTOBD | OBD_MD_FLEASIZE | + OBD_MD_FLACL_ACCESS; reqsize[req_buffers++] = sizeof(struct mds_body); reqsize[req_buffers++] = data->namelen + 1; @@ -267,8 +271,10 @@ int mdc_enqueue(struct obd_export *exp, valid, it->it_flags, data); /* get ready for the reply */ - reply_buffers = 3; - req->rq_replen = lustre_msg_size(3, repsize); + repsize[3] = 4; + repsize[4] = xattr_acl_size(LL_ACL_MAX_ENTRIES); + reply_buffers = 5; + req->rq_replen = lustre_msg_size(5, repsize); } else if (it->it_op == IT_READDIR) { policy.l_inodebits.bits = MDS_INODELOCK_UPDATE; req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_DLM_VERSION, @@ -353,12 +359,12 @@ int mdc_enqueue(struct obd_export *exp, LASSERT(dlm_rep != NULL); /* checked by ldlm_cli_enqueue() */ LASSERT_REPSWABBED(req, 0); /* swabbed by ldlm_cli_enqueue() */ - it->d.lustre.it_disposition = (int) dlm_rep->lock_policy_res1; - it->d.lustre.it_status = (int) dlm_rep->lock_policy_res2; - it->d.lustre.it_lock_mode = lock_mode; - it->d.lustre.it_data = req; + LUSTRE_IT(it)->it_disposition = (int) dlm_rep->lock_policy_res1; + LUSTRE_IT(it)->it_status = (int) dlm_rep->lock_policy_res2; + LUSTRE_IT(it)->it_lock_mode = lock_mode; + LUSTRE_IT(it)->it_data = req; - if (it->d.lustre.it_status < 0 && req->rq_replay) { + if (LUSTRE_IT(it)->it_status < 0 && req->rq_replay) { LASSERT(req->rq_transno == 0); /* Don't hold error requests for replay. 
*/ spin_lock(&req->rq_lock); @@ -367,10 +373,11 @@ int mdc_enqueue(struct obd_export *exp, } DEBUG_REQ(D_RPCTRACE, req, "disposition: %x, status: %d", - it->d.lustre.it_disposition, it->d.lustre.it_status); + LUSTRE_IT(it)->it_disposition, LUSTRE_IT(it)->it_status); /* We know what to expect, so we do any byte flipping required here */ - LASSERT(reply_buffers == 4 || reply_buffers == 3 || reply_buffers == 1); + LASSERT(reply_buffers == 5 || reply_buffers == 4 || + reply_buffers == 3 || reply_buffers == 1); if (reply_buffers >= 3) { struct mds_body *body; @@ -427,15 +434,15 @@ EXPORT_SYMBOL(mdc_enqueue); * ll_create/ll_open gets called. * * The server will return to us, in it_disposition, an indication of - * exactly what d.lustre.it_status refers to. + * exactly what d.lustre->it_status refers to. * - * If DISP_OPEN_OPEN is set, then d.lustre.it_status refers to the open() call, + * If DISP_OPEN_OPEN is set, then d.lustre->it_status refers to the open() call, * otherwise if DISP_OPEN_CREATE is set, then it status is the * creation failure mode. In either case, one of DISP_LOOKUP_NEG or * DISP_LOOKUP_POS will be set, indicating whether the child lookup * was successful. * - * Else, if DISP_LOOKUP_EXECD then d.lustre.it_status is the rc of the + * Else, if DISP_LOOKUP_EXECD then d.lustre->it_status is the rc of the * child lookup. 
*/ int mdc_intent_lock(struct obd_export *exp, struct lustre_id *pid, @@ -486,9 +493,9 @@ int mdc_intent_lock(struct obd_export *exp, struct lustre_id *pid, &lockh); } if (rc) { - memcpy(&it->d.lustre.it_lock_handle, &lockh, + memcpy(&LUSTRE_IT(it)->it_lock_handle, &lockh, sizeof(lockh)); - it->d.lustre.it_lock_mode = mode; + LUSTRE_IT(it)->it_lock_mode = mode; } /* Only return failure if it was not GETATTR by cid (from @@ -524,9 +531,9 @@ int mdc_intent_lock(struct obd_export *exp, struct lustre_id *pid, if (rc < 0) RETURN(rc); - memcpy(&it->d.lustre.it_lock_handle, &lockh, sizeof(lockh)); + memcpy(&LUSTRE_IT(it)->it_lock_handle, &lockh, sizeof(lockh)); } - request = *reqp = it->d.lustre.it_data; + request = *reqp = LUSTRE_IT(it)->it_data; LASSERT(request != NULL); /* If we're doing an IT_OPEN which did not result in an actual @@ -538,7 +545,7 @@ int mdc_intent_lock(struct obd_export *exp, struct lustre_id *pid, * 3440) */ if (it->it_op & IT_OPEN) { if (!it_disposition(it, DISP_OPEN_OPEN) || - it->d.lustre.it_status != 0) { + LUSTRE_IT(it)->it_status != 0) { unsigned long irqflags; spin_lock_irqsave(&request->rq_lock, irqflags); @@ -549,8 +556,8 @@ int mdc_intent_lock(struct obd_export *exp, struct lustre_id *pid, if (!it_disposition(it, DISP_IT_EXECD)) { /* The server failed before it even started executing the * intent, i.e. because it couldn't unpack the request. 
*/ - LASSERT(it->d.lustre.it_status != 0); - RETURN(it->d.lustre.it_status); + LASSERT(LUSTRE_IT(it)->it_status != 0); + RETURN(LUSTRE_IT(it)->it_status); } rc = it_open_error(DISP_IT_EXECD, it); if (rc) @@ -620,15 +627,15 @@ int mdc_intent_lock(struct obd_export *exp, struct lustre_id *pid, if (ldlm_lock_match(NULL, LDLM_FL_BLOCK_GRANTED, NULL, LDLM_IBITS, &policy, LCK_NL, &old_lock)) { ldlm_lock_decref_and_cancel(&lockh, - it->d.lustre.it_lock_mode); + LUSTRE_IT(it)->it_lock_mode); memcpy(&lockh, &old_lock, sizeof(old_lock)); - memcpy(&it->d.lustre.it_lock_handle, &lockh, + memcpy(&LUSTRE_IT(it)->it_lock_handle, &lockh, sizeof(lockh)); } } CDEBUG(D_DENTRY, "D_IT dentry %*s intent: %s status %d disp %x rc %d\n", - len, name, ldlm_it2str(it->it_op), it->d.lustre.it_status, - it->d.lustre.it_disposition, rc); + len, name, ldlm_it2str(it->it_op), LUSTRE_IT(it)->it_status, + LUSTRE_IT(it)->it_disposition, rc); RETURN(rc); } diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c index bd7af6a..6cf6e08 100644 --- a/lustre/mdc/mdc_request.c +++ b/lustre/mdc/mdc_request.c @@ -36,7 +36,9 @@ #include #include #include +#include #include +#include #include "mdc_internal.h" #define REQUEST_MINOR 244 @@ -159,10 +161,10 @@ int mdc_getstatus(struct obd_export *exp, struct lustre_id *rootid) int mdc_getattr_common(struct obd_export *exp, unsigned int ea_size, struct ptlrpc_request *req) { - struct mds_body *body; + struct mds_body *body, *reqbody; void *eadata; int rc; - int repsize[2] = {sizeof(*body), 0}; + int repsize[4] = {sizeof(*body)}; int bufcount = 1; ENTRY; @@ -173,6 +175,14 @@ int mdc_getattr_common(struct obd_export *exp, unsigned int ea_size, CDEBUG(D_INODE, "reserved %u bytes for MD/symlink in packet\n", ea_size); } + + reqbody = lustre_msg_buf(req->rq_reqmsg, 1, sizeof(*reqbody)); + + if (reqbody->valid & OBD_MD_FLACL_ACCESS) { + repsize[bufcount++] = 4; + repsize[bufcount++] = xattr_acl_size(LL_ACL_MAX_ENTRIES); + } + req->rq_replen = 
lustre_msg_size(bufcount, repsize); mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL); @@ -191,25 +201,32 @@ int mdc_getattr_common(struct obd_export *exp, unsigned int ea_size, CDEBUG(D_NET, "mode: %o\n", body->mode); LASSERT_REPSWAB (req, 1); - if (body->eadatasize != 0) { + + /* Skip the check if getxattr/listxattr are called with no buffers */ + if ((reqbody->valid & (OBD_MD_FLEA | OBD_MD_FLEALIST)) && + (reqbody->eadatasize != 0)){ + if (body->eadatasize != 0) { /* reply indicates presence of eadata; check it's there... */ - eadata = lustre_msg_buf (req->rq_repmsg, 1, body->eadatasize); - if (eadata == NULL) { - CERROR ("Missing/short eadata\n"); - RETURN (-EPROTO); - } - } + eadata = lustre_msg_buf (req->rq_repmsg, 1, + body->eadatasize); + if (eadata == NULL) { + CERROR ("Missing/short eadata\n"); + RETURN (-EPROTO); + } + } + } RETURN (0); } int mdc_getattr(struct obd_export *exp, struct lustre_id *id, - __u64 valid, unsigned int ea_size, - struct ptlrpc_request **request) + __u64 valid, const char *ea_name, int ea_namelen, + unsigned int ea_size, struct ptlrpc_request **request) { struct ptlrpc_request *req; struct mds_body *body; - int size[2] = {0, sizeof(*body)}; + int bufcount = 2; + int size[3] = {0, sizeof(*body)}; int rc; ENTRY; @@ -218,8 +235,14 @@ int mdc_getattr(struct obd_export *exp, struct lustre_id *id, */ size[0] = mdc_get_secdesc_size(); + LASSERT((ea_name != NULL) == (ea_namelen != 0)); + if (valid & (OBD_MD_FLEA | OBD_MD_FLEALIST)) { + size[bufcount] = ea_namelen; + bufcount++; + } + req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION, - MDS_GETATTR, 2, size, NULL); + MDS_GETATTR, bufcount, size, NULL); if (!req) GOTO(out, rc = -ENOMEM); @@ -230,6 +253,13 @@ int mdc_getattr(struct obd_export *exp, struct lustre_id *id, body->valid = valid; body->eadatasize = ea_size; + + if (valid & OBD_MD_FLEA) { + LASSERT(strnlen(ea_name, ea_namelen) == (ea_namelen - 1)); + memcpy(lustre_msg_buf(req->rq_reqmsg, 2, ea_namelen), + 
ea_name, ea_namelen); + } + rc = mdc_getattr_common(exp, ea_size, req); if (rc != 0) { ptlrpc_req_finished (req); @@ -304,6 +334,9 @@ int mdc_req2lustre_md(struct obd_export *exp_lmv, struct ptlrpc_request *req, unsigned int offset, struct obd_export *exp_lov, struct lustre_md *md) { + void *buf; + int size, acl_off; + struct posix_acl *acl; int rc = 0; ENTRY; @@ -378,8 +411,38 @@ int mdc_req2lustre_md(struct obd_export *exp_lmv, struct ptlrpc_request *req, CERROR("Detected invalid mea, which does not " "support neither old either new format.\n"); } else { - LASSERT(0); + LASSERT(S_ISCHR(md->body->mode) || + S_ISBLK(md->body->mode) || + S_ISFIFO(md->body->mode)|| + S_ISLNK(md->body->mode) || + S_ISSOCK(md->body->mode)); } + + acl_off = (md->body->valid & OBD_MD_FLEASIZE) ? (offset + 2) : + (offset + 1); + + if (md->body->valid & OBD_MD_FLACL_ACCESS) { + size = le32_to_cpu(*(__u32 *) lustre_msg_buf(req->rq_repmsg, + acl_off, 4)); + buf = lustre_msg_buf(req->rq_repmsg, acl_off + 1, size); + + acl = posix_acl_from_xattr(buf, size); + if (IS_ERR(acl)) { + rc = PTR_ERR(acl); + CERROR("convert xattr to acl failed: %d\n", rc); + RETURN(rc); + } else if (acl) { + rc = posix_acl_valid(acl); + if (rc) { + CERROR("acl valid error: %d\n", rc); + posix_acl_release(acl); + RETURN(rc); + } + } + + md->acl_access = acl; + } + RETURN(rc); } @@ -844,7 +907,38 @@ int mdc_set_info(struct obd_export *exp, obd_count keylen, imp->imp_server_timeout = 1; CDEBUG(D_OTHER, "%s: timeout / 2\n", exp->exp_obd->obd_name); RETURN(0); + } else if (keylen == strlen("sec") && memcmp(key, "sec", keylen) == 0) { + struct client_obd *cli = &exp->exp_obd->u.cli; + + if (vallen == strlen("null") && + memcmp(val, "null", vallen) == 0) { + cli->cl_sec_flavor = PTLRPC_SEC_NULL; + cli->cl_sec_subflavor = 0; + RETURN(0); + } + if (vallen == strlen("krb5i") && + memcmp(val, "krb5i", vallen) == 0) { + cli->cl_sec_flavor = PTLRPC_SEC_GSS; + cli->cl_sec_subflavor = PTLRPC_SEC_GSS_KRB5I; + RETURN(0); + } + if 
(vallen == strlen("krb5p") && + memcmp(val, "krb5p", vallen) == 0) { + cli->cl_sec_flavor = PTLRPC_SEC_GSS; + cli->cl_sec_subflavor = PTLRPC_SEC_GSS_KRB5P; + RETURN(0); + } + CERROR("unrecognized security type %s\n", (char*) val); + rc = -EINVAL; + } else if (keylen == strlen("nllu") && memcmp(key, "nllu", keylen) == 0) { + struct client_obd *cli = &exp->exp_obd->u.cli; + + LASSERT(vallen == sizeof(__u32) * 2); + cli->cl_nllu = ((__u32 *) val)[0]; + cli->cl_nllg = ((__u32 *) val)[1]; + RETURN(0); } + RETURN(rc); } diff --git a/lustre/mds/Makefile.in b/lustre/mds/Makefile.in index df14327..98f9e75 100644 --- a/lustre/mds/Makefile.in +++ b/lustre/mds/Makefile.in @@ -1,5 +1,5 @@ MODULES := mds mds-objs := mds_log.o mds_unlink_open.o mds_lov.o handler.o mds_reint.o -mds-objs += mds_fs.o lproc_mds.o mds_open.o mds_lib.o mds_lmv.o mds_groups.o +mds-objs += mds_fs.o lproc_mds.o mds_open.o mds_lib.o mds_lmv.o mds_lsd.o @INCLUDE_RULES@ diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index d328fd1..2b2c223 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) # include @@ -55,6 +56,7 @@ #include #include +#include #include "mds_internal.h" static int mds_intent_policy(struct ldlm_namespace *ns, @@ -720,33 +722,150 @@ int mds_pack_md(struct obd_device *obd, struct lustre_msg *msg, int offset, RETURN(rc); } +int mds_pack_link(struct dentry *dentry, struct ptlrpc_request *req, + struct mds_body *repbody, int reply_off) +{ + struct inode *inode = dentry->d_inode; + char *symname; + int len, rc; + ENTRY; + + symname = lustre_msg_buf(req->rq_repmsg, reply_off + 1,0); + LASSERT(symname != NULL); + len = req->rq_repmsg->buflens[reply_off + 1]; + + rc = inode->i_op->readlink(dentry, symname, len); + if (rc < 0) { + CERROR("readlink failed: %d\n", rc); + } else if (rc != len - 1) { + CERROR ("Unexpected readlink rc %d: expecting %d\n", + rc, len - 1); + rc 
= -EINVAL; + } else { + CDEBUG(D_INODE, "read symlink dest %s\n", symname); + repbody->valid |= OBD_MD_LINKNAME; + repbody->eadatasize = rc + 1; + symname[rc] = 0; /* NULL terminate */ + rc = 0; + } + + RETURN(rc); +} + +int mds_pack_ea(struct dentry *dentry, struct ptlrpc_request *req, + struct mds_body *repbody, int req_off, int reply_off) +{ + struct inode *inode = dentry->d_inode; + char *ea_name; + void *value = NULL; + int len, rc; + ENTRY; + + ea_name = lustre_msg_string(req->rq_reqmsg, req_off + 1, 0); + len = req->rq_repmsg->buflens[reply_off + 1]; + if (len != 0) + value = lustre_msg_buf(req->rq_repmsg, reply_off + 1, len); + + rc = -EOPNOTSUPP; + if (inode->i_op && inode->i_op->getxattr) + rc = inode->i_op->getxattr(dentry, ea_name, value, len); + if (rc < 0) { + if (rc != -ENODATA && rc != -EOPNOTSUPP) + CERROR("getxattr failed: %d", rc); + } else { + repbody->valid |= OBD_MD_FLEA; + repbody->eadatasize = rc; + rc = 0; + } + + RETURN(rc); +} + +int mds_pack_ealist(struct dentry *dentry, struct ptlrpc_request *req, + struct mds_body *repbody, int reply_off) +{ + struct inode *inode = dentry->d_inode; + void *value = NULL; + int len, rc; + ENTRY; + + len = req->rq_repmsg->buflens[reply_off + 1]; + if (len != 0) + value = lustre_msg_buf(req->rq_repmsg, reply_off + 1, len); + + rc = -EOPNOTSUPP; + if (inode->i_op && inode->i_op->getxattr) + rc = inode->i_op->listxattr(dentry, value, len); + + if (rc < 0) { + CERROR("listxattr failed: %d", rc); + } else { + repbody->valid |= OBD_MD_FLEALIST; + repbody->eadatasize = rc; + rc = 0; + } + RETURN(rc); +} + +int mds_pack_acl(struct obd_device *obd, struct lustre_msg *repmsg, int offset, + struct mds_body *body, struct inode *inode) +{ + struct dentry de = { .d_inode = inode }; + void *buf; + __u32 buflen, *sizep, size; + ENTRY; + + if (!inode->i_op->getxattr) + RETURN(0); + + buflen = repmsg->buflens[offset + 1]; + buf = lustre_msg_buf(repmsg, offset + 1, buflen); + + size = inode->i_op->getxattr(&de, 
XATTR_NAME_ACL_ACCESS, buf, buflen); + if (size == -ENODATA) + RETURN(0); + if (size < 0) + RETURN(size); + LASSERT(size); + + sizep = lustre_msg_buf(repmsg, offset, 4); + if (!sizep) { + CERROR("can't locate returned acl size buf\n"); + RETURN(-EPROTO); + } + + *sizep = cpu_to_le32(size); + body->valid |= OBD_MD_FLACL_ACCESS; + + RETURN(0); +} + +/* + * we only take care of fsuid/fsgid. + */ void mds_squash_root(struct mds_obd *mds, struct mds_req_sec_desc *rsd, ptl_nid_t *peernid) { - if (!mds->mds_squash_uid || - (rsd->rsd_uid && rsd->rsd_fsuid)) + if (!mds->mds_squash_uid || rsd->rsd_fsuid) return; if (*peernid == mds->mds_nosquash_nid) return; - CDEBUG(D_OTHER, "squash req from 0x%llx, (%d:%d/%x)=>(%d:%d/%x)\n", + CDEBUG(D_SEC, "squash req from 0x%llx, (%d:%d/%x)=>(%d:%d/%x)\n", *peernid, rsd->rsd_fsuid, rsd->rsd_fsgid, rsd->rsd_cap, mds->mds_squash_uid, mds->mds_squash_gid, (rsd->rsd_cap & ~CAP_FS_MASK)); - rsd->rsd_uid = mds->mds_squash_uid; rsd->rsd_fsuid = mds->mds_squash_uid; rsd->rsd_fsgid = mds->mds_squash_gid; - - /* XXX should we remove all capabilities? 
*/ rsd->rsd_cap &= ~CAP_FS_MASK; } static int mds_getattr_internal(struct obd_device *obd, struct dentry *dentry, - struct ptlrpc_request *req, struct mds_body *reqbody, - int reply_off) + struct ptlrpc_request *req, int req_off, + struct mds_body *reqbody, int reply_off) { struct inode *inode = dentry->d_inode; struct mds_body *body; @@ -782,30 +901,22 @@ static int mds_getattr_internal(struct obd_device *obd, struct dentry *dentry, OBD_MD_FLATIME | OBD_MD_FLMTIME); } else if (S_ISLNK(inode->i_mode) && (reqbody->valid & OBD_MD_LINKNAME) != 0) { - int len = req->rq_repmsg->buflens[reply_off + 1]; - char *symname = lustre_msg_buf(req->rq_repmsg, reply_off + 1, 0); - - LASSERT(symname != NULL); /* caller prepped reply */ - - if (!inode->i_op->readlink) { - rc = -ENOSYS; - } else { - rc = inode->i_op->readlink(dentry, symname, len); - if (rc < 0) { - CERROR("readlink failed: %d\n", rc); - } else if (rc != len - 1) { - CERROR("Unexpected readlink rc %d: expecting %d\n", - rc, len - 1); - rc = -EINVAL; - } else { - CDEBUG(D_INODE, "read symlink dest %s\n", symname); - body->valid |= OBD_MD_LINKNAME; - body->eadatasize = rc + 1; - symname[rc] = 0; - rc = 0; - } - } + rc = mds_pack_link(dentry, req, body, reply_off); + } else if (reqbody->valid & OBD_MD_FLEA) { + rc = mds_pack_ea(dentry, req, body, req_off, reply_off); + } else if (reqbody->valid & OBD_MD_FLEALIST) { + rc = mds_pack_ealist(dentry, req, body, reply_off); } + + if (reqbody->valid & OBD_MD_FLACL_ACCESS) { + int inc = (reqbody->valid & OBD_MD_FLEASIZE) ? 
2 : 1; + rc = mds_pack_acl(obd, req->rq_repmsg, reply_off + inc, + body, inode); + } + + /* do reverse uid/gid mapping if needed */ + if (rc == 0 && req->rq_remote) + mds_reverse_map_ugid(req, body); RETURN(rc); } @@ -834,13 +945,13 @@ out: return rc; } -static int mds_getattr_pack_msg(struct ptlrpc_request *req, - struct inode *inode, +static int mds_getattr_pack_msg(struct ptlrpc_request *req, struct dentry *de, int offset) { + struct inode *inode = de->d_inode; struct mds_obd *mds = mds_req2mds(req); struct mds_body *body; - int rc = 0, size[2] = {sizeof(*body)}, bufcount = 1; + int rc = 0, size[4] = {sizeof(*body)}, bufcount = 1; ENTRY; body = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*body)); @@ -853,8 +964,6 @@ static int mds_getattr_pack_msg(struct ptlrpc_request *req, down(&inode->i_sem); rc = fsfilt_get_md(req->rq_export->exp_obd, inode, NULL, 0); up(&inode->i_sem); - CDEBUG(D_INODE, "got %d bytes MD data for inode %lu\n", - rc, inode->i_ino); if (rc < 0) { if (rc != -ENODATA) CERROR("error getting inode %lu MD: rc = %d\n", @@ -876,6 +985,42 @@ static int mds_getattr_pack_msg(struct ptlrpc_request *req, bufcount++; CDEBUG(D_INODE, "symlink size: %Lu, reply space: %d\n", inode->i_size + 1, body->eadatasize); + } else if ((body->valid & OBD_MD_FLEA)) { + char *ea_name = lustre_msg_string(req->rq_reqmsg, + offset + 1, 0); + rc = -EOPNOTSUPP; + if (inode->i_op && inode->i_op->getxattr) + rc = inode->i_op->getxattr(de, ea_name, NULL, 0); + + if (rc < 0) { + if (rc != -ENODATA) + CERROR("error getting inode %lu EA: rc = %d\n", + inode->i_ino, rc); + size[bufcount] = 0; + } else { + size[bufcount] = min_t(int, body->eadatasize, rc); + } + bufcount++; + } else if (body->valid & OBD_MD_FLEALIST) { + rc = -EOPNOTSUPP; + if (inode->i_op && inode->i_op->getxattr) + rc = inode->i_op->listxattr(de, NULL, 0); + + if (rc < 0) { + if (rc != -ENODATA) + CERROR("error getting inode %lu EA: rc = %d\n", + inode->i_ino, rc); + size[bufcount] = 0; + } else { + 
size[bufcount] = min_t(int, body->eadatasize, rc); + } + bufcount++; + } + + /* may co-exist with OBD_MD_FLEASIZE */ + if (body->valid & OBD_MD_FLACL_ACCESS) { + size[bufcount++] = 4; + size[bufcount++] = xattr_acl_size(LL_ACL_MAX_ENTRIES); } if (OBD_FAIL_CHECK(OBD_FAIL_MDS_GETATTR_PACK)) { @@ -935,7 +1080,7 @@ static int mds_getattr_lock(struct ptlrpc_request *req, int offset, struct mds_req_sec_desc *rsd; struct mds_body *body; struct dentry *dparent = NULL, *dchild = NULL; - struct lvfs_ucred uc; + struct lvfs_ucred uc = {NULL, NULL,}; struct lustre_handle parent_lockh[2] = {{0}, {0}}; unsigned int namesize; int rc = 0, cleanup_phase = 0, resent_req = 0, update_mode, reply_offset; @@ -950,7 +1095,6 @@ static int mds_getattr_lock(struct ptlrpc_request *req, int offset, CERROR("Can't unpack security desc\n"); RETURN(-EFAULT); } - mds_squash_root(mds, rsd, &req->rq_peer.peer_id.nid); /* swab now, before anyone looks inside the request. */ body = lustre_swab_reqbuf(req, offset, sizeof(*body), @@ -981,7 +1125,7 @@ static int mds_getattr_lock(struct ptlrpc_request *req, int offset, reply_offset = 0; } - rc = mds_init_ucred(&uc, rsd); + rc = mds_init_ucred(&uc, req, rsd); if (rc) { CERROR("can't init ucred\n"); GOTO(cleanup, rc); @@ -1084,18 +1228,30 @@ static int mds_getattr_lock(struct ptlrpc_request *req, int offset, id_fid(&body->id1), (unsigned long)id_group(&body->id1), child_lockh->cookie); - dparent = mds_id2dentry(obd, &body->id1, NULL); - LASSERT(dparent); - - dchild = ll_lookup_one_len(name, dparent, namesize - 1); - if (IS_ERR(dchild)) { - DEBUG_REQ(D_ERROR, req, "resent, not enqueuing new locks"); - CDEBUG(D_ERROR, "lock against [%lu:%lu]/%*s\n", - (unsigned long) id_ino(&body->id1), - (unsigned long) id_gen(&body->id1), - namesize - 1, name); + if (name) { + /* usual named request */ + dparent = mds_id2dentry(obd, &body->id1, NULL); + LASSERT(!IS_ERR(dparent)); + dchild = ll_lookup_one_len(name, dparent, namesize - 1); + if (IS_ERR(dchild)) { + 
DEBUG_REQ(D_ERROR, req, "resent, not enqueuing new locks"); + CDEBUG(D_ERROR, "lock against [%lu:%lu]/%*s\n", + (unsigned long) id_ino(&body->id1), + (unsigned long) id_gen(&body->id1), + namesize - 1, name); + } + LASSERT(!IS_ERR(dchild)); + } else { + /* client wants to get attr. by id */ + dchild = mds_id2dentry(obd, &body->id1, NULL); + if (IS_ERR(dchild)) { + DEBUG_REQ(D_ERROR, req, "resent, not enqueuing new locks"); + CDEBUG(D_ERROR, "lock against [%lu:%lu]\n", + (unsigned long) id_ino(&body->id1), + (unsigned long) id_gen(&body->id1)); + } + LASSERT(!IS_ERR(dchild)); } - LASSERT(!IS_ERR(dchild)); LDLM_LOCK_PUT(granted_lock); } @@ -1117,14 +1273,14 @@ static int mds_getattr_lock(struct ptlrpc_request *req, int offset, if (dchild->d_flags & DCACHE_CROSS_REF) rc = mds_getattr_pack_msg_cf(req, dchild, offset); else - rc = mds_getattr_pack_msg(req, dchild->d_inode, offset); + rc = mds_getattr_pack_msg(req, dchild, offset); if (rc != 0) { CERROR ("mds_getattr_pack_msg: %d\n", rc); GOTO (cleanup, rc); } } - rc = mds_getattr_internal(obd, dchild, req, body, reply_offset); + rc = mds_getattr_internal(obd, dchild, req, offset, body, reply_offset); GOTO(cleanup, rc); /* returns the lock to the client */ cleanup: @@ -1145,6 +1301,7 @@ static int mds_getattr_lock(struct ptlrpc_request *req, int offset, l_dput(dchild); case 1: pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc); + default: mds_exit_ucred(&uc); } return rc; @@ -1157,7 +1314,7 @@ static int mds_getattr(struct ptlrpc_request *req, int offset) struct dentry *de; struct mds_req_sec_desc *rsd; struct mds_body *body; - struct lvfs_ucred uc; + struct lvfs_ucred uc = {NULL, NULL,}; int rc = 0; ENTRY; @@ -1176,8 +1333,9 @@ static int mds_getattr(struct ptlrpc_request *req, int offset) MD_COUNTER_INCREMENT(obd, getattr); - rc = mds_init_ucred(&uc, rsd); + rc = mds_init_ucred(&uc, req, rsd); if (rc) { + mds_exit_ucred(&uc); CERROR("can't init ucred\n"); RETURN(rc); } @@ -1189,14 +1347,13 @@ static int mds_getattr(struct 
ptlrpc_request *req, int offset) GOTO(out_pop, rc); } - rc = mds_getattr_pack_msg(req, de->d_inode, offset); + rc = mds_getattr_pack_msg(req, de, offset); if (rc != 0) { CERROR("mds_getattr_pack_msg: %d\n", rc); GOTO(out_pop, rc); } - req->rq_status = mds_getattr_internal(obd, de, req, body, 0); - + req->rq_status = mds_getattr_internal(obd, de, req, offset, body, 0); l_dput(de); EXIT; @@ -1306,7 +1463,6 @@ out: static int mds_readpage(struct ptlrpc_request *req, int offset) { struct obd_device *obd = req->rq_export->exp_obd; - struct mds_obd *mds = &obd->u.mds; struct vfsmount *mnt; struct dentry *de; struct file *file; @@ -1314,7 +1470,7 @@ static int mds_readpage(struct ptlrpc_request *req, int offset) struct mds_body *body, *repbody; struct lvfs_run_ctxt saved; int rc, size = sizeof(*repbody); - struct lvfs_ucred uc; + struct lvfs_ucred uc = {NULL, NULL,}; ENTRY; rc = lustre_pack_reply(req, 1, &size, NULL); @@ -1328,7 +1484,6 @@ static int mds_readpage(struct ptlrpc_request *req, int offset) CERROR("Can't unpack security desc\n"); GOTO (out, rc = -EFAULT); } - mds_squash_root(mds, rsd, &req->rq_peer.peer_id.nid); body = lustre_swab_reqbuf(req, offset, sizeof(*body), lustre_swab_mds_body); @@ -1337,7 +1492,7 @@ static int mds_readpage(struct ptlrpc_request *req, int offset) GOTO (out, rc = -EFAULT); } - rc = mds_init_ucred(&uc, rsd); + rc = mds_init_ucred(&uc, req, rsd); if (rc) { CERROR("can't init ucred\n"); GOTO(out, rc); @@ -1384,8 +1539,8 @@ out_file: filp_close(file, 0); out_pop: pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc); - mds_exit_ucred(&uc); out: + mds_exit_ucred(&uc); req->rq_status = rc; return 0; } @@ -1479,7 +1634,6 @@ EXPORT_SYMBOL(mds_read_mid); int mds_reint(struct ptlrpc_request *req, int offset, struct lustre_handle *lockh) { - struct mds_obd *mds = &req->rq_export->exp_obd->u.mds; struct mds_update_record *rec; struct mds_req_sec_desc *rsd; int rc; @@ -1494,7 +1648,6 @@ int mds_reint(struct ptlrpc_request *req, int offset, CERROR("Can't 
unpack security desc\n"); GOTO(out, rc = -EFAULT); } - mds_squash_root(mds, rsd, &req->rq_peer.peer_id.nid); rc = mds_update_unpack(req, offset, rec); if (rc || OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_UNPACK)) { @@ -1502,7 +1655,7 @@ int mds_reint(struct ptlrpc_request *req, int offset, GOTO(out, req->rq_status = -EINVAL); } - rc = mds_init_ucred(&rec->ur_uc, rsd); + rc = mds_init_ucred(&rec->ur_uc, req, rsd); if (rc) { CERROR("can't init ucred\n"); GOTO(out, rc); @@ -1510,11 +1663,27 @@ int mds_reint(struct ptlrpc_request *req, int offset, /* rc will be used to interrupt a for loop over multiple records */ rc = mds_reint_rec(rec, offset, req, lockh); - mds_exit_ucred(&rec->ur_uc); - EXIT; + + /* do reverse uid/gid mapping if needed */ + if (rc == 0 && req->rq_remote && + (rec->ur_opcode == REINT_SETATTR || + rec->ur_opcode == REINT_OPEN)) { + struct mds_body *body; + int bodyoff; + + if (rec->ur_opcode == REINT_SETATTR) + bodyoff = 0; + else /* open */ + bodyoff = (offset == 3 ? 1 : 0); + body = lustre_msg_buf(req->rq_repmsg, bodyoff, sizeof(*body)); + LASSERT(body); + + mds_reverse_map_ugid(req, body); + } out: + mds_exit_ucred(&rec->ur_uc); OBD_FREE(rec, sizeof(*rec)); - return rc; + RETURN(rc); } static int mds_filter_recovery_request(struct ptlrpc_request *req, @@ -1655,7 +1824,7 @@ static int mdt_obj_create(struct ptlrpc_request *req) * this only serve to inter-mds request, don't need check group database * here. --ericm. 
*/ - uc.luc_ghash = NULL; + uc.luc_lsd = NULL; uc.luc_ginfo = NULL; uc.luc_uid = body->oa.o_uid; uc.luc_fsuid = body->oa.o_uid; @@ -1891,7 +2060,6 @@ cleanup: l_dput(new); pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc); - mds_put_group_entry(mds, uc.luc_ghash); return rc; } @@ -2008,6 +2176,38 @@ static int mdt_set_info(struct ptlrpc_request *req) RETURN(-EINVAL); } +static int mds_init_export_data(struct ptlrpc_request *req) +{ + struct mds_export_data *med = &req->rq_export->u.eu_mds_data; + __u32 *nllu; + + nllu = lustre_msg_buf(req->rq_reqmsg, 4, sizeof(__u32) * 2); + if (nllu == NULL) { + CERROR("failed to extract nllu, use 99:99\n"); + med->med_nllu = 99; + med->med_nllg = 99; + } else { + if (lustre_msg_swabbed(req->rq_reqmsg)) { + __swab32s(&nllu[0]); + __swab32s(&nllu[1]); + } + med->med_nllu = nllu[0]; + med->med_nllg = nllu[1]; + } + + if (req->rq_remote) { + CWARN("exp %p, peer "LPX64": set as remote\n", + req->rq_export, req->rq_peer.peer_id.nid); + med->med_local = 0; + } else + med->med_local = 1; + + LASSERT(med->med_idmap == NULL); + spin_lock_init(&med->med_idmap_lock); + + return 0; +} + static int mds_msg_check_version(struct lustre_msg *msg) { int rc; @@ -2066,6 +2266,11 @@ static int mds_msg_check_version(struct lustre_msg *msg) CERROR("bad opc %u version %08x, expecting %08x\n", msg->opc, msg->version, LUSTRE_OBD_VERSION); break; + case SEC_INIT: + case SEC_INIT_CONTINUE: + case SEC_FINI: + rc = 0; + break; default: CERROR("MDS unknown opcode %d\n", msg->opc); rc = -ENOTSUPP; @@ -2093,6 +2298,13 @@ int mds_handle(struct ptlrpc_request *req) RETURN(rc); } + /* Security opc should NOT trigger any recovery events */ + if (req->rq_reqmsg->opc == SEC_INIT || + req->rq_reqmsg->opc == SEC_INIT_CONTINUE || + req->rq_reqmsg->opc == SEC_FINI) { + GOTO(out, rc = 0); + } + LASSERT(current->journal_info == NULL); /* XXX identical to OST */ if (req->rq_reqmsg->opc != MDS_CONNECT) { @@ -2148,9 +2360,11 @@ int mds_handle(struct ptlrpc_request *req) 
DEBUG_REQ(D_INODE, req, "connect"); OBD_FAIL_RETURN(OBD_FAIL_MDS_CONNECT_NET, 0); rc = target_handle_connect(req); - if (!rc) + if (!rc) { /* Now that we have an export, set mds. */ mds = mds_req2mds(req); + mds_init_export_data(req); + } break; case MDS_DISCONNECT: @@ -2690,10 +2904,8 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf) /* * here we use "iopen_nopriv" hardcoded, because it affects MDS utility * and the rest of options are passed by mount options. Probably this - * should be moved to somewhere else like startup scripts or lconf. - */ - sprintf(options, "iopen_nopriv"); - + * should be moved to somewhere else like startup scripts or lconf. */ + sprintf(options, "iopen_nopriv,acl,user_xattr"); if (lcfg->lcfg_inllen4 > 0 && lcfg->lcfg_inlbuf4) sprintf(options + strlen(options), ",%s", lcfg->lcfg_inlbuf4); @@ -3002,6 +3214,7 @@ static int mds_precleanup(struct obd_device *obd, int flags) RETURN(rc); } +extern void lgss_svc_cache_purge_all(void); static int mds_cleanup(struct obd_device *obd, int flags) { struct mds_obd *mds = &obd->u.mds; @@ -3046,9 +3259,65 @@ static int mds_cleanup(struct obd_device *obd, int flags) dev_clear_rdonly(2); fsfilt_put_ops(obd->obd_fsops); +#ifdef ENABLE_GSS + /* XXX */ + lgss_svc_cache_purge_all(); +#endif RETURN(0); } +static int set_security(const char *value, char **sec) +{ + int rc = 0; + + if (!strcmp(value, "null")) + *sec = "null"; + else if (!strcmp(value, "krb5i")) + *sec = "krb5i"; + else if (!strcmp(value, "krb5p")) + *sec = "krb5p"; + else { + CERROR("Unrecognized value, force use NULL\n"); + rc = -EINVAL; + } + + return rc; +} + +static int mds_process_config(struct obd_device *obd, obd_count len, void *buf) +{ + struct lustre_cfg *lcfg = buf; + struct mds_obd *mds = &obd->u.mds; + int rc = 0; + ENTRY; + + switch(lcfg->lcfg_command) { + case LCFG_SET_SECURITY: { + if (!lcfg->lcfg_inllen1 || !lcfg->lcfg_inllen2) + GOTO(out, rc = -EINVAL); + + if (!strcmp(lcfg->lcfg_inlbuf1, 
"mds_mds_sec")) + rc = set_security(lcfg->lcfg_inlbuf2, + &mds->mds_mds_sec); + else if (!strcmp(lcfg->lcfg_inlbuf1, "mds_ost_sec")) + rc = set_security(lcfg->lcfg_inlbuf2, + &mds->mds_ost_sec); + else { + CERROR("Unrecognized key\n"); + rc = -EINVAL; + } + break; + } + default: { + CERROR("Unknown command: %d\n", lcfg->lcfg_command); + GOTO(out, rc = -EINVAL); + + } + } +out: + RETURN(rc); +} + static void fixup_handle_for_resent_req(struct ptlrpc_request *req, int offset, struct ldlm_lock *new_lock, @@ -3126,10 +3395,11 @@ static int mds_intent_policy(struct ldlm_namespace *ns, struct lustre_handle lockh[2] = {{0}, {0}}; struct ldlm_lock *new_lock = NULL; int getattr_part = MDS_INODELOCK_UPDATE; - int rc, repsize[4] = { sizeof(struct ldlm_reply), - sizeof(struct mds_body), - mds->mds_max_mdsize, - mds->mds_max_cookiesize }; + int rc, reply_buffers; + int repsize[5] = {sizeof(struct ldlm_reply), + sizeof(struct mds_body), + mds->mds_max_mdsize}; + int offset = MDS_REQ_INTENT_REC_OFF; ENTRY; @@ -3153,7 +3423,14 @@ static int mds_intent_policy(struct ldlm_namespace *ns, LDLM_DEBUG(lock, "intent policy, opc: %s", ldlm_it2str(it->opc)); - rc = lustre_pack_reply(req, 3, repsize, NULL); + reply_buffers = 3; + if (it->opc & ( IT_OPEN | IT_GETATTR | IT_LOOKUP | IT_CHDIR )) { + reply_buffers = 5; + repsize[3] = 4; + repsize[4] = xattr_acl_size(LL_ACL_MAX_ENTRIES); + } + + rc = lustre_pack_reply(req, reply_buffers, repsize, NULL); if (rc) RETURN(req->rq_status = rc); @@ -3488,6 +3765,7 @@ static struct obd_ops mds_obd_ops = { .o_setup = mds_setup, .o_precleanup = mds_precleanup, .o_cleanup = mds_cleanup, + .o_process_config = mds_process_config, .o_postrecov = mds_postrecov, .o_statfs = mds_obd_statfs, .o_iocontrol = mds_iocontrol, @@ -3514,7 +3792,7 @@ static int __init mds_init(void) { struct lprocfs_static_vars lvars; - mds_group_hash_init(); + mds_init_lsd_cache(); lprocfs_init_multi_vars(0, &lvars); class_register_type(&mds_obd_ops, NULL, lvars.module_vars, @@ -3528,7 
+3806,7 @@ static int __init mds_init(void) static void /*__exit*/ mds_exit(void) { - mds_group_hash_cleanup(); + mds_cleanup_lsd_cache(); class_unregister_type(LUSTRE_MDS_NAME); class_unregister_type(LUSTRE_MDT_NAME); diff --git a/lustre/mds/lproc_mds.c b/lustre/mds/lproc_mds.c index 3912499..085c840 100644 --- a/lustre/mds/lproc_mds.c +++ b/lustre/mds/lproc_mds.c @@ -142,17 +142,13 @@ struct lprocfs_vars lprocfs_mds_obd_vars[] = { }; /* - * group hash proc entries handler + * LSD proc entry handlers */ -static int lprocfs_wr_group_info(struct file *file, const char *buffer, - unsigned long count, void *data) +static int lprocfs_wr_lsd_downcall(struct file *file, const char *buffer, + unsigned long count, void *data) { - struct { - int err; - uid_t uid; - uint32_t ngroups; - gid_t *groups; - } param; + struct upcall_cache *cache = __mds_get_global_lsd_cache(); + struct lsd_downcall_args param; gid_t gids_local[NGROUPS_SMALL]; gid_t *gids = NULL; @@ -164,9 +160,16 @@ static int lprocfs_wr_group_info(struct file *file, const char *buffer, CERROR("broken downcall\n"); return count; } + + if (param.err) { + CERROR("LSD downcall indicate error %d\n", param.err); + goto do_downcall; + } + if (param.ngroups > NGROUPS_MAX) { CERROR("%d groups?\n", param.ngroups); - return count; + param.err = -EINVAL; + goto do_downcall; } if (param.ngroups <= NGROUPS_SMALL) @@ -176,132 +179,119 @@ static int lprocfs_wr_group_info(struct file *file, const char *buffer, if (!gids) { CERROR("fail to alloc memory for %d gids\n", param.ngroups); - return count; + param.err = -ENOMEM; + goto do_downcall; } } if (copy_from_user(gids, param.groups, param.ngroups * sizeof(gid_t))) { CERROR("broken downcall\n"); - goto out; + param.err = -EFAULT; + goto do_downcall; } - mds_handle_group_downcall(param.err, param.uid, - param.ngroups, gids); + param.groups = gids; + +do_downcall: + upcall_cache_downcall(cache, (__u64) param.uid, param.err, ¶m); -out: if (gids && gids != gids_local) OBD_FREE(gids, 
param.ngroups * sizeof(gid_t)); return count; } -static int lprocfs_rd_expire(char *page, char **start, off_t off, int count, - int *eof, void *data) +static int lprocfs_rd_lsd_expire(char *page, char **start, off_t off, int count, + int *eof, void *data) { - struct mds_grp_hash *hash = __mds_get_global_group_hash(); + struct upcall_cache *cache= __mds_get_global_lsd_cache(); *eof = 1; - return snprintf(page, count, "%d\n", hash->gh_entry_expire); + return snprintf(page, count, "%lu\n", cache->uc_entry_expire); } - -static int lprocfs_wr_expire(struct file *file, const char *buffer, - unsigned long count, void *data) +static int lprocfs_wr_lsd_expire(struct file *file, const char *buffer, + unsigned long count, void *data) { - struct mds_grp_hash *hash = __mds_get_global_group_hash(); + struct upcall_cache *cache= __mds_get_global_lsd_cache(); char buf[32]; if (copy_from_user(buf, buffer, min(count, 32UL))) return count; buf[31] = 0; - sscanf(buf, "%d", &hash->gh_entry_expire); + sscanf(buf, "%lu", &cache->uc_entry_expire); return count; } -static int lprocfs_rd_ac_expire(char *page, char **start, off_t off, int count, - int *eof, void *data) +static int lprocfs_rd_lsd_ac_expire(char *page, char **start, off_t off, + int count, int *eof, void *data) { - struct mds_grp_hash *hash = __mds_get_global_group_hash(); + struct upcall_cache *cache= __mds_get_global_lsd_cache(); *eof = 1; - return snprintf(page, count, "%d\n", hash->gh_acquire_expire); + return snprintf(page, count, "%lu\n", cache->uc_acquire_expire); } - -static int lprocfs_wr_ac_expire(struct file *file, const char *buffer, - unsigned long count, void *data) +static int lprocfs_wr_lsd_ac_expire(struct file *file, const char *buffer, + unsigned long count, void *data) { - struct mds_grp_hash *hash = __mds_get_global_group_hash(); + struct upcall_cache *cache= __mds_get_global_lsd_cache(); char buf[32]; if (copy_from_user(buf, buffer, min(count, 32UL))) return count; buf[31] = 0; - sscanf(buf, "%d", 
&hash->gh_acquire_expire); + sscanf(buf, "%lu", &cache->uc_acquire_expire); return count; } -static int lprocfs_rd_hash_upcall(char *page, char **start, off_t off, int count, - int *eof, void *data) +static int lprocfs_rd_lsd_upcall(char *page, char **start, off_t off, int count, + int *eof, void *data) { - struct mds_grp_hash *hash = __mds_get_global_group_hash(); + struct upcall_cache *cache= __mds_get_global_lsd_cache(); *eof = 1; - return snprintf(page, count, "%s\n", hash->gh_upcall); + return snprintf(page, count, "%s\n", cache->uc_upcall); } - -static int lprocfs_wr_hash_upcall(struct file *file, const char *buffer, - unsigned long count, void *data) +static int lprocfs_wr_lsd_upcall(struct file *file, const char *buffer, + unsigned long count, void *data) { - struct mds_grp_hash *hash = __mds_get_global_group_hash(); + struct upcall_cache *cache= __mds_get_global_lsd_cache(); - if (count < MDSGRP_UPCALL_MAXPATH) { - sscanf(buffer, "%1024s", hash->gh_upcall); - hash->gh_upcall[MDSGRP_UPCALL_MAXPATH-1] = 0; + if (count < UC_CACHE_UPCALL_MAXPATH) { + sscanf(buffer, "%1024s", cache->uc_upcall); + cache->uc_upcall[UC_CACHE_UPCALL_MAXPATH - 1] = 0; } return count; } -static int lprocfs_wr_hash_flush(struct file *file, const char *buffer, - unsigned long count, void *data) -{ - mds_group_hash_flush_idle(); - return count; -} - -static int lprocfs_rd_allow_setgroups(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - struct mds_grp_hash *hash = __mds_get_global_group_hash(); - - *eof = 1; - return snprintf(page, count, "%d\n", hash->gh_allow_setgroups); -} - -static int lprocfs_wr_allow_setgroups(struct file *file, const char *buffer, - unsigned long count, void *data) +extern void lgss_svc_cache_flush(__u32 uid); +static int lprocfs_wr_lsd_flush(struct file *file, const char *buffer, + unsigned long count, void *data) { - struct mds_grp_hash *hash = __mds_get_global_group_hash(); - char buf[8]; - int val; + char buf[32]; + __u32 uid; - if 
(copy_from_user(buf, buffer, min(count, 8UL))) + if (copy_from_user(buf, buffer, min(count, 32UL))) return count; - buf[7] = 0; - sscanf(buf, "%d", &val); - hash->gh_allow_setgroups = (val != 0); + buf[31] = 0; + sscanf(buf, "%d", &uid); + + mds_flush_lsd(uid); +#ifdef ENABLE_GSS + lgss_svc_cache_flush(uid); +#endif return count; } struct lprocfs_vars lprocfs_mds_module_vars[] = { - { "num_refs", lprocfs_rd_numrefs, 0, 0 }, - { "grp_hash_expire_interval",lprocfs_rd_expire, - lprocfs_wr_expire, 0}, - { "grp_hash_acquire_expire", lprocfs_rd_ac_expire, - lprocfs_wr_ac_expire, 0}, - { "grp_hash_upcall", lprocfs_rd_hash_upcall, - lprocfs_wr_hash_upcall, 0}, - { "grp_hash_flush", 0, lprocfs_wr_hash_flush, 0}, - { "group_info", 0, lprocfs_wr_group_info, 0 }, - { "allow_setgroups", lprocfs_rd_allow_setgroups, - lprocfs_wr_allow_setgroups, 0}, + { "num_refs", lprocfs_rd_numrefs, 0, 0 }, + /* LSD stuff */ + { "lsd_expire_interval", lprocfs_rd_lsd_expire, + lprocfs_wr_lsd_expire, 0}, + { "lsd_acquire_expire", lprocfs_rd_lsd_ac_expire, + lprocfs_wr_lsd_ac_expire, 0}, + { "lsd_upcall", lprocfs_rd_lsd_upcall, + lprocfs_wr_lsd_upcall, 0}, + { "lsd_flush", 0, lprocfs_wr_lsd_flush, 0}, + { "lsd_downcall", 0, lprocfs_wr_lsd_downcall, 0}, { 0 } }; diff --git a/lustre/mds/mds_fs.c b/lustre/mds/mds_fs.c index ed41934..b52bf4a 100644 --- a/lustre/mds/mds_fs.c +++ b/lustre/mds/mds_fs.c @@ -131,6 +131,8 @@ int mds_client_free(struct obd_export *exp, int clear_client) struct lvfs_run_ctxt saved; int rc; + mds_idmap_cleanup(med); + if (!med->med_mcd) RETURN(0); diff --git a/lustre/mds/mds_groups.c b/lustre/mds/mds_groups.c deleted file mode 100644 index 7da07f7..0000000 --- a/lustre/mds/mds_groups.c +++ /dev/null @@ -1,451 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2004 Cluster File Systems, Inc. - * - * This file is part of Lustre, http://www.lustre.org. 
- * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#define DEBUG_SUBSYSTEM S_MDS - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include "mds_internal.h" - -#define GRP_HASH_NEW 0x1 -#define GRP_HASH_ACQUIRING 0x2 -#define GRP_HASH_INVALID 0x4 -#define GRP_HASH_EXPIRED 0x8 - -#define GRP_IS_NEW(i) ((i)->ge_flags & GRP_HASH_NEW) -#define GRP_IS_INVALID(i) ((i)->ge_flags & GRP_HASH_INVALID) -#define GRP_IS_ACQUIRING(i) ((i)->ge_flags & GRP_HASH_ACQUIRING) -#define GRP_IS_EXPIRED(i) ((i)->ge_flags & GRP_HASH_EXPIRED) -#define GRP_IS_VALID(i) ((i)->ge_flags == 0) - -#define GRP_SET_NEW(i) (i)->ge_flags |= GRP_HASH_NEW -#define GRP_SET_INVALID(i) (i)->ge_flags |= GRP_HASH_INVALID -#define GRP_SET_ACQUIRING(i) (i)->ge_flags |= GRP_HASH_ACQUIRING -#define GRP_SET_EXPIRED(i) (i)->ge_flags |= GRP_HASH_EXPIRED -#define GRP_SET_VALID(i) (i)->ge_flags = 0 - -#define GRP_CLEAR_NEW(i) (i)->ge_flags &= ~GRP_HASH_NEW -#define GRP_CLEAR_ACQUIRING(i) (i)->ge_flags &= ~GRP_HASH_ACQUIRING -#define GRP_CLEAR_INVALID(i) (i)->ge_flags &= ~GRP_HASH_INVALID -#define GRP_CLEAR_EXPIRED(i) (i)->ge_flags &= ~GRP_HASH_EXPIRED - -/* - * We need share hash table among the groups of MDSs (which server as the same - * lustre file system), maybe MDT? 
but there's lprocfs problems of putting this - * in MDT. so we make it global to the module. which brings the limitation that - * one node couldn't running multiple MDS which server as different Lustre FS. - * but which maybe not meaningful. - */ -static struct mds_grp_hash _group_hash; - -struct mds_grp_hash *__mds_get_global_group_hash() -{ - return &_group_hash; -} - -static struct mds_grp_hash_entry *alloc_entry(uid_t uid) -{ - struct mds_grp_hash_entry *entry; - - OBD_ALLOC(entry, sizeof(*entry)); - if (!entry) - return NULL; - - GRP_SET_NEW(entry); - INIT_LIST_HEAD(&entry->ge_hash); - entry->ge_uid = uid; - atomic_set(&entry->ge_refcount, 0); - init_waitqueue_head(&entry->ge_waitq); - return entry; -} - -/* protected by hash lock */ -static void free_entry(struct mds_grp_hash_entry *entry) -{ - if (entry->ge_group_info) - groups_free(entry->ge_group_info); - list_del(&entry->ge_hash); - CDEBUG(D_OTHER, "destroy mds_grp_entry %p for uid %d\n", - entry, entry->ge_uid); - OBD_FREE(entry, sizeof(*entry)); -} - -static inline void get_entry(struct mds_grp_hash_entry *entry) -{ - atomic_inc(&entry->ge_refcount); -} -static inline void put_entry(struct mds_grp_hash_entry *entry) -{ - if (atomic_dec_and_test(&entry->ge_refcount) && - (GRP_IS_INVALID(entry) || GRP_IS_EXPIRED(entry))) { - free_entry(entry); - } -} -static int check_unlink_entry(struct mds_grp_hash_entry *entry) -{ - if (GRP_IS_VALID(entry) && - time_before(jiffies, entry->ge_expire)) - return 0; - - if (GRP_IS_ACQUIRING(entry) && - time_after(jiffies, entry->ge_acquire_expire)) { - GRP_SET_EXPIRED(entry); - wake_up_all(&entry->ge_waitq); - } else if (!GRP_IS_INVALID(entry)) { - GRP_SET_EXPIRED(entry); - } - - list_del_init(&entry->ge_hash); - if (!atomic_read(&entry->ge_refcount)) - free_entry(entry); - return 1; -} - -static int refresh_entry(struct mds_grp_hash *hash, - struct mds_grp_hash_entry *entry) -{ - char *argv[4]; - char *envp[3]; - char uidstr[16]; - int rc; - ENTRY; - - snprintf(uidstr, 
16, "%d", entry->ge_uid); - - argv[0] = hash->gh_upcall; - argv[1] = uidstr; - argv[2] = NULL; - - envp[0] = "HOME=/"; - envp[1] = "PATH=/sbin:/usr/sbin"; - envp[2] = NULL; - - rc = USERMODEHELPER(argv[0], argv, envp); - if (rc < 0) { - CERROR("Error invoking getgroups upcall %s %s: %d; check " - "/proc/fs/lustre/mds/grp_hash_upcall\n", - argv[0], argv[1], rc); - } else { - CWARN("Invoked upcall %s %s\n", - argv[0], argv[1]); - } - RETURN(rc); -} - -struct mds_grp_hash_entry *mds_get_group_entry(struct mds_obd *mds, uid_t uid) -{ - struct mds_grp_hash_entry *entry = NULL, *new = NULL, *next; - struct mds_grp_hash *hash = &_group_hash; - struct list_head *head; - wait_queue_t wait; - int rc, found; - ENTRY; - - head = &hash->gh_table[MDSGRP_HASH_INDEX(uid)]; - -find_again: - found = 0; - spin_lock(&hash->gh_lock); - list_for_each_entry_safe(entry, next, head, ge_hash) { - /* check invalid & expired items */ - if (check_unlink_entry(entry)) - continue; - if (entry->ge_uid == uid) { - found = 1; - break; - } - } - - if (!found) { /* didn't found */ - if (!new) { - spin_unlock(&hash->gh_lock); - new = alloc_entry(uid); - if (!new) { - CERROR("fail to alloc entry\n"); - RETURN(NULL); - } - goto find_again; - } else { - list_add(&new->ge_hash, head); - entry = new; - } - } else { - if (new) { - free_entry(new); - new = NULL; - } - list_move(&entry->ge_hash, head); - } - get_entry(entry); - - /* acquire for new one */ - if (GRP_IS_NEW(entry)) { - GRP_SET_ACQUIRING(entry); - GRP_CLEAR_NEW(entry); - entry->ge_acquire_expire = jiffies + - hash->gh_acquire_expire * HZ; - spin_unlock(&hash->gh_lock); - - rc = refresh_entry(hash, entry); - - spin_lock(&hash->gh_lock); - if (rc) { - GRP_CLEAR_ACQUIRING(entry); - GRP_SET_INVALID(entry); - } - /* fall through */ - } - - /* - * someone (and only one) is doing upcall upon this item, just wait it - * complete - */ - if (GRP_IS_ACQUIRING(entry)) { - init_waitqueue_entry(&wait, current); - add_wait_queue(&entry->ge_waitq, &wait); - 
set_current_state(TASK_INTERRUPTIBLE); - spin_unlock(&hash->gh_lock); - - schedule_timeout(hash->gh_acquire_expire * HZ); - - spin_lock(&hash->gh_lock); - remove_wait_queue(&entry->ge_waitq, &wait); - if (GRP_IS_ACQUIRING(entry)) { - /* we're interrupted or upcall failed - * in the middle - */ - put_entry(entry); - spin_unlock(&hash->gh_lock); - RETURN(NULL); - } - /* fall through */ - } - - /* invalid means error, don't need to try again */ - if (GRP_IS_INVALID(entry)) { - put_entry(entry); - spin_unlock(&hash->gh_lock); - RETURN(NULL); - } - - /* - * check expired. We can't refresh the existed one because some memory - * might be shared by multiple processes. - */ - if (check_unlink_entry(entry)) { - /* - * if expired, try again. but if this entry is created by me but - * too quickly turn to expired without any error, should at - * least give a chance to use it once. - */ - if (entry != new) { - put_entry(entry); - spin_unlock(&hash->gh_lock); - new = NULL; - goto find_again; - } - } - - /* Now we know it's good */ - spin_unlock(&hash->gh_lock); - RETURN(entry); -} - -void mds_put_group_entry(struct mds_obd *mds, struct mds_grp_hash_entry *entry) -{ - struct mds_grp_hash *hash = &_group_hash; - ENTRY; - - if (!entry) { - EXIT; - return; - } - - spin_lock(&hash->gh_lock); - LASSERT(atomic_read(&entry->ge_refcount) > 0); - put_entry(entry); - spin_unlock(&hash->gh_lock); - EXIT; -} - -static int entry_set_group_info(struct mds_grp_hash_entry *entry, - __u32 ngroups, gid_t *groups) -{ - struct group_info *ginfo; - ENTRY; - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,4) - if (ngroups > NGROUPS) - ngroups = NGROUPS; -#endif - - if (ngroups > NGROUPS_MAX) { - CERROR("too many (%d) supp groups\n", ngroups); - RETURN(-EINVAL); - } - - ginfo = groups_alloc(ngroups); - if (!ginfo) { - CERROR("can't alloc group_info for %d groups\n", ngroups); - RETURN(-ENOMEM); - } - groups_from_buffer(ginfo, groups); - - entry->ge_group_info = ginfo; - RETURN(0); -} - -int 
mds_handle_group_downcall(int err, uid_t uid, __u32 ngroups, gid_t *groups) -{ - struct mds_grp_hash *hash = &_group_hash; - struct mds_grp_hash_entry *entry = NULL; - struct list_head *head; - int found = 0, rc = 0; - ENTRY; - - LASSERT(hash); - - head = &hash->gh_table[MDSGRP_HASH_INDEX(uid)]; - - spin_lock(&hash->gh_lock); - list_for_each_entry(entry, head, ge_hash) { - if (entry->ge_uid == uid) { - found = 1; - break; - } - } - if (!found) { - /* haven't found, it's possible */ - spin_unlock(&hash->gh_lock); - RETURN(-EINVAL); - } - if (err) { - GRP_SET_INVALID(entry); - GOTO(out, rc = -EINVAL); - } - - if (!GRP_IS_ACQUIRING(entry) || - GRP_IS_INVALID(entry) || - GRP_IS_EXPIRED(entry)) { - CERROR("found a stale entry %p(uid %d) in ioctl\n", - entry, entry->ge_uid); - GOTO(out, rc = -EINVAL); - } - - atomic_inc(&entry->ge_refcount); - spin_unlock(&hash->gh_lock); - rc = entry_set_group_info(entry, ngroups, groups); - spin_lock(&hash->gh_lock); - atomic_dec(&entry->ge_refcount); - if (rc) { - GRP_SET_INVALID(entry); - list_del_init(&entry->ge_hash); - GOTO(out, rc); - } - entry->ge_acquisition_time = LTIME_S(CURRENT_TIME); - entry->ge_expire = jiffies + hash->gh_entry_expire * HZ; - GRP_SET_VALID(entry); - CDEBUG(D_OTHER, "created mds_grp_entry %p for uid %d\n", - entry, entry->ge_uid); -out: - wake_up_all(&entry->ge_waitq); - spin_unlock(&hash->gh_lock); - RETURN(rc); -} - -static void mds_flush_group_hash(struct mds_grp_hash *hash, int force) -{ - struct mds_grp_hash_entry *entry, *next; - int i; - ENTRY; - - spin_lock(&hash->gh_lock); - for (i = 0; i < MDSGRP_HASH_SIZE; i++) { - list_for_each_entry_safe(entry, next, - &hash->gh_table[i], ge_hash) { - if (!force && atomic_read(&entry->ge_refcount)) { - GRP_SET_EXPIRED(entry); - continue; - } - LASSERT(!atomic_read(&entry->ge_refcount)); - free_entry(entry); - } - } - spin_unlock(&hash->gh_lock); - EXIT; -} - -void mds_group_hash_flush_idle() -{ - mds_flush_group_hash(&_group_hash, 0); -} - -int 
mds_allow_setgroups(void) -{ - return _group_hash.gh_allow_setgroups; -} - -int mds_group_hash_init() -{ - struct mds_grp_hash *hash; - int i; - ENTRY; - - hash = &_group_hash; - - spin_lock_init(&hash->gh_lock); - for (i = 0; i < MDSGRP_HASH_SIZE; i++) - INIT_LIST_HEAD(&hash->gh_table[i]); - /* set default value, proc tunable */ - sprintf(hash->gh_upcall, "%s", "/sbin/l_getgroups"); - hash->gh_entry_expire = 5 * 60; - hash->gh_acquire_expire = 5; - hash->gh_allow_setgroups = 0; - - RETURN(0); -} - -void mds_group_hash_cleanup() -{ - mds_flush_group_hash(&_group_hash, 1); -} diff --git a/lustre/mds/mds_internal.h b/lustre/mds/mds_internal.h index be2fcfc..d68b78e 100644 --- a/lustre/mds/mds_internal.h +++ b/lustre/mds/mds_internal.h @@ -97,7 +97,15 @@ int mds_lock_new_child(struct obd_device *obd, struct inode *inode, void groups_from_buffer(struct group_info *ginfo, __u32 *gids); int mds_update_unpack(struct ptlrpc_request *, int offset, struct mds_update_record *); -int mds_init_ucred(struct lvfs_ucred *ucred, struct mds_req_sec_desc *rsd); +int mds_idmap_set(struct mds_export_data *med, __u32 id1, __u32 id2, + int is_uid_mapping); +__u32 mds_idmap_get(struct mds_export_data *med, __u32 id, + int is_uid_mapping); +void mds_idmap_cleanup(struct mds_export_data *med); +void mds_reverse_map_ugid(struct ptlrpc_request *req, + struct mds_body *body); +int mds_init_ucred(struct lvfs_ucred *ucred, struct ptlrpc_request *req, + struct mds_req_sec_desc *rsd); void mds_exit_ucred(struct lvfs_ucred *ucred); /* mds/mds_unlink_open.c */ @@ -205,7 +213,14 @@ int mds_get_md(struct obd_device *, struct inode *, void *md, int mds_pack_md(struct obd_device *, struct lustre_msg *, int offset, struct mds_body *, struct inode *, int lock); - +int mds_pack_link(struct dentry *dentry, struct ptlrpc_request *req, + struct mds_body *repbody, int reply_off); +int mds_pack_ea(struct dentry *dentry, struct ptlrpc_request *req, + struct mds_body *repbody, int req_off, int reply_off); +int 
mds_pack_ealist(struct dentry *dentry, struct ptlrpc_request *req, + struct mds_body *repbody, int reply_off); +int mds_pack_acl(struct obd_device *, struct lustre_msg *, int offset, + struct mds_body *, struct inode *); int mds_pack_inode2id(struct obd_device *, struct lustre_id *, struct inode *, int); @@ -238,19 +253,12 @@ int mds_lock_and_check_slave(int, struct ptlrpc_request *, struct lustre_handle int mds_convert_mea_ea(struct obd_device *, struct inode *, struct lov_mds_md *, int); int mds_is_dir_empty(struct obd_device *, struct dentry *); -/* mds_groups.c */ -int mds_group_hash_init(void); -void mds_group_hash_cleanup(void); -void mds_group_hash_flush_idle(void); -int mds_allow_setgroups(void); - -extern char mds_getgroups_upcall[PATH_MAX]; -extern int mds_grp_hash_entry_expire; -extern int mds_grp_hash_acquire_expire; - -struct mds_grp_hash *__mds_get_global_group_hash(void); -struct mds_grp_hash_entry * mds_get_group_entry(struct mds_obd *mds, uid_t uid); -void mds_put_group_entry(struct mds_obd *mds, struct mds_grp_hash_entry *entry); -int mds_handle_group_downcall(int err, uid_t uid, __u32 ngroups, gid_t *groups); +/* mds_lsd.c */ +struct upcall_cache *__mds_get_global_lsd_cache(void); +int mds_init_lsd_cache(void); +void mds_cleanup_lsd_cache(void); +struct lustre_sec_desc * mds_get_lsd(__u32 uid); +void mds_put_lsd(struct lustre_sec_desc *lsd); +void mds_flush_lsd(__u32 id); #endif /* _MDS_INTERNAL_H */ diff --git a/lustre/mds/mds_lib.c b/lustre/mds/mds_lib.c index c298512..6706841 100644 --- a/lustre/mds/mds_lib.c +++ b/lustre/mds/mds_lib.c @@ -286,11 +286,11 @@ static int mds_setattr_unpack(struct ptlrpc_request *req, int offset, } if (req->rq_reqmsg->bufcount > offset + 2) { - r->ur_logcookies = lustre_msg_buf(req->rq_reqmsg, offset + 2, 0); - if (r->ur_eadata == NULL) + r->ur_ea2data = lustre_msg_buf(req->rq_reqmsg, offset + 2, 0); + if (r->ur_ea2data == NULL) RETURN (-EFAULT); - r->ur_cookielen = req->rq_reqmsg->buflens[offset + 2]; + 
r->ur_ea2datalen = req->rq_reqmsg->buflens[offset + 2]; } RETURN(0); @@ -504,6 +504,195 @@ int mds_update_unpack(struct ptlrpc_request *req, int offset, RETURN(rc); } +static +struct mds_idmap_table *__get_idmap_table(struct mds_export_data *med, + int create) +{ + struct mds_idmap_table *new; + int i; + + if (!create || med->med_idmap) + return med->med_idmap; + + spin_unlock(&med->med_idmap_lock); + OBD_ALLOC(new, sizeof(*new)); + spin_lock(&med->med_idmap_lock); + + if (!new) { + CERROR("fail to alloc %d\n", sizeof(*new)); + return NULL; + } + + if (med->med_idmap) { + OBD_FREE(new, sizeof(*new)); + return med->med_idmap; + } + + for (i = 0; i < MDS_IDMAP_HASHSIZE; i++) { + INIT_LIST_HEAD(&new->uidmap[i]); + INIT_LIST_HEAD(&new->gidmap[i]); + } + + CDEBUG(D_SEC, "allocate idmap table for med %p\n", med); + med->med_idmap = new; + return new; +} + +static void __flush_mapping_table(struct list_head *table) +{ + struct mds_idmap_item *item; + int i; + + for (i = 0; i < MDS_IDMAP_HASHSIZE; i++) { + while (!list_empty(&table[i])) { + item = list_entry(table[i].next, struct mds_idmap_item, + hash); + list_del(&item->hash); + OBD_FREE(item, sizeof(*item)); + } + } +} + +void mds_idmap_cleanup(struct mds_export_data *med) +{ + ENTRY; + + if (!med->med_idmap) { + EXIT; + return; + } + + spin_lock(&med->med_idmap_lock); + __flush_mapping_table(med->med_idmap->uidmap); + __flush_mapping_table(med->med_idmap->gidmap); + OBD_FREE(med->med_idmap, sizeof(struct mds_idmap_table)); + spin_unlock(&med->med_idmap_lock); +} + +static inline int idmap_hash(__u32 id) +{ + return (id & (MDS_IDMAP_HASHSIZE - 1)); +} + +static +int __idmap_set_item(struct mds_export_data *med, + struct list_head *table, + __u32 id1, __u32 id2) +{ + struct list_head *head; + struct mds_idmap_item *item, *new = NULL; + int found = 0; + + head = table + idmap_hash(id1); +again: + list_for_each_entry(item, head, hash) { + if (item->id1 == id1) { + found = 1; + break; + } + } + + if (!found) { + if (new == 
NULL) { + spin_unlock(&med->med_idmap_lock); + OBD_ALLOC(new, sizeof(*new)); + spin_lock(&med->med_idmap_lock); + if (!new) { + CERROR("fail to alloc %d\n", sizeof(*new)); + return -ENOMEM; + } + goto again; + } + new->id1 = id1; + new->id2 = id2; + list_add(&new->hash, head); + } else { + if (new) + OBD_FREE(new, sizeof(*new)); + if (item->id2 != id2) { + CWARN("mapping changed: %u ==> (%u -> %u)\n", + id1, item->id2, id2); + item->id2 = id2; + } + list_move(&item->hash, head); + } + + return 0; +} + +int mds_idmap_set(struct mds_export_data *med, __u32 id1, __u32 id2, + int is_uid_mapping) +{ + struct mds_idmap_table *idmap; + int rc; + ENTRY; + + spin_lock(&med->med_idmap_lock); + + idmap = __get_idmap_table(med, 1); + if (!idmap) + GOTO(out, rc = -ENOMEM); + + if (is_uid_mapping) + rc = __idmap_set_item(med, idmap->uidmap, id1, id2); + else + rc = __idmap_set_item(med, idmap->gidmap, id1, id2); + +out: + spin_unlock(&med->med_idmap_lock); + RETURN(rc); +} + +__u32 mds_idmap_get(struct mds_export_data *med, __u32 id, + int is_uid_mapping) +{ + struct mds_idmap_table *idmap; + struct list_head *table; + struct list_head *head; + struct mds_idmap_item *item; + int found = 0; + __u32 res; + + spin_lock(&med->med_idmap_lock); + idmap = __get_idmap_table(med, 0); + if (!idmap) + goto nllu; + + table = is_uid_mapping ? idmap->uidmap : idmap->gidmap; + head = table + idmap_hash(id); + + list_for_each_entry(item, head, hash) { + if (item->id1 == id) { + found = 1; + break; + } + } + if (!found) + goto nllu; + + res = item->id2; +out: + spin_unlock(&med->med_idmap_lock); + return res; +nllu: + res = is_uid_mapping ? 
med->med_nllu : med->med_nllg; + goto out; +} + +void mds_reverse_map_ugid(struct ptlrpc_request *req, + struct mds_body *body) +{ + struct mds_export_data *med = &req->rq_export->u.eu_mds_data; + + LASSERT(req->rq_remote); + + if (body->valid & OBD_MD_FLUID) + body->uid = mds_idmap_get(med, body->uid, 1); + + if (body->valid & OBD_MD_FLGID) + body->gid = mds_idmap_get(med, body->gid, 0); +} + static inline void drop_ucred_ginfo(struct lvfs_ucred *ucred) { if (ucred->luc_ginfo) { @@ -512,43 +701,164 @@ static inline void drop_ucred_ginfo(struct lvfs_ucred *ucred) } } +static inline void drop_ucred_lsd(struct lvfs_ucred *ucred) +{ + if (ucred->luc_lsd) { + mds_put_lsd(ucred->luc_lsd); + ucred->luc_lsd = NULL; + } +} + /* + * the heart of the uid/gid handling and security checking. + * * root could set any group_info if we allowed setgroups, while * normal user only could 'reduce' their group members -- which * is somewhat expensive. */ -int mds_init_ucred(struct lvfs_ucred *ucred, struct mds_req_sec_desc *rsd) +int mds_init_ucred(struct lvfs_ucred *ucred, + struct ptlrpc_request *req, + struct mds_req_sec_desc *rsd) { + struct mds_obd *mds = &req->rq_export->exp_obd->u.mds; + struct mds_export_data *med = &req->rq_export->u.eu_mds_data; + struct lustre_sec_desc *lsd; + ptl_nid_t peernid = req->rq_peer.peer_id.nid; struct group_info *gnew; - + unsigned int setuid, setgid, strong_sec; ENTRY; + LASSERT(ucred); LASSERT(rsd); + LASSERT(rsd->rsd_ngroups <= LUSTRE_MAX_GROUPS); + + strong_sec = (req->rq_auth_uid != -1); + LASSERT(!(req->rq_remote && !strong_sec)); + + /* sanity check & set local/remote flag */ + if (req->rq_remote) { + if (med->med_local) { + CWARN("exp %p: client on nid "LPX64" was local, " + "set to remote\n", req->rq_export, peernid); + med->med_local = 0; + } + } else { + if (!med->med_local) { + CWARN("exp %p: client on nid "LPX64" was remote, " + "set to local\n", req->rq_export, peernid); + med->med_local = 1; + } + } + + setuid = (rsd->rsd_fsuid != 
rsd->rsd_uid); + setgid = (rsd->rsd_fsgid != rsd->rsd_gid); + + /* deny setuid/setgid for remote client */ + if ((setuid || setgid) && !med->med_local) { + CWARN("deny setxid (%u/%u) from remote client "LPX64"\n", + setuid, setgid, peernid); + RETURN(-EPERM); + } + + /* take care of uid/gid mapping for client in remote realm */ + if (req->rq_remote) { + /* record the uid mapping here */ + mds_idmap_set(med, req->rq_auth_uid, rsd->rsd_uid, 1); + + /* now we act as the authenticated user */ + rsd->rsd_uid = rsd->rsd_fsuid = req->rq_auth_uid; + } else if (strong_sec && req->rq_auth_uid != rsd->rsd_uid) { + /* if we use strong authentication on this request, we + * expect the uid which client claimed is true. + * + * FIXME root's machine_credential in krb5 will be interpret + * as "nobody", which is not good for mds-mds and mds-ost + * connection. + */ + CWARN("nid "LPX64": UID %u was authenticated while client " + "claimed %u, set %u by force\n", + peernid, req->rq_auth_uid, rsd->rsd_uid, + req->rq_auth_uid); + rsd->rsd_uid = req->rq_auth_uid; + } + + /* now lsd come into play */ + ucred->luc_ginfo = NULL; + ucred->luc_lsd = lsd = mds_get_lsd(rsd->rsd_uid); + + if (lsd) { + if (req->rq_remote) { + /* record the gid mapping here */ + mds_idmap_set(med, lsd->lsd_gid, rsd->rsd_gid, 0); + /* now we act as the authenticated group */ + rsd->rsd_gid = rsd->rsd_fsgid = lsd->lsd_gid; + } else if (rsd->rsd_gid != lsd->lsd_gid) { + /* verify gid which client declared is true */ + CWARN("GID: %u while client declare %u, " + "set %u by force\n", + lsd->lsd_gid, rsd->rsd_gid, + lsd->lsd_gid); + rsd->rsd_gid = lsd->lsd_gid; + } + + if (lsd->lsd_ginfo) { + ucred->luc_ginfo = lsd->lsd_ginfo; + get_group_info(ucred->luc_ginfo); + } + + /* check permission of setuid */ + if (setuid) { + if (!lsd->lsd_allow_setuid) { + CWARN("mds blocked setuid attempt: %u -> %u\n", + rsd->rsd_uid, rsd->rsd_fsuid); + RETURN(-EPERM); + } + } + + /* check permission of setgid */ + if (setgid) { + if 
(!lsd->lsd_allow_setgid) { + CWARN("mds blocked setgid attempt: %u -> %u\n", + rsd->rsd_gid, rsd->rsd_fsgid); + RETURN(-EPERM); + } + } + } else { + /* failed to get lsd, right now we simply deny any access + * if strong authentication is used, + */ + if (strong_sec) { + CWARN("mds deny access without LSD\n"); + RETURN(-EPERM); + } + + /* and otherwise deny setuid/setgid attempt */ + if (setuid || setgid) { + CWARN("mds deny setuid/setgid without LSD\n"); + RETURN(-EPERM); + } + } + /* NOTE: we have already obtained supplementary groups, + * it will be retained across root_squash. will it be a + * security problem?? + */ + mds_squash_root(mds, rsd, &peernid); + + /* remove privilege for non-root user */ + if (rsd->rsd_fsuid) + rsd->rsd_cap &= ~CAP_FS_MASK; + + /* by now every fields in rsd have been granted */ ucred->luc_fsuid = rsd->rsd_fsuid; ucred->luc_fsgid = rsd->rsd_fsgid; ucred->luc_cap = rsd->rsd_cap; ucred->luc_uid = rsd->rsd_uid; - ucred->luc_ghash = mds_get_group_entry(NULL, rsd->rsd_uid); - ucred->luc_ginfo = NULL; - if (ucred->luc_ghash && ucred->luc_ghash->ge_group_info) { - ucred->luc_ginfo = ucred->luc_ghash->ge_group_info; - get_group_info(ucred->luc_ginfo); - } - - /* everything is done if we don't allow set groups */ - if (!mds_allow_setgroups()) + /* everything is done if we don't allow setgroups */ + if (!lsd || !lsd->lsd_allow_setgrp) RETURN(0); - if (rsd->rsd_ngroups > LUSTRE_MAX_GROUPS) { - CERROR("client provide too many groups: %d\n", - rsd->rsd_ngroups); - drop_ucred_ginfo(ucred); - mds_put_group_entry(NULL, ucred->luc_ghash); - RETURN(-EFAULT); - } - if (ucred->luc_uid == 0) { if (rsd->rsd_ngroups == 0) { drop_ucred_ginfo(ucred); @@ -559,12 +869,11 @@ int mds_init_ucred(struct lvfs_ucred *ucred, struct mds_req_sec_desc *rsd) if (!gnew) { CERROR("out of memory\n"); drop_ucred_ginfo(ucred); - mds_put_group_entry(NULL, ucred->luc_ghash); + drop_ucred_lsd(ucred); RETURN(-ENOMEM); } groups_from_buffer(gnew, rsd->rsd_groups); - /* can't rely 
on client to sort them */ - groups_sort(gnew); + groups_sort(gnew); /* can't rely on client */ drop_ucred_ginfo(ucred); ucred->luc_ginfo = gnew; @@ -586,7 +895,7 @@ int mds_init_ucred(struct lvfs_ucred *ucred, struct mds_req_sec_desc *rsd) if (!gnew) { CERROR("out of memory\n"); drop_ucred_ginfo(ucred); - mds_put_group_entry(NULL, ucred->luc_ghash); + drop_ucred_lsd(ucred); RETURN(-ENOMEM); } @@ -594,8 +903,8 @@ int mds_init_ucred(struct lvfs_ucred *ucred, struct mds_req_sec_desc *rsd) while (cur < rsd->rsd_ngroups) { if (groups_search(ginfo, rsd->rsd_groups[cur])) { GROUP_AT(gnew, set) = rsd->rsd_groups[cur]; - set++; - } + set++; + } cur++; } gnew->ngroups = set; @@ -609,11 +918,7 @@ int mds_init_ucred(struct lvfs_ucred *ucred, struct mds_req_sec_desc *rsd) void mds_exit_ucred(struct lvfs_ucred *ucred) { ENTRY; - - if (ucred->luc_ginfo) - put_group_info(ucred->luc_ginfo); - if (ucred->luc_ghash) - mds_put_group_entry(NULL, ucred->luc_ghash); - + drop_ucred_ginfo(ucred); + drop_ucred_lsd(ucred); EXIT; } diff --git a/lustre/mds/mds_lmv.c b/lustre/mds/mds_lmv.c index 0287b11..6685b2b 100644 --- a/lustre/mds/mds_lmv.c +++ b/lustre/mds/mds_lmv.c @@ -28,16 +28,22 @@ #define DEBUG_SUBSYSTEM S_MDS #include +#include +#include +#include +#include +#include +#include #include #include #include #include #include #include +#include #include "mds_internal.h" - /* * TODO: * - magic in mea struct @@ -110,6 +116,13 @@ int mds_md_connect(struct obd_device *obd, char *md_name) if (rc) GOTO(err_reg, rc); + if (mds->mds_mds_sec) { + rc = obd_set_info(mds->mds_md_exp, strlen("sec"), "sec", + strlen(mds->mds_mds_sec), mds->mds_mds_sec); + if (rc) + GOTO(err_reg, rc); + } + mds->mds_md_connected = 1; up(&mds->mds_md_sem); RETURN(0); @@ -952,10 +965,13 @@ int mds_lock_slave_objs(struct obd_device *obd, struct dentry *dentry, op_data->mea1 = mea; it.it_op = IT_UNLINK; + OBD_ALLOC(it.d.fs_data, sizeof(struct lustre_intent_data)); + rc = md_enqueue(mds->mds_md_exp, LDLM_IBITS, &it, LCK_EX, 
op_data, *rlockh, NULL, 0, ldlm_completion_ast, mds_blocking_ast, NULL); OBD_FREE(op_data, sizeof(*op_data)); + OBD_FREE(it.d.fs_data, sizeof(struct lustre_intent_data)); EXIT; cleanup: OBD_FREE(mea, mea_size); @@ -1133,7 +1149,6 @@ int mds_lock_and_check_slave(int offset, struct ptlrpc_request *req, CERROR("Can't unpack security desc\n"); GOTO(cleanup, rc = -EFAULT); } - mds_squash_root(&obd->u.mds, rsd, &req->rq_peer.peer_id.nid); body = lustre_swab_reqbuf(req, offset, sizeof(*body), lustre_swab_mds_body); @@ -1162,7 +1177,7 @@ int mds_lock_and_check_slave(int offset, struct ptlrpc_request *req, if (!S_ISDIR(dentry->d_inode->i_mode)) GOTO(cleanup, rc = 0); - rc = mds_init_ucred(&uc, rsd); + rc = mds_init_ucred(&uc, req, rsd); if (rc) { CERROR("can't init ucred\n"); GOTO(cleanup, rc); diff --git a/lustre/mds/mds_lov.c b/lustre/mds/mds_lov.c index a0f3880..bd3ed48 100644 --- a/lustre/mds/mds_lov.c +++ b/lustre/mds/mds_lov.c @@ -242,6 +242,16 @@ int mds_dt_connect(struct obd_device *obd, char * lov_name) RETURN(-ENOTCONN); } + if (mds->mds_ost_sec) { + rc = obd_set_info(mds->mds_dt_obd->obd_self_export, + strlen("sec"), "sec", + strlen(mds->mds_ost_sec), mds->mds_ost_sec); + if (rc) { + mds->mds_dt_obd = ERR_PTR(rc); + RETURN(rc); + } + } + CDEBUG(D_HA, "obd: %s osc: %s lov_name: %s\n", obd->obd_name, mds->mds_dt_obd->obd_name, lov_name); diff --git a/lustre/mds/mds_lsd.c b/lustre/mds/mds_lsd.c new file mode 100644 index 0000000..fbc3de3 --- /dev/null +++ b/lustre/mds/mds_lsd.c @@ -0,0 +1,240 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2004 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define DEBUG_SUBSYSTEM S_MDS + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "mds_internal.h" + +/* + * We need share hash table among the groups of MDSs (which server as the same + * lustre file system), maybe MDT? but there's lprocfs problems of putting this + * in MDT. so we make it global to the module. which brings the limitation that + * one node couldn't running multiple MDS which server as different Lustre FS. + * but which maybe not meaningful. 
+ */ + + +#define MDS_LSD_HASHSIZE (256) +static struct upcall_cache _lsd_cache; +static struct list_head _lsd_hashtable[MDS_LSD_HASHSIZE]; + +struct upcall_cache *__mds_get_global_lsd_cache() +{ + return &_lsd_cache; +} + +static unsigned int lsd_hash(struct upcall_cache *cache, __u64 key) +{ + LASSERT(cache == &_lsd_cache); + return ((__u32) key) & (MDS_LSD_HASHSIZE - 1); +} + +static struct upcall_cache_entry * +lsd_alloc_entry(struct upcall_cache *cache, __u64 key) +{ + struct lsd_cache_entry *entry; + ENTRY; + + OBD_ALLOC(entry, sizeof(*entry)); + if (!entry) { + CERROR("failed to alloc entry\n"); + RETURN(NULL); + } + upcall_cache_init_entry(cache, &entry->base, key); + + RETURN(&entry->base); +} + +static void lsd_free_entry(struct upcall_cache *cache, + struct upcall_cache_entry *entry) +{ + struct lsd_cache_entry *lentry; + + lentry = container_of(entry, struct lsd_cache_entry, base); + if (lentry->lsd.lsd_ginfo) + put_group_info(lentry->lsd.lsd_ginfo); + OBD_FREE(lentry, sizeof(*lentry)); +} + + +static int lsd_make_upcall(struct upcall_cache *cache, + struct upcall_cache_entry *entry) +{ + char *argv[4]; + char *envp[3]; + char uidstr[16]; + int rc; + ENTRY; + + snprintf(uidstr, 16, "%u", (__u32) entry->ue_key); + + argv[0] = cache->uc_upcall; + argv[1] = uidstr; + argv[2] = NULL; + + envp[0] = "HOME=/"; + envp[1] = "PATH=/sbin:/usr/sbin"; + envp[2] = NULL; + + rc = USERMODEHELPER(argv[0], argv, envp); + if (rc < 0) { + CERROR("Error invoking lsd upcall %s %s: %d; check " + "/proc/fs/lustre/mds/lsd_upcall\n", + argv[0], argv[1], rc); + } else { + CWARN("Invoked upcall %s %s\n", + argv[0], argv[1]); + } + RETURN(rc); +} + +static int lsd_parse_downcall(struct upcall_cache *cache, + struct upcall_cache_entry *entry, + void *args) +{ + struct lustre_sec_desc *lsd; + struct lsd_cache_entry *lentry; + struct lsd_downcall_args *lsd_args; + struct group_info *ginfo; + ENTRY; + + LASSERT(args); + + lentry = container_of(entry, struct lsd_cache_entry, base); + 
lsd = &lentry->lsd; + lsd_args = (struct lsd_downcall_args *) args; + LASSERT(lsd_args->err == 0); + LASSERT(lsd_args->ngroups <= NGROUPS_MAX); + + ginfo = groups_alloc(lsd_args->ngroups); + if (!ginfo) { + CERROR("can't alloc group_info for %d groups\n", + lsd_args->ngroups); + RETURN(-ENOMEM); + } + groups_from_buffer(ginfo, lsd_args->groups); + groups_sort(ginfo); + + lsd->lsd_uid = lsd_args->uid; + lsd->lsd_gid = lsd_args->gid; + lsd->lsd_ginfo = ginfo; + lsd->lsd_allow_setuid = lsd_args->allow_setuid; + lsd->lsd_allow_setgid = lsd_args->allow_setgid; + lsd->lsd_allow_setgrp = lsd_args->allow_setgrp; + + CWARN("LSD: uid %u gid %u ngroups %u, perm (%d/%d/%d)\n", + lsd->lsd_uid, lsd->lsd_gid, ginfo->ngroups, + lsd->lsd_allow_setuid, lsd->lsd_allow_setgid, + lsd->lsd_allow_setgrp); + RETURN(0); +} + +struct lustre_sec_desc * mds_get_lsd(__u32 uid) +{ + struct upcall_cache *cache = &_lsd_cache; + struct upcall_cache_entry *entry; + struct lsd_cache_entry *lentry; + + entry = upcall_cache_get_entry(cache, (__u64) uid); + if (!entry) + return NULL; + + lentry = container_of(entry, struct lsd_cache_entry, base); + return &lentry->lsd; +} + +void mds_put_lsd(struct lustre_sec_desc *lsd) +{ + struct lsd_cache_entry *lentry; + + LASSERT(lsd); + + lentry = container_of(lsd, struct lsd_cache_entry, lsd); + upcall_cache_put_entry(&lentry->base); +} + +int mds_init_lsd_cache() +{ + struct upcall_cache *cache = &_lsd_cache; + int i; + ENTRY; + + cache->uc_hashtable = _lsd_hashtable; + cache->uc_hashsize = MDS_LSD_HASHSIZE; + cache->uc_hashlock = RW_LOCK_UNLOCKED; + for (i = 0; i < cache->uc_hashsize; i++) + INIT_LIST_HEAD(&cache->uc_hashtable[i]); + cache->uc_name = "LSD_CACHE"; + + /* set default value, proc tunable */ + sprintf(cache->uc_upcall, "%s", "/sbin/lsd_upcall"); + cache->uc_entry_expire = 5 * 60; + cache->uc_acquire_expire = 5; + + cache->hash = lsd_hash; + cache->alloc_entry = lsd_alloc_entry; + cache->free_entry = lsd_free_entry; + cache->make_upcall = 
lsd_make_upcall; + cache->parse_downcall = lsd_parse_downcall; + + RETURN(0); +} + +void mds_flush_lsd(__u32 id) +{ + struct upcall_cache *cache = &_lsd_cache; + + if (id == -1) + upcall_cache_flush_idle(cache); + else + upcall_cache_flush_one(cache, (__u64) id); +} + +void mds_cleanup_lsd_cache() +{ + upcall_cache_flush_all(&_lsd_cache); +} diff --git a/lustre/mds/mds_open.c b/lustre/mds/mds_open.c index 9d947a1..8b66569 100644 --- a/lustre/mds/mds_open.c +++ b/lustre/mds/mds_open.c @@ -111,7 +111,7 @@ static void mds_mfd_destroy(struct mds_file_data *mfd) mds_mfd_put(mfd); } - +#ifdef IFILTERDATA_ACTUALLY_USED /* Caller must hold mds->mds_epoch_sem */ static int mds_alloc_filterdata(struct inode *inode) { @@ -131,6 +131,7 @@ static void mds_free_filterdata(struct inode *inode) inode->i_filterdata = NULL; iput(inode); } +#endif /*IFILTERDATA_ACTUALLY_USED*/ /* Write access to a file: executors cause a negative count, * writers a positive count. The semaphore is needed to perform @@ -155,7 +156,7 @@ static int mds_get_write_access(struct mds_obd *mds, struct inode *inode, RETURN(-ETXTBSY); } - +#ifdef IFILTERDATA_ACTUALLY_USED if (MDS_FILTERDATA(inode) && MDS_FILTERDATA(inode)->io_epoch != 0) { CDEBUG(D_INODE, "continuing MDS epoch "LPU64" for ino %lu/%u\n", MDS_FILTERDATA(inode)->io_epoch, inode->i_ino, @@ -169,14 +170,17 @@ static int mds_get_write_access(struct mds_obd *mds, struct inode *inode, rc = -ENOMEM; goto out; } +#endif /*IFILTERDATA_ACTUALLY_USED*/ if (epoch > mds->mds_io_epoch) mds->mds_io_epoch = epoch; else mds->mds_io_epoch++; +#ifdef IFILTERDATA_ACTUALLY_USED MDS_FILTERDATA(inode)->io_epoch = mds->mds_io_epoch; CDEBUG(D_INODE, "starting MDS epoch "LPU64" for ino %lu/%u\n", mds->mds_io_epoch, inode->i_ino, inode->i_generation); out: +#endif /*IFILTERDATA_ACTUALLY_USED*/ if (rc == 0) atomic_inc(&inode->i_writecount); up(&mds->mds_epoch_sem); @@ -201,7 +205,9 @@ static int mds_put_write_access(struct mds_obd *mds, struct inode *inode, if (!unlinking 
&& !(body->valid & OBD_MD_FLSIZE)) GOTO(out, rc = EAGAIN); #endif +#ifdef IFILTERDATA_ACTUALLY_USED mds_free_filterdata(inode); +#endif out: up(&mds->mds_epoch_sem); return rc; @@ -257,7 +263,9 @@ static struct mds_file_data *mds_dentry_open(struct dentry *dentry, error = mds_get_write_access(mds, dentry->d_inode, 0); if (error) GOTO(cleanup_mfd, error); +#ifdef IFILTERDATA_ACTUALLY_USED body->io_epoch = MDS_FILTERDATA(dentry->d_inode)->io_epoch; +#endif /*IFILTERDATA_ACTUALLY_USED*/ } else if (flags & FMODE_EXEC) { error = mds_deny_write_access(mds, dentry->d_inode); if (error) @@ -666,6 +674,13 @@ static int mds_finish_open(struct ptlrpc_request *req, struct dentry *dchild, } } } + rc = mds_pack_acl(obd, req->rq_repmsg, 3, body, dchild->d_inode); + if (rc < 0) { + CERROR("mds_pack_acl: rc = %d\n", rc); + up(&dchild->d_inode->i_sem); + RETURN(rc); + } + /* If the inode has no EA data, then MDSs hold size, mtime */ if (S_ISREG(dchild->d_inode->i_mode) && !(body->valid & OBD_MD_FLEASIZE)) { diff --git a/lustre/mds/mds_reint.c b/lustre/mds/mds_reint.c index 8753866..a4e7a9b 100644 --- a/lustre/mds/mds_reint.c +++ b/lustre/mds/mds_reint.c @@ -32,6 +32,7 @@ #include #include +#include #include #include #include @@ -42,6 +43,7 @@ #include #include #include +#include #include "mds_internal.h" struct mds_logcancel_data { @@ -191,6 +193,10 @@ out_commit: * chown_common and inode_setattr * utimes and inode_setattr */ +#ifndef ATTR_RAW +/* Just for the case if we have some clients that know about ATTR_RAW */ +#define ATTR_RAW 8192 +#endif int mds_fix_attr(struct inode *inode, struct mds_update_record *rec) { time_t now = LTIME_S(CURRENT_TIME); @@ -200,6 +206,7 @@ int mds_fix_attr(struct inode *inode, struct mds_update_record *rec) ENTRY; /* only fix up attrs if the client VFS didn't already */ + if (!(ia_valid & ATTR_RAW)) RETURN(0); @@ -296,10 +303,10 @@ void mds_steal_ack_locks(struct ptlrpc_request *req) if (oldrep->rs_xid != req->rq_xid) continue; - if 
(oldrep->rs_msg.opc != req->rq_reqmsg->opc) + if (oldrep->rs_msg->opc != req->rq_reqmsg->opc) CERROR ("Resent req xid "LPX64" has mismatched opc: " "new %d old %d\n", req->rq_xid, - req->rq_reqmsg->opc, oldrep->rs_msg.opc); + req->rq_reqmsg->opc, oldrep->rs_msg->opc); svc = oldrep->rs_srv_ni->sni_service; spin_lock (&svc->srv_lock); @@ -308,7 +315,7 @@ void mds_steal_ack_locks(struct ptlrpc_request *req) CWARN("Stealing %d locks from rs %p x"LPD64".t"LPD64 " o%d NID %s\n", oldrep->rs_nlocks, oldrep, - oldrep->rs_xid, oldrep->rs_transno, oldrep->rs_msg.opc, + oldrep->rs_xid, oldrep->rs_transno, oldrep->rs_msg->opc, ptlrpc_peernid2str(&exp->exp_connection->c_peer, str)); for (i = 0; i < oldrep->rs_nlocks; i++) @@ -444,25 +451,40 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset, else /* setattr */ rc = fsfilt_setattr(obd, de, handle, &rec->ur_iattr, 0); - if (rc == 0 && (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) && - rec->ur_eadata != NULL) { - struct lov_stripe_md *lsm = NULL; - - rc = ll_permission(inode, MAY_WRITE, NULL); - if (rc < 0) - GOTO(cleanup, rc); + if (rc == 0) { + if (rec->ur_iattr.ia_valid & ATTR_EA) { + int flags = (int)rec->ur_iattr.ia_attr_flags; + + rc = -EOPNOTSUPP; + if (inode->i_op && inode->i_op->setxattr) + rc = inode->i_op->setxattr(de, rec->ur_eadata, + rec->ur_ea2data, rec->ur_ea2datalen, + flags); + } else if (rec->ur_iattr.ia_valid & ATTR_EA_RM) { + rc = -EOPNOTSUPP; + if (inode->i_op && inode->i_op->removexattr) + rc = inode->i_op->removexattr(de, + rec->ur_eadata); + } else if ((S_ISREG(inode->i_mode) || + S_ISDIR(inode->i_mode)) && rec->ur_eadata != NULL) { + struct lov_stripe_md *lsm = NULL; + + rc = ll_permission(inode, MAY_WRITE, NULL); + if (rc < 0) + GOTO(cleanup, rc); - rc = obd_iocontrol(OBD_IOC_LOV_SETSTRIPE, mds->mds_dt_exp, - 0, &lsm, rec->ur_eadata); - if (rc) - GOTO(cleanup, rc); + rc = obd_iocontrol(OBD_IOC_LOV_SETSTRIPE, mds->mds_dt_exp, + 0, &lsm, rec->ur_eadata); + if (rc) + GOTO(cleanup, 
rc); - obd_free_memmd(mds->mds_dt_exp, &lsm); + obd_free_memmd(mds->mds_dt_exp, &lsm); - rc = fsfilt_set_md(obd, inode, handle, rec->ur_eadata, - rec->ur_eadatalen); - if (rc) - GOTO(cleanup, rc); + rc = fsfilt_set_md(obd, inode, handle, rec->ur_eadata, + rec->ur_eadatalen); + if (rc) + GOTO(cleanup, rc); + } } body = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*body)); @@ -476,6 +498,10 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset, if (rec->ur_iattr.ia_valid & (ATTR_ATIME | ATTR_ATIME_SET)) body->valid |= OBD_MD_FLATIME; + /* The logcookie should be no use anymore, why nobody remove + * following code block? + */ + LASSERT(rec->ur_cookielen == 0); if (rc == 0 && rec->ur_cookielen && !IS_ERR(mds->mds_dt_obd)) { OBD_ALLOC(mlcd, sizeof(*mlcd) + rec->ur_cookielen + rec->ur_eadatalen); @@ -2845,24 +2871,31 @@ static int mds_check_for_rename(struct obd_device *obd, mds_pack_dentry2id(obd, &op_data->id1, dentry, 1); it.it_op = IT_UNLINK; + OBD_ALLOC(it.d.fs_data, sizeof(struct lustre_intent_data)); + if (!it.d.fs_data) + RETURN(-ENOMEM); rc = md_enqueue(mds->mds_md_exp, LDLM_IBITS, &it, LCK_EX, op_data, rlockh, NULL, 0, ldlm_completion_ast, mds_blocking_ast, NULL); OBD_FREE(op_data, sizeof(*op_data)); - if (rc) - RETURN(rc); + if (rc) { + OBD_FREE(it.d.fs_data, + sizeof(struct lustre_intent_data)); + RETURN(rc); + } if (rlockh->cookie != 0) ldlm_lock_decref(rlockh, LCK_EX); - if (it.d.lustre.it_data) { - req = (struct ptlrpc_request *)it.d.lustre.it_data; + if (LUSTRE_IT(&it)->it_data) { + req = (struct ptlrpc_request *)LUSTRE_IT(&it)->it_data; ptlrpc_req_finished(req); } - if (it.d.lustre.it_status) - rc = it.d.lustre.it_status; + if (LUSTRE_IT(&it)->it_status) + rc = LUSTRE_IT(&it)->it_status; + OBD_FREE(it.d.fs_data, sizeof(struct lustre_intent_data)); OBD_FREE(rlockh, handle_size); } RETURN(rc); diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index 21090cf..90a61e1 100644 --- a/lustre/obdclass/class_obd.c +++ 
b/lustre/obdclass/class_obd.c @@ -459,12 +459,37 @@ int obd_proc_read_pinger(char *page, char **start, off_t off, int count, ); } +#if ENABLE_GSS +/* FIXME move these staff to proper place */ +int (*lustre_secinit_downcall_handler)(const char *buffer, + long count) = NULL; +EXPORT_SYMBOL(lustre_secinit_downcall_handler); + +int obd_proc_write_secinit(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + int rc = 0; + + if (lustre_secinit_downcall_handler) { + rc = (*lustre_secinit_downcall_handler)((char *)buffer, count); + if (rc) { + LASSERT(rc < 0); + return rc; + } + } + return (int)count; +} +#endif + /* Root for /proc/fs/lustre */ struct proc_dir_entry *proc_lustre_root = NULL; struct lprocfs_vars lprocfs_base[] = { { "version", obd_proc_read_version, NULL, NULL }, { "kernel_version", obd_proc_read_kernel_version, NULL, NULL }, { "pinger", obd_proc_read_pinger, NULL, NULL }, +#if ENABLE_GSS + { "secinit", NULL, obd_proc_write_secinit, NULL }, +#endif { 0 } }; diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c index 26c96e6..2bd9b91 100644 --- a/lustre/obdclass/genops.c +++ b/lustre/obdclass/genops.c @@ -548,6 +548,7 @@ void class_import_put(struct obd_import *import) if (import->imp_connection) ptlrpc_put_connection_superhack(import->imp_connection); + LASSERT(!import->imp_sec); while (!list_empty(&import->imp_conn_list)) { struct obd_import_conn *imp_conn; @@ -575,6 +576,7 @@ struct obd_import *class_new_import(void) INIT_LIST_HEAD(&imp->imp_replay_list); INIT_LIST_HEAD(&imp->imp_sending_list); INIT_LIST_HEAD(&imp->imp_delayed_list); + INIT_LIST_HEAD(&imp->imp_rawrpc_list); spin_lock_init(&imp->imp_lock); imp->imp_conn_cnt = 0; imp->imp_max_transno = 0; diff --git a/lustre/obdfilter/filter_log.c b/lustre/obdfilter/filter_log.c index 0237fc0..9d7afe9 100644 --- a/lustre/obdfilter/filter_log.c +++ b/lustre/obdfilter/filter_log.c @@ -43,11 +43,14 @@ int filter_log_sz_change(struct llog_handle *cathandle, struct inode 
*inode) { struct llog_size_change_rec *lsc; - int rc; +#ifdef IFILTERDATA_ACTUALLY_USED struct ost_filterdata *ofd; +#endif + int rc; ENTRY; down(&inode->i_sem); +#ifdef IFILTERDATA_ACTUALLY_USED ofd = inode->i_filterdata; if (ofd && ofd->ofd_epoch >= io_epoch) { @@ -68,6 +71,7 @@ int filter_log_sz_change(struct llog_handle *cathandle, inode->i_filterdata = ofd; ofd->ofd_epoch = io_epoch; } +#endif /* the decision to write a record is now made, unlock */ up(&inode->i_sem); @@ -88,7 +92,9 @@ int filter_log_sz_change(struct llog_handle *cathandle, rc = 0; } - out: +#ifdef IFILTERDATA_ACTUALLY_USED +out: +#endif RETURN(rc); } struct obd_llogs * filter_grab_llog_for_group(struct obd_device *, diff --git a/lustre/osc/osc_lib.c b/lustre/osc/osc_lib.c index 79b4b6b..cdf2ae6 100644 --- a/lustre/osc/osc_lib.c +++ b/lustre/osc/osc_lib.c @@ -26,6 +26,8 @@ #ifdef __KERNEL__ # include +# include +# include # include # include # include diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index f6f1a6c..b41258e 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include @@ -2880,6 +2881,31 @@ static int osc_set_info(struct obd_export *exp, obd_count keylen, RETURN(0); } + if (keylen == strlen("sec") && memcmp(key, "sec", keylen) == 0) { + struct client_obd *cli = &exp->exp_obd->u.cli; + + if (vallen == strlen("null") && + memcmp(val, "null", vallen) == 0) { + cli->cl_sec_flavor = PTLRPC_SEC_NULL; + cli->cl_sec_subflavor = 0; + RETURN(0); + } + if (vallen == strlen("krb5i") && + memcmp(val, "krb5i", vallen) == 0) { + cli->cl_sec_flavor = PTLRPC_SEC_GSS; + cli->cl_sec_subflavor = PTLRPC_SEC_GSS_KRB5I; + RETURN(0); + } + if (vallen == strlen("krb5p") && + memcmp(val, "krb5p", vallen) == 0) { + cli->cl_sec_flavor = PTLRPC_SEC_GSS; + cli->cl_sec_subflavor = PTLRPC_SEC_GSS_KRB5P; + RETURN(0); + } + CERROR("unrecognized security type %s\n", (char*) val); + RETURN(-EINVAL); + } + if 
(keylen < strlen("mds_conn") || memcmp(key, "mds_conn", strlen("mds_conn")) != 0) RETURN(-EINVAL); diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index c615015..d18919a 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -1005,6 +1005,11 @@ int ost_msg_check_version(struct lustre_msg *msg) CERROR("bad opc %u version %08x, expecting %08x\n", msg->opc, msg->version, LUSTRE_LOG_VERSION); break; + case SEC_INIT: + case SEC_INIT_CONTINUE: + case SEC_FINI: + rc = 0; + break; default: CERROR("OST unexpected opcode %d\n", msg->opc); rc = -ENOTSUPP; @@ -1029,6 +1034,13 @@ int ost_handle(struct ptlrpc_request *req) RETURN(rc); } + /* Security opc should NOT trigger any recovery events */ + if (req->rq_reqmsg->opc == SEC_INIT || + req->rq_reqmsg->opc == SEC_INIT_CONTINUE || + req->rq_reqmsg->opc == SEC_FINI) { + GOTO(out, rc = 0); + } + /* XXX identical to MDS */ if (req->rq_reqmsg->opc != OST_CONNECT) { struct obd_device *obd; diff --git a/lustre/ptlrpc/autoMakefile.am b/lustre/ptlrpc/autoMakefile.am index 63b30e9..f2105e8 100644 --- a/lustre/ptlrpc/autoMakefile.am +++ b/lustre/ptlrpc/autoMakefile.am @@ -16,7 +16,7 @@ LDLM_COMM_SOURCES= $(top_srcdir)/lustre/ldlm/l_lock.c \ COMMON_SOURCES = client.c recover.c connection.c niobuf.c pack_generic.c \ events.c ptlrpc_module.c service.c pinger.c recov_thread.c llog_net.c \ - llog_client.c llog_server.c import.c ptlrpcd.c pers.c \ + llog_client.c llog_server.c import.c ptlrpcd.c pers.c \ ptlrpc_internal.h $(LDLM_COMM_SOURCES) if LIBLUSTRE diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 2d03035..1f6127e 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "ptlrpc_internal.h" @@ -181,6 +182,9 @@ void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc) EXIT; } +/* FIXME prep_req now should return error code other than NULL. 
but + * this is called everywhere :( + */ struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, __u32 version, int opcode, int count, int *lengths, char **bufs) @@ -197,11 +201,25 @@ struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, __u32 version, RETURN(NULL); } + request->rq_import = class_import_get(imp); + + rc = ptlrpcs_req_get_cred(request); + if (rc) { + CDEBUG(D_SEC, "failed to get credential\n"); + GOTO(out_free, rc); + } + + /* just a try on refresh, but we proceed even if it failed */ + rc = ptlrpcs_cred_refresh(request->rq_cred); + if (!ptlrpcs_cred_is_uptodate(request->rq_cred)) { + CERROR("req %p: failed to refresh cred %p, rc %d, continue\n", + request, request->rq_cred, rc); + } + rc = lustre_pack_request(request, count, lengths, bufs); if (rc) { CERROR("cannot pack request %d\n", rc); - OBD_FREE(request, sizeof(*request)); - RETURN(NULL); + GOTO(out_cred, rc); } request->rq_reqmsg->version |= version; @@ -212,7 +230,6 @@ struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, __u32 version, request->rq_send_state = LUSTRE_IMP_FULL; request->rq_type = PTL_RPC_MSG_REQUEST; - request->rq_import = class_import_get(imp); request->rq_req_cbid.cbid_fn = request_out_callback; request->rq_req_cbid.cbid_arg = request; @@ -237,6 +254,12 @@ struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, __u32 version, request->rq_reqmsg->opc = opcode; request->rq_reqmsg->flags = 0; RETURN(request); +out_cred: + ptlrpcs_req_drop_cred(request); +out_free: + class_import_put(imp); + OBD_FREE(request, sizeof(*request)); + RETURN(NULL); } struct ptlrpc_request_set *ptlrpc_prep_set(void) @@ -469,8 +492,22 @@ static int after_reply(struct ptlrpc_request *req) /* Clear reply swab mask; this is a new reply in sender's byte order */ req->rq_rep_swab_mask = 0; #endif - LASSERT (req->rq_nob_received <= req->rq_replen); - rc = lustre_unpack_msg(req->rq_repmsg, req->rq_nob_received); + LASSERT (req->rq_nob_received <= req->rq_repbuf_len); + rc = 
ptlrpcs_cli_unwrap_reply(req); + if (rc) { + CERROR("verify reply error: %d\n", rc); + RETURN(rc); + } + /* unwrap_reply may request rpc be resend */ + if (req->rq_ptlrpcs_restart) { + req->rq_resend = 1; + RETURN(0); + } + + /* unwrap_reply will set rq_replen as the actual received + * lustre_msg length + */ + rc = lustre_unpack_msg(req->rq_repmsg, req->rq_replen); if (rc) { CERROR("unpack_rep failed: %d\n", rc); RETURN(-EPROTO); @@ -696,8 +733,10 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set) req->rq_waiting = 0; if (req->rq_resend) { - lustre_msg_add_flags(req->rq_reqmsg, - MSG_RESENT); + if (!req->rq_ptlrpcs_restart) + lustre_msg_add_flags( + req->rq_reqmsg, + MSG_RESENT); if (req->rq_bulk) { __u64 old_xid = req->rq_xid; @@ -1022,6 +1061,7 @@ static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked) LASSERTF(request->rq_rqbd == NULL, "req %p\n",request);/* client-side */ LASSERTF(list_empty(&request->rq_list), "req %p\n", request); LASSERTF(list_empty(&request->rq_set_chain), "req %p\n", request); + LASSERT(request->rq_cred); /* We must take it off the imp_replay_list first. Otherwise, we'll set * request->rq_reqmsg to NULL while osc_close is dereferencing it. 
*/ @@ -1042,14 +1082,11 @@ static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked) LBUG(); } - if (request->rq_repmsg != NULL) { - OBD_FREE(request->rq_repmsg, request->rq_replen); - request->rq_repmsg = NULL; - } - if (request->rq_reqmsg != NULL) { - OBD_FREE(request->rq_reqmsg, request->rq_reqlen); - request->rq_reqmsg = NULL; - } + if (request->rq_repbuf != NULL) + ptlrpcs_cli_free_repbuf(request); + if (request->rq_reqbuf != NULL) + ptlrpcs_cli_free_reqbuf(request); + if (request->rq_export != NULL) { class_export_put(request->rq_export); request->rq_export = NULL; @@ -1061,6 +1098,7 @@ static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked) if (request->rq_bulk != NULL) ptlrpc_free_bulk(request->rq_bulk); + ptlrpcs_req_drop_cred(request); OBD_FREE(request, sizeof(*request)); EXIT; } @@ -1399,7 +1437,8 @@ restart: } if (req->rq_resend) { - lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT); + if (!req->rq_ptlrpcs_restart) + lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT); if (req->rq_bulk != NULL) ptlrpc_unregister_bulk (req); @@ -1537,8 +1576,8 @@ static int ptlrpc_replay_interpret(struct ptlrpc_request *req, /* Clear reply swab mask; this is a new reply in sender's byte order */ req->rq_rep_swab_mask = 0; #endif - LASSERT (req->rq_nob_received <= req->rq_replen); - rc = lustre_unpack_msg(req->rq_repmsg, req->rq_nob_received); + LASSERT (req->rq_nob_received <= req->rq_repbuf_len); + rc = lustre_unpack_msg(req->rq_repmsg, req->rq_replen); if (rc) { CERROR("unpack_rep failed: %d\n", rc); GOTO(out, rc = -EPROTO); @@ -1657,6 +1696,18 @@ void ptlrpc_abort_inflight(struct obd_import *imp) spin_unlock (&req->rq_lock); } + list_for_each_safe(tmp, n, &imp->imp_rawrpc_list) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, rq_list); + + DEBUG_REQ(D_HA, req, "aborting raw rpc"); + + spin_lock(&req->rq_lock); + req->rq_err = 1; + ptlrpc_wake_client_req(req); + spin_unlock(&req->rq_lock); + } + /* Last chance to 
free reqs left on the replay list, but we * will still leak reqs that haven't comitted. */ if (imp->imp_replayable) diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c index 270351d..37a7f94 100644 --- a/lustre/ptlrpc/events.c +++ b/lustre/ptlrpc/events.c @@ -88,9 +88,9 @@ void reply_in_callback(ptl_event_t *ev) LASSERT (ev->type == PTL_EVENT_PUT_END || ev->type == PTL_EVENT_UNLINK); LASSERT (ev->unlinked); - LASSERT (ev->md.start == req->rq_repmsg); + LASSERT (ev->md.start == req->rq_repbuf); LASSERT (ev->offset == 0); - LASSERT (ev->mlength <= req->rq_replen); + LASSERT (ev->mlength <= req->rq_repbuf_len); DEBUG_REQ((ev->ni_fail_type == PTL_NI_OK) ? D_NET : D_ERROR, req, "type %d, status %d", ev->type, ev->ni_fail_type); @@ -207,10 +207,10 @@ void request_in_callback(ptl_event_t *ev) * flags are reset and scalars are zero. We only set the message * size to non-zero if this was a successful receive. */ req->rq_xid = ev->match_bits; - req->rq_reqmsg = ev->md.start + ev->offset; + req->rq_reqbuf = ev->md.start + ev->offset; if (ev->type == PTL_EVENT_PUT_END && ev->ni_fail_type == PTL_NI_OK) - req->rq_reqlen = ev->mlength; + req->rq_reqbuf_len = ev->mlength; do_gettimeofday(&req->rq_arrival_time); req->rq_peer.peer_id = ev->initiator; req->rq_peer.peer_ni = rqbd->rqbd_srv_ni->sni_ni; diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index d2ccb41..122f878 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "ptlrpc_internal.h" @@ -273,10 +274,15 @@ static int import_select_connection(struct obd_import *imp) list_add_tail(&tmp->oic_item, &imp->imp_conn_list); } - /* switch connection, don't mind if it's same as the current one */ - if (imp->imp_connection) - ptlrpc_put_connection(imp->imp_connection); - imp->imp_connection = ptlrpc_connection_addref(imp_conn->oic_conn); + /* switch connection if we chose a new one */ + if (imp->imp_connection != imp_conn->oic_conn) { + if 
(imp->imp_connection) { + ptlrpcs_sec_invalidate_cache(imp->imp_sec); + ptlrpc_put_connection(imp->imp_connection); + } + imp->imp_connection = + ptlrpc_connection_addref(imp_conn->oic_conn); + } dlmexp = class_conn2export(&imp->imp_dlm_handle); LASSERT(dlmexp != NULL); @@ -304,13 +310,15 @@ int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid) __u64 committed_before_reconnect = 0; struct ptlrpc_request *request; int size[] = {sizeof(imp->imp_target_uuid), - sizeof(obd->obd_uuid), - sizeof(imp->imp_dlm_handle), - sizeof(unsigned long)}; + sizeof(obd->obd_uuid), + sizeof(imp->imp_dlm_handle), + sizeof(unsigned long), + sizeof(__u32) * 2}; char *tmp[] = {imp->imp_target_uuid.uuid, obd->obd_uuid.uuid, (char *)&imp->imp_dlm_handle, - (char *)&imp->imp_connect_flags}; /* XXX: make this portable! */ + (char *)&imp->imp_connect_flags, /* XXX: make this portable! */ + (char*) &obd->u.cli.cl_nllu}; struct ptlrpc_connect_async_args *aa; unsigned long flags; @@ -356,8 +364,10 @@ int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid) if (rc) GOTO(out, rc); + LASSERT(imp->imp_sec); + request = ptlrpc_prep_req(imp, LUSTRE_OBD_VERSION, - imp->imp_connect_op, 4, size, tmp); + imp->imp_connect_op, 5, size, tmp); if (!request) GOTO(out, rc = -ENOMEM); diff --git a/lustre/ptlrpc/lproc_ptlrpc.c b/lustre/ptlrpc/lproc_ptlrpc.c index 7054f99..370fe17 100644 --- a/lustre/ptlrpc/lproc_ptlrpc.c +++ b/lustre/ptlrpc/lproc_ptlrpc.c @@ -78,6 +78,9 @@ struct ll_rpc_opcode { { PTLBD_DISCONNECT, "ptlbd_disconnect" }, { OBD_PING, "obd_ping" }, { OBD_LOG_CANCEL, "llog_origin_handle_cancel"}, + { SEC_INIT, "sec_init"}, + { SEC_INIT_CONTINUE,"sec_init_continue"}, + { SEC_FINI, "sec_fini"}, }; const char* ll_opcode2str(__u32 opcode) diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index 6de6be6..d03f2ed 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -24,10 +24,12 @@ #ifndef __KERNEL__ #include #endif +#include #include #include #include #include 
+#include #include "ptlrpc_internal.h" static int ptl_send_buf (ptl_handle_md_t *mdh, void *base, int len, @@ -311,14 +313,15 @@ int ptlrpc_send_reply (struct ptlrpc_request *req, int may_be_difficult) int rc; /* We must already have a reply buffer (only ptlrpc_error() may be - * called without one). We must also have a request buffer which + * called without one). We usually also have a request buffer which * is either the actual (swabbed) incoming request, or a saved copy - * if this is a req saved in target_queue_final_reply(). */ - LASSERT (req->rq_reqmsg != NULL); + * if this is a req saved in target_queue_final_reply(). but this + * will not be true since some security handling may skip the reqmsg + * setting and prepare reply under normal ptlrpc layer */ LASSERT (rs != NULL); LASSERT (req->rq_repmsg != NULL); LASSERT (may_be_difficult || !rs->rs_difficult); - LASSERT (req->rq_repmsg == &rs->rs_msg); + LASSERT (req->rq_repmsg == rs->rs_msg); LASSERT (rs->rs_cb_id.cbid_fn == reply_out_callback); LASSERT (rs->rs_cb_id.cbid_arg == rs); @@ -328,7 +331,7 @@ int ptlrpc_send_reply (struct ptlrpc_request *req, int may_be_difficult) req->rq_repmsg->type = req->rq_type; req->rq_repmsg->status = req->rq_status; - req->rq_repmsg->opc = req->rq_reqmsg->opc; + req->rq_repmsg->opc = req->rq_reqmsg ? req->rq_reqmsg->opc : 0; if (req->rq_export == NULL) conn = ptlrpc_get_connection(&req->rq_peer, NULL); @@ -337,10 +340,17 @@ int ptlrpc_send_reply (struct ptlrpc_request *req, int may_be_difficult) atomic_inc (&svc->srv_outstanding_replies); - rc = ptl_send_buf (&rs->rs_md_h, req->rq_repmsg, req->rq_replen, + rc = svcsec_authorize(req); + if (rc) { + CERROR("Error wrap reply message "LPX64"\n", req->rq_xid); + goto out; + } + + rc = ptl_send_buf (&rs->rs_md_h, rs->rs_repbuf, rs->rs_repdata_len, rs->rs_difficult ? 
PTL_ACK_REQ : PTL_NOACK_REQ, &rs->rs_cb_id, conn, svc->srv_rep_portal, req->rq_xid); +out: if (rc != 0) { atomic_dec (&svc->srv_outstanding_replies); @@ -405,12 +415,21 @@ int ptl_send_rpc(struct ptlrpc_request *request) request->rq_reqmsg->handle = request->rq_import->imp_remote_handle; request->rq_reqmsg->type = PTL_RPC_MSG_REQUEST; request->rq_reqmsg->conn_cnt = request->rq_import->imp_conn_cnt; - + + /* wrap_request might need to refresh gss cred, if this is called + * in ptlrpcd then the whole daemon thread will be waiting on + * gss negotiate rpc. FIXME + */ + rc = ptlrpcs_cli_wrap_request(request); + if (rc) + GOTO(cleanup_bulk, rc); + LASSERT (request->rq_replen != 0); - if (request->rq_repmsg == NULL) - OBD_ALLOC(request->rq_repmsg, request->rq_replen); - if (request->rq_repmsg == NULL) - GOTO(cleanup_bulk, rc = -ENOMEM); + if (request->rq_repbuf == NULL) { + rc = ptlrpcs_cli_alloc_repbuf(request, request->rq_replen); + if (rc) + GOTO(cleanup_bulk, rc); + } rc = PtlMEAttach(connection->c_peer.peer_ni->pni_ni_h, request->rq_reply_portal, /* XXX FIXME bug 249 */ @@ -431,11 +450,12 @@ int ptl_send_rpc(struct ptlrpc_request *request) request->rq_timedout = 0; request->rq_net_err = 0; request->rq_resend = 0; + request->rq_ptlrpcs_restart = 0; request->rq_restart = 0; spin_unlock_irqrestore (&request->rq_lock, flags); - reply_md.start = request->rq_repmsg; - reply_md.length = request->rq_replen; + reply_md.start = request->rq_repbuf; + reply_md.length = request->rq_repbuf_len; reply_md.threshold = 1; reply_md.options = PTLRPC_MD_OPTIONS | PTL_MD_OP_PUT; reply_md.user_ptr = &request->rq_reply_cbid; @@ -460,7 +480,7 @@ int ptl_send_rpc(struct ptlrpc_request *request) request->rq_sent = LTIME_S(CURRENT_TIME); ptlrpc_pinger_sending_on_import(request->rq_import); rc = ptl_send_buf(&request->rq_req_md_h, - request->rq_reqmsg, request->rq_reqlen, + request->rq_reqbuf, request->rq_reqdata_len, PTL_NOACK_REQ, &request->rq_req_cbid, connection, request->rq_request_portal, 
@@ -482,8 +502,7 @@ int ptl_send_rpc(struct ptlrpc_request *request) LASSERT (!request->rq_receiving_reply); cleanup_repmsg: - OBD_FREE(request->rq_repmsg, request->rq_replen); - request->rq_repmsg = NULL; + ptlrpcs_cli_free_repbuf(request); cleanup_bulk: if (request->rq_bulk != NULL) @@ -537,3 +556,163 @@ int ptlrpc_register_rqbd (struct ptlrpc_request_buffer_desc *rqbd) return (-ENOMEM); } + +static int rawrpc_timedout(void *data) +{ + struct ptlrpc_request *req = (struct ptlrpc_request *) data; + unsigned long flags; + + spin_lock_irqsave(&req->rq_lock, flags); + if (!req->rq_replied) + req->rq_timedout = 1; + spin_unlock_irqrestore(&req->rq_lock, flags); + + return 1; +} + +/* to make things as simple as possible */ +static int rawrpc_check_reply(struct ptlrpc_request *req) +{ + unsigned long flags; + int rc; + + spin_lock_irqsave (&req->rq_lock, flags); + rc = req->rq_replied || req->rq_net_err || req->rq_err || + req->rq_resend || req->rq_restart; + spin_unlock_irqrestore(&req->rq_lock, flags); + return rc; +} + +/* + * Construct a fake ptlrpc_request to do the work, in order to + * user the existing callback/wakeup facilities + */ +int ptlrpc_do_rawrpc(struct obd_import *imp, + char *reqbuf, int reqlen, + char *repbuf, int *replenp, + int timeout) +{ + struct ptlrpc_connection *conn; + struct ptlrpc_request request; /* just a fake one */ + ptl_handle_me_t reply_me_h; + ptl_md_t reply_md, req_md; + struct l_wait_info lwi; + unsigned long irq_flags; + int rc; + ENTRY; + + LASSERT(imp); + class_import_get(imp); + if (imp->imp_state == LUSTRE_IMP_CLOSED) { + CWARN("raw rpc on closed imp(=>%s)? 
send anyway\n", + imp->imp_target_uuid.uuid); + } + + conn = imp->imp_connection; + + /* initialize request */ + memset(&request, 0, sizeof(request)); + request.rq_req_cbid.cbid_fn = request_out_callback; + request.rq_req_cbid.cbid_arg = &request; + request.rq_reply_cbid.cbid_fn = reply_in_callback; + request.rq_reply_cbid.cbid_arg = &request; + request.rq_reqbuf = reqbuf; + request.rq_reqbuf_len = reqlen; + request.rq_repbuf = repbuf; + request.rq_repbuf_len = *replenp; + request.rq_set = NULL; + spin_lock_init(&request.rq_lock); + init_waitqueue_head(&request.rq_reply_waitq); + atomic_set(&request.rq_refcount, 1000000); /* never be droped */ + request.rq_xid = ptlrpc_next_xid(); + + /* add into sending list */ + spin_lock_irqsave(&imp->imp_lock, irq_flags); + list_add_tail(&request.rq_list, &imp->imp_rawrpc_list); + spin_unlock_irqrestore(&imp->imp_lock, irq_flags); + + /* prepare reply buffer */ + rc = PtlMEAttach(conn->c_peer.peer_ni->pni_ni_h, + imp->imp_client->cli_reply_portal, + conn->c_peer.peer_id, request.rq_xid, 0, PTL_UNLINK, + PTL_INS_AFTER, &reply_me_h); + if (rc != PTL_OK) { + CERROR("PtlMEAttach failed: %d\n", rc); + LASSERT (rc == PTL_NO_SPACE); + GOTO(cleanup_imp, rc = -ENOMEM); + } + + spin_lock_irqsave(&request.rq_lock, irq_flags); + request.rq_receiving_reply = 1; + spin_unlock_irqrestore(&request.rq_lock, irq_flags); + + reply_md.start = repbuf; + reply_md.length = *replenp; + reply_md.threshold = 1; + reply_md.options = PTLRPC_MD_OPTIONS | PTL_MD_OP_PUT; + reply_md.user_ptr = &request.rq_reply_cbid; + reply_md.eq_handle = conn->c_peer.peer_ni->pni_eq_h; + + rc = PtlMDAttach(reply_me_h, reply_md, PTL_UNLINK, + &request.rq_reply_md_h); + if (rc != PTL_OK) { + CERROR("PtlMDAttach failed: %d\n", rc); + LASSERT (rc == PTL_NO_SPACE); + GOTO(cleanup_me, rc = -ENOMEM); + } + + /* prepare request buffer */ + req_md.start = reqbuf; + req_md.length = reqlen; + req_md.threshold = 1; + req_md.options = PTLRPC_MD_OPTIONS; + req_md.user_ptr = 
&request.rq_req_cbid; + req_md.eq_handle = conn->c_peer.peer_ni->pni_eq_h; + + rc = PtlMDBind(conn->c_peer.peer_ni->pni_ni_h, + req_md, PTL_UNLINK, &request.rq_req_md_h); + if (rc != PTL_OK) { + CERROR("PtlMDBind failed %d\n", rc); + LASSERT (rc == PTL_NO_SPACE); + GOTO(cleanup_me, rc = -ENOMEM); + } + + rc = PtlPut(request.rq_req_md_h, PTL_NOACK_REQ, conn->c_peer.peer_id, + imp->imp_client->cli_request_portal, + 0, request.rq_xid, 0, 0); + if (rc != PTL_OK) { + CERROR("PtlPut failed %d\n", rc); + GOTO(cleanup_md, rc); + } + + lwi = LWI_TIMEOUT(timeout * HZ, rawrpc_timedout, &request); + l_wait_event(request.rq_reply_waitq, + rawrpc_check_reply(&request), &lwi); + + ptlrpc_unregister_reply(&request); + + if (request.rq_err || request.rq_resend || request.rq_intr || + request.rq_timedout || !request.rq_replied) { + CERROR("secinit rpc error: err %d, resend %d, " + "intr %d, timeout %d, replied %d\n", + request.rq_err, request.rq_resend, request.rq_intr, + request.rq_timedout, request.rq_replied); + rc = -EINVAL; + } else { + *replenp = request.rq_nob_received; + rc = 0; + } + GOTO(cleanup_imp, rc); + +cleanup_md: + PtlMDUnlink(request.rq_req_md_h); +cleanup_me: + PtlMEUnlink(reply_me_h); +cleanup_imp: + spin_lock_irqsave(&imp->imp_lock, irq_flags); + list_del_init(&request.rq_list); + spin_unlock_irqrestore(&imp->imp_lock, irq_flags); + + class_import_put(imp); + RETURN(rc); +} diff --git a/lustre/ptlrpc/pack_generic.c b/lustre/ptlrpc/pack_generic.c index 920ea49..db9a38d 100644 --- a/lustre/ptlrpc/pack_generic.c +++ b/lustre/ptlrpc/pack_generic.c @@ -33,6 +33,7 @@ #include #include #include +#include #include @@ -76,14 +77,15 @@ void lustre_init_msg (struct lustre_msg *msg, int count, int *lens, char **bufs) int lustre_pack_request (struct ptlrpc_request *req, int count, int *lens, char **bufs) { + int rc; ENTRY; - req->rq_reqlen = lustre_msg_size (count, lens); - OBD_ALLOC(req->rq_reqmsg, req->rq_reqlen); - if (req->rq_reqmsg == NULL) - RETURN(-ENOMEM); + 
req->rq_reqlen = lustre_msg_size(count, lens); + rc = ptlrpcs_cli_alloc_reqbuf(req, req->rq_reqlen); + if (rc) + RETURN(rc); - lustre_init_msg (req->rq_reqmsg, count, lens, bufs); + lustre_init_msg(req->rq_reqmsg, count, lens, bufs); RETURN (0); } @@ -117,29 +119,29 @@ int lustre_pack_reply (struct ptlrpc_request *req, int count, int *lens, char **bufs) { struct ptlrpc_reply_state *rs; - int msg_len; - int size; + int rc; ENTRY; - LASSERT (req->rq_reply_state == NULL); - - msg_len = lustre_msg_size (count, lens); - size = offsetof (struct ptlrpc_reply_state, rs_msg) + msg_len; - OBD_ALLOC (rs, size); - if (rs == NULL) - RETURN (-ENOMEM); - + LASSERT(req->rq_reply_state == NULL); + LASSERT(req->rq_svcsec); + LASSERT(req->rq_repmsg == NULL); + + req->rq_replen = lustre_msg_size(count, lens); + rc = svcsec_alloc_repbuf(req->rq_svcsec, req, req->rq_replen); + if (rc) + RETURN(rc); + LASSERT(req->rq_reply_state); + LASSERT(req->rq_repmsg == req->rq_reply_state->rs_msg); + + rs = req->rq_reply_state; + rs->rs_svcsec = svcsec_get(req->rq_svcsec); rs->rs_cb_id.cbid_fn = reply_out_callback; rs->rs_cb_id.cbid_arg = rs; rs->rs_srv_ni = req->rq_rqbd->rqbd_srv_ni; - rs->rs_size = size; INIT_LIST_HEAD(&rs->rs_exp_list); INIT_LIST_HEAD(&rs->rs_obd_list); - req->rq_replen = msg_len; - req->rq_reply_state = rs; - req->rq_repmsg = &rs->rs_msg; - lustre_init_msg (&rs->rs_msg, count, lens, bufs); + lustre_init_msg(rs->rs_msg, count, lens, bufs); PTLRPC_RS_DEBUG_LRU_ADD(rs); @@ -148,6 +150,8 @@ int lustre_pack_reply (struct ptlrpc_request *req, void lustre_free_reply_state (struct ptlrpc_reply_state *rs) { + struct ptlrpc_svcsec *svcsec = rs->rs_svcsec; + PTLRPC_RS_DEBUG_LRU_DEL(rs); LASSERT (!rs->rs_difficult || rs->rs_handled); @@ -157,8 +161,14 @@ void lustre_free_reply_state (struct ptlrpc_reply_state *rs) LASSERT (rs->rs_nlocks == 0); LASSERT (list_empty(&rs->rs_exp_list)); LASSERT (list_empty(&rs->rs_obd_list)); + LASSERT (svcsec); + + if (svcsec->free_repbuf) + 
svcsec->free_repbuf(svcsec, rs); + else + svcsec_free_reply_state(rs); - OBD_FREE (rs, rs->rs_size); + svcsec_put(svcsec); } /* This returns the size of the buffer that is required to hold a lustre_msg @@ -618,6 +628,11 @@ struct mds_req_sec_desc *lustre_swab_mds_secdesc(struct ptlrpc_request *req, __swab32s(&rsd->rsd_ngroups); } + if (rsd->rsd_ngroups > LUSTRE_MAX_GROUPS) { + CERROR("%u groups is not allowed\n", rsd->rsd_ngroups); + return NULL; + } + if (m->buflens[offset] != sizeof(*rsd) + rsd->rsd_ngroups * sizeof(__u32)) { CERROR("bufflen %u while contains %u groups\n", diff --git a/lustre/ptlrpc/ptlrpc_internal.h b/lustre/ptlrpc/ptlrpc_internal.h index e49b5f9..12a3c20 100644 --- a/lustre/ptlrpc/ptlrpc_internal.h +++ b/lustre/ptlrpc/ptlrpc_internal.h @@ -85,6 +85,14 @@ static inline int opcode_offset(__u32 opc) { (LDLM_LAST_OPC - LDLM_FIRST_OPC) + (MDS_LAST_OPC - MDS_FIRST_OPC) + (OST_LAST_OPC - OST_FIRST_OPC)); + } else if (opc < SEC_LAST_OPC) { + /* Security negotiate */ + return (opc - SEC_FIRST_OPC + + (PTLBD_LAST_OPC - PTLBD_FIRST_OPC) + + (LDLM_LAST_OPC - LDLM_FIRST_OPC) + + (MDS_LAST_OPC - MDS_FIRST_OPC) + + (OST_LAST_OPC - OST_FIRST_OPC) + + (OBD_LAST_OPC - OBD_FIRST_OPC)); } else { /* Unknown Opcode */ return -1; @@ -95,7 +103,8 @@ static inline int opcode_offset(__u32 opc) { (LDLM_LAST_OPC - LDLM_FIRST_OPC) + \ (MDS_LAST_OPC - MDS_FIRST_OPC) + \ (OST_LAST_OPC - OST_FIRST_OPC) + \ - (OBD_LAST_OPC - OBD_FIRST_OPC)) + (OBD_LAST_OPC - OBD_FIRST_OPC) + \ + (SEC_LAST_OPC - SEC_FIRST_OPC)) enum { PTLRPC_REQWAIT_CNTR = 0, diff --git a/lustre/ptlrpc/ptlrpc_module.c b/lustre/ptlrpc/ptlrpc_module.c index 5cbdf4f..c42c47c 100644 --- a/lustre/ptlrpc/ptlrpc_module.c +++ b/lustre/ptlrpc/ptlrpc_module.c @@ -92,6 +92,7 @@ EXPORT_SYMBOL(ptlrpc_reply); EXPORT_SYMBOL(ptlrpc_error); EXPORT_SYMBOL(ptlrpc_resend_req); EXPORT_SYMBOL(ptl_send_rpc); +EXPORT_SYMBOL(ptlrpc_do_rawrpc); /* client.c */ EXPORT_SYMBOL(ptlrpc_init_client); diff --git a/lustre/ptlrpc/service.c 
b/lustre/ptlrpc/service.c index fa924fc..30217ab 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include "ptlrpc_internal.h" @@ -42,6 +43,12 @@ static spinlock_t ptlrpc_all_services_lock = SPIN_LOCK_UNLOCKED; static void ptlrpc_free_server_req (struct ptlrpc_request *req) { + if (req->rq_svcsec) { + svcsec_cleanup_req(req); + svcsec_put(req->rq_svcsec); + req->rq_svcsec = NULL; + } + /* The last request to be received into a request buffer uses space * in the request buffer descriptor, otherwise requests are * allocated dynamically in the incoming reply event handler */ @@ -408,7 +415,8 @@ ptlrpc_server_handle_request (struct ptlrpc_service *svc) struct timeval work_start; struct timeval work_end; long timediff; - int rc; + enum ptlrpcs_error sec_err; + int secrc, rc; ENTRY; spin_lock_irqsave (&svc->srv_lock, flags); @@ -445,12 +453,32 @@ ptlrpc_server_handle_request (struct ptlrpc_service *svc) /* Clear request swab mask; this is a new request */ request->rq_req_swab_mask = 0; #endif - rc = lustre_unpack_msg (request->rq_reqmsg, request->rq_reqlen); + + /* go through security check/transform */ + request->rq_auth_uid = -1; + secrc = svcsec_accept(request, &sec_err); + switch(secrc) { + case SVC_OK: + CDEBUG(D_SEC, "request accepted ok\n"); + break; + case SVC_COMPLETE: + target_send_reply(request, 0, OBD_FAIL_MDS_ALL_REPLY_NET); + goto put_conn; + case SVC_DROP: + goto out; + case SVC_LOGIN: + case SVC_LOGOUT: + break; + default: + LBUG(); + } + + rc = lustre_unpack_msg(request->rq_reqmsg, request->rq_reqlen); if (rc != 0) { CERROR ("error unpacking request: ptl %d from %s" " xid "LPU64"\n", svc->srv_req_portal, ptlrpc_peernid2str(&request->rq_peer, str), - request->rq_xid); + request->rq_xid); goto out; } @@ -530,11 +558,12 @@ put_conn: CDEBUG((timediff / 1000000 > (long)obd_timeout) ? 
D_ERROR : D_HA, "request "LPU64" opc %u from NID %s processed in %ldus " - "(%ldus total)\n", request->rq_xid, request->rq_reqmsg->opc, + "(%ldus total)\n", request->rq_xid, + request->rq_reqmsg ? request->rq_reqmsg->opc : 0, ptlrpc_peernid2str(&request->rq_peer, str), timediff, timeval_sub(&work_end, &request->rq_arrival_time)); - if (svc->srv_stats != NULL) { + if (svc->srv_stats != NULL && request->rq_reqmsg != NULL) { int opc = opcode_offset(request->rq_reqmsg->opc); if (opc > 0) { LASSERT(opc < LUSTRE_MAX_OPCODES); @@ -612,7 +641,7 @@ ptlrpc_server_handle_reply (struct ptlrpc_service *svc) " o%d NID %s\n", rs, rs->rs_xid, rs->rs_transno, - rs->rs_msg.opc, + rs->rs_msg->opc, ptlrpc_peernid2str(&exp->exp_connection->c_peer, str)); #endif } diff --git a/lustre/sec/.cvsignore b/lustre/sec/.cvsignore new file mode 100644 index 0000000..d5103fa --- /dev/null +++ b/lustre/sec/.cvsignore @@ -0,0 +1,15 @@ +.Xrefs +config.log +config.status +configure +Makefile +.deps +TAGS +.*.cmd +autoMakefile.in +autoMakefile +*.ko +*.mod.c +.*.o.flags +.tmp_versions +.depend diff --git a/lustre/sec/Makefile.in b/lustre/sec/Makefile.in new file mode 100644 index 0000000..224d66b --- /dev/null +++ b/lustre/sec/Makefile.in @@ -0,0 +1,6 @@ +MODULES := ptlrpcs +ptlrpcs-objs := sec.o sec_null.o svcsec.o svcsec_null.o upcall_cache.o + +@GSS_TRUE@subdir-m += gss + +@INCLUDE_RULES@ diff --git a/lustre/sec/Makefile.mk b/lustre/sec/Makefile.mk new file mode 100644 index 0000000..7dcc93c --- /dev/null +++ b/lustre/sec/Makefile.mk @@ -0,0 +1,10 @@ +# Copyright (C) 2004 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. 
High Level Design of Client-OSS Connection
will be broken into
When setting up OSS/MDS
which needs to be notified of the target service type
unusual privilege.
The remote ACL issue is addressed
perspective
It is roughly defined as:
Thus when such a request comes in
MDS do uid/gid reverse mapping if needed.
MDS refreshes the cached LSD info through an upcall.
principal
Traditional
decision making
+\layout Standard + +Later an special security policy is needed to allow RAW access by FID without + a capability. + This is used for analyzing audit logs, finding pathnames from fids (for + recovery) etc. +\layout Subsection + +Remote user mapping database +\layout Standard + +There will be a user mapping configuration file on MDS, already defined + in +\begin_inset Quotes eld +\end_inset + +functional specification +\begin_inset Quotes erd +\end_inset + +. + MDS kernel will also maintain a cache of this mapping information. + It is populated by upcall to server side gss daemon, along with the gss + credential information. + +\layout Itemize + +The on-disk mapping database only described how user(principal) is mapped + to an local uid, and don't need specify the gid mapping. +\layout Itemize + +Both on-disk mapping database and kernel mapping cache should be able to + allow map all other remote users to a certain local user. +\layout Itemize + +On the MDS, the per-client structure will maintain this mapping cache. + When a user from remote client get authenticated, we check the on-disk + mapping database. + If no mapping items for this user found, we'll deny this user. + otherwise we record the target uid. +\layout Itemize + +When a fs access request come from remote client, it contains the user's + uid, gid on the remote client. + Here we can establish mapping for uid and target uid. + With target uid we can find the target gid from local user database (from + LSD), thus we can also establish the mapping for gid and target gid. +\layout Itemize + +With mapping we established above, we now do the mapping: replace the uid/gid + in the rpc request with target uid/gid. + If it request chown we also check & map the new owner id. +\layout Itemize + +When reply populated and about to send back, we again check the mapping + cache, and do the reverse mapping if in the case which return file attributes + to clients. 
+ For those can't find the matched items, map them to nllu/nllg of this remote + client. +\layout Subsection + +Handle remote rpc request +\layout Standard + +The overall process of handle remote rpc request is the same as for local + user, except following: +\layout Itemize + +For incoming request, firstly do the uid/gid mapping for the requestor; + and do reverse mapping for the reply, as described above. +\layout Itemize + +No setuid/setgid/setgroups intention is permitted, except we explicitly + allow setuid-root in setxid database. + And so we ignore the supplementary groups sent by client(if any), and simply + use the one provided by LSD. +\layout Itemize + +For chown request, we also do translation for the new owner id (already + described above) according to the in-kernel mapping cache. + It means the root user on remote client can't change owner of a file to + a user which is not login yet. +\layout Itemize + +Deny all chgrp request, since the group on remote client has no clear mapping + on MDS's local user database (We also could choose allow this when the + new group id showup in the in-kernel mapping cache, but it seems doesn't + make much sense). + So we probably need a special tool like +\begin_inset Quotes eld +\end_inset + +lfs chgrp +\begin_inset Quotes erd +\end_inset + + to perform chgrp on remote client, which will send out text name instead + of translate to id locally. +\layout Subsection + +Remote client cache flushing +\layout Standard + +Anytime there might be inodes cached and their owner belongs to nllu/nllg. + If a new user Alice get authenticated and she happens to be the owner of + those inodes, we need to refresh those inode even if its cache status + is correct, otherwise Alice will find her files belong to others. + Since we don't know whether a inode with nllu/nllg belongs to Alice or + not, we must flush all of them.
+\layout Standard + +On MDS, a callback or similar event notification mechanism should be hooked + into gss module. + When a user authenticated at the first time, we should iterate through + all the granted lock corresponding to this client, and revoke them selectively. + Strictly speaking we only want to revoke those inodebits lock and the owner/gro +up of their resource (inode) not show up in the in-kernel mapping database, + but here we just flush all the inodebits locks, a cache is quickly re-populated + - there are a maximum of 20-100 cached locks on clients at the moment. +\layout Standard + +When Alice logs out of the client system, we also do the similar things: + iterate through all the granted lock corresponding to this client, and + revoke them selectively. + Here we want to revoke those inodebits locks and the owner/group of their + resource(inode) is Alice. + We also could choose flush all of them like above case. +\layout Subsection + +LSD upcall +\layout Standard + +There is a general upcall-cache code which do upcall into user space, and + cache data passed down in kernel, and also implemented timeout invalidation. + Kernel LSD could simply be implemented as an instance of it. + So it will be quite simple. +\layout Standard + +A user-space tool should provide following functionality: +\layout Itemize + +Accept uid as parameter +\layout Itemize + +Obtain gid and supplementary groups id array which the uid belongs to, if + failed just return error. +\layout Itemize + +Obtain the setxid permission bits for this user on this NID from database. + If not found a default bitset will be applied: (1) for local client: setuid/set +gid is off, setgroups for root is off, setgroups for normal user is on; + (2) for remote client: all of setuid/setgid/setgroups is off. +\layout Itemize + +Pass all the collected information back to kernel by /proc.
+\layout Standard + +Since the upcall could happen concurrently, and admin could modify it + at anytime, so a kind of read-write lock need to be done on the database + file. +\layout Subsection + +Recovery consideration +\layout Standard + +All the code here should have minimal effect on recovery. + After MDS's crash, security context will be established during connection + time in recovery; and uid-mapping cache and LSD actually are +\begin_inset Quotes eld +\end_inset + +adaptive +\begin_inset Quotes erd +\end_inset + +, they will also be re-populated when handling related user's replay request + during/after recovery. +\layout Section + +State Management +\layout Subsection + +configuration states +\layout Itemize + +Client has a remote flag at mount time. +\layout Itemize + +Remote clients must have nllu:nllg installed. + it could simply be nobody:nobody. +\layout Itemize + +MDS could have a remote-user mapping database which contains which principal + at which client should be mapped to which local user. + Without the database no remote client is allowed to connect. +\layout Itemize + +MDS could have a security database which contains setxid permissions along + with other security setting for each affected user. + No such database then a default setting will be applied. +\layout Subsection + +LSD entry states transition +\layout Enumerate + +NEW: generated and submit to upcall +\layout Enumerate + +READY: ready to serve +\layout Enumerate + +INVALID: expired or error +\layout Standard + +Requestor will initiate a NEW LSD entry; after upcall successfully fill + in data it change to READY; if timeout or some error happen (e.g. + not found in user database) during upcall it change to INVALID; a READY + LSD will change to INVALID when expired, or flushed forcibly by sysadmin, + or MDS shutdown; an INVALID LSD will be soon destroyed. +\layout Standard + +No disk format changed.
+ When a large number of users access lustre from all kinds of local/remote + clients at the same time, MDS will have more CPU and memory overhead, especiall +y for remote users. + No special recovery consideration. + +\layout Section + +Alternatives +\layout Subsection + +NFSv4 +\layout Standard + +NFSv4 sends user and groups by name. +\layout Section + +Focus of Inspection +\layout Itemize + +Could this pass HP acceptance test? +\layout Itemize + +Any is not reasonable? Any security hole? +\layout Itemize + +Everything recoverable from MDS/client crash? +\the_end diff --git a/lustre/sec/doc/revoke_user_HLD.lyx b/lustre/sec/doc/revoke_user_HLD.lyx new file mode 100644 index 0000000..f454dc5 --- /dev/null +++ b/lustre/sec/doc/revoke_user_HLD.lyx @@ -0,0 +1,244 @@ +#LyX 1.3 created this file. For more info see http://www.lyx.org/ +\lyxformat 221 +\textclass article +\language english +\inputencoding auto +\fontscheme times +\graphics default +\paperfontsize 12 +\spacing single +\papersize Default +\paperpackage a4 +\use_geometry 0 +\use_amsmath 0 +\use_natbib 0 +\use_numerical_citations 0 +\paperorientation portrait +\secnumdepth 3 +\tocdepth 3 +\paragraph_separation skip +\defskip medskip +\quotes_language english +\quotes_times 2 +\papercolumns 1 +\papersides 1 +\paperpagestyle default + +\layout Title + +High Level Design of User Revoke +\layout Author + +Peter Braam, Eric Mei +\layout Date + +Jan 31, 2005 +\layout Section + +Requirement +\layout Itemize + +Be able to revoke a user, prevent it from accessing lustre immediately. +\layout Itemize + +Be able to pass sub-test of HP acceptance 4.1.51. +\layout Itemize + +user & mapping databases manipulation API. +\layout Section + +Functional Specification +\layout Standard + +A sub-command +\begin_inset Quotes eld +\end_inset + +revoke +\begin_inset Quotes erd +\end_inset + + will be added into existing tool 'lctl'. + When system administrator want to kick somebody off from lustre filesystem + (e.g. 
+ a certain user is known to be malicious or an account be compromised), he + could use this functionality on MDS's to prevent the victim user from access + lustre filesystem right away. + The command format could be: +\layout LyX-Code + +lctl revoke user|all +\layout Itemize + +Here the 'user' format is: uid[@nid[.netid]] +\layout Itemize + +option @nid.netid is only for remote users. + The uid is in term of local uid, thus 'uid@remote_nid.netid' means remote + users on node 'remote_nid.netid' who are mapped to local 'uid', it's not + intend to remove a certain user on specific node. +\layout Itemize + +Specified uid without nid or netid means match all nid or netid. +\layout Itemize + +'all' means revoke all users. +\layout Standard + +Actually lctl only remove those in-kernel cache for the victim user, usually + there's many other configuration work need to be done by using other admin + tools: +\layout Itemize + +Kerberos Database: For removing a user from kerberos principal database, + sysadmin must use kerberos admin tools. + And this change will not take effect right away if the victim user has + authenticated with MDS's before the removal (because of client side credential + cache). +\layout Itemize + +User Database: For removing a user from user database, sysadmin also must + resort to other tools, usually standard unix tools. + This change will not take effect right away if this user had ever accessed + lustre before the removal (because of in-kernel LSD cache). +\layout Itemize + +User Mapping Database: For removing a user from remote user mapping database, + sysadmin need edit the configure file manually. + This only affect certain user on certain remote client. + This change will not take effect right away if this user had ever accessed + lustre before the removal (because of in-kernel uid mapping cache).
+\layout Standard + +So when sysadmin actually revoke a user, he usually at first did one or + more steps of above according to requirement, then invoke lctl to finally + revoke the user. + In cases that user database or user mapping database are not centrally + managed by e.g. + LDAP, sysadmin must remove the user from all configure files on each MDS's, + this could be done by using 'pdsh', etc. +\layout Standard + +What above described is the basic requirement. + There's an additional one: for user and mapping database, write a C API + library (probably later add python support), which can query, add, remove, + and enumerate users in each database. + 'edit' could be implemented as remove + add. +\layout Standard + +By using this API, we could provide much complete functionality. + Sysadmin could do everything about user account within single lctl tools; + Kernel upcall helper also could use this API to obtain information from + mapping database, etc. +\layout Section + +Use Cases +\layout Subsection + +Revoke Alice's access right on all clients, permanently +\layout Enumerate + +Sysadmin remove Alice from user database on all MDS's. +\layout Enumerate + +Sysadmin invoke 'lctl revoke alice_uid' on all MDS's. +\layout Enumerate + +Alice from local clients will not be able to access lustre. +\layout Enumerate + +Any remote users who are mapped to Alice will not be able to access lustre. +\layout Subsection + +Revoke Alice's access right on remote client remote1 +\layout Enumerate + +Suppose alice@remote1 is mapped to local user Bob. +\layout Enumerate + +Sysadmin remove mapping entry of 'alice_uid@remote1 -> bob' from user mapping + database. +\layout Enumerate + +Sysadmin invoke 'lctl revoke bob_uid@remote1' on all MDS's. +\layout Enumerate + +Alice will not be able to access lustre from remote1. +\layout Enumerate + +Bob from an local client could still work fine. 
+\layout Section + +Logic Specification +\layout Standard + +There's several kinds of in-kernel cache for certain user: LSD, gss context, + and uid-mapping. + In the future we might add consideration of removing OSS access capability. +\layout Enumerate + +LSD: On each MDS, each user (uid) correspond to at most one LSD entry. + There's already an existing interface to flush LSD for a certain user: + simply write an uid into '/proc/fs/lustre/mds/lsd_flush' (Note this is + subject to change). + Write in '-1' will flush all LSD entries. +\layout Enumerate + +GSS Context: On each MDS, each user (principal) might correspond to several(even + many) gss contexts. + The gss module should export a proc entry. + When provided uid and remote nid/netid, it should be able to find out the + initiating/established gss contexts and destroy them. + Providing a special tag will flush all gss contexts. +\layout Enumerate + +UID Mapping: Firstly found out per-client structure for specified nid/netid, + then destroy the mapping entries for specified uid. + Since this is strongly related to GSS context, we can use the export proc + entry for gss context to initiate this flush. + Thus when sysadmin trying to flush gss contexts for certain user, we also + flush associated uid-mapping. +\layout Standard + +This work should be done after the completion of GSS and remote uid/gid + handling implementation. +\layout Standard + +The user and mapping databases manipulation API could be simple not much + restriction, and the details is very much related to the actual database + structure. + we leave the details to the following DLD document. +\layout Section + +State Management +\layout Standard + +Since we'll flush several cache separately, we might have situation that + not strictly consistency. + For example, after we flushed alice from cache1, someone re-populate it + in cache1 while do it on cache2. + In fact, the inconsistency between LSD and gss context is perfectly allowed. 
+ Only one thing need be sure is: since uid mapping is established after + that of gss context, thus we need flush uid mapping at first, and then + flush gss context. + This could prevent unnecessary error when doing 'revoke' while we don't + actually remove it from mapping database. +\layout Standard + +No serious locking issues, no special recovery consideration. +\layout Section + +Alternatives +\layout Standard + +None. +\layout Section + +Focus of Inspection +\layout Itemize + +Does the lctl interface reasonably reflect the facts? +\layout Itemize + +Could it pass acceptance test? +\the_end diff --git a/lustre/sec/gss/.cvsignore b/lustre/sec/gss/.cvsignore new file mode 100644 index 0000000..d5103fa --- /dev/null +++ b/lustre/sec/gss/.cvsignore @@ -0,0 +1,15 @@ +.Xrefs +config.log +config.status +configure +Makefile +.deps +TAGS +.*.cmd +autoMakefile.in +autoMakefile +*.ko +*.mod.c +.*.o.flags +.tmp_versions +.depend diff --git a/lustre/sec/gss/Makefile.in b/lustre/sec/gss/Makefile.in new file mode 100644 index 0000000..ccfd0d3 --- /dev/null +++ b/lustre/sec/gss/Makefile.in @@ -0,0 +1,9 @@ +#MODULES := ptlrpcs_gss ptlrpcs_gss_krb5 +MODULES := ptlrpcs_gss +ptlrpcs_gss-objs := sec_gss.o svcsec_gss.o rawobj.o gss_mech_switch.o \ + gss_generic_token.o gss_krb5_crypto.o gss_krb5_seal.o \ + gss_krb5_unseal.o gss_krb5_seqnum.o gss_krb5_mech.o \ + gss_krb5_wrap.o +#ptlrpcs_gss_krb5-objs := gss_krb5_mech.o + +@INCLUDE_RULES@ diff --git a/lustre/sec/gss/Makefile.mk b/lustre/sec/gss/Makefile.mk new file mode 100644 index 0000000..08de7a4 --- /dev/null +++ b/lustre/sec/gss/Makefile.mk @@ -0,0 +1,14 @@ +# Copyright (C) 2004 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution + +include $(src)/../../portals/Kernelenv + +#obj-y += ptlrpcs_gss.o ptlrpcs_gss_krb5.o +obj-y += ptlrpcs_gss.o +ptlrpcs_gss-objs := sec_gss.o svcsec_gss.o rawobj.o gss_mech_switch.o \ + gss_generic_token.o gss_krb5_crypto.o gss_krb5_seal.o \ + gss_krb5_unseal.o gss_krb5_seqnum.o gss_krb5_mech.o \ + gss_krb5_wrap.o +#ptlrpcs_gss_krb5-objs := gss_krb5_mech.o diff --git a/lustre/sec/gss/autoMakefile.am b/lustre/sec/gss/autoMakefile.am new file mode 100644 index 0000000..f729d06 --- /dev/null +++ b/lustre/sec/gss/autoMakefile.am @@ -0,0 +1,23 @@ +# Copyright (C) 2004 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +if LIBLUSTRE +noinst_LIBRARIES = libptlrpcs_gss.a +libptlrpcs_gss_a_SOURCES = sec_gss.c gss_mech_switch.c gss_krb5_mech.c \ + gss_generic_token.c gss_krb5_crypto.c \ + gss_krb5_seal.c gss_krb5_unseal.c \ + gss_krb5_seqnum.c rawobj.c + +libptlrpcs_gss_a_CPPFLAGS = $(LLCPPFLAGS) +libptlrpcs_gss_a_CFLAGS = $(LLCFLAGS) +endif + +if MODULES +modulefs_DATA = ptlrpcs_gss$(KMODEXT) +endif + +DIST_SOURCES = $(ptlrpcs_gss-objs:.o=.c) gss_internal.h gss_api.h gss_asn1.h \ + gss_err.h gss_krb5.h +MOSTLYCLEANFILES = *.o *.ko *.mod.c diff --git a/lustre/sec/gss/gss_api.h b/lustre/sec/gss/gss_api.h new file mode 100644 index 0000000..06557d4 --- /dev/null +++ b/lustre/sec/gss/gss_api.h @@ -0,0 +1,132 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Modifications for Lustre + * Copyright 2004, Cluster File Systems, Inc. + * All rights reserved + * Author: Eric Mei + */ + +/* + * Somewhat simplified version of the gss api. 
+ * + * Dug Song + * Andy Adamson + * Bruce Fields + * Copyright (c) 2000 The Regents of the University of Michigan + * + * $Id: gss_api.h,v 1.2 2005/03/31 22:18:24 ericm Exp $ + */ + +#ifndef __SEC_GSS_GSS_API_H_ +#define __SEC_GSS_GSS_API_H_ + +struct gss_api_mech; + +/* The mechanism-independent gss-api context: */ +struct gss_ctx { + struct gss_api_mech *mech_type; + void *internal_ctx_id; +}; + +#define GSS_C_NO_BUFFER ((rawobj_t) 0) +#define GSS_C_NO_CONTEXT ((struct gss_ctx *) 0) +#define GSS_C_NULL_OID ((rawobj_t) 0) + +/*XXX arbitrary length - is this set somewhere? */ +#define GSS_OID_MAX_LEN 32 + +/* gss-api prototypes; note that these are somewhat simplified versions of + * the prototypes specified in RFC 2744. */ +__u32 kgss_import_sec_context( + rawobj_t *input_token, + struct gss_api_mech *mech, + struct gss_ctx **ctx_id); +__u32 kgss_inquire_context( + struct gss_ctx *ctx_id, + __u64 *endtime); +__u32 kgss_get_mic( + struct gss_ctx *ctx_id, + __u32 qop, + rawobj_t *message, + rawobj_t *mic_token); +__u32 kgss_verify_mic( + struct gss_ctx *ctx_id, + rawobj_t *message, + rawobj_t *mic_token, + __u32 *qstate); +__u32 kgss_wrap( + struct gss_ctx *ctx_id, + __u32 qop, + rawobj_buf_t *in_token, + rawobj_t *out_token); +__u32 kgss_unwrap( + struct gss_ctx *ctx_id, + __u32 qop, + rawobj_t *in_token, + rawobj_t *out_token); +__u32 kgss_delete_sec_context( + struct gss_ctx **ctx_id); + +struct subflavor_desc { + __u32 subflavor; + __u32 qop; + __u32 service; + char *name; +}; + +/* Each mechanism is described by the following struct: */ +struct gss_api_mech { + struct list_head gm_list; + struct module *gm_owner; + char *gm_name; + rawobj_t gm_oid; + atomic_t gm_count; + struct gss_api_ops *gm_ops; + int gm_sf_num; + struct subflavor_desc *gm_sfs; +}; + +/* and must provide the following operations: */ +struct gss_api_ops { + __u32 (*gss_import_sec_context)( + rawobj_t *input_token, + struct gss_ctx *ctx_id); + __u32 (*gss_inquire_context)( + struct gss_ctx 
*ctx_id, + __u64 *endtime); + __u32 (*gss_get_mic)( + struct gss_ctx *ctx_id, + __u32 qop, + rawobj_t *message, + rawobj_t *mic_token); + __u32 (*gss_verify_mic)( + struct gss_ctx *ctx_id, + rawobj_t *message, + rawobj_t *mic_token, + __u32 *qstate); + __u32 (*gss_wrap)( + struct gss_ctx *ctx, + __u32 qop, + rawobj_buf_t *in_token, + rawobj_t *out_token); + __u32 (*gss_unwrap)( + struct gss_ctx *ctx, + __u32 qop, + rawobj_t *in_token, + rawobj_t *out_token); + void (*gss_delete_sec_context)( + void *internal_ctx_id); +}; + +int kgss_mech_register(struct gss_api_mech *mech); +void kgss_mech_unregister(struct gss_api_mech *mech); + +struct gss_api_mech * kgss_OID_to_mech(rawobj_t *); +struct gss_api_mech * kgss_name_to_mech(char *name); +struct gss_api_mech * kgss_subflavor_to_mech(__u32 subflavor); + +struct gss_api_mech * kgss_mech_get(struct gss_api_mech *); +void kgss_mech_put(struct gss_api_mech *); + +#endif /* __SEC_GSS_GSS_API_H_ */ diff --git a/lustre/sec/gss/gss_asn1.h b/lustre/sec/gss/gss_asn1.h new file mode 100644 index 0000000..cd44f6d --- /dev/null +++ b/lustre/sec/gss/gss_asn1.h @@ -0,0 +1,87 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Modifications for Lustre + * Copyright 2004, Cluster File Systems, Inc. + * All rights reserved + * Author: Eric Mei + */ + +/* + * minimal asn1 for generic encoding/decoding of gss tokens + * + * Adapted from MIT Kerberos 5-1.2.1 lib/include/krb5.h, + * lib/gssapi/krb5/gssapiP_krb5.h, and others + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + */ + +/* + * Copyright 1995 by the Massachusetts Institute of Technology. + * All Rights Reserved. + * + * Export of this software from the United States of America may + * require a specific license from the United States Government. 
+ * It is the responsibility of any person or organization contemplating + * export to obtain such a license before exporting. + * + * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and + * distribute this software and its documentation for any purpose and + * without fee is hereby granted, provided that the above copyright + * notice appear in all copies and that both that copyright notice and + * this permission notice appear in supporting documentation, and that + * the name of M.I.T. not be used in advertising or publicity pertaining + * to distribution of the software without specific, written prior + * permission. Furthermore if you modify this software you must label + * your software as modified software and not distribute it in such a + * fashion that it might be confused with the original M.I.T. software. + * M.I.T. makes no representations about the suitability of + * this software for any purpose. It is provided "as is" without express + * or implied warranty. + * + */ + +#define SIZEOF_INT 4 + +/* from gssapi_err_generic.h */ +#define G_BAD_SERVICE_NAME (-2045022976L) +#define G_BAD_STRING_UID (-2045022975L) +#define G_NOUSER (-2045022974L) +#define G_VALIDATE_FAILED (-2045022973L) +#define G_BUFFER_ALLOC (-2045022972L) +#define G_BAD_MSG_CTX (-2045022971L) +#define G_WRONG_SIZE (-2045022970L) +#define G_BAD_USAGE (-2045022969L) +#define G_UNKNOWN_QOP (-2045022968L) +#define G_NO_HOSTNAME (-2045022967L) +#define G_BAD_HOSTNAME (-2045022966L) +#define G_WRONG_MECH (-2045022965L) +#define G_BAD_TOK_HEADER (-2045022964L) +#define G_BAD_DIRECTION (-2045022963L) +#define G_TOK_TRUNC (-2045022962L) +#define G_REFLECT (-2045022961L) +#define G_WRONG_TOKID (-2045022960L) + +#define g_OID_equal(o1,o2) \ + (((o1)->len == (o2)->len) && \ + (memcmp((o1)->data,(o2)->data,(int) (o1)->len) == 0)) + +__u32 g_verify_token_header( + rawobj_t *mech, + int *body_size, + unsigned char **buf_in, + int toksize); + +__u32 g_get_mech_oid(rawobj_t *mech, rawobj_t * 
in_buf); + +int g_token_size( + rawobj_t *mech, + unsigned int body_size); + +void g_make_token_header( + rawobj_t *mech, + int body_size, + unsigned char **buf); diff --git a/lustre/sec/gss/gss_err.h b/lustre/sec/gss/gss_err.h new file mode 100644 index 0000000..c893983 --- /dev/null +++ b/lustre/sec/gss/gss_err.h @@ -0,0 +1,181 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Modifications for Lustre + * Copyright 2004, Cluster File Systems, Inc. + * All rights reserved + * Author: Eric Mei + */ + +/* + * Adapted from MIT Kerberos 5-1.2.1 include/gssapi/gssapi.h + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + */ + +/* + * Copyright 1993 by OpenVision Technologies, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software + * and its documentation for any purpose is hereby granted without fee, + * provided that the above copyright notice appears in all copies and + * that both that copyright notice and this permission notice appear in + * supporting documentation, and that the name of OpenVision not be used + * in advertising or publicity pertaining to distribution of the software + * without specific, written prior permission. OpenVision makes no + * representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied warranty. + * + * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#ifndef __SEC_GSS_GSS_ERR_H_ +#define __SEC_GSS_GSS_ERR_H_ + +typedef unsigned int OM_uint32; + +/* + * Flag bits for context-level services. + */ +#define GSS_C_DELEG_FLAG 1 +#define GSS_C_MUTUAL_FLAG 2 +#define GSS_C_REPLAY_FLAG 4 +#define GSS_C_SEQUENCE_FLAG 8 +#define GSS_C_CONF_FLAG 16 +#define GSS_C_INTEG_FLAG 32 +#define GSS_C_ANON_FLAG 64 +#define GSS_C_PROT_READY_FLAG 128 +#define GSS_C_TRANS_FLAG 256 + +/* + * Credential usage options + */ +#define GSS_C_BOTH 0 +#define GSS_C_INITIATE 1 +#define GSS_C_ACCEPT 2 + +/* + * Status code types for gss_display_status + */ +#define GSS_C_GSS_CODE 1 +#define GSS_C_MECH_CODE 2 + + +/* + * Define the default Quality of Protection for per-message services. Note + * that an implementation that offers multiple levels of QOP may either reserve + * a value (for example zero, as assumed here) to mean "default protection", or + * alternatively may simply equate GSS_C_QOP_DEFAULT to a specific explicit + * QOP value. However a value of 0 should always be interpreted by a GSSAPI + * implementation as a request for the default protection level. + */ +#define GSS_C_QOP_DEFAULT 0 + +/* + * Expiration time of 2^32-1 seconds means infinite lifetime for a + * credential or security context + */ +#define GSS_C_INDEFINITE ((OM_uint32) 0xfffffffful) + + +/* Major status codes */ + +#define GSS_S_COMPLETE 0 + +/* + * Some "helper" definitions to make the status code macros obvious. + */ +#define GSS_C_CALLING_ERROR_OFFSET 24 +#define GSS_C_ROUTINE_ERROR_OFFSET 16 +#define GSS_C_SUPPLEMENTARY_OFFSET 0 +#define GSS_C_CALLING_ERROR_MASK ((OM_uint32) 0377ul) +#define GSS_C_ROUTINE_ERROR_MASK ((OM_uint32) 0377ul) +#define GSS_C_SUPPLEMENTARY_MASK ((OM_uint32) 0177777ul) + +/* + * The macros that test status codes for error conditions. Note that the + * GSS_ERROR() macro has changed slightly from the V1 GSSAPI so that it now + * evaluates its argument only once. 
+ */ +#define GSS_CALLING_ERROR(x) \ + ((x) & (GSS_C_CALLING_ERROR_MASK << GSS_C_CALLING_ERROR_OFFSET)) +#define GSS_ROUTINE_ERROR(x) \ + ((x) & (GSS_C_ROUTINE_ERROR_MASK << GSS_C_ROUTINE_ERROR_OFFSET)) +#define GSS_SUPPLEMENTARY_INFO(x) \ + ((x) & (GSS_C_SUPPLEMENTARY_MASK << GSS_C_SUPPLEMENTARY_OFFSET)) +#define GSS_ERROR(x) \ + ((x) & ((GSS_C_CALLING_ERROR_MASK << GSS_C_CALLING_ERROR_OFFSET) | \ + (GSS_C_ROUTINE_ERROR_MASK << GSS_C_ROUTINE_ERROR_OFFSET))) + +/* + * Now the actual status code definitions + */ + +/* + * Calling errors: + */ +#define GSS_S_CALL_INACCESSIBLE_READ \ + (((OM_uint32) 1ul) << GSS_C_CALLING_ERROR_OFFSET) +#define GSS_S_CALL_INACCESSIBLE_WRITE \ + (((OM_uint32) 2ul) << GSS_C_CALLING_ERROR_OFFSET) +#define GSS_S_CALL_BAD_STRUCTURE \ + (((OM_uint32) 3ul) << GSS_C_CALLING_ERROR_OFFSET) + +/* + * Routine errors: + */ +#define GSS_S_BAD_MECH (((OM_uint32) 1ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_NAME (((OM_uint32) 2ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_NAMETYPE (((OM_uint32) 3ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_BINDINGS (((OM_uint32) 4ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_STATUS (((OM_uint32) 5ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_SIG (((OM_uint32) 6ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_NO_CRED (((OM_uint32) 7ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_NO_CONTEXT (((OM_uint32) 8ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_DEFECTIVE_TOKEN (((OM_uint32) 9ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_DEFECTIVE_CREDENTIAL \ + (((OM_uint32) 10ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_CREDENTIALS_EXPIRED \ + (((OM_uint32) 11ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_CONTEXT_EXPIRED \ + (((OM_uint32) 12ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_FAILURE (((OM_uint32) 13ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_BAD_QOP (((OM_uint32) 14ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_UNAUTHORIZED (((OM_uint32) 15ul) 
<< GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_UNAVAILABLE (((OM_uint32) 16ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_DUPLICATE_ELEMENT \ + (((OM_uint32) 17ul) << GSS_C_ROUTINE_ERROR_OFFSET) +#define GSS_S_NAME_NOT_MN \ + (((OM_uint32) 18ul) << GSS_C_ROUTINE_ERROR_OFFSET) + +/* + * Supplementary info bits: + */ +#define GSS_S_CONTINUE_NEEDED (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 0)) +#define GSS_S_DUPLICATE_TOKEN (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 1)) +#define GSS_S_OLD_TOKEN (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 2)) +#define GSS_S_UNSEQ_TOKEN (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 3)) +#define GSS_S_GAP_TOKEN (1 << (GSS_C_SUPPLEMENTARY_OFFSET + 4)) + +/* XXXX these are not part of the GSSAPI C bindings! (but should be) */ + +#define GSS_CALLING_ERROR_FIELD(x) \ + (((x) >> GSS_C_CALLING_ERROR_OFFSET) & GSS_C_CALLING_ERROR_MASK) +#define GSS_ROUTINE_ERROR_FIELD(x) \ + (((x) >> GSS_C_ROUTINE_ERROR_OFFSET) & GSS_C_ROUTINE_ERROR_MASK) +#define GSS_SUPPLEMENTARY_INFO_FIELD(x) \ + (((x) >> GSS_C_SUPPLEMENTARY_OFFSET) & GSS_C_SUPPLEMENTARY_MASK) + +/* XXXX This is a necessary evil until the spec is fixed */ +#define GSS_S_CRED_UNAVAIL GSS_S_FAILURE + +#endif /* __SEC_GSS_GSS_ERR_H_ */ diff --git a/lustre/sec/gss/gss_generic_token.c b/lustre/sec/gss/gss_generic_token.c new file mode 100644 index 0000000..c48653a --- /dev/null +++ b/lustre/sec/gss/gss_generic_token.c @@ -0,0 +1,295 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Modifications for Lustre + * Copyright 2004, Cluster File Systems, Inc. + * All rights reserved + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/gss_generic_token.c + * + * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/generic/util_token.c + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + */ + +/* + * Copyright 1993 by OpenVision Technologies, Inc. 
+ * + * Permission to use, copy, modify, distribute, and sell this software + * and its documentation for any purpose is hereby granted without fee, + * provided that the above copyright notice appears in all copies and + * that both that copyright notice and this permission notice appear in + * supporting documentation, and that the name of OpenVision not be used + * in advertising or publicity pertaining to distribution of the software + * without specific, written prior permission. OpenVision makes no + * representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied warranty. + * + * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif +#define DEBUG_SUBSYSTEM S_SEC +#ifdef __KERNEL__ +#include +#include +#include +#else +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" +#include "gss_krb5.h" +#include "gss_asn1.h" + + +/* TWRITE_STR from gssapiP_generic.h */ +#define TWRITE_STR(ptr, str, len) \ + memcpy((ptr), (char *) (str), (len)); \ + (ptr) += (len); + +/* XXXX this code currently makes the assumption that a mech oid will + never be longer than 127 bytes. This assumption is not inherent in + the interfaces, so the code can be fixed if the OSI namespace + balloons unexpectedly. 
*/ + +/* Each token looks like this: + +0x60 tag for APPLICATION 0, SEQUENCE + (constructed, definite-length) + possible multiple bytes, need to parse/generate + 0x06 tag for OBJECT IDENTIFIER + compile-time constant string (assume 1 byte) + compile-time constant string + the ANY containing the application token + bytes 0,1 are the token type + bytes 2,n are the token data + +For the purposes of this abstraction, the token "header" consists of +the sequence tag and length octets, the mech OID DER encoding, and the +first two inner bytes, which indicate the token type. The token +"body" consists of everything else. + +*/ + +static int +der_length_size( int length) +{ + if (length < (1<<7)) + return(1); + else if (length < (1<<8)) + return(2); +#if (SIZEOF_INT == 2) + else + return(3); +#else + else if (length < (1<<16)) + return(3); + else if (length < (1<<24)) + return(4); + else + return(5); +#endif +} + +static void +der_write_length(unsigned char **buf, int length) +{ + if (length < (1<<7)) { + *(*buf)++ = (unsigned char) length; + } else { + *(*buf)++ = (unsigned char) (der_length_size(length)+127); +#if (SIZEOF_INT > 2) + if (length >= (1<<24)) + *(*buf)++ = (unsigned char) (length>>24); + if (length >= (1<<16)) + *(*buf)++ = (unsigned char) ((length>>16)&0xff); +#endif + if (length >= (1<<8)) + *(*buf)++ = (unsigned char) ((length>>8)&0xff); + *(*buf)++ = (unsigned char) (length&0xff); + } +} + +/* returns decoded length, or < 0 on failure. 
Advances buf and + decrements bufsize */ + +static int +der_read_length(unsigned char **buf, int *bufsize) +{ + unsigned char sf; + int ret; + + if (*bufsize < 1) + return(-1); + sf = *(*buf)++; + (*bufsize)--; + if (sf & 0x80) { + if ((sf &= 0x7f) > ((*bufsize)-1)) + return(-1); + if (sf > SIZEOF_INT) + return (-1); + ret = 0; + for (; sf; sf--) { + ret = (ret<<8) + (*(*buf)++); + (*bufsize)--; + } + } else { + ret = sf; + } + + return(ret); +} + +/* returns the length of a token, given the mech oid and the body size */ + +int +g_token_size(rawobj_t *mech, unsigned int body_size) +{ + /* set body_size to sequence contents size */ + body_size += 4 + (int) mech->len; /* NEED overflow check */ + return(1 + der_length_size(body_size) + body_size); +} + +//EXPORT_SYMBOL(g_token_size); + +/* fills in a buffer with the token header. The buffer is assumed to + be the right size. buf is advanced past the token header */ + +void +g_make_token_header(rawobj_t *mech, int body_size, unsigned char **buf) +{ + *(*buf)++ = 0x60; + der_write_length(buf, 4 + mech->len + body_size); + *(*buf)++ = 0x06; + *(*buf)++ = (unsigned char) mech->len; + TWRITE_STR(*buf, mech->data, ((int) mech->len)); +} + +//EXPORT_SYMBOL(g_make_token_header); + +/* + * Given a buffer containing a token, reads and verifies the token, + * leaving buf advanced past the token header, and setting body_size + * to the number of remaining bytes. Returns 0 on success, + * G_BAD_TOK_HEADER for a variety of errors, and G_WRONG_MECH if the + * mechanism in the token does not match the mech argument. buf and + * *body_size are left unmodified on error. 
+ */ +__u32 +g_verify_token_header(rawobj_t *mech, int *body_size, + unsigned char **buf_in, int toksize) +{ + unsigned char *buf = *buf_in; + int seqsize; + rawobj_t toid; + int ret = 0; + + if ((toksize-=1) < 0) + return(G_BAD_TOK_HEADER); + if (*buf++ != 0x60) + return(G_BAD_TOK_HEADER); + + if ((seqsize = der_read_length(&buf, &toksize)) < 0) + return(G_BAD_TOK_HEADER); + + if (seqsize != toksize) + return(G_BAD_TOK_HEADER); + + if ((toksize-=1) < 0) + return(G_BAD_TOK_HEADER); + if (*buf++ != 0x06) + return(G_BAD_TOK_HEADER); + + if ((toksize-=1) < 0) + return(G_BAD_TOK_HEADER); + toid.len = *buf++; + + if ((toksize-=toid.len) < 0) + return(G_BAD_TOK_HEADER); + toid.data = buf; + buf+=toid.len; + + if (! g_OID_equal(&toid, mech)) + ret = G_WRONG_MECH; + + /* G_WRONG_MECH is not returned immediately because it's more important + to return G_BAD_TOK_HEADER if the token header is in fact bad */ + + if ((toksize-=2) < 0) + return(G_BAD_TOK_HEADER); + + if (ret) + return(ret); + + if (!ret) { + *buf_in = buf; + *body_size = toksize; + } + + return(ret); +} + +//EXPORT_SYMBOL(g_verify_token_header); + +/* Given a buffer containing a token, returns a copy of the mech oid in + * the parameter mech. 
*/ +__u32 +g_get_mech_oid(rawobj_t *mech, rawobj_t * in_buf) +{ + unsigned char *buf = in_buf->data; + int len = in_buf->len; + int ret=0; + int seqsize; + + if ((len-=1) < 0) + return(G_BAD_TOK_HEADER); + if (*buf++ != 0x60) + return(G_BAD_TOK_HEADER); + + if ((seqsize = der_read_length(&buf, &len)) < 0) + return(G_BAD_TOK_HEADER); + + if ((len-=1) < 0) + return(G_BAD_TOK_HEADER); + if (*buf++ != 0x06) + return(G_BAD_TOK_HEADER); + + if ((len-=1) < 0) + return(G_BAD_TOK_HEADER); + mech->len = *buf++; + + if ((len-=mech->len) < 0) + return(G_BAD_TOK_HEADER); + OBD_ALLOC(mech->data, mech->len); + if (!mech->data) + return(G_BUFFER_ALLOC); + memcpy(mech->data, buf, mech->len); + + return ret; +} diff --git a/lustre/sec/gss/gss_internal.h b/lustre/sec/gss/gss_internal.h new file mode 100644 index 0000000..9b1b76a --- /dev/null +++ b/lustre/sec/gss/gss_internal.h @@ -0,0 +1,106 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Modified from NFSv4 project for Lustre + * Copyright 2004, Cluster File Systems, Inc. 
+ * All rights reserved + * Author: Eric Mei + */ + +#ifndef __SEC_GSS_GSS_INTERNAL_H_ +#define __SEC_GSS_GSS_INTERNAL_H_ + +struct ptlrpc_sec; +struct ptlrpc_cred; + +typedef struct rawobj_s { + __u32 len; + __u8 *data; +} rawobj_t; + +int rawobj_alloc(rawobj_t *obj, char *buf, int len); +void rawobj_free(rawobj_t *obj); +int rawobj_equal(rawobj_t *a, rawobj_t *b); +int rawobj_dup(rawobj_t *dest, rawobj_t *src); +int rawobj_serialize(rawobj_t *obj, __u32 **buf, __u32 *buflen); +int rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen); +int rawobj_extract_local(rawobj_t *obj, __u32 **buf, __u32 *buflen); + +typedef struct rawobj_buf_s { + __u32 dataoff; + __u32 datalen; + __u32 buflen; + __u8 *buf; +} rawobj_buf_t; + +#define MAXSEQ 0x80000000 /* maximum legal sequence number, from rfc 2203 */ + +enum rpc_gss_proc { + RPC_GSS_PROC_DATA = 0, + RPC_GSS_PROC_INIT = 1, + RPC_GSS_PROC_CONTINUE_INIT = 2, + RPC_GSS_PROC_DESTROY = 3, +}; + +enum rpc_gss_svc { + RPC_GSS_SVC_NONE = 1, + RPC_GSS_SVC_INTEGRITY = 2, + RPC_GSS_SVC_PRIVACY = 3, +}; + +/* on-the-wire gss cred: */ +struct rpc_gss_wire_cred { + __u32 gc_v; /* version */ + __u32 gc_proc; /* control procedure */ + __u32 gc_seq; /* sequence number */ + __u32 gc_svc; /* service */ + rawobj_t gc_ctx; /* context handle */ +}; + +/* on-the-wire gss verifier: */ +struct rpc_gss_wire_verf { + __u32 gv_flavor; + rawobj_t gv_verf; +}; + +struct gss_cl_ctx { + atomic_t gc_refcount; + __u32 gc_proc; + __u32 gc_seq; + spinlock_t gc_seq_lock; + struct gss_ctx *gc_gss_ctx; + rawobj_t gc_wire_ctx; + __u32 gc_win; +}; + +struct gss_cred { + struct ptlrpc_cred gc_base; + ptlrpcs_flavor_t gc_flavor; + struct gss_cl_ctx *gc_ctx; +}; + +/* + * This only guaranteed be enough for current krb5 des-cbc-crc . We might + * adjust this when new enc type or mech added in. + */ +#define GSS_PRIVBUF_PREFIX_LEN (32) +#define GSS_PRIVBUF_SUFFIX_LEN (32) + +/* This is too coarse. 
We'll let mech determine it */ +#define GSS_MAX_AUTH_PAYLOAD (128) + +/* gss_mech_switch.c */ +int init_kerberos_module(void); +void cleanup_kerberos_module(void); + +/* gss_generic_token.c */ +int g_token_size(rawobj_t *mech, unsigned int body_size); +void g_make_token_header(rawobj_t *mech, int body_size, unsigned char **buf); +__u32 g_verify_token_header(rawobj_t *mech, int *body_size, + unsigned char **buf_in, int toksize); + +/* svcsec_gss.c */ +int gss_svc_init(void); +void gss_svc_exit(void); + +#endif /* __SEC_GSS_GSS_INTERNAL_H_ */ diff --git a/lustre/sec/gss/gss_krb5.h b/lustre/sec/gss/gss_krb5.h new file mode 100644 index 0000000..f00e2c4 --- /dev/null +++ b/lustre/sec/gss/gss_krb5.h @@ -0,0 +1,183 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Modifications for Lustre + * Copyright 2004, Cluster File Systems, Inc. + * All rights reserved + * Author: Eric Mei + */ + +/* + * linux/include/linux/sunrpc/gss_krb5_types.h + * + * Adapted from MIT Kerberos 5-1.2.1 lib/include/krb5.h, + * lib/gssapi/krb5/gssapiP_krb5.h, and others + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + * Bruce Fields + */ + +/* + * Copyright 1995 by the Massachusetts Institute of Technology. + * All Rights Reserved. + * + * Export of this software from the United States of America may + * require a specific license from the United States Government. + * It is the responsibility of any person or organization contemplating + * export to obtain such a license before exporting. + * + * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and + * distribute this software and its documentation for any purpose and + * without fee is hereby granted, provided that the above copyright + * notice appear in all copies and that both that copyright notice and + * this permission notice appear in supporting documentation, and that + * the name of M.I.T. 
not be used in advertising or publicity pertaining + * to distribution of the software without specific, written prior + * permission. Furthermore if you modify this software you must label + * your software as modified software and not distribute it in such a + * fashion that it might be confused with the original M.I.T. software. + * M.I.T. makes no representations about the suitability of + * this software for any purpose. It is provided "as is" without express + * or implied warranty. + * + */ + +extern spinlock_t krb5_seq_lock; + +struct krb5_ctx { + int initiate; /* 1 = initiating, 0 = accepting */ + int seed_init; + unsigned char seed[16]; + int signalg; + int sealalg; + struct crypto_tfm *enc; + struct crypto_tfm *seq; + __s32 endtime; + __u32 seq_send; + rawobj_t mech_used; +}; + +#define KG_TOK_MIC_MSG 0x0101 +#define KG_TOK_WRAP_MSG 0x0201 + +enum sgn_alg { + SGN_ALG_DES_MAC_MD5 = 0x0000, + SGN_ALG_MD2_5 = 0x0001, + SGN_ALG_DES_MAC = 0x0002, + SGN_ALG_3 = 0x0003, /* not published */ + SGN_ALG_HMAC_MD5 = 0x0011, /* microsoft w2k; no support */ + SGN_ALG_HMAC_SHA1_DES3_KD = 0x0004 +}; +enum seal_alg { + SEAL_ALG_NONE = 0xffff, + SEAL_ALG_DES = 0x0000, + SEAL_ALG_1 = 0x0001, /* not published */ + SEAL_ALG_MICROSOFT_RC4 = 0x0010,/* microsoft w2k; no support */ + SEAL_ALG_DES3KD = 0x0002 +}; + +#define KRB5_CKSUM_LENGTH 8 + +#define CKSUMTYPE_CRC32 0x0001 +#define CKSUMTYPE_RSA_MD4 0x0002 +#define CKSUMTYPE_RSA_MD4_DES 0x0003 +#define CKSUMTYPE_DESCBC 0x0004 +#define CKSUMTYPE_RSA_MD5 0x0007 +#define CKSUMTYPE_RSA_MD5_DES 0x0008 +#define CKSUMTYPE_NIST_SHA 0x0009 +#define CKSUMTYPE_HMAC_SHA1_DES3 0x000c + +/* from gssapi_err_krb5.h */ +#define KG_CCACHE_NOMATCH (39756032L) +#define KG_KEYTAB_NOMATCH (39756033L) +#define KG_TGT_MISSING (39756034L) +#define KG_NO_SUBKEY (39756035L) +#define KG_CONTEXT_ESTABLISHED (39756036L) +#define KG_BAD_SIGN_TYPE (39756037L) +#define KG_BAD_LENGTH (39756038L) +#define KG_CTX_INCOMPLETE (39756039L) +#define KG_CONTEXT 
(39756040L) +#define KG_CRED (39756041L) +#define KG_ENC_DESC (39756042L) +#define KG_BAD_SEQ (39756043L) +#define KG_EMPTY_CCACHE (39756044L) +#define KG_NO_CTYPES (39756045L) + +/* per Kerberos v5 protocol spec crypto types from the wire. + * these get mapped to linux kernel crypto routines. + */ +#define ENCTYPE_NULL 0x0000 +#define ENCTYPE_DES_CBC_CRC 0x0001 /* DES cbc mode with CRC-32 */ +#define ENCTYPE_DES_CBC_MD4 0x0002 /* DES cbc mode with RSA-MD4 */ +#define ENCTYPE_DES_CBC_MD5 0x0003 /* DES cbc mode with RSA-MD5 */ +#define ENCTYPE_DES_CBC_RAW 0x0004 /* DES cbc mode raw */ +/* XXX deprecated? */ +#define ENCTYPE_DES3_CBC_SHA 0x0005 /* DES-3 cbc mode with NIST-SHA */ +#define ENCTYPE_DES3_CBC_RAW 0x0006 /* DES-3 cbc mode raw */ +#define ENCTYPE_DES_HMAC_SHA1 0x0008 +#define ENCTYPE_DES3_CBC_SHA1 0x0010 +#define ENCTYPE_UNKNOWN 0x01ff + +__s32 +make_checksum(__s32 cksumtype, + char *header, int hdrlen, + rawobj_t *body, + rawobj_t *cksum); + +__u32 +krb5_make_token(struct krb5_ctx *ctx, + int qop_req, + rawobj_t *text, + rawobj_t *token); + +__u32 +krb5_read_token(struct krb5_ctx *ctx, + rawobj_t *read_token, + rawobj_t *message_buffer, + int *qop_state); + +__u32 +krb5_encrypt(struct crypto_tfm *tfm, + void * iv, + void * in, + void * out, + int length); + +__u32 +krb5_decrypt(struct crypto_tfm *tfm, + void * iv, + void * in, + void * out, + int length); + +__s32 +krb5_make_seq_num(struct crypto_tfm *key, + int direction, + __s32 seqnum, + unsigned char *cksum, + unsigned char *buf); + +__s32 +krb5_get_seq_num(struct crypto_tfm *key, + unsigned char *cksum, + unsigned char *buf, + int *direction, + __s32 *seqnum); +int +gss_encrypt_rawobj(struct crypto_tfm *tfm, + rawobj_t *inobj, + rawobj_t *outobj, + int enc); +__u32 +gss_wrap_kerberos(struct gss_ctx *ctx, + __u32 qop, + rawobj_buf_t *in_token, + rawobj_t *out_token); +__u32 +gss_unwrap_kerberos(struct gss_ctx *ctx, + __u32 qop, + rawobj_t *in_token, + rawobj_t *out_token); diff --git 
a/lustre/sec/gss/gss_krb5_crypto.c b/lustre/sec/gss/gss_krb5_crypto.c new file mode 100644 index 0000000..a0358fe --- /dev/null +++ b/lustre/sec/gss/gss_krb5_crypto.c @@ -0,0 +1,256 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Modifications for Lustre + * Copyright 2004, Cluster File Systems, Inc. + * All rights reserved + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/gss_krb5_crypto.c + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + * Bruce Fields + */ + +/* + * Copyright (C) 1998 by the FundsXpress, INC. + * + * All rights reserved. + * + * Export of this software from the United States of America may require + * a specific license from the United States Government. It is the + * responsibility of any person or organization contemplating export to + * obtain such a license before exporting. + * + * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and + * distribute this software and its documentation for any purpose and + * without fee is hereby granted, provided that the above copyright + * notice appear in all copies and that both that copyright notice and + * this permission notice appear in supporting documentation, and that + * the name of FundsXpress. not be used in advertising or publicity pertaining + * to distribution of the software without specific, written prior + * permission. FundsXpress makes no representations about the suitability of + * this software for any purpose. It is provided "as is" without express + * or implied warranty. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
+ */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif +#define DEBUG_SUBSYSTEM S_SEC +#ifdef __KERNEL__ +#include +#include +#include +#include +#else +#include +#include "../kcrypto/libcrypto.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" +#include "gss_krb5.h" + +__u32 +krb5_encrypt(struct crypto_tfm *tfm, + void * iv, + void * in, + void * out, + int length) +{ + __u32 ret = -EINVAL; + struct scatterlist sg[1]; + __u8 local_iv[16] = {0}; + + if (length % crypto_tfm_alg_blocksize(tfm) != 0) + goto out; + + if (crypto_tfm_alg_ivsize(tfm) > 16) { + CERROR("tfm iv size to large %d\n", crypto_tfm_alg_ivsize(tfm)); + goto out; + } + + if (iv) + memcpy(local_iv, iv, crypto_tfm_alg_ivsize(tfm)); + + memcpy(out, in, length); + sg[0].page = virt_to_page(out); + sg[0].offset = offset_in_page(out); + sg[0].length = length; + + ret = crypto_cipher_encrypt_iv(tfm, sg, sg, length, local_iv); + +out: + return(ret); +} + +//EXPORT_SYMBOL(krb5_encrypt); + +__u32 +krb5_decrypt(struct crypto_tfm *tfm, + void * iv, + void * in, + void * out, + int length) +{ + __u32 ret = -EINVAL; + struct scatterlist sg[1]; + __u8 local_iv[16] = {0}; + + if (length % crypto_tfm_alg_blocksize(tfm) != 0) + goto out; + + if (crypto_tfm_alg_ivsize(tfm) > 16) { + CERROR("tfm iv size to large %d\n", crypto_tfm_alg_ivsize(tfm)); + goto out; + } + if (iv) + memcpy(local_iv,iv, crypto_tfm_alg_ivsize(tfm)); + + memcpy(out, in, length); + sg[0].page = virt_to_page(out); + sg[0].offset = offset_in_page(out); + sg[0].length = length; + + ret = crypto_cipher_decrypt_iv(tfm, sg, sg, length, local_iv); + +out: + return(ret); +} + +//EXPORT_SYMBOL(krb5_decrypt); + +void +buf_to_sg(struct scatterlist *sg, char *ptr, int len) +{ + sg->page = virt_to_page(ptr); + sg->offset = offset_in_page(ptr); + sg->length = len; +} + +/* checksum the plaintext data and hdrlen bytes of the token header */ 
+__s32 +make_checksum(__s32 cksumtype, + char *header, int hdrlen, + rawobj_t *body, + rawobj_t *cksum) +{ + char *cksumname; + struct crypto_tfm *tfm = NULL; /* XXX add to ctx? */ + struct scatterlist sg[1]; + __u32 code = GSS_S_FAILURE; + + switch (cksumtype) { + case CKSUMTYPE_RSA_MD5: + cksumname = "md5"; + break; + default: + CERROR("unsupported checksum %d", cksumtype); + goto out; + } + if (!(tfm = crypto_alloc_tfm(cksumname, 0))) + goto out; + cksum->len = crypto_tfm_alg_digestsize(tfm); + OBD_ALLOC(cksum->data, cksum->len); + if (!cksum->data) + goto out; + + crypto_digest_init(tfm); + buf_to_sg(sg, header, hdrlen); + crypto_digest_update(tfm, sg, 1); + if (body->len) { + buf_to_sg(sg, body->data, body->len); + crypto_digest_update(tfm, sg, 1); + } + + crypto_digest_final(tfm, cksum->data); + code = 0; +out: + if (tfm) + crypto_free_tfm(tfm); + return code; +} + +//EXPORT_SYMBOL(make_checksum); + +static +void obj_to_scatter_list(rawobj_t *obj, struct scatterlist *list, + int listlen) +{ + __u8 *ptr = obj->data; + __u32 size = obj->len; + int index = 0; + + while (size) { + LASSERT(index++ < listlen); + list->page = virt_to_page(ptr); + list->offset = (int) ptr & (~PAGE_MASK); + list->length = (list->offset + size) > PAGE_SIZE ? 
+ (PAGE_SIZE - list->offset) : size; + ptr += list->length; + size -= list->length; + list++; + } +} + +int gss_encrypt_rawobj(struct crypto_tfm *tfm, + rawobj_t *inobj, rawobj_t *outobj, + int enc) +{ + struct scatterlist *src_list, *dst_list; + __u8 local_iv[16] = {0}; + int list_len; + __u32 rc; + ENTRY; + + LASSERT(outobj->len >= inobj->len); + + list_len = ((inobj->len + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1; + OBD_ALLOC(src_list, sizeof(*src_list) * list_len * 2); + if (!src_list) { + CERROR("can't alloc %d\n", sizeof(*src_list) * list_len * 2); + RETURN(-ENOMEM); + } + dst_list = src_list + list_len; + + obj_to_scatter_list(inobj, src_list, list_len); + obj_to_scatter_list(outobj, dst_list, list_len); + + if (enc) + rc = crypto_cipher_encrypt_iv(tfm, dst_list, src_list, + inobj->len, local_iv); + else + rc = crypto_cipher_decrypt_iv(tfm, dst_list, src_list, + inobj->len, local_iv); + + if (rc) { + CERROR("encrypt error %u\n", rc); + GOTO(out_free, rc); + } + + outobj->len = inobj->len; + +out_free: + OBD_FREE(src_list, sizeof(*src_list) * list_len * 2); + RETURN(rc); +} diff --git a/lustre/sec/gss/gss_krb5_mech.c b/lustre/sec/gss/gss_krb5_mech.c new file mode 100644 index 0000000..8dcca46 --- /dev/null +++ b/lustre/sec/gss/gss_krb5_mech.c @@ -0,0 +1,316 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Modifications for Lustre + * Copyright 2004, Cluster File Systems, Inc. + * All rights reserved + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/gss_krb5_mech.c + * + * Copyright (c) 2001 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + * J. Bruce Fields + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif +#define DEBUG_SUBSYSTEM S_SEC +#ifdef __KERNEL__ +#include +#include +#include +#include +#else +#include +//#include "../kcrypto/libcrypto.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" +#include "gss_krb5.h" + +rawobj_t gss_mech_krb5_oid = + {9, "\052\206\110\206\367\022\001\002\002"}; + +static inline int +get_bytes(char **ptr, const char *end, void *res, int len) +{ + char *p, *q; + p = *ptr; + q = p + len; + if (q > end || q < p) + return -1; + memcpy(res, p, len); + *ptr = q; + return 0; +} + +static inline int +get_rawobj(char **ptr, const char *end, rawobj_t *res) +{ + char *p, *q; + p = *ptr; + if (get_bytes(&p, end, &res->len, sizeof(res->len))) + return -1; + q = p + res->len; + if (q > end || q < p) + return -1; + OBD_ALLOC(res->data, res->len); + if (!res->data) + return -1; + memcpy(res->data, p, res->len); + *ptr = q; + return 0; +} + +static inline int +get_key(char **p, char *end, struct crypto_tfm **res) +{ + rawobj_t key; + int alg, alg_mode; + char *alg_name; + + if (get_bytes(p, end, &alg, sizeof(alg))) + goto out_err; + if ((get_rawobj(p, end, &key))) + goto out_err; + + switch (alg) { + case ENCTYPE_DES_CBC_RAW: + alg_name = "des"; + alg_mode = CRYPTO_TFM_MODE_CBC; + break; + default: + CERROR("unsupported algorithm %d\n", alg); + goto out_err_free_key; + } + if (!(*res = crypto_alloc_tfm(alg_name, alg_mode))) + goto out_err_free_key; + if (crypto_cipher_setkey(*res, key.data, key.len)) + goto out_err_free_tfm; + + OBD_FREE(key.data, key.len); + return 0; + +out_err_free_tfm: + crypto_free_tfm(*res); +out_err_free_key: + OBD_FREE(key.data, key.len); +out_err: + return -1; +} + +static __u32 +gss_import_sec_context_kerberos(rawobj_t *inbuf, + struct gss_ctx *ctx_id) +{ + char *p = inbuf->data; + char *end = inbuf->data + inbuf->len; + struct krb5_ctx *ctx; + + 
OBD_ALLOC(ctx, sizeof(*ctx)); + if (!ctx) + goto out_err; + + if (get_bytes(&p, end, &ctx->initiate, sizeof(ctx->initiate))) + goto out_err_free_ctx; + if (get_bytes(&p, end, &ctx->seed_init, sizeof(ctx->seed_init))) + goto out_err_free_ctx; + if (get_bytes(&p, end, ctx->seed, sizeof(ctx->seed))) + goto out_err_free_ctx; + if (get_bytes(&p, end, &ctx->signalg, sizeof(ctx->signalg))) + goto out_err_free_ctx; + if (get_bytes(&p, end, &ctx->sealalg, sizeof(ctx->sealalg))) + goto out_err_free_ctx; + if (get_bytes(&p, end, &ctx->endtime, sizeof(ctx->endtime))) + goto out_err_free_ctx; + if (get_bytes(&p, end, &ctx->seq_send, sizeof(ctx->seq_send))) + goto out_err_free_ctx; + if (get_rawobj(&p, end, &ctx->mech_used)) + goto out_err_free_ctx; + if (get_key(&p, end, &ctx->enc)) + goto out_err_free_mech; + if (get_key(&p, end, &ctx->seq)) + goto out_err_free_key1; + if (p != end) + goto out_err_free_key2; + + ctx_id->internal_ctx_id = ctx; + CDEBUG(D_SEC, "Succesfully imported new context.\n"); + return 0; + +out_err_free_key2: + crypto_free_tfm(ctx->seq); +out_err_free_key1: + crypto_free_tfm(ctx->enc); +out_err_free_mech: + OBD_FREE(ctx->mech_used.data, ctx->mech_used.len); +out_err_free_ctx: + OBD_FREE(ctx, sizeof(*ctx)); +out_err: + return GSS_S_FAILURE; +} + +static __u32 +gss_inquire_context_kerberos(struct gss_ctx *context_handle, + __u64 *endtime) +{ + struct krb5_ctx *kctx = context_handle->internal_ctx_id; + + *endtime = (__u64) kctx->endtime; + return GSS_S_COMPLETE; +} + +static void +gss_delete_sec_context_kerberos(void *internal_ctx) +{ + struct krb5_ctx *ctx = internal_ctx; + + if (ctx->seq) + crypto_free_tfm(ctx->seq); + if (ctx->enc) + crypto_free_tfm(ctx->enc); + if (ctx->mech_used.data) + OBD_FREE(ctx->mech_used.data, ctx->mech_used.len); + OBD_FREE(ctx, sizeof(*ctx)); +} + +/* XXX the following wrappers have become pointless; kill them. 
*/ +static __u32 +gss_verify_mic_kerberos(struct gss_ctx *ctx, + rawobj_t *message, + rawobj_t *mic_token, + __u32 *qstate) +{ + struct krb5_ctx *kctx = ctx->internal_ctx_id; + __u32 maj_stat; + int qop_state; + + maj_stat = krb5_read_token(kctx, mic_token, message, &qop_state); + if (!maj_stat && qop_state) + *qstate = qop_state; + + CDEBUG(D_SEC, "returning %d\n", maj_stat); + return maj_stat; +} + +static __u32 +gss_get_mic_kerberos(struct gss_ctx *ctx, + __u32 qop, + rawobj_t *message, + rawobj_t *mic_token) +{ + struct krb5_ctx *kctx = ctx->internal_ctx_id; + __u32 err; + + err = krb5_make_token(kctx, qop, message, mic_token); + + CDEBUG(D_SEC, "returning %d\n",err); + return err; +} + +static struct gss_api_ops gss_kerberos_ops = { + .gss_import_sec_context = gss_import_sec_context_kerberos, + .gss_inquire_context = gss_inquire_context_kerberos, + .gss_get_mic = gss_get_mic_kerberos, + .gss_verify_mic = gss_verify_mic_kerberos, + .gss_wrap = gss_wrap_kerberos, + .gss_unwrap = gss_unwrap_kerberos, + .gss_delete_sec_context = gss_delete_sec_context_kerberos, +}; + +static struct subflavor_desc gss_kerberos_sfs[] = { + { + .subflavor = PTLRPC_SEC_GSS_KRB5, + .qop = 0, + .service = PTLRPC_SEC_TYPE_NONE, + .name = "krb5" + }, + { + .subflavor = PTLRPC_SEC_GSS_KRB5I, + .qop = 0, + .service = PTLRPC_SEC_TYPE_AUTH, + .name = "krb5i" + }, + { + .subflavor = PTLRPC_SEC_GSS_KRB5P, + .qop = 0, + .service = PTLRPC_SEC_TYPE_PRIV, + .name = "krb5p" + } +}; + +static struct gss_api_mech gss_kerberos_mech = { + .gm_name = "krb5", + .gm_owner = THIS_MODULE, + .gm_ops = &gss_kerberos_ops, + .gm_sf_num = 3, + .gm_sfs = gss_kerberos_sfs, +}; + +/*static*/ int __init init_kerberos_module(void) +{ + int status; + + status = kgss_mech_register(&gss_kerberos_mech); + if (status) + CERROR("Failed to register kerberos gss mechanism!\n"); + return status; +} + +/*static*/ void __exit cleanup_kerberos_module(void) +{ + kgss_mech_unregister(&gss_kerberos_mech); +} + +/* XXX enable this 
when module works */ +#if 0 +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Cluster File Systems, Inc. "); +MODULE_DESCRIPTION("GSS Krb5 mechanism for Lustre"); + +module_init(init_kerberos_module); +module_exit(cleanup_kerberos_module); +#endif diff --git a/lustre/sec/gss/gss_krb5_seal.c b/lustre/sec/gss/gss_krb5_seal.c new file mode 100644 index 0000000..3037a54 --- /dev/null +++ b/lustre/sec/gss/gss_krb5_seal.c @@ -0,0 +1,178 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Modifications for Lustre + * Copyright 2004, Cluster File Systems, Inc. + * All rights reserved + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/gss_krb5_seal.c + * + * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/krb5/k5seal.c + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + * J. Bruce Fields + */ + +/* + * Copyright 1993 by OpenVision Technologies, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software + * and its documentation for any purpose is hereby granted without fee, + * provided that the above copyright notice appears in all copies and + * that both that copyright notice and this permission notice appear in + * supporting documentation, and that the name of OpenVision not be used + * in advertising or publicity pertaining to distribution of the software + * without specific, written prior permission. OpenVision makes no + * representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied warranty. 
+ * + * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +/* + * Copyright (C) 1998 by the FundsXpress, INC. + * + * All rights reserved. + * + * Export of this software from the United States of America may require + * a specific license from the United States Government. It is the + * responsibility of any person or organization contemplating export to + * obtain such a license before exporting. + * + * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and + * distribute this software and its documentation for any purpose and + * without fee is hereby granted, provided that the above copyright + * notice appear in all copies and that both that copyright notice and + * this permission notice appear in supporting documentation, and that + * the name of FundsXpress. not be used in advertising or publicity pertaining + * to distribution of the software without specific, written prior + * permission. FundsXpress makes no representations about the suitability of + * this software for any purpose. It is provided "as is" without express + * or implied warranty. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
+ */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif +#define DEBUG_SUBSYSTEM S_SEC +#ifdef __KERNEL__ +#include +#include +#include +#include +#else +#include +#include "../kcrypto/libcrypto.h" +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" +#include "gss_krb5.h" + +spinlock_t krb5_seq_lock = SPIN_LOCK_UNLOCKED; + +__u32 +krb5_make_token(struct krb5_ctx *ctx, + int qop_req, + rawobj_t *text, + rawobj_t *token) +{ + __s32 checksum_type; + rawobj_t md5cksum = {.len = 0, .data = NULL}; + unsigned char *ptr, *krb5_hdr, *msg_start; + __s32 now, seq_send; + ENTRY; + + now = get_seconds(); + + if (qop_req != 0) + goto out_err; + + switch (ctx->signalg) { + case SGN_ALG_DES_MAC_MD5: + checksum_type = CKSUMTYPE_RSA_MD5; + break; + default: + CERROR("ctx->signalg %d not supported\n", ctx->signalg); + goto out_err; + } + if (ctx->sealalg != SEAL_ALG_NONE && ctx->sealalg != SEAL_ALG_DES) { + CERROR("ctx->sealalg %d not supported\n", ctx->sealalg); + goto out_err; + } + + token->len = g_token_size(&ctx->mech_used, 22); + + ptr = token->data; + g_make_token_header(&ctx->mech_used, 22, &ptr); + + *ptr++ = (unsigned char) ((KG_TOK_MIC_MSG>>8)&0xff); + *ptr++ = (unsigned char) (KG_TOK_MIC_MSG&0xff); + + /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */ + krb5_hdr = ptr - 2; + msg_start = krb5_hdr + 24; + + *(__u16 *)(krb5_hdr + 2) = cpu_to_be16(ctx->signalg); + memset(krb5_hdr + 4, 0xff, 4); + + if (make_checksum(checksum_type, krb5_hdr, 8, text, &md5cksum)) + goto out_err; + + switch (ctx->signalg) { + case SGN_ALG_DES_MAC_MD5: + if (krb5_encrypt(ctx->seq, NULL, md5cksum.data, + md5cksum.data, md5cksum.len)) + goto out_err; + memcpy(krb5_hdr + 16, + md5cksum.data + md5cksum.len - KRB5_CKSUM_LENGTH, + KRB5_CKSUM_LENGTH); + + break; + default: + LBUG(); + } + + OBD_FREE(md5cksum.data, md5cksum.len); + + 
spin_lock(&krb5_seq_lock); + seq_send = ctx->seq_send++; + spin_unlock(&krb5_seq_lock); + + if ((krb5_make_seq_num(ctx->seq, ctx->initiate ? 0 : 0xff, + seq_send, krb5_hdr + 16, krb5_hdr + 8))) + goto out_err; + + return ((ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE); +out_err: + if (md5cksum.data) + OBD_FREE(md5cksum.data, md5cksum.len); + return GSS_S_FAILURE; +} diff --git a/lustre/sec/gss/gss_krb5_seqnum.c b/lustre/sec/gss/gss_krb5_seqnum.c new file mode 100644 index 0000000..c80fc0f --- /dev/null +++ b/lustre/sec/gss/gss_krb5_seqnum.c @@ -0,0 +1,116 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Modifications for Lustre + * Copyright 2004, Cluster File Systems, Inc. + * All rights reserved + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/gss_krb5_seqnum.c + * + * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/krb5/util_seqnum.c + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + */ + +/* + * Copyright 1993 by OpenVision Technologies, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software + * and its documentation for any purpose is hereby granted without fee, + * provided that the above copyright notice appears in all copies and + * that both that copyright notice and this permission notice appear in + * supporting documentation, and that the name of OpenVision not be used + * in advertising or publicity pertaining to distribution of the software + * without specific, written prior permission. OpenVision makes no + * representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied warranty. 
+ * + * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif +#define DEBUG_SUBSYSTEM S_SEC +#ifdef __KERNEL__ +#include +#include +#include +#include +#else +#include +#include "../kcrypto/libcrypto.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" +#include "gss_krb5.h" + +__s32 +krb5_make_seq_num(struct crypto_tfm *key, + int direction, + __s32 seqnum, + unsigned char *cksum, + unsigned char *buf) +{ + unsigned char plain[8]; + + plain[0] = (unsigned char) (seqnum & 0xff); + plain[1] = (unsigned char) ((seqnum >> 8) & 0xff); + plain[2] = (unsigned char) ((seqnum >> 16) & 0xff); + plain[3] = (unsigned char) ((seqnum >> 24) & 0xff); + + plain[4] = direction; + plain[5] = direction; + plain[6] = direction; + plain[7] = direction; + + return krb5_encrypt(key, cksum, plain, buf, 8); +} + +__s32 +krb5_get_seq_num(struct crypto_tfm *key, + unsigned char *cksum, + unsigned char *buf, + int *direction, + __s32 * seqnum) +{ + __s32 code; + unsigned char plain[8]; + + if ((code = krb5_decrypt(key, cksum, buf, plain, 8))) + return code; + + if ((plain[4] != plain[5]) || (plain[4] != plain[6]) + || (plain[4] != plain[7])) + return (__s32)KG_BAD_SEQ; + + *direction = plain[4]; + + *seqnum = ((plain[0]) | + (plain[1] << 8) | (plain[2] << 16) | (plain[3] << 24)); + + return (0); +} diff --git a/lustre/sec/gss/gss_krb5_unseal.c b/lustre/sec/gss/gss_krb5_unseal.c new file mode 100644 index 
0000000..ba6e058 --- /dev/null +++ b/lustre/sec/gss/gss_krb5_unseal.c @@ -0,0 +1,212 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Modifications for Lustre + * Copyright 2004, Cluster File Systems, Inc. + * All rights reserved + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/gss_krb5_unseal.c + * + * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/krb5/k5unseal.c + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + */ + +/* + * Copyright 1993 by OpenVision Technologies, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software + * and its documentation for any purpose is hereby granted without fee, + * provided that the above copyright notice appears in all copies and + * that both that copyright notice and this permission notice appear in + * supporting documentation, and that the name of OpenVision not be used + * in advertising or publicity pertaining to distribution of the software + * without specific, written prior permission. OpenVision makes no + * representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied warranty. + * + * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +/* + * Copyright (C) 1998 by the FundsXpress, INC. + * + * All rights reserved. + * + * Export of this software from the United States of America may require + * a specific license from the United States Government. 
It is the + * responsibility of any person or organization contemplating export to + * obtain such a license before exporting. + * + * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and + * distribute this software and its documentation for any purpose and + * without fee is hereby granted, provided that the above copyright + * notice appear in all copies and that both that copyright notice and + * this permission notice appear in supporting documentation, and that + * the name of FundsXpress. not be used in advertising or publicity pertaining + * to distribution of the software without specific, written prior + * permission. FundsXpress makes no representations about the suitability of + * this software for any purpose. It is provided "as is" without express + * or implied warranty. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. + */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif +#define DEBUG_SUBSYSTEM S_SEC +#ifdef __KERNEL__ +#include +#include +#include +#include +#else +#include +#include "../kcrypto/libcrypto.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" +#include "gss_krb5.h" + + +/* read_token is a mic token, and message_buffer is the data that the mic was + * supposedly taken over. 
*/ + +__u32 +krb5_read_token(struct krb5_ctx *ctx, + rawobj_t *read_token, + rawobj_t *message_buffer, + int *qop_state) +{ + int signalg; + int sealalg; + __s32 checksum_type; + rawobj_t md5cksum = {.len = 0, .data = NULL}; + __s32 now; + int direction; + __s32 seqnum; + unsigned char *ptr = (unsigned char *)read_token->data; + int bodysize; + __u32 ret = GSS_S_DEFECTIVE_TOKEN; + ENTRY; + + if (g_verify_token_header(&ctx->mech_used, &bodysize, &ptr, + read_token->len)) + goto out; + + if ((*ptr++ != ((KG_TOK_MIC_MSG>>8)&0xff)) || + (*ptr++ != ( KG_TOK_MIC_MSG &0xff)) ) + goto out; + + /* XXX sanity-check bodysize?? */ + + /* get the sign and seal algorithms */ + + signalg = ptr[0] + (ptr[1] << 8); + sealalg = ptr[2] + (ptr[3] << 8); + + /* Sanity checks */ + + if ((ptr[4] != 0xff) || (ptr[5] != 0xff)) + goto out; + + if (sealalg != 0xffff) + goto out; + + /* there are several mappings of seal algorithms to sign algorithms, + but few enough that we can try them all. */ + + if ((ctx->sealalg == SEAL_ALG_NONE && signalg > 1) || + (ctx->sealalg == SEAL_ALG_1 && signalg != SGN_ALG_3) || + (ctx->sealalg == SEAL_ALG_DES3KD && + signalg != SGN_ALG_HMAC_SHA1_DES3_KD)) + goto out; + + /* compute the checksum of the message */ + + /* initialize the the cksum */ + switch (signalg) { + case SGN_ALG_DES_MAC_MD5: + checksum_type = CKSUMTYPE_RSA_MD5; + break; + default: + ret = GSS_S_DEFECTIVE_TOKEN; + goto out; + } + + switch (signalg) { + case SGN_ALG_DES_MAC_MD5: + ret = make_checksum(checksum_type, ptr - 2, 8, + message_buffer, &md5cksum); + if (ret) + goto out; + + ret = krb5_encrypt(ctx->seq, NULL, md5cksum.data, + md5cksum.data, 16); + if (ret) + goto out; + + if (memcmp(md5cksum.data + 8, ptr + 14, 8)) { + ret = GSS_S_BAD_SIG; + goto out; + } + break; + default: + ret = GSS_S_DEFECTIVE_TOKEN; + goto out; + } + + /* it got through unscathed. 
Make sure the context is unexpired */ + + if (qop_state) + *qop_state = GSS_C_QOP_DEFAULT; + + now = get_seconds(); + + ret = GSS_S_CONTEXT_EXPIRED; + if (now > ctx->endtime) + goto out; + + /* do sequencing checks */ + + ret = GSS_S_BAD_SIG; + if ((ret = krb5_get_seq_num(ctx->seq, ptr + 14, ptr + 6, &direction, + &seqnum))) + goto out; + + if ((ctx->initiate && direction != 0xff) || + (!ctx->initiate && direction != 0)) + goto out; + + ret = GSS_S_COMPLETE; +out: + if (md5cksum.data) + OBD_FREE(md5cksum.data, md5cksum.len); + return ret; +} diff --git a/lustre/sec/gss/gss_krb5_wrap.c b/lustre/sec/gss/gss_krb5_wrap.c new file mode 100644 index 0000000..1099156 --- /dev/null +++ b/lustre/sec/gss/gss_krb5_wrap.c @@ -0,0 +1,381 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Modified from NFSv4 projects for Lustre + * Copyright 2004, Cluster File Systems, Inc. + * All rights reserved + * Author: Eric Mei + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif +#define DEBUG_SUBSYSTEM S_SEC +#ifdef __KERNEL__ +#include +#include +#include +#include +#include +#else +#include +#include "../kcrypto/libcrypto.h" +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" +#include "gss_krb5.h" +#include "gss_asn1.h" + +static inline +int add_padding(rawobj_buf_t *msgbuf, int blocksize) +{ + int padding; + + padding = (blocksize - (msgbuf->datalen & (blocksize - 1))) & + (blocksize - 1); + if (padding == 0) + return 0; + + CWARN("add padding %d\n", padding); + if (msgbuf->dataoff + msgbuf->datalen + padding > msgbuf->buflen) { + CERROR("bufsize %u too small: off %u, len %u, padding %u\n", + msgbuf->buflen, msgbuf->dataoff, msgbuf->datalen, + padding); + return -EINVAL; + } + memset(msgbuf->buf + msgbuf->dataoff + msgbuf->datalen, + padding, padding); + msgbuf->datalen += padding; + return 0; +} + +static inline +int generate_confounder(rawobj_buf_t *msgbuf, int blocksize) +{ + __u8 *p; + + p = msgbuf->buf + msgbuf->dataoff - blocksize; + if (p < msgbuf->buf) { + CERROR("buf underflow\n"); + return -EINVAL; + } + + get_random_bytes(p, blocksize); + return 0; +} + +__u32 +gss_wrap_kerberos(struct gss_ctx *ctx, + __u32 qop, + rawobj_buf_t *msgbuf, + rawobj_t *token) +{ + struct krb5_ctx *kctx = ctx->internal_ctx_id; + __u32 checksum_type; + rawobj_t data_desc, cipher_out, md5cksum; + int blocksize; + unsigned char *ptr, *krb5_hdr, *msg_start; + int head_len, plain_len; + __u32 seq_send, major; + ENTRY; + + if (qop) { + CERROR("not support qop %x yet\n", qop); + RETURN(GSS_S_FAILURE); + } + + switch (kctx->signalg) { + case SGN_ALG_DES_MAC_MD5: + checksum_type = CKSUMTYPE_RSA_MD5; + break; + default: + CERROR("not support signalg %x\n", kctx->signalg); + RETURN(GSS_S_FAILURE); + } + if (kctx->sealalg != SEAL_ALG_NONE && + kctx->sealalg != SEAL_ALG_DES) { + 
CERROR("not support sealalg %x\n", kctx->sealalg); + RETURN(GSS_S_FAILURE); + } + + blocksize = crypto_tfm_alg_blocksize(kctx->enc); + LASSERT(blocksize <= 16); + LASSERT(blocksize == 8); /* acutally must be 8 for now */ + + if (add_padding(msgbuf, blocksize)) + RETURN(GSS_S_FAILURE); + + /* confounder size == blocksize */ + plain_len = msgbuf->datalen + blocksize; + + head_len = g_token_size(&kctx->mech_used, 22 + plain_len) - + msgbuf->datalen; + + LASSERT(token->len >= head_len); + ptr = token->data; + + /* + * fill in gss header and krb5 header + */ + g_make_token_header(&kctx->mech_used, 22 + plain_len, &ptr); + krb5_hdr = ptr; + msg_start = krb5_hdr + 24; + *ptr++ = (unsigned char) ((KG_TOK_WRAP_MSG >> 8) & 0xff); + *ptr++ = (unsigned char) (KG_TOK_WRAP_MSG & 0xff); + *(__u16 *)(krb5_hdr + 2) = cpu_to_be16(kctx->signalg); + memset(krb5_hdr + 4, 0xff, 4); + *(__u16 *)(krb5_hdr + 4) = cpu_to_be16(kctx->sealalg); + + /* + * prepend confounder on plain text + */ + if (generate_confounder(msgbuf, blocksize)) + RETURN(GSS_S_FAILURE); + + /* + * compute checksum including confounder + */ + data_desc.data = msgbuf->buf + msgbuf->dataoff - blocksize; + data_desc.len = msgbuf->datalen + blocksize; + + if (make_checksum(checksum_type, krb5_hdr, 8, &data_desc, &md5cksum)) { + CERROR("checksum error\n"); + RETURN(GSS_S_FAILURE); + } + + major = GSS_S_FAILURE; + switch (kctx->signalg) { + case SGN_ALG_DES_MAC_MD5: + if (krb5_encrypt(kctx->seq, NULL, md5cksum.data, + md5cksum.data, md5cksum.len)) { + rawobj_free(&md5cksum); + RETURN(GSS_S_FAILURE); + } + memcpy(krb5_hdr + 16, + md5cksum.data + md5cksum.len - KRB5_CKSUM_LENGTH, + KRB5_CKSUM_LENGTH); + break; + default: + LBUG(); + } + + rawobj_free(&md5cksum); + + /* + * fill sequence number in krb5 header + */ + spin_lock(&krb5_seq_lock); + seq_send = kctx->seq_send++; + spin_unlock(&krb5_seq_lock); + + if (krb5_make_seq_num(kctx->seq, kctx->initiate ? 
0 : 0xff, + seq_send, krb5_hdr + 16, krb5_hdr + 8)) + RETURN(GSS_S_FAILURE); + + /* do encryption */ + data_desc.data = msgbuf->buf + msgbuf->dataoff - blocksize; + data_desc.len = msgbuf->datalen + blocksize; + cipher_out.data = msg_start; + cipher_out.len = token->len - (msg_start - token->data); + LASSERT(data_desc.len % blocksize == 0); + LASSERT(data_desc.len <= cipher_out.len); + + if (gss_encrypt_rawobj(kctx->enc, &data_desc, &cipher_out, 1)) + RETURN(GSS_S_FAILURE); + + token->len = (msg_start - token->data) + cipher_out.len; + RETURN(0); +} + +__u32 +gss_unwrap_kerberos(struct gss_ctx *ctx, + __u32 qop, + rawobj_t *in_token, + rawobj_t *out_token) +{ + struct krb5_ctx *kctx = ctx->internal_ctx_id; + int signalg, sealalg; + rawobj_t cipher_in, plain_out, md5cksum; + unsigned char *ptr, *krb5_hdr, *tmpbuf; + int bodysize; + int blocksize, seqnum, direction; + __u32 checksum_type; + __u32 major; + ENTRY; + + ptr = in_token->data; + + /* + * verify gss header + */ + major = g_verify_token_header(&kctx->mech_used, &bodysize, &ptr, + in_token->len); + if (major) { + CERROR("gss token error %d\n", major); + RETURN(GSS_S_FAILURE); + } + + krb5_hdr = ptr; + + if ((*ptr++ != ((KG_TOK_WRAP_MSG >> 8) & 0xff)) || + (*ptr++ != (KG_TOK_WRAP_MSG & 0xff))) { + CERROR("token type not matched\n"); + RETURN(G_BAD_TOK_HEADER); + } + + if (bodysize < 22) { + CERROR("body size only %d\n", bodysize); + RETURN(G_WRONG_SIZE); + } + + /* + * extract algorithms + */ + signalg = ptr[0] | (ptr[1] << 8); + sealalg = ptr[2] | (ptr[3] << 8); + + if (ptr[4] != 0xFF || ptr[5] != 0xFF) { + CERROR("4/5: %d, %d\n", ptr[4], ptr[5]); + RETURN(GSS_S_DEFECTIVE_TOKEN); + } + + if (sealalg != kctx->sealalg) { + CERROR("sealalg %d not matched my %d\n", + sealalg, kctx->sealalg); + RETURN(GSS_S_DEFECTIVE_TOKEN); + } + + if ((kctx->sealalg == SEAL_ALG_NONE && signalg > 1) || + (kctx->sealalg == SEAL_ALG_1 && signalg != SGN_ALG_3) || + (kctx->sealalg == SEAL_ALG_DES3KD && + signalg != 
SGN_ALG_HMAC_SHA1_DES3_KD)) { + CERROR("bad sealalg %d\n", sealalg); + RETURN(GSS_S_DEFECTIVE_TOKEN); + } + + /* make bodysize as the actual cipher text size */ + bodysize -= 22; + if (bodysize <= 0) { + CERROR("cipher text size %d?\n", bodysize); + RETURN(GSS_S_DEFECTIVE_TOKEN); + } + + blocksize = crypto_tfm_alg_blocksize(kctx->enc); + if (bodysize % blocksize) { + CERROR("odd bodysize %d\n", bodysize); + RETURN(GSS_S_DEFECTIVE_TOKEN); + } + + OBD_ALLOC(tmpbuf, bodysize); + if (!tmpbuf) { + CERROR("fail alloc %d\n", bodysize); + RETURN(GSS_S_FAILURE); + } + + cipher_in.data = krb5_hdr + 24; + cipher_in.len = bodysize; + plain_out.data = tmpbuf; + plain_out.len = bodysize; + + major = GSS_S_DEFECTIVE_TOKEN; + if (gss_encrypt_rawobj(kctx->enc, &cipher_in, &plain_out, 0)) { + CERROR("error decrypt: 0x%x\n", major); + GOTO(out_free, major); + } + LASSERT(plain_out.len == bodysize); + + /* + * verify checksum + */ + switch (signalg) { + case SGN_ALG_DES_MAC_MD5: + checksum_type = CKSUMTYPE_RSA_MD5; + major = make_checksum(checksum_type, krb5_hdr, 8, + &plain_out, &md5cksum); + if (major) { + CERROR("make checksum err: 0x%x\n", major); + GOTO(out_free, major); + } + + major = krb5_encrypt(kctx->seq, NULL, md5cksum.data, + md5cksum.data, md5cksum.len); + if (major) { + CERROR("encrypt checksum err: 0x%x\n", major); + rawobj_free(&md5cksum); + GOTO(out_free, major); + } + + if (memcmp(md5cksum.data + 8, krb5_hdr + 16, 8)) { + CERROR("checksum mismatch\n"); + rawobj_free(&md5cksum); + GOTO(out_free, major = GSS_S_BAD_SIG); + } + break; + default: + CERROR("not support signalg %d\n", signalg); + GOTO(out_free, major); + } + + rawobj_free(&md5cksum); + + /* FIXME add expire checking here */ + + major = krb5_get_seq_num(kctx->seq, krb5_hdr + 16, + krb5_hdr + 8, &direction, + &seqnum); + if (major) { + CERROR("get seq number err: 0x%x\n", major); + GOTO(out_free, major); + } + + if ((kctx->initiate && direction != 0xff) || + (!kctx->initiate && direction != 0)) { + 
CERROR("flag checking error\n"); + GOTO(out_free, major = GSS_S_BAD_SIG); + } + + /* FIXME how to remove the padding? */ + + /* + * copy back + */ + if (out_token->len < bodysize - blocksize) { + CERROR("data size %d while buffer only %d\n", + bodysize - blocksize, out_token->len); + GOTO(out_free, major = GSS_S_DEFECTIVE_TOKEN); + } + + out_token->len = bodysize - blocksize; + memcpy(out_token->data, plain_out.data + blocksize, out_token->len); + major = 0; +out_free: + OBD_FREE(tmpbuf, bodysize); + RETURN(major); +} diff --git a/lustre/sec/gss/gss_mech_switch.c b/lustre/sec/gss/gss_mech_switch.c new file mode 100644 index 0000000..f4d1d7f --- /dev/null +++ b/lustre/sec/gss/gss_mech_switch.c @@ -0,0 +1,302 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Modifications for Lustre + * Copyright 2004, Cluster File Systems, Inc. + * All rights reserved + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/gss_mech_switch.c + * + * Copyright (c) 2001 The Regents of the University of Michigan. + * All rights reserved. + * + * J. Bruce Fields + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif +#define DEBUG_SUBSYSTEM S_SEC +#ifdef __KERNEL__ +#include +#include +#include +#else +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +static LIST_HEAD(registered_mechs); +static spinlock_t registered_mechs_lock = SPIN_LOCK_UNLOCKED; + +int +kgss_mech_register(struct gss_api_mech *gm) +{ + spin_lock(®istered_mechs_lock); + list_add(&gm->gm_list, ®istered_mechs); + spin_unlock(®istered_mechs_lock); + CWARN("registered gss mechanism %s\n", gm->gm_name); + return 0; +} + +//EXPORT_SYMBOL(kgss_mech_register); + +void +kgss_mech_unregister(struct gss_api_mech *gm) +{ + spin_lock(®istered_mechs_lock); + list_del(&gm->gm_list); + spin_unlock(®istered_mechs_lock); + CWARN("unregistered gss mechanism %s\n", gm->gm_name); +// gss_mech_free(gm); +} + +//EXPORT_SYMBOL(gss_mech_unregister); + +struct gss_api_mech * +kgss_mech_get(struct gss_api_mech *gm) +{ + __module_get(gm->gm_owner); + return gm; +} + +//EXPORT_SYMBOL(kgss_mech_get); + +struct gss_api_mech * +kgss_name_to_mech(char *name) +{ + struct gss_api_mech *pos, *gm = NULL; + + 
spin_lock(®istered_mechs_lock); + list_for_each_entry(pos, ®istered_mechs, gm_list) { + if (0 == strcmp(name, pos->gm_name)) { + if (!try_module_get(pos->gm_owner)) + continue; + gm = pos; + break; + } + } + spin_unlock(®istered_mechs_lock); + return gm; + +} + +//EXPORT_SYMBOL(gss_name_to_mech); + +static inline int +mech_supports_subflavor(struct gss_api_mech *gm, __u32 subflavor) +{ + int i; + + for (i = 0; i < gm->gm_sf_num; i++) { + if (gm->gm_sfs[i].subflavor == subflavor) + return 1; + } + return 0; +} + +struct gss_api_mech * +kgss_subflavor_to_mech(__u32 subflavor) +{ + struct gss_api_mech *pos, *gm = NULL; + + spin_lock(®istered_mechs_lock); + list_for_each_entry(pos, ®istered_mechs, gm_list) { + if (!try_module_get(pos->gm_owner)) + continue; + if (!mech_supports_subflavor(pos, subflavor)) { + module_put(pos->gm_owner); + continue; + } + gm = pos; + break; + } + spin_unlock(®istered_mechs_lock); + return gm; +} + +//EXPORT_SYMBOL(gss_subflavor_to_mech); + +void +kgss_mech_put(struct gss_api_mech *gm) +{ + module_put(gm->gm_owner); +} + +//EXPORT_SYMBOL(kgss_mech_put); + +/* The mech could probably be determined from the token instead, but it's just + * as easy for now to pass it in. */ +__u32 +kgss_import_sec_context(rawobj_t *input_token, + struct gss_api_mech *mech, + struct gss_ctx **ctx_id) +{ + OBD_ALLOC(*ctx_id, sizeof(**ctx_id)); + if (*ctx_id == NULL) + return GSS_S_FAILURE; + + (*ctx_id)->mech_type = kgss_mech_get(mech); + + LASSERT(mech); + LASSERT(mech->gm_ops); + LASSERT(mech->gm_ops->gss_import_sec_context); + return mech->gm_ops->gss_import_sec_context(input_token, *ctx_id); +} + +/* + * this interface is much simplified, currently we only need endtime. 
+ */ +__u32 +kgss_inquire_context(struct gss_ctx *context_handle, + __u64 *endtime) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_inquire_context); + + return context_handle->mech_type->gm_ops + ->gss_inquire_context(context_handle, + endtime); +} + +/* gss_get_mic: compute a mic over message and return mic_token. */ +__u32 +kgss_get_mic(struct gss_ctx *context_handle, + __u32 qop, + rawobj_t *message, + rawobj_t *mic_token) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_get_mic); + + return context_handle->mech_type->gm_ops + ->gss_get_mic(context_handle, + qop, + message, + mic_token); +} + +/* gss_verify_mic: check whether the provided mic_token verifies message. */ +__u32 +kgss_verify_mic(struct gss_ctx *context_handle, + rawobj_t *message, + rawobj_t *mic_token, + __u32 *qstate) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_verify_mic); + + return context_handle->mech_type->gm_ops + ->gss_verify_mic(context_handle, + message, + mic_token, + qstate); +} + +__u32 +kgss_wrap(struct gss_ctx *context_handle, + __u32 qop, + rawobj_buf_t *inbuf, + rawobj_t *outbuf) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_wrap); + + return context_handle->mech_type->gm_ops + ->gss_wrap(context_handle, qop, inbuf, outbuf); +} + +__u32 +kgss_unwrap(struct gss_ctx *context_handle, + __u32 qop, + rawobj_t *inbuf, + rawobj_t *outbuf) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_unwrap); + + return 
context_handle->mech_type->gm_ops + ->gss_unwrap(context_handle, qop, inbuf, outbuf); +} + + +/* gss_delete_sec_context: free all resources associated with context_handle. + * Note this differs from the RFC 2744-specified prototype in that we don't + * bother returning an output token, since it would never be used anyway. */ + +__u32 +kgss_delete_sec_context(struct gss_ctx **context_handle) +{ + struct gss_api_mech *mech; + + CDEBUG(D_SEC, "deleting %p\n", *context_handle); + + if (!*context_handle) + return(GSS_S_NO_CONTEXT); + + mech = (*context_handle)->mech_type; + if ((*context_handle)->internal_ctx_id != 0) { + LASSERT(mech); + LASSERT(mech->gm_ops); + LASSERT(mech->gm_ops->gss_delete_sec_context); + mech->gm_ops->gss_delete_sec_context( + (*context_handle)->internal_ctx_id); + } + if (mech) + kgss_mech_put(mech); + + OBD_FREE(*context_handle, sizeof(**context_handle)); + *context_handle=NULL; + return GSS_S_COMPLETE; +} diff --git a/lustre/sec/gss/rawobj.c b/lustre/sec/gss/rawobj.c new file mode 100644 index 0000000..6c6edc4 --- /dev/null +++ b/lustre/sec/gss/rawobj.c @@ -0,0 +1,170 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif +#define DEBUG_SUBSYSTEM S_SEC +#ifdef __KERNEL__ +#include +#include +#include +#include +#else +#include +#endif + +#include +#include +#include +#include +#include +#include + +#include "gss_internal.h" + +int rawobj_alloc(rawobj_t *obj, char *buf, int len) +{ + LASSERT(obj); + LASSERT(len >= 0); + + obj->len = len; + if (len) { + OBD_ALLOC(obj->data, len); + if (!obj->data) + RETURN(-ENOMEM); + memcpy(obj->data, buf, len); + } else + obj->data = NULL; + return 0; +} + +void rawobj_free(rawobj_t *obj) +{ + LASSERT(obj); + + if (obj->len) { + LASSERT(obj->data); + OBD_FREE(obj->data, obj->len); + obj->len = 0; + obj->data = NULL; + } else + LASSERT(!obj->data); +} + +int rawobj_equal(rawobj_t *a, rawobj_t *b) +{ + LASSERT(a && b); + + return (a->len == b->len && + !memcmp(a->data, b->data, a->len)); +} + +int rawobj_dup(rawobj_t *dest, rawobj_t *src) +{ + LASSERT(src && dest); + + dest->len = src->len; + if (dest->len) { + OBD_ALLOC(dest->data, dest->len); + if (!dest->data) + return -ENOMEM; + memcpy(dest->data, src->data, dest->len); + } else + dest->data = NULL; + return 0; +} + +int rawobj_serialize(rawobj_t *obj, __u32 **buf, __u32 *buflen) +{ + __u32 len; + + LASSERT(obj); + LASSERT(buf); + LASSERT(buflen); + + len = size_round4(obj->len); + + if (*buflen < 4 + len) { + CERROR("buflen %u < %u\n", *buflen, 4 + len); + return -EINVAL; + } + + *(*buf)++ = cpu_to_le32(obj->len); + memcpy(*buf, obj->data, obj->len); + *buf += (len >> 2); + *buflen -= (4 + len); + + return 0; +} + +static int __rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen, + int alloc, int local) +{ + __u32 len; + + if (*buflen < sizeof(__u32)) { + CERROR("buflen %u\n", *buflen); + return -EINVAL; + } + + obj->len = *(*buf)++; + if (!local) + obj->len = le32_to_cpu(obj->len); + *buflen -= sizeof(__u32); + + if (!obj->len) { + obj->data = NULL; + return 0; + } + + len = local ? 
obj->len : size_round4(obj->len); + if (*buflen < len) { + CERROR("buflen %u < %u\n", *buflen, len); + return -EINVAL; + } + + if (!alloc) + obj->data = (__u8 *) *buf; + else { + OBD_ALLOC(obj->data, obj->len); + if (!obj->data) { + CERROR("fail to alloc %u bytes\n", obj->len); + return -ENOMEM; + } + memcpy(obj->data, *buf, obj->len); + } + + *((char **)buf) += len; + *buflen -= len; + + return 0; +} + +int rawobj_extract(rawobj_t *obj, __u32 **buf, __u32 *buflen) +{ + return __rawobj_extract(obj, buf, buflen, 0, 0); +} + +int rawobj_extract_local(rawobj_t *obj, __u32 **buf, __u32 *buflen) +{ + return __rawobj_extract(obj, buf, buflen, 0, 1); +} diff --git a/lustre/sec/gss/sec_gss.c b/lustre/sec/gss/sec_gss.c new file mode 100644 index 0000000..db89a71 --- /dev/null +++ b/lustre/sec/gss/sec_gss.c @@ -0,0 +1,1799 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Modifications for Lustre + * Copyright 2004, Cluster File Systems, Inc. + * All rights reserved + * Author: Eric Mei + */ + +/* + * linux/net/sunrpc/auth_gss.c + * + * RPCSEC_GSS client authentication. + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Dug Song + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $Id: sec_gss.c,v 1.2 2005/03/31 22:18:24 ericm Exp $ + */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif +#define DEBUG_SUBSYSTEM S_SEC +#ifdef __KERNEL__ +#include +#include +#include +#include +#include +#include +/* for rpc_pipefs */ +struct rpc_clnt; +#include +#else +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +#define GSS_CREDCACHE_EXPIRE (60) /* 1 minute */ +#define GSS_CRED_EXPIRE (8 * 60 * 60) /* 8 hours */ +#define GSS_CRED_SIGN_SIZE (1024) +#define GSS_CRED_VERIFY_SIZE (56) + +#define LUSTRE_PIPEDIR "/lustre" + +/********************************************** + * gss security init/fini helper * + **********************************************/ + +#define SECINIT_RPC_TIMEOUT (10) +#define SECFINI_RPC_TIMEOUT (10) + +static int secinit_compose_request(struct obd_import *imp, + char *buf, int bufsize, + char __user *token) +{ + struct ptlrpcs_wire_hdr *hdr; + struct lustre_msg *lmsg; + char __user *token_buf; + __u64 token_size; + __u32 lmsg_size, *p; + int rc; + + lmsg_size = lustre_msg_size(0, NULL); + + if (copy_from_user(&token_size, token, sizeof(token_size))) { + 
CERROR("read token error\n"); + return -EFAULT; + } + if (sizeof(*hdr) + lmsg_size + size_round(token_size) > bufsize) { + CERROR("token size "LPU64" too large\n", token_size); + return -EINVAL; + } + + if (copy_from_user(&token_buf, (token + sizeof(token_size)), + sizeof(void*))) { + CERROR("read token buf pointer error\n"); + return -EFAULT; + } + + /* security wire hdr */ + hdr = buf_to_sec_hdr(buf); + hdr->flavor = cpu_to_le32(PTLRPC_SEC_GSS); + hdr->sectype = cpu_to_le32(PTLRPC_SEC_TYPE_NONE); + hdr->msg_len = cpu_to_le32(lmsg_size); + hdr->sec_len = cpu_to_le32(7 * 4 + token_size); + + /* lustre message */ + lmsg = buf_to_lustre_msg(buf); + lustre_init_msg(lmsg, 0, NULL, NULL); + lmsg->handle = imp->imp_remote_handle; + lmsg->type = PTL_RPC_MSG_REQUEST; + lmsg->opc = SEC_INIT; + lmsg->flags = 0; + lmsg->conn_cnt = imp->imp_conn_cnt; + + p = (__u32 *) (buf + sizeof(*hdr) + lmsg_size); + + /* gss hdr */ + *p++ = cpu_to_le32(PTLRPC_SEC_GSS_VERSION); /* gss version */ + *p++ = cpu_to_le32(PTLRPC_SEC_GSS_KRB5I); /* subflavor */ + *p++ = cpu_to_le32(PTLRPC_GSS_PROC_INIT); /* proc */ + *p++ = cpu_to_le32(0); /* seq */ + *p++ = cpu_to_le32(PTLRPC_GSS_SVC_NONE); /* service */ + *p++ = cpu_to_le32(0); /* context handle */ + + /* now the token part */ + *p++ = (__u32)(cpu_to_le64(token_size)); + LASSERT(((char *)p - buf) + token_size <= bufsize); + + rc = copy_from_user(p, token_buf, token_size); + if (rc) { + CERROR("can't copy token\n"); + return -EFAULT; + } + + rc = size_round(((char *)p - buf) + token_size); + return rc; +} + +static int secinit_parse_reply(char *repbuf, int replen, + char __user *outbuf, int outlen) +{ + __u32 *p = (__u32 *)repbuf; + __u32 lmsg_len, sec_len, status, major, minor, seq, obj_len, round_len; + __u32 effective = 0; + + if (replen <= (4 + 6) * 4) { + CERROR("reply size %d too small\n", replen); + return -EINVAL; + } + + lmsg_len = le32_to_cpu(p[2]); + sec_len = le32_to_cpu(p[3]); + + /* sanity checks */ + if (p[0] != 
cpu_to_le32(PTLRPC_SEC_GSS) || + p[1] != cpu_to_le32(PTLRPC_SEC_TYPE_NONE)) { + CERROR("unexpected reply\n"); + return -EINVAL; + } + if (lmsg_len % 8 || + 4 * 4 + lmsg_len + sec_len > replen) { + CERROR("unexpected reply\n"); + return -EINVAL; + } + if (sec_len > outlen) { + CERROR("outbuf too small\n"); + return -EINVAL; + } + + p += 4; /* skip hdr */ + p += lmsg_len / 4; /* skip lmsg */ + effective = 0; + + status = le32_to_cpu(*p++); + major = le32_to_cpu(*p++); + minor = le32_to_cpu(*p++); + seq = le32_to_cpu(*p++); + effective += 4 * 4; + + copy_to_user(outbuf, &status, 4); + outbuf += 4; + copy_to_user(outbuf, &major, 4); + outbuf += 4; + copy_to_user(outbuf, &minor, 4); + outbuf += 4; + copy_to_user(outbuf, &seq, 4); + outbuf += 4; + + obj_len = le32_to_cpu(*p++); + round_len = (obj_len + 3) & ~ 3; + copy_to_user(outbuf, &obj_len, 4); + outbuf += 4; + copy_to_user(outbuf, (char *)p, round_len); + p += round_len / 4; + outbuf += round_len; + effective += 4 + round_len; + + obj_len = le32_to_cpu(*p++); + round_len = (obj_len + 3) & ~ 3; + copy_to_user(outbuf, &obj_len, 4); + outbuf += 4; + copy_to_user(outbuf, (char *)p, round_len); + p += round_len / 4; + outbuf += round_len; + effective += 4 + round_len; + + return effective; +} + +/* input: + * 1. ptr to uuid + * 2. ptr to send_token + * 3. ptr to output buffer + * 4. output buffer size + * output: + * 1. return code. 0 is success + * 2. no meaning + * 3. ptr output data + * 4. 
output data size + * + * return: + * < 0: error + * = 0: success + * + * FIXME This interface looks strange, should be reimplemented + */ +static int gss_send_secinit_rpc(__user char *buffer, unsigned long count) +{ + struct obd_import *imp; + const int reqbuf_size = 1024; + const int repbuf_size = 1024; + char *reqbuf, *repbuf; + struct obd_device *obd; + char obdname[64]; + long inbuf[4], lsize; + int rc, reqlen, replen; + + if (count != 4 * sizeof(long)) { + CERROR("count %lu\n", count); + RETURN(-EINVAL); + } + if (copy_from_user(inbuf, buffer, count)) { + CERROR("Invalid pointer\n"); + RETURN(-EFAULT); + } + + /* take name */ + if (strncpy_from_user(obdname, (char *)inbuf[0], + sizeof(obdname)) <= 0) { + CERROR("Invalid obdname pointer\n"); + RETURN(-EFAULT); + } + + obd = class_name2obd(obdname); + if (!obd) { + CERROR("no such obd %s\n", obdname); + RETURN(-EINVAL); + } + if (strcmp(obd->obd_type->typ_name, "mdc") && + strcmp(obd->obd_type->typ_name, "osc")) { + CERROR("%s not a mdc/osc device\n", obdname); + RETURN(-EINVAL); + } + + imp = class_import_get(obd->u.cli.cl_import); + + OBD_ALLOC(reqbuf, reqbuf_size); + OBD_ALLOC(repbuf, reqbuf_size); + + if (!reqbuf || !repbuf) { + CERROR("Can't alloc buffer: %p/%p\n", reqbuf, repbuf); + GOTO(out_free, rc = -ENOMEM); + } + + /* get token */ + reqlen = secinit_compose_request(imp, reqbuf, reqbuf_size, + (char *)inbuf[1]); + if (reqlen < 0) + GOTO(out_free, rc = reqlen); + + replen = repbuf_size; + rc = ptlrpc_do_rawrpc(imp, reqbuf, reqlen, + repbuf, &replen, SECINIT_RPC_TIMEOUT); + if (rc) + GOTO(out_free, rc); + + if (replen > inbuf[3]) { + CERROR("output buffer size %ld too small, need %d\n", + inbuf[3], replen); + GOTO(out_free, rc = -EINVAL); + } + + lsize = secinit_parse_reply(repbuf, replen, + (char *)inbuf[2], (int)inbuf[3]); + if (lsize < 0) + GOTO(out_free, rc = (int)lsize); + + copy_to_user(buffer + 3 * sizeof(long), &lsize, sizeof(lsize)); + lsize = 0; + copy_to_user((char*)buffer, &lsize, 
sizeof(lsize)); + rc = 0; +out_free: + class_import_put(imp); + if (repbuf) + OBD_FREE(repbuf, repbuf_size); + if (reqbuf) + OBD_FREE(reqbuf, reqbuf_size); + RETURN(rc); +} + +static int gss_send_secfini_rpc(struct obd_import *imp, + char *reqbuf, int reqlen) +{ + const int repbuf_size = 1024; + char *repbuf; + int replen = repbuf_size; + int rc; + + OBD_ALLOC(repbuf, repbuf_size); + if (!repbuf) { + CERROR("Out of memory\n"); + return -ENOMEM; + } + + rc = ptlrpc_do_rawrpc(imp, reqbuf, reqlen, repbuf, &replen, + SECFINI_RPC_TIMEOUT); + + OBD_FREE(repbuf, repbuf_size); + return rc; +} + +/********************************************** + * structure definitions * + **********************************************/ +struct gss_sec { + struct ptlrpc_sec gs_base; + struct gss_api_mech *gs_mech; +#ifdef __KERNEL__ + spinlock_t gs_lock; + struct list_head gs_upcalls; + char gs_pipepath[64]; + struct dentry *gs_depipe; +#endif +}; + +static rwlock_t gss_ctx_lock = RW_LOCK_UNLOCKED; + +#ifdef __KERNEL__ + +struct gss_upcall_msg { + struct rpc_pipe_msg gum_base; + atomic_t gum_refcount; + struct list_head gum_list; + struct gss_sec *gum_gsec; + wait_queue_head_t gum_waitq; + char gum_obdname[64]; + uid_t gum_uid; + __u32 gum_ip; /* XXX IPv6? */ + __u32 gum_svc; + __u32 gum_pad; +}; + +/********************************************** + * rpc_pipe upcall helpers * + **********************************************/ +static +void gss_release_msg(struct gss_upcall_msg *gmsg) +{ + ENTRY; + LASSERT(atomic_read(&gmsg->gum_refcount) > 0); + + if (!atomic_dec_and_test(&gmsg->gum_refcount)) { + CDEBUG(D_SEC, "gmsg %p ref %d\n", gmsg, + atomic_read(&gmsg->gum_refcount)); + EXIT; + return; + } + LASSERT(list_empty(&gmsg->gum_list)); + OBD_FREE(gmsg, sizeof(*gmsg)); + EXIT; +} + +static void +gss_unhash_msg_nolock(struct gss_upcall_msg *gmsg) +{ + ENTRY; + if (list_empty(&gmsg->gum_list)) { + EXIT; + return; + } + /* FIXME should not do this. 
when we in upper upcall queue, + * downcall will call unhash_msg, thus later put_msg might + * free msg buffer while it's not dequeued XXX */ + list_del_init(&gmsg->gum_base.list); + /* FIXME */ + + list_del_init(&gmsg->gum_list); + wake_up(&gmsg->gum_waitq); + atomic_dec(&gmsg->gum_refcount); + CDEBUG(D_SEC, "gmsg %p refcount now %d\n", + gmsg, atomic_read(&gmsg->gum_refcount)); + LASSERT(atomic_read(&gmsg->gum_refcount) > 0); + EXIT; +} + +static void +gss_unhash_msg(struct gss_upcall_msg *gmsg) +{ + struct gss_sec *gsec = gmsg->gum_gsec; + + spin_lock(&gsec->gs_lock); + gss_unhash_msg_nolock(gmsg); + spin_unlock(&gsec->gs_lock); +} + +static +struct gss_upcall_msg * gss_find_upcall(struct gss_sec *gsec, + char *obdname, + uid_t uid, __u32 dest_ip) +{ + struct gss_upcall_msg *gmsg; + ENTRY; + + list_for_each_entry(gmsg, &gsec->gs_upcalls, gum_list) { + if (gmsg->gum_uid != uid) + continue; + if (gmsg->gum_ip != dest_ip) + continue; + if (strcmp(gmsg->gum_obdname, obdname)) + continue; + atomic_inc(&gmsg->gum_refcount); + CDEBUG(D_SEC, "found gmsg at %p: obdname %s, uid %d, ref %d\n", + gmsg, obdname, uid, atomic_read(&gmsg->gum_refcount)); + RETURN(gmsg); + } + RETURN(NULL); +} + +static void gss_init_upcall_msg(struct gss_upcall_msg *gmsg, + struct gss_sec *gsec, + char *obdname, + uid_t uid, __u32 dest_ip, __u32 svc) +{ + struct rpc_pipe_msg *rpcmsg; + ENTRY; + + /* 2 refs: 1 for hash, 1 for current user */ + init_waitqueue_head(&gmsg->gum_waitq); + list_add(&gmsg->gum_list, &gsec->gs_upcalls); + atomic_set(&gmsg->gum_refcount, 2); + gmsg->gum_gsec = gsec; + strncpy(gmsg->gum_obdname, obdname, sizeof(gmsg->gum_obdname)); + gmsg->gum_uid = uid; + gmsg->gum_ip = dest_ip; + gmsg->gum_svc = svc; + + rpcmsg = &gmsg->gum_base; + rpcmsg->data = &gmsg->gum_uid; + rpcmsg->len = sizeof(gmsg->gum_uid) + sizeof(gmsg->gum_ip) + + sizeof(gmsg->gum_svc) + sizeof(gmsg->gum_pad); + EXIT; +} +#endif /* __KERNEL__ */ + +/******************************************** + * gss cred 
manupulation helpers * + ********************************************/ +static +int gss_cred_is_uptodate_ctx(struct ptlrpc_cred *cred) +{ + struct gss_cred *gcred = container_of(cred, struct gss_cred, gc_base); + int res = 0; + + read_lock(&gss_ctx_lock); + if ((cred->pc_flags & PTLRPC_CRED_UPTODATE) && gcred->gc_ctx) + res = 1; + read_unlock(&gss_ctx_lock); + return res; +} + +static inline +struct gss_cl_ctx * gss_get_ctx(struct gss_cl_ctx *ctx) +{ + atomic_inc(&ctx->gc_refcount); + return ctx; +} + +static +void gss_destroy_ctx(struct gss_cl_ctx *ctx) +{ + ENTRY; + + CDEBUG(D_SEC, "destroy cl_ctx %p\n", ctx); + if (ctx->gc_gss_ctx) + kgss_delete_sec_context(&ctx->gc_gss_ctx); + + if (ctx->gc_wire_ctx.len > 0) { + OBD_FREE(ctx->gc_wire_ctx.data, ctx->gc_wire_ctx.len); + ctx->gc_wire_ctx.len = 0; + } + + OBD_FREE(ctx, sizeof(*ctx)); +} + +static +void gss_put_ctx(struct gss_cl_ctx *ctx) +{ + if (atomic_dec_and_test(&ctx->gc_refcount)) + gss_destroy_ctx(ctx); +} + +static +struct gss_cl_ctx *gss_cred_get_ctx(struct ptlrpc_cred *cred) +{ + struct gss_cred *gcred = container_of(cred, struct gss_cred, gc_base); + struct gss_cl_ctx *ctx = NULL; + + read_lock(&gss_ctx_lock); + if (gcred->gc_ctx) + ctx = gss_get_ctx(gcred->gc_ctx); + read_unlock(&gss_ctx_lock); + return ctx; +} + +static +void gss_cred_set_ctx(struct ptlrpc_cred *cred, struct gss_cl_ctx *ctx) +{ + struct gss_cred *gcred = container_of(cred, struct gss_cred, gc_base); + struct gss_cl_ctx *old; + __u64 ctx_expiry; + ENTRY; + + if (kgss_inquire_context(ctx->gc_gss_ctx, &ctx_expiry)) { + CERROR("unable to get expire time\n"); + ctx_expiry = 1; /* make it expired now */ + } + cred->pc_expire = (unsigned long) ctx_expiry; + + write_lock(&gss_ctx_lock); + old = gcred->gc_ctx; + gcred->gc_ctx = ctx; + cred->pc_flags |= PTLRPC_CRED_UPTODATE; + write_unlock(&gss_ctx_lock); + if (old) + gss_put_ctx(old); + + CWARN("client refreshed gss cred %p(uid %u)\n", cred, cred->pc_uid); + EXIT; +} + +static int 
+simple_get_bytes(char **buf, __u32 *buflen, void *res, __u32 reslen) +{ + if (*buflen < reslen) { + CERROR("buflen %u < %u\n", *buflen, reslen); + return -EINVAL; + } + + memcpy(res, *buf, reslen); + *buf += reslen; + *buflen -= reslen; + return 0; +} + +/* data passed down: + * - uid + * - timeout + * - gc_win / error + * - wire_ctx (rawobj) + * - mech_ctx? (rawobj) + */ +static +int gss_parse_init_downcall(struct gss_api_mech *gm, rawobj_t *buf, + struct gss_cl_ctx **gc, struct vfs_cred *vcred, + __u32 *dest_ip, int *gss_err) +{ + char *p = buf->data; + __u32 len = buf->len; + struct gss_cl_ctx *ctx; + rawobj_t tmp_buf; + unsigned int timeout; + int err = -EIO; + ENTRY; + + *gc = NULL; + + OBD_ALLOC(ctx, sizeof(*ctx)); + if (!ctx) + RETURN(-ENOMEM); + + ctx->gc_proc = RPC_GSS_PROC_DATA; + ctx->gc_seq = 0; + spin_lock_init(&ctx->gc_seq_lock); + atomic_set(&ctx->gc_refcount,1); + + if (simple_get_bytes(&p, &len, &vcred->vc_uid, sizeof(vcred->vc_uid))) + GOTO(err_free_ctx, err); + vcred->vc_pag = vcred->vc_uid; /* FIXME */ + if (simple_get_bytes(&p, &len, dest_ip, sizeof(*dest_ip))) + GOTO(err_free_ctx, err); + /* FIXME: discarded timeout for now */ + if (simple_get_bytes(&p, &len, &timeout, sizeof(timeout))) + GOTO(err_free_ctx, err); + *gss_err = 0; + if (simple_get_bytes(&p, &len, &ctx->gc_win, sizeof(ctx->gc_win))) + GOTO(err_free_ctx, err); + /* gssd signals an error by passing ctx->gc_win = 0: */ + if (!ctx->gc_win) { + /* in which case the next int is an error code: */ + if (simple_get_bytes(&p, &len, gss_err, sizeof(*gss_err))) + GOTO(err_free_ctx, err); + GOTO(err_free_ctx, err = 0); + } + if (rawobj_extract_local(&tmp_buf, (__u32 **) &p, &len)) + GOTO(err_free_ctx, err); + if (rawobj_dup(&ctx->gc_wire_ctx, &tmp_buf)) { + GOTO(err_free_ctx, err = -ENOMEM); + } + if (rawobj_extract_local(&tmp_buf, (__u32 **) &p, &len)) + GOTO(err_free_wire_ctx, err); + if (len) { + CERROR("unexpected trailing %u bytes\n", len); + GOTO(err_free_wire_ctx, err); + } + if 
(kgss_import_sec_context(&tmp_buf, gm, &ctx->gc_gss_ctx)) + GOTO(err_free_wire_ctx, err); + + *gc = ctx; + RETURN(0); + +err_free_wire_ctx: + if (ctx->gc_wire_ctx.data) + OBD_FREE(ctx->gc_wire_ctx.data, ctx->gc_wire_ctx.len); +err_free_ctx: + OBD_FREE(ctx, sizeof(*ctx)); + CDEBUG(D_SEC, "err_code %d, gss code %d\n", err, *gss_err); + return err; +} + +/*************************************** + * cred APIs * + ***************************************/ +#ifdef __KERNEL__ +static int gss_cred_refresh(struct ptlrpc_cred *cred) +{ + struct obd_import *import; + struct gss_sec *gsec; + struct gss_upcall_msg *gss_msg, *gss_new; + struct dentry *dentry; + char *obdname, *obdtype; + wait_queue_t wait; + uid_t uid = cred->pc_uid; + ptl_nid_t peer_nid; + __u32 dest_ip, svc; + int res; + ENTRY; + + if (ptlrpcs_cred_is_uptodate(cred)) + RETURN(0); + + LASSERT(cred->pc_sec); + LASSERT(cred->pc_sec->ps_import); + LASSERT(cred->pc_sec->ps_import->imp_obd); + + import = cred->pc_sec->ps_import; + if (!import->imp_connection) { + CERROR("import has no connection set\n"); + RETURN(-EINVAL); + } + + peer_nid = import->imp_connection->c_peer.peer_id.nid; + dest_ip = (__u32) (peer_nid & 0xFFFFFFFF); + + obdtype = import->imp_obd->obd_type->typ_name; + if (!strcmp(obdtype, "mdc")) + svc = 0; + else if (!strcmp(obdtype, "osc")) + svc = 1; + else { + CERROR("gss on %s?\n", obdtype); + RETURN(-EINVAL); + } + + gsec = container_of(cred->pc_sec, struct gss_sec, gs_base); + obdname = import->imp_obd->obd_name; + dentry = gsec->gs_depipe; + gss_new = NULL; + res = 0; + + CWARN("Initiate gss context %p(%u@%s)\n", + container_of(cred, struct gss_cred, gc_base), + uid, import->imp_target_uuid.uuid); + +again: + spin_lock(&gsec->gs_lock); + gss_msg = gss_find_upcall(gsec, obdname, uid, dest_ip); + if (gss_msg) { + spin_unlock(&gsec->gs_lock); + GOTO(waiting, res); + } + if (!gss_new) { + spin_unlock(&gsec->gs_lock); + OBD_ALLOC(gss_new, sizeof(*gss_new)); + if (!gss_new) { + CERROR("fail to alloc 
memory\n"); + RETURN(-ENOMEM); + } + goto again; + } + /* so far we'v created gss_new */ + gss_init_upcall_msg(gss_new, gsec, obdname, uid, dest_ip, svc); + + if (gss_cred_is_uptodate_ctx(cred)) { + /* someone else had done it for us, simply cancel + * our own upcall */ + CDEBUG(D_SEC, "cred("LPU64"/%u) has been refreshed by someone " + "else, simply drop our request\n", + cred->pc_pag, cred->pc_uid); + gss_unhash_msg_nolock(gss_new); + spin_unlock(&gsec->gs_lock); + gss_release_msg(gss_new); + RETURN(0); + } + + /* need to make upcall now */ + spin_unlock(&gsec->gs_lock); + res = rpc_queue_upcall(dentry->d_inode, &gss_new->gum_base); + if (res) { + CERROR("rpc_queue_upcall failed: %d\n", res); + gss_unhash_msg(gss_new); + gss_release_msg(gss_new); + RETURN(res); + } + gss_msg = gss_new; + +waiting: + init_waitqueue_entry(&wait, current); + spin_lock(&gsec->gs_lock); + add_wait_queue(&gss_msg->gum_waitq, &wait); + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock(&gsec->gs_lock); + + schedule(); + + remove_wait_queue(&gss_msg->gum_waitq, &wait); + if (signal_pending(current)) { + CERROR("interrupted gss upcall %p\n", gss_msg); + res = -EINTR; + } + gss_release_msg(gss_msg); + RETURN(res); +} +#else /* !__KERNEL__ */ +extern int lgss_handle_krb5_upcall(uid_t uid, __u32 dest_ip, + char *obd_name, + char *buf, int bufsize, + int (*callback)(char*, unsigned long)); + +static int gss_cred_refresh(struct ptlrpc_cred *cred) +{ + char buf[4096]; + rawobj_t obj; + struct obd_import *imp; + struct gss_sec *gsec; + struct gss_api_mech *mech; + struct gss_cl_ctx *ctx = NULL; + struct vfs_cred vcred = { 0 }; + ptl_nid_t peer_nid; + __u32 dest_ip; + __u32 subflavor; + int rc, gss_err; + + LASSERT(cred); + LASSERT(cred->pc_sec); + LASSERT(cred->pc_sec->ps_import); + LASSERT(cred->pc_sec->ps_import->imp_obd); + + if (ptlrpcs_cred_is_uptodate(cred)) + RETURN(0); + + imp = cred->pc_sec->ps_import; + peer_nid = imp->imp_connection->c_peer.peer_id.nid; + dest_ip = (__u32) (peer_nid 
& 0xFFFFFFFF); + subflavor = cred->pc_sec->ps_flavor.subflavor; + + if (subflavor != PTLRPC_SEC_GSS_KRB5I) { + CERROR("unknown subflavor %u\n", subflavor); + GOTO(err_out, rc = -EINVAL); + } + + rc = lgss_handle_krb5_upcall(cred->pc_uid, dest_ip, + imp->imp_obd->obd_name, + buf, sizeof(buf), + gss_send_secinit_rpc); + LASSERT(rc != 0); + if (rc < 0) + goto err_out; + + obj.data = buf; + obj.len = rc; + + gsec = container_of(cred->pc_sec, struct gss_sec, gs_base); + mech = gsec->gs_mech; + LASSERT(mech); + rc = gss_parse_init_downcall(mech, &obj, &ctx, &vcred, &dest_ip, + &gss_err); + if (rc) { + CERROR("parse init downcall error %d\n", rc); + goto err_out; + } + + if (gss_err) { + CERROR("cred fresh got gss error %x\n", gss_err); + rc = -EINVAL; + goto err_out; + } + + gss_cred_set_ctx(cred, ctx); + LASSERT(gss_cred_is_uptodate_ctx(cred)); + + return 0; +err_out: + cred->pc_flags |= PTLRPC_CRED_DEAD; + return rc; +} +#endif + +static int gss_cred_match(struct ptlrpc_cred *cred, + struct ptlrpc_request *req, + struct vfs_cred *vcred) +{ + RETURN(cred->pc_pag == vcred->vc_pag); +} + +static int gss_cred_sign(struct ptlrpc_cred *cred, + struct ptlrpc_request *req) +{ + struct gss_cred *gcred; + struct gss_cl_ctx *ctx; + rawobj_t lmsg, mic; + __u32 *vp, *vpsave, vlen, seclen; + __u32 seqnum, major, rc = 0; + ENTRY; + + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_cred == cred); + + gcred = container_of(cred, struct gss_cred, gc_base); + ctx = gss_cred_get_ctx(cred); + if (!ctx) { + CERROR("cred %p("LPU64"/%u) invalidated?\n", + cred, cred->pc_pag, cred->pc_uid); + RETURN(-EPERM); + } + + lmsg.len = req->rq_reqlen; + lmsg.data = (__u8 *) req->rq_reqmsg; + + vp = (__u32 *) (lmsg.data + lmsg.len); + vlen = req->rq_reqbuf_len - sizeof(struct ptlrpcs_wire_hdr) - + lmsg.len; + seclen = vlen; + + if (vlen < 6 * 4 + size_round4(ctx->gc_wire_ctx.len)) { + CERROR("vlen %d, need %d\n", + vlen, 6 * 4 + size_round4(ctx->gc_wire_ctx.len)); + rc = -EIO; + goto out; + } + + 
spin_lock(&ctx->gc_seq_lock); + seqnum = ctx->gc_seq++; + spin_unlock(&ctx->gc_seq_lock); + + *vp++ = cpu_to_le32(PTLRPC_SEC_GSS_VERSION); /* version */ + *vp++ = cpu_to_le32(PTLRPC_SEC_GSS_KRB5I); /* subflavor */ + *vp++ = cpu_to_le32(ctx->gc_proc); /* proc */ + *vp++ = cpu_to_le32(seqnum); /* seq */ + *vp++ = cpu_to_le32(PTLRPC_GSS_SVC_INTEGRITY); /* service */ + vlen -= 5 * 4; + + if (rawobj_serialize(&ctx->gc_wire_ctx, &vp, &vlen)) { + rc = -EIO; + goto out; + } + CDEBUG(D_SEC, "encoded wire_ctx length %d\n", ctx->gc_wire_ctx.len); + + vpsave = vp++; /* reserve for size */ + vlen -= 4; + + mic.len = vlen; + mic.data = (char *) vp; + + CDEBUG(D_SEC, "reqbuf at %p, lmsg at %p, len %d, mic at %p, len %d\n", + req->rq_reqbuf, lmsg.data, lmsg.len, mic.data, mic.len); + major = kgss_get_mic(ctx->gc_gss_ctx, GSS_C_QOP_DEFAULT, &lmsg, &mic); + if (major) { + CERROR("gss compute mic error, major %x\n", major); + rc = -EACCES; + goto out; + } + + *vpsave = cpu_to_le32(mic.len); + + seclen = seclen - vlen + mic.len; + buf_to_sec_hdr(req->rq_reqbuf)->sec_len = cpu_to_le32(seclen); + req->rq_reqdata_len += size_round(seclen); + CDEBUG(D_SEC, "msg size %d, checksum size %d, total sec size %d\n", + lmsg.len, mic.len, seclen); +out: + gss_put_ctx(ctx); + RETURN(rc); +} + +static int gss_cred_verify(struct ptlrpc_cred *cred, + struct ptlrpc_request *req) +{ + struct gss_cred *gcred; + struct gss_cl_ctx *ctx; + struct ptlrpcs_wire_hdr *sec_hdr; + rawobj_t lmsg, mic; + __u32 *vp, vlen, subflavor, proc, seq, svc; + __u32 major, minor, rc; + ENTRY; + + LASSERT(req->rq_repbuf); + LASSERT(req->rq_cred == cred); + + sec_hdr = buf_to_sec_hdr(req->rq_repbuf); + vp = (__u32 *) (req->rq_repbuf + sizeof(*sec_hdr) + sec_hdr->msg_len); + vlen = sec_hdr->sec_len; + + if (vlen < 7 * 4) { + CERROR("reply sec size %u too small\n", vlen); + RETURN(-EPROTO); + } + + if (*vp++ != cpu_to_le32(PTLRPC_SEC_GSS_VERSION)) { + CERROR("reply have different gss version\n"); + RETURN(-EPROTO); + } + 
subflavor = le32_to_cpu(*vp++); + proc = le32_to_cpu(*vp++); + vlen -= 3 * 4; + + switch (proc) { + case PTLRPC_GSS_PROC_DATA: + seq = le32_to_cpu(*vp++); + svc = le32_to_cpu(*vp++); + if (svc != PTLRPC_GSS_SVC_INTEGRITY) { + CERROR("Unknown svc %d\n", svc); + RETURN(-EPROTO); + } + if (*vp++ != 0) { + CERROR("Unexpected ctx handle\n"); + RETURN(-EPROTO); + } + mic.len = le32_to_cpu(*vp++); + vlen -= 4 * 4; + if (vlen < mic.len) { + CERROR("vlen %d, mic.len %d\n", vlen, mic.len); + RETURN(-EINVAL); + } + mic.data = (char *) vp; + + gcred = container_of(cred, struct gss_cred, gc_base); + ctx = gss_cred_get_ctx(cred); + LASSERT(ctx); + + lmsg.len = sec_hdr->msg_len; + lmsg.data = (__u8 *) buf_to_lustre_msg(req->rq_repbuf); + + major = kgss_verify_mic(ctx->gc_gss_ctx, &lmsg, &mic, NULL); + if (major != GSS_S_COMPLETE) { + CERROR("gss verify mic error: major %x\n", major); + GOTO(proc_data_out, rc = -EINVAL); + } + + req->rq_repmsg = (struct lustre_msg *) lmsg.data; + req->rq_replen = lmsg.len; + + /* here we could check the seq number is the same one + * we sent to server. but portals has prevent us from + * replay attack, so maybe we don't need check it again. + */ + rc = 0; +proc_data_out: + gss_put_ctx(ctx); + break; + case PTLRPC_GSS_PROC_ERR: + major = le32_to_cpu(*vp++); + minor = le32_to_cpu(*vp++); + /* server return NO_CONTEXT might be caused by context expire + * or server reboot/failover. we refresh the cred transparently + * to upper layer. + * In some cases, our gss handle is possible to be incidentally + * identical to another handle since the handle itself is not + * fully random. In krb5 case, the GSS_S_BAD_SIG will be + * returned, maybe other gss error for other mechanism. Here we + * only consider krb5 mech (FIXME) and try to establish new + * context. + */ + if (major == GSS_S_NO_CONTEXT || + major == GSS_S_BAD_SIG) { + CWARN("req %p: server report cred %p %s, expired?\n", + req, cred, (major == GSS_S_NO_CONTEXT) ? 
+ "NO_CONTEXT" : "BAD_SIG"); + + ptlrpcs_cred_die(cred); + rc = ptlrpcs_req_replace_dead_cred(req); + if (!rc) + req->rq_ptlrpcs_restart = 1; + else + CERROR("replace dead cred failed %d\n", rc); + } else { + CERROR("Unrecognized gss error (%x/%x)\n", + major, minor); + rc = -EACCES; + } + break; + default: + CERROR("unknown gss proc %d\n", proc); + rc = -EPROTO; + } + + RETURN(rc); +} + +static int gss_cred_seal(struct ptlrpc_cred *cred, + struct ptlrpc_request *req) +{ + struct gss_cred *gcred; + struct gss_cl_ctx *ctx; + struct ptlrpcs_wire_hdr *sec_hdr; + rawobj_buf_t msg_buf; + rawobj_t cipher_buf; + __u32 *vp, *vpsave, vlen, seclen; + __u32 major, seqnum, rc = 0; + ENTRY; + + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_cred == cred); + + gcred = container_of(cred, struct gss_cred, gc_base); + ctx = gss_cred_get_ctx(cred); + if (!ctx) { + CERROR("cred %p("LPU64"/%u) invalidated?\n", + cred, cred->pc_pag, cred->pc_uid); + RETURN(-EPERM); + } + + vp = (__u32 *) (req->rq_reqbuf + sizeof(*sec_hdr)); + vlen = req->rq_reqbuf_len - sizeof(*sec_hdr); + seclen = vlen; + + if (vlen < 6 * 4 + size_round4(ctx->gc_wire_ctx.len)) { + CERROR("vlen %d, need %d\n", + vlen, 6 * 4 + size_round4(ctx->gc_wire_ctx.len)); + rc = -EIO; + goto out; + } + + spin_lock(&ctx->gc_seq_lock); + seqnum = ctx->gc_seq++; + spin_unlock(&ctx->gc_seq_lock); + + *vp++ = cpu_to_le32(PTLRPC_SEC_GSS_VERSION); /* version */ + *vp++ = cpu_to_le32(PTLRPC_SEC_GSS_KRB5P); /* subflavor */ + *vp++ = cpu_to_le32(ctx->gc_proc); /* proc */ + *vp++ = cpu_to_le32(seqnum); /* seq */ + *vp++ = cpu_to_le32(PTLRPC_GSS_SVC_PRIVACY); /* service */ + vlen -= 5 * 4; + + if (rawobj_serialize(&ctx->gc_wire_ctx, &vp, &vlen)) { + rc = -EIO; + goto out; + } + CDEBUG(D_SEC, "encoded wire_ctx length %d\n", ctx->gc_wire_ctx.len); + + vpsave = vp++; /* reserve for size */ + vlen -= 4; + + msg_buf.buf = (__u8 *) req->rq_reqmsg - GSS_PRIVBUF_PREFIX_LEN; + msg_buf.buflen = req->rq_reqlen + GSS_PRIVBUF_PREFIX_LEN + 
GSS_PRIVBUF_SUFFIX_LEN; + msg_buf.dataoff = GSS_PRIVBUF_PREFIX_LEN; + msg_buf.datalen = req->rq_reqlen; + + cipher_buf.data = (__u8 *) vp; + cipher_buf.len = vlen; + + major = kgss_wrap(ctx->gc_gss_ctx, GSS_C_QOP_DEFAULT, + &msg_buf, &cipher_buf); + if (major) { + CERROR("error wrap: major 0x%x\n", major); + GOTO(out, rc = -EINVAL); + } + + *vpsave = cpu_to_le32(cipher_buf.len); + + seclen = seclen - vlen + cipher_buf.len; + sec_hdr = buf_to_sec_hdr(req->rq_reqbuf); + sec_hdr->sec_len = cpu_to_le32(seclen); + req->rq_reqdata_len += size_round(seclen); + + CDEBUG(D_SEC, "msg size %d, total sec size %d\n", + req->rq_reqlen, seclen); +out: + gss_put_ctx(ctx); + RETURN(rc); +} + +static int gss_cred_unseal(struct ptlrpc_cred *cred, + struct ptlrpc_request *req) +{ + struct gss_cred *gcred; + struct gss_cl_ctx *ctx; + struct ptlrpcs_wire_hdr *sec_hdr; + rawobj_t cipher_text, plain_text; + __u32 *vp, vlen, subflavor, proc, seq, svc; + int rc; + ENTRY; + + LASSERT(req->rq_repbuf); + LASSERT(req->rq_cred == cred); + + sec_hdr = buf_to_sec_hdr(req->rq_repbuf); + if (sec_hdr->msg_len != 0) { + CERROR("unexpected msg_len %u\n", sec_hdr->msg_len); + RETURN(-EPROTO); + } + + vp = (__u32 *) (req->rq_repbuf + sizeof(*sec_hdr)); + vlen = sec_hdr->sec_len; + + if (vlen < 7 * 4) { + CERROR("reply sec size %u too small\n", vlen); + RETURN(-EPROTO); + } + + if (*vp++ != cpu_to_le32(PTLRPC_SEC_GSS_VERSION)) { + CERROR("reply have different gss version\n"); + RETURN(-EPROTO); + } + subflavor = le32_to_cpu(*vp++); + proc = le32_to_cpu(*vp++); + seq = le32_to_cpu(*vp++); + svc = le32_to_cpu(*vp++); + vlen -= 5 * 4; + + switch (proc) { + case PTLRPC_GSS_PROC_DATA: + if (svc != PTLRPC_GSS_SVC_PRIVACY) { + CERROR("Unknown svc %d\n", svc); + RETURN(-EPROTO); + } + if (*vp++ != 0) { + CERROR("Unexpected ctx handle\n"); + RETURN(-EPROTO); + } + vlen -= 4; + + cipher_text.len = le32_to_cpu(*vp++); + cipher_text.data = (__u8 *) vp; + vlen -= 4; + + if (vlen < cipher_text.len) { + CERROR("cipher 
text to be %u while buf only %u\n", + cipher_text.len, vlen); + RETURN(-EPROTO); + } + + plain_text = cipher_text; + + gcred = container_of(cred, struct gss_cred, gc_base); + ctx = gss_cred_get_ctx(cred); + LASSERT(ctx); + + rc = kgss_unwrap(ctx->gc_gss_ctx, GSS_C_QOP_DEFAULT, + &cipher_text, &plain_text); + if (rc) { + CERROR("error unwrap: 0x%x\n", rc); + GOTO(proc_out, rc = -EINVAL); + } + + req->rq_repmsg = (struct lustre_msg *) vp; + req->rq_replen = plain_text.len; + + rc = 0; +proc_out: + gss_put_ctx(ctx); + break; + default: + CERROR("unknown gss proc %d\n", proc); + rc = -EPROTO; + } + + RETURN(rc); +} + +static void destroy_gss_context(struct ptlrpc_cred *cred) +{ + struct ptlrpcs_wire_hdr *hdr; + struct lustre_msg *lmsg; + struct gss_cred *gcred; + struct ptlrpc_request req; + struct obd_import *imp; + __u32 *vp, lmsg_size; + ENTRY; + + /* cred's refcount is 0, steal one */ + atomic_inc(&cred->pc_refcount); + + gcred = container_of(cred, struct gss_cred, gc_base); + gcred->gc_ctx->gc_proc = PTLRPC_GSS_PROC_DESTROY; + imp = cred->pc_sec->ps_import; + LASSERT(imp); + + if (!(cred->pc_flags & PTLRPC_CRED_UPTODATE)) { + CWARN("Destroy a dead gss cred %p(%u@%s), don't send rpc\n", + gcred, cred->pc_uid, imp->imp_target_uuid.uuid); + atomic_dec(&cred->pc_refcount); + EXIT; + return; + } + + CWARN("client destroy gss cred %p(%u@%s)\n", + gcred, cred->pc_uid, imp->imp_target_uuid.uuid); + + lmsg_size = lustre_msg_size(0, NULL); + req.rq_reqbuf_len = sizeof(*hdr) + lmsg_size + + ptlrpcs_est_req_payload(cred->pc_sec, lmsg_size); + + OBD_ALLOC(req.rq_reqbuf, req.rq_reqbuf_len); + if (!req.rq_reqbuf) { + CERROR("Fail to alloc reqbuf, cancel anyway\n"); + atomic_dec(&cred->pc_refcount); + EXIT; + return; + } + + /* wire hdr */ + hdr = buf_to_sec_hdr(req.rq_reqbuf); + hdr->flavor = cpu_to_le32(PTLRPC_SEC_GSS); + hdr->sectype = cpu_to_le32(PTLRPC_SEC_TYPE_AUTH); + hdr->msg_len = cpu_to_le32(lmsg_size); + hdr->sec_len = cpu_to_le32(0); + + /* lustre message */ + lmsg = 
buf_to_lustre_msg(req.rq_reqbuf); + lustre_init_msg(lmsg, 0, NULL, NULL); + lmsg->handle = imp->imp_remote_handle; + lmsg->type = PTL_RPC_MSG_REQUEST; + lmsg->opc = SEC_FINI; + lmsg->flags = 0; + lmsg->conn_cnt = imp->imp_conn_cnt; + /* add this for randomize */ + get_random_bytes(&lmsg->last_xid, sizeof(lmsg->last_xid)); + get_random_bytes(&lmsg->transno, sizeof(lmsg->transno)); + + vp = (__u32 *) req.rq_reqbuf; + + req.rq_cred = cred; + req.rq_reqmsg = buf_to_lustre_msg(req.rq_reqbuf); + req.rq_reqlen = lmsg_size; + req.rq_reqdata_len = sizeof(*hdr) + lmsg_size; + + if (gss_cred_sign(cred, &req)) { + CERROR("failed to sign, cancel anyway\n"); + atomic_dec(&cred->pc_refcount); + goto exit; + } + atomic_dec(&cred->pc_refcount); + + /* send out */ + gss_send_secfini_rpc(imp, req.rq_reqbuf, req.rq_reqdata_len); +exit: + OBD_FREE(req.rq_reqbuf, req.rq_reqbuf_len); + EXIT; +} + +static void gss_cred_destroy(struct ptlrpc_cred *cred) +{ + struct gss_cred *gcred; + ENTRY; + + LASSERT(cred); + LASSERT(!atomic_read(&cred->pc_refcount)); + + gcred = container_of(cred, struct gss_cred, gc_base); + if (gcred->gc_ctx) { + destroy_gss_context(cred); + gss_put_ctx(gcred->gc_ctx); + } + + CDEBUG(D_SEC, "GSS_SEC: destroy cred %p\n", gcred); + + OBD_FREE(gcred, sizeof(*gcred)); + EXIT; +} + +static struct ptlrpc_credops gss_credops = { + .refresh = gss_cred_refresh, + .match = gss_cred_match, + .sign = gss_cred_sign, + .verify = gss_cred_verify, + .seal = gss_cred_seal, + .unseal = gss_cred_unseal, + .destroy = gss_cred_destroy, +}; + +#ifdef __KERNEL__ +/******************************************* + * rpc_pipe APIs * + *******************************************/ +static ssize_t +gss_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, + char *dst, size_t buflen) +{ + char *data = (char *)msg->data + msg->copied; + ssize_t mlen = msg->len; + ssize_t left; + ENTRY; + + if (mlen > buflen) + mlen = buflen; + left = copy_to_user(dst, data, mlen); + if (left < 0) { + msg->errno = 
left; + RETURN(left); + } + mlen -= left; + msg->copied += mlen; + msg->errno = 0; + RETURN(mlen); +} + +static ssize_t +gss_pipe_downcall(struct file *filp, const char *src, size_t mlen) +{ + char *buf; + const int bufsize = 1024; + rawobj_t obj; + struct inode *inode = filp->f_dentry->d_inode; + struct rpc_inode *rpci = RPC_I(inode); + struct obd_import *import; + struct ptlrpc_sec *sec; + struct gss_sec *gsec; + char *obdname; + struct gss_api_mech *mech; + struct vfs_cred vcred = { 0 }; + struct ptlrpc_cred *cred; + struct gss_upcall_msg *gss_msg; + struct gss_cl_ctx *ctx = NULL; + __u32 dest_ip; + ssize_t left; + int err, gss_err; + ENTRY; + + if (mlen > bufsize) { + CERROR("mlen %ld > bufsize %d\n", (long)mlen, bufsize); + RETURN(-ENOSPC); + } + + OBD_ALLOC(buf, bufsize); + if (!buf) { + CERROR("alloc mem failed\n"); + RETURN(-ENOMEM); + } + + left = copy_from_user(buf, src, mlen); + if (left) + GOTO(err_free, err = -EFAULT); + + obj.data = buf; + obj.len = mlen; + + LASSERT(rpci->private); + gsec = (struct gss_sec *)rpci->private; + sec = &gsec->gs_base; + LASSERT(sec->ps_import); + import = class_import_get(sec->ps_import); + LASSERT(import->imp_obd); + obdname = import->imp_obd->obd_name; + mech = gsec->gs_mech; + + err = gss_parse_init_downcall(mech, &obj, &ctx, &vcred, &dest_ip, + &gss_err); + if (err) { + CERROR("parse downcall err %d\n", err); + GOTO(err, err); + } + cred = ptlrpcs_cred_lookup(sec, &vcred); + if (!cred) { + CWARN("didn't find cred\n"); + GOTO(err, err); + } + if (gss_err) { + CERROR("got gss err %d, set cred %p dead\n", gss_err, cred); + cred->pc_flags |= PTLRPC_CRED_DEAD; + } else { + CDEBUG(D_SEC, "get initial ctx:\n"); + gss_cred_set_ctx(cred, ctx); + } + + spin_lock(&gsec->gs_lock); + gss_msg = gss_find_upcall(gsec, obdname, vcred.vc_uid, dest_ip); + if (gss_msg) { + gss_unhash_msg_nolock(gss_msg); + spin_unlock(&gsec->gs_lock); + gss_release_msg(gss_msg); + } else + spin_unlock(&gsec->gs_lock); + + ptlrpcs_cred_put(cred, 1); + 
class_import_put(import); + OBD_FREE(buf, bufsize); + RETURN(mlen); +err: + if (ctx) + gss_destroy_ctx(ctx); + class_import_put(import); +err_free: + OBD_FREE(buf, bufsize); + CDEBUG(D_SEC, "gss_pipe_downcall returning %d\n", err); + RETURN(err); +} + +static +void gss_pipe_destroy_msg(struct rpc_pipe_msg *msg) +{ + struct gss_upcall_msg *gmsg; + static unsigned long ratelimit; + ENTRY; + + if (msg->errno >= 0) { + EXIT; + return; + } + + gmsg = container_of(msg, struct gss_upcall_msg, gum_base); + CDEBUG(D_SEC, "destroy gmsg %p\n", gmsg); + atomic_inc(&gmsg->gum_refcount); + gss_unhash_msg(gmsg); + if (msg->errno == -ETIMEDOUT || msg->errno == -EPIPE) { + unsigned long now = get_seconds(); + if (time_after(now, ratelimit)) { + CWARN("GSS_SEC upcall timed out.\n" + "Please check user daemon is running!\n"); + ratelimit = now + 15; + } + } + gss_release_msg(gmsg); + EXIT; +} + +static +void gss_pipe_release(struct inode *inode) +{ + struct rpc_inode *rpci = RPC_I(inode); + struct ptlrpc_sec *sec; + struct gss_sec *gsec; + ENTRY; + + gsec = (struct gss_sec *)rpci->private; + sec = &gsec->gs_base; + spin_lock(&gsec->gs_lock); + while (!list_empty(&gsec->gs_upcalls)) { + struct gss_upcall_msg *gmsg; + + gmsg = list_entry(gsec->gs_upcalls.next, + struct gss_upcall_msg, gum_list); + gmsg->gum_base.errno = -EPIPE; + atomic_inc(&gmsg->gum_refcount); + gss_unhash_msg_nolock(gmsg); + gss_release_msg(gmsg); + } + spin_unlock(&gsec->gs_lock); + EXIT; +} + +static struct rpc_pipe_ops gss_upcall_ops = { + .upcall = gss_pipe_upcall, + .downcall = gss_pipe_downcall, + .destroy_msg = gss_pipe_destroy_msg, + .release_pipe = gss_pipe_release, +}; +#endif /* __KERNEL__ */ + +/********************************************* + * GSS security APIs * + *********************************************/ + +static +struct ptlrpc_sec* gss_create_sec(ptlrpcs_flavor_t *flavor, + const char *pipe_dir, + void *pipe_data) +{ + struct gss_sec *gsec; + struct ptlrpc_sec *sec; + char *pos; + ENTRY; + + 
LASSERT(flavor->flavor == PTLRPC_SEC_GSS); + + OBD_ALLOC(gsec, sizeof(*gsec)); + if (!gsec) { + CERROR("can't alloc gsec\n"); + RETURN(NULL); + } + + gsec->gs_mech = kgss_subflavor_to_mech(flavor->subflavor); + if (!gsec->gs_mech) { + CERROR("subflavor %d not found\n", flavor->subflavor); + goto err_free; + } + + /* initialize gss sec */ +#ifdef __KERNEL__ + INIT_LIST_HEAD(&gsec->gs_upcalls); + spin_lock_init(&gsec->gs_lock); + + snprintf(gsec->gs_pipepath, sizeof(gsec->gs_pipepath), + LUSTRE_PIPEDIR"/%s", pipe_dir); + if (IS_ERR(rpc_mkdir(gsec->gs_pipepath, NULL))) { + CERROR("can't make pipedir %s\n", gsec->gs_pipepath); + goto err_mech_put; + } + + snprintf(gsec->gs_pipepath, sizeof(gsec->gs_pipepath), + LUSTRE_PIPEDIR"/%s/%s", pipe_dir, gsec->gs_mech->gm_name); + gsec->gs_depipe = rpc_mkpipe(gsec->gs_pipepath, gsec, + &gss_upcall_ops, RPC_PIPE_WAIT_FOR_OPEN); + if (IS_ERR(gsec->gs_depipe)) { + CERROR("failed to make rpc_pipe %s: %ld\n", + gsec->gs_pipepath, PTR_ERR(gsec->gs_depipe)); + goto err_rmdir; + } + CDEBUG(D_SEC, "gss sec %p, pipe path %s\n", gsec, gsec->gs_pipepath); +#endif + + sec = &gsec->gs_base; + + switch (flavor->subflavor) { + case PTLRPC_SEC_GSS_KRB5I: + sec->ps_sectype = PTLRPC_SEC_TYPE_AUTH; + break; + case PTLRPC_SEC_GSS_KRB5P: + sec->ps_sectype = PTLRPC_SEC_TYPE_PRIV; + break; + default: + LBUG(); + } + + sec->ps_expire = GSS_CREDCACHE_EXPIRE; + sec->ps_nextgc = get_seconds() + sec->ps_expire; + sec->ps_flags = 0; + + CDEBUG(D_SEC, "Create GSS security instance at %p(external %p)\n", + gsec, sec); + RETURN(sec); + +#ifdef __KERNEL__ +err_rmdir: + pos = strrchr(gsec->gs_pipepath, '/'); + LASSERT(pos); + *pos = 0; + rpc_rmdir(gsec->gs_pipepath); +err_mech_put: +#endif + kgss_mech_put(gsec->gs_mech); +err_free: + OBD_FREE(gsec, sizeof(*gsec)); + RETURN(NULL); +} + +static +void gss_destroy_sec(struct ptlrpc_sec *sec) +{ + struct gss_sec *gsec; + char *pos; + ENTRY; + + gsec = container_of(sec, struct gss_sec, gs_base); + CDEBUG(D_SEC, 
"Destroy GSS security instance at %p\n", gsec); + + LASSERT(gsec->gs_mech); + LASSERT(!atomic_read(&sec->ps_refcount)); + LASSERT(!atomic_read(&sec->ps_credcount)); +#ifdef __KERNEL__ + rpc_unlink(gsec->gs_pipepath); + pos = strrchr(gsec->gs_pipepath, '/'); + LASSERT(pos); + *pos = 0; + rpc_rmdir(gsec->gs_pipepath); +#endif + + kgss_mech_put(gsec->gs_mech); + OBD_FREE(gsec, sizeof(*gsec)); + EXIT; +} + +static +struct ptlrpc_cred * gss_create_cred(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + struct vfs_cred *vcred) +{ + struct gss_cred *gcred; + struct ptlrpc_cred *cred; + ENTRY; + + OBD_ALLOC(gcred, sizeof(*gcred)); + if (!gcred) + RETURN(NULL); + + cred = &gcred->gc_base; + INIT_LIST_HEAD(&cred->pc_hash); + atomic_set(&cred->pc_refcount, 0); + cred->pc_sec = sec; + cred->pc_ops = &gss_credops; + cred->pc_req = req; + cred->pc_expire = get_seconds() + GSS_CRED_EXPIRE; + cred->pc_flags = 0; + cred->pc_pag = vcred->vc_pag; + cred->pc_uid = vcred->vc_uid; + CDEBUG(D_SEC, "create a gss cred at %p("LPU64"/%u)\n", + cred, vcred->vc_pag, vcred->vc_uid); + + RETURN(cred); +} + +static int gss_estimate_payload(struct ptlrpc_sec *sec, int msgsize) +{ + switch (sec->ps_sectype) { + case PTLRPC_SEC_TYPE_AUTH: + return GSS_MAX_AUTH_PAYLOAD; + case PTLRPC_SEC_TYPE_PRIV: + return size_round16(GSS_MAX_AUTH_PAYLOAD + msgsize + + GSS_PRIVBUF_PREFIX_LEN + + GSS_PRIVBUF_SUFFIX_LEN); + default: + LBUG(); + return 0; + } +} + +static int gss_alloc_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int lmsg_size) +{ + int msg_payload, sec_payload; + int privacy, rc; + ENTRY; + + /* In PRIVACY mode, lustre message is always 0 (already encoded into + * security payload). + */ + privacy = sec->ps_sectype == PTLRPC_SEC_TYPE_PRIV; + msg_payload = privacy ? 
0 : lmsg_size; + sec_payload = gss_estimate_payload(sec, lmsg_size); + + rc = sec_alloc_reqbuf(sec, req, msg_payload, sec_payload); + if (rc) + return rc; + + if (privacy) { + int buflen = lmsg_size + GSS_PRIVBUF_PREFIX_LEN + + GSS_PRIVBUF_SUFFIX_LEN; + char *buf; + + OBD_ALLOC(buf, buflen); + if (!buf) { + CERROR("Fail to alloc %d\n", buflen); + sec_free_reqbuf(sec, req); + RETURN(-ENOMEM); + } + req->rq_reqmsg = (struct lustre_msg *) + (buf + GSS_PRIVBUF_PREFIX_LEN); + } + + RETURN(0); +} + +static void gss_free_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + char *buf; + int privacy; + ENTRY; + + LASSERT(req->rq_reqmsg); + LASSERT(req->rq_reqlen); + + privacy = sec->ps_sectype == PTLRPC_SEC_TYPE_PRIV; + if (privacy) { + buf = (char *) req->rq_reqmsg - GSS_PRIVBUF_PREFIX_LEN; + LASSERT(buf < req->rq_reqbuf || + buf >= req->rq_reqbuf + req->rq_reqbuf_len); + OBD_FREE(buf, req->rq_reqlen + GSS_PRIVBUF_PREFIX_LEN + + GSS_PRIVBUF_SUFFIX_LEN); + req->rq_reqmsg = NULL; + } + + sec_free_reqbuf(sec, req); +} + +static struct ptlrpc_secops gss_secops = { + .create_sec = gss_create_sec, + .destroy_sec = gss_destroy_sec, + .create_cred = gss_create_cred, + .est_req_payload = gss_estimate_payload, + .est_rep_payload = gss_estimate_payload, + .alloc_reqbuf = gss_alloc_reqbuf, + .free_reqbuf = gss_free_reqbuf, +}; + +static struct ptlrpc_sec_type gss_type = { + .pst_owner = THIS_MODULE, + .pst_name = "GSS_SEC", + .pst_inst = ATOMIC_INIT(0), + .pst_flavor = {PTLRPC_SEC_GSS, 0}, + .pst_ops = &gss_secops, +}; + +extern int +(*lustre_secinit_downcall_handler)(char *buffer, unsigned long count); + +int __init ptlrpcs_gss_init(void) +{ + int rc; + + rc = ptlrpcs_register(&gss_type); + if (rc) + return rc; + +#ifdef __KERNEL__ + gss_svc_init(); + + rc = PTR_ERR(rpc_mkdir(LUSTRE_PIPEDIR, NULL)); + if (IS_ERR((void *)rc) && rc != -EEXIST) { + CERROR("fail to make rpcpipedir for lustre\n"); + gss_svc_exit(); + ptlrpcs_unregister(&gss_type); + return -1; + } + rc = 0; 
+#else +#endif + rc = init_kerberos_module(); + if (rc) { + ptlrpcs_unregister(&gss_type); + } + + lustre_secinit_downcall_handler = gss_send_secinit_rpc; + + return rc; +} + +static void __exit ptlrpcs_gss_exit(void) +{ + lustre_secinit_downcall_handler = NULL; + + cleanup_kerberos_module(); +#ifndef __KERNEL__ +#else + rpc_rmdir(LUSTRE_PIPEDIR); + gss_svc_exit(); +#endif + ptlrpcs_unregister(&gss_type); +} + +MODULE_AUTHOR("Cluster File Systems, Inc. "); +MODULE_DESCRIPTION("GSS Security module for Lustre"); +MODULE_LICENSE("GPL"); + +module_init(ptlrpcs_gss_init); +module_exit(ptlrpcs_gss_exit); diff --git a/lustre/sec/gss/svcsec_gss.c b/lustre/sec/gss/svcsec_gss.c new file mode 100644 index 0000000..1ac060e --- /dev/null +++ b/lustre/sec/gss/svcsec_gss.c @@ -0,0 +1,1534 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Modifications for Lustre + * Copyright 2004, Cluster File Systems, Inc. + * All rights reserved + * Author: Eric Mei + */ + +/* + * Neil Brown + * J. Bruce Fields + * Andy Adamson + * Dug Song + * + * RPCSEC_GSS server authentication. + * This implements RPCSEC_GSS as defined in rfc2203 (rpcsec_gss) and rfc2078 + * (gssapi) + * + * The RPCSEC_GSS involves three stages: + * 1/ context creation + * 2/ data exchange + * 3/ context destruction + * + * Context creation is handled largely by upcalls to user-space. + * In particular, GSS_Accept_sec_context is handled by an upcall + * Data exchange is handled entirely within the kernel + * In particular, GSS_GetMIC, GSS_VerifyMIC, GSS_Seal, GSS_Unseal are in-kernel. + * Context destruction is handled in-kernel + * GSS_Delete_sec_context is in-kernel + * + * Context creation is initiated by a RPCSEC_GSS_INIT request arriving. + * The context handle and gss_token are used as a key into the rpcsec_init cache. 
+ * The content of this cache includes some of the outputs of GSS_Accept_sec_context, + * being major_status, minor_status, context_handle, reply_token. + * These are sent back to the client. + * Sequence window management is handled by the kernel. The window size if currently + * a compile time constant. + * + * When user-space is happy that a context is established, it places an entry + * in the rpcsec_context cache. The key for this cache is the context_handle. + * The content includes: + * uid/gidlist - for determining access rights + * mechanism type + * mechanism specific information, such as a key + * + */ + +#define DEBUG_SUBSYSTEM S_SEC +#ifdef __KERNEL__ +#include +#include +#include +#include +#include +#else +#include +#endif + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gss_err.h" +#include "gss_internal.h" +#include "gss_api.h" + +static inline unsigned long hash_mem(char *buf, int length, int bits) +{ + unsigned long hash = 0; + unsigned long l = 0; + int len = 0; + unsigned char c; + do { + if (len == length) { + c = (char)len; len = -1; + } else + c = *buf++; + l = (l << 8) | c; + len++; + if ((len & (BITS_PER_LONG/8-1))==0) + hash = hash_long(hash^l, BITS_PER_LONG); + } while (len); + return hash >> (BITS_PER_LONG - bits); +} + +/* The rpcsec_init cache is used for mapping RPCSEC_GSS_{,CONT_}INIT requests + * into replies. + * + * Key is context handle (\x if empty) and gss_token. + * Content is major_status minor_status (integers) context_handle, reply_token. 
+ * + */ + +#define RSI_HASHBITS 6 +#define RSI_HASHMAX (1<in_handle); + rawobj_free(&rsii->in_token); + rawobj_free(&rsii->out_handle); + rawobj_free(&rsii->out_token); +} + +static void rsi_put(struct cache_head *item, struct cache_detail *cd) +{ + struct rsi *rsii = container_of(item, struct rsi, h); + if (cache_put(item, cd)) { + rsi_free(rsii); + OBD_FREE(rsii, sizeof(*rsii)); + } +} + +static inline int rsi_hash(struct rsi *item) +{ + return hash_mem(item->in_handle.data, item->in_handle.len, RSI_HASHBITS) + ^ hash_mem(item->in_token.data, item->in_token.len, RSI_HASHBITS); +} + +static inline int rsi_match(struct rsi *item, struct rsi *tmp) +{ + return (rawobj_equal(&item->in_handle, &tmp->in_handle) && + rawobj_equal(&item->in_token, &tmp->in_token)); +} + +static void rsi_request(struct cache_detail *cd, + struct cache_head *h, + char **bpp, int *blen) +{ + struct rsi *rsii = container_of(h, struct rsi, h); + + qword_addhex(bpp, blen, rsii->in_handle.data, rsii->in_handle.len); + qword_addhex(bpp, blen, rsii->in_token.data, rsii->in_token.len); + (*bpp)[-1] = '\n'; +} + +static int +gssd_reply(struct rsi *item) +{ + struct rsi *tmp; + struct cache_head **hp, **head; + ENTRY; + + head = &rsi_cache.hash_table[rsi_hash(item)]; + write_lock(&rsi_cache.hash_lock); + for (hp = head; *hp != NULL; hp = &tmp->h.next) { + tmp = container_of(*hp, struct rsi, h); + if (rsi_match(tmp, item)) { + cache_get(&tmp->h); + clear_bit(CACHE_HASHED, &tmp->h.flags); + *hp = tmp->h.next; + tmp->h.next = NULL; + rsi_cache.entries--; + if (test_bit(CACHE_VALID, &tmp->h.flags)) { + write_unlock(&rsi_cache.hash_lock); + rsi_put(&tmp->h, &rsi_cache); + RETURN(-EINVAL); + } + set_bit(CACHE_HASHED, &item->h.flags); + item->h.next = *hp; + *hp = &item->h; + rsi_cache.entries++; + set_bit(CACHE_VALID, &item->h.flags); + item->h.last_refresh = get_seconds(); + write_unlock(&rsi_cache.hash_lock); + cache_fresh(&rsi_cache, &tmp->h, 0); + rsi_put(&tmp->h, &rsi_cache); + RETURN(0); + } + } + 
write_unlock(&rsi_cache.hash_lock); + RETURN(-EINVAL); +} + +/* XXX + * here we just wait here for its completion or timedout. it's a + * hacking but works, and we'll comeup with real fix if we decided + * to still stick with NFS4 cache code + */ +static struct rsi * +gssd_upcall(struct rsi *item, struct cache_req *chandle) +{ + struct rsi *tmp; + struct cache_head **hp, **head; + unsigned long starttime; + ENTRY; + + head = &rsi_cache.hash_table[rsi_hash(item)]; + read_lock(&rsi_cache.hash_lock); + for (hp = head; *hp != NULL; hp = &tmp->h.next) { + tmp = container_of(*hp, struct rsi, h); + if (rsi_match(tmp, item)) { + LBUG(); + if (!test_bit(CACHE_VALID, &tmp->h.flags)) { + CERROR("found rsi without VALID\n"); + read_unlock(&rsi_cache.hash_lock); + return NULL; + } + *hp = tmp->h.next; + tmp->h.next = NULL; + rsi_cache.entries--; + cache_get(&tmp->h); + read_unlock(&rsi_cache.hash_lock); + return tmp; + } + } + // cache_get(&item->h); + set_bit(CACHE_HASHED, &item->h.flags); + item->h.next = *head; + *head = &item->h; + rsi_cache.entries++; + read_unlock(&rsi_cache.hash_lock); + cache_get(&item->h); + + cache_check(&rsi_cache, &item->h, chandle); + starttime = get_seconds(); + do { + yield(); + read_lock(&rsi_cache.hash_lock); + for (hp = head; *hp != NULL; hp = &tmp->h.next) { + tmp = container_of(*hp, struct rsi, h); + if (tmp == item) + continue; + if (rsi_match(tmp, item)) { + if (!test_bit(CACHE_VALID, &tmp->h.flags)) { + read_unlock(&rsi_cache.hash_lock); + return NULL; + } + cache_get(&tmp->h); + clear_bit(CACHE_HASHED, &tmp->h.flags); + *hp = tmp->h.next; + tmp->h.next = NULL; + rsi_cache.entries--; + read_unlock(&rsi_cache.hash_lock); + return tmp; + } + } + read_unlock(&rsi_cache.hash_lock); + } while ((get_seconds() - starttime) <= 5); + CERROR("5s timeout while waiting cache refill\n"); + return NULL; +} + +static int rsi_parse(struct cache_detail *cd, + char *mesg, int mlen) +{ + /* context token expiry major minor context token */ + char *buf = 
mesg; + char *ep; + int len; + struct rsi *rsii; + time_t expiry; + int status = -EINVAL; + ENTRY; + + OBD_ALLOC(rsii, sizeof(*rsii)); + if (!rsii) { + CERROR("failed to alloc rsii\n"); + RETURN(-ENOMEM); + } + cache_init(&rsii->h); + + /* handle */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + status = -ENOMEM; + if (rawobj_alloc(&rsii->in_handle, buf, len)) + goto out; + + /* token */ + len = qword_get(&mesg, buf, mlen); + status = -EINVAL; + if (len < 0) + goto out;; + status = -ENOMEM; + if (rawobj_alloc(&rsii->in_token, buf, len)) + goto out; + + /* expiry */ + expiry = get_expiry(&mesg); + status = -EINVAL; + if (expiry == 0) + goto out; + + /* major/minor */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + if (len == 0) { + goto out; + } else { + rsii->major_status = simple_strtoul(buf, &ep, 10); + if (*ep) + goto out; + len = qword_get(&mesg, buf, mlen); + if (len <= 0) + goto out; + rsii->minor_status = simple_strtoul(buf, &ep, 10); + if (*ep) + goto out; + + /* out_handle */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + status = -ENOMEM; + if (rawobj_alloc(&rsii->out_handle, buf, len)) + goto out; + + /* out_token */ + len = qword_get(&mesg, buf, mlen); + status = -EINVAL; + if (len < 0) + goto out; + status = -ENOMEM; + if (rawobj_alloc(&rsii->out_token, buf, len)) + goto out; + } + rsii->h.expiry_time = expiry; + status = gssd_reply(rsii); +out: + if (rsii) + rsi_put(&rsii->h, &rsi_cache); + RETURN(status); +} + +static struct cache_detail rsi_cache = { + .hash_size = RSI_HASHMAX, + .hash_table = rsi_table, + .name = "auth.ptlrpcs.init", + .cache_put = rsi_put, + .cache_request = rsi_request, + .cache_parse = rsi_parse, +}; + +/* + * The rpcsec_context cache is used to store a context that is + * used in data exchange. + * The key is a context handle. 
The content is: + * uid, gidlist, mechanism, service-set, mech-specific-data + */ + +#define RSC_HASHBITS 10 +#define RSC_HASHMAX (1<handle); + if (rsci->mechctx) + kgss_delete_sec_context(&rsci->mechctx); +#if 0 + if (rsci->cred.vc_ginfo) + put_group_info(rsci->cred.vc_ginfo); +#endif +} + +static void rsc_put(struct cache_head *item, struct cache_detail *cd) +{ + struct rsc *rsci = container_of(item, struct rsc, h); + + if (cache_put(item, cd)) { + rsc_free(rsci); + OBD_FREE(rsci, sizeof(*rsci)); + } +} + +static inline int +rsc_hash(struct rsc *rsci) +{ + return hash_mem(rsci->handle.data, rsci->handle.len, RSC_HASHBITS); +} + +static inline int +rsc_match(struct rsc *new, struct rsc *tmp) +{ + return rawobj_equal(&new->handle, &tmp->handle); +} + +static struct rsc *rsc_lookup(struct rsc *item, int set) +{ + struct rsc *tmp = NULL; + struct cache_head **hp, **head; + head = &rsc_cache.hash_table[rsc_hash(item)]; + ENTRY; + + if (set) + write_lock(&rsc_cache.hash_lock); + else + read_lock(&rsc_cache.hash_lock); + for (hp = head; *hp != NULL; hp = &tmp->h.next) { + tmp = container_of(*hp, struct rsc, h); + if (!rsc_match(tmp, item)) + continue; + cache_get(&tmp->h); + if (!set) { + goto out_noset; + } + *hp = tmp->h.next; + tmp->h.next = NULL; + clear_bit(CACHE_HASHED, &tmp->h.flags); + rsc_put(&tmp->h, &rsc_cache); + goto out_set; + } + /* Didn't find anything */ + if (!set) + goto out_noset; + rsc_cache.entries++; +out_set: + set_bit(CACHE_HASHED, &item->h.flags); + item->h.next = *head; + *head = &item->h; + write_unlock(&rsc_cache.hash_lock); + cache_fresh(&rsc_cache, &item->h, item->h.expiry_time); + cache_get(&item->h); + RETURN(item); +out_noset: + read_unlock(&rsc_cache.hash_lock); + RETURN(tmp); +} + +static int rsc_parse(struct cache_detail *cd, + char *mesg, int mlen) +{ + /* contexthandle expiry [ uid gid N mechname ...mechdata... 
] */ + char *buf = mesg; + int len, rv; + struct rsc *rsci, *res = NULL; + time_t expiry; + int status = -EINVAL; + + OBD_ALLOC(rsci, sizeof(*rsci)); + if (!rsci) { + CERROR("fail to alloc rsci\n"); + return -ENOMEM; + } + cache_init(&rsci->h); + + /* context handle */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) goto out; + status = -ENOMEM; + if (rawobj_alloc(&rsci->handle, buf, len)) + goto out; + + /* expiry */ + expiry = get_expiry(&mesg); + status = -EINVAL; + if (expiry == 0) + goto out; + + /* remote flag */ + rv = get_int(&mesg, &rsci->remote); + if (rv) { + CERROR("fail to get remote flag\n"); + goto out; + } + + /* uid, or NEGATIVE */ + rv = get_int(&mesg, &rsci->cred.vc_uid); + if (rv == -EINVAL) + goto out; + if (rv == -ENOENT) + set_bit(CACHE_NEGATIVE, &rsci->h.flags); + else { + int N, i; + struct gss_api_mech *gm; + rawobj_t tmp_buf; + __u64 ctx_expiry; + + /* gid */ + if (get_int(&mesg, &rsci->cred.vc_gid)) + goto out; + + /* number of additional gid's */ + if (get_int(&mesg, &N)) + goto out; + status = -ENOMEM; +#if 0 + rsci->cred.vc_ginfo = groups_alloc(N); + if (rsci->cred.vc_ginfo == NULL) + goto out; +#endif + + /* gid's */ + status = -EINVAL; + for (i=0; icred.vc_ginfo, i) = gid; +#endif + } + + /* mech name */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + gm = kgss_name_to_mech(buf); + status = -EOPNOTSUPP; + if (!gm) + goto out; + + status = -EINVAL; + /* mech-specific data: */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) { + kgss_mech_put(gm); + goto out; + } + tmp_buf.len = len; + tmp_buf.data = buf; + if (kgss_import_sec_context(&tmp_buf, gm, &rsci->mechctx)) { + kgss_mech_put(gm); + goto out; + } + + /* currently the expiry time passed down from user-space + * is invalid, here we retrive it from mech. 
+ */ + if (kgss_inquire_context(rsci->mechctx, &ctx_expiry)) { + CERROR("unable to get expire time, drop it\n"); + set_bit(CACHE_NEGATIVE, &rsci->h.flags); + kgss_mech_put(gm); + goto out; + } + expiry = (time_t) ctx_expiry; + + kgss_mech_put(gm); + } + rsci->h.expiry_time = expiry; + spin_lock_init(&rsci->seqdata.sd_lock); + res = rsc_lookup(rsci, 1); + rsc_put(&res->h, &rsc_cache); + status = 0; +out: + if (rsci) + rsc_put(&rsci->h, &rsc_cache); + return status; +} + +/* + * flush all entries with @uid. @uid == -1 will match all. + * we only know the uid, maybe netid/nid in the future, in all cases + * we must search the whole cache + */ +static void rsc_flush(uid_t uid) +{ + struct cache_head **ch; + struct rsc *rscp; + int n; + ENTRY; + + write_lock(&rsc_cache.hash_lock); + for (n = 0; n < RSC_HASHMAX; n++) { + for (ch = &rsc_cache.hash_table[n]; *ch;) { + rscp = container_of(*ch, struct rsc, h); + if (uid == -1 || rscp->cred.vc_uid == uid) { + /* it seems simply set NEGATIVE doesn't work */ + *ch = (*ch)->next; + rscp->h.next = NULL; + cache_get(&rscp->h); + set_bit(CACHE_NEGATIVE, &rscp->h.flags); + clear_bit(CACHE_HASHED, &rscp->h.flags); + CWARN("flush rsc %p for uid %u\n", + rscp, rscp->cred.vc_uid); + rsc_put(&rscp->h, &rsc_cache); + rsc_cache.entries--; + continue; + } + ch = &((*ch)->next); + } + } + write_unlock(&rsc_cache.hash_lock); + EXIT; +} + +static struct cache_detail rsc_cache = { + .hash_size = RSC_HASHMAX, + .hash_table = rsc_table, + .name = "auth.ptlrpcs.context", + .cache_put = rsc_put, + .cache_parse = rsc_parse, +}; + +static struct rsc * +gss_svc_searchbyctx(rawobj_t *handle) +{ + struct rsc rsci; + struct rsc *found; + + rsci.handle = *handle; + found = rsc_lookup(&rsci, 0); + if (!found) + return NULL; + + if (cache_check(&rsc_cache, &found->h, NULL)) + return NULL; + + return found; +} + +struct gss_svc_data { + /* decoded gss client cred: */ + struct rpc_gss_wire_cred clcred; + /* internal used status */ + unsigned int is_init:1, + 
is_init_continue:1, + is_err_notify:1, + is_fini:1; + int reserve_len; +}; + +/* FIXME + * again hacking: only try to give the svcgssd a chance to handle + * upcalls. + */ +struct cache_deferred_req* my_defer(struct cache_req *req) +{ + yield(); + return NULL; +} +static struct cache_req my_chandle = {my_defer}; + +/* Implements sequence number algorithm as specified in RFC 2203. */ +static int +gss_check_seq_num(struct gss_svc_seq_data *sd, __u32 seq_num) +{ + int rc = 0; + + spin_lock(&sd->sd_lock); + if (seq_num > sd->sd_max) { + if (seq_num >= sd->sd_max + GSS_SEQ_WIN) { + memset(sd->sd_win, 0, sizeof(sd->sd_win)); + sd->sd_max = seq_num; + } else { + while(sd->sd_max < seq_num) { + sd->sd_max++; + __clear_bit(sd->sd_max % GSS_SEQ_WIN, + sd->sd_win); + } + } + __set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win); + goto exit; + } else if (seq_num + GSS_SEQ_WIN <= sd->sd_max) { + rc = 1; + goto exit; + } + + if (__test_and_set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win)) + rc = 1; +exit: + spin_unlock(&sd->sd_lock); + return rc; +} + +static int +gss_svc_verify_request(struct ptlrpc_request *req, + struct rsc *rsci, + struct rpc_gss_wire_cred *gc, + __u32 *vp, __u32 vlen) +{ + struct ptlrpcs_wire_hdr *sec_hdr; + struct gss_ctx *ctx = rsci->mechctx; + __u32 maj_stat; + rawobj_t msg; + rawobj_t mic; + ENTRY; + + sec_hdr = (struct ptlrpcs_wire_hdr *) req->rq_reqbuf; + + req->rq_reqmsg = (struct lustre_msg *) (req->rq_reqbuf + sizeof(*sec_hdr)); + req->rq_reqlen = sec_hdr->msg_len; + + msg.len = sec_hdr->msg_len; + msg.data = (__u8 *)req->rq_reqmsg; + + mic.len = le32_to_cpu(*vp++); + mic.data = (char *) vp; + vlen -= 4; + + if (mic.len > vlen) { + CERROR("checksum len %d, while buffer len %d\n", + mic.len, vlen); + RETURN(GSS_S_CALL_BAD_STRUCTURE); + } + + if (mic.len > 256) { + CERROR("invalid mic len %d\n", mic.len); + RETURN(GSS_S_CALL_BAD_STRUCTURE); + } + + maj_stat = kgss_verify_mic(ctx, &msg, &mic, NULL); + if (maj_stat != GSS_S_COMPLETE) { + CERROR("MIC verification 
error: major %x\n", maj_stat); + RETURN(maj_stat); + } + + if (gss_check_seq_num(&rsci->seqdata, gc->gc_seq)) { + CERROR("discard request %p with old seq_num %u\n", + req, gc->gc_seq); + RETURN(GSS_S_DUPLICATE_TOKEN); + } + + RETURN(GSS_S_COMPLETE); +} + +static int +gss_svc_unseal_request(struct ptlrpc_request *req, + struct rsc *rsci, + struct rpc_gss_wire_cred *gc, + __u32 *vp, __u32 vlen) +{ + struct ptlrpcs_wire_hdr *sec_hdr; + struct gss_ctx *ctx = rsci->mechctx; + rawobj_t cipher_text, plain_text; + __u32 major; + ENTRY; + + sec_hdr = (struct ptlrpcs_wire_hdr *) req->rq_reqbuf; + + if (vlen < 4) { + CERROR("vlen only %u\n", vlen); + RETURN(GSS_S_CALL_BAD_STRUCTURE); + } + + cipher_text.len = le32_to_cpu(*vp++); + cipher_text.data = (__u8 *) vp; + vlen -= 4; + + if (cipher_text.len > vlen) { + CERROR("cipher claimed %u while buf only %u\n", + cipher_text.len, vlen); + RETURN(GSS_S_CALL_BAD_STRUCTURE); + } + + plain_text = cipher_text; + + major = kgss_unwrap(ctx, GSS_C_QOP_DEFAULT, &cipher_text, &plain_text); + if (major) { + CERROR("unwrap error 0x%x\n", major); + RETURN(major); + } + + if (gss_check_seq_num(&rsci->seqdata, gc->gc_seq)) { + CERROR("discard request %p with old seq_num %u\n", + req, gc->gc_seq); + RETURN(GSS_S_DUPLICATE_TOKEN); + } + + req->rq_reqmsg = (struct lustre_msg *) (vp); + req->rq_reqlen = plain_text.len; + + CDEBUG(D_SEC, "msg len %d\n", req->rq_reqlen); + + RETURN(GSS_S_COMPLETE); +} + +static int +gss_pack_err_notify(struct ptlrpc_request *req, + __u32 major, __u32 minor) +{ + struct gss_svc_data *svcdata = req->rq_sec_svcdata; + __u32 reslen, *resp, *reslenp; + char nidstr[PTL_NALFMT_SIZE]; + const __u32 secdata_len = 7 * 4; + int rc; + ENTRY; + + OBD_FAIL_RETURN(OBD_FAIL_SVCGSS_ERR_NOTIFY|OBD_FAIL_ONCE, -EINVAL); + + LASSERT(svcdata); + svcdata->is_err_notify = 1; + svcdata->reserve_len = 7 * 4; + + rc = lustre_pack_reply(req, 0, NULL, NULL); + if (rc) { + CERROR("could not pack reply, err %d\n", rc); + RETURN(rc); + } + + 
LASSERT(req->rq_reply_state); + LASSERT(req->rq_reply_state->rs_repbuf); + LASSERT(req->rq_reply_state->rs_repbuf_len >= secdata_len); + resp = (__u32 *) req->rq_reply_state->rs_repbuf; + + /* header */ + *resp++ = cpu_to_le32(PTLRPC_SEC_GSS); + *resp++ = cpu_to_le32(PTLRPC_SEC_TYPE_NONE); + *resp++ = cpu_to_le32(req->rq_replen); + reslenp = resp++; + + /* skip lustre msg */ + resp += req->rq_replen / 4; + reslen = svcdata->reserve_len; + + /* gss replay: + * version, subflavor, notify, major, minor, + * obj1(fake), obj2(fake) + */ + *resp++ = cpu_to_le32(PTLRPC_SEC_GSS_VERSION); + *resp++ = cpu_to_le32(PTLRPC_SEC_GSS_KRB5I); + *resp++ = cpu_to_le32(PTLRPC_GSS_PROC_ERR); + *resp++ = cpu_to_le32(major); + *resp++ = cpu_to_le32(minor); + *resp++ = 0; + *resp++ = 0; + reslen -= (4 * 4); + /* the actual sec data length */ + *reslenp = cpu_to_le32(secdata_len); + + req->rq_reply_state->rs_repdata_len += (secdata_len); + CWARN("prepare gss error notify(0x%x/0x%x) to %s\n", major, minor, + portals_nid2str(req->rq_peer.peer_ni->pni_number, + req->rq_peer.peer_id.nid, nidstr)); + RETURN(0); +} + +static int +gss_svcsec_handle_init(struct ptlrpc_request *req, + struct rpc_gss_wire_cred *gc, + __u32 *secdata, __u32 seclen, + enum ptlrpcs_error *res) +{ + struct gss_svc_data *svcdata = req->rq_sec_svcdata; + struct rsc *rsci; + struct rsi *rsikey, *rsip; + rawobj_t tmpobj; + __u32 reslen, *resp, *reslenp; + char nidstr[PTL_NALFMT_SIZE]; + int rc; + ENTRY; + + LASSERT(svcdata); + + CWARN("processing gss init(%d) request from %s\n", gc->gc_proc, + portals_nid2str(req->rq_peer.peer_ni->pni_number, + req->rq_peer.peer_id.nid, nidstr)); + + *res = PTLRPCS_BADCRED; + OBD_FAIL_RETURN(OBD_FAIL_SVCGSS_INIT_REQ|OBD_FAIL_ONCE, SVC_DROP); + + if (gc->gc_proc == RPC_GSS_PROC_INIT && + gc->gc_ctx.len != 0) { + CERROR("proc %d, ctx_len %d: not really init?\n", + gc->gc_proc == RPC_GSS_PROC_INIT, gc->gc_ctx.len); + RETURN(SVC_DROP); + } + + OBD_ALLOC(rsikey, sizeof(*rsikey)); + if (!rsikey) { 
+ CERROR("out of memory\n"); + RETURN(SVC_DROP); + } + cache_init(&rsikey->h); + + if (rawobj_dup(&rsikey->in_handle, &gc->gc_ctx)) { + CERROR("fail to dup context handle\n"); + GOTO(out_rsikey, rc = SVC_DROP); + } + *res = PTLRPCS_BADVERF; + if (rawobj_extract(&tmpobj, &secdata, &seclen)) { + CERROR("can't extract token\n"); + GOTO(out_rsikey, rc = SVC_DROP); + } + if (rawobj_dup(&rsikey->in_token, &tmpobj)) { + CERROR("can't duplicate token\n"); + GOTO(out_rsikey, rc = SVC_DROP); + } + + rsip = gssd_upcall(rsikey, &my_chandle); + if (!rsip) { + CERROR("error in gssd_upcall.\n"); + GOTO(out_rsikey, rc = SVC_DROP); + } + + rsci = gss_svc_searchbyctx(&rsip->out_handle); + if (!rsci) { + CERROR("rsci still not mature yet?\n"); + GOTO(out_rsip, rc = SVC_DROP); + } + CWARN("svcsec create gss context %p(%u@%s)\n", + rsci, rsci->cred.vc_uid, + portals_nid2str(req->rq_peer.peer_ni->pni_number, + req->rq_peer.peer_id.nid, nidstr)); + + svcdata->is_init = 1; + svcdata->reserve_len = 6 * 4 + + size_round4(rsip->out_handle.len) + + size_round4(rsip->out_token.len); + + rc = lustre_pack_reply(req, 0, NULL, NULL); + if (rc) { + CERROR("failed to pack reply, rc = %d\n", rc); + GOTO(out, rc = SVC_DROP); + } + + /* header */ + resp = (__u32 *) req->rq_reply_state->rs_repbuf; + *resp++ = cpu_to_le32(PTLRPC_SEC_GSS); + *resp++ = cpu_to_le32(PTLRPC_SEC_TYPE_NONE); + *resp++ = cpu_to_le32(req->rq_replen); + reslenp = resp++; + + resp += req->rq_replen / 4; + reslen = svcdata->reserve_len; + + /* gss reply: + * status, major, minor, seq, out_handle, out_token + */ + *resp++ = cpu_to_le32(PTLRPCS_OK); + *resp++ = cpu_to_le32(rsip->major_status); + *resp++ = cpu_to_le32(rsip->minor_status); + *resp++ = cpu_to_le32(GSS_SEQ_WIN); + reslen -= (4 * 4); + if (rawobj_serialize(&rsip->out_handle, + &resp, &reslen)) + LBUG(); + if (rawobj_serialize(&rsip->out_token, + &resp, &reslen)) + LBUG(); + /* the actual sec data length */ + *reslenp = cpu_to_le32(svcdata->reserve_len - reslen); + + 
req->rq_reply_state->rs_repdata_len += le32_to_cpu(*reslenp); + CDEBUG(D_SEC, "req %p: msgsize %d, authsize %d, " + "total size %d\n", req, req->rq_replen, + le32_to_cpu(*reslenp), + req->rq_reply_state->rs_repdata_len); + + *res = PTLRPCS_OK; + + /* This is simplified since right now we doesn't support + * INIT_CONTINUE yet. + */ + if (gc->gc_proc == RPC_GSS_PROC_INIT) { + struct ptlrpcs_wire_hdr *hdr; + + hdr = buf_to_sec_hdr(req->rq_reqbuf); + req->rq_reqmsg = buf_to_lustre_msg(req->rq_reqbuf); + req->rq_reqlen = hdr->msg_len; + + rc = SVC_LOGIN; + } else + rc = SVC_COMPLETE; + +out: + rsc_put(&rsci->h, &rsc_cache); +out_rsip: + rsi_put(&rsip->h, &rsi_cache); +out_rsikey: + rsi_put(&rsikey->h, &rsi_cache); + + RETURN(rc); +} + +static int +gss_svcsec_handle_data(struct ptlrpc_request *req, + struct rpc_gss_wire_cred *gc, + __u32 *secdata, __u32 seclen, + enum ptlrpcs_error *res) +{ + struct rsc *rsci; + char nidstr[PTL_NALFMT_SIZE]; + __u32 major; + int rc; + ENTRY; + + *res = PTLRPCS_GSS_CREDPROBLEM; + + rsci = gss_svc_searchbyctx(&gc->gc_ctx); + if (!rsci) { + CWARN("Invalid gss context handle from %s\n", + portals_nid2str(req->rq_peer.peer_ni->pni_number, + req->rq_peer.peer_id.nid, nidstr)); + major = GSS_S_NO_CONTEXT; + goto notify_err; + } + + switch (gc->gc_svc) { + case PTLRPC_GSS_SVC_INTEGRITY: + major = gss_svc_verify_request(req, rsci, gc, secdata, seclen); + if (major == GSS_S_COMPLETE) + break; + + CWARN("fail in verify:0x%x: ctx %p@%s\n", major, rsci, + portals_nid2str(req->rq_peer.peer_ni->pni_number, + req->rq_peer.peer_id.nid, nidstr)); + goto notify_err; + case PTLRPC_GSS_SVC_PRIVACY: + major = gss_svc_unseal_request(req, rsci, gc, secdata, seclen); + if (major == GSS_S_COMPLETE) + break; + + CWARN("fail in decrypt:0x%x: ctx %p@%s\n", major, rsci, + portals_nid2str(req->rq_peer.peer_ni->pni_number, + req->rq_peer.peer_id.nid, nidstr)); + goto notify_err; + default: + CERROR("unsupported gss service %d\n", gc->gc_svc); + GOTO(out, rc = 
SVC_DROP); + } + + req->rq_auth_uid = rsci->cred.vc_uid; + req->rq_remote = rsci->remote; + + *res = PTLRPCS_OK; + GOTO(out, rc = SVC_OK); + +notify_err: + if (gss_pack_err_notify(req, major, 0)) + rc = SVC_DROP; + else + rc = SVC_COMPLETE; +out: + if (rsci) + rsc_put(&rsci->h, &rsc_cache); + RETURN(rc); +} + +static int +gss_svcsec_handle_destroy(struct ptlrpc_request *req, + struct rpc_gss_wire_cred *gc, + __u32 *secdata, __u32 seclen, + enum ptlrpcs_error *res) +{ + struct gss_svc_data *svcdata = req->rq_sec_svcdata; + struct rsc *rsci; + char nidstr[PTL_NALFMT_SIZE]; + int rc; + ENTRY; + + LASSERT(svcdata); + *res = PTLRPCS_GSS_CREDPROBLEM; + + rsci = gss_svc_searchbyctx(&gc->gc_ctx); + if (!rsci) { + CWARN("invalid gss context handle for destroy.\n"); + RETURN(SVC_DROP); + } + + if (gc->gc_svc != PTLRPC_GSS_SVC_INTEGRITY) { + CERROR("service %d is not supported in destroy.\n", + gc->gc_svc); + GOTO(out, rc = SVC_DROP); + } + + *res = gss_svc_verify_request(req, rsci, gc, secdata, seclen); + if (*res) + GOTO(out, rc = SVC_DROP); + + /* compose reply, which is actually nothing */ + svcdata->is_fini = 1; + if (lustre_pack_reply(req, 0, NULL, NULL)) + GOTO(out, rc = SVC_DROP); + + CWARN("svcsec destroy gss context %p(%u@%s)\n", + rsci, rsci->cred.vc_uid, + portals_nid2str(req->rq_peer.peer_ni->pni_number, + req->rq_peer.peer_id.nid, nidstr)); + + set_bit(CACHE_NEGATIVE, &rsci->h.flags); + *res = PTLRPCS_OK; + rc = SVC_LOGOUT; +out: + rsc_put(&rsci->h, &rsc_cache); + RETURN(rc); +} + +/* + * let incomming request go through security check: + * o context establishment: invoke user space helper + * o data exchange: verify/decrypt + * o context destruction: mark context invalid + * + * in most cases, error will result to drop the packet silently. 
+ */ +static int +gss_svcsec_accept(struct ptlrpc_request *req, enum ptlrpcs_error *res) +{ + struct gss_svc_data *svcdata; + struct rpc_gss_wire_cred *gc; + struct ptlrpcs_wire_hdr *sec_hdr; + __u32 seclen, *secdata, version, subflavor; + int rc; + ENTRY; + + CDEBUG(D_SEC, "request %p\n", req); + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf_len); + + *res = PTLRPCS_BADCRED; + + sec_hdr = buf_to_sec_hdr(req->rq_reqbuf); + LASSERT(sec_hdr->flavor == PTLRPC_SEC_GSS); + + seclen = req->rq_reqbuf_len - sizeof(*sec_hdr) - sec_hdr->msg_len; + secdata = (__u32 *) buf_to_sec_data(req->rq_reqbuf); + + if (sec_hdr->sec_len > seclen) { + CERROR("seclen %d, while max buf %d\n", + sec_hdr->sec_len, seclen); + RETURN(SVC_DROP); + } + + if (seclen < 6 * 4) { + CERROR("sec size %d too small\n", seclen); + RETURN(SVC_DROP); + } + + LASSERT(!req->rq_sec_svcdata); + OBD_ALLOC(svcdata, sizeof(*svcdata)); + if (!svcdata) { + CERROR("fail to alloc svcdata\n"); + RETURN(SVC_DROP); + } + req->rq_sec_svcdata = svcdata; + gc = &svcdata->clcred; + + /* Now secdata/seclen is what we want to parse + */ + version = le32_to_cpu(*secdata++); /* version */ + subflavor = le32_to_cpu(*secdata++); /* subflavor */ + gc->gc_proc = le32_to_cpu(*secdata++); /* proc */ + gc->gc_seq = le32_to_cpu(*secdata++); /* seq */ + gc->gc_svc = le32_to_cpu(*secdata++); /* service */ + seclen -= 5 * 4; + + CDEBUG(D_SEC, "wire gss_hdr: %u/%u/%u/%u/%u\n", + version, subflavor, gc->gc_proc, gc->gc_seq, gc->gc_svc); + + if (version != PTLRPC_SEC_GSS_VERSION) { + CERROR("gss version mismatch: %d - %d\n", + version, PTLRPC_SEC_GSS_VERSION); + GOTO(err_free, rc = SVC_DROP); + } + + if (rawobj_extract(&gc->gc_ctx, &secdata, &seclen)) { + CERROR("fail to obtain gss context handle\n"); + GOTO(err_free, rc = SVC_DROP); + } + + *res = PTLRPCS_BADVERF; + switch(gc->gc_proc) { + case RPC_GSS_PROC_INIT: + case RPC_GSS_PROC_CONTINUE_INIT: + rc = gss_svcsec_handle_init(req, gc, secdata, seclen, res); + break; + case 
RPC_GSS_PROC_DATA: + rc = gss_svcsec_handle_data(req, gc, secdata, seclen, res); + break; + case RPC_GSS_PROC_DESTROY: + rc = gss_svcsec_handle_destroy(req, gc, secdata, seclen, res); + break; + default: + rc = SVC_DROP; + LBUG(); + } + +err_free: + if (rc == SVC_DROP && req->rq_sec_svcdata) { + OBD_FREE(req->rq_sec_svcdata, sizeof(struct gss_svc_data)); + req->rq_sec_svcdata = NULL; + } + + RETURN(rc); +} + +static int +gss_svcsec_authorize(struct ptlrpc_request *req) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct gss_svc_data *gsd = (struct gss_svc_data *)req->rq_sec_svcdata; + struct rpc_gss_wire_cred *gc = &gsd->clcred; + struct rsc *rscp; + struct ptlrpcs_wire_hdr *sec_hdr; + rawobj_buf_t msg_buf; + rawobj_t cipher_buf; + __u32 *vp, *vpsave, major, vlen, seclen; + rawobj_t lmsg, mic; + int ret; + ENTRY; + + LASSERT(rs); + LASSERT(rs->rs_repbuf); + LASSERT(gsd); + + if (gsd->is_init || gsd->is_init_continue || + gsd->is_err_notify || gsd->is_fini) { + /* nothing to do in these cases */ + CDEBUG(D_SEC, "req %p: init/fini/err\n", req); + RETURN(0); + } + + if (gc->gc_proc != RPC_GSS_PROC_DATA) { + CERROR("proc %d not support\n", gc->gc_proc); + RETURN(-EINVAL); + } + + rscp = gss_svc_searchbyctx(&gc->gc_ctx); + if (!rscp) { + CERROR("ctx disapeared under us?\n"); + RETURN(-EINVAL); + } + + sec_hdr = (struct ptlrpcs_wire_hdr *) rs->rs_repbuf; + switch (gc->gc_svc) { + case PTLRPC_GSS_SVC_INTEGRITY: + /* prepare various pointers */ + lmsg.len = req->rq_replen; + lmsg.data = (__u8 *) (rs->rs_repbuf + sizeof(*sec_hdr)); + vp = (__u32 *) (lmsg.data + lmsg.len); + vlen = rs->rs_repbuf_len - sizeof(*sec_hdr) - lmsg.len; + seclen = vlen; + + sec_hdr->flavor = cpu_to_le32(PTLRPC_SEC_GSS); + sec_hdr->sectype = cpu_to_le32(PTLRPC_SEC_TYPE_AUTH); + sec_hdr->msg_len = cpu_to_le32(req->rq_replen); + + /* standard gss hdr */ + LASSERT(vlen >= 7 * 4); + *vp++ = cpu_to_le32(PTLRPC_SEC_GSS_VERSION); + *vp++ = cpu_to_le32(PTLRPC_SEC_GSS_KRB5I); + *vp++ = 
cpu_to_le32(RPC_GSS_PROC_DATA); + *vp++ = cpu_to_le32(gc->gc_seq); + *vp++ = cpu_to_le32(PTLRPC_GSS_SVC_INTEGRITY); + *vp++ = 0; /* fake ctx handle */ + vpsave = vp++; /* reserve size */ + vlen -= 7 * 4; + + mic.len = vlen; + mic.data = (char *) vp; + + major = kgss_get_mic(rscp->mechctx, 0, &lmsg, &mic); + if (major) { + CERROR("fail to get MIC: 0x%x\n", major); + GOTO(out, ret = -EINVAL); + } + *vpsave = cpu_to_le32(mic.len); + seclen = seclen - vlen + mic.len; + sec_hdr->sec_len = cpu_to_le32(seclen); + rs->rs_repdata_len += size_round(seclen); + break; + case PTLRPC_GSS_SVC_PRIVACY: + vp = (__u32 *) (rs->rs_repbuf + sizeof(*sec_hdr)); + vlen = rs->rs_repbuf_len - sizeof(*sec_hdr); + seclen = vlen; + + sec_hdr->flavor = cpu_to_le32(PTLRPC_SEC_GSS); + sec_hdr->sectype = cpu_to_le32(PTLRPC_SEC_TYPE_PRIV); + sec_hdr->msg_len = cpu_to_le32(0); + + /* standard gss hdr */ + LASSERT(vlen >= 7 * 4); + *vp++ = cpu_to_le32(PTLRPC_SEC_GSS_VERSION); + *vp++ = cpu_to_le32(PTLRPC_SEC_GSS_KRB5I); + *vp++ = cpu_to_le32(RPC_GSS_PROC_DATA); + *vp++ = cpu_to_le32(gc->gc_seq); + *vp++ = cpu_to_le32(PTLRPC_GSS_SVC_PRIVACY); + *vp++ = 0; /* fake ctx handle */ + vpsave = vp++; /* reserve size */ + vlen -= 7 * 4; + + msg_buf.buf = (__u8 *) rs->rs_msg - GSS_PRIVBUF_PREFIX_LEN; + msg_buf.buflen = req->rq_replen + GSS_PRIVBUF_PREFIX_LEN + + GSS_PRIVBUF_SUFFIX_LEN; + msg_buf.dataoff = GSS_PRIVBUF_PREFIX_LEN; + msg_buf.datalen = req->rq_replen; + + cipher_buf.data = (__u8 *) vp; + cipher_buf.len = vlen; + + major = kgss_wrap(rscp->mechctx, GSS_C_QOP_DEFAULT, + &msg_buf, &cipher_buf); + if (major) { + CERROR("failed to wrap: 0x%x\n", major); + GOTO(out, ret = -EINVAL); + } + + *vpsave = cpu_to_le32(cipher_buf.len); + seclen = seclen - vlen + cipher_buf.len; + sec_hdr->sec_len = cpu_to_le32(seclen); + rs->rs_repdata_len += size_round(seclen); + break; + default: + CERROR("Unknown service %d\n", gc->gc_svc); + GOTO(out, ret = -EINVAL); + } + ret = 0; +out: + rsc_put(&rscp->h, &rsc_cache); + + 
RETURN(ret); +} + +static +void gss_svcsec_cleanup_req(struct ptlrpc_svcsec *svcsec, + struct ptlrpc_request *req) +{ + struct gss_svc_data *gsd = (struct gss_svc_data *) req->rq_sec_svcdata; + + if (!gsd) { + CDEBUG(D_SEC, "no svc_data present. do nothing\n"); + return; + } + + /* gsd->clclred.gc_ctx is NOT allocated, just set pointer + * to the incoming packet buffer, so don't need free it + */ + OBD_FREE(gsd, sizeof(*gsd)); + req->rq_sec_svcdata = NULL; + return; +} + +static +int gss_svcsec_est_payload(struct ptlrpc_svcsec *svcsec, + struct ptlrpc_request *req, + int msgsize) +{ + struct gss_svc_data *svcdata = req->rq_sec_svcdata; + ENTRY; + + /* just return the pre-set reserve_len for init/fini/err cases. + */ + LASSERT(svcdata); + if (svcdata->is_init) { + CDEBUG(D_SEC, "is_init, reserver size %d(%d)\n", + size_round(svcdata->reserve_len), + svcdata->reserve_len); + LASSERT(svcdata->reserve_len); + LASSERT(svcdata->reserve_len % 4 == 0); + RETURN(size_round(svcdata->reserve_len)); + } else if (svcdata->is_err_notify) { + CDEBUG(D_SEC, "is_err_notify, reserver size %d(%d)\n", + size_round(svcdata->reserve_len), + svcdata->reserve_len); + RETURN(size_round(svcdata->reserve_len)); + } else if (svcdata->is_fini) { + CDEBUG(D_SEC, "is_fini, reserver size 0\n"); + RETURN(0); + } else { + if (svcdata->clcred.gc_svc == PTLRPC_GSS_SVC_NONE || + svcdata->clcred.gc_svc == PTLRPC_GSS_SVC_INTEGRITY) + RETURN(size_round(GSS_MAX_AUTH_PAYLOAD)); + else if (svcdata->clcred.gc_svc == PTLRPC_GSS_SVC_PRIVACY) + RETURN(size_round16(GSS_MAX_AUTH_PAYLOAD + msgsize + + GSS_PRIVBUF_PREFIX_LEN + + GSS_PRIVBUF_SUFFIX_LEN)); + else { + CERROR("unknown gss svc %u\n", svcdata->clcred.gc_svc); + *((int *)0) = 0; + LBUG(); + } + } + RETURN(0); +} + +int gss_svcsec_alloc_repbuf(struct ptlrpc_svcsec *svcsec, + struct ptlrpc_request *req, + int msgsize) +{ + struct gss_svc_data *gsd = (struct gss_svc_data *) req->rq_sec_svcdata; + struct ptlrpc_reply_state *rs; + int msg_payload, sec_payload; 
+ int privacy, rc; + ENTRY; + + /* determine the security type: none/auth or priv, we have + * different pack scheme for them. + * init/fini/err will always be treated as none/auth. + */ + LASSERT(gsd); + if (!gsd->is_init && !gsd->is_init_continue && + !gsd->is_fini && !gsd->is_err_notify && + gsd->clcred.gc_svc == PTLRPC_GSS_SVC_PRIVACY) + privacy = 1; + else + privacy = 0; + + msg_payload = privacy ? 0 : msgsize; + sec_payload = gss_svcsec_est_payload(svcsec, req, msgsize); + + rc = svcsec_alloc_reply_state(req, msg_payload, sec_payload); + if (rc) + RETURN(rc); + + rs = req->rq_reply_state; + LASSERT(rs); + rs->rs_msg_len = msgsize; + + if (privacy) { + /* we can choose to let msg simply point to the rear of the + * buffer, which lead to buffer overlap when doing encryption. + * usually it's ok and it indeed passed all existing tests. + * but not sure if there will be subtle problems in the future. + * so right now we choose to alloc another new buffer. we'll + * see how it works. + */ +#if 0 + rs->rs_msg = (struct lustre_msg *) + (rs->rs_repbuf + rs->rs_repbuf_len - + msgsize - GSS_PRIVBUF_SUFFIX_LEN); +#endif + char *msgbuf; + + msgsize += GSS_PRIVBUF_PREFIX_LEN + GSS_PRIVBUF_SUFFIX_LEN; + OBD_ALLOC(msgbuf, msgsize); + if (!msgbuf) { + CERROR("can't alloc %d\n", msgsize); + svcsec_free_reply_state(rs); + req->rq_reply_state = NULL; + RETURN(-ENOMEM); + } + rs->rs_msg = (struct lustre_msg *) + (msgbuf + GSS_PRIVBUF_PREFIX_LEN); + } + + req->rq_repmsg = rs->rs_msg; + + RETURN(0); +} + +static +void gss_svcsec_free_repbuf(struct ptlrpc_svcsec *svcsec, + struct ptlrpc_reply_state *rs) +{ + unsigned long p1 = (unsigned long) rs->rs_msg; + unsigned long p2 = (unsigned long) rs->rs_buf; + + LASSERT(rs->rs_buf); + LASSERT(rs->rs_msg); + LASSERT(rs->rs_msg_len); + + if (p1 < p2 || p1 >= p2 + rs->rs_buf_len) { + char *start = (char*) rs->rs_msg - GSS_PRIVBUF_PREFIX_LEN; + int size = rs->rs_msg_len + GSS_PRIVBUF_PREFIX_LEN + + GSS_PRIVBUF_SUFFIX_LEN; + OBD_FREE(start, 
size); + } + + svcsec_free_reply_state(rs); +} + +struct ptlrpc_svcsec svcsec_gss = { + .pss_owner = THIS_MODULE, + .pss_name = "GSS_SVCSEC", + .pss_flavor = {PTLRPC_SEC_GSS, 0}, + .accept = gss_svcsec_accept, + .authorize = gss_svcsec_authorize, + .alloc_repbuf = gss_svcsec_alloc_repbuf, + .free_repbuf = gss_svcsec_free_repbuf, + .cleanup_req = gss_svcsec_cleanup_req, +}; + +/* XXX hacking */ +void lgss_svc_cache_purge_all(void) +{ + cache_purge(&rsi_cache); + cache_purge(&rsc_cache); +} +EXPORT_SYMBOL(lgss_svc_cache_purge_all); + +void lgss_svc_cache_flush(__u32 uid) +{ + rsc_flush(uid); +} +EXPORT_SYMBOL(lgss_svc_cache_flush); + +int gss_svc_init(void) +{ + int rc; + + rc = svcsec_register(&svcsec_gss); + if (!rc) { + cache_register(&rsc_cache); + cache_register(&rsi_cache); + } + return rc; +} + +void gss_svc_exit(void) +{ + int rc; + if ((rc = cache_unregister(&rsi_cache))) + CERROR("unregister rsi cache: %d\n", rc); + if ((rc = cache_unregister(&rsc_cache))) + CERROR("unregister rsc cache: %d\n", rc); + if ((rc = svcsec_unregister(&svcsec_gss))) + CERROR("unregister svcsec_gss: %d\n", rc); +} diff --git a/lustre/sec/sec.c b/lustre/sec/sec.c new file mode 100644 index 0000000..9dd5d4f --- /dev/null +++ b/lustre/sec/sec.c @@ -0,0 +1,932 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif +#define DEBUG_SUBSYSTEM S_SEC +#ifdef __KERNEL__ +#include +#include +#include +#else +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +static spinlock_t sectypes_lock = SPIN_LOCK_UNLOCKED; +static struct ptlrpc_sec_type *sectypes[PTLRPC_SEC_MAX_FLAVORS] = { + NULL, +}; + +int ptlrpcs_register(struct ptlrpc_sec_type *type) +{ + __u32 flavor = type->pst_flavor.flavor; + + LASSERT(type->pst_name); + LASSERT(type->pst_ops); + + if (flavor >= PTLRPC_SEC_MAX_FLAVORS) + return -EINVAL; + + spin_lock(§ypes_lock); + if (sectypes[flavor]) { + spin_unlock(§ypes_lock); + return -EALREADY; + } + sectypes[flavor] = type; + atomic_set(&type->pst_inst, 0); + spin_unlock(§ypes_lock); + + CWARN("Security module %s registered\n", type->pst_name); + return 0; +} + +int ptlrpcs_unregister(struct ptlrpc_sec_type *type) +{ + __u32 flavor = type->pst_flavor.flavor; + + if (flavor >= PTLRPC_SEC_MAX_FLAVORS) + return -EINVAL; + + spin_lock(§ypes_lock); + if (!sectypes[flavor]) { + spin_unlock(§ypes_lock); + return -EINVAL; + } + + if (sectypes[flavor] != type) { + CERROR("invalid unregister\n"); + return -EINVAL; + } + + if (atomic_read(&type->pst_inst)) { + CERROR("sec module %s still have instance %d\n", + type->pst_name, atomic_read(&type->pst_inst)); + spin_unlock(§ypes_lock); + return -EINVAL; + } + + CDEBUG(D_SEC, "Security module %s unregistered\n", type->pst_name); + sectypes[flavor] = NULL; + spin_unlock(§ypes_lock); + + return 0; +} + +static +struct ptlrpc_sec_type * ptlrpcs_flavor2type(ptlrpcs_flavor_t *flavor) +{ + struct ptlrpc_sec_type *type; + __u32 major = flavor->flavor; + + if (major >= PTLRPC_SEC_MAX_FLAVORS) + return NULL; + + spin_lock(§ypes_lock); + type = 
sectypes[major]; + if (type && !try_module_get(type->pst_owner)) + type = NULL; + spin_unlock(§ypes_lock); + return type; +} + +static inline +void ptlrpcs_type_put(struct ptlrpc_sec_type *type) +{ + module_put(type->pst_owner); +} + +/*********************************************** + * credential cache helpers * + ***********************************************/ + +void ptlrpcs_init_credcache(struct ptlrpc_sec *sec) +{ + int i; + for (i = 0; i < PTLRPC_CREDCACHE_NR; i++) + INIT_LIST_HEAD(&sec->ps_credcache[i]); + sec->ps_nextgc = get_seconds() + (sec->ps_expire >> 1); +} + +static void ptlrpcs_cred_destroy(struct ptlrpc_cred *cred) +{ + struct ptlrpc_sec *sec = cred->pc_sec; + + LASSERT(cred->pc_sec); + LASSERT(atomic_read(&cred->pc_refcount) == 0); + LASSERT(list_empty(&cred->pc_hash)); + + cred->pc_ops->destroy(cred); + atomic_dec(&sec->ps_credcount); +} + +static void ptlrpcs_destroy_credlist(struct list_head *head) +{ + struct ptlrpc_cred *cred; + + while (!list_empty(head)) { + cred = list_entry(head->next, struct ptlrpc_cred, pc_hash); + list_del_init(&cred->pc_hash); + ptlrpcs_cred_destroy(cred); + } +} + +static +int ptlrpcs_cred_unlink_expired(struct ptlrpc_cred *cred, + struct list_head *freelist) +{ + LASSERT(cred->pc_sec); + + if (atomic_read(&cred->pc_refcount) != 0) + return 0; + if (time_after(cred->pc_expire, get_seconds())) + return 0; + + list_del(&cred->pc_hash); + list_add(&cred->pc_hash, freelist); + CDEBUG(D_SEC, "put cred %p into freelist\n", cred); + return 1; +} + +static +void ptlrpcs_credcache_gc(struct ptlrpc_sec *sec, + struct list_head *freelist) +{ + struct ptlrpc_cred *cred, *n; + int i; + ENTRY; + + CDEBUG(D_SEC, "do gc on sec %s\n", sec->ps_type->pst_name); + for (i = 0; i < PTLRPC_CREDCACHE_NR; i++) { + list_for_each_entry_safe(cred, n, &sec->ps_credcache[i], + pc_hash) { + ptlrpcs_cred_unlink_expired(cred, freelist); + } + } + sec->ps_nextgc = get_seconds() + sec->ps_expire; + EXIT; +} + +static +int 
ptlrpcs_flush_credcache(struct ptlrpc_sec *sec, int force) +{ + struct ptlrpc_cred *cred, *n; + LIST_HEAD(freelist); + int i, busy = 0; + ENTRY; + + spin_lock(&sec->ps_lock); + for (i = 0; i < PTLRPC_CREDCACHE_NR; i++) { + list_for_each_entry_safe(cred, n, &sec->ps_credcache[i], + pc_hash) { + LASSERT(atomic_read(&cred->pc_refcount) >= 0); + if (atomic_read(&cred->pc_refcount)) { + busy = 1; + if (!force) + continue; + list_del_init(&cred->pc_hash); + } else + list_move(&cred->pc_hash, &freelist); + + /* don't remove CRED_UPTODATE flag here */ + cred->pc_flags |= PTLRPC_CRED_DEAD; + } + } + spin_unlock(&sec->ps_lock); + ptlrpcs_destroy_credlist(&freelist); + RETURN(busy); +} + +/************************************************** + * credential APIs * + **************************************************/ + +static inline +int ptlrpcs_cred_get_hash(__u64 pag) +{ + LASSERT((pag & PTLRPC_CREDCACHE_MASK) < PTLRPC_CREDCACHE_NR); + return (pag & PTLRPC_CREDCACHE_MASK); +} + +static +struct ptlrpc_cred * cred_cache_lookup(struct ptlrpc_sec *sec, + struct vfs_cred *vcred, + struct ptlrpc_request *req, + int create) +{ + struct ptlrpc_cred *cred, *new = NULL, *n; + LIST_HEAD(freelist); + int hash, found = 0; + ENTRY; + + hash = ptlrpcs_cred_get_hash(vcred->vc_pag); + +retry: + spin_lock(&sec->ps_lock); + /* do gc if expired */ + if (time_after(get_seconds(), sec->ps_nextgc)) + ptlrpcs_credcache_gc(sec, &freelist); + + list_for_each_entry_safe(cred, n, &sec->ps_credcache[hash], pc_hash) { + if (cred->pc_flags & PTLRPC_CRED_DEAD) + continue; + if (ptlrpcs_cred_unlink_expired(cred, &freelist)) + continue; + if (cred->pc_ops->match(cred, req, vcred)) { + found = 1; + break; + } + } + + if (found) { + if (new && new != cred) { + /* lost the race, just free it */ + list_add(&new->pc_hash, &freelist); + } + list_move(&cred->pc_hash, &sec->ps_credcache[hash]); + } else { + if (new) { + list_add(&new->pc_hash, &sec->ps_credcache[hash]); + cred = new; + } else if (create) { + 
spin_unlock(&sec->ps_lock); + new = sec->ps_type->pst_ops->create_cred(sec, req, vcred); + if (new) { + atomic_inc(&sec->ps_credcount); + goto retry; + } + } else + cred = NULL; + } + + /* hold a ref */ + if (cred) + atomic_inc(&cred->pc_refcount); + + spin_unlock(&sec->ps_lock); + + ptlrpcs_destroy_credlist(&freelist); + RETURN(cred); +} + +struct ptlrpc_cred * ptlrpcs_cred_lookup(struct ptlrpc_sec *sec, + struct vfs_cred *vcred) +{ + struct ptlrpc_cred *cred; + ENTRY; + + cred = cred_cache_lookup(sec, vcred, NULL, 0); + RETURN(cred); +} + +int ptlrpcs_req_get_cred(struct ptlrpc_request *req) +{ + struct obd_import *imp = req->rq_import; + struct vfs_cred vcred; + ENTRY; + + LASSERT(!req->rq_cred); + LASSERT(imp); + LASSERT(imp->imp_sec); + + /* XXX + * for now we simply let PAG == real uid + */ + vcred.vc_pag = (__u64) current->uid; + vcred.vc_uid = current->uid; + + req->rq_cred = cred_cache_lookup(imp->imp_sec, &vcred, req, 1); + + if (!req->rq_cred) { + CERROR("req %p: fail to get cred from cache\n", req); + RETURN(-ENOMEM); + } + + RETURN(0); +} + +static void ptlrpcs_sec_destroy(struct ptlrpc_sec *sec); + +void ptlrpcs_cred_put(struct ptlrpc_cred *cred, int sync) +{ + struct ptlrpc_sec *sec = cred->pc_sec; + + LASSERT(cred); + LASSERT(sec); + LASSERT(atomic_read(&cred->pc_refcount)); + + spin_lock(&sec->ps_lock); + if (atomic_dec_and_test(&cred->pc_refcount) && + sync && cred->pc_flags & PTLRPC_CRED_DEAD) { + list_del_init(&cred->pc_hash); + ptlrpcs_cred_destroy(cred); + if (!atomic_read(&sec->ps_credcount) && + !atomic_read(&sec->ps_refcount)) { + CWARN("put last cred on a dead sec %p(%s), " + "also destroy the sec\n", sec, + sec->ps_type->pst_name); + spin_unlock(&sec->ps_lock); + + ptlrpcs_sec_destroy(sec); + return; + } + } + spin_unlock(&sec->ps_lock); +} + +void ptlrpcs_req_drop_cred(struct ptlrpc_request *req) +{ + ENTRY; + + LASSERT(req); + LASSERT(req->rq_cred); + + if (req->rq_cred) { + /* We'd like to not use 'sync' mode, but might cause + * some 
cred leak. Need more thinking here. FIXME + */ + ptlrpcs_cred_put(req->rq_cred, 1); + req->rq_cred = NULL; + } else + CDEBUG(D_SEC, "req %p have no cred\n", req); + EXIT; +} + +/* + * request must have a cred. if failed to get new cred, + * just restore the old one + */ +int ptlrpcs_req_replace_dead_cred(struct ptlrpc_request *req) +{ + struct ptlrpc_cred *cred = req->rq_cred; + int rc; + ENTRY; + + LASSERT(cred); + LASSERT(cred->pc_flags & PTLRPC_CRED_DEAD); + + ptlrpcs_cred_get(cred); + ptlrpcs_req_drop_cred(req); + LASSERT(!req->rq_cred); + rc = ptlrpcs_req_get_cred(req); + if (!rc) { + LASSERT(req->rq_cred); + LASSERT(req->rq_cred != cred); + ptlrpcs_cred_put(cred, 1); + } else { + LASSERT(!req->rq_cred); + req->rq_cred = cred; + } + RETURN(rc); +} + +int ptlrpcs_req_refresh_cred(struct ptlrpc_request *req) +{ + struct ptlrpc_cred *cred = req->rq_cred; + int rc; + ENTRY; + + LASSERT(cred); + + if ((cred->pc_flags & (PTLRPC_CRED_UPTODATE | PTLRPC_CRED_DEAD)) == + PTLRPC_CRED_UPTODATE) + RETURN(0); + + if (cred->pc_flags & PTLRPC_CRED_DEAD) { + rc = ptlrpcs_req_replace_dead_cred(req); + if (!rc) { + LASSERT(cred != req->rq_cred); + CWARN("req %p: replace cred %p => %p\n", + req, cred, req->rq_cred); + cred = req->rq_cred; + } else { + LASSERT(cred == req->rq_cred); + CERROR("req %p: failed to replace dead cred %p\n", + req, cred); + RETURN(-ENOMEM); + } + } + + rc = ptlrpcs_cred_refresh(cred); + if (!(cred->pc_flags & PTLRPC_CRED_UPTODATE)) { + CERROR("req %p: failed to refresh cred %p, rc %d\n", + req, cred, rc); + if (!rc) + rc = -EACCES; + } + RETURN(rc); +} + +int ptlrpcs_cli_wrap_request(struct ptlrpc_request *req) +{ + struct ptlrpc_cred *cred; + int rc; + ENTRY; + + LASSERT(req->rq_cred); + LASSERT(req->rq_cred->pc_sec); + LASSERT(req->rq_cred->pc_ops); + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf_len); + + rc = ptlrpcs_req_refresh_cred(req); + if (rc) + RETURN(rc); + + CDEBUG(D_SEC, "wrap req %p\n", req); + cred = req->rq_cred; + + switch 
(cred->pc_sec->ps_sectype) { + case PTLRPC_SEC_TYPE_NONE: + case PTLRPC_SEC_TYPE_AUTH: + if (req->rq_req_wrapped) { + CWARN("req %p(o%u,x"LPU64",t"LPU64") " + "already signed, resend?\n", req, + req->rq_reqmsg ? req->rq_reqmsg->opc : -1, + req->rq_xid, req->rq_transno); + req->rq_req_wrapped = 0; + req->rq_reqdata_len = sizeof(struct ptlrpcs_wire_hdr) + + req->rq_reqlen; + LASSERT(req->rq_reqdata_len % 8 == 0); + } + + LASSERT(cred->pc_ops->sign); + rc = cred->pc_ops->sign(cred, req); + if (!rc) + req->rq_req_wrapped = 1; + break; + case PTLRPC_SEC_TYPE_PRIV: + if (req->rq_req_wrapped) { + CWARN("req %p(o%u,x"LPU64",t"LPU64") " + "already encrypted, resend?\n", req, + req->rq_reqmsg ? req->rq_reqmsg->opc : -1, + req->rq_xid, req->rq_transno); + req->rq_req_wrapped = 0; + req->rq_reqdata_len = sizeof(struct ptlrpcs_wire_hdr); + LASSERT(req->rq_reqdata_len % 8 == 0); + } + + LASSERT(cred->pc_ops->seal); + rc = cred->pc_ops->seal(cred, req); + if (!rc) + req->rq_req_wrapped = 1; + break; + default: + LBUG(); + } + LASSERT(req->rq_reqdata_len); + LASSERT(req->rq_reqdata_len % 8 == 0); + LASSERT(req->rq_reqdata_len >= sizeof(struct ptlrpcs_wire_hdr)); + LASSERT(req->rq_reqdata_len <= req->rq_reqbuf_len); + + RETURN(rc); +} + +/* rq_nob_received is the actual received data length */ +int ptlrpcs_cli_unwrap_reply(struct ptlrpc_request *req) +{ + struct ptlrpc_cred *cred = req->rq_cred; + struct ptlrpc_sec *sec; + struct ptlrpcs_wire_hdr *sec_hdr; + int rc; + ENTRY; + + LASSERT(cred); + LASSERT(cred->pc_sec); + LASSERT(cred->pc_ops); + LASSERT(req->rq_repbuf); + + if (req->rq_nob_received < sizeof(*sec_hdr)) { + CERROR("req %p: reply size only %d\n", + req, req->rq_nob_received); + RETURN(-EPROTO); + } + + sec_hdr = (struct ptlrpcs_wire_hdr *) req->rq_repbuf; + sec_hdr->flavor = le32_to_cpu(sec_hdr->flavor); + sec_hdr->sectype = le32_to_cpu(sec_hdr->sectype); + sec_hdr->msg_len = le32_to_cpu(sec_hdr->msg_len); + sec_hdr->sec_len = le32_to_cpu(sec_hdr->sec_len); + + 
CDEBUG(D_SEC, "req %p, cred %p, flavor %u, sectype %u\n", + req, cred, sec_hdr->flavor, sec_hdr->sectype); + + sec = cred->pc_sec; + if (sec_hdr->flavor != sec->ps_flavor.flavor) { + CERROR("unmatched flavor %u while expect %u\n", + sec_hdr->flavor, sec->ps_flavor.flavor); + RETURN(-EPROTO); + } + + if (sizeof(*sec_hdr) + sec_hdr->msg_len + sec_hdr->sec_len > + req->rq_nob_received) { + CERROR("msg %u, sec %u, while only get %d\n", + sec_hdr->msg_len, sec_hdr->sec_len, + req->rq_nob_received); + RETURN(-EPROTO); + } + + switch (sec_hdr->sectype) { + case PTLRPC_SEC_TYPE_NONE: + case PTLRPC_SEC_TYPE_AUTH: { + LASSERT(cred->pc_ops->verify); + rc = cred->pc_ops->verify(cred, req); + LASSERT(rc || req->rq_repmsg || req->rq_ptlrpcs_restart); + break; + case PTLRPC_SEC_TYPE_PRIV: + LASSERT(cred->pc_ops->unseal); + rc = cred->pc_ops->unseal(cred, req); + LASSERT(rc || req->rq_repmsg || req->rq_ptlrpcs_restart); + break; + } + default: + rc = -1; + LBUG(); + } + RETURN(rc); +} + +/************************************************** + * security APIs * + **************************************************/ + +struct ptlrpc_sec * ptlrpcs_sec_create(ptlrpcs_flavor_t *flavor, + struct obd_import *import, + const char *pipe_dir, + void *pipe_data) +{ + struct ptlrpc_sec_type *type; + struct ptlrpc_sec *sec; + ENTRY; + + type = ptlrpcs_flavor2type(flavor); + if (!type) { + CDEBUG(D_SEC, "invalid major flavor %u\n", flavor->flavor); + RETURN(NULL); + } + + sec = type->pst_ops->create_sec(flavor, pipe_dir, pipe_data); + if (sec) { + spin_lock_init(&sec->ps_lock); + ptlrpcs_init_credcache(sec); + sec->ps_type = type; + sec->ps_flavor = *flavor; + sec->ps_import = class_import_get(import); + atomic_set(&sec->ps_refcount, 1); + atomic_set(&sec->ps_credcount, 0); + atomic_inc(&type->pst_inst); + } else + ptlrpcs_type_put(type); + + return sec; +} + +static void ptlrpcs_sec_destroy(struct ptlrpc_sec *sec) +{ + struct ptlrpc_sec_type *type = sec->ps_type; + struct ptlrpc_import *imp = 
sec->ps_import; + + LASSERT(type && type->pst_ops); + LASSERT(type->pst_ops->destroy_sec); + + type->pst_ops->destroy_sec(sec); + atomic_dec(&type->pst_inst); + ptlrpcs_type_put(type); + class_import_put(imp); +} + +void ptlrpcs_sec_put(struct ptlrpc_sec *sec) +{ + if (atomic_dec_and_test(&sec->ps_refcount)) { + ptlrpcs_flush_credcache(sec, 1); + + if (atomic_read(&sec->ps_credcount) == 0) { + ptlrpcs_sec_destroy(sec); + } else { + CWARN("sec %p(%s) is no usage while %d cred still " + "holded, destroy delayed\n", + sec, sec->ps_type->pst_name, + atomic_read(&sec->ps_credcount)); + } + } +} + +void ptlrpcs_sec_invalidate_cache(struct ptlrpc_sec *sec) +{ + ptlrpcs_flush_credcache(sec, 1); +} + +int sec_alloc_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int msgsize, int secsize) +{ + struct ptlrpcs_wire_hdr *hdr; + ENTRY; + + LASSERT(msgsize % 8 == 0); + LASSERT(secsize % 8 == 0); + + req->rq_reqbuf_len = sizeof(*hdr) + msgsize + secsize; + OBD_ALLOC(req->rq_reqbuf, req->rq_reqbuf_len); + if (!req->rq_reqbuf) { + CERROR("can't alloc %d\n", req->rq_reqbuf_len); + RETURN(-ENOMEM); + } + + hdr = buf_to_sec_hdr(req->rq_reqbuf); + hdr->flavor = cpu_to_le32(sec->ps_flavor.flavor); + hdr->sectype = cpu_to_le32(sec->ps_sectype); + hdr->msg_len = msgsize; + /* security length will be filled later */ + + /* later reqdata_len will be added on actual security payload */ + req->rq_reqdata_len = sizeof(*hdr) + msgsize; + req->rq_reqmsg = buf_to_lustre_msg(req->rq_reqbuf); + + CDEBUG(D_SEC, "req %p: rqbuf at %p, len %d, msg %d, sec %d\n", + req, req->rq_reqbuf, req->rq_reqbuf_len, + msgsize, secsize); + + RETURN(0); +} + +/* when complete successfully, req->rq_reqmsg should point to the + * right place. 
+ */ +int ptlrpcs_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize) +{ + struct ptlrpc_cred *cred = req->rq_cred; + struct ptlrpc_sec *sec; + struct ptlrpc_secops *ops; + + LASSERT(msgsize % 8 == 0); + LASSERT(sizeof(struct ptlrpcs_wire_hdr) % 8 == 0); + LASSERT(cred); + LASSERT(atomic_read(&cred->pc_refcount)); + LASSERT(cred->pc_sec); + LASSERT(cred->pc_sec->ps_type); + LASSERT(cred->pc_sec->ps_type->pst_ops); + LASSERT(req->rq_reqbuf == NULL); + LASSERT(req->rq_reqmsg == NULL); + + sec = cred->pc_sec; + ops = sec->ps_type->pst_ops; + if (ops->alloc_reqbuf) + return ops->alloc_reqbuf(sec, req, msgsize); + else + return sec_alloc_reqbuf(sec, req, msgsize, 0); +} + +void sec_free_reqbuf(struct ptlrpc_sec *sec, + struct ptlrpc_request *req) +{ + LASSERT(req->rq_reqbuf); + LASSERT(req->rq_reqbuf_len); + + /* sanity check */ + if (req->rq_reqmsg) { + LASSERT((char *) req->rq_reqmsg >= req->rq_reqbuf && + (char *) req->rq_reqmsg < req->rq_reqbuf + + req->rq_reqbuf_len); + } + + OBD_FREE(req->rq_reqbuf, req->rq_reqbuf_len); + req->rq_reqbuf = NULL; + req->rq_reqmsg = NULL; +} + +void ptlrpcs_cli_free_reqbuf(struct ptlrpc_request *req) +{ + struct ptlrpc_cred *cred = req->rq_cred; + struct ptlrpc_sec *sec; + struct ptlrpc_secops *ops; + + LASSERT(cred); + LASSERT(atomic_read(&cred->pc_refcount)); + LASSERT(cred->pc_sec); + LASSERT(cred->pc_sec->ps_type); + LASSERT(cred->pc_sec->ps_type->pst_ops); + LASSERT(req->rq_reqbuf); + + sec = cred->pc_sec; + ops = sec->ps_type->pst_ops; + if (ops->free_reqbuf) + ops->free_reqbuf(sec, req); + else + sec_free_reqbuf(sec, req); +} + +int ptlrpcs_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize) +{ + struct ptlrpc_cred *cred = req->rq_cred; + struct ptlrpc_sec *sec; + struct ptlrpc_secops *ops; + int msg_payload, sec_payload; + ENTRY; + + LASSERT(msgsize % 8 == 0); + LASSERT(sizeof(struct ptlrpcs_wire_hdr) % 8 == 0); + LASSERT(cred); + LASSERT(atomic_read(&cred->pc_refcount)); + LASSERT(cred->pc_sec); + 
LASSERT(cred->pc_sec->ps_type); + LASSERT(cred->pc_sec->ps_type->pst_ops); + LASSERT(req->rq_repbuf == NULL); + + sec = cred->pc_sec; + ops = sec->ps_type->pst_ops; + if (ops->alloc_repbuf) + RETURN(ops->alloc_repbuf(sec, req, msgsize)); + + /* default allocation scheme */ + msg_payload = sec->ps_sectype == PTLRPC_SEC_TYPE_PRIV ? 0 : msgsize; + sec_payload = size_round(ptlrpcs_est_rep_payload(sec, msgsize)); + + req->rq_repbuf_len = sizeof(struct ptlrpcs_wire_hdr) + + msg_payload + sec_payload; + OBD_ALLOC(req->rq_repbuf, req->rq_repbuf_len); + if (!req->rq_repbuf) + RETURN(-ENOMEM); + + CDEBUG(D_SEC, "req %p: repbuf at %p, len %d, msg %d, sec %d\n", + req, req->rq_repbuf, req->rq_repbuf_len, + msg_payload, sec_payload); + + RETURN(0); +} + +void ptlrpcs_cli_free_repbuf(struct ptlrpc_request *req) +{ + struct ptlrpc_cred *cred = req->rq_cred; + struct ptlrpc_sec *sec; + struct ptlrpc_secops *ops; + ENTRY; + + LASSERT(cred); + LASSERT(atomic_read(&cred->pc_refcount)); + LASSERT(cred->pc_sec); + LASSERT(cred->pc_sec->ps_type); + LASSERT(cred->pc_sec->ps_type->pst_ops); + LASSERT(req->rq_repbuf); + + sec = cred->pc_sec; + ops = sec->ps_type->pst_ops; + if (ops->free_repbuf) + ops->free_repbuf(sec, req); + else { + OBD_FREE(req->rq_repbuf, req->rq_repbuf_len); + req->rq_repbuf = NULL; + req->rq_repmsg = NULL; + } + EXIT; +} + +int ptlrpcs_import_get_sec(struct obd_import *imp) +{ + ptlrpcs_flavor_t flavor = {PTLRPC_SEC_NULL, 0}; + char *pipedir = NULL; + ENTRY; + + LASSERT(imp->imp_obd); + LASSERT(imp->imp_obd->obd_type); + + /* old sec might be still there in reconnecting */ + if (imp->imp_sec) + RETURN(0); + + /* find actual flavor for client obd. right now server side + * obd (reverse imp, etc) will simply use NULL. 
+ */ + if (!strcmp(imp->imp_obd->obd_type->typ_name, "mdc") || + !strcmp(imp->imp_obd->obd_type->typ_name, "osc")) { + struct client_obd *cli = &imp->imp_obd->u.cli; + + if (cli->cl_sec_flavor == PTLRPC_SEC_GSS) { + CWARN("select security gss/%s for %s(%s)\n", + cli->cl_sec_subflavor == PTLRPC_SEC_GSS_KRB5I ? + "krb5i" : "krb5p", + imp->imp_obd->obd_type->typ_name, + imp->imp_obd->obd_name); + flavor.flavor = cli->cl_sec_flavor; + flavor.subflavor = cli->cl_sec_subflavor; + pipedir = imp->imp_obd->obd_name; + } else if (cli->cl_sec_flavor == PTLRPC_SEC_NULL) { + CWARN("select security null for %s(%s)\n", + imp->imp_obd->obd_type->typ_name, + imp->imp_obd->obd_name); + } else { + CWARN("unknown security flavor for mdc(%s), " + "use 'null'\n", imp->imp_obd->obd_name); + } + } + + imp->imp_sec = ptlrpcs_sec_create(&flavor, imp, pipedir, imp); + if (!imp->imp_sec) + RETURN(-EINVAL); + else + RETURN(0); +} + +void ptlrpcs_import_drop_sec(struct obd_import *imp) +{ + ENTRY; + if (imp->imp_sec) { + ptlrpcs_sec_put(imp->imp_sec); + imp->imp_sec = NULL; + } + EXIT; +} + +int __init ptlrpc_sec_init(void) +{ + int rc; + + if ((rc = ptlrpcs_null_init())) + return rc; + + if ((rc = svcsec_null_init())) { + ptlrpcs_null_exit(); + return rc; + } + +#if 0 +#if !defined __KERNEL__ && defined ENABLE_GSS + ptlrpcs_gss_init(); +#endif +#endif + return 0; +} + +static void __exit ptlrpc_sec_exit(void) +{ + svcsec_null_exit(); + ptlrpcs_null_exit(); +} + + +EXPORT_SYMBOL(ptlrpcs_register); +EXPORT_SYMBOL(ptlrpcs_unregister); +EXPORT_SYMBOL(ptlrpcs_sec_create); +EXPORT_SYMBOL(ptlrpcs_sec_put); +EXPORT_SYMBOL(ptlrpcs_sec_invalidate_cache); +EXPORT_SYMBOL(ptlrpcs_import_get_sec); +EXPORT_SYMBOL(ptlrpcs_import_drop_sec); +EXPORT_SYMBOL(ptlrpcs_cred_lookup); +EXPORT_SYMBOL(ptlrpcs_cred_put); +EXPORT_SYMBOL(ptlrpcs_req_get_cred); +EXPORT_SYMBOL(ptlrpcs_req_drop_cred); +EXPORT_SYMBOL(ptlrpcs_req_replace_dead_cred); +EXPORT_SYMBOL(ptlrpcs_req_refresh_cred); 
+EXPORT_SYMBOL(ptlrpcs_cli_alloc_reqbuf); +EXPORT_SYMBOL(ptlrpcs_cli_free_reqbuf); +EXPORT_SYMBOL(ptlrpcs_cli_alloc_repbuf); +EXPORT_SYMBOL(ptlrpcs_cli_free_repbuf); +EXPORT_SYMBOL(ptlrpcs_cli_wrap_request); +EXPORT_SYMBOL(ptlrpcs_cli_unwrap_reply); +EXPORT_SYMBOL(sec_alloc_reqbuf); +EXPORT_SYMBOL(sec_free_reqbuf); + +EXPORT_SYMBOL(svcsec_register); +EXPORT_SYMBOL(svcsec_unregister); +EXPORT_SYMBOL(svcsec_accept); +EXPORT_SYMBOL(svcsec_authorize); +EXPORT_SYMBOL(svcsec_alloc_repbuf); +EXPORT_SYMBOL(svcsec_cleanup_req); +EXPORT_SYMBOL(svcsec_get); +EXPORT_SYMBOL(svcsec_put); +EXPORT_SYMBOL(svcsec_alloc_reply_state); +EXPORT_SYMBOL(svcsec_free_reply_state); + +MODULE_AUTHOR("Cluster File Systems, Inc. "); +MODULE_DESCRIPTION("Lustre Security Support"); +MODULE_LICENSE("GPL"); + +module_init(ptlrpc_sec_init); +module_exit(ptlrpc_sec_exit); diff --git a/lustre/sec/sec_null.c b/lustre/sec/sec_null.c new file mode 100644 index 0000000..3d9d908 --- /dev/null +++ b/lustre/sec/sec_null.c @@ -0,0 +1,195 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif +#define DEBUG_SUBSYSTEM S_SEC +#ifdef __KERNEL__ +#include +#include +#include +#else +#include +#endif + +#include +#include +#include +#include + +static int null_cred_refresh(struct ptlrpc_cred *cred) +{ + ENTRY; + LASSERT(cred->pc_flags & PTLRPC_CRED_UPTODATE); + RETURN(0); +} + +static int null_cred_match(struct ptlrpc_cred *cred, + struct ptlrpc_request *req, + struct vfs_cred *vcred) +{ + ENTRY; + RETURN(1); +} + +static int null_cred_sign(struct ptlrpc_cred *cred, + struct ptlrpc_request *req) +{ + struct ptlrpcs_wire_hdr *hdr = buf_to_sec_hdr(req->rq_reqbuf); + ENTRY; + + hdr->sec_len = cpu_to_le32(0); + + RETURN(0); +} + +static int null_cred_verify(struct ptlrpc_cred *cred, + struct ptlrpc_request *req) +{ + struct ptlrpcs_wire_hdr *hdr = buf_to_sec_hdr(req->rq_repbuf); + + if (hdr->sec_len != 0) { + CERROR("security payload %u not zero\n", hdr->sec_len); + RETURN(-EPROTO); + } + + req->rq_repmsg = (struct lustre_msg *)(hdr + 1); + req->rq_replen = hdr->msg_len; + CDEBUG(D_SEC, "set repmsg at %p, len %d\n", + req->rq_repmsg, req->rq_replen); + + RETURN(0); +} + +static void null_cred_destroy(struct ptlrpc_cred *cred) +{ + LASSERT(!atomic_read(&cred->pc_refcount)); + + CDEBUG(D_SEC, "NULL_SEC: destroy cred %p\n", cred); + OBD_FREE(cred, sizeof(*cred)); +} + +static struct ptlrpc_credops null_credops = { + .refresh = null_cred_refresh, + .match = null_cred_match, + .sign = null_cred_sign, + .verify = null_cred_verify, + .destroy = null_cred_destroy, +}; + +static +struct ptlrpc_sec* null_create_sec(ptlrpcs_flavor_t *flavor, + const char *pipe_dir, + void *pipe_data) +{ + struct ptlrpc_sec *sec; + ENTRY; + + LASSERT(flavor->flavor == PTLRPC_SEC_NULL); + + OBD_ALLOC(sec, sizeof(*sec)); + if (!sec) + RETURN(ERR_PTR(-ENOMEM)); + + sec->ps_sectype = PTLRPC_SEC_TYPE_NONE; + sec->ps_expire = (-1UL >> 1); /* never expire */ + sec->ps_nextgc = (-1UL >> 1); + sec->ps_flags = 0; + + CDEBUG(D_SEC, "Create 
NULL security module at %p\n", sec); + RETURN(sec); +} + +static +void null_destroy_sec(struct ptlrpc_sec *sec) +{ + ENTRY; + + CDEBUG(D_SEC, "Destroy NULL security module at %p\n", sec); + + LASSERT(!atomic_read(&sec->ps_refcount)); + OBD_FREE(sec, sizeof(*sec)); + EXIT; +} + +static +struct ptlrpc_cred* null_create_cred(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + struct vfs_cred *vcred) +{ + struct ptlrpc_cred *cred; + ENTRY; + + OBD_ALLOC(cred, sizeof(*cred)); + if (!cred) + RETURN(NULL); + + INIT_LIST_HEAD(&cred->pc_hash); + atomic_set(&cred->pc_refcount, 0); + cred->pc_sec = sec; + cred->pc_ops = &null_credops; + cred->pc_req = req; + cred->pc_expire = (-1UL >> 1); /* never expire */ + cred->pc_flags = PTLRPC_CRED_UPTODATE; + cred->pc_pag = vcred->vc_pag; + cred->pc_uid = vcred->vc_uid; + CDEBUG(D_SEC, "create a null cred at %p("LPU64"/%u)\n", + cred, vcred->vc_pag, vcred->vc_uid); + + RETURN(cred); +} + +static struct ptlrpc_secops null_secops = { + .create_sec = null_create_sec, + .destroy_sec = null_destroy_sec, + .create_cred = null_create_cred, +}; + +static struct ptlrpc_sec_type null_type = { + .pst_owner = THIS_MODULE, + .pst_name = "NULL_SEC", + .pst_inst = ATOMIC_INIT(0), + .pst_flavor = {PTLRPC_SEC_NULL, 0}, + .pst_ops = &null_secops, +}; + +int ptlrpcs_null_init(void) +{ + int rc; + + rc = ptlrpcs_register(&null_type); + if (rc) + CERROR("failed to register NULL security: %d\n", rc); + + return rc; +} + +int ptlrpcs_null_exit(void) +{ + int rc; + + rc = ptlrpcs_unregister(&null_type); + if (rc) + CERROR("cannot unregister NULL security: %d\n", rc); + + return rc; +} diff --git a/lustre/sec/svcsec.c b/lustre/sec/svcsec.c new file mode 100644 index 0000000..b6792c1 --- /dev/null +++ b/lustre/sec/svcsec.c @@ -0,0 +1,273 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. 
+ * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif +#define DEBUG_SUBSYSTEM S_SEC +#ifdef __KERNEL__ +#include +#include +#include +#else +#include +#endif + +#include +#include +#include +#include +#include + +static spinlock_t svcsecs_lock = SPIN_LOCK_UNLOCKED; +static struct ptlrpc_svcsec *svcsecs[PTLRPC_SEC_MAX_FLAVORS] = { + NULL, +}; + +int svcsec_register(struct ptlrpc_svcsec *sec) +{ + __u32 flavor = sec->pss_flavor.flavor; + + if (flavor >= PTLRPC_SEC_MAX_FLAVORS) + return -EINVAL; + + spin_lock(&svcsecs_lock); + if (svcsecs[flavor]) { + spin_unlock(&svcsecs_lock); + return -EALREADY; + } + svcsecs[flavor] = sec; + spin_unlock(&svcsecs_lock); + + CDEBUG(D_SEC, "Registered svc security module %s\n", sec->pss_name); + return 0; +} + +int svcsec_unregister(struct ptlrpc_svcsec *sec) +{ + __u32 flavor = sec->pss_flavor.flavor; + + if (flavor >= PTLRPC_SEC_MAX_FLAVORS) + return -EINVAL; + + spin_lock(&svcsecs_lock); + if (!svcsecs[flavor]) { + spin_unlock(&svcsecs_lock); + return -EINVAL; + } + + LASSERT(svcsecs[flavor] == sec); + + CDEBUG(D_SEC, "Unregistered svc security module %s\n", sec->pss_name); + svcsecs[flavor] = NULL; + spin_unlock(&svcsecs_lock); + + return 0; +} + +static +struct ptlrpc_svcsec * flavor2svcsec(__u32 flavor) +{ + struct ptlrpc_svcsec *sec; + + if (flavor >= PTLRPC_SEC_MAX_FLAVORS) + return NULL; + + 
spin_lock(&svcsecs_lock); + sec = svcsecs[flavor]; + if (sec && !try_module_get(sec->pss_owner)) + sec = NULL; + spin_unlock(&svcsecs_lock); + return sec; +} + +struct ptlrpc_svcsec * svcsec_get(struct ptlrpc_svcsec *sec) +{ + int rc; + + spin_lock(&svcsecs_lock); + rc = try_module_get(sec->pss_owner); + spin_unlock(&svcsecs_lock); + LASSERT(rc); + return sec; +} + +void svcsec_put(struct ptlrpc_svcsec *sec) +{ + spin_lock(&svcsecs_lock); + module_put(sec->pss_owner); + spin_unlock(&svcsecs_lock); +} + +/* + * common code to allocate reply_state buffer. + */ +int svcsec_alloc_reply_state(struct ptlrpc_request *req, + int msgsize, int secsize) +{ + struct ptlrpc_reply_state *rs; + char *buf; + int repsize, bufsize; + ENTRY; + + LASSERT(msgsize % 8 == 0); + LASSERT(secsize % 8 == 0); + + repsize = sizeof(struct ptlrpcs_wire_hdr) + msgsize + secsize; + bufsize = repsize + sizeof(struct ptlrpc_reply_state); + + OBD_ALLOC(buf, bufsize); + if (!buf) { + CERROR("can't alloc %d\n", bufsize); + RETURN(-ENOMEM); + } + + /* req->rq_repbuf is not used on server side */ + rs = (struct ptlrpc_reply_state *) (buf + repsize); + rs->rs_buf = buf; + rs->rs_buf_len = bufsize; + rs->rs_repbuf = buf; + rs->rs_repbuf_len = repsize; + /* current known data length is hdr + msg, security payload + * will be added on later. 
+ */ + rs->rs_repdata_len = sizeof(struct ptlrpcs_wire_hdr) + msgsize; + req->rq_repmsg = rs->rs_msg = (struct lustre_msg *) + (rs->rs_repbuf + sizeof(struct ptlrpcs_wire_hdr)); + + req->rq_reply_state = rs; + + CDEBUG(D_SEC, "alloc rs buf at %p, len %d; repbuf at %p, len %d\n", + rs->rs_buf, rs->rs_buf_len, rs->rs_repbuf, rs->rs_repbuf_len); + + RETURN(0); +} + +void svcsec_free_reply_state(struct ptlrpc_reply_state *rs) +{ + char *p; + ENTRY; + + /* for work around memory-alloc debug poison */ + LASSERT(rs); + p = rs->rs_buf; + OBD_FREE(p, rs->rs_buf_len); + EXIT; +} + +int svcsec_alloc_repbuf(struct ptlrpc_svcsec *svcsec, + struct ptlrpc_request *req, + int msgsize) +{ + LASSERT(svcsec); + LASSERT(msgsize % 8 == 0); + + if (svcsec->alloc_repbuf) + return svcsec->alloc_repbuf(svcsec, req, msgsize); + else + return svcsec_alloc_reply_state(req, msgsize, 0); +} + +int svcsec_accept(struct ptlrpc_request *req, enum ptlrpcs_error *res) +{ + struct ptlrpc_svcsec *sec; + struct ptlrpcs_wire_hdr *sec_hdr; + int rc; + ENTRY; + + LASSERT(req->rq_reqbuf); + LASSERT(!req->rq_reqmsg); + LASSERT(!req->rq_svcsec); + + *res = PTLRPCS_BADCRED; + if (req->rq_reqbuf_len < sizeof(*sec_hdr)) { + CERROR("drop too short msg (length: %d)\n", req->rq_reqbuf_len); + RETURN(SVC_DROP); + } + + sec_hdr = (struct ptlrpcs_wire_hdr *) req->rq_reqbuf; + sec_hdr->flavor = le32_to_cpu(sec_hdr->flavor); + sec_hdr->sectype = le32_to_cpu(sec_hdr->sectype); + sec_hdr->msg_len = le32_to_cpu(sec_hdr->msg_len); + sec_hdr->sec_len = le32_to_cpu(sec_hdr->sec_len); + + /* sanity check */ + switch (sec_hdr->sectype) { + case PTLRPC_SEC_TYPE_NONE: + case PTLRPC_SEC_TYPE_AUTH: + case PTLRPC_SEC_TYPE_PRIV: + break; + default: + CERROR("unknown security type %d\n", sec_hdr->sectype); + RETURN(SVC_DROP); + } + + if (sizeof(*sec_hdr) + sec_hdr->msg_len + sec_hdr->sec_len > + req->rq_reqbuf_len) { + CERROR("received %d, msg %d, sec %d\n", + req->rq_reqbuf_len, sec_hdr->msg_len, sec_hdr->sec_len); + 
RETURN(SVC_DROP); + } + + req->rq_svcsec = sec = flavor2svcsec(sec_hdr->flavor); + if (!sec) { + CERROR("drop msg: unsupported flavor %d\n", sec_hdr->flavor); + RETURN(SVC_DROP); + } + LASSERT(sec->accept); + + rc = sec->accept(req, res); + + switch (rc) { + case SVC_DROP: + svcsec_put(sec); + req->rq_svcsec = NULL; + break; + case SVC_OK: + case SVC_LOGIN: + case SVC_LOGOUT: + LASSERT(req->rq_reqmsg); + break; + } + + RETURN(rc); +} + +int svcsec_authorize(struct ptlrpc_request *req) +{ + LASSERT(req->rq_svcsec); + LASSERT(req->rq_svcsec->authorize); + + return (req->rq_svcsec->authorize(req)); +} + +void svcsec_cleanup_req(struct ptlrpc_request *req) +{ + struct ptlrpc_svcsec *svcsec = req->rq_svcsec; + ENTRY; + + LASSERT(svcsec); + LASSERT(svcsec->cleanup_req || !req->rq_sec_svcdata); + + if (svcsec->cleanup_req) + svcsec->cleanup_req(svcsec, req); + EXIT; +} diff --git a/lustre/sec/svcsec_null.c b/lustre/sec/svcsec_null.c new file mode 100644 index 0000000..5e7eed8 --- /dev/null +++ b/lustre/sec/svcsec_null.c @@ -0,0 +1,111 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif +#define DEBUG_SUBSYSTEM S_SEC +#ifdef __KERNEL__ +#include +#include +#include +#else +#include +#endif + +#include +#include +#include +#include +#include + +static +int null_svcsec_accept(struct ptlrpc_request *req, enum ptlrpcs_error *res) +{ + struct ptlrpcs_wire_hdr *hdr = buf_to_sec_hdr(req->rq_reqbuf); + ENTRY; + + LASSERT(hdr->flavor == PTLRPC_SEC_NULL); + + if (hdr->sec_len != 0) { + CERROR("security payload %d not zero\n", hdr->sec_len); + *res = PTLRPCS_REJECTEDCRED; + RETURN(SVC_DROP); + } + + req->rq_reqmsg = (struct lustre_msg *)(hdr + 1); + req->rq_reqlen = hdr->msg_len; + *res = PTLRPCS_OK; + CDEBUG(D_SEC, "req %p: set reqmsg at %p, len %d\n", + req, req->rq_reqmsg, req->rq_reqlen); + RETURN(SVC_OK); +} + +static +int null_svcsec_authorize(struct ptlrpc_request *req) +{ + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct ptlrpcs_wire_hdr *hdr; + ENTRY; + + LASSERT(rs); + LASSERT(rs->rs_repbuf_len >= 4 * 4); + + hdr = buf_to_sec_hdr(rs->rs_repbuf); + hdr->flavor = cpu_to_le32(PTLRPC_SEC_NULL); + hdr->sectype = cpu_to_le32(PTLRPC_SEC_TYPE_AUTH); + hdr->msg_len = cpu_to_le32(req->rq_replen); + hdr->sec_len = cpu_to_le32(0); + + CDEBUG(D_SEC, "fill in datasize %d\n", rs->rs_repdata_len); + RETURN(0); +} + +static struct ptlrpc_svcsec null_svcsec = { + .pss_owner = THIS_MODULE, + .pss_name = "NULL_SVCSEC", + .pss_flavor = {PTLRPC_SEC_NULL, 0}, + .accept = null_svcsec_accept, + .authorize = null_svcsec_authorize, +}; + +int svcsec_null_init() +{ + int rc; + + rc = svcsec_register(&null_svcsec); + if (rc) + CERROR("failed to register SVCNULL security: %d\n", rc); + + return rc; +} + +int svcsec_null_exit() +{ + int rc; + + rc = svcsec_unregister(&null_svcsec); + if (rc) + CERROR("cannot unregister SVCNULL security: %d\n", rc); + + return rc; +} + diff --git a/lustre/sec/upcall_cache.c b/lustre/sec/upcall_cache.c new file mode 100644 index 0000000..49e9522 --- /dev/null +++ 
b/lustre/sec/upcall_cache.c @@ -0,0 +1,414 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif +#define DEBUG_SUBSYSTEM S_LOV +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +/* FIXME + * current ucache implementation is simply took from group hash code, almost + * without any change. it's very simple and have very limited functionality, + * and probably it's also only suitable for usage of group hash. 
+ */ + +void upcall_cache_init_entry(struct upcall_cache *cache, + struct upcall_cache_entry *entry, + __u64 key) +{ + UC_CACHE_SET_NEW(entry); + INIT_LIST_HEAD(&entry->ue_hash); + atomic_set(&entry->ue_refcount, 0); + entry->ue_key = key; + entry->ue_cache = cache; + init_waitqueue_head(&entry->ue_waitq); +} +EXPORT_SYMBOL(upcall_cache_init_entry); + +static inline struct upcall_cache_entry * +alloc_entry(struct upcall_cache *cache, __u64 key) +{ + LASSERT(cache->alloc_entry); + return cache->alloc_entry(cache, key); +} + +static void free_entry(struct upcall_cache_entry *entry) +{ + struct upcall_cache *cache = entry->ue_cache; + + LASSERT(cache); + LASSERT(cache->free_entry); + LASSERT(atomic_read(&entry->ue_refcount) == 0); + + CDEBUG(D_OTHER, "destroy %s entry %p for key "LPU64"\n", + cache->uc_name, entry, entry->ue_key); + + list_del(&entry->ue_hash); + cache->free_entry(cache, entry); +} + +static inline void get_entry(struct upcall_cache_entry *entry) +{ + atomic_inc(&entry->ue_refcount); +} + +static inline void put_entry(struct upcall_cache_entry *entry) +{ + if (atomic_dec_and_test(&entry->ue_refcount) && + !UC_CACHE_IS_VALID(entry)) { + free_entry(entry); + } +} + +static inline int refresh_entry(struct upcall_cache_entry *entry) +{ + struct upcall_cache *cache = entry->ue_cache; + + LASSERT(cache); + LASSERT(cache->make_upcall); + + return cache->make_upcall(cache, entry); +} + +static int check_unlink_entry(struct upcall_cache_entry *entry) +{ + if (UC_CACHE_IS_VALID(entry) && + time_before(get_seconds(), entry->ue_expire)) + return 0; + + if (UC_CACHE_IS_ACQUIRING(entry) && + time_after(get_seconds(), entry->ue_acquire_expire)) { + UC_CACHE_SET_EXPIRED(entry); + wake_up_all(&entry->ue_waitq); + } else if (!UC_CACHE_IS_INVALID(entry)) { + UC_CACHE_SET_EXPIRED(entry); + } + + list_del_init(&entry->ue_hash); + if (!atomic_read(&entry->ue_refcount)) + free_entry(entry); + return 1; +} + +/* XXX + * currently always use write_lock + */ +static struct 
upcall_cache_entry * +__get_entry(struct upcall_cache *cache, unsigned int hash, __u64 key, + int create, int async) +{ + struct list_head *head; + struct upcall_cache_entry *entry, *next, *new = NULL; + int found = 0, rc; + ENTRY; + + LASSERT(hash < cache->uc_hashsize); + + head = &cache->uc_hashtable[hash]; + +find_again: + write_lock(&cache->uc_hashlock); + list_for_each_entry_safe(entry, next, head, ue_hash) { + if (check_unlink_entry(entry)) + continue; + if (entry->ue_key == key) { + found = 1; + break; + } + } + + if (!found) { + if (!create) + RETURN(NULL); + if (!new) { + write_unlock(&cache->uc_hashlock); + new = alloc_entry(cache, key); + if (!new) { + CERROR("fail to alloc entry\n"); + RETURN(NULL); + } + goto find_again; + } else { + list_add(&new->ue_hash, head); + entry = new; + } + } else { + if (new) { + free_entry(new); + new = NULL; + } + list_move(&entry->ue_hash, head); + } + get_entry(entry); + + /* as for this moment, we have found matched entry + * and hold a ref of it. 
if it's NEW (we created it), + * we must give it a push to refresh + */ + if (UC_CACHE_IS_NEW(entry)) { + LASSERT(entry == new); + UC_CACHE_SET_ACQUIRING(entry); + UC_CACHE_CLEAR_NEW(entry); + entry->ue_acquire_expire = get_seconds() + + cache->uc_acquire_expire; + + write_unlock(&cache->uc_hashlock); + rc = refresh_entry(entry); + write_lock(&cache->uc_hashlock); + if (rc) { + UC_CACHE_CLEAR_ACQUIRING(entry); + UC_CACHE_SET_INVALID(entry); + } + } + + /* caller don't want to wait */ + if (async) { + write_unlock(&cache->uc_hashlock); + RETURN(entry); + } + + /* someone (and only one) is doing upcall upon + * this item, just wait it complete + */ + if (UC_CACHE_IS_ACQUIRING(entry)) { + wait_queue_t wait; + + init_waitqueue_entry(&wait, current); + add_wait_queue(&entry->ue_waitq, &wait); + set_current_state(TASK_INTERRUPTIBLE); + write_unlock(&cache->uc_hashlock); + + schedule_timeout(cache->uc_acquire_expire); + + write_lock(&cache->uc_hashlock); + remove_wait_queue(&entry->ue_waitq, &wait); + if (UC_CACHE_IS_ACQUIRING(entry)) { + /* we're interrupted or upcall failed + * in the middle + */ + CERROR("entry %p not refreshed: cur %lu, key "LPU64", " + "ref %d fl %u, ac %ld, ex %ld\n", + entry, get_seconds(), entry->ue_key, + atomic_read(&entry->ue_refcount), + entry->ue_flags, entry->ue_acquire_expire, + entry->ue_expire); + put_entry(entry); + write_unlock(&cache->uc_hashlock); + RETURN(NULL); + } + /* fall through */ + } + + /* invalid means error, don't need to try again */ + if (UC_CACHE_IS_INVALID(entry)) { + put_entry(entry); + write_unlock(&cache->uc_hashlock); + RETURN(NULL); + } + + /* check expired + * We can't refresh the existed one because some + * memory might be shared by multiple processes. + */ + if (check_unlink_entry(entry)) { + /* if expired, try again. but if this entry is + * created by me and too quickly turn to expired + * without any error, should at least give a + * chance to use it once. 
+ */ + if (entry != new) { + put_entry(entry); + write_unlock(&cache->uc_hashlock); + new = NULL; + goto find_again; + } + } + + /* Now we know it's good */ + LASSERT(UC_CACHE_IS_VALID(entry)); + write_unlock(&cache->uc_hashlock); + + RETURN(entry); +} + +struct upcall_cache_entry * +upcall_cache_get_entry(struct upcall_cache *cache, __u64 key) +{ + unsigned int hash; + + LASSERT(cache->hash); + + hash = cache->hash(cache, key); + + return __get_entry(cache, hash, key, 1, 0); +} +EXPORT_SYMBOL(upcall_cache_get_entry); + +void upcall_cache_put_entry(struct upcall_cache_entry *entry) +{ + struct upcall_cache *cache = entry->ue_cache; + + write_lock(&cache->uc_hashlock); + LASSERTF(atomic_read(&entry->ue_refcount) > 0, + "entry %p: ref %d\n", entry, atomic_read(&entry->ue_refcount)); + put_entry(entry); + write_unlock(&cache->uc_hashlock); +} +EXPORT_SYMBOL(upcall_cache_put_entry); + +int upcall_cache_downcall(struct upcall_cache *cache, __u64 key, + int err, void *args) +{ + struct list_head *head; + struct upcall_cache_entry *entry; + int found = 0, rc; + unsigned int hash; + ENTRY; + + hash = cache->hash(cache, key); + LASSERT(hash < cache->uc_hashsize); + + head = &cache->uc_hashtable[hash]; + + write_lock(&cache->uc_hashlock); + list_for_each_entry(entry, head, ue_hash) { + if (entry->ue_key == key) { + found = 1; + break; + } + } + if (!found) { + /* haven't found, it's possible */ + write_unlock(&cache->uc_hashlock); + CWARN("key "LPU64" entry dosen't found\n", key); + RETURN(-EINVAL); + } + + if (err < 0) { + UC_CACHE_SET_INVALID(entry); + GOTO(out, rc = err); + } + + if (!UC_CACHE_IS_ACQUIRING(entry) || + UC_CACHE_IS_INVALID(entry) || + UC_CACHE_IS_EXPIRED(entry)) { + CWARN("stale entry %p: cur %lu, key "LPU64", ref %d, " + "fl %u, ac %ld, ex %ld\n", + entry, get_seconds(), entry->ue_key, + atomic_read(&entry->ue_refcount), entry->ue_flags, + entry->ue_acquire_expire, entry->ue_expire); + GOTO(out, rc = -EINVAL); + } + + atomic_inc(&entry->ue_refcount); + 
write_unlock(&cache->uc_hashlock); + rc = cache->parse_downcall(cache, entry, args); + write_lock(&cache->uc_hashlock); + atomic_dec(&entry->ue_refcount); + if (rc) { + UC_CACHE_SET_INVALID(entry); + list_del_init(&entry->ue_hash); + GOTO(out, rc); + } + entry->ue_expire = get_seconds() + cache->uc_entry_expire; + UC_CACHE_SET_VALID(entry); + CDEBUG(D_OTHER, "create ucache entry %p(key "LPU64")\n", + entry, entry->ue_key); +out: + wake_up_all(&entry->ue_waitq); + write_unlock(&cache->uc_hashlock); + RETURN(rc); +} +EXPORT_SYMBOL(upcall_cache_downcall); + +void upcall_cache_flush_one(struct upcall_cache *cache, __u64 key) +{ + struct list_head *head; + struct upcall_cache_entry *entry; + unsigned int hash; + int found = 0; + ENTRY; + + hash = cache->hash(cache, key); + LASSERT(hash < cache->uc_hashsize); + + head = &cache->uc_hashtable[hash]; + + write_lock(&cache->uc_hashlock); + list_for_each_entry(entry, head, ue_hash) { + if (entry->ue_key == key) { + found = 1; + break; + } + } + + if (found) { + UC_CACHE_SET_EXPIRED(entry); + if (!atomic_read(&entry->ue_refcount)) + free_entry(entry); + } + write_unlock(&cache->uc_hashlock); +} +EXPORT_SYMBOL(upcall_cache_flush_one); + +static void cache_flush(struct upcall_cache *cache, int force, int sync) +{ + struct upcall_cache_entry *entry, *next; + int i; + ENTRY; + + write_lock(&cache->uc_hashlock); + for (i = 0; i < cache->uc_hashsize; i++) { + list_for_each_entry_safe(entry, next, + &cache->uc_hashtable[i], ue_hash) { + if (!force && atomic_read(&entry->ue_refcount)) { + UC_CACHE_SET_EXPIRED(entry); + continue; + } + LASSERT(!atomic_read(&entry->ue_refcount)); + free_entry(entry); + } + } + write_unlock(&cache->uc_hashlock); + EXIT; +} + +void upcall_cache_flush_idle(struct upcall_cache *cache) +{ + cache_flush(cache, 0, 0); +} + +void upcall_cache_flush_all(struct upcall_cache *cache) +{ + cache_flush(cache, 1, 0); +} +EXPORT_SYMBOL(upcall_cache_flush_idle); +EXPORT_SYMBOL(upcall_cache_flush_all); diff --git 
a/lustre/smfs/dir.c b/lustre/smfs/dir.c index 3d77ef7..423aab8 100644 --- a/lustre/smfs/dir.c +++ b/lustre/smfs/dir.c @@ -169,7 +169,6 @@ static struct dentry *smfs_lookup(struct inode *dir, struct dentry *dentry, d_add(dentry, inode); SMFS_POST_HOOK(dir, HOOK_LOOKUP, &msg, rc); -exit: post_smfs_dentry(cache_dentry); post_smfs_dentry(cache_parent); RETURN(ERR_PTR(rc)); @@ -688,7 +687,6 @@ static int smfs_readdir(struct file *filp, void *dirent, filldir_t filldir) SMFS_POST_HOOK(dentry->d_inode, HOOK_READDIR, &msg, rc); duplicate_file(filp, sfi->c_file); -exit: if (rc > 0) rc = 0; diff --git a/lustre/tests/acl_asroot.test b/lustre/tests/acl_asroot.test new file mode 100644 index 0000000..af0fed3 --- /dev/null +++ b/lustre/tests/acl_asroot.test @@ -0,0 +1,46 @@ +! +! Test that can only be run as root as it uses mknod. +! +$mkdir asroot +$ umask 027 +$ mknod asroot/null c 1 3 +$ acl_mode asroot/null +crw-r----- +$ setfacl -m u:joe:rw,u:lisa:- asroot/null +$ acl_mode asroot/null +crw-rw----+ +$ setfacl -m u:lisa:r asroot/null +$ getfacl --omit-header asroot/null +user::rw- +user:joe:rw- +user:lisa:r-- +group::r-- +mask::rw- +other::--- + +$ su - lisa -c chmod\ +rw\ /mnt/lustre/asroot/null +chmod: changing permissions of `/mnt/lustre/asroot/null': Operation not permitted +$ rm -f asroot/null +$ mkfifo asroot/fifo +$ acl_mode asroot/fifo +prw-r----- +$ setfacl -m u:joe:- asroot/fifo +$ getfacl --omit-header asroot/fifo +user::rw- +user:joe:--- +group::r-- +mask::r-- +other::--- + +$ rm asroot/fifo +$ mknod asroot/block b 1 1 +$ setfacl -m u:joe:- asroot/block +$ getfacl --omit-header asroot/block +user::rw- +user:joe:--- +group::r-- +mask::r-- +other::--- + +$ rm asroot/block +$ rmdir asroot diff --git a/lustre/tests/acl_fileutil.test b/lustre/tests/acl_fileutil.test new file mode 100644 index 0000000..9760bf4 --- /dev/null +++ b/lustre/tests/acl_fileutil.test @@ -0,0 +1,66 @@ +! +! Test for the patched file utilities. +! 
+$ umask 022 +$ mkdir dir +$ acl_mode dir +drwxr-xr-x +$ touch dir/f +$ getfacl --omit-header dir/f +user::rw- +group::r-- +other::r-- + +$ umask 027 +$ cp -p dir/f dir/g +$ getfacl --omit-header dir/g +user::rw- +group::r-- +other::r-- + +$ rm dir/g +$ cp dir/f dir/g +$ getfacl --omit-header dir/g +user::rw- +group::r-- +other::--- + +$ setfacl -m u::rwx,u:joe:rwx,g::rwx,o::r-x dir/. +$ setfacl -dm u::rwx,u:joe:rwx,g::rwx,o::r-x dir/. +$ acl_mode dir +drwxrwxr-x+ +$ touch dir/h +$ getfacl --omit-header --no-effective dir/h +user::rw- +user:joe:rwx +group::rwx +mask::r-- +other::--- + +$ mkdir dir/d +$ getfacl --omit-header --no-effective dir/d +user::rwx +user:joe:rwx +group::rwx +mask::r-x +other::--- +default:user::rwx +default:user:joe:rwx +default:group::rwx +default:mask::rwx +default:other::r-x + +$ cp dir/f dir/i +$ getfacl --omit-header --no-effective dir/i +user::rw- +user:joe:rwx +group::rwx +mask::r-- +other::--- + +$ acl_mode dir/f +-rw-r--r-- +$ cp -p dir/f dir/j +$ acl_mode dir/j +-rw-r--r-- +$ rm -r dir diff --git a/lustre/tests/acl_misc.test b/lustre/tests/acl_misc.test new file mode 100644 index 0000000..def2929 --- /dev/null +++ b/lustre/tests/acl_misc.test @@ -0,0 +1,386 @@ +! +! Pretty comprehensive ACL tests. +! +! This must be run on a filesystem with ACL support. Also, you will need +! two dummy users (lisa and joe) and a dummy group (toolies). +! +$ umask 027 +$ touch f +! 
Only change a base ACL: +$ setfacl -m u::r f +$ setfacl -m u::rw,u:lisa:rw f +$ acl_mode f +-rw-rw----+ +$ getfacl --omit-header f +user::rw- +user:lisa:rw- +group::r-- +mask::rw- +other::--- + +$ rm f +$ umask 022 +$ touch f +$ setfacl -m u:lisa:rw f +$ acl_mode f +-rw-rw-r--+ +$ getfacl --omit-header f +user::rw- +user:lisa:rw- +group::r-- +mask::rw- +other::r-- + +$rm f +$ umask 027 +$ mkdir d +$ setfacl -m u:lisa:rwx d +$ acl_mode d +drwxrwx---+ +$ getfacl --omit-header d +user::rwx +user:lisa:rwx +group::r-x +mask::rwx +other::--- + +$ rmdir d +$ umask 022 +$ mkdir d +$ setfacl -m u:lisa:rwx d +$ acl_mode d +drwxrwxr-x+ +$ getfacl --omit-header d +user::rwx +user:lisa:rwx +group::r-x +mask::rwx +other::r-x + +$ rmdir d +! +! Multiple users +! +$ umask 022 +$ touch f +$ setfacl -m u:lisa:rw,u:joe:r f +$ acl_mode f +-rw-rw-r--+ +$ getfacl --omit-header f +user::rw- +user:joe:r-- +user:lisa:rw- +group::r-- +mask::rw- +other::r-- + +! +! Multiple groups +! +$ setfacl -m g:users:rw,g:toolies:r f +$ acl_mode f +-rw-rw-r--+ +$ getfacl --omit-header f +user::rw- +user:joe:r-- +user:lisa:rw- +group::r-- +group:users:rw- +group:toolies:r-- +mask::rw- +other::r-- + +! +! Remove one group +! +$ setfacl -x g:users f +$ acl_mode f +-rw-rw-r--+ +$ getfacl --omit-header f +user::rw- +user:joe:r-- +user:lisa:rw- +group::r-- +group:toolies:r-- +mask::rw- +other::r-- + +! +! Remove one user +! +$ setfacl -x u:joe f +$ acl_mode f +-rw-rw-r--+ +$ getfacl --omit-header f +user::rw- +user:lisa:rw- +group::r-- +group:toolies:r-- +mask::rw- +other::r-- + +$ rm f +! +! Default ACL +! +$ umask 027 +$ mkdir d +$ setfacl -m u:lisa:rwx,u:joe:rw,d:u:lisa:rwx,d:m:rx d +$ acl_mode d +drwxrwx---+ +$ getfacl --omit-header d +user::rwx +user:joe:rw- +user:lisa:rwx +group::r-x +mask::rwx +other::--- +default:user::rwx +default:user:lisa:rwx #effective:r-x +default:group::r-x +default:mask::r-x +default:other::--- + +! +! Umask now ignored? +! 
+$ umask 027 +$ touch d/f +$ acl_mode d/f +-rw-r-----+ +$ getfacl --omit-header d/f +user::rw- +user:lisa:rwx #effective:r-- +group::r-x #effective:r-- +mask::r-- +other::--- + +$ rm d/f +$ umask 022 +$ touch d/f +$ acl_mode d/f +-rw-r-----+ +$ getfacl --omit-header d/f +user::rw- +user:lisa:rwx #effective:r-- +group::r-x #effective:r-- +mask::r-- +other::--- + +$ rm d/f +! +! Default ACL copying +! +$ umask 000 +$ mkdir d/d +$ acl_mode d/d +drwxr-x---+ +$ getfacl --omit-header d/d +user::rwx +user:lisa:rwx #effective:r-x +group::r-x +mask::r-x +other::--- +default:user::rwx +default:user:lisa:rwx #effective:r-x +default:group::r-x +default:mask::r-x +default:other::--- + +$ rmdir d/d +$ umask 022 +$ mkdir d/d +$ acl_mode d/d +drwxr-x---+ +$ getfacl --omit-header d/d +user::rwx +user:lisa:rwx #effective:r-x +group::r-x +mask::r-x +other::--- +default:user::rwx +default:user:lisa:rwx #effective:r-x +default:group::r-x +default:mask::r-x +default:other::--- + +! +! Add some users and groups +! +$ setfacl -nm u:joe:rx,d:u:joe:rx,g:users:rx,g:toolies:rwx d/d +$ acl_mode d/d +drwxr-x---+ +$ getfacl --omit-header d/d +user::rwx +user:joe:r-x +user:lisa:rwx #effective:r-x +group::r-x +group:users:r-x +group:toolies:rwx #effective:r-x +mask::r-x +other::--- +default:user::rwx +default:user:joe:r-x +default:user:lisa:rwx #effective:r-x +default:group::r-x +default:mask::r-x +default:other::--- + +! +! symlink in directory with default ACL? +! +$ ln -s d d/l +$ acl_mode d/l +lrwxrwxrwx +$ acl_mode -L d/l +drwxr-x---+ +$ getfacl --omit-header d/l +user::rwx +user:joe:r-x +user:lisa:rwx #effective:r-x +group::r-x +group:users:r-x +group:toolies:rwx #effective:r-x +mask::r-x +other::--- +default:user::rwx +default:user:joe:r-x +default:user:lisa:rwx #effective:r-x +default:group::r-x +default:mask::r-x +default:other::--- + +$ rm d/l +! +! Does mask manipulation work? +! 
+$ setfacl -m g:toolies:rx,u:lisa:rx d/d +$ acl_mode d/d +drwxr-x---+ +$ getfacl --omit-header d/d +user::rwx +user:joe:r-x +user:lisa:r-x +group::r-x +group:users:r-x +group:toolies:r-x +mask::r-x +other::--- +default:user::rwx +default:user:joe:r-x +default:user:lisa:rwx #effective:r-x +default:group::r-x +default:mask::r-x +default:other::--- + +$ setfacl -m d:u:lisa:rwx d/d +$ acl_mode d/d +drwxr-x---+ +$ getfacl --omit-header d/d +user::rwx +user:joe:r-x +user:lisa:r-x +group::r-x +group:users:r-x +group:toolies:r-x +mask::r-x +other::--- +default:user::rwx +default:user:joe:r-x +default:user:lisa:rwx +default:group::r-x +default:mask::rwx +default:other::--- + +$ rmdir d/d +! +! Remove the default ACL +! +$ setfacl -k d +$ acl_mode d +drwxrwx---+ +$ getfacl --omit-header d +user::rwx +user:joe:rw- +user:lisa:rwx +group::r-x +mask::rwx +other::--- + +! +! Reset to base entries +! +$ setfacl -b d +$ acl_mode d +drwxr-x--- +$ getfacl --omit-header d +user::rwx +group::r-x +other::--- + +! +! Now, chmod should change the group_obj entry +! 
+$ chmod 775 d +$ acl_mode d +drwxrwxr-x +$ getfacl --omit-header d +user::rwx +group::rwx +other::r-x + +$ rmdir d +$ umask 002 +$ mkdir d +$ setfacl -m u:joe:rwx,u:lisa:rx,d:u:joe:rwx,d:u:lisa:rx d +$ acl_mode d +drwxrwxr-x+ +$ getfacl --omit-header d +user::rwx +user:joe:rwx +user:lisa:r-x +group::rwx +mask::rwx +other::r-x +default:user::rwx +default:user:joe:rwx +default:user:lisa:r-x +default:group::rwx +default:mask::rwx +default:other::r-x + +$ chmod 750 d +$ acl_mode d +drwxr-x---+ +$ getfacl --omit-header d +user::rwx +user:joe:rwx #effective:r-x +user:lisa:r-x +group::rwx #effective:r-x +mask::r-x +other::--- +default:user::rwx +default:user:joe:rwx +default:user:lisa:r-x +default:group::rwx +default:mask::rwx +default:other::r-x + +$ chmod 750 d +$ acl_mode d +drwxr-x---+ +$ getfacl --omit-header d +user::rwx +user:joe:rwx #effective:r-x +user:lisa:r-x +group::rwx #effective:r-x +mask::r-x +other::--- +default:user::rwx +default:user:joe:rwx +default:user:lisa:r-x +default:group::rwx +default:mask::rwx +default:other::r-x + +$ rmdir d diff --git a/lustre/tests/acl_mode b/lustre/tests/acl_mode new file mode 100755 index 0000000..af4b5eb --- /dev/null +++ b/lustre/tests/acl_mode @@ -0,0 +1,2 @@ +#!/bin/sh +ls -dl $* | awk -- '!/^total/ { print $1; }' diff --git a/lustre/tests/acl_perm.test b/lustre/tests/acl_perm.test new file mode 100644 index 0000000..0e79724 --- /dev/null +++ b/lustre/tests/acl_perm.test @@ -0,0 +1,18 @@ +! +! Test whether ACL permissions work +! 
+$ umask 022 +$ mkdir dir +$ umask 077 +$ touch dir/file +$ setfacl -m u:joe:rw,u:lisa:- dir/file +$ su - lisa -c cat\ /mnt/lustre/dir/file +cat: /mnt/lustre/dir/file: Permission denied +$ su - joe -c cat\ /mnt/lustre/dir/file +$ su - joe -c touch\ /mnt/lustre/dir/file +$ cat dir/file +$ setfacl -m g:users:- dir/file +$ su - nobody -c cat\ /mnt/lustre/dir/file +cat: /mnt/lustre/dir/file: Permission denied +$ rm dir/file +$ rmdir dir diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index 246f482..acab3123 100644 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -45,10 +45,12 @@ gen_second_config() { start_mds() { echo "start mds1 service on `facet_active_host mds1`" start mds1 --reformat $MDSLCONFARGS || return 94 + start_lsvcgssd || return 501 } stop_mds() { echo "stop mds1 service on `facet_active_host mds1`" stop mds1 $@ || return 97 + stop_lsvcgssd } start_ost() { @@ -63,6 +65,7 @@ stop_ost() { mount_client() { local MOUNTPATH=$1 + start_lgssd || return 502 echo "mount lustre on ${MOUNTPATH}....." zconf_mount `hostname` $MOUNTPATH || return 96 } @@ -71,11 +74,13 @@ umount_client() { local MOUNTPATH=$1 echo "umount lustre on ${MOUNTPATH}....." zconf_umount `hostname` $MOUNTPATH || return 97 + stop_lgssd } manual_umount_client(){ echo "manual umount lustre on ${MOUNTPATH}...." 
do_facet client "umount $MOUNT" + stop_lgssd } setup() { @@ -115,6 +120,7 @@ build_test_filter #create single point mountpoint gen_config +start_krb5_kdc || exit 1 test_0() { @@ -189,6 +195,7 @@ test_5() { kill -TERM $UMOUNT_PID echo "waiting for umount to finish" wait $UMOUNT_PID + stop_lgssd # cleanup client modules $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null @@ -209,10 +216,12 @@ test_5b() { [ -d $MOUNT ] || mkdir -p $MOUNT $LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null + start_lgssd || return 1 llmount $mds_HOST://mds1_svc/client_facet $MOUNT && exit 1 # cleanup client modules $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null + stop_lgssd # stop_mds is a no-op here, and should not fail stop_mds || return 2 @@ -230,15 +239,17 @@ test_5c() { [ -d $MOUNT ] || mkdir -p $MOUNT $LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null - llmount $mds_HOST://wrong_mds1_svc/client_facet $MOUNT && return 1 + start_lgssd || return 1 + llmount $mds_HOST://wrong_mds1_svc/client_facet $MOUNT && return 2 # cleanup client modules $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null + stop_lgssd - stop_mds || return 2 - stop_ost || return 3 + stop_mds || return 3 + stop_ost || return 4 - lsmod | grep -q portals && return 4 + lsmod | grep -q portals && return 5 return 0 } @@ -251,11 +262,13 @@ test_5d() { [ -d $MOUNT ] || mkdir -p $MOUNT $LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null + start_lgssd || return 1 llmount $mds_HOST://mds1_svc/client_facet $MOUNT || return 1 umount $MOUNT || return 2 # cleanup client modules $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG > /dev/null + stop_lgssd stop_mds || return 3 diff --git a/lustre/tests/gns-upcall.sh b/lustre/tests/gns-upcall.sh new file mode 100755 index 0000000..ed4c6ca --- /dev/null +++ b/lustre/tests/gns-upcall.sh @@ -0,0 +1,13 @@ +#!/bin/sh + +MOUNT=`which mount 2>/dev/null` +test "x$MOUNT" = "x" && MOUNT="/bin/mount" + 
+OPTIONS=$1 +MNTPATH=$2 + +test "x$OPTIONS" = "x" || "x$MNTPATH" = "x" && + exit 1 + +$MOUNT $OPTIONS $MNTPATH > /tmp/gns-log 2>&1 +exit $? diff --git a/lustre/tests/insanity.sh b/lustre/tests/insanity.sh index 0c3dc9a..1b5a28c 100755 --- a/lustre/tests/insanity.sh +++ b/lustre/tests/insanity.sh @@ -143,11 +143,14 @@ gen_config() { setup() { gen_config + start_krb5_kdc || exit 1 rm -rf logs/* for i in `seq $NUMOST`; do wait_for ost$i start ost$i ${REFORMAT} $OSTLCONFARGS done + start_lsvcgssd || exit 2 + start_lgssd || exit 3 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE for mds in `mds_list`; do wait_for $mds @@ -164,6 +167,8 @@ cleanup() { for mds in `mds_list`; do stop $mds ${FORCE} $MDSLCONFARGS || : done + stop_lgssd + stop_lsvcgssd for i in `seq $NUMOST`; do stop ost$i ${REFORMAT} ${FORCE} $OSTLCONFARGS || : done diff --git a/lustre/tests/krb5_env.sh b/lustre/tests/krb5_env.sh new file mode 100755 index 0000000..07e9f8e --- /dev/null +++ b/lustre/tests/krb5_env.sh @@ -0,0 +1,101 @@ +#!/bin/sh + +# +# KDC could be on remote hosts, but we suppose lgssd/lsvcgssd only +# runs locally. 
+# + +export KDCHOST=${KDCHOST:-"localhost"} +export KDCDIR=${KDCDIR:-"/usr/kerberos/sbin"} +export KRB5DIR=${KRB5DIR:-"/usr/kerberos"} +export LGSSD=${LGSSD:-"/sbin/lgssd"} +export SVCGSSD=${SVCGSSD:-"/sbin/lsvcgssd"} +export PDSH=${PDSH:-"ssh"} + +using_krb5_sec() { + if [ "x$1" != "xkrb5i" -a "x$1" != "xkrb5p" ]; then + echo "n" + else + echo "y" + fi +} + +start_krb5_kdc() { + if [ `using_krb5_sec $SECURITY` == 'n' ] ; then + return 0 + fi + + num=`$PDSH $KDCHOST "PATH=\$PATH:$KDCDIR; ps ax | grep krb5kdc | grep -v "grep" | wc -l"` + if [ $num -eq 1 ]; then + return 0 + fi + + $PDSH $KDCHOST "PATH=\$PATH:$KDCDIR; krb5kdc" + num=`$PDSH $KDCHOST "PATH=\$PATH:$KDCDIR; ps ax | grep krb5kdc | grep -v "grep" | wc -l"` + if [ $num -ne 1 ]; then + echo "fail to start krb5 KDC, check env KDCHOST and KDCDIR" + return 1 + fi + return 0 +} + +prepare_krb5_cache() { + if [ `using_krb5_sec $SECURITY` == 'n' ] ; then + return 0 + fi + + $KRB5DIR/bin/klist -5 -s + invalid=$? + if [ $invalid -eq 0 ]; then + return 0 + fi + + echo "***** refresh Kerberos V5 TGT for uid $UID *****" + $KRB5DIR/bin/kinit + ret=$? 
+ return $ret +} + +start_lsvcgssd() { + if [ `using_krb5_sec $SECURITY` == 'n' ] ; then + return 0 + fi + + killall -q -9 lsvcgssd || true + + `$SVCGSSD` + num=`ps -o cmd -C "lsvcgssd" | grep lsvcgssd | wc -l` + if [ $num -ne 1 ]; then + echo "failed to start lsvcgssd" + return 1 + fi + return 0 +} + +stop_lsvcgssd() { + killall -q -9 lsvcgssd || true + return 0 +} + +start_lgssd() { + if [ `using_krb5_sec $SECURITY` == 'n' ] ; then + return 0 + fi + + prepare_krb5_cache || exit 1 + + killall -q -9 lgssd || true + + `$LGSSD` + num=`ps -o cmd -C "lgssd" | grep lgssd | wc -l` + if [ $num -ne 1 ]; then + echo "failed to start lgssd $num" + return 1 + fi + return 0 +} + +stop_lgssd() { + killall -q -9 lgssd || true + return 0 +} diff --git a/lustre/tests/krb5_refresh_cache.sh b/lustre/tests/krb5_refresh_cache.sh new file mode 100755 index 0000000..b356306 --- /dev/null +++ b/lustre/tests/krb5_refresh_cache.sh @@ -0,0 +1,15 @@ +#!/bin/sh + +KRB5DIR=${KRB5DIR:-"/usr/kerberos"} + +$KRB5DIR/bin/klist -5 -s +invalid=$? + +if [ $invalid -eq 0 ]; then + exit 0 +fi + +echo "***** refresh Kerberos V5 TGT for uid $UID *****" +$KRB5DIR/bin/kinit +ret=$? +exit $ret diff --git a/lustre/tests/llmount.sh b/lustre/tests/llmount.sh index 5a8c205..17ce6f5 100755 --- a/lustre/tests/llmount.sh +++ b/lustre/tests/llmount.sh @@ -6,10 +6,14 @@ export PATH=`dirname $0`/../utils:$PATH LCONF=${LCONF:-lconf} NAME=${NAME:-local} LLMOUNT=${LLMOUNT:-llmount} +SECURITY=${SECURITY:-"null"} config=$NAME.xml mkconfig=$NAME.sh +. 
krb5_env.sh +start_krb5_kdc || exit 1 + if [ "$PORTALS" ]; then portals_opt="--portals=$PORTALS" fi @@ -21,16 +25,22 @@ fi if [ "$LDAPURL" ]; then conf_opt="--ldapurl $LDAPURL --config $NAME" else - sh $mkconfig $config || exit 1 + sh $mkconfig $config || exit 2 conf_opt="$config" fi [ "$NODE" ] && node_opt="--node $NODE" -${LCONF} $NOMOD $portals_opt $lustre_opt $node_opt ${REFORMAT:---reformat} $@ \ - $conf_opt || exit 2 +# We'd better start lsvcgssd after gss modules loaded. +# remove this if we don't depend on lsvcgssd in the future +${LCONF} --nosetup --sec $SECURITY $portals_opt $node_opt $@ $conf_opt || exit 3 +start_lsvcgssd || exit 4 +start_lgssd || exit 5 + +${LCONF} $NOMOD --sec $SECURITY $portals_opt $lustre_opt $node_opt \ + ${REFORMAT:---reformat} $@ $conf_opt || exit 6 if [ "$MOUNT2" ]; then - $LLMOUNT -v `hostname`:/mds1/client $MOUNT2 || exit 3 + $LLMOUNT -v -o sec=$SECURITY `hostname`:/mds1/client $MOUNT2 || exit 7 fi diff --git a/lustre/tests/llmountcleanup.sh b/lustre/tests/llmountcleanup.sh index 05ac8a4..ea054ee 100755 --- a/lustre/tests/llmountcleanup.sh +++ b/lustre/tests/llmountcleanup.sh @@ -9,6 +9,8 @@ TMP=${TMP:-/tmp} config=$NAME.xml mkconfig=$NAME.sh +. krb5_env.sh + if [ "$PORTALS" ]; then portals_opt="--portals=$PORTALS" fi @@ -36,6 +38,9 @@ ${LCONF} $NOMOD $portals_opt $lustre_opt $node_opt --cleanup $@ \ --dump $TMP/debug $conf_opt rc=$? echo "lconf DONE" +stop_lsvcgssd +stop_lgssd + BUSY=`dmesg | grep -i destruct` if [ "$BUSY" ]; then echo "$BUSY" 1>&2 diff --git a/lustre/tests/llrmount.sh b/lustre/tests/llrmount.sh index 9a5cbfa..3de7fcf1 100755 --- a/lustre/tests/llrmount.sh +++ b/lustre/tests/llrmount.sh @@ -5,10 +5,15 @@ export PATH=`dirname $0`/../utils:$PATH LCONF=${LCONF:-lconf} NAME=${NAME:-local} LLMOUNT=${LLMOUNT:-llmount} +SECURITY=${SECURITY:-"null"} config=$NAME.xml mkconfig=$NAME.sh +. 
krb5_env.sh + +start_krb5_kdc || exit 1 + if [ "$PORTALS" ]; then portals_opt="--portals=$PORTALS" fi @@ -21,16 +26,23 @@ if [ "$LDAPURL" ]; then conf_opt="--ldapurl $LDAPURL --config $NAME" else if [ ! -f $config -o $mkconfig -nt $config ]; then - sh $mkconfig $config || exit 1 + sh $mkconfig $config || exit 2 fi conf_opt="$config" fi [ "$NODE" ] && node_opt="--node $NODE" -${LCONF} $NOMOD $portals_opt $lustre_opt $node_opt $@ $conf_opt || exit 2 +# We'd better start lsvcgssd after gss modules loaded. +# remove this if we don't depend on lsvcgssd in the future +${LCONF} --nosetup --sec $SECURITY $portals_opt $node_opt $@ $conf_opt || exit 3 +start_lsvcgssd || exit 4 +start_lgssd || exit 5 + +${LCONF} $NOMOD --sec $SECURITY $portals_opt $lustre_opt $node_opt \ + $@ $conf_opt || exit 6 if [ "$MOUNT2" ]; then - $LLMOUNT -v `hostname`:/mds1/client $MOUNT2 || exit 3 + $LLMOUNT -v -o sec=$SECURITY `hostname`:/mds1/client $MOUNT2 || exit 7 fi diff --git a/lustre/tests/lmv.sh b/lustre/tests/lmv.sh index f2ebf52..b2270f4 100755 --- a/lustre/tests/lmv.sh +++ b/lustre/tests/lmv.sh @@ -34,7 +34,7 @@ rm -f $config # create nodes ${LMC} -m $config --add node --node localhost || exit 10 -${LMC} -m $config --add net --node localhost --nid localhost --nettype tcp || exit 11 +${LMC} -m $config --add net --node localhost --nid `hostname` --nettype tcp || exit 11 # configure mds server ${LMC} -m $config --add lmv --lmv lmv1 || exit 12 diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 3007a0b..6e9d31d 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -50,13 +50,15 @@ gen_config() { setup() { gen_config + start_krb5_kdc || exit 1 start ost --reformat $OSTLCONFARGS start ost2 --reformat $OSTLCONFARGS + start_lsvcgssd || exit 2 + start_lgssd || exit 3 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE for mds in `mds_list`; do start $mds --reformat $MDSLCONFARGS done - grep " $MOUNT " /proc/mounts || 
zconf_mount `hostname` $MOUNT } @@ -65,6 +67,8 @@ cleanup() { for mds in `mds_list`; do stop $mds ${FORCE} $MDSLCONFARGS done + stop_lgssd + stop_lsvcgssd stop ost2 ${FORCE} --dump cleanup.log stop ost ${FORCE} --dump cleanup.log } diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh index e6e06e1..8848b78 100755 --- a/lustre/tests/replay-dual.sh +++ b/lustre/tests/replay-dual.sh @@ -54,6 +54,8 @@ cleanup() { for mds in `mds_list`; do stop $mds ${FORCE} $MDSLCONFARGS done + stop_lgssd + stop_lsvcgssd stop ost2 ${FORCE} stop ost ${FORCE} --dump cleanup-dual.log } @@ -66,6 +68,8 @@ fi setup() { gen_config + + start_krb5_kdc || exit 1 start ost --reformat $OSTLCONFARGS PINGER=`cat /proc/fs/lustre/pinger` @@ -76,6 +80,8 @@ setup() { fi start ost2 --reformat $OSTLCONFARGS + start_lsvcgssd || exit 2 + start_lgssd || exit 3 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE for mds in `mds_list`; do start $mds --reformat $MDSLCONFARGS diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index 1f3e2d6..ef0e09c 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -20,6 +20,11 @@ assert_env MDSCOUNT # Skip these tests ALWAYS_EXCEPT="" +if [ `using_krb5_sec $SECURITY` == 'n' ] ; then + ALWAYS_EXCEPT="0c $ALWAYS_EXCEPT" +fi + + gen_config() { rm -f $XMLCONFIG @@ -60,6 +65,8 @@ cleanup() { for mds in `mds_list`; do stop $mds ${FORCE} $MDSLCONFARGS done + stop_lgssd + stop_lsvcgssd stop ost2 ${FORCE} --dump cleanup.log stop ost ${FORCE} --dump cleanup.log } @@ -76,8 +83,11 @@ CLEANUP=${CLEANUP:-"cleanup"} setup() { gen_config + start_krb5_kdc || exit 1 start ost --reformat $OSTLCONFARGS start ost2 --reformat $OSTLCONFARGS + start_lsvcgssd || exit 2 + start_lgssd || exit 3 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE for mds in `mds_list`; do start $mds --reformat $MDSLCONFARGS @@ -108,6 +118,17 @@ test_0b() { } run_test 0b "ensure object created after recover exists. 
(3284)" +test_0c() { + # drop gss error notification + replay_barrier mds1 + fail_drop mds1 0x760 + + # drop gss init request + replay_barrier mds1 + fail_drop mds1 0x780 +} +run_test 0c "empty replay with gss init failures" + test_1() { replay_barrier mds1 mcreate $DIR/$tfile diff --git a/lustre/tests/runacltest b/lustre/tests/runacltest new file mode 100755 index 0000000..1b7287a --- /dev/null +++ b/lustre/tests/runacltest @@ -0,0 +1,160 @@ +#!/usr/bin/perl + +use strict; +use FileHandle; +use POSIX qw(geteuid getegid isatty); + +my $owner = getpwuid(geteuid()); +my $group = getgrgid(getegid()); + +my ($OK, $FAILED) = ("ok", "failed"); +if (isatty(fileno(STDOUT))) { + $OK = "\033[32m" . $OK . "\033[m"; + $FAILED = "\033[31m\033[1m" . $FAILED . "\033[m"; +} + +my ($prog, $in, $out) = ([], [], []); +my $line = 0; +my $prog_line; +my ($tests, $failed); + +for (;;) { + my $script = <>; $line++; + $script =~ s/\@OWNER\@/$owner/g; + $script =~ s/\@GROUP\@/$group/g; + next if (defined($script) && $script =~ /^!/); + if (!defined($script) || $script =~ s/^\$ ?//) { + if (@$prog) { + #print "[$prog_line] \$ ", join(' ', @$prog), " -- "; + my $p = [ @$prog ]; + print "[$prog_line] \$ ", join(' ', + map { s/\s/\\$&/g; $_ } @$p), " -- "; + my $result = exec_test($prog, $in); + my $good = 1; + my $nmax = (@$out > @$result) ? @$out : @$result; + for (my $n=0; $n < $nmax; $n++) { + if (!defined($out->[$n]) || !defined($result->[$n]) || + $out->[$n] ne $result->[$n]) { + $good = 0; + #chomp $out->[$n]; + #chomp $result->[$n]; + #print "$out->[$n] != $result->[$n]"; + } + } + $tests++; + $failed++ unless $good; + print $good ? $OK : $FAILED, "\n"; + if (!$good) { + for (my $n=0; $n < $nmax; $n++) { + my $l = defined($out->[$n]) ? $out->[$n] : "~"; + chomp $l; + my $r = defined($result->[$n]) ? $result->[$n] : "~"; + chomp $r; + print sprintf("%-37s | %-39s\n", $l, $r); + } + } + } + #$prog = [ split /\s+/, $script ] if $script; + $prog = [ map { s/\\(.)/$1/g; $_ } split /(? 
?//) { + push @$in, $script; + } else { + push @$out, $script; + } + last unless defined($script); +} +my $status = sprintf("%d commands (%d passed, %d failed)", + $tests, $tests-$failed, $failed); +if (isatty(fileno(STDOUT))) { + if ($failed) { + $status = "\033[31m\033[1m" . $status . "\033[m"; + } else { + $status = "\033[32m" . $status . "\033[m"; + } +} +print $status, "\n"; +exit $failed ? 1 : 0; + +sub exec_test($$) { + my ($prog, $in) = @_; + local (*IN, *IN_DUP, *IN2, *OUT_DUP, *OUT, *OUT2); + + if ($prog->[0] eq "umask") { + umask oct $prog->[1]; + return []; + } elsif ($prog->[0] eq "cd") { + if (!chdir $prog->[1]) { + return [ "chdir: $prog->[1]: $!\n" ]; + } + return []; + } + + pipe *IN2, *OUT + or die "Can't create pipe for reading: $!"; + open *IN_DUP, "<&STDIN" + or *IN_DUP = undef; + open *STDIN, "<&IN2" + or die "Can't duplicate pipe for reading: $!"; + close *IN2; + + open *OUT_DUP, ">&STDOUT" + or die "Can't duplicate STDOUT: $!"; + pipe *IN, *OUT2 + or die "Can't create pipe for writing: $!"; + open *STDOUT, ">&OUT2" + or die "Can't duplicate pipe for writing: $!"; + close *OUT2; + + *STDOUT->autoflush(); + *OUT->autoflush(); + + if (fork()) { + # Server + if (*IN_DUP) { + open *STDIN, "<&IN_DUP" + or die "Can't duplicate STDIN: $!"; + close *IN_DUP + or die "Can't close STDIN duplicate: $!"; + } + open *STDOUT, ">&OUT_DUP" + or die "Can't duplicate STDOUT: $!"; + close *OUT_DUP + or die "Can't close STDOUT duplicate: $!"; + + foreach my $line (@$in) { + #print "> $line"; + print OUT $line; + } + close *OUT + or die "Can't close pipe for writing: $!"; + + my $result = []; + while () { + #print "< $_"; + push @$result, $_; + } + return $result; + } else { + # Client + close IN + or die "Can't close read end for input pipe: $!"; + close OUT + or die "Can't close write end for output pipe: $!"; + close OUT_DUP + or die "Can't close STDOUT duplicate: $!"; + local *ERR_DUP; + open ERR_DUP, ">&STDERR" + or die "Can't duplicate STDERR: $!"; + open 
STDERR, ">&STDOUT" + or die "Can't join STDOUT and STDERR: $!"; + + #print ERR_DUP "<", join(' ', @$prog), ">\n"; + exec @$prog; + print ERR_DUP $prog->[0], ": $!\n"; + exit; + } +} + diff --git a/lustre/tests/sanity-gns.sh b/lustre/tests/sanity-gns.sh new file mode 100644 index 0000000..74e5657 --- /dev/null +++ b/lustre/tests/sanity-gns.sh @@ -0,0 +1,387 @@ +#!/bin/bash +# +# Run select tests by setting ONLY, or as arguments to the script. +# Skip specific tests by setting EXCEPT. +# +# e.g. ONLY="22 23" or ONLY="`seq 32 39`" or EXCEPT="31" +set -e + +ONLY=${ONLY:-"$*"} +ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-""} +[ "$ALWAYS_EXCEPT$EXCEPT" ] && echo "Skipping tests: $ALWAYS_EXCEPT $EXCEPT" + +SRCDIR=`dirname $0` +export PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH + +TMP=${TMP:-/tmp} +FSTYPE=${FSTYPE:-ext3} + +CHECKSTAT=${CHECKSTAT:-"checkstat -v"} +CREATETEST=${CREATETEST:-createtest} +LFS=${LFS:-lfs} +LSTRIPE=${LSTRIPE:-"$LFS setstripe"} +LFIND=${LFIND:-"$LFS find"} +LVERIFY=${LVERIFY:-ll_dirstripe_verify} +LCTL=${LCTL:-lctl} +MCREATE=${MCREATE:-mcreate} +OPENFILE=${OPENFILE:-openfile} +OPENUNLINK=${OPENUNLINK:-openunlink} +TOEXCL=${TOEXCL:-toexcl} +TRUNCATE=${TRUNCATE:-truncate} +MUNLINK=${MUNLINK:-munlink} +SOCKETSERVER=${SOCKETSERVER:-socketserver} +SOCKETCLIENT=${SOCKETCLIENT:-socketclient} +IOPENTEST1=${IOPENTEST1:-iopentest1} +IOPENTEST2=${IOPENTEST2:-iopentest2} +PTLDEBUG=${PTLDEBUG:-0} + +if [ $UID -ne 0 ]; then + RUNAS_ID="$UID" + RUNAS="" +else + RUNAS_ID=${RUNAS_ID:-500} + RUNAS=${RUNAS:-"runas -u $RUNAS_ID"} +fi + +export NAME=${NAME:-local} + +SAVE_PWD=$PWD + +clean() { + echo -n "cln.." + sh llmountcleanup.sh > /dev/null || exit 20 + I_MOUNTED=no +} +CLEAN=${CLEAN:-clean} + +start() { + echo -n "mnt.." + sh llrmount.sh > /dev/null || exit 10 + I_MOUNTED=yes + echo "done" +} +START=${START:-start} + +log() { + echo "$*" + lctl mark "$*" 2> /dev/null || true +} + +trace() { + log "STARTING: $*" + strace -o $TMP/$1.strace -ttt $* + RC=$? 
+ log "FINISHED: $*: rc $RC" + return 1 +} +TRACE=${TRACE:-""} + +check_kernel_version() { + VERSION_FILE=/proc/fs/lustre/kernel_version + WANT_VER=$1 + [ ! -f $VERSION_FILE ] && echo "can't find kernel version" && return 1 + GOT_VER=`cat $VERSION_FILE` + [ $GOT_VER -ge $WANT_VER ] && return 0 + log "test needs at least kernel version $WANT_VER, running $GOT_VER" + return 1 +} + +run_one() { + if ! mount | grep -q $DIR; then + $START + fi + echo $PTLDEBUG >/proc/sys/portals/debug + log "== test $1: $2" + export TESTNAME=test_$1 + test_$1 || error "test_$1: exit with rc=$?" + unset TESTNAME + pass + cd $SAVE_PWD + $CLEAN +} + +build_test_filter() { + for O in $ONLY; do + eval ONLY_${O}=true + done + for E in $EXCEPT $ALWAYS_EXCEPT; do + eval EXCEPT_${E}=true + done +} + +_basetest() { + echo $* +} + +basetest() { + IFS=abcdefghijklmnopqrstuvwxyz _basetest $1 +} + +run_test() { + base=`basetest $1` + if [ "$ONLY" ]; then + testname=ONLY_$1 + if [ ${!testname}x != x ]; then + run_one $1 "$2" + return $? + fi + testname=ONLY_$base + if [ ${!testname}x != x ]; then + run_one $1 "$2" + return $? + fi + echo -n "." + return 0 + fi + testname=EXCEPT_$1 + if [ ${!testname}x != x ]; then + echo "skipping excluded test $1" + return 0 + fi + testname=EXCEPT_$base + if [ ${!testname}x != x ]; then + echo "skipping excluded test $1 (base $base)" + return 0 + fi + run_one $1 "$2" + return $? 
+} + +[ "$SANITYLOG" ] && rm -f $SANITYLOG || true + +error() { + log "FAIL: $@" + if [ "$SANITYLOG" ]; then + echo "FAIL: $TESTNAME $@" >> $SANITYLOG + else + exit 1 + fi +} + +pass() { + echo PASS +} + +MOUNT="`mount | awk '/^'$NAME' .* lustre_lite / { print $3 }'`" +if [ -z "$MOUNT" ]; then + sh llmount.sh + MOUNT="`mount | awk '/^'$NAME' .* lustre_lite / { print $3 }'`" + [ -z "$MOUNT" ] && error "NAME=$NAME not mounted" + I_MOUNTED=yes +fi + +[ `echo $MOUNT | wc -w` -gt 1 ] && error "NAME=$NAME mounted more than once" + +DIR=${DIR:-$MOUNT} +[ -z "`echo $DIR | grep $MOUNT`" ] && echo "$DIR not in $MOUNT" && exit 99 + +rm -rf $DIR/[Rdfs][1-9]* +build_test_filter + +echo preparing for tests involving mounts +EXT2_DEV=${EXT2_DEV:-/tmp/SANITY.LOOP} +touch $EXT2_DEV +mke2fs -j -F $EXT2_DEV 8000 >/dev/null 2>&1 + +find_free_loop() { + local LOOP_DEV="" + test -b /dev/loop0 && + base="/dev/loop" || base="/dev/loop/" + + for ((i=0;i<256;i++)); do + test -b $base$i || continue + + losetup $base$i >/dev/null 2>&1 || { + LOOP_DEV="$base$i" + break + } + done + echo $LOOP_DEV +} + +cleanup_loop() { + local LOOP_DEV=$1 + local LOOP_FILE=$2 + local LOOP_MNTPT=$3 + + chmod u-s $LOOP_MNTPT >/dev/null 2>&1 + umount $LOOP_MNTPT >/dev/null 2>&1 + losetup -d $LOOP_DEV >/dev/null 2>&1 + rm -fr $LOOP_FILE >/dev/null 2>&1 + rm -fr $LOOP_MNTPT >/dev/null 2>&1 +} + +setup_loop() { + local LOOP_DEV=$1 + local LOOP_FILE=$2 + + dd if=/dev/zero of=$LOOP_FILE bs=1M count=10 2>/dev/null || return $? + + losetup $LOOP_DEV $LOOP_FILE || { + rc=$? + cleanup_mount $LOOP_DEV $LOOP_FILE $DIR/gns_test_1a + return $rc + } + + mke2fs -F $LOOP_DEV >/dev/null 2>&1 || { + rc=$? + cleanup_mount $LOOP_DEV $LOOP_FILE $DIR/gns_test_1a + echo "cannot create test ext2 fs on $LOOP_DEV" + return $? 
+ } + return 0 +} + +prep_upcall() { + local INJECTION="" + local UPCALL=$1 + local MODE=$2 + local LOG=$3 + + test "x$MODE" = "xDEADLOCK" && + INJECTION="touch \$MNTPATH/file" + + cat > $UPCALL <<- EOF +#!/bin/sh + +MOUNT=\`which mount 2>/dev/null\` +test "x\$MOUNT" = "x" && MOUNT="/bin/mount" + +OPTIONS=\$1 +MNTPATH=\$2 + +test "x\$OPTIONS" = "x" || "x\$MNTPATH" = "x" && +exit 1 + +$INJECTION +\$MOUNT \$OPTIONS \$MNTPATH > $LOG 2>&1 +exit \$? +EOF + chmod +x $UPCALL + return $? +} + +check_gns() { + local LOG="/tmp/gns-log" + local UPCALL_PATH="" + + local UPCALL=$1 + local OBJECT=$2 + local TIMOUT=$3 + local TICK=$4 + + rm -fr $LOG >/dev/null 2>&1 + UPCALL_PATH="/tmp/gns-upcall-$UPCALL.sh" + + echo "generating upcall $UPCALL_PATH" + prep_upcall $UPCALL_PATH $UPCALL $LOG || return $rc + echo "======================== upcall script ===========================" + cat $UPCALL_PATH 2>/dev/null || return $? + echo "==================================================================" + + echo "$UPCALL_PATH" > /proc/fs/lustre/llite/fs0/gns_upcall || return $? + echo "upcall: $(cat /proc/fs/lustre/llite/fs0/gns_upcall)" + + echo -n "mount on open $OBJECT/test_file1: " + echo -n "test data" > $OBJECT/test_file1 >/dev/null 2>&1 || return $? 
+ + local ENTRY="`basename $OBJECT`" + + cat /proc/mounts | grep -q "$ENTRY" || { + echo "fail" + test -f $LOG && { + echo "======================== upcall log ===========================" + cat $LOG + echo "===============================================================" + } || { + echo "upcall log file $LOG is not found" + } + return 1 + } + echo "success" + + local sleep_time=$TIMOUT + let sleep_time+=$TICK*2 + echo -n "waiting for umount ${sleep_time}s (timeout + tick*2): " + sleep $sleep_time + + cat /proc/mounts | grep -q "$ENTRY" && { + echo "failed" + return 2 + } + echo "success" + return 0 +} + +test_1a() { + local LOOP_DEV=$(find_free_loop 2>/dev/null) + local UPCALL="/tmp/gns-upcall.sh" + local LOOP_FILE="/tmp/gns_loop" + local OBJECT=".mntinfo" + local TIMOUT=5 + local TICK=1 + + test "x$LOOP_DEV" != "x" && test -b $LOOP_DEV || + error "can't find free loop device" + + echo "preparing loop device $LOOP_DEV <-> $LOOP_FILE..." + cleanup_loop $LOOP_DEV $LOOP_FILE $DIR/gns_test_1a + setup_loop $LOOP_DEV $LOOP_FILE || error + + echo "setting up GNS timeouts and mount object..." + echo "$OBJECT" > /proc/fs/lustre/llite/fs0/gns_object_name || error + echo "$TIMOUT" > /proc/fs/lustre/llite/fs0/gns_timeout || error + echo "$TICK" > /proc/fs/lustre/llite/fs0/gns_tick || error + + echo "" + echo "timeout: $(cat /proc/fs/lustre/llite/fs0/gns_timeout)s" + echo "object: $(cat /proc/fs/lustre/llite/fs0/gns_object_name)" + echo "tick: $(cat /proc/fs/lustre/llite/fs0/gns_tick)s" + echo "" + + echo "preparing mount object at $DIR/gns_test_1a/$OBJECT..." 
+ mkdir -p $DIR/gns_test_1a || error + echo -n "-t ext2 $LOOP_DEV" > $DIR/gns_test_1a/$OBJECT + echo "======================== mount object ===========================" + cat $DIR/gns_test_1a/$OBJECT + echo "" + echo "=================================================================" + chmod u+s $DIR/gns_test_1a || error + + echo "" + echo "testing GNS with GENERIC upcall 2 times on the row" + for ((i=0;i<2;i++)); do + check_gns GENERIC $DIR/gns_test_1a $TIMOUT $TICK || { + cleanup_loop $LOOP_DEV $LOOP_FILE $DIR/gns_test_1a + error + } + done + + echo "" + echo "testing GNS with DEADLOCK upcall 2 times on the row" + for ((i=0;i<2;i++)); do + check_gns DEADLOCK $DIR/gns_test_1a $TIMOUT $TICK || { + cleanup_loop $LOOP_DEV $LOOP_FILE $DIR/gns_test_1a + error + } + done + + cleanup_loop $LOOP_DEV $LOOP_FILE $DIR/gns_test_1a +} + +run_test 1a " general GNS test - mounting/umount ====================" + +TMPDIR=$OLDTMPDIR +TMP=$OLDTMP +HOME=$OLDHOME + +log "cleanup: ===========================================================" +if [ "`mount | grep ^$NAME`" ]; then + rm -rf $DIR/[Rdfs][1-9]* + if [ "$I_MOUNTED" = "yes" ]; then + sh llmountcleanup.sh || error + fi +fi + +echo '=========================== finished ===============================' +[ -f "$SANITYLOG" ] && cat $SANITYLOG && exit 1 || true diff --git a/lustre/tests/sanity-lmv.sh b/lustre/tests/sanity-lmv.sh index a7d79cc..8e0a86e 100644 --- a/lustre/tests/sanity-lmv.sh +++ b/lustre/tests/sanity-lmv.sh @@ -18,6 +18,7 @@ ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-""} SRCDIR=`dirname $0` export PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH +export SECURITY=${SECURITY:-"null"} TMP=${TMP:-/tmp} FSTYPE=${FSTYPE:-ext3} @@ -41,6 +42,8 @@ IOPENTEST1=${IOPENTEST1:-iopentest1} IOPENTEST2=${IOPENTEST2:-iopentest2} PTLDEBUG=${PTLDEBUG:-0} +. 
krb5_env.sh + if [ $UID -ne 0 ]; then RUNAS_ID="$UID" RUNAS="" @@ -49,6 +52,13 @@ else RUNAS=${RUNAS:-"runas -u $RUNAS_ID"} fi +if [ `using_krb5_sec $SECURITY` == 'y' ] ; then + start_krb5_kdc || exit 1 + if [ $RUNAS_ID -ne $UID ]; then + $RUNAS ./krb5_refresh_cache.sh || exit 2 + fi +fi + export NAME=${NAME:-lmv} SAVE_PWD=$PWD diff --git a/lustre/tests/sanity-sec.sh b/lustre/tests/sanity-sec.sh index 09431e4..d8a5598 100644 --- a/lustre/tests/sanity-sec.sh +++ b/lustre/tests/sanity-sec.sh @@ -14,6 +14,7 @@ ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-""} SRCDIR=`dirname $0` export PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH +export SECURITY=${SECURITY:-"null"} TMP=${TMP:-/tmp} FSTYPE=${FSTYPE:-ext3} @@ -36,6 +37,8 @@ SOCKETCLIENT=${SOCKETCLIENT:-socketclient} IOPENTEST1=${IOPENTEST1:-iopentest1} IOPENTEST2=${IOPENTEST2:-iopentest2} +. krb5_env.sh + if [ $UID -ne 0 ]; then RUNAS_ID="$UID" RUNAS="" @@ -44,6 +47,13 @@ else RUNAS=${RUNAS:-"runas -u $RUNAS_ID"} fi +if [ `using_krb5_sec $SECURITY` == 'y' ] ; then + start_krb5_kdc || exit 1 + if [ $RUNAS_ID -ne $UID ]; then + $RUNAS ./krb5_refresh_cache.sh || exit 2 + fi +fi + export NAME=${NAME:-local} SAVE_PWD=$PWD @@ -256,6 +266,67 @@ EOF run_test 1 "test root_squash ============================" +test_2() { + touch $DIR/f2 + + #test set/get xattr + setfattr -n trusted.name1 -v value1 $DIR/f2 || error + [ "`getfattr -n trusted.name1 $DIR/f2 2> /dev/null | \ + grep "trusted.name1"`" == "trusted.name1=\"value1\"" ] || error + + setfattr -n user.author1 -v author1 $DIR/f2 || error + [ "`getfattr -n user.author1 $DIR/f2 2> /dev/null | \ + grep "user.author1"`" == "user.author1=\"author1\"" ] || error + + # test listxattr + setfattr -n trusted.name2 -v value2 $DIR/f2 || error + setfattr -n trusted.name3 -v value3 $DIR/f2 || error + [ `getfattr -d -m "^trusted" $DIR/f2 2> /dev/null | \ + grep "trusted" | wc -l` -eq 5 ] || error + + + setfattr -n user.author2 -v author2 $DIR/f2 || error + setfattr -n user.author3 -v author3 $DIR/f2 
|| error + [ `getfattr -d -m "^user" $DIR/f2 2> /dev/null | \ + grep "user" | wc -l` -eq 3 ] || error + #test removexattr + setfattr -x trusted.name1 $DIR/f2 2> /dev/null || error + getfattr -d -m trusted $DIR/f2 2> /dev/null | \ + grep "trusted.name1" && error || true + + setfattr -x user.author1 $DIR/f2 2> /dev/null || error + getfattr -d -m user $DIR/f2 2> /dev/null | \ + grep "user.author1" && error || true +} +run_test 2 "set/get xattr test (trusted xattr only) ============" + +test_3 () { + SAVE_UMASK=`umask` + umask 022 + USER1=rpm + USER2=vsx2 + GROUP1=nobody + GROUP2=users + + chmod +x runacltest + chmod +x acl_mode + cd $DIR + + #sed -e "s/joe/$USER1/g;s/lisa/$USER2/g;s/users/$GROUP1/g;s/toolies/$GROUP2/g" $SAVE_PWD/setfacl.test | runacltest || +#error "$? setfacl tests failed" + + #sed -e "s/joe/$USER1/g;s/lisa/$USER2/g;s/users/$GROUP1/g;s/toolies/$GROUP2/g" $SAVE_PWD/acl_asroot.test | runacltest || error "$? acl_asroot tests failed" + + #sed -e "s/joe/$USER1/g;s/lisa/$USER2/g;s/users/$GROUP1/g;s/toolies/$GROUP2/g" $SAVE_PWD/acl_perm.test | runacltest || error "$? acl_perm tests failed" + + #sed -e "s/joe/$USER1/g;s/lisa/$USER2/g;s/users/$GROUP1/g;s/toolies/$GROUP2/g" $SAVE_PWD/acl_misc.test | runacltest || error "$? acl_misc tests failed" + + sed -e "s/joe/$USER1/g;s/lisa/$USER2/g;s/users/$GROUP1/g;s/toolies/$GROUP2/g" $SAVE_PWD/acl_fileutil.test | runacltest || error "$? 
acl_fileutil tests failed" + + umask $SAVE_UMASK +} +run_test 3 "==============acl test =============" + TMPDIR=$OLDTMPDIR TMP=$OLDTMP HOME=$OLDHOME diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index acefc28..ded1e08 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -10,13 +10,15 @@ ONLY=${ONLY:-"$*"} # bug number for skipped test: 2739 # 51b and 51c depend on kernel # 65* fixes in b_hd_cray_merge3 -ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"51b 51c 65a 65b 65c 65d 65e 65f"} +# the new kernel api make 48 not valid anymore +ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"48 51b 51c 65a 65b 65c 65d 65e 65f"} # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! [ "$ALWAYS_EXCEPT$EXCEPT" ] && echo "Skipping tests: $ALWAYS_EXCEPT $EXCEPT" SRCDIR=`dirname $0` export PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH +export SECURITY=${SECURITY:-"null"} TMP=${TMP:-/tmp} FSTYPE=${FSTYPE:-ext3} @@ -40,6 +42,8 @@ IOPENTEST1=${IOPENTEST1:-iopentest1} IOPENTEST2=${IOPENTEST2:-iopentest2} MEMHOG=${MEMHOG:-memhog} +. 
krb5_env.sh + if [ $UID -ne 0 ]; then RUNAS_ID="$UID" RUNAS="" @@ -48,6 +52,13 @@ else RUNAS=${RUNAS:-"runas -u $RUNAS_ID"} fi +if [ `using_krb5_sec $SECURITY` == 'y' ] ; then + start_krb5_kdc || exit 1 + if [ $RUNAS_ID -ne $UID ]; then + $RUNAS ./krb5_refresh_cache.sh || exit 2 + fi +fi + export NAME=${NAME:-local} SAVE_PWD=$PWD diff --git a/lustre/tests/sanityN.sh b/lustre/tests/sanityN.sh index 8ef4207..faecfc4 100644 --- a/lustre/tests/sanityN.sh +++ b/lustre/tests/sanityN.sh @@ -366,6 +366,39 @@ test_18() { } run_test 18 "mmap sanity check =================================" +test_19() { # bug 2441 + touch $DIR1/f2b + + #test set/get xattr + setfattr -n trusted.name1 -v value1 $DIR1/f2b || error + [ "`getfattr -n trusted.name1 $DIR2/f2b 2> /dev/null | \ + grep "trusted.name1"`" == "trusted.name1=\"value1\"" ] || error + + setfattr -n user.author1 -v author1 $DIR/f2b || error + [ "`getfattr -n user.author1 $DIR/f2b 2> /dev/null | \ + grep "user.author1"`" == "user.author1=\"author1\"" ] || error + + # test listxattr + setfattr -n trusted.name2 -v value2 $DIR2/f2b || error + setfattr -n trusted.name3 -v value3 $DIR1/f2b || error + [ `getfattr -d -m "^trusted" $DIR2/f2b 2> /dev/null | \ + grep "trusted" | wc -l` -eq 5 ] || error + + setfattr -n user.author2 -v author2 $DIR/f2b || error + setfattr -n user.author3 -v author3 $DIR/f2b || error + [ `getfattr -d -m "^user" $DIR/f2b 2> /dev/null | \ + grep "user" | wc -l` -eq 3 ] || error + #test removexattr + setfattr -x trusted.name1 $DIR2/f2b 2> /dev/null || error + getfattr -d -m trusted $DIR2/f2b 2> /dev/null | \ + grep "trusted.name1" && error || true + + setfattr -x user.author1 $DIR/f2b 2> /dev/null || error + getfattr -d -m user $DIR/f2b 2> /dev/null | \ + grep "user.author1" && error || true +} +run_test 19 "test set/get xattr on multiple mounts ============" + log "cleanup: ======================================================" rm -rf $DIR1/[df][0-9]* $DIR1/lnk || true diff --git a/lustre/tests/setfacl.test 
b/lustre/tests/setfacl.test new file mode 100644 index 0000000..af19462 --- /dev/null +++ b/lustre/tests/setfacl.test @@ -0,0 +1,123 @@ +! +! setfacl tests. +! +! Run these tests on a filesystem with ACL support. +! +$ umask 027 +$ touch g +$ acl_mode g +-rw-r----- +$ setfacl -m m:- g +$ acl_mode g +-rw-------+ +$ getfacl g +# file: g +# owner: @OWNER@ +# group: @GROUP@ +user::rw- +group::r-- #effective:--- +mask::--- +other::--- + +$ setfacl -x m g +$ getfacl g +# file: g +# owner: @OWNER@ +# group: @GROUP@ +user::rw- +group::r-- +other::--- + +$ setfacl -m u:joe:rw g +$ getfacl g +# file: g +# owner: @OWNER@ +# group: @GROUP@ +user::rw- +user:joe:rw- +group::r-- +mask::rw- +other::--- + +$ setfacl -m u::rwx,g::r-x,o:- g +$ getfacl g +# file: g +# owner: @OWNER@ +# group: @GROUP@ +user::rwx +user:joe:rw- +group::r-x +mask::rwx +other::--- + +$ setfacl -m u::rwx,g::r-x,o:-,m:- g +$ getfacl g +# file: g +# owner: @OWNER@ +# group: @GROUP@ +user::rwx +user:joe:rw- #effective:--- +group::r-x #effective:--- +mask::--- +other::--- + +$ setfacl -m u::rwx,g::r-x,o:-,u:root:-,m:- g +$ getfacl g +# file: g +# owner: @OWNER@ +# group: @GROUP@ +user::rwx +user:root:--- +user:joe:rw- #effective:--- +group::r-x #effective:--- +mask::--- +other::--- + +$ setfacl -m u::rwx,g::r-x,o:-,u:root:-,m:- g +$ getfacl g +# file: g +# owner: @OWNER@ +# group: @GROUP@ +user::rwx +user:root:--- +user:joe:rw- #effective:--- +group::r-x #effective:--- +mask::--- +other::--- + +$ setfacl -m u::rwx,g::r-x,o:-,u:root:- g +$ getfacl g +# file: g +# owner: @OWNER@ +# group: @GROUP@ +user::rwx +user:root:--- +user:joe:rw- +group::r-x +mask::rwx +other::--- + +$ setfacl --test -x u: g +setfacl: g: Malformed access ACL `user:root:---,user:joe:rw-,group::r-x,mask::rwx,other::---': Missing or wrong entry at entry 1 +$ setfacl --test -x u:x +setfacl: Option -x: Invalid argument near character 3 +$ setfacl -m d:u:root:rwx g +setfacl: g: Only directories can have default ACLs +$ setfacl -x m g +setfacl: g: 
Malformed access ACL `user::rwx,user:root:---,user:joe:rw-,group::r-x,other::---': Missing or wrong entry at entry 5 +!setfacl --test -m d:u:joe:rwx setfacl +!setfacl --test -n -m d:u:joe:rwx setfacl +$ rm g +! +! Check if the mask is properly recalculated +! +$ mkdir d +$ setfacl --test -m u::rwx,u:@OWNER@:rwx,g::r-x,o::--- d +d: u::rwx,u:@OWNER@:rwx,g::r-x,m::rwx,o::---,* +$ setfacl --test -m u::rwx,u:@OWNER@:rwx,g::r-x,m::---,o::--- d +d: u::rwx,u:@OWNER@:rwx,g::r-x,m::---,o::---,* +$ setfacl --test -d -m u::rwx,u:@OWNER@:rwx,g::r-x,o::--- d +d: *,d:u::rwx,d:u:@OWNER@:rwx,d:g::r-x,d:m::rwx,d:o::--- +$ setfacl --test -d -m u::rwx,u:@OWNER@:rwx,g::r-x,m::---,o::--- d +d: *,d:u::rwx,d:u:@OWNER@:rwx,d:g::r-x,d:m::---,d:o::--- +$ rmdir d diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index c0f8ccd..395184d 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -39,6 +39,7 @@ init_test_env() { export LCTL=${LCTL:-"$LUSTRE/utils/lctl"} export CHECKSTAT="${CHECKSTAT:-checkstat} " export FSYTPE=${FSTYPE:-"ext3"} + export SECURITY=${SECURITY:-"null"} # Paths on remote nodes, if different export RLUSTRE=${RLUSTRE:-$LUSTRE} @@ -63,6 +64,8 @@ init_test_env() { # echo "CONFIG=`canonical_path $CONFIG`" > $LUSTRE/tests/CONFIG } +. 
krb5_env.sh + # Facet functions start() { facet=$1 @@ -70,7 +73,7 @@ start() { active=`facet_active $facet` do_facet $facet $LCONF --select ${facet}_svc=${active}_facet \ --node ${active}_facet --ptldebug $PTLDEBUG --subsystem $SUBSYSTEM \ - $@ $XMLCONFIG + --sec $SECURITY $@ $XMLCONFIG } stop() { @@ -89,11 +92,13 @@ zconf_mount() { do_node $client mkdir $mnt 2> /dev/null || : if [ -x /sbin/mount.lustre ] ; then - do_node $client mount -t lustre -o nettype=$NETTYPE `facet_active_host mds1`:/mds1_svc/client_facet $mnt || return 1 + do_node $client mount -t lustre -o sec=$SECURITY,nettype=$NETTYPE \ + `facet_active_host mds1`:/mds1_svc/client_facet $mnt || return 2 else # this is so cheating do_node $client $LCONF --nosetup --node client_facet $XMLCONFIG > /dev/null || return 2 - do_node $client $LLMOUNT `facet_active_host mds1`:/mds1_svc/client_facet $mnt -o nettype=$NETTYPE|| return 4 + do_node $client $LLMOUNT `facet_active_host mds1`:/mds1_svc/client_facet $mnt \ + -o sec=$SECURITY,nettype=$NETTYPE|| return 4 fi [ -d /r ] && $LCTL modules > /r/tmp/ogdb-`hostname` @@ -180,6 +185,16 @@ fail() { df $MOUNT || error "post-failover df: $?" } +fail_drop() { + local facet=$1 + local failcode=$2 + facet_failover $facet + do_facet mds "echo $failcode > /proc/sys/lustre/fail_loc" + cat /proc/sys/lustre/fail_loc + df $MOUNT || error "post-failover df: $?" 
+ do_facet mds "echo 0 > /proc/sys/lustre/fail_loc" +} + fail_abort() { local facet=$1 stop $facet --force --failover --nomod diff --git a/lustre/utils/.cvsignore b/lustre/utils/.cvsignore index 59147ac..5c5ce2e 100644 --- a/lustre/utils/.cvsignore +++ b/lustre/utils/.cvsignore @@ -18,5 +18,6 @@ lfs llmount mount.lustre wiretest +lsd_upcall .*.cmd .*.d diff --git a/lustre/utils/Makefile.am b/lustre/utils/Makefile.am index 23eb876..62707cc 100644 --- a/lustre/utils/Makefile.am +++ b/lustre/utils/Makefile.am @@ -13,7 +13,7 @@ bin_scripts = lfind lstripe if UTILS rootsbin_SCRIPTS = mount.lustre -sbin_PROGRAMS = lctl obdio obdbarrier lload wirecheck wiretest llmount l_getgroups +sbin_PROGRAMS = lctl obdio obdbarrier lload wirecheck wiretest llmount lsd_upcall bin_PROGRAMS = lfs lib_LIBRARIES = liblustreapi.a sbin_SCRIPTS = $(sbin_scripts) @@ -33,7 +33,7 @@ obdbarrier_SOURCES = obdbarrier.c obdiolib.c obdiolib.h lfs_SOURCES = lfs.c llmount_SOURCES = llmount.c llmount_LDADD = $(LIBREADLINE) -lptlctl -l_getgroups_SOURCES = l_getgroups.c +lsd_upcall_SOURCES = lsd_upcall.c EXTRA_DIST = $(bin_scripts) $(sbin_scripts) diff --git a/lustre/utils/lconf b/lustre/utils/lconf index f704c77..d42ae9d 100755 --- a/lustre/utils/lconf +++ b/lustre/utils/lconf @@ -587,6 +587,13 @@ class LCTLInterface: quit""" % (type, name, uuid) self.run(cmds) + def set_security(self, name, key, value): + cmds = """ + cfg_device %s + set_security %s %s + quit""" % (name, key, value) + self.run(cmds) + def setup(self, name, setup = ""): cmds = """ cfg_device %s @@ -1177,6 +1184,8 @@ class kmod: self.dev_dir = dev_dir self.name = name + # FIXME we ignore the failure of loading gss module, because we might + # don't need it at all. 
def load(self): """Load module""" log ('loading module:', self.name, 'srcdir', @@ -1184,15 +1193,21 @@ class kmod: if self.src_dir: module = kmod_find(self.src_dir, self.dev_dir, self.name) - if not module: + if not module and self.name != 'ptlrpcs_gss': panic('module not found:', self.name) (rc, out) = run('/sbin/insmod', module) if rc: - raise CommandError('insmod', out, rc) + if self.name == 'ptlrpcs_gss': + print "Warning: not support gss security!" + else: + raise CommandError('insmod', out, rc) else: (rc, out) = run('/sbin/modprobe', self.name) if rc: - raise CommandError('modprobe', out, rc) + if self.name == 'ptlrpcs_gss': + print "Warning: not support gss security!" + else: + raise CommandError('modprobe', out, rc) def cleanup(self): """Unload module""" @@ -1545,7 +1560,9 @@ class LDLM(Module): def add_module(self, manager): manager.add_lustre_module('lvfs', 'lvfs') manager.add_lustre_module('obdclass', 'obdclass') + manager.add_lustre_module('sec', 'ptlrpcs') manager.add_lustre_module('ptlrpc', 'ptlrpc') + manager.add_lustre_module('sec/gss', 'ptlrpcs_gss') def prepare(self): return @@ -1892,16 +1909,21 @@ class MDSDEV(Module): self.info("mds", realdev, mountfsoptions, self.fstype, self.size, self.format, master_name, profile_name, self.obdtype) - lctl.newdev("mds", self.name, self.uuid, - setup = "%s %s %s %s %s %s" %(realdev, + lctl.attach("mds", self.name, self.uuid) + if config.mds_mds_sec: + lctl.set_security(self.name, "mds_mds_sec", config.mds_mds_sec) + if config.mds_ost_sec: + lctl.set_security(self.name, "mds_ost_sec", config.mds_ost_sec) + + lctl.setup(self.name, setup = "%s %s %s %s %s %s" %(realdev, self.fstype, profile_name, mountfsoptions, master_name, self.obdtype)) if development_mode(): - procentry = "/proc/fs/lustre/mds/grp_hash_upcall" - upcall = os.path.abspath(os.path.dirname(sys.argv[0]) + "/l_getgroups") + procentry = "/proc/fs/lustre/mds/lsd_upcall" + upcall = os.path.abspath(os.path.dirname(sys.argv[0]) + "/lsd_upcall") if not 
(os.access(procentry, os.R_OK) and os.access(upcall, os.R_OK)): - print "MDS Warning: failed to set group-hash upcall" + print "MDS Warning: failed to set lsd cache upcall" else: run("echo ", upcall, " > ", procentry) @@ -2686,8 +2708,10 @@ class Mountpoint(Module): self.clientoptions = string.replace(self.clientoptions, "async", "lasync") - cmd = "mount -t lustre_lite -o osc=%s,mdc=%s%s %s %s" % \ - (self.vosc.get_name(), vmdc_name, self.clientoptions, + if not config.sec: + config.sec = "null" + cmd = "mount -t lustre_lite -o osc=%s,mdc=%s,sec=%s%s %s %s" % \ + (self.vosc.get_name(), vmdc_name, config.sec, self.clientoptions, config.config, self.path) run("mkdir", self.path) ret, val = run(cmd) @@ -3483,6 +3507,9 @@ lconf_options = [ ('config', "Cluster config name used for LDAP query", PARAM), ('select', "service=nodeA,service2=nodeB ", PARAMLIST), ('node', "Load config for ", PARAM), + ('sec', "security flavor of client", PARAM), + ('mds_mds_sec', "security flavor of inter mds's", PARAM), + ('mds_ost_sec', "security flavor of mds's-ost's", PARAM), ('cleanup,d', "Cleans up config. 
(Shutdown)"), ('force,f', "Forced unmounting and/or obd detach during cleanup", FLAG, 0), diff --git a/lustre/utils/lctl.c b/lustre/utils/lctl.c index cd70a94..27d2b5f 100644 --- a/lustre/utils/lctl.c +++ b/lustre/utils/lctl.c @@ -224,7 +224,6 @@ command_t cmdlist[] = { {"deactivate", jt_obd_deactivate, 0, "deactivate an import\n"}, {"recover", jt_obd_recover, 0, "usage: recover []"}, {"lookup", jt_obd_mdc_lookup, 0, "usage: lookup "}, - {"finish_gns", jt_obd_finish_gns, 0, "usage: finish_gns "}, {"notransno", jt_obd_no_transno, 0, "disable sending of committed-transno updates\n"}, {"readonly", jt_obd_set_readonly, 0, @@ -246,6 +245,8 @@ command_t cmdlist[] = { "usage: add_conn [priority]\n"}, {"del_conn ", jt_lcfg_del_conn, 0, "usage: del_conn \n"}, + {"set_security", jt_lcfg_set_security, 0, + "usage: set_security key value\n"}, {"lsync", jt_obd_reint_sync, 0, "usage: lsync\n"}, {"cache_on", jt_obd_cache_on, 0, diff --git a/lustre/utils/llmount.c b/lustre/utils/llmount.c index 8ab5705..27b39cd 100644 --- a/lustre/utils/llmount.c +++ b/lustre/utils/llmount.c @@ -33,6 +33,8 @@ #define _GNU_SOURCE #include #include +#include +#include #include "obdctl.h" #include @@ -117,6 +119,9 @@ init_options(struct lustre_mount_data *lmd) lmd->lmd_local_nid = PTL_NID_ANY; lmd->lmd_port = 988; /* XXX define LUSTRE_DEFAULT_PORT */ lmd->lmd_nal = SOCKNAL; + lmd->lmd_nllu = 99; + lmd->lmd_nllg = 99; + strncpy(lmd->lmd_security, "null", sizeof(lmd->lmd_security)); return 0; } @@ -127,6 +132,7 @@ print_options(struct lustre_mount_data *lmd) printf("mds: %s\n", lmd->lmd_mds); printf("profile: %s\n", lmd->lmd_profile); + printf("sec_flavor: %s\n", lmd->lmd_security); printf("server_nid: "LPX64"\n", lmd->lmd_server_nid); printf("local_nid: "LPX64"\n", lmd->lmd_local_nid); printf("nal: %d\n", lmd->lmd_nal); @@ -199,6 +205,60 @@ static int parse_route(char *opteq, char *opttgts) return(0); } +/* + * here all what we do is gurantee the result is exactly + * what user intend to get, no 
ambiguous. maybe there have + simpler library call could do the same job for us? + */ +static int parse_u32(char *str, uint32_t *res) +{ + unsigned long id; + char *endptr = NULL; + + id = strtol(str, &endptr, 0); + if (endptr && *endptr != 0) + return -1; + + if (id == LONG_MAX || id == LONG_MIN) + return -1; + + if ((uint32_t)id != id) + return -1; + + *res = (uint32_t) id; + return 0; +} + +static int parse_nllu(struct lustre_mount_data *lmd, char *str_nllu) +{ + struct passwd *pass; + + if (parse_u32(str_nllu, &lmd->lmd_nllu) == 0) + return 0; + + pass = getpwnam(str_nllu); + if (pass == NULL) + return -1; + + lmd->lmd_nllu = pass->pw_uid; + return 0; +} + +static int parse_nllg(struct lustre_mount_data *lmd, char *str_nllg) +{ + struct group *grp; + + if (parse_u32(str_nllg, &lmd->lmd_nllg) == 0) + return 0; + + grp = getgrnam(str_nllg); + if (grp == NULL) + return -1; + + lmd->lmd_nllg = grp->gr_gid; + return 0; +} + int parse_options(char * options, struct lustre_mount_data *lmd) { ptl_nid_t nid = 0, cluster_id = 0; @@ -247,6 +307,23 @@ int parse_options(char * options, struct lustre_mount_data *lmd) lmd->lmd_server_nid = nid; } else if (!strcmp(opt, "port")) { lmd->lmd_port = val; + } else if (!strcmp(opt, "sec")) { + strncpy(lmd->lmd_security, opteq + 1, + sizeof(lmd->lmd_security) - 1); + } else if (!strcmp(opt, "nllu")) { + if (parse_nllu(lmd, opteq + 1)) { + fprintf(stderr, "%s: " + "can't parse user: %s\n", + progname, opteq + 1); + return (-1); + } + } else if (!strcmp(opt, "nllg")) { + if (parse_nllg(lmd, opteq + 1)) { + fprintf(stderr, "%s: " + "can't parse group: %s\n", + progname, opteq + 1); + return (-1); + } } } else { val = 1; diff --git a/lustre/utils/lrun b/lustre/utils/lrun index 56d3d04..6106634 100755 --- a/lustre/utils/lrun +++ b/lustre/utils/lrun @@ -2,11 +2,13 @@ LIBLUSTRE_MOUNT_POINT=${LIBLUSTRE_MOUNT_POINT:-"/mnt/lustre"} LIBLUSTRE_MOUNT_TARGET=${LIBLUSTRE_MOUNT_TARGET:-"TARGET_NOT_SET"} 
+LIBLUSTRE_SECURITY=${LIBLUSTRE_SECURITY:-"null"} LIBLUSTRE_DUMPFILE=${LIBLUSTRE_DUMPFILE:-"/tmp/DUMP_FILE"} LD_PRELOAD=${LD_PRELOAD:-"/usr/lib/liblustre.so"} export LIBLUSTRE_MOUNT_POINT export LIBLUSTRE_MOUNT_TARGET +export LIBLUSTRE_SECURITY export LIBLUSTRE_DUMPFILE export LD_PRELOAD diff --git a/lustre/utils/l_getgroups.c b/lustre/utils/lsd_upcall.c similarity index 84% rename from lustre/utils/l_getgroups.c rename to lustre/utils/lsd_upcall.c index 2f9b7d0..8b55d45 100644 --- a/lustre/utils/l_getgroups.c +++ b/lustre/utils/lsd_upcall.c @@ -30,6 +30,11 @@ #include #include +#include +#include +#include +#include + /* * return: * 0: fail to insert (found identical) @@ -55,7 +60,7 @@ int insert_sort(gid_t *groups, int size, gid_t grp) return 1; } -int get_groups_local(uid_t uid, int *ngroups, gid_t **groups) +int get_groups_local(uid_t uid, gid_t *gid, int *ngroups, gid_t **groups) { int maxgroups; int i, size = 0; @@ -73,6 +78,8 @@ int get_groups_local(uid_t uid, int *ngroups, gid_t **groups) if (!pw) return -errno; + *gid = pw->pw_gid; + while ((gr = getgrent())) { if (!gr->gr_mem) continue; @@ -92,14 +99,9 @@ int get_groups_local(uid_t uid, int *ngroups, gid_t **groups) int main (int argc, char **argv) { + char *pathname = "/proc/fs/lustre/mds/lsd_downcall"; int fd, rc; - struct { - uint32_t err; - uint32_t uid; - uint32_t ngroups; - gid_t *groups; - } ioc_data; - char *pathname = "/proc/fs/lustre/mds/group_info"; + struct lsd_downcall_args ioc_data; if (argc != 2) { printf("bad parameter\n"); @@ -115,7 +117,13 @@ int main (int argc, char **argv) return rc; } - ioc_data.err = get_groups_local(ioc_data.uid, &ioc_data.ngroups, &ioc_data.groups); + ioc_data.err = get_groups_local(ioc_data.uid, &ioc_data.gid, + &ioc_data.ngroups, &ioc_data.groups); + + /* FIXME get these from config file */ + ioc_data.allow_setuid = 1; + ioc_data.allow_setgid = 1; + ioc_data.allow_setgrp = 1; rc = write(fd, &ioc_data, sizeof(ioc_data)); return (rc != sizeof(ioc_data)); diff --git 
a/lustre/utils/lustre_cfg.c b/lustre/utils/lustre_cfg.c index 9565aaa..b0af4a6 100644 --- a/lustre/utils/lustre_cfg.c +++ b/lustre/utils/lustre_cfg.c @@ -706,3 +706,46 @@ int jt_lcfg_del_conn(int argc, char **argv) return rc; } +int jt_lcfg_set_security(int argc, char **argv) +{ + struct lustre_cfg lcfg; + int rc; + + if (argc != 3) + return CMD_HELP; + + if (lcfg_devname == NULL) { + fprintf(stderr, "%s: please use 'cfg_device name' to set the " + "device name for config commands.\n", + jt_cmdname(argv[0])); + return -EINVAL; + } + + LCFG_INIT(lcfg, LCFG_SET_SECURITY, lcfg_devname); + + /* currently only used to set on mds */ + if (strcmp(argv[1], "mds_mds_sec") && strcmp(argv[1], "mds_ost_sec")) { + fprintf(stderr, "%s: invalid security key %s\n", + jt_cmdname(argv[0]), argv[1]); + return -EINVAL; + } + if (strcmp(argv[2], "null") && strcmp(argv[2], "krb5")) { + fprintf(stderr, "%s: invalid security value %s\n", + jt_cmdname(argv[0]), argv[2]); + return -EINVAL; + } + + /* connection uuid */ + lcfg.lcfg_inllen1 = strlen(argv[1]) + 1; + lcfg.lcfg_inlbuf1 = argv[1]; + lcfg.lcfg_inllen2 = strlen(argv[2]) + 1; + lcfg.lcfg_inlbuf2 = argv[2]; + + rc = lcfg_ioctl(argv[0], OBD_DEV_ID, &lcfg); + if (rc < 0) { + fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]), + strerror(rc = errno)); + } + + return rc; +} diff --git a/lustre/utils/obd.c b/lustre/utils/obd.c index 3c40db2..962e26a 100644 --- a/lustre/utils/obd.c +++ b/lustre/utils/obd.c @@ -2005,36 +2005,6 @@ int jt_obd_mdc_lookup(int argc, char **argv) return rc; } -int jt_obd_finish_gns(int argc, char **argv) -{ - char *mtpt; - int rc, fd; - struct obd_ioctl_data data; - - if (argc != 2) - return CMD_HELP; - - mtpt = argv[1]; - - fd = open(mtpt, O_RDONLY); - if (fd < 0) { - fprintf(stderr, "open \"%s\" failed: %s\n", mtpt, - strerror(errno)); - return -1; - } - - IOC_INIT(data); - IOC_PACK(argv[0], data); - rc = ioctl(fd, IOC_MDC_FINISH_GNS, buf); - if (rc < 0) { - fprintf(stderr, "error: %s(%s) ioctl error: 
%s\n", - jt_cmdname(argv[0]), mtpt, strerror(rc = errno)); - } - close(fd); - - return rc; -} - int jt_obd_close_uuid(int argc, char **argv) { int rc, nal; diff --git a/lustre/utils/obdctl.h b/lustre/utils/obdctl.h index 415b752..e4b47da 100644 --- a/lustre/utils/obdctl.h +++ b/lustre/utils/obdctl.h @@ -74,7 +74,6 @@ int jt_obd_activate(int argc, char **argv); int jt_obd_deactivate(int argc, char **argv); int jt_obd_recover(int argc, char **argv); int jt_obd_mdc_lookup(int argc, char **argv); -int jt_obd_finish_gns(int argc, char **argv); int jt_get_version(int argc, char **argv); int jt_obd_close_uuid(int argc, char **argv); int jt_cfg_record(int argc, char **argv); @@ -115,6 +114,7 @@ int jt_lcfg_set_timeout(int argc, char **argv); int jt_lcfg_set_lustre_upcall(int argc, char **argv); int jt_lcfg_add_conn(int argc, char **argv); int jt_lcfg_del_conn(int argc, char **argv); +int jt_lcfg_set_security(int argc, char **argv); int obd_add_uuid(char *uuid, ptl_nid_t nid, int nal); -- 1.8.3.1