From 52971f18ca9cebe86baaa0da662a141f4c498980 Mon Sep 17 00:00:00 2001 From: alex Date: Fri, 6 Feb 2009 21:14:37 +0000 Subject: [PATCH] - update from HEAD --- lustre/ChangeLog | 40 ++ lustre/autoconf/lustre-core.m4 | 40 +- lustre/autoconf/lustre-version.ac | 2 +- lustre/cmm/cmm_device.c | 18 +- lustre/cmm/mdc_device.c | 4 +- lustre/cmm/mdc_internal.h | 1 - lustre/fid/fid_lib.c | 13 + lustre/fld/fld_handler.c | 9 + lustre/include/cl_object.h | 20 +- lustre/include/class_hash.h | 6 +- lustre/include/linux/lvfs.h | 4 + lustre/include/lprocfs_status.h | 31 +- lustre/include/lustre/ll_fiemap.h | 85 +-- lustre/include/lustre/lustre_idl.h | 247 ++++--- lustre/include/lustre/lustre_user.h | 7 +- lustre/include/lustre_dlm.h | 36 +- lustre/include/lustre_export.h | 9 +- lustre/include/lustre_fid.h | 5 + lustre/include/lustre_lib.h | 2 - lustre/include/lustre_net.h | 8 +- lustre/include/lustre_sec.h | 270 +++++--- lustre/include/obd.h | 19 +- lustre/include/obd_class.h | 17 +- .../patches/md-mmp-unplug-dev-sles10.patch | 22 + .../kernel_patches/patches/md-mmp-unplug-dev.patch | 22 + lustre/kernel_patches/series/2.6-rhel5.series | 1 + lustre/kernel_patches/series/2.6-sles10.series | 1 + lustre/kernel_patches/series/2.6.22-vanilla.series | 1 + lustre/lclient/glimpse.c | 14 +- lustre/lclient/lcommon_cl.c | 2 +- lustre/ldlm/Makefile.am | 2 +- lustre/ldlm/ldlm_internal.h | 2 - lustre/ldlm/ldlm_lib.c | 125 ++-- lustre/ldlm/ldlm_lock.c | 13 +- lustre/ldlm/ldlm_lockd.c | 33 +- lustre/ldlm/ldlm_pool.c | 39 +- lustre/ldlm/ldlm_request.c | 132 +--- lustre/ldlm/ldlm_resource.c | 8 - lustre/liblustre/llite_lib.c | 5 +- lustre/liblustre/super.c | 8 +- lustre/llite/Makefile.in | 4 + lustre/llite/autoMakefile.am | 3 - lustre/llite/dcache.c | 9 +- lustre/llite/file.c | 65 +- lustre/llite/llite_internal.h | 22 +- lustre/llite/llite_lib.c | 9 +- lustre/llite/lloop.c | 384 +++++++---- lustre/llite/rw26.c | 39 +- lustre/llite/vvp_page.c | 7 +- lustre/lmv/lmv_obd.c | 30 +- lustre/lov/Makefile.in | 
2 + lustre/lov/autoMakefile.am | 1 - lustre/lov/lov_cl_internal.h | 4 +- lustre/lov/lov_internal.h | 3 +- lustre/lov/lov_obd.c | 127 ++-- lustre/lov/lov_pack.c | 18 +- lustre/lov/lov_pool.c | 156 +++-- lustre/lov/lov_qos.c | 13 +- lustre/lvfs/Makefile.in | 5 + lustre/lvfs/autoMakefile.am | 5 - lustre/lvfs/lvfs_linux.c | 52 ++ lustre/mdc/Makefile.in | 2 + lustre/mdc/autoMakefile.am | 1 - lustre/mdc/mdc_request.c | 16 +- lustre/mdd/mdd_device.c | 380 +++++++++- lustre/mdd/mdd_internal.h | 9 + lustre/mdd/mdd_object.c | 4 +- lustre/mds/Makefile.in | 2 + lustre/mds/autoMakefile.am | 1 - lustre/mds/mds_lov.c | 4 +- lustre/mdt/mdt_handler.c | 45 +- lustre/mdt/mdt_open.c | 9 +- lustre/mdt/mdt_recovery.c | 100 ++- lustre/mgc/Makefile.in | 2 + lustre/mgc/autoMakefile.am | 1 - lustre/mgs/Makefile.in | 2 + lustre/mgs/autoMakefile.am | 1 - lustre/mgs/mgs_handler.c | 30 +- lustre/mgs/mgs_llog.c | 5 +- lustre/obdclass/Makefile.in | 3 + lustre/obdclass/autoMakefile.am | 1 - lustre/obdclass/cl_page.c | 7 +- lustre/obdclass/genops.c | 16 +- lustre/obdclass/llog_test.c | 7 +- lustre/obdclass/lu_object.c | 1 + lustre/obdclass/obd_config.c | 4 +- lustre/obdclass/obd_mount.c | 8 +- lustre/obdecho/Makefile.in | 2 + lustre/obdecho/autoMakefile.am | 1 - lustre/obdecho/echo.c | 14 +- lustre/obdecho/echo_client.c | 13 +- lustre/obdfilter/Makefile.in | 2 + lustre/obdfilter/autoMakefile.am | 1 - lustre/obdfilter/filter.c | 69 +- lustre/obdfilter/filter_io_26.c | 59 +- lustre/obdfilter/lproc_obdfilter.c | 45 +- lustre/osc/Makefile.in | 2 + lustre/osc/autoMakefile.am | 1 - lustre/osc/osc_lock.c | 31 +- lustre/osc/osc_request.c | 19 +- lustre/osd/osd_handler.c | 13 +- lustre/ost/Makefile.in | 2 + lustre/ost/autoMakefile.am | 1 - lustre/ost/ost_handler.c | 28 +- lustre/ptlrpc/Makefile.in | 1 + lustre/ptlrpc/autoMakefile.am | 1 - lustre/ptlrpc/client.c | 4 +- lustre/ptlrpc/events.c | 4 +- lustre/ptlrpc/gss/gss_api.h | 44 +- lustre/ptlrpc/gss/gss_bulk.c | 768 +++++++++------------ 
lustre/ptlrpc/gss/gss_internal.h | 4 + lustre/ptlrpc/gss/gss_keyring.c | 1 + lustre/ptlrpc/gss/gss_krb5_mech.c | 716 +++++++++++++++---- lustre/ptlrpc/gss/gss_mech_switch.c | 54 +- lustre/ptlrpc/gss/sec_gss.c | 142 ++-- lustre/ptlrpc/import.c | 13 +- lustre/ptlrpc/lproc_ptlrpc.c | 6 +- lustre/ptlrpc/niobuf.c | 6 +- lustre/ptlrpc/pack_generic.c | 32 +- lustre/ptlrpc/pers.c | 5 +- lustre/ptlrpc/ptlrpc_module.c | 1 + lustre/ptlrpc/recov_thread.c | 1 + lustre/ptlrpc/sec.c | 456 ++++++------ lustre/ptlrpc/sec_bulk.c | 498 +++---------- lustre/ptlrpc/sec_config.c | 277 ++------ lustre/ptlrpc/sec_lproc.c | 18 +- lustre/ptlrpc/sec_null.c | 22 +- lustre/ptlrpc/sec_plain.c | 429 +++++++++--- lustre/ptlrpc/service.c | 11 + lustre/ptlrpc/wiretest.c | 57 +- lustre/quota/Makefile.in | 2 + lustre/quota/autoMakefile.am | 1 - lustre/quota/quota_adjust_qunit.c | 3 +- lustre/quota/quota_check.c | 11 +- lustre/quota/quota_context.c | 56 +- lustre/quota/quota_interface.c | 3 +- lustre/quota/quota_internal.h | 3 + lustre/quota/quota_master.c | 11 +- lustre/tests/acceptance-small.sh | 2 +- lustre/tests/conf-sanity.sh | 25 +- lustre/tests/mdsrate-create-large.sh | 30 +- lustre/tests/mdsrate-create-small.sh | 25 +- lustre/tests/mdsrate-lookup-1dir.sh | 11 +- lustre/tests/mdsrate-stat-large.sh | 11 +- lustre/tests/mdsrate-stat-small.sh | 11 +- lustre/tests/recovery-small.sh | 71 +- lustre/tests/replay-single.sh | 77 ++- lustre/tests/sanity-gss.sh | 120 +--- lustre/tests/sanity-quota.sh | 51 +- lustre/tests/sanity.sh | 163 +++-- lustre/tests/sanityN.sh | 28 +- lustre/tests/test-framework.sh | 105 +-- lustre/utils/liblustreapi.c | 4 +- lustre/utils/obd.c | 410 ++++++++--- lustre/utils/wirecheck.c | 7 +- lustre/utils/wiretest.c | 59 +- 156 files changed, 4728 insertions(+), 3388 deletions(-) create mode 100644 lustre/kernel_patches/patches/md-mmp-unplug-dev-sles10.patch create mode 100644 lustre/kernel_patches/patches/md-mmp-unplug-dev.patch diff --git a/lustre/ChangeLog b/lustre/ChangeLog 
index b8a9007..7d14da4 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -14,6 +14,30 @@ tbd Sun Microsystems, Inc. * File join has been disabled in this release, refer to Bugzilla 16929. Severity : normal +Frequency : start MDS on uncleanly shut down MDS device +Bugzilla : 16839 +Description: ll_sync thread stay in waiting mds<>ost recovery finished +Details : stay in waiting mds<>ost recovery finished produce random bugs + due race between two ll_sync thread for one lov target. send + ACTIVATE event only if connect really finished and import have + FULL state. + +Severity : normal +Frequency : rare, connect and disconnect target at same time +Bugzilla : 17310 +Description: ASSERTION(atomic_read(&imp->imp_inflight) == 0 +Details : don't call obd_disconnect under lov_lock. this long time + operation and can block ptlrpcd which answer to connect request. + +Severity : normal +Frequency : rare +Bugzilla : 18154 +Description: don't lose wakeup for imp_recovery_waitq +Details : recover_import_no_retry or invalidate_import and import_close can + both sleep on imp_recovery_waitq, but we were sending only one wakeup + to sleep queue. + +Severity : normal Frequency : always with long access acl Bugzilla : 17636 Descriptoin: mds can't pack reply with long acl. @@ -1923,6 +1947,22 @@ Details : enable OBD_CONNECT_MDT flag when connecting from the MDS so that from a different NID, so we do not need to wait for the export to be evicted +Severity : major +Frequency : rare, only if using MMP with Linux RAID +Bugzilla : 17895 +Description: MMP doesn't work with Linux RAID +Details : While using HA for Lustre servers with Linux RAID, it is possible + that MMP will not detect multiple mounts. To make this work we + need to unplug the device queue in RAID when the MMP block is being + written. Also while reading the MMP block, we should read it from + disk and not the cached one. 
+ +Severity : enhancement +Bugzilla : 17187 +Description: open file using fid +Details : A file can be opened using just its fid, like + /.lustre/fid/SEQ:OID:VER - this is needed for HSM and replication + -------------------------------------------------------------------------------- 2007-08-10 Cluster File Systems, Inc. diff --git a/lustre/autoconf/lustre-core.m4 b/lustre/autoconf/lustre-core.m4 index 4e00bf6..04023c7 100644 --- a/lustre/autoconf/lustre-core.m4 +++ b/lustre/autoconf/lustre-core.m4 @@ -623,7 +623,7 @@ dnl the AES symbol usually tied with arch, e.g. CRYPTO_AES_586 dnl FIXME AC_DEFUN([LC_CONFIG_RMTCLIENT], [LB_LINUX_CONFIG_IM([CRYPTO_AES],[],[ - AC_MSG_ERROR([Lustre remote client require that CONFIG_CRYPTO_AES is enabled in your kernel.]) + AC_MSG_WARN([Lustre remote client require that CONFIG_CRYPTO_AES is enabled in your kernel.]) ]) ]) @@ -654,19 +654,19 @@ AC_DEFUN([LC_CONFIG_SUNRPC], AC_DEFUN([LC_CONFIG_GSS_KEYRING], [AC_MSG_CHECKING([whether to enable gss keyring backend]) AC_ARG_ENABLE([gss_keyring], - [AC_HELP_STRING([--disable-gss-keyring], + [AC_HELP_STRING([--disable-gss-keyring], [disable gss keyring backend])], - [],[enable_gss_keyring='yes']) + [],[enable_gss_keyring='yes']) AC_MSG_RESULT([$enable_gss_keyring]) if test x$enable_gss_keyring != xno; then - LB_LINUX_CONFIG_IM([KEYS],[], + LB_LINUX_CONFIG_IM([KEYS],[], [AC_MSG_ERROR([GSS keyring backend require that CONFIG_KEYS be enabled in your kernel.])]) - AC_CHECK_LIB([keyutils], [keyctl_search], [], + AC_CHECK_LIB([keyutils], [keyctl_search], [], [AC_MSG_ERROR([libkeyutils is not found, which is required by gss keyring backend])],) - AC_DEFINE([HAVE_GSS_KEYRING], [1], + AC_DEFINE([HAVE_GSS_KEYRING], [1], [Define this if you enable gss keyring backend]) fi ]) @@ -685,37 +685,29 @@ AC_DEFUN([LC_CONFIG_GSS], AC_MSG_RESULT([$enable_gss]) if test x$enable_gss == xyes; then - LC_CONFIG_GSS_KEYRING + LC_CONFIG_GSS_KEYRING LC_CONFIG_SUNRPC + AC_DEFINE([HAVE_GSS], [1], [Define this if you 
enable gss]) + LB_LINUX_CONFIG_IM([CRYPTO_MD5],[], [AC_MSG_WARN([kernel MD5 support is recommended by using GSS.])]) - LB_LINUX_CONFIG_IM([CRYPTO_SHA1],[], + LB_LINUX_CONFIG_IM([CRYPTO_SHA1],[], [AC_MSG_WARN([kernel SHA1 support is recommended by using GSS.])]) - LB_LINUX_CONFIG_IM([CRYPTO_SHA256],[], + LB_LINUX_CONFIG_IM([CRYPTO_SHA256],[], [AC_MSG_WARN([kernel SHA256 support is recommended by using GSS.])]) - LB_LINUX_CONFIG_IM([CRYPTO_SHA512],[], + LB_LINUX_CONFIG_IM([CRYPTO_SHA512],[], [AC_MSG_WARN([kernel SHA512 support is recommended by using GSS.])]) - LB_LINUX_CONFIG_IM([CRYPTO_WP512],[], - [AC_MSG_WARN([kernel WP512 support is recommended by using GSS.])]) - LB_LINUX_CONFIG_IM([CRYPTO_ARC4],[], - [AC_MSG_WARN([kernel ARC4 support is recommended by using GSS.])]) - LB_LINUX_CONFIG_IM([CRYPTO_DES],[], - [AC_MSG_WARN([kernel DES support is recommended by using GSS.])]) - LB_LINUX_CONFIG_IM([CRYPTO_TWOFISH],[], - [AC_MSG_WARN([kernel TWOFISH support is recommended by using GSS.])]) - LB_LINUX_CONFIG_IM([CRYPTO_CAST6],[], - [AC_MSG_WARN([kernel CAST6 support is recommended by using GSS.])]) - - AC_CHECK_LIB([gssapi], [gss_init_sec_context], + + AC_CHECK_LIB([gssapi], [gss_init_sec_context], [GSSAPI_LIBS="$GSSAPI_LDFLAGS -lgssapi"], [AC_CHECK_LIB([gssglue], [gss_init_sec_context], [GSSAPI_LIBS="$GSSAPI_LDFLAGS -lgssglue"], [AC_MSG_ERROR([libgssapi or libgssglue is not found, which is required by GSS.])])],) - AC_SUBST(GSSAPI_LIBS) + AC_SUBST(GSSAPI_LIBS) - AC_KERBEROS_V5 + AC_KERBEROS_V5 fi ]) diff --git a/lustre/autoconf/lustre-version.ac b/lustre/autoconf/lustre-version.ac index 2663bf9..b367e2f 100644 --- a/lustre/autoconf/lustre-version.ac +++ b/lustre/autoconf/lustre-version.ac @@ -1,6 +1,6 @@ m4_define([LUSTRE_MAJOR],[1]) m4_define([LUSTRE_MINOR],[9]) -m4_define([LUSTRE_PATCH],[130]) +m4_define([LUSTRE_PATCH],[150]) m4_define([LUSTRE_FIX],[0]) dnl # don't forget to update the service tags info diff --git a/lustre/cmm/cmm_device.c b/lustre/cmm/cmm_device.c 
index 01f319d..681e2db 100644 --- a/lustre/cmm/cmm_device.c +++ b/lustre/cmm/cmm_device.c @@ -817,15 +817,29 @@ static void lprocfs_cmm_init_vars(struct lprocfs_static_vars *lvars) static int __init cmm_mod_init(void) { struct lprocfs_static_vars lvars; + int rc; + + /* + * Kludge code : it should be moved mdc_device.c if mdc_(mds)_device + * is really stacked. + */ + rc = lu_device_type_init(&mdc_device_type); + if (rc) + return rc; lprocfs_cmm_init_vars(&lvars); - return class_register_type(&cmm_obd_device_ops, NULL, lvars.module_vars, - LUSTRE_CMM_NAME, &cmm_device_type); + rc = class_register_type(&cmm_obd_device_ops, NULL, lvars.module_vars, + LUSTRE_CMM_NAME, &cmm_device_type); + if (rc) + lu_device_type_fini(&mdc_device_type); + + return rc; } static void __exit cmm_mod_exit(void) { class_unregister_type(LUSTRE_CMM_NAME); + lu_device_type_fini(&mdc_device_type); } MODULE_AUTHOR("Sun Microsystems, Inc. "); diff --git a/lustre/cmm/mdc_device.c b/lustre/cmm/mdc_device.c index db2d0b1..d3a7c3b 100644 --- a/lustre/cmm/mdc_device.c +++ b/lustre/cmm/mdc_device.c @@ -130,7 +130,6 @@ static int mdc_obd_add(const struct lu_env *env, CERROR("target %s not set up\n", mdc->obd_name); rc = -EINVAL; } else { - struct lustre_handle *conn = &desc->cl_conn; struct obd_connect_data *ocd; CDEBUG(D_CONFIG, "connect to %s(%s)\n", @@ -153,13 +152,12 @@ static int mdc_obd_add(const struct lu_env *env, OBD_CONNECT_MDS_MDS | OBD_CONNECT_FID | OBD_CONNECT_AT; - rc = obd_connect(env, conn, mdc, &mdc->obd_uuid, ocd, NULL); + rc = obd_connect(env, &desc->cl_exp, mdc, &mdc->obd_uuid, ocd, NULL); OBD_FREE_PTR(ocd); if (rc) { CERROR("target %s connect error %d\n", mdc->obd_name, rc); } else { - desc->cl_exp = class_conn2export(conn); /* set seq controller export for MDC0 if exists */ if (mc->mc_num == 0) ms->ms_control_exp = diff --git a/lustre/cmm/mdc_internal.h b/lustre/cmm/mdc_internal.h index 774912b..e7a1d13 100644 --- a/lustre/cmm/mdc_internal.h +++ b/lustre/cmm/mdc_internal.h @@ 
-50,7 +50,6 @@ #include struct mdc_cli_desc { - struct lustre_handle cl_conn; /* uuid of remote MDT to connect */ struct obd_uuid cl_srv_uuid; /* mdc uuid */ diff --git a/lustre/fid/fid_lib.c b/lustre/fid/fid_lib.c index 76e779a..ab6422c 100644 --- a/lustre/fid/fid_lib.c +++ b/lustre/fid/fid_lib.c @@ -70,6 +70,7 @@ * * The first 0x400 sequences of normal FID are reserved for special purpose. * FID_SEQ_START + 1 is for local file id generation. + * FID_SEQ_START + 2 is for .lustre directory and its objects */ const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE = { FID_SEQ_START + 0x400ULL, @@ -89,3 +90,15 @@ const struct lu_fid LUSTRE_BFL_FID = { .f_seq = 0x0000000000000003, .f_oid = 0x0000000000000001, .f_ver = 0x0000000000000000 }; EXPORT_SYMBOL(LUSTRE_BFL_FID); + +/** Special fid for ".lustre" directory */ +const struct lu_fid LU_DOT_LUSTRE_FID = { .f_seq = LU_DOT_LUSTRE_SEQ, + .f_oid = 0x0000000000000001, + .f_ver = 0x0000000000000000 }; +EXPORT_SYMBOL(LU_DOT_LUSTRE_FID); + +/** Special fid for "fid" special object in .lustre */ +const struct lu_fid LU_OBF_FID = { .f_seq = LU_DOT_LUSTRE_SEQ, + .f_oid = 0x0000000000000002, + .f_ver = 0x0000000000000000 }; +EXPORT_SYMBOL(LU_OBF_FID); diff --git a/lustre/fld/fld_handler.c b/lustre/fld/fld_handler.c index 2b6ab12..5092ac1 100644 --- a/lustre/fld/fld_handler.c +++ b/lustre/fld/fld_handler.c @@ -67,6 +67,7 @@ #include #include #include "fld_internal.h" +#include #ifdef __KERNEL__ @@ -466,6 +467,7 @@ int fld_server_init(struct lu_server_fld *fld, struct dt_device *dt, int mds_node_id) { int cache_size, cache_threshold; + struct lu_seq_range range; int rc; ENTRY; @@ -499,6 +501,13 @@ int fld_server_init(struct lu_server_fld *fld, struct dt_device *dt, GOTO(out, rc); fld->lsf_control_exp = NULL; + + /* Insert reserved sequence number of ".lustre" into fld cache. 
*/ + range.lsr_start = LU_DOT_LUSTRE_SEQ; + range.lsr_end = LU_DOT_LUSTRE_SEQ + 1; + range.lsr_mdt = 0; + fld_cache_insert(fld->lsf_cache, &range); + EXIT; out: if (rc) diff --git a/lustre/include/cl_object.h b/lustre/include/cl_object.h index ad4b468..5b922bd 100644 --- a/lustre/include/cl_object.h +++ b/lustre/include/cl_object.h @@ -2100,16 +2100,22 @@ enum cl_enq_flags { */ CEF_DISCARD_DATA = 0x00000004, /** - * tell the sub layers that it must be a `real' lock. + * tell the sub layers that it must be a `real' lock. This is used for + * mmapped-buffer locks and glimpse locks that must be never converted + * into lockless mode. + * + * \see vvp_mmap_locks(), cl_glimpse_lock(). */ CEF_MUST = 0x00000008, /** - * tell the sub layers that never request a `real' lock. - * currently, the CEF_MUST & CEF_NEVER are only used for mmap locks. - * cl_io::ci_lockreq and these two flags: ci_lockreq just describes - * generic information of lock requirement for this IO, especially for - * locks which belong to the object doing IO; however, lock itself may - * have precise requirements, this is described by the latter. + * tell the sub layers that never request a `real' lock. This flag is + * not used currently. + * + * cl_io::ci_lockreq and CEF_{MUST,NEVER} flags specify lockless + * conversion policy: ci_lockreq describes generic information of lock + * requirement for this IO, especially for locks which belong to the + * object doing IO; however, lock itself may have precise requirements + * that are described by the enqueue flags. 
*/ CEF_NEVER = 0x00000010, /** diff --git a/lustre/include/class_hash.h b/lustre/include/class_hash.h index 6210c7f..37bd8d2 100644 --- a/lustre/include/class_hash.h +++ b/lustre/include/class_hash.h @@ -85,11 +85,9 @@ lh_hash(lustre_hash_t *lh, void *key, unsigned mask) { LASSERT(lh); LASSERT(LHO(lh)); + LASSERT(LHP(lh, hash)); - if (LHP(lh, hash)) - return LHP(lh, hash)(lh, key, mask); - - return -EOPNOTSUPP; + return LHP(lh, hash)(lh, key, mask); } static inline void * diff --git a/lustre/include/linux/lvfs.h b/lustre/include/linux/lvfs.h index 17576c3..e90b155 100644 --- a/lustre/include/linux/lvfs.h +++ b/lustre/include/linux/lvfs.h @@ -104,6 +104,10 @@ int lustre_fread(struct file *file, void *buf, int len, loff_t *off); int lustre_fwrite(struct file *file, const void *buf, int len, loff_t *off); int lustre_fsync(struct file *file); long l_readdir(struct file * file, struct list_head *dentry_list); +int l_notify_change(struct vfsmount *mnt, struct dentry *dchild, + struct iattr *newattrs); +int simple_truncate(struct dentry *dir, struct vfsmount *mnt, + char *name, loff_t length); static inline void l_dput(struct dentry *de) { diff --git a/lustre/include/lprocfs_status.h b/lustre/include/lprocfs_status.h index 7763498..f20dae2 100644 --- a/lustre/include/lprocfs_status.h +++ b/lustre/include/lprocfs_status.h @@ -222,9 +222,9 @@ static inline int opcode_offset(__u32 opc) { (LDLM_LAST_OPC - LDLM_FIRST_OPC) + (MDS_LAST_OPC - MDS_FIRST_OPC) + (OST_LAST_OPC - OST_FIRST_OPC)); - } else if (opc < FLD_LAST_OPC) { - /* FLD opcode */ - return (opc - FLD_FIRST_OPC + + } else if (opc < QUOTA_LAST_OPC) { + /* LQUOTA Opcode */ + return (opc - QUOTA_FIRST_OPC + (LLOG_LAST_OPC - LLOG_FIRST_OPC) + (OBD_LAST_OPC - OBD_FIRST_OPC) + (MGS_LAST_OPC - MGS_FIRST_OPC) + @@ -234,7 +234,7 @@ static inline int opcode_offset(__u32 opc) { } else if (opc < SEQ_LAST_OPC) { /* SEQ opcode */ return (opc - SEQ_FIRST_OPC + - (FLD_LAST_OPC - FLD_FIRST_OPC) + + (QUOTA_LAST_OPC- QUOTA_FIRST_OPC) + 
(LLOG_LAST_OPC - LLOG_FIRST_OPC) + (OBD_LAST_OPC - OBD_FIRST_OPC) + (MGS_LAST_OPC - MGS_FIRST_OPC) + @@ -245,19 +245,19 @@ static inline int opcode_offset(__u32 opc) { /* SEC opcode */ return (opc - SEC_FIRST_OPC + (SEQ_LAST_OPC - SEQ_FIRST_OPC) + - (FLD_LAST_OPC - FLD_FIRST_OPC) + + (QUOTA_LAST_OPC- QUOTA_FIRST_OPC) + (LLOG_LAST_OPC - LLOG_FIRST_OPC) + (OBD_LAST_OPC - OBD_FIRST_OPC) + (MGS_LAST_OPC - MGS_FIRST_OPC) + (LDLM_LAST_OPC - LDLM_FIRST_OPC) + (MDS_LAST_OPC - MDS_FIRST_OPC) + (OST_LAST_OPC - OST_FIRST_OPC)); - } else if (opc < QUOTA_LAST_OPC) { - /* LQUOTA Opcode */ - return (opc - QUOTA_FIRST_OPC + + } else if (opc < FLD_LAST_OPC) { + /* FLD opcode */ + return (opc - FLD_FIRST_OPC + (SEC_LAST_OPC - SEC_FIRST_OPC) + (SEQ_LAST_OPC - SEQ_FIRST_OPC) + - (FLD_LAST_OPC - FLD_FIRST_OPC) + + (QUOTA_LAST_OPC- QUOTA_FIRST_OPC) + (LLOG_LAST_OPC - LLOG_FIRST_OPC) + (OBD_LAST_OPC - OBD_FIRST_OPC) + (MGS_LAST_OPC - MGS_FIRST_OPC) + @@ -270,16 +270,17 @@ static inline int opcode_offset(__u32 opc) { } } -#define LUSTRE_MAX_OPCODES ((LDLM_LAST_OPC - LDLM_FIRST_OPC) + \ + +#define LUSTRE_MAX_OPCODES ((OST_LAST_OPC - OST_FIRST_OPC) + \ (MDS_LAST_OPC - MDS_FIRST_OPC) + \ - (OST_LAST_OPC - OST_FIRST_OPC) + \ - (OBD_LAST_OPC - OBD_FIRST_OPC) + \ - (FLD_LAST_OPC - FLD_FIRST_OPC) + \ - (SEQ_LAST_OPC - SEQ_FIRST_OPC) + \ + (LDLM_LAST_OPC - LDLM_FIRST_OPC) + \ (MGS_LAST_OPC - MGS_FIRST_OPC) + \ + (OBD_LAST_OPC - OBD_FIRST_OPC) + \ (LLOG_LAST_OPC - LLOG_FIRST_OPC) + \ + (QUOTA_LAST_OPC - QUOTA_FIRST_OPC) + \ + (SEQ_LAST_OPC - SEQ_FIRST_OPC) + \ (SEC_LAST_OPC - SEC_FIRST_OPC) + \ - (QUOTA_LAST_OPC - QUOTA_FIRST_OPC)) + (FLD_LAST_OPC - FLD_FIRST_OPC)) #define EXTRA_MAX_OPCODES ((PTLRPC_LAST_CNTR - PTLRPC_FIRST_CNTR) + \ (EXTRA_LAST_OPC - EXTRA_FIRST_OPC)) diff --git a/lustre/include/lustre/ll_fiemap.h b/lustre/include/lustre/ll_fiemap.h index e8620bf..8bac0f4 100644 --- a/lustre/include/lustre/ll_fiemap.h +++ b/lustre/include/lustre/ll_fiemap.h @@ -48,27 +48,27 @@ #ifndef 
HAVE_LINUX_FIEMAP_H struct ll_fiemap_extent { - __u64 fe_logical; /* logical offset in bytes for the start of - * the extent from the beginning of the file */ - __u64 fe_physical; /* physical offset in bytes for the start - * of the extent from the beginning of the disk */ - __u64 fe_length; /* length in bytes for the extent */ - __u32 fe_flags; /* FIEMAP_EXTENT_* flags for the extent */ - __u32 fe_device; /* device number for this extent */ + __u64 fe_logical; /* logical offset in bytes for the start of + * the extent from the beginning of the file */ + __u64 fe_physical; /* physical offset in bytes for the start + * of the extent from the beginning of the disk */ + __u64 fe_length; /* length in bytes for this extent */ + __u64 fe_reserved64[2]; + __u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */ + __u32 fe_device; /* device number for this extent */ + __u32 fe_reserved[2]; }; struct ll_user_fiemap { - __u64 fm_start; /* logical offset (inclusive) at - * which to start mapping (in) */ - __u64 fm_length; /* logical length of mapping which - * userspace wants (in) */ - __u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */ - __u32 fm_mapped_extents;/* number of extents that were mapped (out) */ - __u32 fm_extent_count; /* size of fm_extents array (in) */ - __u32 fm_reserved; - struct ll_fiemap_extent fm_extents[0]; /* array of mapped extents (out). 
- * Lustre uses first extent to - * send end_offset */ + __u64 fm_start; /* logical offset (inclusive) at + * which to start mapping (in) */ + __u64 fm_length; /* logical length of mapping which + * userspace wants (in) */ + __u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */ + __u32 fm_mapped_extents;/* number of extents that were mapped (out) */ + __u32 fm_extent_count; /* size of fm_extents array (in) */ + __u32 fm_reserved; + struct ll_fiemap_extent fm_extents[0]; /* array of mapped extents (out) */ }; #define FIEMAP_MAX_OFFSET (~0ULL) @@ -80,30 +80,31 @@ struct ll_user_fiemap { #define FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR | \ FIEMAP_FLAG_DEVICE_ORDER) -#define FIEMAP_EXTENT_LAST 0x00000001 /* Last extent in file. */ -#define FIEMAP_EXTENT_UNKNOWN 0x00000002 /* Data location unknown. */ -#define FIEMAP_EXTENT_DELALLOC 0x00000004 /* Location still pending. - * Sets EXTENT_UNKNOWN. */ -#define FIEMAP_EXTENT_NO_DIRECT 0x00000008 /* Data mapping undefined */ -#define FIEMAP_EXTENT_SECONDARY 0x00000010 /* Data copied offline. May - * set EXTENT_NO_DIRECT. */ -#define FIEMAP_EXTENT_NET 0x00000020 /* Data stored remotely. - * Sets EXTENT_NO_DIRECT. */ -#define FIEMAP_EXTENT_DATA_COMPRESSED 0x00000040 /* Data is compressed by fs. - * Sets EXTENT_NO_DIRECT. */ -#define FIEMAP_EXTENT_DATA_ENCRYPTED 0x00000080 /* Data is encrypted by fs. - * Sets EXTENT_NO_DIRECT. */ -#define FIEMAP_EXTENT_NOT_ALIGNED 0x00000100 /* Extent offsets may not be - * block aligned. */ -#define FIEMAP_EXTENT_DATA_INLINE 0x00000200 /* Data mixed with metadata. - * Sets EXTENT_NOT_ALIGNED.*/ -#define FIEMAP_EXTENT_DATA_TAIL 0x00000400 /* Multiple files in block. - * Sets EXTENT_NOT_ALIGNED.*/ -#define FIEMAP_EXTENT_UNWRITTEN 0x00000800 /* Space allocated, but - * no data (i.e. zero). */ -#define FIEMAP_EXTENT_MERGED 0x00001000 /* File does not natively - * support extents. Result - * merged for efficiency. 
*/ + +#define FIEMAP_EXTENT_LAST 0x00000001 /* Last extent in file. */ +#define FIEMAP_EXTENT_UNKNOWN 0x00000002 /* Data location unknown. */ +#define FIEMAP_EXTENT_DELALLOC 0x00000004 /* Location still pending. + * Sets EXTENT_UNKNOWN. */ +#define FIEMAP_EXTENT_ENCODED 0x00000008 /* Data can not be read + * while fs is unmounted */ +#define FIEMAP_EXTENT_DATA_ENCRYPTED 0x00000080 /* Data is encrypted by fs. + * Sets EXTENT_NO_DIRECT. */ +#define FIEMAP_EXTENT_NOT_ALIGNED 0x00000100 /* Extent offsets may not be + * block aligned. */ +#define FIEMAP_EXTENT_DATA_INLINE 0x00000200 /* Data mixed with metadata. + * Sets EXTENT_NOT_ALIGNED.*/ +#define FIEMAP_EXTENT_DATA_TAIL 0x00000400 /* Multiple files in block. + * Sets EXTENT_NOT_ALIGNED.*/ +#define FIEMAP_EXTENT_UNWRITTEN 0x00000800 /* Space allocated, but + * no data (i.e. zero). */ +#define FIEMAP_EXTENT_MERGED 0x00001000 /* File does not natively + * support extents. Result + * merged for efficiency. */ + +/* Lustre specific flags - use a high bit, don't conflict with upstream flag */ +#define FIEMAP_EXTENT_NO_DIRECT 0x40000000 /* Data mapping undefined */ +#define FIEMAP_EXTENT_NET 0x80000000 /* Data stored remotely. + * Sets NO_DIRECT flag */ #else diff --git a/lustre/include/lustre/lustre_idl.h b/lustre/include/lustre/lustre_idl.h index c80dd5e..76327b2 100644 --- a/lustre/include/lustre/lustre_idl.h +++ b/lustre/include/lustre/lustre_idl.h @@ -42,10 +42,6 @@ * * Lustre wire protocol definitions. * - * We assume all nodes are either little-endian or big-endian, and we - * always send messages in the sender's native format. The receiver - * detects the message format by checking the 'magic' field of the message - * (see lustre_msg_swabbed() below). * ALL structs passing over the wire should be declared here. Structs * that are used in interfaces with userspace should go in lustre_user.h. 
* @@ -72,6 +68,11 @@ * in the code to ensure that new/old clients that see this larger struct * do not fail, otherwise you need to implement protocol compatibility). * + * We assume all nodes are either little-endian or big-endian, and we + * always send messages in the sender's native format. The receiver + * detects the message format by checking the 'magic' field of the message + * (see lustre_msg_swabbed() below). + * * Each wire type has corresponding 'lustre_swab_xxxtypexxx()' routines, * implemented either here, inline (trivial implementations) or in * ptlrpc/pack_generic.c. These 'swabbers' convert the type from "other" @@ -371,6 +372,7 @@ static inline __u32 lu_igif_gen(const struct lu_fid *fid) } #define DFID "["LPX64":0x%x:0x%x]" +#define SFID "0x%llx:0x%x:0x%x" #define PFID(fid) \ fid_seq(fid), \ @@ -654,48 +656,50 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); * Flags for all connect opcodes (MDS_CONNECT, OST_CONNECT) */ -#define MSG_CONNECT_RECOVERING 0x1 -#define MSG_CONNECT_RECONNECT 0x2 -#define MSG_CONNECT_REPLAYABLE 0x4 +#define MSG_CONNECT_RECOVERING 0x00000001 +#define MSG_CONNECT_RECONNECT 0x00000002 +#define MSG_CONNECT_REPLAYABLE 0x00000004 //#define MSG_CONNECT_PEER 0x8 -#define MSG_CONNECT_LIBCLIENT 0x10 -#define MSG_CONNECT_INITIAL 0x20 -#define MSG_CONNECT_ASYNC 0x40 -#define MSG_CONNECT_NEXT_VER 0x80 /* use next version of lustre_msg */ -#define MSG_CONNECT_TRANSNO 0x100 /* report transno */ +#define MSG_CONNECT_LIBCLIENT 0x00000010 +#define MSG_CONNECT_INITIAL 0x00000020 +#define MSG_CONNECT_ASYNC 0x00000040 +#define MSG_CONNECT_NEXT_VER 0x00000080 /* use next version of lustre_msg */ +#define MSG_CONNECT_TRANSNO 0x00000100 /* report transno */ /* Connect flags */ -#define OBD_CONNECT_RDONLY 0x00000001ULL /* client allowed read-only access */ -#define OBD_CONNECT_INDEX 0x00000002ULL /* connect to specific LOV idx */ -#define OBD_CONNECT_MDS 0x00000004ULL /* connect from MDT to OST */ -#define OBD_CONNECT_GRANT 
0x00000008ULL /* OSC acquires grant at connect */ -#define OBD_CONNECT_SRVLOCK 0x00000010ULL /* server takes locks for client */ -#define OBD_CONNECT_VERSION 0x00000020ULL /* Server supports versions in ocd */ -#define OBD_CONNECT_REQPORTAL 0x00000040ULL /* Separate portal for non-IO reqs */ -#define OBD_CONNECT_ACL 0x00000080ULL /* client uses access control lists */ -#define OBD_CONNECT_XATTR 0x00000100ULL /* client using extended attributes*/ -#define OBD_CONNECT_TRUNCLOCK 0x00000400ULL /* locks on server for punch b=9528 */ -#define OBD_CONNECT_IBITS 0x00001000ULL /* support for inodebits locks */ -#define OBD_CONNECT_JOIN 0x00002000ULL /* files can be concatenated */ -#define OBD_CONNECT_ATTRFID 0x00004000ULL /* Server supports GetAttr By Fid */ -#define OBD_CONNECT_NODEVOH 0x00008000ULL /* No open handle for special nodes */ -#define OBD_CONNECT_RMT_CLIENT 0x00010000ULL /* Remote client */ -#define OBD_CONNECT_RMT_CLIENT_FORCE 0x00020000ULL /* Remote client by force */ -#define OBD_CONNECT_BRW_SIZE 0x00040000ULL /* Max bytes per rpc */ -#define OBD_CONNECT_QUOTA64 0x00080000ULL /* 64bit qunit_data.qd_count b=10707*/ -#define OBD_CONNECT_MDS_CAPA 0x00100000ULL /* MDS capability */ -#define OBD_CONNECT_OSS_CAPA 0x00200000ULL /* OSS capability */ -#define OBD_CONNECT_CANCELSET 0x00400000ULL /* Early batched cancels. */ -#define OBD_CONNECT_SOM 0x00800000ULL /* SOM feature */ -#define OBD_CONNECT_AT 0x01000000ULL /* client uses adaptive timeouts */ -#define OBD_CONNECT_LRU_RESIZE 0x02000000ULL /* Lru resize feature. 
*/ -#define OBD_CONNECT_MDS_MDS 0x04000000ULL /* MDS-MDS connection*/ -#define OBD_CONNECT_REAL 0x08000000ULL /* real connection */ -#define OBD_CONNECT_CHANGE_QS 0x10000000ULL /* shrink/enlarge qunit b=10600 */ -#define OBD_CONNECT_CKSUM 0x20000000ULL /* support several cksum algos */ -#define OBD_CONNECT_FID 0x40000000ULL /* FID is supported by server */ -#define OBD_CONNECT_LOV_V3 0x100000000ULL /* client supports lov v3 ea */ - +#define OBD_CONNECT_RDONLY 0x1ULL /*client allowed read-only access*/ +#define OBD_CONNECT_INDEX 0x2ULL /*connect to specific LOV idx */ +#define OBD_CONNECT_MDS 0x4ULL /*connect from MDT to OST */ +#define OBD_CONNECT_GRANT 0x8ULL /*OSC acquires grant at connect */ +#define OBD_CONNECT_SRVLOCK 0x10ULL /*server takes locks for client */ +#define OBD_CONNECT_VERSION 0x20ULL /*Lustre versions in ocd */ +#define OBD_CONNECT_REQPORTAL 0x40ULL /*Separate non-IO request portal */ +#define OBD_CONNECT_ACL 0x80ULL /*access control lists */ +#define OBD_CONNECT_XATTR 0x100ULL /*client use extended attributes */ +#define OBD_CONNECT_CROW 0x200ULL /*MDS+OST create objects on write*/ +#define OBD_CONNECT_TRUNCLOCK 0x400ULL /*locks on server for punch */ +#define OBD_CONNECT_TRANSNO 0x800ULL /*replay sends initial transno */ +#define OBD_CONNECT_IBITS 0x1000ULL /*support for inodebits locks */ +#define OBD_CONNECT_JOIN 0x2000ULL /*files can be concatenated */ +#define OBD_CONNECT_ATTRFID 0x4000ULL /*Server supports GetAttr By Fid */ +#define OBD_CONNECT_NODEVOH 0x8000ULL /*No open handle on special nodes*/ +#define OBD_CONNECT_RMT_CLIENT 0x00010000ULL /*Remote client */ +#define OBD_CONNECT_RMT_CLIENT_FORCE 0x00020000ULL /*Remote client by force */ +#define OBD_CONNECT_BRW_SIZE 0x40000ULL /*Max bytes per rpc */ +#define OBD_CONNECT_QUOTA64 0x80000ULL /*64bit qunit_data.qd_count */ +#define OBD_CONNECT_MDS_CAPA 0x100000ULL /*MDS capability */ +#define OBD_CONNECT_OSS_CAPA 0x200000ULL /*OSS capability */ +#define OBD_CONNECT_CANCELSET 0x400000ULL 
/*Early batched cancels. */ +#define OBD_CONNECT_SOM 0x00800000ULL /*Size on MDS */ +#define OBD_CONNECT_AT 0x01000000ULL /*client uses adaptive timeouts */ +#define OBD_CONNECT_LRU_RESIZE 0x02000000ULL /*LRU resize feature. */ +#define OBD_CONNECT_MDS_MDS 0x04000000ULL /*MDS-MDS connection */ +#define OBD_CONNECT_REAL 0x08000000ULL /*real connection */ +#define OBD_CONNECT_CHANGE_QS 0x10000000ULL /*shrink/enlarge qunit b=10600 */ +#define OBD_CONNECT_CKSUM 0x20000000ULL /*support several cksum algos */ +#define OBD_CONNECT_FID 0x40000000ULL /*FID is supported by server */ +#define OBD_CONNECT_VBR 0x80000000ULL /*version based recovery */ +#define OBD_CONNECT_LOV_V3 0x100000000ULL /*client supports LOV v3 EA */ /* also update obd_connect_names[] for lprocfs_rd_connect_flags() * and lustre/utils/wirecheck.c */ @@ -709,27 +713,26 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); OBD_CONNECT_ACL | OBD_CONNECT_XATTR | \ OBD_CONNECT_IBITS | OBD_CONNECT_JOIN | \ OBD_CONNECT_NODEVOH |/* OBD_CONNECT_ATTRFID |*/\ + OBD_CONNECT_CANCELSET | OBD_CONNECT_AT | \ OBD_CONNECT_RMT_CLIENT | \ OBD_CONNECT_RMT_CLIENT_FORCE | \ OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA | \ - OBD_CONNECT_MDS_MDS | OBD_CONNECT_CANCELSET | \ - OBD_CONNECT_FID | \ - LRU_RESIZE_CONNECT_FLAG | OBD_CONNECT_AT | \ + OBD_CONNECT_MDS_MDS | OBD_CONNECT_FID | \ + LRU_RESIZE_CONNECT_FLAG | \ OBD_CONNECT_LOV_V3) #define OST_CONNECT_SUPPORTED (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \ OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \ OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX | \ OBD_CONNECT_BRW_SIZE | OBD_CONNECT_QUOTA64 | \ - OBD_CONNECT_OSS_CAPA | OBD_CONNECT_CANCELSET | \ - OBD_CONNECT_CKSUM | LRU_RESIZE_CONNECT_FLAG | \ - OBD_CONNECT_AT | OBD_CONNECT_CHANGE_QS | \ - OBD_CONNECT_RMT_CLIENT | \ - OBD_CONNECT_RMT_CLIENT_FORCE | OBD_CONNECT_MDS) + OBD_CONNECT_CANCELSET | OBD_CONNECT_AT | \ + LRU_RESIZE_CONNECT_FLAG | OBD_CONNECT_CKSUM | \ + OBD_CONNECT_CHANGE_QS | \ + OBD_CONNECT_OSS_CAPA | 
OBD_CONNECT_RMT_CLIENT | \ + OBD_CONNECT_RMT_CLIENT_FORCE | \ + OBD_CONNECT_MDS) #define ECHO_CONNECT_SUPPORTED (0) #define MGS_CONNECT_SUPPORTED (OBD_CONNECT_VERSION | OBD_CONNECT_AT) -#define MAX_QUOTA_COUNT32 (0xffffffffULL) - #define OBD_OCD_VERSION(major,minor,patch,fix) (((major)<<24) + ((minor)<<16) +\ ((patch)<<8) + (fix)) #define OBD_OCD_VERSION_MAJOR(version) ((int)((version)>>24)&255) @@ -805,12 +808,12 @@ typedef __u64 obd_time; typedef __u64 obd_size; typedef __u64 obd_off; typedef __u64 obd_blocks; +typedef __u64 obd_valid; typedef __u32 obd_blksize; typedef __u32 obd_mode; typedef __u32 obd_uid; typedef __u32 obd_gid; typedef __u32 obd_flag; -typedef __u64 obd_valid; typedef __u32 obd_count; #define OBD_FL_INLINEDATA (0x00000001) @@ -822,6 +825,7 @@ typedef __u32 obd_count; #define OBD_FL_DEBUG_CHECK (0x00000040) /* echo client/server debug check */ #define OBD_FL_NO_USRQUOTA (0x00000100) /* the object's owner is over quota */ #define OBD_FL_NO_GRPQUOTA (0x00000200) /* the object's group is over quota */ +#define OBD_FL_CREATE_CROW (0x00000400) /* object should be create on write */ /** * Set this to delegate DLM locking during obd_punch() to the OSTs. 
Only OSTs @@ -920,7 +924,7 @@ struct lov_mds_md_v3 { /* LOV EA mds/wire data (little-endian) */ #define OBD_MD_FLHANDLE (0x00080000ULL) /* file/lock handle */ #define OBD_MD_FLCKSUM (0x00100000ULL) /* bulk data checksum */ #define OBD_MD_FLQOS (0x00200000ULL) /* quality of service stats */ -#define OBD_MD_FLOSCOPQ (0x00400000ULL) /* osc opaque data */ +/*#define OBD_MD_FLOSCOPQ (0x00400000ULL) osc opaque data, never used */ #define OBD_MD_FLCOOKIE (0x00800000ULL) /* log cancellation cookie */ #define OBD_MD_FLGROUP (0x01000000ULL) /* group */ #define OBD_MD_FLFID (0x02000000ULL) /* ->ost write inline fid */ @@ -1202,9 +1206,19 @@ static inline int ll_inode_to_ext_flags(int oflags, int iflags) } #endif -struct mdt_body { - struct lu_fid fid1; - struct lu_fid fid2; +/* + * while mds_body is to interact with 1.6, mdt_body is to interact with 2.0. + * both of them should have the same fields layout, because at client side + * one could be dynamically cast to the other. + * + * mdt_body has large size than mds_body, with unused padding (48 bytes) + * at the end. client always use size of mdt_body to prepare request/reply + * buffers, and actual data could be interepeted as mdt_body or mds_body + * accordingly. 
+ */ +struct mds_body { + struct ll_fid fid1; + struct ll_fid fid2; struct lustre_handle handle; __u64 valid; __u64 size; /* Offset, in the case of MDS_READPAGE */ @@ -1212,8 +1226,8 @@ struct mdt_body { __u64 atime; __u64 ctime; __u64 blocks; /* XID, in the case of MDS_READPAGE */ - __u64 ioepoch; - __u64 ino; /* for 1.6 compatibility */ + __u64 io_epoch; + __u64 ino; __u32 fsuid; __u32 fsgid; __u32 capability; @@ -1223,24 +1237,20 @@ struct mdt_body { __u32 flags; /* from vfs for pin/unpin, MDS_BFLAG for close */ __u32 rdev; __u32 nlink; /* #bytes to read in the case of MDS_READPAGE */ - __u32 generation; /* for 1.6 compatibility */ + __u32 generation; __u32 suppgid; __u32 eadatasize; __u32 aclsize; __u32 max_mdsize; __u32 max_cookiesize; - __u32 padding_4; /* also fix lustre_swab_mdt_body */ - __u64 padding_5; - __u64 padding_6; - __u64 padding_7; - __u64 padding_8; - __u64 padding_9; - __u64 padding_10; + __u32 padding_4; /* also fix lustre_swab_mds_body */ }; -struct mds_body { - struct ll_fid fid1; - struct ll_fid fid2; +extern void lustre_swab_mds_body (struct mds_body *b); + +struct mdt_body { + struct lu_fid fid1; + struct lu_fid fid2; struct lustre_handle handle; __u64 valid; __u64 size; /* Offset, in the case of MDS_READPAGE */ @@ -1248,8 +1258,8 @@ struct mds_body { __u64 atime; __u64 ctime; __u64 blocks; /* XID, in the case of MDS_READPAGE */ - __u64 io_epoch; - __u64 ino; + __u64 ioepoch; + __u64 ino; /* for 1.6 compatibility */ __u32 fsuid; __u32 fsgid; __u32 capability; @@ -1259,16 +1269,21 @@ struct mds_body { __u32 flags; /* from vfs for pin/unpin, MDS_BFLAG for close */ __u32 rdev; __u32 nlink; /* #bytes to read in the case of MDS_READPAGE */ - __u32 generation; + __u32 generation; /* for 1.6 compatibility */ __u32 suppgid; __u32 eadatasize; __u32 aclsize; __u32 max_mdsize; __u32 max_cookiesize; - __u32 padding_4; /* also fix lustre_swab_mds_body */ -}; + __u32 padding_4; /* also fix lustre_swab_mdt_body */ + __u64 padding_5; + __u64 padding_6; + 
__u64 padding_7; + __u64 padding_8; + __u64 padding_9; + __u64 padding_10; +}; /* 216 */ -extern void lustre_swab_mds_body (struct mds_body *b); extern void lustre_swab_mdt_body (struct mdt_body *b); struct mdt_epoch { @@ -1507,20 +1522,6 @@ enum { MDS_QUOTA_IGNORE = 1 << 5 }; -struct mds_rec_join { - struct ll_fid jr_fid; - __u64 jr_headsize; -}; - -extern void lustre_swab_mds_rec_join (struct mds_rec_join *jr); - -struct mdt_rec_join { - struct lu_fid jr_fid; - __u64 jr_headsize; -}; - -extern void lustre_swab_mdt_rec_join (struct mdt_rec_join *jr); - struct mds_rec_create { __u32 cr_opcode; __u32 cr_fsuid; @@ -1555,7 +1556,7 @@ struct mdt_rec_create { __u32 cr_suppgid2_h; struct lu_fid cr_fid1; struct lu_fid cr_fid2; - struct lustre_handle cr_old_handle; /* u64 handle in case of open replay */ + struct lustre_handle cr_old_handle; /* handle in case of open replay */ __u64 cr_time; __u64 cr_rdev; __u64 cr_ioepoch; @@ -1570,6 +1571,20 @@ struct mdt_rec_create { extern void lustre_swab_mdt_rec_create (struct mdt_rec_create *cr); +struct mds_rec_join { + struct ll_fid jr_fid; + __u64 jr_headsize; +}; + +extern void lustre_swab_mds_rec_join (struct mds_rec_join *jr); + +struct mdt_rec_join { + struct lu_fid jr_fid; + __u64 jr_headsize; +}; + +extern void lustre_swab_mdt_rec_join (struct mdt_rec_join *jr); + struct mds_rec_link { __u32 lk_opcode; __u32 lk_fsuid; @@ -1761,13 +1776,49 @@ extern void lustre_swab_mdt_rec_reint(struct mdt_rec_reint *rr); struct lmv_desc { __u32 ld_tgt_count; /* how many MDS's */ __u32 ld_active_tgt_count; /* how many active */ + __u32 ld_default_stripe_count; /* how many objects are used */ + __u32 ld_pattern; /* default MEA_MAGIC_* */ + __u64 ld_default_hash_size; + __u64 ld_padding_1; /* also fix lustre_swab_lmv_desc */ + __u32 ld_padding_2; /* also fix lustre_swab_lmv_desc */ + __u32 ld_qos_maxage; /* in second */ + __u32 ld_padding_3; /* also fix lustre_swab_lmv_desc */ + __u32 ld_padding_4; /* also fix lustre_swab_lmv_desc */ struct 
obd_uuid ld_uuid; }; extern void lustre_swab_lmv_desc (struct lmv_desc *ld); +/* TODO: lmv_stripe_md should contain mds capabilities for all slave fids */ +struct lmv_stripe_md { + __u32 mea_magic; + __u32 mea_count; + __u32 mea_master; + __u32 mea_padding; + char mea_pool_name[LOV_MAXPOOLNAME]; + struct lu_fid mea_ids[0]; +}; + +extern void lustre_swab_lmv_stripe_md(struct lmv_stripe_md *mea); + +/* lmv structures */ +#define MEA_MAGIC_LAST_CHAR 0xb2221ca1 +#define MEA_MAGIC_ALL_CHARS 0xb222a11c +#define MEA_MAGIC_HASH_SEGMENT 0xb222a11b + +#define MAX_HASH_SIZE_32 0x7fffffffUL +#define MAX_HASH_SIZE 0x7fffffffffffffffULL +#define MAX_HASH_HIGHEST_BIT 0x1000000000000000ULL + +struct md_fld { + seqno_t mf_seq; + mdsno_t mf_mds; +}; + +extern void lustre_swab_md_fld (struct md_fld *mf); + enum fld_rpc_opc { - FLD_QUERY = 600, + FLD_QUERY = 900, FLD_LAST_OPC, FLD_FIRST_OPC = FLD_QUERY }; @@ -1787,7 +1838,8 @@ enum seq_op { * LOV data structures */ -#define LOV_MIN_STRIPE_SIZE 65536 /* maximum PAGE_SIZE (ia64), power of 2 */ +#define LOV_MIN_STRIPE_BITS 16 /* maximum PAGE_SIZE (ia64), power of 2 */ +#define LOV_MIN_STRIPE_SIZE (1<lmm_stripe_offset = 0; \ + (user_md)->lmm_stripe_count = (mds_md)->lmm_stripe_count; } while(0) + /* Compile with -D_LARGEFILE64_SOURCE or -D_GNU_SOURCE (or #define) to * use this. It is unsafe to #define those values in this header as it * is possible the application has already #included . */ diff --git a/lustre/include/lustre_dlm.h b/lustre/include/lustre_dlm.h index 96c5543..9edf487 100644 --- a/lustre/include/lustre_dlm.h +++ b/lustre/include/lustre_dlm.h @@ -189,7 +189,14 @@ typedef enum { /* Flags sent in AST lock_flags to be mapped into the receiving lock. */ #define LDLM_AST_FLAGS (LDLM_FL_DISCARD_DATA) -/* Used for marking lock as an target for -EINTR while cp_ast sleep situation +/* + * -------------------------------------------------------------------------- + * NOTE! 
Starting from this point, that is, LDLM_FL_* flags with values above + * 0x80000000 will not be sent over the wire. + * -------------------------------------------------------------------------- + */ + +/* Used for marking lock as an target for -EINTR while cp_ast sleep * emulation + race with upcoming bl_ast. */ #define LDLM_FL_FAIL_LOC 0x100000000ULL @@ -370,14 +377,6 @@ typedef enum { } ldlm_appetite_t; /* - * Default value for ->ns_shrink_thumb. If lock is not extent one its cost - * is one page. Here we have 256 pages which is 1M on i386. Thus by default - * all extent locks which have more than 1M long extent will be kept in lru, - * others (including ibits locks) will be canceled on memory pressure event. - */ -#define LDLM_LOCK_SHRINK_THUMB 256 - -/* * Default values for the "max_nolock_size", "contention_time" and * "contended_locks" namespace tunables. */ @@ -444,11 +443,6 @@ struct ldlm_namespace { unsigned int ns_ctime_age_limit; /** - * Lower limit to number of pages in lock to keep it in cache. - */ - unsigned long ns_shrink_thumb; - - /** * Next debug dump, jiffies. */ cfs_time_t ns_next_dump; @@ -645,7 +639,11 @@ struct ldlm_lock { */ cfs_waitq_t l_waitq; - struct timeval l_enqueued_time; + /** + * Seconds. it will be updated if there is any activity related to + * the lock, e.g. enqueue the lock or send block AST. + */ + cfs_time_t l_last_activity; /** * Jiffies. Should be converted to time if needed. @@ -796,13 +794,16 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, __u32 mask, ...) __attribute__ ((format (printf, 4, 5))); -#define LDLM_ERROR(lock, fmt, a...) do { \ +#define LDLM_DEBUG_LIMIT(mask, lock, fmt, a...) do { \ static cfs_debug_limit_state_t _ldlm_cdls; \ - ldlm_lock_debug(&_ldlm_cdls, D_ERROR, lock, \ + ldlm_lock_debug(&_ldlm_cdls, mask, lock, \ __FILE__, __FUNCTION__, __LINE__, \ "### " fmt , ##a); \ } while (0) +#define LDLM_ERROR(lock, fmt, a...) LDLM_DEBUG_LIMIT(D_ERROR, lock, fmt, ## a) +#define LDLM_WARN(lock, fmt, a...) 
LDLM_DEBUG_LIMIT(D_WARNING, lock, fmt, ## a) + #define LDLM_DEBUG(lock, fmt, a...) do { \ ldlm_lock_debug(NULL, D_DLMTRACE, lock, \ __FILE__, __FUNCTION__, __LINE__, \ @@ -963,6 +964,7 @@ int ldlm_lock_addref_try(struct lustre_handle *lockh, __u32 mode); void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode); void ldlm_lock_decref_and_cancel(struct lustre_handle *lockh, __u32 mode); void ldlm_lock_allow_match(struct ldlm_lock *lock); +void ldlm_lock_allow_match_locked(struct ldlm_lock *lock); ldlm_mode_t ldlm_lock_match(struct ldlm_namespace *ns, int flags, const struct ldlm_res_id *, ldlm_type_t type, ldlm_policy_data_t *, ldlm_mode_t mode, diff --git a/lustre/include/lustre_export.h b/lustre/include/lustre_export.h index 94033ef..6ee97d6 100644 --- a/lustre/include/lustre_export.h +++ b/lustre/include/lustre_export.h @@ -92,7 +92,6 @@ struct filter_export_data { int fed_mod_count;/* items in fed_writing list */ long fed_pending; /* bytes just being written */ __u32 fed_group; - struct brw_stats fed_brw_stats; }; typedef struct nid_stat_uuid { @@ -113,6 +112,12 @@ typedef struct nid_stat { int nid_exp_ref_count; }nid_stat_t; +enum obd_option { + OBD_OPT_FORCE = 0x0001, + OBD_OPT_FAILOVER = 0x0002, + OBD_OPT_ABORT_RECOV = 0x0004, +}; + struct obd_export { struct portals_handle exp_handle; atomic_t exp_refcount; @@ -137,7 +142,7 @@ struct obd_export { spinlock_t exp_lock; /* protects flags int below */ /* ^ protects exp_outstanding_replies too */ __u64 exp_connect_flags; - int exp_flags; + enum obd_option exp_flags; unsigned long exp_failed:1, exp_in_recovery:1, exp_disconnected:1, diff --git a/lustre/include/lustre_fid.h b/lustre/include/lustre_fid.h index 7c8085f..921b423 100644 --- a/lustre/include/lustre_fid.h +++ b/lustre/include/lustre_fid.h @@ -57,6 +57,8 @@ struct lu_context; extern const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE; extern const struct lu_seq_range LUSTRE_SEQ_ZERO_RANGE; extern const struct lu_fid LUSTRE_BFL_FID; +extern const struct 
lu_fid LU_OBF_FID; +extern const struct lu_fid LU_DOT_LUSTRE_FID; enum { /* @@ -82,6 +84,9 @@ enum { /** special fid seq: used for local object create. */ #define FID_SEQ_LOCAL_FILE (FID_SEQ_START + 1) +/** special fid seq: used for .lustre objects. */ +#define LU_DOT_LUSTRE_SEQ (FID_SEQ_START + 0x02ULL) + /** special OID for local objects */ enum { /** \see osd_oi_index_create */ diff --git a/lustre/include/lustre_lib.h b/lustre/include/lustre_lib.h index 2ec0f44..a058fda 100644 --- a/lustre/include/lustre_lib.h +++ b/lustre/include/lustre_lib.h @@ -74,8 +74,6 @@ void target_client_add_cb(struct obd_device *obd, __u64 transno, void *cb_data, int target_handle_connect(struct ptlrpc_request *req); int target_handle_disconnect(struct ptlrpc_request *req); void target_destroy_export(struct obd_export *exp); -int target_handle_reconnect(struct lustre_handle *conn, struct obd_export *exp, - struct obd_uuid *cluuid, int); int target_pack_pool_reply(struct ptlrpc_request *req); int target_handle_ping(struct ptlrpc_request *req); void target_committed_to_req(struct ptlrpc_request *req); diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index efef5b4..67efebc 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -636,8 +636,12 @@ struct ptlrpc_bulk_desc { lnet_handle_md_t bd_md_h; /* associated MD */ lnet_nid_t bd_sender; /* stash event::sender */ - cfs_page_t **bd_enc_pages; #if defined(__KERNEL__) + /* + * encrypt iov, size is either 0 or bd_iov_count. 
+ */ + lnet_kiov_t *bd_enc_iov; + lnet_kiov_t bd_iov[0]; #else lnet_md_iovec_t bd_iov[0]; @@ -1267,7 +1271,7 @@ static inline int ptlrpc_req_get_repsize(struct ptlrpc_request *req) int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg); int client_obd_cleanup(struct obd_device *obddev); int client_connect_import(const struct lu_env *env, - struct lustre_handle *conn, struct obd_device *obd, + struct obd_export **exp, struct obd_device *obd, struct obd_uuid *cluuid, struct obd_connect_data *, void *localdata); int client_disconnect_export(struct obd_export *exp); diff --git a/lustre/include/lustre_sec.h b/lustre/include/lustre_sec.h index 57a58c7..50274fc 100644 --- a/lustre/include/lustre_sec.h +++ b/lustre/include/lustre_sec.h @@ -94,99 +94,163 @@ enum sptlrpc_service_type { SPTLRPC_SVC_MAX, }; +enum sptlrpc_bulk_type { + SPTLRPC_BULK_DEFAULT = 0, /* follow rpc flavor */ + SPTLRPC_BULK_HASH = 1, /* hash integrity */ + SPTLRPC_BULK_MAX, +}; + +enum sptlrpc_bulk_service { + SPTLRPC_BULK_SVC_NULL = 0, + SPTLRPC_BULK_SVC_AUTH = 1, + SPTLRPC_BULK_SVC_INTG = 2, + SPTLRPC_BULK_SVC_PRIV = 3, + SPTLRPC_BULK_SVC_MAX, +}; + /* - * rpc flavor compose/extract, represented as 16 bits + * rpc flavor compose/extract, represented as 32 bits. currently the + * high 12 bits are unused, must be set as 0. 
* - * 4b (reserved) | 4b (svc) | 4b (mech) | 4b (policy) + * 4b (bulk svc) | 4b (bulk type) | 4b (svc) | 4b (mech) | 4b (policy) */ -#define RPC_FLVR_POLICY_OFFSET (0) -#define RPC_FLVR_MECH_OFFSET (4) -#define RPC_FLVR_SVC_OFFSET (8) - -#define MAKE_RPC_FLVR(policy, mech, svc) \ - (((__u16)(policy) << RPC_FLVR_POLICY_OFFSET) | \ - ((__u16)(mech) << RPC_FLVR_MECH_OFFSET) | \ - ((__u16)(svc) << RPC_FLVR_SVC_OFFSET)) +#define FLVR_POLICY_OFFSET (0) +#define FLVR_MECH_OFFSET (4) +#define FLVR_SVC_OFFSET (8) +#define FLVR_BULK_TYPE_OFFSET (12) +#define FLVR_BULK_SVC_OFFSET (16) + +#define MAKE_FLVR(policy, mech, svc, btype, bsvc) \ + (((__u32)(policy) << FLVR_POLICY_OFFSET) | \ + ((__u32)(mech) << FLVR_MECH_OFFSET) | \ + ((__u32)(svc) << FLVR_SVC_OFFSET) | \ + ((__u32)(btype) << FLVR_BULK_TYPE_OFFSET) | \ + ((__u32)(bsvc) << FLVR_BULK_SVC_OFFSET)) -#define MAKE_RPC_SUBFLVR(mech, svc) \ - ((__u16)(mech) | \ - ((__u16)(svc) << (RPC_FLVR_SVC_OFFSET - RPC_FLVR_MECH_OFFSET))) - -#define RPC_FLVR_SUB(flavor) \ - ((((__u16)(flavor)) >> RPC_FLVR_MECH_OFFSET) & 0xFF) - -#define RPC_FLVR_POLICY(flavor) \ - ((((__u16)(flavor)) >> RPC_FLVR_POLICY_OFFSET) & 0xF) -#define RPC_FLVR_MECH(flavor) \ - ((((__u16)(flavor)) >> RPC_FLVR_MECH_OFFSET) & 0xF) -#define RPC_FLVR_SVC(flavor) \ - ((((__u16)(flavor)) >> RPC_FLVR_SVC_OFFSET) & 0xF) +/* + * extraction + */ +#define SPTLRPC_FLVR_POLICY(flavor) \ + ((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xF) +#define SPTLRPC_FLVR_MECH(flavor) \ + ((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xF) +#define SPTLRPC_FLVR_SVC(flavor) \ + ((((__u32)(flavor)) >> FLVR_SVC_OFFSET) & 0xF) +#define SPTLRPC_FLVR_BULK_TYPE(flavor) \ + ((((__u32)(flavor)) >> FLVR_BULK_TYPE_OFFSET) & 0xF) +#define SPTLRPC_FLVR_BULK_SVC(flavor) \ + ((((__u32)(flavor)) >> FLVR_BULK_SVC_OFFSET) & 0xF) + +#define SPTLRPC_FLVR_BASE(flavor) \ + ((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xFFF) +#define SPTLRPC_FLVR_BASE_SUB(flavor) \ + ((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xFF) 
/* * gss subflavors */ +#define MAKE_BASE_SUBFLVR(mech, svc) \ + ((__u32)(mech) | \ + ((__u32)(svc) << (FLVR_SVC_OFFSET - FLVR_MECH_OFFSET))) + #define SPTLRPC_SUBFLVR_KRB5N \ - MAKE_RPC_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_NULL) + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_NULL) #define SPTLRPC_SUBFLVR_KRB5A \ - MAKE_RPC_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_AUTH) + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_AUTH) #define SPTLRPC_SUBFLVR_KRB5I \ - MAKE_RPC_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_INTG) + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_INTG) #define SPTLRPC_SUBFLVR_KRB5P \ - MAKE_RPC_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_PRIV) + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_PRIV) /* * "end user" flavors */ #define SPTLRPC_FLVR_NULL \ - MAKE_RPC_FLVR(SPTLRPC_POLICY_NULL, \ - SPTLRPC_MECH_NULL, \ - SPTLRPC_SVC_NULL) + MAKE_FLVR(SPTLRPC_POLICY_NULL, \ + SPTLRPC_MECH_NULL, \ + SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) #define SPTLRPC_FLVR_PLAIN \ - MAKE_RPC_FLVR(SPTLRPC_POLICY_PLAIN, \ - SPTLRPC_MECH_PLAIN, \ - SPTLRPC_SVC_NULL) + MAKE_FLVR(SPTLRPC_POLICY_PLAIN, \ + SPTLRPC_MECH_PLAIN, \ + SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_HASH, \ + SPTLRPC_BULK_SVC_INTG) #define SPTLRPC_FLVR_KRB5N \ - MAKE_RPC_FLVR(SPTLRPC_POLICY_GSS, \ - SPTLRPC_MECH_GSS_KRB5, \ - SPTLRPC_SVC_NULL) + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) #define SPTLRPC_FLVR_KRB5A \ - MAKE_RPC_FLVR(SPTLRPC_POLICY_GSS, \ - SPTLRPC_MECH_GSS_KRB5, \ - SPTLRPC_SVC_AUTH) + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_AUTH, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) #define SPTLRPC_FLVR_KRB5I \ - MAKE_RPC_FLVR(SPTLRPC_POLICY_GSS, \ - SPTLRPC_MECH_GSS_KRB5, \ - SPTLRPC_SVC_INTG) + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_INTG, \ + SPTLRPC_BULK_DEFAULT, \ + 
SPTLRPC_BULK_SVC_INTG) #define SPTLRPC_FLVR_KRB5P \ - MAKE_RPC_FLVR(SPTLRPC_POLICY_GSS, \ - SPTLRPC_MECH_GSS_KRB5, \ - SPTLRPC_SVC_PRIV) - -#define SPTLRPC_FLVR_ANY ((__u16) 0xf000) -#define SPTLRPC_FLVR_INVALID ((__u16) 0xffff) + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_PRIV, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_PRIV) #define SPTLRPC_FLVR_DEFAULT SPTLRPC_FLVR_NULL +#define SPTLRPC_FLVR_INVALID ((__u32) 0xFFFFFFFF) +#define SPTLRPC_FLVR_ANY ((__u32) 0xFFF00000) + /* - * 32 bits wire flavor (msg->lm_secflvr), lower 12 bits is the rpc flavor, - * higher 20 bits is not defined right now. + * extract the useful part from wire flavor */ -#define WIRE_FLVR_RPC(wflvr) (((__u16) (wflvr)) & 0x0FFF) +#define WIRE_FLVR(wflvr) (((__u32) (wflvr)) & 0x000FFFFF) -static inline void rpc_flvr_set_svc(__u16 *flvr, __u16 svc) +static inline void flvr_set_svc(__u32 *flvr, __u32 svc) { LASSERT(svc < SPTLRPC_SVC_MAX); - *flvr = MAKE_RPC_FLVR(RPC_FLVR_POLICY(*flvr), - RPC_FLVR_MECH(*flvr), - svc); + *flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(*flvr), + SPTLRPC_FLVR_MECH(*flvr), + svc, + SPTLRPC_FLVR_BULK_TYPE(*flvr), + SPTLRPC_FLVR_BULK_SVC(*flvr)); } +static inline void flvr_set_bulk_svc(__u32 *flvr, __u32 svc) +{ + LASSERT(svc < SPTLRPC_BULK_SVC_MAX); + *flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(*flvr), + SPTLRPC_FLVR_MECH(*flvr), + SPTLRPC_FLVR_SVC(*flvr), + SPTLRPC_FLVR_BULK_TYPE(*flvr), + svc); +} + +struct bulk_spec_hash { + __u8 hash_alg; +}; struct sptlrpc_flavor { - __u16 sf_rpc; /* rpc flavor */ - __u8 sf_bulk_ciph; /* bulk cipher alg */ - __u8 sf_bulk_hash; /* bulk hash alg */ + __u32 sf_rpc; /* wire flavor - should be renamed to sf_wire */ __u32 sf_flags; /* general flags */ + /* + * rpc flavor specification + */ + union { + /* nothing for now */ + } u_rpc; + /* + * bulk flavor specification + */ + union { + struct bulk_spec_hash hash; + } u_bulk; }; enum lustre_sec_part { @@ -216,6 +280,7 @@ struct sptlrpc_rule_set { }; int 
sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr); +int sptlrpc_flavor_has_bulk(struct sptlrpc_flavor *flvr); static inline void sptlrpc_rule_set_init(struct sptlrpc_rule_set *set) { @@ -223,10 +288,9 @@ static inline void sptlrpc_rule_set_init(struct sptlrpc_rule_set *set) } void sptlrpc_rule_set_free(struct sptlrpc_rule_set *set); -int sptlrpc_rule_set_expand(struct sptlrpc_rule_set *set, int expand); +int sptlrpc_rule_set_expand(struct sptlrpc_rule_set *set); int sptlrpc_rule_set_merge(struct sptlrpc_rule_set *set, - struct sptlrpc_rule *rule, - int expand); + struct sptlrpc_rule *rule); int sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset, enum lustre_sec_part from, enum lustre_sec_part to, @@ -396,10 +460,12 @@ struct ptlrpc_sec_sops { int msgsize); void (*free_rs) (struct ptlrpc_reply_state *rs); void (*free_ctx) (struct ptlrpc_svc_ctx *ctx); - /* reverse credential */ + /* reverse context */ int (*install_rctx)(struct obd_import *imp, struct ptlrpc_svc_ctx *ctx); /* bulk transform */ + int (*prep_bulk) (struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); int (*unwrap_bulk) (struct ptlrpc_request *req, struct ptlrpc_bulk_desc *desc); int (*wrap_bulk) (struct ptlrpc_request *req, @@ -481,55 +547,30 @@ enum sptlrpc_bulk_hash_alg { BULK_HASH_ALG_SHA256, BULK_HASH_ALG_SHA384, BULK_HASH_ALG_SHA512, - BULK_HASH_ALG_WP256, - BULK_HASH_ALG_WP384, - BULK_HASH_ALG_WP512, BULK_HASH_ALG_MAX }; -enum sptlrpc_bulk_cipher_alg { - BULK_CIPH_ALG_NULL = 0, - BULK_CIPH_ALG_ARC4, - BULK_CIPH_ALG_AES128, - BULK_CIPH_ALG_AES192, - BULK_CIPH_ALG_AES256, - BULK_CIPH_ALG_CAST128, - BULK_CIPH_ALG_CAST256, - BULK_CIPH_ALG_TWOFISH128, - BULK_CIPH_ALG_TWOFISH256, - BULK_CIPH_ALG_MAX -}; - struct sptlrpc_hash_type { char *sht_name; char *sht_tfm_name; unsigned int sht_size; }; -struct sptlrpc_ciph_type { - char *sct_name; - char *sct_tfm_name; - __u32 sct_tfm_flags; - unsigned int sct_ivsize; - unsigned int sct_keysize; -}; - const struct sptlrpc_hash_type 
*sptlrpc_get_hash_type(__u8 hash_alg); const char * sptlrpc_get_hash_name(__u8 hash_alg); -const struct sptlrpc_ciph_type *sptlrpc_get_ciph_type(__u8 ciph_alg); -const char *sptlrpc_get_ciph_name(__u8 ciph_alg); +__u8 sptlrpc_get_hash_alg(const char *algname); -#define CIPHER_MAX_BLKSIZE (16) -#define CIPHER_MAX_KEYSIZE (64) +enum { + BSD_FL_ERR = 1, +}; struct ptlrpc_bulk_sec_desc { - __u8 bsd_version; - __u8 bsd_flags; - __u8 bsd_pad[4]; - __u8 bsd_hash_alg; /* hash algorithm */ - __u8 bsd_ciph_alg; /* cipher algorithm */ - __u8 bsd_key[CIPHER_MAX_KEYSIZE]; /* encrypt key seed */ - __u8 bsd_csum[0]; + __u8 bsd_version; /* 0 */ + __u8 bsd_type; /* SPTLRPC_BULK_XXX */ + __u8 bsd_svc; /* SPTLRPC_BULK_SVC_XXXX */ + __u8 bsd_flags; /* flags */ + __u32 bsd_nob; /* nob of bulk data */ + __u8 bsd_data[0]; /* policy-specific token */ }; @@ -567,9 +608,12 @@ void _sptlrpc_enlarge_msg_inplace(struct lustre_msg *msg, int sptlrpc_register_policy(struct ptlrpc_sec_policy *policy); int sptlrpc_unregister_policy(struct ptlrpc_sec_policy *policy); -__u16 sptlrpc_name2rpcflavor(const char *name); -const char *sptlrpc_rpcflavor2name(__u16 flavor); -int sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize); +__u32 sptlrpc_name2flavor_base(const char *name); +const char *sptlrpc_flavor2name_base(__u32 flvr); +char *sptlrpc_flavor2name_bulk(struct sptlrpc_flavor *sf, + char *buf, int bufsize); +char *sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize); +char *sptlrpc_secflags2str(__u32 flags, char *buf, int bufsize); static inline struct ptlrpc_sec_policy *sptlrpc_policy_get(struct ptlrpc_sec_policy *policy) @@ -672,7 +716,7 @@ void sptlrpc_request_out_callback(struct ptlrpc_request *req); */ int sptlrpc_import_sec_adapt(struct obd_import *imp, struct ptlrpc_svc_ctx *ctx, - __u16 rpc_flavor); + struct sptlrpc_flavor *flvr); struct ptlrpc_sec *sptlrpc_import_sec_ref(struct obd_import *imp); void sptlrpc_import_sec_put(struct obd_import *imp); @@ 
-737,15 +781,23 @@ void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc); int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req, struct ptlrpc_bulk_desc *desc); int sptlrpc_cli_unwrap_bulk_read(struct ptlrpc_request *req, - int nob, obd_count pg_count, - struct brw_page **pga); + struct ptlrpc_bulk_desc *desc, + int nob); int sptlrpc_cli_unwrap_bulk_write(struct ptlrpc_request *req, struct ptlrpc_bulk_desc *desc); +int sptlrpc_svc_prep_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); int sptlrpc_svc_wrap_bulk(struct ptlrpc_request *req, struct ptlrpc_bulk_desc *desc); int sptlrpc_svc_unwrap_bulk(struct ptlrpc_request *req, struct ptlrpc_bulk_desc *desc); +/* bulk helpers (internal use only by policies) */ +int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, + void *buf, int buflen); + +int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset); + /* user descriptor helpers */ static inline int sptlrpc_user_desc_size(int ngroups) { @@ -756,18 +808,6 @@ int sptlrpc_current_user_desc_size(void); int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset); int sptlrpc_unpack_user_desc(struct lustre_msg *msg, int offset); -/* bulk helpers (internal use only by policies) */ -int bulk_sec_desc_size(__u8 hash_alg, int request, int read); -int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset); - -int bulk_csum_cli_request(struct ptlrpc_bulk_desc *desc, int read, - __u32 alg, struct lustre_msg *rmsg, int roff); -int bulk_csum_cli_reply(struct ptlrpc_bulk_desc *desc, int read, - struct lustre_msg *rmsg, int roff, - struct lustre_msg *vmsg, int voff); -int bulk_csum_svc(struct ptlrpc_bulk_desc *desc, int read, - struct ptlrpc_bulk_sec_desc *bsdv, int vsize, - struct ptlrpc_bulk_sec_desc *bsdr, int rsize); #define CFS_CAP_CHOWN_MASK (1 << CFS_CAP_CHOWN) #define CFS_CAP_SYS_RESOURCE_MASK (1 << CFS_CAP_SYS_RESOURCE) diff --git a/lustre/include/obd.h b/lustre/include/obd.h index 12daa90..f4ee771 100644 --- 
a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -648,6 +648,7 @@ struct lov_qos { }; struct lov_tgt_desc { + struct list_head ltd_kill; struct obd_uuid ltd_uuid; struct obd_export *ltd_exp; struct ltd_qos ltd_qos; /* qos info per target */ @@ -857,6 +858,8 @@ static inline void oti_free_cookies(struct obd_trans_info *oti) * Events signalled through obd_notify() upcall-chain. */ enum obd_notify_event { + /* Device connect start */ + OBD_NOTIFY_CONNECT, /* Device activated */ OBD_NOTIFY_ACTIVE, /* Device deactivated */ @@ -1075,9 +1078,6 @@ struct obd_device { struct lu_ref obd_reference; }; -#define OBD_OPT_FORCE 0x0001 -#define OBD_OPT_FAILOVER 0x0002 - #define OBD_LLOG_FL_SENDNOW 0x0001 enum obd_cleanup_stage { @@ -1111,7 +1111,7 @@ enum obd_cleanup_stage { #define KEY_CLEAR_FS "clear_fs" #define KEY_BLOCKSIZE "blocksize" #define KEY_BLOCKSIZE_BITS "blocksize_bits" -#define KEY_FIEMAP "FIEMAP" +#define KEY_FIEMAP "fiemap" #define KEY_SPTLRPC_CONF "sptlrpc_conf" #define KEY_MGSSEC "mgssec" /* XXX unused ?*/ @@ -1217,7 +1217,7 @@ struct obd_ops { * granted by the target, which are guaranteed to be a subset of flags * asked for. If @ocd == NULL, use default parameters. */ int (*o_connect)(const struct lu_env *env, - struct lustre_handle *conn, struct obd_device *src, + struct obd_export **exp, struct obd_device *src, struct obd_uuid *cluuid, struct obd_connect_data *ocd, void *localdata); int (*o_reconnect)(const struct lu_env *env, @@ -1361,15 +1361,6 @@ struct obd_ops { * Also, add a wrapper function in include/linux/obd_class.h. 
*/ }; -/* TODO: lmv_stripe_md should contain mds capabilities for all slave fids */ -struct lmv_stripe_md { - __u32 mea_magic; - __u32 mea_count; - __u32 mea_master; - __u32 mea_padding; - struct lu_fid mea_ids[0]; -}; - enum { LUSTRE_OPC_MKDIR = (1 << 0), LUSTRE_OPC_SYMLINK = (1 << 1), diff --git a/lustre/include/obd_class.h b/lustre/include/obd_class.h index 1bc75e1..b47d60b 100644 --- a/lustre/include/obd_class.h +++ b/lustre/include/obd_class.h @@ -206,9 +206,18 @@ int class_connect(struct lustre_handle *conn, struct obd_device *obd, int class_disconnect(struct obd_export *exp); void class_fail_export(struct obd_export *exp); void class_disconnect_exports(struct obd_device *obddev); -int class_disconnect_stale_exports(struct obd_device *, - int (*test_export)(struct obd_export *)); int class_manual_cleanup(struct obd_device *obd); +int class_disconnect_stale_exports(struct obd_device *, + int (*test_export)(struct obd_export *), + enum obd_option flags); + +static inline enum obd_option exp_flags_from_obd(struct obd_device *obd) +{ + return ((obd->obd_fail ? OBD_OPT_FAILOVER : 0) | + (obd->obd_force ? OBD_OPT_FORCE : 0) | + (obd->obd_abort_recovery ? 
OBD_OPT_ABORT_RECOV : 0) | + 0); +} void obdo_cpy_md(struct obdo *dst, struct obdo *src, obd_flag valid); void obdo_to_ioobj(struct obdo *oa, struct obd_ioobj *ioobj); @@ -813,7 +822,7 @@ static inline struct obd_uuid *obd_get_uuid(struct obd_export *exp) } static inline int obd_connect(const struct lu_env *env, - struct lustre_handle *conn,struct obd_device *obd, + struct obd_export **exp,struct obd_device *obd, struct obd_uuid *cluuid, struct obd_connect_data *d, void *localdata) @@ -827,7 +836,7 @@ static inline int obd_connect(const struct lu_env *env, OBD_CHECK_DT_OP(obd, connect, -EOPNOTSUPP); OBD_COUNTER_INCREMENT(obd, connect); - rc = OBP(obd, connect)(env, conn, obd, cluuid, d, localdata); + rc = OBP(obd, connect)(env, exp, obd, cluuid, d, localdata); /* check that only subset is granted */ LASSERT(ergo(d != NULL, (d->ocd_connect_flags & ocf) == d->ocd_connect_flags)); diff --git a/lustre/kernel_patches/patches/md-mmp-unplug-dev-sles10.patch b/lustre/kernel_patches/patches/md-mmp-unplug-dev-sles10.patch new file mode 100644 index 0000000..8bfdef3 --- /dev/null +++ b/lustre/kernel_patches/patches/md-mmp-unplug-dev-sles10.patch @@ -0,0 +1,22 @@ +Index: linux-2.6.16.60-0.33/drivers/md/raid5.c +=================================================================== +--- linux-2.6.16.60-0.33.orig/drivers/md/raid5.c ++++ linux-2.6.16.60-0.33/drivers/md/raid5.c +@@ -900,6 +900,8 @@ static int add_stripe_bio(struct stripe_ + bi->bi_next = *bip; + *bip = bi; + bi->bi_phys_segments ++; ++ if (bio_sync(bi) && !forwrite) ++ clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); /* force to read from disk. 
*/ + spin_unlock_irq(&conf->device_lock); + spin_unlock(&sh->lock); + +@@ -1617,6 +1619,8 @@ static int make_request (request_queue_t + bi->bi_end_io(bi, bytes, 0); + } + spin_unlock_irq(&conf->device_lock); ++ if (bio_sync(bi)) ++ raid5_unplug_device(q); + return 0; + } + diff --git a/lustre/kernel_patches/patches/md-mmp-unplug-dev.patch b/lustre/kernel_patches/patches/md-mmp-unplug-dev.patch new file mode 100644 index 0000000..0334abd --- /dev/null +++ b/lustre/kernel_patches/patches/md-mmp-unplug-dev.patch @@ -0,0 +1,22 @@ +Index: linux-2.6.22.14/drivers/md/raid5.c +=================================================================== +--- linux-2.6.22.14.orig/drivers/md/raid5.c ++++ linux-2.6.22.14/drivers/md/raid5.c +@@ -1268,6 +1268,8 @@ static int add_stripe_bio(struct stripe_ + bi->bi_next = *bip; + *bip = bi; + bi->bi_phys_segments ++; ++ if (bio_sync(bi) && !forwrite) ++ clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); /* force to read from disk. */ + spin_unlock_irq(&conf->device_lock); + spin_unlock(&sh->lock); + +@@ -2972,6 +2974,8 @@ static int make_request(request_queue_t + test_bit(BIO_UPTODATE, &bi->bi_flags) + ? 
0 : -EIO); + } ++ if (bio_sync(bi)) ++ raid5_unplug_device(q); + return 0; + } + diff --git a/lustre/kernel_patches/series/2.6-rhel5.series b/lustre/kernel_patches/series/2.6-rhel5.series index 0fc2b97..97060fd 100644 --- a/lustre/kernel_patches/series/2.6-rhel5.series +++ b/lustre/kernel_patches/series/2.6-rhel5.series @@ -21,3 +21,4 @@ md-rebuild-policy.patch md-soft-lockups.patch jbd-journal-chksum-2.6.18-vanilla.patch quota-large-limits-rhel5.patch +md-mmp-unplug-dev.patch diff --git a/lustre/kernel_patches/series/2.6-sles10.series b/lustre/kernel_patches/series/2.6-sles10.series index 070f943..dc85e99 100644 --- a/lustre/kernel_patches/series/2.6-sles10.series +++ b/lustre/kernel_patches/series/2.6-sles10.series @@ -16,3 +16,4 @@ proc-sleep-2.6.16-sles10.patch export-nr_free_buffer_pages.patch fmode-exec-2.6-sles10.patch quota-large-limits-sles10.patch +md-mmp-unplug-dev-sles10.patch diff --git a/lustre/kernel_patches/series/2.6.22-vanilla.series b/lustre/kernel_patches/series/2.6.22-vanilla.series index fe32803..6fad0bd 100644 --- a/lustre/kernel_patches/series/2.6.22-vanilla.series +++ b/lustre/kernel_patches/series/2.6.22-vanilla.series @@ -12,3 +12,4 @@ export-2.6.18-vanilla.patch export-show_task-2.6.18-vanilla.patch sd_iostats-2.6.22-vanilla.patch quota-large-limits-rhel5.patch +md-mmp-unplug-dev.patch diff --git a/lustre/lclient/glimpse.c b/lustre/lclient/glimpse.c index 78acee6..ed81f15 100644 --- a/lustre/lclient/glimpse.c +++ b/lustre/lclient/glimpse.c @@ -118,11 +118,17 @@ int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io, *descr = whole_file; descr->cld_obj = clob; descr->cld_mode = CLM_PHANTOM; - /* The lockreq for glimpse should be mandatory, - * otherwise, osc may decide to use lockless */ - io->ci_lockreq = CILR_MANDATORY; cio->cui_glimpse = 1; - lock = cl_lock_request(env, io, descr, CEF_ASYNC, + /* + * CEF_ASYNC is used because glimpse sub-locks cannot + * deadlock (because they never conflict with other + * locks) and, hence, 
can be enqueued out-of-order. + * + * CEF_MUST protects glimpse lock from conversion into + * a lockless mode. + */ + lock = cl_lock_request(env, io, descr, + CEF_ASYNC|CEF_MUST, "glimpse", cfs_current()); cio->cui_glimpse = 0; if (!IS_ERR(lock)) { diff --git a/lustre/lclient/lcommon_cl.c b/lustre/lclient/lcommon_cl.c index 6b56b4e..d68ba37 100644 --- a/lustre/lclient/lcommon_cl.c +++ b/lustre/lclient/lcommon_cl.c @@ -635,7 +635,7 @@ int ccc_lock_fits_into(const struct lu_env *env, } /** - * Implements cl_lock_operations::clo_state() method for vvp layer, invoked + * Implements cl_lock_operations::clo_state() method for ccc layer, invoked * whenever lock state changes. Transfers object attributes, that might be * updated as a result of lock acquiring into inode. */ diff --git a/lustre/ldlm/Makefile.am b/lustre/ldlm/Makefile.am index 600c6792..57897f2 100644 --- a/lustre/ldlm/Makefile.am +++ b/lustre/ldlm/Makefile.am @@ -39,7 +39,7 @@ # MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ -DIST_SOURCES = ldlm_extent.c ldlm_flock.c ldlm_internal.h ldlm_lib.c \ +EXTRA_DIST = ldlm_extent.c ldlm_flock.c ldlm_internal.h ldlm_lib.c \ ldlm_lock.c ldlm_lockd.c ldlm_plain.c ldlm_request.c \ ldlm_resource.c l_lock.c ldlm_inodebits.c ldlm_pool.c \ interval_tree.c diff --git a/lustre/ldlm/ldlm_internal.h b/lustre/ldlm/ldlm_internal.h index 5ff07e7..c01a702 100644 --- a/lustre/ldlm/ldlm_internal.h +++ b/lustre/ldlm/ldlm_internal.h @@ -79,8 +79,6 @@ int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, ldlm_sync_t sync, int flags); int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels, int count, int max, int cancel_flags, int flags); -int ldlm_cancel_lru_estimate(struct ldlm_namespace *ns, int count, int max, - int flags); extern int ldlm_enqueue_min; int ldlm_get_enq_timeout(struct ldlm_lock *lock); diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index a71aa3d..b400a27 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -385,27 +385,29 
@@ int client_obd_cleanup(struct obd_device *obddev) /* ->o_connect() method for client side (OSC and MDC and MGC) */ int client_connect_import(const struct lu_env *env, - struct lustre_handle *dlm_handle, + struct obd_export **exp, struct obd_device *obd, struct obd_uuid *cluuid, struct obd_connect_data *data, void *localdata) { struct client_obd *cli = &obd->u.cli; struct obd_import *imp = cli->cl_import; - struct obd_export *exp; struct obd_connect_data *ocd; struct ldlm_namespace *to_be_freed = NULL; + struct lustre_handle conn = { 0 }; int rc; ENTRY; + *exp = NULL; down_write(&cli->cl_sem); - rc = class_connect(dlm_handle, obd, cluuid); + rc = class_connect(&conn, obd, cluuid); if (rc) GOTO(out_sem, rc); + *exp = class_conn2export(&conn); + cli->cl_conn_count++; if (cli->cl_conn_count > 1) GOTO(out_sem, rc); - exp = class_conn2export(dlm_handle); if (obd->obd_namespace != NULL) CERROR("already have namespace!\n"); @@ -415,7 +417,7 @@ int client_connect_import(const struct lu_env *env, if (obd->obd_namespace == NULL) GOTO(out_disco, rc = -ENOMEM); - imp->imp_dlm_handle = *dlm_handle; + imp->imp_dlm_handle = conn; rc = ptlrpc_init_import(imp); if (rc != 0) GOTO(out_ldlm, rc); @@ -431,7 +433,7 @@ int client_connect_import(const struct lu_env *env, LASSERT (imp->imp_state == LUSTRE_IMP_DISCON); GOTO(out_ldlm, rc); } - LASSERT(exp->exp_connection); + LASSERT((*exp)->exp_connection); if (data) { LASSERTF((ocd->ocd_connect_flags & data->ocd_connect_flags) == @@ -451,9 +453,8 @@ out_ldlm: obd->obd_namespace = NULL; out_disco: cli->cl_conn_count--; - class_disconnect(exp); - } else { - class_export_put(exp); + class_disconnect(*exp); + *exp = NULL; } out_sem: up_write(&cli->cl_sem); @@ -513,7 +514,13 @@ int client_disconnect_export(struct obd_export *exp) to_be_freed = obd->obd_namespace; } + /* + * there's no necessary to hold sem during diconnecting an import, + * and actually it may cause deadlock in gss. 
+ */ + up_write(&cli->cl_sem); rc = ptlrpc_disconnect_import(imp, 0); + down_write(&cli->cl_sem); ptlrpc_invalidate_import(imp); /* set obd_namespace to NULL only after invalidate, because we can have @@ -545,11 +552,12 @@ int client_disconnect_export(struct obd_export *exp) * from old lib/target.c * -------------------------------------------------------------------------- */ -int target_handle_reconnect(struct lustre_handle *conn, struct obd_export *exp, - struct obd_uuid *cluuid, int initial_conn) +static int target_handle_reconnect(struct lustre_handle *conn, + struct obd_export *exp, + struct obd_uuid *cluuid) { ENTRY; - if (exp->exp_connection && exp->exp_imp_reverse && !initial_conn) { + if (exp->exp_connection && exp->exp_imp_reverse) { struct lustre_handle *hdl; hdl = &exp->exp_imp_reverse->imp_remote_handle; /* Might be a re-connect after a partition. */ @@ -611,7 +619,7 @@ int target_handle_connect(struct ptlrpc_request *req) struct obd_uuid remote_uuid; char *str; int rc = 0; - int initial_conn = 0; + int mds_conn = 0; struct obd_connect_data *data, *tmpdata; lnet_nid_t *client_nid = NULL; ENTRY; @@ -717,17 +725,27 @@ int target_handle_connect(struct ptlrpc_request *req) } } - if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_INITIAL) - initial_conn = 1; + if ((lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_INITIAL) && + (data->ocd_connect_flags & OBD_CONNECT_MDS)) + mds_conn = 1; /* lctl gets a backstage, all-access pass. */ if (obd_uuid_equals(&cluuid, &target->obd_uuid)) goto dont_check_exports; - spin_lock(&target->obd_dev_lock); export = lustre_hash_lookup(target->obd_uuid_hash, &cluuid); - if (export != NULL && export->exp_connecting) { /* bug 9635, et. al. 
*/ + if (export != NULL && mds_conn) { + /* mds reconnected after failover */ + class_fail_export(export); + CWARN("%s: received MDS connection from NID %s," + " removing former export from NID %s\n", + target->obd_name, libcfs_nid2str(req->rq_peer.nid), + libcfs_nid2str(export->exp_connection->c_peer.nid)); + class_export_put(export); + export = NULL; + rc = 0; + } else if (export != NULL && export->exp_connecting) { /* bug 9635, et. al. */ CWARN("%s: exp %p already connecting\n", export->exp_obd->obd_name, export); class_export_put(export); @@ -735,25 +753,14 @@ int target_handle_connect(struct ptlrpc_request *req) rc = -EALREADY; } else if (export != NULL && export->exp_connection != NULL && req->rq_peer.nid != export->exp_connection->c_peer.nid) { - /* make darn sure this is coming from the same peer - * if the UUIDs matched */ - if (data && data->ocd_connect_flags & OBD_CONNECT_MDS) { - /* the MDS UUID can be reused, don't need to wait - * for the export to be evicted */ - CWARN("%s: received MDS connection from a new NID %s," - " removing former export from NID %s\n", - target->obd_name, - libcfs_nid2str(req->rq_peer.nid), - libcfs_nid2str(export->exp_connection->c_peer.nid)); - class_fail_export(export); - } else { - CWARN("%s: cookie %s seen on new NID %s when " - "existing NID %s is already connected\n", - target->obd_name, cluuid.uuid, - libcfs_nid2str(req->rq_peer.nid), - libcfs_nid2str(export->exp_connection->c_peer.nid)); - rc = -EALREADY; - } + /* in mds failover we have static uuid but nid can be + * changed*/ + CWARN("%s: cookie %s seen on new NID %s when " + "existing NID %s is already connected\n", + target->obd_name, cluuid.uuid, + libcfs_nid2str(req->rq_peer.nid), + libcfs_nid2str(export->exp_connection->c_peer.nid)); + rc = -EALREADY; class_export_put(export); export = NULL; } else if (export != NULL) { @@ -761,15 +768,13 @@ int target_handle_connect(struct ptlrpc_request *req) export->exp_connecting = 1; spin_unlock(&export->exp_lock); 
class_export_put(export); - spin_unlock(&target->obd_dev_lock); LASSERT(export->exp_obd == target); - rc = target_handle_reconnect(&conn, export, &cluuid, initial_conn); + rc = target_handle_reconnect(&conn, export, &cluuid); } /* If we found an export, we already unlocked. */ if (!export) { - spin_unlock(&target->obd_dev_lock); OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_CONNECT, 2 * obd_timeout); } else if (req->rq_export == NULL && atomic_read(&export->exp_rpc_count) > 0) { @@ -785,18 +790,13 @@ int target_handle_connect(struct ptlrpc_request *req) libcfs_nid2str(req->rq_peer.nid), export, atomic_read(&export->exp_rpc_count)); GOTO(out, rc = -EBUSY); - } else if (lustre_msg_get_conn_cnt(req->rq_reqmsg) == 1 && - !initial_conn) { + } else if (lustre_msg_get_conn_cnt(req->rq_reqmsg) == 1) { CERROR("%s: NID %s (%s) reconnected with 1 conn_cnt; " "cookies not random?\n", target->obd_name, libcfs_nid2str(req->rq_peer.nid), cluuid.uuid); GOTO(out, rc = -EALREADY); } else { OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_RECONNECT, 2 * obd_timeout); - if (req->rq_export == NULL && initial_conn) - export->exp_last_request_time = - max(export->exp_last_request_time, - (time_t)cfs_time_current_sec()); } if (rc < 0) { @@ -849,12 +849,17 @@ int target_handle_connect(struct ptlrpc_request *req) } else { dont_check_exports: rc = obd_connect(req->rq_svc_thread->t_env, - &conn, target, &cluuid, data, + &export, target, &cluuid, data, client_nid); + if (rc == 0) + conn.cookie = export->exp_handle.h_cookie; } } else { rc = obd_reconnect(req->rq_svc_thread->t_env, export, target, &cluuid, data, client_nid); + if (rc == 0) + /* prevous done via class_conn2export */ + class_export_get(export); } if (rc) GOTO(out, rc); @@ -872,15 +877,6 @@ dont_check_exports: lustre_msg_set_handle(req->rq_repmsg, &conn); - /* ownership of this export ref transfers to the request AFTER we - * drop any previous reference the request had, but we don't want - * that to go to zero before we get our new export reference. 
*/ - export = class_conn2export(&conn); - if (!export) { - DEBUG_REQ(D_ERROR, req, "Missing export!"); - GOTO(out, rc = -ENODEV); - } - /* If the client and the server are the same node, we will already * have an export that really points to the client's DLM export, * because we have a shared handles table. @@ -894,9 +890,7 @@ dont_check_exports: req->rq_export = export; spin_lock(&export->exp_lock); - if (initial_conn) { - lustre_msg_set_conn_cnt(req->rq_repmsg, export->exp_conn_cnt + 1); - } else if (export->exp_conn_cnt >= lustre_msg_get_conn_cnt(req->rq_reqmsg)) { + if (export->exp_conn_cnt >= lustre_msg_get_conn_cnt(req->rq_reqmsg)) { spin_unlock(&export->exp_lock); CERROR("%s: %s already connected at higher conn_cnt: %d > %d\n", cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), @@ -1003,8 +997,7 @@ dont_check_exports: else revimp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT; - rc = sptlrpc_import_sec_adapt(revimp, req->rq_svc_ctx, - req->rq_flvr.sf_rpc); + rc = sptlrpc_import_sec_adapt(revimp, req->rq_svc_ctx, &req->rq_flvr); if (rc) { CERROR("Failed to get sec for reverse import: %d\n", rc); export->exp_imp_reverse = NULL; @@ -1676,7 +1669,9 @@ static int target_recovery_thread(void *arg) "evict them\n", obd->obd_connected_clients, obd->obd_max_recoverable_clients); obd->obd_abort_recovery = obd->obd_stopping; - class_disconnect_stale_exports(obd, connect_done); + class_disconnect_stale_exports(obd, connect_done, + exp_flags_from_obd(obd) | + OBD_OPT_ABORT_RECOV); } /* next stage: replay requests */ delta = jiffies; @@ -1706,7 +1701,9 @@ static int target_recovery_thread(void *arg) if (obd->obd_abort_recovery) { CDEBUG(D_ERROR, "req replay timed out, aborting ...\n"); obd->obd_abort_recovery = obd->obd_stopping; - class_disconnect_stale_exports(obd, req_replay_done); + class_disconnect_stale_exports(obd, req_replay_done, + exp_flags_from_obd(obd) | + OBD_OPT_ABORT_RECOV); abort_req_replay_queue(obd); } @@ -1731,7 +1728,9 @@ static int target_recovery_thread(void 
*arg) int stale; CERROR("lock replay timed out, aborting ...\n"); obd->obd_abort_recovery = obd->obd_stopping; - stale = class_disconnect_stale_exports(obd, lock_replay_done); + stale = class_disconnect_stale_exports(obd, lock_replay_done, + exp_flags_from_obd(obd) | + OBD_OPT_ABORT_RECOV); abort_lock_replay_queue(obd); } diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c index c0c566a..b3b36a8 100644 --- a/lustre/ldlm/ldlm_lock.c +++ b/lustre/ldlm/ldlm_lock.c @@ -187,8 +187,8 @@ int ldlm_lock_remove_from_lru_nolock(struct ldlm_lock *lock) struct ldlm_namespace *ns = lock->l_resource->lr_namespace; LASSERT(lock->l_resource->lr_type != LDLM_FLOCK); list_del_init(&lock->l_lru); + LASSERT(ns->ns_nr_unused > 0); ns->ns_nr_unused--; - LASSERT(ns->ns_nr_unused >= 0); rc = 1; } return rc; @@ -998,11 +998,16 @@ static struct ldlm_lock *search_queue(struct list_head *queue, return NULL; } -void ldlm_lock_allow_match(struct ldlm_lock *lock) +void ldlm_lock_allow_match_locked(struct ldlm_lock *lock) { - lock_res_and_lock(lock); lock->l_flags |= LDLM_FL_LVB_READY; cfs_waitq_signal(&lock->l_waitq); +} + +void ldlm_lock_allow_match(struct ldlm_lock *lock) +{ + lock_res_and_lock(lock); + ldlm_lock_allow_match_locked(lock); unlock_res_and_lock(lock); } @@ -1211,7 +1216,7 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns, struct ldlm_interval *node = NULL; ENTRY; - do_gettimeofday(&lock->l_enqueued_time); + lock->l_last_activity = cfs_time_current_sec(); /* policies are not executed on the client or during replay */ if ((*flags & (LDLM_FL_HAS_INTENT|LDLM_FL_REPLAY)) == LDLM_FL_HAS_INTENT && !local && ns->ns_policy) { diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index b849987..58875a9 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -336,7 +336,7 @@ repeat: lock->l_resource->lr_namespace->ns_timeouts++; LDLM_ERROR(lock, "lock callback timer expired after %lds: " "evicting client at %s ", - cfs_time_current_sec()- 
lock->l_enqueued_time.tv_sec, + cfs_time_current_sec()- lock->l_last_activity, libcfs_nid2str( lock->l_export->exp_connection->c_peer.nid)); @@ -391,7 +391,7 @@ static int __ldlm_add_waiting_lock(struct ldlm_lock *lock, int seconds) if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT) || OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT)) - seconds = 2; + seconds = 1; timeout = cfs_time_shift(seconds); if (likely(cfs_time_after(timeout, lock->l_callback_timeout))) @@ -795,7 +795,6 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data) struct ldlm_cb_set_arg *arg = data; struct ldlm_request *body; struct ptlrpc_request *req; - struct timeval granted_time; long total_enqueue_wait; int instant_cancel = 0; int rc = 0; @@ -804,14 +803,13 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data) LASSERT(lock != NULL); LASSERT(data != NULL); - do_gettimeofday(&granted_time); - total_enqueue_wait = cfs_timeval_sub(&granted_time, - &lock->l_enqueued_time, NULL); + total_enqueue_wait = cfs_time_sub(cfs_time_current_sec(), + lock->l_last_activity); - if (total_enqueue_wait / ONE_MILLION > obd_timeout) + if (total_enqueue_wait > obd_timeout) /* non-fatal with AT - change to LDLM_DEBUG? 
*/ - LDLM_ERROR(lock, "enqueue wait took %luus from "CFS_TIME_T, - total_enqueue_wait, lock->l_enqueued_time.tv_sec); + LDLM_WARN(lock, "enqueue wait took %lus from "CFS_TIME_T, + total_enqueue_wait, lock->l_last_activity); req = ptlrpc_request_alloc(lock->l_export->exp_imp_reverse, &RQF_LDLM_CP_CALLBACK); @@ -848,13 +846,13 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data) unlock_res_and_lock(lock); } - LDLM_DEBUG(lock, "server preparing completion AST (after %ldus wait)", + LDLM_DEBUG(lock, "server preparing completion AST (after %lds wait)", total_enqueue_wait); /* Server-side enqueue wait time estimate, used in __ldlm_add_waiting_lock to set future enqueue timers */ at_add(&lock->l_resource->lr_namespace->ns_at_estimate, - total_enqueue_wait / ONE_MILLION); + total_enqueue_wait); ptlrpc_request_set_replen(req); @@ -867,6 +865,8 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data) lock_res_and_lock(lock); if (lock->l_flags & LDLM_FL_AST_SENT) { body->lock_flags |= LDLM_FL_AST_SENT; + /* copy ast flags like LDLM_FL_DISCARD_DATA */ + body->lock_flags |= (lock->l_flags & LDLM_AST_FLAGS); /* We might get here prior to ldlm_handle_enqueue setting * LDLM_FL_CANCEL_ON_BLOCK flag. 
Then we will put this lock @@ -1090,7 +1090,7 @@ int ldlm_handle_enqueue0(struct ldlm_namespace *ns, if (!lock) GOTO(out, rc = -ENOMEM); - do_gettimeofday(&lock->l_enqueued_time); + lock->l_last_activity = cfs_time_current_sec(); lock->l_remote_handle = dlm_req->lock_handle[0]; LDLM_DEBUG(lock, "server-side enqueue handler, new lock created"); @@ -1303,7 +1303,7 @@ int ldlm_handle_convert0(struct ptlrpc_request *req, LDLM_DEBUG(lock, "server-side convert handler START"); - do_gettimeofday(&lock->l_enqueued_time); + lock->l_last_activity = cfs_time_current_sec(); res = ldlm_lock_convert(lock, dlm_req->lock_desc.l_req_mode, &dlm_rep->lock_flags); if (res) { @@ -1812,7 +1812,7 @@ static int ldlm_callback_handler(struct ptlrpc_request *req) RETURN(0); } - if ((lock->l_flags & LDLM_FL_FAIL_LOC) && + if ((lock->l_flags & LDLM_FL_FAIL_LOC) && lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE); @@ -2435,8 +2435,8 @@ int __init ldlm_init(void) return -ENOMEM; ldlm_lock_slab = cfs_mem_cache_create("ldlm_locks", - sizeof(struct ldlm_lock), 0, - SLAB_HWCACHE_ALIGN); + sizeof(struct ldlm_lock), 0, + SLAB_HWCACHE_ALIGN | SLAB_DESTROY_BY_RCU); if (ldlm_lock_slab == NULL) { cfs_mem_cache_destroy(ldlm_resource_slab); return -ENOMEM; @@ -2492,6 +2492,7 @@ EXPORT_SYMBOL(ldlm_lock_dump); EXPORT_SYMBOL(ldlm_lock_dump_handle); EXPORT_SYMBOL(ldlm_cancel_locks_for_export); EXPORT_SYMBOL(ldlm_reprocess_all_ns); +EXPORT_SYMBOL(ldlm_lock_allow_match_locked); EXPORT_SYMBOL(ldlm_lock_allow_match); EXPORT_SYMBOL(ldlm_lock_downgrade); EXPORT_SYMBOL(ldlm_lock_convert); diff --git a/lustre/ldlm/ldlm_pool.c b/lustre/ldlm/ldlm_pool.c index 09b9590..54c0cf5 100644 --- a/lustre/ldlm/ldlm_pool.c +++ b/lustre/ldlm/ldlm_pool.c @@ -381,13 +381,12 @@ static int ldlm_srv_pool_shrink(struct ldlm_pool *pl, int nr, unsigned int gfp_mask) { __u32 limit; - ENTRY; /* * VM is asking how many entries may be potentially freed. 
*/ if (nr == 0) - RETURN(atomic_read(&pl->pl_granted)); + return atomic_read(&pl->pl_granted); /* * Client already canceled locks but server is already in shrinker @@ -427,7 +426,7 @@ static int ldlm_srv_pool_shrink(struct ldlm_pool *pl, * We did not really free any memory here so far, it only will be * freed later may be, so that we return 0 to not confuse VM. */ - RETURN(0); + return 0; } /** @@ -508,7 +507,7 @@ static int ldlm_cli_pool_recalc(struct ldlm_pool *pl) * It may be called when SLV has changed much, this is why we do not * take into account pl->pl_recalc_time here. */ - RETURN(ldlm_cancel_lru(ldlm_pl2ns(pl), 0, LDLM_ASYNC, + RETURN(ldlm_cancel_lru(ldlm_pl2ns(pl), 0, LDLM_SYNC, LDLM_CANCEL_LRUR)); } @@ -520,12 +519,15 @@ static int ldlm_cli_pool_recalc(struct ldlm_pool *pl) static int ldlm_cli_pool_shrink(struct ldlm_pool *pl, int nr, unsigned int gfp_mask) { - ENTRY; + struct ldlm_namespace *ns; + int canceled = 0, unused; + + ns = ldlm_pl2ns(pl); /* * Do not cancel locks in case lru resize is disabled for this ns. */ - if (!ns_connect_lru_resize(ldlm_pl2ns(pl))) + if (!ns_connect_lru_resize(ns)) RETURN(0); /* @@ -533,19 +535,22 @@ static int ldlm_cli_pool_shrink(struct ldlm_pool *pl, */ ldlm_cli_pool_pop_slv(pl); + spin_lock(&ns->ns_unused_lock); + unused = ns->ns_nr_unused; + spin_unlock(&ns->ns_unused_lock); + + if (nr) { + canceled = ldlm_cancel_lru(ns, nr, LDLM_SYNC, + LDLM_CANCEL_SHRINK); + } +#ifdef __KERNEL__ /* - * Find out how many locks may be released according to shrink - * policy. - */ - if (nr == 0) - RETURN(ldlm_cancel_lru_estimate(ldlm_pl2ns(pl), 0, 0, - LDLM_CANCEL_SHRINK)); - - /* - * Cancel @nr locks accoding to shrink policy. + * Retrun the number of potentially reclaimable locks. 
*/ - RETURN(ldlm_cancel_lru(ldlm_pl2ns(pl), nr, LDLM_SYNC, - LDLM_CANCEL_SHRINK)); + return ((unused - canceled) / 100) * sysctl_vfs_cache_pressure; +#else + return unused - canceled; +#endif } struct ldlm_pool_ops ldlm_srv_pool_ops = { diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c index 872ad36..b6fb1d1 100644 --- a/lustre/ldlm/ldlm_request.c +++ b/lustre/ldlm/ldlm_request.c @@ -80,9 +80,9 @@ int ldlm_expired_completion_wait(void *data) LDLM_ERROR(lock, "lock timed out (enqueued at "CFS_TIME_T", " CFS_DURATION_T"s ago); not entering recovery in " "server code, just going back to sleep", - lock->l_enqueued_time.tv_sec, + lock->l_last_activity, cfs_time_sub(cfs_time_current_sec(), - lock->l_enqueued_time.tv_sec)); + lock->l_last_activity)); if (cfs_time_after(cfs_time_current(), next_dump)) { last_dump = next_dump; next_dump = cfs_time_shift(300); @@ -99,9 +99,8 @@ int ldlm_expired_completion_wait(void *data) ptlrpc_fail_import(imp, lwd->lwd_conn_cnt); LDLM_ERROR(lock, "lock timed out (enqueued at "CFS_TIME_T", " CFS_DURATION_T"s ago), entering recovery for %s@%s", - lock->l_enqueued_time.tv_sec, - cfs_time_sub(cfs_time_current_sec(), - lock->l_enqueued_time.tv_sec), + lock->l_last_activity, + cfs_time_sub(cfs_time_current_sec(), lock->l_last_activity), obd2cli_tgt(obd), imp->imp_connection->c_remote_uuid.uuid); RETURN(0); @@ -136,7 +135,7 @@ static int ldlm_completion_tail(struct ldlm_lock *lock) result = -EIO; } else { delay = cfs_time_sub(cfs_time_current_sec(), - lock->l_enqueued_time.tv_sec); + lock->l_last_activity); LDLM_DEBUG(lock, "client-side enqueue: granted after " CFS_DURATION_T"s", delay); @@ -1314,65 +1313,6 @@ static int ldlm_cancel_list(struct list_head *cancels, int count, int flags) } /** - * Callback function for shrink policy. Makes decision whether to keep - * \a lock in LRU for current \a LRU size \a unused, added in current scan - * \a added and number of locks to be preferably canceled \a count. 
- * - * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning - * - * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU - */ -static ldlm_policy_res_t ldlm_cancel_shrink_policy(struct ldlm_namespace *ns, - struct ldlm_lock *lock, - int unused, int added, - int count) -{ - int lock_cost; - __u64 page_nr; - - /* - * Stop lru processing when we reached passed @count or checked all - * locks in lru. - */ - if (count && added >= count) - return LDLM_POLICY_KEEP_LOCK; - - if (lock->l_resource->lr_type == LDLM_EXTENT) { - if (lock->l_weigh_ast) { - /* - * For liblustre, l_weigh_ast should return 0 since it - * don't cache pages - */ - page_nr = lock->l_weigh_ast(lock); - } else { - struct ldlm_extent *l_extent; - - /* - * For all extent locks cost is 1 + number of pages in - * their extent. - */ - l_extent = &lock->l_policy_data.l_extent; - page_nr = l_extent->end - l_extent->start; - do_div(page_nr, CFS_PAGE_SIZE); - } - lock_cost = 1 + page_nr; - } else { - /* - * For all locks which are not extent ones cost is 1 - */ - lock_cost = 1; - } - - /* - * Keep all expensive locks in lru for the memory pressure time - * cancel policy. They anyways may be canceled by lru resize - * pplicy if they have not small enough CLV. - */ - return lock_cost > ns->ns_shrink_thumb ? - LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK; -} - -/** * Callback function for lru-resize policy. Makes decision whether to keep * \a lock in LRU for current \a LRU size \a unused, added in current scan * \a added and number of locks to be preferably canceled \a count. @@ -1495,7 +1435,8 @@ ldlm_cancel_lru_policy(struct ldlm_namespace *ns, int flags) { if (ns_connect_lru_resize(ns)) { if (flags & LDLM_CANCEL_SHRINK) - return ldlm_cancel_shrink_policy; + /* We kill passed number of old locks. 
*/ + return ldlm_cancel_passed_policy; else if (flags & LDLM_CANCEL_LRUR) return ldlm_cancel_lrur_policy; else if (flags & LDLM_CANCEL_PASSED) @@ -1647,61 +1588,6 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels, RETURN(ldlm_cancel_list(cancels, added, cancel_flags)); } -/* Returns number of locks which could be canceled next time when - * ldlm_cancel_lru() is called. Used from locks pool shrinker. */ -int ldlm_cancel_lru_estimate(struct ldlm_namespace *ns, - int count, int max, int flags) -{ - struct list_head disp = CFS_LIST_HEAD_INIT(disp); - ldlm_cancel_lru_policy_t pf; - struct ldlm_lock *lock; - int added = 0, unused; - int loop_stop = 0; - ENTRY; - - pf = ldlm_cancel_lru_policy(ns, flags); - LASSERT(pf != NULL); - spin_lock(&ns->ns_unused_lock); - unused = ns->ns_nr_unused; - list_splice_init(&ns->ns_unused_list, &disp); - while (!list_empty(&disp)) { - lock = list_entry(disp.next, struct ldlm_lock, l_lru); - list_move_tail(&lock->l_lru, &ns->ns_unused_list); - - /* For any flags, stop scanning if @max is reached. */ - if (max && added >= max) - break; - - /* Somebody is already doing CANCEL or there is a - * blocking request will send cancel. Let's not count - * this lock. */ - if ((lock->l_flags & LDLM_FL_CANCELING) || - (lock->l_flags & LDLM_FL_BL_AST)) - continue; - - LDLM_LOCK_GET(lock); - spin_unlock(&ns->ns_unused_lock); - lu_ref_add(&lock->l_reference, __FUNCTION__, cfs_current()); - - /* Pass the lock through the policy filter and see if it - * should stay in lru. 
*/ - if (pf(ns, lock, unused, added, count) == LDLM_POLICY_KEEP_LOCK) - loop_stop = 1; - - lu_ref_del(&lock->l_reference, __FUNCTION__, cfs_current()); - LDLM_LOCK_RELEASE(lock); - spin_lock(&ns->ns_unused_lock); - if (loop_stop) - break; - - added++; - unused--; - } - list_splice(&disp, ns->ns_unused_list.prev); - spin_unlock(&ns->ns_unused_lock); - RETURN(added); -} - /* when called with LDLM_ASYNC the blocking callback will be handled * in a thread and this function will return after the thread has been * asked to call the callback. when called with LDLM_SYNC the blocking @@ -1723,8 +1609,8 @@ int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, ldlm_sync_t sync, RETURN(count); } - /* If an error occured in ASYNC mode, or - * this is SYNC mode, cancel the list. */ + /* If an error occured in ASYNC mode, or this is SYNC mode, + * cancel the list. */ ldlm_cli_cancel_list(&cancels, count, NULL, 0); RETURN(count); } diff --git a/lustre/ldlm/ldlm_resource.c b/lustre/ldlm/ldlm_resource.c index 320a870..c04d948 100644 --- a/lustre/ldlm/ldlm_resource.c +++ b/lustre/ldlm/ldlm_resource.c @@ -266,13 +266,6 @@ void ldlm_proc_namespace(struct ldlm_namespace *ns) lock_vars[0].write_fptr = lprocfs_wr_lru_size; lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0); - snprintf(lock_name, MAX_STRING_SIZE, "%s/shrink_thumb", - ns->ns_name); - lock_vars[0].data = ns; - lock_vars[0].read_fptr = lprocfs_rd_uint; - lock_vars[0].write_fptr = lprocfs_wr_uint; - lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0); - snprintf(lock_name, MAX_STRING_SIZE, "%s/lru_max_age", ns->ns_name); lock_vars[0].data = &ns->ns_max_age; @@ -342,7 +335,6 @@ struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name, if (!ns->ns_hash) GOTO(out_ns, NULL); - ns->ns_shrink_thumb = LDLM_LOCK_SHRINK_THUMB; ns->ns_appetite = apt; LASSERT(obd != NULL); diff --git a/lustre/liblustre/llite_lib.c b/lustre/liblustre/llite_lib.c index 232ce2b..5a34a82 100644 --- a/lustre/liblustre/llite_lib.c +++ 
b/lustre/liblustre/llite_lib.c @@ -94,7 +94,6 @@ int liblustre_process_log(struct config_llog_instance *cfg, struct lustre_cfg *lcfg; char *peer = "MGS_UUID"; struct obd_device *obd; - struct lustre_handle mgc_conn = {0, }; struct obd_export *exp; char *name = "mgc_dev"; class_uuid_t uuid; @@ -184,15 +183,13 @@ int liblustre_process_log(struct config_llog_instance *cfg, #endif ocd->ocd_version = LUSTRE_VERSION_CODE; - rc = obd_connect(NULL, &mgc_conn, obd, &mgc_uuid, ocd, NULL); + rc = obd_connect(NULL, &exp, obd, &mgc_uuid, ocd, NULL); if (rc) { CERROR("cannot connect to %s at %s: rc = %d\n", LUSTRE_MGS_OBDNAME, mgsnid, rc); GOTO(out_cleanup, rc); } - exp = class_conn2export(&mgc_conn); - ctxt = llog_get_context(exp->exp_obd, LLOG_CONFIG_REPL_CTXT); cfg->cfg_flags |= CFG_F_COMPAT146; rc = class_config_parse_llog(ctxt, profile, cfg); diff --git a/lustre/liblustre/super.c b/lustre/liblustre/super.c index a410d2f..5f4c017 100644 --- a/lustre/liblustre/super.c +++ b/lustre/liblustre/super.c @@ -1937,8 +1937,6 @@ llu_fsswop_mount(const char *source, struct obd_statfs osfs; static struct qstr noname = { NULL, 0, 0 }; struct ptlrpc_request *request = NULL; - struct lustre_handle md_conn = {0, }; - struct lustre_handle dt_conn = {0, }; struct lustre_md md; class_uuid_t uuid; struct config_llog_instance cfg = {0, }; @@ -2026,12 +2024,11 @@ llu_fsswop_mount(const char *source, ocd.ocd_version = LUSTRE_VERSION_CODE; /* setup mdc */ - err = obd_connect(NULL, &md_conn, obd, &sbi->ll_sb_uuid, &ocd, NULL); + err = obd_connect(NULL, &sbi->ll_md_exp, obd, &sbi->ll_sb_uuid, &ocd, NULL); if (err) { CERROR("cannot connect to %s: rc = %d\n", mdc, err); GOTO(out_free, err); } - sbi->ll_md_exp = class_conn2export(&md_conn); err = obd_statfs(obd, &osfs, 100000000, 0); if (err) @@ -2057,12 +2054,11 @@ llu_fsswop_mount(const char *source, OBD_CONNECT_VERSION | OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_FID | OBD_CONNECT_AT; ocd.ocd_version = LUSTRE_VERSION_CODE; - err = obd_connect(NULL, 
&dt_conn, obd, &sbi->ll_sb_uuid, &ocd, NULL); + err = obd_connect(NULL, &sbi->ll_dt_exp, obd, &sbi->ll_sb_uuid, &ocd, NULL); if (err) { CERROR("cannot connect to %s: rc = %d\n", osc, err); GOTO(out_md, err); } - sbi->ll_dt_exp = class_conn2export(&dt_conn); sbi->ll_lco.lco_flags = ocd.ocd_connect_flags; sbi->ll_lco.lco_md_exp = sbi->ll_md_exp; sbi->ll_lco.lco_dt_exp = sbi->ll_dt_exp; diff --git a/lustre/llite/Makefile.in b/lustre/llite/Makefile.in index 848c26b..09689d2 100644 --- a/lustre/llite/Makefile.in +++ b/lustre/llite/Makefile.in @@ -8,4 +8,8 @@ lustre-objs += vvp_dev.o vvp_page.o vvp_lock.o vvp_io.o vvp_object.o llite_lloop-objs := lloop.o +EXTRA_DIST := $(lustre-objs:.o=.c) llite_internal.h rw26.c super25.c +EXTRA_DIST += $(llite_lloop-objs:.o=.c) +EXTRA_DIST += vvp_internal.h + @INCLUDE_RULES@ diff --git a/lustre/llite/autoMakefile.am b/lustre/llite/autoMakefile.am index d5d1c10..391a8f6 100644 --- a/lustre/llite/autoMakefile.am +++ b/lustre/llite/autoMakefile.am @@ -38,7 +38,4 @@ if MODULES modulefs_DATA = lustre$(KMODEXT) llite_lloop$(KMODEXT) endif -DIST_SOURCES := $(lustre-objs:.o=.c) llite_internal.h rw26.c super25.c -DIST_SOURCES += $(llite_lloop-objs:.o=.c) -DIST_SOURCES += vvp_internal.h MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ diff --git a/lustre/llite/dcache.c b/lustre/llite/dcache.c index bafb293..31fdff5 100644 --- a/lustre/llite/dcache.c +++ b/lustre/llite/dcache.c @@ -496,6 +496,13 @@ do_lock: if (rc != -ESTALE) { CDEBUG(D_INFO, "ll_intent_lock: rc %d : it->it_status " "%d\n", rc, it->d.lustre.it_status); + } else { +#ifndef HAVE_VFS_INTENT_PATCHES + if (it_disposition(it, DISP_OPEN_OPEN) && + !it_open_error(DISP_OPEN_OPEN, it)) + /* server have valid open - close file first*/ + ll_release_openhandle(de, it); +#endif } GOTO(out, rc = 0); } @@ -763,7 +770,7 @@ int ll_revalidate_nd(struct dentry *dentry, struct nameidata *nd) * nd->intent.open.file for error, so we need to return it as lookup's result * instead */ if (IS_ERR(filp)) - rc = 0; + 
rc = PTR_ERR(filp); #endif } #else diff --git a/lustre/llite/file.c b/lustre/llite/file.c index 9850774..cfa7f03 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -696,15 +696,14 @@ out_openerr: return rc; } -/* Fills the obdo with the attributes for the inode defined by lsm */ -int ll_inode_getattr(struct inode *inode, struct obdo *obdo) +/* Fills the obdo with the attributes for the lsm */ +static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp, + struct obd_capa *capa, struct obdo *obdo) { struct ptlrpc_request_set *set; - struct ll_inode_info *lli = ll_i2info(inode); - struct lov_stripe_md *lsm = lli->lli_smd; + struct obd_info oinfo = { { { 0 } } }; + int rc; - struct obd_info oinfo = { { { 0 } } }; - int rc; ENTRY; LASSERT(lsm != NULL); @@ -719,32 +718,44 @@ int ll_inode_getattr(struct inode *inode, struct obdo *obdo) OBD_MD_FLBLKSZ | OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME | OBD_MD_FLGROUP; - oinfo.oi_capa = ll_mdscapa_get(inode); + oinfo.oi_capa = capa; set = ptlrpc_prep_set(); if (set == NULL) { CERROR("can't allocate ptlrpc set\n"); rc = -ENOMEM; } else { - rc = obd_getattr_async(ll_i2dtexp(inode), &oinfo, set); + rc = obd_getattr_async(exp, &oinfo, set); if (rc == 0) rc = ptlrpc_set_wait(set); ptlrpc_set_destroy(set); } - capa_put(oinfo.oi_capa); - if (rc) - RETURN(rc); + if (rc == 0) + oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | + OBD_MD_FLATIME | OBD_MD_FLMTIME | + OBD_MD_FLCTIME | OBD_MD_FLSIZE); + RETURN(rc); +} - oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | - OBD_MD_FLATIME | OBD_MD_FLMTIME | - OBD_MD_FLCTIME | OBD_MD_FLSIZE); +/* Fills the obdo with the attributes for the inode defined by lsm */ +int ll_inode_getattr(struct inode *inode, struct obdo *obdo) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_capa *capa = ll_mdscapa_get(inode); + int rc; + ENTRY; - obdo_refresh_inode(inode, oinfo.oi_oa, oinfo.oi_oa->o_valid); - CDEBUG(D_INODE, "objid "LPX64" size 
%Lu, blocks %llu, blksize %lu\n", - lli->lli_smd->lsm_object_id, i_size_read(inode), - (unsigned long long)inode->i_blocks, - (unsigned long)ll_inode_blksize(inode)); - RETURN(0); + rc = ll_lsm_getattr(lli->lli_smd, ll_i2dtexp(inode), capa, obdo); + capa_put(capa); + if (rc == 0) { + obdo_refresh_inode(inode, obdo, obdo->o_valid); + CDEBUG(D_INODE, + "objid "LPX64" size %Lu, blocks %llu, blksize %lu\n", + lli->lli_smd->lsm_object_id, i_size_read(inode), + (unsigned long long)inode->i_blocks, + (unsigned long)ll_inode_blksize(inode)); + } + RETURN(rc); } int ll_merge_lvb(struct inode *inode) @@ -773,8 +784,18 @@ int ll_merge_lvb(struct inode *inode) int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm, lstat_t *st) { - /* XXX */ - return -ENOSYS; + struct obdo obdo = { 0 }; + int rc; + + rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo); + if (rc == 0) { + st->st_size = obdo.o_size; + st->st_blocks = obdo.o_blocks; + st->st_mtime = obdo.o_mtime; + st->st_atime = obdo.o_atime; + st->st_ctime = obdo.o_ctime; + } + return rc; } void ll_io_init(struct cl_io *io, const struct file *file, int write) diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h index a03e1bf..9576150 100644 --- a/lustre/llite/llite_internal.h +++ b/lustre/llite/llite_internal.h @@ -635,7 +635,6 @@ extern ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits, struct lustre_handle *lockh); int ll_file_open(struct inode *inode, struct file *file); int ll_file_release(struct inode *inode, struct file *file); -int ll_lsm_getattr(struct obd_export *, struct lov_stripe_md *, struct obdo *); int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm, lstat_t *st); int ll_local_open(struct file *file, @@ -1216,4 +1215,25 @@ static inline int cl_merge_lvb(struct inode *inode) struct obd_capa *cl_capa_lookup(struct inode *inode, enum cl_req_type crt); +/** direct write pages */ +struct ll_dio_pages { + /** page array to be written. 
we don't support + * partial pages except the last one. */ + struct page **ldp_pages; + /* offset of each page */ + loff_t *ldp_offsets; + /** if ldp_offsets is NULL, it means a sequential + * pages to be written, then this is the file offset + * of the * first page. */ + loff_t ldp_start_offset; + /** how many bytes are to be written. */ + size_t ldp_size; + /** # of pages in the array. */ + int ldp_nr; +}; + +extern ssize_t ll_direct_rw_pages(const struct lu_env *env, struct cl_io *io, + int rw, struct inode *inode, + struct ll_dio_pages *pv); + #endif /* LLITE_INTERNAL_H */ diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index 7532fa8..2f20b80 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -166,8 +166,6 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt) struct obd_capa *oc = NULL; struct obd_statfs osfs; struct ptlrpc_request *request = NULL; - struct lustre_handle dt_conn = {0, }; - struct lustre_handle md_conn = {0, }; struct obd_connect_data *data = NULL; struct obd_uuid *uuid; struct lustre_md lmd; @@ -232,7 +230,7 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt) if (sbi->ll_flags & LL_SBI_RMT_CLIENT) data->ocd_connect_flags |= OBD_CONNECT_RMT_CLIENT_FORCE; - err = obd_connect(NULL, &md_conn, obd, &sbi->ll_sb_uuid, data, NULL); + err = obd_connect(NULL, &sbi->ll_md_exp, obd, &sbi->ll_sb_uuid, data, NULL); if (err == -EBUSY) { LCONSOLE_ERROR_MSG(0x14f, "An MDT (md %s) is performing " "recovery, of which this client is not a " @@ -243,7 +241,6 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt) CERROR("cannot connect to %s: rc = %d\n", md, err); GOTO(out, err); } - sbi->ll_md_exp = class_conn2export(&md_conn); err = obd_fid_init(sbi->ll_md_exp); if (err) { @@ -372,7 +369,7 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt) obd->obd_upcall.onu_upcall = cl_ocd_update; data->ocd_brw_size = 
PTLRPC_MAX_BRW_PAGES << CFS_PAGE_SHIFT; - err = obd_connect(NULL, &dt_conn, obd, &sbi->ll_sb_uuid, data, NULL); + err = obd_connect(NULL, &sbi->ll_dt_exp, obd, &sbi->ll_sb_uuid, data, NULL); if (err == -EBUSY) { LCONSOLE_ERROR_MSG(0x150, "An OST (dt %s) is performing " "recovery, of which this client is not a " @@ -384,8 +381,6 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt) GOTO(out_md_fid, err); } - sbi->ll_dt_exp = class_conn2export(&dt_conn); - err = obd_fid_init(sbi->ll_dt_exp); if (err) { CERROR("Can't init data layer FID infrastructure, " diff --git a/lustre/llite/lloop.c b/lustre/llite/lloop.c index 05026f1..f3a4410 100644 --- a/lustre/llite/lloop.c +++ b/lustre/llite/lloop.c @@ -42,9 +42,6 @@ * Copyright 1993 by Theodore Ts'o. Redistribution of this file is * permitted under the GNU General Public License. * - * DES encryption plus some minor changes by Werner Almesberger, 30-MAY-1993 - * more DES encryption plus IDEA encryption by Nicholas J. Leon, June 20, 1996 - * * Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994 * Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996 * @@ -56,10 +53,6 @@ * * Loadable modules and other fixes by AK, 1998 * - * Make real block number available to downstream transfer functions, enables - * CBC (and relatives) mode encryption requiring unique IVs per data block. - * Reed H. Petty, rhp@draper.net - * * Maximum number of loop devices now dynamic via max_loop module parameter. 
* Russell Kroll 19990701 * @@ -129,37 +122,40 @@ enum { }; struct lloop_device { - int lo_number; - int lo_refcnt; - loff_t lo_offset; - loff_t lo_sizelimit; - int lo_flags; + int lo_number; + int lo_refcnt; + loff_t lo_offset; + loff_t lo_sizelimit; + int lo_flags; int (*ioctl)(struct lloop_device *, int cmd, - unsigned long arg); + unsigned long arg); - struct file * lo_backing_file; + struct file *lo_backing_file; struct block_device *lo_device; - unsigned lo_blocksize; + unsigned lo_blocksize; - int old_gfp_mask; + int old_gfp_mask; - spinlock_t lo_lock; - struct bio *lo_bio; - struct bio *lo_biotail; - int lo_state; - struct semaphore lo_sem; - struct semaphore lo_ctl_mutex; - struct semaphore lo_bh_mutex; - atomic_t lo_pending; + spinlock_t lo_lock; + struct bio *lo_bio; + struct bio *lo_biotail; + int lo_state; + struct semaphore lo_sem; + struct semaphore lo_ctl_mutex; + atomic_t lo_pending; + wait_queue_head_t lo_bh_wait; - request_queue_t *lo_queue; + request_queue_t *lo_queue; + + const struct lu_env *lo_env; + struct cl_io lo_io; + struct ll_dio_pages lo_pvec; /* data to handle bio for lustre. 
*/ struct lo_request_data { - struct brw_page lrd_pages[LLOOP_MAX_SEGMENTS]; - struct obdo lrd_oa; + struct page *lrd_pages[LLOOP_MAX_SEGMENTS]; + loff_t lrd_offsets[LLOOP_MAX_SEGMENTS]; } lo_requests[1]; - }; /* @@ -170,7 +166,8 @@ enum { }; static int lloop_major; -static int max_loop = 8; +#define MAX_LOOP_DEFAULT 16 +static int max_loop = MAX_LOOP_DEFAULT; static struct lloop_device *loop_dev; static struct gendisk **disks; static struct semaphore lloop_mutex; @@ -194,63 +191,88 @@ static loff_t get_loop_size(struct lloop_device *lo, struct file *file) return loopsize >> 9; } -static int do_bio_filebacked(struct lloop_device *lo, struct bio *bio) +static int do_bio_lustrebacked(struct lloop_device *lo, struct bio *head) { - struct inode *inode = lo->lo_backing_file->f_dentry->d_inode; - struct ll_inode_info *lli = ll_i2info(inode); - struct lov_stripe_md *lsm = lli->lli_smd; - struct obd_info oinfo = {{{ 0 }}}; - struct brw_page *pg = lo->lo_requests[0].lrd_pages; - struct obdo *oa = &lo->lo_requests[0].lrd_oa; - pgoff_t offset; - int ret, cmd, i, opc; - struct bio_vec *bvec; - - BUG_ON(bio->bi_hw_segments > LLOOP_MAX_SEGMENTS); - - offset = (pgoff_t)(bio->bi_sector << 9) + lo->lo_offset; - bio_for_each_segment(bvec, bio, i) { - BUG_ON(bvec->bv_offset != 0); - BUG_ON(bvec->bv_len != CFS_PAGE_SIZE); - - pg->pg = bvec->bv_page; - pg->off = offset; - pg->count = bvec->bv_len; - pg->flag = OBD_BRW_SRVLOCK; - - pg++; - offset += bvec->bv_len; + const struct lu_env *env = lo->lo_env; + struct cl_io *io = &lo->lo_io; + struct inode *inode = lo->lo_backing_file->f_dentry->d_inode; + struct cl_object *obj = ll_i2info(inode)->lli_clob; + pgoff_t offset; + int ret; + int i; + int rw; + obd_count page_count = 0; + struct bio_vec *bvec; + struct bio *bio; + ssize_t bytes; + + struct ll_dio_pages *pvec = &lo->lo_pvec; + struct page **pages = pvec->ldp_pages; + loff_t *offsets = pvec->ldp_offsets; + + truncate_inode_pages(inode->i_mapping, 0); + + /* initialize the IO */ + 
memset(io, 0, sizeof(*io)); + io->ci_obj = obj; + ret = cl_io_init(env, io, CIT_MISC, obj); + if (ret) + return io->ci_result; + io->ci_lockreq = CILR_NEVER; + + LASSERT(head != NULL); + rw = head->bi_rw; + for (bio = head; bio != NULL; bio = bio->bi_next) { + LASSERT(rw == bio->bi_rw); + + offset = (pgoff_t)(bio->bi_sector << 9) + lo->lo_offset; + bio_for_each_segment(bvec, bio, i) { + BUG_ON(bvec->bv_offset != 0); + BUG_ON(bvec->bv_len != CFS_PAGE_SIZE); + + pages[page_count] = bvec->bv_page; + offsets[page_count] = offset; + page_count++; + offset += bvec->bv_len; + } + LASSERT(page_count <= LLOOP_MAX_SEGMENTS); } - oa->o_mode = inode->i_mode; - oa->o_id = lsm->lsm_object_id; - oa->o_gr = lsm->lsm_object_gr; - oa->o_valid = OBD_MD_FLID | OBD_MD_FLMODE | - OBD_MD_FLTYPE |OBD_MD_FLGROUP; - obdo_from_inode(oa, inode, OBD_MD_FLFID | OBD_MD_FLGENER); - - cmd = OBD_BRW_READ; - if (bio_rw(bio) == WRITE) - cmd = OBD_BRW_WRITE; - - if (cmd == OBD_BRW_WRITE) - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_BRW_WRITE, bio->bi_size); - else - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_BRW_READ, bio->bi_size); - oinfo.oi_oa = oa; - oinfo.oi_md = lsm; - opc = cmd & OBD_BRW_WRITE ? CAPA_OPC_OSS_WRITE : CAPA_OPC_OSS_RW; - oinfo.oi_capa = ll_osscapa_get(inode, opc); - ret = obd_brw(cmd, ll_i2dtexp(inode), &oinfo, - (obd_count)(i - bio->bi_idx), - lo->lo_requests[0].lrd_pages, NULL); - capa_put(oinfo.oi_capa); - if (ret == 0) - obdo_to_inode(inode, oa, OBD_MD_FLBLOCKS); - return ret; + ll_stats_ops_tally(ll_i2sbi(inode), + (rw == WRITE) ? LPROC_LL_BRW_WRITE : LPROC_LL_BRW_READ, + page_count << PAGE_CACHE_SHIFT); + + pvec->ldp_size = page_count << PAGE_CACHE_SHIFT; + pvec->ldp_nr = page_count; + + /* FIXME: in ll_direct_rw_pages, it has to allocate many cl_page{}s to + * write those pages into OST. Even worse case is that more pages + * would be asked to write out to swap space, and then finally get here + * again. + * Unfortunately this is NOT easy to fix. 
+ * Thoughts on solution: + * 0. Define a reserved pool for cl_pages, which could be a list of + * pre-allocated cl_pages from cl_page_kmem; + * 1. Define a new operation in cl_object_operations{}, says clo_depth, + * which measures how many layers for this lustre object. Generally + * speaking, the depth would be 2, one for llite, and one for lovsub. + * However, for SNS, there will be more since we need additional page + * to store parity; + * 2. Reserve the # of (page_count * depth) cl_pages from the reserved + * pool. Afterwards, the clio would allocate the pages from reserved + * pool, this guarantees we neeedn't allocate the cl_pages from + * generic cl_page slab cache. + * Of course, if there is NOT enough pages in the pool, we might + * be asked to write less pages once, this purely depends on + * implementation. Anyway, we should be careful to avoid deadlocking. + */ + LOCK_INODE_MUTEX(inode); + bytes = ll_direct_rw_pages(env, io, rw, inode, pvec); + UNLOCK_INODE_MUTEX(inode); + cl_io_fini(env, io); + return (bytes == pvec->ldp_size) ? 
0 : (int)bytes; } - /* * Add bio to back of pending list */ @@ -266,41 +288,77 @@ static void loop_add_bio(struct lloop_device *lo, struct bio *bio) lo->lo_bio = lo->lo_biotail = bio; spin_unlock_irqrestore(&lo->lo_lock, flags); - up(&lo->lo_bh_mutex); + atomic_inc(&lo->lo_pending); + if (waitqueue_active(&lo->lo_bh_wait)) + wake_up(&lo->lo_bh_wait); } /* * Grab first pending buffer */ -static struct bio *loop_get_bio(struct lloop_device *lo) +static unsigned int loop_get_bio(struct lloop_device *lo, struct bio **req) { - struct bio *bio; + struct bio *first; + struct bio **bio; + unsigned int count = 0; + unsigned int page_count = 0; + int rw; spin_lock_irq(&lo->lo_lock); - if ((bio = lo->lo_bio)) { - if (bio == lo->lo_biotail) - lo->lo_biotail = NULL; - lo->lo_bio = bio->bi_next; - bio->bi_next = NULL; + first = lo->lo_bio; + if (unlikely(first == NULL)) { + spin_unlock_irq(&lo->lo_lock); + return 0; } - spin_unlock_irq(&lo->lo_lock); - return bio; + /* TODO: need to split the bio, too bad. */ + LASSERT(first->bi_vcnt <= LLOOP_MAX_SEGMENTS); + + rw = first->bi_rw; + bio = &lo->lo_bio; + while (*bio && (*bio)->bi_rw == rw) { + CDEBUG(D_INFO, "bio sector %llu size %u count %u vcnt%u \n", + (unsigned long long)(*bio)->bi_sector, (*bio)->bi_size, + page_count, (*bio)->bi_vcnt); + if (page_count + (*bio)->bi_vcnt > LLOOP_MAX_SEGMENTS) + break; + + + page_count += (*bio)->bi_vcnt; + count++; + bio = &(*bio)->bi_next; + } + if (*bio) { + /* Some of bios can't be mergable. 
*/ + lo->lo_bio = *bio; + *bio = NULL; + } else { + /* Hit the end of queue */ + lo->lo_biotail = NULL; + lo->lo_bio = NULL; + } + *req = first; + spin_unlock_irq(&lo->lo_lock); + return count; } static int loop_make_request(request_queue_t *q, struct bio *old_bio) { struct lloop_device *lo = q->queuedata; int rw = bio_rw(old_bio); + int inactive; if (!lo) - goto out; + goto err; + + CDEBUG(D_INFO, "submit bio sector %llu size %u\n", + (unsigned long long)old_bio->bi_sector, old_bio->bi_size); spin_lock_irq(&lo->lo_lock); - if (lo->lo_state != LLOOP_BOUND) - goto inactive; - atomic_inc(&lo->lo_pending); + inactive = (lo->lo_state != LLOOP_BOUND); spin_unlock_irq(&lo->lo_lock); + if (inactive) + goto err; if (rw == WRITE) { if (lo->lo_flags & LO_FLAGS_READ_ONLY) @@ -314,14 +372,8 @@ static int loop_make_request(request_queue_t *q, struct bio *old_bio) loop_add_bio(lo, old_bio); return 0; err: - if (atomic_dec_and_test(&lo->lo_pending)) - up(&lo->lo_bh_mutex); -out: bio_io_error(old_bio, old_bio->bi_size); return 0; -inactive: - spin_unlock_irq(&lo->lo_lock); - goto out; } /* @@ -338,27 +390,50 @@ static void loop_unplug(request_queue_t *q) static inline void loop_handle_bio(struct lloop_device *lo, struct bio *bio) { int ret; - ret = do_bio_filebacked(lo, bio); - bio_endio(bio, bio->bi_size, ret); + ret = do_bio_lustrebacked(lo, bio); + while (bio) { + struct bio *tmp = bio->bi_next; + bio->bi_next = NULL; + bio_endio(bio, bio->bi_size, ret); + bio = tmp; + } +} + +static inline int loop_active(struct lloop_device *lo) +{ + return atomic_read(&lo->lo_pending) || (lo->lo_state == LLOOP_RUNDOWN); } /* * worker thread that handles reads/writes to file backed loop devices, - * to avoid blocking in our make_request_fn. it also does loop decrypting - * on reads for block backed loop, as that is too heavy to do from - * b_end_io context where irqs may be disabled. + * to avoid blocking in our make_request_fn. 
*/ static int loop_thread(void *data) { struct lloop_device *lo = data; struct bio *bio; + unsigned int count; + unsigned long times = 0; + unsigned long total_count = 0; + + struct lu_env *env; + int refcheck; + int ret = 0; daemonize("lloop%d", lo->lo_number); set_user_nice(current, -20); lo->lo_state = LLOOP_BOUND; - atomic_inc(&lo->lo_pending); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + GOTO(out, ret = PTR_ERR(env)); + + lo->lo_env = env; + memset(&lo->lo_pvec, 0, sizeof(lo->lo_pvec)); + lo->lo_pvec.ldp_pages = lo->lo_requests[0].lrd_pages; + lo->lo_pvec.ldp_offsets = lo->lo_requests[0].lrd_offsets; /* * up sem, we are running @@ -366,40 +441,54 @@ static int loop_thread(void *data) up(&lo->lo_sem); for (;;) { - down_interruptible(&lo->lo_bh_mutex); - /* - * could be upped because of tear-down, not because of - * pending work - */ - if (!atomic_read(&lo->lo_pending)) - break; + wait_event(lo->lo_bh_wait, loop_active(lo)); + if (!atomic_read(&lo->lo_pending)) { + int exiting = 0; + spin_lock_irq(&lo->lo_lock); + exiting = (lo->lo_state == LLOOP_RUNDOWN); + spin_unlock_irq(&lo->lo_lock); + if (exiting) + break; + } - bio = loop_get_bio(lo); - if (!bio) { + bio = NULL; + count = loop_get_bio(lo, &bio); + if (!count) { CWARN("lloop(minor: %d): missing bio\n", lo->lo_number); continue; } - loop_handle_bio(lo, bio); - /* - * upped both for pending work and tear-down, lo_pending - * will hit zero then - */ - if (atomic_dec_and_test(&lo->lo_pending)) - break; + total_count += count; + if (total_count < count) { /* overflow */ + total_count = count; + times = 1; + } else { + times++; + } + if ((times & 127) == 0) { + CDEBUG(D_INFO, "total: %lu, count: %lu, avg: %lu\n", + total_count, times, total_count / times); + } + + LASSERT(bio != NULL); + LASSERT(count <= atomic_read(&lo->lo_pending)); + loop_handle_bio(lo, bio); + atomic_sub(count, &lo->lo_pending); } + cl_env_put(env, &refcheck); +out: up(&lo->lo_sem); - return 0; + return ret; } static int 
loop_set_fd(struct lloop_device *lo, struct file *unused, struct block_device *bdev, struct file *file) { - struct inode *inode; + struct inode *inode; struct address_space *mapping; - int lo_flags = 0; - int error; + int lo_flags = 0; + int error; loff_t size; if (!try_module_get(THIS_MODULE)) @@ -452,8 +541,10 @@ static int loop_set_fd(struct lloop_device *lo, struct file *unused, /* queue parameters */ blk_queue_hardsect_size(lo->lo_queue, CFS_PAGE_SIZE); - blk_queue_max_sectors(lo->lo_queue, LLOOP_MAX_SEGMENTS); + blk_queue_max_sectors(lo->lo_queue, + LLOOP_MAX_SEGMENTS << (CFS_PAGE_SHIFT - 9)); blk_queue_max_phys_segments(lo->lo_queue, LLOOP_MAX_SEGMENTS); + blk_queue_max_hw_segments(lo->lo_queue, LLOOP_MAX_SEGMENTS); set_capacity(disks[lo->lo_number], size); bd_set_size(bdev, size << 9); @@ -487,9 +578,8 @@ static int loop_clr_fd(struct lloop_device *lo, struct block_device *bdev, spin_lock_irq(&lo->lo_lock); lo->lo_state = LLOOP_RUNDOWN; - if (atomic_dec_and_test(&lo->lo_pending)) - up(&lo->lo_bh_mutex); spin_unlock_irq(&lo->lo_lock); + wake_up(&lo->lo_bh_wait); down(&lo->lo_sem); lo->lo_backing_file = NULL; @@ -533,7 +623,7 @@ static int lo_release(struct inode *inode, struct file *file) /* lloop device node's ioctl function. */ static int lo_ioctl(struct inode *inode, struct file *unused, - unsigned int cmd, unsigned long arg) + unsigned int cmd, unsigned long arg) { struct lloop_device *lo = inode->i_bdev->bd_disk->private_data; struct block_device *bdev = inode->i_bdev; @@ -578,12 +668,13 @@ static struct block_device_operations lo_fops = { /* dynamic iocontrol callback. * This callback is registered in lloop_init and will be called by * ll_iocontrol_call. + * * This is a llite regular file ioctl function. It takes the responsibility - * of attaching a file, and detaching a file by a lloop's device numner. + * of attaching or detaching a file by a lloop's device numner. 
*/ static enum llioc_iter lloop_ioctl(struct inode *unused, struct file *file, - unsigned int cmd, unsigned long arg, - void *magic, int *rcp) + unsigned int cmd, unsigned long arg, + void *magic, int *rcp) { struct lloop_device *lo = NULL; struct block_device *bdev = NULL; @@ -684,25 +775,27 @@ static int __init lloop_init(void) }; if (max_loop < 1 || max_loop > 256) { + max_loop = MAX_LOOP_DEFAULT; CWARN("lloop: invalid max_loop (must be between" - " 1 and 256), using default (8)\n"); - max_loop = 8; + " 1 and 256), using default (%u)\n", max_loop); } lloop_major = register_blkdev(0, "lloop"); if (lloop_major < 0) return -EIO; + CDEBUG(D_CONFIG, "registered lloop major %d with %u minors\n", + lloop_major, max_loop); + ll_iocontrol_magic = ll_iocontrol_register(lloop_ioctl, 2, cmdlist); if (ll_iocontrol_magic == NULL) goto out_mem1; - loop_dev = kmalloc(max_loop * sizeof(struct lloop_device), GFP_KERNEL); + OBD_ALLOC_WAIT(loop_dev, max_loop * sizeof(*loop_dev)); if (!loop_dev) goto out_mem1; - memset(loop_dev, 0, max_loop * sizeof(struct lloop_device)); - disks = kmalloc(max_loop * sizeof(struct gendisk *), GFP_KERNEL); + OBD_ALLOC_WAIT(disks, max_loop * sizeof(*disks)); if (!disks) goto out_mem2; @@ -718,14 +811,13 @@ static int __init lloop_init(void) struct lloop_device *lo = &loop_dev[i]; struct gendisk *disk = disks[i]; - memset(lo, 0, sizeof(*lo)); lo->lo_queue = blk_alloc_queue(GFP_KERNEL); if (!lo->lo_queue) goto out_mem4; init_MUTEX(&lo->lo_ctl_mutex); init_MUTEX_LOCKED(&lo->lo_sem); - init_MUTEX_LOCKED(&lo->lo_bh_mutex); + init_waitqueue_head(&lo->lo_bh_wait); lo->lo_number = i; spin_lock_init(&lo->lo_lock); disk->major = lloop_major; @@ -748,9 +840,9 @@ out_mem4: out_mem3: while (i--) put_disk(disks[i]); - kfree(disks); + OBD_FREE(disks, max_loop * sizeof(*disks)); out_mem2: - kfree(loop_dev); + OBD_FREE(loop_dev, max_loop * sizeof(*loop_dev)); out_mem1: unregister_blkdev(lloop_major, "lloop"); ll_iocontrol_unregister(ll_iocontrol_magic); @@ -770,9 
+862,11 @@ static void lloop_exit(void) } if (ll_unregister_blkdev(lloop_major, "lloop")) CWARN("lloop: cannot unregister blkdev\n"); + else + CDEBUG(D_CONFIG, "unregistered lloop major %d\n", lloop_major); - kfree(disks); - kfree(loop_dev); + OBD_FREE(disks, max_loop * sizeof(*disks)); + OBD_FREE(loop_dev, max_loop * sizeof(*loop_dev)); } module_init(lloop_init); diff --git a/lustre/llite/rw26.c b/lustre/llite/rw26.c index 031b1ab..fac56d7 100644 --- a/lustre/llite/rw26.c +++ b/lustre/llite/rw26.c @@ -216,11 +216,9 @@ static void ll_free_user_pages(struct page **pages, int npages, int do_dirty) OBD_FREE(pages, npages * sizeof(*pages)); } -static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io, - int rw, struct inode *inode, - struct address_space *mapping, - size_t size, loff_t file_offset, - struct page **pages, int page_count) +ssize_t ll_direct_rw_pages(const struct lu_env *env, struct cl_io *io, + int rw, struct inode *inode, + struct ll_dio_pages *pv) { struct cl_page *clp; struct ccc_page *clup; @@ -229,8 +227,11 @@ static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io, struct cl_sync_io *anchor = &ccc_env_info(env)->cti_sync_io; int i; ssize_t rc = 0; - ssize_t size_orig = size; - size_t page_size = cl_page_size(obj); + loff_t file_offset = pv->ldp_start_offset; + size_t size = pv->ldp_size; + int page_count = pv->ldp_nr; + struct page **pages = pv->ldp_pages; + size_t page_size = cl_page_size(obj); ENTRY; cl_sync_io_init(anchor, page_count); @@ -238,8 +239,11 @@ static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io, queue = &io->ci_queue; cl_2queue_init(queue); for (i = 0; i < page_count; i++) { + if (pv->ldp_offsets) + file_offset = pv->ldp_offsets[i]; + LASSERT(!(file_offset & (page_size - 1))); clp = cl_page_find(env, obj, cl_index(obj, file_offset), - pages[i], CPT_TRANSIENT); + pv->ldp_pages[i], CPT_TRANSIENT); if (IS_ERR(clp)) { rc = PTR_ERR(clp); break; @@ -319,7 +323,7 @@ static 
ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io, cl_sync_io_note(anchor, +1); /* wait for the IO to be finished. */ rc = cl_sync_io_wait(env, io, &queue->c2_qout, - anchor) ?: size_orig; + anchor) ?: pv->ldp_size; } } @@ -328,6 +332,23 @@ static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io, cl_2queue_fini(env, queue); RETURN(rc); } +EXPORT_SYMBOL(ll_direct_rw_pages); + +static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io, + int rw, struct inode *inode, + struct address_space *mapping, + size_t size, loff_t file_offset, + struct page **pages, int page_count) +{ + struct ll_dio_pages pvec = { .ldp_pages = pages, + .ldp_nr = page_count, + .ldp_size = size, + .ldp_offsets = NULL, + .ldp_start_offset = file_offset + }; + + return ll_direct_rw_pages(env, io, rw, inode, &pvec); +} /* This is the maximum size of a single O_DIRECT request, based on a 128kB * kmalloc limit. We need to fit all of the brw_page structs, each one diff --git a/lustre/llite/vvp_page.c b/lustre/llite/vvp_page.c index d199ad6..b698f52 100644 --- a/lustre/llite/vvp_page.c +++ b/lustre/llite/vvp_page.c @@ -243,7 +243,12 @@ static void vvp_page_completion_common(const struct lu_env *env, struct cl_sync_io *anchor = cp->cpg_sync_io; LINVRNT(cl_page_is_vmlocked(env, clp)); - KLASSERT(!PageWriteback(vmpage)); + + /* Don't assert the page writeback bit here because the lustre file + * may be as a backend of swap space. in this case, the page writeback + * is set by VM, and obvious we shouldn't clear it at all. Fortunately + * this type of pages are all TRANSIENT pages. 
*/ + KLASSERT(ergo(clp->cp_type == CPT_CACHEABLE, !PageWriteback(vmpage))); vvp_vmpage_error(inode, vmpage, ioret); diff --git a/lustre/lmv/lmv_obd.c b/lustre/lmv/lmv_obd.c index 7543a8c..f98f511 100644 --- a/lustre/lmv/lmv_obd.c +++ b/lustre/lmv/lmv_obd.c @@ -239,7 +239,7 @@ static int lmv_notify(struct obd_device *obd, struct obd_device *watched, * caller that everything is okay. Real connection will be performed later. */ static int lmv_connect(const struct lu_env *env, - struct lustre_handle *conn, struct obd_device *obd, + struct obd_export **exp, struct obd_device *obd, struct obd_uuid *cluuid, struct obd_connect_data *data, void *localdata) { @@ -247,29 +247,30 @@ static int lmv_connect(const struct lu_env *env, struct proc_dir_entry *lmv_proc_dir; #endif struct lmv_obd *lmv = &obd->u.lmv; - struct obd_export *exp; + struct lustre_handle conn = { 0 }; int rc = 0; ENTRY; - rc = class_connect(conn, obd, cluuid); - if (rc) { - CERROR("class_connection() returned %d\n", rc); - RETURN(rc); - } - - exp = class_conn2export(conn); - /* * We don't want to actually do the underlying connections more than * once, so keep track. 
*/ lmv->refcount++; if (lmv->refcount > 1) { - class_export_put(exp); + *exp = NULL; RETURN(0); } - lmv->exp = exp; + rc = class_connect(&conn, obd, cluuid); + if (rc) { + CERROR("class_connection() returned %d\n", rc); + RETURN(rc); + } + + *exp = class_conn2export(&conn); + class_export_get(*exp); + + lmv->exp = *exp; lmv->connected = 0; lmv->cluuid = *cluuid; @@ -383,7 +384,6 @@ int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) struct obd_uuid *cluuid = &lmv->cluuid; struct obd_connect_data *mdc_data = NULL; struct obd_uuid lmv_mdc_uuid = { "LMV_MDC_UUID" }; - struct lustre_handle conn = {0, }; struct obd_device *mdc_obd; struct obd_export *mdc_exp; struct lu_fld_target target; @@ -407,15 +407,13 @@ int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) RETURN(-EINVAL); } - rc = obd_connect(NULL, &conn, mdc_obd, &lmv_mdc_uuid, + rc = obd_connect(NULL, &mdc_exp, mdc_obd, &lmv_mdc_uuid, &lmv->conn_data, NULL); if (rc) { CERROR("target %s connect error %d\n", tgt->ltd_uuid.uuid, rc); RETURN(rc); } - mdc_exp = class_conn2export(&conn); - /* * Init fid sequence client for this mdc and add new fld target. 
*/ diff --git a/lustre/lov/Makefile.in b/lustre/lov/Makefile.in index 5a2aad7..59f7c79 100644 --- a/lustre/lov/Makefile.in +++ b/lustre/lov/Makefile.in @@ -1,4 +1,6 @@ MODULES := lov lov-objs := lov_log.o lov_obd.o lov_pack.o lproc_lov.o lov_offset.o lov_merge.o lov_request.o lov_qos.o lov_ea.o lov_dev.o lov_object.o lov_page.o lov_lock.o lov_io.o lovsub_dev.o lovsub_object.o lovsub_page.o lovsub_lock.o lovsub_io.o lov_pool.o +EXTRA_DIST = $(lov-objs:.o=.c) lov_internal.h lov_cl_internal.h + @INCLUDE_RULES@ diff --git a/lustre/lov/autoMakefile.am b/lustre/lov/autoMakefile.am index e18070c..77c91b0 100644 --- a/lustre/lov/autoMakefile.am +++ b/lustre/lov/autoMakefile.am @@ -84,5 +84,4 @@ endif # MODULES install-data-hook: $(install_data_hook) -DIST_SOURCES = $(lov-objs:.o=.c) lov_internal.h lov_cl_internal.h MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ diff --git a/lustre/lov/lov_cl_internal.h b/lustre/lov/lov_cl_internal.h index 6a98fbc..98c1270 100644 --- a/lustre/lov/lov_cl_internal.h +++ b/lustre/lov/lov_cl_internal.h @@ -88,14 +88,14 @@ * cl_lock::cll_guard, and will be automatically cleared by the sub-lock * when the latter is destroyed. When a sub-lock is canceled, a * reference to it is removed from the top-lock array, and top-lock is - * moved into CLS_NEW state. It is guaranteed that all sub-locks exits + * moved into CLS_NEW state. It is guaranteed that all sub-locks exist * while their top-lock is in CLS_HELD or CLS_CACHED states. * * - IO's are not reference counted. * * To implement a connection between top and sub entities, lov layer is split * into two pieces: lov ("upper half"), and lovsub ("bottom half"), both - * implementing full set of cl-interfaces. For example, top-object has clu and + * implementing full set of cl-interfaces. For example, top-object has vvp and * lov layers, and it's sub-object has lovsub and osc layers. lovsub layer is * used to track child-parent relationship. 
* diff --git a/lustre/lov/lov_internal.h b/lustre/lov/lov_internal.h index c6c3a69..2aaaff4 100644 --- a/lustre/lov/lov_internal.h +++ b/lustre/lov/lov_internal.h @@ -162,7 +162,7 @@ int lov_stripe_number(struct lov_stripe_md *lsm, obd_off lov_off); #define LOV_USES_ASSIGNED_STRIPE 0 #define LOV_USES_DEFAULT_STRIPE 1 int qos_add_tgt(struct obd_device *obd, __u32 index); -int qos_del_tgt(struct obd_device *obd, __u32 index); +int qos_del_tgt(struct obd_device *obd, struct lov_tgt_desc *tgt); void qos_shrink_lsm(struct lov_request_set *set); int qos_prep_create(struct obd_export *exp, struct lov_request_set *set); void qos_update(struct lov_obd *lov); @@ -320,5 +320,6 @@ int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname); void lov_dump_pool(int level, struct pool_desc *pool); struct pool_desc *lov_find_pool(struct lov_obd *lov, char *poolname); int lov_check_index_in_pool(__u32 idx, struct pool_desc *pool); +void lov_pool_putref(struct pool_desc *pool); #endif diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c index 86058ed..00ce37d 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -82,26 +82,43 @@ void lov_getref(struct obd_device *obd) return; } -static void __lov_del_obd(struct obd_device *obd, __u32 index); +static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt); void lov_putref(struct obd_device *obd) { struct lov_obd *lov = &obd->u.lov; + mutex_down(&lov->lov_lock); /* ok to dec to 0 more than once -- ltd_exp's will be null */ if (atomic_dec_and_test(&lov->lov_refcount) && lov->lov_death_row) { + CFS_LIST_HEAD(kill); int i; + struct lov_tgt_desc *tgt, *n; CDEBUG(D_CONFIG, "destroying %d lov targets\n", lov->lov_death_row); for (i = 0; i < lov->desc.ld_tgt_count; i++) { - if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_reap) + tgt = lov->lov_tgts[i]; + + if (!tgt || !tgt->ltd_reap) continue; - /* Disconnect and delete from list */ - __lov_del_obd(obd, i); + list_add(&tgt->ltd_kill, &kill); + /* XXX 
- right now there is a dependency on ld_tgt_count + * being the maximum tgt index for computing the + * mds_max_easize. So we can't shrink it. */ + lov_ost_pool_remove(&lov->lov_packed, i); + lov->lov_tgts[i] = NULL; lov->lov_death_row--; } + mutex_up(&lov->lov_lock); + + list_for_each_entry_safe(tgt, n, &kill, ltd_kill) { + list_del(&tgt->ltd_kill); + /* Disconnect */ + __lov_del_obd(obd, tgt); + } + } else { + mutex_up(&lov->lov_lock); } - mutex_up(&lov->lov_lock); } static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid, @@ -118,7 +135,6 @@ int lov_connect_obd(struct obd_device *obd, __u32 index, int activate, struct obd_uuid tgt_uuid; struct obd_device *tgt_obd; struct obd_uuid lov_osc_uuid = { "LOV_OSC_UUID" }; - struct lustre_handle conn = {0, }; struct obd_import *imp; #ifdef __KERNEL__ @@ -162,39 +178,28 @@ int lov_connect_obd(struct obd_device *obd, __u32 index, int activate, ptlrpc_activate_import(imp); } + rc = obd_register_observer(tgt_obd, obd); + if (rc) { + CERROR("Target %s register_observer error %d\n", + obd_uuid2str(&tgt_uuid), rc); + RETURN(rc); + } + + if (imp->imp_invalid) { CERROR("not connecting OSC %s; administratively " "disabled\n", obd_uuid2str(&tgt_uuid)); - rc = obd_register_observer(tgt_obd, obd); - if (rc) { - CERROR("Target %s register_observer error %d; " - "will not be able to reactivate\n", - obd_uuid2str(&tgt_uuid), rc); - } RETURN(0); } - rc = obd_connect(NULL, &conn, tgt_obd, &lov_osc_uuid, data, NULL); - if (rc) { + rc = obd_connect(NULL, &lov->lov_tgts[index]->ltd_exp, tgt_obd, + &lov_osc_uuid, data, NULL); + if (rc || !lov->lov_tgts[index]->ltd_exp) { CERROR("Target %s connect error %d\n", obd_uuid2str(&tgt_uuid), rc); - RETURN(rc); - } - lov->lov_tgts[index]->ltd_exp = class_conn2export(&conn); - if (!lov->lov_tgts[index]->ltd_exp) { - CERROR("Target %s: null export!\n", obd_uuid2str(&tgt_uuid)); RETURN(-ENODEV); } - rc = obd_register_observer(tgt_obd, obd); - if (rc) { - CERROR("Target %s 
register_observer error %d\n", - obd_uuid2str(&tgt_uuid), rc); - obd_disconnect(lov->lov_tgts[index]->ltd_exp); - lov->lov_tgts[index]->ltd_exp = NULL; - RETURN(rc); - } - lov->lov_tgts[index]->ltd_reap = 0; if (activate) { lov->lov_tgts[index]->ltd_active = 1; @@ -207,7 +212,7 @@ int lov_connect_obd(struct obd_device *obd, __u32 index, int activate, #ifdef __KERNEL__ lov_proc_dir = lprocfs_srch(obd->obd_proc_entry, "target_obds"); if (lov_proc_dir) { - struct obd_device *osc_obd = class_conn2obd(&conn); + struct obd_device *osc_obd = lov->lov_tgts[index]->ltd_exp->exp_obd; cfs_proc_dir_entry_t *osc_symlink; char name[MAX_STRING_SIZE]; @@ -237,21 +242,24 @@ int lov_connect_obd(struct obd_device *obd, __u32 index, int activate, } static int lov_connect(const struct lu_env *env, - struct lustre_handle *conn, struct obd_device *obd, + struct obd_export **exp, struct obd_device *obd, struct obd_uuid *cluuid, struct obd_connect_data *data, void *localdata) { struct lov_obd *lov = &obd->u.lov; struct lov_tgt_desc *tgt; + struct lustre_handle conn; int i, rc; ENTRY; CDEBUG(D_CONFIG, "connect #%d\n", lov->lov_connects); - rc = class_connect(conn, obd, cluuid); + rc = class_connect(&conn, obd, cluuid); if (rc) RETURN(rc); + *exp = class_conn2export(&conn); + /* Why should there ever be more than 1 connect? 
*/ lov->lov_connects++; LASSERT(lov->lov_connects == 1); @@ -277,7 +285,7 @@ static int lov_connect(const struct lu_env *env, continue; rc = lov_notify(obd, lov->lov_tgts[i]->ltd_exp->exp_obd, - OBD_NOTIFY_ACTIVE, (void *)&i); + OBD_NOTIFY_CONNECT, (void *)&i); if (rc) { CERROR("%s error sending notify %d\n", obd->obd_name, rc); @@ -288,26 +296,22 @@ static int lov_connect(const struct lu_env *env, RETURN(0); } -static int lov_disconnect_obd(struct obd_device *obd, __u32 index) +static int lov_disconnect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) { cfs_proc_dir_entry_t *lov_proc_dir; struct lov_obd *lov = &obd->u.lov; struct obd_device *osc_obd; int rc; - ENTRY; - if (lov->lov_tgts[index] == NULL) - RETURN(-EINVAL); - - osc_obd = class_exp2obd(lov->lov_tgts[index]->ltd_exp); + osc_obd = class_exp2obd(tgt->ltd_exp); CDEBUG(D_CONFIG, "%s: disconnecting target %s\n", obd->obd_name, osc_obd->obd_name); - if (lov->lov_tgts[index]->ltd_active) { - lov->lov_tgts[index]->ltd_active = 0; + if (tgt->ltd_active) { + tgt->ltd_active = 0; lov->desc.ld_active_tgt_count--; - lov->lov_tgts[index]->ltd_exp->exp_obd->obd_inactive = 1; + tgt->ltd_exp->exp_obd->obd_inactive = 1; } lov_proc_dir = lprocfs_srch(obd->obd_proc_entry, "target_obds"); @@ -336,16 +340,16 @@ static int lov_disconnect_obd(struct obd_device *obd, __u32 index) obd_register_observer(osc_obd, NULL); - rc = obd_disconnect(lov->lov_tgts[index]->ltd_exp); + rc = obd_disconnect(tgt->ltd_exp); if (rc) { CERROR("Target %s disconnect error %d\n", - lov_uuid2str(lov, index), rc); + tgt->ltd_uuid.uuid, rc); rc = 0; } - qos_del_tgt(obd, index); + qos_del_tgt(obd, tgt); - lov->lov_tgts[index]->ltd_exp = NULL; + tgt->ltd_exp = NULL; RETURN(0); } @@ -615,7 +619,7 @@ int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp, GOTO(out, rc = 0); rc = lov_notify(obd, tgt->ltd_exp->exp_obd, - active ? OBD_NOTIFY_ACTIVE : OBD_NOTIFY_INACTIVE, + active ? 
OBD_NOTIFY_CONNECT : OBD_NOTIFY_INACTIVE, (void *)&index); out: @@ -671,12 +675,9 @@ out: RETURN(rc); } -/* We are holding lov_lock */ -static void __lov_del_obd(struct obd_device *obd, __u32 index) +static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) { - struct lov_obd *lov = &obd->u.lov; struct obd_device *osc_obd; - struct lov_tgt_desc *tgt = lov->lov_tgts[index]; LASSERT(tgt); LASSERT(tgt->ltd_reap); @@ -684,18 +685,12 @@ static void __lov_del_obd(struct obd_device *obd, __u32 index) osc_obd = class_exp2obd(tgt->ltd_exp); CDEBUG(D_CONFIG, "Removing tgt %s : %s\n", - lov_uuid2str(lov, index), + tgt->ltd_uuid.uuid, osc_obd ? osc_obd->obd_name : ""); if (tgt->ltd_exp) - lov_disconnect_obd(obd, index); - - /* XXX - right now there is a dependency on ld_tgt_count being the - * maximum tgt index for computing the mds_max_easize. So we can't - * shrink it. */ + lov_disconnect_obd(obd, tgt); - lov_ost_pool_remove(&lov->lov_packed, index); - lov->lov_tgts[index] = NULL; OBD_FREE_PTR(tgt); /* Manual cleanup - no cleanup logs to clean up the osc's. We must @@ -846,11 +841,12 @@ int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg) static int lov_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) { int rc = 0; + struct lov_obd *lov = &obd->u.lov; + ENTRY; switch (stage) { case OBD_CLEANUP_EARLY: { - struct lov_obd *lov = &obd->u.lov; int i; for (i = 0; i < lov->desc.ld_tgt_count; i++) { if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_active) @@ -875,22 +871,19 @@ static int lov_cleanup(struct obd_device *obd) struct list_head *pos, *tmp; struct pool_desc *pool; - lprocfs_obd_cleanup(obd); - - /* Delete hash entries and kill hash table before freeing pools - * and get to use after free issue. 
*/ - lustre_hash_exit(lov->lov_pools_hash_body); - list_for_each_safe(pos, tmp, &lov->lov_pool_list) { pool = list_entry(pos, struct pool_desc, pool_list); /* free pool structs */ + CDEBUG(D_INFO, "delete pool %p\n", pool); lov_pool_del(obd, pool->pool_name); } + lustre_hash_exit(lov->lov_pools_hash_body); lov_ost_pool_free(&(lov->lov_qos.lq_rr.lqr_pool)); lov_ost_pool_free(&lov->lov_packed); if (lov->lov_tgts) { int i; + lov_getref(obd); for (i = 0; i < lov->desc.ld_tgt_count; i++) { if (!lov->lov_tgts[i]) continue; @@ -907,11 +900,15 @@ static int lov_cleanup(struct obd_device *obd) atomic_read(&lov->lov_refcount)); lov_del_target(obd, i, 0, 0); } + lov_putref(obd); OBD_FREE(lov->lov_tgts, sizeof(*lov->lov_tgts) * lov->lov_tgt_size); lov->lov_tgt_size = 0; } + /* clear pools parent proc entry only after all pools is killed */ + lprocfs_obd_cleanup(obd); + RETURN(0); } diff --git a/lustre/lov/lov_pack.c b/lustre/lov/lov_pack.c index 12c2d28..7fd5470 100644 --- a/lustre/lov/lov_pack.c +++ b/lustre/lov/lov_pack.c @@ -484,8 +484,7 @@ static int __lov_setstripe(struct obd_export *exp, struct lov_stripe_md **lsmp, rc = lov_check_index_in_pool(lumv3.lmm_stripe_offset, pool); if (rc < 0) { - lh_put(lov->lov_pools_hash_body, - &pool->pool_hash); + lov_pool_putref(pool); RETURN(-EINVAL); } } @@ -493,7 +492,7 @@ static int __lov_setstripe(struct obd_export *exp, struct lov_stripe_md **lsmp, if (stripe_count > pool_tgt_count(pool)) stripe_count = pool_tgt_count(pool); - lh_put(lov->lov_pools_hash_body, &pool->pool_hash); + lov_pool_putref(pool); } if ((__u64)lumv1->lmm_stripe_size * stripe_count > ~0UL) { @@ -640,14 +639,21 @@ int lov_getstripe(struct obd_export *exp, struct lov_stripe_md *lsm, CLASSERT(sizeof lum.lmm_objects[0] == sizeof lmmk->lmm_objects[0]); + if ((cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) && + (lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC))) + lustre_swab_lov_mds_md(lmmk); /* User wasn't expecting this many OST entries */ if (lum.lmm_stripe_count == 0) { - if 
(copy_to_user(lump, lmmk, lum_size)) + copy_lov_mds2user(&lum, lmmk); + if (copy_to_user(lump, &lum, lum_size)) rc = -EFAULT; } else if (lum.lmm_stripe_count < lmmk->lmm_stripe_count) { rc = -EOVERFLOW; - } else if (copy_to_user(lump, lmmk, lmm_size)) - rc = -EFAULT; + } else { + copy_lov_mds2user(&lum, lmmk); + if (copy_to_user(lump, &lum, lmm_size)) + rc = -EFAULT; + } obd_free_diskmd(exp, &lmmk); } diff --git a/lustre/lov/lov_pool.c b/lustre/lov/lov_pool.c index 764a494..3df37a7 100644 --- a/lustre/lov/lov_pool.c +++ b/lustre/lov/lov_pool.c @@ -38,6 +38,8 @@ * OST pool methods * * Author: Jacques-Charles LAFOUCRIERE + * Author: Alex Lyashkov + * Author: Nathaniel Rutman */ #define DEBUG_SUBSYSTEM S_LOV @@ -51,15 +53,23 @@ #include #include "lov_internal.h" -static void lov_pool_getref(struct pool_desc *pool) { +static void lov_pool_getref(struct pool_desc *pool) +{ + CDEBUG(D_INFO, "pool %p\n", pool); atomic_inc(&pool->pool_refcount); } -static void lov_pool_putref(struct pool_desc *pool) { +void lov_pool_putref(struct pool_desc *pool) +{ + CDEBUG(D_INFO, "pool %p\n", pool); if (atomic_dec_and_test(&pool->pool_refcount)) { + LASSERT(hlist_unhashed(&pool->pool_hash)); + LASSERT(list_empty(&pool->pool_list)); + LASSERT(pool->pool_proc_entry == NULL); lov_ost_pool_free(&(pool->pool_rr.lqr_pool)); lov_ost_pool_free(&(pool->pool_obds)); OBD_FREE_PTR(pool); + EXIT; } } @@ -302,6 +312,8 @@ void lov_dump_pool(int level, struct pool_desc *pool) #define LOV_POOL_INIT_COUNT 2 int lov_ost_pool_init(struct ost_pool *op, unsigned int count) { + ENTRY; + if (count == 0) count = LOV_POOL_INIT_COUNT; op->op_array = NULL; @@ -311,8 +323,9 @@ int lov_ost_pool_init(struct ost_pool *op, unsigned int count) OBD_ALLOC(op->op_array, op->op_size * sizeof(op->op_array[0])); if (op->op_array == NULL) { op->op_size = 0; - return -ENOMEM; + RETURN(-ENOMEM); } + EXIT; return 0; } @@ -359,6 +372,7 @@ int lov_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int min_count) /* ost not found 
we add it */ op->op_array[op->op_count] = idx; op->op_count++; + EXIT; out: up_write(&op->op_rw_sem); return rc; @@ -367,6 +381,7 @@ out: int lov_ost_pool_remove(struct ost_pool *op, __u32 idx) { int i; + ENTRY; down_write(&op->op_rw_sem); @@ -376,18 +391,21 @@ int lov_ost_pool_remove(struct ost_pool *op, __u32 idx) (op->op_count - i - 1) * sizeof(op->op_array[0])); op->op_count--; up_write(&op->op_rw_sem); + EXIT; return 0; } } up_write(&op->op_rw_sem); - return -EINVAL; + RETURN(-EINVAL); } int lov_ost_pool_free(struct ost_pool *op) { + ENTRY; + if (op->op_size == 0) - return 0; + RETURN(0); down_write(&op->op_rw_sem); @@ -397,7 +415,7 @@ int lov_ost_pool_free(struct ost_pool *op) op->op_size = 0; up_write(&op->op_rw_sem); - return 0; + RETURN(0); } @@ -430,48 +448,54 @@ int lov_pool_new(struct obd_device *obd, char *poolname) memset(&(new_pool->pool_rr), 0, sizeof(struct lov_qos_rr)); rc = lov_ost_pool_init(&new_pool->pool_rr.lqr_pool, 0); - if (rc) { - lov_ost_pool_free(&new_pool->pool_obds); - GOTO(out_err, rc); - } + if (rc) + GOTO(out_free_pool_obds, rc); INIT_HLIST_NODE(&new_pool->pool_hash); - rc = lustre_hash_add_unique(lov->lov_pools_hash_body, poolname, - &new_pool->pool_hash); - if (rc) { - lov_ost_pool_free(&new_pool->pool_rr.lqr_pool); - lov_ost_pool_free(&new_pool->pool_obds); - GOTO(out_err, rc = -EEXIST); - } - - spin_lock(&obd->obd_dev_lock); - list_add_tail(&new_pool->pool_list, &lov->lov_pool_list); - lov->lov_pool_count++; - - spin_unlock(&obd->obd_dev_lock); - - CDEBUG(D_CONFIG, LOV_POOLNAMEF" is pool #%d\n", - poolname, lov->lov_pool_count); #ifdef LPROCFS - /* ifdef needed for liblustre */ + /* we need this assert seq_file is not implementated for liblustre */ /* get ref for /proc file */ lov_pool_getref(new_pool); new_pool->pool_proc_entry = lprocfs_add_simple(lov->lov_pool_proc_entry, poolname, NULL, NULL, new_pool, &pool_proc_operations); -#endif - if (IS_ERR(new_pool->pool_proc_entry)) { CWARN("Cannot add proc pool entry 
"LOV_POOLNAMEF"\n", poolname); new_pool->pool_proc_entry = NULL; lov_pool_putref(new_pool); } + CDEBUG(D_INFO, "pool %p - proc %p\n", new_pool, new_pool->pool_proc_entry); +#endif + + spin_lock(&obd->obd_dev_lock); + list_add_tail(&new_pool->pool_list, &lov->lov_pool_list); + lov->lov_pool_count++; + spin_unlock(&obd->obd_dev_lock); + + /* add to find only when it fully ready */ + rc = lustre_hash_add_unique(lov->lov_pools_hash_body, poolname, + &new_pool->pool_hash); + if (rc) + GOTO(out_err, rc = -EEXIST); + + CDEBUG(D_CONFIG, LOV_POOLNAMEF" is pool #%d\n", + poolname, lov->lov_pool_count); RETURN(0); out_err: + spin_lock(&obd->obd_dev_lock); + list_del_init(&new_pool->pool_list); + lov->lov_pool_count--; + spin_unlock(&obd->obd_dev_lock); + + lprocfs_remove(&new_pool->pool_proc_entry); + + lov_ost_pool_free(&new_pool->pool_rr.lqr_pool); +out_free_pool_obds: + lov_ost_pool_free(&new_pool->pool_obds); OBD_FREE_PTR(new_pool); return rc; } @@ -484,33 +508,23 @@ int lov_pool_del(struct obd_device *obd, char *poolname) lov = &(obd->u.lov); - spin_lock(&obd->obd_dev_lock); - - pool = lustre_hash_lookup(lov->lov_pools_hash_body, poolname); - if (pool == NULL) { - spin_unlock(&obd->obd_dev_lock); + /* lookup and kill hash reference */ + pool = lustre_hash_del_key(lov->lov_pools_hash_body, poolname); + if (pool == NULL) RETURN(-ENOENT); - } -#ifdef LPROCFS if (pool->pool_proc_entry != NULL) { - remove_proc_entry(pool->pool_proc_entry->name, - pool->pool_proc_entry->parent); - /* remove ref for /proc file */ + CDEBUG(D_INFO, "proc entry %p\n", pool->pool_proc_entry); + lprocfs_remove(&pool->pool_proc_entry); lov_pool_putref(pool); } -#endif - lustre_hash_del_key(lov->lov_pools_hash_body, poolname); + spin_lock(&obd->obd_dev_lock); list_del_init(&pool->pool_list); - lov->lov_pool_count--; - lh_put(lov->lov_pools_hash_body, &pool->pool_hash); spin_unlock(&obd->obd_dev_lock); - /* remove ref got when pool was created in memory - * pool will be freed when refount will reach 0 
- */ + /* release last reference */ lov_pool_putref(pool); RETURN(0); @@ -522,7 +536,7 @@ int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname) struct obd_uuid ost_uuid; struct lov_obd *lov; struct pool_desc *pool; - unsigned int i, lov_idx; + unsigned int lov_idx; int rc; ENTRY; @@ -536,22 +550,17 @@ int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname) /* search ost in lov array */ - mutex_down(&lov->lov_lock); - for (i = 0; i < lov->desc.ld_tgt_count; i++) { - if (!lov->lov_tgts[i]) + lov_getref(obd); + for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) { + if (!lov->lov_tgts[lov_idx]) continue; - if (obd_uuid_equals(&ost_uuid, &(lov->lov_tgts[i]->ltd_uuid))) + if (obd_uuid_equals(&ost_uuid, + &(lov->lov_tgts[lov_idx]->ltd_uuid))) break; } - /* test if ost found in lov */ - if (i == lov->desc.ld_tgt_count) { - mutex_up(&lov->lov_lock); + if (lov_idx == lov->desc.ld_tgt_count) GOTO(out, rc = -EINVAL); - } - mutex_up(&lov->lov_lock); - - lov_idx = i; rc = lov_ost_pool_add(&pool->pool_obds, lov_idx, lov->lov_tgt_size); if (rc) @@ -564,7 +573,8 @@ int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname) EXIT; out: - lh_put(lov->lov_pools_hash_body, &pool->pool_hash); + lov_putref(obd); + lov_pool_putref(pool); return rc; } @@ -573,39 +583,32 @@ int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname) struct obd_uuid ost_uuid; struct lov_obd *lov; struct pool_desc *pool; - unsigned int i, lov_idx; + unsigned int lov_idx; int rc = 0; ENTRY; lov = &(obd->u.lov); - spin_lock(&obd->obd_dev_lock); pool = lustre_hash_lookup(lov->lov_pools_hash_body, poolname); - if (pool == NULL) { - spin_unlock(&obd->obd_dev_lock); + if (pool == NULL) RETURN(-ENOENT); - } obd_str2uuid(&ost_uuid, ostname); + lov_getref(obd); /* search ost in lov array, to get index */ - for (i = 0; i < lov->desc.ld_tgt_count; i++) { - if (!lov->lov_tgts[i]) + for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) { + 
if (!lov->lov_tgts[lov_idx]) continue; - if (obd_uuid_equals(&ost_uuid, &(lov->lov_tgts[i]->ltd_uuid))) + if (obd_uuid_equals(&ost_uuid, + &(lov->lov_tgts[lov_idx]->ltd_uuid))) break; } /* test if ost found in lov */ - if (i == lov->desc.ld_tgt_count) { - spin_unlock(&obd->obd_dev_lock); + if (lov_idx == lov->desc.ld_tgt_count) GOTO(out, rc = -EINVAL); - } - - spin_unlock(&obd->obd_dev_lock); - - lov_idx = i; lov_ost_pool_remove(&pool->pool_obds, lov_idx); @@ -616,7 +619,8 @@ int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname) EXIT; out: - lh_put(lov->lov_pools_hash_body, &pool->pool_hash); + lov_putref(obd); + lov_pool_putref(pool); return rc; } @@ -660,7 +664,7 @@ struct pool_desc *lov_find_pool(struct lov_obd *lov, char *poolname) CWARN("Request for an empty pool ("LOV_POOLNAMEF")\n", poolname); /* pool is ignored, so we remove ref on it */ - lh_put(lov->lov_pools_hash_body, &pool->pool_hash); + lov_pool_putref(pool); pool = NULL; } } diff --git a/lustre/lov/lov_qos.c b/lustre/lov/lov_qos.c index 45245ed..94539b3 100644 --- a/lustre/lov/lov_qos.c +++ b/lustre/lov/lov_qos.c @@ -121,19 +121,16 @@ out: RETURN(rc); } -int qos_del_tgt(struct obd_device *obd, __u32 index) +int qos_del_tgt(struct obd_device *obd, struct lov_tgt_desc *tgt) { struct lov_obd *lov = &obd->u.lov; struct lov_qos_oss *oss; int rc = 0; ENTRY; - if (!lov->lov_tgts[index]) - RETURN(0); - down_write(&lov->lov_qos.lq_rw_sem); - oss = lov->lov_tgts[index]->ltd_qos.ltq_oss; + oss = tgt->ltd_qos.ltq_oss; if (!oss) GOTO(out, rc = -ENOENT); @@ -640,7 +637,7 @@ out: if (pool != NULL) { up_read(&pool_tgt_rw_sem(pool)); /* put back ref got by lov_find_pool() */ - lh_put(lov->lov_pools_hash_body, &pool->pool_hash); + lov_pool_putref(pool); } RETURN(rc); @@ -732,7 +729,7 @@ out: if (pool != NULL) { up_read(&pool_tgt_rw_sem(pool)); /* put back ref got by lov_find_pool() */ - lh_put(lov->lov_pools_hash_body, &pool->pool_hash); + lov_pool_putref(pool); } RETURN(rc); @@ -927,7 +924,7 @@ 
out_nolock: if (pool != NULL) { up_read(&pool_tgt_rw_sem(pool)); /* put back ref got by lov_find_pool() */ - lh_put(lov->lov_pools_hash_body, &pool->pool_hash); + lov_pool_putref(pool); } if (rc == -EAGAIN) diff --git a/lustre/lvfs/Makefile.in b/lustre/lvfs/Makefile.in index 4b8773b..80687ea 100644 --- a/lustre/lvfs/Makefile.in +++ b/lustre/lvfs/Makefile.in @@ -12,6 +12,11 @@ fsfilt_@BACKINGFS@-objs := fsfilt-@BACKINGFS@.o $(obj)/fsfilt-%.c: $(obj)/fsfilt_%.c ln -s $< $@ +EXTRA_DIST = $(lvfs-objs:.o=.c) $(quotafmt-objs:.o=.c) \ + fsfilt_ext3.c fsfilt_reiserfs.c \ + lvfs_internal.h lvfs_userfs.c \ + lustre_quota_fmt.c lustre_quota_fmt.h quotafmt_test.c + # for on 2.6 EXTRA_PRE_CFLAGS := -I@LINUX@/fs -I@LDISKFS_DIR@ -I@LDISKFS_DIR@/ldiskfs diff --git a/lustre/lvfs/autoMakefile.am b/lustre/lvfs/autoMakefile.am index b80a28d..a7122cc 100644 --- a/lustre/lvfs/autoMakefile.am +++ b/lustre/lvfs/autoMakefile.am @@ -101,10 +101,5 @@ endif # MODULES install-data-hook: $(install_data_hook) -DIST_SOURCES = fsfilt.c fsfilt_ext3.c fsfilt_reiserfs.c lvfs_common.c \ - lvfs_internal.h lvfs_linux.c lvfs_userfs.c \ - upcall_cache.c prng.c lvfs_lib.c \ - lustre_quota_fmt.c lustre_quota_fmt.h quotafmt_test.c - MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ CLEANFILES = fsfilt-*.c fsfilt_ldiskfs*.c fsfilt_extN.c sources diff --git a/lustre/lvfs/lvfs_linux.c b/lustre/lvfs/lvfs_linux.c index 5d07875..88b4334 100644 --- a/lustre/lvfs/lvfs_linux.c +++ b/lustre/lvfs/lvfs_linux.c @@ -421,6 +421,58 @@ long l_readdir(struct file *file, struct list_head *dentry_list) } EXPORT_SYMBOL(l_readdir); +int l_notify_change(struct vfsmount *mnt, struct dentry *dchild, + struct iattr *newattrs) +{ + int rc; + + LOCK_INODE_MUTEX(dchild->d_inode); +#ifdef HAVE_SECURITY_PLUG + rc = notify_change(dchild, mnt, newattrs); +#else + rc = notify_change(dchild, newattrs); +#endif + UNLOCK_INODE_MUTEX(dchild->d_inode); + return rc; +} +EXPORT_SYMBOL(l_notify_change); + +/* utility to truncate a file */ +int 
simple_truncate(struct dentry *dir, struct vfsmount *mnt, + char *name, loff_t length) +{ + struct dentry *dchild; + struct iattr newattrs; + int err = 0; + ENTRY; + + CDEBUG(D_INODE, "truncating file %.*s to %lld\n", (int)strlen(name), + name, (long long)length); + dchild = ll_lookup_one_len(name, dir, strlen(name)); + if (IS_ERR(dchild)) + GOTO(out, err = PTR_ERR(dchild)); + + if (dchild->d_inode) { + int old_mode = dchild->d_inode->i_mode; + if (S_ISDIR(old_mode)) { + CERROR("found %s (%lu/%u) is mode %o\n", name, + dchild->d_inode->i_ino, + dchild->d_inode->i_generation, old_mode); + GOTO(out_dput, err = -EISDIR); + } + + newattrs.ia_size = length; + newattrs.ia_valid = ATTR_SIZE; + err = l_notify_change(mnt, dchild, &newattrs); + } + EXIT; +out_dput: + dput(dchild); +out: + return err; +} +EXPORT_SYMBOL(simple_truncate); + #ifdef LUSTRE_KERNEL_VERSION #ifndef HAVE_CLEAR_RDONLY_ON_PUT #error rdonly patchset must be updated [cfs bz11248] diff --git a/lustre/mdc/Makefile.in b/lustre/mdc/Makefile.in index b9b9793..f007298 100644 --- a/lustre/mdc/Makefile.in +++ b/lustre/mdc/Makefile.in @@ -1,4 +1,6 @@ MODULES := mdc mdc-objs := mdc_request.o mdc_reint.o lproc_mdc.o mdc_lib.o mdc_locks.o +EXTRA_DIST = $(mdc-objs:.o=.c) mdc_internal.h + @INCLUDE_RULES@ diff --git a/lustre/mdc/autoMakefile.am b/lustre/mdc/autoMakefile.am index 65be657..ace974d 100644 --- a/lustre/mdc/autoMakefile.am +++ b/lustre/mdc/autoMakefile.am @@ -45,5 +45,4 @@ if MODULES modulefs_DATA = mdc$(KMODEXT) endif -DIST_SOURCES = $(mdc-objs:.o=.c) mdc_internal.h MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c index 48c79ee..a5698b5 100644 --- a/lustre/mdc/mdc_request.c +++ b/lustre/mdc/mdc_request.c @@ -961,7 +961,10 @@ int mdc_sendpage(struct obd_export *exp, const struct lu_fid *fid, ptlrpc_request_set_replen(req); rc = ptlrpc_queue_wait(req); - GOTO(out, rc); + if (rc) + GOTO(out, rc); + + rc = sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk); 
out: ptlrpc_req_finished(req); return rc; @@ -1011,6 +1014,13 @@ int mdc_readpage(struct obd_export *exp, const struct lu_fid *fid, RETURN(rc); } + rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, + req->rq_bulk->bd_nob_transferred); + if (rc < 0) { + ptlrpc_req_finished(req); + RETURN(rc); + } + if (req->rq_bulk->bd_nob_transferred != CFS_PAGE_SIZE) { CERROR("Unexpected # bytes transferred: %d (%ld expected)\n", req->rq_bulk->bd_nob_transferred, CFS_PAGE_SIZE); @@ -1789,7 +1799,7 @@ static int mdc_renew_capa(struct obd_export *exp, struct obd_capa *oc, } static int mdc_connect(const struct lu_env *env, - struct lustre_handle *dlm_handle, + struct obd_export **exp, struct obd_device *obd, struct obd_uuid *cluuid, struct obd_connect_data *data, void *localdata) @@ -1806,7 +1816,7 @@ static int mdc_connect(const struct lu_env *env, obd->obd_name); } - return client_connect_import(env, dlm_handle, obd, cluuid, data, NULL); + return client_connect_import(env, exp, obd, cluuid, data, NULL); } struct obd_ops mdc_obd_ops = { diff --git a/lustre/mdd/mdd_device.c b/lustre/mdd/mdd_device.c index e9bfcfc..07924a0 100644 --- a/lustre/mdd/mdd_device.c +++ b/lustre/mdd/mdd_device.c @@ -60,12 +60,16 @@ #include #include /* for changelogs */ #include +#include #include "mdd_internal.h" const struct md_device_operations mdd_ops; +static struct lu_device_type mdd_device_type; static const char mdd_root_dir_name[] = "ROOT"; +static const char mdd_obf_dir_name[] = "fid"; +static const char mdd_dot_lustre_name[] = ".lustre"; static int mdd_device_init(const struct lu_env *env, struct lu_device *d, const char *name, struct lu_device *next) @@ -112,6 +116,8 @@ static void mdd_device_shutdown(const struct lu_env *env, ENTRY; mdd_changelog_fini(env, m); dt_txn_callback_del(m->mdd_child, &m->mdd_txn_cb); + mdd_object_put(env, m->mdd_dot_lustre_objs.mdd_obf); + mdd_object_put(env, m->mdd_dot_lustre); if (m->mdd_obd_dev) mdd_fini_obd(env, m, cfg); orph_index_fini(env, m); @@ -300,6 
+306,369 @@ int mdd_changelog_write_header(struct mdd_device *mdd, int markerflags) RETURN(rc); } +/** + * Create ".lustre" directory. + */ +static int create_dot_lustre_dir(const struct lu_env *env, struct mdd_device *m) +{ + struct lu_fid *fid = &mdd_env_info(env)->mti_fid; + struct md_object *mdo; + int rc; + + memcpy(fid, &LU_DOT_LUSTRE_FID, sizeof(struct lu_fid)); + mdo = llo_store_create_index(env, &m->mdd_md_dev, m->mdd_child, + mdd_root_dir_name, mdd_dot_lustre_name, + fid, &dt_directory_features); + /* .lustre dir may be already present */ + if (IS_ERR(mdo) && PTR_ERR(mdo) != -EEXIST) { + rc = PTR_ERR(mdo); + CERROR("creating obj [%s] fid = "DFID" rc = %d\n", + mdd_dot_lustre_name, PFID(fid), rc); + RETURN(rc); + } + + return 0; +} + +static int dot_lustre_attr_get(const struct lu_env *env, struct md_object *obj, + struct md_attr *ma) +{ + struct mdd_object *mdd_obj = md2mdd_obj(obj); + + return mdd_attr_get_internal_locked(env, mdd_obj, ma); +} + +static int dot_lustre_attr_set(const struct lu_env *env, struct md_object *obj, + const struct md_attr *ma) +{ + return -EPERM; +} + +static int dot_lustre_xattr_get(const struct lu_env *env, + struct md_object *obj, struct lu_buf *buf, + const char *name) +{ + return 0; +} + +/** + * Direct access to the ".lustre" directory is not allowed. 
+ */ +static int dot_lustre_mdd_open(const struct lu_env *env, struct md_object *obj, + int flags) +{ + return -EPERM; +} + +static int dot_lustre_path(const struct lu_env *env, struct md_object *obj, + char *path, int pathlen, __u64 recno, int *linkno) +{ + return -ENOSYS; +} + +static struct md_object_operations mdd_dot_lustre_obj_ops = { + .moo_attr_get = dot_lustre_attr_get, + .moo_attr_set = dot_lustre_attr_set, + .moo_xattr_get = dot_lustre_xattr_get, + .moo_open = dot_lustre_mdd_open, + .moo_path = dot_lustre_path +}; + +static int dot_lustre_lookup(const struct lu_env *env, struct md_object *p, + const struct lu_name *lname, struct lu_fid *f, + struct md_op_spec *spec) +{ + if (strcmp(lname->ln_name, mdd_obf_dir_name) == 0) + *f = LU_OBF_FID; + else + return -ENOENT; + + return 0; +} + +static int dot_lustre_create(const struct lu_env *env, struct md_object *pobj, + const struct lu_name *lname, + struct md_object *child, struct md_op_spec *spec, + struct md_attr* ma) +{ + return -EPERM; +} + +static int dot_lustre_rename(const struct lu_env *env, + struct md_object *src_pobj, + struct md_object *tgt_pobj, + const struct lu_fid *lf, + const struct lu_name *lsname, + struct md_object *tobj, + const struct lu_name *ltname, struct md_attr *ma) +{ + return -EPERM; +} + +static int dot_lustre_link(const struct lu_env *env, struct md_object *tgt_obj, + struct md_object *src_obj, + const struct lu_name *lname, struct md_attr *ma) +{ + return -EPERM; +} + +static int dot_lustre_unlink(const struct lu_env *env, struct md_object *pobj, + struct md_object *cobj, const struct lu_name *lname, + struct md_attr *ma) +{ + return -EPERM; +} + +static struct md_dir_operations mdd_dot_lustre_dir_ops = { + .mdo_lookup = dot_lustre_lookup, + .mdo_create = dot_lustre_create, + .mdo_rename = dot_lustre_rename, + .mdo_link = dot_lustre_link, + .mdo_unlink = dot_lustre_unlink, +}; + +static int obf_attr_get(const struct lu_env *env, struct md_object *obj, + struct md_attr *ma) +{ + 
int rc = 0; + + if (ma->ma_need & MA_INODE) { + struct mdd_device *mdd = mdo2mdd(obj); + + /* "fid" is a virtual object and hence does not have any "real" + * attributes. So we reuse attributes of .lustre for "fid" dir */ + ma->ma_need |= MA_INODE; + rc = dot_lustre_attr_get(env, &mdd->mdd_dot_lustre->mod_obj, ma); + if (rc) + return rc; + ma->ma_valid |= MA_INODE; + } + + /* "fid" directory does not have any striping information. */ + if (ma->ma_need & MA_LOV) { + struct mdd_object *mdd_obj = md2mdd_obj(obj); + + if (ma->ma_valid & MA_LOV) + return 0; + + if (!(S_ISREG(mdd_object_type(mdd_obj)) || + S_ISDIR(mdd_object_type(mdd_obj)))) + return 0; + + if (ma->ma_need & MA_LOV_DEF) { + rc = mdd_get_default_md(mdd_obj, ma->ma_lmm, + &ma->ma_lmm_size); + if (rc > 0) { + ma->ma_valid |= MA_LOV; + rc = 0; + } + } + } + + return rc; +} + +static int obf_attr_set(const struct lu_env *env, struct md_object *obj, + const struct md_attr *ma) +{ + return -EPERM; +} + +static int obf_xattr_get(const struct lu_env *env, + struct md_object *obj, struct lu_buf *buf, + const char *name) +{ + return 0; +} + +static int obf_mdd_open(const struct lu_env *env, struct md_object *obj, + int flags) +{ + struct mdd_object *mdd_obj = md2mdd_obj(obj); + + mdd_write_lock(env, mdd_obj, MOR_TGT_CHILD); + mdd_obj->mod_count++; + mdd_write_unlock(env, mdd_obj); + + return 0; +} + +static int obf_mdd_close(const struct lu_env *env, struct md_object *obj, + struct md_attr *ma) +{ + struct mdd_object *mdd_obj = md2mdd_obj(obj); + + mdd_write_lock(env, mdd_obj, MOR_TGT_CHILD); + mdd_obj->mod_count--; + mdd_write_unlock(env, mdd_obj); + + return 0; +} + +/** Nothing to list in "fid" directory */ +static int obf_mdd_readpage(const struct lu_env *env, struct md_object *obj, + const struct lu_rdpg *rdpg) +{ + return -EPERM; +} + +static int obf_path(const struct lu_env *env, struct md_object *obj, + char *path, int pathlen, __u64 recno, int *linkno) +{ + return -ENOSYS; +} + +static struct 
md_object_operations mdd_obf_obj_ops = { + .moo_attr_get = obf_attr_get, + .moo_attr_set = obf_attr_set, + .moo_xattr_get = obf_xattr_get, + .moo_open = obf_mdd_open, + .moo_close = obf_mdd_close, + .moo_readpage = obf_mdd_readpage, + .moo_path = obf_path +}; + +/** + * Lookup method for "fid" object. Only filenames with correct SEQ:OID format + * are valid. We also check if object with passed fid exists or not. + */ +static int obf_lookup(const struct lu_env *env, struct md_object *p, + const struct lu_name *lname, struct lu_fid *f, + struct md_op_spec *spec) +{ + char *name = (char *)lname->ln_name; + struct mdd_device *mdd = mdo2mdd(p); + struct mdd_object *child; + int rc = 0; + + while (*name == '[') + name++; + + sscanf(name, SFID, &(f->f_seq), &(f->f_oid), + &(f->f_ver)); + if (!fid_is_sane(f)) { + CWARN("bad FID format [%s], should be "DFID"\n", lname->ln_name, + (__u64)1, 2, 0); + GOTO(out, rc = -EINVAL); + } + + /* Check if object with this fid exists */ + child = mdd_object_find(env, mdd, f); + if (child == NULL) + GOTO(out, rc = 0); + if (IS_ERR(child)) + GOTO(out, rc = PTR_ERR(child)); + + if (mdd_object_exists(child) == 0) + rc = -ENOENT; + + mdd_object_put(env, child); + +out: + return rc; +} + +static int obf_create(const struct lu_env *env, struct md_object *pobj, + const struct lu_name *lname, struct md_object *child, + struct md_op_spec *spec, struct md_attr* ma) +{ + return -EPERM; +} + +static int obf_rename(const struct lu_env *env, + struct md_object *src_pobj, struct md_object *tgt_pobj, + const struct lu_fid *lf, const struct lu_name *lsname, + struct md_object *tobj, const struct lu_name *ltname, + struct md_attr *ma) +{ + return -EPERM; +} + +static int obf_link(const struct lu_env *env, struct md_object *tgt_obj, + struct md_object *src_obj, const struct lu_name *lname, + struct md_attr *ma) +{ + return -EPERM; +} + +static int obf_unlink(const struct lu_env *env, struct md_object *pobj, + struct md_object *cobj, const struct lu_name 
*lname, + struct md_attr *ma) +{ + return -EPERM; +} + +static struct md_dir_operations mdd_obf_dir_ops = { + .mdo_lookup = obf_lookup, + .mdo_create = obf_create, + .mdo_rename = obf_rename, + .mdo_link = obf_link, + .mdo_unlink = obf_unlink +}; + +/** + * Create special in-memory "fid" object for open-by-fid. + */ +static int mdd_obf_setup(const struct lu_env *env, struct mdd_device *m) +{ + struct mdd_object *mdd_obf; + struct lu_object *obf_lu_obj; + int rc = 0; + + m->mdd_dot_lustre_objs.mdd_obf = mdd_object_find(env, m, + &LU_OBF_FID); + if (m->mdd_dot_lustre_objs.mdd_obf == NULL || + IS_ERR(m->mdd_dot_lustre_objs.mdd_obf)) + GOTO(out, rc = -ENOENT); + + mdd_obf = m->mdd_dot_lustre_objs.mdd_obf; + mdd_obf->mod_obj.mo_dir_ops = &mdd_obf_dir_ops; + mdd_obf->mod_obj.mo_ops = &mdd_obf_obj_ops; + /* Don't allow objects to be created in "fid" dir */ + mdd_obf->mod_flags |= IMMUTE_OBJ; + + obf_lu_obj = mdd2lu_obj(mdd_obf); + obf_lu_obj->lo_header->loh_attr |= (LOHA_EXISTS | S_IFDIR); + +out: + return rc; +} + +/** Setup ".lustre" directory object */ +static int mdd_dot_lustre_setup(const struct lu_env *env, struct mdd_device *m) +{ + struct dt_object *dt_dot_lustre; + struct lu_fid *fid = &mdd_env_info(env)->mti_fid; + int rc; + + rc = create_dot_lustre_dir(env, m); + if (rc) + return rc; + + dt_dot_lustre = dt_store_open(env, m->mdd_child, mdd_root_dir_name, + mdd_dot_lustre_name, fid); + if (IS_ERR(dt_dot_lustre)) { + rc = PTR_ERR(dt_dot_lustre); + GOTO(out, rc); + } + + /* references are released in mdd_device_shutdown() */ + m->mdd_dot_lustre = lu2mdd_obj(lu_object_locate(dt_dot_lustre->do_lu.lo_header, + &mdd_device_type)); + + lu_object_put(env, &dt_dot_lustre->do_lu); + + m->mdd_dot_lustre->mod_obj.mo_dir_ops = &mdd_dot_lustre_dir_ops; + m->mdd_dot_lustre->mod_obj.mo_ops = &mdd_dot_lustre_obj_ops; + + rc = mdd_obf_setup(env, m); + if (rc) + CERROR("Error initializing \"fid\" object - %d.\n", rc); + +out: + RETURN(rc); +} + static int mdd_process_config(const 
struct lu_env *env, struct lu_device *d, struct lustre_cfg *cfg) { @@ -435,8 +804,17 @@ static int mdd_prepare(const struct lu_env *env, LASSERT(root != NULL); lu_object_put(env, &root->do_lu); rc = orph_index_init(env, mdd); - } else + } else { rc = PTR_ERR(root); + } + if (rc) + GOTO(out, rc); + + rc = mdd_dot_lustre_setup(env, mdd); + if (rc) { + CERROR("Error(%d) initializing .lustre objects\n", rc); + GOTO(out, rc); + } out: RETURN(rc); diff --git a/lustre/mdd/mdd_internal.h b/lustre/mdd/mdd_internal.h index c855f80..0d66a68d 100644 --- a/lustre/mdd/mdd_internal.h +++ b/lustre/mdd/mdd_internal.h @@ -110,6 +110,11 @@ struct mdd_changelog { __u64 mc_starttime; }; +/** Objects in .lustre dir */ +struct mdd_dot_lustre_objs { + struct mdd_object *mdd_obf; +}; + struct mdd_device { struct md_device mdd_md_dev; struct dt_device *mdd_child; @@ -123,6 +128,8 @@ struct mdd_device { struct mdd_txn_op_descr mdd_tod[MDD_TXN_LAST_OP]; struct mdd_changelog mdd_cl; unsigned long mdd_atime_diff; + struct mdd_object *mdd_dot_lustre; + struct mdd_dot_lustre_objs mdd_dot_lustre_objs; }; enum mod_flags { @@ -362,6 +369,8 @@ extern const struct lu_device_operations mdd_lu_ops; struct mdd_object *mdd_object_find(const struct lu_env *env, struct mdd_device *d, const struct lu_fid *f); +int mdd_get_default_md(struct mdd_object *mdd_obj, struct lov_mds_md *lmm, + int *size); /* mdd_quota.c*/ #ifdef HAVE_QUOTA_SUPPORT diff --git a/lustre/mdd/mdd_object.c b/lustre/mdd/mdd_object.c index 0cf918a..9d0dbc1 100644 --- a/lustre/mdd/mdd_object.c +++ b/lustre/mdd/mdd_object.c @@ -582,8 +582,8 @@ int mdd_iattr_get(const struct lu_env *env, struct mdd_object *mdd_obj, RETURN(rc); } -static int mdd_get_default_md(struct mdd_object *mdd_obj, - struct lov_mds_md *lmm, int *size) +int mdd_get_default_md(struct mdd_object *mdd_obj, struct lov_mds_md *lmm, + int *size) { struct lov_desc *ldesc; struct mdd_device *mdd = mdo2mdd(&mdd_obj->mod_obj); diff --git a/lustre/mds/Makefile.in 
b/lustre/mds/Makefile.in index a6400b8..0bb2876 100644 --- a/lustre/mds/Makefile.in +++ b/lustre/mds/Makefile.in @@ -1,4 +1,6 @@ MODULES := mds mds-objs := handler.o lproc_mds.o mds_fs.o mds_log.o mds_lov.o +EXTRA_DIST := $(mds-objs:%.o=%.c) mds_internal.h + @INCLUDE_RULES@ diff --git a/lustre/mds/autoMakefile.am b/lustre/mds/autoMakefile.am index d2aafc6..4cc5dea 100644 --- a/lustre/mds/autoMakefile.am +++ b/lustre/mds/autoMakefile.am @@ -39,4 +39,3 @@ modulefs_DATA = mds$(KMODEXT) endif MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ -DIST_SOURCES := $(mds-objs:%.o=%.c) mds_internal.h diff --git a/lustre/mds/mds_lov.c b/lustre/mds/mds_lov.c index d0a2076..9c6142f 100644 --- a/lustre/mds/mds_lov.c +++ b/lustre/mds/mds_lov.c @@ -601,7 +601,6 @@ out: int mds_lov_connect(struct obd_device *obd, char * lov_name) { struct mds_obd *mds = &obd->u.mds; - struct lustre_handle conn = {0,}; struct obd_connect_data *data; int rc; ENTRY; @@ -655,14 +654,13 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name) /* send the list of supported checksum types */ data->ocd_cksum_types = OBD_CKSUM_ALL; /* NB: lov_connect() needs to fill in .ocd_index for each OST */ - rc = obd_connect(NULL, &conn, mds->mds_osc_obd, &obd->obd_uuid, data, NULL); + rc = obd_connect(NULL, &mds->mds_osc_exp, mds->mds_osc_obd, &obd->obd_uuid, data, NULL); OBD_FREE(data, sizeof(*data)); if (rc) { CERROR("MDS cannot connect to LOV %s (%d)\n", lov_name, rc); mds->mds_osc_obd = ERR_PTR(rc); RETURN(rc); } - mds->mds_osc_exp = class_conn2export(&conn); /* I want to see a callback happen when the OBD moves to a * "For General Use" state, and that's when we'll call diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c index 7acac5d..b46baaf 100644 --- a/lustre/mdt/mdt_handler.c +++ b/lustre/mdt/mdt_handler.c @@ -1168,6 +1168,10 @@ static int mdt_sendpage(struct mdt_thread_info *info, } LASSERT(desc->bd_nob == rdpg->rp_count); + rc = sptlrpc_svc_wrap_bulk(req, desc); + if (rc) + GOTO(free_desc, rc); + rc 
= ptlrpc_start_bulk_transfer(desc); if (rc) GOTO(free_desc, rc); @@ -1327,6 +1331,9 @@ static int mdt_writepage(struct mdt_thread_info *info) ptlrpc_prep_bulk_page(desc, page, (int)reqbody->size, (int)reqbody->nlink); + rc = sptlrpc_svc_prep_bulk(req, desc); + if (rc != 0) + GOTO(cleanup_page, rc); /* * Check if client was evicted while we were doing i/o before touching * network. @@ -2771,6 +2778,15 @@ static int mdt_handle0(struct ptlrpc_request *req, if (likely(rc == 0)) { rc = mdt_recovery(info); if (likely(rc == +1)) { + switch (lustre_msg_get_opc(msg)) { + case MDS_READPAGE: + req->rq_bulk_read = 1; + break; + case MDS_WRITEPAGE: + req->rq_bulk_write = 1; + break; + } + h = mdt_handler_find(lustre_msg_get_opc(msg), supported); if (likely(h != NULL)) { @@ -4826,39 +4842,40 @@ static int mdt_connect_check_sptlrpc(struct mdt_device *mdt, /* mds_connect copy */ static int mdt_obd_connect(const struct lu_env *env, - struct lustre_handle *conn, struct obd_device *obd, + struct obd_export **exp, struct obd_device *obd, struct obd_uuid *cluuid, struct obd_connect_data *data, void *localdata) { struct mdt_thread_info *info; struct lsd_client_data *lcd; - struct obd_export *exp; + struct obd_export *lexp; + struct lustre_handle conn = { 0 }; struct mdt_device *mdt; struct ptlrpc_request *req; int rc; ENTRY; LASSERT(env != NULL); - if (!conn || !obd || !cluuid) + if (!exp || !obd || !cluuid) RETURN(-EINVAL); info = lu_context_key_get(&env->le_ctx, &mdt_thread_key); req = info->mti_pill->rc_req; mdt = mdt_dev(obd->obd_lu_dev); - rc = class_connect(conn, obd, cluuid); + rc = class_connect(&conn, obd, cluuid); if (rc) RETURN(rc); - exp = class_conn2export(conn); - LASSERT(exp != NULL); + lexp = class_conn2export(&conn); + LASSERT(lexp != NULL); - rc = mdt_connect_check_sptlrpc(mdt, exp, req); + rc = mdt_connect_check_sptlrpc(mdt, lexp, req); if (rc) GOTO(out, rc); - rc = mdt_connect_internal(exp, mdt, data); + rc = mdt_connect_internal(lexp, mdt, data); if (rc == 0) { 
OBD_ALLOC_PTR(lcd); if (lcd != NULL) { @@ -4866,15 +4883,15 @@ static int mdt_obd_connect(const struct lu_env *env, mti = lu_context_key_get(&env->le_ctx, &mdt_thread_key); LASSERT(mti != NULL); - mti->mti_exp = exp; + mti->mti_exp = lexp; memcpy(lcd->lcd_uuid, cluuid, sizeof lcd->lcd_uuid); - exp->exp_mdt_data.med_lcd = lcd; + lexp->exp_mdt_data.med_lcd = lcd; rc = mdt_client_new(env, mdt); if (rc != 0) { OBD_FREE_PTR(lcd); - exp->exp_mdt_data.med_lcd = NULL; + lexp->exp_mdt_data.med_lcd = NULL; } else { - mdt_export_stats_init(obd, exp, localdata); + mdt_export_stats_init(obd, lexp, localdata); } } else rc = -ENOMEM; @@ -4882,9 +4899,9 @@ static int mdt_obd_connect(const struct lu_env *env, out: if (rc != 0) - class_disconnect(exp); + class_disconnect(lexp); else - class_export_put(exp); + *exp = lexp; RETURN(rc); } diff --git a/lustre/mdt/mdt_open.c b/lustre/mdt/mdt_open.c index e2e8802..ba2d4c2 100644 --- a/lustre/mdt/mdt_open.c +++ b/lustre/mdt/mdt_open.c @@ -896,9 +896,9 @@ int mdt_reint_open(struct mdt_thread_info *info, struct mdt_lock_handle *lhc) LASSERT(info->mti_pill->rc_fmt == &RQF_LDLM_INTENT_OPEN); ldlm_rep = req_capsule_server_get(info->mti_pill, &RMF_DLM_REP); - /* TODO: JOIN file */ + /* JOIN file was deprecated since 1.6.5, but may be revived one day */ if (create_flags & MDS_OPEN_JOIN_FILE) { - CERROR("JOIN file will be supported soon\n"); + CERROR("file join is unsupported in this version of Lustre\n"); GOTO(out, result = err_serious(-EOPNOTSUPP)); } msg_flags = lustre_msg_get_flags(req->rq_reqmsg); @@ -1234,8 +1234,11 @@ int mdt_close(struct mdt_thread_info *info) req_capsule_set_size(info->mti_pill, &RMF_LOGCOOKIES, RCL_SERVER, info->mti_mdt->mdt_max_cookiesize); rc = req_capsule_server_pack(info->mti_pill); - if (mdt_check_resent(info, mdt_reconstruct_generic, NULL)) + if (mdt_check_resent(info, mdt_reconstruct_generic, NULL)) { + if (rc == 0) + mdt_shrink_reply(info); RETURN(lustre_msg_get_status(req->rq_repmsg)); + } /* Continue to close 
handle even if we can not pack reply */ if (rc == 0) { diff --git a/lustre/mdt/mdt_recovery.c b/lustre/mdt/mdt_recovery.c index b5e263e..c32ae5a 100644 --- a/lustre/mdt/mdt_recovery.c +++ b/lustre/mdt/mdt_recovery.c @@ -49,7 +49,8 @@ #include "mdt_internal.h" static int mdt_server_data_update(const struct lu_env *env, - struct mdt_device *mdt); + struct mdt_device *mdt, + int need_sync); struct lu_buf *mdt_buf(const struct lu_env *env, void *area, ssize_t len) { @@ -243,8 +244,16 @@ static inline int mdt_last_rcvd_header_read(const struct lu_env *env, return rc; } +static void mdt_client_cb(const struct mdt_device *mdt, __u64 transno, + void *data, int err) +{ + struct obd_device *obd = mdt2obd_dev(mdt); + target_client_add_cb(obd, transno, data, err); +} + static inline int mdt_last_rcvd_header_write(const struct lu_env *env, - struct mdt_device *mdt) + struct mdt_device *mdt, + int need_sync) { struct mdt_thread_info *mti; struct thandle *th; @@ -253,6 +262,11 @@ static inline int mdt_last_rcvd_header_write(const struct lu_env *env, mti = lu_context_key_get(&env->le_ctx, &mdt_thread_key); + if (mti->mti_exp) { + spin_lock(&mti->mti_exp->exp_lock); + mti->mti_exp->exp_need_sync = need_sync; + spin_unlock(&mti->mti_exp->exp_lock); + } mdt_trans_credit_init(env, mdt, MDT_TXN_LAST_RCVD_WRITE_OP); th = mdt_trans_start(env, mdt); if (IS_ERR(th)) @@ -261,6 +275,9 @@ static inline int mdt_last_rcvd_header_write(const struct lu_env *env, mti->mti_off = 0; lsd_cpu_to_le(&mdt->mdt_lsd, &mti->mti_lsd); + if (need_sync && mti->mti_exp) + mdt_trans_add_cb(th, mdt_client_cb, mti->mti_exp); + rc = mdt_record_write(env, mdt->mdt_last_rcvd, mdt_buf_const(env, &mti->mti_lsd, sizeof(mti->mti_lsd)), @@ -561,7 +578,8 @@ static int mdt_server_data_init(const struct lu_env *env, lsd->lsd_mount_count = mdt->mdt_mount_count; /* save it, so mount count and last_transno is current */ - rc = mdt_server_data_update(env, mdt); + rc = mdt_server_data_update(env, mdt, (mti->mti_exp && + 
mti->mti_exp->exp_need_sync)); if (rc) GOTO(err_client, rc); @@ -574,7 +592,8 @@ out: } static int mdt_server_data_update(const struct lu_env *env, - struct mdt_device *mdt) + struct mdt_device *mdt, + int need_sync) { int rc = 0; ENTRY; @@ -591,18 +610,10 @@ static int mdt_server_data_update(const struct lu_env *env, * mdt->mdt_last_rcvd may be NULL that time. */ if (mdt->mdt_last_rcvd != NULL) - rc = mdt_last_rcvd_header_write(env, mdt); + rc = mdt_last_rcvd_header_write(env, mdt, need_sync); RETURN(rc); } -void mdt_cb_new_client(const struct mdt_device *mdt, __u64 transno, - void *data, int err) -{ - struct obd_device *obd = mdt2obd_dev(mdt); - - target_client_add_cb(obd, transno, data, err); -} - int mdt_client_new(const struct lu_env *env, struct mdt_device *mdt) { unsigned long *bitmap = mdt->mdt_client_bitmap; @@ -651,16 +662,22 @@ int mdt_client_new(const struct lu_env *env, struct mdt_device *mdt) init_mutex(&med->med_lcd_lock); LASSERTF(med->med_lr_off > 0, "med_lr_off = %llu\n", med->med_lr_off); - /* write new client data */ + + /* Write new client data. */ off = med->med_lr_off; mdt_trans_credit_init(env, mdt, MDT_TXN_LAST_RCVD_WRITE_OP); + th = mdt_trans_start(env, mdt); if (IS_ERR(th)) RETURN(PTR_ERR(th)); - /* until this operations will be committed the sync is needed for this - * export */ - mdt_trans_add_cb(th, mdt_cb_new_client, mti->mti_exp); + /* + * Until this operations will be committed the sync is needed + * for this export. This should be done _after_ starting the + * transaction so that many connecting clients will not bring + * server down with lots of sync writes. 
+ */ + mdt_trans_add_cb(th, mdt_client_cb, mti->mti_exp); spin_lock(&mti->mti_exp->exp_lock); mti->mti_exp->exp_need_sync = 1; spin_unlock(&mti->mti_exp->exp_lock); @@ -730,21 +747,24 @@ int mdt_client_del(const struct lu_env *env, struct mdt_device *mdt) struct mdt_export_data *med; struct lsd_client_data *lcd; struct obd_device *obd = mdt2obd_dev(mdt); - struct thandle *th; - loff_t off; - int rc = 0; + struct obd_export *exp; + struct thandle *th; + int need_sync; + loff_t off; + int rc = 0; ENTRY; mti = lu_context_key_get(&env->le_ctx, &mdt_thread_key); LASSERT(mti != NULL); - med = &mti->mti_exp->exp_mdt_data; + exp = mti->mti_exp; + med = &exp->exp_mdt_data; lcd = med->med_lcd; if (!lcd) RETURN(0); /* XXX: If lcd_uuid were a real obd_uuid, I could use obd_uuid_equals */ - if (!strcmp(med->med_lcd->lcd_uuid, obd->obd_uuid.uuid)) + if (!strcmp(lcd->lcd_uuid, obd->obd_uuid.uuid)) GOTO(free, 0); CDEBUG(D_INFO, "freeing client at idx %u, offset %lld\n", @@ -772,16 +792,34 @@ int mdt_client_del(const struct lu_env *env, struct mdt_device *mdt) LBUG(); } + /* Don't force sync on disconnect if aborting recovery, + * or it does num_clients * num_osts. b=17194 */ + need_sync = (!exp->exp_libclient || exp->exp_need_sync) && + !(exp->exp_flags & OBD_OPT_ABORT_RECOV); + /* * This may be called from difficult reply handler path and * mdt->mdt_last_rcvd may be NULL that time. */ if (mdt->mdt_last_rcvd != NULL) { mdt_trans_credit_init(env, mdt, MDT_TXN_LAST_RCVD_WRITE_OP); + + spin_lock(&exp->exp_lock); + exp->exp_need_sync = need_sync; + spin_unlock(&exp->exp_lock); + th = mdt_trans_start(env, mdt); if (IS_ERR(th)) GOTO(free, rc = PTR_ERR(th)); + if (need_sync) { + /* + * Until this operations will be committed the sync + * is needed for this export. + */ + mdt_trans_add_cb(th, mdt_client_cb, exp); + } + mutex_down(&med->med_lcd_lock); memset(lcd, 0, sizeof *lcd); @@ -791,18 +829,20 @@ int mdt_client_del(const struct lu_env *env, struct mdt_device *mdt) } CDEBUG(rc == 0 ? 
D_INFO : D_ERROR, "Zeroing out client idx %u in " - "%s rc %d\n", med->med_lr_idx, LAST_RCVD, rc); + "%s %ssync rc %d\n", med->med_lr_idx, LAST_RCVD, + need_sync ? "" : "a", rc); spin_lock(&mdt->mdt_client_bitmap_lock); clear_bit(med->med_lr_idx, mdt->mdt_client_bitmap); spin_unlock(&mdt->mdt_client_bitmap_lock); - /* - * Make sure the server's last_transno is up to date. Do this after the - * client is freed so we know all the client's transactions have been - * committed. + /* + * Make sure the server's last_transno is up to date. Do this + * after the client is freed so we know all the client's + * transactions have been committed. */ - mdt_server_data_update(env, mdt); + mdt_server_data_update(env, mdt, need_sync); + EXIT; free: OBD_FREE_PTR(lcd); @@ -866,7 +906,9 @@ static int mdt_last_rcvd_update(struct mdt_thread_info *mti, */ if (mti->mti_transno == 0 && *transno_p == mdt->mdt_last_transno) - mdt_server_data_update(mti->mti_env, mdt); + mdt_server_data_update(mti->mti_env, mdt, + (mti->mti_exp && + mti->mti_exp->exp_need_sync)); *transno_p = mti->mti_transno; diff --git a/lustre/mgc/Makefile.in b/lustre/mgc/Makefile.in index 8adca32..7ce8a37 100644 --- a/lustre/mgc/Makefile.in +++ b/lustre/mgc/Makefile.in @@ -1,4 +1,6 @@ MODULES := mgc mgc-objs := mgc_request.o lproc_mgc.o +EXTRA_DIST := $(mgc-objs:%.o=%.c) libmgc.c mgc_internal.h + @INCLUDE_RULES@ diff --git a/lustre/mgc/autoMakefile.am b/lustre/mgc/autoMakefile.am index db9a433..e337ea9 100644 --- a/lustre/mgc/autoMakefile.am +++ b/lustre/mgc/autoMakefile.am @@ -46,4 +46,3 @@ modulefs_DATA = mgc$(KMODEXT) endif MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ -DIST_SOURCES := $(mgc-objs:%.o=%.c) libmgc.c mgc_internal.h diff --git a/lustre/mgs/Makefile.in b/lustre/mgs/Makefile.in index 8bb6a5f..413f381 100644 --- a/lustre/mgs/Makefile.in +++ b/lustre/mgs/Makefile.in @@ -1,4 +1,6 @@ MODULES := mgs mgs-objs := mgs_handler.o mgs_fs.o mgs_llog.o lproc_mgs.o +EXTRA_DIST := $(mgs-objs:%.o=%.c) mgs_internal.h + 
@INCLUDE_RULES@ diff --git a/lustre/mgs/autoMakefile.am b/lustre/mgs/autoMakefile.am index c538cb4..a57c433 100644 --- a/lustre/mgs/autoMakefile.am +++ b/lustre/mgs/autoMakefile.am @@ -39,4 +39,3 @@ modulefs_DATA = mgs$(KMODEXT) endif MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ -DIST_SOURCES := $(mgs-objs:%.o=%.c) mgs_internal.h diff --git a/lustre/mgs/mgs_handler.c b/lustre/mgs/mgs_handler.c index 581f1e1..8861a75 100644 --- a/lustre/mgs/mgs_handler.c +++ b/lustre/mgs/mgs_handler.c @@ -63,37 +63,39 @@ /* Establish a connection to the MGS.*/ static int mgs_connect(const struct lu_env *env, - struct lustre_handle *conn, struct obd_device *obd, + struct obd_export **exp, struct obd_device *obd, struct obd_uuid *cluuid, struct obd_connect_data *data, void *localdata) { - struct obd_export *exp; + struct obd_export *lexp; + struct lustre_handle conn = { 0 }; int rc; ENTRY; - if (!conn || !obd || !cluuid) + if (!exp || !obd || !cluuid) RETURN(-EINVAL); - rc = class_connect(conn, obd, cluuid); + rc = class_connect(&conn, obd, cluuid); if (rc) RETURN(rc); - exp = class_conn2export(conn); - LASSERT(exp); - mgs_counter_incr(exp, LPROC_MGS_CONNECT); + lexp = class_conn2export(&conn); + LASSERT(lexp); + + mgs_counter_incr(lexp, LPROC_MGS_CONNECT); if (data != NULL) { data->ocd_connect_flags &= MGS_CONNECT_SUPPORTED; - exp->exp_connect_flags = data->ocd_connect_flags; + lexp->exp_connect_flags = data->ocd_connect_flags; data->ocd_version = LUSTRE_VERSION_CODE; } - rc = mgs_client_add(obd, exp, localdata); + rc = mgs_client_add(obd, lexp, localdata); if (rc) { - class_disconnect(exp); + class_disconnect(lexp); } else { - class_export_put(exp); + *exp = lexp; } RETURN(rc); @@ -220,7 +222,11 @@ static int mgs_setup(struct obd_device *obd, struct lustre_cfg *lcfg) ptlrpc_init_client(LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL, "mgs_ldlm_client", &obd->obd_ldlm_client); - LASSERT(!lvfs_check_rdonly(lvfs_sbdev(mnt->mnt_sb))); + if (lvfs_check_rdonly(lvfs_sbdev(mnt->mnt_sb))) { + 
CERROR("%s: Underlying device is marked as read-only. " + "Setup failed\n", obd->obd_name); + GOTO(err_ops, rc = -EROFS); + } rc = mgs_fs_setup(obd, mnt); if (rc) { diff --git a/lustre/mgs/mgs_llog.c b/lustre/mgs/mgs_llog.c index e328f33..e5adb2f 100644 --- a/lustre/mgs/mgs_llog.c +++ b/lustre/mgs/mgs_llog.c @@ -2032,7 +2032,7 @@ static int mgs_srpc_set_param_mem(struct fs_db *fsdb, rset = &fsdb->fsdb_srpc_gen; } - rc = sptlrpc_rule_set_merge(rset, &rule, 1); + rc = sptlrpc_rule_set_merge(rset, &rule); RETURN(rc); } @@ -2046,6 +2046,9 @@ static int mgs_srpc_set_param(struct obd_device *obd, int rc, copy_size; ENTRY; +#ifndef HAVE_GSS + RETURN(-EINVAL); +#endif /* keep a copy of original param, which could be destroied * during parsing */ copy_size = strlen(param) + 1; diff --git a/lustre/obdclass/Makefile.in b/lustre/obdclass/Makefile.in index 90d898e..1ab1d54 100644 --- a/lustre/obdclass/Makefile.in +++ b/lustre/obdclass/Makefile.in @@ -26,4 +26,7 @@ llog_test-objs := llog-test.o $(obj)/llog-test.c: $(obj)/llog_test.c ln -sf $< $@ +EXTRA_DIST = $(filter-out llog-test.c,$(obdclass-all-objs:.o=.c)) $(llog-test-objs:.o=.c) llog_test.c llog_internal.h +EXTRA_DIST += cl_internal.h + @INCLUDE_RULES@ diff --git a/lustre/obdclass/autoMakefile.am b/lustre/obdclass/autoMakefile.am index b7fb43e..af30e10 100644 --- a/lustre/obdclass/autoMakefile.am +++ b/lustre/obdclass/autoMakefile.am @@ -54,4 +54,3 @@ install-data-hook: $(install_data_hook) MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ llog-test.c MOSTLYCLEANFILES += linux/*.o darwin/*.o -DIST_SOURCES = $(filter-out llog-test.c,$(obdclass-all-objs:.o=.c)) $(llog-test-objs:.o=.c) llog_test.c llog_internal.h cl_internal.h diff --git a/lustre/obdclass/cl_page.c b/lustre/obdclass/cl_page.c index feac1ff..e88427b 100644 --- a/lustre/obdclass/cl_page.c +++ b/lustre/obdclass/cl_page.c @@ -1259,7 +1259,12 @@ void cl_page_completion(const struct lu_env *env, (const struct lu_env *, const struct cl_page_slice *, int), ioret); - 
KLASSERT(!PageWriteback(cl_page_vmpage(env, pg))); + /* Don't assert the page writeback bit here because the lustre file + * may be as a backend of swap space. in this case, the page writeback + * is set by VM, and obvious we shouldn't clear it at all. Fortunately + * this type of pages are all TRANSIENT pages. */ + KLASSERT(ergo(pg->cp_type == CPT_CACHEABLE, + !PageWriteback(cl_page_vmpage(env, pg)))); EXIT; } EXPORT_SYMBOL(cl_page_completion); diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c index 63f443a..7341aaa 100644 --- a/lustre/obdclass/genops.c +++ b/lustre/obdclass/genops.c @@ -1068,7 +1068,8 @@ int class_disconnect(struct obd_export *export) RETURN(0); } -static void class_disconnect_export_list(struct list_head *list, int flags) +static void class_disconnect_export_list(struct list_head *list, + enum obd_option flags) { int rc; struct lustre_handle fake_conn; @@ -1118,12 +1119,6 @@ static void class_disconnect_export_list(struct list_head *list, int flags) EXIT; } -static inline int get_exp_flags_from_obd(struct obd_device *obd) -{ - return ((obd->obd_fail ? OBD_OPT_FAILOVER : 0) | - (obd->obd_force ? OBD_OPT_FORCE : 0)); -} - void class_disconnect_exports(struct obd_device *obd) { struct list_head work_list; @@ -1139,7 +1134,7 @@ void class_disconnect_exports(struct obd_device *obd) CDEBUG(D_HA, "OBD device %d (%p) has exports, " "disconnecting them\n", obd->obd_minor, obd); class_disconnect_export_list(&work_list, - get_exp_flags_from_obd(obd)); + exp_flags_from_obd(obd)); } else CDEBUG(D_HA, "OBD device %d (%p) has no exports\n", obd->obd_minor, obd); @@ -1150,7 +1145,8 @@ EXPORT_SYMBOL(class_disconnect_exports); /* Remove exports that have not completed recovery. 
*/ int class_disconnect_stale_exports(struct obd_device *obd, - int (*test_export)(struct obd_export *)) + int (*test_export)(struct obd_export *), + enum obd_option flags) { struct list_head work_list; struct list_head *pos, *n; @@ -1182,7 +1178,7 @@ int class_disconnect_stale_exports(struct obd_device *obd, CDEBUG(D_ERROR, "%s: disconnecting %d stale clients\n", obd->obd_name, cnt); - class_disconnect_export_list(&work_list, get_exp_flags_from_obd(obd)); + class_disconnect_export_list(&work_list, flags); RETURN(cnt); } EXPORT_SYMBOL(class_disconnect_stale_exports); diff --git a/lustre/obdclass/llog_test.c b/lustre/obdclass/llog_test.c index 21be99b..ce887c8 100644 --- a/lustre/obdclass/llog_test.c +++ b/lustre/obdclass/llog_test.c @@ -501,7 +501,6 @@ static int llog_test_6(struct obd_device *obd, char *name) struct obd_device *mgc_obd; struct llog_ctxt *ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); struct obd_uuid *mgs_uuid = &ctxt->loc_exp->exp_obd->obd_uuid; - struct lustre_handle exph = {0, }; struct obd_export *exp; struct obd_uuid uuid = {"LLOG_TEST6_UUID"}; struct llog_handle *llh = NULL; @@ -516,13 +515,13 @@ static int llog_test_6(struct obd_device *obd, char *name) GOTO(ctxt_release, rc = -ENOENT); } - rc = obd_connect(NULL, &exph, mgc_obd, &uuid, + rc = obd_connect(NULL, &exp, mgc_obd, &uuid, NULL /* obd_connect_data */, NULL); if (rc) { CERROR("6: failed to connect to MGC: %s\n", mgc_obd->obd_name); GOTO(ctxt_release, rc); } - exp = class_conn2export(&exph); + LASSERTF(exp->exp_obd == mgc_obd, "%p - %p - %p\n", exp, exp->exp_obd, mgc_obd); nctxt = llog_get_context(mgc_obd, LLOG_CONFIG_REPL_CTXT); rc = llog_create(nctxt, &llh, NULL, name); @@ -552,6 +551,8 @@ parse_out: if (rc) { CERROR("6: llog_close failed: rc = %d\n", rc); } + CDEBUG(D_INFO, "obd %p - %p - %p - %p\n", + mgc_obd, exp, exp->exp_obd, exp->exp_obd->obd_type); rc = obd_disconnect(exp); ctxt_release: llog_ctxt_put(ctxt); diff --git a/lustre/obdclass/lu_object.c 
b/lustre/obdclass/lu_object.c index 5cae3c1..d5f6f5c 100644 --- a/lustre/obdclass/lu_object.c +++ b/lustre/obdclass/lu_object.c @@ -1193,6 +1193,7 @@ void *lu_context_key_get(const struct lu_context *ctx, { LINVRNT(ctx->lc_state == LCS_ENTERED); LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys)); + LASSERT(lu_keys[key->lct_index] == key); return ctx->lc_value[key->lct_index]; } EXPORT_SYMBOL(lu_context_key_get); diff --git a/lustre/obdclass/obd_config.c b/lustre/obdclass/obd_config.c index eb8d415..7783a3a 100644 --- a/lustre/obdclass/obd_config.c +++ b/lustre/obdclass/obd_config.c @@ -598,9 +598,7 @@ void class_decref(struct obd_device *obd, const char *scope, const void *source) be no more in-progress ops by this point.*/ spin_lock(&obd->obd_self_export->exp_lock); - obd->obd_self_export->exp_flags |= - (obd->obd_fail ? OBD_OPT_FAILOVER : 0) | - (obd->obd_force ? OBD_OPT_FORCE : 0); + obd->obd_self_export->exp_flags |= exp_flags_from_obd(obd); spin_unlock(&obd->obd_self_export->exp_lock); /* note that we'll recurse into class_decref again */ diff --git a/lustre/obdclass/obd_mount.c b/lustre/obdclass/obd_mount.c index f568574..04d012c 100644 --- a/lustre/obdclass/obd_mount.c +++ b/lustre/obdclass/obd_mount.c @@ -567,7 +567,6 @@ DECLARE_MUTEX(mgc_start_lock); */ static int lustre_start_mgc(struct super_block *sb) { - struct lustre_handle mgc_conn = {0, }; struct obd_connect_data *data = NULL; struct lustre_sb_info *lsi = s2lsi(sb); struct obd_device *obd; @@ -768,14 +767,13 @@ static int lustre_start_mgc(struct super_block *sb) data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_FID | OBD_CONNECT_AT; data->ocd_version = LUSTRE_VERSION_CODE; - rc = obd_connect(NULL, &mgc_conn, obd, &(obd->obd_uuid), data, NULL); + rc = obd_connect(NULL, &exp, obd, &(obd->obd_uuid), data, NULL); OBD_FREE_PTR(data); if (rc) { CERROR("connect failed %d\n", rc); GOTO(out, rc); } - exp = class_conn2export(&mgc_conn); obd->u.cli.cl_mgc_mgsexp = exp; out: @@ 
-1358,6 +1356,10 @@ static struct vfsmount *server_kernel_mount(struct super_block *sb) GOTO(out_free, rc); } + if (lmd->lmd_flags & LMD_FLG_ABORT_RECOV) + simple_truncate(mnt->mnt_sb->s_root, mnt, LAST_RCVD, + LR_CLIENT_START); + OBD_PAGE_FREE(__page); lsi->lsi_ldd = ldd; /* freed at lsi cleanup */ CDEBUG(D_SUPER, "%s: mnt = %p\n", lmd->lmd_dev, mnt); diff --git a/lustre/obdecho/Makefile.in b/lustre/obdecho/Makefile.in index 66e61ed..c9069e5 100644 --- a/lustre/obdecho/Makefile.in +++ b/lustre/obdecho/Makefile.in @@ -1,4 +1,6 @@ MODULES := obdecho obdecho-objs := echo.o echo_client.o lproc_echo.o +EXTRA_DIST = $(obdecho-objs:%.o=%.c) echo_internal.h + @INCLUDE_RULES@ diff --git a/lustre/obdecho/autoMakefile.am b/lustre/obdecho/autoMakefile.am index c8b7df3..313b0f8 100644 --- a/lustre/obdecho/autoMakefile.am +++ b/lustre/obdecho/autoMakefile.am @@ -68,4 +68,3 @@ endif # MODULES install-data-hook: $(install_data_hook) MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ -DIST_SOURCES = $(obdecho-objs:%.o=%.c) echo_internal.h diff --git a/lustre/obdecho/echo.c b/lustre/obdecho/echo.c index d69cf59..6684c7a 100644 --- a/lustre/obdecho/echo.c +++ b/lustre/obdecho/echo.c @@ -66,12 +66,22 @@ enum { }; static int echo_connect(const struct lu_env *env, - struct lustre_handle *conn, struct obd_device *obd, + struct obd_export **exp, struct obd_device *obd, struct obd_uuid *cluuid, struct obd_connect_data *data, void *localdata) { + struct lustre_handle conn = { 0 }; + int rc; + data->ocd_connect_flags &= ECHO_CONNECT_SUPPORTED; - return class_connect(conn, obd, cluuid); + rc = class_connect(&conn, obd, cluuid); + if (rc) { + CERROR("can't connect %d\n", rc); + return rc; + } + *exp = class_conn2export(&conn); + + return 0; } static int echo_disconnect(struct obd_export *exp) diff --git a/lustre/obdecho/echo_client.c b/lustre/obdecho/echo_client.c index 01b9572..16cab35 100644 --- a/lustre/obdecho/echo_client.c +++ b/lustre/obdecho/echo_client.c @@ -1870,7 +1870,6 @@ static int 
echo_client_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) { struct echo_client_obd *ec = &obddev->u.echo_client; struct obd_device *tgt; - struct lustre_handle conn = {0, }; struct obd_uuid echo_uuid = { "ECHO_UUID" }; struct obd_connect_data *ocd = NULL; int rc; @@ -1906,7 +1905,7 @@ static int echo_client_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) ocd->ocd_version = LUSTRE_VERSION_CODE; ocd->ocd_group = FILTER_GROUP_ECHO; - rc = obd_connect(NULL, &conn, tgt, &echo_uuid, ocd, NULL); + rc = obd_connect(NULL, &ec->ec_exp, tgt, &echo_uuid, ocd, NULL); OBD_FREE(ocd, sizeof(*ocd)); @@ -1915,7 +1914,6 @@ static int echo_client_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) lustre_cfg_string(lcfg, 1)); return (rc); } - ec->ec_exp = class_conn2export(&conn); RETURN(rc); } @@ -1939,18 +1937,17 @@ static int echo_client_cleanup(struct obd_device *obddev) } static int echo_client_connect(const struct lu_env *env, - struct lustre_handle *conn, + struct obd_export **exp, struct obd_device *src, struct obd_uuid *cluuid, struct obd_connect_data *data, void *localdata) { - struct obd_export *exp; int rc; + struct lustre_handle conn = { 0 }; ENTRY; - rc = class_connect(conn, src, cluuid); + rc = class_connect(&conn, src, cluuid); if (rc == 0) { - exp = class_conn2export(conn); - class_export_put(exp); + *exp = class_conn2export(&conn); } RETURN (rc); diff --git a/lustre/obdfilter/Makefile.in b/lustre/obdfilter/Makefile.in index 2a15c71..c1ebc18 100644 --- a/lustre/obdfilter/Makefile.in +++ b/lustre/obdfilter/Makefile.in @@ -4,4 +4,6 @@ obdfilter-objs := filter.o filter_io.o filter_log.o obdfilter-objs += lproc_obdfilter.o filter_lvb.o filter_capa.o obdfilter-objs += filter_io_26.o +EXTRA_DIST = $(obdfilter-objs:%.o=%.c) filter_io_26.c filter_internal.h + @INCLUDE_RULES@ diff --git a/lustre/obdfilter/autoMakefile.am b/lustre/obdfilter/autoMakefile.am index 89490fb..cfef4e9 100644 --- a/lustre/obdfilter/autoMakefile.am +++ 
b/lustre/obdfilter/autoMakefile.am @@ -39,4 +39,3 @@ modulefs_DATA = obdfilter$(KMODEXT) endif MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ -DIST_SOURCES = $(obdfilter-objs:%.o=%.c) filter_io_26.c filter_internal.h diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index d1678d7..191b3e5 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -202,12 +202,9 @@ static int filter_export_stats_init(struct obd_device *obd, struct obd_export *exp, void *client_nid) { - struct filter_export_data *fed = &exp->exp_filter_data; int rc, newnid = 0; ENTRY; - init_brw_stats(&fed->fed_brw_stats); - if (obd_uuid_equals(&exp->exp_client_uuid, &obd->obd_uuid)) /* Self-export gets no proc entry */ RETURN(0); @@ -357,12 +354,13 @@ static int filter_client_add(struct obd_device *obd, struct obd_export *exp, RETURN(0); } +struct lsd_client_data zero_lcd; /* globals are implicitly zeroed */ + static int filter_client_free(struct obd_export *exp) { struct filter_export_data *fed = &exp->exp_filter_data; struct filter_obd *filter = &exp->exp_obd->u.filter; struct obd_device *obd = exp->exp_obd; - struct lsd_client_data zero_lcd; struct lvfs_run_ctxt saved; int rc; loff_t off; @@ -399,23 +397,26 @@ static int filter_client_free(struct obd_export *exp) } if (!(exp->exp_flags & OBD_OPT_FAILOVER)) { - memset(&zero_lcd, 0, sizeof zero_lcd); + /* Don't force sync on disconnect if aborting recovery, + * or it does num_clients * num_osts. b=17194 */ + int need_sync = (!exp->exp_libclient || exp->exp_need_sync) && + !(exp->exp_flags&OBD_OPT_ABORT_RECOV); push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); rc = fsfilt_write_record(obd, filter->fo_rcvd_filp, &zero_lcd, - sizeof(zero_lcd), &off, - (!exp->exp_libclient || - exp->exp_need_sync)); + sizeof(zero_lcd), &off, 0); + + /* Make sure the server's last_transno is up to date. Do this + * after the client is freed so we know all the client's + * transactions have been committed. 
*/ if (rc == 0) - /* update server's transno */ filter_update_server_data(obd, filter->fo_rcvd_filp, - filter->fo_fsd, - !exp->exp_libclient); + filter->fo_fsd, need_sync); pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); CDEBUG(rc == 0 ? D_INFO : D_ERROR, - "zeroing out client %s at idx %u (%llu) in %s rc %d\n", + "zero out client %s at idx %u/%llu in %s %ssync rc %d\n", fed->fed_lcd->lcd_uuid, fed->fed_lr_idx, fed->fed_lr_off, - LAST_RCVD, rc); + LAST_RCVD, need_sync ? "" : "a", rc); } if (!test_and_clear_bit(fed->fed_lr_idx, filter->fo_last_rcvd_slots)) { @@ -2012,7 +2013,11 @@ int filter_common_setup(struct obd_device *obd, struct lustre_cfg* lcfg, if (rc != 0) GOTO(err_ops, rc); - LASSERT(!lvfs_check_rdonly(lvfs_sbdev(mnt->mnt_sb))); + if (lvfs_check_rdonly(lvfs_sbdev(mnt->mnt_sb))) { + CERROR("%s: Underlying device is marked as read-only. " + "Setup failed\n", obd->obd_name); + GOTO(err_ops, rc = -EROFS); + } /* failover is the default */ obd->obd_replayable = 1; @@ -2765,34 +2770,35 @@ static int filter_reconnect(const struct lu_env *env, /* nearly identical to mds_connect */ static int filter_connect(const struct lu_env *env, - struct lustre_handle *conn, struct obd_device *obd, + struct obd_export **exp, struct obd_device *obd, struct obd_uuid *cluuid, struct obd_connect_data *data, void *localdata) { struct lvfs_run_ctxt saved; - struct obd_export *exp; + struct lustre_handle conn = { 0 }; + struct obd_export *lexp; struct filter_export_data *fed; struct lsd_client_data *lcd = NULL; __u32 group; int rc; ENTRY; - if (conn == NULL || obd == NULL || cluuid == NULL) + if (exp == NULL || obd == NULL || cluuid == NULL) RETURN(-EINVAL); - rc = class_connect(conn, obd, cluuid); + rc = class_connect(&conn, obd, cluuid); if (rc) RETURN(rc); - exp = class_conn2export(conn); - LASSERT(exp != NULL); + lexp = class_conn2export(&conn); + LASSERT(lexp != NULL); - fed = &exp->exp_filter_data; + fed = &lexp->exp_filter_data; - rc = filter_connect_internal(exp, data); + rc = 
filter_connect_internal(lexp, data); if (rc) GOTO(cleanup, rc); - filter_export_stats_init(obd, exp, localdata); + filter_export_stats_init(obd, lexp, localdata); if (obd->obd_replayable) { OBD_ALLOC(lcd, sizeof(*lcd)); if (!lcd) { @@ -2802,7 +2808,7 @@ static int filter_connect(const struct lu_env *env, memcpy(lcd->lcd_uuid, cluuid, sizeof(lcd->lcd_uuid)); fed->fed_lcd = lcd; - rc = filter_client_add(obd, exp, -1); + rc = filter_client_add(obd, lexp, -1); if (rc) GOTO(cleanup, rc); } @@ -2810,7 +2816,7 @@ static int filter_connect(const struct lu_env *env, group = data->ocd_group; CWARN("%s: Received MDS connection ("LPX64"); group %d\n", - obd->obd_name, exp->exp_handle.h_cookie, group); + obd->obd_name, lexp->exp_handle.h_cookie, group); push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); rc = filter_read_groups(obd, group, 1); @@ -2828,9 +2834,10 @@ cleanup: OBD_FREE_PTR(lcd); fed->fed_lcd = NULL; } - class_disconnect(exp); + class_disconnect(lexp); + *exp = NULL; } else { - class_export_put(exp); + *exp = lexp; } RETURN(rc); @@ -3051,6 +3058,8 @@ static int filter_disconnect(struct obd_export *exp) /* Flush any remaining cancel messages out to the target */ filter_sync_llogs(obd, exp); + lquota_clearinfo(filter_quota_interface_ref, exp, exp->exp_obd); + /* Disconnect early so that clients can't keep using export */ rc = class_disconnect(exp); if (exp->exp_obd->obd_namespace != NULL) @@ -4327,9 +4336,11 @@ static int filter_set_info_async(struct obd_export *exp, __u32 keylen, obd->u.filter.fo_mdc_conn.cookie = exp->exp_handle.h_cookie; /* setup llog imports */ - LASSERT(val != NULL); + if (val != NULL) + group = (int)(*(__u32 *)val); + else + group = 0; /* default value */ - group = (int)(*(__u32 *)val); LASSERT_MDS_GROUP(group); rc = filter_setup_llog_group(exp, obd, group); if (rc) diff --git a/lustre/obdfilter/filter_io_26.c b/lustre/obdfilter/filter_io_26.c index 960b097..68b2ef4 100644 --- a/lustre/obdfilter/filter_io_26.c +++ b/lustre/obdfilter/filter_io_26.c 
@@ -80,20 +80,32 @@ static void record_start_io(struct filter_iobuf *iobuf, int rw, int size, atomic_inc(&filter->fo_r_in_flight); lprocfs_oh_tally(&filter->fo_filter_stats.hist[BRW_R_RPC_HIST], atomic_read(&filter->fo_r_in_flight)); - lprocfs_oh_tally_log2(&filter->fo_filter_stats.hist[BRW_R_DISK_IOSIZE], + lprocfs_oh_tally_log2(&filter-> + fo_filter_stats.hist[BRW_R_DISK_IOSIZE], size); - lprocfs_oh_tally(&exp->exp_filter_data.fed_brw_stats.hist[BRW_R_RPC_HIST], - atomic_read(&filter->fo_r_in_flight)); - lprocfs_oh_tally_log2(&exp->exp_filter_data.fed_brw_stats.hist[BRW_R_DISK_IOSIZE], size); + if (exp->exp_nid_stats && exp->exp_nid_stats->nid_brw_stats) { + lprocfs_oh_tally(&exp->exp_nid_stats->nid_brw_stats-> + hist[BRW_R_RPC_HIST], + atomic_read(&filter->fo_r_in_flight)); + lprocfs_oh_tally_log2(&exp->exp_nid_stats-> + nid_brw_stats->hist[BRW_R_DISK_IOSIZE], + size); + } } else { atomic_inc(&filter->fo_w_in_flight); lprocfs_oh_tally(&filter->fo_filter_stats.hist[BRW_W_RPC_HIST], atomic_read(&filter->fo_w_in_flight)); - lprocfs_oh_tally_log2(&filter->fo_filter_stats.hist[BRW_W_DISK_IOSIZE], + lprocfs_oh_tally_log2(&filter-> + fo_filter_stats.hist[BRW_W_DISK_IOSIZE], size); - lprocfs_oh_tally(&exp->exp_filter_data.fed_brw_stats.hist[BRW_W_RPC_HIST], - atomic_read(&filter->fo_w_in_flight)); - lprocfs_oh_tally_log2(&exp->exp_filter_data.fed_brw_stats.hist[BRW_W_DISK_IOSIZE], size); + if (exp->exp_nid_stats && exp->exp_nid_stats->nid_brw_stats) { + lprocfs_oh_tally(&exp->exp_nid_stats->nid_brw_stats-> + hist[BRW_W_RPC_HIST], + atomic_read(&filter->fo_r_in_flight)); + lprocfs_oh_tally_log2(&exp->exp_nid_stats-> + nid_brw_stats->hist[BRW_W_DISK_IOSIZE], + size); + } } } @@ -405,31 +417,32 @@ int filter_do_bio(struct obd_export *exp, struct inode *inode, wait_event(iobuf->dr_wait, atomic_read(&iobuf->dr_numreqs) == 0); if (rw == OBD_BRW_READ) { - lprocfs_oh_tally(&obd->u.filter.fo_filter_stats.hist[BRW_R_DIO_FRAGS], - frags); - 
lprocfs_oh_tally(&exp->exp_filter_data.fed_brw_stats.hist[BRW_R_DIO_FRAGS], + lprocfs_oh_tally(&obd->u.filter.fo_filter_stats. + hist[BRW_R_DIO_FRAGS], frags); - lprocfs_oh_tally_log2(&obd->u.filter.fo_filter_stats.hist[BRW_R_IO_TIME], + lprocfs_oh_tally_log2(&obd->u.filter. + fo_filter_stats.hist[BRW_R_IO_TIME], jiffies - start_time); - lprocfs_oh_tally_log2(&exp->exp_filter_data.fed_brw_stats.hist[BRW_R_IO_TIME], jiffies - start_time); if (exp->exp_nid_stats && exp->exp_nid_stats->nid_brw_stats) { - lprocfs_oh_tally(&exp->exp_nid_stats->nid_brw_stats->hist[BRW_R_DIO_FRAGS], + lprocfs_oh_tally(&exp->exp_nid_stats->nid_brw_stats-> + hist[BRW_R_DIO_FRAGS], frags); - lprocfs_oh_tally_log2(&exp->exp_nid_stats->nid_brw_stats->hist[BRW_R_IO_TIME], + lprocfs_oh_tally_log2(&exp->exp_nid_stats-> + nid_brw_stats->hist[BRW_R_IO_TIME], jiffies - start_time); } } else { - lprocfs_oh_tally(&obd->u.filter.fo_filter_stats.hist[BRW_W_DIO_FRAGS], - frags); - lprocfs_oh_tally(&exp->exp_filter_data.fed_brw_stats.hist[BRW_W_DIO_FRAGS], - frags); - lprocfs_oh_tally_log2(&obd->u.filter.fo_filter_stats.hist[BRW_W_IO_TIME], + lprocfs_oh_tally(&obd->u.filter.fo_filter_stats. + hist[BRW_W_DIO_FRAGS], frags); + lprocfs_oh_tally_log2(&obd->u.filter.fo_filter_stats. 
+ hist[BRW_W_IO_TIME], jiffies - start_time); - lprocfs_oh_tally_log2(&exp->exp_filter_data.fed_brw_stats.hist[BRW_W_IO_TIME], jiffies - start_time); if (exp->exp_nid_stats && exp->exp_nid_stats->nid_brw_stats) { - lprocfs_oh_tally(&exp->exp_nid_stats->nid_brw_stats->hist[BRW_W_DIO_FRAGS], + lprocfs_oh_tally(&exp->exp_nid_stats->nid_brw_stats-> + hist[BRW_W_DIO_FRAGS], frags); - lprocfs_oh_tally_log2(&exp->exp_nid_stats->nid_brw_stats->hist[BRW_W_IO_TIME], + lprocfs_oh_tally_log2(&exp->exp_nid_stats-> + nid_brw_stats->hist[BRW_W_IO_TIME], jiffies - start_time); } } diff --git a/lustre/obdfilter/lproc_obdfilter.c b/lustre/obdfilter/lproc_obdfilter.c index 5e5f1c7..7d4da23 100644 --- a/lustre/obdfilter/lproc_obdfilter.c +++ b/lustre/obdfilter/lproc_obdfilter.c @@ -381,7 +381,6 @@ void filter_tally(struct obd_export *exp, struct page **pages, int nr_pages, unsigned long *blocks, int blocks_per_page, int wr) { struct filter_obd *filter = &exp->exp_obd->u.filter; - struct filter_export_data *fed = &exp->exp_filter_data; struct page *last_page = NULL; unsigned long *last_block = NULL; unsigned long discont_pages = 0; @@ -393,8 +392,6 @@ void filter_tally(struct obd_export *exp, struct page **pages, int nr_pages, lprocfs_oh_tally_log2(&filter->fo_filter_stats.hist[BRW_R_PAGES + wr], nr_pages); - lprocfs_oh_tally_log2(&fed->fed_brw_stats.hist[BRW_R_PAGES + wr], - nr_pages); if (exp->exp_nid_stats && exp->exp_nid_stats->nid_brw_stats) lprocfs_oh_tally_log2(&exp->exp_nid_stats->nid_brw_stats-> hist[BRW_R_PAGES + wr], nr_pages); @@ -413,12 +410,8 @@ void filter_tally(struct obd_export *exp, struct page **pages, int nr_pages, lprocfs_oh_tally(&filter->fo_filter_stats.hist[BRW_R_DISCONT_PAGES +wr], discont_pages); - lprocfs_oh_tally(&fed->fed_brw_stats.hist[BRW_R_DISCONT_PAGES + wr], - discont_pages); lprocfs_oh_tally(&filter->fo_filter_stats.hist[BRW_R_DISCONT_BLOCKS+wr], discont_blocks); - lprocfs_oh_tally(&fed->fed_brw_stats.hist[BRW_R_DISCONT_BLOCKS + wr], - 
discont_blocks); if (exp->exp_nid_stats && exp->exp_nid_stats->nid_brw_stats) { lprocfs_oh_tally_log2(&exp->exp_nid_stats->nid_brw_stats-> @@ -546,30 +539,6 @@ int lproc_filter_attach_seqstat(struct obd_device *dev) &filter_brw_stats_fops, dev); } -static int filter_per_export_stats_seq_show(struct seq_file *seq, void *v) -{ - struct filter_export_data *fed = seq->private; - - brw_stats_show(seq, &fed->fed_brw_stats); - - return 0; -} - -static ssize_t filter_per_export_stats_seq_write(struct file *file, - const char *buf, size_t len, loff_t *off) -{ - struct seq_file *seq = file->private_data; - struct filter_export_data *fed = seq->private; - int i; - - for (i = 0; i < BRW_LAST; i++) - lprocfs_oh_clear(&fed->fed_brw_stats.hist[i]); - - return len; -} - -LPROC_SEQ_FOPS(filter_per_export_stats); - void lprocfs_filter_init_vars(struct lprocfs_static_vars *lvars) { lvars->module_vars = lprocfs_filter_module_vars; @@ -578,10 +547,10 @@ void lprocfs_filter_init_vars(struct lprocfs_static_vars *lvars) static int filter_per_nid_stats_seq_show(struct seq_file *seq, void *v) { - nid_stat_t *tmp = seq->private; + nid_stat_t * stat = seq->private; - if (tmp->nid_brw_stats) - brw_stats_show(seq, tmp->nid_brw_stats); + if (stat->nid_brw_stats) + brw_stats_show(seq, stat->nid_brw_stats); return 0; } @@ -590,13 +559,13 @@ static ssize_t filter_per_nid_stats_seq_write(struct file *file, const char *buf, size_t len, loff_t *off) { - struct seq_file *seq = file->private_data; - nid_stat_t *tmp = seq->private; + struct seq_file *seq = file->private_data; + nid_stat_t *stat = seq->private; int i; - if (tmp->nid_brw_stats) + if (stat->nid_brw_stats) for (i = 0; i < BRW_LAST; i++) - lprocfs_oh_clear(&tmp->nid_brw_stats->hist[i]); + lprocfs_oh_clear(&stat->nid_brw_stats->hist[i]); return len; } diff --git a/lustre/osc/Makefile.in b/lustre/osc/Makefile.in index 438ce4c..40ffa16 100644 --- a/lustre/osc/Makefile.in +++ b/lustre/osc/Makefile.in @@ -1,4 +1,6 @@ MODULES := osc osc-objs := 
osc_request.o lproc_osc.o osc_create.o osc_dev.o osc_object.o osc_page.o osc_lock.o osc_io.o +EXTRA_DIST = $(osc-objs:%.o=%.c) osc_internal.h osc_cl_internal.h + @INCLUDE_RULES@ diff --git a/lustre/osc/autoMakefile.am b/lustre/osc/autoMakefile.am index cf370ba..59c37b2 100644 --- a/lustre/osc/autoMakefile.am +++ b/lustre/osc/autoMakefile.am @@ -75,4 +75,3 @@ endif install-data-hook: $(install_data_hook) MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ -DIST_SOURCES = $(osc-objs:%.o=%.c) osc_internal.h osc_cl_internal.h diff --git a/lustre/osc/osc_lock.c b/lustre/osc/osc_lock.c index 8f2b2b5..247c044 100644 --- a/lustre/osc/osc_lock.c +++ b/lustre/osc/osc_lock.c @@ -312,6 +312,8 @@ static void osc_ast_data_put(const struct lu_env *env, struct osc_lock *olck) * * This can be optimized to not update attributes when lock is a result of a * local match. + * + * Called under lock and resource spin-locks. */ static void osc_lock_lvb_update(const struct lu_env *env, struct osc_lock *olck, int rc) @@ -344,6 +346,8 @@ static void osc_lock_lvb_update(const struct lu_env *env, struct osc_lock *olck, dlmlock = olck->ols_lock; LASSERT(dlmlock != NULL); + /* re-grab LVB from a dlm lock under DLM spin-locks. */ + *lvb = *(struct ost_lvb *)dlmlock->l_lvb_data; size = lvb->lvb_size; /* Extend KMS up to the end of this lock and no further * A lock on [x,y] means a KMS of up to y + 1 bytes! 
*/ @@ -360,7 +364,7 @@ static void osc_lock_lvb_update(const struct lu_env *env, struct osc_lock *olck, lvb->lvb_size, oinfo->loi_kms, dlmlock->l_policy_data.l_extent.end); } - ldlm_lock_allow_match(dlmlock); + ldlm_lock_allow_match_locked(dlmlock); } else if (rc == -ENAVAIL && olck->ols_glimpse) { CDEBUG(D_INODE, "glimpsed, setting rss="LPU64"; leaving" " kms="LPU64"\n", lvb->lvb_size, oinfo->loi_kms); @@ -375,6 +379,13 @@ static void osc_lock_lvb_update(const struct lu_env *env, struct osc_lock *olck, EXIT; } +/** + * Called when a lock is granted, from an upcall (when server returned a + * granted lock), or from completion AST, when server returned a blocked lock. + * + * Called under lock and resource spin-locks, that are released temporarily + * here. + */ static void osc_lock_granted(const struct lu_env *env, struct osc_lock *olck, struct ldlm_lock *dlmlock, int rc) { @@ -399,11 +410,19 @@ static void osc_lock_granted(const struct lu_env *env, struct osc_lock *olck, * tell upper layers the extent of the lock that was actually * granted */ - cl_lock_modify(env, lock, descr); LINVRNT(osc_lock_invariant(olck)); olck->ols_state = OLS_GRANTED; osc_lock_lvb_update(env, olck, rc); + + /* release DLM spin-locks to allow cl_lock_{modify,signal}() + * to take a semaphore on a parent lock. This is safe, because + * spin-locks are needed to protect consistency of + * dlmlock->l_*_mode and LVB, and we have finished processing + * them. */ + unlock_res_and_lock(dlmlock); + cl_lock_modify(env, lock, descr); cl_lock_signal(env, lock); + lock_res_and_lock(dlmlock); } EXIT; } @@ -424,7 +443,6 @@ static void osc_lock_upcall0(const struct lu_env *env, struct osc_lock *olck) LASSERT(olck->ols_lock == NULL); olck->ols_lock = dlmlock; spin_unlock(&osc_ast_guard); - unlock_res_and_lock(dlmlock); /* * Lock might be not yet granted. 
In this case, completion ast @@ -433,6 +451,8 @@ static void osc_lock_upcall0(const struct lu_env *env, struct osc_lock *olck) */ if (dlmlock->l_granted_mode == dlmlock->l_req_mode) osc_lock_granted(env, olck, dlmlock, 0); + unlock_res_and_lock(dlmlock); + /* * osc_enqueue_interpret() decrefs asynchronous locks, counter * this. @@ -751,6 +771,7 @@ static int osc_ldlm_completion_ast(struct ldlm_lock *dlmlock, * to lock->l_lvb_data, store it in osc_lock. */ LASSERT(dlmlock->l_lvb_data != NULL); + lock_res_and_lock(dlmlock); olck->ols_lvb = *(struct ost_lvb *)dlmlock->l_lvb_data; if (olck->ols_lock == NULL) /* @@ -767,6 +788,7 @@ static int osc_ldlm_completion_ast(struct ldlm_lock *dlmlock, osc_lock_granted(env, olck, dlmlock, dlmrc); if (dlmrc != 0) cl_lock_error(env, lock, dlmrc); + unlock_res_and_lock(dlmlock); cl_lock_mutex_put(env, lock); osc_ast_data_put(env, olck); result = 0; @@ -1038,6 +1060,7 @@ static void osc_lock_to_lockless(const struct lu_env *env, slice->cls_ops = &osc_lock_lockless_ops; } } + LASSERT(ergo(ols->ols_glimpse, !osc_lock_is_lockless(ols))); } /** @@ -1273,7 +1296,7 @@ static int osc_lock_enqueue(const struct lu_env *env, ols->ols_state = OLS_GRANTED; } } - + LASSERT(ergo(ols->ols_glimpse, !osc_lock_is_lockless(ols))); RETURN(result); } diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 1812828..1f7e465 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -1175,7 +1175,7 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa, /* size[REQ_REC_OFF] still sizeof (*body) */ if (opc == OST_WRITE) { if (unlikely(cli->cl_checksum) && - req->rq_flvr.sf_bulk_hash == BULK_HASH_ALG_NULL) { + !sptlrpc_flavor_has_bulk(&req->rq_flvr)) { /* store cl_cksum_type in a local variable since * it can be changed via lprocfs */ cksum_type_t cksum_type = cli->cl_cksum_type; @@ -1204,7 +1204,7 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa, sizeof(__u32) * 
niocount); } else { if (unlikely(cli->cl_checksum) && - req->rq_flvr.sf_bulk_hash == BULK_HASH_ALG_NULL) { + !sptlrpc_flavor_has_bulk(&req->rq_flvr)) { if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) body->oa.o_flags = 0; body->oa.o_flags |= cksum_type_pack(cli->cl_cksum_type); @@ -1331,6 +1331,9 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) } LASSERT(req->rq_bulk->bd_nob == aa->aa_requested_nob); + if (sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk)) + RETURN(-EAGAIN); + if ((aa->aa_oa->o_valid & OBD_MD_FLCKSUM) && client_cksum && check_write_checksum(&body->oa, peer, client_cksum, body->oa.o_cksum, aa->aa_requested_nob, @@ -1338,15 +1341,17 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) cksum_type_unpack(aa->aa_oa->o_flags))) RETURN(-EAGAIN); - if (sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk)) - RETURN(-EAGAIN); - rc = check_write_rcs(req, aa->aa_requested_nob,aa->aa_nio_count, aa->aa_page_count, aa->aa_ppga); GOTO(out, rc); } /* The rest of this function executes only for OST_READs */ + + rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, rc); + if (rc < 0) + GOTO(out, rc); + if (rc > aa->aa_requested_nob) { CERROR("Unexpected rc %d (%d requested)\n", rc, aa->aa_requested_nob); @@ -1362,10 +1367,6 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) if (rc < aa->aa_requested_nob) handle_short_read(rc, aa->aa_page_count, aa->aa_ppga); - if (sptlrpc_cli_unwrap_bulk_read(req, rc, aa->aa_page_count, - aa->aa_ppga)) - GOTO(out, rc = -EAGAIN); - if (body->oa.o_valid & OBD_MD_FLCKSUM) { static int cksum_counter; __u32 server_cksum = body->oa.o_cksum; diff --git a/lustre/osd/osd_handler.c b/lustre/osd/osd_handler.c index 97452a6..363b81a 100644 --- a/lustre/osd/osd_handler.c +++ b/lustre/osd/osd_handler.c @@ -3033,9 +3033,10 @@ static void osd_it_ea_fini(const struct lu_env *env, struct dt_it *di) { struct osd_it_ea *it = (struct osd_it_ea *)di; struct osd_object *obj = it->oie_obj; - + struct inode 
*inode = obj->oo_inode; ENTRY; + it->oie_file.f_op->release(inode, &it->oie_file); lu_object_put(env, &obj->oo_dt.do_lu); EXIT; } @@ -3088,8 +3089,6 @@ static int osd_ldiskfs_filldir(char *buf, const char *name, int namelen, { struct osd_it_ea *it = (struct osd_it_ea *)buf; struct dirent64 *dirent = &it->oie_dirent64; - int reclen = LDISKFS_DIR_REC_LEN(namelen); - ENTRY; if (it->oie_namelen) @@ -3101,8 +3100,6 @@ static int osd_ldiskfs_filldir(char *buf, const char *name, int namelen, strncpy(dirent->d_name, name, LDISKFS_NAME_LEN); dirent->d_name[namelen] = 0; dirent->d_ino = ino; - dirent->d_off = offset; - dirent->d_reclen = reclen; it->oie_namelen = namelen; it->oie_curr_pos = offset; @@ -3134,7 +3131,7 @@ int osd_ldiskfs_it_fill(const struct dt_it *di) it->oie_next_pos = it->oie_file.f_pos; - if(!result && it->oie_namelen == 0) + if (it->oie_namelen == 0) result = -EIO; RETURN(result); @@ -3232,6 +3229,8 @@ static struct dt_rec *osd_it_ea_rec(const struct lu_env *env, } rc = osd_ea_fid_get(env, dentry, (struct dt_rec*) rec); + if (rc != 0) + rec = ERR_PTR(rc); iput(inode); RETURN((struct dt_rec *)rec); @@ -3270,7 +3269,7 @@ static int osd_it_ea_load(const struct lu_env *env, int rc; ENTRY; - it->oie_curr_pos = it->oie_next_pos = hash; + it->oie_curr_pos = hash; rc = osd_ldiskfs_it_fill(di); if (rc == 0) diff --git a/lustre/ost/Makefile.in b/lustre/ost/Makefile.in index 99002e4..6bd8be3 100644 --- a/lustre/ost/Makefile.in +++ b/lustre/ost/Makefile.in @@ -1,4 +1,6 @@ MODULES := ost ost-objs := ost_handler.o lproc_ost.o +EXTRA_DIST = $(ost-objs:%.o=%.c) ost_internal.h + @INCLUDE_RULES@ diff --git a/lustre/ost/autoMakefile.am b/lustre/ost/autoMakefile.am index 8db3fe4..907a0e0 100644 --- a/lustre/ost/autoMakefile.am +++ b/lustre/ost/autoMakefile.am @@ -39,4 +39,3 @@ modulefs_DATA = ost$(KMODEXT) endif MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ -DIST_SOURCES = $(ost-objs:%.o=%.c) ost_internal.h diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index 
28430a4..cb520bb 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -751,9 +751,9 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti) if (exp->exp_failed) rc = -ENOTCONN; else { - sptlrpc_svc_wrap_bulk(req, desc); - - rc = ptlrpc_start_bulk_transfer(desc); + rc = sptlrpc_svc_wrap_bulk(req, desc); + if (rc == 0) + rc = ptlrpc_start_bulk_transfer(desc); } if (rc == 0) { @@ -978,6 +978,10 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) local_nb[i].offset & ~CFS_PAGE_MASK, local_nb[i].len); + rc = sptlrpc_svc_prep_bulk(req, desc); + if (rc != 0) + GOTO(out_lock, rc); + /* Check if client was evicted while we were doing i/o before touching network */ if (desc->bd_export->exp_failed) @@ -1012,23 +1016,18 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) DEBUG_REQ(D_ERROR, req, "Eviction on bulk GET"); rc = -ENOTCONN; ptlrpc_abort_bulk(desc); - } else if (!desc->bd_success || - desc->bd_nob_transferred != desc->bd_nob) { - DEBUG_REQ(D_ERROR, req, "%s bulk GET %d(%d)", - desc->bd_success ? - "truncated" : "network error on", - desc->bd_nob_transferred, desc->bd_nob); + } else if (!desc->bd_success) { + DEBUG_REQ(D_ERROR, req, "network error on bulk GET"); /* XXX should this be a different errno? 
*/ rc = -ETIMEDOUT; + } else { + rc = sptlrpc_svc_unwrap_bulk(req, desc); } } else { DEBUG_REQ(D_ERROR, req, "ptlrpc_bulk_get failed: rc %d", rc); } no_reply = rc != 0; - if (rc == 0) - sptlrpc_svc_unwrap_bulk(req, desc); - repbody = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*repbody)); memcpy(&repbody->oa, &body->oa, sizeof(repbody->oa)); @@ -1606,6 +1605,11 @@ static int ost_rw_hpreq_lock_match(struct ptlrpc_request *req, end = (nb[ioo->ioo_bufcnt - 1].offset + nb[ioo->ioo_bufcnt - 1].len - 1) | ~CFS_PAGE_MASK; + LASSERT(lock->l_resource != NULL); + if (!osc_res_name_eq(ioo->ioo_id, ioo->ioo_gr, + &lock->l_resource->lr_name)) + RETURN(0); + if (!(lock->l_granted_mode & mode)) RETURN(0); diff --git a/lustre/ptlrpc/Makefile.in b/lustre/ptlrpc/Makefile.in index aee4786..3660c7f 100644 --- a/lustre/ptlrpc/Makefile.in +++ b/lustre/ptlrpc/Makefile.in @@ -30,6 +30,7 @@ l_lock.c: @LUSTRE@/ldlm/l_lock.c interval_tree.c: @LUSTRE@/ldlm/interval_tree.c ln -sf $< $@ +EXTRA_DIST = $(ptlrpc_objs:.o=.c) ptlrpc_internal.h EXTRA_PRE_CFLAGS := -I@LUSTRE@/ldlm @INCLUDE_RULES@ diff --git a/lustre/ptlrpc/autoMakefile.am b/lustre/ptlrpc/autoMakefile.am index c0d8c41..35577b5 100644 --- a/lustre/ptlrpc/autoMakefile.am +++ b/lustre/ptlrpc/autoMakefile.am @@ -117,5 +117,4 @@ SUBDIRS = gss endif install-data-hook: $(install_data_hook) -DIST_SOURCES = $(ptlrpc_objs:.o=.c) ptlrpc_internal.h MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ ldlm_*.c l_lock.c interval_tree.c diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 75f365a..1188d41 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -1427,7 +1427,7 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) spin_unlock(&imp->imp_lock); set->set_remaining--; - cfs_waitq_signal(&imp->imp_recovery_waitq); + cfs_waitq_broadcast(&imp->imp_recovery_waitq); } /* If we hit an error, we want to recover promptly. 
*/ @@ -2299,7 +2299,7 @@ after_send: LASSERT(!req->rq_receiving_reply); ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); - cfs_waitq_signal(&imp->imp_recovery_waitq); + cfs_waitq_broadcast(&imp->imp_recovery_waitq); RETURN(rc); } diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c index 2d20ab7..92cdd7b 100644 --- a/lustre/ptlrpc/events.c +++ b/lustre/ptlrpc/events.c @@ -194,7 +194,9 @@ void client_bulk_callback (lnet_event_t *ev) desc->bd_sender = ev->sender; } - sptlrpc_enc_pool_put_pages(desc); + /* release the encrypted pages for write */ + if (desc->bd_req->rq_bulk_write) + sptlrpc_enc_pool_put_pages(desc); /* NB don't unlock till after wakeup; desc can disappear under us * otherwise */ diff --git a/lustre/ptlrpc/gss/gss_api.h b/lustre/ptlrpc/gss/gss_api.h index 11b1c37..3b20c99 100644 --- a/lustre/ptlrpc/gss/gss_api.h +++ b/lustre/ptlrpc/gss/gss_api.h @@ -51,11 +51,15 @@ __u32 lgss_get_mic( struct gss_ctx *ctx, int msgcnt, rawobj_t *msgs, + int iovcnt, + lnet_kiov_t *iovs, rawobj_t *mic_token); __u32 lgss_verify_mic( struct gss_ctx *ctx, int msgcnt, rawobj_t *msgs, + int iovcnt, + lnet_kiov_t *iovs, rawobj_t *mic_token); __u32 lgss_wrap( struct gss_ctx *ctx, @@ -68,12 +72,18 @@ __u32 lgss_unwrap( rawobj_t *gsshdr, rawobj_t *token, rawobj_t *out_msg); -__u32 lgss_plain_encrypt( - struct gss_ctx *ctx, - int decrypt, - int length, - void *in_buf, - void *out_buf); +__u32 lgss_prep_bulk( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc); +__u32 lgss_wrap_bulk( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, + int adj_nob); +__u32 lgss_unwrap_bulk( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token); __u32 lgss_delete_sec_context( struct gss_ctx **ctx); int lgss_display( @@ -115,11 +125,15 @@ struct gss_api_ops { struct gss_ctx *ctx, int msgcnt, rawobj_t *msgs, + int iovcnt, + lnet_kiov_t *iovs, rawobj_t *mic_token); __u32 (*gss_verify_mic)( struct gss_ctx *ctx, int msgcnt, rawobj_t *msgs, + int 
iovcnt, + lnet_kiov_t *iovs, rawobj_t *mic_token); __u32 (*gss_wrap)( struct gss_ctx *ctx, @@ -132,12 +146,18 @@ struct gss_api_ops { rawobj_t *gsshdr, rawobj_t *token, rawobj_t *out_msg); - __u32 (*gss_plain_encrypt)( - struct gss_ctx *ctx, - int decrypt, - int length, - void *in_buf, - void *out_buf); + __u32 (*gss_prep_bulk)( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc); + __u32 (*gss_wrap_bulk)( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, + int adj_nob); + __u32 (*gss_unwrap_bulk)( + struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token); void (*gss_delete_sec_context)( void *ctx); int (*gss_display)( diff --git a/lustre/ptlrpc/gss/gss_bulk.c b/lustre/ptlrpc/gss/gss_bulk.c index 03fd0ce..f8723f5 100644 --- a/lustre/ptlrpc/gss/gss_bulk.c +++ b/lustre/ptlrpc/gss/gss_bulk.c @@ -67,391 +67,26 @@ #include "gss_internal.h" #include "gss_api.h" -static __u8 zero_iv[CIPHER_MAX_BLKSIZE] = { 0, }; - -static void buf_to_sl(struct scatterlist *sl, - void *buf, unsigned int len) -{ - sl->page = virt_to_page(buf); - sl->offset = offset_in_page(buf); - sl->length = len; -} - -/* - * CTS CBC encryption: - * 1. X(n-1) = P(n-1) - * 2. E(n-1) = Encrypt(K, X(n-1)) - * 3. C(n) = HEAD(E(n-1)) - * 4. P = P(n) | 0 - * 5. D(n) = E(n-1) XOR P - * 6. C(n-1) = Encrypt(K, D(n)) - * - * CTS encryption using standard CBC interface: - * 1. pad the last partial block with 0. - * 2. do CBC encryption. - * 3. swap the last two ciphertext blocks. - * 4. truncate to original plaintext size. 
- */ -static int cbc_cts_encrypt(struct ll_crypto_cipher *tfm, - struct scatterlist *sld, - struct scatterlist *sls) -{ - struct scatterlist slst, sldt; - struct blkcipher_desc desc; - void *data; - __u8 sbuf[CIPHER_MAX_BLKSIZE]; - __u8 dbuf[CIPHER_MAX_BLKSIZE]; - unsigned int blksize, blks, tail; - int rc; - - blksize = ll_crypto_blkcipher_blocksize(tfm); - blks = sls->length / blksize; - tail = sls->length % blksize; - LASSERT(blks > 0 && tail > 0); - - /* pad tail block with 0, copy to sbuf */ - data = cfs_kmap(sls->page); - memcpy(sbuf, data + sls->offset + blks * blksize, tail); - memset(sbuf + tail, 0, blksize - tail); - cfs_kunmap(sls->page); - - buf_to_sl(&slst, sbuf, blksize); - buf_to_sl(&sldt, dbuf, blksize); - desc.tfm = tfm; - desc.flags = 0; - - /* encrypt head */ - rc = ll_crypto_blkcipher_encrypt(&desc, sld, sls, sls->length - tail); - if (unlikely(rc)) { - CERROR("encrypt head (%u) data: %d\n", sls->length - tail, rc); - return rc; - } - /* encrypt tail */ - rc = ll_crypto_blkcipher_encrypt(&desc, &sldt, &slst, blksize); - if (unlikely(rc)) { - CERROR("encrypt tail (%u) data: %d\n", slst.length, rc); - return rc; - } - - /* swab C(n) and C(n-1), if n == 1, then C(n-1) is the IV */ - data = cfs_kmap(sld->page); - - memcpy(data + sld->offset + blks * blksize, - data + sld->offset + (blks - 1) * blksize, tail); - memcpy(data + sld->offset + (blks - 1) * blksize, dbuf, blksize); - cfs_kunmap(sld->page); - - return 0; -} - -/* - * CTS CBC decryption: - * 1. D(n) = Decrypt(K, C(n-1)) - * 2. C = C(n) | 0 - * 3. X(n) = D(n) XOR C - * 4. P(n) = HEAD(X(n)) - * 5. E(n-1) = C(n) | TAIL(X(n)) - * 6. X(n-1) = Decrypt(K, E(n-1)) - * 7. P(n-1) = X(n-1) XOR C(n-2) - * - * CTS decryption using standard CBC interface: - * 1. D(n) = Decrypt(K, C(n-1)) - * 2. C(n) = C(n) | TAIL(D(n)) - * 3. swap the last two ciphertext blocks. - * 4. do CBC decryption. - * 5. truncate to original ciphertext size. 
- */ -static int cbc_cts_decrypt(struct ll_crypto_cipher *tfm, - struct scatterlist *sld, - struct scatterlist *sls) -{ - struct blkcipher_desc desc; - struct scatterlist slst, sldt; - void *data; - __u8 sbuf[CIPHER_MAX_BLKSIZE]; - __u8 dbuf[CIPHER_MAX_BLKSIZE]; - unsigned int blksize, blks, tail; - int rc; - - blksize = ll_crypto_blkcipher_blocksize(tfm); - blks = sls->length / blksize; - tail = sls->length % blksize; - LASSERT(blks > 0 && tail > 0); - - /* save current IV, and set IV to zero */ - ll_crypto_blkcipher_get_iv(tfm, sbuf, blksize); - ll_crypto_blkcipher_set_iv(tfm, zero_iv, blksize); - - /* D(n) = Decrypt(K, C(n-1)) */ - slst = *sls; - slst.offset += (blks - 1) * blksize; - slst.length = blksize; - - buf_to_sl(&sldt, dbuf, blksize); - desc.tfm = tfm; - desc.flags = 0; - - rc = ll_crypto_blkcipher_decrypt(&desc, &sldt, &slst, blksize); - if (unlikely(rc)) { - CERROR("decrypt C(n-1) (%u): %d\n", slst.length, rc); - return rc; - } - - /* restore IV */ - ll_crypto_blkcipher_set_iv(tfm, sbuf, blksize); - - data = cfs_kmap(sls->page); - /* C(n) = C(n) | TAIL(D(n)) */ - memcpy(dbuf, data + sls->offset + blks * blksize, tail); - /* swab C(n) and C(n-1) */ - memcpy(sbuf, data + sls->offset + (blks - 1) * blksize, blksize); - memcpy(data + sls->offset + (blks - 1) * blksize, dbuf, blksize); - cfs_kunmap(sls->page); - - /* do cbc decrypt */ - buf_to_sl(&slst, sbuf, blksize); - buf_to_sl(&sldt, dbuf, blksize); - - /* decrypt head */ - rc = ll_crypto_blkcipher_decrypt(&desc, sld, sls, sls->length - tail); - if (unlikely(rc)) { - CERROR("decrypt head (%u) data: %d\n", sls->length - tail, rc); - return rc; - } - /* decrypt tail */ - rc = ll_crypto_blkcipher_decrypt(&desc, &sldt, &slst, blksize); - if (unlikely(rc)) { - CERROR("decrypt tail (%u) data: %d\n", slst.length, rc); - return rc; - } - - /* truncate to original ciphertext size */ - data = cfs_kmap(sld->page); - memcpy(data + sld->offset + blks * blksize, dbuf, tail); - cfs_kunmap(sld->page); - - return 0; -} 
- -static inline int do_cts_tfm(struct ll_crypto_cipher *tfm, - int encrypt, - struct scatterlist *sld, - struct scatterlist *sls) -{ -#ifndef HAVE_ASYNC_BLOCK_CIPHER - LASSERT(tfm->crt_cipher.cit_mode == CRYPTO_TFM_MODE_CBC); -#endif - - if (encrypt) - return cbc_cts_encrypt(tfm, sld, sls); - else - return cbc_cts_decrypt(tfm, sld, sls); -} - -/* - * normal encrypt/decrypt of data of even blocksize - */ -static inline int do_cipher_tfm(struct ll_crypto_cipher *tfm, - int encrypt, - struct scatterlist *sld, - struct scatterlist *sls) -{ - struct blkcipher_desc desc; - desc.tfm = tfm; - desc.flags = 0; - if (encrypt) - return ll_crypto_blkcipher_encrypt(&desc, sld, sls, sls->length); - else - return ll_crypto_blkcipher_decrypt(&desc, sld, sls, sls->length); -} - -static struct ll_crypto_cipher *get_stream_cipher(__u8 *key, unsigned int keylen) -{ - const struct sptlrpc_ciph_type *ct; - struct ll_crypto_cipher *tfm; - int rc; - - /* using ARC4, the only stream cipher in linux for now */ - ct = sptlrpc_get_ciph_type(BULK_CIPH_ALG_ARC4); - LASSERT(ct); - - tfm = ll_crypto_alloc_blkcipher(ct->sct_tfm_name, 0, 0); - if (tfm == NULL) { - CERROR("Failed to allocate stream TFM %s\n", ct->sct_name); - return NULL; - } - LASSERT(ll_crypto_blkcipher_blocksize(tfm)); - - if (keylen > ct->sct_keysize) - keylen = ct->sct_keysize; - - LASSERT(keylen >= crypto_tfm_alg_min_keysize(tfm)); - LASSERT(keylen <= crypto_tfm_alg_max_keysize(tfm)); - - rc = ll_crypto_blkcipher_setkey(tfm, key, keylen); - if (rc) { - CERROR("Failed to set key for TFM %s: %d\n", ct->sct_name, rc); - ll_crypto_free_blkcipher(tfm); - return NULL; - } - - return tfm; -} - -static int do_bulk_privacy(struct gss_ctx *gctx, - struct ptlrpc_bulk_desc *desc, - int encrypt, __u32 alg, - struct ptlrpc_bulk_sec_desc *bsd) -{ - const struct sptlrpc_ciph_type *ct = sptlrpc_get_ciph_type(alg); - struct ll_crypto_cipher *tfm; - struct ll_crypto_cipher *stfm = NULL; /* backup stream cipher */ - struct scatterlist sls, sld, 
*sldp; - unsigned int blksize, keygen_size; - int i, rc; - __u8 key[CIPHER_MAX_KEYSIZE]; - - LASSERT(ct); - - if (encrypt) - bsd->bsd_ciph_alg = BULK_CIPH_ALG_NULL; - - if (alg == BULK_CIPH_ALG_NULL) - return 0; - - if (desc->bd_iov_count <= 0) { - if (encrypt) - bsd->bsd_ciph_alg = alg; - return 0; - } - - tfm = ll_crypto_alloc_blkcipher(ct->sct_tfm_name, 0, 0 ); - if (tfm == NULL) { - CERROR("Failed to allocate TFM %s\n", ct->sct_name); - return -ENOMEM; - } - blksize = ll_crypto_blkcipher_blocksize(tfm); - - LASSERT(crypto_tfm_alg_max_keysize(tfm) >= ct->sct_keysize); - LASSERT(crypto_tfm_alg_min_keysize(tfm) <= ct->sct_keysize); - LASSERT(ct->sct_ivsize == 0 || - ll_crypto_blkcipher_ivsize(tfm) == ct->sct_ivsize); - LASSERT(ct->sct_keysize <= CIPHER_MAX_KEYSIZE); - LASSERT(blksize <= CIPHER_MAX_BLKSIZE); - - /* generate ramdom key seed and compute the secret key based on it. - * note determined by algorithm which lgss_plain_encrypt use, it - * might require the key size be its (blocksize * n). 
so here for - * simplicity, we force it's be n * MAX_BLKSIZE by padding 0 */ - keygen_size = (ct->sct_keysize + CIPHER_MAX_BLKSIZE - 1) & - ~(CIPHER_MAX_BLKSIZE - 1); - if (encrypt) { - get_random_bytes(bsd->bsd_key, ct->sct_keysize); - if (ct->sct_keysize < keygen_size) - memset(bsd->bsd_key + ct->sct_keysize, 0, - keygen_size - ct->sct_keysize); - } - - rc = lgss_plain_encrypt(gctx, 0, keygen_size, bsd->bsd_key, key); - if (rc) { - CERROR("failed to compute secret key: %d\n", rc); - goto out; - } - - rc = ll_crypto_blkcipher_setkey(tfm, key, ct->sct_keysize); - if (rc) { - CERROR("Failed to set key for TFM %s: %d\n", ct->sct_name, rc); - goto out; - } - - /* stream cipher doesn't need iv */ - if (blksize > 1) - ll_crypto_blkcipher_set_iv(tfm, zero_iv, blksize); - - for (i = 0; i < desc->bd_iov_count; i++) { - sls.page = desc->bd_iov[i].kiov_page; - sls.offset = desc->bd_iov[i].kiov_offset; - sls.length = desc->bd_iov[i].kiov_len; - - if (unlikely(sls.length == 0)) { - CWARN("page %d with 0 length data?\n", i); - continue; - } - - if (unlikely(sls.offset % blksize)) { - CERROR("page %d with odd offset %u, TFM %s\n", - i, sls.offset, ct->sct_name); - rc = -EINVAL; - goto out; - } - - if (desc->bd_enc_pages) { - sld.page = desc->bd_enc_pages[i]; - sld.offset = desc->bd_iov[i].kiov_offset; - sld.length = desc->bd_iov[i].kiov_len; - - sldp = &sld; - } else { - sldp = &sls; - } - - if (likely(sls.length % blksize == 0)) { - /* data length is n * blocksize, do the normal tfm */ - rc = do_cipher_tfm(tfm, encrypt, sldp, &sls); - } else if (sls.length < blksize) { - /* odd data length, and smaller than 1 block, CTS - * doesn't work in this case because it requires - * transfer a modified IV to peer. 
here we use a - * "backup" stream cipher to do the tfm */ - if (stfm == NULL) { - stfm = get_stream_cipher(key, ct->sct_keysize); - if (tfm == NULL) { - rc = -ENOMEM; - goto out; - } - } - rc = do_cipher_tfm(stfm, encrypt, sldp, &sls); - } else { - /* odd data length but > 1 block, do CTS tfm */ - rc = do_cts_tfm(tfm, encrypt, sldp, &sls); - } - - if (unlikely(rc)) { - CERROR("error %s page %d/%d: %d\n", - encrypt ? "encrypt" : "decrypt", - i + 1, desc->bd_iov_count, rc); - goto out; - } - - if (desc->bd_enc_pages) - desc->bd_iov[i].kiov_page = desc->bd_enc_pages[i]; - } - - if (encrypt) - bsd->bsd_ciph_alg = alg; - -out: - if (stfm) - ll_crypto_free_blkcipher(stfm); - - ll_crypto_free_blkcipher(tfm); - return rc; -} - int gss_cli_ctx_wrap_bulk(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req, struct ptlrpc_bulk_desc *desc) { struct gss_cli_ctx *gctx; struct lustre_msg *msg; - struct ptlrpc_bulk_sec_desc *bsdr; - int offset, rc; + struct ptlrpc_bulk_sec_desc *bsd; + rawobj_t token; + __u32 maj; + int offset; + int rc; ENTRY; LASSERT(req->rq_pack_bulk); LASSERT(req->rq_bulk_read || req->rq_bulk_write); - switch (RPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { + gctx = container_of(ctx, struct gss_cli_ctx, gc_base); + LASSERT(gctx->gc_mechctx); + + switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { case SPTLRPC_SVC_NULL: LASSERT(req->rq_reqbuf->lm_bufcount >= 3); msg = req->rq_reqbuf; @@ -472,42 +107,68 @@ int gss_cli_ctx_wrap_bulk(struct ptlrpc_cli_ctx *ctx, LBUG(); } - /* make checksum */ - rc = bulk_csum_cli_request(desc, req->rq_bulk_read, - req->rq_flvr.sf_bulk_hash, msg, offset); - if (rc) { - CERROR("client bulk %s: failed to generate checksum: %d\n", - req->rq_bulk_read ? 
"read" : "write", rc); - RETURN(rc); - } + bsd = lustre_msg_buf(msg, offset, sizeof(*bsd)); + bsd->bsd_version = 0; + bsd->bsd_flags = 0; + bsd->bsd_type = SPTLRPC_BULK_DEFAULT; + bsd->bsd_svc = SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc); - if (req->rq_flvr.sf_bulk_ciph == BULK_CIPH_ALG_NULL) + if (bsd->bsd_svc == SPTLRPC_BULK_SVC_NULL) RETURN(0); - /* previous bulk_csum_cli_request() has verified bsdr is good */ - bsdr = lustre_msg_buf(msg, offset, 0); + LASSERT(bsd->bsd_svc == SPTLRPC_BULK_SVC_INTG || + bsd->bsd_svc == SPTLRPC_BULK_SVC_PRIV); if (req->rq_bulk_read) { - bsdr->bsd_ciph_alg = req->rq_flvr.sf_bulk_ciph; - RETURN(0); - } - - /* it turn out to be bulk write */ - rc = sptlrpc_enc_pool_get_pages(desc); - if (rc) { - CERROR("bulk write: failed to allocate encryption pages\n"); - RETURN(rc); - } + /* + * bulk read: prepare receiving pages only for privacy mode. + */ + if (bsd->bsd_svc == SPTLRPC_BULK_SVC_PRIV) + return gss_cli_prep_bulk(req, desc); + } else { + /* + * bulk write: sign or encrypt bulk pages. 
+ */ + bsd->bsd_nob = desc->bd_nob; + + if (bsd->bsd_svc == SPTLRPC_BULK_SVC_INTG) { + /* integrity mode */ + token.data = bsd->bsd_data; + token.len = lustre_msg_buflen(msg, offset) - + sizeof(*bsd); + + maj = lgss_get_mic(gctx->gc_mechctx, 0, NULL, + desc->bd_iov_count, desc->bd_iov, + &token); + if (maj != GSS_S_COMPLETE) { + CWARN("failed to sign bulk data: %x\n", maj); + RETURN(-EACCES); + } + } else { + /* privacy mode */ + if (desc->bd_iov_count == 0) + RETURN(0); + + rc = sptlrpc_enc_pool_get_pages(desc); + if (rc) { + CERROR("bulk write: failed to allocate " + "encryption pages: %d\n", rc); + RETURN(rc); + } - gctx = container_of(ctx, struct gss_cli_ctx, gc_base); - LASSERT(gctx->gc_mechctx); + token.data = bsd->bsd_data; + token.len = lustre_msg_buflen(msg, offset) - + sizeof(*bsd); - rc = do_bulk_privacy(gctx->gc_mechctx, desc, 1, - req->rq_flvr.sf_bulk_ciph, bsdr); - if (rc) - CERROR("bulk write: client failed to encrypt pages\n"); + maj = lgss_wrap_bulk(gctx->gc_mechctx, desc, &token, 0); + if (maj != GSS_S_COMPLETE) { + CWARN("fail to encrypt bulk data: %x\n", maj); + RETURN(-EACCES); + } + } + } - RETURN(rc); + RETURN(0); } int gss_cli_ctx_unwrap_bulk(struct ptlrpc_cli_ctx *ctx, @@ -517,13 +178,15 @@ int gss_cli_ctx_unwrap_bulk(struct ptlrpc_cli_ctx *ctx, struct gss_cli_ctx *gctx; struct lustre_msg *rmsg, *vmsg; struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; - int roff, voff, rc; + rawobj_t token; + __u32 maj; + int roff, voff; ENTRY; LASSERT(req->rq_pack_bulk); LASSERT(req->rq_bulk_read || req->rq_bulk_write); - switch (RPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { + switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { case SPTLRPC_SVC_NULL: vmsg = req->rq_repdata; voff = vmsg->lm_bufcount - 1; @@ -556,34 +219,158 @@ int gss_cli_ctx_unwrap_bulk(struct ptlrpc_cli_ctx *ctx, LBUG(); } - if (req->rq_bulk_read) { - bsdr = lustre_msg_buf(rmsg, roff, 0); - if (bsdr->bsd_ciph_alg == BULK_CIPH_ALG_NULL) - goto verify_csum; - - bsdv = lustre_msg_buf(vmsg, voff, 0); - if 
(bsdr->bsd_ciph_alg != bsdv->bsd_ciph_alg) { - CERROR("bulk read: cipher algorithm mismatch: client " - "request %s but server reply with %s. try to " - "use the new one for decryption\n", - sptlrpc_get_ciph_name(bsdr->bsd_ciph_alg), - sptlrpc_get_ciph_name(bsdv->bsd_ciph_alg)); + bsdr = lustre_msg_buf(rmsg, roff, sizeof(*bsdr)); + bsdv = lustre_msg_buf(vmsg, voff, sizeof(*bsdv)); + LASSERT(bsdr && bsdv); + + if (bsdr->bsd_version != bsdv->bsd_version || + bsdr->bsd_type != bsdv->bsd_type || + bsdr->bsd_svc != bsdv->bsd_svc) { + CERROR("bulk security descriptor mismatch: " + "(%u,%u,%u) != (%u,%u,%u)\n", + bsdr->bsd_version, bsdr->bsd_type, bsdr->bsd_svc, + bsdv->bsd_version, bsdv->bsd_type, bsdv->bsd_svc); + RETURN(-EPROTO); + } + + LASSERT(bsdv->bsd_svc == SPTLRPC_BULK_SVC_NULL || + bsdv->bsd_svc == SPTLRPC_BULK_SVC_INTG || + bsdv->bsd_svc == SPTLRPC_BULK_SVC_PRIV); + + /* + * in privacy mode if return success, make sure bd_nob_transferred + * is the actual size of the clear text, otherwise upper layer + * may be surprised. + */ + if (req->rq_bulk_write) { + if (bsdv->bsd_flags & BSD_FL_ERR) { + CERROR("server reported bulk i/o failure\n"); + RETURN(-EIO); } + if (bsdv->bsd_svc == SPTLRPC_BULK_SVC_PRIV) + desc->bd_nob_transferred = desc->bd_nob; + } else { + /* + * bulk read, upon return success, bd_nob_transferred is + * the size of plain text actually received. 
+ */ gctx = container_of(ctx, struct gss_cli_ctx, gc_base); LASSERT(gctx->gc_mechctx); - rc = do_bulk_privacy(gctx->gc_mechctx, desc, 0, - bsdv->bsd_ciph_alg, bsdv); - if (rc) { - CERROR("bulk read: client failed to decrypt data\n"); - RETURN(rc); + if (bsdv->bsd_svc == SPTLRPC_BULK_SVC_INTG) { + int i, nob; + + /* fix the actual data size */ + for (i = 0, nob = 0; i < desc->bd_iov_count; i++) { + if (desc->bd_iov[i].kiov_len + nob > + desc->bd_nob_transferred) { + desc->bd_iov[i].kiov_len = + desc->bd_nob_transferred - nob; + } + nob += desc->bd_iov[i].kiov_len; + } + + token.data = bsdv->bsd_data; + token.len = lustre_msg_buflen(vmsg, voff) - + sizeof(*bsdv); + + maj = lgss_verify_mic(gctx->gc_mechctx, 0, NULL, + desc->bd_iov_count, desc->bd_iov, + &token); + if (maj != GSS_S_COMPLETE) { + CERROR("failed to verify bulk read: %x\n", maj); + RETURN(-EACCES); + } + } else if (bsdv->bsd_svc == SPTLRPC_BULK_SVC_PRIV) { + desc->bd_nob = bsdv->bsd_nob; + if (desc->bd_nob == 0) + RETURN(0); + + token.data = bsdv->bsd_data; + token.len = lustre_msg_buflen(vmsg, voff) - + sizeof(*bsdr); + + maj = lgss_unwrap_bulk(gctx->gc_mechctx, desc, &token); + if (maj != GSS_S_COMPLETE) { + CERROR("failed to decrypt bulk read: %x\n", + maj); + RETURN(-EACCES); + } + + desc->bd_nob_transferred = desc->bd_nob; } } -verify_csum: - rc = bulk_csum_cli_reply(desc, req->rq_bulk_read, - rmsg, roff, vmsg, voff); + RETURN(0); +} + +static int gss_prep_bulk(struct ptlrpc_bulk_desc *desc, + struct gss_ctx *mechctx) +{ + int rc; + + if (desc->bd_iov_count == 0) + return 0; + + rc = sptlrpc_enc_pool_get_pages(desc); + if (rc) + return rc; + + if (lgss_prep_bulk(mechctx, desc) != GSS_S_COMPLETE) + return -EACCES; + + return 0; +} + +int gss_cli_prep_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + int rc; + ENTRY; + + LASSERT(req->rq_cli_ctx); + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_bulk_read); + + if (SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc) != SPTLRPC_BULK_SVC_PRIV) 
+ RETURN(0); + + rc = gss_prep_bulk(desc, ctx2gctx(req->rq_cli_ctx)->gc_mechctx); + if (rc) + CERROR("bulk read: failed to prepare encryption " + "pages: %d\n", rc); + + RETURN(rc); +} + +int gss_svc_prep_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct gss_svc_reqctx *grctx; + struct ptlrpc_bulk_sec_desc *bsd; + int rc; + ENTRY; + + LASSERT(req->rq_svc_ctx); + LASSERT(req->rq_pack_bulk); + LASSERT(req->rq_bulk_write); + + grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + LASSERT(grctx->src_reqbsd); + LASSERT(grctx->src_repbsd); + LASSERT(grctx->src_ctx); + LASSERT(grctx->src_ctx->gsc_mechctx); + + bsd = grctx->src_reqbsd; + if (bsd->bsd_svc != SPTLRPC_BULK_SVC_PRIV) + RETURN(0); + + rc = gss_prep_bulk(desc, grctx->src_ctx->gsc_mechctx); + if (rc) + CERROR("bulk write: failed to prepare encryption " + "pages: %d\n", rc); + RETURN(rc); } @@ -591,7 +378,9 @@ int gss_svc_unwrap_bulk(struct ptlrpc_request *req, struct ptlrpc_bulk_desc *desc) { struct gss_svc_reqctx *grctx; - int rc; + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + rawobj_t token; + __u32 maj; ENTRY; LASSERT(req->rq_svc_ctx); @@ -605,29 +394,64 @@ int gss_svc_unwrap_bulk(struct ptlrpc_request *req, LASSERT(grctx->src_ctx); LASSERT(grctx->src_ctx->gsc_mechctx); - /* decrypt bulk data if it's encrypted */ - if (grctx->src_reqbsd->bsd_ciph_alg != BULK_CIPH_ALG_NULL) { - rc = do_bulk_privacy(grctx->src_ctx->gsc_mechctx, desc, 0, - grctx->src_reqbsd->bsd_ciph_alg, - grctx->src_reqbsd); - if (rc) { - CERROR("bulk write: server failed to decrypt data\n"); - RETURN(rc); + bsdr = grctx->src_reqbsd; + bsdv = grctx->src_repbsd; + + /* bsdr has been sanity checked during unpacking */ + bsdv->bsd_version = 0; + bsdv->bsd_type = SPTLRPC_BULK_DEFAULT; + bsdv->bsd_svc = bsdr->bsd_svc; + bsdv->bsd_flags = 0; + + switch (bsdv->bsd_svc) { + case SPTLRPC_BULK_SVC_INTG: + token.data = bsdr->bsd_data; + token.len = grctx->src_reqbsd_size - sizeof(*bsdr); + + maj = 
lgss_verify_mic(grctx->src_ctx->gsc_mechctx, 0, NULL, + desc->bd_iov_count, desc->bd_iov, &token); + if (maj != GSS_S_COMPLETE) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("failed to verify bulk signature: %x\n", maj); + RETURN(-EACCES); + } + break; + case SPTLRPC_BULK_SVC_PRIV: + if (bsdr->bsd_nob != desc->bd_nob) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("prepared nob %d doesn't match the actual " + "nob %d\n", desc->bd_nob, bsdr->bsd_nob); + RETURN(-EPROTO); } - } - /* verify bulk data checksum */ - rc = bulk_csum_svc(desc, req->rq_bulk_read, - grctx->src_reqbsd, grctx->src_reqbsd_size, - grctx->src_repbsd, grctx->src_repbsd_size); + if (desc->bd_iov_count == 0) { + LASSERT(desc->bd_nob == 0); + break; + } - RETURN(rc); + token.data = bsdr->bsd_data; + token.len = grctx->src_reqbsd_size - sizeof(*bsdr); + + maj = lgss_unwrap_bulk(grctx->src_ctx->gsc_mechctx, + desc, &token); + if (maj != GSS_S_COMPLETE) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("failed decrypt bulk data: %x\n", maj); + RETURN(-EACCES); + } + break; + } + + RETURN(0); } int gss_svc_wrap_bulk(struct ptlrpc_request *req, struct ptlrpc_bulk_desc *desc) { struct gss_svc_reqctx *grctx; + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + rawobj_t token; + __u32 maj; int rc; ENTRY; @@ -642,22 +466,56 @@ int gss_svc_wrap_bulk(struct ptlrpc_request *req, LASSERT(grctx->src_ctx); LASSERT(grctx->src_ctx->gsc_mechctx); - /* generate bulk data checksum */ - rc = bulk_csum_svc(desc, req->rq_bulk_read, - grctx->src_reqbsd, grctx->src_reqbsd_size, - grctx->src_repbsd, grctx->src_repbsd_size); - if (rc) - RETURN(rc); - - /* encrypt bulk data if required */ - if (grctx->src_reqbsd->bsd_ciph_alg != BULK_CIPH_ALG_NULL) { - rc = do_bulk_privacy(grctx->src_ctx->gsc_mechctx, desc, 1, - grctx->src_reqbsd->bsd_ciph_alg, - grctx->src_repbsd); - if (rc) - CERROR("bulk read: server failed to encrypt data: " - "rc %d\n", rc); + bsdr = grctx->src_reqbsd; + bsdv = grctx->src_repbsd; + + /* bsdr has been sanity checked during 
unpacking */ + bsdv->bsd_version = 0; + bsdv->bsd_type = SPTLRPC_BULK_DEFAULT; + bsdv->bsd_svc = bsdr->bsd_svc; + bsdv->bsd_flags = 0; + + switch (bsdv->bsd_svc) { + case SPTLRPC_BULK_SVC_INTG: + token.data = bsdv->bsd_data; + token.len = grctx->src_repbsd_size - sizeof(*bsdv); + + maj = lgss_get_mic(grctx->src_ctx->gsc_mechctx, 0, NULL, + desc->bd_iov_count, desc->bd_iov, &token); + if (maj != GSS_S_COMPLETE) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("failed to sign bulk data: %x\n", maj); + RETURN(-EACCES); + } + break; + case SPTLRPC_BULK_SVC_PRIV: + bsdv->bsd_nob = desc->bd_nob; + + if (desc->bd_iov_count == 0) { + LASSERT(desc->bd_nob == 0); + break; + } + + rc = sptlrpc_enc_pool_get_pages(desc); + if (rc) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("bulk read: failed to allocate encryption " + "pages: %d\n", rc); + RETURN(rc); + } + + token.data = bsdv->bsd_data; + token.len = grctx->src_repbsd_size - sizeof(*bsdv); + + maj = lgss_wrap_bulk(grctx->src_ctx->gsc_mechctx, + desc, &token, 1); + if (maj != GSS_S_COMPLETE) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("failed to encrypt bulk data: %x\n", maj); + RETURN(-EACCES); + } + break; } - RETURN(rc); + RETURN(0); } diff --git a/lustre/ptlrpc/gss/gss_internal.h b/lustre/ptlrpc/gss/gss_internal.h index afbb6144..66afd61 100644 --- a/lustre/ptlrpc/gss/gss_internal.h +++ b/lustre/ptlrpc/gss/gss_internal.h @@ -433,12 +433,16 @@ int __init gss_init_pipefs(void); void __exit gss_exit_pipefs(void); /* gss_bulk.c */ +int gss_cli_prep_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); int gss_cli_ctx_wrap_bulk(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req, struct ptlrpc_bulk_desc *desc); int gss_cli_ctx_unwrap_bulk(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req, struct ptlrpc_bulk_desc *desc); +int gss_svc_prep_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); int gss_svc_unwrap_bulk(struct ptlrpc_request *req, struct ptlrpc_bulk_desc *desc); int 
gss_svc_wrap_bulk(struct ptlrpc_request *req, diff --git a/lustre/ptlrpc/gss/gss_keyring.c b/lustre/ptlrpc/gss/gss_keyring.c index 74c786d..8906109 100644 --- a/lustre/ptlrpc/gss/gss_keyring.c +++ b/lustre/ptlrpc/gss/gss_keyring.c @@ -1450,6 +1450,7 @@ static struct ptlrpc_sec_sops gss_sec_keyring_sops = { .authorize = gss_svc_authorize, .free_rs = gss_svc_free_rs, .free_ctx = gss_svc_free_ctx, + .prep_bulk = gss_svc_prep_bulk, .unwrap_bulk = gss_svc_unwrap_bulk, .wrap_bulk = gss_svc_wrap_bulk, .install_rctx = gss_svc_install_rctx_kr, diff --git a/lustre/ptlrpc/gss/gss_krb5_mech.c b/lustre/ptlrpc/gss/gss_krb5_mech.c index a9a5388..7eb0c95 100644 --- a/lustre/ptlrpc/gss/gss_krb5_mech.c +++ b/lustre/ptlrpc/gss/gss_krb5_mech.c @@ -531,7 +531,7 @@ void gss_delete_sec_context_kerberos(void *internal_ctx) } static -void buf_to_sg(struct scatterlist *sg, char *ptr, int len) +void buf_to_sg(struct scatterlist *sg, void *ptr, int len) { sg->page = virt_to_page(ptr); sg->offset = offset_in_page(ptr); @@ -582,13 +582,15 @@ out: return(ret); } +#ifdef HAVE_ASYNC_BLOCK_CIPHER + static inline int krb5_digest_hmac(struct ll_crypto_hash *tfm, rawobj_t *key, struct krb5_header *khdr, int msgcnt, rawobj_t *msgs, + int iovcnt, lnet_kiov_t *iovs, rawobj_t *cksum) -#ifdef HAVE_ASYNC_BLOCK_CIPHER { struct hash_desc desc; struct scatterlist sg[1]; @@ -607,6 +609,15 @@ int krb5_digest_hmac(struct ll_crypto_hash *tfm, ll_crypto_hash_update(&desc, sg, msgs[i].len); } + for (i = 0; i < iovcnt; i++) { + if (iovs[i].kiov_len == 0) + continue; + sg[0].page = iovs[i].kiov_page; + sg[0].offset = iovs[i].kiov_offset; + sg[0].length = iovs[i].kiov_len; + ll_crypto_hash_update(&desc, sg, iovs[i].kiov_len); + } + if (khdr) { buf_to_sg(sg, (char *) khdr, sizeof(*khdr)); ll_crypto_hash_update(&desc, sg, sizeof(*khdr)); @@ -614,7 +625,16 @@ int krb5_digest_hmac(struct ll_crypto_hash *tfm, return ll_crypto_hash_final(&desc, cksum->data); } -#else /* HAVE_ASYNC_BLOCK_CIPHER */ + +#else /* ! 
HAVE_ASYNC_BLOCK_CIPHER */ + +static inline +int krb5_digest_hmac(struct ll_crypto_hash *tfm, + rawobj_t *key, + struct krb5_header *khdr, + int msgcnt, rawobj_t *msgs, + int iovcnt, lnet_kiov_t *iovs, + rawobj_t *cksum) { struct scatterlist sg[1]; __u32 keylen = key->len, i; @@ -628,6 +648,15 @@ int krb5_digest_hmac(struct ll_crypto_hash *tfm, crypto_hmac_update(tfm, sg, 1); } + for (i = 0; i < iovcnt; i++) { + if (iovs[i].kiov_len == 0) + continue; + sg[0].page = iovs[i].kiov_page; + sg[0].offset = iovs[i].kiov_offset; + sg[0].length = iovs[i].kiov_len; + crypto_hmac_update(tfm, sg, 1); + } + if (khdr) { buf_to_sg(sg, (char *) khdr, sizeof(*khdr)); crypto_hmac_update(tfm, sg, 1); @@ -636,6 +665,7 @@ int krb5_digest_hmac(struct ll_crypto_hash *tfm, crypto_hmac_final(tfm, key->data, &keylen, cksum->data); return 0; } + #endif /* HAVE_ASYNC_BLOCK_CIPHER */ static inline @@ -643,6 +673,7 @@ int krb5_digest_norm(struct ll_crypto_hash *tfm, struct krb5_keyblock *kb, struct krb5_header *khdr, int msgcnt, rawobj_t *msgs, + int iovcnt, lnet_kiov_t *iovs, rawobj_t *cksum) { struct hash_desc desc; @@ -662,6 +693,15 @@ int krb5_digest_norm(struct ll_crypto_hash *tfm, ll_crypto_hash_update(&desc, sg, msgs[i].len); } + for (i = 0; i < iovcnt; i++) { + if (iovs[i].kiov_len == 0) + continue; + sg[0].page = iovs[i].kiov_page; + sg[0].offset = iovs[i].kiov_offset; + sg[0].length = iovs[i].kiov_len; + ll_crypto_hash_update(&desc, sg, iovs[i].kiov_len); + } + if (khdr) { buf_to_sg(sg, (char *) khdr, sizeof(*khdr)); ll_crypto_hash_update(&desc, sg, sizeof(*khdr)); @@ -682,6 +722,7 @@ __s32 krb5_make_checksum(__u32 enctype, struct krb5_keyblock *kb, struct krb5_header *khdr, int msgcnt, rawobj_t *msgs, + int iovcnt, lnet_kiov_t *iovs, rawobj_t *cksum) { struct krb5_enctype *ke = &enctypes[enctype]; @@ -703,10 +744,10 @@ __s32 krb5_make_checksum(__u32 enctype, if (ke->ke_hash_hmac) rc = krb5_digest_hmac(tfm, &kb->kb_key, - khdr, msgcnt, msgs, cksum); + khdr, msgcnt, msgs, iovcnt, iovs, 
cksum); else rc = krb5_digest_norm(tfm, kb, - khdr, msgcnt, msgs, cksum); + khdr, msgcnt, msgs, iovcnt, iovs, cksum); if (rc == 0) code = GSS_S_COMPLETE; @@ -715,38 +756,96 @@ out_tfm: return code; } +static void fill_krb5_header(struct krb5_ctx *kctx, + struct krb5_header *khdr, + int privacy) +{ + unsigned char acceptor_flag; + + acceptor_flag = kctx->kc_initiate ? 0 : FLAG_SENDER_IS_ACCEPTOR; + + if (privacy) { + khdr->kh_tok_id = cpu_to_be16(KG_TOK_WRAP_MSG); + khdr->kh_flags = acceptor_flag | FLAG_WRAP_CONFIDENTIAL; + khdr->kh_ec = cpu_to_be16(0); + khdr->kh_rrc = cpu_to_be16(0); + } else { + khdr->kh_tok_id = cpu_to_be16(KG_TOK_MIC_MSG); + khdr->kh_flags = acceptor_flag; + khdr->kh_ec = cpu_to_be16(0xffff); + khdr->kh_rrc = cpu_to_be16(0xffff); + } + + khdr->kh_filler = 0xff; + spin_lock(&krb5_seq_lock); + khdr->kh_seq = cpu_to_be64(kctx->kc_seq_send++); + spin_unlock(&krb5_seq_lock); +} + +static __u32 verify_krb5_header(struct krb5_ctx *kctx, + struct krb5_header *khdr, + int privacy) +{ + unsigned char acceptor_flag; + __u16 tok_id, ec_rrc; + + acceptor_flag = kctx->kc_initiate ? 
FLAG_SENDER_IS_ACCEPTOR : 0; + + if (privacy) { + tok_id = KG_TOK_WRAP_MSG; + ec_rrc = 0x0; + } else { + tok_id = KG_TOK_MIC_MSG; + ec_rrc = 0xffff; + } + + /* sanity checks */ + if (be16_to_cpu(khdr->kh_tok_id) != tok_id) { + CERROR("bad token id\n"); + return GSS_S_DEFECTIVE_TOKEN; + } + if ((khdr->kh_flags & FLAG_SENDER_IS_ACCEPTOR) != acceptor_flag) { + CERROR("bad direction flag\n"); + return GSS_S_BAD_SIG; + } + if (privacy && (khdr->kh_flags & FLAG_WRAP_CONFIDENTIAL) == 0) { + CERROR("missing confidential flag\n"); + return GSS_S_BAD_SIG; + } + if (khdr->kh_filler != 0xff) { + CERROR("bad filler\n"); + return GSS_S_DEFECTIVE_TOKEN; + } + if (be16_to_cpu(khdr->kh_ec) != ec_rrc || + be16_to_cpu(khdr->kh_rrc) != ec_rrc) { + CERROR("bad EC or RRC\n"); + return GSS_S_DEFECTIVE_TOKEN; + } + return GSS_S_COMPLETE; +} + static __u32 gss_get_mic_kerberos(struct gss_ctx *gctx, int msgcnt, rawobj_t *msgs, + int iovcnt, + lnet_kiov_t *iovs, rawobj_t *token) { struct krb5_ctx *kctx = gctx->internal_ctx_id; struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; struct krb5_header *khdr; - unsigned char acceptor_flag; rawobj_t cksum = RAWOBJ_EMPTY; - __u32 rc = GSS_S_FAILURE; - - acceptor_flag = kctx->kc_initiate ? 
0 : FLAG_SENDER_IS_ACCEPTOR; /* fill krb5 header */ LASSERT(token->len >= sizeof(*khdr)); khdr = (struct krb5_header *) token->data; - - khdr->kh_tok_id = cpu_to_be16(KG_TOK_MIC_MSG); - khdr->kh_flags = acceptor_flag; - khdr->kh_filler = 0xff; - khdr->kh_ec = cpu_to_be16(0xffff); - khdr->kh_rrc = cpu_to_be16(0xffff); - spin_lock(&krb5_seq_lock); - khdr->kh_seq = cpu_to_be64(kctx->kc_seq_send++); - spin_unlock(&krb5_seq_lock); + fill_krb5_header(kctx, khdr, 0); /* checksum */ if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc, - khdr, msgcnt, msgs, &cksum)) - goto out_err; + khdr, msgcnt, msgs, iovcnt, iovs, &cksum)) + return GSS_S_FAILURE; LASSERT(cksum.len >= ke->ke_hash_size); LASSERT(token->len >= sizeof(*khdr) + ke->ke_hash_size); @@ -754,26 +853,23 @@ __u32 gss_get_mic_kerberos(struct gss_ctx *gctx, ke->ke_hash_size); token->len = sizeof(*khdr) + ke->ke_hash_size; - rc = GSS_S_COMPLETE; -out_err: rawobj_free(&cksum); - return rc; + return GSS_S_COMPLETE; } static __u32 gss_verify_mic_kerberos(struct gss_ctx *gctx, int msgcnt, rawobj_t *msgs, + int iovcnt, + lnet_kiov_t *iovs, rawobj_t *token) { struct krb5_ctx *kctx = gctx->internal_ctx_id; struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; struct krb5_header *khdr; - unsigned char acceptor_flag; rawobj_t cksum = RAWOBJ_EMPTY; - __u32 rc = GSS_S_FAILURE; - - acceptor_flag = kctx->kc_initiate ? 
FLAG_SENDER_IS_ACCEPTOR : 0; + __u32 major; if (token->len < sizeof(*khdr)) { CERROR("short signature: %u\n", token->len); @@ -782,47 +878,34 @@ __u32 gss_verify_mic_kerberos(struct gss_ctx *gctx, khdr = (struct krb5_header *) token->data; - /* sanity checks */ - if (be16_to_cpu(khdr->kh_tok_id) != KG_TOK_MIC_MSG) { - CERROR("bad token id\n"); - return GSS_S_DEFECTIVE_TOKEN; - } - if ((khdr->kh_flags & FLAG_SENDER_IS_ACCEPTOR) != acceptor_flag) { - CERROR("bad direction flag\n"); - return GSS_S_BAD_SIG; - } - if (khdr->kh_filler != 0xff) { - CERROR("bad filler\n"); - return GSS_S_DEFECTIVE_TOKEN; - } - if (be16_to_cpu(khdr->kh_ec) != 0xffff || - be16_to_cpu(khdr->kh_rrc) != 0xffff) { - CERROR("bad EC or RRC\n"); - return GSS_S_DEFECTIVE_TOKEN; + major = verify_krb5_header(kctx, khdr, 0); + if (major != GSS_S_COMPLETE) { + CERROR("bad krb5 header\n"); + return major; } if (token->len < sizeof(*khdr) + ke->ke_hash_size) { CERROR("short signature: %u, require %d\n", token->len, (int) sizeof(*khdr) + ke->ke_hash_size); - goto out; + return GSS_S_FAILURE; } if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyc, - khdr, msgcnt, msgs, &cksum)) + khdr, msgcnt, msgs, iovcnt, iovs, &cksum)) { + CERROR("failed to make checksum\n"); return GSS_S_FAILURE; + } LASSERT(cksum.len >= ke->ke_hash_size); if (memcmp(khdr + 1, cksum.data + cksum.len - ke->ke_hash_size, ke->ke_hash_size)) { CERROR("checksum mismatch\n"); - rc = GSS_S_BAD_SIG; - goto out; + rawobj_free(&cksum); + return GSS_S_BAD_SIG; } - rc = GSS_S_COMPLETE; -out: rawobj_free(&cksum); - return rc; + return GSS_S_COMPLETE; } static @@ -902,6 +985,195 @@ int krb5_encrypt_rawobjs(struct ll_crypto_cipher *tfm, } static +int krb5_encrypt_bulk(struct ll_crypto_cipher *tfm, + struct krb5_header *khdr, + char *confounder, + struct ptlrpc_bulk_desc *desc, + rawobj_t *cipher, + int adj_nob) +{ + struct blkcipher_desc ciph_desc; + __u8 local_iv[16] = {0}; + struct scatterlist src, dst; + int blocksize, i, rc, nob = 0; + + 
LASSERT(desc->bd_iov_count); + LASSERT(desc->bd_enc_iov); + + blocksize = ll_crypto_blkcipher_blocksize(tfm); + LASSERT(blocksize > 1); + LASSERT(cipher->len == blocksize + sizeof(*khdr)); + + ciph_desc.tfm = tfm; + ciph_desc.info = local_iv; + ciph_desc.flags = 0; + + /* encrypt confounder */ + buf_to_sg(&src, confounder, blocksize); + buf_to_sg(&dst, cipher->data, blocksize); + + rc = ll_crypto_blkcipher_encrypt_iv(&ciph_desc, &dst, &src, blocksize); + if (rc) { + CERROR("error to encrypt confounder: %d\n", rc); + return rc; + } + + /* encrypt clear pages */ + for (i = 0; i < desc->bd_iov_count; i++) { + src.page = desc->bd_iov[i].kiov_page; + src.offset = desc->bd_iov[i].kiov_offset; + src.length = (desc->bd_iov[i].kiov_len + blocksize - 1) & + (~(blocksize - 1)); + + if (adj_nob) + nob += src.length; + + dst.page = desc->bd_enc_iov[i].kiov_page; + dst.offset = src.offset; + dst.length = src.length; + + desc->bd_enc_iov[i].kiov_offset = dst.offset; + desc->bd_enc_iov[i].kiov_len = dst.length; + + rc = ll_crypto_blkcipher_encrypt_iv(&ciph_desc, &dst, &src, + src.length); + if (rc) { + CERROR("error to encrypt page: %d\n", rc); + return rc; + } + } + + /* encrypt krb5 header */ + buf_to_sg(&src, khdr, sizeof(*khdr)); + buf_to_sg(&dst, cipher->data + blocksize, sizeof(*khdr)); + + rc = ll_crypto_blkcipher_encrypt_iv(&ciph_desc, + &dst, &src, sizeof(*khdr)); + if (rc) { + CERROR("error to encrypt krb5 header: %d\n", rc); + return rc; + } + + if (adj_nob) + desc->bd_nob = nob; + + return 0; +} + +/* + * desc->bd_nob_transferred is the size of cipher text received. + * desc->bd_nob is the target size of plain text supposed to be. 
+ */ +static +int krb5_decrypt_bulk(struct ll_crypto_cipher *tfm, + struct krb5_header *khdr, + struct ptlrpc_bulk_desc *desc, + rawobj_t *cipher, + rawobj_t *plain) +{ + struct blkcipher_desc ciph_desc; + __u8 local_iv[16] = {0}; + struct scatterlist src, dst; + int ct_nob = 0, pt_nob = 0; + int blocksize, i, rc; + + LASSERT(desc->bd_iov_count); + LASSERT(desc->bd_enc_iov); + LASSERT(desc->bd_nob_transferred); + + blocksize = ll_crypto_blkcipher_blocksize(tfm); + LASSERT(blocksize > 1); + LASSERT(cipher->len == blocksize + sizeof(*khdr)); + + ciph_desc.tfm = tfm; + ciph_desc.info = local_iv; + ciph_desc.flags = 0; + + if (desc->bd_nob_transferred % blocksize) { + CERROR("odd transferred nob: %d\n", desc->bd_nob_transferred); + return -EPROTO; + } + + /* decrypt head (confounder) */ + buf_to_sg(&src, cipher->data, blocksize); + buf_to_sg(&dst, plain->data, blocksize); + + rc = ll_crypto_blkcipher_decrypt_iv(&ciph_desc, &dst, &src, blocksize); + if (rc) { + CERROR("error to decrypt confounder: %d\n", rc); + return rc; + } + + /* + * decrypt clear pages. note the enc_iov is prepared by prep_bulk() + * which already done some sanity checkings. + * + * desc->bd_nob is the actual plain text size supposed to be + * transferred. desc->bd_nob_transferred is the actual cipher + * text received. 
+ */ + for (i = 0; i < desc->bd_iov_count && ct_nob < desc->bd_nob_transferred; + i++) { + if (desc->bd_enc_iov[i].kiov_len == 0) + continue; + + if (ct_nob + desc->bd_enc_iov[i].kiov_len > + desc->bd_nob_transferred) + desc->bd_enc_iov[i].kiov_len = + desc->bd_nob_transferred - ct_nob; + + desc->bd_iov[i].kiov_len = desc->bd_enc_iov[i].kiov_len; + if (pt_nob + desc->bd_enc_iov[i].kiov_len > desc->bd_nob) + desc->bd_iov[i].kiov_len = desc->bd_nob - pt_nob; + + src.page = desc->bd_enc_iov[i].kiov_page; + src.offset = desc->bd_enc_iov[i].kiov_offset; + src.length = desc->bd_enc_iov[i].kiov_len; + + dst = src; + + if (desc->bd_iov[i].kiov_offset % blocksize == 0) + dst.page = desc->bd_iov[i].kiov_page; + + rc = ll_crypto_blkcipher_decrypt_iv(&ciph_desc, &dst, &src, + src.length); + if (rc) { + CERROR("error to decrypt page: %d\n", rc); + return rc; + } + + if (desc->bd_iov[i].kiov_offset % blocksize) { + memcpy(cfs_page_address(desc->bd_iov[i].kiov_page) + + desc->bd_iov[i].kiov_offset, + cfs_page_address(desc->bd_enc_iov[i].kiov_page) + + desc->bd_iov[i].kiov_offset, + desc->bd_iov[i].kiov_len); + } + + ct_nob += desc->bd_enc_iov[i].kiov_len; + pt_nob += desc->bd_iov[i].kiov_len; + } + + /* decrypt tail (krb5 header) */ + buf_to_sg(&src, cipher->data + blocksize, sizeof(*khdr)); + buf_to_sg(&dst, cipher->data + blocksize, sizeof(*khdr)); + + rc = ll_crypto_blkcipher_decrypt_iv(&ciph_desc, + &dst, &src, sizeof(*khdr)); + if (rc) { + CERROR("error to decrypt tail: %d\n", rc); + return rc; + } + + if (memcmp(cipher->data + blocksize, khdr, sizeof(*khdr))) { + CERROR("krb5 header doesn't match\n"); + return -EACCES; + } + + return 0; +} + +static __u32 gss_wrap_kerberos(struct gss_ctx *gctx, rawobj_t *gsshdr, rawobj_t *msg, @@ -911,12 +1183,11 @@ __u32 gss_wrap_kerberos(struct gss_ctx *gctx, struct krb5_ctx *kctx = gctx->internal_ctx_id; struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; struct krb5_header *khdr; - unsigned char acceptor_flag; int blocksize; rawobj_t 
cksum = RAWOBJ_EMPTY; - rawobj_t data_desc[4], cipher; + rawobj_t data_desc[3], cipher; __u8 conf[GSS_MAX_CIPHER_BLOCK]; - int enc_rc = 0; + int rc = 0; LASSERT(ke); LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK); @@ -934,16 +1205,7 @@ __u32 gss_wrap_kerberos(struct gss_ctx *gctx, /* fill krb5 header */ LASSERT(token->len >= sizeof(*khdr)); khdr = (struct krb5_header *) token->data; - acceptor_flag = kctx->kc_initiate ? 0 : FLAG_SENDER_IS_ACCEPTOR; - - khdr->kh_tok_id = cpu_to_be16(KG_TOK_WRAP_MSG); - khdr->kh_flags = acceptor_flag | FLAG_WRAP_CONFIDENTIAL; - khdr->kh_filler = 0xff; - khdr->kh_ec = cpu_to_be16(0); - khdr->kh_rrc = cpu_to_be16(0); - spin_lock(&krb5_seq_lock); - khdr->kh_seq = cpu_to_be64(kctx->kc_seq_send++); - spin_unlock(&krb5_seq_lock); + fill_krb5_header(kctx, khdr, 1); /* generate confounder */ get_random_bytes(conf, ke->ke_conf_size); @@ -975,12 +1237,10 @@ __u32 gss_wrap_kerberos(struct gss_ctx *gctx, data_desc[1].len = gsshdr->len; data_desc[2].data = msg->data; data_desc[2].len = msg->len; - data_desc[3].data = (__u8 *) khdr; - data_desc[3].len = sizeof(*khdr); /* compute checksum */ if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, - khdr, 4, data_desc, &cksum)) + khdr, 3, data_desc, 0, NULL, &cksum)) return GSS_S_FAILURE; LASSERT(cksum.len >= ke->ke_hash_size); @@ -1007,26 +1267,26 @@ __u32 gss_wrap_kerberos(struct gss_ctx *gctx, struct ll_crypto_cipher *arc4_tfm; if (krb5_make_checksum(ENCTYPE_ARCFOUR_HMAC, &kctx->kc_keyi, - NULL, 1, &cksum, &arc4_keye)) { + NULL, 1, &cksum, 0, NULL, &arc4_keye)) { CERROR("failed to obtain arc4 enc key\n"); - GOTO(arc4_out, enc_rc = -EACCES); + GOTO(arc4_out, rc = -EACCES); } arc4_tfm = ll_crypto_alloc_blkcipher("ecb(arc4)", 0, 0); if (arc4_tfm == NULL) { CERROR("failed to alloc tfm arc4 in ECB mode\n"); - GOTO(arc4_out_key, enc_rc = -EACCES); + GOTO(arc4_out_key, rc = -EACCES); } if (ll_crypto_blkcipher_setkey(arc4_tfm, arc4_keye.data, arc4_keye.len)) { CERROR("failed to set arc4 key, len 
%d\n", arc4_keye.len); - GOTO(arc4_out_tfm, enc_rc = -EACCES); + GOTO(arc4_out_tfm, rc = -EACCES); } - enc_rc = krb5_encrypt_rawobjs(arc4_tfm, 1, - 3, data_desc, &cipher, 1); + rc = krb5_encrypt_rawobjs(arc4_tfm, 1, + 3, data_desc, &cipher, 1); arc4_out_tfm: ll_crypto_free_blkcipher(arc4_tfm); arc4_out_key: @@ -1034,11 +1294,155 @@ arc4_out_key: arc4_out: do {} while(0); /* just to avoid compile warning */ } else { - enc_rc = krb5_encrypt_rawobjs(kctx->kc_keye.kb_tfm, 0, - 3, data_desc, &cipher, 1); + rc = krb5_encrypt_rawobjs(kctx->kc_keye.kb_tfm, 0, + 3, data_desc, &cipher, 1); + } + + if (rc != 0) { + rawobj_free(&cksum); + return GSS_S_FAILURE; + } + + /* fill in checksum */ + LASSERT(token->len >= sizeof(*khdr) + cipher.len + ke->ke_hash_size); + memcpy((char *)(khdr + 1) + cipher.len, + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size); + rawobj_free(&cksum); + + /* final token length */ + token->len = sizeof(*khdr) + cipher.len + ke->ke_hash_size; + return GSS_S_COMPLETE; +} + +static +__u32 gss_prep_bulk_kerberos(struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + int blocksize, i; + + LASSERT(desc->bd_iov_count); + LASSERT(desc->bd_enc_iov); + LASSERT(kctx->kc_keye.kb_tfm); + + blocksize = ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm); + + for (i = 0; i < desc->bd_iov_count; i++) { + LASSERT(desc->bd_enc_iov[i].kiov_page); + /* + * offset should always start at page boundary of either + * client or server side. 
+ */ + if (desc->bd_iov[i].kiov_offset & blocksize) { + CERROR("odd offset %d in page %d\n", + desc->bd_iov[i].kiov_offset, i); + return GSS_S_FAILURE; + } + + desc->bd_enc_iov[i].kiov_offset = desc->bd_iov[i].kiov_offset; + desc->bd_enc_iov[i].kiov_len = (desc->bd_iov[i].kiov_len + + blocksize - 1) & (~(blocksize - 1)); + } + + return GSS_S_COMPLETE; +} + +static +__u32 gss_wrap_bulk_kerberos(struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, int adj_nob) +{ + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + struct krb5_header *khdr; + int blocksize; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t data_desc[1], cipher; + __u8 conf[GSS_MAX_CIPHER_BLOCK]; + int rc = 0; + + LASSERT(ke); + LASSERT(ke->ke_conf_size <= GSS_MAX_CIPHER_BLOCK); + + /* + * final token format: + * -------------------------------------------------- + * | krb5 header | head/tail cipher text | checksum | + * -------------------------------------------------- + */ + + /* fill krb5 header */ + LASSERT(token->len >= sizeof(*khdr)); + khdr = (struct krb5_header *) token->data; + fill_krb5_header(kctx, khdr, 1); + + /* generate confounder */ + get_random_bytes(conf, ke->ke_conf_size); + + /* get encryption blocksize. note kc_keye might not associated with + * a tfm, currently only for arcfour-hmac */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksize = 1; + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksize = ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm); + } + + /* + * we assume the size of krb5_header (16 bytes) must be n * blocksize. 
+ * the bulk token size would be exactly (sizeof(krb5_header) + + * blocksize + sizeof(krb5_header) + hashsize) + */ + LASSERT(blocksize <= ke->ke_conf_size); + LASSERT(sizeof(*khdr) >= blocksize && sizeof(*khdr) % blocksize == 0); + LASSERT(token->len >= sizeof(*khdr) + blocksize + sizeof(*khdr) + 16); + + /* + * clear text layout for checksum: + * ------------------------------------------ + * | confounder | clear pages | krb5 header | + * ------------------------------------------ + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; + + /* compute checksum */ + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, + khdr, 1, data_desc, + desc->bd_iov_count, desc->bd_iov, + &cksum)) + return GSS_S_FAILURE; + LASSERT(cksum.len >= ke->ke_hash_size); + + /* + * clear text layout for encryption: + * ------------------------------------------ + * | confounder | clear pages | krb5 header | + * ------------------------------------------ + * | | | + * ---------- (cipher pages) | + * result token: | | + * ------------------------------------------- + * | krb5 header | cipher text | cipher text | + * ------------------------------------------- + */ + data_desc[0].data = conf; + data_desc[0].len = ke->ke_conf_size; + + cipher.data = (__u8 *) (khdr + 1); + cipher.len = blocksize + sizeof(*khdr); + + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LBUG(); + rc = 0; + } else { + rc = krb5_encrypt_bulk(kctx->kc_keye.kb_tfm, khdr, + conf, desc, &cipher, adj_nob); } - if (enc_rc != 0) { + if (rc != 0) { rawobj_free(&cksum); return GSS_S_FAILURE; } @@ -1064,18 +1468,16 @@ __u32 gss_unwrap_kerberos(struct gss_ctx *gctx, struct krb5_ctx *kctx = gctx->internal_ctx_id; struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; struct krb5_header *khdr; - unsigned char acceptor_flag; unsigned char *tmpbuf; int blocksize, bodysize; rawobj_t cksum = RAWOBJ_EMPTY; rawobj_t cipher_in, plain_out; rawobj_t hash_objs[3]; - __u32 rc = GSS_S_FAILURE, enc_rc = 0; + int rc = 
0; + __u32 major; LASSERT(ke); - acceptor_flag = kctx->kc_initiate ? FLAG_SENDER_IS_ACCEPTOR : 0; - if (token->len < sizeof(*khdr)) { CERROR("short signature: %u\n", token->len); return GSS_S_DEFECTIVE_TOKEN; @@ -1083,27 +1485,10 @@ __u32 gss_unwrap_kerberos(struct gss_ctx *gctx, khdr = (struct krb5_header *) token->data; - /* sanity check header */ - if (be16_to_cpu(khdr->kh_tok_id) != KG_TOK_WRAP_MSG) { - CERROR("bad token id\n"); - return GSS_S_DEFECTIVE_TOKEN; - } - if ((khdr->kh_flags & FLAG_SENDER_IS_ACCEPTOR) != acceptor_flag) { - CERROR("bad direction flag\n"); - return GSS_S_BAD_SIG; - } - if ((khdr->kh_flags & FLAG_WRAP_CONFIDENTIAL) == 0) { - CERROR("missing confidential flag\n"); - return GSS_S_BAD_SIG; - } - if (khdr->kh_filler != 0xff) { - CERROR("bad filler\n"); - return GSS_S_DEFECTIVE_TOKEN; - } - if (be16_to_cpu(khdr->kh_ec) != 0x0 || - be16_to_cpu(khdr->kh_rrc) != 0x0) { - CERROR("bad EC or RRC\n"); - return GSS_S_DEFECTIVE_TOKEN; + major = verify_krb5_header(kctx, khdr, 1); + if (major != GSS_S_COMPLETE) { + CERROR("bad krb5 header\n"); + return major; } /* block size */ @@ -1143,6 +1528,8 @@ __u32 gss_unwrap_kerberos(struct gss_ctx *gctx, if (!tmpbuf) return GSS_S_FAILURE; + major = GSS_S_FAILURE; + cipher_in.data = (__u8 *) (khdr + 1); cipher_in.len = bodysize; plain_out.data = tmpbuf; @@ -1156,26 +1543,26 @@ __u32 gss_unwrap_kerberos(struct gss_ctx *gctx, cksum.len = ke->ke_hash_size; if (krb5_make_checksum(ENCTYPE_ARCFOUR_HMAC, &kctx->kc_keyi, - NULL, 1, &cksum, &arc4_keye)) { + NULL, 1, &cksum, 0, NULL, &arc4_keye)) { CERROR("failed to obtain arc4 enc key\n"); - GOTO(arc4_out, enc_rc = -EACCES); + GOTO(arc4_out, rc = -EACCES); } arc4_tfm = ll_crypto_alloc_blkcipher("ecb(arc4)", 0, 0); if (arc4_tfm == NULL) { CERROR("failed to alloc tfm arc4 in ECB mode\n"); - GOTO(arc4_out_key, enc_rc = -EACCES); + GOTO(arc4_out_key, rc = -EACCES); } if (ll_crypto_blkcipher_setkey(arc4_tfm, arc4_keye.data, arc4_keye.len)) { CERROR("failed to set arc4 key, 
len %d\n", arc4_keye.len); - GOTO(arc4_out_tfm, enc_rc = -EACCES); + GOTO(arc4_out_tfm, rc = -EACCES); } - enc_rc = krb5_encrypt_rawobjs(arc4_tfm, 1, - 1, &cipher_in, &plain_out, 0); + rc = krb5_encrypt_rawobjs(arc4_tfm, 1, + 1, &cipher_in, &plain_out, 0); arc4_out_tfm: ll_crypto_free_blkcipher(arc4_tfm); arc4_out_key: @@ -1183,11 +1570,11 @@ arc4_out_key: arc4_out: cksum = RAWOBJ_EMPTY; } else { - enc_rc = krb5_encrypt_rawobjs(kctx->kc_keye.kb_tfm, 0, - 1, &cipher_in, &plain_out, 0); + rc = krb5_encrypt_rawobjs(kctx->kc_keye.kb_tfm, 0, + 1, &cipher_in, &plain_out, 0); } - if (enc_rc != 0) { + if (rc != 0) { CERROR("error decrypt\n"); goto out_free; } @@ -1215,46 +1602,119 @@ arc4_out: hash_objs[0].data = plain_out.data; hash_objs[1].len = gsshdr->len; hash_objs[1].data = gsshdr->data; - hash_objs[2].len = plain_out.len - ke->ke_conf_size; + hash_objs[2].len = plain_out.len - ke->ke_conf_size - sizeof(*khdr); hash_objs[2].data = plain_out.data + ke->ke_conf_size; if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, - khdr, 3, hash_objs, &cksum)) + khdr, 3, hash_objs, 0, NULL, &cksum)) goto out_free; LASSERT(cksum.len >= ke->ke_hash_size); if (memcmp((char *)(khdr + 1) + bodysize, cksum.data + cksum.len - ke->ke_hash_size, ke->ke_hash_size)) { - CERROR("cksum mismatch\n"); + CERROR("checksum mismatch\n"); goto out_free; } msg->len = bodysize - ke->ke_conf_size - sizeof(*khdr); memcpy(msg->data, tmpbuf + ke->ke_conf_size, msg->len); - rc = GSS_S_COMPLETE; + major = GSS_S_COMPLETE; out_free: OBD_FREE(tmpbuf, bodysize); rawobj_free(&cksum); - return rc; + return major; } static -__u32 gss_plain_encrypt_kerberos(struct gss_ctx *ctx, - int decrypt, - int length, - void *in_buf, - void *out_buf) +__u32 gss_unwrap_bulk_kerberos(struct gss_ctx *gctx, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token) { - struct krb5_ctx *kctx = ctx->internal_ctx_id; - __u32 rc; + struct krb5_ctx *kctx = gctx->internal_ctx_id; + struct krb5_enctype *ke = &enctypes[kctx->kc_enctype]; + 
struct krb5_header *khdr; + int blocksize; + rawobj_t cksum = RAWOBJ_EMPTY; + rawobj_t cipher, plain; + rawobj_t data_desc[1]; + int rc; + __u32 major; + + LASSERT(ke); + + if (token->len < sizeof(*khdr)) { + CERROR("short signature: %u\n", token->len); + return GSS_S_DEFECTIVE_TOKEN; + } + + khdr = (struct krb5_header *) token->data; + + major = verify_krb5_header(kctx, khdr, 1); + if (major != GSS_S_COMPLETE) { + CERROR("bad krb5 header\n"); + return major; + } + + /* block size */ + if (kctx->kc_enctype == ENCTYPE_ARCFOUR_HMAC) { + LASSERT(kctx->kc_keye.kb_tfm == NULL); + blocksize = 1; + LBUG(); + } else { + LASSERT(kctx->kc_keye.kb_tfm); + blocksize = ll_crypto_blkcipher_blocksize(kctx->kc_keye.kb_tfm); + } + LASSERT(sizeof(*khdr) >= blocksize && sizeof(*khdr) % blocksize == 0); + + /* + * token format is expected as: + * ----------------------------------------------- + * | krb5 header | head/tail cipher text | cksum | + * ----------------------------------------------- + */ + if (token->len < sizeof(*khdr) + blocksize + sizeof(*khdr) + + ke->ke_hash_size) { + CERROR("short token size: %u\n", token->len); + return GSS_S_DEFECTIVE_TOKEN; + } + + cipher.data = (__u8 *) (khdr + 1); + cipher.len = blocksize + sizeof(*khdr); + plain.data = cipher.data; + plain.len = cipher.len; - rc = krb5_encrypt(kctx->kc_keye.kb_tfm, decrypt, - NULL, in_buf, out_buf, length); + rc = krb5_decrypt_bulk(kctx->kc_keye.kb_tfm, khdr, + desc, &cipher, &plain); if (rc) - CERROR("plain encrypt error: %d\n", rc); + return GSS_S_DEFECTIVE_TOKEN; + + /* + * verify checksum, compose clear text as layout: + * ------------------------------------------ + * | confounder | clear pages | krb5 header | + * ------------------------------------------ + */ + data_desc[0].data = plain.data; + data_desc[0].len = blocksize; + + if (krb5_make_checksum(kctx->kc_enctype, &kctx->kc_keyi, + khdr, 1, data_desc, + desc->bd_iov_count, desc->bd_iov, + &cksum)) + return GSS_S_FAILURE; + LASSERT(cksum.len >= 
ke->ke_hash_size); + + if (memcmp(plain.data + blocksize + sizeof(*khdr), + cksum.data + cksum.len - ke->ke_hash_size, + ke->ke_hash_size)) { + CERROR("checksum mismatch\n"); + rawobj_free(&cksum); + return GSS_S_BAD_SIG; + } - return rc; + rawobj_free(&cksum); + return GSS_S_COMPLETE; } int gss_display_kerberos(struct gss_ctx *ctx, @@ -1277,7 +1737,9 @@ static struct gss_api_ops gss_kerberos_ops = { .gss_verify_mic = gss_verify_mic_kerberos, .gss_wrap = gss_wrap_kerberos, .gss_unwrap = gss_unwrap_kerberos, - .gss_plain_encrypt = gss_plain_encrypt_kerberos, + .gss_prep_bulk = gss_prep_bulk_kerberos, + .gss_wrap_bulk = gss_wrap_bulk_kerberos, + .gss_unwrap_bulk = gss_unwrap_bulk_kerberos, .gss_delete_sec_context = gss_delete_sec_context_kerberos, .gss_display = gss_display_kerberos, }; diff --git a/lustre/ptlrpc/gss/gss_mech_switch.c b/lustre/ptlrpc/gss/gss_mech_switch.c index 8a4e627..ca55fe8 100644 --- a/lustre/ptlrpc/gss/gss_mech_switch.c +++ b/lustre/ptlrpc/gss/gss_mech_switch.c @@ -214,6 +214,8 @@ __u32 lgss_inquire_context(struct gss_ctx *context_handle, __u32 lgss_get_mic(struct gss_ctx *context_handle, int msgcnt, rawobj_t *msg, + int iovcnt, + lnet_kiov_t *iovs, rawobj_t *mic_token) { LASSERT(context_handle); @@ -225,6 +227,8 @@ __u32 lgss_get_mic(struct gss_ctx *context_handle, ->gss_get_mic(context_handle, msgcnt, msg, + iovcnt, + iovs, mic_token); } @@ -232,6 +236,8 @@ __u32 lgss_get_mic(struct gss_ctx *context_handle, __u32 lgss_verify_mic(struct gss_ctx *context_handle, int msgcnt, rawobj_t *msg, + int iovcnt, + lnet_kiov_t *iovs, rawobj_t *mic_token) { LASSERT(context_handle); @@ -243,6 +249,8 @@ __u32 lgss_verify_mic(struct gss_ctx *context_handle, ->gss_verify_mic(context_handle, msgcnt, msg, + iovcnt, + iovs, mic_token); } @@ -276,19 +284,43 @@ __u32 lgss_unwrap(struct gss_ctx *context_handle, } -__u32 lgss_plain_encrypt(struct gss_ctx *ctx, - int decrypt, - int length, - void *in_buf, - void *out_buf) +__u32 lgss_prep_bulk(struct gss_ctx 
*context_handle, + struct ptlrpc_bulk_desc *desc) { - LASSERT(ctx); - LASSERT(ctx->mech_type); - LASSERT(ctx->mech_type->gm_ops); - LASSERT(ctx->mech_type->gm_ops->gss_plain_encrypt); + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_prep_bulk); - return ctx->mech_type->gm_ops - ->gss_plain_encrypt(ctx, decrypt, length, in_buf, out_buf); + return context_handle->mech_type->gm_ops + ->gss_prep_bulk(context_handle, desc); +} + +__u32 lgss_wrap_bulk(struct gss_ctx *context_handle, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token, + int adj_nob) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_wrap_bulk); + + return context_handle->mech_type->gm_ops + ->gss_wrap_bulk(context_handle, desc, token, adj_nob); +} + +__u32 lgss_unwrap_bulk(struct gss_ctx *context_handle, + struct ptlrpc_bulk_desc *desc, + rawobj_t *token) +{ + LASSERT(context_handle); + LASSERT(context_handle->mech_type); + LASSERT(context_handle->mech_type->gm_ops); + LASSERT(context_handle->mech_type->gm_ops->gss_unwrap_bulk); + + return context_handle->mech_type->gm_ops + ->gss_unwrap_bulk(context_handle, desc, token); } /* gss_delete_sec_context: free all resources associated with context_handle. 
diff --git a/lustre/ptlrpc/gss/sec_gss.c b/lustre/ptlrpc/gss/sec_gss.c index f3aae3f..9b531f2 100644 --- a/lustre/ptlrpc/gss/sec_gss.c +++ b/lustre/ptlrpc/gss/sec_gss.c @@ -182,7 +182,7 @@ static int gss_sign_msg(struct lustre_msg *msg, rawobj_t *handle) { struct gss_header *ghdr; - rawobj_t text[3], mic; + rawobj_t text[4], mic; int textcnt, max_textcnt, mic_idx; __u32 major; @@ -223,7 +223,7 @@ static int gss_sign_msg(struct lustre_msg *msg, mic.len = msg->lm_buflens[mic_idx]; mic.data = lustre_msg_buf(msg, mic_idx, 0); - major = lgss_get_mic(mechctx, textcnt, text, &mic); + major = lgss_get_mic(mechctx, textcnt, text, 0, NULL, &mic); if (major != GSS_S_COMPLETE) { CERROR("fail to generate MIC: %08x\n", major); return -EPERM; @@ -241,7 +241,7 @@ __u32 gss_verify_msg(struct lustre_msg *msg, struct gss_ctx *mechctx, __u32 svc) { - rawobj_t text[3], mic; + rawobj_t text[4], mic; int textcnt, max_textcnt; int mic_idx; __u32 major; @@ -262,7 +262,7 @@ __u32 gss_verify_msg(struct lustre_msg *msg, mic.len = msg->lm_buflens[mic_idx]; mic.data = lustre_msg_buf(msg, mic_idx, 0); - major = lgss_verify_mic(mechctx, textcnt, text, &mic); + major = lgss_verify_mic(mechctx, textcnt, text, 0, NULL, &mic); if (major != GSS_S_COMPLETE) CERROR("mic verify error: %08x\n", major); @@ -584,6 +584,33 @@ static inline int gss_cli_payload(struct ptlrpc_cli_ctx *ctx, return gss_mech_payload(NULL, msgsize, privacy); } +static int gss_cli_bulk_payload(struct ptlrpc_cli_ctx *ctx, + struct sptlrpc_flavor *flvr, + int reply, int read) +{ + int payload = sizeof(struct ptlrpc_bulk_sec_desc); + + LASSERT(SPTLRPC_FLVR_BULK_TYPE(flvr->sf_rpc) == SPTLRPC_BULK_DEFAULT); + + if ((!reply && !read) || (reply && read)) { + switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) { + case SPTLRPC_BULK_SVC_NULL: + break; + case SPTLRPC_BULK_SVC_INTG: + payload += gss_cli_payload(ctx, 0, 0); + break; + case SPTLRPC_BULK_SVC_PRIV: + payload += gss_cli_payload(ctx, 0, 1); + break; + case SPTLRPC_BULK_SVC_AUTH: + default: 
+ LBUG(); + } + } + + return payload; +} + int gss_cli_ctx_match(struct ptlrpc_cli_ctx *ctx, struct vfs_cred *vcred) { return (ctx->cc_vcred.vc_uid == vcred->vc_uid); @@ -627,7 +654,7 @@ int gss_cli_ctx_sign(struct ptlrpc_cli_ctx *ctx, if (req->rq_ctx_init) RETURN(0); - svc = RPC_FLVR_SVC(req->rq_flvr.sf_rpc); + svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc); if (req->rq_pack_bulk) flags |= LUSTRE_GSS_PACK_BULK; if (req->rq_pack_udesc) @@ -798,8 +825,10 @@ int gss_cli_ctx_verify(struct ptlrpc_cli_ctx *ctx, gss_header_swabber(ghdr); major = gss_verify_msg(msg, gctx->gc_mechctx, reqhdr->gh_svc); - if (major != GSS_S_COMPLETE) + if (major != GSS_S_COMPLETE) { + CERROR("failed to verify reply: %x\n", major); RETURN(-EPERM); + } if (req->rq_early && reqhdr->gh_svc == SPTLRPC_SVC_NULL) { __u32 cksum; @@ -996,6 +1025,7 @@ int gss_cli_ctx_unseal(struct ptlrpc_cli_ctx *ctx, major = gss_unseal_msg(gctx->gc_mechctx, msg, &msglen, req->rq_repdata_len); if (major != GSS_S_COMPLETE) { + CERROR("failed to unwrap reply: %x\n", major); rc = -EPERM; break; } @@ -1018,7 +1048,7 @@ int gss_cli_ctx_unseal(struct ptlrpc_cli_ctx *ctx, } /* bulk checksum is the last segment */ - if (bulk_sec_desc_unpack(msg, msg->lm_bufcount-1)) + if (bulk_sec_desc_unpack(msg, msg->lm_bufcount - 1)) RETURN(-EPROTO); } @@ -1067,12 +1097,13 @@ int gss_sec_create_common(struct gss_sec *gsec, struct ptlrpc_sec *sec; LASSERT(imp); - LASSERT(RPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_GSS); + LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_GSS); - gsec->gs_mech = lgss_subflavor_to_mech(RPC_FLVR_SUB(sf->sf_rpc)); + gsec->gs_mech = lgss_subflavor_to_mech( + SPTLRPC_FLVR_BASE_SUB(sf->sf_rpc)); if (!gsec->gs_mech) { CERROR("gss backend 0x%x not found\n", - RPC_FLVR_SUB(sf->sf_rpc)); + SPTLRPC_FLVR_BASE_SUB(sf->sf_rpc)); return -EOPNOTSUPP; } @@ -1099,8 +1130,7 @@ int gss_sec_create_common(struct gss_sec *gsec, sec->ps_gc_interval = 0; } - if (sec->ps_flvr.sf_bulk_ciph != BULK_CIPH_ALG_NULL && - 
sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_BULK) + if (SPTLRPC_FLVR_BULK_SVC(sec->ps_flvr.sf_rpc) == SPTLRPC_BULK_SVC_PRIV) sptlrpc_enc_pool_add_user(); CDEBUG(D_SEC, "create %s%s@%p\n", (svcctx ? "reverse " : ""), @@ -1124,8 +1154,7 @@ void gss_sec_destroy_common(struct gss_sec *gsec) class_import_put(sec->ps_import); - if (sec->ps_flvr.sf_bulk_ciph != BULK_CIPH_ALG_NULL && - sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_BULK) + if (SPTLRPC_FLVR_BULK_SVC(sec->ps_flvr.sf_rpc) == SPTLRPC_BULK_SVC_PRIV) sptlrpc_enc_pool_del_user(); EXIT; @@ -1247,9 +1276,9 @@ int gss_alloc_reqbuf_intg(struct ptlrpc_sec *sec, } if (req->rq_pack_bulk) { - buflens[bufcnt] = bulk_sec_desc_size( - req->rq_flvr.sf_bulk_hash, 1, - req->rq_bulk_read); + buflens[bufcnt] = gss_cli_bulk_payload(req->rq_cli_ctx, + &req->rq_flvr, + 0, req->rq_bulk_read); if (svc == SPTLRPC_SVC_INTG) txtsize += buflens[bufcnt]; bufcnt++; @@ -1313,9 +1342,9 @@ int gss_alloc_reqbuf_priv(struct ptlrpc_sec *sec, if (req->rq_pack_udesc) ibuflens[ibufcnt++] = sptlrpc_current_user_desc_size(); if (req->rq_pack_bulk) - ibuflens[ibufcnt++] = bulk_sec_desc_size( - req->rq_flvr.sf_bulk_hash, 1, - req->rq_bulk_read); + ibuflens[ibufcnt++] = gss_cli_bulk_payload(req->rq_cli_ctx, + &req->rq_flvr, 0, + req->rq_bulk_read); clearsize = lustre_msg_size_v2(ibufcnt, ibuflens); /* to allow append padding during encryption */ @@ -1375,7 +1404,7 @@ int gss_alloc_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req, int msgsize) { - int svc = RPC_FLVR_SVC(req->rq_flvr.sf_rpc); + int svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc); LASSERT(!req->rq_pack_bulk || (req->rq_bulk_read || req->rq_bulk_write)); @@ -1400,7 +1429,7 @@ void gss_free_reqbuf(struct ptlrpc_sec *sec, ENTRY; LASSERT(!req->rq_pool || req->rq_reqbuf); - privacy = RPC_FLVR_SVC(req->rq_flvr.sf_rpc) == SPTLRPC_SVC_PRIV; + privacy = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc) == SPTLRPC_SVC_PRIV; if (!req->rq_clrbuf) goto release_reqbuf; @@ -1477,9 +1506,9 @@ int gss_alloc_repbuf_intg(struct 
ptlrpc_sec *sec, txtsize += buflens[1]; if (req->rq_pack_bulk) { - buflens[bufcnt] = bulk_sec_desc_size( - req->rq_flvr.sf_bulk_hash, 0, - req->rq_bulk_read); + buflens[bufcnt] = gss_cli_bulk_payload(req->rq_cli_ctx, + &req->rq_flvr, + 1, req->rq_bulk_read); if (svc == SPTLRPC_SVC_INTG) txtsize += buflens[bufcnt]; bufcnt++; @@ -1513,9 +1542,9 @@ int gss_alloc_repbuf_priv(struct ptlrpc_sec *sec, buflens[0] = msgsize; if (req->rq_pack_bulk) - buflens[bufcnt++] = bulk_sec_desc_size( - req->rq_flvr.sf_bulk_hash, 0, - req->rq_bulk_read); + buflens[bufcnt++] = gss_cli_bulk_payload(req->rq_cli_ctx, + &req->rq_flvr, + 1, req->rq_bulk_read); txtsize = lustre_msg_size_v2(bufcnt, buflens); txtsize += GSS_MAX_CIPHER_BLOCK; @@ -1535,7 +1564,7 @@ int gss_alloc_repbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req, int msgsize) { - int svc = RPC_FLVR_SVC(req->rq_flvr.sf_rpc); + int svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc); ENTRY; LASSERT(!req->rq_pack_bulk || @@ -1771,7 +1800,7 @@ int gss_enlarge_reqbuf(struct ptlrpc_sec *sec, struct ptlrpc_request *req, int segment, int newsize) { - int svc = RPC_FLVR_SVC(req->rq_flvr.sf_rpc); + int svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc); LASSERT(!req->rq_ctx_init && !req->rq_ctx_fini); @@ -2066,8 +2095,10 @@ int gss_svc_verify_request(struct ptlrpc_request *req, } *major = gss_verify_msg(msg, gctx->gsc_mechctx, gw->gw_svc); - if (*major != GSS_S_COMPLETE) + if (*major != GSS_S_COMPLETE) { + CERROR("failed to verify request: %x\n", *major); RETURN(-EACCES); + } if (gctx->gsc_reverse == 0 && gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 1)) { @@ -2094,10 +2125,10 @@ verified: offset++; } - /* check bulk cksum data */ + /* check bulk_sec_desc data */ if (gw->gw_flags & LUSTRE_GSS_PACK_BULK) { if (msg->lm_bufcount < (offset + 1)) { - CERROR("no bulk checksum included\n"); + CERROR("missing bulk sec descriptor\n"); RETURN(-EINVAL); } @@ -2133,8 +2164,10 @@ int gss_svc_unseal_request(struct ptlrpc_request *req, *major = 
gss_unseal_msg(gctx->gsc_mechctx, msg, &msglen, req->rq_reqdata_len); - if (*major != GSS_S_COMPLETE) + if (*major != GSS_S_COMPLETE) { + CERROR("failed to unwrap request: %x\n", *major); RETURN(-EACCES); + } if (gss_check_seq_num(&gctx->gsc_seqdata, gw->gw_seq, 1)) { CERROR("phase 1+: discard replayed req: seq %u\n", gw->gw_seq); @@ -2405,6 +2438,31 @@ int gss_svc_payload(struct gss_svc_reqctx *grctx, int early, return gss_mech_payload(NULL, msgsize, privacy); } +static int gss_svc_bulk_payload(struct gss_svc_ctx *gctx, + struct sptlrpc_flavor *flvr, + int read) +{ + int payload = sizeof(struct ptlrpc_bulk_sec_desc); + + if (read) { + switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) { + case SPTLRPC_BULK_SVC_NULL: + break; + case SPTLRPC_BULK_SVC_INTG: + payload += gss_mech_payload(NULL, 0, 0); + break; + case SPTLRPC_BULK_SVC_PRIV: + payload += gss_mech_payload(NULL, 0, 1); + break; + case SPTLRPC_BULK_SVC_AUTH: + default: + LBUG(); + } + } + + return payload; +} + int gss_svc_alloc_rs(struct ptlrpc_request *req, int msglen) { struct gss_svc_reqctx *grctx; @@ -2422,7 +2480,7 @@ int gss_svc_alloc_rs(struct ptlrpc_request *req, int msglen) RETURN(-EPROTO); } - svc = RPC_FLVR_SVC(req->rq_flvr.sf_rpc); + svc = SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc); early = (req->rq_packed_final == 0); grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); @@ -2440,9 +2498,10 @@ int gss_svc_alloc_rs(struct ptlrpc_request *req, int msglen) LASSERT(grctx->src_reqbsd); bsd_off = ibufcnt; - ibuflens[ibufcnt++] = bulk_sec_desc_size( - grctx->src_reqbsd->bsd_hash_alg, - 0, req->rq_bulk_read); + ibuflens[ibufcnt++] = gss_svc_bulk_payload( + grctx->src_ctx, + &req->rq_flvr, + req->rq_bulk_read); } txtsize = lustre_msg_size_v2(ibufcnt, ibuflens); @@ -2465,9 +2524,10 @@ int gss_svc_alloc_rs(struct ptlrpc_request *req, int msglen) LASSERT(grctx->src_reqbsd); bsd_off = bufcnt; - buflens[bufcnt] = bulk_sec_desc_size( - grctx->src_reqbsd->bsd_hash_alg, - 0, req->rq_bulk_read); + buflens[bufcnt] = 
gss_svc_bulk_payload( + grctx->src_ctx, + &req->rq_flvr, + req->rq_bulk_read); if (svc == SPTLRPC_SVC_INTG) txtsize += buflens[bufcnt]; bufcnt++; diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 3b22441..65eedd1 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -351,7 +351,7 @@ out: sptlrpc_import_flush_all_ctx(imp); atomic_dec(&imp->imp_inval_count); - cfs_waitq_signal(&imp->imp_recovery_waitq); + cfs_waitq_broadcast(&imp->imp_recovery_waitq); } /* unset imp_invalid */ @@ -810,14 +810,7 @@ static int ptlrpc_connect_interpret(const struct lu_env *env, IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS); } else { IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL); - } - - spin_lock(&imp->imp_lock); - if (imp->imp_invalid) { - spin_unlock(&imp->imp_lock); ptlrpc_activate_import(imp); - } else { - spin_unlock(&imp->imp_lock); } GOTO(finish, rc = 0); @@ -1146,7 +1139,7 @@ out: imp->imp_last_recon = 0; spin_unlock(&imp->imp_lock); - cfs_waitq_signal(&imp->imp_recovery_waitq); + cfs_waitq_broadcast(&imp->imp_recovery_waitq); RETURN(rc); } @@ -1326,7 +1319,7 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) } if (imp->imp_state == LUSTRE_IMP_FULL) { - cfs_waitq_signal(&imp->imp_recovery_waitq); + cfs_waitq_broadcast(&imp->imp_recovery_waitq); ptlrpc_wake_delayed(imp); } diff --git a/lustre/ptlrpc/lproc_ptlrpc.c b/lustre/ptlrpc/lproc_ptlrpc.c index 5c3aaca..7303382 100644 --- a/lustre/ptlrpc/lproc_ptlrpc.c +++ b/lustre/ptlrpc/lproc_ptlrpc.c @@ -117,13 +117,13 @@ struct ll_rpc_opcode { { LLOG_CATINFO, "llog_catinfo" }, { LLOG_ORIGIN_HANDLE_PREV_BLOCK, "llog_origin_handle_prev_block" }, { LLOG_ORIGIN_HANDLE_DESTROY, "llog_origin_handle_destroy" }, - { FLD_QUERY, "fld_query" }, + { QUOTA_DQACQ, "quota_acquire" }, + { QUOTA_DQREL, "quota_release" }, { SEQ_QUERY, "seq_query" }, { SEC_CTX_INIT, "sec_ctx_init" }, { SEC_CTX_INIT_CONT,"sec_ctx_init_cont" }, { SEC_CTX_FINI, "sec_ctx_fini" }, - { QUOTA_DQACQ, "quota_acquire" }, - { QUOTA_DQREL, 
"quota_release" } + { FLD_QUERY, "fld_query" } }; struct ll_eopcode { diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index 11f6641..357e559 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -529,6 +529,9 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) lustre_msghdr_set_flags(request->rq_reqmsg, request->rq_import->imp_msghdr_flags); + if (request->rq_resend) + lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT); + rc = sptlrpc_cli_wrap_request(request); if (rc) RETURN(rc); @@ -540,9 +543,6 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) RETURN(rc); } - if (request->rq_resend) - lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT); - if (!noreply) { LASSERT (request->rq_replen != 0); if (request->rq_repbuf == NULL) { diff --git a/lustre/ptlrpc/pack_generic.c b/lustre/ptlrpc/pack_generic.c index 1f41109..e85951a 100644 --- a/lustre/ptlrpc/pack_generic.c +++ b/lustre/ptlrpc/pack_generic.c @@ -389,8 +389,9 @@ void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, int n, int min_size) buflen = m->lm_buflens[n]; if (unlikely(buflen < min_size)) { - CERROR("msg %p buffer[%d] size %d too small (required %d)\n", - m, n, buflen, min_size); + CERROR("msg %p buffer[%d] size %d too small " + "(required %d, opc=%d)\n", + m, n, buflen, min_size, lustre_msg_get_opc(m)); return NULL; } @@ -1951,14 +1952,26 @@ void lustre_swab_lov_desc (struct lov_desc *ld) /* uuid endian insensitive */ } -/*begin adding MDT by huanghua@clusterfs.com*/ void lustre_swab_lmv_desc (struct lmv_desc *ld) { __swab32s (&ld->ld_tgt_count); __swab32s (&ld->ld_active_tgt_count); + __swab32s (&ld->ld_default_stripe_count); + __swab32s (&ld->ld_pattern); + __swab64s (&ld->ld_default_hash_size); + __swab32s (&ld->ld_qos_maxage); /* uuid endian insensitive */ } +void lustre_swab_lmv_stripe_md (struct lmv_stripe_md *mea) +{ + __swab32s(&mea->mea_magic); + __swab32s(&mea->mea_count); + __swab32s(&mea->mea_master); + CLASSERT(offsetof(typeof(*mea), 
mea_padding) != 0); +} + + static void print_lum (struct lov_user_md *lum) { CDEBUG(D_OTHER, "lov_user_md %p:\n", lum); @@ -2014,6 +2027,19 @@ void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum) EXIT; } +void lustre_swab_lov_mds_md(struct lov_mds_md *lmm) +{ + ENTRY; + CDEBUG(D_IOCTL, "swabbing lov_mds_md\n"); + __swab32s(&lmm->lmm_magic); + __swab32s(&lmm->lmm_pattern); + __swab64s(&lmm->lmm_object_id); + __swab64s(&lmm->lmm_object_gr); + __swab32s(&lmm->lmm_stripe_size); + __swab32s(&lmm->lmm_stripe_count); + EXIT; +} + void lustre_swab_lov_user_md_join(struct lov_user_md_join *lumj) { ENTRY; diff --git a/lustre/ptlrpc/pers.c b/lustre/ptlrpc/pers.c index d53d42c..1b5f1ed 100644 --- a/lustre/ptlrpc/pers.c +++ b/lustre/ptlrpc/pers.c @@ -57,8 +57,11 @@ void ptlrpc_fill_bulk_md (lnet_md_t *md, struct ptlrpc_bulk_desc *desc) LASSERT (!(md->options & (LNET_MD_IOVEC | LNET_MD_KIOV | LNET_MD_PHYS))); md->options |= LNET_MD_KIOV; - md->start = &desc->bd_iov[0]; md->length = desc->bd_iov_count; + if (desc->bd_enc_iov) + md->start = desc->bd_enc_iov; + else + md->start = desc->bd_iov; } void ptlrpc_add_bulk_page(struct ptlrpc_bulk_desc *desc, cfs_page_t *page, diff --git a/lustre/ptlrpc/ptlrpc_module.c b/lustre/ptlrpc/ptlrpc_module.c index e4f0a0e..c097f65 100644 --- a/lustre/ptlrpc/ptlrpc_module.c +++ b/lustre/ptlrpc/ptlrpc_module.c @@ -264,6 +264,7 @@ EXPORT_SYMBOL(lustre_swab_lov_user_md_v1); EXPORT_SYMBOL(lustre_swab_lov_user_md_v3); EXPORT_SYMBOL(lustre_swab_lov_user_md_objects); EXPORT_SYMBOL(lustre_swab_lov_user_md_join); +EXPORT_SYMBOL(lustre_swab_lov_mds_md); EXPORT_SYMBOL(lustre_swab_ldlm_res_id); EXPORT_SYMBOL(lustre_swab_ldlm_policy_data); EXPORT_SYMBOL(lustre_swab_ldlm_intent); diff --git a/lustre/ptlrpc/recov_thread.c b/lustre/ptlrpc/recov_thread.c index dc4b13b..e90142b 100644 --- a/lustre/ptlrpc/recov_thread.c +++ b/lustre/ptlrpc/recov_thread.c @@ -591,6 +591,7 @@ int llog_obd_repl_cancel(struct llog_ctxt *ctxt, mutex_down(&ctxt->loc_sem); lcm = 
ctxt->loc_lcm; + CDEBUG(D_INFO, "cancel on lsm %p\n", lcm); /* * Let's check if we have all structures alive. We also check for diff --git a/lustre/ptlrpc/sec.c b/lustre/ptlrpc/sec.c index d268380..69e618f 100644 --- a/lustre/ptlrpc/sec.c +++ b/lustre/ptlrpc/sec.c @@ -118,12 +118,13 @@ int sptlrpc_unregister_policy(struct ptlrpc_sec_policy *policy) EXPORT_SYMBOL(sptlrpc_unregister_policy); static -struct ptlrpc_sec_policy * sptlrpc_rpcflavor2policy(__u16 flavor) +struct ptlrpc_sec_policy * sptlrpc_wireflavor2policy(__u32 flavor) { static DECLARE_MUTEX(load_mutex); static atomic_t loaded = ATOMIC_INIT(0); struct ptlrpc_sec_policy *policy; - __u16 number = RPC_FLVR_POLICY(flavor), flag = 0; + __u16 number = SPTLRPC_FLVR_POLICY(flavor); + __u16 flag = 0; if (number >= SPTLRPC_POLICY_MAX) return NULL; @@ -157,7 +158,7 @@ struct ptlrpc_sec_policy * sptlrpc_rpcflavor2policy(__u16 flavor) return policy; } -__u16 sptlrpc_name2rpcflavor(const char *name) +__u32 sptlrpc_name2flavor_base(const char *name) { if (!strcmp(name, "null")) return SPTLRPC_FLVR_NULL; @@ -174,51 +175,86 @@ __u16 sptlrpc_name2rpcflavor(const char *name) return SPTLRPC_FLVR_INVALID; } -EXPORT_SYMBOL(sptlrpc_name2rpcflavor); +EXPORT_SYMBOL(sptlrpc_name2flavor_base); -const char *sptlrpc_rpcflavor2name(__u16 flavor) +const char *sptlrpc_flavor2name_base(__u32 flvr) { - switch (flavor) { - case SPTLRPC_FLVR_NULL: + __u32 base = SPTLRPC_FLVR_BASE(flvr); + + if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_NULL)) return "null"; - case SPTLRPC_FLVR_PLAIN: + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_PLAIN)) return "plain"; - case SPTLRPC_FLVR_KRB5N: + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5N)) return "krb5n"; - case SPTLRPC_FLVR_KRB5A: + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5A)) return "krb5a"; - case SPTLRPC_FLVR_KRB5I: + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5I)) return "krb5i"; - case SPTLRPC_FLVR_KRB5P: + else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5P)) return 
"krb5p"; - default: - CERROR("invalid rpc flavor 0x%x(p%u,s%u,v%u)\n", flavor, - RPC_FLVR_POLICY(flavor), RPC_FLVR_MECH(flavor), - RPC_FLVR_SVC(flavor)); - } - return "unknown"; + + CERROR("invalid wire flavor 0x%x\n", flvr); + return "invalid"; } -EXPORT_SYMBOL(sptlrpc_rpcflavor2name); +EXPORT_SYMBOL(sptlrpc_flavor2name_base); -int sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize) +char *sptlrpc_flavor2name_bulk(struct sptlrpc_flavor *sf, + char *buf, int bufsize) { - char *bulk; - - if (sf->sf_bulk_ciph != BULK_CIPH_ALG_NULL) - bulk = "bulkp"; - else if (sf->sf_bulk_hash != BULK_HASH_ALG_NULL) - bulk = "bulki"; + if (SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN) + snprintf(buf, bufsize, "hash:%s", + sptlrpc_get_hash_name(sf->u_bulk.hash.hash_alg)); else - bulk = "bulkn"; + snprintf(buf, bufsize, "%s", + sptlrpc_flavor2name_base(sf->sf_rpc)); - snprintf(buf, bufsize, "%s-%s:%s/%s", - sptlrpc_rpcflavor2name(sf->sf_rpc), bulk, - sptlrpc_get_hash_name(sf->sf_bulk_hash), - sptlrpc_get_ciph_name(sf->sf_bulk_ciph)); - return 0; + buf[bufsize - 1] = '\0'; + return buf; +} +EXPORT_SYMBOL(sptlrpc_flavor2name_bulk); + +char *sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize) +{ + snprintf(buf, bufsize, "%s", sptlrpc_flavor2name_base(sf->sf_rpc)); + + /* + * currently we don't support customized bulk specification for + * flavors other than plain + */ + if (SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN) { + char bspec[16]; + + bspec[0] = '-'; + sptlrpc_flavor2name_bulk(sf, &bspec[1], sizeof(bspec) - 1); + strncat(buf, bspec, bufsize); + } + + buf[bufsize - 1] = '\0'; + return buf; } EXPORT_SYMBOL(sptlrpc_flavor2name); +char *sptlrpc_secflags2str(__u32 flags, char *buf, int bufsize) +{ + buf[0] = '\0'; + + if (flags & PTLRPC_SEC_FL_REVERSE) + strncat(buf, "reverse,", bufsize); + if (flags & PTLRPC_SEC_FL_ROOTONLY) + strncat(buf, "rootonly,", bufsize); + if (flags & PTLRPC_SEC_FL_UDESC) + strncat(buf, "udesc,", 
bufsize); + if (flags & PTLRPC_SEC_FL_BULK) + strncat(buf, "bulk,", bufsize); + if (buf[0] == '\0') + strncat(buf, "-,", bufsize); + + buf[bufsize - 1] = '\0'; + return buf; +} +EXPORT_SYMBOL(sptlrpc_secflags2str); + /************************************************** * client context APIs * **************************************************/ @@ -752,9 +788,11 @@ void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode) /* special security flags accoding to opcode */ switch (opcode) { case OST_READ: + case MDS_READPAGE: req->rq_bulk_read = 1; break; case OST_WRITE: + case MDS_WRITEPAGE: req->rq_bulk_write = 1; break; case SEC_CTX_INIT: @@ -783,9 +821,9 @@ void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode) /* force SVC_NULL for context initiation rpc, SVC_INTG for context * destruction rpc */ if (unlikely(req->rq_ctx_init)) - rpc_flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_NULL); + flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_NULL); else if (unlikely(req->rq_ctx_fini)) - rpc_flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_INTG); + flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_INTG); /* user descriptor flag, null security can't do it anyway */ if ((sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_UDESC) && @@ -794,14 +832,13 @@ void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode) /* bulk security flag */ if ((req->rq_bulk_read || req->rq_bulk_write) && - (req->rq_flvr.sf_bulk_ciph != BULK_CIPH_ALG_NULL || - req->rq_flvr.sf_bulk_hash != BULK_HASH_ALG_NULL)) + sptlrpc_flavor_has_bulk(&req->rq_flvr)) req->rq_pack_bulk = 1; } void sptlrpc_request_out_callback(struct ptlrpc_request *req) { - if (RPC_FLVR_SVC(req->rq_flvr.sf_rpc) != SPTLRPC_SVC_PRIV) + if (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc) != SPTLRPC_SVC_PRIV) return; LASSERT(req->rq_clrbuf); @@ -885,7 +922,7 @@ int sptlrpc_cli_wrap_request(struct ptlrpc_request *req) RETURN(rc); } - switch (RPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { + switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { case 
SPTLRPC_SVC_NULL: case SPTLRPC_SVC_AUTH: case SPTLRPC_SVC_INTG: @@ -913,7 +950,7 @@ static int do_cli_unwrap_reply(struct ptlrpc_request *req) { struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; int rc; - __u16 rpc_flvr; + __u32 flvr; ENTRY; LASSERT(ctx); @@ -929,26 +966,26 @@ static int do_cli_unwrap_reply(struct ptlrpc_request *req) } /* v2 message, check request/reply policy match */ - rpc_flvr = WIRE_FLVR_RPC(req->rq_repdata->lm_secflvr); + flvr = WIRE_FLVR(req->rq_repdata->lm_secflvr); if (req->rq_repdata->lm_magic == LUSTRE_MSG_MAGIC_V2_SWABBED) - __swab16s(&rpc_flvr); + __swab32s(&flvr); - if (RPC_FLVR_POLICY(rpc_flvr) != - RPC_FLVR_POLICY(req->rq_flvr.sf_rpc)) { + if (SPTLRPC_FLVR_POLICY(flvr) != + SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc)) { CERROR("request policy was %u while reply with %u\n", - RPC_FLVR_POLICY(req->rq_flvr.sf_rpc), - RPC_FLVR_POLICY(rpc_flvr)); + SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc), + SPTLRPC_FLVR_POLICY(flvr)); RETURN(-EPROTO); } /* do nothing if it's null policy; otherwise unpack the * wrapper message */ - if (RPC_FLVR_POLICY(rpc_flvr) != SPTLRPC_POLICY_NULL && + if (SPTLRPC_FLVR_POLICY(flvr) != SPTLRPC_POLICY_NULL && lustre_unpack_msg(req->rq_repdata, req->rq_repdata_len)) RETURN(-EPROTO); - switch (RPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { + switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { case SPTLRPC_SVC_NULL: case SPTLRPC_SVC_AUTH: case SPTLRPC_SVC_INTG: @@ -1188,7 +1225,7 @@ void sptlrpc_sec_put(struct ptlrpc_sec *sec) EXPORT_SYMBOL(sptlrpc_sec_put); /* - * it's policy module responsible for taking refrence of import + * policy module is responsible for taking refrence of import */ static struct ptlrpc_sec * sptlrpc_sec_create(struct obd_import *imp, @@ -1198,6 +1235,7 @@ struct ptlrpc_sec * sptlrpc_sec_create(struct obd_import *imp, { struct ptlrpc_sec_policy *policy; struct ptlrpc_sec *sec; + char str[32]; ENTRY; if (svc_ctx) { @@ -1206,7 +1244,7 @@ struct ptlrpc_sec * sptlrpc_sec_create(struct obd_import *imp, CDEBUG(D_SEC, "%s %s: 
reverse sec using flavor %s\n", imp->imp_obd->obd_type->typ_name, imp->imp_obd->obd_name, - sptlrpc_rpcflavor2name(sf->sf_rpc)); + sptlrpc_flavor2name(sf, str, sizeof(str))); policy = sptlrpc_policy_get(svc_ctx->sc_policy); sf->sf_flags |= PTLRPC_SEC_FL_REVERSE | PTLRPC_SEC_FL_ROOTONLY; @@ -1216,9 +1254,9 @@ struct ptlrpc_sec * sptlrpc_sec_create(struct obd_import *imp, CDEBUG(D_SEC, "%s %s: select security flavor %s\n", imp->imp_obd->obd_type->typ_name, imp->imp_obd->obd_name, - sptlrpc_rpcflavor2name(sf->sf_rpc)); + sptlrpc_flavor2name(sf, str, sizeof(str))); - policy = sptlrpc_rpcflavor2policy(sf->sf_rpc); + policy = sptlrpc_wireflavor2policy(sf->sf_rpc); if (!policy) { CERROR("invalid flavor 0x%x\n", sf->sf_rpc); RETURN(NULL); @@ -1272,52 +1310,49 @@ static void sptlrpc_import_sec_install(struct obd_import *imp, } } +static inline +int flavor_equal(struct sptlrpc_flavor *sf1, struct sptlrpc_flavor *sf2) +{ + return (memcmp(sf1, sf2, sizeof(*sf1)) == 0); +} + +static inline +void flavor_copy(struct sptlrpc_flavor *dst, struct sptlrpc_flavor *src) +{ + *dst = *src; +} + static void sptlrpc_import_sec_adapt_inplace(struct obd_import *imp, struct ptlrpc_sec *sec, struct sptlrpc_flavor *sf) { - if (sf->sf_bulk_ciph != sec->ps_flvr.sf_bulk_ciph || - sf->sf_bulk_hash != sec->ps_flvr.sf_bulk_hash) { - CWARN("imp %p (%s->%s): changing bulk flavor %s/%s -> %s/%s\n", - imp, imp->imp_obd->obd_name, - obd_uuid2str(&imp->imp_connection->c_remote_uuid), - sptlrpc_get_ciph_name(sec->ps_flvr.sf_bulk_ciph), - sptlrpc_get_hash_name(sec->ps_flvr.sf_bulk_hash), - sptlrpc_get_ciph_name(sf->sf_bulk_ciph), - sptlrpc_get_hash_name(sf->sf_bulk_hash)); - - spin_lock(&sec->ps_lock); - sec->ps_flvr.sf_bulk_ciph = sf->sf_bulk_ciph; - sec->ps_flvr.sf_bulk_hash = sf->sf_bulk_hash; - spin_unlock(&sec->ps_lock); - } + char str1[32], str2[32]; - if (!equi(sf->sf_flags & PTLRPC_SEC_FL_UDESC, - sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_UDESC)) { - CWARN("imp %p (%s->%s): %s shipping user 
descriptor\n", - imp, imp->imp_obd->obd_name, - obd_uuid2str(&imp->imp_connection->c_remote_uuid), - (sf->sf_flags & PTLRPC_SEC_FL_UDESC) ? "start" : "stop"); + if (sec->ps_flvr.sf_flags != sf->sf_flags) + CWARN("changing sec flags: %s -> %s\n", + sptlrpc_secflags2str(sec->ps_flvr.sf_flags, + str1, sizeof(str1)), + sptlrpc_secflags2str(sf->sf_flags, + str2, sizeof(str2))); - spin_lock(&sec->ps_lock); - sec->ps_flvr.sf_flags &= ~PTLRPC_SEC_FL_UDESC; - sec->ps_flvr.sf_flags |= sf->sf_flags & PTLRPC_SEC_FL_UDESC; - spin_unlock(&sec->ps_lock); - } + spin_lock(&sec->ps_lock); + flavor_copy(&sec->ps_flvr, sf); + spin_unlock(&sec->ps_lock); } /* - * for normal import, @svc_ctx should be NULL and @rpc_flavor is ignored; - * for reverse import, @svc_ctx and @rpc_flavor is from incoming request. + * for normal import, @svc_ctx should be NULL and @flvr is ignored; + * for reverse import, @svc_ctx and @flvr is from incoming request. */ int sptlrpc_import_sec_adapt(struct obd_import *imp, struct ptlrpc_svc_ctx *svc_ctx, - __u16 rpc_flavor) + struct sptlrpc_flavor *flvr) { struct ptlrpc_connection *conn; struct sptlrpc_flavor sf; struct ptlrpc_sec *sec, *newsec; enum lustre_sec_part sp; + char str[24]; int rc; might_sleep(); @@ -1344,57 +1379,45 @@ int sptlrpc_import_sec_adapt(struct obd_import *imp, sp = imp->imp_obd->u.cli.cl_sp_me; } else { /* reverse import, determine flavor from incoming reqeust */ - sf.sf_rpc = rpc_flavor; - sf.sf_bulk_ciph = BULK_CIPH_ALG_NULL; - sf.sf_bulk_hash = BULK_HASH_ALG_NULL; - sf.sf_flags = PTLRPC_SEC_FL_REVERSE | PTLRPC_SEC_FL_ROOTONLY; + sf = *flvr; + + if (sf.sf_rpc != SPTLRPC_FLVR_NULL) + sf.sf_flags = PTLRPC_SEC_FL_REVERSE | + PTLRPC_SEC_FL_ROOTONLY; sp = sptlrpc_target_sec_part(imp->imp_obd); } sec = sptlrpc_import_sec_ref(imp); if (sec) { - if (svc_ctx == NULL) { - /* normal import, only check rpc flavor, if just bulk - * flavor or flags changed, we can handle it on the fly - * without switching sec. 
*/ - if (sf.sf_rpc == sec->ps_flvr.sf_rpc) { - sptlrpc_import_sec_adapt_inplace(imp, sec, &sf); - - rc = 0; - goto out; - } - } else { - /* reverse import, do not compare bulk flavor */ - if (sf.sf_rpc == sec->ps_flvr.sf_rpc) { - rc = 0; - goto out; - } - } + char str2[24]; + + if (flavor_equal(&sf, &sec->ps_flvr)) + goto out; CWARN("%simport %p (%s%s%s): changing flavor " - "(%s, %s/%s) -> (%s, %s/%s)\n", - svc_ctx ? "reverse " : "", + "%s -> %s\n", svc_ctx ? "reverse " : "", imp, imp->imp_obd->obd_name, svc_ctx == NULL ? "->" : "<-", obd_uuid2str(&conn->c_remote_uuid), - sptlrpc_rpcflavor2name(sec->ps_flvr.sf_rpc), - sptlrpc_get_hash_name(sec->ps_flvr.sf_bulk_hash), - sptlrpc_get_ciph_name(sec->ps_flvr.sf_bulk_ciph), - sptlrpc_rpcflavor2name(sf.sf_rpc), - sptlrpc_get_hash_name(sf.sf_bulk_hash), - sptlrpc_get_ciph_name(sf.sf_bulk_ciph)); + sptlrpc_flavor2name(&sec->ps_flvr, str, sizeof(str)), + sptlrpc_flavor2name(&sf, str2, sizeof(str2))); + + if (SPTLRPC_FLVR_POLICY(sf.sf_rpc) == + SPTLRPC_FLVR_POLICY(sec->ps_flvr.sf_rpc) && + SPTLRPC_FLVR_MECH(sf.sf_rpc) == + SPTLRPC_FLVR_MECH(sec->ps_flvr.sf_rpc)) { + sptlrpc_import_sec_adapt_inplace(imp, sec, &sf); + goto out; + } } else { - CWARN("%simport %p (%s%s%s) netid %x: " - "select initial flavor (%s, %s/%s)\n", + CWARN("%simport %p (%s%s%s) netid %x: select flavor %s\n", svc_ctx == NULL ? "" : "reverse ", imp, imp->imp_obd->obd_name, svc_ctx == NULL ? 
"->" : "<-", obd_uuid2str(&conn->c_remote_uuid), LNET_NIDNET(conn->c_self), - sptlrpc_rpcflavor2name(sf.sf_rpc), - sptlrpc_get_hash_name(sf.sf_bulk_hash), - sptlrpc_get_ciph_name(sf.sf_bulk_ciph)); + sptlrpc_flavor2name(&sf, str, sizeof(str))); } mutex_down(&imp->imp_sec_mutex); @@ -1659,8 +1682,9 @@ static int flavor_allowed(struct sptlrpc_flavor *exp, return 1; if ((req->rq_ctx_init || req->rq_ctx_fini) && - RPC_FLVR_POLICY(exp->sf_rpc) == RPC_FLVR_POLICY(flvr->sf_rpc) && - RPC_FLVR_MECH(exp->sf_rpc) == RPC_FLVR_MECH(flvr->sf_rpc)) + SPTLRPC_FLVR_POLICY(exp->sf_rpc) == + SPTLRPC_FLVR_POLICY(flvr->sf_rpc) && + SPTLRPC_FLVR_MECH(exp->sf_rpc) == SPTLRPC_FLVR_MECH(flvr->sf_rpc)) return 1; return 0; @@ -1725,7 +1749,7 @@ int sptlrpc_target_export_check(struct obd_export *exp, spin_unlock(&exp->exp_lock); return sptlrpc_import_sec_adapt(exp->exp_imp_reverse, - req->rq_svc_ctx, flavor.sf_rpc); + req->rq_svc_ctx, &flavor); } /* if it equals to the current flavor, we accept it, but need to @@ -1759,7 +1783,7 @@ int sptlrpc_target_export_check(struct obd_export *exp, return sptlrpc_import_sec_adapt(exp->exp_imp_reverse, req->rq_svc_ctx, - flavor.sf_rpc); + &flavor); } else { CDEBUG(D_SEC, "exp %p (%x|%x|%x): is current flavor, " "install rvs ctx\n", exp, exp->exp_flvr.sf_rpc, @@ -1866,7 +1890,7 @@ void sptlrpc_target_update_exp_flavor(struct obd_device *obd, exp->exp_connection->c_peer.nid, &new_flvr); if (exp->exp_flvr_changed || - memcmp(&new_flvr, &exp->exp_flvr, sizeof(new_flvr))) { + !flavor_equal(&new_flvr, &exp->exp_flvr)) { exp->exp_flvr_old[1] = new_flvr; exp->exp_flvr_expire[1] = 0; exp->exp_flvr_changed = 1; @@ -1931,13 +1955,14 @@ static int sptlrpc_svc_check_from(struct ptlrpc_request *req, int svc_rc) int sptlrpc_svc_unwrap_request(struct ptlrpc_request *req) { struct ptlrpc_sec_policy *policy; - struct lustre_msg *msg = req->rq_reqbuf; - int rc; + struct lustre_msg *msg = req->rq_reqbuf; + int rc; ENTRY; LASSERT(msg); LASSERT(req->rq_reqmsg == NULL); 
LASSERT(req->rq_repmsg == NULL); + LASSERT(req->rq_svc_ctx == NULL); req->rq_sp_from = LUSTRE_SP_ANY; req->rq_auth_uid = INVALID_UID; @@ -1949,19 +1974,28 @@ int sptlrpc_svc_unwrap_request(struct ptlrpc_request *req) } /* - * v2 message. + * only expect v2 message. */ - if (msg->lm_magic == LUSTRE_MSG_MAGIC_V2) - req->rq_flvr.sf_rpc = WIRE_FLVR_RPC(msg->lm_secflvr); - else - req->rq_flvr.sf_rpc = WIRE_FLVR_RPC(__swab32(msg->lm_secflvr)); + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + req->rq_flvr.sf_rpc = WIRE_FLVR(msg->lm_secflvr); + break; + case LUSTRE_MSG_MAGIC_V2_SWABBED: + req->rq_flvr.sf_rpc = WIRE_FLVR(__swab32(msg->lm_secflvr)); + break; + default: + CERROR("invalid magic %x\n", msg->lm_magic); + RETURN(SECSVC_DROP); + } /* unpack the wrapper message if the policy is not null */ - if ((RPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) && - lustre_unpack_msg(msg, req->rq_reqdata_len)) + if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL && + lustre_unpack_msg(msg, req->rq_reqdata_len)) { + CERROR("invalid wrapper msg format\n"); RETURN(SECSVC_DROP); + } - policy = sptlrpc_rpcflavor2policy(req->rq_flvr.sf_rpc); + policy = sptlrpc_wireflavor2policy(req->rq_flvr.sf_rpc); if (!policy) { CERROR("unsupported rpc flavor %x\n", req->rq_flvr.sf_rpc); RETURN(SECSVC_DROP); @@ -1971,22 +2005,11 @@ int sptlrpc_svc_unwrap_request(struct ptlrpc_request *req) rc = policy->sp_sops->accept(req); LASSERT(req->rq_reqmsg || rc != SECSVC_OK); + LASSERT(req->rq_svc_ctx || rc == SECSVC_DROP); sptlrpc_policy_put(policy); /* sanity check for the request source */ rc = sptlrpc_svc_check_from(req, rc); - - /* FIXME move to proper place */ - if (rc == SECSVC_OK) { - __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); - - if (opc == OST_WRITE) - req->rq_bulk_write = 1; - else if (opc == OST_READ) - req->rq_bulk_read = 1; - } - - LASSERT(req->rq_svc_ctx || rc == SECSVC_DROP); RETURN(rc); } @@ -2111,11 +2134,11 @@ int sptlrpc_cli_wrap_bulk(struct 
ptlrpc_request *req, { struct ptlrpc_cli_ctx *ctx; + LASSERT(req->rq_bulk_read || req->rq_bulk_write); + if (!req->rq_pack_bulk) return 0; - LASSERT(req->rq_bulk_read || req->rq_bulk_write); - ctx = req->rq_cli_ctx; if (ctx->cc_ops->wrap_bulk) return ctx->cc_ops->wrap_bulk(ctx, req, desc); @@ -2123,79 +2146,61 @@ int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req, } EXPORT_SYMBOL(sptlrpc_cli_wrap_bulk); -static -void pga_to_bulk_desc(int nob, obd_count pg_count, struct brw_page **pga, - struct ptlrpc_bulk_desc *desc) -{ - int i; - - LASSERT(pga); - LASSERT(*pga); - - for (i = 0; i < pg_count && nob > 0; i++) { -#ifdef __KERNEL__ - desc->bd_iov[i].kiov_page = pga[i]->pg; - desc->bd_iov[i].kiov_len = pga[i]->count > nob ? - nob : pga[i]->count; - desc->bd_iov[i].kiov_offset = pga[i]->off & ~CFS_PAGE_MASK; -#else - /* FIXME currently liblustre doesn't support bulk encryption. - * if we do, check again following may not be right. */ - LASSERTF(0, "Bulk encryption not implemented for liblustre\n"); - desc->bd_iov[i].iov_base = pga[i]->pg->addr; - desc->bd_iov[i].iov_len = pga[i]->count > nob ? - nob : pga[i]->count; -#endif - - desc->bd_iov_count++; - nob -= pga[i]->count; - } -} - +/* + * return nob of actual plain text size received, or error code. 
+ */ int sptlrpc_cli_unwrap_bulk_read(struct ptlrpc_request *req, - int nob, obd_count pg_count, - struct brw_page **pga) + struct ptlrpc_bulk_desc *desc, + int nob) { - struct ptlrpc_bulk_desc *desc; - struct ptlrpc_cli_ctx *ctx; - int rc = 0; - - if (!req->rq_pack_bulk) - return 0; + struct ptlrpc_cli_ctx *ctx; + int rc; LASSERT(req->rq_bulk_read && !req->rq_bulk_write); - OBD_ALLOC(desc, offsetof(struct ptlrpc_bulk_desc, bd_iov[pg_count])); - if (desc == NULL) { - CERROR("out of memory, can't verify bulk read data\n"); - return -ENOMEM; - } - - pga_to_bulk_desc(nob, pg_count, pga, desc); + if (!req->rq_pack_bulk) + return desc->bd_nob_transferred; ctx = req->rq_cli_ctx; - if (ctx->cc_ops->unwrap_bulk) + if (ctx->cc_ops->unwrap_bulk) { rc = ctx->cc_ops->unwrap_bulk(ctx, req, desc); - - OBD_FREE(desc, offsetof(struct ptlrpc_bulk_desc, bd_iov[pg_count])); - - return rc; + if (rc < 0) + return rc; + } + return desc->bd_nob_transferred; } EXPORT_SYMBOL(sptlrpc_cli_unwrap_bulk_read); +/* + * return 0 for success or error code. + */ int sptlrpc_cli_unwrap_bulk_write(struct ptlrpc_request *req, struct ptlrpc_bulk_desc *desc) { - struct ptlrpc_cli_ctx *ctx; + struct ptlrpc_cli_ctx *ctx; + int rc; + + LASSERT(!req->rq_bulk_read && req->rq_bulk_write); if (!req->rq_pack_bulk) return 0; - LASSERT(!req->rq_bulk_read && req->rq_bulk_write); - ctx = req->rq_cli_ctx; - if (ctx->cc_ops->unwrap_bulk) - return ctx->cc_ops->unwrap_bulk(ctx, req, desc); + if (ctx->cc_ops->unwrap_bulk) { + rc = ctx->cc_ops->unwrap_bulk(ctx, req, desc); + if (rc < 0) + return rc; + } + + /* + * if everything is going right, nob should equals to nob_transferred. + * in case of privacy mode, nob_transferred needs to be adjusted. 
+ */ + if (desc->bd_nob != desc->bd_nob_transferred) { + CERROR("nob %d doesn't match transferred nob %d", + desc->bd_nob, desc->bd_nob_transferred); + return -EPROTO; + } return 0; } @@ -2206,11 +2211,11 @@ int sptlrpc_svc_wrap_bulk(struct ptlrpc_request *req, { struct ptlrpc_svc_ctx *ctx; + LASSERT(req->rq_bulk_read); + if (!req->rq_pack_bulk) return 0; - LASSERT(req->rq_bulk_read || req->rq_bulk_write); - ctx = req->rq_svc_ctx; if (ctx->sc_policy->sp_sops->wrap_bulk) return ctx->sc_policy->sp_sops->wrap_bulk(req, desc); @@ -2223,20 +2228,50 @@ int sptlrpc_svc_unwrap_bulk(struct ptlrpc_request *req, struct ptlrpc_bulk_desc *desc) { struct ptlrpc_svc_ctx *ctx; + int rc; + + LASSERT(req->rq_bulk_write); + + if (desc->bd_nob_transferred != desc->bd_nob && + SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc) != + SPTLRPC_BULK_SVC_PRIV) { + DEBUG_REQ(D_ERROR, req, "truncated bulk GET %d(%d)", + desc->bd_nob_transferred, desc->bd_nob); + return -ETIMEDOUT; + } if (!req->rq_pack_bulk) return 0; - LASSERT(req->rq_bulk_read || req->rq_bulk_write); - ctx = req->rq_svc_ctx; - if (ctx->sc_policy->sp_sops->unwrap_bulk); - return ctx->sc_policy->sp_sops->unwrap_bulk(req, desc); + if (ctx->sc_policy->sp_sops->unwrap_bulk) { + rc = ctx->sc_policy->sp_sops->unwrap_bulk(req, desc); + if (rc) + CERROR("error unwrap bulk: %d\n", rc); + } + /* return 0 to allow reply be sent */ return 0; } EXPORT_SYMBOL(sptlrpc_svc_unwrap_bulk); +int sptlrpc_svc_prep_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc) +{ + struct ptlrpc_svc_ctx *ctx; + + LASSERT(req->rq_bulk_write); + + if (!req->rq_pack_bulk) + return 0; + + ctx = req->rq_svc_ctx; + if (ctx->sc_policy->sp_sops->prep_bulk) + return ctx->sc_policy->sp_sops->prep_bulk(req, desc); + + return 0; +} +EXPORT_SYMBOL(sptlrpc_svc_prep_bulk); /**************************************** * user descriptor helpers * @@ -2337,6 +2372,21 @@ const char * sec2target_str(struct ptlrpc_sec *sec) } EXPORT_SYMBOL(sec2target_str); +/* + * return true 
if the bulk data is protected + */ +int sptlrpc_flavor_has_bulk(struct sptlrpc_flavor *flvr) +{ + switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) { + case SPTLRPC_BULK_SVC_INTG: + case SPTLRPC_BULK_SVC_PRIV: + return 1; + default: + return 0; + } +} +EXPORT_SYMBOL(sptlrpc_flavor_has_bulk); + /**************************************** * crypto API helper/alloc blkciper * ****************************************/ diff --git a/lustre/ptlrpc/sec_bulk.c b/lustre/ptlrpc/sec_bulk.c index 12ff171..c09cf0ca 100644 --- a/lustre/ptlrpc/sec_bulk.c +++ b/lustre/ptlrpc/sec_bulk.c @@ -456,8 +456,10 @@ out: static inline void enc_pools_wakeup(void) { + LASSERT_SPIN_LOCKED(&page_pools.epp_lock); + LASSERT(page_pools.epp_waitqlen >= 0); + if (unlikely(page_pools.epp_waitqlen)) { - LASSERT(page_pools.epp_waitqlen > 0); LASSERT(cfs_waitq_active(&page_pools.epp_waitq)); cfs_waitq_broadcast(&page_pools.epp_waitq); } @@ -476,11 +478,15 @@ static int enc_pools_should_grow(int page_needed, long now) if (page_pools.epp_total_pages < page_needed) return 1; - /* if we just did a shrink due to memory tight, we'd better - * wait a while to grow again. + /* + * we wanted to return 0 here if there was a shrink just happened + * moment ago, but this may cause deadlock if both client and ost + * live on single node. 
*/ +#if 0 if (now - page_pools.epp_last_shrink < 2) return 0; +#endif /* * here we perhaps need consider other factors like wait queue @@ -503,32 +509,32 @@ int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc) int p_idx, g_idx; int i; - LASSERT(desc->bd_max_iov > 0); - LASSERT(desc->bd_max_iov <= page_pools.epp_max_pages); + LASSERT(desc->bd_iov_count > 0); + LASSERT(desc->bd_iov_count <= page_pools.epp_max_pages); - /* resent bulk, enc pages might have been allocated previously */ - if (desc->bd_enc_pages != NULL) + /* resent bulk, enc iov might have been allocated previously */ + if (desc->bd_enc_iov != NULL) return 0; - OBD_ALLOC(desc->bd_enc_pages, - desc->bd_max_iov * sizeof(*desc->bd_enc_pages)); - if (desc->bd_enc_pages == NULL) + OBD_ALLOC(desc->bd_enc_iov, + desc->bd_iov_count * sizeof(*desc->bd_enc_iov)); + if (desc->bd_enc_iov == NULL) return -ENOMEM; spin_lock(&page_pools.epp_lock); page_pools.epp_st_access++; again: - if (unlikely(page_pools.epp_free_pages < desc->bd_max_iov)) { + if (unlikely(page_pools.epp_free_pages < desc->bd_iov_count)) { if (tick == 0) tick = cfs_time_current(); now = cfs_time_current_sec(); page_pools.epp_st_missings++; - page_pools.epp_pages_short += desc->bd_max_iov; + page_pools.epp_pages_short += desc->bd_iov_count; - if (enc_pools_should_grow(desc->bd_max_iov, now)) { + if (enc_pools_should_grow(desc->bd_iov_count, now)) { page_pools.epp_growing = 1; spin_unlock(&page_pools.epp_lock); @@ -536,6 +542,8 @@ again: spin_lock(&page_pools.epp_lock); page_pools.epp_growing = 0; + + enc_pools_wakeup(); } else { if (++page_pools.epp_waitqlen > page_pools.epp_st_max_wqlen) @@ -549,14 +557,13 @@ again: spin_unlock(&page_pools.epp_lock); cfs_waitq_wait(&waitlink, CFS_TASK_UNINT); cfs_waitq_del(&page_pools.epp_waitq, &waitlink); - spin_lock(&page_pools.epp_lock); - LASSERT(page_pools.epp_waitqlen > 0); + spin_lock(&page_pools.epp_lock); page_pools.epp_waitqlen--; } - LASSERT(page_pools.epp_pages_short >= desc->bd_max_iov); - 
page_pools.epp_pages_short -= desc->bd_max_iov; + LASSERT(page_pools.epp_pages_short >= desc->bd_iov_count); + page_pools.epp_pages_short -= desc->bd_iov_count; this_idle = 0; goto again; @@ -570,14 +577,15 @@ again: } /* proceed with rest of allocation */ - page_pools.epp_free_pages -= desc->bd_max_iov; + page_pools.epp_free_pages -= desc->bd_iov_count; p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; - for (i = 0; i < desc->bd_max_iov; i++) { + for (i = 0; i < desc->bd_iov_count; i++) { LASSERT(page_pools.epp_pools[p_idx][g_idx] != NULL); - desc->bd_enc_pages[i] = page_pools.epp_pools[p_idx][g_idx]; + desc->bd_enc_iov[i].kiov_page = + page_pools.epp_pools[p_idx][g_idx]; page_pools.epp_pools[p_idx][g_idx] = NULL; if (++g_idx == PAGES_PER_POOL) { @@ -612,26 +620,27 @@ void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc) int p_idx, g_idx; int i; - if (desc->bd_enc_pages == NULL) - return; - if (desc->bd_max_iov == 0) + if (desc->bd_enc_iov == NULL) return; + LASSERT(desc->bd_iov_count > 0); + spin_lock(&page_pools.epp_lock); p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; - LASSERT(page_pools.epp_free_pages + desc->bd_max_iov <= + LASSERT(page_pools.epp_free_pages + desc->bd_iov_count <= page_pools.epp_total_pages); LASSERT(page_pools.epp_pools[p_idx]); - for (i = 0; i < desc->bd_max_iov; i++) { - LASSERT(desc->bd_enc_pages[i] != NULL); + for (i = 0; i < desc->bd_iov_count; i++) { + LASSERT(desc->bd_enc_iov[i].kiov_page != NULL); LASSERT(g_idx != 0 || page_pools.epp_pools[p_idx]); LASSERT(page_pools.epp_pools[p_idx][g_idx] == NULL); - page_pools.epp_pools[p_idx][g_idx] = desc->bd_enc_pages[i]; + page_pools.epp_pools[p_idx][g_idx] = + desc->bd_enc_iov[i].kiov_page; if (++g_idx == PAGES_PER_POOL) { p_idx++; @@ -639,15 +648,15 @@ void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc) } } - page_pools.epp_free_pages += desc->bd_max_iov; + 
page_pools.epp_free_pages += desc->bd_iov_count; enc_pools_wakeup(); spin_unlock(&page_pools.epp_lock); - OBD_FREE(desc->bd_enc_pages, - desc->bd_max_iov * sizeof(*desc->bd_enc_pages)); - desc->bd_enc_pages = NULL; + OBD_FREE(desc->bd_enc_iov, + desc->bd_iov_count * sizeof(*desc->bd_enc_iov)); + desc->bd_enc_iov = NULL; } EXPORT_SYMBOL(sptlrpc_enc_pool_put_pages); @@ -668,7 +677,8 @@ int sptlrpc_enc_pool_add_user(void) spin_unlock(&page_pools.epp_lock); if (need_grow) { - enc_pools_add_pages(PTLRPC_MAX_BRW_PAGES); + enc_pools_add_pages(PTLRPC_MAX_BRW_PAGES + + PTLRPC_MAX_BRW_PAGES); spin_lock(&page_pools.epp_lock); page_pools.epp_growing = 0; @@ -815,9 +825,6 @@ static struct sptlrpc_hash_type hash_types[] = { [BULK_HASH_ALG_SHA256] = { "sha256", "sha256", 32 }, [BULK_HASH_ALG_SHA384] = { "sha384", "sha384", 48 }, [BULK_HASH_ALG_SHA512] = { "sha512", "sha512", 64 }, - [BULK_HASH_ALG_WP256] = { "wp256", "wp256", 32 }, - [BULK_HASH_ALG_WP384] = { "wp384", "wp384", 48 }, - [BULK_HASH_ALG_WP512] = { "wp512", "wp512", 64 }, }; const struct sptlrpc_hash_type *sptlrpc_get_hash_type(__u8 hash_alg) @@ -845,24 +852,21 @@ const char * sptlrpc_get_hash_name(__u8 hash_alg) } EXPORT_SYMBOL(sptlrpc_get_hash_name); -int bulk_sec_desc_size(__u8 hash_alg, int request, int read) +__u8 sptlrpc_get_hash_alg(const char *algname) { - int size = sizeof(struct ptlrpc_bulk_sec_desc); - - LASSERT(hash_alg < BULK_HASH_ALG_MAX); - - /* read request don't need extra data */ - if (!(read && request)) - size += hash_types[hash_alg].sht_size; + int i; - return size; + for (i = 0; i < BULK_HASH_ALG_MAX; i++) + if (!strcmp(hash_types[i].sht_name, algname)) + break; + return i; } -EXPORT_SYMBOL(bulk_sec_desc_size); +EXPORT_SYMBOL(sptlrpc_get_hash_alg); int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset) { struct ptlrpc_bulk_sec_desc *bsd; - int size = msg->lm_buflens[offset]; + int size = msg->lm_buflens[offset]; bsd = lustre_msg_buf(msg, offset, sizeof(*bsd)); if (bsd == NULL) { @@ -870,35 
+874,27 @@ int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset) return -EINVAL; } - /* nothing to swab */ + if (lustre_msg_swabbed(msg)) { + __swab32s(&bsd->bsd_nob); + } if (unlikely(bsd->bsd_version != 0)) { CERROR("Unexpected version %u\n", bsd->bsd_version); return -EPROTO; } - if (unlikely(bsd->bsd_flags != 0)) { - CERROR("Unexpected flags %x\n", bsd->bsd_flags); + if (unlikely(bsd->bsd_type >= SPTLRPC_BULK_MAX)) { + CERROR("Invalid type %u\n", bsd->bsd_type); return -EPROTO; } - if (unlikely(!sptlrpc_get_hash_type(bsd->bsd_hash_alg))) { - CERROR("Unsupported checksum algorithm %u\n", - bsd->bsd_hash_alg); - return -EINVAL; - } + /* FIXME more sanity check here */ - if (unlikely(!sptlrpc_get_ciph_type(bsd->bsd_ciph_alg))) { - CERROR("Unsupported cipher algorithm %u\n", - bsd->bsd_ciph_alg); - return -EINVAL; - } - - if (unlikely(size > sizeof(*bsd)) && - size < sizeof(*bsd) + hash_types[bsd->bsd_hash_alg].sht_size) { - CERROR("Mal-formed checksum data: csum alg %u, size %d\n", - bsd->bsd_hash_alg, size); - return -EINVAL; + if (unlikely(bsd->bsd_svc != SPTLRPC_BULK_SVC_NULL && + bsd->bsd_svc != SPTLRPC_BULK_SVC_INTG && + bsd->bsd_svc != SPTLRPC_BULK_SVC_PRIV)) { + CERROR("Invalid svc %u\n", bsd->bsd_svc); + return -EPROTO; } return 0; @@ -957,14 +953,17 @@ static int do_bulk_checksum_crc32(struct ptlrpc_bulk_desc *desc, void *buf) return 0; } -static int do_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u32 alg, void *buf) +int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, + void *buf, int buflen) { struct hash_desc hdesc; - struct scatterlist *sl; - int i, rc = 0, bytes = 0; + int hashsize; + char hashbuf[64]; + struct scatterlist sl; + int i; - LASSERT(alg > BULK_HASH_ALG_NULL && - alg < BULK_HASH_ALG_MAX); + LASSERT(alg > BULK_HASH_ALG_NULL && alg < BULK_HASH_ALG_MAX); + LASSERT(buflen >= 4); switch (alg) { case BULK_HASH_ALG_ADLER32: @@ -983,35 +982,35 @@ static int do_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u32 alg, 
void *buf) CERROR("Unable to allocate TFM %s\n", hash_types[alg].sht_name); return -ENOMEM; } + hdesc.flags = 0; + ll_crypto_hash_init(&hdesc); - OBD_ALLOC(sl, sizeof(*sl) * desc->bd_iov_count); - if (sl == NULL) { - rc = -ENOMEM; - goto out_tfm; - } + hashsize = ll_crypto_hash_digestsize(hdesc.tfm); for (i = 0; i < desc->bd_iov_count; i++) { - sl[i].page = desc->bd_iov[i].kiov_page; - sl[i].offset = desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK; - sl[i].length = desc->bd_iov[i].kiov_len; - bytes += desc->bd_iov[i].kiov_len; + sl.page = desc->bd_iov[i].kiov_page; + sl.offset = desc->bd_iov[i].kiov_offset; + sl.length = desc->bd_iov[i].kiov_len; + ll_crypto_hash_update(&hdesc, &sl, sl.length); } - ll_crypto_hash_init(&hdesc); - ll_crypto_hash_update(&hdesc, sl, bytes); - ll_crypto_hash_final(&hdesc, buf); - - OBD_FREE(sl, sizeof(*sl) * desc->bd_iov_count); + if (hashsize > buflen) { + ll_crypto_hash_final(&hdesc, hashbuf); + memcpy(buf, hashbuf, buflen); + } else { + ll_crypto_hash_final(&hdesc, buf); + } -out_tfm: ll_crypto_free_hash(hdesc.tfm); - return rc; + return 0; } +EXPORT_SYMBOL(sptlrpc_get_bulk_checksum); #else /* !__KERNEL__ */ -static int do_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u32 alg, void *buf) +int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, + void *buf, int buflen) { __u32 csum32; int i; @@ -1048,328 +1047,3 @@ static int do_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u32 alg, void *buf) } #endif /* __KERNEL__ */ - -/* - * perform algorithm @alg checksum on @desc, store result in @buf. - * if anything goes wrong, leave 'alg' be BULK_HASH_ALG_NULL. 
- */ -static -int generate_bulk_csum(struct ptlrpc_bulk_desc *desc, __u32 alg, - struct ptlrpc_bulk_sec_desc *bsd, int bsdsize) -{ - int rc; - - LASSERT(bsd); - LASSERT(alg < BULK_HASH_ALG_MAX); - - bsd->bsd_hash_alg = BULK_HASH_ALG_NULL; - - if (alg == BULK_HASH_ALG_NULL) - return 0; - - LASSERT(bsdsize >= sizeof(*bsd) + hash_types[alg].sht_size); - - rc = do_bulk_checksum(desc, alg, bsd->bsd_csum); - if (rc == 0) - bsd->bsd_hash_alg = alg; - - return rc; -} - -static -int verify_bulk_csum(struct ptlrpc_bulk_desc *desc, int read, - struct ptlrpc_bulk_sec_desc *bsdv, int bsdvsize, - struct ptlrpc_bulk_sec_desc *bsdr, int bsdrsize) -{ - char *csum_p; - char *buf = NULL; - int csum_size, rc = 0; - - LASSERT(bsdv); - LASSERT(bsdv->bsd_hash_alg < BULK_HASH_ALG_MAX); - - if (bsdr) - bsdr->bsd_hash_alg = BULK_HASH_ALG_NULL; - - if (bsdv->bsd_hash_alg == BULK_HASH_ALG_NULL) - return 0; - - /* for all supported algorithms */ - csum_size = hash_types[bsdv->bsd_hash_alg].sht_size; - - if (bsdvsize < sizeof(*bsdv) + csum_size) { - CERROR("verifier size %d too small, require %d\n", - bsdvsize, (int) sizeof(*bsdv) + csum_size); - return -EINVAL; - } - - if (bsdr) { - LASSERT(bsdrsize >= sizeof(*bsdr) + csum_size); - csum_p = (char *) bsdr->bsd_csum; - } else { - OBD_ALLOC(buf, csum_size); - if (buf == NULL) - return -EINVAL; - csum_p = buf; - } - - rc = do_bulk_checksum(desc, bsdv->bsd_hash_alg, csum_p); - - if (memcmp(bsdv->bsd_csum, csum_p, csum_size)) { - CERROR("BAD %s CHECKSUM (%s), data mutated during " - "transfer!\n", read ? "READ" : "WRITE", - hash_types[bsdv->bsd_hash_alg].sht_name); - rc = -EINVAL; - } else { - CDEBUG(D_SEC, "bulk %s checksum (%s) verified\n", - read ? 
"read" : "write", - hash_types[bsdv->bsd_hash_alg].sht_name); - } - - if (bsdr) { - bsdr->bsd_hash_alg = bsdv->bsd_hash_alg; - memcpy(bsdr->bsd_csum, csum_p, csum_size); - } else { - LASSERT(buf); - OBD_FREE(buf, csum_size); - } - - return rc; -} - -int bulk_csum_cli_request(struct ptlrpc_bulk_desc *desc, int read, - __u32 alg, struct lustre_msg *rmsg, int roff) -{ - struct ptlrpc_bulk_sec_desc *bsdr; - int rsize, rc = 0; - - rsize = rmsg->lm_buflens[roff]; - bsdr = lustre_msg_buf(rmsg, roff, sizeof(*bsdr)); - - LASSERT(bsdr); - LASSERT(rsize >= sizeof(*bsdr)); - LASSERT(alg < BULK_HASH_ALG_MAX); - - if (read) { - bsdr->bsd_hash_alg = alg; - } else { - rc = generate_bulk_csum(desc, alg, bsdr, rsize); - if (rc) - CERROR("bulk write: client failed to compute " - "checksum: %d\n", rc); - - /* For sending we only compute the wrong checksum instead - * of corrupting the data so it is still correct on a redo */ - if (rc == 0 && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND) && - bsdr->bsd_hash_alg != BULK_HASH_ALG_NULL) - bsdr->bsd_csum[0] ^= 0x1; - } - - return rc; -} -EXPORT_SYMBOL(bulk_csum_cli_request); - -int bulk_csum_cli_reply(struct ptlrpc_bulk_desc *desc, int read, - struct lustre_msg *rmsg, int roff, - struct lustre_msg *vmsg, int voff) -{ - struct ptlrpc_bulk_sec_desc *bsdv, *bsdr; - int rsize, vsize; - - rsize = rmsg->lm_buflens[roff]; - vsize = vmsg->lm_buflens[voff]; - bsdr = lustre_msg_buf(rmsg, roff, 0); - bsdv = lustre_msg_buf(vmsg, voff, 0); - - if (bsdv == NULL || vsize < sizeof(*bsdv)) { - CERROR("Invalid checksum verifier from server: size %d\n", - vsize); - return -EINVAL; - } - - LASSERT(bsdr); - LASSERT(rsize >= sizeof(*bsdr)); - LASSERT(vsize >= sizeof(*bsdv)); - - if (bsdr->bsd_hash_alg != bsdv->bsd_hash_alg) { - CERROR("bulk %s: checksum algorithm mismatch: client request " - "%s but server reply with %s. try to use the new one " - "for checksum verification\n", - read ? 
"read" : "write", - hash_types[bsdr->bsd_hash_alg].sht_name, - hash_types[bsdv->bsd_hash_alg].sht_name); - } - - if (read) - return verify_bulk_csum(desc, 1, bsdv, vsize, NULL, 0); - else { - char *cli, *srv, *new = NULL; - int csum_size = hash_types[bsdr->bsd_hash_alg].sht_size; - - LASSERT(bsdr->bsd_hash_alg < BULK_HASH_ALG_MAX); - if (bsdr->bsd_hash_alg == BULK_HASH_ALG_NULL) - return 0; - - if (vsize < sizeof(*bsdv) + csum_size) { - CERROR("verifier size %d too small, require %d\n", - vsize, (int) sizeof(*bsdv) + csum_size); - return -EINVAL; - } - - cli = (char *) (bsdr + 1); - srv = (char *) (bsdv + 1); - - if (!memcmp(cli, srv, csum_size)) { - /* checksum confirmed */ - CDEBUG(D_SEC, "bulk write checksum (%s) confirmed\n", - hash_types[bsdr->bsd_hash_alg].sht_name); - return 0; - } - - /* checksum mismatch, re-compute a new one and compare with - * others, give out proper warnings. */ - OBD_ALLOC(new, csum_size); - if (new == NULL) - return -ENOMEM; - - do_bulk_checksum(desc, bsdr->bsd_hash_alg, new); - - if (!memcmp(new, srv, csum_size)) { - CERROR("BAD WRITE CHECKSUM (%s): pages were mutated " - "on the client after we checksummed them\n", - hash_types[bsdr->bsd_hash_alg].sht_name); - } else if (!memcmp(new, cli, csum_size)) { - CERROR("BAD WRITE CHECKSUM (%s): pages were mutated " - "in transit\n", - hash_types[bsdr->bsd_hash_alg].sht_name); - } else { - CERROR("BAD WRITE CHECKSUM (%s): pages were mutated " - "in transit, and the current page contents " - "don't match the originals and what the server " - "received\n", - hash_types[bsdr->bsd_hash_alg].sht_name); - } - OBD_FREE(new, csum_size); - - return -EINVAL; - } -} -EXPORT_SYMBOL(bulk_csum_cli_reply); - -#ifdef __KERNEL__ -static void corrupt_bulk_data(struct ptlrpc_bulk_desc *desc) -{ - char *ptr; - unsigned int off, i; - - for (i = 0; i < desc->bd_iov_count; i++) { - if (desc->bd_iov[i].kiov_len == 0) - continue; - - ptr = cfs_kmap(desc->bd_iov[i].kiov_page); - off = desc->bd_iov[i].kiov_offset & 
~CFS_PAGE_MASK; - ptr[off] ^= 0x1; - cfs_kunmap(desc->bd_iov[i].kiov_page); - return; - } -} -#else -static void corrupt_bulk_data(struct ptlrpc_bulk_desc *desc) -{ -} -#endif /* __KERNEL__ */ - -int bulk_csum_svc(struct ptlrpc_bulk_desc *desc, int read, - struct ptlrpc_bulk_sec_desc *bsdv, int vsize, - struct ptlrpc_bulk_sec_desc *bsdr, int rsize) -{ - int rc; - - LASSERT(vsize >= sizeof(*bsdv)); - LASSERT(rsize >= sizeof(*bsdr)); - LASSERT(bsdv && bsdr); - - if (read) { - rc = generate_bulk_csum(desc, bsdv->bsd_hash_alg, bsdr, rsize); - if (rc) - CERROR("bulk read: server failed to generate %s " - "checksum: %d\n", - hash_types[bsdv->bsd_hash_alg].sht_name, rc); - - /* corrupt the data after we compute the checksum, to - * simulate an OST->client data error */ - if (rc == 0 && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) - corrupt_bulk_data(desc); - } else { - rc = verify_bulk_csum(desc, 0, bsdv, vsize, bsdr, rsize); - } - - return rc; -} -EXPORT_SYMBOL(bulk_csum_svc); - -/**************************************** - * Helpers to assist policy modules to * - * implement encryption funcationality * - ****************************************/ - -/* FIXME */ -#ifndef __KERNEL__ -#define CRYPTO_TFM_MODE_ECB (0) -#define CRYPTO_TFM_MODE_CBC (1) -#endif - -static struct sptlrpc_ciph_type cipher_types[] = { - [BULK_CIPH_ALG_NULL] = { - "null", "null", 0, 0, 0 - }, - [BULK_CIPH_ALG_ARC4] = { - "arc4", "ecb(arc4)", 0, 0, 16 - }, - [BULK_CIPH_ALG_AES128] = { - "aes128", "cbc(aes)", 0, 16, 16 - }, - [BULK_CIPH_ALG_AES192] = { - "aes192", "cbc(aes)", 0, 16, 24 - }, - [BULK_CIPH_ALG_AES256] = { - "aes256", "cbc(aes)", 0, 16, 32 - }, - [BULK_CIPH_ALG_CAST128] = { - "cast128", "cbc(cast5)", 0, 8, 16 - }, - [BULK_CIPH_ALG_CAST256] = { - "cast256", "cbc(cast6)", 0, 16, 32 - }, - [BULK_CIPH_ALG_TWOFISH128] = { - "twofish128", "cbc(twofish)", 0, 16, 16 - }, - [BULK_CIPH_ALG_TWOFISH256] = { - "twofish256", "cbc(twofish)", 0, 16, 32 - }, -}; - -const struct sptlrpc_ciph_type 
*sptlrpc_get_ciph_type(__u8 ciph_alg) -{ - struct sptlrpc_ciph_type *ct; - - if (ciph_alg < BULK_CIPH_ALG_MAX) { - ct = &cipher_types[ciph_alg]; - if (ct->sct_tfm_name) - return ct; - } - return NULL; -} -EXPORT_SYMBOL(sptlrpc_get_ciph_type); - -const char *sptlrpc_get_ciph_name(__u8 ciph_alg) -{ - const struct sptlrpc_ciph_type *ct; - - ct = sptlrpc_get_ciph_type(ciph_alg); - if (ct) - return ct->sct_name; - else - return "unknown"; -} -EXPORT_SYMBOL(sptlrpc_get_ciph_name); diff --git a/lustre/ptlrpc/sec_config.c b/lustre/ptlrpc/sec_config.c index b54a3a4..e9fe66f 100644 --- a/lustre/ptlrpc/sec_config.c +++ b/lustre/ptlrpc/sec_config.c @@ -102,222 +102,67 @@ EXPORT_SYMBOL(sptlrpc_target_sec_part); * user supplied flavor string parsing * ****************************************/ -#ifdef HAVE_ADLER -#define BULK_HASH_ALG_DEFAULT BULK_HASH_ALG_ADLER32 -#else -#define BULK_HASH_ALG_DEFAULT BULK_HASH_ALG_CRC32 -#endif - -typedef enum { - BULK_TYPE_N = 0, - BULK_TYPE_I = 1, - BULK_TYPE_P = 2 -} bulk_type_t; - -static void get_default_flavor(struct sptlrpc_flavor *sf) -{ - sf->sf_rpc = SPTLRPC_FLVR_NULL; - sf->sf_bulk_ciph = BULK_CIPH_ALG_NULL; - sf->sf_bulk_hash = BULK_HASH_ALG_NULL; - sf->sf_flags = 0; -} - -static void get_flavor_by_rpc(struct sptlrpc_flavor *flvr, __u16 rpc_flavor) -{ - get_default_flavor(flvr); - - flvr->sf_rpc = rpc_flavor; - - switch (rpc_flavor) { - case SPTLRPC_FLVR_NULL: - break; - case SPTLRPC_FLVR_PLAIN: - case SPTLRPC_FLVR_KRB5N: - case SPTLRPC_FLVR_KRB5A: - flvr->sf_bulk_hash = BULK_HASH_ALG_DEFAULT; - break; - case SPTLRPC_FLVR_KRB5P: - flvr->sf_bulk_ciph = BULK_CIPH_ALG_AES128; - /* fall through */ - case SPTLRPC_FLVR_KRB5I: - flvr->sf_bulk_hash = BULK_HASH_ALG_SHA1; - break; - default: - LBUG(); - } -} - -static void get_flavor_by_bulk(struct sptlrpc_flavor *flvr, - __u16 rpc_flavor, bulk_type_t bulk_type) -{ - switch (bulk_type) { - case BULK_TYPE_N: - flvr->sf_bulk_hash = BULK_HASH_ALG_NULL; - flvr->sf_bulk_ciph = BULK_CIPH_ALG_NULL; - 
break; - case BULK_TYPE_I: - switch (rpc_flavor) { - case SPTLRPC_FLVR_PLAIN: - case SPTLRPC_FLVR_KRB5N: - case SPTLRPC_FLVR_KRB5A: - flvr->sf_bulk_hash = BULK_HASH_ALG_DEFAULT; - break; - case SPTLRPC_FLVR_KRB5I: - case SPTLRPC_FLVR_KRB5P: - flvr->sf_bulk_hash = BULK_HASH_ALG_SHA1; - break; - default: - LBUG(); - } - flvr->sf_bulk_ciph = BULK_CIPH_ALG_NULL; - break; - case BULK_TYPE_P: - flvr->sf_bulk_hash = BULK_HASH_ALG_SHA1; - flvr->sf_bulk_ciph = BULK_CIPH_ALG_AES128; - break; - default: - LBUG(); - } -} - -static __u16 __flavors[] = { - SPTLRPC_FLVR_NULL, - SPTLRPC_FLVR_PLAIN, - SPTLRPC_FLVR_KRB5N, - SPTLRPC_FLVR_KRB5A, - SPTLRPC_FLVR_KRB5I, - SPTLRPC_FLVR_KRB5P, -}; - -#define __nflavors ARRAY_SIZE(__flavors) - /* - * flavor string format: rpc[-bulk{n|i|p}[:cksum/enc]] - * for examples: - * null - * plain-bulki - * krb5p-bulkn - * krb5i-bulkp - * krb5i-bulkp:sha512/arc4 + * format: [-] */ int sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr) { - const char *f; - char *bulk, *alg, *enc; - char buf[64]; - bulk_type_t bulk_type; - __u8 i; - ENTRY; + char buf[32]; + char *bulk, *alg; + + memset(flvr, 0, sizeof(*flvr)); if (str == NULL || str[0] == '\0') { flvr->sf_rpc = SPTLRPC_FLVR_INVALID; - goto out; + return 0; } - for (i = 0; i < __nflavors; i++) { - f = sptlrpc_rpcflavor2name(__flavors[i]); - if (strncmp(str, f, strlen(f)) == 0) - break; - } - - if (i >= __nflavors) - GOTO(invalid, -EINVAL); + strncpy(buf, str, sizeof(buf)); + buf[sizeof(buf) - 1] = '\0'; - /* prepare local buffer thus we can modify it as we want */ - strncpy(buf, str, 64); - buf[64 - 1] = '\0'; - - /* find bulk string */ bulk = strchr(buf, '-'); if (bulk) *bulk++ = '\0'; - /* now the first part must equal to rpc flavor name */ - if (strcmp(buf, f) != 0) - GOTO(invalid, -EINVAL); - - get_flavor_by_rpc(flvr, __flavors[i]); - - if (bulk == NULL) - goto out; - - /* find bulk algorithm string */ - alg = strchr(bulk, ':'); - if (alg) - *alg++ = '\0'; - - /* verify bulk section 
*/ - if (strcmp(bulk, "bulkn") == 0) { - flvr->sf_bulk_hash = BULK_HASH_ALG_NULL; - flvr->sf_bulk_ciph = BULK_CIPH_ALG_NULL; - bulk_type = BULK_TYPE_N; - } else if (strcmp(bulk, "bulki") == 0) - bulk_type = BULK_TYPE_I; - else if (strcmp(bulk, "bulkp") == 0) - bulk_type = BULK_TYPE_P; - else - GOTO(invalid, -EINVAL); - - /* null flavor don't support bulk i/p */ - if (__flavors[i] == SPTLRPC_FLVR_NULL && bulk_type != BULK_TYPE_N) - GOTO(invalid, -EINVAL); - - /* plain policy dosen't support bulk p */ - if (__flavors[i] == SPTLRPC_FLVR_PLAIN && bulk_type == BULK_TYPE_P) - GOTO(invalid, -EINVAL); - - get_flavor_by_bulk(flvr, __flavors[i], bulk_type); - - if (alg == NULL) - goto out; - - /* find encryption algorithm string */ - enc = strchr(alg, '/'); - if (enc) - *enc++ = '\0'; - - /* checksum algorithm */ - for (i = 0; i < BULK_HASH_ALG_MAX; i++) { - if (strcmp(alg, sptlrpc_get_hash_name(i)) == 0) { - flvr->sf_bulk_hash = i; - break; - } - } - if (i >= BULK_HASH_ALG_MAX) - GOTO(invalid, -EINVAL); - - /* privacy algorithm */ - if (enc) { - for (i = 0; i < BULK_CIPH_ALG_MAX; i++) { - if (strcmp(enc, sptlrpc_get_ciph_name(i)) == 0) { - flvr->sf_bulk_ciph = i; - break; - } - } - if (i >= BULK_CIPH_ALG_MAX) - GOTO(invalid, -EINVAL); - } + flvr->sf_rpc = sptlrpc_name2flavor_base(buf); + if (flvr->sf_rpc == SPTLRPC_FLVR_INVALID) + goto err_out; /* - * bulk combination sanity checks + * currently only base flavor "plain" can have bulk specification. 
*/ - if (bulk_type == BULK_TYPE_P && - flvr->sf_bulk_ciph == BULK_CIPH_ALG_NULL) - GOTO(invalid, -EINVAL); - - if (bulk_type == BULK_TYPE_I && - (flvr->sf_bulk_hash == BULK_HASH_ALG_NULL || - flvr->sf_bulk_ciph != BULK_CIPH_ALG_NULL)) - GOTO(invalid, -EINVAL); + if (flvr->sf_rpc == SPTLRPC_FLVR_PLAIN) { + flvr->u_bulk.hash.hash_alg = BULK_HASH_ALG_ADLER32; + if (bulk) { + /* + * format: plain-hash: + */ + alg = strchr(bulk, ':'); + if (alg == NULL) + goto err_out; + *alg++ = '\0'; + + if (strcmp(bulk, "hash")) + goto err_out; + + flvr->u_bulk.hash.hash_alg = sptlrpc_get_hash_alg(alg); + if (flvr->u_bulk.hash.hash_alg >= BULK_HASH_ALG_MAX) + goto err_out; + } - if (bulk_type == BULK_TYPE_N && - (flvr->sf_bulk_hash != BULK_HASH_ALG_NULL || - flvr->sf_bulk_ciph != BULK_CIPH_ALG_NULL)) - GOTO(invalid, -EINVAL); + if (flvr->u_bulk.hash.hash_alg == BULK_HASH_ALG_NULL) + flvr_set_bulk_svc(&flvr->sf_rpc, SPTLRPC_BULK_SVC_NULL); + else + flvr_set_bulk_svc(&flvr->sf_rpc, SPTLRPC_BULK_SVC_INTG); + } else { + if (bulk) + goto err_out; + } -out: + flvr->sf_flags = 0; return 0; -invalid: + +err_out: CERROR("invalid flavor string: %s\n", str); return -EINVAL; } @@ -327,6 +172,14 @@ EXPORT_SYMBOL(sptlrpc_parse_flavor); * configure rules * ****************************************/ +static void get_default_flavor(struct sptlrpc_flavor *sf) +{ + memset(sf, 0, sizeof(*sf)); + + sf->sf_rpc = SPTLRPC_FLVR_NULL; + sf->sf_flags = 0; +} + static void sptlrpc_rule_init(struct sptlrpc_rule *rule) { rule->sr_netid = LNET_NIDNET(LNET_NID_ANY); @@ -411,19 +264,17 @@ EXPORT_SYMBOL(sptlrpc_rule_set_free); /* * return 0 if the rule set could accomodate one more rule. - * if @expand != 0, the rule set might be expanded. 
*/ -int sptlrpc_rule_set_expand(struct sptlrpc_rule_set *rset, int expand) +int sptlrpc_rule_set_expand(struct sptlrpc_rule_set *rset) { struct sptlrpc_rule *rules; int nslot; + might_sleep(); + if (rset->srs_nrule < rset->srs_nslot) return 0; - if (expand == 0) - return -E2BIG; - nslot = rset->srs_nslot + 8; /* better use realloc() if available */ @@ -468,16 +319,17 @@ static inline int rule_match_net(struct sptlrpc_rule *r1, /* * merge @rule into @rset. - * if @expand != 0 then @rset slots might be expanded. + * the @rset slots might be expanded. */ int sptlrpc_rule_set_merge(struct sptlrpc_rule_set *rset, - struct sptlrpc_rule *rule, - int expand) + struct sptlrpc_rule *rule) { struct sptlrpc_rule *p = rset->srs_rules; int spec_dir, spec_net; int rc, n, match = 0; + might_sleep(); + spec_net = rule_spec_net(rule); spec_dir = rule_spec_dir(rule); @@ -537,7 +389,7 @@ int sptlrpc_rule_set_merge(struct sptlrpc_rule_set *rset, LASSERT(n >= 0 && n <= rset->srs_nrule); if (rule->sr_flvr.sf_rpc != SPTLRPC_FLVR_INVALID) { - rc = sptlrpc_rule_set_expand(rset, expand); + rc = sptlrpc_rule_set_expand(rset); if (rc) return rc; @@ -616,6 +468,8 @@ static int sptlrpc_rule_set_extract(struct sptlrpc_rule_set *gen, struct sptlrpc_rule *rule; int i, n, rc; + might_sleep(); + /* merge general rules firstly, then target-specific rules */ for (i = 0; i < 2; i++) { if (src[i] == NULL) @@ -633,7 +487,7 @@ static int sptlrpc_rule_set_extract(struct sptlrpc_rule_set *gen, rule->sr_to != to) continue; - rc = sptlrpc_rule_set_merge(rset, rule, 1); + rc = sptlrpc_rule_set_merge(rset, rule); if (rc) { CERROR("can't merge: %d\n", rc); return rc; @@ -800,7 +654,7 @@ static int sptlrpc_conf_merge_rule(struct sptlrpc_conf *conf, } } - return sptlrpc_rule_set_merge(rule_set, rule, 1); + return sptlrpc_rule_set_merge(rule_set, rule); } /** @@ -829,7 +683,7 @@ static int __sptlrpc_process_config(struct lustre_cfg *lcfg, RETURN(-EINVAL); } - CDEBUG(D_SEC, "got one rule: %s.%s\n", target, param); + 
CDEBUG(D_SEC, "processing rule: %s.%s\n", target, param); /* parse rule to make sure the format is correct */ if (strncmp(param, PARAM_SRPC_FLVR, sizeof(PARAM_SRPC_FLVR) - 1) != 0) { @@ -974,6 +828,13 @@ static void inline flavor_set_flags(struct sptlrpc_flavor *sf, enum lustre_sec_part to, unsigned int fl_udesc) { + /* + * null flavor doesn't need to set any flavor, and in fact + * we'd better not do that because everybody share a single sec. + */ + if (sf->sf_rpc == SPTLRPC_FLVR_NULL) + return; + if (from == LUSTRE_SP_MDT) { /* MDT->MDT; MDT->OST */ sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY; diff --git a/lustre/ptlrpc/sec_lproc.c b/lustre/ptlrpc/sec_lproc.c index 51bace7..5a6fae95 100644 --- a/lustre/ptlrpc/sec_lproc.c +++ b/lustre/ptlrpc/sec_lproc.c @@ -66,7 +66,7 @@ struct proc_dir_entry *sptlrpc_proc_root = NULL; EXPORT_SYMBOL(sptlrpc_proc_root); -void sec_flags2str(unsigned long flags, char *buf, int bufsize) +char *sec_flags2str(unsigned long flags, char *buf, int bufsize) { buf[0] = '\0'; @@ -82,7 +82,7 @@ void sec_flags2str(unsigned long flags, char *buf, int bufsize) strncat(buf, "-,", bufsize); buf[strlen(buf) - 1] = '\0'; - + return buf; } static int sptlrpc_info_lprocfs_seq_show(struct seq_file *seq, void *v) @@ -90,7 +90,7 @@ static int sptlrpc_info_lprocfs_seq_show(struct seq_file *seq, void *v) struct obd_device *dev = seq->private; struct client_obd *cli = &dev->u.cli; struct ptlrpc_sec *sec = NULL; - char flags_str[32]; + char str[32]; LASSERT(strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) == 0 || strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 || @@ -101,14 +101,14 @@ static int sptlrpc_info_lprocfs_seq_show(struct seq_file *seq, void *v) if (sec == NULL) goto out; - sec_flags2str(sec->ps_flvr.sf_flags, flags_str, sizeof(flags_str)); + sec_flags2str(sec->ps_flvr.sf_flags, str, sizeof(str)); seq_printf(seq, "rpc flavor: %s\n", - sptlrpc_rpcflavor2name(sec->ps_flvr.sf_rpc)); - seq_printf(seq, "bulk flavor: %s/%s\n", - 
sptlrpc_get_hash_name(sec->ps_flvr.sf_bulk_hash), - sptlrpc_get_ciph_name(sec->ps_flvr.sf_bulk_ciph)); - seq_printf(seq, "flags: %s\n", flags_str); + sptlrpc_flavor2name_base(sec->ps_flvr.sf_rpc)); + seq_printf(seq, "bulk flavor: %s\n", + sptlrpc_flavor2name_bulk(&sec->ps_flvr, str, sizeof(str))); + seq_printf(seq, "flags: %s\n", + sec_flags2str(sec->ps_flvr.sf_flags, str, sizeof(str))); seq_printf(seq, "id: %d\n", sec->ps_id); seq_printf(seq, "refcount: %d\n", atomic_read(&sec->ps_refcount)); seq_printf(seq, "nctx: %d\n", atomic_read(&sec->ps_nctx)); diff --git a/lustre/ptlrpc/sec_null.c b/lustre/ptlrpc/sec_null.c index 7b4368d..08baf12 100644 --- a/lustre/ptlrpc/sec_null.c +++ b/lustre/ptlrpc/sec_null.c @@ -59,13 +59,13 @@ static struct ptlrpc_cli_ctx null_cli_ctx; static struct ptlrpc_svc_ctx null_svc_ctx; /* - * null sec temporarily use the third byte of lm_secflvr to identify + * we can temporarily use the topmost 8-bits of lm_secflvr to identify * the source sec part. */ static inline void null_encode_sec_part(struct lustre_msg *msg, enum lustre_sec_part sp) { - msg->lm_secflvr |= (((__u32) sp) & 0xFF) << 16; + msg->lm_secflvr |= (((__u32) sp) & 0xFF) << 24; } static inline @@ -73,9 +73,9 @@ enum lustre_sec_part null_decode_sec_part(struct lustre_msg *msg) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V2: - return (msg->lm_secflvr >> 16) & 0xFF; + return (msg->lm_secflvr >> 24) & 0xFF; case LUSTRE_MSG_MAGIC_V2_SWABBED: - return (msg->lm_secflvr >> 8) & 0xFF; + return (msg->lm_secflvr) & 0xFF; default: return LUSTRE_SP_ANY; } @@ -135,14 +135,7 @@ struct ptlrpc_sec *null_create_sec(struct obd_import *imp, struct ptlrpc_svc_ctx *svc_ctx, struct sptlrpc_flavor *sf) { - LASSERT(RPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_NULL); - - if (sf->sf_bulk_ciph != BULK_CIPH_ALG_NULL || - sf->sf_bulk_hash != BULK_HASH_ALG_NULL) { - CERROR("null sec don't support bulk algorithm: %u/%u\n", - sf->sf_bulk_ciph, sf->sf_bulk_hash); - return NULL; - } + 
LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_NULL); /* general layer has take a module reference for us, because we never * really destroy the sec, simply release the reference here. @@ -300,7 +293,8 @@ static struct ptlrpc_svc_ctx null_svc_ctx = { static int null_accept(struct ptlrpc_request *req) { - LASSERT(RPC_FLVR_POLICY(req->rq_flvr.sf_rpc) == SPTLRPC_POLICY_NULL); + LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) == + SPTLRPC_POLICY_NULL); if (req->rq_flvr.sf_rpc != SPTLRPC_FLVR_NULL) { CERROR("Invalid rpc flavor 0x%x\n", req->rq_flvr.sf_rpc); @@ -428,8 +422,6 @@ static void null_init_internal(void) null_sec.ps_id = -1; null_sec.ps_import = NULL; null_sec.ps_flvr.sf_rpc = SPTLRPC_FLVR_NULL; - null_sec.ps_flvr.sf_bulk_ciph = BULK_CIPH_ALG_NULL; - null_sec.ps_flvr.sf_bulk_hash = BULK_HASH_ALG_NULL; null_sec.ps_flvr.sf_flags = 0; null_sec.ps_part = LUSTRE_SP_ANY; null_sec.ps_dying = 0; diff --git a/lustre/ptlrpc/sec_plain.c b/lustre/ptlrpc/sec_plain.c index eb9ee82..9b03d77 100644 --- a/lustre/ptlrpc/sec_plain.c +++ b/lustre/ptlrpc/sec_plain.c @@ -71,44 +71,124 @@ static struct ptlrpc_svc_ctx plain_svc_ctx; static unsigned int plain_at_offset; /* - * flavor flags (maximum 8 flags) + * for simplicity, plain policy rpc use fixed layout. 
*/ -#define PLAIN_WFLVR_FLAGS_OFFSET (12) -#define PLAIN_WFLVR_FLAG_BULK (1 << (0 + PLAIN_WFLVR_FLAGS_OFFSET)) -#define PLAIN_WFLVR_FLAG_USER (1 << (1 + PLAIN_WFLVR_FLAGS_OFFSET)) +#define PLAIN_PACK_SEGMENTS (4) + +#define PLAIN_PACK_HDR_OFF (0) +#define PLAIN_PACK_MSG_OFF (1) +#define PLAIN_PACK_USER_OFF (2) +#define PLAIN_PACK_BULK_OFF (3) + +#define PLAIN_FL_USER (0x01) +#define PLAIN_FL_BULK (0x02) + +struct plain_header { + __u8 ph_ver; /* 0 */ + __u8 ph_flags; + __u8 ph_sp; /* source */ + __u8 ph_bulk_hash_alg; /* complete flavor desc */ + __u8 ph_pad[4]; +}; -#define PLAIN_WFLVR_HAS_BULK(wflvr) \ - (((wflvr) & PLAIN_WFLVR_FLAG_BULK) != 0) -#define PLAIN_WFLVR_HAS_USER(wflvr) \ - (((wflvr) & PLAIN_WFLVR_FLAG_USER) != 0) +struct plain_bulk_token { + __u8 pbt_hash[8]; +}; -#define PLAIN_WFLVR_TO_RPC(wflvr) \ - ((wflvr) & ((1 << PLAIN_WFLVR_FLAGS_OFFSET) - 1)) +#define PLAIN_BSD_SIZE \ + (sizeof(struct ptlrpc_bulk_sec_desc) + sizeof(struct plain_bulk_token)) -/* - * similar to null sec, temporarily use the third byte of lm_secflvr to identify - * the source sec part. 
- */ -static inline -void plain_encode_sec_part(struct lustre_msg *msg, enum lustre_sec_part sp) +/**************************************** + * bulk checksum helpers * + ****************************************/ + +static int plain_unpack_bsd(struct lustre_msg *msg) { - msg->lm_secflvr |= (((__u32) sp) & 0xFF) << 16; + struct ptlrpc_bulk_sec_desc *bsd; + + if (bulk_sec_desc_unpack(msg, PLAIN_PACK_BULK_OFF)) + return -EPROTO; + + bsd = lustre_msg_buf(msg, PLAIN_PACK_BULK_OFF, PLAIN_BSD_SIZE); + if (bsd == NULL) { + CERROR("bulk sec desc has short size %d\n", + lustre_msg_buflen(msg, PLAIN_PACK_BULK_OFF)); + return -EPROTO; + } + + if (bsd->bsd_svc != SPTLRPC_BULK_SVC_NULL && + bsd->bsd_svc != SPTLRPC_BULK_SVC_INTG) { + CERROR("invalid bulk svc %u\n", bsd->bsd_svc); + return -EPROTO; + } + + return 0; } -static inline -enum lustre_sec_part plain_decode_sec_part(struct lustre_msg *msg) +static int plain_generate_bulk_csum(struct ptlrpc_bulk_desc *desc, + __u8 hash_alg, + struct plain_bulk_token *token) { - return (msg->lm_secflvr >> 16) & 0xFF; + if (hash_alg == BULK_HASH_ALG_NULL) + return 0; + + memset(token->pbt_hash, 0, sizeof(token->pbt_hash)); + return sptlrpc_get_bulk_checksum(desc, hash_alg, token->pbt_hash, + sizeof(token->pbt_hash)); } -/* - * for simplicity, plain policy rpc use fixed layout. 
- */ -#define PLAIN_PACK_SEGMENTS (3) +static int plain_verify_bulk_csum(struct ptlrpc_bulk_desc *desc, + __u8 hash_alg, + struct plain_bulk_token *tokenr) +{ + struct plain_bulk_token tokenv; + int rc; + + if (hash_alg == BULK_HASH_ALG_NULL) + return 0; -#define PLAIN_PACK_MSG_OFF (0) -#define PLAIN_PACK_USER_OFF (1) -#define PLAIN_PACK_BULK_OFF (2) + memset(&tokenv.pbt_hash, 0, sizeof(tokenv.pbt_hash)); + rc = sptlrpc_get_bulk_checksum(desc, hash_alg, tokenv.pbt_hash, + sizeof(tokenv.pbt_hash)); + if (rc) + return rc; + + if (memcmp(tokenr->pbt_hash, tokenv.pbt_hash, sizeof(tokenr->pbt_hash))) + return -EACCES; + return 0; +} + +#ifdef __KERNEL__ +static void corrupt_bulk_data(struct ptlrpc_bulk_desc *desc) +{ + char *ptr; + unsigned int off, i; + + for (i = 0; i < desc->bd_iov_count; i++) { + if (desc->bd_iov[i].kiov_len == 0) + continue; + + ptr = cfs_kmap(desc->bd_iov[i].kiov_page); + off = desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK; + ptr[off] ^= 0x1; + cfs_kunmap(desc->bd_iov[i].kiov_page); + return; + } +} +#else +static void corrupt_bulk_data(struct ptlrpc_bulk_desc *desc) +{ + unsigned int i; + + for (i = 0; i < desc->bd_iov_count; i++) { + if (desc->bd_iov[i].iov_len == 0) + continue; + + ((char *)desc->bd_iov[i].iov_base)[i] ^= 0x1; + return; + } +} +#endif /* __KERNEL__ */ /**************************************** * cli_ctx apis * @@ -131,16 +211,22 @@ int plain_ctx_validate(struct ptlrpc_cli_ctx *ctx) static int plain_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) { - struct lustre_msg_v2 *msg = req->rq_reqbuf; + struct lustre_msg *msg = req->rq_reqbuf; + struct plain_header *phdr; ENTRY; msg->lm_secflvr = req->rq_flvr.sf_rpc; - if (req->rq_pack_bulk) - msg->lm_secflvr |= PLAIN_WFLVR_FLAG_BULK; - if (req->rq_pack_udesc) - msg->lm_secflvr |= PLAIN_WFLVR_FLAG_USER; - plain_encode_sec_part(msg, ctx->cc_sec->ps_part); + phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, 0); + phdr->ph_ver = 0; + phdr->ph_flags = 0; + phdr->ph_sp = 
ctx->cc_sec->ps_part; + phdr->ph_bulk_hash_alg = req->rq_flvr.u_bulk.hash.hash_alg; + + if (req->rq_pack_udesc) + phdr->ph_flags |= PLAIN_FL_USER; + if (req->rq_pack_bulk) + phdr->ph_flags |= PLAIN_FL_BULK; req->rq_reqdata_len = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); @@ -150,8 +236,9 @@ int plain_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) static int plain_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) { - struct lustre_msg *msg = req->rq_repdata; - __u32 cksum; + struct lustre_msg *msg = req->rq_repdata; + struct plain_header *phdr; + __u32 cksum; ENTRY; if (msg->lm_bufcount != PLAIN_PACK_SEGMENTS) { @@ -159,12 +246,29 @@ int plain_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) RETURN(-EPROTO); } + phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, sizeof(*phdr)); + if (phdr == NULL) { + CERROR("missing plain header\n"); + RETURN(-EPROTO); + } + + if (phdr->ph_ver != 0) { + CERROR("Invalid header version\n"); + RETURN(-EPROTO); + } + /* expect no user desc in reply */ - if (PLAIN_WFLVR_HAS_USER(msg->lm_secflvr)) { + if (phdr->ph_flags & PLAIN_FL_USER) { CERROR("Unexpected udesc flag in reply\n"); RETURN(-EPROTO); } + if (phdr->ph_bulk_hash_alg != req->rq_flvr.u_bulk.hash.hash_alg) { + CERROR("reply bulk flavor %u != %u\n", phdr->ph_bulk_hash_alg, + req->rq_flvr.u_bulk.hash.hash_alg); + RETURN(-EPROTO); + } + if (unlikely(req->rq_early)) { cksum = crc32_le(!(__u32) 0, lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0), @@ -179,16 +283,15 @@ int plain_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) * in reply, except for early reply */ if (!req->rq_early && !equi(req->rq_pack_bulk == 1, - PLAIN_WFLVR_HAS_BULK(msg->lm_secflvr))) { + phdr->ph_flags & PLAIN_FL_BULK)) { CERROR("%s bulk checksum in reply\n", req->rq_pack_bulk ? 
"Missing" : "Unexpected"); RETURN(-EPROTO); } - if (PLAIN_WFLVR_HAS_BULK(msg->lm_secflvr) && - bulk_sec_desc_unpack(msg, PLAIN_PACK_BULK_OFF)) { - CERROR("Mal-formed bulk checksum reply\n"); - RETURN(-EINVAL); + if (phdr->ph_flags & PLAIN_FL_BULK) { + if (plain_unpack_bsd(msg)) + RETURN(-EPROTO); } } @@ -202,13 +305,42 @@ int plain_cli_wrap_bulk(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req, struct ptlrpc_bulk_desc *desc) { + struct ptlrpc_bulk_sec_desc *bsd; + struct plain_bulk_token *token; + int rc; + LASSERT(req->rq_pack_bulk); LASSERT(req->rq_reqbuf->lm_bufcount == PLAIN_PACK_SEGMENTS); - return bulk_csum_cli_request(desc, req->rq_bulk_read, - req->rq_flvr.sf_bulk_hash, - req->rq_reqbuf, - PLAIN_PACK_BULK_OFF); + bsd = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0); + token = (struct plain_bulk_token *) bsd->bsd_data; + + bsd->bsd_version = 0; + bsd->bsd_flags = 0; + bsd->bsd_type = SPTLRPC_BULK_DEFAULT; + bsd->bsd_svc = SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc); + + if (bsd->bsd_svc == SPTLRPC_BULK_SVC_NULL) + RETURN(0); + + if (req->rq_bulk_read) + RETURN(0); + + rc = plain_generate_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, + token); + if (rc) { + CERROR("bulk write: failed to compute checksum: %d\n", rc); + } else { + /* + * for sending we only compute the wrong checksum instead + * of corrupting the data so it is still correct on a redo + */ + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND) && + req->rq_flvr.u_bulk.hash.hash_alg != BULK_HASH_ALG_NULL) + token->pbt_hash[0] ^= 0x1; + } + + return rc; } static @@ -216,13 +348,45 @@ int plain_cli_unwrap_bulk(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req, struct ptlrpc_bulk_desc *desc) { + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + struct plain_bulk_token *tokenr, *tokenv; + int rc; +#ifdef __KERNEL__ + int i, nob; +#endif + LASSERT(req->rq_pack_bulk); LASSERT(req->rq_reqbuf->lm_bufcount == PLAIN_PACK_SEGMENTS); LASSERT(req->rq_repdata->lm_bufcount == PLAIN_PACK_SEGMENTS); - 
return bulk_csum_cli_reply(desc, req->rq_bulk_read, - req->rq_reqbuf, PLAIN_PACK_BULK_OFF, - req->rq_repdata, PLAIN_PACK_BULK_OFF); + bsdr = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0); + tokenr = (struct plain_bulk_token *) bsdr->bsd_data; + bsdv = lustre_msg_buf(req->rq_repdata, PLAIN_PACK_BULK_OFF, 0); + tokenv = (struct plain_bulk_token *) bsdv->bsd_data; + + if (req->rq_bulk_write) { + if (bsdv->bsd_flags & BSD_FL_ERR) + return -EIO; + return 0; + } + +#ifdef __KERNEL__ + /* fix the actual data size */ + for (i = 0, nob = 0; i < desc->bd_iov_count; i++) { + if (desc->bd_iov[i].kiov_len + nob > desc->bd_nob_transferred) { + desc->bd_iov[i].kiov_len = + desc->bd_nob_transferred - nob; + } + nob += desc->bd_iov[i].kiov_len; + } +#endif + + rc = plain_verify_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, + tokenv); + if (rc) + CERROR("bulk read: client verify failed: %d\n", rc); + + return rc; } /**************************************** @@ -303,13 +467,7 @@ struct ptlrpc_sec *plain_create_sec(struct obd_import *imp, struct ptlrpc_cli_ctx *ctx; ENTRY; - LASSERT(RPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN); - - if (sf->sf_bulk_ciph != BULK_CIPH_ALG_NULL) { - CERROR("plain policy don't support bulk cipher: %u\n", - sf->sf_bulk_ciph); - RETURN(NULL); - } + LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN); OBD_ALLOC_PTR(plsec); if (plsec == NULL) @@ -410,9 +568,10 @@ int plain_alloc_reqbuf(struct ptlrpc_sec *sec, int msgsize) { __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; - int alloc_len; + int alloc_len; ENTRY; + buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header); buflens[PLAIN_PACK_MSG_OFF] = msgsize; if (req->rq_pack_udesc) @@ -420,10 +579,7 @@ int plain_alloc_reqbuf(struct ptlrpc_sec *sec, if (req->rq_pack_bulk) { LASSERT(req->rq_bulk_read || req->rq_bulk_write); - - buflens[PLAIN_PACK_BULK_OFF] = bulk_sec_desc_size( - req->rq_flvr.sf_bulk_hash, 1, - req->rq_bulk_read); + buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE; } 
alloc_len = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); @@ -444,7 +600,7 @@ int plain_alloc_reqbuf(struct ptlrpc_sec *sec, } lustre_init_msg_v2(req->rq_reqbuf, PLAIN_PACK_SEGMENTS, buflens, NULL); - req->rq_reqmsg = lustre_msg_buf_v2(req->rq_reqbuf, 0, 0); + req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, 0); if (req->rq_pack_udesc) sptlrpc_pack_user_desc(req->rq_reqbuf, PLAIN_PACK_USER_OFF); @@ -476,13 +632,12 @@ int plain_alloc_repbuf(struct ptlrpc_sec *sec, int alloc_len; ENTRY; + buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header); buflens[PLAIN_PACK_MSG_OFF] = msgsize; if (req->rq_pack_bulk) { LASSERT(req->rq_bulk_read || req->rq_bulk_write); - buflens[PLAIN_PACK_BULK_OFF] = bulk_sec_desc_size( - req->rq_flvr.sf_bulk_hash, 0, - req->rq_bulk_read); + buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE; } alloc_len = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); @@ -581,24 +736,46 @@ static struct ptlrpc_svc_ctx plain_svc_ctx = { static int plain_accept(struct ptlrpc_request *req) { - struct lustre_msg *msg = req->rq_reqbuf; + struct lustre_msg *msg = req->rq_reqbuf; + struct plain_header *phdr; ENTRY; - LASSERT(RPC_FLVR_POLICY(req->rq_flvr.sf_rpc) == SPTLRPC_POLICY_PLAIN); + LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) == + SPTLRPC_POLICY_PLAIN); + + if (SPTLRPC_FLVR_BASE(req->rq_flvr.sf_rpc) != + SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_PLAIN) || + SPTLRPC_FLVR_BULK_TYPE(req->rq_flvr.sf_rpc) != + SPTLRPC_FLVR_BULK_TYPE(SPTLRPC_FLVR_PLAIN)) { + CERROR("Invalid rpc flavor %x\n", req->rq_flvr.sf_rpc); + RETURN(SECSVC_DROP); + } if (msg->lm_bufcount < PLAIN_PACK_SEGMENTS) { CERROR("unexpected request buf count %u\n", msg->lm_bufcount); RETURN(SECSVC_DROP); } - if (req->rq_flvr.sf_rpc != SPTLRPC_FLVR_PLAIN) { - CERROR("Invalid rpc flavor %x\n", req->rq_flvr.sf_rpc); - RETURN(SECSVC_DROP); + phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, sizeof(*phdr)); + if (phdr == NULL) { + CERROR("missing plain header\n"); + RETURN(-EPROTO); } - 
req->rq_sp_from = plain_decode_sec_part(msg); + if (phdr->ph_ver != 0) { + CERROR("Invalid header version\n"); + RETURN(-EPROTO); + } - if (PLAIN_WFLVR_HAS_USER(msg->lm_secflvr)) { + if (phdr->ph_bulk_hash_alg >= BULK_HASH_ALG_MAX) { + CERROR("invalid hash algorithm: %u\n", phdr->ph_bulk_hash_alg); + RETURN(-EPROTO); + } + + req->rq_sp_from = phdr->ph_sp; + req->rq_flvr.u_bulk.hash.hash_alg = phdr->ph_bulk_hash_alg; + + if (phdr->ph_flags & PLAIN_FL_USER) { if (sptlrpc_unpack_user_desc(msg, PLAIN_PACK_USER_OFF)) { CERROR("Mal-formed user descriptor\n"); RETURN(SECSVC_DROP); @@ -608,11 +785,9 @@ int plain_accept(struct ptlrpc_request *req) req->rq_user_desc = lustre_msg_buf(msg, PLAIN_PACK_USER_OFF, 0); } - if (PLAIN_WFLVR_HAS_BULK(msg->lm_secflvr)) { - if (bulk_sec_desc_unpack(msg, PLAIN_PACK_BULK_OFF)) { - CERROR("Mal-formed bulk checksum request\n"); + if (phdr->ph_flags & PLAIN_FL_BULK) { + if (plain_unpack_bsd(msg)) RETURN(SECSVC_DROP); - } req->rq_pack_bulk = 1; } @@ -630,24 +805,18 @@ static int plain_alloc_rs(struct ptlrpc_request *req, int msgsize) { struct ptlrpc_reply_state *rs; - struct ptlrpc_bulk_sec_desc *bsd; __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; int rs_size = sizeof(*rs); ENTRY; LASSERT(msgsize % 8 == 0); + buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header); buflens[PLAIN_PACK_MSG_OFF] = msgsize; - if (req->rq_pack_bulk && (req->rq_bulk_read || req->rq_bulk_write)) { - bsd = lustre_msg_buf(req->rq_reqbuf, - PLAIN_PACK_BULK_OFF, sizeof(*bsd)); - LASSERT(bsd); + if (req->rq_pack_bulk && (req->rq_bulk_read || req->rq_bulk_write)) + buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE; - buflens[PLAIN_PACK_BULK_OFF] = bulk_sec_desc_size( - bsd->bsd_hash_alg, 0, - req->rq_bulk_read); - } rs_size += lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); rs = req->rq_reply_state; @@ -693,6 +862,7 @@ int plain_authorize(struct ptlrpc_request *req) { struct ptlrpc_reply_state *rs = req->rq_reply_state; struct lustre_msg_v2 *msg = rs->rs_repbuf; + struct 
plain_header *phdr; int len; ENTRY; @@ -706,8 +876,14 @@ int plain_authorize(struct ptlrpc_request *req) len = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); msg->lm_secflvr = req->rq_flvr.sf_rpc; + + phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, 0); + phdr->ph_ver = 0; + phdr->ph_flags = 0; + phdr->ph_bulk_hash_alg = req->rq_flvr.u_bulk.hash.hash_alg; + if (req->rq_pack_bulk) - msg->lm_secflvr |= PLAIN_WFLVR_FLAG_BULK; + phdr->ph_flags |= PLAIN_FL_BULK; rs->rs_repdata_len = len; @@ -730,44 +906,73 @@ static int plain_svc_unwrap_bulk(struct ptlrpc_request *req, struct ptlrpc_bulk_desc *desc) { - struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + struct plain_bulk_token *tokenr, *tokenv; + int rc; - LASSERT(rs); + LASSERT(req->rq_bulk_write); LASSERT(req->rq_pack_bulk); - LASSERT(req->rq_reqbuf->lm_bufcount >= PLAIN_PACK_SEGMENTS); - LASSERT(rs->rs_repbuf->lm_bufcount == PLAIN_PACK_SEGMENTS); - return bulk_csum_svc(desc, req->rq_bulk_read, - lustre_msg_buf(req->rq_reqbuf, - PLAIN_PACK_BULK_OFF, 0), - lustre_msg_buflen(req->rq_reqbuf, - PLAIN_PACK_BULK_OFF), - lustre_msg_buf(rs->rs_repbuf, - PLAIN_PACK_BULK_OFF, 0), - lustre_msg_buflen(rs->rs_repbuf, - PLAIN_PACK_BULK_OFF)); + bsdr = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0); + tokenr = (struct plain_bulk_token *) bsdr->bsd_data; + bsdv = lustre_msg_buf(rs->rs_repbuf, PLAIN_PACK_BULK_OFF, 0); + tokenv = (struct plain_bulk_token *) bsdv->bsd_data; + + bsdv->bsd_version = 0; + bsdv->bsd_type = SPTLRPC_BULK_DEFAULT; + bsdv->bsd_svc = bsdr->bsd_svc; + bsdv->bsd_flags = 0; + + if (bsdr->bsd_svc == SPTLRPC_BULK_SVC_NULL) + return 0; + + rc = plain_verify_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, + tokenr); + if (rc) { + bsdv->bsd_flags |= BSD_FL_ERR; + CERROR("bulk write: server verify failed: %d\n", rc); + } + + return rc; } static int plain_svc_wrap_bulk(struct ptlrpc_request *req, struct 
ptlrpc_bulk_desc *desc) { - struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct ptlrpc_reply_state *rs = req->rq_reply_state; + struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; + struct plain_bulk_token *tokenr, *tokenv; + int rc; - LASSERT(rs); + LASSERT(req->rq_bulk_read); LASSERT(req->rq_pack_bulk); - LASSERT(req->rq_reqbuf->lm_bufcount >= PLAIN_PACK_SEGMENTS); - LASSERT(rs->rs_repbuf->lm_bufcount == PLAIN_PACK_SEGMENTS); - return bulk_csum_svc(desc, req->rq_bulk_read, - lustre_msg_buf(req->rq_reqbuf, - PLAIN_PACK_BULK_OFF, 0), - lustre_msg_buflen(req->rq_reqbuf, - PLAIN_PACK_BULK_OFF), - lustre_msg_buf(rs->rs_repbuf, - PLAIN_PACK_BULK_OFF, 0), - lustre_msg_buflen(rs->rs_repbuf, - PLAIN_PACK_BULK_OFF)); + bsdr = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0); + tokenr = (struct plain_bulk_token *) bsdr->bsd_data; + bsdv = lustre_msg_buf(rs->rs_repbuf, PLAIN_PACK_BULK_OFF, 0); + tokenv = (struct plain_bulk_token *) bsdv->bsd_data; + + bsdv->bsd_version = 0; + bsdv->bsd_type = SPTLRPC_BULK_DEFAULT; + bsdv->bsd_svc = bsdr->bsd_svc; + bsdv->bsd_flags = 0; + + if (bsdr->bsd_svc == SPTLRPC_BULK_SVC_NULL) + return 0; + + rc = plain_generate_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, + tokenv); + if (rc) { + CERROR("bulk read: server failed to compute " + "checksum: %d\n", rc); + } else { + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) + corrupt_bulk_data(desc); + } + + return rc; } static struct ptlrpc_ctx_ops plain_ctx_ops = { @@ -787,8 +992,8 @@ static struct ptlrpc_sec_cops plain_sec_cops = { .release_ctx = plain_release_ctx, .flush_ctx_cache = plain_flush_ctx_cache, .alloc_reqbuf = plain_alloc_reqbuf, - .alloc_repbuf = plain_alloc_repbuf, .free_reqbuf = plain_free_reqbuf, + .alloc_repbuf = plain_alloc_repbuf, .free_repbuf = plain_free_repbuf, .enlarge_reqbuf = plain_enlarge_reqbuf, }; diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index cb101cf..a8d0785 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ 
-1311,6 +1311,17 @@ ptlrpc_server_handle_req_in(struct ptlrpc_service *svc) goto err_req; } + switch(lustre_msg_get_opc(req->rq_reqmsg)) { + case MDS_WRITEPAGE: + case OST_WRITE: + req->rq_bulk_write = 1; + break; + case MDS_READPAGE: + case OST_READ: + req->rq_bulk_read = 1; + break; + } + CDEBUG(D_NET, "got req "LPD64"\n", req->rq_xid); req->rq_export = class_conn2export( diff --git a/lustre/ptlrpc/wiretest.c b/lustre/ptlrpc/wiretest.c index d1f4475..c93be5d 100644 --- a/lustre/ptlrpc/wiretest.c +++ b/lustre/ptlrpc/wiretest.c @@ -65,8 +65,8 @@ void lustre_assert_wire_constants(void) { /* Wire protocol assertions generated by 'wirecheck' * (make -C lustre/utils newwiretest) - * running on Linux lin2 2.6.18-92.1.17-prep #3 Sun Nov 23 14:29:36 IST 2008 i686 i686 i386 G - * with gcc version 3.4.6 20060404 (Red Hat 3.4.6-10) */ + * running on Linux localhost.localdomain 2.6.18-prep #3 SMP Sun Nov 23 08:04:44 EST 2008 i68 + * with gcc version 4.1.1 20061011 (Red Hat 4.1.1-30) */ /* Constants... 
*/ @@ -254,9 +254,9 @@ void lustre_assert_wire_constants(void) (long long)OBD_QC_CALLBACK); LASSERTF(OBD_LAST_OPC == 403, " found %lld\n", (long long)OBD_LAST_OPC); - LASSERTF(QUOTA_DQACQ == 901, " found %lld\n", + LASSERTF(QUOTA_DQACQ == 601, " found %lld\n", (long long)QUOTA_DQACQ); - LASSERTF(QUOTA_DQREL == 902, " found %lld\n", + LASSERTF(QUOTA_DQREL == 602, " found %lld\n", (long long)QUOTA_DQREL); LASSERTF(MGS_CONNECT == 250, " found %lld\n", (long long)MGS_CONNECT); @@ -447,31 +447,31 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct obd_connect_data, padding2)); LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding2) == 8, " found %lld\n", (long long)(int)sizeof(((struct obd_connect_data *)0)->padding2)); - CLASSERT(OBD_CONNECT_RDONLY == 0x00000001ULL); - CLASSERT(OBD_CONNECT_INDEX == 0x00000002ULL); - CLASSERT(OBD_CONNECT_GRANT == 0x00000008ULL); - CLASSERT(OBD_CONNECT_SRVLOCK == 0x00000010ULL); - CLASSERT(OBD_CONNECT_VERSION == 0x00000020ULL); - CLASSERT(OBD_CONNECT_REQPORTAL == 0x00000040ULL); - CLASSERT(OBD_CONNECT_ACL == 0x00000080ULL); - CLASSERT(OBD_CONNECT_XATTR == 0x00000100ULL); + CLASSERT(OBD_CONNECT_RDONLY == 0x1ULL); + CLASSERT(OBD_CONNECT_INDEX == 0x2ULL); + CLASSERT(OBD_CONNECT_GRANT == 0x8ULL); + CLASSERT(OBD_CONNECT_SRVLOCK == 0x10ULL); + CLASSERT(OBD_CONNECT_VERSION == 0x20ULL); + CLASSERT(OBD_CONNECT_REQPORTAL == 0x40ULL); + CLASSERT(OBD_CONNECT_ACL == 0x80ULL); + CLASSERT(OBD_CONNECT_XATTR == 0x100ULL); CLASSERT(OBD_CONNECT_REAL == 0x08000000ULL); CLASSERT(OBD_CONNECT_CKSUM == 0x20000000ULL); - CLASSERT(OBD_CONNECT_TRUNCLOCK == 0x00000400ULL); - CLASSERT(OBD_CONNECT_IBITS == 0x00001000ULL); - CLASSERT(OBD_CONNECT_JOIN == 0x00002000ULL); - CLASSERT(OBD_CONNECT_ATTRFID == 0x00004000ULL); - CLASSERT(OBD_CONNECT_NODEVOH == 0x00008000ULL); + CLASSERT(OBD_CONNECT_TRUNCLOCK == 0x400ULL); + CLASSERT(OBD_CONNECT_IBITS == 0x1000ULL); + CLASSERT(OBD_CONNECT_JOIN == 0x2000ULL); + CLASSERT(OBD_CONNECT_ATTRFID == 
0x4000ULL); + CLASSERT(OBD_CONNECT_NODEVOH == 0x8000ULL); CLASSERT(OBD_CONNECT_RMT_CLIENT == 0x00010000ULL); CLASSERT(OBD_CONNECT_RMT_CLIENT_FORCE == 0x00020000ULL); - CLASSERT(OBD_CONNECT_BRW_SIZE == 0x00040000ULL); - CLASSERT(OBD_CONNECT_QUOTA64 == 0x00080000ULL); - CLASSERT(OBD_CONNECT_MDS_CAPA == 0x00100000ULL); - CLASSERT(OBD_CONNECT_OSS_CAPA == 0x00200000ULL); + CLASSERT(OBD_CONNECT_BRW_SIZE == 0x40000ULL); + CLASSERT(OBD_CONNECT_QUOTA64 == 0x80000ULL); + CLASSERT(OBD_CONNECT_MDS_CAPA == 0x100000ULL); + CLASSERT(OBD_CONNECT_OSS_CAPA == 0x200000ULL); CLASSERT(OBD_CONNECT_MDS_MDS == 0x04000000ULL); CLASSERT(OBD_CONNECT_SOM == 0x00800000ULL); CLASSERT(OBD_CONNECT_AT == 0x01000000ULL); - CLASSERT(OBD_CONNECT_CANCELSET == 0x00400000ULL); + CLASSERT(OBD_CONNECT_CANCELSET == 0x400000ULL); CLASSERT(OBD_CONNECT_LRU_RESIZE == 0x02000000ULL); /* Checks for struct obdo */ @@ -2389,7 +2389,7 @@ void lustre_assert_wire_constants(void) CLASSERT(FIEMAP_FLAG_DEVICE_ORDER == 0x40000000); /* Checks for struct ll_fiemap_extent */ - LASSERTF((int)sizeof(struct ll_fiemap_extent) == 32, " found %lld\n", + LASSERTF((int)sizeof(struct ll_fiemap_extent) == 56, " found %lld\n", (long long)(int)sizeof(struct ll_fiemap_extent)); LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_logical) == 0, " found %lld\n", (long long)(int)offsetof(struct ll_fiemap_extent, fe_logical)); @@ -2403,27 +2403,26 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct ll_fiemap_extent, fe_length)); LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_length) == 8, " found %lld\n", (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_length)); - LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_flags) == 24, " found %lld\n", + LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_flags) == 40, " found %lld\n", (long long)(int)offsetof(struct ll_fiemap_extent, fe_flags)); LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_flags) == 4, " found %lld\n", (long 
long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_flags)); - LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_device) == 28, " found %lld\n", + LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_device) == 44, " found %lld\n", (long long)(int)offsetof(struct ll_fiemap_extent, fe_device)); LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_device) == 4, " found %lld\n", (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_device)); CLASSERT(FIEMAP_EXTENT_LAST == 0x00000001); CLASSERT(FIEMAP_EXTENT_UNKNOWN == 0x00000002); CLASSERT(FIEMAP_EXTENT_DELALLOC == 0x00000004); - CLASSERT(FIEMAP_EXTENT_NO_DIRECT == 0x00000008); - CLASSERT(FIEMAP_EXTENT_SECONDARY == 0x00000010); - CLASSERT(FIEMAP_EXTENT_NET == 0x00000020); - CLASSERT(FIEMAP_EXTENT_DATA_COMPRESSED == 0x00000040); + CLASSERT(FIEMAP_EXTENT_ENCODED == 0x00000008); CLASSERT(FIEMAP_EXTENT_DATA_ENCRYPTED == 0x00000080); CLASSERT(FIEMAP_EXTENT_NOT_ALIGNED == 0x00000100); CLASSERT(FIEMAP_EXTENT_DATA_INLINE == 0x00000200); CLASSERT(FIEMAP_EXTENT_DATA_TAIL == 0x00000400); CLASSERT(FIEMAP_EXTENT_UNWRITTEN == 0x00000800); CLASSERT(FIEMAP_EXTENT_MERGED == 0x00001000); + CLASSERT(FIEMAP_EXTENT_NO_DIRECT == 0x40000000); + CLASSERT(FIEMAP_EXTENT_NET == 0x80000000); #ifdef LIBLUSTRE_POSIX_ACL /* Checks for type posix_acl_xattr_entry */ diff --git a/lustre/quota/Makefile.in b/lustre/quota/Makefile.in index f052b42..50efef3 100644 --- a/lustre/quota/Makefile.in +++ b/lustre/quota/Makefile.in @@ -3,5 +3,7 @@ MODULES := lquota lquota-objs := quota_check.o quota_context.o quota_ctl.o quota_interface.o lquota-objs += quota_master.o quota_adjust_qunit.o lproc_quota.o +EXTRA_DIST := $(lquota-objs:%.o=%.c) $(quotactl-objs:%.o=%.c) $(quotacheck-objs:%.o=%.c) quota_internal.h + @INCLUDE_RULES@ diff --git a/lustre/quota/autoMakefile.am b/lustre/quota/autoMakefile.am index 9a20d28..0c9bd1f 100644 --- a/lustre/quota/autoMakefile.am +++ b/lustre/quota/autoMakefile.am @@ -46,4 +46,3 @@ modulefs_DATA = lquota$(KMODEXT) endif 
MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ -DIST_SOURCES := $(lquota-objs:%.o=%.c) quota_internal.h diff --git a/lustre/quota/quota_adjust_qunit.c b/lustre/quota/quota_adjust_qunit.c index df2115d..abe57dd 100644 --- a/lustre/quota/quota_adjust_qunit.c +++ b/lustre/quota/quota_adjust_qunit.c @@ -331,7 +331,8 @@ int filter_quota_adjust_qunit(struct obd_export *exp, if (rc > 0) { rc = qctxt_adjust_qunit(obd, qctxt, uid, gid, 1, 0, NULL); - if (rc == -EDQUOT || rc == -EBUSY || rc == -EAGAIN) { + if (rc == -EDQUOT || rc == -EBUSY || + rc == QUOTA_REQ_RETURNED || rc == -EAGAIN) { CDEBUG(D_QUOTA, "rc: %d.\n", rc); rc = 0; } diff --git a/lustre/quota/quota_check.c b/lustre/quota/quota_check.c index 62fc1f0..c2238e2 100644 --- a/lustre/quota/quota_check.c +++ b/lustre/quota/quota_check.c @@ -114,6 +114,7 @@ static int target_quotacheck_thread(void *data) pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); rc = target_quotacheck_callback(exp, oqctl); + class_export_put(exp); atomic_inc(qta->qta_sem); @@ -155,6 +156,9 @@ int target_quota_check(struct obd_device *obd, struct obd_export *exp, } } + /* we get ref for exp because target_quotacheck_callback() will use this + * export later b=18126 */ + class_export_get(exp); rc = kernel_thread(target_quotacheck_thread, qta, CLONE_VM|CLONE_FILES); if (rc >= 0) { CDEBUG(D_INFO, "%s: target_quotacheck_thread: %d\n", @@ -162,6 +166,7 @@ int target_quota_check(struct obd_device *obd, struct obd_export *exp, RETURN(0); } + class_export_put(exp); CERROR("%s: error starting quotacheck_thread: %d\n", obd->obd_name, rc); OBD_FREE_PTR(qta); @@ -274,12 +279,14 @@ int lov_quota_check(struct obd_device *unused, struct obd_export *exp, ENTRY; for (i = 0; i < lov->desc.ld_tgt_count; i++) { - int err; - if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_active) { CERROR("lov idx %d inactive\n", i); RETURN(-EIO); } + } + + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + int err; err = obd_quotacheck(lov->lov_tgts[i]->ltd_exp, oqctl); if (err && !rc) diff --git 
a/lustre/quota/quota_context.c b/lustre/quota/quota_context.c index bb0bbd2..162f0af 100644 --- a/lustre/quota/quota_context.c +++ b/lustre/quota/quota_context.c @@ -636,8 +636,10 @@ out: compute_lqs_after_removing_qunit(qunit); - /* wake up all waiters */ + if (rc == 0) + rc = QUOTA_REQ_RETURNED; QUNIT_SET_STATE_AND_RC(qunit, QUNIT_FINISHED, rc); + /* wake up all waiters */ wake_up_all(&qunit->lq_waitq); /* this is for dqacq_in_flight() */ @@ -664,7 +666,7 @@ out: CERROR("adjust slave's qunit size failed!(rc:%d)\n", rc1); RETURN(rc1); } - if (err || (rc && rc != -EBUSY && rc1 == 0) || is_master(qctxt)) + if (err || (rc < 0 && rc != -EBUSY && rc1 == 0) || is_master(qctxt)) RETURN(err); /* reschedule another dqacq/dqrel if needed */ @@ -774,25 +776,56 @@ int check_qm(struct lustre_quota_ctxt *qctxt) RETURN(rc); } +/* wake up all waiting threads when lqc_import is NULL */ +void dqacq_interrupt(struct lustre_quota_ctxt *qctxt) +{ + struct lustre_qunit *qunit, *tmp; + int i; + ENTRY; + + spin_lock(&qunit_hash_lock); + for (i = 0; i < NR_DQHASH; i++) { + list_for_each_entry_safe(qunit, tmp, &qunit_hash[i], lq_hash) { + if (qunit->lq_ctxt != qctxt) + continue; + + /* Wake up all waiters. Do not change lq_state. + * The waiters will check lq_rc which is kept as 0 + * if no others change it, then the waiters will return + * -EAGAIN to caller who can perform related quota + * acq/rel if necessary. 
*/ + wake_up_all(&qunit->lq_waitq); + } + } + spin_unlock(&qunit_hash_lock); + EXIT; +} + static int got_qunit(struct lustre_qunit *qunit) { - int rc; + struct lustre_quota_ctxt *qctxt = qunit->lq_ctxt; + int rc = 0; ENTRY; spin_lock(&qunit->lq_lock); switch (qunit->lq_state) { case QUNIT_IN_HASH: case QUNIT_RM_FROM_HASH: - rc = 0; break; case QUNIT_FINISHED: rc = 1; break; default: - rc = 0; CERROR("invalid qunit state %d\n", qunit->lq_state); } spin_unlock(&qunit->lq_lock); + + if (!rc) { + spin_lock(&qctxt->lqc_lock); + rc = !qctxt->lqc_import || !qctxt->lqc_valid; + spin_unlock(&qctxt->lqc_lock); + } + RETURN(rc); } @@ -952,16 +985,14 @@ wait_completion: QDATA_DEBUG(p, "qunit(%p) is waiting for dqacq.\n", qunit); l_wait_event(qunit->lq_waitq, got_qunit(qunit), &lwi); - /* rc = -EAGAIN, it means a quota req is finished; + /* rc = -EAGAIN, it means the quota master isn't ready yet + * rc = QUOTA_REQ_RETURNED, it means a quota req is finished; * rc = -EDQUOT, it means out of quota * rc = -EBUSY, it means recovery is happening * other rc < 0, it means real errors, functions who call * schedule_dqacq should take care of this */ spin_lock(&qunit->lq_lock); - if (qunit->lq_rc == 0) - rc = -EAGAIN; - else - rc = qunit->lq_rc; + rc = qunit->lq_rc; spin_unlock(&qunit->lq_lock); CDEBUG(D_QUOTA, "qunit(%p) finishes waiting. 
(rc:%d)\n", qunit, rc); @@ -1057,10 +1088,7 @@ qctxt_wait_pending_dqacq(struct lustre_quota_ctxt *qctxt, unsigned int id, qunit, qunit->lq_rc); /* keep same as schedule_dqacq() b=17030 */ spin_lock(&qunit->lq_lock); - if (qunit->lq_rc == 0) - rc = -EAGAIN; - else - rc = qunit->lq_rc; + rc = qunit->lq_rc; spin_unlock(&qunit->lq_lock); /* this is for dqacq_in_flight() */ qunit_put(qunit); diff --git a/lustre/quota/quota_interface.c b/lustre/quota/quota_interface.c index 71bed54..2ef1b67 100644 --- a/lustre/quota/quota_interface.c +++ b/lustre/quota/quota_interface.c @@ -137,6 +137,7 @@ static int filter_quota_clearinfo(struct obd_export *exp, struct obd_device *obd spin_lock(&qctxt->lqc_lock); qctxt->lqc_import = NULL; spin_unlock(&qctxt->lqc_lock); + dqacq_interrupt(qctxt); CDEBUG(D_QUOTA, "%s: lqc_import of obd(%p) is invalid now.\n", obd->obd_name, obd); } @@ -380,7 +381,7 @@ static int quota_chk_acq_common(struct obd_device *obd, unsigned int uid, /* please reference to dqacq_completion for the below */ /* a new request is finished, try again */ - if (rc == -EAGAIN) { + if (rc == QUOTA_REQ_RETURNED) { CDEBUG(D_QUOTA, "finish a quota req, try again\n"); continue; } diff --git a/lustre/quota/quota_internal.h b/lustre/quota/quota_internal.h index e9073be..8856af3 100644 --- a/lustre/quota/quota_internal.h +++ b/lustre/quota/quota_internal.h @@ -113,6 +113,7 @@ int compute_remquota(struct obd_device *obd, struct lustre_quota_ctxt *qctxt, struct qunit_data *qdata, int isblk); int check_qm(struct lustre_quota_ctxt *qctxt); +void dqacq_interrupt(struct lustre_quota_ctxt *qctxt); /* quota_master.c */ int lustre_dquot_init(void); void lustre_dquot_exit(void); @@ -186,6 +187,8 @@ extern cfs_proc_dir_entry_t *lquota_type_proc_dir; #define LQS_INO_DECREASE 4 #define LQS_INO_INCREASE 8 +/* the return status of quota operation */ +#define QUOTA_REQ_RETURNED 1 #endif int client_quota_adjust_qunit(struct obd_export *exp, diff --git a/lustre/quota/quota_master.c 
b/lustre/quota/quota_master.c index 9629357..62b7127 100644 --- a/lustre/quota/quota_master.c +++ b/lustre/quota/quota_master.c @@ -552,8 +552,9 @@ int mds_quota_adjust(struct obd_device *obd, unsigned int qcids[], } if (rc2) - CDEBUG(rc2 == -EAGAIN ? D_QUOTA: D_ERROR, - "mds adjust qunit failed! (opc:%d rc:%d)\n", opc, rc2); + CDEBUG(rc2 == QUOTA_REQ_RETURNED ? D_QUOTA: D_ERROR, + "mds adjust qunit %ssuccessfully! (opc:%d rc:%d)\n", + rc2 == QUOTA_REQ_RETURNED ? "" : "un", opc, rc2); RETURN(0); } @@ -590,9 +591,9 @@ int filter_quota_adjust(struct obd_device *obd, unsigned int qcids[], if (rc || rc2) { if (!rc) rc = rc2; - CDEBUG(rc == -EAGAIN ? D_QUOTA: D_ERROR, - "filter adjust qunit failed! (opc:%d rc%d)\n", - opc, rc); + CDEBUG(rc == QUOTA_REQ_RETURNED ? D_QUOTA: D_ERROR, + "filter adjust qunit %ssuccessfully! (opc:%d rc%d)\n", + QUOTA_REQ_RETURNED ? "" : "un", opc, rc); } RETURN(0); diff --git a/lustre/tests/acceptance-small.sh b/lustre/tests/acceptance-small.sh index fa4a14b..912e184 100755 --- a/lustre/tests/acceptance-small.sh +++ b/lustre/tests/acceptance-small.sh @@ -61,7 +61,7 @@ setup_if_needed() { local MOUNTED=$(mounted_lustre_filesystems) if $(echo $MOUNTED | grep -w -q $MOUNT); then check_config $MOUNT - init_versions_vars + init_param_vars return fi diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index 443348e..60f880a 100644 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -501,7 +501,7 @@ test_18() { check_mount || return 41 echo "check journal size..." - local FOUNDSIZE=`do_facet mds "$$DEBUGFS -c -R 'stat <8>' $MDSDEV" | awk '/Size: / { print $NF; exit;}'` + local FOUNDSIZE=`do_facet mds "$DEBUGFS -c -R 'stat <8>' $MDSDEV" | awk '/Size: / { print $NF; exit;}'` if [ $FOUNDSIZE -gt $((32 * 1024 * 1024)) ]; then log "Success: mkfs creates large journals. 
Size: $((FOUNDSIZE >> 20))M" else @@ -771,29 +771,6 @@ test_26() { } run_test 26 "MDT startup failure cleans LOV (should return errs)" -wait_update () { - local node=$1 - local TEST=$2 - local FINAL=$3 - - local RESULT - local MAX=90 - local WAIT=0 - local sleep=5 - while [ $WAIT -lt $MAX ]; do - RESULT=$(do_node $node "$TEST") - if [ $RESULT -eq $FINAL ]; then - echo "Updated config after $WAIT sec: wanted $FINAL got $RESULT" - return 0 - fi - WAIT=$((WAIT + sleep)) - echo "Waiting $((MAX - WAIT)) secs for config update" - sleep $sleep - done - echo "Config update not seen after $MAX sec: wanted $FINAL got $RESULT" - return 3 -} - set_and_check() { local myfacet=$1 local TEST=$2 diff --git a/lustre/tests/mdsrate-create-large.sh b/lustre/tests/mdsrate-create-large.sh index 00ad399..b45dae1 100644 --- a/lustre/tests/mdsrate-create-large.sh +++ b/lustre/tests/mdsrate-create-large.sh @@ -13,9 +13,9 @@ MACHINEFILE=${MACHINEFILE:-$TMP/$(basename $0 .sh).machines} TESTDIR=$MOUNT # Requirements +# set NUM_FILES=0 to force TIME_PERIOD work +NUM_FILES=${NUM_FILES:-1000000} TIME_PERIOD=${TIME_PERIOD:-600} # seconds -SINGLE_TARGET_RATE=$((1300 / OSTCOUNT)) # ops/sec -AGGREGATE_TARGET_RATE=$((7000 / OSTCOUNT)) # ops/sec # Local test variables TESTDIR_SINGLE="${TESTDIR}/single" @@ -32,6 +32,11 @@ log "===== $0 ====== " check_and_setup_lustre +IFree=$(inodes_available) +if [ $IFree -lt $NUM_FILES ]; then + NUM_FILES=$IFree +fi + generate_machine_file $NODES_TO_USE $MACHINEFILE || error "can not generate machinefile" $LFS setstripe $TESTDIR -c -1 @@ -47,7 +52,7 @@ else echo "Running creates on 1 node(s)." COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --create --time ${TIME_PERIOD} - --dir ${TESTDIR_SINGLE} --filefmt 'f%%d'" + --nfiles ${NUM_FILES} --dir ${TESTDIR_SINGLE} --filefmt 'f%%d'" echo "+ ${COMMAND}" mpi_run -np 1 -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG} @@ -56,14 +61,11 @@ else error "mpirun ... mdsrate ... 
failed, aborting" fi - check_rate create ${SINGLE_TARGET_RATE} 1 ${LOG} || true - log "===== $0 ### 1 NODE UNLINK ###" echo "Running unlinks on 1 node(s)." - let NUM_FILES=${SINGLE_TARGET_RATE}\*${TIME_PERIOD} COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --unlink --time ${TIME_PERIOD} - --nfiles ${NUM_FILES} --dir ${TESTDIR_SINGLE} --filefmt 'f%%d'" + --nfiles ${NUM_FILES} --dir ${TESTDIR_SINGLE} --filefmt 'f%%d'" echo "+ ${COMMAND}" mpi_run -np 1 -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG} @@ -71,8 +73,11 @@ else [ -f $LOG ] && cat $LOG error "mpirun ... mdsrate ... failed, aborting" fi +fi - check_rate unlink ${SINGLE_TARGET_RATE} 1 ${LOG} || true +IFree=$(inodes_available) +if [ $IFree -lt $NUM_FILES ]; then + NUM_FILES=$IFree fi if [ -n "$NOMULTI" ]; then @@ -83,7 +88,7 @@ else echo "Running creates on ${NUM_CLIENTS} node(s)." COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --create --time ${TIME_PERIOD} - --dir ${TESTDIR_MULTI} --filefmt 'f%%d'" + --nfiles $NUM_FILES --dir ${TESTDIR_MULTI} --filefmt 'f%%d'" echo "+ ${COMMAND}" mpi_run -np ${NUM_CLIENTS} -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG} @@ -92,13 +97,10 @@ else error "mpirun ... mdsrate ... failed, aborting" fi - check_rate create ${AGGREGATE_TARGET_RATE} ${NUM_CLIENTS} ${LOG} || true - echo "Running unlinks on ${NUM_CLIENTS} node(s)." - let NUM_FILES=${AGGREGATE_TARGET_RATE}\*${TIME_PERIOD} COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --unlink --time ${TIME_PERIOD} - --nfiles ${NUM_FILES} --dir ${TESTDIR_MULTI} --filefmt 'f%%d'" + --nfiles ${NUM_FILES} --dir ${TESTDIR_MULTI} --filefmt 'f%%d'" echo "+ ${COMMAND}" mpi_run -np ${NUM_CLIENTS} -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG} @@ -107,12 +109,10 @@ else error "mpirun ... mdsrate ... 
failed, aborting" fi - check_rate unlink ${AGGREGATE_TARGET_RATE} ${NUM_CLIENTS} ${LOG} || true fi equals_msg `basename $0`: test complete, cleaning up rm -f $MACHINEFILE -zconf_umount_clients $NODES_TO_USE $MOUNT check_and_cleanup_lustre #rm -f $LOG diff --git a/lustre/tests/mdsrate-create-small.sh b/lustre/tests/mdsrate-create-small.sh index 0f42e5d..5455796 100644 --- a/lustre/tests/mdsrate-create-small.sh +++ b/lustre/tests/mdsrate-create-small.sh @@ -13,10 +13,8 @@ MACHINEFILE=${MACHINEFILE:-$TMP/$(basename $0 .sh).machines} TESTDIR=$MOUNT # Requirements -# The default number of stripes per file is set to 1 in test3/run_test.sh. +NUM_FILES=${NUM_FILES:-1000000} TIME_PERIOD=${TIME_PERIOD:-600} # seconds -SINGLE_TARGET_RATE=1400 # ops/sec -AGGREGATE_TARGET_RATE=10000 # ops/sec # Local test variables TESTDIR_SINGLE="${TESTDIR}/single" @@ -42,6 +40,11 @@ log "===== $0 ====== " check_and_setup_lustre +IFree=$(inodes_available) +if [ $IFree -lt $NUM_FILES ]; then + NUM_FILES=$IFree +fi + generate_machine_file $NODES_TO_USE $MACHINEFILE || error "can not generate machinefile" $LFS setstripe $TESTDIR -i 0 -c 1 @@ -59,7 +62,7 @@ else echo "Running creates on 1 node(s)." COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --create --time ${TIME_PERIOD} - --dir ${TESTDIR_SINGLE} --filefmt 'f%%d'" + --nfiles $NUM_FILES --dir ${TESTDIR_SINGLE} --filefmt 'f%%d'" echo "+ ${COMMAND}" mpi_run -np 1 -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG} @@ -67,7 +70,6 @@ else [ -f $LOG ] && cat $LOG error "mpirun ... mdsrate ... failed, aborting" fi - check_rate create ${SINGLE_TARGET_RATE} 1 ${LOG} || true fi if [ -n "$NOUNLINK" ]; then @@ -76,7 +78,6 @@ else log "===== $0 ### 1 NODE UNLINK ###" echo "Running unlinks on 1 node(s)." 
- let NUM_FILES=${SINGLE_TARGET_RATE}\*${TIME_PERIOD} COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --unlink --time ${TIME_PERIOD} --nfiles ${NUM_FILES} --dir ${TESTDIR_SINGLE} --filefmt 'f%%d'" echo "+ ${COMMAND}" @@ -86,10 +87,14 @@ else [ -f $LOG ] && cat $LOG error "mpirun ... mdsrate ... failed, aborting" fi - check_rate unlink ${SINGLE_TARGET_RATE} 1 ${LOG} || true fi fi +IFree=$(inodes_available) +if [ $IFree -lt $NUM_FILES ]; then + NUM_FILES=$IFree +fi + if [ -n "$NOMULTI" ]; then echo "NO tests on multiple nodes." else @@ -102,7 +107,7 @@ else echo "Running creates on ${NUM_CLIENTS} node(s) with $THREADS_PER_CLIENT threads per client." COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --create --time ${TIME_PERIOD} - --dir ${TESTDIR_MULTI} --filefmt 'f%%d'" + --nfiles $NUM_FILES --dir ${TESTDIR_MULTI} --filefmt 'f%%d'" echo "+ ${COMMAND}" mpi_run -np $((NUM_CLIENTS * THREADS_PER_CLIENT)) -machinefile ${MACHINEFILE} \ ${COMMAND} | tee ${LOG} @@ -110,7 +115,6 @@ else [ -f $LOG ] && cat $LOG error "mpirun ... mdsrate ... failed, aborting" fi - check_rate create ${AGGREGATE_TARGET_RATE} ${NUM_CLIENTS} ${LOG} || true fi if [ -n "$NOUNLINK" ]; then @@ -119,7 +123,6 @@ else log "===== $0 ### $NUM_CLIENTS NODES UNLINK ###" echo "Running unlinks on ${NUM_CLIENTS} node(s) with $THREADS_PER_CLIENT threads per client." - let NUM_FILES=${AGGREGATE_TARGET_RATE}\*${TIME_PERIOD} COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --unlink --time ${TIME_PERIOD} --nfiles ${NUM_FILES} --dir ${TESTDIR_MULTI} --filefmt 'f%%d'" echo "+ ${COMMAND}" @@ -129,13 +132,11 @@ else [ -f $LOG ] && cat $LOG error "mpirun ... mdsrate ... 
failed, aborting" fi - check_rate unlink ${AGGREGATE_TARGET_RATE} ${NUM_CLIENTS} ${LOG} || true fi fi equals_msg `basename $0`: test complete, cleaning up rm -f $MACHINEFILE -zconf_umount_clients $NODES_TO_USE $MOUNT check_and_cleanup_lustre #rm -f $LOG diff --git a/lustre/tests/mdsrate-lookup-1dir.sh b/lustre/tests/mdsrate-lookup-1dir.sh index eb5f497..3387a56 100644 --- a/lustre/tests/mdsrate-lookup-1dir.sh +++ b/lustre/tests/mdsrate-lookup-1dir.sh @@ -21,8 +21,6 @@ TESTDIR=$MOUNT # Requirements NUM_FILES=${NUM_FILES:-1000000} TIME_PERIOD=${TIME_PERIOD:-600} # seconds -SINGLE_TARGET_RATE=5900 # ops/sec -AGGREGATE_TARGET_RATE=62000 # ops/sec LOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh).log} CLIENT=$SINGLECLIENT @@ -37,6 +35,11 @@ log "===== $0 ====== " check_and_setup_lustre +IFree=$(inodes_available) +if [ $IFree -lt $NUM_FILES ]; then + NUM_FILES=$IFree +fi + generate_machine_file $NODES_TO_USE $MACHINEFILE || error "can not generate machinefile" $LFS setstripe $TESTDIR -c 1 @@ -78,7 +81,6 @@ else [ -f $LOG ] && cat $LOG error "mpirun ... mdsrate ... failed, aborting" fi - check_rate lookup ${SINGLE_TARGET_RATE} 1 ${LOG} || true fi # 2 @@ -94,12 +96,11 @@ else [ -f $LOG ] && cat $LOG error "mpirun ... mdsrate ... 
failed, aborting" fi - check_rate lookup ${AGGREGATE_TARGET_RATE} ${NUM_CLIENTS} ${LOG} || true fi equals_msg `basename $0`: test complete, cleaning up +mdsrate_cleanup $NUM_CLIENTS $MACHINEFILE $NUM_FILES $TESTDIR 'f%%d' rm -f $MACHINEFILE -zconf_umount_clients $NODES_TO_USE $MOUNT check_and_cleanup_lustre #rm -f $LOG diff --git a/lustre/tests/mdsrate-stat-large.sh b/lustre/tests/mdsrate-stat-large.sh index daadc40..a26ffc8 100644 --- a/lustre/tests/mdsrate-stat-large.sh +++ b/lustre/tests/mdsrate-stat-large.sh @@ -23,8 +23,6 @@ TESTDIR=$MOUNT # Requirements NUM_FILES=${NUM_FILES:-1000000} TIME_PERIOD=${TIME_PERIOD:-600} # seconds -SINGLE_TARGET_RATE=$((3300 / OSTCOUNT)) # ops/sec -AGGREGATE_TARGET_RATE=$((28500 / OSTCOUNT)) # ops/sec # --random_order (default) -OR- --readdir_order DIR_ORDER=${DIR_ORDER:-"--readdir_order"} @@ -42,6 +40,11 @@ log "===== $0 ====== " check_and_setup_lustre +IFree=$(inodes_available) +if [ $IFree -lt $NUM_FILES ]; then + NUM_FILES=$IFree +fi + generate_machine_file $NODES_TO_USE $MACHINEFILE || error "can not generate machinefile" $LFS setstripe $TESTDIR -c -1 @@ -86,7 +89,6 @@ else [ -f $LOG ] && cat $LOG error "mpirun ... mdsrate ... failed, aborting" fi - check_rate stat ${SINGLE_TARGET_RATE} 1 ${LOG} || true fi # 2 @@ -104,12 +106,11 @@ else [ -f $LOG ] && cat $LOG error "mpirun ... mdsrate ... 
failed, aborting" fi - check_rate stat ${AGGREGATE_TARGET_RATE} ${NUM_CLIENTS} ${LOG} || true fi equals_msg `basename $0`: test complete, cleaning up +mdsrate_cleanup $NUM_CLIENTS $MACHINEFILE $NUM_FILES $TESTDIR 'f%%d' rm -f $MACHINEFILE -zconf_umount_clients $NODES_TO_USE $MOUNT check_and_cleanup_lustre #rm -f $LOG diff --git a/lustre/tests/mdsrate-stat-small.sh b/lustre/tests/mdsrate-stat-small.sh index 1503416..f667ee6 100644 --- a/lustre/tests/mdsrate-stat-small.sh +++ b/lustre/tests/mdsrate-stat-small.sh @@ -23,8 +23,6 @@ TESTDIR=$MOUNT # Requirements NUM_FILES=${NUM_FILES:-1000000} TIME_PERIOD=${TIME_PERIOD:-600} # seconds -SINGLE_TARGET_RATE=3200 # ops/sec -AGGREGATE_TARGET_RATE=29000 # ops/sec # --random_order (default) -OR- --readdir_order DIR_ORDER=${DIR_ORDER:-"--readdir_order"} @@ -42,6 +40,11 @@ log "===== $0 ====== " check_and_setup_lustre +IFree=$(inodes_available) +if [ $IFree -lt $NUM_FILES ]; then + NUM_FILES=$IFree +fi + generate_machine_file $NODES_TO_USE $MACHINEFILE || error "can not generate machinefile" $LFS setstripe $TESTDIR -i 0 -c 1 @@ -86,7 +89,6 @@ else [ -f $LOG ] && cat $LOG error "mpirun ... mdsrate ... failed, aborting" fi - check_rate stat ${SINGLE_TARGET_RATE} 1 ${LOG} || true fi # 2 @@ -103,12 +105,11 @@ else [ -f $LOG ] && cat $LOG error "mpirun ... mdsrate ... 
failed, aborting" fi - check_rate stat ${AGGREGATE_TARGET_RATE} ${NUM_CLIENTS} ${LOG} || true fi equals_msg `basename $0`: test complete, cleaning up +mdsrate_cleanup $NUM_CLIENTS $MACHINEFILE $NUM_FILES $TESTDIR 'f%%d' rm -f $MACHINEFILE -zconf_umount_clients $NODES_TO_USE $MOUNT check_and_cleanup_lustre #rm -f $LOG diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 9efed77..41087cf 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -644,54 +644,65 @@ test_24() { # bug 11710 details correct fsync() behavior } run_test 24 "fsync error (should return error)" +wait_client_evicted () { + local facet=$1 + local exports=$2 + local varsvc=${facet}_svc + + wait_update $(facet_host $facet) "lctl get_param -n *.${!varsvc}.num_exports | cut -d' ' -f2" $((exports - 1)) $3 +} + test_26a() { # was test_26 bug 5921 - evict dead exports by pinger # this test can only run from a client on a separate node. remote_ost || { skip "local OST" && return 0; } remote_ost_nodsh && skip "remote OST with nodsh" && return 0 remote_mds || { skip "local MDS" && return 0; } - OST_FILE=obdfilter.${ost1_svc}.num_exports - OST_EXP="`do_facet ost1 lctl get_param -n $OST_FILE`" - OST_NEXP1=`echo $OST_EXP | cut -d' ' -f2` - echo starting with $OST_NEXP1 OST exports + + check_timeout || return 1 + + local OST_NEXP=$(do_facet ost1 lctl get_param -n obdfilter.${ost1_svc}.num_exports | cut -d' ' -f2) + + echo starting with $OST_NEXP OST exports # OBD_FAIL_PTLRPC_DROP_RPC 0x505 do_facet client lctl set_param fail_loc=0x505 # evictor takes PING_EVICT_TIMEOUT + 3 * PING_INTERVAL to evict. # But if there's a race to start the evictor from various obds, # the loser might have to wait for the next ping. 
- echo Waiting for $(($TIMEOUT * 8)) secs - sleep $(($TIMEOUT * 8)) - OST_EXP="`do_facet ost1 lctl get_param -n $OST_FILE`" - OST_NEXP2=`echo $OST_EXP | cut -d' ' -f2` - echo ending with $OST_NEXP2 OST exports + + local rc=0 + wait_client_evicted ost1 $OST_NEXP $((TIMEOUT * 2 + TIMEOUT * 3 / 4)) + rc=$? do_facet client lctl set_param fail_loc=0x0 - [ $OST_NEXP1 -le $OST_NEXP2 ] && error "client not evicted" - return 0 + [ $rc -eq 0 ] || error "client not evicted from OST" } run_test 26a "evict dead exports" test_26b() { # bug 10140 - evict dead exports by pinger remote_ost_nodsh && skip "remote OST with nodsh" && return 0 + check_timeout || return 1 client_df - zconf_mount `hostname` $MOUNT2 || error "Failed to mount $MOUNT2" - sleep 1 # wait connections being established - MDS_FILE=mdt.${mds1_svc}.num_exports - MDS_NEXP1="`do_facet $SINGLEMDS lctl get_param -n $MDS_FILE | cut -d' ' -f2`" - OST_FILE=obdfilter.${ost1_svc}.num_exports - OST_NEXP1="`do_facet ost1 lctl get_param -n $OST_FILE | cut -d' ' -f2`" - echo starting with $OST_NEXP1 OST and $MDS_NEXP1 MDS exports - zconf_umount `hostname` $MOUNT2 -f - # evictor takes PING_EVICT_TIMEOUT + 3 * PING_INTERVAL to evict. - # But if there's a race to start the evictor from various obds, - # the loser might have to wait for the next ping. 
- echo Waiting for $(($TIMEOUT * 3)) secs - sleep $(($TIMEOUT * 3)) - OST_NEXP2="`do_facet ost1 lctl get_param -n $OST_FILE | cut -d' ' -f2`" - MDS_NEXP2="`do_facet $SINGLEMDS lctl get_param -n $MDS_FILE | cut -d' ' -f2`" - echo ending with $OST_NEXP2 OST and $MDS_NEXP2 MDS exports - [ $OST_NEXP1 -le $OST_NEXP2 ] && error "client not evicted from OST" - [ $MDS_NEXP1 -le $MDS_NEXP2 ] && error "client not evicted from MDS" - return 0 + zconf_mount `hostname` $MOUNT2 || error "Failed to mount $MOUNT2" + sleep 1 # wait connections being established + + local MDS_NEXP=$(do_facet $SINGLEMDS lctl get_param -n mdt.${mds1_svc}.num_exports | cut -d' ' -f2) + local OST_NEXP=$(do_facet ost1 lctl get_param -n obdfilter.${ost1_svc}.num_exports | cut -d' ' -f2) + + echo starting with $OST_NEXP OST and $MDS_NEXP MDS exports + + zconf_umount `hostname` $MOUNT2 -f + + # evictor takes PING_EVICT_TIMEOUT + 3 * PING_INTERVAL to evict. + # But if there's a race to start the evictor from various obds, + # the loser might have to wait for the next ping. 
+ # PING_INTERVAL max(obd_timeout / 4, 1U) + # sleep (2*PING_INTERVAL) + + local rc=0 + wait_client_evicted ost1 $OST_NEXP $((TIMEOUT * 2 + TIMEOUT * 3 / 4)) || \ + error "Client was not evicted by ost" rc=1 + wait_client_evicted $SINGLEMDS $MDS_NEXP $((TIMEOUT * 2 + TIMEOUT * 3 / 4)) || \ + error "Client was not evicted by mds" } run_test 26b "evict dead exports" diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index 925a089..71d83a3 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -19,8 +19,8 @@ GRANT_CHECK_LIST=${GRANT_CHECK_LIST:-""} remote_mds_nodsh && log "SKIP: remote MDS with nodsh" && exit 0 # Skip these tests -# bug number: 17466 -ALWAYS_EXCEPT="61d $REPLAY_SINGLE_EXCEPT" +# bug number: 17466 15962 +ALWAYS_EXCEPT="61d 33b $REPLAY_SINGLE_EXCEPT" if [ "$FAILURE_MODE" = "HARD" ] && mixed_ost_devs; then CONFIG_EXCEPTIONS="0b 42 47 61a 61c" @@ -730,7 +730,7 @@ test_33a() { # was test_33 } run_test 33a "abort recovery before client does replay" -# Stale FID sequence +# Stale FID sequence bug 15962 test_33b() { # was test_33a replay_barrier $SINGLEMDS createmany -o $DIR/$tfile-%d 10 @@ -1112,6 +1112,8 @@ test_53a() { run_test 53a "|X| close request while two MDC requests in flight" test_53b() { + rm -rf $DIR/${tdir}-1 $DIR/${tdir}-2 + mkdir -p $DIR/${tdir}-1 mkdir -p $DIR/${tdir}-2 multiop $DIR/${tdir}-1/f O_c & @@ -1141,6 +1143,8 @@ test_53b() { run_test 53b "|X| open request while two MDC requests in flight" test_53c() { + rm -rf $DIR/${tdir}-1 $DIR/${tdir}-2 + mkdir -p $DIR/${tdir}-1 mkdir -p $DIR/${tdir}-2 multiop $DIR/${tdir}-1/f O_c & @@ -1172,6 +1176,8 @@ test_53c() { run_test 53c "|X| open request and close request while two MDC requests in flight" test_53d() { + rm -rf $DIR/${tdir}-1 $DIR/${tdir}-2 + mkdir -p $DIR/${tdir}-1 mkdir -p $DIR/${tdir}-2 multiop $DIR/${tdir}-1/f O_c & @@ -1198,6 +1204,8 @@ test_53d() { run_test 53d "|X| close reply while two MDC requests in flight" test_53e() { + rm 
-rf $DIR/${tdir}-1 $DIR/${tdir}-2 + mkdir -p $DIR/${tdir}-1 mkdir -p $DIR/${tdir}-2 multiop $DIR/${tdir}-1/f O_c & @@ -1227,6 +1235,8 @@ test_53e() { run_test 53e "|X| open reply while two MDC requests in flight" test_53f() { + rm -rf $DIR/${tdir}-1 $DIR/${tdir}-2 + mkdir -p $DIR/${tdir}-1 mkdir -p $DIR/${tdir}-2 multiop $DIR/${tdir}-1/f O_c & @@ -1258,6 +1268,8 @@ test_53f() { run_test 53f "|X| open reply and close reply while two MDC requests in flight" test_53g() { + rm -rf $DIR/${tdir}-1 $DIR/${tdir}-2 + mkdir -p $DIR/${tdir}-1 mkdir -p $DIR/${tdir}-2 multiop $DIR/${tdir}-1/f O_c & @@ -1289,6 +1301,8 @@ test_53g() { run_test 53g "|X| drop open reply and close request while close and open are both in flight" test_53h() { + rm -rf $DIR/${tdir}-1 $DIR/${tdir}-2 + mkdir -p $DIR/${tdir}-1 mkdir -p $DIR/${tdir}-2 multiop $DIR/${tdir}-1/f O_c & @@ -1513,10 +1527,31 @@ run_test 62 "don't mis-drop resent replay" #Adaptive Timeouts (bug 3055) AT_MAX_SET=0 -# Suppose that all osts have the same at_max -for facet in mds client ost; do - eval AT_MAX_SAVE_${facet}=$(at_max_get $facet) -done + +at_cleanup () { + local var + local facet + local at_new + + echo "Cleaning up AT ..." 
+ if [ -n "$ATOLDBASE" ]; then + local at_history=$(do_facet mds "find /sys/ -name at_history") + do_facet mds "echo $ATOLDBASE >> $at_history" || true + do_facet ost1 "echo $ATOLDBASE >> $at_history" || true + fi + + if [ $AT_MAX_SET -ne 0 ]; then + for facet in mds client ost; do + var=AT_MAX_SAVE_${facet} + echo restore AT on $facet to saved value ${!var} + at_max_set ${!var} $facet + at_new=$(at_max_get $facet) + echo Restored AT value on $facet $at_new + [ $at_new -eq ${!var} ] || \ + error "$facet : AT value was not restored SAVED ${!var} NEW $at_new" + done + fi +} at_start() { @@ -1526,8 +1561,15 @@ at_start() return 1 fi + # Save at_max original values + local facet + if [ $AT_MAX_SET -eq 0 ]; then + # Suppose that all osts have the same at_max + for facet in mds client ost; do + eval AT_MAX_SAVE_${facet}=$(at_max_get $facet) + done + fi local at_max - for facet in mds client ost; do at_max=$(at_max_get $facet) if [ $at_max -ne $at_max_new ]; then @@ -1736,24 +1778,7 @@ test_68 () #bug 13813 } run_test 68 "AT: verify slowing locks" -if [ -n "$ATOLDBASE" ]; then - at_history=$(do_facet mds "find /sys/ -name at_history") - do_facet mds "echo $ATOLDBASE >> $at_history" || true - do_facet ost1 "echo $ATOLDBASE >> $at_history" || true -fi - -if [ $AT_MAX_SET -ne 0 ]; then - for facet in mds client ost; do - var=AT_MAX_SAVE_${facet} - echo restore AT on $facet to saved value ${!var} - at_max_set ${!var} $facet - AT_NEW=$(at_max_get $facet) - echo Restored AT value on $facet $AT_NEW - [ $AT_NEW -ne ${!var} ] && \ - error "$facet : AT value was not restored SAVED ${!var} NEW $AT_NEW" - done -fi - +at_cleanup # end of AT tests includes above lines diff --git a/lustre/tests/sanity-gss.sh b/lustre/tests/sanity-gss.sh index 018c242..478f872 100644 --- a/lustre/tests/sanity-gss.sh +++ b/lustre/tests/sanity-gss.sh @@ -83,7 +83,7 @@ check_and_setup_lustre rm -rf $DIR/[df][0-9]* -check_runas_id $RUNAS_ID $RUNAS +check_runas_id $RUNAS_ID $RUNAS_ID $RUNAS build_test_filter 
@@ -647,27 +647,39 @@ run_test 7 "exercise enlarge_reqbuf()" test_8() { - sleep $TIMEOUT + local ATHISTORY=$(do_facet mds "find /sys/ -name at_history") + local ATOLDBASE=$(do_facet mds "cat $ATHISTORY") + do_facet mds "echo 8 >> $ATHISTORY" + $LCTL dk > /dev/null debugsave sysctl -w lnet.debug="+other" + mkdir -p $DIR/d8 + chmod a+w $DIR/d8 + + REQ_DELAY=`lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | + awk '/portal 12/ {print $5}' | tail -1` + REQ_DELAY=$((${REQ_DELAY} + ${REQ_DELAY} / 4 + 5)) + # sleep sometime in ctx handle - do_facet mds lctl set_param fail_val=30 + do_facet mds lctl set_param fail_val=$REQ_DELAY #define OBD_FAIL_SEC_CTX_HDL_PAUSE 0x1204 do_facet mds lctl set_param fail_loc=0x1204 $RUNAS $LFS flushctx || error "can't flush ctx" - $RUNAS df $DIR & - DFPID=$! - echo "waiting df (pid $TOUCHPID) to finish..." - sleep 2 # give df a chance to really trigger context init rpc + $RUNAS touch $DIR/d8/f & + TOUCHPID=$! + echo "waiting for touch (pid $TOUCHPID) to finish..." + sleep 2 # give it a chance to really trigger context init rpc do_facet mds sysctl -w lustre.fail_loc=0 - wait $DFPID || error "df should have succeeded" + wait $TOUCHPID || error "touch should have succeeded" $LCTL dk | grep "Early reply #" || error "No early reply" + debugrestore + do_facet mds "echo $ATOLDBASE >> $ATHISTORY" || true } run_test 8 "Early reply sent for slow gss context negotiation" @@ -676,98 +688,6 @@ run_test 8 "Early reply sent for slow gss context negotiation" # so each test should not assume any start flavor. 
# -test_50() { - local sample=$TMP/sanity-gss-8 - local tdir=$MOUNT/dir8 - local iosize="256K" - local hash_algs="adler32 crc32 md5 sha1 sha256 sha384 sha512 wp256 wp384 wp512" - - # create sample file with aligned size for direct i/o - dd if=/dev/zero of=$sample bs=$iosize count=1 || error - dd conv=notrunc if=/etc/termcap of=$sample bs=$iosize count=1 || error - - rm -rf $tdir - mkdir $tdir || error "create dir $tdir" - - restore_to_default_flavor - - for alg in $hash_algs; do - echo "Testing $alg..." - flavor=krb5i-bulki:$alg/null - set_rule $FSNAME any cli2ost $flavor - wait_flavor cli2ost $flavor $cnt_cli2ost - - dd if=$sample of=$tdir/$alg oflag=direct,dsync bs=$iosize || error "$alg write" - diff $sample $tdir/$alg || error "$alg read" - done - - rm -rf $tdir - rm -f $sample -} -run_test 50 "verify bulk hash algorithms works" - -test_51() { - local s1=$TMP/sanity-gss-9.1 - local s2=$TMP/sanity-gss-9.2 - local s3=$TMP/sanity-gss-9.3 - local s4=$TMP/sanity-gss-9.4 - local tdir=$MOUNT/dir9 - local s1_size=4194304 # n * pagesize (4M) - local s2_size=512 # n * blksize - local s3_size=111 # n * blksize + m - local s4_size=5 # m - local cipher_algs="arc4 aes128 aes192 aes256 cast128 cast256 twofish128 twofish256" - - # create sample files for each situation - rm -f $s1 $s2 $s2 $s4 - dd if=/dev/urandom of=$s1 bs=1M count=4 || error - dd if=/dev/urandom of=$s2 bs=$s2_size count=1 || error - dd if=/dev/urandom of=$s3 bs=$s3_size count=1 || error - dd if=/dev/urandom of=$s4 bs=$s4_size count=1 || error - - rm -rf $tdir - mkdir $tdir || error "create dir $tdir" - - restore_to_default_flavor - - # - # different bulk data alignment will lead to different behavior of - # the implementation: (n > 0; 0 < m < encryption_block_size) - # - full page i/o - # - partial page, size = n * encryption_block_size - # - partial page, size = n * encryption_block_size + m - # - partial page, size = m - # - for alg in $cipher_algs; do - echo "Testing $alg..." 
- flavor=krb5p-bulkp:sha1/$alg - set_rule $FSNAME any cli2ost $flavor - wait_flavor cli2ost $flavor $cnt_cli2ost - - # sync write - dd if=$s1 of=$tdir/$alg.1 oflag=dsync bs=1M || error "write $alg.1" - dd if=$s2 of=$tdir/$alg.2 oflag=dsync || error "write $alg.2" - dd if=$s3 of=$tdir/$alg.3 oflag=dsync || error "write $alg.3" - dd if=$s4 of=$tdir/$alg.4 oflag=dsync || error "write $alg.4" - - # remount client - umount_client $MOUNT - umount_client $MOUNT2 - mount_client $MOUNT - mount_client $MOUNT2 - - # read & compare - diff $tdir/$alg.1 $s1 || error "read $alg.1" - diff $tdir/$alg.2 $s2 || error "read $alg.2" - diff $tdir/$alg.3 $s3 || error "read $alg.3" - diff $tdir/$alg.4 $s4 || error "read $alg.4" - done - - rm -rf $tdir - rm -f $sample -} -run_test 51 "bulk data alignment test under encryption mode" - test_90() { if [ "$SLOW" = "no" ]; then total=10 diff --git a/lustre/tests/sanity-quota.sh b/lustre/tests/sanity-quota.sh index e1cb070..5e56c85 100644 --- a/lustre/tests/sanity-quota.sh +++ b/lustre/tests/sanity-quota.sh @@ -121,22 +121,24 @@ set_file_unitsz() { lustre_fail() { local fail_node=$1 local fail_loc=$2 - - case $fail_node in - "mds" ) - do_facet $SINGLEMDS "lctl set_param fail_loc=$fail_loc" ;; - "ost" ) - for num in `seq $OSTCOUNT`; do - do_facet ost$num "lctl set_param fail_loc=$fail_loc" - done ;; - "mds_ost" ) - do_facet $SINGLEMDS "lctl set_param fail_loc=$fail_loc" ; - for num in `seq $OSTCOUNT`; do - do_facet ost$num "lctl set_param fail_loc=$fail_loc" - done ;; - * ) echo "usage: lustre_fail fail_node fail_loc" ; - return 1 ;; - esac + local fail_val=${3:-0} + + if [ $fail_node == "mds" ] || [ $fail_node == "mds_ost" ]; then + if [ $((fail_loc & 0x10000000)) -ne 0 -a $fail_val -gt 0 ] || \ + [ $((fail_loc)) -eq 0 ]; then + do_facet $SINGLEMDS "lctl set_param fail_val=$fail_val" + fi + do_facet $SINGLEMDS "lctl set_param fail_loc=$fail_loc" + fi + if [ $fail_node == "ost" ] || [ $fail_node == "mds_ost" ]; then + for num in `seq $OSTCOUNT`; 
do + if [ $((fail_loc & 0x10000000)) -ne 0 -a $fail_val -gt 0 ] || \ + [ $((fail_loc)) -eq 0 ]; then + do_facet ost$num "lctl set_param fail_val=$fail_val" + fi + do_facet ost$num "lctl set_param fail_loc=$fail_loc" + done + fi } RUNAS="runas -u $TSTID -g $TSTID" @@ -968,11 +970,6 @@ test_12() { [ "$(grep $DIR2 /proc/mounts)" ] || mount_client $DIR2 || \ { skip "Need lustre mounted on $MOUNT2 " && retutn 0; } - if [ $OSTCOUNT -lt 2 ]; then - skip "$OSTCOUNT < 2, too few osts" - return 0; - fi - LIMIT=$(( $BUNIT_SZ * $(($OSTCOUNT + 1)) * 10)) # 10 bunits each sever TESTFILE="$DIR/$tdir/$tfile-0" TESTFILE2="$DIR2/$tdir/$tfile-1" @@ -984,11 +981,12 @@ test_12() { $LFS setstripe $TESTFILE -i 0 -c 1 chown $TSTUSR.$TSTUSR $TESTFILE - $LFS setstripe $TESTFILE2 -i 1 -c 1 + $LFS setstripe $TESTFILE2 -i 0 -c 1 chown $TSTUSR2.$TSTUSR2 $TESTFILE2 #define OBD_FAIL_OST_HOLD_WRITE_RPC 0x21f - lustre_fail ost 0x0000021f + #define OBD_FAIL_SOME 0x10000000 /* fail N times */ + lustre_fail ost $((0x0000021f | 0x10000000)) 1 echo " step1: write out of block quota ..." 
$RUNAS2 dd if=/dev/zero of=$TESTFILE2 bs=$BLK_SZ count=102400 & @@ -1785,24 +1783,25 @@ test_25_sub() { chmod 0777 $DIR/$tdir TESTFILE="$DIR/$tdir/$tfile-0" rm -f $TESTFILE + LIMIT=$(( $BUNIT_SZ * ($OSTCOUNT + 1) + 4096 )) wait_delete_completed # set quota for $TSTUSR log "setquota for $TSTUSR" - $LFS setquota $1 $TSTUSR -b 10240 -B 10240 -i 10 -I 10 $DIR + $LFS setquota $1 $TSTUSR -b $LIMIT -B $LIMIT -i 10 -I 10 $DIR sleep 3 show_quota $1 $TSTUSR # set quota for $TSTUSR2 log "setquota for $TSTUSR2" - $LFS setquota $1 $TSTUSR2 -b 10240 -B 10240 -i 10 -I 10 $DIR + $LFS setquota $1 $TSTUSR2 -b $LIMIT -B $LIMIT -i 10 -I 10 $DIR sleep 3 show_quota $1 $TSTUSR2 # set stripe index to 0 log "setstripe for $DIR/$tdir to 0" - $LFS setstripe $DIR/$tdir -i 0 + $LFS setstripe $DIR/$tdir -c 1 -i 0 MDS_UUID=`do_facet $SINGLEMDS $LCTL dl | grep -m1 " mdt " | awk '{print $((NF-1))}'` OST0_UUID=`do_facet ost1 $LCTL dl | grep -m1 obdfilter | awk '{print $((NF-1))}'` MDS_QUOTA_USED_OLD=`$LFS quota -o $MDS_UUID $1 $TSTUSR $DIR | awk '/^.*[[:digit:]+][[:space:]+]/ { print $4 }'` diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index fd20f97..568620d 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -8,7 +8,7 @@ set -e ONLY=${ONLY:-"$*"} # bug number for skipped test: 13297 2108 9789 3637 9789 3561 12622 12653 12653 5188 10764 16260 -ALWAYS_EXCEPT=" 27u 42a 42b 42c 42d 45 51d 65a 65e 68 75 119d $SANITY_EXCEPT" +ALWAYS_EXCEPT=" 27u 42a 42b 42c 42d 45 51d 65a 65e 68b 75 119d $SANITY_EXCEPT" # bug number for skipped test: 2108 9789 3637 9789 3561 5188/5749 1443 #ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"27m 42a 42b 42c 42d 45 68 76"} # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! @@ -3075,12 +3075,15 @@ LLOOP= cleanup_68() { trap 0 if [ ! 
-z "$LLOOP" ]; then - swapoff $LLOOP || error "swapoff failed" + if swapon -s | grep -q $LLOOP; then + swapoff $LLOOP || error "swapoff failed" + fi + $LCTL blockdev_detach $LLOOP || error "detach failed" rm -f $LLOOP unset LLOOP fi - rm -f $DIR/f68 + rm -f $DIR/f68* } meminfo() { @@ -3091,10 +3094,29 @@ swap_used() { swapon -s | awk '($1 == "'$1'") { print $4 }' } +# test case for lloop driver, basic function +test_68a() { + [ "$UID" != 0 ] && skip "must run as root" && return + + grep -q llite_lloop /proc/modules + [ $? -ne 0 ] && skip "can't find module llite_lloop" && return + + LLOOP=$TMP/lloop.`date +%s`.`date +%N` + dd if=/dev/zero of=$DIR/f68a bs=4k count=1024 + $LCTL blockdev_attach $DIR/f68a $LLOOP || error "attach failed" + + trap cleanup_68 EXIT + + directio rdwr $LLOOP 0 1024 4096 || error "direct write failed" + directio rdwr $LLOOP 0 1025 4096 && error "direct write should fail" + + cleanup_68 +} +run_test 68a "lloop driver - basic test ========================" # excercise swapping to lustre by adding a high priority swapfile entry # and then consuming memory until it is used. -test_68() { +test_68b() { # was test_68 [ "$UID" != 0 ] && skip "must run as root" && return lctl get_param -n devices | grep -q obdfilter && \ skip "local OST" && return @@ -3110,10 +3132,10 @@ test_68() { [[ $NR_BLOCKS -le 2048 ]] && NR_BLOCKS=2048 LLOOP=$TMP/lloop.`date +%s`.`date +%N` - dd if=/dev/zero of=$DIR/f68 bs=64k seek=$NR_BLOCKS count=1 - mkswap $DIR/f68 + dd if=/dev/zero of=$DIR/f68b bs=64k seek=$NR_BLOCKS count=1 + mkswap $DIR/f68b - $LCTL blockdev_attach $DIR/f68 $LLOOP || error "attach failed" + $LCTL blockdev_attach $DIR/f68b $LLOOP || error "attach failed" trap cleanup_68 EXIT @@ -3128,7 +3150,7 @@ test_68() { [ $SWAPUSED -eq 0 ] && echo "no swap used???" 
|| true } -run_test 68 "support swapping to Lustre ========================" +run_test 68b "support swapping to Lustre ========================" # bug5265, obdfilter oa2dentry return -ENOENT # #define OBD_FAIL_OST_ENOENT 0x217 @@ -3410,6 +3432,7 @@ setup_f77() { } test_77a() { # bug 10889 + $GSS && skip "could not run with gss" && return [ ! -f $F77_TMP ] && setup_f77 set_checksums 1 dd if=$F77_TMP of=$DIR/$tfile bs=1M count=$F77SZ || error "dd error" @@ -3419,6 +3442,7 @@ test_77a() { # bug 10889 run_test 77a "normal checksum read/write operation =============" test_77b() { # bug 10889 + $GSS && skip "could not run with gss" && return [ ! -f $F77_TMP ] && setup_f77 #define OBD_FAIL_OSC_CHECKSUM_SEND 0x409 lctl set_param fail_loc=0x80000409 @@ -3432,6 +3456,7 @@ test_77b() { # bug 10889 run_test 77b "checksum error on client write ====================" test_77c() { # bug 10889 + $GSS && skip "could not run with gss" && return [ ! -f $DIR/f77b ] && skip "requires 77b - skipping" && return set_checksums 1 for algo in $CKSUM_TYPES; do @@ -3448,6 +3473,7 @@ test_77c() { # bug 10889 run_test 77c "checksum error on client read ===================" test_77d() { # bug 10889 + $GSS && skip "could not run with gss" && return #define OBD_FAIL_OSC_CHECKSUM_SEND 0x409 lctl set_param fail_loc=0x80000409 set_checksums 1 @@ -3459,6 +3485,7 @@ test_77d() { # bug 10889 run_test 77d "checksum error on OST direct write ===============" test_77e() { # bug 10889 + $GSS && skip "could not run with gss" && return [ ! 
-f $DIR/f77 ] && skip "requires 77d - skipping" && return #define OBD_FAIL_OSC_CHECKSUM_RECEIVE 0x408 lctl set_param fail_loc=0x80000408 @@ -3472,6 +3499,7 @@ test_77e() { # bug 10889 run_test 77e "checksum error on OST direct read ================" test_77f() { # bug 10889 + $GSS && skip "could not run with gss" && return set_checksums 1 for algo in $CKSUM_TYPES; do cancel_lru_locks osc @@ -3488,6 +3516,7 @@ test_77f() { # bug 10889 run_test 77f "repeat checksum error on write (expect error) ====" test_77g() { # bug 10889 + $GSS && skip "could not run with gss" && return remote_ost_nodsh && skip "remote OST with nodsh" && return [ ! -f $F77_TMP ] && setup_f77 @@ -3504,6 +3533,7 @@ test_77g() { # bug 10889 run_test 77g "checksum error on OST write ======================" test_77h() { # bug 10889 + $GSS && skip "could not run with gss" && return remote_ost_nodsh && skip "remote OST with nodsh" && return [ ! -f $DIR/f77g ] && skip "requires 77g - skipping" && return @@ -3518,6 +3548,7 @@ test_77h() { # bug 10889 run_test 77h "checksum error on OST read =======================" test_77i() { # bug 13805 + $GSS && skip "could not run with gss" && return #define OBD_FAIL_OSC_CONNECT_CKSUM 0x40b lctl set_param fail_loc=0x40b remount_client $MOUNT @@ -3532,6 +3563,7 @@ test_77i() { # bug 13805 run_test 77i "client not supporting OSD_CONNECT_CKSUM ==========" test_77j() { # bug 13805 + $GSS && skip "could not run with gss" && return #define OBD_FAIL_OSC_CKSUM_ADLER_ONLY 0x40c lctl set_param fail_loc=0x40c remount_client $MOUNT @@ -3866,7 +3898,6 @@ setup_test102() { trap cleanup_test102 EXIT cd $DIR - # $1 = runas $1 $SETSTRIPE $tdir -s $STRIPE_SIZE -i $STRIPE_OFFSET -c $STRIPE_COUNT cd $DIR/$tdir for num in 1 2 3 4 @@ -3883,10 +3914,7 @@ setup_test102() { done cd $DIR - if [ "$TAR" == "tar" ]; then - TAR_OPTS="--xattrs" - fi - $1 $TAR cf $TMP/f102.tar $tdir $TAR_OPTS + $1 $TAR cf $TMP/f102.tar $tdir --xattrs SETUP_TEST102=yes } @@ -4047,70 +4075,35 @@ 
compare_stripe_info2() { } find_lustre_tar() { - [ -n "$(which star 2>/dev/null)" ] && strings $(which star) | grep -q lustre && echo star && return [ -n "$(which tar 2>/dev/null)" ] && strings $(which tar) | grep -q lustre && echo tar } test_102d() { - # b10930: (s)tar test for trusted.lov xattr + # b10930: tar test for trusted.lov xattr TAR=$(find_lustre_tar) - [ -z "$TAR" ] && skip "lustre-aware (s)tar is not installed" && return + [ -z "$TAR" ] && skip "lustre-aware tar is not installed" && return [ "$OSTCOUNT" -lt "2" ] && skip "skipping N-stripe test" && return setup_test102 mkdir -p $DIR/d102d - if [ "$TAR" == "tar" ]; then - TAR_OPTS="--xattrs" - fi - $TAR xf $TMP/f102.tar -C $DIR/d102d $TAR_OPTS + $TAR xf $TMP/f102.tar -C $DIR/d102d --xattrs cd $DIR/d102d/$tdir compare_stripe_info1 } -run_test 102d "(s)tar restore stripe info from tarfile,not keep osts ===========" - -test_102e() { - # b10930: star test for trusted.lov xattr - TAR=$(find_lustre_tar) - [ "$TAR" != star ] && skip "lustre-aware star is not installed" && return - [ "$OSTCOUNT" -lt "2" ] && skip "skipping N-stripe test" && return - setup_test102 - mkdir -p $DIR/d102e - star -x -preserve-osts f=$TMP/f102.tar -C $DIR/d102e - cd $DIR/d102e/$tdir - compare_stripe_info2 -} -run_test 102e "star restore stripe info from tarfile, keep osts ===========" +run_test 102d "tar restore stripe info from tarfile,not keep osts ===========" test_102f() { - # b10930: (s)tar test for trusted.lov xattr + # b10930: tar test for trusted.lov xattr TAR=$(find_lustre_tar) - [ -z "$TAR" ] && skip "lustre-aware (s)tar is not installed" && return + [ -z "$TAR" ] && skip "lustre-aware tar is not installed" && return [ "$OSTCOUNT" -lt "2" ] && skip "skipping N-stripe test" && return setup_test102 mkdir -p $DIR/d102f cd $DIR - if [ "$TAR" == "tar" ]; then - TAR_OPTS="--xattrs" - fi - $TAR cf - $TAR_OPTS . 
| $TAR xf - $TAR_OPTS -C $DIR/d102f + $TAR cf - --xattrs $tdir | $TAR xf - --xattrs -C $DIR/d102f cd $DIR/d102f/$tdir compare_stripe_info1 } -run_test 102f "(s)tar copy files, not keep osts ===========" - -test_102g() { - # b10930: star test for trusted.lov xattr - TAR=$(find_lustre_tar) - [ "$TAR" != star ] && skip "lustre-aware star is not installed" && return - [ "$OSTCOUNT" -lt "2" ] && skip "skipping N-stripe test" && return - setup_test102 - mkdir -p $DIR/d102g - cd $DIR - star -copy -preserve-osts $tdir $DIR/d102g - cd $DIR/d102g/$tdir - compare_stripe_info2 - cleanup_test102 -} -run_test 102g "star copy files, keep osts ===========" +run_test 102f "tar copy files, not keep osts ===========" test_102h() { # bug 15777 [ -z $(lctl get_param -n mdc.*.connect_flags | grep xattr) ] && @@ -4168,19 +4161,16 @@ run_test 102i "lgetxattr test on symbolic link ============" test_102j() { TAR=$(find_lustre_tar) - [ -z "$TAR" ] && skip "lustre-aware (s)tar is not installed" && return + [ -z "$TAR" ] && skip "lustre-aware tar is not installed" && return [ "$OSTCOUNT" -lt "2" ] && skip "skipping N-stripe test" && return setup_test102 "$RUNAS" mkdir -p $DIR/d102j chown $RUNAS_ID $DIR/d102j - if [ "$TAR" == "tar" ]; then - TAR_OPTS="--xattrs" - fi - $RUNAS $TAR xf $TMP/f102.tar -C $DIR/d102j $TAR_OPTS + $RUNAS $TAR xf $TMP/f102.tar -C $DIR/d102j --xattrs cd $DIR/d102j/$tdir compare_stripe_info1 "$RUNAS" } -run_test 102j "non-root (s)tar restore stripe info from tarfile,not keep osts =" +run_test 102j "non-root tar restore stripe info from tarfile, not keep osts ===" run_acl_subtest() { @@ -5127,7 +5117,7 @@ test_123a() { # was test 123, statahead(bug 11401) cancel_lru_locks mdc cancel_lru_locks osc stime=`date +%s` - time ls -l $DIR/$tdir > /dev/null + time ls -l $DIR/$tdir | wc -l etime=`date +%s` delta=$((etime - stime)) log "ls $i files without statahead: $delta sec" @@ -5138,10 +5128,10 @@ test_123a() { # was test 123, statahead(bug 11401) cancel_lru_locks mdc 
cancel_lru_locks osc stime=`date +%s` - time ls -l $DIR/$tdir > /dev/null + time ls -l $DIR/$tdir | wc -l etime=`date +%s` delta_sa=$((etime - stime)) - log "ls $i files with statahead: $delta_sa sec" + log "ls $i files with statahead: $delta_sa sec" lctl get_param -n llite.*.statahead_stats ewrong=`lctl get_param -n llite.*.statahead_stats | grep "statahead wrong:" | awk '{print $3}'` @@ -5149,13 +5139,41 @@ test_123a() { # was test 123, statahead(bug 11401) log "statahead was stopped, maybe too many locks held!" fi + [ $delta -eq 0 ] && continue + if [ $((delta_sa * 100)) -gt $((delta * 105)) ]; then if [ $SLOWOK -eq 0 ]; then error "ls $i files is slower with statahead!" + + max=`lctl get_param -n llite.*.statahead_max | head -n 1` + lctl set_param -n llite.*.statahead_max 0 + lctl get_param llite.*.statahead_max + cancel_lru_locks mdc + cancel_lru_locks osc + $LCTL dk > /dev/null + stime=`date +%s` + time ls -l $DIR/$tdir | wc -l + etime=`date +%s` + $LCTL dk > $TMP/sanity_test_123a_${i}_disable_${etime}.log + delta=$((etime - stime)) + log "ls $i files without statahead: $delta sec, dump to $TMP/sanity_test_123a_${i}_disable_${etime}.log" + lctl set_param llite.*.statahead_max=$max + + lctl get_param -n llite.*.statahead_max | grep '[0-9]' + cancel_lru_locks mdc + cancel_lru_locks osc + $LCTL dk > /dev/null + stime=`date +%s` + time ls -l $DIR/$tdir | wc -l + etime=`date +%s` + $LCTL dk > $TMP/sanity_test_123a_${i}_enable_${etime}.log + delta_sa=$((etime - stime)) + log "ls $i files with statahead: $delta_sa sec, dump to $TMP/sanity_test_123a_${i}_enable_${etime}.log" + lctl get_param -n llite.*.statahead_stats else log "ls $i files is slower with statahead!" fi - break; + break fi [ $delta -gt 20 ] && break @@ -5930,6 +5948,19 @@ err17935 () { fi } +test_154() { + cp /etc/hosts $DIR/$tfile + + fid=`$LFS path2fid $DIR/$tfile` + rc=$? + [ $rc -ne 0 ] && error "error: could not get fid for $DIR/$tfile." 
+ + diff $DIR/$tfile $DIR/.lustre/fid/$fid || error "open by fid failed: did not find expected data in file." + + echo "Opening a file by FID succeeded" +} +run_test 154 "Opening a file by FID" + #Changelogs test_160() { remote_mds && skip "remote MDS" && return diff --git a/lustre/tests/sanityN.sh b/lustre/tests/sanityN.sh index 983dc80..9008dce9 100644 --- a/lustre/tests/sanityN.sh +++ b/lustre/tests/sanityN.sh @@ -804,8 +804,6 @@ test_34() { #16129 echo writing on client1 dd if=/dev/zero of=$DIR1/$tfile count=100 conv=notrunc > /dev/null 2>&1 sync & - # wait for the flush - sleep 1 echo reading on client2 dd of=/dev/null if=$DIR2/$tfile > /dev/null 2>&1 # wait for a lock timeout @@ -884,6 +882,32 @@ test_35() { # bug 17645 } run_test 35 "-EINTR cp_ast vs. bl_ast race does not evict client" +test_36() { #bug 16417 + local SIZE + mkdir -p $MOUNT1/$tdir + lfs setstripe -c -1 $MOUNT1/$tdir + i=0 + SIZE=100 + + while [ $i -le 10 ]; do + lctl mark "start test" + before=$($LFS df | awk '{if ($1 ~/^filesystem/) {print $5; exit} }') + dd if=/dev/zero of=$MOUNT1/$tdir/file000 bs=1M count=$SIZE + dd if=$MOUNT2/$tdir/file000 of=/dev/null bs=1M count=$SIZE & + read_pid=$! + sleep 0.1 + rm -f $MOUNT1/$tdir/file000 + wait $read_pid + after=$($LFS df | awk '{if ($1 ~/^filesystem/) {print $5; exit} }') + if [ $before -gt $after ]; then + error "space leaked" + exit; + fi + let i=i+1 + done +} +run_test 36 "handle ESTALE/open-unlink corectly" + log "cleanup: ======================================================" check_and_cleanup_lustre diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 32516c1..07430ff 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -128,7 +128,7 @@ init_test_env() { export TUNEFS=${TUNEFS:-"$LUSTRE/utils/tunefs.lustre"} [ ! 
-f "$TUNEFS" ] && export TUNEFS=$(which tunefs.lustre) export CHECKSTAT="${CHECKSTAT:-"checkstat -v"} " - export FSYTPE=${FSTYPE:-"ldiskfs"} + export FSTYPE=${FSTYPE:-"ldiskfs"} export NAME=${NAME:-local} export LGSSD=${LGSSD:-"$LUSTRE/utils/gss/lgssd"} [ "$GSS_PIPEFS" = "true" ] && [ ! -f "$LGSSD" ] && \ @@ -252,6 +252,7 @@ load_modules() { load_module mgc/mgc if [ -z "$CLIENTONLY" ] && [ -z "$CLIENTMODSONLY" ]; then grep -q crc16 /proc/kallsyms || { modprobe crc16 2>/dev/null || true; } + grep -q jbd /proc/kallsyms || { modprobe jbd 2>/dev/null || true; } [ "$FSTYPE" = "ldiskfs" ] && load_module ../ldiskfs/ldiskfs/ldiskfs load_module mgs/mgs load_module mds/mds @@ -427,6 +428,10 @@ stop_gss_daemons() { init_gss() { if $GSS; then start_gss_daemons + + if [ -n "$LGSS_KEYRING_DEBUG" ]; then + echo $LGSS_KEYRING_DEBUG > /proc/fs/lustre/sptlrpc/gss/lgss_keyring/debug_level + fi fi } @@ -753,6 +758,34 @@ cleanup_check() { return 0 } +wait_update () { + local node=$1 + local TEST=$2 + local FINAL=$3 + local MAX=${4:-90} + + local RESULT + local WAIT=0 + local sleep=5 + while [ $WAIT -lt $MAX ]; do + sleep $sleep + RESULT=$(do_node $node "$TEST") + if [ $RESULT -eq $FINAL ]; then + echo "Updated after $WAIT sec: wanted $FINAL got $RESULT" + return 0 + fi + WAIT=$((WAIT + sleep)) + echo "Waiting $((MAX - WAIT)) secs for update" + done + echo "Update not seen after $MAX sec: wanted $FINAL got $RESULT" + return 3 +} + +wait_update_facet () { + local facet=$1 + wait_update $(facet_host $facet) $@ +} + wait_delete_completed () { local TOTALPREV=`lctl get_param -n osc.*.kbytesavail | \ awk 'BEGIN{total=0}; {total+=$1}; END{print total}'` @@ -772,14 +805,14 @@ wait_delete_completed () { } wait_for_host() { - HOST=$1 + local HOST=$1 check_network "$HOST" 900 while ! 
do_node $HOST "ls -d $LUSTRE " > /dev/null; do sleep 5; done } wait_for() { - facet=$1 - HOST=`facet_active_host $facet` + local facet=$1 + local HOST=`facet_active_host $facet` wait_for_host $HOST } @@ -788,8 +821,8 @@ wait_mds_recovery_done () { #define OBD_RECOVERY_TIMEOUT (obd_timeout * 5 / 2) # as we are in process of changing obd_timeout in different ways # let's set MAX longer than that - MAX=$(( timeout * 4 )) - WAIT=0 + local MAX=$(( timeout * 4 )) + local WAIT=0 while [ $WAIT -lt $MAX ]; do STATUS=`do_facet $SINGLEMDS "lctl get_param -n mdt.*-MDT0000.recovery_status | grep status"` echo $STATUS | grep COMPLETE && return 0 @@ -876,8 +909,8 @@ client_reconnect() { } facet_failover() { - facet=$1 - sleep_time=$2 + local facet=$1 + local sleep_time=$2 echo "Failing $facet on node `facet_active_host $facet`" shutdown_facet $facet [ -n "$sleep_time" ] && sleep $sleep_time @@ -1292,16 +1325,6 @@ remount_client() zconf_mount `hostname` $1 || error "mount failed" } -set_obd_timeout() { - local facet=$1 - local timeout=$2 - - do_facet $facet lsmod | grep -q obdclass || \ - do_facet $facet "modprobe obdclass" - - do_facet $facet "lctl set_param timeout=$timeout" -} - writeconf_facet () { local facet=$1 local dev=$2 @@ -1330,7 +1353,6 @@ setupall() { writeconf_all for num in `seq $MDSCOUNT`; do DEVNAME=$(mdsdevname $num) - set_obd_timeout mds$num $TIMEOUT start mds$num $DEVNAME $MDS_MOUNT_OPTS # We started mds, now we should set failover variables properly. @@ -1346,7 +1368,6 @@ setupall() { done for num in `seq $OSTCOUNT`; do DEVNAME=$(ostdevname $num) - set_obd_timeout ost$num $TIMEOUT start ost$num $DEVNAME $OST_MOUNT_OPTS # We started ost$num, now we should set ost${num}failover variable properly. @@ -1371,7 +1392,7 @@ setupall() { [ -n "$CLIENTS" ] && zconf_mount_clients $CLIENTS $MOUNT2 fi - init_versions_vars + init_param_vars # by remounting mdt before ost, initial connect from mdt to ost might # timeout because ost is not ready yet. 
wait some time to its fully @@ -1425,10 +1446,13 @@ init_facets_vars () { done } -init_versions_vars () { +init_param_vars () { export MDSVER=$(do_facet $SINGLEMDS "lctl get_param version" | cut -d. -f1,2) export OSTVER=$(do_facet ost1 "lctl get_param version" | cut -d. -f1,2) export CLIVER=$(lctl get_param version | cut -d. -f 1,2) + + TIMEOUT=$(do_facet $SINGLEMDS "lctl get_param -n timeout") + log "Using TIMEOUT=$TIMEOUT" } check_config () { @@ -1449,6 +1473,15 @@ check_config () { fi } +check_timeout () { + local mdstimeout=$(do_facet $SINGLEMDS "lctl get_param -n timeout") + local cltimeout=$(lctl get_param -n timeout) + if [ $mdstimeout -ne $TIMEOUT ] || [ $mdstimeout -ne $cltimeout ]; then + error "timeouts are wrong! mds: $mdstimeout, client: $cltimeout, TIMEOUT=$TIMEOUT" + return 1 + fi +} + check_and_setup_lustre() { local MOUNTED=$(mounted_lustre_filesystems) if [ -z "$MOUNTED" ] || ! $(echo $MOUNTED | grep -w -q $MOUNT); then @@ -1460,7 +1493,7 @@ check_and_setup_lustre() { else check_config $MOUNT init_facets_vars - init_versions_vars + init_param_vars fi if [ "$ONLY" == "setup" ]; then exit 0 @@ -2231,25 +2264,9 @@ multiop_bg_pause() { return 0 } -check_rate() { - local OP=$1 - local TARGET_RATE=$2 - local NUM_CLIENTS=$3 - local LOG=$4 - - local RATE=$(awk '/^Rate: [0-9\.]+ '"${OP}"'s\/sec/ { print $2}' ${LOG}) - - # We need to use bc since the rate is a floating point number - local RES=$(echo "${RATE} < ${TARGET_RATE}" | bc -l ) - if [ "${RES}" = 0 ]; then - echo "Success: ${RATE} ${OP}s/sec met target rate" \ - "${TARGET_RATE} ${OP}s/sec for ${NUM_CLIENTS} client(s)." - return 0 - else - echo "Failure: ${RATE} ${OP}s/sec did not meet target rate" \ - "${TARGET_RATE} ${OP}s/sec for ${NUM_CLIENTS} client(s)." 
- return 1 - fi +inodes_available () { + local IFree=$($LFS df -i $MOUNT | grep ^$FSNAME | awk '{print $4}' | sort -un | head -1) || return 1 + echo $IFree } # reset llite stat counters @@ -2369,3 +2386,7 @@ mpi_run () { eval $command } +mdsrate_cleanup () { + mpi_run -np $1 -machinefile $2 ${MDSRATE} --unlink --nfiles $3 --dir $4 --filefmt $5 +} + diff --git a/lustre/utils/liblustreapi.c b/lustre/utils/liblustreapi.c index 3927ca6..e0d8e1d 100644 --- a/lustre/utils/liblustreapi.c +++ b/lustre/utils/liblustreapi.c @@ -1525,8 +1525,8 @@ static int cb_find_init(char *path, DIR *parent, DIR *dir, } obd_matches: - /* If file still fits the request, ask osd for updated info. - The regulat stat is almost of the same speed as some new + /* If file still fits the request, ask ost for updated info. + The regular stat is almost of the same speed as some new 'glimpse-size-ioctl'. */ if (!decision && S_ISREG(st->st_mode) && (param->lmd->lmd_lmm.lmm_stripe_count || param->size)) { diff --git a/lustre/utils/obd.c b/lustre/utils/obd.c index a408a9d..8945f11 100644 --- a/lustre/utils/obd.c +++ b/lustre/utils/obd.c @@ -103,9 +103,7 @@ const int thread = 0; const int nthreads = 1; #endif -static char rawbuf[8192]; -static char *buf = rawbuf; -static int max = sizeof(rawbuf); +#define MAX_IOC_BUFLEN 8192 static int cur_device = -1; @@ -122,42 +120,25 @@ static int l2_ioctl(int dev_id, int opc, void *buf) return l_ioctl(dev_id, opc, buf); } -#define IOC_INIT(data) \ -do { \ - memset(&data, 0, sizeof(data)); \ - data.ioc_dev = cur_device; \ -} while (0) - -#define IOC_PACK(func, data) \ -do { \ - memset(buf, 0, sizeof(rawbuf)); \ - if (obd_ioctl_pack(&data, &buf, max)) { \ - fprintf(stderr, "error: %s: invalid ioctl\n", \ - jt_cmdname(func)); \ - return -2; \ - } \ -} while (0) - -#define IOC_UNPACK(func, data) \ -do { \ - if (obd_ioctl_unpack(&data, buf, max)) { \ - fprintf(stderr, "error: %s: invalid reply\n", \ - jt_cmdname(func)); \ - return -2; \ - } \ -} while (0) - int 
lcfg_ioctl(char * func, int dev_id, struct lustre_cfg *lcfg) { struct obd_ioctl_data data; + char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf; int rc; - IOC_INIT(data); + memset(&data, 0x00, sizeof(data)); + data.ioc_dev = cur_device; data.ioc_type = LUSTRE_CFG_TYPE; data.ioc_plen1 = lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens); data.ioc_pbuf1 = (void *)lcfg; - IOC_PACK(func, data); + memset(buf, 0, sizeof(rawbuf)); + rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf)); + if (rc) { + fprintf(stderr, "error: %s: invalid ioctl\n", + jt_cmdname(func)); + return rc; + } rc = l_ioctl(dev_id, OBD_IOC_PROCESS_CFG, buf); @@ -190,9 +171,10 @@ static int get_mgs_device() int lcfg_mgs_ioctl(char *func, int dev_id, struct lustre_cfg *lcfg) { struct obd_ioctl_data data; + char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf; int rc; - IOC_INIT(data); + memset(&data, 0x00, sizeof(data)); rc = data.ioc_dev = get_mgs_device(); if (rc < 0) goto out; @@ -200,7 +182,13 @@ int lcfg_mgs_ioctl(char *func, int dev_id, struct lustre_cfg *lcfg) data.ioc_plen1 = lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens); data.ioc_pbuf1 = (void *)lcfg; - IOC_PACK(func, data); + memset(buf, 0, sizeof(rawbuf)); + rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf)); + if (rc) { + fprintf(stderr, "error: %s: invalid ioctl\n", + jt_cmdname(func)); + return rc; + } rc = l_ioctl(dev_id, OBD_IOC_PARAM, buf); out: @@ -234,18 +222,30 @@ char *obdo_print(struct obdo *obd) static int do_name2dev(char *func, char *name) { struct obd_ioctl_data data; + char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf; int rc; - IOC_INIT(data); - + memset(&data, 0x00, sizeof(data)); + data.ioc_dev = cur_device; data.ioc_inllen1 = strlen(name) + 1; data.ioc_inlbuf1 = name; - IOC_PACK(func, data); + memset(buf, 0, sizeof(rawbuf)); + rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf)); + if (rc) { + fprintf(stderr, "error: %s: invalid ioctl\n", + jt_cmdname(func)); + return rc; + } rc = l2_ioctl(OBD_DEV_ID, OBD_IOC_NAME2DEV, buf); if (rc < 0) 
return errno; - IOC_UNPACK(func, data); + rc = obd_ioctl_unpack(&data, buf, sizeof(rawbuf)); + if (rc) { + fprintf(stderr, "error: %s: invalid reply\n", + jt_cmdname(func)); + return rc; + } return data.ioc_dev + N2D_OFF; } @@ -272,7 +272,7 @@ int parse_devname(char *func, char *name) // printf("Name %s is device %d\n", name, ret); } else { fprintf(stderr, "No device found for name %s: %s\n", - name, strerror(rc)); + name, strerror(rc)); } } return ret; @@ -820,14 +820,22 @@ int jt_opt_net(int argc, char **argv) int jt_obd_no_transno(int argc, char **argv) { struct obd_ioctl_data data; + char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf; int rc; - IOC_INIT(data); + memset(&data, 0x00, sizeof(data)); + data.ioc_dev = cur_device; if (argc != 1) return CMD_HELP; - IOC_PACK(argv[0], data); + memset(buf, 0, sizeof(rawbuf)); + rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf)); + if (rc) { + fprintf(stderr, "error: %s: invalid ioctl\n", + jt_cmdname(argv[0])); + return rc; + } rc = l2_ioctl(OBD_DEV_ID, OBD_IOC_NO_TRANSNO, buf); if (rc < 0) fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]), @@ -839,14 +847,22 @@ int jt_obd_no_transno(int argc, char **argv) int jt_obd_set_readonly(int argc, char **argv) { struct obd_ioctl_data data; + char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf; int rc; - IOC_INIT(data); + memset(&data, 0x00, sizeof(data)); + data.ioc_dev = cur_device; if (argc != 1) return CMD_HELP; - IOC_PACK(argv[0], data); + memset(buf, 0, sizeof(rawbuf)); + rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf)); + if (rc) { + fprintf(stderr, "error: %s: invalid ioctl\n", + jt_cmdname(argv[0])); + return rc; + } rc = l2_ioctl(OBD_DEV_ID, OBD_IOC_SET_READONLY, buf); if (rc < 0) fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]), @@ -858,14 +874,22 @@ int jt_obd_set_readonly(int argc, char **argv) int jt_obd_abort_recovery(int argc, char **argv) { struct obd_ioctl_data data; + char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf; int rc; - IOC_INIT(data); + memset(&data, 0x00, 
sizeof(data)); + data.ioc_dev = cur_device; if (argc != 1) return CMD_HELP; - IOC_PACK(argv[0], data); + memset(buf, 0, sizeof(rawbuf)); + rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf)); + if (rc) { + fprintf(stderr, "error: %s: invalid ioctl\n", + jt_cmdname(argv[0])); + return rc; + } rc = l2_ioctl(OBD_DEV_ID, OBD_IOC_ABORT_RECOVERY, buf); if (rc < 0) fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]), @@ -877,15 +901,15 @@ int jt_obd_abort_recovery(int argc, char **argv) int jt_get_version(int argc, char **argv) { int rc; - char buf[8192]; + char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf; struct obd_ioctl_data *data = (struct obd_ioctl_data *)buf; if (argc != 1) return CMD_HELP; - memset(buf, 0, sizeof(buf)); + memset(buf, 0, sizeof(rawbuf)); data->ioc_version = OBD_IOCTL_VERSION; - data->ioc_inllen1 = sizeof(buf) - size_round(sizeof(*data)); + data->ioc_inllen1 = sizeof(rawbuf) - size_round(sizeof(*data)); data->ioc_inlbuf1 = buf + size_round(sizeof(*data)); data->ioc_len = obd_ioctl_packlen(data); @@ -950,7 +974,7 @@ fail: int jt_obd_list_ioctl(int argc, char **argv) { int rc, index; - char buf[8192]; + char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf; struct obd_ioctl_data *data = (struct obd_ioctl_data *)buf; if (argc > 2) @@ -960,9 +984,9 @@ int jt_obd_list_ioctl(int argc, char **argv) return CMD_HELP; for (index = 0;; index++) { - memset(buf, 0, sizeof(buf)); + memset(buf, 0, sizeof(rawbuf)); data->ioc_version = OBD_IOCTL_VERSION; - data->ioc_inllen1 = sizeof(buf) - size_round(sizeof(*data)); + data->ioc_inllen1 = sizeof(rawbuf) - size_round(sizeof(*data)); data->ioc_inlbuf1 = buf + size_round(sizeof(*data)); data->ioc_len = obd_ioctl_packlen(data); data->ioc_count = index; @@ -978,8 +1002,7 @@ int jt_obd_list_ioctl(int argc, char **argv) rc = 0; else fprintf(stderr, "Error getting device list: %s: " - "check dmesg.\n", - strerror(errno)); + "check dmesg.\n", strerror(errno)); } return rc; } @@ -1017,9 +1040,6 @@ int jt_obd_list(int argc, char **argv) return 
0; } - - - /* Create one or more objects, arg[4] may describe stripe meta-data. If * not, defaults assumed. This echo-client instance stashes the stripe * object ids. Use get_stripe on this node to print full lsm and @@ -1028,13 +1048,15 @@ int jt_obd_list(int argc, char **argv) /* create [] [q|v|# verbosity] [striping] */ int jt_obd_create(int argc, char **argv) { + char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf; struct obd_ioctl_data data; struct timeval next_time; __u64 count = 1, next_count, base_id = 0; int verbose = 1, mode = 0100644, rc = 0, i, valid_lsm = 0; char *end; - IOC_INIT(data); + memset(&data, 0x00, sizeof(data)); + data.ioc_dev = cur_device; if (argc < 2 || argc > 5) return CMD_HELP; @@ -1092,9 +1114,15 @@ int jt_obd_create(int argc, char **argv) data.ioc_pbuf1 = (char *)&lsm_buffer; } - IOC_PACK(argv[0], data); + memset(buf, 0, sizeof(rawbuf)); + rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf)); + if (rc) { + fprintf(stderr, "error: %s: invalid ioctl\n", + jt_cmdname(argv[0])); + return rc; + } rc = l2_ioctl(OBD_DEV_ID, OBD_IOC_CREATE, buf); - IOC_UNPACK(argv[0], data); + obd_ioctl_unpack(&data, buf, sizeof(rawbuf)); shmem_bump(); if (rc < 0) { fprintf(stderr, "error: %s: #%d - %s\n", @@ -1118,10 +1146,12 @@ int jt_obd_create(int argc, char **argv) int jt_obd_setattr(int argc, char **argv) { struct obd_ioctl_data data; + char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf; char *end; int rc; - IOC_INIT(data); + memset(&data, 0x00, sizeof(data)); + data.ioc_dev = cur_device; if (argc != 2) return CMD_HELP; @@ -1139,7 +1169,13 @@ int jt_obd_setattr(int argc, char **argv) } data.ioc_obdo1.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMODE; - IOC_PACK(argv[0], data); + memset(buf, 0, sizeof(rawbuf)); + rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf)); + if (rc) { + fprintf(stderr, "error: %s: invalid ioctl\n", + jt_cmdname(argv[0])); + return rc; + } rc = l2_ioctl(OBD_DEV_ID, OBD_IOC_SETATTR, buf); if (rc < 0) fprintf(stderr, "error: %s: %s\n", 
jt_cmdname(argv[0]), @@ -1153,6 +1189,7 @@ int jt_obd_test_setattr(int argc, char **argv) struct obd_ioctl_data data; struct timeval start, next_time; __u64 i, count, next_count; + char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf; int verbose = 1; obd_id objid = 3; char *end; @@ -1161,7 +1198,8 @@ int jt_obd_test_setattr(int argc, char **argv) if (argc < 2 || argc > 4) return CMD_HELP; - IOC_INIT(data); + memset(&data, 0x00, sizeof(data)); + data.ioc_dev = cur_device; count = strtoull(argv[1], &end, 0); if (*end) { fprintf(stderr, "error: %s: invalid iteration count '%s'\n", @@ -1200,7 +1238,13 @@ int jt_obd_test_setattr(int argc, char **argv) data.ioc_obdo1.o_id = objid; data.ioc_obdo1.o_mode = S_IFREG; data.ioc_obdo1.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMODE; - IOC_PACK(argv[0], data); + memset(buf, 0x00, sizeof(rawbuf)); + rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf)); + if (rc) { + fprintf(stderr, "error: %s: invalid ioctl\n", + jt_cmdname(argv[0])); + return rc; + } rc = l2_ioctl(OBD_DEV_ID, OBD_IOC_SETATTR, &data); shmem_bump(); if (rc < 0) { @@ -1236,13 +1280,15 @@ int jt_obd_destroy(int argc, char **argv) { struct obd_ioctl_data data; struct timeval next_time; + char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf; __u64 count = 1, next_count; int verbose = 1; __u64 id; char *end; int rc = 0, i; - IOC_INIT(data); + memset(&data, 0x00, sizeof(data)); + data.ioc_dev = cur_device; if (argc < 2 || argc > 4) return CMD_HELP; @@ -1277,9 +1323,15 @@ int jt_obd_destroy(int argc, char **argv) data.ioc_obdo1.o_mode = S_IFREG | 0644; data.ioc_obdo1.o_valid = OBD_MD_FLID | OBD_MD_FLMODE; - IOC_PACK(argv[0], data); + memset(buf, 0, sizeof(rawbuf)); + rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf)); + if (rc) { + fprintf(stderr, "error: %s: invalid ioctl\n", + jt_cmdname(argv[0])); + return rc; + } rc = l2_ioctl(OBD_DEV_ID, OBD_IOC_DESTROY, buf); - IOC_UNPACK(argv[0], data); + obd_ioctl_unpack(&data, buf, sizeof(rawbuf)); shmem_bump(); if (rc < 0) { fprintf(stderr, 
"error: %s: objid "LPX64": %s\n", @@ -1298,13 +1350,15 @@ int jt_obd_destroy(int argc, char **argv) int jt_obd_getattr(int argc, char **argv) { struct obd_ioctl_data data; + char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf; char *end; int rc; if (argc != 2) return CMD_HELP; - IOC_INIT(data); + memset(&data, 0x00, sizeof(data)); + data.ioc_dev = cur_device; data.ioc_obdo1.o_id = strtoull(argv[1], &end, 0); if (*end) { fprintf(stderr, "error: %s: invalid objid '%s'\n", @@ -1316,9 +1370,15 @@ int jt_obd_getattr(int argc, char **argv) data.ioc_obdo1.o_valid = 0xffffffff; printf("%s: object id "LPX64"\n", jt_cmdname(argv[0]),data.ioc_obdo1.o_id); - IOC_PACK(argv[0], data); + memset(buf, 0, sizeof(rawbuf)); + rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf)); + if (rc) { + fprintf(stderr, "error: %s: invalid ioctl\n", + jt_cmdname(argv[0])); + return rc; + } rc = l2_ioctl(OBD_DEV_ID, OBD_IOC_GETATTR, buf); - IOC_UNPACK(argv[0], data); + obd_ioctl_unpack(&data, buf, sizeof(rawbuf)); if (rc) { fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]), strerror(rc = errno)); @@ -1333,6 +1393,7 @@ int jt_obd_test_getattr(int argc, char **argv) { struct obd_ioctl_data data; struct timeval start, next_time; + char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf; __u64 i, count, next_count; int verbose = 1; obd_id objid = 3; @@ -1342,7 +1403,8 @@ int jt_obd_test_getattr(int argc, char **argv) if (argc < 2 || argc > 4) return CMD_HELP; - IOC_INIT(data); + memset(&data, 0x00, sizeof(data)); + data.ioc_dev = cur_device; count = strtoull(argv[1], &end, 0); if (*end) { fprintf(stderr, "error: %s: invalid iteration count '%s'\n", @@ -1381,7 +1443,13 @@ int jt_obd_test_getattr(int argc, char **argv) data.ioc_obdo1.o_id = objid; data.ioc_obdo1.o_mode = S_IFREG; data.ioc_obdo1.o_valid = 0xffffffff; - IOC_PACK(argv[0], data); + memset(buf, 0x00, sizeof(rawbuf)); + rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf)); + if (rc) { + fprintf(stderr, "error: %s: invalid ioctl\n", + jt_cmdname(argv[0])); + return 
rc; + } rc = l2_ioctl(OBD_DEV_ID, OBD_IOC_GETATTR, &data); shmem_bump(); if (rc < 0) { @@ -1423,6 +1491,7 @@ int jt_obd_test_brw(int argc, char **argv) { struct obd_ioctl_data data; struct timeval start, next_time; + char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf; __u64 count, next_count, len, stride, thr_offset = 0, objid = 3; int write = 0, verbose = 1, cmd, i, rc = 0, pages = 1; int offset_pages = 0; @@ -1513,7 +1582,8 @@ int jt_obd_test_brw(int argc, char **argv) } } - IOC_INIT(data); + memset(&data, 0x00, sizeof(data)); + data.ioc_dev = cur_device; /* communicate the 'type' of brw test and batching to echo_client. * don't start. we'd love to refactor this lctl->echo_client @@ -1598,7 +1668,13 @@ int jt_obd_test_brw(int argc, char **argv) cmd = write ? OBD_IOC_BRW_WRITE : OBD_IOC_BRW_READ; for (i = 1, next_count = verbose; i <= count && shmem_running(); i++) { data.ioc_obdo1.o_valid &= ~(OBD_MD_FLBLOCKS|OBD_MD_FLGRANT); - IOC_PACK(argv[0], data); + memset(buf, 0x00, sizeof(rawbuf)); + rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf)); + if (rc) { + fprintf(stderr, "error: %s: invalid ioctl\n", + jt_cmdname(argv[0])); + return rc; + } rc = l2_ioctl(OBD_DEV_ID, cmd, buf); shmem_bump(); if (rc) { @@ -1658,11 +1734,13 @@ int jt_obd_lov_getconfig(int argc, char **argv) struct obd_ioctl_data data; struct lov_desc desc; struct obd_uuid *uuidarray; + char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf; __u32 *obdgens; char *path; int rc, fd; - IOC_INIT(data); + memset(&data, 0x00, sizeof(data)); + data.ioc_dev = cur_device; if (argc != 2) return CMD_HELP; @@ -1680,7 +1758,6 @@ int jt_obd_lov_getconfig(int argc, char **argv) desc.ld_tgt_count = ((OBD_MAX_IOCTL_BUFFER-sizeof(data)-sizeof(desc)) / (sizeof(*uuidarray) + sizeof(*obdgens))); - repeat: uuidarray = calloc(desc.ld_tgt_count, sizeof(*uuidarray)); if (!uuidarray) { @@ -1697,6 +1774,7 @@ repeat: goto out_uuidarray; } + memset(buf, 0x00, sizeof(rawbuf)); data.ioc_inllen1 = sizeof(desc); data.ioc_inlbuf1 = (char *)&desc; 
data.ioc_inllen2 = desc.ld_tgt_count * sizeof(*uuidarray); @@ -1704,7 +1782,7 @@ repeat: data.ioc_inllen3 = desc.ld_tgt_count * sizeof(*obdgens); data.ioc_inlbuf3 = (char *)obdgens; - if (obd_ioctl_pack(&data, &buf, max)) { + if (obd_ioctl_pack(&data, &buf, sizeof(rawbuf))) { fprintf(stderr, "error: %s: invalid ioctl\n", jt_cmdname(argv[0])); rc = -EINVAL; @@ -1723,7 +1801,7 @@ repeat: __u32 *genp; int i; - if (obd_ioctl_unpack(&data, buf, max)) { + if (obd_ioctl_unpack(&data, buf, sizeof(rawbuf))) { fprintf(stderr, "error: %s: invalid reply\n", jt_cmdname(argv[0])); rc = -EINVAL; @@ -1759,10 +1837,12 @@ int jt_obd_ldlm_regress_start(int argc, char **argv) { int rc; struct obd_ioctl_data data; + char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf; char argstring[200]; int i, count = sizeof(argstring) - 1; - IOC_INIT(data); + memset(&data, 0x00, sizeof(data)); + data.ioc_dev = cur_device; if (argc > 5) return CMD_HELP; @@ -1779,7 +1859,13 @@ int jt_obd_ldlm_regress_start(int argc, char **argv) data.ioc_inllen1 = strlen(argstring) + 1; } - IOC_PACK(argv[0], data); + memset(buf, 0, sizeof(rawbuf)); + rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf)); + if (rc) { + fprintf(stderr, "error: %s: invalid ioctl\n", + jt_cmdname(argv[0])); + return rc; + } rc = l2_ioctl(OBD_DEV_ID, IOC_LDLM_REGRESS_START, buf); if (rc) fprintf(stderr, "error: %s: test failed: %s\n", @@ -1791,13 +1877,22 @@ int jt_obd_ldlm_regress_start(int argc, char **argv) int jt_obd_ldlm_regress_stop(int argc, char **argv) { int rc; + char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf; struct obd_ioctl_data data; - IOC_INIT(data); + + memset(&data, 0x00, sizeof(data)); + data.ioc_dev = cur_device; if (argc != 1) return CMD_HELP; - IOC_PACK(argv[0], data); + memset(buf, 0, sizeof(rawbuf)); + rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf)); + if (rc) { + fprintf(stderr, "error: %s: invalid ioctl\n", + jt_cmdname(argv[0])); + return rc; + } rc = l2_ioctl(OBD_DEV_ID, IOC_LDLM_REGRESS_STOP, buf); if (rc) @@ -1809,16 +1904,24 
@@ int jt_obd_ldlm_regress_stop(int argc, char **argv) static int do_activate(int argc, char **argv, int flag) { struct obd_ioctl_data data; + char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf; int rc; - IOC_INIT(data); + memset(&data, 0x00, sizeof(data)); + data.ioc_dev = cur_device; if (argc != 1) return CMD_HELP; /* reuse offset for 'active' */ data.ioc_offset = flag; - IOC_PACK(argv[0], data); + memset(buf, 0, sizeof(rawbuf)); + rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf)); + if (rc) { + fprintf(stderr, "error: %s: invalid ioctl\n", + jt_cmdname(argv[0])); + return rc; + } rc = l2_ioctl(OBD_DEV_ID, IOC_OSC_SET_ACTIVE, buf); if (rc) fprintf(stderr, "error: %s: failed: %s\n", @@ -1840,9 +1943,11 @@ int jt_obd_activate(int argc, char **argv) int jt_obd_recover(int argc, char **argv) { int rc; + char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf; struct obd_ioctl_data data; - IOC_INIT(data); + memset(&data, 0x00, sizeof(data)); + data.ioc_dev = cur_device; if (argc > 2) return CMD_HELP; @@ -1851,7 +1956,13 @@ int jt_obd_recover(int argc, char **argv) data.ioc_inlbuf1 = argv[1]; } - IOC_PACK(argv[0], data); + memset(buf, 0, sizeof(rawbuf)); + rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf)); + if (rc) { + fprintf(stderr, "error: %s: invalid ioctl\n", + jt_cmdname(argv[0])); + return rc; + } rc = l2_ioctl(OBD_DEV_ID, OBD_IOC_CLIENT_RECOVER, buf); if (rc < 0) { fprintf(stderr, "error: %s: %s\n", jt_cmdname(argv[0]), @@ -1864,6 +1975,7 @@ int jt_obd_recover(int argc, char **argv) int jt_obd_mdc_lookup(int argc, char **argv) { struct obd_ioctl_data data; + char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf; char *parent, *child; int rc, fd, verbose = 1; @@ -1875,12 +1987,19 @@ int jt_obd_mdc_lookup(int argc, char **argv) if (argc == 4) verbose = get_verbose(argv[0], argv[3]); - IOC_INIT(data); + memset(&data, 0x00, sizeof(data)); + data.ioc_dev = cur_device; data.ioc_inllen1 = strlen(child) + 1; data.ioc_inlbuf1 = child; - IOC_PACK(argv[0], data); + memset(buf, 0, sizeof(rawbuf)); + rc = 
obd_ioctl_pack(&data, &buf, sizeof(rawbuf)); + if (rc) { + fprintf(stderr, "error: %s: invalid ioctl\n", + jt_cmdname(argv[0])); + return rc; + } fd = open(parent, O_RDONLY); if (fd < 0) { @@ -1897,7 +2016,12 @@ int jt_obd_mdc_lookup(int argc, char **argv) close(fd); if (verbose) { - IOC_UNPACK(argv[0], data); + rc = obd_ioctl_unpack(&data, buf, sizeof(rawbuf)); + if (rc) { + fprintf(stderr, "error: %s: invalid reply\n", + jt_cmdname(argv[0])); + return rc; + } printf("%s: mode %o uid %d gid %d\n", child, data.ioc_obdo1.o_mode, data.ioc_obdo1.o_uid, data.ioc_obdo1.o_gid); @@ -1909,16 +2033,24 @@ int jt_obd_mdc_lookup(int argc, char **argv) int jt_cfg_dump_log(int argc, char **argv) { struct obd_ioctl_data data; + char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf; int rc; if (argc != 2) return CMD_HELP; - IOC_INIT(data); + memset(&data, 0x00, sizeof(data)); + data.ioc_dev = cur_device; data.ioc_inllen1 = strlen(argv[1]) + 1; data.ioc_inlbuf1 = argv[1]; - IOC_PACK(argv[0], data); + memset(buf, 0, sizeof(rawbuf)); + rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf)); + if (rc) { + fprintf(stderr, "error: %s: invalid ioctl\n", + jt_cmdname(argv[0])); + return rc; + } rc = l_ioctl(OBD_DEV_ID, OBD_IOC_DUMP_LOG, buf); if (rc < 0) fprintf(stderr, "OBD_IOC_DUMP_LOG failed: %s\n", @@ -1930,15 +2062,22 @@ int jt_cfg_dump_log(int argc, char **argv) int jt_llog_catlist(int argc, char **argv) { struct obd_ioctl_data data; + char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf; int rc; if (argc != 1) return CMD_HELP; - IOC_INIT(data); - data.ioc_inllen1 = max - size_round(sizeof(data)); - IOC_PACK(argv[0], data); - + memset(&data, 0x00, sizeof(data)); + data.ioc_dev = cur_device; + data.ioc_inllen1 = sizeof(rawbuf) - size_round(sizeof(data)); + memset(buf, 0, sizeof(rawbuf)); + rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf)); + if (rc) { + fprintf(stderr, "error: %s: invalid ioctl\n", + jt_cmdname(argv[0])); + return rc; + } rc = l_ioctl(OBD_DEV_ID, OBD_IOC_CATLOGLIST, buf); if (rc == 0) 
fprintf(stdout, "%s", ((struct obd_ioctl_data*)buf)->ioc_bulk); @@ -1952,17 +2091,25 @@ int jt_llog_catlist(int argc, char **argv) int jt_llog_info(int argc, char **argv) { struct obd_ioctl_data data; + char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf; int rc; if (argc != 2) return CMD_HELP; - IOC_INIT(data); + memset(&data, 0x00, sizeof(data)); + data.ioc_dev = cur_device; data.ioc_inllen1 = strlen(argv[1]) + 1; data.ioc_inlbuf1 = argv[1]; - data.ioc_inllen2 = max - size_round(sizeof(data)) - + data.ioc_inllen2 = sizeof(rawbuf) - size_round(sizeof(data)) - size_round(data.ioc_inllen1); - IOC_PACK(argv[0], data); + memset(buf, 0, sizeof(rawbuf)); + rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf)); + if (rc) { + fprintf(stderr, "error: %s: invalid ioctl\n", + jt_cmdname(argv[0])); + return rc; + } rc = l_ioctl(OBD_DEV_ID, OBD_IOC_LLOG_INFO, buf); if (rc == 0) @@ -1977,12 +2124,14 @@ int jt_llog_info(int argc, char **argv) int jt_llog_print(int argc, char **argv) { struct obd_ioctl_data data; + char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf; int rc; if (argc != 2 && argc != 4) return CMD_HELP; - IOC_INIT(data); + memset(&data, 0x00, sizeof(data)); + data.ioc_dev = cur_device; data.ioc_inllen1 = strlen(argv[1]) + 1; data.ioc_inlbuf1 = argv[1]; if (argc == 4) { @@ -1997,11 +2146,17 @@ int jt_llog_print(int argc, char **argv) data.ioc_inllen3 = strlen(to) + 1; data.ioc_inlbuf3 = to; } - data.ioc_inllen4 = max - size_round(sizeof(data)) - + data.ioc_inllen4 = sizeof(rawbuf) - size_round(sizeof(data)) - size_round(data.ioc_inllen1) - size_round(data.ioc_inllen2) - size_round(data.ioc_inllen3); - IOC_PACK(argv[0], data); + memset(buf, 0, sizeof(rawbuf)); + rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf)); + if (rc) { + fprintf(stderr, "error: %s: invalid ioctl\n", + jt_cmdname(argv[0])); + return rc; + } rc = l_ioctl(OBD_DEV_ID, OBD_IOC_LLOG_PRINT, buf); if (rc == 0) @@ -2016,19 +2171,27 @@ int jt_llog_print(int argc, char **argv) int jt_llog_cancel(int argc, char **argv) { struct 
obd_ioctl_data data; + char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf; int rc; if (argc != 4) return CMD_HELP; - IOC_INIT(data); + memset(&data, 0x00, sizeof(data)); + data.ioc_dev = cur_device; data.ioc_inllen1 = strlen(argv[1]) + 1; data.ioc_inlbuf1 = argv[1]; data.ioc_inllen2 = strlen(argv[2]) + 1; data.ioc_inlbuf2 = argv[2]; data.ioc_inllen3 = strlen(argv[3]) + 1; data.ioc_inlbuf3 = argv[3]; - IOC_PACK(argv[0], data); + memset(buf, 0, sizeof(rawbuf)); + rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf)); + if (rc) { + fprintf(stderr, "error: %s: invalid ioctl\n", + jt_cmdname(argv[0])); + return rc; + } rc = l_ioctl(OBD_DEV_ID, OBD_IOC_LLOG_CANCEL, buf); if (rc == 0) @@ -2043,12 +2206,14 @@ int jt_llog_cancel(int argc, char **argv) int jt_llog_check(int argc, char **argv) { struct obd_ioctl_data data; + char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf; int rc; if (argc != 2 && argc != 4) return CMD_HELP; - IOC_INIT(data); + memset(&data, 0x00, sizeof(data)); + data.ioc_dev = cur_device; data.ioc_inllen1 = strlen(argv[1]) + 1; data.ioc_inlbuf1 = argv[1]; if (argc == 4) { @@ -2063,11 +2228,17 @@ int jt_llog_check(int argc, char **argv) data.ioc_inllen3 = strlen(to) + 1; data.ioc_inlbuf3 = to; } - data.ioc_inllen4 = max - size_round(sizeof(data)) - + data.ioc_inllen4 = sizeof(rawbuf) - size_round(sizeof(data)) - size_round(data.ioc_inllen1) - size_round(data.ioc_inllen2) - size_round(data.ioc_inllen3); - IOC_PACK(argv[0], data); + memset(buf, 0, sizeof(rawbuf)); + rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf)); + if (rc) { + fprintf(stderr, "error: %s: invalid ioctl\n", + jt_cmdname(argv[0])); + return rc; + } rc = l_ioctl(OBD_DEV_ID, OBD_IOC_LLOG_CHECK, buf); if (rc == 0) @@ -2081,19 +2252,27 @@ int jt_llog_check(int argc, char **argv) int jt_llog_remove(int argc, char **argv) { struct obd_ioctl_data data; + char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf; int rc; if (argc != 3 && argc != 2) return CMD_HELP; - IOC_INIT(data); + memset(&data, 0x00, sizeof(data)); + data.ioc_dev 
= cur_device; data.ioc_inllen1 = strlen(argv[1]) + 1; data.ioc_inlbuf1 = argv[1]; if (argc == 3){ data.ioc_inllen2 = strlen(argv[2]) + 1; data.ioc_inlbuf2 = argv[2]; } - IOC_PACK(argv[0], data); + memset(buf, 0, sizeof(rawbuf)); + rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf)); + if (rc) { + fprintf(stderr, "error: %s: invalid ioctl\n", + jt_cmdname(argv[0])); + return rc; + } rc = l_ioctl(OBD_DEV_ID, OBD_IOC_LLOG_REMOVE, buf); if (rc == 0) { @@ -2142,14 +2321,15 @@ static int jt_blockdev_find_module(const char *module) { FILE *fp; int found = 0; - char modname[256]; + char buf[1024]; fp = fopen("/proc/modules", "r"); if (fp == NULL) return -1; - while (fscanf(fp, "%s %*s %*s %*s %*s %*s", modname) == 1) { - if (strcmp(module, modname) == 0) { + while (fgets(buf, 1024, fp) != NULL) { + *strchr(buf, ' ') = 0; + if (strcmp(module, buf) == 0) { found = 1; break; } @@ -2635,6 +2815,7 @@ static int pool_cmd(enum lcfg_command_type cmd, struct obd_ioctl_data data; struct lustre_cfg_bufs bufs; struct lustre_cfg *lcfg; + char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf; rc = check_pool_cmd(cmd, fsname, poolname, ostname); if (rc) @@ -2652,7 +2833,7 @@ static int pool_cmd(enum lcfg_command_type cmd, return rc; } - IOC_INIT(data); + memset(&data, 0x00, sizeof(data)); rc = data.ioc_dev = get_mgs_device(); if (rc < 0) goto out; @@ -2661,8 +2842,14 @@ static int pool_cmd(enum lcfg_command_type cmd, data.ioc_plen1 = lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens); data.ioc_pbuf1 = (void *)lcfg; - IOC_PACK(cmdname, data); + memset(buf, 0, sizeof(rawbuf)); + rc = obd_ioctl_pack(&data, &buf, sizeof(rawbuf)); + if (rc) { + fprintf(stderr, "error: %s: invalid ioctl\n", + jt_cmdname(cmdname)); + return rc; + } rc = l_ioctl(OBD_DEV_ID, OBD_IOC_POOL, buf); out: if (rc) @@ -2935,13 +3122,14 @@ void llapi_ping_target(char *obd_type, char *obd_name, { int rc; struct obd_ioctl_data data; + char rawbuf[MAX_IOC_BUFLEN], *buf = rawbuf; memset(&data, 0, sizeof(data)); data.ioc_inlbuf4 = 
obd_name; data.ioc_inllen4 = strlen(obd_name) + 1; data.ioc_dev = OBD_DEV_BY_DEVNAME; memset(buf, 0, sizeof(rawbuf)); - if (obd_ioctl_pack(&data, &buf, max)) { + if (obd_ioctl_pack(&data, &buf, sizeof(rawbuf))) { fprintf(stderr, "error: invalid ioctl\n"); return; } diff --git a/lustre/utils/wirecheck.c b/lustre/utils/wirecheck.c index b8e5d064..956af80 100644 --- a/lustre/utils/wirecheck.c +++ b/lustre/utils/wirecheck.c @@ -1182,16 +1182,15 @@ check_ll_fiemap_extent(void) CHECK_CDEFINE(FIEMAP_EXTENT_LAST); CHECK_CDEFINE(FIEMAP_EXTENT_UNKNOWN); CHECK_CDEFINE(FIEMAP_EXTENT_DELALLOC); - CHECK_CDEFINE(FIEMAP_EXTENT_NO_DIRECT); - CHECK_CDEFINE(FIEMAP_EXTENT_SECONDARY); - CHECK_CDEFINE(FIEMAP_EXTENT_NET); - CHECK_CDEFINE(FIEMAP_EXTENT_DATA_COMPRESSED); + CHECK_CDEFINE(FIEMAP_EXTENT_ENCODED); CHECK_CDEFINE(FIEMAP_EXTENT_DATA_ENCRYPTED); CHECK_CDEFINE(FIEMAP_EXTENT_NOT_ALIGNED); CHECK_CDEFINE(FIEMAP_EXTENT_DATA_INLINE); CHECK_CDEFINE(FIEMAP_EXTENT_DATA_TAIL); CHECK_CDEFINE(FIEMAP_EXTENT_UNWRITTEN); CHECK_CDEFINE(FIEMAP_EXTENT_MERGED); + CHECK_CDEFINE(FIEMAP_EXTENT_NO_DIRECT); + CHECK_CDEFINE(FIEMAP_EXTENT_NET); } static void diff --git a/lustre/utils/wiretest.c b/lustre/utils/wiretest.c index 344ce94..1a1fa03 100644 --- a/lustre/utils/wiretest.c +++ b/lustre/utils/wiretest.c @@ -62,8 +62,8 @@ void lustre_assert_wire_constants(void) { /* Wire protocol assertions generated by 'wirecheck' * (make -C lustre/utils newwiretest) - * running on Linux lin2 2.6.18-92.1.17-prep #3 Sun Nov 23 14:29:36 IST 2008 i686 i686 i386 G - * with gcc version 3.4.6 20060404 (Red Hat 3.4.6-10) */ + * running on Linux localhost.localdomain 2.6.18-prep #3 SMP Sun Nov 23 08:04:44 EST 2008 i68 + * with gcc version 4.1.1 20061011 (Red Hat 4.1.1-30) */ /* Constants... 
*/ @@ -251,9 +251,9 @@ void lustre_assert_wire_constants(void) (long long)OBD_QC_CALLBACK); LASSERTF(OBD_LAST_OPC == 403, " found %lld\n", (long long)OBD_LAST_OPC); - LASSERTF(QUOTA_DQACQ == 901, " found %lld\n", + LASSERTF(QUOTA_DQACQ == 601, " found %lld\n", (long long)QUOTA_DQACQ); - LASSERTF(QUOTA_DQREL == 902, " found %lld\n", + LASSERTF(QUOTA_DQREL == 602, " found %lld\n", (long long)QUOTA_DQREL); LASSERTF(MGS_CONNECT == 250, " found %lld\n", (long long)MGS_CONNECT); @@ -444,31 +444,31 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct obd_connect_data, padding2)); LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding2) == 8, " found %lld\n", (long long)(int)sizeof(((struct obd_connect_data *)0)->padding2)); - CLASSERT(OBD_CONNECT_RDONLY == 0x00000001ULL); - CLASSERT(OBD_CONNECT_INDEX == 0x00000002ULL); - CLASSERT(OBD_CONNECT_GRANT == 0x00000008ULL); - CLASSERT(OBD_CONNECT_SRVLOCK == 0x00000010ULL); - CLASSERT(OBD_CONNECT_VERSION == 0x00000020ULL); - CLASSERT(OBD_CONNECT_REQPORTAL == 0x00000040ULL); - CLASSERT(OBD_CONNECT_ACL == 0x00000080ULL); - CLASSERT(OBD_CONNECT_XATTR == 0x00000100ULL); + CLASSERT(OBD_CONNECT_RDONLY == 0x1ULL); + CLASSERT(OBD_CONNECT_INDEX == 0x2ULL); + CLASSERT(OBD_CONNECT_GRANT == 0x8ULL); + CLASSERT(OBD_CONNECT_SRVLOCK == 0x10ULL); + CLASSERT(OBD_CONNECT_VERSION == 0x20ULL); + CLASSERT(OBD_CONNECT_REQPORTAL == 0x40ULL); + CLASSERT(OBD_CONNECT_ACL == 0x80ULL); + CLASSERT(OBD_CONNECT_XATTR == 0x100ULL); CLASSERT(OBD_CONNECT_REAL == 0x08000000ULL); CLASSERT(OBD_CONNECT_CKSUM == 0x20000000ULL); - CLASSERT(OBD_CONNECT_TRUNCLOCK == 0x00000400ULL); - CLASSERT(OBD_CONNECT_IBITS == 0x00001000ULL); - CLASSERT(OBD_CONNECT_JOIN == 0x00002000ULL); - CLASSERT(OBD_CONNECT_ATTRFID == 0x00004000ULL); - CLASSERT(OBD_CONNECT_NODEVOH == 0x00008000ULL); + CLASSERT(OBD_CONNECT_TRUNCLOCK == 0x400ULL); + CLASSERT(OBD_CONNECT_IBITS == 0x1000ULL); + CLASSERT(OBD_CONNECT_JOIN == 0x2000ULL); + CLASSERT(OBD_CONNECT_ATTRFID == 
0x4000ULL); + CLASSERT(OBD_CONNECT_NODEVOH == 0x8000ULL); CLASSERT(OBD_CONNECT_RMT_CLIENT == 0x00010000ULL); CLASSERT(OBD_CONNECT_RMT_CLIENT_FORCE == 0x00020000ULL); - CLASSERT(OBD_CONNECT_BRW_SIZE == 0x00040000ULL); - CLASSERT(OBD_CONNECT_QUOTA64 == 0x00080000ULL); - CLASSERT(OBD_CONNECT_MDS_CAPA == 0x00100000ULL); - CLASSERT(OBD_CONNECT_OSS_CAPA == 0x00200000ULL); + CLASSERT(OBD_CONNECT_BRW_SIZE == 0x40000ULL); + CLASSERT(OBD_CONNECT_QUOTA64 == 0x80000ULL); + CLASSERT(OBD_CONNECT_MDS_CAPA == 0x100000ULL); + CLASSERT(OBD_CONNECT_OSS_CAPA == 0x200000ULL); CLASSERT(OBD_CONNECT_MDS_MDS == 0x04000000ULL); CLASSERT(OBD_CONNECT_SOM == 0x00800000ULL); CLASSERT(OBD_CONNECT_AT == 0x01000000ULL); - CLASSERT(OBD_CONNECT_CANCELSET == 0x00400000ULL); + CLASSERT(OBD_CONNECT_CANCELSET == 0x400000ULL); CLASSERT(OBD_CONNECT_LRU_RESIZE == 0x02000000ULL); /* Checks for struct obdo */ @@ -2386,7 +2386,7 @@ void lustre_assert_wire_constants(void) CLASSERT(FIEMAP_FLAG_DEVICE_ORDER == 0x40000000); /* Checks for struct ll_fiemap_extent */ - LASSERTF((int)sizeof(struct ll_fiemap_extent) == 32, " found %lld\n", + LASSERTF((int)sizeof(struct ll_fiemap_extent) == 56, " found %lld\n", (long long)(int)sizeof(struct ll_fiemap_extent)); LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_logical) == 0, " found %lld\n", (long long)(int)offsetof(struct ll_fiemap_extent, fe_logical)); @@ -2400,28 +2400,27 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct ll_fiemap_extent, fe_length)); LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_length) == 8, " found %lld\n", (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_length)); - LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_flags) == 24, " found %lld\n", + LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_flags) == 40, " found %lld\n", (long long)(int)offsetof(struct ll_fiemap_extent, fe_flags)); LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_flags) == 4, " found %lld\n", (long 
long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_flags)); - LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_device) == 28, " found %lld\n", + LASSERTF((int)offsetof(struct ll_fiemap_extent, fe_device) == 44, " found %lld\n", (long long)(int)offsetof(struct ll_fiemap_extent, fe_device)); LASSERTF((int)sizeof(((struct ll_fiemap_extent *)0)->fe_device) == 4, " found %lld\n", (long long)(int)sizeof(((struct ll_fiemap_extent *)0)->fe_device)); CLASSERT(FIEMAP_EXTENT_LAST == 0x00000001); CLASSERT(FIEMAP_EXTENT_UNKNOWN == 0x00000002); CLASSERT(FIEMAP_EXTENT_DELALLOC == 0x00000004); - CLASSERT(FIEMAP_EXTENT_NO_DIRECT == 0x00000008); - CLASSERT(FIEMAP_EXTENT_SECONDARY == 0x00000010); - CLASSERT(FIEMAP_EXTENT_NET == 0x00000020); - CLASSERT(FIEMAP_EXTENT_DATA_COMPRESSED == 0x00000040); + CLASSERT(FIEMAP_EXTENT_ENCODED == 0x00000008); CLASSERT(FIEMAP_EXTENT_DATA_ENCRYPTED == 0x00000080); CLASSERT(FIEMAP_EXTENT_NOT_ALIGNED == 0x00000100); CLASSERT(FIEMAP_EXTENT_DATA_INLINE == 0x00000200); CLASSERT(FIEMAP_EXTENT_DATA_TAIL == 0x00000400); CLASSERT(FIEMAP_EXTENT_UNWRITTEN == 0x00000800); CLASSERT(FIEMAP_EXTENT_MERGED == 0x00001000); -#if defined(LIBLUSTRE_POSIX_ACL) && defined(CONFIG_FS_POSIX_ACL) + CLASSERT(FIEMAP_EXTENT_NO_DIRECT == 0x40000000); + CLASSERT(FIEMAP_EXTENT_NET == 0x80000000); +#ifdef LIBLUSTRE_POSIX_ACL /* Checks for type posix_acl_xattr_entry */ LASSERTF((int)sizeof(xattr_acl_entry) == 8, " found %lld\n", -- 1.8.3.1