From 93acd158c57c4c5d0fc751d46741231490c04707 Mon Sep 17 00:00:00 2001 From: pschwan Date: Fri, 7 Feb 2003 18:01:04 +0000 Subject: [PATCH] Merge b_md into HEAD * bug fixes - Fix ldlm_lock_match on the MDS to avoid matching remote locks (592) - Fix fsfilt_extN_readpage() to read a full page of directory entries, or fake the remainder if PAGE_SIZE != blocksize (500) - Avoid extra mdc_getattr() in ll_intent_lock when possible (534, 604) - Fix imbalanced LOV object allocation and out-of-bound access (469) - Most intent operations were removed, in favour of a new RPC mode that does a single RPC to the server and bypasses most of the VFS - All LDLM resource ID arrays were removed in favour of ldlm_res_id - Aggressively cancel local locks on DLM servers - mds_reint_unlink sends EA to the client if it's the last nlink. client uses that EA to unlink OST objects. - mds_reint_{rename,unlink,link} were rewritten to take ordered locks - recursive symlinks were fixed (440) - fixed NULL deref in DEBUG_REQ - filter_update_lastobjid no longer calls sync, which annoyed extN - fixed multi-client small-writes to a single file problem (445) - fixed mtime updates during file writes (607) - fixed vector writes on obdfilter causing problems when ENOSPC (670) - fixed bug in obd_brw_read/write() (under guise of testing 367) - fixed Linux OST size reporting problem (444, 656) - OST now updates object mtime with writes or setattr (607, 619) - client verifies file size before zeroing page past EOF (445) - OST now writes last allocated objid to disk with allocation (108) - LOV on echo now works (409) * protocol changes - mds_reint_unlink sends a new buffer, with the EA included. 
this buffer is only valid if body->valid & OBD_MD_FLEASIZE, which is only set if a regular file was being unlinked, and it was the last link - use PtlGet from the target for bulk writes (315) - OST now updates object mtime with writes or setattr (607, 619) - LDLM now has a grant-time callback to revalidate locked items, if necessary (604) - Many MDS operations were reorganized to combat race conditions * other changes - Merge b_intel branch (updated lprocfs code) - now at /proc/fs/lustre - configure check to avoid gcc version 2.96 20000731-2.96-98 (606) --- lustre/ChangeLog | 40 + lustre/Rules | 4 +- lustre/archdep.m4 | 2 +- lustre/cobd/cache_obd.c | 149 +-- lustre/cobd/lproc_cache.c | 60 +- lustre/conf/Makefile.am | 4 +- lustre/conf/lustre2ldif.xsl | 4 + lustre/configure.in | 24 +- lustre/extN/Makefile.am | 16 +- lustre/extN/ext3-unmount_sync.diff | 59 + lustre/extN/ext3-use-after-free.diff | 65 + lustre/extN/extN-iget-debug.diff | 48 + lustre/extN/extN-misc-fixup.diff | 8 + lustre/extN/extN-wantedi.diff | 2 +- lustre/extN/htree-ext3-2.4.18.diff | 86 +- lustre/extN/linux-2.4.18ea-0.8.26.diff | 32 +- lustre/include/linux/lprocfs_status.h | 242 ++-- lustre/include/linux/lustre_dlm.h | 93 +- lustre/include/linux/lustre_export.h | 10 +- lustre/include/linux/lustre_fsfilt.h | 6 + lustre/include/linux/lustre_idl.h | 91 +- lustre/include/linux/lustre_import.h | 5 +- lustre/include/linux/lustre_lib.h | 30 +- lustre/include/linux/lustre_lite.h | 38 +- lustre/include/linux/lustre_mds.h | 89 +- lustre/include/linux/lustre_net.h | 80 +- lustre/include/linux/obd.h | 68 +- lustre/include/linux/obd_class.h | 93 +- lustre/include/linux/obd_echo.h | 28 + lustre/include/linux/obd_filter.h | 31 + lustre/include/linux/obd_support.h | 8 +- .../kernel_patches/{scripts/docco.txt => README} | 0 lustre/kernel_patches/patches/dev_read_only.patch | 8 +- .../kernel_patches/patches/dev_read_only_hp.patch | 77 ++ lustre/kernel_patches/patches/exports.patch | 32 +- 
lustre/kernel_patches/patches/exports_hp.patch | 56 + .../kernel_patches/patches/invalidate_show.patch | 104 ++ .../kernel_patches/patches/iod-rmap-exports.patch | 64 + lustre/kernel_patches/patches/jbd-transno-cb.patch | 240 ++++ .../patches/kmem_cache_validate_hp.patch | 105 ++ lustre/kernel_patches/patches/lustre_version.patch | 9 +- lustre/kernel_patches/patches/vanilla-2.4.19.patch | 552 ++++----- .../patches/vfs_intent-2.4.18-18.patch | 522 ++++---- lustre/kernel_patches/patches/vfs_intent.patch | 577 ++++----- lustre/kernel_patches/patches/vfs_intent_hp.patch | 1267 ++++++++++++++++++++ lustre/kernel_patches/pc/dev_read_only_hp.pc | 3 + lustre/kernel_patches/pc/exports_hp.pc | 4 + lustre/kernel_patches/pc/invalidate_show.pc | 5 + lustre/kernel_patches/pc/iod-rmap-exports.pc | 6 + lustre/kernel_patches/pc/jbd-transno-cb.pc | 4 + lustre/kernel_patches/pc/kmem_cache_validate.pc | 1 - lustre/kernel_patches/pc/kmem_cache_validate_hp.pc | 5 + lustre/kernel_patches/pc/vanilla-2.4.19.pc | 4 - lustre/kernel_patches/pc/vfs_intent_hp.pc | 8 + lustre/kernel_patches/series/chaos | 4 +- lustre/kernel_patches/series/hp-pnnl | 8 +- lustre/kernel_patches/series/rh-2.4.18-18 | 2 + lustre/kernel_patches/series/rh-8.0 | 1 + lustre/kernel_patches/series/vanilla-2.4.18 | 1 + lustre/kernel_patches/series/vanilla-2.4.19 | 4 +- lustre/kernel_patches/txt/exports.txt | 2 +- lustre/kernel_patches/txt/exports_hp.txt | 3 + lustre/kernel_patches/txt/invalidate_show.txt | 3 + lustre/kernel_patches/which_patch | 12 +- lustre/ldlm/Makefile.am | 2 +- lustre/ldlm/ldlm_extent.c | 7 +- lustre/ldlm/ldlm_lock.c | 275 +++-- lustre/ldlm/ldlm_lockd.c | 120 +- lustre/ldlm/ldlm_request.c | 130 +- lustre/ldlm/ldlm_resource.c | 116 +- lustre/ldlm/ldlm_test.c | 19 +- lustre/lib/client.c | 37 +- lustre/lib/mds_updates.c | 150 ++- lustre/lib/simple.c | 32 +- lustre/lib/target.c | 91 +- lustre/llite/dcache.c | 159 ++- lustre/llite/dir.c | 38 +- lustre/llite/file.c | 479 ++++---- lustre/llite/lproc_llite.c | 
281 ++--- lustre/llite/namei.c | 556 ++++++--- lustre/llite/rw.c | 34 +- lustre/llite/super.c | 129 +- lustre/llite/super25.c | 92 +- lustre/llite/symlink.c | 7 +- lustre/lov/Makefile.am | 5 +- lustre/lov/lov_obd.c | 315 +++-- lustre/lov/lov_pack.c | 17 +- lustre/lov/lproc_lov.c | 178 ++- lustre/mdc/lproc_mdc.c | 131 +- lustre/mdc/mdc_reint.c | 72 +- lustre/mdc/mdc_request.c | 469 ++++---- lustre/mds/Makefile.am | 2 +- lustre/mds/handler.c | 870 +++++++------- lustre/mds/lproc_mds.c | 169 +-- lustre/mds/mds_fs.c | 38 +- lustre/mds/mds_lov.c | 42 +- lustre/mds/mds_open.c | 238 ++++ lustre/mds/mds_reint.c | 661 +++++----- lustre/obdclass/class_obd.c | 192 ++- lustre/obdclass/fsfilt_ext3.c | 8 + lustre/obdclass/fsfilt_extN.c | 82 +- lustre/obdclass/fsfilt_reiserfs.c | 7 + lustre/obdclass/genops.c | 46 +- lustre/obdclass/lprocfs_status.c | 412 +++---- lustre/obdclass/statfs_pack.c | 32 +- lustre/obdclass/sysctl.c | 4 + lustre/obdclass/uuid.c | 6 +- lustre/obdecho/echo.c | 384 ++++-- lustre/obdecho/echo_client.c | 1134 ++++++++++++++++-- lustre/obdecho/lproc_echo.c | 54 +- lustre/obdfilter/filter.c | 1099 ++++++++++++++--- lustre/obdfilter/lproc_obdfilter.c | 143 +-- lustre/osc/lproc_osc.c | 123 +- lustre/osc/osc_request.c | 218 ++-- lustre/ost/lproc_ost.c | 150 +-- lustre/ost/ost_handler.c | 301 ++--- lustre/ptlbd/blk.c | 16 +- lustre/ptlbd/client.c | 20 +- lustre/ptlbd/rpc.c | 463 ++----- lustre/ptlbd/server.c | 85 +- lustre/ptlrpc/client.c | 136 ++- lustre/ptlrpc/connection.c | 18 +- lustre/ptlrpc/events.c | 125 +- lustre/ptlrpc/lproc_ptlrpc.c | 37 +- lustre/ptlrpc/niobuf.c | 122 +- lustre/ptlrpc/pack_generic.c | 11 +- lustre/ptlrpc/recovd.c | 15 +- lustre/ptlrpc/recover.c | 107 +- lustre/ptlrpc/rpc.c | 55 +- lustre/ptlrpc/service.c | 11 +- lustre/scripts/lustre.spec.in | 39 +- lustre/tests/.cvsignore | 2 + lustre/tests/Makefile.am | 7 +- lustre/tests/acceptance-metadata-single.sh | 130 ++ lustre/tests/acceptance-small.sh | 4 + lustre/tests/ba-echo.sh | 2 +- 
lustre/tests/busy.sh | 7 + lustre/tests/create.pl | 25 +- lustre/tests/createmany.c | 52 +- lustre/tests/createtest.c | 142 +++ lustre/tests/echo.sh | 40 +- lustre/tests/leak_finder.pl | 6 + lustre/tests/lkcdmap | 11 + lustre/tests/llmount.sh | 8 +- lustre/tests/llmount2-hack.sh | 21 - lustre/tests/llmount2-hackcleanup.sh | 21 - lustre/tests/llmountcleanup.sh | 13 +- lustre/tests/llmountcleanup2-hack.sh | 25 - lustre/tests/llrmount.sh | 6 +- lustre/tests/local.sh | 4 +- lustre/tests/local2-hack.xml | 43 - lustre/tests/mkdirmany.c | 2 +- lustre/tests/mount2.sh | 24 +- lustre/tests/open_delay.c | 25 + lustre/tests/openunlink.c | 75 +- lustre/tests/recovery-small.sh | 124 ++ lustre/tests/rename.pl | 78 ++ lustre/tests/runiozone | 2 +- lustre/tests/runregression-brw.sh | 19 +- lustre/tests/runregression-net.sh | 33 +- lustre/tests/runtests | 6 + lustre/tests/sanity.sh | 471 ++++---- lustre/tests/sanityN.sh | 122 +- lustre/tests/uml.sh | 49 +- lustre/utils/.cvsignore | 3 + lustre/utils/Makefile.am | 6 +- lustre/utils/lconf.in | 602 +++++----- lustre/utils/lctl.c | 17 +- lustre/utils/lfind.c | 11 +- lustre/utils/llparser.pm | 399 ++++++ lustre/utils/lmc | 164 ++- lustre/utils/lstripe.c | 48 +- lustre/utils/obd.c | 365 +++++- lustre/utils/obdbarrier.c | 223 ++++ lustre/utils/obdctl.h | 5 + lustre/utils/obdio.c | 304 +++++ lustre/utils/obdiolib.c | 465 +++++++ lustre/utils/obdiolib.h | 70 ++ lustre/utils/obdstat.c | 197 +++ 179 files changed, 14398 insertions(+), 6904 deletions(-) create mode 100644 lustre/extN/ext3-unmount_sync.diff create mode 100644 lustre/extN/ext3-use-after-free.diff create mode 100644 lustre/extN/extN-iget-debug.diff rename lustre/kernel_patches/{scripts/docco.txt => README} (100%) create mode 100644 lustre/kernel_patches/patches/dev_read_only_hp.patch create mode 100644 lustre/kernel_patches/patches/exports_hp.patch create mode 100644 lustre/kernel_patches/patches/invalidate_show.patch create mode 100644 
lustre/kernel_patches/patches/iod-rmap-exports.patch create mode 100644 lustre/kernel_patches/patches/jbd-transno-cb.patch create mode 100644 lustre/kernel_patches/patches/kmem_cache_validate_hp.patch create mode 100644 lustre/kernel_patches/patches/vfs_intent_hp.patch create mode 100644 lustre/kernel_patches/pc/dev_read_only_hp.pc create mode 100644 lustre/kernel_patches/pc/exports_hp.pc create mode 100644 lustre/kernel_patches/pc/invalidate_show.pc create mode 100644 lustre/kernel_patches/pc/iod-rmap-exports.pc create mode 100644 lustre/kernel_patches/pc/jbd-transno-cb.pc create mode 100644 lustre/kernel_patches/pc/kmem_cache_validate_hp.pc create mode 100644 lustre/kernel_patches/pc/vfs_intent_hp.pc create mode 100644 lustre/kernel_patches/txt/exports_hp.txt create mode 100644 lustre/kernel_patches/txt/invalidate_show.txt create mode 100644 lustre/mds/mds_open.c create mode 100644 lustre/tests/acceptance-metadata-single.sh create mode 100644 lustre/tests/busy.sh create mode 100644 lustre/tests/createtest.c create mode 100755 lustre/tests/lkcdmap delete mode 100644 lustre/tests/llmount2-hack.sh delete mode 100644 lustre/tests/llmount2-hackcleanup.sh delete mode 100644 lustre/tests/llmountcleanup2-hack.sh delete mode 100644 lustre/tests/local2-hack.xml create mode 100644 lustre/tests/open_delay.c create mode 100755 lustre/tests/recovery-small.sh create mode 100644 lustre/tests/rename.pl create mode 100644 lustre/utils/llparser.pm create mode 100644 lustre/utils/obdbarrier.c create mode 100644 lustre/utils/obdio.c create mode 100644 lustre/utils/obdiolib.c create mode 100644 lustre/utils/obdiolib.h create mode 100644 lustre/utils/obdstat.c diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 41e712f..120deef 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -1,3 +1,43 @@ +TBD + * version v0_5_20 + * bug fixes + - Fix ldlm_lock_match on the MDS to avoid matching remote locks (592) + - Fix fsfilt_extN_readpage() to read a full page of directory + entries, or 
fake the remainder if PAGE_SIZE != blocksize (500) + - Avoid extra mdc_getattr() in ll_intent_lock when possible (534, 604) + - Fix imbalanced LOV object allocation and out-of-bound access (469) + - Most intent operations were removed, in favour of a new RPC mode + that does a single RPC to the server and bypasses most of the VFS + - All LDLM resource ID arrays were removed in favour of ldlm_res_id + - Aggressively cancel local locks on DLM servers + - mds_reint_unlink sends EA to the client if it's the last nlink. + client uses that EA to unlink OST objects. + - mds_reint_{rename,unlink,link} were rewritten to take ordered locks + - recursive symlinks were fixed (439) + - fixed NULL deref in DEBUG_REQ + - filter_update_lastobjid no longer calls sync, which annoyed extN + - fixed multi-client small-writes to a single file problem (445) + - fixed mtime updates during file writes (607) + - fixed vector writes on obdfilter causing problems when ENOSPC (670) + - fixed bug in obd_brw_read/write() (under guise of testing 367) + - fixed Linux OST size reporting problem (444, 656) + - OST now updates object mtime with writes or setattr (607, 619) + - client verifies file size before zeroing page past EOF (445) + - OST now writes last allocated objid to disk with allocation (108) + - LOV on echo now works (409) + * protocol changes + - mds_reint_unlink sends a new buffer, with the EA included. 
this + buffer is only valid if body->valid & OBD_MD_FLEASIZE, which is only + set if a regular file was being unlinked, and it was the last link + - use PtlGet from the target for bulk writes (315) + - OST now updates object mtime with writes or setattr (607, 619) + - LDLM now has a grant-time callback to revalidate locked items, if + necessary (604) + - Many MDS operations were reorganized to combat race conditions + * other changes + - Merge b_intel branch (updated lprocfs code) - now at /proc/fs/lustre + - configure check to avoid gcc version 2.96 20000731-2.96-98 (606) + 2003-01-06 Andreas Dilger * version v0_5_19 * bug fixes diff --git a/lustre/Rules b/lustre/Rules index 0f2fa56..069e89a 100644 --- a/lustre/Rules +++ b/lustre/Rules @@ -17,8 +17,8 @@ tags: rm -f $(top_srcdir)/TAGS rm -f $(top_srcdir)/tags find $(top_srcdir)/../portals/ -name '*.[hc]' | xargs etags -a - find $(top_srcdir) -name '*.[hc]' | xargs etags -a + find $(top_srcdir) -name '*.[hc]' | grep -v ".orig" | xargs etags -a find $(top_srcdir)/../portals/ -name '*.[hc]' | xargs ctags -a - find $(top_srcdir) -name '*.[hc]' | xargs ctags -a + find $(top_srcdir) -name '*.[hc]' | grep -v ".orig" | xargs ctags -a AM_CPPFLAGS=-I$(top_builddir)/include diff --git a/lustre/archdep.m4 b/lustre/archdep.m4 index b11266c..58a6576 100644 --- a/lustre/archdep.m4 +++ b/lustre/archdep.m4 @@ -49,7 +49,7 @@ case ${host_cpu} in ia64 ) AC_MSG_RESULT($host_cpu) - KCFLAGS='-g -O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing -fno-common -pipe -ffixed-r13 -mfixed-range=f10-f15,f32-f127 -falign-functions=32 -mb-step' + KCFLAGS='-gstabs -O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing -fno-common -pipe -ffixed-r13 -mfixed-range=f10-f15,f32-f127 -falign-functions=32 -mb-step' KCPPFLAGS='-D__KERNEL__ -DMODULE' MOD_LINK=elf64_ia64 ;; diff --git a/lustre/cobd/cache_obd.c b/lustre/cobd/cache_obd.c index ac921d8..72a05cc 100644 --- a/lustre/cobd/cache_obd.c +++ b/lustre/cobd/cache_obd.c @@ 
-1,10 +1,22 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Copyright (c) 2002 Cluster File Systems, Inc. * - * This code is issued under the GNU General Public License. - * See the file COPYING in this distribution + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ #define DEBUG_SUBSYSTEM S_COBD @@ -16,19 +28,17 @@ #include #include -extern struct lprocfs_vars status_var_nm_1[]; -extern struct lprocfs_vars status_class_var[]; - -static int -cobd_attach (struct obd_device *dev, obd_count len, void *data) +static int cobd_attach(struct obd_device *dev, obd_count len, void *data) { - return (lprocfs_reg_obd (dev, status_var_nm_1, dev)); + struct lprocfs_static_vars lvars; + + lprocfs_init_vars(&lvars); + return lprocfs_obd_attach(dev, lvars.obd_vars); } -static int -cobd_detach (struct obd_device *dev) +static int cobd_detach(struct obd_device *dev) { - return (lprocfs_dereg_obd (dev)); + return lprocfs_obd_detach(dev); } static int @@ -38,26 +48,30 @@ cobd_setup (struct obd_device *dev, obd_count len, void *buf) struct cache_obd *cobd = &dev->u.cobd; struct obd_device *target; struct obd_device *cache; + struct obd_uuid target_uuid; + struct obd_uuid cache_uuid; int rc; - + if (data->ioc_inlbuf1 == NULL || data->ioc_inlbuf2 == NULL) return (-EINVAL); - - target = class_uuid2obd (data->ioc_inlbuf1); - cache = class_uuid2obd (data->ioc_inlbuf2); + + obd_str2uuid(&target_uuid, data->ioc_inlbuf1); + target = class_uuid2obd (&target_uuid); + + obd_str2uuid(&cache_uuid, data->ioc_inlbuf2); + cache = class_uuid2obd (&cache_uuid); if (target == NULL || cache == NULL) return (-EINVAL); - - /* don't bother checking attached/setup; - * obd_connect() should, and it can change underneath us */ - rc = obd_connect (&cobd->cobd_target, target, NULL, NULL, NULL); + /* don't bother checking attached/setup; + * obd_connect() should, and it can change underneath us */ + rc = obd_connect (&cobd->cobd_target, target, &target_uuid, NULL, NULL); if (rc != 0) return (rc); - rc = obd_connect (&cobd->cobd_cache, cache, NULL, NULL, NULL); + rc = obd_connect (&cobd->cobd_cache, cache, &cache_uuid, NULL, NULL); if (rc != 0) goto fail_0; @@ -73,14 +87,14 @@ cobd_cleanup (struct obd_device *dev) { struct cache_obd *cobd = &dev->u.cobd; int rc; - + if 
(!list_empty (&dev->obd_exports)) return (-EBUSY); - + rc = obd_disconnect (&cobd->cobd_cache); if (rc != 0) CERROR ("error %d disconnecting cache\n", rc); - + rc = obd_disconnect (&cobd->cobd_target); if (rc != 0) CERROR ("error %d disconnecting target\n", rc); @@ -90,7 +104,7 @@ cobd_cleanup (struct obd_device *dev) static int cobd_connect (struct lustre_handle *conn, struct obd_device *obd, - obd_uuid_t cluuid, struct recovd_obd *recovd, + struct obd_uuid *cluuid, struct recovd_obd *recovd, ptlrpc_recovery_cb_t recover) { int rc = class_connect (conn, obd, cluuid); @@ -103,12 +117,12 @@ static int cobd_disconnect (struct lustre_handle *conn) { int rc = class_disconnect (conn); - + CERROR ("rc %d\n", rc); return (rc); } -static int +static int cobd_get_info(struct lustre_handle *conn, obd_count keylen, void *key, obd_count *vallen, void **val) { @@ -124,11 +138,11 @@ cobd_get_info(struct lustre_handle *conn, obd_count keylen, /* intercept cache utilisation info? */ - return (obd_get_info (&cobd->cobd_target, + return (obd_get_info (&cobd->cobd_target, keylen, key, vallen, val)); } -static int +static int cobd_statfs(struct lustre_handle *conn, struct obd_statfs *osfs) { struct obd_device *obd = class_conn2obd(conn); @@ -143,7 +157,7 @@ cobd_statfs(struct lustre_handle *conn, struct obd_statfs *osfs) return (obd_statfs (&cobd->cobd_target, osfs)); } -static int +static int cobd_getattr(struct lustre_handle *conn, struct obdo *oa, struct lov_stripe_md *lsm) { @@ -159,9 +173,9 @@ cobd_getattr(struct lustre_handle *conn, struct obdo *oa, return (obd_getattr (&cobd->cobd_target, oa, lsm)); } -static int +static int cobd_open(struct lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md *lsm) + struct lov_stripe_md *lsm, struct obd_trans_info *oti) { struct obd_device *obd = class_conn2obd(conn); struct cache_obd *cobd; @@ -172,12 +186,12 @@ cobd_open(struct lustre_handle *conn, struct obdo *oa, } cobd = &obd->u.cobd; - return (obd_open (&cobd->cobd_target, oa, 
lsm)); + return (obd_open (&cobd->cobd_target, oa, lsm, oti)); } -static int +static int cobd_close(struct lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md *lsm) + struct lov_stripe_md *lsm, struct obd_trans_info *oti) { struct obd_device *obd = class_conn2obd(conn); struct cache_obd *cobd; @@ -188,14 +202,15 @@ cobd_close(struct lustre_handle *conn, struct obdo *oa, } cobd = &obd->u.cobd; - return (obd_close (&cobd->cobd_target, oa, lsm)); + return (obd_close (&cobd->cobd_target, oa, lsm, oti)); } -static int +static int cobd_preprw(int cmd, struct lustre_handle *conn, int objcount, struct obd_ioobj *obj, int niocount, struct niobuf_remote *nb, - struct niobuf_local *res, void **desc_private) + struct niobuf_local *res, void **desc_private, + struct obd_trans_info *oti) { struct obd_device *obd = class_conn2obd(conn); struct cache_obd *cobd; @@ -207,19 +222,19 @@ cobd_preprw(int cmd, struct lustre_handle *conn, if ((cmd & OBD_BRW_WRITE) != 0) return -EOPNOTSUPP; - + cobd = &obd->u.cobd; - return (obd_preprw (cmd, &cobd->cobd_target, - objcount, obj, - niocount, nb, - res, desc_private)); + return (obd_preprw (cmd, &cobd->cobd_target, + objcount, obj, + niocount, nb, + res, desc_private, oti)); } -static int +static int cobd_commitrw(int cmd, struct lustre_handle *conn, int objcount, struct obd_ioobj *obj, int niocount, struct niobuf_local *local, - void *desc_private) + void *desc_private, struct obd_trans_info *oti) { struct obd_device *obd = class_conn2obd(conn); struct cache_obd *cobd; @@ -231,18 +246,19 @@ cobd_commitrw(int cmd, struct lustre_handle *conn, if ((cmd & OBD_BRW_WRITE) != 0) return -EOPNOTSUPP; - + cobd = &obd->u.cobd; return (obd_commitrw (cmd, &cobd->cobd_target, objcount, obj, niocount, local, - desc_private)); + desc_private, oti)); } -static inline int +static inline int cobd_brw(int cmd, struct lustre_handle *conn, struct lov_stripe_md *lsm, obd_count oa_bufs, - struct brw_page *pga, struct obd_brw_set *set) + struct brw_page 
*pga, struct obd_brw_set *set, + struct obd_trans_info *oti) { struct obd_device *obd = class_conn2obd(conn); struct cache_obd *cobd; @@ -254,13 +270,13 @@ cobd_brw(int cmd, struct lustre_handle *conn, if ((cmd & OBD_BRW_WRITE) != 0) return -EOPNOTSUPP; - + cobd = &obd->u.cobd; - return (obd_brw (cmd, &cobd->cobd_target, - lsm, oa_bufs, pga, set)); + return (obd_brw (cmd, &cobd->cobd_target, + lsm, oa_bufs, pga, set, oti)); } -static int +static int cobd_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len, void *karg, void *uarg) { @@ -301,29 +317,26 @@ static struct obd_ops cobd_ops = { o_iocontrol: cobd_iocontrol, }; -static int __init -cobd_init (void) +static int __init cobd_init(void) { - int rc; - - printk (KERN_INFO "Lustre Caching OBD driver\n"); - - rc = class_register_type (&cobd_ops, status_class_var, - OBD_CACHE_DEVICENAME); - return (rc); + struct lprocfs_static_vars lvars; + ENTRY; + + printk(KERN_INFO "Lustre Caching OBD driver; info@clusterfs.com\n"); + + lprocfs_init_vars(&lvars); + RETURN(class_register_type(&cobd_ops, lvars.module_vars, + OBD_CACHE_DEVICENAME)); } -static void __exit -cobd_exit (void) +static void __exit cobd_exit(void) { - class_unregister_type (OBD_CACHE_DEVICENAME); + class_unregister_type(OBD_CACHE_DEVICENAME); } -MODULE_AUTHOR("Cluster Filesystems Inc. "); +MODULE_AUTHOR("Cluster File Systems, Inc. 
"); MODULE_DESCRIPTION("Lustre Caching OBD driver"); MODULE_LICENSE("GPL"); module_init(cobd_init); module_exit(cobd_exit); - - diff --git a/lustre/cobd/lproc_cache.c b/lustre/cobd/lproc_cache.c index 5adcaf8..5170829 100644 --- a/lustre/cobd/lproc_cache.c +++ b/lustre/cobd/lproc_cache.c @@ -24,24 +24,16 @@ #include #include -/* - * Common STATUS namespace - */ - -static int rd_uuid (char *page, char **start, off_t off, int count, - int *eof, void *data) -{ - struct obd_device* dev = (struct obd_device*)data; - - return (snprintf(page, count, "%s\n", dev->obd_uuid)); -} - -static int rd_target (char *page, char **start, off_t off, int count, - int *eof, void *data) +#ifndef LPROCFS +struct lprocfs_vars lprocfs_obd_vars[] = { {0} }; +struct lprocfs_vars lprocfs_module_vars[] = { {0} }; +#else +/* Common STATUS namespace */ +static int rd_target(char *page, char **start, off_t off, int count, + int *eof, void *data) { struct obd_device *dev = (struct obd_device*)data; - struct cache_obd *cobd = &dev->u.cobd; - struct lustre_handle *conn = &cobd->cobd_target; + struct lustre_handle *conn = &dev->u.cobd.cobd_target; struct obd_export *exp; int rc; @@ -49,8 +41,8 @@ static int rd_target (char *page, char **start, off_t off, int count, rc = snprintf (page, count, "not set up\n"); else { exp = class_conn2export (conn); - LASSERT (exp != NULL); - rc = snprintf(page, count, "%s\n", exp->exp_obd->obd_uuid); + LASSERT(exp != NULL); + rc = snprintf(page, count, "%s\n", exp->exp_obd->obd_uuid.uuid); } return (rc); } @@ -59,8 +51,7 @@ static int rd_cache(char *page, char **start, off_t off, int count, int *eof, void *data) { struct obd_device *dev = (struct obd_device*)data; - struct cache_obd *cobd = &dev->u.cobd; - struct lustre_handle *conn = &cobd->cobd_cache; + struct lustre_handle *conn = &dev->u.cobd.cobd_cache; struct obd_export *exp; int rc; @@ -69,27 +60,22 @@ static int rd_cache(char *page, char **start, off_t off, int count, else { exp = class_conn2export (conn); 
LASSERT (exp != NULL); - rc = snprintf(page, count, "%s\n", exp->exp_obd->obd_uuid); + rc = snprintf(page, count, "%s\n", exp->exp_obd->obd_uuid.uuid); } return (rc); } -struct lprocfs_vars status_var_nm_1[] = { - {"status/uuid", rd_uuid, 0, 0}, - {"status/target_uuid", rd_target, 0, 0}, - {"status/cache_uuid", rd_cache, 0, 0}, - {0} +struct lprocfs_vars lprocfs_obd_vars[] = { + { "uuid", lprocfs_rd_uuid, 0, 0 }, + { "target_uuid", rd_target, 0, 0 }, + { "cache_uuid", rd_cache, 0, 0 }, + { 0 } }; -int rd_numrefs(char *page, char **start, off_t off, int count, - int *eof, void *data) -{ - struct obd_type* class = (struct obd_type*)data; - - return (snprintf(page, count, "%d\n", class->typ_refcnt)); -} - -struct lprocfs_vars status_class_var[] = { - {"status/num_refs", rd_numrefs, 0, 0}, - {0} +struct lprocfs_vars lprocfs_module_vars[] = { + { "num_refs", lprocfs_rd_numrefs, 0, 0 }, + { 0 } }; +#endif /* LPROCFS */ + +LPROCFS_INIT_VARS(lprocfs_module_vars, lprocfs_obd_vars) diff --git a/lustre/conf/Makefile.am b/lustre/conf/Makefile.am index 7f98129..a205d10 100644 --- a/lustre/conf/Makefile.am +++ b/lustre/conf/Makefile.am @@ -3,11 +3,13 @@ # This code is issued under the GNU General Public License. 
# See the file COPYING in this distribution -EXTRA_DIST = lustre2ldif.xsl lustre.dtd lustre.schema slapd-lustre.conf +EXTRA_DIST = lustre.dtd lustre.schema slapd-lustre.conf lustre2ldif.xsl top.ldif ldapconfdir = $(sysconfdir)/openldap ldapschemadir = $(sysconfdir)/openldap/schema ldapconf_SCRIPTS = slapd-lustre.conf ldapschema_SCRIPTS = lustre.schema +pkglibdir = '${exec_prefix}/usr/lib/$(PACKAGE)' +pkglib_DATA = top.ldif lustre2ldif.xsl include $(top_srcdir)/Rules diff --git a/lustre/conf/lustre2ldif.xsl b/lustre/conf/lustre2ldif.xsl index f5d8098..c7ea957 100644 --- a/lustre/conf/lustre2ldif.xsl +++ b/lustre/conf/lustre2ldif.xsl @@ -184,6 +184,10 @@ networkRef: mdsRef: + + diff --git a/lustre/configure.in b/lustre/configure.in index 6ef9286..d51fb40 100644 --- a/lustre/configure.in +++ b/lustre/configure.in @@ -1,7 +1,7 @@ AC_INIT AC_CANONICAL_SYSTEM -# Copyright (C) 2001 Cluster File Systems, Inc. +# Copyright (C) 2001-2003 Cluster File Systems, Inc. # # This code is issued under the GNU General Public License. # See the file COPYING in this distribution @@ -9,7 +9,29 @@ AC_CANONICAL_SYSTEM # Automake variables. Steal the version number from lustre.spec.in. AM_INIT_AUTOMAKE(lustre, builtin([esyscmd], [sed -ne '/^%define version /{ s/.*version //; p; q; }' scripts/lustre.spec.in])) #AM_MAINTAINER_MODE + AC_PROG_CC +AC_MSG_CHECKING(for buggy compiler) +CC_VERSION=`$CC -v 2>&1 | grep "^gcc version"` +bad_cc() { + echo + echo " '$CC_VERSION'" + echo " has been known to generate bad code, " + echo " please get an updated compiler." 
+ AC_MSG_ERROR(sorry) +} +case "$CC_VERSION" in + # ost_pack_niobuf putting 64bit NTOH temporaries on the stack + # without "sub $0xc,%esp" to protect the stack from being + # stomped on by interrupts (bug 606) + "gcc version 2.96 20000731 (Red Hat Linux 7.1 2.96-98)") + bad_cc + ;; + *) + AC_MSG_RESULT(no known problems) + ;; +esac + AC_PROG_RANLIB # diff --git a/lustre/extN/Makefile.am b/lustre/extN/Makefile.am index 5ad1642..3fc2b66 100644 --- a/lustre/extN/Makefile.am +++ b/lustre/extN/Makefile.am @@ -18,6 +18,7 @@ EXTN_FIXES = patch-2.4.18-chaos22 EXTNP = htree-ext3-2.4.18.diff linux-2.4.18ea-0.8.26.diff EXTNP+= ext3-2.4.18-ino_sb_macro.diff extN-misc-fixup.diff extN-noread.diff EXTNP+= extN-wantedi.diff +#EXTNP+= extN-iget-debug.diff EXTNC = balloc.c bitmap.c dir.c file.c fsync.c ialloc.c inode.c ioctl.c EXTNC+= namei.c super.c symlink.c EXTNI = extN_fs.h extN_fs_i.h extN_fs_sb.h extN_jbd.h quotaops.h @@ -107,18 +108,21 @@ patch-stamp: sed-stamp $(EXTNP) list='$(EXTN_EXTRA)'; for f in $$list; do $(RM) $(top_builddir)/$$f; done if [ -f $(srcdir)/extN.patch-$(RELEASE) ]; then \ echo "applying patch $(srcdir)/extN.patch-$(RELEASE)"; \ - (cd $(top_builddir) && patch -p0) < $(srcdir)/extN.patch-$(RELEASE); \ + (cd $(top_builddir) && patch -p0) < $(srcdir)/extN.patch-$(RELEASE);\ else \ - echo "If first patch fails, read NOTE in extN/Makefile.am"; \ list='$(EXTNP)'; \ - sed '/i_version/q' $(extN_orig)/namei.c | tail -2 | \ - grep extN_mark_inode_dirty >/dev/null && list="$(EXTN_FIXES) $$list"; \ + grep -q "err = extN_mark_inode_dirty" $(extN_orig)/namei.c || \ + list="ext3-use-after-free.diff $$list"; \ + sed '/i_version/q' $(extN_orig)/namei.c | tail -2 | \ + grep -q extN_mark_inode_dirty && list="$(EXTN_FIXES) $$list"; \ + grep -q "if (do_sync_supers)" $(extN_orig)/super.c && \ + list="ext3-unmount_sync.diff $$list"; \ for p in $$list; do \ echo "applying patch $$p"; \ sed $(SUB) $(srcdir)/$$p | \ - (cd $(top_builddir) && patch -p1) || exit $$?; \ + (cd 
$(top_builddir) && patch -p1) || exit $$?; \ done; \ - echo "It is OK if the next patch says it is already applied"; \ + echo "It is OK if the next patch says it is skipping this patch"; \ echo "applying patch $(srcdir)/extN-2.4.18-exports.diff"; \ (cd $(top_builddir) && \ patch -N -p1) < $(srcdir)/extN-2.4.18-exports.diff; \ diff --git a/lustre/extN/ext3-unmount_sync.diff b/lustre/extN/ext3-unmount_sync.diff new file mode 100644 index 0000000..1f9b796 --- /dev/null +++ b/lustre/extN/ext3-unmount_sync.diff @@ -0,0 +1,59 @@ +From adilger@clusterfs.com Mon Dec 2 10:26:44 2002 +Date: Mon, 2 Dec 2002 10:26:44 -0700 +From: Andreas Dilger +To: Lustre LLNL Mailing list , + Lustre Development Mailing List +Subject: Re: data corrupting bug in 2.4.20 ext3, data=journal +Message-ID: <20021202102644.H1422@schatzie.adilger.int> +Mail-Followup-To: Lustre LLNL Mailing list , + Lustre Development Mailing List +Mime-Version: 1.0 +Content-Type: text/plain; charset=us-ascii +Content-Disposition: inline +User-Agent: Mutt/1.2.5.1i +X-GPG-Key: 1024D/0D35BED6 +X-GPG-Fingerprint: 7A37 5D79 BF1B CECA D44F 8A29 A488 39F5 0D35 BED6 +Status: RO +Content-Length: 1160 +Lines: 39 + +Here is the new-improved fix for the ext3 discarding data at umount bug +discovered late last week. To be used instead of the previous ext3 fix. + +Sadly, this is completely unrelated to the problems Mike is having with +ext3 under UML, since it is an unmount-time problem. + +----- Forwarded message from "Stephen C. Tweedie" ----- +The attached patch seems to fix things for me. + +Cheers, + Stephen + + +--- linux-2.4-ext3merge/fs/ext3/super.c.=K0027=.orig 2002-12-02 15:35:13.000000000 +0000 ++++ linux-2.4-ext3merge/fs/ext3/super.c 2002-12-02 15:35:14.000000000 +0000 +@@ -1640,7 +1640,12 @@ + sb->s_dirt = 0; + target = log_start_commit(EXT3_SB(sb)->s_journal, NULL); + +- if (do_sync_supers) { ++ /* ++ * Tricky --- if we are unmounting, the write really does need ++ * to be synchronous. 
We can detect that by looking for NULL in ++ * sb->s_root. ++ */ ++ if (do_sync_supers || !sb->s_root) { + unlock_super(sb); + log_wait_commit(EXT3_SB(sb)->s_journal, target); + lock_super(sb); + + +----- End forwarded message ----- + +Cheers, Andreas +-- +Andreas Dilger +http://sourceforge.net/projects/ext2resize/ +http://www-mddsp.enel.ucalgary.ca/People/adilger/ + + diff --git a/lustre/extN/ext3-use-after-free.diff b/lustre/extN/ext3-use-after-free.diff new file mode 100644 index 0000000..8cd673f --- /dev/null +++ b/lustre/extN/ext3-use-after-free.diff @@ -0,0 +1,65 @@ + + +If ext3_add_nondir() fails it will do an iput() of the inode. But we +continue to run ext3_mark_inode_dirty() against the potentially-freed +inode. This oopses when slab poisoning is enabled. + +Fix it so that we only run ext3_mark_inode_dirty() if the inode was +successfully instantiated. + +This bug was added in 2.4.20-pre9. + + + fs/ext3/namei.c | 11 +++++------ + 1 files changed, 5 insertions(+), 6 deletions(-) + +--- 24/fs/ext3/namei.c~ext3-use-after-free Sun Dec 15 11:27:50 2002 ++++ 24-akpm/fs/ext3/namei.c Sun Dec 15 11:27:50 2002 +@@ -429,8 +429,11 @@ static int ext3_add_nondir(handle_t *han + { + int err = ext3_add_entry(handle, dentry, inode); + if (!err) { +- d_instantiate(dentry, inode); +- return 0; ++ err = ext3_mark_inode_dirty(handle, inode); ++ if (err == 0) { ++ d_instantiate(dentry, inode); ++ return 0; ++ } + } + ext3_dec_count(handle, inode); + iput(inode); +@@ -465,7 +468,6 @@ static int ext3_create (struct inode * d + inode->i_fop = &ext3_file_operations; + inode->i_mapping->a_ops = &ext3_aops; + err = ext3_add_nondir(handle, dentry, inode); +- ext3_mark_inode_dirty(handle, inode); + } + ext3_journal_stop(handle, dir); + return err; +@@ -490,7 +492,6 @@ static int ext3_mknod (struct inode * di + if (!IS_ERR(inode)) { + init_special_inode(inode, mode, rdev); + err = ext3_add_nondir(handle, dentry, inode); +- ext3_mark_inode_dirty(handle, inode); + } + 
ext3_journal_stop(handle, dir); + return err; +@@ -934,7 +935,6 @@ static int ext3_symlink (struct inode * + } + inode->u.ext3_i.i_disksize = inode->i_size; + err = ext3_add_nondir(handle, dentry, inode); +- ext3_mark_inode_dirty(handle, inode); + out_stop: + ext3_journal_stop(handle, dir); + return err; +@@ -971,7 +971,6 @@ static int ext3_link (struct dentry * ol + atomic_inc(&inode->i_count); + + err = ext3_add_nondir(handle, dentry, inode); +- ext3_mark_inode_dirty(handle, inode); + ext3_journal_stop(handle, dir); + return err; + } + +_ diff --git a/lustre/extN/extN-iget-debug.diff b/lustre/extN/extN-iget-debug.diff new file mode 100644 index 0000000..9714e35 --- /dev/null +++ b/lustre/extN/extN-iget-debug.diff @@ -0,0 +1,48 @@ +--- linux/fs/ext3/namei.c.orig Thu Jan 30 01:15:13 2003 ++++ linux/fs/ext3/namei.c Sat Feb 1 00:33:46 2003 +@@ -710,6 +710,24 @@ + return ret; + } + ++static int extN_find_inode(struct inode *inode, unsigned long ino, ++ void *opaque) ++{ ++ const char *name = NULL; ++ int len = 0; ++ ++ if (opaque) { ++ struct dentry *dentry = opaque; ++ name = dentry->d_name.name; ++ len = dentry->d_name.len; ++ } ++ printk(KERN_INFO "finding inode %s:%lu (%p) count %d (%p = %*s)\n", ++ kdevname(inode->i_dev), ino, inode, atomic_read(&inode->i_count), ++ opaque, len, name ? 
name : ""); ++ ++ return 1; ++} ++ + static struct dentry *extN_lookup(struct inode * dir, struct dentry *dentry) + { + struct inode * inode; +@@ -724,7 +742,7 @@ + if (bh) { + unsigned long ino = le32_to_cpu(de->inode); + brelse (bh); +- inode = iget(dir->i_sb, ino); ++ inode = iget4(dir->i_sb, ino, extN_find_inode, dentry); + + if (!inode) + return ERR_PTR(-EACCES); +--- linux/fs/ext3/inode.c.orig Thu Jan 30 01:15:13 2003 ++++ linux/fs/ext3/inode.c Sat Feb 1 00:34:45 2003 +@@ -166,6 +166,9 @@ + */ + void extN_put_inode (struct inode * inode) + { ++ printk(KERN_INFO "putting inode %s:%lu (%p) count %d\n", ++ kdevname(inode->i_dev), inode->i_ino, inode, ++ atomic_read(&inode->i_count)); + extN_discard_prealloc (inode); + } + diff --git a/lustre/extN/extN-misc-fixup.diff b/lustre/extN/extN-misc-fixup.diff index 29b36fb..db0bc0f 100644 --- a/lustre/extN/extN-misc-fixup.diff +++ b/lustre/extN/extN-misc-fixup.diff @@ -13,3 +13,11 @@ goto out_journal; } EXTN_SB(sb)->journal_bdev = bdev; +@@ -1560,6 +1560,7 @@ + unlock_kernel(); + return ret; + } ++EXPORT_SYMBOL(extN_force_commit); /* here to avoid potential patch collisions */ + + /* + * Ext3 always journals updates to the superblock itself, so we don't diff --git a/lustre/extN/extN-wantedi.diff b/lustre/extN/extN-wantedi.diff index 3be559f..a55aec0 100644 --- a/lustre/extN/extN-wantedi.diff +++ b/lustre/extN/extN-wantedi.diff @@ -74,7 +74,7 @@ + if (err) goto fail; + + if (extN_set_bit(j, bh->b_data)) { -+ printk(KERN_ERR "goal inode %lu unavailable", goal); ++ printk(KERN_ERR "goal inode %lu unavailable\n", goal); + /* Oh well, we tried. 
*/ + goto repeat; + } diff --git a/lustre/extN/htree-ext3-2.4.18.diff b/lustre/extN/htree-ext3-2.4.18.diff index 9eba30c..4251251 100644 --- a/lustre/extN/htree-ext3-2.4.18.diff +++ b/lustre/extN/htree-ext3-2.4.18.diff @@ -511,24 +511,18 @@ static struct buffer_head * ext3_find_entry (struct dentry *dentry, struct ext3_dir_entry_2 ** res_dir) { -@@ -119,10 +564,76 @@ +@@ -119,10 +564,70 @@ int num = 0; int nblocks, i, err; struct inode *dir = dentry->d_parent->d_inode; -+ int namelen; -+ const u8 *name; -+ unsigned blocksize; + ext3_dirent *de, *top; *res_dir = NULL; sb = dir->i_sb; -+ blocksize = sb->s_blocksize; -+ namelen = dentry->d_name.len; -+ name = dentry->d_name.name; -+ if (namelen > EXT3_NAME_LEN) ++ if (dentry->d_name.len > EXT3_NAME_LEN) + return NULL; + if (ext3_dx && is_dx(dir)) { -+ u32 hash = dx_hash (name, namelen); ++ u32 hash = dx_hash(dentry->d_name.name, dentry->d_name.len); + struct dx_frame frames[2], *frame; + if (!(frame = dx_probe (dir, hash, frames))) + return NULL; @@ -537,10 +531,10 @@ + if (!(bh = ext3_bread (NULL,dir, block, 0, &err))) + goto dxfail; + de = (ext3_dirent *) bh->b_data; -+ top = (ext3_dirent *) ((char *) de + blocksize - ++ top = (ext3_dirent *) ((char *) de + sb->s_blocksize - + EXT3_DIR_REC_LEN(0)); + for (; de < top; de = ext3_next_entry(de)) -+ if (ext3_match (namelen, name, de)) { ++ if (ext3_match(dentry->d_name.len, dentry->d_name.name, de)) { + if (!ext3_check_dir_entry("ext3_find_entry", + dir, de, bh, + (block<i_size >> EXT3_BLOCK_SIZE_BITS(sb); start = dir->u.ext3_i.i_dir_start_lookup; if (start >= nblocks) -@@ -237,6 +748,92 @@ +@@ -237,6 +748,90 @@ de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; } @@ -613,11 +607,9 @@ + struct buffer_head **bh,struct dx_frame *frame, + u32 hash, int *error) +{ -+ unsigned blocksize = dir->i_sb->s_blocksize; -+ unsigned count, continued; ++ unsigned count; + struct buffer_head *bh2; + u32 newblock; -+ unsigned MAX_DX_MAP = PAGE_CACHE_SIZE/EXT3_DIR_REC_LEN(1) + 
1; + u32 hash2; + struct dx_map_entry *map; + char *data1 = (*bh)->b_data, *data2, *data3; @@ -639,14 +631,14 @@ + + data2 = bh2->b_data; + -+ map = kmalloc(sizeof(*map) * MAX_DX_MAP, GFP_KERNEL); ++ map = kmalloc(sizeof(*map) * PAGE_CACHE_SIZE/EXT3_DIR_REC_LEN(1) + 1, ++ GFP_KERNEL); + if (!map) + panic("no memory for do_split\n"); -+ count = dx_make_map ((ext3_dirent *) data1, blocksize, map); ++ count = dx_make_map((ext3_dirent *)data1, dir->i_sb->s_blocksize, map); + split = count/2; // need to adjust to actual middle + dx_sort_map (map, count); + hash2 = map[split].hash; -+ continued = hash2 == map[split - 1].hash; + dxtrace(printk("Split block %i at %x, %i/%i\n", + dx_get_block(frame->at), hash2, split, count-split)); + @@ -656,10 +648,10 @@ + de = dx_copy_dirents (data1, data3, map, split); + memcpy(data1, data3, (char *) de + de->rec_len - data3); + de = (ext3_dirent *) ((char *) de - data3 + data1); // relocate de -+ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); -+ de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); -+ dxtrace(dx_show_leaf ((ext3_dirent *) data1, blocksize, 1)); -+ dxtrace(dx_show_leaf ((ext3_dirent *) data2, blocksize, 1)); ++ de->rec_len = cpu_to_le16(data1 + dir->i_sb->s_blocksize - (char *)de); ++ de2->rec_len = cpu_to_le16(data2 + dir->i_sb->s_blocksize-(char *)de2); ++ dxtrace(dx_show_leaf((ext3_dirent *)data1, dir->i_sb->s_blocksize, 1)); ++ dxtrace(dx_show_leaf((ext3_dirent *)data2, dir->i_sb->s_blocksize, 1)); + + /* Which block gets the new entry? 
*/ + if (hash >= hash2) @@ -667,7 +659,7 @@ + swap(*bh, bh2); + de = de2; + } -+ dx_insert_block (frame, hash2 + continued, newblock); ++ dx_insert_block(frame, hash2 + (hash2 == map[split-1].hash), newblock); + ext3_journal_dirty_metadata (handle, bh2); + brelse (bh2); + ext3_journal_dirty_metadata (handle, frame->bh); @@ -681,17 +673,12 @@ /* * ext3_add_entry() * -@@ -251,6 +844,7 @@ - /* - * AKPM: the journalling code here looks wrong on the error paths - */ -+ - static int ext3_add_entry (handle_t *handle, struct dentry *dentry, +@@ -255,118 +849,278 @@ struct inode *inode) { -@@ -258,117 +852,281 @@ - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; + struct inode *dir = dentry->d_parent->d_inode; +- const char *name = dentry->d_name.name; +- int namelen = dentry->d_name.len; unsigned long offset; - unsigned short rec_len; struct buffer_head * bh; @@ -700,15 +687,15 @@ + ext3_dirent *de; + struct super_block * sb = dir->i_sb; int retval; -+ unsigned short reclen = EXT3_DIR_REC_LEN(namelen); ++ unsigned short reclen = EXT3_DIR_REC_LEN(dentry->d_name.len); - sb = dir->i_sb; -+ unsigned blocksize = sb->s_blocksize; + unsigned nlen, rlen; + u32 block, blocks; + char *top; - if (!namelen) +- if (!namelen) ++ if (!dentry->d_name.len) return -EINVAL; - bh = ext3_bread (handle, dir, 0, 0, &retval); - if (!bh) @@ -734,7 +721,7 @@ + u32 hash; + char *data1; + -+ hash = dx_hash(name, namelen); ++ hash = dx_hash(dentry->d_name.name, dentry->d_name.len); + /* FIXME: do something if dx_probe() fails here */ + frame = dx_probe(dir, hash, frames); + entries = frame->entries; @@ -748,7 +735,7 @@ + + data1 = bh->b_data; + de = (ext3_dirent *) data1; -+ top = data1 + (0? 200: blocksize); ++ top = data1 + (0? 
200: sb->s_blocksize); + while ((char *) de < top) + { + /* FIXME: check EEXIST and dir */ @@ -777,7 +764,7 @@ + goto dxfail2; + node2 = (struct dx_node *)(bh2->b_data); + entries2 = node2->entries; -+ node2->fake.rec_len = cpu_to_le16(blocksize); ++ node2->fake.rec_len = cpu_to_le16(sb->s_blocksize); + node2->fake.inode = 0; + BUFFER_TRACE(frame->bh, "get_write_access"); + ext3_journal_get_write_access(handle, frame->bh); @@ -878,14 +865,14 @@ + if(!bh) + return retval; + de = (ext3_dirent *)bh->b_data; -+ top = bh->b_data + blocksize - reclen; ++ top = bh->b_data + sb->s_blocksize - reclen; + while ((char *) de <= top) { + if (!ext3_check_dir_entry("ext3_add_entry", dir, de, + bh, offset)) { + brelse (bh); + return -EIO; + } -+ if (ext3_match (namelen, name, de)) { ++ if (ext3_match(dentry->d_name.len,dentry->d_name.name,de)) { brelse (bh); return -EEXIST; - } @@ -934,7 +921,7 @@ - ext3_journal_dirty_metadata(handle, bh); + nlen = EXT3_DIR_REC_LEN(de->name_len); + rlen = le16_to_cpu(de->rec_len); -+ if ((de->inode? rlen - nlen: rlen) >= reclen) ++ if ((de->inode ? 
rlen - nlen: rlen) >= reclen) + goto add; + de = (ext3_dirent *)((char *)de + rlen); + offset += rlen; @@ -948,7 +935,7 @@ + return retval; + de = (ext3_dirent *) bh->b_data; + de->inode = 0; -+ de->rec_len = cpu_to_le16(rlen = blocksize); ++ de->rec_len = cpu_to_le16(rlen = sb->s_blocksize); + nlen = 0; + goto add; + @@ -968,8 +955,8 @@ + ext3_set_de_type(dir->i_sb, de, inode->i_mode); + } else + de->inode = 0; -+ de->name_len = namelen; -+ memcpy (de->name, name, namelen); ++ de->name_len = dentry->d_name.len; ++ memcpy (de->name, dentry->d_name.name, dentry->d_name.len); + /* + * XXX shouldn't update any times until successful + * completion of syscall, but too many callers depend @@ -1020,16 +1007,16 @@ + + /* The 0th block becomes the root, move the dirents out */ + de = (ext3_dirent *) &root->info; -+ len = ((char *) root) + blocksize - (char *) de; ++ len = ((char *) root) + sb->s_blocksize - (char *) de; + memcpy (data1, de, len); + de = (ext3_dirent *) data1; + top = data1 + len; + while (((char *) de2=(char*)de+le16_to_cpu(de->rec_len)) < top) + de = de2; -+ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); ++ de->rec_len = cpu_to_le16(data1 + sb->s_blocksize - (char *)de); + /* Initialize the root; the dot dirents already exist */ + de = (ext3_dirent *) (&root->dotdot); -+ de->rec_len = cpu_to_le16(blocksize - EXT3_DIR_REC_LEN(2)); ++ de->rec_len = cpu_to_le16(sb->s_blocksize-EXT3_DIR_REC_LEN(2)); + memset (&root->info, 0, sizeof(root->info)); + root->info.info_length = sizeof(root->info); + entries = root->entries; @@ -1038,7 +1025,7 @@ + dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info))); + + /* Initialize as for dx_probe */ -+ hash = dx_hash (name, namelen); ++ hash = dx_hash (dentry->d_name.name, dentry->d_name.len); + frame = frames; + frame->entries = entries; + frame->at = entries; @@ -1060,10 +1047,7 @@ + return -ENOENT; } -+ /* - * ext3_delete_entry deletes a directory entry by merging it with the - * previous entry @@ 
-451,7 +1212,8 @@ struct inode * inode; int err; diff --git a/lustre/extN/linux-2.4.18ea-0.8.26.diff b/lustre/extN/linux-2.4.18ea-0.8.26.diff index 15df90c..4c8fb86 100644 --- a/lustre/extN/linux-2.4.18ea-0.8.26.diff +++ b/lustre/extN/linux-2.4.18ea-0.8.26.diff @@ -133,24 +133,14 @@ diff -Nur linux-2.4.18/fs/ext3/namei.c linux-2.4.18ea/fs/ext3/namei.c #include #include #include -@@ -465,6 +466,8 @@ - inode->i_fop = &extN_file_operations; - inode->i_mapping->a_ops = &ext3_aops; - err = ext3_add_nondir(handle, dentry, inode); -+ if (err) -+ ext3_xattr_drop_inode(handle, inode); - ext3_mark_inode_dirty(handle, inode); - } - ext3_journal_stop(handle, dir); -@@ -490,6 +493,8 @@ - if (!IS_ERR(inode)) { - init_special_inode(inode, mode, rdev); - err = ext3_add_nondir(handle, dentry, inode); -+ if (err) -+ ext3_xattr_drop_inode(handle, inode); - ext3_mark_inode_dirty(handle, inode); +@@ -435,6 +435,7 @@ static int ext3_add_nondir(handle_t *han + return 0; + } } - ext3_journal_stop(handle, dir); ++ ext3_xattr_drop_inode(handle, inode); + ext3_dec_count(handle, inode); + iput(inode); + return err; @@ -514,7 +519,7 @@ if (IS_SYNC(dir)) handle->h_sync = 1; @@ -179,14 +169,6 @@ diff -Nur linux-2.4.18/fs/ext3/namei.c linux-2.4.18ea/fs/ext3/namei.c ext3_mark_inode_dirty(handle, inode); err = ext3_add_entry (handle, dentry, inode); if (err) -@@ -565,6 +566,7 @@ - return err; - - out_no_entry: -+ ext3_xattr_drop_inode(handle, inode); - inode->i_nlink = 0; - ext3_mark_inode_dirty(handle, inode); - iput (inode); @@ -917,5 +919,5 @@ goto out_stop; diff --git a/lustre/include/linux/lprocfs_status.h b/lustre/include/linux/lprocfs_status.h index e769f43..14a713c 100644 --- a/lustre/include/linux/lprocfs_status.h +++ b/lustre/include/linux/lprocfs_status.h @@ -24,109 +24,161 @@ #ifndef _LPROCFS_SNMP_H #define _LPROCFS_SNMP_H - -#ifndef LPROC_SNMP -#define LPROC_SNMP -#endif - +#include #include -typedef enum { - E_LPROC_OK = 0 -} lproc_error_t; - -struct lprocfs_vars{ +#ifndef LPROCFS 
+#ifdef CONFIG_PROC_FS /* Ensure that /proc is configured */ +#define LPROCFS +#endif +#endif - char* name; - read_proc_t* read_fptr; - write_proc_t* write_fptr; - void* data; +struct lprocfs_vars { + char *name; + read_proc_t *read_fptr; + write_proc_t *write_fptr; + void *data; }; -#ifdef LPROC_SNMP - -struct proc_dir_entry* lprocfs_mkdir(const char *dname, - struct proc_dir_entry *parent); -struct proc_dir_entry* lprocfs_srch(struct proc_dir_entry *head, - const char *name); -void lprocfs_remove_all(struct proc_dir_entry *root); -struct proc_dir_entry* lprocfs_new_dir(struct proc_dir_entry *root, - const char *string, - const char *tok); -int lprocfs_new_vars(struct proc_dir_entry *root, struct lprocfs_vars *list, - const char *tok, void *data); - -int lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *var, - void *data); -int lprocfs_reg_obd(struct obd_device *device, struct lprocfs_vars *list, - void *data); -int lprocfs_dereg_obd(struct obd_device *device); -struct proc_dir_entry* lprocfs_reg_mnt(char *mnt_name); -int lprocfs_dereg_mnt(struct proc_dir_entry *root); - -int lprocfs_reg_class(struct obd_type *type, struct lprocfs_vars *list, - void *data); -int lprocfs_dereg_class(struct obd_type *class); -int lprocfs_reg_main(void); -int lprocfs_dereg_main(void); -int lprocfs_ll_rd(char *page, char **start, off_t off, int count, int *eof, - void *data); -#else - - -static inline int lprocfs_add_vars(struct proc_dir_entry *root, - struct lprocfs_vars *var, void *data) -{ - return 0; -} - -static inline int lprocfs_reg_obd(struct obd_device* device, - struct lprocfs_vars* list, void* data) -{ - return 0; -} - -static inline int lprocfs_dereg_obd(struct obd_device* device) -{ - return 0; -} - -static inline struct proc_dir_entry* lprocfs_reg_mnt(char *name) -{ - return NULL; -} - -static inline int lprocfs_dereg_mnt(struct proc_dir_entry* root) -{ - return 0; -} - -static inline int lprocfs_reg_class(struct obd_type* type, - struct lprocfs_vars* 
list, void* data) -{ - return 0; -} - -static inline int lprocfs_dereg_class(struct obd_type* class) -{ - return 0; -} +struct lprocfs_static_vars { + struct lprocfs_vars *module_vars; + struct lprocfs_vars *obd_vars; +}; -static inline int lprocfs_reg_main(void) -{ - return 0; +/* class_obd.c */ +extern struct proc_dir_entry *proc_lustre_root; + +extern void lprocfs_init_vars(struct lprocfs_static_vars *var); +extern void lprocfs_init_multi_vars(unsigned int idx, + struct lprocfs_static_vars *var); + +#define LPROCFS_INIT_MULTI_VARS(array, size) \ +void lprocfs_init_multi_vars(unsigned int idx, \ + struct lprocfs_static_vars *x) \ +{ \ + struct lprocfs_static_vars *glob = (struct lprocfs_static_vars*)array; \ + LASSERT(glob != 0); \ + LASSERT(idx < (unsigned int)(size)); \ + x->module_vars = glob[idx].module_vars; \ + x->obd_vars = glob[idx].obd_vars; \ +} \ + +#define LPROCFS_INIT_VARS(vclass, vinstance) \ +void lprocfs_init_vars(struct lprocfs_static_vars *x) \ +{ \ + x->module_vars = vclass; \ + x->obd_vars = vinstance; \ +} \ + +#ifdef LPROCFS +/* lprocfs_status.c */ +extern int lprocfs_add_vars(struct proc_dir_entry *root, + struct lprocfs_vars *var, + void *data); + +extern struct proc_dir_entry *lprocfs_register(const char *name, + struct proc_dir_entry *parent, + struct lprocfs_vars *list, + void *data); + +extern void lprocfs_remove(struct proc_dir_entry *root); + +struct obd_device; +extern int lprocfs_obd_attach(struct obd_device *dev, struct lprocfs_vars *list); +extern int lprocfs_obd_detach(struct obd_device *dev); + +/* Generic callbacks */ + +extern int lprocfs_rd_u64(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_rd_uuid(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_rd_name(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_rd_server_uuid(char *page, char **start, off_t off, + int count, int *eof, void *data); 
+extern int lprocfs_rd_conn_uuid(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_rd_numrefs(char *page, char **start, off_t off, + int count, int *eof, void *data); + +/* Statfs helpers */ +struct statfs; +extern int lprocfs_rd_blksize(char *page, char **start, off_t off, + int count, int *eof, struct statfs *sfs); +extern int lprocfs_rd_kbytestotal(char *page, char **start, off_t off, + int count, int *eof, struct statfs *sfs); +extern int lprocfs_rd_kbytesfree(char *page, char **start, off_t off, + int count, int *eof, struct statfs *sfs); +extern int lprocfs_rd_filestotal(char *page, char **start, off_t off, + int count, int *eof, struct statfs *sfs); +extern int lprocfs_rd_filesfree(char *page, char **start, off_t off, + int count, int *eof, struct statfs *sfs); +extern int lprocfs_rd_filegroups(char *page, char **start, off_t off, + int count, int *eof, struct statfs *sfs); + +#define DEFINE_LPROCFS_STATFS_FCT(fct_name, get_statfs_fct) \ +int fct_name(char *page, char **start, off_t off, \ + int count, int *eof, void *data) \ +{ \ + struct statfs sfs; \ + int rc = get_statfs_fct((struct obd_device*)data, &sfs); \ + return (rc==0 \ + ? 
lprocfs_##fct_name (page, start, off, count, eof, &sfs) \ + : rc); \ } -static inline int lprocfs_dereg_main(void) -{ - return 0; -} +#else -static inline int lprocfs_ll_rd(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - return 0; -} -#endif /* LPROC_SNMP */ +static inline struct proc_dir_entry * +lprocfs_register(const char *name, struct proc_dir_entry *parent, + struct lprocfs_vars *list, void *data) { return NULL; } +static inline int lprocfs_add_vars(struct proc_dir_entry *root, + struct lprocfs_vars *var, + void *data) { return 0; } +static inline void lprocfs_remove(struct proc_dir_entry *root) {}; +struct obd_device; +static inline int lprocfs_obd_attach(struct obd_device *dev, + struct lprocfs_vars *list) { return 0; } +static inline int lprocfs_obd_detach(struct obd_device *dev) { return 0; } +static inline int lprocfs_rd_u64(char *page, char **start, off_t off, + int count, int *eof, void *data) { return 0; } +static inline int lprocfs_rd_uuid(char *page, char **start, off_t off, + int count, int *eof, void *data) { return 0; } +static inline int lprocfs_rd_name(char *page, char **start, off_t off, + int count, int *eof, void *data) { return 0; } +static inline int lprocfs_rd_server_uuid(char *page, char **start, off_t off, + int count, int *eof, void *data) { return 0; } +static inline int lprocfs_rd_conn_uuid(char *page, char **start, off_t off, + int count, int *eof, void *data) { return 0; } +static inline int lprocfs_rd_numrefs(char *page, char **start, off_t off, + int count, int *eof, void *data) { return 0; } + +/* Statfs helpers */ +struct statfs; +static inline +int lprocfs_rd_blksize(char *page, char **start, off_t off, + int count, int *eof, struct statfs *sfs) { return 0; } +static inline +int lprocfs_rd_kbytestotal(char *page, char **start, off_t off, + int count, int *eof, struct statfs *sfs) { return 0; } +static inline +int lprocfs_rd_kbytesfree(char *page, char **start, off_t off, + int count, int *eof, 
struct statfs *sfs) { return 0; } +static inline +int lprocfs_rd_filestotal(char *page, char **start, off_t off, + int count, int *eof, struct statfs *sfs) { return 0; } +static inline +int lprocfs_rd_filesfree(char *page, char **start, off_t off, + int count, int *eof, struct statfs *sfs) { return 0; } +static inline +int lprocfs_rd_filegroups(char *page, char **start, off_t off, + int count, int *eof, struct statfs *sfs) { return 0; } + +#define DEFINE_LPROCFS_STATFS_FCT(fct_name, get_statfs_fct) \ +int fct_name(char *page, char **start, off_t off, \ + int count, int *eof, void *data) { *eof = 1; return 0; } + +#endif /* LPROCFS */ #endif /* LPROCFS_SNMP_H */ diff --git a/lustre/include/linux/lustre_dlm.h b/lustre/include/linux/lustre_dlm.h index e552dfd..8c05041 100644 --- a/lustre/include/linux/lustre_dlm.h +++ b/lustre/include/linux/lustre_dlm.h @@ -24,6 +24,7 @@ typedef enum { ELDLM_LOCK_CHANGED = 300, ELDLM_LOCK_ABORTED = 301, + ELDLM_LOCK_REPLACED = 302, ELDLM_NAMESPACE_EXISTS = 400, ELDLM_BAD_NAMESPACE = 401 @@ -55,6 +56,7 @@ typedef enum { #define LDLM_FL_NO_CALLBACK (1 << 11) /* see ldlm_cli_cancel_unused */ #define LDLM_FL_HAS_INTENT (1 << 12) /* lock request has intent */ #define LDLM_FL_CANCELING (1 << 13) /* lock cancel has already been sent */ +#define LDLM_FL_LOCAL (1 << 14) // a local lock (ie, no srv/cli split) /* The blocking callback is overloaded to perform two functions. These flags * indicate which operation should be performed. 
*/ @@ -140,9 +142,10 @@ struct ldlm_lock; typedef int (*ldlm_blocking_callback)(struct ldlm_lock *lock, struct ldlm_lock_desc *new, void *data, - __u32 data_len, int flag); - -typedef int (*ldlm_completion_callback)(struct ldlm_lock *lock, int flags); + int flag); +typedef int (*ldlm_completion_callback)(struct ldlm_lock *lock, int flags, void *data); +typedef int (*ldlm_granted_callback)(struct ldlm_lock *, + struct lustre_msg *, int offset); struct ldlm_lock { struct portals_handle l_handle; // must be first in the structure @@ -162,13 +165,14 @@ struct ldlm_lock { ldlm_completion_callback l_completion_ast; ldlm_blocking_callback l_blocking_ast; + ldlm_granted_callback l_granted_cb; struct obd_export *l_export; struct lustre_handle *l_connh; __u32 l_flags; struct lustre_handle l_remote_handle; void *l_data; - __u32 l_data_len; + void *l_cp_data; struct ldlm_extent l_extent; __u32 l_version[RES_VERSION_SIZE]; @@ -183,7 +187,7 @@ struct ldlm_lock { }; typedef int (*ldlm_res_compat)(struct ldlm_lock *child, struct ldlm_lock *new); -typedef int (*ldlm_res_policy)(struct ldlm_namespace *, struct ldlm_lock *, +typedef int (*ldlm_res_policy)(struct ldlm_namespace *, struct ldlm_lock **, void *req_cookie, ldlm_mode_t mode, int flags, void *data); @@ -210,9 +214,12 @@ struct ldlm_resource { ldlm_mode_t lr_most_restr; __u32 lr_type; /* LDLM_PLAIN or LDLM_EXTENT */ struct ldlm_resource *lr_root; - __u64 lr_name[RES_NAME_SIZE]; + struct ldlm_res_id lr_name; __u32 lr_version[RES_VERSION_SIZE]; atomic_t lr_refcount; + + /* lr_tmp holds a list head temporarily, during the building of a work + * queue. 
see ldlm_add_ast_work_item and ldlm_run_ast_work */ void *lr_tmp; }; @@ -232,21 +239,16 @@ struct ldlm_export_data { struct obd_import led_import; }; -static inline struct ldlm_extent *ldlm_res2extent(struct ldlm_resource *res) -{ - return (struct ldlm_extent *)(res->lr_name); -} - extern struct obd_ops ldlm_obd_ops; extern char *ldlm_lockname[]; extern char *ldlm_typename[]; extern char *ldlm_it2str(int it); -#define LDLM_DEBUG(lock, format, a...) \ +#define __LDLM_DEBUG(level, lock, format, a...) \ do { \ if (lock->l_resource == NULL) { \ - CDEBUG(D_DLMTRACE, "### " format \ + CDEBUG(level, "### " format \ " ns: \?\? lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "\ "res: \?\? rrc=\?\? type: \?\?\? remote: "LPX64")\n" \ , ## a, lock, lock->l_handle.h_cookie, \ @@ -258,7 +260,7 @@ do { \ break; \ } \ if (lock->l_resource->lr_type == LDLM_EXTENT) { \ - CDEBUG(D_DLMTRACE, "### " format \ + CDEBUG(level, "### " format \ " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s " \ "res: "LPU64"/"LPU64" rrc: %d type: %s ["LPU64"->"LPU64\ "] remote: "LPX64"\n" , ## a, \ @@ -267,8 +269,8 @@ do { \ lock->l_readers, lock->l_writers, \ ldlm_lockname[lock->l_granted_mode], \ ldlm_lockname[lock->l_req_mode], \ - lock->l_resource->lr_name[0], \ - lock->l_resource->lr_name[1], \ + lock->l_resource->lr_name.name[0], \ + lock->l_resource->lr_name.name[1], \ atomic_read(&lock->l_resource->lr_refcount), \ ldlm_typename[lock->l_resource->lr_type], \ lock->l_extent.start, lock->l_extent.end, \ @@ -276,7 +278,7 @@ do { \ break; \ } \ { \ - CDEBUG(D_DLMTRACE, "### " format \ + CDEBUG(level, "### " format \ " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s " \ "res: "LPU64"/"LPU64" rrc: %d type: %s remote: "LPX64 \ "\n" , ## a, lock->l_resource->lr_namespace->ns_name, \ @@ -285,14 +287,17 @@ do { \ lock->l_readers, lock->l_writers, \ ldlm_lockname[lock->l_granted_mode], \ ldlm_lockname[lock->l_req_mode], \ - lock->l_resource->lr_name[0], \ - lock->l_resource->lr_name[1], \ + 
lock->l_resource->lr_name.name[0], \ + lock->l_resource->lr_name.name[1], \ atomic_read(&lock->l_resource->lr_refcount), \ ldlm_typename[lock->l_resource->lr_type], \ lock->l_remote_handle.cookie); \ } \ } while (0) +#define LDLM_DEBUG(lock, format, a...) __LDLM_DEBUG(D_DLMTRACE, lock, format, a) +#define LDLM_ERROR(lock, format, a...) __LDLM_DEBUG(D_ERROR, lock, format, a) + #define LDLM_DEBUG_NOLOCK(format, a...) \ CDEBUG(D_DLMTRACE, "### " format "\n" , ## a) @@ -317,11 +322,15 @@ int ldlm_replay_locks(struct obd_import *imp); /* ldlm_extent.c */ int ldlm_extent_compat(struct ldlm_lock *, struct ldlm_lock *); -int ldlm_extent_policy(struct ldlm_namespace *, struct ldlm_lock *, void *, +int ldlm_extent_policy(struct ldlm_namespace *, struct ldlm_lock **, void *, ldlm_mode_t, int flags, void *); /* ldlm_lockd.c */ -int ldlm_handle_enqueue(struct ptlrpc_request *req); +int ldlm_server_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *, + void *data, int flag); +int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data); +int ldlm_handle_enqueue(struct ptlrpc_request *req, ldlm_completion_callback, + ldlm_blocking_callback); int ldlm_handle_convert(struct ptlrpc_request *req); int ldlm_handle_cancel(struct ptlrpc_request *req); int ldlm_del_waiting_lock(struct ldlm_lock *lock); @@ -332,7 +341,7 @@ void ldlm_unregister_intent(void); void ldlm_lock2handle(struct ldlm_lock *lock, struct lustre_handle *lockh); struct ldlm_lock *__ldlm_handle2lock(struct lustre_handle *, int flags); void ldlm_cancel_callback(struct ldlm_lock *); -int ldlm_lock_set_data(struct lustre_handle *, void *data, int datalen); +int ldlm_lock_set_data(struct lustre_handle *, void *data, void *cp_data); void ldlm_lock_remove_from_lru(struct ldlm_lock *); static inline struct ldlm_lock *ldlm_handle2lock(struct lustre_handle *h) @@ -342,14 +351,14 @@ static inline struct ldlm_lock *ldlm_handle2lock(struct lustre_handle *h) #define LDLM_LOCK_PUT(lock) \ do { \ - 
/*LDLM_DEBUG(lock, "put");*/ \ + /*LDLM_DEBUG((lock), "put");*/ \ ldlm_lock_put(lock); \ } while (0) #define LDLM_LOCK_GET(lock) \ ({ \ ldlm_lock_get(lock); \ - /*LDLM_DEBUG(lock, "get");*/ \ + /*LDLM_DEBUG((lock), "get");*/ \ lock; \ }) @@ -360,16 +369,16 @@ void ldlm_lock2desc(struct ldlm_lock *lock, struct ldlm_lock_desc *desc); void ldlm_lock_addref(struct lustre_handle *lockh, __u32 mode); void ldlm_lock_addref_internal(struct ldlm_lock *, __u32 mode); void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode); -void ldlm_grant_lock(struct ldlm_lock *lock); -int ldlm_lock_match(struct ldlm_namespace *ns, __u64 *res_id, __u32 type, - void *cookie, int cookielen, ldlm_mode_t mode, - struct lustre_handle *lockh); +void ldlm_lock_decref_and_cancel(struct lustre_handle *lockh, __u32 mode); +void ldlm_grant_lock(struct ldlm_lock *lock, void *data, int datalen); +int ldlm_lock_match(struct ldlm_namespace *ns, int flags, struct ldlm_res_id *, + __u32 type, void *cookie, int cookielen, ldlm_mode_t mode, + struct lustre_handle *); struct ldlm_lock * ldlm_lock_create(struct ldlm_namespace *ns, - struct lustre_handle *parent_lock_handle, - __u64 *res_id, __u32 type, ldlm_mode_t mode, void *data, - __u32 data_len); -ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *, struct ldlm_lock *, + struct lustre_handle *parent_lock_handle, struct ldlm_res_id, + __u32 type, ldlm_mode_t mode, void *data, void *cp_data); +ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *, struct ldlm_lock **, void *cookie, int cookie_len, int *flags, ldlm_completion_callback completion, ldlm_blocking_callback blocking); @@ -403,7 +412,8 @@ void ldlm_proc_cleanup(struct obd_device *obd); /* resource.c - internal */ struct ldlm_resource *ldlm_resource_get(struct ldlm_namespace *ns, struct ldlm_resource *parent, - __u64 *name, __u32 type, int create); + struct ldlm_res_id, __u32 type, + int create); struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res); int 
ldlm_resource_putref(struct ldlm_resource *res); void ldlm_resource_add_lock(struct ldlm_resource *res, struct list_head *head, @@ -414,16 +424,16 @@ void ldlm_dump_all_namespaces(void); void ldlm_namespace_dump(struct ldlm_namespace *); void ldlm_resource_dump(struct ldlm_resource *); int ldlm_lock_change_resource(struct ldlm_namespace *, struct ldlm_lock *, - __u64 new_resid[3]); + struct ldlm_res_id); /* ldlm_request.c */ int ldlm_expired_completion_wait(void *data); -int ldlm_completion_ast(struct ldlm_lock *lock, int flags); +int ldlm_completion_ast(struct ldlm_lock *lock, int flags, void *data); int ldlm_cli_enqueue(struct lustre_handle *conn, struct ptlrpc_request *req, struct ldlm_namespace *ns, struct lustre_handle *parent_lock_handle, - __u64 *res_id, + struct ldlm_res_id, __u32 type, void *cookie, int cookielen, ldlm_mode_t mode, @@ -431,13 +441,13 @@ int ldlm_cli_enqueue(struct lustre_handle *conn, ldlm_completion_callback completion, ldlm_blocking_callback callback, void *data, - __u32 data_len, + void *cp_data, struct lustre_handle *lockh); int ldlm_match_or_enqueue(struct lustre_handle *connh, struct ptlrpc_request *req, struct ldlm_namespace *ns, struct lustre_handle *parent_lock_handle, - __u64 *res_id, + struct ldlm_res_id, __u32 type, void *cookie, int cookielen, ldlm_mode_t mode, @@ -445,19 +455,20 @@ int ldlm_match_or_enqueue(struct lustre_handle *connh, ldlm_completion_callback completion, ldlm_blocking_callback callback, void *data, - __u32 data_len, + void *cp_data, struct lustre_handle *lockh); int ldlm_server_ast(struct lustre_handle *lockh, struct ldlm_lock_desc *new, void *data, __u32 data_len); int ldlm_cli_convert(struct lustre_handle *, int new_mode, int *flags); int ldlm_cli_cancel(struct lustre_handle *lockh); -int ldlm_cli_cancel_unused(struct ldlm_namespace *, __u64 *, int flags); +int ldlm_cli_cancel_unused(struct ldlm_namespace *, struct ldlm_res_id *, + int flags); int ldlm_cancel_lru(struct ldlm_namespace *ns); /* 
mds/handler.c */ /* This has to be here because recurisve inclusion sucks. */ int mds_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, - void *data, __u32 data_len, int flag); + void *data, int flag); #endif /* __KERNEL__ */ diff --git a/lustre/include/linux/lustre_export.h b/lustre/include/linux/lustre_export.h index 342721c..74b8dca 100644 --- a/lustre/include/linux/lustre_export.h +++ b/lustre/include/linux/lustre_export.h @@ -23,11 +23,17 @@ struct lov_export_data { }; struct ost_export_data { - __u8 oed_uuid[37]; /* client UUID */ + struct obd_uuid oed_uuid; /* client UUID */ +}; + +struct ec_export_data { /* echo client */ + struct list_head eced_open_head; + struct list_head eced_locks; }; struct obd_export { __u64 exp_cookie; + struct obd_uuid exp_client_uuid; struct list_head exp_obd_chain; struct list_head exp_conn_chain; struct obd_device *exp_obd; @@ -38,6 +44,7 @@ struct obd_export { struct filter_export_data eu_filter_data; struct lov_export_data eu_lov_data; struct ost_export_data eu_ost_data; + struct ec_export_data eu_ec_data; } u; }; @@ -45,6 +52,7 @@ struct obd_export { #define exp_lov_data u.eu_lov_data #define exp_filter_data u.eu_filter_data #define exp_ost_data u.eu_ost_data +#define exp_ec_data u.eu_ec_data extern struct obd_export *class_conn2export(struct lustre_handle *conn); extern struct obd_device *class_conn2obd(struct lustre_handle *conn); diff --git a/lustre/include/linux/lustre_fsfilt.h b/lustre/include/linux/lustre_fsfilt.h index eeae647..341d082 100644 --- a/lustre/include/linux/lustre_fsfilt.h +++ b/lustre/include/linux/lustre_fsfilt.h @@ -56,6 +56,7 @@ struct fsfilt_operations { int (* fs_set_last_rcvd)(struct obd_device *obd, __u64 last_rcvd, void *handle, fsfilt_cb_t cb_func); int (* fs_statfs)(struct super_block *sb, struct obd_statfs *osfs); + int (* fs_sync)(struct super_block *sb); }; extern int fsfilt_register_ops(struct fsfilt_operations *fs_ops); @@ -146,6 +147,11 @@ static inline int 
fsfilt_statfs(struct obd_device *obd, struct super_block *fs, return obd->obd_fsops->fs_statfs(fs, osfs); } +static inline int fsfilt_sync(struct obd_device *obd, struct super_block *fs) +{ /* Calls the fs_sync method from obd->obd_fsops on the given super_block and returns its result; the method pointer is not NULL-checked here. */ + return obd->obd_fsops->fs_sync(fs); +} + #endif /* __KERNEL__ */ #endif diff --git a/lustre/include/linux/lustre_idl.h b/lustre/include/linux/lustre_idl.h index cc194ac14..6e11240 100644 --- a/lustre/include/linux/lustre_idl.h +++ b/lustre/include/linux/lustre_idl.h @@ -47,7 +47,15 @@ /* * GENERAL STUFF */ -typedef __u8 obd_uuid_t[37]; +struct obd_uuid { + __u8 uuid[37]; +}; + +static inline void obd_str2uuid(struct obd_uuid *uuid, char *tmp) +{ /* Bounded copy of tmp into uuid->uuid; strncpy does not terminate the destination on truncation, so the last byte is forced to NUL afterwards. NOTE(review): uuid->uuid is __u8[] but strncpy presumably takes char * here -- may raise a pointer-signedness warning; confirm. */ + strncpy(uuid->uuid, tmp, sizeof(uuid->uuid)); + uuid->uuid[sizeof(uuid->uuid) - 1] = '\0'; +} /* FOO_REQUEST_PORTAL is for incoming requests on the FOO * FOO_REPLY_PORTAL is for incoming replies on the FOO @@ -75,6 +83,7 @@ typedef __u8 obd_uuid_t[37]; #define PTLBD_REQUEST_PORTAL 19 #define PTLBD_REPLY_PORTAL 20 #define PTLBD_BULK_PORTAL 21 +#define MDS_GETATTR_PORTAL 22 #define SVC_KILLED 1 #define SVC_EVENT 2 @@ -133,9 +142,6 @@ struct lustre_msg { #define MSG_LAST_REPLAY 1 #define MSG_RESENT 2 -/* XXX horrible interim hack -- see bug 578 */ -#define MSG_REPLAY_IN_PROGRESS 4 - static inline int lustre_msg_get_flags(struct lustre_msg *msg) { return (msg->flags & MSG_GEN_FLAG_MASK); @@ -157,14 +163,24 @@ static inline int lustre_msg_get_op_flags(struct lustre_msg *msg) return (msg->flags >> MSG_OP_FLAG_SHIFT); } +static inline void lustre_msg_add_op_flags(struct lustre_msg *msg, int flags) +{ /* ORs the masked, shifted flags into msg->flags without clearing op-flag bits that are already set; lustre_msg_set_op_flags clears MSG_OP_FLAG_MASK first and then calls this. */ + msg->flags |= ((flags & MSG_GEN_FLAG_MASK) << MSG_OP_FLAG_SHIFT); +} + static inline void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags) { msg->flags &= ~MSG_OP_FLAG_MASK; - msg->flags |= ((flags & MSG_GEN_FLAG_MASK) << MSG_OP_FLAG_SHIFT); + lustre_msg_add_op_flags(msg, flags); } -#define CONNMGR_REPLY 0 -#define CONNMGR_CONNECT 1 +/* + * Flags for all connect opcodes (MDS_CONNECT, OST_CONNECT) + */ + +#define 
MSG_CONNECT_RECOVERING 0x1 +#define MSG_CONNECT_RECONNECT 0x2 +#define MSG_CONNECT_REPLAYABLE 0x4 /* * OST requests: OBDO & OBD request records @@ -305,16 +321,6 @@ struct niobuf_remote { __u32 flags; }; -#define CONNMGR_REPLY 0 -#define CONNMGR_CONNECT 1 - -struct connmgr_body { - __u64 conn; - __u64 conn_token; - __u32 generation; - obd_uuid_t conn_uuid; -}; - /* request structure for OST's */ #define OST_REQ_HAS_OA1 0x1 @@ -328,24 +334,33 @@ struct ost_body { */ /* opcodes */ -#define MDS_GETATTR 1 -#define MDS_OPEN 2 -#define MDS_CLOSE 3 -#define MDS_REINT 4 -#define MDS_READPAGE 6 -#define MDS_CONNECT 7 -#define MDS_DISCONNECT 8 -#define MDS_GETSTATUS 9 -#define MDS_STATFS 10 -#define MDS_GETLOVINFO 11 -#define MDS_GETATTR_NAME 12 +#define MDS_GETATTR 1 +#define MDS_GETATTR_NAME 2 +#define MDS_CLOSE 3 +#define MDS_REINT 4 +#define MDS_READPAGE 6 +#define MDS_CONNECT 7 +#define MDS_DISCONNECT 8 +#define MDS_GETSTATUS 9 +#define MDS_STATFS 10 +#define MDS_GETLOVINFO 11 #define REINT_SETATTR 1 #define REINT_CREATE 2 #define REINT_LINK 3 #define REINT_UNLINK 4 #define REINT_RENAME 5 -#define REINT_MAX 5 +#define REINT_OPEN 6 +#define REINT_MAX 6 + +#define IT_INTENT_EXEC 1 +#define IT_OPEN_LOOKUP (1 << 1) +#define IT_OPEN_NEG (1 << 2) +#define IT_OPEN_POS (1 << 3) +#define IT_OPEN_CREATE (1 << 4) +#define IT_OPEN_OPEN (1 << 5) + +#define IT_UNLINK (1<<8) #define REINT_OPCODE_MASK 0xff /* opcodes must fit into this mask */ #define REINT_REPLAYING 0x1000 /* masked into the opcode to indicate replay */ @@ -383,6 +398,7 @@ struct mds_body { struct ll_fid fid2; struct lustre_handle handle; __u64 size; + __u64 blocks; /* XID, in the case of MDS_READPAGE */ __u32 ino; /* make this a __u64 */ __u32 valid; __u32 fsuid; @@ -398,6 +414,7 @@ struct mds_body { __u32 rdev; __u32 nlink; __u32 generation; + __u32 suppgid; }; /* This is probably redundant with OBD_MD_FLEASIZE, but we need an audit */ @@ -426,6 +443,7 @@ struct mds_rec_setattr { __u64 sa_atime; __u64 sa_mtime; 
__u64 sa_ctime; + __u32 sa_suppgid; }; struct mds_rec_create { @@ -433,7 +451,7 @@ struct mds_rec_create { __u32 cr_fsuid; __u32 cr_fsgid; __u32 cr_cap; - __u32 cr_reserved; + __u32 cr_flags; /* for use with open */ __u32 cr_mode; struct ll_fid cr_fid; struct ll_fid cr_replayfid; @@ -441,6 +459,7 @@ struct mds_rec_create { __u32 cr_gid; __u64 cr_time; __u64 cr_rdev; + __u32 cr_suppgid; }; struct mds_rec_link { @@ -448,6 +467,7 @@ struct mds_rec_link { __u32 lk_fsuid; __u32 lk_fsgid; __u32 lk_cap; + __u32 lk_suppgid; struct ll_fid lk_fid1; struct ll_fid lk_fid2; }; @@ -459,6 +479,7 @@ struct mds_rec_unlink { __u32 ul_cap; __u32 ul_reserved; __u32 ul_mode; + __u32 ul_suppgid; struct ll_fid ul_fid1; struct ll_fid ul_fid2; }; @@ -487,7 +508,7 @@ struct lov_desc { __u64 ld_default_stripe_size; /* in bytes */ __u64 ld_default_stripe_offset; /* in bytes */ __u32 ld_pattern; /* RAID 0,1 etc */ - obd_uuid_t ld_uuid; + struct obd_uuid ld_uuid; }; /* @@ -503,6 +524,10 @@ struct lov_desc { #define RES_NAME_SIZE 3 #define RES_VERSION_SIZE 4 +struct ldlm_res_id { + __u64 name[RES_NAME_SIZE]; +}; + /* lock types */ typedef enum { LCK_EX = 1, @@ -526,7 +551,7 @@ struct ldlm_intent { * below, we're probably fine. 
*/ struct ldlm_resource_desc { __u32 lr_type; - __u64 lr_name[RES_NAME_SIZE]; + struct ldlm_res_id lr_name; __u32 lr_version[RES_VERSION_SIZE]; }; @@ -548,7 +573,7 @@ struct ldlm_request { struct ldlm_reply { __u32 lock_flags; __u32 lock_mode; - __u64 lock_resource_name[RES_NAME_SIZE]; + struct ldlm_res_id lock_resource_name; struct lustre_handle lock_handle; struct ldlm_extent lock_extent; /* XXX make this policy 1 &2 */ __u64 lock_policy_res1; diff --git a/lustre/include/linux/lustre_import.h b/lustre/include/linux/lustre_import.h index 0f0d67d..36cd54f 100644 --- a/lustre/include/linux/lustre_import.h +++ b/lustre/include/linux/lustre_import.h @@ -18,6 +18,7 @@ typedef int (*import_recover_t)(struct obd_import *imp, int phase); #include + struct obd_import { import_recover_t imp_recover; struct ptlrpc_connection *imp_connection; @@ -36,11 +37,11 @@ struct obd_import { int imp_flags; int imp_level; __u64 imp_last_xid; + __u64 imp_last_bulk_xid; __u64 imp_max_transno; - __u64 imp_peer_last_xid; __u64 imp_peer_committed_transno; - /* Protects flags, level, *_xid, *_list */ + /* Protects flags, level, last_xid, *_list */ spinlock_t imp_lock; }; diff --git a/lustre/include/linux/lustre_lib.h b/lustre/include/linux/lustre_lib.h index b1f9288..54750c0 100644 --- a/lustre/include/linux/lustre_lib.h +++ b/lustre/include/linux/lustre_lib.h @@ -18,7 +18,7 @@ * along with Lustre; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * - * Basic Lustre library routines. + * Basic Lustre library routines. 
* */ @@ -59,20 +59,22 @@ struct obd_export; int target_handle_connect(struct ptlrpc_request *req); int target_handle_disconnect(struct ptlrpc_request *req); int target_handle_reconnect(struct lustre_handle *conn, struct obd_export *exp, - char *cluuid); + struct obd_uuid *cluuid); int client_obd_connect(struct lustre_handle *conn, struct obd_device *obd, - obd_uuid_t cluuid, struct recovd_obd *recovd, + struct obd_uuid *cluuid, struct recovd_obd *recovd, ptlrpc_recovery_cb_t recover); int client_obd_disconnect(struct lustre_handle *conn); int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf); int client_obd_cleanup(struct obd_device * obddev); -struct client_obd *client_conn2cli(struct lustre_handle *conn); -struct obd_device *client_tgtuuid2obd(char *tgtuuid); +struct client_obd *client_conn2cli(struct lustre_handle *conn); +struct obd_device *client_tgtuuid2obd(struct obd_uuid *tgtuuid); int target_revoke_connection(struct recovd_data *rd, int phase); +int obd_self_statfs(struct obd_device *dev, struct statfs *sfs); + /* l_lock.c */ -struct lustre_lock { +struct lustre_lock { int l_depth; struct task_struct *l_owner; struct semaphore l_sem; @@ -131,9 +133,9 @@ static inline void ll_sleep(int t) /* FIXME: This needs to validate pointers and cookies */ static inline void *lustre_handle2object(struct lustre_handle *handle) { - if (handle) + if (handle) return (void *)(unsigned long)(handle->addr); - return NULL; + return NULL; } static inline void ldlm_object2handle(void *object, struct lustre_handle *handle) @@ -279,7 +281,7 @@ static inline int obd_ioctl_is_invalid(struct obd_ioctl_data *data) printk("OBD ioctl: inlbuf3 not 0 terminated\n"); return 1; } -#endif +#endif return 0; } @@ -457,16 +459,24 @@ static inline int obd_ioctl_getdata(char **buf, int *len, void *arg) #define OBD_IOC_RECOVD_FAILCONN _IOWR('f', 136, long) #define OBD_IOC_DEC_FS_USE_COUNT _IO ('f', 139 ) +#define OBD_IOC_NO_TRANSNO _IOW ('f', 140, long) +#define 
OBD_IOC_SET_READONLY _IOW ('f', 141, long) #define OBD_GET_VERSION _IOWR ('f', 144, long) +#define ECHO_IOC_GET_STRIPE _IOWR('f', 200, long) +#define ECHO_IOC_SET_STRIPE _IOWR('f', 201, long) +#define ECHO_IOC_ENQUEUE _IOWR('f', 202, long) +#define ECHO_IOC_CANCEL _IOWR('f', 203, long) + + /* * l_wait_event is a flexible sleeping function, permitting simple caller * configuration of interrupt and timeout sensitivity along with actions to * be performed in the event of either exception. * * Common usage looks like this: - * + * * struct l_wait_info lwi = LWI_TIMEOUT_INTR(timeout, timeout_handler, * intr_handler, callback_data); * rc = l_wait_event(waitq, condition, &lwi); diff --git a/lustre/include/linux/lustre_lite.h b/lustre/include/linux/lustre_lite.h index deb9656..0c56fcd 100644 --- a/lustre/include/linux/lustre_lite.h +++ b/lustre/include/linux/lustre_lite.h @@ -62,7 +62,9 @@ struct ll_inode_info { #endif }; - +/* interpret return codes from intent lookup */ +#define LL_LOOKUP_POSITIVE 1 +#define LL_LOOKUP_NEGATIVE 2 #define LL_SUPER_MAGIC 0x0BD00BD0 @@ -73,7 +75,7 @@ struct ll_inode_info { #define LL_SBI_NOLCK 0x1 struct ll_sb_info { - obd_uuid_t ll_sb_uuid; + struct obd_uuid ll_sb_uuid; struct lustre_handle ll_mdc_conn; struct lustre_handle ll_osc_conn; struct proc_dir_entry* ll_proc_root; @@ -120,6 +122,28 @@ static inline struct ll_sb_info *ll_i2sbi(struct inode *inode) return ll_s2sbi(inode->i_sb); } +static inline void d_unhash_aliases(struct inode *inode) +{ /* For every dentry on inode->i_dentry: unlink it from its d_hash list, set DCACHE_LUSTRE_INVALID in d_flags, and re-link it via d_hash onto sbi->ll_orphan_dentry_list; the whole walk runs under dcache_lock. */ + struct dentry *dentry = NULL; + struct list_head *tmp; + struct ll_sb_info *sbi = ll_i2sbi(inode); + ENTRY; + + CDEBUG(D_INODE, "marking dentries for ino %lx/%x invalid\n", + inode->i_ino, inode->i_generation); + + spin_lock(&dcache_lock); + list_for_each(tmp, &inode->i_dentry) { /* walks the d_alias links; only d_hash is relinked below, so the plain (non-_safe) iterator is not disturbed */ + dentry = list_entry(tmp, struct dentry, d_alias); + + list_del_init(&dentry->d_hash); + dentry->d_flags |= DCACHE_LUSTRE_INVALID; + list_add(&dentry->d_hash, &sbi->ll_orphan_dentry_list); + } + + 
spin_unlock(&dcache_lock); + EXIT; +} // FIXME: replace the name of this with LL_I to conform to kernel stuff // static inline struct ll_inode_info *LL_I(struct inode *inode) @@ -169,7 +193,6 @@ int ll_intent_lock(struct inode *parent, struct dentry **, /* dcache.c */ void ll_intent_release(struct dentry *, struct lookup_intent *); -int ll_set_dd(struct dentry *de); /**** @@ -220,14 +243,15 @@ extern struct inode_operations ll_dir_inode_operations; /* file.c */ extern struct file_operations ll_file_operations; extern struct inode_operations ll_file_inode_operations; +extern struct inode_operations ll_special_inode_operations; struct ldlm_lock; -int ll_lock_callback(struct ldlm_lock *, struct ldlm_lock_desc *, void *data, - __u32 data_len, int flag); +int ll_lock_callback(struct ldlm_lock *, struct ldlm_lock_desc *, void *data, int flag); int ll_size_lock(struct inode *, struct lov_stripe_md *, obd_off start, int mode, struct lustre_handle *); int ll_size_unlock(struct inode *, struct lov_stripe_md *, int mode, struct lustre_handle *); -int ll_file_size(struct inode *inode, struct lov_stripe_md *md); +int ll_file_size(struct inode *inode, struct lov_stripe_md *md, + struct lustre_handle *); int ll_create_objects(struct super_block *sb, obd_id id, uid_t uid, gid_t gid, struct lov_stripe_md **lsmp); @@ -237,7 +261,7 @@ struct page *ll_getpage(struct inode *inode, unsigned long offset, void ll_truncate(struct inode *inode); /* super.c */ -void ll_update_inode(struct inode *, struct mds_body *); +void ll_update_inode(struct inode *, struct mds_body *, struct lov_mds_md *); /* symlink.c */ extern struct inode_operations ll_fast_symlink_inode_operations; diff --git a/lustre/include/linux/lustre_mds.h b/lustre/include/linux/lustre_mds.h index 7a02dae..133f7af 100644 --- a/lustre/include/linux/lustre_mds.h +++ b/lustre/include/linux/lustre_mds.h @@ -1,7 +1,7 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * 
Copyright (C) 2001 Cluster File Systems, Inc. + * Copyright (C) 2001-2003 Cluster File Systems, Inc. * * This file is part of Lustre, http://www.lustre.org. * @@ -29,6 +29,7 @@ #ifdef __KERNEL__ #include +#include #include struct ldlm_lock_desc; @@ -38,11 +39,57 @@ struct ptlrpc_client; struct obd_export; struct ptlrpc_request; struct obd_device; +struct ll_file_data; #define LUSTRE_MDS_NAME "mds" #define LUSTRE_MDT_NAME "mdt" #define LUSTRE_MDC_NAME "mdc" +struct mdc_rpc_lock { /* a semaphore plus the lookup intent recorded by mdc_get_rpc_lock (NULL when none) */ + struct semaphore rpcl_sem; + struct lookup_intent *rpcl_it; +}; +extern struct mdc_rpc_lock mdc_rpc_lock; + +static inline void mdc_init_rpc_lock(struct mdc_rpc_lock *lck) +{ /* Initialise the semaphore with a count of 1 and record that no intent is attached. */ + sema_init(&lck->rpcl_sem, 1); + lck->rpcl_it = NULL; +} + +static inline void mdc_get_rpc_lock(struct mdc_rpc_lock *lck, + struct lookup_intent *it) +{ /* Acquire rpcl_sem; when an intent is supplied, remember it in rpcl_it and set it->it_iattr to a non-NULL marker (1), which mdc_put_rpc_lock tests before releasing. */ + down(&lck->rpcl_sem); + if (it) { + lck->rpcl_it = it; + it->it_iattr = (void *)1; + } +} + +static inline void mdc_put_rpc_lock(struct mdc_rpc_lock *lck, + struct lookup_intent *it) +{ /* Release rpcl_sem. NULL intent: assert that none is recorded, then up(). Non-NULL intent: release only if its it_iattr marker is still set, clearing the marker and rpcl_it first; if the marker is already clear nothing is done. */ + if (it == NULL) { + LASSERT(it == lck->rpcl_it); + up(&lck->rpcl_sem); + return; + } + if (it && it->it_iattr) { + it->it_iattr = NULL; + LASSERT(it == lck->rpcl_it); + lck->rpcl_it = NULL; + up(&lck->rpcl_sem); + } +} +struct mdc_unlink_data { + struct inode *unl_dir; + struct inode *unl_de; + int unl_mode; + const char *unl_name; + int unl_len; +}; + struct mds_update_record { __u32 ur_fsuid; __u32 ur_fsgid; @@ -60,6 +107,8 @@ struct mds_update_record { __u32 ur_uid; __u32 ur_gid; __u64 ur_time; + __u32 ur_flags; + __u32 ur_suppgid; }; #define MDS_LR_CLIENT 8192 @@ -68,6 +117,7 @@ struct mds_update_record { #define MDS_CLIENT_SLOTS 17 #define MDS_MOUNT_RECOV 2 +#define MDS_RECOVERY_TIMEOUT (obd_timeout * 5 * HZ / 2) /* *waves hands* */ /* Data stored per server at the head of the last_rcvd file. In le32 order. 
*/ struct mds_server_data { @@ -94,19 +144,25 @@ struct mds_export_data { spinlock_t med_open_lock; struct mds_client_data *med_mcd; int med_off; + __u64 med_last_xid; + struct lustre_msg *med_last_reply; + int med_last_replen; }; /* file data for open files on MDS */ struct mds_file_data { struct list_head mfd_list; - struct lustre_handle mfd_clienthandle; __u64 mfd_servercookie; struct file *mfd_file; }; /* mds/mds_reint.c */ int mds_reint_rec(struct mds_update_record *r, int offset, - struct ptlrpc_request *req); + struct ptlrpc_request *req, struct lustre_handle *); + +/* mds/mds_open.c */ +int mds_open(struct mds_update_record *rec, int offset, + struct ptlrpc_request *req, struct lustre_handle *); /* lib/mds_updates.c */ void mds_unpack_body(struct mds_body *b); @@ -117,16 +173,20 @@ void mds_pack_rep_body(struct ptlrpc_request *); int mds_update_unpack(struct ptlrpc_request *, int offset, struct mds_update_record *); -void mds_readdir_pack(struct ptlrpc_request *req, __u64 offset, - obd_id ino, int type); -void mds_getattr_pack(struct ptlrpc_request *req, int offset, +void mds_readdir_pack(struct ptlrpc_request *req, __u64 offset, obd_id ino, + int type, __u64 xid); +void mds_getattr_pack(struct ptlrpc_request *req, int valid, int offset, int fl, struct inode *inode, const char *name, int namelen); -void mds_setattr_pack(struct ptlrpc_request *, int offset, struct inode *, - struct iattr *, const char *name, int namelen); +void mds_setattr_pack(struct ptlrpc_request *, struct inode *, + struct iattr *, void *ea, int ealen); void mds_create_pack(struct ptlrpc_request *, int offset, struct inode *dir, __u32 mode, __u64 rdev, __u32 uid, __u32 gid, __u64 time, const char *name, int namelen, const void *data, int datalen); +void mds_open_pack(struct ptlrpc_request *, int offset, struct inode *dir, + __u32 mode, __u64 rdev, __u32 uid, __u32 gid, __u64 time, + __u32 flags, const char *name, int namelen, + const void *data, int datalen); void mds_unlink_pack(struct 
ptlrpc_request *, int offset, struct inode *inode, struct inode *child, __u32 mode, const char *name, int namelen); @@ -149,8 +209,8 @@ struct dentry *mds_fid2locked_dentry(struct obd_device *obd, struct ll_fid *fid, struct lustre_handle *lockh); struct dentry *mds_fid2dentry(struct mds_obd *mds, struct ll_fid *fid, struct vfsmount **mnt); -int mds_reint(struct ptlrpc_request *req, int offset); -int mds_pack_md(struct mds_obd *mds, struct ptlrpc_request *req, +int mds_reint(struct ptlrpc_request *req, int offset, struct lustre_handle *); +int mds_pack_md(struct obd_device *mds, struct lustre_msg *msg, int offset, struct mds_body *body, struct inode *inode); /* mds/mds_fs.c */ @@ -173,10 +233,12 @@ int mdc_getattr_name(struct lustre_handle *conn, struct inode *parent, char *filename, int namelen, unsigned long valid, unsigned int ea_size, struct ptlrpc_request **request); int mdc_setattr(struct lustre_handle *conn, - struct inode *, struct iattr *iattr, struct ptlrpc_request **); + struct inode *, struct iattr *iattr, + void *ea, int ealen, struct ptlrpc_request **); int mdc_open(struct lustre_handle *conn, obd_id ino, int type, int flags, struct lov_mds_md *lmm, int lmm_size, struct lustre_handle *fh, struct ptlrpc_request **); +void mdc_set_open_replay_data(struct ll_file_data *fd); int mdc_close(struct lustre_handle *conn, obd_id ino, int type, struct lustre_handle *fh, struct ptlrpc_request **req); int mdc_readpage(struct lustre_handle *conn, obd_id ino, @@ -189,13 +251,14 @@ int mdc_unlink(struct lustre_handle *, struct inode *dir, struct inode *child, __u32 mode, const char *name, int namelen, struct ptlrpc_request **); int mdc_link(struct lustre_handle *conn, - struct dentry *src, struct inode *dir, const char *name, + struct inode *src, struct inode *dir, const char *name, int namelen, struct ptlrpc_request **); int mdc_rename(struct lustre_handle *conn, struct inode *src, struct inode *tgt, const char *old, int oldlen, const char *new, int newlen, struct 
ptlrpc_request **); -int mdc_create_client(obd_uuid_t uuid, struct ptlrpc_client *cl); +int mdc_create_client(struct obd_uuid uuid, struct ptlrpc_client *cl); +void mdc_lock_set_inode(struct lustre_handle *lock, struct inode *inode); /* Store the generation of a newly-created inode in |req| for replay. */ void mdc_store_inode_generation(struct ptlrpc_request *req, int reqoff, diff --git a/lustre/include/linux/lustre_net.h b/lustre/include/linux/lustre_net.h index 081492c..e2c9db3 100644 --- a/lustre/include/linux/lustre_net.h +++ b/lustre/include/linux/lustre_net.h @@ -47,22 +47,47 @@ */ #define LDLM_NUM_THREADS 4 -#define LDLM_NEVENTS 1024 -#define LDLM_NBUFS 100 +#define LDLM_NEVENT_MAX 8192UL +#define LDLM_NEVENTS min(num_physpages / 64, LDLM_NEVENT_MAX) +#define LDLM_NBUF_MAX 256UL +#define LDLM_NBUFS min(LDLM_NEVENTS / 16, LDLM_NBUF_MAX) #define LDLM_BUFSIZE (8 * 1024) #define LDLM_MAXREQSIZE 1024 #define MDT_NUM_THREADS 8 -#define MDS_NEVENTS 1024 -#define MDS_NBUFS 100 +#define MDS_NEVENT_MAX 8192UL +#define MDS_NEVENTS min(num_physpages / 64, MDS_NEVENT_MAX) +#define MDS_NBUF_MAX 512UL +#define MDS_NBUFS min(MDS_NEVENTS / 16, MDS_NBUF_MAX) #define MDS_BUFSIZE (8 * 1024) -#define MDS_MAXREQSIZE 1024 +/* Assume file name length = FNAME_MAX = 256 (true for extN). + * path name length = PATH_MAX = 4096 + * LOV MD size max = EA_MAX = 4000 + * symlink: FNAME_MAX + PATH_MAX <- largest + * link: FNAME_MAX + PATH_MAX (mds_rec_link < mds_rec_create) + * rename: FNAME_MAX + FNAME_MAX + * open: FNAME_MAX + EA_MAX + * + * MDS_MAXREQSIZE ~= 4736 bytes = + * lustre_msg + ldlm_request + mds_body + mds_rec_create + FNAME_MAX + PATH_MAX + * + * Realistic size is about 512 bytes (20 character name + 128 char symlink), + * except in the open case where there are a large number of OSTs in a LOV. 
+ */ +#define MDS_MAXREQSIZE (5 * 1024) #define OST_NUM_THREADS 6 -#define OST_NEVENTS min(num_physpages / 16, 32768UL) -#define OST_NBUFS min(OST_NEVENTS / 128, 1280UL) -#define OST_BUFSIZE ((OST_NEVENTS > 4096UL ? 32 : 8) * 1024) -#define OST_MAXREQSIZE (8 * 1024) +#define OST_NEVENT_MAX 32768UL +#define OST_NEVENTS min(num_physpages / 16, OST_NEVENT_MAX) +#define OST_NBUF_MAX 1280UL +#define OST_NBUFS min(OST_NEVENTS / 64, OST_NBUF_MAX) +#define OST_BUFSIZE (8 * 1024) +/* OST_MAXREQSIZE ~= 1896 bytes = + * lustre_msg + obdo + 16 * obd_ioobj + 64 * niobuf_remote + * + * single object with 16 pages is 576 bytes + */ +#define OST_MAXREQSIZE (2 * 1024) #define PTLBD_NUM_THREADS 4 #define PTLBD_NEVENTS 1024 @@ -75,8 +100,8 @@ struct ptlrpc_connection { struct list_head c_link; struct lustre_peer c_peer; - __u8 c_local_uuid[37]; /* XXX do we need this? */ - __u8 c_remote_uuid[37]; + struct obd_uuid c_local_uuid; /* XXX do we need this? */ + struct obd_uuid c_remote_uuid; __u32 c_generation; /* changes upon new connection */ __u32 c_epoch; /* changes when peer changes */ @@ -160,19 +185,25 @@ struct ptlrpc_request { struct ptlrpc_service *rq_svc; void (*rq_replay_cb)(struct ptlrpc_request *); + void *rq_replay_data; }; #define DEBUG_REQ(level, req, fmt, args...) \ do { \ CDEBUG(level, \ "@@@ " fmt " req@%p x"LPD64"/t"LPD64" o%d->%s:%d lens %d/%d ref %d fl " \ - "%x\n" , ## args, req, req->rq_xid, req->rq_reqmsg->transno, \ + "%x/%x/%x rc %x\n" , ## args, req, req->rq_xid, \ + req->rq_reqmsg ? req->rq_reqmsg->transno : -1, \ req->rq_reqmsg ? req->rq_reqmsg->opc : -1, \ - req->rq_connection ? (char *)req->rq_connection->c_remote_uuid : "", \ + req->rq_connection ? \ + (char *)req->rq_connection->c_remote_uuid.uuid : "", \ (req->rq_import && req->rq_import->imp_client) ? 
\ req->rq_import->imp_client->cli_request_portal : -1, \ req->rq_reqlen, req->rq_replen, \ - atomic_read (&req->rq_refcount), req->rq_flags); \ + atomic_read (&req->rq_refcount), req->rq_flags, \ + req->rq_reqmsg ? req->rq_reqmsg->flags : 0, \ + req->rq_repmsg ? req->rq_repmsg->flags : 0, \ + req->rq_status); \ } while (0) struct ptlrpc_bulk_page { @@ -277,9 +308,9 @@ typedef void (*bulk_callback_t)(struct ptlrpc_bulk_desc *, void *); typedef int (*svc_handler_t)(struct ptlrpc_request *req); /* rpc/connection.c */ -void ptlrpc_readdress_connection(struct ptlrpc_connection *, obd_uuid_t uuid); +void ptlrpc_readdress_connection(struct ptlrpc_connection *, struct obd_uuid *uuid); struct ptlrpc_connection *ptlrpc_get_connection(struct lustre_peer *peer, - obd_uuid_t uuid); + struct obd_uuid *uuid); int ptlrpc_put_connection(struct ptlrpc_connection *c); struct ptlrpc_connection *ptlrpc_connection_addref(struct ptlrpc_connection *); void ptlrpc_init_connection(void); @@ -288,8 +319,10 @@ void ptlrpc_cleanup_connection(void); /* rpc/niobuf.c */ int ptlrpc_check_bulk_sent(struct ptlrpc_bulk_desc *bulk); int ptlrpc_check_bulk_received(struct ptlrpc_bulk_desc *bulk); -int ptlrpc_send_bulk(struct ptlrpc_bulk_desc *); -int ptlrpc_register_bulk(struct ptlrpc_bulk_desc *); +int ptlrpc_bulk_put(struct ptlrpc_bulk_desc *); +int ptlrpc_bulk_get(struct ptlrpc_bulk_desc *); +int ptlrpc_register_bulk_put(struct ptlrpc_bulk_desc *); +int ptlrpc_register_bulk_get(struct ptlrpc_bulk_desc *); int ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *bulk); struct obd_brw_set *obd_brw_set_new(void); void obd_brw_set_add(struct obd_brw_set *, struct ptlrpc_bulk_desc *); @@ -305,8 +338,8 @@ void ptlrpc_link_svc_me(struct ptlrpc_request_buffer_desc *rqbd); void ptlrpc_init_client(int req_portal, int rep_portal, char *name, struct ptlrpc_client *); void ptlrpc_cleanup_client(struct obd_import *imp); -__u8 *ptlrpc_req_to_uuid(struct ptlrpc_request *req); -struct ptlrpc_connection 
*ptlrpc_uuid_to_connection(obd_uuid_t uuid); +struct obd_uuid *ptlrpc_req_to_uuid(struct ptlrpc_request *req); +struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid); int ll_brw_sync_wait(struct obd_brw_set *, int phase); @@ -314,22 +347,25 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req); void ptlrpc_continue_req(struct ptlrpc_request *req); int ptlrpc_replay_req(struct ptlrpc_request *req); void ptlrpc_restart_req(struct ptlrpc_request *req); -void ptlrpc_abort_inflight(struct obd_import *imp); +void ptlrpc_abort_inflight(struct obd_import *imp, int dying_import); struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, int opcode, int count, int *lengths, char **bufs); void ptlrpc_free_req(struct ptlrpc_request *request); void ptlrpc_req_finished(struct ptlrpc_request *request); +struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req); struct ptlrpc_bulk_desc *ptlrpc_prep_bulk(struct ptlrpc_connection *); void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *bulk); struct ptlrpc_bulk_page *ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc); void ptlrpc_free_bulk_page(struct ptlrpc_bulk_page *page); +void ptlrpc_retain_replayable_request(struct ptlrpc_request *req, + struct obd_import *imp); /* rpc/service.c */ struct ptlrpc_service * ptlrpc_init_svc(__u32 nevents, __u32 nbufs, __u32 bufsize, __u32 max_req_size, int req_portal, int rep_portal, - obd_uuid_t uuid, svc_handler_t, char *name); + struct obd_uuid *uuid, svc_handler_t, char *name); void ptlrpc_stop_all_threads(struct ptlrpc_service *svc); int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc, char *name); diff --git a/lustre/include/linux/obd.h b/lustre/include/linux/obd.h index 94ffd4f..acc59c2 100644 --- a/lustre/include/linux/obd.h +++ b/lustre/include/linux/obd.h @@ -17,11 +17,11 @@ struct lov_oinfo { /* per-child structure */ }; struct lov_stripe_md { - __u64 lsm_object_id; /* lov object id */ + __u64 lsm_object_id; /* lov object id */ 
__u32 lsm_magic; - __u32 lsm_stripe_size; /* size of the stripe */ - int lsm_stripe_offset; /* offset of first stripe in lmd_objects */ - int lsm_stripe_count; /* how many objects are being striped on */ + __u32 lsm_stripe_size; /* size of the stripe */ + unsigned lsm_stripe_offset; /* offset of first stripe in lmd_objects */ + unsigned lsm_stripe_count; /* how many objects are being striped on */ struct lov_oinfo lsm_oinfo[0]; }; @@ -72,6 +72,7 @@ struct obd_ucred { __u32 ouc_fsuid; __u32 ouc_fsgid; __u32 ouc_cap; + __u32 ouc_suppgid; }; #define OBD_RUN_CTXT_MAGIC 0xC0FFEEAA @@ -95,6 +96,8 @@ struct obd_run_ctxt { #define OBD_SET_CTXT_MAGIC(ctxt) do {} while(0) #endif +struct ost_server_data; + struct filter_obd { char *fo_fstype; struct super_block *fo_sb; @@ -103,8 +106,12 @@ struct filter_obd { struct dentry *fo_dentry_O; struct dentry *fo_dentry_O_mode[16]; spinlock_t fo_objidlock; /* protects fo_lastobjid increment */ - __u64 fo_lastobjid; - __u64 fo_last_committed; + struct semaphore fo_transno_sem; + struct file *fo_rcvd_filp; + struct filter_server_data *fo_fsd; + + __u64 fo_next_recovery_transno; + int fo_recoverable_clients; struct file_operations *fo_fop; struct inode_operations *fo_iop; struct address_space_operations *fo_aops; @@ -118,7 +125,7 @@ struct client_obd { struct obd_import cl_import; struct semaphore cl_sem; int cl_conn_count; - obd_uuid_t cl_target_uuid; /* XXX -> lustre_name */ + struct obd_uuid cl_target_uuid; /* XXX -> lustre_name */ /* max_mds_easize is purely a performance thing so we don't have to * call obd_size_wiremd() all the time. 
*/ int cl_max_mds_easize; @@ -127,6 +134,7 @@ struct client_obd { struct mds_obd { struct ptlrpc_service *mds_service; + struct ptlrpc_service *mds_getattr_service; struct super_block *mds_sb; struct vfsmount *mds_vfsmnt; @@ -138,7 +146,6 @@ struct mds_obd { int mds_max_mdsize; struct file *mds_rcvd_filp; struct semaphore mds_transno_sem; - __u64 mds_last_committed; __u64 mds_last_rcvd; __u64 mds_mount_count; struct ll_fid mds_rootfid; @@ -151,7 +158,8 @@ struct mds_obd { struct list_head mds_delayed_reply_queue; spinlock_t mds_processing_task_lock; pid_t mds_processing_task; - + struct timer_list mds_recovery_timer; + int mds_has_lov_desc; struct lov_desc mds_lov_desc; }; @@ -184,6 +192,7 @@ struct echo_obd { struct ptlbd_obd { /* server's */ struct ptlrpc_service *ptlbd_service; + struct file *filp; /* client's */ struct ptlrpc_client bd_client; struct obd_import bd_import; @@ -216,11 +225,15 @@ struct snap_obd { struct ost_obd { struct ptlrpc_service *ost_service; - struct lustre_handle ost_conn; /* the local connection to the OBD */ }; struct echo_client_obd { - struct lustre_handle conn; /* the local connection to osc/lov */ + struct lustre_handle ec_conn; /* the local connection to osc/lov */ + spinlock_t ec_lock; + struct list_head ec_objects; + int ec_lsmsize; + int ec_nstripes; + __u64 ec_unique; }; struct cache_obd { @@ -229,7 +242,7 @@ struct cache_obd { }; struct lov_tgt_desc { - obd_uuid_t uuid; + struct obd_uuid uuid; struct lustre_handle conn; int active; /* is this target available for requests, etc */ }; @@ -254,6 +267,10 @@ struct niobuf_local { struct dentry *dentry; }; +struct obd_trans_info { + __u64 oti_transno; +}; + #define N_LOCAL_TEMP_PAGE 0x00000001 /* corresponds to one of the obd's */ @@ -262,7 +279,7 @@ struct obd_device { /* common and UUID name of this device */ char *obd_name; - obd_uuid_t obd_uuid; + struct obd_uuid obd_uuid; int obd_minor; int obd_flags; @@ -273,6 +290,7 @@ struct obd_device { struct ptlrpc_client obd_ldlm_client; 
/* XXX OST/MDS only */ /* a spinlock is OK for what we do now, may need a semaphore later */ spinlock_t obd_dev_lock; + __u64 obd_last_committed; struct fsfilt_operations *obd_fsops; union { struct ext2_obd ext2; @@ -310,7 +328,7 @@ struct obd_ops { int (*o_setup) (struct obd_device *dev, obd_count len, void *data); int (*o_cleanup)(struct obd_device *dev); int (*o_connect)(struct lustre_handle *conn, struct obd_device *src, - obd_uuid_t cluuid, struct recovd_obd *recovd, + struct obd_uuid *cluuid, struct recovd_obd *recovd, ptlrpc_recovery_cb_t recover); int (*o_disconnect)(struct lustre_handle *conn); @@ -324,41 +342,43 @@ struct obd_ops { int (*o_preallocate)(struct lustre_handle *, obd_count *req, obd_id *ids); int (*o_create)(struct lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md **ea); + struct lov_stripe_md **ea, struct obd_trans_info *oti); int (*o_destroy)(struct lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md *ea); + struct lov_stripe_md *ea, struct obd_trans_info *oti); int (*o_setattr)(struct lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md *ea); + struct lov_stripe_md *ea, struct obd_trans_info *oti); int (*o_getattr)(struct lustre_handle *conn, struct obdo *oa, struct lov_stripe_md *ea); int (*o_open)(struct lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md *ea); + struct lov_stripe_md *ea, struct obd_trans_info *oti); int (*o_close)(struct lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md *ea); + struct lov_stripe_md *ea, struct obd_trans_info *oti); int (*o_brw)(int rw, struct lustre_handle *conn, struct lov_stripe_md *ea, obd_count oa_bufs, - struct brw_page *pgarr, struct obd_brw_set *); + struct brw_page *pgarr, struct obd_brw_set *, + struct obd_trans_info *oti); int (*o_punch)(struct lustre_handle *conn, struct obdo *tgt, struct lov_stripe_md *ea, obd_size count, - obd_off offset); + obd_off offset, struct obd_trans_info *oti); int (*o_sync)(struct lustre_handle *conn, struct obdo 
*tgt, obd_size count, obd_off offset); int (*o_migrate)(struct lustre_handle *conn, struct obdo *dst, struct obdo *src, obd_size count, obd_off offset); int (*o_copy)(struct lustre_handle *dstconn, struct obdo *dst, struct lustre_handle *srconn, struct obdo *src, - obd_size count, obd_off offset); + obd_size count, obd_off offset, struct obd_trans_info *); int (*o_iterate)(struct lustre_handle *conn, int (*)(obd_id, obd_gr, void *), obd_id *startid, obd_gr group, void *data); int (*o_preprw)(int cmd, struct lustre_handle *conn, int objcount, struct obd_ioobj *obj, int niocount, struct niobuf_remote *remote, - struct niobuf_local *local, void **desc_private); + struct niobuf_local *local, void **desc_private, + struct obd_trans_info *oti); int (*o_commitrw)(int cmd, struct lustre_handle *conn, int objcount, struct obd_ioobj *obj, int niocount, struct niobuf_local *local, - void *desc_private); + void *desc_private, struct obd_trans_info *oti); int (*o_enqueue)(struct lustre_handle *conn, struct lov_stripe_md *md, struct lustre_handle *parent_lock, __u32 type, void *cookie, int cookielen, __u32 mode, diff --git a/lustre/include/linux/obd_class.h b/lustre/include/linux/obd_class.h index ed3eb99..8e160ad 100644 --- a/lustre/include/linux/obd_class.h +++ b/lustre/include/linux/obd_class.h @@ -1,7 +1,7 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Copyright (C) 2001-2003 Cluster File Systems, Inc. * * This file is part of Lustre, http://www.lustre.org. 
* @@ -44,13 +44,16 @@ #include #endif - /* OBD Device Declarations */ #define MAX_OBD_DEVICES 128 extern struct obd_device obd_dev[MAX_OBD_DEVICES]; -#define OBD_ATTACHED 0x1 -#define OBD_SET_UP 0x2 +#define OBD_ATTACHED 0x01 +#define OBD_SET_UP 0x02 +#define OBD_RECOVERING 0x04 +#define OBD_ABORT_RECOVERY 0x08 +#define OBD_REPLAYABLE 0x10 +#define OBD_NO_TRANSNO 0x20 /* XXX needs better name */ /* OBD Operations Declarations */ @@ -104,7 +107,8 @@ do { \ \ exp = class_conn2export(conn); \ if (!(exp)) { \ - CERROR("No export\n"); \ + CERROR("No export for conn "LPX64":"LPX64"\n", \ + conn->addr, conn->cookie); \ RETURN(-EINVAL); \ } \ \ @@ -276,7 +280,8 @@ static inline int obd_free_memmd(struct lustre_handle *conn, } static inline int obd_create(struct lustre_handle *conn, struct obdo *obdo, - struct lov_stripe_md **ea) + struct lov_stripe_md **ea, + struct obd_trans_info *oti) { struct obd_export *exp; int rc; @@ -285,12 +290,13 @@ static inline int obd_create(struct lustre_handle *conn, struct obdo *obdo, OBD_CHECK_SETUP(conn, exp); OBD_CHECK_OP(exp->exp_obd, create); - rc = OBP(exp->exp_obd, create)(conn, obdo, ea); + rc = OBP(exp->exp_obd, create)(conn, obdo, ea, oti); RETURN(rc); } static inline int obd_destroy(struct lustre_handle *conn, struct obdo *obdo, - struct lov_stripe_md *ea) + struct lov_stripe_md *ea, + struct obd_trans_info *oti) { struct obd_export *exp; int rc; @@ -299,7 +305,7 @@ static inline int obd_destroy(struct lustre_handle *conn, struct obdo *obdo, OBD_CHECK_SETUP(conn, exp); OBD_CHECK_OP(exp->exp_obd, destroy); - rc = OBP(exp->exp_obd, destroy)(conn, obdo, ea); + rc = OBP(exp->exp_obd, destroy)(conn, obdo, ea, oti); RETURN(rc); } @@ -318,7 +324,8 @@ static inline int obd_getattr(struct lustre_handle *conn, struct obdo *obdo, } static inline int obd_close(struct lustre_handle *conn, struct obdo *obdo, - struct lov_stripe_md *ea) + struct lov_stripe_md *ea, + struct obd_trans_info *oti) { struct obd_export *exp; int rc; @@ -327,12 +334,12 
@@ static inline int obd_close(struct lustre_handle *conn, struct obdo *obdo, OBD_CHECK_SETUP(conn, exp); OBD_CHECK_OP(exp->exp_obd, close); - rc = OBP(exp->exp_obd, close)(conn, obdo, ea); + rc = OBP(exp->exp_obd, close)(conn, obdo, ea, oti); RETURN(rc); } static inline int obd_open(struct lustre_handle *conn, struct obdo *obdo, - struct lov_stripe_md *ea) + struct lov_stripe_md *ea, struct obd_trans_info *oti) { struct obd_export *exp; int rc; @@ -341,12 +348,13 @@ static inline int obd_open(struct lustre_handle *conn, struct obdo *obdo, OBD_CHECK_SETUP(conn, exp); OBD_CHECK_OP(exp->exp_obd, open); - rc = OBP(exp->exp_obd, open)(conn, obdo, ea); + rc = OBP(exp->exp_obd, open)(conn, obdo, ea, oti); RETURN(rc); } static inline int obd_setattr(struct lustre_handle *conn, struct obdo *obdo, - struct lov_stripe_md *ea) + struct lov_stripe_md *ea, + struct obd_trans_info *oti) { struct obd_export *exp; int rc; @@ -355,12 +363,12 @@ static inline int obd_setattr(struct lustre_handle *conn, struct obdo *obdo, OBD_CHECK_SETUP(conn, exp); OBD_CHECK_OP(exp->exp_obd, setattr); - rc = OBP(exp->exp_obd, setattr)(conn, obdo, ea); + rc = OBP(exp->exp_obd, setattr)(conn, obdo, ea, oti); RETURN(rc); } static inline int obd_connect(struct lustre_handle *conn, - struct obd_device *obd, obd_uuid_t cluuid, + struct obd_device *obd, struct obd_uuid *cluuid, struct recovd_obd *recovd, ptlrpc_recovery_cb_t recover) { @@ -401,8 +409,8 @@ static inline int obd_statfs(struct lustre_handle *conn,struct obd_statfs *osfs) } static inline int obd_punch(struct lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md *ea, - obd_size start, obd_size end) + struct lov_stripe_md *ea, obd_size start, + obd_size end, struct obd_trans_info *oti) { struct obd_export *exp; int rc; @@ -411,13 +419,14 @@ static inline int obd_punch(struct lustre_handle *conn, struct obdo *oa, OBD_CHECK_SETUP(conn, exp); OBD_CHECK_OP(exp->exp_obd, punch); - rc = OBP(exp->exp_obd, punch)(conn, oa, ea, start, end); + rc = 
OBP(exp->exp_obd, punch)(conn, oa, ea, start, end, oti); RETURN(rc); } static inline int obd_brw(int cmd, struct lustre_handle *conn, struct lov_stripe_md *ea, obd_count oa_bufs, - struct brw_page *pg, struct obd_brw_set *set) + struct brw_page *pg, struct obd_brw_set *set, + struct obd_trans_info *oti) { struct obd_export *exp; int rc; @@ -431,14 +440,15 @@ static inline int obd_brw(int cmd, struct lustre_handle *conn, LBUG(); } - rc = OBP(exp->exp_obd, brw)(cmd, conn, ea, oa_bufs, pg, set); + rc = OBP(exp->exp_obd, brw)(cmd, conn, ea, oa_bufs, pg, set, oti); RETURN(rc); } static inline int obd_preprw(int cmd, struct lustre_handle *conn, int objcount, struct obd_ioobj *obj, int niocount, struct niobuf_remote *remote, - struct niobuf_local *local, void **desc_private) + struct niobuf_local *local, void **desc_private, + struct obd_trans_info *oti) { struct obd_export *exp; int rc; @@ -448,14 +458,14 @@ static inline int obd_preprw(int cmd, struct lustre_handle *conn, OBD_CHECK_OP(exp->exp_obd, preprw); rc = OBP(exp->exp_obd, preprw)(cmd, conn, objcount, obj, niocount, - remote, local, desc_private); + remote, local, desc_private, oti); RETURN(rc); } static inline int obd_commitrw(int cmd, struct lustre_handle *conn, int objcount, struct obd_ioobj *obj, int niocount, struct niobuf_local *local, - void *desc_private) + void *desc_private, struct obd_trans_info *oti) { struct obd_export *exp; int rc; @@ -465,7 +475,7 @@ static inline int obd_commitrw(int cmd, struct lustre_handle *conn, OBD_CHECK_OP(exp->exp_obd, commitrw); rc = OBP(exp->exp_obd, commitrw)(cmd, conn, objcount, obj, niocount, - local, desc_private); + local, desc_private, oti); RETURN(rc); } @@ -554,7 +564,7 @@ static inline void obd_oa2handle(struct lustre_handle *handle, struct obdo *oa) static inline void obd_handle2oa(struct obdo *oa, struct lustre_handle *handle) { - if (handle->addr) { + if (handle && handle->addr) { struct lustre_handle *oa_handle = obdo_handle(oa); memcpy(oa_handle, handle, 
sizeof(*handle)); oa->o_valid |= OBD_MD_FLHANDLE; @@ -714,7 +724,7 @@ static inline void obdo_to_inode(struct inode *dst, struct obdo *src, dst->i_atime = src->o_atime; if (valid & OBD_MD_FLMTIME) dst->i_mtime = src->o_mtime; - if (valid & OBD_MD_FLCTIME) + if (valid & OBD_MD_FLCTIME && src->o_ctime > dst->i_ctime) dst->i_ctime = src->o_ctime; if (valid & OBD_MD_FLSIZE) dst->i_size = src->o_size; @@ -835,21 +845,23 @@ static inline int obdo_cmp_md(struct obdo *dst, struct obdo *src, /* I'm as embarrassed about this as you are. * * // XXX do not look into _superhack with remaining eye - * // XXX if this were any uglier, I'd get my own show on MTV */ + * // XXX if this were any uglier, I'd get my own show on MTV */ extern int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c); +extern void (*ptlrpc_abort_inflight_superhack)(struct obd_import *imp, + int dying_import); -int class_register_type(struct obd_ops *ops, struct lprocfs_vars* vars, +int class_register_type(struct obd_ops *ops, struct lprocfs_vars* vars, char *nm); int class_unregister_type(char *nm); int class_name2dev(char *name); -int class_uuid2dev(char *uuid); -struct obd_device *class_uuid2obd(char *uuid); +int class_uuid2dev(struct obd_uuid *uuid); +struct obd_device *class_uuid2obd(struct obd_uuid *uuid); struct obd_export *class_new_export(struct obd_device *obddev); struct obd_type *class_get_type(char *name); void class_put_type(struct obd_type *type); void class_destroy_export(struct obd_export *exp); int class_connect(struct lustre_handle *conn, struct obd_device *obd, - obd_uuid_t cluuid); + struct obd_uuid *cluuid); int class_disconnect(struct lustre_handle *conn); void class_disconnect_all(struct obd_device *obddev); @@ -872,6 +884,17 @@ void statfs_unpack(struct statfs *sfs, struct obd_statfs *osfs); void obd_statfs_pack(struct obd_statfs *tgt, struct obd_statfs *src); void obd_statfs_unpack(struct obd_statfs *tgt, struct obd_statfs *src); + +struct obd_class_user_state { + struct 
obd_device *ocus_current_obd; + struct list_head ocus_conns; +}; + +struct obd_class_user_conn { + struct list_head ocuc_chain; + struct lustre_handle ocuc_conn; +}; + #endif /* sysctl.c */ @@ -880,6 +903,6 @@ extern void obd_sysctl_clean (void); /* uuid.c */ typedef __u8 class_uuid_t[16]; -//int class_uuid_parse(obd_uuid_t in, class_uuid_t out); -void class_uuid_unparse(class_uuid_t in, obd_uuid_t out); -#endif /* __LINUX_CLASS_OBD_H */ +//int class_uuid_parse(struct obd_uuid in, class_uuid_t out); +void class_uuid_unparse(class_uuid_t in, struct obd_uuid *out); +#endif /* __LINUX_OBD_CLASS_H */ diff --git a/lustre/include/linux/obd_echo.h b/lustre/include/linux/obd_echo.h index 6bc32f2..273779a 100644 --- a/lustre/include/linux/obd_echo.h +++ b/lustre/include/linux/obd_echo.h @@ -10,4 +10,32 @@ #define OBD_ECHO_DEVICENAME "obdecho" #define OBD_ECHO_CLIENT_DEVICENAME "echo_client" +struct ec_object +{ + struct list_head eco_obj_chain; + struct obd_device *eco_device; + int eco_refcount; + int eco_deleted; + obd_id eco_id; + struct lov_stripe_md *eco_lsm; +}; + +struct ec_open_object +{ + struct list_head ecoo_exp_chain; + struct ec_object *ecoo_object; + struct obdo ecoo_oa; + __u64 ecoo_cookie; +}; + +struct ec_lock +{ + struct list_head ecl_exp_chain; + struct lustre_handle ecl_handle; + struct ldlm_extent ecl_extent; + __u32 ecl_mode; + struct ec_object *ecl_object; + __u64 ecl_cookie; +}; + #endif diff --git a/lustre/include/linux/obd_filter.h b/lustre/include/linux/obd_filter.h index fb3d1ff..16a4d03 100644 --- a/lustre/include/linux/obd_filter.h +++ b/lustre/include/linux/obd_filter.h @@ -27,10 +27,40 @@ #define OBD_FILTER_DEVICENAME "obdfilter" #endif +#define FILTER_LR_SERVER_SIZE 512 + +#define FILTER_LR_CLIENT_START 8192 +#define FILTER_LR_CLIENT_SIZE 128 + +#define FILTER_MOUNT_RECOV 2 +#define FILTER_RECOVERY_TIMEOUT (obd_timeout * 5 * HZ / 2) /* *waves hands* */ + +/* Data stored per server at the head of the last_rcvd file. In le32 order. 
*/ +struct filter_server_data { + __u8 fsd_uuid[37]; /* server UUID */ + __u8 fsd_uuid_padding[3]; /* unused */ + __u64 fsd_last_objid; /* last allocated object ID */ + __u64 fsd_last_rcvd; /* last completed transaction ID */ + __u64 fsd_mount_count; /* FILTER incarnation number */ + __u8 fsd_padding[FILTER_LR_SERVER_SIZE - 64]; /* */ +}; + +/* Data stored per client in the last_rcvd file. In le32 order. */ +struct filter_client_data { + __u8 fcd_uuid[37]; /* client UUID */ + __u8 fcd_uuid_padding[3]; /* unused */ + __u64 fcd_last_rcvd; /* last completed transaction ID */ + __u64 fcd_mount_count; /* FILTER incarnation number */ + __u64 fcd_last_xid; /* client RPC xid for the last transaction */ + __u8 fcd_padding[FILTER_LR_CLIENT_SIZE - 64]; +}; + /* In-memory access to client data from OST struct */ struct filter_export_data { struct list_head fed_open_head; /* files to close on disconnect */ spinlock_t fed_lock; /* protects fed_open_head */ + struct filter_client_data *fed_fcd; + int fed_lr_off; }; /* file data for open files on OST */ @@ -47,4 +77,5 @@ struct filter_dentry_data { #define FILTER_FLAG_DESTROY 0x0001 /* destroy dentry on last file close */ + #endif diff --git a/lustre/include/linux/obd_support.h b/lustre/include/linux/obd_support.h index e3e23f4..69e4126 100644 --- a/lustre/include/linux/obd_support.h +++ b/lustre/include/linux/obd_support.h @@ -35,6 +35,7 @@ extern int obd_memmax; extern unsigned long obd_fail_loc; extern unsigned long obd_timeout; extern char obd_recovery_upcall[128]; +extern unsigned long obd_sync_filter; #define OBD_FAIL_MDS 0x100 #define OBD_FAIL_MDS_HANDLE_UNPACK 0x101 @@ -68,6 +69,8 @@ extern char obd_recovery_upcall[128]; #define OBD_FAIL_MDS_STATFS_PACK 0x11d #define OBD_FAIL_MDS_STATFS_NET 0x11e #define OBD_FAIL_MDS_GETATTR_NAME_NET 0x11f +#define OBD_FAIL_MDS_ALL_REPLY_NET 0x120 +#define OBD_FAIL_MDS_ALL_REQUEST_NET 0x121 #define OBD_FAIL_OST 0x200 #define OBD_FAIL_OST_CONNECT_NET 0x201 @@ -108,8 +111,9 @@ extern
char obd_recovery_upcall[128]; #define OBD_FAIL_MDS_ALL_NET 0x01000000 #define OBD_FAIL_OST_ALL_NET 0x02000000 -#define OBD_FAIL_CHECK(id) ((obd_fail_loc & OBD_FAIL_MASK_LOC) == (id) && \ - ((obd_fail_loc & (OBD_FAILED | OBD_FAIL_ONCE))!=\ +#define OBD_FAIL_CHECK(id) (((obd_fail_loc & OBD_FAIL_MASK_LOC) == \ + ((id) & OBD_FAIL_MASK_LOC)) && \ + ((obd_fail_loc & (OBD_FAILED | OBD_FAIL_ONCE))!= \ (OBD_FAILED | OBD_FAIL_ONCE))) #define OBD_FAIL_RETURN(id, ret) \ diff --git a/lustre/kernel_patches/scripts/docco.txt b/lustre/kernel_patches/README similarity index 100% rename from lustre/kernel_patches/scripts/docco.txt rename to lustre/kernel_patches/README diff --git a/lustre/kernel_patches/patches/dev_read_only.patch b/lustre/kernel_patches/patches/dev_read_only.patch index 9ff075e..bac5ebf 100644 --- a/lustre/kernel_patches/patches/dev_read_only.patch +++ b/lustre/kernel_patches/patches/dev_read_only.patch @@ -46,30 +46,26 @@ +EXPORT_SYMBOL(dev_clear_rdonly); --- linux-2.4.18-17.8.0/drivers/block/loop.c~dev_read_only 2002-12-06 14:52:29.000000000 -0800 +++ linux-2.4.18-17.8.0-zab/drivers/block/loop.c 2002-12-06 14:52:29.000000000 -0800 -@@ -491,6 +491,11 @@ static int loop_make_request(request_que +@@ -491,6 +491,9 @@ static int loop_make_request(request_que spin_unlock_irq(&lo->lo_lock); if (rw == WRITE) { -+#ifdef CONFIG_DEV_RDONLY + if (dev_check_rdonly(rbh->b_rdev)) + goto err; -+#endif + if (lo->lo_flags & LO_FLAGS_READ_ONLY) goto err; } else if (rw == READA) { --- linux-2.4.18-17.8.0/drivers/ide/ide-disk.c~dev_read_only 2002-12-06 14:52:29.000000000 -0800 +++ linux-2.4.18-17.8.0-zab/drivers/ide/ide-disk.c 2002-12-06 14:52:29.000000000 -0800 -@@ -557,6 +557,12 @@ static ide_startstop_t lba_48_rw_disk (i +@@ -557,6 +557,10 @@ static ide_startstop_t lba_48_rw_disk (i */ static ide_startstop_t do_rw_disk (ide_drive_t *drive, struct request *rq, unsigned long block) { -+#ifdef CONFIG_DEV_RDONLY + if (rq->cmd == WRITE && dev_check_rdonly(rq->rq_dev)) { + 
ide_end_request(1, HWGROUP(drive)); + return ide_stopped; + } -+#endif if (IDE_CONTROL_REG) OUT_BYTE(drive->ctl,IDE_CONTROL_REG); diff --git a/lustre/kernel_patches/patches/dev_read_only_hp.patch b/lustre/kernel_patches/patches/dev_read_only_hp.patch new file mode 100644 index 0000000..b2cf6f0 --- /dev/null +++ b/lustre/kernel_patches/patches/dev_read_only_hp.patch @@ -0,0 +1,77 @@ + drivers/block/blkpg.c | 38 ++++++++++++++++++++++++++++++++++++++ + drivers/block/loop.c | 5 +++++ + drivers/ide/ide-disk.c | 6 ++++++ + 3 files changed, 49 insertions(+) + +--- linux-2.4.19-hp2_pnnl2/drivers/block/blkpg.c~dev_read_only_hp Sun Jan 19 18:51:12 2003 ++++ linux-2.4.19-hp2_pnnl2-root/drivers/block/blkpg.c Sun Jan 19 18:52:28 2003 +@@ -310,6 +310,42 @@ int blk_ioctl(kdev_t dev, unsigned int c + + EXPORT_SYMBOL(blk_ioctl); + ++ ++#define NUM_DEV_NO_WRITE 16 ++static int dev_no_write[NUM_DEV_NO_WRITE]; ++ ++/* ++ * Debug code for turning block devices "read-only" (will discard writes ++ * silently). This is for filesystem crash/recovery testing. 
++ */ ++void dev_set_rdonly(kdev_t dev, int no_write) ++{ ++ if (dev) { ++ printk(KERN_WARNING "Turning device %s read-only\n", ++ bdevname(dev)); ++ dev_no_write[no_write] = 0xdead0000 + dev; ++ } ++} ++ ++int dev_check_rdonly(kdev_t dev) { ++ int i; ++ ++ for (i = 0; i < NUM_DEV_NO_WRITE; i++) { ++ if ((dev_no_write[i] & 0xffff0000) == 0xdead0000 && ++ dev == (dev_no_write[i] & 0xffff)) ++ return 1; ++ } ++ return 0; ++} ++ ++void dev_clear_rdonly(int no_write) { ++ dev_no_write[no_write] = 0; ++} ++ ++EXPORT_SYMBOL(dev_set_rdonly); ++EXPORT_SYMBOL(dev_check_rdonly); ++EXPORT_SYMBOL(dev_clear_rdonly); ++ + /** + * get_last_sector() + * +--- linux-2.4.19-hp2_pnnl2/drivers/block/loop.c~dev_read_only_hp Sun Jan 19 18:51:12 2003 ++++ linux-2.4.19-hp2_pnnl2-root/drivers/block/loop.c Sun Jan 19 18:51:12 2003 +@@ -474,6 +474,9 @@ static int loop_make_request(request_que + spin_unlock_irq(&lo->lo_lock); + + if (rw == WRITE) { ++ if (dev_check_rdonly(rbh->b_rdev)) ++ goto err; ++ + if (lo->lo_flags & LO_FLAGS_READ_ONLY) + goto err; + } else if (rw == READA) { +--- linux-2.4.19-hp2_pnnl2/drivers/ide/ide-disk.c~dev_read_only_hp Sun Jan 19 18:51:12 2003 ++++ linux-2.4.19-hp2_pnnl2-root/drivers/ide/ide-disk.c Sun Jan 19 18:51:12 2003 +@@ -551,6 +551,10 @@ static ide_startstop_t lba_48_rw_disk (i + */ + static ide_startstop_t do_rw_disk (ide_drive_t *drive, struct request *rq, unsigned long block) + { ++ if (rq->cmd == WRITE && dev_check_rdonly(rq->rq_dev)) { ++ ide_end_request(1, HWGROUP(drive)); ++ return ide_stopped; ++ } + if (IDE_CONTROL_REG) + OUT_BYTE(drive->ctl,IDE_CONTROL_REG); + + +_ diff --git a/lustre/kernel_patches/patches/exports.patch b/lustre/kernel_patches/patches/exports.patch index cdf72f0..716c156 100644 --- a/lustre/kernel_patches/patches/exports.patch +++ b/lustre/kernel_patches/patches/exports.patch @@ -1,10 +1,14 @@ - 0 files changed + fs/ext3/Makefile | 2 ++ + fs/ext3/super.c | 2 +- + include/linux/fs.h | 1 + + kernel/ksyms.c | 5 +++++ + 4 files 
changed, 9 insertions(+), 1 deletion(-) ---- linux-2.4.18-17.8.0/fs/ext3/Makefile~exports 2002-12-06 14:52:29.000000000 -0800 -+++ linux-2.4.18-17.8.0-zab/fs/ext3/Makefile 2002-12-06 14:52:29.000000000 -0800 +--- linux-2.4.19-hp2_pnnl2/fs/ext3/Makefile~exports Sun Jan 19 18:52:38 2003 ++++ linux-2.4.19-hp2_pnnl2-root/fs/ext3/Makefile Sun Jan 19 18:52:38 2003 @@ -9,6 +9,8 @@ O_TARGET := ext3.o @@ -14,9 +18,9 @@ obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ioctl.o namei.o super.o symlink.o obj-m := $(O_TARGET) ---- linux-2.4.18-17.8.0/fs/ext3/super.c~exports 2002-12-06 14:52:29.000000000 -0800 -+++ linux-2.4.18-17.8.0-zab/fs/ext3/super.c 2002-12-06 14:52:29.000000000 -0800 -@@ -1746,7 +1746,7 @@ static void __exit exit_ext3_fs(void) +--- linux-2.4.19-hp2_pnnl2/fs/ext3/super.c~exports Sun Jan 19 18:52:38 2003 ++++ linux-2.4.19-hp2_pnnl2-root/fs/ext3/super.c Sun Jan 19 18:52:38 2003 +@@ -1744,7 +1744,7 @@ static void __exit exit_ext3_fs(void) unregister_filesystem(&ext3_fs_type); } @@ -25,9 +29,9 @@ MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); ---- linux-2.4.18-17.8.0/include/linux/fs.h~exports 2002-12-06 14:52:29.000000000 -0800 -+++ linux-2.4.18-17.8.0-zab/include/linux/fs.h 2002-12-06 14:52:29.000000000 -0800 -@@ -1046,6 +1046,7 @@ extern int unregister_filesystem(struct +--- linux-2.4.19-hp2_pnnl2/include/linux/fs.h~exports Sun Jan 19 18:52:38 2003 ++++ linux-2.4.19-hp2_pnnl2-root/include/linux/fs.h Sun Jan 19 18:52:38 2003 +@@ -1020,6 +1020,7 @@ extern int unregister_filesystem(struct extern struct vfsmount *kern_mount(struct file_system_type *); extern int may_umount(struct vfsmount *); extern long do_mount(char *, char *, char *, unsigned long, void *); @@ -35,11 +39,11 @@ extern void umount_tree(struct vfsmount *); #define kern_umount mntput ---- linux-2.4.18-17.8.0/kernel/ksyms.c~exports 2002-12-06 
14:52:29.000000000 -0800 -+++ linux-2.4.18-17.8.0-zab/kernel/ksyms.c 2002-12-06 14:52:29.000000000 -0800 -@@ -306,6 +306,11 @@ EXPORT_SYMBOL_GPL(buffermem_pages); - EXPORT_SYMBOL_GPL(nr_free_pages); - EXPORT_SYMBOL_GPL(page_cache_size); +--- linux-2.4.19-hp2_pnnl2/kernel/ksyms.c~exports Sun Jan 19 18:52:38 2003 ++++ linux-2.4.19-hp2_pnnl2-root/kernel/ksyms.c Sun Jan 19 18:52:38 2003 +@@ -308,6 +308,11 @@ EXPORT_SYMBOL(dcache_dir_fsync); + EXPORT_SYMBOL(dcache_readdir); + EXPORT_SYMBOL(dcache_dir_ops); +/* lustre */ +EXPORT_SYMBOL(panic_notifier_list); diff --git a/lustre/kernel_patches/patches/exports_hp.patch b/lustre/kernel_patches/patches/exports_hp.patch new file mode 100644 index 0000000..0222b46 --- /dev/null +++ b/lustre/kernel_patches/patches/exports_hp.patch @@ -0,0 +1,56 @@ + + + + fs/ext3/Makefile | 2 ++ + fs/ext3/super.c | 2 +- + include/linux/fs.h | 1 + + kernel/ksyms.c | 4 ++++ + 4 files changed, 9 insertions(+), 1 deletion(-) + +--- linux-2.4.19-hp2_pnnl2/fs/ext3/Makefile~exports Sun Jan 19 18:52:38 2003 ++++ linux-2.4.19-hp2_pnnl2-root/fs/ext3/Makefile Sun Jan 19 18:52:38 2003 +@@ -9,6 +9,8 @@ + + O_TARGET := ext3.o + ++export-objs := super.o ++ + obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ + ioctl.o namei.o super.o symlink.o + obj-m := $(O_TARGET) +--- linux-2.4.19-hp2_pnnl2/fs/ext3/super.c~exports Sun Jan 19 18:52:38 2003 ++++ linux-2.4.19-hp2_pnnl2-root/fs/ext3/super.c Sun Jan 19 18:52:38 2003 +@@ -1744,7 +1744,7 @@ static void __exit exit_ext3_fs(void) + unregister_filesystem(&ext3_fs_type); + } + +-EXPORT_NO_SYMBOLS; ++EXPORT_SYMBOL(ext3_bread); + + MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); + MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); +--- linux-2.4.19-hp2_pnnl2/include/linux/fs.h~exports Sun Jan 19 18:52:38 2003 ++++ linux-2.4.19-hp2_pnnl2-root/include/linux/fs.h Sun Jan 19 18:52:38 2003 +@@ -1020,6 +1020,7 @@ extern int 
unregister_filesystem(struct + extern struct vfsmount *kern_mount(struct file_system_type *); + extern int may_umount(struct vfsmount *); + extern long do_mount(char *, char *, char *, unsigned long, void *); ++struct vfsmount *do_kern_mount(const char *type, int flags, char *name, void *data); + extern void umount_tree(struct vfsmount *); + + #define kern_umount mntput +--- linux-2.4.19-hp2_pnnl2/kernel/ksyms.c~exports Sun Jan 19 18:52:38 2003 ++++ linux-2.4.19-hp2_pnnl2-root/kernel/ksyms.c Sun Jan 19 18:52:38 2003 +@@ -308,6 +308,10 @@ EXPORT_SYMBOL(dcache_dir_fsync); + EXPORT_SYMBOL(dcache_readdir); + EXPORT_SYMBOL(dcache_dir_ops); + ++/* lustre */ ++EXPORT_SYMBOL(pagecache_lock_cacheline); ++EXPORT_SYMBOL(do_kern_mount); ++ + /* for stackable file systems (lofs, wrapfs, cryptfs, etc.) */ + EXPORT_SYMBOL(default_llseek); + EXPORT_SYMBOL(dentry_open); + +_ diff --git a/lustre/kernel_patches/patches/invalidate_show.patch b/lustre/kernel_patches/patches/invalidate_show.patch new file mode 100644 index 0000000..c3ae2f5 --- /dev/null +++ b/lustre/kernel_patches/patches/invalidate_show.patch @@ -0,0 +1,104 @@ +--- lum/fs/inode.c Sat Oct 19 11:42:42 2002 ++++ linux-2.4.18-uml35-ext3online/fs/inode.c Mon Oct 14 00:41:20 2002 +@@ -606,7 +553,8 @@ static void dispose_list(struct list_hea + /* + * Invalidate all inodes for a device. 
+ */ +-static int invalidate_list(struct list_head *head, struct super_block * sb, struct list_head * dispose) ++static int invalidate_list(struct list_head *head, struct super_block * sb, ++ struct list_head * dispose, int show) + { + struct list_head *next; + int busy = 0, count = 0; +@@ -631,6 +579,11 @@ static int invalidate_list(struct list_h + count++; + continue; + } ++ if (show) ++ printk(KERN_ERR ++ "inode busy: dev %s:%lu (%p) mode %o count %u\n", ++ kdevname(sb->s_dev), inode->i_ino, inode, ++ inode->i_mode, atomic_read(&inode->i_count)); + busy = 1; + } + /* only unused inodes may be cached with i_count zero */ +@@ -649,22 +601,23 @@ static int invalidate_list(struct list_h + /** + * invalidate_inodes - discard the inodes on a device + * @sb: superblock ++ * @show: whether we should display any busy inodes found + * + * Discard all of the inodes for a given superblock. If the discard + * fails because there are busy inodes then a non zero value is returned. + * If the discard is successful all the inodes have been discarded. + */ + +-int invalidate_inodes(struct super_block * sb) ++int invalidate_inodes(struct super_block * sb, int show) + { + int busy; + LIST_HEAD(throw_away); + + spin_lock(&inode_lock); +- busy = invalidate_list(&inode_in_use, sb, &throw_away); +- busy |= invalidate_list(&inode_unused, sb, &throw_away); +- busy |= invalidate_list(&sb->s_dirty, sb, &throw_away); +- busy |= invalidate_list(&sb->s_locked_inodes, sb, &throw_away); ++ busy = invalidate_list(&inode_in_use, sb, &throw_away, show); ++ busy |= invalidate_list(&inode_unused, sb, &throw_away, show); ++ busy |= invalidate_list(&sb->s_dirty, sb, &throw_away, show); ++ busy |= invalidate_list(&sb->s_locked_inodes, sb, &throw_away, show); + spin_unlock(&inode_lock); + + dispose_list(&throw_away); +@@ -690,7 +643,7 @@ int invalidate_device(kdev_t dev, int do + * hold). 
+ */ + shrink_dcache_sb(sb); +- res = invalidate_inodes(sb); ++ res = invalidate_inodes(sb, 0); + drop_super(sb); + } + invalidate_buffers(dev); +--- lum/fs/super.c.orig Sat Oct 19 11:42:42 2002 ++++ lum/fs/super.c Wed Oct 30 17:16:55 2002 +@@ -936,7 +936,7 @@ + lock_super(sb); + lock_kernel(); + sb->s_flags &= ~MS_ACTIVE; +- invalidate_inodes(sb); /* bad name - it should be evict_inodes() */ ++ invalidate_inodes(sb, 0); /* bad name - it should be evict_inodes() */ + if (sop) { + if (sop->write_super && sb->s_dirt) + sop->write_super(sb); +@@ -945,7 +945,7 @@ + } + + /* Forget any remaining inodes */ +- if (invalidate_inodes(sb)) { ++ if (invalidate_inodes(sb, 1)) { + printk(KERN_ERR "VFS: Busy inodes after unmount. " + "Self-destruct in 5 seconds. Have a nice day...\n"); + } +--- lum/include/linux/fs.h Wed Oct 30 17:10:42 2002 ++++ lum/include/linux/fs.h.orig Tue Oct 22 23:15:00 2002 +@@ -1261,7 +1261,7 @@ + extern void set_buffer_flushtime(struct buffer_head *); + extern void balance_dirty(void); + extern int check_disk_change(kdev_t); +-extern int invalidate_inodes(struct super_block *); ++extern int invalidate_inodes(struct super_block *, int); + extern int invalidate_device(kdev_t, int); + extern void invalidate_inode_pages(struct inode *); + extern void invalidate_inode_pages2(struct address_space *); +--- lum/fs/smbfs/inode.c.orig Mon Feb 25 12:38:09 2002 ++++ lum/fs/smbfs/inode.c Thu Feb 6 21:34:26 2003 +@@ -166,7 +166,7 @@ + { + VERBOSE("\n"); + shrink_dcache_sb(SB_of(server)); +- invalidate_inodes(SB_of(server)); ++ invalidate_inodes(SB_of(server), 0); + } + + /* diff --git a/lustre/kernel_patches/patches/iod-rmap-exports.patch b/lustre/kernel_patches/patches/iod-rmap-exports.patch new file mode 100644 index 0000000..00eba97 --- /dev/null +++ b/lustre/kernel_patches/patches/iod-rmap-exports.patch @@ -0,0 +1,64 @@ +--- linux-chaos/fs/inode.c.b_io_export Wed Jan 29 16:56:15 2003 ++++ linux-chaos/fs/inode.c Wed Jan 29 16:56:27 2003 +@@ -66,7 +66,8 @@ + * 
NOTE! You also have to own the lock if you change + * the i_state of an inode while it is in use.. + */ +-static spinlock_t inode_lock = SPIN_LOCK_UNLOCKED; ++spinlock_t inode_lock = SPIN_LOCK_UNLOCKED; ++EXPORT_SYMBOL(inode_lock); + + /* + * Statistics gathering.. +--- linux-chaos/fs/Makefile.b_io_export Wed Jan 29 16:56:45 2003 ++++ linux-chaos/fs/Makefile Wed Jan 29 16:56:53 2003 +@@ -7,7 +7,7 @@ + + O_TARGET := fs.o + +-export-objs := filesystems.o open.o dcache.o buffer.o ++export-objs := filesystems.o open.o dcache.o buffer.o inode.o + mod-subdirs := nls + + obj-y := open.o read_write.o devices.o file_table.o buffer.o \ +--- linux-chaos/mm/filemap.c.b_io_export Wed Jan 29 16:50:39 2003 ++++ linux-chaos/mm/filemap.c Wed Jan 29 16:51:11 2003 +@@ -65,6 +65,7 @@ + * pagecache_lock + */ + spinlock_cacheline_t pagemap_lru_lock_cacheline = {SPIN_LOCK_UNLOCKED}; ++EXPORT_SYMBOL(pagemap_lru_lock_cacheline); + + #define CLUSTER_PAGES (1 << page_cluster) + #define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster) +--- linux-chaos/mm/vmscan.c.b_io_export Wed Jan 29 16:51:58 2003 ++++ linux-chaos/mm/vmscan.c Wed Jan 29 16:55:16 2003 +@@ -839,6 +839,7 @@ + set_current_state(TASK_RUNNING); + remove_wait_queue(&kswapd_done, &wait); + } ++EXPORT_SYMBOL(wakeup_kswapd); + + static void wakeup_memwaiters(void) + { +--- linux-chaos/mm/Makefile.b_io_export Wed Jan 29 16:52:46 2003 ++++ linux-chaos/mm/Makefile Wed Jan 29 16:54:23 2003 +@@ -9,7 +9,7 @@ + + O_TARGET := mm.o + +-export-objs := shmem.o filemap.o memory.o page_alloc.o mempool.o ++export-objs := shmem.o filemap.o memory.o page_alloc.o mempool.o vmscan.o + + obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ + vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \ +--- linux-chaos/mm/page_alloc.c.b_io_export Wed Jan 29 17:00:32 2003 ++++ linux-chaos/mm/page_alloc.c Wed Jan 29 17:01:31 2003 +@@ -31,6 +31,7 @@ + int nr_inactive_dirty_pages; + int nr_inactive_clean_pages; + pg_data_t *pgdat_list;
++EXPORT_SYMBOL(pgdat_list); + + /* + * The zone_table array is used to look up the address of the diff --git a/lustre/kernel_patches/patches/jbd-transno-cb.patch b/lustre/kernel_patches/patches/jbd-transno-cb.patch new file mode 100644 index 0000000..ceb086d --- /dev/null +++ b/lustre/kernel_patches/patches/jbd-transno-cb.patch @@ -0,0 +1,240 @@ + + + + fs/jbd/commit.c | 27 +++++++++++++++++++++--- + fs/jbd/journal.c | 1 + fs/jbd/transaction.c | 56 ++++++++++++++++++++++++++++++++++++++++----------- + include/linux/jbd.h | 20 ++++++++++++++++++ + 4 files changed, 90 insertions(+), 14 deletions(-) + +--- linux-2.4.19/fs/jbd/commit.c~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 ++++ linux-2.4.19-root/fs/jbd/commit.c Sun Jan 19 19:46:42 2003 +@@ -475,7 +475,7 @@ start_journal_io: + transaction's t_log_list queue, and metadata buffers are on + the t_iobuf_list queue. + +- Wait for the transactions in reverse order. That way we are ++ Wait for the buffers in reverse order. That way we are + less likely to be woken up until all IOs have completed, and + so we incur less scheduling load. + */ +@@ -566,8 +566,10 @@ start_journal_io: + + jbd_debug(3, "JBD: commit phase 6\n"); + +- if (is_journal_aborted(journal)) ++ if (is_journal_aborted(journal)) { ++ unlock_journal(journal); + goto skip_commit; ++ } + + /* Done it all: now write the commit record. We should have + * cleaned up our previous buffers by now, so if we are in abort +@@ -577,6 +579,7 @@ start_journal_io: + descriptor = journal_get_descriptor_buffer(journal); + if (!descriptor) { + __journal_abort_hard(journal); ++ unlock_journal(journal); + goto skip_commit; + } + +@@ -600,7 +603,6 @@ start_journal_io: + put_bh(bh); /* One for getblk() */ + journal_unlock_journal_head(descriptor); + } +- lock_journal(journal); + + /* End of a transaction! 
Finally, we can do checkpoint + processing: any buffers committed as a result of this +@@ -609,6 +611,25 @@ start_journal_io: + + skip_commit: + ++ /* Call any callbacks that had been registered for handles in this ++ * transaction. It is up to the callback to free any allocated ++ * memory. ++ */ ++ if (!list_empty(&commit_transaction->t_jcb)) { ++ struct list_head *p, *n; ++ int error = is_journal_aborted(journal); ++ ++ list_for_each_safe(p, n, &commit_transaction->t_jcb) { ++ struct journal_callback *jcb; ++ ++ jcb = list_entry(p, struct journal_callback, jcb_list); ++ list_del(p); ++ jcb->jcb_func(jcb, error); ++ } ++ } ++ ++ lock_journal(journal); ++ + jbd_debug(3, "JBD: commit phase 7\n"); + + J_ASSERT(commit_transaction->t_sync_datalist == NULL); +--- linux-2.4.19/fs/jbd/journal.c~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 ++++ linux-2.4.19-root/fs/jbd/journal.c Sun Jan 19 19:46:42 2003 +@@ -58,6 +58,7 @@ EXPORT_SYMBOL(journal_sync_buffer); + #endif + EXPORT_SYMBOL(journal_flush); + EXPORT_SYMBOL(journal_revoke); ++EXPORT_SYMBOL(journal_callback_set); + + EXPORT_SYMBOL(journal_init_dev); + EXPORT_SYMBOL(journal_init_inode); +--- linux-2.4.19/fs/jbd/transaction.c~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 ++++ linux-2.4.19-root/fs/jbd/transaction.c Sun Jan 19 19:46:42 2003 +@@ -57,6 +57,7 @@ static transaction_t * get_transaction ( + transaction->t_state = T_RUNNING; + transaction->t_tid = journal->j_transaction_sequence++; + transaction->t_expires = jiffies + journal->j_commit_interval; ++ INIT_LIST_HEAD(&transaction->t_jcb); + + /* Set up the commit timer for the new transaction. */ + J_ASSERT (!journal->j_commit_timer_active); +@@ -201,6 +202,20 @@ repeat_locked: + return 0; + } + ++/* Allocate a new handle. This should probably be in a slab... 
*/ ++static handle_t *new_handle(int nblocks) ++{ ++ handle_t *handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); ++ if (!handle) ++ return NULL; ++ memset(handle, 0, sizeof (handle_t)); ++ handle->h_buffer_credits = nblocks; ++ handle->h_ref = 1; ++ INIT_LIST_HEAD(&handle->h_jcb); ++ ++ return handle; ++} ++ + /* + * Obtain a new handle. + * +@@ -227,14 +242,11 @@ handle_t *journal_start(journal_t *journ + handle->h_ref++; + return handle; + } +- +- handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); ++ ++ handle = new_handle(nblocks); + if (!handle) + return ERR_PTR(-ENOMEM); +- memset (handle, 0, sizeof (handle_t)); + +- handle->h_buffer_credits = nblocks; +- handle->h_ref = 1; + current->journal_info = handle; + + err = start_this_handle(journal, handle); +@@ -333,14 +345,11 @@ handle_t *journal_try_start(journal_t *j + + if (is_journal_aborted(journal)) + return ERR_PTR(-EIO); +- +- handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); ++ ++ handle = new_handle(nblocks); + if (!handle) + return ERR_PTR(-ENOMEM); +- memset (handle, 0, sizeof (handle_t)); + +- handle->h_buffer_credits = nblocks; +- handle->h_ref = 1; + current->journal_info = handle; + + err = try_start_this_handle(journal, handle); +@@ -1328,6 +1337,28 @@ out: + #endif + + /* ++ * Register a callback function for this handle. The function will be ++ * called when the transaction that this handle is part of has been ++ * committed to disk with the original callback data struct and the ++ * error status of the journal as parameters. There is no guarantee of ++ * ordering between handles within a single transaction, nor between ++ * callbacks registered on the same handle. ++ * ++ * The caller is responsible for allocating the journal_callback struct. ++ * This is to allow the caller to add as much extra data to the callback ++ * as needed, but reduce the overhead of multiple allocations. 
The caller ++ * allocated struct must start with a struct journal_callback at offset 0, ++ * and has the caller-specific data afterwards. ++ */ ++void journal_callback_set(handle_t *handle, ++ void (*func)(struct journal_callback *jcb, int error), ++ struct journal_callback *jcb) ++{ ++ list_add(&jcb->jcb_list, &handle->h_jcb); ++ jcb->jcb_func = func; ++} ++ ++/* + * All done for a particular handle. + * + * There is not much action needed here. We just return any remaining +@@ -1393,7 +1424,10 @@ int journal_stop(handle_t *handle) + wake_up(&journal->j_wait_transaction_locked); + } + +- /* ++ /* Move callbacks from the handle to the transaction. */ ++ list_splice(&handle->h_jcb, &transaction->t_jcb); ++ ++ /* + * If the handle is marked SYNC, we need to set another commit + * going! We also want to force a commit if the current + * transaction is occupying too much of the log, or if the +--- linux-2.4.19/include/linux/jbd.h~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 ++++ linux-2.4.19-root/include/linux/jbd.h Sun Jan 19 19:46:42 2003 +@@ -249,6 +249,13 @@ static inline struct journal_head *bh2jh + return bh->b_private; + } + ++#define HAVE_JOURNAL_CALLBACK_STATUS ++struct journal_callback { ++ struct list_head jcb_list; ++ void (*jcb_func)(struct journal_callback *jcb, int error); ++ /* user data goes here */ ++}; ++ + struct jbd_revoke_table_s; + + /* The handle_t type represents a single atomic update being performed +@@ -279,6 +286,12 @@ struct handle_s + operations */ + int h_err; + ++ /* List of application registered callbacks for this handle. ++ * The function(s) will be called after the transaction that ++ * this handle is part of has been committed to disk. ++ */ ++ struct list_head h_jcb; ++ + /* Flags */ + unsigned int h_sync: 1; /* sync-on-close */ + unsigned int h_jdata: 1; /* force data journaling */ +@@ -398,6 +411,10 @@ struct transaction_s + + /* How many handles used this transaction? 
*/ + int t_handle_count; ++ ++ /* List of registered callback functions for this transaction. ++ * Called when the transaction is committed. */ ++ struct list_head t_jcb; + }; + + +@@ -646,6 +663,9 @@ extern int journal_flushpage(journal_t + extern int journal_try_to_free_buffers(journal_t *, struct page *, int); + extern int journal_stop(handle_t *); + extern int journal_flush (journal_t *); ++extern void journal_callback_set(handle_t *handle, ++ void (*fn)(struct journal_callback *,int), ++ struct journal_callback *jcb); + + extern void journal_lock_updates (journal_t *); + extern void journal_unlock_updates (journal_t *); diff --git a/lustre/kernel_patches/patches/kmem_cache_validate_hp.patch b/lustre/kernel_patches/patches/kmem_cache_validate_hp.patch new file mode 100644 index 0000000..03385a7 --- /dev/null +++ b/lustre/kernel_patches/patches/kmem_cache_validate_hp.patch @@ -0,0 +1,105 @@ + arch/ia64/mm/init.c | 6 +++++ + include/linux/slab.h | 1 + kernel/ksyms.c | 1 + mm/slab.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++ + 4 files changed, 61 insertions(+) + +--- linux-2.4.19-hp2_pnnl2/arch/ia64/mm/init.c~kmem_cache_validate_hp Sun Jan 19 18:59:23 2003 ++++ linux-2.4.19-hp2_pnnl2-root/arch/ia64/mm/init.c Sun Jan 19 18:59:24 2003 +@@ -44,6 +44,12 @@ unsigned long vmalloc_end = VMALLOC_END_ + static struct page *vmem_map; + static unsigned long num_dma_physpages; + ++struct page *check_get_page(unsigned long kaddr) ++{ ++#warning FIXME: Lustre team, is this solid? 
++ return virt_to_page(kaddr); ++} ++ + int + do_check_pgt_cache (int low, int high) + { +--- linux-2.4.19-hp2_pnnl2/include/linux/slab.h~kmem_cache_validate_hp Sun Jan 19 18:59:23 2003 ++++ linux-2.4.19-hp2_pnnl2-root/include/linux/slab.h Sun Jan 19 19:01:07 2003 +@@ -56,6 +56,7 @@ extern kmem_cache_t *kmem_cache_create(c + extern int kmem_cache_destroy(kmem_cache_t *); + extern int kmem_cache_shrink(kmem_cache_t *); + extern void *kmem_cache_alloc(kmem_cache_t *, int); ++extern int kmem_cache_validate(kmem_cache_t *cachep, void *objp); + extern void kmem_cache_free(kmem_cache_t *, void *); + extern unsigned int kmem_cache_size(kmem_cache_t *); + +--- linux-2.4.19-hp2_pnnl2/kernel/ksyms.c~kmem_cache_validate_hp Sun Jan 19 18:59:23 2003 ++++ linux-2.4.19-hp2_pnnl2-root/kernel/ksyms.c Sun Jan 19 19:00:32 2003 +@@ -118,6 +118,7 @@ EXPORT_SYMBOL(kmem_find_general_cachep); + EXPORT_SYMBOL(kmem_cache_create); + EXPORT_SYMBOL(kmem_cache_destroy); + EXPORT_SYMBOL(kmem_cache_shrink); ++EXPORT_SYMBOL(kmem_cache_validate); + EXPORT_SYMBOL(kmem_cache_alloc); + EXPORT_SYMBOL(kmem_cache_free); + EXPORT_SYMBOL(kmem_cache_size); +--- linux-2.4.19-hp2_pnnl2/mm/slab.c~kmem_cache_validate_hp Sun Jan 19 18:59:23 2003 ++++ linux-2.4.19-hp2_pnnl2-root/mm/slab.c Sun Jan 19 18:59:24 2003 +@@ -1207,6 +1207,59 @@ failed: + * Called with the cache-lock held. + */ + ++extern struct page *check_get_page(unsigned long kaddr); ++struct page *page_mem_map(struct page *page); ++static int kmem_check_cache_obj (kmem_cache_t * cachep, ++ slab_t *slabp, void * objp) ++{ ++ int i; ++ unsigned int objnr; ++ ++#if DEBUG ++ if (cachep->flags & SLAB_RED_ZONE) { ++ objp -= BYTES_PER_WORD; ++ if ( *(unsigned long *)objp != RED_MAGIC2) ++ /* Either write before start, or a double free. */ ++ return 0; ++ if (*(unsigned long *)(objp+cachep->objsize - ++ BYTES_PER_WORD) != RED_MAGIC2) ++ /* Either write past end, or a double free. 
*/ ++ return 0; ++ } ++#endif ++ ++ objnr = (objp-slabp->s_mem)/cachep->objsize; ++ if (objnr >= cachep->num) ++ return 0; ++ if (objp != slabp->s_mem + objnr*cachep->objsize) ++ return 0; ++ ++ /* Check slab's freelist to see if this obj is there. */ ++ for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { ++ if (i == objnr) ++ return 0; ++ } ++ return 1; ++} ++ ++ ++int kmem_cache_validate(kmem_cache_t *cachep, void *objp) ++{ ++ struct page *page = check_get_page((unsigned long)objp); ++ ++ if (!VALID_PAGE(page)) ++ return 0; ++ ++ if (!PageSlab(page)) ++ return 0; ++ ++ /* XXX check for freed slab objects ? */ ++ if (!kmem_check_cache_obj(cachep, GET_PAGE_SLAB(page), objp)) ++ return 0; ++ ++ return (cachep == GET_PAGE_CACHE(page)); ++} ++ + #if DEBUG + static int kmem_extra_free_checks (kmem_cache_t * cachep, + slab_t *slabp, void * objp) + +_ diff --git a/lustre/kernel_patches/patches/lustre_version.patch b/lustre/kernel_patches/patches/lustre_version.patch index 2e69e01..9ed43cf 100644 --- a/lustre/kernel_patches/patches/lustre_version.patch +++ b/lustre/kernel_patches/patches/lustre_version.patch @@ -1,11 +1,12 @@ - 0 files changed + include/linux/lustre_version.h | 1 + + 1 files changed, 1 insertion(+) ---- /dev/null 2002-08-30 16:31:37.000000000 -0700 -+++ linux-2.4.18-17.8.0-zab/include/linux/lustre_version.h 2002-12-06 14:52:30.000000000 -0800 +--- /dev/null Fri Aug 30 17:31:37 2002 ++++ linux-2.4.18-18.8.0-l7-root/include/linux/lustre_version.h Mon Jan 20 12:24:45 2003 @@ -0,0 +1 @@ -+#define LUSTRE_KERNEL_VERSION 5 ++#define LUSTRE_KERNEL_VERSION 10 _ diff --git a/lustre/kernel_patches/patches/vanilla-2.4.19.patch b/lustre/kernel_patches/patches/vanilla-2.4.19.patch index 72949cd..4ed5bb9 100644 --- a/lustre/kernel_patches/patches/vanilla-2.4.19.patch +++ b/lustre/kernel_patches/patches/vanilla-2.4.19.patch @@ -1,34 +1,33 @@ - arch/i386/mm/init.c | 6 + - arch/ia64/mm/init.c | 6 + - drivers/block/blkpg.c | 35 ++++++ + + + + 
arch/i386/mm/init.c | 6 + arch/ia64/mm/init.c | 6 + drivers/block/blkpg.c | 35 ++++ drivers/block/loop.c | 5 - drivers/ide/ide-disk.c | 6 + + drivers/ide/ide-disk.c | 6 fs/dcache.c | 1 fs/ext3/Makefile | 2 fs/ext3/super.c | 2 - fs/jbd/commit.c | 27 ++++- - fs/jbd/journal.c | 1 - fs/jbd/transaction.c | 56 ++++++++-- - fs/namei.c | 215 ++++++++++++++++++++++++++++++++--------- + fs/namei.c | 296 ++++++++++++++++++++++++++++++++++------- fs/nfsd/vfs.c | 2 - fs/open.c | 63 +++++++++--- - fs/stat.c | 30 ++++- + fs/open.c | 63 ++++++-- + fs/stat.c | 30 +++- include/linux/blkdev.h | 4 - include/linux/dcache.h | 31 +++++ - include/linux/fs.h | 14 ++ - include/linux/jbd.h | 20 +++ + include/linux/dcache.h | 31 ++++ + include/linux/fs.h | 23 +++ include/linux/lustre_version.h | 1 include/linux/slab.h | 1 - kernel/ksyms.c | 7 + - mm/slab.c | 53 ++++++++++ - 23 files changed, 502 insertions(+), 86 deletions(-) + kernel/ksyms.c | 7 + mm/slab.c | 53 +++++++ + 19 files changed, 501 insertions(+), 73 deletions(-) --- /dev/null Fri Aug 30 17:31:37 2002 -+++ linux-2.4.19-root/include/linux/lustre_version.h Sun Dec 15 16:58:43 2002 ++++ linux-2.4.19-root/include/linux/lustre_version.h Sun Jan 19 19:54:00 2003 @@ -0,0 +1 @@ -+#define LUSTRE_KERNEL_VERSION 5 ---- linux-2.4.19/arch/ia64/mm/init.c~vanilla-2.4.19 Sun Dec 15 16:58:43 2002 -+++ linux-2.4.19-root/arch/ia64/mm/init.c Sun Dec 15 16:58:43 2002 ++#define LUSTRE_KERNEL_VERSION 7 +--- linux-2.4.19/arch/ia64/mm/init.c~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 ++++ linux-2.4.19-root/arch/ia64/mm/init.c Sun Jan 19 19:46:42 2003 @@ -37,6 +37,12 @@ unsigned long MAX_DMA_ADDRESS = PAGE_OFF static unsigned long totalram_pages; @@ -42,8 +41,8 @@ int do_check_pgt_cache (int low, int high) { ---- linux-2.4.19/arch/i386/mm/init.c~vanilla-2.4.19 Sun Dec 15 16:58:43 2002 -+++ linux-2.4.19-root/arch/i386/mm/init.c Sun Dec 15 16:58:43 2002 +--- linux-2.4.19/arch/i386/mm/init.c~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 ++++ 
linux-2.4.19-root/arch/i386/mm/init.c Sun Jan 19 19:46:42 2003 @@ -43,6 +43,12 @@ unsigned long highstart_pfn, highend_pfn static unsigned long totalram_pages; static unsigned long totalhigh_pages; @@ -57,8 +56,8 @@ int do_check_pgt_cache(int low, int high) { int freed = 0; ---- linux-2.4.19/drivers/block/blkpg.c~vanilla-2.4.19 Sun Dec 15 16:58:43 2002 -+++ linux-2.4.19-root/drivers/block/blkpg.c Sun Dec 15 16:58:43 2002 +--- linux-2.4.19/drivers/block/blkpg.c~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 ++++ linux-2.4.19-root/drivers/block/blkpg.c Sun Jan 19 19:46:42 2003 @@ -296,3 +296,38 @@ int blk_ioctl(kdev_t dev, unsigned int c } @@ -98,8 +97,8 @@ +EXPORT_SYMBOL(dev_set_rdonly); +EXPORT_SYMBOL(dev_check_rdonly); +EXPORT_SYMBOL(dev_clear_rdonly); ---- linux-2.4.19/drivers/block/loop.c~vanilla-2.4.19 Sun Dec 15 16:58:43 2002 -+++ linux-2.4.19-root/drivers/block/loop.c Sun Dec 15 16:58:43 2002 +--- linux-2.4.19/drivers/block/loop.c~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 ++++ linux-2.4.19-root/drivers/block/loop.c Sun Jan 19 19:46:42 2003 @@ -474,6 +474,11 @@ static int loop_make_request(request_que spin_unlock_irq(&lo->lo_lock); @@ -112,8 +111,8 @@ if (lo->lo_flags & LO_FLAGS_READ_ONLY) goto err; } else if (rw == READA) { ---- linux-2.4.19/drivers/ide/ide-disk.c~vanilla-2.4.19 Sun Dec 15 16:58:43 2002 -+++ linux-2.4.19-root/drivers/ide/ide-disk.c Sun Dec 15 16:58:43 2002 +--- linux-2.4.19/drivers/ide/ide-disk.c~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 ++++ linux-2.4.19-root/drivers/ide/ide-disk.c Sun Jan 19 19:46:42 2003 @@ -551,6 +551,12 @@ static ide_startstop_t lba_48_rw_disk (i */ static ide_startstop_t do_rw_disk (ide_drive_t *drive, struct request *rq, unsigned long block) @@ -127,8 +126,8 @@ if (IDE_CONTROL_REG) OUT_BYTE(drive->ctl,IDE_CONTROL_REG); ---- linux-2.4.19/fs/ext3/Makefile~vanilla-2.4.19 Sun Dec 15 16:58:43 2002 -+++ linux-2.4.19-root/fs/ext3/Makefile Sun Dec 15 16:58:43 2002 +--- linux-2.4.19/fs/ext3/Makefile~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 
++++ linux-2.4.19-root/fs/ext3/Makefile Sun Jan 19 19:46:42 2003 @@ -9,6 +9,8 @@ O_TARGET := ext3.o @@ -138,8 +137,8 @@ obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ioctl.o namei.o super.o symlink.o obj-m := $(O_TARGET) ---- linux-2.4.19/fs/ext3/super.c~vanilla-2.4.19 Sun Dec 15 16:58:43 2002 -+++ linux-2.4.19-root/fs/ext3/super.c Sun Dec 15 16:58:43 2002 +--- linux-2.4.19/fs/ext3/super.c~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 ++++ linux-2.4.19-root/fs/ext3/super.c Sun Jan 19 19:46:42 2003 @@ -1744,7 +1744,7 @@ static void __exit exit_ext3_fs(void) unregister_filesystem(&ext3_fs_type); } @@ -149,189 +148,8 @@ MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); ---- linux-2.4.19/fs/jbd/commit.c~vanilla-2.4.19 Sun Dec 15 16:58:43 2002 -+++ linux-2.4.19-root/fs/jbd/commit.c Sun Dec 15 16:58:43 2002 -@@ -475,7 +475,7 @@ start_journal_io: - transaction's t_log_list queue, and metadata buffers are on - the t_iobuf_list queue. - -- Wait for the transactions in reverse order. That way we are -+ Wait for the buffers in reverse order. That way we are - less likely to be woken up until all IOs have completed, and - so we incur less scheduling load. - */ -@@ -566,8 +566,10 @@ start_journal_io: - - jbd_debug(3, "JBD: commit phase 6\n"); - -- if (is_journal_aborted(journal)) -+ if (is_journal_aborted(journal)) { -+ unlock_journal(journal); - goto skip_commit; -+ } - - /* Done it all: now write the commit record. 
We should have - * cleaned up our previous buffers by now, so if we are in abort -@@ -577,6 +579,7 @@ start_journal_io: - descriptor = journal_get_descriptor_buffer(journal); - if (!descriptor) { - __journal_abort_hard(journal); -+ unlock_journal(journal); - goto skip_commit; - } - -@@ -600,7 +603,6 @@ start_journal_io: - put_bh(bh); /* One for getblk() */ - journal_unlock_journal_head(descriptor); - } -- lock_journal(journal); - - /* End of a transaction! Finally, we can do checkpoint - processing: any buffers committed as a result of this -@@ -609,6 +611,25 @@ start_journal_io: - - skip_commit: - -+ /* Call any callbacks that had been registered for handles in this -+ * transaction. It is up to the callback to free any allocated -+ * memory. -+ */ -+ if (!list_empty(&commit_transaction->t_jcb)) { -+ struct list_head *p, *n; -+ int error = is_journal_aborted(journal); -+ -+ list_for_each_safe(p, n, &commit_transaction->t_jcb) { -+ struct journal_callback *jcb; -+ -+ jcb = list_entry(p, struct journal_callback, jcb_list); -+ list_del(p); -+ jcb->jcb_func(jcb, error); -+ } -+ } -+ -+ lock_journal(journal); -+ - jbd_debug(3, "JBD: commit phase 7\n"); - - J_ASSERT(commit_transaction->t_sync_datalist == NULL); ---- linux-2.4.19/fs/jbd/journal.c~vanilla-2.4.19 Sun Dec 15 16:58:43 2002 -+++ linux-2.4.19-root/fs/jbd/journal.c Sun Dec 15 16:58:43 2002 -@@ -58,6 +58,7 @@ EXPORT_SYMBOL(journal_sync_buffer); - #endif - EXPORT_SYMBOL(journal_flush); - EXPORT_SYMBOL(journal_revoke); -+EXPORT_SYMBOL(journal_callback_set); - - EXPORT_SYMBOL(journal_init_dev); - EXPORT_SYMBOL(journal_init_inode); ---- linux-2.4.19/fs/jbd/transaction.c~vanilla-2.4.19 Sun Dec 15 16:58:43 2002 -+++ linux-2.4.19-root/fs/jbd/transaction.c Sun Dec 15 16:58:43 2002 -@@ -57,6 +57,7 @@ static transaction_t * get_transaction ( - transaction->t_state = T_RUNNING; - transaction->t_tid = journal->j_transaction_sequence++; - transaction->t_expires = jiffies + journal->j_commit_interval; -+ 
INIT_LIST_HEAD(&transaction->t_jcb); - - /* Set up the commit timer for the new transaction. */ - J_ASSERT (!journal->j_commit_timer_active); -@@ -201,6 +202,20 @@ repeat_locked: - return 0; - } - -+/* Allocate a new handle. This should probably be in a slab... */ -+static handle_t *new_handle(int nblocks) -+{ -+ handle_t *handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); -+ if (!handle) -+ return NULL; -+ memset(handle, 0, sizeof (handle_t)); -+ handle->h_buffer_credits = nblocks; -+ handle->h_ref = 1; -+ INIT_LIST_HEAD(&handle->h_jcb); -+ -+ return handle; -+} -+ - /* - * Obtain a new handle. - * -@@ -227,14 +242,11 @@ handle_t *journal_start(journal_t *journ - handle->h_ref++; - return handle; - } -- -- handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); -+ -+ handle = new_handle(nblocks); - if (!handle) - return ERR_PTR(-ENOMEM); -- memset (handle, 0, sizeof (handle_t)); - -- handle->h_buffer_credits = nblocks; -- handle->h_ref = 1; - current->journal_info = handle; - - err = start_this_handle(journal, handle); -@@ -333,14 +345,11 @@ handle_t *journal_try_start(journal_t *j - - if (is_journal_aborted(journal)) - return ERR_PTR(-EIO); -- -- handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); -+ -+ handle = new_handle(nblocks); - if (!handle) - return ERR_PTR(-ENOMEM); -- memset (handle, 0, sizeof (handle_t)); - -- handle->h_buffer_credits = nblocks; -- handle->h_ref = 1; - current->journal_info = handle; - - err = try_start_this_handle(journal, handle); -@@ -1328,6 +1337,28 @@ out: - #endif - - /* -+ * Register a callback function for this handle. The function will be -+ * called when the transaction that this handle is part of has been -+ * committed to disk with the original callback data struct and the -+ * error status of the journal as parameters. There is no guarantee of -+ * ordering between handles within a single transaction, nor between -+ * callbacks registered on the same handle. 
-+ * -+ * The caller is responsible for allocating the journal_callback struct. -+ * This is to allow the caller to add as much extra data to the callback -+ * as needed, but reduce the overhead of multiple allocations. The caller -+ * allocated struct must start with a struct journal_callback at offset 0, -+ * and has the caller-specific data afterwards. -+ */ -+void journal_callback_set(handle_t *handle, -+ void (*func)(struct journal_callback *jcb, int error), -+ struct journal_callback *jcb) -+{ -+ list_add(&jcb->jcb_list, &handle->h_jcb); -+ jcb->jcb_func = func; -+} -+ -+/* - * All done for a particular handle. - * - * There is not much action needed here. We just return any remaining -@@ -1393,7 +1424,10 @@ int journal_stop(handle_t *handle) - wake_up(&journal->j_wait_transaction_locked); - } - -- /* -+ /* Move callbacks from the handle to the transaction. */ -+ list_splice(&handle->h_jcb, &transaction->t_jcb); -+ -+ /* - * If the handle is marked SYNC, we need to set another commit - * going! 
We also want to force a commit if the current - * transaction is occupying too much of the log, or if the ---- linux-2.4.19/include/linux/blkdev.h~vanilla-2.4.19 Sun Dec 15 16:58:43 2002 -+++ linux-2.4.19-root/include/linux/blkdev.h Sun Dec 15 17:02:24 2002 +--- linux-2.4.19/include/linux/blkdev.h~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 ++++ linux-2.4.19-root/include/linux/blkdev.h Sun Jan 19 21:05:55 2003 @@ -240,4 +240,8 @@ static inline unsigned int block_size(kd return retval; } @@ -341,8 +159,8 @@ +int dev_check_rdonly(kdev_t); +void dev_clear_rdonly(int); #endif ---- linux-2.4.19/include/linux/slab.h~vanilla-2.4.19 Sun Dec 15 16:58:43 2002 -+++ linux-2.4.19-root/include/linux/slab.h Sun Dec 15 17:02:12 2002 +--- linux-2.4.19/include/linux/slab.h~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 ++++ linux-2.4.19-root/include/linux/slab.h Sun Jan 19 21:05:52 2003 @@ -57,6 +57,7 @@ extern int kmem_cache_destroy(kmem_cache extern int kmem_cache_shrink(kmem_cache_t *); extern void *kmem_cache_alloc(kmem_cache_t *, int); @@ -351,58 +169,8 @@ extern void *kmalloc(size_t, int); extern void kfree(const void *); ---- linux-2.4.19/include/linux/jbd.h~vanilla-2.4.19 Sun Dec 15 16:58:43 2002 -+++ linux-2.4.19-root/include/linux/jbd.h Sun Dec 15 16:58:43 2002 -@@ -249,6 +249,13 @@ static inline struct journal_head *bh2jh - return bh->b_private; - } - -+#define HAVE_JOURNAL_CALLBACK_STATUS -+struct journal_callback { -+ struct list_head jcb_list; -+ void (*jcb_func)(struct journal_callback *jcb, int error); -+ /* user data goes here */ -+}; -+ - struct jbd_revoke_table_s; - - /* The handle_t type represents a single atomic update being performed -@@ -279,6 +286,12 @@ struct handle_s - operations */ - int h_err; - -+ /* List of application registered callbacks for this handle. -+ * The function(s) will be called after the transaction that -+ * this handle is part of has been committed to disk. 
-+ */ -+ struct list_head h_jcb; -+ - /* Flags */ - unsigned int h_sync: 1; /* sync-on-close */ - unsigned int h_jdata: 1; /* force data journaling */ -@@ -398,6 +411,10 @@ struct transaction_s - - /* How many handles used this transaction? */ - int t_handle_count; -+ -+ /* List of registered callback functions for this transaction. -+ * Called when the transaction is committed. */ -+ struct list_head t_jcb; - }; - - -@@ -646,6 +663,9 @@ extern int journal_flushpage(journal_t - extern int journal_try_to_free_buffers(journal_t *, struct page *, int); - extern int journal_stop(handle_t *); - extern int journal_flush (journal_t *); -+extern void journal_callback_set(handle_t *handle, -+ void (*fn)(struct journal_callback *,int), -+ struct journal_callback *jcb); - - extern void journal_lock_updates (journal_t *); - extern void journal_unlock_updates (journal_t *); ---- linux-2.4.19/kernel/ksyms.c~vanilla-2.4.19 Sun Dec 15 16:58:43 2002 -+++ linux-2.4.19-root/kernel/ksyms.c Sun Dec 15 17:03:55 2002 +--- linux-2.4.19/kernel/ksyms.c~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 ++++ linux-2.4.19-root/kernel/ksyms.c Sun Jan 19 19:46:42 2003 @@ -264,6 +264,7 @@ EXPORT_SYMBOL(read_cache_page); EXPORT_SYMBOL(set_page_dirty); EXPORT_SYMBOL(vfs_readlink); @@ -424,8 +192,8 @@ /* for stackable file systems (lofs, wrapfs, cryptfs, etc.) 
*/ EXPORT_SYMBOL(default_llseek); EXPORT_SYMBOL(dentry_open); ---- linux-2.4.19/include/linux/dcache.h~vanilla-2.4.19 Sun Dec 15 16:58:43 2002 -+++ linux-2.4.19-root/include/linux/dcache.h Sun Dec 15 17:02:11 2002 +--- linux-2.4.19/include/linux/dcache.h~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 ++++ linux-2.4.19-root/include/linux/dcache.h Sun Jan 19 19:46:42 2003 @@ -6,6 +6,34 @@ #include #include @@ -478,8 +246,8 @@ }; /* the dentry parameter passed to d_hash and d_compare is the parent ---- linux-2.4.19/include/linux/fs.h~vanilla-2.4.19 Sun Dec 15 16:58:43 2002 -+++ linux-2.4.19-root/include/linux/fs.h Sun Dec 15 17:02:11 2002 +--- linux-2.4.19/include/linux/fs.h~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 ++++ linux-2.4.19-root/include/linux/fs.h Sun Jan 19 21:05:40 2003 @@ -541,6 +541,7 @@ struct file { /* needed for tty driver, and maybe others */ @@ -499,16 +267,28 @@ /* * File types -@@ -853,6 +856,7 @@ struct file_operations { +@@ -853,16 +856,28 @@ struct file_operations { struct inode_operations { int (*create) (struct inode *,struct dentry *,int); struct dentry * (*lookup) (struct inode *,struct dentry *); + struct dentry * (*lookup2) (struct inode *,struct dentry *, struct lookup_intent *); int (*link) (struct dentry *,struct inode *,struct dentry *); ++ int (*link2) (struct inode *,struct inode *, const char *, int); int (*unlink) (struct inode *,struct dentry *); ++ int (*unlink2) (struct inode *, char *, int); int (*symlink) (struct inode *,struct dentry *,const char *); -@@ -863,6 +867,8 @@ struct inode_operations { ++ int (*symlink2) (struct inode *,const char *, int, const char *); + int (*mkdir) (struct inode *,struct dentry *,int); ++ int (*mkdir2) (struct inode *,char *, int,int); + int (*rmdir) (struct inode *,struct dentry *); ++ int (*rmdir2) (struct inode *, char *, int); + int (*mknod) (struct inode *,struct dentry *,int,int); ++ int (*mknod2) (struct inode *,char *, int,int,int); + int (*rename) (struct inode *, struct dentry *, struct inode 
*, struct dentry *); ++ int (*rename2) (struct inode *, struct inode *, ++ char *oldname, int oldlen, ++ char *newname, int newlen); int (*readlink) (struct dentry *, char *,int); int (*follow_link) (struct dentry *, struct nameidata *); + int (*follow_link2) (struct dentry *, struct nameidata *, @@ -516,15 +296,15 @@ void (*truncate) (struct inode *); int (*permission) (struct inode *, int); int (*revalidate) (struct dentry *); -@@ -999,6 +1005,7 @@ extern int unregister_filesystem(struct +@@ -999,6 +1014,7 @@ extern int unregister_filesystem(struct extern struct vfsmount *kern_mount(struct file_system_type *); extern int may_umount(struct vfsmount *); extern long do_mount(char *, char *, char *, unsigned long, void *); -+struct vfsmount *do_kern_mount(char *type, int flags, char *name, void *data); ++struct vfsmount *do_kern_mount(const char *fstype, int flags, char *name, void *data); extern void umount_tree(struct vfsmount *); #define kern_umount mntput -@@ -1329,6 +1336,7 @@ typedef int (*read_actor_t)(read_descrip +@@ -1329,6 +1345,7 @@ typedef int (*read_actor_t)(read_descrip extern loff_t default_llseek(struct file *file, loff_t offset, int origin); extern int FASTCALL(__user_walk(const char *, unsigned, struct nameidata *)); @@ -532,7 +312,7 @@ extern int FASTCALL(path_init(const char *, unsigned, struct nameidata *)); extern int FASTCALL(path_walk(const char *, struct nameidata *)); extern int FASTCALL(link_path_walk(const char *, struct nameidata *)); -@@ -1339,6 +1347,8 @@ extern struct dentry * lookup_one_len(co +@@ -1339,6 +1356,8 @@ extern struct dentry * lookup_one_len(co extern struct dentry * lookup_hash(struct qstr *, struct dentry *); #define user_path_walk(name,nd) __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd) #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd) @@ -541,7 +321,7 @@ extern void iput(struct inode *); extern void force_delete(struct inode *); -@@ -1448,6 +1458,8 @@ extern struct file_operations 
generic_ro +@@ -1448,6 +1467,8 @@ extern struct file_operations generic_ro extern int vfs_readlink(struct dentry *, char *, int, const char *); extern int vfs_follow_link(struct nameidata *, const char *); @@ -550,8 +330,8 @@ extern int page_readlink(struct dentry *, char *, int); extern int page_follow_link(struct dentry *, struct nameidata *); extern struct inode_operations page_symlink_inode_operations; ---- linux-2.4.19/fs/dcache.c~vanilla-2.4.19 Sun Dec 15 16:58:43 2002 -+++ linux-2.4.19-root/fs/dcache.c Sun Dec 15 16:58:43 2002 +--- linux-2.4.19/fs/dcache.c~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 ++++ linux-2.4.19-root/fs/dcache.c Sun Jan 19 19:46:42 2003 @@ -616,6 +616,7 @@ struct dentry * d_alloc(struct dentry * dentry->d_op = NULL; dentry->d_fsdata = NULL; @@ -560,8 +340,8 @@ INIT_LIST_HEAD(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); INIT_LIST_HEAD(&dentry->d_subdirs); ---- linux-2.4.19/fs/nfsd/vfs.c~vanilla-2.4.19 Sun Dec 15 16:58:43 2002 -+++ linux-2.4.19-root/fs/nfsd/vfs.c Sun Dec 15 16:58:43 2002 +--- linux-2.4.19/fs/nfsd/vfs.c~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 ++++ linux-2.4.19-root/fs/nfsd/vfs.c Sun Jan 19 19:46:42 2003 @@ -1295,7 +1295,7 @@ nfsd_rename(struct svc_rqst *rqstp, stru err = nfserr_perm; } else @@ -571,8 +351,8 @@ if (!err && EX_ISSYNC(tfhp->fh_export)) { nfsd_sync_dir(tdentry); nfsd_sync_dir(fdentry); ---- linux-2.4.19/fs/namei.c~vanilla-2.4.19 Sun Dec 15 16:58:43 2002 -+++ linux-2.4.19-root/fs/namei.c Sun Dec 15 16:58:43 2002 +--- linux-2.4.19/fs/namei.c~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 ++++ linux-2.4.19-root/fs/namei.c Sun Jan 19 19:46:42 2003 @@ -94,6 +94,12 @@ * XEmacs seems to be relying on it... 
*/ @@ -743,7 +523,14 @@ break; } goto return_base; -@@ -651,10 +683,21 @@ return_err: +@@ -646,15 +678,28 @@ out_dput: + dput(dentry); + break; + } ++ if (err) ++ intent_release(nd->dentry, it); + path_release(nd); + return_err: return err; } @@ -766,7 +553,7 @@ } /* SMP-safe */ -@@ -757,7 +800,8 @@ int path_init(const char *name, unsigned +@@ -757,7 +802,8 @@ int path_init(const char *name, unsigned * needs parent already locked. Doesn't follow mounts. * SMP-safe. */ @@ -776,7 +563,7 @@ { struct dentry * dentry; struct inode *inode; -@@ -780,13 +824,16 @@ struct dentry * lookup_hash(struct qstr +@@ -780,13 +826,16 @@ struct dentry * lookup_hash(struct qstr goto out; } @@ -794,7 +581,7 @@ dentry = inode->i_op->lookup(inode, new); unlock_kernel(); if (!dentry) -@@ -798,6 +845,12 @@ out: +@@ -798,6 +847,12 @@ out: return dentry; } @@ -807,7 +594,7 @@ /* SMP-safe */ struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) { -@@ -819,7 +872,7 @@ struct dentry * lookup_one_len(const cha +@@ -819,7 +874,7 @@ struct dentry * lookup_one_len(const cha } this.hash = end_name_hash(hash); @@ -816,7 +603,7 @@ access: return ERR_PTR(-EACCES); } -@@ -851,6 +904,23 @@ int __user_walk(const char *name, unsign +@@ -851,6 +906,23 @@ int __user_walk(const char *name, unsign return err; } @@ -840,7 +627,7 @@ /* * It's inline, so penalty for filesystems that don't use sticky bit is * minimal. -@@ -987,7 +1057,8 @@ exit_lock: +@@ -987,7 +1059,8 @@ exit_lock: * for symlinks (where the permissions are checked later). 
* SMP-safe */ @@ -850,7 +637,7 @@ { int acc_mode, error = 0; struct inode *inode; -@@ -1002,7 +1073,7 @@ int open_namei(const char * pathname, in +@@ -1002,7 +1075,7 @@ int open_namei(const char * pathname, in */ if (!(flag & O_CREAT)) { if (path_init(pathname, lookup_flags(flag), nd)) @@ -859,7 +646,7 @@ if (error) return error; dentry = nd->dentry; -@@ -1012,6 +1083,10 @@ int open_namei(const char * pathname, in +@@ -1012,6 +1085,10 @@ int open_namei(const char * pathname, in /* * Create - we need to know the parent. */ @@ -870,7 +657,7 @@ if (path_init(pathname, LOOKUP_PARENT, nd)) error = path_walk(pathname, nd); if (error) -@@ -1028,7 +1103,7 @@ int open_namei(const char * pathname, in +@@ -1028,7 +1105,7 @@ int open_namei(const char * pathname, in dir = nd->dentry; down(&dir->d_inode->i_sem); @@ -879,7 +666,7 @@ do_last: error = PTR_ERR(dentry); -@@ -1037,6 +1112,7 @@ do_last: +@@ -1037,6 +1114,7 @@ do_last: goto exit; } @@ -887,7 +674,7 @@ /* Negative dentry, just create the file */ if (!dentry->d_inode) { error = vfs_create(dir->d_inode, dentry, -@@ -1070,7 +1146,8 @@ do_last: +@@ -1070,7 +1148,8 @@ do_last: error = -ENOENT; if (!dentry->d_inode) goto exit_dput; @@ -897,7 +684,7 @@ goto do_link; dput(nd->dentry); -@@ -1156,8 +1233,10 @@ ok: +@@ -1156,8 +1235,10 @@ ok: return 0; exit_dput: @@ -908,7 +695,7 @@ path_release(nd); return error; -@@ -1176,7 +1255,12 @@ do_link: +@@ -1176,7 +1257,12 @@ do_link: * are done. Procfs-like symlinks just set LAST_BIND. 
*/ UPDATE_ATIME(dentry->d_inode); @@ -922,7 +709,7 @@ dput(dentry); if (error) return error; -@@ -1198,13 +1282,20 @@ do_link: +@@ -1198,13 +1284,20 @@ do_link: } dir = nd->dentry; down(&dir->d_inode->i_sem); @@ -945,7 +732,7 @@ { struct dentry *dentry; -@@ -1212,7 +1303,7 @@ static struct dentry *lookup_create(stru +@@ -1212,7 +1305,7 @@ static struct dentry *lookup_create(stru dentry = ERR_PTR(-EEXIST); if (nd->last_type != LAST_NORM) goto fail; @@ -954,7 +741,7 @@ if (IS_ERR(dentry)) goto fail; if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode) -@@ -1258,6 +1349,7 @@ asmlinkage long sys_mknod(const char * f +@@ -1258,6 +1351,7 @@ asmlinkage long sys_mknod(const char * f char * tmp; struct dentry * dentry; struct nameidata nd; @@ -962,16 +749,28 @@ if (S_ISDIR(mode)) return -EPERM; -@@ -1269,7 +1361,7 @@ asmlinkage long sys_mknod(const char * f +@@ -1269,7 +1363,19 @@ asmlinkage long sys_mknod(const char * f error = path_walk(tmp, &nd); if (error) goto out; - dentry = lookup_create(&nd, 0); ++ ++ if (nd.dentry->d_inode->i_op->mknod2) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->mknod2(nd.dentry->d_inode, ++ nd.last.name, ++ nd.last.len, ++ mode, dev); ++ /* the file system want to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } ++ + dentry = lookup_create(&nd, 0, &it); error = PTR_ERR(dentry); mode &= ~current->fs->umask; -@@ -1287,6 +1379,7 @@ asmlinkage long sys_mknod(const char * f +@@ -1287,9 +1393,11 @@ asmlinkage long sys_mknod(const char * f default: error = -EINVAL; } @@ -979,7 +778,11 @@ dput(dentry); } up(&nd.dentry->d_inode->i_sem); -@@ -1327,6 +1420,7 @@ asmlinkage long sys_mkdir(const char * p ++ out2: + path_release(&nd); + out: + putname(tmp); +@@ -1327,6 +1435,7 @@ asmlinkage long sys_mkdir(const char * p { int error = 0; char * tmp; @@ -987,11 +790,21 @@ tmp = getname(pathname); error = PTR_ERR(tmp); -@@ -1338,11 +1432,12 @@ asmlinkage long sys_mkdir(const char * p +@@ 
-1338,14 +1447,26 @@ asmlinkage long sys_mkdir(const char * p error = path_walk(tmp, &nd); if (error) goto out; - dentry = lookup_create(&nd, 1); ++ if (nd.dentry->d_inode->i_op->mkdir2) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->mkdir2(nd.dentry->d_inode, ++ nd.last.name, ++ nd.last.len, ++ mode); ++ /* the file system want to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } + dentry = lookup_create(&nd, 1, &it); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { @@ -1001,7 +814,11 @@ dput(dentry); } up(&nd.dentry->d_inode->i_sem); -@@ -1426,6 +1521,7 @@ asmlinkage long sys_rmdir(const char * p ++out2: + path_release(&nd); + out: + putname(tmp); +@@ -1426,6 +1547,7 @@ asmlinkage long sys_rmdir(const char * p char * name; struct dentry *dentry; struct nameidata nd; @@ -1009,9 +826,19 @@ name = getname(pathname); if(IS_ERR(name)) -@@ -1448,10 +1544,11 @@ asmlinkage long sys_rmdir(const char * p +@@ -1447,11 +1569,21 @@ asmlinkage long sys_rmdir(const char * p + error = -EBUSY; goto exit1; } ++ if (nd.dentry->d_inode->i_op->rmdir2) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->rmdir2(nd.dentry->d_inode, ++ nd.last.name, ++ nd.last.len); ++ /* the file system want to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit1; ++ } down(&nd.dentry->d_inode->i_sem); - dentry = lookup_hash(&nd.last, nd.dentry); + dentry = lookup_hash_it(&nd.last, nd.dentry, &it); @@ -1022,7 +849,7 @@ dput(dentry); } up(&nd.dentry->d_inode->i_sem); -@@ -1495,6 +1592,7 @@ asmlinkage long sys_unlink(const char * +@@ -1495,6 +1627,7 @@ asmlinkage long sys_unlink(const char * char * name; struct dentry *dentry; struct nameidata nd; @@ -1030,16 +857,26 @@ name = getname(pathname); if(IS_ERR(name)) -@@ -1508,7 +1606,7 @@ asmlinkage long sys_unlink(const char * +@@ -1507,8 +1640,17 @@ asmlinkage long sys_unlink(const char * + error = -EISDIR; if (nd.last_type != LAST_NORM) goto exit1; ++ if 
(nd.dentry->d_inode->i_op->unlink2) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->unlink2(nd.dentry->d_inode, ++ nd.last.name, ++ nd.last.len); ++ /* the file system want to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit1; ++ } down(&nd.dentry->d_inode->i_sem); - dentry = lookup_hash(&nd.last, nd.dentry); + dentry = lookup_hash_it(&nd.last, nd.dentry, &it); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { /* Why not before? Because we want correct error value */ -@@ -1516,6 +1614,7 @@ asmlinkage long sys_unlink(const char * +@@ -1516,6 +1658,7 @@ asmlinkage long sys_unlink(const char * goto slashes; error = vfs_unlink(nd.dentry->d_inode, dentry); exit2: @@ -1047,7 +884,7 @@ dput(dentry); } up(&nd.dentry->d_inode->i_sem); -@@ -1562,6 +1661,7 @@ asmlinkage long sys_symlink(const char * +@@ -1562,6 +1705,7 @@ asmlinkage long sys_symlink(const char * int error = 0; char * from; char * to; @@ -1055,11 +892,21 @@ from = getname(oldname); if(IS_ERR(from)) -@@ -1576,10 +1676,12 @@ asmlinkage long sys_symlink(const char * +@@ -1576,15 +1720,28 @@ asmlinkage long sys_symlink(const char * error = path_walk(to, &nd); if (error) goto out; - dentry = lookup_create(&nd, 0); ++ if (nd.dentry->d_inode->i_op->symlink2) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->symlink2(nd.dentry->d_inode, ++ nd.last.name, ++ nd.last.len, ++ from); ++ /* the file system want to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } + it.it_data = from; + dentry = lookup_create(&nd, 0, &it); error = PTR_ERR(dentry); @@ -1069,7 +916,14 @@ dput(dentry); } up(&nd.dentry->d_inode->i_sem); -@@ -1645,6 +1747,7 @@ asmlinkage long sys_link(const char * ol ++ out2: + path_release(&nd); +-out: ++ out: + putname(to); + } + putname(from); +@@ -1645,6 +1802,7 @@ asmlinkage long sys_link(const char * ol int error; char * from; char * to; @@ -1077,7 +931,7 @@ from = getname(oldname); if(IS_ERR(from)) -@@ -1657,7 
+1760,7 @@ asmlinkage long sys_link(const char * ol +@@ -1657,7 +1815,7 @@ asmlinkage long sys_link(const char * ol error = 0; if (path_init(from, LOOKUP_POSITIVE, &old_nd)) @@ -1086,11 +940,21 @@ if (error) goto exit; if (path_init(to, LOOKUP_PARENT, &nd)) -@@ -1667,10 +1770,12 @@ asmlinkage long sys_link(const char * ol +@@ -1667,10 +1825,22 @@ asmlinkage long sys_link(const char * ol error = -EXDEV; if (old_nd.mnt != nd.mnt) goto out_release; - new_dentry = lookup_create(&nd, 0); ++ if (nd.dentry->d_inode->i_op->link2) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->link2(old_nd.dentry->d_inode, ++ nd.dentry->d_inode, ++ nd.last.name, ++ nd.last.len); ++ /* the file system want to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out_release; ++ } + it.it_op = IT_LINK2; + new_dentry = lookup_create(&nd, 0, &it); error = PTR_ERR(new_dentry); @@ -1100,7 +964,7 @@ dput(new_dentry); } up(&nd.dentry->d_inode->i_sem); -@@ -1713,7 +1818,8 @@ exit: +@@ -1713,7 +1883,8 @@ exit: * locking]. 
*/ int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, @@ -1110,7 +974,7 @@ { int error; struct inode *target; -@@ -1771,6 +1877,7 @@ int vfs_rename_dir(struct inode *old_dir +@@ -1771,6 +1942,7 @@ int vfs_rename_dir(struct inode *old_dir error = -EBUSY; else error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); @@ -1118,7 +982,7 @@ if (target) { if (!error) target->i_flags |= S_DEAD; -@@ -1792,7 +1899,8 @@ out_unlock: +@@ -1792,7 +1964,8 @@ out_unlock: } int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, @@ -1128,7 +992,7 @@ { int error; -@@ -1823,6 +1931,7 @@ int vfs_rename_other(struct inode *old_d +@@ -1823,6 +1996,7 @@ int vfs_rename_other(struct inode *old_d error = -EBUSY; else error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); @@ -1136,7 +1000,7 @@ double_up(&old_dir->i_zombie, &new_dir->i_zombie); if (error) return error; -@@ -1834,13 +1943,14 @@ int vfs_rename_other(struct inode *old_d +@@ -1834,13 +2008,14 @@ int vfs_rename_other(struct inode *old_d } int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, @@ -1154,7 +1018,7 @@ if (!error) { if (old_dir == new_dir) inode_dir_notify(old_dir, DN_RENAME); -@@ -1857,6 +1967,7 @@ static inline int do_rename(const char * +@@ -1857,6 +2032,7 @@ static inline int do_rename(const char * int error = 0; struct dentry * old_dir, * new_dir; struct dentry * old_dentry, *new_dentry; @@ -1162,8 +1026,24 @@ struct nameidata oldnd, newnd; if (path_init(oldname, LOOKUP_PARENT, &oldnd)) -@@ -1885,7 +1996,7 @@ static inline int do_rename(const char * - +@@ -1883,9 +2059,23 @@ static inline int do_rename(const char * + if (newnd.last_type != LAST_NORM) + goto exit2; + ++ if (old_dir->d_inode->i_op->rename2) { ++ lock_kernel(); ++ error = old_dir->d_inode->i_op->rename2(old_dir->d_inode, ++ new_dir->d_inode, ++ oldnd.last.name, ++ oldnd.last.len, ++ newnd.last.name, ++ newnd.last.len); ++ unlock_kernel(); ++ /* the file system want to use 
normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit2; ++ } ++ double_lock(new_dir, old_dir); - old_dentry = lookup_hash(&oldnd.last, old_dir); @@ -1171,7 +1051,7 @@ error = PTR_ERR(old_dentry); if (IS_ERR(old_dentry)) goto exit3; -@@ -1901,18 +2012,21 @@ static inline int do_rename(const char * +@@ -1901,18 +2091,21 @@ static inline int do_rename(const char * if (newnd.last.name[newnd.last.len]) goto exit4; } @@ -1195,7 +1075,7 @@ dput(old_dentry); exit3: double_up(&new_dir->d_inode->i_sem, &old_dir->d_inode->i_sem); -@@ -1961,7 +2075,8 @@ out: +@@ -1961,7 +2154,8 @@ out: } static inline int @@ -1205,7 +1085,7 @@ { int res = 0; char *name; -@@ -1974,7 +2089,7 @@ __vfs_follow_link(struct nameidata *nd, +@@ -1974,7 +2168,7 @@ __vfs_follow_link(struct nameidata *nd, /* weird __emul_prefix() stuff did it */ goto out; } @@ -1214,7 +1094,7 @@ out: if (current->link_count || res || nd->last_type!=LAST_NORM) return res; -@@ -1996,7 +2111,13 @@ fail: +@@ -1996,7 +2190,13 @@ fail: int vfs_follow_link(struct nameidata *nd, const char *link) { @@ -1229,7 +1109,7 @@ } /* get the link contents into pagecache */ -@@ -2038,7 +2159,7 @@ int page_follow_link(struct dentry *dent +@@ -2038,7 +2238,7 @@ int page_follow_link(struct dentry *dent { struct page *page = NULL; char *s = page_getlink(dentry, &page); @@ -1238,8 +1118,8 @@ if (page) { kunmap(page); page_cache_release(page); ---- linux-2.4.19/fs/open.c~vanilla-2.4.19 Sun Dec 15 16:58:43 2002 -+++ linux-2.4.19-root/fs/open.c Sun Dec 15 16:58:43 2002 +--- linux-2.4.19/fs/open.c~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 ++++ linux-2.4.19-root/fs/open.c Sun Jan 19 19:46:42 2003 @@ -19,6 +19,9 @@ #include @@ -1496,8 +1376,8 @@ /* * Find an empty file descriptor entry, and mark it busy. 
*/ ---- linux-2.4.19/fs/stat.c~vanilla-2.4.19 Sun Dec 15 16:58:43 2002 -+++ linux-2.4.19-root/fs/stat.c Sun Dec 15 16:58:43 2002 +--- linux-2.4.19/fs/stat.c~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 ++++ linux-2.4.19-root/fs/stat.c Sun Jan 19 19:46:42 2003 @@ -13,6 +13,7 @@ #include @@ -1630,8 +1510,8 @@ path_release(&nd); } return error; ---- linux-2.4.19/mm/slab.c~vanilla-2.4.19 Sun Dec 15 16:58:43 2002 -+++ linux-2.4.19-root/mm/slab.c Sun Dec 15 16:58:43 2002 +--- linux-2.4.19/mm/slab.c~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 ++++ linux-2.4.19-root/mm/slab.c Sun Jan 19 19:46:42 2003 @@ -1207,6 +1207,59 @@ failed: * Called with the cache-lock held. */ diff --git a/lustre/kernel_patches/patches/vfs_intent-2.4.18-18.patch b/lustre/kernel_patches/patches/vfs_intent-2.4.18-18.patch index ba7b7ac..7384675 100644 --- a/lustre/kernel_patches/patches/vfs_intent-2.4.18-18.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.4.18-18.patch @@ -1,25 +1,30 @@ - fs/dcache.c | 3 - fs/namei.c | 228 ++++++++++++++++++++++++++++++++++++++----------- + fs/dcache.c | 8 + + fs/namei.c | 288 ++++++++++++++++++++++++++++++++++++++++--------- fs/nfsd/vfs.c | 2 - fs/open.c | 53 +++++++++-- + fs/open.c | 53 +++++++-- fs/stat.c | 9 + - include/linux/dcache.h | 31 ++++++ - include/linux/fs.h | 13 ++ + include/linux/dcache.h | 25 ++++ + include/linux/fs.h | 22 +++ kernel/ksyms.c | 1 - 8 files changed, 278 insertions(+), 62 deletions(-) + 8 files changed, 345 insertions(+), 63 deletions(-) ---- linux-2.4.18-18.8.0-l4/fs/dcache.c~vfs_intent-2.4.18-18 Sat Dec 14 06:31:22 2002 -+++ linux-2.4.18-18.8.0-l4-root/fs/dcache.c Sat Dec 14 06:31:22 2002 -@@ -150,6 +150,8 @@ repeat: - unhash_it: - list_del_init(&dentry->d_hash); - +--- linux-2.4.18-49chaos-lustre9/fs/dcache.c~vfs_intent-2.4.18-18 Wed Jan 29 12:43:32 2003 ++++ linux-2.4.18-49chaos-lustre9-root/fs/dcache.c Wed Jan 29 12:43:32 2003 +@@ -186,6 +186,13 @@ int d_invalidate(struct dentry * dentry) + spin_unlock(&dcache_lock); + return 0; + } + 
++ /* network invalidation by Lustre */ ++ if (dentry->d_flags & DCACHE_LUSTRE_INVALID) { ++ spin_unlock(&dcache_lock); ++ return 0; ++ } + - kill_it: { - struct dentry *parent; - list_del(&dentry->d_child); -@@ -645,6 +647,7 @@ struct dentry * d_alloc(struct dentry * + /* + * Check whether to do a partial shrink_dcache + * to get rid of unused child entries. +@@ -645,6 +652,7 @@ struct dentry * d_alloc(struct dentry * dentry->d_fsdata = NULL; dentry->d_extra_attributes = NULL; dentry->d_mounted = 0; @@ -27,16 +32,9 @@ INIT_LIST_HEAD(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); INIT_LIST_HEAD(&dentry->d_subdirs); ---- linux-2.4.18-18.8.0-l4/fs/namei.c~vfs_intent-2.4.18-18 Sat Dec 14 06:31:22 2002 -+++ linux-2.4.18-18.8.0-l4-root/fs/namei.c Sat Dec 14 06:37:21 2002 -@@ -1,3 +1,6 @@ -+ -+ -+ - /* - * linux/fs/namei.c - * -@@ -94,6 +97,14 @@ +--- linux-2.4.18-49chaos-lustre9/fs/namei.c~vfs_intent-2.4.18-18 Wed Jan 29 12:43:32 2003 ++++ linux-2.4.18-49chaos-lustre9-root/fs/namei.c Wed Feb 5 16:23:06 2003 +@@ -94,6 +94,13 @@ * XEmacs seems to be relying on it... */ @@ -47,11 +45,10 @@ + +} + -+ /* In order to reduce some races, while at the same time doing additional * checking and hopefully speeding things up, we copy filenames to the * kernel data space before using them.. -@@ -260,10 +271,19 @@ void path_release(struct nameidata *nd) +@@ -260,10 +267,19 @@ void path_release(struct nameidata *nd) * Internal lookup() using the new generic dcache. * SMP-safe */ @@ -72,7 +69,7 @@ if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { if (!dentry->d_op->d_revalidate(dentry, flags) && !d_invalidate(dentry)) { dput(dentry); -@@ -281,7 +301,8 @@ static struct dentry * cached_lookup(str +@@ -281,11 +297,14 @@ static struct dentry * cached_lookup(str * make sure that nobody added the entry to the dcache in the meantime.. 
* SMP-safe */ @@ -82,7 +79,13 @@ { struct dentry * result; struct inode *dir = parent->d_inode; -@@ -300,6 +321,9 @@ static struct dentry * real_lookup(struc + ++again: ++ + down(&dir->i_sem); + /* + * First re-do the cached lookup just in case it was created +@@ -300,6 +319,9 @@ static struct dentry * real_lookup(struc result = ERR_PTR(-ENOMEM); if (dentry) { lock_kernel(); @@ -92,7 +95,7 @@ result = dir->i_op->lookup(dir, dentry); unlock_kernel(); if (result) -@@ -321,6 +345,12 @@ static struct dentry * real_lookup(struc +@@ -321,6 +343,12 @@ static struct dentry * real_lookup(struc dput(result); result = ERR_PTR(-ENOENT); } @@ -100,38 +103,38 @@ + if (!result->d_op->d_revalidate2(result, flags, it) && + !d_invalidate(result)) { + dput(result); -+ result = ERR_PTR(-ENOENT); ++ goto again; + } } return result; } -@@ -334,7 +364,8 @@ int max_recursive_link = 5; +@@ -334,7 +362,8 @@ int max_recursive_link = 5; * Without that kind of total limit, nasty chains of consecutive * symlinks can cause almost arbitrarily long lookups. 
*/ -static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd) -+static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd, -+ struct lookup_intent *it) ++static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd, ++ struct lookup_intent *it) { int err; if (current->link_count >= max_recursive_link) -@@ -348,10 +379,14 @@ static inline int do_follow_link(struct +@@ -348,10 +377,14 @@ static inline int do_follow_link(struct current->link_count++; current->total_link_count++; UPDATE_ATIME(dentry->d_inode); - err = dentry->d_inode->i_op->follow_link(dentry, nd); -+ if (dentry->d_inode->i_op->follow_link2) -+ err = dentry->d_inode->i_op->follow_link2(dentry, nd, it); -+ else -+ err = dentry->d_inode->i_op->follow_link(dentry, nd); ++ if (dentry->d_inode->i_op->follow_link2) ++ err = dentry->d_inode->i_op->follow_link2(dentry, nd, it); ++ else ++ err = dentry->d_inode->i_op->follow_link(dentry, nd); current->link_count--; return err; loop: -+ intent_release(dentry, it); ++ intent_release(dentry, it); path_release(nd); return -ELOOP; } -@@ -449,7 +484,8 @@ static inline void follow_dotdot(struct +@@ -449,7 +482,8 @@ static inline void follow_dotdot(struct * * We expect 'base' to be positive and a directory. */ @@ -141,7 +144,7 @@ { struct dentry *dentry; struct inode *inode; -@@ -526,12 +562,12 @@ int link_path_walk(const char * name, st +@@ -526,12 +560,12 @@ int link_path_walk(const char * name, st break; } /* This does the actual lookups.. 
*/ @@ -156,7 +159,7 @@ err = PTR_ERR(dentry); if (IS_ERR(dentry)) break; -@@ -548,8 +584,8 @@ int link_path_walk(const char * name, st +@@ -548,8 +582,8 @@ int link_path_walk(const char * name, st if (!inode->i_op) goto out_dput; @@ -167,7 +170,7 @@ dput(dentry); if (err) goto return_err; -@@ -565,7 +601,7 @@ int link_path_walk(const char * name, st +@@ -565,7 +599,7 @@ int link_path_walk(const char * name, st nd->dentry = dentry; } err = -ENOTDIR; @@ -176,7 +179,7 @@ break; continue; /* here ends the main loop */ -@@ -592,12 +628,12 @@ last_component: +@@ -592,12 +626,12 @@ last_component: if (err < 0) break; } @@ -191,30 +194,36 @@ err = PTR_ERR(dentry); if (IS_ERR(dentry)) break; -@@ -606,8 +642,10 @@ last_component: +@@ -606,8 +640,9 @@ last_component: ; inode = dentry->d_inode; if ((lookup_flags & LOOKUP_FOLLOW) - && inode && inode->i_op && inode->i_op->follow_link) { - err = do_follow_link(dentry, nd); -+ && inode && inode->i_op && -+ (inode->i_op->follow_link || -+ inode->i_op->follow_link2)) { ++ && inode && inode->i_op && ++ (inode->i_op->follow_link || inode->i_op->follow_link2)) { + err = do_follow_link(dentry, nd, it); dput(dentry); if (err) goto return_err; -@@ -621,7 +659,8 @@ last_component: +@@ -621,7 +656,8 @@ last_component: goto no_inode; if (lookup_flags & LOOKUP_DIRECTORY) { err = -ENOTDIR; - if (!inode->i_op || !inode->i_op->lookup) -+ if (!inode->i_op || (!inode->i_op->lookup && -+ !inode->i_op->lookup2)) ++ if (!inode->i_op || ++ (!inode->i_op->lookup && !inode->i_op->lookup2)) break; } goto return_base; -@@ -663,10 +702,21 @@ return_err: +@@ -658,15 +694,28 @@ out_dput: + dput(dentry); + break; + } ++ if (err) ++ intent_release(nd->dentry, it); + path_release(nd); + return_err: return err; } @@ -237,7 +246,7 @@ } /* SMP-safe */ -@@ -751,6 +801,17 @@ walk_init_root(const char *name, struct +@@ -751,6 +800,17 @@ walk_init_root(const char *name, struct } /* SMP-safe */ @@ -255,7 +264,7 @@ int path_lookup(const char *path, unsigned flags, 
struct nameidata *nd) { int error = 0; -@@ -779,7 +840,8 @@ int path_init(const char *name, unsigned +@@ -779,7 +839,8 @@ int path_init(const char *name, unsigned * needs parent already locked. Doesn't follow mounts. * SMP-safe. */ @@ -265,7 +274,7 @@ { struct dentry * dentry; struct inode *inode; -@@ -802,13 +864,16 @@ struct dentry * lookup_hash(struct qstr +@@ -802,13 +863,16 @@ struct dentry * lookup_hash(struct qstr goto out; } @@ -283,7 +292,7 @@ dentry = inode->i_op->lookup(inode, new); unlock_kernel(); if (!dentry) -@@ -820,6 +885,12 @@ out: +@@ -820,6 +884,12 @@ out: return dentry; } @@ -296,7 +305,7 @@ /* SMP-safe */ struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) { -@@ -841,7 +912,7 @@ struct dentry * lookup_one_len(const cha +@@ -841,7 +911,7 @@ struct dentry * lookup_one_len(const cha } this.hash = end_name_hash(hash); @@ -305,7 +314,7 @@ access: return ERR_PTR(-EACCES); } -@@ -872,6 +943,23 @@ int __user_walk(const char *name, unsign +@@ -872,6 +942,23 @@ int __user_walk(const char *name, unsign return err; } @@ -329,12 +338,12 @@ /* * It's inline, so penalty for filesystems that don't use sticky bit is * minimal. 
-@@ -1045,14 +1133,17 @@ int may_open(struct nameidata *nd, int a +@@ -1045,14 +1132,17 @@ int may_open(struct nameidata *nd, int a return get_lease(inode, flag); } +extern struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, -+ int flags, struct lookup_intent *it); ++ int flags, struct lookup_intent *it); + struct file *filp_open(const char * pathname, int open_flags, int mode) { @@ -344,11 +353,11 @@ struct dentry *dir; int flag = open_flags; struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_OPEN }; ++ struct lookup_intent it = { .it_op = IT_OPEN, .it_flags = open_flags }; int count = 0; if ((flag+1) & O_ACCMODE) -@@ -1066,7 +1157,7 @@ struct file *filp_open(const char * path +@@ -1066,7 +1156,7 @@ struct file *filp_open(const char * path * The simplest case - just a plain lookup. */ if (!(flag & O_CREAT)) { @@ -357,16 +366,16 @@ if (error) return ERR_PTR(error); dentry = nd.dentry; -@@ -1076,6 +1167,8 @@ struct file *filp_open(const char * path +@@ -1076,6 +1166,8 @@ struct file *filp_open(const char * path /* * Create - we need to know the parent. 
*/ -+ it.it_mode = mode; -+ it.it_op |= IT_CREAT; ++ it.it_mode = mode; ++ it.it_op |= IT_CREAT; error = path_lookup(pathname, LOOKUP_PARENT, &nd); if (error) return ERR_PTR(error); -@@ -1091,7 +1184,7 @@ struct file *filp_open(const char * path +@@ -1091,7 +1183,7 @@ struct file *filp_open(const char * path dir = nd.dentry; down(&dir->d_inode->i_sem); @@ -375,7 +384,7 @@ do_last: error = PTR_ERR(dentry); -@@ -1100,6 +1193,7 @@ do_last: +@@ -1100,6 +1192,7 @@ do_last: goto exit; } @@ -383,46 +392,46 @@ /* Negative dentry, just create the file */ if (!dentry->d_inode) { error = vfs_create(dir->d_inode, dentry, -@@ -1134,7 +1228,8 @@ do_last: +@@ -1134,7 +1227,8 @@ do_last: error = -ENOENT; if (!dentry->d_inode) goto exit_dput; - if (dentry->d_inode->i_op && dentry->d_inode->i_op->follow_link) -+ if (dentry->d_inode->i_op && (dentry->d_inode->i_op->follow_link || -+ dentry->d_inode->i_op->follow_link2)) ++ if (dentry->d_inode->i_op && (dentry->d_inode->i_op->follow_link || ++ dentry->d_inode->i_op->follow_link2)) goto do_link; dput(nd.dentry); -@@ -1149,11 +1244,13 @@ ok: +@@ -1149,11 +1243,13 @@ ok: if (!S_ISREG(nd.dentry->d_inode->i_mode)) open_flags &= ~O_TRUNC; - return dentry_open(nd.dentry, nd.mnt, open_flags); -+ return dentry_open_it(nd.dentry, nd.mnt, open_flags, &it); ++ return dentry_open_it(nd.dentry, nd.mnt, open_flags, &it); exit_dput: -+ intent_release(dentry, &it); ++ intent_release(dentry, &it); dput(dentry); exit: -+ intent_release(nd.dentry, &it); ++ intent_release(nd.dentry, &it); path_release(&nd); return ERR_PTR(error); -@@ -1172,7 +1269,12 @@ do_link: +@@ -1172,7 +1268,12 @@ do_link: * are done. Procfs-like symlinks just set LAST_BIND. 
*/ UPDATE_ATIME(dentry->d_inode); - error = dentry->d_inode->i_op->follow_link(dentry, &nd); -+ if (dentry->d_inode->i_op->follow_link2) -+ error = dentry->d_inode->i_op->follow_link2(dentry, &nd, &it); -+ else -+ error = dentry->d_inode->i_op->follow_link(dentry, &nd); ++ if (dentry->d_inode->i_op->follow_link2) ++ error = dentry->d_inode->i_op->follow_link2(dentry, &nd, &it); ++ else ++ error = dentry->d_inode->i_op->follow_link(dentry, &nd); + if (error) + intent_release(dentry, &it); dput(dentry); if (error) return error; -@@ -1194,13 +1296,15 @@ do_link: +@@ -1194,13 +1295,15 @@ do_link: } dir = nd.dentry; down(&dir->d_inode->i_sem); @@ -440,7 +449,7 @@ { struct dentry *dentry; -@@ -1208,7 +1312,7 @@ static struct dentry *lookup_create(stru +@@ -1208,7 +1311,7 @@ static struct dentry *lookup_create(stru dentry = ERR_PTR(-EEXIST); if (nd->last_type != LAST_NORM) goto fail; @@ -449,153 +458,158 @@ if (IS_ERR(dentry)) goto fail; if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode) -@@ -1254,6 +1358,7 @@ asmlinkage long sys_mknod(const char * f - char * tmp; - struct dentry * dentry; - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_MKNOD, .it_mode = mode }; - - if (S_ISDIR(mode)) - return -EPERM; -@@ -1264,7 +1369,7 @@ asmlinkage long sys_mknod(const char * f +@@ -1264,7 +1367,19 @@ asmlinkage long sys_mknod(const char * f error = path_lookup(tmp, LOOKUP_PARENT, &nd); if (error) goto out; - dentry = lookup_create(&nd, 0); -+ dentry = lookup_create(&nd, 0, &it); ++ ++ if (nd.dentry->d_inode->i_op->mknod2) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->mknod2(nd.dentry->d_inode, ++ nd.last.name, ++ nd.last.len, ++ mode, dev); ++ /* the file system want to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } ++ ++ dentry = lookup_create(&nd, 0, NULL); error = PTR_ERR(dentry); mode &= ~current->fs->umask; -@@ -1282,6 +1387,7 @@ asmlinkage long sys_mknod(const char * f - default: - error = 
-EINVAL; - } -+ intent_release(dentry, &it); +@@ -1285,6 +1400,7 @@ asmlinkage long sys_mknod(const char * f dput(dentry); } up(&nd.dentry->d_inode->i_sem); -@@ -1322,6 +1428,7 @@ asmlinkage long sys_mkdir(const char * p - { - int error = 0; - char * tmp; -+ struct lookup_intent it = { .it_op = IT_MKDIR, .it_mode = mode }; - - tmp = getname(pathname); - error = PTR_ERR(tmp); -@@ -1332,11 +1439,12 @@ asmlinkage long sys_mkdir(const char * p ++out2: + path_release(&nd); + out: + putname(tmp); +@@ -1332,7 +1448,17 @@ asmlinkage long sys_mkdir(const char * p error = path_lookup(tmp, LOOKUP_PARENT, &nd); if (error) goto out; - dentry = lookup_create(&nd, 1); -+ dentry = lookup_create(&nd, 1, &it); ++ if (nd.dentry->d_inode->i_op->mkdir2) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->mkdir2(nd.dentry->d_inode, ++ nd.last.name, ++ nd.last.len, ++ mode); ++ /* the file system want to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } ++ dentry = lookup_create(&nd, 1, NULL); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { error = vfs_mkdir(nd.dentry->d_inode, dentry, - mode & ~current->fs->umask); -+ intent_release(dentry, &it); +@@ -1340,6 +1466,7 @@ asmlinkage long sys_mkdir(const char * p dput(dentry); } up(&nd.dentry->d_inode->i_sem); -@@ -1420,6 +1528,7 @@ asmlinkage long sys_rmdir(const char * p - char * name; - struct dentry *dentry; - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_RMDIR }; - - name = getname(pathname); - if(IS_ERR(name)) -@@ -1441,10 +1550,11 @@ asmlinkage long sys_rmdir(const char * p ++out2: + path_release(&nd); + out: + putname(tmp); +@@ -1440,8 +1567,17 @@ asmlinkage long sys_rmdir(const char * p + error = -EBUSY; goto exit1; } ++ if (nd.dentry->d_inode->i_op->rmdir2) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->rmdir2(nd.dentry->d_inode, ++ nd.last.name, ++ nd.last.len); ++ /* the file system want to use normal vfs path now */ ++ if (error != 
-EOPNOTSUPP) ++ goto exit1; ++ } down(&nd.dentry->d_inode->i_sem); - dentry = lookup_hash(&nd.last, nd.dentry); -+ dentry = lookup_hash_it(&nd.last, nd.dentry, &it); ++ dentry = lookup_hash_it(&nd.last, nd.dentry, NULL); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { error = vfs_rmdir(nd.dentry->d_inode, dentry); -+ intent_release(dentry, &it); - dput(dentry); - } - up(&nd.dentry->d_inode->i_sem); -@@ -1488,6 +1598,7 @@ asmlinkage long sys_unlink(const char * - char * name; - struct dentry *dentry; - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_UNLINK }; - - name = getname(pathname); - if(IS_ERR(name)) -@@ -1500,7 +1611,7 @@ asmlinkage long sys_unlink(const char * +@@ -1499,8 +1635,17 @@ asmlinkage long sys_unlink(const char * + error = -EISDIR; if (nd.last_type != LAST_NORM) goto exit1; ++ if (nd.dentry->d_inode->i_op->unlink2) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->unlink2(nd.dentry->d_inode, ++ nd.last.name, ++ nd.last.len); ++ /* the file system want to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit1; ++ } down(&nd.dentry->d_inode->i_sem); - dentry = lookup_hash(&nd.last, nd.dentry); -+ dentry = lookup_hash_it(&nd.last, nd.dentry, &it); ++ dentry = lookup_hash_it(&nd.last, nd.dentry, NULL); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { /* Why not before? 
Because we want correct error value */ -@@ -1508,6 +1619,7 @@ asmlinkage long sys_unlink(const char * - goto slashes; - error = vfs_unlink(nd.dentry->d_inode, dentry); - exit2: -+ intent_release(dentry, &it); - dput(dentry); - } - up(&nd.dentry->d_inode->i_sem); -@@ -1554,6 +1666,7 @@ asmlinkage long sys_symlink(const char * - int error = 0; - char * from; - char * to; -+ struct lookup_intent it = { .it_op = IT_SYMLINK }; - - from = getname(oldname); - if(IS_ERR(from)) -@@ -1567,10 +1680,12 @@ asmlinkage long sys_symlink(const char * +@@ -1567,15 +1712,26 @@ asmlinkage long sys_symlink(const char * error = path_lookup(to, LOOKUP_PARENT, &nd); if (error) goto out; - dentry = lookup_create(&nd, 0); -+ it.it_data = from; -+ dentry = lookup_create(&nd, 0, &it); ++ if (nd.dentry->d_inode->i_op->symlink2) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->symlink2(nd.dentry->d_inode, ++ nd.last.name, ++ nd.last.len, ++ from); ++ /* the file system want to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } ++ dentry = lookup_create(&nd, 0, NULL); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { error = vfs_symlink(nd.dentry->d_inode, dentry, from); -+ intent_release(dentry, &it); dput(dentry); } up(&nd.dentry->d_inode->i_sem); -@@ -1635,6 +1750,7 @@ asmlinkage long sys_link(const char * ol - { - int error; - char * to; -+ struct lookup_intent it = { .it_op = IT_LINK }; - - to = getname(newname); - error = PTR_ERR(to); -@@ -1642,7 +1758,7 @@ asmlinkage long sys_link(const char * ol ++ out2: + path_release(&nd); +-out: ++ out: + putname(to); + } + putname(from); +@@ -1642,7 +1798,7 @@ asmlinkage long sys_link(const char * ol struct dentry *new_dentry; struct nameidata nd, old_nd; - error = __user_walk(oldname, LOOKUP_POSITIVE, &old_nd); -+ error = __user_walk_it(oldname, LOOKUP_POSITIVE, &old_nd, &it); ++ error = __user_walk_it(oldname, LOOKUP_POSITIVE, &old_nd, NULL); if (error) goto exit; error = path_lookup(to, 
LOOKUP_PARENT, &nd); -@@ -1651,10 +1767,12 @@ asmlinkage long sys_link(const char * ol +@@ -1651,7 +1807,17 @@ asmlinkage long sys_link(const char * ol error = -EXDEV; if (old_nd.mnt != nd.mnt) goto out_release; - new_dentry = lookup_create(&nd, 0); -+ it.it_op = IT_LINK2; -+ new_dentry = lookup_create(&nd, 0, &it); ++ if (nd.dentry->d_inode->i_op->link2) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->link2(old_nd.dentry->d_inode, ++ nd.dentry->d_inode, ++ nd.last.name, ++ nd.last.len); ++ /* the file system want to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out_release; ++ } ++ new_dentry = lookup_create(&nd, 0, NULL); error = PTR_ERR(new_dentry); if (!IS_ERR(new_dentry)) { error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry); -+ intent_release(new_dentry, &it); - dput(new_dentry); - } - up(&nd.dentry->d_inode->i_sem); -@@ -1695,7 +1813,8 @@ exit: +@@ -1695,7 +1861,8 @@ exit: * locking]. */ int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, @@ -605,7 +619,7 @@ { int error; struct inode *target; -@@ -1753,6 +1872,7 @@ int vfs_rename_dir(struct inode *old_dir +@@ -1753,6 +1920,7 @@ int vfs_rename_dir(struct inode *old_dir error = -EBUSY; else error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); @@ -613,7 +627,7 @@ if (target) { if (!error) target->i_flags |= S_DEAD; -@@ -1774,7 +1894,8 @@ out_unlock: +@@ -1774,7 +1942,8 @@ out_unlock: } int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, @@ -623,7 +637,7 @@ { int error; -@@ -1805,6 +1926,7 @@ int vfs_rename_other(struct inode *old_d +@@ -1805,6 +1974,7 @@ int vfs_rename_other(struct inode *old_d error = -EBUSY; else error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); @@ -631,7 +645,7 @@ double_up(&old_dir->i_zombie, &new_dir->i_zombie); if (error) return error; -@@ -1816,13 +1938,14 @@ int vfs_rename_other(struct inode *old_d +@@ -1816,13 +1986,14 @@ int vfs_rename_other(struct 
inode *old_d } int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, @@ -649,30 +663,37 @@ if (!error) { if (old_dir == new_dir) inode_dir_notify(old_dir, DN_RENAME); -@@ -1839,6 +1962,7 @@ static inline int do_rename(const char * - int error = 0; - struct dentry * old_dir, * new_dir; - struct dentry * old_dentry, *new_dentry; -+ struct lookup_intent it = { .it_op = IT_RENAME }; - struct nameidata oldnd, newnd; - - error = path_lookup(oldname, LOOKUP_PARENT, &oldnd); -@@ -1864,7 +1988,7 @@ static inline int do_rename(const char * - +@@ -1862,9 +2033,23 @@ static inline int do_rename(const char * + if (newnd.last_type != LAST_NORM) + goto exit2; + ++ if (old_dir->d_inode->i_op->rename2) { ++ lock_kernel(); ++ error = old_dir->d_inode->i_op->rename2(old_dir->d_inode, ++ new_dir->d_inode, ++ oldnd.last.name, ++ oldnd.last.len, ++ newnd.last.name, ++ newnd.last.len); ++ unlock_kernel(); ++ /* the file system want to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit2; ++ } ++ double_lock(new_dir, old_dir); - old_dentry = lookup_hash(&oldnd.last, old_dir); -+ old_dentry = lookup_hash_it(&oldnd.last, old_dir, &it); ++ old_dentry = lookup_hash_it(&oldnd.last, old_dir, NULL); error = PTR_ERR(old_dentry); if (IS_ERR(old_dentry)) goto exit3; -@@ -1880,18 +2004,21 @@ static inline int do_rename(const char * +@@ -1880,14 +2065,14 @@ static inline int do_rename(const char * if (newnd.last.name[newnd.last.len]) goto exit4; } - new_dentry = lookup_hash(&newnd.last, new_dir); -+ it.it_op = IT_RENAME2; -+ new_dentry = lookup_hash_it(&newnd.last, new_dir, &it); ++ new_dentry = lookup_hash_it(&newnd.last, new_dir, NULL); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto exit4; @@ -680,27 +701,21 @@ lock_kernel(); error = vfs_rename(old_dir->d_inode, old_dentry, - new_dir->d_inode, new_dentry); -+ new_dir->d_inode, new_dentry, &it); ++ new_dir->d_inode, new_dentry, NULL); unlock_kernel(); -+ intent_release(new_dentry, &it); dput(new_dentry); - 
exit4: -+ intent_release(old_dentry, &it); - dput(old_dentry); - exit3: - double_up(&new_dir->d_inode->i_sem, &old_dir->d_inode->i_sem); -@@ -1940,7 +2067,8 @@ out: +@@ -1940,7 +2125,8 @@ out: } static inline int -__vfs_follow_link(struct nameidata *nd, const char *link) -+__vfs_follow_link(struct nameidata *nd, const char *link, -+ struct lookup_intent *it) ++__vfs_follow_link(struct nameidata *nd, const char *link, ++ struct lookup_intent *it) { int res = 0; char *name; -@@ -1953,7 +2081,7 @@ __vfs_follow_link(struct nameidata *nd, +@@ -1953,7 +2139,7 @@ __vfs_follow_link(struct nameidata *nd, /* weird __emul_prefix() stuff did it */ goto out; } @@ -709,7 +724,7 @@ out: if (current->link_count || res || nd->last_type!=LAST_NORM) return res; -@@ -1975,7 +2103,13 @@ fail: +@@ -1975,7 +2161,13 @@ fail: int vfs_follow_link(struct nameidata *nd, const char *link) { @@ -717,14 +732,14 @@ + return __vfs_follow_link(nd, link, NULL); +} + -+int vfs_follow_link_it(struct nameidata *nd, const char *link, -+ struct lookup_intent *it) ++int vfs_follow_link_it(struct nameidata *nd, const char *link, ++ struct lookup_intent *it) +{ + return __vfs_follow_link(nd, link, it); } /* get the link contents into pagecache */ -@@ -2017,7 +2151,7 @@ int page_follow_link(struct dentry *dent +@@ -2017,7 +2209,7 @@ int page_follow_link(struct dentry *dent { struct page *page = NULL; char *s = page_getlink(dentry, &page); @@ -733,8 +748,8 @@ if (page) { kunmap(page); page_cache_release(page); ---- linux-2.4.18-18.8.0-l4/fs/nfsd/vfs.c~vfs_intent-2.4.18-18 Sat Dec 14 06:31:22 2002 -+++ linux-2.4.18-18.8.0-l4-root/fs/nfsd/vfs.c Sat Dec 14 06:31:22 2002 +--- linux-2.4.18-49chaos-lustre9/fs/nfsd/vfs.c~vfs_intent-2.4.18-18 Wed Jan 29 12:43:32 2003 ++++ linux-2.4.18-49chaos-lustre9-root/fs/nfsd/vfs.c Wed Jan 29 12:43:32 2003 @@ -1298,7 +1298,7 @@ nfsd_rename(struct svc_rqst *rqstp, stru err = nfserr_perm; } else @@ -744,8 +759,8 @@ unlock_kernel(); if (!err && EX_ISSYNC(tfhp->fh_export)) { 
nfsd_sync_dir(tdentry); ---- linux-2.4.18-18.8.0-l4/fs/open.c~vfs_intent-2.4.18-18 Sat Dec 14 06:31:22 2002 -+++ linux-2.4.18-18.8.0-l4-root/fs/open.c Sat Dec 14 06:31:22 2002 +--- linux-2.4.18-49chaos-lustre9/fs/open.c~vfs_intent-2.4.18-18 Wed Jan 29 12:43:32 2003 ++++ linux-2.4.18-49chaos-lustre9-root/fs/open.c Wed Jan 29 12:43:32 2003 @@ -19,6 +19,9 @@ #include @@ -760,7 +775,7 @@ struct nameidata nd; struct inode * inode; int error; -+ struct lookup_intent it = { .it_op = IT_SETATTR }; ++ struct lookup_intent it = { .it_op = IT_TRUNC }; error = -EINVAL; if (length < 0) /* sorry, but loff_t says... */ @@ -966,8 +981,8 @@ /* * Find an empty file descriptor entry, and mark it busy. */ ---- linux-2.4.18-18.8.0-l4/fs/stat.c~vfs_intent-2.4.18-18 Sat Dec 14 06:31:22 2002 -+++ linux-2.4.18-18.8.0-l4-root/fs/stat.c Sat Dec 14 06:31:22 2002 +--- linux-2.4.18-49chaos-lustre9/fs/stat.c~vfs_intent-2.4.18-18 Wed Jan 29 12:43:32 2003 ++++ linux-2.4.18-49chaos-lustre9-root/fs/stat.c Wed Jan 29 12:43:32 2003 @@ -13,6 +13,7 @@ #include @@ -1004,32 +1019,25 @@ path_release(&nd); } return error; ---- linux-2.4.18-18.8.0-l4/include/linux/dcache.h~vfs_intent-2.4.18-18 Sat Dec 14 06:31:22 2002 -+++ linux-2.4.18-18.8.0-l4-root/include/linux/dcache.h Sat Dec 14 06:31:22 2002 -@@ -6,6 +6,34 @@ +--- linux-2.4.18-49chaos-lustre9/include/linux/dcache.h~vfs_intent-2.4.18-18 Wed Jan 29 12:43:32 2003 ++++ linux-2.4.18-49chaos-lustre9-root/include/linux/dcache.h Wed Jan 29 12:43:32 2003 +@@ -6,6 +6,27 @@ #include #include -+#define IT_OPEN (1) -+#define IT_CREAT (1<<1) -+#define IT_MKDIR (1<<2) -+#define IT_LINK (1<<3) -+#define IT_LINK2 (1<<4) -+#define IT_SYMLINK (1<<5) -+#define IT_UNLINK (1<<6) -+#define IT_RMDIR (1<<7) -+#define IT_RENAME (1<<8) -+#define IT_RENAME2 (1<<9) -+#define IT_READDIR (1<<10) -+#define IT_GETATTR (1<<11) -+#define IT_SETATTR (1<<12) -+#define IT_READLINK (1<<13) -+#define IT_MKNOD (1<<14) -+#define IT_LOOKUP (1<<15) ++#define IT_OPEN (1) ++#define IT_CREAT (1<<1) 
++#define IT_READDIR (1<<2) ++#define IT_GETATTR (1<<3) ++#define IT_SETATTR (1<<4) ++#define IT_TRUNC (1<<5) ++#define IT_READLINK (1<<6) ++#define IT_LOOKUP (1<<7) + +struct lookup_intent { + int it_op; + int it_mode; ++ int it_flags; + int it_disposition; + int it_status; + struct iattr *it_iattr; @@ -1041,7 +1049,7 @@ /* * linux/include/linux/dcache.h * -@@ -78,6 +106,7 @@ struct dentry { +@@ -78,6 +99,7 @@ struct dentry { unsigned long d_time; /* used by d_revalidate */ struct dentry_operations *d_op; struct super_block * d_sb; /* The root of the dentry tree */ @@ -1049,7 +1057,7 @@ unsigned long d_vfs_flags; void * d_fsdata; /* fs-specific data */ void * d_extra_attributes; /* TUX-specific data */ -@@ -91,6 +120,8 @@ struct dentry_operations { +@@ -91,6 +113,8 @@ struct dentry_operations { int (*d_delete)(struct dentry *); void (*d_release)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); @@ -1058,8 +1066,16 @@ }; /* the dentry parameter passed to d_hash and d_compare is the parent ---- linux-2.4.18-18.8.0-l4/include/linux/fs.h~vfs_intent-2.4.18-18 Sat Dec 14 06:31:22 2002 -+++ linux-2.4.18-18.8.0-l4-root/include/linux/fs.h Sat Dec 14 06:33:11 2002 +@@ -124,6 +148,7 @@ d_iput: no no yes + * s_nfsd_free_path semaphore will be down + */ + #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. 
*/ ++#define DCACHE_LUSTRE_INVALID 0x0010 /* Lustre invalidated */ + + extern spinlock_t dcache_lock; + +--- linux-2.4.18-49chaos-lustre9/include/linux/fs.h~vfs_intent-2.4.18-18 Wed Jan 29 12:43:32 2003 ++++ linux-2.4.18-49chaos-lustre9-root/include/linux/fs.h Wed Jan 29 12:43:32 2003 @@ -576,6 +576,7 @@ struct file { /* needed for tty driver, and maybe others */ @@ -1079,24 +1095,36 @@ /* * File types -@@ -897,6 +900,7 @@ struct file_operations { +@@ -897,16 +900,28 @@ struct file_operations { struct inode_operations { int (*create) (struct inode *,struct dentry *,int); struct dentry * (*lookup) (struct inode *,struct dentry *); + struct dentry * (*lookup2) (struct inode *,struct dentry *, struct lookup_intent *); int (*link) (struct dentry *,struct inode *,struct dentry *); ++ int (*link2) (struct inode *,struct inode *, const char *, int); int (*unlink) (struct inode *,struct dentry *); ++ int (*unlink2) (struct inode *, const char *, int); int (*symlink) (struct inode *,struct dentry *,const char *); -@@ -907,6 +911,8 @@ struct inode_operations { ++ int (*symlink2) (struct inode *, const char *, int, const char *); + int (*mkdir) (struct inode *,struct dentry *,int); ++ int (*mkdir2) (struct inode *, const char *, int,int); + int (*rmdir) (struct inode *,struct dentry *); ++ int (*rmdir2) (struct inode *, const char *, int); + int (*mknod) (struct inode *,struct dentry *,int,int); ++ int (*mknod2) (struct inode *, const char *, int,int,int); + int (*rename) (struct inode *, struct dentry *, struct inode *, struct dentry *); ++ int (*rename2) (struct inode *, struct inode *, ++ const char *oldname, int oldlen, ++ const char *newname, int newlen); int (*readlink) (struct dentry *, char *,int); int (*follow_link) (struct dentry *, struct nameidata *); -+ int (*follow_link2) (struct dentry *, struct nameidata *, -+ struct lookup_intent *it); ++ int (*follow_link2) (struct dentry *, struct nameidata *, ++ struct lookup_intent *it); void (*truncate) (struct inode *); 
int (*permission) (struct inode *, int); int (*revalidate) (struct dentry *); -@@ -1381,6 +1387,7 @@ typedef int (*read_actor_t)(read_descrip +@@ -1383,6 +1398,7 @@ typedef int (*read_actor_t)(read_descrip extern loff_t default_llseek(struct file *file, loff_t offset, int origin); extern int FASTCALL(__user_walk(const char *, unsigned, struct nameidata *)); @@ -1104,7 +1132,7 @@ extern int FASTCALL(path_init(const char *, unsigned, struct nameidata *)); extern int FASTCALL(path_walk(const char *, struct nameidata *)); extern int FASTCALL(path_lookup(const char *, unsigned, struct nameidata *)); -@@ -1392,6 +1399,8 @@ extern struct dentry * lookup_one_len(co +@@ -1394,6 +1410,8 @@ extern struct dentry * lookup_one_len(co extern struct dentry * lookup_hash(struct qstr *, struct dentry *); #define user_path_walk(name,nd) __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd) #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd) @@ -1113,18 +1141,18 @@ extern void inode_init_once(struct inode *); extern void iput(struct inode *); -@@ -1492,6 +1501,8 @@ extern struct file_operations generic_ro +@@ -1494,6 +1512,8 @@ extern struct file_operations generic_ro extern int vfs_readlink(struct dentry *, char *, int, const char *); extern int vfs_follow_link(struct nameidata *, const char *); -+extern int vfs_follow_link_it(struct nameidata *, const char *, -+ struct lookup_intent *it); ++extern int vfs_follow_link_it(struct nameidata *, const char *, ++ struct lookup_intent *it); extern int page_readlink(struct dentry *, char *, int); extern int page_follow_link(struct dentry *, struct nameidata *); extern struct inode_operations page_symlink_inode_operations; ---- linux-2.4.18-18.8.0-l4/kernel/ksyms.c~vfs_intent-2.4.18-18 Sat Dec 14 06:31:22 2002 -+++ linux-2.4.18-18.8.0-l4-root/kernel/ksyms.c Sat Dec 14 06:31:22 2002 -@@ -293,6 +293,7 @@ EXPORT_SYMBOL(read_cache_page); +--- linux-2.4.18-49chaos-lustre9/kernel/ksyms.c~vfs_intent-2.4.18-18 Wed Jan 29 
12:43:32 2003 ++++ linux-2.4.18-49chaos-lustre9-root/kernel/ksyms.c Wed Jan 29 12:43:32 2003 +@@ -294,6 +294,7 @@ EXPORT_SYMBOL(read_cache_page); EXPORT_SYMBOL(set_page_dirty); EXPORT_SYMBOL(vfs_readlink); EXPORT_SYMBOL(vfs_follow_link); diff --git a/lustre/kernel_patches/patches/vfs_intent.patch b/lustre/kernel_patches/patches/vfs_intent.patch index 54c498a..75e404b 100644 --- a/lustre/kernel_patches/patches/vfs_intent.patch +++ b/lustre/kernel_patches/patches/vfs_intent.patch @@ -1,20 +1,30 @@ + fs/dcache.c | 8 + + fs/namei.c | 287 ++++++++++++++++++++++++++++++++++++++++--------- + fs/nfsd/vfs.c | 2 + fs/open.c | 53 +++++++-- + fs/stat.c | 9 + + include/linux/dcache.h | 25 ++++ + include/linux/fs.h | 22 +++ + kernel/ksyms.c | 1 + 8 files changed, 344 insertions(+), 63 deletions(-) - - - 0 files changed - ---- linux-2.4.18-17.8.0/fs/dcache.c~vfs_intent 2002-12-06 14:52:31.000000000 -0800 -+++ linux-2.4.18-17.8.0-zab/fs/dcache.c 2002-12-06 14:52:31.000000000 -0800 -@@ -150,6 +150,8 @@ repeat: - unhash_it: - list_del_init(&dentry->d_hash); - +--- linux-2.4.18-18.8.0-l7/fs/dcache.c~vfs_intent-2.4.18-18 Mon Jan 20 08:28:00 2003 ++++ linux-2.4.18-18.8.0-l7-root/fs/dcache.c Mon Jan 20 08:54:54 2003 +@@ -186,6 +188,13 @@ int d_invalidate(struct dentry * dentry) + spin_unlock(&dcache_lock); + return 0; + } + ++ /* network invalidation by Lustre */ ++ if (dentry->d_flags & DCACHE_LUSTRE_INVALID) { ++ spin_unlock(&dcache_lock); ++ return 0; ++ } + - kill_it: { - struct dentry *parent; - list_del(&dentry->d_child); -@@ -645,6 +647,7 @@ struct dentry * d_alloc(struct dentry * + /* + * Check whether to do a partial shrink_dcache + * to get rid of unused child entries. 
+@@ -645,6 +654,7 @@ struct dentry * d_alloc(struct dentry * dentry->d_fsdata = NULL; dentry->d_extra_attributes = NULL; dentry->d_mounted = 0; @@ -22,16 +32,9 @@ INIT_LIST_HEAD(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); INIT_LIST_HEAD(&dentry->d_subdirs); ---- linux-2.4.18-17.8.0/fs/namei.c~vfs_intent 2002-12-06 14:52:31.000000000 -0800 -+++ linux-2.4.18-17.8.0-zab/fs/namei.c 2002-12-06 14:52:31.000000000 -0800 -@@ -1,3 +1,6 @@ -+ -+ -+ - /* - * linux/fs/namei.c - * -@@ -94,6 +97,14 @@ +--- linux-2.4.18-18.8.0-l7/fs/namei.c~vfs_intent-2.4.18-18 Mon Jan 20 12:25:10 2003 ++++ linux-2.4.18-18.8.0-l7-root/fs/namei.c Wed Jan 22 22:53:28 2003 +@@ -94,6 +97,13 @@ * XEmacs seems to be relying on it... */ @@ -42,7 +45,6 @@ + +} + -+ /* In order to reduce some races, while at the same time doing additional * checking and hopefully speeding things up, we copy filenames to the * kernel data space before using them.. @@ -106,7 +108,7 @@ */ -static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd) +static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd, -+ struct lookup_intent *it) ++ struct lookup_intent *it) { int err; if (current->link_count >= max_recursive_link) @@ -115,14 +117,14 @@ current->total_link_count++; UPDATE_ATIME(dentry->d_inode); - err = dentry->d_inode->i_op->follow_link(dentry, nd); -+ if (dentry->d_inode->i_op->follow_link2) -+ err = dentry->d_inode->i_op->follow_link2(dentry, nd, it); -+ else -+ err = dentry->d_inode->i_op->follow_link(dentry, nd); ++ if (dentry->d_inode->i_op->follow_link2) ++ err = dentry->d_inode->i_op->follow_link2(dentry, nd, it); ++ else ++ err = dentry->d_inode->i_op->follow_link(dentry, nd); current->link_count--; return err; loop: -+ intent_release(dentry, it); ++ intent_release(dentry, it); path_release(nd); return -ELOOP; } @@ -186,15 +188,14 @@ err = PTR_ERR(dentry); if (IS_ERR(dentry)) break; -@@ -606,8 +642,10 @@ last_component: +@@ -606,8 +642,9 @@ last_component: ; inode = 
dentry->d_inode; if ((lookup_flags & LOOKUP_FOLLOW) - && inode && inode->i_op && inode->i_op->follow_link) { - err = do_follow_link(dentry, nd); + && inode && inode->i_op && -+ (inode->i_op->follow_link || -+ inode->i_op->follow_link2)) { ++ (inode->i_op->follow_link || inode->i_op->follow_link2)) { + err = do_follow_link(dentry, nd, it); dput(dentry); if (err) @@ -209,7 +210,14 @@ break; } goto return_base; -@@ -663,10 +702,21 @@ return_err: +@@ -658,15 +697,28 @@ out_dput: + dput(dentry); + break; + } ++ if (err) ++ intent_release(nd->dentry, it); + path_release(nd); + return_err: return err; } @@ -232,7 +240,7 @@ } /* SMP-safe */ -@@ -751,6 +801,17 @@ walk_init_root(const char *name, struct +@@ -751,6 +803,17 @@ walk_init_root(const char *name, struct } /* SMP-safe */ @@ -250,7 +258,7 @@ int path_lookup(const char *path, unsigned flags, struct nameidata *nd) { int error = 0; -@@ -779,7 +840,8 @@ int path_init(const char *name, unsigned +@@ -779,7 +842,8 @@ int path_init(const char *name, unsigned * needs parent already locked. Doesn't follow mounts. * SMP-safe. 
*/ @@ -260,7 +268,7 @@ { struct dentry * dentry; struct inode *inode; -@@ -802,13 +864,16 @@ struct dentry * lookup_hash(struct qstr +@@ -802,13 +866,16 @@ struct dentry * lookup_hash(struct qstr goto out; } @@ -278,7 +286,7 @@ dentry = inode->i_op->lookup(inode, new); unlock_kernel(); if (!dentry) -@@ -820,6 +885,12 @@ out: +@@ -820,6 +887,12 @@ out: return dentry; } @@ -291,7 +299,7 @@ /* SMP-safe */ struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) { -@@ -841,7 +912,7 @@ struct dentry * lookup_one_len(const cha +@@ -841,7 +914,7 @@ struct dentry * lookup_one_len(const cha } this.hash = end_name_hash(hash); @@ -300,7 +308,7 @@ access: return ERR_PTR(-EACCES); } -@@ -872,6 +943,23 @@ int __user_walk(const char *name, unsign +@@ -872,6 +945,23 @@ int __user_walk(const char *name, unsign return err; } @@ -324,103 +332,109 @@ /* * It's inline, so penalty for filesystems that don't use sticky bit is * minimal. -@@ -1010,7 +1098,8 @@ exit_lock: - * for symlinks (where the permissions are checked later). 
- * SMP-safe - */ --int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd) -+int open_namei_it(const char *pathname, int flag, int mode, -+ struct nameidata *nd, struct lookup_intent *it) +@@ -1045,14 +1135,17 @@ int may_open(struct nameidata *nd, int a + return get_lease(inode, flag); + } + ++extern struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, ++ int flags, struct lookup_intent *it); ++ + struct file *filp_open(const char * pathname, int open_flags, int mode) { int acc_mode, error = 0; - struct inode *inode; -@@ -1024,7 +1113,7 @@ int open_namei(const char * pathname, in +- struct inode *inode; + struct dentry *dentry; + struct dentry *dir; + int flag = open_flags; + struct nameidata nd; ++ struct lookup_intent it = { .it_op = IT_OPEN, .it_flags = open_flags }; + int count = 0; + + if ((flag+1) & O_ACCMODE) +@@ -1066,7 +1159,7 @@ struct file *filp_open(const char * path * The simplest case - just a plain lookup. */ if (!(flag & O_CREAT)) { -- error = path_lookup(pathname, lookup_flags(flag), nd); -+ error = path_lookup_it(pathname, lookup_flags(flag), nd, it); +- error = path_lookup(pathname, lookup_flags(flag), &nd); ++ error = path_lookup_it(pathname, lookup_flags(flag), &nd, &it); if (error) - return error; - dentry = nd->dentry; -@@ -1034,6 +1123,10 @@ int open_namei(const char * pathname, in + return ERR_PTR(error); + dentry = nd.dentry; +@@ -1076,6 +1169,8 @@ struct file *filp_open(const char * path /* * Create - we need to know the parent. 
*/ -+ if (it) { -+ it->it_mode = mode; -+ it->it_op |= IT_CREAT; -+ } - error = path_lookup(pathname, LOOKUP_PARENT, nd); ++ it.it_mode = mode; ++ it.it_op |= IT_CREAT; + error = path_lookup(pathname, LOOKUP_PARENT, &nd); if (error) - return error; -@@ -1049,7 +1142,7 @@ int open_namei(const char * pathname, in + return ERR_PTR(error); +@@ -1091,7 +1186,7 @@ struct file *filp_open(const char * path - dir = nd->dentry; + dir = nd.dentry; down(&dir->d_inode->i_sem); -- dentry = lookup_hash(&nd->last, nd->dentry); -+ dentry = lookup_hash_it(&nd->last, nd->dentry, it); +- dentry = lookup_hash(&nd.last, nd.dentry); ++ dentry = lookup_hash_it(&nd.last, nd.dentry, &it); do_last: error = PTR_ERR(dentry); -@@ -1058,6 +1151,7 @@ do_last: +@@ -1100,6 +1195,7 @@ do_last: goto exit; } -+ it->it_mode = mode; ++ it.it_mode = mode; /* Negative dentry, just create the file */ if (!dentry->d_inode) { error = vfs_create(dir->d_inode, dentry, -@@ -1091,7 +1185,8 @@ do_last: +@@ -1134,7 +1230,8 @@ do_last: error = -ENOENT; if (!dentry->d_inode) goto exit_dput; - if (dentry->d_inode->i_op && dentry->d_inode->i_op->follow_link) + if (dentry->d_inode->i_op && (dentry->d_inode->i_op->follow_link || -+ dentry->d_inode->i_op->follow_link2)) ++ dentry->d_inode->i_op->follow_link2)) goto do_link; - dput(nd->dentry); -@@ -1177,8 +1272,10 @@ ok: - return 0; + dput(nd.dentry); +@@ -1149,11 +1246,13 @@ ok: + if (!S_ISREG(nd.dentry->d_inode->i_mode)) + open_flags &= ~O_TRUNC; + +- return dentry_open(nd.dentry, nd.mnt, open_flags); ++ return dentry_open_it(nd.dentry, nd.mnt, open_flags, &it); exit_dput: -+ intent_release(dentry, it); ++ intent_release(dentry, &it); dput(dentry); exit: -+ intent_release(nd->dentry, it); - path_release(nd); - return error; ++ intent_release(nd.dentry, &it); + path_release(&nd); + return ERR_PTR(error); -@@ -1197,7 +1294,12 @@ do_link: +@@ -1172,7 +1271,12 @@ do_link: * are done. Procfs-like symlinks just set LAST_BIND. 
*/ UPDATE_ATIME(dentry->d_inode); -- error = dentry->d_inode->i_op->follow_link(dentry, nd); -+ if (dentry->d_inode->i_op->follow_link2) -+ error = dentry->d_inode->i_op->follow_link2(dentry, nd, it); -+ else -+ error = dentry->d_inode->i_op->follow_link(dentry, nd); +- error = dentry->d_inode->i_op->follow_link(dentry, &nd); ++ if (dentry->d_inode->i_op->follow_link2) ++ error = dentry->d_inode->i_op->follow_link2(dentry, &nd, &it); ++ else ++ error = dentry->d_inode->i_op->follow_link(dentry, &nd); + if (error) -+ intent_release(dentry, it); ++ intent_release(dentry, &it); dput(dentry); if (error) return error; -@@ -1219,13 +1321,20 @@ do_link: +@@ -1194,13 +1298,15 @@ do_link: } - dir = nd->dentry; + dir = nd.dentry; down(&dir->d_inode->i_sem); -- dentry = lookup_hash(&nd->last, nd->dentry); -+ dentry = lookup_hash_it(&nd->last, nd->dentry, it); - putname(nd->last.name); +- dentry = lookup_hash(&nd.last, nd.dentry); ++ dentry = lookup_hash_it(&nd.last, nd.dentry, &it); + putname(nd.last.name); goto do_last; } -+int open_namei(const char *pathname, int flag, int mode, struct nameidata *nd) -+{ -+ return open_namei_it(pathname, flag, mode, nd, NULL); -+} -+ + /* SMP-safe */ -static struct dentry *lookup_create(struct nameidata *nd, int is_dir) @@ -429,7 +443,7 @@ { struct dentry *dentry; -@@ -1233,7 +1342,7 @@ static struct dentry *lookup_create(stru +@@ -1208,7 +1314,7 @@ static struct dentry *lookup_create(stru dentry = ERR_PTR(-EEXIST); if (nd->last_type != LAST_NORM) goto fail; @@ -438,153 +452,158 @@ if (IS_ERR(dentry)) goto fail; if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode) -@@ -1279,6 +1388,7 @@ asmlinkage long sys_mknod(const char * f - char * tmp; - struct dentry * dentry; - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_MKNOD, .it_mode = mode }; - - if (S_ISDIR(mode)) - return -EPERM; -@@ -1289,7 +1399,7 @@ asmlinkage long sys_mknod(const char * f +@@ -1264,7 +1370,19 @@ asmlinkage long sys_mknod(const char * f error = 
path_lookup(tmp, LOOKUP_PARENT, &nd); if (error) goto out; - dentry = lookup_create(&nd, 0); -+ dentry = lookup_create(&nd, 0, &it); ++ ++ if (nd.dentry->d_inode->i_op->mknod2) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->mknod2(nd.dentry->d_inode, ++ nd.last.name, ++ nd.last.len, ++ mode, dev); ++ /* the file system want to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } ++ ++ dentry = lookup_create(&nd, 0, NULL); error = PTR_ERR(dentry); mode &= ~current->fs->umask; -@@ -1307,6 +1417,7 @@ asmlinkage long sys_mknod(const char * f - default: - error = -EINVAL; - } -+ intent_release(dentry, &it); +@@ -1285,6 +1403,7 @@ asmlinkage long sys_mknod(const char * f dput(dentry); } up(&nd.dentry->d_inode->i_sem); -@@ -1347,6 +1458,7 @@ asmlinkage long sys_mkdir(const char * p - { - int error = 0; - char * tmp; -+ struct lookup_intent it = { .it_op = IT_MKDIR, .it_mode = mode }; - - tmp = getname(pathname); - error = PTR_ERR(tmp); -@@ -1357,11 +1469,12 @@ asmlinkage long sys_mkdir(const char * p ++ out2: + path_release(&nd); + out: + putname(tmp); +@@ -1332,7 +1451,17 @@ asmlinkage long sys_mkdir(const char * p error = path_lookup(tmp, LOOKUP_PARENT, &nd); if (error) goto out; - dentry = lookup_create(&nd, 1); -+ dentry = lookup_create(&nd, 1, &it); ++ if (nd.dentry->d_inode->i_op->mkdir2) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->mkdir2(nd.dentry->d_inode, ++ nd.last.name, ++ nd.last.len, ++ mode); ++ /* the file system want to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } ++ dentry = lookup_create(&nd, 1, NULL); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { error = vfs_mkdir(nd.dentry->d_inode, dentry, - mode & ~current->fs->umask); -+ intent_release(dentry, &it); +@@ -1340,6 +1469,7 @@ asmlinkage long sys_mkdir(const char * p dput(dentry); } up(&nd.dentry->d_inode->i_sem); -@@ -1445,6 +1558,7 @@ asmlinkage long sys_rmdir(const char * p - char * name; 
- struct dentry *dentry; - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_RMDIR }; - - name = getname(pathname); - if(IS_ERR(name)) -@@ -1466,10 +1580,11 @@ asmlinkage long sys_rmdir(const char * p ++out2: + path_release(&nd); + out: + putname(tmp); +@@ -1440,8 +1570,17 @@ asmlinkage long sys_rmdir(const char * p + error = -EBUSY; goto exit1; } ++ if (nd.dentry->d_inode->i_op->rmdir2) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->rmdir2(nd.dentry->d_inode, ++ nd.last.name, ++ nd.last.len); ++ /* the file system want to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit1; ++ } down(&nd.dentry->d_inode->i_sem); - dentry = lookup_hash(&nd.last, nd.dentry); -+ dentry = lookup_hash_it(&nd.last, nd.dentry, &it); ++ dentry = lookup_hash_it(&nd.last, nd.dentry, NULL); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { error = vfs_rmdir(nd.dentry->d_inode, dentry); -+ intent_release(dentry, &it); - dput(dentry); - } - up(&nd.dentry->d_inode->i_sem); -@@ -1513,6 +1628,7 @@ asmlinkage long sys_unlink(const char * - char * name; - struct dentry *dentry; - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_UNLINK }; - - name = getname(pathname); - if(IS_ERR(name)) -@@ -1525,7 +1641,7 @@ asmlinkage long sys_unlink(const char * +@@ -1499,8 +1638,17 @@ asmlinkage long sys_unlink(const char * + error = -EISDIR; if (nd.last_type != LAST_NORM) goto exit1; ++ if (nd.dentry->d_inode->i_op->unlink2) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->unlink2(nd.dentry->d_inode, ++ nd.last.name, ++ nd.last.len); ++ /* the file system want to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit1; ++ } down(&nd.dentry->d_inode->i_sem); - dentry = lookup_hash(&nd.last, nd.dentry); -+ dentry = lookup_hash_it(&nd.last, nd.dentry, &it); ++ dentry = lookup_hash_it(&nd.last, nd.dentry, NULL); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { /* Why not before? 
Because we want correct error value */ -@@ -1533,6 +1649,7 @@ asmlinkage long sys_unlink(const char * - goto slashes; - error = vfs_unlink(nd.dentry->d_inode, dentry); - exit2: -+ intent_release(dentry, &it); - dput(dentry); - } - up(&nd.dentry->d_inode->i_sem); -@@ -1579,6 +1696,7 @@ asmlinkage long sys_symlink(const char * - int error = 0; - char * from; - char * to; -+ struct lookup_intent it = { .it_op = IT_SYMLINK }; - - from = getname(oldname); - if(IS_ERR(from)) -@@ -1592,10 +1710,12 @@ asmlinkage long sys_symlink(const char * +@@ -1567,15 +1715,26 @@ asmlinkage long sys_symlink(const char * error = path_lookup(to, LOOKUP_PARENT, &nd); if (error) goto out; - dentry = lookup_create(&nd, 0); -+ it.it_data = from; -+ dentry = lookup_create(&nd, 0, &it); ++ if (nd.dentry->d_inode->i_op->symlink2) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->symlink2(nd.dentry->d_inode, ++ nd.last.name, ++ nd.last.len, ++ from); ++ /* the file system want to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } ++ dentry = lookup_create(&nd, 0, NULL); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { error = vfs_symlink(nd.dentry->d_inode, dentry, from); -+ intent_release(dentry, &it); dput(dentry); } up(&nd.dentry->d_inode->i_sem); -@@ -1660,6 +1780,7 @@ asmlinkage long sys_link(const char * ol - { - int error; - char * to; -+ struct lookup_intent it = { .it_op = IT_LINK }; - - to = getname(newname); - error = PTR_ERR(to); -@@ -1667,7 +1788,7 @@ asmlinkage long sys_link(const char * ol ++ out2: + path_release(&nd); +-out: ++ out: + putname(to); + } + putname(from); +@@ -1642,7 +1801,7 @@ asmlinkage long sys_link(const char * ol struct dentry *new_dentry; struct nameidata nd, old_nd; - error = __user_walk(oldname, LOOKUP_POSITIVE, &old_nd); -+ error = __user_walk_it(oldname, LOOKUP_POSITIVE, &old_nd, &it); ++ error = __user_walk_it(oldname, LOOKUP_POSITIVE, &old_nd, NULL); if (error) goto exit; error = path_lookup(to, 
LOOKUP_PARENT, &nd); -@@ -1676,10 +1797,12 @@ asmlinkage long sys_link(const char * ol +@@ -1651,7 +1810,17 @@ asmlinkage long sys_link(const char * ol error = -EXDEV; if (old_nd.mnt != nd.mnt) goto out_release; - new_dentry = lookup_create(&nd, 0); -+ it.it_op = IT_LINK2; -+ new_dentry = lookup_create(&nd, 0, &it); ++ if (nd.dentry->d_inode->i_op->link2) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->link2(old_nd.dentry->d_inode, ++ nd.dentry->d_inode, ++ nd.last.name, ++ nd.last.len); ++ /* the file system want to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out_release; ++ } ++ new_dentry = lookup_create(&nd, 0, NULL); error = PTR_ERR(new_dentry); if (!IS_ERR(new_dentry)) { error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry); -+ intent_release(new_dentry, &it); - dput(new_dentry); - } - up(&nd.dentry->d_inode->i_sem); -@@ -1720,7 +1843,8 @@ exit: +@@ -1695,7 +1864,8 @@ exit: * locking]. */ int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, @@ -594,7 +613,7 @@ { int error; struct inode *target; -@@ -1778,6 +1902,7 @@ int vfs_rename_dir(struct inode *old_dir +@@ -1753,6 +1923,7 @@ int vfs_rename_dir(struct inode *old_dir error = -EBUSY; else error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); @@ -602,7 +621,7 @@ if (target) { if (!error) target->i_flags |= S_DEAD; -@@ -1799,7 +1924,8 @@ out_unlock: +@@ -1774,7 +1945,8 @@ out_unlock: } int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, @@ -612,7 +631,7 @@ { int error; -@@ -1830,6 +1956,7 @@ int vfs_rename_other(struct inode *old_d +@@ -1805,6 +1977,7 @@ int vfs_rename_other(struct inode *old_d error = -EBUSY; else error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); @@ -620,7 +639,7 @@ double_up(&old_dir->i_zombie, &new_dir->i_zombie); if (error) return error; -@@ -1841,13 +1968,14 @@ int vfs_rename_other(struct inode *old_d +@@ -1816,13 +1989,14 @@ int vfs_rename_other(struct 
inode *old_d } int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, @@ -638,30 +657,37 @@ if (!error) { if (old_dir == new_dir) inode_dir_notify(old_dir, DN_RENAME); -@@ -1864,6 +1992,7 @@ static inline int do_rename(const char * - int error = 0; - struct dentry * old_dir, * new_dir; - struct dentry * old_dentry, *new_dentry; -+ struct lookup_intent it = { .it_op = IT_RENAME }; - struct nameidata oldnd, newnd; - - error = path_lookup(oldname, LOOKUP_PARENT, &oldnd); -@@ -1889,7 +2018,7 @@ static inline int do_rename(const char * - +@@ -1862,9 +2036,23 @@ static inline int do_rename(const char * + if (newnd.last_type != LAST_NORM) + goto exit2; + ++ if (old_dir->d_inode->i_op->rename2) { ++ lock_kernel(); ++ error = old_dir->d_inode->i_op->rename2(old_dir->d_inode, ++ new_dir->d_inode, ++ oldnd.last.name, ++ oldnd.last.len, ++ newnd.last.name, ++ newnd.last.len); ++ unlock_kernel(); ++ /* the file system want to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit2; ++ } ++ double_lock(new_dir, old_dir); - old_dentry = lookup_hash(&oldnd.last, old_dir); -+ old_dentry = lookup_hash_it(&oldnd.last, old_dir, &it); ++ old_dentry = lookup_hash_it(&oldnd.last, old_dir, NULL); error = PTR_ERR(old_dentry); if (IS_ERR(old_dentry)) goto exit3; -@@ -1905,18 +2034,21 @@ static inline int do_rename(const char * +@@ -1880,14 +2068,14 @@ static inline int do_rename(const char * if (newnd.last.name[newnd.last.len]) goto exit4; } - new_dentry = lookup_hash(&newnd.last, new_dir); -+ it.it_op = IT_RENAME2; -+ new_dentry = lookup_hash_it(&newnd.last, new_dir, &it); ++ new_dentry = lookup_hash_it(&newnd.last, new_dir, NULL); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto exit4; @@ -669,27 +695,21 @@ lock_kernel(); error = vfs_rename(old_dir->d_inode, old_dentry, - new_dir->d_inode, new_dentry); -+ new_dir->d_inode, new_dentry, &it); ++ new_dir->d_inode, new_dentry, NULL); unlock_kernel(); -+ intent_release(new_dentry, &it); dput(new_dentry); - 
exit4: -+ intent_release(old_dentry, &it); - dput(old_dentry); - exit3: - double_up(&new_dir->d_inode->i_sem, &old_dir->d_inode->i_sem); -@@ -1965,7 +2097,8 @@ out: +@@ -1940,7 +2127,8 @@ out: } static inline int -__vfs_follow_link(struct nameidata *nd, const char *link) +__vfs_follow_link(struct nameidata *nd, const char *link, -+ struct lookup_intent *it) ++ struct lookup_intent *it) { int res = 0; char *name; -@@ -1978,7 +2111,7 @@ __vfs_follow_link(struct nameidata *nd, +@@ -1953,7 +2141,7 @@ __vfs_follow_link(struct nameidata *nd, /* weird __emul_prefix() stuff did it */ goto out; } @@ -698,7 +718,7 @@ out: if (current->link_count || res || nd->last_type!=LAST_NORM) return res; -@@ -2000,7 +2133,13 @@ fail: +@@ -1975,7 +2163,13 @@ fail: int vfs_follow_link(struct nameidata *nd, const char *link) { @@ -707,13 +727,13 @@ +} + +int vfs_follow_link_it(struct nameidata *nd, const char *link, -+ struct lookup_intent *it) ++ struct lookup_intent *it) +{ + return __vfs_follow_link(nd, link, it); } /* get the link contents into pagecache */ -@@ -2042,7 +2181,7 @@ int page_follow_link(struct dentry *dent +@@ -2017,7 +2211,7 @@ int page_follow_link(struct dentry *dent { struct page *page = NULL; char *s = page_getlink(dentry, &page); @@ -722,8 +742,8 @@ if (page) { kunmap(page); page_cache_release(page); ---- linux-2.4.18-17.8.0/fs/nfsd/vfs.c~vfs_intent 2002-12-06 14:52:31.000000000 -0800 -+++ linux-2.4.18-17.8.0-zab/fs/nfsd/vfs.c 2002-12-06 14:52:31.000000000 -0800 +--- linux-2.4.18-18.8.0-l7/fs/nfsd/vfs.c~vfs_intent-2.4.18-18 Mon Jan 20 12:25:10 2003 ++++ linux-2.4.18-18.8.0-l7-root/fs/nfsd/vfs.c Mon Jan 20 12:25:10 2003 @@ -1298,7 +1298,7 @@ nfsd_rename(struct svc_rqst *rqstp, stru err = nfserr_perm; } else @@ -733,8 +753,8 @@ unlock_kernel(); if (!err && EX_ISSYNC(tfhp->fh_export)) { nfsd_sync_dir(tdentry); ---- linux-2.4.18-17.8.0/fs/open.c~vfs_intent 2002-12-06 14:52:31.000000000 -0800 -+++ linux-2.4.18-17.8.0-zab/fs/open.c 2002-12-06 14:52:31.000000000 -0800 +--- 
linux-2.4.18-18.8.0-l7/fs/open.c~vfs_intent-2.4.18-18 Mon Jan 20 12:25:10 2003 ++++ linux-2.4.18-18.8.0-l7-root/fs/open.c Wed Jan 22 10:39:31 2003 @@ -19,6 +19,9 @@ #include @@ -749,7 +769,7 @@ struct nameidata nd; struct inode * inode; int error; -+ struct lookup_intent it = { .it_op = IT_SETATTR }; ++ struct lookup_intent it = { .it_op = IT_TRUNC }; error = -EINVAL; if (length < 0) /* sorry, but loff_t says... */ @@ -919,39 +939,7 @@ path_release(&nd); } return error; -@@ -638,10 +661,16 @@ asmlinkage long sys_fchown(unsigned int - * for the internal routines (ie open_namei()/follow_link() etc). 00 is - * used by symlinks. - */ -+extern int open_namei_it(const char *filename, int namei_flags, int mode, -+ struct nameidata *nd, struct lookup_intent *it); -+struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, -+ int flags, struct lookup_intent *it); -+ - struct file *filp_open(const char * filename, int flags, int mode) - { - int namei_flags, error; - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_OPEN }; - - namei_flags = flags; - if ((namei_flags+1) & O_ACCMODE) -@@ -649,18 +678,19 @@ struct file *filp_open(const char * file - if (namei_flags & O_TRUNC) - namei_flags |= 2; - -- error = open_namei(filename, namei_flags, mode, &nd); -- if (!error) -- return dentry_open(nd.dentry, nd.mnt, flags); -+ error = open_namei_it(filename, namei_flags, mode, &nd, &it); -+ if (error) -+ return ERR_PTR(error); - -- return ERR_PTR(error); -+ return dentry_open_it(nd.dentry, nd.mnt, flags, &it); - } - - extern ssize_t do_readahead(struct file *file, unsigned long index, unsigned long nr); +@@ -628,7 +651,8 @@ extern ssize_t do_readahead(struct file /* for files over a certains size it doesn't pay to do readahead on open */ #define READAHEAD_CUTOFF 48000 @@ -961,7 +949,7 @@ { struct file * f; struct inode *inode; -@@ -711,6 +741,7 @@ struct file *dentry_open(struct dentry * +@@ -693,6 +717,7 @@ struct file *dentry_open(struct dentry * 
do_readahead(f, 0, (48 * 1024) >> PAGE_SHIFT); @@ -969,7 +957,7 @@ return f; cleanup_all: -@@ -725,11 +756,17 @@ cleanup_all: +@@ -707,11 +732,17 @@ cleanup_all: cleanup_file: put_filp(f); cleanup_dentry: @@ -987,8 +975,8 @@ /* * Find an empty file descriptor entry, and mark it busy. */ ---- linux-2.4.18-17.8.0/fs/stat.c~vfs_intent 2002-12-06 14:52:31.000000000 -0800 -+++ linux-2.4.18-17.8.0-zab/fs/stat.c 2002-12-06 14:52:31.000000000 -0800 +--- linux-2.4.18-18.8.0-l7/fs/stat.c~vfs_intent-2.4.18-18 Mon Jan 20 12:25:10 2003 ++++ linux-2.4.18-18.8.0-l7-root/fs/stat.c Mon Jan 20 12:25:10 2003 @@ -13,6 +13,7 @@ #include @@ -1025,32 +1013,25 @@ path_release(&nd); } return error; ---- linux-2.4.18-17.8.0/include/linux/dcache.h~vfs_intent 2002-12-06 14:52:31.000000000 -0800 -+++ linux-2.4.18-17.8.0-zab/include/linux/dcache.h 2002-12-06 14:52:31.000000000 -0800 -@@ -6,6 +6,34 @@ +--- linux-2.4.18-18.8.0-l7/include/linux/dcache.h~vfs_intent-2.4.18-18 Mon Jan 20 12:25:10 2003 ++++ linux-2.4.18-18.8.0-l7-root/include/linux/dcache.h Wed Jan 22 19:38:12 2003 +@@ -6,6 +6,27 @@ #include #include -+#define IT_OPEN (1) -+#define IT_CREAT (1<<1) -+#define IT_MKDIR (1<<2) -+#define IT_LINK (1<<3) -+#define IT_LINK2 (1<<4) -+#define IT_SYMLINK (1<<5) -+#define IT_UNLINK (1<<6) -+#define IT_RMDIR (1<<7) -+#define IT_RENAME (1<<8) -+#define IT_RENAME2 (1<<9) -+#define IT_READDIR (1<<10) -+#define IT_GETATTR (1<<11) -+#define IT_SETATTR (1<<12) -+#define IT_READLINK (1<<13) -+#define IT_MKNOD (1<<14) -+#define IT_LOOKUP (1<<15) ++#define IT_OPEN (1) ++#define IT_CREAT (1<<1) ++#define IT_READDIR (1<<2) ++#define IT_GETATTR (1<<3) ++#define IT_SETATTR (1<<4) ++#define IT_TRUNC (1<<5) ++#define IT_READLINK (1<<6) ++#define IT_LOOKUP (1<<7) + +struct lookup_intent { + int it_op; + int it_mode; ++ int it_flags; + int it_disposition; + int it_status; + struct iattr *it_iattr; @@ -1062,7 +1043,7 @@ /* * linux/include/linux/dcache.h * -@@ -78,6 +106,7 @@ struct dentry { +@@ -78,6 +99,7 @@ 
struct dentry { unsigned long d_time; /* used by d_revalidate */ struct dentry_operations *d_op; struct super_block * d_sb; /* The root of the dentry tree */ @@ -1070,7 +1051,7 @@ unsigned long d_vfs_flags; void * d_fsdata; /* fs-specific data */ void * d_extra_attributes; /* TUX-specific data */ -@@ -91,6 +120,8 @@ struct dentry_operations { +@@ -91,6 +113,8 @@ struct dentry_operations { int (*d_delete)(struct dentry *); void (*d_release)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); @@ -1079,8 +1060,16 @@ }; /* the dentry parameter passed to d_hash and d_compare is the parent ---- linux-2.4.18-17.8.0/include/linux/fs.h~vfs_intent 2002-12-06 14:52:31.000000000 -0800 -+++ linux-2.4.18-17.8.0-zab/include/linux/fs.h 2002-12-06 14:52:31.000000000 -0800 +@@ -124,6 +148,7 @@ d_iput: no no yes + * s_nfsd_free_path semaphore will be down + */ + #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. */ ++#define DCACHE_LUSTRE_INVALID 0x0010 /* Lustre invalidated */ + + extern spinlock_t dcache_lock; + +--- linux-2.4.18-18.8.0-l7/include/linux/fs.h~vfs_intent-2.4.18-18 Mon Jan 20 12:25:10 2003 ++++ linux-2.4.18-18.8.0-l7-root/include/linux/fs.h Wed Jan 22 22:46:13 2003 @@ -576,6 +576,7 @@ struct file { /* needed for tty driver, and maybe others */ @@ -1100,24 +1089,36 @@ /* * File types -@@ -897,6 +900,7 @@ struct file_operations { +@@ -897,16 +900,28 @@ struct file_operations { struct inode_operations { int (*create) (struct inode *,struct dentry *,int); struct dentry * (*lookup) (struct inode *,struct dentry *); + struct dentry * (*lookup2) (struct inode *,struct dentry *, struct lookup_intent *); int (*link) (struct dentry *,struct inode *,struct dentry *); ++ int (*link2) (struct inode *,struct inode *, const char *, int); int (*unlink) (struct inode *,struct dentry *); ++ int (*unlink2) (struct inode *, const char *, int); int (*symlink) (struct inode *,struct dentry *,const char *); -@@ -907,6 +911,8 @@ struct inode_operations { ++ int 
(*symlink2) (struct inode *, const char *, int, const char *); + int (*mkdir) (struct inode *,struct dentry *,int); ++ int (*mkdir2) (struct inode *, const char *, int,int); + int (*rmdir) (struct inode *,struct dentry *); ++ int (*rmdir2) (struct inode *, const char *, int); + int (*mknod) (struct inode *,struct dentry *,int,int); ++ int (*mknod2) (struct inode *, const char *, int,int,int); + int (*rename) (struct inode *, struct dentry *, struct inode *, struct dentry *); ++ int (*rename2) (struct inode *, struct inode *, ++ const char *oldname, int oldlen, ++ const char *newname, int newlen); int (*readlink) (struct dentry *, char *,int); int (*follow_link) (struct dentry *, struct nameidata *); + int (*follow_link2) (struct dentry *, struct nameidata *, -+ struct lookup_intent *it); ++ struct lookup_intent *it); void (*truncate) (struct inode *); int (*permission) (struct inode *, int); int (*revalidate) (struct dentry *); -@@ -1381,6 +1387,7 @@ typedef int (*read_actor_t)(read_descrip +@@ -1381,6 +1396,7 @@ typedef int (*read_actor_t)(read_descrip extern loff_t default_llseek(struct file *file, loff_t offset, int origin); extern int FASTCALL(__user_walk(const char *, unsigned, struct nameidata *)); @@ -1125,7 +1126,7 @@ extern int FASTCALL(path_init(const char *, unsigned, struct nameidata *)); extern int FASTCALL(path_walk(const char *, struct nameidata *)); extern int FASTCALL(path_lookup(const char *, unsigned, struct nameidata *)); -@@ -1392,6 +1399,8 @@ extern struct dentry * lookup_one_len(co +@@ -1392,6 +1408,8 @@ extern struct dentry * lookup_one_len(co extern struct dentry * lookup_hash(struct qstr *, struct dentry *); #define user_path_walk(name,nd) __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd) #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd) @@ -1134,17 +1135,17 @@ extern void inode_init_once(struct inode *); extern void iput(struct inode *); -@@ -1492,6 +1501,8 @@ extern struct file_operations generic_ro +@@ 
-1492,6 +1510,8 @@ extern struct file_operations generic_ro extern int vfs_readlink(struct dentry *, char *, int, const char *); extern int vfs_follow_link(struct nameidata *, const char *); +extern int vfs_follow_link_it(struct nameidata *, const char *, -+ struct lookup_intent *it); ++ struct lookup_intent *it); extern int page_readlink(struct dentry *, char *, int); extern int page_follow_link(struct dentry *, struct nameidata *); extern struct inode_operations page_symlink_inode_operations; ---- linux-2.4.18-17.8.0/kernel/ksyms.c~vfs_intent 2002-12-06 14:52:31.000000000 -0800 -+++ linux-2.4.18-17.8.0-zab/kernel/ksyms.c 2002-12-06 14:52:31.000000000 -0800 +--- linux-2.4.18-18.8.0-l7/kernel/ksyms.c~vfs_intent-2.4.18-18 Mon Jan 20 12:25:10 2003 ++++ linux-2.4.18-18.8.0-l7-root/kernel/ksyms.c Mon Jan 20 12:25:10 2003 @@ -293,6 +293,7 @@ EXPORT_SYMBOL(read_cache_page); EXPORT_SYMBOL(set_page_dirty); EXPORT_SYMBOL(vfs_readlink); diff --git a/lustre/kernel_patches/patches/vfs_intent_hp.patch b/lustre/kernel_patches/patches/vfs_intent_hp.patch new file mode 100644 index 0000000..63f09b3 --- /dev/null +++ b/lustre/kernel_patches/patches/vfs_intent_hp.patch @@ -0,0 +1,1267 @@ + fs/dcache.c | 3 + fs/namei.c | 306 ++++++++++++++++++++++++++++++++++++++++--------- + fs/nfsd/vfs.c | 2 + fs/open.c | 63 +++++++--- + fs/stat.c | 29 +++- + include/linux/dcache.h | 31 ++++ + include/linux/fs.h | 22 +++ + kernel/ksyms.c | 1 + 8 files changed, 384 insertions(+), 73 deletions(-) + +--- linux-2.4.19-hp2_pnnl2/fs/dcache.c~vfs_intent_hp Sun Jan 19 19:04:47 2003 ++++ linux-2.4.19-hp2_pnnl2-root/fs/dcache.c Sun Jan 19 19:04:47 2003 +@@ -186,6 +188,13 @@ int d_invalidate(struct dentry * dentry) + spin_unlock(&dcache_lock); + return 0; + } ++ ++ /* network invalidation by Lustre */ ++ if (dentry->d_flags & DCACHE_LUSTRE_INVALID) { ++ spin_unlock(&dcache_lock); ++ return 0; ++ } ++ + /* + * Check whether to do a partial shrink_dcache + * to get rid of unused child entries. 
+@@ -616,6 +618,7 @@ struct dentry * d_alloc(struct dentry * + dentry->d_op = NULL; + dentry->d_fsdata = NULL; + dentry->d_mounted = 0; ++ dentry->d_it = NULL; + INIT_LIST_HEAD(&dentry->d_hash); + INIT_LIST_HEAD(&dentry->d_lru); + INIT_LIST_HEAD(&dentry->d_subdirs); +--- linux-2.4.19-hp2_pnnl2/fs/namei.c~vfs_intent_hp Sun Jan 19 19:04:47 2003 ++++ linux-2.4.19-hp2_pnnl2-root/fs/namei.c Sun Jan 19 19:35:55 2003 +@@ -94,6 +97,13 @@ + * XEmacs seems to be relying on it... + */ + ++void intent_release(struct dentry *de, struct lookup_intent *it) ++{ ++ if (it && de->d_op && de->d_op->d_intent_release) ++ de->d_op->d_intent_release(de, it); ++ ++} ++ + /* In order to reduce some races, while at the same time doing additional + * checking and hopefully speeding things up, we copy filenames to the + * kernel data space before using them.. +@@ -260,10 +271,19 @@ void path_release(struct nameidata *nd) + * Internal lookup() using the new generic dcache. + * SMP-safe + */ +-static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, int flags) ++static struct dentry *cached_lookup(struct dentry *parent, struct qstr *name, ++ int flags, struct lookup_intent *it) + { + struct dentry * dentry = d_lookup(parent, name); + ++ if (dentry && dentry->d_op && dentry->d_op->d_revalidate2) { ++ if (!dentry->d_op->d_revalidate2(dentry, flags, it) && ++ !d_invalidate(dentry)) { ++ dput(dentry); ++ dentry = NULL; ++ } ++ return dentry; ++ } else + if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { + if (!dentry->d_op->d_revalidate(dentry, flags) && !d_invalidate(dentry)) { + dput(dentry); +@@ -281,11 +301,14 @@ static struct dentry * cached_lookup(str + * make sure that nobody added the entry to the dcache in the meantime.. 
+ * SMP-safe + */ +-static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags) ++static struct dentry *real_lookup(struct dentry *parent, struct qstr *name, ++ int flags, struct lookup_intent *it) + { + struct dentry * result; + struct inode *dir = parent->d_inode; + ++again: ++ + down(&dir->i_sem); + /* + * First re-do the cached lookup just in case it was created +@@ -300,6 +321,9 @@ static struct dentry * real_lookup(struc + result = ERR_PTR(-ENOMEM); + if (dentry) { + lock_kernel(); ++ if (dir->i_op->lookup2) ++ result = dir->i_op->lookup2(dir, dentry, it); ++ else + result = dir->i_op->lookup(dir, dentry); + unlock_kernel(); + if (result) +@@ -321,6 +345,12 @@ static struct dentry * real_lookup(struc + dput(result); + result = ERR_PTR(-ENOENT); + } ++ } else if (result->d_op && result->d_op->d_revalidate2) { ++ if (!result->d_op->d_revalidate2(result, flags, it) && ++ !d_invalidate(result)) { ++ dput(result); ++ goto again; ++ } + } + return result; + } +@@ -332,7 +362,8 @@ static struct dentry * real_lookup(struc + * Without that kind of total limit, nasty chains of consecutive + * symlinks can cause almost arbitrarily long lookups. 
+ */ +-static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd) ++static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd, ++ struct lookup_intent *it) + { + int err; + if (current->link_count >= 5) +@@ -346,10 +377,14 @@ static inline int do_follow_link(struct + current->link_count++; + current->total_link_count++; + UPDATE_ATIME(dentry->d_inode); +- err = dentry->d_inode->i_op->follow_link(dentry, nd); ++ if (dentry->d_inode->i_op->follow_link2) ++ err = dentry->d_inode->i_op->follow_link2(dentry, nd, it); ++ else ++ err = dentry->d_inode->i_op->follow_link(dentry, nd); + current->link_count--; + return err; + loop: ++ intent_release(dentry, it); + path_release(nd); + return -ELOOP; + } +@@ -447,7 +482,8 @@ static inline void follow_dotdot(struct + * + * We expect 'base' to be positive and a directory. + */ +-int link_path_walk(const char * name, struct nameidata *nd) ++int link_path_walk_it(const char *name, struct nameidata *nd, ++ struct lookup_intent *it) + { + struct dentry *dentry; + struct inode *inode; +@@ -520,9 +556,9 @@ int link_path_walk(const char * name, st + break; + } + /* This does the actual lookups.. 
*/ +- dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE); ++ dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL); + if (!dentry) { +- dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE); ++ dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL); + err = PTR_ERR(dentry); + if (IS_ERR(dentry)) + break; +@@ -539,8 +575,8 @@ int link_path_walk(const char * name, st + if (!inode->i_op) + goto out_dput; + +- if (inode->i_op->follow_link) { +- err = do_follow_link(dentry, nd); ++ if (inode->i_op->follow_link || inode->i_op->follow_link2) { ++ err = do_follow_link(dentry, nd, NULL); + dput(dentry); + if (err) + goto return_err; +@@ -556,7 +592,7 @@ int link_path_walk(const char * name, st + nd->dentry = dentry; + } + err = -ENOTDIR; +- if (!inode->i_op->lookup) ++ if (!inode->i_op->lookup && !inode->i_op->lookup2) + break; + continue; + /* here ends the main loop */ +@@ -583,9 +619,9 @@ last_component: + if (err < 0) + break; + } +- dentry = cached_lookup(nd->dentry, &this, 0); ++ dentry = cached_lookup(nd->dentry, &this, 0, it); + if (!dentry) { +- dentry = real_lookup(nd->dentry, &this, 0); ++ dentry = real_lookup(nd->dentry, &this, 0, it); + err = PTR_ERR(dentry); + if (IS_ERR(dentry)) + break; +@@ -594,8 +630,9 @@ last_component: + ; + inode = dentry->d_inode; + if ((lookup_flags & LOOKUP_FOLLOW) +- && inode && inode->i_op && inode->i_op->follow_link) { +- err = do_follow_link(dentry, nd); ++ && inode && inode->i_op && ++ (inode->i_op->follow_link || inode->i_op->follow_link2)) { ++ err = do_follow_link(dentry, nd, it); + dput(dentry); + if (err) + goto return_err; +@@ -609,7 +647,8 @@ last_component: + goto no_inode; + if (lookup_flags & LOOKUP_DIRECTORY) { + err = -ENOTDIR; +- if (!inode->i_op || !inode->i_op->lookup) ++ if (!inode->i_op || ++ (!inode->i_op->lookup && !inode->i_op->lookup2)) + break; + } + goto return_base; +@@ -646,15 +685,28 @@ out_dput: + dput(dentry); + break; + } ++ if (err) ++ intent_release(nd->dentry, it); + 
path_release(nd); + return_err: + return err; + } + ++int link_path_walk(const char * name, struct nameidata *nd) ++{ ++ return link_path_walk_it(name, nd, NULL); ++} ++ ++int path_walk_it(const char * name, struct nameidata *nd, struct lookup_intent *it) ++{ ++ current->total_link_count = 0; ++ return link_path_walk_it(name, nd, it); ++} ++ + int path_walk(const char * name, struct nameidata *nd) + { + current->total_link_count = 0; +- return link_path_walk(name, nd); ++ return link_path_walk_it(name, nd, NULL); + } + + /* SMP-safe */ +@@ -757,7 +809,8 @@ int path_init(const char *name, unsigned + * needs parent already locked. Doesn't follow mounts. + * SMP-safe. + */ +-struct dentry * lookup_hash(struct qstr *name, struct dentry * base) ++struct dentry * lookup_hash_it(struct qstr *name, struct dentry * base, ++ struct lookup_intent *it) + { + struct dentry * dentry; + struct inode *inode; +@@ -780,13 +833,16 @@ struct dentry * lookup_hash(struct qstr + goto out; + } + +- dentry = cached_lookup(base, name, 0); ++ dentry = cached_lookup(base, name, 0, it); + if (!dentry) { + struct dentry *new = d_alloc(base, name); + dentry = ERR_PTR(-ENOMEM); + if (!new) + goto out; + lock_kernel(); ++ if (inode->i_op->lookup2) ++ dentry = inode->i_op->lookup2(inode, new, it); ++ else + dentry = inode->i_op->lookup(inode, new); + unlock_kernel(); + if (!dentry) +@@ -798,6 +854,12 @@ out: + return dentry; + } + ++struct dentry * lookup_hash(struct qstr *name, struct dentry * base) ++{ ++ return lookup_hash_it(name, base, NULL); ++} ++ ++ + /* SMP-safe */ + struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) + { +@@ -819,7 +881,7 @@ struct dentry * lookup_one_len(const cha + } + this.hash = end_name_hash(hash); + +- return lookup_hash(&this, base); ++ return lookup_hash_it(&this, base, NULL); + access: + return ERR_PTR(-EACCES); + } +@@ -851,6 +913,23 @@ int __user_walk(const char *name, unsign + return err; + } + ++int __user_walk_it(const char 
*name, unsigned flags, struct nameidata *nd, ++ struct lookup_intent *it) ++{ ++ char *tmp; ++ int err; ++ ++ tmp = getname(name); ++ err = PTR_ERR(tmp); ++ if (!IS_ERR(tmp)) { ++ err = 0; ++ if (path_init(tmp, flags, nd)) ++ err = path_walk_it(tmp, nd, it); ++ putname(tmp); ++ } ++ return err; ++} ++ + /* + * It's inline, so penalty for filesystems that don't use sticky bit is + * minimal. +@@ -987,7 +1066,8 @@ exit_lock: + * for symlinks (where the permissions are checked later). + * SMP-safe + */ +-int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd) ++int open_namei_it(const char *pathname, int flag, int mode, ++ struct nameidata *nd, struct lookup_intent *it) + { + int acc_mode, error = 0; + struct inode *inode; +@@ -1002,7 +1082,7 @@ int open_namei(const char * pathname, in + */ + if (!(flag & O_CREAT)) { + if (path_init(pathname, lookup_flags(flag), nd)) +- error = path_walk(pathname, nd); ++ error = path_walk_it(pathname, nd, it); + if (error) + return error; + dentry = nd->dentry; +@@ -1012,6 +1092,10 @@ int open_namei(const char * pathname, in + /* + * Create - we need to know the parent. 
+ */ ++ if (it) { ++ it->it_mode = mode; ++ it->it_op |= IT_CREAT; ++ } + if (path_init(pathname, LOOKUP_PARENT, nd)) + error = path_walk(pathname, nd); + if (error) +@@ -1028,7 +1112,7 @@ int open_namei(const char * pathname, in + + dir = nd->dentry; + down(&dir->d_inode->i_sem); +- dentry = lookup_hash(&nd->last, nd->dentry); ++ dentry = lookup_hash_it(&nd->last, nd->dentry, it); + + do_last: + error = PTR_ERR(dentry); +@@ -1037,6 +1121,7 @@ do_last: + goto exit; + } + ++ if (it) it->it_mode = mode; /* it is NULL when entered via open_namei() */ + /* Negative dentry, just create the file */ + if (!dentry->d_inode) { + if (!IS_POSIXACL(dir->d_inode)) +@@ -1071,7 +1156,8 @@ do_last: + error = -ENOENT; + if (!dentry->d_inode) + goto exit_dput; +- if (dentry->d_inode->i_op && dentry->d_inode->i_op->follow_link) ++ if (dentry->d_inode->i_op && (dentry->d_inode->i_op->follow_link || ++ dentry->d_inode->i_op->follow_link2)) + goto do_link; + + dput(nd->dentry); +@@ -1157,8 +1243,10 @@ ok: + return 0; + + exit_dput: ++ intent_release(dentry, it); + dput(dentry); + exit: ++ intent_release(nd->dentry, it); + path_release(nd); + return error; + +@@ -1177,7 +1265,12 @@ do_link: + * are done. Procfs-like symlinks just set LAST_BIND.
+ */ + UPDATE_ATIME(dentry->d_inode); +- error = dentry->d_inode->i_op->follow_link(dentry, nd); ++ if (dentry->d_inode->i_op->follow_link2) ++ error = dentry->d_inode->i_op->follow_link2(dentry, nd, it); ++ else ++ error = dentry->d_inode->i_op->follow_link(dentry, nd); ++ if (error) ++ intent_release(dentry, it); + dput(dentry); + if (error) + return error; +@@ -1199,13 +1292,20 @@ do_link: + } + dir = nd->dentry; + down(&dir->d_inode->i_sem); +- dentry = lookup_hash(&nd->last, nd->dentry); ++ dentry = lookup_hash_it(&nd->last, nd->dentry, it); + putname(nd->last.name); + goto do_last; + } + ++int open_namei(const char *pathname, int flag, int mode, struct nameidata *nd) ++{ ++ return open_namei_it(pathname, flag, mode, nd, NULL); ++} ++ ++ + /* SMP-safe */ +-static struct dentry *lookup_create(struct nameidata *nd, int is_dir) ++static struct dentry *lookup_create(struct nameidata *nd, int is_dir, ++ struct lookup_intent *it) + { + struct dentry *dentry; + +@@ -1213,7 +1313,7 @@ static struct dentry *lookup_create(stru + dentry = ERR_PTR(-EEXIST); + if (nd->last_type != LAST_NORM) + goto fail; +- dentry = lookup_hash(&nd->last, nd->dentry); ++ dentry = lookup_hash_it(&nd->last, nd->dentry, it); + if (IS_ERR(dentry)) + goto fail; + if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode) +@@ -1270,7 +1371,19 @@ asmlinkage long sys_mknod(const char * f + error = path_walk(tmp, &nd); + if (error) + goto out; +- dentry = lookup_create(&nd, 0); ++ ++ if (nd.dentry->d_inode->i_op->mknod2) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->mknod2(nd.dentry->d_inode, ++ nd.last.name, ++ nd.last.len, ++ mode, dev); ++ /* the file system want to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } ++ ++ dentry = lookup_create(&nd, 0, NULL); + error = PTR_ERR(dentry); + + if (!IS_POSIXACL(nd.dentry->d_inode)) +@@ -1289,6 +1402,7 @@ asmlinkage long sys_mknod(const char * f + dput(dentry); + } + 
up(&nd.dentry->d_inode->i_sem); ++out2: + path_release(&nd); + out: + putname(tmp); +@@ -1340,15 +1456,25 @@ asmlinkage long sys_mkdir(const char * p + error = path_walk(tmp, &nd); + if (error) + goto out; +- dentry = lookup_create(&nd, 1); ++ if (nd.dentry->d_inode->i_op->mkdir2) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->mkdir2(nd.dentry->d_inode, ++ nd.last.name, ++ nd.last.len, ++ mode); ++ /* the file system want to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } ++ dentry = lookup_create(&nd, 1, NULL); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { +- if (!IS_POSIXACL(nd.dentry->d_inode)) +- mode &= ~current->fs->umask; +- error = vfs_mkdir(nd.dentry->d_inode, dentry, mode); ++ error = vfs_mkdir(nd.dentry->d_inode, dentry, /* as before: umask skipped on POSIX-ACL dirs */ ++ IS_POSIXACL(nd.dentry->d_inode) ? mode : (mode & ~current->fs->umask)); + dput(dentry); + } + up(&nd.dentry->d_inode->i_sem); ++out2: + path_release(&nd); + out: + putname(tmp); +@@ -1450,8 +1578,17 @@ asmlinkage long sys_rmdir(const char * p + error = -EBUSY; + goto exit1; + } ++ if (nd.dentry->d_inode->i_op->rmdir2) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->rmdir2(nd.dentry->d_inode, ++ nd.last.name, ++ nd.last.len); ++ /* the file system want to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit1; ++ } + down(&nd.dentry->d_inode->i_sem); +- dentry = lookup_hash(&nd.last, nd.dentry); ++ dentry = lookup_hash_it(&nd.last, nd.dentry, NULL); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + error = vfs_rmdir(nd.dentry->d_inode, dentry); +@@ -1510,8 +1649,17 @@ asmlinkage long sys_unlink(const char * + error = -EISDIR; + if (nd.last_type != LAST_NORM) + goto exit1; ++ if (nd.dentry->d_inode->i_op->unlink2) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->unlink2(nd.dentry->d_inode, ++ nd.last.name, ++ nd.last.len); ++ /* the file system want to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit1; ++ } +
down(&nd.dentry->d_inode->i_sem); +- dentry = lookup_hash(&nd.last, nd.dentry); ++ dentry = lookup_hash_it(&nd.last, nd.dentry, NULL); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + /* Why not before? Because we want correct error value */ +@@ -1579,15 +1729,26 @@ asmlinkage long sys_symlink(const char * + error = path_walk(to, &nd); + if (error) + goto out; +- dentry = lookup_create(&nd, 0); ++ if (nd.dentry->d_inode->i_op->symlink2) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->symlink2(nd.dentry->d_inode, ++ nd.last.name, ++ nd.last.len, ++ from); ++ /* the file system want to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } ++ dentry = lookup_create(&nd, 0, NULL); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + error = vfs_symlink(nd.dentry->d_inode, dentry, from); + dput(dentry); + } + up(&nd.dentry->d_inode->i_sem); ++ out2: + path_release(&nd); +-out: ++ out: + putname(to); + } + putname(from); +@@ -1660,7 +1824,7 @@ asmlinkage long sys_link(const char * ol + + error = 0; + if (path_init(from, LOOKUP_POSITIVE, &old_nd)) +- error = path_walk(from, &old_nd); ++ error = path_walk_it(from, &old_nd, NULL); + if (error) + goto exit; + if (path_init(to, LOOKUP_PARENT, &nd)) +@@ -1670,7 +1834,17 @@ asmlinkage long sys_link(const char * ol + error = -EXDEV; + if (old_nd.mnt != nd.mnt) + goto out_release; +- new_dentry = lookup_create(&nd, 0); ++ if (nd.dentry->d_inode->i_op->link2) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->link2(old_nd.dentry->d_inode, ++ nd.dentry->d_inode, ++ nd.last.name, ++ nd.last.len); ++ /* the file system want to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out_release; ++ } ++ new_dentry = lookup_create(&nd, 0, NULL); + error = PTR_ERR(new_dentry); + if (!IS_ERR(new_dentry)) { + error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry); +@@ -1716,7 +1892,8 @@ exit: + * locking]. 
+ */ + int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, +- struct inode *new_dir, struct dentry *new_dentry) ++ struct inode *new_dir, struct dentry *new_dentry, ++ struct lookup_intent *it) + { + int error; + struct inode *target; +@@ -1753,6 +1923,7 @@ int vfs_rename_dir(struct inode *old_dir + error = -EBUSY; + else + error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); ++ intent_release(new_dentry, it); + if (target) { + if (!error) + target->i_flags |= S_DEAD; +@@ -1795,7 +1973,8 @@ out_unlock: + } + + int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, +- struct inode *new_dir, struct dentry *new_dentry) ++ struct inode *new_dir, struct dentry *new_dentry, ++ struct lookup_intent *it) + { + int error; + +@@ -1826,6 +2005,7 @@ int vfs_rename_other(struct inode *old_d + error = -EBUSY; + else + error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); ++ intent_release(new_dentry, it); + double_up(&old_dir->i_zombie, &new_dir->i_zombie); + if (error) + return error; +@@ -1837,13 +2017,14 @@ int vfs_rename_other(struct inode *old_d + } + + int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, +- struct inode *new_dir, struct dentry *new_dentry) ++ struct inode *new_dir, struct dentry *new_dentry, ++ struct lookup_intent *it) + { + int error; + if (S_ISDIR(old_dentry->d_inode->i_mode)) +- error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry); ++ error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry,it); + else +- error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry); ++ error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry,it); + if (!error) { + if (old_dir == new_dir) + inode_dir_notify(old_dir, DN_RENAME); +@@ -1886,9 +2068,23 @@ static inline int do_rename(const char * + if (newnd.last_type != LAST_NORM) + goto exit2; + ++ if (old_dir->d_inode->i_op->rename2) { ++ lock_kernel(); ++ error = old_dir->d_inode->i_op->rename2(old_dir->d_inode, ++ 
new_dir->d_inode, ++ oldnd.last.name, ++ oldnd.last.len, ++ newnd.last.name, ++ newnd.last.len); ++ unlock_kernel(); ++ /* the file system want to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit2; ++ } ++ + double_lock(new_dir, old_dir); + +- old_dentry = lookup_hash(&oldnd.last, old_dir); ++ old_dentry = lookup_hash_it(&oldnd.last, old_dir, NULL); + error = PTR_ERR(old_dentry); + if (IS_ERR(old_dentry)) + goto exit3; +@@ -1904,14 +2100,14 @@ static inline int do_rename(const char * + if (newnd.last.name[newnd.last.len]) + goto exit4; + } +- new_dentry = lookup_hash(&newnd.last, new_dir); ++ new_dentry = lookup_hash_it(&newnd.last, new_dir, NULL); + error = PTR_ERR(new_dentry); + if (IS_ERR(new_dentry)) + goto exit4; + + lock_kernel(); + error = vfs_rename(old_dir->d_inode, old_dentry, +- new_dir->d_inode, new_dentry); ++ new_dir->d_inode, new_dentry, NULL); + unlock_kernel(); + + dput(new_dentry); +@@ -1964,7 +2163,8 @@ out: + } + + static inline int +-__vfs_follow_link(struct nameidata *nd, const char *link) ++__vfs_follow_link(struct nameidata *nd, const char *link, ++ struct lookup_intent *it) + { + int res = 0; + char *name; +@@ -1977,7 +2177,7 @@ __vfs_follow_link(struct nameidata *nd, + /* weird __emul_prefix() stuff did it */ + goto out; + } +- res = link_path_walk(link, nd); ++ res = link_path_walk_it(link, nd, it); + out: + if (current->link_count || res || nd->last_type!=LAST_NORM) + return res; +@@ -1999,7 +2199,13 @@ fail: + + int vfs_follow_link(struct nameidata *nd, const char *link) + { +- return __vfs_follow_link(nd, link); ++ return __vfs_follow_link(nd, link, NULL); ++} ++ ++int vfs_follow_link_it(struct nameidata *nd, const char *link, ++ struct lookup_intent *it) ++{ ++ return __vfs_follow_link(nd, link, it); + } + + /* get the link contents into pagecache */ +@@ -2041,7 +2247,7 @@ int page_follow_link(struct dentry *dent + { + struct page *page = NULL; + char *s = page_getlink(dentry, &page); +- int res = 
__vfs_follow_link(nd, s); ++ int res = __vfs_follow_link(nd, s, NULL); + if (page) { + kunmap(page); + page_cache_release(page); +--- linux-2.4.19-hp2_pnnl2/fs/nfsd/vfs.c~vfs_intent_hp Sun Jan 19 19:04:47 2003 ++++ linux-2.4.19-hp2_pnnl2-root/fs/nfsd/vfs.c Sun Jan 19 19:37:57 2003 +@@ -1295,7 +1295,7 @@ nfsd_rename(struct svc_rqst *rqstp, stru + err = nfserr_perm; + } else + #endif +- err = vfs_rename(fdir, odentry, tdir, ndentry); ++ err = vfs_rename(fdir, odentry, tdir, ndentry, NULL); + if (!err && EX_ISSYNC(tfhp->fh_export)) { + nfsd_sync_dir(tdentry); + nfsd_sync_dir(fdentry); +--- linux-2.4.19-hp2_pnnl2/fs/open.c~vfs_intent_hp Sun Jan 19 19:04:47 2003 ++++ linux-2.4.19-hp2_pnnl2-root/fs/open.c Sun Jan 19 19:41:00 2003 +@@ -19,6 +19,9 @@ + #include + + #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) ++extern int path_walk_it(const char *name, struct nameidata *nd, ++ struct lookup_intent *it); ++extern void intent_release(struct dentry *de, struct lookup_intent *it); + + int vfs_statfs(struct super_block *sb, struct statfs *buf) + { +@@ -118,12 +121,13 @@ static inline long do_sys_truncate(const + struct nameidata nd; + struct inode * inode; + int error; ++ struct lookup_intent it = { .it_op = IT_TRUNC }; + + error = -EINVAL; + if (length < 0) /* sorry, but loff_t says... 
*/ + goto out; + +- error = user_path_walk(path, &nd); ++ error = user_path_walk_it(path, &nd, &it); + if (error) + goto out; + inode = nd.dentry->d_inode; +@@ -168,6 +172,7 @@ static inline long do_sys_truncate(const + put_write_access(inode); + + dput_and_out: ++ intent_release(nd.dentry, &it); + path_release(&nd); + out: + return error; +@@ -259,8 +264,9 @@ asmlinkage long sys_utime(char * filenam + struct nameidata nd; + struct inode * inode; + struct iattr newattrs; ++ struct lookup_intent it = { .it_op = IT_SETATTR }; + +- error = user_path_walk(filename, &nd); ++ error = user_path_walk_it(filename, &nd, &it); + if (error) + goto out; + inode = nd.dentry->d_inode; +@@ -286,6 +292,7 @@ asmlinkage long sys_utime(char * filenam + } + error = notify_change(nd.dentry, &newattrs); + dput_and_out: ++ intent_release(nd.dentry, &it); + path_release(&nd); + out: + return error; +@@ -303,8 +310,9 @@ asmlinkage long sys_utimes(char * filena + struct nameidata nd; + struct inode * inode; + struct iattr newattrs; ++ struct lookup_intent it = { .it_op = IT_SETATTR }; + +- error = user_path_walk(filename, &nd); ++ error = user_path_walk_it(filename, &nd, &it); + + if (error) + goto out; +@@ -331,6 +339,7 @@ asmlinkage long sys_utimes(char * filena + } + error = notify_change(nd.dentry, &newattrs); + dput_and_out: ++ intent_release(nd.dentry, &it); + path_release(&nd); + out: + return error; +@@ -347,6 +356,7 @@ asmlinkage long sys_access(const char * + int old_fsuid, old_fsgid; + kernel_cap_t old_cap; + int res; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + + if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? 
*/ + return -EINVAL; +@@ -364,13 +374,14 @@ asmlinkage long sys_access(const char * + else + current->cap_effective = current->cap_permitted; + +- res = user_path_walk(filename, &nd); ++ res = user_path_walk_it(filename, &nd, &it); + if (!res) { + res = permission(nd.dentry->d_inode, mode); + /* SuS v2 requires we report a read only fs too */ + if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode) + && !special_file(nd.dentry->d_inode->i_mode)) + res = -EROFS; ++ intent_release(nd.dentry, &it); + path_release(&nd); + } + +@@ -386,6 +397,7 @@ asmlinkage long sys_chdir(const char * f + int error; + struct nameidata nd; + char *name; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + + name = getname(filename); + error = PTR_ERR(name); +@@ -394,7 +406,7 @@ asmlinkage long sys_chdir(const char * f + + error = 0; + if (path_init(name,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd)) +- error = path_walk(name, &nd); ++ error = path_walk_it(name, &nd, &it); + putname(name); + if (error) + goto out; +@@ -406,6 +418,7 @@ asmlinkage long sys_chdir(const char * f + set_fs_pwd(current->fs, nd.mnt, nd.dentry); + + dput_and_out: ++ intent_release(nd.dentry, &it); + path_release(&nd); + out: + return error; +@@ -446,6 +459,7 @@ asmlinkage long sys_chroot(const char * + int error; + struct nameidata nd; + char *name; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + + name = getname(filename); + error = PTR_ERR(name); +@@ -454,7 +468,7 @@ asmlinkage long sys_chroot(const char * + + path_init(name, LOOKUP_POSITIVE | LOOKUP_FOLLOW | + LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); +- error = path_walk(name, &nd); ++ error = path_walk_it(name, &nd, &it); + putname(name); + if (error) + goto out; +@@ -471,6 +485,7 @@ asmlinkage long sys_chroot(const char * + set_fs_altroot(); + error = 0; + dput_and_out: ++ intent_release(nd.dentry, &it); + path_release(&nd); + out: + return error; +@@ -515,8 +530,9 @@ asmlinkage long sys_chmod(const char * f + struct inode * inode; + int 
error; + struct iattr newattrs; ++ struct lookup_intent it = { .it_op = IT_SETATTR }; + +- error = user_path_walk(filename, &nd); ++ error = user_path_walk_it(filename, &nd, &it); + if (error) + goto out; + inode = nd.dentry->d_inode; +@@ -536,6 +552,7 @@ asmlinkage long sys_chmod(const char * f + error = notify_change(nd.dentry, &newattrs); + + dput_and_out: ++ intent_release(nd.dentry, &it); + path_release(&nd); + out: + return error; +@@ -605,10 +622,12 @@ asmlinkage long sys_chown(const char * f + { + struct nameidata nd; + int error; ++ struct lookup_intent it = { .it_op = IT_SETATTR }; + +- error = user_path_walk(filename, &nd); ++ error = user_path_walk_it(filename, &nd, &it); + if (!error) { + error = chown_common(nd.dentry, user, group); ++ intent_release(nd.dentry, &it); + path_release(&nd); + } + return error; +@@ -618,10 +637,12 @@ asmlinkage long sys_lchown(const char * + { + struct nameidata nd; + int error; ++ struct lookup_intent it = { .it_op = IT_SETATTR }; + +- error = user_path_walk_link(filename, &nd); ++ error = user_path_walk_link_it(filename, &nd, &it); + if (!error) { + error = chown_common(nd.dentry, user, group); ++ intent_release(nd.dentry, &it); + path_release(&nd); + } + return error; +@@ -655,10 +676,16 @@ asmlinkage long sys_fchown(unsigned int + * for the internal routines (ie open_namei()/follow_link() etc). 00 is + * used by symlinks. 
+ */ ++extern int open_namei_it(const char *filename, int namei_flags, int mode, ++ struct nameidata *nd, struct lookup_intent *it); ++struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, ++ int flags, struct lookup_intent *it); ++ + struct file *filp_open(const char * filename, int flags, int mode) + { + int namei_flags, error; + struct nameidata nd; ++ struct lookup_intent it = { .it_op = IT_OPEN, .it_flags = flags }; + + namei_flags = flags; + if ((namei_flags+1) & O_ACCMODE) +@@ -666,14 +693,15 @@ struct file *filp_open(const char * file + if (namei_flags & O_TRUNC) + namei_flags |= 2; + +- error = open_namei(filename, namei_flags, mode, &nd); +- if (!error) +- return dentry_open(nd.dentry, nd.mnt, flags); ++ error = open_namei_it(filename, namei_flags, mode, &nd, &it); ++ if (error) ++ return ERR_PTR(error); + +- return ERR_PTR(error); ++ return dentry_open_it(nd.dentry, nd.mnt, flags, &it); + } + +-struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) ++struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, ++ int flags, struct lookup_intent *it) + { + struct file * f; + struct inode *inode; +@@ -716,6 +744,7 @@ struct file *dentry_open(struct dentry * + } + f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); + ++ intent_release(dentry, it); + return f; + + cleanup_all: +@@ -730,11 +759,17 @@ cleanup_all: + cleanup_file: + put_filp(f); + cleanup_dentry: ++ intent_release(dentry, it); + dput(dentry); + mntput(mnt); + return ERR_PTR(error); + } + ++struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) ++{ ++ return dentry_open_it(dentry, mnt, flags, NULL); ++} ++ + /* + * Find an empty file descriptor entry, and mark it busy. 
+ */ +--- linux-2.4.19-hp2_pnnl2/fs/stat.c~vfs_intent_hp Sun Jan 19 19:04:47 2003 ++++ linux-2.4.19-hp2_pnnl2-root/fs/stat.c Sun Jan 19 19:44:51 2003 +@@ -13,6 +13,7 @@ + + #include + ++extern void intent_release(struct dentry *de, struct lookup_intent *it); + /* + * Revalidate the inode. This is required for proper NFS attribute caching. + */ +@@ -135,13 +136,15 @@ static int cp_new_stat(struct inode * in + asmlinkage long sys_stat(char * filename, struct __old_kernel_stat * statbuf) + { + struct nameidata nd; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + int error; + +- error = user_path_walk(filename, &nd); ++ error = user_path_walk_it(filename, &nd, &it); + if (!error) { + error = do_revalidate(nd.dentry); + if (!error) + error = cp_old_stat(nd.dentry->d_inode, statbuf); ++ intent_release(nd.dentry, &it); + path_release(&nd); + } + return error; +@@ -151,13 +154,15 @@ asmlinkage long sys_stat(char * filename + asmlinkage long sys_newstat(char * filename, struct stat * statbuf) + { + struct nameidata nd; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + int error; + +- error = user_path_walk(filename, &nd); ++ error = user_path_walk_it(filename, &nd, &it); + if (!error) { + error = do_revalidate(nd.dentry); + if (!error) + error = cp_new_stat(nd.dentry->d_inode, statbuf); ++ intent_release(nd.dentry, &it); + path_release(&nd); + } + return error; +@@ -172,13 +177,15 @@ asmlinkage long sys_newstat(char * filen + asmlinkage long sys_lstat(char * filename, struct __old_kernel_stat * statbuf) + { + struct nameidata nd; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + int error; + +- error = user_path_walk_link(filename, &nd); ++ error = user_path_walk_link_it(filename, &nd, &it); + if (!error) { + error = do_revalidate(nd.dentry); + if (!error) + error = cp_old_stat(nd.dentry->d_inode, statbuf); ++ intent_release(nd.dentry, &it); + path_release(&nd); + } + return error; +@@ -189,13 +196,15 @@ asmlinkage long sys_lstat(char * filenam + 
asmlinkage long sys_newlstat(char * filename, struct stat * statbuf) + { + struct nameidata nd; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + int error; + +- error = user_path_walk_link(filename, &nd); ++ error = user_path_walk_link_it(filename, &nd, &it); + if (!error) { + error = do_revalidate(nd.dentry); + if (!error) + error = cp_new_stat(nd.dentry->d_inode, statbuf); ++ intent_release(nd.dentry, &it); + path_release(&nd); + } + return error; +@@ -247,11 +256,12 @@ asmlinkage long sys_readlink(const char + { + struct nameidata nd; + int error; ++ struct lookup_intent it = { .it_op = IT_READLINK }; + + if (bufsiz <= 0) + return -EINVAL; + +- error = user_path_walk_link(path, &nd); ++ error = user_path_walk_link_it(path, &nd, &it); + if (!error) { + struct inode * inode = nd.dentry->d_inode; + +@@ -261,6 +271,7 @@ asmlinkage long sys_readlink(const char + UPDATE_ATIME(inode); + error = inode->i_op->readlink(nd.dentry, buf, bufsiz); + } ++ intent_release(nd.dentry, &it); + path_release(&nd); + } + return error; +@@ -333,12 +344,14 @@ asmlinkage long sys_stat64(char * filena + { + struct nameidata nd; + int error; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + +- error = user_path_walk(filename, &nd); ++ error = user_path_walk_it(filename, &nd, &it); + if (!error) { + error = do_revalidate(nd.dentry); + if (!error) + error = cp_new_stat64(nd.dentry->d_inode, statbuf); ++ intent_release(nd.dentry, &it); + path_release(&nd); + } + return error; +@@ -348,12 +361,14 @@ asmlinkage long sys_lstat64(char * filen + { + struct nameidata nd; + int error; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; + +- error = user_path_walk_link(filename, &nd); ++ error = user_path_walk_link_it(filename, &nd, &it); + if (!error) { + error = do_revalidate(nd.dentry); + if (!error) + error = cp_new_stat64(nd.dentry->d_inode, statbuf); ++ intent_release(nd.dentry, &it); + path_release(&nd); + } + return error; +--- 
linux-2.4.19-hp2_pnnl2/include/linux/dcache.h~vfs_intent_hp Sun Jan 19 19:04:47 2003 ++++ linux-2.4.19-hp2_pnnl2-root/include/linux/dcache.h Sun Jan 19 19:04:48 2003 +@@ -6,6 +6,27 @@ + #include + #include + ++#define IT_OPEN (1) ++#define IT_CREAT (1<<1) ++#define IT_READDIR (1<<2) ++#define IT_GETATTR (1<<3) ++#define IT_SETATTR (1<<4) ++#define IT_TRUNC (1<<5) ++#define IT_READLINK (1<<6) ++#define IT_LOOKUP (1<<7) ++ ++struct lookup_intent { ++ int it_op; ++ int it_mode; ++ int it_flags; ++ int it_disposition; ++ int it_status; ++ struct iattr *it_iattr; ++ __u64 it_lock_handle[2]; ++ int it_lock_mode; ++ void *it_data; ++}; ++ + /* + * linux/include/linux/dcache.h + * +@@ -78,6 +106,7 @@ struct dentry { + unsigned long d_time; /* used by d_revalidate */ + struct dentry_operations *d_op; + struct super_block * d_sb; /* The root of the dentry tree */ ++ struct lookup_intent *d_it; + unsigned long d_vfs_flags; + void * d_fsdata; /* fs-specific data */ + unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */ +@@ -90,6 +119,8 @@ struct dentry_operations { + int (*d_delete)(struct dentry *); + void (*d_release)(struct dentry *); + void (*d_iput)(struct dentry *, struct inode *); ++ int (*d_revalidate2)(struct dentry *, int, struct lookup_intent *); ++ void (*d_intent_release)(struct dentry *, struct lookup_intent *); + }; + + /* the dentry parameter passed to d_hash and d_compare is the parent +@@ -124,6 +148,7 @@ d_iput: no no yes + * s_nfsd_free_path semaphore will be down + */ + #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. 
*/ ++#define DCACHE_LUSTRE_INVALID 0x0010 /* Lustre invalidated */ + + extern spinlock_t dcache_lock; + +--- linux-2.4.19-hp2_pnnl2/include/linux/fs.h~vfs_intent_hp Sun Jan 19 19:04:47 2003 ++++ linux-2.4.19-hp2_pnnl2-root/include/linux/fs.h Sun Jan 19 19:04:48 2003 +@@ -575,6 +575,7 @@ struct file { + + /* needed for tty driver, and maybe others */ + void *private_data; ++ struct lookup_intent *f_intent; + + /* preallocated helper kiobuf to speedup O_DIRECT */ + struct kiobuf *f_iobuf; +@@ -815,7 +816,9 @@ extern int vfs_symlink(struct inode *, s + extern int vfs_link(struct dentry *, struct inode *, struct dentry *); + extern int vfs_rmdir(struct inode *, struct dentry *); + extern int vfs_unlink(struct inode *, struct dentry *); +-extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); ++int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, ++ struct inode *new_dir, struct dentry *new_dentry, ++ struct lookup_intent *it); + + /* + * File types +@@ -876,16 +879,28 @@ struct file_operations { + struct inode_operations { + int (*create) (struct inode *,struct dentry *,int); + struct dentry * (*lookup) (struct inode *,struct dentry *); ++ struct dentry * (*lookup2) (struct inode *,struct dentry *, struct lookup_intent *); + int (*link) (struct dentry *,struct inode *,struct dentry *); ++ int (*link2) (struct inode *,struct inode *, const char *, int); + int (*unlink) (struct inode *,struct dentry *); ++ int (*unlink2) (struct inode *, const char *, int); + int (*symlink) (struct inode *,struct dentry *,const char *); ++ int (*symlink2) (struct inode *, const char *, int, const char *); + int (*mkdir) (struct inode *,struct dentry *,int); ++ int (*mkdir2) (struct inode *, const char *, int,int); + int (*rmdir) (struct inode *,struct dentry *); ++ int (*rmdir2) (struct inode *, const char *, int); + int (*mknod) (struct inode *,struct dentry *,int,int); ++ int (*mknod2) (struct inode *, const char *, int,int,int); + int 
(*rename) (struct inode *, struct dentry *, + struct inode *, struct dentry *); ++ int (*rename2) (struct inode *, struct inode *, ++ const char *oldname, int oldlen, ++ const char *newname, int newlen); + int (*readlink) (struct dentry *, char *,int); + int (*follow_link) (struct dentry *, struct nameidata *); ++ int (*follow_link2) (struct dentry *, struct nameidata *, ++ struct lookup_intent *it); + void (*truncate) (struct inode *); + int (*permission) (struct inode *, int); + int (*revalidate) (struct dentry *); +@@ -1354,6 +1369,7 @@ typedef int (*read_actor_t)(read_descrip + extern loff_t default_llseek(struct file *file, loff_t offset, int origin); + + extern int FASTCALL(__user_walk(const char *, unsigned, struct nameidata *)); ++extern int FASTCALL(__user_walk_it(const char *, unsigned, struct nameidata *, struct lookup_intent *it)); + extern int FASTCALL(path_init(const char *, unsigned, struct nameidata *)); + extern int FASTCALL(path_walk(const char *, struct nameidata *)); + extern int FASTCALL(link_path_walk(const char *, struct nameidata *)); +@@ -1364,6 +1380,8 @@ extern struct dentry * lookup_one_len(co + extern struct dentry * lookup_hash(struct qstr *, struct dentry *); + #define user_path_walk(name,nd) __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd) + #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd) ++#define user_path_walk_it(name,nd,it) __user_walk_it(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd, it) ++#define user_path_walk_link_it(name,nd,it) __user_walk_it(name, LOOKUP_POSITIVE, nd, it) + + extern void inode_init_once(struct inode *); + extern void iput(struct inode *); +@@ -1499,6 +1517,8 @@ extern struct file_operations generic_ro + + extern int vfs_readlink(struct dentry *, char *, int, const char *); + extern int vfs_follow_link(struct nameidata *, const char *); ++extern int vfs_follow_link_it(struct nameidata *, const char *, ++ struct lookup_intent *it); + extern int page_readlink(struct dentry *, char *, 
int); + extern int page_follow_link(struct dentry *, struct nameidata *); + extern struct inode_operations page_symlink_inode_operations; +--- linux-2.4.19-hp2_pnnl2/kernel/ksyms.c~vfs_intent_hp Sun Jan 19 19:04:47 2003 ++++ linux-2.4.19-hp2_pnnl2-root/kernel/ksyms.c Sun Jan 19 19:04:48 2003 +@@ -293,6 +293,7 @@ EXPORT_SYMBOL(read_cache_page); + EXPORT_SYMBOL(set_page_dirty); + EXPORT_SYMBOL(vfs_readlink); + EXPORT_SYMBOL(vfs_follow_link); ++EXPORT_SYMBOL(vfs_follow_link_it); + EXPORT_SYMBOL(page_readlink); + EXPORT_SYMBOL(page_follow_link); + EXPORT_SYMBOL(page_symlink_inode_operations); + +_ diff --git a/lustre/kernel_patches/pc/dev_read_only_hp.pc b/lustre/kernel_patches/pc/dev_read_only_hp.pc new file mode 100644 index 0000000..4760ad1 --- /dev/null +++ b/lustre/kernel_patches/pc/dev_read_only_hp.pc @@ -0,0 +1,3 @@ +drivers/block/blkpg.c +drivers/block/loop.c +drivers/ide/ide-disk.c diff --git a/lustre/kernel_patches/pc/exports_hp.pc b/lustre/kernel_patches/pc/exports_hp.pc new file mode 100644 index 0000000..6472a11 --- /dev/null +++ b/lustre/kernel_patches/pc/exports_hp.pc @@ -0,0 +1,4 @@ +fs/ext3/Makefile +fs/ext3/super.c +include/linux/fs.h +kernel/ksyms.c diff --git a/lustre/kernel_patches/pc/invalidate_show.pc b/lustre/kernel_patches/pc/invalidate_show.pc new file mode 100644 index 0000000..1f565ab --- /dev/null +++ b/lustre/kernel_patches/pc/invalidate_show.pc @@ -0,0 +1,5 @@ +fs/inode.c +fs/block_dev.c +fs/devfs/base.c +fs/super.c +include/linux/fs.h diff --git a/lustre/kernel_patches/pc/iod-rmap-exports.pc b/lustre/kernel_patches/pc/iod-rmap-exports.pc new file mode 100644 index 0000000..1218f55 --- /dev/null +++ b/lustre/kernel_patches/pc/iod-rmap-exports.pc @@ -0,0 +1,6 @@ +fs/inode.c +fs/Makefile +mm/filemap.c +mm/vmscan.c +mm/Makefile +mm/page_alloc.c diff --git a/lustre/kernel_patches/pc/jbd-transno-cb.pc b/lustre/kernel_patches/pc/jbd-transno-cb.pc new file mode 100644 index 0000000..cde73d8 --- /dev/null +++ 
b/lustre/kernel_patches/pc/jbd-transno-cb.pc @@ -0,0 +1,4 @@ +fs/jbd/commit.c +fs/jbd/journal.c +fs/jbd/transaction.c +include/linux/jbd.h diff --git a/lustre/kernel_patches/pc/kmem_cache_validate.pc b/lustre/kernel_patches/pc/kmem_cache_validate.pc index 12f8816..a0a6297 100644 --- a/lustre/kernel_patches/pc/kmem_cache_validate.pc +++ b/lustre/kernel_patches/pc/kmem_cache_validate.pc @@ -2,5 +2,4 @@ arch/i386/mm/init.c arch/ia64/mm/init.c include/linux/slab.h kernel/ksyms.c -kernel/ksyms.c.validate mm/slab.c diff --git a/lustre/kernel_patches/pc/kmem_cache_validate_hp.pc b/lustre/kernel_patches/pc/kmem_cache_validate_hp.pc new file mode 100644 index 0000000..a0a6297 --- /dev/null +++ b/lustre/kernel_patches/pc/kmem_cache_validate_hp.pc @@ -0,0 +1,5 @@ +arch/i386/mm/init.c +arch/ia64/mm/init.c +include/linux/slab.h +kernel/ksyms.c +mm/slab.c diff --git a/lustre/kernel_patches/pc/vanilla-2.4.19.pc b/lustre/kernel_patches/pc/vanilla-2.4.19.pc index c1ed719..bb5c390 100644 --- a/lustre/kernel_patches/pc/vanilla-2.4.19.pc +++ b/lustre/kernel_patches/pc/vanilla-2.4.19.pc @@ -6,12 +6,8 @@ drivers/block/loop.c drivers/ide/ide-disk.c fs/ext3/Makefile fs/ext3/super.c -fs/jbd/commit.c -fs/jbd/journal.c -fs/jbd/transaction.c include/linux/blkdev.h include/linux/slab.h -include/linux/jbd.h kernel/ksyms.c include/linux/dcache.h include/linux/fs.h diff --git a/lustre/kernel_patches/pc/vfs_intent_hp.pc b/lustre/kernel_patches/pc/vfs_intent_hp.pc new file mode 100644 index 0000000..881576c --- /dev/null +++ b/lustre/kernel_patches/pc/vfs_intent_hp.pc @@ -0,0 +1,8 @@ +fs/dcache.c +fs/namei.c +fs/nfsd/vfs.c +fs/open.c +fs/stat.c +include/linux/dcache.h +include/linux/fs.h +kernel/ksyms.c diff --git a/lustre/kernel_patches/series/chaos b/lustre/kernel_patches/series/chaos index b35612f..913ae18 100644 --- a/lustre/kernel_patches/series/chaos +++ b/lustre/kernel_patches/series/chaos @@ -2,4 +2,6 @@ dev_read_only.patch exports.patch kmem_cache_validate.patch lustre_version.patch 
-vfs_intent.patch +vfs_intent-2.4.18-18.patch +invalidate_show.patch +iod-rmap-exports.patch diff --git a/lustre/kernel_patches/series/hp-pnnl b/lustre/kernel_patches/series/hp-pnnl index d0171e0..6723ab6 100644 --- a/lustre/kernel_patches/series/hp-pnnl +++ b/lustre/kernel_patches/series/hp-pnnl @@ -1 +1,7 @@ -patch-2.4.18-hp1_pnnl18.2.8qsnet +dev_read_only_hp.patch +exports_hp.patch +kmem_cache_validate_hp.patch +jbd-transno-cb.patch +lustre_version.patch +vfs_intent_hp.patch +invalidate_show.patch diff --git a/lustre/kernel_patches/series/rh-2.4.18-18 b/lustre/kernel_patches/series/rh-2.4.18-18 index ec72618..51a833f 100644 --- a/lustre/kernel_patches/series/rh-2.4.18-18 +++ b/lustre/kernel_patches/series/rh-2.4.18-18 @@ -6,3 +6,5 @@ uml_check_get_page.patch uml_no_panic.patch vfs_intent-2.4.18-18.patch uml_compile_fixes.patch +invalidate_show.patch +iod-rmap-exports.patch diff --git a/lustre/kernel_patches/series/rh-8.0 b/lustre/kernel_patches/series/rh-8.0 index 4c64ad2..2ba39f5 100644 --- a/lustre/kernel_patches/series/rh-8.0 +++ b/lustre/kernel_patches/series/rh-8.0 @@ -6,3 +6,4 @@ uml_check_get_page.patch uml_no_panic.patch vfs_intent.patch uml_compile_fixes.patch +invalidate_show.patch diff --git a/lustre/kernel_patches/series/vanilla-2.4.18 b/lustre/kernel_patches/series/vanilla-2.4.18 index 314a8c3..5d2ab68 100644 --- a/lustre/kernel_patches/series/vanilla-2.4.18 +++ b/lustre/kernel_patches/series/vanilla-2.4.18 @@ -1 +1,2 @@ vanilla-2.4.18 +invalidate_show.patch diff --git a/lustre/kernel_patches/series/vanilla-2.4.19 b/lustre/kernel_patches/series/vanilla-2.4.19 index f868802..37cb65e 100644 --- a/lustre/kernel_patches/series/vanilla-2.4.19 +++ b/lustre/kernel_patches/series/vanilla-2.4.19 @@ -1 +1,3 @@ -vanilla-2.4.19 +vanilla-2.4.19.patch +jbd-transno-cb.patch +invalidate_show.patch diff --git a/lustre/kernel_patches/txt/exports.txt b/lustre/kernel_patches/txt/exports.txt index 010cdb7..00b991e 100644 --- a/lustre/kernel_patches/txt/exports.txt +++ 
b/lustre/kernel_patches/txt/exports.txt @@ -1,3 +1,3 @@ DESC -(undescribed patch) +Required kernel function exports for Lustre. EDESC diff --git a/lustre/kernel_patches/txt/exports_hp.txt b/lustre/kernel_patches/txt/exports_hp.txt new file mode 100644 index 0000000..00b991e --- /dev/null +++ b/lustre/kernel_patches/txt/exports_hp.txt @@ -0,0 +1,3 @@ +DESC +Required kernel function exports for Lustre. +EDESC diff --git a/lustre/kernel_patches/txt/invalidate_show.txt b/lustre/kernel_patches/txt/invalidate_show.txt new file mode 100644 index 0000000..88f093a --- /dev/null +++ b/lustre/kernel_patches/txt/invalidate_show.txt @@ -0,0 +1,3 @@ +DESC +Prints which inodes are busy at filesystem unmount time. +EDESC diff --git a/lustre/kernel_patches/which_patch b/lustre/kernel_patches/which_patch index 45e259a..4a5e662 100644 --- a/lustre/kernel_patches/which_patch +++ b/lustre/kernel_patches/which_patch @@ -1,12 +1,8 @@ -series/rh-8.0 - redhat 2.4.18-14 - redhat 2.4.18-17 +series/chaos + chaos-39 series/rh-2.4.18-18 redhat 2.4.18-18 -series/hp-pnnl ** NOTE: equivalent to vanilla-2.4.18 - linux-2.4.18-hp1_pnnl18 - linux-2.4.18-hp1_pnnl19 -series/vanilla-2.4.18 ** Not officially supported - linux-2.4.18 +series/hp-pnnl ** Note: functionally equivalent to 2.4.19 + linux-2.4.18-hp2_pnnl2 series/vanilla-2.4.19 ** Not officially supported linux-2.4.19 diff --git a/lustre/ldlm/Makefile.am b/lustre/ldlm/Makefile.am index ed5051a..a76ff4a 100644 --- a/lustre/ldlm/Makefile.am +++ b/lustre/ldlm/Makefile.am @@ -8,7 +8,7 @@ MODULE = ldlm modulefs_DATA = ldlm.o EXTRA_PROGRAMS = ldlm -ldlm_SOURCES = l_lock.c ldlm_lock.c ldlm_resource.c ldlm_test.c ldlm_lockd.c \ +ldlm_SOURCES = l_lock.c ldlm_lock.c ldlm_resource.c ldlm_lockd.c \ ldlm_extent.c ldlm_request.c include $(top_srcdir)/Rules diff --git a/lustre/ldlm/ldlm_extent.c b/lustre/ldlm/ldlm_extent.c index ae1153f..5a84909 100644 --- a/lustre/ldlm/ldlm_extent.c +++ b/lustre/ldlm/ldlm_extent.c @@ -67,10 +67,11 @@ static void 
policy_internal(struct list_head *queue, struct ldlm_extent *req_ex, } /* apply the internal policy by walking all the lists */ -int ldlm_extent_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, - void *req_cookie, - ldlm_mode_t mode, int flags, void *data) +int ldlm_extent_policy(struct ldlm_namespace *ns, struct ldlm_lock **lockp, + void *req_cookie, ldlm_mode_t mode, int flags, + void *data) { + struct ldlm_lock *lock = *lockp; struct ldlm_resource *res = lock->l_resource; struct ldlm_extent *req_ex = req_cookie; struct ldlm_extent new_ex; diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c index a1220ab..b1ba4ef 100644 --- a/lustre/ldlm/ldlm_lock.c +++ b/lustre/ldlm/ldlm_lock.c @@ -1,7 +1,7 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (c) 2002 Cluster File Systems, Inc. + * Copyright (c) 2002, 2003 Cluster File Systems, Inc. * Author: Peter Braam * Author: Phil Schwan * @@ -55,34 +55,18 @@ char *ldlm_it2str(int it) return "creat"; case (IT_OPEN | IT_CREAT): return "open|creat"; - case IT_MKDIR: - return "mkdir"; - case IT_LINK: - return "link"; - case IT_LINK2: - return "link2"; - case IT_SYMLINK: - return "symlink"; - case IT_UNLINK: - return "unlink"; - case IT_RMDIR: - return "rmdir"; - case IT_RENAME: - return "rename"; - case IT_RENAME2: - return "rename2"; case IT_READDIR: return "readdir"; case IT_GETATTR: return "getattr"; + case IT_TRUNC: + return "truncate"; case IT_SETATTR: return "setattr"; - case IT_READLINK: - return "readlink"; - case IT_MKNOD: - return "mknod"; case IT_LOOKUP: return "lookup"; + case IT_UNLINK: + return "unlink"; default: CERROR("Unknown intent %d\n", it); return "UNKNOWN"; @@ -101,7 +85,7 @@ ldlm_res_compat ldlm_res_compat_table[] = { static ldlm_res_policy ldlm_intent_policy_func; -static int ldlm_plain_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, +static int ldlm_plain_policy(struct ldlm_namespace *ns, struct ldlm_lock 
**lock, void *req_cookie, ldlm_mode_t mode, int flags, void *data) { @@ -186,22 +170,26 @@ void ldlm_lock_remove_from_lru(struct ldlm_lock *lock) EXIT; } -/* Only called with strict == 0 by recovery, to mark in-use locks as - * should-be-destroyed */ +/* This used to have a 'strict' flag, which recovery would use to mark an + * in-use lock as needing-to-die. Lest I am ever tempted to put it back, I + * shall explain why it's gone: with the new hash table scheme, once you call + * ldlm_lock_destroy, you can never drop your final references on this lock. + * Because it's not in the hash table anymore. -phil */ void ldlm_lock_destroy(struct ldlm_lock *lock) { ENTRY; l_lock(&lock->l_resource->lr_namespace->ns_lock); if (!list_empty(&lock->l_children)) { - LDLM_DEBUG(lock, "still has children (%p)!", + LDLM_ERROR(lock, "still has children (%p)!", lock->l_children.next); ldlm_lock_dump(D_ERROR, lock); LBUG(); } if (lock->l_readers || lock->l_writers) { - LDLM_DEBUG(lock, "lock still has references"); - ldlm_lock_dump(D_OTHER, lock); + LDLM_ERROR(lock, "lock still has references"); + ldlm_lock_dump(D_ERROR, lock); + LBUG(); } if (!list_empty(&lock->l_res_link)) { @@ -238,7 +226,7 @@ void ldlm_lock_destroy(struct ldlm_lock *lock) /* this is called by portals_handle2object with the handle lock taken */ static void lock_handle_addref(void *lock) { - ldlm_lock_get(lock); + LDLM_LOCK_GET((struct ldlm_lock *)lock); } /* @@ -288,20 +276,20 @@ static struct ldlm_lock *ldlm_lock_new(struct ldlm_lock *parent, } int ldlm_lock_change_resource(struct ldlm_namespace *ns, struct ldlm_lock *lock, - __u64 new_resid[3]) + struct ldlm_res_id new_resid) { struct ldlm_resource *oldres = lock->l_resource; ENTRY; l_lock(&ns->ns_lock); - if (memcmp(new_resid, lock->l_resource->lr_name, + if (memcmp(&new_resid, &lock->l_resource->lr_name, sizeof(lock->l_resource->lr_name)) == 0) { /* Nothing to do */ l_unlock(&ns->ns_lock); RETURN(0); } - LASSERT(new_resid[0] != 0); + LASSERT(new_resid.name[0]
!= 0); /* This function assumes that the lock isn't on any lists */ LASSERT(list_empty(&lock->l_res_link)); @@ -326,12 +314,11 @@ int ldlm_lock_change_resource(struct ldlm_namespace *ns, struct ldlm_lock *lock, void ldlm_lock2handle(struct ldlm_lock *lock, struct lustre_handle *lockh) { - //lockh->addr = (__u64)(unsigned long)lock; memset(&lockh->addr, 0x69, sizeof(lockh->addr)); lockh->cookie = lock->l_handle.h_cookie; } -/* if flags: atomically get the lock and set the flags. +/* if flags: atomically get the lock and set the flags. * Return NULL if flag already set */ @@ -354,7 +341,7 @@ struct ldlm_lock *__ldlm_handle2lock(struct lustre_handle *handle, int flags) /* It's unlikely but possible that someone marked the lock as * destroyed after we did handle2object on it */ if (lock->l_destroyed) { - CERROR("lock already destroyed: lock %p\n", lock); + CDEBUG(D_INFO, "lock already destroyed: lock %p\n", lock); LDLM_LOCK_PUT(lock); GOTO(out, retval); } @@ -401,7 +388,8 @@ void ldlm_lock2desc(struct ldlm_lock *lock, struct ldlm_lock_desc *desc) } static void ldlm_add_ast_work_item(struct ldlm_lock *lock, - struct ldlm_lock *new) + struct ldlm_lock *new, + void *data, int datalen) { struct ldlm_ast_work *w; ENTRY; @@ -416,6 +404,8 @@ static void ldlm_add_ast_work_item(struct ldlm_lock *lock, GOTO(out, 0); } + w->w_data = data; + w->w_datalen = datalen; if (new) { lock->l_flags |= LDLM_FL_AST_SENT; w->w_blocking = 1; @@ -424,7 +414,8 @@ static void ldlm_add_ast_work_item(struct ldlm_lock *lock, w->w_lock = LDLM_LOCK_GET(lock); list_add(&w->w_list, lock->l_resource->lr_tmp); - out: + EXIT; + out: l_unlock(&lock->l_resource->lr_namespace->ns_lock); return; } @@ -454,20 +445,16 @@ void ldlm_lock_addref_internal(struct ldlm_lock *lock, __u32 mode) /* Args: unlocked lock */ int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns, - __u64 *res_id, int flags); + struct ldlm_res_id, int flags); -void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode) +void 
ldlm_lock_decref_internal(struct ldlm_lock *lock, __u32 mode) { - struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0); struct ldlm_namespace *ns; ENTRY; - if (lock == NULL) - LBUG(); - LDLM_DEBUG(lock, "ldlm_lock_decref(%s)", ldlm_lockname[mode]); ns = lock->l_resource->lr_namespace; - l_lock(&lock->l_resource->lr_namespace->ns_lock); + l_lock(&ns->ns_lock); if (mode == LCK_NL || mode == LCK_CR || mode == LCK_PR) { LASSERT(lock->l_readers > 0); lock->l_readers--; @@ -476,38 +463,71 @@ void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode) lock->l_writers--; } - /* If we received a blocked AST and this was the last reference, - * run the callback. */ + if (lock->l_flags & LDLM_FL_LOCAL && + !lock->l_readers && !lock->l_writers) { + /* If this is a local lock on a server namespace and this was + * the last reference, cancel the lock. */ + CDEBUG(D_INFO, "forcing cancel of local lock\n"); + lock->l_flags |= LDLM_FL_CBPENDING; + } + if (!lock->l_readers && !lock->l_writers && (lock->l_flags & LDLM_FL_CBPENDING)) { - if (!lock->l_resource->lr_namespace->ns_client && - lock->l_export) + /* If we received a blocked AST and this was the last reference, + * run the callback. */ + if (!ns->ns_client && lock->l_export) CERROR("FL_CBPENDING set on non-local lock--just a " "warning\n"); LDLM_DEBUG(lock, "final decref done on cbpending lock"); - l_unlock(&lock->l_resource->lr_namespace->ns_lock); + l_unlock(&ns->ns_lock); /* FIXME: need a real 'desc' here */ lock->l_blocking_ast(lock, NULL, lock->l_data, - lock->l_data_len, LDLM_CB_BLOCKING); + LDLM_CB_BLOCKING); } else if (ns->ns_client && !lock->l_readers && !lock->l_writers) { + /* If this is a client-side namespace and this was the last + * reference, put it on the LRU. 
*/ LASSERT(list_empty(&lock->l_lru)); LASSERT(ns->ns_nr_unused >= 0); list_add_tail(&lock->l_lru, &ns->ns_unused_list); ns->ns_nr_unused++; - l_unlock(&lock->l_resource->lr_namespace->ns_lock); + l_unlock(&ns->ns_lock); ldlm_cancel_lru(ns); } else { - l_unlock(&lock->l_resource->lr_namespace->ns_lock); + l_unlock(&ns->ns_lock); } LDLM_LOCK_PUT(lock); /* matches the ldlm_lock_get in addref */ - LDLM_LOCK_PUT(lock); /* matches the handle2lock above */ EXIT; } +void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode) +{ + struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0); + LASSERT(lock != NULL); + ldlm_lock_decref_internal(lock, mode); + LDLM_LOCK_PUT(lock); +} + +/* This will drop a lock reference and mark it for destruction, but will not + * necessarily cancel the lock before returning. */ +void ldlm_lock_decref_and_cancel(struct lustre_handle *lockh, __u32 mode) +{ + struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0); + ENTRY; + + LASSERT(lock != NULL); + + LDLM_DEBUG(lock, "ldlm_lock_decref(%s)", ldlm_lockname[mode]); + l_lock(&lock->l_resource->lr_namespace->ns_lock); + lock->l_flags |= LDLM_FL_CBPENDING; + ldlm_lock_decref_internal(lock, mode); + l_unlock(&lock->l_resource->lr_namespace->ns_lock); + LDLM_LOCK_PUT(lock); +} + static int ldlm_lock_compat_list(struct ldlm_lock *lock, int send_cbs, struct list_head *queue) { @@ -537,7 +557,7 @@ static int ldlm_lock_compat_list(struct ldlm_lock *lock, int send_cbs, if (send_cbs && child->l_blocking_ast != NULL) { CDEBUG(D_OTHER, "lock %p incompatible; sending " "blocking AST.\n", child); - ldlm_add_ast_work_item(child, lock); + ldlm_add_ast_work_item(child, lock, NULL, 0); } } @@ -562,9 +582,11 @@ static int ldlm_lock_compat(struct ldlm_lock *lock, int send_cbs) } /* NOTE: called by - - ldlm_handle_enqueuque - resource -*/ -void ldlm_grant_lock(struct ldlm_lock *lock) + * - ldlm_lock_enqueue + * - ldlm_reprocess_queue + * - ldlm_lock_convert + */ +void ldlm_grant_lock(struct ldlm_lock *lock, void *data, 
int datalen) { struct ldlm_resource *res = lock->l_resource; ENTRY; @@ -576,17 +598,18 @@ void ldlm_grant_lock(struct ldlm_lock *lock) if (lock->l_granted_mode < res->lr_most_restr) res->lr_most_restr = lock->l_granted_mode; - if (lock->l_completion_ast) { - ldlm_add_ast_work_item(lock, NULL); - } + if (lock->l_completion_ast != NULL) + ldlm_add_ast_work_item(lock, NULL, data, datalen); + l_unlock(&lock->l_resource->lr_namespace->ns_lock); EXIT; } -/* returns a referenced lock or NULL */ +/* returns a referenced lock or NULL. See the flag descriptions below, in the + * comment above ldlm_lock_match */ static struct ldlm_lock *search_queue(struct list_head *queue, ldlm_mode_t mode, struct ldlm_extent *extent, - struct ldlm_lock *old_lock) + struct ldlm_lock *old_lock, int flags) { struct ldlm_lock *lock; struct list_head *tmp; @@ -595,7 +618,7 @@ static struct ldlm_lock *search_queue(struct list_head *queue, ldlm_mode_t mode, lock = list_entry(tmp, struct ldlm_lock, l_res_link); if (lock == old_lock) - continue; + break; if (lock->l_flags & LDLM_FL_CBPENDING) continue; @@ -611,6 +634,10 @@ static struct ldlm_lock *search_queue(struct list_head *queue, ldlm_mode_t mode, if (lock->l_destroyed) continue; + if ((flags & LDLM_FL_LOCAL_ONLY) && + !(lock->l_flags & LDLM_FL_LOCAL)) + continue; + ldlm_lock_addref_internal(lock, mode); return lock; } @@ -625,12 +652,17 @@ static struct ldlm_lock *search_queue(struct list_head *queue, ldlm_mode_t mode, * * Otherwise, all of the fields must be filled in, to match against. 
* + * If 'flags' contains LDLM_FL_LOCAL_ONLY, then only match local locks on the + * server (ie, connh is NULL) + * If 'flags' contains LDLM_FL_BLOCK_GRANTED, then only locks on the granted + * list will be considered + * * Returns 1 if it finds an already-existing lock that is compatible; in this * case, lockh is filled in with a addref()ed lock */ -int ldlm_lock_match(struct ldlm_namespace *ns, __u64 *res_id, __u32 type, - void *cookie, int cookielen, ldlm_mode_t mode, - struct lustre_handle *lockh) +int ldlm_lock_match(struct ldlm_namespace *ns, int flags, + struct ldlm_res_id *res_id, __u32 type, void *cookie, + int cookielen, ldlm_mode_t mode,struct lustre_handle *lockh) { struct ldlm_resource *res; struct ldlm_lock *lock, *old_lock = NULL; @@ -642,12 +674,12 @@ int ldlm_lock_match(struct ldlm_namespace *ns, __u64 *res_id, __u32 type, LASSERT(old_lock); ns = old_lock->l_resource->lr_namespace; - res_id = old_lock->l_resource->lr_name; + res_id = &old_lock->l_resource->lr_name; type = old_lock->l_resource->lr_type; mode = old_lock->l_req_mode; } - res = ldlm_resource_get(ns, NULL, res_id, type, 0); + res = ldlm_resource_get(ns, NULL, *res_id, type, 0); if (res == NULL) { LASSERT(old_lock == NULL); RETURN(0); @@ -655,11 +687,16 @@ int ldlm_lock_match(struct ldlm_namespace *ns, __u64 *res_id, __u32 type, l_lock(&ns->ns_lock); - if ((lock = search_queue(&res->lr_granted, mode, cookie, old_lock))) + lock = search_queue(&res->lr_granted, mode, cookie, old_lock, flags); + if (lock != NULL) GOTO(out, rc = 1); - if ((lock = search_queue(&res->lr_converting, mode, cookie, old_lock))) + if (flags & LDLM_FL_BLOCK_GRANTED) + GOTO(out, rc = 0); + lock = search_queue(&res->lr_converting, mode, cookie, old_lock, flags); + if (lock != NULL) GOTO(out, rc = 1); - if ((lock = search_queue(&res->lr_waiting, mode, cookie, old_lock))) + lock = search_queue(&res->lr_waiting, mode, cookie, old_lock, flags); + if (lock != NULL) GOTO(out, rc = 1); EXIT; @@ -670,7 +707,7 @@ int 
ldlm_lock_match(struct ldlm_namespace *ns, __u64 *res_id, __u32 type, if (lock) { ldlm_lock2handle(lock, lockh); if (lock->l_completion_ast) - lock->l_completion_ast(lock, LDLM_FL_WAIT_NOREPROC); + lock->l_completion_ast(lock, LDLM_FL_WAIT_NOREPROC, NULL); } if (rc) LDLM_DEBUG(lock, "matched"); @@ -686,11 +723,12 @@ int ldlm_lock_match(struct ldlm_namespace *ns, __u64 *res_id, __u32 type, /* Returns a referenced lock */ struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns, struct lustre_handle *parent_lock_handle, - __u64 * res_id, __u32 type, - ldlm_mode_t mode, void *data, __u32 data_len) + struct ldlm_res_id res_id, __u32 type, + ldlm_mode_t mode, void *data, void *cp_data) { struct ldlm_resource *res, *parent_res = NULL; struct ldlm_lock *lock, *parent_lock = NULL; + ENTRY; if (parent_lock_handle) { parent_lock = ldlm_handle2lock(parent_lock_handle); @@ -712,19 +750,20 @@ struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns, lock->l_req_mode = mode; lock->l_data = data; - lock->l_data_len = data_len; + lock->l_cp_data = cp_data; - return lock; + RETURN(lock); } ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns, - struct ldlm_lock *lock, + struct ldlm_lock **lockp, void *cookie, int cookie_len, int *flags, ldlm_completion_callback completion, ldlm_blocking_callback blocking) { struct ldlm_resource *res; + struct ldlm_lock *lock = *lockp; int local; ldlm_res_policy policy; ENTRY; @@ -740,11 +779,18 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns, if (!local && !(*flags & LDLM_FL_REPLAY) && (policy = ldlm_res_policy_table[res->lr_type])) { int rc; - rc = policy(ns, lock, cookie, lock->l_req_mode, *flags, NULL); - + rc = policy(ns, lockp, cookie, lock->l_req_mode, *flags, NULL); if (rc == ELDLM_LOCK_CHANGED) { res = lock->l_resource; *flags |= LDLM_FL_LOCK_CHANGED; + } else if (rc == ELDLM_LOCK_REPLACED) { + /* The lock that was returned has already been granted, + * and placed into lockp. 
Destroy the old one and our + * work here is done. */ + ldlm_lock_destroy(lock); + LDLM_LOCK_PUT(lock); + *flags |= LDLM_FL_LOCK_CHANGED; + RETURN(0); } else if (rc == ELDLM_LOCK_ABORTED) { ldlm_lock_destroy(lock); RETURN(rc); @@ -756,8 +802,8 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns, /* The server returned a blocked lock, but it was granted before * we got a chance to actually enqueue it. We don't need to do * anything else. */ - *flags &= ~(LDLM_FL_BLOCK_GRANTED | - LDLM_FL_BLOCK_CONV | LDLM_FL_BLOCK_WAIT); + *flags &= ~(LDLM_FL_BLOCK_GRANTED | + LDLM_FL_BLOCK_CONV | LDLM_FL_BLOCK_WAIT); GOTO(out, ELDLM_OK); } @@ -775,23 +821,21 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns, ldlm_resource_unlink_lock(lock); if (local) { if (*flags & LDLM_FL_BLOCK_CONV) - ldlm_resource_add_lock(res, res->lr_converting.prev, - lock); + ldlm_resource_add_lock(res, &res->lr_converting, lock); else if (*flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED)) - ldlm_resource_add_lock(res, res->lr_waiting.prev, lock); + ldlm_resource_add_lock(res, &res->lr_waiting, lock); else - ldlm_grant_lock(lock); + ldlm_grant_lock(lock, NULL, 0); GOTO(out, ELDLM_OK); } else if (*flags & LDLM_FL_REPLAY) { if (*flags & LDLM_FL_BLOCK_CONV) { - ldlm_resource_add_lock(res, res->lr_converting.prev, - lock); + ldlm_resource_add_lock(res, &res->lr_converting, lock); GOTO(out, ELDLM_OK); } else if (*flags & LDLM_FL_BLOCK_WAIT) { - ldlm_resource_add_lock(res, res->lr_waiting.prev, lock); + ldlm_resource_add_lock(res, &res->lr_waiting, lock); GOTO(out, ELDLM_OK); } else if (*flags & LDLM_FL_BLOCK_GRANTED) { - ldlm_grant_lock(lock); + ldlm_grant_lock(lock, NULL, 0); GOTO(out, ELDLM_OK); } /* If no flags, fall through to normal enqueue path. 
*/ @@ -799,22 +843,27 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns, /* FIXME: We may want to optimize by checking lr_most_restr */ if (!list_empty(&res->lr_converting)) { - ldlm_resource_add_lock(res, res->lr_waiting.prev, lock); + ldlm_resource_add_lock(res, &res->lr_waiting, lock); *flags |= LDLM_FL_BLOCK_CONV; GOTO(out, ELDLM_OK); } if (!list_empty(&res->lr_waiting)) { - ldlm_resource_add_lock(res, res->lr_waiting.prev, lock); + ldlm_resource_add_lock(res, &res->lr_waiting, lock); *flags |= LDLM_FL_BLOCK_WAIT; GOTO(out, ELDLM_OK); } if (!ldlm_lock_compat(lock, 0)) { - ldlm_resource_add_lock(res, res->lr_waiting.prev, lock); + ldlm_resource_add_lock(res, &res->lr_waiting, lock); *flags |= LDLM_FL_BLOCK_GRANTED; GOTO(out, ELDLM_OK); } - ldlm_grant_lock(lock); + if (lock->l_granted_cb != NULL && lock->l_data != NULL) { + /* We just -know- */ + struct ptlrpc_request *req = lock->l_data; + lock->l_granted_cb(lock, req->rq_repmsg, 0); + } + ldlm_grant_lock(lock, NULL, 0); EXIT; out: l_unlock(&ns->ns_lock); @@ -841,7 +890,7 @@ static int ldlm_reprocess_queue(struct ldlm_resource *res, RETURN(1); list_del_init(&pending->l_res_link); - ldlm_grant_lock(pending); + ldlm_grant_lock(pending, NULL, 0); } RETURN(0); @@ -860,9 +909,10 @@ int ldlm_run_ast_work(struct list_head *rpc_list) if (w->w_blocking) rc = w->w_lock->l_blocking_ast (w->w_lock, &w->w_desc, w->w_data, - w->w_datalen, LDLM_CB_BLOCKING); + LDLM_CB_BLOCKING); else - rc = w->w_lock->l_completion_ast(w->w_lock, w->w_flags); + rc = w->w_lock->l_completion_ast(w->w_lock, w->w_flags, + w->w_data); if (rc == -ERESTART) retval = rc; else if (rc) @@ -886,7 +936,6 @@ void ldlm_reprocess_all_ns(struct ldlm_namespace *ns) (void)ldlm_namespace_foreach_res(ns, reprocess_one_queue, NULL); } -/* Must be called with resource->lr_lock not taken. 
*/ void ldlm_reprocess_all(struct ldlm_resource *res) { struct list_head rpc_list = LIST_HEAD_INIT(rpc_list); @@ -923,7 +972,6 @@ void ldlm_cancel_callback(struct ldlm_lock *lock) lock->l_flags |= LDLM_FL_CANCEL; if (lock->l_blocking_ast) lock->l_blocking_ast(lock, NULL, lock->l_data, - lock->l_data_len, LDLM_CB_CANCELING); else LDLM_DEBUG(lock, "no blocking ast"); @@ -937,6 +985,8 @@ void ldlm_lock_cancel(struct ldlm_lock *lock) struct ldlm_namespace *ns; ENTRY; + ldlm_del_waiting_lock(lock); + res = lock->l_resource; ns = res->lr_namespace; @@ -951,14 +1001,13 @@ void ldlm_lock_cancel(struct ldlm_lock *lock) ldlm_cancel_callback(lock); - ldlm_del_waiting_lock(lock); ldlm_resource_unlink_lock(lock); ldlm_lock_destroy(lock); l_unlock(&ns->ns_lock); EXIT; } -int ldlm_lock_set_data(struct lustre_handle *lockh, void *data, int datalen) +int ldlm_lock_set_data(struct lustre_handle *lockh, void *data, void *cp_data) { struct ldlm_lock *lock = ldlm_handle2lock(lockh); ENTRY; @@ -967,16 +1016,18 @@ int ldlm_lock_set_data(struct lustre_handle *lockh, void *data, int datalen) RETURN(-EINVAL); lock->l_data = data; - lock->l_data_len = datalen; + lock->l_cp_data = cp_data; LDLM_LOCK_PUT(lock); RETURN(0); } +/* This function is only called from one thread (per export); no locking around + * the list ops needed */ void ldlm_cancel_locks_for_export(struct obd_export *exp) { - struct list_head *iter, *n; /* MUST BE CALLED "n"! */ + struct list_head *iter, *n; list_for_each_safe(iter, n, &exp->exp_ldlm_data.led_held_locks) { struct ldlm_lock *lock; @@ -999,6 +1050,8 @@ struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode, int granted = 0; ENTRY; + LBUG(); + res = lock->l_resource; ns = res->lr_namespace; @@ -1009,26 +1062,25 @@ struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode, /* If this is a local resource, put it on the appropriate list. 
*/ if (res->lr_namespace->ns_client) { - if (*flags & (LDLM_FL_BLOCK_CONV | LDLM_FL_BLOCK_GRANTED)) - ldlm_resource_add_lock(res, res->lr_converting.prev, - lock); - else { + if (*flags & (LDLM_FL_BLOCK_CONV | LDLM_FL_BLOCK_GRANTED)) { + ldlm_resource_add_lock(res, &res->lr_converting, lock); + } else { /* This should never happen, because of the way the * server handles conversions. */ LBUG(); res->lr_tmp = &rpc_list; - ldlm_grant_lock(lock); + ldlm_grant_lock(lock, NULL, 0); res->lr_tmp = NULL; granted = 1; /* FIXME: completion handling not with ns_lock held ! */ if (lock->l_completion_ast) - lock->l_completion_ast(lock, 0); + lock->l_completion_ast(lock, 0, NULL); } } else { /* FIXME: We should try the conversion right away and possibly * return success without the need for an extra AST */ - ldlm_resource_add_lock(res, res->lr_converting.prev, lock); + ldlm_resource_add_lock(res, &res->lr_converting, lock); *flags |= LDLM_FL_BLOCK_CONV; } @@ -1043,7 +1095,7 @@ void ldlm_lock_dump(int level, struct ldlm_lock *lock) { char ver[128]; - if (!(portal_debug & level)) + if (!((portal_debug | D_ERROR) & level)) return; if (RES_VERSION_SIZE != 4) @@ -1058,7 +1110,8 @@ void ldlm_lock_dump(int level, struct ldlm_lock *lock) lock->l_version[0], lock->l_version[1], lock->l_version[2], lock->l_version[3]); - CDEBUG(level, " -- Lock dump: %p (%s)\n", lock, ver); + CDEBUG(level, " -- Lock dump: %p (%s) (rc: %d)\n", lock, ver, + atomic_read(&lock->l_refc)); if (lock->l_export && lock->l_export->exp_connection) CDEBUG(level, " Node: NID %x (rhandle: "LPX64")\n", lock->l_export->exp_connection->c_peer.peer_nid, @@ -1067,7 +1120,7 @@ void ldlm_lock_dump(int level, struct ldlm_lock *lock) CDEBUG(level, " Node: local\n"); CDEBUG(level, " Parent: %p\n", lock->l_parent); CDEBUG(level, " Resource: %p ("LPD64")\n", lock->l_resource, - lock->l_resource->lr_name[0]); + lock->l_resource->lr_name.name[0]); CDEBUG(level, " Requested mode: %d, granted mode: %d\n", (int)lock->l_req_mode, 
(int)lock->l_granted_mode); CDEBUG(level, " Readers: %u ; Writers; %u\n", diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index d826db1..803e59d 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -1,7 +1,7 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (C) 2002 Cluster File Systems, Inc. + * Copyright (C) 2002, 2003 Cluster File Systems, Inc. * Author: Peter Braam * Author: Phil Schwan * @@ -42,6 +42,7 @@ inline unsigned long round_timeout(unsigned long timeout) return ((timeout / HZ) + 1) * HZ; } +/* XXX should this be per-ldlm? */ static struct list_head waiting_locks_list; static spinlock_t waiting_locks_spinlock; static struct timer_list waiting_locks_timer; @@ -129,9 +130,9 @@ int ldlm_del_waiting_lock(struct ldlm_lock *lock) RETURN(1); } -static int ldlm_server_blocking_ast(struct ldlm_lock *lock, - struct ldlm_lock_desc *desc, - void *data, __u32 data_len, int flag) +int ldlm_server_blocking_ast(struct ldlm_lock *lock, + struct ldlm_lock_desc *desc, + void *data, int flag) { struct ldlm_request *body; struct ptlrpc_request *req; @@ -146,6 +147,13 @@ static int ldlm_server_blocking_ast(struct ldlm_lock *lock, LASSERT(lock); l_lock(&lock->l_resource->lr_namespace->ns_lock); + /* XXX This is necessary because, with the lock re-tasking, we actually + * _can_ get called in here twice. (bug 830) */ + if (!list_empty(&lock->l_pending_chain)) { + l_unlock(&lock->l_resource->lr_namespace->ns_lock); + RETURN(0); + } + if (lock->l_destroyed) { /* What's the point? 
*/ l_unlock(&lock->l_resource->lr_namespace->ns_lock); @@ -171,6 +179,7 @@ static int ldlm_server_blocking_ast(struct ldlm_lock *lock, req->rq_level = LUSTRE_CONN_RECOVD; rc = ptlrpc_queue_wait(req); if (rc == -ETIMEDOUT || rc == -EINTR) { + ldlm_del_waiting_lock(lock); ldlm_expired_completion_wait(lock); } else if (rc) { CERROR("client returned %d from blocking AST for lock %p\n", @@ -188,7 +197,7 @@ static int ldlm_server_blocking_ast(struct ldlm_lock *lock, RETURN(rc); } -static int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags) +int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data) { struct ldlm_request *body; struct ptlrpc_request *req; @@ -217,6 +226,7 @@ static int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags) req->rq_level = LUSTRE_CONN_RECOVD; rc = ptlrpc_queue_wait(req); if (rc == -ETIMEDOUT || rc == -EINTR) { + ldlm_del_waiting_lock(lock); ldlm_expired_completion_wait(lock); } else if (rc) { CERROR("client returned %d from completion AST for lock %p\n", @@ -233,7 +243,9 @@ static int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags) RETURN(rc); } -int ldlm_handle_enqueue(struct ptlrpc_request *req) +int ldlm_handle_enqueue(struct ptlrpc_request *req, + ldlm_completion_callback completion_callback, + ldlm_blocking_callback blocking_callback) { struct obd_device *obddev = req->rq_export->exp_obd; struct ldlm_reply *dlm_rep; @@ -268,8 +280,7 @@ int ldlm_handle_enqueue(struct ptlrpc_request *req) } } - /* XXX notice that this lock has no callback data: of course the - export would be exactly what we may want to use here... 
*/ + /* The lock's callback data might be set in the policy function */ lock = ldlm_lock_create(obddev->obd_namespace, &dlm_req->lock_handle2, dlm_req->lock_desc.l_resource.lr_name, @@ -289,10 +300,9 @@ int ldlm_handle_enqueue(struct ptlrpc_request *req) &lock->l_export->exp_ldlm_data.led_held_locks); l_unlock(&lock->l_resource->lr_namespace->ns_lock); - err = ldlm_lock_enqueue(obddev->obd_namespace, lock, cookie, cookielen, - &flags, ldlm_server_completion_ast, - ldlm_server_blocking_ast); - if (err != ELDLM_OK) + err = ldlm_lock_enqueue(obddev->obd_namespace, &lock, cookie, cookielen, + &flags, completion_callback, blocking_callback); + if (err) GOTO(out, err); dlm_rep = lustre_msg_buf(req->rq_repmsg, 0); @@ -303,7 +313,7 @@ int ldlm_handle_enqueue(struct ptlrpc_request *req) memcpy(&dlm_rep->lock_extent, &lock->l_extent, sizeof(lock->l_extent)); if (dlm_rep->lock_flags & LDLM_FL_LOCK_CHANGED) { - memcpy(dlm_rep->lock_resource_name, lock->l_resource->lr_name, + memcpy(&dlm_rep->lock_resource_name, &lock->l_resource->lr_name, sizeof(dlm_rep->lock_resource_name)); dlm_rep->lock_mode = lock->l_req_mode; } @@ -315,6 +325,8 @@ int ldlm_handle_enqueue(struct ptlrpc_request *req) "(err=%d)", err); req->rq_status = err; + /* The LOCK_CHANGED code in ldlm_lock_enqueue depends on this + * ldlm_reprocess_all. If this moves, revisit that code. 
-phil */ if (lock) { if (!err) ldlm_reprocess_all(lock->l_resource); @@ -384,9 +396,11 @@ int ldlm_handle_cancel(struct ptlrpc_request *req) lock = ldlm_handle2lock(&dlm_req->lock_handle1); if (!lock) { - LDLM_DEBUG_NOLOCK("server-side cancel handler stale lock (lock " - "%p)", (void *)(unsigned long) - dlm_req->lock_handle1.addr); + CERROR("received cancel for unknown lock cookie "LPX64"\n", + dlm_req->lock_handle1.cookie); + LDLM_DEBUG_NOLOCK("server-side cancel handler stale lock " + "(cookie "LPU64")", + dlm_req->lock_handle1.cookie); req->rq_status = ESTALE; } else { LDLM_DEBUG(lock, "server-side cancel handler START"); @@ -442,8 +456,7 @@ static int ldlm_handle_bl_callback(struct ptlrpc_request *req, "callback (%p)", lock->l_blocking_ast); if (lock->l_blocking_ast != NULL) { lock->l_blocking_ast(lock, &dlm_req->lock_desc, - lock->l_data, lock->l_data_len, - LDLM_CB_BLOCKING); + lock->l_data, LDLM_CB_BLOCKING); } } else LDLM_DEBUG(lock, "Lock still has references, will be" @@ -487,15 +500,15 @@ static int ldlm_handle_cp_callback(struct ptlrpc_request *req, memcpy(&lock->l_extent, &dlm_req->lock_desc.l_extent, sizeof(lock->l_extent)); ldlm_resource_unlink_lock(lock); - if (memcmp(dlm_req->lock_desc.l_resource.lr_name, - lock->l_resource->lr_name, - sizeof(__u64) * RES_NAME_SIZE) != 0) { + if (memcmp(&dlm_req->lock_desc.l_resource.lr_name, + &lock->l_resource->lr_name, + sizeof(lock->l_resource->lr_name)) != 0) { ldlm_lock_change_resource(ns, lock, dlm_req->lock_desc.l_resource.lr_name); LDLM_DEBUG(lock, "completion AST, new resource"); } lock->l_resource->lr_tmp = &ast_list; - ldlm_grant_lock(lock); + ldlm_grant_lock(lock, req, sizeof(*req)); lock->l_resource->lr_tmp = NULL; l_unlock(&ns->ns_lock); LDLM_DEBUG(lock, "callback handler finished, about to run_ast_work"); @@ -618,6 +631,7 @@ static int ldlm_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len, { struct obd_device *obddev = class_conn2obd(conn); struct ptlrpc_connection *connection; + 
struct obd_uuid uuid = { "ldlm" }; int err = 0; ENTRY; @@ -630,14 +644,15 @@ static int ldlm_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len, OBD_ALLOC(obddev->u.ldlm.ldlm_client, sizeof(*obddev->u.ldlm.ldlm_client)); - connection = ptlrpc_uuid_to_connection("ldlm"); + connection = ptlrpc_uuid_to_connection(&uuid); if (!connection) CERROR("No LDLM UUID found: assuming ldlm is local.\n"); switch (cmd) { case IOC_LDLM_TEST: - err = ldlm_test(obddev, conn); - CERROR("-- done err %d\n", err); + //err = ldlm_test(obddev, conn); + err = 0; + CERROR("-- NO TESTS WERE RUN done err %d\n", err); GOTO(out, err); case IOC_LDLM_DUMP: ldlm_dump_all_namespaces(); @@ -657,6 +672,7 @@ static int ldlm_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len, static int ldlm_setup(struct obd_device *obddev, obd_count len, void *buf) { struct ldlm_obd *ldlm = &obddev->u.ldlm; + struct obd_uuid uuid = {"self"}; int rc, i; ENTRY; @@ -670,7 +686,7 @@ static int ldlm_setup(struct obd_device *obddev, obd_count len, void *buf) ldlm->ldlm_cb_service = ptlrpc_init_svc(LDLM_NEVENTS, LDLM_NBUFS, LDLM_BUFSIZE, LDLM_MAXREQSIZE, LDLM_CB_REQUEST_PORTAL, - LDLM_CB_REPLY_PORTAL, "self", + LDLM_CB_REPLY_PORTAL, &uuid, ldlm_callback_handler, "ldlm_cbd"); if (!ldlm->ldlm_cb_service) { @@ -681,7 +697,7 @@ static int ldlm_setup(struct obd_device *obddev, obd_count len, void *buf) ldlm->ldlm_cancel_service = ptlrpc_init_svc(LDLM_NEVENTS, LDLM_NBUFS, LDLM_BUFSIZE, LDLM_MAXREQSIZE, LDLM_CANCEL_REQUEST_PORTAL, - LDLM_CANCEL_REPLY_PORTAL, "self", + LDLM_CANCEL_REPLY_PORTAL, &uuid, ldlm_cancel_handler, "ldlm_canceld"); if (!ldlm->ldlm_cancel_service) { @@ -755,7 +771,7 @@ static int ldlm_cleanup(struct obd_device *obddev) } static int ldlm_connect(struct lustre_handle *conn, struct obd_device *src, - obd_uuid_t cluuid, struct recovd_obd *recovd, + struct obd_uuid *cluuid, struct recovd_obd *recovd, ptlrpc_recovery_cb_t recover) { return class_connect(conn, src, cluuid); @@ -804,43 +820,63 
@@ static void __exit ldlm_exit(void) CERROR("couldn't free ldlm lock slab\n"); } -EXPORT_SYMBOL(ldlm_completion_ast); -EXPORT_SYMBOL(ldlm_handle_enqueue); -EXPORT_SYMBOL(ldlm_handle_cancel); -EXPORT_SYMBOL(ldlm_handle_convert); +/* ldlm_lock.c */ +EXPORT_SYMBOL(ldlm_lock2desc); EXPORT_SYMBOL(ldlm_register_intent); EXPORT_SYMBOL(ldlm_unregister_intent); EXPORT_SYMBOL(ldlm_lockname); EXPORT_SYMBOL(ldlm_typename); -EXPORT_SYMBOL(__ldlm_handle2lock); EXPORT_SYMBOL(ldlm_lock2handle); +EXPORT_SYMBOL(__ldlm_handle2lock); EXPORT_SYMBOL(ldlm_lock_put); EXPORT_SYMBOL(ldlm_lock_match); +EXPORT_SYMBOL(ldlm_lock_cancel); EXPORT_SYMBOL(ldlm_lock_addref); EXPORT_SYMBOL(ldlm_lock_decref); +EXPORT_SYMBOL(ldlm_lock_decref_and_cancel); EXPORT_SYMBOL(ldlm_lock_change_resource); EXPORT_SYMBOL(ldlm_lock_set_data); +EXPORT_SYMBOL(ldlm_it2str); +EXPORT_SYMBOL(ldlm_lock_dump); +EXPORT_SYMBOL(ldlm_lock_dump_handle); +EXPORT_SYMBOL(ldlm_cancel_locks_for_export); +EXPORT_SYMBOL(ldlm_reprocess_all_ns); + +/* ldlm_request.c */ +EXPORT_SYMBOL(ldlm_completion_ast); +EXPORT_SYMBOL(ldlm_expired_completion_wait); EXPORT_SYMBOL(ldlm_cli_convert); EXPORT_SYMBOL(ldlm_cli_enqueue); EXPORT_SYMBOL(ldlm_cli_cancel); EXPORT_SYMBOL(ldlm_cli_cancel_unused); EXPORT_SYMBOL(ldlm_match_or_enqueue); -EXPORT_SYMBOL(ldlm_it2str); +EXPORT_SYMBOL(ldlm_replay_locks); +EXPORT_SYMBOL(ldlm_resource_foreach); +EXPORT_SYMBOL(ldlm_namespace_foreach); +EXPORT_SYMBOL(ldlm_namespace_foreach_res); + +/* ldlm_lockd.c */ +EXPORT_SYMBOL(ldlm_server_blocking_ast); +EXPORT_SYMBOL(ldlm_server_completion_ast); +EXPORT_SYMBOL(ldlm_handle_enqueue); +EXPORT_SYMBOL(ldlm_handle_cancel); +EXPORT_SYMBOL(ldlm_handle_convert); +EXPORT_SYMBOL(ldlm_del_waiting_lock); + +#if 0 +/* ldlm_test.c */ EXPORT_SYMBOL(ldlm_test); EXPORT_SYMBOL(ldlm_regression_start); EXPORT_SYMBOL(ldlm_regression_stop); -EXPORT_SYMBOL(ldlm_lock_dump); -EXPORT_SYMBOL(ldlm_lock_dump_handle); +#endif + +/* ldlm_resource.c */ EXPORT_SYMBOL(ldlm_namespace_new); 
EXPORT_SYMBOL(ldlm_namespace_cleanup); EXPORT_SYMBOL(ldlm_namespace_free); EXPORT_SYMBOL(ldlm_namespace_dump); -EXPORT_SYMBOL(ldlm_cancel_locks_for_export); -EXPORT_SYMBOL(ldlm_replay_locks); -EXPORT_SYMBOL(ldlm_resource_foreach); -EXPORT_SYMBOL(ldlm_reprocess_all_ns); -EXPORT_SYMBOL(ldlm_namespace_foreach); -EXPORT_SYMBOL(ldlm_namespace_foreach_res); + +/* l_lock.c */ EXPORT_SYMBOL(l_lock); EXPORT_SYMBOL(l_unlock); diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c index b71dd20..44122f5 100644 --- a/lustre/ldlm/ldlm_request.c +++ b/lustre/ldlm/ldlm_request.c @@ -1,7 +1,7 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (C) 2002 Cluster File Systems, Inc. + * Copyright (C) 2002, 2003 Cluster File Systems, Inc. * * This file is part of Lustre, http://www.lustre.org. * @@ -47,14 +47,14 @@ int ldlm_expired_completion_wait(void *data) else { LDLM_DEBUG(lock, "timed out waiting for completion"); CERROR("lock %p timed out from %s\n", lock, - conn->c_remote_uuid); + conn->c_remote_uuid.uuid); ldlm_lock_dump(D_ERROR, lock); class_signal_connection_failure(conn); } RETURN(0); } -int ldlm_completion_ast(struct ldlm_lock *lock, int flags) +int ldlm_completion_ast(struct ldlm_lock *lock, int flags, void *data) { struct l_wait_info lwi = LWI_TIMEOUT_INTR(obd_timeout * HZ, ldlm_expired_completion_wait, @@ -102,7 +102,7 @@ int ldlm_completion_ast(struct ldlm_lock *lock, int flags) static int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, struct lustre_handle *parent_lockh, - __u64 *res_id, + struct ldlm_res_id res_id, __u32 type, void *cookie, int cookielen, ldlm_mode_t mode, @@ -110,7 +110,7 @@ static int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, ldlm_completion_callback completion, ldlm_blocking_callback blocking, void *data, - __u32 data_len, + void *cp_data, struct lustre_handle *lockh) { struct ldlm_lock *lock; @@ -122,17 +122,17 @@ static int ldlm_cli_enqueue_local(struct 
ldlm_namespace *ns, LBUG(); } - lock = ldlm_lock_create(ns, parent_lockh, res_id, type, mode, data, - data_len); + lock = ldlm_lock_create(ns, parent_lockh, res_id, type, mode, + data, cp_data); if (!lock) GOTO(out_nolock, err = -ENOMEM); LDLM_DEBUG(lock, "client-side local enqueue handler, new lock created"); ldlm_lock_addref_internal(lock, mode); ldlm_lock2handle(lock, lockh); - lock->l_connh = NULL; + lock->l_flags |= LDLM_FL_LOCAL; - err = ldlm_lock_enqueue(ns, lock, cookie, cookielen, flags, completion, + err = ldlm_lock_enqueue(ns, &lock, cookie, cookielen, flags, completion, blocking); if (err != ELDLM_OK) GOTO(out, err); @@ -140,13 +140,13 @@ static int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, if (type == LDLM_EXTENT) memcpy(cookie, &lock->l_extent, sizeof(lock->l_extent)); if ((*flags) & LDLM_FL_LOCK_CHANGED) - memcpy(res_id, lock->l_resource->lr_name, sizeof(*res_id)); + memcpy(&res_id, &lock->l_resource->lr_name, sizeof(res_id)); LDLM_DEBUG_NOLOCK("client-side local enqueue handler END (lock %p)", lock); if (lock->l_completion_ast) - lock->l_completion_ast(lock, *flags); + lock->l_completion_ast(lock, *flags, NULL); LDLM_DEBUG(lock, "client-side local enqueue END"); EXIT; @@ -160,7 +160,7 @@ int ldlm_cli_enqueue(struct lustre_handle *connh, struct ptlrpc_request *req, struct ldlm_namespace *ns, struct lustre_handle *parent_lock_handle, - __u64 *res_id, + struct ldlm_res_id res_id, __u32 type, void *cookie, int cookielen, ldlm_mode_t mode, @@ -168,7 +168,7 @@ int ldlm_cli_enqueue(struct lustre_handle *connh, ldlm_completion_callback completion, ldlm_blocking_callback blocking, void *data, - __u32 data_len, + void *cp_data, struct lustre_handle *lockh) { struct ldlm_lock *lock; @@ -180,11 +180,13 @@ int ldlm_cli_enqueue(struct lustre_handle *connh, is_replay = *flags & LDLM_FL_REPLAY; LASSERT(connh != NULL || !is_replay); - if (connh == NULL) - return ldlm_cli_enqueue_local(ns, parent_lock_handle, res_id, - type, cookie, cookielen, mode, - flags, 
completion, blocking, data, - data_len, lockh); + if (connh == NULL) { + rc = ldlm_cli_enqueue_local(ns, parent_lock_handle, res_id, + type, cookie, cookielen, mode, + flags, completion, blocking, data, + cp_data, lockh); + RETURN(rc); + } /* If we're replaying this lock, just check some invariants. * If we're creating a new lock, get everything all setup nice. */ @@ -194,9 +196,14 @@ int ldlm_cli_enqueue(struct lustre_handle *connh, LASSERT(connh == lock->l_connh); } else { lock = ldlm_lock_create(ns, parent_lock_handle, res_id, type, - mode, data, data_len); + mode, data, cp_data); if (lock == NULL) GOTO(out_nolock, rc = -ENOMEM); + /* ugh. I set this early (instead of waiting for _enqueue) + * because the completion AST might arrive early, and we need + * (in just this one case) to run the completion_cb even if it + * arrives before the reply. */ + lock->l_completion_ast = completion; LDLM_DEBUG(lock, "client-side enqueue START"); /* for the local lock, add the reference */ ldlm_lock_addref_internal(lock, mode); @@ -240,9 +247,12 @@ int ldlm_cli_enqueue(struct lustre_handle *connh, LASSERT(!is_replay); LDLM_DEBUG(lock, "client-side enqueue END (%s)", rc == ELDLM_LOCK_ABORTED ? "ABORTED" : "FAILED"); + /* Set a flag to prevent us from sending a CANCEL (bug 407) */ + l_lock(&ns->ns_lock); + lock->l_flags |= LDLM_FL_CANCELING; + l_unlock(&ns->ns_lock); + ldlm_lock_decref(lockh, mode); - /* FIXME: if we've already received a completion AST, this will - * LBUG! 
*/ ldlm_lock_destroy(lock); GOTO(out_req, rc); } @@ -276,12 +286,12 @@ int ldlm_cli_enqueue(struct lustre_handle *connh, lock->l_req_mode = newmode; } - if (reply->lock_resource_name[0] != - lock->l_resource->lr_name[0]) { + if (reply->lock_resource_name.name[0] != + lock->l_resource->lr_name.name[0]) { CDEBUG(D_INFO, "remote intent success, locking %ld " "instead of %ld\n", - (long)reply->lock_resource_name[0], - (long)lock->l_resource->lr_name[0]); + (long)reply->lock_resource_name.name[0], + (long)lock->l_resource->lr_name.name[0]); ldlm_lock_change_resource(ns, lock, reply->lock_resource_name); @@ -294,10 +304,13 @@ int ldlm_cli_enqueue(struct lustre_handle *connh, } if (!is_replay) { - rc = ldlm_lock_enqueue(ns, lock, cookie, cookielen, flags, + l_lock(&ns->ns_lock); + lock->l_completion_ast = NULL; + rc = ldlm_lock_enqueue(ns, &lock, cookie, cookielen, flags, completion, blocking); + l_unlock(&ns->ns_lock); if (lock->l_completion_ast) - lock->l_completion_ast(lock, *flags); + lock->l_completion_ast(lock, *flags, NULL); } LDLM_DEBUG(lock, "client-side enqueue END"); @@ -315,7 +328,7 @@ int ldlm_match_or_enqueue(struct lustre_handle *connh, struct ptlrpc_request *req, struct ldlm_namespace *ns, struct lustre_handle *parent_lock_handle, - __u64 *res_id, + struct ldlm_res_id res_id, __u32 type, void *cookie, int cookielen, ldlm_mode_t mode, @@ -323,30 +336,39 @@ int ldlm_match_or_enqueue(struct lustre_handle *connh, ldlm_completion_callback completion, ldlm_blocking_callback blocking, void *data, - __u32 data_len, + void *cp_data, struct lustre_handle *lockh) { int rc; ENTRY; - rc = ldlm_lock_match(ns, res_id, type, cookie, cookielen, mode, lockh); + if (connh == NULL) { + /* Just to make sure that I understand things --phil */ + LASSERT(*flags & LDLM_FL_LOCAL_ONLY); + } + + LDLM_DEBUG_NOLOCK("resource "LPU64"/"LPU64, res_id.name[0], + res_id.name[1]); + rc = ldlm_lock_match(ns, *flags, &res_id, type, cookie, cookielen, mode, + lockh); if (rc == 0) { - rc = 
ldlm_cli_enqueue(connh, req, ns, - parent_lock_handle, res_id, type, cookie, - cookielen, mode, flags, completion, - blocking, data, data_len, lockh); + rc = ldlm_cli_enqueue(connh, req, ns, parent_lock_handle, + res_id, type, cookie, cookielen, mode, + flags, completion, blocking, data, + cp_data, lockh); if (rc != ELDLM_OK) CERROR("ldlm_cli_enqueue: err: %d\n", rc); RETURN(rc); - } else - RETURN(0); + } + RETURN(0); } int ldlm_cli_replay_enqueue(struct ldlm_lock *lock) { struct lustre_handle lockh; + struct ldlm_res_id junk; int flags = LDLM_FL_REPLAY; ldlm_lock2handle(lock, &lockh); - return ldlm_cli_enqueue(lock->l_connh, NULL, NULL, NULL, NULL, + return ldlm_cli_enqueue(lock->l_connh, NULL, NULL, NULL, junk, lock->l_resource->lr_type, NULL, 0, -1, &flags, NULL, NULL, NULL, 0, &lockh); } @@ -421,7 +443,7 @@ int ldlm_cli_convert(struct lustre_handle *lockh, int new_mode, int *flags) /* Go to sleep until the lock is granted. */ /* FIXME: or cancelled. */ if (lock->l_completion_ast) - lock->l_completion_ast(lock, LDLM_FL_WAIT_NOREPROC); + lock->l_completion_ast(lock, LDLM_FL_WAIT_NOREPROC, NULL); EXIT; out: LDLM_LOCK_PUT(lock); @@ -443,13 +465,22 @@ int ldlm_cli_cancel(struct lustre_handle *lockh) RETURN(0); if (lock->l_connh) { + int local_only; + LDLM_DEBUG(lock, "client-side cancel"); /* Set this flag to prevent others from getting new references*/ l_lock(&lock->l_resource->lr_namespace->ns_lock); lock->l_flags |= LDLM_FL_CBPENDING; ldlm_cancel_callback(lock); + local_only = (lock->l_flags & LDLM_FL_LOCAL_ONLY); l_unlock(&lock->l_resource->lr_namespace->ns_lock); + if (local_only) { + CDEBUG(D_INFO, "not sending request (at caller's " + "instruction\n"); + goto local_cancel; + } + req = ptlrpc_prep_req(class_conn2cliimp(lock->l_connh), LDLM_CANCEL, 1, &size, NULL); if (!req) @@ -467,9 +498,14 @@ int ldlm_cli_cancel(struct lustre_handle *lockh) rc = ptlrpc_queue_wait(req); ptlrpc_req_finished(req); + if (rc == ESTALE) { + CERROR("client/server out of sync\n"); + 
LBUG(); + } if (rc != ELDLM_OK) - GOTO(out, rc); - + CERROR("Got rc %d from cancel RPC: canceling " + "anyway\n", rc); + local_cancel: ldlm_lock_cancel(lock); } else { LDLM_DEBUG(lock, "client-side local cancel"); @@ -482,8 +518,6 @@ int ldlm_cli_cancel(struct lustre_handle *lockh) LDLM_DEBUG(lock, "client-side local cancel handler END"); } - lock->l_flags |= LDLM_FL_CANCELING; - EXIT; out: LDLM_LOCK_PUT(lock); @@ -549,7 +583,7 @@ int ldlm_cancel_lru(struct ldlm_namespace *ns) } int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns, - __u64 *res_id, int flags) + struct ldlm_res_id res_id, int flags) { struct ldlm_resource *res; struct list_head *tmp, *next, list = LIST_HEAD_INIT(list); @@ -559,7 +593,7 @@ int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns, res = ldlm_resource_get(ns, NULL, res_id, 0, 0); if (res == NULL) { /* This is not a problem. */ - CDEBUG(D_INFO, "No resource "LPU64"\n", res_id[0]); + CDEBUG(D_INFO, "No resource "LPU64"\n", res_id.name[0]); RETURN(0); } @@ -615,8 +649,8 @@ int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns, * * If 'local_only' is true, throw the locks away without trying to notify the * server. 
*/ -int ldlm_cli_cancel_unused(struct ldlm_namespace *ns, __u64 *res_id, - int flags) +int ldlm_cli_cancel_unused(struct ldlm_namespace *ns, + struct ldlm_res_id *res_id, int flags) { int i; ENTRY; @@ -625,7 +659,7 @@ int ldlm_cli_cancel_unused(struct ldlm_namespace *ns, __u64 *res_id, RETURN(ELDLM_OK); if (res_id) - RETURN(ldlm_cli_cancel_unused_resource(ns, res_id, flags)); + RETURN(ldlm_cli_cancel_unused_resource(ns, *res_id, flags)); l_lock(&ns->ns_lock); for (i = 0; i < RES_HASH_SIZE; i++) { @@ -641,7 +675,7 @@ int ldlm_cli_cancel_unused(struct ldlm_namespace *ns, __u64 *res_id, if (rc) CERROR("cancel_unused_res ("LPU64"): %d\n", - res->lr_name[0], rc); + res->lr_name.name[0], rc); ldlm_resource_putref(res); } } diff --git a/lustre/ldlm/ldlm_resource.c b/lustre/ldlm/ldlm_resource.c index e5960bd..9e757a6 100644 --- a/lustre/ldlm/ldlm_resource.c +++ b/lustre/ldlm/ldlm_resource.c @@ -1,12 +1,24 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (C) 2002 Cluster File Systems, Inc. + * Copyright (C) 2002, 2003 Cluster File Systems, Inc. + * Author: Phil Schwan + * Author: Peter Braam * - * This code is issued under the GNU General Public License. - * See the file COPYING in this distribution + * This file is part of Lustre, http://www.lustre.org. * - * by Cluster File Systems, Inc. + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #define DEBUG_SUBSYSTEM S_LDLM @@ -22,58 +34,59 @@ static struct proc_dir_entry *ldlm_ns_proc_dir = NULL; int ldlm_proc_setup(struct obd_device *obd) { + int rc; ENTRY; LASSERT(ldlm_ns_proc_dir == NULL); - ldlm_ns_proc_dir = obd->obd_type->typ_procroot; + rc = lprocfs_obd_attach(obd, 0); + if (rc) { + CERROR("LProcFS failed in ldlm-init\n"); + RETURN(rc); + } + ldlm_ns_proc_dir = obd->obd_proc_entry; RETURN(0); } void ldlm_proc_cleanup(struct obd_device *obd) { - ldlm_ns_proc_dir = NULL; + if (ldlm_ns_proc_dir) { + lprocfs_obd_detach(obd); + ldlm_ns_proc_dir = NULL; + } } static int lprocfs_uint_rd(char *page, char **start, off_t off, int count, int *eof, void *data) { unsigned int *temp = (unsigned int *)data; - int len; - len = snprintf(page, count, "%u\n", *temp); - return len; + return snprintf(page, count, "%u\n", *temp); } -#define MAX_STRING_SIZE 100 +#define MAX_STRING_SIZE 128 void ldlm_proc_namespace(struct ldlm_namespace *ns) { struct lprocfs_vars lock_vars[2]; - char lock_names[MAX_STRING_SIZE + 1]; + char lock_name[MAX_STRING_SIZE + 1]; + + lock_name[MAX_STRING_SIZE] = '\0'; memset(lock_vars, 0, sizeof(lock_vars)); - snprintf(lock_names, MAX_STRING_SIZE, "%s/resource_count", ns->ns_name); - lock_names[MAX_STRING_SIZE] = '\0'; - lock_vars[0].name = lock_names; - lock_vars[0].read_fptr = lprocfs_ll_rd; - lock_vars[0].write_fptr = NULL; + lock_vars[0].read_fptr = lprocfs_rd_u64; + + lock_vars[0].name = lock_name; + + snprintf(lock_name, MAX_STRING_SIZE, "%s/resource_count", ns->ns_name); + lock_vars[0].data = &ns->ns_resources; lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0); - memset(lock_vars, 0, sizeof(lock_vars)); - snprintf(lock_names, MAX_STRING_SIZE, "%s/lock_count", ns->ns_name); - lock_names[MAX_STRING_SIZE] = '\0'; - lock_vars[0].name = lock_names; - 
lock_vars[0].read_fptr = lprocfs_ll_rd; - lock_vars[0].write_fptr = NULL; + snprintf(lock_name, MAX_STRING_SIZE, "%s/lock_count", ns->ns_name); lock_vars[0].data = &ns->ns_locks; lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0); - memset(lock_vars, 0, sizeof(lock_vars)); - snprintf(lock_names, MAX_STRING_SIZE, "%s/lock_unused_count", + snprintf(lock_name, MAX_STRING_SIZE, "%s/lock_unused_count", ns->ns_name); - lock_names[MAX_STRING_SIZE] = '\0'; - lock_vars[0].name = lock_names; - lock_vars[0].read_fptr = lprocfs_uint_rd; - lock_vars[0].write_fptr = NULL; lock_vars[0].data = &ns->ns_nr_unused; + lock_vars[0].read_fptr = lprocfs_uint_rd; lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0); } #undef MAX_STRING_SIZE @@ -136,7 +149,9 @@ extern struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock); /* If 'local_only' is true, don't try to tell the server, just cleanup. * This is currently only used for recovery, and we make certain assumptions - * as a result--notably, that we shouldn't cancel locks with refs. -phil */ + * as a result--notably, that we shouldn't cancel locks with refs. -phil + * + * Called with the ns_lock held. */ static void cleanup_resource(struct ldlm_resource *res, struct list_head *q, int local_only) { @@ -156,7 +171,9 @@ static void cleanup_resource(struct ldlm_resource *res, struct list_head *q, * will go away ... */ lock->l_flags |= LDLM_FL_CBPENDING; /* ... without sending a CANCEL message. */ - lock->l_flags |= LDLM_FL_CANCELING; + lock->l_flags |= LDLM_FL_LOCAL_ONLY; + /* ... 
and without calling the cancellation callback */ + lock->l_flags |= LDLM_FL_CANCEL; LDLM_LOCK_PUT(lock); continue; } @@ -177,7 +194,7 @@ static void cleanup_resource(struct ldlm_resource *res, struct list_head *q, ldlm_lock_cancel(lock); } else { LDLM_DEBUG(lock, "Freeing a lock still held by a " - "client node.\n"); + "client node"); ldlm_resource_unlink_lock(lock); ldlm_lock_destroy(lock); @@ -256,13 +273,13 @@ int ldlm_client_free(struct obd_export *exp) RETURN(0); } -static __u32 ldlm_hash_fn(struct ldlm_resource *parent, __u64 *name) +static __u32 ldlm_hash_fn(struct ldlm_resource *parent, struct ldlm_res_id name) { __u32 hash = 0; int i; for (i = 0; i < RES_NAME_SIZE; i++) - hash += name[i]; + hash += name.name[i]; hash += (__u32)((unsigned long)parent >> 4); @@ -293,9 +310,9 @@ static struct ldlm_resource *ldlm_resource_new(void) /* Args: locked namespace * Returns: newly-allocated, referenced, unlocked resource */ -static struct ldlm_resource *ldlm_resource_add(struct ldlm_namespace *ns, - struct ldlm_resource *parent, - __u64 *name, __u32 type) +static struct ldlm_resource * +ldlm_resource_add(struct ldlm_namespace *ns, struct ldlm_resource *parent, + struct ldlm_res_id name, __u32 type) { struct list_head *bucket; struct ldlm_resource *res; @@ -317,7 +334,7 @@ static struct ldlm_resource *ldlm_resource_add(struct ldlm_namespace *ns, spin_unlock(&ns->ns_counter_lock); l_lock(&ns->ns_lock); - memcpy(res->lr_name, name, sizeof(res->lr_name)); + memcpy(&res->lr_name, &name, sizeof(res->lr_name)); res->lr_namespace = ns; ns->ns_refcount++; @@ -341,9 +358,9 @@ static struct ldlm_resource *ldlm_resource_add(struct ldlm_namespace *ns, /* Args: unlocked namespace * Locks: takes and releases ns->ns_lock and res->lr_lock * Returns: referenced, unlocked ldlm_resource or NULL */ -struct ldlm_resource *ldlm_resource_get(struct ldlm_namespace *ns, - struct ldlm_resource *parent, - __u64 *name, __u32 type, int create) +struct ldlm_resource * +ldlm_resource_get(struct 
ldlm_namespace *ns, struct ldlm_resource *parent, + struct ldlm_res_id name, __u32 type, int create) { struct list_head *bucket, *tmp; struct ldlm_resource *res = NULL; @@ -358,7 +375,7 @@ struct ldlm_resource *ldlm_resource_get(struct ldlm_namespace *ns, list_for_each(tmp, bucket) { res = list_entry(tmp, struct ldlm_resource, lr_hash); - if (memcmp(res->lr_name, name, sizeof(res->lr_name)) == 0) { + if (memcmp(&res->lr_name, &name, sizeof(res->lr_name)) == 0) { ldlm_resource_getref(res); l_unlock(&ns->ns_lock); RETURN(res); @@ -451,12 +468,17 @@ void ldlm_resource_add_lock(struct ldlm_resource *res, struct list_head *head, l_lock(&res->lr_namespace->ns_lock); ldlm_resource_dump(res); - CDEBUG(D_OTHER, "About to grant this lock:\n"); + CDEBUG(D_OTHER, "About to add this lock:\n"); ldlm_lock_dump(D_OTHER, lock); + if (lock->l_destroyed) { + CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n"); + return; + } + LASSERT(list_empty(&lock->l_res_link)); - list_add(&lock->l_res_link, head); + list_add_tail(&lock->l_res_link, head); l_unlock(&res->lr_namespace->ns_lock); } @@ -470,7 +492,7 @@ void ldlm_resource_unlink_lock(struct ldlm_lock *lock) void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc) { desc->lr_type = res->lr_type; - memcpy(desc->lr_name, res->lr_name, sizeof(desc->lr_name)); + memcpy(&desc->lr_name, &res->lr_name, sizeof(desc->lr_name)); memcpy(desc->lr_version, res->lr_version, sizeof(desc->lr_version)); } @@ -517,9 +539,9 @@ void ldlm_resource_dump(struct ldlm_resource *res) LBUG(); snprintf(name, sizeof(name), "%Lx %Lx %Lx", - (unsigned long long)res->lr_name[0], - (unsigned long long)res->lr_name[1], - (unsigned long long)res->lr_name[2]); + (unsigned long long)res->lr_name.name[0], + (unsigned long long)res->lr_name.name[1], + (unsigned long long)res->lr_name.name[2]); CDEBUG(D_OTHER, "--- Resource: %p (%s) (rc: %d)\n", res, name, atomic_read(&res->lr_refcount)); diff --git a/lustre/ldlm/ldlm_test.c 
b/lustre/ldlm/ldlm_test.c index b34c9ab..6cf1056 100644 --- a/lustre/ldlm/ldlm_test.c +++ b/lustre/ldlm/ldlm_test.c @@ -75,7 +75,7 @@ static int ldlm_do_convert(void); */ static int ldlm_test_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *new, - void *data, __u32 data_len, int flag) + void *data, int flag) { int rc; struct lustre_handle lockh; @@ -104,7 +104,7 @@ static int ldlm_test_blocking_ast(struct ldlm_lock *lock, /* blocking ast for basic tests. noop */ static int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *new, - void *data, __u32 data_len, int flag) + void *data, int flag) { ENTRY; CERROR("ldlm_blocking_ast: lock=%p, new=%p, flag=%d\n", lock, new, @@ -115,7 +115,7 @@ static int ldlm_blocking_ast(struct ldlm_lock *lock, /* Completion ast for regression test. * Does not sleep when blocked. */ -static int ldlm_test_completion_ast(struct ldlm_lock *lock, int flags) +static int ldlm_test_completion_ast(struct ldlm_lock *lock, int flags, void *data) { struct ldlm_test_lock *lock_info; ENTRY; @@ -159,7 +159,7 @@ int ldlm_test_basics(struct obd_device *obddev) { struct ldlm_namespace *ns; struct ldlm_resource *res; - __u64 res_id[RES_NAME_SIZE] = {1, 2, 3}; + struct ldlm_res_id res_id = { .name = {1, 2, 3} }; ldlm_error_t err; struct ldlm_lock *lock1, *lock; int flags; @@ -207,7 +207,7 @@ int ldlm_test_extents(struct obd_device *obddev) struct ldlm_namespace *ns; struct ldlm_resource *res; struct ldlm_lock *lock, *lock1, *lock2; - __u64 res_id[RES_NAME_SIZE] = {0, 0, 0}; + struct ldlm_res_id res_id = { .name = {0} }; struct ldlm_extent ext1 = {4, 6}, ext2 = {6, 9}, ext3 = {10, 11}; ldlm_error_t err; int flags; @@ -275,8 +275,7 @@ int ldlm_test_extents(struct obd_device *obddev) static int ldlm_test_network(struct obd_device *obddev, struct lustre_handle *connh) { - - __u64 res_id[RES_NAME_SIZE] = {1, 2, 3}; + struct ldlm_res_id res_id = { .name = {1, 2, 3} }; struct ldlm_extent ext = {4, 6}; struct lustre_handle lockh1; struct 
ldlm_lock *lock; @@ -341,7 +340,7 @@ static int ldlm_do_decrement(void) static int ldlm_do_enqueue(struct ldlm_test_thread *thread) { struct lustre_handle lockh; - __u64 res_id[3] = {0}; + struct ldlm_res_id res_id = { .name = {0} }; __u32 lock_mode; struct ldlm_extent ext; unsigned char random; @@ -350,7 +349,7 @@ static int ldlm_do_enqueue(struct ldlm_test_thread *thread) /* Pick a random resource from 1 to num_resources */ get_random_bytes(&random, sizeof(random)); - res_id[0] = random % num_resources; + res_id.name[0] = random % num_resources; /* Pick a random lock mode */ get_random_bytes(&random, sizeof(random)); @@ -364,7 +363,7 @@ static int ldlm_do_enqueue(struct ldlm_test_thread *thread) (num_extents - (int)ext.start) + ext.start; LDLM_DEBUG_NOLOCK("about to enqueue with resource "LPX64", mode %d," - " extent "LPX64" -> "LPX64, res_id[0], lock_mode, + " extent "LPX64" -> "LPX64, res_id.name[0], lock_mode, ext.start, ext.end); rc = ldlm_match_or_enqueue(&regress_connh, NULL, diff --git a/lustre/lib/client.c b/lustre/lib/client.c index 5bf0d4a..122142b 100644 --- a/lustre/lib/client.c +++ b/lustre/lib/client.c @@ -1,7 +1,7 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Copyright (C) 2001-2003 Cluster File Systems, Inc. * Author: Peter J. 
Braam * Author: Phil Schwan * Author: Mike Shaver @@ -40,7 +40,7 @@ struct client_obd *client_conn2cli(struct lustre_handle *conn) return &export->exp_obd->u.cli; } -struct obd_device *client_tgtuuid2obd(char *tgtuuid) +struct obd_device *client_tgtuuid2obd(struct obd_uuid *tgtuuid) { int i; @@ -49,8 +49,8 @@ struct obd_device *client_tgtuuid2obd(char *tgtuuid) if ((strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) == 0) || (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0)) { struct client_obd *cli = &obd->u.cli; - if (strncmp(tgtuuid, cli->cl_target_uuid, - sizeof(cli->cl_target_uuid)) == 0) + if (strncmp(tgtuuid->uuid, cli->cl_target_uuid.uuid, + sizeof(cli->cl_target_uuid.uuid)) == 0) return obd; } } @@ -65,7 +65,7 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf) char *name; struct client_obd *cli = &obddev->u.cli; struct obd_import *imp = &cli->cl_import; - obd_uuid_t server_uuid; + struct obd_uuid server_uuid; ENTRY; if (obddev->obd_type->typ_ops->o_brw) { @@ -100,11 +100,11 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf) sema_init(&cli->cl_sem, 1); cli->cl_conn_count = 0; - memcpy(cli->cl_target_uuid, data->ioc_inlbuf1, data->ioc_inllen1); - memcpy(server_uuid, data->ioc_inlbuf2, MIN(data->ioc_inllen2, + memcpy(cli->cl_target_uuid.uuid, data->ioc_inlbuf1, data->ioc_inllen1); + memcpy(server_uuid.uuid, data->ioc_inlbuf2, MIN(data->ioc_inllen2, sizeof(server_uuid))); - imp->imp_connection = ptlrpc_uuid_to_connection(server_uuid); + imp->imp_connection = ptlrpc_uuid_to_connection(&server_uuid); if (!imp->imp_connection) RETURN(-ENOENT); @@ -134,17 +134,18 @@ int client_obd_cleanup(struct obd_device * obddev) } int client_obd_connect(struct lustre_handle *conn, struct obd_device *obd, - obd_uuid_t cluuid, struct recovd_obd *recovd, + struct obd_uuid *cluuid, struct recovd_obd *recovd, ptlrpc_recovery_cb_t recover) { struct client_obd *cli = &obd->u.cli; struct ptlrpc_request *request; int rc, size[] = 
{sizeof(cli->cl_target_uuid), sizeof(obd->obd_uuid) }; - char *tmp[] = {cli->cl_target_uuid, obd->obd_uuid}; + char *tmp[] = {cli->cl_target_uuid.uuid, obd->obd_uuid.uuid}; int rq_opc = (obd->obd_type->typ_ops->o_brw) ? OST_CONNECT :MDS_CONNECT; struct ptlrpc_connection *c; struct obd_import *imp = &cli->cl_import; + int msg_flags; ENTRY; down(&cli->cl_sem); @@ -166,7 +167,6 @@ int client_obd_connect(struct lustre_handle *conn, struct obd_device *obd, INIT_LIST_HEAD(&imp->imp_chain); imp->imp_last_xid = 0; imp->imp_max_transno = 0; - imp->imp_peer_last_xid = 0; imp->imp_peer_committed_transno = 0; request = ptlrpc_prep_req(&cli->cl_import, rq_opc, 2, size, tmp); @@ -187,8 +187,11 @@ int client_obd_connect(struct lustre_handle *conn, struct obd_device *obd, if (rc) GOTO(out_req, rc); - if (rq_opc == MDS_CONNECT) + msg_flags = lustre_msg_get_op_flags(request->rq_repmsg); + if (rq_opc == MDS_CONNECT || msg_flags & MSG_CONNECT_REPLAYABLE) { imp->imp_flags |= IMP_REPLAYABLE; + CDEBUG(D_HA, "connected to replayable target: %s\n", cli->cl_target_uuid.uuid); + } imp->imp_level = LUSTRE_CONN_FULL; imp->imp_handle.addr = request->rq_repmsg->addr; imp->imp_handle.cookie = request->rq_repmsg->cookie; @@ -248,10 +251,12 @@ int client_obd_disconnect(struct lustre_handle *conn) if (cli->cl_conn_count) GOTO(out_no_disconnect, rc = 0); - ldlm_namespace_free(obd->obd_namespace); - obd->obd_namespace = NULL; - request = ptlrpc_prep_req(&cli->cl_import, rq_opc, 0, NULL, - NULL); + if (obd->obd_namespace != NULL) { + ldlm_cli_cancel_unused(obd->obd_namespace, NULL, 0); + ldlm_namespace_free(obd->obd_namespace); + obd->obd_namespace = NULL; + } + request = ptlrpc_prep_req(&cli->cl_import, rq_opc, 0, NULL, NULL); if (!request) GOTO(out_req, rc = -ENOMEM); diff --git a/lustre/lib/mds_updates.c b/lustre/lib/mds_updates.c index 6a53cb6..4d7f37a 100644 --- a/lustre/lib/mds_updates.c +++ b/lustre/lib/mds_updates.c @@ -1,14 +1,24 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- 
* vim:expandtab:shiftwidth=8:tabstop=8: * - * Copryright (C) 2002 Cluster File Systems, Inc. + * Lustre Lite Update Records * - * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * Copyright (c) 2002, 2003 Cluster File Systems, Inc. * - * This code is issued under the GNU General Public License. - * See the file COPYING in this distribution + * This file is part of Lustre, http://www.lustre.org. * - * Lustre Lite Update Records + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ #include @@ -20,7 +30,7 @@ #include #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) #include // for wait_on_buffer -#else +#else #include // for wait_on_buffer #endif #include @@ -52,20 +62,23 @@ void mds_pack_inode2fid(struct ll_fid *fid, struct inode *inode) void mds_pack_inode2body(struct mds_body *b, struct inode *inode) { b->valid = OBD_MD_FLID | OBD_MD_FLATIME | OBD_MD_FLMTIME | - OBD_MD_FLCTIME | OBD_MD_FLSIZE | OBD_MD_FLUID | OBD_MD_FLGID | - OBD_MD_FLTYPE | OBD_MD_FLMODE | OBD_MD_FLNLINK | OBD_MD_FLGENER; + OBD_MD_FLCTIME | OBD_MD_FLSIZE | OBD_MD_FLBLOCKS | + OBD_MD_FLUID | OBD_MD_FLGID | OBD_MD_FLTYPE | OBD_MD_FLMODE | + OBD_MD_FLNLINK | OBD_MD_FLGENER; b->ino = HTON__u32(inode->i_ino); b->atime = HTON__u32(inode->i_atime); b->mtime = HTON__u32(inode->i_mtime); b->ctime = HTON__u32(inode->i_ctime); b->mode = HTON__u32(inode->i_mode); b->size = HTON__u64(inode->i_size); + b->blocks = HTON__u64(inode->i_blocks); b->uid = HTON__u32(inode->i_uid); b->gid = HTON__u32(inode->i_gid); b->flags = HTON__u32(inode->i_flags); b->rdev = HTON__u32(b->rdev); b->nlink = HTON__u32(inode->i_nlink); b->generation = HTON__u32(inode->i_generation); + b->suppgid = HTON__u32(-1); } @@ -100,11 +113,12 @@ static void mds_pack_body(struct mds_body *b) b->rdev = HTON__u32(b->rdev); b->nlink = HTON__u32(b->nlink); b->generation = HTON__u32(b->generation); + b->suppgid = HTON__u32(b->suppgid); } -void mds_getattr_pack(struct ptlrpc_request *req, int offset, - struct inode *inode, - const char *name, int namelen) +void mds_getattr_pack(struct ptlrpc_request *req, int valid, int offset, + int flags, + struct inode *inode, const char *name, int namelen) { struct mds_body *b; b = lustre_msg_buf(req->rq_reqmsg, offset); @@ -112,6 +126,12 @@ void mds_getattr_pack(struct ptlrpc_request *req, int offset, b->fsuid = HTON__u32(current->fsuid); b->fsgid = HTON__u32(current->fsgid); b->capability = HTON__u32(current->cap_effective); + b->valid = HTON__u32(valid); + b->flags = 
HTON__u32(flags); + if (in_group_p(inode->i_gid)) + b->suppgid = HTON__u32(inode->i_gid); + else + b->suppgid = HTON__u32(-1); ll_inode2fid(&b->fid1, inode); if (name) { @@ -122,7 +142,7 @@ void mds_getattr_pack(struct ptlrpc_request *req, int offset, } void mds_readdir_pack(struct ptlrpc_request *req, __u64 offset, - obd_id ino, int type) + obd_id ino, int type, __u64 xid) { struct mds_body *b; @@ -133,6 +153,8 @@ void mds_readdir_pack(struct ptlrpc_request *req, __u64 offset, b->fid1.id = HTON__u64(ino); b->fid1.f_type = HTON__u32(type); b->size = HTON__u64(offset); + b->suppgid = HTON__u32(-1); + b->blocks = HTON__u64(xid); } @@ -159,7 +181,6 @@ void mds_create_pack(struct ptlrpc_request *req, int offset, struct inode *dir, char *tmp; rec = lustre_msg_buf(req->rq_reqmsg, offset); - /* XXX do something about time, uid, gid */ rec->cr_opcode = HTON__u32(REINT_CREATE); rec->cr_fsuid = HTON__u32(current->fsuid); rec->cr_fsgid = HTON__u32(current->fsgid); @@ -180,34 +201,78 @@ void mds_create_pack(struct ptlrpc_request *req, int offset, struct inode *dir, LOGL0(data, datalen, tmp); } } +/* packing of MDS records */ +void mds_open_pack(struct ptlrpc_request *req, int offset, struct inode *dir, + __u32 mode, __u64 rdev, __u32 uid, __u32 gid, __u64 time, + __u32 flags, + const char *name, int namelen, + const void *data, int datalen) +{ + struct mds_rec_create *rec; + char *tmp; + rec = lustre_msg_buf(req->rq_reqmsg, offset); + + /* XXX do something about time, uid, gid */ + rec->cr_opcode = HTON__u32(REINT_OPEN); + rec->cr_fsuid = HTON__u32(current->fsuid); + rec->cr_fsgid = HTON__u32(current->fsgid); + rec->cr_cap = HTON__u32(current->cap_effective); + ll_inode2fid(&rec->cr_fid, dir); + memset(&rec->cr_replayfid, 0, sizeof(rec->cr_replayfid)); + rec->cr_mode = HTON__u32(mode); + rec->cr_flags = HTON__u32(flags); + rec->cr_rdev = HTON__u64(rdev); + rec->cr_uid = HTON__u32(uid); + rec->cr_gid = HTON__u32(gid); + rec->cr_time = HTON__u64(time); + if 
(in_group_p(dir->i_gid)) + rec->cr_suppgid = HTON__u32(dir->i_gid); + else + rec->cr_suppgid = HTON__u32(-1); + + tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1); + LOGL0(name, namelen, tmp); + + if (data) { + tmp = lustre_msg_buf(req->rq_reqmsg, offset + 2); + LOGL0(data, datalen, tmp); + } +} -void mds_setattr_pack(struct ptlrpc_request *req, int offset, +void mds_setattr_pack(struct ptlrpc_request *req, struct inode *inode, struct iattr *iattr, - const char *name, int namelen) + void *ea, int ealen) { - struct mds_rec_setattr *rec; - rec = lustre_msg_buf(req->rq_reqmsg, offset); + struct mds_rec_setattr *rec = lustre_msg_buf(req->rq_reqmsg, 0); rec->sa_opcode = HTON__u32(REINT_SETATTR); rec->sa_fsuid = HTON__u32(current->fsuid); rec->sa_fsgid = HTON__u32(current->fsgid); rec->sa_cap = HTON__u32(current->cap_effective); ll_inode2fid(&rec->sa_fid, inode); - rec->sa_valid = HTON__u32(iattr->ia_valid); - rec->sa_mode = HTON__u32(iattr->ia_mode); - rec->sa_uid = HTON__u32(iattr->ia_uid); - rec->sa_gid = HTON__u32(iattr->ia_gid); - rec->sa_size = HTON__u64(iattr->ia_size); - rec->sa_atime = HTON__u64(iattr->ia_atime); - rec->sa_mtime = HTON__u64(iattr->ia_mtime); - rec->sa_ctime = HTON__u64(iattr->ia_ctime); - rec->sa_attr_flags = HTON__u32(iattr->ia_attr_flags); - - if (namelen) { - char *tmp; - tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1); - LOGL0(name, namelen, tmp); + + if (iattr) { + rec->sa_valid = HTON__u32(iattr->ia_valid); + rec->sa_mode = HTON__u32(iattr->ia_mode); + rec->sa_uid = HTON__u32(iattr->ia_uid); + rec->sa_gid = HTON__u32(iattr->ia_gid); + rec->sa_size = HTON__u64(iattr->ia_size); + rec->sa_atime = HTON__u64(iattr->ia_atime); + rec->sa_mtime = HTON__u64(iattr->ia_mtime); + rec->sa_ctime = HTON__u64(iattr->ia_ctime); + rec->sa_attr_flags = HTON__u32(iattr->ia_attr_flags); + + if ((iattr->ia_valid & ATTR_GID) && in_group_p(iattr->ia_gid)) + rec->sa_suppgid = HTON__u32(iattr->ia_gid); + else if ((iattr->ia_valid & ATTR_MODE) && + 
in_group_p(inode->i_gid)) + rec->sa_suppgid = HTON__u32(inode->i_gid); + else + rec->sa_suppgid = HTON__u32(-1); } + + if (ealen) + memcpy(lustre_msg_buf(req->rq_reqmsg, 1), ea, ealen); } void mds_unlink_pack(struct ptlrpc_request *req, int offset, @@ -224,6 +289,10 @@ void mds_unlink_pack(struct ptlrpc_request *req, int offset, rec->ul_fsgid = HTON__u32(current->fsgid); rec->ul_cap = HTON__u32(current->cap_effective); rec->ul_mode = HTON__u32(mode); + if (in_group_p(inode->i_gid)) + rec->ul_suppgid = HTON__u32(inode->i_gid); + else + rec->ul_suppgid = HTON__u32(-1); ll_inode2fid(&rec->ul_fid1, inode); if (child) ll_inode2fid(&rec->ul_fid2, child); @@ -245,6 +314,10 @@ void mds_link_pack(struct ptlrpc_request *req, int offset, rec->lk_fsuid = HTON__u32(current->fsuid); rec->lk_fsgid = HTON__u32(current->fsgid); rec->lk_cap = HTON__u32(current->cap_effective); + if (in_group_p(dir->i_gid)) + rec->lk_suppgid = HTON__u32(dir->i_gid); + else + rec->lk_suppgid = HTON__u32(-1); ll_inode2fid(&rec->lk_fid1, inode); ll_inode2fid(&rec->lk_fid2, dir); @@ -294,6 +367,7 @@ void mds_unpack_body(struct mds_body *b) mds_unpack_fid(&b->fid1); mds_unpack_fid(&b->fid2); b->size = NTOH__u64(b->size); + b->blocks = NTOH__u64(b->blocks); b->valid = NTOH__u32(b->valid); b->fsuid = NTOH__u32(b->fsuid); b->fsgid = NTOH__u32(b->fsgid); @@ -309,6 +383,7 @@ void mds_unpack_body(struct mds_body *b) b->rdev = NTOH__u32(b->rdev); b->nlink = NTOH__u32(b->nlink); b->generation = NTOH__u32(b->generation); + b->suppgid = NTOH__u32(b->suppgid); } static int mds_setattr_unpack(struct ptlrpc_request *req, int offset, @@ -325,6 +400,7 @@ static int mds_setattr_unpack(struct ptlrpc_request *req, int offset, r->ur_fsuid = NTOH__u32(rec->sa_fsuid); r->ur_fsgid = NTOH__u32(rec->sa_fsgid); r->ur_cap = NTOH__u32(rec->sa_cap); + r->ur_suppgid = NTOH__u32(rec->sa_suppgid); r->ur_fid1 = &rec->sa_fid; attr->ia_valid = NTOH__u32(rec->sa_valid); attr->ia_mode = NTOH__u32(rec->sa_mode); @@ -339,8 +415,9 @@ static 
int mds_setattr_unpack(struct ptlrpc_request *req, int offset, if (req->rq_reqmsg->bufcount == offset + 2) { r->ur_namelen = req->rq_reqmsg->buflens[offset + 1]; r->ur_name = lustre_msg_buf(req->rq_reqmsg, offset + 1); - } else + } else { r->ur_namelen = 0; + } RETURN(0); } @@ -365,6 +442,8 @@ static int mds_create_unpack(struct ptlrpc_request *req, int offset, r->ur_uid = NTOH__u32(rec->cr_uid); r->ur_gid = NTOH__u32(rec->cr_gid); r->ur_time = NTOH__u64(rec->cr_time); + r->ur_flags = NTOH__u32(rec->cr_flags); + r->ur_suppgid = NTOH__u32(rec->cr_suppgid); r->ur_name = lustre_msg_buf(req->rq_reqmsg, offset + 1); r->ur_namelen = req->rq_reqmsg->buflens[offset + 1]; @@ -392,6 +471,7 @@ static int mds_link_unpack(struct ptlrpc_request *req, int offset, r->ur_fsuid = NTOH__u32(rec->lk_fsuid); r->ur_fsgid = NTOH__u32(rec->lk_fsgid); r->ur_cap = NTOH__u32(rec->lk_cap); + r->ur_suppgid = NTOH__u32(rec->lk_suppgid); r->ur_fid1 = &rec->lk_fid1; r->ur_fid2 = &rec->lk_fid2; @@ -414,6 +494,7 @@ static int mds_unlink_unpack(struct ptlrpc_request *req, int offset, r->ur_fsgid = NTOH__u32(rec->ul_fsgid); r->ur_cap = NTOH__u32(rec->ul_cap); r->ur_mode = NTOH__u32(rec->ul_mode); + r->ur_suppgid = NTOH__u32(rec->ul_suppgid); r->ur_fid1 = &rec->ul_fid1; r->ur_fid2 = &rec->ul_fid2; @@ -455,6 +536,7 @@ static update_unpacker mds_unpackers[REINT_MAX + 1] = { [REINT_LINK] mds_link_unpack, [REINT_UNLINK] mds_unlink_unpack, [REINT_RENAME] mds_rename_unpack, + [REINT_OPEN] mds_create_unpack, }; int mds_update_unpack(struct ptlrpc_request *req, int offset, @@ -470,8 +552,10 @@ int mds_update_unpack(struct ptlrpc_request *req, int offset, realop = rec->ur_opcode = NTOH__u32(*opcode); realop &= REINT_OPCODE_MASK; - if (realop < 0 || realop > REINT_MAX) + if (realop < 0 || realop > REINT_MAX) { + LBUG(); RETURN(-EFAULT); + } rc = mds_unpackers[realop](req, offset, rec); RETURN(rc); diff --git a/lustre/lib/simple.c b/lustre/lib/simple.c index 73a4383..f5627ba 100644 --- a/lustre/lib/simple.c +++ 
b/lustre/lib/simple.c @@ -1,15 +1,24 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * lib/simple.c + * Copyright (C) 2002, 2003 Cluster File Systems, Inc. + * Author: Peter Braam + * Author: Andreas Dilger * - * Copyright (C) 2002 Cluster File Systems, Inc. + * This file is part of Lustre, http://www.lustre.org. * - * This code is issued under the GNU General Public License. - * See the file COPYING in this distribution + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. * - * by Peter Braam - * and Andreas Dilger + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ #define EXPORT_SYMTAB @@ -71,6 +80,8 @@ void push_ctxt(struct obd_run_ctxt *save, struct obd_run_ctxt *new_ctx, current->fsuid = uc->ouc_fsuid; current->fsgid = uc->ouc_fsgid; current->cap_effective = uc->ouc_cap; + if (uc->ouc_suppgid != -1) + current->groups[current->ngroups++] = uc->ouc_suppgid; } set_fs(new_ctx->fs); set_fs_pwd(current->fs, new_ctx->pwdmnt, new_ctx->pwd); @@ -115,6 +126,9 @@ void pop_ctxt(struct obd_run_ctxt *saved, struct obd_run_ctxt *new_ctx, current->fsuid = saved->fsuid; current->fsgid = saved->fsgid; current->cap_effective = saved->cap; + + if (uc->ouc_suppgid != -1) + current->ngroups--; } /* @@ -135,7 +149,6 @@ struct dentry *simple_mknod(struct dentry *dir, char *name, int mode) ASSERT_KERNEL_CTXT("kernel doing mknod outside kernel context\n"); CDEBUG(D_INODE, "creating file %*s\n", (int)strlen(name), name); - down(&dir->d_inode->i_sem); dchild = lookup_one_len(name, dir, strlen(name)); if (IS_ERR(dchild)) GOTO(out_up, dchild); @@ -151,14 +164,12 @@ struct dentry *simple_mknod(struct dentry *dir, char *name, int mode) if (err) GOTO(out_err, err); - up(&dir->d_inode->i_sem); RETURN(dchild); out_err: dput(dchild); dchild = ERR_PTR(err); out_up: - up(&dir->d_inode->i_sem); return dchild; } @@ -171,7 +182,6 @@ struct dentry *simple_mkdir(struct dentry *dir, char *name, int mode) ASSERT_KERNEL_CTXT("kernel doing mkdir outside kernel context\n"); CDEBUG(D_INODE, "creating directory %*s\n", (int)strlen(name), name); - down(&dir->d_inode->i_sem); dchild = lookup_one_len(name, dir, strlen(name)); if (IS_ERR(dchild)) GOTO(out_up, dchild); @@ -187,14 +197,12 @@ struct dentry *simple_mkdir(struct dentry *dir, char *name, int mode) if (err) GOTO(out_err, err); - up(&dir->d_inode->i_sem); RETURN(dchild); out_err: dput(dchild); dchild = ERR_PTR(err); out_up: - up(&dir->d_inode->i_sem); return dchild; } diff --git a/lustre/lib/target.c b/lustre/lib/target.c index 3889f1c..81638f1 100644 --- a/lustre/lib/target.c +++ b/lustre/lib/target.c @@ -1,7 
+1,7 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Copyright (C) 2001-2003 Cluster File Systems, Inc. * Author: Peter J. Braam * Author: Phil Schwan * Author: Mike Shaver @@ -33,22 +33,23 @@ #include int target_handle_reconnect(struct lustre_handle *conn, struct obd_export *exp, - char *cluuid) + struct obd_uuid *cluuid) { if (exp->exp_connection) { struct lustre_handle *hdl; hdl = &exp->exp_ldlm_data.led_import.imp_handle; /* Might be a re-connect after a partition. */ if (!memcmp(conn, hdl, sizeof *conn)) { - CERROR("%s reconnecting\n", cluuid); + CERROR("%s reconnecting\n", cluuid->uuid); conn->addr = (__u64) (unsigned long)exp; conn->cookie = exp->exp_cookie; RETURN(EALREADY); } else { CERROR("%s reconnecting from %s, " "handle mismatch (ours "LPX64"/"LPX64", " - "theirs "LPX64"/"LPX64")\n", cluuid, - exp->exp_connection->c_remote_uuid, hdl->addr, + "theirs "LPX64"/"LPX64")\n", cluuid->uuid, + exp->exp_connection->c_remote_uuid.uuid, + hdl->addr, hdl->cookie, conn->addr, conn->cookie); /* XXX disconnect them here? 
*/ memset(conn, 0, sizeof *conn); @@ -62,7 +63,7 @@ int target_handle_reconnect(struct lustre_handle *conn, struct obd_export *exp, conn->addr = (__u64) (unsigned long)exp; conn->cookie = exp->exp_cookie; - CDEBUG(D_INFO, "existing export for UUID '%s' at %p\n", cluuid, exp); + CDEBUG(D_INFO, "existing export for UUID '%s' at %p\n", cluuid->uuid, exp); CDEBUG(D_IOCTL,"connect: addr %Lx cookie %Lx\n", (long long)conn->addr, (long long)conn->cookie); RETURN(0); @@ -71,28 +72,30 @@ int target_handle_reconnect(struct lustre_handle *conn, struct obd_export *exp, int target_handle_connect(struct ptlrpc_request *req) { struct obd_device *target; - struct obd_export *export; + struct obd_export *export = NULL; struct obd_import *dlmimp; struct lustre_handle conn; - char *tgtuuid, *cluuid; + struct obd_uuid tgtuuid; + struct obd_uuid cluuid; + struct list_head *p; int rc, i; ENTRY; - tgtuuid = lustre_msg_buf(req->rq_reqmsg, 0); if (req->rq_reqmsg->buflens[0] > 37) { CERROR("bad target UUID for connect\n"); GOTO(out, rc = -EINVAL); } + obd_str2uuid(&tgtuuid, lustre_msg_buf(req->rq_reqmsg, 0)); - cluuid = lustre_msg_buf(req->rq_reqmsg, 1); if (req->rq_reqmsg->buflens[1] > 37) { CERROR("bad client UUID for connect\n"); GOTO(out, rc = -EINVAL); } + obd_str2uuid(&cluuid, lustre_msg_buf(req->rq_reqmsg, 1)); - i = class_uuid2dev(tgtuuid); + i = class_uuid2dev(&tgtuuid); if (i == -1) { - CERROR("UUID '%s' not found for connect\n", tgtuuid); + CERROR("UUID '%s' not found for connect\n", tgtuuid.uuid); GOTO(out, rc = -ENODEV); } @@ -103,18 +106,62 @@ int target_handle_connect(struct ptlrpc_request *req) conn.addr = req->rq_reqmsg->addr; conn.cookie = req->rq_reqmsg->cookie; - rc = obd_connect(&conn, target, cluuid, ptlrpc_recovd, - target_revoke_connection); - /* EALREADY indicates a reconnection, send the reply normally. 
*/ - if (rc && rc != EALREADY) + rc = lustre_pack_msg(0, NULL, NULL, &req->rq_replen, &req->rq_repmsg); + if (rc) GOTO(out, rc); + /* lctl gets a backstage, all-access pass. */ + if (!strcmp(cluuid.uuid, "OBD_CLASS_UUID")) + goto dont_check_exports; + + spin_lock(&target->obd_dev_lock); + list_for_each(p, &target->obd_exports) { + export = list_entry(p, struct obd_export, exp_obd_chain); + if (!memcmp(&cluuid, &export->exp_client_uuid, + sizeof(export->exp_client_uuid))) { + spin_unlock(&target->obd_dev_lock); + LASSERT(export->exp_obd == target); + + rc = target_handle_reconnect(&conn, export, &cluuid); + break; + } + export = NULL; + } + /* If we found an export, we already unlocked. */ + if (!export) + spin_unlock(&target->obd_dev_lock); + + /* Tell the client if we're in recovery. */ + if (target->obd_flags & OBD_RECOVERING) + lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECOVERING); + + /* Tell the client if we support replayable requests */ + if (target->obd_flags & OBD_REPLAYABLE) + lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_REPLAYABLE); + + if (!export) { + if (target->obd_flags & OBD_RECOVERING) { + CERROR("denying connection for new client %s: " + "in recovery\n", cluuid.uuid); + rc = -EBUSY; + } else { + dont_check_exports: + rc = obd_connect(&conn, target, &cluuid, ptlrpc_recovd, + target_revoke_connection); + } + } + + if (rc == EALREADY) { + /* We indicate the reconnection in a flag, not an error code. */ + lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECONNECT); + rc = 0; + } else if (rc) { + GOTO(out, rc); + } + /* If all else goes well, this is our RPC return code. 
*/ req->rq_status = rc; - rc = lustre_pack_msg(0, NULL, NULL, &req->rq_replen, &req->rq_repmsg); - if (rc) - GOTO(out, rc); req->rq_repmsg->addr = conn.addr; req->rq_repmsg->cookie = conn.cookie; @@ -122,7 +169,7 @@ int target_handle_connect(struct ptlrpc_request *req) LASSERT(export); req->rq_export = export; - export->exp_connection = ptlrpc_get_connection(&req->rq_peer, cluuid); + export->exp_connection = ptlrpc_get_connection(&req->rq_peer, &cluuid); if (req->rq_connection != NULL) ptlrpc_put_connection(req->rq_connection); req->rq_connection = ptlrpc_connection_addref(export->exp_connection); @@ -162,7 +209,7 @@ int target_handle_disconnect(struct ptlrpc_request *req) RETURN(rc); req->rq_status = obd_disconnect(conn); - + req->rq_export = NULL; RETURN(0); } @@ -200,7 +247,7 @@ static int target_fence_failed_connection(struct ptlrpc_connection *conn) int target_revoke_connection(struct recovd_data *rd, int phase) { struct ptlrpc_connection *conn = class_rd2conn(rd); - + LASSERT(conn); ENTRY; diff --git a/lustre/llite/dcache.c b/lustre/llite/dcache.c index a9d4aac..0286cc6 100644 --- a/lustre/llite/dcache.c +++ b/lustre/llite/dcache.c @@ -1,7 +1,7 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (c) 2001, 2002 Cluster File Systems, Inc. + * Copyright (c) 2001-2003 Cluster File Systems, Inc. * * This file is part of Lustre, http://www.lustre.org. 
* @@ -31,37 +31,49 @@ #include #include -extern struct address_space_operations ll_aops; - +/* should NOT be called with the dcache lock, see fs/dcache.c */ void ll_release(struct dentry *de) { ENTRY; - OBD_FREE(de->d_fsdata, sizeof(struct ll_dentry_data)); EXIT; } -extern void d_delete_aliases(struct inode *); +void ll_set_dd(struct dentry *de) +{ + ENTRY; + LASSERT(de != NULL); + + lock_kernel(); + + if (de->d_fsdata == NULL) { + OBD_ALLOC(de->d_fsdata, sizeof(struct ll_dentry_data)); + sema_init(&ll_d2d(de)->lld_it_sem, 1); + } + + unlock_kernel(); + + EXIT; +} + void ll_intent_release(struct dentry *de, struct lookup_intent *it) { struct lustre_handle *handle; ENTRY; LASSERT(ll_d2d(de) != NULL); + mdc_put_rpc_lock(&mdc_rpc_lock, it); if (it->it_lock_mode) { handle = (struct lustre_handle *)it->it_lock_handle; - if (it->it_op == IT_SETATTR) { - int rc; - ldlm_lock_decref(handle, it->it_lock_mode); - rc = ldlm_cli_cancel(handle); - if (rc < 0) - CERROR("ldlm_cli_cancel: %d\n", rc); - } else + if (it->it_op == IT_SETATTR) + ldlm_lock_decref_and_cancel(handle, it->it_lock_mode); + else ldlm_lock_decref(handle, it->it_lock_mode); - /* intent_release may be called multiple times, and we don't - * want to double-decref this lock (see bug 494) */ + /* intent_release may be called multiple times, from + this thread and we don't want to double-decref this + lock (see bug 494) */ it->it_lock_mode = 0; } @@ -72,6 +84,8 @@ void ll_intent_release(struct dentry *de, struct lookup_intent *it) if (de->d_it == it) LL_GET_INTENT(de, it); + else + CERROR("STRANGE intent release: %p %p\n", de->d_it, it); EXIT; } @@ -79,21 +93,33 @@ void ll_intent_release(struct dentry *de, struct lookup_intent *it) extern struct dentry *ll_find_alias(struct inode *, struct dentry *); static int revalidate2_finish(int flag, struct ptlrpc_request *request, - struct dentry **de, - struct lookup_intent *it, - int offset, obd_id ino) + struct dentry **de, struct lookup_intent *it, + int offset, obd_id 
ino) { - ldlm_lock_set_data((struct lustre_handle *)it->it_lock_handle, - (*de)->d_inode, sizeof(*((*de)->d_inode))); + struct mds_body *body; + struct lov_mds_md *lmm = NULL; + int rc = 0; + ENTRY; + + if (!(flag & LL_LOOKUP_NEGATIVE)) { + body = lustre_msg_buf(request->rq_repmsg, offset); + if (body->valid & OBD_MD_FLEASIZE) + lmm = lustre_msg_buf(request->rq_repmsg, offset + 1); + ll_update_inode((*de)->d_inode, body, lmm); + mdc_lock_set_inode((struct lustre_handle *)it->it_lock_handle, + (*de)->d_inode); + } else + rc = -ENOENT; + ptlrpc_req_finished(request); - return 0; + RETURN(rc); } int ll_have_md_lock(struct dentry *de) { struct ll_sb_info *sbi = ll_s2sbi(de->d_sb); struct lustre_handle lockh; - __u64 res_id[RES_NAME_SIZE] = {0}; + struct ldlm_res_id res_id = { .name = {0} }; struct obd_device *obddev; ENTRY; @@ -101,19 +127,19 @@ int ll_have_md_lock(struct dentry *de) RETURN(0); obddev = class_conn2obd(&sbi->ll_mdc_conn); - res_id[0] = de->d_inode->i_ino; - res_id[1] = de->d_inode->i_generation; + res_id.name[0] = de->d_inode->i_ino; + res_id.name[1] = de->d_inode->i_generation; - CDEBUG(D_INFO, "trying to match res "LPU64"\n", res_id[0]); + CDEBUG(D_INFO, "trying to match res "LPU64"\n", res_id.name[0]); - if (ldlm_lock_match(obddev->obd_namespace, res_id, LDLM_PLAIN, - NULL, 0, LCK_PR, &lockh)) { + if (ldlm_lock_match(obddev->obd_namespace, LDLM_FL_BLOCK_GRANTED, + &res_id, LDLM_PLAIN, NULL, 0, LCK_PR, &lockh)) { ldlm_lock_decref(&lockh, LCK_PR); RETURN(1); } - if (ldlm_lock_match(obddev->obd_namespace, res_id, LDLM_PLAIN, - NULL, 0, LCK_PW, &lockh)) { + if (ldlm_lock_match(obddev->obd_namespace, LDLM_FL_BLOCK_GRANTED, + &res_id, LDLM_PLAIN, NULL, 0, LCK_PW, &lockh)) { ldlm_lock_decref(&lockh, LCK_PW); RETURN(1); } @@ -133,10 +159,62 @@ int ll_revalidate2(struct dentry *de, int flags, struct lookup_intent *it) RETURN(0); } + if (it && it->it_op == IT_TRUNC) + it->it_op = IT_SETATTR; + + if (it == NULL || it->it_op == IT_GETATTR) { + /* We could just 
return 1 immediately, but since we should only + * be called in revalidate2 if we already have a lock, let's + * verify that. */ + struct inode *inode = de->d_inode; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct obd_device *obddev = class_conn2obd(&sbi->ll_mdc_conn); + struct ldlm_res_id res_id = + { .name = {inode->i_ino, (__u64)inode->i_generation} }; + struct lustre_handle lockh; + rc = ldlm_lock_match(obddev->obd_namespace, + LDLM_FL_BLOCK_GRANTED, &res_id, + LDLM_PLAIN, NULL, 0, LCK_PR, &lockh); + if (rc) { + de->d_flags &= ~DCACHE_LUSTRE_INVALID; + if (it && it->it_op == IT_GETATTR) { + memcpy(it->it_lock_handle, &lockh, + sizeof(lockh)); + it->it_lock_mode = LCK_PR; + LL_SAVE_INTENT(de, it); + } else { + ldlm_lock_decref(&lockh, LCK_PR); + } + RETURN(1); + } + rc = ldlm_lock_match(obddev->obd_namespace, + LDLM_FL_BLOCK_GRANTED, &res_id, + LDLM_PLAIN, NULL, 0, LCK_PW, &lockh); + if (rc) { + de->d_flags &= ~DCACHE_LUSTRE_INVALID; + if (it && it->it_op == IT_GETATTR) { + memcpy(it->it_lock_handle, &lockh, + sizeof(lockh)); + it->it_lock_mode = LCK_PW; + LL_SAVE_INTENT(de, it); + } else { + ldlm_lock_decref(&lockh, LCK_PW); + } + RETURN(1); + } + if (S_ISDIR(de->d_inode->i_mode)) + ll_invalidate_inode_pages(de->d_inode); + d_unhash_aliases(de->d_inode); + RETURN(0); + } + rc = ll_intent_lock(de->d_parent->d_inode, &de, it, revalidate2_finish); - if (rc < 0) { - /* Something bad happened; overwrite it_status? 
*/ - CERROR("ll_intent_lock: %d\n", rc); + if (rc == -ESTALE) + RETURN(0); + if (rc < 0 && it->it_status) { + CERROR("ll_intent_lock: rc %d : it->it_status %d\n", rc, + it->it_status); + RETURN(0); } /* unfortunately ll_intent_lock may cause a callback and revoke our dentry */ @@ -148,25 +226,6 @@ int ll_revalidate2(struct dentry *de, int flags, struct lookup_intent *it) RETURN(1); } -int ll_set_dd(struct dentry *de) -{ - ENTRY; - LASSERT(de != NULL); - - lock_kernel(); - - if (de->d_fsdata != NULL) { - CERROR("dentry %p already has d_fsdata set\n", de); - } else { - OBD_ALLOC(de->d_fsdata, sizeof(struct ll_dentry_data)); - sema_init(&ll_d2d(de)->lld_it_sem, 1); - } - - unlock_kernel(); - - RETURN(0); -} - struct dentry_operations ll_d_ops = { .d_revalidate2 = ll_revalidate2, .d_intent_release = ll_intent_release, diff --git a/lustre/llite/dir.c b/lustre/llite/dir.c index 921eea2..072eeea 100644 --- a/lustre/llite/dir.c +++ b/lustre/llite/dir.c @@ -22,7 +22,7 @@ * and moved here. AV * * Adapted for Lustre Light - * Copyright (C) 2002, Cluster File Systems, Inc. + * Copyright (C) 2002-2003, Cluster File Systems, Inc. * */ @@ -76,6 +76,11 @@ static int ll_dir_readpage(struct file *file, struct page *page) ENTRY; if ((inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT <= page->index){ + /* XXX why do we need this exactly, and why do we think that + * an all-zero directory page is useful? 
+ */ + CERROR("memsetting dir page %lu to zero (size %lld)\n", + page->index, inode->i_size); memset(kmap(page), 0, PAGE_CACHE_SIZE); kunmap(page); GOTO(readpage_out, rc); @@ -86,7 +91,7 @@ static int ll_dir_readpage(struct file *file, struct page *page) request = (struct ptlrpc_request *)it.it_data; if (request) ptlrpc_req_finished(request); - if (rc != ELDLM_OK) { + if (rc < 0) { CERROR("lock enqueue: err: %d\n", rc); unlock_page(page); RETURN(rc); @@ -118,7 +123,8 @@ static int ll_dir_readpage(struct file *file, struct page *page) SetPageUptodate(page); unlock_page(page); - rc = ll_unlock(LCK_PR, &lockh); + ll_unlock(LCK_PR, &lockh); + mdc_put_rpc_lock(&mdc_rpc_lock, &it); if (rc != ELDLM_OK) CERROR("ll_unlock: err: %d\n", rc); return rc; @@ -206,7 +212,7 @@ static void ext2_check_page(struct page *page) limit = dir->i_size & ~PAGE_CACHE_MASK; if (limit & (chunk_size - 1)) { CERROR("limit %d dir size %lld index %ld\n", - limit, dir->i_size, page->index); + limit, dir->i_size, page->index); goto Ebadsize; } for (offs = limit; offsi_ino, error, (page->index<i_ino, error, (page->index<inode), rec_len, p->name_len); goto fail; @@ -281,7 +287,7 @@ fail: LBUG(); } -static struct page * ll_get_page(struct inode *dir, unsigned long n) +static struct page *ll_get_dir_page(struct inode *dir, unsigned long n) { struct address_space *mapping = dir->i_mapping; struct page *page = read_cache_page(mapping, n, @@ -397,8 +403,10 @@ int ll_readdir(struct file * filp, void * dirent, filldir_t filldir) char *kaddr, *limit; ext2_dirent *de; struct page *page; - - page = ll_get_page(inode, n); + + CDEBUG(D_EXT2, "reading %lu of dir %lu page %lu, size %llu\n", + PAGE_CACHE_SIZE, inode->i_ino, n, inode->i_size); + page = ll_get_dir_page(inode, n); /* size might have been updated by mdc_readpage */ npages = dir_pages(inode); @@ -422,8 +430,8 @@ int ll_readdir(struct file * filp, void * dirent, filldir_t filldir) offset = (char *)de - kaddr; over = filldir(dirent, de->name, de->name_len, 
- (n<inode), d_type); + (n<inode), d_type); if (over) { ext2_put_page(page); GOTO(done,0); @@ -468,7 +476,7 @@ struct ext2_dir_entry_2 * ext2_find_entry (struct inode * dir, n = start; do { char *kaddr; - page = ll_get_page(dir, n); + page = ll_get_dir_page(dir, n); if (!IS_ERR(page)) { kaddr = page_address(page); de = (ext2_dirent *) kaddr; @@ -493,7 +501,7 @@ found: struct ext2_dir_entry_2 * ext2_dotdot (struct inode *dir, struct page **p) { - struct page *page = ll_get_page(dir, 0); + struct page *page = ll_get_dir_page(dir, 0); ext2_dirent *de = NULL; if (!IS_ERR(page)) { @@ -559,7 +567,7 @@ int ll_add_link (struct dentry *dentry, struct inode *inode) /* We take care of directory expansion in the same loop */ for (n = 0; n <= npages; n++) { - page = ll_get_page(dir, n); + page = ll_get_dir_page(dir, n); err = PTR_ERR(page); if (IS_ERR(page)) goto out; @@ -711,7 +719,7 @@ int ext2_empty_dir (struct inode * inode) for (i = 0; i < npages; i++) { char *kaddr; ext2_dirent * de; - page = ll_get_page(inode, i); + page = ll_get_dir_page(inode, i); if (IS_ERR(page)) continue; diff --git a/lustre/llite/file.c b/lustre/llite/file.c index 6b37d99..1e26110 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -1,26 +1,25 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * linux/fs/ext2/file.c + * Copyright (c) 2002, 2003 Cluster File Systems, Inc. + * Author: Peter Braam + * Author: Phil Schwan + * Author: Andreas Dilger * - * This code is issued under the GNU General Public License. - * See the file COPYING in this distribution + * This file is part of Lustre, http://www.lustre.org. 
* - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. * - * from + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. * - * linux/fs/minix/file.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * ext2 fs regular file handling primitives - * - * 64-bit file support on 64-bit platforms by Jakub Jelinek - * (jj@sunsite.ms.mff.cuni.cz) + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #define DEBUG_SUBSYSTEM S_LLITE @@ -33,96 +32,156 @@ int ll_inode_setattr(struct inode *inode, struct iattr *attr, int do_trunc); extern int ll_setattr(struct dentry *de, struct iattr *attr); -static int ll_mdc_open(struct lustre_handle *mdc_conn, struct inode *inode, - struct file *file, struct lov_mds_md *lmm, int lmm_size) -{ - struct ptlrpc_request *req = NULL; - struct ll_file_data *fd; - int rc; - ENTRY; - - LASSERT(!file->private_data); - - fd = kmem_cache_alloc(ll_file_data_slab, SLAB_KERNEL); - if (!fd) - RETURN(-ENOMEM); - - memset(fd, 0, sizeof(*fd)); - fd->fd_mdshandle.addr = (__u64)(unsigned long)file; - get_random_bytes(&fd->fd_mdshandle.cookie, - sizeof(fd->fd_mdshandle.cookie)); - - rc = mdc_open(mdc_conn, inode->i_ino, S_IFREG | inode->i_mode, - file->f_flags, lmm, lmm_size, &fd->fd_mdshandle, &req); - - /* This is the "reply" refcount. 
*/ - ptlrpc_req_finished(req); - - if (rc) - GOTO(out_fd, rc); - - fd->fd_req = req; - file->private_data = fd; - - if (!fd->fd_mdshandle.addr || - fd->fd_mdshandle.addr == (__u64)(unsigned long)file) { - CERROR("hmm, mdc_open didn't assign fd_mdshandle?\n"); - /* XXX handle this how, abort or is it non-fatal? */ - } - - file->f_flags &= ~O_LOV_DELAY_CREATE; - RETURN(0); - -out_fd: - fd->fd_mdshandle.cookie = DEAD_HANDLE_MAGIC; - kmem_cache_free(ll_file_data_slab, fd); - - return -abs(rc); -} - static int ll_mdc_close(struct lustre_handle *mdc_conn, struct inode *inode, struct file *file) { struct ll_file_data *fd = file->private_data; struct ptlrpc_request *req = NULL; unsigned long flags; - struct obd_import *imp = fd->fd_req->rq_import; + struct obd_import *imp; int rc; + ENTRY; /* Complete the open request and remove it from replay list */ - DEBUG_REQ(D_HA, fd->fd_req, "matched open req %p", fd->fd_req); rc = mdc_close(&ll_i2sbi(inode)->ll_mdc_conn, inode->i_ino, inode->i_mode, &fd->fd_mdshandle, &req); - if (rc) CERROR("inode %lu close failed: rc = %d\n", inode->i_ino, rc); - ptlrpc_req_finished(req); + imp = fd->fd_req->rq_import; + LASSERT(imp != NULL); spin_lock_irqsave(&imp->imp_lock, flags); + + DEBUG_REQ(D_HA, fd->fd_req, "matched open req %p", fd->fd_req); + + /* We held on to the request for replay until we saw a close for that + * file. Now that we've closed it, it gets replayed on the basis of + * its transno only. */ + fd->fd_req->rq_flags &= ~PTL_RPC_FL_REPLAY; + if (fd->fd_req->rq_transno) { - /* This caused an EA to be written, need to replay as a normal - * transaction now. Our reference is now effectively owned - * by the imp_replay_list, and we'll be committed just like - * other transno-having requests now. - */ - fd->fd_req->rq_flags &= ~PTL_RPC_FL_REPLAY; + /* This open created a file, so it needs replay as a + * normal transaction now. 
Our reference to it is now + * effectively owned by the imp_replay_list, and it'll + * be committed just like other transno-having + * requests from here on out. */ + + /* We now retain this close request, so that it is + * replayed if the open is replayed. We duplicate the + * transno, so that we get freed at the right time, + * and rely on the difference in xid to keep + * everything ordered correctly. + * + * But! If this close was already given a transno + * (because it caused real unlinking of an + * open-unlinked file, f.e.), then we'll be ordered on + * the basis of that and we don't need to do anything + * magical here. */ + if (!req->rq_transno) { + req->rq_transno = fd->fd_req->rq_transno; + ptlrpc_retain_replayable_request(req, imp); + } spin_unlock_irqrestore(&imp->imp_lock, flags); + + /* Should we free_committed now? we always free before + * replay, so it's probably a wash. We could check to + * see if the fd_req should already be committed, in + * which case we can avoid the whole retain_replayable + * dance. */ } else { /* No transno means that we can just drop our ref. */ spin_unlock_irqrestore(&imp->imp_lock, flags); ptlrpc_req_finished(fd->fd_req); } + + /* Do this after the fd_req->rq_transno check, because we don't want + * to bounce off zero references. */ + ptlrpc_req_finished(req); fd->fd_mdshandle.cookie = DEAD_HANDLE_MAGIC; file->private_data = NULL; kmem_cache_free(ll_file_data_slab, fd); - return -abs(rc); + RETURN(-abs(rc)); +} + +/* While this returns an error code, fput() the caller does not, so we need + * to make every effort to clean up all of our state here. Also, applications + * rarely check close errors and even if an error is returned they will not + * re-try the close call. 
+ */ +static int ll_file_release(struct inode *inode, struct file *file) +{ + struct ll_file_data *fd; + struct obdo oa; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_inode_info *lli = ll_i2info(inode); + struct lov_stripe_md *lsm = lli->lli_smd; + int rc = 0, rc2; + + ENTRY; + + fd = (struct ll_file_data *)file->private_data; + if (!fd) /* no process opened the file after an mcreate */ + RETURN(rc = 0); + + if (lsm != NULL) { + memset(&oa, 0, sizeof(oa)); + oa.o_id = lsm->lsm_object_id; + oa.o_mode = S_IFREG; + oa.o_valid = OBD_MD_FLTYPE | OBD_MD_FLID; + obd_handle2oa(&oa, &fd->fd_osthandle); + rc = obd_close(&sbi->ll_osc_conn, &oa, lsm, NULL); + if (rc) + CERROR("inode %lu object close failed: rc = %d\n", + inode->i_ino, rc); + } + + mdc_get_rpc_lock(&mdc_rpc_lock, NULL); + rc2 = ll_mdc_close(&sbi->ll_mdc_conn, inode, file); + mdc_put_rpc_lock(&mdc_rpc_lock, NULL); + if (rc2 && !rc) + rc = rc2; + + if (atomic_dec_and_test(&lli->lli_open_count)) { + CDEBUG(D_INFO, "last close, cancelling unused locks\n"); + rc2 = obd_cancel_unused(&sbi->ll_osc_conn, lsm, 0); + if (rc2 && !rc) { + rc = rc2; + CERROR("obd_cancel_unused: %d\n", rc); + } + } else + CDEBUG(D_INFO, "not last close, not cancelling unused locks\n"); + + RETURN(rc); +} + +static int ll_local_open(struct file *file, struct lookup_intent *it) +{ + struct ptlrpc_request *req = it->it_data; + struct ll_file_data *fd; + struct mds_body *body = lustre_msg_buf(req->rq_repmsg, 1); + ENTRY; + + LASSERT(!file->private_data); + + fd = kmem_cache_alloc(ll_file_data_slab, SLAB_KERNEL); + /* We can't handle this well without reorganizing ll_file_open and + * ll_mdc_close, so don't even try right now. 
*/ + LASSERT(fd != NULL); + + memset(fd, 0, sizeof(*fd)); + + memcpy(&fd->fd_mdshandle, &body->handle, sizeof(body->handle)); + fd->fd_req = it->it_data; + file->private_data = fd; + + RETURN(0); } static int ll_osc_open(struct lustre_handle *conn, struct inode *inode, struct file *file, struct lov_stripe_md *lsm) { - struct ll_file_data *fd; + struct ll_file_data *fd = file->private_data; struct obdo *oa; int rc; ENTRY; @@ -133,14 +192,15 @@ static int ll_osc_open(struct lustre_handle *conn, struct inode *inode, oa->o_id = lsm->lsm_object_id; oa->o_mode = S_IFREG; oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE | - OBD_MD_FLBLOCKS; - rc = obd_open(conn, oa, lsm); + OBD_MD_FLBLOCKS | OBD_MD_FLMTIME | OBD_MD_FLCTIME; + rc = obd_open(conn, oa, lsm, NULL); if (rc) GOTO(out, rc); - obdo_to_inode(inode, oa, OBD_MD_FLSIZE | OBD_MD_FLBLOCKS); + file->f_flags &= ~O_LOV_DELAY_CREATE; + obdo_to_inode(inode, oa, OBD_MD_FLSIZE | OBD_MD_FLBLOCKS | + OBD_MD_FLMTIME | OBD_MD_FLCTIME); - fd = file->private_data; obd_oa2handle(&fd->fd_osthandle, oa); atomic_inc(&ll_i2info(inode)->lli_open_count); @@ -154,9 +214,10 @@ out: * the mdc open was successful (hence stored stripe MD on MDS), otherwise * other nodes could try to create different objects for the same file. 
*/ -static int ll_create_open_obj(struct lustre_handle *conn, struct inode *inode, - struct file *file, struct lov_stripe_md *lsm) +static int ll_create_obj(struct lustre_handle *conn, struct inode *inode, + struct file *file, struct lov_stripe_md *lsm) { + struct ptlrpc_request *req = NULL; struct ll_inode_info *lli = ll_i2info(inode); struct lov_mds_md *lmm = NULL; int lmm_size = 0; @@ -179,10 +240,14 @@ static int ll_create_open_obj(struct lustre_handle *conn, struct inode *inode, oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMODE | OBD_MD_FLUID | OBD_MD_FLGID; - rc = obd_create(conn, oa, &lsm); + rc = obd_create(conn, oa, &lsm, NULL); if (rc) { CERROR("error creating objects for inode %lu: rc = %d\n", inode->i_ino, rc); + if (rc > 0) { + CERROR("obd_create returned invalid rc %d\n", rc); + rc = -EIO; + } GOTO(out_oa, rc); } @@ -193,7 +258,10 @@ static int ll_create_open_obj(struct lustre_handle *conn, struct inode *inode, lmm_size = rc; - rc = ll_mdc_open(&ll_i2sbi(inode)->ll_mdc_conn,inode,file,lmm,lmm_size); + /* Save the stripe MD with this file on the MDS */ + rc = mdc_setattr(&ll_i2sbi(inode)->ll_mdc_conn, inode, NULL, + lmm, lmm_size, &req); + ptlrpc_req_finished(req); obd_free_wiremd(conn, &lmm); @@ -201,7 +269,7 @@ static int ll_create_open_obj(struct lustre_handle *conn, struct inode *inode, * MDS, we need to destroy the objects now or they will be leaked. 
*/ if (rc) { - CERROR("error MDS opening %lu with delayed create: rc %d\n", + CERROR("error: storing stripe MD for %lu: rc %d\n", inode->i_ino, rc); GOTO(out_destroy, rc); } @@ -216,7 +284,7 @@ out_destroy: obdo_from_inode(oa, inode, OBD_MD_FLTYPE); oa->o_id = lsm->lsm_object_id; oa->o_valid |= OBD_MD_FLID; - err = obd_destroy(conn, oa, lsm); + err = obd_destroy(conn, oa, lsm, NULL); obd_free_memmd(conn, &lsm); if (err) CERROR("error uncreating inode %lu objects: rc %d\n", @@ -239,43 +307,55 @@ out_destroy: * before returning in the O_LOV_DELAY_CREATE case and dropping it here * or in ll_file_release(), but I'm not sure that is desirable/necessary. */ +extern int ll_it_open_error(int phase, struct lookup_intent *it); + static int ll_file_open(struct inode *inode, struct file *file) { struct ll_sb_info *sbi = ll_i2sbi(inode); struct ll_inode_info *lli = ll_i2info(inode); struct lustre_handle *conn = ll_i2obdconn(inode); + struct lookup_intent *it; struct lov_stripe_md *lsm; int rc = 0; ENTRY; + LL_GET_INTENT(file->f_dentry, it); + rc = ll_it_open_error(IT_OPEN_OPEN, it); + if (rc) + RETURN(rc); + + rc = ll_local_open(file, it); + if (rc) + LBUG(); + + mdc_set_open_replay_data((struct ll_file_data *)file->private_data); + lsm = lli->lli_smd; if (lsm == NULL) { if (file->f_flags & O_LOV_DELAY_CREATE) { CDEBUG(D_INODE, "delaying object creation\n"); RETURN(0); } - down(&lli->lli_open_sem); if (!lli->lli_smd) { - rc = ll_create_open_obj(conn, inode, file, NULL); + rc = ll_create_obj(conn, inode, file, NULL); up(&lli->lli_open_sem); + if (rc) + GOTO(out_close, rc); } else { - CERROR("stripe already set on ino %lu\n", inode->i_ino); + CERROR("warning: stripe already set on ino %lu\n", + inode->i_ino); up(&lli->lli_open_sem); - rc = ll_mdc_open(&sbi->ll_mdc_conn, inode, file,NULL,0); } lsm = lli->lli_smd; - } else - rc = ll_mdc_open(&sbi->ll_mdc_conn, inode, file, NULL, 0); - - if (rc) - RETURN(rc); + } rc = ll_osc_open(conn, inode, file, lsm); if (rc) GOTO(out_close, rc); 
RETURN(0); -out_close: + + out_close: ll_mdc_close(&sbi->ll_mdc_conn, inode, file); return rc; } @@ -321,90 +401,40 @@ int ll_size_unlock(struct inode *inode, struct lov_stripe_md *lsm, int mode, RETURN(rc); } -int ll_file_size(struct inode *inode, struct lov_stripe_md *lsm) +/* This function is solely "sampling" the file size, and does not explicit + * locking on the size itself (see ll_size_lock() and ll_size_unlock()). + * + * XXX We need to optimize away the obd_getattr for decent performance here, + * by checking if we already have the size lock and considering our size + * authoritative in that case. In order to do that either the act of + * getting the size lock includes retrieving the file size, or the client + * keeps an atomic flag in the inode which indicates whether the size + * has been updated (see bug 280). + */ +int ll_file_size(struct inode *inode, struct lov_stripe_md *lsm, + struct lustre_handle *handle) { struct ll_sb_info *sbi = ll_i2sbi(inode); - //struct lustre_handle lockh = { 0, 0 }; struct obdo oa; - //int err; int rc; ENTRY; LASSERT(lsm); LASSERT(sbi); - /* XXX do not yet need size lock - OST size always correct (sync write) - rc = ll_size_lock(inode, lsm, 0, LCK_PR, &lockh); - if (rc != ELDLM_OK) { - CERROR("lock enqueue: %d\n", rc); - RETURN(rc); - } - */ - memset(&oa, 0, sizeof oa); oa.o_id = lsm->lsm_object_id; oa.o_mode = S_IFREG; - oa.o_valid = OBD_MD_FLID|OBD_MD_FLTYPE|OBD_MD_FLSIZE|OBD_MD_FLBLOCKS; + oa.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE | + OBD_MD_FLBLOCKS | OBD_MD_FLMTIME | OBD_MD_FLCTIME; + obd_handle2oa(&oa, handle); rc = obd_getattr(&sbi->ll_osc_conn, &oa, lsm); if (!rc) { - obdo_to_inode(inode, &oa, OBD_MD_FLSIZE | OBD_MD_FLBLOCKS); - CDEBUG(D_INODE, LPX64" size %Lu/%Lu\n", + obdo_to_inode(inode, &oa, OBD_MD_FLSIZE | OBD_MD_FLBLOCKS | + OBD_MD_FLMTIME | OBD_MD_FLCTIME); + CDEBUG(D_INODE, "objid "LPX64" size %Lu/%Lu\n", lsm->lsm_object_id, inode->i_size, inode->i_size); } - /* XXX do not need size lock, 
because OST size always correct (sync write) - err = ll_size_unlock(inode, lsm, LCK_PR, &lockh); - if (err != ELDLM_OK) { - CERROR("lock cancel: %d\n", err); - if (!rc) - rc = err; - } - */ - RETURN(rc); -} - -/* While this returns an error code, fput() the caller does not, so we need - * to make every effort to clean up all of our state here. Also, applications - * rarely check close errors and even if an error is returned they will not - * re-try the close call. - */ -static int ll_file_release(struct inode *inode, struct file *file) -{ - struct ll_file_data *fd; - struct obdo oa; - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct ll_inode_info *lli = ll_i2info(inode); - struct lov_stripe_md *lsm = lli->lli_smd; - int rc, rc2; - - ENTRY; - - fd = (struct ll_file_data *)file->private_data; - if (!fd) /* no process opened the file after an mcreate */ - RETURN(rc = 0); - - memset(&oa, 0, sizeof(oa)); - oa.o_id = lsm->lsm_object_id; - oa.o_mode = S_IFREG; - oa.o_valid = OBD_MD_FLTYPE | OBD_MD_FLID; - obd_handle2oa(&oa, &fd->fd_osthandle); - rc = obd_close(&sbi->ll_osc_conn, &oa, lsm); - if (rc) - CERROR("inode %lu object close failed: rc = %d\n", - inode->i_ino, rc); - - rc2 = ll_mdc_close(&sbi->ll_mdc_conn, inode, file); - if (rc2 && !rc) - rc = rc2; - - if (atomic_dec_and_test(&lli->lli_open_count)) { - CDEBUG(D_INFO, "last close, cancelling unused locks\n"); - rc2 = obd_cancel_unused(&sbi->ll_osc_conn, lsm, 0); - if (rc2 && !rc) { - rc = rc2; - CERROR("obd_cancel_unused: %d\n", rc); - } - } else - CDEBUG(D_INFO, "not last close, not cancelling unused locks\n"); RETURN(rc); } @@ -426,6 +456,7 @@ static inline void ll_remove_suid(struct inode *inode) static void ll_update_atime(struct inode *inode) { +#ifdef USE_ATIME struct iattr attr; attr.ia_atime = CURRENT_TIME; @@ -437,19 +468,20 @@ static void ll_update_atime(struct inode *inode) /* ll_inode_setattr() sets inode->i_atime from attr.ia_atime */ ll_inode_setattr(inode, &attr, 0); +#else + /* update atime, but 
don't explicitly write it out just this change */ + inode->i_atime = CURRENT_TIME; +#endif } int ll_lock_callback(struct ldlm_lock *lock, struct ldlm_lock_desc *new, - void *data, __u32 data_len, int flag) + void *data, int flag) { struct inode *inode = data; struct lustre_handle lockh = { 0, 0 }; int rc; ENTRY; - if (data_len != sizeof(struct inode)) - LBUG(); - if (inode == NULL) LBUG(); @@ -477,7 +509,7 @@ int ll_lock_callback(struct ldlm_lock *lock, struct ldlm_lock_desc *new, static ssize_t ll_file_read(struct file *filp, char *buf, size_t count, loff_t *ppos) { - struct ll_file_data *fd = (struct ll_file_data *)filp->private_data; + struct ll_file_data *fd = filp->private_data; struct inode *inode = filp->f_dentry->d_inode; struct ll_sb_info *sbi = ll_i2sbi(inode); struct lustre_handle lockh = { 0, 0 }; @@ -487,14 +519,6 @@ static ssize_t ll_file_read(struct file *filp, char *buf, size_t count, ssize_t retval; ENTRY; - /* If we don't refresh the file size, generic_file_read may not even - * call us */ - retval = ll_file_size(inode, lsm); - if (retval < 0) { - CERROR("ll_file_size: "LPSZ"\n", retval); - RETURN(retval); - } - if (!(fd->fd_flags & LL_FILE_IGNORE_LOCK) && !(sbi->ll_flags & LL_SBI_NOLCK)) { struct ldlm_extent extent; @@ -513,6 +537,14 @@ static ssize_t ll_file_read(struct file *filp, char *buf, size_t count, } } + /* If we don't refresh the file size, generic_file_read may not even + * call us */ + retval = ll_file_size(inode, lsm, &fd->fd_osthandle); + if (retval < 0) { + CERROR("ll_file_size: "LPSZ"\n", retval); + RETURN(retval); + } + CDEBUG(D_INFO, "Reading inode %lu, "LPSZ" bytes, offset %Ld\n", inode->i_ino, count, *ppos); retval = generic_file_read(filp, buf, count, ppos); @@ -538,7 +570,7 @@ static ssize_t ll_file_read(struct file *filp, char *buf, size_t count, static ssize_t ll_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos) { - struct ll_file_data *fd = (struct ll_file_data *)file->private_data; + struct 
ll_file_data *fd = file->private_data; struct inode *inode = file->f_dentry->d_inode; struct ll_sb_info *sbi = ll_i2sbi(inode); struct lustre_handle lockh = { 0, 0 }, eof_lockh = { 0, 0 }; @@ -549,32 +581,16 @@ ll_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos) ENTRY; if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND) { - struct obdo *oa; - - oa = obdo_alloc(); - if (!oa) - RETURN(-ENOMEM); - err = ll_size_lock(inode, lsm, 0, LCK_PW, &eof_lockh); - if (err) { - obdo_free(oa); + if (err) RETURN(err); - } - oa->o_id = lsm->lsm_object_id; - oa->o_mode = inode->i_mode; - oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE | - OBD_MD_FLBLOCKS; - obd_handle2oa(oa, &fd->fd_osthandle); - retval = obd_getattr(&sbi->ll_osc_conn, oa, lsm); - if (retval) { - obdo_free(oa); + /* Get size here so we know extent to enqueue write lock on. */ + retval = ll_file_size(inode, lsm, &fd->fd_osthandle); + if (retval) GOTO(out_eof, retval); - } - *ppos = oa->o_size; - obdo_to_inode(inode, oa, oa->o_valid); - obdo_free(oa); + *ppos = inode->i_size; } if (!(fd->fd_flags & LL_FILE_IGNORE_LOCK) && @@ -600,21 +616,19 @@ ll_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos) retval = generic_file_write(file, buf, count, ppos); - if (!(fd->fd_flags & LL_FILE_IGNORE_LOCK) || - sbi->ll_flags & LL_SBI_NOLCK) { + if (!(fd->fd_flags & LL_FILE_IGNORE_LOCK) && + !(sbi->ll_flags & LL_SBI_NOLCK)) { err = obd_cancel(&sbi->ll_osc_conn, lsm, LCK_PW, &lockh); - if (err != ELDLM_OK) { + if (err != ELDLM_OK) CERROR("lock cancel: err: %d\n", err); - GOTO(out_eof, retval = err); - } } EXIT; out_eof: if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND) { err = ll_size_unlock(inode, lsm, LCK_PW, &eof_lockh); - if (err && !retval) - retval = err; + if (err) + CERROR("ll_size_unlock: %d\n", err); } return retval; @@ -624,7 +638,7 @@ static int ll_lov_setstripe(struct inode *inode, struct file *file, unsigned long arg) { struct ll_inode_info *lli = 
ll_i2info(inode); - struct lustre_handle *conn; + struct lustre_handle *conn = ll_i2obdconn(inode); struct lov_stripe_md *lsm; int rc; ENTRY; @@ -636,7 +650,7 @@ static int ll_lov_setstripe(struct inode *inode, struct file *file, CERROR("stripe already set for ino %lu\n", inode->i_ino); /* If we haven't already done the open, do so now */ if (file->f_flags & O_LOV_DELAY_CREATE) { - int rc2 = ll_file_open(inode, file); + int rc2 = ll_osc_open(conn, inode, file, lsm); if (rc2) RETURN(rc2); } @@ -644,11 +658,12 @@ static int ll_lov_setstripe(struct inode *inode, struct file *file, RETURN(-EALREADY); } - conn = ll_i2obdconn(inode); - rc = obd_iocontrol(LL_IOC_LOV_SETSTRIPE, conn, 0, &lsm, (void *)arg); - if (!rc) - rc = ll_create_open_obj(conn, inode, file, lsm); + if (rc) { + up(&lli->lli_open_sem); + RETURN(rc); + } + rc = ll_create_obj(conn, inode, file, lsm); up(&lli->lli_open_sem); if (rc) { @@ -673,11 +688,13 @@ static int ll_lov_getstripe(struct inode *inode, unsigned long arg) int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { - struct ll_file_data *fd = (struct ll_file_data *)file->private_data; + struct ll_file_data *fd = file->private_data; struct lustre_handle *conn; int flags; switch(cmd) { + case TCGETS: + return -ENOTTY; case LL_IOC_GETFLAGS: /* Get the current value of the file flags */ return put_user(fd->fd_flags, (int *)arg); @@ -725,8 +742,9 @@ loff_t ll_file_seek(struct file *file, loff_t offset, int origin) switch (origin) { case 2: { struct ll_inode_info *lli = ll_i2info(inode); + struct ll_file_data *fd = file->private_data; - retval = ll_file_size(inode, lli->lli_smd); + retval = ll_file_size(inode, lli->lli_smd, &fd->fd_osthandle); if (retval) RETURN(retval); @@ -757,7 +775,7 @@ int ll_fsync(struct file *file, struct dentry *dentry, int data) return 0; } -static int ll_inode_revalidate(struct dentry *dentry) +int ll_inode_revalidate(struct dentry *dentry) { struct inode *inode = dentry->d_inode; 
struct lov_stripe_md *lsm; @@ -768,14 +786,21 @@ static int ll_inode_revalidate(struct dentry *dentry) RETURN(0); } - if (!ll_have_md_lock(dentry)) { + /* this is very tricky. it is unsafe to call ll_have_md_lock + when we have a referenced lock: because it may cause an RPC + below when the lock is marked CB_PENDING. That RPC may not + go out because someone else may be in another RPC waiting for + that lock*/ + if (!(dentry->d_it && dentry->d_it->it_lock_mode) && + !ll_have_md_lock(dentry)) { struct ptlrpc_request *req = NULL; struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode); struct mds_body *body; unsigned long valid = 0; - int datalen = 0; - int rc; + int datalen = 0, rc; + /* Why don't we update all valid MDS fields here, if we're + * doing an RPC anyways? -phil */ if (S_ISREG(inode->i_mode)) { datalen = obd_size_wiremd(&sbi->ll_osc_conn, NULL); valid |= OBD_MD_FLEASIZE; @@ -789,7 +814,11 @@ static int ll_inode_revalidate(struct dentry *dentry) } body = lustre_msg_buf(req->rq_repmsg, 0); - ll_update_inode(inode, body); + if (body->valid & OBD_MD_FLEASIZE) + ll_update_inode(inode, body, + lustre_msg_buf(req->rq_repmsg, 1)); + else + ll_update_inode(inode, body, NULL); ptlrpc_req_finished(req); } @@ -797,7 +826,10 @@ static int ll_inode_revalidate(struct dentry *dentry) if (!lsm) /* object not yet allocated, don't validate size */ RETURN(0); - RETURN(ll_file_size(inode, lsm)); + /* XXX this should probably become an unconditional obd_getattr() + * so that we update the blocks count and mtime from the OST too. 
+ */ + RETURN(ll_file_size(inode, lsm, NULL)); } #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) @@ -828,3 +860,12 @@ struct inode_operations ll_file_inode_operations = { revalidate: ll_inode_revalidate, #endif }; + +struct inode_operations ll_special_inode_operations = { + setattr: ll_setattr, +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) + getattr: ll_getattr, +#else + revalidate: ll_inode_revalidate, +#endif +}; diff --git a/lustre/llite/lproc_llite.c b/lustre/llite/lproc_llite.c index 65df985..8989a82 100644 --- a/lustre/llite/lproc_llite.c +++ b/lustre/llite/lproc_llite.c @@ -24,226 +24,131 @@ #include #include +/* /proc/lustre/llite mount point registration */ -int rd_path(char* page, char **start, off_t off, int count, int *eof, - void *data) +#ifndef LPROCFS +int lprocfs_register_mountpoint(struct proc_dir_entry *parent, + struct super_block *sb, char *osc, char *mdc) { return 0; } +#else -int rd_fstype(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - int len = 0; - struct super_block *sb = (struct super_block*)data; - - len += snprintf(page, count, "%s\n", sb->s_type->name); - return len; -} - -int rd_blksize(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - int len = 0; - struct super_block *sb = (struct super_block*)data; - struct statfs mystats; - - (sb->s_op->statfs)(sb, &mystats); - len += snprintf(page, count, "%lu\n", mystats.f_bsize); - return len; - -} +long long mnt_instance; -int rd_kbytestotal(char* page, char **start, off_t off, int count, int *eof, - void *data) +static inline int lprocfs_llite_statfs(void *data, struct statfs *sfs) { - int len = 0; struct super_block *sb = (struct super_block*)data; - struct statfs mystats; - __u32 blk_size; - __u64 result; - - (sb->s_op->statfs)(sb, &mystats); - blk_size = mystats.f_bsize; - blk_size >>= 10; - result = mystats.f_blocks; - - while(blk_size >>= 1) - result <<= 1; - - len += snprintf(page, count, LPU64"\n", result); - return len; + 
return (sb->s_op->statfs)(sb, sfs); } +DEFINE_LPROCFS_STATFS_FCT(rd_blksize, lprocfs_llite_statfs); +DEFINE_LPROCFS_STATFS_FCT(rd_kbytestotal, lprocfs_llite_statfs); +DEFINE_LPROCFS_STATFS_FCT(rd_kbytesfree, lprocfs_llite_statfs); +DEFINE_LPROCFS_STATFS_FCT(rd_filestotal, lprocfs_llite_statfs); +DEFINE_LPROCFS_STATFS_FCT(rd_filesfree, lprocfs_llite_statfs); +DEFINE_LPROCFS_STATFS_FCT(rd_filegroups, lprocfs_llite_statfs); -int rd_kbytesfree(char* page, char **start, off_t off, int count, int *eof, - void *data) +int rd_path(char *page, char **start, off_t off, int count, int *eof, + void *data) { - int len = 0; - struct super_block *sb = (struct super_block*)data; - struct statfs mystats; - __u32 blk_size; - __u64 result; - - (sb->s_op->statfs)(sb, &mystats); - blk_size = mystats.f_bsize; - blk_size >>= 10; - result = mystats.f_bfree; - - while(blk_size >>= 1) - result <<= 1; - - len += snprintf(page, count, LPU64"\n", result); - return len; + return 0; } -int rd_filestotal(char* page, char **start, off_t off, int count, int *eof, - void *data) +int rd_fstype(char *page, char **start, off_t off, int count, int *eof, + void *data) { - int len = 0; struct super_block *sb = (struct super_block*)data; - struct statfs mystats; - (sb->s_op->statfs)(sb, &mystats); - len += snprintf(page, count, LPU64"\n", (__u64)(mystats.f_files)); - return len; + *eof = 1; + return snprintf(page, count, "%s\n", sb->s_type->name); } -int rd_filesfree(char* page, char **start, off_t off, int count, int *eof, - void *data) +int rd_sb_uuid(char *page, char **start, off_t off, int count, int *eof, + void *data) { - int len = 0; - struct super_block *sb = (struct super_block*)data; - struct statfs mystats; + struct super_block *sb = (struct super_block *)data; - (sb->s_op->statfs)(sb, &mystats); - len += snprintf(page, count, LPU64"\n", (__u64)(mystats.f_ffree)); - return len; + *eof = 1; + return snprintf(page, count, "%s\n", ll_s2sbi(sb)->ll_sb_uuid.uuid); } -int rd_filegroups(char* page, 
char **start, off_t off, int count, int *eof, - void *data) -{ - return 0; -} -int rd_uuid(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - int len = 0; - struct super_block *sb = (struct super_block*)data; - struct ll_sb_info *sbi = ll_s2sbi(sb); - - len += snprintf(page, count, "%s\n", sbi->ll_sb_uuid); - - return len; - -} -int rd_dev_name(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - int len = 0; - struct obd_device* dev = (struct obd_device*)data; - len += snprintf(page, count, "%s\n", dev->obd_name); - return len; -} +struct lprocfs_vars lprocfs_obd_vars[] = { + { "uuid", rd_sb_uuid, 0, 0 }, + { "mntpt_path", rd_path, 0, 0 }, + { "fstype", rd_fstype, 0, 0 }, + { "blocksize", rd_blksize, 0, 0 }, + { "kbytestotal", rd_kbytestotal, 0, 0 }, + { "kbytesfree", rd_kbytesfree, 0, 0 }, + { "filestotal", rd_filestotal, 0, 0 }, + { "filesfree", rd_filesfree, 0, 0 }, + { "filegroups", rd_filegroups, 0, 0 }, + { 0 } +}; -int rd_dev_uuid(char* page, char **start, off_t off, int count, int *eof, - void *data) +#define MAX_STRING_SIZE 128 +int lprocfs_register_mountpoint(struct proc_dir_entry *parent, + struct super_block *sb, char *osc, char *mdc) { - int len = 0; - struct obd_device* dev = (struct obd_device*)data; - len += snprintf(page, count, "%s\n", dev->obd_uuid); - return len; -} + struct lprocfs_vars lvars[2]; + struct ll_sb_info *sbi = ll_s2sbi(sb); + struct obd_device *obd; + char name[MAX_STRING_SIZE + 1]; + struct obd_uuid uuid; + int err; + ENTRY; + memset(lvars, 0, sizeof(lvars)); -struct lprocfs_vars status_var_nm_1[] = { - {"status/uuid", rd_uuid, 0, 0}, - {"status/mntpt_path", rd_path, 0, 0}, - {"status/fstype", rd_fstype, 0, 0}, - {"status/blocksize",rd_blksize, 0, 0}, - {"status/kbytestotal",rd_kbytestotal, 0, 0}, - {"status/kbytesfree", rd_kbytesfree, 0, 0}, - {"status/filestotal", rd_filestotal, 0, 0}, - {"status/filesfree", rd_filesfree, 0, 0}, - {"status/filegroups", rd_filegroups, 0, 0}, - 
{0} -}; + name[MAX_STRING_SIZE] = '\0'; + lvars[0].name = name; -/* - * Proc registration function for Lustre - * file system - */ + /* Mount info */ + snprintf(name, MAX_STRING_SIZE, "fs%llu", mnt_instance); + mnt_instance++; + sbi->ll_proc_root = lprocfs_register(name, parent, NULL, NULL); + if (IS_ERR(sbi->ll_proc_root)) + RETURN(err = PTR_ERR(sbi->ll_proc_root)); -#define MAX_STRING_SIZE 100 -void ll_proc_namespace(struct super_block* sb, char* osc, char* mdc) -{ - char mnt_name[MAX_STRING_SIZE+1]; - char uuid_name[MAX_STRING_SIZE+1]; - struct lprocfs_vars d_vars[3]; - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct obd_device* obd; - int err; + /* Static configuration info */ + err = lprocfs_add_vars(sbi->ll_proc_root, lprocfs_obd_vars, sb); + if (err) + RETURN(err); - /* Register this mount instance with LProcFS */ - snprintf(mnt_name, MAX_STRING_SIZE, "mount_%s", sbi->ll_sb_uuid); - mnt_name[MAX_STRING_SIZE] = '\0'; - sbi->ll_proc_root = lprocfs_reg_mnt(mnt_name); - if (sbi->ll_proc_root == NULL) { - CDEBUG(D_OTHER, "Could not register FS"); - return; - } - /* Add the static configuration info */ - err = lprocfs_add_vars(sbi->ll_proc_root,status_var_nm_1, sb); - if (err) { - CDEBUG(D_OTHER, "Unable to add procfs variables\n"); - return; - } - /* MDC */ - obd = class_uuid2obd(mdc); - snprintf(mnt_name, MAX_STRING_SIZE, "status/%s/common_name", - obd->obd_type->typ_name); - mnt_name[MAX_STRING_SIZE] = '\0'; - memset(d_vars, 0, sizeof(d_vars)); - d_vars[0].read_fptr = rd_dev_name; - d_vars[0].write_fptr = NULL; - d_vars[0].name = mnt_name; - snprintf(uuid_name, MAX_STRING_SIZE, "status/%s/uuid", + /* MDC info */ + strncpy(uuid.uuid, mdc, sizeof(uuid.uuid)); + obd = class_uuid2obd(&uuid); + snprintf(name, MAX_STRING_SIZE, "%s/common_name", obd->obd_type->typ_name); - uuid_name[MAX_STRING_SIZE] = '\0'; - d_vars[1].read_fptr = rd_dev_uuid; - d_vars[1].write_fptr = NULL; - d_vars[1].name = uuid_name; - - err = lprocfs_add_vars(sbi->ll_proc_root, d_vars, obd); - if 
(err) { - CDEBUG(D_OTHER, "Unable to add fs proc dynamic variables\n"); - return; - } - /* OSC or LOV*/ - obd = class_uuid2obd(osc); - - /* Reuse mnt_name */ - snprintf(mnt_name, MAX_STRING_SIZE, - "status/%s/common_name", obd->obd_type->typ_name); - mnt_name[MAX_STRING_SIZE] = '\0'; - memset(d_vars, 0, sizeof(d_vars)); - d_vars[0].read_fptr = rd_dev_name; - d_vars[0].write_fptr = NULL; - d_vars[0].name = mnt_name; - - snprintf(uuid_name, MAX_STRING_SIZE, "status/%s/uuid", + lvars[0].read_fptr = lprocfs_rd_name; + err = lprocfs_add_vars(sbi->ll_proc_root, lvars, obd); + if (err) + RETURN(err); + + snprintf(name, MAX_STRING_SIZE, "%s/uuid", obd->obd_type->typ_name); + lvars[0].read_fptr = lprocfs_rd_uuid; + err = lprocfs_add_vars(sbi->ll_proc_root, lvars, obd); + if (err < 0) + RETURN(err); + + /* OSC */ + strncpy(uuid.uuid, osc, sizeof(uuid.uuid)); + obd = class_uuid2obd(&uuid); + + snprintf(name, MAX_STRING_SIZE, "%s/common_name", obd->obd_type->typ_name); - uuid_name[MAX_STRING_SIZE] = '\0'; - d_vars[1].read_fptr = rd_dev_uuid; - d_vars[1].write_fptr = NULL; - d_vars[1].name = uuid_name; - - err = lprocfs_add_vars(sbi->ll_proc_root, d_vars, obd); - if (err) { - CDEBUG(D_OTHER, "Unable to add fs proc dynamic variables\n"); - return; - } + lvars[0].read_fptr = lprocfs_rd_name; + err = lprocfs_add_vars(sbi->ll_proc_root, lvars, obd); + if (err) + RETURN(err); + + snprintf(name, MAX_STRING_SIZE, "%s/uuid", obd->obd_type->typ_name); + lvars[0].read_fptr = lprocfs_rd_uuid; + err = lprocfs_add_vars(sbi->ll_proc_root, lvars, obd); + + RETURN(err); } + #undef MAX_STRING_SIZE +#endif /* LPROCFS */ diff --git a/lustre/llite/namei.c b/lustre/llite/namei.c index 81a5aad..f72e6ba 100644 --- a/lustre/llite/namei.c +++ b/lustre/llite/namei.c @@ -1,17 +1,24 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * This code is issued under the GNU General Public License. 
- * See the file COPYING in this distribution + * Copyright (c) 2002, 2003 Cluster File Systems, Inc. * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) + * This file is part of Lustre, http://www.lustre.org. * - * from + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. * - * linux/fs/ext2/namei.c + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * derived in small part from linux/fs/ext2/namei.c * * Copyright (C) 1991, 1992 Linus Torvalds * @@ -19,12 +26,6 @@ * David S. Miller (davem@caip.rutgers.edu), 1995 * Directory entry file type support and forward compatibility hooks * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 - * - * Changes for use in OBDFS - * Copyright (c) 1999, Seagate Technology Inc. - * Copyright (C) 2001, Cluster File Systems, Inc. - * Rewritten based on recent ext2 page cache use. 
- * */ #include @@ -41,7 +42,8 @@ #include #include -extern struct address_space_operations ll_aops; +/* from dcache.c */ +extern void ll_set_dd(struct dentry *de); /* from super.c */ extern void ll_change_inode(struct inode *inode); @@ -100,7 +102,7 @@ static int ll_test_inode(struct inode *inode, void *opaque) return 0; /* Apply the attributes in 'opaque' to this inode */ - ll_update_inode(inode, body); + ll_update_inode(inode, body, lic->lic_lmm); return 1; } @@ -149,24 +151,45 @@ struct inode *ll_iget(struct super_block *sb, ino_t hash, static int ll_intent_to_lock_mode(struct lookup_intent *it) { /* CREAT needs to be tested before open (both could be set) */ - if ((it->it_op & (IT_CREAT | IT_MKDIR | IT_SETATTR | IT_MKNOD))) { + if (it->it_op & (IT_CREAT | IT_SETATTR)) return LCK_PW; - } else if (it->it_op & (IT_READDIR | IT_GETATTR | IT_OPEN | IT_UNLINK | - IT_RMDIR | IT_RENAME | IT_RENAME2 | IT_READLINK| - IT_LINK | IT_LINK2 | IT_LOOKUP | IT_SYMLINK)) { + else if (it->it_op & (IT_READDIR | IT_GETATTR | IT_OPEN | IT_LOOKUP)) return LCK_PR; - } LBUG(); RETURN(-EINVAL); } -#define LL_LOOKUP_POSITIVE 1 -#define LL_LOOKUP_NEGATIVE 2 +int ll_it_open_error(int phase, struct lookup_intent *it) +{ + if (it->it_disposition & IT_OPEN_OPEN) { + if (phase == IT_OPEN_OPEN) + return it->it_status; + else + return 0; + } + + if (it->it_disposition & IT_OPEN_CREATE) { + if (phase == IT_OPEN_CREATE) + return it->it_status; + else + return 0; + } + + if (it->it_disposition & IT_OPEN_LOOKUP) { + if (phase == IT_OPEN_LOOKUP) + return it->it_status; + else + return 0; + } + LBUG(); + return 0; +} + +#define IT_ENQ_COMPLETE (1<<16) int ll_intent_lock(struct inode *parent, struct dentry **de, - struct lookup_intent *it, - intent_finish_cb intent_finish) + struct lookup_intent *it, intent_finish_cb intent_finish) { struct dentry *dentry = *de; struct ll_sb_info *sbi = ll_i2sbi(parent); @@ -174,9 +197,8 @@ int ll_intent_lock(struct inode *parent, struct dentry **de, struct 
lookup_intent lookup_it = { .it_op = IT_LOOKUP }; struct ptlrpc_request *request = NULL; char *data = NULL; - int rc, lock_mode, datalen = 0, offset, flag = LL_LOOKUP_POSITIVE; + int rc = 0, datalen = 0, offset, flag = 0; obd_id ino = 0; - ENTRY; if (it == NULL) @@ -188,26 +210,23 @@ int ll_intent_lock(struct inode *parent, struct dentry **de, if (dentry->d_name.len > EXT2_NAME_LEN) RETURN(-ENAMETOOLONG); - lock_mode = ll_intent_to_lock_mode(it); - if (it->it_op & IT_SYMLINK) { - data = it->it_data; - datalen = strlen(data) + 1; - it->it_data = NULL; + if (!(it->it_disposition & IT_ENQ_COMPLETE)) { + rc = mdc_enqueue(&sbi->ll_mdc_conn, LDLM_PLAIN, it, + ll_intent_to_lock_mode(it), parent, dentry, + &lockh, data, datalen, parent, + sizeof(*parent)); + if (rc < 0) + RETURN(rc); + memcpy(it->it_lock_handle, &lockh, sizeof(lockh)); } - rc = mdc_enqueue(&sbi->ll_mdc_conn, LDLM_PLAIN, it, lock_mode, parent, - dentry, &lockh, data, datalen, parent,sizeof(*parent)); - if (rc < 0) - RETURN(rc); - memcpy(it->it_lock_handle, &lockh, sizeof(lockh)); - request = (struct ptlrpc_request *)it->it_data; - /* it_disposition == 1 indicates that the server performed the + + /* non-zero it_disposition indicates that the server performed the * intent on our behalf. 
*/ if (it->it_disposition) { struct mds_body *mds_body; int mode; - obd_flag valid; /* This long block is all about fixing up the local * state so that it is correct as of the moment @@ -237,76 +256,73 @@ int ll_intent_lock(struct inode *parent, struct dentry **de, ino = mds_body->fid1.id; mode = mds_body->mode; - if (it->it_op & (IT_CREAT | IT_MKDIR | IT_SYMLINK | IT_MKNOD)) { + /*We were called from revalidate2: did we find the same inode?*/ + if ((*de)->d_inode && + (ino != (*de)->d_inode->i_ino || + mds_body->fid1.generation != (*de)->d_inode->i_generation)) { + it->it_disposition |= IT_ENQ_COMPLETE; + RETURN(-ESTALE); + } + + /* If we're doing an IT_OPEN which did not result in an actual + * successful open, then we need to remove the bit which saves + * this request for unconditional replay. */ + if (it->it_op & IT_OPEN && + (!(it->it_disposition & IT_OPEN_OPEN) || + it->it_status != 0)) + request->rq_flags &= ~PTL_RPC_FL_REPLAY; + + if (it->it_op & IT_CREAT) { mdc_store_inode_generation(request, 2, 1); - /* For create ops, we want the lookup to be negative, - * unless the create failed in a way that indicates - * that the file is already there */ - if (it->it_status == 0) - atomic_inc(&request->rq_refcount); - if (it->it_status != -EEXIST) - GOTO(out, flag = LL_LOOKUP_NEGATIVE); - /* - * Fall through to update attibutes: it may already - * have appeared in the namespace of another client - */ + /* The server will return to us, in it_disposition, an + * indication of exactly what it_status refers to. + * + * If IT_OPEN_OPEN is set, then it_status refers to the + * open() call, otherwise if IT_OPEN_CREATE is set, then + * it status is the creation failure mode. In either + * case, one of IT_OPEN_NEG or IT_OPEN_POS will be set, + * indicating whether the child lookup was successful. + * + * Else, if IT_OPEN_LOOKUP then it_status is the rc + * of the child lookup. 
+ * + * Finally, if none of the bits are set, then the + * failure occurred while looking up the parent. */ + rc = ll_it_open_error(IT_OPEN_LOOKUP, it); + if (rc) + GOTO(drop_req, rc); + + if (it->it_disposition & IT_OPEN_CREATE) + ptlrpc_request_addref(request); + + if (it->it_disposition & IT_OPEN_NEG) + flag = LL_LOOKUP_NEGATIVE; + else + flag = LL_LOOKUP_POSITIVE; + } else if (it->it_op == IT_OPEN) { + LASSERT(!(it->it_disposition & IT_OPEN_CREATE)); + + rc = ll_it_open_error(IT_OPEN_LOOKUP, it); + if (rc) + GOTO(drop_req, rc); + + if (it->it_disposition & IT_OPEN_OPEN) + ptlrpc_request_addref(request); + + if (it->it_disposition & IT_OPEN_NEG) + flag = LL_LOOKUP_NEGATIVE; + else + flag = LL_LOOKUP_POSITIVE; } else if (it->it_op & (IT_GETATTR | IT_SETATTR | IT_LOOKUP | IT_READLINK)) { /* For check ops, we want the lookup to succeed */ it->it_data = NULL; if (it->it_status) - GOTO(out, flag = LL_LOOKUP_NEGATIVE); - /* Fall through to update attibutes. */ - } else if (it->it_op & (IT_RENAME | IT_LINK)) { - /* For rename, we want the source lookup to succeed */ - if (it->it_status) { - it->it_data = NULL; - GOTO(drop_req, rc = it->it_status); - } - /* Fall through to update attibutes. */ - } else if (it->it_op & (IT_UNLINK | IT_RMDIR)) { - /* For remove ops, we want the lookup to succeed unless - * the file truly doesn't exist */ - it->it_data = NULL; - if (it->it_status == -ENOENT) - GOTO(out, flag = LL_LOOKUP_NEGATIVE); - /* No point in updating attributes that we're about to - * unlink. -phil */ - GOTO(out, flag = LL_LOOKUP_POSITIVE); - } else if (it->it_op == IT_OPEN) { - it->it_data = NULL; - if (it->it_status && it->it_status != -EEXIST) - GOTO(out, flag = LL_LOOKUP_NEGATIVE); - /* Fall through to update attibutes. */ - } else if (it->it_op & (IT_RENAME2 | IT_LINK2)) { - it->it_data = NULL; - /* This means the target lookup is negative */ - if (mds_body->valid == 0) - GOTO(out, flag = LL_LOOKUP_NEGATIVE); - /* XXX bug 289: should we maybe fall through here? 
-p */ - GOTO(out, flag = LL_LOOKUP_POSITIVE); - } - - /* Do a getattr now that we have the lock, and fetch the - * up-to-date stripe MD at the same time. - */ - valid = OBD_MD_FLNOTOBD; - if (it->it_op == IT_READLINK) { - datalen = mds_body->size; - valid |= OBD_MD_LINKNAME; - } else if (S_ISREG(mode)) { - datalen = obd_size_wiremd(&sbi->ll_osc_conn, NULL); - valid |= OBD_MD_FLEASIZE; - } - ptlrpc_req_finished(request); - request = NULL; - rc = mdc_getattr(&sbi->ll_mdc_conn, ino, mode, - valid, datalen, &request); - if (rc) { - CERROR("failure %d inode "LPX64"\n", rc, ino); - GOTO(drop_req, rc = -abs(rc)); - } - offset = 0; + flag = LL_LOOKUP_NEGATIVE; + else + flag = LL_LOOKUP_POSITIVE; + } else + LBUG(); } else { obd_flag valid; int mode; @@ -332,6 +348,8 @@ int ll_intent_lock(struct inode *parent, struct dentry **de, if (S_ISREG(mode)) { datalen = obd_size_wiremd(&sbi->ll_osc_conn, NULL), valid |= OBD_MD_FLEASIZE; + } else { + valid |= OBD_MD_FLBLOCKS; } rc = mdc_getattr(&sbi->ll_mdc_conn, ino, mode, valid, @@ -342,7 +360,6 @@ int ll_intent_lock(struct inode *parent, struct dentry **de, } } - out: if (intent_finish != NULL) { rc = intent_finish(flag, request, de, it, offset, ino); dentry = *de; /* intent_finish may change *de */ @@ -350,29 +367,19 @@ int ll_intent_lock(struct inode *parent, struct dentry **de, ptlrpc_req_finished(request); } - if (it->it_disposition && it->it_op & (IT_RENAME | IT_LINK)) - it->it_data = dentry; - - /* this places the intent in the dentry so that the vfs_xxx - * operation can lay its hands on it; but that is not - * always needed... - */ - if ( // it->it_status == 0 && - it->it_op != IT_RENAME && - it->it_op != IT_LINK && - it->it_op != IT_SETATTR && - it->it_op != IT_GETATTR && - it->it_op != IT_READDIR && - it->it_op != IT_LOOKUP) { + /* This places the intent in the dentry so that the vfs_xxx + * operation can lay its hands on it; but that is not always + * needed... 
(we need to save it in the GETATTR case for the + * benefit of ll_inode_revalidate -phil) */ + if (it->it_op & (IT_OPEN | IT_GETATTR)) LL_SAVE_INTENT(dentry, it); - } else { + else CDEBUG(D_DENTRY, "D_IT dentry %p fsdata %p intent: %s status %d\n", dentry, ll_d2d(dentry), ldlm_it2str(it->it_op), it->it_status); - } - if (rc < 0 || it->it_op == IT_LOOKUP) + if (it->it_op == IT_LOOKUP) ll_intent_release(dentry, it); RETURN(rc); @@ -395,7 +402,7 @@ struct dentry *ll_find_alias(struct inode *inode, struct dentry *de) struct dentry *dentry = list_entry(tmp, struct dentry, d_alias); /* We are called here with 'de' already on the aliases list. */ - if (dentry == de) { + if (dentry == de) { CERROR("whoops\n"); continue; } @@ -418,6 +425,7 @@ struct dentry *ll_find_alias(struct inode *inode, struct dentry *de) d_rehash(dentry); atomic_inc(&dentry->d_count); iput(inode); + dentry->d_flags &= ~DCACHE_LUSTRE_INVALID; return dentry; } @@ -434,7 +442,7 @@ lookup2_finish(int flag, struct ptlrpc_request *request, struct dentry **de, struct inode *inode = NULL; struct ll_read_inode2_cookie lic = {.lic_body = NULL, .lic_lmm = NULL}; - if (flag == LL_LOOKUP_POSITIVE) { + if (!(flag & LL_LOOKUP_NEGATIVE)) { ENTRY; lic.lic_body = lustre_msg_buf(request->rq_repmsg, offset); @@ -460,10 +468,8 @@ lookup2_finish(int flag, struct ptlrpc_request *request, struct dentry **de, /* We asked for a lock on the directory, and may have been * granted a lock on the inode. Just in case, fixup the data * pointer. 
*/ - ldlm_lock_set_data((struct lustre_handle *)it->it_lock_handle, - inode, sizeof(*inode)); - - EXIT; + mdc_lock_set_inode((struct lustre_handle *)it->it_lock_handle, + inode); } else { ENTRY; } @@ -471,9 +477,7 @@ lookup2_finish(int flag, struct ptlrpc_request *request, struct dentry **de, ptlrpc_req_finished(request); dentry->d_op = &ll_d_ops; - if (ll_d2d(dentry) == NULL) { - ll_set_dd(dentry); - } + ll_set_dd(dentry); if (dentry == saved) d_add(dentry, inode); @@ -488,9 +492,12 @@ static struct dentry *ll_lookup2(struct inode *parent, struct dentry *dentry, int rc; ENTRY; + if (it && it->it_op == IT_TRUNC) + it->it_op = IT_SETATTR; + rc = ll_intent_lock(parent, &dentry, it, lookup2_finish); if (rc < 0) { - CERROR("ll_intent_lock: %d\n", rc); + CDEBUG(D_INFO, "ll_intent_lock: %d\n", rc); RETURN(ERR_PTR(rc)); } @@ -500,6 +507,7 @@ static struct dentry *ll_lookup2(struct inode *parent, struct dentry *dentry, RETURN(dentry); } +/* We depend on "mode" being set with the proper file type/umask by now */ static struct inode *ll_create_node(struct inode *dir, const char *name, int namelen, const void *data, int datalen, int mode, __u64 extra, @@ -514,12 +522,6 @@ static struct inode *ll_create_node(struct inode *dir, const char *name, ENTRY; if (it && it->it_disposition) { - int rc = it->it_status; - if (rc) { - CERROR("error creating MDS inode for %*s: rc = %d\n", - namelen, name, rc); - RETURN(ERR_PTR(rc)); - } ll_invalidate_inode_pages(dir); request = it->it_data; body = lustre_msg_buf(request->rq_repmsg, 1); @@ -567,8 +569,8 @@ static struct inode *ll_create_node(struct inode *dir, const char *name, /* We asked for a lock on the directory, but were * granted a lock on the inode. Since we finally have * an inode pointer, stuff it in the lock. 
*/ - ldlm_lock_set_data((struct lustre_handle *)it->it_lock_handle, - inode, sizeof(*inode)); + mdc_lock_set_inode((struct lustre_handle *)it->it_lock_handle, + inode); } EXIT; @@ -582,47 +584,63 @@ static int ll_mdc_unlink(struct inode *dir, struct inode *child, __u32 mode, { struct ptlrpc_request *request = NULL; struct ll_sb_info *sbi = ll_i2sbi(dir); + struct mds_body *body; + struct lov_stripe_md *lsm = NULL; + struct lustre_handle lockh; + struct lookup_intent it = { .it_op = IT_UNLINK }; + struct obdo *oa; int err; - - ENTRY; - - err = mdc_unlink(&sbi->ll_mdc_conn, dir, child, mode, name, len, - &request); - ptlrpc_req_finished(request); - - RETURN(err); -} - -int ll_mdc_link(struct dentry *src, struct inode *dir, - const char *name, int len) -{ - struct ptlrpc_request *request = NULL; - int err; - struct ll_sb_info *sbi = ll_i2sbi(dir); - + struct mdc_unlink_data data; ENTRY; - err = mdc_link(&sbi->ll_mdc_conn, src, dir, name, len, &request); - ptlrpc_req_finished(request); - - RETURN(err); -} - -int ll_mdc_rename(struct inode *src, struct inode *tgt, - struct dentry *old, struct dentry *new) -{ - struct ptlrpc_request *request = NULL; - struct ll_sb_info *sbi = ll_i2sbi(src); - int err; - - ENTRY; + data.unl_dir = dir; + data.unl_de = child; + data.unl_mode = mode; + data.unl_name = name; + data.unl_len = len; + + err = mdc_enqueue(&sbi->ll_mdc_conn, LDLM_PLAIN, &it, LCK_EX, dir, + NULL, &lockh, NULL, 0, &data, sizeof(data)); + mdc_put_rpc_lock(&mdc_rpc_lock, &it); + request = (struct ptlrpc_request *)it.it_data; + if (err < 0) + GOTO(out, err); + if (it.it_status) + GOTO(out, err = it.it_status); + err = 0; + + body = lustre_msg_buf(request->rq_repmsg, 1); + LASSERT(body != NULL); + if (!(body->valid & OBD_MD_FLEASIZE)) + GOTO(out, 0); + + /* The MDS sent back the EA because we unlinked the last reference + * to this file. 
Use this EA to unlink the objects on the OST */ + err = obd_unpackmd(ll_i2obdconn(dir), &lsm, + lustre_msg_buf(request->rq_repmsg, 2)); + if (err < 0) + CERROR("obd_unpackmd: %d\n", err); + + oa = obdo_alloc(); + if (oa == NULL) + GOTO(out_unlock, err = -ENOMEM); + + oa->o_id = lsm->lsm_object_id; + oa->o_mode = body->mode & S_IFMT; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE; + + err = obd_destroy(ll_i2obdconn(dir), oa, lsm, NULL); + obdo_free(oa); + if (err) + CERROR("obd destroy objid 0x"LPX64" error %d\n", + lsm->lsm_object_id, err); - err = mdc_rename(&sbi->ll_mdc_conn, src, tgt, - old->d_name.name, old->d_name.len, - new->d_name.name, new->d_name.len, &request); + obd_free_memmd(ll_i2obdconn(dir), &lsm); + out_unlock: + ldlm_lock_decref_and_cancel(&lockh, LCK_EX); + out: ptlrpc_req_finished(request); - - RETURN(err); + return err; } /* @@ -646,24 +664,66 @@ static int ll_create(struct inode *dir, struct dentry *dentry, int mode) int rc = 0; ENTRY; - LL_GET_INTENT(dentry, it); + it = dentry->d_it; + + rc = ll_it_open_error(IT_OPEN_CREATE, it); + if (rc) { + LL_GET_INTENT(dentry, it); + ptlrpc_req_finished(it->it_data); + RETURN(rc); + } inode = ll_create_node(dir, dentry->d_name.name, dentry->d_name.len, NULL, 0, mode, 0, it); - if (IS_ERR(inode)) + if (IS_ERR(inode)) { + LL_GET_INTENT(dentry, it); RETURN(PTR_ERR(inode)); + } + /* no directory data updates when intents rule */ if (it && it->it_disposition) { d_instantiate(dentry, inode); - } else { - /* no directory data updates when intents rule */ - rc = ext2_add_nondir(dentry, inode); + RETURN(0); } + rc = ext2_add_nondir(dentry, inode); RETURN(rc); } +static int ll_mknod2(struct inode *dir, const char *name, int len, int mode, + int rdev) +{ + struct ptlrpc_request *request = NULL; + time_t time = CURRENT_TIME; + struct ll_sb_info *sbi = ll_i2sbi(dir); + int err = -EMLINK; + ENTRY; + + if (dir->i_nlink >= EXT2_LINK_MAX) + RETURN(err); + + mode &= ~current->fs->umask; + + switch (mode & S_IFMT) { + case 0: 
case S_IFREG: + mode |= S_IFREG; /* for mode = 0 case, fallthrough */ + case S_IFCHR: case S_IFBLK: + case S_IFIFO: case S_IFSOCK: + err = mdc_create(&sbi->ll_mdc_conn, dir, name, len, NULL, 0, + mode, current->fsuid, current->fsgid, time, + rdev, &request); + ptlrpc_req_finished(request); + break; + case S_IFDIR: + err = -EPERM; + break; + default: + err = -EINVAL; + } + RETURN(err); +} + static int ll_mknod(struct inode *dir, struct dentry *dentry, int mode, int rdev) { @@ -673,6 +733,8 @@ static int ll_mknod(struct inode *dir, struct dentry *dentry, int mode, LL_GET_INTENT(dentry, it); + if ((mode & S_IFMT) == 0) + mode |= S_IFREG; inode = ll_create_node(dir, dentry->d_name.name, dentry->d_name.len, NULL, 0, mode, rdev, it); @@ -688,6 +750,25 @@ static int ll_mknod(struct inode *dir, struct dentry *dentry, int mode, return rc; } +static int ll_symlink2(struct inode *dir, const char *name, int len, + const char *tgt) +{ + struct ptlrpc_request *request = NULL; + time_t time = CURRENT_TIME; + struct ll_sb_info *sbi = ll_i2sbi(dir); + int err = -EMLINK; + ENTRY; + + if (dir->i_nlink >= EXT2_LINK_MAX) + RETURN(err); + + err = mdc_create(&sbi->ll_mdc_conn, dir, name, len, + tgt, strlen(tgt) + 1, S_IFLNK | S_IRWXUGO, + current->fsuid, current->fsgid, time, 0, &request); + ptlrpc_req_finished(request); + RETURN(err); +} + static int ll_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { @@ -726,6 +807,21 @@ static int ll_symlink(struct inode *dir, struct dentry *dentry, RETURN(err); } +static int ll_link2(struct inode *src, struct inode *dir, + const char *name, int len) +{ + struct ptlrpc_request *request = NULL; + int err; + struct ll_sb_info *sbi = ll_i2sbi(dir); + + ENTRY; + + err = mdc_link(&sbi->ll_mdc_conn, src, dir, name, len, &request); + ptlrpc_req_finished(request); + + RETURN(err); +} + static int ll_link(struct dentry *old_dentry, struct inode * dir, struct dentry *dentry) { @@ -752,8 +848,8 @@ static int ll_link(struct dentry 
*old_dentry, struct inode * dir, if (inode->i_nlink >= EXT2_LINK_MAX) return -EMLINK; - rc = ll_mdc_link(old_dentry, dir, - dentry->d_name.name, dentry->d_name.len); + rc = ll_link2(old_dentry->d_inode, dir, + dentry->d_name.name, dentry->d_name.len); if (rc) RETURN(rc); @@ -764,6 +860,26 @@ static int ll_link(struct dentry *old_dentry, struct inode * dir, return ext2_add_nondir(dentry, inode); } +static int ll_mkdir2(struct inode *dir, const char *name, int len, int mode) +{ + struct ptlrpc_request *request = NULL; + time_t time = CURRENT_TIME; + struct ll_sb_info *sbi = ll_i2sbi(dir); + int err = -EMLINK; + ENTRY; + + if (dir->i_nlink >= EXT2_LINK_MAX) + RETURN(err); + + mode = (mode & (S_IRWXUGO|S_ISVTX) & ~current->fs->umask) | S_IFDIR; + err = mdc_create(&sbi->ll_mdc_conn, dir, name, len, NULL, 0, + mode, current->fsuid, current->fsgid, + time, 0, &request); + ptlrpc_req_finished(request); + RETURN(err); +} + + static int ll_mkdir(struct inode *dir, struct dentry *dentry, int mode) { struct lookup_intent *it; @@ -812,6 +928,24 @@ out_dir: goto out; } +static int ll_rmdir2(struct inode *dir, const char *name, int len) +{ + int rc; + ENTRY; + + rc = ll_mdc_unlink(dir, NULL, S_IFDIR, name, len); + RETURN(rc); +} + +static int ll_unlink2(struct inode *dir, const char *name, int len) +{ + int rc; + ENTRY; + + rc = ll_mdc_unlink(dir, NULL, S_IFREG, name, len); + RETURN(rc); +} + static int ll_common_unlink(struct inode *dir, struct dentry *dentry, struct lookup_intent *it, __u32 mode) { @@ -819,6 +953,7 @@ static int ll_common_unlink(struct inode *dir, struct dentry *dentry, struct ext2_dir_entry_2 * de; struct page * page; int rc = 0; + ENTRY; if (it && it->it_disposition) { rc = it->it_status; @@ -846,6 +981,7 @@ static int ll_common_unlink(struct inode *dir, struct dentry *dentry, ll_invalidate_inode_pages(dir); inode->i_ctime = dir->i_ctime; + EXIT; out_dec: ext2_dec_count(inode); out: @@ -855,10 +991,11 @@ out: static int ll_unlink(struct inode *dir, struct 
dentry *dentry) { struct lookup_intent * it; + ENTRY; LL_GET_INTENT(dentry, it); - return ll_common_unlink(dir, dentry, it, S_IFREG); + RETURN(ll_common_unlink(dir, dentry, it, S_IFREG)); } static int ll_rmdir(struct inode *dir, struct dentry *dentry) @@ -883,6 +1020,24 @@ static int ll_rmdir(struct inode *dir, struct dentry *dentry) RETURN(rc); } +static int ll_rename2(struct inode *src, struct inode *tgt, + const char *oldname, int oldlen, + const char *newname, int newlen) +{ + struct ptlrpc_request *request = NULL; + struct ll_sb_info *sbi = ll_i2sbi(src); + int err; + ENTRY; + + err = mdc_rename(&sbi->ll_mdc_conn, src, tgt, + oldname, oldlen, newname, newlen, &request); + ptlrpc_req_finished(request); + + RETURN(err); +} + + + static int ll_rename(struct inode * old_dir, struct dentry * old_dentry, struct inode * new_dir, struct dentry * new_dentry) { @@ -907,7 +1062,9 @@ static int ll_rename(struct inode * old_dir, struct dentry * old_dentry, GOTO(out, err = it->it_status); } - err = ll_mdc_rename(old_dir, new_dir, old_dentry, new_dentry); + err = ll_rename2(old_dir, new_dir, + old_dentry->d_name.name, old_dentry->d_name.len, + new_dentry->d_name.name, new_dentry->d_name.len); if (err) goto out; @@ -977,15 +1134,24 @@ out: return err; } +extern int ll_inode_revalidate(struct dentry *dentry); struct inode_operations ll_dir_inode_operations = { - create: ll_create, - lookup2: ll_lookup2, - link: ll_link, - unlink: ll_unlink, - symlink: ll_symlink, - mkdir: ll_mkdir, - rmdir: ll_rmdir, - mknod: ll_mknod, - rename: ll_rename, - setattr: ll_setattr + create: ll_create, + lookup2: ll_lookup2, + link: ll_link, + link2: ll_link2, + unlink: ll_unlink, + unlink2: ll_unlink2, + symlink: ll_symlink, + symlink2: ll_symlink2, + mkdir: ll_mkdir, + mkdir2: ll_mkdir2, + rmdir: ll_rmdir, + rmdir2: ll_rmdir2, + mknod: ll_mknod, + mknod2: ll_mknod2, + rename: ll_rename, + rename2: ll_rename2, + setattr: ll_setattr, + revalidate: ll_inode_revalidate, }; diff --git 
a/lustre/llite/rw.c b/lustre/llite/rw.c index e1402d1..ab3ff86 100644 --- a/lustre/llite/rw.c +++ b/lustre/llite/rw.c @@ -3,7 +3,7 @@ * * Lustre Lite I/O Page Cache * - * Copyright (c) 2001, 2002 Cluster File Systems, Inc. + * Copyright (c) 2001-2003 Cluster File Systems, Inc. * * This file is part of Lustre, http://www.lustre.org. * @@ -120,7 +120,7 @@ static int ll_brw(int cmd, struct inode *inode, struct page *page, int create) pg.flag = create ? OBD_BRW_CREATE : 0; set->brw_callback = ll_brw_sync_wait; - rc = obd_brw(cmd, ll_i2obdconn(inode), lsm, 1, &pg, set); + rc = obd_brw(cmd, ll_i2obdconn(inode), lsm, 1, &pg, set, NULL); if (rc) { if (rc != -EIO) CERROR("error from obd_brw: rc = %d\n", rc); @@ -195,7 +195,7 @@ void ll_truncate(struct inode *inode) /* truncate == punch from new size to absolute end of file */ err = obd_punch(ll_i2obdconn(inode), &oa, lsm, inode->i_size, - OBD_OBJECT_EOF); + OBD_OBJECT_EOF, NULL); if (err) CERROR("obd_truncate fails (%d) ino %lu\n", err, inode->i_ino); else @@ -232,10 +232,24 @@ static int ll_prepare_write(struct file *file, struct page *page, unsigned from, if (from == 0 && to == PAGE_SIZE) RETURN(0); - /* We are writing to a new page, no need to read old data */ + /* If we are writing to a new page, no need to read old data. If we + * haven't already gotten the file size in ll_file_write() since + * we got our extent lock, we need to verify it here before we + * overwrite some other node's write (bug 445).
+ */ if (inode->i_size <= offset) { - memset(addr, 0, PAGE_SIZE); - GOTO(prepare_done, rc=0); + if (!S_ISBLK(inode->i_mode) && !(file->f_flags & O_APPEND)) { + struct ll_file_data *fd = file->private_data; + struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd; + + rc = ll_file_size(inode, lsm, &fd->fd_osthandle); + if (rc) + GOTO(prepare_done, rc); + } + if (inode->i_size <= offset) { + memset(addr, 0, PAGE_SIZE); + GOTO(prepare_done, rc=0); + } } rc = ll_brw(OBD_BRW_READ, inode, page, 0); @@ -244,7 +258,9 @@ static int ll_prepare_write(struct file *file, struct page *page, unsigned from, prepare_done: if (!rc) SetPageUptodate(page); - + else + kunmap (page); + return rc; } @@ -307,7 +323,7 @@ static int ll_commit_write(struct file *file, struct page *page, pg.off, pg.count); set->brw_callback = ll_brw_sync_wait; - rc = obd_brw(OBD_BRW_WRITE, ll_i2obdconn(inode), md, 1, &pg, set); + rc = obd_brw(OBD_BRW_WRITE, ll_i2obdconn(inode), md, 1, &pg, set, NULL); if (rc) CERROR("error from obd_brw: rc = %d\n", rc); else { @@ -368,7 +384,7 @@ static int ll_direct_IO(int rw, struct inode *inode, struct kiobuf *iobuf, set->brw_callback = ll_brw_sync_wait; rc = obd_brw(rw == WRITE ? OBD_BRW_WRITE : OBD_BRW_READ, - ll_i2obdconn(inode), lsm, bufs_per_obdo, pga, set); + ll_i2obdconn(inode), lsm, bufs_per_obdo, pga, set, NULL); if (rc) CERROR("error from obd_brw: rc = %d\n", rc); else { diff --git a/lustre/llite/super.c b/lustre/llite/super.c index 73b6ea5..8df74f1 100644 --- a/lustre/llite/super.c +++ b/lustre/llite/super.c @@ -3,10 +3,22 @@ * * Lustre Light Super operations * - * This code is issued under the GNU General Public License. - * See the file COPYING in this distribution + * Copyright (c) 2002, 2003 Cluster File Systems, Inc. * - * Copryright (C) 2002 Cluster File Systems, Inc. + * This file is part of Lustre, http://www.lustre.org. 
+ * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #define DEBUG_SUBSYSTEM S_LLITE @@ -27,12 +39,17 @@ extern struct address_space_operations ll_aops; extern struct address_space_operations ll_dir_aops; struct super_operations ll_super_operations; +/* /proc/lustre/llite root that tracks llite mount points */ +struct proc_dir_entry *proc_lustre_fs_root; +/* lproc_llite.c */ +extern int lprocfs_register_mountpoint(struct proc_dir_entry *parent, + struct super_block *sb, + char *osc, char *mdc); + extern int ll_recover(struct recovd_data *, int); extern int ll_commitcbd_setup(struct ll_sb_info *); extern int ll_commitcbd_cleanup(struct ll_sb_info *); -extern void ll_proc_namespace(struct super_block* sb, char* osc, char* mdc); - static char *ll_read_opt(const char *opt, char *data) { char *value; @@ -110,6 +127,7 @@ static struct super_block *ll_read_super(struct super_block *sb, struct ptlrpc_connection *mdc_conn; struct ll_read_inode2_cookie lic; class_uuid_t uuid; + struct obd_uuid param_uuid; ENTRY; @@ -120,7 +138,7 @@ static struct super_block *ll_read_super(struct super_block *sb, INIT_LIST_HEAD(&sbi->ll_conn_chain); INIT_LIST_HEAD(&sbi->ll_orphan_dentry_list); generate_random_uuid(uuid); - class_uuid_unparse(uuid, sbi->ll_sb_uuid); + class_uuid_unparse(uuid, &sbi->ll_sb_uuid); sb->u.generic_sbp = sbi; @@ -136,13 +154,14 @@ static struct super_block *ll_read_super(struct super_block *sb, 
GOTO(out_free, sb = NULL); } - obd = class_uuid2obd(mdc); + strncpy(param_uuid.uuid, mdc, sizeof(param_uuid.uuid)); + obd = class_uuid2obd(&param_uuid); if (!obd) { CERROR("MDC %s: not setup or attached\n", mdc); GOTO(out_free, sb = NULL); } - err = obd_connect(&sbi->ll_mdc_conn, obd, sbi->ll_sb_uuid, + err = obd_connect(&sbi->ll_mdc_conn, obd, &sbi->ll_sb_uuid, ptlrpc_recovd, ll_recover); if (err) { CERROR("cannot connect to %s: rc = %d\n", mdc, err); @@ -152,13 +171,14 @@ static struct super_block *ll_read_super(struct super_block *sb, mdc_conn = sbi2mdc(sbi)->cl_import.imp_connection; list_add(&mdc_conn->c_sb_chain, &sbi->ll_conn_chain); - obd = class_uuid2obd(osc); + strncpy(param_uuid.uuid, osc, sizeof(param_uuid.uuid)); + obd = class_uuid2obd(&param_uuid); if (!obd) { CERROR("OSC %s: not setup or attached\n", osc); GOTO(out_mdc, sb = NULL); } - err = obd_connect(&sbi->ll_osc_conn, obd, sbi->ll_sb_uuid, + err = obd_connect(&sbi->ll_osc_conn, obd, &sbi->ll_sb_uuid, ptlrpc_recovd, ll_recover); if (err) { CERROR("cannot connect to %s: rc = %d\n", osc, err); @@ -215,7 +235,13 @@ static struct super_block *ll_read_super(struct super_block *sb, ptlrpc_req_finished(request); request = NULL; - ll_proc_namespace(sb, osc, mdc); + + if (proc_lustre_fs_root) { + err = lprocfs_register_mountpoint(proc_lustre_fs_root, sb, + osc, mdc); + if (err < 0) + CERROR("could not register mount in /proc/lustre"); + } out_dev: if (mdc) @@ -257,8 +283,10 @@ static void ll_put_super(struct super_block *sb) */ mdc_getstatus(&sbi->ll_mdc_conn, &rootfid); - lprocfs_dereg_mnt(sbi->ll_proc_root); - sbi->ll_proc_root = NULL; + if (sbi->ll_proc_root) { + lprocfs_remove(sbi->ll_proc_root); + sbi->ll_proc_root = NULL; + } obd_disconnect(&sbi->ll_mdc_conn); @@ -303,13 +331,15 @@ static void ll_clear_inode(struct inode *inode) obd_free_memmd(&sbi->ll_osc_conn, &lli->lli_smd); if (lli->lli_symlink_name) { - OBD_FREE(lli->lli_symlink_name,strlen(lli->lli_symlink_name)+1); + OBD_FREE(lli->lli_symlink_name, +
strlen(lli->lli_symlink_name) + 1); lli->lli_symlink_name = NULL; } EXIT; } +#if 0 static void ll_delete_inode(struct inode *inode) { ENTRY; @@ -335,19 +365,21 @@ static void ll_delete_inode(struct inode *inode) oa->o_id = lsm->lsm_object_id; obdo_from_inode(oa, inode, OBD_MD_FLID | OBD_MD_FLTYPE); - err = obd_destroy(ll_i2obdconn(inode), oa, lsm); + err = obd_destroy(ll_i2obdconn(inode), oa, lsm, NULL); obdo_free(oa); if (err) - CDEBUG(D_SUPER, "obd destroy objid "LPX64" error %d\n", - lsm->lsm_object_id, err); + CDEBUG(D_INODE, + "inode %lu obd_destroy objid "LPX64" error %d\n", + inode->i_ino, lsm->lsm_object_id, err); } out: clear_inode(inode); EXIT; } +#endif /* like inode_setattr, but doesn't mark the inode dirty */ -static int ll_attr2inode(struct inode * inode, struct iattr * attr, int trunc) +static int ll_attr2inode(struct inode *inode, struct iattr *attr, int trunc) { unsigned int ia_valid = attr->ia_valid; int error = 0; @@ -393,11 +425,30 @@ int ll_inode_setattr(struct inode *inode, struct iattr *attr, int do_trunc) */ attr->ia_valid &= ~ATTR_SIZE; if (attr->ia_valid) { - err = mdc_setattr(&sbi->ll_mdc_conn, inode, attr, &request); + err = mdc_setattr(&sbi->ll_mdc_conn, inode, attr, NULL, 0, + &request); if (err) - CERROR("mdc_setattr fails (%d)\n", err); + CERROR("mdc_setattr fails: err = %d\n", err); ptlrpc_req_finished(request); + if (S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_MTIME_SET) { + struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd; + struct obdo oa; + int err2; + + CDEBUG(D_INODE, "set mtime on OST inode %lu to %lu\n", + inode->i_ino, attr->ia_mtime); + oa.o_id = lsm->lsm_object_id; + oa.o_mode = S_IFREG; + oa.o_valid = OBD_MD_FLID |OBD_MD_FLTYPE |OBD_MD_FLMTIME; + oa.o_mtime = attr->ia_mtime; + err2 = obd_setattr(&sbi->ll_osc_conn, &oa, lsm, NULL); + if (err2) { + CERROR("obd_setattr fails: rc=%d\n", err); + if (!err) + err = err2; + } + } } RETURN(err); @@ -461,8 +512,14 @@ out: RETURN(rc); } -void ll_update_inode(struct inode 
*inode, struct mds_body *body) +void ll_update_inode(struct inode *inode, struct mds_body *body, + struct lov_mds_md *lmm) { + struct ll_inode_info *lli = ll_i2info(inode); + + if (lmm != NULL) + obd_unpackmd(ll_i2obdconn(inode), &lli->lli_smd, lmm); + if (body->valid & OBD_MD_FLID) inode->i_ino = body->ino; if (body->valid & OBD_MD_FLATIME) @@ -489,6 +546,8 @@ void ll_update_inode(struct inode *inode, struct mds_body *body) inode->i_rdev = body->rdev; if (body->valid & OBD_MD_FLSIZE) inode->i_size = body->size; + if (body->valid & OBD_MD_FLBLOCKS) + inode->i_blocks = body->blocks; } static void ll_read_inode2(struct inode *inode, void *opaque) @@ -501,18 +560,16 @@ static void ll_read_inode2(struct inode *inode, void *opaque) sema_init(&lli->lli_open_sem, 1); atomic_set(&lli->lli_open_count, 0); - /* core attributes first */ - ll_update_inode(inode, body); - LASSERT(!lli->lli_smd); - if (lic && lic->lic_lmm) - obd_unpackmd(ll_i2obdconn(inode), &lli->lli_smd, lic->lic_lmm); + + /* core attributes first */ + ll_update_inode(inode, body, lic ? 
lic->lic_lmm : NULL); /* Get the authoritative file size */ if (lli->lli_smd && (inode->i_mode & S_IFREG)) { int rc; LASSERT(lli->lli_smd->lsm_object_id != 0); - rc = ll_file_size(inode, lli->lli_smd); + rc = ll_file_size(inode, lli->lli_smd, NULL); if (rc) { CERROR("ll_file_size: %d\n", rc); ll_clear_inode(inode); @@ -536,6 +593,7 @@ static void ll_read_inode2(struct inode *inode, void *opaque) inode->i_op = &ll_fast_symlink_inode_operations; EXIT; } else { + inode->i_op = &ll_special_inode_operations; init_special_inode(inode, inode->i_mode, inode->i_rdev); EXIT; } @@ -549,7 +607,7 @@ static inline void invalidate_request_list(struct list_head *req_list) list_entry(tmp, struct ptlrpc_request, rq_list); CERROR("invalidating req xid "LPU64" op %d to %s:%d\n", req->rq_xid, req->rq_reqmsg->opc, - req->rq_connection->c_remote_uuid, + req->rq_connection->c_remote_uuid.uuid, req->rq_import->imp_client->cli_request_portal); req->rq_flags |= PTL_RPC_FL_ERR; wake_up(&req->rq_wait_for_rep); @@ -584,7 +642,7 @@ struct super_operations ll_super_operations = { read_inode2: ll_read_inode2, clear_inode: ll_clear_inode, - delete_inode: ll_delete_inode, + // delete_inode: ll_delete_inode, put_super: ll_put_super, statfs: ll_statfs, umount_begin: ll_umount_begin @@ -599,12 +657,16 @@ static struct file_system_type lustre_lite_fs_type = { static int __init init_lustre_lite(void) { - printk(KERN_INFO "Lustre Lite 0.5.14, info@clusterfs.com\n"); + printk(KERN_INFO "Lustre Lite Client File System; " + "info@clusterfs.com\n"); ll_file_data_slab = kmem_cache_create("ll_file_data", sizeof(struct ll_file_data), 0, SLAB_HWCACHE_ALIGN, NULL, NULL); if (ll_file_data_slab == NULL) return -ENOMEM; + + proc_lustre_fs_root = proc_lustre_root ? 
proc_mkdir("llite", proc_lustre_root) : NULL; + return register_filesystem(&lustre_lite_fs_type); } @@ -612,10 +674,15 @@ static void __exit exit_lustre_lite(void) { unregister_filesystem(&lustre_lite_fs_type); kmem_cache_destroy(ll_file_data_slab); + + if (proc_lustre_fs_root) { + lprocfs_remove(proc_lustre_fs_root); + proc_lustre_fs_root = NULL; + } } MODULE_AUTHOR("Cluster File Systems, Inc. "); -MODULE_DESCRIPTION("Lustre Lite Client File System v1.0"); +MODULE_DESCRIPTION("Lustre Lite Client File System"); MODULE_LICENSE("GPL"); module_init(init_lustre_lite); diff --git a/lustre/llite/super25.c b/lustre/llite/super25.c index 557d715..fad4a4d 100644 --- a/lustre/llite/super25.c +++ b/lustre/llite/super25.c @@ -3,10 +3,22 @@ * * Lustre Light Super operations * - * This code is issued under the GNU General Public License. - * See the file COPYING in this distribution + * Copyright (c) 2002, 2003 Cluster File Systems, Inc. * - * Copryright (C) 2002 Cluster File Systems, Inc. + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ #define DEBUG_SUBSYSTEM S_LLITE @@ -27,6 +39,13 @@ extern struct address_space_operations ll_aops; extern struct address_space_operations ll_dir_aops; struct super_operations ll_super_operations; +/* /proc/lustre/llite root that tracks llite mount points */ +struct proc_dir_entry *proc_lustre_fs_root; +/* lproc_llite.c */ +extern int lprocfs_register_mountpoint(struct proc_dir_entry *parent, + struct super_block *sb, + char *osc, char *mdc); + extern int ll_init_inodecache(void); extern void ll_destroy_inodecache(void); extern int ll_recover(struct recovd_data *, int); @@ -34,7 +53,7 @@ extern int ll_commitcbd_setup(struct ll_sb_info *); extern int ll_commitcbd_cleanup(struct ll_sb_info *); int ll_read_inode2(struct inode *inode, void *opaque); -extern void ll_proc_namespace(struct super_block* sb, char* osc, char* mdc) +extern int ll_proc_namespace(struct super_block* sb, char* osc, char* mdc) static char *ll_read_opt(const char *opt, char *data) { @@ -216,7 +235,14 @@ static int ll_fill_super(struct super_block *sb, void *data, int silent) ptlrpc_req_finished(request); request = NULL; - ll_proc_namespace(sb, osc, mdc) + + if (proc_lustre_fs_root) { + err = lprocfs_register_mountpoint(proc_lustre_fs_root, sb, + osc, mdc); + if (err < 0) + CERROR("could not register mount in /proc/lustre"); + } + out_dev: if (mdc) OBD_FREE(mdc, strlen(mdc) + 1); @@ -262,8 +288,10 @@ static void ll_put_super(struct super_block *sb) */ mdc_getstatus(&sbi->ll_mdc_conn, &rootfid); - lprocfs_dereg_mnt(sbi->ll_proc_root); + if (sbi->ll_proc_root) { + lprocfs_remove(sbi->ll_proc_root); sbi->ll_proc_root = NULL; + } obd_disconnect(&sbi->ll_mdc_conn); OBD_FREE(sbi, sizeof(*sbi)); @@ -397,9 +425,26 @@ int ll_inode_setattr(struct inode *inode, struct iattr *attr, int do_trunc) if (attr->ia_valid) { err = mdc_setattr(&sbi->ll_mdc_conn, inode, attr, &request); if (err) - CERROR("mdc_setattr fails (%d)\n", err); + CERROR("mdc_setattr fails: err = %d\n", err); ptlrpc_req_finished(request); + 
if (S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_MTIME_SET) { + struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd; + struct obdo oa; + int err; + + CDEBUG(D_ERROR, "setting mtime on OST\n"); + oa.o_id = lsm->lsm_object_id; + oa.o_mode = S_IFREG; + oa.o_valid = OBD_MD_FLID |OBD_MD_FLTYPE |OBD_MD_FLMTIME; + oa.o_mtime = attr->ia_mtime; + err = obd_setattr(&sbi->ll_osc_conn, &oa, lsm); + if (err) { + CERROR("obd_setattr fails: rc=%d\n", err); + if (!rc) + rc = err; + } + } } RETURN(err); @@ -463,8 +508,14 @@ out: RETURN(rc); } -void ll_update_inode(struct inode *inode, struct mds_body *body) +void ll_update_inode(struct inode *inode, struct mds_body *body, + struct lov_mds_md *lmm) { + struct ll_inode_info *lli = ll_i2info(inode); + + if (lmm != NULL) + obd_unpackmd(ll_i2obdconn(inode), &lli->lli_smd, lmm); + if (body->valid & OBD_MD_FLID) inode->i_ino = body->ino; if (body->valid & OBD_MD_FLATIME) @@ -491,6 +542,8 @@ void ll_update_inode(struct inode *inode, struct mds_body *body) inode->i_rdev = to_kdev_t(body->rdev); if (body->valid & OBD_MD_FLSIZE) inode->i_size = body->size; + if (body->valid & OBD_MD_FLBLOCKS) + inode->i_blocks = body->blocks; } int ll_read_inode2(struct inode *inode, void *opaque) @@ -503,16 +556,14 @@ int ll_read_inode2(struct inode *inode, void *opaque) sema_init(&lli->lli_open_sem, 1); - /* core attributes first */ - ll_update_inode(inode, body); - LASSERT(!lli->lli_smd); - if (lic && lic->lic_lmm) - obd_unpackmd(ll_i2obdconn(inode), &lli->lli_smd, lic->lic_lmm); + + /* core attributes first */ + ll_update_inode(inode, body, lic ? 
lic->lic_lmm : NULL); /* Get the authoritative file size */ if (lli->lli_smd && S_ISREG(inode->i_mode)) { - rc = ll_file_size(inode, lli->lli_smd); + rc = ll_file_size(inode, lli->lli_smd, NULL); if (rc) { CERROR("ll_file_size: %d\n", rc); ll_clear_inode(inode); @@ -652,7 +703,8 @@ struct file_system_type lustre_lite_fs_type = { static int __init init_lustre_lite(void) { int rc; - printk(KERN_INFO "Lustre Lite 0.5.14, info@clusterfs.com\n"); + printk(KERN_INFO "Lustre Lite Client File System; " + "info@clusterfs.com\n"); rc = ll_init_inodecache(); if (rc) return -ENOMEM; @@ -663,6 +715,10 @@ static int __init init_lustre_lite(void) ll_destroy_inodecache(); return -ENOMEM; } + + proc_lustre_fs_root = proc_lustre_root ? + proc_mkdir("llite", proc_lustre_root) : NULL; + return register_filesystem(&lustre_lite_fs_type); } @@ -671,10 +727,14 @@ static void __exit exit_lustre_lite(void) unregister_filesystem(&lustre_lite_fs_type); ll_destroy_inodecache(); kmem_cache_destroy(ll_file_data_slab); + if (proc_lustre_fs_root) { + lprocfs_remove(proc_lustre_fs_root); + proc_lustre_fs_root = NULL; + } } MODULE_AUTHOR("Cluster File Systems, Inc. 
"); -MODULE_DESCRIPTION("Lustre Lite Client File System v1.0"); +MODULE_DESCRIPTION("Lustre Lite Client File System"); MODULE_LICENSE("GPL"); module_init(init_lustre_lite); diff --git a/lustre/llite/symlink.c b/lustre/llite/symlink.c index 5be4717..3c9d646 100644 --- a/lustre/llite/symlink.c +++ b/lustre/llite/symlink.c @@ -101,8 +101,8 @@ static int ll_follow_link(struct dentry *dentry, struct nameidata *nd, } down(&lli->lli_open_sem); - rc = ll_readlink_internal(inode, &request, &symname); + up(&lli->lli_open_sem); if (rc) GOTO(out, rc); @@ -113,15 +113,16 @@ static int ll_follow_link(struct dentry *dentry, struct nameidata *nd, rc = vfs_follow_link_it(nd, symname, it); out: - up(&lli->lli_open_sem); ptlrpc_req_finished(request); RETURN(rc); } +extern int ll_inode_revalidate(struct dentry *dentry); extern int ll_setattr(struct dentry *de, struct iattr *attr); struct inode_operations ll_fast_symlink_inode_operations = { readlink: ll_readlink, setattr: ll_setattr, - follow_link2: ll_follow_link + follow_link2: ll_follow_link, + revalidate: ll_inode_revalidate }; diff --git a/lustre/lov/Makefile.am b/lustre/lov/Makefile.am index 2320dcc..2070b01 100644 --- a/lustre/lov/Makefile.am +++ b/lustre/lov/Makefile.am @@ -8,8 +8,11 @@ DEFS= MODULE = lov modulefs_DATA = lov.o EXTRA_PROGRAMS = lov -LINX= +LINX=client.c lov_SOURCES = lov_obd.c lov_pack.c lproc_lov.c $(LINX) +client.c: + test -e client.c || ln -sf $(top_srcdir)/lib/client.c + include $(top_srcdir)/Rules diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c index 7135743..3e6b2d2 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -1,15 +1,25 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * lov/lov.c - * - * Copyright (C) 2002 Cluster File Systems, Inc. - * Author: Phil Schwan + * Copyright (C) 2002, 2003 Cluster File Systems, Inc. 
+ * Author: Phil Schwan * Peter Braam - * Mike Shaver + * Mike Shaver + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. * - * This code is issued under the GNU General Public License. - * See the file COPYING in this distribution + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #define EXPORT_SYMTAB @@ -31,8 +41,6 @@ #include #include -extern struct lprocfs_vars status_var_nm_1[]; -extern struct lprocfs_vars status_class_var[]; static kmem_cache_t *lov_file_cache; @@ -60,16 +68,19 @@ extern int lov_getstripe(struct lustre_handle *conn, struct lov_mds_md *lmmu, /* obd methods */ int lov_attach(struct obd_device *dev, obd_count len, void *data) { - return lprocfs_reg_obd(dev, status_var_nm_1, dev); + struct lprocfs_static_vars lvars; + + lprocfs_init_vars(&lvars); + return lprocfs_obd_attach(dev, lvars.obd_vars); } int lov_detach(struct obd_device *dev) { - return lprocfs_dereg_obd(dev); + return lprocfs_obd_detach(dev); } static int lov_connect(struct lustre_handle *conn, struct obd_device *obd, - obd_uuid_t cluuid, struct recovd_obd *recovd, + struct obd_uuid *cluuid, struct recovd_obd *recovd, ptlrpc_recovery_cb_t recover) { struct ptlrpc_request *req = NULL; @@ -78,7 +89,9 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd, struct lov_desc *desc = &lov->desc; struct obd_export *exp; struct lustre_handle mdc_conn; - obd_uuid_t *uuidarray; + struct obd_uuid 
lov_mds_uuid = {"LOV_MDS_UUID"}; + struct obd_uuid uuid; + char *tmp; int rc, rc2, i; ENTRY; @@ -97,7 +110,7 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd, INIT_LIST_HEAD(&exp->exp_lov_data.led_open_head); /* retrieve LOV metadata from MDS */ - rc = obd_connect(&mdc_conn, lov->mdcobd, NULL, recovd, recover); + rc = obd_connect(&mdc_conn, lov->mdcobd, &lov_mds_uuid, recovd,recover); if (rc) { CERROR("cannot connect to mdc: rc = %d\n", rc); GOTO(out_conn, rc); @@ -125,14 +138,15 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd, memcpy(desc, lustre_msg_buf(req->rq_repmsg, 0), sizeof(*desc)); lov_unpackdesc(desc); - if (req->rq_repmsg->buflens[1] < sizeof(*uuidarray)*desc->ld_tgt_count){ + if (req->rq_repmsg->buflens[1] < sizeof(uuid.uuid)*desc->ld_tgt_count){ CERROR("LOV desc: invalid uuid array returned\n"); GOTO(out_conn, rc = -EINVAL); } - if (memcmp(obd->obd_uuid, desc->ld_uuid, sizeof(desc->ld_uuid))) { + if (memcmp(obd->obd_uuid.uuid, desc->ld_uuid.uuid, + sizeof(desc->ld_uuid.uuid))) { CERROR("LOV desc: uuid %s not on mds device (%s)\n", - obd->obd_uuid, desc->ld_uuid); + obd->obd_uuid.uuid, desc->ld_uuid.uuid); GOTO(out_conn, rc = -EINVAL); } @@ -163,37 +177,40 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd, GOTO(out_conn, rc = -ENOMEM); } - uuidarray = lustre_msg_buf(req->rq_repmsg, 1); - for (i = 0; i < desc->ld_tgt_count; i++) - memcpy(lov->tgts[i].uuid, uuidarray[i], sizeof(*uuidarray)); - + tmp = lustre_msg_buf(req->rq_repmsg, 1); for (i = 0; i < desc->ld_tgt_count; i++) { - struct obd_device *tgt = client_tgtuuid2obd(uuidarray[i]); + struct obd_device *tgt; + struct obd_uuid lov_osc_uuid = { "LOV_OSC_UUID" }; + + strncpy(uuid.uuid, tmp, sizeof(uuid.uuid)); + memcpy(&lov->tgts[i].uuid, &uuid, sizeof(uuid)); + tgt = client_tgtuuid2obd(&uuid); + tmp += sizeof(uuid.uuid); if (!tgt) { - CERROR("Target %s not attached\n", uuidarray[i]); + CERROR("Target %s not attached\n", 
uuid.uuid); GOTO(out_disc, rc = -EINVAL); } if (!(tgt->obd_flags & OBD_SET_UP)) { - CERROR("Target %s not set up\n", uuidarray[i]); + CERROR("Target %s not set up\n", uuid.uuid); GOTO(out_disc, rc = -EINVAL); } - rc = obd_connect(&lov->tgts[i].conn, tgt, NULL, recovd, + rc = obd_connect(&lov->tgts[i].conn, tgt, &lov_osc_uuid, recovd, recover); if (rc) { - CERROR("Target %s connect error %d\n", uuidarray[i], + CERROR("Target %s connect error %d\n", uuid.uuid, rc); GOTO(out_disc, rc); } - + rc = obd_iocontrol(IOC_OSC_REGISTER_LOV, &lov->tgts[i].conn, sizeof(struct obd_device *), obd, NULL); if (rc) { CERROR("Target %s REGISTER_LOV error %d\n", - uuidarray[i], rc); + uuid.uuid, rc); GOTO(out_disc, rc); } @@ -212,10 +229,11 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd, while (i-- > 0) { desc->ld_active_tgt_count--; lov->tgts[i].active = 0; + memcpy(&uuid, &lov->tgts[i].uuid, sizeof(uuid)); rc2 = obd_disconnect(&lov->tgts[i].conn); if (rc2) - CERROR("LOV Target %s disconnect error: rc = %d\n", - uuidarray[i], rc2); + CERROR("error: LOV target %s disconnect on OST idx %d: " + "rc = %d\n", uuid.uuid, i, rc2); } OBD_FREE(lov->tgts, lov->bufsize); out_conn: @@ -244,7 +262,7 @@ static int lov_disconnect(struct lustre_handle *conn) if (rc) { if (lov->tgts[i].active) { CERROR("Target %s disconnect error %d\n", - lov->tgts[i].uuid, rc); + lov->tgts[i].uuid.uuid, rc); } rc = 0; } @@ -284,7 +302,7 @@ static int lov_disconnect(struct lustre_handle *conn) * -EBADF : The UUID is found, but the OBD is the wrong type (!) 
* -EALREADY: The OSC is already marked (in)active */ -static int lov_set_osc_active(struct lov_obd *lov, obd_uuid_t uuid, +static int lov_set_osc_active(struct lov_obd *lov, struct obd_uuid *uuid, int activate) { struct obd_device *obd; @@ -293,13 +311,13 @@ static int lov_set_osc_active(struct lov_obd *lov, obd_uuid_t uuid, ENTRY; CDEBUG(D_INFO, "Searching in lov %p for uuid %s (activate=%d)\n", - lov, uuid, activate); + lov, uuid->uuid, activate); spin_lock(&lov->lov_lock); for (i = 0, tgt = lov->tgts; i < lov->desc.ld_tgt_count; i++, tgt++) { CDEBUG(D_INFO, "lov idx %d is %s conn "LPX64"\n", - i, tgt->uuid, tgt->conn.addr); - if (strncmp(uuid, tgt->uuid, sizeof(tgt->uuid)) == 0) + i, tgt->uuid.uuid, tgt->conn.addr); + if (strncmp(uuid->uuid, tgt->uuid.uuid, sizeof(uuid->uuid)) == 0) break; } @@ -313,7 +331,7 @@ static int lov_set_osc_active(struct lov_obd *lov, obd_uuid_t uuid, } CDEBUG(D_INFO, "Found OBD %s=%s device %d (%p) type %s at LOV idx %d\n", - obd->obd_name, obd->obd_uuid, obd->obd_minor, obd, + obd->obd_name, obd->obd_uuid.uuid, obd->obd_minor, obd, obd->obd_type->typ_name, i); if (strcmp(obd->obd_type->typ_name, "osc") != 0) { LBUG(); @@ -359,6 +377,7 @@ static int lov_setup(struct obd_device *obd, obd_count len, void *buf) { struct obd_ioctl_data *data = buf; struct lov_obd *lov = &obd->u.lov; + struct obd_uuid uuid; int rc = 0; ENTRY; @@ -373,9 +392,10 @@ static int lov_setup(struct obd_device *obd, obd_count len, void *buf) } spin_lock_init(&lov->lov_lock); - lov->mdcobd = class_uuid2obd(data->ioc_inlbuf1); + obd_str2uuid(&uuid, data->ioc_inlbuf1); + lov->mdcobd = class_uuid2obd(&uuid); if (!lov->mdcobd) { - CERROR("LOV %s cannot locate MDC %s\n", obd->obd_uuid, + CERROR("LOV %s cannot locate MDC %s\n", obd->obd_uuid.uuid, data->ioc_inlbuf1); rc = -EINVAL; } @@ -401,7 +421,7 @@ static struct lov_file_handles *lov_handle2lfh(struct lustre_handle *handle) /* the LOV expects oa->o_id to be set to the LOV object id */ static int lov_create(struct 
lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md **ea) + struct lov_stripe_md **ea, struct obd_trans_info *oti) { struct obd_export *export = class_conn2export(conn); struct lov_obd *lov; @@ -448,9 +468,9 @@ static int lov_create(struct lustre_handle *conn, struct obdo *oa, if (!*ea || lsm->lsm_stripe_offset >= ost_count) { int mult = lsm->lsm_object_id * lsm->lsm_stripe_count; int stripe_offset = mult % ost_count; - int sub_offset = (mult / ost_count) % lsm->lsm_stripe_count; + int sub_offset = (mult / ost_count); - ost_idx = stripe_offset + sub_offset; + ost_idx = (stripe_offset + sub_offset) % ost_count; } else ost_idx = lsm->lsm_stripe_offset; @@ -471,12 +491,17 @@ static int lov_create(struct lustre_handle *conn, struct obdo *oa, /* create data objects with "parent" OA */ memcpy(tmp, oa, sizeof(*tmp)); /* XXX: LOV STACKING: use real "obj_mdp" sub-data */ - err = obd_create(&lov->tgts[ost_idx].conn, tmp, &obj_mdp); + err = obd_create(&lov->tgts[ost_idx].conn, tmp, &obj_mdp, oti); if (err) { if (lov->tgts[ost_idx].active) { CERROR("error creating objid "LPX64" sub-object" - "on OST idx %d: rc = %d\n", - oa->o_id, ost_idx, err); + " on OST idx %d/%d: rc = %d\n", oa->o_id, + ost_idx, lsm->lsm_stripe_count, err); + if (err > 0) { + CERROR("obd_create returned invalid " + "err %d\n", err); + err = -EIO; + } if (!rc) rc = err; } @@ -525,14 +550,14 @@ static int lov_create(struct lustre_handle *conn, struct obdo *oa, return rc; out_cleanup: - while (i-- > 0) { + while (obj_alloc-- > 0) { int err; --loi; /* destroy already created objects here */ memcpy(tmp, oa, sizeof(*tmp)); tmp->o_id = loi->loi_id; - err = obd_destroy(&lov->tgts[loi->loi_ost_idx].conn, tmp, NULL); + err = obd_destroy(&lov->tgts[loi->loi_ost_idx].conn, tmp, NULL, NULL); if (err) CERROR("Failed to uncreate objid "LPX64" subobj " LPX64" on OST idx %d: rc = %d\n", @@ -545,7 +570,7 @@ static int lov_create(struct lustre_handle *conn, struct obdo *oa, } static int lov_destroy(struct 
lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md *lsm) + struct lov_stripe_md *lsm, struct obd_trans_info *oti) { struct obdo tmp; struct obd_export *export = class_conn2export(conn); @@ -589,9 +614,9 @@ static int lov_destroy(struct lustre_handle *conn, struct obdo *oa, else tmp.o_valid &= ~OBD_MD_FLHANDLE; err = obd_destroy(&lov->tgts[loi->loi_ost_idx].conn, &tmp, - NULL); + NULL, NULL); if (err && lov->tgts[loi->loi_ost_idx].active) { - CERROR("Error destroying objid "LPX64" subobj " + CERROR("error: destroying objid "LPX64" subobj " LPX64" on OST idx %d\n: rc = %d", oa->o_id, loi->loi_id, loi->loi_ost_idx, err); if (!rc) @@ -625,14 +650,9 @@ static obd_size lov_stripe_size(struct lov_stripe_md *lsm, obd_size ost_size, } static void lov_merge_attrs(struct obdo *tgt, struct obdo *src, obd_flag valid, - struct lov_stripe_md *lsm, int stripeno, int *new) + struct lov_stripe_md *lsm, int stripeno, int *set) { - if (*new) { - obdo_cpy_md(tgt, src, valid); - if (valid & OBD_MD_FLSIZE) - tgt->o_size = lov_stripe_size(lsm,src->o_size,stripeno); - *new = 0; - } else { + if (*set) { if (valid & OBD_MD_FLSIZE) { /* this handles sparse files properly */ obd_size lov_size; @@ -647,6 +667,11 @@ static void lov_merge_attrs(struct obdo *tgt, struct obdo *src, obd_flag valid, tgt->o_ctime = src->o_ctime; if (valid & OBD_MD_FLMTIME && tgt->o_mtime < src->o_mtime) tgt->o_mtime = src->o_mtime; + } else { + obdo_cpy_md(tgt, src, valid); + if (valid & OBD_MD_FLSIZE) + tgt->o_size = lov_stripe_size(lsm,src->o_size,stripeno); + *set = 1; } } @@ -659,7 +684,7 @@ static int lov_getattr(struct lustre_handle *conn, struct obdo *oa, struct lov_oinfo *loi; struct lov_file_handles *lfh = NULL; int i; - int new = 1; + int set = 0; ENTRY; if (!lsm) { @@ -705,36 +730,31 @@ static int lov_getattr(struct lustre_handle *conn, struct obdo *oa, err = obd_getattr(&lov->tgts[loi->loi_ost_idx].conn, &tmp,NULL); if (err) { if (lov->tgts[loi->loi_ost_idx].active) { - CERROR("Error getattr 
objid "LPX64" subobj " + CERROR("error: getattr objid "LPX64" subobj " LPX64" on OST idx %d: rc = %d\n", oa->o_id, loi->loi_id, loi->loi_ost_idx, err); RETURN(err); } } else { - lov_merge_attrs(oa, &tmp, tmp.o_valid, lsm, i, &new); + lov_merge_attrs(oa, &tmp, tmp.o_valid, lsm, i, &set); } } - RETURN(0); + RETURN(set ? 0 : -EIO); } static int lov_setattr(struct lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md *lsm) + struct lov_stripe_md *lsm, struct obd_trans_info *oti) { struct obdo *tmp; struct obd_export *export = class_conn2export(conn); struct lov_obd *lov; struct lov_oinfo *loi; struct lov_file_handles *lfh = NULL; - int rc = 0, i; + int rc = 0, i, set = 0; ENTRY; - /* Note that this code is currently unused, hence LBUG(), just - * to know when/if it is ever revived that it needs cleanups. - */ - LBUG(); - if (!lsm) { CERROR("LOV requires striping ea\n"); RETURN(-EINVAL); @@ -752,6 +772,9 @@ static int lov_setattr(struct lustre_handle *conn, struct obdo *oa, /* size changes should go through punch and not setattr */ LASSERT(!(oa->o_valid & OBD_MD_FLSIZE)); + /* for now, we only expect mtime updates here */ + LASSERT(!(oa->o_valid & ~(OBD_MD_FLID |OBD_MD_FLTYPE |OBD_MD_FLMTIME))); + tmp = obdo_alloc(); if (!tmp) RETURN(-ENOMEM); @@ -763,31 +786,43 @@ static int lov_setattr(struct lustre_handle *conn, struct obdo *oa, for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) { int err; + if (lov->tgts[loi->loi_ost_idx].active == 0) { + CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); + continue; + } + obdo_cpy_md(tmp, oa, oa->o_valid); if (lfh) memcpy(obdo_handle(tmp), &lfh->lfh_handles[i], - sizeof(lfh->lfh_handles[i])); + sizeof(lfh->lfh_handles[i])); else tmp->o_valid &= ~OBD_MD_FLHANDLE; tmp->o_id = loi->loi_id; - err = obd_setattr(&lov->tgts[loi->loi_ost_idx].conn, tmp, NULL); + err = obd_setattr(&lov->tgts[loi->loi_ost_idx].conn, tmp, + NULL, NULL); if (err) { - CERROR("Error setattr objid "LPX64" subobj "LPX64 - " on OST 
idx %d: rc = %d\n", - oa->o_id, loi->loi_id, loi->loi_ost_idx, err); - if (!rc) - rc = err; - } + if (lov->tgts[loi->loi_ost_idx].active) { + CERROR("error: setattr objid "LPX64" subobj " + LPX64" on OST idx %d: rc = %d\n", + oa->o_id, loi->loi_id, loi->loi_ost_idx, + err); + if (!rc) + rc = err; + } + } else + set = 1; } obdo_free(tmp); + if (!set && !rc) + rc = -EIO; RETURN(rc); } static int lov_open(struct lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md *lsm) + struct lov_stripe_md *lsm, struct obd_trans_info *oti) { struct obdo *tmp; /* on the heap here, on the stack in lov_close? */ struct obd_export *export = class_conn2export(conn); @@ -795,7 +830,7 @@ static int lov_open(struct lustre_handle *conn, struct obdo *oa, struct lov_oinfo *loi; struct lov_file_handles *lfh = NULL; struct lustre_handle *handle; - int new = 1; + int set = 0; int rc = 0, i; ENTRY; @@ -829,7 +864,6 @@ static int lov_open(struct lustre_handle *conn, struct obdo *oa, oa->o_size = 0; oa->o_blocks = 0; for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) { - if (lov->tgts[loi->loi_ost_idx].active == 0) { CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); continue; @@ -839,10 +873,11 @@ static int lov_open(struct lustre_handle *conn, struct obdo *oa, memcpy(tmp, oa, sizeof(*tmp)); tmp->o_id = loi->loi_id; - rc = obd_open(&lov->tgts[loi->loi_ost_idx].conn, tmp, NULL); + rc = obd_open(&lov->tgts[loi->loi_ost_idx].conn, tmp, + NULL, NULL); if (rc) { if (lov->tgts[loi->loi_ost_idx].active) { - CERROR("Error open objid "LPX64" subobj "LPX64 + CERROR("error: open objid "LPX64" subobj "LPX64 " on OST idx %d: rc = %d\n", oa->o_id, lsm->lsm_oinfo[i].loi_id, loi->loi_ost_idx, rc); @@ -851,7 +886,7 @@ static int lov_open(struct lustre_handle *conn, struct obdo *oa, continue; } - lov_merge_attrs(oa, tmp, tmp->o_valid, lsm, i, &new); + lov_merge_attrs(oa, tmp, tmp->o_valid, lsm, i, &set); if (tmp->o_valid & OBD_MD_FLHANDLE) memcpy(&lfh->lfh_handles[i], 
obdo_handle(tmp), @@ -859,10 +894,10 @@ static int lov_open(struct lustre_handle *conn, struct obdo *oa, } handle = obdo_handle(oa); - + lfh->lfh_count = lsm->lsm_stripe_count; get_random_bytes(&lfh->lfh_cookie, sizeof(lfh->lfh_cookie)); - + handle->addr = (__u64)(unsigned long)lfh; handle->cookie = lfh->lfh_cookie; oa->o_valid |= OBD_MD_FLHANDLE; @@ -870,6 +905,8 @@ static int lov_open(struct lustre_handle *conn, struct obdo *oa, list_add(&lfh->lfh_list, &export->exp_lov_data.led_open_head); spin_unlock(&export->exp_lov_data.led_lock); + if (!set && !rc) + rc = -EIO; out_tmp: obdo_free(tmp); RETURN(rc); @@ -886,14 +923,15 @@ out_handles: memcpy(obdo_handle(tmp), &lfh->lfh_handles[i], sizeof(lfh->lfh_handles[i])); - err = obd_close(&lov->tgts[loi->loi_ost_idx].conn, tmp, NULL); - if (err) { - CERROR("Error closing objid "LPX64" subobj "LPX64 - " on OST idx %d after open error: rc = %d\n", + err = obd_close(&lov->tgts[loi->loi_ost_idx].conn, tmp, + NULL, NULL); + if (err && lov->tgts[loi->loi_ost_idx].active) { + CERROR("error: closing objid "LPX64" subobj "LPX64 + " on OST idx %d after open error: rc=%d\n", oa->o_id, loi->loi_id, loi->loi_ost_idx, err); } } - + OBD_FREE(lfh->lfh_handles, lsm->lsm_stripe_count * sizeof(*lfh->lfh_handles)); out_lfh: @@ -903,7 +941,7 @@ out_lfh: } static int lov_close(struct lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md *lsm) + struct lov_stripe_md *lsm, struct obd_trans_info *oti) { struct obdo tmp; struct obd_export *export = class_conn2export(conn); @@ -948,11 +986,14 @@ static int lov_close(struct lustre_handle *conn, struct obdo *oa, else tmp.o_valid &= ~OBD_MD_FLHANDLE; - err = obd_close(&lov->tgts[loi->loi_ost_idx].conn, &tmp, NULL); + err = obd_close(&lov->tgts[loi->loi_ost_idx].conn, &tmp, + NULL, NULL); if (err) { - CERROR("Error close objid "LPX64" subobj "LPX64 - " on OST idx %d: rc = %d\n", - oa->o_id, loi->loi_id, loi->loi_ost_idx, err); + if (lov->tgts[loi->loi_ost_idx].active) { + CERROR("error: close 
objid "LPX64" subobj "LPX64 + " on OST idx %d: rc = %d\n", oa->o_id, + loi->loi_id, loi->loi_ost_idx, err); + } if (!rc) rc = err; } @@ -1020,7 +1061,7 @@ static int lov_stripe_number(struct lov_stripe_md *lsm, obd_off lov_off) * that the punch will affect. */ static int lov_punch(struct lustre_handle *conn, struct obdo *oa, struct lov_stripe_md *lsm, - obd_off start, obd_off end) + obd_off start, obd_off end, struct obd_trans_info *oti) { struct obdo tmp; struct obd_export *export = class_conn2export(conn); @@ -1066,11 +1107,13 @@ static int lov_punch(struct lustre_handle *conn, struct obdo *oa, tmp.o_valid &= ~OBD_MD_FLHANDLE; err = obd_punch(&lov->tgts[loi->loi_ost_idx].conn, &tmp, NULL, - starti, endi); + starti, endi, NULL); if (err) { - CERROR("Error punch objid "LPX64" subobj "LPX64 - " on OST idx %d: rc = %d\n", - oa->o_id, loi->loi_id, loi->loi_ost_idx, err); + if (lov->tgts[loi->loi_ost_idx].active) { + CERROR("error: punch objid "LPX64" subobj "LPX64 + " on OST idx %d: rc = %d\n", oa->o_id, + loi->loi_id, loi->loi_ost_idx, err); + } if (!rc) rc = err; } @@ -1080,7 +1123,8 @@ static int lov_punch(struct lustre_handle *conn, struct obdo *oa, static inline int lov_brw(int cmd, struct lustre_handle *conn, struct lov_stripe_md *lsm, obd_count oa_bufs, - struct brw_page *pga, struct obd_brw_set *set) + struct brw_page *pga, struct obd_brw_set *set, + struct obd_trans_info *oti) { struct { int bufct; @@ -1151,7 +1195,8 @@ static inline int lov_brw(int cmd, struct lustre_handle *conn, if (si->bufct) { LASSERT(shift < oa_bufs); rc = obd_brw(cmd, &lov->tgts[si->ost_idx].conn, - &si->lsm, si->bufct, &ioarr[shift], set); + &si->lsm, si->bufct, &ioarr[shift], + set, oti); if (rc) GOTO(out_ioarr, rc); } @@ -1274,7 +1319,7 @@ static int lov_enqueue(struct lustre_handle *conn, struct lov_stripe_md *lsm, if (rc) memset(lov_lockhp, 0, sizeof(*lov_lockhp)); if (rc && lov->tgts[loi->loi_ost_idx].active) { - CERROR("Error enqueue objid "LPX64" subobj "LPX64 + CERROR("error: 
enqueue objid "LPX64" subobj "LPX64 " on OST idx %d: rc = %d\n", lsm->lsm_object_id, loi->loi_id, loi->loi_ost_idx, rc); goto out_locks; @@ -1296,9 +1341,9 @@ out_locks: submd.lsm_stripe_count = 0; err = obd_cancel(&lov->tgts[loi->loi_ost_idx].conn, &submd, mode, lov_lockhp); - if (err) { - CERROR("Error cancelling objid "LPX64 - " on OST idx %d after enqueue error: rc = %d\n", + if (err && lov->tgts[loi->loi_ost_idx].active) { + CERROR("error: cancelling objid "LPX64" on OST " + "idx %d after enqueue error: rc = %d\n", loi->loi_id, loi->loi_ost_idx, err); } } @@ -1370,7 +1415,7 @@ static int lov_cancel(struct lustre_handle *conn, struct lov_stripe_md *lsm, mode, lov_lockhp); if (err) { if (lov->tgts[loi->loi_ost_idx].active) { - CERROR("Error cancel objid "LPX64" subobj " + CERROR("error: cancel objid "LPX64" subobj " LPX64" on OST idx %d: rc = %d\n", lsm->lsm_object_id, loi->loi_id, loi->loi_ost_idx, err); @@ -1419,7 +1464,7 @@ static int lov_cancel_unused(struct lustre_handle *conn, err = obd_cancel_unused(&lov->tgts[loi->loi_ost_idx].conn, &submd, flags); if (err && lov->tgts[loi->loi_ost_idx].active) { - CERROR("Error cancel unused objid "LPX64" subobj "LPX64 + CERROR("error: cancel unused objid "LPX64" subobj "LPX64 " on OST idx %d: rc = %d\n", lsm->lsm_object_id, loi->loi_id, loi->loi_ost_idx, err); if (!rc) @@ -1456,11 +1501,14 @@ static int lov_statfs(struct lustre_handle *conn, struct obd_statfs *osfs) err = obd_statfs(&lov->tgts[i].conn, &lov_sfs); if (err) { - CERROR("Error statfs OSC %s i %d: err = %d\n", - lov->tgts[i].uuid, i, err); - if (!rc) - rc = err; - continue; /* XXX or break? 
- probably OK to continue */ + if (lov->tgts[i].active) { + CERROR("error: statfs OSC %s on OST idx %d: " + "err = %d\n", + lov->tgts[i].uuid.uuid, i, err); + if (!rc) + rc = err; + } + continue; } if (!set) { memcpy(osfs, &lov_sfs, sizeof(lov_sfs)); @@ -1480,6 +1528,8 @@ static int lov_statfs(struct lustre_handle *conn, struct obd_statfs *osfs) */ } } + if (!set && !rc) + rc = -EIO; RETURN(rc); } @@ -1489,6 +1539,7 @@ static int lov_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len, struct obd_device *obddev = class_conn2obd(conn); struct lov_obd *lov = &obddev->u.lov; int i, count = lov->desc.ld_tgt_count; + struct obd_uuid *uuidp; int rc; ENTRY; @@ -1496,14 +1547,14 @@ static int lov_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len, switch (cmd) { case IOC_LOV_SET_OSC_ACTIVE: { struct obd_ioctl_data *data = karg; - rc = lov_set_osc_active(lov,data->ioc_inlbuf1,data->ioc_offset); + uuidp = (struct obd_uuid *)data->ioc_inlbuf1; + rc = lov_set_osc_active(lov, uuidp, data->ioc_offset); break; } case OBD_IOC_LOV_GET_CONFIG: { struct obd_ioctl_data *data = karg; struct lov_tgt_desc *tgtdesc; struct lov_desc *desc; - obd_uuid_t *uuidp; char *buf = NULL; buf = NULL; @@ -1518,18 +1569,18 @@ static int lov_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len, RETURN(-EINVAL); } - if (sizeof(*uuidp) * count > data->ioc_inllen2) { + if (sizeof(uuidp->uuid) * count > data->ioc_inllen2) { OBD_FREE(buf, len); RETURN(-EINVAL); } desc = (struct lov_desc *)data->ioc_inlbuf1; - uuidp = (obd_uuid_t *)data->ioc_inlbuf2; memcpy(desc, &(lov->desc), sizeof(*desc)); + uuidp = (struct obd_uuid *)data->ioc_inlbuf2; tgtdesc = lov->tgts; for (i = 0; i < count; i++, uuidp++, tgtdesc++) - memcpy(uuidp, tgtdesc->uuid, sizeof(*uuidp)); + obd_str2uuid(uuidp, tgtdesc->uuid.uuid); rc = copy_to_user((void *)uarg, buf, len); if (rc) @@ -1543,7 +1594,8 @@ static int lov_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len, case LL_IOC_LOV_GETSTRIPE: 
rc = lov_getstripe(conn, karg, uarg); break; - default: + default: { + int set = 0; if (count == 0) RETURN(-ENOTTY); rc = 0; @@ -1552,9 +1604,20 @@ static int lov_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len, err = obd_iocontrol(cmd, &lov->tgts[i].conn, len, karg, uarg); - if (err && !rc) - rc = err; + if (err) { + if (lov->tgts[i].active) { + CERROR("error: iocontrol OSC %s on OST" + "idx %d: err = %d\n", + lov->tgts[i].uuid.uuid, i, err); + if (!rc) + rc = err; + } + } else + set = 1; } + if (!set && !rc) + rc = -EIO; + } } RETURN(rc); @@ -1584,21 +1647,21 @@ struct obd_ops lov_obd_ops = { o_iocontrol: lov_iocontrol }; - -#define LOV_VERSION "v0.1" - static int __init lov_init(void) { + struct lprocfs_static_vars lvars; int rc; - printk(KERN_INFO "Lustre Logical Object Volume driver " LOV_VERSION - ", info@clusterfs.com\n"); + + printk(KERN_INFO "Lustre Logical Object Volume driver; " + "info@clusterfs.com\n"); lov_file_cache = kmem_cache_create("ll_lov_file_data", sizeof(struct lov_file_handles), 0, 0, NULL, NULL); if (!lov_file_cache) RETURN(-ENOMEM); - rc = class_register_type(&lov_obd_ops, status_class_var, + lprocfs_init_vars(&lvars); + rc = class_register_type(&lov_obd_ops, lvars.module_vars, OBD_LOV_DEVICENAME); RETURN(rc); } @@ -1611,7 +1674,7 @@ static void __exit lov_exit(void) } MODULE_AUTHOR("Cluster File Systems, Inc. "); -MODULE_DESCRIPTION("Lustre Logical Object Volume OBD driver " LOV_VERSION); +MODULE_DESCRIPTION("Lustre Logical Object Volume OBD driver"); MODULE_LICENSE("GPL"); module_init(lov_init); diff --git a/lustre/lov/lov_pack.c b/lustre/lov/lov_pack.c index 3d4b4b8..9dc4e03 100644 --- a/lustre/lov/lov_pack.c +++ b/lustre/lov/lov_pack.c @@ -1,7 +1,8 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (C) 2002 Cluster File Systems, Inc. + * Copyright (C) 2002, 2003 Cluster File Systems, Inc. 
+ * Author: Andreas Dilger * * This file is part of Lustre, http://www.lustre.org. * @@ -260,13 +261,14 @@ int lov_setstripe(struct lustre_handle *conn, struct lov_stripe_md **lsmp, RETURN(-EINVAL); } if (lmm.lmm_stripe_count > lov->desc.ld_tgt_count) { - CERROR("stripe count %d more than OST count %d\n", - (int)lmm.lmm_stripe_count, lov->desc.ld_tgt_count); + CERROR("stripe count %u more than OST count %d\n", + lmm.lmm_stripe_count, lov->desc.ld_tgt_count); RETURN(-EINVAL); } - if (lmm.lmm_stripe_offset >= lov->desc.ld_tgt_count) { - CERROR("stripe offset %d more than max OST index %d\n", - (int)lmm.lmm_stripe_count, lov->desc.ld_tgt_count); + if (lmm.lmm_stripe_offset >= lov->desc.ld_tgt_count && + lmm.lmm_stripe_offset != 0xffffffff) { + CERROR("stripe offset %u more than max OST index %d\n", + lmm.lmm_stripe_offset, lov->desc.ld_tgt_count); RETURN(-EINVAL); } if (lmm.lmm_stripe_size & (PAGE_SIZE - 1)) { @@ -274,7 +276,7 @@ int lov_setstripe(struct lustre_handle *conn, struct lov_stripe_md **lsmp, lmm.lmm_stripe_size, PAGE_SIZE); RETURN(-EINVAL); } - if (lmm.lmm_stripe_size * lmm.lmm_stripe_count > ~0UL) { + if ((__u64)lmm.lmm_stripe_size * lmm.lmm_stripe_count > ~0UL) { CERROR("stripe width %ux%u > %lu on 32-bit system\n", lmm.lmm_stripe_size, (int)lmm.lmm_stripe_count, ~0UL); RETURN(-EINVAL); @@ -288,7 +290,6 @@ int lov_setstripe(struct lustre_handle *conn, struct lov_stripe_md **lsmp, RETURN(-ENOMEM); lsm->lsm_magic = LOV_MAGIC; - /* This is all validated in lov_create() */ lsm->lsm_stripe_count = stripe_count; lsm->lsm_stripe_offset = lmm.lmm_stripe_offset; lsm->lsm_stripe_size = lmm.lmm_stripe_size; diff --git a/lustre/lov/lproc_lov.c b/lustre/lov/lproc_lov.c index 0812e00..648f80b 100644 --- a/lustre/lov/lproc_lov.c +++ b/lustre/lov/lproc_lov.c @@ -21,64 +21,69 @@ */ #define DEBUG_SUBSYSTEM S_CLASS -#include #include +#include -/* - * Common STATUS namespace - */ +#ifndef LPROCFS +struct lprocfs_vars lprocfs_module_vars[] = { {0} }; +struct lprocfs_vars 
lprocfs_obd_vars[] = { {0} }; +#else -int rd_uuid(char *page, char **start, off_t off, int count, int *eof, - void *data) -{ - struct obd_device* dev = (struct obd_device*)data; - return snprintf(page, count, "%s\n", dev->obd_uuid); -} +DEFINE_LPROCFS_STATFS_FCT(rd_blksize, obd_self_statfs); +DEFINE_LPROCFS_STATFS_FCT(rd_kbytestotal, obd_self_statfs); +DEFINE_LPROCFS_STATFS_FCT(rd_kbytesfree, obd_self_statfs); +DEFINE_LPROCFS_STATFS_FCT(rd_filestotal, obd_self_statfs); +DEFINE_LPROCFS_STATFS_FCT(rd_filesfree, obd_self_statfs); +DEFINE_LPROCFS_STATFS_FCT(rd_filegroups, obd_self_statfs); int rd_stripesize(char *page, char **start, off_t off, int count, int *eof, void *data) { - struct obd_device *dev = (struct obd_device*)data; + struct obd_device *dev = (struct obd_device *)data; struct lov_desc *desc = &dev->u.lov.desc; + *eof = 1; return snprintf(page, count, LPU64"\n", desc->ld_default_stripe_size); } int rd_stripeoffset(char *page, char **start, off_t off, int count, int *eof, void *data) { - struct obd_device* dev = (struct obd_device*)data; - struct lov_obd* lov = &dev->u.lov; + struct obd_device *dev = (struct obd_device *)data; + struct lov_desc *desc = &dev->u.lov.desc; - return snprintf(page, count, LPU64"\n", - lov->desc.ld_default_stripe_offset); + *eof = 1; + return snprintf(page, count, LPU64"\n", desc->ld_default_stripe_offset); } int rd_stripetype(char *page, char **start, off_t off, int count, int *eof, void *data) { struct obd_device* dev = (struct obd_device*)data; - struct lov_obd* lov = &dev->u.lov; + struct lov_desc *desc = &dev->u.lov.desc; - return snprintf(page, count, "%u\n", lov->desc.ld_pattern); + *eof = 1; + return snprintf(page, count, "%u\n", desc->ld_pattern); } int rd_stripecount(char *page, char **start, off_t off, int count, int *eof, void *data) { - struct obd_device* dev = (struct obd_device*)data; - struct lov_obd* lov = &dev->u.lov; + struct obd_device *dev = (struct obd_device *)data; + struct lov_desc *desc = 
&dev->u.lov.desc; - return snprintf(page, count, "%u\n", lov->desc.ld_default_stripe_count); + *eof = 1; + return snprintf(page, count, "%u\n", desc->ld_default_stripe_count); } int rd_numobd(char *page, char **start, off_t off, int count, int *eof, void *data) { struct obd_device *dev = (struct obd_device*)data; - struct lov_obd *lov = &dev->u.lov; + struct lov_desc *desc = &dev->u.lov.desc; - return snprintf(page, count, "%u\n", lov->desc.ld_tgt_count); + *eof = 1; + return snprintf(page, count, "%u\n", desc->ld_tgt_count); } @@ -86,103 +91,64 @@ int rd_activeobd(char *page, char **start, off_t off, int count, int *eof, void *data) { struct obd_device* dev = (struct obd_device*)data; - struct lov_obd* lov = &dev->u.lov; - - return snprintf(page, count, "%u\n", lov->desc.ld_active_tgt_count); -} - -int rd_blksize(char *page, char **start, off_t off, int count, int *eof, - void *data) -{ - return 0; -} - - -int rd_kbtotal(char *page, char **start, off_t off, int count, int *eof, - void *data) -{ - return 0; -} - - -int rd_kbfree(char *page, char **start, off_t off, int count, int *eof, - void *data) -{ - return 0; -} - -int rd_filestotal(char *page, char **start, off_t off, int count, int *eof, - void *data) -{ - return 0; -} - -int rd_filesfree(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - return 0; -} + struct lov_desc *desc = &dev->u.lov.desc; -int rd_filegroups(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - return 0; + *eof = 1; + return snprintf(page, count, "%u\n", desc->ld_active_tgt_count); } int rd_target(char *page, char **start, off_t off, int count, int *eof, void *data) { - struct obd_device* dev = (struct obd_device*)data; - int len = 0, i = 0; - struct lov_obd* lov = &dev->u.lov; - struct lov_tgt_desc* tgts = lov->tgts; - while (i < lov->desc.ld_tgt_count) { - len += snprintf(&page[len], count - len, "%d: %s %sACTIVE\n", - i, tgts->uuid, tgts->active ? 
"" : "IN"); - i++; - tgts++; + struct obd_device *dev = (struct obd_device*) data; + int len = 0, i; + struct lov_obd *lov = &dev->u.lov; + struct lov_tgt_desc *tgts = lov->tgts; + + for (i = 0; i < lov->desc.ld_tgt_count; i++, tgts++) { + int cur; + cur = snprintf(&page[len], count, "%d: %s %sACTIVE\n", + i, tgts->uuid.uuid, tgts->active ? "" : "IN"); + len += cur; + count -= cur; } + *eof = 1; return len; } -int rd_mdc(char* page, char **start, off_t off, int count, int *eof, void *data) +int rd_mdc(char *page, char **start, off_t off, int count, int *eof, void *data) { - struct obd_device* dev = (struct obd_device*)data; - int len = 0; - struct lov_obd* lov = &dev->u.lov; - len += snprintf(page, count, "%s\n", lov->mdcobd->obd_uuid); - return len; -} + struct obd_device *dev = (struct obd_device*) data; + struct lov_obd *lov = &dev->u.lov; -struct lprocfs_vars status_var_nm_1[] = { - {"status/uuid", rd_uuid, 0, 0}, - {"status/stripesize",rd_stripesize, 0, 0}, - {"status/stripeoffset",rd_stripeoffset, 0, 0}, - {"status/stripecount",rd_stripecount, 0, 0}, - {"status/stripetype", rd_stripetype, 0, 0}, - {"status/numobd",rd_numobd, 0, 0}, - {"status/activeobd", rd_activeobd, 0, 0}, - {"status/filestotal", rd_filestotal, 0, 0}, - {"status/filesfree", rd_filesfree, 0, 0}, - {"status/filegroups", rd_filegroups, 0, 0}, - {"status/blocksize", rd_blksize, 0, 0}, - {"status/kbytestotal", rd_kbtotal, 0, 0}, - {"status/kbytesfree", rd_kbfree, 0, 0}, - {"status/target_obd", rd_target, 0, 0}, - {"status/target_mdc", rd_mdc, 0, 0}, - {0} + *eof = 1; + return snprintf(page, count, "%s\n", lov->mdcobd->obd_uuid.uuid); +} + +struct lprocfs_vars lprocfs_obd_vars[] = { + { "uuid", lprocfs_rd_uuid, 0, 0 }, + { "stripesize", rd_stripesize, 0, 0 }, + { "stripeoffset", rd_stripeoffset, 0, 0 }, + { "stripecount", rd_stripecount, 0, 0 }, + { "stripetype", rd_stripetype, 0, 0 }, + { "numobd", rd_numobd, 0, 0 }, + { "activeobd", rd_activeobd, 0, 0 }, + { "filestotal", rd_filestotal, 0, 0 }, 
+ { "filesfree", rd_filesfree, 0, 0 }, + { "filegroups", rd_filegroups, 0, 0 }, + { "blocksize", rd_blksize, 0, 0 }, + { "kbytestotal", rd_kbytestotal, 0, 0 }, + { "kbytesfree", rd_kbytesfree, 0, 0 }, + { "target_obd", rd_target, 0, 0 }, + { "target_mdc", rd_mdc, 0, 0 }, + { 0 } }; -int rd_numrefs(char *page, char **start, off_t off, int count, int *eof, - void *data) -{ - struct obd_type* class = (struct obd_type*)data; - - return snprintf(page, count, "%d\n", class->typ_refcnt); -} - -struct lprocfs_vars status_class_var[]={ - {"status/num_refs", rd_numrefs, 0, 0}, - {0} +struct lprocfs_vars lprocfs_module_vars[] = { + { "num_refs", lprocfs_rd_numrefs, 0, 0 }, + { 0 } }; + +#endif /* LPROCFS */ +LPROCFS_INIT_VARS(lprocfs_module_vars, lprocfs_obd_vars) diff --git a/lustre/mdc/lproc_mdc.c b/lustre/mdc/lproc_mdc.c index b0fcad6..f5b5b80 100644 --- a/lustre/mdc/lproc_mdc.c +++ b/lustre/mdc/lproc_mdc.c @@ -21,108 +21,39 @@ */ #define DEBUG_SUBSYSTEM S_CLASS -#include +#include #include - -int rd_uuid(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - - struct obd_device* temp = (struct obd_device*)data; - int len = 0; - len += snprintf(page, count, "%s\n",temp->obd_uuid); - return len; - - -} -int rd_blksize(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - return 0; -} -int rd_kbtotal(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - return 0; -} - -int rd_kbfree(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - return 0; -} - - -int rd_filestotal(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - return 0; -} - -int rd_filesfree(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - return 0; -} - -int rd_filegroups(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - return 0; -} -int rd_conn_uuid(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - struct 
obd_device* temp = (struct obd_device*)data; - struct client_obd* cli = &temp->u.cli; - struct obd_import* imp = &cli->cl_import; - int len = 0; - - len += snprintf(page, count, "%s\n",imp->imp_connection->c_remote_uuid); - return len; -} - -int rd_server_uuid(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - struct obd_device* temp = (struct obd_device*)data; - struct client_obd* cli = &temp->u.cli; - int len = 0; - - len += snprintf(page, count, "%s\n",cli->cl_target_uuid); - return len; -} - -int rd_server_name(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - return 0; - -} - -struct lprocfs_vars status_var_nm_1[] = { - {"status/uuid", rd_uuid, 0, 0}, - {"status/blocksize",rd_blksize, 0, 0}, - {"status/kbytestotal",rd_kbtotal, 0, 0}, - {"status/kbytesfree", rd_kbfree, 0, 0}, - {"status/filestotal", rd_filestotal, 0, 0}, - {"status/filesfree", rd_filesfree, 0, 0}, - {"status/filegroups", rd_filegroups, 0, 0}, - {"status/mds_server_uuid", rd_server_uuid, 0, 0}, - {"status/mds_conn_uuid", rd_conn_uuid, 0, 0}, - {0} +#ifndef LPROCFS +struct lprocfs_vars lprocfs_obd_vars[] = { {0} }; +struct lprocfs_vars lprocfs_module_vars[] = { {0} }; +#else + +DEFINE_LPROCFS_STATFS_FCT(rd_blksize, obd_self_statfs); +DEFINE_LPROCFS_STATFS_FCT(rd_kbytestotal, obd_self_statfs); +DEFINE_LPROCFS_STATFS_FCT(rd_kbytesfree, obd_self_statfs); +DEFINE_LPROCFS_STATFS_FCT(rd_filestotal, obd_self_statfs); +DEFINE_LPROCFS_STATFS_FCT(rd_filesfree, obd_self_statfs); +DEFINE_LPROCFS_STATFS_FCT(rd_filegroups, obd_self_statfs); + +struct lprocfs_vars lprocfs_obd_vars[] = { + { "uuid", lprocfs_rd_uuid, 0, 0 }, + { "blocksize", rd_blksize, 0, 0 }, + { "kbytestotal", rd_kbytestotal, 0, 0 }, + { "kbytesfree", rd_kbytesfree, 0, 0 }, + { "filestotal", rd_filestotal, 0, 0 }, + { "filesfree", rd_filesfree, 0, 0 }, + { "filegroups", rd_filegroups, 0, 0 }, + { "mds_server_uuid", lprocfs_rd_server_uuid, 0, 0 }, + { "mds_conn_uuid", lprocfs_rd_conn_uuid, 0, 
0 }, + { 0 } }; -int rd_numrefs(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - struct obd_type* class = (struct obd_type*)data; - int len = 0; - len += snprintf(page, count, "%d\n", class->typ_refcnt); - return len; -} -struct lprocfs_vars status_class_var[] = { - {"status/num_refs", rd_numrefs, 0, 0}, - {0} +struct lprocfs_vars lprocfs_module_vars[] = { + { "num_refs", lprocfs_rd_numrefs, 0, 0 }, + { 0 } }; + +#endif /* LPROCFS */ + +LPROCFS_INIT_VARS(lprocfs_module_vars, lprocfs_obd_vars) diff --git a/lustre/mdc/mdc_reint.c b/lustre/mdc/mdc_reint.c index 63c1ef0..1fbd346 100644 --- a/lustre/mdc/mdc_reint.c +++ b/lustre/mdc/mdc_reint.c @@ -1,7 +1,7 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Copyright (C) 2001-2003 Cluster File Systems, Inc. * * This file is part of Lustre, http://www.sf.net/projects/lustre/ * @@ -17,7 +17,6 @@ * You should have received a copy of the GNU General Public License * along with Lustre; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * */ #define EXPORT_SYMTAB @@ -31,45 +30,59 @@ #include #include +extern struct semaphore mdc_sem; + static int mdc_reint(struct ptlrpc_request *request, int level) { int rc; + __u32 *opcodeptr = lustre_msg_buf(request->rq_reqmsg, 0); + request->rq_level = level; + if (!(*opcodeptr == REINT_SETATTR)) + mdc_get_rpc_lock(&mdc_rpc_lock, NULL); + rc = ptlrpc_queue_wait(request); + if (!(*opcodeptr == REINT_SETATTR)) + mdc_put_rpc_lock(&mdc_rpc_lock, NULL); if (rc) { - CERROR("error in handling %d\n", rc); + CDEBUG(D_INFO, "error in handling %d\n", rc); } else { /* For future resend/replays. 
*/ - u32 *opcodeptr = lustre_msg_buf(request->rq_reqmsg, 0); *opcodeptr |= REINT_REPLAYING; } return rc; } -int mdc_setattr(struct lustre_handle *conn, - struct inode *inode, struct iattr *iattr, +int mdc_setattr(struct lustre_handle *conn, struct inode *inode, + struct iattr *iattr, void *ea, int ealen, struct ptlrpc_request **request) { struct ptlrpc_request *req; struct mds_rec_setattr *rec; - int rc, size = sizeof(*rec); + int rc, bufcount = 1, size[2] = {sizeof(*rec), ealen}; ENTRY; - req = ptlrpc_prep_req(class_conn2cliimp(conn), MDS_REINT, 1, &size, - NULL); + if (ealen > 0) + bufcount = 2; + + req = ptlrpc_prep_req(class_conn2cliimp(conn), MDS_REINT, bufcount, + size, NULL); if (!req) RETURN(-ENOMEM); - mds_setattr_pack(req, 0, inode, iattr, NULL, 0); + /* XXX FIXME bug 249 */ + req->rq_request_portal = MDS_GETATTR_PORTAL; + + mds_setattr_pack(req, inode, iattr, ea, ealen); - size = sizeof(struct mds_body); - req->rq_replen = lustre_msg_size(1, &size); + size[0] = sizeof(struct mds_body); + req->rq_replen = lustre_msg_size(1, size); rc = mdc_reint(req, LUSTRE_CONN_FULL); *request = req; - if (rc == -ERESTARTSYS ) + if (rc == -ERESTARTSYS) rc = 0; RETURN(rc); @@ -113,7 +126,8 @@ int mdc_create(struct lustre_handle *conn, struct inode *dir, goto resend; } - mdc_store_inode_generation(req, 0, 0); + if (!rc) + mdc_store_inode_generation(req, 0, 0); *request = req; RETURN(rc); @@ -123,47 +137,52 @@ int mdc_unlink(struct lustre_handle *conn, struct inode *dir, struct inode *child, __u32 mode, const char *name, int namelen, struct ptlrpc_request **request) { - struct ptlrpc_request *req; + struct obd_device *obddev = class_conn2obd(conn); + struct ptlrpc_request *req = *request; int rc, size[2] = {sizeof(struct mds_rec_unlink), namelen + 1}; ENTRY; - req = ptlrpc_prep_req(class_conn2cliimp(conn), MDS_REINT, 2, size, NULL); + LASSERT(req == NULL); + + req = ptlrpc_prep_req(class_conn2cliimp(conn), MDS_REINT, 2, size, + NULL); if (!req) RETURN(-ENOMEM); - - 
mds_unlink_pack(req, 0, dir, child, mode, name, namelen); + *request = req; size[0] = sizeof(struct mds_body); - req->rq_replen = lustre_msg_size(1, size); + size[1] = obddev->u.cli.cl_max_mds_easize; + req->rq_replen = lustre_msg_size(2, size); + + mds_unlink_pack(req, 0, dir, child, mode, name, namelen); rc = mdc_reint(req, LUSTRE_CONN_FULL); - *request = req; if (rc == -ERESTARTSYS) rc = 0; - RETURN(rc); } int mdc_link(struct lustre_handle *conn, - struct dentry *src, struct inode *dir, const char *name, + struct inode *src, struct inode *dir, const char *name, int namelen, struct ptlrpc_request **request) { struct ptlrpc_request *req; int rc, size[2] = {sizeof(struct mds_rec_link), namelen + 1}; ENTRY; - req = ptlrpc_prep_req(class_conn2cliimp(conn), MDS_REINT, 2, size, NULL); + req = ptlrpc_prep_req(class_conn2cliimp(conn), MDS_REINT, 2, size, + NULL); if (!req) RETURN(-ENOMEM); - mds_link_pack(req, 0, src->d_inode, dir, name, namelen); + mds_link_pack(req, 0, src, dir, name, namelen); size[0] = sizeof(struct mds_body); req->rq_replen = lustre_msg_size(1, size); rc = mdc_reint(req, LUSTRE_CONN_FULL); *request = req; - if (rc == -ERESTARTSYS ) + if (rc == -ERESTARTSYS) rc = 0; RETURN(rc); @@ -179,7 +198,8 @@ int mdc_rename(struct lustre_handle *conn, newlen + 1}; ENTRY; - req = ptlrpc_prep_req(class_conn2cliimp(conn), MDS_REINT, 3, size, NULL); + req = ptlrpc_prep_req(class_conn2cliimp(conn), MDS_REINT, 3, size, + NULL); if (!req) RETURN(-ENOMEM); diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c index a97cfb5c..101e63d 100644 --- a/lustre/mdc/mdc_request.c +++ b/lustre/mdc/mdc_request.c @@ -1,7 +1,7 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Copyright (C) 2001-2003 Cluster File Systems, Inc. 
* * This file is part of Lustre, http://www.sf.net/projects/lustre/ * @@ -17,13 +17,13 @@ * You should have received a copy of the GNU General Public License * along with Lustre; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * */ #define EXPORT_SYMTAB #define DEBUG_SUBSYSTEM S_MDC #include +#include #include #include #include @@ -34,8 +34,8 @@ #define REQUEST_MINOR 244 extern int mds_queue_req(struct ptlrpc_request *); -extern struct lprocfs_vars status_var_nm_1[]; -extern struct lprocfs_vars status_class_var[]; +struct mdc_rpc_lock mdc_rpc_lock; +EXPORT_SYMBOL(mdc_rpc_lock); /* Helper that implements most of mdc_getstatus and signal_completed_replay. */ static int send_getstatus(struct obd_import *imp, struct ll_fid *rootfid, @@ -53,10 +53,12 @@ static int send_getstatus(struct obd_import *imp, struct ll_fid *rootfid, body = lustre_msg_buf(req->rq_reqmsg, 0); req->rq_level = level; req->rq_replen = lustre_msg_size(1, &size); - + mds_pack_req_body(req); req->rq_reqmsg->flags |= msg_flags; + mdc_get_rpc_lock(&mdc_rpc_lock, NULL); rc = ptlrpc_queue_wait(req); + mdc_put_rpc_lock(&mdc_rpc_lock, NULL); if (!rc) { body = lustre_msg_buf(req->rq_repmsg, 0); @@ -105,8 +107,9 @@ int mdc_getlovinfo(struct obd_device *obd, struct lustre_handle *mdc_connh, size[0] = 512; size[1] = 8192; req->rq_replen = lustre_msg_size(2, size); - + mdc_get_rpc_lock(&mdc_rpc_lock, NULL); rc = ptlrpc_queue_wait(req); + mdc_put_rpc_lock(&mdc_rpc_lock, NULL); out: RETURN(rc); @@ -129,6 +132,9 @@ int mdc_getattr(struct lustre_handle *conn, if (!req) GOTO(out, rc = -ENOMEM); + /* XXX FIXME bug 249 */ + req->rq_request_portal = MDS_GETATTR_PORTAL; + body = lustre_msg_buf(req->rq_reqmsg, 0); ll_ino2fid(&body->fid1, ino, 0, type); body->valid = valid; @@ -143,15 +149,16 @@ int mdc_getattr(struct lustre_handle *conn, req->rq_replen = lustre_msg_size(bufcount, size); mds_pack_req_body(req); + mdc_get_rpc_lock(&mdc_rpc_lock, NULL); rc = 
ptlrpc_queue_wait(req); - + mdc_put_rpc_lock(&mdc_rpc_lock, NULL); if (!rc) { body = lustre_msg_buf(req->rq_repmsg, 0); mds_unpack_body(body); CDEBUG(D_NET, "mode: %o\n", body->mode); } - EXIT; + GOTO(out, rc); out: *request = req; return rc; @@ -188,8 +195,9 @@ int mdc_getattr_name(struct lustre_handle *conn, struct inode *parent, req->rq_replen = lustre_msg_size(bufcount, size); mds_pack_req_body(req); + mdc_get_rpc_lock(&mdc_rpc_lock, NULL); rc = ptlrpc_queue_wait(req); - + mdc_put_rpc_lock(&mdc_rpc_lock, NULL); if (!rc) { body = lustre_msg_buf(req->rq_repmsg, 0); mds_unpack_body(body); @@ -201,32 +209,26 @@ int mdc_getattr_name(struct lustre_handle *conn, struct inode *parent, return rc; } -void d_delete_aliases(struct inode *inode) +/* This should be called with both the request and the reply still packed. */ +void mdc_store_inode_generation(struct ptlrpc_request *req, int reqoff, + int repoff) { - struct dentry *dentry = NULL; - struct list_head *tmp; - struct ll_sb_info *sbi = ll_i2sbi(inode); - ENTRY; - - spin_lock(&dcache_lock); - list_for_each(tmp, &inode->i_dentry) { - dentry = list_entry(tmp, struct dentry, d_alias); - - list_del_init(&dentry->d_hash); - list_add(&dentry->d_hash, &sbi->ll_orphan_dentry_list); - } + struct mds_rec_create *rec = lustre_msg_buf(req->rq_reqmsg, reqoff); + struct mds_body *body = lustre_msg_buf(req->rq_repmsg, repoff); - spin_unlock(&dcache_lock); - EXIT; + memcpy(&rec->cr_replayfid, &body->fid1, sizeof rec->cr_replayfid); + DEBUG_REQ(D_HA, req, "storing generation %x for ino "LPD64, + rec->cr_replayfid.generation, rec->cr_replayfid.id); } static int mdc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, - void *data, __u32 data_len, int flag) + void *data, int flag) { int rc; struct lustre_handle lockh; ENTRY; + switch (flag) { case LDLM_CB_BLOCKING: ldlm_lock2handle(lock, &lockh); @@ -238,15 +240,15 @@ static int mdc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, break; case 
LDLM_CB_CANCELING: { /* Invalidate all dentries associated with this inode */ - struct inode *inode; + struct inode *inode = lock->l_data; LASSERT(data != NULL); - LASSERT(data_len == sizeof(*inode)); /* XXX what tells us that 'data' is a valid inode at all? * we should probably validate the lock handle first? */ - inode = igrab(data); + + inode = igrab(inode); if (inode == NULL) /* inode->i_state & I_FREEING */ break; @@ -259,7 +261,7 @@ static int mdc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, } if (inode != inode->i_sb->s_root->d_inode) - d_delete_aliases(inode); + d_unhash_aliases(inode); iput(inode); break; @@ -271,18 +273,6 @@ static int mdc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, RETURN(0); } -/* This should be called with both the request and the reply still packed. */ -void mdc_store_inode_generation(struct ptlrpc_request *req, int reqoff, - int repoff) -{ - struct mds_rec_create *rec = lustre_msg_buf(req->rq_reqmsg, reqoff); - struct mds_body *body = lustre_msg_buf(req->rq_repmsg, repoff); - - memcpy(&rec->cr_replayfid, &body->fid1, sizeof rec->cr_replayfid); - DEBUG_REQ(D_HA, req, "storing generation %x for ino "LPD64, - rec->cr_replayfid.generation, rec->cr_replayfid.id); -} - /* We always reserve enough space in the reply packet for a stripe MD, because * we don't know in advance the file type. 
* @@ -295,12 +285,14 @@ int mdc_enqueue(struct lustre_handle *conn, int lock_type, { struct ptlrpc_request *req; struct obd_device *obddev = class_conn2obd(conn); - __u64 res_id[RES_NAME_SIZE] = {dir->i_ino, (__u64)dir->i_generation}; + struct ldlm_res_id res_id = + { .name = {dir->i_ino, dir->i_generation} }; int size[6] = {sizeof(struct ldlm_request), sizeof(struct ldlm_intent)}; int rc, flags = LDLM_FL_HAS_INTENT; int repsize[3] = {sizeof(struct ldlm_reply), sizeof(struct mds_body), obddev->u.cli.cl_max_mds_easize}; + struct mdc_unlink_data *d = data; struct ldlm_reply *dlm_rep; struct ldlm_intent *lit; struct ldlm_request *lockreq; @@ -309,79 +301,31 @@ int mdc_enqueue(struct lustre_handle *conn, int lock_type, LDLM_DEBUG_NOLOCK("mdsintent %s parent dir %lu", ldlm_it2str(it->it_op), dir->i_ino); - if (it->it_op & (IT_MKDIR | IT_CREAT | IT_SYMLINK | IT_MKNOD)) { - switch (it->it_op) { - case IT_MKDIR: - it->it_mode |= S_IFDIR; - break; - case (IT_CREAT|IT_OPEN): - case IT_CREAT: - it->it_mode |= S_IFREG; - break; - case IT_SYMLINK: - it->it_mode |= S_IFLNK; - break; - } + if (it->it_op & IT_OPEN) { + it->it_mode |= S_IFREG; it->it_mode &= ~current->fs->umask; size[2] = sizeof(struct mds_rec_create); size[3] = de->d_name.len + 1; - size[4] = tgtlen + 1; - req = ptlrpc_prep_req(class_conn2cliimp(conn), LDLM_ENQUEUE, 5, - size, NULL); - if (!req) - RETURN(-ENOMEM); - - /* pack the intent */ - lit = lustre_msg_buf(req->rq_reqmsg, 1); - lit->opc = NTOH__u64((__u64)it->it_op); - - /* pack the intended request */ - mds_create_pack(req, 2, dir, it->it_mode, 0, current->fsuid, - current->fsgid, CURRENT_TIME, de->d_name.name, - de->d_name.len, tgt, tgtlen); - req->rq_replen = lustre_msg_size(3, repsize); - } else if (it->it_op == IT_RENAME2) { - struct dentry *old_de = it->it_data; - - size[2] = sizeof(struct mds_rec_rename); - size[3] = old_de->d_name.len + 1; - size[4] = de->d_name.len + 1; - req = ptlrpc_prep_req(class_conn2cliimp(conn), LDLM_ENQUEUE, 5, - size, NULL); 
- if (!req) - RETURN(-ENOMEM); - - /* pack the intent */ - lit = lustre_msg_buf(req->rq_reqmsg, 1); - lit->opc = NTOH__u64((__u64)it->it_op); - - /* pack the intended request */ - mds_rename_pack(req, 2, old_de->d_parent->d_inode, dir, - old_de->d_name.name, old_de->d_name.len, - de->d_name.name, de->d_name.len); - req->rq_replen = lustre_msg_size(3, repsize); - } else if (it->it_op == IT_LINK2) { - struct dentry *old_de = it->it_data; - - size[2] = sizeof(struct mds_rec_link); - size[3] = de->d_name.len + 1; req = ptlrpc_prep_req(class_conn2cliimp(conn), LDLM_ENQUEUE, 4, size, NULL); if (!req) RETURN(-ENOMEM); + req->rq_flags |= PTL_RPC_FL_REPLAY; + /* pack the intent */ lit = lustre_msg_buf(req->rq_reqmsg, 1); lit->opc = NTOH__u64((__u64)it->it_op); /* pack the intended request */ - mds_link_pack(req, 2, old_de->d_inode, dir, - de->d_name.name, de->d_name.len); + mds_open_pack(req, 2, dir, it->it_mode, 0, current->fsuid, + current->fsgid, CURRENT_TIME, it->it_flags, + de->d_name.name, de->d_name.len, tgt, tgtlen); req->rq_replen = lustre_msg_size(3, repsize); - } else if (it->it_op == IT_UNLINK || it->it_op == IT_RMDIR) { + } else if (it->it_op & IT_UNLINK) { size[2] = sizeof(struct mds_rec_unlink); - size[3] = de->d_name.len + 1; + size[3] = d->unl_len + 1; req = ptlrpc_prep_req(class_conn2cliimp(conn), LDLM_ENQUEUE, 4, size, NULL); if (!req) @@ -392,13 +336,12 @@ int mdc_enqueue(struct lustre_handle *conn, int lock_type, lit->opc = NTOH__u64((__u64)it->it_op); /* pack the intended request */ - mds_unlink_pack(req, 2, dir, NULL, - it->it_op == IT_UNLINK ? 
S_IFREG : S_IFDIR, - de->d_name.name, de->d_name.len); - + mds_unlink_pack(req, 2, d->unl_dir, + d->unl_de, d->unl_mode, + d->unl_name, d->unl_len); req->rq_replen = lustre_msg_size(3, repsize); - } else if (it->it_op & (IT_GETATTR | IT_RENAME | IT_LINK | - IT_OPEN | IT_SETATTR | IT_LOOKUP | IT_READLINK)) { + } else if (it->it_op & (IT_GETATTR| IT_SETATTR | IT_LOOKUP)) { + int valid = OBD_MD_FLNOTOBD | OBD_MD_FLEASIZE; size[2] = sizeof(struct mds_body); size[3] = de->d_name.len + 1; @@ -412,8 +355,8 @@ int mdc_enqueue(struct lustre_handle *conn, int lock_type, lit->opc = NTOH__u64((__u64)it->it_op); /* pack the intended request */ - mds_getattr_pack(req, 2, dir, de->d_name.name, de->d_name.len); - + mds_getattr_pack(req, valid, 2, it->it_flags, dir, + de->d_name.name, de->d_name.len); /* get ready for the reply */ req->rq_replen = lustre_msg_size(3, repsize); } else if (it->it_op == IT_READDIR) { @@ -424,163 +367,128 @@ int mdc_enqueue(struct lustre_handle *conn, int lock_type, /* get ready for the reply */ req->rq_replen = lustre_msg_size(1, repsize); - } else { + } else { LBUG(); RETURN(-EINVAL); } + mdc_get_rpc_lock(&mdc_rpc_lock, it); rc = ldlm_cli_enqueue(conn, req, obddev->obd_namespace, NULL, res_id, lock_type, NULL, 0, lock_mode, &flags, - ldlm_completion_ast, mdc_blocking_ast, data, - datalen, lockh); - - if (it->it_op != IT_READDIR) { - /* XXX This should become a lustre_msg flag, but for now... 
*/ - __u32 *opp = lustre_msg_buf(req->rq_reqmsg, 2); - *opp |= REINT_REPLAYING; + ldlm_completion_ast, mdc_blocking_ast, dir, NULL, + lockh); + + /* If we successfully created, mark the request so that replay will + * do the right thing */ + if (req->rq_transno) { + struct mds_rec_create *rec = lustre_msg_buf(req->rq_reqmsg, 2); + rec->cr_opcode |= REINT_REPLAYING; } - - if (rc == -ENOENT) { - /* This can go when we're sure that this can never happen */ - LBUG(); + /* Similarly, if we're going to replay this request, we don't want to + * actually get a lock, just perform the intent. */ + if (req->rq_transno || (req->rq_flags & PTL_RPC_FL_REPLAY)) { + lockreq = lustre_msg_buf(req->rq_reqmsg, 0); + lockreq->lock_flags |= LDLM_FL_INTENT_ONLY; } + + dlm_rep = lustre_msg_buf(req->rq_repmsg, 0); + + /* This can go when we're sure that this can never happen */ + LASSERT(rc != -ENOENT); if (rc == ELDLM_LOCK_ABORTED) { lock_mode = 0; memset(lockh, 0, sizeof(*lockh)); - /* rc = 0 */ } else if (rc != 0) { CERROR("ldlm_cli_enqueue: %d\n", rc); RETURN(rc); - } else { - /* The server almost certainly gave us a lock other than the one - * that we asked for. If we already have a matching lock, then - * cancel this one--we don't need two. */ + } else { /* rc = 0 */ struct ldlm_lock *lock = ldlm_handle2lock(lockh); struct lustre_handle lockh2; LASSERT(lock); + /* If the server gave us back a different lock mode, we should + * fix up our variables. */ + if (lock->l_req_mode != lock_mode) { + ldlm_lock_addref(lockh, lock->l_req_mode); + ldlm_lock_decref(lockh, lock_mode); + lock_mode = lock->l_req_mode; + } + + /* The server almost certainly gave us a lock other than the + * one that we asked for. If we already have a matching lock, + * then cancel this one--we don't need two. 
*/ LDLM_DEBUG(lock, "matching against this"); memcpy(&lockh2, lockh, sizeof(lockh2)); - if (ldlm_lock_match(NULL, NULL, LDLM_PLAIN, NULL, 0, LCK_NL, - &lockh2)) { - /* We already have a lock; cancel the old one */ - ldlm_lock_decref(lockh, lock_mode); - /* FIXME: bug 563 */ - //ldlm_cli_cancel(lockh); + if (ldlm_lock_match(NULL, LDLM_FL_BLOCK_GRANTED, NULL, + LDLM_PLAIN, NULL, 0, LCK_NL, &lockh2)) { + /* We already have a lock; cancel the new one */ + ldlm_lock_decref_and_cancel(lockh, lock_mode); memcpy(lockh, &lockh2, sizeof(lockh2)); } LDLM_LOCK_PUT(lock); } - /* On replay, we don't want the lock granted. */ - lockreq = lustre_msg_buf(req->rq_reqmsg, 0); - lockreq->lock_flags |= LDLM_FL_INTENT_ONLY; - - dlm_rep = lustre_msg_buf(req->rq_repmsg, 0); it->it_disposition = (int) dlm_rep->lock_policy_res1; it->it_status = (int) dlm_rep->lock_policy_res2; it->it_lock_mode = lock_mode; it->it_data = req; - RETURN(0); + RETURN(rc); +} + +void mdc_lock_set_inode(struct lustre_handle *lockh, struct inode *inode) +{ + struct ldlm_lock *lock = ldlm_handle2lock(lockh); + ENTRY; + + LASSERT(lock != NULL); + lock->l_data = inode; + LDLM_LOCK_PUT(lock); + EXIT; } int mdc_cancel_unused(struct lustre_handle *conn, struct inode *inode, int flags) { - __u64 res_id[RES_NAME_SIZE] = {inode->i_ino, inode->i_generation}; + struct ldlm_res_id res_id = + { .name = {inode->i_ino, inode->i_generation} }; struct obd_device *obddev = class_conn2obd(conn); ENTRY; - RETURN(ldlm_cli_cancel_unused(obddev->obd_namespace, res_id, flags)); + RETURN(ldlm_cli_cancel_unused(obddev->obd_namespace, &res_id, flags)); } -struct replay_open_data { - struct lustre_handle *fh; -}; - static void mdc_replay_open(struct ptlrpc_request *req) { - int offset; - struct replay_open_data *saved; + struct lustre_handle old, *file_fh = req->rq_replay_data; + struct list_head *tmp; struct mds_body *body = lustre_msg_buf(req->rq_repmsg, 0); - if (lustre_msg_get_op_flags(req->rq_reqmsg) & MDS_OPEN_HAS_EA) - offset = 2; - 
else - offset = 1; - - saved = lustre_msg_buf(req->rq_reqmsg, offset); mds_unpack_body(body); + memcpy(&old, file_fh, sizeof(old)); CDEBUG(D_HA, "updating from "LPD64"/"LPD64" to "LPD64"/"LPD64"\n", - saved->fh->addr, saved->fh->cookie, - body->handle.addr, body->handle.cookie); - memcpy(saved->fh, &body->handle, sizeof(body->handle)); + file_fh->addr, file_fh->cookie, body->handle.addr, + body->handle.cookie); + memcpy(file_fh, &body->handle, sizeof(body->handle)); + + /* A few frames up, ptlrpc_replay holds the lock, so this is safe. */ + list_for_each(tmp, &req->rq_import->imp_sending_list) { + req = list_entry(tmp, struct ptlrpc_request, rq_list); + if (req->rq_reqmsg->opc != MDS_CLOSE) + continue; + body = lustre_msg_buf(req->rq_reqmsg, 0); + if (memcmp(&body->handle, &old, sizeof(old))) + continue; + + DEBUG_REQ(D_HA, req, "updating close body with new fh"); + memcpy(&body->handle, file_fh, sizeof(*file_fh)); + } } -/* If lmm is non-NULL and lmm_size is non-zero, the stripe MD is stored on - * the MDS. Otherwise, we have already read a copy from the MDS (probably - * during mdc_enqueue() and we do not need to send it to the MDS again. - * - * In the future (when we support the non-intent case) we need to be able - * to read the stripe MD from the MDS here (need to fix mds_open() too). 
- */ -int mdc_open(struct lustre_handle *conn, obd_id ino, int type, int flags, - struct lov_mds_md *lmm, int lmm_size, struct lustre_handle *fh, - struct ptlrpc_request **request) +void mdc_set_open_replay_data(struct ll_file_data *fd) { - struct mds_body *body; - struct replay_open_data *replay_data; - int rc, size[3] = {sizeof(*body), sizeof(*replay_data)}, bufcount = 2; - struct ptlrpc_request *req; - ENTRY; - - if (lmm_size) { - bufcount = 3; - size[2] = size[1]; /* shuffle the replay data along */ - size[1] = lmm_size; - } - - req = ptlrpc_prep_req(class_conn2cliimp(conn), MDS_OPEN, bufcount, size, - NULL); - if (!req) - GOTO(out, rc = -ENOMEM); - - req->rq_flags |= PTL_RPC_FL_REPLAY; - body = lustre_msg_buf(req->rq_reqmsg, 0); - - ll_ino2fid(&body->fid1, ino, 0, type); - body->flags = HTON__u32(flags); - memcpy(&body->handle, fh, sizeof(body->handle)); - - if (lmm_size) { - body->flags |= HTON__u32(OBD_MD_FLEASIZE); - if (lmm) { - CDEBUG(D_INODE, "sending %u bytes MD for ino "LPU64"\n", - lmm_size, ino); - lustre_msg_set_op_flags(req->rq_reqmsg,MDS_OPEN_HAS_EA); - memcpy(lustre_msg_buf(req->rq_reqmsg,1), lmm, lmm_size); - } - } - - req->rq_replen = lustre_msg_size(1, size); - - rc = ptlrpc_queue_wait(req); - if (!rc) { - body = lustre_msg_buf(req->rq_repmsg, 0); - mds_unpack_body(body); - memcpy(fh, &body->handle, sizeof(*fh)); - - /* If open is replayed, we need to fix up the fh. */ - req->rq_replay_cb = mdc_replay_open; - replay_data = lustre_msg_buf(req->rq_reqmsg, lmm ? 
2 : 1); - replay_data->fh = fh; - } - - EXIT; - out: - *request = req; - return rc; + fd->fd_req->rq_replay_cb = mdc_replay_open; + fd->fd_req->rq_replay_data = &fd->fd_mdshandle; } int mdc_close(struct lustre_handle *conn, obd_id ino, int type, @@ -613,12 +521,14 @@ int mdc_close(struct lustre_handle *conn, obd_id ino, int type, int mdc_readpage(struct lustre_handle *conn, obd_id ino, int type, __u64 offset, char *addr, struct ptlrpc_request **request) { - struct ptlrpc_connection *connection = + struct obd_import *imp = class_conn2cliimp(conn); + struct ptlrpc_connection *connection = client_conn2cli(conn)->cl_import.imp_connection; struct ptlrpc_request *req = NULL; struct ptlrpc_bulk_desc *desc = NULL; struct ptlrpc_bulk_page *bulk = NULL; struct mds_body *body; + unsigned long flags; int rc, size = sizeof(*body); ENTRY; @@ -628,25 +538,30 @@ int mdc_readpage(struct lustre_handle *conn, obd_id ino, int type, __u64 offset, if (desc == NULL) GOTO(out, rc = -ENOMEM); - req = ptlrpc_prep_req(class_conn2cliimp(conn), MDS_READPAGE, 1, &size, - NULL); + req = ptlrpc_prep_req(imp, MDS_READPAGE, 1, &size, NULL); if (!req) GOTO(out2, rc = -ENOMEM); bulk = ptlrpc_prep_bulk_page(desc); - bulk->bp_buflen = PAGE_SIZE; + if (bulk == NULL) + GOTO(out2, rc = -ENOMEM); + + spin_lock_irqsave(&imp->imp_lock, flags); + bulk->bp_xid = ++imp->imp_last_bulk_xid; + spin_unlock_irqrestore(&imp->imp_lock, flags); + bulk->bp_buflen = PAGE_CACHE_SIZE; bulk->bp_buf = addr; - bulk->bp_xid = req->rq_xid; + desc->bd_ptl_ev_hdlr = NULL; desc->bd_portal = MDS_BULK_PORTAL; - rc = ptlrpc_register_bulk(desc); + rc = ptlrpc_register_bulk_put(desc); if (rc) { CERROR("couldn't setup bulk sink: error %d.\n", rc); GOTO(out2, rc); } - mds_readdir_pack(req, offset, ino, type); + mds_readdir_pack(req, offset, ino, type, bulk->bp_xid); req->rq_replen = lustre_msg_size(1, &size); rc = ptlrpc_queue_wait(req); @@ -679,7 +594,9 @@ static int mdc_statfs(struct lustre_handle *conn, struct obd_statfs *osfs) 
req->rq_replen = lustre_msg_size(1, &size); + mdc_get_rpc_lock(&mdc_rpc_lock, NULL); rc = ptlrpc_queue_wait(req); + mdc_put_rpc_lock(&mdc_rpc_lock, NULL); if (rc) GOTO(out, rc); @@ -695,19 +612,22 @@ out: static int mdc_attach(struct obd_device *dev, obd_count len, void *data) { - return lprocfs_reg_obd(dev, status_var_nm_1, dev); + struct lprocfs_static_vars lvars; + + lprocfs_init_vars(&lvars); + return lprocfs_obd_attach(dev, lvars.obd_vars); } static int mdc_detach(struct obd_device *dev) { - return lprocfs_dereg_obd(dev); + return lprocfs_obd_detach(dev); } /* Send a mostly-dummy GETSTATUS request and indicate that we're done replay. */ static int signal_completed_replay(struct obd_import *imp) { struct ll_fid fid; - + return send_getstatus(imp, &fid, LUSTRE_CONN_RECOVD, MSG_LAST_REPLAY); } @@ -716,51 +636,63 @@ static int mdc_recover(struct obd_import *imp, int phase) int rc; unsigned long flags; struct ptlrpc_request *req; + struct ldlm_namespace *ns = imp->imp_obd->obd_namespace; ENTRY; switch(phase) { case PTLRPC_RECOVD_PHASE_PREPARE: - ldlm_cli_cancel_unused(imp->imp_obd->obd_namespace, - NULL, LDLM_FL_LOCAL_ONLY); + ldlm_cli_cancel_unused(ns, NULL, LDLM_FL_LOCAL_ONLY); RETURN(0); + + case PTLRPC_RECOVD_PHASE_NOTCONN: + ldlm_namespace_cleanup(ns, 1); + ptlrpc_abort_inflight(imp, 0); + /* FALL THROUGH */ case PTLRPC_RECOVD_PHASE_RECOVER: reconnect: rc = ptlrpc_reconnect_import(imp, MDS_CONNECT, &req); - /* We were still connected, just go about our business. */ - if (rc == EALREADY) - GOTO(skip_replay, rc); + flags = req->rq_repmsg + ? lustre_msg_get_op_flags(req->rq_repmsg) + : 0; + + if (rc == -EBUSY && (flags & MSG_CONNECT_RECOVERING)) + CERROR("reconnect denied by recovery; should retry\n"); if (rc) { - ptlrpc_req_finished(req); - RETURN(rc); - } - - /* We can't replay, which might be a problem. 
*/ - if (!(lustre_msg_get_flags(req->rq_repmsg) & - MSG_REPLAY_IN_PROGRESS)) { if (phase != PTLRPC_RECOVD_PHASE_NOTCONN) { - CERROR("can't replay, invalidating\n"); - ldlm_namespace_cleanup(imp->imp_obd->obd_namespace, - 1); - ptlrpc_abort_inflight(imp); + CERROR("can't reconnect, invalidating\n"); + ldlm_namespace_cleanup(ns, 1); + ptlrpc_abort_inflight(imp, 0); } - goto skip_replay; - } - - rc = ptlrpc_replay(imp); - if (rc) - RETURN(rc); - - rc = ldlm_replay_locks(imp); - if (rc) + ptlrpc_req_finished(req); RETURN(rc); + } - rc = signal_completed_replay(imp); - if (rc) - RETURN(rc); + if (flags & MSG_CONNECT_RECOVERING) { + /* Replay if they want it. */ + DEBUG_REQ(D_HA, req, "MDS wants replay"); + rc = ptlrpc_replay(imp); + if (rc) + GOTO(check_rc, rc); + + rc = ldlm_replay_locks(imp); + if (rc) + GOTO(check_rc, rc); + + rc = signal_completed_replay(imp); + if (rc) + GOTO(check_rc, rc); + } else if (flags & MSG_CONNECT_RECONNECT) { + DEBUG_REQ(D_HA, req, "reconnecting to MDS\n"); + /* Nothing else to do here. */ + } else { + DEBUG_REQ(D_HA, req, "evicted: invalidating\n"); + /* Otherwise, clean everything up. */ + ldlm_namespace_cleanup(ns, 1); + ptlrpc_abort_inflight(imp, 0); + } - skip_replay: ptlrpc_req_finished(req); spin_lock_irqsave(&imp->imp_lock, flags); imp->imp_level = LUSTRE_CONN_FULL; @@ -771,14 +703,16 @@ static int mdc_recover(struct obd_import *imp, int phase) rc = ptlrpc_resend(imp); if (rc) - RETURN(rc); + GOTO(check_rc, rc); RETURN(0); - - case PTLRPC_RECOVD_PHASE_NOTCONN: - ldlm_namespace_cleanup(imp->imp_obd->obd_namespace, 1); - ptlrpc_abort_inflight(imp); - goto reconnect; + check_rc: + /* If we get disconnected in the middle, recovery has probably + * failed. Reconnect and find out. 
+ */ + if (rc == -ENOTCONN) + goto reconnect; + RETURN(rc); default: RETURN(-EINVAL); @@ -786,7 +720,7 @@ static int mdc_recover(struct obd_import *imp, int phase) } static int mdc_connect(struct lustre_handle *conn, struct obd_device *obd, - obd_uuid_t cluuid, struct recovd_obd *recovd, + struct obd_uuid *cluuid, struct recovd_obd *recovd, ptlrpc_recovery_cb_t recover) { struct obd_import *imp = &obd->u.cli.cl_import; @@ -807,7 +741,10 @@ struct obd_ops mdc_obd_ops = { static int __init ptlrpc_request_init(void) { - return class_register_type(&mdc_obd_ops, status_class_var, + struct lprocfs_static_vars lvars; + mdc_init_rpc_lock(&mdc_rpc_lock); + lprocfs_init_vars(&lvars); + return class_register_type(&mdc_obd_ops, lvars.module_vars, LUSTRE_MDC_NAME); } @@ -816,11 +753,10 @@ static void __exit ptlrpc_request_exit(void) class_unregister_type(LUSTRE_MDC_NAME); } -MODULE_AUTHOR("Cluster File Systems "); -MODULE_DESCRIPTION("Lustre Metadata Client v1.0"); +MODULE_AUTHOR("Cluster File Systems, Inc. 
"); +MODULE_DESCRIPTION("Lustre Metadata Client"); MODULE_LICENSE("GPL"); -EXPORT_SYMBOL(d_delete_aliases); EXPORT_SYMBOL(mdc_getstatus); EXPORT_SYMBOL(mdc_getlovinfo); EXPORT_SYMBOL(mdc_enqueue); @@ -834,7 +770,8 @@ EXPORT_SYMBOL(mdc_link); EXPORT_SYMBOL(mdc_readpage); EXPORT_SYMBOL(mdc_setattr); EXPORT_SYMBOL(mdc_close); -EXPORT_SYMBOL(mdc_open); +EXPORT_SYMBOL(mdc_lock_set_inode); +EXPORT_SYMBOL(mdc_set_open_replay_data); EXPORT_SYMBOL(mdc_store_inode_generation); diff --git a/lustre/mds/Makefile.am b/lustre/mds/Makefile.am index 12f06fc..f789c22 100644 --- a/lustre/mds/Makefile.am +++ b/lustre/mds/Makefile.am @@ -10,7 +10,7 @@ MODULE = mds modulefs_DATA = mds.o EXTRA_PROGRAMS = mds -LINX= mds_updates.c simple.c target.c +LINX= mds_updates.c mds_open.c simple.c target.c mds_updates.c: test -e mds_updates.c || ln -sf $(top_srcdir)/lib/mds_updates.c diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index bfdad03..e700a7a 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -4,7 +4,7 @@ * lustre/mds/handler.c * Lustre Metadata Server (mds) request handler * - * Copyright (c) 2001, 2002 Cluster File Systems, Inc. + * Copyright (c) 2001-2003 Cluster File Systems, Inc. 
* Author: Peter Braam * Author: Andreas Dilger * Author: Phil Schwan @@ -38,25 +38,23 @@ #include #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) #include +#include #endif #include #include #include #include -static kmem_cache_t *mds_file_cache; +kmem_cache_t *mds_file_cache; extern int mds_get_lovtgts(struct mds_obd *obd, int tgt_count, - obd_uuid_t *uuidarray); + struct obd_uuid *uuidarray); extern int mds_get_lovdesc(struct mds_obd *obd, struct lov_desc *desc); extern void mds_start_transno(struct mds_obd *mds); extern int mds_finish_transno(struct mds_obd *mds, void *handle, struct ptlrpc_request *req, int rc); static int mds_cleanup(struct obd_device * obddev); -extern struct lprocfs_vars status_var_nm_1[]; -extern struct lprocfs_vars status_class_var[]; - inline struct mds_obd *mds_req2mds(struct ptlrpc_request *req) { return &req->rq_export->exp_obd->u.mds; @@ -73,7 +71,7 @@ static int mds_bulk_timeout(void *data) /* Assumes caller has already pushed into the kernel filesystem context */ static int mds_sendpage(struct ptlrpc_request *req, struct file *file, - __u64 offset) + __u64 offset, __u64 xid) { struct ptlrpc_bulk_desc *desc; struct ptlrpc_bulk_page *bulk; @@ -90,23 +88,26 @@ static int mds_sendpage(struct ptlrpc_request *req, struct file *file, if (bulk == NULL) GOTO(cleanup_bulk, rc = -ENOMEM); - OBD_ALLOC(buf, PAGE_SIZE); + OBD_ALLOC(buf, PAGE_CACHE_SIZE); if (buf == NULL) GOTO(cleanup_bulk, rc = -ENOMEM); - rc = fsfilt_readpage(req->rq_export->exp_obd, file, buf, PAGE_SIZE, - (loff_t *)&offset); + CDEBUG(D_EXT2, "reading %lu@"LPU64" from dir %lu (size %llu)\n", + PAGE_CACHE_SIZE, offset, file->f_dentry->d_inode->i_ino, + file->f_dentry->d_inode->i_size); + rc = fsfilt_readpage(req->rq_export->exp_obd, file, buf, + PAGE_CACHE_SIZE, (loff_t *)&offset); - if (rc != PAGE_SIZE) + if (rc != PAGE_CACHE_SIZE) GOTO(cleanup_buf, rc = -EIO); - bulk->bp_xid = req->rq_xid; + bulk->bp_xid = xid; bulk->bp_buf = buf; - bulk->bp_buflen = PAGE_SIZE; + 
bulk->bp_buflen = PAGE_CACHE_SIZE; desc->bd_ptl_ev_hdlr = NULL; desc->bd_portal = MDS_BULK_PORTAL; - rc = ptlrpc_send_bulk(desc); + rc = ptlrpc_bulk_put(desc); if (rc) GOTO(cleanup_buf, rc); @@ -135,76 +136,26 @@ static int mds_sendpage(struct ptlrpc_request *req, struct file *file, return rc; } -/* - * Look up a named entry in a directory, and get an LDLM lock on it. - * 'dir' is a inode for which an LDLM lock has already been taken. - * - * If we do not need an exclusive or write lock on this entry (e.g. - * a read lock for attribute lookup only) then we do not hold the - * directory semaphore on return. It is up to the caller to know what - * type of lock it is getting, and clean up appropriately. - */ -struct dentry *mds_name2locked_dentry(struct obd_device *obd, - struct dentry *dir, struct vfsmount **mnt, - char *name, int namelen, int lock_mode, - struct lustre_handle *lockh, - int dir_lock_mode) -{ - struct dentry *dchild; - int flags = 0, rc; - __u64 res_id[3] = {0}; - ENTRY; - - down(&dir->d_inode->i_sem); - dchild = lookup_one_len(name, dir, namelen); - if (IS_ERR(dchild)) { - CERROR("child lookup error %ld\n", PTR_ERR(dchild)); - up(&dir->d_inode->i_sem); - LBUG(); - RETURN(dchild); - } - if (dir_lock_mode != LCK_EX && dir_lock_mode != LCK_PW) { - up(&dir->d_inode->i_sem); - ldlm_lock_decref(lockh, dir_lock_mode); - } - - if (lock_mode == 0 || !dchild->d_inode) - RETURN(dchild); - - res_id[0] = dchild->d_inode->i_ino; - res_id[1] = dchild->d_inode->i_generation; - rc = ldlm_match_or_enqueue(NULL, NULL, obd->obd_namespace, NULL, - res_id, LDLM_PLAIN, NULL, 0, lock_mode, - &flags, ldlm_completion_ast, - mds_blocking_ast, NULL, 0, lockh); - if (rc != ELDLM_OK) { - l_dput(dchild); - up(&dir->d_inode->i_sem); - RETURN(ERR_PTR(-ENOLCK)); /* XXX translate ldlm code */ - } - - RETURN(dchild); -} - +/* only valid locked dentries or errors should be returned */ struct dentry *mds_fid2locked_dentry(struct obd_device *obd, struct ll_fid *fid, struct vfsmount **mnt, 
int lock_mode, struct lustre_handle *lockh) { struct mds_obd *mds = &obd->u.mds; struct dentry *de = mds_fid2dentry(mds, fid, mnt), *retval = de; + struct ldlm_res_id res_id = { .name = {0} }; int flags = 0, rc; - __u64 res_id[3] = {0}; ENTRY; if (IS_ERR(de)) RETURN(de); - res_id[0] = de->d_inode->i_ino; - res_id[1] = de->d_inode->i_generation; - rc = ldlm_match_or_enqueue(NULL, NULL, obd->obd_namespace, NULL, - res_id, LDLM_PLAIN, NULL, 0, lock_mode, - &flags, ldlm_completion_ast, - mds_blocking_ast, NULL, 0, lockh); + res_id.name[0] = de->d_inode->i_ino; + res_id.name[1] = de->d_inode->i_generation; + rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, NULL, + res_id, LDLM_PLAIN, NULL, 0, lock_mode, + &flags, ldlm_completion_ast, + mds_blocking_ast, NULL, NULL, lockh); if (rc != ELDLM_OK) { l_dput(de); retval = ERR_PTR(-ENOLCK); /* XXX translate ldlm code */ @@ -217,7 +168,11 @@ struct dentry *mds_fid2locked_dentry(struct obd_device *obd, struct ll_fid *fid, #define DCACHE_DISCONNECTED DCACHE_NFSD_DISCONNECTED #endif + + /* Look up an entry by inode number. */ +/* this function ONLY returns valid dget'd dentries with an initialized inode + or errors */ struct dentry *mds_fid2dentry(struct mds_obd *mds, struct ll_fid *fid, struct vfsmount **mnt) { @@ -277,6 +232,8 @@ struct dentry *mds_fid2dentry(struct mds_obd *mds, struct ll_fid *fid, return result; } +static void mds_abort_recovery(void *data); + /* Establish a connection to the MDS. * * This will set up an export structure for the client to hold state data @@ -284,47 +241,25 @@ struct dentry *mds_fid2dentry(struct mds_obd *mds, struct ll_fid *fid, * on the server, etc. 
*/ static int mds_connect(struct lustre_handle *conn, struct obd_device *obd, - obd_uuid_t cluuid, struct recovd_obd *recovd, + struct obd_uuid *cluuid, struct recovd_obd *recovd, ptlrpc_recovery_cb_t recover) { struct obd_export *exp; struct mds_export_data *med; struct mds_client_data *mcd; - struct list_head *p; + struct mds_obd *mds = &obd->u.mds; int rc; ENTRY; if (!conn || !obd || !cluuid) RETURN(-EINVAL); - /* lctl gets a backstage, all-access pass. */ - if (!strcmp(cluuid, "OBD_CLASS_UUID")) - goto dont_check_exports; - - spin_lock(&obd->obd_dev_lock); - list_for_each(p, &obd->obd_exports) { - exp = list_entry(p, struct obd_export, exp_obd_chain); - mcd = exp->exp_mds_data.med_mcd; - if (!mcd) { - CERROR("FYI: NULL mcd - simultaneous connects\n"); - continue; - } - if (!memcmp(cluuid, mcd->mcd_uuid, sizeof mcd->mcd_uuid)) { - spin_unlock(&obd->obd_dev_lock); - LASSERT(exp->exp_obd == obd); - - RETURN(target_handle_reconnect(conn, exp, cluuid)); - } - } - spin_unlock(&obd->obd_dev_lock); - - if (obd->u.mds.mds_recoverable_clients != 0) { - CERROR("denying connection for new client %s: in recovery\n", - cluuid); - RETURN(-EBUSY); - } + /* Check for aborted recovery. */ + spin_lock_bh(&mds->mds_processing_task_lock); + if (obd->obd_flags & OBD_ABORT_RECOVERY) + mds_abort_recovery(mds); + spin_unlock_bh(&mds->mds_processing_task_lock); - dont_check_exports: /* XXX There is a small race between checking the list and adding a * new connection for the same UUID, but the real threat (list * corruption when multiple different clients connect) is solved. 
@@ -372,13 +307,22 @@ out_export: inline int mds_close_mfd(struct mds_file_data *mfd, struct mds_export_data *med) { struct file *file = mfd->mfd_file; + int rc; + struct dentry *de = NULL; LASSERT(file->private_data == mfd); + LASSERT(mfd->mfd_servercookie != DEAD_HANDLE_MAGIC); + list_del(&mfd->mfd_list); mfd->mfd_servercookie = DEAD_HANDLE_MAGIC; kmem_cache_free(mds_file_cache, mfd); - return filp_close(file, 0); + if (file->f_dentry->d_parent) + de = dget(file->f_dentry->d_parent); + rc = filp_close(file, 0); + if (de) + l_dput(de); + RETURN(rc); } static int mds_disconnect(struct lustre_handle *conn) @@ -490,7 +434,7 @@ static int mds_getlovinfo(struct ptlrpc_request *req) memcpy(desc, &mds->mds_lov_desc, sizeof *desc); lov_packdesc(desc); tgt_count = le32_to_cpu(desc->ld_tgt_count); - if (tgt_count * sizeof(obd_uuid_t) > streq->repbuf) { + if (tgt_count * sizeof(struct obd_uuid) > streq->repbuf) { CERROR("too many targets, enlarge client buffers\n"); req->rq_status = -ENOSPC; RETURN(0); @@ -507,7 +451,7 @@ static int mds_getlovinfo(struct ptlrpc_request *req) } int mds_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, - void *data, __u32 data_len, int flag) + void *data, int flag) { int do_ast; ENTRY; @@ -519,6 +463,16 @@ int mds_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, /* XXX layering violation! -phil */ l_lock(&lock->l_resource->lr_namespace->ns_lock); + /* Get this: if mds_blocking_ast is racing with ldlm_intent_policy, + * such that mds_blocking_ast is called just before l_i_p takes the + * ns_lock, then by the time we get the lock, we might not be the + * correct blocking function anymore. So check, and return early, if + * so. 
*/ + if (lock->l_blocking_ast != mds_blocking_ast) { + l_unlock(&lock->l_resource->lr_namespace->ns_lock); + RETURN(0); + } + lock->l_flags |= LDLM_FL_CBPENDING; do_ast = (!lock->l_readers && !lock->l_writers); l_unlock(&lock->l_resource->lr_namespace->ns_lock); @@ -532,25 +486,29 @@ int mds_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, rc = ldlm_cli_cancel(&lockh); if (rc < 0) CERROR("ldlm_cli_cancel: %d\n", rc); - } else - LDLM_DEBUG(lock, "Lock still has references, will be" + } else { + LDLM_DEBUG(lock, "Lock still has references, will be " "cancelled later"); + } RETURN(0); } -int mds_pack_md(struct mds_obd *mds, struct ptlrpc_request *req, +int mds_pack_md(struct obd_device *obd, struct lustre_msg *msg, int offset, struct mds_body *body, struct inode *inode) { + struct mds_obd *mds = &obd->u.mds; struct lov_mds_md *lmm; - int lmm_size = req->rq_repmsg->buflens[offset]; + int lmm_size = msg->buflens[offset]; int rc; + ENTRY; if (lmm_size == 0) { - CDEBUG(D_INFO, "no space reserved for inode %lu MD\n", inode->i_ino); + CDEBUG(D_INFO, "no space reserved for inode %lu MD\n", + inode->i_ino); RETURN(0); } - lmm = lustre_msg_buf(req->rq_repmsg, offset); + lmm = lustre_msg_buf(msg, offset); /* I don't really like this, but it is a sanity check on the client * MD request. However, if the client doesn't know how much space @@ -566,9 +524,9 @@ int mds_pack_md(struct mds_obd *mds, struct ptlrpc_request *req, * discarded right after unpacking, and the LOV can figure out the * size itself from the ost count. 
*/ - if ((rc = fsfilt_get_md(req->rq_export->exp_obd, inode, - lmm, lmm_size)) < 0) { - CDEBUG(D_INFO, "No md for ino %lu: rc = %d\n", inode->i_ino,rc); + if ((rc = fsfilt_get_md(obd, inode, lmm, lmm_size)) < 0) { + CDEBUG(D_INFO, "No md for ino %lu: rc = %d\n", + inode->i_ino, rc); } else if (rc > 0) { body->valid |= OBD_MD_FLEASIZE; rc = 0; @@ -577,7 +535,7 @@ int mds_pack_md(struct mds_obd *mds, struct ptlrpc_request *req, return rc; } -static int mds_getattr_internal(struct mds_obd *mds, struct dentry *dentry, +static int mds_getattr_internal(struct obd_device *obd, struct dentry *dentry, struct ptlrpc_request *req, struct mds_body *reqbody, int reply_off) { @@ -595,7 +553,8 @@ static int mds_getattr_internal(struct mds_obd *mds, struct dentry *dentry, mds_pack_inode2body(body, inode); if (S_ISREG(inode->i_mode) && reqbody->valid & OBD_MD_FLEASIZE) { - rc = mds_pack_md(mds, req, reply_off + 1, body, inode); + rc = mds_pack_md(obd, req->rq_repmsg, reply_off + 1, + body, inode); } else if (S_ISLNK(inode->i_mode) && reqbody->valid & OBD_MD_LINKNAME) { char *symname = lustre_msg_buf(req->rq_repmsg, reply_off + 1); int len = req->rq_repmsg->buflens[reply_off + 1]; @@ -664,19 +623,22 @@ static int mds_getattr_pack_msg(struct ptlrpc_request *req, struct inode *inode, return(rc); } -static int mds_getattr_name(int offset, struct ptlrpc_request *req) +static int mds_getattr_name(int offset, struct ptlrpc_request *req, + struct lustre_handle *child_lockh) { + struct ldlm_intent *it = lustre_msg_buf(req->rq_reqmsg, 1); + int lock_mode; struct mds_obd *mds = mds_req2mds(req); struct obd_device *obd = req->rq_export->exp_obd; struct obd_run_ctxt saved; struct mds_body *body; struct dentry *de = NULL, *dchild = NULL; struct inode *dir; - struct lustre_handle lockh; - char *name; - int namelen, flags = 0, lock_mode, rc = 0; struct obd_ucred uc; - __u64 res_id[3] = {0, 0, 0}; + struct ldlm_res_id child_res_id = { .name = {0} }; + struct lustre_handle parent_lockh; + int 
namelen, flags = 0, rc = 0; + char *name; ENTRY; LASSERT(!strcmp(obd->obd_type->typ_name, "mds")); @@ -696,56 +658,58 @@ static int mds_getattr_name(int offset, struct ptlrpc_request *req) uc.ouc_fsuid = body->fsuid; uc.ouc_fsgid = body->fsgid; uc.ouc_cap = body->capability; + uc.ouc_suppgid = body->suppgid; push_ctxt(&saved, &mds->mds_ctxt, &uc); - de = mds_fid2dentry(mds, &body->fid1, NULL); - if (IS_ERR(de)) { + /* Step 1: Lookup/lock parent */ + de = mds_fid2locked_dentry(obd, &body->fid1, NULL, LCK_PR, + &parent_lockh); + if (IS_ERR(de)) GOTO(out_pre_de, rc = PTR_ERR(de)); - } - dir = de->d_inode; - CDEBUG(D_INODE, "parent ino %lu, name %*s\n", dir->i_ino,namelen,name); + LASSERT(dir); - lock_mode = LCK_PR; - res_id[0] = dir->i_ino; - res_id[1] = dir->i_generation; - - rc = ldlm_lock_match(obd->obd_namespace, res_id, LDLM_PLAIN, - NULL, 0, lock_mode, &lockh); - if (rc == 0) { - LDLM_DEBUG_NOLOCK("enqueue res "LPU64, res_id[0]); - rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, NULL, - res_id, LDLM_PLAIN, NULL, 0, lock_mode, - &flags, ldlm_completion_ast, - mds_blocking_ast, NULL, 0, &lockh); - if (rc != ELDLM_OK) { - CERROR("lock enqueue: err: %d\n", rc); - GOTO(out_create_de, rc = -EIO); - } - } - ldlm_lock_dump_handle(D_OTHER, &lockh); + CDEBUG(D_INODE, "parent ino %lu, name %*s\n", dir->i_ino,namelen,name); - down(&dir->i_sem); + /* Step 2: Lookup child */ dchild = lookup_one_len(name, de, namelen - 1); - up(&dir->i_sem); if (IS_ERR(dchild)) { CDEBUG(D_INODE, "child lookup error %ld\n", PTR_ERR(dchild)); - GOTO(out_create_dchild, rc = PTR_ERR(dchild)); + GOTO(out_step_1, rc = PTR_ERR(dchild)); } else if (dchild->d_inode == NULL) { - GOTO(out_create_dchild, rc = -ENOENT); + GOTO(out_step_2, rc = -ENOENT); + } + + /* Step 3: Lock child */ + if (it->opc == IT_SETATTR) + lock_mode = LCK_PW; + else + lock_mode = LCK_PR; + child_res_id.name[0] = dchild->d_inode->i_ino; + child_res_id.name[1] = dchild->d_inode->i_generation; + rc = ldlm_cli_enqueue(NULL, 
NULL, obd->obd_namespace, NULL, + child_res_id, LDLM_PLAIN, NULL, 0, lock_mode, + &flags, ldlm_completion_ast, mds_blocking_ast, + NULL, NULL, child_lockh); + if (rc != ELDLM_OK) { + CERROR("ldlm_cli_enqueue: %d\n", rc); + GOTO(out_step_2, rc = -EIO); } if (req->rq_repmsg == NULL) mds_getattr_pack_msg(req, dchild->d_inode, offset); - rc = mds_getattr_internal(mds, dchild, req, body, offset); - - EXIT; -out_create_dchild: + rc = mds_getattr_internal(obd, dchild, req, body, offset); + if (rc) + GOTO(out_step_3, rc); + GOTO(out_step_2, rc); /* returns the lock to the client */ + out_step_3: + ldlm_lock_decref(child_lockh, LCK_PR); + out_step_2: l_dput(dchild); - ldlm_lock_decref(&lockh, lock_mode); -out_create_de: + out_step_1: + ldlm_lock_decref(&parent_lockh, LCK_PR); l_dput(de); -out_pre_de: + out_pre_de: req->rq_status = rc; pop_ctxt(&saved, &mds->mds_ctxt, &uc); return rc; @@ -754,6 +718,7 @@ out_pre_de: static int mds_getattr(int offset, struct ptlrpc_request *req) { struct mds_obd *mds = mds_req2mds(req); + struct obd_device *obd = req->rq_export->exp_obd; struct obd_run_ctxt saved; struct dentry *de; struct mds_body *body; @@ -774,10 +739,10 @@ static int mds_getattr(int offset, struct ptlrpc_request *req) rc = mds_getattr_pack_msg(req, de->d_inode, offset); - req->rq_status = mds_getattr_internal(mds, de, req, body, 0); + req->rq_status = mds_getattr_internal(obd, de, req, body, 0); l_dput(de); - EXIT; + GOTO(out_pop, rc); out_pop: pop_ctxt(&saved, &mds->mds_ctxt, &uc); return rc; @@ -828,6 +793,8 @@ static struct mds_file_data *mds_handle2mfd(struct lustre_handle *handle) RETURN(mfd); } +#if 0 + static int mds_store_md(struct mds_obd *mds, struct ptlrpc_request *req, int offset, struct mds_body *body, struct inode *inode) { @@ -875,105 +842,7 @@ out_ea: RETURN(rc); } -static int mds_open(struct ptlrpc_request *req) -{ - struct mds_obd *mds = mds_req2mds(req); - struct mds_body *body; - struct mds_export_data *med; - struct mds_file_data *mfd; - struct dentry 
*de; - struct file *file; - struct vfsmount *mnt; - __u32 flags; - struct list_head *tmp; - int rc, size = sizeof(*body); - ENTRY; - - if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OPEN_PACK)) { - CERROR("test case OBD_FAIL_MDS_OPEN_PACK\n"); - req->rq_status = -ENOMEM; - RETURN(-ENOMEM); - } - - rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, &req->rq_repmsg); - if (rc) { - CERROR("mds: pack error: rc = %d\n", rc); - req->rq_status = rc; - RETURN(rc); - } - - body = lustre_msg_buf(req->rq_reqmsg, 0); - - /* was this animal open already and the client lost the reply? */ - /* XXX need some way to detect a reopen, to avoid locked list walks */ - med = &req->rq_export->exp_mds_data; - spin_lock(&med->med_open_lock); - list_for_each(tmp, &med->med_open_head) { - mfd = list_entry(tmp, typeof(*mfd), mfd_list); - if (!memcmp(&mfd->mfd_clienthandle, &body->handle, - sizeof(mfd->mfd_clienthandle)) && - body->fid1.id == mfd->mfd_file->f_dentry->d_inode->i_ino) { - de = mfd->mfd_file->f_dentry; - spin_unlock(&med->med_open_lock); - CERROR("Re opening "LPD64"\n", body->fid1.id); - GOTO(out_pack, rc = 0); - } - } - spin_unlock(&med->med_open_lock); - - mfd = kmem_cache_alloc(mds_file_cache, GFP_KERNEL); - if (!mfd) { - CERROR("mds: out of memory\n"); - req->rq_status = -ENOMEM; - RETURN(0); - } - - de = mds_fid2dentry(mds, &body->fid1, &mnt); - if (IS_ERR(de)) - GOTO(out_free, rc = PTR_ERR(de)); - - /* check if this inode has seen a delayed object creation */ - if (lustre_msg_get_op_flags(req->rq_reqmsg) & MDS_OPEN_HAS_EA) { - rc = mds_store_md(mds, req, 1, body, de->d_inode); - if (rc) { - l_dput(de); - mntput(mnt); - GOTO(out_free, rc); - } - } - - flags = body->flags; - /* dentry_open does a dput(de) and mntput(mnt) on error */ - file = dentry_open(de, mnt, flags & ~O_DIRECT); - if (IS_ERR(file)) { - rc = PTR_ERR(file); - GOTO(out_free, 0); - } - - file->private_data = mfd; - mfd->mfd_file = file; - memcpy(&mfd->mfd_clienthandle, &body->handle, sizeof(body->handle)); - 
get_random_bytes(&mfd->mfd_servercookie, sizeof(mfd->mfd_servercookie)); - spin_lock(&med->med_open_lock); - list_add(&mfd->mfd_list, &med->med_open_head); - spin_unlock(&med->med_open_lock); - -out_pack: - body = lustre_msg_buf(req->rq_repmsg, 0); - mds_pack_inode2fid(&body->fid1, de->d_inode); - mds_pack_inode2body(body, de->d_inode); - body->handle.addr = (__u64)(unsigned long)mfd; - body->handle.cookie = mfd->mfd_servercookie; - CDEBUG(D_INODE, "llite file "LPX64": addr %p, cookie "LPX64"\n", - mfd->mfd_clienthandle.addr, mfd, mfd->mfd_servercookie); - RETURN(0); - -out_free: - mfd->mfd_servercookie = DEAD_HANDLE_MAGIC; - kmem_cache_free(mds_file_cache, mfd); - req->rq_status = rc; - RETURN(0); -} +#endif static int mds_close(struct ptlrpc_request *req) { @@ -986,7 +855,7 @@ static int mds_close(struct ptlrpc_request *req) body = lustre_msg_buf(req->rq_reqmsg, 0); mfd = mds_handle2mfd(&body->handle); - if (!mfd) { + if (mfd == NULL) { DEBUG_REQ(D_ERROR, req, "no handle for file close "LPD64 ": addr "LPX64", cookie "LPX64"\n", body->fid1.id, body->handle.addr, @@ -1054,7 +923,8 @@ static int mds_readpage(struct ptlrpc_request *req) /* to make this asynchronous make sure that the handling function doesn't send a reply when this function completes. Instead a callback function would send the reply */ - rc = mds_sendpage(req, file, body->size); + /* body->blocks is actually the xid -phil */ + rc = mds_sendpage(req, file, body->size, body->blocks); filp_close(file, 0); out_pop: @@ -1064,70 +934,149 @@ out: RETURN(0); } -int mds_reint(struct ptlrpc_request *req, int offset) +int mds_reint(struct ptlrpc_request *req, int offset, + struct lustre_handle *lockh) { + struct mds_update_record *rec; /* 116 bytes on the stack? no sir! 
*/ int rc; - struct mds_update_record rec; - rc = mds_update_unpack(req, offset, &rec); + OBD_ALLOC(rec, sizeof(*rec)); + if (rec == NULL) + RETURN(-ENOMEM); + + rc = mds_update_unpack(req, offset, rec); if (rc || OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_UNPACK)) { CERROR("invalid record\n"); - req->rq_status = -EINVAL; - RETURN(0); + GOTO(out, req->rq_status = -EINVAL); } /* rc will be used to interrupt a for loop over multiple records */ - rc = mds_reint_rec(&rec, offset, req); + rc = mds_reint_rec(rec, offset, req, lockh); + out: + OBD_FREE(rec, sizeof(*rec)); return rc; } /* forward declaration */ int mds_handle(struct ptlrpc_request *req); +static void abort_delayed_replies(struct mds_obd *mds) +{ + struct ptlrpc_request *req; + struct list_head *tmp, *n; + list_for_each_safe(tmp, n, &mds->mds_delayed_reply_queue) { + req = list_entry(tmp, struct ptlrpc_request, rq_list); + DEBUG_REQ(D_ERROR, req, "aborted:"); + req->rq_status = -ENOTCONN; + req->rq_type = PTL_RPC_MSG_ERR; + ptlrpc_reply(req->rq_svc, req); + list_del(&req->rq_list); + OBD_FREE(req, sizeof *req); + } +} + +static void mds_abort_recovery(void *data) +{ + struct mds_obd *mds = data; + struct obd_device *obd = list_entry(mds, struct obd_device, u.mds); + CERROR("disconnecting clients and aborting recovery\n"); + mds->mds_recoverable_clients = 0; + obd->obd_flags &= ~(OBD_RECOVERING | OBD_ABORT_RECOVERY); + abort_delayed_replies(mds); + spin_unlock_bh(&mds->mds_processing_task_lock); + class_disconnect_all(obd); + spin_lock_bh(&mds->mds_processing_task_lock); +} + +static void mds_recovery_expired(unsigned long castmeharder) +{ + struct mds_obd *mds = (struct mds_obd *)castmeharder; + struct obd_device *obd = list_entry(mds, struct obd_device, u.mds); + CERROR("recovery timed out, aborting\n"); + spin_lock_bh(&mds->mds_processing_task_lock); + obd->obd_flags |= OBD_ABORT_RECOVERY; + wake_up(&mds->mds_next_transno_waitq); + spin_unlock_bh(&mds->mds_processing_task_lock); +} + +static void 
reset_recovery_timer(struct mds_obd *mds) +{ + CDEBUG(D_ERROR, "timer will expire in %ld seconds\n", + MDS_RECOVERY_TIMEOUT / HZ); + mod_timer(&mds->mds_recovery_timer, jiffies + MDS_RECOVERY_TIMEOUT); +} + +static void start_recovery_timer(struct mds_obd *mds) +{ + mds->mds_recovery_timer.function = mds_recovery_expired; + mds->mds_recovery_timer.data = (unsigned long)mds; + init_timer(&mds->mds_recovery_timer); + reset_recovery_timer(mds); +} + +static void cancel_recovery_timer(struct mds_obd *mds) +{ + del_timer(&mds->mds_recovery_timer); +} + static int check_for_next_transno(struct mds_obd *mds) { struct ptlrpc_request *req; + struct obd_device *obd = list_entry(mds, struct obd_device, u.mds); req = list_entry(mds->mds_recovery_queue.next, struct ptlrpc_request, rq_list); LASSERT(req->rq_reqmsg->transno >= mds->mds_next_recovery_transno); - return req->rq_reqmsg->transno == mds->mds_next_recovery_transno; + + return req->rq_reqmsg->transno == mds->mds_next_recovery_transno || + (obd->obd_flags & OBD_RECOVERING) == 0; } static void process_recovery_queue(struct mds_obd *mds) { struct ptlrpc_request *req; + struct obd_device *obd = list_entry(mds, struct obd_device, u.mds); + int aborted = 0; ENTRY; for (;;) { - spin_lock(&mds->mds_processing_task_lock); + spin_lock_bh(&mds->mds_processing_task_lock); LASSERT(mds->mds_processing_task == current->pid); req = list_entry(mds->mds_recovery_queue.next, struct ptlrpc_request, rq_list); if (req->rq_reqmsg->transno != mds->mds_next_recovery_transno) { - spin_unlock(&mds->mds_processing_task_lock); + spin_unlock_bh(&mds->mds_processing_task_lock); CDEBUG(D_HA, "Waiting for transno "LPD64" (1st is " LPD64")\n", mds->mds_next_recovery_transno, req->rq_reqmsg->transno); wait_event(mds->mds_next_transno_waitq, check_for_next_transno(mds)); + spin_lock_bh(&mds->mds_processing_task_lock); + if (obd->obd_flags & OBD_ABORT_RECOVERY) { + mds_abort_recovery(mds); + aborted = 1; + } + 
spin_unlock_bh(&mds->mds_processing_task_lock); + if (aborted) + return; continue; } list_del_init(&req->rq_list); - spin_unlock(&mds->mds_processing_task_lock); + spin_unlock_bh(&mds->mds_processing_task_lock); DEBUG_REQ(D_ERROR, req, "processing: "); (void)mds_handle(req); + reset_recovery_timer(mds); mds_fsync_super(mds->mds_sb); OBD_FREE(req, sizeof *req); - spin_lock(&mds->mds_processing_task_lock); + spin_lock_bh(&mds->mds_processing_task_lock); mds->mds_next_recovery_transno++; if (list_empty(&mds->mds_recovery_queue)) { mds->mds_processing_task = 0; - spin_unlock(&mds->mds_processing_task_lock); + spin_unlock_bh(&mds->mds_processing_task_lock); break; } - spin_unlock(&mds->mds_processing_task_lock); + spin_unlock_bh(&mds->mds_processing_task_lock); } EXIT; } @@ -1142,16 +1091,16 @@ static int queue_recovery_request(struct ptlrpc_request *req, if (!transno) { INIT_LIST_HEAD(&req->rq_list); - DEBUG_REQ(D_ERROR, req, "not queueing"); + DEBUG_REQ(D_HA, req, "not queueing"); return 1; } - spin_lock(&mds->mds_processing_task_lock); + spin_lock_bh(&mds->mds_processing_task_lock); if (mds->mds_processing_task == current->pid) { /* Processing the queue right now, don't re-add. */ LASSERT(list_empty(&req->rq_list)); - spin_unlock(&mds->mds_processing_task_lock); + spin_unlock_bh(&mds->mds_processing_task_lock); return 1; } @@ -1184,7 +1133,7 @@ static int queue_recovery_request(struct ptlrpc_request *req, */ if (transno == mds->mds_next_recovery_transno) wake_up(&mds->mds_next_transno_waitq); - spin_unlock(&mds->mds_processing_task_lock); + spin_unlock_bh(&mds->mds_processing_task_lock); return 0; } @@ -1192,7 +1141,7 @@ static int queue_recovery_request(struct ptlrpc_request *req, * now, so we'll do the honours. 
*/ mds->mds_processing_task = current->pid; - spin_unlock(&mds->mds_processing_task_lock); + spin_unlock_bh(&mds->mds_processing_task_lock); process_recovery_queue(mds); return 0; @@ -1202,12 +1151,12 @@ static int filter_recovery_request(struct ptlrpc_request *req, struct mds_obd *mds, int *process) { switch (req->rq_reqmsg->opc) { - case MDS_CONNECT: + case MDS_CONNECT: /* This will never get here, but for completeness. */ case MDS_DISCONNECT: *process = 1; RETURN(0); - case MDS_OPEN: + case MDS_CLOSE: case MDS_GETSTATUS: /* used in unmounting */ case MDS_REINT: case LDLM_ENQUEUE: @@ -1225,9 +1174,10 @@ static int filter_recovery_request(struct ptlrpc_request *req, static int mds_queue_final_reply(struct ptlrpc_request *req, int rc) { struct mds_obd *mds = mds_req2mds(req); + struct obd_device *mds_obd = list_entry(mds, struct obd_device, u.mds); struct ptlrpc_request *saved_req; - spin_lock(&mds->mds_processing_task_lock); + spin_lock_bh(&mds->mds_processing_task_lock); if (rc) { /* Just like ptlrpc_error, but without the sending. 
*/ lustre_pack_msg(0, NULL, NULL, &req->rq_replen, @@ -1245,6 +1195,7 @@ static int mds_queue_final_reply(struct ptlrpc_request *req, int rc) ldlm_reprocess_all_ns(req->rq_export->exp_obd->obd_namespace); CDEBUG(D_ERROR, "all clients recovered, sending delayed replies\n"); + mds_obd->obd_flags &= ~OBD_RECOVERING; list_for_each_safe(tmp, n, &mds->mds_delayed_reply_queue) { req = list_entry(tmp, struct ptlrpc_request, rq_list); DEBUG_REQ(D_ERROR, req, "delayed:"); @@ -1252,12 +1203,13 @@ static int mds_queue_final_reply(struct ptlrpc_request *req, int rc) list_del(&req->rq_list); OBD_FREE(req, sizeof *req); } + cancel_recovery_timer(mds); } else { CERROR("%d recoverable clients remain\n", mds->mds_recoverable_clients); } - spin_unlock(&mds->mds_processing_task_lock); + spin_unlock_bh(&mds->mds_processing_task_lock); return 1; } @@ -1266,14 +1218,15 @@ static char *reint_names[] = { [REINT_CREATE] "create", [REINT_LINK] "link", [REINT_UNLINK] "unlink", - [REINT_RENAME] "rename" + [REINT_RENAME] "rename", + [REINT_OPEN] "open", }; int mds_handle(struct ptlrpc_request *req) { - int rc; - int should_process; + int should_process, rc; struct mds_obd *mds = NULL; /* quell gcc overwarning */ + struct obd_device *mds_obd = NULL; ENTRY; rc = lustre_unpack_msg(req->rq_reqmsg, req->rq_reqlen); @@ -1282,20 +1235,42 @@ int mds_handle(struct ptlrpc_request *req) GOTO(out, rc); } + OBD_FAIL_RETURN(OBD_FAIL_MDS_ALL_REQUEST_NET | OBD_FAIL_ONCE, 0); + LASSERT(!strcmp(req->rq_obd->obd_type->typ_name, LUSTRE_MDT_NAME)); if (req->rq_reqmsg->opc != MDS_CONNECT) { + struct mds_export_data *med; if (req->rq_export == NULL) { req->rq_status = -ENOTCONN; GOTO(out, rc = -ENOTCONN); } - mds = mds_req2mds(req); - if (mds->mds_recoverable_clients != 0) { + med = &req->rq_export->exp_mds_data; + mds_obd = req->rq_export->exp_obd; + mds = &mds_obd->u.mds; + spin_lock_bh(&mds->mds_processing_task_lock); + if (mds_obd->obd_flags & OBD_ABORT_RECOVERY) + mds_abort_recovery(mds); + 
spin_unlock_bh(&mds->mds_processing_task_lock); + + if (mds_obd->obd_flags & OBD_RECOVERING) { rc = filter_recovery_request(req, mds, &should_process); if (rc || !should_process) RETURN(rc); + } else if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) { + if (req->rq_xid == med->med_last_xid) { + DEBUG_REQ(D_HA, req, "resending reply"); + OBD_ALLOC(req->rq_repmsg, med->med_last_replen); + req->rq_replen = med->med_last_replen; + memcpy(req->rq_repmsg, med->med_last_reply, + req->rq_replen); + ptlrpc_reply(req->rq_svc, req); + return 0; + } + DEBUG_REQ(D_HA, req, "no reply for resend, continuing"); } + } switch (req->rq_reqmsg->opc) { @@ -1309,12 +1284,6 @@ int mds_handle(struct ptlrpc_request *req) mds = mds_req2mds(req); mds_fsync_super(mds->mds_sb); } - - /* Let the client know if it can replay. */ - if (mds->mds_recoverable_clients) { - lustre_msg_add_flags(req->rq_repmsg, - MSG_REPLAY_IN_PROGRESS); - } break; case MDS_DISCONNECT: @@ -1344,12 +1313,15 @@ int mds_handle(struct ptlrpc_request *req) rc = mds_getattr(0, req); break; - case MDS_GETATTR_NAME: + case MDS_GETATTR_NAME: { + struct lustre_handle lockh; DEBUG_REQ(D_INODE, req, "getattr_name"); OBD_FAIL_RETURN(OBD_FAIL_MDS_GETATTR_NAME_NET, 0); - rc = mds_getattr_name(0, req); + rc = mds_getattr_name(0, req, &lockh); + if (rc == 0) + ldlm_lock_decref(&lockh, LCK_PR); break; - + } case MDS_STATFS: DEBUG_REQ(D_INODE, req, "statfs"); OBD_FAIL_RETURN(OBD_FAIL_MDS_STATFS_NET, 0); @@ -1366,32 +1338,30 @@ int mds_handle(struct ptlrpc_request *req) break; case MDS_REINT: { - int size = sizeof(struct mds_body); - int opc = *(u32 *)lustre_msg_buf(req->rq_reqmsg, 0), - realopc = opc & REINT_OPCODE_MASK; + int opc = *(u32 *)lustre_msg_buf(req->rq_reqmsg, 0); + int size[2] = {sizeof(struct mds_body), mds->mds_max_mdsize}; + int bufcount; DEBUG_REQ(D_INODE, req, "reint (%s%s)", - reint_names[realopc], + reint_names[opc & REINT_OPCODE_MASK], opc & REINT_REPLAYING ? 
"|REPLAYING" : ""); OBD_FAIL_RETURN(OBD_FAIL_MDS_REINT_NET, 0); - rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, - &req->rq_repmsg); - if (rc) { - req->rq_status = rc; + if (opc == REINT_UNLINK) + bufcount = 2; + else + bufcount = 1; + + rc = lustre_pack_msg(bufcount, size, NULL, + &req->rq_replen, &req->rq_repmsg); + if (rc) break; - } - rc = mds_reint(req, 0); - OBD_FAIL_RETURN(OBD_FAIL_MDS_REINT_NET_REP, 0); - break; - } - case MDS_OPEN: - DEBUG_REQ(D_INODE, req, "open"); - OBD_FAIL_RETURN(OBD_FAIL_MDS_OPEN_NET, 0); - rc = mds_open(req); + rc = mds_reint(req, 0, NULL); + OBD_FAIL_RETURN(OBD_FAIL_MDS_REINT_NET_REP, 0); break; + } case MDS_CLOSE: DEBUG_REQ(D_INODE, req, "close"); @@ -1402,7 +1372,8 @@ int mds_handle(struct ptlrpc_request *req) case LDLM_ENQUEUE: DEBUG_REQ(D_INODE, req, "enqueue"); OBD_FAIL_RETURN(OBD_FAIL_LDLM_ENQUEUE, 0); - rc = ldlm_handle_enqueue(req); + rc = ldlm_handle_enqueue(req, ldlm_server_completion_ast, + ldlm_server_blocking_ast); break; case LDLM_CONVERT: DEBUG_REQ(D_INODE, req, "convert"); @@ -1426,35 +1397,60 @@ int mds_handle(struct ptlrpc_request *req) /* If we're DISCONNECTing, the mds_export_data is already freed */ if (!rc && req->rq_reqmsg->opc != MDS_DISCONNECT) { struct mds_export_data *med = &req->rq_export->exp_mds_data; - + struct obd_device *obd = list_entry(mds, struct obd_device, + u.mds); req->rq_repmsg->last_xid = HTON__u64(le64_to_cpu(med->med_mcd->mcd_last_xid)); - req->rq_repmsg->last_committed = - HTON__u64(mds->mds_last_committed); + if ((obd->obd_flags & OBD_NO_TRANSNO) == 0) { + req->rq_repmsg->last_committed = + HTON__u64(obd->obd_last_committed); + } else { + DEBUG_REQ(D_IOCTL, req, + "not sending last_committed update"); + } CDEBUG(D_INFO, "last_transno %Lu, last_committed %Lu, xid %d\n", (unsigned long long)mds->mds_last_rcvd, - (unsigned long long)mds->mds_last_committed, + (unsigned long long)obd->obd_last_committed, cpu_to_le32(req->rq_xid)); } out: if (lustre_msg_get_flags(req->rq_reqmsg) & 
MSG_LAST_REPLAY) { - struct mds_obd *mds = mds_req2mds(req); - LASSERT(mds->mds_recoverable_clients); - DEBUG_REQ(D_HA, req, "LAST_REPLAY, queuing reply"); - return mds_queue_final_reply(req, rc); - } - - /* XXX bug 578 */ - /* MDS_CONNECT / EALREADY (note: not -EALREADY!) isn't an error */ - if (rc && (req->rq_reqmsg->opc != MDS_CONNECT || - rc != EALREADY)) { - DEBUG_REQ(D_ERROR, req, "processing error (%d)", rc); - ptlrpc_error(req->rq_svc, req); + if (mds_obd && (mds_obd->obd_flags & OBD_RECOVERING)) { + DEBUG_REQ(D_HA, req, "LAST_REPLAY, queuing reply"); + return mds_queue_final_reply(req, rc); + } + /* Lost a race with recovery; let the error path DTRT. */ + rc = req->rq_status = -ENOTCONN; + } + + if (req->rq_export && mds_obd && + (mds_obd->obd_flags & OBD_RECOVERING) == 0) { + struct mds_export_data *med = &req->rq_export->exp_mds_data; + if (med->med_last_reply) + OBD_FREE(med->med_last_reply, med->med_last_replen); + OBD_ALLOC(med->med_last_reply, req->rq_replen); + med->med_last_replen = req->rq_replen; + med->med_last_xid = req->rq_xid; + memcpy(med->med_last_reply, req->rq_repmsg, req->rq_replen); + /* XXX serialize */ + } + + if (!OBD_FAIL_CHECK(OBD_FAIL_MDS_ALL_REPLY_NET | OBD_FAIL_ONCE)) { + if (rc) { + DEBUG_REQ(D_ERROR, req, "processing error (%d)", rc); + ptlrpc_error(req->rq_svc, req); + } else { + DEBUG_REQ(D_NET, req, "sending reply"); + ptlrpc_reply(req->rq_svc, req); + } } else { - DEBUG_REQ(D_NET, req, "sending reply"); - ptlrpc_reply(req->rq_svc, req); + obd_fail_loc |= OBD_FAIL_ONCE | OBD_FAILED; + DEBUG_REQ(D_ERROR, req, "dropping reply"); + if (req->rq_repmsg) + OBD_FREE(req->rq_repmsg, req->rq_replen); } + return 0; } @@ -1539,6 +1535,9 @@ static int mds_setup(struct obd_device *obddev, obd_count len, void *buf) GOTO(err_put, rc); } + if (obddev->obd_flags & OBD_RECOVERING) + start_recovery_timer(mds); + obddev->obd_namespace = ldlm_namespace_new("mds_server", LDLM_NAMESPACE_SERVER); if (obddev->obd_namespace == NULL) { @@ -1598,11 
+1597,12 @@ static int mds_cleanup(struct obd_device *obddev) RETURN(0); } -static int ldlm_intent_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, - void *req_cookie, ldlm_mode_t mode, int flags, - void *data) +static int ldlm_intent_policy(struct ldlm_namespace *ns, + struct ldlm_lock **lockp, void *req_cookie, + ldlm_mode_t mode, int flags, void *data) { struct ptlrpc_request *req = req_cookie; + struct ldlm_lock *lock = *lockp; int rc = 0; ENTRY; @@ -1613,10 +1613,11 @@ static int ldlm_intent_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, /* an intent needs to be considered */ struct ldlm_intent *it = lustre_msg_buf(req->rq_reqmsg, 1); struct mds_obd *mds = &req->rq_export->exp_obd->u.mds; - struct mds_body *mds_rep; + struct mds_body *mds_body; struct ldlm_reply *rep; - __u64 new_resid[3] = {0, 0, 0}, old_res; - int rc, size[3] = {sizeof(struct ldlm_reply), + struct lustre_handle lockh; + struct ldlm_lock *new_lock; + int rc, offset = 2, repsize[3] = {sizeof(struct ldlm_reply), sizeof(struct mds_body), mds->mds_max_mdsize}; @@ -1625,7 +1626,7 @@ static int ldlm_intent_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, LDLM_DEBUG(lock, "intent policy, opc: %s", ldlm_it2str(it->opc)); - rc = lustre_pack_msg(3, size, NULL, &req->rq_replen, + rc = lustre_pack_msg(3, repsize, NULL, &req->rq_replen, &req->rq_repmsg); if (rc) { rc = req->rq_status = -ENOMEM; @@ -1633,102 +1634,103 @@ static int ldlm_intent_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, } rep = lustre_msg_buf(req->rq_repmsg, 0); - rep->lock_policy_res1 = 1; + rep->lock_policy_res1 = IT_INTENT_EXEC; /* execute policy */ switch ((long)it->opc) { + case IT_OPEN: case IT_CREAT|IT_OPEN: - rc = mds_reint(req, 2); - if (rc || (req->rq_status != 0 && - req->rq_status != -EEXIST)) { - rep->lock_policy_res2 = req->rq_status; + rc = mds_reint(req, offset, &lockh); + /* We return a dentry to the client if IT_OPEN_POS is + * set, or if we make it to the OPEN portion of the + * 
programme (which implies that we created) */ + if (!(rep->lock_policy_res1 & IT_OPEN_POS || + rep->lock_policy_res1 & IT_OPEN_OPEN)) { + rep->lock_policy_res2 = rc; RETURN(ELDLM_LOCK_ABORTED); } break; - case IT_CREAT: - case IT_MKDIR: - case IT_MKNOD: - case IT_RENAME2: - case IT_LINK2: - case IT_RMDIR: - case IT_SYMLINK: case IT_UNLINK: - rc = mds_reint(req, 2); - if (rc || (req->rq_status != 0 && - req->rq_status != -EISDIR && - req->rq_status != -ENOTDIR)) { + rc = mds_reint(req, offset, &lockh); + /* Don't return a lock if the unlink failed, or if we're + * not sending back an EA */ + if (rc) { + rep->lock_policy_res2 = rc; + RETURN(ELDLM_LOCK_ABORTED); + } + if (req->rq_status != 0) { rep->lock_policy_res2 = req->rq_status; RETURN(ELDLM_LOCK_ABORTED); } + mds_body = lustre_msg_buf(req->rq_repmsg, 1); + if (!(mds_body->valid & OBD_MD_FLEASIZE)) { + rep->lock_policy_res2 = rc; + RETURN(ELDLM_LOCK_ABORTED); + } break; case IT_GETATTR: case IT_LOOKUP: - case IT_OPEN: case IT_READDIR: - case IT_READLINK: - case IT_RENAME: - case IT_LINK: case IT_SETATTR: - rc = mds_getattr_name(2, req); + rc = mds_getattr_name(offset, req, &lockh); /* FIXME: we need to sit down and decide on who should * set req->rq_status, who should return negative and * positive return values, and what they all mean. */ - if (rc || req->rq_status != 0) { + if (rc) { + rep->lock_policy_res2 = rc; + RETURN(ELDLM_LOCK_ABORTED); + } + if (req->rq_status != 0) { rep->lock_policy_res2 = req->rq_status; RETURN(ELDLM_LOCK_ABORTED); } break; - case IT_READDIR|IT_OPEN: - LBUG(); - break; default: CERROR("Unhandled intent "LPD64"\n", it->opc); LBUG(); } - /* We don't bother returning a lock to the client for a file - * or directory we are removing. - * - * As for link and rename, there is no reason for the client - * to get a lock on the target at this point. If they are - * going to modify the file/directory later they will get a - * lock at that time. 
- */ - if (it->opc & (IT_UNLINK | IT_RMDIR | IT_LINK | IT_LINK2 | - IT_RENAME | IT_RENAME2)) - RETURN(ELDLM_LOCK_ABORTED); - - rep->lock_policy_res2 = req->rq_status; - mds_rep = lustre_msg_buf(req->rq_repmsg, 1); - - /* If the client is about to open a file that doesn't have an - * MD stripe record, it's going to need a write lock. - */ - if (it->opc & IT_OPEN && !(mds_rep->valid & OBD_MD_FLEASIZE)) { - LDLM_DEBUG(lock, "open with no EA; returning PW lock"); - lock->l_req_mode = LCK_PW; - } - if (flags & LDLM_FL_INTENT_ONLY) { LDLM_DEBUG(lock, "INTENT_ONLY, aborting lock"); RETURN(ELDLM_LOCK_ABORTED); } - /* Give the client a lock on the child object, instead of the - * parent that it requested. */ - new_resid[0] = NTOH__u32(mds_rep->ino); - new_resid[1] = NTOH__u32(mds_rep->generation); - if (new_resid[0] == 0) - LBUG(); - old_res = lock->l_resource->lr_name[0]; - ldlm_lock_change_resource(ns, lock, new_resid); - if (lock->l_resource == NULL) { - LBUG(); - RETURN(-ENOMEM); - } - LDLM_DEBUG(lock, "intent policy, old res %ld", - (long)old_res); - RETURN(ELDLM_LOCK_CHANGED); + /* By this point, whatever function we called above must have + * filled in 'lockh' or returned an error. We want to give the + * new lock to the client instead of whatever lock it was about + * to get. */ + new_lock = ldlm_handle2lock(&lockh); + LASSERT(new_lock != NULL); + mds_body = lustre_msg_buf(req->rq_repmsg, 1); + *lockp = new_lock; + + /* Fixup the lock to be given to the client */ + l_lock(&new_lock->l_resource->lr_namespace->ns_lock); + LASSERT(new_lock->l_readers + new_lock->l_writers == 1); + new_lock->l_readers = 0; + new_lock->l_writers = 0; + + new_lock->l_export = req->rq_export; + list_add(&new_lock->l_export_chain, + &new_lock->l_export->exp_ldlm_data.led_held_locks); + + /* We don't need to worry about completion_ast (which isn't set + * in 'lock' yet anyways), because this lock is already + * granted. 
*/ + new_lock->l_blocking_ast = lock->l_blocking_ast; + + memcpy(&new_lock->l_remote_handle, &lock->l_remote_handle, + sizeof(lock->l_remote_handle)); + + new_lock->l_flags &= ~(LDLM_FL_LOCAL | LDLM_FL_AST_SENT | + LDLM_FL_CBPENDING); + + LDLM_LOCK_PUT(new_lock); + l_unlock(&new_lock->l_resource->lr_namespace->ns_lock); + + rep->lock_policy_res2 = req->rq_status; + + RETURN(ELDLM_LOCK_REPLACED); } else { int size = sizeof(struct ldlm_reply); rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, @@ -1743,26 +1745,41 @@ static int ldlm_intent_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, int mds_attach(struct obd_device *dev, obd_count len, void *data) { - return lprocfs_reg_obd(dev, status_var_nm_1, dev); + struct lprocfs_static_vars lvars; + + lprocfs_init_multi_vars(0, &lvars); + return lprocfs_obd_attach(dev, lvars.obd_vars); } int mds_detach(struct obd_device *dev) { - return lprocfs_dereg_obd(dev); + return lprocfs_obd_detach(dev); +} + +int mdt_attach(struct obd_device *dev, obd_count len, void *data) +{ + struct lprocfs_static_vars lvars; + + lprocfs_init_multi_vars(1, &lvars); + return lprocfs_obd_attach(dev, lvars.obd_vars); +} + +int mdt_detach(struct obd_device *dev) +{ + return lprocfs_obd_detach(dev); } static int mdt_setup(struct obd_device *obddev, obd_count len, void *buf) { - int i; - // struct obd_ioctl_data* data = buf; struct mds_obd *mds = &obddev->u.mds; - int rc = 0; + struct obd_uuid uuid = { "self" }; + int i, rc = 0; ENTRY; mds->mds_service = ptlrpc_init_svc(MDS_NEVENTS, MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE, MDS_REQUEST_PORTAL, MDC_REPLY_PORTAL, - "self", mds_handle, "mds"); + &uuid, mds_handle, "mds"); if (!mds->mds_service) { CERROR("failed to start service\n"); RETURN(rc = -ENOMEM); @@ -1778,12 +1795,37 @@ static int mdt_setup(struct obd_device *obddev, obd_count len, void *buf) } } + mds->mds_getattr_service = + ptlrpc_init_svc(MDS_NEVENTS, MDS_NBUFS, + MDS_BUFSIZE, MDS_MAXREQSIZE, + MDS_GETATTR_PORTAL, MDC_REPLY_PORTAL, + 
&uuid, mds_handle, "mds"); + if (!mds->mds_getattr_service) { + CERROR("failed to start getattr service\n"); + GOTO(err_thread, rc = -ENOMEM); + } + + for (i = 0; i < MDT_NUM_THREADS; i++) { + char name[32]; + sprintf(name, "ll_mdt_attr_%02d", i); + rc = ptlrpc_start_thread(obddev, mds->mds_getattr_service, + name); + if (rc) { + CERROR("cannot start MDT getattr thread #%d: rc %d\n", + i, rc); + GOTO(err_thread2, rc); + } + } + RETURN(0); +err_thread2: + ptlrpc_stop_all_threads(mds->mds_getattr_service); + ptlrpc_unregister_service(mds->mds_getattr_service); err_thread: ptlrpc_stop_all_threads(mds->mds_service); ptlrpc_unregister_service(mds->mds_service); - RETURN(rc); + return rc; } @@ -1792,6 +1834,9 @@ static int mdt_cleanup(struct obd_device *obddev) struct mds_obd *mds = &obddev->u.mds; ENTRY; + ptlrpc_stop_all_threads(mds->mds_getattr_service); + ptlrpc_unregister_service(mds->mds_getattr_service); + ptlrpc_stop_all_threads(mds->mds_service); ptlrpc_unregister_service(mds->mds_service); @@ -1815,6 +1860,8 @@ static struct obd_ops mds_obd_ops = { static struct obd_ops mdt_obd_ops = { o_owner: THIS_MODULE, + o_attach: mdt_attach, + o_detach: mdt_detach, o_setup: mdt_setup, o_cleanup: mdt_cleanup, }; @@ -1822,14 +1869,17 @@ static struct obd_ops mdt_obd_ops = { static int __init mds_init(void) { + struct lprocfs_static_vars lvars; mds_file_cache = kmem_cache_create("ll_mds_file_data", sizeof(struct mds_file_data), 0, 0, NULL, NULL); if (mds_file_cache == NULL) return -ENOMEM; - class_register_type(&mds_obd_ops, status_class_var, LUSTRE_MDS_NAME); - class_register_type(&mdt_obd_ops, 0, LUSTRE_MDT_NAME); + lprocfs_init_multi_vars(0, &lvars); + class_register_type(&mds_obd_ops, lvars.module_vars, LUSTRE_MDS_NAME); + lprocfs_init_multi_vars(1, &lvars); + class_register_type(&mdt_obd_ops, lvars.module_vars, LUSTRE_MDT_NAME); ldlm_register_intent(ldlm_intent_policy); return 0; @@ -1844,8 +1894,8 @@ static void __exit mds_exit(void) CERROR("couldn't free MDS file 
cache\n"); } -MODULE_AUTHOR("Cluster File Systems "); -MODULE_DESCRIPTION("Lustre Metadata Server (MDS) v0.01"); +MODULE_AUTHOR("Cluster File Systems, Inc. "); +MODULE_DESCRIPTION("Lustre Metadata Server (MDS)"); MODULE_LICENSE("GPL"); module_init(mds_init); diff --git a/lustre/mds/lproc_mds.c b/lustre/mds/lproc_mds.c index 37c7bc8..eab0cf7 100644 --- a/lustre/mds/lproc_mds.c +++ b/lustre/mds/lproc_mds.c @@ -25,77 +25,28 @@ #include #include -int rd_uuid(char *page, char **start, off_t off, int count, int *eof, - void *data) -{ - struct obd_device* temp = (struct obd_device*)data; - return snprintf(page, count, "%s\n", temp->obd_uuid); -} +#ifndef LPROCFS +struct lprocfs_vars lprocfs_mds_obd_vars[] = { {0} }; +struct lprocfs_vars lprocfs_mds_module_vars[] = { {0} }; +struct lprocfs_vars lprocfs_mdt_obd_vars[] = { {0} }; +struct lprocfs_vars lprocfs_mdt_module_vars[] = { {0} }; -int rd_blksize(char *page, char **start, off_t off, int count, int *eof, - void *data) -{ - struct obd_device* temp = (struct obd_device*)data; - struct mds_obd *mds = &temp->u.mds; - struct statfs mystats; - int rc; +#else - rc = vfs_statfs(mds->mds_sb, &mystats); - if (rc) { - CERROR("mds: statfs failed: rc %d\n", rc); - return 0; - } - return snprintf(page, count, LPU64"\n", (__u64)(mystats.f_bsize)); -} - -int rd_kbtotal(char *page, char **start, off_t off, int count, int *eof, - void *data) +static inline +int lprocfs_mds_statfs(void *data, struct statfs *sfs) { - struct obd_device* temp = (struct obd_device*)data; - struct mds_obd *mds = &temp->u.mds; - struct statfs mystats; - int rc; - __u32 blk_size; - __u64 result; - - rc = vfs_statfs(mds->mds_sb, &mystats); - if (rc) { - CERROR("mds: statfs failed: rc %d\n", rc); - return 0; - } - - blk_size = mystats.f_bsize; - blk_size >>= 10; - result = mystats.f_blocks; - while(blk_size >>= 1) - result <<= 1; - - return snprintf(page, count, LPU64"\n", result); + struct obd_device* dev = (struct obd_device*) data; + struct mds_obd *mds = 
&dev->u.mds; + return vfs_statfs(mds->mds_sb, sfs); } -int rd_kbfree(char *page, char **start, off_t off, int count, int *eof, - void *data) -{ - struct obd_device* temp = (struct obd_device*)data; - struct mds_obd *mds = &temp->u.mds; - struct statfs mystats; - int rc; - __u32 blk_size; - __u64 result; - - rc = vfs_statfs(mds->mds_sb, &mystats); - if (rc) { - CERROR("mds: statfs failed: rc %d\n", rc); - return 0; - } - blk_size = mystats.f_bsize; - blk_size >>= 10; - result = mystats.f_blocks; - while (blk_size >>= 1) - result <<= 1; - - return snprintf(page, count, LPU64"\n", result); -} +DEFINE_LPROCFS_STATFS_FCT(rd_blksize, lprocfs_mds_statfs); +DEFINE_LPROCFS_STATFS_FCT(rd_kbytestotal, lprocfs_mds_statfs); +DEFINE_LPROCFS_STATFS_FCT(rd_kbytesfree, lprocfs_mds_statfs); +DEFINE_LPROCFS_STATFS_FCT(rd_filestotal, lprocfs_mds_statfs); +DEFINE_LPROCFS_STATFS_FCT(rd_filesfree, lprocfs_mds_statfs); +DEFINE_LPROCFS_STATFS_FCT(rd_filegroups, lprocfs_mds_statfs); int rd_fstype(char *page, char **start, off_t off, int count, int *eof, void *data) @@ -105,66 +56,40 @@ int rd_fstype(char *page, char **start, off_t off, int count, int *eof, return snprintf(page, count, "%s\n", obd->obd_fsops->fs_type); } -int rd_filestotal(char *page, char **start, off_t off, int count, int *eof, - void *data) -{ - struct obd_device* temp = (struct obd_device*)data; - struct mds_obd *mds = &temp->u.mds; - struct statfs mystats; - int rc; - rc = vfs_statfs(mds->mds_sb, &mystats); - if (rc) { - CERROR("mds: statfs failed: rc %d\n", rc); - return 0; - } - return snprintf(page, count, LPU64"\n", (__u64)(mystats.f_files)); -} - -int rd_filesfree(char *page, char **start, off_t off, int count, int *eof, - void *data) -{ - struct obd_device* temp = (struct obd_device*)data; - struct mds_obd *mds = &temp->u.mds; - struct statfs mystats; - int rc, len = 0; - - rc = vfs_statfs(mds->mds_sb, &mystats); - if (rc) { - CERROR("mds: statfs failed: rc %d\n", rc); - return 0; - } +struct lprocfs_vars 
lprocfs_mds_obd_vars[] = { + { "uuid", lprocfs_rd_uuid, 0, 0 }, + { "blocksize", rd_blksize, 0, 0 }, + { "bytestotal", rd_kbytestotal, 0, 0 }, + { "kbytesfree", rd_kbytesfree, 0, 0 }, + { "fstype", rd_fstype, 0, 0 }, + { "filestotal", rd_filestotal, 0, 0 }, + { "filesfree", rd_filesfree, 0, 0 }, + { "filegroups", rd_filegroups, 0, 0 }, + { 0 } +}; - len += snprintf(page, count, LPU64"\n", (__u64)(mystats.f_ffree)); - return len; -} +struct lprocfs_vars lprocfs_mds_module_vars[] = { + { "num_refs", lprocfs_rd_numrefs, 0, 0 }, + { 0 } +}; -int rd_filegroups(char *page, char **start, off_t off, int count, int *eof, - void *data) -{ - return 0; -} -struct lprocfs_vars status_var_nm_1[]={ - {"status/uuid", rd_uuid, 0, 0}, - {"status/blocksize",rd_blksize, 0, 0}, - {"status/kbytestotal",rd_kbtotal, 0, 0}, - {"status/kbytesfree", rd_kbfree, 0, 0}, - {"status/fstype", rd_fstype, 0, 0}, - {"status/filestotal", rd_filestotal, 0, 0}, - {"status/filesfree", rd_filesfree, 0, 0}, - {"status/filegroups", rd_filegroups, 0, 0}, - {0} +struct lprocfs_vars lprocfs_mdt_obd_vars[] = { + { "uuid", lprocfs_rd_uuid, 0, 0 }, + { 0 } }; -int rd_numrefs(char *page, char **start, off_t off, int count, int *eof, - void *data) -{ - struct obd_type *class = (struct obd_type*)data; +struct lprocfs_vars lprocfs_mdt_module_vars[] = { + { "num_refs", lprocfs_rd_numrefs, 0, 0 }, + { 0 } +}; - return snprintf(page, count, "%d\n", class->typ_refcnt); -} +#endif +struct lprocfs_static_vars lprocfs_array_vars[] = { {lprocfs_mds_module_vars, + lprocfs_mds_obd_vars}, + {lprocfs_mdt_module_vars, + lprocfs_mdt_obd_vars}}; -struct lprocfs_vars status_class_var[]={ - {"status/num_refs", rd_numrefs, 0, 0}, - {0} -}; +LPROCFS_INIT_MULTI_VARS(lprocfs_array_vars, + (sizeof(lprocfs_array_vars)/ + sizeof(struct lprocfs_static_vars))) diff --git a/lustre/mds/mds_fs.c b/lustre/mds/mds_fs.c index 3f6c420..bf04553 100644 --- a/lustre/mds/mds_fs.c +++ b/lustre/mds/mds_fs.c @@ -1,17 +1,26 @@ /* -*- mode: c; 
c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * linux/mds/mds_fs.c - * + * mds/mds_fs.c * Lustre Metadata Server (MDS) filesystem interface code * - * Copyright (C) 2002 Cluster File Systems, Inc. + * Copyright (C) 2002, 2003 Cluster File Systems, Inc. + * Author: Andreas Dilger + * + * This file is part of Lustre, http://www.lustre.org. * - * This code is issued under the GNU General Public License. - * See the file COPYING in this distribution + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. * - * by Andreas Dilger + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ #define EXPORT_SYMTAB @@ -79,8 +88,8 @@ int mds_client_add(struct mds_obd *mds, struct mds_export_data *med, int cl_off) push_ctxt(&saved, &mds->mds_ctxt, NULL); written = lustre_fwrite(mds->mds_rcvd_filp, - (char *)med->med_mcd, - sizeof(*med->med_mcd), &off); + (char *)med->med_mcd, + sizeof(*med->med_mcd), &off); pop_ctxt(&saved, &mds->mds_ctxt, NULL); if (written != sizeof(*med->med_mcd)) { @@ -133,6 +142,10 @@ int mds_client_free(struct obd_export *exp) med->med_mcd->mcd_uuid, med->med_off); } + if (med->med_last_reply) { + OBD_FREE(med->med_last_reply, med->med_last_replen); + med->med_last_reply = NULL; + } OBD_FREE(med->med_mcd, sizeof(*med->med_mcd)); return 0; @@ -177,7 +190,7 @@ static int mds_read_last_rcvd(struct obd_device *obddev, struct file *f) } CDEBUG(D_INODE, "last_rcvd has size %lu (msd + %lu clients)\n", - last_rcvd_size, (last_rcvd_size - sizeof *msd) / sizeof *mcd); + last_rcvd_size, (last_rcvd_size - MDS_LR_CLIENT)/MDS_LR_SIZE); /* * When we do a clean MDS shutdown, we save the last_rcvd into @@ -232,6 +245,8 @@ static int mds_read_last_rcvd(struct obd_device *obddev, struct file *f) break; } + memcpy(&exp->exp_client_uuid.uuid, mcd->mcd_uuid, + sizeof exp->exp_client_uuid.uuid); med = &exp->exp_mds_data; med->med_mcd = mcd; mds_client_add(mds, med, cl_off); @@ -255,11 +270,12 @@ static int mds_read_last_rcvd(struct obd_device *obddev, struct file *f) mds->mds_last_rcvd = last_rcvd; } - mds->mds_last_committed = mds->mds_last_rcvd; + obddev->obd_last_committed = mds->mds_last_rcvd; if (mds->mds_recoverable_clients) { CERROR("RECOVERY: %d recoverable clients, last_rcvd "LPU64"\n", mds->mds_recoverable_clients, mds->mds_last_rcvd); - mds->mds_next_recovery_transno = mds->mds_last_committed + 1; + mds->mds_next_recovery_transno = obddev->obd_last_committed + 1; + obddev->obd_flags |= OBD_RECOVERING; } if (mcd) diff --git a/lustre/mds/mds_lov.c b/lustre/mds/mds_lov.c index b548792..fef9a0d 100644 --- a/lustre/mds/mds_lov.c +++ 
b/lustre/mds/mds_lov.c @@ -2,16 +2,25 @@ * vim:expandtab:shiftwidth=8:tabstop=8: * * linux/mds/mds_lov.c - * * Lustre Metadata Server (mds) handling of striped file data * - * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Copyright (C) 2001-2003 Cluster File Systems, Inc. + * Author: Peter Braam + * + * This file is part of Lustre, http://www.lustre.org. * - * This code is issued under the GNU General Public License. - * See the file COPYING in this distribution + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. * - * by Peter Braam & + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ #define EXPORT_SYMTAB @@ -35,7 +44,7 @@ void lov_packdesc(struct lov_desc *ld) } int mds_set_lovdesc(struct obd_device *obd, struct lov_desc *desc, - obd_uuid_t *uuidarray) + struct obd_uuid *uuidarray) { struct mds_obd *mds = &obd->u.mds; struct obd_run_ctxt saved; @@ -99,11 +108,11 @@ int mds_set_lovdesc(struct obd_device *obd, struct lov_desc *desc, #warning FIXME: if there is an existing LOVTGTS, verify existing UUIDs same rc = 0; for (i = 0; i < tgt_count ; i++) { - rc = lustre_fwrite(f, uuidarray[i], + rc = lustre_fwrite(f, uuidarray[i].uuid, sizeof(uuidarray[i]), &f->f_pos); if (rc != sizeof(uuidarray[i])) { CERROR("cannot write LOV UUID %s (%d)\n", - uuidarray[i], i); + uuidarray[i].uuid, i); if (rc >= 0) rc = -EIO; break; @@ -148,7 +157,7 @@ out: return rc; } -int mds_get_lovtgts(struct mds_obd *mds, int tgt_count,obd_uuid_t *uuidarray) +int mds_get_lovtgts(struct mds_obd *mds, int tgt_count,struct obd_uuid *uuidarray) { struct obd_run_ctxt saved; struct file *f; @@ -188,11 +197,10 @@ int mds_iocontrol(unsigned int cmd, struct lustre_handle *conn, struct obd_device *obd = class_conn2obd(conn); struct obd_ioctl_data *data = karg; struct lov_desc *desc; - obd_uuid_t *uuidarray; + struct obd_uuid *uuidarray; int count; int rc; - switch (cmd) { case OBD_IOC_LOV_SET_CONFIG: desc = (struct lov_desc *)data->ioc_inlbuf1; @@ -202,7 +210,7 @@ int mds_iocontrol(unsigned int cmd, struct lustre_handle *conn, } count = desc->ld_tgt_count; - uuidarray = (obd_uuid_t *)data->ioc_inlbuf2; + uuidarray = (struct obd_uuid *)data->ioc_inlbuf2; if (sizeof(*uuidarray) * count != data->ioc_inllen2) { CERROR("UUID array size wrong\n"); RETURN(-EINVAL); @@ -218,7 +226,7 @@ int mds_iocontrol(unsigned int cmd, struct lustre_handle *conn, } count = desc->ld_tgt_count; - uuidarray = (obd_uuid_t *)data->ioc_inlbuf2; + uuidarray = (struct obd_uuid *)data->ioc_inlbuf2; if (sizeof(*uuidarray) * count != data->ioc_inllen2) { CERROR("UUID array size wrong\n"); RETURN(-EINVAL); @@ -231,9 
+239,15 @@ int mds_iocontrol(unsigned int cmd, struct lustre_handle *conn, rc = mds_get_lovtgts(&obd->u.mds, desc->ld_tgt_count, uuidarray); RETURN(rc); + + case OBD_IOC_SET_READONLY: + CERROR("setting device %s read-only\n", + ll_bdevname(obd->u.mds.mds_sb->s_dev)); + dev_set_rdonly(obd->u.mds.mds_sb->s_dev, 2); + RETURN(0); + default: RETURN(-EINVAL); } - RETURN(0); } diff --git a/lustre/mds/mds_open.c b/lustre/mds/mds_open.c new file mode 100644 index 0000000..2f65384 --- /dev/null +++ b/lustre/mds/mds_open.c @@ -0,0 +1,238 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lustre/mds/handler.c + * Lustre Metadata Server (mds) request handler + * + * Copyright (c) 2001, 2002 Cluster File Systems, Inc. + * Author: Peter Braam + * Author: Andreas Dilger + * Author: Phil Schwan + * Author: Mike Shaver + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#define EXPORT_SYMTAB +#define DEBUG_SUBSYSTEM S_MDS + +#include +#include +#include +#include +#include +#include +#include +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +#include +#include +#endif +#include +#include +#include +#include + +extern kmem_cache_t *mds_file_cache; +extern inline struct mds_obd *mds_req2mds(struct ptlrpc_request *req); +extern void mds_start_transno(struct mds_obd *mds); +extern int mds_finish_transno(struct mds_obd *mds, void *handle, + struct ptlrpc_request *req, int rc); +extern int enqueue_ordered_locks(int lock_mode, struct obd_device *obd, + struct ldlm_res_id *p1_res_id, + struct ldlm_res_id *p2_res_id, + struct ldlm_res_id *c1_res_id, + struct ldlm_res_id *c2_res_id, + struct lustre_handle *p1_lockh, + struct lustre_handle *p2_lockh, + struct lustre_handle *c1_lockh, + struct lustre_handle *c2_lockh); + +int mds_open(struct mds_update_record *rec, int offset, + struct ptlrpc_request *req, struct lustre_handle *child_lockh) +{ + struct mds_obd *mds = mds_req2mds(req); + struct obd_device *obd = req->rq_export->exp_obd; + struct ldlm_reply *rep = lustre_msg_buf(req->rq_repmsg, 0); + struct file *file; + struct mds_body *body = lustre_msg_buf(req->rq_repmsg, 1); + struct dentry *dchild, *parent; + struct mds_export_data *med; + struct mds_file_data *mfd = NULL; + struct ldlm_res_id child_res_id = { .name = {0} }; + struct lustre_handle parent_lockh; + int rc = 0, parent_mode, child_mode = LCK_PR, lock_flags, created = 0; + ENTRY; + +#warning replay of open needs to be redone + /* was this animal open already and the client lost the reply? 
*/ + /* XXX need some way to detect a reopen, to avoid locked list walks */ + med = &req->rq_export->exp_mds_data; +#if 0 + spin_lock(&med->med_open_lock); + list_for_each(tmp, &med->med_open_head) { + mfd = list_entry(tmp, typeof(*mfd), mfd_list); + if (!memcmp(&mfd->mfd_clienthandle, &body->handle, + sizeof(mfd->mfd_clienthandle)) && + body->fid1.id == mfd->mfd_file->f_dentry->d_inode->i_ino) { + dchild = mfd->mfd_file->f_dentry; + spin_unlock(&med->med_open_lock); + CERROR("Re opening "LPD64"\n", body->fid1.id); + GOTO(out_pack, rc = 0); + } + } + spin_unlock(&med->med_open_lock); +#endif + rep->lock_policy_res1 |= IT_OPEN_LOOKUP; + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OPEN_PACK)) { + CERROR("test case OBD_FAIL_MDS_OPEN_PACK\n"); + req->rq_status = -ENOMEM; + RETURN(-ENOMEM); + } + + /* Step 1: Find and lock the parent */ + parent_mode = (rec->ur_flags & O_CREAT) ? LCK_PW : LCK_PR; + parent = mds_fid2locked_dentry(obd, rec->ur_fid1, NULL, parent_mode, + &parent_lockh); + if (IS_ERR(parent)) { + rc = PTR_ERR(parent); + CERROR("parent lookup error %d\n", rc); + LBUG(); + RETURN(rc); + } + LASSERT(parent->d_inode); + + /* Step 2: Lookup the child */ + dchild = lookup_one_len(lustre_msg_buf(req->rq_reqmsg, 3), + parent, req->rq_reqmsg->buflens[3] - 1); + if (IS_ERR(dchild)) + GOTO(out_step_2, rc = PTR_ERR(dchild)); + + if (dchild->d_inode) + rep->lock_policy_res1 |= IT_OPEN_POS; + else + rep->lock_policy_res1 |= IT_OPEN_NEG; + + /* Step 3: If the child was negative, and we're supposed to, + * create it. 
*/ + if ((rec->ur_flags & O_CREAT) && !dchild->d_inode) { + int err; + void *handle; + mds_start_transno(mds); + rep->lock_policy_res1 |= IT_OPEN_CREATE; + handle = fsfilt_start(obd, parent->d_inode, FSFILT_OP_CREATE); + if (IS_ERR(handle)) { + rc = PTR_ERR(handle); + mds_finish_transno(mds, handle, req, rc); + GOTO(out_step_3, rc); + } + rc = vfs_create(parent->d_inode, dchild, rec->ur_mode); + rc = mds_finish_transno(mds, handle, req, rc); + err = fsfilt_commit(obd, parent->d_inode, handle); + if (rc || err) { + CERROR("error on commit: err = %d\n", err); + if (!rc) + rc = err; + GOTO(out_step_3, rc); + } + created = 1; + child_mode = LCK_PW; + } else if (!dchild->d_inode) { + /* It's negative and we weren't supposed to create it */ + GOTO(out_step_3, rc = -ENOENT); + } + + /* Step 4: It's positive, so lock the child */ + child_res_id.name[0] = dchild->d_inode->i_ino; + child_res_id.name[1] = dchild->d_inode->i_generation; + reacquire: + lock_flags = 0; + rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, NULL, + child_res_id, LDLM_PLAIN, NULL, 0, child_mode, + &lock_flags, ldlm_completion_ast, + mds_blocking_ast, NULL, NULL, child_lockh); + if (rc != ELDLM_OK) { + CERROR("ldlm_cli_enqueue: %d\n", rc); + GOTO(out_step_3, rc = -EIO); + } + + mds_pack_inode2fid(&body->fid1, dchild->d_inode); + mds_pack_inode2body(body, dchild->d_inode); + if (S_ISREG(dchild->d_inode->i_mode)) { + rc = mds_pack_md(obd, req->rq_repmsg, 2, body, dchild->d_inode); + if (rc) + GOTO(out_step_4, rc); + } else { + /* If this isn't a regular file, we can't open it. */ + GOTO(out_step_3, rc = 0); /* returns the lock to the client */ + } + + if (!created && (rec->ur_flags & O_CREAT) && (rec->ur_flags & O_EXCL)) { + /* File already exists, we didn't just create it, and we + * were passed O_EXCL; err-or. */ + GOTO(out_step_3, rc = -EEXIST); // returns a lock to the client + } + + /* If we're opening a file without an EA, the client needs a write + * lock. 
*/ + if (child_mode != LCK_PW && S_ISREG(dchild->d_inode->i_mode) && + !(body->valid & OBD_MD_FLEASIZE)) { + ldlm_lock_decref(child_lockh, child_mode); + child_mode = LCK_PW; + goto reacquire; + } + + /* Step 5: Open it */ + rep->lock_policy_res1 |= IT_OPEN_OPEN; + mfd = kmem_cache_alloc(mds_file_cache, GFP_KERNEL); + if (!mfd) { + CERROR("mds: out of memory\n"); + GOTO(out_step_4, req->rq_status = -ENOMEM); + } + + /* dentry_open does a dput(de) and mntput(mds->mds_vfsmnt) on error */ + mntget(mds->mds_vfsmnt); + file = dentry_open(dchild,mds->mds_vfsmnt, + rec->ur_flags & ~(O_DIRECT | O_TRUNC)); + if (IS_ERR(file)) + GOTO(out_step_5, rc = PTR_ERR(file)); + + file->private_data = mfd; + mfd->mfd_file = file; + get_random_bytes(&mfd->mfd_servercookie, sizeof(mfd->mfd_servercookie)); + spin_lock(&med->med_open_lock); + list_add(&mfd->mfd_list, &med->med_open_head); + spin_unlock(&med->med_open_lock); + + body->handle.addr = (__u64)(unsigned long)mfd; + body->handle.cookie = mfd->mfd_servercookie; + CDEBUG(D_INODE, "file %p: mfd %p, cookie "LPX64"\n", + mfd->mfd_file, mfd, mfd->mfd_servercookie); + GOTO(out_step_2, rc = 0); /* returns a lock to the client */ + + out_step_5: + if (mfd != NULL) { + kmem_cache_free(mds_file_cache, mfd); + mfd = NULL; + } + out_step_4: + ldlm_lock_decref(child_lockh, child_mode); + out_step_3: + l_dput(dchild); + out_step_2: + l_dput(parent); + ldlm_lock_decref(&parent_lockh, parent_mode); + RETURN(rc); +} diff --git a/lustre/mds/mds_reint.c b/lustre/mds/mds_reint.c index 3d340f7..608747f 100644 --- a/lustre/mds/mds_reint.c +++ b/lustre/mds/mds_reint.c @@ -4,7 +4,10 @@ * linux/mds/mds_reint.c * Lustre Metadata Server (mds) reintegration routines * - * Copyright (C) 2002 Cluster File Systems, Inc. + * Copyright (C) 2002, 2003 Cluster File Systems, Inc. + * Author: Peter Braam + * Author: Andreas Dilger + * Author: Phil Schwan * * This file is part of Lustre, http://www.lustre.org. 
* @@ -20,9 +23,6 @@ * You should have received a copy of the GNU General Public License * along with Lustre; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Author: Peter Braam - * Author: Andreas Dilger */ #define EXPORT_SYMTAB @@ -41,12 +41,10 @@ extern inline struct mds_obd *mds_req2mds(struct ptlrpc_request *req); static void mds_last_rcvd_cb(struct obd_device *obd, __u64 last_rcvd, int error) { - struct mds_obd *mds = &obd->u.mds; - CDEBUG(D_HA, "got callback for last_rcvd "LPD64": rc = %d\n", last_rcvd, error); - if (!error && last_rcvd > mds->mds_last_committed) - mds->mds_last_committed = last_rcvd; + if (!error && last_rcvd > obd->obd_last_committed) + obd->obd_last_committed = last_rcvd; } void mds_start_transno(struct mds_obd *mds) @@ -102,11 +100,11 @@ int mds_finish_transno(struct mds_obd *mds, void *handle, return rc; } -/* In the write-back case, the client holds a lock on a subtree. - * In the intent case, the client holds a lock on the child inode. - * In the pathname case, the client (may) hold a lock on the child inode. */ +/* In the write-back case, the client holds a lock on a subtree (not supported). + * In the intent case, the client holds a lock on the child inode. 
*/ static int mds_reint_setattr(struct mds_update_record *rec, int offset, - struct ptlrpc_request *req) + struct ptlrpc_request *req, + struct lustre_handle *lh) { struct mds_obd *mds = mds_req2mds(req); struct obd_device *obd = req->rq_export->exp_obd; @@ -114,39 +112,14 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset, struct dentry *de; struct inode *inode; void *handle; - struct lustre_handle child_lockh; int rc = 0, err; - if (req->rq_reqmsg->bufcount > offset + 1) { - struct dentry *dir; - struct lustre_handle dir_lockh; - char *name; - int namelen; - - /* a name was supplied by the client; fid1 is the directory */ - dir = mds_fid2locked_dentry(obd, rec->ur_fid1, NULL, LCK_PR, - &dir_lockh); - if (IS_ERR(dir)) { - LBUG(); - GOTO(out_setattr, rc = PTR_ERR(dir)); - } - - name = lustre_msg_buf(req->rq_reqmsg, offset + 1); - namelen = req->rq_reqmsg->buflens[offset + 1] - 1; - de = mds_name2locked_dentry(obd, dir, NULL, name, namelen, - 0, &child_lockh, LCK_PR); - l_dput(dir); - if (IS_ERR(de)) { - LBUG(); - GOTO(out_setattr_de, rc = PTR_ERR(de)); - } - } else { - de = mds_fid2dentry(mds, rec->ur_fid1, NULL); - if (!de || IS_ERR(de)) { - GOTO(out_setattr_de, rc = PTR_ERR(de)); - } - } + de = mds_fid2dentry(mds, rec->ur_fid1, NULL); + if (IS_ERR(de)) + GOTO(out_setattr, rc = PTR_ERR(de)); inode = de->d_inode; + + LASSERT(inode); CDEBUG(D_INODE, "ino %lu\n", inode->i_ino); OBD_FAIL_WRITE(OBD_FAIL_MDS_REINT_SETATTR_WRITE, @@ -161,15 +134,18 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset, } rc = fsfilt_setattr(obd, de, handle, &rec->ur_iattr); - - if (offset) { - body = lustre_msg_buf(req->rq_repmsg, 1); - mds_pack_inode2fid(&body->fid1, inode); - mds_pack_inode2body(body, inode); + if (rc == 0 && S_ISREG(inode->i_mode) && + req->rq_reqmsg->bufcount > 1) { + rc = fsfilt_set_md(obd, inode, handle, + lustre_msg_buf(req->rq_reqmsg, 1), + req->rq_reqmsg->buflens[1]); } - rc = mds_finish_transno(mds, handle, req, rc); + 
body = lustre_msg_buf(req->rq_repmsg, 0); + mds_pack_inode2fid(&body->fid1, inode); + mds_pack_inode2body(body, inode); + rc = mds_finish_transno(mds, handle, req, rc); err = fsfilt_commit(obd, de->d_inode, handle); if (err) { CERROR("error on commit: err = %d\n", err); @@ -186,7 +162,8 @@ out_setattr: } static int mds_reint_create(struct mds_update_record *rec, int offset, - struct ptlrpc_request *req) + struct ptlrpc_request *req, + struct lustre_handle *lh) { struct dentry *de = NULL; struct mds_obd *mds = mds_req2mds(req); @@ -195,21 +172,16 @@ static int mds_reint_create(struct mds_update_record *rec, int offset, struct inode *dir; void *handle; struct lustre_handle lockh; - int rc = 0, err, lock_mode, type = rec->ur_mode & S_IFMT; + int rc = 0, err, type = rec->ur_mode & S_IFMT; ENTRY; - /* requests were at offset 2, replies go back at 1 */ - if (offset) - offset = 1; - + LASSERT(offset == 0); LASSERT(!strcmp(req->rq_export->exp_obd->obd_type->typ_name, "mds")); - lock_mode = (req->rq_reqmsg->opc == MDS_REINT) ? 
LCK_CW : LCK_PW; - if (OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_CREATE)) GOTO(out_create, rc = -ESTALE); - de = mds_fid2locked_dentry(obd, rec->ur_fid1, NULL, lock_mode, &lockh); + de = mds_fid2locked_dentry(obd, rec->ur_fid1, NULL, LCK_PW, &lockh); if (IS_ERR(de)) { rc = PTR_ERR(de); CERROR("parent lookup error %d\n", rc); @@ -217,42 +189,17 @@ static int mds_reint_create(struct mds_update_record *rec, int offset, GOTO(out_create, rc); } dir = de->d_inode; - CDEBUG(D_INODE, "parent ino %lu name %s mode %o\n", + LASSERT(dir); + CDEBUG(D_INODE, "parent ino %lu creating name %s mode %o\n", dir->i_ino, rec->ur_name, rec->ur_mode); ldlm_lock_dump_handle(D_OTHER, &lockh); - down(&dir->i_sem); dchild = lookup_one_len(rec->ur_name, de, rec->ur_namelen - 1); if (IS_ERR(dchild)) { - CERROR("child lookup error %ld\n", PTR_ERR(dchild)); - LBUG(); - GOTO(out_create_de, rc = -ESTALE); - } - - if (dchild->d_inode) { - struct mds_body *body; - struct inode *inode = dchild->d_inode; - - CDEBUG(D_INODE, "child exists (dir %lu, name %s, ino %lu)\n", - dir->i_ino, rec->ur_name, dchild->d_inode->i_ino); - - /* XXX check that mode is correct? */ - - body = lustre_msg_buf(req->rq_repmsg, offset); - mds_pack_inode2fid(&body->fid1, inode); - mds_pack_inode2body(body, inode); - if (S_ISREG(inode->i_mode)) - mds_pack_md(mds, req, offset + 1, body, inode); - - /* This isn't an error for RECREATE. 
*/ - if (rec->ur_opcode & REINT_REPLAYING) { - CDEBUG(D_INODE, "EEXIST suppressed for REPLAYING\n"); - rc = 0; - } else { - rc = -EEXIST; - } - GOTO(out_create_dchild, rc); + rc = PTR_ERR(dchild); + CERROR("child lookup error %d\n", rc); + GOTO(out_create_de, rc); } OBD_FAIL_WRITE(OBD_FAIL_MDS_REINT_CREATE_WRITE, @@ -378,9 +325,8 @@ out_create_commit: } out_create_dchild: l_dput(dchild); - ldlm_lock_decref(&lockh, lock_mode); out_create_de: - up(&dir->i_sem); + ldlm_lock_decref(&lockh, LCK_PW); l_dput(de); out_create: req->rq_status = rc; @@ -414,238 +360,278 @@ out_create_unlink: goto out_create_commit; } +/* This function doesn't use ldlm_match_or_enqueue because we're always called + * with EX or PW locks, and the MDS is no longer allowed to match write locks, + * because they take the place of local semaphores. + * + * Two locks are taken in numerical order */ +int enqueue_ordered_locks(int lock_mode, struct obd_device *obd, + struct ldlm_res_id *p1_res_id, + struct ldlm_res_id *p2_res_id, + struct lustre_handle *p1_lockh, + struct lustre_handle *p2_lockh) +{ + struct ldlm_res_id res_id[2]; + struct lustre_handle *handles[2] = {p1_lockh, p2_lockh}; + int rc, flags; + ENTRY; + + LASSERT(p1_res_id != NULL && p2_res_id != NULL); + + CDEBUG(D_INFO, "locks before: "LPU64"/"LPU64"\n", + p1_res_id[0].name[0], p2_res_id[0].name[0]); + + if (p1_res_id->name[0] < p2_res_id->name[0]) { + handles[0] = p1_lockh; + handles[1] = p2_lockh; + res_id[0] = *p1_res_id; + res_id[1] = *p2_res_id; + } else { + handles[1] = p1_lockh; + handles[0] = p2_lockh; + res_id[1] = *p1_res_id; + res_id[0] = *p2_res_id; + } + + CDEBUG(D_INFO, "lock order: "LPU64"/"LPU64"\n", + res_id[0].name[0], res_id[1].name[0]); /* the sorted order actually enqueued below */ + + flags = LDLM_FL_LOCAL_ONLY; + rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, NULL, res_id[0], + LDLM_PLAIN, NULL, 0, lock_mode, &flags, + ldlm_completion_ast, mds_blocking_ast, NULL, + NULL, handles[0]); + if (rc != ELDLM_OK) + RETURN(-EIO); +
ldlm_lock_dump_handle(D_OTHER, handles[0]); + + if (memcmp(&res_id[0], &res_id[1], sizeof(res_id[0])) == 0) { + memcpy(handles[1], handles[0], sizeof(*(handles[1]))); + ldlm_lock_addref(handles[1], lock_mode); + } else { + flags = LDLM_FL_LOCAL_ONLY; + rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, NULL, + res_id[1], LDLM_PLAIN, NULL, 0, lock_mode, + &flags, ldlm_completion_ast, + mds_blocking_ast, NULL, 0, handles[1]); + if (rc != ELDLM_OK) { + ldlm_lock_decref(handles[0], lock_mode); + RETURN(-EIO); + } + } + ldlm_lock_dump_handle(D_OTHER, handles[1]); + + RETURN(0); +} + static int mds_reint_unlink(struct mds_update_record *rec, int offset, - struct ptlrpc_request *req) + struct ptlrpc_request *req, + struct lustre_handle *child_lockh) { - struct dentry *de = NULL; + struct dentry *dir_de = NULL; struct dentry *dchild = NULL; struct mds_obd *mds = mds_req2mds(req); struct obd_device *obd = req->rq_export->exp_obd; struct mds_body *body = NULL; + struct inode *dir_inode, *child_inode; + struct lustre_handle *handle, parent_lockh; + struct ldlm_res_id child_res_id = { .name = {0} }; char *name; - struct inode *dir, *inode; - struct lustre_handle lockh, child_lockh; - void *handle; - int namelen, lock_mode, err, rc = 0; + int namelen, err, rc = 0, flags = 0, return_lock = 0; ENTRY; - /* a name was supplied by the client; fid1 is the directory */ - lock_mode = (req->rq_reqmsg->opc == MDS_REINT) ? 
LCK_PW : LCK_PW; - de = mds_fid2locked_dentry(obd, rec->ur_fid1, NULL, lock_mode, &lockh); - if (IS_ERR(de)) { - LBUG(); - RETURN(PTR_ERR(de)); - } - if (OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_UNLINK)) - GOTO(out_unlink, rc = -ENOENT); + GOTO(out, rc = -ENOENT); + + /* Step 1: Lookup the parent by FID */ + dir_de = mds_fid2locked_dentry(obd, rec->ur_fid1, NULL, LCK_PW, + &parent_lockh); + if (IS_ERR(dir_de)) + GOTO(out, rc = PTR_ERR(dir_de)); + dir_inode = dir_de->d_inode; + LASSERT(dir_inode); + /* Step 2: Lookup the child */ name = lustre_msg_buf(req->rq_reqmsg, offset + 1); namelen = req->rq_reqmsg->buflens[offset + 1] - 1; -#warning "FIXME: if mds_name2locked_dentry decrefs this lock, we must not" - memcpy(&child_lockh, &lockh, sizeof(child_lockh)); - dchild = mds_name2locked_dentry(obd, de, NULL, name, namelen, - LCK_EX, &child_lockh, lock_mode); - - if (IS_ERR(dchild)) { - LBUG(); - GOTO(out_unlink, rc = PTR_ERR(dchild)); - } - - dir = de->d_inode; - inode = dchild->d_inode; - DEBUG_REQ(D_INODE, req, "parent ino %lu, child ino %lu\n", dir->i_ino, - inode ? inode->i_ino : 0); - if (!inode) { + dchild = lookup_one_len(name, dir_de, namelen); + if (IS_ERR(dchild)) + GOTO(out_step_2a, rc = PTR_ERR(dchild)); + child_inode = dchild->d_inode; + if (child_inode == NULL) { if (rec->ur_opcode & REINT_REPLAYING) { CDEBUG(D_INODE, "child missing (%lu/%s); OK for REPLAYING\n", - dir->i_ino, rec->ur_name); + dir_inode->i_ino, rec->ur_name); rc = 0; } else { CDEBUG(D_INODE, "child doesn't exist (dir %lu, name %s)\n", - dir->i_ino, rec->ur_name); + dir_inode->i_ino, rec->ur_name); rc = -ENOENT; } - /* going to out_unlink_cancel causes an LBUG, don't know why */ - GOTO(out_unlink_dchild, rc); + GOTO(out_step_2b, rc); } - if (offset) { - /* XXX offset? 
*/ - offset = 1; + DEBUG_REQ(D_INODE, req, "parent ino %lu, child ino %lu", + dir_inode->i_ino, child_inode->i_ino); - body = lustre_msg_buf(req->rq_repmsg, offset); - mds_pack_inode2fid(&body->fid1, inode); - mds_pack_inode2body(body, inode); - } + /* Step 3: Get lock a lock on the child */ + child_res_id.name[0] = child_inode->i_ino; + child_res_id.name[1] = child_inode->i_generation; + + rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, NULL, + child_res_id, LDLM_PLAIN, NULL, 0, LCK_EX, + &flags, ldlm_completion_ast, mds_blocking_ast, + NULL, NULL, child_lockh); + if (rc != ELDLM_OK) + GOTO(out_step_2b, rc); OBD_FAIL_WRITE(OBD_FAIL_MDS_REINT_UNLINK_WRITE, - to_kdev_t(dir->i_sb->s_dev)); + to_kdev_t(dir_inode->i_sb->s_dev)); + + /* Slightly magical; see ldlm_intent_policy */ + if (offset) + offset = 1; + + body = lustre_msg_buf(req->rq_repmsg, offset); + /* Step 4: Do the unlink: client decides between rmdir/unlink! + * (bug 72) */ mds_start_transno(mds); - switch (rec->ur_mode /* & S_IFMT ? 
*/) { + switch (rec->ur_mode & S_IFMT) { case S_IFDIR: - handle = fsfilt_start(obd, dir, FSFILT_OP_RMDIR); + handle = fsfilt_start(obd, dir_inode, FSFILT_OP_RMDIR); if (IS_ERR(handle)) - GOTO(out_unlink_cancel_transno, rc = PTR_ERR(handle)); - rc = vfs_rmdir(dir, dchild); + GOTO(out_cancel_transno, rc = PTR_ERR(handle)); + rc = vfs_rmdir(dir_inode, dchild); break; case S_IFREG: - /* get OBD EA data first so client can also destroy object */ - if ((inode->i_mode & S_IFMT) == S_IFREG && offset) - mds_pack_md(mds, req, offset + 1, body, inode); + /* If this is the last reference to this inode, get the OBD EA + * data first so the client can destroy OST objects */ + if ((child_inode->i_mode & S_IFMT) == S_IFREG && + child_inode->i_nlink == 1) { + mds_pack_inode2fid(&body->fid1, child_inode); + mds_pack_inode2body(body, child_inode); + mds_pack_md(obd, req->rq_repmsg, offset + 1, + body, child_inode); + if (body->valid & OBD_MD_FLEASIZE) + return_lock = 1; + } /* no break */ case S_IFLNK: case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: - handle = fsfilt_start(obd, dir, FSFILT_OP_UNLINK); + handle = fsfilt_start(obd, dir_inode, FSFILT_OP_UNLINK); if (IS_ERR(handle)) - GOTO(out_unlink_cancel_transno, rc = PTR_ERR(handle)); - rc = vfs_unlink(dir, dchild); + GOTO(out_cancel_transno, rc = PTR_ERR(handle)); + rc = vfs_unlink(dir_inode, dchild); break; default: CERROR("bad file type %o unlinking %s\n", rec->ur_mode, name); handle = NULL; LBUG(); - GOTO(out_unlink_cancel_transno, rc = -EINVAL); + GOTO(out_cancel_transno, rc = -EINVAL); } rc = mds_finish_transno(mds, handle, req, rc); - err = fsfilt_commit(obd, dir, handle); + err = fsfilt_commit(obd, dir_inode, handle); + if (rc != 0 || err != 0) { + /* Don't unlink the OST objects if the MDS unlink failed */ + body->valid = 0; + } if (err) { CERROR("error on commit: err = %d\n", err); if (!rc) rc = err; } - EXIT; - -out_unlink_cancel: - ldlm_lock_decref(&child_lockh, LCK_EX); - err = ldlm_cli_cancel(&child_lockh); - 
if (err < 0) { - CERROR("failed to cancel child inode lock: err = %d\n", err); - if (!rc) - rc = -ENOLCK; /*XXX translate LDLM lock error */ - } -out_unlink_dchild: + GOTO(out_step_4, rc); + out_step_4: + if (rc != 0 || return_lock == 0) + ldlm_lock_decref(child_lockh, LCK_EX); + out_step_2b: l_dput(dchild); - up(&dir->i_sem); -out_unlink: - ldlm_lock_decref(&lockh, lock_mode); - l_dput(de); + out_step_2a: + ldlm_lock_decref(&parent_lockh, LCK_PW); /* must match the LCK_PW mode the parent was locked with in Step 1 */ + l_dput(dir_de); + out: req->rq_status = rc; return 0; -out_unlink_cancel_transno: + out_cancel_transno: rc = mds_finish_transno(mds, handle, req, rc); - goto out_unlink_cancel; + goto out_step_4; } static int mds_reint_link(struct mds_update_record *rec, int offset, - struct ptlrpc_request *req) + struct ptlrpc_request *req, struct lustre_handle *lh) { struct obd_device *obd = req->rq_export->exp_obd; struct dentry *de_src = NULL; struct dentry *de_tgt_dir = NULL; struct dentry *dchild = NULL; struct mds_obd *mds = mds_req2mds(req); - struct lustre_handle *handle, tgtlockh, srclockh; - int lock_mode; - __u64 res_id[3] = { 0 }; - int flags = 0; - int rc = 0, err; - + struct lustre_handle *handle, tgt_dir_lockh, src_lockh; + struct ldlm_res_id src_res_id = { .name = {0} }; + struct ldlm_res_id tgt_dir_res_id = { .name = {0} }; + int lock_mode, rc = 0, err; ENTRY; - de_src = mds_fid2dentry(mds, rec->ur_fid1, NULL); - if (IS_ERR(de_src) || OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_LINK)) { - GOTO(out_link, rc = -ESTALE); - } - /* plan to change the link count on this inode: write lock */ - lock_mode = (req->rq_reqmsg->opc == MDS_REINT) ? 
LCK_PW : LCK_PW; - res_id[0] = de_src->d_inode->i_ino; - res_id[1] = de_src->d_inode->i_generation; + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_LINK)) + GOTO(out, rc = -ENOENT); - rc = ldlm_lock_match(obd->obd_namespace, res_id, LDLM_PLAIN, - NULL, 0, lock_mode, &srclockh); - if (rc == 0) { - LDLM_DEBUG_NOLOCK("enqueue res "LPU64, res_id[0]); - rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, NULL, - res_id, LDLM_PLAIN, NULL, 0, lock_mode, - &flags, ldlm_completion_ast, - mds_blocking_ast, NULL, 0, &srclockh); - if (rc != ELDLM_OK) { - CERROR("lock enqueue: err: %d\n", rc); - GOTO(out_link_src_put, rc = -EIO); - } - } else { - ldlm_lock_dump_handle(D_OTHER, &srclockh); - } + /* Step 1: Lookup the source inode and target directory by FID */ + de_src = mds_fid2dentry(mds, rec->ur_fid1, NULL); + if (IS_ERR(de_src)) + GOTO(out, rc = PTR_ERR(de_src)); de_tgt_dir = mds_fid2dentry(mds, rec->ur_fid2, NULL); - if (IS_ERR(de_tgt_dir)) { - GOTO(out_link_src, rc = -ESTALE); - } - - lock_mode = (req->rq_reqmsg->opc == MDS_REINT) ? 
LCK_PW : LCK_PW; - res_id[0] = de_tgt_dir->d_inode->i_ino; - res_id[1] = de_tgt_dir->d_inode->i_generation; - - rc = ldlm_lock_match(obd->obd_namespace, res_id, LDLM_PLAIN, - NULL, 0, lock_mode, &tgtlockh); - if (rc == 0) { - LDLM_DEBUG_NOLOCK("enqueue res "LPU64, res_id[0]); - rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, NULL, - res_id, LDLM_PLAIN, NULL, 0, lock_mode, - &flags, ldlm_completion_ast, - mds_blocking_ast, NULL, 0, &tgtlockh); - if (rc != ELDLM_OK) { - CERROR("lock enqueue: err: %d\n", rc); - GOTO(out_link_tgt_dir_put, rc = -EIO); - } - } else { - ldlm_lock_dump_handle(D_OTHER, &tgtlockh); - } - - down(&de_tgt_dir->d_inode->i_sem); + if (IS_ERR(de_tgt_dir)) + GOTO(out_de_src, rc = PTR_ERR(de_tgt_dir)); + + CDEBUG(D_INODE, "linking %*s/%s to inode %lu\n", + de_tgt_dir->d_name.len, de_tgt_dir->d_name.name, rec->ur_name, + de_src->d_inode->i_ino); + + /* Step 2: Take the two locks */ + lock_mode = LCK_EX; + src_res_id.name[0] = de_src->d_inode->i_ino; + src_res_id.name[1] = de_src->d_inode->i_generation; + tgt_dir_res_id.name[0] = de_tgt_dir->d_inode->i_ino; + tgt_dir_res_id.name[1] = de_tgt_dir->d_inode->i_generation; + + rc = enqueue_ordered_locks(LCK_EX, obd, &src_res_id, &tgt_dir_res_id, + &src_lockh, &tgt_dir_lockh); + if (rc != ELDLM_OK) + GOTO(out_tgt_dir, rc = -EIO); + + /* Step 3: Lookup the child */ dchild = lookup_one_len(rec->ur_name, de_tgt_dir, rec->ur_namelen - 1); if (IS_ERR(dchild)) { CERROR("child lookup error %ld\n", PTR_ERR(dchild)); - GOTO(out_link_tgt_dir, rc = -ESTALE); + GOTO(out_drop_locks, rc = PTR_ERR(dchild)); } if (dchild->d_inode) { - struct inode *inode = dchild->d_inode; - /* in intent case ship back attributes to client */ - if (offset) { - struct mds_body *body = - lustre_msg_buf(req->rq_repmsg, 1); - - mds_pack_inode2fid(&body->fid1, inode); - mds_pack_inode2body(body, inode); - if (S_ISREG(inode->i_mode)) - mds_pack_md(mds, req, 2, body, inode); - } if (rec->ur_opcode & REINT_REPLAYING) { /* XXX verify that the 
link is to the the right file? */ - rc = 0; CDEBUG(D_INODE, "child exists (dir %lu, name %s) (REPLAYING)\n", de_tgt_dir->d_inode->i_ino, rec->ur_name); + rc = 0; } else { - rc = -EEXIST; - CERROR("child exists (dir %lu, name %s)\n", + CDEBUG(D_INODE, "child exists (dir %lu, name %s)\n", de_tgt_dir->d_inode->i_ino, rec->ur_name); + rc = -EEXIST; } - GOTO(out_link_dchild, rc); + GOTO(out_drop_child, rc); } + /* Step 4: Do it. */ OBD_FAIL_WRITE(OBD_FAIL_MDS_REINT_LINK_WRITE, to_kdev_t(de_src->d_inode->i_sb->s_dev)); @@ -654,7 +640,7 @@ static int mds_reint_link(struct mds_update_record *rec, int offset, if (IS_ERR(handle)) { rc = PTR_ERR(handle); mds_finish_transno(mds, handle, req, rc); - GOTO(out_link_dchild, rc); + GOTO(out_drop_child, rc); } rc = vfs_link(de_src, de_tgt_dir->d_inode, dchild); @@ -668,26 +654,26 @@ static int mds_reint_link(struct mds_update_record *rec, int offset, if (!rc) rc = err; } + EXIT; -out_link_dchild: +out_drop_child: l_dput(dchild); -out_link_tgt_dir: - ldlm_lock_decref(&tgtlockh, lock_mode); -out_link_tgt_dir_put: - up(&de_tgt_dir->d_inode->i_sem); +out_drop_locks: + ldlm_lock_decref(&src_lockh, lock_mode); + ldlm_lock_decref(&tgt_dir_lockh, lock_mode); +out_tgt_dir: l_dput(de_tgt_dir); -out_link_src: - ldlm_lock_decref(&srclockh, lock_mode); -out_link_src_put: +out_de_src: l_dput(de_src); -out_link: +out: req->rq_status = rc; return 0; } static int mds_reint_rename(struct mds_update_record *rec, int offset, - struct ptlrpc_request *req) + struct ptlrpc_request *req, + struct lustre_handle *lockh) { struct obd_device *obd = req->rq_export->exp_obd; struct dentry *de_srcdir = NULL; @@ -695,93 +681,88 @@ static int mds_reint_rename(struct mds_update_record *rec, int offset, struct dentry *de_old = NULL; struct dentry *de_new = NULL; struct mds_obd *mds = mds_req2mds(req); - struct lustre_handle tgtlockh, srclockh, oldhandle; - int flags = 0, lock_mode, rc = 0, err; + struct lustre_handle dlm_handles[4]; + struct ldlm_res_id p1_res_id = { 
.name = {0} }; + struct ldlm_res_id p2_res_id = { .name = {0} }; + struct ldlm_res_id c1_res_id = { .name = {0} }; + struct ldlm_res_id c2_res_id = { .name = {0} }; + int rc = 0, err, lock_count = 3, flags = LDLM_FL_LOCAL_ONLY; void *handle; - __u64 res_id[3] = { 0 }; ENTRY; de_srcdir = mds_fid2dentry(mds, rec->ur_fid1, NULL); if (IS_ERR(de_srcdir)) - GOTO(out_rename, rc = -ESTALE); - - lock_mode = (req->rq_reqmsg->opc == MDS_REINT) ? LCK_PW : LCK_PW; - res_id[0] = de_srcdir->d_inode->i_ino; - res_id[1] = de_srcdir->d_inode->i_generation; - - rc = ldlm_lock_match(obd->obd_namespace, res_id, LDLM_PLAIN, - NULL, 0, lock_mode, &srclockh); - if (rc == 0) { - LDLM_DEBUG_NOLOCK("enqueue res "LPU64, res_id[0]); - rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, NULL, - res_id, LDLM_PLAIN, NULL, 0, lock_mode, - &flags, ldlm_completion_ast, - mds_blocking_ast, NULL, 0, &srclockh); - if (rc != ELDLM_OK) { - CERROR("lock enqueue: err: %d\n", rc); - GOTO(out_rename_srcput, rc = -EIO); - } - } else { - ldlm_lock_dump_handle(D_OTHER, &srclockh); - } - + GOTO(out, rc = PTR_ERR(de_srcdir)); de_tgtdir = mds_fid2dentry(mds, rec->ur_fid2, NULL); if (IS_ERR(de_tgtdir)) - GOTO(out_rename_srcdir, rc = -ESTALE); + GOTO(out_put_srcdir, rc = PTR_ERR(de_tgtdir)); + + /* The idea here is that we need to get four locks in the end: + * one on each parent directory, one on each child. We need to take + * these locks in some kind of order (to avoid deadlocks), and the order + * I selected is "increasing resource number" order. We need to take + * the locks on the parent directories, however, before we can lookup + * the children. Thus the following plan: + * + * 1. Take locks on the parent(s), in order + * 2. Lookup the children + * 3. Take locks on the children, in order + * 4. Execute the rename + */ - lock_mode = (req->rq_reqmsg->opc == MDS_REINT) ? 
LCK_PW : LCK_PW; - res_id[0] = de_tgtdir->d_inode->i_ino; - res_id[1] = de_tgtdir->d_inode->i_generation; + /* Step 1: Take locks on the parent(s), in order */ + p1_res_id.name[0] = de_srcdir->d_inode->i_ino; + p1_res_id.name[1] = de_srcdir->d_inode->i_generation; - rc = ldlm_lock_match(obd->obd_namespace, res_id, LDLM_PLAIN, - NULL, 0, lock_mode, &tgtlockh); - if (rc == 0) { - flags = 0; - LDLM_DEBUG_NOLOCK("enqueue res "LPU64, res_id[0]); - rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, NULL, - res_id, LDLM_PLAIN, NULL, 0, lock_mode, - &flags, ldlm_completion_ast, - mds_blocking_ast, NULL, 0, &tgtlockh); - if (rc != ELDLM_OK) { - CERROR("lock enqueue: err: %d\n", rc); - GOTO(out_rename_tgtput, rc = -EIO); - } - } else { - ldlm_lock_dump_handle(D_OTHER, &tgtlockh); - } + p2_res_id.name[0] = de_tgtdir->d_inode->i_ino; + p2_res_id.name[1] = de_tgtdir->d_inode->i_generation; -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) - double_lock(de_tgtdir, de_srcdir); -#endif + rc = enqueue_ordered_locks(LCK_EX, obd, &p1_res_id, &p2_res_id, + &(dlm_handles[0]), &(dlm_handles[1])); + if (rc != ELDLM_OK) + GOTO(out_put_tgtdir, rc); + + /* Step 2: Lookup the children */ de_old = lookup_one_len(rec->ur_name, de_srcdir, rec->ur_namelen - 1); if (IS_ERR(de_old)) { CERROR("old child lookup error (%*s): %ld\n", rec->ur_namelen - 1, rec->ur_name, PTR_ERR(de_old)); - GOTO(out_rename_tgtdir, rc = -ENOENT); + GOTO(out_step_2a, rc = PTR_ERR(de_old)); } + if (de_old->d_inode == NULL) + GOTO(out_step_2b, rc = -ENOENT); + de_new = lookup_one_len(rec->ur_tgt, de_tgtdir, rec->ur_tgtlen - 1); if (IS_ERR(de_new)) { CERROR("new child lookup error (%*s): %ld\n", rec->ur_tgtlen - 1, rec->ur_tgt, PTR_ERR(de_new)); - GOTO(out_rename_deold, rc = -ENOENT); + GOTO(out_step_2b, rc = PTR_ERR(de_new)); } - /* in intent case ship back attributes to client */ - if (offset) { - struct mds_body *body = lustre_msg_buf(req->rq_repmsg, 1); - struct inode *inode = de_new->d_inode; - - if (!inode) { - 
body->valid = 0; - } else { - mds_pack_inode2fid(&body->fid1, inode); - mds_pack_inode2body(body, inode); - if (S_ISREG(inode->i_mode)) - mds_pack_md(mds, req, 2, body, inode); - } + /* Step 3: Take locks on the children */ + c1_res_id.name[0] = de_old->d_inode->i_ino; + c1_res_id.name[1] = de_old->d_inode->i_generation; + if (de_new->d_inode == NULL) { + flags = LDLM_FL_LOCAL_ONLY; + rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, NULL, + c1_res_id, LDLM_PLAIN, NULL, 0, LCK_EX, + &flags, ldlm_completion_ast, + mds_blocking_ast, NULL, NULL, + &(dlm_handles[2])); + lock_count = 3; + } else { + c2_res_id.name[0] = de_new->d_inode->i_ino; + c2_res_id.name[1] = de_new->d_inode->i_generation; + rc = enqueue_ordered_locks(LCK_EX, obd, &c1_res_id, &c2_res_id, + &(dlm_handles[2]), + &(dlm_handles[3])); + lock_count = 4; } + if (rc != ELDLM_OK) + GOTO(out_step_3, rc); + /* Step 4: Execute the rename */ OBD_FAIL_WRITE(OBD_FAIL_MDS_REINT_RENAME_WRITE, to_kdev_t(de_srcdir->d_inode->i_sb->s_dev)); @@ -790,7 +771,7 @@ static int mds_reint_rename(struct mds_update_record *rec, int offset, if (IS_ERR(handle)) { rc = PTR_ERR(handle); mds_finish_transno(mds, handle, req, rc); - GOTO(out_rename_denew, rc); + GOTO(out_step_4, rc); } lock_kernel(); @@ -806,56 +787,30 @@ static int mds_reint_rename(struct mds_update_record *rec, int offset, if (!rc) rc = err; } - EXIT; -out_rename_denew: + EXIT; + out_step_4: + ldlm_lock_decref(&(dlm_handles[2]), LCK_EX); + if (lock_count == 4) + ldlm_lock_decref(&(dlm_handles[3]), LCK_EX); + out_step_3: l_dput(de_new); -out_rename_deold: - if (!rc) { - res_id[0] = de_old->d_inode->i_ino; - res_id[1] = de_old->d_inode->i_generation; - flags = 0; - /* Take an exclusive lock on the resource that we're - * about to free, to force everyone to drop their - * locks. 
*/ - LDLM_DEBUG_NOLOCK("getting EX lock res "LPU64, res_id[0]); - rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, NULL, - res_id, LDLM_PLAIN, NULL, 0, LCK_EX, - &flags, ldlm_completion_ast, - mds_blocking_ast, NULL, 0, &oldhandle); - if (rc) - CERROR("failed to get child inode lock (child ino " - LPD64" dir ino %lu)\n", - res_id[0], de_old->d_inode->i_ino); - } - + out_step_2b: l_dput(de_old); - - if (!rc) { - ldlm_lock_decref(&oldhandle, LCK_EX); - rc = ldlm_cli_cancel(&oldhandle); - if (rc < 0) - CERROR("failed to cancel child inode lock ino " - LPD64": %d\n", res_id[0], rc); - } -out_rename_tgtdir: -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) - double_up(&de_srcdir->d_inode->i_sem, &de_tgtdir->d_inode->i_sem); -#endif - ldlm_lock_decref(&tgtlockh, lock_mode); -out_rename_tgtput: + out_step_2a: + ldlm_lock_decref(&(dlm_handles[0]), LCK_EX); + ldlm_lock_decref(&(dlm_handles[1]), LCK_EX); + out_put_tgtdir: l_dput(de_tgtdir); -out_rename_srcdir: - ldlm_lock_decref(&srclockh, lock_mode); -out_rename_srcput: + out_put_srcdir: l_dput(de_srcdir); -out_rename: + out: req->rq_status = rc; return 0; } -typedef int (*mds_reinter) (struct mds_update_record *, int offset, - struct ptlrpc_request *); +typedef int (*mds_reinter)(struct mds_update_record *, int offset, + struct ptlrpc_request *, struct lustre_handle *); static mds_reinter reinters[REINT_MAX + 1] = { [REINT_SETATTR] mds_reint_setattr, @@ -863,16 +818,17 @@ static mds_reinter reinters[REINT_MAX + 1] = { [REINT_UNLINK] mds_reint_unlink, [REINT_LINK] mds_reint_link, [REINT_RENAME] mds_reint_rename, + [REINT_OPEN] mds_open }; int mds_reint_rec(struct mds_update_record *rec, int offset, - struct ptlrpc_request *req) + struct ptlrpc_request *req, struct lustre_handle *lockh) { struct mds_obd *mds = mds_req2mds(req); struct obd_run_ctxt saved; struct obd_ucred uc; - int realop = rec->ur_opcode & REINT_OPCODE_MASK; - int rc; + int realop = rec->ur_opcode & REINT_OPCODE_MASK, rc; + ENTRY; if (realop < 1 || 
realop > REINT_MAX) { CERROR("opcode %d not valid (%sREPLAYING)\n", realop, @@ -884,10 +840,11 @@ int mds_reint_rec(struct mds_update_record *rec, int offset, uc.ouc_fsuid = rec->ur_fsuid; uc.ouc_fsgid = rec->ur_fsgid; uc.ouc_cap = rec->ur_cap; + uc.ouc_suppgid = rec->ur_suppgid; push_ctxt(&saved, &mds->mds_ctxt, &uc); - rc = reinters[realop] (rec, offset, req); + rc = reinters[realop] (rec, offset, req, lockh); pop_ctxt(&saved, &mds->mds_ctxt, &uc); - return rc; + RETURN(rc); } diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index 61e9114..9512e2a 100644 --- a/lustre/obdclass/class_obd.c +++ b/lustre/obdclass/class_obd.c @@ -1,15 +1,27 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Object Devices Class Driver + * + * Copyright (C) 2001-2003 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. * - * This code is issued under the GNU General Public License. - * See the file COPYING in this distribution + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
* * These are the only exported functions, they provide some generic * infrastructure for managing object devices - * - * Object Devices Class Driver */ #define EXPORT_SYMTAB @@ -54,29 +66,92 @@ struct list_head obd_types; atomic_t obd_memory; int obd_memmax; +/* Root for /proc/lustre */ +struct proc_dir_entry *proc_lustre_root = NULL; + /* The following are visible and mutable through /proc/sys/lustre/. */ unsigned long obd_fail_loc; unsigned long obd_timeout = 100; char obd_recovery_upcall[128] = "/usr/lib/lustre/ha_assist"; +unsigned long obd_sync_filter; /* = 0, don't sync by default */ /* opening /dev/obd */ static int obd_class_open(struct inode * inode, struct file * file) { + struct obd_class_user_state *ocus; ENTRY; - file->private_data = NULL; + OBD_ALLOC (ocus, sizeof (*ocus)); + if (ocus == NULL) + return (-ENOMEM); + + INIT_LIST_HEAD (&ocus->ocus_conns); + ocus->ocus_current_obd = NULL; + file->private_data = ocus; + MOD_INC_USE_COUNT; RETURN(0); } +static int +obd_class_add_user_conn (struct obd_class_user_state *ocus, + struct lustre_handle *conn) +{ + struct obd_class_user_conn *c; + + /* NB holding obd_conf_sem */ + + OBD_ALLOC (c, sizeof (*c)); + if (c == NULL) /* check the allocation made above, not the caller's ocus */ + return (-ENOMEM); + + c->ocuc_conn = *conn; + list_add (&c->ocuc_chain, &ocus->ocus_conns); + return (0); +} + +static void +obd_class_remove_user_conn (struct obd_class_user_state *ocus, + struct lustre_handle *conn) +{ + struct list_head *e; + struct obd_class_user_conn *c; + + /* NB holding obd_conf_sem or last reference */ + + list_for_each (e, &ocus->ocus_conns) { + c = list_entry (e, struct obd_class_user_conn, ocuc_chain); + if (!memcmp (conn, &c->ocuc_conn, sizeof (*conn))) { + list_del (&c->ocuc_chain); + OBD_FREE (c, sizeof (*c)); + return; + } + } +} + /* closing /dev/obd */ static int obd_class_release(struct inode * inode, struct file * file) { + struct obd_class_user_state *ocus = file->private_data; + struct obd_class_user_conn *c; ENTRY; - // XXX drop lsm, connections 
here - if (file->private_data) - file->private_data = NULL; + while (!list_empty (&ocus->ocus_conns)) { + c = list_entry (ocus->ocus_conns.next, + struct obd_class_user_conn, ocuc_chain); + list_del (&c->ocuc_chain); + + CDEBUG (D_IOCTL, "Auto-disconnect %p\n", &c->ocuc_conn); + + down (&obd_conf_sem); + obd_disconnect (&c->ocuc_conn); + up (&obd_conf_sem); + + OBD_FREE (c, sizeof (*c)); + } + + OBD_FREE (ocus, sizeof (*ocus)); + MOD_DEC_USE_COUNT; RETURN(0); } @@ -124,7 +199,8 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp, { char *buf = NULL; struct obd_ioctl_data *data; - struct obd_device *obd = filp->private_data; + struct obd_class_user_state *ocus = filp->private_data; + struct obd_device *obd = ocus->ocus_current_obd; struct lustre_handle conn; int err = 0, len = 0, serialised = 0; ENTRY; @@ -133,6 +209,8 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp, case OBD_IOC_BRW_WRITE: case OBD_IOC_BRW_READ: case OBD_IOC_GETATTR: + case ECHO_IOC_ENQUEUE: + case ECHO_IOC_CANCEL: break; default: down(&obd_conf_sem); @@ -163,7 +241,7 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp, } CDEBUG(D_IOCTL, "device %d\n", data->ioc_dev); - filp->private_data = &obd_dev[data->ioc_dev]; + ocus->ocus_current_obd = &obd_dev[data->ioc_dev]; GOTO(out, err=0); } @@ -192,7 +270,7 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp, status = "-"; l = snprintf(buf2, remains, "%2d %s %s %s %s %d\n", i, status, obd->obd_type->typ_name, - obd->obd_name, obd->obd_uuid, obd->obd_type->typ_refcnt); + obd->obd_name, obd->obd_uuid.uuid, obd->obd_type->typ_refcnt); buf2 +=l; remains -=l; if (remains <= 0) { @@ -263,6 +341,7 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp, * currently selected device. 
*/ int dev; + struct obd_uuid uuid; if (!data->ioc_inllen1 || !data->ioc_inlbuf1) { CERROR("No UUID passed!\n"); @@ -274,7 +353,8 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp, } CDEBUG(D_IOCTL, "device name %s\n", data->ioc_inlbuf1); - dev = class_uuid2dev(data->ioc_inlbuf1); + obd_str2uuid(&uuid, data->ioc_inlbuf1); + dev = class_uuid2dev(&uuid); data->ioc_dev = dev; if (dev == -1) { CDEBUG(D_IOCTL, "No device for name %s!\n", @@ -294,11 +374,11 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp, int dev = -1; int i; - filp->private_data = NULL; + ocus->ocus_current_obd = NULL; for (i = 0 ; i < MAX_OBD_DEVICES ; i++) { struct obd_device *obd = &obd_dev[i]; if (!obd->obd_type) { - filp->private_data = obd; + ocus->ocus_current_obd = obd; dev = i; break; } @@ -359,6 +439,7 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp, OBD_ALLOC(obd->obd_name, len); if (!obd->obd_name) { class_put_type(obd->obd_type); + obd->obd_type = NULL; GOTO(out, err = -ENOMEM); } memcpy(obd->obd_name, data->ioc_inlbuf2, len); @@ -374,9 +455,10 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp, OBD_FREE(obd->obd_name, strlen(obd->obd_name) + 1); class_put_type(obd->obd_type); + obd->obd_type = NULL; GOTO(out, err=-EINVAL); } - memcpy(obd->obd_uuid, data->ioc_inlbuf3, len); + memcpy(obd->obd_uuid.uuid, data->ioc_inlbuf3, len); } /* do the attach */ if (OBP(obd, attach)) @@ -407,14 +489,6 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp, CERROR("OBD device %d not attached\n", obd->obd_minor); GOTO(out, err=-ENODEV); } - if (!list_empty(&obd->obd_exports)) { - if (!data->ioc_inlbuf1 || data->ioc_inlbuf1[0] != 'F') { - CERROR("OBD device %d (%p) has exports\n", - obd->obd_minor, obd); - GOTO(out, err=-EBUSY); - } - forcibly_detach_exports(obd); - } if (OBP(obd, detach)) err = OBP(obd,detach)(obd); @@ -460,41 +534,69 @@ static int obd_class_ioctl (struct inode * inode, struct file * 
filp, CERROR("Device %d not attached\n", obd->obd_minor); GOTO(out, err=-ENODEV); } - - if ( OBT(obd) && OBP(obd, cleanup) ) + if (!list_empty(&obd->obd_exports)) { + if (!data->ioc_inlbuf1 || data->ioc_inlbuf1[0] != 'F') { + CERROR("OBD device %d (%p) has exports\n", + obd->obd_minor, obd); + GOTO(out, err = -EBUSY); + } + forcibly_detach_exports(obd); + } + if (OBT(obd) && OBP(obd, cleanup)) err = obd_cleanup(obd); if (!err) { obd->obd_flags &= ~OBD_SET_UP; obd->obd_type->typ_refcnt--; } - GOTO(out, err); + GOTO(out, err); } case OBD_IOC_CONNECT: { - char * cluuid = "OBD_CLASS_UUID"; + struct obd_uuid cluuid = { "OBD_CLASS_UUID" }; obd_data2conn(&conn, data); - err = obd_connect(&conn, obd, cluuid, NULL, NULL); + err = obd_connect(&conn, obd, &cluuid, NULL, NULL); CDEBUG(D_IOCTL, "assigned export "LPX64"\n", conn.addr); obd_conn2data(data, &conn); if (err) GOTO(out, err); + err = obd_class_add_user_conn (ocus, &conn); + if (err != 0) { + obd_disconnect (&conn); + GOTO (out, err); + } + err = copy_to_user((void *)arg, data, sizeof(*data)); - if (err) - err = -EFAULT; - // XXX save connection data into file handle + if (err != 0) { + obd_class_remove_user_conn (ocus, &conn); + obd_disconnect (&conn); + GOTO (out, err=-EFAULT); + } GOTO(out, err); } case OBD_IOC_DISCONNECT: { obd_data2conn(&conn, data); + obd_class_remove_user_conn (ocus, &conn); err = obd_disconnect(&conn); GOTO(out, err); } + case OBD_IOC_NO_TRANSNO: { + if (!(obd->obd_flags & OBD_ATTACHED)) { + CERROR("Device %d not attached\n", obd->obd_minor); + GOTO(out, err=-ENODEV); + } + CDEBUG(D_IOCTL, + "disabling committed-transno notifications on %d\n", + obd->obd_minor); + obd->obd_flags |= OBD_NO_TRANSNO; + GOTO(out, err = 0); + } + default: obd_data2conn(&conn, data); @@ -607,7 +709,10 @@ EXPORT_SYMBOL(obd_memmax); EXPORT_SYMBOL(obd_fail_loc); EXPORT_SYMBOL(obd_timeout); EXPORT_SYMBOL(obd_recovery_upcall); +EXPORT_SYMBOL(obd_sync_filter); EXPORT_SYMBOL(ptlrpc_put_connection_superhack); 
+EXPORT_SYMBOL(ptlrpc_abort_inflight_superhack); +EXPORT_SYMBOL(proc_lustre_root); EXPORT_SYMBOL(class_register_type); EXPORT_SYMBOL(class_unregister_type); @@ -656,14 +761,19 @@ static int __init init_obdclass(void) obd_sysctl_init(); - err = lprocfs_reg_main(); - +#ifdef LPROCFS + proc_lustre_root = proc_mkdir("lustre", proc_root_fs); + if (!proc_lustre_root) + printk(KERN_ERR "error registering /proc/fs/lustre\n"); +#else + proc_lustre_root = NULL; +#endif return 0; } static void __exit cleanup_obdclass(void) { - int i, err; + int i; ENTRY; misc_deregister(&obd_psdev); @@ -679,7 +789,10 @@ static void __exit cleanup_obdclass(void) obd_cleanup_caches(); obd_sysctl_clean(); - err = lprocfs_dereg_main(); + if (proc_lustre_root) { + lprocfs_remove(proc_lustre_root); + proc_lustre_root = NULL; + } CERROR("obd mem max: %d leaked: %d\n", obd_memmax, atomic_read(&obd_memory)); @@ -689,8 +802,11 @@ static void __exit cleanup_obdclass(void) /* Check that we're building against the appropriate version of the Lustre * kernel patch */ #include -#if (LUSTRE_KERNEL_VERSION != 5) -# error Cannot continue: Your Lustre kernel patch is out of date +#define LUSTRE_SOURCE_VERSION 10 +#if (LUSTRE_KERNEL_VERSION < LUSTRE_SOURCE_VERSION) +# error Cannot continue: Your Lustre kernel patch is older than the sources +#elif (LUSTRE_KERNEL_VERSION > LUSTRE_SOURCE_VERSION) +# error Cannot continue: Your Lustre sources are older than the kernel patch #endif MODULE_AUTHOR("Cluster File Systems, Inc. "); diff --git a/lustre/obdclass/fsfilt_ext3.c b/lustre/obdclass/fsfilt_ext3.c index 3878315..5c52b43 100644 --- a/lustre/obdclass/fsfilt_ext3.c +++ b/lustre/obdclass/fsfilt_ext3.c @@ -23,6 +23,8 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ +#error "FIXME: this needs to be updated to match fsfilt_extN.c" + #define DEBUG_SUBSYSTEM S_FILTER #include @@ -269,6 +271,11 @@ static int fsfilt_ext3_statfs(struct super_block *sb, struct statfs *sfs) return rc; } +static int fsfilt_ext3_sync(struct super_block *sb) +{ + return ext3_force_commit(sb); +} + static struct fsfilt_operations fsfilt_ext3_ops = { fs_type: "ext3", fs_owner: THIS_MODULE, @@ -281,6 +288,7 @@ static struct fsfilt_operations fsfilt_ext3_ops = { fs_journal_data: fsfilt_ext3_journal_data, fs_set_last_rcvd: fsfilt_ext3_set_last_rcvd, fs_statfs: fsfilt_ext3_statfs, + fs_sync: fsfilt_ext3_sync, }; static int __init fsfilt_ext3_init(void) diff --git a/lustre/obdclass/fsfilt_extN.c b/lustre/obdclass/fsfilt_extN.c index 4302392..0984c66 100644 --- a/lustre/obdclass/fsfilt_extN.c +++ b/lustre/obdclass/fsfilt_extN.c @@ -4,7 +4,7 @@ * lustre/lib/fsfilt_extN.c * Lustre filesystem abstraction routines * - * Copyright (C) 2002 Cluster File Systems, Inc. + * Copyright (C) 2002, 2003 Cluster File Systems, Inc. * Author: Andreas Dilger * * This file is part of Lustre, http://www.lustre.org. @@ -124,6 +124,8 @@ static void *fsfilt_extN_start(struct inode *inode, int op) * objcount inode blocks * 1 superblock * 2 * EXTN_SINGLEDATA_TRANS_BLOCKS for the quota files + * + * 1 EXTN_DATA_TRANS_BLOCKS for the last_rcvd update. 
*/ static int fsfilt_extN_credits_needed(int objcount, struct fsfilt_objinfo *fso) { @@ -153,6 +155,9 @@ static int fsfilt_extN_credits_needed(int objcount, struct fsfilt_objinfo *fso) ngdblocks = EXTN_SB(sb)->s_gdb_count; needed += nbitmaps + ngdblocks; + + /* last_rcvd update */ + needed += EXTN_DATA_TRANS_BLOCKS; #ifdef CONFIG_QUOTA /* We assume that there will be 1 bit set in s_dquot.flags for each @@ -351,26 +356,55 @@ static int fsfilt_extN_get_md(struct inode *inode, void *lmm, int lmm_size) } static ssize_t fsfilt_extN_readpage(struct file *file, char *buf, size_t count, - loff_t *offset) + loff_t *off) { struct inode *inode = file->f_dentry->d_inode; int rc = 0; if (S_ISREG(inode->i_mode)) - rc = file->f_op->read(file, buf, count, offset); + rc = file->f_op->read(file, buf, count, off); else { - struct buffer_head *bh; - - /* FIXME: this assumes the blocksize == count, but the calling - * function will detect this as an error for now */ - bh = extN_bread(NULL, inode, - *offset >> inode->i_sb->s_blocksize_bits, - 0, &rc); - - if (bh) { - memcpy(buf, bh->b_data, inode->i_blksize); - brelse(bh); - rc = inode->i_blksize; + const int blkbits = inode->i_sb->s_blocksize_bits; + const int blksize = inode->i_sb->s_blocksize; + + CDEBUG(D_EXT2, "reading "LPSZ" at dir %lu+%llu\n", + count, inode->i_ino, *off); + while (count > 0) { + struct buffer_head *bh; + + bh = NULL; + if (*off < inode->i_size) { + int err = 0; + + bh = extN_bread(NULL, inode, *off >> blkbits, + 0, &err); + + CDEBUG(D_EXT2, "read %u@%llu\n", blksize, *off); + + if (bh) { + memcpy(buf, bh->b_data, blksize); + brelse(bh); + } else if (err) { + /* XXX in theory we should just fake + * this buffer and continue like ext3, + * especially if this is a partial read + */ + CERROR("error read dir %lu+%llu: %d\n", + inode->i_ino, *off, err); + RETURN(err); + } + } + if (!bh) { + struct extN_dir_entry_2 *fake = (void *)buf; + + CDEBUG(D_EXT2, "fake %u@%llu\n", blksize, *off); + memset(fake, 0, 
sizeof(*fake)); + fake->rec_len = cpu_to_le16(blksize); /* on-disk rec_len is 16-bit in ext2/ext3 dirents */ + } + count -= blksize; + buf += blksize; + *off += blksize; + rc += blksize; } } @@ -390,7 +424,6 @@ static void fsfilt_extN_cb_func(struct journal_callback *jcb, int error) static int fsfilt_extN_set_last_rcvd(struct obd_device *obd, __u64 last_rcvd, void *handle, fsfilt_cb_t cb_func) { -#ifdef HAVE_JOURNAL_CALLBACK_STATUS struct fsfilt_cb_data *fcb; fcb = kmem_cache_alloc(fcb_cache, GFP_NOFS); @@ -408,17 +441,6 @@ static int fsfilt_extN_set_last_rcvd(struct obd_device *obd, __u64 last_rcvd, journal_callback_set(handle, fsfilt_extN_cb_func, (struct journal_callback *)fcb); unlock_kernel(); -#else -#warning "no journal callback kernel patch, faking it..." - static long next = 0; - - if (time_after(jiffies, next)) { - CERROR("no journal callback kernel patch, faking it...\n"); - next = jiffies + 300 * HZ; - } - - cb_func(obd, last_rcvd, 0); -#endif return 0; } @@ -451,6 +473,11 @@ static int fsfilt_extN_statfs(struct super_block *sb, struct obd_statfs *osfs) return rc; } +static int fsfilt_extN_sync(struct super_block *sb) +{ + return extN_force_commit(sb); +} + static struct fsfilt_operations fsfilt_extN_ops = { fs_type: "extN", fs_owner: THIS_MODULE, @@ -464,6 +491,7 @@ static struct fsfilt_operations fsfilt_extN_ops = { fs_journal_data: fsfilt_extN_journal_data, fs_set_last_rcvd: fsfilt_extN_set_last_rcvd, fs_statfs: fsfilt_extN_statfs, + fs_sync: fsfilt_extN_sync, }; static int __init fsfilt_extN_init(void) diff --git a/lustre/obdclass/fsfilt_reiserfs.c b/lustre/obdclass/fsfilt_reiserfs.c index 1ec5916..f8d4ac3 100644 --- a/lustre/obdclass/fsfilt_reiserfs.c +++ b/lustre/obdclass/fsfilt_reiserfs.c @@ -160,6 +160,12 @@ static int fsfilt_reiserfs_statfs(struct super_block *sb, struct obd_statfs *osf return rc; } +static int fsfilt_reiserfs_sync(struct super_block *sb) +{ + CERROR("not implemented yet\n"); + return -ENOSYS; +} + static struct fsfilt_operations fsfilt_reiserfs_ops = { fs_type: "reiserfs",
fs_owner: THIS_MODULE, @@ -173,6 +179,7 @@ static struct fsfilt_operations fsfilt_reiserfs_ops = { fs_journal_data: fsfilt_reiserfs_journal_data, fs_set_last_rcvd: fsfilt_reiserfs_set_last_rcvd, fs_statfs: fsfilt_reiserfs_statfs, + fs_sync: fsfilt_reiserfs_sync, }; static int __init fsfilt_reiserfs_init(void) diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c index 994949e..e5be2bc 100644 --- a/lustre/obdclass/genops.c +++ b/lustre/obdclass/genops.c @@ -1,7 +1,7 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (c) 2001, 2002 Cluster File Systems, Inc. + * Copyright (c) 2001-2003 Cluster File Systems, Inc. * * This file is part of Lustre, http://www.lustre.org. * @@ -36,6 +36,8 @@ kmem_cache_t *import_cachep = NULL; kmem_cache_t *export_cachep = NULL; int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c); +void (*ptlrpc_abort_inflight_superhack)(struct obd_import *imp, + int dying_import); /* * support functions: we could use inter-module communication, but this @@ -87,7 +89,7 @@ int class_register_type(struct obd_ops *ops, struct lprocfs_vars *vars, char *name) { struct obd_type *type; - int rc; + int rc = 0; ENTRY; LASSERT(strnlen(name, 1024) < 1024); /* sanity check */ @@ -111,10 +113,13 @@ int class_register_type(struct obd_ops *ops, struct lprocfs_vars *vars, strcpy(type->typ_name, name); list_add(&type->typ_chain, &obd_types); - rc = lprocfs_reg_class(type, vars, type); - if (rc != 0) { + type->typ_procroot = lprocfs_register(type->typ_name, proc_lustre_root, + vars, type); + if (IS_ERR(type->typ_procroot)) { + rc = PTR_ERR(type->typ_procroot); + type->typ_procroot = NULL; list_del(&type->typ_chain); - GOTO(failed, rc); + GOTO (failed, rc); } RETURN (0); @@ -144,8 +149,11 @@ int class_unregister_type(char *name) OBD_FREE(type->typ_ops, sizeof(*type->typ_ops)); RETURN(-EBUSY); } - if(type->typ_procroot) - lprocfs_dereg_class(type); + + if (type->typ_procroot) { + 
lprocfs_remove(type->typ_procroot); + type->typ_procroot = NULL; + } list_del(&type->typ_chain); OBD_FREE(type->typ_name, strlen(name) + 1); @@ -174,14 +182,14 @@ int class_name2dev(char *name) return res; } -int class_uuid2dev(char *uuid) +int class_uuid2dev(struct obd_uuid *uuid) { int res = -1; int i; for (i = 0; i < MAX_OBD_DEVICES; i++) { struct obd_device *obd = &obd_dev[i]; - if (strncmp(uuid, obd->obd_uuid, sizeof(obd->obd_uuid)) == 0) { + if (strncmp(uuid->uuid, obd->obd_uuid.uuid, sizeof(obd->obd_uuid.uuid)) == 0) { res = i; return res; } @@ -191,13 +199,13 @@ int class_uuid2dev(char *uuid) } -struct obd_device *class_uuid2obd(char *uuid) +struct obd_device *class_uuid2obd(struct obd_uuid *uuid) { int i; for (i = 0; i < MAX_OBD_DEVICES; i++) { struct obd_device *obd = &obd_dev[i]; - if (strncmp(uuid, obd->obd_uuid, sizeof(obd->obd_uuid)) == 0) + if (strncmp(uuid->uuid, obd->obd_uuid.uuid, sizeof(obd->obd_uuid.uuid)) == 0) return obd; } @@ -353,6 +361,12 @@ void class_destroy_export(struct obd_export *exp) ptlrpc_put_connection_superhack(exp->exp_connection); } + /* Abort any inflight DLM requests and NULL out their (about to be + * freed) import. */ + if (exp->exp_ldlm_data.led_import.imp_obd) + ptlrpc_abort_inflight_superhack(&exp->exp_ldlm_data.led_import, + 1); + exp->exp_cookie = DEAD_HANDLE_MAGIC; kmem_cache_free(export_cachep, exp); @@ -362,7 +376,7 @@ void class_destroy_export(struct obd_export *exp) /* a connection defines an export context in which preallocation can be managed. 
*/ int class_connect(struct lustre_handle *conn, struct obd_device *obd, - obd_uuid_t cluuid) + struct obd_uuid *cluuid) { struct obd_export * export; if (conn == NULL) { @@ -375,12 +389,18 @@ int class_connect(struct lustre_handle *conn, struct obd_device *obd, return -EINVAL; } + if (cluuid == NULL) { + LBUG(); + return -EINVAL; + } + export = class_new_export(obd); if (!export) return -ENOMEM; conn->addr = (__u64) (unsigned long)export; conn->cookie = export->exp_cookie; + memcpy(&export->exp_client_uuid, cluuid, sizeof(export->exp_client_uuid)); CDEBUG(D_IOCTL, "connect: addr %Lx cookie %Lx\n", (long long)conn->addr, (long long)conn->cookie); @@ -427,7 +447,7 @@ void class_disconnect_all(struct obd_device *obddev) CERROR("force disconnecting %s:%s export %p\n", export->exp_obd->obd_type->typ_name, export->exp_connection ? - (char *)export->exp_connection->c_remote_uuid : + (char *)export->exp_connection->c_remote_uuid.uuid : "", export); rc = obd_disconnect(&conn); if (rc < 0) { diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c index f096772..d4be2d6 100644 --- a/lustre/obdclass/lprocfs_status.c +++ b/lustre/obdclass/lprocfs_status.c @@ -1,7 +1,8 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (C) 2002 Cluster File Systems, Inc. + * Copyright (C) 2002, 2003 Cluster File Systems, Inc. + * Author: Hariharan Thantry * * This file is part of Lustre, http://www.lustre.org. * @@ -17,68 +18,95 @@ * You should have received a copy of the GNU General Public License * along with Lustre; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Author: Hariharan Thantry thantry@users.sourceforge.net */ + #define EXPORT_SYMTAB #include #include #include -#include #include #include #define DEBUG_SUBSYSTEM S_CLASS -#include +#include #include -#ifdef LPROC_SNMP - -#define DEFAULT_MODE 0444 -/* - * Tokenizer array. 
Change this array to include special - * characters for string tokenizing - */ -const char tok[] = {'/', '\0'}; - -/* - * Externs - */ -extern struct proc_dir_entry proc_root; /* Defined in proc/root.c */ +#ifdef LPROCFS -/* - * Globals - */ -struct proc_dir_entry *proc_lustre_root; -struct proc_dir_entry *proc_lustre_dev_root; -struct proc_dir_entry *proc_lustre_fs_root; - -struct proc_dir_entry* lprocfs_mkdir(const char* dname, - struct proc_dir_entry *parent) -{ - struct proc_dir_entry *child_dir_entry; - child_dir_entry = proc_mkdir(dname, parent); - if (!child_dir_entry) - CERROR("lustre: failed to create /proc entry %s\n", dname); - return child_dir_entry; -} - -struct proc_dir_entry* lprocfs_srch(struct proc_dir_entry* head, - const char* name) +struct proc_dir_entry *lprocfs_srch(struct proc_dir_entry *head, + const char *name) { struct proc_dir_entry* temp; + if (!head) return NULL; + temp = head->subdir; while (temp != NULL) { if (!strcmp(temp->name, name)) return temp; + temp = temp->next; } return NULL; } -void lprocfs_remove_all(struct proc_dir_entry* root) +/* lprocfs API calls */ + +int lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *list, + void *data) +{ + if ((root == NULL) || (list == NULL)) + return -EINVAL; + + while (list->name) { + struct proc_dir_entry *cur_root, *proc; + char *pathcopy, *cur, *next; + int pathsize = strlen(list->name)+1; + + proc = NULL; + cur_root = root; + + /* need copy of path for strsep */ + OBD_ALLOC(pathcopy, pathsize); + if (!pathcopy) + return -ENOMEM; + + next = pathcopy; + strcpy(pathcopy, list->name); + + while (cur_root && (cur = strsep(&next, "/"))) { + if (*cur =='\0') /* skip double/trailing "/" */ + continue; + + proc = lprocfs_srch(cur_root, cur); + CDEBUG(D_OTHER, "cur_root=%s, cur=%s, next=%s, (%s)\n", + cur_root->name, cur, next, + (proc ? "exists" : "new")); + if (next) + cur_root = (proc ? 
proc : + proc_mkdir(cur, cur_root)); + else if (!proc) + proc = create_proc_entry(cur, 0444, cur_root); + } + + OBD_FREE(pathcopy, pathsize); + + if ((cur_root==NULL) || (proc==NULL)) { + CERROR("LprocFS: No memory to create /proc entry %s", + list->name); + return -ENOMEM; + } + + proc->read_proc = list->read_fptr; + proc->write_proc = list->write_fptr; + proc->data = (list->data ? list->data : data); + list++; + } + return 0; +} + +void lprocfs_remove(struct proc_dir_entry* root) { struct proc_dir_entry *temp = root; struct proc_dir_entry *rm_entry; @@ -96,235 +124,179 @@ void lprocfs_remove_all(struct proc_dir_entry* root) } } -#define MAX_STRING_SIZE 100 -struct proc_dir_entry* lprocfs_new_dir(struct proc_dir_entry* root, - const char* string, const char* tok) +struct proc_dir_entry *lprocfs_register(const char *name, + struct proc_dir_entry *parent, + struct lprocfs_vars *list, void *data) { - struct proc_dir_entry* new_root; - struct proc_dir_entry* temp_entry; - char temp_string[MAX_STRING_SIZE+1]; - char* my_str; - char* mover_str; - - strncpy(temp_string, string, MAX_STRING_SIZE); - temp_string[MAX_STRING_SIZE] = '\0'; - - new_root = root; - mover_str = temp_string; - while ((my_str = strsep(&mover_str, tok))) { - if (!*my_str) - continue; - CDEBUG(D_OTHER, "SEARCH= %s\t, ROOT=%s\n", my_str, - new_root->name); - temp_entry = lprocfs_srch(new_root, my_str); - if (temp_entry == NULL) { - CDEBUG(D_OTHER, "Adding: %s\n", my_str); - temp_entry = lprocfs_mkdir(my_str, new_root); - if (temp_entry == NULL) { - CDEBUG(D_OTHER, - "! 
Did not create new dir %s !!\n", - my_str); - return temp_entry; - } + struct proc_dir_entry *newchild; + + newchild = lprocfs_srch(parent, name); + if (newchild) { + CERROR(" Lproc: Attempting to register %s more than once \n", + name); + return NULL; + } + + newchild = proc_mkdir(name, parent); + if (newchild && list) { + int rc = lprocfs_add_vars(newchild, list, data); + if (rc) { + lprocfs_remove(newchild); + return ERR_PTR(rc); } - new_root = temp_entry; } - return new_root; + return newchild; } -int lprocfs_new_vars(struct proc_dir_entry* root, struct lprocfs_vars* list, - const char* tok, void* data) -{ - struct proc_dir_entry *temp_root; - struct proc_dir_entry *new_leaf; - struct proc_dir_entry *new_parent; - char temp_string[MAX_STRING_SIZE+1]; - - if (list == NULL) - return 0; +/* Generic callbacks */ - while (list->name) { - temp_root = lprocfs_new_dir(root, list->name, tok); - if (temp_root == NULL) { - CDEBUG(D_OTHER, "!LProcFS: Mods: No root!"); - return -ENOMEM; - } +int lprocfs_rd_u64(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + *eof = 1; + return snprintf(page, count, LPU64"\n", *(__u64 *)data); +} - /* Convert the last element into a leaf-node */ - strncpy(temp_string, temp_root->name, MAX_STRING_SIZE); - temp_string[MAX_STRING_SIZE] = '\0'; - new_parent = temp_root->parent; - remove_proc_entry(temp_root->name, new_parent); - new_leaf = create_proc_entry(temp_string, DEFAULT_MODE, - new_parent); - if (new_leaf == NULL) { - CERROR("LprocFS: No memory to create /proc entry %s", - temp_string); - return -ENOMEM; - } - new_leaf->read_proc = list->read_fptr; - new_leaf->write_proc = list->write_fptr; - if (data) - new_leaf->data=data; - else - new_leaf->data=list->data; - list++; - } - return 0; +int lprocfs_rd_uuid(char* page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device* dev = (struct obd_device*)data; + *eof = 1; + return snprintf(page, count, "%s\n", dev->obd_uuid.uuid); } 
-#undef MAX_STRING_SIZE -/* - * API implementations - */ -int lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *var, - void *data) + +int lprocfs_rd_name(char *page, char **start, off_t off, int count, + int *eof, void *data) { - return lprocfs_new_vars(root, var, tok, data); + struct obd_device* dev = (struct obd_device *)data; + + *eof = 1; + return snprintf(page, count, "%s\n", dev->obd_name); } -int lprocfs_reg_obd(struct obd_device *device, struct lprocfs_vars *list, - void *data) +int lprocfs_rd_blksize(char* page, char **start, off_t off, int count, + int *eof, struct statfs *sfs) { - struct proc_dir_entry* this_dev_root; - int retval; + *eof = 1; - if (lprocfs_srch(device->obd_type->typ_procroot, device->obd_name)) { - CDEBUG(D_OTHER, "Device with name [%s] exists!", - device->obd_name); - return 0; - } + return snprintf(page, count, "%lu\n", sfs->f_bsize); +} - /* Obtain this device root */ - this_dev_root = lprocfs_mkdir(device->obd_name, - device->obd_type->typ_procroot); +int lprocfs_rd_kbytestotal(char* page, char **start, off_t off, int count, + int *eof, struct statfs *sfs) +{ + __u32 blk_size = sfs->f_bsize >> 10; + __u64 result = sfs->f_blocks; - device->obd_proc_entry = this_dev_root; - retval = lprocfs_add_vars(this_dev_root, list, data); + while (blk_size >>= 1) + result <<= 1; - return retval; + *eof = 1; + return snprintf(page, count, LPU64"\n", result); } -int lprocfs_dereg_obd(struct obd_device* device) +int lprocfs_rd_kbytesfree(char* page, char **start, off_t off, int count, + int *eof, struct statfs *sfs) { - CDEBUG(D_OTHER, "LPROCFS removing device = %s\n", device->obd_name); + __u32 blk_size = sfs->f_bsize >> 10; + __u64 result = sfs->f_bfree; - if (device == NULL) { - CDEBUG(D_OTHER, "! LProcfs: Null pointer !\n"); - return 0; - } - if (device->obd_proc_entry == NULL) { - CDEBUG(D_OTHER, "! 
Proc entry non-existent !"); - return 0; - } - lprocfs_remove_all(device->obd_proc_entry); - device->obd_proc_entry = NULL; - if (device->counters) - OBD_FREE(device->counters, device->cntr_mem_size); + while (blk_size >>= 1) + result <<= 1; - return 0; + *eof = 1; + return snprintf(page, count, LPU64"\n", result); } -struct proc_dir_entry* lprocfs_reg_mnt(char* mnt_name) +int lprocfs_rd_filestotal(char* page, char **start, off_t off, int count, + int *eof, struct statfs *sfs) { - if (lprocfs_srch(proc_lustre_fs_root, mnt_name)) { - CDEBUG(D_OTHER, "Mount with same name exists!"); - return 0; - } - return lprocfs_mkdir(mnt_name, proc_lustre_fs_root); + *eof = 1; + return snprintf(page, count, "%ld\n", sfs->f_files); } -int lprocfs_dereg_mnt(struct proc_dir_entry* root) +int lprocfs_rd_filesfree(char* page, char **start, off_t off, int count, + int *eof, struct statfs *sfs) { - if (root == NULL) { - CDEBUG(D_OTHER, "Non-existent root!"); - return 0; - } - lprocfs_remove_all(root); - return 0; + *eof = 1; + return snprintf(page, count, "%ld\n", sfs->f_ffree); } -int lprocfs_reg_class(struct obd_type* type, struct lprocfs_vars* list, - void* data) +int lprocfs_rd_filegroups(char* page, char **start, off_t off, int count, + int *eof, struct statfs *sfs) { - struct proc_dir_entry* root; - int retval; - root = lprocfs_mkdir(type->typ_name, proc_lustre_dev_root); - lprocfs_add_vars(root, list, data); - type->typ_procroot = root; - retval = lprocfs_add_vars(root, list, data); - return retval; + *eof = 1; + return snprintf(page, count, "unimplemented\n"); } -int lprocfs_dereg_class(struct obd_type* class) +int lprocfs_rd_server_uuid(char* page, char **start, off_t off, int count, + int *eof, void *data) { - if (class == NULL) { - CDEBUG(D_OTHER, "Non-existent class"); - return 0; - } - lprocfs_remove_all(class->typ_procroot); - class->typ_procroot = NULL; - CDEBUG(D_OTHER, "LPROCFS removed = %s\n", class->typ_name); - return 0; - + struct obd_device* obd = (struct 
obd_device*)data; + struct client_obd* cli = &obd->u.cli; + return snprintf(page, count, "%s\n", cli->cl_target_uuid.uuid); } -int lprocfs_reg_main() +int lprocfs_rd_conn_uuid(char *page, char **start, off_t off, int count, + int *eof, void *data) { - proc_lustre_root = lprocfs_mkdir("lustre", &proc_root); - if (proc_lustre_root == NULL) { - CERROR(" !! Cannot create /proc/lustre !! \n"); - return -EINVAL; - } + struct obd_device *obd = (struct obd_device*)data; + struct ptlrpc_connection *conn = obd->u.cli.cl_import.imp_connection; - proc_lustre_dev_root = lprocfs_mkdir("devices", proc_lustre_root); - if (proc_lustre_dev_root == NULL) { - CERROR(" !! Cannot create /proc/lustre/devices !! \n"); - return -EINVAL; - } - proc_lustre_fs_root = lprocfs_mkdir("mnt_pnt", proc_lustre_root); + *eof = 1; + return snprintf(page, count, "%s\n", conn->c_remote_uuid.uuid); +} - if (proc_lustre_fs_root == NULL) { - CERROR(" !! Cannot create /proc/lustre/mnt_pnt !! \n"); - return -EINVAL; - } +int lprocfs_rd_numrefs(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_type* class = (struct obd_type*) data; - return 0; + *eof = 1; + return snprintf(page, count, "%d\n", class->typ_refcnt); } -int lprocfs_dereg_main() +int lprocfs_obd_attach(struct obd_device *dev, struct lprocfs_vars *list) { - lprocfs_remove_all(proc_lustre_root); - proc_lustre_root = NULL; - proc_lustre_dev_root = NULL; - proc_lustre_fs_root = NULL; - return 0; + int rc = 0; + dev->obd_proc_entry = lprocfs_register(dev->obd_name, + dev->obd_type->typ_procroot, + list, dev); + if (IS_ERR(dev->obd_proc_entry)) { + rc = PTR_ERR(dev->obd_proc_entry); + dev->obd_proc_entry = NULL; + } + return rc; } - -/* - * Needs to go... 
- */ -int lprocfs_ll_rd(char *page, char **start, off_t off, - int count, int *eof, void *data) +int lprocfs_obd_detach(struct obd_device *dev) { - __u64 *temp = (__u64 *)data; - int len; - len = snprintf(page, count, LPU64"\n", *temp); - return len; + if (dev && dev->obd_proc_entry) { + lprocfs_remove(dev->obd_proc_entry); + dev->obd_proc_entry = NULL; + } + return 0; } -#endif /* LPROC_SNMP */ +#endif /* LPROCFS*/ -EXPORT_SYMBOL(lprocfs_reg_obd); -EXPORT_SYMBOL(lprocfs_dereg_obd); -EXPORT_SYMBOL(lprocfs_reg_main); -EXPORT_SYMBOL(lprocfs_dereg_main); -EXPORT_SYMBOL(lprocfs_reg_mnt); -EXPORT_SYMBOL(lprocfs_dereg_mnt); +EXPORT_SYMBOL(lprocfs_register); +EXPORT_SYMBOL(lprocfs_remove); EXPORT_SYMBOL(lprocfs_add_vars); -EXPORT_SYMBOL(lprocfs_reg_class); -EXPORT_SYMBOL(lprocfs_dereg_class); -EXPORT_SYMBOL(lprocfs_ll_rd); - - +EXPORT_SYMBOL(lprocfs_obd_attach); +EXPORT_SYMBOL(lprocfs_obd_detach); + +EXPORT_SYMBOL(lprocfs_rd_u64); +EXPORT_SYMBOL(lprocfs_rd_uuid); +EXPORT_SYMBOL(lprocfs_rd_name); +EXPORT_SYMBOL(lprocfs_rd_server_uuid); +EXPORT_SYMBOL(lprocfs_rd_conn_uuid); +EXPORT_SYMBOL(lprocfs_rd_numrefs); + +EXPORT_SYMBOL(lprocfs_rd_blksize); +EXPORT_SYMBOL(lprocfs_rd_kbytestotal); +EXPORT_SYMBOL(lprocfs_rd_kbytesfree); +EXPORT_SYMBOL(lprocfs_rd_filestotal); +EXPORT_SYMBOL(lprocfs_rd_filesfree); +EXPORT_SYMBOL(lprocfs_rd_filegroups); diff --git a/lustre/obdclass/statfs_pack.c b/lustre/obdclass/statfs_pack.c index 876d41c..4efffa5 100644 --- a/lustre/obdclass/statfs_pack.c +++ b/lustre/obdclass/statfs_pack.c @@ -1,7 +1,8 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (C) 2002 Cluster File Systems, Inc. + * Copyright (C) 2002, 2003 Cluster File Systems, Inc. + * Author: Andreas Dilger * * This file is part of Lustre, http://www.lustre.org. 
* @@ -27,6 +28,7 @@ #define EXPORT_SYMTAB #include #include +#include void obd_statfs_pack(struct obd_statfs *tgt, struct obd_statfs *src) { @@ -69,7 +71,35 @@ void statfs_unpack(struct statfs *sfs, struct obd_statfs *osfs) sfs->f_namelen = osfs->os_namelen; } +int obd_self_statfs(struct obd_device *obd, struct statfs *sfs) +{ + struct lustre_handle conn; + struct obd_export *export, *my_export = NULL; + struct obd_statfs osfs = { 0 }; + int rc; + ENTRY; + + if (list_empty(&obd->obd_exports)) { + export = my_export = class_new_export(obd); + if (export == NULL) + RETURN(-ENOMEM); + } else + export = list_entry(obd->obd_exports.next, typeof(*export), + exp_obd_chain); + conn.addr = (unsigned long)export; + conn.cookie = export->exp_cookie; + + rc = obd_statfs(&conn, &osfs); + if (!rc) + statfs_unpack(sfs, &osfs); + + if (my_export) + class_destroy_export(my_export); + RETURN(rc); +} + EXPORT_SYMBOL(obd_statfs_pack); EXPORT_SYMBOL(obd_statfs_unpack); EXPORT_SYMBOL(statfs_pack); EXPORT_SYMBOL(statfs_unpack); +EXPORT_SYMBOL(obd_self_statfs); diff --git a/lustre/obdclass/sysctl.c b/lustre/obdclass/sysctl.c index 8e74aab..d1388d6 100644 --- a/lustre/obdclass/sysctl.c +++ b/lustre/obdclass/sysctl.c @@ -60,6 +60,8 @@ static int obd_sctl_reset( ctl_table * table, int write, struct file #define OBD_TIMEOUT 6 /* RPC timeout before recovery/intr */ /* XXX move to /proc/sys/lustre/recovery? */ #define OBD_UPCALL 7 /* path to recovery upcall */ +/* XXX temporary, as we play with sync osts.. */ +#define OBD_SYNCFILTER 8 #define OBD_VARS_SLOT 2 @@ -72,6 +74,8 @@ static ctl_table obd_table[] = { /* XXX need to lock so we avoid update races with the recovery upcall! 
*/ {OBD_UPCALL, "recovery_upcall", obd_recovery_upcall, 128, 0644, NULL, &proc_dostring, &sysctl_string }, + {OBD_SYNCFILTER, "filter_sync_on_commit", &obd_sync_filter, sizeof(int), + 0644, NULL, &proc_dointvec}, { 0 } }; diff --git a/lustre/obdclass/uuid.c b/lustre/obdclass/uuid.c index 7048baa..0e279fb 100644 --- a/lustre/obdclass/uuid.c +++ b/lustre/obdclass/uuid.c @@ -86,7 +86,7 @@ static void uuid_pack(struct uuid *uu, class_uuid_t ptr) memcpy(out+10, uu->node, 6); } -int class_uuid_parse(obd_uuid_t in, class_uuid_t uu) +int class_uuid_parse(struct obd_uuid in, class_uuid_t uu) { struct uuid uuid; int i; @@ -122,12 +122,12 @@ int class_uuid_parse(obd_uuid_t in, class_uuid_t uu) } #endif -void class_uuid_unparse(class_uuid_t uu, obd_uuid_t out) +void class_uuid_unparse(class_uuid_t uu, struct obd_uuid *out) { struct uuid uuid; uuid_unpack(uu, &uuid); - sprintf(out, + sprintf(out->uuid, "%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x", uuid.time_low, uuid.time_mid, uuid.time_hi_and_version, uuid.clock_seq >> 8, uuid.clock_seq & 0xFF, diff --git a/lustre/obdecho/echo.c b/lustre/obdecho/echo.c index 8339327..281166e 100644 --- a/lustre/obdecho/echo.c +++ b/lustre/obdecho/echo.c @@ -1,7 +1,7 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (c) 2001, 2002 Cluster File Systems, Inc. + * Copyright (c) 2001-2003 Cluster File Systems, Inc. * Author: Peter Braam * Author: Andreas Dilger * @@ -21,8 +21,6 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ -#define OBDECHO_VERSION "1.0" - #define EXPORT_SYMTAB #include @@ -48,70 +46,160 @@ #include #include -static atomic_t echo_page_rws; -static atomic_t echo_getattrs; - -#define ECHO_PROC_STAT "sys/obdecho" -#define ECHO_INIT_OBJID 0x1000000000000000ULL +#define ECHO_INIT_OBJID 0x1000000000000000ULL +#define ECHO_HANDLE_MAGIC 0xabcd0123fedc9876ULL + +#define ECHO_OBJECT0_NPAGES 16 +static struct page *echo_object0_pages[ECHO_OBJECT0_NPAGES]; + +/* should be generic per-obd stats... */ +struct xprocfs_io_stat { + __u64 st_read_bytes; + __u64 st_read_reqs; + __u64 st_write_bytes; + __u64 st_write_reqs; + __u64 st_getattr_reqs; + __u64 st_setattr_reqs; + __u64 st_create_reqs; + __u64 st_destroy_reqs; + __u64 st_statfs_reqs; + __u64 st_open_reqs; + __u64 st_close_reqs; + __u64 st_punch_reqs; +}; -extern struct lprocfs_vars status_var_nm_1[]; -extern struct lprocfs_vars status_class_var[]; +static struct xprocfs_io_stat xprocfs_iostats[NR_CPUS]; +static struct proc_dir_entry *xprocfs_dir; + +#define XPROCFS_BUMP_MYCPU_IOSTAT(field, count) \ +do { \ + xprocfs_iostats[smp_processor_id()].field += (count); \ +} while (0) + +#define DECLARE_XPROCFS_SUM_STAT(field) \ +static long long \ +xprocfs_sum_##field (void) \ +{ \ + long long stat = 0; \ + int i; \ + \ + for (i = 0; i < smp_num_cpus; i++) \ + stat += xprocfs_iostats[i].field; \ + return (stat); \ +} -int echo_proc_read(char *page, char **start, off_t off, int count, int *eof, - void *data) +DECLARE_XPROCFS_SUM_STAT (st_read_bytes) +DECLARE_XPROCFS_SUM_STAT (st_read_reqs) +DECLARE_XPROCFS_SUM_STAT (st_write_bytes) +DECLARE_XPROCFS_SUM_STAT (st_write_reqs) +DECLARE_XPROCFS_SUM_STAT (st_getattr_reqs) +DECLARE_XPROCFS_SUM_STAT (st_setattr_reqs) +DECLARE_XPROCFS_SUM_STAT (st_create_reqs) +DECLARE_XPROCFS_SUM_STAT (st_destroy_reqs) +DECLARE_XPROCFS_SUM_STAT (st_statfs_reqs) +DECLARE_XPROCFS_SUM_STAT (st_open_reqs) +DECLARE_XPROCFS_SUM_STAT (st_close_reqs) +DECLARE_XPROCFS_SUM_STAT (st_punch_reqs) + +static int 
+xprocfs_rd_stat (char *page, char **start, off_t off, int count, + int *eof, void *data) { - long long attrs = atomic_read(&echo_getattrs); - long long pages = atomic_read(&echo_page_rws); - int len; - + long long (*fn)(void) = (long long(*)(void))data; + int len; + *eof = 1; if (off != 0) return (0); - len = sprintf(page, "%Ld %Ld\n", attrs, pages); - + len = snprintf (page, count, "%Ld\n", fn()); *start = page; return (len); } + -int echo_proc_write(struct file *file, const char *ubuffer, - unsigned long count, void *data) +static void +xprocfs_add_stat(char *name, long long (*fn)(void)) { - /* Ignore what we've been asked to write, and just zero the counters */ - atomic_set (&echo_page_rws, 0); - atomic_set (&echo_getattrs, 0); + struct proc_dir_entry *entry; + + entry = create_proc_entry (name, S_IFREG|S_IRUGO, xprocfs_dir); + if (entry == NULL) { + CERROR ("Can't add procfs stat %s\n", name); + return; + } - return (count); + entry->data = fn; + entry->read_proc = xprocfs_rd_stat; + entry->write_proc = NULL; } -void echo_proc_init(void) +static void +xprocfs_init (char *name) { - struct proc_dir_entry *entry; + char dirname[64]; + + snprintf (dirname, sizeof (dirname), "sys/%s", name); - entry = create_proc_entry(ECHO_PROC_STAT, S_IFREG|S_IRUGO|S_IWUSR,NULL); - - if (entry == NULL) { - CERROR("couldn't create proc entry %s\n", ECHO_PROC_STAT); + xprocfs_dir = proc_mkdir (dirname, NULL); + if (xprocfs_dir == NULL) { + CERROR ("Can't make dir\n"); return; } - entry->data = NULL; - entry->read_proc = echo_proc_read; - entry->write_proc = echo_proc_write; + xprocfs_add_stat ("read_bytes", xprocfs_sum_st_read_bytes); + xprocfs_add_stat ("read_reqs", xprocfs_sum_st_read_reqs); + xprocfs_add_stat ("write_bytes", xprocfs_sum_st_write_bytes); + xprocfs_add_stat ("write_reqs", xprocfs_sum_st_write_reqs); + xprocfs_add_stat ("getattr_reqs", xprocfs_sum_st_getattr_reqs); + xprocfs_add_stat ("setattr_reqs", xprocfs_sum_st_setattr_reqs); + xprocfs_add_stat ("create_reqs", 
xprocfs_sum_st_create_reqs); + xprocfs_add_stat ("destroy_reqs", xprocfs_sum_st_destroy_reqs); + xprocfs_add_stat ("statfs_reqs", xprocfs_sum_st_statfs_reqs); + xprocfs_add_stat ("open_reqs", xprocfs_sum_st_open_reqs); + xprocfs_add_stat ("close_reqs", xprocfs_sum_st_close_reqs); + xprocfs_add_stat ("punch_reqs", xprocfs_sum_st_punch_reqs); } -void echo_proc_fini(void) +void xprocfs_fini (void) { - remove_proc_entry(ECHO_PROC_STAT, 0); + if (xprocfs_dir == NULL) + return; + + remove_proc_entry ("read_bytes", xprocfs_dir); + remove_proc_entry ("read_reqs", xprocfs_dir); + remove_proc_entry ("write_bytes", xprocfs_dir); + remove_proc_entry ("write_reqs", xprocfs_dir); + remove_proc_entry ("getattr_reqs", xprocfs_dir); + remove_proc_entry ("setattr_reqs", xprocfs_dir); + remove_proc_entry ("create_reqs", xprocfs_dir); + remove_proc_entry ("destroy_reqs", xprocfs_dir); + remove_proc_entry ("statfs_reqs", xprocfs_dir); + remove_proc_entry ("open_reqs", xprocfs_dir); + remove_proc_entry ("close_reqs", xprocfs_dir); + remove_proc_entry ("punch_reqs", xprocfs_dir); + + remove_proc_entry (xprocfs_dir->name, xprocfs_dir->parent); + xprocfs_dir = NULL; } static int echo_connect(struct lustre_handle *conn, struct obd_device *obd, - obd_uuid_t cluuid, struct recovd_obd *recovd, + struct obd_uuid *cluuid, struct recovd_obd *recovd, ptlrpc_recovery_cb_t recover) { return class_connect(conn, obd, cluuid); } +static int echo_disconnect(struct lustre_handle *conn) +{ + struct obd_export *exp = class_conn2export(conn); + + LASSERT (exp != NULL); + + ldlm_cancel_locks_for_export (exp); + return (class_disconnect (conn)); +} + static __u64 echo_next_id(struct obd_device *obddev) { obd_id id; @@ -124,17 +212,19 @@ static __u64 echo_next_id(struct obd_device *obddev) } int echo_create(struct lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md **ea) + struct lov_stripe_md **ea, struct obd_trans_info *oti) { struct obd_device *obd = class_conn2obd(conn); + 
XPROCFS_BUMP_MYCPU_IOSTAT (st_create_reqs, 1); + if (!obd) { CERROR("invalid client "LPX64"\n", conn->addr); return -EINVAL; } if (!(oa->o_mode && S_IFMT)) { - CERROR("filter obd: no type!\n"); + CERROR("echo obd: no type!\n"); return -ENOENT; } @@ -151,10 +241,12 @@ int echo_create(struct lustre_handle *conn, struct obdo *oa, } int echo_destroy(struct lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md *ea) + struct lov_stripe_md *ea, struct obd_trans_info *oti) { struct obd_device *obd = class_conn2obd(conn); + XPROCFS_BUMP_MYCPU_IOSTAT (st_destroy_reqs, 1); + if (!obd) { CERROR("invalid client "LPX64"\n", conn->addr); RETURN(-EINVAL); @@ -176,14 +268,53 @@ int echo_destroy(struct lustre_handle *conn, struct obdo *oa, } static int echo_open(struct lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md *md) + struct lov_stripe_md *md, struct obd_trans_info *oti) { + struct lustre_handle *fh = obdo_handle (oa); + struct obd_device *obd = class_conn2obd (conn); + + XPROCFS_BUMP_MYCPU_IOSTAT (st_open_reqs, 1); + + if (!obd) { + CERROR ("invalid client "LPX64"\n", conn->addr); + return (-EINVAL); + } + + if (!(oa->o_valid & OBD_MD_FLID)) { + CERROR ("obdo missing FLID valid flag: %08x\n", oa->o_valid); + return (-EINVAL); + } + + fh->addr = oa->o_id; + fh->cookie = ECHO_HANDLE_MAGIC; + + oa->o_valid |= OBD_MD_FLHANDLE; return 0; } static int echo_close(struct lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md *md) + struct lov_stripe_md *md, struct obd_trans_info *oti) { + struct lustre_handle *fh = obdo_handle (oa); + struct obd_device *obd = class_conn2obd(conn); + + XPROCFS_BUMP_MYCPU_IOSTAT (st_close_reqs, 1); + + if (!obd) { + CERROR("invalid client "LPX64"\n", conn->addr); + return (-EINVAL); + } + + if (!(oa->o_valid & OBD_MD_FLHANDLE)) { + CERROR("obdo missing FLHANDLE valid flag: %08x\n", oa->o_valid); + return (-EINVAL); + } + + if (fh->cookie != ECHO_HANDLE_MAGIC) { + CERROR ("invalid file handle on close: "LPX64"\n", fh->cookie); + 
return (-EINVAL); + } + return 0; } @@ -193,6 +324,8 @@ static int echo_getattr(struct lustre_handle *conn, struct obdo *oa, struct obd_device *obd = class_conn2obd(conn); obd_id id = oa->o_id; + XPROCFS_BUMP_MYCPU_IOSTAT (st_getattr_reqs, 1); + if (!obd) { CERROR("invalid client "LPX64"\n", conn->addr); RETURN(-EINVAL); @@ -203,20 +336,19 @@ static int echo_getattr(struct lustre_handle *conn, struct obdo *oa, RETURN(-EINVAL); } - memcpy(oa, &obd->u.echo.oa, sizeof(*oa)); + obdo_cpy_md(oa, &obd->u.echo.oa, oa->o_valid); oa->o_id = id; - oa->o_valid |= OBD_MD_FLID; - - atomic_inc(&echo_getattrs); return 0; } static int echo_setattr(struct lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md *md) + struct lov_stripe_md *md, struct obd_trans_info *oti) { struct obd_device *obd = class_conn2obd(conn); + XPROCFS_BUMP_MYCPU_IOSTAT (st_setattr_reqs, 1); + if (!obd) { CERROR("invalid client "LPX64"\n", conn->addr); RETURN(-EINVAL); @@ -239,15 +371,19 @@ static int echo_setattr(struct lustre_handle *conn, struct obdo *oa, int echo_preprw(int cmd, struct lustre_handle *conn, int objcount, struct obd_ioobj *obj, int niocount, struct niobuf_remote *nb, - struct niobuf_local *res, void **desc_private) + struct niobuf_local *res, void **desc_private, struct obd_trans_info *oti) { struct obd_device *obd; struct niobuf_local *r = res; int rc = 0; int i; - ENTRY; + if ((cmd & OBD_BRW_WRITE) != 0) + XPROCFS_BUMP_MYCPU_IOSTAT (st_write_reqs, 1); + else + XPROCFS_BUMP_MYCPU_IOSTAT (st_read_reqs, 1); + obd = class_conn2obd(conn); if (!obd) { CERROR("invalid client "LPX64"\n", conn->addr); @@ -265,16 +401,26 @@ int echo_preprw(int cmd, struct lustre_handle *conn, int objcount, for (i = 0; i < objcount; i++, obj++) { int gfp_mask = (obj->ioo_id & 1) ? 
GFP_HIGHUSER : GFP_KERNEL; - int verify = obj->ioo_id != 0; + int isobj0 = obj->ioo_id == 0; + int verify = !isobj0; int j; for (j = 0 ; j < obj->ioo_bufcnt ; j++, nb++, r++) { - r->page = alloc_pages(gfp_mask, 0); - if (!r->page) { - CERROR("can't get page %d/%d for id "LPU64"\n", - j, obj->ioo_bufcnt, obj->ioo_id); - GOTO(preprw_cleanup, rc = -ENOMEM); + + if (isobj0 && + (nb->offset >> PAGE_SHIFT) < ECHO_OBJECT0_NPAGES) { + r->page = echo_object0_pages[nb->offset >> PAGE_SHIFT]; + /* Take extra ref so __free_pages() can be called OK */ + get_page (r->page); + } else { + r->page = alloc_pages(gfp_mask, 0); + if (r->page == NULL) { + CERROR("can't get page %d/%d for id "LPU64"\n", + j, obj->ioo_bufcnt, obj->ioo_id); + GOTO(preprw_cleanup, rc = -ENOMEM); + } } + atomic_inc(&obd->u.echo.eo_prep); r->offset = nb->offset; @@ -284,13 +430,18 @@ int echo_preprw(int cmd, struct lustre_handle *conn, int objcount, CDEBUG(D_PAGE, "$$$$ get page %p, addr %p@"LPU64"\n", r->page, r->addr, r->offset); - if (verify && cmd == OBD_BRW_READ) - page_debug_setup(r->addr, r->len, r->offset, - obj->ioo_id); - else if (verify) - page_debug_setup(r->addr, r->len, - 0xecc0ecc0ecc0ecc0, - 0xecc0ecc0ecc0ecc0); + if (cmd == OBD_BRW_READ) { + XPROCFS_BUMP_MYCPU_IOSTAT (st_read_bytes, r->len); + if (verify) + page_debug_setup(r->addr, r->len, r->offset, + obj->ioo_id); + } else { + XPROCFS_BUMP_MYCPU_IOSTAT (st_write_bytes, r->len); + if (verify) + page_debug_setup(r->addr, r->len, + 0xecc0ecc0ecc0ecc0, + 0xecc0ecc0ecc0ecc0); + } } } CDEBUG(D_PAGE, "%d pages allocated after prep\n", @@ -307,6 +458,8 @@ preprw_cleanup: CERROR("cleaning up %ld pages (%d obdos)\n", (long)(r - res), objcount); while (r-- > res) { kunmap(r->page); + /* NB if this is an 'object0' page, __free_pages will just + * lose the extra ref gained above */ __free_pages(r->page, 0); atomic_dec(&obd->u.echo.eo_prep); } @@ -318,11 +471,12 @@ preprw_cleanup: int echo_commitrw(int cmd, struct lustre_handle *conn, int objcount, 
struct obd_ioobj *obj, int niocount, struct niobuf_local *res, - void *desc_private) + void *desc_private, struct obd_trans_info *oti) { struct obd_device *obd; struct niobuf_local *r = res; int rc = 0; + int vrc = 0; int i; ENTRY; @@ -363,16 +517,19 @@ int echo_commitrw(int cmd, struct lustre_handle *conn, int objcount, GOTO(commitrw_cleanup, rc = -EFAULT); } - atomic_inc(&echo_page_rws); - CDEBUG(D_PAGE, "$$$$ use page %p, addr %p@"LPU64"\n", r->page, addr, r->offset); - if (verify) - page_debug_check("echo", addr, r->len, - r->offset, obj->ioo_id); - + if (verify) { + vrc = page_debug_check("echo", addr, r->len, + r->offset, obj->ioo_id); + /* check all the pages always */ + if (vrc != 0 && rc == 0) + rc = vrc; + } + kunmap(page); + /* NB see comment above regarding object0 pages */ obd_kmap_put(1); __free_pages(page, 0); atomic_dec(&obd->u.echo.eo_prep); @@ -380,7 +537,7 @@ int echo_commitrw(int cmd, struct lustre_handle *conn, int objcount, } CDEBUG(D_PAGE, "%d pages remain after commit\n", atomic_read(&obd->u.echo.eo_prep)); - RETURN(0); + RETURN(rc); commitrw_cleanup: CERROR("cleaning up %ld pages (%d obdos)\n", @@ -390,6 +547,7 @@ commitrw_cleanup: kunmap(page); obd_kmap_put(1); + /* NB see comment above regarding object0 pages */ __free_pages(page, 0); atomic_dec(&obd->u.echo.eo_prep); } @@ -400,15 +558,18 @@ static int echo_setup(struct obd_device *obddev, obd_count len, void *buf) { ENTRY; + spin_lock_init(&obddev->u.echo.eo_lock); + obddev->u.echo.eo_lastino = ECHO_INIT_OBJID; + obddev->obd_namespace = ldlm_namespace_new("echo-tgt", LDLM_NAMESPACE_SERVER); if (obddev->obd_namespace == NULL) { LBUG(); RETURN(-ENOMEM); } - spin_lock_init(&obddev->u.echo.eo_lock); - obddev->u.echo.eo_lastino = ECHO_INIT_OBJID; + ptlrpc_init_client (LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL, + "echo_ldlm_cb_client", &obddev->obd_ldlm_client); RETURN(0); } @@ -425,12 +586,15 @@ static int echo_cleanup(struct obd_device *obddev) int echo_attach(struct obd_device *dev, 
obd_count len, void *data) { - return lprocfs_reg_obd(dev, status_var_nm_1, dev); + struct lprocfs_static_vars lvars; + + lprocfs_init_vars(&lvars); + return lprocfs_obd_attach(dev, lvars.obd_vars); } int echo_detach(struct obd_device *dev) { - return lprocfs_dereg_obd(dev); + return lprocfs_obd_detach(dev); } static struct obd_ops echo_obd_ops = { @@ -438,7 +602,7 @@ static struct obd_ops echo_obd_ops = { o_attach: echo_attach, o_detach: echo_detach, o_connect: echo_connect, - o_disconnect: class_disconnect, + o_disconnect: echo_disconnect, o_create: echo_create, o_destroy: echo_destroy, o_open: echo_open, @@ -454,35 +618,85 @@ static struct obd_ops echo_obd_ops = { extern int echo_client_init(void); extern void echo_client_cleanup(void); +static void +echo_object0_pages_fini (void) +{ + int i; + + for (i = 0; i < ECHO_OBJECT0_NPAGES; i++) + if (echo_object0_pages[i] != NULL) { + __free_pages (echo_object0_pages[i], 0); + echo_object0_pages[i] = NULL; + } +} + +static int +echo_object0_pages_init (void) +{ + struct page *pg; + int i; + + for (i = 0; i < ECHO_OBJECT0_NPAGES; i++) { + int gfp_mask = (i < ECHO_OBJECT0_NPAGES/2) ? 
GFP_KERNEL : GFP_HIGHUSER; + + pg = alloc_pages (gfp_mask, 0); + if (pg == NULL) { + echo_object0_pages_fini (); + return (-ENOMEM); + } + + memset (kmap (pg), 0, PAGE_SIZE); + kunmap (pg); + + echo_object0_pages[i] = pg; + } + + return (0); +} + static int __init obdecho_init(void) { + struct lprocfs_static_vars lvars; int rc; - printk(KERN_INFO "Echo OBD driver " OBDECHO_VERSION - " info@clusterfs.com\n"); + printk(KERN_INFO "Lustre Echo OBD driver; info@clusterfs.com\n"); + + lprocfs_init_vars(&lvars); - echo_proc_init(); - rc = class_register_type(&echo_obd_ops, status_class_var, + xprocfs_init ("echo"); + + rc = echo_object0_pages_init (); + if (rc != 0) + goto failed_0; + + rc = class_register_type(&echo_obd_ops, lvars.module_vars, OBD_ECHO_DEVICENAME); - if (rc) - RETURN(rc); + if (rc != 0) + goto failed_1; rc = echo_client_init(); - if (rc) - class_unregister_type(OBD_ECHO_DEVICENAME); + if (rc == 0) + RETURN (0); + class_unregister_type(OBD_ECHO_DEVICENAME); + failed_1: + echo_object0_pages_fini (); + failed_0: + xprocfs_fini (); + RETURN(rc); } static void __exit obdecho_exit(void) { - echo_proc_fini(); echo_client_cleanup(); class_unregister_type(OBD_ECHO_DEVICENAME); + echo_object0_pages_fini (); + xprocfs_fini (); } -MODULE_AUTHOR("Cluster Filesystems Inc. "); -MODULE_DESCRIPTION("Lustre Testing Echo OBD driver " OBDECHO_VERSION); +MODULE_AUTHOR("Cluster File Systems, Inc. "); +MODULE_DESCRIPTION("Lustre Testing Echo OBD driver"); MODULE_LICENSE("GPL"); module_init(obdecho_init); diff --git a/lustre/obdecho/echo_client.c b/lustre/obdecho/echo_client.c index e9c0e90..6c4eb6d 100644 --- a/lustre/obdecho/echo_client.c +++ b/lustre/obdecho/echo_client.c @@ -1,7 +1,7 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (c) 2001, 2002 Cluster File Systems, Inc. + * Copyright (c) 2001-2003 Cluster File Systems, Inc. * * This file is part of Lustre, http://www.lustre.org. 
* @@ -22,6 +22,8 @@ #include #include #include +#include +#include #define DEBUG_SUBSYSTEM S_ECHO @@ -30,159 +32,935 @@ #include #include #include +#include /* for LL_IOC_LOV_SETSTRIPE */ -static int echo_iocontrol(unsigned int cmd, struct lustre_handle *obdconn, int len, - void *karg, void *uarg) +#if 0 +static void +echo_printk_object (char *msg, struct ec_object *eco) +{ + struct lov_stripe_md *lsm = eco->eco_lsm; + int i; + + printk (KERN_INFO "%s: object %p: "LPX64", refs %d%s: "LPX64 + "=%u!%u@%d\n", msg, eco, eco->eco_id, eco->eco_refcount, + eco->eco_deleted ? "(deleted) " : "", + lsm->lsm_object_id, lsm->lsm_stripe_size, + lsm->lsm_stripe_count, lsm->lsm_stripe_offset); + + for (i = 0; i < lsm->lsm_stripe_count; i++) + printk (KERN_INFO " [%2u]"LPX64"\n", + lsm->lsm_oinfo[i].loi_ost_idx, + lsm->lsm_oinfo[i].loi_id); +} +#endif + +static struct ec_object * +echo_find_object_locked (struct obd_device *obd, obd_id id) { - struct obd_device *obd = class_conn2obd(obdconn); struct echo_client_obd *ec = &obd->u.echo_client; - struct obd_ioctl_data *data = karg; - int rw = OBD_BRW_READ, rc = 0; - struct lov_stripe_md *lsm = NULL; + struct ec_object *eco = NULL; + struct list_head *el; + + list_for_each (el, &ec->ec_objects) { + eco = list_entry (el, struct ec_object, eco_obj_chain); + + if (eco->eco_id == id) + return (eco); + } + return (NULL); +} + +static int +echo_copyout_lsm (struct lov_stripe_md *lsm, void *ulsm, int ulsm_nob) +{ + int nob; + + nob = offsetof (struct lov_stripe_md, lsm_oinfo[lsm->lsm_stripe_count]); + if (nob > ulsm_nob) + return (-EINVAL); + + if (copy_to_user (ulsm, lsm, nob)) + return (-EFAULT); + + return (0); +} + +static int +echo_copyin_lsm (struct obd_device *obd, struct lov_stripe_md *lsm, + void *ulsm, int ulsm_nob) +{ + struct echo_client_obd *ec = &obd->u.echo_client; + int nob; + + if (ulsm_nob < sizeof (*lsm)) + return (-EINVAL); + + if (copy_from_user (lsm, ulsm, sizeof (*lsm))) + return (-EFAULT); + + nob = 
lsm->lsm_stripe_count * sizeof (lsm->lsm_oinfo[0]); + + if (ulsm_nob < nob || + lsm->lsm_stripe_count > ec->ec_nstripes || + lsm->lsm_magic != LOV_MAGIC || + (lsm->lsm_stripe_offset != 0 && + lsm->lsm_stripe_offset != 0xffffffff && + lsm->lsm_stripe_offset >= ec->ec_nstripes) || + (lsm->lsm_stripe_size & (PAGE_SIZE - 1)) != 0 || + ((__u64)lsm->lsm_stripe_size * lsm->lsm_stripe_count > ~0UL)) + return (-EINVAL); + + LASSERT (ec->ec_lsmsize >= sizeof (*lsm) + nob); + + if (copy_from_user(lsm->lsm_oinfo, + ((struct lov_stripe_md *)ulsm)->lsm_oinfo, nob)) + return (-EFAULT); + + return (0); +} + +static struct ec_object * +echo_allocate_object (struct obd_device *obd) +{ + struct echo_client_obd *ec = &obd->u.echo_client; + struct ec_object *eco; + + OBD_ALLOC (eco, sizeof (*eco)); + if (eco == NULL) + return (NULL); + + OBD_ALLOC (eco->eco_lsm, ec->ec_lsmsize); + if (eco->eco_lsm == NULL) { + OBD_FREE (eco, sizeof (*eco)); + return (NULL); + } + + eco->eco_device = obd; + eco->eco_deleted = 0; + eco->eco_refcount = 0; + eco->eco_lsm->lsm_magic = LOV_MAGIC; + /* leave stripe count 0 by default */ + + return (eco); +} + +static void +echo_free_object (struct ec_object *eco) +{ + struct obd_device *obd = eco->eco_device; + struct echo_client_obd *ec = &obd->u.echo_client; + + LASSERT (eco->eco_refcount == 0); + OBD_FREE (eco->eco_lsm, ec->ec_lsmsize); + OBD_FREE (eco, sizeof (*eco)); +} + +static int +echo_create_object (struct obd_device *obd, int on_target, struct obdo *oa, + void *ulsm, int ulsm_nob) +{ + struct echo_client_obd *ec = &obd->u.echo_client; + struct ec_object *eco2; + struct ec_object *eco; + struct lov_stripe_md *lsm; + int rc; + int i; + + if ((oa->o_valid & OBD_MD_FLID) == 0 && /* no obj id */ + (on_target || /* set_stripe */ + ec->ec_nstripes != 0)) { /* LOV */ + CERROR ("No valid oid\n"); + return (-EINVAL); + } + + eco = echo_allocate_object (obd); + if (eco == NULL) + return (-ENOMEM); + + lsm = eco->eco_lsm; + + if (ulsm != NULL) { + rc = 
echo_copyin_lsm (obd, lsm, ulsm, ulsm_nob); + if (rc != 0) + goto failed; + } + + /* setup object ID here for !on_target and LOV hint */ + if ((oa->o_valid & OBD_MD_FLID) != 0) + eco->eco_id = lsm->lsm_object_id = oa->o_id; + + /* defaults -> actual values */ + if (lsm->lsm_stripe_offset == 0xffffffff) + lsm->lsm_stripe_offset = 0; + + if (lsm->lsm_stripe_count == 0) + lsm->lsm_stripe_count = ec->ec_nstripes; + + if (lsm->lsm_stripe_size == 0) + lsm->lsm_stripe_size = PAGE_SIZE; + + /* setup stripes: indices + default ids if required */ + for (i = 0; i < lsm->lsm_stripe_count; i++) { + if (lsm->lsm_oinfo[i].loi_id == 0) + lsm->lsm_oinfo[i].loi_id = lsm->lsm_object_id; + + lsm->lsm_oinfo[i].loi_ost_idx = + (lsm->lsm_stripe_offset + i) % ec->ec_nstripes; + } + + if (on_target) { + rc = obd_create (&ec->ec_conn, oa, &lsm, NULL); + if (rc != 0) + goto failed; + + /* See what object ID we were given */ + LASSERT ((oa->o_valid & OBD_MD_FLID) != 0); + eco->eco_id = lsm->lsm_object_id = oa->o_id; + } + + spin_lock (&ec->ec_lock); + + eco2 = echo_find_object_locked (obd, oa->o_id); + if (eco2 != NULL) { /* conflict */ + spin_unlock (&ec->ec_lock); + + CERROR ("Can't create object id "LPX64": id already exists%s\n", + oa->o_id, on_target ? 
" (undoing create)" : ""); + + if (on_target) + obd_destroy (&ec->ec_conn, oa, lsm, NULL); + + rc = -EEXIST; + goto failed; + } + + list_add (&eco->eco_obj_chain, &ec->ec_objects); + spin_unlock (&ec->ec_lock); + CDEBUG (D_INFO, + "created %p: "LPX64"=%u#%u&%d refs %d del %d\n", + eco, eco->eco_id, + eco->eco_lsm->lsm_stripe_size, + eco->eco_lsm->lsm_stripe_count, + eco->eco_lsm->lsm_stripe_offset, + eco->eco_refcount, eco->eco_deleted); + return (0); + + failed: + echo_free_object (eco); + return (rc); +} + +static int +echo_get_object (struct ec_object **ecop, struct obd_device *obd, struct obdo *oa) +{ + struct echo_client_obd *ec = &obd->u.echo_client; + struct ec_object *eco; + struct ec_object *eco2; + int rc; + + if ((oa->o_valid & OBD_MD_FLID) == 0) + { + CERROR ("No valid oid\n"); + return (-EINVAL); + } + + spin_lock (&ec->ec_lock); + eco = echo_find_object_locked (obd, oa->o_id); + if (eco != NULL) { + if (eco->eco_deleted) /* being deleted */ + return (-EAGAIN); /* (see comment in cleanup) */ + + eco->eco_refcount++; + spin_unlock (&ec->ec_lock); + *ecop = eco; + CDEBUG (D_INFO, + "found %p: "LPX64"=%u#%u&%d refs %d del %d\n", + eco, eco->eco_id, + eco->eco_lsm->lsm_stripe_size, + eco->eco_lsm->lsm_stripe_count, + eco->eco_lsm->lsm_stripe_offset, + eco->eco_refcount, eco->eco_deleted); + return (0); + } + spin_unlock (&ec->ec_lock); + + if (ec->ec_nstripes != 0) /* striping required */ + return (-ENOENT); + + eco = echo_allocate_object (obd); + if (eco == NULL) + return (-ENOMEM); + + eco->eco_id = eco->eco_lsm->lsm_object_id = oa->o_id; + + spin_lock (&ec->ec_lock); + + eco2 = echo_find_object_locked (obd, oa->o_id); + if (eco2 == NULL) { /* didn't race */ + list_add (&eco->eco_obj_chain, &ec->ec_objects); + spin_unlock (&ec->ec_lock); + eco->eco_refcount = 1; + *ecop = eco; + CDEBUG (D_INFO, + "created %p: "LPX64"=%u#%u&%d refs %d del %d\n", + eco, eco->eco_id, + eco->eco_lsm->lsm_stripe_size, + eco->eco_lsm->lsm_stripe_count, + 
eco->eco_lsm->lsm_stripe_offset, + eco->eco_refcount, eco->eco_deleted); + return (0); + } + + if (eco2->eco_deleted) + rc = -EAGAIN; /* lose race */ + else { + eco2->eco_refcount++; /* take existing */ + *ecop = eco2; + rc = 0; + LASSERT (eco2->eco_id == eco2->eco_lsm->lsm_object_id); + CDEBUG (D_INFO, + "found(2) %p: "LPX64"=%u#%u&%d refs %d del %d\n", + eco2, eco2->eco_id, + eco2->eco_lsm->lsm_stripe_size, + eco2->eco_lsm->lsm_stripe_count, + eco2->eco_lsm->lsm_stripe_offset, + eco2->eco_refcount, eco2->eco_deleted); + } + + spin_unlock (&ec->ec_lock); + + echo_free_object (eco); + return (rc); +} + +static void +echo_put_object (struct ec_object *eco) +{ + struct obd_device *obd = eco->eco_device; + struct echo_client_obd *ec = &obd->u.echo_client; + + /* Release caller's ref on the object. + * delete => mark for deletion when last ref goes + */ + + spin_lock (&ec->ec_lock); + + eco->eco_refcount--; + LASSERT (eco->eco_refcount >= 0); + + if (eco->eco_refcount != 0 || + !eco->eco_deleted) { + spin_unlock (&ec->ec_lock); + return; + } + + spin_unlock (&ec->ec_lock); + + /* NB leave obj in the object list. We must prevent anyone from + * attempting to enqueue on this object number until we can be + * sure there will be no more lock callbacks. 
+ */ + obd_cancel_unused (&ec->ec_conn, eco->eco_lsm, 0); + + /* now we can let it go */ + spin_lock (&ec->ec_lock); + list_del (&eco->eco_obj_chain); + spin_unlock (&ec->ec_lock); + + LASSERT (eco->eco_refcount == 0); + + echo_free_object (eco); +} + +static void +echo_get_stripe_off_id (struct lov_stripe_md *lsm, obd_off *offp, obd_id *idp) +{ + unsigned long stripe_count; + unsigned long stripe_size; + unsigned long width; + unsigned long woffset; + int stripe_index; + obd_off offset; + + if (lsm->lsm_stripe_count <= 1) + return; + + offset = *offp; + stripe_size = lsm->lsm_stripe_size; + stripe_count = lsm->lsm_stripe_count; + + /* width = # bytes in all stripes */ + width = stripe_size * stripe_count; + + /* woffset = offset within a width; offset = whole number of widths */ + woffset = do_div (offset, width); + + stripe_index = woffset / stripe_size; + + *idp = lsm->lsm_oinfo[stripe_index].loi_id; + *offp = offset * stripe_size + woffset % stripe_size; +} + +static int +echo_client_kbrw (struct obd_device *obd, int rw, + struct obdo *oa, struct lov_stripe_md *lsm, + obd_off offset, obd_size count) +{ + struct echo_client_obd *ec = &obd->u.echo_client; + struct obd_brw_set *set; + obd_count npages; + struct brw_page *pga; + struct brw_page *pgp; + obd_off off; + int i; + int rc; + int verify; + int gfp_mask; + + /* oa_id == 0 => speed test (no verification) else... + * oa & 1 => use HIGHMEM + */ + verify = (oa->o_id != 0); + gfp_mask = ((oa->o_id & 1) == 0) ? 
GFP_KERNEL : GFP_HIGHUSER; + + LASSERT(rw == OBD_BRW_WRITE || rw == OBD_BRW_READ); + + if (count <= 0 || + (count & (PAGE_SIZE - 1)) != 0 || + (lsm != NULL && + lsm->lsm_object_id != oa->o_id)) + return (-EINVAL); + + set = obd_brw_set_new(); + if (set == NULL) + return (-ENOMEM); + + /* XXX think again with misaligned I/O */ + npages = count >> PAGE_SHIFT; + + rc = -ENOMEM; + OBD_ALLOC(pga, npages * sizeof(*pga)); + if (pga == NULL) + goto out_0; + + for (i = 0, pgp = pga, off = offset; + i < npages; + i++, pgp++, off += PAGE_SIZE) { + + LASSERT (pgp->pg == NULL); /* for cleanup */ + + rc = -ENOMEM; + pgp->pg = alloc_pages (gfp_mask, 0); + if (pgp->pg == NULL) + goto out_1; + + pgp->count = PAGE_SIZE; + pgp->off = off; + pgp->flag = 0; + + if (verify) { + void *addr = kmap(pgp->pg); + obd_off stripe_off = off; + obd_id stripe_id = oa->o_id; + + if (rw == OBD_BRW_WRITE) { + echo_get_stripe_off_id(lsm, &stripe_off, + &stripe_id); + page_debug_setup(addr, pgp->count, + stripe_off, stripe_id); + } else { + page_debug_setup(addr, pgp->count, + 0xdeadbeef00c0ffee, + 0xdeadbeef00c0ffee); + } + kunmap(pgp->pg); + } + } + + set->brw_callback = ll_brw_sync_wait; + rc = obd_brw(rw, &ec->ec_conn, lsm, npages, pga, set, NULL); + if (rc == 0) + rc = ll_brw_sync_wait(set, CB_PHASE_START); + + out_1: + if (rc != 0) + verify = 0; + + for (i = 0, pgp = pga; i < npages; i++, pgp++) { + if (pgp->pg == NULL) + continue; + + if (verify) { + void *addr = kmap(pgp->pg); + obd_off stripe_off = pgp->off; + obd_id stripe_id = oa->o_id; + int vrc; + + echo_get_stripe_off_id (lsm, &stripe_off, &stripe_id); + vrc = page_debug_check("test_brw", addr, pgp->count, + stripe_off, stripe_id); + if (vrc != 0 && rc == 0) + rc = vrc; + + kunmap(pgp->pg); + } + __free_pages(pgp->pg, 0); + } + OBD_FREE(pga, npages * sizeof(*pga)); + out_0: + obd_brw_set_free(set); + return (rc); +} + +static int +echo_client_ubrw (struct obd_device *obd, int rw, + struct obdo *oa, struct lov_stripe_md *lsm, + obd_off 
offset, obd_size count, char *buffer) +{ + struct echo_client_obd *ec = &obd->u.echo_client; + struct obd_brw_set *set; + obd_count npages; + struct brw_page *pga; + struct brw_page *pgp; + obd_off off; + struct kiobuf *kiobuf; + int i; + int rc; + + LASSERT (rw == OBD_BRW_WRITE || + rw == OBD_BRW_READ); + + /* NB: for now, only whole pages, page aligned */ + + if (count <= 0 || + ((long)buffer & (PAGE_SIZE - 1)) != 0 || + (count & (PAGE_SIZE - 1)) != 0 || + (lsm != NULL && lsm->lsm_object_id != oa->o_id)) + return (-EINVAL); + + set = obd_brw_set_new(); + if (set == NULL) + return (-ENOMEM); + + /* XXX think again with misaligned I/O */ + npages = count >> PAGE_SHIFT; + + rc = -ENOMEM; + OBD_ALLOC(pga, npages * sizeof(*pga)); + if (pga == NULL) + goto out_0; + + rc = alloc_kiovec (1, &kiobuf); + if (rc != 0) + goto out_1; + + rc = map_user_kiobuf ((rw == OBD_BRW_READ) ? READ : WRITE, + kiobuf, (unsigned long)buffer, count); + if (rc != 0) + goto out_2; + + LASSERT (kiobuf->offset == 0); + LASSERT (kiobuf->nr_pages == npages); + + for (i = 0, off = offset, pgp = pga; + i < npages; + i++, off += PAGE_SIZE, pgp++) { + pgp->off = off; + pgp->pg = kiobuf->maplist[i]; + pgp->count = PAGE_SIZE; + pgp->flag = 0; + } + + set->brw_callback = ll_brw_sync_wait; + rc = obd_brw(rw, &ec->ec_conn, lsm, npages, pga, set, NULL); + + if (rc == 0) + rc = ll_brw_sync_wait(set, CB_PHASE_START); + + // if (rw == OBD_BRW_READ) + // mark_dirty_kiobuf (kiobuf, count); + + unmap_kiobuf (kiobuf); + out_2: + free_kiovec (1, &kiobuf); + out_1: + OBD_FREE(pga, npages * sizeof(*pga)); + out_0: + obd_brw_set_free(set); + return (rc); +} + +static int +echo_open (struct obd_export *exp, struct obdo *oa) +{ + struct obd_device *obd = exp->exp_obd; + struct echo_client_obd *ec = &obd->u.echo_client; + struct lustre_handle *ufh = obdo_handle (oa); + struct ec_open_object *ecoo; + struct ec_object *eco; + int rc; + + rc = echo_get_object (&eco, obd, oa); + if (rc != 0) + return (rc); + + rc = -ENOMEM; 
+ OBD_ALLOC (ecoo, sizeof (*ecoo)); + if (ecoo == NULL) + goto failed_0; + + rc = obd_open (&ec->ec_conn, oa, eco->eco_lsm, NULL); + if (rc != 0) + goto failed_1; + + memcpy (&ecoo->ecoo_oa, oa, sizeof (*oa)); + ecoo->ecoo_object = eco; + /* ecoo takes ref from echo_get_object() above */ + + spin_lock (&ec->ec_lock); + + list_add (&ecoo->ecoo_exp_chain, + &exp->exp_ec_data.eced_open_head); + + ufh->addr = (__u64)((long) ecoo); + ufh->cookie = ecoo->ecoo_cookie = ec->ec_unique++; + + spin_unlock (&ec->ec_lock); + return (0); + + failed_1: + OBD_FREE (ecoo, sizeof (*ecoo)); + failed_0: + echo_put_object (eco); + return (rc); +} + +static int +echo_close (struct obd_export *exp, struct obdo *oa) +{ + struct obd_device *obd = exp->exp_obd; + struct echo_client_obd *ec = &obd->u.echo_client; + struct lustre_handle *ufh = obdo_handle (oa); + struct ec_open_object *ecoo = NULL; + int found = 0; + struct list_head *el; + int rc; + + if ((oa->o_valid & OBD_MD_FLHANDLE) == 0) + return (-EINVAL); + + spin_lock (&ec->ec_lock); + + list_for_each (el, &exp->exp_ec_data.eced_open_head) { + ecoo = list_entry (el, struct ec_open_object, ecoo_exp_chain); + if ((__u64)((long)ecoo) == ufh->addr) { + found = (ecoo->ecoo_cookie == ufh->cookie); + if (found) + list_del (&ecoo->ecoo_exp_chain); + break; + } + } + + spin_unlock (&ec->ec_lock); + + if (!found) + return (-EINVAL); + + rc = obd_close (&ec->ec_conn, &ecoo->ecoo_oa, + ecoo->ecoo_object->eco_lsm, NULL); + + echo_put_object (ecoo->ecoo_object); + OBD_FREE (ecoo, sizeof (*ecoo)); + + return (rc); +} + +static int +echo_ldlm_callback (struct ldlm_lock *lock, struct ldlm_lock_desc *new, + void *data, int flag) +{ + struct ec_object *eco = (struct ec_object *)data; + struct echo_client_obd *ec = &(eco->eco_device->u.echo_client); + struct lustre_handle lockh; + struct list_head *el; + int found = 0; + int rc; + + ldlm_lock2handle (lock, &lockh); + + /* #ifdef this out if we're not feeling paranoid */ + spin_lock (&ec->ec_lock); + 
list_for_each (el, &ec->ec_objects) { + found = (eco == list_entry (el, struct ec_object, eco_obj_chain)); + if (found) + break; + } + spin_unlock (&ec->ec_lock); + LASSERT (found); + + switch (flag) { + case LDLM_CB_BLOCKING: + CDEBUG (D_INFO, "blocking callback on "LPX64", handle "LPX64"."LPX64"\n", + eco->eco_id, lockh.addr, lockh.cookie); + rc = ldlm_cli_cancel (&lockh); + if (rc != ELDLM_OK) + CERROR ("ldlm_cli_cancel failed: %d\n", rc); + break; + + case LDLM_CB_CANCELING: + CDEBUG (D_INFO, "canceling callback on "LPX64", handle "LPX64"."LPX64"\n", + eco->eco_id, lockh.addr, lockh.cookie); + break; + + default: + LBUG (); + } + + return (0); +} + +static int +echo_enqueue (struct obd_export *exp, struct obdo *oa, + int mode, obd_off offset, obd_size nob) +{ + struct obd_device *obd = exp->exp_obd; + struct echo_client_obd *ec = &obd->u.echo_client; + struct lustre_handle *ulh = obdo_handle (oa); + struct ec_object *eco; + struct ec_lock *ecl; + int flags; + int rc; + + if (!(mode == LCK_PR || mode == LCK_PW)) + return (-EINVAL); + + if ((offset & (PAGE_SIZE - 1)) != 0 || + (nob & (PAGE_SIZE - 1)) != 0) + return (-EINVAL); + + rc = echo_get_object (&eco, obd, oa); + if (rc != 0) + return (rc); + + rc = -ENOMEM; + OBD_ALLOC (ecl, sizeof (*ecl)); + if (ecl == NULL) + goto failed_0; + + ecl->ecl_mode = mode; + ecl->ecl_object = eco; + ecl->ecl_extent.start = offset; + ecl->ecl_extent.end = (nob == 0) ? 
((obd_off)-1) : (offset + nob - 1); + + flags = 0; + rc = obd_enqueue (&ec->ec_conn, eco->eco_lsm, NULL, + LDLM_EXTENT, &ecl->ecl_extent, sizeof (ecl->ecl_extent), + mode, &flags, echo_ldlm_callback, eco, sizeof (*eco), + &ecl->ecl_handle); + if (rc != 0) + goto failed_1; + + CDEBUG (D_INFO, "enqueue handle "LPX64"."LPX64"\n", + ecl->ecl_handle.addr, ecl->ecl_handle.cookie); + + /* NB ecl takes object ref from echo_get_object() above */ + + spin_lock (&ec->ec_lock); + + list_add (&ecl->ecl_exp_chain, &exp->exp_ec_data.eced_locks); + + ulh->addr = (__u64)((long)ecl); + ulh->cookie = ecl->ecl_cookie = ec->ec_unique++; + + spin_unlock (&ec->ec_lock); + + oa->o_valid |= OBD_MD_FLHANDLE; + return (0); + + failed_1: + OBD_FREE (ecl, sizeof (*ecl)); + failed_0: + echo_put_object (eco); + return (rc); +} + +static int +echo_cancel (struct obd_export *exp, struct obdo *oa) +{ + struct obd_device *obd = exp->exp_obd; + struct echo_client_obd *ec = &obd->u.echo_client; + struct lustre_handle *ulh = obdo_handle (oa); + struct ec_lock *ecl = NULL; + int found = 0; + struct list_head *el; + int rc; + + if ((oa->o_valid & OBD_MD_FLHANDLE) == 0) + return (-EINVAL); + + spin_lock (&ec->ec_lock); + + list_for_each (el, &exp->exp_ec_data.eced_locks) { + ecl = list_entry (el, struct ec_lock, ecl_exp_chain); + + if ((__u64)((long)ecl) == ulh->addr) { + found = (ecl->ecl_cookie == ulh->cookie); + if (found) + list_del (&ecl->ecl_exp_chain); + break; + } + } + + spin_unlock (&ec->ec_lock); + + if (!found) + return (-ENOENT); + + rc = obd_cancel (&ec->ec_conn, + ecl->ecl_object->eco_lsm, + ecl->ecl_mode, + &ecl->ecl_handle); + + echo_put_object (ecl->ecl_object); + OBD_FREE (ecl, sizeof (*ecl)); + + return (rc); +} + +static int echo_iocontrol(unsigned int cmd, struct lustre_handle *obdconn, + int len, void *karg, void *uarg) +{ + struct obd_export *exp = class_conn2export (obdconn); + struct obd_device *obd; + struct echo_client_obd *ec; + struct ec_object *eco; + struct obd_ioctl_data 
*data = karg; + int rw = OBD_BRW_READ; + int rc = 0; ENTRY; - if (obd == NULL) { + if (exp == NULL) { CERROR("ioctl: No device\n"); GOTO(out, rc = -EINVAL); } - if (data->ioc_inllen1 == sizeof(*lsm)) { - lsm = (struct lov_stripe_md *)data->ioc_inlbuf1; - } else if (data->ioc_inllen1 != 0) { - CERROR("nonzero ioc_inllen1 != sizeof(struct lov_stripe_md)\n"); - GOTO(out, rc = -EINVAL); - } + obd = exp->exp_obd; + ec = &obd->u.echo_client; switch (cmd) { - case OBD_IOC_CREATE: { - struct lov_stripe_md *tmp_lsm = NULL; - rc = obd_create(&ec->conn, &data->ioc_obdo1, &tmp_lsm); - if (lsm && tmp_lsm ) { - memcpy(lsm, tmp_lsm, sizeof(*tmp_lsm)); - data->ioc_conn2 = 1; - } + case OBD_IOC_CREATE: /* may create echo object */ + if (!capable (CAP_SYS_ADMIN)) + GOTO (out, rc = -EPERM); + + rc = echo_create_object (obd, 1, &data->ioc_obdo1, + data->ioc_pbuf1, data->ioc_plen1); + GOTO(out, rc); + case OBD_IOC_DESTROY: + if (!capable (CAP_SYS_ADMIN)) + GOTO (out, rc = -EPERM); + + rc = echo_get_object (&eco, obd, &data->ioc_obdo1); + if (rc == 0) { + rc = obd_destroy(&ec->ec_conn, &data->ioc_obdo1, + eco->eco_lsm, NULL); + if (rc == 0) + eco->eco_deleted = 1; + echo_put_object(eco); + } GOTO(out, rc); - } case OBD_IOC_GETATTR: - rc = obd_getattr(&ec->conn, &data->ioc_obdo1, lsm); + rc = echo_get_object (&eco, obd, &data->ioc_obdo1); + if (rc == 0) { + rc = obd_getattr(&ec->ec_conn, &data->ioc_obdo1, + eco->eco_lsm); + echo_put_object(eco); + } GOTO(out, rc); case OBD_IOC_SETATTR: - rc = obd_setattr(&ec->conn, &data->ioc_obdo1, lsm); - GOTO(out, rc); - - case OBD_IOC_DESTROY: - rc = obd_destroy(&ec->conn, &data->ioc_obdo1, lsm); + if (!capable (CAP_SYS_ADMIN)) + GOTO (out, rc = -EPERM); + + rc = echo_get_object (&eco, obd, &data->ioc_obdo1); + if (rc == 0) { + rc = obd_setattr(&ec->ec_conn, &data->ioc_obdo1, + eco->eco_lsm, NULL); + echo_put_object(eco); + } GOTO(out, rc); case OBD_IOC_OPEN: - rc = obd_open(&ec->conn, &data->ioc_obdo1, lsm); + rc = echo_open (exp, &data->ioc_obdo1); 
GOTO(out, rc); case OBD_IOC_CLOSE: - rc = obd_close(&ec->conn, &data->ioc_obdo1, lsm); + rc = echo_close (exp, &data->ioc_obdo1); GOTO(out, rc); case OBD_IOC_BRW_WRITE: + if (!capable (CAP_SYS_ADMIN)) + GOTO (out, rc = -EPERM); + rw = OBD_BRW_WRITE; - case OBD_IOC_BRW_READ: { - struct lov_stripe_md tmp_lsm; - struct obd_brw_set *set; - obd_count pages = 0; - struct brw_page *pga, *pgp; - __u64 off, id = data->ioc_obdo1.o_id; - int gfp_mask = (id & 1) ? GFP_HIGHUSER : GFP_KERNEL; - int j, verify = (id != 0); - - if (lsm && lsm->lsm_object_id != id) { - CERROR("LSM object ID ("LPU64") != id ("LPU64")\n", - lsm->lsm_object_id, id); - GOTO(out, rc = -EINVAL); - } - - if (!lsm) { - memset(&tmp_lsm, 0, sizeof(tmp_lsm)); - lsm = &tmp_lsm; - lsm->lsm_object_id = id; + /* fall through */ + case OBD_IOC_BRW_READ: + rc = echo_get_object (&eco, obd, &data->ioc_obdo1); + if (rc == 0) { + if (data->ioc_pbuf2 == NULL) // NULL user data pointer + rc = echo_client_kbrw(obd, rw, &data->ioc_obdo1, + eco->eco_lsm, + data->ioc_offset, + data->ioc_count); + else + rc = echo_client_ubrw(obd, rw, &data->ioc_obdo1, + eco->eco_lsm, + data->ioc_offset, + data->ioc_count, + data->ioc_pbuf2); + echo_put_object(eco); } + GOTO(out, rc); - if (data->ioc_count < 0) { - CERROR("invalid buffer size: "LPD64"\n", - data->ioc_count); - GOTO(out, rc = -EINVAL); + case ECHO_IOC_GET_STRIPE: + rc = echo_get_object(&eco, obd, &data->ioc_obdo1); + if (rc == 0) { + rc = echo_copyout_lsm(eco->eco_lsm, data->ioc_pbuf1, + data->ioc_plen1); + echo_put_object(eco); } + GOTO(out, rc); - set = obd_brw_set_new(); - if (set == NULL) - GOTO(out, rc = -ENOMEM); - - pages = data->ioc_count / PAGE_SIZE; - off = data->ioc_offset; - - CDEBUG(D_INODE, "BRW %s with %d pages @ "LPX64"\n", - rw == OBD_BRW_READ ? 
"read" : "write", pages, off); - OBD_ALLOC(pga, pages * sizeof(*pga)); - if (!pga) { - CERROR("no memory for %d BRW per-page data\n", pages); - GOTO(brw_free, rc = -ENOMEM); - } + case ECHO_IOC_SET_STRIPE: + if (!capable (CAP_SYS_ADMIN)) + GOTO (out, rc = -EPERM); - for (j = 0, pgp = pga; j < pages; j++, off += PAGE_SIZE, pgp++){ - pgp->pg = alloc_pages(gfp_mask, 0); - if (!pgp->pg) { - CERROR("no memory for brw pages\n"); - GOTO(brw_cleanup, rc = -ENOMEM); - } - pgp->count = PAGE_SIZE; - pgp->off = off; - pgp->flag = 0; - - if (verify) { - void *addr = kmap(pgp->pg); - - if (rw == OBD_BRW_WRITE) - page_debug_setup(addr, pgp->count, - pgp->off, id); - else - page_debug_setup(addr, pgp->count, - 0xdeadbeef00c0ffee, - 0xdeadbeef00c0ffee); - kunmap(pgp->pg); + if (data->ioc_pbuf1 == NULL) { /* unset */ + rc = echo_get_object(&eco, obd, &data->ioc_obdo1); + if (rc == 0) { + eco->eco_deleted = 1; + echo_put_object(eco); } + } else { + rc = echo_create_object(obd, 0, &data->ioc_obdo1, + data->ioc_pbuf1, data->ioc_plen1); } + GOTO (out, rc); + + case ECHO_IOC_ENQUEUE: + if (!capable (CAP_SYS_ADMIN)) + GOTO (out, rc = -EPERM); + + rc = echo_enqueue (exp, &data->ioc_obdo1, + data->ioc_conn1, /* lock mode */ + data->ioc_offset, data->ioc_count); /* extent */ + GOTO (out, rc); + + case ECHO_IOC_CANCEL: + rc = echo_cancel (exp, &data->ioc_obdo1); + GOTO (out, rc); - set->brw_callback = ll_brw_sync_wait; - rc = obd_brw(rw, &ec->conn, lsm, j, pga, set); - if (rc) - CERROR("test_brw: error from obd_brw: rc = %d\n", rc); - else { - rc = ll_brw_sync_wait(set, CB_PHASE_START); - if (rc) - CERROR("test_brw: error from callback: rc = " - "%d\n", rc); - } - EXIT; - brw_cleanup: - for (j = 0, pgp = pga; j < pages; j++, pgp++) { - if (pgp->pg == NULL) - continue; - - if (verify && !rc) { - void *addr = kmap(pgp->pg); - - rc = page_debug_check("test_brw", addr, - pgp->count, pgp->off, id); - kunmap(pgp->pg); - } - __free_pages(pgp->pg, 0); - } - brw_free: - obd_brw_set_free(set); - 
OBD_FREE(pga, pages * sizeof(*pga)); - GOTO(out, rc); - } default: CERROR ("echo_ioctl(): unrecognised ioctl %#x\n", cmd); GOTO (out, rc = -ENOTTY); @@ -197,6 +975,9 @@ static int echo_setup(struct obd_device *obddev, obd_count len, void *buf) struct obd_ioctl_data* data = buf; struct echo_client_obd *ec = &obddev->u.echo_client; struct obd_device *tgt; + struct obd_uuid uuid; + struct lov_stripe_md *lsm = NULL; + struct obd_uuid echo_uuid = { "ECHO_UUID" }; int rc; ENTRY; @@ -209,7 +990,8 @@ static int echo_setup(struct obd_device *obddev, obd_count len, void *buf) RETURN(-EINVAL); } - tgt = class_uuid2obd(data->ioc_inlbuf1); + obd_str2uuid(&uuid, data->ioc_inlbuf1); + tgt = class_uuid2obd(&uuid); if (!tgt || !(tgt->obd_flags & OBD_ATTACHED) || !(tgt->obd_flags & OBD_SET_UP)) { CERROR("device not attached or not set up (%d)\n", @@ -217,14 +999,33 @@ static int echo_setup(struct obd_device *obddev, obd_count len, void *buf) RETURN(rc = -EINVAL); } - rc = obd_connect(&ec->conn, tgt, NULL, NULL, NULL); - if (rc) + spin_lock_init (&ec->ec_lock); + INIT_LIST_HEAD (&ec->ec_objects); + ec->ec_unique = 0; + + rc = obd_connect(&ec->ec_conn, tgt, &echo_uuid, NULL, NULL); + if (rc) { CERROR("fail to connect to device %d\n", data->ioc_dev); + return (rc); + } + + ec->ec_lsmsize = obd_alloc_memmd (&ec->ec_conn, &lsm); + if (ec->ec_lsmsize < 0) { + CERROR ("Can't get # stripes: %d\n", rc); + obd_disconnect (&ec->ec_conn); + rc = ec->ec_lsmsize; + } else { + ec->ec_nstripes = lsm->lsm_stripe_count; + obd_free_memmd (&ec->ec_conn, &lsm); + } + RETURN(rc); } static int echo_cleanup(struct obd_device * obddev) { + struct list_head *el; + struct ec_object *eco; struct echo_client_obd *ec = &obddev->u.echo_client; int rc; ENTRY; @@ -234,20 +1035,90 @@ static int echo_cleanup(struct obd_device * obddev) RETURN(-EBUSY); } - rc = obd_disconnect(&ec->conn); - if (rc) { - CERROR("fail to disconnect device: %d\n", rc); - RETURN(-EINVAL); + /* XXX assuming sole access */ + while 
(!list_empty (&ec->ec_objects)) { + el = ec->ec_objects.next; + eco = list_entry (el, struct ec_object, eco_obj_chain); + + LASSERT (eco->eco_refcount == 0); + eco->eco_refcount = 1; + eco->eco_deleted = 1; + echo_put_object (eco); } - RETURN(0); + rc = obd_disconnect (&ec->ec_conn); + if (rc != 0) + CERROR("fail to disconnect device: %d\n", rc); + + RETURN (rc); } static int echo_connect(struct lustre_handle *conn, struct obd_device *src, - obd_uuid_t cluuid, struct recovd_obd *recovd, + struct obd_uuid *cluuid, struct recovd_obd *recovd, ptlrpc_recovery_cb_t recover) { - return class_connect(conn, src, cluuid); + struct obd_export *exp; + int rc; + + rc = class_connect(conn, src, cluuid); + if (rc == 0) { + exp = class_conn2export (conn); + INIT_LIST_HEAD (&exp->exp_ec_data.eced_open_head); + INIT_LIST_HEAD (&exp->exp_ec_data.eced_locks); + } + + RETURN (rc); +} + +static int echo_disconnect(struct lustre_handle *conn) +{ + struct obd_export *exp = class_conn2export (conn); + struct obd_device *obd; + struct echo_client_obd *ec; + struct ec_open_object *ecoo; + struct ec_lock *ecl; + int rc; + + if (exp == NULL) + return (-EINVAL); + + obd = exp->exp_obd; + ec = &obd->u.echo_client; + + /* no more contention on export's lock list */ + while (!list_empty (&exp->exp_ec_data.eced_locks)) { + ecl = list_entry (exp->exp_ec_data.eced_locks.next, + struct ec_lock, ecl_exp_chain); + list_del (&ecl->ecl_exp_chain); + + rc = obd_cancel (&ec->ec_conn, ecl->ecl_object->eco_lsm, + ecl->ecl_mode, &ecl->ecl_handle); + + CERROR ("Cancel lock on object "LPX64" on disconnect (%d)\n", + ecl->ecl_object->eco_id, rc); + + echo_put_object (ecl->ecl_object); + OBD_FREE (ecl, sizeof (*ecl)); + } + + /* no more contention on export's open handle list */ + while (!list_empty (&exp->exp_ec_data.eced_open_head)) { + ecoo = list_entry (exp->exp_ec_data.eced_open_head.next, + struct ec_open_object, ecoo_exp_chain); + list_del (&ecoo->ecoo_exp_chain); + + rc = obd_close (&ec->ec_conn, 
&ecoo->ecoo_oa, + ecoo->ecoo_object->eco_lsm, NULL); + + CDEBUG (D_INFO, "Closed object "LPX64" on disconnect (%d)\n", + ecoo->ecoo_oa.o_id, rc); + + echo_put_object (ecoo->ecoo_object); + OBD_FREE (ecoo, sizeof (*ecoo)); + } + + rc = class_disconnect (conn); + RETURN (rc); } static struct obd_ops echo_obd_ops = { @@ -256,14 +1127,15 @@ static struct obd_ops echo_obd_ops = { o_cleanup: echo_cleanup, o_iocontrol: echo_iocontrol, o_connect: echo_connect, - o_disconnect: class_disconnect + o_disconnect: echo_disconnect }; int echo_client_init(void) { - extern struct lprocfs_vars status_class_var[]; + struct lprocfs_static_vars lvars; - return class_register_type(&echo_obd_ops, status_class_var, + lprocfs_init_vars(&lvars); + return class_register_type(&echo_obd_ops, lvars.module_vars, OBD_ECHO_CLIENT_DEVICENAME); } diff --git a/lustre/obdecho/lproc_echo.c b/lustre/obdecho/lproc_echo.c index 449f9c5..bb2870a 100644 --- a/lustre/obdecho/lproc_echo.c +++ b/lustre/obdecho/lproc_echo.c @@ -1,7 +1,7 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (C) 2002 Cluster File Systems, Inc. + * Copyright (C) 2002, 2003 Cluster File Systems, Inc. * * This file is part of Lustre, http://www.lustre.org. 
* @@ -21,47 +21,33 @@ */ #define DEBUG_SUBSYSTEM S_ECHO -#include #include +#include +#ifndef LPROCFS +struct lprocfs_vars lprocfs_obd_vars[] = { {0} }; +struct lprocfs_vars lprocfs_module_vars[] = { {0} }; +#else -int rd_uuid(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - int len = 0; - struct obd_device* dev = (struct obd_device*)data; - len += snprintf(page, count, "%s\n", dev->obd_uuid); - return len; - -} - -int rd_fstype(char* page, char **start, off_t off, int count, int *eof, +int rd_fstype(char* page, char **start, off_t off, int count, int *eof, void *data) { - int len = 0; struct obd_device* dev = (struct obd_device*)data; - len += snprintf(page, count, "%s\n", dev->u.echo.eo_fstype); - return len; - + int rc = snprintf(page, count, "%s\n", dev->u.echo.eo_fstype); + *eof = 1; + return rc; } - -struct lprocfs_vars status_var_nm_1[] = { - {"status/uuid", rd_uuid, 0, 0}, - {"status/fstype", rd_fstype, 0, 0}, - {0} +struct lprocfs_vars lprocfs_obd_vars[] = { + { "uuid", lprocfs_rd_uuid, 0, 0 }, + { "fstype", rd_fstype, 0, 0 }, + { 0 } }; -int rd_numrefs(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - struct obd_type* class = (struct obd_type*)data; - int len = 0; - len += snprintf(page, count, "%d\n", class->typ_refcnt); - return len; -} - -struct lprocfs_vars status_class_var[] = { - {"status/num_refs", rd_numrefs, 0, 0}, - {0} +struct lprocfs_vars lprocfs_module_vars[] = { + { "num_refs", lprocfs_rd_numrefs, 0, 0 }, + { 0 } }; + +#endif /* LPROCFS */ +LPROCFS_INIT_VARS(lprocfs_module_vars, lprocfs_obd_vars) diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index 2d495b2..591005e 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -3,7 +3,7 @@ * * linux/fs/obdfilter/filter.c * - * Copyright (c) 2001, 2002 Cluster File Systems, Inc. + * Copyright (c) 2001-2003 Cluster File Systems, Inc. 
* Author: Peter Braam * Author: Andreas Dilger * @@ -24,8 +24,13 @@ */ /* - * Invariant: get O/R i_sem for lookup, if needed, before any journal ops + * Invariant: Get O/R i_sem for lookup, if needed, before any journal ops * (which need to get journal_lock, may block if journal full). + * + * Invariant: Call filter_start_transno() before any journal ops to avoid the + * same deadlock problem. We can (and want) to get rid of the + * transno sem in favour of the dir/inode i_sem to avoid single + * threaded operation on the OST. */ #define EXPORT_SYMTAB @@ -44,12 +49,141 @@ #include #include -extern struct lprocfs_vars status_class_var[]; -extern struct lprocfs_vars status_var_nm_1[]; static kmem_cache_t *filter_open_cache; static kmem_cache_t *filter_dentry_cache; +/* should be generic per-obd stats... */ +struct xprocfs_io_stat { + __u64 st_read_bytes; + __u64 st_read_reqs; + __u64 st_write_bytes; + __u64 st_write_reqs; + __u64 st_getattr_reqs; + __u64 st_setattr_reqs; + __u64 st_create_reqs; + __u64 st_destroy_reqs; + __u64 st_statfs_reqs; + __u64 st_open_reqs; + __u64 st_close_reqs; + __u64 st_punch_reqs; +}; + +static struct xprocfs_io_stat xprocfs_iostats[NR_CPUS]; +static struct proc_dir_entry *xprocfs_dir; + +#define XPROCFS_BUMP_MYCPU_IOSTAT(field, count) \ +do { \ + xprocfs_iostats[smp_processor_id()].field += (count); \ +} while (0) + +#define DECLARE_XPROCFS_SUM_STAT(field) \ +static long long \ +xprocfs_sum_##field (void) \ +{ \ + long long stat = 0; \ + int i; \ + \ + for (i = 0; i < smp_num_cpus; i++) \ + stat += xprocfs_iostats[i].field; \ + return (stat); \ +} + +DECLARE_XPROCFS_SUM_STAT (st_read_bytes) +DECLARE_XPROCFS_SUM_STAT (st_read_reqs) +DECLARE_XPROCFS_SUM_STAT (st_write_bytes) +DECLARE_XPROCFS_SUM_STAT (st_write_reqs) +DECLARE_XPROCFS_SUM_STAT (st_getattr_reqs) +DECLARE_XPROCFS_SUM_STAT (st_setattr_reqs) +DECLARE_XPROCFS_SUM_STAT (st_create_reqs) +DECLARE_XPROCFS_SUM_STAT (st_destroy_reqs) +DECLARE_XPROCFS_SUM_STAT (st_statfs_reqs) 
+DECLARE_XPROCFS_SUM_STAT (st_open_reqs) +DECLARE_XPROCFS_SUM_STAT (st_close_reqs) +DECLARE_XPROCFS_SUM_STAT (st_punch_reqs) + +static int +xprocfs_rd_stat (char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + long long (*fn)(void) = (long long(*)(void))data; + int len; + + *eof = 1; + if (off != 0) + return (0); + + len = snprintf (page, count, "%Ld\n", fn()); + *start = page; + return (len); +} + + +static void +xprocfs_add_stat(char *name, long long (*fn)(void)) +{ + struct proc_dir_entry *entry; + + entry = create_proc_entry (name, S_IFREG|S_IRUGO, xprocfs_dir); + if (entry == NULL) { + CERROR ("Can't add procfs stat %s\n", name); + return; + } + + entry->data = fn; + entry->read_proc = xprocfs_rd_stat; + entry->write_proc = NULL; +} + +static void +xprocfs_init (char *name) +{ + char dirname[64]; + + snprintf (dirname, sizeof (dirname), "sys/%s", name); + + xprocfs_dir = proc_mkdir ("sys/obdfilter", NULL); + if (xprocfs_dir == NULL) { + CERROR ("Can't make dir\n"); + return; + } + + xprocfs_add_stat ("read_bytes", xprocfs_sum_st_read_bytes); + xprocfs_add_stat ("read_reqs", xprocfs_sum_st_read_reqs); + xprocfs_add_stat ("write_bytes", xprocfs_sum_st_write_bytes); + xprocfs_add_stat ("write_reqs", xprocfs_sum_st_write_reqs); + xprocfs_add_stat ("getattr_reqs", xprocfs_sum_st_getattr_reqs); + xprocfs_add_stat ("setattr_reqs", xprocfs_sum_st_setattr_reqs); + xprocfs_add_stat ("create_reqs", xprocfs_sum_st_create_reqs); + xprocfs_add_stat ("destroy_reqs", xprocfs_sum_st_destroy_reqs); + xprocfs_add_stat ("statfs_reqs", xprocfs_sum_st_statfs_reqs); + xprocfs_add_stat ("open_reqs", xprocfs_sum_st_open_reqs); + xprocfs_add_stat ("close_reqs", xprocfs_sum_st_close_reqs); + xprocfs_add_stat ("punch_reqs", xprocfs_sum_st_punch_reqs); +} + +void xprocfs_fini (void) +{ + if (xprocfs_dir == NULL) + return; + + remove_proc_entry ("read_bytes", xprocfs_dir); + remove_proc_entry ("read_reqs", xprocfs_dir); + remove_proc_entry ("write_bytes", 
xprocfs_dir); + remove_proc_entry ("write_reqs", xprocfs_dir); + remove_proc_entry ("getattr_reqs", xprocfs_dir); + remove_proc_entry ("setattr_reqs", xprocfs_dir); + remove_proc_entry ("create_reqs", xprocfs_dir); + remove_proc_entry ("destroy_reqs", xprocfs_dir); + remove_proc_entry ("statfs_reqs", xprocfs_dir); + remove_proc_entry ("open_reqs", xprocfs_dir); + remove_proc_entry ("close_reqs", xprocfs_dir); + remove_proc_entry ("punch_reqs", xprocfs_dir); + + remove_proc_entry (xprocfs_dir->name, xprocfs_dir->parent); + xprocfs_dir = NULL; +} + #define S_SHIFT 12 static char *obd_type_by_mode[S_IFMT >> S_SHIFT] = { [0] NULL, @@ -67,6 +201,83 @@ static inline const char *obd_mode_to_type(int mode) return obd_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; } +static void filter_last_rcvd_cb(struct obd_device *obd, __u64 last_rcvd, + int error) +{ + CDEBUG(D_HA, "got callback for last_rcvd "LPD64": rc = %d\n", + last_rcvd, error); + if (!error && last_rcvd > obd->obd_last_committed) + obd->obd_last_committed = last_rcvd; +} + +void filter_start_transno(struct obd_export *export) +{ + struct obd_device * obd = export->exp_obd; + ENTRY; + + down(&obd->u.filter.fo_transno_sem); +} + +/* Assumes caller has already pushed us into the kernel context. */ +int filter_finish_transno(struct obd_export *export, void *handle, + struct obd_trans_info *oti, int rc) +{ + __u64 last_rcvd; + struct obd_device *obd = export->exp_obd; + struct filter_obd *filter = &obd->u.filter; + struct filter_export_data *fed = &export->exp_filter_data; + struct filter_client_data *fcd = fed->fed_fcd; + loff_t off; + ssize_t written; + + /* Propagate error code. */ + if (rc) + GOTO(out, rc); + + /* we don't allocate new transnos for replayed requests */ +#if 0 + /* perhaps if transno already set? or should level be in oti? 
*/ + if (req->rq_level == LUSTRE_CONN_RECOVD) + GOTO(out, rc = 0); +#endif + + off = FILTER_LR_CLIENT_START + fed->fed_lr_off * FILTER_LR_CLIENT_SIZE; + + last_rcvd = ++filter->fo_fsd->fsd_last_rcvd; + if (oti) + oti->oti_transno = last_rcvd; + fcd->fcd_last_rcvd = cpu_to_le64(last_rcvd); + fcd->fcd_mount_count = cpu_to_le64(filter->fo_fsd->fsd_mount_count); + + /* get this from oti */ +#if 0 + if (oti) + fcd->fcd_last_xid = cpu_to_le64(oti->oti_xid); + else +#else + fcd->fcd_last_xid = 0; +#endif + fsfilt_set_last_rcvd(obd, last_rcvd, handle, filter_last_rcvd_cb); + written = lustre_fwrite(filter->fo_rcvd_filp, (char *)fcd, sizeof(*fcd), + &off); + CDEBUG(D_INODE, "wrote trans #"LPD64" for client %s at #%d: written = " + LPSZ"\n", last_rcvd, fcd->fcd_uuid, fed->fed_lr_off, written); + + if (written == sizeof(*fcd)) + GOTO(out, rc = 0); + CERROR("error writing to last_rcvd file: rc = %d\n", rc); + if (written >= 0) + GOTO(out, rc = -EIO); + + rc = 0; + + EXIT; + out: + + up(&filter->fo_transno_sem); + return rc; +} + /* write the pathname into the string */ static int filter_id(char *buf, obd_id id, obd_mode mode) { @@ -95,6 +306,336 @@ struct dentry_operations filter_dops = { }; #define LAST_RCVD "last_rcvd" +#define INIT_OBJID 2 + +/* This limit is arbitrary, but for now we fit it in 1 page (32k clients) */ +#define FILTER_LR_MAX_CLIENTS (PAGE_SIZE * 8) +#define FILTER_LR_MAX_CLIENT_WORDS (FILTER_LR_MAX_CLIENTS/sizeof(unsigned long)) + +static unsigned long filter_last_rcvd_slots[FILTER_LR_MAX_CLIENT_WORDS]; + +/* Add client data to the FILTER. We use a bitmap to locate a free space + * in the last_rcvd file if cl_off is -1 (i.e. a new client). + * Otherwise, we have just read the data from the last_rcvd file and + * we know its offset. 
+ */ +int filter_client_add(struct filter_obd *filter, + struct filter_export_data *fed, int cl_off) +{ + int new_client = (cl_off == -1); + + /* the bitmap operations can handle cl_off > sizeof(long) * 8, so + * there's no need for extra complication here + */ + if (new_client) { + cl_off = find_first_zero_bit(filter_last_rcvd_slots, + FILTER_LR_MAX_CLIENTS); + repeat: + if (cl_off >= FILTER_LR_MAX_CLIENTS) { + CERROR("no client slots - fix FILTER_LR_MAX_CLIENTS\n"); + return -ENOMEM; + } + if (test_and_set_bit(cl_off, filter_last_rcvd_slots)) { + CERROR("FILTER client %d: found bit is set in bitmap\n", + cl_off); + cl_off = find_next_zero_bit(filter_last_rcvd_slots, + FILTER_LR_MAX_CLIENTS, + cl_off); + goto repeat; + } + } else { + if (test_and_set_bit(cl_off, filter_last_rcvd_slots)) { + CERROR("FILTER client %d: bit already set in bitmap!\n", + cl_off); + LBUG(); + } + } + + CDEBUG(D_INFO, "client at offset %d with UUID '%s' added\n", + cl_off, fed->fed_fcd->fcd_uuid); + + fed->fed_lr_off = cl_off; + + if (new_client) { + struct obd_run_ctxt saved; + loff_t off = FILTER_LR_CLIENT_START + + (cl_off * FILTER_LR_CLIENT_SIZE); + ssize_t written; + + push_ctxt(&saved, &filter->fo_ctxt, NULL); + written = lustre_fwrite(filter->fo_rcvd_filp, + (char *)fed->fed_fcd, + sizeof(*fed->fed_fcd), &off); + pop_ctxt(&saved, &filter->fo_ctxt, NULL); + + if (written != sizeof(*fed->fed_fcd)) { + if (written < 0) + RETURN(written); + RETURN(-EIO); + } + CDEBUG(D_INFO, "wrote client fcd at off %u (len %u)\n", + FILTER_LR_CLIENT_START + (cl_off*FILTER_LR_CLIENT_SIZE), + (unsigned int)sizeof(*fed->fed_fcd)); + } + return 0; +} + +int filter_client_free(struct obd_export *exp) +{ + struct filter_export_data *fed = &exp->exp_filter_data; + struct filter_obd *filter = &exp->exp_obd->u.filter; + struct filter_client_data zero_fcd; + struct obd_run_ctxt saved; + int written; + loff_t off; + + if (!fed->fed_fcd) + RETURN(0); + + off = FILTER_LR_CLIENT_START + 
(fed->fed_lr_off*FILTER_LR_CLIENT_SIZE); + + CDEBUG(D_INFO, "freeing client at offset %u (%lld)with UUID '%s'\n", + fed->fed_lr_off, off, fed->fed_fcd->fcd_uuid); + + if (!test_and_clear_bit(fed->fed_lr_off, filter_last_rcvd_slots)) { + CERROR("FILTER client %u: bit already clear in bitmap!!\n", + fed->fed_lr_off); + LBUG(); + } + + memset(&zero_fcd, 0, sizeof zero_fcd); + push_ctxt(&saved, &filter->fo_ctxt, NULL); + written = lustre_fwrite(filter->fo_rcvd_filp, (const char *)&zero_fcd, + sizeof(zero_fcd), &off); + + /* XXX: this write gets lost sometimes, unless this sync is here. */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + fsync_dev(filter->fo_rcvd_filp->f_dentry->d_inode->i_rdev); +#else + file_fsync(filter->fo_rcvd_filp, filter->fo_rcvd_filp->f_dentry, 1); +#endif + pop_ctxt(&saved, &filter->fo_ctxt, NULL); + + if (written != sizeof(zero_fcd)) { + CERROR("error zeroing out client %s off %d in %s: %d\n", + fed->fed_fcd->fcd_uuid, fed->fed_lr_off, LAST_RCVD, + written); + } else { + CDEBUG(D_INFO, + "zeroed disconnecting client %s at off %d ("LPX64")\n", + fed->fed_fcd->fcd_uuid, fed->fed_lr_off, off); + } + + OBD_FREE(fed->fed_fcd, sizeof(*fed->fed_fcd)); + + return 0; +} + +static void filter_unpack_fsd(struct filter_server_data *fsd) +{ + fsd->fsd_last_objid = le64_to_cpu(fsd->fsd_last_objid); + fsd->fsd_last_rcvd = le64_to_cpu(fsd->fsd_last_rcvd); + fsd->fsd_mount_count = le64_to_cpu(fsd->fsd_mount_count); +} + +static void filter_pack_fsd(struct filter_server_data *disk_fsd, + struct filter_server_data *fsd) +{ + memset(disk_fsd, 0, sizeof(*disk_fsd)); + memcpy(disk_fsd->fsd_uuid, fsd->fsd_uuid, sizeof(fsd->fsd_uuid)); + disk_fsd->fsd_last_objid = cpu_to_le64(fsd->fsd_last_objid); + disk_fsd->fsd_last_rcvd = cpu_to_le64(fsd->fsd_last_rcvd); + disk_fsd->fsd_mount_count = cpu_to_le64(fsd->fsd_mount_count); +} + +static int filter_free_server_data(struct filter_obd *filter) +{ + OBD_FREE(filter->fo_fsd, sizeof(*filter->fo_fsd)); + filter->fo_fsd = 
NULL; + + return 0; +} + + +/* assumes caller has already in kernel ctxt */ +static int filter_update_server_data(struct file *filp, + struct filter_server_data *fsd) +{ + struct filter_server_data disk_fsd; + loff_t off = 0; + int rc; + + CDEBUG(D_INODE, "server uuid : %s\n", fsd->fsd_uuid); + CDEBUG(D_INODE, "server last_objid: "LPU64"\n", fsd->fsd_last_objid); + CDEBUG(D_INODE, "server last_rcvd : "LPU64"\n", fsd->fsd_last_rcvd); + CDEBUG(D_INODE, "server last_mount: "LPU64"\n", fsd->fsd_mount_count); + + filter_pack_fsd(&disk_fsd, fsd); + rc = lustre_fwrite(filp, (char *)&disk_fsd, + sizeof(disk_fsd), &off); + if (rc != sizeof(disk_fsd)) { + CDEBUG(D_INODE, "error writing filter_server_data: rc = %d\n", + rc); + RETURN(-EIO); + } + RETURN(0); +} + +/* assumes caller has already in kernel ctxt */ +static int filter_init_server_data(struct obd_device *obd, + struct file * filp, + __u64 init_lastobjid) +{ + struct filter_obd *filter = &obd->u.filter; + struct filter_server_data *fsd; + struct filter_client_data *fcd = NULL; + struct inode *inode = filp->f_dentry->d_inode; + unsigned long last_rcvd_size = inode->i_size; + int cl_off; + loff_t off = 0; + int rc; + + /* ensure padding in the struct is the correct size */ + LASSERT (offsetof(struct filter_server_data, fsd_padding) + + sizeof(fsd->fsd_padding) == FILTER_LR_SERVER_SIZE); + LASSERT (offsetof(struct filter_client_data, fcd_padding) + + sizeof(fcd->fcd_padding) == FILTER_LR_CLIENT_SIZE); + + OBD_ALLOC(fsd, sizeof(*fsd)); + if (!fsd) + RETURN(-ENOMEM); + filter->fo_fsd = fsd; + + if (last_rcvd_size == 0) { + CERROR("%s: initializing new last_rcvd\n", obd->obd_name); + + memcpy(fsd->fsd_uuid, obd->obd_uuid.uuid,sizeof(fsd->fsd_uuid)); + fsd->fsd_last_objid = init_lastobjid; + fsd->fsd_last_rcvd = 0; + fsd->fsd_mount_count = 0; + + } else { + ssize_t retval = lustre_fread(filp, (char *)fsd, + sizeof(*fsd), + &off); + if (retval != sizeof(*fsd)) { + CDEBUG(D_INODE,"OBD filter: error reading lastobjid\n"); + 
GOTO(out, rc = -EIO); + } + filter_unpack_fsd(fsd); + } + + CDEBUG(D_INODE, "%s: server last_objid: "LPU64"\n", + obd->obd_name, fsd->fsd_last_objid); + CDEBUG(D_INODE, "%s: server last_rcvd : "LPU64"\n", + obd->obd_name, fsd->fsd_last_rcvd); + CDEBUG(D_INODE, "%s: server last_mount: "LPU64"\n", + obd->obd_name, fsd->fsd_mount_count); + + /* + * When we do a clean FILTER shutdown, we save the last_rcvd into + * the header. If we find clients with higher last_rcvd values + * then those clients may need recovery done. + */ + /* off is adjusted by lustre_fread, so we don't adjust it in the loop */ + for (off = FILTER_LR_CLIENT_START, cl_off = 0; off < last_rcvd_size; + cl_off++) { + __u64 last_rcvd; + int mount_age; + + if (!fcd) { + OBD_ALLOC(fcd, sizeof(*fcd)); + if (!fcd) + GOTO(err_fsd, rc = -ENOMEM); + } + + rc = lustre_fread(filp, (char *)fcd, sizeof(*fcd), &off); + if (rc != sizeof(*fcd)) { + CERROR("error reading FILTER %s offset %d: rc = %d\n", + LAST_RCVD, cl_off, rc); + if (rc > 0) /* XXX fatal error or just abort reading? */ + rc = -EIO; + break; + } + + if (fcd->fcd_uuid[0] == '\0') { + CDEBUG(D_INFO, "skipping zeroed client at offset %d\n", + cl_off); + continue; + } + + last_rcvd = le64_to_cpu(fcd->fcd_last_rcvd); + + /* These exports are cleaned up by filter_disconnect(), so they + * need to be set up like real exports as filter_connect() does. 
+ */ + mount_age = fsd->fsd_mount_count - + le64_to_cpu(fcd->fcd_mount_count); + if (mount_age < FILTER_MOUNT_RECOV) { + CERROR("RCVRNG CLIENT uuid: %s off: %d lr: "LPU64 + "srv lr: "LPU64" mnt: "LPU64" last mount: "LPU64 + "\n", fcd->fcd_uuid, cl_off, + last_rcvd, fsd->fsd_last_rcvd, + le64_to_cpu(fcd->fcd_mount_count), + fsd->fsd_mount_count); +#if 0 + /* disabled until OST recovery is actually working */ + struct obd_export *exp = class_new_export(obd); + struct filter_export_data *fed; + + if (!exp) { + rc = -ENOMEM; + break; + } + + fed = &exp->exp_filter_data; + fed->fed_fcd = fcd; + filter_client_add(filter, fed, cl_off); + /* create helper if export init gets more complex */ + INIT_LIST_HEAD(&fed->fed_open_head); + spin_lock_init(&fed->fed_lock); + + fcd = NULL; + filter->fo_recoverable_clients++; +#endif + } else { + CDEBUG(D_INFO, + "discarded client %d, UUID '%s', count %Ld\n", + cl_off, fcd->fcd_uuid, + (long long)le64_to_cpu(fcd->fcd_mount_count)); + } + + CDEBUG(D_OTHER, "client at offset %d has last_rcvd = %Lu\n", + cl_off, (unsigned long long)last_rcvd); + + if (last_rcvd > filter->fo_fsd->fsd_last_rcvd) + filter->fo_fsd->fsd_last_rcvd = last_rcvd; + } + + obd->obd_last_committed = filter->fo_fsd->fsd_last_rcvd; + if (filter->fo_recoverable_clients) { + CERROR("RECOVERY: %d recoverable clients, last_rcvd "LPU64"\n", + filter->fo_recoverable_clients, + filter->fo_fsd->fsd_last_rcvd); + filter->fo_next_recovery_transno = obd->obd_last_committed + 1; + obd->obd_flags |= OBD_RECOVERING; + } + + if (fcd) + OBD_FREE(fcd, sizeof(*fcd)); + + fsd->fsd_mount_count++; + + /* save it,so mount count and last_recvd is current */ + rc = filter_update_server_data(filp, filter->fo_fsd); + +out: + RETURN(rc); + +err_fsd: + filter_free_server_data(filter); + RETURN(rc); +} /* setup the object store with correct subdirectories */ static int filter_prep(struct obd_device *obd) @@ -105,7 +646,6 @@ static int filter_prep(struct obd_device *obd) struct file *file; struct 
inode *inode; int rc = 0; - __u64 lastobjid = 2; int mode = 0; push_ctxt(&saved, &filter->fo_ctxt, NULL); @@ -147,36 +687,29 @@ static int filter_prep(struct obd_device *obd) GOTO(out_O_mode, rc); } + if (!S_ISREG(file->f_dentry->d_inode->i_mode)) { + CERROR("%s is not a regular file!: mode = %o\n", LAST_RCVD, + file->f_dentry->d_inode->i_mode); + GOTO(err_filp, rc = -ENOENT); + } + + rc = fsfilt_journal_data(obd, file); + if (rc) { + CERROR("cannot journal data on %s: rc = %d\n", LAST_RCVD, rc); + GOTO(err_filp, rc); + } /* steal operations */ inode = file->f_dentry->d_inode; filter->fo_fop = file->f_op; filter->fo_iop = inode->i_op; filter->fo_aops = inode->i_mapping->a_ops; - if (inode->i_size == 0) { - __u64 disk_lastobjid = cpu_to_le64(lastobjid); - ssize_t retval = file->f_op->write(file,(char *)&disk_lastobjid, - sizeof(disk_lastobjid), - &file->f_pos); - if (retval != sizeof(disk_lastobjid)) { - CDEBUG(D_INODE,"OBD filter: error writing lastobjid\n"); - filp_close(file, 0); - GOTO(out_O_mode, rc = -EIO); - } - } else { - __u64 disk_lastobjid; - ssize_t retval = file->f_op->read(file, (char *)&disk_lastobjid, - sizeof(disk_lastobjid), - &file->f_pos); - if (retval != sizeof(disk_lastobjid)) { - CDEBUG(D_INODE,"OBD filter: error reading lastobjid\n"); - filp_close(file, 0); - GOTO(out_O_mode, rc = -EIO); - } - lastobjid = le64_to_cpu(disk_lastobjid); + rc = filter_init_server_data(obd, file, INIT_OBJID); + if (rc) { + CERROR("cannot read %s: rc = %d\n", LAST_RCVD, rc); + GOTO(err_client, rc); } - filter->fo_lastobjid = lastobjid; - filp_close(file, 0); + filter->fo_rcvd_filp = file; rc = 0; out: @@ -184,6 +717,12 @@ static int filter_prep(struct obd_device *obd) return(rc); +err_client: + class_disconnect_all(obd); +err_filp: + if (filp_close(file, 0)) + CERROR("can't close %s after error\n", LAST_RCVD); + filter->fo_rcvd_filp = NULL; out_O_mode: while (mode-- > 0) { struct dentry *dentry = filter->fo_dentry_O_mode[mode]; @@ -202,28 +741,33 @@ static void 
filter_post(struct obd_device *obd) { struct obd_run_ctxt saved; struct filter_obd *filter = &obd->u.filter; - __u64 disk_lastobjid; long rc; - struct file *file; int mode; - push_ctxt(&saved, &filter->fo_ctxt, NULL); - file = filp_open(LAST_RCVD, O_RDWR | O_CREAT, 0700); - if (IS_ERR(file)) { - CERROR("OBD filter: cannot create %s\n", LAST_RCVD); - goto out; - } + /* XXX: filter_update_lastobjid used to call fsync_dev. It might be + * best to start a transaction with h_sync, because we removed this + * from lastobjid */ - file->f_pos = 0; - disk_lastobjid = cpu_to_le64(filter->fo_lastobjid); - rc = file->f_op->write(file, (char *)&disk_lastobjid, - sizeof(disk_lastobjid), &file->f_pos); - if (rc != sizeof(disk_lastobjid)) + push_ctxt(&saved, &filter->fo_ctxt, NULL); + rc = filter_update_server_data(filter->fo_rcvd_filp, filter->fo_fsd); + if (rc) CERROR("OBD filter: error writing lastobjid: rc = %ld\n", rc); + filter_free_server_data(filter); - rc = filp_close(file, NULL); - if (rc) - CERROR("OBD filter: cannot close status file: rc = %ld\n", rc); + + if (filter->fo_rcvd_filp) { + /* broken sync at umount bug workaround */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + rc = fsync_dev(filter->fo_rcvd_filp->f_dentry->d_inode->i_rdev); +#else + rc = file_fsync(filter->fo_rcvd_filp, + filter->fo_rcvd_filp->f_dentry, 1); +#endif + filp_close(filter->fo_rcvd_filp, 0); + filter->fo_rcvd_filp = NULL; + if (rc) + CERROR("last_rcvd file won't closek rc = %ld\n", rc); + } for (mode = 0; mode < (S_IFMT >> S_SHIFT); mode++) { struct dentry *dentry = filter->fo_dentry_O_mode[mode]; @@ -233,7 +777,6 @@ static void filter_post(struct obd_device *obd) } } f_dput(filter->fo_dentry_O); -out: pop_ctxt(&saved, &filter->fo_ctxt, NULL); } @@ -241,9 +784,10 @@ out: static __u64 filter_next_id(struct obd_device *obd) { obd_id id; + LASSERT(obd->u.filter.fo_fsd != NULL); spin_lock(&obd->u.filter.fo_objidlock); - id = ++obd->u.filter.fo_lastobjid; + id = 
++obd->u.filter.fo_fsd->fsd_last_objid; spin_unlock(&obd->u.filter.fo_objidlock); return id; @@ -253,7 +797,7 @@ static __u64 filter_next_id(struct obd_device *obd) /* parent i_sem is already held if needed for exclusivity */ static struct dentry *filter_fid2dentry(struct obd_device *obd, struct dentry *dparent, - __u64 id, int locked) + __u64 id, int lockit) { struct super_block *sb = obd->u.filter.fo_sb; struct dentry *dchild; @@ -273,13 +817,13 @@ static struct dentry *filter_fid2dentry(struct obd_device *obd, } len = sprintf(name, LPU64, id); - CDEBUG(D_INODE, "opening object O/%*s/%s\n", + CDEBUG(D_INODE, "looking up object O/%*s/%s\n", dparent->d_name.len, dparent->d_name.name, name); - //if (!locked) - //down(&dparent->d_inode->i_sem); + if (lockit) + down(&dparent->d_inode->i_sem); dchild = lookup_one_len(name, dparent, len); - //if (!locked) - //up(&dparent->d_inode->i_sem); + if (lockit) + up(&dparent->d_inode->i_sem); if (IS_ERR(dchild)) { CERROR("child lookup error %ld\n", PTR_ERR(dchild)); RETURN(dchild); @@ -389,9 +933,9 @@ static struct file *filter_obj_open(struct obd_export *export, spin_unlock(&fed->fed_lock); CDEBUG(D_INODE, "opened objid "LPX64": rc = %p\n", id, file); - + EXIT; out: - RETURN(file); + return file; out_fdd: kmem_cache_free(filter_dentry_cache, fdd); @@ -402,11 +946,11 @@ out_ffd: } /* Caller must hold i_sem on dir_dentry->d_inode */ +/* Caller must push us into kernel context */ static int filter_destroy_internal(struct obd_device *obd, struct dentry *dir_dentry, struct dentry *object_dentry) { - struct obd_run_ctxt saved; struct inode *inode = object_dentry->d_inode; int rc; ENTRY; @@ -418,9 +962,7 @@ static int filter_destroy_internal(struct obd_device *obd, inode->i_nlink, atomic_read(&inode->i_count)); } - push_ctxt(&saved, &obd->u.filter.fo_ctxt, NULL); rc = vfs_unlink(dir_dentry->d_inode, object_dentry); - pop_ctxt(&saved, &obd->u.filter.fo_ctxt, NULL); if (rc) CERROR("error unlinking objid %*s: rc %d\n", @@ -430,13 +972,16 
@@ static int filter_destroy_internal(struct obd_device *obd, RETURN(rc); } -static int filter_close_internal(struct obd_device *obd, - struct filter_file_data *ffd) +static int filter_close_internal(struct obd_export *export, + struct filter_file_data *ffd, + struct obd_trans_info *oti) { + struct obd_device *obd = export->exp_obd; + struct filter_obd *filter = &obd->u.filter; struct file *filp = ffd->ffd_file; struct dentry *object_dentry = dget(filp->f_dentry); struct filter_dentry_data *fdd = object_dentry->d_fsdata; - int rc, rc2 = 0; + int rc, rc2; ENTRY; LASSERT(filp->private_data == ffd); @@ -447,14 +992,32 @@ static int filter_close_internal(struct obd_device *obd, if (atomic_dec_and_test(&fdd->fdd_open_count) && fdd->fdd_flags & FILTER_FLAG_DESTROY) { struct dentry *dir_dentry = filter_parent(obd, S_IFREG); + struct obd_run_ctxt saved; + void *handle; down(&dir_dentry->d_inode->i_sem); - /* XXX start transaction */ + push_ctxt(&saved, &filter->fo_ctxt, NULL); + filter_start_transno(export); + handle = fsfilt_start(obd, dir_dentry->d_inode, + FSFILT_OP_UNLINK); + if (IS_ERR(handle)) { + rc = filter_finish_transno(export, handle, oti, + PTR_ERR(handle)); + GOTO(out, rc); + } /* XXX unlink from PENDING directory now too */ rc2 = filter_destroy_internal(obd, dir_dentry, object_dentry); - /* XXX finish transaction */ if (rc2 && !rc) rc = rc2; + rc = filter_finish_transno(export, handle, oti, rc); + rc2 = fsfilt_commit(obd, dir_dentry->d_inode, handle); + if (rc2) { + CERROR("error on commit, err = %d\n", rc2); + if (!rc) + rc = rc2; + } + out: + pop_ctxt(&saved, &filter->fo_ctxt, NULL); up(&dir_dentry->d_inode->i_sem); } @@ -474,20 +1037,22 @@ static int filter_setup(struct obd_device *obd, obd_count len, void *buf) int rc = 0; ENTRY; - MOD_INC_USE_COUNT; if (!data->ioc_inlbuf1 || !data->ioc_inlbuf2) - GOTO(err_dec, rc = -EINVAL); + RETURN(rc = -EINVAL); obd->obd_fsops = fsfilt_get_ops(data->ioc_inlbuf2); if (IS_ERR(obd->obd_fsops)) - GOTO(err_dec, rc = 
PTR_ERR(obd->obd_fsops)); + RETURN(rc = PTR_ERR(obd->obd_fsops)); mnt = do_kern_mount(data->ioc_inlbuf2, 0, data->ioc_inlbuf1, NULL); rc = PTR_ERR(mnt); if (IS_ERR(mnt)) GOTO(err_ops, rc); + obd->obd_flags |= OBD_REPLAYABLE; + filter = &obd->u.filter;; + init_MUTEX(&filter->fo_transno_sem); filter->fo_vfsmnt = mnt; filter->fo_fstype = strdup(data->ioc_inlbuf2); filter->fo_sb = mnt->mnt_root->d_inode->i_sb; @@ -526,8 +1091,6 @@ err_kfree: lock_kernel(); err_ops: fsfilt_put_ops(obd->obd_fsops); -err_dec: - MOD_DEC_USE_COUNT; return rc; } @@ -563,45 +1126,73 @@ static int filter_cleanup(struct obd_device *obd) lock_kernel(); - MOD_DEC_USE_COUNT; RETURN(0); } int filter_attach(struct obd_device *dev, obd_count len, void *data) { - return lprocfs_reg_obd(dev, status_var_nm_1, dev); + struct lprocfs_static_vars lvars; + + lprocfs_init_vars(&lvars); + return lprocfs_obd_attach(dev, lvars.obd_vars); } int filter_detach(struct obd_device *dev) { - return lprocfs_dereg_obd(dev); + return lprocfs_obd_detach(dev); } +/* nearly identical to mds_connect */ static int filter_connect(struct lustre_handle *conn, struct obd_device *obd, - obd_uuid_t cluuid, struct recovd_obd *recovd, + struct obd_uuid *cluuid, struct recovd_obd *recovd, ptlrpc_recovery_cb_t recover) { struct obd_export *exp; + struct filter_export_data *fed; + struct filter_client_data *fcd; + struct filter_obd *filter = &obd->u.filter; int rc; ENTRY; - MOD_INC_USE_COUNT; + + if (!conn || !obd || !cluuid) + RETURN(-EINVAL); + rc = class_connect(conn, obd, cluuid); if (rc) - GOTO(out_dec, rc); + RETURN(rc); exp = class_conn2export(conn); LASSERT(exp); + fed = &exp->exp_filter_data; + + OBD_ALLOC(fcd, sizeof(*fcd)); + if (!fcd) { + CERROR("filter: out of memory for client data\n"); + GOTO(out_export, rc = -ENOMEM); + } + + memcpy(fcd->fcd_uuid, cluuid, sizeof(fcd->fcd_uuid)); + fed->fed_fcd = fcd; + fcd->fcd_mount_count = cpu_to_le64(filter->fo_fsd->fsd_mount_count); 
INIT_LIST_HEAD(&exp->exp_filter_data.fed_open_head); spin_lock_init(&exp->exp_filter_data.fed_lock); -out: + + rc = filter_client_add(filter, fed, -1); + if (rc) + GOTO(out_fcd, rc); + RETURN(rc); -out_dec: - MOD_DEC_USE_COUNT; - goto out; +out_fcd: + OBD_FREE(fcd, sizeof(*fcd)); +out_export: + class_disconnect(conn); + + RETURN(rc); } +/* also incredibly similar to mds_disconnect */ static int filter_disconnect(struct lustre_handle *conn) { struct obd_export *exp = class_conn2export(conn); @@ -620,19 +1211,20 @@ static int filter_disconnect(struct lustre_handle *conn) list_del(&ffd->ffd_export_list); spin_unlock(&fed->fed_lock); - CERROR("force closing file %*s on disconnect\n", + CERROR("force close file %*s (hdl %p:"LPX64") on disconnect\n", ffd->ffd_file->f_dentry->d_name.len, - ffd->ffd_file->f_dentry->d_name.name); + ffd->ffd_file->f_dentry->d_name.name, + ffd, ffd->ffd_servercookie); - filter_close_internal(exp->exp_obd, ffd); + filter_close_internal(exp, ffd, NULL); spin_lock(&fed->fed_lock); } spin_unlock(&fed->fed_lock); ldlm_cancel_locks_for_export(exp); + filter_client_free(exp); + rc = class_disconnect(conn); - if (!rc) - MOD_DEC_USE_COUNT; /* XXX cleanup preallocated inodes */ RETURN(rc); @@ -709,6 +1301,7 @@ static struct dentry *__filter_oa2dentry(struct lustre_handle *conn, if (!dentry->d_inode) { CERROR("%s on non-existent object: "LPX64"\n", what, oa->o_id); f_dput(dentry); + LBUG(); RETURN(ERR_PTR(-ENOENT)); } @@ -725,7 +1318,9 @@ static int filter_getattr(struct lustre_handle *conn, struct obdo *oa, int rc = 0; ENTRY; - dentry = filter_oa2dentry(conn, oa, 0); + XPROCFS_BUMP_MYCPU_IOSTAT (st_getattr_reqs, 1); + + dentry = filter_oa2dentry(conn, oa, 1); if (IS_ERR(dentry)) RETURN(PTR_ERR(dentry)); @@ -737,16 +1332,21 @@ static int filter_getattr(struct lustre_handle *conn, struct obdo *oa, /* this is called from filter_truncate() until we have filter_punch() */ static int filter_setattr(struct lustre_handle *conn, struct obdo *oa, - struct 
lov_stripe_md *md) + struct lov_stripe_md *md, struct obd_trans_info *oti) { struct obd_run_ctxt saved; + struct obd_export *export = class_conn2export(conn); struct obd_device *obd = class_conn2obd(conn); + struct filter_obd *filter = &obd->u.filter; struct dentry *dentry; struct iattr iattr; struct inode *inode; - int rc; + void * handle; + int rc, rc2; ENTRY; + XPROCFS_BUMP_MYCPU_IOSTAT (st_setattr_reqs, 1); + dentry = filter_oa2dentry(conn, oa, 0); if (IS_ERR(dentry)) @@ -756,17 +1356,29 @@ static int filter_setattr(struct lustre_handle *conn, struct obdo *oa, iattr.ia_mode = (iattr.ia_mode & ~S_IFMT) | S_IFREG; inode = dentry->d_inode; - push_ctxt(&saved, &obd->u.filter.fo_ctxt, NULL); + push_ctxt(&saved, &filter->fo_ctxt, NULL); lock_kernel(); if (iattr.ia_valid & ATTR_SIZE) down(&inode->i_sem); - /* XXX start transaction */ + filter_start_transno(export); + handle = fsfilt_start(obd, dentry->d_inode, FSFILT_OP_SETATTR); + if (IS_ERR(handle)) { + rc = filter_finish_transno(export, handle, oti,PTR_ERR(handle)); + GOTO(out_unlock, rc); + } + if (inode->i_op->setattr) rc = inode->i_op->setattr(dentry, &iattr); else rc = inode_setattr(inode, &iattr); - /* XXX update last_rcvd, finish transaction */ + rc = filter_finish_transno(export, handle, oti, rc); + rc2 = fsfilt_commit(obd, dentry->d_inode, handle); + if (rc2) { + CERROR("error on commit, err = %d\n", rc2); + if (!rc) + rc = rc2; + } if (iattr.ia_valid & ATTR_SIZE) { up(&inode->i_sem); @@ -774,15 +1386,16 @@ static int filter_setattr(struct lustre_handle *conn, struct obdo *oa, obdo_from_inode(oa, inode, oa->o_valid); } +out_unlock: unlock_kernel(); - pop_ctxt(&saved, &obd->u.filter.fo_ctxt, NULL); + pop_ctxt(&saved, &filter->fo_ctxt, NULL); f_dput(dentry); RETURN(rc); } static int filter_open(struct lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md *ea) + struct lov_stripe_md *ea, struct obd_trans_info *oti) { struct obd_export *export; struct lustre_handle *handle; @@ -797,6 +1410,8 @@ static 
int filter_open(struct lustre_handle *conn, struct obdo *oa, RETURN(-EINVAL); } + XPROCFS_BUMP_MYCPU_IOSTAT (st_open_reqs, 1); + filp = filter_obj_open(export, oa->o_id, oa->o_mode); if (IS_ERR(filp)) GOTO(out, rc = PTR_ERR(filp)); @@ -814,7 +1429,7 @@ out: } /* filter_open */ static int filter_close(struct lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md *ea) + struct lov_stripe_md *ea, struct obd_trans_info *oti) { struct obd_export *exp; struct filter_file_data *ffd; @@ -828,6 +1443,8 @@ static int filter_close(struct lustre_handle *conn, struct obdo *oa, RETURN(-EINVAL); } + XPROCFS_BUMP_MYCPU_IOSTAT (st_close_reqs, 1); + if (!(oa->o_valid & OBD_MD_FLHANDLE)) { CERROR("no handle for close of objid "LPX64"\n", oa->o_id); RETURN(-EINVAL); @@ -846,20 +1463,23 @@ static int filter_close(struct lustre_handle *conn, struct obdo *oa, list_del(&ffd->ffd_export_list); spin_unlock(&fed->fed_lock); - rc = filter_close_internal(exp->exp_obd, ffd); + rc = filter_close_internal(exp, ffd, oti); RETURN(rc); } /* filter_close */ static int filter_create(struct lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md **ea) + struct lov_stripe_md **ea, struct obd_trans_info *oti) { + struct obd_export *export = class_conn2export(conn); struct obd_device *obd = class_conn2obd(conn); + struct filter_obd *filter = &obd->u.filter; struct obd_run_ctxt saved; struct dentry *dir_dentry; struct dentry *new; struct iattr; - int rc; + void *handle; + int err, rc; ENTRY; if (!obd) { @@ -867,12 +1487,14 @@ static int filter_create(struct lustre_handle *conn, struct obdo *oa, return -EINVAL; } + XPROCFS_BUMP_MYCPU_IOSTAT (st_create_reqs, 1); + oa->o_id = filter_next_id(obd); - push_ctxt(&saved, &obd->u.filter.fo_ctxt, NULL); + push_ctxt(&saved, &filter->fo_ctxt, NULL); dir_dentry = filter_parent(obd, oa->o_mode); down(&dir_dentry->d_inode->i_sem); - new = filter_fid2dentry(obd, dir_dentry, oa->o_id, 1); + new = filter_fid2dentry(obd, dir_dentry, oa->o_id, 0); if (IS_ERR(new)) 
GOTO(out, rc = PTR_ERR(new)); @@ -885,11 +1507,32 @@ static int filter_create(struct lustre_handle *conn, struct obdo *oa, GOTO(out, rc = -EEXIST); } - /* XXX start transaction */ + filter_start_transno(export); + handle = fsfilt_start(obd, dir_dentry->d_inode, FSFILT_OP_CREATE); + if (IS_ERR(handle)) { + rc = filter_finish_transno(export, handle, oti,PTR_ERR(handle)); + GOTO(out_put, rc); + } rc = vfs_create(dir_dentry->d_inode, new, oa->o_mode); if (rc) + CERROR("create failed rc = %d\n", rc); + + rc = filter_finish_transno(export, handle, oti, rc); + err = filter_update_server_data(filter->fo_rcvd_filp, filter->fo_fsd); + if (err) { + CERROR("unable to write lastobjid but file created\n"); + if (!rc) + rc = err; + } + err = fsfilt_commit(obd, dir_dentry->d_inode, handle); + if (err) { + CERROR("error on commit, err = %d\n", err); + if (!rc) + rc = err; + } + + if (rc) GOTO(out_put, rc); - /* XXX update last_rcvd+lastobjid on disk, finish transaction */ /* Set flags for fields we have set in the inode struct */ oa->o_valid = OBD_MD_FLID | OBD_MD_FLBLKSZ | OBD_MD_FLBLOCKS | @@ -901,17 +1544,21 @@ out_put: f_dput(new); out: up(&dir_dentry->d_inode->i_sem); - pop_ctxt(&saved, &obd->u.filter.fo_ctxt, NULL); + pop_ctxt(&saved, &filter->fo_ctxt, NULL); return rc; } static int filter_destroy(struct lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md *ea) + struct lov_stripe_md *ea, struct obd_trans_info *oti) { + struct obd_export *export = class_conn2export(conn); struct obd_device *obd = class_conn2obd(conn); + struct filter_obd *filter = &obd->u.filter; struct dentry *dir_dentry, *object_dentry; struct filter_dentry_data *fdd; - int rc; + struct obd_run_ctxt saved; + void *handle; + int rc, rc2; ENTRY; if (!obd) { @@ -919,17 +1566,26 @@ static int filter_destroy(struct lustre_handle *conn, struct obdo *oa, RETURN(-EINVAL); } + XPROCFS_BUMP_MYCPU_IOSTAT (st_destroy_reqs, 1); + CDEBUG(D_INODE, "destroying objid "LPX64"\n", oa->o_id); dir_dentry = 
filter_parent(obd, oa->o_mode); down(&dir_dentry->d_inode->i_sem); - object_dentry = filter_oa2dentry(conn, oa, 1); + object_dentry = filter_oa2dentry(conn, oa, 0); if (IS_ERR(object_dentry)) GOTO(out, rc = -ENOENT); + push_ctxt(&saved, &filter->fo_ctxt, NULL); + filter_start_transno(export); + handle = fsfilt_start(obd, dir_dentry->d_inode, FSFILT_OP_UNLINK); + if (IS_ERR(handle)) { + rc = filter_finish_transno(export, handle, oti,PTR_ERR(handle)); + GOTO(out_ctxt, rc); + } + fdd = object_dentry->d_fsdata; - /* XXX start transaction */ if (fdd && atomic_read(&fdd->fdd_open_count)) { if (!(fdd->fdd_flags & FILTER_FLAG_DESTROY)) { fdd->fdd_flags |= FILTER_FLAG_DESTROY; @@ -941,12 +1597,22 @@ static int filter_destroy(struct lustre_handle *conn, struct obdo *oa, CDEBUG(D_INODE, "repeat destroy of %dx open objid "LPX64"\n", atomic_read(&fdd->fdd_open_count), oa->o_id); - GOTO(out_dput, rc = 0); + GOTO(out_commit, rc = 0); } rc = filter_destroy_internal(obd, dir_dentry, object_dentry); -out_dput: - /* XXX update last_rcvd on disk, finish transaction */ + +out_commit: + /* XXX save last_rcvd on disk */ + rc = filter_finish_transno(export, handle, oti, rc); + rc2 = fsfilt_commit(obd, dir_dentry->d_inode, handle); + if (rc2) { + CERROR("error on commit, err = %d\n", rc2); + if (!rc) + rc = rc2; + } +out_ctxt: + pop_ctxt(&saved, &filter->fo_ctxt, NULL); f_dput(object_dentry); EXIT; @@ -958,18 +1624,21 @@ out: /* NB start and end are used for punch, but not truncate */ static int filter_truncate(struct lustre_handle *conn, struct obdo *oa, struct lov_stripe_md *lsm, - obd_off start, obd_off end) + obd_off start, obd_off end, + struct obd_trans_info *oti) { int error; ENTRY; + XPROCFS_BUMP_MYCPU_IOSTAT (st_punch_reqs, 1); + if (end != OBD_OBJECT_EOF) CERROR("PUNCH not supported, only truncate works\n"); CDEBUG(D_INODE, "calling truncate for object "LPX64", valid = %x, " "o_size = "LPD64"\n", oa->o_id, oa->o_valid, start); oa->o_size = start; - error = filter_setattr(conn, 
oa, NULL); + error = filter_setattr(conn, oa, NULL, oti); RETURN(error); } @@ -1054,11 +1723,31 @@ int waitfor_one_page(struct page *page) } #endif -static int lustre_commit_write(struct page *page, unsigned from, unsigned to) +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +/* We should only change the file mtime (and not the ctime, like + * update_inode_times() in generic_file_write()) when we only change data. + */ +static inline void inode_update_time(struct inode *inode, int ctime_too) +{ + time_t now = CURRENT_TIME; + if (inode->i_mtime == now && (!ctime_too || inode->i_ctime == now)) + return; + inode->i_mtime = now; + if (ctime_too) + inode->i_ctime = now; + mark_inode_dirty_sync(inode); +} +#endif + +static int lustre_commit_write(struct niobuf_local *lnb) { + struct page *page = lnb->page; + unsigned from = lnb->offset & ~PAGE_MASK; + unsigned to = from + lnb->len; struct inode *inode = page->mapping->host; int err; + LASSERT(to <= PAGE_SIZE); err = page->mapping->a_ops->commit_write(NULL, page, from, to); if (!err && IS_SYNC(inode)) err = waitfor_one_page(page); @@ -1098,7 +1787,7 @@ struct page *filter_get_page_write(struct inode *inode, */ if (!page) { unsigned long addr; - CDEBUG(D_PAGE, "ino %lu page %ld locked\n", inode->i_ino,index); + CDEBUG(D_ERROR,"ino %lu page %ld locked\n", inode->i_ino,index); addr = __get_free_pages(GFP_KERNEL, 0); /* locked page */ if (!addr) { CERROR("no memory for a temp page\n"); @@ -1151,38 +1840,38 @@ err: * pages, and the filesystems mark these buffers as BH_New if they * were newly allocated from disk. We use the BH_New flag similarly. 
*/ -static int filter_commit_write(struct page *page, unsigned from, unsigned to, - int err) +static int filter_commit_write(struct niobuf_local *lnb, int err) { #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) if (err) { unsigned block_start, block_end; - struct buffer_head *bh, *head = page->buffers; + struct buffer_head *bh, *head = lnb->page->buffers; unsigned blocksize = head->b_size; - void *addr = page_address(page); /* debugging: just seeing if this ever happens */ CERROR("called filter_commit_write for ino %lu:%lu on err %d\n", - page->mapping->host->i_ino, page->index, err); + lnb->page->mapping->host->i_ino, lnb->page->index, err); /* Currently one buffer per page, but in the future... */ for (bh = head, block_start = 0; bh != head || !block_start; block_start = block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; if (buffer_new(bh)) - memset(addr + block_start, 0, blocksize); + memset(lnb->addr + block_start, 0, blocksize); } } #endif - return lustre_commit_write(page, from, to); + return lustre_commit_write(lnb); } static int filter_preprw(int cmd, struct lustre_handle *conn, int objcount, struct obd_ioobj *obj, int niocount, struct niobuf_remote *nb, - struct niobuf_local *res, void **desc_private) + struct niobuf_local *res, void **desc_private, + struct obd_trans_info *oti) { struct obd_run_ctxt saved; + struct obd_export *export; struct obd_device *obd; struct obd_ioobj *o; struct niobuf_remote *rnb = nb; @@ -1194,8 +1883,14 @@ static int filter_preprw(int cmd, struct lustre_handle *conn, int i; ENTRY; + if ((cmd & OBD_BRW_WRITE) != 0) + XPROCFS_BUMP_MYCPU_IOSTAT (st_write_reqs, 1); + else + XPROCFS_BUMP_MYCPU_IOSTAT (st_read_reqs, 1); + memset(res, 0, niocount * sizeof(*res)); + export = class_conn2export(conn); obd = class_conn2obd(conn); if (!obd) { CDEBUG(D_IOCTL, "invalid client "LPX64"\n", conn->addr); @@ -1238,6 +1933,22 @@ static int filter_preprw(int cmd, struct lustre_handle *conn, } if (cmd & OBD_BRW_WRITE) { +#warning 
"FIXME: we need to get inode->i_sem for each object here" + /* Even worse, we need to get locks on mulitple inodes (in + * order) or use the DLM to do the locking for us (and use + * the same locking in filter_setattr() for truncate. The + * handling gets very ugly when dealing with locked pages. + * It may be easier to just get rid of the locked page code + * (which has problems of its own) and either discover we do + * not need it anymore (i.e. it was a symptom of another bug) + * or ensure we get the page locks in an appropriate order. + */ + /* Danger, Will Robinson! You are taking a lock here and also + * starting a transaction and releasing/finishing then in + * filter_commitrw(), so you must call fsfilt_commit() and + * finish_transno() if an error occurs in this function. + */ + filter_start_transno(export); *desc_private = fsfilt_brw_start(obd, objcount, fso, niocount, nb); if (IS_ERR(*desc_private)) @@ -1247,10 +1958,13 @@ static int filter_preprw(int cmd, struct lustre_handle *conn, obd_kmap_get(niocount, 1); for (i = 0, o = obj; i < objcount; i++, o++) { - struct dentry *dentry = fso->fso_dentry; - struct inode *inode = dentry->d_inode; + struct dentry *dentry; + struct inode *inode; int j; + dentry = fso[i].fso_dentry; + inode = dentry->d_inode; + for (j = 0; j < o->ioo_bufcnt; j++, rnb++, lnb++) { struct page *page; @@ -1259,18 +1973,23 @@ static int filter_preprw(int cmd, struct lustre_handle *conn, else lnb->dentry = dget(dentry); - if (cmd & OBD_BRW_WRITE) + if (cmd & OBD_BRW_WRITE) { page = filter_get_page_write(inode, rnb, lnb, &pglocked); - else + + XPROCFS_BUMP_MYCPU_IOSTAT (st_write_bytes, + rnb->len); + } else { page = lustre_get_page_read(inode, rnb); - if (IS_ERR(page)) { - if (cmd & OBD_BRW_WRITE) - fsfilt_commit(obd, dir_dentry->d_inode, - *desc_private); + XPROCFS_BUMP_MYCPU_IOSTAT (st_read_bytes, + rnb->len); + } - GOTO(out_pages, rc = PTR_ERR(page)); + if (IS_ERR(page)) { + rc = PTR_ERR(page); + f_dput(dentry); + GOTO(out_pages, rc); } 
lnb->addr = page_address(page); @@ -1280,13 +1999,6 @@ static int filter_preprw(int cmd, struct lustre_handle *conn, } } - if (cmd & OBD_BRW_WRITE) { - int err = fsfilt_commit(obd, dir_dentry->d_inode, - *desc_private); - if (err) - GOTO(out_pages, rc = err); - } - EXIT; out: OBD_FREE(fso, objcount * sizeof(*fso)); @@ -1296,17 +2008,24 @@ out: out_pages: while (lnb-- > res) { - CERROR("error cleanup on brw\n"); + CERROR("%d error cleanup on brw\n", rc); if (cmd & OBD_BRW_WRITE) - filter_commit_write(lnb->page, 0, PAGE_SIZE, rc); + filter_commit_write(lnb, rc); else lustre_put_page(lnb->page); + f_dput(lnb->dentry); } obd_kmap_put(niocount); + goto out_err; /* dropped the dentry refs already (one per page) */ + out_objinfo: for (i = 0; i < objcount && fso[i].fso_dentry; i++) f_dput(fso[i].fso_dentry); - +out_err: + if (cmd & OBD_BRW_WRITE) { + filter_finish_transno(export, *desc_private, oti, rc); + fsfilt_commit(obd, dir_dentry->d_inode, *desc_private); + } goto out; } @@ -1314,6 +2033,7 @@ static int filter_write_locked_page(struct niobuf_local *lnb) { struct page *lpage; int rc; + ENTRY; lpage = lustre_get_page_write(lnb->dentry->d_inode, lnb->page->index); if (IS_ERR(lpage)) { @@ -1331,7 +2051,9 @@ static int filter_write_locked_page(struct niobuf_local *lnb) rc = PTR_ERR(lpage); CERROR("error getting locked page index %ld: rc = %d\n", lnb->page->index, rc); - GOTO(out, rc); + LBUG(); + lustre_commit_write(lnb); + RETURN(rc); } /* lpage is kmapped in lustre_get_page_write() above and kunmapped in @@ -1339,24 +2061,31 @@ static int filter_write_locked_page(struct niobuf_local *lnb) * filter_get_page_write() and kunmapped in lustre_put_page() below. 
*/ memcpy(page_address(lpage), page_address(lnb->page), PAGE_SIZE); - rc = lustre_commit_write(lpage, 0, PAGE_SIZE); + lustre_put_page(lnb->page); + + lnb->page = lpage; + rc = lustre_commit_write(lnb); if (rc) CERROR("error committing locked page %ld: rc = %d\n", lnb->page->index, rc); -out: - lustre_put_page(lnb->page); - return rc; + RETURN(rc); +} + +static int filter_sync(struct obd_device *obd) +{ + RETURN(fsfilt_sync(obd, obd->u.filter.fo_sb)); } static int filter_commitrw(int cmd, struct lustre_handle *conn, int objcount, struct obd_ioobj *obj, int niocount, struct niobuf_local *res, - void *private) + void *desc_private, struct obd_trans_info *oti) { struct obd_run_ctxt saved; struct obd_ioobj *o; - struct niobuf_local *r; + struct niobuf_local *lnb; + struct obd_export *export = class_conn2export(conn); struct obd_device *obd = class_conn2obd(conn); int found_locked = 0; int rc = 0; @@ -1366,57 +2095,65 @@ static int filter_commitrw(int cmd, struct lustre_handle *conn, push_ctxt(&saved, &obd->u.filter.fo_ctxt, NULL); LASSERT(!current->journal_info); - current->journal_info = private; + current->journal_info = desc_private; - for (i = 0, o = obj, r = res; i < objcount; i++, o++) { + for (i = 0, o = obj, lnb = res; i < objcount; i++, o++) { int j; - for (j = 0 ; j < o->ioo_bufcnt ; j++, r++) { - struct page *page = r->page; - - if (!page) - LBUG(); - - if (r->flags & N_LOCAL_TEMP_PAGE) { + if (cmd & OBD_BRW_WRITE) + inode_update_time(lnb->dentry->d_inode, 1); + for (j = 0 ; j < o->ioo_bufcnt ; j++, lnb++) { + if (lnb->flags & N_LOCAL_TEMP_PAGE) { found_locked++; continue; } if (cmd & OBD_BRW_WRITE) { - int err = filter_commit_write(page, 0, - r->len, 0); + int err = filter_commit_write(lnb, 0); if (!rc) rc = err; } else - lustre_put_page(page); + lustre_put_page(lnb->page); obd_kmap_put(1); - f_dput(r->dentry); + f_dput(lnb->dentry); } } - if (!found_locked) - goto out_ctxt; - - for (i = 0, o = obj, r = res; i < objcount; i++, o++) { + for (i = 0, o = obj, 
lnb = res; found_locked > 0 && i < objcount; + i++, o++) { int j; - for (j = 0 ; j < o->ioo_bufcnt ; j++, r++) { + for (j = 0 ; j < o->ioo_bufcnt ; j++, lnb++) { int err; - if (!(r->flags & N_LOCAL_TEMP_PAGE)) + if (!(lnb->flags & N_LOCAL_TEMP_PAGE)) continue; - err = filter_write_locked_page(r); + err = filter_write_locked_page(lnb); obd_kmap_put(1); if (!rc) rc = err; - f_dput(r->dentry); + f_dput(lnb->dentry); + found_locked--; } } -out_ctxt: + if (cmd & OBD_BRW_WRITE) { + int err; + struct dentry *dir_dentry = filter_parent(obd, S_IFREG); + + rc = filter_finish_transno(export, desc_private, oti, rc); + err = fsfilt_commit(obd, dir_dentry->d_inode, desc_private); + if (err) + rc = err; + if (obd_sync_filter) { + /* this can fail with ENOMEM, what should we do then? */ + filter_sync(obd); + } + /* XXX LASSERT(last_rcvd == last_committed)*/ + } + LASSERT(!current->journal_info); - current->journal_info = NULL; pop_ctxt(&saved, &obd->u.filter.fo_ctxt, NULL); RETURN(rc); @@ -1424,7 +2161,8 @@ out_ctxt: static int filter_brw(int cmd, struct lustre_handle *conn, struct lov_stripe_md *lsm, obd_count oa_bufs, - struct brw_page *pga, struct obd_brw_set *set) + struct brw_page *pga, struct obd_brw_set *set, + struct obd_trans_info *oti) { struct obd_ioobj ioo; struct niobuf_local *lnb; @@ -1451,7 +2189,7 @@ static int filter_brw(int cmd, struct lustre_handle *conn, ioo.ioo_bufcnt = oa_bufs; ret = filter_preprw(cmd, conn, 1, &ioo, oa_bufs, rnb, lnb, - &desc_private); + &desc_private, oti); if (ret != 0) GOTO(out, ret); @@ -1467,7 +2205,8 @@ static int filter_brw(int cmd, struct lustre_handle *conn, kunmap(virt); } - ret = filter_commitrw(cmd, conn, 1, &ioo, oa_bufs, lnb, desc_private); + ret = filter_commitrw(cmd, conn, 1, &ioo, oa_bufs, lnb, desc_private, + oti); out: if (lnb) @@ -1484,6 +2223,8 @@ static int filter_statfs(struct lustre_handle *conn, struct obd_statfs *osfs) obd = class_conn2obd(conn); + XPROCFS_BUMP_MYCPU_IOSTAT (st_statfs_reqs, 1); + 
RETURN(fsfilt_statfs(obd, obd->u.filter.fo_sb, osfs)); } @@ -1519,7 +2260,7 @@ static int filter_get_info(struct lustre_handle *conn, obd_count keylen, int filter_copy_data(struct lustre_handle *dst_conn, struct obdo *dst, struct lustre_handle *src_conn, struct obdo *src, - obd_size count, obd_off offset) + obd_size count, obd_off offset, struct obd_trans_info *oti) { struct page *page; struct lov_stripe_md srcmd, dstmd; @@ -1568,7 +2309,7 @@ int filter_copy_data(struct lustre_handle *dst_conn, struct obdo *dst, page->index = index; set->brw_callback = ll_brw_sync_wait; - err = obd_brw(OBD_BRW_READ, src_conn, &srcmd, 1, &pg, set); + err = obd_brw(OBD_BRW_READ, src_conn, &srcmd, 1, &pg, set,NULL); obd_brw_set_free(set); if (err) { EXIT; @@ -1585,7 +2326,7 @@ int filter_copy_data(struct lustre_handle *dst_conn, struct obdo *dst, CDEBUG(D_INFO, "Read page %ld ...\n", page->index); set->brw_callback = ll_brw_sync_wait; - err = obd_brw(OBD_BRW_WRITE, dst_conn, &dstmd, 1, &pg, set); + err = obd_brw(OBD_BRW_WRITE, dst_conn, &dstmd, 1, &pg, set,oti); obd_brw_set_free(set); /* XXX should handle dst->o_size, dst->o_blocks here */ @@ -1638,7 +2379,9 @@ static struct obd_ops filter_obd_ops = { static int __init obdfilter_init(void) { - printk(KERN_INFO "Filtering OBD driver v0.001, info@clusterfs.com\n"); + struct lprocfs_static_vars lvars; + + printk(KERN_INFO "Lustre Filtering OBD driver; info@clusterfs.com\n"); filter_open_cache = kmem_cache_create("ll_filter_fdata", sizeof(struct filter_file_data), 0, 0, NULL, NULL); @@ -1653,7 +2396,10 @@ static int __init obdfilter_init(void) RETURN(-ENOMEM); } - return class_register_type(&filter_obd_ops, status_class_var, + xprocfs_init ("filter"); + + lprocfs_init_vars(&lvars); + return class_register_type(&filter_obd_ops, lvars.module_vars, OBD_FILTER_DEVICENAME); } @@ -1664,10 +2410,11 @@ static void __exit obdfilter_exit(void) CERROR("couldn't free obdfilter dentry cache\n"); if (kmem_cache_destroy(filter_open_cache)) 
CERROR("couldn't free obdfilter open cache\n"); + xprocfs_fini (); } MODULE_AUTHOR("Cluster File Systems, Inc. "); -MODULE_DESCRIPTION("Lustre Filtering OBD driver v1.0"); +MODULE_DESCRIPTION("Lustre Filtering OBD driver"); MODULE_LICENSE("GPL"); module_init(obdfilter_init); diff --git a/lustre/obdfilter/lproc_obdfilter.c b/lustre/obdfilter/lproc_obdfilter.c index e680784..ad92f83 100644 --- a/lustre/obdfilter/lproc_obdfilter.c +++ b/lustre/obdfilter/lproc_obdfilter.c @@ -1,7 +1,7 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (C) 2002 Cluster File Systems, Inc. + * Copyright (C) 2002, 2003 Cluster File Systems, Inc. * * This file is part of Lustre, http://www.lustre.org. * @@ -21,125 +21,50 @@ */ #define DEBUG_SUBSYSTEM S_CLASS -#include #include +#include +#ifndef LPROCFS +struct lprocfs_vars lprocfs_obd_vars[] = { {0} }; +struct lprocfs_vars lprocfs_module_vars[] = { {0} }; +#else -int rd_uuid(char* page, char **start, off_t off, int count, int *eof, - void *data) +static inline int lprocfs_filter_statfs(void *data, struct statfs *sfs) { - int len = 0; - struct obd_device* dev = (struct obd_device*)data; - len += snprintf(page, count, "%s\n", dev->obd_uuid); - return len; + struct obd_device *dev = (struct obd_device *) data; + return vfs_statfs(dev->u.filter.fo_sb, sfs); } -int rd_blksize(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - struct obd_device* temp = (struct obd_device*)data; - struct statfs mystats; - int len = 0; - vfs_statfs(temp->u.filter.fo_sb, &mystats); - len+=snprintf(page, count, "%ld\n", mystats.f_bsize); - return len; -} -int rd_kbtotal(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - struct obd_device* temp = (struct obd_device*)data; - struct statfs mystats; - int len = 0; - __u32 blk_size; - __u64 result; - - vfs_statfs(temp->u.filter.fo_sb, &mystats); - blk_size = mystats.f_bsize; - blk_size >>= 10; - 
result = mystats.f_blocks; - while(blk_size >>= 1){ - result <<= 1; - } - len+=snprintf(page, count, LPU64"\n", result); - return len; -} +DEFINE_LPROCFS_STATFS_FCT(rd_blksize, lprocfs_filter_statfs); +DEFINE_LPROCFS_STATFS_FCT(rd_kbytestotal, lprocfs_filter_statfs); +DEFINE_LPROCFS_STATFS_FCT(rd_kbytesfree, lprocfs_filter_statfs); +DEFINE_LPROCFS_STATFS_FCT(rd_filestotal, lprocfs_filter_statfs); +DEFINE_LPROCFS_STATFS_FCT(rd_filesfree, lprocfs_filter_statfs); +DEFINE_LPROCFS_STATFS_FCT(rd_filegroups, lprocfs_filter_statfs); -int rd_kbfree(char* page, char **start, off_t off, int count, int *eof, +int rd_fstype(char *page, char **start, off_t off, int count, int *eof, void *data) { - struct obd_device* temp = (struct obd_device*)data; - struct statfs mystats; - int len = 0; - __u32 blk_size; - __u64 result; - - vfs_statfs(temp->u.filter.fo_sb, &mystats); - blk_size = mystats.f_bsize; - blk_size >>= 10; - result = mystats.f_bfree; - while(blk_size >>= 1){ - result <<= 1; - } - len += snprintf(page, count, LPU64"\n", result); - return len; + struct obd_device *dev = (struct obd_device *)data; + return snprintf(page, count, "%s\n", dev->u.filter.fo_fstype); } -int rd_fstype(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - struct obd_device* temp = (struct obd_device*)data; - int len = 0; - len += snprintf(page, count, "%s\n", temp->u.filter.fo_fstype); - return len; -} -int rd_filestotal(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - struct obd_device* temp = (struct obd_device*)data; - struct statfs mystats; - int len = 0; - vfs_statfs(temp->u.filter.fo_sb, &mystats); - len += snprintf(page, count, "%ld\n", mystats.f_files); - return len; -} - -int rd_filesfree(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - struct obd_device* temp = (struct obd_device*)data; - struct statfs mystats; - int len = 0; - vfs_statfs(temp->u.filter.fo_sb, &mystats); - len += snprintf(page, count, 
"%ld\n", mystats.f_ffree); - return len; -} - -int rd_filegroups(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - return 0; -} -struct lprocfs_vars status_var_nm_1[] = { - {"status/uuid", rd_uuid, 0, 0}, - {"status/blocksize",rd_blksize, 0, 0}, - {"status/kbytestotal",rd_kbtotal, 0, 0}, - {"status/kbytesfree", rd_kbfree, 0, 0}, - {"status/filestotal", rd_filestotal, 0, 0}, - {"status/filesfree", rd_filesfree, 0, 0}, - {"status/filegroups", rd_filegroups, 0, 0}, - {"status/fstype", rd_fstype, 0, 0}, - {0} +struct lprocfs_vars lprocfs_obd_vars[] = { + { "uuid", lprocfs_rd_uuid, 0, 0 }, + { "blocksize", rd_blksize, 0, 0 }, + { "kbytestotal", rd_kbytestotal, 0, 0 }, + { "kbytesfree", rd_kbytesfree, 0, 0 }, + { "filestotal", rd_filestotal, 0, 0 }, + { "filesfree", rd_filesfree, 0, 0 }, + { "filegroups", rd_filegroups, 0, 0 }, + { "fstype", rd_fstype, 0, 0 }, + { 0 } }; -int rd_numrefs(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - struct obd_type* class = (struct obd_type*)data; - int len = 0; - len += snprintf(page, count, "%d\n", class->typ_refcnt); - return len; -} -struct lprocfs_vars status_class_var[] = { - {"status/num_refs", rd_numrefs, 0, 0}, - {0} +struct lprocfs_vars lprocfs_module_vars[] = { + { "num_refs", lprocfs_rd_numrefs, 0, 0 }, + { 0 } }; + +#endif /* LPROCFS */ +LPROCFS_INIT_VARS(lprocfs_module_vars, lprocfs_obd_vars) diff --git a/lustre/osc/lproc_osc.c b/lustre/osc/lproc_osc.c index 58e9097..69af4bc 100644 --- a/lustre/osc/lproc_osc.c +++ b/lustre/osc/lproc_osc.c @@ -1,7 +1,7 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (C) 2002 Cluster File Systems, Inc. + * Copyright (C) 2002, 2003 Cluster File Systems, Inc. * * This file is part of Lustre, http://www.lustre.org. 
* @@ -21,99 +21,38 @@ */ #define DEBUG_SUBSYSTEM S_CLASS -#include +#include #include -int rd_uuid(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - int len = 0; - struct obd_device* dev = (struct obd_device*)data; - len += snprintf(page, count, "%s\n", dev->obd_uuid); - return len; - -} -int rd_blksize(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - return 0; -} -int rd_kbytestotal(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - return 0; -} - -int rd_kbytesfree(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - return 0; -} - -int rd_filestotal(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - return 0; -} - -int rd_filesfree(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - return 0; -} - -int rd_filegroups(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - return 0; -} -int rd_server_uuid(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - - struct obd_device* temp = (struct obd_device*)data; - struct client_obd* cli = &temp->u.cli; - int len = 0; - len += snprintf(page, count, "%s\n",cli->cl_target_uuid); - return len; - - -} -int rd_conn_uuid(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - struct obd_device* temp=(struct obd_device*)data; - struct client_obd* cli=&temp->u.cli; - struct obd_import* imp=&cli->cl_import; - int len = 0; - len += snprintf(page, count, "%s\n", - imp->imp_connection->c_remote_uuid); - return len; - -} - -struct lprocfs_vars status_var_nm_1[] = { - {"status/uuid", rd_uuid, 0, 0}, - {"status/blocksize",rd_blksize, 0, 0}, - {"status/kbytestotal", rd_kbytestotal, 0, 0}, - {"status/kbytesfree", rd_kbytesfree, 0, 0}, - {"status/filestotal", rd_filestotal, 0, 0}, - {"status/filesfree", rd_filesfree, 0, 0}, - {"status/filegroups", rd_filegroups, 0, 0}, - {"status/ost_server_uuid", rd_server_uuid, 0, 
0}, - {"status/ost_conn_uuid", rd_conn_uuid, 0, 0}, - {0} +#ifndef LPROCFS +struct lprocfs_vars lprocfs_obd_vars[] = { {0} }; +struct lprocfs_vars lprocfs_module_vars[] = { {0} }; +#else + +DEFINE_LPROCFS_STATFS_FCT(rd_blksize, obd_self_statfs); +DEFINE_LPROCFS_STATFS_FCT(rd_kbytestotal, obd_self_statfs); +DEFINE_LPROCFS_STATFS_FCT(rd_kbytesfree, obd_self_statfs); +DEFINE_LPROCFS_STATFS_FCT(rd_filestotal, obd_self_statfs); +DEFINE_LPROCFS_STATFS_FCT(rd_filesfree, obd_self_statfs); +DEFINE_LPROCFS_STATFS_FCT(rd_filegroups, obd_self_statfs); + +struct lprocfs_vars lprocfs_obd_vars[] = { + { "uuid", lprocfs_rd_uuid, 0, 0 }, + { "blocksize", rd_blksize, 0, 0 }, + { "kbytestotal", rd_kbytestotal, 0, 0 }, + { "kbytesfree", rd_kbytesfree, 0, 0 }, + { "filestotal", rd_filestotal, 0, 0 }, + { "filesfree", rd_filesfree, 0, 0 }, + { "filegroups", rd_filegroups, 0, 0 }, + { "ost_server_uuid", lprocfs_rd_server_uuid, 0, 0 }, + { "ost_conn_uuid", lprocfs_rd_conn_uuid, 0, 0 }, + { 0 } }; -int rd_numrefs(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - struct obd_type* class = (struct obd_type*)data; - int len = 0; - len += snprintf(page, count, "%d\n", class->typ_refcnt); - return len; -} -struct lprocfs_vars status_class_var[] = { - {"status/num_refs", rd_numrefs, 0, 0}, - {0} +struct lprocfs_vars lprocfs_module_vars[] = { + { "num_refs", lprocfs_rd_numrefs, 0, 0 }, + { 0 } }; + +#endif /* LPROCFS */ +LPROCFS_INIT_VARS(lprocfs_module_vars, lprocfs_obd_vars) diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 85b1694..1abd150 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -1,7 +1,7 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Copyright (C) 2001-2003 Cluster File Systems, Inc. * Author Peter Braam * * This file is part of Lustre, http://www.lustre.org. 
@@ -49,17 +49,17 @@ #include /* for PTL_MD_MAX_IOV */ #include -extern struct lprocfs_vars status_var_nm_1[]; -extern struct lprocfs_vars status_class_var[]; - static int osc_attach(struct obd_device *dev, obd_count len, void *data) { - return lprocfs_reg_obd(dev, status_var_nm_1, dev); + struct lprocfs_static_vars lvars; + + lprocfs_init_vars(&lvars); + return lprocfs_obd_attach(dev, lvars.obd_vars); } static int osc_detach(struct obd_device *dev) { - return lprocfs_dereg_obd(dev); + return lprocfs_obd_detach(dev); } /* Pack OSC object metadata for shipment to the MDS. */ @@ -123,6 +123,13 @@ static int osc_unpackmd(struct lustre_handle *conn, struct lov_stripe_md **lsmp, RETURN(lsm_size); } +inline void oti_from_request(struct obd_trans_info *oti, struct ptlrpc_request *req) +{ + if (oti && req->rq_repmsg) + oti->oti_transno = NTOH__u64(req->rq_repmsg->transno); + EXIT; +} + static int osc_getattr(struct lustre_handle *conn, struct obdo *oa, struct lov_stripe_md *md) { @@ -150,8 +157,7 @@ static int osc_getattr(struct lustre_handle *conn, struct obdo *oa, body = lustre_msg_buf(request->rq_repmsg, 0); CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode); - if (oa) - memcpy(oa, &body->oa, sizeof(*oa)); + memcpy(oa, &body->oa, sizeof(*oa)); EXIT; out: @@ -160,7 +166,7 @@ static int osc_getattr(struct lustre_handle *conn, struct obdo *oa, } static int osc_open(struct lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md *md) + struct lov_stripe_md *md, struct obd_trans_info *oti) { struct ptlrpc_request *request; struct ost_body *body; @@ -172,6 +178,7 @@ static int osc_open(struct lustre_handle *conn, struct obdo *oa, if (!request) RETURN(-ENOMEM); +#warning FIXME: request->rq_flags |= PTL_RPC_FL_REPLAY; body = lustre_msg_buf(request->rq_reqmsg, 0); #warning FIXME: pack only valid fields instead of memcpy, endianness memcpy(&body->oa, oa, sizeof(*oa)); @@ -194,7 +201,7 @@ static int osc_open(struct lustre_handle *conn, struct obdo *oa, } static int osc_close(struct 
lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md *md) + struct lov_stripe_md *md, struct obd_trans_info *oti) { struct ptlrpc_request *request; struct ost_body *body; @@ -228,7 +235,7 @@ static int osc_close(struct lustre_handle *conn, struct obdo *oa, } static int osc_setattr(struct lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md *md) + struct lov_stripe_md *md, struct obd_trans_info *oti) { struct ptlrpc_request *request; struct ost_body *body; @@ -252,11 +259,12 @@ static int osc_setattr(struct lustre_handle *conn, struct obdo *oa, } static int osc_create(struct lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md **ea) + struct lov_stripe_md **ea, struct obd_trans_info *oti_in) { struct ptlrpc_request *request; struct ost_body *body; struct lov_stripe_md *lsm; + struct obd_trans_info *oti, trans_info; int rc, size = sizeof(*body); ENTRY; @@ -270,6 +278,11 @@ static int osc_create(struct lustre_handle *conn, struct obdo *oa, RETURN(rc); } + if (oti_in) + oti = oti_in; + else + oti = &trans_info; + request = ptlrpc_prep_req(class_conn2cliimp(conn), OST_CREATE, 1, &size, NULL); if (!request) @@ -290,6 +303,9 @@ static int osc_create(struct lustre_handle *conn, struct obdo *oa, lsm->lsm_object_id = oa->o_id; lsm->lsm_stripe_count = 0; *ea = lsm; + + oti_from_request(oti, request); + CDEBUG(D_HA, "transno: "LPD64"\n", oti->oti_transno); EXIT; out_req: ptlrpc_req_finished(request); @@ -301,7 +317,7 @@ out: static int osc_punch(struct lustre_handle *conn, struct obdo *oa, struct lov_stripe_md *md, obd_size start, - obd_size end) + obd_size end, struct obd_trans_info *oti) { struct ptlrpc_request *request; struct ost_body *body; @@ -343,7 +359,7 @@ static int osc_punch(struct lustre_handle *conn, struct obdo *oa, } static int osc_destroy(struct lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md *ea) + struct lov_stripe_md *ea, struct obd_trans_info *oti) { struct ptlrpc_request *request; struct ost_body *body; @@ -398,6 +414,7 
@@ static void unmap_and_decref_bulk_desc(void *data) EXIT; } + /* this is the callback function which is invoked by the Portals * event handler associated with the bulk_sink queue and bulk_source queue. */ @@ -488,7 +505,7 @@ static int osc_brw_read(struct lustre_handle *conn, struct lov_stripe_md *lsm, CERROR("obd_fail_loc=%x, skipping register_bulk\n", OBD_FAIL_OSC_BRW_READ_BULK); } else { - rc = ptlrpc_register_bulk(desc); + rc = ptlrpc_register_bulk_put(desc); if (rc) GOTO(out_unmap, rc); obd_brw_set_add(set, desc); @@ -525,19 +542,18 @@ out_unmap: static int osc_brw_write(struct lustre_handle *conn, struct lov_stripe_md *lsm, obd_count page_count, struct brw_page *pga, - struct obd_brw_set *set) + struct obd_brw_set *set, struct obd_trans_info *oti) { struct obd_import *imp = class_conn2cliimp(conn); struct ptlrpc_connection *connection = imp->imp_connection; struct ptlrpc_request *request = NULL; struct ptlrpc_bulk_desc *desc = NULL; struct ost_body *body; - struct niobuf_local *local = NULL; - struct niobuf_remote *remote; int rc, size[3] = {sizeof(*body)}, mapped = 0; - int j; + unsigned long flags; struct obd_ioobj *iooptr; void *nioptr; + __u32 xid; ENTRY; size[1] = sizeof(struct obd_ioobj); @@ -561,73 +577,62 @@ static int osc_brw_write(struct lustre_handle *conn, struct lov_stripe_md *lsm, ost_pack_ioo(&iooptr, lsm, page_count); /* end almost identical to brw_read case */ - OBD_ALLOC(local, page_count * sizeof(*local)); - if (!local) - GOTO(out_desc, rc = -ENOMEM); + spin_lock_irqsave(&imp->imp_lock, flags); + xid = ++imp->imp_last_xid; /* single xid for all pages */ + spin_unlock_irqrestore(&imp->imp_lock, flags); obd_kmap_get(page_count, 0); for (mapped = 0; mapped < page_count; mapped++) { - local[mapped].addr = kmap(pga[mapped].pg); + struct ptlrpc_bulk_page *bulk = ptlrpc_prep_bulk_page(desc); + if (bulk == NULL) + GOTO(out_unmap, rc = -ENOMEM); - CDEBUG(D_INFO, "kmap(pg) = %p ; pg->flags = %lx ; pg->refcount = " - "%d ; page %d of %d\n", - 
local[mapped].addr, pga[mapped].pg->flags, - page_count(pga[mapped].pg), - mapped, page_count - 1); + bulk->bp_xid = xid; /* single xid for all pages */ - local[mapped].offset = pga[mapped].off; - local[mapped].len = pga[mapped].count; + bulk->bp_buf = kmap(pga[mapped].pg); + bulk->bp_page = pga[mapped].pg; + bulk->bp_buflen = PAGE_SIZE; ost_pack_niobuf(&nioptr, pga[mapped].off, pga[mapped].count, - pga[mapped].flag, 0); - } - - size[1] = page_count * sizeof(*remote); - request->rq_replen = lustre_msg_size(2, size); - rc = ptlrpc_queue_wait(request); - if (rc) - GOTO(out_unmap, rc); - - nioptr = lustre_msg_buf(request->rq_repmsg, 1); - if (!nioptr) - GOTO(out_unmap, rc = -EINVAL); - - if (request->rq_repmsg->buflens[1] != size[1]) { - CERROR("buffer length wrong (%d vs. %d)\n", - request->rq_repmsg->buflens[1], size[1]); - GOTO(out_unmap, rc = -EINVAL); + pga[mapped].flag, bulk->bp_xid); } - for (j = 0; j < page_count; j++) { - struct ptlrpc_bulk_page *bulk; - - ost_unpack_niobuf(&nioptr, &remote); - - bulk = ptlrpc_prep_bulk_page(desc); - if (!bulk) - GOTO(out_unmap, rc = -ENOMEM); - - bulk->bp_buf = local[j].addr; - bulk->bp_buflen = local[j].len; - bulk->bp_xid = remote->xid; - bulk->bp_page = pga[j].pg; + /* + * Register the bulk first, because the reply could arrive out of + * order, and we want to be ready for the bulk data. + * + * One reference is released when brw_finish is complete, the other + * when the caller removes us from the "set" list. + * + * On error, we never do the brw_finish, so we handle all decrefs. 
+ */ + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_WRITE_BULK)) { + CERROR("obd_fail_loc=%x, skipping register_bulk\n", + OBD_FAIL_OSC_BRW_WRITE_BULK); + } else { + rc = ptlrpc_register_bulk_get(desc); + if (rc) + GOTO(out_unmap, rc); + obd_brw_set_add(set, desc); } - if (desc->bd_page_count != page_count) - LBUG(); - - if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_WRITE_BULK)) - GOTO(out_unmap, rc = 0); - - OBD_FREE(local, page_count * sizeof(*local)); - - /* One reference is released when brw_finish is complete, the other - * when the caller removes it from the "set" list. */ - obd_brw_set_add(set, desc); - rc = ptlrpc_send_bulk(desc); + request->rq_replen = lustre_msg_size(1, size); + rc = ptlrpc_queue_wait(request); - /* XXX: Mike, same question as in osc_brw_read. */ -out_req: + /* + * XXX: If there is an error during the processing of the callback, + * such as a timeout in a sleep that it performs, brw_finish + * will never get called, and we'll leak the desc, fail to kunmap + * things, cats will live with dogs. One solution would be to + * export brw_finish as osc_brw_finish, so that the timeout case + * and its kin could call it for proper cleanup. An alternative + * would be for an error return from the callback to cause us to + * clean up, but that doesn't help the truly async cases (like + * LOV), which will immediately return from their PHASE_START + * callback, before any such cleanup-requiring error condition can + * be detected. 
+ */ + out_req: ptlrpc_req_finished(request); RETURN(rc); @@ -635,18 +640,15 @@ out_req: out_unmap: while (mapped-- > 0) kunmap(pga[mapped].pg); - obd_kmap_put(page_count); - - OBD_FREE(local, page_count * sizeof(*local)); -out_desc: ptlrpc_bulk_decref(desc); goto out_req; } static int osc_brw(int cmd, struct lustre_handle *conn, struct lov_stripe_md *md, obd_count page_count, - struct brw_page *pga, struct obd_brw_set *set) + struct brw_page *pga, struct obd_brw_set *set, + struct obd_trans_info *oti) { ENTRY; @@ -660,7 +662,7 @@ static int osc_brw(int cmd, struct lustre_handle *conn, pages_per_brw = page_count; if (cmd & OBD_BRW_WRITE) - rc = osc_brw_write(conn, md, pages_per_brw, pga, set); + rc = osc_brw_write(conn, md, pages_per_brw, pga, set, oti); else rc = osc_brw_read(conn, md, pages_per_brw, pga, set); @@ -679,7 +681,7 @@ static int osc_enqueue(struct lustre_handle *connh, struct lov_stripe_md *lsm, int *flags, void *callback, void *data, int datalen, struct lustre_handle *lockh) { - __u64 res_id[RES_NAME_SIZE] = { lsm->lsm_object_id }; + struct ldlm_res_id res_id = { .name = {lsm->lsm_object_id} }; struct obd_device *obddev = class_conn2obd(connh); struct ldlm_extent *extent = extentp; int rc; @@ -694,7 +696,7 @@ static int osc_enqueue(struct lustre_handle *connh, struct lov_stripe_md *lsm, } /* Next, search for already existing extent locks that will cover us */ - rc = ldlm_lock_match(obddev->obd_namespace, res_id, type, extent, + rc = ldlm_lock_match(obddev->obd_namespace, 0, &res_id, type, extent, sizeof(extent), mode, lockh); if (rc == 1) /* We already have a lock, and it's referenced */ @@ -713,7 +715,7 @@ static int osc_enqueue(struct lustre_handle *connh, struct lov_stripe_md *lsm, * locks out from other users right now, too. 
*/ if (mode == LCK_PR) { - rc = ldlm_lock_match(obddev->obd_namespace, res_id, type, + rc = ldlm_lock_match(obddev->obd_namespace, 0, &res_id, type, extent, sizeof(extent), LCK_PW, lockh); if (rc == 1) { /* FIXME: This is not incredibly elegant, but it might @@ -728,7 +730,7 @@ static int osc_enqueue(struct lustre_handle *connh, struct lov_stripe_md *lsm, rc = ldlm_cli_enqueue(connh, NULL, obddev->obd_namespace, parent_lock, res_id, type, extent, sizeof(extent), mode, flags, - ldlm_completion_ast, callback, data, datalen, + ldlm_completion_ast, callback, data, NULL, lockh); RETURN(rc); } @@ -747,9 +749,9 @@ static int osc_cancel_unused(struct lustre_handle *connh, struct lov_stripe_md *lsm, int flags) { struct obd_device *obddev = class_conn2obd(connh); - __u64 res_id[RES_NAME_SIZE] = { lsm->lsm_object_id }; + struct ldlm_res_id res_id = { .name = {lsm->lsm_object_id} }; - return ldlm_cli_cancel_unused(obddev->obd_namespace, res_id, flags); + return ldlm_cli_cancel_unused(obddev->obd_namespace, &res_id, flags); } static int osc_statfs(struct lustre_handle *conn, struct obd_statfs *osfs) @@ -832,6 +834,7 @@ static int osc_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len, ENTRY; switch (cmd) { +#if 0 case IOC_LDLM_TEST: { err = ldlm_test(obddev, conn); CERROR("-- done err %d\n", err); @@ -879,6 +882,7 @@ static int osc_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len, CERROR("-- done err %d\n", err); GOTO(out, err); } +#endif case IOC_OSC_REGISTER_LOV: { if (obddev->u.cli.cl_containing_lov) GOTO(out, err = -EALREADY); @@ -888,7 +892,7 @@ static int osc_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len, case OBD_IOC_LOV_GET_CONFIG: { char *buf; struct lov_desc *desc; - obd_uuid_t *uuidp; + struct obd_uuid uuid; buf = NULL; len = 0; @@ -902,7 +906,7 @@ static int osc_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len, GOTO(out, err = -EINVAL); } - if (data->ioc_inllen2 < sizeof(*uuidp)) { + if (data->ioc_inllen2 
< sizeof(uuid.uuid)) { OBD_FREE(buf, len); GOTO(out, err = -EINVAL); } @@ -914,10 +918,10 @@ static int osc_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len, desc->ld_default_stripe_size = 0; desc->ld_default_stripe_offset = 0; desc->ld_pattern = 0; - memcpy(desc->ld_uuid, obddev->obd_uuid, sizeof(*uuidp)); + memcpy(desc->ld_uuid.uuid, obddev->obd_uuid.uuid, sizeof(uuid.uuid)); - uuidp = (obd_uuid_t *)data->ioc_inlbuf2; - memcpy(uuidp, obddev->obd_uuid, sizeof(*uuidp)); + memcpy(data->ioc_inlbuf2, obddev->obd_uuid.uuid, + sizeof(uuid.uuid)); err = copy_to_user((void *)uarg, buf, len); if (err) @@ -943,7 +947,11 @@ out: static void set_osc_active(struct obd_import *imp, int active) { - struct obd_device *notify_obd = imp->imp_obd->u.cli.cl_containing_lov; + struct obd_device *notify_obd; + + LASSERT(imp->imp_obd); + + notify_obd = imp->imp_obd->u.cli.cl_containing_lov; if (notify_obd == NULL) return; @@ -952,25 +960,26 @@ static void set_osc_active(struct obd_import *imp, int active) if (!list_empty(¬ify_obd->obd_exports)) { int rc; struct lustre_handle fakeconn; - struct obd_ioctl_data ioc_data; + struct obd_ioctl_data ioc_data = { 0 }; struct obd_export *exp = list_entry(notify_obd->obd_exports.next, struct obd_export, exp_obd_chain); fakeconn.addr = (__u64)(unsigned long)exp; fakeconn.cookie = exp->exp_cookie; - ioc_data.ioc_inlbuf1 = imp->imp_obd->u.cli.cl_target_uuid; + ioc_data.ioc_inlbuf1 = &imp->imp_obd->u.cli.cl_target_uuid; ioc_data.ioc_offset = active; rc = obd_iocontrol(IOC_LOV_SET_OSC_ACTIVE, &fakeconn, sizeof ioc_data, &ioc_data, NULL); - if (rc) + if (rc) { CERROR("disabling %s on LOV %p/%s: %d\n", - imp->imp_obd->obd_uuid, notify_obd, - notify_obd->obd_uuid, rc); + imp->imp_obd->u.cli.cl_target_uuid.uuid, + notify_obd, notify_obd->obd_uuid.uuid, rc); + } } else { CDEBUG(D_HA, "No exports for obd %p/%s, can't notify about " - "%p\n", notify_obd, notify_obd->obd_uuid, - imp->imp_obd->obd_uuid); + "%p\n", notify_obd, 
notify_obd->obd_uuid.uuid, + imp->imp_obd->obd_uuid.uuid); } } @@ -986,7 +995,7 @@ static int osc_recover(struct obd_import *imp, int phase) case PTLRPC_RECOVD_PHASE_PREPARE: { struct ldlm_namespace *ns = imp->imp_obd->obd_namespace; ldlm_namespace_cleanup(ns, 1 /* no network ops */); - ptlrpc_abort_inflight(imp); + ptlrpc_abort_inflight(imp, 0); set_osc_active(imp, 0 /* inactive */); RETURN(0); } @@ -1022,7 +1031,7 @@ static int osc_recover(struct obd_import *imp, int phase) } static int osc_connect(struct lustre_handle *conn, struct obd_device *obd, - obd_uuid_t cluuid, struct recovd_obd *recovd, + struct obd_uuid *cluuid, struct recovd_obd *recovd, ptlrpc_recovery_cb_t recover) { struct obd_import *imp = &obd->u.cli.cl_import; @@ -1057,7 +1066,10 @@ struct obd_ops osc_obd_ops = { static int __init osc_init(void) { - RETURN(class_register_type(&osc_obd_ops, status_class_var, + struct lprocfs_static_vars lvars; + + lprocfs_init_vars(&lvars); + RETURN(class_register_type(&osc_obd_ops, lvars.module_vars, LUSTRE_OSC_NAME)); } @@ -1067,7 +1079,7 @@ static void __exit osc_exit(void) } MODULE_AUTHOR("Cluster File Systems, Inc. "); -MODULE_DESCRIPTION("Lustre Object Storage Client (OSC) v1.0"); +MODULE_DESCRIPTION("Lustre Object Storage Client (OSC)"); MODULE_LICENSE("GPL"); module_init(osc_init); diff --git a/lustre/ost/lproc_ost.c b/lustre/ost/lproc_ost.c index 1fa1c59..c44093c 100644 --- a/lustre/ost/lproc_ost.c +++ b/lustre/ost/lproc_ost.c @@ -1,7 +1,7 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (C) 2002 Cluster File Systems, Inc. + * Copyright (C) 2002, 2003 Cluster File Systems, Inc. * * This file is part of Lustre, http://www.lustre.org. 
* @@ -21,142 +21,22 @@ */ #define DEBUG_SUBSYSTEM S_OST -#include +#include #include - -int rd_uuid(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - - struct obd_device* temp = (struct obd_device*)data; - int len = 0; - len += snprintf(page, count, "%s\n", temp->obd_uuid); - return len; - - -} -int rd_blksize(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - - struct obd_device* temp = (struct obd_device*)data; - struct ost_obd *ost = &temp->u.ost; - struct lustre_handle *conn = &ost->ost_conn; - struct obd_statfs mystats; - int len = 0; - - obd_statfs(conn, &mystats); - len += snprintf(page, count, "%d\n", mystats.os_bsize); - return len; - -} -int rd_kbtotal(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - struct obd_device* temp = (struct obd_device*)data; - struct ost_obd *ost = &temp->u.ost; - struct lustre_handle *conn = &ost->ost_conn; - struct obd_statfs mystats; - int len = 0; - __u32 blk_size; - __u64 result; - - obd_statfs(conn, &mystats); - blk_size = mystats.os_bsize; - blk_size >>= 10; - result = mystats.os_blocks; - while(blk_size >>= 1){ - result <<= 1; - } - len += snprintf(page, count, LPU64"\n", result); - return len; - -} - - -int rd_kbfree(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - - struct obd_device* temp = (struct obd_device*)data; - struct ost_obd *ost = &temp->u.ost; - struct lustre_handle *conn = &ost->ost_conn; - struct obd_statfs mystats; - int len = 0; - __u32 blk_size; - __u64 result; - - obd_statfs(conn, &mystats); - blk_size = mystats.os_bsize; - blk_size >>= 10; - result = mystats.os_bfree; - while(blk_size >>= 1){ - result <<= 1; - } - len += snprintf(page, count, LPU64"\n", result); - return len; -} - -int rd_filestotal(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - struct obd_device* temp = (struct obd_device*)data; - struct ost_obd *ost = &temp->u.ost; - struct lustre_handle *conn = 
&ost->ost_conn; - struct obd_statfs mystats; - int len = 0; - - obd_statfs(conn, &mystats); - len += snprintf(page, count, LPU64"\n",mystats.os_files); - return len; - -} - -int rd_filesfree(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - - struct obd_device* temp = (struct obd_device*)data; - struct ost_obd *ost = &temp->u.ost; - struct lustre_handle *conn = &ost->ost_conn; - struct obd_statfs mystats; - int len = 0; - - obd_statfs(conn, &mystats); - len += snprintf(page, count, LPU64"\n", mystats.os_ffree); - return len; - -} - -int rd_filegroups(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - return 0; -} - -struct lprocfs_vars status_var_nm_1[] = { - {"status/uuid", rd_uuid, 0, 0}, - {"status/blocksize",rd_blksize, 0, 0}, - {"status/kbytesfree", rd_kbfree, 0, 0}, - {"status/kbytestotal", rd_kbtotal, 0, 0}, - {"status/filestotal", rd_filestotal, 0, 0}, - {"status/filesfree", rd_filesfree, 0, 0}, - {"status/filegroups", rd_filegroups, 0, 0}, - {0} +#ifndef LPROCFS +struct lprocfs_vars lprocfs_obd_vars[] = { {0} }; +struct lprocfs_vars lprocfs_module_vars[] = { {0} }; +#else +struct lprocfs_vars lprocfs_obd_vars[] = { + { "uuid", lprocfs_rd_uuid, 0, 0 }, + { 0 } }; -int rd_numrefs(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - struct obd_type* class = (struct obd_type*)data; - int len = 0; - len += snprintf(page, count, "%d\n", class->typ_refcnt); - return len; -} - -struct lprocfs_vars status_class_var[] = { - {"status/num_refs", rd_numrefs, 0, 0}, - {0} +struct lprocfs_vars lprocfs_module_vars[] = { + { "num_refs", lprocfs_rd_numrefs, 0, 0 }, + { 0 } }; - + +#endif /* LPROCFS */ +LPROCFS_INIT_VARS(lprocfs_module_vars, lprocfs_obd_vars) diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index db7857c..d595757 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -1,7 +1,7 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * 
vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Copyright (C) 2001-2003 Cluster File Systems, Inc. * Author: Peter J. Braam * Author: Phil Schwan * @@ -41,10 +41,8 @@ #include #include -extern struct lprocfs_vars status_var_nm_1[]; -extern struct lprocfs_vars status_class_var[]; -static int ost_destroy(struct ptlrpc_request *req) +static int ost_destroy(struct ptlrpc_request *req, struct obd_trans_info *oti) { struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg; struct ost_body *body; @@ -57,7 +55,7 @@ static int ost_destroy(struct ptlrpc_request *req) if (rc) RETURN(rc); - req->rq_status = obd_destroy(conn, &body->oa, NULL); + req->rq_status = obd_destroy(conn, &body->oa, NULL, oti); RETURN(0); } @@ -106,7 +104,7 @@ static int ost_statfs(struct ptlrpc_request *req) RETURN(0); } -static int ost_open(struct ptlrpc_request *req) +static int ost_open(struct ptlrpc_request *req, struct obd_trans_info *oti) { struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg; struct ost_body *body, *repbody; @@ -122,11 +120,11 @@ static int ost_open(struct ptlrpc_request *req) repbody = lustre_msg_buf(req->rq_repmsg, 0); /* FIXME: unpack only valid fields instead of memcpy, endianness */ memcpy(&repbody->oa, &body->oa, sizeof(body->oa)); - req->rq_status = obd_open(conn, &repbody->oa, NULL); + req->rq_status = obd_open(conn, &repbody->oa, NULL, oti); RETURN(0); } -static int ost_close(struct ptlrpc_request *req) +static int ost_close(struct ptlrpc_request *req, struct obd_trans_info *oti) { struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg; struct ost_body *body, *repbody; @@ -142,11 +140,11 @@ static int ost_close(struct ptlrpc_request *req) repbody = lustre_msg_buf(req->rq_repmsg, 0); /* FIXME: unpack only valid fields instead of memcpy, endianness */ memcpy(&repbody->oa, &body->oa, sizeof(body->oa)); - req->rq_status = obd_close(conn, &repbody->oa, NULL); + req->rq_status = 
obd_close(conn, &repbody->oa, NULL, oti); RETURN(0); } -static int ost_create(struct ptlrpc_request *req) +static int ost_create(struct ptlrpc_request *req, struct obd_trans_info *oti) { struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg; struct ost_body *body, *repbody; @@ -162,11 +160,11 @@ static int ost_create(struct ptlrpc_request *req) repbody = lustre_msg_buf(req->rq_repmsg, 0); /* FIXME: unpack only valid fields instead of memcpy, endianness */ memcpy(&repbody->oa, &body->oa, sizeof(body->oa)); - req->rq_status = obd_create(conn, &repbody->oa, NULL); + req->rq_status = obd_create(conn, &repbody->oa, NULL, oti); RETURN(0); } -static int ost_punch(struct ptlrpc_request *req) +static int ost_punch(struct ptlrpc_request *req, struct obd_trans_info *oti) { struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg; struct ost_body *body, *repbody; @@ -187,11 +185,11 @@ static int ost_punch(struct ptlrpc_request *req) /* FIXME: unpack only valid fields instead of memcpy, endianness */ memcpy(&repbody->oa, &body->oa, sizeof(body->oa)); req->rq_status = obd_punch(conn, &repbody->oa, NULL, - repbody->oa.o_size, repbody->oa.o_blocks); + repbody->oa.o_size, repbody->oa.o_blocks, oti); RETURN(0); } -static int ost_setattr(struct ptlrpc_request *req) +static int ost_setattr(struct ptlrpc_request *req, struct obd_trans_info *oti) { struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg; struct ost_body *body, *repbody; @@ -207,7 +205,7 @@ static int ost_setattr(struct ptlrpc_request *req) repbody = lustre_msg_buf(req->rq_repmsg, 0); /* FIXME: unpack only valid fields instead of memcpy, endianness */ memcpy(&repbody->oa, &body->oa, sizeof(body->oa)); - req->rq_status = obd_setattr(conn, &repbody->oa, NULL); + req->rq_status = obd_setattr(conn, &repbody->oa, NULL, oti); RETURN(0); } @@ -232,7 +230,8 @@ static int ost_brw_read(struct ptlrpc_request *req) struct ost_body *body; struct l_wait_info lwi; void *desc_priv = NULL; - int rc, 
cmd, i, j, objcount, niocount, size = sizeof(*body); + int cmd, i, j, objcount, niocount, size = sizeof(*body); + int rc = 0; ENTRY; body = lustre_msg_buf(req->rq_reqmsg, 0); @@ -244,7 +243,7 @@ static int ost_brw_read(struct ptlrpc_request *req) cmd = OBD_BRW_READ; if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_READ_BULK)) - GOTO(out, rc = 0); + GOTO(out, req->rq_status = -EIO); for (i = 0; i < objcount; i++) { ost_unpack_ioo(&tmp1, &ioo); @@ -252,8 +251,10 @@ static int ost_brw_read(struct ptlrpc_request *req) LBUG(); GOTO(out, rc = -EFAULT); } - for (j = 0; j < ioo->ioo_bufcnt; j++) + for (j = 0; j < ioo->ioo_bufcnt; j++) { + /* XXX verify niobuf[j].offset > niobuf[j-1].offset */ ost_unpack_niobuf(&tmp2, &remote_nb); + } } OBD_ALLOC(local_nb, sizeof(*local_nb) * niocount); @@ -264,10 +265,10 @@ static int ost_brw_read(struct ptlrpc_request *req) ioo = lustre_msg_buf(req->rq_reqmsg, 1); remote_nb = lustre_msg_buf(req->rq_reqmsg, 2); req->rq_status = obd_preprw(cmd, conn, objcount, ioo, niocount, - remote_nb, local_nb, &desc_priv); + remote_nb, local_nb, &desc_priv, NULL); if (req->rq_status) - GOTO(out, rc = 0); + GOTO(out, req->rq_status); desc = ptlrpc_prep_bulk(req->rq_connection); if (desc == NULL) @@ -285,7 +286,7 @@ static int ost_brw_read(struct ptlrpc_request *req) bulk->bp_buflen = remote_nb[i].len; } - rc = ptlrpc_send_bulk(desc); + rc = ptlrpc_bulk_put(desc); if (rc) GOTO(out_bulk, rc); @@ -298,15 +299,19 @@ static int ost_brw_read(struct ptlrpc_request *req) } req->rq_status = obd_commitrw(cmd, conn, objcount, ioo, niocount, - local_nb, desc_priv); - - rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, &req->rq_repmsg); + local_nb, desc_priv, NULL); out_bulk: ptlrpc_bulk_decref(desc); out_local: OBD_FREE(local_nb, sizeof(*local_nb) * niocount); out: + if (!rc) + /* Hmm, we don't return anything in this reply buffer? + * We should be returning per-page status codes and also + * per-object size, blocks count, mtime, ctime. 
(bug 593) */ + rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, + &req->rq_repmsg); if (rc) ptlrpc_error(req->rq_svc, req); else @@ -314,7 +319,7 @@ out: RETURN(rc); } -static int ost_brw_write(struct ptlrpc_request *req) +static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) { struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg; struct ptlrpc_bulk_desc *desc; @@ -322,16 +327,12 @@ static int ost_brw_write(struct ptlrpc_request *req) void *tmp2, *end2; struct niobuf_remote *remote_nb; struct niobuf_local *local_nb = NULL; - struct niobuf_local *lnb; struct obd_ioobj *ioo; struct ost_body *body; struct l_wait_info lwi; - int rc, cmd, i, j, objcount, niocount; - int size[2] = {sizeof(*body)}; void *desc_priv = NULL; - int reply_sent = 0; - struct ptlrpc_service *srv; - __u32 xid; + int cmd, i, j, objcount, niocount, size = sizeof(*body); + int rc = 0; ENTRY; body = lustre_msg_buf(req->rq_reqmsg, 0); @@ -342,117 +343,97 @@ static int ost_brw_write(struct ptlrpc_request *req) niocount = req->rq_reqmsg->buflens[2] / sizeof(*remote_nb); cmd = OBD_BRW_WRITE; + if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_WRITE_BULK)) + GOTO(out, req->rq_status = -EIO); + for (i = 0; i < objcount; i++) { - ost_unpack_ioo((void *)&tmp1, &ioo); + ost_unpack_ioo(&tmp1, &ioo); if (tmp2 + ioo->ioo_bufcnt > end2) { - rc = -EFAULT; - break; + LBUG(); + GOTO(out, rc = -EFAULT); + } + for (j = 0; j < ioo->ioo_bufcnt; j++) { + /* XXX verify niobuf[j].offset > niobuf[j-1].offset */ + ost_unpack_niobuf(&tmp2, &remote_nb); } - for (j = 0; j < ioo->ioo_bufcnt; j++) - ost_unpack_niobuf((void *)&tmp2, &remote_nb); } - size[1] = niocount * sizeof(*remote_nb); - rc = lustre_pack_msg(2, size, NULL, &req->rq_replen, &req->rq_repmsg); - if (rc) - GOTO(out, rc); - remote_nb = lustre_msg_buf(req->rq_repmsg, 1); - - OBD_ALLOC(local_nb, niocount * sizeof(*local_nb)); + OBD_ALLOC(local_nb, sizeof(*local_nb)* niocount); if (local_nb == NULL) GOTO(out, rc = -ENOMEM); /* The 
unpackers move tmp1 and tmp2, so reset them before using */ - tmp1 = lustre_msg_buf(req->rq_reqmsg, 1); - tmp2 = lustre_msg_buf(req->rq_reqmsg, 2); - req->rq_status = obd_preprw(cmd, conn, objcount, tmp1, niocount, tmp2, - local_nb, &desc_priv); - if (req->rq_status) - GOTO(out_free, rc = 0); /* XXX is this correct? */ + ioo = lustre_msg_buf(req->rq_reqmsg, 1); + remote_nb = lustre_msg_buf(req->rq_reqmsg, 2); + req->rq_status = obd_preprw(cmd, conn, objcount, ioo, niocount, + remote_nb, local_nb, &desc_priv, oti); - if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_WRITE_BULK)) - GOTO(fail_preprw, rc = 0); + if (req->rq_status) + GOTO(out, rc = 0); desc = ptlrpc_prep_bulk(req->rq_connection); if (desc == NULL) - GOTO(fail_preprw, rc = -ENOMEM); + GOTO(out_local, rc = -ENOMEM); desc->bd_ptl_ev_hdlr = NULL; desc->bd_portal = OSC_BULK_PORTAL; - desc->bd_desc_private = desc_priv; - memcpy(&(desc->bd_conn), &conn, sizeof(conn)); - - srv = req->rq_obd->u.ost.ost_service; - spin_lock(&srv->srv_lock); - xid = srv->srv_xid++; /* single xid for all pages */ - spin_unlock(&srv->srv_lock); - for (i = 0, lnb = local_nb; i < niocount; i++, lnb++) { - struct ptlrpc_bulk_page *bulk; + for (i = 0; i < niocount; i++) { + struct ptlrpc_bulk_page *bulk = ptlrpc_prep_bulk_page(desc); - bulk = ptlrpc_prep_bulk_page(desc); if (bulk == NULL) - GOTO(fail_bulk, rc = -ENOMEM); - - bulk->bp_xid = xid; /* single xid for all pages */ - - bulk->bp_buf = lnb->addr; - bulk->bp_page = lnb->page; - bulk->bp_flags = lnb->flags; - bulk->bp_dentry = lnb->dentry; - bulk->bp_buflen = lnb->len; - bulk->bp_cb = NULL; - - /* this advances remote_nb */ - ost_pack_niobuf((void **)&remote_nb, lnb->offset, lnb->len, 0, - bulk->bp_xid); + GOTO(out_bulk, rc = -ENOMEM); + bulk->bp_xid = remote_nb[i].xid; + bulk->bp_buf = local_nb[i].addr; + bulk->bp_buflen = remote_nb[i].len; } - rc = ptlrpc_register_bulk(desc); + rc = ptlrpc_bulk_get(desc); if (rc) - GOTO(fail_bulk, rc); - - reply_sent = 1; - ptlrpc_reply(req->rq_svc, req); + 
GOTO(out_bulk, rc); lwi = LWI_TIMEOUT(obd_timeout * HZ, ost_bulk_timeout, desc); rc = l_wait_event(desc->bd_waitq, desc->bd_flags & PTL_BULK_FL_RCVD, &lwi); if (rc) { - if (rc != -ETIMEDOUT) - LBUG(); + LASSERT(rc == -ETIMEDOUT); ptlrpc_abort_bulk(desc); recovd_conn_fail(desc->bd_connection); - obd_commitrw(cmd, conn, objcount, tmp1, niocount, local_nb, - desc->bd_desc_private); - } else { - rc = obd_commitrw(cmd, conn, objcount, tmp1, niocount, local_nb, - desc->bd_desc_private); + obd_commitrw(cmd, conn, objcount, ioo, niocount, local_nb, + desc_priv, oti); + GOTO(out_bulk, rc); } + req->rq_status = obd_commitrw(cmd, conn, objcount, ioo, niocount, + local_nb, desc_priv, oti); + + out_bulk: ptlrpc_bulk_decref(desc); - EXIT; -out_free: - OBD_FREE(local_nb, niocount * sizeof(*local_nb)); -out: - if (!reply_sent) { - if (rc) { - OBD_FREE(req->rq_repmsg, req->rq_replen); - req->rq_repmsg = NULL; - ptlrpc_error(req->rq_svc, req); - } else - ptlrpc_reply(req->rq_svc, req); - } - return rc; + out_local: + OBD_FREE(local_nb, sizeof(*local_nb) * niocount); + out: + if (!rc) + /* Hmm, we don't return anything in this reply buffer? + * We should be returning per-page status codes and also + * per-object size, blocks count, mtime, ctime. (bug 593) */ + rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, + &req->rq_repmsg); + if (rc) + ptlrpc_error(req->rq_svc, req); + else + rc = ptlrpc_reply(req->rq_svc, req); + RETURN(rc); +} -fail_bulk: - ptlrpc_free_bulk(desc); -fail_preprw: - /* FIXME: how do we undo the preprw? 
- answer = call commitrw */ - goto out_free; +inline void oti_to_request(struct obd_trans_info *oti, struct ptlrpc_request *req) +{ + if (oti && req->rq_repmsg) + req->rq_repmsg->transno = HTON__u64(oti->oti_transno); + EXIT; } static int ost_handle(struct ptlrpc_request *req) { + struct obd_trans_info trans_info = { 0, }, *oti = &trans_info; int rc; ENTRY; @@ -462,8 +443,7 @@ static int ost_handle(struct ptlrpc_request *req) GOTO(out, rc); } - if (req->rq_reqmsg->opc != OST_CONNECT && - req->rq_export == NULL) { + if (req->rq_reqmsg->opc != OST_CONNECT && req->rq_export == NULL) { CERROR("lustre_ost: operation %d on unconnected OST\n", req->rq_reqmsg->opc); req->rq_status = -ENOTCONN; @@ -487,12 +467,12 @@ static int ost_handle(struct ptlrpc_request *req) case OST_CREATE: CDEBUG(D_INODE, "create\n"); OBD_FAIL_RETURN(OBD_FAIL_OST_CREATE_NET, 0); - rc = ost_create(req); + rc = ost_create(req, oti); break; case OST_DESTROY: CDEBUG(D_INODE, "destroy\n"); OBD_FAIL_RETURN(OBD_FAIL_OST_DESTROY_NET, 0); - rc = ost_destroy(req); + rc = ost_destroy(req, oti); break; case OST_GETATTR: CDEBUG(D_INODE, "getattr\n"); @@ -502,22 +482,22 @@ static int ost_handle(struct ptlrpc_request *req) case OST_SETATTR: CDEBUG(D_INODE, "setattr\n"); OBD_FAIL_RETURN(OBD_FAIL_OST_SETATTR_NET, 0); - rc = ost_setattr(req); + rc = ost_setattr(req, oti); break; case OST_OPEN: CDEBUG(D_INODE, "open\n"); OBD_FAIL_RETURN(OBD_FAIL_OST_OPEN_NET, 0); - rc = ost_open(req); + rc = ost_open(req, oti); break; case OST_CLOSE: CDEBUG(D_INODE, "close\n"); OBD_FAIL_RETURN(OBD_FAIL_OST_CLOSE_NET, 0); - rc = ost_close(req); + rc = ost_close(req, oti); break; case OST_WRITE: CDEBUG(D_INODE, "write\n"); OBD_FAIL_RETURN(OBD_FAIL_OST_BRW_NET, 0); - rc = ost_brw_write(req); + rc = ost_brw_write(req, oti); /* ost_brw sends its own replies */ RETURN(rc); case OST_READ: @@ -529,7 +509,7 @@ static int ost_handle(struct ptlrpc_request *req) case OST_PUNCH: CDEBUG(D_INODE, "punch\n"); OBD_FAIL_RETURN(OBD_FAIL_OST_PUNCH_NET, 
0); - rc = ost_punch(req); + rc = ost_punch(req, oti); break; case OST_STATFS: CDEBUG(D_INODE, "statfs\n"); @@ -539,7 +519,8 @@ static int ost_handle(struct ptlrpc_request *req) case LDLM_ENQUEUE: CDEBUG(D_INODE, "enqueue\n"); OBD_FAIL_RETURN(OBD_FAIL_LDLM_ENQUEUE, 0); - rc = ldlm_handle_enqueue(req); + rc = ldlm_handle_enqueue(req, ldlm_server_completion_ast, + ldlm_server_blocking_ast); break; case LDLM_CONVERT: CDEBUG(D_INODE, "convert\n"); @@ -565,6 +546,20 @@ static int ost_handle(struct ptlrpc_request *req) } EXIT; + /* If we're DISCONNECTing, the export_data is already freed */ + if (!rc && req->rq_reqmsg->opc != OST_DISCONNECT) { + struct obd_device *obd = req->rq_export->exp_obd; + if ((obd->obd_flags & OBD_NO_TRANSNO) == 0) { + req->rq_repmsg->last_committed = + HTON__u64(obd->obd_last_committed); + } else { + DEBUG_REQ(D_IOCTL, req, + "not sending last_committed update"); + } + CDEBUG(D_INFO, "last_committed "LPU64", xid "LPX64"\n", + obd->obd_last_committed, HTON__u64(req->rq_xid)); + } + out: //req->rq_status = rc; if (rc) { @@ -575,51 +570,28 @@ out: CDEBUG(D_INODE, "sending reply\n"); if (req->rq_repmsg == NULL) CERROR("handler for opcode %d returned rc=0 without " - "creating rq_repmsg; needs to return rc != " - "0!\n", req->rq_reqmsg->opc); + "creating rq_repmsg; needs to return rc != 0!\n", + req->rq_reqmsg->opc); + else + oti_to_request(oti, req); ptlrpc_reply(req->rq_svc, req); } return 0; } -/* mount the file system (secretly) */ static int ost_setup(struct obd_device *obddev, obd_count len, void *buf) { - struct obd_ioctl_data* data = buf; struct ost_obd *ost = &obddev->u.ost; - struct obd_device *tgt; + struct obd_uuid self = { "self" }; int err; int i; ENTRY; - if (data->ioc_inllen1 < 1) { - CERROR("requires a TARGET OBD UUID\n"); - RETURN(-EINVAL); - } - if (data->ioc_inllen1 > 37) { - CERROR("OBD UUID must be less than 38 characters\n"); - RETURN(-EINVAL); - } - - tgt = class_uuid2obd(data->ioc_inlbuf1); - if (!tgt || !(tgt->obd_flags & 
OBD_ATTACHED) || - !(tgt->obd_flags & OBD_SET_UP)) { - CERROR("device not attached or not set up (%d)\n", - data->ioc_dev); - RETURN(err = -EINVAL); - } - - err = obd_connect(&ost->ost_conn, tgt, NULL, NULL, NULL); - if (err) { - CERROR("fail to connect to device %d\n", data->ioc_dev); - RETURN(err); - } - ost->ost_service = ptlrpc_init_svc(OST_NEVENTS, OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE, OST_REQUEST_PORTAL, OSC_REPLY_PORTAL, - "self", ost_handle, "ost"); + &self, ost_handle, "ost"); if (!ost->ost_service) { CERROR("failed to start service\n"); GOTO(error_disc, err = -ENOMEM); @@ -638,40 +610,33 @@ static int ost_setup(struct obd_device *obddev, obd_count len, void *buf) RETURN(0); error_disc: - obd_disconnect(&ost->ost_conn); RETURN(err); } static int ost_cleanup(struct obd_device * obddev) { struct ost_obd *ost = &obddev->u.ost; - int err; + int err = 0; ENTRY; - if ( !list_empty(&obddev->obd_exports) ) { - CERROR("still has clients!\n"); - RETURN(-EBUSY); - } - ptlrpc_stop_all_threads(ost->ost_service); ptlrpc_unregister_service(ost->ost_service); - err = obd_disconnect(&ost->ost_conn); - if (err) - CERROR("lustre ost: fail to disconnect device\n"); - RETURN(err); } int ost_attach(struct obd_device *dev, obd_count len, void *data) { - return lprocfs_reg_obd(dev, status_var_nm_1, dev); + struct lprocfs_static_vars lvars; + + lprocfs_init_vars(&lvars); + return lprocfs_obd_attach(dev, lvars.obd_vars); } int ost_detach(struct obd_device *dev) { - return lprocfs_dereg_obd(dev); + return lprocfs_obd_detach(dev); } /* This is so similar to mds_connect that it makes my heart weep: we should @@ -679,7 +644,7 @@ int ost_detach(struct obd_device *dev) * target_handle_connect. 
*/ static int ost_connect(struct lustre_handle *conn, - struct obd_device *obd, obd_uuid_t cluuid, + struct obd_device *obd, struct obd_uuid *cluuid, struct recovd_obd *recovd, ptlrpc_recovery_cb_t recover) { @@ -693,14 +658,15 @@ static int ost_connect(struct lustre_handle *conn, RETURN(-EINVAL); /* lctl gets a backstage, all-access pass. */ - if (!strcmp(cluuid, "OBD_CLASS_UUID")) + if (!strcmp(cluuid->uuid, "OBD_CLASS_UUID")) goto dont_check_exports; spin_lock(&obd->obd_dev_lock); list_for_each(p, &obd->obd_exports) { exp = list_entry(p, struct obd_export, exp_obd_chain); oed = &exp->exp_ost_data; - if (!memcmp(cluuid, oed->oed_uuid, sizeof oed->oed_uuid)) { + if (!memcmp(cluuid->uuid, oed->oed_uuid.uuid, + sizeof(oed->oed_uuid.uuid))) { spin_unlock(&obd->obd_dev_lock); LASSERT(exp->exp_obd == obd); @@ -716,12 +682,11 @@ static int ost_connect(struct lustre_handle *conn, LASSERT(exp); oed = &exp->exp_ost_data; - memcpy(oed->oed_uuid, cluuid, sizeof oed->oed_uuid); + memcpy(oed->oed_uuid.uuid, cluuid->uuid, sizeof(oed->oed_uuid.uuid)); RETURN(0); } - /* use obd ops to offer management infrastructure */ static struct obd_ops ost_obd_ops = { o_owner: THIS_MODULE, @@ -734,12 +699,12 @@ static struct obd_ops ost_obd_ops = { static int __init ost_init(void) { - int rc; - - rc = class_register_type(&ost_obd_ops, status_class_var, - LUSTRE_OST_NAME); - RETURN(rc); + struct lprocfs_static_vars lvars; + ENTRY; + lprocfs_init_vars(&lvars); + RETURN(class_register_type(&ost_obd_ops, lvars.module_vars, + LUSTRE_OST_NAME)); } static void __exit ost_exit(void) diff --git a/lustre/ptlbd/blk.c b/lustre/ptlbd/blk.c index 4a793436..70ea9e4 100644 --- a/lustre/ptlbd/blk.c +++ b/lustre/ptlbd/blk.c @@ -1,7 +1,8 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (c) 2002 Cluster File Systems, Inc. + * Copyright (c) 2002, 2003 Cluster File Systems, Inc. 
+ * Author: Zach Brown * * This file is part of Lustre, http://www.lustre.org. * @@ -49,6 +50,7 @@ #define LOCAL_END_REQUEST #include #include +#include #include static int ptlbd_size_size[PTLBD_MAX_MINOR]; @@ -106,6 +108,7 @@ static int ptlbd_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { struct ptlbd_obd *ptlbd; + int ret; if ( ! capable(CAP_SYS_ADMIN) ) RETURN(-EPERM); @@ -114,9 +117,16 @@ static int ptlbd_ioctl(struct inode *inode, struct file *file, if ( IS_ERR(ptlbd) ) RETURN( PTR_ERR(ptlbd) ); - /* XXX getattr{,64} */ + switch(cmd) { + case BLKFLSBUF: + ret = blk_ioctl(inode->i_rdev, cmd, arg); + break; + default: + ret = -EINVAL; + break; + } - RETURN(-EINVAL); + RETURN(ret); } static int ptlbd_release(struct inode *inode, struct file *file) diff --git a/lustre/ptlbd/client.c b/lustre/ptlbd/client.c index d57e001..a6580e0 100644 --- a/lustre/ptlbd/client.c +++ b/lustre/ptlbd/client.c @@ -1,7 +1,8 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (c) 2002 Cluster File Systems, Inc. + * Copyright (c) 2002, 2003 Cluster File Systems, Inc. + * Author: Zach Brown * * This file is part of Lustre, http://www.lustre.org. 
* @@ -36,7 +37,7 @@ static int ptlbd_cl_setup(struct obd_device *obddev, obd_count len, void *buf) struct ptlbd_obd *ptlbd = &obddev->u.ptlbd; struct obd_import *imp = &ptlbd->bd_import; struct obd_ioctl_data* data = buf; - obd_uuid_t server_uuid; + struct obd_uuid server_uuid; ENTRY; if ( ptlbd->bd_import.imp_connection != NULL ) @@ -52,10 +53,9 @@ static int ptlbd_cl_setup(struct obd_device *obddev, obd_count len, void *buf) RETURN(-EINVAL); } - memcpy(server_uuid, data->ioc_inlbuf1, MIN(data->ioc_inllen1, - sizeof(server_uuid))); + obd_str2uuid(&server_uuid, data->ioc_inlbuf1); - imp->imp_connection = ptlrpc_uuid_to_connection(server_uuid); + imp->imp_connection = ptlrpc_uuid_to_connection(&server_uuid); if (!imp->imp_connection) RETURN(-ENOENT); @@ -69,7 +69,6 @@ static int ptlbd_cl_setup(struct obd_device *obddev, obd_count len, void *buf) INIT_LIST_HEAD(&imp->imp_chain); imp->imp_last_xid = 0; imp->imp_max_transno = 0; - imp->imp_peer_last_xid = 0; imp->imp_peer_committed_transno = 0; imp->imp_level = LUSTRE_CONN_FULL; @@ -95,7 +94,7 @@ static int ptlbd_cl_cleanup(struct obd_device *obddev) #if 0 static int ptlbd_cl_connect(struct lustre_handle *conn, struct obd_device *obd, - obd_uuid_t cluuid, struct recovd_obd *recovd, + struct obd_uuid cluuid, struct recovd_obd *recovd, ptlrpc_recovery_cb_t recover) { struct ptlbd_obd *ptlbd = &obd->u.ptlbd; @@ -104,7 +103,7 @@ static int ptlbd_cl_connect(struct lustre_handle *conn, struct obd_device *obd, ENTRY; rc = class_connect(conn, obd, cluuid); - if (rc) + if (rc) RETURN(rc); INIT_LIST_HEAD(&imp->imp_chain); @@ -130,9 +129,10 @@ static struct obd_ops ptlbd_cl_obd_ops = { int ptlbd_cl_init(void) { - extern struct lprocfs_vars status_class_var[]; + struct lprocfs_static_vars lvars; - return class_register_type(&ptlbd_cl_obd_ops, status_class_var, + lprocfs_init_vars(&lvars); + return class_register_type(&ptlbd_cl_obd_ops, lvars.module_vars, OBD_PTLBD_CL_DEVICENAME); } diff --git a/lustre/ptlbd/rpc.c 
b/lustre/ptlbd/rpc.c index 5ff5177..62c0236 100644 --- a/lustre/ptlbd/rpc.c +++ b/lustre/ptlbd/rpc.c @@ -1,7 +1,8 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (c) 2002 Cluster File Systems, Inc. + * Copyright (c) 2002, 2003 Cluster File Systems, Inc. + * Author: Zach Brown * * This file is part of Lustre, http://www.lustre.org. * @@ -31,168 +32,8 @@ #include #include -static __u32 get_next_xid(struct obd_import *imp) -{ - unsigned long flags; - __u32 xid; - spin_lock_irqsave(&imp->imp_lock, flags); - xid = ++imp->imp_last_xid; - spin_unlock_irqrestore(&imp->imp_lock, flags); - return xid; -} - -static int ptlbd_brw_callback(struct obd_brw_set *set, int phase) -{ - ENTRY; - RETURN(0); -} - -static void decref_bulk_desc(void *data) -{ - struct ptlrpc_bulk_desc *desc = data; - ENTRY; - - ptlrpc_bulk_decref(desc); - EXIT; -} - -/* this is the callback function which is invoked by the Portals - * event handler associated with the bulk_sink queue and bulk_source queue. 
- */ -static void ptlbd_ptl_ev_hdlr(struct ptlrpc_bulk_desc *desc) -{ - ENTRY; - - LASSERT(desc->bd_brw_set != NULL); - LASSERT(desc->bd_brw_set->brw_callback != NULL); - - desc->bd_brw_set->brw_callback(desc->bd_brw_set, CB_PHASE_FINISH); - - prepare_work(&desc->bd_queue, decref_bulk_desc, desc); - schedule_work(&desc->bd_queue); - - EXIT; -} - - -int ptlbd_write_put_req(struct ptlbd_obd *ptlbd, ptlbd_cmd_t cmd, - struct buffer_head *first_bh, unsigned int page_count) -{ - struct obd_import *imp = &ptlbd->bd_import; - struct ptlbd_op *op; - struct ptlbd_niob *niob, *niobs; - struct ptlbd_rsp *rsp; - struct ptlrpc_request *req; - struct ptlrpc_bulk_desc *desc; - struct buffer_head *bh; - int rc, size[2]; - struct obd_brw_set *set; - ENTRY; - - size[0] = sizeof(struct ptlbd_op); - size[1] = page_count * sizeof(struct ptlbd_niob); - - req = ptlrpc_prep_req(imp, cmd, 2, size, NULL); - if (!req) - GOTO(out, rc = -ENOMEM); - /* XXX might not need these */ - req->rq_request_portal = PTLBD_REQUEST_PORTAL; - req->rq_reply_portal = PTLBD_REPLY_PORTAL; - - op = lustre_msg_buf(req->rq_reqmsg, 0); - niobs = lustre_msg_buf(req->rq_reqmsg, 1); - - /* XXX pack */ - op->op_cmd = cmd; - op->op_lun = 0; - op->op_niob_cnt = page_count; - op->op__padding = 0; - op->op_block_cnt = page_count; - - desc = ptlrpc_prep_bulk(imp->imp_connection); - if ( desc == NULL ) - GOTO(out_req, rc = -ENOMEM); - desc->bd_portal = PTLBD_BULK_PORTAL; - desc->bd_ptl_ev_hdlr = ptlbd_ptl_ev_hdlr; - - /* XXX someone needs to free this */ - set = obd_brw_set_new(); - if (set == NULL) - GOTO(out_desc, rc = -ENOMEM); - - set->brw_callback = ptlbd_brw_callback; - -#if 0 - xid = get_next_xid(imp); -#endif - - for ( niob = niobs, bh = first_bh ; bh ; bh = bh->b_next, niob++ ) { -#if 0 - struct ptlrpc_bulk_page *bulk = ptlrpc_prep_bulk_page(desc); - if (bulk == NULL) - GOTO(out_set, rc = -ENOMEM); -#endif - -#if 0 - niob->n_xid = xid; -#endif - niob->n_block_nr = bh->b_blocknr; - niob->n_offset = bh_offset(bh); - 
niob->n_length = bh->b_size; - - -#if 0 - bulk->bp_xid = xid; - bulk->bp_buf = bh->b_data; - bulk->bp_page = bh->b_page; - bulk->bp_buflen = bh->b_size; -#endif - } - - - size[0] = sizeof(struct ptlbd_rsp); - size[1] = sizeof(struct ptlbd_niob) * page_count; - req->rq_replen = lustre_msg_size(2, size); - - /* XXX find out how we're really supposed to manage levels */ - req->rq_level = imp->imp_level; - rc = ptlrpc_queue_wait(req); - - rsp = lustre_msg_buf(req->rq_repmsg, 0); - - niob = lustre_msg_buf(req->rq_repmsg, 1); - /* XXX check that op->num matches ours */ - for ( bh = first_bh ; bh ; bh = bh->b_next, niob++ ) { - struct ptlrpc_bulk_page *bulk = ptlrpc_prep_bulk_page(desc); - if (bulk == NULL) - GOTO(out_set, rc = -ENOMEM); - - bulk->bp_xid = niob->n_xid; - bulk->bp_page = bh->b_page; - bulk->bp_buf = bh->b_data; - bulk->bp_buflen = bh->b_size; - } - - obd_brw_set_add(set, desc); - rc = ptlrpc_send_bulk(desc); - - /* if there's an error, no brw_finish called, just like - * osc_brw_read */ - - GOTO(out_req, rc); - -out_set: - obd_brw_set_free(set); -out_desc: - ptlrpc_bulk_decref(desc); -out_req: - ptlrpc_req_finished(req); -out: - RETURN(rc); -} - -int ptlbd_read_put_req(struct ptlbd_obd *ptlbd, ptlbd_cmd_t cmd, - struct buffer_head *first_bh, unsigned int page_count) +int ptlbd_send_req(struct ptlbd_obd *ptlbd, ptlbd_cmd_t cmd, + struct buffer_head *first_bh) { struct obd_import *imp = &ptlbd->bd_import; struct ptlbd_op *op; @@ -201,20 +42,23 @@ int ptlbd_read_put_req(struct ptlbd_obd *ptlbd, ptlbd_cmd_t cmd, struct ptlrpc_request *req; struct ptlrpc_bulk_desc *desc; struct buffer_head *bh; + unsigned long flags; + unsigned int page_count; int rc, rep_size, size[2]; - struct obd_brw_set *set; __u32 xid; ENTRY; + LASSERT(cmd == PTLBD_READ || cmd == PTLBD_WRITE); + + for ( page_count = 0, bh = first_bh ; bh ; bh = bh->b_next ) + page_count++; + size[0] = sizeof(struct ptlbd_op); size[1] = page_count * sizeof(struct ptlbd_niob); req = ptlrpc_prep_req(imp, cmd, 
2, size, NULL); if (!req) - GOTO(out, rc = -ENOMEM); - /* XXX might not need these? */ - req->rq_request_portal = PTLBD_REQUEST_PORTAL; - req->rq_reply_portal = PTLBD_REPLY_PORTAL; + RETURN(-ENOMEM); op = lustre_msg_buf(req->rq_reqmsg, 0); niobs = lustre_msg_buf(req->rq_reqmsg, 1); @@ -230,21 +74,16 @@ int ptlbd_read_put_req(struct ptlbd_obd *ptlbd, ptlbd_cmd_t cmd, if ( desc == NULL ) GOTO(out_req, rc = -ENOMEM); desc->bd_portal = PTLBD_BULK_PORTAL; - desc->bd_ptl_ev_hdlr = ptlbd_ptl_ev_hdlr; - - /* XXX someone needs to free this */ - set = obd_brw_set_new(); - if (set == NULL) - GOTO(out_desc, rc = -ENOMEM); - - set->brw_callback = ptlbd_brw_callback; + desc->bd_ptl_ev_hdlr = NULL; - xid = get_next_xid(imp); + spin_lock_irqsave(&imp->imp_lock, flags); + xid = ++imp->imp_last_xid; + spin_unlock_irqrestore(&imp->imp_lock, flags); for ( niob = niobs, bh = first_bh ; bh ; bh = bh->b_next, niob++ ) { struct ptlrpc_bulk_page *bulk = ptlrpc_prep_bulk_page(desc); if (bulk == NULL) - GOTO(out_set, rc = -ENOMEM); + GOTO(out_req, rc = -ENOMEM); niob->n_xid = xid; niob->n_block_nr = bh->b_blocknr; @@ -257,12 +96,13 @@ int ptlbd_read_put_req(struct ptlbd_obd *ptlbd, ptlbd_cmd_t cmd, bulk->bp_buflen = bh->b_size; } - /* XXX put in OBD_FAIL_CHECK for ptlbd? 
*/ - rc = ptlrpc_register_bulk(desc); - if (rc) - GOTO(out_set, rc); + if ( cmd == PTLBD_READ ) + rc = ptlrpc_register_bulk_put(desc); + else + rc = ptlrpc_register_bulk_get(desc); - obd_brw_set_add(set, desc); + if (rc) + GOTO(out_desc, rc); rep_size = sizeof(struct ptlbd_rsp); req->rq_replen = lustre_msg_size(1, &rep_size); @@ -271,48 +111,15 @@ int ptlbd_read_put_req(struct ptlbd_obd *ptlbd, ptlbd_cmd_t cmd, req->rq_level = imp->imp_level; rc = ptlrpc_queue_wait(req); - rsp = lustre_msg_buf(req->rq_repmsg, 0); - - /* if there's an error, no brw_finish called, just like - * osc_brw_read */ - - GOTO(out_req, rc); + if ( rc == 0 ) { + rsp = lustre_msg_buf(req->rq_repmsg, 0); + /* XXX do stuff */ + } -out_set: - obd_brw_set_free(set); out_desc: ptlrpc_bulk_decref(desc); out_req: ptlrpc_req_finished(req); -out: - RETURN(rc); -} - -int ptlbd_send_req(struct ptlbd_obd *ptlbd, ptlbd_cmd_t cmd, - struct buffer_head *first_bh) -{ - unsigned int page_count = 0; - struct buffer_head *bh; - int rc; - ENTRY; - - for ( page_count = 0, bh = first_bh ; bh ; bh = bh->b_next ) - page_count++; - - switch (cmd) { - case PTLBD_READ: - rc = ptlbd_read_put_req(ptlbd, cmd, - first_bh, page_count); - break; - case PTLBD_WRITE: - rc = ptlbd_write_put_req(ptlbd, cmd, - first_bh, page_count); - break; - default: - rc = -EINVAL; - break; - }; - RETURN(rc); } @@ -326,108 +133,63 @@ static int ptlbd_bulk_timeout(void *data) RETURN(1); } -#define SILLY_MAX 2048 -static struct page *pages[SILLY_MAX] = {NULL,}; - -static struct page * fake_page(int block_nr) -{ - if ( block_nr >= SILLY_MAX ) - return NULL; - - if (pages[block_nr] == NULL) { - void *vaddr = (void *)get_free_page(GFP_KERNEL); - pages[block_nr] = virt_to_page(vaddr); - } - return pages[block_nr]; -} - -static int ptlbd_put_write(struct ptlrpc_request *req) +void ptlbd_do_filp(struct file *filp, int op, struct ptlbd_niob *niobs, + int page_count, struct list_head *page_list) { - struct lustre_handle *conn = (struct lustre_handle 
*)req->rq_reqmsg; - struct ptlbd_op *op; - struct ptlbd_niob *reply_niob, *request_niob; - struct ptlbd_rsp *rsp; - struct ptlrpc_bulk_desc *desc; - struct ptlrpc_service *srv; - struct l_wait_info lwi; - int size[2]; - int i, page_count, rc; - __u32 xid; + mm_segment_t old_fs; + struct list_head *pos; + ENTRY; - op = lustre_msg_buf(req->rq_reqmsg, 0); - request_niob = lustre_msg_buf(req->rq_reqmsg, 1); - page_count = req->rq_reqmsg->buflens[1] / sizeof(struct ptlbd_niob); + old_fs = get_fs(); + set_fs(KERNEL_DS); - size[0] = sizeof(struct ptlbd_rsp); - size[1] = sizeof(struct ptlbd_niob) * page_count; - rc = lustre_pack_msg(2, size, NULL, &req->rq_replen, &req->rq_repmsg); - if (rc) - GOTO(out, rc); - reply_niob = lustre_msg_buf(req->rq_repmsg, 1); + list_for_each(pos, page_list) { + ssize_t ret; + struct page *page = list_entry(pos, struct page, list); + loff_t offset = (niobs->n_block_nr << PAGE_SHIFT) + + niobs->n_offset; - desc = ptlrpc_prep_bulk(req->rq_connection); - if (desc == NULL) - GOTO(out, rc = -ENOMEM); - desc->bd_ptl_ev_hdlr = NULL; - desc->bd_portal = PTLBD_BULK_PORTAL; - memcpy(&(desc->bd_conn), &conn, sizeof(conn)); /* XXX what? 
*/ - - srv = req->rq_obd->u.ptlbd.ptlbd_service; - spin_lock(&srv->srv_lock); - xid = srv->srv_xid++; /* single xid for all pages */ - spin_unlock(&srv->srv_lock); - - for ( i = 0; i < page_count; i++) { - struct ptlrpc_bulk_page *bulk = ptlrpc_prep_bulk_page(desc); - if (bulk == NULL) - GOTO(out_desc, rc = -ENOMEM); - - reply_niob[i] = request_niob[i]; - reply_niob[i].n_xid = xid; + if ( op == PTLBD_READ ) + ret = filp->f_op->read(filp, page_address(page), + niobs->n_length, &offset); + else + ret = filp->f_op->write(filp, page_address(page), + niobs->n_length, &offset); - bulk->bp_xid = xid; - bulk->bp_page = fake_page(request_niob[i].n_block_nr); - bulk->bp_buf = page_address(bulk->bp_page); - bulk->bp_buflen = request_niob[i].n_length; + niobs++; } - rc = ptlrpc_register_bulk(desc); - if ( rc ) - GOTO(out_desc, rc); - - rsp = lustre_msg_buf(req->rq_reqmsg, 0); - rsp->r_status = 42; - rsp->r_error_cnt = 13; - ptlrpc_reply(req->rq_svc, req); - - /* this synchronization probably isn't good enough */ - lwi = LWI_TIMEOUT(obd_timeout * HZ, ptlbd_bulk_timeout, desc); - rc = l_wait_event(desc->bd_waitq, desc->bd_flags &PTL_BULK_FL_RCVD, - &lwi); - -out_desc: - ptlrpc_free_bulk(desc); -out: - RETURN(rc); + set_fs(old_fs); + EXIT; } -static int ptlbd_put_read(struct ptlrpc_request *req) +int ptlbd_parse_req(struct ptlrpc_request *req) { struct ptlbd_op *op; struct ptlbd_niob *niob, *niobs; struct ptlbd_rsp *rsp; struct ptlrpc_bulk_desc *desc; + struct file *filp = req->rq_obd->u.ptlbd.filp; struct l_wait_info lwi; - int size[1]; - int i, page_count, rc; + int size[1], wait_flag, i, page_count, rc; + struct list_head *pos, *n; + LIST_HEAD(tmp_pages); + ENTRY; + + rc = lustre_unpack_msg(req->rq_reqmsg, req->rq_reqlen); + if ( rc ) + RETURN(rc); op = lustre_msg_buf(req->rq_reqmsg, 0); + LASSERT(op->op_cmd == PTLBD_READ || op->op_cmd == PTLBD_WRITE); + niobs = lustre_msg_buf(req->rq_reqmsg, 1); page_count = req->rq_reqmsg->buflens[1] / sizeof(struct ptlbd_niob); desc = 
ptlrpc_prep_bulk(req->rq_connection); if (desc == NULL) GOTO(out, rc = -ENOMEM); + desc->bd_ptl_ev_hdlr = NULL; desc->bd_portal = PTLBD_BULK_PORTAL; for ( i = 0, niob = niobs ; i < page_count; niob++, i++) { @@ -435,23 +197,34 @@ static int ptlbd_put_read(struct ptlrpc_request *req) if (bulk == NULL) GOTO(out_bulk, rc = -ENOMEM); + bulk->bp_page = alloc_page(GFP_KERNEL); + if (bulk->bp_page == NULL) + GOTO(out_bulk, rc = -ENOMEM); + list_add(&bulk->bp_page->list, &tmp_pages); + /* * XXX what about the block number? */ bulk->bp_xid = niob->n_xid; - bulk->bp_page = fake_page(niob->n_block_nr); bulk->bp_buf = page_address(bulk->bp_page); bulk->bp_buflen = niob->n_length; } - rc = ptlrpc_send_bulk(desc); + if ( op->op_cmd == PTLBD_READ ) { + ptlbd_do_filp(filp, PTLBD_READ, niobs, page_count, &tmp_pages); + rc = ptlrpc_bulk_put(desc); + wait_flag = PTL_BULK_FL_SENT; + } else { + rc = ptlrpc_bulk_get(desc); + wait_flag = PTL_BULK_FL_RCVD; + } + if ( rc ) GOTO(out_bulk, rc); /* this synchronization probably isn't good enough */ lwi = LWI_TIMEOUT(obd_timeout * HZ, ptlbd_bulk_timeout, desc); - rc = l_wait_event(desc->bd_waitq, desc->bd_flags &PTL_BULK_FL_SENT, - &lwi); + rc = l_wait_event(desc->bd_waitq, desc->bd_flags & wait_flag, &lwi); size[0] = sizeof(struct ptlbd_rsp); rc = lustre_pack_msg(1, size, NULL, &req->rq_replen, &req->rq_repmsg); @@ -461,6 +234,8 @@ static int ptlbd_put_read(struct ptlrpc_request *req) rsp = lustre_msg_buf(req->rq_repmsg, 0); if ( rsp == NULL ) GOTO(out, rc = -EINVAL); + + ptlbd_do_filp(filp, PTLBD_WRITE, niobs, page_count, &tmp_pages); rsp->r_error_cnt = 42; rsp->r_status = 69; @@ -469,82 +244,12 @@ static int ptlbd_put_read(struct ptlrpc_request *req) ptlrpc_reply(req->rq_svc, req); out_bulk: - ptlrpc_free_bulk(desc); -out: - RETURN(rc); -} - - -int ptlbd_parse_req(struct ptlrpc_request *req) -{ - struct ptlbd_op *op; - int rc; - ENTRY; - - rc = lustre_unpack_msg(req->rq_reqmsg, req->rq_reqlen); - if ( rc ) - RETURN(rc); - - op = 
lustre_msg_buf(req->rq_reqmsg, 0); - - switch(op->op_cmd) { - case PTLBD_READ: - ptlbd_put_read(req); - break; - case PTLBD_WRITE: - ptlbd_put_write(req); - break; - default: - CERROR("fix this %d\n", op->op_cmd); - break; + list_for_each_safe(pos, n, &tmp_pages) { + struct page *page = list_entry(pos, struct page, list); + list_del(&page->list); + __free_page(page); } - - RETURN(0); -} - - -#if 0 -int ptlbd_bh_req(int cmd, struct ptlbd_state *st, struct buffer_head *first_bh) -{ - struct obd_brw_set *set = NULL; - struct brw_page *pg = NULL; - struct buffer_head *bh; - int rc, i, pg_bytes = 0; - ENTRY; - - for ( bh = first_bh ; bh ; bh = bh->b_reqnext ) - pg_bytes += sizeof(struct brw_page); - - OBD_ALLOC(pg, pg_bytes); - if ( pg == NULL ) - GOTO(out, rc = -ENOMEM); - - set = obd_brw_set_new(); - if (set == NULL) - GOTO(out, rc = -ENOMEM); - - for ( i = 0, bh = first_bh ; bh ; bh = bh->b_reqnext, i++) { - pg[i].pg = bh->b_page; - pg[i].off = bh_offset(bh); - pg[i].count = bh->b_size; - pg[i].flag = 0; - } - - set->brw_callback = ll_brw_sync_wait; - rc = obd_brw(cmd, /* lsm */NULL, num_pages, pg, set); - if ( rc ) - GOTO(out, rc); - - rc = ll_brw_sync_wait(set, CB_PHASE_START); - if (rc) - CERROR("error from callback: rc = %d\n", rc); - + ptlrpc_bulk_decref(desc); out: - if ( pg != NULL ) - OBD_FREE(pg, pg_bytes); - if ( set != NULL ) - obd_brw_set_free(set); - - RETURN(rc); + RETURN(rc); } -#endif diff --git a/lustre/ptlbd/server.c b/lustre/ptlbd/server.c index 422f0e1..78d01a6 100644 --- a/lustre/ptlbd/server.c +++ b/lustre/ptlbd/server.c @@ -1,7 +1,8 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (c) 2002 Cluster File Systems, Inc. + * Copyright (c) 2002, 2003 Cluster File Systems, Inc. + * Author: Zach Brown * * This file is part of Lustre, http://www.lustre.org. 
* @@ -31,79 +32,44 @@ #include #include -#if 0 -static int ptlbd_sv_callback(struct ptlrpc_request *req) -{ - int rc; - ENTRY; - - rc = ptlbd_parse_request(req); - - rc = lustre_unpack_msg(req->rq_reqmsg, req->rq_reqlen); - if ( rc ) - GOTO(out, rc); - - printk("callback got a friggin opc %d\n", req->rq_reqmsg->opc); - -out: - RETURN(rc); -} -#endif - static int ptlbd_sv_already_setup = 1; static int ptlbd_sv_setup(struct obd_device *obddev, obd_count len, void *buf) { -#if 0 - struct obd_ioctl_data* data = buf; - obd_uuid_t server_uuid; -#endif + struct obd_uuid self_uuid = { "self" }; struct ptlbd_obd *ptlbd = &obddev->u.ptlbd; int rc; ENTRY; -#if 0 - if (data->ioc_inllen1 < 1) { - CERROR("requires a PTLBD server UUID\n"); - RETURN(rc = -EINVAL); - } - - if (data->ioc_inllen1 > 37) { - CERROR("PTLBD server UUID must be less than 38 characters\n"); - RETURN(rc = -EINVAL); - } - - memcpy(server_uuid, data->ioc_inlbuf1, MIN(data->ioc_inllen1, - sizeof(server_uuid))); + ptlbd->filp = filp_open("/tmp/ptlbd-backing-file-la-la-la", + O_RDWR|O_CREAT, 0600); + if ( IS_ERR(ptlbd->filp) ) + RETURN(PTR_ERR(ptlbd->filp)); -#endif ptlbd->ptlbd_service = ptlrpc_init_svc(PTLBD_NEVENTS, PTLBD_NBUFS, PTLBD_BUFSIZE, PTLBD_MAXREQSIZE, PTLBD_REQUEST_PORTAL, - PTLBD_REPLY_PORTAL, "self", + PTLBD_REPLY_PORTAL, &self_uuid, ptlbd_parse_req, "ptlbd_sv"); - if (!ptlbd->ptlbd_service) { - CERROR("failed to start service\n"); - RETURN(rc = -ENOMEM); - } + if (ptlbd->ptlbd_service == NULL) + GOTO(out_filp, rc = -ENOMEM); rc = ptlrpc_start_thread(obddev, ptlbd->ptlbd_service, "ptldb"); - if (rc) { - CERROR("cannot start PTLBD thread: rc %d\n", rc); - LBUG(); + if (rc != 0) GOTO(out_thread, rc); - } ptlbd_sv_already_setup = 1; RETURN(0); - out_thread: +out_thread: ptlrpc_stop_all_threads(ptlbd->ptlbd_service); ptlrpc_unregister_service(ptlbd->ptlbd_service); +out_filp: + filp_close(ptlbd->filp, NULL); - return rc; + RETURN(rc); } static int ptlbd_sv_cleanup(struct obd_device *obddev) @@ -115,36 
+81,25 @@ static int ptlbd_sv_cleanup(struct obd_device *obddev) ptlrpc_stop_all_threads(ptlbd->ptlbd_service); ptlrpc_unregister_service(ptlbd->ptlbd_service); + if ( ! IS_ERR(ptlbd->filp) ) + filp_close(ptlbd->filp, NULL); ptlbd_sv_already_setup = 0; RETURN(0); } -#if 0 -static int ptlbd_sv_connect(struct lustre_handle *conn, struct obd_device *src, - obd_uuid_t cluuid, struct recovd_obd *recovd, - ptlrpc_recovery_cb_t recover) -{ - return class_connect(conn, src, cluuid); -} -#endif - static struct obd_ops ptlbd_sv_obd_ops = { o_owner: THIS_MODULE, -/* o_iocontrol: ptlbd_iocontrol,*/ o_setup: ptlbd_sv_setup, o_cleanup: ptlbd_sv_cleanup, -#if 0 - o_connect: ptlbd_sv_connect, - o_disconnect: class_disconnect -#endif }; int ptlbd_sv_init(void) { - extern struct lprocfs_vars status_class_var[]; + struct lprocfs_static_vars lvars; - return class_register_type(&ptlbd_sv_obd_ops, status_class_var, + lprocfs_init_vars(&lvars); + return class_register_type(&ptlbd_sv_obd_ops, lvars.module_vars, OBD_PTLBD_SV_DEVICENAME); } diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 1d6c719..48e11b5 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -1,7 +1,7 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (C) 2002 Cluster File Systems, Inc. + * Copyright (c) 2002, 2003 Cluster File Systems, Inc. * * This file is part of Lustre, http://www.lustre.org. 
* @@ -36,42 +36,43 @@ void ptlrpc_init_client(int req_portal, int rep_portal, char *name, cl->cli_name = name; } -__u8 *ptlrpc_req_to_uuid(struct ptlrpc_request *req) +struct obd_uuid *ptlrpc_req_to_uuid(struct ptlrpc_request *req) { - return req->rq_connection->c_remote_uuid; + return &req->rq_connection->c_remote_uuid; } -struct ptlrpc_connection *ptlrpc_uuid_to_connection(obd_uuid_t uuid) +struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid) { struct ptlrpc_connection *c; struct lustre_peer peer; int err; - err = kportal_uuid_to_peer(uuid, &peer); + err = kportal_uuid_to_peer(uuid->uuid, &peer); if (err != 0) { - CERROR("cannot find peer %s!\n", uuid); + CERROR("cannot find peer %s!\n", uuid->uuid); return NULL; } c = ptlrpc_get_connection(&peer, uuid); if (c) { - memcpy(c->c_remote_uuid, uuid, sizeof(c->c_remote_uuid)); + memcpy(c->c_remote_uuid.uuid, + uuid->uuid, sizeof(c->c_remote_uuid.uuid)); c->c_epoch++; } - CDEBUG(D_INFO, "%s -> %p\n", uuid, c); + CDEBUG(D_INFO, "%s -> %p\n", uuid->uuid, c); return c; } -void ptlrpc_readdress_connection(struct ptlrpc_connection *conn,obd_uuid_t uuid) +void ptlrpc_readdress_connection(struct ptlrpc_connection *conn,struct obd_uuid *uuid) { struct lustre_peer peer; int err; - err = kportal_uuid_to_peer(uuid, &peer); + err = kportal_uuid_to_peer(uuid->uuid, &peer); if (err != 0) { - CERROR("cannot find peer %s!\n", uuid); + CERROR("cannot find peer %s!\n", uuid->uuid); return; } @@ -189,12 +190,13 @@ static int ll_sync_brw_timeout(void *data) if (PtlMDUnlink(desc->bd_md_h) != 0) { CERROR("Near-miss on OST %s -- need to adjust " "obd_timeout?\n", - desc->bd_connection->c_remote_uuid); + desc->bd_connection->c_remote_uuid.uuid); continue; } CERROR("IO of %d pages to/from %s:%d (conn %p) timed out\n", - desc->bd_page_count, desc->bd_connection->c_remote_uuid, + desc->bd_page_count, + desc->bd_connection->c_remote_uuid.uuid, desc->bd_portal, desc->bd_connection); /* This one will "never" arrive, don't wait 
for it. */ @@ -259,7 +261,6 @@ struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, int opcode, { struct ptlrpc_connection *conn; struct ptlrpc_request *request; - unsigned long flags; int rc; ENTRY; @@ -284,7 +285,7 @@ struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, int opcode, request->rq_type = PTL_RPC_MSG_REQUEST; request->rq_import = imp; - /* XXX FIXME bug 625069 */ + /* XXX FIXME bug 625069, now 249 */ request->rq_request_portal = imp->imp_client->cli_request_portal; request->rq_reply_portal = imp->imp_client->cli_reply_portal; @@ -293,10 +294,6 @@ struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, int opcode, INIT_LIST_HEAD(&request->rq_list); atomic_set(&request->rq_refcount, 1); - spin_lock_irqsave(&imp->imp_lock, flags); - request->rq_xid = HTON__u32(++imp->imp_last_xid); - spin_unlock_irqrestore(&imp->imp_lock, flags); - request->rq_reqmsg->magic = PTLRPC_MSG_MAGIC; request->rq_reqmsg->version = PTLRPC_MSG_VERSION; request->rq_reqmsg->opc = HTON__u32(opcode); @@ -317,7 +314,7 @@ static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked) if (atomic_read(&request->rq_refcount) != 0) { CERROR("freeing request %p (%d->%s:%d) with refcount %d\n", request, request->rq_reqmsg->opc, - request->rq_connection->c_remote_uuid, + request->rq_connection->c_remote_uuid.uuid, request->rq_import->imp_client->cli_request_portal, atomic_read (&request->rq_refcount)); /* LBUG(); */ @@ -428,7 +425,7 @@ static int ptlrpc_check_status(struct ptlrpc_request *req) } if (err < 0) { - DEBUG_REQ(D_ERROR, req, "status is %d", err); + DEBUG_REQ(D_INFO, req, "status is %d", err); } else if (err > 0) { /* XXX: translate this error from net to host */ DEBUG_REQ(D_INFO, req, "status is %d", err); @@ -466,12 +463,14 @@ void ptlrpc_free_committed(struct obd_import *imp) struct ptlrpc_request *req; ENTRY; + LASSERT(imp != NULL); + #ifdef CONFIG_SMP LASSERT(spin_is_locked(&imp->imp_lock)); #endif - CDEBUG(D_HA, "committing for xid "LPU64", 
last_committed "LPU64"\n", - imp->imp_peer_last_xid, imp->imp_peer_committed_transno); + CDEBUG(D_HA, "committing for last_committed "LPU64"\n", + imp->imp_peer_committed_transno); list_for_each_safe(tmp, saved, &imp->imp_replay_list) { req = list_entry(tmp, struct ptlrpc_request, rq_list); @@ -489,6 +488,7 @@ void ptlrpc_free_committed(struct obd_import *imp) DEBUG_REQ(D_HA, req, "committing (last_committed "LPU64")", imp->imp_peer_committed_transno); + list_del_init(&req->rq_list); __ptlrpc_req_finished(req, 1); } @@ -517,7 +517,7 @@ void ptlrpc_cleanup_client(struct obd_import *imp) __ptlrpc_req_finished(req, 0); } spin_unlock_irqrestore(&imp->imp_lock, flags); - + EXIT; return; } @@ -573,9 +573,8 @@ static int expired_request(void *data) req->rq_flags |= PTL_RPC_FL_TIMEOUT; if (!req->rq_import) { - DEBUG_REQ(D_ERROR, req, "NULL import"); - LBUG(); - RETURN(0); + DEBUG_REQ(D_HA, req, "NULL import; already cleaned up?"); + RETURN(1); } if (!req->rq_import->imp_connection) { @@ -605,6 +604,50 @@ static int interrupted_request(void *data) RETURN(1); /* ignored, as of this writing */ } +struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req) +{ + ENTRY; + atomic_inc(&req->rq_refcount); + RETURN(req); +} + +void ptlrpc_retain_replayable_request(struct ptlrpc_request *req, + struct obd_import *imp) +{ + struct list_head *tmp; + +#ifdef CONFIG_SMP + LASSERT(spin_is_locked(&imp->imp_lock)); +#endif + + LASSERT(imp->imp_flags & IMP_REPLAYABLE); + /* Balanced in ptlrpc_free_committed, usually. */ + ptlrpc_request_addref(req); + list_for_each_prev(tmp, &imp->imp_replay_list) { + struct ptlrpc_request *iter = + list_entry(tmp, struct ptlrpc_request, rq_list); + + /* We may have duplicate transnos if we create and then + * open a file, or for closes retained if to match creating + * opens, so use req->rq_xid as a secondary key. + * (See bugs 684, 685, and 428.) 
+ */ + if (iter->rq_transno > req->rq_transno) + continue; + + if (iter->rq_transno == req->rq_transno) { + LASSERT(iter->rq_xid != req->rq_xid); + if (iter->rq_xid > req->rq_xid) + continue; + } + + list_add(&req->rq_list, &iter->rq_list); + return; + } + + list_add_tail(&req->rq_list, &imp->imp_replay_list); +} + int ptlrpc_queue_wait(struct ptlrpc_request *req) { int rc = 0; @@ -616,15 +659,19 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req) init_waitqueue_head(&req->rq_wait_for_rep); + spin_lock_irqsave(&imp->imp_lock, flags); + req->rq_xid = HTON__u32(++imp->imp_last_xid); + spin_unlock_irqrestore(&imp->imp_lock, flags); + /* for distributed debugging */ - req->rq_reqmsg->status = HTON__u32(current->pid); + req->rq_reqmsg->status = HTON__u32(current->pid); CDEBUG(D_RPCTRACE, "Sending RPC pid:xid:nid:opc %d:"LPU64":%x:%d\n", NTOH__u32(req->rq_reqmsg->status), req->rq_xid, conn->c_peer.peer_nid, NTOH__u32(req->rq_reqmsg->opc)); spin_lock_irqsave(&imp->imp_lock, flags); - /* + /* * If the import has been invalidated (such as by an OST failure), the * request must fail with -EIO. */ @@ -646,12 +693,15 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req) (req->rq_level <= imp->imp_level) || (req->rq_flags & PTL_RPC_FL_ERR), &lwi); - spin_lock_irqsave(&imp->imp_lock, flags); - list_del_init(&req->rq_list); - if (req->rq_flags & PTL_RPC_FL_ERR) rc = -EIO; + if (!req->rq_import) + RETURN(rc); + + spin_lock_irqsave(&imp->imp_lock, flags); + list_del_init(&req->rq_list); + if (rc) { spin_unlock_irqrestore(&imp->imp_lock, flags); RETURN(rc); @@ -756,24 +806,18 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req) spin_lock_irqsave(&imp->imp_lock, flags); if ((req->rq_flags & PTL_RPC_FL_REPLAY || req->rq_transno != 0) && rc >= 0) { - /* Balanced in ptlrpc_free_committed, usually. 
*/ - atomic_inc(&req->rq_refcount); - list_add_tail(&req->rq_list, &imp->imp_replay_list); + ptlrpc_retain_replayable_request(req, imp); } if (req->rq_transno > imp->imp_max_transno) { imp->imp_max_transno = req->rq_transno; - } else if (req->rq_transno != 0 && - imp->imp_level == LUSTRE_CONN_FULL) { - CDEBUG(D_HA, "got transno "LPD64" after "LPD64 - ": recovery may not work\n", req->rq_transno, - imp->imp_max_transno); } /* Replay-enabled imports return commit-status information. */ - imp->imp_peer_last_xid = req->rq_repmsg->last_xid; - imp->imp_peer_committed_transno = - req->rq_repmsg->last_committed; + if (req->rq_repmsg->last_committed) { + imp->imp_peer_committed_transno = + req->rq_repmsg->last_committed; + } ptlrpc_free_committed(imp); spin_unlock_irqrestore(&imp->imp_lock, flags); } @@ -847,7 +891,7 @@ int ptlrpc_replay_req(struct ptlrpc_request *req) } /* XXX looks a lot like super.c:invalidate_request_list, don't it? */ -void ptlrpc_abort_inflight(struct obd_import *imp) +void ptlrpc_abort_inflight(struct obd_import *imp, int dying_import) { unsigned long flags; struct list_head *tmp, *n; @@ -866,6 +910,8 @@ void ptlrpc_abort_inflight(struct obd_import *imp) DEBUG_REQ(D_HA, req, "inflight"); req->rq_flags |= PTL_RPC_FL_ERR; + if (dying_import) + req->rq_import = NULL; wake_up(&req->rq_wait_for_rep); } @@ -875,6 +921,8 @@ void ptlrpc_abort_inflight(struct obd_import *imp) DEBUG_REQ(D_HA, req, "aborting waiting req"); req->rq_flags |= PTL_RPC_FL_ERR; + if (dying_import) + req->rq_import = NULL; wake_up(&req->rq_wait_for_rep); } } diff --git a/lustre/ptlrpc/connection.c b/lustre/ptlrpc/connection.c index 2182591..b2d204d 100644 --- a/lustre/ptlrpc/connection.c +++ b/lustre/ptlrpc/connection.c @@ -32,18 +32,20 @@ static struct list_head conn_unused_list; /* If UUID is NULL, c->c_remote_uuid must be all zeroes * If UUID is non-NULL, c->c_remote_uuid must match. 
*/ -static int match_connection_uuid(struct ptlrpc_connection *c, obd_uuid_t uuid) +static int match_connection_uuid(struct ptlrpc_connection *c, struct obd_uuid *uuid) { - obd_uuid_t zero_uuid = {0}; + struct obd_uuid zero_uuid; + memset(&zero_uuid, 0, sizeof(zero_uuid)); if (uuid) - return memcmp(c->c_remote_uuid, uuid, sizeof(uuid)); + return memcmp(c->c_remote_uuid.uuid, uuid->uuid, + sizeof(uuid->uuid)); - return memcmp(c->c_remote_uuid, zero_uuid, sizeof(zero_uuid)); + return memcmp(c->c_remote_uuid.uuid, &zero_uuid, sizeof(zero_uuid)); } struct ptlrpc_connection *ptlrpc_get_connection(struct lustre_peer *peer, - obd_uuid_t uuid) + struct obd_uuid *uuid) { struct list_head *tmp, *pos; struct ptlrpc_connection *c; @@ -83,8 +85,8 @@ struct ptlrpc_connection *ptlrpc_get_connection(struct lustre_peer *peer, c->c_epoch = 1; c->c_bootcount = 0; c->c_flags = 0; - if (uuid) - strcpy(c->c_remote_uuid, uuid); + if (uuid->uuid) + obd_str2uuid(&c->c_remote_uuid, uuid->uuid); INIT_LIST_HEAD(&c->c_imports); INIT_LIST_HEAD(&c->c_exports); INIT_LIST_HEAD(&c->c_sb_chain); @@ -160,7 +162,7 @@ void ptlrpc_cleanup_connection(void) list_for_each_safe(tmp, pos, &conn_list) { c = list_entry(tmp, struct ptlrpc_connection, c_link); CERROR("Connection %p/%s has refcount %d (nid=%lu)\n", - c, c->c_remote_uuid, atomic_read(&c->c_refcount), + c, c->c_remote_uuid.uuid, atomic_read(&c->c_refcount), (unsigned long)c->c_peer.peer_nid); list_del(&c->c_link); OBD_FREE(c, sizeof(*c)); diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c index c260f5d..e7a1e08 100644 --- a/lustre/ptlrpc/events.c +++ b/lustre/ptlrpc/events.c @@ -1,7 +1,7 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (C) 2002 Cluster File Systems, Inc. + * Copyright (c) 2002, 2003 Cluster File Systems, Inc. * * This file is part of Lustre, http://www.lustre.org. 
* @@ -26,8 +26,9 @@ #include #include -ptl_handle_eq_t request_out_eq, reply_in_eq, reply_out_eq, bulk_source_eq, - bulk_sink_eq; +ptl_handle_eq_t request_out_eq, reply_in_eq, reply_out_eq, + bulk_put_source_eq, bulk_put_sink_eq, + bulk_get_source_eq, bulk_get_sink_eq; static const ptl_handle_ni_t *socknal_nip = NULL, *toenal_nip = NULL, *qswnal_nip = NULL, *gmnal_nip = NULL; @@ -149,7 +150,7 @@ int request_in_callback(ptl_event_t *ev) return 0; } -static int bulk_source_callback(ptl_event_t *ev) +static int bulk_put_source_callback(ptl_event_t *ev) { struct ptlrpc_bulk_desc *desc = ev->mem_desc.user_ptr; struct ptlrpc_bulk_page *bulk; @@ -196,7 +197,7 @@ static int bulk_source_callback(ptl_event_t *ev) RETURN(0); } -static int bulk_sink_callback(ptl_event_t *ev) +static int bulk_put_sink_callback(ptl_event_t *ev) { struct ptlrpc_bulk_desc *desc = ev->mem_desc.user_ptr; struct ptlrpc_bulk_page *bulk; @@ -241,6 +242,100 @@ static int bulk_sink_callback(ptl_event_t *ev) RETURN(1); } +static int bulk_get_source_callback(ptl_event_t *ev) +{ + struct ptlrpc_bulk_desc *desc = ev->mem_desc.user_ptr; + struct ptlrpc_bulk_page *bulk; + struct list_head *tmp; + struct list_head *next; + ptl_size_t total = 0; + void (*event_handler)(struct ptlrpc_bulk_desc *); + ENTRY; + + LASSERT(ev->type == PTL_EVENT_GET); + + /* put with zero offset */ + LASSERT(ev->offset == 0); + /* used iovs */ + LASSERT((ev->mem_desc.options & PTL_MD_IOV) != 0); + /* 1 fragment for each page always */ + LASSERT(ev->mem_desc.niov == desc->bd_page_count); + + list_for_each_safe (tmp, next, &desc->bd_page_list) { + bulk = list_entry(tmp, struct ptlrpc_bulk_page, bp_link); + + total += bulk->bp_buflen; + + if (bulk->bp_cb != NULL) + bulk->bp_cb(bulk); + } + + LASSERT(ev->mem_desc.length == total); + + /* We need to make a note of whether there's an event handler + * before we call wake_up, because if there is no event + * handler, 'desc' might be freed before we're scheduled again. 
*/ + event_handler = desc->bd_ptl_ev_hdlr; + + desc->bd_flags |= PTL_BULK_FL_SENT; + wake_up(&desc->bd_waitq); + if (event_handler) { + LASSERT(desc->bd_ptl_ev_hdlr == event_handler); + event_handler(desc); + } + + RETURN(1); +} + + +static int bulk_get_sink_callback(ptl_event_t *ev) +{ + struct ptlrpc_bulk_desc *desc = ev->mem_desc.user_ptr; + struct ptlrpc_bulk_page *bulk; + struct list_head *tmp; + struct list_head *next; + ENTRY; + + CDEBUG(D_NET, "got %s event %d\n", + (ev->type == PTL_EVENT_SENT) ? "SENT" : + (ev->type == PTL_EVENT_REPLY) ? "REPLY" : "UNEXPECTED", + ev->type); + + LASSERT(ev->type == PTL_EVENT_SENT || ev->type == PTL_EVENT_REPLY); + + LASSERT(atomic_read(&desc->bd_source_callback_count) > 0 && + atomic_read(&desc->bd_source_callback_count) <= 2); + + /* 1 fragment for each page always */ + LASSERT(ev->mem_desc.niov == desc->bd_page_count); + + if (atomic_dec_and_test(&desc->bd_source_callback_count)) { + void (*event_handler)(struct ptlrpc_bulk_desc *); + + list_for_each_safe(tmp, next, &desc->bd_page_list) { + bulk = list_entry(tmp, struct ptlrpc_bulk_page, + bp_link); + + if (bulk->bp_cb != NULL) + bulk->bp_cb(bulk); + } + + /* We need to make a note of whether there's an event handler + * before we call wake_up, because if there is no event handler, + * 'desc' might be freed before we're scheduled again. 
*/ + event_handler = desc->bd_ptl_ev_hdlr; + + desc->bd_flags |= PTL_BULK_FL_RCVD; + wake_up(&desc->bd_waitq); + if (event_handler) { + LASSERT(desc->bd_ptl_ev_hdlr == event_handler); + event_handler(desc); + } + } + + RETURN(0); +} + int ptlrpc_init_portals(void) { int rc; @@ -272,11 +367,21 @@ int ptlrpc_init_portals(void) if (rc != PTL_OK) CERROR("PtlEQAlloc failed: %d\n", rc); - rc = PtlEQAlloc(ni, 1024, bulk_source_callback, &bulk_source_eq); + rc = PtlEQAlloc(ni, 1024, bulk_put_source_callback, + &bulk_put_source_eq); + if (rc != PTL_OK) + CERROR("PtlEQAlloc failed: %d\n", rc); + + rc = PtlEQAlloc(ni, 1024, bulk_put_sink_callback, &bulk_put_sink_eq); + if (rc != PTL_OK) + CERROR("PtlEQAlloc failed: %d\n", rc); + + rc = PtlEQAlloc(ni, 1024, bulk_get_source_callback, + &bulk_get_source_eq); if (rc != PTL_OK) CERROR("PtlEQAlloc failed: %d\n", rc); - rc = PtlEQAlloc(ni, 1024, bulk_sink_callback, &bulk_sink_eq); + rc = PtlEQAlloc(ni, 1024, bulk_get_sink_callback, &bulk_get_sink_eq); if (rc != PTL_OK) CERROR("PtlEQAlloc failed: %d\n", rc); @@ -288,8 +393,10 @@ void ptlrpc_exit_portals(void) PtlEQFree(request_out_eq); PtlEQFree(reply_out_eq); PtlEQFree(reply_in_eq); - PtlEQFree(bulk_source_eq); - PtlEQFree(bulk_sink_eq); + PtlEQFree(bulk_put_source_eq); + PtlEQFree(bulk_put_sink_eq); + PtlEQFree(bulk_get_source_eq); + PtlEQFree(bulk_get_sink_eq); if (qswnal_nip != NULL) inter_module_put("kqswnal_ni"); diff --git a/lustre/ptlrpc/lproc_ptlrpc.c b/lustre/ptlrpc/lproc_ptlrpc.c index a778b57..1b3532e 100644 --- a/lustre/ptlrpc/lproc_ptlrpc.c +++ b/lustre/ptlrpc/lproc_ptlrpc.c @@ -21,32 +21,21 @@ */ #define DEBUG_SUBSYSTEM S_CLASS -#include #include -int rd_uuid(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - int len = 0; - len += snprintf(page, count, "%s\n", - ((struct obd_device*)data)->obd_uuid); - return len; -} - -struct lprocfs_vars status_var_nm_1[] = { - {"status/uuid", rd_uuid, 0, 0}, - {0} +#ifndef LPROCFS +struct lprocfs_vars 
lprocfs_obd_vars[] = { {0} }; +struct lprocfs_vars lprocfs_module_vars[] = { {0} }; +#else +struct lprocfs_vars lprocfs_obd_vars[] = { + { "uuid", lprocfs_rd_uuid, 0, 0}, + { 0 } }; -int rd_numrefs(char* page, char **start, off_t off, int count, int *eof, - void *data) -{ - struct obd_type* class = (struct obd_type*)data; - int len = 0; - len += snprintf(page, count, "%d\n", class->typ_refcnt); - return len; -} -struct lprocfs_vars status_class_var[] = { - {"status/num_refs", rd_numrefs, 0, 0}, - {0} +struct lprocfs_vars lprocfs_module_vars[] = { + { "num_refs", lprocfs_rd_numrefs, 0, 0}, + { 0 } }; + +#endif /* LPROCFS */ +LPROCFS_INIT_VARS(lprocfs_module_vars, lprocfs_obd_vars) diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index 1d6284e..ef3a215 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -1,7 +1,7 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (C) 2002 Cluster File Systems, Inc. + * Copyright (c) 2002, 2003 Cluster File Systems, Inc. * * This file is part of Lustre, http://www.lustre.org. 
* @@ -28,7 +28,8 @@ #include extern ptl_handle_eq_t request_out_eq, reply_in_eq, reply_out_eq, - bulk_source_eq, bulk_sink_eq; + bulk_put_source_eq, bulk_put_sink_eq, + bulk_get_source_eq, bulk_get_sink_eq; static int ptl_send_buf(struct ptlrpc_request *request, struct ptlrpc_connection *conn, int portal) @@ -113,7 +114,7 @@ ptlrpc_put_bulk_iov (struct ptlrpc_bulk_desc *desc, struct iovec *iov) OBD_FREE (iov, desc->bd_page_count * sizeof (struct iovec)); } -int ptlrpc_send_bulk(struct ptlrpc_bulk_desc *desc) +int ptlrpc_bulk_put(struct ptlrpc_bulk_desc *desc) { int rc; struct list_head *tmp, *next; @@ -129,7 +130,7 @@ int ptlrpc_send_bulk(struct ptlrpc_bulk_desc *desc) desc->bd_md.start = iov; desc->bd_md.niov = 0; desc->bd_md.length = 0; - desc->bd_md.eventq = bulk_source_eq; + desc->bd_md.eventq = bulk_put_source_eq; desc->bd_md.threshold = 2; /* SENT and ACK */ desc->bd_md.options = PTL_MD_OP_PUT | PTL_MD_IOV; desc->bd_md.user_ptr = desc; @@ -194,7 +195,87 @@ int ptlrpc_send_bulk(struct ptlrpc_bulk_desc *desc) RETURN(0); } -int ptlrpc_register_bulk(struct ptlrpc_bulk_desc *desc) +int ptlrpc_bulk_get(struct ptlrpc_bulk_desc *desc) +{ + int rc; + struct list_head *tmp, *next; + ptl_process_id_t remote_id; + __u32 xid = 0; + struct iovec *iov; + ENTRY; + + iov = ptlrpc_get_bulk_iov (desc); + if (iov == NULL) + RETURN (-ENOMEM); + + desc->bd_md.start = iov; + desc->bd_md.niov = 0; + desc->bd_md.length = 0; + desc->bd_md.eventq = bulk_get_sink_eq; + desc->bd_md.threshold = 2; /* SENT and REPLY */ + desc->bd_md.options = PTL_MD_OP_GET | PTL_MD_IOV; + desc->bd_md.user_ptr = desc; + + atomic_set(&desc->bd_source_callback_count, 2); + + list_for_each_safe(tmp, next, &desc->bd_page_list) { + struct ptlrpc_bulk_page *bulk; + bulk = list_entry(tmp, struct ptlrpc_bulk_page, bp_link); + + LASSERT(desc->bd_md.niov < desc->bd_page_count); + + if (desc->bd_md.niov == 0) + xid = bulk->bp_xid; + LASSERT(xid == bulk->bp_xid); /* should all be the same */ + + 
iov[desc->bd_md.niov].iov_base = bulk->bp_buf; + iov[desc->bd_md.niov].iov_len = bulk->bp_buflen; + if (iov[desc->bd_md.niov].iov_len <= 0) { + CERROR("bad bp_buflen[%d] @ %p: %d\n", desc->bd_md.niov, + bulk->bp_buf, bulk->bp_buflen); + CERROR("desc: xid %u, pages %d, ptl %d, ref %d\n", + xid, desc->bd_page_count, desc->bd_portal, + atomic_read(&desc->bd_refcount)); + LBUG(); + } + desc->bd_md.niov++; + desc->bd_md.length += bulk->bp_buflen; + } + + LASSERT(desc->bd_md.niov == desc->bd_page_count); + LASSERT(desc->bd_md.niov != 0); + + rc = PtlMDBind(desc->bd_connection->c_peer.peer_ni, desc->bd_md, + &desc->bd_md_h); + + ptlrpc_put_bulk_iov (desc, iov); /*move down to reduce latency to send*/ + + if (rc != PTL_OK) { + CERROR("PtlMDBind failed: %d\n", rc); + LBUG(); + RETURN(rc); + } + + remote_id.nid = desc->bd_connection->c_peer.peer_nid; + remote_id.pid = 0; + + CDEBUG(D_NET, "Sending %u pages %u bytes to portal %d nid "LPX64" pid " + "%d xid %d\n", desc->bd_md.niov, desc->bd_md.length, + desc->bd_portal, remote_id.nid, remote_id.pid, xid); + + rc = PtlGet(desc->bd_md_h, remote_id, desc->bd_portal, 0, xid, 0); + if (rc != PTL_OK) { + CERROR("PtlGet("LPU64", %d, %d) failed: %d\n", + remote_id.nid, desc->bd_portal, xid, rc); + PtlMDUnlink(desc->bd_md_h); + LBUG(); + RETURN(rc); + } + + RETURN(0); +} + +static int ptlrpc_register_bulk_shared(struct ptlrpc_bulk_desc *desc) { struct list_head *tmp, *next; int rc; @@ -217,9 +298,7 @@ int ptlrpc_register_bulk(struct ptlrpc_bulk_desc *desc) desc->bd_md.niov = 0; desc->bd_md.length = 0; desc->bd_md.threshold = 1; - desc->bd_md.options = PTL_MD_OP_PUT | PTL_MD_IOV; desc->bd_md.user_ptr = desc; - desc->bd_md.eventq = bulk_sink_eq; list_for_each_safe(tmp, next, &desc->bd_page_list) { struct ptlrpc_bulk_page *bulk; @@ -276,6 +355,22 @@ int ptlrpc_register_bulk(struct ptlrpc_bulk_desc *desc) return rc; } +int ptlrpc_register_bulk_get(struct ptlrpc_bulk_desc *desc) +{ + desc->bd_md.options = PTL_MD_OP_GET | PTL_MD_IOV; + 
desc->bd_md.eventq = bulk_get_source_eq; + + return ptlrpc_register_bulk_shared(desc); +} + +int ptlrpc_register_bulk_put(struct ptlrpc_bulk_desc *desc) +{ + desc->bd_md.options = PTL_MD_OP_PUT | PTL_MD_IOV; + desc->bd_md.eventq = bulk_put_sink_eq; + + return ptlrpc_register_bulk_shared(desc); +} + int ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *desc) { /* This should be safe: these handles are initialized to be @@ -356,14 +451,13 @@ int ptlrpc_error(struct ptlrpc_service *svc, struct ptlrpc_request *req) int rc; ENTRY; - if (req->rq_repmsg) { - CERROR("req already has repmsg\n"); - LBUG(); + if (!req->rq_repmsg) { + rc = lustre_pack_msg(0, NULL, NULL, &req->rq_replen, + &req->rq_repmsg); + if (rc) + RETURN(rc); } - rc = lustre_pack_msg(0, NULL, NULL, &req->rq_replen, &req->rq_repmsg); - if (rc) - RETURN(rc); req->rq_type = PTL_RPC_MSG_ERR; @@ -390,7 +484,7 @@ int ptl_send_rpc(struct ptlrpc_request *request) source_id.pid = PTL_PID_ANY; /* add a ref, which will be balanced in request_out_callback */ - atomic_inc(&request->rq_refcount); + ptlrpc_request_addref(request); if (request->rq_replen != 0) { if (request->rq_reply_md.start != NULL) { rc = PtlMEUnlink(request->rq_reply_me_h); diff --git a/lustre/ptlrpc/pack_generic.c b/lustre/ptlrpc/pack_generic.c index 49d79dc..10e8200 100644 --- a/lustre/ptlrpc/pack_generic.c +++ b/lustre/ptlrpc/pack_generic.c @@ -100,6 +100,9 @@ int lustre_unpack_msg(struct lustre_msg *m, int len) if (len < required_len) { CERROR("len: %d, required_len %d\n", len, required_len); + CERROR("bufcount: %d\n", m->bufcount); + for (i = 0; i < m->bufcount; i++) + CERROR("buffer %d length %d\n", i, m->buflens[i]); RETURN(-EINVAL); } @@ -117,15 +120,15 @@ void *lustre_msg_buf(struct lustre_msg *m, int n) } if (n < 0 || n >= m->bufcount) { - CERROR("referencing bad sub buffer in %p (want %d, count %d)!\n", - m, n, m->bufcount); + CERROR("referencing bad sub buffer in %p (want %d, count " + "%d)!\n", m, n, m->bufcount); LBUG(); return NULL; } if 
(m->buflens[n] == 0) { - CERROR("zero-length buffer requested for buffer %d in %p\n", n, - m); + CERROR("zero-length buffer requested for buffer %d in %p\n", + n, m); return NULL; } diff --git a/lustre/ptlrpc/recovd.c b/lustre/ptlrpc/recovd.c index d544a19..279c903 100644 --- a/lustre/ptlrpc/recovd.c +++ b/lustre/ptlrpc/recovd.c @@ -29,7 +29,8 @@ static void d_c_l(struct list_head *head) struct ptlrpc_connection *conn = list_entry(tmp, struct ptlrpc_connection, c_recovd_data.rd_managed_chain); - CDEBUG(D_HA, " %p = %s (%d/%d)\n", conn, conn->c_remote_uuid, + CDEBUG(D_HA, " %p = %s (%d/%d)\n", conn, + conn->c_remote_uuid.uuid, conn->c_recovd_data.rd_phase, conn->c_recovd_data.rd_next_phase); } @@ -56,13 +57,13 @@ void recovd_conn_manage(struct ptlrpc_connection *conn, if (!list_empty(&rd->rd_managed_chain)) { if (rd->rd_recovd == recovd && rd->rd_recover == recover) { CDEBUG(D_HA, "conn %p/%s already setup for recovery\n", - conn, conn->c_remote_uuid); + conn, conn->c_remote_uuid.uuid); EXIT; return; } CDEBUG(D_HA, "conn %p/%s has recovery items %p/%p, making %p/%p\n", - conn, conn->c_remote_uuid, rd->rd_recovd, rd->rd_recover, + conn, conn->c_remote_uuid.uuid, rd->rd_recovd, rd->rd_recover, recovd, recover); spin_lock(&rd->rd_recovd->recovd_lock); list_del_init(&rd->rd_managed_chain); @@ -115,21 +116,21 @@ void recovd_conn_fail(struct ptlrpc_connection *conn) spin_lock(&recovd->recovd_lock); if (rd->rd_phase == RD_TROUBLED || rd->rd_phase == RD_PREPARING) { CDEBUG(D_HA, "connection %p to %s already in recovery\n", - conn, conn->c_remote_uuid); + conn, conn->c_remote_uuid.uuid); spin_unlock(&recovd->recovd_lock); EXIT; return; } CERROR("connection %p to %s (%08x %08lx %08lx) failed\n", conn, - conn->c_remote_uuid, conn->c_peer.peer_nid, + conn->c_remote_uuid.uuid, conn->c_peer.peer_nid, conn->c_peer.peer_ni.nal_idx, conn->c_peer.peer_ni.handle_idx); list_del(&rd->rd_managed_chain); list_add_tail(&rd->rd_managed_chain, &recovd->recovd_troubled_items); if 
(rd->rd_phase != RD_IDLE) { CDEBUG(D_HA, "connection %p to %s failed in recovery: restarting\n", - conn, conn->c_remote_uuid); + conn, conn->c_remote_uuid.uuid); /* XXX call callback with PHASE_FAILED? */ rd->rd_next_phase = RD_TROUBLED; } @@ -148,7 +149,7 @@ void recovd_conn_fixed(struct ptlrpc_connection *conn) ENTRY; CDEBUG(D_HA, "connection %p (now to %s) fixed\n", - conn, conn->c_remote_uuid); + conn, conn->c_remote_uuid.uuid); spin_lock(&rd->rd_recovd->recovd_lock); list_del(&rd->rd_managed_chain); rd->rd_phase = RD_IDLE; diff --git a/lustre/ptlrpc/recover.c b/lustre/ptlrpc/recover.c index b4f3c85..1c99fed 100644 --- a/lustre/ptlrpc/recover.c +++ b/lustre/ptlrpc/recover.c @@ -3,15 +3,23 @@ * * Portal-RPC reconnection and replay operations, for use in recovery. * - * This code is issued under the GNU General Public License. - * See the file COPYING in this distribution + * Copyright (c) 2002, 2003 Cluster File Systems, Inc. + * Author: Mike Shaver * - * Copyright (C) 1996 Peter J. Braam - * Copyright (C) 1999 Stelias Computing Inc. - * Copyright (C) 1999 Seagate Technology Inc. - * Copyright (C) 2001 Mountain View Data, Inc. - * Copyright (C) 2002 Cluster File Systems, Inc. + * This file is part of Lustre, http://www.lustre.org. * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ #include @@ -30,18 +38,18 @@ int ptlrpc_reconnect_import(struct obd_import *imp, int rq_opc, struct obd_device *obd = imp->imp_obd; struct client_obd *cli = &obd->u.cli; int size[] = { sizeof(cli->cl_target_uuid), sizeof(obd->obd_uuid) }; - char *tmp[] = {cli->cl_target_uuid, obd->obd_uuid }; + char *tmp[] = {cli->cl_target_uuid.uuid, obd->obd_uuid.uuid}; struct ptlrpc_connection *conn = imp->imp_connection; - struct lustre_handle old_hdl; - struct ptlrpc_request *request; + struct ptlrpc_request *req; struct obd_export *ldlmexp; + struct lustre_handle old_hdl; int rc; - request = ptlrpc_prep_req(imp, rq_opc, 2, size, tmp); - if (!request) + req = ptlrpc_prep_req(imp, rq_opc, 2, size, tmp); + if (!req) RETURN(-ENOMEM); - request->rq_level = LUSTRE_CONN_NEW; - request->rq_replen = lustre_msg_size(0, NULL); + req->rq_level = LUSTRE_CONN_NEW; + req->rq_replen = lustre_msg_size(0, NULL); /* * This address is the export that represents our client-side LDLM * service (for ASTs). We should only have one on this list, so we @@ -51,58 +59,59 @@ int ptlrpc_reconnect_import(struct obd_import *imp, int rq_opc, */ ldlmexp = list_entry(obd->obd_exports.next, struct obd_export, exp_obd_chain); - request->rq_reqmsg->addr = (__u64)(unsigned long)ldlmexp; - request->rq_reqmsg->cookie = ldlmexp->exp_cookie; - rc = ptlrpc_queue_wait(request); - switch (rc) { - case EALREADY: - case -EALREADY: - /* already connected! 
*/ + req->rq_reqmsg->addr = (__u64)(unsigned long)ldlmexp; + req->rq_reqmsg->cookie = ldlmexp->exp_cookie; + rc = ptlrpc_queue_wait(req); + if (rc) { + CERROR("cannot connect to %s@%s: rc = %d\n", + cli->cl_target_uuid.uuid, conn->c_remote_uuid.uuid, rc); + GOTO(out_disc, rc); + } + if (lustre_msg_get_op_flags(req->rq_repmsg) & MSG_CONNECT_RECONNECT) { memset(&old_hdl, 0, sizeof(old_hdl)); - if (!memcmp(&old_hdl.addr, &request->rq_repmsg->addr, + if (!memcmp(&old_hdl.addr, &req->rq_repmsg->addr, sizeof (old_hdl.addr)) && - !memcmp(&old_hdl.cookie, &request->rq_repmsg->cookie, + !memcmp(&old_hdl.cookie, &req->rq_repmsg->cookie, sizeof (old_hdl.cookie))) { - CERROR("%s@%s didn't like our handle "LPX64"/"LPX64", failed\n", - cli->cl_target_uuid, conn->c_remote_uuid, + CERROR("%s@%s didn't like our handle "LPX64"/"LPX64 + ", failed\n", cli->cl_target_uuid.uuid, + conn->c_remote_uuid.uuid, (__u64)(unsigned long)ldlmexp, ldlmexp->exp_cookie); GOTO(out_disc, rc = -ENOTCONN); } - old_hdl.addr = request->rq_repmsg->addr; - old_hdl.cookie = request->rq_repmsg->cookie; + old_hdl.addr = req->rq_repmsg->addr; + old_hdl.cookie = req->rq_repmsg->cookie; if (memcmp(&imp->imp_handle, &old_hdl, sizeof(old_hdl))) { - CERROR("%s@%s changed handle from "LPX64"/"LPX64" to "LPX64"/"LPX64"; " + CERROR("%s@%s changed handle from "LPX64"/"LPX64 + " to "LPX64"/"LPX64"; " "copying, but this may foreshadow disaster\n", - cli->cl_target_uuid, conn->c_remote_uuid, + cli->cl_target_uuid.uuid, + conn->c_remote_uuid.uuid, old_hdl.addr, old_hdl.cookie, imp->imp_handle.addr, imp->imp_handle.cookie); - imp->imp_handle.addr = request->rq_repmsg->addr; - imp->imp_handle.cookie = request->rq_repmsg->cookie; - GOTO(out_disc, rc = EALREADY); + imp->imp_handle.addr = req->rq_repmsg->addr; + imp->imp_handle.cookie = req->rq_repmsg->cookie; + GOTO(out_disc, rc = 0); } CERROR("reconnected to %s@%s after partition\n", - cli->cl_target_uuid, conn->c_remote_uuid); - GOTO(out_disc, rc = EALREADY); - case 0: - 
old_hdl = imp->imp_handle; - imp->imp_handle.addr = request->rq_repmsg->addr; - imp->imp_handle.cookie = request->rq_repmsg->cookie; - CERROR("now connected to %s@%s ("LPX64"/"LPX64", was "LPX64"/"LPX64")!\n", - cli->cl_target_uuid, conn->c_remote_uuid, - imp->imp_handle.addr, imp->imp_handle.cookie, - old_hdl.addr, old_hdl.cookie); + cli->cl_target_uuid.uuid, conn->c_remote_uuid.uuid); GOTO(out_disc, rc = 0); - default: - CERROR("cannot connect to %s@%s: rc = %d\n", - cli->cl_target_uuid, conn->c_remote_uuid, rc); - GOTO(out_disc, rc = -ENOTCONN); /* XXX preserve rc? */ } + old_hdl = imp->imp_handle; + imp->imp_handle.addr = req->rq_repmsg->addr; + imp->imp_handle.cookie = req->rq_repmsg->cookie; + CERROR("reconnected to %s@%s ("LPX64"/"LPX64", was "LPX64"/" + LPX64")!\n", cli->cl_target_uuid.uuid, conn->c_remote_uuid.uuid, + imp->imp_handle.addr, imp->imp_handle.cookie, + old_hdl.addr, old_hdl.cookie); + GOTO(out_disc, rc = 0); + out_disc: - *reqptr = request; + *reqptr = req; return rc; } @@ -114,7 +123,7 @@ int ptlrpc_run_recovery_upcall(struct ptlrpc_connection *conn) ENTRY; argv[0] = obd_recovery_upcall; - argv[1] = conn->c_remote_uuid; + argv[1] = conn->c_remote_uuid.uuid; argv[2] = NULL; envp[0] = "HOME=/"; @@ -156,7 +165,7 @@ int ptlrpc_replay(struct obd_import *imp) ptlrpc_free_committed(imp); CDEBUG(D_HA, "import %p from %s has committed "LPD64"\n", - imp, imp->imp_obd->u.cli.cl_target_uuid, committed); + imp, imp->imp_obd->u.cli.cl_target_uuid.uuid, committed); list_for_each(tmp, &imp->imp_replay_list) { req = list_entry(tmp, struct ptlrpc_request, rq_list); diff --git a/lustre/ptlrpc/rpc.c b/lustre/ptlrpc/rpc.c index 200c029..95fe7ec 100644 --- a/lustre/ptlrpc/rpc.c +++ b/lustre/ptlrpc/rpc.c @@ -1,7 +1,7 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (C) 2002 Cluster File Systems, Inc. + * Copyright (c) 2002, 2003 Cluster File Systems, Inc. 
* * This file is part of Lustre, http://www.lustre.org. * @@ -32,13 +32,9 @@ #include #include - - extern int ptlrpc_init_portals(void); extern void ptlrpc_exit_portals(void); -extern struct lprocfs_vars status_var_nm_1[]; -extern struct lprocfs_vars status_class_var[]; int connmgr_setup(struct obd_device *obddev, obd_count len, void *buf) { @@ -83,8 +79,8 @@ int connmgr_iocontrol(unsigned int cmd, struct lustre_handle *hdl, int len, c_recovd_data.rd_managed_chain); LASSERT(conn->c_recovd_data.rd_recovd == recovd); /* sanity */ - - if (!strcmp(conn->c_remote_uuid, data->ioc_inlbuf1)) +#warning check buffer overflow in next line + if (!strcmp(conn->c_remote_uuid.uuid, data->ioc_inlbuf1)) break; conn = NULL; } @@ -99,7 +95,8 @@ int connmgr_iocontrol(unsigned int cmd, struct lustre_handle *hdl, int len, LASSERT(conn->c_recovd_data.rd_recovd == recovd); - if (!strcmp(conn->c_remote_uuid, data->ioc_inlbuf1)) +#warning check buffer overflow in next line + if (!strcmp(conn->c_remote_uuid.uuid, data->ioc_inlbuf1)) break; conn = NULL; } @@ -111,9 +108,6 @@ int connmgr_iocontrol(unsigned int cmd, struct lustre_handle *hdl, int len, spin_unlock(&recovd->recovd_lock); recovd_conn_fail(conn); spin_lock(&recovd->recovd_lock); - - /* Jump straight to the "failed" phase of recovery. 
*/ - conn->c_recovd_data.rd_phase = RD_FAILED; goto out; } @@ -134,13 +128,13 @@ int connmgr_iocontrol(unsigned int cmd, struct lustre_handle *hdl, int len, if (data->ioc_inllen2) { CERROR("conn %p UUID change %s -> %s\n", - conn, conn->c_remote_uuid, data->ioc_inlbuf2); - strcpy(conn->c_remote_uuid, data->ioc_inlbuf2); + conn, conn->c_remote_uuid.uuid, data->ioc_inlbuf2); + obd_str2uuid(&conn->c_remote_uuid, data->ioc_inlbuf2); } else { CERROR("conn %p UUID %s reconnected\n", conn, - conn->c_remote_uuid); + conn->c_remote_uuid.uuid); } - ptlrpc_readdress_connection(conn, conn->c_remote_uuid); + ptlrpc_readdress_connection(conn, &conn->c_remote_uuid); spin_unlock(&conn->c_lock); conn->c_recovd_data.rd_phase = RD_PREPARED; @@ -151,7 +145,7 @@ int connmgr_iocontrol(unsigned int cmd, struct lustre_handle *hdl, int len, } static int connmgr_connect(struct lustre_handle *conn, struct obd_device *src, - obd_uuid_t cluuid, struct recovd_obd *recovd, + struct obd_uuid *cluuid, struct recovd_obd *recovd, ptlrpc_recovery_cb_t recover) { return class_connect(conn, src, cluuid); @@ -159,12 +153,15 @@ static int connmgr_connect(struct lustre_handle *conn, struct obd_device *src, int connmgr_attach(struct obd_device *dev, obd_count len, void *data) { - return lprocfs_reg_obd(dev, status_var_nm_1, dev); + struct lprocfs_static_vars lvars; + + lprocfs_init_vars(&lvars); + return lprocfs_obd_attach(dev, lvars.obd_vars); } int conmgr_detach(struct obd_device *dev) { - return lprocfs_dereg_obd(dev); + return lprocfs_obd_detach(dev); } /* use obd ops to offer management infrastructure */ @@ -181,17 +178,23 @@ static struct obd_ops recovd_obd_ops = { static int __init ptlrpc_init(void) { + struct lprocfs_static_vars lvars; int rc; + ENTRY; + rc = ptlrpc_init_portals(); if (rc) RETURN(rc); ptlrpc_init_connection(); - rc = class_register_type(&recovd_obd_ops, status_class_var, + + lprocfs_init_vars(&lvars); + rc = class_register_type(&recovd_obd_ops, lvars.module_vars, LUSTRE_HA_NAME); 
if (rc) RETURN(rc); ptlrpc_put_connection_superhack = ptlrpc_put_connection; - return 0; + ptlrpc_abort_inflight_superhack = ptlrpc_abort_inflight; + RETURN(0); } static void __exit ptlrpc_exit(void) @@ -218,8 +221,10 @@ EXPORT_SYMBOL(ptlrpc_init_connection); EXPORT_SYMBOL(ptlrpc_cleanup_connection); /* niobuf.c */ -EXPORT_SYMBOL(ptlrpc_send_bulk); -EXPORT_SYMBOL(ptlrpc_register_bulk); +EXPORT_SYMBOL(ptlrpc_bulk_put); +EXPORT_SYMBOL(ptlrpc_bulk_get); +EXPORT_SYMBOL(ptlrpc_register_bulk_put); +EXPORT_SYMBOL(ptlrpc_register_bulk_get); EXPORT_SYMBOL(ptlrpc_abort_bulk); EXPORT_SYMBOL(ptlrpc_reply); EXPORT_SYMBOL(ptlrpc_error); @@ -242,12 +247,14 @@ EXPORT_SYMBOL(ptlrpc_restart_req); EXPORT_SYMBOL(ptlrpc_prep_req); EXPORT_SYMBOL(ptlrpc_free_req); EXPORT_SYMBOL(ptlrpc_req_finished); +EXPORT_SYMBOL(ptlrpc_request_addref); EXPORT_SYMBOL(ptlrpc_prep_bulk); EXPORT_SYMBOL(ptlrpc_free_bulk); EXPORT_SYMBOL(ptlrpc_prep_bulk_page); EXPORT_SYMBOL(ptlrpc_free_bulk_page); EXPORT_SYMBOL(ll_brw_sync_wait); EXPORT_SYMBOL(ptlrpc_abort_inflight); +EXPORT_SYMBOL(ptlrpc_retain_replayable_request); /* service.c */ EXPORT_SYMBOL(ptlrpc_init_svc); @@ -268,8 +275,8 @@ EXPORT_SYMBOL(ptlrpc_replay); EXPORT_SYMBOL(ptlrpc_resend); EXPORT_SYMBOL(ptlrpc_wake_delayed); -MODULE_AUTHOR("Cluster File Systems, Inc "); -MODULE_DESCRIPTION("Lustre Request Processor v1.0"); +MODULE_AUTHOR("Cluster File Systems, Inc. 
"); +MODULE_DESCRIPTION("Lustre Request Processor"); MODULE_LICENSE("GPL"); module_init(ptlrpc_init); diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index c20fc48..0ea29b3 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -65,7 +65,7 @@ struct ptlrpc_service * ptlrpc_init_svc(__u32 nevents, __u32 nbufs, __u32 bufsize, __u32 max_req_size, int req_portal, int rep_portal, - obd_uuid_t uuid, svc_handler_t handler, char *name) + struct obd_uuid *uuid, svc_handler_t handler, char *name) { int err; int rc, i; @@ -91,9 +91,10 @@ ptlrpc_init_svc(__u32 nevents, __u32 nbufs, service->srv_req_portal = req_portal; service->srv_handler = handler; - err = kportal_uuid_to_peer(uuid, &service->srv_self); + err = kportal_uuid_to_peer(uuid->uuid, &service->srv_self); if (err) { - CERROR("%s: cannot get peer for uuid '%s'\n", name, uuid); + CERROR("%s: cannot get peer for uuid '%s'\n", name, + uuid->uuid); OBD_FREE(service, sizeof(*service)); RETURN(NULL); } @@ -165,13 +166,13 @@ static int handle_incoming_request(struct obd_device *obddev, if (request->rq_reqlen < sizeof(struct lustre_msg)) { CERROR("incomplete request (%d): ptl %d from "LPX64" xid " - LPD64"\n", + LPU64"\n", request->rq_reqlen, svc->srv_req_portal, event->initiator.nid, request->rq_xid); goto out; } - CDEBUG(D_RPCTRACE, "Handling RPC pid:xid:nid:opc %d:"LPX64":"LPX64":%d\n", + CDEBUG(D_RPCTRACE, "Handling RPC pid:xid:nid:opc %d:"LPU64":"LPX64":%d\n", NTOH__u32(request->rq_reqmsg->status), request->rq_xid, event->initiator.nid, diff --git a/lustre/scripts/lustre.spec.in b/lustre/scripts/lustre.spec.in index dd1e33c..92cd1c2 100644 --- a/lustre/scripts/lustre.spec.in +++ b/lustre/scripts/lustre.spec.in @@ -4,7 +4,7 @@ %define linuxdir @LINUX@ %define portalsdir @PORTALS@ %define portalslibdir @PORTALSLIB@ -Release: 0208282230chaos +Release: 0301070810ltutor3 Summary: Lustre Lite File System Name: lustre-lite @@ -43,6 +43,14 @@ Group: Documentation %description -n lustre-doc 
Documentation and sample configuration files for Lustre +%package -n lustre-ldap +Summary: Configures openldap server for LDAP Lustre config database +Group: Configuration +Requires: openldap-servers, openldap-clients, python-ldap, 4Suite + +%description -n lustre-ldap +Configures openldap server for LDAP Lustre config database + %prep %setup -qn lustre-%{version} @@ -62,10 +70,17 @@ rm -f lustre-source ln -s $RPM_BUILD_ROOT/usr/src lustre-source make distdir distdir=lustre-source/lustre-%{version} +# ldap database directory +mkdir -p $RPM_BUILD_ROOT/var/lib/ldap/lustre + %files %attr(-, root, root) /usr/sbin/lmc %attr(-, root, root) /usr/sbin/lctl %attr(-, root, root) /usr/sbin/lconf +%attr(-, root, root) /usr/sbin/llanalyze +%attr(-, root, root) /usr/sbin/lfind +%attr(-, root, root) /usr/sbin/lstripe +%attr(-, root, root) /usr/sbin/mcreate %attr(-, root, root) /usr/lib/lustre/examples/llmount.sh %attr(-, root, root) /usr/lib/lustre/examples/llmountcleanup.sh %attr(-, root, root) /usr/lib/lustre/examples/llecho.sh @@ -104,6 +119,14 @@ make distdir distdir=lustre-source/lustre-%{version} %files -n lustre-source %attr(-, root, root) /usr/src/lustre-%{version} +%files -n lustre-ldap +%attr(-, root, root) /etc/openldap/slapd-lustre.conf +%attr(-, root, root) /etc/openldap/schema/lustre.schema +%attr(-, root, root) /usr/lib/lustre/lustre2ldif.xsl +%attr(-, root, root) /usr/lib/lustre/top.ldif +%dir /var/lib/ldap/lustre +%attr(700, ldap, ldap) /var/lib/ldap/lustre + %post if [ ! -e /dev/obd ]; then mknod /dev/obd c 10 241 @@ -122,6 +145,20 @@ grep -q '/dev/lustre' /etc/modules.conf || \ %postun depmod -ae || exit 0 +%post -n lustre-ldap +if ! 
grep -q slapd-lustre /etc/openldap/slapd.conf; then + echo "include /etc/openldap/slapd-lustre.conf" >> /etc/openldap/slapd.conf +fi + +%postun -n lustre-ldap +slapd=/etc/openldap/slapd.conf +if grep -q slapd-lustre $slapd; then + tmp=/tmp/lustre-ldap.$$ + sed "/slapd-lustre/d" $slapd >> $tmp + cp $tmp $slapd + rm $tmp +fi + %clean #rm -rf $RPM_BUILD_ROOT diff --git a/lustre/tests/.cvsignore b/lustre/tests/.cvsignore index b9e1962..239e0fd 100644 --- a/lustre/tests/.cvsignore +++ b/lustre/tests/.cvsignore @@ -31,3 +31,5 @@ setuid multifstat checkstat wantedi +createtest +open_delay diff --git a/lustre/tests/Makefile.am b/lustre/tests/Makefile.am index c7f411e..ee22c80 100644 --- a/lustre/tests/Makefile.am +++ b/lustre/tests/Makefile.am @@ -22,11 +22,12 @@ noinst_SCRIPTS += fs.sh intent-test.sh intent-test2.sh leak_finder.pl \ ostreq.sh runfailure-client-mds-recover.sh runfailure-mds \ runfailure-net runfailure-ost runiozone runregression-net.sh \ runtests runvmstat snaprun.sh tbox.sh common.sh -noinst_PROGRAMS = openunlink testreq truncate directio openme writeme mcreate +noinst_PROGRAMS = openunlink testreq truncate directio openme writeme open_delay noinst_PROGRAMS += munlink tchmod toexcl fsx test_brw openclose createdestroy -noinst_PROGRAMS += stat createmany statmany mkdirmany multifstat +noinst_PROGRAMS += stat createmany statmany mkdirmany multifstat createtest # noinst_PROGRAMS += ldaptest noinst_PROGRAMS += checkstat wantedi +sbin_PROGRAMS = mcreate # ldaptest_SOURCES = ldaptest.c tchmod_SOURCES = tchmod.c @@ -50,5 +51,7 @@ mkdirmany_SOURCES = mkdirmany.c multifstat_SOURCES = multifstat.c checkstat_SOURCES = checkstat.c wantedi_SOURCES = wantedi.c +createtest_SOURCES = createtest.c +open_delay_SOURCES = open_delay.c include $(top_srcdir)/Rules diff --git a/lustre/tests/acceptance-metadata-single.sh b/lustre/tests/acceptance-metadata-single.sh new file mode 100644 index 0000000..501d2be --- /dev/null +++ b/lustre/tests/acceptance-metadata-single.sh @@ -0,0 
+1,130 @@ +#!/bin/sh +set -e + +# +# Runs create.pl and rename.pl on a single mountpoint with increasing +# load, varying debug levels +# + +SRCDIR="`dirname $0`/" +. $SRCDIR/common.sh + +MNT=${MNT:-/mnt/lustre} + +debug_client_on +echo "create.pl, 1 mount, 1 thread, 10 ops, debug on" +perl create.pl -- $MNT -1 10 +echo "create.pl, 1 mount, 1 thread, 100 ops, debug on" +perl create.pl --silent -- $MNT -1 100 +echo "create.pl --mcreate=0, 1 mount, 1 thread, 10 ops, debug on" +perl create.pl --mcreate=0 -- $MNT -1 10 +echo "create.pl --mcreate=0, 1 mount, 1 thread, 100 ops, debug on" +perl create.pl --mcreate=0 --silent -- $MNT -1 100 +echo "rename.pl, 1 mount, 1 thread, 10 ops, debug on" +perl rename.pl $MNT 10 +echo "rename.pl, 1 mount, 1 thread, 100 ops, debug on" +perl rename.pl --silent $MNT 100 + +debug_client_off +echo "create.pl, 1 mount, 1 thread, 1000 ops, debug off" +perl create.pl --silent -- $MNT -1 1000 +echo "create.pl --mcreate=0, 1 mount, 1 thread, 1000 ops, debug off" +perl create.pl --silent --mcreate=0 -- $MNT -1 1000 +echo "rename.pl, 1 mount, 1 thread, 1000 ops, debug off" +perl rename.pl --silent $MNT 1000 + +debug_client_on +echo "create.pl, 1 mount, 2 threads, 100 ops, debug on" +perl create.pl --silent -- $MNT -1 100 & +perl create.pl --silent -- $MNT -1 100 & +wait +echo "create.pl --mcreate=0, 1 mount, 2 threads, 100 ops, debug on" +perl create.pl --silent --mcreate=0 -- $MNT -1 100 & +perl create.pl --silent --mcreate=0 -- $MNT -1 100 & +wait +echo "rename.pl, 1 mount, 2 thread, 1000 ops, debug on" +perl rename.pl --silent $MNT 1000 & +perl rename.pl --silent $MNT 1000 & +wait + +debug_client_off +echo "create.pl, 1 mount, 2 threads, 2000 ops, debug off" +perl create.pl --silent -- $MNT -1 2000 & +perl create.pl --silent -- $MNT -1 2000 & +wait +echo "create.pl --mcreate=0, 1 mount, 2 threads, 2000 ops, debug off" +perl create.pl --silent --mcreate=0 -- $MNT -1 2000 & +perl create.pl --silent --mcreate=0 -- $MNT -1 2000 & +wait +echo 
"rename.pl, 1 mount, 2 threads, 2000 ops, debug off" +perl rename.pl --silent $MNT 2000 & +perl rename.pl --silent $MNT 2000 & +wait + +debug_client_on +echo "create.pl, 1 mount, 4 threads, 100 ops, debug on" +for i in `seq 1 4`; do + perl create.pl --silent -- $MNT -1 100 & +done +wait +echo "create.pl --mcreate=0, 1 mount, 4 threads, 100 ops, debug on" +for i in `seq 1 4`; do + perl create.pl --silent --mcreate=0 -- $MNT -1 100 & +done +wait +echo "rename.pl, 1 mount, 4 threads, 2000 ops, debug on" +for i in `seq 1 4`; do + perl rename.pl --silent $MNT 2000 & +done +wait + +debug_client_off +echo "create.pl, 1 mount, 4 threads, 2000 ops, debug off" +for i in `seq 1 4`; do + perl create.pl --silent -- $MNT -1 2000 & +done +wait +echo "create.pl --mcreate=0, 1 mount, 4 threads, 2000 ops, debug off" +for i in `seq 1 4`; do + perl create.pl --silent --mcreate=0 -- $MNT -1 2000 & +done +wait +echo "rename.pl, 1 mount, 4 threads, 2000 ops, debug off" +for i in `seq 1 4`; do + perl rename.pl --silent $MNT 2000 & +done +wait + +debug_client_on +echo "create.pl, 1 mount, 8 threads, 500 ops, debug on" +for i in `seq 1 8`; do + perl create.pl --silent -- $MNT -1 500 & +done +wait +echo "create.pl --mcreate=0, 1 mount, 8 threads, 500 ops, debug on" +for i in `seq 1 8`; do + perl create.pl --silent --mcreate=0 -- $MNT -1 500 & +done +wait +echo "rename.pl, 1 mount, 8 threads, 2000 ops, debug on" +for i in `seq 1 8`; do + perl rename.pl --silent $MNT 2000 & +done +wait + +debug_client_off +echo "create.pl, 1 mount, 8 threads, 2000 ops, debug off" +for i in `seq 1 8`; do + perl create.pl --silent -- $MNT -1 2000 & +done +wait +echo "create.pl --mcreate=0, 1 mount, 8 threads, 2000 ops, debug off" +for i in `seq 1 8`; do + perl create.pl --silent --mcreate=0 -- $MNT -1 2000 & +done +wait +echo "rename.pl, 1 mount, 8 threads, 2000 ops, debug off" +for i in `seq 1 8`; do + perl rename.pl --silent $MNT 2000 & +done +wait diff --git a/lustre/tests/acceptance-small.sh 
b/lustre/tests/acceptance-small.sh index 5202052..286f417 100755 --- a/lustre/tests/acceptance-small.sh +++ b/lustre/tests/acceptance-small.sh @@ -23,6 +23,7 @@ for NAME in $CONFIGS; do fi [ "$SANITY" != "no" ] && sh sanity.sh + [ "$SANITY" != "no" ] && START=" " CLEAN=" " sh sanity.sh if [ "$DBENCH" != "no" ]; then mount | grep $MNT || sh llmount.sh @@ -88,3 +89,6 @@ for NAME in $CONFIGS; do fi mount | grep $MNT && sh llmountcleanup.sh done + +[ "$SANITYN" != "no" ] && NAME=mount2 sh sanityN.sh + diff --git a/lustre/tests/ba-echo.sh b/lustre/tests/ba-echo.sh index 9f31edc..6dc6124 100644 --- a/lustre/tests/ba-echo.sh +++ b/lustre/tests/ba-echo.sh @@ -32,7 +32,7 @@ ${LMC} --add net --node $OST --tcpbuf $TCPBUF --nid $OST --nettype tcp ${LMC} --add ost --node $OST --obd obd1 --obdtype=obdecho -obduuid $OBD_UUID # osc on client -${LMC} --add oscref --node $CLIENT --echo_client obd1 +${LMC} --add echo_client --node $CLIENT --obd obd1 $LMC_REAL --batch $BATCH rm -f $BATCH diff --git a/lustre/tests/busy.sh b/lustre/tests/busy.sh new file mode 100644 index 0000000..2f90986 --- /dev/null +++ b/lustre/tests/busy.sh @@ -0,0 +1,7 @@ +#!/bin/bash + + mkdir /mnt/lustre/d22 + mkdir /mnt/lustre/d22/etc + ./mcreate /mnt/lustre/d22/etc/foo + ls -ld /mnt/lustre/etc + ls -ld /mnt/lustre/d22/etc diff --git a/lustre/tests/create.pl b/lustre/tests/create.pl index 341d31b..6156869 100644 --- a/lustre/tests/create.pl +++ b/lustre/tests/create.pl @@ -1,16 +1,21 @@ #!/usr/bin/perl use Getopt::Long; -GetOptions("silent!"=> \$silent); +my $silent = 0; +my $mcreate = 1; # should we use mcreate or open? +my $files = 5; + +GetOptions("silent!" => \$silent, + "mcreate=i" => \$mcreate, + "files=i" => \$files); my $mtpt = shift || usage(); my $mount_count = shift || usage(); my $i = shift || usage(); -my $files = 5; -my $mcreate = 0; # should we use mcreate or open? 
+my $count = $i; sub usage () { - print "Usage: $0 \n"; + print "Usage: $0 [--silent] [--mcreate=n] [--files=n] \n"; print "example: $0 /mnt/lustre 2 50\n"; print " will test in /mnt/lustre1 and /mnt/lustre2\n"; print " $0 /mnt/lustre -1 50\n"; @@ -57,5 +62,17 @@ while ($i--) { } else { print "Unlink done [$$] $path: $!\n"if !$silent; } + if (($count - $i) % 100 == 0) { + print STDERR ($count - $i) . " operations [" . $$ . "]\n"; + } } + +my $which = ""; +if ($mount_count > 0) { + $which = int(rand() * $mount_count) + 1; +} +for ($d = 0; $d < $files; $d++) { + unlink("$mtpt$which/$d"); +} + print "Done.\n"; diff --git a/lustre/tests/createmany.c b/lustre/tests/createmany.c index c56eda8..8399824 100644 --- a/lustre/tests/createmany.c +++ b/lustre/tests/createmany.c @@ -8,15 +8,23 @@ #include #include +void usage(char *prog) +{ + printf("usage: %s {-o|-m} filenamefmt count\n", prog); + printf(" %s {-o|-m} filenamefmt -seconds\n", prog); + printf(" %s {-o|-m} filenamefmt start count\n", prog); +} + int main(int argc, char ** argv) { int i, rc = 0, do_open; + char format[4096], *fmt; char filename[4096]; - long int start, last, end, count; + long start, last, end; + long begin = 0, count; - if (argc != 4) { - printf("Usage %s <-o|-m> filenamebase \n", - argv[0]); + if (argc < 4 || argc > 5) { + usage(argv[0]); return 1; } @@ -25,8 +33,7 @@ int main(int argc, char ** argv) } else if (strcmp(argv[1], "-m") == 0) { do_open = 0; } else { - printf("Usage %s {-o|-m} filenamebase \n", - argv[0]); + usage(argv[0]); return 1; } @@ -37,18 +44,29 @@ int main(int argc, char ** argv) start = last = time(0); - end = strtol(argv[3], NULL, 0); - - if (end > 0) { - count = end; - end = -1UL >> 1; - } else { - end = start - end; - count = -1UL >> 1; - } + if (argc == 4) { + end = strtol(argv[3], NULL, 0); + if (end > 0) { + count = end; + end = -1UL >> 1; + } else { + end = start - end; + count = -1UL >> 1; + } + } else { + end = -1UL >> 1; + begin = strtol(argv[3], NULL, 0); + count = 
strtol(argv[4], NULL, 0); + } - for (i = 0; i < count && time(0) < end; i++) { - sprintf(filename, "%s%d", argv[2], i); + if (strchr(argv[2], '%')) + fmt = argv[2]; + else { + sprintf(format, "%s%%d", argv[2]); + fmt = format; + } + for (i = 0; i < count && time(0) < end; i++, begin++) { + sprintf(filename, fmt, begin); if (do_open) { int fd = open(filename, O_CREAT|O_RDWR, 0644); if (fd < 0) { diff --git a/lustre/tests/createtest.c b/lustre/tests/createtest.c new file mode 100644 index 0000000..5404f13 --- /dev/null +++ b/lustre/tests/createtest.c @@ -0,0 +1,142 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef S_SHIFT +#define S_SHIFT 12 +#endif + +int usage(char *prog) +{ + fprintf(stderr, "usage: %s \n", prog); + exit(1); +} + +int main(int argc, char *argv[]) +{ + char name[4096]; + int i; + + if (argc != 2) + usage(argv[0]); + + umask(0); + for (i = 0; i <= S_IFMT; i += (1 << S_SHIFT)) { + struct stat st; + int mode = i | 0644; + int rc; + + sprintf(name, "%s-mknod%06o", argv[1], mode); + rc = mknod(name, mode, 0x1234); + switch (i) { + case 0: + mode |= S_IFREG; + case S_IFREG: + case S_IFCHR: case S_IFBLK: + if (rc < 0 && getuid() != 0) + continue; + case S_IFSOCK: case S_IFIFO: + if (rc < 0) { + fprintf(stderr, "%s: ERROR mknod %s: %s\n", + argv[0], name, strerror(errno)); + exit(10); + } + rc = stat(name, &st); + if (rc < 0) { + fprintf(stderr, "%s: ERROR stat %s: %s", + argv[0], name, strerror(errno)); + exit(11); + } + if (st.st_mode != mode) { + fprintf(stderr, "%s: ERROR mode %s: %o != %o", + argv[0], name, st.st_mode, mode); + exit(12); + } + rc = unlink(name); + if (rc < 0) { + fprintf(stderr, "%s: ERROR unlink %s: %s", + argv[0], name, strerror(errno)); + exit(13); + } + break; + default: + if (rc == 0) { + fprintf(stderr, "%s: ERROR: %s created\n", + argv[0], name); + exit(14); + } + } + } + + for (i = 0; i <= S_IFMT; i += (1 << S_SHIFT)) { + struct stat st; + int mode; + int fd; + int rc; + + mode = i | 
0644; + sprintf(name, "%s-creat%06o", argv[1], mode); + fd = open(name, O_CREAT|O_RDONLY, mode); + if (fd < 0) { + fprintf(stderr, "%s: ERROR creat %s: %s\n", + argv[0], name, strerror(errno)); + exit(21); + } + close(fd); + rc = stat(name, &st); + if (rc < 0) { + fprintf(stderr, "%s: ERROR stat %s: %s", + argv[0], name, strerror(errno)); + exit(11); + } + if ((st.st_mode & S_IFMT) != S_IFREG) { + fprintf(stderr, "%s: ERROR mode %s: %o != %o", + argv[0], name, st.st_mode & S_IFMT, S_IFREG); + exit(12); + } + rc = unlink(name); + if (rc < 0) { + fprintf(stderr, "%s: ERROR unlink %s: %s\n", + argv[0], name, strerror(errno)); + exit(20); + } + } + + for (i = 0; i <= S_IFMT; i += (1 << S_SHIFT)) { + struct stat st; + int rc; + + sprintf(name, "%s-mkdir%06o", argv[1], i | 0644); + rc = mkdir(name, i | 0664); + if (rc < 0) { + fprintf(stderr, "%s: ERROR mkdir %s: %s\n", + argv[0], name, strerror(errno)); + exit(30); + } + rc = stat(name, &st); + if (rc < 0) { + fprintf(stderr, "%s: ERROR stat %s: %s", + argv[0], name, strerror(errno)); + exit(11); + } + if ((st.st_mode & S_IFMT) != S_IFDIR) { + fprintf(stderr, "%s: ERROR mode %s: %o != %o", + argv[0], name, st.st_mode & S_IFMT, S_IFDIR); + exit(12); + } + rc = rmdir(name); + if (rc < 0) { + fprintf(stderr, "%s: ERROR rmdir %s: %s\n", + argv[0], name, strerror(errno)); + exit(31); + } + } + + printf("%s: SUCCESS\n", argv[0]); + return 0; +} diff --git a/lustre/tests/echo.sh b/lustre/tests/echo.sh index f30f056..99e026f 100755 --- a/lustre/tests/echo.sh +++ b/lustre/tests/echo.sh @@ -1,47 +1,49 @@ #!/bin/bash -config=${1:-$(basename $0 .sh).xml} +LOV=${LOV:-0} +while [ "$1" ]; do + case $1 in + --lov) LOV="1" ;; + *) [ -z $config ] && config=$1 || OPTS="$OPTS $1" ;; + esac + shift +done + +config=${config:-$(basename $0 .sh).xml} LMC=${LMC:-../utils/lmc -m $config} +TMP=${TMP:-/tmp} -SERVER=localhost -CLIENT=localhost +SERVER=${SERVER:-localhost} +CLIENT=${CLIENT:-localhost} +NET=${NET:-tcp} # FIXME: make LMC not require 
MDS for obdecho LOV -MDSDEV=$TMP/mds1 +MDSDEV=${MDSDEV:-$TMP/mds1} MDSSIZE=10000 STRIPE_BYTES=65536 STRIPES_PER_OBJ=2 # 0 means stripe over all OSTs -LOV=0 -while [ "$1" ]; do - case $1 in - --lov) LOV="1" ;; - *) OPTS="$OPTS $1" ;; - esac - shift -done - rm -f $config # create nodes $LMC --add node --node $SERVER || exit 1 -$LMC --add net --node $SERVER --nid $SERVER --nettype tcp || exit 2 +$LMC --add net --node $SERVER --nid $SERVER --nettype $NET || exit 2 if (($LOV)); then $LMC --add mds --node $SERVER --mds mds1 --dev $MDSDEV --size $MDSSIZE || exit 10 $LMC --add lov --lov lov1 --mds mds1 --stripe_sz $STRIPE_BYTES --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0 || exit 11 - $LMC --add ost --node $SERVER --lov lov1 --obdtype=obdecho || exit 12 - $LMC --add ost --node $SERVER --lov lov1 --obdtype=obdecho || exit 13 + $LMC --add ost --node $SERVER --lov lov1 --osdtype=obdecho || exit 12 + $LMC --add ost --node $SERVER --lov lov1 --osdtype=obdecho || exit 13 OBD_NAME=lov1 else - $LMC --add ost --obd obd1 --node $SERVER --obdtype=obdecho || exit 2 + $LMC --add ost --ost obd1 --node $SERVER --osdtype=obdecho || exit 12 OBD_NAME=obd1 fi if [ "$SERVER" != "$CLIENT" ]; then $LMC --add node --node $CLIENT || exit 1 - $LMC --add net --node $CLIENT --nid $CLIENT --nettype tcp || exit 2 + $LMC --add net --node $CLIENT --nid $CLIENT --nettype $NET || exit 2 fi -$LMC --add echo_client --node $CLIENT --obd ${OBD_NAME} || exit 3 +$LMC --add echo_client --node $CLIENT --ost ${OBD_NAME} || exit 3 diff --git a/lustre/tests/leak_finder.pl b/lustre/tests/leak_finder.pl index fbf1d00..b8d234b 100644 --- a/lustre/tests/leak_finder.pl +++ b/lustre/tests/leak_finder.pl @@ -19,6 +19,12 @@ while ($line = <>) { $name = $6; $size = $7; $addr = $8; + + # we can't dump the log after portals has exited, so skip "leaks" + # from memory freed in the portals module unloading. 
+ if ($func eq 'portals_handle_init') { + next; + } printf("%8s %6d bytes at %s called %s (%s:%s:%d)\n", $type, $size, $addr, $name, $file, $func, $lno); } else { diff --git a/lustre/tests/lkcdmap b/lustre/tests/lkcdmap new file mode 100755 index 0000000..f8a1fd5 --- /dev/null +++ b/lustre/tests/lkcdmap @@ -0,0 +1,11 @@ +#!/bin/sh +TMP=${TMP:-/tmp} +cat /tmp/ogdb-`hostname` | while read JUNK M JUNK; do + MOD="../$M" + MAP=`echo $MOD | sed -e 's/\.o$/.map/'` + MODNAME=`basename $MOD | sed -e 's/\.o$//'` + + nm $MOD > $MAP + echo namelist -a $PWD/$MOD + echo symtab -a $PWD/$MAP $MODNAME +done diff --git a/lustre/tests/llmount.sh b/lustre/tests/llmount.sh index bc30630..1e2bd6a 100755 --- a/lustre/tests/llmount.sh +++ b/lustre/tests/llmount.sh @@ -7,6 +7,12 @@ NAME=${NAME:-local} config=$NAME.xml mkconfig=$NAME.sh +if [ "$PORTALS" ]; then + portals_opt="--portals=$PORTALS" +fi + +[ -x $LCONF ] || chmod a+rx $LCONF + sh $mkconfig $config || exit 1 -${LCONF} --reformat --gdb $config || exit 2 +${LCONF} $portals_opt --reformat --gdb $config || exit 2 diff --git a/lustre/tests/llmount2-hack.sh b/lustre/tests/llmount2-hack.sh deleted file mode 100644 index 495626c..0000000 --- a/lustre/tests/llmount2-hack.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/sh -# suggested boilerplate for test script - -LCONF=${LCONF:-../utils/lconf} -NAME=${NAME:-local2-hack} - -config=$NAME.xml - -${LCONF} --reformat --gdb $config || exit 2 - -../utils/lctl <&2 mv $TMP/debug $TMP/debug-busy.`date +%s` - exit -1 + exit 255 fi LEAK_LUSTRE=`dmesg | tail -20 | grep -v "leaked: 0" | grep leaked` LEAK_PORTALS=`dmesg | tail -20 | grep "Portals memory leaked"` @@ -25,5 +30,7 @@ if [ "$LEAK_LUSTRE" -o "$LEAK_PORTALS" ]; then echo "$LEAK_LUSTRE" 1>&2 echo "$LEAK_PORTALS" 1>&2 mv $TMP/debug $TMP/debug-leak.`date +%s` - exit -2 + exit 254 fi + +exit $rc diff --git a/lustre/tests/llmountcleanup2-hack.sh b/lustre/tests/llmountcleanup2-hack.sh deleted file mode 100644 index b2996cf..0000000 --- 
a/lustre/tests/llmountcleanup2-hack.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/sh - -umount /mnt/lustre2 -umount /mnt/lustre1 -../utils/lctl < - - - - - - - - - - - - - localhost - 988 - - - - extN - /tmp/mds1 - yes - - - - - extN - /tmp/ost1 - yes - - - - - - - - - - - - - /mnt/lustre1 - - diff --git a/lustre/tests/mkdirmany.c b/lustre/tests/mkdirmany.c index f90327a..26c3016 100755 --- a/lustre/tests/mkdirmany.c +++ b/lustre/tests/mkdirmany.c @@ -27,7 +27,7 @@ int main(int argc, char ** argv) for (i=0 ; i < count ; i++) { sprintf(dirname, "%s-%d", argv[1], i); - rc = mkdir(dirname, S_IFREG| 0444); + rc = mkdir(dirname, 0444); if (rc) { printf("mkdir(%s) error: %s\n", dirname, strerror(errno)); diff --git a/lustre/tests/mount2.sh b/lustre/tests/mount2.sh index 6ae6e70..f1c00b4 100644 --- a/lustre/tests/mount2.sh +++ b/lustre/tests/mount2.sh @@ -2,14 +2,14 @@ config=${1:-mount2.xml} -LMC=${LMC:-../utils/lmc} +LMC="${LMC:-../utils/lmc} -m $config" TMP=${TMP:-/tmp} -MDSDEV=$TMP/mds1 -MDSSIZE=50000 +MDSDEV=${MDSDEV:-$TMP/mds1} +MDSSIZE=${MDSSIZE:-50000} -OSTDEV=$TMP/ost1 -OSTSIZE=100000 +OSTDEV=${OSTDEV:-$TMP/ost1} +OSTSIZE=${OSTSIZE:-200000} kver=`uname -r | cut -d "." 
-f 1,2` @@ -21,15 +21,19 @@ case $kver in ;; esac + +rm -f $config + # create nodes -${LMC} -o $config --add net --node localhost --nid localhost --nettype tcp || exit 1 +${LMC} --add node --node localhost || exit 10 +${LMC} --add net --node localhost --nid localhost --nettype tcp || exit 11 # configure mds server -${LMC} -m $config --add mds --format --node localhost $FSTYPE --mds mds1 --dev $MDSDEV --size $MDSSIZE || exit 2 +${LMC} --add mds --node localhost --mds mds1 --dev $MDSDEV --size $MDSSIZE || exit 20 # configure ost -${LMC} -m $config --add ost --format --obd obd1 --node localhost $FSTYPE --dev $OSTDEV --size $OSTSIZE || exit 3 +${LMC} --add ost --node localhost --obd obd1 --dev $OSTDEV --size $OSTSIZE || exit 30 # create client config -${LMC} -m $config --add mtpt --node localhost --path /mnt/lustre1 --mds mds1 --obd obd1 || exit 4 -${LMC} -m $config --add mtpt --node localhost --path /mnt/lustre2 --mds mds1 --obd obd1 || exit 4 +${LMC} --add mtpt --node localhost --path /mnt/lustre1 --mds mds1 --obd obd1 || exit 40 +${LMC} --add mtpt --node localhost --path /mnt/lustre2 --mds mds1 --obd obd1 || exit 40 diff --git a/lustre/tests/open_delay.c b/lustre/tests/open_delay.c new file mode 100644 index 0000000..2f418846 --- /dev/null +++ b/lustre/tests/open_delay.c @@ -0,0 +1,25 @@ +#include +#include +#include +#include +#include +#include +#include + +int main(int argc, char **argv) +{ + int fd; + + if (argc != 2) { + printf("Usage %s \n", argv[0]); + exit(1); + } + + fd = open(argv[1], O_RDONLY | O_LOV_DELAY_CREATE); + if (fd == -1) { + printf("Error opening %s\n", argv[1]); + exit(1); + } + + return 0; +} diff --git a/lustre/tests/openunlink.c b/lustre/tests/openunlink.c index 3d5904d..e7671c8 100644 --- a/lustre/tests/openunlink.c +++ b/lustre/tests/openunlink.c @@ -12,17 +12,22 @@ char buf[128]; int main(int argc, char **argv) { + char *fname, *fname2; int fd, rc; - if (argc != 2) { - fprintf(stderr, "usage: %s filename\n", argv[0]); + if (argc < 2 || 
argc > 3) { + fprintf(stderr, "usage: %s filename [filename2]\n", argv[0]); exit(1); - } else { - fprintf(stderr, "congratulations - program starting\n"); } + fname = argv[1]; + if (argc == 3) + fname2 = argv[2]; + else + fname2 = argv[1]; + fprintf(stderr, "opening\n"); - fd = open(argv[1], O_RDWR | O_TRUNC | O_CREAT, 0644); + fd = open(fname, O_RDWR | O_TRUNC | O_CREAT, 0644); if (fd == -1) { fprintf(stderr, "open (normal) %s\n", strerror(errno)); exit(1); @@ -35,31 +40,41 @@ int main(int argc, char **argv) exit(1); } - fprintf(stderr, "closing\n"); - rc = close(fd); - if (rc) { - fprintf(stderr, "close (normal) %s\n", strerror(errno)); - exit(1); - } - - fprintf(stderr, "opening again\n"); - fd = open(argv[1], O_RDWR); - if (fd == -1) { - fprintf(stderr, "open (unlink) %s\n", strerror(errno)); - exit(1); - } - -#if 0 - fprintf(stderr, "unlinking\n"); - rc = unlink(argv[1]); - if (rc) { - fprintf(stderr, "unlink %s\n", strerror(errno)); - exit(1); - } -#else - printf("unlink %s and press enter\n", argv[1]); - getc(stdin); -#endif + if (argc == 3) { + fprintf(stderr, "closing %s\n", fname); + rc = close(fd); + if (rc) { + fprintf(stderr, "close (normal) %s\n", strerror(errno)); + exit(1); + } + + fprintf(stderr, "opening %s\n", fname2); + fd = open(fname2, O_RDWR); + if (fd == -1) { + fprintf(stderr, "open (unlink) %s\n", strerror(errno)); + exit(1); + } + + fprintf (stderr, "unlinking %s\n", fname2); + rc = unlink(fname2); + if (rc) { + fprintf(stderr, "unlink %s\n", strerror(errno)); + exit(1); + } + + if (access(fname2, F_OK) == 0) { + fprintf(stderr, "%s still exists\n", fname2); + exit(1); + } + } else { + printf("unlink %s and press enter\n", fname); + getc(stdin); + } + + if (access(fname, F_OK) == 0) { + fprintf(stderr, "%s still exists\n", fname); + exit(1); + } fprintf(stderr, "reading\n"); rc = read(fd, buf, strlen(T1) + 1); diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh new file mode 100755 index 0000000..26bb81f --- 
/dev/null +++ b/lustre/tests/recovery-small.sh @@ -0,0 +1,124 @@ +#!/bin/sh + +set -ex + +LUSTRE=${LUSTRE:-`dirname $0`/..} +PATH=$PATH:$LUSTRE/utils:$LUSTRE/tests + +. $LUSTRE/../ltest/functional/llite/common/common.sh + +PDSH='pdsh -S -w' + +# XXX I wish all this stuff was in some default-config.sh somewhere +MDSNODE=${MDSNODE:-dev2} +OSTNODE=${OSTNODE:-dev3} +CLIENT=${CLIENTNODE:-dev4} +NETWORKTYPE=${NETWORKTYPE:-tcp} +MOUNTPT=${MOUNTPT:-/mnt/lustre} +CONFIG=recovery-small.xml +MDSDEV=/tmp/mds +OSTDEV=/tmp/ost +MDSSIZE=100000 +OSTSIZE=100000 + +do_mds() { + $PDSH $MDSNODE "PATH=\$PATH:$LUSTRE/utils:$LUSTRE/tests; cd $PWD; $@" +} + +do_client() { + $PDSH $CLIENT "PATH=\$PATH:$LUSTRE/utils:$LUSTRE/tests; cd $PWD; $@" +} + +do_ost() { + $PDSH $OSTNODE "PATH=\$PATH:$LUSTRE/utils:$LUSTRE/tests; cd $PWD; $@" +} + +drop_request() { + do_mds "echo 0x121 > /proc/sys/lustre/fail_loc" + do_client "$1" + do_mds "echo 0 > /proc/sys/lustre/fail_loc" +} + +drop_reply() { + do_mds "echo 0x120 > /proc/sys/lustre/fail_loc" + do_client "$@" + do_mds "echo 0 > /proc/sys/lustre/fail_loc" +} + +make_config() { + rm -f $CONFIG + for NODE in $CLIENT $MDSNODE $OSTNODE; do + lmc -m $CONFIG --add net --node $NODE --nid `h2$NETWORKTYPE $NODE` \ + --nettype $NETWORKTYPE || exit 4 + done + lmc -m $CONFIG --add mds --node $MDSNODE --mds mds1 --dev $MDSDEV \ + --size $MDSSIZE || exit 5 + lmc -m $CONFIG --add ost --node $OSTNODE --ost ost1 --dev $OSTDEV \ + --size $OSTSIZE || exit 6 + lmc -m $CONFIG --add mtpt --node $CLIENT --path $MOUNTPT --mds mds1 \ + --ost ost1 || exit 7 +} + +start_mds() { + do_mds "lconf $@ $CONFIG" +} + +shutdown_mds() { + do_mds "lconf $@ --cleanup $CONFIG" +} + +start_ost() { + do_ost "lconf $@ $CONFIG" +} + +shutdown_ost() { + do_ost "lconf $@ --cleanup $CONFIG" +} + +mount_client() { + do_client "lconf $@ $CONFIG" +} + +unmount_client() { + do_client "lconf $@ --cleanup $CONFIG" +} + +setup() { + make_config + start_mds --reformat + start_ost --reformat + # XXX we 
should write our own upcall, when we move this somewhere better. + mount_client --timeout=10 \ + --recovery_upcall=$PWD/../../ltest/functional/llite/09/client-upcall.sh +} + +cleanup() { + unmount_client || true + shutdown_mds || true + shutdown_ost || true +} + +replay() { + if [ $# -gt 1 ]; then + do_client "$1" + shift + fi + do_mds "sync" + do_mds 'echo -e "device \$mds1\\nprobe\\nnotransno\\nreadonly" | lctl' + do_client "$1" & + shutdown_mds -f + start_mds + wait + do_client "ls $MOUNPT" # trigger failover, if we haven't already +} + +if [ ! -z "$ONLY" ]; then + eval "$ONLY" + exit $? +fi + +setup +drop_request "mcreate /mnt/lustre/1" +drop_reply "mcreate /mnt/lustre/2" +replay "mcreate /mnt/lustre/3" +cleanup diff --git a/lustre/tests/rename.pl b/lustre/tests/rename.pl new file mode 100644 index 0000000..3ba9368 --- /dev/null +++ b/lustre/tests/rename.pl @@ -0,0 +1,78 @@ +#!/usr/bin/perl +use strict; +use diagnostics; +use Getopt::Long; + +sub usage () { + print "Usage: $0 \n"; + print "example: $0 --count=2 /mnt/lustre 50\n"; + print " will test in /mnt/lustre1 and /mnt/lustre2\n"; + print " $0 --count=0 /mnt/lustre 50\n"; + print " will test in /mnt/lustre only\n"; + exit; +} +my ($j, $k, $d, $f1, $f2, $path, $silent); +my $count = 0; +my $create = 10; + +GetOptions("silent!"=> \$silent, + "count=i" => \$count, + "create=i" => \$create); + +my $mtpt = shift || usage(); +my $i = shift || usage(); +my $total = $i; +my $files = 6; +my $dirs = 3; +my $mcreate = 0; # should we use mcreate or open? 
+ +my $which = ""; +if ($count > 0) { + $which = int(rand() * $count) + 1; +} + +$k = $dirs; +if ($create == 0) { + $k = 0; +} +while ($k--) { + $path = "$mtpt$which/$k"; + my $rc = mkdir $path, 0755; + print "mkdir $path failed: $!\n" if !$rc; + $j = $files; + while ($j--) { + `./mcreate $path/$j`; + } +} + +while ($i--) { + my $which = ""; + if ($count > 0) { + $which = int(rand() * $count) + 1; + } + $d = int(rand() * $dirs); + $f1 = int(rand() * $files); + $f2 = int(rand() * $files); + print "[$$] $mtpt$which/$d/$f1 $mtpt$which/$d/$f2 ...\n" if !$silent; + my $rc = rename "$mtpt$which/$d/$f1", "$mtpt$which/$d/$f2"; + print "[$$] done: $rc\n" if !$silent; + if (($total - $i) % 100 == 0) { + print STDERR "[" . $$ . "]" . ($total - $i) . " operations\n"; + } +} + +$k = $dirs; +if ($create == 0) { + $k = 0; +} +while ($k--) { + $path = "$mtpt$which/$k"; + $j = $files; + while ($j--) { + unlink "$path/$j"; + } + my $rc = rmdir $path; + print "rmdir $path failed: $!\n" if !$rc; +} + +print "Done.\n"; diff --git a/lustre/tests/runiozone b/lustre/tests/runiozone index cf198ad..4fc00b2 100755 --- a/lustre/tests/runiozone +++ b/lustre/tests/runiozone @@ -4,7 +4,7 @@ [ -z "$VERIFY" ] && VERIFY="-+d" [ -z "$ODIR" ] && ODIR="-I" [ -z "$REC" ] && REC=64 -[ -z "$FILE" ] && FILE=/mnt/lustre/test.$$ +[ -z "$FILE" ] && FILE=/mnt/lustre/iozone.$$ [ $1 ] && SIZE=$1 COUNT=0 rm -f endiozone diff --git a/lustre/tests/runregression-brw.sh b/lustre/tests/runregression-brw.sh index 702bd1f..4d86248 100644 --- a/lustre/tests/runregression-brw.sh +++ b/lustre/tests/runregression-brw.sh @@ -1,16 +1,15 @@ #!/bin/sh -export PATH=/sbin:/usr/sbin:$PATH - SRCDIR="`dirname $0`/" -. 
$SRCDIR/common.sh +export PATH=/sbin:/usr/sbin:$SRCDIR:$PATH +LOOPS=${LOOPS:-1} COUNT=${COUNT:-1000000} COUNT_10=`expr $COUNT / 10` COUNT_100=`expr $COUNT / 100` ENDRUN=endrun-`hostname` -ECHONAME="`$OBDCTL device_list 2> /dev/null | awk '/ echo_client / { print $4 }' | tail -1`" +ECHONAME="`lctl device_list 2> /dev/null | awk '/ echo_client / { print $4 }' | tail -1`" if [ -z "$ECHONAME" ]; then echo "$0: needs an ECHO_CLIENT set up first" 1>&2 @@ -18,7 +17,7 @@ if [ -z "$ECHONAME" ]; then fi cleanup () { - $OBDCTL --device \$$ECHONAME destroy $OID + lctl --device \$$ECHONAME destroy $OID } runthreads() { @@ -42,7 +41,7 @@ runthreads() { ;; esac - $OBDCTL --threads $THR v \$$ECHONAME $DO $CNT $RW $V $PGS $OID || exit 1 + lctl --threads $THR v \$$ECHONAME $DO $CNT $RW $V $PGS $OID || exit 1 if [ -e $ENDRUN ]; then rm $ENDRUN @@ -51,15 +50,15 @@ runthreads() { fi } -[ -z "$OID" ] && OID=`$OBDCTL --device \\$$ECHONAME create 1 | awk '/is object id/ { print $6 }'` +[ -z "$OID" ] && OID=`lctl --device \\$$ECHONAME create 1 | awk '/is object id/ { print $6 }'` && echo "created object $OID" [ -z "$OID" ] && echo "error creating object" 1>&2 && exit 1 # TODO: obdctl needs to check on the progress of each forked thread # (IPC SHM, sockets?) to see if it hangs. -while date; do +for i in `seq $LOOPS`; do PG=1 - PGVW=16 - PGVR=16 + PGVW=${PGVW:-16} + PGVR=${PGVR:-16} # We use '--threads 1 X' instead of '--device X' so that # obdctl can monitor the forked thread for progress (TODO). diff --git a/lustre/tests/runregression-net.sh b/lustre/tests/runregression-net.sh index 288f847..6de9a6c 100644 --- a/lustre/tests/runregression-net.sh +++ b/lustre/tests/runregression-net.sh @@ -1,8 +1,6 @@ #!/bin/sh -export PATH=/sbin:/usr/sbin:$PATH - SRCDIR="`dirname $0`/" -. 
$SRCDIR/common.sh +export PATH=/sbin:/usr/sbin:$SRCDIR/../utils:$PATH COUNT=${COUNT:-1000000} COUNT_10=`expr $COUNT / 10` @@ -11,13 +9,17 @@ COUNT_1000=`expr $COUNT / 1000` ENDRUN=endrun-`hostname` -ECHONAME="`$OBDCTL device_list 2> /dev/null | awk '/ echo_client / { print $4 }' | tail -1`" +ECHONAME="`lctl device_list 2> /dev/null | awk '/ echo_client / { print $4 }' | tail -1`" if [ -z "$ECHONAME" ]; then echo "$0: needs an ECHO_CLIENT set up first" 1>&2 exit 1 fi +cleanup () { + lctl --device \$$ECHONAME destroy $OID +} + runthreads() { THR=$1 DO=$2 @@ -29,28 +31,26 @@ runthreads() { test_getattr) RW= ;; - test_brw_write) DO=test_brw RW=w ;; - test_brw_read) DO=test_brw RW=r ;; esac - $OBDCTL --threads $THR v \$$ECHONAME $DO $CNT $RW $V $PGS $OID || exit 1 + lctl --threads $THR v \$$ECHONAME $DO $CNT $RW $V $PGS $OID || exit 1 - if [ -e endrun ]; then - rm endrun - echo "exiting because endrun file was found" - exit 0 + if [ -e $ENDRUN ]; then + rm $ENDRUN + echo "exiting because $ENDRUN file was found" + cleanup fi } -[ -z "$OID" ] && OID=`$OBDCTL --device \\$$ECHONAME create 1 | awk '/is object id/ { print $6 }'` +[ -z "$OID" ] && OID=`lctl --device \\$$ECHONAME create 1 | awk '/is object id/ { print $6 }'` && echo "created object $OID" [ -z "$OID" ] && echo "error creating object" 1>&2 && exit 1 # TODO: obdctl needs to check on the progress of each forked thread @@ -63,11 +63,11 @@ for CMD in test_getattr test_brw_write test_brw_read; do ;; test_brw_write) PG=1 - PGV=16 + PGV=${PGV:-16} ;; test_brw_read) PG=1 - PGV=16 + PGV=${PGV:-16} ;; esac @@ -76,8 +76,7 @@ for CMD in test_getattr test_brw_write test_brw_read; do runthreads 1 $CMD 1 1 $PG runthreads 1 $CMD 100 1 $PG - debug_server_off - debug_client_off + echo 0 > /proc/sys/portals/debug runthreads 1 $CMD $COUNT_100 -10 $PG [ "$PGV" ] && runthreads 1 $CMD $COUNT_1000 -10 $PGV @@ -97,4 +96,4 @@ for CMD in test_getattr test_brw_write test_brw_read; do [ "$PGV" ] && runthreads 100 $CMD $COUNT_1000 -30 $PGV 
done -$OBDCTL --device \$$ECHONAME destroy $OID +lctl --device \$$ECHONAME destroy $OID diff --git a/lustre/tests/runtests b/lustre/tests/runtests index e068a01..05bf71f 100755 --- a/lustre/tests/runtests +++ b/lustre/tests/runtests @@ -104,6 +104,12 @@ rm $HOSTS || fail "can't remove $HOSTS again" 36 echo "removing $DST" rm -r $V $DST || fail "can't remove $DST" 37 +# mkdirmany test (bug 589) +echo "running mkdirmany $OSCMT/base$$ 100" +./mkdirmany $OSCMT/base$$ 100 || fail "mkdirmany failed" +echo "removing mkdirmany directories" +rmdir $OSCMT/base$$* || fail "mkdirmany cleanup failed" + NOWUSED=`df | awk "/$OSCTMP/ { print \\$3 }" | tail -1` if [ $NOWUSED -gt $USED ]; then echo "Space not all freed: now ${NOWUSED}kB, was ${USED}kB." 1>&2 diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 73117b6..111606a 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -2,18 +2,28 @@ set -e +SRCDIR=`dirname $0` +PATH=$SRCDIR:$SRCDIR/../utils:$PATH + CHECKSTAT=${CHECKSTAT:-"./checkstat -v"} +CREATETEST=${CREATETEST:-createtest} +LFIND=${LFIND:-lfind} +LSTRIPE=${LSTRIPE:-lstripe} +MCREATE=${MCREATE:-mcreate} +TOEXCL=${TOEXCL:-toexcl} + MOUNT=${MOUNT:-/mnt/lustre} +DIR=${DIR:-$MOUNT} export NAME=$NAME clean() { echo -n "cln.." - sh llmountcleanup.sh > /dev/null + sh llmountcleanup.sh > /dev/null || exit 20 } CLEAN=${CLEAN:-clean} start() { echo -n "mnt.." 
- sh llrmount.sh > /dev/null - echo -n "done" + sh llrmount.sh > /dev/null || exit 10 + echo "done" } START=${START:-start} @@ -26,270 +36,279 @@ pass() { echo PASS } -mount | grep $MOUNT || $START +mount | grep $MOUNT || sh llmount.sh echo '== touch .../f ; rm .../f ======================== test 0' -touch $MOUNT/f -$CHECKSTAT -t file $MOUNT/f || error -rm $MOUNT/f -$CHECKSTAT -a $MOUNT/f || error +touch $DIR/f +$CHECKSTAT -t file $DIR/f || error +rm $DIR/f +$CHECKSTAT -a $DIR/f || error pass $CLEAN $START echo '== mkdir .../d1; mkdir .../d1/d2 ================= test 1' -mkdir $MOUNT/d1 -mkdir $MOUNT/d1/d2 -$CHECKSTAT -t dir $MOUNT/d1/d2 || error +mkdir $DIR/d1 +mkdir $DIR/d1/d2 +$CHECKSTAT -t dir $DIR/d1/d2 || error pass $CLEAN $START echo '== rmdir .../d1/d2; rmdir .../d1 ================= test 1b' -rmdir $MOUNT/d1/d2 -rmdir $MOUNT/d1 -$CHECKSTAT -a $MOUNT/d1 || error +rmdir $DIR/d1/d2 +rmdir $DIR/d1 +$CHECKSTAT -a $DIR/d1 || error pass $CLEAN $START echo '== mkdir .../d2; touch .../d2/f ================== test 2' -mkdir $MOUNT/d2 -touch $MOUNT/d2/f -$CHECKSTAT -t file $MOUNT/d2/f || error +mkdir $DIR/d2 +touch $DIR/d2/f +$CHECKSTAT -t file $DIR/d2/f || error pass $CLEAN $START echo '== rm -r .../d2; touch .../d2/f ================== test 2b' -rm -r $MOUNT/d2 -$CHECKSTAT -a $MOUNT/d2 || error +rm -r $DIR/d2 +$CHECKSTAT -a $DIR/d2 || error pass $CLEAN $START echo '== mkdir .../d3 ================================== test 3' -mkdir $MOUNT/d3 -$CHECKSTAT -t dir $MOUNT/d3 || error +mkdir $DIR/d3 +$CHECKSTAT -t dir $DIR/d3 || error pass $CLEAN $START echo '== touch .../d3/f ================================ test 3b' -touch $MOUNT/d3/f -$CHECKSTAT -t file $MOUNT/d3/f || error +touch $DIR/d3/f +$CHECKSTAT -t file $DIR/d3/f || error pass $CLEAN $START echo '== rm -r .../d3 ================================== test 3c' -rm -r $MOUNT/d3 -$CHECKSTAT -a $MOUNT/d3 || error +rm -r $DIR/d3 +$CHECKSTAT -a $DIR/d3 || error pass $CLEAN $START echo '== mkdir .../d4 
================================== test 4' -mkdir $MOUNT/d4 -$CHECKSTAT -t dir $MOUNT/d4 || error +mkdir $DIR/d4 +$CHECKSTAT -t dir $DIR/d4 || error pass $CLEAN $START echo '== mkdir .../d4/d2 =============================== test 4b' -mkdir $MOUNT/d4/d2 -$CHECKSTAT -t dir $MOUNT/d4/d2 || error +mkdir $DIR/d4/d2 +$CHECKSTAT -t dir $DIR/d4/d2 || error pass $CLEAN $START echo '== mkdir .../d5; mkdir .../d5/d2; chmod .../d5/d2 = test 5' -mkdir $MOUNT/d5 -mkdir $MOUNT/d5/d2 -chmod 0666 $MOUNT/d5/d2 -$CHECKSTAT -t dir -p 0666 $MOUNT/d5/d2 || error +mkdir $DIR/d5 +mkdir $DIR/d5/d2 +chmod 0707 $DIR/d5/d2 +$CHECKSTAT -t dir -p 0707 $DIR/d5/d2 || error pass $CLEAN $START echo '== touch .../f6; chmod .../f6 ==================== test 6' -touch $MOUNT/f6 -chmod 0666 $MOUNT/f6 -$CHECKSTAT -t file -p 0666 $MOUNT/f6 || error +touch $DIR/f6 +chmod 0666 $DIR/f6 +$CHECKSTAT -t file -p 0666 $DIR/f6 || error pass $CLEAN $START echo '== mkdir .../d7; mcreate .../d7/f; chmod .../d7/f = test 7' -mkdir $MOUNT/d7 -./mcreate $MOUNT/d7/f -chmod 0666 $MOUNT/d7/f -$CHECKSTAT -t file -p 0666 $MOUNT/d7/f || error +mkdir $DIR/d7 +$MCREATE $DIR/d7/f +chmod 0666 $DIR/d7/f +$CHECKSTAT -t file -p 0666 $DIR/d7/f || error +pass +$CLEAN +$START + +echo '== mkdir .../d7; mcreate .../d7/f2; chmod .../d7/f2 = test 7b' +$MCREATE $DIR/d7/f2 +echo -n foo > $DIR/d7/f2 +[ "`cat $DIR/d7/f2`" = "foo" ] || error +$CHECKSTAT -t file -s 3 $DIR/d7/f2 || error pass $CLEAN $START echo '== mkdir .../d8; touch .../d8/f; chmod .../d8/f == test 8' -mkdir $MOUNT/d8 -touch $MOUNT/d8/f -chmod 0666 $MOUNT/d8/f -$CHECKSTAT -t file -p 0666 $MOUNT/d8/f || error +mkdir $DIR/d8 +touch $DIR/d8/f +chmod 0666 $DIR/d8/f +$CHECKSTAT -t file -p 0666 $DIR/d8/f || error pass $CLEAN $START echo '== mkdir .../d9 .../d9/d2 .../d9/d2/d3 =========== test 9' -mkdir $MOUNT/d9 -mkdir $MOUNT/d9/d2 -mkdir $MOUNT/d9/d2/d3 -$CHECKSTAT -t dir $MOUNT/d9/d2/d3 || error +mkdir $DIR/d9 +mkdir $DIR/d9/d2 +mkdir $DIR/d9/d2/d3 +$CHECKSTAT -t dir $DIR/d9/d2/d3 
|| error pass $CLEAN $START echo '== mkdir .../d10 .../d10/d2; touch .../d10/d2/f = test 10' -mkdir $MOUNT/d10 -mkdir $MOUNT/d10/d2 -touch $MOUNT/d10/d2/f -$CHECKSTAT -t file $MOUNT/d10/d2/f || error +mkdir $DIR/d10 +mkdir $DIR/d10/d2 +touch $DIR/d10/d2/f +$CHECKSTAT -t file $DIR/d10/d2/f || error pass $CLEAN $START echo '== mkdir .../d11 d11/d2; chmod .../d11/d2 ======= test 11' -mkdir $MOUNT/d11 -mkdir $MOUNT/d11/d2 -chmod 0666 $MOUNT/d11/d2 -chmod 0555 $MOUNT/d11/d2 -$CHECKSTAT -t dir -p 0555 $MOUNT/d11/d2 || error +mkdir $DIR/d11 +mkdir $DIR/d11/d2 +chmod 0666 $DIR/d11/d2 +chmod 0705 $DIR/d11/d2 +$CHECKSTAT -t dir -p 0705 $DIR/d11/d2 || error pass $CLEAN $START echo '== mkdir .../d12; touch .../d12/f; chmod .../d12/f == test 12' -mkdir $MOUNT/d12 -touch $MOUNT/d12/f -chmod 0666 $MOUNT/d12/f -chmod 0555 $MOUNT/d12/f -$CHECKSTAT -t file -p 0555 $MOUNT/d12/f || error +mkdir $DIR/d12 +touch $DIR/d12/f +chmod 0666 $DIR/d12/f +chmod 0654 $DIR/d12/f +$CHECKSTAT -t file -p 0654 $DIR/d12/f || error pass $CLEAN $START -echo '== mkdir .../d13; cp /etc/passwd .../d13/f; > .../d13/f == test 13' -mkdir $MOUNT/d13 -cp /etc/hosts $MOUNT/d13/f -> $MOUNT/d13/f -$CHECKSTAT -t file -s 0 $MOUNT/d13/f || error +echo '== mkdir .../d13; creat .../d13/f; .../d13/f; > .../d13/f == test 13' +mkdir $DIR/d13 +dd if=/dev/zero of=$DIR/d13/f count=10 +> $DIR/d13/f +$CHECKSTAT -t file -s 0 $DIR/d13/f || error pass $CLEAN $START - echo '================================================== test 14' -mkdir $MOUNT/d14 -touch $MOUNT/d14/f -rm $MOUNT/d14/f -$CHECKSTAT -a $MOUNT/d14/f || error +mkdir $DIR/d14 +touch $DIR/d14/f +rm $DIR/d14/f +$CHECKSTAT -a $DIR/d14/f || error pass $CLEAN $START - echo '================================================== test 15' -mkdir $MOUNT/d15 -touch $MOUNT/d15/f -mv $MOUNT/d15/f $MOUNT/d15/f2 -$CHECKSTAT -t file $MOUNT/d15/f2 || error +mkdir $DIR/d15 +touch $DIR/d15/f +mv $DIR/d15/f $DIR/d15/f2 +$CHECKSTAT -t file $DIR/d15/f2 || error pass $CLEAN $START echo 
'================================================== test 16' -mkdir $MOUNT/d16 -touch $MOUNT/d16/f -rm -rf $MOUNT/d16/f -$CHECKSTAT -a $MOUNT/d16/f || error +mkdir $DIR/d16 +touch $DIR/d16/f +rm -rf $DIR/d16/f +$CHECKSTAT -a $DIR/d16/f || error pass $CLEAN $START echo '== symlinks: create, remove (dangling and real) == test 17' -mkdir $MOUNT/d17 -touch $MOUNT/d17/f -ln -s $MOUNT/d17/f $MOUNT/d17/l-exist -ln -s no-such-file $MOUNT/d17/l-dangle -ls -l $MOUNT/d17 -$CHECKSTAT -l $MOUNT/d17/f $MOUNT/d17/l-exist || error -$CHECKSTAT -f -t f $MOUNT/d17/l-exist || error -$CHECKSTAT -l no-such-file $MOUNT/d17/l-dangle || error -$CHECKSTAT -fa $MOUNT/d17/l-dangle || error -rm -f $MOUNT/l-dangle -rm -f $MOUNT/l-exist -$CHECKSTAT -a $MOUNT/l-dangle || error -$CHECKSTAT -a $MOUNT/l-exist || error +mkdir $DIR/d17 +touch $DIR/d17/f +ln -s $DIR/d17/f $DIR/d17/l-exist +ln -s no-such-file $DIR/d17/l-dangle +ls -l $DIR/d17 +$CHECKSTAT -l $DIR/d17/f $DIR/d17/l-exist || error +$CHECKSTAT -f -t f $DIR/d17/l-exist || error +$CHECKSTAT -l no-such-file $DIR/d17/l-dangle || error +$CHECKSTAT -fa $DIR/d17/l-dangle || error +rm -f $DIR/l-dangle +rm -f $DIR/l-exist +$CHECKSTAT -a $DIR/l-dangle || error +$CHECKSTAT -a $DIR/l-exist || error pass $CLEAN $START echo "== touch .../f ; ls ... ========================= test 18" -touch $MOUNT/f -ls $MOUNT || error +touch $DIR/f +ls $DIR || error pass $CLEAN $START echo "== touch .../f ; ls -l ... ====================== test 19" -touch $MOUNT/f -ls -l $MOUNT -rm $MOUNT/f -$CHECKSTAT -a $MOUNT/f || error +touch $DIR/f +ls -l $DIR +rm $DIR/f +$CHECKSTAT -a $DIR/f || error pass $CLEAN $START echo "== touch .../f ; ls -l ... 
====================== test 20" -touch $MOUNT/f -rm $MOUNT/f +touch $DIR/f +rm $DIR/f echo "1 done" -touch $MOUNT/f -rm $MOUNT/f +touch $DIR/f +rm $DIR/f echo "2 done" -touch $MOUNT/f -rm $MOUNT/f +touch $DIR/f +rm $DIR/f echo "3 done" -$CHECKSTAT -a $MOUNT/f || error +$CHECKSTAT -a $DIR/f || error pass $CLEAN $START echo '== write to dangling link ======================== test 21' -mkdir $MOUNT/d21 -[ -f $MOUNT/d21/dangle ] && rm -f $MOUNT/d21/dangle -ln -s dangle $MOUNT/d21/link -echo foo >> $MOUNT/d21/link -cat $MOUNT/d21/dangle -$CHECKSTAT -t link $MOUNT/d21/link || error -$CHECKSTAT -f -t file $MOUNT/d21/link || error +mkdir $DIR/d21 +[ -f $DIR/d21/dangle ] && rm -f $DIR/d21/dangle +ln -s dangle $DIR/d21/link +echo foo >> $DIR/d21/link +cat $DIR/d21/dangle +$CHECKSTAT -t link $DIR/d21/link || error +$CHECKSTAT -f -t file $DIR/d21/link || error pass $CLEAN $START echo '== unpack tar archive as non-root user =========== test 22' -mkdir $MOUNT/d22 -which sudo && chown 4711 $MOUNT/d22 +mkdir $DIR/d22 +which sudo && chown 4711 $DIR/d22 SUDO=`which sudo 2> /dev/null` && SUDO="$SUDO -u #4711" || SUDO="" -$SUDO tar cf - /etc/hosts /etc/sysconfig/network | $SUDO tar xfC - $MOUNT/d22 -ls -lR $MOUNT/d22/etc -$CHECKSTAT -t dir $MOUNT/d22/etc || error -[ -z "$SUDO" ] || $CHECKSTAT -u \#4711 $MOUNT/d22/etc || error +echo '**** FIX THIS TEST ****' +SUDO="" +$SUDO tar cf - /etc/hosts /etc/sysconfig/network | $SUDO tar xfC - $DIR/d22 +ls -lR $DIR/d22/etc +$CHECKSTAT -t dir $DIR/d22/etc || error +[ -z "$SUDO" ] || $CHECKSTAT -u \#4711 $DIR/d22/etc || error pass $CLEAN $START echo '== O_CREAT|O_EXCL in subdir ====================== test 23' -mkdir $MOUNT/d23 -./toexcl $MOUNT/d23/f23 -./toexcl -e $MOUNT/d23/f23 || error +mkdir $DIR/d23 +$TOEXCL $DIR/d23/f23 +$TOEXCL -e $DIR/d23/f23 || error pass $CLEAN $START @@ -297,189 +316,217 @@ $START echo '== rename sanity ================================= test24' echo '-- same directory rename' echo '-- test 24-R1: touch a ; rename a b' 
-mkdir $MOUNT/R1 -touch $MOUNT/R1/f -mv $MOUNT/R1/f $MOUNT/R1/g -$CHECKSTAT -t file $MOUNT/R1/g || error +mkdir $DIR/R1 +touch $DIR/R1/f +mv $DIR/R1/f $DIR/R1/g +$CHECKSTAT -t file $DIR/R1/g || error pass $CLEAN $START echo '-- test 24-R2: touch a b ; rename a b;' -mkdir $MOUNT/R2 -touch $MOUNT/R2/{f,g} -mv $MOUNT/R2/f $MOUNT/R2/g -$CHECKSTAT -a $MOUNT/R2/f || error -$CHECKSTAT -t file $MOUNT/R2/g || error +mkdir $DIR/R2 +touch $DIR/R2/{f,g} +mv $DIR/R2/f $DIR/R2/g +$CHECKSTAT -a $DIR/R2/f || error +$CHECKSTAT -t file $DIR/R2/g || error pass $CLEAN $START echo '-- test 24-R3: mkdir a ; rename a b;' -mkdir $MOUNT/R3 -mkdir $MOUNT/R3/f -mv $MOUNT/R3/f $MOUNT/R3/g -$CHECKSTAT -a $MOUNT/R3/f || error -$CHECKSTAT -t dir $MOUNT/R3/g || error +mkdir $DIR/R3 +mkdir $DIR/R3/f +mv $DIR/R3/f $DIR/R3/g +$CHECKSTAT -a $DIR/R3/f || error +$CHECKSTAT -t dir $DIR/R3/g || error pass $CLEAN $START echo '-- test 24-R4: mkdir a b ; rename a b;' -mkdir $MOUNT/R4 -mkdir $MOUNT/R4/{f,g} -perl -e "rename \"$MOUNT/R4/f\", \"$MOUNT/R4/g\";" -$CHECKSTAT -a $MOUNT/R4/f || error -$CHECKSTAT -t dir $MOUNT/R4/g || error +mkdir $DIR/R4 +mkdir $DIR/R4/{f,g} +perl -e "rename \"$DIR/R4/f\", \"$DIR/R4/g\";" +$CHECKSTAT -a $DIR/R4/f || error +$CHECKSTAT -t dir $DIR/R4/g || error pass $CLEAN $START echo '-- cross directory renames --' echo '-- test 24-R5: touch a ; rename a b' -mkdir $MOUNT/R5{a,b} -touch $MOUNT/R5a/f -mv $MOUNT/R5a/f $MOUNT/R5b/g -$CHECKSTAT -a $MOUNT/R5a/f || error -$CHECKSTAT -t file $MOUNT/R5b/g || error +mkdir $DIR/R5{a,b} +touch $DIR/R5a/f +mv $DIR/R5a/f $DIR/R5b/g +$CHECKSTAT -a $DIR/R5a/f || error +$CHECKSTAT -t file $DIR/R5b/g || error pass $CLEAN $START echo '-- test 24-R6: touch a ; rename a b' -mkdir $MOUNT/R6{a,b} -touch $MOUNT/R6a/f $MOUNT/R6b/g -mv $MOUNT/R6a/f $MOUNT/R6b/g -$CHECKSTAT -a $MOUNT/R6a/f || error -$CHECKSTAT -t file $MOUNT/R6b/g || error +mkdir $DIR/R6{a,b} +touch $DIR/R6a/f $DIR/R6b/g +mv $DIR/R6a/f $DIR/R6b/g +$CHECKSTAT -a $DIR/R6a/f || error +$CHECKSTAT 
-t file $DIR/R6b/g || error pass $CLEAN $START echo '-- test 24-R7: touch a ; rename a b' -mkdir $MOUNT/R7{a,b} -mkdir $MOUNT/R7a/f -mv $MOUNT/R7a/f $MOUNT/R7b/g -$CHECKSTAT -a $MOUNT/R7a/f || error -$CHECKSTAT -t dir $MOUNT/R7b/g || error +mkdir $DIR/R7{a,b} +mkdir $DIR/R7a/f +mv $DIR/R7a/f $DIR/R7b/g +$CHECKSTAT -a $DIR/R7a/f || error +$CHECKSTAT -t dir $DIR/R7b/g || error pass $CLEAN $START echo '-- test 24-R8: touch a ; rename a b' -mkdir $MOUNT/R8{a,b} -mkdir $MOUNT/R8a/f $MOUNT/R8b/g -perl -e "rename \"$MOUNT/R8a/f\", \"$MOUNT/R8b/g\";" -$CHECKSTAT -a $MOUNT/R8a/f || error -$CHECKSTAT -t dir $MOUNT/R8b/g || error +mkdir $DIR/R8{a,b} +mkdir $DIR/R8a/f $DIR/R8b/g +perl -e "rename \"$DIR/R8a/f\", \"$DIR/R8b/g\";" +$CHECKSTAT -a $DIR/R8a/f || error +$CHECKSTAT -t dir $DIR/R8b/g || error pass $CLEAN $START echo "-- rename error cases" echo "-- test 24-R9 target error: touch f ; mkdir a ; rename f a" -mkdir $MOUNT/R9 -mkdir $MOUNT/R9/a -touch $MOUNT/R9/f -perl -e "rename \"$MOUNT/R9/f\", \"$MOUNT/R9/a\";" -$CHECKSTAT -t file $MOUNT/R9/f || error -$CHECKSTAT -t dir $MOUNT/R9/a || error -$CHECKSTAT -a file $MOUNT/R9/a/f || error +mkdir $DIR/R9 +mkdir $DIR/R9/a +touch $DIR/R9/f +perl -e "rename \"$DIR/R9/f\", \"$DIR/R9/a\";" +$CHECKSTAT -t file $DIR/R9/f || error +$CHECKSTAT -t dir $DIR/R9/a || error +$CHECKSTAT -a file $DIR/R9/a/f || error pass $CLEAN $START echo "--test 24-R10 source does not exist" -mkdir $MOUNT/R10 -perl -e "rename \"$MOUNT/R10/f\", \"$MOUNT/R10/g\"" -$CHECKSTAT -t dir $MOUNT/R10 || error -$CHECKSTAT -a $MOUNT/R10/f || error -$CHECKSTAT -a $MOUNT/R10/g || error +mkdir $DIR/R10 +perl -e "rename \"$DIR/R10/f\", \"$DIR/R10/g\"" +$CHECKSTAT -t dir $DIR/R10 || error +$CHECKSTAT -a $DIR/R10/f || error +$CHECKSTAT -a $DIR/R10/g || error pass $CLEAN $START echo '== symlink sanity ================================ test25' echo "--test 25.1 create file in symlinked directory" -mkdir $MOUNT/d25 -ln -s d25 $MOUNT/s25 -touch $MOUNT/s25/foo +mkdir $DIR/d25 +ln 
-s d25 $DIR/s25 +touch $DIR/s25/foo pass $CLEAN $START echo "--test 25.2 lookup file in symlinked directory" -$CHECKSTAT -t file $MOUNT/s25/foo +$CHECKSTAT -t file $DIR/s25/foo pass $CLEAN $START echo "--test 26 multiple component symlink" -mkdir $MOUNT/d26 -mkdir $MOUNT/d26/d26-2 -ln -s d26/d26-2 $MOUNT/s26 -touch $MOUNT/s26/foo +mkdir $DIR/d26 +mkdir $DIR/d26/d26-2 +ln -s d26/d26-2 $DIR/s26 +touch $DIR/s26/foo pass $CLEAN $START echo "--test 26.1 multiple component symlink at the end of a lookup" -ln -s d26/d26-2/foo $MOUNT/s26-2 -touch $MOUNT/s26-2 +ln -s d26/d26-2/foo $DIR/s26-2 +touch $DIR/s26-2 pass $CLEAN $START echo "--test 26.2 a chain of symlinks" -mkdir $MOUNT/d26.2 -touch $MOUNT/d26.2/foo -ln -s d26.2 $MOUNT/s26.2-1 -ln -s s26.2-1 $MOUNT/s26.2-2 -ln -s s26.2-2 $MOUNT/s26.2-3 -chmod 0666 $MOUNT/s26.2-3/foo +mkdir $DIR/d26.2 +touch $DIR/d26.2/foo +ln -s d26.2 $DIR/s26.2-1 +ln -s s26.2-1 $DIR/s26.2-2 +ln -s s26.2-2 $DIR/s26.2-3 +chmod 0666 $DIR/s26.2-3/foo pass $CLEAN $START -echo '== stripe sanity ================================= test27' -echo "--test 26.1 create one stripe" -mkdir $MOUNT/d27 -../utils/lstripe $MOUNT/d27/f0 4096 0 1 -$CHECKSTAT -t file $MOUNT/d27/f0 -echo "--test 26.2 write to one stripe file" -cp /etc/hosts $MOUNT/d27/f0 +# recursive symlinks (bug 439) +echo "--test 26.3 create multiple component recursive symlink" +ln -s d26-3/foo $DIR/d26-3 pass $CLEAN $START -echo "--test 26.3 create two stripes" -../utils/lstripe $MOUNT/d27/f01 4096 0 2 -echo "--test 26.4 write to two stripe file" -cp /etc/hosts $MOUNT/d27/f01 +echo "--test 26.3 unlink multiple component recursive symlink" +rm $DIR/d26-3 pass $CLEAN $START -echo "--test 26.5 lstripe existing file (should return error)" -../utils/lstripe $MOUNT/d27/f12 4096 1 2 -! 
../utils/lstripe $MOUNT/d27/f12 4096 1 2 +echo '== stripe sanity ================================= test27' +echo "--test 27.1 create one stripe" +mkdir $DIR/d27 +$LSTRIPE $DIR/d27/f0 8192 0 1 +$CHECKSTAT -t file $DIR/d27/f0 +echo "--test 27.2 write to one stripe file" +cp /etc/hosts $DIR/d27/f0 +pass + +echo "--test 27.3 create two stripe file f01" +$LSTRIPE $DIR/d27/f01 8192 0 2 +echo "--test 27.4 write to two stripe file file f01" +dd if=/dev/zero of=$DIR/d27/f01 bs=4k count=4 +pass + +echo "--test 27.5 create file with default settings" +$LSTRIPE $DIR/d27/fdef 0 -1 0 +$CHECKSTAT -t file $DIR/d27/fdef +#dd if=/dev/zero of=$DIR/d27/fdef bs=4k count=4 + +echo "--test 27.6 lstripe existing file (should return error)" +$LSTRIPE $DIR/d27/f12 8192 1 2 +! $LSTRIPE $DIR/d27/f12 8192 1 2 +$CHECKSTAT -t file $DIR/d27/f12 +#dd if=/dev/zero of=$DIR/d27/f12 bs=4k count=4 +pass + + +echo "--test 27.7 lstripe with bad stripe size (should return error on LOV)" +$LSTRIPE $DIR/d27/fbad 100 1 2 || /bin/true +dd if=/dev/zero of=$DIR/d27/f12 bs=4k count=4 pass $CLEAN $START -echo "--test 26.6 lfind " -../utils/lfind $MOUNT/d27 +echo "--test 27.8 lfind " +$LFIND $DIR/d27 pass $CLEAN $START -echo '== IT_GETATTR regression ======================== test28' -mkdir $MOUNT/d28 -touch $MOUNT/d28/foo -MDCDIR=${MDCDIR:-/proc/lustre/devices/ldlm/MDC_mds1} +echo '== create/mknod/mkdir with bad file types ======== test28' +mkdir $DIR/d28 +$CREATETEST $DIR/d28/ct || error +pass + +echo '== IT_GETATTR regression ======================== test29' +mkdir $MOUNT/d29 +touch $MOUNT/d29/foo +ls -l $MOUNT/d29 +MDCDIR=${MDCDIR:-/proc/fs/lustre/ldlm/ldlm/MDC_MNT_localhost_mds1} LOCKCOUNTORIG=`cat $MDCDIR/lock_count` LOCKUNUSEDCOUNTORIG=`cat $MDCDIR/lock_unused_count` -ls -l $MOUNT/d28 +ls -l $MOUNT/d29 LOCKCOUNTCURRENT=`cat $MDCDIR/lock_count` LOCKUNUSEDCOUNTCURRENT=`cat $MDCDIR/lock_unused_count` if [ $LOCKCOUNTCURRENT -gt $LOCKCOUNTORIG ] || [ $LOCKUNUSEDCOUNTCURRENT -gt $LOCKUNUSEDCOUNTORIG ]; then @@ 
-490,7 +537,7 @@ $CLEAN $START echo '== cleanup =============================================' -rm -r $MOUNT/[Rdfs][1-9]* +rm -r $DIR/[Rdfs][1-9]* echo '======================= finished =======================' exit diff --git a/lustre/tests/sanityN.sh b/lustre/tests/sanityN.sh index 9c50574..8e95654 100644 --- a/lustre/tests/sanityN.sh +++ b/lustre/tests/sanityN.sh @@ -1,62 +1,112 @@ #!/bin/bash -export NAME=$NAME +set -e + +PATH=$PATH:. + +CHECKSTAT=${CHECKSTAT:-"checkstat -v"} +MOUNT1=${MOUNT1:-/mnt/lustre1} +MOUNT2=${MOUNT2:-/mnt/lustre2} +export NAME=${NAME:-mount2} + clean() { - echo -n "cleanup..." - sh llmount2-hackcleanup.sh > /dev/null + echo -n "cln.." + sh llmountcleanup.sh > /dev/null } -CLEAN=clean +CLEAN=${CLEAN:-clean} start() { - echo -n "mounting..." - sh llmount2-hack.sh > /dev/null - echo -n "mounted" + echo -n "mnt.." + sh llrmount.sh > /dev/null + echo -n "done" } -START=start +START=${START:-start} error () { - echo $1 + echo FAIL exit 1 } -mkdir -p /mnt/lustre2 -mount | grep /mnt/lustre2 || $START +pass() { + echo PASS +} + +mkdir -p $MOUNT2 +mount | grep $MOUNT1 || sh llmount.sh echo -n "test 1: check create on 2 mtpt's..." -touch /mnt/lustre1/f1 -[ -f /mnt/lustre2/f1 ] || error "test 1 failure" -echo "pass" +touch $MOUNT1/f1 +[ -f $MOUNT2/f1 ] || error +pass + +echo "test 2: check attribute updates on 2 mtpt's..." +chmod 777 $MOUNT2/f1 +$CHECKSTAT -t file -p 0777 $MOUNT1/f1 || error +pass -echo -n "test 2: check attribute updates on 2 mtpt's..." -chmod a+x /mnt/lustre2/f1 -[ -x /mnt/lustre1/f1 ] || error "test 2 failure" -echo "pass" +echo "test 2b: check cached attribute updates on 2 mtpt's..." +touch $MOUNT1/f2b +ls -l $MOUNT2/f2b +chmod 777 $MOUNT2/f2b +$CHECKSTAT -t file -p 0777 $MOUNT1/f2b || error +pass -echo -n "test 3: check after remount attribute updates on 2 mtpt's..." -chmod a-x /mnt/lustre2/f1 +echo "test 2c: check cached attribute updates on 2 mtpt's..." 
+touch $MOUNT1/f2c +ls -l $MOUNT2/f2c +chmod 777 $MOUNT1/f2c +$CHECKSTAT -t file -p 0777 $MOUNT2/f2c || error +pass + +echo "test 3: check after remount attribute updates on 2 mtpt's..." +chmod a-x $MOUNT2/f1 $CLEAN $START +$CHECKSTAT -t file -p 0666 $MOUNT1/f1 || error +pass + +echo "test 4: unlink on one mountpoint removes file on other..." +rm $MOUNT2/f1 +$CHECKSTAT -a $MOUNT1/f1 || error +pass -[ ! -x /mnt/lustre1/f1 ] || error "test 3 failure" -echo "pass" +echo -n "test 5: symlink on one mtpt, readlink on another..." +( cd $MOUNT1 ; ln -s this/is/good lnk ) -echo -n "test 4: symlink on one mtpt, readlink on another..." -( cd /mnt/lustre1 ; ln -s this/is/good lnk ) +[ "this/is/good" = "`perl -e 'print readlink("/mnt/lustre2/lnk");'`" ] || error +pass -[ "Xthis/is/good" = X`perl -e 'print readlink("/mnt/lustre2/lnk");'` ] || error "test 4 fails" -echo "pass" +echo -n "test 6: fstat validation on multiple mount points..." +./multifstat $MOUNT1/f6 $MOUNT2/f6 +pass -echo -n "test 5: fstat validation on multiple mount points..." -./multifstat /mnt/lustre1/fstatfile /mnt/lustre2/fstatfile || error "test 5 fails" -echo "pass" +echo "test 9: remove of open file on other node..." +./openunlink $MOUNT1/f9 $MOUNT2/f9 || error +pass -echo -n "test 9: remove of open file on other node..." -touch /mnt/lustre1/f9 -tail -f /mnt/lustre1/f9 & -rm /mnt/lustre2/f9 -kill %1 -cat /mnt/lustre1/f9 && error "test 9 fails" -echo "pass" +echo -n "test 10: append of file with sub-page size on multiple mounts..." +MTPT=1 +> $MOUNT2/f10 +for C in a b c d e f g h i j k l; do + MOUNT=`eval echo \\$MOUNT$MTPT` + echo -n $C >> $MOUNT/f10 + [ "$MTPT" -eq 1 ] && MTPT=2 || MTPT=1 +done +[ "`cat $MOUNT1/f10`" = "abcdefghijkl" ] && pass || error + +echo -n "test 11: write of file with sub-page size on multiple mounts..." 
+MTPT=1 +OFFSET=0 +> $MOUNT2/f11 +for C in a b c d e f g h i j k l; do + MOUNT=`eval echo \\$MOUNT$MTPT` + echo -n $C | dd of=$MOUNT/f11 bs=1 seek=$OFFSET count=1 + [ "$MTPT" -eq 1 ] && MTPT=2 || MTPT=1 + OFFSET=`expr $OFFSET + 1` +done +[ "`cat $MOUNT1/f11`" = "abcdefghijkl" ] && pass || error + +rm -f $MOUNT1/f[0-9]* $MOUNT1/lnk $CLEAN diff --git a/lustre/tests/uml.sh b/lustre/tests/uml.sh index 112a796..7a4b95e 100644 --- a/lustre/tests/uml.sh +++ b/lustre/tests/uml.sh @@ -4,12 +4,14 @@ config=${1-uml.xml} LMC=${LMC-../utils/lmc} TMP=${TMP:-/tmp} -MDSDEV=$TMP/mds1 -MDSSIZE=50000 +MDSDEV=${MDSDEV:-$TMP/mds1} +MDSSIZE=${MDSSIZE:-50000} -OSTDEV1=$TMP/ost1 -OSTDEV2=$TMP/ost2 -OSTSIZE=100000 +OSTDEV1=${OSTDEV1:-$TMP/ost1} +OSTDEV2=${OSTDEV2:-$TMP/ost2} +OSTSIZE=${OSTSIZE:-100000} + +NETTYPE=${NETTYPE:-tcp} # NOTE - You can't have different MDS/OST nodes and also have clients on the # MDS/OST nodes without using --endlevel and --startlevel during lconf. @@ -20,9 +22,9 @@ OSTSIZE=100000 # of the clients can be started, so plan accordingly. 
# Three separate systems -MDSNODE=uml1 -OSTNODES="uml2 uml2" -CLIENTS="uml3" +MDSNODE=${MDSNODE:-uml1} +OSTNODES=${OSTNODES:-"uml2 uml2"} +CLIENTS=${CLIENTS:-"uml3"} # Single system with additional clients #MDSNODE=uml1 @@ -41,26 +43,47 @@ CLIENTS="uml3" rm -f $config +h2tcp () { + case $1 in + client) echo '\*' ;; + *) echo $1 ;; + esac +} + +h2elan () { + case $1 in + client) echo '\*' ;; + *) echo $1 | sed "s/[^0-9]*//" ;; + esac +} + # create nodes -for NODE in $MDSNODE $OSTNODES $CLIENTS; do - eval [ \$$NODE ] && continue - ${LMC} -m $config --add net --node $NODE --nid $NODE --nettype tcp || exit 1 - eval "$NODE=done" +echo -n "adding NET for:" +for NODE in `echo $MDSNODE $OSTNODES $CLIENTS | sort -u`; do + echo -n " $NODE" + ${LMC} -m $config --add net --node $NODE --nid `h2$NETTYPE $NODE` --nettype elan || exit 1 done # configure mds server +echo; echo "adding MDS on: $MDSNODE" ${LMC} -m $config --add mds --format --node $MDSNODE --mds mds1 --dev $MDSDEV --size $MDSSIZE ||exit 10 # configure ost -${LMC} -m $config --add lov --lov lov1 --mds mds1 --stripe_sz 65536 --stripe_cnt 0 --stripe_pattern 0 || exit 20 +${LMC} -m $config --add lov --lov lov1 --mds mds1 --stripe_sz 65536 --stripe_cnt 1 --stripe_pattern 0 || exit 20 COUNT=1 +echo -n "adding OST on:" for NODE in $OSTNODES; do eval OSTDEV=\$OSTDEV$COUNT + echo -n " $NODE" + OSTDEV=${OSTDEV:-$OSTDEV1} ${LMC} -m $config --add ost --node $NODE --lov lov1 --dev $OSTDEV --size $OSTSIZE || exit 21 COUNT=`expr $COUNT + 1` done # create client config(s) +echo; echo -n "adding CLIENT on:" for NODE in $CLIENTS; do + echo -n " $NODE" ${LMC} -m $config --add mtpt --node $NODE --path /mnt/lustre --mds mds1 --lov lov1 || exit 30 done +echo diff --git a/lustre/utils/.cvsignore b/lustre/utils/.cvsignore index de7b425..fc0f010 100644 --- a/lustre/utils/.cvsignore +++ b/lustre/utils/.cvsignore @@ -12,3 +12,6 @@ lctl lfind lstripe lconf +obdstat +obdio +obdbarrier diff --git a/lustre/utils/Makefile.am 
b/lustre/utils/Makefile.am index 6a5483d..bfeebd72 100644 --- a/lustre/utils/Makefile.am +++ b/lustre/utils/Makefile.am @@ -6,10 +6,12 @@ KFLAGS:= CPPFLAGS = $(HAVE_LIBREADLINE) obdctl_LDADD := $(LIBREADLINE) lctl_LDADD := $(LIBREADLINE) -lptlctl -sbin_PROGRAMS = lctl lfind lstripe obdctl -sbin_SCRIPTS = lconf lmc +sbin_PROGRAMS = lctl lfind lstripe obdctl obdio obdbarrier obdstat +sbin_SCRIPTS = lconf lmc llanalyze obdctl_SOURCES = parser.c obdctl.c obd.c parser.h obdctl.h lctl_SOURCES = parser.c obd.c lctl.c parser.h +obdio_SOURCES = obdio.c obdiolib.c obdiolib.h +obdbarrier_SOURCES = obdbarrier.c obdiolib.c obdiolib.h lfind_SOURCES = lfind.c lstripe_SOURCES = lstripe.c lfind_CPPFLAGS = -D_XOPEN_SOURCE=500 diff --git a/lustre/utils/lconf.in b/lustre/utils/lconf.in index 46549cc..796871d 100755 --- a/lustre/utils/lconf.in +++ b/lustre/utils/lconf.in @@ -25,10 +25,15 @@ # Based in part on the XML obdctl modifications done by Brian Behlendorf import sys, getopt, types -import string, os, stat, popen2, socket, time, random, fcntl, FCNTL, select +import string, os, stat, popen2, socket, time, random, fcntl, select import re, exceptions import xml.dom.minidom +if sys.version[0] == '1': + from FCNTL import F_GETFL, F_SETFL +else: + from fcntl import F_GETFL, F_SETFL + # Global parameters TCP_ACCEPTOR = '' MAXTCPBUF = 1048576 @@ -72,7 +77,7 @@ config.xml Lustre configuration in xml format. 
Levels are aproximatly like: 10 - network 20 - device, ldlm - 30 - obd, mdd + 30 - osd, mdd 40 - mds, ost 50 - mdc, osc 60 - lov @@ -294,8 +299,8 @@ class LCTLInterface: raise CommandError('lctl', "unable to find lctl binary.") def set_nonblock(self, fd): - fl = fcntl.fcntl(fd, FCNTL.F_GETFL) - fcntl.fcntl(fd, FCNTL.F_SETFL, fl | os.O_NDELAY) + fl = fcntl.fcntl(fd, F_GETFL) + fcntl.fcntl(fd, F_SETFL, fl | os.O_NDELAY) def run(self, cmds): """ @@ -464,8 +469,8 @@ class LCTLInterface: cmds = """ ignore_errors device $%s - cleanup - detach %s + cleanup %s + detach quit""" % (name, ('', 'force')[config.force()]) self.run(cmds) @@ -575,23 +580,20 @@ def is_block(path): # build fs according to type # fixme: dangerous -def mkfs(fstype, dev): +def mkfs(dev, devsize, fstype): + block_cnt = '' + if devsize: + # devsize is in 1k, and fs block count is in 4k + block_cnt = devsize/4 + if(fstype in ('ext3', 'extN')): - mkfs = 'mkfs.ext2 -j -b 4096' + mkfs = 'mkfs.ext2 -j -b 4096 -F ' elif (fstype == 'reiserfs'): - mkfs = 'mkfs.reiserfs -f' + mkfs = 'mkreiserfs -ff' else: print 'unsupported fs type: ', fstype - if not is_block(dev): - if(fstype in ('ext3', 'extN')): - force = '-F' - elif (fstype == 'reiserfs'): - force = '' - else: - print 'unsupported fs type: ', fstype - else: - force = '' - (ret, out) = run (mkfs, force, dev) + + (ret, out) = run (mkfs, dev, block_cnt) if ret: panic("Unable to build fs:", dev) # enable hash tree indexing on fsswe @@ -676,7 +678,7 @@ def block_dev(dev, size, fstype, format): if not is_block(dev): dev = init_loop(dev, size, fstype) if config.reformat() or (need_format(fstype, dev) and format == 'yes'): - mkfs(fstype, dev) + mkfs(dev, size, fstype) # else: # panic("device:", dev, @@ -869,14 +871,13 @@ class Network(Module): if not self.nid: panic("unable to set nid for", self.net_type, self.nid) debug("nid:", self.nid) - self.add_portals_module("linux/oslib", 'portals') if node_needs_router(): self.add_portals_module("linux/router", 'kptlrouter') 
if self.net_type == 'tcp': self.add_portals_module("linux/socknal", 'ksocknal') if self.net_type == 'toe': - self.add_portals_odule("/linux/toenal", 'ktoenal') + self.add_portals_module("/linux/toenal", 'ktoenal') if self.net_type == 'elan': self.add_portals_module("/linux/rqswnal", 'kqswnal') if self.net_type == 'gm': @@ -897,7 +898,7 @@ class Network(Module): lctl.add_route(net_type, gw, lo, hi) if net_type in ('tcp', 'toe') and net_type == self.net_type and hi == '': srvdb = self.db.nid2server(lo) - if not srv: + if not srvdb: panic("no server for nid", lo) else: srv = Network(srvdb) @@ -905,14 +906,15 @@ class Network(Module): lctl.network(self.net_type, self.nid) - lctl.newdev(attach = "ptlrpc RPCDEV RPCDEV_UUID") + if not is_prepared("RPCDEV_UUID"): + lctl.newdev(attach = "ptlrpc RPCDEV RPCDEV_UUID") def cleanup(self): self.info(self.net_type, self.nid, self.port) for net_type, gw, lo, hi in self.db.get_route_tbl(): if self.net_type in ('tcp', 'toe') and hi == '': srvdb = self.db.nid2server(lo) - if not srv: + if not srvdb: panic("no server for nid", lo) else: srv = Network(srvdb) @@ -930,9 +932,10 @@ class Network(Module): cleanup_error(e.rc) try: - lctl.cleanup("RPCDEV", "RPCDEV_UUID") + if is_prepared("RPCDEV_UUID"): + lctl.cleanup("RPCDEV", "RPCDEV_UUID") except CommandError, e: - print "cleanup failed: ", self.name + print "cleanup failed: RPCDEV" e.dump() cleanup_error(e.rc) try: @@ -959,6 +962,8 @@ class LDLM(Module): class LOV(Module): def __init__(self,db): Module.__init__(self, 'LOV', db) + self.add_lustre_module('mdc', 'mdc') + self.add_lustre_module('lov', 'lov') self.mds_uuid = self.db.get_first_ref('mds') mds= self.db.lookup(self.mds_uuid) self.mds_name = mds.getName() @@ -967,66 +972,49 @@ class LOV(Module): self.pattern = self.db.get_val_int('stripepattern', 0) self.devlist = self.db.get_refs('obd') self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist)) - self.add_lustre_module('mdc', 'mdc') - self.add_lustre_module('lov', 
'lov') - - def prepare(self): - if is_prepared(self.uuid): - return + self.osclist = [] for obd_uuid in self.devlist: obd = self.db.lookup(obd_uuid) - osc = get_osc(obd) + osc = get_osc(obd, self.name) if osc: - try: - # Ignore connection failures, because the LOV will DTRT with - # an unconnected OSC. - osc.prepare(ignore_connect_failure=1) - except CommandError: - print "Error preparing OSC %s (inactive)\n" % osc_uuid + self.osclist.append(osc) else: - panic('osc not found:', osc_uuid) - mdc_uuid = prepare_mdc(self.db, self.mds_uuid) + panic('osc not found:', obd_uuid) + + def prepare(self): + if is_prepared(self.uuid): + return + for osc in self.osclist: + try: + # Ignore connection failures, because the LOV will DTRT with + # an unconnected OSC. + osc.prepare(ignore_connect_failure=1) + except CommandError: + print "Error preparing OSC %s (inactive)\n" % osc.uuid + self.mdc_uuid = prepare_mdc(self.db, self.name, self.mds_uuid) self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz, self.stripe_off, self.pattern, self.devlist, self.mds_name) lctl.newdev(attach="lov %s %s" % (self.name, self.uuid), - setup ="%s" % (mdc_uuid)) + setup ="%s" % (self.mdc_uuid)) def cleanup(self): - if not is_prepared(self.uuid): - return - for obd_uuid in self.devlist: - obd = self.db.lookup(obd_uuid) - osc = get_osc(obd) - if osc: - osc.cleanup() - else: - panic('osc not found:', osc_uuid) - Module.cleanup(self) - cleanup_mdc(self.db, self.mds_uuid) - + if is_prepared(self.uuid): + Module.cleanup(self) + for osc in self.osclist: + osc.cleanup() + cleanup_mdc(self.db, self.name, self.mds_uuid) def load_module(self): - for obd_uuid in self.devlist: - obd = self.db.lookup(obd_uuid) - osc = get_osc(obd) - if osc: - osc.load_module() - break - else: - panic('osc not found:', osc_uuid) + for osc in self.osclist: + osc.load_module() + break Module.load_module(self) - def cleanup_module(self): Module.cleanup_module(self) - for obd_uuid in self.devlist: - obd = self.db.lookup(obd_uuid) - 
osc = get_osc(obd) - if osc: - osc.cleanup_module() - break - else: - panic('osc not found:', osc_uuid) + for osc in self.osclist: + osc.cleanup_module() + break class LOVConfig(Module): def __init__(self,db): @@ -1055,7 +1043,7 @@ class MDSDEV(Module): self.size = self.db.get_val_int('devsize', 0) self.fstype = self.db.get_val('fstype', '') # overwrite the orignal MDSDEV name and uuid with the MDS name and uuid - self.uuid = self.db.get_first_ref('mds') + self.uuid = self.db.get_first_ref('target') mds = self.db.lookup(self.uuid) self.name = mds.getName() self.lovconfig_uuids = mds.get_refs('lovconfig') @@ -1090,60 +1078,26 @@ class MDSDEV(Module): print "cleanup failed: ", self.name e.dump() cleanup_error(e.rc) - if not is_prepared(self.uuid): - return - Module.cleanup(self) + if is_prepared(self.uuid): + Module.cleanup(self) clean_loop(self.devname) -# Very unusual case, as there is no MDC element in the XML anymore -# Builds itself from an MDS node -class MDC(Module): - def __init__(self,db): - self.mds_uuid = db.getUUID() - self.mds_name = db.getName() - self.db = db - node_name = config.select(self.mds_name) - if node_name: - self.mdd_uuid = self.db.get_mdd(node_name, self.mds_uuid) - else: - self.mdd_uuid = db.get_first_ref('active') - if not self.mdd_uuid: - panic("No MDSDEV found for MDS service:", self.mds_name) - self.module_name = 'MDC' - self.kmodule_list = [] - self._server = None - self._connected = 0 - - host = socket.gethostname() - self.name = 'MDC_%s' % (self.mds_name) - self.uuid = '%s_%05x_%05x' % (self.name, int(random.random() * 1048576), - int(random.random() * 1048576)) - - self.lookup_server(self.mdd_uuid) - self.add_lustre_module('mdc', 'mdc') - - def prepare(self): - if is_prepared(self.uuid): - return - self.info(self.mds_uuid) - srv = self.get_server() - lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem) - lctl.newdev(attach="mdc %s %s" % (self.name, self.uuid), - setup ="%s %s" %(self.mds_uuid, 
srv.uuid)) - -class OBD(Module): +class OSD(Module): def __init__(self, db): - Module.__init__(self, 'OBD', db) - self.obdtype = self.db.get_val('obdtype') + Module.__init__(self, 'OSD', db) + self.osdtype = self.db.get_val('osdtype') self.devname = self.db.get_val('devpath', '') self.size = self.db.get_val_int('devsize', 0) self.fstype = self.db.get_val('fstype', '') - self.active_target = self.db.get_first_ref('active') + self.uuid = self.db.get_first_ref('target') + ost = self.db.lookup(self.uuid) + self.name = ost.getName() # FIXME: if fstype not set, then determine based on kernel version self.format = self.db.get_val('autoformat', 'yes') if self.fstype == 'extN': self.add_lustre_module('extN', 'extN') - self.add_lustre_module(self.obdtype, self.obdtype) + self.add_lustre_module('ost', 'ost') + self.add_lustre_module(self.osdtype, self.osdtype) if self.fstype: self.add_lustre_module('obdclass' , 'fsfilt_%s' % (self.fstype)) @@ -1153,96 +1107,67 @@ class OBD(Module): def prepare(self): if is_prepared(self.uuid): return - self.info(self.obdtype, self.devname, self.size, self.fstype, self.format) - if self.obdtype == 'obdecho': + self.info(self.osdtype, self.devname, self.size, self.fstype, self.format) + if self.osdtype == 'obdecho': blkdev = '' else: blkdev = block_dev(self.devname, self.size, self.fstype, self.format) - lctl.newdev(attach="%s %s %s" % (self.obdtype, self.name, self.uuid), + lctl.newdev(attach="%s %s %s" % (self.osdtype, self.name, self.uuid), setup ="%s %s" %(blkdev, self.fstype)) - def cleanup(self): - if not is_prepared(self.uuid): - return - Module.cleanup(self) - if not self.obdtype == 'obdecho': - clean_loop(self.devname) - -class COBD(Module): - def __init__(self, db): - Module.__init__(self, 'COBD', db) - self.real_uuid = self.db.get_first_ref('realobd') - self.cache_uuid = self.db.get_first_ref('cacheobd') - self.add_lustre_module('cobd' , 'cobd') - - # need to check /proc/mounts and /etc/mtab before - # formatting anything. 
- # FIXME: check if device is already formatted. - def prepare(self): - if is_prepared(self.uuid): - return - self.info(self.real_uuid, self.cache_uuid) - lctl.newdev(attach="cobd %s %s" % (self.name, self.uuid), - setup ="%s %s" %(self.real_uuid, self.cache_uuid)) - -class OST(Module): - def __init__(self,db): - Module.__init__(self, 'OST', db) - self.obd_uuid = self.db.get_first_ref('obd') - self.add_lustre_module('ost', 'ost') - - def prepare(self): - if is_prepared(self.uuid): - return - self.info(self.obd_uuid) - lctl.newdev(attach="ost %s %s" % (self.name, self.uuid), - setup ="%s" % (self.obd_uuid)) - + if not is_prepared('OSS_UUID'): + lctl.newdev(attach="ost %s %s" % ('OSS', 'OSS_UUID'), + setup ="") -# virtual interface for OSC and LOV -class VOSC(Module): - def __init__(self,db): - Module.__init__(self, 'VOSC', db) - if db.get_class() == 'lov': - self.osc = LOV(db) - else: - self.osc = get_osc(db) - def get_uuid(self): - return self.osc.uuid - def prepare(self): - self.osc.prepare() def cleanup(self): - self.osc.cleanup() - def load_module(self): - self.osc.load_module() - def cleanup_module(self): - self.osc.cleanup_module() - + if is_prepared('OSS_UUID'): + try: + lctl.cleanup("OSS", "OSS_UUID") + except CommandError, e: + print "cleanup failed: ", self.name + e.dump() + cleanup_error(e.rc) + if is_prepared(self.uuid): + Module.cleanup(self) + if not self.osdtype == 'obdecho': + clean_loop(self.devname) -class OSC(Module): - def __init__(self, db, obd_name, obd_uuid, ost_uuid): +# Generic client module, used by OSC and MDC +class Client(Module): + def __init__(self, db, module, owner, target_name, target_uuid): + self.target_name = target_name + self.target_uuid = target_uuid self.db = db - self.module_name = 'OSC' - self.name = 'OSC_%s' % (obd_name) - self.uuid = '%s_%05x' % (self.name, int(random.random() * 1048576)) + node_name = config.select(target_name) + if node_name: + self.tgt_dev_uuid = self.db.get_target_device(node_name, target_uuid) + 
else: + self.tgt_dev_uuid = db.get_first_ref('active') + if not self.tgt_dev_uuid: + panic("No target device found for target:", target_name) self.kmodule_list = [] self._server = None self._connected = 0 - self.obd_uuid = obd_uuid - self.ost_uuid = ost_uuid - debug("OSC:", obd_uuid, ost_uuid) - self.lookup_server(self.ost_uuid) - self.add_lustre_module('osc', 'osc') + self.module = module + self.module_name = string.upper(module) + self.name = '%s_%s_%s' % (self.module_name, owner, target_name) + self.uuid = '%05x_%s_%05x' % (int(random.random() * 1048576), self.name, + int(random.random() * 1048576)) + self.uuid = self.uuid[0:36] + self.lookup_server(self.tgt_dev_uuid) + self.add_lustre_module(module, module) def prepare(self, ignore_connect_failure = 0): if is_prepared(self.uuid): return - self.info(self.obd_uuid, self.ost_uuid) + self.info(self.target_uuid) srv = self.get_server() try: if local_net(srv): + #debug("LOCAL NET") lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem) else: + #debug("NOT LOCAL NET") r = find_route(srv) if r: lctl.add_route_host(r[0], srv.uuid, r[1], r[2]) @@ -1251,16 +1176,15 @@ class OSC(Module): except CommandError: if (ignore_connect_failure == 0): pass - - lctl.newdev(attach="osc %s %s" % (self.name, self.uuid), - setup ="%s %s" %(self.obd_uuid, srv.uuid)) + lctl.newdev(attach="%s %s %s" % (self.module, self.name, self.uuid), + setup ="%s %s" %(self.target_uuid, srv.uuid)) def cleanup(self): srv = self.get_server() if local_net(srv): Module.cleanup(self) else: - self.info(self.obd_uuid, self.ost_uuid) + self.info(self.targt_uuid) r = find_route(srv) if r: try: @@ -1270,7 +1194,61 @@ class OSC(Module): e.dump() cleanup_error(e.rc) Module.cleanup(self) + + + +class MDC(Client): + def __init__(self, db, owner, target_name, target_uuid): + Client.__init__(self, db, 'mdc', owner, target_name, target_uuid) + +class OSC(Client): + def __init__(self, db, owner, target_name, target_uuid): + 
Client.__init__(self, db, 'osc', owner, target_name, target_uuid) + +class COBD(Module): + def __init__(self, db): + Module.__init__(self, 'COBD', db) + self.real_uuid = self.db.get_first_ref('realobd') + self.cache_uuid = self.db.get_first_ref('cacheobd') + self.add_lustre_module('cobd' , 'cobd') + + # need to check /proc/mounts and /etc/mtab before + # formatting anything. + # FIXME: check if device is already formatted. + def prepare(self): + if is_prepared(self.uuid): + return + self.info(self.real_uuid, self.cache_uuid) + lctl.newdev(attach="cobd %s %s" % (self.name, self.uuid), + setup ="%s %s" %(self.real_uuid, self.cache_uuid)) + + +# virtual interface for OSC and LOV +class VOSC(Module): + def __init__(self,db, owner): + Module.__init__(self, 'VOSC', db) + if db.get_class() == 'lov': + self.osc = LOV(db) + else: + self.osc = get_osc(db, owner) + def get_uuid(self): + return self.osc.uuid + def prepare(self): + self.osc.prepare() + def cleanup(self): + self.osc.cleanup() + def load_module(self): + self.osc.load_module() + def cleanup_module(self): + self.osc.cleanup_module() + def need_mdc(self): + return self.db.get_class() != 'lov' + def get_mdc_uuid(self): + if self.db.get_class() == 'lov': + return self.osc.mdc_uuid + return '' + class ECHO_CLIENT(Module): def __init__(self,db): @@ -1278,7 +1256,7 @@ class ECHO_CLIENT(Module): self.add_lustre_module('obdecho', 'obdecho') self.obd_uuid = self.db.get_first_ref('obd') obd = self.db.lookup(self.obd_uuid) - self.osc = VOSC(obd) + self.osc = VOSC(obd, self.name) def prepare(self): if is_prepared(self.uuid): @@ -1290,8 +1268,8 @@ class ECHO_CLIENT(Module): setup = self.osc.get_uuid()) def cleanup(self): - if not is_prepared(self.uuid): - return + if is_prepared(self.uuid): + Module.cleanup(self) self.osc.cleanup() def load_module(self): @@ -1308,18 +1286,22 @@ class Mountpoint(Module): self.path = self.db.get_val('path') self.mds_uuid = self.db.get_first_ref('mds') self.obd_uuid = self.db.get_first_ref('obd') 
- self.add_lustre_module('mdc', 'mdc') - self.add_lustre_module('llite', 'llite') obd = self.db.lookup(self.obd_uuid) - self.osc = VOSC(obd) + self.vosc = VOSC(obd, self.name) + if self.vosc.need_mdc(): + self.add_lustre_module('mdc', 'mdc') + self.add_lustre_module('llite', 'llite') def prepare(self): - self.osc.prepare() - mdc_uuid = prepare_mdc(self.db, self.mds_uuid) + self.vosc.prepare() + if self.vosc.need_mdc(): + mdc_uuid = prepare_mdc(self.db, self.name, self.mds_uuid) + else: + mdc_uuid = self.vosc.get_mdc_uuid() self.info(self.path, self.mds_uuid, self.obd_uuid) cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \ - (self.osc.get_uuid(), mdc_uuid, self.path) + (self.vosc.get_uuid(), mdc_uuid, self.path) run("mkdir", self.path) ret, val = run(cmd) if ret: @@ -1338,27 +1320,21 @@ class Mountpoint(Module): if fs_is_mounted(self.path): panic("fs is still mounted:", self.path) - self.osc.cleanup() - cleanup_mdc(self.db, self.mds_uuid) + self.vosc.cleanup() + if self.vosc.need_mdc(): + cleanup_mdc(self.db, self.name, self.mds_uuid) def load_module(self): - self.osc.load_module() + self.vosc.load_module() Module.load_module(self) def cleanup_module(self): Module.cleanup_module(self) - self.osc.cleanup_module() + self.vosc.cleanup_module() # ============================================================ # XML processing and query -# OSC is no longer in the xml, so we have to fake it. 
-# this is getting ugly and begging for another refactoring -def get_osc(obd_dom): - obd = OBD(obd_dom) - osc = OSC(obd_dom, obd.name, obd.uuid, obd.active_target) - return osc - class LustreDB: def lookup(self, uuid): """ lookup returns a new LustreDB instance""" @@ -1419,10 +1395,10 @@ class LustreDB: return ost.lookup(uuid) def nid2server(self, nid): - netlist = self.parent.parent.attrs['network'] + netlist = self.lookup_class('network') for net_db in netlist: if net_db.get_val('nid') == nid: - return net + return net_db return None # the tag name is the service type @@ -1438,7 +1414,7 @@ class LustreDB: ret = 10 elif type in ('device', 'ldlm'): ret = 20 - elif type in ('obd', 'mdd', 'cobd'): + elif type in ('osd', 'mdd', 'cobd'): ret = 30 elif type in ('mdsdev','ost'): ret = 40 @@ -1470,23 +1446,33 @@ class LustreDB: list.sort() return list - # Find the mdsdev attached to node_name that points to - # mds_uuid - # node->profiles->mdsdev_refs->mds - def get_mdd(self, node_name, mds_uuid): + # Find the target_device for target on a node + # node->profiles->device_refs->target + def get_target_device(self, node_name, target_uuid): node_db = self.lookup_name(node_name) if not node_db: return None prof_list = node_db.get_refs('profile') for prof_uuid in prof_list: prof_db = node_db.lookup(prof_uuid) - mdd_list = prof_db.get_refs('mdsdev') - for mdd_uuid in mdd_list: - mdd = self.lookup(mdd_uuid) - if mdd.get_first_ref('mds') == mds_uuid: - return mdd_uuid + ref_list = prof_db.get_all_refs() + for ref in ref_list: + dev = self.lookup(ref[1]) + if dev and dev.get_first_ref('target') == target_uuid: + return ref[1] return None - + + # get all network uuids for this node + def get_networks(self): + ret = [] + prof_list = self.get_refs('profile') + for prof_uuid in prof_list: + prof_db = self.lookup(prof_uuid) + net_list = prof_db.get_refs('network') + debug("get_networks():", prof_uuid, net_list) + for net_uuid in net_list: + ret.append(net_uuid) + return ret class 
LustreDB_XML(LustreDB): def __init__(self, dom, root_node): @@ -1599,24 +1585,24 @@ class LustreDB_XML(LustreDB): """ Return the routes as a list of tuples of the form: [(type, gw, lo, hi),]""" res = [] - tbl = self.dom_node.getElementsByTagName('route_tbl') + tbl = self.dom_node.getElementsByTagName('routetbl') for t in tbl: routes = t.getElementsByTagName('route') for r in routes: lo = self.xmlattr(r, 'lo') - hi = self.xmlattr(r, 'hi', '') + hi = self.xmlattr(r, 'hi') res.append((type, gw, lo, hi)) return res def get_route_tbl(self): ret = [] - tbls = self.dom_node.getElementsByTagName('route_tbl') + tbls = self.dom_node.getElementsByTagName('routetbl') for tbl in tbls: for r in tbl.getElementsByTagName('route'): net_type = self.xmlattr(r, 'type') gw = self.xmlattr(r, 'gw') lo = self.xmlattr(r, 'lo') - hi = self.xmlattr(r,'hi', '') + hi = self.xmlattr(r, 'hi') ret.append((net_type, gw, lo, hi)) return ret @@ -1652,7 +1638,7 @@ class LustreDB_LDAP(LustreDB): self.l.protocol_version=ldap.VERSION3 # user and pw only needed if modifying db self.l.bind_s("", "", ldap.AUTH_SIMPLE); - except ldap.LDAPerror, e: + except ldap.LDAPError, e: panic(e) # FIXME, do something useful here @@ -1749,28 +1735,27 @@ class LustreDB_LDAP(LustreDB): # MDC UUID hack - # FIXME: clean this mess up! # -saved_mdc = {} -def prepare_mdc(db, mds_uuid): - global saved_mdc +# OSC is no longer in the xml, so we have to fake it. 
+# this is getting ugly and begging for another refactoring +def get_osc(ost_db, owner): + osc = OSC(ost_db, owner, ost_db.getName(), ost_db.getUUID()) + return osc + +def get_mdc(db, owner, mds_uuid): mds_db = db.lookup(mds_uuid); if not mds_db: panic("no mds:", mds_uuid) - if saved_mdc.has_key(mds_uuid): - return saved_mdc[mds_uuid] - mdc = MDC(mds_db) + mdc = MDC(mds_db, owner, mds_db.getName(), mds_uuid) + return mdc + +def prepare_mdc(db, owner, mds_uuid): + mdc = get_mdc(db, owner, mds_uuid) mdc.prepare() - saved_mdc[mds_uuid] = mdc.uuid return mdc.uuid -def cleanup_mdc(db, mds_uuid): - global saved_mdc - mds_db = db.lookup(mds_uuid); - if not mds_db: - panic("no mds:", mds_uuid) - if not saved_mdc.has_key(mds_uuid): - mdc = MDC(mds_db) - mdc.cleanup() - saved_mdc[mds_uuid] = mdc.uuid +def cleanup_mdc(db, owner, mds_uuid): + mdc = get_mdc(db, owner, mds_uuid) + mdc.cleanup() ############################################################ @@ -1780,13 +1765,13 @@ routes = [] local_node = [] router_flag = 0 -def init_node(node_db): - global local_node, router_flag - netlist = node_db.lookup_class('network') - for db in netlist: - type = db.get_val('nettype') - gw = db.get_val('nid') - local_node.append((type, gw)) +def add_local_interfaces(node_db): + global local_node + debug("add_local") + for netuuid in node_db.get_networks(): + net = node_db.lookup(netuuid) + debug("add_local", netuuid) + local_node.append((net.get_val('nettype'), net.get_val('nid'))) def node_needs_router(): return router_flag @@ -1800,20 +1785,26 @@ def init_route_config(lustre): for node_db in list: if node_db.get_val_int('router', 0): router_flag = 1 + #debug("init_route_config: found router", node_db.getName()) for (local_type, local_nid) in local_node: + #debug("init_route_config:", local_type, local_nid) gw = None - netlist = node_db.lookup_class('network') - for db in netlist: - if local_type == db.get_val('type'): - gw = db.get_val('server') + for netuuid in node_db.get_networks(): + 
db = node_db.lookup(netuuid) + if local_type == db.get_val('nettype'): + gw = db.get_val('nid') break + #debug("init_route_config: gw is", gw) if not gw: continue - for db in netlist: - if local_type != db.get_val('type'): + for netuuid in node_db.get_networks(): + db = node_db.lookup(netuuid) + #debug("init_route_config: tbl: ", db.get_route_tbl()) + if local_type != db.get_val('nettype'): for route in db.get_routes(local_type, gw): routes.append(route) - + #debug("init_route_config routes:", routes) + def local_net(net): global local_node @@ -1830,19 +1821,18 @@ def find_route(net): to = net.nid debug ('looking for route to', to_type,to) for r in routes: + #debug("find_route: ", r) if r[2] == to: return r return None - ############################################################ # lconf level logic # Start a service. -def startService(db, module_flag): +def newService(db): type = db.get_class() debug('Service:', type, db.getName(), db.getUUID()) - # there must be a more dynamic way of doing this... 
n = None if type == 'ldlm': n = LDLM(db) @@ -1850,39 +1840,19 @@ def startService(db, module_flag): n = LOV(db) elif type == 'network': n = Network(db) - elif type == 'obd': - n = OBD(db) + elif type == 'osd': + n = OSD(db) elif type == 'cobd': n = COBD(db) - elif type == 'ost': - n = OST(db) elif type == 'mdsdev': n = MDSDEV(db) - elif type == 'osc': - n = VOSC(db) - elif type == 'mdc': - n = MDC(db) elif type == 'mountpoint': n = Mountpoint(db) elif type == 'echoclient': n = ECHO_CLIENT(db) else: panic ("unknown service type:", type) - - if module_flag: - if config.nomod(): - return - if config.cleanup(): - n.cleanup_module() - else: - n.load_module() - else: - if config.nosetup(): - return - if config.cleanup(): - n.cleanup() - else: - n.prepare() + return n # # Prepare the system to run lustre using a particular profile @@ -1892,15 +1862,35 @@ def startService(db, module_flag): # * make sure partitions are in place and prepared # * initialize devices with lctl # Levels is important, and needs to be enforced. 
-def startProfile(prof_db, module_flag): - if not prof_db: - panic("profile:", profile, "not found.") - services = prof_db.getServices() - if config.cleanup(): - services.reverse() +def for_each_profile(db, prof_list, operation): + for prof_uuid in prof_list: + prof_db = db.lookup(prof_uuid) + if not prof_db: + panic("profile:", profile, "not found.") + services = prof_db.getServices() + operation(services) + +def doSetup(services): for s in services: - startService(s[1], module_flag) + n = newService(s[1]) + n.prepare() + +def doModules(services): + for s in services: + n = newService(s[1]) + n.load_module() +def doCleanup(services): + services.reverse() + for s in services: + n = newService(s[1]) + n.cleanup() + +def doUnloadModules(services): + services.reverse() + for s in services: + n = newService(s[1]) + n.cleanup_module() # # Load profile for @@ -1921,32 +1911,38 @@ def doHost(lustreDB, hosts): timeout = node_db.get_val_int('timeout', 0) if not router_flag: - init_node(node_db) + add_local_interfaces(node_db) init_route_config(lustreDB) # Two step process: (1) load modules, (2) setup lustre # if not cleaning, load modules first. 
- module_flag = not config.cleanup() prof_list = node_db.get_refs('profile') - for prof_uuid in prof_list: - prof_db = node_db.lookup(prof_uuid) - startProfile(prof_db, module_flag) - if not config.cleanup(): + if config.cleanup(): + if config.force(): + # the command line can override this value + timeout = 5 + sys_set_timeout(timeout) + sys_set_recovery_upcall(recovery_upcall) + + for_each_profile(node_db, prof_list, doCleanup) + for_each_profile(node_db, prof_list, doUnloadModules) + + else: + for_each_profile(node_db, prof_list, doModules) + sys_set_debug_path() script = config.gdb_script() run(lctl.lctl, ' modules >', script) if config.gdb(): - # dump /tmp/ogdb and sleep/pause here log ("The GDB module script is in", script) + # pause, so user has time to break and + # load the script time.sleep(5) sys_set_timeout(timeout) sys_set_recovery_upcall(recovery_upcall) - - module_flag = not module_flag - for prof_uuid in prof_list: - prof_db = node_db.lookup(prof_uuid) - startProfile(prof_db, module_flag) + + for_each_profile(node_db, prof_list, doSetup) ############################################################ # Command line processing diff --git a/lustre/utils/lctl.c b/lustre/utils/lctl.c index 2217058..fb81dd3 100644 --- a/lustre/utils/lctl.c +++ b/lustre/utils/lctl.c @@ -124,7 +124,7 @@ command_t cmdlist[] = { "type specific device configuration information\n" "usage: setup "}, {"cleanup", jt_obd_cleanup, 0, "cleanup previously setup device\n" - "usage: cleanup"}, + "usage: cleanup [force]"}, {"detach", jt_obd_detach, 0, "remove driver (and name and uuid) from current device\n" "usage: detach"}, @@ -164,6 +164,15 @@ command_t cmdlist[] = { {"test_brw", jt_obd_test_brw, 0, "do bulk read/writes ( per I/O, on OST object )\n" "usage: test_brw [t] [write [verbose [npages [[t]objid]]]]"}, + {"get_stripe", jt_obd_get_stripe, 0, + "show stripe info for an echo client object\n" + "usage: get_stripe objid\n"}, + {"set_stripe", jt_obd_set_stripe, 0, + "set stripe info 
for an echo client object\n" + "usage: set_stripe objid[=width!count[@offset][:id:id...]\n"}, + {"unset_stripe", jt_obd_unset_stripe, 0, + "unset stripe info for an echo client object\n" + "usage: unset_stripe objid\n"}, {"test_ldlm", jt_obd_test_ldlm, 0, "perform lock manager test\n" "usage: test_ldlm"}, @@ -180,6 +189,12 @@ command_t cmdlist[] = { {"newconn", jt_obd_newconn, 0, "newconn [newuuid]"}, {"failconn", jt_obd_failconn, 0, "failconn "}, {"lookup", jt_obd_mdc_lookup, 0, "usage: lookup "}, + {"notransno", jt_obd_no_transno, 0, + "disable sending of committed-transno updates\n" + "usage: notransno"}, + {"readonly", jt_obd_set_readonly, 0, + "disable writes to the underlying device\n" + "usage: readonly"}, /* Debug commands */ {"======== debug =========", jt_noop, 0, "debug"}, diff --git a/lustre/utils/lfind.c b/lustre/utils/lfind.c index 93777d6..1b75135 100644 --- a/lustre/utils/lfind.c +++ b/lustre/utils/lfind.c @@ -11,7 +11,6 @@ #include #include #include -#define printk printf #include #include #include @@ -34,14 +33,14 @@ char * shortOpts = "ho:qv"; char * usageMsg = "[ --obd | --query ] ..."; int max_ost_count = MAX_LOV_UUID_COUNT; -obd_uuid_t * obduuid; +struct obd_uuid * obduuid; __u32 obdcount; __u32 obdindex; char * buf; int buflen; struct obd_ioctl_data data; struct lov_desc desc; -obd_uuid_t * uuids; +struct obd_uuid * uuids; int uuidslen; int cfglen; struct lov_mds_md *lmm; @@ -74,7 +73,7 @@ main (int argc, char **argv) { exit(1); } - obduuid = (obd_uuid_t *)optarg; + obduuid = (struct obd_uuid *)optarg; break; case 'h': usage(stdout); @@ -155,7 +154,7 @@ init() } lmm = (struct lov_mds_md *)buf; - uuids = (obd_uuid_t *)buf; + uuids = (struct obd_uuid *)buf; } void @@ -261,7 +260,7 @@ processFile(const char *path, const struct stat *sp, int flag, struct FTW *ftwp) __u32 getobdindex(const char *path) { - obd_uuid_t *uuidp; + struct obd_uuid *uuidp; int fd; int rc; int i; diff --git a/lustre/utils/llparser.pm b/lustre/utils/llparser.pm new file 
mode 100644 index 0000000..5cee31f --- /dev/null +++ b/lustre/utils/llparser.pm @@ -0,0 +1,399 @@ +#!/usr/bin/perl +# Copyright (C) 2002 Cluster File Systems, Inc. +# Author: Hariharan Thantry + +# This file is part of Lustre, http://www.lustre.org. +# +# Lustre is free software; you can redistribute it and/or +# modify it under the terms of version 2 of the GNU General Public +# License as published by the Free Software Foundation. +# +# Lustre is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Lustre; if not, write to the Free Software +# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +# + + +package llparser; +require Exporter; +@ISA = qw(Exporter); +@EXPORT = qw(parse_file print_rpcrelations parse_foptions %ll_subsystems + %subsysnum %trace_masks $e_subsys $e_mask $e_processor $e_time + $e_file $e_line $e_function $e_pid $e_stack $e_fmtstr $e_backref + $e_treeparent $e_numchildren $e_youngestchild $e_next $e_pidhead + $e_rpcsndrcv $e_rpcpid $e_rpcxid $e_rpcnid $e_rpcopc $e_rpcnext + $e_curlineref $SEND $RCV); + +($e_subsys, + $e_mask, + $e_processor, + $e_time, + $e_file, + $e_line, + $e_function, + $e_pid, + $e_stack, + $e_fmtstr, + $e_treeparent, + $e_numchildren, + $e_youngestchild, + $e_pidhead, + $e_next, + $e_backref) = (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + +($e_rpcpid, + $e_rpcxid, + $e_rpcnid, + $e_rpcopc, + $e_rpcnext, + $e_rpcsndrcv, + $e_curlineref) = (0, 1, 2, 3, 4, 5, 6); + +$SEND = 0; +$RCV = 1; + +$REGEX=qr/^\s*(\w+)\s*:\s*(\d+)\s*:\s*(\d+)\s*:\s*(\d+\.(?:\d+))\s*\(\s*([^:]+)\s*:\s*(\d+)\s*:\s*([^()]+)\s*\(\)\s*(?:(?:\d+)\s*\|\s*)?(\d+)\s*\+\s*(\d+)\s*(?:.*)\):(.*)$/; + +$RPCREGEX = 
qr/^\s*(?:Sending|Handling)\s*RPC\s*pid:xid:nid:opc\s*(\d+):(?:0x)?(\w+):(?:0x)?(\w+):(\d+)\s*$/; +$FILEOPTIONREGEX = qr/(--server)|(-s)/; +$SENDING = qr/Sending/; + + +# Needs to match definition in portals/include/linux/kp30.h +%ll_subsystems = ("00" => "UNDEFINED", "01" => "MDC", "02" => "MDS", + "03" => "OSC", "04" => "OST", "05" => "CLASS", + "06" => "OBDFS","07" => "LLITE","08" => "RPC", + "09" => "EXT2OBD","0a" => "PORTALS","0b" => "SOCKNAL", + "0c" => "QSWNAL","0d" => "PINGER","0e" => "FILTER", + "0f" => "TRACE","10" => "ECHO","11" => "LDLM", + "12" => "LOV", "13" => "GMNAL","14" => "PTLROUTER" ); + +%subsysnum; +$subsysnum->{UNDEFINED} = 0; +$subsysnum->{MDC} = 1; +$subsysnum->{MDS} = 2; +$subsysnum->{OSC} = 3; +$subsysnum->{OST} = 4; +$subsysnum->{CLASS} = 5; +$subsysnum->{OBDFS} = 6; +$subsysnum->{LLITE} = 7; +$subsysnum->{RPC} = 8; +$subsysnum->{EXT2OBD} = 9; +$subsysnum->{PORTALS} = 10; +$subsysnum->{SOCKNAL} = 11; +$subsysnum->{QSWNAL} = 12; +$subsysnum->{PINGER} = 13; +$subsysnum->{FILTER} = 14; +$subsysnum->{TRACE} = 15; # obdtrace, not to be confused with D_TRACE */ +$subsysnum->{ECHO} = 16; +$subsysnum->{LDLM} = 17; +$subsysnum->{LOV} = 18; +$subsysnum->{GMNAL} = 19; +$subsysnum->{PTLROUTER} = 20; + +%tracemasks; +$tracemasks->{TRACE} = 1 << 0; # /* ENTRY/EXIT markers */ +$tracemasks->{INODE} = 1 << 1; # +$tracemasks->{SUPER} = 1 << 2; # +$tracemasks->{EXT2} = 1 << 3; # /* anything from ext2_debug */ +$tracemasks->{MALLOC} = 1 << 4; # /* print malloc, free information */ +$tracemasks->{CACHE} = 1 << 5; # /* cache-related items */ +$tracemasks->{INFO} = 1 << 6; # /* general information */ +$tracemasks->{IOCTL} = 1 << 7; # /* ioctl related information */ +$tracemasks->{BLOCKS} = 1 << 8; # /* ext2 block allocation */ +$tracemasks->{NET} = 1 << 9; # /* network communications */ +$tracemasks->{WARNING} = 1 << 10; # +$tracemasks->{BUFFS} = 1 << 11; # +$tracemasks->{OTHER} = 1 << 12; # +$tracemasks->{DENTRY} = 1 << 13; # +$tracemasks->{PORTALS} = 1 << 
14; # /* ENTRY/EXIT markers */ +$tracemasks->{PAGE} = 1 << 15; # /* bulk page handling */ +$tracemasks->{DLMTRACE} = 1 << 16; # +$tracemasks->{ERROR} = 1 << 17; # /* CERROR(...) == CDEBUG(D_ERROR, ...) */ +$tracemasks->{EMERG} = 1 << 18; # /* CEMERG(...) == CDEBUG(D_EMERG, ...) */ +$tracemasks->{HA} = 1 << 19; # /* recovery and failover */ +$tracemasks->{RPCTRACE} = 1 << 19; # /* RPC tracing; NOTE(review): same bit as HA above (1 << 19) -- confirm intended value */ + +# Contains all the file names, the first filename is the +# client. After that are all servers. +my @filearray = (); + + +# Create backlinks between array entries based on the calling sequence +# For each new PID encountered, the first entry will be present in the +# PID hash. + +sub create_links { + my $arrayref = shift @_; + my $pidhashref = shift @_; + my $stitchref = shift @_; + my %local_hash; + my $hash_lineref; + my $tmpfmtref; + my $tmpref; + my $firstlineaftermarker = 0; + + foreach $lineref (@$arrayref) { + next if ($lineref->[$e_time] == 0); # Skip the client marker line + my $pidprevious = $pidhashref->{$lineref->[$e_pid]}; + if ($pidprevious->[$e_next] == 0) { + $pidprevious->[$e_next] = $lineref; + if (exists $local_hash{$lineref->[$e_pid]} + && $firstlineaftermarker) { + $hash_lineref=$local_hash{$lineref->[$e_pid]}; + $hash_lineref->[$e_next] =$lineref; + $firstlineaftermarker = 0; + } + } elsif ($local_hash{$lineref->[$e_pid]} == 0) { + # True only for the first line, the marker line. 
+ $local_hash{$lineref->[$e_pid]}=$lineref; + #print "LINE ADDED TO HASH: @$lineref\n"; + $firstlineaftermarker = 1; + } + # Stack grows upward (assumes x86 kernel) + if ($lineref->[$e_stack] < $pidprevious->[$e_stack]) { + # lineref is not a child of pidprevious, find its parent + LINE: while(($lineref->[$e_stack] < $pidprevious->[$e_stack]) && + ($lineref->[$e_function] == $pidprevious->[$e_function]) + ) { + # This second part of the comparison is a HACK + last LINE if ($pidprevious->[$e_backref] == 0); + $pidprevious = $pidprevious->[$e_backref]; + } + } + if ($lineref->[$e_stack] > $pidprevious->[$e_stack]) { + # lineref is child of pidprevious, with the caveat that they must + # belong to different functions. This is a HACK + # until CDEBUG is modified + while($lineref->[$e_function] eq $pidprevious->[$e_function]) { + last if ($pidprevious->[$e_backref] == 0); + $pidprevious = $pidprevious->[$e_backref]; + } + + $lineref->[$e_backref] = $pidprevious; + $pidprevious->[$e_numchildren]++; + } else { + # lineref is sibling of pidprevious + $lineref->[$e_numchildren] = 0; + $lineref->[$e_backref] = $pidprevious->[$e_backref]; + ($lineref->[$e_backref])->[$e_numchildren]++; + } + + $pidhashref->{$lineref->[$e_pid]} = $lineref; + $lineref->[$e_youngestchild] = $lineref; + while ($pidprevious->[$e_backref] != 0) { + $pidprevious->[$e_youngestchild] = $lineref; + $pidprevious = $pidprevious->[$e_backref]; + } + $pidprevious->[$e_youngestchild] = $lineref; + $lineref->[$e_pidhead]=$pidprevious; + + # Stitch together RPCs + if($lineref->[$e_fmtstr] =~ $RPCREGEX) { + #print "RPC LINE: @$lineref\n"; + $tmpfmtref = [$1, $2, $3, $4, 0, 0, 0]; + if ($lineref->[$e_fmtstr] =~ $SENDING) { + $tmpfmtref->[$e_rpcsndrcv] = $SEND; + } else { $tmpfmtref->[$e_rpcsndrcv] = $RCV; } + $tmpfmtref->[$e_curlineref] = $lineref; + $stitchref->{$lineref->[$e_time]} = $tmpfmtref; + + } + + } +match_rpcs($stitchref); +return $arrayref; +} + + + + +# Main loop, parses the debug log + +sub 
parse_file { + my %hasharray; + my $input_files = shift; + + my $stitch_ref = shift; + my $pid = shift; + my $rpctrace = shift; + my $trace = shift; + my $nodlm = shift; + my $noclass = shift; + my $nonet = shift; + + print "$pid, $rpctrace, $nodlm, $noclass, $nonet\n"; + $backref = 0; + $treeparent = 0; + $numchildren = 0; + $youngestchild = 0; + $next = 0; + $pidhead = 0; + $iter = 0; + + foreach $file (@$input_files) { + + open(FILEHANDLE, $file) or die "Can't open file: $file\n"; + while() { + if (/$REGEX/) { + @parsed_line=($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, + $treeparent, $numchildren, $youngestchild, + $pidhead, $next, $backref); + next if (($parsed_line[$e_pid] != $pid) && + ($pid) && ($iter == 0)); + next if (($parsed_line[$e_mask] != $tracemasks->{RPCTRACE}) + && ($rpctrace)); + next if ($trace && $parsed_line[$e_mask] != + $tracemasks->{TRACE}); + next if ($nodlm && hex($parsed_line[$e_subsys]) == + $subsysnum->{LDLM}); + next if ($noclass && hex($parsed_line[$e_subsys]) == + $subsysnum->{CLASS}); + next if ($nonet && (hex($parsed_line[$e_subsys]) == + $subsysnum->{RPC} || + hex($parsed_line[$e_subsys]) == + $subsysnum->{NET} || + hex($parsed_line[$e_subsys]) == + $subsysnum->{PORTALS} || + hex($parsed_line[$e_subsys]) == + $subsysnum->{SOCKNAL} || + hex($parsed_line[$e_subsys]) == + $subsysnum->{QSWNAL} || + hex($parsed_line[$e_subsys]) == + $subsysnum->{GMNAL})); + + + if (!exists($hasharray{$parsed_line[$e_pid]})) { + # Push a marker for the beginning of this PID + my @marker_line; + $marker_line[$e_subsys] = 0; + $marker_line[$e_mask] = 0; + $marker_line[$e_processor] = 0; + $marker_line[$e_time] = $parsed_line[$e_time]; + $marker_line[$e_file] = 0; + $marker_line[$e_line] = 0; + $marker_line[$e_function] = 0; + $marker_line[$e_pid] = $parsed_line[$e_pid]; + # marker lines are everyone's parent, so stack value zero + $marker_line[$e_stack] = 0; + $marker_line[$e_fmtstr] = ""; + $marker_line[$e_treeparent] = 0; + $marker_line[$e_numchildren] = 
0; + $marker_line[$e_youngestchild] = 0; + $marker_line[$e_pidhead] = 0; + $marker_line[$e_next]= \@parsed_line; + $marker_line[$e_backref] = 0; + $hasharray{$parsed_line[$e_pid]} = \@marker_line; + push @$array_parsed, [ @marker_line ]; + + } + push @$array_parsed, [ @parsed_line ]; + } + + } + close(FILEHANDLE); + if ($iter == 0) { + # Insert end of client line marker, an all zero pattern; + @marker_line = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + push @$array_parsed, [ @marker_line ]; + + } + $iter ++; + } + + $array_parsed=create_links($array_parsed, \%hasharray, $stitch_ref); + #print_array($array_parsed); + return $array_parsed; +} + +sub print_array { + + my $arrayref = shift; + foreach $lineref(@$arrayref){ + if ($lineref->[$e_backref]==0){ + print "MARKER LINE(addr): $lineref contents: [@$lineref]\n"; + } else { + + print "REGULAR LINE (addr) :$lineref contents:[@$lineref]\n"; + } + } + +} + +sub print_rpcrelations { + + my $rpchashref = shift; + foreach $rpckeys (sort keys %$rpchashref) { + $tmpref = $rpchashref->{$rpckeys}; + #print "Key: $rpckeys, Contents: @$tmpref\n"; + + } + +} +sub match_rpcs { + my $rpchashref = shift; + foreach $rpckeys (sort keys %$rpchashref) { + $tmpref = $rpchashref->{$rpckeys}; + #print "MATCHING: $@tmpref...\n"; + foreach $cmpkeys (sort keys %$rpchashref) { + next if($cmpkeys == $rpckeys); + $cmpref = $rpchashref->{$cmpkeys}; + # print "Line compared: @$cmpref\n"; + next if ($tmpref->[$e_rpcsndrcv] == $cmpref->[$e_rpcsndrcv]); + next if ($tmpref->[$e_rpcpid] != $cmpref->[$e_rpcpid]); + next if ($tmpref->[$e_rpcxid] != $cmpref->[$e_rpcxid]); + if ($tmpref->[$e_rpcsndrcv] == $SEND) { + $tmpref->[$e_rpcnext] = $cmpkeys; + #print "MACTHED: KEY 1: $rpckeys CONTENTS: @$tmpref", + #"KEY2: $cmpkeys CONTENTS: @$cmpref\n" + + } + + } + + } + +} + +sub getnextchild { + my $rootline = shift; + my $lineref = shift; + my $tempref = $lineref->[$e_next]; + if ($tempref == 0) { + return 0; + } + + if (($tempref->[$e_stack] > 
$rootline->[$e_stack]) || + (($tempref->[$e_stack] <= $rootline->[$e_stack]) && + ($tempref->[$e_function] == $rootline->[$e_function]) + )){ + # Child + return $tempref; + + } + return 0; + + +} + + +sub parse_foptions { + + my $inarg = shift; + my $idx = 0; + foreach $elem(@$inarg) { + next if ($elem =~ /$FILEOPTIONREGEX/); + $filearray[$idx] = $elem; + $idx++; + } + return \@filearray; +} + +1; +#$array_parsed=parse_file(); +#print_array($array_parsed); diff --git a/lustre/utils/lmc b/lustre/utils/lmc index 4d40a5b..3de4eb4 100755 --- a/lustre/utils/lmc +++ b/lustre/utils/lmc @@ -65,18 +65,18 @@ Object creation command summary: -add ost --node node_name - --obd obd_name + --ost ost_name --lov lov_name --dev path --size size --fstype extN|ext3 - --obduuid uuid + --ostuuid uuid --add mtpt - Mountpoint --node node_name --path /mnt/point --mds mds_name - --obd obd_name OR --lov lovname + --ost ost_name OR --lov lov_name """ sys.exit(1) @@ -217,18 +217,19 @@ class GenConfig: ldlm = self.newService("ldlm", name, uuid) return ldlm - def obd(self, name, uuid, fs, obdtype, devname, format, ost_uuid, dev_size=0): - obd = self.newService("obd", name, uuid) - obd.setAttribute('obdtype', obdtype) - obd.appendChild(self.ref("active", ost_uuid)) + def osd(self, name, uuid, fs, osdtype, devname, format, ost_uuid, net_uuid, dev_size=0): + osd = self.newService("osd", name, uuid) + osd.setAttribute('osdtype', osdtype) + osd.appendChild(self.ref("target", ost_uuid)) + osd.appendChild(self.ref("network", net_uuid)) if fs: - self.addElement(obd, "fstype", fs) + self.addElement(osd, "fstype", fs) if devname: - dev = self.addElement(obd, "devpath", devname) - self.addElement(obd, "autoformat", format) + dev = self.addElement(osd, "devpath", devname) + self.addElement(osd, "autoformat", format) if dev_size: - self.addElement(obd, "devsize", "%s" % (dev_size)) - return obd + self.addElement(osd, "devsize", "%s" % (dev_size)) + return osd def cobd(self, name, uuid, real_uuid, 
cache_uuid): cobd = self.newService("cobd", name, uuid) @@ -236,18 +237,21 @@ class GenConfig: cobd.appendChild(self.ref("cacheobd",cache_uuid)) return cobd - def ost(self, name, uuid, obd_uuid, net_uuid): + def ost(self, name, uuid, osd_uuid): ost = self.newService("ost", name, uuid) - ost.appendChild(self.ref("network", net_uuid)) - ost.appendChild(self.ref("obd", obd_uuid)) + ost.appendChild(self.ref("active", osd_uuid)) return ost + def oss(self, name, uuid): + oss = self.newService("oss", name, uuid) + return oss + def lov(self, name, uuid, mds_uuid, stripe_sz, stripe_cnt, pattern): lov = self.newService("lov", name, uuid) lov.appendChild(self.ref("mds", mds_uuid)) - lov.setAttribute("stripesize", stripe_sz) - lov.setAttribute("stripecount", stripe_cnt) - lov.setAttribute("stripepattern", pattern) + lov.setAttribute("stripesize", str(stripe_sz)) + lov.setAttribute("stripecount", str(stripe_cnt)) + lov.setAttribute("stripepattern", str(pattern)) return lov def lovconfig(self, name, uuid, lov_uuid): @@ -269,7 +273,7 @@ class GenConfig: if dev_size: self.addElement(mdd, "devsize", "%s" % (dev_size)) mdd.appendChild(self.ref("network", net_uuid)) - mdd.appendChild(self.ref("mds", mds_uuid)) + mdd.appendChild(self.ref("target", mds_uuid)) return mdd def mountpoint(self, name, uuid, mds_uuid, osc_uuid, path): @@ -345,6 +349,17 @@ def get_net_uuid(lustre, node_name): def lov_add_obd(gen, lov, osc_uuid): lov.appendChild(gen.ref("obd", osc_uuid)) +def ref_exists(profile, uuid): + elist = profile.childNodes + for e in elist: + if e.nodeType == e.ELEMENT_NODE: + ref = e.getAttribute('uuidref') + if ref == uuid: + return 1 + return 0 + +# ensure that uuid is not already in the profile +# return true if uuid is added def node_add_profile(gen, node, ref, uuid): refname = "%s_ref" % "profile" ret = node.getElementsByTagName(refname) @@ -352,7 +367,12 @@ def node_add_profile(gen, node, ref, uuid): error('node has no profile ref:', node) prof_uuid = 
ret[0].getAttribute('uuidref') profile = lookup(node.parentNode, prof_uuid) + if not profile: + error("no profile found:", prof_uuid) + if ref_exists(profile, uuid): + return 0 profile.appendChild(gen.ref(ref, uuid)) + return 1 def get_attr(dom_node, attr, default=""): v = dom_node.getAttribute(attr) @@ -400,7 +420,7 @@ def add_net(gen, lustre, options): nid = get_option(options, 'nid') net_type = get_option(options, 'nettype') - if net_type == 'tcp': + if net_type in ('tcp', 'toe'): port = get_option_int(options, 'port', DEFAULT_PORT) tcpbuf = get_option_int(options, 'tcpbuf', 0) elif net_type in ('elan', 'gm'): @@ -476,9 +496,9 @@ def add_mds(gen, lustre, options): def add_ost(gen, lustre, options): node_name = get_option(options, 'node') lovname = get_option(options, 'lov', '') - obdtype = get_option(options, 'obdtype', 'obdfilter') + osdtype = get_option(options, 'osdtype', 'obdfilter', deprecated_tag="obdtype") - if obdtype == 'obdecho': + if osdtype == 'obdecho': fstype = '' devname = '' size = 0 @@ -488,38 +508,45 @@ def add_ost(gen, lustre, options): size = get_option(options, 'size', 0) fstype = get_option(options, 'fstype', 'extN') - obdname = get_option(options, 'obd', 'OBD_'+ node_name) - obdname = new_name(obdname) - ostname = new_name('OST_'+ obdname) - if options.has_key('obduuid'): - obd_uuid = options['obduuid'] - obd = lookup(lustre, obd_uuid) - if obd: - error("Duplicate OBD UUID:", obd_uuid) + ostname = get_option(options, 'ost', '', deprecated_tag='obd') + if not ostname: + ostname = new_name('OST_'+ node_name) + + osdname = new_name("OSD_" + ostname) + osd_uuid = get_option(options, 'osduuid', '', deprecated_tag = 'obduuid') + if osd_uuid and lookup(lustre, osd_uuid): + error("Duplicate OBD UUID:", osd_uuid) else: - obd_uuid = new_uuid(obdname) - ost_uuid = new_uuid(ostname) + osd_uuid = new_uuid(osdname) + + ost_uuid = name2uuid(lustre, ostname, fatal=0) + if not ost_uuid: + ost_uuid = new_uuid(ostname) + ost = gen.ost(ostname, ost_uuid, 
osd_uuid) + lustre.appendChild(ost) + if lovname: + lov = findByName(lustre, lovname, "lov") + if not lov: + error('add_ost:', '"'+lovname+'"', "lov element not found.") + lov_add_obd(gen, lov, ost_uuid) net_uuid = get_net_uuid(lustre, node_name) if not net_uuid: - error("NODE: ", node_name, "not found") + error("NODE: No net network interface for", node_name, "found") - obd = gen.obd(obdname, obd_uuid, fstype, obdtype, devname, get_format_flag(options), ost_uuid, - size) - ost = gen.ost(ostname, ost_uuid, obd_uuid, net_uuid) - - if lovname: - lov = findByName(lustre, lovname, "lov") - if not lov: - error('add_ost:', '"'+lovname+'"', "lov element not found.") - lov_add_obd(gen, lov, obd_uuid) + osd = gen.osd(osdname, osd_uuid, fstype, osdtype, devname, get_format_flag(options), ost_uuid, + net_uuid, size) node = findByName(lustre, node_name, "node") - node_add_profile(gen, node, 'obd', obd_uuid) - node_add_profile(gen, node, 'ost', ost_uuid) - lustre.appendChild(obd) - lustre.appendChild(ost) +## if node_add_profile(gen, node, 'oss', oss_uuid): +## ossname = 'OSS' +## oss_uuid = new_uuid(ossname) +## oss = gen.oss(ossname, oss_uuid) +## lustre.appendChild(oss) + + node_add_profile(gen, node, 'osd', osd_uuid) + lustre.appendChild(osd) def add_cobd(gen, lustre, options): @@ -542,7 +569,7 @@ def add_cobd(gen, lustre, options): def add_echo_client(gen, lustre, options): """ add an echo client to the profile for this node. 
""" node_name = get_option(options, 'node') - lov_name = get_option(options, 'obd') + lov_name = get_option(options, 'ost') node = findByName(lustre, node_name, 'node') @@ -552,7 +579,7 @@ def add_echo_client(gen, lustre, options): lov_uuid = name2uuid(lustre, lov_name, tag='lov', fatal=0) if not lov_uuid: - lov_uuid = name2uuid(lustre, lov_name, tag='obd', fatal=1) + lov_uuid = name2uuid(lustre, lov_name, tag='ost', fatal=1) echo = gen.echo_client(echoname, echo_uuid, lov_uuid) lustre.appendChild(echo) @@ -567,9 +594,9 @@ def add_lov(gen, lustre, options): warning("name:", lov_orig, "already used. using:", name) mds_name = get_option(options, 'mds') - stripe_sz = get_option(options, 'stripe_sz') - stripe_cnt = get_option(options, 'stripe_cnt', 0) - pattern = get_option(options, 'stripe_pattern', 0) + stripe_sz = get_option_int(options, 'stripe_sz') + stripe_cnt = get_option_int(options, 'stripe_cnt', 0) + pattern = get_option_int(options, 'stripe_pattern', 0) uuid = new_uuid(name) ret = findByName(lustre, name, "lov") @@ -597,9 +624,9 @@ def add_mtpt(gen, lustre, options): mds_name = get_option(options, 'mds') lov_name = get_option(options, 'lov', '') if lov_name == '': - lov_name = get_option(options, 'obd', '') + lov_name = get_option(options, 'ost', '', deprecated_tag='obd') if lov_name == '': - error("--add mtpt requires either --lov lov_name or --obd obd_name") + error("--add mtpt requires either --lov lov_name or --ost ost_name") name = new_name('MNT_'+ node_name) @@ -610,7 +637,7 @@ def add_mtpt(gen, lustre, options): mds_uuid = name2uuid(lustre, mds_name, tag='mds') lov_uuid = name2uuid(lustre, lov_name, tag='lov', fatal=0) if not lov_uuid: - lov_uuid = name2uuid(lustre, lov_name, tag='obd', fatal=1) + lov_uuid = name2uuid(lustre, lov_name, tag='ost', fatal=1) uuid = new_uuid(name) mtpt = gen.mountpoint(name, uuid, mds_uuid, lov_uuid, path) @@ -620,6 +647,7 @@ def add_mtpt(gen, lustre, options): node_add_profile(gen, node, "mountpoint", uuid) 
lustre.appendChild(mtpt) +# obsolete, leaving behind for reference def add_oscref(gen, lustre, options): """ create mtpt on a node """ node_name = get_option(options, 'node') @@ -644,30 +672,37 @@ def has_option(options, tag): return 1 return 0 -def get_option(options, tag, default = None): +def get_option(options, tag, default = None, deprecated_tag=None): """Look for tag in options hash and return the value if set. If not set, then if return default it is set, otherwise exception.""" if options.has_key(tag): return options[tag] + elif deprecated_tag and options.has_key(deprecated_tag): + warning('--'+deprecated_tag, " is deprecated, please use:", '--'+tag) + return options[deprecated_tag] elif default != None: return default else: - raise OptionError("--add %s requires --%s value" % (options['add'], tag)) + raise OptionError("--add %s requires --%s " % (options['add'], tag)) # this exception should print an error like '--add blah requires -- value' def get_option_int(options, tag, default = None): """Return an integer option. 
Raise exception if the value is not an int""" val = get_option(options, tag, default) - return int(val) + try: + n = int(val) + except ValueError: + raise OptionError("--%s (value must be integer)" % (tag)) + return n def parse_cmdline(argv): short_opts = "ho:i:m:" long_opts = ["add=", "node=", "nettype=", "nid=", "tcpbuf=", "port=", "echo_client=", "stripe_sz=", "stripe_cnt=", "stripe_pattern=", "mds=", "route", "router", "merge=", "format", "reformat", "output=", - "dev=", "size=", "obd=", "obdtype=", "obduuid=", "in=", - "path=", "help", "batch=", "lov=", "gw=", "lo=", "hi=", - "oscref", "osc=", "real_obd=", "cache_obd=", "fstype=", + "dev=", "size=", "obd=", "ost=", "obdtype=", "osdtype=", "obduuid=", "in=", + "osduuid=", "path=", "help", "batch=", "lov=", "gw=", "lo=", "hi=", + "osc=", "real_obd=", "cache_obd=", "fstype=", "timeout=", "recovery_upcall="] opts = [] args = [] @@ -692,6 +727,8 @@ def parse_cmdline(argv): options['mds'] = a if o == "--obd": options['obd'] = a + if o == "--ost": + options['ost'] = a # node options if o == "--timeout": @@ -728,10 +765,14 @@ def parse_cmdline(argv): options['osc'] = a if o == "--obdtype": options['obdtype'] = a + if o == "--osdtype": + options['osdtype'] = a if o == "--fstype": options['fstype'] = a if o == "--obduuid": options['obduuid'] = a + if o == "--osduuid": + options['osduuid'] = a # lov options if o == "--stripe_sz": @@ -763,6 +804,7 @@ def parse_cmdline(argv): if o == "--format": options['format'] = 1 if o == "--reformat": + warning("the lmc --reformat option is not supported. 
Use lconf --reformat") options['reformat'] = 1 if o == "--batch": options['batch'] = a @@ -816,8 +858,6 @@ def add(devtype, gen, lustre, options): add_node(gen, lustre, options) elif devtype == 'echo_client': add_echo_client(gen, lustre, options) - elif devtype == 'oscref': - add_oscref(gen, lustre, options) elif devtype == 'cobd': add_cobd(gen, lustre, options) else: diff --git a/lustre/utils/lstripe.c b/lustre/utils/lstripe.c index 1aa9d91..39e2bdf 100644 --- a/lustre/utils/lstripe.c +++ b/lustre/utils/lstripe.c @@ -15,13 +15,17 @@ /****************** Functions ******************/ -void usage(char *pgm) +void usage(char *prog) { - fprintf(stderr, "usage: %s \n", pgm); - - fprintf(stderr, "\tstripe size: number of bytes in each stripe\n"); - fprintf(stderr, "\tstripe start: OST index which holds first stripe\n"); - fprintf(stderr, "\tstripe count: number of OSTs to stripe over\n"); + fprintf(stderr, "usage: %s " + "\n", prog); + + fprintf(stderr, + "\tstripe size: number of bytes in each stripe (0 default)\n"); + fprintf(stderr, + "\tstripe start: OST index of first stripe (-1 default)\n"); + fprintf(stderr, + "\tstripe count: number of OSTs to stripe over (0 default)\n"); } int create_file(char *name, long stripe_size, int stripe_offset, @@ -60,21 +64,45 @@ int main(int argc, char *argv[]) long st_size; int st_offset, st_count; + char *end; /* Check to make sure we have enough parameters */ if (argc != 5) { usage(argv[0]); - return(-1); + return 1; } /* Get the stripe size */ - st_size = atol(argv[2]); + st_size = strtoul(argv[2], &end, 0); + if (*end != '\0') { + fprintf(stderr, "bad stripe size '%s'\n", argv[2]); + usage(argv[0]); + return 2; + } + + /* + if (st_size & 4095) { + fprintf(stderr, "stripe size must be multiple of page size\n"); + usage(argv[0]); + return 3; + } + */ /* Get the stripe offset*/ - st_offset = atoi(argv[3]); + st_offset = strtoul(argv[3], &end, 0); + if (*end != '\0') { + fprintf(stderr, "bad stripe offset '%s'\n", argv[3]); + 
usage(argv[0]); + return 4; + } /* Get the stripe count */ - st_count = atoi(argv[4]); + st_count = strtoul(argv[4], &end, 0); + if (*end != '\0') { + fprintf(stderr, "bad stripe count '%s'\n", argv[4]); + usage(argv[0]); + return 5; + } /* Create the file, as specified. Return and display any errors. */ result = create_file(argv[1], st_size, st_offset, st_count); diff --git a/lustre/utils/obd.c b/lustre/utils/obd.c index 8c329ff..8800b57 100644 --- a/lustre/utils/obd.c +++ b/lustre/utils/obd.c @@ -35,7 +35,6 @@ #include #include #include -#define printk printf #include #include @@ -81,8 +80,11 @@ char *buf = rawbuf; int max = sizeof(rawbuf); static int thread; -static struct lov_stripe_md saved_lsm; -static char lsm_valid = 0; + +union lsm_buffer { + char space [4096]; + struct lov_stripe_md lsm; +} lsm_buffer; static int getfd(char *func); static char *cmdname(char *func); @@ -190,6 +192,118 @@ static int parse_devname(char *func, char *name) return ret; } +static char * +lsm_string (struct lov_stripe_md *lsm) +{ + static char buffer[4096]; + char *p = buffer; + int space = sizeof (buffer); + int i; + int nob; + + *p = 0; + space--; + + nob = snprintf(p, space, LPX64, lsm->lsm_object_id); + p += nob; + space -= nob; + + if (lsm->lsm_stripe_count != 0) { + nob = snprintf (p, space, "=%u#%u@%d", + lsm->lsm_stripe_size, + lsm->lsm_stripe_count, + lsm->lsm_stripe_offset); + p += nob; + space -= nob; + + for (i = 0; i < lsm->lsm_stripe_count; i++) { + nob = snprintf (p, space, ":"LPX64, + lsm->lsm_oinfo[i].loi_id); + p += nob; + space -= nob; + } + } + + if (space == 0) { /* probable overflow */ + fprintf (stderr, "lsm_string() overflowed buffer\n"); + abort (); + } + + return (buffer); +} + +static void +reset_lsmb (union lsm_buffer *lsmb) +{ + memset (lsmb->space, 0, sizeof (lsmb->space)); + lsmb->lsm.lsm_magic = LOV_MAGIC; + +} + +static int +parse_lsm (union lsm_buffer *lsmb, char *string) +{ + struct lov_stripe_md *lsm = &lsmb->lsm; + char *end; + int i; + + /* + 
* object_id[=size#count[@offset][:id]*]
+         */
+
+        reset_lsmb (lsmb);
+
+        lsm->lsm_object_id = strtoull (string, &end, 0);
+        if (end == string)
+                return (-1);
+        string = end;
+
+        if (*string == 0)
+                return (0);
+
+        if (*string != '=')
+                return (-1);
+        string++;
+
+        lsm->lsm_stripe_size = strtoul (string, &end, 0);
+        if (end == string)
+                return (-1);
+        string = end;
+
+        if (*string != '#')
+                return (-1);
+        string++;
+
+        lsm->lsm_stripe_count = strtoul (string, &end, 0);
+        if (end == string)
+                return (-1);
+        string = end;
+
+        if (*string == '@') {
+                string++;
+                lsm->lsm_stripe_offset = strtol (string, &end, 0);
+                if (end == string)
+                        return (-1);
+                string = end;
+        }
+
+        if (*string == 0)               /* don't have to specify obj ids */
+                return (0);
+
+        /* the ids are stored inside lsmb->space, after the fixed part of
+         * the lsm: refuse a stripe count whose ids would not fit there */
+        if (lsm->lsm_stripe_count >
+            (sizeof (lsmb->space) -
+             ((char *)&lsm->lsm_oinfo[0] - lsmb->space)) /
+            sizeof (lsm->lsm_oinfo[0]))
+                return (-1);
+
+        for (i = 0; i < lsm->lsm_stripe_count; i++) {
+                if (*string != ':')
+                        return (-1);
+                string++;
+                lsm->lsm_oinfo[i].loi_id = strtoull (string, &end, 0);
+                if (end == string)      /* ':' with no id after it */
+                        return (-1);
+                string = end;
+        }
+
+        if (*string != 0)
+                return (-1);
+
+        return (0);
+}
+
 static char *cmdname(char *func)
 {
         static char buf[512];
@@ -446,7 +560,7 @@ int jt_obd_connect(int argc, char **argv)
 
         do_disconnect(argv[0], 1);
 
-#warning TODO: implement timeout per lctl usage for probe
+        /* XXX TODO: implement timeout per lctl usage for probe */
         if (argc != 1)
                 return CMD_HELP;
 
@@ -591,6 +705,24 @@ int jt_opt_threads(int argc, char **argv)
 int jt_obd_detach(int argc, char **argv)
 {
         struct obd_ioctl_data data;
+        int rc;
+
+        IOCINIT(data);
+
+        if (argc != 1)
+                return CMD_HELP;
+
+        /* pack 'data' into 'buf' first: the ioctl below is handed 'buf',
+         * which would otherwise still hold a previous command's contents */
+        IOC_PACK(argv[0], data);
+        rc = ioctl(fd, OBD_IOC_DETACH, buf);
+        if (rc < 0)
+                fprintf(stderr, "error: %s: %s\n", cmdname(argv[0]),
+                        strerror(rc = errno));
+
+        return rc;
+}
+
+int jt_obd_cleanup(int argc, char **argv)
+{
+        struct obd_ioctl_data data;
         char force = 'F';
         int rc;
 
@@ -600,12 +732,14 @@ int jt_obd_detach(int argc, char **argv)
                 return CMD_HELP;
 
         if (argc == 2) {
+                if (strcmp(argv[1], "force"))
+                        return CMD_HELP;
                 data.ioc_inllen1 = 1;
                 data.ioc_inlbuf1 = &force;
         }
 
         IOC_PACK(argv[0], data);
-        rc = ioctl(fd, OBD_IOC_DETACH, buf);
+
rc = ioctl(fd, OBD_IOC_CLEANUP, buf); if (rc < 0) fprintf(stderr, "error: %s: %s\n", cmdname(argv[0]), strerror(rc = errno)); @@ -613,7 +747,7 @@ int jt_obd_detach(int argc, char **argv) return rc; } -int jt_obd_cleanup(int argc, char **argv) +int jt_obd_no_transno(int argc, char **argv) { struct obd_ioctl_data data; int rc; @@ -623,7 +757,25 @@ int jt_obd_cleanup(int argc, char **argv) if (argc != 1) return CMD_HELP; - rc = ioctl(fd, OBD_IOC_CLEANUP, &data); + rc = ioctl(fd, OBD_IOC_NO_TRANSNO, &data); + if (rc < 0) + fprintf(stderr, "error: %s: %s\n", cmdname(argv[0]), + strerror(rc = errno)); + + return rc; +} + +int jt_obd_set_readonly(int argc, char **argv) +{ + struct obd_ioctl_data data; + int rc; + + IOCINIT(data); + + if (argc != 1) + return CMD_HELP; + + rc = ioctl(fd, OBD_IOC_SET_READONLY, &data); if (rc < 0) fprintf(stderr, "error: %s: %s\n", cmdname(argv[0]), strerror(rc = errno)); @@ -808,12 +960,146 @@ int jt_obd_setup(int argc, char **argv) return rc; } -/* The ioctl API has been extended to provide the LOV stripe metadata to the - * caller when applicable. This utility, however, only saves the LSM for the - * latest CREATE. It only saves the LSM when the ioctl indicates that it - * is valid by overloading 'ioc_conn2' as a boolean. 
*/ +/* Get echo client's stripe meta-data for the given object + */ +int jt_obd_get_stripe (int argc, char **argv) +{ + struct obd_ioctl_data data; + __u64 id; + int rc; + char *end; + + if (argc != 2) + return (CMD_HELP); + + id = strtoull (argv[1], &end, 0); + if (*end) { + fprintf (stderr, "Error: %s: invalid object id '%s'\n", + cmdname (argv[0]), argv[1]); + return (CMD_HELP); + } + + memset (&lsm_buffer, 0, sizeof (lsm_buffer)); + + IOCINIT (data); + data.ioc_obdo1.o_id = id; + data.ioc_obdo1.o_mode = S_IFREG | 0644; + data.ioc_obdo1.o_valid = OBD_MD_FLID | OBD_MD_FLMODE; + data.ioc_pbuf1 = (char *)&lsm_buffer; + data.ioc_plen1 = sizeof (lsm_buffer); + + IOC_PACK(argv[0], data); + rc = ioctl(fd, ECHO_IOC_GET_STRIPE, buf); + IOC_UNPACK(argv[0], data); + + if (rc != 0) { + fprintf (stderr, "Error: %s: rc %d(%s)\n", + cmdname (argv[0]), rc, strerror (errno)); + return (rc); + } + + printf ("%s\n", lsm_string (&lsm_buffer.lsm)); + + return (rc); +} + +/* Set stripe meta-data for 1 or more objects. Object must be new to + * this echo client instance. 
+ */
+int jt_obd_set_stripe (int argc, char **argv)
+{
+        struct obd_ioctl_data data;
+        char *end;
+        int count = 1;
+        int i;
+        int rc;
+
+        if (argc < 2 || argc > 3)
+                return CMD_HELP;
+
+        rc = parse_lsm (&lsm_buffer, argv[1]);
+        if (rc != 0) {
+                fprintf (stderr, "error: %s: invalid object '%s'\n",
+                         cmdname (argv[0]), argv[1]);
+                return CMD_HELP;
+        }
+
+        if (argc > 2) {
+                count = strtol (argv[2], &end, 0);
+                /* reject an empty count as well as trailing junk, and
+                 * report the argument that was actually bad */
+                if (end == argv[2] || *end != 0) {
+                        fprintf (stderr, "error: %s: invalid count '%s'\n",
+                                 cmdname (argv[0]), argv[2]);
+                        return CMD_HELP;
+                }
+        }
+
+        /* same stripe description for 'count' consecutive object ids */
+        for (i = 0; i < count; i++)
+        {
+                IOCINIT (data);
+                data.ioc_obdo1.o_id = lsm_buffer.lsm.lsm_object_id + i;
+                data.ioc_obdo1.o_mode = S_IFREG | 0644;
+                data.ioc_obdo1.o_valid = OBD_MD_FLID | OBD_MD_FLMODE;
+                data.ioc_pbuf1 = (char *)&lsm_buffer;
+                data.ioc_plen1 = sizeof (lsm_buffer);
+
+                IOC_PACK (argv[0], data);
+                rc = ioctl (fd, ECHO_IOC_SET_STRIPE, buf);
+                IOC_UNPACK (argv[0], data);
+
+                if (rc != 0) {
+                        fprintf (stderr, "Error: %s: rc %d(%s)\n",
+                                 cmdname (argv[0]), rc, strerror (errno));
+                        return (rc);
+                }
+        }
+
+        return (0);
+}
+
+/* Clear stripe meta-data info for an object on this echo-client instance.
+ * argv[1] is the object id.  Returns CMD_HELP on a usage error, otherwise
+ * the return code of the ioctl.
+ */
+int jt_obd_unset_stripe (int argc, char **argv)
+{
+        struct obd_ioctl_data data;
+        char *end;
+        obd_id id;
+        int rc;
+
+        if (argc != 2)
+                return CMD_HELP;
+
+        id = strtoull (argv[1], &end, 0);
+        /* the id is good only if the whole, non-empty argument parsed */
+        if (end == argv[1] || *end != 0) {
+                fprintf (stderr, "error: %s: invalid object id '%s'\n",
+                         cmdname (argv[0]), argv[1]);
+                return CMD_HELP;
+        }
+
+        IOCINIT (data);
+        /* act on the object named on the command line, not on whatever id
+         * an earlier command left behind in lsm_buffer */
+        data.ioc_obdo1.o_id = id;
+        data.ioc_obdo1.o_mode = S_IFREG | 0644;
+        data.ioc_obdo1.o_valid = OBD_MD_FLID | OBD_MD_FLMODE;
+
+        /* NOTE(review): sent as ECHO_IOC_SET_STRIPE with no stripe buffer
+         * attached -- presumably that is how the echo client is told to
+         * drop the object's stripe info; confirm against the ioctl handler */
+        IOC_PACK (argv[0], data);
+        rc = ioctl (fd, ECHO_IOC_SET_STRIPE, buf);
+        IOC_UNPACK (argv[0], data);
+
+        if (rc != 0)
+                fprintf (stderr, "Error: %s: rc %d(%s)\n",
+                         cmdname (argv[0]), rc, strerror (errno));
+
+        return (rc);
+}
+
+/* Create one or more objects, arg[1] may describe stripe meta-data. If
+ * not, defaults assumed.
This echo-client instances stashes the stripe + * object ids. Use get_stripe on this node to print full lsm and + * set_stripe on another node to cut/paste between nodes. + */ int jt_obd_create(int argc, char **argv) { + static __u64 base_id = 1; + struct obd_ioctl_data data; struct timeval next_time; __u64 count = 1, next_count; @@ -821,7 +1107,7 @@ int jt_obd_create(int argc, char **argv) char *end; IOCINIT(data); - if (argc < 2 || argc > 4) + if (argc < 2 || argc > 5) return CMD_HELP; count = strtoull(argv[1], &end, 0); @@ -848,26 +1134,36 @@ int jt_obd_create(int argc, char **argv) return CMD_HELP; } + if (argc < 5) + reset_lsmb (&lsm_buffer); /* will set default */ + else { + rc = parse_lsm (&lsm_buffer, argv[4]); + if (rc != 0) { + fprintf(stderr, "error: %s: invalid lsm '%s'\n", + cmdname(argv[0]), argv[4]); + return CMD_HELP; + } + base_id = lsm_buffer.lsm.lsm_object_id; + } + printf("%s: "LPD64" objects\n", cmdname(argv[0]), count); gettimeofday(&next_time, NULL); next_time.tv_sec -= verbose; for (i = 1, next_count = verbose; i <= count; i++) { data.ioc_obdo1.o_mode = mode; - data.ioc_obdo1.o_id = i; + data.ioc_obdo1.o_id = base_id++; data.ioc_obdo1.o_uid = 0; data.ioc_obdo1.o_gid = 0; data.ioc_obdo1.o_valid = OBD_MD_FLTYPE | OBD_MD_FLMODE | - OBD_MD_FLID | OBD_MD_FLUID | OBD_MD_FLGID;; + OBD_MD_FLID | OBD_MD_FLUID | OBD_MD_FLGID; - data.ioc_inllen1 = sizeof(saved_lsm); - data.ioc_inlbuf1 = (char *)&saved_lsm; + data.ioc_plen1 = sizeof (lsm_buffer); + data.ioc_pbuf1 = (char *)&lsm_buffer; IOC_PACK(argv[0], data); rc = ioctl(fd, OBD_IOC_CREATE, buf); IOC_UNPACK(argv[0], data); - fprintf(stderr, "lsm->lsm_o_id: "LPX64"\n", - saved_lsm.lsm_object_id); SHMEM_BUMP(); if (rc < 0) { fprintf(stderr, "error: %s: #%d - %s\n", @@ -881,8 +1177,6 @@ int jt_obd_create(int argc, char **argv) break; } - lsm_valid = data.ioc_conn2; - if (be_verbose(verbose, &next_time, i, &next_count, count)) printf("%s: #%d is object id "LPX64"\n", cmdname(argv[0]), i, 
data.ioc_obdo1.o_id); @@ -914,11 +1208,6 @@ int jt_obd_setattr(int argc, char **argv) } data.ioc_obdo1.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMODE; - if (lsm_valid == 1) { - data.ioc_inllen1 = sizeof(saved_lsm); - data.ioc_inlbuf1 = (char *)&saved_lsm; - } - IOC_PACK(argv[0], data); rc = ioctl(fd, OBD_IOC_SETATTR, buf); if (rc < 0) @@ -973,9 +1262,6 @@ int jt_obd_destroy(int argc, char **argv) data.ioc_obdo1.o_mode = S_IFREG | 0644; data.ioc_obdo1.o_valid = OBD_MD_FLID | OBD_MD_FLMODE; - data.ioc_inllen1 = sizeof(saved_lsm); - data.ioc_inlbuf1 = (char *)&saved_lsm; - IOC_PACK(argv[0], data); rc = ioctl(fd, OBD_IOC_DESTROY, buf); IOC_UNPACK(argv[0], data); @@ -985,7 +1271,6 @@ int jt_obd_destroy(int argc, char **argv) cmdname(argv[0]), id, strerror(rc = errno)); break; } - lsm_valid = 0; if (be_verbose(verbose, &next_time, i, &next_count, count)) printf("%s: #%d is object id "LPX64"\n", @@ -1016,11 +1301,6 @@ int jt_obd_getattr(int argc, char **argv) data.ioc_obdo1.o_valid = 0xffffffff; printf("%s: object id "LPX64"\n", cmdname(argv[0]),data.ioc_obdo1.o_id); - if (lsm_valid == 1) { - data.ioc_inllen1 = sizeof(saved_lsm); - data.ioc_inlbuf1 = (char *)&saved_lsm; - } - IOC_PACK(argv[0], data); rc = ioctl(fd, OBD_IOC_GETATTR, buf); IOC_UNPACK(argv[0], data); @@ -1195,11 +1475,6 @@ int jt_obd_test_brw(int argc, char **argv) data.ioc_count = len; data.ioc_offset = thr_offset * len * count; - if (lsm_valid == 1) { - data.ioc_inllen1 = sizeof(saved_lsm); - data.ioc_inlbuf1 = (char *)&saved_lsm; - } - gettimeofday(&start, NULL); next_time.tv_sec = start.tv_sec - verbose; next_time.tv_usec = start.tv_usec; @@ -1262,7 +1537,7 @@ int jt_obd_lov_setconfig(int argc, char **argv) { struct obd_ioctl_data data; struct lov_desc desc; - obd_uuid_t *uuidarray, *ptr; + struct obd_uuid *uuidarray, *ptr; int rc, i; char *end; @@ -1273,13 +1548,13 @@ int jt_obd_lov_setconfig(int argc, char **argv) if (strlen(argv[1]) > sizeof(desc.ld_uuid) - 1) { fprintf(stderr, - "error: %s: 
LOV uuid '%s' longer than "LPSZ" characters\n", + "error: %s: LOV uuid '%s' longer than "LPSZ" chars\n", cmdname(argv[0]), argv[1], sizeof(desc.ld_uuid) - 1); return -EINVAL; } memset(&desc, 0, sizeof(desc)); - strncpy(desc.ld_uuid, argv[1], sizeof(desc.ld_uuid) - 1); + obd_str2uuid(&desc.ld_uuid, argv[1]); desc.ld_tgt_count = argc - 6; desc.ld_default_stripe_count = strtoul(argv[2], &end, 0); if (*end) { @@ -1374,7 +1649,7 @@ int jt_obd_lov_getconfig(int argc, char **argv) { struct obd_ioctl_data data; struct lov_desc desc; - obd_uuid_t *uuidarray; + struct obd_uuid *uuidarray; char *path; int rc, tmpfd; @@ -1396,7 +1671,7 @@ int jt_obd_lov_getconfig(int argc, char **argv) } memset(&desc, 0, sizeof(desc)); - strncpy(desc.ld_uuid, argv[1], sizeof(desc.ld_uuid) - 1); + obd_str2uuid(&desc.ld_uuid, argv[1]); desc.ld_tgt_count = DEF_UUID_ARRAY_LEN; repeat: uuidarray = calloc(desc.ld_tgt_count, sizeof(*uuidarray)); @@ -1425,7 +1700,7 @@ repeat: fprintf(stderr, "error: %s: ioctl error: %s\n", cmdname(argv[0]), strerror(rc = errno)); } else { - obd_uuid_t *ptr; + struct obd_uuid *ptr; int i; if (obd_ioctl_unpack(&data, buf, max)) { diff --git a/lustre/utils/obdbarrier.c b/lustre/utils/obdbarrier.c new file mode 100644 index 0000000..911ab5f --- /dev/null +++ b/lustre/utils/obdbarrier.c @@ -0,0 +1,223 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * Author: Eric Barton + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include +#include +#include +#include + +#include "obdiolib.h" + +int +parse_kmg (uint64_t *valp, char *str) +{ + uint64_t val; + char mod[32]; + + switch (sscanf (str, LPU64"%1[gGmMkK]", &val, mod)) + { + default: + return (-1); + + case 1: + *valp = val; + return (0); + + case 2: + switch (*mod) + { + case 'g': + case 'G': + *valp = val << 30; + return (0); + + case 'm': + case 'M': + *valp = val << 20; + return (0); + + case 'k': + case 'K': + *valp = val << 10; + return (0); + + default: + *valp = val; + return (0); + } + } +} + +void +usage (char *cmdname, int help) +{ + char *name = strrchr (cmdname, '/'); + + if (name == NULL) + name = cmdname; + + fprintf (help ? stdout : stderr, + "usage: %s -d device -s size -o offset [-i id][-n reps][-l] oid\n", + name); +} + +int +exponential_modulus (int i, int base) +{ + int top = base; + int mod = 1; + + for (;;) { + if (i < top) + return (i%mod == 0); + + mod = top; + top *= base; + } +} + +int +main (int argc, char **argv) +{ + uint64_t bid = (((uint64_t)gethostid()) << 32) | getpid (); + int set_bid = 0; + uint64_t oid; + int setup = 0; + int device = -1; + int npeers = 0; + int reps = 1; + char hostname[128]; + struct obdio_conn *conn; + struct obdio_barrier *b; + char *end; + uint64_t val; + int rc; + int c; + + setvbuf (stdout, NULL, _IOLBF, 0); + memset (hostname, 0, sizeof (hostname)); + gethostname (hostname, sizeof (hostname)); + hostname[sizeof(hostname) - 1] = 0; + + while ((c = getopt (argc, argv, "hsi:d:n:p:")) != -1) + switch (c) { + case 'h': + usage (argv[0], 1); + return (0); + + case 'i': + bid = strtoll (optarg, &end, 0); + if (end == optarg || *end != 0) { + fprintf (stderr, "Can't parse id %s\n", + optarg); + return (1); + } + set_bid = 1; + break; + + case 's': + setup = 1; + break; + + case 
'd': + device = strtol (optarg, &end, 0); + if (end == optarg || *end != 0 || device < 0) { + fprintf (stderr, "Can't parse device %s\n", + optarg); + return (1); + } + break; + + case 'n': + if (parse_kmg (&val, optarg) != 0) { + fprintf (stderr, "Can't parse reps %s\n", + optarg); + return (1); + } + reps = (int)val; + break; + + case 'p': + npeers = strtol (optarg, &end, 0); + if (end == optarg || *end != 0 || npeers <= 0) { + fprintf (stderr, "Can't parse npeers %s\n", + optarg); + return (1); + } + break; + + default: + usage (argv[0], 0); + return (1); + } + + if ((!setup && !set_bid) || + npeers <= 0 || + device < 0 || + optind == argc) { + fprintf (stderr, "%s not specified\n", + (!setup && !set_bid) ? "id" : + npeers <= 0 ? "npeers" : + device < 0 ? "device" : "object id"); + return (1); + } + + oid = strtoull (argv[optind], &end, 0); + if (end == argv[optind] || *end != 0) { + fprintf (stderr, "Can't parse object id %s\n", + argv[optind]); + return (1); + } + + conn = obdio_connect (device); + if (conn == NULL) + return (1); + + b = obdio_new_barrier (oid, bid, npeers); + if (b == NULL) + return (1); + + rc = 0; + if (setup) { + rc = obdio_setup_barrier (conn, b); + if (rc == 0) + printf ("Setup barrier: -d %d -i "LPX64" -p %d -n1 "LPX64"\n", + device, bid, npeers, oid); + } else { + for (c = 0; c < reps; c++) { + rc = obdio_barrier (conn, b); + if (rc != 0) + break; + if (exponential_modulus (c, 10)) + printf ("%s: Barrier %d\n", hostname, c); + } + } + + free (b); + + obdio_disconnect (conn); + + return (rc == 0 ? 
0 : 1); +} + + diff --git a/lustre/utils/obdctl.h b/lustre/utils/obdctl.h index acc5c5f..b8c210c 100644 --- a/lustre/utils/obdctl.h +++ b/lustre/utils/obdctl.h @@ -38,6 +38,8 @@ int jt_obd_connect(int argc, char **argv); int jt_obd_disconnect(int argc, char **argv); int jt_obd_detach(int argc, char **argv); int jt_obd_cleanup(int argc, char **argv); +int jt_obd_no_transno(int argc, char **argv); +int jt_obd_set_readonly(int argc, char **argv); int jt_obd_newdev(int argc, char **argv); int jt_obd_list(int argc, char **argv); int jt_obd_attach(int argc, char **argv); @@ -49,6 +51,9 @@ int jt_obd_destroy(int argc, char **argv); int jt_obd_getattr(int argc, char **argv); int jt_obd_test_getattr(int argc, char **argv); int jt_obd_test_brw(int argc, char **argv); +int jt_obd_get_stripe(int argc, char **argv); +int jt_obd_set_stripe(int argc, char **argv); +int jt_obd_unset_stripe(int argc, char **argv); int jt_obd_lov_setconfig(int argc, char **argv); int jt_obd_lov_getconfig(int argc, char **argv); int jt_obd_test_ldlm(int argc, char **argv); diff --git a/lustre/utils/obdio.c b/lustre/utils/obdio.c new file mode 100644 index 0000000..ccee788 --- /dev/null +++ b/lustre/utils/obdio.c @@ -0,0 +1,304 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * Author: Eric Barton + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include +#include +#include +#include + +#include "obdiolib.h" + +int +obdio_test_fixed_extent (struct obdio_conn *conn, + uint32_t myhid, uint32_t mypid, + int reps, int locked, uint64_t oid, + uint64_t offset, uint32_t size) +{ + struct lustre_handle fh; + struct lustre_handle lh; + void *space; + void *buffer; + uint32_t *ibuf; + int i; + int j; + int rc; + int rc2; + + rc = obdio_open (conn, oid, &fh); + if (rc != 0) { + fprintf (stderr, "Failed to open object "LPX64": %s\n", + oid, strerror (errno)); + return (rc); + } + + buffer = obdio_alloc_aligned_buffer (&space, size); + if (buffer == NULL) { + fprintf (stderr, "Can't allocate buffer size %d\n", size); + rc = -1; + goto out_0; + } + + for (i = 0; i < reps; i++) { + ibuf = (uint32_t *) buffer; + for (j = 0; j < size / (4 * sizeof (*ibuf)); j++) { + ibuf[0] = myhid; + ibuf[1] = mypid; + ibuf[2] = i; + ibuf[3] = j; + ibuf += 4; + } + + if (locked) { + rc = obdio_enqueue (conn, oid, LCK_PW, offset, size, &lh); + if (rc != 0) { + fprintf (stderr, "Error on enqueue "LPX64" @ "LPU64" for %u: %s\n", + oid, offset, size, strerror (errno)); + goto out_1; + } + } + + rc = obdio_pwrite (conn, oid, buffer, size, offset); + if (rc != 0) { + fprintf (stderr, "Error writing "LPX64" @ "LPU64" for %u: %s\n", + oid, offset, size, strerror (errno)); + if (locked) + obdio_cancel (conn, &lh); + rc = -1; + goto out_1; + } + + memset (buffer, 0xbb, size); + + rc = obdio_pread (conn, oid, buffer, size, offset); + if (rc != 0) { + fprintf (stderr, "Error reading "LPX64" @ "LPU64" for %u: %s\n", + oid, offset, size, strerror (errno)); + if (locked) + obdio_cancel (conn, &lh); + rc = -1; + goto out_1; + } + + if (locked) { + rc = obdio_cancel (conn, &lh); + if (rc != 0) { + fprintf (stderr, "Error on cancel "LPX64" @ "LPU64" for 
%u: %s\n", + oid, offset, size, strerror (errno)); + rc = -1; + goto out_1; + } + } + + ibuf = (uint32_t *) buffer; + for (j = 0; j < size / (4 * sizeof (*ibuf)); j++) { + if (ibuf[0] != myhid || + ibuf[1] != mypid || + ibuf[2] != i || + ibuf[3] != j) { + fprintf (stderr, "Error checking "LPX64" @ "LPU64" for %u, chunk %d\n", + oid, offset, size, j); + fprintf (stderr, "Expected [%x,%x,%x,%x], got [%x,%x,%x,%x]\n", + myhid, mypid, i, j, ibuf[0], ibuf[1], ibuf[2], ibuf[3]); + rc = -1; + goto out_1; + } + ibuf += 4; + } + } + out_1: + free (space); + out_0: + rc2 = obdio_close (conn, oid, &fh); + if (rc2 != 0) + fprintf (stderr, "Error closing object "LPX64": %s\n", + oid, strerror (errno)); + return (rc); +} + +int +parse_kmg (uint64_t *valp, char *str) +{ + uint64_t val; + char mod[32]; + + switch (sscanf (str, LPU64"%1[gGmMkK]", &val, mod)) + { + default: + return (-1); + + case 1: + *valp = val; + return (0); + + case 2: + switch (*mod) + { + case 'g': + case 'G': + *valp = val << 30; + return (0); + + case 'm': + case 'M': + *valp = val << 20; + return (0); + + case 'k': + case 'K': + *valp = val << 10; + return (0); + + default: + *valp = val; + return (0); + } + } +} + +void +usage (char *cmdname, int help) +{ + char *name = strrchr (cmdname, '/'); + + if (name == NULL) + name = cmdname; + + fprintf (help ? 
stdout : stderr, + "usage: %s -d device -s size -o offset [-i id][-n reps][-l] oid\n", + name); +} + +int +main (int argc, char **argv) +{ + uint32_t mypid = getpid (); + uint32_t myhid = gethostid (); + uint64_t oid; + uint64_t base_offset = 0; + uint32_t size = 0; + int set_size = 0; + int device = -1; + int reps = 1; + int locked = 0; + char *end; + struct obdio_conn *conn; + uint64_t val; + int v1; + int v2; + int rc; + int c; + + while ((c = getopt (argc, argv, "hi:s:o:d:n:l")) != -1) + switch (c) { + case 'h': + usage (argv[0], 1); + return (0); + + case 'i': + switch (sscanf (optarg, "%i.%i", &v1, &v2)) { + case 1: + mypid = v1; + break; + case 2: + myhid = v1; + mypid = v2; + break; + default: + fprintf (stderr, "Can't parse id %s\n", + optarg); + return (1); + } + break; + + case 's': + if (parse_kmg (&val, optarg) != 0) { + fprintf (stderr, "Can't parse size %s\n", + optarg); + return (1); + } + size = (uint32_t)val; + set_size++; + break; + + case 'o': + if (parse_kmg (&val, optarg) != 0) { + fprintf (stderr, "Can't parse offset %s\n", + optarg); + return (1); + } + base_offset = val; + break; + + case 'd': + device = strtol (optarg, &end, 0); + if (end == optarg || *end != 0 || device < 0) { + fprintf (stderr, "Can't parse device %s\n", + optarg); + return (1); + } + break; + case 'n': + if (parse_kmg (&val, optarg) != 0) { + fprintf (stderr, "Can't parse reps %s\n", + optarg); + return (1); + } + reps = (int)val; + break; + case 'l': + locked = 1; + break; + default: + usage (argv[0], 0); + return (1); + } + + if (!set_size || + device < 0 || + optind == argc) { + fprintf (stderr, "No %s specified\n", + !set_size ? "size" : + device < 0 ? 
"device" : "object id"); + return (1); + } + + oid = strtoull (argv[optind], &end, 0); + if (end == argv[optind] || *end != 0) { + fprintf (stderr, "Can't parse object id %s\n", + argv[optind]); + return (1); + } + + conn = obdio_connect (device); + if (conn == NULL) + return (1); + + rc = obdio_test_fixed_extent (conn, myhid, mypid, reps, locked, + oid, base_offset, size); + + obdio_disconnect (conn); + + return (rc == 0 ? 0 : 1); +} + + diff --git a/lustre/utils/obdiolib.c b/lustre/utils/obdiolib.c new file mode 100644 index 0000000..ef95055 --- /dev/null +++ b/lustre/utils/obdiolib.c @@ -0,0 +1,465 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2003 Cluster File Systems, Inc. + * Author: Eric Barton + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "obdiolib.h" + +void +obdio_iocinit (struct obdio_conn *conn) +{ + memset (&conn->oc_data, 0, sizeof (conn->oc_data)); + conn->oc_data.ioc_version = OBD_IOCTL_VERSION; + conn->oc_data.ioc_addr = conn->oc_conn_addr; + conn->oc_data.ioc_cookie = conn->oc_conn_cookie; + conn->oc_data.ioc_len = sizeof (conn->oc_data); +} + +int +obdio_ioctl (struct obdio_conn *conn, int cmd) +{ + char *buf = conn->oc_buffer; + int rc; + int rc2; + + rc = obd_ioctl_pack (&conn->oc_data, &buf, sizeof (conn->oc_buffer)); + if (rc != 0) { + fprintf (stderr, "obdio_ioctl: obd_ioctl_pack: %d (%s)\n", + rc, strerror (errno)); + abort (); + } + + rc = ioctl (conn->oc_fd, cmd, buf); + if (rc != 0) + return (rc); + + rc2 = obd_ioctl_unpack (&conn->oc_data, buf, sizeof (conn->oc_buffer)); + if (rc2 != 0) { + fprintf (stderr, "obdio_ioctl: obd_ioctl_unpack: %d (%s)\n", + rc2, strerror (errno)); + abort (); + } + + return (rc); +} + +struct obdio_conn * +obdio_connect (int device) +{ + struct obdio_conn *conn; + int rc; + + conn = malloc (sizeof (*conn)); + if (conn == NULL) { + fprintf (stderr, "obdio_connect: no memory\n"); + return (NULL); + } + memset (conn, 0, sizeof (*conn)); + + conn->oc_fd = open ("/dev/obd", O_RDWR); + if (conn->oc_fd < 0) { + fprintf (stderr, "obdio_connect: Can't open /dev/obd: %s\n", + strerror (errno)); + goto failed; + } + + obdio_iocinit (conn); + conn->oc_data.ioc_dev = device; + rc = obdio_ioctl (conn, OBD_IOC_DEVICE); + if (rc != 0) { + fprintf (stderr, "obdio_connect: Can't set device %d: %s\n", + device, strerror (errno)); + goto failed; + } + + obdio_iocinit (conn); + rc = obdio_ioctl (conn, OBD_IOC_CONNECT); + if (rc != 0) { + fprintf (stderr, "obdio_connect: Can't connect to device %d: %s\n", + device, strerror (errno)); + goto failed; + } + + conn->oc_conn_addr = conn->oc_data.ioc_addr; + conn->oc_conn_cookie = conn->oc_data.ioc_cookie; + return (conn); + + 
failed:
+        /* every jump here comes after open() was attempted, so oc_fd is
+         * either a live descriptor that must be released or negative */
+        if (conn->oc_fd >= 0)
+                close (conn->oc_fd);
+        free (conn);
+        return (NULL);
+}
+
+void
+obdio_disconnect (struct obdio_conn *conn)
+{
+        close (conn->oc_fd);
+        /* obdclass will automatically close on last ref */
+        free (conn);
+}
+
+int
+obdio_open (struct obdio_conn *conn, uint64_t oid, struct lustre_handle *fh)
+{
+        int rc;
+
+        obdio_iocinit (conn);
+
+        conn->oc_data.ioc_obdo1.o_id = oid;
+        conn->oc_data.ioc_obdo1.o_mode = S_IFREG;
+        conn->oc_data.ioc_obdo1.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMODE;
+
+        rc = obdio_ioctl (conn, OBD_IOC_OPEN);
+
+        if (rc == 0)
+                memcpy (fh, obdo_handle(&conn->oc_data.ioc_obdo1), sizeof (*fh));
+
+        return (rc);
+}
+
+int
+obdio_close (struct obdio_conn *conn, uint64_t oid, struct lustre_handle *fh)
+{
+        obdio_iocinit (conn);
+
+
+        conn->oc_data.ioc_obdo1.o_id = oid;
+        conn->oc_data.ioc_obdo1.o_mode = S_IFREG;
+        memcpy (obdo_handle (&conn->oc_data.ioc_obdo1), fh, sizeof (*fh));
+        conn->oc_data.ioc_obdo1.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
+                                          OBD_MD_FLMODE | OBD_MD_FLHANDLE;
+
+        return (obdio_ioctl (conn, OBD_IOC_CLOSE));
+}
+
+int
+obdio_pread (struct obdio_conn *conn, uint64_t oid,
+             char *buffer, uint32_t count, uint64_t offset)
+{
+        obdio_iocinit (conn);
+
+        conn->oc_data.ioc_obdo1.o_id = oid;
+        conn->oc_data.ioc_obdo1.o_mode = S_IFREG;
+        conn->oc_data.ioc_obdo1.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMODE;
+
+        conn->oc_data.ioc_pbuf2 = buffer;
+        conn->oc_data.ioc_plen2 = count;
+        conn->oc_data.ioc_count = count;
+        conn->oc_data.ioc_offset = offset;
+
+        return (obdio_ioctl (conn, OBD_IOC_BRW_READ));
+}
+
+int
+obdio_pwrite (struct obdio_conn *conn, uint64_t oid,
+              char *buffer, uint32_t count, uint64_t offset)
+{
+        obdio_iocinit (conn);
+
+        conn->oc_data.ioc_obdo1.o_id = oid;
+        conn->oc_data.ioc_obdo1.o_mode = S_IFREG;
+        conn->oc_data.ioc_obdo1.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMODE;
+
+        conn->oc_data.ioc_pbuf2 = buffer;
+        conn->oc_data.ioc_plen2 = count;
+        conn->oc_data.ioc_count = count;
+        conn->oc_data.ioc_offset = offset;
+
+        return (obdio_ioctl (conn, OBD_IOC_BRW_WRITE));
+}
+
+/*
+ * Enqueue a lock on object 'oid' with the ECHO_IOC_ENQUEUE ioctl.
+ *
+ * 'mode' travels in ioc_conn1 and the extent in ioc_offset/ioc_count
+ * (callers in this file pass LCK_PW or LCK_PR and one page at offset 0).
+ * When the ioctl returns 0 the lock handle it left in ioc_obdo1 is copied
+ * to *lh; otherwise *lh is not written.
+ *
+ * Returns the result of obdio_ioctl().
+ */
+int
+obdio_enqueue (struct obdio_conn *conn, uint64_t oid,
+               int mode, uint64_t offset, uint32_t count,
+               struct lustre_handle *lh)
+{
+        int rc;
+
+        obdio_iocinit (conn);
+
+        conn->oc_data.ioc_obdo1.o_id = oid;
+        conn->oc_data.ioc_obdo1.o_mode = S_IFREG;
+        conn->oc_data.ioc_obdo1.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMODE;
+
+        conn->oc_data.ioc_conn1 = mode;
+        conn->oc_data.ioc_count = count;
+        conn->oc_data.ioc_offset = offset;
+
+        rc = obdio_ioctl (conn, ECHO_IOC_ENQUEUE);
+
+        if (rc == 0)
+                memcpy (lh, obdo_handle (&conn->oc_data.ioc_obdo1), sizeof (*lh));
+
+        return (rc);
+}
+
+/*
+ * Cancel the lock identified by *lh with the ECHO_IOC_CANCEL ioctl.  The
+ * handle is copied into ioc_obdo1 and flagged with OBD_MD_FLHANDLE.
+ *
+ * Returns the result of obdio_ioctl().
+ */
+int
+obdio_cancel (struct obdio_conn *conn, struct lustre_handle *lh)
+{
+        obdio_iocinit (conn);
+
+        memcpy (obdo_handle (&conn->oc_data.ioc_obdo1), lh, sizeof (*lh));
+        conn->oc_data.ioc_obdo1.o_valid = OBD_MD_FLHANDLE;
+
+        return (obdio_ioctl (conn, ECHO_IOC_CANCEL));
+}
+
+/*
+ * Allocate 'size' bytes that start on a page boundary.
+ *
+ * The block returned by malloc() is stored in *spacep; that pointer, not
+ * the return value, is what the caller must pass to free().  The return
+ * value is the first page-aligned address inside the block.
+ *
+ * Returns NULL, with *spacep set to NULL, if 'size' is negative or the
+ * allocation fails.
+ */
+void *
+obdio_alloc_aligned_buffer (void **spacep, int size)
+{
+        /* unsigned long throughout: no int overflow in the size sum and no
+         * reliance on sign extension of the alignment mask */
+        unsigned long  pagesize = (unsigned long) getpagesize ();
+        void          *space;
+
+        *spacep = NULL;
+        if (size < 0)
+                return (NULL);
+
+        /* pagesize - 1 spare bytes guarantee that an aligned start fits */
+        space = malloc ((unsigned long) size + pagesize - 1);
+        if (space == NULL)
+                return (NULL);
+
+        *spacep = space;
+
+        /* round up; the mask assumes the page size is a power of two */
+        return ((void *)(((unsigned long) space + pagesize - 1) & ~(pagesize - 1)));
+}
+
+/*
+ * Allocate the caller's view of a barrier for 'npeers' participants, stored
+ * in object 'oid' and tagged with 'id'.  Ordinal and join count start at 0.
+ *
+ * Returns NULL (after a message on stderr) if 'npeers' is not positive or
+ * the allocation fails.  The result comes from malloc().
+ */
+struct obdio_barrier *
+obdio_new_barrier (uint64_t oid, uint64_t id, int npeers)
+{
+        struct obdio_barrier *b;
+
+        /* ob_npeers is unsigned: a negative count would become enormous, and
+         * obdio_barrier() rejects any page with ob_count >= ob_npeers, so a
+         * barrier of 0 peers could never be joined */
+        if (npeers <= 0) {
+                fprintf (stderr, "obdio_new_barrier "LPX64": Invalid peer count %d\n",
+                         oid, npeers);
+                return (NULL);
+        }
+
+        b = malloc (sizeof (*b));
+        if (b == NULL) {
+                fprintf (stderr, "obdio_new_barrier "LPX64": Can't allocate\n", oid);
+                return (NULL);
+        }
+
+        b->ob_id = id;
+        b->ob_oid = oid;
+        b->ob_npeers = npeers;
+        b->ob_ordinal = 0;
+        b->ob_count = 0;
+        return (b);
+}
+
+/*
+ * Write the initial barrier state *b to the first page of object b->ob_oid.
+ *
+ * The object is opened, a zeroed page holding a copy of *b is written at
+ * offset 0 under an LCK_PW lock on that page, and the object is closed
+ * again.  *b must be fresh (ordinal and count both 0); anything else is
+ * reported and aborts.
+ *
+ * Returns 0 on success, otherwise the first non-zero result seen (-1 if the
+ * page buffer could not be allocated).
+ */
+int
+obdio_setup_barrier (struct obdio_conn *conn, struct obdio_barrier *b)
+{
+        struct lustre_handle  fh;
+        struct lustre_handle  lh;
+        int                   rc;
+        int                   rc2;
+        int                   pagesize = getpagesize ();
+        void                 *space;
+        struct obdio_barrier *fileb;
+
+        if (b->ob_ordinal != 0 ||
+            b->ob_count != 0) {
+                fprintf (stderr, "obdio_setup_barrier: invalid parameter\n");
+                abort ();
+        }
+
+        rc = obdio_open (conn, b->ob_oid, &fh);
+        if (rc != 0) {
+                fprintf (stderr, "obdio_setup_barrier "LPX64": Failed to open object: %s\n",
+                         b->ob_oid, strerror (errno));
+                return (rc);
+        }
+
+        fileb = (struct obdio_barrier *) obdio_alloc_aligned_buffer (&space, pagesize);
+        if (fileb == NULL) {
+                fprintf (stderr, "obdio_setup_barrier "LPX64": Can't allocate page buffer\n",
+                         b->ob_oid);
+                rc = -1;
+                goto out_0;
+        }
+
+        /* the barrier record sits at the start of an otherwise zeroed page */
+        memset (fileb, 0, pagesize);
+        *fileb = *b;
+
+        rc = obdio_enqueue (conn, b->ob_oid, LCK_PW, 0, pagesize, &lh);
+        if (rc != 0) {
+                fprintf (stderr, "obdio_setup_barrier "LPX64": Error on enqueue: %s\n",
+                         b->ob_oid, strerror (errno));
+                goto out_1;
+        }
+
+        rc = obdio_pwrite (conn, b->ob_oid, (void *)fileb, pagesize, 0);
+        if (rc != 0)
+                fprintf (stderr, "obdio_setup_barrier "LPX64": Error on write: %s\n",
+                         b->ob_oid, strerror (errno));
+
+        /* always drop the lock; a cancel error is reported only if the
+         * write itself succeeded */
+        rc2 = obdio_cancel (conn, &lh);
+        if (rc == 0 && rc2 != 0) {
+                fprintf (stderr, "obdio_setup_barrier "LPX64": Error on cancel: %s\n",
+                         b->ob_oid, strerror (errno));
+                rc = rc2;
+        }
+ out_1:
+        free (space);
+ out_0:
+        rc2 = obdio_close (conn, b->ob_oid, &fh);
+        if (rc == 0 && rc2 != 0) {
+                fprintf (stderr, "obdio_setup_barrier "LPX64": Error on close: %s\n",
+                         b->ob_oid, strerror (errno));
+                rc = rc2;
+        }
+
+        return (rc);
+}
+
+/*
+ * Report a barrier page that disagrees with what the caller expects.
+ * 'when' is appended to the "corrupt" headline ("" or " on initial read").
+ * The headline names the barrier by object id, like every other
+ * obdio_barrier() message.
+ */
+static void
+obdio_barrier_report_corrupt (const char *when,
+                              struct obdio_barrier *got,
+                              struct obdio_barrier *expected)
+{
+        fprintf (stderr, "obdio_barrier "LPX64": corrupt%s\n",
+                 expected->ob_oid, when);
+        fprintf (stderr, " got ["LPX64","LPX64","LPX64","LPX64","LPX64"]\n",
+                 got->ob_id, got->ob_oid, got->ob_npeers,
+                 got->ob_ordinal, got->ob_count);
+        fprintf (stderr, " expected ["LPX64","LPX64","LPX64","LPX64","LPX64"]\n",
+                 expected->ob_id, expected->ob_oid, expected->ob_npeers,
+                 expected->ob_ordinal, expected->ob_count);
+}
+
+/*
+ * Join the barrier described by *b and wait until all ob_npeers
+ * participants have joined.
+ *
+ * Under an LCK_PW lock on the barrier page the page is read, checked
+ * against *b, the join count is bumped and the page written back; the last
+ * joiner resets the count and advances the stored ordinal instead.  The
+ * caller then advances b->ob_ordinal and re-reads the page under LCK_PR
+ * locks until the stored ordinal catches up.
+ *
+ * Returns 0 once the barrier is passed, otherwise the first non-zero result
+ * seen (-1 for an allocation failure or a page that fails validation).
+ */
+int
+obdio_barrier (struct obdio_conn *conn, struct obdio_barrier *b)
+{
+        struct lustre_handle  fh;
+        struct lustre_handle  lh;
+        int                   rc;
+        int                   rc2;
+        int                   pagesize = getpagesize ();
+        void                 *space;
+        struct obdio_barrier *fileb;
+        char                 *mode;
+
+        rc = obdio_open (conn, b->ob_oid, &fh);
+        if (rc != 0) {
+                fprintf (stderr, "obdio_barrier "LPX64": Error on open: %s\n",
+                         b->ob_oid, strerror (errno));
+                return (rc);
+        }
+
+        fileb = (struct obdio_barrier *) obdio_alloc_aligned_buffer (&space, pagesize);
+        if (fileb == NULL) {
+                fprintf (stderr, "obdio_barrier "LPX64": Can't allocate page buffer\n",
+                         b->ob_oid);
+                rc = -1;
+                goto out_0;
+        }
+
+        rc = obdio_enqueue (conn, b->ob_oid, LCK_PW, 0, pagesize, &lh);
+        if (rc != 0) {
+                fprintf (stderr, "obdio_barrier "LPX64": Error on PW enqueue: %s\n",
+                         b->ob_oid, strerror (errno));
+                goto out_1;
+        }
+
+        /* poison the buffer so a short or missing read fails validation */
+        memset (fileb, 0xeb, pagesize);
+        rc = obdio_pread (conn, b->ob_oid, (void *)fileb, pagesize, 0);
+        if (rc != 0) {
+                fprintf (stderr, "obdio_barrier "LPX64": Error on initial read: %s\n",
+                         b->ob_oid, strerror (errno));
+                goto out_2;
+        }
+
+        if (fileb->ob_id != b->ob_id ||
+            fileb->ob_oid != b->ob_oid ||
+            fileb->ob_npeers != b->ob_npeers ||
+            fileb->ob_count >= b->ob_npeers ||
+            fileb->ob_ordinal != b->ob_ordinal) {
+                obdio_barrier_report_corrupt (" on initial read", fileb, b);
+                rc = -1;
+                goto out_2;
+        }
+
+        fileb->ob_count++;
+        if (fileb->ob_count == fileb->ob_npeers) { /* I'm the last joiner */
+                fileb->ob_count = 0;       /* join count for next barrier */
+                fileb->ob_ordinal++;       /* signal all joined */
+        }
+
+        rc = obdio_pwrite (conn, b->ob_oid, (void *)fileb, pagesize, 0);
+        if (rc != 0) {
+                fprintf (stderr, "obdio_barrier "LPX64": Error on initial write: %s\n",
+                         b->ob_oid, strerror (errno));
+                goto out_2;
+        }
+
+        mode = "PW";
+        b->ob_ordinal++;           /* now I wait... */
+        while (fileb->ob_ordinal != b->ob_ordinal) {
+
+                /* drop the lock currently held (named by 'mode') before
+                 * re-reading the page under a PR lock */
+                rc = obdio_cancel (conn, &lh);
+                if (rc != 0) {
+                        fprintf (stderr, "obdio_barrier "LPX64": Error on %s cancel: %s\n",
+                                 b->ob_oid, mode, strerror (errno));
+                        goto out_1;
+                }
+
+                mode = "PR";
+                rc = obdio_enqueue (conn, b->ob_oid, LCK_PR, 0, pagesize, &lh);
+                if (rc != 0) {
+                        fprintf (stderr, "obdio_barrier "LPX64": Error on PR enqueue: %s\n",
+                                 b->ob_oid, strerror (errno));
+                        goto out_1;
+                }
+
+                memset (fileb, 0xeb, pagesize);
+                rc = obdio_pread (conn, b->ob_oid, (void *)fileb, pagesize, 0);
+                if (rc != 0) {
+                        fprintf (stderr, "obdio_barrier "LPX64": Error on read: %s\n",
+                                 b->ob_oid, strerror (errno));
+                        goto out_2;
+                }
+
+                /* while waiting the stored ordinal may still be the old one
+                 * or already the one we are waiting for, nothing else */
+                if (fileb->ob_id != b->ob_id ||
+                    fileb->ob_oid != b->ob_oid ||
+                    fileb->ob_npeers != b->ob_npeers ||
+                    fileb->ob_count >= b->ob_npeers ||
+                    (fileb->ob_ordinal != b->ob_ordinal - 1 &&
+                     fileb->ob_ordinal != b->ob_ordinal)) {
+                        obdio_barrier_report_corrupt ("", fileb, b);
+                        rc = -1;
+                        goto out_2;
+                }
+        }
+
+ out_2:
+        rc2 = obdio_cancel (conn, &lh);
+        if (rc == 0 && rc2 != 0) {
+                fprintf (stderr, "obdio_barrier "LPX64": Error on cancel: %s\n",
+                         b->ob_oid, strerror (errno));
+                rc = rc2;
+        }
+ out_1:
+        free (space);
+ out_0:
+        rc2 = obdio_close (conn, b->ob_oid, &fh);
+        if (rc == 0 && rc2 != 0) {
+                fprintf (stderr, "obdio_barrier "LPX64": Error on close: %s\n",
+                         b->ob_oid, strerror (errno));
+                rc = rc2;
+        }
+
+        return (rc);
+}
+
+
diff --git a/lustre/utils/obdiolib.h b/lustre/utils/obdiolib.h
new file mode 100644
index 0000000..9b06941
--- /dev/null
+++ b/lustre/utils/obdiolib.h
@@ -0,0 +1,70 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (C) 2003 Cluster File Systems, Inc.
+ *   Author: Eric Barton
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#ifndef _OBDIOLIB_H_
+#define _OBDIOLIB_H_
+
+#include        /* NOTE(review): the targets of all six #include lines here were lost in extraction -- restore them from upstream */
+
+#include
+#include
+
+#include
+#include
+#include
+
+struct obdio_conn {
+        int                    oc_fd;
+        uint64_t               oc_conn_addr;
+        uint64_t               oc_conn_cookie;
+        struct obd_ioctl_data  oc_data;          /* ioctl argument block, filled in before each obdio_ioctl() */
+        char                   oc_buffer[8192];
+};
+
+struct obdio_barrier {
+        uint64_t               ob_id;            /* caller-chosen tag; must match on every read */
+        uint64_t               ob_oid;           /* object whose first page stores the barrier */
+        uint64_t               ob_npeers;        /* participants that must join */
+        uint64_t               ob_ordinal;       /* advanced when the last participant joins */
+        uint64_t               ob_count;         /* participants joined so far; always < ob_npeers on disk */
+};
+
+extern struct obdio_conn * obdio_connect (int device);
+extern void obdio_disconnect (struct obdio_conn *conn);
+extern int obdio_open (struct obdio_conn *conn, uint64_t oid,
+                       struct lustre_handle *fh);
+extern int obdio_close (struct obdio_conn *conn, uint64_t oid,
+                        struct lustre_handle *fh);
+extern int obdio_pread (struct obdio_conn *conn, uint64_t oid,
+                        char *buffer, uint32_t count, uint64_t offset);
+extern int obdio_pwrite (struct obdio_conn *conn, uint64_t oid,
+                         char *buffer, uint32_t count, uint64_t offset);
+extern int obdio_enqueue (struct obdio_conn *conn, uint64_t oid,
+                          int mode, uint64_t offset, uint32_t count,
+                          struct lustre_handle *lh);
+extern int obdio_cancel (struct obdio_conn *conn, struct lustre_handle *lh);
+extern void *obdio_alloc_aligned_buffer (void **spacep, int size);
+extern struct obdio_barrier *obdio_new_barrier (uint64_t oid, uint64_t id, int npeers) ;
+extern int obdio_setup_barrier (struct obdio_conn *conn, struct obdio_barrier *b);
+extern int obdio_barrier (struct obdio_conn *conn, struct obdio_barrier *b);
+
+#endif
diff --git a/lustre/utils/obdstat.c b/lustre/utils/obdstat.c
new file mode 100644
index 0000000..1e23a31
--- /dev/null
+++ b/lustre/utils/obdstat.c
@@ -0,0 +1,258 @@
+/* NOTE(review): the header names below are inferred from the library calls
+ * this file makes -- confirm them against the upstream source. */
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/time.h>
+
+/* One counter file and its two most recent samples. */
+struct one_stat {
+        char      *name;        /* counter name given to init_one_stat() */
+        int        fd;          /* descriptor open() returned for the file */
+        long long  current;     /* value parsed by the latest update */
+        long long  delta;       /* current minus the previously parsed value */
+};
+
+struct one_stat *read_bytes;
+struct one_stat *read_reqs;
+struct one_stat *write_bytes;
+struct one_stat *write_reqs;
+struct one_stat *getattr_reqs;
+struct one_stat *setattr_reqs;
+struct one_stat *create_reqs;
+struct one_stat *destroy_reqs;
+struct one_stat *statfs_reqs;
+struct one_stat *open_reqs;
+struct one_stat *close_reqs;
+struct one_stat *punch_reqs;
+
+/*
+ * Allocate a tracker for the counter file <basename>/<name> and open the
+ * file read-only.  'name' is stored by reference, not copied.  Any failure
+ * (allocation, over-long path, open) is reported on stderr and aborts.
+ */
+struct one_stat *
+init_one_stat (char *basename, char *name)
+{
+        char             fname[1024];
+        int              len;
+        struct one_stat *stat = malloc (sizeof (*stat));
+
+        if (stat == NULL) {
+                fprintf (stderr, "Can't allocate stat %s: %s\n",
+                         name, strerror (errno));
+                abort ();
+        }
+
+        /* refuse a truncated path rather than open some other file */
+        len = snprintf (fname, sizeof (fname), "%s/%s", basename, name);
+        if (len < 0 || (size_t) len >= sizeof (fname)) {
+                fprintf (stderr, "Can't open stat %s/%s: path too long\n",
+                         basename, name);
+                abort ();
+        }
+
+        memset (stat, 0, sizeof (*stat));
+        stat->name = name;
+
+        stat->fd = open (fname, O_RDONLY);
+        if (stat->fd < 0) {
+                fprintf (stderr, "Can't open stat %s: %s\n",
+                         fname, strerror (errno));
+                abort ();
+        }
+
+        return (stat);
+}
+
+/*
+ * Re-read the counter from the start of its file, store it in
+ * stat->current and set stat->delta to the change since the previous
+ * value.  Aborts if the file cannot be read or does not start with an
+ * integer.
+ */
+void
+update_one_stat (struct one_stat *stat)
+{
+        static char buffer[1024];
+        long long   prev = stat->current;
+        int         nob;
+
+        lseek (stat->fd, 0, SEEK_SET);
+        nob = read (stat->fd, buffer, sizeof (buffer) - 1);
+        if (nob < 0) {
+                fprintf (stderr, "Can't read stat %s: %s\n",
+                         stat->name, strerror (errno));
+                abort ();
+        }
+
+        buffer[nob] = 0;
+        /* "%lld" is the standard conversion for long long; "%Ld" is not */
+        if (sscanf (buffer, "%lld", &stat->current) != 1) {
+                fprintf (stderr, "Can't parse stat %s: %s\n",
+                         stat->name, strerror (errno));
+                abort ();
+        }
+
+        stat->delta = stat->current - prev;
+}
+
+/* Current time from gettimeofday(), in seconds with microsecond fraction. */
+double
+timenow (void)
+{
+        struct timeval tv;
+
+        gettimeofday (&tv, NULL);
+        return (tv.tv_sec + tv.tv_usec / 1000000.0);
+}
+
+/* delta / t, or 0 when no time has elapsed so that we never divide by 0. */
+static double
+rate (long long delta, double t)
+{
+        return (t > 0.0 ? delta / t : 0.0);
+}
+
+/*
+ * Sample every counter and print one line.  The first call prints the
+ * absolute counter values; later calls print the change since the previous
+ * call together with per-second rates.
+ */
+void
+do_stat (void)
+{
+        static double last = 0.0;
+        double        now;
+        double        t;
+
+        now = timenow ();
+
+        update_one_stat (read_bytes);
+        update_one_stat (read_reqs);
+        update_one_stat (write_bytes);
+        update_one_stat (write_reqs);
+        update_one_stat (getattr_reqs);
+        update_one_stat (setattr_reqs);
+        update_one_stat (open_reqs);
+        update_one_stat (close_reqs);
+        update_one_stat (create_reqs);
+        update_one_stat (destroy_reqs);
+        update_one_stat (statfs_reqs);
+        update_one_stat (punch_reqs);
+
+        if (last == 0.0) {
+                printf ("R %lld/%lld W %lld/%lld attr %lld/%lld open %lld/%lld create %lld/%lld stat %lld punch %lld\n",
+                        read_bytes->current, read_reqs->current,
+                        write_bytes->current, write_reqs->current,
+                        getattr_reqs->current, setattr_reqs->current,
+                        open_reqs->current, close_reqs->current,
+                        create_reqs->current, destroy_reqs->current,
+                        statfs_reqs->current, punch_reqs->current);
+        } else {
+                t = now - last;
+
+                printf ("R %6lld (%5d %6.2fMb)/s W %6lld (%5d %6.2fMb)/s",
+                        read_reqs->delta, (int) rate (read_reqs->delta, t),
+                        rate (read_bytes->delta, (1<<20) * t),
+                        write_reqs->delta, (int) rate (write_reqs->delta, t),
+                        rate (write_bytes->delta, (1<<20) * t));
+
+                if (getattr_reqs->delta != 0)
+                        printf (" ga:%lld,%d/s", getattr_reqs->delta,
+                                (int) rate (getattr_reqs->delta, t));
+
+                if (setattr_reqs->delta != 0)
+                        printf (" sa:%lld", setattr_reqs->delta);
+
+                if (open_reqs->delta != 0)
+                        printf (" op:%lld", open_reqs->delta);
+
+                if (close_reqs->delta != 0)
+                        printf (" cl:%lld", close_reqs->delta);
+
+                if (create_reqs->delta != 0)
+                        printf (" cx:%lld", create_reqs->delta);
+
+                if (destroy_reqs->delta != 0)
+                        printf (" dx:%lld", destroy_reqs->delta);
+
+                if (statfs_reqs->delta != 0)
+                        printf (" st:%lld", statfs_reqs->delta);
+
+                if (punch_reqs->delta != 0)
+                        printf (" pu:%lld", punch_reqs->delta);
+
+                printf ("\n");
+        }
+
+        /* time the next interval from this sample, not from after the printing */
+        last = now;
+}
+
+/*
+ * argv[1] names the obd type whose counters live under /proc/sys/<type>;
+ * the optional argv[2] is a repeat interval in seconds.  Prints one sample,
+ * then, if the interval is non-zero, another after every interval.
+ */
+int main (int argc, char **argv)
+{
+        char  basedir[128];
+        int   interval = 0;
+        int   len;
+
+        if (argc < 2) {
+                fprintf (stderr, "obd type not specified\n");
+                return (1);
+        }
+
+        /* NOTE(review): the commit message says lprocfs now lives at
+         * /proc/fs/lustre -- confirm this base path is still right */
+        len = snprintf (basedir, sizeof (basedir), "/proc/sys/%s", argv[1]);
+        if (len < 0 || (size_t) len >= sizeof (basedir)) {
+                fprintf (stderr, "obd type too long: %s\n", argv[1]);
+                return (1);
+        }
+
+        if (argc > 2) {
+                char *end;
+                long  val;
+
+                /* strtol() lets us reject junk and negative intervals */
+                errno = 0;
+                val = strtol (argv[2], &end, 10);
+                if (errno != 0 || end == argv[2] || *end != '\0' ||
+                    val < 0 || val > INT_MAX) {
+                        fprintf (stderr, "invalid interval: %s\n", argv[2]);
+                        return (1);
+                }
+                interval = (int) val;
+        }
+
+        read_bytes = init_one_stat (basedir, "read_bytes");
+        read_reqs = init_one_stat (basedir, "read_reqs");
+        write_bytes = init_one_stat (basedir, "write_bytes");
+        write_reqs = init_one_stat (basedir, "write_reqs");
+        getattr_reqs = init_one_stat (basedir, "getattr_reqs");
+        setattr_reqs = init_one_stat (basedir, "setattr_reqs");
+        create_reqs = init_one_stat (basedir, "create_reqs");
+        destroy_reqs = init_one_stat (basedir, "destroy_reqs");
+        statfs_reqs = init_one_stat (basedir, "statfs_reqs");
+        open_reqs = init_one_stat (basedir, "open_reqs");
+        close_reqs = init_one_stat (basedir, "close_reqs");
+        punch_reqs = init_one_stat (basedir, "punch_reqs");
+
+        do_stat ();
+
+        if (interval == 0)
+                return (0);
+
+        for (;;) {
+                sleep (interval);
+                do_stat ();
+        }
+}
-- 
1.8.3.1