Whamcloud - gitweb
Merge b_md to HEAD for 0.5.19 release.
authoradilger <adilger>
Mon, 6 Jan 2003 22:22:15 +0000 (22:22 +0000)
committeradilger <adilger>
Mon, 6 Jan 2003 22:22:15 +0000 (22:22 +0000)
Fixes a _huge_ number of bugs:
  - Fully reactivate OST imports after reconnection (512, others)
  - Make sure client sees our -ENOTCONN from mds_handle (513 - partial)
  - More graceful error handling for truncating on dead OST (515)
  - Don't error out unless we're actually accessing dead stripes (474)
  - Fix garbage sizes when stripes are missing (410)
  - LRU counters were broken, causing constant lock purge (433, 432)
  - garbage on read from stripes with failed OSTs (441)
  - mark OSCs as active before reconnecting during recovery (438)
  - lov_enqueue and lov_cancel need to handle inactive OSTs (403)
  - lfind did not preserve OST order in output (443)
  - symlinks cause hung clients, incorrect data (439)
  - stop dereferencing request after dropping refcount (457)
  - don't LASSERT(spin_is_locked) on non-SMP (455)
  - fixes for many rename() bugs
  - fstat didn't correctly synchronize attributes (399)
  - server must handle lock cancellation during blocking AST prep (487)
  - bulk descriptors were free()d too soon (511)
  - fix paths in lconf, which would load incorrect modules (451, 507)
  - fix confusing lconf 'host not found' error message (386)
  - fix lock order deadlock on OST (O/R i_sem before journal ops, 478)
  - fix race condition in mdc_blocking_ast() for inode access (526)
  - fix lov_unpackmd() unpacking wrong number of stripes (537)
  - fix lov_set_osc_active() marking wrong OSC inactive (440)
  - fix bad lstripe lov_unpackmd() assertion (fix layering too) (527)
  - fix multiple writes of stripe MD to MDS (358, maybe 519)
  - fix lstripe in several ways (kernel side) (527)
  - fix request leak in ldlm_cli_enqueue (262)
  - incorrect OSC was marked inactive after OST failure
  - call mds_fs_cleanup before unmounting filesystem (524)
  - fix races between taking ns_lock and ldlm_lock_change_resource
  - fix races updating LOV export open file list
  - fix lov_enqueue error path, avoid decref-ing bad lock handle
  - fix recovery NULL deref in ldlm_cli_cancel_unused
  - fix some DLM races by using new hash table for lock handles (419)
  - permit the client to specify desired inodes, at replay
  - duplicate requests when we queue them for replay reintegration
  - fix last_rcvd offset calculation
  - sync after each recovered transaction, so we always make progress
  - never, not always, ERESTART requests without transnos
  - store the lov_desc in the MDS, so we don't depend on getlovinfo to set it
  - skip replay if the MDS says that the client is already connected
  - don't check for a recovery-enabled export to match lctl's UUID
  - don't INC_USE_COUNT for phantom exports
  - don't crash when cleaning up phantom exports (567)
  - don't double-finish or set replay data for errored mdc_open requests
  - abort requests when they time out, so we don't get old replies
  - send/receive replies for AST messages again
  - if the client says that it doesn't have the lock, cancel it on the server
  - if we timeout during I/O, don't try to cancel an in-use lock; instead
    mark it as destroyed, it will all work out when decref is called
  - fix module use counts (22, 581)
 * protocol changes
  - ASTs now expect a reply (server cancels lock on error reply)

118 files changed:
lustre/.cvsignore
lustre/ChangeLog
lustre/Makefile.am
lustre/cobd/.cvsignore [new file with mode: 0644]
lustre/cobd/Makefile.am [new file with mode: 0644]
lustre/cobd/cache_obd.c [new file with mode: 0644]
lustre/cobd/lproc_cache.c [new file with mode: 0644]
lustre/conf/.cvsignore [new file with mode: 0644]
lustre/conf/Makefile.am [new file with mode: 0644]
lustre/conf/lustre.dtd [new file with mode: 0644]
lustre/conf/lustre2ldif.xsl [new file with mode: 0644]
lustre/conf/slapd-lustre.conf [new file with mode: 0644]
lustre/conf/top.ldif [new file with mode: 0644]
lustre/configure.in
lustre/extN/Makefile.am
lustre/extN/extN-noread.diff [new file with mode: 0644]
lustre/extN/extN-wantedi.diff [new file with mode: 0644]
lustre/include/linux/lustre_dlm.h
lustre/include/linux/lustre_export.h
lustre/include/linux/lustre_ha.h
lustre/include/linux/lustre_idl.h
lustre/include/linux/lustre_lib.h
lustre/include/linux/lustre_lite.h
lustre/include/linux/lustre_mds.h
lustre/include/linux/lustre_net.h
lustre/include/linux/obd.h
lustre/include/linux/obd_cache.h [new file with mode: 0644]
lustre/include/linux/obd_class.h
lustre/include/linux/obd_ost.h
lustre/include/linux/obd_ptlbd.h [new file with mode: 0644]
lustre/include/linux/obd_support.h
lustre/ldlm/l_lock.c
lustre/ldlm/ldlm_extent.c
lustre/ldlm/ldlm_lock.c
lustre/ldlm/ldlm_lockd.c
lustre/ldlm/ldlm_request.c
lustre/ldlm/ldlm_resource.c
lustre/ldlm/ldlm_test.c
lustre/lib/Makefile.am
lustre/lib/client.c
lustre/lib/obd_pack.c
lustre/lib/simple.c
lustre/lib/target.c
lustre/llite/Makefile.am
lustre/llite/commit_callback.c
lustre/llite/dir.c
lustre/llite/file.c
lustre/llite/namei.c
lustre/llite/recover.c
lustre/llite/rw.c
lustre/llite/super.c
lustre/llite/super25.c
lustre/llite/symlink.c
lustre/lov/lov_obd.c
lustre/lov/lov_pack.c
lustre/lov/lproc_lov.c
lustre/mdc/Makefile.am
lustre/mdc/mdc_request.c
lustre/mds/Makefile.am
lustre/mds/handler.c
lustre/mds/lproc_mds.c
lustre/mds/mds_fs.c
lustre/mds/mds_lov.c
lustre/mds/mds_reint.c
lustre/obdclass/Makefile.am
lustre/obdclass/class_obd.c
lustre/obdclass/fsfilt.c
lustre/obdclass/fsfilt_extN.c
lustre/obdclass/fsfilt_reiserfs.c [new file with mode: 0644]
lustre/obdclass/genops.c
lustre/obdclass/lprocfs_status.c
lustre/obdclass/statfs_pack.c [moved from lustre/lib/ll_pack.c with 88% similarity]
lustre/obdecho/echo.c
lustre/obdecho/echo_client.c
lustre/obdfilter/Makefile.am
lustre/obdfilter/filter.c
lustre/osc/Makefile.am
lustre/osc/osc_request.c
lustre/ost/Makefile.am
lustre/ost/ost_handler.c
lustre/patches/.cvsignore [deleted file]
lustre/ptlbd/.cvsignore [new file with mode: 0644]
lustre/ptlbd/Makefile.am [new file with mode: 0644]
lustre/ptlbd/blk.c [new file with mode: 0644]
lustre/ptlbd/client.c [new file with mode: 0644]
lustre/ptlbd/main.c [new file with mode: 0644]
lustre/ptlbd/rpc.c [new file with mode: 0644]
lustre/ptlbd/server.c [new file with mode: 0644]
lustre/ptlrpc/client.c
lustre/ptlrpc/niobuf.c
lustre/ptlrpc/recovd.c
lustre/ptlrpc/recover.c
lustre/ptlrpc/rpc.c
lustre/ptlrpc/service.c
lustre/tests/.cvsignore
lustre/tests/Makefile.am
lustre/tests/createmany.c
lustre/tests/echo.sh [new file with mode: 0755]
lustre/tests/llecho.sh
lustre/tests/llechocleanup.sh
lustre/tests/llmount.sh
lustre/tests/llmountcleanup.sh
lustre/tests/local.sh
lustre/tests/lov.sh
lustre/tests/lovstripe.c [deleted file]
lustre/tests/sanity.sh
lustre/tests/statmany.c [new file with mode: 0644]
lustre/tests/uml.sh
lustre/tests/wantedi.c [new file with mode: 0644]
lustre/utils/automatic-reconnect-sample [new file with mode: 0755]
lustre/utils/lconf.in
lustre/utils/lctl.c
lustre/utils/lfind.c
lustre/utils/lmc
lustre/utils/lstripe.c
lustre/utils/lustre.dtd [deleted file]
lustre/utils/obd.c
lustre/utils/obdctl.h

index 111b232..34373dd 100644 (file)
@@ -12,3 +12,4 @@ TAGS
 lustre*.tar.gz
 cscope.files
 cscope.out
+autom4te-2.53.cache
index fc930e2..41e712f 100644 (file)
@@ -1,4 +1,5 @@
-TBA
+2003-01-06  Andreas Dilger  <adilger@clusterfs.com>
+       * version v0_5_19
        * bug fixes
         - Fully reactivate OST imports after reconnection (512, others)
         - Make sure client sees our -ENOTCONN from mds_handle (513 - partial)
@@ -20,6 +21,41 @@ TBA
         - fix paths in lconf, which would load incorrect modules (451, 507)
         - fix confusing lconf 'host not found' error message (386)
         - fix lock order deadlock on OST (O/R i_sem before journal ops, 478)
+        - fix race condition in mdc_blocking_ast() for inode access (526)
+        - fix lov_unpackmd() unpacking wrong number of stripes (537)
+        - fix lov_set_osc_active() marking wrong OSC inactive (440)
+        - fix bad lstripe lov_unpackmd() assertion (fix layering too) (527)
+        - fix multiple writes of stripe MD to MDS (358, maybe 519)
+        - fix lstripe in several ways (kernel side) (527)
+        - fix request leak in ldlm_cli_enqueue (262)
+        - incorrect OSC was marked inactive after OST failure
+        - call mds_fs_cleanup before unmounting filesystem (524)
+        - fix races between taking ns_lock and ldlm_lock_change_resource
+        - fix races updating LOV export open file list
+        - fix lov_enqueue error path, avoid decref-ing bad lock handle
+        - fix recovery NULL deref in ldlm_cli_cancel_unused
+        - fix some DLM races by using new hash table for lock handles (419)
+        - permit the client to specify desired inodes, at replay
+        - duplicate requests when we queue them for replay reintegration
+        - fix last_rcvd offset calculation
+        - sync after each recovered transaction, so we always make progress
+        - never, not always, ERESTART requests without transnos
+        - store the lov_desc in the MDS, so we don't depend on getlovinfo to
+          set it
+        - skip replay if the MDS says that the client is already connected
+        - don't check for a recovery-enabled export to match lctl's UUID
+        - don't INC_USE_COUNT for phantom exports
+        - don't crash when cleaning up phantom exports (567)
+        - don't double-finish or set replay data for errored mdc_open requests
+        - abort requests when they time out, so we don't get old replies
+        - send/receive replies for AST messages again
+        - if the client says that it doesn't have the lock, cancel it on the
+          server
+        - if we timeout during I/O, don't try to cancel an in-use lock; instead
+          mark it as destroyed, it will all work out when decref is called
+        - fix module use counts (22, 581)
+       * protocol changes
+        - ASTs now expect a reply (server cancels lock on error reply)
 
 2002-12-02  Andreas Dilger  <adilger@clusterfs.com>
        * version v0_5_18
index 6e9281d..b0d8dd3 100644 (file)
@@ -12,8 +12,8 @@ DIRS24 = extN mds
 endif
 
 # NOTE: keep extN before mds and obdfilter
-SUBDIRS = $(DIRS24) obdclass utils ptlrpc ldlm lib obdfilter mdc osc ost llite 
-SUBDIRS+= obdecho lov tests doc scripts 
+SUBDIRS = $(DIRS24) obdclass utils ptlrpc ldlm lib obdfilter mdc osc ost llite
+SUBDIRS+= obdecho lov cobd ptlbd tests doc scripts conf
 
 DIST_SUBDIRS = $(SUBDIRS)
 EXTRA_DIST = BUGS FDL Rules include archdep.m4
@@ -28,4 +28,4 @@ dist-hook:
 include $(top_srcdir)/Rules
 
 rpms: dist Makefile
-       rpm -ta $(distdir).tar.gz
+       rpmbuild -ta $(distdir).tar.gz
diff --git a/lustre/cobd/.cvsignore b/lustre/cobd/.cvsignore
new file mode 100644 (file)
index 0000000..e995588
--- /dev/null
@@ -0,0 +1,3 @@
+.deps
+Makefile
+Makefile.in
diff --git a/lustre/cobd/Makefile.am b/lustre/cobd/Makefile.am
new file mode 100644 (file)
index 0000000..781c6ce
--- /dev/null
@@ -0,0 +1,15 @@
+# Copyright (C) 2002 Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+DEFS=
+
+MODULE = cobd
+modulefs_DATA = cobd.o
+EXTRA_PROGRAMS = cobd
+LINX=
+
+cobd_SOURCES = cache_obd.c lproc_cache.c $(LINX)
+
+include $(top_srcdir)/Rules
diff --git a/lustre/cobd/cache_obd.c b/lustre/cobd/cache_obd.c
new file mode 100644 (file)
index 0000000..ac921d8
--- /dev/null
@@ -0,0 +1,329 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ * This code is issued under the GNU General Public License.
+ * See the file COPYING in this distribution
+ */
+
+#define DEBUG_SUBSYSTEM S_COBD
+
+#include <linux/obd_support.h>
+#include <linux/lustre_lib.h>
+#include <linux/lustre_net.h>
+#include <linux/lustre_idl.h>
+#include <linux/obd_class.h>
+#include <linux/obd_cache.h>
+
+extern struct lprocfs_vars status_var_nm_1[];
+extern struct lprocfs_vars status_class_var[];
+
+/* Attach hook: register the status_var_nm_1 lprocfs table for this device.
+ * len and data are not used.  Returns the result of lprocfs_reg_obd(). */
+static int
+cobd_attach(struct obd_device *dev, obd_count len, void *data)
+{
+        int rc;
+
+        rc = lprocfs_reg_obd(dev, status_var_nm_1, dev);
+        return rc;
+}
+
+/* Detach hook: undo cobd_attach() by deregistering the device's lprocfs
+ * entries.  Returns the result of lprocfs_dereg_obd(). */
+static int
+cobd_detach(struct obd_device *dev)
+{
+        int rc;
+
+        rc = lprocfs_dereg_obd(dev);
+        return rc;
+}
+
+/* Setup hook: connect the caching OBD to its two backing devices.
+ *
+ * buf is read as a struct obd_ioctl_data.  ioc_inlbuf1 is handed to
+ * class_uuid2obd() to look up the target device and ioc_inlbuf2 to look up
+ * the cache device.  Returns -EINVAL when either buffer is NULL or either
+ * lookup fails; otherwise 0, or the error from the obd_connect() that
+ * failed.  len is not used.
+ */
+static int
+cobd_setup(struct obd_device *dev, obd_count len, void *buf)
+{
+        struct obd_ioctl_data *ioc = buf;
+        struct cache_obd *cobd = &dev->u.cobd;
+        struct obd_device *target_dev;
+        struct obd_device *cache_dev;
+        int rc;
+
+        if (ioc->ioc_inlbuf1 == NULL || ioc->ioc_inlbuf2 == NULL)
+                return -EINVAL;
+
+        target_dev = class_uuid2obd(ioc->ioc_inlbuf1);
+        cache_dev = class_uuid2obd(ioc->ioc_inlbuf2);
+        if (target_dev == NULL || cache_dev == NULL)
+                return -EINVAL;
+
+        /* No attached/setup check here: obd_connect() should do that, and
+         * the state can change underneath us anyway. */
+        rc = obd_connect(&cobd->cobd_target, target_dev, NULL, NULL, NULL);
+        if (rc != 0)
+                return rc;
+
+        rc = obd_connect(&cobd->cobd_cache, cache_dev, NULL, NULL, NULL);
+        if (rc == 0)
+                return 0;
+
+        /* The cache connect failed: drop the target connection made above
+         * and report the cache connect error. */
+        obd_disconnect(&cobd->cobd_target);
+        return rc;
+}
+
+/* Cleanup hook: disconnect from the cache and then the target device.
+ *
+ * Returns -EBUSY while the device still has exports.  Disconnect errors
+ * are logged but not propagated; the function then returns 0.
+ */
+static int
+cobd_cleanup(struct obd_device *dev)
+{
+        struct cache_obd *cobd = &dev->u.cobd;
+        int err;
+
+        if (!list_empty(&dev->obd_exports))
+                return -EBUSY;
+
+        err = obd_disconnect(&cobd->cobd_cache);
+        if (err != 0)
+                CERROR("error %d disconnecting cache\n", err);
+
+        err = obd_disconnect(&cobd->cobd_target);
+        if (err != 0)
+                CERROR("error %d disconnecting target\n", err);
+
+        return 0;
+}
+
+/* Connect hook: create a connection to this device via class_connect().
+ *
+ * recovd and recover are accepted for the obd_ops signature but not used.
+ * Returns the result of class_connect().  Only a failure is reported at
+ * error level; a successful connect no longer logs "rc 0" through CERROR.
+ */
+static int
+cobd_connect(struct lustre_handle *conn, struct obd_device *obd,
+             obd_uuid_t cluuid, struct recovd_obd *recovd,
+             ptlrpc_recovery_cb_t recover)
+{
+        int rc = class_connect(conn, obd, cluuid);
+
+        if (rc != 0)
+                CERROR("class_connect failed: rc %d\n", rc);
+        return rc;
+}
+
+/* Disconnect hook: tear down a connection via class_disconnect().
+ *
+ * Returns the result of class_disconnect().  Only a failure is reported at
+ * error level; a successful disconnect no longer logs "rc 0" through CERROR.
+ */
+static int
+cobd_disconnect(struct lustre_handle *conn)
+{
+        int rc = class_disconnect(conn);
+
+        if (rc != 0)
+                CERROR("class_disconnect failed: rc %d\n", rc);
+        return rc;
+}
+
+/* Forward a get_info request to the target device.
+ *
+ * Returns -EINVAL when conn does not resolve to an OBD device, otherwise
+ * the result of obd_get_info() on the target connection.
+ */
+static int
+cobd_get_info(struct lustre_handle *conn, obd_count keylen,
+              void *key, obd_count *vallen, void **val)
+{
+        struct obd_device *obd = class_conn2obd(conn);
+
+        if (obd != NULL) {
+                /* intercept cache utilisation info? */
+                return obd_get_info(&obd->u.cobd.cobd_target,
+                                    keylen, key, vallen, val);
+        }
+
+        CERROR("invalid client "LPX64"\n", conn->addr);
+        return -EINVAL;
+}
+
+/* Forward a statfs request to the target device.
+ *
+ * Returns -EINVAL when conn does not resolve to an OBD device, otherwise
+ * the result of obd_statfs() on the target connection.
+ */
+static int
+cobd_statfs(struct lustre_handle *conn, struct obd_statfs *osfs)
+{
+        struct obd_device *obd = class_conn2obd(conn);
+
+        if (obd != NULL)
+                return obd_statfs(&obd->u.cobd.cobd_target, osfs);
+
+        CERROR("invalid client "LPX64"\n", conn->addr);
+        return -EINVAL;
+}
+
+/* Forward a getattr request to the target device.
+ *
+ * Returns -EINVAL when conn does not resolve to an OBD device, otherwise
+ * the result of obd_getattr() on the target connection.
+ */
+static int
+cobd_getattr(struct lustre_handle *conn, struct obdo *oa,
+             struct lov_stripe_md *lsm)
+{
+        struct obd_device *obd = class_conn2obd(conn);
+
+        if (obd != NULL)
+                return obd_getattr(&obd->u.cobd.cobd_target, oa, lsm);
+
+        CERROR("invalid client "LPX64"\n", conn->addr);
+        return -EINVAL;
+}
+
+/* Forward an open request to the target device.
+ *
+ * Returns -EINVAL when conn does not resolve to an OBD device, otherwise
+ * the result of obd_open() on the target connection.
+ */
+static int
+cobd_open(struct lustre_handle *conn, struct obdo *oa,
+          struct lov_stripe_md *lsm)
+{
+        struct obd_device *obd = class_conn2obd(conn);
+
+        if (obd != NULL)
+                return obd_open(&obd->u.cobd.cobd_target, oa, lsm);
+
+        CERROR("invalid client "LPX64"\n", conn->addr);
+        return -EINVAL;
+}
+
+/* Forward a close request to the target device.
+ *
+ * Returns -EINVAL when conn does not resolve to an OBD device, otherwise
+ * the result of obd_close() on the target connection.
+ */
+static int
+cobd_close(struct lustre_handle *conn, struct obdo *oa,
+           struct lov_stripe_md *lsm)
+{
+        struct obd_device *obd = class_conn2obd(conn);
+
+        if (obd != NULL)
+                return obd_close(&obd->u.cobd.cobd_target, oa, lsm);
+
+        CERROR("invalid client "LPX64"\n", conn->addr);
+        return -EINVAL;
+}
+
+/* Prepare a bulk read by forwarding to the target device.
+ *
+ * Returns -EINVAL when conn does not resolve to an OBD device and
+ * -EOPNOTSUPP when cmd has OBD_BRW_WRITE set; otherwise the result of
+ * obd_preprw() on the target connection.
+ */
+static int
+cobd_preprw(int cmd, struct lustre_handle *conn,
+            int objcount, struct obd_ioobj *obj,
+            int niocount, struct niobuf_remote *nb,
+            struct niobuf_local *res, void **desc_private)
+{
+        struct obd_device *obd = class_conn2obd(conn);
+        struct lustre_handle *target;
+
+        if (obd == NULL) {
+                CERROR("invalid client "LPX64"\n", conn->addr);
+                return -EINVAL;
+        }
+
+        /* Writes are refused by this device. */
+        if (cmd & OBD_BRW_WRITE)
+                return -EOPNOTSUPP;
+
+        target = &obd->u.cobd.cobd_target;
+        return obd_preprw(cmd, target, objcount, obj, niocount, nb,
+                          res, desc_private);
+}
+
+/* Complete a bulk read by forwarding to the target device.
+ *
+ * Returns -EINVAL when conn does not resolve to an OBD device and
+ * -EOPNOTSUPP when cmd has OBD_BRW_WRITE set; otherwise the result of
+ * obd_commitrw() on the target connection.
+ */
+static int
+cobd_commitrw(int cmd, struct lustre_handle *conn,
+              int objcount, struct obd_ioobj *obj,
+              int niocount, struct niobuf_local *local,
+              void *desc_private)
+{
+        struct obd_device *obd = class_conn2obd(conn);
+        struct lustre_handle *target;
+
+        if (obd == NULL) {
+                CERROR("invalid client "LPX64"\n", conn->addr);
+                return -EINVAL;
+        }
+
+        /* Writes are refused by this device. */
+        if (cmd & OBD_BRW_WRITE)
+                return -EOPNOTSUPP;
+
+        target = &obd->u.cobd.cobd_target;
+        return obd_commitrw(cmd, target, objcount, obj, niocount, local,
+                            desc_private);
+}
+
+/* Perform bulk read I/O by forwarding to the target device.
+ *
+ * Returns -EINVAL when conn does not resolve to an OBD device and
+ * -EOPNOTSUPP when cmd has OBD_BRW_WRITE set; otherwise the result of
+ * obd_brw() on the target connection.
+ */
+static inline int
+cobd_brw(int cmd, struct lustre_handle *conn,
+         struct lov_stripe_md *lsm, obd_count oa_bufs,
+         struct brw_page *pga, struct obd_brw_set *set)
+{
+        struct obd_device *obd = class_conn2obd(conn);
+        struct lustre_handle *target;
+
+        if (obd == NULL) {
+                CERROR("invalid client "LPX64"\n", conn->addr);
+                return -EINVAL;
+        }
+
+        /* Writes are refused by this device. */
+        if (cmd & OBD_BRW_WRITE)
+                return -EOPNOTSUPP;
+
+        target = &obd->u.cobd.cobd_target;
+        return obd_brw(cmd, target, lsm, oa_bufs, pga, set);
+}
+
+/* Forward an ioctl to the target device.
+ *
+ * Returns -EINVAL when conn does not resolve to an OBD device, otherwise
+ * the result of obd_iocontrol() on the target connection.
+ */
+static int
+cobd_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len,
+               void *karg, void *uarg)
+{
+        struct obd_device *obd = class_conn2obd(conn);
+
+        if (obd != NULL) {
+                /* intercept? */
+                return obd_iocontrol(cmd, &obd->u.cobd.cobd_target,
+                                     len, karg, uarg);
+        }
+
+        CERROR("invalid client "LPX64"\n", conn->addr);
+        return -EINVAL;
+}
+
+static struct obd_ops cobd_ops = {     /* method table handed to class_register_type() in cobd_init() */
+        o_owner:                THIS_MODULE,
+        o_attach:               cobd_attach,    /* registers lprocfs entries */
+        o_detach:               cobd_detach,    /* deregisters lprocfs entries */
+
+        o_setup:                cobd_setup,     /* connects to target and cache devices */
+        o_cleanup:              cobd_cleanup,   /* disconnects from cache and target */
+
+        o_connect:              cobd_connect,
+        o_disconnect:           cobd_disconnect,
+
+        o_get_info:             cobd_get_info,  /* this and all entries below forward to the target connection */
+        o_statfs:               cobd_statfs,
+
+        o_getattr:              cobd_getattr,
+        o_open:                 cobd_open,
+        o_close:                cobd_close,
+        o_preprw:               cobd_preprw,    /* returns -EOPNOTSUPP for OBD_BRW_WRITE */
+        o_commitrw:             cobd_commitrw,  /* returns -EOPNOTSUPP for OBD_BRW_WRITE */
+        o_brw:                  cobd_brw,       /* returns -EOPNOTSUPP for OBD_BRW_WRITE */
+        o_iocontrol:            cobd_iocontrol,
+};
+
+/* Module entry point: announce the driver and register the caching OBD
+ * type under OBD_CACHE_DEVICENAME.  Returns the result of
+ * class_register_type(). */
+static int __init
+cobd_init(void)
+{
+        printk(KERN_INFO "Lustre Caching OBD driver\n");
+
+        return class_register_type(&cobd_ops, status_class_var,
+                                   OBD_CACHE_DEVICENAME);
+}
+
+/* Module exit point: unregister the type registered by cobd_init(). */
+static void __exit
+cobd_exit(void)
+{
+        class_unregister_type(OBD_CACHE_DEVICENAME);
+}
+
+MODULE_AUTHOR("Cluster Filesystems Inc. <info@clusterfs.com>");
+MODULE_DESCRIPTION("Lustre Caching OBD driver");
+MODULE_LICENSE("GPL");
+
+module_init(cobd_init);
+module_exit(cobd_exit);
+
+       
diff --git a/lustre/cobd/lproc_cache.c b/lustre/cobd/lproc_cache.c
new file mode 100644 (file)
index 0000000..5adcaf8
--- /dev/null
@@ -0,0 +1,95 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (C) 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/lustre_lite.h>
+#include <linux/lprocfs_status.h>
+
+/*
+ * Common STATUS namespace
+ */
+
+/* lprocfs read handler: print the device's own UUID followed by a newline.
+ * data is the struct obd_device.  Returns the snprintf() result; start,
+ * off and eof are not used. */
+static int rd_uuid(char *page, char **start, off_t off, int count,
+                   int *eof, void *data)
+{
+        struct obd_device *obd = data;
+        int len;
+
+        len = snprintf(page, count, "%s\n", obd->obd_uuid);
+        return len;
+}
+
+/* lprocfs read handler: print the UUID of the device behind the target
+ * connection, or "not set up" while OBD_SET_UP is clear in obd_flags.
+ * data is the struct obd_device.  Returns the snprintf() result. */
+static int rd_target(char *page, char **start, off_t off, int count,
+                     int *eof, void *data)
+{
+        struct obd_device *dev = data;
+        struct obd_export *exp;
+
+        if (!(dev->obd_flags & OBD_SET_UP))
+                return snprintf(page, count, "not set up\n");
+
+        /* Resolve the target connection handle to its export. */
+        exp = class_conn2export(&dev->u.cobd.cobd_target);
+        LASSERT(exp != NULL);
+        return snprintf(page, count, "%s\n", exp->exp_obd->obd_uuid);
+}
+
+/* lprocfs read handler: print the UUID of the device behind the cache
+ * connection, or "not set up" while OBD_SET_UP is clear in obd_flags.
+ * data is the struct obd_device.  Returns the snprintf() result. */
+static int rd_cache(char *page, char **start, off_t off, int count,
+                    int *eof, void *data)
+{
+        struct obd_device *dev = data;
+        struct obd_export *exp;
+
+        if (!(dev->obd_flags & OBD_SET_UP))
+                return snprintf(page, count, "not set up\n");
+
+        /* Resolve the cache connection handle to its export. */
+        exp = class_conn2export(&dev->u.cobd.cobd_cache);
+        LASSERT(exp != NULL);
+        return snprintf(page, count, "%s\n", exp->exp_obd->obd_uuid);
+}
+
+struct lprocfs_vars status_var_nm_1[] = {      /* per-device entries; passed to lprocfs_reg_obd() by cobd_attach() */
+        {"status/uuid", rd_uuid, 0, 0},                 /* this device's UUID */
+        {"status/target_uuid", rd_target, 0, 0},        /* UUID of the device behind cobd_target */
+        {"status/cache_uuid", rd_cache, 0, 0},          /* UUID of the device behind cobd_cache */
+        {0}     /* presumably the end-of-table sentinel - confirm against lprocfs */
+};
+
+/* lprocfs read handler: print the reference count of an OBD type.
+ * data is the struct obd_type.  Returns the snprintf() result; start,
+ * off and eof are not used. */
+int rd_numrefs(char *page, char **start, off_t off, int count,
+               int *eof, void *data)
+{
+        struct obd_type *type = data;
+        int len;
+
+        len = snprintf(page, count, "%d\n", type->typ_refcnt);
+        return len;
+}
+
+struct lprocfs_vars status_class_var[] = {     /* per-type entries; passed to class_register_type() by cobd_init() */
+        {"status/num_refs", rd_numrefs, 0, 0},  /* prints typ_refcnt */
+        {0}     /* presumably the end-of-table sentinel - confirm against lprocfs */
+};
diff --git a/lustre/conf/.cvsignore b/lustre/conf/.cvsignore
new file mode 100644 (file)
index 0000000..282522d
--- /dev/null
@@ -0,0 +1,2 @@
+Makefile
+Makefile.in
diff --git a/lustre/conf/Makefile.am b/lustre/conf/Makefile.am
new file mode 100644 (file)
index 0000000..7f98129
--- /dev/null
@@ -0,0 +1,13 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+# Distribute every hand-written file in this directory.  top.ldif is part
+# of lustre/conf but was missing here, so it was left out of dist tarballs.
+# NOTE(review): lustre.schema is listed below but was not seen among the
+# files added with this directory - confirm it exists or is generated.
+EXTRA_DIST = lustre2ldif.xsl lustre.dtd lustre.schema slapd-lustre.conf \
+             top.ldif
+ldapconfdir = $(sysconfdir)/openldap
+ldapschemadir = $(sysconfdir)/openldap/schema
+ldapconf_SCRIPTS = slapd-lustre.conf
+ldapschema_SCRIPTS = lustre.schema
+
+include $(top_srcdir)/Rules
+
diff --git a/lustre/conf/lustre.dtd b/lustre/conf/lustre.dtd
new file mode 100644 (file)
index 0000000..73f7c95
--- /dev/null
@@ -0,0 +1,111 @@
+<!-- Lustre Management DTD -->
+
+<!-- basic entities -->
+<!ENTITY % object.content "(#PCDATA)">
+<!ENTITY % object.attr "
+             name CDATA #REQUIRED
+             uuid CDATA #REQUIRED">
+
+<!ENTITY % objref.content "(#PCDATA)">
+<!ENTITY % objref.attr    "uuidref CDATA #REQUIRED">
+
+<!-- main elements -->
+<!ELEMENT lustre (node | profile | mountpoint | ldlm | echoclient |
+                  mds | obd | ost | lov | lovconfig)*>
+
+<!ELEMENT node (network | profile_ref)*>
+<!ATTLIST node %object.attr;
+               router CDATA #IMPLIED>
+               
+<!-- A network interface description nested inside a node.
+     NOTE(review): this content model names route_tbl, sendmem and recvmem,
+     but the elements declared in this DTD are routetbl, send_mem and
+     recv_mem.  Confirm which spellings the tools emit and make both
+     places agree. -->
+<!ELEMENT network (nid | port | route_tbl | sendmem | recvmem)*>
+<!ATTLIST network %object.attr;
+                  nettype (tcp | elan | gm) 'tcp'>
+
+<!ELEMENT routetbl (route)*>
+<!ATTLIST routetbl %object.attr;>
+<!ELEMENT route %object.content;>
+<!ATTLIST route type (elan | tcp | gm) #REQUIRED
+                gw CDATA #REQUIRED
+                lo CDATA #REQUIRED
+                hi CDATA #IMPLIED >
+
+<!ELEMENT profile (ldlm_ref | network_ref | obd_ref | ost_ref |
+                   echoclient_ref | mdsdev_ref | lov_ref |
+                   lovconfig_ref| mountpoint_ref)*>
+<!ATTLIST profile %object.attr;>
+
+<!ELEMENT mountpoint (path | fileset | mds_ref | obd_ref)*>
+<!ATTLIST mountpoint %object.attr;>
+
+<!ELEMENT echoclient (obd_ref)>
+<!ATTLIST echoclient %object.attr;>
+
+<!ELEMENT ldlm EMPTY>
+<!ATTLIST ldlm %object.attr;>
+
+<!-- An object storage device: backing filesystem type, device path and
+     size, and whether it may be formatted automatically.
+     NOTE(review): active_ref is referenced here (and by mds) but is not
+     declared in this DTD, while activetarget is declared but not
+     referenced by any content model.  Confirm the intended name. -->
+<!ELEMENT obd (fstype | devpath | devsize | autoformat | active_ref)*>
+<!ATTLIST obd %object.attr; 
+              obdtype (obdfilter | obdecho) 'obdfilter'>
+
+<!ELEMENT ost (network_ref | obd_ref | failover_ref)*>
+<!ATTLIST ost %object.attr;>
+
+<!ELEMENT mds (active_ref)*>
+<!ATTLIST mds %object.attr;>
+
+<!ELEMENT mdsdev (fstype | devpath | devsize | autoformat | 
+                  mds_ref | network_ref )*>
+<!ATTLIST mdsdev %object.attr;>
+
+<!ELEMENT lov (mds_ref |(obd_ref)+)*>
+<!ATTLIST lov %object.attr;
+               stripesize    CDATA #REQUIRED
+               stripecount   CDATA #REQUIRED
+               stripeoffset  CDATA #IMPLIED
+               stripepattern CDATA #REQUIRED>
+
+<!ELEMENT lovconfig (lov_ref)>
+<!ATTLIST lovconfig %object.attr;>
+
+<!-- basic elements -->
+<!ELEMENT fstype        %object.content;>
+<!ELEMENT nid           %object.content;>
+<!ELEMENT port          %object.content;>
+<!ELEMENT send_mem      %object.content;>
+<!ELEMENT recv_mem      %object.content;>
+<!ELEMENT autoformat    %object.content;>
+<!ELEMENT activetarget  %object.content;>
+<!ELEMENT devpath       %object.content;>
+<!ELEMENT devsize       %object.content;>
+<!ELEMENT path          %object.content;>
+<!ELEMENT fileset       %object.content;>
+
+<!-- object reference tag elements -->
+<!ELEMENT network_ref     %objref.content;>
+<!ATTLIST network_ref     %objref.attr;>
+<!ELEMENT node_ref        %objref.content;>
+<!ATTLIST node_ref        %objref.attr;>
+<!ELEMENT profile_ref     %objref.content;>
+<!ATTLIST profile_ref     %objref.attr;>
+<!ELEMENT obd_ref         %objref.content;>
+<!ATTLIST obd_ref         %objref.attr;>
+<!ELEMENT mds_ref         %objref.content;>
+<!ATTLIST mds_ref         %objref.attr;>
+<!ELEMENT mdsdev_ref      %objref.content;>
+<!ATTLIST mdsdev_ref      %objref.attr;>
+<!ELEMENT ost_ref         %objref.content;>
+<!ATTLIST ost_ref         %objref.attr;>
+<!ELEMENT lov_ref         %objref.content;>
+<!ATTLIST lov_ref         %objref.attr;>
+<!ELEMENT lovconfig_ref   %objref.content;>
+<!ATTLIST lovconfig_ref   %objref.attr;>
+<!ELEMENT mountpoint_ref  %objref.content;>
+<!ATTLIST mountpoint_ref  %objref.attr;>
+<!ELEMENT echoclient_ref %objref.content;>
+<!ATTLIST echoclient_ref %objref.attr;>
+<!ELEMENT failover_ref    %objref.content;>
+<!ATTLIST failover_ref    %objref.attr;>
+<!ELEMENT ldlm_ref        %objref.content;>
+<!ATTLIST ldlm_ref        %objref.attr;>
+
+
diff --git a/lustre/conf/lustre2ldif.xsl b/lustre/conf/lustre2ldif.xsl
new file mode 100644 (file)
index 0000000..f5d8098
--- /dev/null
@@ -0,0 +1,212 @@
+<?xml version="1.0" encoding="iso-8859-1"?>
+<stylesheet version="1.0" xmlns="http://www.w3.org/1999/XSL/Transform"> 
+<output omit-xml-declaration="yes" />
+<strip-space elements="*"/>
+<param name="config">fs=lustre</param>
+<variable name="basedn">config=<value-of select="$config"/>,fs=lustre</variable>
+
+<template match="lustre">
+dn: <value-of select="$basedn"/>
+uuid: CONFIG_UUID
+objectClass: LUSTRECONFIG
+config: <value-of select="$config"/>
+<text>
+</text><apply-templates/>
+</template>
+
+<template match="node">
+dn: uuid=<value-of select="@uuid"/>,<value-of select="$basedn"/>
+objectClass: NODE
+lustreName: <value-of select="@name"/>
+uuid: <value-of select="@uuid"/>
+networkRef: <value-of select="network/@uuid"/>
+<for-each select="profile_ref">
+profileRef: <value-of select="@uuidref"/>
+</for-each>
+<text>
+</text><apply-templates/>
+</template>
+
+<template match="profile">
+dn: uuid=<value-of select="@uuid"/>,<value-of select="$basedn"/>
+objectClass: PROFILE
+lustreName: <value-of select="@name"/>
+uuid: <value-of select="@uuid"/><apply-templates/>
+<text>
+</text>
+</template>
+
+<template match="network">
+dn: uuid=<value-of select="@uuid"/>,<value-of select="$basedn"/>
+objectClass: NETWORK
+lustreName: <value-of select="@name"/>
+uuid: <value-of select="@uuid"/>
+nettype: <value-of select="@nettype"/>
+nid: <value-of select="nid"/>
+<if test="port">
+port: <value-of select="port"/>
+</if>
+<text>
+</text>
+</template>
+
+<template match="mds">
+dn: uuid=<value-of select="@uuid"/>,<value-of select="$basedn"/>
+objectClass: MDS
+lustreName: <value-of select="@name"/>
+uuid: <value-of select="@uuid"/><apply-templates/>
+<text>
+</text>
+</template>
+
+<template match="mdsdev">
+dn: uuid=<value-of select="@uuid"/>,<value-of select="$basedn"/>
+objectClass: MDSDEV
+lustreName: <value-of select="@name"/>
+uuid: <value-of select="@uuid"/>
+<if test="fstype">
+fstype: <value-of select="fstype"/>
+</if>
+<if test="autoformat">
+autoformat: <value-of select="autoformat"/>
+</if>
+<if test="devpath">
+devpath: <value-of select="devpath"/>
+</if>
+<if test="devsize">
+devsize: <value-of select="devsize"/>
+</if>
+networkRef: <value-of select="network_ref/@uuidref"/>
+mdsRef: <value-of select="mds_ref/@uuidref"/>
+<text>
+</text>
+</template>
+
+<template match="lov">
+dn: uuid=<value-of select="@uuid"/>,<value-of select="$basedn"/>
+objectClass: LOV
+lustreName: <value-of select="@name"/>
+uuid: <value-of select="@uuid"/>
+mdsRef: <value-of select="mds_ref/@uuidref"/>
+stripepattern: <value-of select="@stripepattern"/>
+stripesize: <value-of select="@stripesize"/>
+stripecount: <value-of select="@stripecount"/><apply-templates/>
+<text>
+</text>
+</template>
+
+<template match="lovconfig">
+dn: uuid=<value-of select="@uuid"/>,<value-of select="$basedn"/>
+objectClass: LOVCONFIG
+lustreName: <value-of select="@name"/>
+uuid: <value-of select="@uuid"/><apply-templates/>
+<text>
+</text>
+</template>
+
+<template match="obd">
+dn: uuid=<value-of select="@uuid"/>,<value-of select="$basedn"/>
+objectClass: OBD
+lustreName: <value-of select="@name"/>
+uuid: <value-of select="@uuid"/>
+activeRef: <value-of select="active_ref/@uuidref"/>
+obdtype: <value-of select="@obdtype"/>
+<if test="fstype">
+fstype: <value-of select="fstype"/>
+</if>
+<if test="autoformat">
+autoformat: <value-of select="autoformat"/>
+</if>
+<if test="devpath">
+devpath: <value-of select="devpath"/>
+</if>
+<if test="devsize">
+devsize: <value-of select="devsize"/>
+</if>
+<text>
+</text>
+</template>
+
+<template match="ost">
+dn: uuid=<value-of select="@uuid"/>,<value-of select="$basedn"/>
+objectClass: OST
+lustreName: <value-of select="@name"/>
+uuid: <value-of select="@uuid"/><apply-templates/>
+<text>
+</text>
+</template>
+
+<template match="mountpoint">
+dn: uuid=<value-of select="@uuid"/>,<value-of select="$basedn"/>
+objectClass: MOUNTPOINT
+lustreName: <value-of select="@name"/>
+uuid: <value-of select="@uuid"/><apply-templates/>
+<text>
+</text>
+</template>
+
+<template match="echoclient">
+dn: uuid=<value-of select="@uuid"/>,<value-of select="$basedn"/>
+objectClass: ECHOCLIENT
+lustreName: <value-of select="@name"/>
+uuid: <value-of select="@uuid"/><apply-templates/>
+<text>
+</text>
+</template>
+
+<template match="ldlm">
+dn: uuid=<value-of select="@uuid"/>,<value-of select="$basedn"/>
+objectClass: LDLM
+lustreName: <value-of select="@name"/>
+uuid: <value-of select="@uuid"/>
+<text>
+</text>
+</template>
+
+
+<template match="ldlm_ref">
+ldlmRef: <value-of select="@uuidref"/>
+</template>
+
+<template match="obd_ref">
+obdRef: <value-of select="@uuidref"/>
+</template>
+
+<template match="ost_ref">
+ostRef: <value-of select="@uuidref"/>
+</template>
+
+<template match="network_ref">
+networkRef: <value-of select="@uuidref"/>
+</template>
+
+<template match="mds_ref">
+mdsRef: <value-of select="@uuidref"/>
+</template>
+
+<template match="mountpoint_ref">
+mountpointRef: <value-of select="@uuidref"/>
+</template>
+
+<template match="echoclient_ref">
+echoclientRef: <value-of select="@uuidref"/>
+</template>
+
+<template match="lov_ref">
+lovRef: <value-of select="@uuidref"/>
+</template>
+
+<template match="lovconfig_ref">
+lovconfigRef: <value-of select="@uuidref"/>
+</template>
+
+<template match="path">
+path: <value-of select="."/>
+</template>
+
+<template match="active_ref">
+activeRef: <value-of select="@uuidref"/>
+</template>
+</stylesheet>
+
+
diff --git a/lustre/conf/slapd-lustre.conf b/lustre/conf/slapd-lustre.conf
new file mode 100644 (file)
index 0000000..de89c76
--- /dev/null
@@ -0,0 +1,12 @@
+#######################################################################
+# lustre ldap config database
+# $Id: slapd-lustre.conf,v 1.2 2003/01/06 22:17:53 adilger Exp $
+#######################################################################
+
+database       ldbm
+suffix         "fs=lustre"
+rootdn         "cn=Manager,fs=lustre"
+include                /etc/openldap/schema/lustre.schema
+rootpw         secret
+directory      /var/lib/ldap/lustre
+index           objectClass eq, uuid eq
diff --git a/lustre/conf/top.ldif b/lustre/conf/top.ldif
new file mode 100644 (file)
index 0000000..8629444
--- /dev/null
@@ -0,0 +1,4 @@
+dn: fs=lustre
+fs:lustre
+objectClass: lustre
+desc: Lustre Config
index bd378b7..c172cd2 100644 (file)
@@ -128,6 +128,7 @@ AC_SUBST(demodir)
 
 AC_OUTPUT(Makefile lib/Makefile ldlm/Makefile obdecho/Makefile ptlrpc/Makefile \
        lov/Makefile osc/Makefile mdc/Makefile mds/Makefile ost/Makefile \
+       cobd/Makefile ptlbd/Makefile conf/Makefile \
        utils/Makefile utils/lconf tests/Makefile obdfilter/Makefile \
         obdclass/Makefile llite/Makefile doc/Makefile scripts/Makefile \
        scripts/lustre.spec extN/Makefile, chmod +x utils/lconf)
index 33c6d07..5ad1642 100644 (file)
@@ -16,7 +16,8 @@ EXTRA_PROGRAMS = extN
 EXTN_FIXES = patch-2.4.18-chaos22
 #EXTN_FIXES = ext3-2.4.18-fixes.diff
 EXTNP = htree-ext3-2.4.18.diff linux-2.4.18ea-0.8.26.diff
-EXTNP+= ext3-2.4.18-ino_sb_macro.diff extN-misc-fixup.diff
+EXTNP+= ext3-2.4.18-ino_sb_macro.diff extN-misc-fixup.diff extN-noread.diff
+EXTNP+= extN-wantedi.diff
 EXTNC = balloc.c bitmap.c dir.c file.c fsync.c ialloc.c inode.c ioctl.c
 EXTNC+= namei.c super.c symlink.c
 EXTNI = extN_fs.h extN_fs_i.h extN_fs_sb.h extN_jbd.h quotaops.h
@@ -52,31 +53,27 @@ diff:
        $(RM) extN.patchT
        l='$(EXTNC)'; for f in $$l; do                                        \
           echo "$$f";                                                         \
-          (diff -u $(extN_orig)/$$f extN/$$f) >> extN.patchT;              \
-          test $$? -le 1 || exit 1;                                       
+          (diff -u $(extN_orig)/$$f extN/$$f) >> extN.patchT;                 \
+          test $$? -le 1 || exit 1;                                           \
        done
        l='$(EXTNI)'; for f in $$l; do                                        \
           echo "$$f";                                                         \
           (diff -u $(extN_include_orig)/$$f $(top_srcdir)/include/linux/$$f)>>extN.patchT;\
-          test $$? -le 1 || exit 1;                                          
+          test $$? -le 1 || exit 1;                                           \
        done
        l='$(EXTN_EXTRA)'; for f in $$l; do                                   \
           f=`echo "$$f" | sed 's%^fs/%%'`;                                    \
           echo "$$f";                                                         \
           (cd $(top_srcdir) &&                                                \
             diff -u /dev/null $$f) >> extN.patchT;                            \
-          test $$? -le 1 || exit 1;                                          
+          test $$? -le 1 || exit 1;                                           \
        done
        mv -f extN.patchT $(top_builddir)/$(subdir)/extN.patch-$(RELEASE)
        echo "Don't forget to add $(srcdir)/extN.patch-$(RELEASE) to CVS!"
 
-
-
 .PHONY: diff
 
 # Just do the SUB transformation on all our source files.
-
-
 sed-stamp:
        $(RM) $@
        rm -rf $(extN_orig) $(extN_include_orig)
diff --git a/lustre/extN/extN-noread.diff b/lustre/extN/extN-noread.diff
new file mode 100644 (file)
index 0000000..463516c
--- /dev/null
@@ -0,0 +1,225 @@
+diff -ru lustre-head/fs/extN/ialloc.c lustre/fs/extN/ialloc.c
+--- lustre-head/fs/extN/ialloc.c       Mon Dec 23 10:02:58 2002
++++ lustre/fs/extN/ialloc.c    Mon Dec 23 09:46:20 2002
+@@ -289,6 +289,37 @@
+ }
+ /*
++ * @block_group: block group of inode
++ * @offset: relative offset of inode within @block_group
++ *
++ * Check whether any of the inodes in this disk block are in use.
++ *
++ * Caller must be holding superblock lock (group/bitmap read lock in future).
++ */
++int extN_itable_block_used(struct super_block *sb, unsigned int block_group,
++                         int offset)
++{
++      int bitmap_nr = load_inode_bitmap(sb, block_group);
++      int inodes_per_block;
++      unsigned long inum, iend;
++      struct buffer_head *ibitmap;
++
++      if (bitmap_nr < 0)
++              return 1;
++
++      inodes_per_block = sb->s_blocksize / EXTN_SB(sb)->s_inode_size;
++      inum = offset & ~(inodes_per_block - 1);
++      iend = inum + inodes_per_block;
++      ibitmap = EXTN_SB(sb)->s_inode_bitmap[bitmap_nr];
++      for (; inum < iend; inum++) {
++              if (inum != offset && extN_test_bit(inum, ibitmap->b_data))
++                      return 1;
++      }
++
++      return 0;
++}
++
++/*
+  * There are two policies for allocating an inode.  If the new inode is
+  * a directory, then a forward search is made for a block group with both
+  * free space and a low directory-to-inode ratio; if that fails, then of
+@@ -312,6 +343,7 @@
+       struct extN_group_desc * gdp;
+       struct extN_group_desc * tmp;
+       struct extN_super_block * es;
++      struct extN_iloc iloc;
+       int err = 0;
+       /* Cannot create files in a deleted directory */
+@@ -505,7 +538,7 @@
+       ei->i_prealloc_count = 0;
+ #endif
+       ei->i_block_group = i;
+-      
++
+       if (ei->i_flags & EXTN_SYNC_FL)
+               inode->i_flags |= S_SYNC;
+       if (IS_SYNC(inode))
+@@ -514,9 +547,18 @@
+       inode->i_generation = sbi->s_next_generation++;
+       ei->i_state = EXTN_STATE_NEW;
+-      err = extN_mark_inode_dirty(handle, inode);
++      err = extN_get_inode_loc_new(inode, &iloc, 1);
+       if (err) goto fail;
+-      
++      BUFFER_TRACE(iloc.bh, "get_write_access");
++      err = extN_journal_get_write_access(handle, iloc.bh);
++      if (err) {
++              brelse(iloc.bh);
++              iloc.bh = NULL;
++              goto fail;
++      }
++      err = extN_mark_iloc_dirty(handle, inode, &iloc);
++      if (err) goto fail;
++
+       unlock_super (sb);
+       if(DQUOT_ALLOC_INODE(inode)) {
+               DQUOT_DROP(inode);
+diff -ru lustre-head/fs/extN/inode.c lustre/fs/extN/inode.c
+--- lustre-head/fs/extN/inode.c        Mon Dec 23 10:02:58 2002
++++ lustre/fs/extN/inode.c     Mon Dec 23 09:50:25 2002
+@@ -2011,23 +1994,32 @@
+       extN_journal_stop(handle, inode);
+ }
+-/* 
+- * extN_get_inode_loc returns with an extra refcount against the
+- * inode's underlying buffer_head on success. 
+- */
++extern int extN_itable_block_used(struct super_block *sb,
++                                unsigned int block_group,
++                                int offset);
++
++#define NUM_INODE_PREREAD 16
+-int extN_get_inode_loc (struct inode *inode, struct extN_iloc *iloc)
++/*
++ * extN_get_inode_loc returns with an extra refcount against the inode's
++ * underlying buffer_head on success.  If this is for a new inode allocation
++ * (new is non-zero) then we may be able to optimize away the read if there
++ * are no other in-use inodes in this inode table block.  If we need to do
++ * a read, then read in a whole chunk of blocks to avoid blocking again soon
++ * if we are doing lots of creates/updates.
++ */
++int extN_get_inode_loc_new(struct inode *inode, struct extN_iloc *iloc, int new)
+ {
+       struct super_block *sb = inode->i_sb;
+       struct extN_sb_info *sbi = EXTN_SB(sb);
+-      struct buffer_head *bh = 0;
++      struct buffer_head *bh[NUM_INODE_PREREAD];
+       unsigned long block;
+       unsigned long block_group;
+       unsigned long group_desc;
+       unsigned long desc;
+       unsigned long offset;
+       struct extN_group_desc * gdp;
+-              
++
+       if ((inode->i_ino != EXTN_ROOT_INO &&
+               inode->i_ino != EXTN_JOURNAL_INO &&
+               inode->i_ino < EXTN_FIRST_INO(sb)) ||
+@@ -2042,38 +2034,86 @@
+       }
+       group_desc = block_group >> sbi->s_desc_per_block_bits;
+       desc = block_group & (sbi->s_desc_per_block - 1);
+-      bh = sbi->s_group_desc[group_desc];
+-      if (!bh) {
++      if (!sbi->s_group_desc[group_desc]) {
+               extN_error(sb, __FUNCTION__, "Descriptor not loaded");
+               goto bad_inode;
+       }
+-      gdp = (struct extN_group_desc *) bh->b_data;
++      gdp = (struct extN_group_desc *)(sbi->s_group_desc[group_desc]->b_data);
++
+       /*
+        * Figure out the offset within the block group inode table
+        */
+-      offset = ((inode->i_ino - 1) % sbi->s_inodes_per_group) *
+-              sbi->s_inode_size;
++      offset = ((inode->i_ino - 1) % sbi->s_inodes_per_group);
++
+       block = le32_to_cpu(gdp[desc].bg_inode_table) +
+-              (offset >> EXTN_BLOCK_SIZE_BITS(sb));
+-      if (!(bh = sb_bread(sb, block))) {
+-              extN_error (sb, __FUNCTION__,
+-                          "unable to read inode block - "
+-                          "inode=%lu, block=%lu", inode->i_ino, block);
+-              goto bad_inode;
++              (offset * sbi->s_inode_size >> EXTN_BLOCK_SIZE_BITS(sb));
++
++      bh[0] = sb_getblk(sb, block);
++      if (buffer_uptodate(bh[0]))
++              goto done;
++
++      /* If we don't really need to read this block, and it isn't already
++       * in memory, then we just zero it out.  Otherwise, we keep the
++       * current block contents (deleted inode data) for posterity.
++       */
++      if (new && !extN_itable_block_used(sb, block_group, offset)) {
++              lock_buffer(bh[0]);
++              memset(bh[0]->b_data, 0, bh[0]->b_size);
++              mark_buffer_uptodate(bh[0], 1);
++              unlock_buffer(bh[0]);
++      } else {
++              unsigned long block_end, itable_end;
++              int count = 1;
++
++              itable_end = le32_to_cpu(gdp[desc].bg_inode_table) +
++                              sbi->s_itb_per_group;
++              block_end = block + NUM_INODE_PREREAD;
++              if (block_end > itable_end)
++                      block_end = itable_end;
++
++              for (++block; block < block_end; block++) { /* bh[0] has 1st */
++                      bh[count] = sb_getblk(sb, block);
++                      if (count && (buffer_uptodate(bh[count]) ||
++                                    buffer_locked(bh[count]))) {
++                              __brelse(bh[count]);
++                      } else
++                              count++;
++              }
++
++              ll_rw_block(READ, count, bh);
++
++              /* Release all but the block we actually need (bh[0]) */
++              while (--count > 0)
++                      __brelse(bh[count]);
++
++              wait_on_buffer(bh[0]);
++              if (!buffer_uptodate(bh[0])) {
++                      extN_error(sb, __FUNCTION__, "unable to read inode block"
++                                 " - inode=%lu, block=%lu", inode->i_ino,
++                                 bh[0]->b_blocknr);
++                      brelse(bh[0]); /* drop the sb_getblk() reference */
++                      goto bad_inode;
++              }
+       }
+-      offset &= (EXTN_BLOCK_SIZE(sb) - 1);
++ done:
++      offset = (offset * sbi->s_inode_size) & (EXTN_BLOCK_SIZE(sb) - 1);
+-      iloc->bh = bh;
+-      iloc->raw_inode = (struct extN_inode *) (bh->b_data + offset);
++      iloc->bh = bh[0];
++      iloc->raw_inode = (struct extN_inode *)(bh[0]->b_data + offset);
+       iloc->block_group = block_group;
+-      
++
+       return 0;
+-      
++
+  bad_inode:
+       return -EIO;
+ }
++int extN_get_inode_loc(struct inode *inode, struct extN_iloc *iloc)
++{
++      return extN_get_inode_loc_new(inode, iloc, 0);
++}
++
+ void extN_read_inode(struct inode * inode)
+ {
+       struct extN_iloc iloc;
diff --git a/lustre/extN/extN-wantedi.diff b/lustre/extN/extN-wantedi.diff
new file mode 100644 (file)
index 0000000..3be559f
--- /dev/null
@@ -0,0 +1,163 @@
+--- lustre/extN-clean/namei.c  2002-12-30 05:56:09.000000000 -0500
++++ lustre/extN/namei.c        2002-12-30 06:29:39.000000000 -0500
+@@ -1224,7 +1224,8 @@
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+-      inode = extN_new_inode (handle, dir, mode);
++      inode = extN_new_inode (handle, dir, mode,
++                              (unsigned long)dentry->d_fsdata);
+       err = PTR_ERR(inode);
+       if (!IS_ERR(inode)) {
+               inode->i_op = &extN_file_inode_operations;
+@@ -1254,7 +1254,8 @@
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+-      inode = extN_new_inode (handle, dir, mode);
++      inode = extN_new_inode (handle, dir, mode,
++                              (unsigned long)dentry->d_fsdata);
+       err = PTR_ERR(inode);
+       if (!IS_ERR(inode)) {
+               init_special_inode(inode, mode, rdev);
+@@ -1286,7 +1286,8 @@
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+-      inode = extN_new_inode (handle, dir, S_IFDIR | mode);
++      inode = extN_new_inode (handle, dir, S_IFDIR | mode,
++                              (unsigned long)dentry->d_fsdata);
+       err = PTR_ERR(inode);
+       if (IS_ERR(inode))
+               goto out_stop;
+@@ -1680,7 +1681,8 @@
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+-      inode = extN_new_inode (handle, dir, S_IFLNK|S_IRWXUGO);
++      inode = extN_new_inode (handle, dir, S_IFLNK|S_IRWXUGO,
++                              (unsigned long)dentry->d_fsdata);
+       err = PTR_ERR(inode);
+       if (IS_ERR(inode))
+               goto out_stop;
+--- lustre/extN-clean/ialloc.c 2002-12-28 23:56:42.000000000 -0500
++++ lustre/extN/ialloc.c       2002-12-30 06:29:39.000000000 -0500
+@@ -329,8 +329,8 @@
+  * For other inodes, search forward from the parent directory's block
+  * group to find a free inode.
+  */
+-struct inode * extN_new_inode (handle_t *handle,
+-                              const struct inode * dir, int mode)
++struct inode *extN_new_inode(handle_t *handle, const struct inode *dir,
++                           int mode, unsigned long goal)
+ {
+       struct super_block * sb;
+       struct buffer_head * bh;
+@@ -360,6 +361,38 @@
+       lock_super (sb);
+       es = sbi->s_es;
++
++      if (goal) {
++              i = (goal - 1) / EXTN_INODES_PER_GROUP(sb);
++              j = (goal - 1) % EXTN_INODES_PER_GROUP(sb);
++              gdp = extN_get_group_desc(sb, i, &bh2);
++
++              bitmap_nr = load_inode_bitmap (sb, i);
++              if (bitmap_nr < 0)
++                      goto fail;
++
++              bh = sbi->s_inode_bitmap[bitmap_nr];
++
++              BUFFER_TRACE(bh, "get_write_access");
++              err = extN_journal_get_write_access(handle, bh);
++              if (err) goto fail;
++
++              if (extN_set_bit(j, bh->b_data)) {
++                      printk(KERN_ERR "goal inode %lu unavailable\n", goal);
++                      /* Oh well, we tried. */
++                      goto repeat;
++              }
++
++              BUFFER_TRACE(bh, "call extN_journal_dirty_metadata");
++              err = extN_journal_dirty_metadata(handle, bh);
++              if (err) goto fail;
++
++              /* We've shortcircuited the allocation system successfully,
++               * now finish filling in the inode.
++               */
++              goto have_bit_and_group;
++      }
++
+ repeat:
+       gdp = NULL;
+       i = 0;
+@@ -474,6 +509,7 @@
+               }
+               goto repeat;
+       }
++have_bit_and_group:
+       j += i * sbi->s_inodes_per_group + 1;
+       if (j < sbi->s_first_ino || j > le32_to_cpu(es->s_inodes_count)) {
+               extN_error (sb, "extN_new_inode",
+--- lustre/extN-clean/ioctl.c  2002-12-28 23:56:42.000000000 -0500
++++ lustre/extN/ioctl.c        2002-12-30 06:29:39.000000000 -0500
+@@ -24,6 +24,34 @@
+       extN_debug ("cmd = %u, arg = %lu\n", cmd, arg);
+       switch (cmd) {
++      case EXTN_IOC_CREATE_INUM: {
++              char name[32];
++              struct dentry *dchild, *dparent;
++              int rc = 0;
++
++              dparent = list_entry(inode->i_dentry.next, struct dentry,
++                                   d_alias);
++              snprintf(name, sizeof name, "%lu", arg);
++              dchild = lookup_one_len(name, dparent, strlen(name));
++              /* lookup_one_len() reports failure as an ERR_PTR, not NULL */
++              if (IS_ERR(dchild))
++                      return PTR_ERR(dchild);
++              if (dchild->d_inode) {
++                      printk(KERN_ERR "%.*s/%lu already exists (ino %lu)\n",
++                             (int)dparent->d_name.len, dparent->d_name.name,
++                             arg, dchild->d_inode->i_ino);
++                      rc = -EEXIST;
++              } else {
++                      dchild->d_fsdata = (void *)arg;
++                      rc = vfs_create(inode, dchild, 0644);
++                      if (rc)
++                              printk(KERN_ERR "vfs_create: %d\n", rc);
++                      else if (dchild->d_inode->i_ino != arg)
++                              rc = -EEXIST;
++              }
++              dput(dchild);
++              return rc;
++      }
+       case EXTN_IOC_GETFLAGS:
+               flags = ei->i_flags & EXTN_FL_USER_VISIBLE;
+               return put_user(flags, (int *) arg);
+--- lustre/include/linux/extN_fs.h~    2002-12-30 06:01:43.000000000 -0500
++++ lustre/include/linux/extN_fs.h     2002-12-30 06:02:51.000000000 -0500
+@@ -200,6 +200,7 @@
+ #define       EXTN_IOC_SETFLAGS               _IOW('f', 2, long)
+ #define       EXTN_IOC_GETVERSION             _IOR('f', 3, long)
+ #define       EXTN_IOC_SETVERSION             _IOW('f', 4, long)
++/* EXTN_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */
+ #define       EXTN_IOC_GETVERSION_OLD         _IOR('v', 1, long)
+ #define       EXTN_IOC_SETVERSION_OLD         _IOW('v', 2, long)
+ #ifdef CONFIG_JBD_DEBUG
+@@ -632,7 +633,8 @@
+ extern int extN_sync_file (struct file *, struct dentry *, int);
+ /* ialloc.c */
+-extern struct inode * extN_new_inode (handle_t *, const struct inode *, int);
++extern struct inode * extN_new_inode (handle_t *, const struct inode *, int,
++                                    unsigned long);
+ extern void extN_free_inode (handle_t *, struct inode *);
+ extern struct inode * extN_orphan_get (struct super_block *, ino_t);
+ extern unsigned long extN_count_free_inodes (struct super_block *);
+@@ -714,4 +716,6 @@
+ #endif        /* __KERNEL__ */
++#define EXTN_IOC_CREATE_INUM                  _IOW('f', 5, long)
++
+ #endif        /* _LINUX_EXTN_FS_H */
index c1382a9..e552dfd 100644 (file)
@@ -145,7 +145,7 @@ typedef int (*ldlm_blocking_callback)(struct ldlm_lock *lock,
 typedef int (*ldlm_completion_callback)(struct ldlm_lock *lock, int flags);
 
 struct ldlm_lock {
-        __u64                 l_random;
+        struct portals_handle l_handle; // must be first in the structure
         atomic_t              l_refc;
         struct ldlm_resource *l_resource;
         struct ldlm_lock     *l_parent;
@@ -183,8 +183,9 @@ struct ldlm_lock {
 };
 
 typedef int (*ldlm_res_compat)(struct ldlm_lock *child, struct ldlm_lock *new);
-typedef int (*ldlm_res_policy)(struct ldlm_lock *lock, void *req_cookie,
-                               ldlm_mode_t mode, int flags, void *data);
+typedef int (*ldlm_res_policy)(struct ldlm_namespace *, struct ldlm_lock *,
+                               void *req_cookie, ldlm_mode_t mode, int flags,
+                               void *data);
 
 #define LDLM_PLAIN       10
 #define LDLM_EXTENT      11
@@ -246,22 +247,24 @@ extern char *ldlm_it2str(int it);
 do {                                                                          \
         if (lock->l_resource == NULL) {                                       \
                 CDEBUG(D_DLMTRACE, "### " format                              \
-                       " ns: \?\? lock: %p lrc: %d/%d,%d mode: %s/%s "        \
+                       " ns: \?\? lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "\
                        "res: \?\? rrc=\?\? type: \?\?\? remote: "LPX64")\n"   \
-                       , ## a, lock, lock->l_refc, lock->l_readers,           \
-                       lock->l_writers,                                       \
+                       , ## a, lock, lock->l_handle.h_cookie,                 \
+                       atomic_read(&lock->l_refc),                            \
+                       lock->l_readers, lock->l_writers,                      \
                        ldlm_lockname[lock->l_granted_mode],                   \
                        ldlm_lockname[lock->l_req_mode],                       \
-                       lock->l_remote_handle.addr);                           \
+                       lock->l_remote_handle.cookie);                         \
                 break;                                                        \
         }                                                                     \
         if (lock->l_resource->lr_type == LDLM_EXTENT) {                       \
                 CDEBUG(D_DLMTRACE, "### " format                              \
-                       " ns: %s lock: %p lrc: %d/%d,%d mode: %s/%s res: "LPU64   \
-                       "/"LPU64" rrc: %d type: %s ["LPU64"->"LPU64"] remote: "  \
-                       LPX64"\n" , ## a,                                     \
+                       " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "  \
+                       "res: "LPU64"/"LPU64" rrc: %d type: %s ["LPU64"->"LPU64\
+                       "] remote: "LPX64"\n" , ## a,                          \
                        lock->l_resource->lr_namespace->ns_name, lock,         \
-                       lock->l_refc, lock->l_readers, lock->l_writers,        \
+                       lock->l_handle.h_cookie, atomic_read(&lock->l_refc),   \
+                       lock->l_readers, lock->l_writers,                      \
                        ldlm_lockname[lock->l_granted_mode],                   \
                        ldlm_lockname[lock->l_req_mode],                       \
                        lock->l_resource->lr_name[0],                          \
@@ -269,22 +272,24 @@ do {                                                                          \
                        atomic_read(&lock->l_resource->lr_refcount),           \
                        ldlm_typename[lock->l_resource->lr_type],              \
                        lock->l_extent.start, lock->l_extent.end,              \
-                       lock->l_remote_handle.addr);                           \
+                       lock->l_remote_handle.cookie);                         \
                 break;                                                        \
         }                                                                     \
         {                                                                     \
                 CDEBUG(D_DLMTRACE, "### " format                              \
-                       " ns: %s lock: %p lrc: %d/%d,%d mode: %s/%s res: "LPU64   \
-                       "/"LPU64" rrc: %d type: %s remote: "LPX64"\n" , ## a,   \
-                       lock->l_resource->lr_namespace->ns_name, lock,         \
-                       lock->l_refc, lock->l_readers, lock->l_writers,        \
+                       " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "  \
+                       "res: "LPU64"/"LPU64" rrc: %d type: %s remote: "LPX64  \
+                       "\n" , ## a, lock->l_resource->lr_namespace->ns_name,  \
+                       lock, lock->l_handle.h_cookie,                         \
+                       atomic_read (&lock->l_refc),                           \
+                       lock->l_readers, lock->l_writers,                      \
                        ldlm_lockname[lock->l_granted_mode],                   \
                        ldlm_lockname[lock->l_req_mode],                       \
                        lock->l_resource->lr_name[0],                          \
                        lock->l_resource->lr_name[1],                          \
                        atomic_read(&lock->l_resource->lr_refcount),           \
                        ldlm_typename[lock->l_resource->lr_type],              \
-                       lock->l_remote_handle.addr);                           \
+                       lock->l_remote_handle.cookie);                         \
         }                                                                     \
 } while (0)
 
@@ -295,22 +300,25 @@ do {                                                                          \
  * Iterators.
  */
 
-#define LDLM_ITER_CONTINUE 0 /* keep iterating */
-#define LDLM_ITER_STOP     1 /* stop iterating */
+#define LDLM_ITER_CONTINUE 1 /* keep iterating */
+#define LDLM_ITER_STOP     0 /* stop iterating */
 
 typedef int (*ldlm_iterator_t)(struct ldlm_lock *, void *);
+typedef int (*ldlm_res_iterator_t)(struct ldlm_resource *, void *);
 
 int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter,
                           void *closure);
 int ldlm_namespace_foreach(struct ldlm_namespace *ns, ldlm_iterator_t iter,
                            void *closure);
+int ldlm_namespace_foreach_res(struct ldlm_namespace *ns,
+                               ldlm_res_iterator_t iter, void *closure);
 
 int ldlm_replay_locks(struct obd_import *imp);
 
 /* ldlm_extent.c */
 int ldlm_extent_compat(struct ldlm_lock *, struct ldlm_lock *);
-int ldlm_extent_policy(struct ldlm_lock *, void *, ldlm_mode_t, int flags,
-                       void *);
+int ldlm_extent_policy(struct ldlm_namespace *, struct ldlm_lock *, void *,
+                       ldlm_mode_t, int flags, void *);
 
 /* ldlm_lockd.c */
 int ldlm_handle_enqueue(struct ptlrpc_request *req);
@@ -319,19 +327,17 @@ int ldlm_handle_cancel(struct ptlrpc_request *req);
 int ldlm_del_waiting_lock(struct ldlm_lock *lock);
 
 /* ldlm_lock.c */
-void ldlm_register_intent(int (*arg)(struct ldlm_lock *lock, void *req_cookie,
-                                     ldlm_mode_t mode, int flags, void *data));
+void ldlm_register_intent(ldlm_res_policy arg);
 void ldlm_unregister_intent(void);
 void ldlm_lock2handle(struct ldlm_lock *lock, struct lustre_handle *lockh);
-struct ldlm_lock *__ldlm_handle2lock(struct lustre_handle *, int strict,
-                                     int flags);
+struct ldlm_lock *__ldlm_handle2lock(struct lustre_handle *, int flags);
 void ldlm_cancel_callback(struct ldlm_lock *);
 int ldlm_lock_set_data(struct lustre_handle *, void *data, int datalen);
 void ldlm_lock_remove_from_lru(struct ldlm_lock *);
 
 static inline struct ldlm_lock *ldlm_handle2lock(struct lustre_handle *h)
 {
-        return __ldlm_handle2lock(h, 1, 0);
+        return __ldlm_handle2lock(h, 0);
 }
 
 #define LDLM_LOCK_PUT(lock)                     \
@@ -363,18 +369,19 @@ ldlm_lock_create(struct ldlm_namespace *ns,
                  struct lustre_handle *parent_lock_handle,
                  __u64 *res_id, __u32 type, ldlm_mode_t mode, void *data,
                  __u32 data_len);
-ldlm_error_t ldlm_lock_enqueue(struct ldlm_lock *lock, void *cookie,
-                               int cookie_len, int *flags,
+ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *, struct ldlm_lock *,
+                               void *cookie, int cookie_len, int *flags,
                                ldlm_completion_callback completion,
                                ldlm_blocking_callback blocking);
 struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode,
                                         int *flags);
 void ldlm_lock_cancel(struct ldlm_lock *lock);
 void ldlm_cancel_locks_for_export(struct obd_export *export);
-void ldlm_run_ast_work(struct list_head *rpc_list);
+int ldlm_run_ast_work(struct list_head *rpc_list);
 void ldlm_reprocess_all(struct ldlm_resource *res);
-void ldlm_lock_dump(struct ldlm_lock *lock);
-void ldlm_lock_dump_handle(struct lustre_handle *);
+void ldlm_reprocess_all_ns(struct ldlm_namespace *ns);
+void ldlm_lock_dump(int level, struct ldlm_lock *lock);
+void ldlm_lock_dump_handle(int level, struct lustre_handle *);
 
 /* ldlm_test.c */
 int ldlm_test(struct obd_device *device, struct lustre_handle *connh);
@@ -406,9 +413,11 @@ void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc);
 void ldlm_dump_all_namespaces(void);
 void ldlm_namespace_dump(struct ldlm_namespace *);
 void ldlm_resource_dump(struct ldlm_resource *);
-int ldlm_lock_change_resource(struct ldlm_lock *, __u64 new_resid[3]);
+int ldlm_lock_change_resource(struct ldlm_namespace *, struct ldlm_lock *,
+                              __u64 new_resid[3]);
 
 /* ldlm_request.c */
+int ldlm_expired_completion_wait(void *data);
 int ldlm_completion_ast(struct ldlm_lock *lock, int flags);
 int ldlm_cli_enqueue(struct lustre_handle *conn,
                      struct ptlrpc_request *req,
index ba9555c..342721c 100644 (file)
 #include <linux/obd_filter.h>
 
 struct lov_export_data {
+        spinlock_t       led_lock;
         struct list_head led_open_head;
 };
 
+struct ost_export_data {
+        __u8 oed_uuid[37]; /* client UUID */
+};
+
 struct obd_export {
         __u64                     exp_cookie;
         struct list_head          exp_obd_chain;
@@ -32,12 +37,14 @@ struct obd_export {
                 struct mds_export_data    eu_mds_data;
                 struct filter_export_data eu_filter_data;
                 struct lov_export_data    eu_lov_data;
+                struct ost_export_data    eu_ost_data;
         } u;
 };
 
 #define exp_mds_data    u.eu_mds_data
 #define exp_lov_data    u.eu_lov_data
 #define exp_filter_data u.eu_filter_data
+#define exp_ost_data    u.eu_ost_data
 
 extern struct obd_export *class_conn2export(struct lustre_handle *conn);
 extern struct obd_device *class_conn2obd(struct lustre_handle *conn);
index bfac4c3..87b0bf3 100644 (file)
@@ -52,10 +52,12 @@ int recovd_setup(struct recovd_obd *mgr);
 int recovd_cleanup(struct recovd_obd *mgr);
 
 extern struct recovd_obd *ptlrpc_recovd;
+struct ptlrpc_request;
 
 int ptlrpc_run_recovery_upcall(struct ptlrpc_connection *conn);
-int ptlrpc_reconnect_import(struct obd_import *imp, int rq_opc);
-int ptlrpc_replay(struct obd_import *imp, int send_last_flag);
+int ptlrpc_reconnect_import(struct obd_import *imp, int rq_opc,
+                            struct ptlrpc_request **reqptr);
+int ptlrpc_replay(struct obd_import *imp);
 int ptlrpc_resend(struct obd_import *imp);
 void ptlrpc_free_committed(struct obd_import *imp);
 void ptlrpc_wake_delayed(struct obd_import *imp);
index 0febd11..cc194ac 100644 (file)
@@ -72,6 +72,9 @@ typedef __u8 obd_uuid_t[37];
 #define LDLM_CB_REPLY_PORTAL       16
 #define LDLM_CANCEL_REQUEST_PORTAL     17
 #define LDLM_CANCEL_REPLY_PORTAL       18
+#define PTLBD_REQUEST_PORTAL           19
+#define PTLBD_REPLY_PORTAL             20
+#define PTLBD_BULK_PORTAL              21
 
 #define SVC_KILLED               1
 #define SVC_EVENT                2
@@ -126,9 +129,12 @@ struct lustre_msg {
 #define MSG_OP_FLAG_SHIFT  16
 
 /* Flags that apply to all requests are in the bottom 16 bits */
-#define MSG_GEN_FLAG_MASK  0x0000ffff
-#define MSG_LAST_REPLAY    1
-#define MSG_RESENT         2
+#define MSG_GEN_FLAG_MASK      0x0000ffff
+#define MSG_LAST_REPLAY        1
+#define MSG_RESENT             2
+
+/* XXX horrible interim hack -- see bug 578 */
+#define MSG_REPLAY_IN_PROGRESS 4
 
 static inline int lustre_msg_get_flags(struct lustre_msg *msg)
 {
@@ -231,13 +237,11 @@ struct lov_object_id { /* per-child structure */
 
 struct lov_mds_md {
         __u32 lmm_magic;
-        __u32 lmm_unused;          /* was packed size of extended attribute */
         __u64 lmm_object_id;       /* lov object id */
-        __u32 lmm_stripe_offset;   /* starting stripe offset in lmd_objects */
-        __u32 lmm_stripe_count;    /* number of stipes in use for this object */
-        __u64 lmm_stripe_size;     /* size of the stripe */
-        __u32 lmm_ost_count;       /* how many OST idx are in this LOV md */
-        __u32 lmm_stripe_pattern;  /* per-lov object stripe pattern */
+        __u32 lmm_stripe_size;     /* size of the stripe */
+        __u32 lmm_stripe_offset;   /* starting stripe offset in lmm_objects */
+        __u16 lmm_stripe_count;    /* number of stripes in use for this object */
+        __u16 lmm_ost_count;       /* how many OST idx are in this LOV md */
         struct lov_object_id lmm_objects[0];
 };
 
@@ -334,6 +338,7 @@ struct ost_body {
 #define MDS_GETSTATUS  9
 #define MDS_STATFS     10
 #define MDS_GETLOVINFO 11
+#define MDS_GETATTR_NAME 12
 
 #define REINT_SETATTR  1
 #define REINT_CREATE   2
@@ -549,4 +554,33 @@ struct ldlm_reply {
         __u64  lock_policy_res1;
         __u64  lock_policy_res2;
 };
+
+/*
+ * ptlbd, portal block device requests
+ */
+typedef enum {
+        PTLBD_QUERY = 200,
+        PTLBD_READ = 201,
+        PTLBD_WRITE = 202,
+} ptlbd_cmd_t;
+
+struct ptlbd_op {
+        __u16 op_cmd;
+        __u16 op_lun;
+        __u16 op_niob_cnt;
+        __u16 op__padding;
+        __u32 op_block_cnt;
+};
+
+struct ptlbd_niob {
+        __u64 n_xid;
+        __u64 n_block_nr;
+        __u32 n_offset;
+        __u32 n_length;
+};
+
+struct ptlbd_rsp {
+        __u16 r_status;
+        __u16 r_error_cnt;
+};
 #endif
index aa58c49..b1f9288 100644 (file)
@@ -53,10 +53,13 @@ struct ptlrpc_request;
 struct obd_device;
 struct recovd_data;
 struct recovd_obd;
+struct obd_export;
 #include <linux/lustre_ha.h>
 
 int target_handle_connect(struct ptlrpc_request *req);
 int target_handle_disconnect(struct ptlrpc_request *req);
+int target_handle_reconnect(struct lustre_handle *conn, struct obd_export *exp,
+                            char *cluuid);
 int client_obd_connect(struct lustre_handle *conn, struct obd_device *obd,
                        obd_uuid_t cluuid, struct recovd_obd *recovd,
                        ptlrpc_recovery_cb_t recover);
@@ -138,17 +141,6 @@ static inline void ldlm_object2handle(void *object, struct lustre_handle *handle
         handle->addr = (__u64)(unsigned long)object;
 }
 
-struct obd_statfs;
-struct statfs;
-void statfs_pack(struct obd_statfs *osfs, struct statfs *sfs);
-void statfs_unpack(struct statfs *sfs, struct obd_statfs *osfs);
-void obd_statfs_pack(struct obd_statfs *tgt, struct obd_statfs *src);
-static inline void
-obd_statfs_unpack(struct obd_statfs *tgt, struct obd_statfs *src)
-{
-        obd_statfs_pack(tgt, src);
-}
-
 #include <linux/portals_lib.h>
 
 /*
@@ -408,11 +400,13 @@ static inline int obd_ioctl_getdata(char **buf, int *len, void *arg)
         }
 
         if (data->ioc_inllen2) {
-                data->ioc_inlbuf2 = &data->ioc_bulk[0] + size_round(data->ioc_inllen1);
+                data->ioc_inlbuf2 = &data->ioc_bulk[0] +
+                        size_round(data->ioc_inllen1);
         }
 
         if (data->ioc_inllen3) {
-                data->ioc_inlbuf3 = &data->ioc_bulk[0] + size_round(data->ioc_inllen1) + 
+                data->ioc_inlbuf3 = &data->ioc_bulk[0] +
+                        size_round(data->ioc_inllen1) +
                         size_round(data->ioc_inllen2);
         }
 
@@ -426,7 +420,7 @@ static inline int obd_ioctl_getdata(char **buf, int *len, void *arg)
 #define OBD_IOC_CLEANUP                _IO  ('f', 103      )
 #define OBD_IOC_DESTROY                _IOW ('f', 104, long)
 #define OBD_IOC_PREALLOCATE            _IOWR('f', 105, long)
-#define OBD_IOC_DEC_USE_COUNT          _IO  ('f', 106      )
+
 #define OBD_IOC_SETATTR                _IOW ('f', 107, long)
 #define OBD_IOC_GETATTR                _IOR ('f', 108, long)
 #define OBD_IOC_READ                   _IOWR('f', 109, long)
index a965bcb..deb9656 100644 (file)
@@ -224,7 +224,7 @@ struct ldlm_lock;
 int ll_lock_callback(struct ldlm_lock *, struct ldlm_lock_desc *, void *data,
                      __u32 data_len, int flag);
 int ll_size_lock(struct inode *, struct lov_stripe_md *, obd_off start,
-                 int mode, struct lustre_handle **);
+                 int mode, struct lustre_handle *);
 int ll_size_unlock(struct inode *, struct lov_stripe_md *, int mode,
                    struct lustre_handle *);
 int ll_file_size(struct inode *inode, struct lov_stripe_md *md);
index 558c10b..7a02dae 100644 (file)
@@ -155,7 +155,7 @@ int mds_pack_md(struct mds_obd *mds, struct ptlrpc_request *req,
 
 /* mds/mds_fs.c */
 int mds_fs_setup(struct obd_device *obddev, struct vfsmount *mnt);
-void mds_fs_cleanup(struct obd_device *obddev);
+int mds_fs_cleanup(struct obd_device *obddev);
 
 /* mdc/mdc_request.c */
 int mdc_enqueue(struct lustre_handle *conn, int lock_type,
@@ -167,8 +167,11 @@ int mdc_getlovinfo(struct obd_device *obd, struct lustre_handle *mdc_connh,
                    struct ptlrpc_request **request);
 int mdc_getstatus(struct lustre_handle *conn, struct ll_fid *rootfid);
 int mdc_getattr(struct lustre_handle *conn,
-                obd_id ino, int type, unsigned long valid, size_t ea_size,
+                obd_id ino, int type, unsigned long valid, unsigned int ea_size,
                 struct ptlrpc_request **request);
+int mdc_getattr_name(struct lustre_handle *conn, struct inode *parent,
+                     char *filename, int namelen, unsigned long valid,
+                     unsigned int ea_size, struct ptlrpc_request **request);
 int mdc_setattr(struct lustre_handle *conn,
                 struct inode *, struct iattr *iattr, struct ptlrpc_request **);
 int mdc_open(struct lustre_handle *conn, obd_id ino, int type, int flags,
index 142db3b..081492c 100644 (file)
 
 #define LDLM_NUM_THREADS        4
 #define LDLM_NEVENTS    1024
-#define LDLM_NBUFS      20
-#define LDLM_BUFSIZE    (32 * 1024)
+#define LDLM_NBUFS      100
+#define LDLM_BUFSIZE    (8 * 1024)
 #define LDLM_MAXREQSIZE 1024
 
 #define MDT_NUM_THREADS 8
 #define MDS_NEVENTS     1024
-#define MDS_NBUFS       20
-#define MDS_BUFSIZE     (32 * 1024)
+#define MDS_NBUFS       100
+#define MDS_BUFSIZE     (8 * 1024)
 #define MDS_MAXREQSIZE  1024
 
 #define OST_NUM_THREADS 6
 #define OST_NEVENTS     min(num_physpages / 16, 32768UL)
-#define OST_NBUFS       min(OST_NEVENTS / 128, 256UL)
-#define OST_BUFSIZE     ((OST_NEVENTS > 4096UL ? 128 : 32) * 1024)
+#define OST_NBUFS       min(OST_NEVENTS / 128, 1280UL)
+#define OST_BUFSIZE     ((OST_NEVENTS > 4096UL ? 32 : 8) * 1024)
 #define OST_MAXREQSIZE  (8 * 1024)
 
+#define PTLBD_NUM_THREADS        4
+#define PTLBD_NEVENTS    1024
+#define PTLBD_NBUFS      20
+#define PTLBD_BUFSIZE    (32 * 1024)
+#define PTLBD_MAXREQSIZE 1024
+
 #define CONN_INVALID 1
 
 struct ptlrpc_connection {
@@ -137,7 +143,6 @@ struct ptlrpc_request {
         __u64 rq_xid;
 
         int rq_level;
-        time_t rq_timeout;
         //        void * rq_reply_handle;
         wait_queue_head_t rq_wait_for_rep;
 
@@ -160,13 +165,14 @@ struct ptlrpc_request {
 #define DEBUG_REQ(level, req, fmt, args...)                                    \
 do {                                                                           \
 CDEBUG(level,                                                                  \
-       "@@@ " fmt " req x"LPD64"/t"LPD64" o%d->%s:%d lens %d/%d ref %d fl "    \
-       "%x\n" ,  ## args, req->rq_xid, req->rq_transno,                        \
+       "@@@ " fmt " req@%p x"LPD64"/t"LPD64" o%d->%s:%d lens %d/%d ref %d fl " \
+       "%x\n" ,  ## args, req, req->rq_xid, req->rq_reqmsg->transno,           \
        req->rq_reqmsg ? req->rq_reqmsg->opc : -1,                              \
        req->rq_connection ? (char *)req->rq_connection->c_remote_uuid : "<?>", \
        (req->rq_import && req->rq_import->imp_client) ?                        \
            req->rq_import->imp_client->cli_request_portal : -1,                \
-       req->rq_reqlen, req->rq_replen, req->rq_refcount, req->rq_flags);       \
+       req->rq_reqlen, req->rq_replen,                                         \
+       atomic_read (&req->rq_refcount), req->rq_flags);                        \
 } while (0)
 
 struct ptlrpc_bulk_page {
index 9612846..94ffd4f 100644 (file)
 
 struct lov_oinfo { /* per-child structure */
         __u64 loi_id;              /* object ID on the target OST */
-        struct lustre_handle *loi_handle; /* handle for object on OST */
+        struct lustre_handle *loi_handle; /* open file handle for obj on OST */
         int loi_ost_idx;           /* OST stripe index in lmd_objects array */
 };
 
 struct lov_stripe_md {
-        __u32 lsm_magic;
         __u64 lsm_object_id;       /* lov object id */
-        __u64 lsm_stripe_size;     /* size of the stripe */
-        __u32 lsm_stripe_pattern;  /* per-lov object stripe pattern */
+        __u32 lsm_magic;
+        __u32 lsm_stripe_size;     /* size of the stripe */
         int   lsm_stripe_offset;   /* offset of first stripe in lmd_objects */
         int   lsm_stripe_count;    /* how many objects are being striped on */
         struct lov_oinfo lsm_oinfo[0];
 };
 
+#define IOC_OSC_TYPE         'h'
+#define IOC_OSC_MIN_NR       20
+#define IOC_OSC_REGISTER_LOV _IOWR(IOC_OSC_TYPE, 20, struct obd_device *)
+#define IOC_OSC_MAX_NR       50
+
+#define IOC_MDC_TYPE         'i'
+#define IOC_MDC_MIN_NR       20
+#define IOC_MDC_LOOKUP       _IOWR(IOC_MDC_TYPE, 20, struct obd_device *)
+#define IOC_MDC_MAX_NR       50
+
 #ifdef __KERNEL__
 # include <linux/fs.h>
 # include <linux/list.h>
@@ -46,9 +55,9 @@ struct obd_type {
 };
 
 struct brw_page {
-        struct page *pg;
-        obd_size count;
         obd_off  off;
+        struct page *pg;
+        int count;
         obd_flag flag;
 };
 
@@ -95,6 +104,7 @@ struct filter_obd {
         struct dentry *fo_dentry_O_mode[16];
         spinlock_t fo_objidlock;        /* protects fo_lastobjid increment */
         __u64 fo_lastobjid;
+        __u64 fo_last_committed;
         struct file_operations *fo_fop;
         struct inode_operations *fo_iop;
         struct address_space_operations *fo_aops;
@@ -115,11 +125,6 @@ struct client_obd {
         struct obd_device   *cl_containing_lov;
 };
 
-#define IOC_OSC_TYPE         'h'
-#define IOC_OSC_MIN_NR       20
-#define IOC_OSC_REGISTER_LOV _IOWR('h', 20, struct obd_device *)
-#define IOC_OSC_MAX_NR       50
-
 struct mds_obd {
         struct ptlrpc_service           *mds_service;
 
@@ -146,6 +151,9 @@ struct mds_obd {
         struct list_head                 mds_delayed_reply_queue;
         spinlock_t                       mds_processing_task_lock;
         pid_t                            mds_processing_task;
+
+        int                              mds_has_lov_desc;
+        struct lov_desc                  mds_lov_desc;
 };
 
 struct ldlm_obd {
@@ -169,6 +177,19 @@ struct echo_obd {
         atomic_t eo_write;
 };
 
+/*
+ * This struct does double duty, acting as either a client or a
+ * server instance, which may not be wise.
+ */
+struct ptlbd_obd {
+        /* server's */
+        struct ptlrpc_service *ptlbd_service;
+        /* client's */
+        struct ptlrpc_client bd_client;
+        struct obd_import bd_import;
+        int refcount; /* XXX sigh */
+};
+
 struct recovd_obd {
         spinlock_t            recovd_lock;
         struct list_head      recovd_managed_items; /* items managed  */
@@ -202,6 +223,11 @@ struct echo_client_obd {
         struct lustre_handle conn;   /* the local connection to osc/lov */
 };
 
+struct cache_obd {
+        struct lustre_handle cobd_target;       /* local connection to target obd */
+        struct lustre_handle cobd_cache;        /* local connection to cache obd */
+};
+
 struct lov_tgt_desc {
         obd_uuid_t uuid;
         struct lustre_handle conn;
@@ -260,6 +286,8 @@ struct obd_device {
                 struct recovd_obd recovd;
                 struct trace_obd trace;
                 struct lov_obd lov;
+                struct cache_obd cobd;
+                struct ptlbd_obd ptlbd;
 #if 0
                 struct snap_obd snap;
 #endif
@@ -270,6 +298,7 @@ struct obd_device {
 };
 
 struct obd_ops {
+        struct module *o_owner;
         int (*o_iocontrol)(unsigned int cmd, struct lustre_handle *, int len,
                            void *karg, void *uarg);
         int (*o_get_info)(struct lustre_handle *, obd_count keylen, void *key,
diff --git a/lustre/include/linux/obd_cache.h b/lustre/include/linux/obd_cache.h
new file mode 100644 (file)
index 0000000..e75b9f4
--- /dev/null
@@ -0,0 +1,13 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+
+#ifndef _OBD_CACHE_H__
+#define _OBD_CACHE_H__
+
+#ifdef __KERNEL__
+
+#define OBD_CACHE_DEVICENAME "cobd"
+
+#endif
+#endif
index 197de84..ed3eb99 100644 (file)
@@ -673,8 +673,6 @@ static inline void iattr_from_obdo(struct iattr *attr, struct obdo *oa,
 static inline void obdo_from_inode(struct obdo *dst, struct inode *src,
                                    obd_flag valid)
 {
-//        if (valid & OBD_MD_FLID)
-//                dst->o_id = src->i_ino;
         if (valid & OBD_MD_FLATIME)
                 dst->o_atime = src->i_atime;
         if (valid & OBD_MD_FLMTIME)
@@ -710,8 +708,8 @@ static inline void obdo_from_inode(struct obdo *dst, struct inode *src,
 static inline void obdo_to_inode(struct inode *dst, struct obdo *src,
                                  obd_flag valid)
 {
-//        if (valid & OBD_MD_FLID)
-//                dst->i_ino = src->o_id;
+        valid &= src->o_valid;
+
         if (valid & OBD_MD_FLATIME)
                 dst->i_atime = src->o_atime;
         if (valid & OBD_MD_FLMTIME)
@@ -847,7 +845,8 @@ int class_name2dev(char *name);
 int class_uuid2dev(char *uuid);
 struct obd_device *class_uuid2obd(char *uuid);
 struct obd_export *class_new_export(struct obd_device *obddev);
-struct obd_type *class_nm_to_type(char* name);
+struct obd_type *class_get_type(char *name);
+void class_put_type(struct obd_type *type);
 void class_destroy_export(struct obd_export *exp);
 int class_connect(struct lustre_handle *conn, struct obd_device *obd,
                   obd_uuid_t cluuid);
@@ -866,6 +865,13 @@ static inline struct ptlrpc_connection *class_rd2conn(struct recovd_data *rd)
         return list_entry(rd, struct ptlrpc_connection, c_recovd_data);
 }
 
+struct obd_statfs;
+struct statfs;
+void statfs_pack(struct obd_statfs *osfs, struct statfs *sfs);
+void statfs_unpack(struct statfs *sfs, struct obd_statfs *osfs);
+void obd_statfs_pack(struct obd_statfs *tgt, struct obd_statfs *src);
+void obd_statfs_unpack(struct obd_statfs *tgt, struct obd_statfs *src);
+
 #endif
 
 /* sysctl.c */
index e999451..5de0a25 100644 (file)
@@ -36,7 +36,7 @@
 void ost_pack_niobuf(void **tmp, __u64 offset, __u32 len, __u32 flags,
                      __u32 xid);
 void ost_unpack_niobuf(void **tmp, struct niobuf_remote **nbp);
-void ost_pack_ioo(void **tmp, struct lov_stripe_md *oa, int bufcnt);
-void ost_unpack_ioo(void **tmp, struct obd_ioobj **ioop);
+void ost_pack_ioo(struct obd_ioobj **ioop, struct lov_stripe_md *oa,int bufcnt);
+void ost_unpack_ioo(struct obd_ioobj **tmp, struct obd_ioobj **ioop);
 
 #endif
diff --git a/lustre/include/linux/obd_ptlbd.h b/lustre/include/linux/obd_ptlbd.h
new file mode 100644 (file)
index 0000000..b4f9fe9
--- /dev/null
@@ -0,0 +1,30 @@
+#ifndef _OBD_PTLBD_H
+#define _OBD_PTLBD_H
+
+#include <linux/lustre_idl.h>
+/*
+ * Copyright (C) 2002  Cluster File Systems, Inc.
+ *
+ * This code is issued under the GNU General Public License.
+ * See the file COPYING in this distribution
+ */
+
+#define OBD_PTLBD_SV_DEVICENAME "ptlbd_server"
+#define OBD_PTLBD_CL_DEVICENAME "ptlbd_client"
+
+/* XXX maybe this isn't the best header to be dumping all this in.. */
+
+extern int ptlbd_blk_init(void);
+extern int ptlbd_cl_init(void);
+extern int ptlbd_sv_init(void);
+
+extern void ptlbd_blk_exit(void);
+extern void ptlbd_cl_exit(void);
+extern void ptlbd_sv_exit(void);
+
+extern void ptlbd_blk_register(struct ptlbd_obd *ptlbd);
+extern int ptlbd_send_req(struct ptlbd_obd *, ptlbd_cmd_t cmd, 
+               struct buffer_head *);
+extern int ptlbd_parse_req(struct ptlrpc_request *req);
+
+#endif
index f6c2770..e3e23f4 100644 (file)
@@ -31,6 +31,7 @@
 
 /* global variables */
 extern atomic_t obd_memory;
+extern int obd_memmax;
 extern unsigned long obd_fail_loc;
 extern unsigned long obd_timeout;
 extern char obd_recovery_upcall[128];
@@ -66,6 +67,7 @@ extern char obd_recovery_upcall[128];
 #define OBD_FAIL_MDS_GETSTATUS_PACK      0x11c
 #define OBD_FAIL_MDS_STATFS_PACK         0x11d
 #define OBD_FAIL_MDS_STATFS_NET          0x11e
+#define OBD_FAIL_MDS_GETATTR_NAME_NET    0x11f
 
 #define OBD_FAIL_OST                     0x200
 #define OBD_FAIL_OST_CONNECT_NET         0x201
@@ -156,13 +158,17 @@ do {                                                                    \
         int s = (size);                                                 \
         (ptr) = lptr = kmalloc(s, GFP_KERNEL);                          \
         if (lptr == NULL) {                                             \
-                CERROR("kmalloc of '" #ptr "' (%ld bytes) failed "      \
+                CERROR("kmalloc of '" #ptr "' (%d bytes) failed "       \
                        "at %s:%d\n", s, __FILE__, __LINE__);            \
         } else {                                                        \
+                int obd_curmem;                                         \
                 memset(lptr, 0, s);                                     \
                 atomic_add(s, &obd_memory);                             \
+                obd_curmem = atomic_read(&obd_memory);                  \
+                if (obd_curmem > obd_memmax)                            \
+                        obd_memmax = obd_curmem;                        \
                 CDEBUG(D_MALLOC, "kmalloced '" #ptr "': %d at %p "      \
-                       "(tot %d)\n", s, lptr, atomic_read(&obd_memory));\
+                       "(tot %d)\n", s, lptr, obd_curmem);              \
         }                                                               \
 } while (0)
 
index 680d4f0..e8ffd5b 100644 (file)
@@ -65,6 +65,10 @@ void l_lock(struct lustre_lock *lock)
                 owner = 1;
         spin_unlock(&lock->l_spin);
 
+        /* This is safe to increment outside the spinlock because we
+         * can only have 1 CPU running on the current task
+         * (i.e. l_owner == current), regardless of the number of CPUs.
+         */
         if (owner) {
                 ++lock->l_depth;
         } else {
index 468eb2b..ae1153f 100644 (file)
@@ -67,7 +67,8 @@ static void policy_internal(struct list_head *queue, struct ldlm_extent *req_ex,
 }
 
 /* apply the internal policy by walking all the lists */
-int ldlm_extent_policy(struct ldlm_lock *lock, void *req_cookie,
+int ldlm_extent_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock,
+                       void *req_cookie,
                        ldlm_mode_t mode, int flags, void *data)
 {
         struct ldlm_resource *res = lock->l_resource;
@@ -79,11 +80,11 @@ int ldlm_extent_policy(struct ldlm_lock *lock, void *req_cookie,
         if (!res)
                 LBUG();
 
-        l_lock(&res->lr_namespace->ns_lock);
+        l_lock(&ns->ns_lock);
         policy_internal(&res->lr_granted, req_ex, &new_ex, mode);
         policy_internal(&res->lr_converting, req_ex, &new_ex, mode);
         policy_internal(&res->lr_waiting, req_ex, &new_ex, mode);
-        l_unlock(&res->lr_namespace->ns_lock);
+        l_unlock(&ns->ns_lock);
 
         memcpy(&lock->l_extent, &new_ex, sizeof(new_ex));
 
index 81b3b5d..a1220ab 100644 (file)
@@ -25,7 +25,6 @@
 
 #include <linux/slab.h>
 #include <linux/module.h>
-#include <linux/random.h>
 #include <linux/lustre_dlm.h>
 #include <linux/lustre_mds.h>
 #include <linux/obd_class.h>
@@ -102,12 +101,13 @@ ldlm_res_compat ldlm_res_compat_table[] = {
 
 static ldlm_res_policy ldlm_intent_policy_func;
 
-static int ldlm_plain_policy(struct ldlm_lock *lock, void *req_cookie,
-                             ldlm_mode_t mode, int flags, void *data)
+static int ldlm_plain_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock,
+                             void *req_cookie, ldlm_mode_t mode, int flags,
+                             void *data)
 {
         if ((flags & LDLM_FL_HAS_INTENT) && ldlm_intent_policy_func) {
-                return ldlm_intent_policy_func(lock, req_cookie, mode, flags, 
-                                               data);
+                return ldlm_intent_policy_func(ns, lock, req_cookie, mode,
+                                               flags, data);
         }
 
         return ELDLM_OK;
@@ -186,6 +186,8 @@ void ldlm_lock_remove_from_lru(struct ldlm_lock *lock)
         EXIT;
 }
 
+/* Only called with strict == 0 by recovery, to mark in-use locks as
+ * should-be-destroyed */
 void ldlm_lock_destroy(struct ldlm_lock *lock)
 {
         ENTRY;
@@ -194,16 +196,16 @@ void ldlm_lock_destroy(struct ldlm_lock *lock)
         if (!list_empty(&lock->l_children)) {
                 LDLM_DEBUG(lock, "still has children (%p)!",
                            lock->l_children.next);
-                ldlm_lock_dump(lock);
+                ldlm_lock_dump(D_ERROR, lock);
                 LBUG();
         }
         if (lock->l_readers || lock->l_writers) {
                 LDLM_DEBUG(lock, "lock still has references");
-                ldlm_lock_dump(lock);
+                ldlm_lock_dump(D_OTHER, lock);
         }
 
         if (!list_empty(&lock->l_res_link)) {
-                ldlm_lock_dump(lock);
+                ldlm_lock_dump(D_ERROR, lock);
                 LBUG();
         }
 
@@ -217,6 +219,7 @@ void ldlm_lock_destroy(struct ldlm_lock *lock)
 
         list_del_init(&lock->l_export_chain);
         ldlm_lock_remove_from_lru(lock);
+        portals_handle_unhash(&lock->l_handle);
 
 #if 0
         /* Wake anyone waiting for this lock */
@@ -257,7 +260,6 @@ static struct ldlm_lock *ldlm_lock_new(struct ldlm_lock *parent,
         if (lock == NULL)
                 RETURN(NULL);
 
-        get_random_bytes(&lock->l_random, sizeof(__u64));
         lock->l_resource = ldlm_resource_getref(resource);
 
         atomic_set(&lock->l_refc, 2);
@@ -279,12 +281,15 @@ static struct ldlm_lock *ldlm_lock_new(struct ldlm_lock *parent,
                 l_unlock(&parent->l_resource->lr_namespace->ns_lock);
         }
 
+        INIT_LIST_HEAD(&lock->l_handle.h_link);
+        portals_handle_hash(&lock->l_handle, lock_handle_addref);
+
         RETURN(lock);
 }
 
-int ldlm_lock_change_resource(struct ldlm_lock *lock, __u64 new_resid[3])
+int ldlm_lock_change_resource(struct ldlm_namespace *ns, struct ldlm_lock *lock,
+                              __u64 new_resid[3])
 {
-        struct ldlm_namespace *ns = lock->l_resource->lr_namespace;
         struct ldlm_resource *oldres = lock->l_resource;
         ENTRY;
 
@@ -321,66 +326,63 @@ int ldlm_lock_change_resource(struct ldlm_lock *lock, __u64 new_resid[3])
 
 void ldlm_lock2handle(struct ldlm_lock *lock, struct lustre_handle *lockh)
 {
-        lockh->addr = (__u64) (unsigned long)lock;
-        lockh->cookie = lock->l_random;
+        //lockh->addr = (__u64)(unsigned long)lock;
+        memset(&lockh->addr, 0x69, sizeof(lockh->addr));
+        lockh->cookie = lock->l_handle.h_cookie;
 }
 
-/* 
- * if flags: atomically get the lock and set the flags. 
- * Return NULL if flag already set
+/* If flags is non-zero: atomically get the lock and set the flags.
+ * Returns NULL if any of the requested flags is already set on the lock.
  */
 
-struct ldlm_lock *__ldlm_handle2lock(struct lustre_handle *handle, int strict,
-                                     int flags)
+struct ldlm_lock *__ldlm_handle2lock(struct lustre_handle *handle, int flags)
 {
         struct ldlm_lock *lock = NULL, *retval = NULL;
         ENTRY;
 
         LASSERT(handle);
 
-        if (!handle->addr)
+        lock = portals_handle2object(handle->cookie);
+        if (lock == NULL)
                 RETURN(NULL);
 
-        lock = (struct ldlm_lock *)(unsigned long)(handle->addr);
-        if (!kmem_cache_validate(ldlm_lock_slab, (void *)lock)) {
-                //CERROR("bogus lock %p\n", lock);
-                GOTO(out2, retval);
-        }
-
-        if (lock->l_random != handle->cookie) {
-                //CERROR("bogus cookie: lock %p has "LPX64" vs. handle "LPX64
-                //       "\n", lock, lock->l_random, handle->cookie);
-                GOTO(out2, NULL);
-        }
-        if (!lock->l_resource) {
-                CERROR("trying to lock bogus resource: lock %p\n", lock);
-                //LDLM_DEBUG(lock, "ldlm_handle2lock(%p)", lock);
-                GOTO(out2, retval);
-        }
-        if (!lock->l_resource->lr_namespace) {
-                CERROR("trying to lock bogus namespace: lock %p\n", lock);
-                //LDLM_DEBUG(lock, "ldlm_handle2lock(%p)", lock);
-                GOTO(out2, retval);
-        }
+        LASSERT(lock->l_resource != NULL);
+        LASSERT(lock->l_resource->lr_namespace != NULL);
 
         l_lock(&lock->l_resource->lr_namespace->ns_lock);
-        if (strict && lock->l_destroyed) {
+
+        /* It's unlikely but possible that someone marked the lock as
+         * destroyed after we did handle2object on it */
+        if (lock->l_destroyed) {
                 CERROR("lock already destroyed: lock %p\n", lock);
-                //LDLM_DEBUG(lock, "ldlm_handle2lock(%p)", lock);
-                GOTO(out, NULL);
+                LDLM_LOCK_PUT(lock);
+                GOTO(out, retval);
         }
 
-        if (flags && (lock->l_flags & flags))
-                GOTO(out, NULL);
+        if (flags && (lock->l_flags & flags)) {
+                LDLM_LOCK_PUT(lock);
+                GOTO(out, retval);
+        }
 
         if (flags)
                 lock->l_flags |= flags;
 
-        retval = LDLM_LOCK_GET(lock);
+        retval = lock;
         EXIT;
  out:
         l_unlock(&lock->l_resource->lr_namespace->ns_lock);
- out2:
+        return retval;
+}
+
+struct ldlm_lock *ldlm_handle2lock_ns(struct ldlm_namespace *ns,
+                                      struct lustre_handle *handle)
+{
+        struct ldlm_lock *retval = NULL;
+
+        l_lock(&ns->ns_lock);
+        retval = __ldlm_handle2lock(handle, 0);
+        l_unlock(&ns->ns_lock);
+
         return retval;
 }
 
@@ -456,7 +458,7 @@ int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns,
 
 void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode)
 {
-        struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0, 0);
+        struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0);
         struct ldlm_namespace *ns;
         ENTRY;
 
@@ -466,10 +468,13 @@ void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode)
         LDLM_DEBUG(lock, "ldlm_lock_decref(%s)", ldlm_lockname[mode]);
         ns = lock->l_resource->lr_namespace;
         l_lock(&lock->l_resource->lr_namespace->ns_lock);
-        if (mode == LCK_NL || mode == LCK_CR || mode == LCK_PR)
+        if (mode == LCK_NL || mode == LCK_CR || mode == LCK_PR) {
+                LASSERT(lock->l_readers > 0);
                 lock->l_readers--;
-        else
+        } else {
+                LASSERT(lock->l_writers > 0);
                 lock->l_writers--;
+        }
 
         /* If we received a blocked AST and this was the last reference,
          * run the callback. */
@@ -493,8 +498,9 @@ void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode)
                 ns->ns_nr_unused++;
                 l_unlock(&lock->l_resource->lr_namespace->ns_lock);
                 ldlm_cancel_lru(ns);
-        } else
+        } else {
                 l_unlock(&lock->l_resource->lr_namespace->ns_lock);
+        }
 
         LDLM_LOCK_PUT(lock);    /* matches the ldlm_lock_get in addref */
         LDLM_LOCK_PUT(lock);    /* matches the handle2lock above */
@@ -711,8 +717,8 @@ struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns,
         return lock;
 }
 
-/* Must be called with lock->l_lock and lock->l_resource->lr_lock not held */
-ldlm_error_t ldlm_lock_enqueue(struct ldlm_lock * lock,
+ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns,
+                               struct ldlm_lock *lock,
                                void *cookie, int cookie_len,
                                int *flags,
                                ldlm_completion_callback completion,
@@ -734,7 +740,7 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_lock * lock,
         if (!local && !(*flags & LDLM_FL_REPLAY) &&
             (policy = ldlm_res_policy_table[res->lr_type])) {
                 int rc;
-                rc = policy(lock, cookie, lock->l_req_mode, *flags, NULL);
+                rc = policy(ns, lock, cookie, lock->l_req_mode, *flags, NULL);
 
                 if (rc == ELDLM_LOCK_CHANGED) {
                         res = lock->l_resource;
@@ -745,7 +751,7 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_lock * lock,
                 }
         }
 
-        l_lock(&res->lr_namespace->ns_lock);
+        l_lock(&ns->ns_lock);
         if (local && lock->l_req_mode == lock->l_granted_mode) {
                 /* The server returned a blocked lock, but it was granted before
                  * we got a chance to actually enqueue it.  We don't need to do
@@ -767,7 +773,7 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_lock * lock,
          * FIXME (bug 268): Detect obvious lies by checking compatibility in
          * granted/converting queues. */
         ldlm_resource_unlink_lock(lock);
-        if (local || (*flags & LDLM_FL_REPLAY)) {
+        if (local) {
                 if (*flags & LDLM_FL_BLOCK_CONV)
                         ldlm_resource_add_lock(res, res->lr_converting.prev,
                                                lock);
@@ -776,6 +782,19 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_lock * lock,
                 else
                         ldlm_grant_lock(lock);
                 GOTO(out, ELDLM_OK);
+        } else if (*flags & LDLM_FL_REPLAY) {
+                if (*flags & LDLM_FL_BLOCK_CONV) {
+                        ldlm_resource_add_lock(res, res->lr_converting.prev,
+                                               lock);
+                        GOTO(out, ELDLM_OK);
+                } else if (*flags & LDLM_FL_BLOCK_WAIT) {
+                        ldlm_resource_add_lock(res, res->lr_waiting.prev, lock);
+                        GOTO(out, ELDLM_OK);
+                } else if (*flags & LDLM_FL_BLOCK_GRANTED) {
+                        ldlm_grant_lock(lock);
+                        GOTO(out, ELDLM_OK);
+                }
+                /* If no flags, fall through to normal enqueue path. */
         }
 
         /* FIXME: We may want to optimize by checking lr_most_restr */
@@ -798,7 +817,7 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_lock * lock,
         ldlm_grant_lock(lock);
         EXIT;
       out:
-        l_unlock(&res->lr_namespace->ns_lock);
+        l_unlock(&ns->ns_lock);
         /* Don't set 'completion_ast' until here so that if the lock is granted
          * immediately we don't do an unnecessary completion call. */
         lock->l_completion_ast = completion;
@@ -828,10 +847,10 @@ static int ldlm_reprocess_queue(struct ldlm_resource *res,
         RETURN(0);
 }
 
-void ldlm_run_ast_work(struct list_head *rpc_list)
+int ldlm_run_ast_work(struct list_head *rpc_list)
 {
         struct list_head *tmp, *pos;
-        int rc;
+        int rc, retval = 0;
         ENTRY;
 
         list_for_each_safe(tmp, pos, rpc_list) {
@@ -844,20 +863,34 @@ void ldlm_run_ast_work(struct list_head *rpc_list)
                                  w->w_datalen, LDLM_CB_BLOCKING);
                 else
                         rc = w->w_lock->l_completion_ast(w->w_lock, w->w_flags);
-                if (rc)
+                if (rc == -ERESTART)
+                        retval = rc;
+                else if (rc)
                         CERROR("Failed AST - should clean & disconnect "
                                "client\n");
                 LDLM_LOCK_PUT(w->w_lock);
                 list_del(&w->w_list);
                 OBD_FREE(w, sizeof(*w));
         }
-        EXIT;
+        RETURN(retval);
+}
+
+static int reprocess_one_queue(struct ldlm_resource *res, void *closure)
+{
+        ldlm_reprocess_all(res);
+        return LDLM_ITER_CONTINUE;
+}
+
+void ldlm_reprocess_all_ns(struct ldlm_namespace *ns)
+{
+        (void)ldlm_namespace_foreach_res(ns, reprocess_one_queue, NULL);
 }
 
 /* Must be called with resource->lr_lock not taken. */
 void ldlm_reprocess_all(struct ldlm_resource *res)
 {
         struct list_head rpc_list = LIST_HEAD_INIT(rpc_list);
+        int rc;
         ENTRY;
 
         /* Local lock trees don't get reprocessed. */
@@ -866,6 +899,7 @@ void ldlm_reprocess_all(struct ldlm_resource *res)
                 return;
         }
 
+ restart:
         l_lock(&res->lr_namespace->ns_lock);
         res->lr_tmp = &rpc_list;
 
@@ -876,7 +910,9 @@ void ldlm_reprocess_all(struct ldlm_resource *res)
         res->lr_tmp = NULL;
         l_unlock(&res->lr_namespace->ns_lock);
 
-        ldlm_run_ast_work(&rpc_list);
+        rc = ldlm_run_ast_work(&rpc_list);
+        if (rc == -ERESTART)
+                goto restart;
         EXIT;
 }
 
@@ -905,10 +941,12 @@ void ldlm_lock_cancel(struct ldlm_lock *lock)
         ns = res->lr_namespace;
 
         l_lock(&ns->ns_lock);
+        /* Please do not, no matter how tempting, remove this LBUG without
+         * talking to me first. -phik */
         if (lock->l_readers || lock->l_writers) {
                 LDLM_DEBUG(lock, "lock still has references");
-                ldlm_lock_dump(lock);
-                //LBUG();
+                ldlm_lock_dump(D_OTHER, lock);
+                LBUG();
         }
 
         ldlm_cancel_callback(lock);
@@ -1001,18 +1039,18 @@ struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode,
         RETURN(res);
 }
 
-void ldlm_lock_dump(struct ldlm_lock *lock)
+void ldlm_lock_dump(int level, struct ldlm_lock *lock)
 {
         char ver[128];
 
-        if (!(portal_debug & D_OTHER))
+        if (!(portal_debug & level))
                 return;
 
         if (RES_VERSION_SIZE != 4)
                 LBUG();
 
         if (!lock) {
-                CDEBUG(D_OTHER, "  NULL LDLM lock\n");
+                CDEBUG(level, "  NULL LDLM lock\n");
                 return;
         }
 
@@ -1020,27 +1058,26 @@ void ldlm_lock_dump(struct ldlm_lock *lock)
                  lock->l_version[0], lock->l_version[1],
                  lock->l_version[2], lock->l_version[3]);
 
-        CDEBUG(D_OTHER, "  -- Lock dump: %p (%s)\n", lock, ver);
+        CDEBUG(level, "  -- Lock dump: %p (%s)\n", lock, ver);
         if (lock->l_export && lock->l_export->exp_connection)
-                CDEBUG(D_OTHER, "  Node: NID %x (rhandle: "LPX64")\n",
+                CDEBUG(level, "  Node: NID %x (rhandle: "LPX64")\n",
                        lock->l_export->exp_connection->c_peer.peer_nid,
-                       lock->l_remote_handle.addr);
+                       lock->l_remote_handle.cookie);
         else
-                CDEBUG(D_OTHER, "  Node: local\n");
-        CDEBUG(D_OTHER, "  Parent: %p\n", lock->l_parent);
-        CDEBUG(D_OTHER, "  Resource: %p ("LPD64")\n", lock->l_resource,
+                CDEBUG(level, "  Node: local\n");
+        CDEBUG(level, "  Parent: %p\n", lock->l_parent);
+        CDEBUG(level, "  Resource: %p ("LPD64")\n", lock->l_resource,
                lock->l_resource->lr_name[0]);
-        CDEBUG(D_OTHER, "  Requested mode: %d, granted mode: %d\n",
+        CDEBUG(level, "  Requested mode: %d, granted mode: %d\n",
                (int)lock->l_req_mode, (int)lock->l_granted_mode);
-        CDEBUG(D_OTHER, "  Readers: %u ; Writers; %u\n",
+        CDEBUG(level, "  Readers: %u ; Writers; %u\n",
                lock->l_readers, lock->l_writers);
         if (lock->l_resource->lr_type == LDLM_EXTENT)
-                CDEBUG(D_OTHER, "  Extent: %Lu -> %Lu\n",
-                       (unsigned long long)lock->l_extent.start,
-                       (unsigned long long)lock->l_extent.end);
+                CDEBUG(level, "  Extent: "LPU64" -> "LPU64"\n",
+                       lock->l_extent.start, lock->l_extent.end);
 }
 
-void ldlm_lock_dump_handle(struct lustre_handle *lockh)
+void ldlm_lock_dump_handle(int level, struct lustre_handle *lockh)
 {
         struct ldlm_lock *lock;
 
@@ -1048,7 +1085,8 @@ void ldlm_lock_dump_handle(struct lustre_handle *lockh)
         if (lock == NULL)
                 return;
 
-        ldlm_lock_dump(lock);
+        /* Dump at the debug level the caller asked for. */
+        ldlm_lock_dump(level, lock);
 
         LDLM_LOCK_PUT(lock);
 }
index b76fbcd..d826db1 100644 (file)
@@ -57,6 +57,7 @@ static void waiting_locks_callback(unsigned long unused)
                                                  l_pending_chain);
                 if (l->l_callback_timeout > jiffies)
                         break;
+                CERROR("lock timer expired, lock %p\n", l);
                 LDLM_DEBUG(l, "timer expired, recovering exp %p on conn %p",
                            l->l_export, l->l_export->exp_connection);
                 recovd_conn_fail(l->l_export->exp_connection);
@@ -162,14 +163,26 @@ static int ldlm_server_blocking_ast(struct ldlm_lock *lock,
         memcpy(&body->lock_desc, desc, sizeof(*desc));
 
         LDLM_DEBUG(lock, "server preparing blocking AST");
-        req->rq_replen = 0; /* no reply needed */
+        req->rq_replen = lustre_msg_size(0, NULL);
 
         ldlm_add_waiting_lock(lock);
         l_unlock(&lock->l_resource->lr_namespace->ns_lock);
 
-        (void)ptl_send_rpc(req);
+        req->rq_level = LUSTRE_CONN_RECOVD;
+        rc = ptlrpc_queue_wait(req);
+        if (rc == -ETIMEDOUT || rc == -EINTR) {
+                ldlm_expired_completion_wait(lock);
+        } else if (rc) {
+                CERROR("client returned %d from blocking AST for lock %p\n",
+                       req->rq_status, lock);
+                LDLM_DEBUG(lock, "client returned error %d from blocking AST",
+                           req->rq_status);
+                ldlm_lock_cancel(lock);
+                /* Server-side AST functions are called from ldlm_reprocess_all,
+                 * which needs to be told to please restart its reprocessing. */
+                rc = -ERESTART;
+        }
 
-        /* not waiting for reply */
         ptlrpc_req_finished(req);
 
         RETURN(rc);
@@ -199,11 +212,22 @@ static int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags)
         ldlm_lock2desc(lock, &body->lock_desc);
 
         LDLM_DEBUG(lock, "server preparing completion AST");
-        req->rq_replen = 0; /* no reply needed */
-
-        (void)ptl_send_rpc(req);
-
-        /* not waiting for reply */
+        req->rq_replen = lustre_msg_size(0, NULL);
+
+        req->rq_level = LUSTRE_CONN_RECOVD;
+        rc = ptlrpc_queue_wait(req);
+        if (rc == -ETIMEDOUT || rc == -EINTR) {
+                ldlm_expired_completion_wait(lock);
+        } else if (rc) {
+                CERROR("client returned %d from completion AST for lock %p\n",
+                       req->rq_status, lock);
+                LDLM_DEBUG(lock, "client returned error %d from completion AST",
+                           req->rq_status);
+                ldlm_lock_cancel(lock);
+                /* Server-side AST functions are called from ldlm_reprocess_all,
+                 * which needs to be told to please restart its reprocessing. */
+                rc = -ERESTART;
+        }
         ptlrpc_req_finished(req);
 
         RETURN(rc);
@@ -265,8 +289,8 @@ int ldlm_handle_enqueue(struct ptlrpc_request *req)
                  &lock->l_export->exp_ldlm_data.led_held_locks);
         l_unlock(&lock->l_resource->lr_namespace->ns_lock);
 
-        err = ldlm_lock_enqueue(lock, cookie, cookielen, &flags,
-                                ldlm_server_completion_ast,
+        err = ldlm_lock_enqueue(obddev->obd_namespace, lock, cookie, cookielen,
+                                &flags, ldlm_server_completion_ast,
                                 ldlm_server_blocking_ast);
         if (err != ELDLM_OK)
                 GOTO(out, err);
@@ -384,7 +408,11 @@ int ldlm_handle_cancel(struct ptlrpc_request *req)
         RETURN(0);
 }
 
-static int ldlm_handle_bl_callback(struct ptlrpc_request *req)
+struct ldlm_lock *ldlm_handle2lock_ns(struct ldlm_namespace *ns,
+                                      struct lustre_handle *handle);
+
+static int ldlm_handle_bl_callback(struct ptlrpc_request *req,
+                                   struct ldlm_namespace *ns)
 {
         struct ldlm_request *dlm_req;
         struct ldlm_lock *lock;
@@ -395,11 +423,11 @@ static int ldlm_handle_bl_callback(struct ptlrpc_request *req)
 
         dlm_req = lustre_msg_buf(req->rq_reqmsg, 0);
 
-        lock = ldlm_handle2lock(&dlm_req->lock_handle1);
+        lock = ldlm_handle2lock_ns(ns, &dlm_req->lock_handle1);
         if (!lock) {
                 CERROR("blocking callback on lock "LPX64" - lock disappeared\n",
-                       dlm_req->lock_handle1.addr);
-                RETURN(0);
+                       dlm_req->lock_handle1.cookie);
+                RETURN(-EINVAL);
         }
 
         LDLM_DEBUG(lock, "client blocking AST callback handler START");
@@ -426,7 +454,8 @@ static int ldlm_handle_bl_callback(struct ptlrpc_request *req)
         RETURN(0);
 }
 
-static int ldlm_handle_cp_callback(struct ptlrpc_request *req)
+static int ldlm_handle_cp_callback(struct ptlrpc_request *req,
+                                   struct ldlm_namespace *ns)
 {
         struct list_head ast_list = LIST_HEAD_INIT(ast_list);
         struct ldlm_request *dlm_req;
@@ -437,16 +466,16 @@ static int ldlm_handle_cp_callback(struct ptlrpc_request *req)
 
         dlm_req = lustre_msg_buf(req->rq_reqmsg, 0);
 
-        lock = ldlm_handle2lock(&dlm_req->lock_handle1);
+        lock = ldlm_handle2lock_ns(ns, &dlm_req->lock_handle1);
         if (!lock) {
                 CERROR("completion callback on lock "LPX64" - lock "
-                       "disappeared\n", dlm_req->lock_handle1.addr);
-                RETURN(0);
+                       "disappeared\n", dlm_req->lock_handle1.cookie);
+                RETURN(-EINVAL);
         }
 
         LDLM_DEBUG(lock, "client completion callback handler START");
 
-        l_lock(&lock->l_resource->lr_namespace->ns_lock);
+        l_lock(&ns->ns_lock);
 
         /* If we receive the completion AST before the actual enqueue returned,
          * then we might need to switch lock modes, resources, or extents. */
@@ -461,14 +490,14 @@ static int ldlm_handle_cp_callback(struct ptlrpc_request *req)
         if (memcmp(dlm_req->lock_desc.l_resource.lr_name,
                    lock->l_resource->lr_name,
                    sizeof(__u64) * RES_NAME_SIZE) != 0) {
-                ldlm_lock_change_resource(lock,
+                ldlm_lock_change_resource(ns, lock,
                                          dlm_req->lock_desc.l_resource.lr_name);
                 LDLM_DEBUG(lock, "completion AST, new resource");
         }
         lock->l_resource->lr_tmp = &ast_list;
         ldlm_grant_lock(lock);
         lock->l_resource->lr_tmp = NULL;
-        l_unlock(&lock->l_resource->lr_namespace->ns_lock);
+        l_unlock(&ns->ns_lock);
         LDLM_DEBUG(lock, "callback handler finished, about to run_ast_work");
         LDLM_LOCK_PUT(lock);
 
@@ -481,12 +510,13 @@ static int ldlm_handle_cp_callback(struct ptlrpc_request *req)
 
 static int ldlm_callback_handler(struct ptlrpc_request *req)
 {
+        struct ldlm_namespace *ns;
         int rc;
         ENTRY;
 
         rc = lustre_unpack_msg(req->rq_reqmsg, req->rq_reqlen);
         if (rc) {
-                CERROR("lustre_ldlm: Invalid request: %d\n", rc);
+                CERROR("Invalid request: %d\n", rc);
                 RETURN(rc);
         }
 
@@ -501,32 +531,44 @@ static int ldlm_callback_handler(struct ptlrpc_request *req)
                 dlm_req = lustre_msg_buf(req->rq_reqmsg, 0);
                 CERROR("--> lock addr: "LPX64", cookie: "LPX64"\n",
                        dlm_req->lock_handle1.addr,dlm_req->lock_handle1.cookie);
-                CERROR("--> ignoring this error as a temporary workaround!  "
-                       "beware!\n");
-                //RETURN(-ENOTCONN);
+                RETURN(-ENOTCONN);
         }
 
+        LASSERT(req->rq_export != NULL);
+        LASSERT(req->rq_export->exp_obd != NULL);
+        ns = req->rq_export->exp_obd->obd_namespace;
+        LASSERT(ns != NULL);
+
         switch (req->rq_reqmsg->opc) {
         case LDLM_BL_CALLBACK:
                 CDEBUG(D_INODE, "blocking ast\n");
                 OBD_FAIL_RETURN(OBD_FAIL_LDLM_BL_CALLBACK, 0);
-                rc = ldlm_handle_bl_callback(req);
-                RETURN(rc);
+                rc = ldlm_handle_bl_callback(req, ns);
+                break;
         case LDLM_CP_CALLBACK:
                 CDEBUG(D_INODE, "completion ast\n");
                 OBD_FAIL_RETURN(OBD_FAIL_LDLM_CP_CALLBACK, 0);
-                rc = ldlm_handle_cp_callback(req);
-                RETURN(rc);
-
+                rc = ldlm_handle_cp_callback(req, ns);
+                break;
         default:
                 CERROR("invalid opcode %d\n", req->rq_reqmsg->opc);
                 RETURN(-EINVAL);
         }
 
+        req->rq_status = rc;
+        if (rc) {
+                ptlrpc_error(req->rq_svc, req);
+        } else {
+                rc = lustre_pack_msg(0, NULL, NULL, &req->rq_replen,
+                                     &req->rq_repmsg);
+                if (rc)
+                        RETURN(rc);
+                ptlrpc_reply(req->rq_svc, req);
+        }
+
         RETURN(0);
 }
 
-
 static int ldlm_cancel_handler(struct ptlrpc_request *req)
 {
         int rc;
@@ -539,11 +581,14 @@ static int ldlm_cancel_handler(struct ptlrpc_request *req)
         }
 
         if (req->rq_export == NULL) {
+                struct ldlm_request *dlm_req;
                 CERROR("operation %d with bad export (ptl req %d/rep %d)\n",
                        req->rq_reqmsg->opc, req->rq_request_portal,
                        req->rq_reply_portal);
                 CERROR("--> export addr: "LPX64", cookie: "LPX64"\n",
                        req->rq_reqmsg->addr, req->rq_reqmsg->cookie);
+                dlm_req = lustre_msg_buf(req->rq_reqmsg, 0);
+                ldlm_lock_dump_handle(D_ERROR, &dlm_req->lock_handle1);
                 CERROR("--> ignoring this error as a temporary workaround!  "
                        "beware!\n");
                 //RETURN(-ENOTCONN);
@@ -568,7 +613,6 @@ static int ldlm_cancel_handler(struct ptlrpc_request *req)
         RETURN(0);
 }
 
-
 static int ldlm_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len,
                           void *karg, void *uarg)
 {
@@ -579,7 +623,7 @@ static int ldlm_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len,
 
         if (_IOC_TYPE(cmd) != IOC_LDLM_TYPE || _IOC_NR(cmd) < IOC_LDLM_MIN_NR ||
             _IOC_NR(cmd) > IOC_LDLM_MAX_NR) {
-                CDEBUG(D_IOCTL, "invalid ioctl (type %ld, nr %ld, size %ld)\n",
+                CDEBUG(D_IOCTL, "invalid ioctl (type %d, nr %d, size %d)\n",
                        _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd));
                 RETURN(-EINVAL);
         }
@@ -619,11 +663,9 @@ static int ldlm_setup(struct obd_device *obddev, obd_count len, void *buf)
         if (ldlm_already_setup)
                 RETURN(-EALREADY);
 
-        MOD_INC_USE_COUNT;
-
         rc = ldlm_proc_setup(obddev);
         if (rc != 0)
-                GOTO(out_dec, rc);
+                RETURN(rc);
 
         ldlm->ldlm_cb_service =
                 ptlrpc_init_svc(LDLM_NEVENTS, LDLM_NBUFS, LDLM_BUFSIZE,
@@ -689,8 +731,6 @@ static int ldlm_setup(struct obd_device *obddev, obd_count len, void *buf)
  out_proc:
         ldlm_proc_cleanup(obddev);
 
- out_dec:
-        MOD_DEC_USE_COUNT;
         return rc;
 }
 
@@ -711,7 +751,6 @@ static int ldlm_cleanup(struct obd_device *obddev)
         ldlm_proc_cleanup(obddev);
 
         ldlm_already_setup = 0;
-        MOD_DEC_USE_COUNT;
         RETURN(0);
 }
 
@@ -723,6 +762,7 @@ static int ldlm_connect(struct lustre_handle *conn, struct obd_device *src,
 }
 
 struct obd_ops ldlm_obd_ops = {
+        o_owner:       THIS_MODULE,
         o_iocontrol:   ldlm_iocontrol,
         o_setup:       ldlm_setup,
         o_cleanup:     ldlm_cleanup,
@@ -798,7 +838,9 @@ EXPORT_SYMBOL(ldlm_namespace_dump);
 EXPORT_SYMBOL(ldlm_cancel_locks_for_export);
 EXPORT_SYMBOL(ldlm_replay_locks);
 EXPORT_SYMBOL(ldlm_resource_foreach);
+EXPORT_SYMBOL(ldlm_reprocess_all_ns);
 EXPORT_SYMBOL(ldlm_namespace_foreach);
+EXPORT_SYMBOL(ldlm_namespace_foreach_res);
 EXPORT_SYMBOL(l_lock);
 EXPORT_SYMBOL(l_unlock);
 
index 7a972b9..b71dd20 100644 (file)
@@ -30,7 +30,7 @@ static int interrupted_completion_wait(void *data)
         RETURN(1);
 }
 
-static int expired_completion_wait(void *data)
+int ldlm_expired_completion_wait(void *data)
 {
         struct ldlm_lock *lock = data;
         struct ptlrpc_connection *conn;
@@ -48,6 +48,7 @@ static int expired_completion_wait(void *data)
                 LDLM_DEBUG(lock, "timed out waiting for completion");
                 CERROR("lock %p timed out from %s\n", lock,
                        conn->c_remote_uuid);
+                ldlm_lock_dump(D_ERROR, lock);
                 class_signal_connection_failure(conn);
         }
         RETURN(0);
@@ -56,7 +57,7 @@ static int expired_completion_wait(void *data)
 int ldlm_completion_ast(struct ldlm_lock *lock, int flags)
 {
         struct l_wait_info lwi =
-                LWI_TIMEOUT_INTR(obd_timeout * HZ, expired_completion_wait,
+                LWI_TIMEOUT_INTR(obd_timeout * HZ, ldlm_expired_completion_wait,
                                  interrupted_completion_wait, lock);
         int rc = 0;
         ENTRY;
@@ -75,7 +76,7 @@ int ldlm_completion_ast(struct ldlm_lock *lock, int flags)
 
         LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, "
                    "sleeping");
-        ldlm_lock_dump(lock);
+        ldlm_lock_dump(D_OTHER, lock);
         ldlm_reprocess_all(lock->l_resource);
 
  noreproc:
@@ -131,7 +132,7 @@ static int ldlm_cli_enqueue_local(struct ldlm_namespace *ns,
         ldlm_lock2handle(lock, lockh);
         lock->l_connh = NULL;
 
-        err = ldlm_lock_enqueue(lock, cookie, cookielen, flags, completion,
+        err = ldlm_lock_enqueue(ns, lock, cookie, cookielen, flags, completion,
                                 blocking);
         if (err != ELDLM_OK)
                 GOTO(out, err);
@@ -243,7 +244,7 @@ int ldlm_cli_enqueue(struct lustre_handle *connh,
                 /* FIXME: if we've already received a completion AST, this will
                  * LBUG! */
                 ldlm_lock_destroy(lock);
-                GOTO(out, rc);
+                GOTO(out_req, rc);
         }
 
         reply = lustre_msg_buf(req->rq_repmsg, 0);
@@ -282,28 +283,28 @@ int ldlm_cli_enqueue(struct lustre_handle *connh,
                                (long)reply->lock_resource_name[0],
                                (long)lock->l_resource->lr_name[0]);
 
-                        ldlm_lock_change_resource(lock,
+                        ldlm_lock_change_resource(ns, lock,
                                                   reply->lock_resource_name);
                         if (lock->l_resource == NULL) {
                                 LBUG();
-                                RETURN(-ENOMEM);
+                                GOTO(out_req, rc = -ENOMEM);
                         }
                         LDLM_DEBUG(lock, "client-side enqueue, new resource");
                 }
         }
 
         if (!is_replay) {
-                rc = ldlm_lock_enqueue(lock, cookie, cookielen, flags,
+                rc = ldlm_lock_enqueue(ns, lock, cookie, cookielen, flags,
                                        completion, blocking);
                 if (lock->l_completion_ast)
                         lock->l_completion_ast(lock, *flags);
         }
 
-        if (!req_passed_in)
-                ptlrpc_req_finished(req);
-
         LDLM_DEBUG(lock, "client-side enqueue END");
         EXIT;
+ out_req:
+        if (!req_passed_in)
+                ptlrpc_req_finished(req);
  out:
         LDLM_LOCK_PUT(lock);
  out_nolock:
@@ -437,7 +438,7 @@ int ldlm_cli_cancel(struct lustre_handle *lockh)
         ENTRY;
 
         /* concurrent cancels on the same handle can happen */
-        lock = __ldlm_handle2lock(lockh, 0, LDLM_FL_CANCELING);
+        lock = __ldlm_handle2lock(lockh, LDLM_FL_CANCELING);
         if (lock == NULL)
                 RETURN(0);
 
@@ -620,6 +621,9 @@ int ldlm_cli_cancel_unused(struct ldlm_namespace *ns, __u64 *res_id,
         int i;
         ENTRY;
 
+        if (ns == NULL)
+                RETURN(ELDLM_OK);
+
         if (res_id)
                 RETURN(ldlm_cli_cancel_unused_resource(ns, res_id, flags));
 
@@ -698,11 +702,22 @@ static int ldlm_iter_helper(struct ldlm_lock *lock, void *closure)
         return helper->iter(lock, helper->closure);
 }
 
+static int ldlm_res_iter_helper(struct ldlm_resource *res, void *closure)
+{
+        return ldlm_resource_foreach(res, ldlm_iter_helper, closure);
+}
+
 int ldlm_namespace_foreach(struct ldlm_namespace *ns, ldlm_iterator_t iter,
                            void *closure)
 {
-        int i, rc = LDLM_ITER_CONTINUE;
         struct iter_helper_data helper = { iter: iter, closure: closure };
+        return ldlm_namespace_foreach_res(ns, ldlm_res_iter_helper, &helper);
+}
+
+int ldlm_namespace_foreach_res(struct ldlm_namespace *ns,
+                               ldlm_res_iterator_t iter, void *closure)
+{
+        int i, rc = LDLM_ITER_CONTINUE;
         
         l_lock(&ns->ns_lock);
         for (i = 0; i < RES_HASH_SIZE; i++) {
@@ -712,8 +727,7 @@ int ldlm_namespace_foreach(struct ldlm_namespace *ns, ldlm_iterator_t iter,
                                 list_entry(tmp, struct ldlm_resource, lr_hash);
 
                         ldlm_resource_getref(res);
-                        rc = ldlm_resource_foreach(res, ldlm_iter_helper,
-                                                   &helper);
+                        rc = iter(res, closure);
                         ldlm_resource_putref(res);
                         if (rc == LDLM_ITER_STOP)
                                 GOTO(out, rc);
@@ -735,22 +749,44 @@ static int ldlm_chain_lock_for_replay(struct ldlm_lock *lock, void *closure)
         return LDLM_ITER_CONTINUE;
 }
 
-static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock,
-                           int last)
+static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock)
 {
         struct ptlrpc_request *req;
         struct ldlm_request *body;
         struct ldlm_reply *reply;
         int rc, size;
-        int flags = LDLM_FL_REPLAY;
-
-        flags |= lock->l_flags & 
-                (LDLM_FL_BLOCK_GRANTED|LDLM_FL_BLOCK_CONV|LDLM_FL_BLOCK_WAIT);
-
+        int flags;
+
+        /*
+         * If granted mode matches the requested mode, this lock is granted.
+         *
+         * If they differ, but we have a granted mode, then we were granted
+         * one mode and now want another: ergo, converting.
+         *
+         * If we haven't been granted anything and are on a resource list,
+         * then we're blocked/waiting.
+         *
+         * If we haven't been granted anything and we're NOT on a resource list,
+         * then we haven't got a reply yet and don't have a known disposition.
+         * This happens whenever a lock enqueue is the request that triggers
+         * recovery.
+         */
+        if (lock->l_granted_mode == lock->l_req_mode)
+                flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_GRANTED;
+        else if (lock->l_granted_mode)
+                flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_CONV;
+        else if (!list_empty(&lock->l_res_link))
+                flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_WAIT;
+        else
+                flags = LDLM_FL_REPLAY;
+                
         size = sizeof(*body);
         req = ptlrpc_prep_req(imp, LDLM_ENQUEUE, 1, &size, NULL);
         if (!req)
                 RETURN(-ENOMEM);
+
+        /* We're part of recovery, so don't wait for it. */
+        req->rq_level = LUSTRE_CONN_RECOVD;
         
         body = lustre_msg_buf(req->rq_reqmsg, 0);
         ldlm_lock2desc(lock, &body->lock_desc);
@@ -760,9 +796,6 @@ static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock,
         size = sizeof(*reply);
         req->rq_replen = lustre_msg_size(1, &size);
 
-        if (last)
-                req->rq_reqmsg->flags |= MSG_LAST_REPLAY;
-
         LDLM_DEBUG(lock, "replaying lock:");
         rc = ptlrpc_queue_wait(req);
         if (rc != ELDLM_OK)
@@ -792,7 +825,7 @@ int ldlm_replay_locks(struct obd_import *imp)
 
         list_for_each_safe(pos, next, &list) {
                 lock = list_entry(pos, struct ldlm_lock, l_pending_chain);
-                rc = replay_one_lock(imp, lock, (next == &list));
+                rc = replay_one_lock(imp, lock);
                 if (rc)
                         break; /* or try to do the rest? */
         }
index d1f5b61..e5960bd 100644 (file)
@@ -134,7 +134,9 @@ out_ns:
 
 extern struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock);
 
-/* If 'local_only' is true, don't try to tell the server, just cleanup. */
+/* If 'local_only' is true, don't try to tell the server, just cleanup.
+ * This is currently only used for recovery, and we make certain assumptions
+ * as a result--notably, that we shouldn't cancel locks with refs. -phil */
 static void cleanup_resource(struct ldlm_resource *res, struct list_head *q,
                              int local_only)
 {
@@ -147,6 +149,18 @@ static void cleanup_resource(struct ldlm_resource *res, struct list_head *q,
                 lock = list_entry(tmp, struct ldlm_lock, l_res_link);
                 LDLM_LOCK_GET(lock);
 
+                if (local_only && (lock->l_readers || lock->l_writers)) {
+                        /* This is a little bit gross, but much better than the
+                         * alternative: pretend that we got a blocking AST from
+                         * the server, so that when the lock is decref'd, it
+                         * will go away ... */
+                        lock->l_flags |= LDLM_FL_CBPENDING;
+                        /* ... without sending a CANCEL message. */
+                        lock->l_flags |= LDLM_FL_CANCELING;
+                        LDLM_LOCK_PUT(lock);
+                        continue;
+                }
+
                 /* At shutdown time, don't call the cancellation callback */
                 lock->l_flags |= LDLM_FL_CANCEL;
 
@@ -170,12 +184,18 @@ static void cleanup_resource(struct ldlm_resource *res, struct list_head *q,
                 }
                 LDLM_LOCK_PUT(lock);
         }
+        EXIT;
 }
 
 int ldlm_namespace_cleanup(struct ldlm_namespace *ns, int local_only)
 {
         int i;
 
+        if (ns == NULL) {
+                CDEBUG(D_INFO, "NULL ns, skipping cleanup\n");
+                return ELDLM_OK;
+        }
+
         l_lock(&ns->ns_lock);
         for (i = 0; i < RES_HASH_SIZE; i++) {
                 struct list_head *tmp, *pos;
@@ -431,7 +451,8 @@ void ldlm_resource_add_lock(struct ldlm_resource *res, struct list_head *head,
         l_lock(&res->lr_namespace->ns_lock);
 
         ldlm_resource_dump(res);
-        ldlm_lock_dump(lock);
+        CDEBUG(D_OTHER, "About to grant this lock:\n");
+        ldlm_lock_dump(D_OTHER, lock);
 
         LASSERT(list_empty(&lock->l_res_link));
 
@@ -510,20 +531,20 @@ void ldlm_resource_dump(struct ldlm_resource *res)
         list_for_each(tmp, &res->lr_granted) {
                 struct ldlm_lock *lock;
                 lock = list_entry(tmp, struct ldlm_lock, l_res_link);
-                ldlm_lock_dump(lock);
+                ldlm_lock_dump(D_OTHER, lock);
         }
 
         CDEBUG(D_OTHER, "Converting locks:\n");
         list_for_each(tmp, &res->lr_converting) {
                 struct ldlm_lock *lock;
                 lock = list_entry(tmp, struct ldlm_lock, l_res_link);
-                ldlm_lock_dump(lock);
+                ldlm_lock_dump(D_OTHER, lock);
         }
 
         CDEBUG(D_OTHER, "Waiting locks:\n");
         list_for_each(tmp, &res->lr_waiting) {
                 struct ldlm_lock *lock;
                 lock = list_entry(tmp, struct ldlm_lock, l_res_link);
-                ldlm_lock_dump(lock);
+                ldlm_lock_dump(D_OTHER, lock);
         }
 }
index ce7a73d..b34c9ab 100644 (file)
@@ -172,7 +172,7 @@ int ldlm_test_basics(struct obd_device *obddev)
         lock1 = ldlm_lock_create(ns, NULL, res_id, LDLM_PLAIN, LCK_CR, NULL, 0);
         if (lock1 == NULL)
                 LBUG();
-        err = ldlm_lock_enqueue(lock1, NULL, 0, &flags,
+        err = ldlm_lock_enqueue(ns, lock1, NULL, 0, &flags,
                                 ldlm_completion_ast, ldlm_blocking_ast);
         if (err != ELDLM_OK)
                 LBUG();
@@ -180,7 +180,7 @@ int ldlm_test_basics(struct obd_device *obddev)
         lock = ldlm_lock_create(ns, NULL, res_id, LDLM_PLAIN, LCK_EX, NULL, 0);
         if (lock == NULL)
                 LBUG();
-        err = ldlm_lock_enqueue(lock, NULL, 0, &flags,
+        err = ldlm_lock_enqueue(ns, lock, NULL, 0, &flags,
                                 ldlm_completion_ast, ldlm_blocking_ast);
         if (err != ELDLM_OK)
                 LBUG();
@@ -222,7 +222,8 @@ int ldlm_test_extents(struct obd_device *obddev)
                                  0);
         if (lock1 == NULL)
                 LBUG();
-        err = ldlm_lock_enqueue(lock1, &ext1, sizeof(ext1), &flags, NULL, NULL);
+        err = ldlm_lock_enqueue(ns, lock1, &ext1, sizeof(ext1), &flags, NULL,
+                                NULL);
         if (err != ELDLM_OK)
                 LBUG();
         if (!(flags & LDLM_FL_LOCK_CHANGED))
@@ -231,7 +232,8 @@ int ldlm_test_extents(struct obd_device *obddev)
         flags = 0;
         lock2 = ldlm_lock_create(ns, NULL, res_id, LDLM_EXTENT, LCK_PR,
                                 NULL, 0);
-        err = ldlm_lock_enqueue(lock2, &ext2, sizeof(ext2), &flags, NULL, NULL);
+        err = ldlm_lock_enqueue(ns, lock2, &ext2, sizeof(ext2), &flags, NULL,
+                                NULL);
         if (err != ELDLM_OK)
                 LBUG();
         if (!(flags & LDLM_FL_LOCK_CHANGED))
@@ -241,7 +243,7 @@ int ldlm_test_extents(struct obd_device *obddev)
         lock = ldlm_lock_create(ns, NULL, res_id, LDLM_EXTENT, LCK_EX, NULL, 0);
         if (lock == NULL)
                 LBUG();
-        err = ldlm_lock_enqueue(lock, &ext3, sizeof(ext3), &flags,
+        err = ldlm_lock_enqueue(ns, lock, &ext3, sizeof(ext3), &flags,
                                 NULL, NULL);
         if (err != ELDLM_OK)
                 LBUG();
@@ -293,7 +295,7 @@ static int ldlm_test_network(struct obd_device *obddev,
         CERROR("ldlm_cli_convert: %d\n", err);
 
         lock = ldlm_handle2lock(&lockh1);
-        ldlm_lock_dump(lock);
+        ldlm_lock_dump(D_OTHER, lock);
         ldlm_lock_put(lock);
 
         /* Need to decrement old mode. Don't bother incrementing new
@@ -432,6 +434,7 @@ static int ldlm_do_convert(void)
 static int ldlm_test_main(void *data)
 {
         struct ldlm_test_thread *thread = data;
+        unsigned long flags;
         ENTRY;
 
         lock_kernel();
@@ -440,10 +443,10 @@ static int ldlm_test_main(void *data)
         sigfillset(&current->blocked);
         recalc_sigpending();
 #else
-        spin_lock_irq(&current->sigmask_lock);
+        spin_lock_irqsave(&current->sigmask_lock, flags);
         sigfillset(&current->blocked);
         recalc_sigpending(current);
-        spin_unlock_irq(&current->sigmask_lock);
+        spin_unlock_irqrestore(&current->sigmask_lock, flags);
 #endif
 
         sprintf(current->comm, "ldlm_test");
index da31808..1bcc388 100644 (file)
@@ -1,4 +1,4 @@
-EXTRA_DIST = mds_updates.c obd_pack.c ll_pack.c simple.c
+EXTRA_DIST = mds_updates.c obd_pack.c  simple.c
 EXTRA_DIST += client.c target.c
 
 include $(top_srcdir)/Rules
index 03fa4e2..5bf0d4a 100644 (file)
@@ -44,12 +44,12 @@ struct obd_device *client_tgtuuid2obd(char *tgtuuid)
 {
         int i;
 
-        for (i=0; i < MAX_OBD_DEVICES; i++) {
+        for (i = 0; i < MAX_OBD_DEVICES; i++) {
                 struct obd_device *obd = &obd_dev[i];
                 if ((strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) == 0) ||
                     (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0)) {
                         struct client_obd *cli = &obd->u.cli;
-                        if (strncmp(tgtuuid, cli->cl_target_uuid, 
+                        if (strncmp(tgtuuid, cli->cl_target_uuid,
                                     sizeof(cli->cl_target_uuid)) == 0)
                                 return obd;
                 }
@@ -107,7 +107,7 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf)
         imp->imp_connection = ptlrpc_uuid_to_connection(server_uuid);
         if (!imp->imp_connection)
                 RETURN(-ENOENT);
-        
+
         INIT_LIST_HEAD(&imp->imp_replay_list);
         INIT_LIST_HEAD(&imp->imp_sending_list);
         INIT_LIST_HEAD(&imp->imp_delayed_list);
@@ -120,7 +120,6 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf)
 
         cli->cl_max_mds_easize = sizeof(struct lov_mds_md);
 
-        MOD_INC_USE_COUNT;
         RETURN(0);
 }
 
@@ -131,7 +130,6 @@ int client_obd_cleanup(struct obd_device * obddev)
         ptlrpc_cleanup_client(&obd->cl_import);
         ptlrpc_put_connection(obd->cl_import.imp_connection);
 
-        MOD_DEC_USE_COUNT;
         return 0;
 }
 
@@ -150,12 +148,10 @@ int client_obd_connect(struct lustre_handle *conn, struct obd_device *obd,
 
         ENTRY;
         down(&cli->cl_sem);
-        MOD_INC_USE_COUNT;
         rc = class_connect(conn, obd, cluuid);
-        if (rc) {
-                MOD_DEC_USE_COUNT;
+        if (rc)
                 GOTO(out_sem, rc);
-        }
+
         cli->cl_conn_count++;
         if (cli->cl_conn_count > 1)
                 GOTO(out_sem, rc);
@@ -217,7 +213,6 @@ out_ldlm:
 out_disco:
                         cli->cl_conn_count--;
                         class_disconnect(conn);
-                        MOD_DEC_USE_COUNT;
                 }
         }
 out_sem:
@@ -251,20 +246,20 @@ int client_obd_disconnect(struct lustre_handle *conn)
 
         cli->cl_conn_count--;
         if (cli->cl_conn_count)
-                GOTO(out_disco, rc = 0);
+                GOTO(out_no_disconnect, rc = 0);
 
         ldlm_namespace_free(obd->obd_namespace);
         obd->obd_namespace = NULL;
         request = ptlrpc_prep_req(&cli->cl_import, rq_opc, 0, NULL,
                                   NULL);
         if (!request)
-                GOTO(out_disco, rc = -ENOMEM);
-        
+                GOTO(out_req, rc = -ENOMEM);
+
         request->rq_replen = lustre_msg_size(0, NULL);
 
         /* Process disconnects even if we're waiting for recovery. */
         request->rq_level = LUSTRE_CONN_RECOVD;
-        
+
         rc = ptlrpc_queue_wait(request);
         if (rc)
                 GOTO(out_req, rc);
@@ -273,12 +268,11 @@ int client_obd_disconnect(struct lustre_handle *conn)
  out_req:
         if (request)
                 ptlrpc_req_finished(request);
- out_disco:
+        list_del_init(&cli->cl_import.imp_chain);
+ out_no_disconnect:
         err = class_disconnect(conn);
         if (!rc && err)
                 rc = err;
-        list_del_init(&cli->cl_import.imp_chain);
-        MOD_DEC_USE_COUNT;
  out_sem:
         up(&cli->cl_sem);
         RETURN(rc);
index 8b3c33a..a03d2bb 100644 (file)
 #include <linux/obd_ost.h>
 #include <linux/lustre_net.h>
 
-void ost_pack_ioo(void **tmp, struct lov_stripe_md *lsm, int bufcnt)
+void ost_pack_ioo(struct obd_ioobj **tmp, struct lov_stripe_md *lsm,int bufcnt)
 {
         struct obd_ioobj *ioo = *tmp;
-        char *c = *tmp;
+        void *p = *tmp;
 
         ioo->ioo_id = HTON__u64(lsm->lsm_object_id);
         ioo->ioo_gr = HTON__u64(0);
         ioo->ioo_type = HTON__u32(S_IFREG);
         ioo->ioo_bufcnt = HTON__u32(bufcnt);
-        *tmp = c + sizeof(*ioo);
+        *tmp = p + sizeof(*ioo);
 }
 
-void ost_unpack_ioo(void **tmp, struct obd_ioobj **ioop)
+void ost_unpack_ioo(struct obd_ioobj **tmp, struct obd_ioobj **ioop)
 {
-        char *c = *tmp;
+        void *p = *tmp;
         struct obd_ioobj *ioo = *tmp;
         *ioop = *tmp;
 
@@ -49,7 +49,7 @@ void ost_unpack_ioo(void **tmp, struct obd_ioobj **ioop)
         ioo->ioo_gr = NTOH__u64(ioo->ioo_gr);
         ioo->ioo_type = NTOH__u32(ioo->ioo_type);
         ioo->ioo_bufcnt = NTOH__u32(ioo->ioo_bufcnt);
-        *tmp = c + sizeof(*ioo);
+        *tmp = p + sizeof(*ioo);
 }
 
 void ost_pack_niobuf(void **tmp, __u64 offset, __u32 len, __u32 flags,
index cb4ccda..73a4383 100644 (file)
 
 #ifdef OBD_CTXT_DEBUG
 /* Debugging check only needed during development */
-#define ASSERT_CTXT_MAGIC(magic) do { if ((magic) != OBD_RUN_CTXT_MAGIC) { \
-                                CERROR("bad ctxt magic\n"); LBUG(); } } while(0)
-#define ASSERT_NOT_KERNEL_CTXT(msg) do { if (segment_eq(get_fs(), get_ds())) { \
-                                        CERROR(msg); LBUG(); } } while(0)
-#define ASSERT_KERNEL_CTXT(msg) do { if (!segment_eq(get_fs(), get_ds())) { \
-                                        CERROR(msg); LBUG(); } } while(0)
+#define ASSERT_CTXT_MAGIC(magic) LASSERT((magic) == OBD_RUN_CTXT_MAGIC)
+#define ASSERT_NOT_KERNEL_CTXT(msg) LASSERT(!segment_eq(get_fs(), get_ds()))
+#define ASSERT_KERNEL_CTXT(msg) LASSERT(segment_eq(get_fs(), get_ds()))
 #else
 #define ASSERT_CTXT_MAGIC(magic) do {} while(0)
 #define ASSERT_NOT_KERNEL_CTXT(msg) do {} while(0)
@@ -56,6 +53,8 @@ void push_ctxt(struct obd_run_ctxt *save, struct obd_run_ctxt *new_ctx,
         */
 
         save->fs = get_fs();
+        LASSERT(atomic_read(&current->fs->pwd->d_count));
+        LASSERT(atomic_read(&new_ctx->pwd->d_count));
         save->pwd = dget(current->fs->pwd);
         save->pwdmnt = mntget(current->fs->pwdmnt);
 
@@ -218,14 +217,19 @@ int lustre_fread(struct file *file, char *str, int len, loff_t *off)
  */
 int lustre_fwrite(struct file *file, const char *str, int len, loff_t *off)
 {
+        ENTRY;
         ASSERT_KERNEL_CTXT("kernel doing write outside kernel context\n");
-        if (!file || !file->f_op || !off)
+        if (!file)
+                RETURN(-ENOENT);
+        if (!file->f_op)
                 RETURN(-ENOSYS);
+        if (!off)
+                RETURN(-EINVAL);
 
         if (!file->f_op->write)
                 RETURN(-EROFS);
 
-        return file->f_op->write(file, str, len, off);
+        RETURN(file->f_op->write(file, str, len, off));
 }
 
 /*
@@ -234,9 +238,10 @@ int lustre_fwrite(struct file *file, const char *str, int len, loff_t *off)
  */
 int lustre_fsync(struct file *file)
 {
+        ENTRY;
         ASSERT_KERNEL_CTXT("kernel doing sync outside kernel context\n");
         if (!file || !file->f_op || !file->f_op->fsync)
                 RETURN(-ENOSYS);
 
-        return file->f_op->fsync(file, file->f_dentry, 0);
+        RETURN(file->f_op->fsync(file, file->f_dentry, 0));
 }
index 141e155..3889f1c 100644 (file)
 #include <linux/lustre_net.h>
 #include <linux/lustre_dlm.h>
 
+/*
+ * Fill in @conn for a client whose UUID already has an export (@exp).
+ *
+ * If the export has no connection yet, @conn is pointed at @exp (address
+ * plus cookie) and 0 is returned.  If the export already has a connection,
+ * @conn is compared against the handle saved in the export's LDLM import:
+ *   - match:    treated as a reconnect; @conn is pointed at @exp and the
+ *               POSITIVE value EALREADY is returned;
+ *   - mismatch: @conn is zeroed and the NEGATIVE value -EALREADY is returned.
+ *
+ * @cluuid is only used in the log messages.
+ */
+int target_handle_reconnect(struct lustre_handle *conn, struct obd_export *exp,
+                            char *cluuid)
+{
+        /* Pair the RETURN()s below with an ENTRY so call tracing balances. */
+        ENTRY;
+
+        if (exp->exp_connection) {
+                struct lustre_handle *hdl;
+                hdl = &exp->exp_ldlm_data.led_import.imp_handle;
+                /* Might be a re-connect after a partition. */
+                if (!memcmp(conn, hdl, sizeof *conn)) {
+                        CERROR("%s reconnecting\n", cluuid);
+                        conn->addr = (__u64) (unsigned long)exp;
+                        conn->cookie = exp->exp_cookie;
+                        /* Positive on purpose: distinguishes a matching
+                         * reconnect from the -EALREADY mismatch below. */
+                        RETURN(EALREADY);
+                } else {
+                        CERROR("%s reconnecting from %s, "
+                               "handle mismatch (ours "LPX64"/"LPX64", "
+                               "theirs "LPX64"/"LPX64")\n", cluuid,
+                               exp->exp_connection->c_remote_uuid, hdl->addr,
+                               hdl->cookie, conn->addr, conn->cookie);
+                        /* XXX disconnect them here? */
+                        memset(conn, 0, sizeof *conn);
+                        /* This is a little scary, but right now we build this
+                         * file separately into each server module, so I won't
+                         * go _immediately_ to hell.
+                         */
+                        RETURN(-EALREADY);
+                }
+        }
+
+        /* Export exists but has never been connected: hand it out as-is. */
+        conn->addr = (__u64) (unsigned long)exp;
+        conn->cookie = exp->exp_cookie;
+        CDEBUG(D_INFO, "existing export for UUID '%s' at %p\n", cluuid, exp);
+        CDEBUG(D_IOCTL,"connect: addr %Lx cookie %Lx\n",
+               (long long)conn->addr, (long long)conn->cookie);
+        RETURN(0);
+}
+
 int target_handle_connect(struct ptlrpc_request *req)
 {
         struct obd_device *target;
@@ -73,6 +109,9 @@ int target_handle_connect(struct ptlrpc_request *req)
         if (rc && rc != EALREADY)
                 GOTO(out, rc);
 
+        /* If all else goes well, this is our RPC return code. */
+        req->rq_status = rc;
+
         rc = lustre_pack_msg(0, NULL, NULL, &req->rq_replen, &req->rq_repmsg);
         if (rc)
                 GOTO(out, rc);
@@ -100,10 +139,15 @@ int target_handle_connect(struct ptlrpc_request *req)
         dlmimp->imp_handle.addr = req->rq_reqmsg->addr;
         dlmimp->imp_handle.cookie = req->rq_reqmsg->cookie;
         dlmimp->imp_obd = /* LDLM! */ NULL;
+        dlmimp->imp_recover = NULL;
+        INIT_LIST_HEAD(&dlmimp->imp_replay_list);
+        INIT_LIST_HEAD(&dlmimp->imp_sending_list);
+        INIT_LIST_HEAD(&dlmimp->imp_delayed_list);
         spin_lock_init(&dlmimp->imp_lock);
         dlmimp->imp_level = LUSTRE_CONN_FULL;
 out:
-        req->rq_status = rc;
+        if (rc)
+                req->rq_status = rc;
         RETURN(rc);
 }
 
index 071c0fd..c536a0a 100644 (file)
@@ -9,13 +9,8 @@ MODULE = llite
 modulefs_DATA = llite.o
 EXTRA_PROGRAMS = llite
 
-LINX= ll_pack.c
-
 llite_SOURCES = dcache.c commit_callback.c super.c rw.c super25.c
-llite_SOURCES += file.c dir.c sysctl.c symlink.c $(LINX)
+llite_SOURCES += file.c dir.c sysctl.c symlink.c
 llite_SOURCES += recover.c namei.c lproc_llite.c
 
-ll_pack.c:
-       test -e ll_pack.c || ln -sf $(top_srcdir)/lib/ll_pack.c .
-
 include $(top_srcdir)/Rules
index e5a595a..a62716b 100644 (file)
@@ -1,10 +1,10 @@
 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
  * vim:expandtab:shiftwidth=8:tabstop=8:
  *
- *  The daemon that causes completed but not committed transactions 
+ *  The daemon that causes completed but not committed transactions
  *   on the MDS to be flushed periodically when they are committed.
- *   A gratuitous getattr RPC is made to the MDS to discover the 
- *   last committed record. 
+ *   A gratuitous getattr RPC is made to the MDS to discover the
+ *   last committed record.
  *
  *  Lustre High Availability Daemon
  *
 
 static int ll_commitcbd_check_event(struct ll_sb_info *sbi)
 {
-        int rc = 0; 
+        int rc = 0;
         ENTRY;
 
-        spin_lock(&sbi->ll_commitcbd_lock); 
-        if (sbi->ll_commitcbd_flags & LL_COMMITCBD_STOPPING) { 
+        spin_lock(&sbi->ll_commitcbd_lock);
+        if (sbi->ll_commitcbd_flags & LL_COMMITCBD_STOPPING)
                 GOTO(out, rc = 1);
-        }
 
+        EXIT;
  out:
         spin_unlock(&sbi->ll_commitcbd_lock);
-        RETURN(rc);
+        return rc;
 }
 
 static int ll_commitcbd_main(void *arg)
 {
         struct ll_sb_info *sbi = (struct ll_sb_info *)arg;
-
+        unsigned long flags;
         ENTRY;
 
         lock_kernel();
         daemonize();
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-        spin_lock_irq(&current->sigmask_lock);
+        spin_lock_irqsave(&current->sigmask_lock, flags);
         sigfillset(&current->blocked);
         our_recalc_sigpending(current);
-        spin_unlock_irq(&current->sigmask_lock);
+        spin_unlock_irqrestore(&current->sigmask_lock, flags);
 #else
         sigfillset(&current->blocked);
         our_recalc_sigpending(current);
@@ -80,19 +80,19 @@ static int ll_commitcbd_main(void *arg)
 
         /* And now, loop forever on requests */
         while (1) {
-                wait_event(sbi->ll_commitcbd_waitq, 
+                wait_event(sbi->ll_commitcbd_waitq,
                            ll_commitcbd_check_event(sbi));
 
                 spin_lock(&sbi->ll_commitcbd_lock);
                 if (sbi->ll_commitcbd_flags & LL_COMMITCBD_STOPPING) {
                         spin_unlock(&sbi->ll_commitcbd_lock);
-                        CERROR("lustre_commitd quitting\n"); 
+                        CERROR("lustre_commitd quitting\n");
                         EXIT;
                         break;
                 }
 
                 schedule_timeout(sbi->ll_commitcbd_timeout);
-                CERROR("commit callback daemon woken up - FIXME\n"); 
+                CERROR("commit callback daemon woken up - FIXME\n");
                 spin_unlock(&sbi->ll_commitcbd_lock);
         }
 
@@ -116,7 +116,7 @@ int ll_commitcbd_setup(struct ll_sb_info *sbi)
                 CERROR("cannot start thread\n");
                 RETURN(rc);
         }
-        wait_event(sbi->ll_commitcbd_ctl_waitq, 
+        wait_event(sbi->ll_commitcbd_ctl_waitq,
                    sbi->ll_commitcbd_flags & LL_COMMITCBD_RUNNING);
         RETURN(0);
 }
index ab9596f..921eea2 100644 (file)
@@ -91,7 +91,7 @@ static int ll_dir_readpage(struct file *file, struct page *page)
                 unlock_page(page);
                 RETURN(rc);
         }
-        ldlm_lock_dump_handle(&lockh);
+        ldlm_lock_dump_handle(D_OTHER, &lockh);
 
         if (PageUptodate(page)) {
                 CERROR("Explain this please?\n");
@@ -745,7 +745,69 @@ not_empty:
         return 0;
 }
 
+/*
+ * ioctl handler for Lustre directories.
+ *
+ * IOC_MDC_LOOKUP: takes the name supplied in ioc_inlbuf1, asks the MDS for
+ * its attributes via mdc_getattr_name() (passing this directory's inode),
+ * and hands the resulting mode/uid/gid back to the caller in ioc_obdo1.
+ * Any other command is logged and rejected with -ENOTTY.
+ */
+static int ll_dir_ioctl(struct inode *inode, struct file *file,
+                        unsigned int cmd, unsigned long arg)
+{
+        struct ll_sb_info *sbi = ll_i2sbi(inode);
+        struct obd_ioctl_data *data;
+        ENTRY;
+
+        switch(cmd) {
+        case IOC_MDC_LOOKUP: {
+                struct ptlrpc_request *request = NULL;
+                char *buf = NULL;
+                char *filename;
+                int namelen, rc, err, len = 0;
+                int ea_size = 0; // obd_size_wiremd(&sbi->ll_osc_conn, NULL);
+                unsigned long valid;
+
+                /* Fetch the ioctl argument into buf/len; buf is released
+                 * with OBD_FREE() at the "out" label below. */
+                rc = obd_ioctl_getdata(&buf, &len, (void *)arg);
+                if (rc)
+                        RETURN(rc);
+                data = (void *)buf;
+
+                filename = data->ioc_inlbuf1;
+                namelen = data->ioc_inllen1;
+
+                if (namelen < 1) {
+                        CERROR("IOC_MDC_LOOKUP missing filename\n");
+                        GOTO(out, rc = -EINVAL);
+                }
+
+                valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE;
+                rc = mdc_getattr_name(&sbi->ll_mdc_conn, inode, filename,
+                                      namelen, valid, ea_size, &request);
+                if (rc < 0) {
+                        CERROR("mdc_getattr_name: %d\n", rc);
+                        /* NOTE(review): this skips out_req, so request is
+                         * not finished here -- confirm mdc_getattr_name()
+                         * never leaves a request behind on failure. */
+                        GOTO(out, rc);
+                } else {
+                        struct mds_body *body;
+                        body = lustre_msg_buf(request->rq_repmsg, 0);
+                        /* surely there's a better way -phik */
+                        data->ioc_obdo1.o_mode = body->mode;
+                        data->ioc_obdo1.o_uid = body->uid;
+                        data->ioc_obdo1.o_gid = body->gid;
+                }
+
+                /* Return the updated ioctl data to the caller. */
+                err = copy_to_user((void *)arg, buf, len);
+                if (err)
+                        GOTO(out_req, rc = -EFAULT);
+
+                EXIT;
+        out_req:
+                ptlrpc_req_finished(request);
+        out:
+                OBD_FREE(buf, len);
+                return rc;
+        }
+        default:
+                CERROR("unrecognized ioctl %#x\n", cmd);
+                RETURN(-ENOTTY);
+        }
+}
+
 struct file_operations ll_dir_operations = {
         read: generic_read_dir,
-        readdir: ll_readdir
+        readdir: ll_readdir,
+        ioctl: ll_dir_ioctl
 };
index 87c9012..6b37d99 100644 (file)
 int ll_inode_setattr(struct inode *inode, struct iattr *attr, int do_trunc);
 extern int ll_setattr(struct dentry *de, struct iattr *attr);
 
-int ll_create_objects(struct super_block *sb, obd_id id, uid_t uid, gid_t gid,
-                      struct lov_stripe_md **lsmp)
+/*
+ * Open @inode on the MDS and attach per-open state to @file.
+ *
+ * Allocates a struct ll_file_data, seeds its MDS handle, issues mdc_open()
+ * (passing along @lmm/@lmm_size), and on success stores the file data in
+ * file->private_data and clears O_LOV_DELAY_CREATE from file->f_flags.
+ *
+ * Returns 0 on success, -ENOMEM if the file data cannot be allocated, or
+ * -abs(rc) of the mdc_open() result on failure (file data is freed).
+ */
+static int ll_mdc_open(struct lustre_handle *mdc_conn, struct inode *inode,
+                       struct file *file, struct lov_mds_md *lmm, int lmm_size)
+{
+        struct ptlrpc_request *req = NULL;
+        struct ll_file_data *fd;
+        int rc;
+        ENTRY;
+
+        LASSERT(!file->private_data);
+
+        fd = kmem_cache_alloc(ll_file_data_slab, SLAB_KERNEL);
+        if (!fd)
+                RETURN(-ENOMEM);
+
+        memset(fd, 0, sizeof(*fd));
+        /* Seed the handle with our own values; the check after mdc_open()
+         * below expects addr to have been replaced. */
+        fd->fd_mdshandle.addr = (__u64)(unsigned long)file;
+        get_random_bytes(&fd->fd_mdshandle.cookie,
+                         sizeof(fd->fd_mdshandle.cookie));
+
+        rc = mdc_open(mdc_conn, inode->i_ino, S_IFREG | inode->i_mode,
+                      file->f_flags, lmm, lmm_size, &fd->fd_mdshandle, &req);
+
+        /* This is the "reply" refcount. */
+        ptlrpc_req_finished(req);
+
+        if (rc)
+                GOTO(out_fd, rc);
+
+        /* req is kept after the put above; presumably another reference
+         * keeps it alive until ll_mdc_close() -- TODO confirm. */
+        fd->fd_req = req;
+        file->private_data = fd;
+
+        if (!fd->fd_mdshandle.addr ||
+            fd->fd_mdshandle.addr == (__u64)(unsigned long)file) {
+                CERROR("hmm, mdc_open didn't assign fd_mdshandle?\n");
+                /* XXX handle this how, abort or is it non-fatal? */
+        }
+
+        file->f_flags &= ~O_LOV_DELAY_CREATE;
+        RETURN(0);
+
+out_fd:
+        /* Poison the handle before freeing so stale use is recognisable. */
+        fd->fd_mdshandle.cookie = DEAD_HANDLE_MAGIC;
+        kmem_cache_free(ll_file_data_slab, fd);
+
+        return -abs(rc);
+}
+
+/*
+ * Close @file on the MDS and release its struct ll_file_data.
+ *
+ * Sends mdc_close() for the handle saved at open time, then settles the
+ * saved open request (fd->fd_req): if it carries a transno its REPLAY flag
+ * is cleared and the reference is left in place, otherwise the reference
+ * is dropped.  The file data is always poisoned, detached and freed.
+ *
+ * Returns -abs(rc) of the mdc_close() result (0 on success).
+ *
+ * NOTE(review): @mdc_conn is not used; the connection is taken from
+ * ll_i2sbi(inode)->ll_mdc_conn instead.
+ */
+static int ll_mdc_close(struct lustre_handle *mdc_conn, struct inode *inode,
+                        struct file *file)
+{
+        struct ll_file_data *fd = file->private_data;
+        struct ptlrpc_request *req = NULL;
+        unsigned long flags;
+        struct obd_import *imp = fd->fd_req->rq_import;
+        int rc;
+
+        /* Complete the open request and remove it from replay list */
+        DEBUG_REQ(D_HA, fd->fd_req, "matched open req %p", fd->fd_req);
+        rc = mdc_close(&ll_i2sbi(inode)->ll_mdc_conn, inode->i_ino,
+                       inode->i_mode, &fd->fd_mdshandle, &req);
+
+        /* A failed close is only logged; local teardown continues. */
+        if (rc)
+                CERROR("inode %lu close failed: rc = %d\n", inode->i_ino, rc);
+        ptlrpc_req_finished(req);
+
+        spin_lock_irqsave(&imp->imp_lock, flags);
+        if (fd->fd_req->rq_transno) {
+                /* This caused an EA to be written, need to replay as a normal
+                 * transaction now.  Our reference is now effectively owned
+                 * by the imp_replay_list, and we'll be committed just like
+                 * other transno-having requests now.
+                 */
+                fd->fd_req->rq_flags &= ~PTL_RPC_FL_REPLAY;
+                spin_unlock_irqrestore(&imp->imp_lock, flags);
+        } else {
+                /* No transno means that we can just drop our ref. */
+                spin_unlock_irqrestore(&imp->imp_lock, flags);
+                ptlrpc_req_finished(fd->fd_req);
+        }
+        /* Poison the handle before freeing so stale use is recognisable. */
+        fd->fd_mdshandle.cookie = DEAD_HANDLE_MAGIC;
+        file->private_data = NULL;
+        kmem_cache_free(ll_file_data_slab, fd);
+
+        return -abs(rc);
+}
+
+static int ll_osc_open(struct lustre_handle *conn, struct inode *inode,
+                       struct file *file, struct lov_stripe_md *lsm)
+{
+        struct ll_file_data *fd;
         struct obdo *oa;
         int rc;
         ENTRY;
@@ -43,231 +130,245 @@ int ll_create_objects(struct super_block *sb, obd_id id, uid_t uid, gid_t gid,
         oa = obdo_alloc();
         if (!oa)
                 RETURN(-ENOMEM);
+        oa->o_id = lsm->lsm_object_id;
+        oa->o_mode = S_IFREG;
+        oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE |
+                OBD_MD_FLBLOCKS;
+        rc = obd_open(conn, oa, lsm);
+        if (rc)
+                GOTO(out, rc);
 
-        oa->o_mode = S_IFREG | 0600;
-        oa->o_id = id;
-        oa->o_uid = uid;
-        oa->o_gid = gid;
-        oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMODE |
-                OBD_MD_FLUID | OBD_MD_FLGID;
-        rc = obd_create(ll_s2obdconn(sb), oa, lsmp);
-        obdo_free(oa);
+        obdo_to_inode(inode, oa, OBD_MD_FLSIZE | OBD_MD_FLBLOCKS);
 
-        if (!rc)
-                LASSERT(*lsmp && (*lsmp)->lsm_object_id);
+        fd = file->private_data;
+        obd_oa2handle(&fd->fd_osthandle, oa);
+
+        atomic_inc(&ll_i2info(inode)->lli_open_count);
+out:
+        obdo_free(oa);
         RETURN(rc);
 }
 
-static int ll_file_open(struct inode *inode, struct file *file)
+/* Caller must hold lli_open_sem to keep lli->lli_smd from changing and to
+ * prevent duplicate objects from being created.  We only install lsm in
+ * lli_smd if the mdc open was successful (and hence stored the stripe MD on
+ * the MDS); otherwise other nodes could try to create different objects for
+ * the same file.
+ */
+static int ll_create_open_obj(struct lustre_handle *conn, struct inode *inode,
+                              struct file *file, struct lov_stripe_md *lsm)
 {
-        struct ll_sb_info *sbi = ll_i2sbi(inode);
         struct ll_inode_info *lli = ll_i2info(inode);
-        struct lustre_handle *conn = ll_i2obdconn(inode);
-        struct ptlrpc_request *req = NULL;
-        struct ll_file_data *fd;
-        struct obdo *oa;
-        struct lov_stripe_md *lsm;
         struct lov_mds_md *lmm = NULL;
         int lmm_size = 0;
-        int rc = 0;
+        struct obdo *oa;
+        int rc, err;
         ENTRY;
 
-        LASSERT(!file->private_data);
-
-        lsm = lli->lli_smd;
+        oa = obdo_alloc();
+        if (!oa)
+                RETURN(-ENOMEM);
 
-        /*  delayed create of object (intent created inode) */
-        /*  XXX object needs to be cleaned up if mdc_open fails */
-        /*  XXX error handling appropriate here? */
-        if (lsm == NULL) {
-                if (file->f_flags & O_LOV_DELAY_CREATE) {
-                        CDEBUG(D_INODE, "delaying object creation\n");
-                        RETURN(0);
-                }
-                down(&lli->lli_open_sem);
-                /* Check to see if we lost the race */
-                if (!lli->lli_smd)
-                        rc = ll_create_objects(inode->i_sb, inode->i_ino, 0, 0,
-                                               &lli->lli_smd);
-                up(&lli->lli_open_sem);
-                if (rc)
-                        RETURN(rc);
+        oa->o_mode = S_IFREG | 0600;
+        oa->o_id = inode->i_ino;
+        /* Keep these 0 for now, because chown/chgrp does not change the
+         * ownership on the OST, and we don't want to allow BA OST NFS
+         * users to access these objects by mistake.
+         */
+        oa->o_uid = 0;
+        oa->o_gid = 0;
+        oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMODE |
+                OBD_MD_FLUID | OBD_MD_FLGID;
 
-                lsm = lli->lli_smd;
+        rc = obd_create(conn, oa, &lsm);
+        if (rc) {
+                CERROR("error creating objects for inode %lu: rc = %d\n",
+                       inode->i_ino, rc);
+                GOTO(out_oa, rc);
         }
 
-        /* XXX We should only send this to MDS if we just created these
-         *     objects, except we also need to handle the user-stripe case.
-         */
-        rc = obd_packmd(conn, &lmm, lli->lli_smd);
+        LASSERT(lsm && lsm->lsm_object_id);
+        rc = obd_packmd(conn, &lmm, lsm);
         if (rc < 0)
-                GOTO(out, rc);
+                GOTO(out_destroy, rc);
 
         lmm_size = rc;
 
-        fd = kmem_cache_alloc(ll_file_data_slab, SLAB_KERNEL);
-        if (!fd) {
-                if (lmm)
-                        obd_free_wiremd(conn, &lmm);
-                GOTO(out, rc = -ENOMEM);
-        }
-        memset(fd, 0, sizeof(*fd));
+        rc = ll_mdc_open(&ll_i2sbi(inode)->ll_mdc_conn,inode,file,lmm,lmm_size);
 
-        fd->fd_mdshandle.addr = (__u64)(unsigned long)file;
-        get_random_bytes(&fd->fd_mdshandle.cookie,
-                         sizeof(fd->fd_mdshandle.cookie));
-        rc = mdc_open(&sbi->ll_mdc_conn, inode->i_ino, S_IFREG | inode->i_mode,
-                      file->f_flags, lmm, lmm_size, &fd->fd_mdshandle, &req);
-        if (lmm)
-                obd_free_wiremd(conn, &lmm);
-        fd->fd_req = req;
+        obd_free_wiremd(conn, &lmm);
 
-        /* This is the "reply" refcount. */
-        ptlrpc_req_finished(req);
-        if (rc)
-                GOTO(out_req, -abs(rc));
-        if (!fd->fd_mdshandle.addr ||
-            fd->fd_mdshandle.addr == (__u64)(unsigned long)file) {
-                CERROR("hmm, mdc_open didn't assign fd_mdshandle?\n");
-                /* XXX handle this how, abort or is it non-fatal? */
+        /* If we couldn't complete mdc_open() and store the stripe MD on the
+         * MDS, we need to destroy the objects now or they will be leaked.
+         */
+        if (rc) {
+                CERROR("error MDS opening %lu with delayed create: rc %d\n",
+                       inode->i_ino, rc);
+                GOTO(out_destroy, rc);
         }
+        lli->lli_smd = lsm;
 
-        oa = obdo_alloc();
-        if (!oa)
-                GOTO(out_mdc, rc = -EINVAL);
+        EXIT;
+out_oa:
+        obdo_free(oa);
+        return rc;
 
+out_destroy:
+        obdo_from_inode(oa, inode, OBD_MD_FLTYPE);
         oa->o_id = lsm->lsm_object_id;
-        oa->o_mode = S_IFREG;
-        oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE |
-                OBD_MD_FLBLOCKS;
-        rc = obd_open(ll_i2obdconn(inode), oa, lsm);
-        obdo_to_inode(inode, oa, oa->o_valid & (OBD_MD_FLSIZE|OBD_MD_FLBLOCKS));
+        oa->o_valid |= OBD_MD_FLID;
+        err = obd_destroy(conn, oa, lsm);
+        obd_free_memmd(conn, &lsm);
+        if (err)
+                CERROR("error uncreating inode %lu objects: rc %d\n",
+                       inode->i_ino, err);
+        goto out_oa;
+}
 
-        obd_oa2handle(&fd->fd_osthandle, oa);
-        obdo_free(oa);
+/* Open a file, and (for the very first open) create objects on the OSTs at
+ * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
+ * creation or open until ll_lov_setstripe() ioctl is called.  We grab
+ * lli_open_sem to ensure no other process will create objects, send the
+ * stripe MD to the MDS, or try to destroy the objects if that fails.
+ *
+ * If we already have the stripe MD locally, we don't request it in
+ * mdc_open() by passing a lmm_size = 0.
+ *
+ * It is up to the application to ensure no other processes open this file
+ * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
+ * used.  We might be able to avoid races of that sort by getting lli_open_sem
+ * before returning in the O_LOV_DELAY_CREATE case and dropping it here
+ * or in ll_file_release(), but I'm not sure that is desirable/necessary.
+ */
+static int ll_file_open(struct inode *inode, struct file *file)
+{
+        struct ll_sb_info *sbi = ll_i2sbi(inode);
+        struct ll_inode_info *lli = ll_i2info(inode);
+        struct lustre_handle *conn = ll_i2obdconn(inode);
+        struct lov_stripe_md *lsm;
+        int rc = 0;
+        ENTRY;
 
-        if (rc)
-                GOTO(out_mdc, rc = -abs(rc));
+        lsm = lli->lli_smd;
+        if (lsm == NULL) {
+                if (file->f_flags & O_LOV_DELAY_CREATE) {
+                        CDEBUG(D_INODE, "delaying object creation\n");
+                        RETURN(0);
+                }
 
-        atomic_inc(&lli->lli_open_count);
+                down(&lli->lli_open_sem);
+                if (!lli->lli_smd) {
+                        rc = ll_create_open_obj(conn, inode, file, NULL);
+                        up(&lli->lli_open_sem);
+                } else {
+                        CERROR("stripe already set on ino %lu\n", inode->i_ino);
+                        up(&lli->lli_open_sem);
+                        rc = ll_mdc_open(&sbi->ll_mdc_conn, inode, file,NULL,0);
+                }
+                lsm = lli->lli_smd;
+        } else
+                rc = ll_mdc_open(&sbi->ll_mdc_conn, inode, file, NULL, 0);
 
-        file->private_data = fd;
+        if (rc)
+                RETURN(rc);
 
+        rc = ll_osc_open(conn, inode, file, lsm);
+        if (rc)
+                GOTO(out_close, rc);
         RETURN(0);
-out_mdc:
-        mdc_close(&sbi->ll_mdc_conn, inode->i_ino,
-                  S_IFREG, &fd->fd_mdshandle, &req);
-out_req:
-        ptlrpc_req_finished(req); /* once for an early "commit" */
-//out_fd:
-        fd->fd_mdshandle.cookie = DEAD_HANDLE_MAGIC;
-        kmem_cache_free(ll_file_data_slab, fd);
-out:
+out_close:
+        ll_mdc_close(&sbi->ll_mdc_conn, inode, file);
         return rc;
 }
 
 int ll_size_lock(struct inode *inode, struct lov_stripe_md *lsm, obd_off start,
-                 int mode, struct lustre_handle **lockhs_p)
+                 int mode, struct lustre_handle *lockh)
 {
         struct ll_sb_info *sbi = ll_i2sbi(inode);
         struct ldlm_extent extent;
-        struct lustre_handle *lockhs = NULL;
-        int rc, flags = 0, stripe_count;
+        int rc, flags = 0;
         ENTRY;
 
-        if (sbi->ll_flags & LL_SBI_NOLCK) {
-                *lockhs_p = NULL;
+        /* XXX phil: can we do this?  won't it screw the file size up? */
+        if (sbi->ll_flags & LL_SBI_NOLCK)
                 RETURN(0);
-        }
-
-        stripe_count = lsm->lsm_stripe_count;
-        if (!stripe_count)
-                stripe_count = 1;
-
-        OBD_ALLOC(lockhs, stripe_count * sizeof(*lockhs));
-        if (lockhs == NULL)
-                RETURN(-ENOMEM);
 
         extent.start = start;
         extent.end = OBD_OBJECT_EOF;
 
         rc = obd_enqueue(&sbi->ll_osc_conn, lsm, NULL, LDLM_EXTENT, &extent,
                          sizeof(extent), mode, &flags, ll_lock_callback,
-                         inode, sizeof(*inode), lockhs);
-        if (rc != ELDLM_OK) {
-                CERROR("lock enqueue: %d\n", rc);
-                OBD_FREE(lockhs, stripe_count * sizeof(*lockhs));
-        } else
-                *lockhs_p = lockhs;
+                         inode, sizeof(*inode), lockh);
         RETURN(rc);
 }
 
 int ll_size_unlock(struct inode *inode, struct lov_stripe_md *lsm, int mode,
-                   struct lustre_handle *lockhs)
+                   struct lustre_handle *lockh)
 {
         struct ll_sb_info *sbi = ll_i2sbi(inode);
-        int rc, stripe_count;
+        int rc;
         ENTRY;
 
+        /* XXX phil: can we do this?  won't it screw the file size up? */
         if (sbi->ll_flags & LL_SBI_NOLCK)
                 RETURN(0);
 
-        if (lockhs == NULL) {
-                LBUG();
-                RETURN(-EINVAL);
-        }
-
-        rc = obd_cancel(&sbi->ll_osc_conn, lsm, mode, lockhs);
+        rc = obd_cancel(&sbi->ll_osc_conn, lsm, mode, lockh);
         if (rc != ELDLM_OK) {
                 CERROR("lock cancel: %d\n", rc);
                 LBUG();
         }
 
-        stripe_count = lsm->lsm_stripe_count;
-        if (!stripe_count)
-                stripe_count = 1;
-
-        OBD_FREE(lockhs, stripe_count * sizeof(*lockhs));
         RETURN(rc);
 }
 
 int ll_file_size(struct inode *inode, struct lov_stripe_md *lsm)
 {
         struct ll_sb_info *sbi = ll_i2sbi(inode);
-        struct lustre_handle *lockhs;
+        //struct lustre_handle lockh = { 0, 0 };
         struct obdo oa;
-        int err, rc;
+        //int err;
+        int rc;
         ENTRY;
 
         LASSERT(lsm);
         LASSERT(sbi);
 
-        rc = ll_size_lock(inode, lsm, 0, LCK_PR, &lockhs);
+        /* XXX do not yet need size lock - OST size always correct (sync write)
+        rc = ll_size_lock(inode, lsm, 0, LCK_PR, &lockh);
         if (rc != ELDLM_OK) {
                 CERROR("lock enqueue: %d\n", rc);
                 RETURN(rc);
         }
+        */
 
         memset(&oa, 0, sizeof oa);
         oa.o_id = lsm->lsm_object_id;
         oa.o_mode = S_IFREG;
         oa.o_valid = OBD_MD_FLID|OBD_MD_FLTYPE|OBD_MD_FLSIZE|OBD_MD_FLBLOCKS;
         rc = obd_getattr(&sbi->ll_osc_conn, &oa, lsm);
-        if (!rc)
-                obdo_to_inode(inode, &oa,
-                              oa.o_valid & ~(OBD_MD_FLTYPE | OBD_MD_FLMODE));
-
-        err = ll_size_unlock(inode, lsm, LCK_PR, lockhs);
+        if (!rc) {
+                obdo_to_inode(inode, &oa, OBD_MD_FLSIZE | OBD_MD_FLBLOCKS);
+                CDEBUG(D_INODE, LPX64" size %Lu/%Lu\n",
+                       lsm->lsm_object_id, inode->i_size, inode->i_size);
+        }
+        /* XXX do not need size lock, because OST size always correct (sync write)
+        err = ll_size_unlock(inode, lsm, LCK_PR, &lockh);
         if (err != ELDLM_OK) {
                 CERROR("lock cancel: %d\n", err);
-                LBUG();
+                if (!rc)
+                        rc = err;
         }
+        */
         RETURN(rc);
 }
 
+/* While this returns an error code, the caller fput() does not, so we need
+ * to make every effort to clean up all of our state here.  Also, applications
+ * rarely check close errors and even if an error is returned they will not
+ * re-try the close call.
+ */
 static int ll_file_release(struct inode *inode, struct file *file)
 {
-        struct ptlrpc_request *req = NULL;
         struct ll_file_data *fd;
         struct obdo oa;
         struct ll_sb_info *sbi = ll_i2sbi(inode);
@@ -278,93 +379,34 @@ static int ll_file_release(struct inode *inode, struct file *file)
         ENTRY;
 
         fd = (struct ll_file_data *)file->private_data;
-        if (!fd) {
-                LASSERT(file->f_flags & O_LOV_DELAY_CREATE);
-                GOTO(out, rc = 0);
-        }
+        if (!fd) /* no process opened the file after an mcreate */
+                RETURN(rc = 0);
 
         memset(&oa, 0, sizeof(oa));
         oa.o_id = lsm->lsm_object_id;
         oa.o_mode = S_IFREG;
         oa.o_valid = OBD_MD_FLTYPE | OBD_MD_FLID;
         obd_handle2oa(&oa, &fd->fd_osthandle);
-        rc = obd_close(ll_i2obdconn(inode), &oa, lsm);
+        rc = obd_close(&sbi->ll_osc_conn, &oa, lsm);
         if (rc)
-                GOTO(out_mdc, rc = -abs(rc));
-
-#if 0
-#error "This should only be done on the node that already has the EOF lock"
-#error "and only in the case where the file size actually changed.  For now"
-#error "we don't care about the size on the MDS, since we never use it (the"
-#error "OST always has the authoritative size and we don't even use the MDS."
-        /* If this fails and we goto out_fd, the file size on the MDS is out of
-         * date.  Is that a big deal? */
-        if (file->f_mode & FMODE_WRITE) {
-                struct lustre_handle *lockhs;
-
-                rc = ll_size_lock(inode, lsm, 0, LCK_PR, &lockhs);
-                if (rc)
-                        GOTO(out_mdc, -abs(rc));
-
-                oa.o_id = lsm->lsm_object_id;
-                oa.o_mode = S_IFREG;
-                oa.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE |
-                        OBD_MD_FLBLOCKS;
-                rc = obd_getattr(&sbi->ll_osc_conn, &oa, lsm);
-                if (!rc) {
-                        struct iattr attr;
-                        attr.ia_valid = (ATTR_MTIME | ATTR_CTIME | ATTR_ATIME |
-                                         ATTR_SIZE);
-                        attr.ia_mtime = inode->i_mtime;
-                        attr.ia_ctime = inode->i_ctime;
-                        attr.ia_atime = inode->i_atime;
-                        attr.ia_size = oa.o_size;
-
-                        inode->i_blocks = oa.o_blocks;
-
-                        /* XXX: this introduces a small race that we should
-                         * evaluate */
-                        rc = ll_inode_setattr(inode, &attr, 0);
-                }
-                rc2 = ll_size_unlock(inode, lli->lli_smd, LCK_PR, lockhs);
-                if (rc2) {
-                        CERROR("lock cancel: %d\n", rc);
-                        LBUG();
-                        if (!rc)
-                                rc = rc2;
-                }
-        }
-#endif
+                CERROR("inode %lu object close failed: rc = %d\n",
+                       inode->i_ino, rc);
 
-out_mdc:
-        rc2 = mdc_close(&sbi->ll_mdc_conn, inode->i_ino,
-                        S_IFREG, &fd->fd_mdshandle, &req);
-        ptlrpc_req_finished(req);
-        if (rc2) {
-                if (!rc)
-                        rc = -abs(rc2);
-                GOTO(out_fd, rc);
-        }
-        DEBUG_REQ(D_HA, fd->fd_req, "matched open for this close: ");
-        ptlrpc_req_finished(fd->fd_req);
+        rc2 = ll_mdc_close(&sbi->ll_mdc_conn, inode, file);
+        if (rc2 && !rc)
+                rc = rc2;
 
         if (atomic_dec_and_test(&lli->lli_open_count)) {
                 CDEBUG(D_INFO, "last close, cancelling unused locks\n");
-                rc = obd_cancel_unused(ll_i2obdconn(inode), lsm, 0);
-                if (rc)
+                rc2 = obd_cancel_unused(&sbi->ll_osc_conn, lsm, 0);
+                if (rc2 && !rc) {
+                        rc = rc2;
                         CERROR("obd_cancel_unused: %d\n", rc);
-        } else {
+                }
+        } else
                 CDEBUG(D_INFO, "not last close, not cancelling unused locks\n");
-        }
-
-        EXIT;
 
-out_fd:
-        fd->fd_mdshandle.cookie = DEAD_HANDLE_MAGIC;
-        file->private_data = NULL;
-        kmem_cache_free(ll_file_data_slab, fd);
-out:
-        return rc;
+        RETURN(rc);
 }
 
 static inline void ll_remove_suid(struct inode *inode)
@@ -401,7 +443,7 @@ int ll_lock_callback(struct ldlm_lock *lock, struct ldlm_lock_desc *new,
                      void *data, __u32 data_len, int flag)
 {
         struct inode *inode = data;
-        struct lustre_handle lockh;
+        struct lustre_handle lockh = { 0, 0 };
         int rc;
         ENTRY;
 
@@ -438,7 +480,7 @@ static ssize_t ll_file_read(struct file *filp, char *buf, size_t count,
         struct ll_file_data *fd = (struct ll_file_data *)filp->private_data;
         struct inode *inode = filp->f_dentry->d_inode;
         struct ll_sb_info *sbi = ll_i2sbi(inode);
-        struct lustre_handle *lockhs = NULL;
+        struct lustre_handle lockh = { 0, 0 };
         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
         int flags = 0;
         ldlm_error_t err;
@@ -449,17 +491,13 @@ static ssize_t ll_file_read(struct file *filp, char *buf, size_t count,
          * call us */
         retval = ll_file_size(inode, lsm);
         if (retval < 0) {
-                CERROR("ll_file_size: %d\n", retval);
+                CERROR("ll_file_size: "LPSZ"\n", retval);
                 RETURN(retval);
         }
 
         if (!(fd->fd_flags & LL_FILE_IGNORE_LOCK) &&
             !(sbi->ll_flags & LL_SBI_NOLCK)) {
                 struct ldlm_extent extent;
-                OBD_ALLOC(lockhs, lsm->lsm_stripe_count * sizeof(*lockhs));
-                if (!lockhs)
-                        RETURN(-ENOMEM);
-
                 extent.start = *ppos;
                 extent.end = *ppos + count;
                 CDEBUG(D_INFO, "Locking inode %lu, start "LPU64" end "LPU64"\n",
@@ -468,15 +506,14 @@ static ssize_t ll_file_read(struct file *filp, char *buf, size_t count,
                 err = obd_enqueue(&sbi->ll_osc_conn, lsm, NULL, LDLM_EXTENT,
                                   &extent, sizeof(extent), LCK_PR, &flags,
                                   ll_lock_callback, inode, sizeof(*inode),
-                                  lockhs);
+                                  &lockh);
                 if (err != ELDLM_OK) {
-                        OBD_FREE(lockhs, lsm->lsm_stripe_count*sizeof(*lockhs));
                         CERROR("lock enqueue: err: %d\n", err);
                         RETURN(err);
                 }
         }
 
-        CDEBUG(D_INFO, "Reading inode %lu, %d bytes, offset %Ld\n",
+        CDEBUG(D_INFO, "Reading inode %lu, "LPSZ" bytes, offset %Ld\n",
                inode->i_ino, count, *ppos);
         retval = generic_file_read(filp, buf, count, ppos);
 
@@ -485,15 +522,13 @@ static ssize_t ll_file_read(struct file *filp, char *buf, size_t count,
 
         if (!(fd->fd_flags & LL_FILE_IGNORE_LOCK) &&
             !(sbi->ll_flags & LL_SBI_NOLCK)) {
-                err = obd_cancel(&sbi->ll_osc_conn, lsm, LCK_PR, lockhs);
+                err = obd_cancel(&sbi->ll_osc_conn, lsm, LCK_PR, &lockh);
                 if (err != ELDLM_OK) {
                         CERROR("lock cancel: err: %d\n", err);
                         retval = err;
                 }
         }
 
-        if (lockhs)
-                OBD_FREE(lockhs, lsm->lsm_stripe_count * sizeof(*lockhs));
         RETURN(retval);
 }
 
@@ -506,7 +541,7 @@ ll_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
         struct ll_file_data *fd = (struct ll_file_data *)file->private_data;
         struct inode *inode = file->f_dentry->d_inode;
         struct ll_sb_info *sbi = ll_i2sbi(inode);
-        struct lustre_handle *lockhs = NULL, *eof_lockhs = NULL;
+        struct lustre_handle lockh = { 0, 0 }, eof_lockh = { 0, 0 };
         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
         int flags = 0;
         ldlm_error_t err;
@@ -520,7 +555,7 @@ ll_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
                 if (!oa)
                         RETURN(-ENOMEM);
 
-                err = ll_size_lock(inode, lsm, 0, LCK_PW, &eof_lockhs);
+                err = ll_size_lock(inode, lsm, 0, LCK_PW, &eof_lockh);
                 if (err) {
                         obdo_free(oa);
                         RETURN(err);
@@ -545,9 +580,6 @@ ll_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
         if (!(fd->fd_flags & LL_FILE_IGNORE_LOCK) &&
             !(sbi->ll_flags & LL_SBI_NOLCK)) {
                 struct ldlm_extent extent;
-                OBD_ALLOC(lockhs, lsm->lsm_stripe_count * sizeof(*lockhs));
-                if (!lockhs)
-                        GOTO(out_eof, retval = -ENOMEM);
                 extent.start = *ppos;
                 extent.end = *ppos + count;
                 CDEBUG(D_INFO, "Locking inode %lu, start "LPU64" end "LPU64"\n",
@@ -556,35 +588,31 @@ ll_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
                 err = obd_enqueue(&sbi->ll_osc_conn, lsm, NULL, LDLM_EXTENT,
                                   &extent, sizeof(extent), LCK_PW, &flags,
                                   ll_lock_callback, inode, sizeof(*inode),
-                                  lockhs);
+                                  &lockh);
                 if (err != ELDLM_OK) {
                         CERROR("lock enqueue: err: %d\n", err);
-                        GOTO(out_free, retval = err);
+                        GOTO(out_eof, retval = err);
                 }
         }
 
-        CDEBUG(D_INFO, "Writing inode %lu, %ld bytes, offset "LPD64"\n",
-               inode->i_ino, (long)count, *ppos);
+        CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
+               inode->i_ino, count, *ppos);
 
         retval = generic_file_write(file, buf, count, ppos);
 
         if (!(fd->fd_flags & LL_FILE_IGNORE_LOCK) ||
             sbi->ll_flags & LL_SBI_NOLCK) {
-                err = obd_cancel(&sbi->ll_osc_conn, lsm, LCK_PW, lockhs);
+                err = obd_cancel(&sbi->ll_osc_conn, lsm, LCK_PW, &lockh);
                 if (err != ELDLM_OK) {
                         CERROR("lock cancel: err: %d\n", err);
-                        GOTO(out_free, retval = err);
+                        GOTO(out_eof, retval = err);
                 }
         }
 
         EXIT;
- out_free:
-        if (lockhs)
-                OBD_FREE(lockhs, lsm->lsm_stripe_count * sizeof(*lockhs));
-
  out_eof:
         if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND) {
-                err = ll_size_unlock(inode, lsm, LCK_PW, eof_lockhs);
+                err = ll_size_unlock(inode, lsm, LCK_PW, &eof_lockh);
                 if (err && !retval)
                         retval = err;
         }
@@ -592,121 +620,54 @@ ll_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
         return retval;
 }
 
-/* Retrieve object striping information.
- *
- * @arg is a pointer to a user struct with one or more of the fields set to
- * indicate the application preference: lmm_stripe_count, lmm_stripe_size,
- * lmm_stripe_offset, and lmm_stripe_pattern.  lmm_magic must be LOV_MAGIC.
- */
 static int ll_lov_setstripe(struct inode *inode, struct file *file,
                             unsigned long arg)
 {
         struct ll_inode_info *lli = ll_i2info(inode);
-        struct lov_mds_md *lmm = NULL, *lmmu = (void *)arg;
-        struct lustre_handle *conn = ll_i2obdconn(inode);
+        struct lustre_handle *conn;
+        struct lov_stripe_md *lsm;
         int rc;
+        ENTRY;
 
-        rc = obd_alloc_wiremd(conn, &lmm);
-        if (rc < 0)
-                RETURN(rc);
-
-        rc = copy_from_user(lmm, lmmu, sizeof(*lmm));
-        if (rc)
-                GOTO(out_free, rc = -EFAULT);
+        down(&lli->lli_open_sem);
+        lsm = lli->lli_smd;
+        if (lsm) {
+                up(&lli->lli_open_sem);
+                CERROR("stripe already set for ino %lu\n", inode->i_ino);
+                /* If we haven't already done the open, do so now */
+                if (file->f_flags & O_LOV_DELAY_CREATE) {
+                        int rc2 = ll_file_open(inode, file);
+                        if (rc2)
+                                RETURN(rc2);
+                }
 
-        if (lmm->lmm_magic != LOV_MAGIC) {
-                CERROR("bad LOV magic %X\n", lmm->lmm_magic);
-                GOTO(out_free, rc = -EINVAL);
+                RETURN(-EALREADY);
         }
 
-        down(&lli->lli_open_sem);
-        if (lli->lli_smd) {
-                CERROR("striping data already set for %lu\n", inode->i_ino);
-                GOTO(out_lov_up, rc = -EPERM);
-        }
-        rc = obd_unpackmd(conn, &lli->lli_smd, lmm);
-        if (rc < 0) {
-                CERROR("error setting LOV striping on %lu: rc = %d\n",
-                       inode->i_ino, rc);
-                GOTO(out_lov_up, rc);
-        }
+        conn = ll_i2obdconn(inode);
+
+        rc = obd_iocontrol(LL_IOC_LOV_SETSTRIPE, conn, 0, &lsm, (void *)arg);
+        if (!rc)
+                rc = ll_create_open_obj(conn, inode, file, lsm);
+        up(&lli->lli_open_sem);
 
-        rc = ll_create_objects(inode->i_sb, inode->i_ino, 0, 0, &lli->lli_smd);
         if (rc) {
-                obd_free_memmd(conn, &lli->lli_smd);
-        } else {
-                file->f_flags &= ~O_LOV_DELAY_CREATE;
-                rc = ll_file_open(inode, file);
+                obd_free_memmd(conn, &lsm);
+                RETURN(rc);
         }
-out_lov_up:
-        up(&lli->lli_open_sem);
-out_free:
-        obd_free_wiremd(conn, &lmm);
-        return rc;
+        rc = ll_osc_open(conn, inode, file, lli->lli_smd);
+        RETURN(rc);
 }
 
-/* Retrieve object striping information.
- *
- * @arg is a pointer to a user struct with lmm_ost_count indicating
- * the maximum number of OST indices which will fit in the user buffer.
- * lmm_magic must be LOV_MAGIC.
- */
 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
 {
-        struct lov_mds_md lmm, *lmmu = (void *)arg, *lmmk = NULL;
         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
         struct lustre_handle *conn = ll_i2obdconn(inode);
-        int ost_count, rc, lmm_size;
 
         if (!lsm)
                 RETURN(-ENODATA);
 
-        rc = copy_from_user(&lmm, lmmu, sizeof(lmm));
-        if (rc)
-                RETURN(-EFAULT);
-
-        if (lmm.lmm_magic != LOV_MAGIC)
-                RETURN(-EINVAL);
-
-        if (lsm->lsm_stripe_count == 0)
-                ost_count = 1;
-        else {
-                struct obd_device *obd = class_conn2obd(conn);
-                struct lov_obd *lov = &obd->u.lov;
-                ost_count = lov->desc.ld_tgt_count;
-        }
-
-        /* XXX we _could_ check if indices > user lmm_ost_count are zero */
-        if (lmm.lmm_ost_count < ost_count)
-                RETURN(-EOVERFLOW);
-
-        rc = obd_packmd(conn, &lmmk, lsm);
-        if (rc < 0)
-                RETURN(rc);
-
-        lmm_size = rc;
-
-        /* LOV STACKING layering violation to make LOV/OSC return same data */
-        if (lsm->lsm_stripe_count == 0) {
-                struct lov_object_id *loi;
-
-                loi = (void *)lmmu + offsetof(typeof(*lmmu), lmm_objects);
-                rc = copy_to_user(loi, &lsm->lsm_object_id, sizeof(*loi));
-                if (rc) {
-                        lmm_size = 0;
-                        rc = -EFAULT;
-                } else {
-                        lmmk->lmm_magic = LOV_MAGIC;
-                        lmmk->lmm_ost_count = lmmk->lmm_stripe_count = 1;
-                }
-        }
-
-        if (lmm_size && copy_to_user(lmmu, lmmk, lmm_size))
-                rc = -EFAULT;
-
-        obd_free_wiremd(conn, &lmmk);
-
-        RETURN(rc);
+        return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, conn, 0, lsm, (void *)arg);
 }
 
 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
@@ -822,7 +783,7 @@ static int ll_inode_revalidate(struct dentry *dentry)
                 rc = mdc_getattr(&sbi->ll_mdc_conn, inode->i_ino,
                                  inode->i_mode, valid, datalen, &req);
                 if (rc) {
-                        CERROR("failure %d inode "LPX64"\n", rc, inode->i_ino);
+                        CERROR("failure %d inode %lu\n", rc, inode->i_ino);
                         ptlrpc_req_finished(req);
                         RETURN(-abs(rc));
                 }
index 54a81a4..81a5aad 100644 (file)
@@ -287,7 +287,9 @@ int ll_intent_lock(struct inode *parent, struct dentry **de,
                         GOTO(out, flag = LL_LOOKUP_POSITIVE);
                 }
 
-                /* Do a getattr now that we have the lock */
+                /* Do a getattr now that we have the lock, and fetch the
+                 * up-to-date stripe MD at the same time.
+                 */
                 valid = OBD_MD_FLNOTOBD;
                 if (it->it_op == IT_READLINK) {
                         datalen = mds_body->size;
@@ -340,7 +342,6 @@ int ll_intent_lock(struct inode *parent, struct dentry **de,
                 }
         }
 
-        EXIT;
  out:
         if (intent_finish != NULL) {
                 rc = intent_finish(flag, request, de, it, offset, ino);
@@ -485,17 +486,18 @@ static struct dentry *ll_lookup2(struct inode *parent, struct dentry *dentry,
 {
         struct dentry *save = dentry;
         int rc;
+        ENTRY;
 
         rc = ll_intent_lock(parent, &dentry, it, lookup2_finish);
         if (rc < 0) {
                 CERROR("ll_intent_lock: %d\n", rc);
-                return ERR_PTR(rc);
+                RETURN(ERR_PTR(rc));
         }
 
         if (dentry == save)
-                return NULL;
+                RETURN(NULL);
         else
-                return dentry;
+                RETURN(dentry);
 }
 
 static struct inode *ll_create_node(struct inode *dir, const char *name,
index 3310c34..4c7ad42 100644 (file)
@@ -35,9 +35,10 @@ int ll_recover(struct recovd_data *rd, int phase)
                                 list_entry(tmp, struct obd_import, imp_chain);
 
                         if (phase == PTLRPC_RECOVD_PHASE_PREPARE) {
-                                spin_lock(&imp->imp_lock);
+                                unsigned long flags;
+                                spin_lock_irqsave(&imp->imp_lock, flags);
                                 imp->imp_level = LUSTRE_CONN_RECOVD;
-                                spin_unlock(&imp->imp_lock);
+                                spin_unlock_irqrestore(&imp->imp_lock, flags);
                         }
                         imp->imp_recover(imp, phase);
                 }
index 7f486fb..e1402d1 100644 (file)
@@ -107,6 +107,16 @@ static int ll_brw(int cmd, struct inode *inode, struct page *page, int create)
         else
                 pg.count = PAGE_SIZE;
 
+        CDEBUG(D_PAGE, "%s %d bytes ino %lu at "LPU64"/"LPX64"\n",
+              cmd & OBD_BRW_WRITE ? "write" : "read", pg.count, inode->i_ino,
+              pg.off, pg.off);
+        if (pg.count == 0) {
+                CERROR("ZERO COUNT: ino %lu: size %p:%Lu(%p:%Lu) idx %lu off "
+                       LPU64"\n",
+                       inode->i_ino, inode, inode->i_size, page->mapping->host,
+                       page->mapping->host->i_size, page->index, pg.off);
+        }
+
         pg.flag = create ? OBD_BRW_CREATE : 0;
 
         set->brw_callback = ll_brw_sync_wait;
@@ -160,7 +170,7 @@ void ll_truncate(struct inode *inode)
 {
         struct obdo oa = {0};
         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
-        struct lustre_handle *lockhs = NULL;
+        struct lustre_handle lockh = { 0, 0 };
         int err;
         ENTRY;
 
@@ -174,10 +184,10 @@ void ll_truncate(struct inode *inode)
         oa.o_mode = inode->i_mode;
         oa.o_valid = OBD_MD_FLID | OBD_MD_FLMODE | OBD_MD_FLTYPE;
 
-        CDEBUG(D_INFO, "calling punch for "LPX64" (all bytes after "LPD64")\n",
+        CDEBUG(D_INFO, "calling punch for "LPX64" (all bytes after %Lu)\n",
                oa.o_id, inode->i_size);
 
-        err = ll_size_lock(inode, lsm, inode->i_size, LCK_PW, &lockhs);
+        err = ll_size_lock(inode, lsm, inode->i_size, LCK_PW, &lockh);
         if (err) {
                 CERROR("ll_size_lock failed: %d\n", err);
                 return;
@@ -191,7 +201,7 @@ void ll_truncate(struct inode *inode)
         else
                 obdo_to_inode(inode, &oa, oa.o_valid);
 
-        err = ll_size_unlock(inode, lsm, LCK_PW, lockhs);
+        err = ll_size_unlock(inode, lsm, LCK_PW, &lockh);
         if (err)
                 CERROR("ll_size_unlock failed: %d\n", err);
 
@@ -280,6 +290,7 @@ static int ll_commit_write(struct file *file, struct page *page,
 
         pg.pg = page;
         pg.count = to;
+        /* XXX make the starting offset "from" */
         pg.off = (((obd_off)page->index) << PAGE_SHIFT);
         pg.flag = create ? OBD_BRW_CREATE : 0;
 
@@ -292,7 +303,7 @@ static int ll_commit_write(struct file *file, struct page *page,
         if (!PageLocked(page))
                 LBUG();
 
-        CDEBUG(D_INODE, "commit_page writing (off "LPD64"), count "LPD64"\n",
+        CDEBUG(D_INODE, "commit_page writing (off "LPD64"), count %d\n",
                pg.off, pg.count);
 
         set->brw_callback = ll_brw_sync_wait;
index cb3ae90..73b6ea5 100644 (file)
@@ -95,8 +95,8 @@ static void ll_options(char *options, char **ost, char **mds, int *flags)
 #define log2(n) ffz(~(n))
 #endif
 
-static struct super_block * ll_read_super(struct super_block *sb,
-                                          void *data, int silent)
+static struct super_block *ll_read_super(struct super_block *sb,
+                                         void *data, int silent)
 {
         struct inode *root = 0;
         struct obd_device *obd;
@@ -112,13 +112,10 @@ static struct super_block * ll_read_super(struct super_block *sb,
         class_uuid_t uuid;
 
         ENTRY;
-        MOD_INC_USE_COUNT;
 
         OBD_ALLOC(sbi, sizeof(*sbi));
-        if (!sbi) {
-                MOD_DEC_USE_COUNT;
+        if (!sbi)
                 RETURN(NULL);
-        }
 
         INIT_LIST_HEAD(&sbi->ll_conn_chain);
         INIT_LIST_HEAD(&sbi->ll_orphan_dentry_list);
@@ -238,7 +235,6 @@ out_mdc:
 out_free:
         OBD_FREE(sbi, sizeof(*sbi));
 
-        MOD_DEC_USE_COUNT;
         goto out_dev;
 } /* ll_read_super */
 
@@ -275,7 +271,6 @@ static void ll_put_super(struct super_block *sb)
 
         OBD_FREE(sbi, sizeof(*sbi));
 
-        MOD_DEC_USE_COUNT;
         EXIT;
 } /* ll_put_super */
 
@@ -300,16 +295,16 @@ static void ll_clear_inode(struct inode *inode)
                 }
         }
 
-        if (atomic_read(&inode->i_count) == 0) {
-                char *symlink_name = lli->lli_symlink_name;
+        if (atomic_read(&inode->i_count) != 0)
+                CERROR("clearing in-use inode %lu: count = %d\n",
+                       inode->i_ino, atomic_read(&inode->i_count));
 
-                if (lli->lli_smd)
-                        obd_free_memmd(&sbi->ll_osc_conn, &lli->lli_smd);
+        if (lli->lli_smd)
+                obd_free_memmd(&sbi->ll_osc_conn, &lli->lli_smd);
 
-                if (symlink_name) {
-                        OBD_FREE(symlink_name, strlen(symlink_name) + 1);
-                        lli->lli_symlink_name = NULL;
-                }
+        if (lli->lli_symlink_name) {
+                OBD_FREE(lli->lli_symlink_name,strlen(lli->lli_symlink_name)+1);
+                lli->lli_symlink_name = NULL;
         }
 
         EXIT;
@@ -323,8 +318,9 @@ static void ll_delete_inode(struct inode *inode)
                 struct obdo *oa;
                 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
 
+                /* mcreate with no open */
                 if (!lsm)
-                        GOTO(out, -EINVAL);
+                        GOTO(out, 0);
 
                 if (lsm->lsm_object_id == 0) {
                         CERROR("This really happens\n");
@@ -337,13 +333,13 @@ static void ll_delete_inode(struct inode *inode)
                         GOTO(out, -ENOMEM);
 
                 oa->o_id = lsm->lsm_object_id;
-                oa->o_mode = inode->i_mode;
-                oa->o_valid = OBD_MD_FLID | OBD_MD_FLEASIZE | OBD_MD_FLTYPE;
+                obdo_from_inode(oa, inode, OBD_MD_FLID | OBD_MD_FLTYPE);
 
                 err = obd_destroy(ll_i2obdconn(inode), oa, lsm);
                 obdo_free(oa);
-                CDEBUG(D_SUPER, "obd destroy of objid "LPX64" error %d\n",
-                       lsm->lsm_object_id, err);
+                if (err)
+                        CDEBUG(D_SUPER, "obd destroy objid "LPX64" error %d\n",
+                               lsm->lsm_object_id, err);
         }
 out:
         clear_inode(inode);
@@ -386,18 +382,23 @@ int ll_inode_setattr(struct inode *inode, struct iattr *attr, int do_trunc)
 {
         struct ptlrpc_request *request = NULL;
         struct ll_sb_info *sbi = ll_i2sbi(inode);
-        int err;
-
+        int err = 0;
         ENTRY;
 
         /* change incore inode */
         ll_attr2inode(inode, attr, do_trunc);
 
-        err = mdc_setattr(&sbi->ll_mdc_conn, inode, attr, &request);
-        if (err)
-                CERROR("mdc_setattr fails (%d)\n", err);
+        /* Don't send size changes to MDS to avoid "fast EA" problems, and
+         * also avoid a pointless RPC (we get file size from OST anyways).
+         */
+        attr->ia_valid &= ~ATTR_SIZE;
+        if (attr->ia_valid) {
+                err = mdc_setattr(&sbi->ll_mdc_conn, inode, attr, &request);
+                if (err)
+                        CERROR("mdc_setattr fails (%d)\n", err);
 
-        ptlrpc_req_finished(request);
+                ptlrpc_req_finished(request);
+        }
 
         RETURN(err);
 }
@@ -503,7 +504,6 @@ static void ll_read_inode2(struct inode *inode, void *opaque)
         /* core attributes first */
         ll_update_inode(inode, body);
 
-        //if (body->valid & OBD_MD_FLEASIZE)
         LASSERT(!lli->lli_smd);
         if (lic && lic->lic_lmm)
                 obd_unpackmd(ll_i2obdconn(inode), &lli->lli_smd, lic->lic_lmm);
@@ -515,8 +515,7 @@ static void ll_read_inode2(struct inode *inode, void *opaque)
                 rc = ll_file_size(inode, lli->lli_smd);
                 if (rc) {
                         CERROR("ll_file_size: %d\n", rc);
-                        /* FIXME: need to somehow prevent inode creation */
-                        LBUG();
+                        ll_clear_inode(inode);
                         make_bad_inode(inode);
                 }
         }
@@ -548,8 +547,8 @@ static inline void invalidate_request_list(struct list_head *req_list)
         list_for_each_safe(tmp, n, req_list) {
                 struct ptlrpc_request *req =
                         list_entry(tmp, struct ptlrpc_request, rq_list);
-                CERROR("invalidating req xid "LPD64" op %d to %s:%d\n",
-                       (unsigned long long)req->rq_xid, req->rq_reqmsg->opc,
+                CERROR("invalidating req xid "LPU64" op %d to %s:%d\n",
+                       req->rq_xid, req->rq_reqmsg->opc,
                        req->rq_connection->c_remote_uuid,
                        req->rq_import->imp_client->cli_request_portal);
                 req->rq_flags |= PTL_RPC_FL_ERR;
@@ -591,8 +590,11 @@ struct super_operations ll_super_operations =
         umount_begin: ll_umount_begin
 };
 
-struct file_system_type lustre_lite_fs_type = {
-        "lustre_lite", 0, ll_read_super, NULL
+static struct file_system_type lustre_lite_fs_type = {
+        name:           "lustre_lite",
+        fs_flags:       0,
+        read_super:     ll_read_super,
+        owner:          THIS_MODULE,
 };
 
 static int __init init_lustre_lite(void)
index cd6544a..557d715 100644 (file)
@@ -114,13 +114,10 @@ static int ll_fill_super(struct super_block *sb, void *data, int silent)
         class_uuid_t uuid;
 
         ENTRY;
-        MOD_INC_USE_COUNT;
 
         OBD_ALLOC(sbi, sizeof(*sbi));
-        if (!sbi) {
-                MOD_DEC_USE_COUNT;
+        if (!sbi)
                 RETURN(-ENOMEM);
-        }
 
         INIT_LIST_HEAD(&sbi->ll_conn_chain);
         generate_random_uuid(uuid);
@@ -238,7 +235,6 @@ out_mdc:
 out_free:
         OBD_FREE(sbi, sizeof(*sbi));
 
-        MOD_DEC_USE_COUNT;
         goto out_dev;
 } /* ll_fill_super */
 
@@ -272,25 +268,45 @@ static void ll_put_super(struct super_block *sb)
         obd_disconnect(&sbi->ll_mdc_conn);
         OBD_FREE(sbi, sizeof(*sbi));
 
-        MOD_DEC_USE_COUNT;
         EXIT;
 } /* ll_put_super */
 
 static void ll_clear_inode(struct inode *inode)
 {
+        struct ll_sb_info *sbi = ll_i2sbi(inode);
+        struct ll_inode_info *lli = ll_i2info(inode);
+        int rc;
         ENTRY;
 
-        if (atomic_read(&inode->i_count) == 0) {
-                struct ll_inode_info *lli = ll_i2info(inode);
-                char *symlink_name = lli->lli_symlink_name;
+#warning "Is there a reason we don't do this in 2.5, but we do in 2.4?"
+#if 0
+        rc = mdc_cancel_unused(&sbi->ll_mdc_conn, inode, LDLM_FL_NO_CALLBACK);
+        if (rc < 0) {
+                CERROR("mdc_cancel_unused: %d\n", rc);
+                /* XXX FIXME do something dramatic */
+        }
 
-                if (lli->lli_smd)
-                        obd_free_memmd(&sbi->ll_osc_conn, &lli->lli_smd);
-                if (symlink_name) {
-                        OBD_FREE(symlink_name, strlen(symlink_name) + 1);
-                        lli->lli_symlink_name = NULL;
+        if (lli->lli_smd) {
+                rc = obd_cancel_unused(&sbi->ll_osc_conn, lli->lli_smd, 0);
+                if (rc < 0) {
+                        CERROR("obd_cancel_unused: %d\n", rc);
+                        /* XXX FIXME do something dramatic */
                 }
         }
+#endif
+
+        if (atomic_read(&inode->i_count) != 0)
+                CERROR("clearing in-use inode %lu: count = %d\n",
+                       inode->i_ino, atomic_read(&inode->i_count));
+
+        if (lli->lli_smd)
+                obd_free_memmd(&sbi->ll_osc_conn, &lli->lli_smd);
+
+        if (lli->lli_symlink_name) {
+                OBD_FREE(lli->lli_symlink_name,strlen(lli->lli_symlink_name)+1);
+                lli->lli_symlink_name = NULL;
+        }
+
         EXIT;
 }
 
@@ -302,8 +318,9 @@ static void ll_delete_inode(struct inode *inode)
                 struct obdo *oa;
                 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
 
+                /* mcreate with no open */
                 if (!lsm)
-                        GOTO(out, -EINVAL);
+                        GOTO(out, 0);
 
                 if (lsm->lsm_object_id == 0) {
                         CERROR("This really happens\n");
@@ -317,12 +334,13 @@ static void ll_delete_inode(struct inode *inode)
 
                 oa->o_id = lsm->lsm_object_id;
                 oa->o_mode = inode->i_mode;
-                oa->o_valid = OBD_MD_FLID | OBD_MD_FLEASIZE | OBD_MD_FLTYPE;
+                oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
 
                 err = obd_destroy(ll_i2obdconn(inode), oa, lsm);
                 obdo_free(oa);
-                CDEBUG(D_SUPER, "obd destroy of objid "LPX64" error %d\n",
-                       lsm->lsm_object_id, err);
+                if (err)
+                        CDEBUG(D_SUPER, "obd destroy objid "LPX64" error %d\n",
+                               lsm->lsm_object_id, err);
         }
 out:
         clear_inode(inode);
@@ -365,18 +383,24 @@ int ll_inode_setattr(struct inode *inode, struct iattr *attr, int do_trunc)
 {
         struct ptlrpc_request *request = NULL;
         struct ll_sb_info *sbi = ll_i2sbi(inode);
-        int err;
+        int err = 0;
 
         ENTRY;
 
         /* change incore inode */
         ll_attr2inode(inode, attr, do_trunc);
 
-        err = mdc_setattr(&sbi->ll_mdc_conn, inode, attr, &request);
-        if (err)
-                CERROR("mdc_setattr fails (%d)\n", err);
+        /* Don't send size changes to MDS to avoid "fast EA" problems, and
+         * also avoid a pointless RPC (we get file size from OST anyways).
+         */
+        attr->ia_valid &= ~ATTR_SIZE;
+        if (attr->ia_valid) {
+                err = mdc_setattr(&sbi->ll_mdc_conn, inode, attr, &request);
+                if (err)
+                        CERROR("mdc_setattr fails (%d)\n", err);
 
-        ptlrpc_req_finished(request);
+                ptlrpc_req_finished(request);
+        }
 
         RETURN(err);
 }
@@ -482,7 +506,6 @@ int ll_read_inode2(struct inode *inode, void *opaque)
         /* core attributes first */
         ll_update_inode(inode, body);
 
-        //if (body->valid & OBD_MD_FLEASIZE)
         LASSERT(!lli->lli_smd);
         if (lic && lic->lic_lmm)
                 obd_unpackmd(ll_i2obdconn(inode), &lli->lli_smd, lic->lic_lmm);
@@ -492,9 +515,9 @@ int ll_read_inode2(struct inode *inode, void *opaque)
                 rc = ll_file_size(inode, lli->lli_smd);
                 if (rc) {
                         CERROR("ll_file_size: %d\n", rc);
-                        /* FIXME: need to somehow prevent inode creation */
-                        LBUG();
+                        ll_clear_inode(inode);
                         make_bad_inode(inode);
+                        RETURN(rc);
                 }
         }
 
index ef86d58..5be4717 100644 (file)
@@ -89,7 +89,7 @@ static int ll_follow_link(struct dentry *dentry, struct nameidata *nd,
         struct inode *inode = dentry->d_inode;
         struct ll_inode_info *lli = ll_i2info(inode);
         struct ptlrpc_request *request;
-        int op, mode, rc;
+        int op = 0, mode = 0, rc;
         char *symname;
         ENTRY;
 
index fe5aad4..7135743 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/lustre_lib.h>
 #include <linux/lustre_net.h>
 #include <linux/lustre_idl.h>
+#include <linux/lustre_lite.h> /* for LL_IOC_LOV_[GS]ETSTRIPE */
 #include <linux/lustre_mds.h>
 #include <linux/obd_class.h>
 #include <linux/obd_lov.h>
@@ -42,10 +43,19 @@ struct lov_file_handles {
         struct lustre_handle *lfh_handles;
 };
 
+struct lov_lock_handles {
+        __u64 llh_cookie;
+        struct lustre_handle llh_handles[0];
+};
+
 extern int lov_packmd(struct lustre_handle *conn, struct lov_mds_md **lmm,
                        struct lov_stripe_md *lsm);
 extern int lov_unpackmd(struct lustre_handle *conn, struct lov_stripe_md **lsm,
                          struct lov_mds_md *lmm);
+extern int lov_setstripe(struct lustre_handle *conn,
+                         struct lov_stripe_md **lsmp, struct lov_mds_md *lmmu);
+extern int lov_getstripe(struct lustre_handle *conn, struct lov_mds_md *lmmu,
+                         struct lov_stripe_md *lsm);
 
 /* obd methods */
 int lov_attach(struct obd_device *dev, obd_count len, void *data)
@@ -72,10 +82,9 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd,
         int rc, rc2, i;
         ENTRY;
 
-        MOD_INC_USE_COUNT;
         rc = class_connect(conn, obd, cluuid);
         if (rc)
-                GOTO(out_dec, rc);
+                RETURN(rc);
 
         /* We don't want to actually do the underlying connections more than
          * once, so keep track. */
@@ -84,6 +93,7 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd,
                 RETURN(0);
 
         exp = class_conn2export(conn);
+        spin_lock_init(&exp->exp_lov_data.led_lock);
         INIT_LIST_HEAD(&exp->exp_lov_data.led_open_head);
 
         /* retrieve LOV metadata from MDS */
@@ -159,7 +169,6 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd,
 
         for (i = 0; i < desc->ld_tgt_count; i++) {
                 struct obd_device *tgt = client_tgtuuid2obd(uuidarray[i]);
-                int rc2;
 
                 if (!tgt) {
                         CERROR("Target %s not attached\n", uuidarray[i]);
@@ -174,26 +183,20 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd,
                 rc = obd_connect(&lov->tgts[i].conn, tgt, NULL, recovd,
                                  recover);
 
-                /* Register even if connect failed, so that we get reactivation
-                 * notices.
-                 */
-                rc2 = obd_iocontrol(IOC_OSC_REGISTER_LOV, &lov->tgts[i].conn,
-                                    sizeof(struct obd_device *), obd, NULL);
-                if (rc2) {
-                        CERROR("Target %s REGISTER_LOV error %d\n",
-                               uuidarray[i], rc2);
-                        GOTO(out_disc, rc2);
+                if (rc) {
+                        CERROR("Target %s connect error %d\n", uuidarray[i],
+                               rc);
+                        GOTO(out_disc, rc);
                 }
-
-                /* But mark failed-connect OSCs as inactive! */
+                        
+                rc = obd_iocontrol(IOC_OSC_REGISTER_LOV, &lov->tgts[i].conn,
+                                    sizeof(struct obd_device *), obd, NULL);
                 if (rc) {
-                        CDEBUG(D_INFO, "Target %s connect error %d\n",
+                        CERROR("Target %s REGISTER_LOV error %d\n",
                                uuidarray[i], rc);
-                        LASSERT(lov->tgts[i].active == 0);
-                        rc = 0;
-                        continue;
+                        GOTO(out_disc, rc);
                 }
-                
+
                 desc->ld_active_tgt_count++;
                 lov->tgts[i].active = 1;
         }
@@ -205,6 +208,7 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd,
         RETURN(rc);
 
  out_disc:
+        i--; /* skip failed-connect OSC */
         while (i-- > 0) {
                 desc->ld_active_tgt_count--;
                 lov->tgts[i].active = 0;
@@ -216,8 +220,6 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd,
         OBD_FREE(lov->tgts, lov->bufsize);
  out_conn:
         class_disconnect(conn);
- out_dec:
-        MOD_DEC_USE_COUNT;
         goto out;
 }
 
@@ -256,6 +258,7 @@ static int lov_disconnect(struct lustre_handle *conn)
         lov->tgts = NULL;
 
         exp = class_conn2export(conn);
+        spin_lock(&exp->exp_lov_data.led_lock);
         list_for_each_safe(p, n, &exp->exp_lov_data.led_open_head) {
                 /* XXX close these, instead of just discarding them? */
                 struct lov_file_handles *lfh;
@@ -267,11 +270,10 @@ static int lov_disconnect(struct lustre_handle *conn)
                          lfh->lfh_count * sizeof(*lfh->lfh_handles));
                 kmem_cache_free(lov_file_cache, lfh);
         }
+        spin_unlock(&exp->exp_lov_data.led_lock);
 
  out_local:
         rc = class_disconnect(conn);
-        if (!rc)
-                MOD_DEC_USE_COUNT;
         return rc;
 }
 
@@ -286,6 +288,7 @@ static int lov_set_osc_active(struct lov_obd *lov, obd_uuid_t uuid,
                               int activate)
 {
         struct obd_device *obd;
+        struct lov_tgt_desc *tgt;
         int i, rc = 0;
         ENTRY;
 
@@ -293,27 +296,31 @@ static int lov_set_osc_active(struct lov_obd *lov, obd_uuid_t uuid,
                lov, uuid, activate);
 
         spin_lock(&lov->lov_lock);
-        for (i = 0; i < lov->desc.ld_tgt_count; i++)
-                if (strncmp(uuid, lov->tgts[i].uuid,
-                            sizeof(lov->tgts[i].uuid)) == 0)
+        for (i = 0, tgt = lov->tgts; i < lov->desc.ld_tgt_count; i++, tgt++) {
+                CDEBUG(D_INFO, "lov idx %d is %s conn "LPX64"\n",
+                       i, tgt->uuid, tgt->conn.addr);
+                if (strncmp(uuid, tgt->uuid, sizeof(tgt->uuid)) == 0)
                         break;
+        }
 
         if (i == lov->desc.ld_tgt_count)
                 GOTO(out, rc = -EINVAL);
 
-        obd = class_conn2obd(&lov->tgts[i].conn);
+        obd = class_conn2obd(&tgt->conn);
         if (obd == NULL) {
                 LBUG();
                 GOTO(out, rc = -ENOTCONN);
         }
 
-        CDEBUG(D_INFO, "Found OBD %p type %s\n", obd, obd->obd_type->typ_name);
+        CDEBUG(D_INFO, "Found OBD %s=%s device %d (%p) type %s at LOV idx %d\n",
+               obd->obd_name, obd->obd_uuid, obd->obd_minor, obd,
+               obd->obd_type->typ_name, i);
         if (strcmp(obd->obd_type->typ_name, "osc") != 0) {
                 LBUG();
                 GOTO(out, rc = -EBADF);
         }
 
-        if (lov->tgts[i].active == activate) {
+        if (tgt->active == activate) {
                 CDEBUG(D_INFO, "OBD %p already %sactive!\n", obd,
                        activate ? "" : "in");
                 GOTO(out, rc = -EALREADY);
@@ -321,7 +328,7 @@ static int lov_set_osc_active(struct lov_obd *lov, obd_uuid_t uuid,
 
         CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd, activate ? "" : "in");
 
-        lov->tgts[i].active = activate;
+        tgt->active = activate;
         if (activate) {
                 /*
                  * foreach(export)
@@ -341,6 +348,7 @@ static int lov_set_osc_active(struct lov_obd *lov, obd_uuid_t uuid,
                 lov->desc.ld_active_tgt_count--;
         }
 
+#warning "FIXME: walk open files list for objects that need opening"
         EXIT;
  out:
         spin_unlock(&lov->lov_lock);
@@ -400,7 +408,8 @@ static int lov_create(struct lustre_handle *conn, struct obdo *oa,
         struct lov_stripe_md *lsm;
         struct lov_oinfo *loi;
         struct obdo *tmp;
-        int ost_count, ost_idx = 1;
+        int ost_count, ost_idx;
+        int first = 1, obj_alloc = 0;
         int rc = 0, i;
         ENTRY;
 
@@ -409,119 +418,111 @@ static int lov_create(struct lustre_handle *conn, struct obdo *oa,
         if (!export)
                 RETURN(-EINVAL);
 
-        tmp = obdo_alloc();
-        if (!tmp)
-                RETURN(-ENOMEM);
-
         lov = &export->exp_obd->u.lov;
 
         if (!lov->desc.ld_active_tgt_count)
                 RETURN(-EIO);
 
-        spin_lock(&lov->lov_lock);
-        ost_count = lov->desc.ld_tgt_count;
+        tmp = obdo_alloc();
+        if (!tmp)
+                RETURN(-ENOMEM);
 
         lsm = *ea;
 
-        /* Can't create more stripes than we have targets (incl inactive). */
-        if (lsm && lsm->lsm_stripe_count > lov->desc.ld_tgt_count)
-                GOTO(out_tmp, rc = -EINVAL);
-
-        /* Free the user lsm if it needs to be changed, to avoid memory leaks */
-        if (!lsm || (lsm &&
-                     lsm->lsm_stripe_count > lov->desc.ld_active_tgt_count)) {
-                struct lov_stripe_md *lsm_new = NULL;
-                rc = obd_alloc_memmd(conn, &lsm_new);
-                if (rc < 0) {
-                        spin_unlock(&lov->lov_lock);
-                        if (lsm)
-                                obd_free_memmd(conn, &lsm);
+        if (!lsm) {
+                rc = obd_alloc_memmd(conn, &lsm);
+                if (rc < 0)
                         GOTO(out_tmp, rc);
-                }
-                if (lsm) {
-                        LASSERT(lsm->lsm_magic == LOV_MAGIC);
-                        CERROR("replace user LOV MD: stripes %u > %u active\n",
-                               lsm->lsm_stripe_count,
-                               lov->desc.ld_active_tgt_count);
-                        lsm_new->lsm_stripe_offset = lsm->lsm_stripe_offset;
-                        lsm_new->lsm_stripe_size = lsm->lsm_stripe_size;
-                        lsm_new->lsm_stripe_pattern = lsm->lsm_stripe_pattern;
-                        obd_free_memmd(conn, &lsm);
-                }
-                lsm = lsm_new;
-                ost_idx = 0; /* if lsm->lsm_stripe_offset is set yet */
+
+                rc = 0;
                 lsm->lsm_magic = LOV_MAGIC;
         }
 
+        ost_count = lov->desc.ld_tgt_count;
+
         LASSERT(oa->o_valid & OBD_MD_FLID);
         lsm->lsm_object_id = oa->o_id;
         if (!lsm->lsm_stripe_size)
                 lsm->lsm_stripe_size = lov->desc.ld_default_stripe_size;
 
-        /* Because of 64-bit divide/mod operations only work with a 32-bit
-         * divisor in a 32-bit kernel, we cannot support a stripe width
-         * of 4GB or larger on 32-bit CPUs.
-         */
-        if (lsm->lsm_stripe_size * lsm->lsm_stripe_count > ~0UL) {
-                CERROR("LOV: stripe width "LPU64"x%u > %lu on 32-bit system\n",
-                       lsm->lsm_stripe_size, lsm->lsm_stripe_count, ~0UL);
-                spin_unlock(&lov->lov_lock);
-                GOTO(out_free, rc = -EINVAL);
-        }
-
-        if (!ost_idx || lsm->lsm_stripe_offset >= ost_count) {
+        if (!*ea || lsm->lsm_stripe_offset >= ost_count) {
                 int mult = lsm->lsm_object_id * lsm->lsm_stripe_count;
                 int stripe_offset = mult % ost_count;
                 int sub_offset = (mult / ost_count) % lsm->lsm_stripe_count;
 
-                lsm->lsm_stripe_offset = stripe_offset + sub_offset;
-        }
-
-        /* Start with lsm_stripe_offset on an active OSC to avoid confusion */
-        while (!lov->tgts[lsm->lsm_stripe_offset].active)
-                lsm->lsm_stripe_offset = (lsm->lsm_stripe_offset+1) % ost_count;
-
-        /* Pick the OSTs before we release the lock */
-        ost_idx = lsm->lsm_stripe_offset;
-        for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
-                CDEBUG(D_INODE, "objid "LPX64"[%d] is ost_idx %d (uuid %s)\n",
-                       lsm->lsm_object_id, i, ost_idx, lov->tgts[ost_idx].uuid);
-                loi->loi_ost_idx = ost_idx;
-                do {
-                        ost_idx = (ost_idx + 1) % ost_count;
-                } while (!lov->tgts[ost_idx].active);
-        }
-
-        spin_unlock(&lov->lov_lock);
+                ost_idx = stripe_offset + sub_offset;
+        } else
+                ost_idx = lsm->lsm_stripe_offset;
 
         CDEBUG(D_INODE, "allocating %d subobjs for objid "LPX64" at idx %d\n",
-               lsm->lsm_stripe_count,lsm->lsm_object_id,lsm->lsm_stripe_offset);
+               lsm->lsm_stripe_count, lsm->lsm_object_id, ost_idx);
 
-        for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
+        loi = lsm->lsm_oinfo;
+        for (i = 0; i < ost_count; i++, ost_idx = (ost_idx + 1) % ost_count) {
                 struct lov_stripe_md obj_md;
                 struct lov_stripe_md *obj_mdp = &obj_md;
+                int err;
 
-                ost_idx = loi->loi_ost_idx;
+                if (lov->tgts[ost_idx].active == 0) {
+                        CDEBUG(D_HA, "lov idx %d inactive\n", ost_idx);
+                        continue;
+                }
 
                 /* create data objects with "parent" OA */
                 memcpy(tmp, oa, sizeof(*tmp));
                 /* XXX: LOV STACKING: use real "obj_mdp" sub-data */
-                rc = obd_create(&lov->tgts[ost_idx].conn, tmp, &obj_mdp);
-                if (rc) {
-                        CERROR("error creating objid "LPX64" sub-object on "
-                               "OST idx %d: rc = %d\n", oa->o_id, ost_idx, rc);
-                        GOTO(out_cleanup, rc);
+                err = obd_create(&lov->tgts[ost_idx].conn, tmp, &obj_mdp);
+                if (err) {
+                        if (lov->tgts[ost_idx].active) {
+                                CERROR("error creating objid "LPX64" sub-object"
+                                       "on OST idx %d: rc = %d\n",
+                                       oa->o_id, ost_idx, err);
+                                if (!rc)
+                                        rc = err;
+                        }
+                        continue;
                 }
                 loi->loi_id = tmp->o_id;
+                loi->loi_ost_idx = ost_idx;
                 CDEBUG(D_INODE, "objid "LPX64" has subobj "LPX64" at idx %d\n",
                        lsm->lsm_object_id, loi->loi_id, ost_idx);
+
+                if (first) {
+                        lsm->lsm_stripe_offset = ost_idx;
+                        first = 0;
+                }
+
+                ++obj_alloc;
+                ++loi;
+
+                /* If we have allocated enough objects, we are OK */
+                if (obj_alloc == lsm->lsm_stripe_count) {
+                        rc = 0;
+                        GOTO(out_done, rc);
+                }
         }
 
+        if (*ea)
+                GOTO(out_cleanup, rc);
+        else {
+                struct lov_stripe_md *lsm_new;
+                /* XXX LOV STACKING call into osc for sizes */
+                int size = lov_stripe_md_size(obj_alloc);
+
+                OBD_ALLOC(lsm_new, size);
+                if (!lsm_new)
+                        GOTO(out_cleanup, rc = -ENOMEM);
+                memcpy(lsm_new, lsm, size);
+                /* XXX LOV STACKING call into osc for sizes */
+                OBD_FREE(lsm, lov_stripe_md_size(lsm->lsm_stripe_count));
+                lsm = lsm_new;
+        }
+ out_done:
         *ea = lsm;
 
  out_tmp:
         obdo_free(tmp);
-        RETURN(rc);
+        return rc;
 
  out_cleanup:
         while (i-- > 0) {
@@ -538,7 +539,6 @@ static int lov_create(struct lustre_handle *conn, struct obdo *oa,
                                oa->o_id, loi->loi_id, loi->loi_ost_idx,
                                err);
         }
- out_free:
         if (!*ea)
                 obd_free_memmd(conn, &lsm);
         goto out_tmp;
@@ -561,7 +561,7 @@ static int lov_destroy(struct lustre_handle *conn, struct obdo *oa,
         }
 
         if (lsm->lsm_magic != LOV_MAGIC) {
-                CERROR("LOV striping magic bad %#lx != %#lx\n",
+                CERROR("LOV striping magic bad %#x != %#x\n",
                        lsm->lsm_magic, LOV_MAGIC);
                 RETURN(-EINVAL);
         }
@@ -576,6 +576,7 @@ static int lov_destroy(struct lustre_handle *conn, struct obdo *oa,
         for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
                 int err;
                 if (lov->tgts[loi->loi_ost_idx].active == 0) {
+                        CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
                         /* Orphan clean up will (someday) fix this up. */
                         continue;
                 }
@@ -667,7 +668,7 @@ static int lov_getattr(struct lustre_handle *conn, struct obdo *oa,
         }
 
         if (lsm->lsm_magic != LOV_MAGIC) {
-                CERROR("LOV striping magic bad %#lx != %#lx\n",
+                CERROR("LOV striping magic bad %#x != %#x\n",
                        lsm->lsm_magic, LOV_MAGIC);
                 RETURN(-EINVAL);
         }
@@ -680,14 +681,15 @@ static int lov_getattr(struct lustre_handle *conn, struct obdo *oa,
         if (oa->o_valid & OBD_MD_FLHANDLE)
                 lfh = lov_handle2lfh(obdo_handle(oa));
 
+        CDEBUG(D_INFO, "objid "LPX64": %ux%u byte stripes\n",
+               lsm->lsm_object_id, lsm->lsm_stripe_count, lsm->lsm_stripe_size);
         for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
                 int err;
 
-                if (loi->loi_id == 0)
-                        continue;
-
-                if (lov->tgts[loi->loi_ost_idx].active == 0)
+                if (lov->tgts[loi->loi_ost_idx].active == 0) {
+                        CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
                         continue;
+                }
 
                 CDEBUG(D_INFO, "objid "LPX64"[%d] has subobj "LPX64" at idx "
                        "%u\n", oa->o_id, i, loi->loi_id, loi->loi_ost_idx);
@@ -739,7 +741,7 @@ static int lov_setattr(struct lustre_handle *conn, struct obdo *oa,
         }
 
         if (lsm->lsm_magic != LOV_MAGIC) {
-                CERROR("LOV striping magic bad %#lx != %#lx\n",
+                CERROR("LOV striping magic bad %#x != %#x\n",
                        lsm->lsm_magic, LOV_MAGIC);
                 RETURN(-EINVAL);
         }
@@ -803,7 +805,7 @@ static int lov_open(struct lustre_handle *conn, struct obdo *oa,
         }
 
         if (lsm->lsm_magic != LOV_MAGIC) {
-                CERROR("LOV striping magic bad %#lx != %#lx\n",
+                CERROR("LOV striping magic bad %#x != %#x\n",
                        lsm->lsm_magic, LOV_MAGIC);
                 RETURN(-EINVAL);
         }
@@ -829,6 +831,7 @@ static int lov_open(struct lustre_handle *conn, struct obdo *oa,
         for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
 
                 if (lov->tgts[loi->loi_ost_idx].active == 0) {
+                        CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
                         continue;
                 }
 
@@ -863,7 +866,9 @@ static int lov_open(struct lustre_handle *conn, struct obdo *oa,
         handle->addr = (__u64)(unsigned long)lfh;
         handle->cookie = lfh->lfh_cookie;
         oa->o_valid |= OBD_MD_FLHANDLE;
+        spin_lock(&export->exp_lov_data.led_lock);
         list_add(&lfh->lfh_list, &export->exp_lov_data.led_open_head);
+        spin_unlock(&export->exp_lov_data.led_lock);
 
 out_tmp:
         obdo_free(tmp);
@@ -914,7 +919,7 @@ static int lov_close(struct lustre_handle *conn, struct obdo *oa,
         }
 
         if (lsm->lsm_magic != LOV_MAGIC) {
-                CERROR("LOV striping magic bad %#lx != %#lx\n",
+                CERROR("LOV striping magic bad %#x != %#x\n",
                        lsm->lsm_magic, LOV_MAGIC);
                 RETURN(-EINVAL);
         }
@@ -928,9 +933,11 @@ static int lov_close(struct lustre_handle *conn, struct obdo *oa,
         lov = &export->exp_obd->u.lov;
         for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
                 int err;
-                
-                if (lov->tgts[loi->loi_ost_idx].active == 0)
+
+                if (lov->tgts[loi->loi_ost_idx].active == 0) {
+                        CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
                         continue;
+                }
 
                 /* create data objects with "parent" OA */
                 memcpy(&tmp, oa, sizeof(tmp));
@@ -1029,7 +1036,7 @@ static int lov_punch(struct lustre_handle *conn, struct obdo *oa,
         }
 
         if (lsm->lsm_magic != LOV_MAGIC) {
-                CERROR("LOV striping magic bad %#lx != %#lx\n",
+                CERROR("LOV striping magic bad %#x != %#x\n",
                        lsm->lsm_magic, LOV_MAGIC);
                 RETURN(-EINVAL);
         }
@@ -1048,6 +1055,7 @@ static int lov_punch(struct lustre_handle *conn, struct obdo *oa,
 
                 if (starti == endi)
                         continue;
+
                 /* create data objects with "parent" OA */
                 memcpy(&tmp, oa, sizeof(tmp));
                 tmp.o_id = loi->loi_id;
@@ -1094,7 +1102,7 @@ static inline int lov_brw(int cmd, struct lustre_handle *conn,
         }
 
         if (lsm->lsm_magic != LOV_MAGIC) {
-                CERROR("LOV striping magic bad %#lx != %#lx\n",
+                CERROR("LOV striping magic bad %#x != %#x\n",
                        lsm->lsm_magic, LOV_MAGIC);
                 RETURN(-EINVAL);
         }
@@ -1159,13 +1167,49 @@ static inline int lov_brw(int cmd, struct lustre_handle *conn,
         RETURN(rc);
 }
 
+static struct lov_lock_handles *lov_newlockh(struct lov_stripe_md *lsm)
+{
+        struct lov_lock_handles *lov_lockh;
+        /* Allocate the header plus one llh_handles[] slot per stripe of @lsm. */
+        OBD_ALLOC(lov_lockh, sizeof(*lov_lockh) +
+                  sizeof(*lov_lockh->llh_handles) * lsm->lsm_stripe_count);
+        if (!lov_lockh)
+                return NULL;
+        /* Random cookie; lov_h2lovlockh() checks it against handle->cookie. */
+        get_random_bytes(&lov_lockh->llh_cookie, sizeof(lov_lockh->llh_cookie));
+
+        return lov_lockh;
+}
+
+/* Map a lustre_handle back to the lov_lock_handles it addresses, or NULL if
+ * the handle is NULL, has a zero addr, or its cookie does not match.  Only
+ * local lock handles are ever passed here, so validation is not strictly
+ * needed (and can't really be done: these structs are variable sized and
+ * therefore alloced, not from a private slab).  We check because we can.
+ * NOTE(review): failure paths use RETURN() with no ENTRY here - confirm. */
+static struct lov_lock_handles *lov_h2lovlockh(struct lustre_handle *handle)
+{
+        struct lov_lock_handles *lov_lockh = NULL;
+
+        if (!handle || !handle->addr)
+                RETURN(NULL);
+
+        lov_lockh = (struct lov_lock_handles *)(unsigned long)(handle->addr);
+        if (lov_lockh->llh_cookie != handle->cookie)
+                RETURN(NULL);
+
+        return lov_lockh;
+}
+
 static int lov_enqueue(struct lustre_handle *conn, struct lov_stripe_md *lsm,
                        struct lustre_handle *parent_lock,
                        __u32 type, void *cookie, int cookielen, __u32 mode,
                        int *flags, void *cb, void *data, int datalen,
-                       struct lustre_handle *lockhs)
+                       struct lustre_handle *lockh)
 {
         struct obd_export *export = class_conn2export(conn);
+        struct lov_lock_handles *lov_lockh = NULL;
+        struct lustre_handle *lov_lockhp;
         struct lov_obd *lov;
         struct lov_oinfo *loi;
         struct lov_stripe_md submd;
@@ -1178,7 +1222,7 @@ static int lov_enqueue(struct lustre_handle *conn, struct lov_stripe_md *lsm,
         }
 
         if (lsm->lsm_magic != LOV_MAGIC) {
-                CERROR("LOV striping magic bad %#lx != %#lx\n",
+                CERROR("LOV striping magic bad %#x != %#x\n",
                        lsm->lsm_magic, LOV_MAGIC);
                 RETURN(-EINVAL);
         }
@@ -1190,33 +1234,45 @@ static int lov_enqueue(struct lustre_handle *conn, struct lov_stripe_md *lsm,
         if (!export || !export->exp_obd)
                 RETURN(-ENODEV);
 
-        memset(lockhs, 0, sizeof(*lockhs) * lsm->lsm_stripe_count);
+        if (lsm->lsm_stripe_count > 1) {
+                lov_lockh = lov_newlockh(lsm);
+                if (!lov_lockh)
+                        RETURN(-ENOMEM);
+
+                lockh->addr = (__u64)(unsigned long)lov_lockh;
+                lockh->cookie = lov_lockh->llh_cookie;
+                lov_lockhp = lov_lockh->llh_handles;
+        } else
+                lov_lockhp = lockh;
 
         lov = &export->exp_obd->u.lov;
-        for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
+        for (i = 0, loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count;
+             i++, loi++, lov_lockhp++) {
                 struct ldlm_extent *extent = (struct ldlm_extent *)cookie;
                 struct ldlm_extent sub_ext;
 
-                if (lov->tgts[loi->loi_ost_idx].active == 0)
+                if (lov->tgts[loi->loi_ost_idx].active == 0) {
+                        CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
                         continue;
+                }
 
                 *flags = 0;
                 sub_ext.start = lov_stripe_offset(lsm, extent->start, i);
                 sub_ext.end = lov_stripe_offset(lsm, extent->end, i);
-                if (sub_ext.start == sub_ext.end)
+                if (sub_ext.start == sub_ext.end /* || !active */)
                         continue;
 
+                /* XXX LOV STACKING: submd should be from the subobj */
                 submd.lsm_object_id = loi->loi_id;
-                /* XXX submd should be that from the subobj, it should come
-                 *     opaquely from the LOV.
-                 */
                 submd.lsm_stripe_count = 0;
                 /* XXX submd is not fully initialized here */
                 *flags = 0;
                 rc = obd_enqueue(&(lov->tgts[loi->loi_ost_idx].conn), &submd,
                                  parent_lock, type, &sub_ext, sizeof(sub_ext),
-                                 mode, flags, cb, data, datalen, &(lockhs[i]));
+                                 mode, flags, cb, data, datalen, lov_lockhp);
                 // XXX add a lock debug statement here
+                if (rc)
+                        memset(lov_lockhp, 0, sizeof(*lov_lockhp));
                 if (rc && lov->tgts[loi->loi_ost_idx].active) {
                         CERROR("Error enqueue objid "LPX64" subobj "LPX64
                                " on OST idx %d: rc = %d\n", lsm->lsm_object_id,
@@ -1224,33 +1280,47 @@ static int lov_enqueue(struct lustre_handle *conn, struct lov_stripe_md *lsm,
                         goto out_locks;
                 }
         }
-
         RETURN(0);
 
- out_locks:
-        for (i--, loi = &lsm->lsm_oinfo[i]; i >= 0; i--, loi--) {
+out_locks:
+        while (loi--, lov_lockhp--, i-- > 0) {
+                struct lov_stripe_md submd;
                 int err;
-                
-                if (lov->tgts[loi->loi_ost_idx].active == 0)
+
+                if (lov_lockhp->addr == 0 ||
+                    lov->tgts[loi->loi_ost_idx].active == 0)
                         continue;
 
+                /* XXX LOV STACKING: submd should be from the subobj */
                 submd.lsm_object_id = loi->loi_id;
                 submd.lsm_stripe_count = 0;
                 err = obd_cancel(&lov->tgts[loi->loi_ost_idx].conn, &submd,
-                                 mode, &lockhs[i]);
+                                 mode, lov_lockhp);
                 if (err) {
-                        CERROR("Error cancelling objid "LPX64" subobj "LPX64
+                        CERROR("Error cancelling objid "LPX64
                                " on OST idx %d after enqueue error: rc = %d\n",
                                loi->loi_id, loi->loi_ost_idx, err);
                 }
         }
+
+        if (lsm->lsm_stripe_count > 1) {
+                lov_lockh->llh_cookie = DEAD_HANDLE_MAGIC;
+                OBD_FREE(lov_lockh, sizeof(*lov_lockh) +
+                          sizeof(*lov_lockh->llh_handles) *
+                          lsm->lsm_stripe_count);
+        }
+        lockh->addr = 0;
+        lockh->cookie = DEAD_HANDLE_MAGIC;
+
         RETURN(rc);
 }
 
 static int lov_cancel(struct lustre_handle *conn, struct lov_stripe_md *lsm,
-                      __u32 mode, struct lustre_handle *lockhs)
+                      __u32 mode, struct lustre_handle *lockh)
 {
         struct obd_export *export = class_conn2export(conn);
+        struct lov_lock_handles *lov_lockh = NULL;
+        struct lustre_handle *lov_lockhp;
         struct lov_obd *lov;
         struct lov_oinfo *loi;
         int rc = 0, i;
@@ -1262,7 +1332,7 @@ static int lov_cancel(struct lustre_handle *conn, struct lov_stripe_md *lsm,
         }
 
         if (lsm->lsm_magic != LOV_MAGIC) {
-                CERROR("LOV striping magic bad %#lx != %#lx\n",
+                CERROR("LOV striping magic bad %#x != %#x\n",
                        lsm->lsm_magic, LOV_MAGIC);
                 RETURN(-EINVAL);
         }
@@ -1270,29 +1340,55 @@ static int lov_cancel(struct lustre_handle *conn, struct lov_stripe_md *lsm,
         if (!export || !export->exp_obd)
                 RETURN(-ENODEV);
 
+        LASSERT(lockh);
+        if (lsm->lsm_stripe_count > 1) {
+                lov_lockh = lov_h2lovlockh(lockh);
+                if (!lov_lockh) {
+                        CERROR("LOV: invalid lov lock handle %p\n", lockh);
+                        RETURN(-EINVAL);
+                }
+
+                lov_lockhp = lov_lockh->llh_handles;
+        } else
+                lov_lockhp = lockh;
+
         lov = &export->exp_obd->u.lov;
-        for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
+        for (i = 0, loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count;
+             i++, loi++, lov_lockhp++ ) {
                 struct lov_stripe_md submd;
                 int err;
 
-                if (lov->tgts[loi->loi_ost_idx].active == 0)
-                        continue;
-
-                if (lockhs[i].addr == 0)
+                if (lov_lockhp->addr == 0) {
+                        CDEBUG(D_HA, "lov idx %d no lock?\n", loi->loi_ost_idx);
                         continue;
+                }
 
+                /* XXX LOV STACKING: submd should be from the subobj */
                 submd.lsm_object_id = loi->loi_id;
                 submd.lsm_stripe_count = 0;
                 err = obd_cancel(&lov->tgts[loi->loi_ost_idx].conn, &submd,
-                                mode, &lockhs[i]);
-                if (err && lov->tgts[loi->loi_ost_idx].active) {
-                        CERROR("Error cancel objid "LPX64" subobj "LPX64
-                               " on OST idx %d: rc = %d\n", lsm->lsm_object_id,
-                               loi->loi_id, loi->loi_ost_idx, err);
-                        if (!rc)
-                                rc = err;
+                                 mode, lov_lockhp);
+                if (err) {
+                        if (lov->tgts[loi->loi_ost_idx].active) {
+                                CERROR("Error cancel objid "LPX64" subobj "
+                                       LPX64" on OST idx %d: rc = %d\n",
+                                       lsm->lsm_object_id,
+                                       loi->loi_id, loi->loi_ost_idx, err);
+                                if (!rc)
+                                        rc = err;
+                        }
                 }
         }
+
+        if (lsm->lsm_stripe_count > 1) {
+                lov_lockh->llh_cookie = DEAD_HANDLE_MAGIC;
+                OBD_FREE(lov_lockh, sizeof(*lov_lockh) +
+                          sizeof(*lov_lockh->llh_handles) *
+                          lsm->lsm_stripe_count);
+        }
+        lockh->addr = 0;
+        lockh->cookie = DEAD_HANDLE_MAGIC;
+
         RETURN(rc);
 }
 
@@ -1302,7 +1398,7 @@ static int lov_cancel_unused(struct lustre_handle *conn,
         struct obd_export *export = class_conn2export(conn);
         struct lov_obd *lov;
         struct lov_oinfo *loi;
-        int rc = 0, i, err;
+        int rc = 0, i;
         ENTRY;
 
         if (!lsm) {
@@ -1316,6 +1412,7 @@ static int lov_cancel_unused(struct lustre_handle *conn,
         lov = &export->exp_obd->u.lov;
         for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
                 struct lov_stripe_md submd;
+                int err;
 
                 submd.lsm_object_id = loi->loi_id;
                 submd.lsm_stripe_count = 0;
@@ -1352,12 +1449,14 @@ static int lov_statfs(struct lustre_handle *conn, struct obd_statfs *osfs)
         for (i = 0; i < lov->desc.ld_tgt_count; i++) {
                 int err;
 
-                if (!lov->tgts[i].active)
+                if (!lov->tgts[i].active) {
+                        CDEBUG(D_HA, "lov idx %d inactive\n", i);
                         continue;
+                }
 
                 err = obd_statfs(&lov->tgts[i].conn, &lov_sfs);
                 if (err) {
-                        CERROR("Error statfs OSC %s idx %d: err = %d\n",
+                        CERROR("Error statfs OSC %s i %d: err = %d\n",
                                lov->tgts[i].uuid, i, err);
                         if (!rc)
                                 rc = err;
@@ -1389,7 +1488,6 @@ static int lov_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len,
 {
         struct obd_device *obddev = class_conn2obd(conn);
         struct lov_obd *lov = &obddev->u.lov;
-        struct obd_ioctl_data *data = karg;
         int i, count = lov->desc.ld_tgt_count;
         int rc;
 
@@ -1397,10 +1495,12 @@ static int lov_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len,
 
         switch (cmd) {
         case IOC_LOV_SET_OSC_ACTIVE: {
+                struct obd_ioctl_data *data = karg;
                 rc = lov_set_osc_active(lov,data->ioc_inlbuf1,data->ioc_offset);
                 break;
         }
         case OBD_IOC_LOV_GET_CONFIG: {
+                struct obd_ioctl_data *data = karg;
                 struct lov_tgt_desc *tgtdesc;
                 struct lov_desc *desc;
                 obd_uuid_t *uuidp;
@@ -1437,13 +1537,21 @@ static int lov_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len,
                 OBD_FREE(buf, len);
                 break;
         }
+        case LL_IOC_LOV_SETSTRIPE:
+                rc = lov_setstripe(conn, karg, uarg);
+                break;
+        case LL_IOC_LOV_GETSTRIPE:
+                rc = lov_getstripe(conn, karg, uarg);
+                break;
         default:
                 if (count == 0)
                         RETURN(-ENOTTY);
                 rc = 0;
                 for (i = 0; i < count; i++) {
-                        int err = obd_iocontrol(cmd, &lov->tgts[i].conn,
-                                                len, karg, uarg);
+                        int err;
+
+                        err = obd_iocontrol(cmd, &lov->tgts[i].conn,
+                                            len, karg, uarg);
                         if (err && !rc)
                                 rc = err;
                 }
@@ -1453,6 +1561,7 @@ static int lov_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len,
 }
 
 struct obd_ops lov_obd_ops = {
+        o_owner:       THIS_MODULE,
         o_attach:      lov_attach,
         o_detach:      lov_detach,
         o_setup:       lov_setup,
index 247015c..3d4b4b8 100644 (file)
 #include <linux/lustre_net.h>
 #include <linux/obd.h>
 #include <linux/obd_lov.h>
+#include <linux/obd_class.h>
 #include <linux/obd_support.h>
 
 /* lov_packdesc() is in mds/mds_lov.c */
-
 void lov_unpackdesc(struct lov_desc *ld)
 {
         ld->ld_tgt_count = NTOH__u32(ld->ld_tgt_count);
@@ -39,6 +39,28 @@ void lov_unpackdesc(struct lov_desc *ld)
         ld->ld_pattern = HTON__u32(ld->ld_pattern);
 }
 
+void lov_dump_lmm(int level, struct lov_mds_md *lmm)
+{
+        struct lov_object_id *loi;
+        int idx;
+        /* Log @lmm's header fields at @level, then all lmm_ost_count objects. */
+        CDEBUG(level, "objid "LPX64", magic %#08x, ost_count %u\n",
+               lmm->lmm_object_id, lmm->lmm_magic, lmm->lmm_ost_count);
+        CDEBUG(level,"stripe_size %u, stripe_count %u, stripe_offset %u\n",
+               lmm->lmm_stripe_size, lmm->lmm_stripe_count,
+               lmm->lmm_stripe_offset);
+        for (idx = 0, loi = lmm->lmm_objects; idx < lmm->lmm_ost_count;
+             idx++, loi++)
+                CDEBUG(level, "ost idx %u subobj "LPX64"\n", idx,
+                       loi->l_object_id);
+}
+/* Dump the "lmm" in the caller's scope if "test" fails; expands "test" twice. */
+#define LMM_ASSERT(test)                                                \
+do {                                                                    \
+        if (!(test)) lov_dump_lmm(D_ERROR, lmm);                        \
+        LASSERT(test); /* so we know what assertion failed */           \
+} while(0)
+
 /* Pack LOV object metadata for shipment to the MDS.
  *
  * XXX In the future, this will be enhanced to get the EA size from the
@@ -60,12 +82,19 @@ int lov_packmd(struct lustre_handle *conn, struct lov_mds_md **lmmp,
         ENTRY;
 
         if (lsm) {
+                int i, max = 0;
                 if (lsm->lsm_magic != LOV_MAGIC) {
-                        CERROR("bad mem LOV MAGIC: %#08x != %#08x\n",
+                        CERROR("bad mem LOV MAGIC: %#010x != %#010x\n",
                                lsm->lsm_magic, LOV_MAGIC);
                         RETURN(-EINVAL);
                 }
                 stripe_count = lsm->lsm_stripe_count;
+
+                for (i = 0,loi = lsm->lsm_oinfo; i < stripe_count; i++,loi++) {
+                        if (loi->loi_ost_idx > max)
+                                max = loi->loi_ost_idx;
+                }
+                ost_count = max + 1;
         }
 
         /* XXX LOV STACKING call into osc for sizes */
@@ -93,14 +122,14 @@ int lov_packmd(struct lustre_handle *conn, struct lov_mds_md **lmmp,
         lmm->lmm_stripe_count = (stripe_count);
         if (!lsm)
                 RETURN(lmm_size);
+
         /* XXX endianness */
         lmm->lmm_magic = (lsm->lsm_magic);
         lmm->lmm_object_id = (lsm->lsm_object_id);
         LASSERT(lsm->lsm_object_id);
         lmm->lmm_stripe_size = (lsm->lsm_stripe_size);
-        lmm->lmm_stripe_pattern = (lsm->lsm_stripe_pattern);
         lmm->lmm_stripe_offset = (lsm->lsm_stripe_offset);
-        lmm->lmm_ost_count = (lov->desc.ld_tgt_count);
+        lmm->lmm_ost_count = (ost_count);
 
         /* Only fill in the object ids which we are actually using.
          * Assumes lmm_objects is otherwise zero-filled. */
@@ -113,6 +142,16 @@ int lov_packmd(struct lustre_handle *conn, struct lov_mds_md **lmmp,
         RETURN(lmm_size);
 }
 
+static int lov_get_stripecnt(struct lov_obd *lov, int stripe_count)
+{
+        if (!stripe_count) /* nothing requested: take the LOV default */
+                stripe_count = lov->desc.ld_default_stripe_count;
+        if (!stripe_count || stripe_count > lov->desc.ld_active_tgt_count)
+                stripe_count = lov->desc.ld_active_tgt_count;
+        /* i.e. request or default, replaced by the active count if 0 or larger */
+        return stripe_count;
+}
+
 int lov_unpackmd(struct lustre_handle *conn, struct lov_stripe_md **lsmp,
                  struct lov_mds_md *lmm)
 {
@@ -120,9 +159,9 @@ int lov_unpackmd(struct lustre_handle *conn, struct lov_stripe_md **lsmp,
         struct lov_obd *lov = &obd->u.lov;
         struct lov_stripe_md *lsm;
         struct lov_oinfo *loi;
-        int ost_count = lov->desc.ld_active_tgt_count;
+        int ost_count;
         int ost_offset = 0;
-        int stripe_count = 0;
+        int stripe_count;
         int lsm_size;
         int i;
         ENTRY;
@@ -135,12 +174,9 @@ int lov_unpackmd(struct lustre_handle *conn, struct lov_stripe_md **lsmp,
                         RETURN(-EINVAL);
                 }
                 stripe_count = (lmm->lmm_stripe_count);
-        }
-
-        if (!stripe_count)
-                stripe_count = lov->desc.ld_default_stripe_count;
-        if (!stripe_count || stripe_count > ost_count)
-                stripe_count = ost_count;
+                LASSERT(stripe_count);
+        } else
+                stripe_count = lov_get_stripecnt(lov, 0);
 
         /* XXX LOV STACKING call into osc for sizes */
         lsm_size = lov_stripe_md_size(stripe_count);
@@ -171,9 +207,12 @@ int lov_unpackmd(struct lustre_handle *conn, struct lov_stripe_md **lsmp,
         ost_offset = lsm->lsm_stripe_offset = (lmm->lmm_stripe_offset);
         lsm->lsm_magic = (lmm->lmm_magic);
         lsm->lsm_object_id = (lmm->lmm_object_id);
-        LASSERT(lsm->lsm_object_id);
         lsm->lsm_stripe_size = (lmm->lmm_stripe_size);
-        lsm->lsm_stripe_pattern = (lmm->lmm_stripe_pattern);
+
+        ost_count = (lmm->lmm_ost_count);
+
+        LMM_ASSERT(lsm->lsm_object_id);
+        LMM_ASSERT(ost_count);
 
         for (i = 0, loi = lsm->lsm_oinfo; i < ost_count; i++, ost_offset++) {
                 ost_offset %= ost_count;
@@ -181,13 +220,126 @@ int lov_unpackmd(struct lustre_handle *conn, struct lov_stripe_md **lsmp,
                 if (!lmm->lmm_objects[ost_offset].l_object_id)
                         continue;
 
-                LASSERT(loi - lsm->lsm_oinfo < stripe_count);
+                LMM_ASSERT(loi - lsm->lsm_oinfo < stripe_count);
                 /* XXX LOV STACKING call down to osc_unpackmd() */
                 loi->loi_id = (lmm->lmm_objects[ost_offset].l_object_id);
                 loi->loi_ost_idx = ost_offset;
                 loi++;
         }
-        LASSERT(loi - lsm->lsm_oinfo == stripe_count);
+        LMM_ASSERT(loi - lsm->lsm_oinfo > 0);
+        LMM_ASSERT(loi - lsm->lsm_oinfo == stripe_count);
 
         RETURN(lsm_size);
 }
+
+/* Configure object striping information on a new file.
+ *
+ * @lmmu: user-space struct carrying the application's preferences in
+ * lmm_stripe_count, lmm_stripe_size and lmm_stripe_offset; its lmm_magic
+ * must be LOV_MAGIC.  @lsmp: receives a newly allocated in-core stripe MD.
+ * Returns 0 on success, -EFAULT if the user copy fails, -EINVAL if a
+ * field fails validation, or -ENOMEM if the allocation fails. */
+int lov_setstripe(struct lustre_handle *conn, struct lov_stripe_md **lsmp,
+                  struct lov_mds_md *lmmu)
+{
+        struct obd_device *obd = class_conn2obd(conn);
+        struct lov_obd *lov = &obd->u.lov;
+        struct lov_mds_md lmm;
+        struct lov_stripe_md *lsm;
+        int stripe_count;
+        ENTRY;
+
+        if (copy_from_user(&lmm, lmmu, sizeof(lmm)))
+                RETURN(-EFAULT);
+
+        if (lmm.lmm_magic != LOV_MAGIC) {
+                CERROR("bad wire LOV MAGIC: %#08x != %#08x\n",
+                       lmm.lmm_magic, LOV_MAGIC);
+                RETURN(-EINVAL);
+        }
+        if (lmm.lmm_stripe_count > lov->desc.ld_tgt_count) {
+                CERROR("stripe count %d more than OST count %d\n",
+                       (int)lmm.lmm_stripe_count, lov->desc.ld_tgt_count);
+                RETURN(-EINVAL);
+        }
+        if (lmm.lmm_stripe_offset >= lov->desc.ld_tgt_count) {
+                CERROR("stripe offset %d more than max OST index %d\n",
+                       (int)lmm.lmm_stripe_offset, lov->desc.ld_tgt_count);
+                RETURN(-EINVAL);
+        }
+        if (lmm.lmm_stripe_size & (PAGE_SIZE - 1)) {
+                CERROR("stripe size %u not multiple of %lu\n",
+                       lmm.lmm_stripe_size, PAGE_SIZE);
+                RETURN(-EINVAL);
+        }
+        /* Widen to 64 bits before multiplying so the product cannot wrap
+         * before it is compared against the native word limit. */
+        if ((__u64)lmm.lmm_stripe_size * lmm.lmm_stripe_count > ~0UL) {
+                CERROR("stripe width %ux%u > %lu on 32-bit system\n",
+                       lmm.lmm_stripe_size, (int)lmm.lmm_stripe_count, ~0UL);
+                RETURN(-EINVAL);
+        }
+
+        stripe_count = lov_get_stripecnt(lov, lmm.lmm_stripe_count);
+
+        /* XXX LOV STACKING call into osc for sizes */
+        OBD_ALLOC(lsm, lov_stripe_md_size(stripe_count));
+        if (!lsm)
+                RETURN(-ENOMEM);
+
+        lsm->lsm_magic = LOV_MAGIC;
+        /* This is all validated in lov_create() */
+        lsm->lsm_stripe_count = stripe_count;
+        lsm->lsm_stripe_offset = lmm.lmm_stripe_offset;
+        lsm->lsm_stripe_size = lmm.lmm_stripe_size;
+
+        *lsmp = lsm;
+
+        RETURN(0);
+}
+
+/* Retrieve object striping information.
+ *
+ * @lmmu is a pointer to a user-space struct (it is read with copy_from_user
+ * and written with copy_to_user) whose lmm_ost_count gives the maximum number
+ * of OST indices which will fit in the user buffer; lmm_magic must be
+ * LOV_MAGIC.  Returns 0 on success or a negative errno on failure. */
+int lov_getstripe(struct lustre_handle *conn, struct lov_stripe_md *lsm,
+                  struct lov_mds_md *lmmu)
+{
+        struct obd_device *obd = class_conn2obd(conn);
+        struct lov_obd *lov = &obd->u.lov;
+        struct lov_mds_md lmm, *lmmk = NULL;
+        int ost_count, rc, lmm_size;
+        ENTRY;
+
+        if (!lsm)
+                RETURN(-ENODATA);
+
+        rc = copy_from_user(&lmm, lmmu, sizeof(lmm));
+        if (rc)
+                RETURN(-EFAULT);
+
+        if (lmm.lmm_magic != LOV_MAGIC)
+                RETURN(-EINVAL);
+
+        ost_count = lov->desc.ld_tgt_count;
+
+        /* XXX we _could_ check if indices > user lmm_ost_count are zero */
+        if (lmm.lmm_ost_count < ost_count)
+                RETURN(-EOVERFLOW);
+
+        rc = lov_packmd(conn, &lmmk, lsm);
+        if (rc < 0)
+                RETURN(rc);
+
+        lmm_size = rc; /* non-negative lov_packmd() result is the packed size */
+        rc = 0;
+
+        if (lmm_size && copy_to_user(lmmu, lmmk, lmm_size))
+                rc = -EFAULT;
+
+        obd_free_wiremd(conn, &lmmk);
+
+        RETURN(rc);
+}
index a68b57e..0812e00 100644 (file)
  * Common STATUS namespace
  */
 
-int rd_uuid(char* page, char **start, off_t off, int count, int *eof, 
+int rd_uuid(char *page, char **start, off_t off, int count, int *eof,
             void *data)
 {
-        int len = 0;
         struct obd_device* dev = (struct obd_device*)data;
-        len += snprintf(page, count, "%s\n", dev->obd_uuid);
-        return len;
-        
-
+        return snprintf(page, count, "%s\n", dev->obd_uuid);
 }
-int rd_stripesize(char* page, char **start, off_t off, int count, int *eof, 
+
+int rd_stripesize(char *page, char **start, off_t off, int count, int *eof,
                   void *data)
 {
-        struct obd_device* dev = (struct obd_device*)data;
-        int len = 0; 
-        struct lov_obd* lov = &dev->u.lov;
-        len += snprintf(page, count, LPU64"\n", 
-                        (__u64)(lov->desc.ld_default_stripe_size));
-        
-        return len;
+        struct obd_device *dev = (struct obd_device*)data;
+        struct lov_desc *desc = &dev->u.lov.desc;
+
+        return snprintf(page, count, LPU64"\n", desc->ld_default_stripe_size);
 }
 
-int rd_stripeoffset(char* page, char **start, off_t off, int count, int *eof, 
+int rd_stripeoffset(char *page, char **start, off_t off, int count, int *eof,
                     void *data)
 {
         struct obd_device* dev = (struct obd_device*)data;
-        int len = 0;
         struct lov_obd* lov = &dev->u.lov;
-        len += snprintf(page, count, LPU64"\n", 
-                        lov->desc.ld_default_stripe_offset);
-        return len;
 
+        return snprintf(page, count, LPU64"\n",
+                        lov->desc.ld_default_stripe_offset);
 }
 
-int rd_stripetype(char* page, char **start, off_t off, int count, int *eof, 
+int rd_stripetype(char *page, char **start, off_t off, int count, int *eof,
                   void *data)
 {
         struct obd_device* dev = (struct obd_device*)data;
-        int len = 0;
         struct lov_obd* lov = &dev->u.lov;
-        len += snprintf(page, count, LPU64"\n", 
-                        (__u64)(lov->desc.ld_pattern));
-        return len;
 
+        return snprintf(page, count, "%u\n", lov->desc.ld_pattern);
 }
-int rd_stripecount(char* page, char **start, off_t off, int count, int *eof, 
+
+int rd_stripecount(char *page, char **start, off_t off, int count, int *eof,
                    void *data)
-{       
+{
         struct obd_device* dev = (struct obd_device*)data;
-        int len = 0;
         struct lov_obd* lov = &dev->u.lov;
-        len += snprintf(page, count, LPU64"\n", 
-                        (__u64)(lov->desc.ld_default_stripe_count));
-        return len;
 
+        return snprintf(page, count, "%u\n", lov->desc.ld_default_stripe_count);
 }
-int rd_numobd(char* page, char **start, off_t off, int count, int *eof, 
+
+int rd_numobd(char *page, char **start, off_t off, int count, int *eof,
               void *data)
-{       
-        struct obd_device* dev = (struct obd_device*)data;
-        int len = 0;
-        struct lov_obd* lov=&dev->u.lov;
-        len += snprintf(page, count, LPU64"\n", 
-                        (__u64)(lov->desc.ld_tgt_count));
-        return len;
+{
+        struct obd_device *dev = (struct obd_device*)data;
+        struct lov_obd *lov = &dev->u.lov;
+
+        return snprintf(page, count, "%u\n", lov->desc.ld_tgt_count);
 
 }
 
-int rd_activeobd(char* page, char **start, off_t off, int count, int *eof, 
+int rd_activeobd(char *page, char **start, off_t off, int count, int *eof,
                  void *data)
-{       
+{
         struct obd_device* dev = (struct obd_device*)data;
-        int len = 0;
         struct lov_obd* lov = &dev->u.lov;
-        len += snprintf(page, count, LPU64"\n", 
-                        (__u64)(lov->desc.ld_active_tgt_count));
-        return len;
 
+        return snprintf(page, count, "%u\n", lov->desc.ld_active_tgt_count);
 }
 
-int rd_blksize(char* page, char **start, off_t off, int count, int *eof, 
+int rd_blksize(char *page, char **start, off_t off, int count, int *eof,
                void *data)
 {
         return 0;
 }
 
 
-int rd_kbtotal(char* page, char **start, off_t off, int count, int *eof, 
+int rd_kbtotal(char *page, char **start, off_t off, int count, int *eof,
                void *data)
 {
         return 0;
 }
 
 
-int rd_kbfree(char* page, char **start, off_t off, int count, int *eof, 
+int rd_kbfree(char *page, char **start, off_t off, int count, int *eof,
               void *data)
 {
         return 0;
 }
 
-int rd_filestotal(char* page, char **start, off_t off, int count, int *eof, 
+int rd_filestotal(char *page, char **start, off_t off, int count, int *eof,
                   void *data)
 {
         return 0;
 }
 
-int rd_filesfree(char* page, char **start, off_t off, int count, int *eof, 
+int rd_filesfree(char* page, char **start, off_t off, int count, int *eof,
                  void *data)
 {
         return 0;
 }
 
-int rd_filegroups(char* page, char **start, off_t off, int count, int *eof, 
-                 void *data)
+int rd_filegroups(char* page, char **start, off_t off, int count, int *eof,
+                  void *data)
 {
         return 0;
 }
 
-int rd_target(char* page, char **start, off_t off, int count, int *eof, 
+int rd_target(char *page, char **start, off_t off, int count, int *eof,
               void *data)
 {
         struct obd_device* dev = (struct obd_device*)data;
         int len = 0, i = 0;
         struct lov_obd* lov = &dev->u.lov;
         struct lov_tgt_desc* tgts = lov->tgts;
-        while(i < lov->desc.ld_tgt_count){
-                len += snprintf(&page[len], count, "%d: %s\n", i, tgts->uuid);
+        while (i < lov->desc.ld_tgt_count) {
+                len += snprintf(&page[len], count - len, "%d: %s %sACTIVE\n",
+                                i, tgts->uuid, tgts->active ? "" : "IN");
                 i++;
                 tgts++;
         }
-        
+
         return len;
 }
+
 int rd_mdc(char* page, char **start, off_t off, int count, int *eof, void *data)
 {
         struct obd_device* dev = (struct obd_device*)data;
@@ -186,16 +171,15 @@ struct lprocfs_vars status_var_nm_1[] = {
         {"status/kbytesfree", rd_kbfree, 0, 0},
         {"status/target_obd", rd_target, 0, 0},
         {"status/target_mdc", rd_mdc, 0, 0},
-       
         {0}
 };
-int rd_numrefs(char* page, char **start, off_t off, int count, int *eof, 
+
+int rd_numrefs(char *page, char **start, off_t off, int count, int *eof,
                void *data)
 {
         struct obd_type* class = (struct obd_type*)data;
-        int len = 0;
-        len += snprintf(page, count, "%d\n", class->typ_refcnt);
-        return len;
+
+        return snprintf(page, count, "%d\n", class->typ_refcnt);
 }
 
 struct lprocfs_vars status_class_var[]={
index 8dd9175..1d9c099 100644 (file)
@@ -9,11 +9,9 @@ MODULE = mdc
 modulefs_DATA = mdc.o
 EXTRA_PROGRAMS = mdc
 
-LINX= mds_updates.c ll_pack.c client.c
+LINX= mds_updates.c client.c
 mdc_SOURCES =  mdc_request.c mdc_reint.c lproc_mdc.c $(LINX)
 
-ll_pack.c: 
-       test -e ll_pack.c || ln -sf $(top_srcdir)/lib/ll_pack.c .
 mds_updates.c: 
        test -e mds_updates.c || ln -sf $(top_srcdir)/lib/mds_updates.c .
 client.c: 
index c856d10..a97cfb5 100644 (file)
@@ -37,24 +37,25 @@ extern int mds_queue_req(struct ptlrpc_request *);
 extern struct lprocfs_vars status_var_nm_1[];
 extern struct lprocfs_vars status_class_var[];
 
-/* should become mdc_getinfo() */
-int mdc_getstatus(struct lustre_handle *conn, struct ll_fid *rootfid)
+/* Helper that implements most of mdc_getstatus and signal_completed_replay. */
+static int send_getstatus(struct obd_import *imp, struct ll_fid *rootfid,
+                          int level, int msg_flags)
 {
         struct ptlrpc_request *req;
         struct mds_body *body;
         int rc, size = sizeof(*body);
         ENTRY;
 
-        req = ptlrpc_prep_req(class_conn2cliimp(conn), MDS_GETSTATUS, 1, &size,
-                              NULL);
+        req = ptlrpc_prep_req(imp, MDS_GETSTATUS, 1, &size, NULL);
         if (!req)
                 GOTO(out, rc = -ENOMEM);
 
         body = lustre_msg_buf(req->rq_reqmsg, 0);
-        req->rq_level = LUSTRE_CONN_CON;
+        req->rq_level = level;
         req->rq_replen = lustre_msg_size(1, &size);
-
+        
         mds_pack_req_body(req);
+        req->rq_reqmsg->flags |= msg_flags;
         rc = ptlrpc_queue_wait(req);
 
         if (!rc) {
@@ -74,6 +75,13 @@ int mdc_getstatus(struct lustre_handle *conn, struct ll_fid *rootfid)
         return rc;
 }
 
+/* should become mdc_getinfo() */
+int mdc_getstatus(struct lustre_handle *conn, struct ll_fid *rootfid)
+{
+        return send_getstatus(class_conn2cliimp(conn), rootfid, LUSTRE_CONN_CON,
+                              0);
+}
+
 int mdc_getlovinfo(struct obd_device *obd, struct lustre_handle *mdc_connh,
                    struct ptlrpc_request **request)
 {
@@ -104,9 +112,8 @@ int mdc_getlovinfo(struct obd_device *obd, struct lustre_handle *mdc_connh,
         RETURN(rc);
 }
 
-
 int mdc_getattr(struct lustre_handle *conn,
-                obd_id ino, int type, unsigned long valid, size_t ea_size,
+                obd_id ino, int type, unsigned long valid, unsigned int ea_size,
                 struct ptlrpc_request **request)
 {
         struct ptlrpc_request *req;
@@ -130,7 +137,7 @@ int mdc_getattr(struct lustre_handle *conn,
                 size[bufcount] = ea_size;
                 bufcount++;
                 body->size = ea_size;
-                CDEBUG(D_INODE, "reserving %d bytes for MD/symlink in packet\n",
+                CDEBUG(D_INODE, "reserved %u bytes for MD/symlink in packet\n",
                        ea_size);
         }
         req->rq_replen = lustre_msg_size(bufcount, size);
@@ -150,6 +157,50 @@ int mdc_getattr(struct lustre_handle *conn,
         return rc;
 }
 
+int mdc_getattr_name(struct lustre_handle *conn, struct inode *parent,
+                     char *filename, int namelen, unsigned long valid,
+                     unsigned int ea_size, struct ptlrpc_request **request)
+{
+        struct ptlrpc_request *req;
+        struct mds_body *body;
+        int rc, size[2] = {sizeof(*body), namelen}, bufcount = 1;
+        ENTRY;
+
+        req = ptlrpc_prep_req(class_conn2cliimp(conn), MDS_GETATTR_NAME, 2,
+                              size, NULL);
+        if (!req)
+                GOTO(out, rc = -ENOMEM);
+
+        body = lustre_msg_buf(req->rq_reqmsg, 0);
+        ll_inode2fid(&body->fid1, parent);
+        body->valid = valid;
+        memcpy(lustre_msg_buf(req->rq_reqmsg, 1), filename, namelen);
+
+        if (ea_size) {
+                size[1] = ea_size;
+                bufcount++;
+                body->size = ea_size;
+                CDEBUG(D_INODE, "reserved %u bytes for MD/symlink in packet\n",
+                       ea_size);
+                valid |= OBD_MD_FLEASIZE;
+        }
+
+        req->rq_replen = lustre_msg_size(bufcount, size);
+        mds_pack_req_body(req);
+
+        rc = ptlrpc_queue_wait(req);
+
+        if (!rc) {
+                body = lustre_msg_buf(req->rq_repmsg, 0);
+                mds_unpack_body(body);
+        }
+
+        EXIT;
+ out:
+        *request = req;
+        return rc;
+}
+
 void d_delete_aliases(struct inode *inode)
 {
         struct dentry *dentry = NULL;
@@ -187,15 +238,19 @@ static int mdc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
                 break;
         case LDLM_CB_CANCELING: {
                 /* Invalidate all dentries associated with this inode */
-                struct inode *inode = data;
-
-#warning "FIXME: what tells us that 'inode' is valid at all?"
-                if (inode->i_state & I_FREEING)
-                        break;
+                struct inode *inode;
 
-                LASSERT(inode != NULL);
+                LASSERT(data != NULL);
                 LASSERT(data_len == sizeof(*inode));
 
+                /* XXX what tells us that 'data' is a valid inode at all?
+                 *     we should probably validate the lock handle first?
+                 */
+                inode = igrab(data);
+
+                if (inode == NULL)      /* inode->i_state & I_FREEING */
+                        break;
+
                 if (S_ISDIR(inode->i_mode)) {
                         CDEBUG(D_INODE, "invalidating inode %lu\n",
                                inode->i_ino);
@@ -203,12 +258,10 @@ static int mdc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
                         ll_invalidate_inode_pages(inode);
                 }
 
-                if (inode != inode->i_sb->s_root->d_inode) {
-                        /* XXX should this igrab move up 12 lines? */
-                        LASSERT(igrab(inode) == inode);
+                if (inode != inode->i_sb->s_root->d_inode)
                         d_delete_aliases(inode);
-                        iput(inode);
-                }
+
+                iput(inode);
                 break;
         }
         default:
@@ -225,11 +278,16 @@ void mdc_store_inode_generation(struct ptlrpc_request *req, int reqoff,
         struct mds_rec_create *rec = lustre_msg_buf(req->rq_reqmsg, reqoff);
         struct mds_body *body = lustre_msg_buf(req->rq_repmsg, repoff);
 
-        DEBUG_REQ(D_HA, req, "storing generation %x for ino "LPD64,
-                  body->fid1.generation, body->fid1.id);
         memcpy(&rec->cr_replayfid, &body->fid1, sizeof rec->cr_replayfid);
+        DEBUG_REQ(D_HA, req, "storing generation %x for ino "LPD64,
+                  rec->cr_replayfid.generation, rec->cr_replayfid.id);
 }
 
+/* We always reserve enough space in the reply packet for a stripe MD, because
+ * we don't know the file type in advance.
+ *
+ * XXX we could get that from ext2_dir_entry_2 file_type
+ */
 int mdc_enqueue(struct lustre_handle *conn, int lock_type,
                 struct lookup_intent *it, int lock_mode, struct inode *dir,
                 struct dentry *de, struct lustre_handle *lockh,
@@ -408,7 +466,8 @@ int mdc_enqueue(struct lustre_handle *conn, int lock_type,
                                     &lockh2)) {
                         /* We already have a lock; cancel the old one */
                         ldlm_lock_decref(lockh, lock_mode);
-                        ldlm_cli_cancel(lockh);
+                        /* FIXME: bug 563 */
+                        //ldlm_cli_cancel(lockh);
                         memcpy(lockh, &lockh2, sizeof(lockh2));
                 }
                 LDLM_LOCK_PUT(lock);
@@ -459,6 +518,13 @@ static void mdc_replay_open(struct ptlrpc_request *req)
         memcpy(saved->fh, &body->handle, sizeof(body->handle));
 }
 
+/* If lmm is non-NULL and lmm_size is non-zero, the stripe MD is stored on
+ * the MDS.  Otherwise, we have already read a copy from the MDS (probably
+ * during mdc_enqueue()) and we do not need to send it to the MDS again.
+ *
+ * In the future (when we support the non-intent case) we need to be able
+ * to read the stripe MD from the MDS here (need to fix mds_open() too).
+ */
 int mdc_open(struct lustre_handle *conn, obd_id ino, int type, int flags,
              struct lov_mds_md *lmm, int lmm_size, struct lustre_handle *fh,
              struct ptlrpc_request **request)
@@ -469,9 +535,9 @@ int mdc_open(struct lustre_handle *conn, obd_id ino, int type, int flags,
         struct ptlrpc_request *req;
         ENTRY;
 
-        if (lmm && lmm_size) {
+        if (lmm_size) {
                 bufcount = 3;
-                size[2] = size[1]; /* shuffle the spare data along */
+                size[2] = size[1]; /* shuffle the replay data along */
                 size[1] = lmm_size;
         }
 
@@ -487,12 +553,14 @@ int mdc_open(struct lustre_handle *conn, obd_id ino, int type, int flags,
         body->flags = HTON__u32(flags);
         memcpy(&body->handle, fh, sizeof(body->handle));
 
-        if (lmm && lmm_size) {
-                CDEBUG(D_INODE, "sending %u bytes MD for ino "LPU64"\n",
-                       lmm_size, ino);
-                lustre_msg_set_op_flags(req->rq_reqmsg, MDS_OPEN_HAS_EA);
-                memcpy(lustre_msg_buf(req->rq_reqmsg, 1), lmm, lmm_size);
+        if (lmm_size) {
                 body->flags |= HTON__u32(OBD_MD_FLEASIZE);
+                if (lmm) {
+                        CDEBUG(D_INODE, "sending %u bytes MD for ino "LPU64"\n",
+                               lmm_size, ino);
+                        lustre_msg_set_op_flags(req->rq_reqmsg,MDS_OPEN_HAS_EA);
+                        memcpy(lustre_msg_buf(req->rq_reqmsg,1), lmm, lmm_size);
+                }
         }
 
         req->rq_replen = lustre_msg_size(1, size);
@@ -502,12 +570,12 @@ int mdc_open(struct lustre_handle *conn, obd_id ino, int type, int flags,
                 body = lustre_msg_buf(req->rq_repmsg, 0);
                 mds_unpack_body(body);
                 memcpy(fh, &body->handle, sizeof(*fh));
-        }
 
-        /* If open is replayed, we need to fix up the fh. */
-        req->rq_replay_cb = mdc_replay_open;
-        replay_data = lustre_msg_buf(req->rq_reqmsg, lmm ? 2 : 1);
-        replay_data->fh = fh;
+                /* If open is replayed, we need to fix up the fh. */
+                req->rq_replay_cb = mdc_replay_open;
+                replay_data = lustre_msg_buf(req->rq_reqmsg, lmm ? 2 : 1);
+                replay_data->fh = fh;
+        }
 
         EXIT;
  out:
@@ -635,9 +703,19 @@ static int mdc_detach(struct obd_device *dev)
         return lprocfs_dereg_obd(dev);
 }
 
+/* Send a mostly-dummy GETSTATUS request to signal that replay is complete. */
+static int signal_completed_replay(struct obd_import *imp)
+{
+        struct ll_fid fid;
+        
+        return send_getstatus(imp, &fid, LUSTRE_CONN_RECOVD, MSG_LAST_REPLAY);
+}
+
 static int mdc_recover(struct obd_import *imp, int phase)
 {
         int rc;
+        unsigned long flags;
+        struct ptlrpc_request *req;
         ENTRY;
 
         switch(phase) {
@@ -647,13 +725,30 @@ static int mdc_recover(struct obd_import *imp, int phase)
                 RETURN(0);
             case PTLRPC_RECOVD_PHASE_RECOVER:
         reconnect:
-                rc = ptlrpc_reconnect_import(imp, MDS_CONNECT);
+                rc = ptlrpc_reconnect_import(imp, MDS_CONNECT, &req);
+
+                /* We were still connected, just go about our business. */
                 if (rc == EALREADY)
-                        RETURN(ptlrpc_replay(imp, 0));
-                if (rc)
+                        GOTO(skip_replay, rc);
+
+                if (rc) {
+                        ptlrpc_req_finished(req);
                         RETURN(rc);
+                }
+                
+                /* We can't replay, which might be a problem. */
+                if (!(lustre_msg_get_flags(req->rq_repmsg) &
+                      MSG_REPLAY_IN_PROGRESS)) {
+                        if (phase != PTLRPC_RECOVD_PHASE_NOTCONN) {
+                             CERROR("can't replay, invalidating\n");
+                             ldlm_namespace_cleanup(imp->imp_obd->obd_namespace,
+                                                    1);
+                             ptlrpc_abort_inflight(imp);
+                        }
+                        goto skip_replay;
+                }
 
-                rc = ptlrpc_replay(imp, 0 /* no last flag*/);
+                rc = ptlrpc_replay(imp);
                 if (rc)
                         RETURN(rc);
 
@@ -661,9 +756,16 @@ static int mdc_recover(struct obd_import *imp, int phase)
                 if (rc)
                         RETURN(rc);
 
-                spin_lock(&imp->imp_lock);
+                rc = signal_completed_replay(imp);
+                if (rc)
+                        RETURN(rc);
+
+        skip_replay:
+                ptlrpc_req_finished(req);
+                spin_lock_irqsave(&imp->imp_lock, flags);
                 imp->imp_level = LUSTRE_CONN_FULL;
-                spin_unlock(&imp->imp_lock);
+                imp->imp_flags &= ~IMP_INVALID;
+                spin_unlock_irqrestore(&imp->imp_lock, flags);
 
                 ptlrpc_wake_delayed(imp);
 
@@ -693,13 +795,14 @@ static int mdc_connect(struct lustre_handle *conn, struct obd_device *obd,
 }
 
 struct obd_ops mdc_obd_ops = {
-        o_attach: mdc_attach,
-        o_detach: mdc_detach,
-        o_setup:   client_obd_setup,
-        o_cleanup: client_obd_cleanup,
-        o_connect: mdc_connect,
-        o_disconnect: client_obd_disconnect,
-        o_statfs: mdc_statfs,
+        o_owner:       THIS_MODULE,
+        o_attach:      mdc_attach,
+        o_detach:      mdc_detach,
+        o_setup:       client_obd_setup,
+        o_cleanup:     client_obd_cleanup,
+        o_connect:     mdc_connect,
+        o_disconnect:  client_obd_disconnect,
+        o_statfs:      mdc_statfs
 };
 
 static int __init ptlrpc_request_init(void)
@@ -723,6 +826,7 @@ EXPORT_SYMBOL(mdc_getlovinfo);
 EXPORT_SYMBOL(mdc_enqueue);
 EXPORT_SYMBOL(mdc_cancel_unused);
 EXPORT_SYMBOL(mdc_getattr);
+EXPORT_SYMBOL(mdc_getattr_name);
 EXPORT_SYMBOL(mdc_create);
 EXPORT_SYMBOL(mdc_unlink);
 EXPORT_SYMBOL(mdc_rename);
index 6a0855e..12f06fc 100644 (file)
@@ -10,10 +10,8 @@ MODULE = mds
 modulefs_DATA = mds.o
 EXTRA_PROGRAMS = mds
 
-LINX= mds_updates.c simple.c ll_pack.c target.c
+LINX= mds_updates.c simple.c target.c
 
-ll_pack.c: 
-       test -e ll_pack.c || ln -sf $(top_srcdir)/lib/ll_pack.c
 mds_updates.c: 
        test -e mds_updates.c || ln -sf $(top_srcdir)/lib/mds_updates.c
 simple.c: 
index ea30d51..bfdad03 100644 (file)
@@ -297,7 +297,9 @@ static int mds_connect(struct lustre_handle *conn, struct obd_device *obd,
         if (!conn || !obd || !cluuid)
                 RETURN(-EINVAL);
 
-        MOD_INC_USE_COUNT;
+        /* lctl gets a backstage, all-access pass. */
+        if (!strcmp(cluuid, "OBD_CLASS_UUID"))
+                goto dont_check_exports;
 
         spin_lock(&obd->obd_dev_lock);
         list_for_each(p, &obd->obd_exports) {
@@ -308,41 +310,10 @@ static int mds_connect(struct lustre_handle *conn, struct obd_device *obd,
                         continue;
                 }
                 if (!memcmp(cluuid, mcd->mcd_uuid, sizeof mcd->mcd_uuid)) {
-                        /* XXX make handle-found-export a subroutine */
-                        LASSERT(exp->exp_obd == obd);
-
                         spin_unlock(&obd->obd_dev_lock);
-                        if (exp->exp_connection) {
-                                struct lustre_handle *hdl;
-                                hdl = &exp->exp_ldlm_data.led_import.imp_handle;
-                                /* Might be a re-connect after a partition. */
-                                if (!memcmp(conn, hdl, sizeof *conn)) {
-                                        CERROR("%s reconnecting\n", cluuid);
-                                        conn->addr = (__u64) (unsigned long)exp;
-                                        conn->cookie = exp->exp_cookie;
-                                        rc = EALREADY;
-                                } else {
-                                        CERROR("%s reconnecting from %s, "
-                                               "handle mismatch (ours %Lx/%Lx, "
-                                               "theirs %Lx/%Lx)\n", cluuid,
-                                               exp->exp_connection->
-                                               c_remote_uuid, hdl->addr,
-                                               hdl->cookie, conn->addr,
-                                               conn->cookie);
-                                        /* XXX disconnect them here? */
-                                        memset(conn, 0, sizeof *conn);
-                                        rc = -EALREADY;
-                                }
-                                MOD_DEC_USE_COUNT;
-                                RETURN(rc);
-                        }
-                        conn->addr = (__u64) (unsigned long)exp;
-                        conn->cookie = exp->exp_cookie;
-                        CDEBUG(D_INFO, "existing export for UUID '%s' at %p\n",
-                               cluuid, exp);
-                        CDEBUG(D_IOCTL,"connect: addr %Lx cookie %Lx\n",
-                               (long long)conn->addr, (long long)conn->cookie);
-                        RETURN(0);
+                        LASSERT(exp->exp_obd == obd);
+                        
+                        RETURN(target_handle_reconnect(conn, exp, cluuid));
                 }
         }
         spin_unlock(&obd->obd_dev_lock);
@@ -350,10 +321,10 @@ static int mds_connect(struct lustre_handle *conn, struct obd_device *obd,
         if (obd->u.mds.mds_recoverable_clients != 0) {
                 CERROR("denying connection for new client %s: in recovery\n",
                        cluuid);
-                MOD_DEC_USE_COUNT;
                 RETURN(-EBUSY);
         }
 
+ dont_check_exports:
         /* XXX There is a small race between checking the list and adding a
          * new connection for the same UUID, but the real threat (list
          * corruption when multiple different clients connect) is solved.
@@ -366,7 +337,7 @@ static int mds_connect(struct lustre_handle *conn, struct obd_device *obd,
          */
         rc = class_connect(conn, obd, cluuid);
         if (rc)
-                GOTO(out_dec, rc);
+                RETURN(rc);
         exp = class_conn2export(conn);
         LASSERT(exp);
         med = &exp->exp_mds_data;
@@ -393,8 +364,6 @@ out_mcd:
         OBD_FREE(mcd, sizeof(*mcd));
 out_export:
         class_disconnect(conn);
-out_dec:
-        MOD_DEC_USE_COUNT;
 
         return rc;
 }
@@ -427,11 +396,12 @@ static int mds_disconnect(struct lustre_handle *conn)
         list_for_each_safe(tmp, n, &med->med_open_head) {
                 struct mds_file_data *mfd =
                         list_entry(tmp, struct mds_file_data, mfd_list);
+                CERROR("force closing client file handle for %*s\n",
+                       mfd->mfd_file->f_dentry->d_name.len,
+                       mfd->mfd_file->f_dentry->d_name.name);
                 rc = mds_close_mfd(mfd, med);
-                if (rc) {
-                        /* XXX better diagnostics, with file path and stuff */
-                        CDEBUG(D_INODE, "Error %d closing mfd %p\n", rc, mfd);
-                }
+                if (rc)
+                        CDEBUG(D_INODE, "Error closing file: %d\n", rc);
         }
         spin_unlock(&med->med_open_lock);
 
@@ -439,8 +409,6 @@ static int mds_disconnect(struct lustre_handle *conn)
         mds_client_free(export);
 
         rc = class_disconnect(conn);
-        if (!rc)
-                MOD_DEC_USE_COUNT;
 
         RETURN(rc);
 }
@@ -473,7 +441,7 @@ static int mds_getstatus(struct ptlrpc_request *req)
         if (rc || OBD_FAIL_CHECK(OBD_FAIL_MDS_GETSTATUS_PACK)) {
                 CERROR("mds: out of memory for message: size=%d\n", size);
                 req->rq_status = -ENOMEM;
-                RETURN(0);
+                RETURN(-ENOMEM);
         }
 
         /* Flush any outstanding transactions to disk so the client will
@@ -510,16 +478,17 @@ static int mds_getlovinfo(struct ptlrpc_request *req)
         if (rc) {
                 CERROR("mds: out of memory for message: size=%d\n", size[1]);
                 req->rq_status = -ENOMEM;
-                RETURN(0);
+                RETURN(-ENOMEM);
         }
 
-        desc = lustre_msg_buf(req->rq_repmsg, 0);
-        rc = mds_get_lovdesc(mds, desc);
-        if (rc) {
-                req->rq_status = rc;
+        if (!mds->mds_has_lov_desc) {
+                req->rq_status = -ENOENT;
                 RETURN(0);
         }
 
+        desc = lustre_msg_buf(req->rq_repmsg, 0);
+        memcpy(desc, &mds->mds_lov_desc, sizeof *desc);
+        lov_packdesc(desc);
         tgt_count = le32_to_cpu(desc->ld_tgt_count);
         if (tgt_count * sizeof(obd_uuid_t) > streq->repbuf) {
                 CERROR("too many targets, enlarge client buffers\n");
@@ -527,8 +496,6 @@ static int mds_getlovinfo(struct ptlrpc_request *req)
                 RETURN(0);
         }
 
-        /* XXX the MDS should not really know about this */
-        mds->mds_max_mdsize = lov_mds_md_size(tgt_count);
         rc = mds_get_lovtgts(mds, tgt_count,
                              lustre_msg_buf(req->rq_repmsg, 1));
         if (rc) {
@@ -627,7 +594,7 @@ static int mds_getattr_internal(struct mds_obd *mds, struct dentry *dentry,
         mds_pack_inode2fid(&body->fid1, inode);
         mds_pack_inode2body(body, inode);
 
-        if (S_ISREG(inode->i_mode) /* && reqbody->valid & OBD_MD_FLEASIZE */) {
+        if (S_ISREG(inode->i_mode) && reqbody->valid & OBD_MD_FLEASIZE) {
                 rc = mds_pack_md(mds, req, reply_off + 1, body, inode);
         } else if (S_ISLNK(inode->i_mode) && reqbody->valid & OBD_MD_LINKNAME) {
                 char *symname = lustre_msg_buf(req->rq_repmsg, reply_off + 1);
@@ -645,6 +612,58 @@ static int mds_getattr_internal(struct mds_obd *mds, struct dentry *dentry,
         RETURN(rc);
 }
 
+static int mds_getattr_pack_msg(struct ptlrpc_request *req, struct inode *inode,
+                                int offset)
+{
+        struct mds_obd *mds = mds_req2mds(req);
+        struct mds_body *body;
+        int rc = 0, size[2] = {sizeof(*body)}, bufcount = 1;
+        ENTRY;
+
+        body = lustre_msg_buf(req->rq_reqmsg, offset);
+
+        if (S_ISREG(inode->i_mode) && body->valid & OBD_MD_FLEASIZE) {
+                int rc = fsfilt_get_md(req->rq_export->exp_obd, inode, NULL, 0);
+                CDEBUG(D_INODE, "got %d bytes MD data for inode %lu\n",
+                       rc, inode->i_ino);
+                if (rc < 0) {
+                        if (rc != -ENODATA)
+                                CERROR("error getting inode %lu MD: rc = %d\n",
+                                       inode->i_ino, rc);
+                        size[bufcount] = 0;
+                } else if (rc > mds->mds_max_mdsize) {
+                        size[bufcount] = 0;
+                        CERROR("MD size %d larger than maximum possible %u\n",
+                               rc, mds->mds_max_mdsize);
+                } else
+                        size[bufcount] = rc;
+                bufcount++;
+        } else if (body->valid & OBD_MD_LINKNAME) {
+                size[bufcount] = MIN(inode->i_size + 1, body->size);
+                bufcount++;
+                CDEBUG(D_INODE, "symlink size: %Lu, reply space: "LPU64"\n",
+                       inode->i_size + 1, body->size);
+        }
+
+        if (OBD_FAIL_CHECK(OBD_FAIL_MDS_GETATTR_PACK)) {
+                CERROR("failed MDS_GETATTR_PACK test\n");
+                req->rq_status = -ENOMEM;
+                GOTO(out, rc = -ENOMEM);
+        }
+
+        rc = lustre_pack_msg(bufcount, size, NULL, &req->rq_replen,
+                             &req->rq_repmsg);
+        if (rc) {
+                CERROR("out of memoryK\n");
+                req->rq_status = rc;
+                GOTO(out, rc);
+        }
+
+        EXIT;
+ out:
+        return(rc);
+}
+
 static int mds_getattr_name(int offset, struct ptlrpc_request *req)
 {
         struct mds_obd *mds = mds_req2mds(req);
@@ -680,7 +699,7 @@ static int mds_getattr_name(int offset, struct ptlrpc_request *req)
         push_ctxt(&saved, &mds->mds_ctxt, &uc);
         de = mds_fid2dentry(mds, &body->fid1, NULL);
         if (IS_ERR(de)) {
-                GOTO(out_pre_de, rc = -ENOENT);
+                GOTO(out_pre_de, rc = PTR_ERR(de));
         }
 
         dir = de->d_inode;
@@ -703,29 +722,33 @@ static int mds_getattr_name(int offset, struct ptlrpc_request *req)
                         GOTO(out_create_de, rc = -EIO);
                 }
         }
-        ldlm_lock_dump_handle(&lockh);
+        ldlm_lock_dump_handle(D_OTHER, &lockh);
 
         down(&dir->i_sem);
         dchild = lookup_one_len(name, de, namelen - 1);
+        up(&dir->i_sem);
         if (IS_ERR(dchild)) {
                 CDEBUG(D_INODE, "child lookup error %ld\n", PTR_ERR(dchild));
-                up(&dir->i_sem);
                 GOTO(out_create_dchild, rc = PTR_ERR(dchild));
+        } else if (dchild->d_inode == NULL) {
+                GOTO(out_create_dchild, rc = -ENOENT);
         }
 
+        if (req->rq_repmsg == NULL)
+                mds_getattr_pack_msg(req, dchild->d_inode, offset);
+
         rc = mds_getattr_internal(mds, dchild, req, body, offset);
 
         EXIT;
 out_create_dchild:
         l_dput(dchild);
-        up(&dir->i_sem);
         ldlm_lock_decref(&lockh, lock_mode);
 out_create_de:
         l_dput(de);
 out_pre_de:
         req->rq_status = rc;
         pop_ctxt(&saved, &mds->mds_ctxt, &uc);
-        return 0;
+        return rc;
 }
 
 static int mds_getattr(int offset, struct ptlrpc_request *req)
@@ -733,10 +756,9 @@ static int mds_getattr(int offset, struct ptlrpc_request *req)
         struct mds_obd *mds = mds_req2mds(req);
         struct obd_run_ctxt saved;
         struct dentry *de;
-        struct inode *inode;
         struct mds_body *body;
         struct obd_ucred uc;
-        int rc = 0, size[2] = {sizeof(*body)}, bufcount = 1;
+        int rc = 0;
         ENTRY;
 
         body = lustre_msg_buf(req->rq_reqmsg, offset);
@@ -750,49 +772,12 @@ static int mds_getattr(int offset, struct ptlrpc_request *req)
                 GOTO(out_pop, PTR_ERR(de));
         }
 
-        inode = de->d_inode;
-        if (S_ISREG(body->fid1.f_type)) {
-                int rc = fsfilt_get_md(req->rq_export->exp_obd, inode, NULL, 0);
-                CDEBUG(D_INODE, "got %d bytes MD data for inode %lu\n",
-                       rc, inode->i_ino);
-                if (rc < 0) {
-                        if (rc != -ENODATA)
-                                CERROR("error getting inode %lu MD: rc = %d\n",
-                                       inode->i_ino, rc);
-                        size[bufcount] = 0;
-                } else if (rc > mds->mds_max_mdsize) {
-                        size[bufcount] = 0;
-                        CERROR("MD size %d larger than maximum possible %u\n",
-                               rc, mds->mds_max_mdsize);
-                } else
-                        size[bufcount] = rc;
-                bufcount++;
-        } else if (body->valid & OBD_MD_LINKNAME) {
-                size[bufcount] = MIN(inode->i_size + 1, body->size);
-                bufcount++;
-                CDEBUG(D_INODE, "symlink size: %d, reply space: %d\n",
-                       inode->i_size + 1, body->size);
-        }
-
-        if (OBD_FAIL_CHECK(OBD_FAIL_MDS_GETATTR_PACK)) {
-                CERROR("failed MDS_GETATTR_PACK test\n");
-                req->rq_status = -ENOMEM;
-                GOTO(out, rc = -ENOMEM);
-        }
-
-        rc = lustre_pack_msg(bufcount, size, NULL, &req->rq_replen,
-                             &req->rq_repmsg);
-        if (rc) {
-                CERROR("out of memoryK\n");
-                req->rq_status = rc;
-                GOTO(out, rc);
-        }
+        rc = mds_getattr_pack_msg(req, de->d_inode, offset);
 
         req->rq_status = mds_getattr_internal(mds, de, req, body, 0);
 
-        EXIT;
-out:
         l_dput(de);
+        EXIT;
 out_pop:
         pop_ctxt(&saved, &mds->mds_ctxt, &uc);
         return rc;
@@ -871,7 +856,7 @@ static int mds_store_md(struct mds_obd *mds, struct ptlrpc_request *req,
         uc.ouc_cap = body->capability;
         push_ctxt(&saved, &mds->mds_ctxt, &uc);
         mds_start_transno(mds);
-        handle = fsfilt_start(obd, inode,FSFILT_OP_SETATTR);
+        handle = fsfilt_start(obd, inode, FSFILT_OP_SETATTR);
         if (IS_ERR(handle)) {
                 rc = PTR_ERR(handle);
                 mds_finish_transno(mds, handle, req, rc);
@@ -1002,9 +987,10 @@ static int mds_close(struct ptlrpc_request *req)
 
         mfd = mds_handle2mfd(&body->handle);
         if (!mfd) {
-                CERROR("no handle for file close "LPD64
-                       ": addr "LPX64", cookie "LPX64"\n",
-                       body->fid1.id, body->handle.addr, body->handle.cookie);
+                DEBUG_REQ(D_ERROR, req, "no handle for file close "LPD64
+                          ": addr "LPX64", cookie "LPX64"\n",
+                          body->fid1.id, body->handle.addr,
+                          body->handle.cookie);
                 RETURN(-ESTALE);
         }
 
@@ -1068,7 +1054,6 @@ static int mds_readpage(struct ptlrpc_request *req)
         /* to make this asynchronous make sure that the handling function
            doesn't send a reply when this function completes. Instead a
            callback function would send the reply */
-        /* note: in case of an error, dentry_open puts dentry */
         rc = mds_sendpage(req, file, body->size);
 
         filp_close(file, 0);
@@ -1103,43 +1088,61 @@ static int check_for_next_transno(struct mds_obd *mds)
         struct ptlrpc_request *req;
         req = list_entry(mds->mds_recovery_queue.next,
                          struct ptlrpc_request, rq_list);
+        LASSERT(req->rq_reqmsg->transno >= mds->mds_next_recovery_transno);
         return req->rq_reqmsg->transno == mds->mds_next_recovery_transno;
 }
 
 static void process_recovery_queue(struct mds_obd *mds)
 {
         struct ptlrpc_request *req;
+        ENTRY;
 
         for (;;) {
                 spin_lock(&mds->mds_processing_task_lock);
+                LASSERT(mds->mds_processing_task == current->pid);
                 req = list_entry(mds->mds_recovery_queue.next,
                                  struct ptlrpc_request, rq_list);
 
                 if (req->rq_reqmsg->transno != mds->mds_next_recovery_transno) {
                         spin_unlock(&mds->mds_processing_task_lock);
+                        CDEBUG(D_HA, "Waiting for transno "LPD64" (1st is "
+                               LPD64")\n",
+                               mds->mds_next_recovery_transno,
+                               req->rq_reqmsg->transno);
                         wait_event(mds->mds_next_transno_waitq,
                                    check_for_next_transno(mds));
                         continue;
                 }
-                list_del(&req->rq_list);
+                list_del_init(&req->rq_list);
                 spin_unlock(&mds->mds_processing_task_lock);
 
-                DEBUG_REQ(D_HA, req, "");
-                mds_handle(req);
-
-                if (list_empty(&mds->mds_recovery_queue))
+                DEBUG_REQ(D_ERROR, req, "processing: ");
+                (void)mds_handle(req);
+                mds_fsync_super(mds->mds_sb);
+                OBD_FREE(req, sizeof *req);
+                spin_lock(&mds->mds_processing_task_lock);
+                mds->mds_next_recovery_transno++;
+                if (list_empty(&mds->mds_recovery_queue)) {
+                        mds->mds_processing_task = 0;
+                        spin_unlock(&mds->mds_processing_task_lock);
                         break;
+                }
+                spin_unlock(&mds->mds_processing_task_lock);
         }
+        EXIT;
 }
 
 static int queue_recovery_request(struct ptlrpc_request *req,
                                   struct mds_obd *mds)
 {
         struct list_head *tmp;
-        int inserted = 0, transno = req->rq_reqmsg->transno;
+        int inserted = 0;
+        __u64 transno = req->rq_reqmsg->transno;
+        struct ptlrpc_request *saved_req;
 
         if (!transno) {
-                DEBUG_REQ(D_HA, req, "not queueing");
+                INIT_LIST_HEAD(&req->rq_list);
+                DEBUG_REQ(D_ERROR, req, "not queueing");
                 return 1;
         }
 
@@ -1147,14 +1150,23 @@ static int queue_recovery_request(struct ptlrpc_request *req,
 
         if (mds->mds_processing_task == current->pid) {
                 /* Processing the queue right now, don't re-add. */
+                LASSERT(list_empty(&req->rq_list));
                 spin_unlock(&mds->mds_processing_task_lock);
                 return 1;
         }
 
+        OBD_ALLOC(saved_req, sizeof *saved_req);
+        if (!saved_req)
+                LBUG();
+        memcpy(saved_req, req, sizeof *req);
+        req = saved_req;
+        INIT_LIST_HEAD(&req->rq_list);
+
         /* XXX O(n^2) */
         list_for_each(tmp, &mds->mds_recovery_queue) {
                 struct ptlrpc_request *reqiter =
                         list_entry(tmp, struct ptlrpc_request, rq_list);
+
                 if (reqiter->rq_reqmsg->transno > transno) {
                         list_add_tail(&req->rq_list, &reqiter->rq_list);
                         inserted = 1;
@@ -1162,16 +1174,17 @@ static int queue_recovery_request(struct ptlrpc_request *req,
                 }
         }
 
-        if (!inserted)
+        if (!inserted) {
                 list_add_tail(&req->rq_list, &mds->mds_recovery_queue);
+        }
 
         if (mds->mds_processing_task != 0) {
                 /* Someone else is processing this queue, we'll leave it to
                  * them.
                  */
-                spin_unlock(&mds->mds_processing_task_lock);
                 if (transno == mds->mds_next_recovery_transno)
                         wake_up(&mds->mds_next_transno_waitq);
+                spin_unlock(&mds->mds_processing_task_lock);
                 return 0;
         }
 
@@ -1191,10 +1204,10 @@ static int filter_recovery_request(struct ptlrpc_request *req,
         switch (req->rq_reqmsg->opc) {
         case MDS_CONNECT:
         case MDS_DISCONNECT:
-        case MDS_OPEN:
                *process = 1;
                RETURN(0);
 
+        case MDS_OPEN:
         case MDS_GETSTATUS: /* used in unmounting */
         case MDS_REINT:
         case LDLM_ENQUEUE:
@@ -1204,6 +1217,7 @@ static int filter_recovery_request(struct ptlrpc_request *req,
         default:
                 DEBUG_REQ(D_ERROR, req, "not permitted during recovery");
                 *process = 0;
+                /* XXX what should we set rq_status to here? */
                 RETURN(ptlrpc_error(req->rq_svc, req));
         }
 }
@@ -1211,7 +1225,9 @@ static int filter_recovery_request(struct ptlrpc_request *req,
 static int mds_queue_final_reply(struct ptlrpc_request *req, int rc)
 {
         struct mds_obd *mds = mds_req2mds(req);
+        struct ptlrpc_request *saved_req;
 
+        spin_lock(&mds->mds_processing_task_lock);
         if (rc) {
                 /* Just like ptlrpc_error, but without the sending. */
                 lustre_pack_msg(0, NULL, NULL, &req->rq_replen,
@@ -1219,22 +1235,29 @@ static int mds_queue_final_reply(struct ptlrpc_request *req, int rc)
                 req->rq_type = PTL_RPC_MSG_ERR;
         }
 
+        LASSERT(list_empty(&req->rq_list));
+        OBD_ALLOC(saved_req, sizeof *saved_req);
+        memcpy(saved_req, req, sizeof *saved_req);
+        req = saved_req;
         list_add(&req->rq_list, &mds->mds_delayed_reply_queue);
         if (--mds->mds_recoverable_clients == 0) {
                 struct list_head *tmp, *n;
-
-                CDEBUG(D_HA,
+                ldlm_reprocess_all_ns(req->rq_export->exp_obd->obd_namespace);
+                CDEBUG(D_ERROR,
                        "all clients recovered, sending delayed replies\n");
                 list_for_each_safe(tmp, n, &mds->mds_delayed_reply_queue) {
                         req = list_entry(tmp, struct ptlrpc_request, rq_list);
-                        DEBUG_REQ(D_HA, req, "delayed:");
+                        DEBUG_REQ(D_ERROR, req, "delayed:");
                         ptlrpc_reply(req->rq_svc, req);
+                        list_del(&req->rq_list);
+                        OBD_FREE(req, sizeof *req);
                 }
         } else {
-                CDEBUG(D_HA, "%d recoverable clients remain\n",
+                CERROR("%d recoverable clients remain\n",
                        mds->mds_recoverable_clients);
         }
 
+        spin_unlock(&mds->mds_processing_task_lock);
         return 1;
 }
 
@@ -1255,7 +1278,7 @@ int mds_handle(struct ptlrpc_request *req)
 
         rc = lustre_unpack_msg(req->rq_reqmsg, req->rq_reqlen);
         if (rc || OBD_FAIL_CHECK(OBD_FAIL_MDS_HANDLE_UNPACK)) {
-                CERROR("lustre_mds: Invalid request\n");
+                DEBUG_REQ(D_ERROR, req, "invalid request (%d)", rc);
                 GOTO(out, rc);
         }
 
@@ -1286,6 +1309,12 @@ int mds_handle(struct ptlrpc_request *req)
                         mds = mds_req2mds(req);
                         mds_fsync_super(mds->mds_sb);
                 }
+
+                /* Let the client know if it can replay. */
+                if (mds->mds_recoverable_clients) {
+                        lustre_msg_add_flags(req->rq_repmsg,
+                                             MSG_REPLAY_IN_PROGRESS);
+                }
                 break;
 
         case MDS_DISCONNECT:
@@ -1295,7 +1324,8 @@ int mds_handle(struct ptlrpc_request *req)
                 /* Make sure that last_rcvd is correct. */
                 if (!rc)
                         mds_fsync_super(mds->mds_sb);
-                goto out;
+                req->rq_status = rc;
+                break;
 
         case MDS_GETSTATUS:
                 DEBUG_REQ(D_INODE, req, "getstatus");
@@ -1314,6 +1344,12 @@ int mds_handle(struct ptlrpc_request *req)
                 rc = mds_getattr(0, req);
                 break;
 
+        case MDS_GETATTR_NAME:
+                DEBUG_REQ(D_INODE, req, "getattr_name");
+                OBD_FAIL_RETURN(OBD_FAIL_MDS_GETATTR_NAME_NET, 0);
+                rc = mds_getattr_name(0, req);
+                break;
+
         case MDS_STATFS:
                 DEBUG_REQ(D_INODE, req, "statfs");
                 OBD_FAIL_RETURN(OBD_FAIL_MDS_STATFS_NET, 0);
@@ -1387,14 +1423,15 @@ int mds_handle(struct ptlrpc_request *req)
 
         EXIT;
 
-        if (!rc) {
+        /* If we're DISCONNECTing, the mds_export_data is already freed */
+        if (!rc && req->rq_reqmsg->opc != MDS_DISCONNECT) {
                 struct mds_export_data *med = &req->rq_export->exp_mds_data;
 
                 req->rq_repmsg->last_xid =
                         HTON__u64(le64_to_cpu(med->med_mcd->mcd_last_xid));
                 req->rq_repmsg->last_committed =
                         HTON__u64(mds->mds_last_committed);
-                CDEBUG(D_INFO, "last_rcvd ~%Lu, last_committed %Lu, xid %d\n",
+                CDEBUG(D_INFO, "last_transno %Lu, last_committed %Lu, xid %d\n",
                        (unsigned long long)mds->mds_last_rcvd,
                        (unsigned long long)mds->mds_last_committed,
                        cpu_to_le32(req->rq_xid));
@@ -1408,14 +1445,14 @@ int mds_handle(struct ptlrpc_request *req)
                 return mds_queue_final_reply(req, rc);
         }
 
+        /* XXX bug 578 */
         /* MDS_CONNECT / EALREADY (note: not -EALREADY!) isn't an error */
         if (rc && (req->rq_reqmsg->opc != MDS_CONNECT ||
                    rc != EALREADY)) {
-                CERROR("mds: processing error (opcode %d): %d\n",
-                       req->rq_reqmsg->opc, rc);
+                DEBUG_REQ(D_ERROR, req, "processing error (%d)", rc);
                 ptlrpc_error(req->rq_svc, req);
         } else {
-                CDEBUG(D_NET, "sending reply\n");
+                DEBUG_REQ(D_NET, req, "sending reply");
                 ptlrpc_reply(req->rq_svc, req);
         }
         return 0;
@@ -1426,17 +1463,17 @@ int mds_handle(struct ptlrpc_request *req)
  * then the server last_rcvd value may be less than that of the clients.
  * This will alert us that we may need to do client recovery.
  *
- * Assumes we are already in the server filesystem context.
- *
  * Also assumes for mds_last_rcvd that we are not modifying it (no locking).
  */
 int mds_update_server_data(struct mds_obd *mds)
 {
         struct mds_server_data *msd = mds->mds_server_data;
         struct file *filp = mds->mds_rcvd_filp;
+        struct obd_run_ctxt saved;
         loff_t off = 0;
         int rc;
 
+        push_ctxt(&saved, &mds->mds_ctxt, NULL);
         msd->msd_last_rcvd = cpu_to_le64(mds->mds_last_rcvd);
         msd->msd_mount_count = cpu_to_le64(mds->mds_mount_count);
 
@@ -1447,8 +1484,8 @@ int mds_update_server_data(struct mds_obd *mds)
         if (rc != sizeof(*msd)) {
                 CERROR("error writing MDS server data: rc = %d\n", rc);
                 if (rc > 0)
-                        RETURN(-EIO);
-                RETURN(rc);
+                        rc = -EIO;
+                GOTO(out, rc);
         }
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
         rc = fsync_dev(filp->f_dentry->d_inode->i_rdev);
@@ -1458,25 +1495,9 @@ int mds_update_server_data(struct mds_obd *mds)
         if (rc)
                 CERROR("error flushing MDS server data: rc = %d\n", rc);
 
-        return 0;
-}
-
-/* Do recovery actions for the MDS */
-static int mds_recovery_complete(struct obd_device *obddev)
-{
-        struct mds_obd *mds = &obddev->u.mds;
-        struct obd_run_ctxt saved;
-        int rc;
-
-        LASSERT(mds->mds_recoverable_clients == 0);
-
-        /* This happens at the end when recovery is complete */
-        ++mds->mds_mount_count;
-        push_ctxt(&saved, &mds->mds_ctxt, NULL);
-        rc = mds_update_server_data(mds);
+out:
         pop_ctxt(&saved, &mds->mds_ctxt, NULL);
-
-        return rc;
+        RETURN(rc);
 }
 
 /* mount the file system (secretly) */
@@ -1488,16 +1509,15 @@ static int mds_setup(struct obd_device *obddev, obd_count len, void *buf)
         int rc = 0;
         ENTRY;
 
-        MOD_INC_USE_COUNT;
 #ifdef CONFIG_DEV_RDONLY
         dev_clear_rdonly(2);
 #endif
         if (!data->ioc_inlbuf1 || !data->ioc_inlbuf2)
-                GOTO(err_dec, rc = -EINVAL);
+                RETURN(rc = -EINVAL);
 
         obddev->obd_fsops = fsfilt_get_ops(data->ioc_inlbuf2);
         if (IS_ERR(obddev->obd_fsops))
-                GOTO(err_dec, rc = PTR_ERR(obddev->obd_fsops));
+                RETURN(rc = PTR_ERR(obddev->obd_fsops));
 
         mnt = do_kern_mount(data->ioc_inlbuf2, 0, data->ioc_inlbuf1, NULL);
         if (IS_ERR(mnt)) {
@@ -1531,8 +1551,10 @@ static int mds_setup(struct obd_device *obddev, obd_count len, void *buf)
 
         spin_lock_init(&mds->mds_processing_task_lock);
         mds->mds_processing_task = 0;
+        mds->mds_has_lov_desc = 0;
         INIT_LIST_HEAD(&mds->mds_recovery_queue);
         INIT_LIST_HEAD(&mds->mds_delayed_reply_queue);
+        init_waitqueue_head(&mds->mds_next_transno_waitq);
 
         RETURN(0);
 
@@ -1545,8 +1567,6 @@ err_put:
         lock_kernel();
 err_ops:
         fsfilt_put_ops(obddev->obd_fsops);
-err_dec:
-        MOD_DEC_USE_COUNT;
         RETURN(rc);
 }
 
@@ -1554,24 +1574,14 @@ static int mds_cleanup(struct obd_device *obddev)
 {
         struct super_block *sb;
         struct mds_obd *mds = &obddev->u.mds;
-        struct obd_run_ctxt saved;
         ENTRY;
 
         sb = mds->mds_sb;
         if (!mds->mds_sb)
                 RETURN(0);
 
-        push_ctxt(&saved, &mds->mds_ctxt, NULL);
         mds_update_server_data(mds);
-
-        if (mds->mds_rcvd_filp) {
-                int rc = filp_close(mds->mds_rcvd_filp, 0);
-                mds->mds_rcvd_filp = NULL;
-
-                if (rc)
-                        CERROR("last_rcvd file won't close, rc=%d\n", rc);
-        }
-        pop_ctxt(&saved, &mds->mds_ctxt, NULL);
+        mds_fs_cleanup(obddev);
 
         unlock_kernel();
         mntput(mds->mds_vfsmnt);
@@ -1583,15 +1593,14 @@ static int mds_cleanup(struct obd_device *obddev)
 #ifdef CONFIG_DEV_RDONLY
         dev_clear_rdonly(2);
 #endif
-        mds_fs_cleanup(obddev);
         fsfilt_put_ops(obddev->obd_fsops);
 
-        MOD_DEC_USE_COUNT;
         RETURN(0);
 }
 
-static int ldlm_intent_policy(struct ldlm_lock *lock, void *req_cookie,
-                              ldlm_mode_t mode, int flags, void *data)
+static int ldlm_intent_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock,
+                              void *req_cookie, ldlm_mode_t mode, int flags,
+                              void *data)
 {
         struct ptlrpc_request *req = req_cookie;
         int rc = 0;
@@ -1603,7 +1612,7 @@ static int ldlm_intent_policy(struct ldlm_lock *lock, void *req_cookie,
         if (req->rq_reqmsg->bufcount > 1) {
                 /* an intent needs to be considered */
                 struct ldlm_intent *it = lustre_msg_buf(req->rq_reqmsg, 1);
-                struct mds_obd *mds= &req->rq_export->exp_obd->u.mds;
+                struct mds_obd *mds = &req->rq_export->exp_obd->u.mds;
                 struct mds_body *mds_rep;
                 struct ldlm_reply *rep;
                 __u64 new_resid[3] = {0, 0, 0}, old_res;
@@ -1692,8 +1701,9 @@ static int ldlm_intent_policy(struct ldlm_lock *lock, void *req_cookie,
                 rep->lock_policy_res2 = req->rq_status;
                 mds_rep = lustre_msg_buf(req->rq_repmsg, 1);
 
-                /* If the client is about to open a file that doesn't have an MD
-                 * stripe record, it's going to need a write lock. */
+                /* If the client is about to open a file that doesn't have an
+                 * MD stripe record, it's going to need a write lock.
+                 */
                 if (it->opc & IT_OPEN && !(mds_rep->valid & OBD_MD_FLEASIZE)) {
                         LDLM_DEBUG(lock, "open with no EA; returning PW lock");
                         lock->l_req_mode = LCK_PW;
@@ -1711,7 +1721,7 @@ static int ldlm_intent_policy(struct ldlm_lock *lock, void *req_cookie,
                         LBUG();
                 old_res = lock->l_resource->lr_name[0];
 
-                ldlm_lock_change_resource(lock, new_resid);
+                ldlm_lock_change_resource(ns, lock, new_resid);
                 if (lock->l_resource == NULL) {
                         LBUG();
                         RETURN(-ENOMEM);
@@ -1749,15 +1759,13 @@ static int mdt_setup(struct obd_device *obddev, obd_count len, void *buf)
         int rc = 0;
         ENTRY;
 
-        MOD_INC_USE_COUNT;
-
         mds->mds_service = ptlrpc_init_svc(MDS_NEVENTS, MDS_NBUFS,
                                            MDS_BUFSIZE, MDS_MAXREQSIZE,
                                            MDS_REQUEST_PORTAL, MDC_REPLY_PORTAL,
                                            "self", mds_handle, "mds");
         if (!mds->mds_service) {
                 CERROR("failed to start service\n");
-                GOTO(err_dec, rc = -ENOMEM);
+                RETURN(rc = -ENOMEM);
         }
 
         for (i = 0; i < MDT_NUM_THREADS; i++) {
@@ -1775,8 +1783,6 @@ static int mdt_setup(struct obd_device *obddev, obd_count len, void *buf)
 err_thread:
         ptlrpc_stop_all_threads(mds->mds_service);
         ptlrpc_unregister_service(mds->mds_service);
-err_dec:
-        MOD_DEC_USE_COUNT;
         RETURN(rc);
 }
 
@@ -1789,7 +1795,6 @@ static int mdt_cleanup(struct obd_device *obddev)
         ptlrpc_stop_all_threads(mds->mds_service);
         ptlrpc_unregister_service(mds->mds_service);
 
-        MOD_DEC_USE_COUNT;
         RETURN(0);
 }
 
@@ -1798,6 +1803,7 @@ extern int mds_iocontrol(unsigned int cmd, struct lustre_handle *conn,
 
 /* use obd ops to offer management infrastructure */
 static struct obd_ops mds_obd_ops = {
+        o_owner:       THIS_MODULE,
         o_attach:      mds_attach,
         o_detach:      mds_detach,
         o_connect:     mds_connect,
@@ -1808,6 +1814,7 @@ static struct obd_ops mds_obd_ops = {
 };
 
 static struct obd_ops mdt_obd_ops = {
+        o_owner:       THIS_MODULE,
         o_setup:       mdt_setup,
         o_cleanup:     mdt_cleanup,
 };
index 7028603..37c7bc8 100644 (file)
 #include <linux/lustre_fsfilt.h>
 #include <linux/lprocfs_status.h>
 
-int rd_uuid(char* page, char **start, off_t off, int count, int *eof, 
+int rd_uuid(char *page, char **start, off_t off, int count, int *eof,
             void *data)
 {
         struct obd_device* temp = (struct obd_device*)data;
-        int len = 0;
-        len += snprintf(page, count, "%s\n", temp->obd_uuid); 
-        return len;  
+        return snprintf(page, count, "%s\n", temp->obd_uuid);
 }
-int rd_blksize(char* page, char **start, off_t off, int count, int *eof, 
+
+int rd_blksize(char *page, char **start, off_t off, int count, int *eof,
                void *data)
 {
         struct obd_device* temp = (struct obd_device*)data;
         struct mds_obd *mds = &temp->u.mds;
         struct statfs mystats;
-        int rc, len = 0;
-        
+        int rc;
+
         rc = vfs_statfs(mds->mds_sb, &mystats);
         if (rc) {
                 CERROR("mds: statfs failed: rc %d\n", rc);
                 return 0;
         }
-        len += snprintf(page, count, LPU64"\n", (__u64)(mystats.f_bsize)); 
-        return len;
-
+        return snprintf(page, count, LPU64"\n", (__u64)(mystats.f_bsize));
 }
-int rd_kbtotal(char* page, char **start, off_t off, int count, int *eof, 
+
+int rd_kbtotal(char *page, char **start, off_t off, int count, int *eof,
                void *data)
 {
         struct obd_device* temp = (struct obd_device*)data;
         struct mds_obd *mds = &temp->u.mds;
         struct statfs mystats;
-        int rc, len = 0;
+        int rc;
         __u32 blk_size;
         __u64 result;
-        
+
         rc = vfs_statfs(mds->mds_sb, &mystats);
         if (rc) {
                 CERROR("mds: statfs failed: rc %d\n", rc);
                 return 0;
         }
-        
+
         blk_size = mystats.f_bsize;
         blk_size >>= 10;
         result = mystats.f_blocks;
-        while(blk_size >>= 1){
+        while(blk_size >>= 1)
                 result <<= 1;
-        }
-        len += snprintf(page, count, LPU64"\n", result); 
-        return len;  
-        
+
+        return snprintf(page, count, LPU64"\n", result);
 }
 
-int rd_kbfree(char* page, char **start, off_t off, int count, int *eof, 
+int rd_kbfree(char *page, char **start, off_t off, int count, int *eof,
               void *data)
 {
         struct obd_device* temp = (struct obd_device*)data;
         struct mds_obd *mds = &temp->u.mds;
         struct statfs mystats;
-        int rc, len = 0;
+        int rc;
         __u32 blk_size;
         __u64 result;
-        
 
         rc = vfs_statfs(mds->mds_sb, &mystats);
         if (rc) {
@@ -96,12 +91,10 @@ int rd_kbfree(char* page, char **start, off_t off, int count, int *eof,
         blk_size = mystats.f_bsize;
         blk_size >>= 10;
         result = mystats.f_blocks;
-        while(blk_size >>= 1){
+        while (blk_size >>= 1)
                 result <<= 1;
-        }
-        len += snprintf(page, count, LPU64"\n", result);
-        return len;  
-        
+
+        return snprintf(page, count, LPU64"\n", result);
 }
 
 int rd_fstype(char *page, char **start, off_t off, int count, int *eof,
@@ -112,45 +105,41 @@ int rd_fstype(char *page, char **start, off_t off, int count, int *eof,
         return snprintf(page, count, "%s\n", obd->obd_fsops->fs_type);
 }
 
-int rd_filestotal(char* page, char **start, off_t off, int count, int *eof, 
+int rd_filestotal(char *page, char **start, off_t off, int count, int *eof,
                   void *data)
 {
         struct obd_device* temp = (struct obd_device*)data;
         struct mds_obd *mds = &temp->u.mds;
         struct statfs mystats;
-        int rc, len = 0;
-        
+        int rc;
+
         rc = vfs_statfs(mds->mds_sb, &mystats);
         if (rc) {
                 CERROR("mds: statfs failed: rc %d\n", rc);
                 return 0;
         }
-        
-        len += snprintf(page, count, LPU64"\n", (__u64)(mystats.f_files));
-        return len;  
-
-        
+        return snprintf(page, count, LPU64"\n", (__u64)(mystats.f_files));
 }
 
-int rd_filesfree(char* page, char **start, off_t off, int count, int *eof, 
-                  void *data)
+int rd_filesfree(char *page, char **start, off_t off, int count, int *eof,
+                 void *data)
 {
         struct obd_device* temp = (struct obd_device*)data;
         struct mds_obd *mds = &temp->u.mds;
         struct statfs mystats;
         int rc, len = 0;
-        
+
         rc = vfs_statfs(mds->mds_sb, &mystats);
         if (rc) {
                 CERROR("mds: statfs failed: rc %d\n", rc);
                 return 0;
         }
-        
+
         len += snprintf(page, count, LPU64"\n", (__u64)(mystats.f_ffree));
-        return len; 
+        return len;
 }
 
-int rd_filegroups(char* page, char **start, off_t off, int count, int *eof, 
+int rd_filegroups(char *page, char **start, off_t off, int count, int *eof,
                   void *data)
 {
         return 0;
@@ -166,13 +155,13 @@ struct lprocfs_vars status_var_nm_1[]={
         {"status/filegroups", rd_filegroups, 0, 0},
         {0}
 };
-int rd_numrefs(char* page, char **start, off_t off, int count, int *eof, 
+
+int rd_numrefs(char *page, char **start, off_t off, int count, int *eof,
                void *data)
 {
-        struct obd_type* class = (struct obd_type*)data;
-        int len = 0;
-        len += snprintf(page, count, "%d\n", class->typ_refcnt);
-        return len;
+        struct obd_type *class = (struct obd_type*)data;
+
+        return snprintf(page, count, "%d\n", class->typ_refcnt);
 }
 
 struct lprocfs_vars status_class_var[]={
index 83201aa..3f6c420 100644 (file)
@@ -88,6 +88,9 @@ int mds_client_add(struct mds_obd *mds, struct mds_export_data *med, int cl_off)
                                 RETURN(written);
                         RETURN(-EIO);
                 }
+                CDEBUG(D_INFO, "wrote client mcd at off %u (len %u)\n",
+                       MDS_LR_CLIENT + (cl_off * MDS_LR_SIZE),
+                       (unsigned int)sizeof(*med->med_mcd));
         }
         return 0;
 }
@@ -104,28 +107,27 @@ int mds_client_free(struct obd_export *exp)
         if (!med->med_mcd)
                 RETURN(0);
 
-        CDEBUG(D_INFO, "freeing client at offset %d with UUID '%s'\n",
-               med->med_off, med->med_mcd->mcd_uuid);
+        off = MDS_LR_CLIENT + (med->med_off * MDS_LR_SIZE);
+
+        CDEBUG(D_INFO, "freeing client at offset %u (%lld)with UUID '%s'\n",
+               med->med_off, off, med->med_mcd->mcd_uuid);
 
         if (!test_and_clear_bit(med->med_off, last_rcvd_slots)) {
-                CERROR("MDS client %d: bit already clear in bitmap!!\n",
+                CERROR("MDS client %u: bit already clear in bitmap!!\n",
                        med->med_off);
                 LBUG();
         }
 
-        off = med->med_off;
-
         memset(&zero_mcd, 0, sizeof zero_mcd);
         push_ctxt(&saved, &mds->mds_ctxt, NULL);
         written = lustre_fwrite(mds->mds_rcvd_filp, (const char *)&zero_mcd,
-                                sizeof zero_mcd, &off);
+                                sizeof(zero_mcd), &off);
         pop_ctxt(&saved, &mds->mds_ctxt, NULL);
 
-        if (written != sizeof zero_mcd) {
+        if (written != sizeof(zero_mcd)) {
                 CERROR("error zeroing out client %s off %d in %s: %d\n",
                        med->med_mcd->mcd_uuid, med->med_off, LAST_RCVD,
                        written);
-                LBUG();
         } else {
                 CDEBUG(D_INFO, "zeroed out disconnecting client %s at off %d\n",
                        med->med_mcd->mcd_uuid, med->med_off);
@@ -151,7 +153,7 @@ static int mds_read_last_rcvd(struct obd_device *obddev, struct file *f)
         struct mds_client_data *mcd = NULL;
         loff_t off = 0;
         int cl_off;
-        int max_off = f->f_dentry->d_inode->i_size / sizeof(*mcd);
+        unsigned long last_rcvd_size = f->f_dentry->d_inode->i_size;
         __u64 last_rcvd = 0;
         __u64 last_mount;
         int rc = 0;
@@ -169,12 +171,14 @@ static int mds_read_last_rcvd(struct obd_device *obddev, struct file *f)
 
         if (rc != sizeof(*msd)) {
                 CERROR("error reading MDS %s: rc = %d\n", LAST_RCVD, rc);
-                if (rc > 0) {
+                if (rc > 0)
                         rc = -EIO;
-                }
                 GOTO(err_msd, rc);
         }
 
+        CDEBUG(D_INODE, "last_rcvd has size %lu (msd + %lu clients)\n",
+               last_rcvd_size, (last_rcvd_size - sizeof *msd) / sizeof *mcd);
+
         /*
          * When we do a clean MDS shutdown, we save the last_rcvd into
          * the header.  If we find clients with higher last_rcvd values
@@ -182,17 +186,14 @@ static int mds_read_last_rcvd(struct obd_device *obddev, struct file *f)
          */
         last_rcvd = le64_to_cpu(msd->msd_last_rcvd);
         mds->mds_last_rcvd = last_rcvd;
-        CDEBUG(D_INODE, "got %Lu for server last_rcvd value\n",
-               (unsigned long long)last_rcvd);
+        CDEBUG(D_INODE, "got "LPU64" for server last_rcvd value\n", last_rcvd);
 
         last_mount = le64_to_cpu(msd->msd_mount_count);
         mds->mds_mount_count = last_mount;
-        CDEBUG(D_INODE, "got %Lu for server last_mount value\n",
-               (unsigned long long)last_mount);
+        CDEBUG(D_INODE, "got "LPU64" for server last_mount value\n",last_mount);
 
-        for (off = MDS_LR_CLIENT, cl_off = 0;
-             off < max_off;
-             off += MDS_LR_SIZE, cl_off++) {
+        /* off is adjusted by lustre_fread, so we don't adjust it in the loop */
+        for (off = MDS_LR_CLIENT, cl_off = 0; off < last_rcvd_size; cl_off++) {
                 int mount_age;
 
                 if (!mcd) {
@@ -205,7 +206,7 @@ static int mds_read_last_rcvd(struct obd_device *obddev, struct file *f)
                 if (rc != sizeof(*mcd)) {
                         CERROR("error reading MDS %s offset %d: rc = %d\n",
                                LAST_RCVD, cl_off, rc);
-                        if (rc > 0)
+                        if (rc > 0) /* XXX fatal error or just abort reading? */
                                 rc = -EIO;
                         break;
                 }
@@ -218,11 +219,11 @@ static int mds_read_last_rcvd(struct obd_device *obddev, struct file *f)
 
                 last_rcvd = le64_to_cpu(mcd->mcd_last_rcvd);
 
-                /* The exports are cleaned up by mds_disconnect, so they
-                 * need to be set up like real exports also.
+                /* These exports are cleaned up by mds_disconnect(), so they
+                 * need to be set up like real exports as mds_connect() does.
                  */
                 mount_age = last_mount - le64_to_cpu(mcd->mcd_mount_count);
-                if (last_rcvd && mount_age < MDS_MOUNT_RECOV) {
+                if (mount_age < MDS_MOUNT_RECOV) {
                         struct obd_export *exp = class_new_export(obddev);
                         struct mds_export_data *med;
 
@@ -234,13 +235,12 @@ static int mds_read_last_rcvd(struct obd_device *obddev, struct file *f)
                         med = &exp->exp_mds_data;
                         med->med_mcd = mcd;
                         mds_client_add(mds, med, cl_off);
-                        /* XXX put this in a helper if it gets more complex */
+                        /* create helper if export init gets more complex */
                         INIT_LIST_HEAD(&med->med_open_head);
                         spin_lock_init(&med->med_open_lock);
 
                         mcd = NULL;
                         mds->mds_recoverable_clients++;
-                        MOD_INC_USE_COUNT;
                 } else {
                         CDEBUG(D_INFO,
                                "discarded client %d, UUID '%s', count %Ld\n",
@@ -248,18 +248,18 @@ static int mds_read_last_rcvd(struct obd_device *obddev, struct file *f)
                                (long long)le64_to_cpu(mcd->mcd_mount_count));
                 }
 
-                if (last_rcvd > mds->mds_last_rcvd) {
-                        CDEBUG(D_OTHER,
-                               "client at offset %d has last_rcvd = %Lu\n",
-                               cl_off, (unsigned long long)last_rcvd);
+                CDEBUG(D_OTHER, "client at offset %d has last_rcvd = %Lu\n",
+                       cl_off, (unsigned long long)last_rcvd);
+
+                if (last_rcvd > mds->mds_last_rcvd)
                         mds->mds_last_rcvd = last_rcvd;
-                }
         }
 
         mds->mds_last_committed = mds->mds_last_rcvd;
         if (mds->mds_recoverable_clients) {
-                CERROR("need recovery: %d recoverable clients, last_rcvd %Lu\n",
+                CERROR("RECOVERY: %d recoverable clients, last_rcvd "LPU64"\n",
                        mds->mds_recoverable_clients, mds->mds_last_rcvd);
+                mds->mds_next_recovery_transno = mds->mds_last_committed + 1;
         }
 
         if (mcd)
@@ -312,7 +312,7 @@ static int mds_fs_prep(struct obd_device *obddev)
         if (!S_ISREG(f->f_dentry->d_inode->i_mode)) {
                 CERROR("%s is not a regular file!: mode = %o\n", LAST_RCVD,
                        f->f_dentry->d_inode->i_mode);
-                GOTO(err_pop, rc = -ENOENT);
+                GOTO(err_filp, rc = -ENOENT);
         }
 
         rc = fsfilt_journal_data(obddev, f);
@@ -355,10 +355,24 @@ int mds_fs_setup(struct obd_device *obddev, struct vfsmount *mnt)
         RETURN(mds_fs_prep(obddev));
 }
 
-void mds_fs_cleanup(struct obd_device *obddev)
+int mds_fs_cleanup(struct obd_device *obddev)
 {
         struct mds_obd *mds = &obddev->u.mds;
+        struct obd_run_ctxt saved;
+        int rc = 0;
 
         class_disconnect_all(obddev); /* this cleans up client info too */
         mds_server_free_data(mds);
+
+        push_ctxt(&saved, &mds->mds_ctxt, NULL);
+        if (mds->mds_rcvd_filp) {
+                rc = filp_close(mds->mds_rcvd_filp, 0);
+                mds->mds_rcvd_filp = NULL;
+
+                if (rc)
+                        CERROR("last_rcvd file won't close, rc=%d\n", rc);
+        }
+        pop_ctxt(&saved, &mds->mds_ctxt, NULL);
+
+        return rc;
 }
index ba9a750..b548792 100644 (file)
@@ -46,6 +46,32 @@ int mds_set_lovdesc(struct obd_device *obd, struct lov_desc *desc,
         ENTRY;
 
         tgt_count = desc->ld_tgt_count;
+        if (desc->ld_default_stripe_count > desc->ld_tgt_count) {
+                CERROR("default stripe count %u > OST count %u\n",
+                       desc->ld_default_stripe_count, desc->ld_tgt_count);
+                RETURN(-EINVAL);
+        }
+        if (desc->ld_default_stripe_size & (PAGE_SIZE - 1)) {
+                CERROR("default stripe size "LPU64" not a multiple of %lu\n",
+                       desc->ld_default_stripe_size, PAGE_SIZE);
+                RETURN(-EINVAL);
+        }
+        if (desc->ld_default_stripe_offset > desc->ld_tgt_count) {
+                CERROR("default stripe offset "LPU64" > max OST index %u\n",
+                       desc->ld_default_stripe_offset, desc->ld_tgt_count);
+                RETURN(-EINVAL);
+        }
+        if (desc->ld_pattern != 0) {
+                CERROR("stripe pattern %u unknown\n",
+                       desc->ld_pattern);
+                RETURN(-EINVAL);
+        }
+
+        memcpy(&mds->mds_lov_desc, desc, sizeof *desc);
+        mds->mds_has_lov_desc = 1;
+        /* XXX the MDS should not really know about this */
+        mds->mds_max_mdsize = lov_mds_md_size(desc->ld_tgt_count);
+
         lov_packdesc(desc);
 
         push_ctxt(&saved, &mds->mds_ctxt, NULL);
@@ -55,6 +81,7 @@ int mds_set_lovdesc(struct obd_device *obd, struct lov_desc *desc,
                 GOTO(out, rc = PTR_ERR(f));
         }
 
+#warning FIXME: if there is an existing LOVDESC, verify new tgt_count > old
         rc = lustre_fwrite(f, (char *)desc, sizeof(*desc), &f->f_pos);
         if (filp_close(f, 0))
                 CERROR("Error closing LOVDESC file\n");
@@ -69,6 +96,7 @@ int mds_set_lovdesc(struct obd_device *obd, struct lov_desc *desc,
                 GOTO(out, rc = PTR_ERR(f));
         }
 
+#warning FIXME: if there is an existing LOVTGTS, verify existing UUIDs same
         rc = 0;
         for (i = 0; i < tgt_count ; i++) {
                 rc = lustre_fwrite(f, uuidarray[i],
index 9151326..3d340f7 100644 (file)
@@ -86,7 +86,7 @@ int mds_finish_transno(struct mds_obd *mds, void *handle,
         written = lustre_fwrite(mds->mds_rcvd_filp, (char *)mcd, sizeof(*mcd),
                                 &off);
         CDEBUG(D_INODE, "wrote trans #"LPD64" for client %s at #%d: written = "
-               "%d\n", last_rcvd, mcd->mcd_uuid, med->med_off, written);
+               LPSZ"\n", last_rcvd, mcd->mcd_uuid, med->med_off, written);
 
         if (written == sizeof(*mcd))
                 GOTO(out, rc = 0);
@@ -220,7 +220,7 @@ static int mds_reint_create(struct mds_update_record *rec, int offset,
         CDEBUG(D_INODE, "parent ino %lu name %s mode %o\n",
                dir->i_ino, rec->ur_name, rec->ur_mode);
 
-        ldlm_lock_dump_handle(&lockh);
+        ldlm_lock_dump_handle(D_OTHER, &lockh);
 
         down(&dir->i_sem);
         dchild = lookup_one_len(rec->ur_name, de, rec->ur_namelen - 1);
@@ -264,6 +264,11 @@ static int mds_reint_create(struct mds_update_record *rec, int offset,
                         rec->ur_mode |= S_ISGID;
         }
 
+        if (rec->ur_fid2->id)
+                dchild->d_fsdata = (void *)(unsigned long)rec->ur_fid2->id;
+        else
+                LASSERT(!(rec->ur_opcode & REINT_REPLAYING));
+
         /* From here on, we must exit via a path that calls mds_finish_transno,
          * so that we release the mds_transno_sem (and, in the case of success,
          * update the transno correctly).  out_create_commit and
@@ -314,6 +319,11 @@ static int mds_reint_create(struct mds_update_record *rec, int offset,
                 GOTO(out_transno_dchild, rc = -EINVAL);
         }
 
+        /* In case we stored the desired inum in here, we want to clean up.
+         * We also do this in the out_transno_dchild block, for the error cases.
+         */
+        dchild->d_fsdata = NULL;
+
         if (rc) {
                 CDEBUG(D_INODE, "error during create: %d\n", rc);
                 GOTO(out_create_commit, rc);
@@ -331,13 +341,14 @@ static int mds_reint_create(struct mds_update_record *rec, int offset,
                         ATTR_MTIME | ATTR_CTIME;
 
                 if (rec->ur_fid2->id) {
-                        LASSERT(rec->ur_opcode & REINT_REPLAYING);
+                        LASSERT(rec->ur_fid2->id == inode->i_ino);
                         inode->i_generation = rec->ur_fid2->generation;
                         /* Dirtied and committed by the upcoming setattr. */
-                        CDEBUG(D_INODE, "recreated ino %lu with gen %lu\n",
+                        CDEBUG(D_INODE, "recreated ino %lu with gen %x\n",
                                inode->i_ino, inode->i_generation);
                 } else {
-                        CDEBUG(D_INODE, "created ino %lu\n", inode->i_ino);
+                        CDEBUG(D_INODE, "created ino %lu with gen %x\n",
+                               inode->i_ino, inode->i_generation);
                 }
 
                 rc = fsfilt_setattr(obd, dchild, handle, &iattr);
@@ -376,6 +387,7 @@ out_create:
         return 0;
 
 out_transno_dchild:
+        dchild->d_fsdata = NULL;
         /* Need to release the transno lock, and then put the dchild. */
         LASSERT(rc);
         mds_finish_transno(mds, handle, req, rc);
@@ -442,7 +454,8 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset,
 
         dir = de->d_inode;
         inode = dchild->d_inode;
-        CDEBUG(D_INODE, "parent ino %lu\n", dir->i_ino);
+        DEBUG_REQ(D_INODE, req, "parent ino %lu, child ino %lu\n", dir->i_ino,
+                  inode ? inode->i_ino : 0);
 
         if (!inode) {
                 if (rec->ur_opcode & REINT_REPLAYING) {
@@ -572,7 +585,7 @@ static int mds_reint_link(struct mds_update_record *rec, int offset,
                         GOTO(out_link_src_put, rc = -EIO);
                 }
         } else {
-                ldlm_lock_dump_handle(&srclockh);
+                ldlm_lock_dump_handle(D_OTHER, &srclockh);
         }
 
         de_tgt_dir = mds_fid2dentry(mds, rec->ur_fid2, NULL);
@@ -597,7 +610,7 @@ static int mds_reint_link(struct mds_update_record *rec, int offset,
                         GOTO(out_link_tgt_dir_put, rc = -EIO);
                 }
         } else {
-                ldlm_lock_dump_handle(&tgtlockh);
+                ldlm_lock_dump_handle(D_OTHER, &tgtlockh);
         }
 
         down(&de_tgt_dir->d_inode->i_sem);
@@ -709,7 +722,7 @@ static int mds_reint_rename(struct mds_update_record *rec, int offset,
                         GOTO(out_rename_srcput, rc = -EIO);
                 }
         } else {
-                ldlm_lock_dump_handle(&srclockh);
+                ldlm_lock_dump_handle(D_OTHER, &srclockh);
         }
 
         de_tgtdir = mds_fid2dentry(mds, rec->ur_fid2, NULL);
@@ -734,7 +747,7 @@ static int mds_reint_rename(struct mds_update_record *rec, int offset,
                         GOTO(out_rename_tgtput, rc = -EIO);
                 }
         } else {
-                ldlm_lock_dump_handle(&tgtlockh);
+                ldlm_lock_dump_handle(D_OTHER, &tgtlockh);
         }
 
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
index ed2b321..6f7f8fc 100644 (file)
@@ -9,16 +9,18 @@ else
 FSMOD = fsfilt_extN
 endif
 
-modulefs_DATA = lustre_build_version obdclass.o $(FSMOD).o
-EXTRA_PROGRAMS = obdclass $(FSMOD)
+modulefs_DATA = lustre_build_version obdclass.o $(FSMOD).o fsfilt_reiserfs.o
+EXTRA_PROGRAMS = obdclass $(FSMOD) fsfilt_reiserfs
 
 obdclass_SOURCES = debug.c genops.c class_obd.c sysctl.c uuid.c lprocfs_status.c
-obdclass_SOURCES += fsfilt.c
+obdclass_SOURCES += fsfilt.c statfs_pack.c
 
 include $(top_srcdir)/Rules
+
+# XXX I'm sure there's some automake mv-if-different helper for this.
 lustre_build_version:
        perl $(top_srcdir)/scripts/version_tag.pl $(top_srcdir) > tmpver
-       diff -u $(top_builddir)/include/linux/lustre_build_version.h tmpver \
-                       2> /dev/null &&\
-               $(RM) tmpver || \
+       cmp -z $(top_builddir)/include/linux/lustre_build_version.h tmpver \
+               2> /dev/null &&                                            \
+               $(RM) tmpver ||                                            \
                mv tmpver $(top_builddir)/include/linux/lustre_build_version.h
index 4769d61..61e9114 100644 (file)
@@ -52,22 +52,19 @@ struct semaphore obd_conf_sem;   /* serialize configuration commands */
 struct obd_device obd_dev[MAX_OBD_DEVICES];
 struct list_head obd_types;
 atomic_t obd_memory;
+int obd_memmax;
 
 /* The following are visible and mutable through /proc/sys/lustre/. */
 unsigned long obd_fail_loc;
 unsigned long obd_timeout = 100;
 char obd_recovery_upcall[128] = "/usr/lib/lustre/ha_assist";
 
-extern struct obd_type *class_nm_to_type(char *nm);
-
 /*  opening /dev/obd */
 static int obd_class_open(struct inode * inode, struct file * file)
 {
         ENTRY;
 
         file->private_data = NULL;
-        CDEBUG(D_IOCTL, "MOD_INC_USE for open: count = %d\n",
-               atomic_read(&(THIS_MODULE)->uc.usecount));
         MOD_INC_USE_COUNT;
         RETURN(0);
 }
@@ -80,9 +77,6 @@ static int obd_class_release(struct inode * inode, struct file * file)
         // XXX drop lsm, connections here
         if (file->private_data)
                 file->private_data = NULL;
-
-        CDEBUG(D_IOCTL, "MOD_DEC_USE for close: count = %d\n",
-               atomic_read(&(THIS_MODULE)->uc.usecount) - 1);
         MOD_DEC_USE_COUNT;
         RETURN(0);
 }
@@ -329,16 +323,16 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp,
                 if (obd->obd_flags & OBD_ATTACHED || obd->obd_type) {
                         CERROR("OBD: Device %d already typed as %s.\n",
                                obd->obd_minor, MKSTR(obd->obd_type->typ_name));
-                        GOTO(out, err=-EBUSY);
+                        GOTO(out, err = -EBUSY);
                 }
 
                 if (!data->ioc_inllen1 || !data->ioc_inlbuf1) {
                         CERROR("No type passed!\n");
-                        GOTO(out, err=-EINVAL);
+                        GOTO(out, err = -EINVAL);
                 }
                 if (data->ioc_inlbuf1[data->ioc_inllen1-1] !=0) {
                         CERROR("Type not nul terminated!\n");
-                        GOTO(out, err=-EINVAL);
+                        GOTO(out, err = -EINVAL);
                 }
 
                 CDEBUG(D_IOCTL, "attach type %s name: %s uuid: %s\n",
@@ -346,10 +340,10 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp,
                        MKSTR(data->ioc_inlbuf2), MKSTR(data->ioc_inlbuf3));
 
                 /* find the type */
-                type = class_nm_to_type(data->ioc_inlbuf1);
+                type = class_get_type(data->ioc_inlbuf1);
                 if (!type) {
                         CERROR("OBD: unknown type dev %d\n", obd->obd_minor);
-                        GOTO(out, err=-EINVAL);
+                        GOTO(out, err = -EINVAL);
                 }
 
                 minor = obd->obd_minor;
@@ -364,8 +358,8 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp,
                         int len = strlen(data->ioc_inlbuf2) + 1;
                         OBD_ALLOC(obd->obd_name, len);
                         if (!obd->obd_name) {
-                                CERROR("no memory\n");
-                                LBUG();
+                                class_put_type(obd->obd_type);
+                                GOTO(out, err = -ENOMEM);
                         }
                         memcpy(obd->obd_name, data->ioc_inlbuf2, len);
                 } else {
@@ -374,11 +368,12 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp,
                 if (data->ioc_inlbuf3) {
                         int len = strlen(data->ioc_inlbuf3);
                         if (len >= sizeof(obd->obd_uuid)) {
-                                CERROR("uuid must be < %d bytes long\n",
+                                CERROR("uuid must be < "LPSZ" bytes long\n",
                                        sizeof(obd->obd_uuid));
                                 if (obd->obd_name)
                                         OBD_FREE(obd->obd_name,
                                                  strlen(obd->obd_name) + 1);
+                                class_put_type(obd->obd_type);
                                 GOTO(out, err=-EINVAL);
                         }
                         memcpy(obd->obd_uuid, data->ioc_inlbuf3, len);
@@ -389,6 +384,7 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp,
                 if (err) {
                         if(data->ioc_inlbuf2)
                                 OBD_FREE(obd->obd_name, strlen(obd->obd_name)+1);
+                        class_put_type(obd->obd_type);
                         obd->obd_type = NULL;
                 } else {
                         obd->obd_flags |= OBD_ATTACHED;
@@ -396,10 +392,6 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp,
                         type->typ_refcnt++;
                         CDEBUG(D_IOCTL, "OBD: dev %d attached type %s\n",
                                obd->obd_minor, data->ioc_inlbuf1);
-
-                        CDEBUG(D_IOCTL, "MOD_INC_USE for attach: count = %d\n",
-                               atomic_read(&(THIS_MODULE)->uc.usecount));
-                        MOD_INC_USE_COUNT;
                 }
 
                 GOTO(out, err);
@@ -423,8 +415,8 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp,
                         }
                         forcibly_detach_exports(obd);
                 }
-                   if (OBP(obd, detach))
-                        err=OBP(obd,detach)(obd);
+                if (OBP(obd, detach))
+                        err = OBP(obd,detach)(obd);
 
                 if (obd->obd_name) {
                         OBD_FREE(obd->obd_name, strlen(obd->obd_name)+1);
@@ -433,10 +425,8 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp,
 
                 obd->obd_flags &= ~OBD_ATTACHED;
                 obd->obd_type->typ_refcnt--;
+                class_put_type(obd->obd_type);
                 obd->obd_type = NULL;
-                CDEBUG(D_IOCTL, "MOD_DEC_USE for detach: count = %d\n",
-                       atomic_read(&(THIS_MODULE)->uc.usecount) - 1);
-                MOD_DEC_USE_COUNT;
                 GOTO(out, err = 0);
         }
 
@@ -505,13 +495,6 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp,
                 GOTO(out, err);
         }
 
-        case OBD_IOC_DEC_USE_COUNT: {
-                CDEBUG(D_IOCTL, "MOD_DEC_USE for force dec: count = %d\n",
-                       atomic_read(&(THIS_MODULE)->uc.usecount) - 1);
-                MOD_DEC_USE_COUNT;
-                GOTO(out, err=0);
-        }
-
         default:
                 obd_data2conn(&conn, data);
 
@@ -620,6 +603,7 @@ EXPORT_SYMBOL(obd_kmap_put);
 EXPORT_SYMBOL(obd_dev);
 EXPORT_SYMBOL(obdo_cachep);
 EXPORT_SYMBOL(obd_memory);
+EXPORT_SYMBOL(obd_memmax);
 EXPORT_SYMBOL(obd_fail_loc);
 EXPORT_SYMBOL(obd_timeout);
 EXPORT_SYMBOL(obd_recovery_upcall);
@@ -627,6 +611,8 @@ EXPORT_SYMBOL(ptlrpc_put_connection_superhack);
 
 EXPORT_SYMBOL(class_register_type);
 EXPORT_SYMBOL(class_unregister_type);
+EXPORT_SYMBOL(class_get_type);
+EXPORT_SYMBOL(class_put_type);
 EXPORT_SYMBOL(class_name2dev);
 EXPORT_SYMBOL(class_uuid2dev);
 EXPORT_SYMBOL(class_uuid2obd);
@@ -642,7 +628,6 @@ EXPORT_SYMBOL(class_disconnect_all);
 EXPORT_SYMBOL(class_uuid_unparse);
 
 EXPORT_SYMBOL(class_signal_connection_failure);
-EXPORT_SYMBOL(class_nm_to_type);
 
 static int __init init_obdclass(void)
 {
@@ -666,9 +651,9 @@ static int __init init_obdclass(void)
                 obd->obd_minor = i;
 
         err = obd_init_caches();
-
         if (err)
                 return err;
+
         obd_sysctl_init();
 
         err = lprocfs_reg_main();
@@ -696,7 +681,8 @@ static void __exit cleanup_obdclass(void)
 
         err = lprocfs_dereg_main();
 
-        CERROR("obd memory leaked: %ld bytes\n", obd_memory);
+        CERROR("obd mem max: %d leaked: %d\n", obd_memmax,
+               atomic_read(&obd_memory));
         EXIT;
 }
 
index 97a84df..07ce0b3 100644 (file)
@@ -6,9 +6,6 @@
 #include <linux/module.h>
 #include <linux/kmod.h>
 #include <linux/slab.h>
-#include <linux/extN_fs.h>
-#include <linux/extN_jbd.h>
-#include <linux/extN_xattr.h>
 #include <linux/kp30.h>
 #include <linux/lustre_fsfilt.h>
 
index 9b5a1f9..4302392 100644 (file)
@@ -36,6 +36,7 @@
 #include <linux/kp30.h>
 #include <linux/lustre_fsfilt.h>
 #include <linux/obd.h>
+#include <linux/obd_class.h>
 #include <linux/module.h>
 
 static kmem_cache_t *fcb_cache;
@@ -216,12 +217,18 @@ static void *fsfilt_extN_brw_start(int objcount, struct fsfilt_objinfo *fso,
         RETURN(handle);
 }
 
-static int fsfilt_extN_commit(struct inode *inode, void *handle)
+static int fsfilt_extN_commit(struct inode *inode, void *h /*, force_sync */)
 {
         int rc;
+        handle_t *handle = h;
+
+#if 0
+        if (force_sync)
+                handle->h_sync = 1; /* recovery likes this */
+#endif
 
         lock_kernel();
-        rc = journal_stop((handle_t *)handle);
+        rc = journal_stop(handle);
         unlock_kernel();
 
         return rc;
@@ -234,6 +241,31 @@ static int fsfilt_extN_setattr(struct dentry *dentry, void *handle,
         int rc;
 
         lock_kernel();
+
+        /* A _really_ horrible hack to avoid removing the data stored
+         * in the block pointers; this is really the "small" stripe MD data.
+         * We can avoid further hackery by virtue of the MDS file size being
+         * zero all the time (which doesn't invoke block truncate at unlink
+         * time), so we assert we never change the MDS file size from zero.
+         */
+        if (iattr->ia_valid & ATTR_SIZE) {
+                CERROR("hmm, setting %*s file size to %lld\n",
+                       dentry->d_name.len, dentry->d_name.name, iattr->ia_size);
+                LASSERT(iattr->ia_size == 0);
+#if 0
+                /* ATTR_SIZE would invoke truncate: clear it */
+                iattr->ia_valid &= ~ATTR_SIZE;
+                inode->i_size = iattr->ia_size;
+
+                /* make sure _something_ gets set - so new inode
+                 * goes to disk (probably won't work over XFS
+                 */
+                if (!iattr->ia_valid & ATTR_MODE) {
+                        iattr->ia_valid |= ATTR_MODE;
+                        iattr->ia_mode = inode->i_mode;
+                }
+#endif
+        }
         if (inode->i_op->setattr)
                 rc = inode->i_op->setattr(dentry, iattr);
         else
@@ -249,29 +281,58 @@ static int fsfilt_extN_set_md(struct inode *inode, void *handle,
 {
         int rc;
 
-        down(&inode->i_sem);
-        lock_kernel();
-        rc = extN_xattr_set(handle, inode, EXTN_XATTR_INDEX_LUSTRE,
-                            XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size, 0);
-        unlock_kernel();
-        up(&inode->i_sem);
+        /* Nasty hack city - store stripe MD data in the block pointers if
+         * it will fit, because putting it in an EA currently kills the MDS
+         * performance.  We'll fix this with "fast EAs" in the future.
+         */
+        if (lmm_size <= sizeof(EXTN_I(inode)->i_data) -
+                        sizeof(EXTN_I(inode)->i_data[0])) {
+                /* XXX old_size is debugging only */
+                int old_size = EXTN_I(inode)->i_data[0];
+                if (old_size != 0) {
+                        LASSERT(old_size < sizeof(EXTN_I(inode)->i_data));
+                        CERROR("setting EA on %lu again... interesting\n",
+                               inode->i_ino);
+                }
+
+                EXTN_I(inode)->i_data[0] = cpu_to_le32(lmm_size);
+                memcpy(&EXTN_I(inode)->i_data[1], lmm, lmm_size);
+                mark_inode_dirty(inode);
+                return 0;
+        } else {
+                down(&inode->i_sem);
+                lock_kernel();
+                rc = extN_xattr_set(handle, inode, EXTN_XATTR_INDEX_LUSTRE,
+                                    XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size, 0);
+                unlock_kernel();
+                up(&inode->i_sem);
+        }
 
-        if (rc) {
+        if (rc)
                 CERROR("error adding MD data to inode %lu: rc = %d\n",
                        inode->i_ino, rc);
-                if (rc != -ENOSPC) LBUG();
-        }
         return rc;
 }
 
-static int fsfilt_extN_get_md(struct inode *inode, void *lmm, int size)
+static int fsfilt_extN_get_md(struct inode *inode, void *lmm, int lmm_size)
 {
         int rc;
 
+        if (EXTN_I(inode)->i_data[0]) {
+                int size = le32_to_cpu(EXTN_I(inode)->i_data[0]);
+                LASSERT(size < sizeof(EXTN_I(inode)->i_data));
+                if (lmm) {
+                        if (size > lmm_size)
+                                return -ERANGE;
+                        memcpy(lmm, &EXTN_I(inode)->i_data[1], size);
+                }
+                return size;
+        }
+
         down(&inode->i_sem);
         lock_kernel();
         rc = extN_xattr_get(inode, EXTN_XATTR_INDEX_LUSTRE,
-                            XATTR_LUSTRE_MDS_OBJID, lmm, size);
+                            XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size);
         unlock_kernel();
         up(&inode->i_sem);
 
@@ -282,7 +343,7 @@ static int fsfilt_extN_get_md(struct inode *inode, void *lmm, int size)
         if (rc < 0) {
                 CDEBUG(D_INFO, "error getting EA %s from inode %lu: "
                        "rc = %d\n", XATTR_LUSTRE_MDS_OBJID, inode->i_ino, rc);
-                memset(lmm, 0, size);
+                memset(lmm, 0, lmm_size);
                 return (rc == -ENODATA) ? 0 : rc;
         }
 
diff --git a/lustre/obdclass/fsfilt_reiserfs.c b/lustre/obdclass/fsfilt_reiserfs.c
new file mode 100644 (file)
index 0000000..1ec5916
--- /dev/null
@@ -0,0 +1,193 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  lustre/obdclass/fsfilt_reiserfs.c
+ *  Lustre filesystem abstraction routines
+ *
+ *  Copyright (C) 2002 Cluster File Systems, Inc.
+ *   Author: Andreas Dilger <adilger@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * NOTE - According to Hans Reiser, this could actually be implemented more
+ *        efficiently than creating a directory and putting ASCII objids in it.
+ *        Instead, we should return the reiserfs object ID as the lustre objid
+ *        (although I'm not sure what impact that would have on backup/restore).
+ */
+
+#define DEBUG_SUBSYSTEM S_FILTER
+
+#include <linux/fs.h>
+#include <linux/jbd.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/kp30.h>
+#include <linux/lustre_fsfilt.h>
+#include <linux/obd.h>
+#include <linux/obd_class.h>
+#include <linux/module.h>
+
+/* fs_start method for reiserfs.  No real handle is opened: a fixed dummy
+ * cookie is returned, the same value that fsfilt_reiserfs_commit() checks
+ * for.  Neither @inode nor @op is examined.
+ */
+static void *fsfilt_reiserfs_start(struct inode *inode, int op)
+{
+        void *fake_handle = (void *)0xf00f00be;
+
+        return fake_handle;
+}
+
+/* fs_brw_start method for reiserfs.  Like fsfilt_reiserfs_start(), it looks
+ * at none of its arguments and hands back the fixed dummy cookie that
+ * fsfilt_reiserfs_commit() checks for.
+ */
+static void *fsfilt_reiserfs_brw_start(int objcount, struct fsfilt_objinfo *fso,
+                                       int niocount, struct niobuf_remote *nb)
+{
+        void *fake_handle = (void *)0xf00f00be;
+
+        return fake_handle;
+}
+
+/* fs_commit method for reiserfs.  Nothing is committed here; @handle is only
+ * compared against the dummy cookie returned by fsfilt_reiserfs_start() and
+ * fsfilt_reiserfs_brw_start().
+ *
+ * Returns 0 when @handle is that cookie, -EINVAL (after logging an error)
+ * otherwise.  @inode is not used.
+ */
+static int fsfilt_reiserfs_commit(struct inode *inode, void *handle)
+{
+        if (handle != (void *)0xf00f00be) {
+                /* newline-terminate so the message does not run into the
+                 * next log line */
+                CERROR("bad handle %p\n", handle);
+                return -EINVAL;
+        }
+
+        return 0;
+}
+
+/* fs_setattr method for reiserfs: apply the attribute changes in @iattr to
+ * the inode behind @dentry while holding the big kernel lock.
+ *
+ * The inode's own ->setattr method is used when it has one, inode_setattr()
+ * otherwise, and the result of that call is returned.  A size change is
+ * logged and asserted to be a change to zero.  @handle is not used.
+ */
+static int fsfilt_reiserfs_setattr(struct dentry *dentry, void *handle,
+                                   struct iattr *iattr)
+{
+        struct inode *inode = dentry->d_inode;
+        int rc;
+
+        lock_kernel();
+
+        /* A _really_ horrible hack to avoid removing the data stored
+         * in the block pointers; this is really the "small" stripe MD data.
+         * We can avoid further hackery by virtue of the MDS file size being
+         * zero all the time (which doesn't invoke block truncate at unlink
+         * time), so we assert we never change the MDS file size from zero.
+         *
+         * NOTE(review): this text is shared verbatim with the extN setattr,
+         * while fsfilt_reiserfs_set_md() in this file stores nothing yet -
+         * confirm the rationale still applies to reiserfs.
+         */
+        if (iattr->ia_valid & ATTR_SIZE) {
+                /* "%.*s" bounds the name by its length; "%*s" would only
+                 * set a minimum field width.
+                 */
+                CERROR("hmm, setting %.*s file size to %lld\n",
+                       (int)dentry->d_name.len, dentry->d_name.name,
+                       (long long)iattr->ia_size);
+                LASSERT(iattr->ia_size == 0);
+#if 0
+                /* ATTR_SIZE would invoke truncate: clear it */
+                iattr->ia_valid &= ~ATTR_SIZE;
+                inode->i_size = iattr->ia_size;
+
+                /* make sure _something_ gets set - so new inode
+                 * goes to disk (probably won't work over XFS)
+                 */
+                if (!(iattr->ia_valid & ATTR_MODE)) {
+                        iattr->ia_valid |= ATTR_MODE;
+                        iattr->ia_mode = inode->i_mode;
+                }
+#endif
+        }
+        if (inode->i_op->setattr)
+                rc = inode->i_op->setattr(dentry, iattr);
+        else
+                rc = inode_setattr(inode, iattr);
+
+        unlock_kernel();
+
+        return rc;
+}
+
+/* fs_set_md method for reiserfs.  Not implemented: an error is logged and
+ * -ENOSYS is returned; none of the arguments are used.
+ */
+static int fsfilt_reiserfs_set_md(struct inode *inode, void *handle,
+                                  void *lmm, int lmm_size)
+{
+        int rc = -ENOSYS;
+
+        /* XXX write stripe data into MDS file itself */
+        CERROR("not implemented yet\n");
+
+        return rc;
+}
+
+/* fs_get_md method for reiserfs.  With a NULL @lmm the inode's i_size is
+ * returned.  Any request with a buffer is unimplemented: an error is logged
+ * and -ENOSYS returned.  @lmm_size is not used.
+ */
+static int fsfilt_reiserfs_get_md(struct inode *inode, void *lmm, int lmm_size)
+{
+        if (lmm != NULL) {
+                CERROR("not implemented yet\n");
+                return -ENOSYS;
+        }
+
+        return inode->i_size;
+}
+
+/* fs_readpage method for reiserfs: forward the request unchanged to the
+ * file's own ->read method and pass its return value straight back.
+ */
+static ssize_t fsfilt_reiserfs_readpage(struct file *file, char *buf,
+                                        size_t count, loff_t *offset)
+{
+        ssize_t nread;
+
+        nread = file->f_op->read(file, buf, count, offset);
+
+        return nread;
+}
+
+/* fs_set_last_rcvd method for reiserfs.  @cb_func is called immediately with
+ * @obd, @last_rcvd and 0, and 0 is returned.  A warning that the callback is
+ * being faked is logged, rate-limited to once per 300 * HZ jiffies.
+ * @handle is not used.
+ */
+static int fsfilt_reiserfs_set_last_rcvd(struct obd_device *obd,
+                                         __u64 last_rcvd, void *handle,
+                                         fsfilt_cb_t cb_func)
+{
+        /* jiffies value after which the warning may be printed again */
+        static long next_warning = 0;
+
+        if (time_after(jiffies, next_warning)) {
+                CERROR("no journal callback kernel patch, faking it...\n");
+                next_warning = jiffies + 300 * HZ;
+        }
+
+        cb_func(obd, last_rcvd, 0);
+
+        return 0;
+}
+
+/* fs_journal_data method for reiserfs.  Not implemented: an error is logged,
+ * but success (0) is still reported.  @filp is not used.
+ */
+static int fsfilt_reiserfs_journal_data(struct file *filp)
+{
+        int rc = 0;
+
+        CERROR("not implemented yet\n");
+
+        return rc;
+}
+
+/* fs_statfs method for reiserfs: query @sb with vfs_statfs() and convert the
+ * result into @osfs with statfs_pack().
+ *
+ * Returns the vfs_statfs() result.  @osfs is written only when that call
+ * succeeded.
+ */
+static int fsfilt_reiserfs_statfs(struct super_block *sb,
+                                  struct obd_statfs *osfs)
+{
+        struct statfs sfs;
+        int rc = vfs_statfs(sb, &sfs);
+
+        /* on failure sfs may never have been filled in; don't pack
+         * uninitialized stack contents into the caller's buffer
+         */
+        if (rc == 0)
+                statfs_pack(osfs, &sfs);
+
+        return rc;
+}
+
+/* Method table for filesystem type "reiserfs"; registered and unregistered
+ * with the fsfilt layer by the module init/exit functions in this file.
+ */
+static struct fsfilt_operations fsfilt_reiserfs_ops = {
+        .fs_type          = "reiserfs",
+        .fs_owner         = THIS_MODULE,
+        .fs_start         = fsfilt_reiserfs_start,
+        .fs_brw_start     = fsfilt_reiserfs_brw_start,
+        .fs_commit        = fsfilt_reiserfs_commit,
+        .fs_setattr       = fsfilt_reiserfs_setattr,
+        .fs_set_md        = fsfilt_reiserfs_set_md,
+        .fs_get_md        = fsfilt_reiserfs_get_md,
+        .fs_readpage      = fsfilt_reiserfs_readpage,
+        .fs_journal_data  = fsfilt_reiserfs_journal_data,
+        .fs_set_last_rcvd = fsfilt_reiserfs_set_last_rcvd,
+        .fs_statfs        = fsfilt_reiserfs_statfs,
+};
+
+/* Module load hook: register the reiserfs method table with the fsfilt layer
+ * and return the result of that registration.
+ */
+static int __init fsfilt_reiserfs_init(void)
+{
+        int rc = fsfilt_register_ops(&fsfilt_reiserfs_ops);
+
+        return rc;
+}
+
+/* Module unload hook: remove the reiserfs method table that
+ * fsfilt_reiserfs_init() registered with the fsfilt layer.
+ */
+static void __exit fsfilt_reiserfs_exit(void)
+{
+        fsfilt_unregister_ops(&fsfilt_reiserfs_ops);
+}
+
+/* Module metadata, followed by the load/unload entry points defined above. */
+MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_DESCRIPTION("Lustre reiserfs Filesystem Helper v0.1");
+MODULE_LICENSE("GPL");
+
+module_init(fsfilt_reiserfs_init);
+module_exit(fsfilt_reiserfs_exit);
index 8a0ed36..994949e 100644 (file)
@@ -41,53 +41,59 @@ int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c);
  * support functions: we could use inter-module communication, but this
  * is more portable to other OS's
  */
-static struct obd_type *class_search_type(char *nm)
+static struct obd_type *class_search_type(char *name)
 {
         struct list_head *tmp;
         struct obd_type *type;
-        CDEBUG(D_INFO, "SEARCH %s\n", nm);
+        CDEBUG(D_INFO, "SEARCH %s\n", name);
 
         tmp = &obd_types;
         list_for_each(tmp, &obd_types) {
                 type = list_entry(tmp, struct obd_type, typ_chain);
                 CDEBUG(D_INFO, "TYP %s\n", type->typ_name);
-                if (strlen(type->typ_name) == strlen(nm) &&
-                    strcmp(type->typ_name, nm) == 0 ) {
+                if (strlen(type->typ_name) == strlen(name) &&
+                    strcmp(type->typ_name, name) == 0) {
                         return type;
                 }
         }
         return NULL;
 }
 
-struct obd_type *class_nm_to_type(char *nm)
+struct obd_type *class_get_type(char *name)
 {
-        struct obd_type *type = class_search_type(nm);
+        struct obd_type *type = class_search_type(name);
 
 #ifdef CONFIG_KMOD
-        if ( !type ) {
-                if ( !request_module(nm) ) {
-                        CDEBUG(D_INFO, "Loaded module '%s'\n", nm);
-                        type = class_search_type(nm);
-                } else {
-                        CDEBUG(D_INFO, "Can't load module '%s'\n", nm);
-                }
+        if (!type) {
+                if (!request_module(name)) {
+                        CDEBUG(D_INFO, "Loaded module '%s'\n", name);
+                        type = class_search_type(name);
+                } else
+                        CDEBUG(D_INFO, "Can't load module '%s'\n", name);
         }
 #endif
+        if (type)
+                __MOD_INC_USE_COUNT(type->typ_ops->o_owner);
         return type;
 }
 
+/* Drop the module use count on the owner of @type's ops, undoing the
+ * increment done by class_get_type().  @type must not be NULL (asserted).
+ */
+void class_put_type(struct obd_type *type)
+{
+        LASSERT(type);
+        __MOD_DEC_USE_COUNT(type->typ_ops->o_owner);
+}
+
 int class_register_type(struct obd_ops *ops, struct lprocfs_vars *vars,
-                        char *nm)
+                        char *name)
 {
         struct obd_type *type;
         int rc;
-
         ENTRY;
 
-        LASSERT (strnlen (nm, 1024) < 1024);    /* sanity check */
-        
-        if (class_search_type(nm)) {
-                CDEBUG(D_IOCTL, "Type %s already registered\n", nm);
+        LASSERT(strnlen(name, 1024) < 1024);    /* sanity check */
+
+        if (class_search_type(name)) {
+                CDEBUG(D_IOCTL, "Type %s already registered\n", name);
                 RETURN(-EEXIST);
         }
 
@@ -97,38 +103,33 @@ int class_register_type(struct obd_ops *ops, struct lprocfs_vars *vars,
                 RETURN(rc);
 
         OBD_ALLOC(type->typ_ops, sizeof(*type->typ_ops));
-        OBD_ALLOC(type->typ_name, strlen(nm) + 1);
-        if (type->typ_ops == NULL ||
-            type->typ_name == NULL)
+        OBD_ALLOC(type->typ_name, strlen(name) + 1);
+        if (type->typ_ops == NULL || type->typ_name == NULL)
                 GOTO (failed, rc);
-        
+
         *(type->typ_ops) = *ops;
-        strcpy(type->typ_name, nm);
+        strcpy(type->typ_name, name);
         list_add(&type->typ_chain, &obd_types);
 
         rc = lprocfs_reg_class(type, vars, type);
         if (rc != 0) {
-                list_del (&type->typ_chain);
-                GOTO (failed, rc);
+                list_del(&type->typ_chain);
+                GOTO(failed, rc);
         }
-        
-        CDEBUG(D_INFO, "MOD_INC_USE for register_type: count = %d\n",
-               atomic_read(&(THIS_MODULE)->uc.usecount));
-        MOD_INC_USE_COUNT;
+
         RETURN (0);
 
  failed:
         if (type->typ_ops != NULL)
-                OBD_FREE (type->typ_name, strlen (nm) + 1);
+                OBD_FREE(type->typ_name, strlen(name) + 1);
         if (type->typ_ops != NULL)
                 OBD_FREE (type->typ_ops, sizeof (*type->typ_ops));
         RETURN(rc);
 }
 
-int class_unregister_type(char *nm)
+int class_unregister_type(char *name)
 {
-        struct obd_type *type = class_nm_to_type(nm);
-
+        struct obd_type *type = class_search_type(name);
         ENTRY;
 
         if (!type) {
@@ -137,7 +138,7 @@ int class_unregister_type(char *nm)
         }
 
         if (type->typ_refcnt) {
-                CERROR("type %s has refcount (%d)\n", nm, type->typ_refcnt);
+                CERROR("type %s has refcount (%d)\n", name, type->typ_refcnt);
                 /* This is a bad situation, let's make the best of it */
                 /* Remove ops, but leave the name for debugging */
                 OBD_FREE(type->typ_ops, sizeof(*type->typ_ops));
@@ -147,13 +148,10 @@ int class_unregister_type(char *nm)
                 lprocfs_dereg_class(type);
 
         list_del(&type->typ_chain);
-        OBD_FREE(type->typ_name, strlen(nm) + 1);
+        OBD_FREE(type->typ_name, strlen(name) + 1);
         if (type->typ_ops != NULL)
                 OBD_FREE(type->typ_ops, sizeof(*type->typ_ops));
         OBD_FREE(type, sizeof(*type));
-        CDEBUG(D_INFO, "MOD_DEC_USE for register_type: count = %d\n",
-               atomic_read(&(THIS_MODULE)->uc.usecount) - 1);
-        MOD_DEC_USE_COUNT;
         RETURN(0);
 } /* class_unregister_type */
 
@@ -165,7 +163,7 @@ int class_name2dev(char *name)
         if (!name)
                 return -1;
 
-        for (i=0; i < MAX_OBD_DEVICES; i++) {
+        for (i = 0; i < MAX_OBD_DEVICES; i++) {
                 struct obd_device *obd = &obd_dev[i];
                 if (obd->obd_name && strcmp(name, obd->obd_name) == 0) {
                         res = i;
@@ -181,7 +179,7 @@ int class_uuid2dev(char *uuid)
         int res = -1;
         int i;
 
-        for (i=0; i < MAX_OBD_DEVICES; i++) {
+        for (i = 0; i < MAX_OBD_DEVICES; i++) {
                 struct obd_device *obd = &obd_dev[i];
                 if (strncmp(uuid, obd->obd_uuid, sizeof(obd->obd_uuid)) == 0) {
                         res = i;
@@ -197,7 +195,7 @@ struct obd_device *class_uuid2obd(char *uuid)
 {
         int i;
 
-        for (i=0; i < MAX_OBD_DEVICES; i++) {
+        for (i = 0; i < MAX_OBD_DEVICES; i++) {
                 struct obd_device *obd = &obd_dev[i];
                 if (strncmp(uuid, obd->obd_uuid, sizeof(obd->obd_uuid)) == 0)
                         return obd;
@@ -428,7 +426,9 @@ void class_disconnect_all(struct obd_device *obddev)
                         spin_unlock(&obddev->obd_dev_lock);
                         CERROR("force disconnecting %s:%s export %p\n",
                                export->exp_obd->obd_type->typ_name,
-                               export->exp_connection->c_remote_uuid, export);
+                               export->exp_connection ?
+                               (char *)export->exp_connection->c_remote_uuid :
+                               "<unconnected>", export);
                         rc = obd_disconnect(&conn);
                         if (rc < 0) {
                                 /* AED: not so sure about this...  We can't
index 62a806e..f096772 100644 (file)
@@ -91,7 +91,8 @@ void lprocfs_remove_all(struct proc_dir_entry* root)
                 rm_entry = temp;
                 temp = temp->parent;
                 remove_proc_entry(rm_entry->name, rm_entry->parent);
-                if (temp == parent) break;
+                if (temp == parent)
+                        break;
         }
 }
 
@@ -111,7 +112,7 @@ struct proc_dir_entry* lprocfs_new_dir(struct proc_dir_entry* root,
         new_root = root;
         mover_str = temp_string;
         while ((my_str = strsep(&mover_str, tok))) {
-                if(!*my_str)
+                if (!*my_str)
                         continue;
                 CDEBUG(D_OTHER, "SEARCH= %s\t, ROOT=%s\n", my_str,
                        new_root->name);
@@ -120,7 +121,7 @@ struct proc_dir_entry* lprocfs_new_dir(struct proc_dir_entry* root,
                         CDEBUG(D_OTHER, "Adding: %s\n", my_str);
                         temp_entry = lprocfs_mkdir(my_str, new_root);
                         if (temp_entry == NULL) {
-                                CDEBUG(D_OTHER, 
+                                CDEBUG(D_OTHER,
                                        "! Did not create new dir %s !!\n",
                                        my_str);
                                 return temp_entry;
@@ -131,8 +132,7 @@ struct proc_dir_entry* lprocfs_new_dir(struct proc_dir_entry* root,
         return new_root;
 }
 
-int lprocfs_new_vars(struct proc_dir_entry* root, 
-                     struct lprocfs_vars* list,
+int lprocfs_new_vars(struct proc_dir_entry* root, struct lprocfs_vars* list,
                      const char* tok, void* data)
 {
         struct proc_dir_entry *temp_root;
@@ -188,9 +188,9 @@ int lprocfs_reg_obd(struct obd_device *device, struct lprocfs_vars *list,
 {
         struct proc_dir_entry* this_dev_root;
         int retval;
-        
-        if(lprocfs_srch(device->obd_type->typ_procroot, device->obd_name)){
-                CDEBUG(D_OTHER, "Device with name [%s] exists!", 
+
+        if (lprocfs_srch(device->obd_type->typ_procroot, device->obd_name)) {
+                CDEBUG(D_OTHER, "Device with name [%s] exists!",
                                 device->obd_name);
                 return 0;
         }
@@ -227,7 +227,7 @@ int lprocfs_dereg_obd(struct obd_device* device)
 
 struct proc_dir_entry* lprocfs_reg_mnt(char* mnt_name)
 {
-        if(lprocfs_srch(proc_lustre_fs_root, mnt_name)){
+        if (lprocfs_srch(proc_lustre_fs_root, mnt_name)) {
                 CDEBUG(D_OTHER, "Mount with same name exists!");
                 return 0;
         }
@@ -236,7 +236,7 @@ struct proc_dir_entry* lprocfs_reg_mnt(char* mnt_name)
 
 int lprocfs_dereg_mnt(struct proc_dir_entry* root)
 {
-        if(root == NULL){
+        if (root == NULL) {
                 CDEBUG(D_OTHER, "Non-existent root!");
                 return 0;
         }
@@ -247,7 +247,6 @@ int lprocfs_dereg_mnt(struct proc_dir_entry* root)
 int lprocfs_reg_class(struct obd_type* type, struct lprocfs_vars* list,
                       void* data)
 {
-        
         struct proc_dir_entry* root;
         int retval;
         root = lprocfs_mkdir(type->typ_name, proc_lustre_dev_root);
@@ -259,9 +258,8 @@ int lprocfs_reg_class(struct obd_type* type, struct lprocfs_vars* list,
 
 int lprocfs_dereg_class(struct obd_type* class)
 {
-        if(class == NULL){
-                CDEBUG(D_OTHER, "Non-existent class",
-                       class->typ_name);
+        if (class == NULL) {
+                CDEBUG(D_OTHER, "Non-existent class");
                 return 0;
         }
         lprocfs_remove_all(class->typ_procroot);
@@ -270,6 +268,7 @@ int lprocfs_dereg_class(struct obd_type* class)
         return 0;
 
 }
+
 int lprocfs_reg_main()
 {
         proc_lustre_root = lprocfs_mkdir("lustre", &proc_root);
similarity index 88%
rename from lustre/lib/ll_pack.c
rename to lustre/obdclass/statfs_pack.c
index 184c2c1..876d41c 100644 (file)
@@ -22,8 +22,9 @@
  *
  */
 
-#define DEBUG_SUBSYSTEM S_LLITE
+#define DEBUG_SUBSYSTEM S_CLASS
 
+#define EXPORT_SYMTAB
 #include <linux/lustre_net.h>
 #include <linux/obd_support.h>
 
@@ -39,7 +40,10 @@ void obd_statfs_pack(struct obd_statfs *tgt, struct obd_statfs *src)
         tgt->os_namelen = HTON__u32(src->os_namelen);
 }
 
-#define obd_statfs_unpack(tgt, src) obd_statfs_pack(tgt, src)
+void obd_statfs_unpack(struct obd_statfs *tgt, struct obd_statfs *src)
+{
+        obd_statfs_pack(tgt, src);
+}
 
 void statfs_pack(struct obd_statfs *osfs, struct statfs *sfs)
 {
@@ -65,3 +69,7 @@ void statfs_unpack(struct statfs *sfs, struct obd_statfs *osfs)
         sfs->f_namelen = osfs->os_namelen;
 }
 
+EXPORT_SYMBOL(obd_statfs_pack);
+EXPORT_SYMBOL(obd_statfs_unpack);
+EXPORT_SYMBOL(statfs_pack);
+EXPORT_SYMBOL(statfs_unpack);
index 76fddd8..8339327 100644 (file)
@@ -109,26 +109,7 @@ static int echo_connect(struct lustre_handle *conn, struct obd_device *obd,
                         obd_uuid_t cluuid, struct recovd_obd *recovd,
                         ptlrpc_recovery_cb_t recover)
 {
-        int rc;
-
-        MOD_INC_USE_COUNT;
-        rc = class_connect(conn, obd, cluuid);
-
-        if (rc)
-                MOD_DEC_USE_COUNT;
-
-        return rc;
-}
-
-static int echo_disconnect(struct lustre_handle *conn)
-{
-        int rc;
-
-        rc = class_disconnect(conn);
-        if (!rc)
-                MOD_DEC_USE_COUNT;
-
-        return rc;
+        return class_connect(conn, obd, cluuid);
 }
 
 static __u64 echo_next_id(struct obd_device *obddev)
@@ -148,7 +129,7 @@ int echo_create(struct lustre_handle *conn, struct obdo *oa,
         struct obd_device *obd = class_conn2obd(conn);
 
         if (!obd) {
-                CERROR("invalid client %Lx\n", conn->addr);
+                CERROR("invalid client "LPX64"\n", conn->addr);
                 return -EINVAL;
         }
 
@@ -453,20 +434,21 @@ int echo_detach(struct obd_device *dev)
 }
 
 static struct obd_ops echo_obd_ops = {
-        o_attach:       echo_attach,
-        o_detach:       echo_detach,
-        o_connect:      echo_connect,
-        o_disconnect:   echo_disconnect,
-        o_create:       echo_create,
-        o_destroy:      echo_destroy,
-        o_open:         echo_open,
-        o_close:        echo_close,
-        o_getattr:      echo_getattr,
-        o_setattr:      echo_setattr,
-        o_preprw:       echo_preprw,
-        o_commitrw:     echo_commitrw,
-        o_setup:        echo_setup,
-        o_cleanup:      echo_cleanup
+        o_owner:       THIS_MODULE,
+        o_attach:      echo_attach,
+        o_detach:      echo_detach,
+        o_connect:     echo_connect,
+        o_disconnect:  class_disconnect,
+        o_create:      echo_create,
+        o_destroy:     echo_destroy,
+        o_open:        echo_open,
+        o_close:       echo_close,
+        o_getattr:     echo_getattr,
+        o_setattr:     echo_setattr,
+        o_preprw:      echo_preprw,
+        o_commitrw:    echo_commitrw,
+        o_setup:       echo_setup,
+        o_cleanup:     echo_cleanup
 };
 
 extern int echo_client_init(void);
index 3d2f222..e9c0e90 100644 (file)
@@ -173,7 +173,7 @@ static int echo_iocontrol(unsigned int cmd, struct lustre_handle *obdconn, int l
                                 void *addr = kmap(pgp->pg);
 
                                 rc = page_debug_check("test_brw", addr,
-                                                       PAGE_SIZE, pgp->off, id);
+                                                       pgp->count, pgp->off, id);
                                 kunmap(pgp->pg);
                         }
                         __free_pages(pgp->pg, 0);
@@ -184,7 +184,7 @@ static int echo_iocontrol(unsigned int cmd, struct lustre_handle *obdconn, int l
                 GOTO(out, rc);
         }
         default:
-                CERROR ("echo_ioctl(): unrecognised ioctl %#lx\n", cmd);
+                CERROR ("echo_ioctl(): unrecognised ioctl %#x\n", cmd);
                 GOTO (out, rc = -ENOTTY);
         }
 
@@ -209,23 +209,17 @@ static int echo_setup(struct obd_device *obddev, obd_count len, void *buf)
                 RETURN(-EINVAL);
         }
 
-        MOD_INC_USE_COUNT;
         tgt = class_uuid2obd(data->ioc_inlbuf1);
         if (!tgt || !(tgt->obd_flags & OBD_ATTACHED) ||
             !(tgt->obd_flags & OBD_SET_UP)) {
                 CERROR("device not attached or not set up (%d)\n",
                        data->ioc_dev);
-                GOTO(error_dec, rc = -EINVAL);
+                RETURN(rc = -EINVAL);
         }
 
         rc = obd_connect(&ec->conn, tgt, NULL, NULL, NULL);
-        if (rc) {
+        if (rc)
                 CERROR("fail to connect to device %d\n", data->ioc_dev);
-                GOTO(error_dec, rc = -EINVAL);
-        }
-        RETURN(rc);
-error_dec:
-        MOD_DEC_USE_COUNT;
         RETURN(rc);
 }
 
@@ -246,7 +240,6 @@ static int echo_cleanup(struct obd_device * obddev)
                 RETURN(-EINVAL);
         }
 
-        MOD_DEC_USE_COUNT;
         RETURN(0);
 }
 
@@ -258,6 +251,7 @@ static int echo_connect(struct lustre_handle *conn, struct obd_device *src,
 }
 
 static struct obd_ops echo_obd_ops = {
+        o_owner:       THIS_MODULE,
         o_setup:       echo_setup,
         o_cleanup:     echo_cleanup,
         o_iocontrol:   echo_iocontrol,
index a237004..c6658d6 100644 (file)
@@ -8,10 +8,7 @@ MODULE = obdfilter
 modulefs_DATA = obdfilter.o
 EXTRA_PROGRAMS = obdfilter
 
-LINX=simple.c ll_pack.c
-ll_pack.c:
-       test -e ll_pack.c || ln -sf $(top_srcdir)/lib/ll_pack.c
-
+LINX=simple.c
 simple.c:
        test -e simple.c || ln -sf $(top_srcdir)/lib/simple.c
 
index a370e56..2d495b2 100644 (file)
@@ -275,11 +275,11 @@ static struct dentry *filter_fid2dentry(struct obd_device *obd,
         len = sprintf(name, LPU64, id);
         CDEBUG(D_INODE, "opening object O/%*s/%s\n",
                dparent->d_name.len, dparent->d_name.name, name);
-        if (!locked)
-                down(&dparent->d_inode->i_sem);
+        //if (!locked)
+                //down(&dparent->d_inode->i_sem);
         dchild = lookup_one_len(name, dparent, len);
-        if (!locked)
-                up(&dparent->d_inode->i_sem);
+        //if (!locked)
+                //up(&dparent->d_inode->i_sem);
         if (IS_ERR(dchild)) {
                 CERROR("child lookup error %ld\n", PTR_ERR(dchild));
                 RETURN(dchild);
@@ -333,7 +333,7 @@ static struct file *filter_obj_open(struct obd_export *export,
                 RETURN(ERR_PTR(-EINVAL));
         }
 
-        ffd = kmem_cache_alloc(filter_open_cache, SLAB_KERNEL);
+        PORTAL_SLAB_ALLOC(ffd, filter_open_cache, sizeof(*ffd));
         if (!ffd) {
                 CERROR("obdfilter: out of memory\n");
                 RETURN(ERR_PTR(-ENOMEM));
@@ -352,7 +352,7 @@ static struct file *filter_obj_open(struct obd_export *export,
         pop_ctxt(&saved, &filter->fo_ctxt, NULL);
 
         if (IS_ERR(file)) {
-                CERROR("error opening %s: rc %d\n", name, PTR_ERR(file));
+                CERROR("error opening %s: rc %ld\n", name, PTR_ERR(file));
                 GOTO(out_fdd, file);
         }
 
@@ -397,7 +397,7 @@ out_fdd:
         kmem_cache_free(filter_dentry_cache, fdd);
 out_ffd:
         ffd->ffd_servercookie = DEAD_HANDLE_MAGIC;
-        kmem_cache_free(filter_open_cache, ffd);
+        PORTAL_SLAB_FREE(ffd, filter_open_cache, sizeof(*ffd));
         goto out;
 }
 
@@ -459,7 +459,7 @@ static int filter_close_internal(struct obd_device *obd,
         }
 
         f_dput(object_dentry);
-        kmem_cache_free(filter_open_cache, ffd);
+        PORTAL_SLAB_FREE(ffd, filter_open_cache, sizeof(*ffd));
 
         RETURN(rc);
 }
@@ -1423,8 +1423,8 @@ out_ctxt:
 }
 
 static int filter_brw(int cmd, struct lustre_handle *conn,
-                              struct lov_stripe_md *lsm, obd_count oa_bufs,
-                              struct brw_page *pga, struct obd_brw_set *set)
+                      struct lov_stripe_md *lsm, obd_count oa_bufs,
+                      struct brw_page *pga, struct obd_brw_set *set)
 {
         struct obd_ioobj        ioo;
         struct niobuf_local     *lnb;
@@ -1437,10 +1437,10 @@ static int filter_brw(int cmd, struct lustre_handle *conn,
         OBD_ALLOC(lnb, oa_bufs * sizeof(struct niobuf_local));
         OBD_ALLOC(rnb, oa_bufs * sizeof(struct niobuf_remote));
 
-        if ( lnb == NULL || rnb == NULL )
+        if (lnb == NULL || rnb == NULL)
                 GOTO(out, ret = -ENOMEM);
 
-        for ( i = 0 ; i < oa_bufs ; i++ ) {
+        for (i = 0; i < oa_bufs; i++) {
                 rnb[i].offset = pga[i].off;
                 rnb[i].len = pga[i].count;
         }
@@ -1450,16 +1450,16 @@ static int filter_brw(int cmd, struct lustre_handle *conn,
         ioo.ioo_type = S_IFREG;
         ioo.ioo_bufcnt = oa_bufs;
 
-        ret = filter_preprw(cmd, conn, 1, &ioo, oa_bufs, rnb, lnb, 
-                                &desc_private);
-        if ( ret != 0 )
+        ret = filter_preprw(cmd, conn, 1, &ioo, oa_bufs, rnb, lnb,
+                            &desc_private);
+        if (ret != 0)
                 GOTO(out, ret);
 
-        for ( i = 0; i < oa_bufs ; i++ ) {
+        for (i = 0; i < oa_bufs; i++) {
                 void *virt = kmap(pga[i].pg);
                 obd_off off = pga[i].off & ~PAGE_MASK;
 
-                if ( cmd & OBD_BRW_WRITE ) 
+                if (cmd & OBD_BRW_WRITE)
                         memcpy(lnb[i].addr + off, virt + off, pga[i].count);
                 else
                         memcpy(virt + off, lnb[i].addr + off, pga[i].count);
@@ -1470,9 +1470,9 @@ static int filter_brw(int cmd, struct lustre_handle *conn,
         ret = filter_commitrw(cmd, conn, 1, &ioo, oa_bufs, lnb, desc_private);
 
 out:
-        if ( lnb )
+        if (lnb)
                 OBD_FREE(lnb, oa_bufs * sizeof(struct niobuf_local));
-        if ( rnb )
+        if (rnb)
                 OBD_FREE(rnb, oa_bufs * sizeof(struct niobuf_remote));
         RETURN(ret);
 }
@@ -1608,29 +1608,30 @@ int filter_copy_data(struct lustre_handle *dst_conn, struct obdo *dst,
 }
 
 static struct obd_ops filter_obd_ops = {
-        o_attach:      filter_attach,
-        o_detach:      filter_detach,
-        o_get_info:    filter_get_info,
-        o_setup:       filter_setup,
-        o_cleanup:     filter_cleanup,
-        o_connect:     filter_connect,
-        o_disconnect:  filter_disconnect,
-        o_statfs:      filter_statfs,
-        o_getattr:     filter_getattr,
-        o_create:      filter_create,
-        o_setattr:     filter_setattr,
-        o_destroy:     filter_destroy,
-        o_open:        filter_open,
-        o_close:       filter_close,
-        o_brw:         filter_brw,
-        o_punch:       filter_truncate,
-        o_preprw:      filter_preprw,
-        o_commitrw:    filter_commitrw
+        o_owner:        THIS_MODULE,
+        o_attach:       filter_attach,
+        o_detach:       filter_detach,
+        o_get_info:     filter_get_info,
+        o_setup:        filter_setup,
+        o_cleanup:      filter_cleanup,
+        o_connect:      filter_connect,
+        o_disconnect:   filter_disconnect,
+        o_statfs:       filter_statfs,
+        o_getattr:      filter_getattr,
+        o_create:       filter_create,
+        o_setattr:      filter_setattr,
+        o_destroy:      filter_destroy,
+        o_open:         filter_open,
+        o_close:        filter_close,
+        o_brw:          filter_brw,
+        o_punch:        filter_truncate,
+        o_preprw:       filter_preprw,
+        o_commitrw:     filter_commitrw
 #if 0
-        o_preallocate: filter_preallocate_inodes,
-        o_migrate:     filter_migrate,
-        o_copy:        filter_copy_data,
-        o_iterate:     filter_iterate
+        o_preallocate:  filter_preallocate_inodes,
+        o_migrate:      filter_migrate,
+        o_copy:         filter_copy_data,
+        o_iterate:      filter_iterate
 #endif
 };
 
index 284c2d6..2348a5b 100644 (file)
@@ -9,13 +9,11 @@ MODULE = osc
 modulefs_DATA = osc.o
 EXTRA_PROGRAMS = osc
 
-LINX= obd_pack.c ll_pack.c client.c
+LINX= obd_pack.c client.c
 osc_SOURCES = osc_request.c lproc_osc.c $(LINX)
 
 obd_pack.c: 
        test -e obd_pack.c || ln -sf $(top_srcdir)/lib/obd_pack.c
-ll_pack.c: 
-       test -e ll_pack.c || ln -sf $(top_srcdir)/lib/ll_pack.c
 client.c: 
        test -e client.c || ln -sf $(top_srcdir)/lib/client.c
 
index 1e2f72e..85b1694 100644 (file)
@@ -399,7 +399,7 @@ static void unmap_and_decref_bulk_desc(void *data)
 }
 
 /*  this is the callback function which is invoked by the Portals
- *  event handler associated with the bulk_sink queue and bulk_source queue. 
+ *  event handler associated with the bulk_sink queue and bulk_source queue.
  */
 static void osc_ptl_ev_hdlr(struct ptlrpc_bulk_desc *desc)
 {
@@ -428,7 +428,9 @@ static int osc_brw_read(struct lustre_handle *conn, struct lov_stripe_md *lsm,
         struct ptlrpc_bulk_desc *desc = NULL;
         struct ost_body *body;
         int rc, size[3] = {sizeof(*body)}, mapped = 0;
-        void *iooptr, *nioptr;
+        unsigned long flags;
+        struct obd_ioobj *iooptr;
+        void *nioptr;
         __u32 xid;
         ENTRY;
 
@@ -453,9 +455,9 @@ static int osc_brw_read(struct lustre_handle *conn, struct lov_stripe_md *lsm,
         ost_pack_ioo(&iooptr, lsm, page_count);
         /* end almost identical to brw_write case */
 
-        spin_lock(&imp->imp_lock);
+        spin_lock_irqsave(&imp->imp_lock, flags);
         xid = ++imp->imp_last_xid;       /* single xid for all pages */
-        spin_unlock(&imp->imp_lock);
+        spin_unlock_irqrestore(&imp->imp_lock, flags);
 
         obd_kmap_get(page_count, 0);
 
@@ -521,26 +523,27 @@ out_unmap:
         goto out_req;
 }
 
-static int osc_brw_write(struct lustre_handle *conn, struct lov_stripe_md *md,
+static int osc_brw_write(struct lustre_handle *conn, struct lov_stripe_md *lsm,
                          obd_count page_count, struct brw_page *pga,
                          struct obd_brw_set *set)
 {
-        struct ptlrpc_connection *connection =
-                client_conn2cli(conn)->cl_import.imp_connection;
+        struct obd_import *imp = class_conn2cliimp(conn);
+        struct ptlrpc_connection *connection = imp->imp_connection;
         struct ptlrpc_request *request = NULL;
         struct ptlrpc_bulk_desc *desc = NULL;
         struct ost_body *body;
         struct niobuf_local *local = NULL;
         struct niobuf_remote *remote;
-        int rc, j, size[3] = {sizeof(*body)}, mapped = 0;
-        void *iooptr, *nioptr;
+        int rc, size[3] = {sizeof(*body)}, mapped = 0;
+        int j;
+        struct obd_ioobj *iooptr;
+        void *nioptr;
         ENTRY;
 
         size[1] = sizeof(struct obd_ioobj);
-        size[2] = page_count * sizeof(*remote);
+        size[2] = page_count * sizeof(struct niobuf_remote);
 
-        request = ptlrpc_prep_req(class_conn2cliimp(conn), OST_WRITE, 3, size,
-                                  NULL);
+        request = ptlrpc_prep_req(imp, OST_WRITE, 3, size, NULL);
         if (!request)
                 RETURN(-ENOMEM);
 
@@ -548,14 +551,14 @@ static int osc_brw_write(struct lustre_handle *conn, struct lov_stripe_md *md,
 
         desc = ptlrpc_prep_bulk(connection);
         if (!desc)
-               GOTO(out_req, rc = -ENOMEM);
+                GOTO(out_req, rc = -ENOMEM);
         desc->bd_portal = OSC_BULK_PORTAL;
         desc->bd_ptl_ev_hdlr = osc_ptl_ev_hdlr;
         CDEBUG(D_PAGE, "desc = %p\n", desc);
 
         iooptr = lustre_msg_buf(request->rq_reqmsg, 1);
         nioptr = lustre_msg_buf(request->rq_reqmsg, 2);
-        ost_pack_ioo(&iooptr, md, page_count);
+        ost_pack_ioo(&iooptr, lsm, page_count);
         /* end almost identical to brw_read case */
 
         OBD_ALLOC(local, page_count * sizeof(*local));
@@ -567,7 +570,7 @@ static int osc_brw_write(struct lustre_handle *conn, struct lov_stripe_md *md,
         for (mapped = 0; mapped < page_count; mapped++) {
                 local[mapped].addr = kmap(pga[mapped].pg);
 
-                CDEBUG(D_INFO, "kmap(pg) = %p ; pg->flags = %lx ; pg->count = "
+                CDEBUG(D_INFO, "kmap(pg) = %p ; pg->flags = %lx ; pg->refcount = "
                        "%d ; page %d of %d\n",
                        local[mapped].addr, pga[mapped].pg->flags,
                        page_count(pga[mapped].pg),
@@ -604,7 +607,7 @@ static int osc_brw_write(struct lustre_handle *conn, struct lov_stripe_md *md,
                 if (!bulk)
                         GOTO(out_unmap, rc = -ENOMEM);
 
-                bulk->bp_buf = (void *)(unsigned long)local[j].addr;
+                bulk->bp_buf = local[j].addr;
                 bulk->bp_buflen = local[j].len;
                 bulk->bp_xid = remote->xid;
                 bulk->bp_page = pga[j].pg;
@@ -776,6 +779,50 @@ static int osc_statfs(struct lustre_handle *conn, struct obd_statfs *osfs)
         return rc;
 }
 
+/* Retrieve object striping information.
+ *
+ * @lmmu is a pointer to an in-core struct with lmm_ost_count indicating
+ * the maximum number of OST indices which will fit in the user buffer.
+ * lmm_magic must be LOV_MAGIC (we only use 1 slot here).
+ */
+static int osc_getstripe(struct lustre_handle *conn, struct lov_stripe_md *lsm,
+                         struct lov_mds_md *lmmu)
+{
+        struct lov_mds_md lmm, *lmmk;
+        int rc, lmm_size;
+        ENTRY;
+
+        if (!lsm)
+                RETURN(-ENODATA);
+
+        rc = copy_from_user(&lmm, lmmu, sizeof(lmm));
+        if (rc)
+                RETURN(-EFAULT);
+
+        if (lmm.lmm_magic != LOV_MAGIC)
+                RETURN(-EINVAL);
+
+        if (lmm.lmm_ost_count < 1)
+                RETURN(-EOVERFLOW);
+
+        lmm_size = sizeof(lmm) + sizeof(lmm.lmm_objects[0]);
+        OBD_ALLOC(lmmk, lmm_size);
+        if (!lmmk)
+                RETURN(-ENOMEM);
+
+        lmmk->lmm_stripe_count = 1;
+        lmmk->lmm_ost_count = 1;
+        lmmk->lmm_object_id = lsm->lsm_object_id;
+        lmmk->lmm_objects[0].l_object_id = lsm->lsm_object_id;
+
+        if (copy_to_user(lmmu, lmmk, lmm_size))
+                rc = -EFAULT;
+
+        OBD_FREE(lmmk, lmm_size);
+
+        RETURN(rc);
+}
+
 static int osc_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len,
                          void *karg, void *uarg)
 {
@@ -878,8 +925,16 @@ static int osc_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len,
                 OBD_FREE(buf, len);
                 GOTO(out, err);
         }
+        case LL_IOC_LOV_SETSTRIPE:
+                err = obd_alloc_memmd(conn, karg);
+                if (err > 0)
+                        err = 0;
+                GOTO(out, err);
+        case LL_IOC_LOV_GETSTRIPE:
+                err = osc_getstripe(conn, karg, uarg);
+                GOTO(out, err);
         default:
-                CERROR ("osc_ioctl(): unrecognised ioctl %#lx\n", cmd);
+                CERROR ("osc_ioctl(): unrecognised ioctl %#x\n", cmd);
                 GOTO(out, err = -ENOTTY);
         }
 out:
@@ -904,7 +959,7 @@ static void set_osc_active(struct obd_import *imp, int active)
 
                 fakeconn.addr = (__u64)(unsigned long)exp;
                 fakeconn.cookie = exp->exp_cookie;
-                ioc_data.ioc_inlbuf1 = imp->imp_obd->obd_uuid;
+                ioc_data.ioc_inlbuf1 = imp->imp_obd->u.cli.cl_target_uuid;
                 ioc_data.ioc_offset = active;
                 rc = obd_iocontrol(IOC_LOV_SET_OSC_ACTIVE, &fakeconn,
                                    sizeof ioc_data, &ioc_data, NULL);
@@ -919,42 +974,11 @@ static void set_osc_active(struct obd_import *imp, int active)
         }
 }
 
-
-/* XXX looks a lot like super.c:invalidate_request_list, don't it? */
-static void abort_inflight_for_import(struct obd_import *imp)
-{
-        struct list_head *tmp, *n;
-
-        /* Make sure that no new requests get processed for this import.
-         * ptlrpc_queue_wait must (and does) hold imp_lock while testing this
-         * flag and then putting requests on sending_list or delayed_list.
-         */
-        spin_lock(&imp->imp_lock);
-        imp->imp_flags |= IMP_INVALID;
-        spin_unlock(&imp->imp_lock);
-
-        list_for_each_safe(tmp, n, &imp->imp_sending_list) {
-                struct ptlrpc_request *req =
-                        list_entry(tmp, struct ptlrpc_request, rq_list);
-
-                DEBUG_REQ(D_HA, req, "inflight");
-                req->rq_flags |= PTL_RPC_FL_ERR;
-                wake_up(&req->rq_wait_for_rep);
-        }
-
-        list_for_each_safe(tmp, n, &imp->imp_delayed_list) {
-                struct ptlrpc_request *req =
-                        list_entry(tmp, struct ptlrpc_request, rq_list);
-
-                DEBUG_REQ(D_HA, req, "aborting waiting req");
-                req->rq_flags |= PTL_RPC_FL_ERR;
-                wake_up(&req->rq_wait_for_rep);
-        }
-}
-
 static int osc_recover(struct obd_import *imp, int phase)
 {
         int rc;
+        unsigned long flags;
+        struct ptlrpc_request *req;
         ENTRY;
 
         switch(phase) {
@@ -969,15 +993,21 @@ static int osc_recover(struct obd_import *imp, int phase)
 
             case PTLRPC_RECOVD_PHASE_RECOVER:
                 imp->imp_flags &= ~IMP_INVALID;
-                rc = ptlrpc_reconnect_import(imp, OST_CONNECT);
+                rc = ptlrpc_reconnect_import(imp, OST_CONNECT, &req);
+                ptlrpc_req_finished(req);
                 if (rc) {
                         imp->imp_flags |= IMP_INVALID;
                         RETURN(rc);
                 }
 
-                spin_lock(&imp->imp_lock);
+                spin_lock_irqsave(&imp->imp_lock, flags);
                 imp->imp_level = LUSTRE_CONN_FULL;
-                spin_unlock(&imp->imp_lock);
+                spin_unlock_irqrestore(&imp->imp_lock, flags);
+
+                /* Is this the right place?  Should we do this in _PREPARE
+                 * as well?  What about raising the level right away?
+                 */
+                ptlrpc_wake_delayed(imp);
 
                 set_osc_active(imp, 1 /* active */);
                 RETURN(0);
@@ -1001,6 +1031,7 @@ static int osc_connect(struct lustre_handle *conn, struct obd_device *obd,
 }
 
 struct obd_ops osc_obd_ops = {
+        o_owner:        THIS_MODULE,
         o_attach:       osc_attach,
         o_detach:       osc_detach,
         o_setup:        client_obd_setup,
index 3ad390a..c158a0f 100644 (file)
@@ -8,10 +8,8 @@ MODULE = ost
 modulefs_DATA = ost.o
 EXTRA_PROGRAMS = ost
 
-LINX=obd_pack.c ll_pack.c target.c
+LINX=obd_pack.c target.c
 
-ll_pack.c: 
-       test -e ll_pack.c || ln -sf $(top_srcdir)/lib/ll_pack.c
 obd_pack.c: 
        test -e obd_pack.c || ln -sf $(top_srcdir)/lib/obd_pack.c
 target.c: 
index 6ccb240..db7857c 100644 (file)
@@ -37,6 +37,7 @@
 #include <linux/obd_ost.h>
 #include <linux/lustre_net.h>
 #include <linux/lustre_dlm.h>
+#include <linux/lustre_export.h>
 #include <linux/init.h>
 #include <linux/lprocfs_status.h>
 
@@ -212,10 +213,10 @@ static int ost_setattr(struct ptlrpc_request *req)
 
 static int ost_bulk_timeout(void *data)
 {
-        struct ptlrpc_bulk_desc *desc = data;
-
         ENTRY;
-        recovd_conn_fail(desc->bd_connection);
+        /* We don't fail the connection here, because having the export
+         * killed makes the (vital) call to commitrw very sad.
+         */
         RETURN(1);
 }
 
@@ -223,7 +224,8 @@ static int ost_brw_read(struct ptlrpc_request *req)
 {
         struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg;
         struct ptlrpc_bulk_desc *desc;
-        void *tmp1, *tmp2, *end2;
+        struct obd_ioobj *tmp1;
+        void *tmp2, *end2;
         struct niobuf_remote *remote_nb;
         struct niobuf_local *local_nb = NULL;
         struct obd_ioobj *ioo;
@@ -316,16 +318,19 @@ static int ost_brw_write(struct ptlrpc_request *req)
 {
         struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg;
         struct ptlrpc_bulk_desc *desc;
+        struct obd_ioobj *tmp1;
+        void *tmp2, *end2;
         struct niobuf_remote *remote_nb;
-        struct niobuf_local *local_nb, *lnb;
+        struct niobuf_local *local_nb = NULL;
+        struct niobuf_local *lnb;
         struct obd_ioobj *ioo;
         struct ost_body *body;
-        int cmd, rc, i, j, objcount, niocount, size[2] = {sizeof(*body)};
-        void *tmp1, *tmp2, *end2;
+        struct l_wait_info lwi;
+        int rc, cmd, i, j, objcount, niocount;
+        int size[2] = {sizeof(*body)};
         void *desc_priv = NULL;
         int reply_sent = 0;
         struct ptlrpc_service *srv;
-        struct l_wait_info lwi;
         __u32 xid;
         ENTRY;
 
@@ -415,11 +420,15 @@ static int ost_brw_write(struct ptlrpc_request *req)
         if (rc) {
                 if (rc != -ETIMEDOUT)
                         LBUG();
-                GOTO(fail_bulk, rc);
+                ptlrpc_abort_bulk(desc);
+                recovd_conn_fail(desc->bd_connection);
+                obd_commitrw(cmd, conn, objcount, tmp1, niocount, local_nb,
+                             desc->bd_desc_private);
+        } else {
+                rc = obd_commitrw(cmd, conn, objcount, tmp1, niocount, local_nb,
+                                  desc->bd_desc_private);
         }
 
-        rc = obd_commitrw(cmd, conn, objcount, tmp1, niocount, local_nb,
-                          desc->bd_desc_private);
         ptlrpc_bulk_decref(desc);
         EXIT;
 out_free:
@@ -438,7 +447,7 @@ out:
 fail_bulk:
         ptlrpc_free_bulk(desc);
 fail_preprw:
-        /* FIXME: how do we undo the preprw? */
+        /* FIXME: how do we undo the preprw? - answer = call commitrw */
         goto out_free;
 }
 
@@ -457,6 +466,7 @@ static int ost_handle(struct ptlrpc_request *req)
             req->rq_export == NULL) {
                 CERROR("lustre_ost: operation %d on unconnected OST\n",
                        req->rq_reqmsg->opc);
+                req->rq_status = -ENOTCONN;
                 GOTO(out, rc = -ENOTCONN);
         }
 
@@ -592,19 +602,18 @@ static int ost_setup(struct obd_device *obddev, obd_count len, void *buf)
                 RETURN(-EINVAL);
         }
 
-        MOD_INC_USE_COUNT;
         tgt = class_uuid2obd(data->ioc_inlbuf1);
         if (!tgt || !(tgt->obd_flags & OBD_ATTACHED) ||
             !(tgt->obd_flags & OBD_SET_UP)) {
                 CERROR("device not attached or not set up (%d)\n",
                        data->ioc_dev);
-                GOTO(error_dec, err = -EINVAL);
+                RETURN(err = -EINVAL);
         }
 
         err = obd_connect(&ost->ost_conn, tgt, NULL, NULL, NULL);
         if (err) {
                 CERROR("fail to connect to device %d\n", data->ioc_dev);
-                GOTO(error_dec, err = -EINVAL);
+                RETURN(err);
         }
 
         ost->ost_service = ptlrpc_init_svc(OST_NEVENTS, OST_NBUFS,
@@ -630,8 +639,6 @@ static int ost_setup(struct obd_device *obddev, obd_count len, void *buf)
 
 error_disc:
         obd_disconnect(&ost->ost_conn);
-error_dec:
-        MOD_DEC_USE_COUNT;
         RETURN(err);
 }
 
@@ -651,14 +658,12 @@ static int ost_cleanup(struct obd_device * obddev)
         ptlrpc_unregister_service(ost->ost_service);
 
         err = obd_disconnect(&ost->ost_conn);
-        if (err) {
+        if (err)
                 CERROR("lustre ost: fail to disconnect device\n");
-                RETURN(-EINVAL);
-        }
 
-        MOD_DEC_USE_COUNT;
-        RETURN(0);
+        RETURN(err);
 }
+
 int ost_attach(struct obd_device *dev, obd_count len, void *data)
 {
         return lprocfs_reg_obd(dev, status_var_nm_1, dev);
@@ -667,24 +672,71 @@ int ost_attach(struct obd_device *dev, obd_count len, void *data)
 int ost_detach(struct obd_device *dev)
 {
         return lprocfs_dereg_obd(dev);
-       
 }
 
+/* This is so similar to mds_connect that it makes my heart weep: we should
+ * shuffle the UUID into obd_export proper and make this all happen in
+ * target_handle_connect.
+ *
+ * Connect the client identified by cluuid to the OST device obd.
+ *
+ * If an export on obd already carries the same UUID the request is handed
+ * to target_handle_reconnect().  Otherwise a new export is created with
+ * class_connect() and the client UUID is recorded in its exp_ost_data.
+ * The special UUID "OBD_CLASS_UUID" skips the existing-export scan.
+ *
+ * Returns 0 on success, -EINVAL if conn, obd or cluuid is NULL, or the
+ * result of target_handle_reconnect() / class_connect().
+ * recovd and recover are not used by this function.
+ */
+static int ost_connect(struct lustre_handle *conn,
+                       struct obd_device *obd, obd_uuid_t cluuid,
+                       struct recovd_obd *recovd,
+                       ptlrpc_recovery_cb_t recover)
+{
+        struct obd_export *exp;
+        struct ost_export_data *oed;
+        struct list_head *p;
+        int rc;
+        ENTRY;
+
+        if (!conn || !obd || !cluuid)
+                RETURN(-EINVAL);
+
+        /* lctl gets a backstage, all-access pass. */
+        if (!strcmp(cluuid, "OBD_CLASS_UUID"))
+                goto dont_check_exports;
+
+        spin_lock(&obd->obd_dev_lock);
+        list_for_each(p, &obd->obd_exports) {
+                exp = list_entry(p, struct obd_export, exp_obd_chain);
+                oed = &exp->exp_ost_data;
+                if (!memcmp(cluuid, oed->oed_uuid, sizeof oed->oed_uuid)) {
+                        spin_unlock(&obd->obd_dev_lock);
+                        LASSERT(exp->exp_obd == obd);
+
+                        RETURN(target_handle_reconnect(conn, exp, cluuid));
+                }
+        }
+        /* No export matched: release the lock taken for the scan before
+         * creating a new export, otherwise it stays held on return. */
+        spin_unlock(&obd->obd_dev_lock);
+
+ dont_check_exports:
+        rc = class_connect(conn, obd, cluuid);
+        if (rc)
+                RETURN(rc);
+        exp = class_conn2export(conn);
+        LASSERT(exp);
+
+        /* Remember who owns this export so a later connect with the same
+         * UUID is recognised as a reconnect by the scan above. */
+        oed = &exp->exp_ost_data;
+        memcpy(oed->oed_uuid, cluuid, sizeof oed->oed_uuid);
+
+        RETURN(0);
+}
 
 
 /* use obd ops to offer management infrastructure */
 static struct obd_ops ost_obd_ops = {
-        o_attach:      ost_attach,
-        o_detach:      ost_detach,
-        o_setup:       ost_setup,
-        o_cleanup:     ost_cleanup,
+        o_owner:        THIS_MODULE,
+        o_attach:       ost_attach,
+        o_detach:       ost_detach,
+        o_setup:        ost_setup,
+        o_cleanup:      ost_cleanup,
+        o_connect:      ost_connect,
 };
 
 static int __init ost_init(void)
 {
         int rc;
 
-        rc = class_register_type(&ost_obd_ops, status_class_var, 
+        rc = class_register_type(&ost_obd_ops, status_class_var,
                                  LUSTRE_OST_NAME);
         RETURN(rc);
 
@@ -692,7 +744,6 @@ static int __init ost_init(void)
 
 static void __exit ost_exit(void)
 {
-        
         class_unregister_type(LUSTRE_OST_NAME);
 }
 
diff --git a/lustre/patches/.cvsignore b/lustre/patches/.cvsignore
deleted file mode 100644 (file)
index e530020..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-.Xrefs
-config.log
-config.status
-configure
-Makefile
-Makefile.in
-.deps
-TAGS
diff --git a/lustre/ptlbd/.cvsignore b/lustre/ptlbd/.cvsignore
new file mode 100644 (file)
index 0000000..e995588
--- /dev/null
@@ -0,0 +1,3 @@
+.deps
+Makefile
+Makefile.in
diff --git a/lustre/ptlbd/Makefile.am b/lustre/ptlbd/Makefile.am
new file mode 100644 (file)
index 0000000..bfaeb25
--- /dev/null
@@ -0,0 +1,14 @@
+# Copyright (C) 2002  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+DEFS=
+
+MODULE = ptlbd
+modulefs_DATA = ptlbd.o
+EXTRA_PROGRAMS = ptlbd
+
+ptlbd_SOURCES = blk.c client.c main.c rpc.c server.c
+
+include $(top_srcdir)/Rules
diff --git a/lustre/ptlbd/blk.c b/lustre/ptlbd/blk.c
new file mode 100644 (file)
index 0000000..4a79343
--- /dev/null
@@ -0,0 +1,247 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include <linux/module.h>
+#include <linux/major.h>
+#include <linux/smp.h>
+
+#define DEBUG_SUBSYSTEM S_PTLBD
+
+#include <linux/lustre_lite.h>
+#include <linux/lustre_ha.h>
+#include <linux/obd_support.h>
+#include <linux/lustre_idl.h>
+#include <linux/obd_ptlbd.h>
+
+/*
+ * todo:
+ *   assign proper major number
+ *   allow more minors
+ *   discover actual block sizes?
+ *   allow more than one sector per io
+ *   think about vary-io
+ *   restrict single ops to sequential block io
+ *   ddn target addresses need to be 32 bit
+ *   can't get to addresses after 0xFFFF0000
+ */
+
+#define PTLBD_MAJOR 253
+#define PTLBD_MAX_MINOR 1
+
+#define MAJOR_NR PTLBD_MAJOR
+#define LOCAL_END_REQUEST
+#include <linux/blk.h>
+#include <linux/blkdev.h>
+#include <linux/devfs_fs_kernel.h>
+
+/* Per-minor tables; ptlbd_blk_init() fills them in and installs them into
+ * the per-major arrays blksize_size[], blk_size[], hardsect_size[] and
+ * max_sectors[] respectively. */
+static int ptlbd_size_size[PTLBD_MAX_MINOR];
+static int ptlbd_size[PTLBD_MAX_MINOR];
+static int ptlbd_hardsect_size[PTLBD_MAX_MINOR];
+static int ptlbd_max_sectors[PTLBD_MAX_MINOR];
+//RHism static char ptlbd_dev_varyio[PTLBD_MAX_MINOR];
+
+/*
+ * per minor state, indexed by minor.
+ *
+ * Currently a single instance: stored by ptlbd_blk_register() and handed
+ * back by ptlbd_get_minor() for every minor below PTLBD_MAX_MINOR.
+ */
+
+static struct ptlbd_obd *one_for_now;
+
+/*
+ * Record ptlbd as the single device instance served by the block layer;
+ * ptlbd_get_minor() returns this pointer afterwards.
+ */
+void ptlbd_blk_register(struct ptlbd_obd *ptlbd)
+{
+        ENTRY;
+        one_for_now = ptlbd;
+        EXIT;
+}
+
+/*
+ * Map a minor number to its ptlbd instance.
+ *
+ * Returns the registered instance, or ERR_PTR(-ENODEV) when the minor is
+ * out of range or no instance has been registered yet (one_for_now is
+ * still NULL), so callers that test IS_ERR() never see a NULL pointer.
+ */
+static struct ptlbd_obd * ptlbd_get_minor(int minor)
+{
+        ENTRY;
+        if ( minor < 0 || minor >= PTLBD_MAX_MINOR )
+                RETURN( ERR_PTR(-ENODEV) );
+        /* nothing registered via ptlbd_blk_register() yet */
+        if ( one_for_now == NULL )
+                RETURN( ERR_PTR(-ENODEV) );
+        RETURN(one_for_now);
+}
+
+/*
+ * Look up the ptlbd instance behind a block-device inode.
+ *
+ * Returns ERR_PTR(-EINVAL) for a NULL inode, otherwise whatever
+ * ptlbd_get_minor() returns for the inode's minor number.
+ */
+static struct ptlbd_obd * ptlbd_get_inode(struct inode  *inode)
+{
+        ENTRY;
+
+        if ( inode == NULL ) /* can this really happen? */
+                RETURN( ERR_PTR(-EINVAL) );
+
+        /* RETURN (not return) so the ENTRY above is balanced on this path */
+        RETURN( ptlbd_get_minor(MINOR(inode->i_rdev)) );
+}
+
+/*
+ * Block-device open: resolve the instance for this inode, refuse the open
+ * with -ENODEV while its import has no connection, and otherwise bump the
+ * instance's reference count.
+ */
+static int ptlbd_open(struct inode *inode, struct file  *file)
+{
+        struct ptlbd_obd *dev = ptlbd_get_inode(inode);
+        ENTRY;
+
+        if ( IS_ERR(dev) )
+                RETURN(PTR_ERR(dev));
+
+        /* the client side has not been set up yet */
+        if ( !dev->bd_import.imp_connection )
+                RETURN(-ENODEV);
+
+        dev->refcount += 1;
+        RETURN(0);
+}
+
+/*
+ * Block-device ioctl.  No commands are implemented yet: after the
+ * CAP_SYS_ADMIN check and the instance lookup every request is answered
+ * with -EINVAL.
+ *
+ * Returns -EPERM without CAP_SYS_ADMIN, the lookup error if the inode does
+ * not map to an instance, and -EINVAL otherwise.
+ */
+static int ptlbd_ioctl(struct inode *inode, struct file *file,
+                unsigned int cmd, unsigned long arg)
+{
+        struct ptlbd_obd *ptlbd;
+        ENTRY;  /* pairs with the RETURN()s below */
+
+        if ( ! capable(CAP_SYS_ADMIN) )
+                RETURN(-EPERM);
+
+        ptlbd = ptlbd_get_inode(inode);
+        if ( IS_ERR(ptlbd) )
+                RETURN( PTR_ERR(ptlbd) );
+
+        /* XXX getattr{,64} */
+
+        RETURN(-EINVAL);
+}
+
+/*
+ * Block-device release: drop the reference taken in ptlbd_open().
+ * Returns the lookup error if the inode does not map to an instance.
+ */
+static int ptlbd_release(struct inode *inode, struct file *file)
+{
+        struct ptlbd_obd *dev = ptlbd_get_inode(inode);
+        ENTRY;
+
+        if ( IS_ERR(dev) )
+                RETURN( PTR_ERR(dev) );
+
+        dev->refcount -= 1;
+        RETURN(0);
+}
+
+/*
+ * Complete every buffer head still attached to req and release the
+ * request.  Buffers are reported up to date only when req->errors is zero.
+ * The name indicates the caller is expected to hold the request lock
+ * already; ptlbd_request() calls this with io_request_lock held.
+ */
+static void ptlbd_end_request_havelock(struct request *req)
+{
+        struct buffer_head *cur;
+        int ok = req->errors ? 0 : 1;
+
+        for ( cur = req->bh ; cur != NULL ; cur = req->bh ) {
+                blk_finished_io(cur->b_size >> 9);
+                /* unlink from the request before completing the buffer */
+                req->bh = cur->b_reqnext;
+                cur->b_reqnext = NULL;
+                cur->b_end_io(cur, ok);
+        }
+        blkdev_release_request(req);
+}
+
+#if 0
+static void ptlbd_end_request_getlock(struct request *req)
+{
+        unsigned long flags;
+
+        spin_lock_irqsave(&io_request_lock, flags);
+        ptlbd_end_request_havelock(req);
+        spin_unlock_irqrestore(&io_request_lock, flags);
+}
+#endif
+
+/*
+ * Request function for the ptlbd queue.
+ *
+ * Drains the queue one request at a time: each request is dequeued, sent
+ * synchronously with ptlbd_send_req() (with io_request_lock dropped around
+ * the call) and then completed.  Requests for a minor that has no usable
+ * instance, or whose instance is not open, are completed with an error
+ * and the loop carries on with the next request.  A failing
+ * ptlbd_send_req() is recorded in req->errors so the buffers are not
+ * reported up to date.
+ */
+static void ptlbd_request(request_queue_t *q)
+{
+        struct ptlbd_obd *ptlbd;
+        struct request *req;
+        ptlbd_cmd_t cmd;
+        int rc;
+        ENTRY;
+
+        while ( !QUEUE_EMPTY ) {
+                req = CURRENT;
+                ptlbd = ptlbd_get_minor(MINOR(req->rq_dev));
+
+                blkdev_dequeue_request(req);
+
+                /* fail this request only; keep servicing the queue instead
+                 * of returning with requests still pending */
+                if ( ptlbd == NULL || IS_ERR(ptlbd) ||
+                     ptlbd->refcount <= 0 ) {
+                        req->errors++;
+                        ptlbd_end_request_havelock(req);
+                        continue;
+                }
+
+                spin_unlock_irq(&io_request_lock);
+
+                /* XXX dunno if we're supposed to get this or not.. */
+                LASSERT(req->cmd != READA);
+
+                if ( req->cmd == READ )
+                        cmd = PTLBD_READ;
+                else 
+                        cmd = PTLBD_WRITE;
+
+                rc = ptlbd_send_req(ptlbd, cmd, req->bh);
+
+                spin_lock_irq(&io_request_lock);
+
+                /* propagate transport failure to the buffer completions */
+                if ( rc )
+                        req->errors++;
+                ptlbd_end_request_havelock(req);
+        }
+        EXIT;
+}
+
+/* Block-device entry points; passed to register_blkdev() in
+ * ptlbd_blk_init(). */
+static struct block_device_operations ptlbd_ops = {
+        .owner = THIS_MODULE,
+        .open = ptlbd_open,
+        .release = ptlbd_release,
+        .ioctl = ptlbd_ioctl,
+};
+
+/*
+ * Register the ptlbd block device: claim PTLBD_MAJOR, install the
+ * per-minor size tables into the kernel's per-major arrays, set up the
+ * default request queue with ptlbd_request() as its handler and fill in
+ * the (hard-coded) geometry for each minor.
+ *
+ * Returns 0 on success or the negative value from register_blkdev().
+ */
+int ptlbd_blk_init(void)
+{
+        int ret;
+        int i;
+        ENTRY;
+
+        ret = register_blkdev(PTLBD_MAJOR, "ptlbd", &ptlbd_ops);
+        if ( ret < 0 ) 
+                RETURN(ret);
+
+        blk_size[PTLBD_MAJOR] = ptlbd_size;
+        blksize_size[PTLBD_MAJOR] = ptlbd_size_size;
+        hardsect_size[PTLBD_MAJOR] = ptlbd_hardsect_size;
+        max_sectors[PTLBD_MAJOR] = ptlbd_max_sectors;
+        //RHism blkdev_varyio[PTLBD_MAJOR] = ptlbd_dev_varyio;
+
+        blk_init_queue(BLK_DEFAULT_QUEUE(PTLBD_MAJOR), ptlbd_request);
+        /* MAJOR_NR is defined as PTLBD_MAJOR above; use one name throughout */
+        blk_queue_headactive(BLK_DEFAULT_QUEUE(PTLBD_MAJOR), 0);
+
+        for ( i = 0 ; i < PTLBD_MAX_MINOR ; i++) {
+                ptlbd_size_size[i] = 4096;
+                ptlbd_size[i] = (4096*2048) >> BLOCK_SIZE_BITS;
+                ptlbd_hardsect_size[i] = 4096;
+                ptlbd_max_sectors[i] = 2;
+                //RHism ptlbd_dev_varyio[i] = 0;
+                /* XXX register_disk? */
+        }
+
+        /* RETURN (not return) so the ENTRY above is balanced */
+        RETURN(0);
+}
+
+/*
+ * Undo ptlbd_blk_init(): tear down the request queue, unregister the
+ * major and detach the per-minor tables from the kernel's per-major
+ * arrays so they no longer point at this module's static storage.
+ */
+void ptlbd_blk_exit(void)
+{
+        int ret;
+        ENTRY;
+        blk_cleanup_queue(BLK_DEFAULT_QUEUE(PTLBD_MAJOR));
+        ret = unregister_blkdev(PTLBD_MAJOR, "ptlbd");
+        if ( ret )  /* XXX */
+                printk("unregister_blkdev() failed: %d\n", ret);
+
+        /* these were pointed at our static arrays in ptlbd_blk_init() */
+        blk_size[PTLBD_MAJOR] = NULL;
+        blksize_size[PTLBD_MAJOR] = NULL;
+        hardsect_size[PTLBD_MAJOR] = NULL;
+        max_sectors[PTLBD_MAJOR] = NULL;
+        EXIT;
+}
+
+#undef MAJOR_NR
diff --git a/lustre/ptlbd/client.c b/lustre/ptlbd/client.c
new file mode 100644 (file)
index 0000000..d57e001
--- /dev/null
@@ -0,0 +1,142 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+
+#define DEBUG_SUBSYSTEM S_PTLBD
+
+#include <linux/obd_support.h>
+#include <linux/obd_class.h>
+#include <linux/lustre_debug.h>
+#include <linux/lprocfs_status.h>
+#include <linux/obd_ptlbd.h>
+
+/*
+ * Set up a ptlbd client device.
+ *
+ * buf is a struct obd_ioctl_data whose first inline buffer carries the
+ * UUID of the PTLBD server.  The UUID is resolved to a connection, the
+ * device's import is initialised around it and the device is handed to
+ * the block layer with ptlbd_blk_register().
+ *
+ * Returns 0 on success, -EALREADY if the import already has a connection,
+ * -EINVAL for a missing or over-long UUID, and -ENOENT if the UUID cannot
+ * be resolved to a connection.
+ */
+static int ptlbd_cl_setup(struct obd_device *obddev, obd_count len, void *buf)
+{
+        struct ptlbd_obd *ptlbd = &obddev->u.ptlbd;
+        struct obd_import *imp = &ptlbd->bd_import;
+        struct obd_ioctl_data* data = buf;
+        obd_uuid_t server_uuid;
+        ENTRY;
+
+        if ( ptlbd->bd_import.imp_connection != NULL )
+                RETURN(-EALREADY);
+
+        if (data->ioc_inllen1 < 1) {
+                CERROR("requires a PTLBD server UUID\n");
+                RETURN(-EINVAL);
+        }
+
+        if (data->ioc_inllen1 > 37) {
+                CERROR("PTLBD server UUID must be less than 38 characters\n");
+                RETURN(-EINVAL);
+        }
+
+        /* Zero the whole buffer first: the copy below may fill only part of
+         * it, and the remainder must not be left as stack garbage. */
+        memset(server_uuid, 0, sizeof(server_uuid));
+        memcpy(server_uuid, data->ioc_inlbuf1, MIN(data->ioc_inllen1,
+                                                   sizeof(server_uuid)));
+
+        imp->imp_connection = ptlrpc_uuid_to_connection(server_uuid);
+        if (!imp->imp_connection)
+                RETURN(-ENOENT);
+
+        INIT_LIST_HEAD(&imp->imp_replay_list);
+        INIT_LIST_HEAD(&imp->imp_sending_list);
+        INIT_LIST_HEAD(&imp->imp_delayed_list);
+        spin_lock_init(&imp->imp_lock);
+        /*
+         * from client_obd_connect.. *shrug*
+         */
+        INIT_LIST_HEAD(&imp->imp_chain);
+        imp->imp_last_xid = 0;
+        imp->imp_max_transno = 0;
+        imp->imp_peer_last_xid = 0;
+        imp->imp_peer_committed_transno = 0;
+        imp->imp_level = LUSTRE_CONN_FULL;
+
+        ptlrpc_init_client(PTLBD_REQUEST_PORTAL, PTLBD_REPLY_PORTAL, 
+                        "ptlbd", &ptlbd->bd_client);
+        imp->imp_client = &ptlbd->bd_client;
+        imp->imp_obd = obddev;
+
+        ptlbd_blk_register(ptlbd);
+
+        RETURN(0);
+}
+
+/*
+ * Cleanup hook for the ptlbd client device.  Placeholder: it only logs a
+ * reminder and returns 0; nothing set up by ptlbd_cl_setup() is undone
+ * here.
+ */
+static int ptlbd_cl_cleanup(struct obd_device *obddev)
+{
+//        struct ptlbd_obd *ptlbd = &obddev->u.ptlbd;
+        ENTRY;
+
+        CERROR("I should be cleaning things up\n");
+
+        RETURN(0);
+}
+
+#if 0
+static int ptlbd_cl_connect(struct lustre_handle *conn, struct obd_device *obd,
+                        obd_uuid_t cluuid, struct recovd_obd *recovd,
+                        ptlrpc_recovery_cb_t recover)
+{
+        struct ptlbd_obd *ptlbd = &obd->u.ptlbd;
+        struct obd_import *imp = &ptlbd->bd_import;
+        int rc;
+        ENTRY;
+
+        rc = class_connect(conn, obd, cluuid);
+        if (rc) 
+                RETURN(rc);
+
+        INIT_LIST_HEAD(&imp->imp_chain);
+        imp->imp_last_xid = 0;
+        imp->imp_max_transno = 0;
+        imp->imp_peer_last_xid = 0;
+        imp->imp_peer_committed_transno = 0;
+        imp->imp_level = LUSTRE_CONN_FULL;
+
+        RETURN(0);
+}
+#endif
+
+/* OBD method table for the ptlbd client type, registered in
+ * ptlbd_cl_init().  The connect/disconnect entries are compiled out. */
+static struct obd_ops ptlbd_cl_obd_ops = {
+        o_owner:        THIS_MODULE,
+        o_setup:        ptlbd_cl_setup,
+        o_cleanup:      ptlbd_cl_cleanup,
+#if 0
+        o_connect:      ptlbd_cl_connect,
+        o_disconnect:   class_disconnect
+#endif
+};
+
+/*
+ * Register the ptlbd client OBD type under OBD_PTLBD_CL_DEVICENAME.
+ * Returns whatever class_register_type() returns.
+ */
+int ptlbd_cl_init(void)
+{
+        extern struct lprocfs_vars status_class_var[];
+        int rc;
+
+        rc = class_register_type(&ptlbd_cl_obd_ops, status_class_var,
+                                 OBD_PTLBD_CL_DEVICENAME);
+        return rc;
+}
+
+/* Unregister the ptlbd client OBD type registered by ptlbd_cl_init(). */
+void ptlbd_cl_exit(void)
+{
+        class_unregister_type(OBD_PTLBD_CL_DEVICENAME);
+}
diff --git a/lustre/ptlbd/main.c b/lustre/ptlbd/main.c
new file mode 100644 (file)
index 0000000..a95cc3f
--- /dev/null
@@ -0,0 +1,70 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/major.h>
+#include <linux/smp.h>
+
+#define DEBUG_SUBSYSTEM S_PTLBD
+
+#include <linux/lustre_lite.h>
+#include <linux/lustre_ha.h>
+#include <linux/obd_support.h>
+
+#include <linux/obd_ptlbd.h>
+
+/*
+ * Module init: bring up the client type, then the server side, then the
+ * block device.  If a later step fails, the steps already completed are
+ * undone in reverse order via the out_sv / out_cl labels and the failing
+ * step's error is returned.
+ */
+static int __init ptlbd_init(void)
+{
+        int ret;
+        ENTRY;
+
+        ret = ptlbd_cl_init();
+        if ( ret < 0 ) 
+                RETURN(ret);
+
+        ret = ptlbd_sv_init();
+        if ( ret < 0 ) 
+                GOTO(out_cl, ret);
+
+        ret = ptlbd_blk_init();
+        if ( ret < 0 ) 
+                GOTO(out_sv, ret);
+
+        RETURN(0);
+
+out_sv:
+        /* block init failed: undo the server side, then fall through */
+        ptlbd_sv_exit();
+out_cl:
+        ptlbd_cl_exit();
+        RETURN(ret);
+}
+
+/*
+ * Module exit: undo ptlbd_init() in reverse order -- block device first
+ * (so the major is unregistered before the module goes away), then the
+ * server side, then the client type.
+ */
+static void __exit ptlbd_exit(void)
+{
+        ENTRY;
+        /* NOTE(review): assumes ptlbd_blk_exit() is declared in
+         * obd_ptlbd.h alongside ptlbd_blk_init() -- confirm. */
+        ptlbd_blk_exit();
+        ptlbd_sv_exit();
+        ptlbd_cl_exit();
+        EXIT;
+}
+
+module_init(ptlbd_init);
+module_exit(ptlbd_exit);
+MODULE_LICENSE("GPL");
diff --git a/lustre/ptlbd/rpc.c b/lustre/ptlbd/rpc.c
new file mode 100644 (file)
index 0000000..5ff5177
--- /dev/null
@@ -0,0 +1,550 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+
+#define DEBUG_SUBSYSTEM S_PTLBD
+
+#include <linux/obd_support.h>
+#include <linux/obd_class.h>
+#include <linux/lustre_debug.h>
+#include <linux/lprocfs_status.h>
+#include <linux/obd_ptlbd.h>
+
+/*
+ * Advance the import's last-used xid under imp_lock (interrupts saved and
+ * restored around the update) and return the new value.
+ */
+static __u32 get_next_xid(struct obd_import *imp)
+{
+        unsigned long irqflags;
+        __u32 next;
+
+        spin_lock_irqsave(&imp->imp_lock, irqflags);
+        imp->imp_last_xid++;
+        next = imp->imp_last_xid;
+        spin_unlock_irqrestore(&imp->imp_lock, irqflags);
+
+        return next;
+}
+
+/*
+ * brw-set callback installed by the read/write request builders below.
+ * It does nothing and always returns 0; ptlbd_ptl_ev_hdlr() asserts that
+ * a callback is present before invoking it.
+ */
+static int ptlbd_brw_callback(struct obd_brw_set *set, int phase)
+{
+        ENTRY;
+        RETURN(0);
+}
+
+/*
+ * Work-queue trampoline: data is the bulk descriptor whose reference is
+ * to be dropped.  Scheduled from ptlbd_ptl_ev_hdlr().
+ */
+static void decref_bulk_desc(void *data)
+{
+        ENTRY;
+
+        ptlrpc_bulk_decref((struct ptlrpc_bulk_desc *)data);
+        EXIT;
+}
+
+/*  This is the callback function which is invoked by the Portals
+ *  event handler associated with the bulk_sink queue and bulk_source queue.
+ *
+ *  It runs the descriptor's brw-set callback with CB_PHASE_FINISH and then
+ *  defers dropping the descriptor reference to a work item
+ *  (decref_bulk_desc) rather than doing it directly here.
+ */
+static void ptlbd_ptl_ev_hdlr(struct ptlrpc_bulk_desc *desc)
+{
+        ENTRY;
+
+        LASSERT(desc->bd_brw_set != NULL);
+        LASSERT(desc->bd_brw_set->brw_callback != NULL);
+
+        desc->bd_brw_set->brw_callback(desc->bd_brw_set, CB_PHASE_FINISH);
+
+        /* the decref happens later, from the scheduled work item */
+        prepare_work(&desc->bd_queue, decref_bulk_desc, desc);
+        schedule_work(&desc->bd_queue);
+
+        EXIT;
+}
+
+
+/*
+ * Client side of a write.
+ *
+ * Sends a request describing each buffer head (block number, offset,
+ * length) to the server, waits for the reply, and then pushes the data
+ * with ptlrpc_send_bulk() using the per-buffer xids the server returned
+ * in the reply's niob array.
+ *
+ * ptlbd       device whose import is used for the request
+ * cmd         opcode placed in the request
+ * first_bh    head of the buffer chain to write
+ * page_count  number of buffers in the chain (sizes the niob arrays)
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error from
+ * ptlrpc_queue_wait() / ptlrpc_send_bulk().
+ *
+ * NOTE(review): the chain is walked through b_next here and in
+ * ptlbd_send_req(), while ptlbd_end_request_havelock() in blk.c unlinks
+ * request buffers through b_reqnext -- confirm which link is intended.
+ */
+int ptlbd_write_put_req(struct ptlbd_obd *ptlbd, ptlbd_cmd_t cmd, 
+                struct buffer_head *first_bh, unsigned int page_count)
+{
+        struct obd_import *imp = &ptlbd->bd_import;
+        struct ptlbd_op *op;
+        struct ptlbd_niob *niob, *niobs;
+        struct ptlbd_rsp *rsp;
+        struct ptlrpc_request *req;
+        struct ptlrpc_bulk_desc *desc;
+        struct buffer_head *bh;
+        int rc, size[2];
+        struct obd_brw_set *set;
+        ENTRY;
+
+        size[0] = sizeof(struct ptlbd_op);
+        size[1] = page_count * sizeof(struct ptlbd_niob);
+
+        req = ptlrpc_prep_req(imp, cmd, 2, size, NULL);
+        if (!req)
+                GOTO(out, rc = -ENOMEM);
+        /* XXX might not need these */
+        req->rq_request_portal = PTLBD_REQUEST_PORTAL;
+        req->rq_reply_portal = PTLBD_REPLY_PORTAL;
+
+        op = lustre_msg_buf(req->rq_reqmsg, 0);
+        niobs = lustre_msg_buf(req->rq_reqmsg, 1);
+
+        /* XXX pack */
+        op->op_cmd = cmd;
+        op->op_lun = 0;
+        op->op_niob_cnt = page_count;
+        op->op__padding = 0;
+        op->op_block_cnt = page_count;
+
+        desc = ptlrpc_prep_bulk(imp->imp_connection);
+        if ( desc == NULL )
+                GOTO(out_req, rc = -ENOMEM);
+        desc->bd_portal = PTLBD_BULK_PORTAL;
+        desc->bd_ptl_ev_hdlr = ptlbd_ptl_ev_hdlr;
+
+        /* XXX someone needs to free this */
+        set = obd_brw_set_new();
+        if (set == NULL)
+                GOTO(out_desc, rc = -ENOMEM);
+
+        set->brw_callback = ptlbd_brw_callback;
+#if 0
+        xid = get_next_xid(imp);
+#endif
+
+        /* describe every buffer in the request; the xid is filled in by
+         * the server and read back from the reply below */
+        for ( niob = niobs, bh = first_bh ; bh ; bh = bh->b_next, niob++ ) {
+#if 0
+                struct ptlrpc_bulk_page *bulk = ptlrpc_prep_bulk_page(desc);
+                if (bulk == NULL)
+                        GOTO(out_set, rc = -ENOMEM);
+#endif
+
+#if 0
+                niob->n_xid = xid;
+#endif
+                niob->n_block_nr = bh->b_blocknr;
+                niob->n_offset = bh_offset(bh);
+                niob->n_length = bh->b_size;
+
+
+#if 0
+                bulk->bp_xid = xid;
+                bulk->bp_buf = bh->b_data;
+                bulk->bp_page = bh->b_page;
+                bulk->bp_buflen = bh->b_size;
+#endif
+        }
+
+
+        size[0] = sizeof(struct ptlbd_rsp);
+        size[1] = sizeof(struct ptlbd_niob) * page_count;
+        req->rq_replen = lustre_msg_size(2, size);
+
+        /* XXX find out how we're really supposed to manage levels */
+        req->rq_level = imp->imp_level;
+        rc = ptlrpc_queue_wait(req);
+        /* Without a successful reply there are no server-assigned xids to
+         * read; bail out before touching rq_repmsg.  Nothing has been
+         * added to the set yet, so the normal unwind applies. */
+        if (rc)
+                GOTO(out_set, rc);
+
+        rsp = lustre_msg_buf(req->rq_repmsg, 0);
+
+        niob = lustre_msg_buf(req->rq_repmsg, 1);
+        /* XXX check that op->num matches ours */
+        for ( bh = first_bh ; bh ; bh = bh->b_next, niob++ ) {
+                struct ptlrpc_bulk_page *bulk = ptlrpc_prep_bulk_page(desc);
+                if (bulk == NULL)
+                        GOTO(out_set, rc = -ENOMEM);
+
+                bulk->bp_xid = niob->n_xid;
+                bulk->bp_page = bh->b_page;
+                bulk->bp_buf = bh->b_data;
+                bulk->bp_buflen = bh->b_size;
+        }
+
+        obd_brw_set_add(set, desc);
+        rc = ptlrpc_send_bulk(desc);
+
+        /* if there's an error, no brw_finish called, just like
+         * osc_brw_read */
+
+        /* deliberately skips out_set/out_desc: the descriptor reference is
+         * dropped from ptlbd_ptl_ev_hdlr() via decref_bulk_desc() */
+        GOTO(out_req, rc);
+
+out_set:
+        obd_brw_set_free(set);
+out_desc:
+        ptlrpc_bulk_decref(desc);
+out_req:
+        ptlrpc_req_finished(req);
+out:
+        RETURN(rc);
+}
+
+/*
+ * Client side of a read.
+ *
+ * Builds a request describing each buffer head, prepares one bulk page
+ * per buffer under a single client-chosen xid, registers the bulk
+ * descriptor and then sends the request and waits for the reply.
+ *
+ * ptlbd       device whose import is used for the request
+ * cmd         opcode placed in the request
+ * first_bh    head of the buffer chain to fill
+ * page_count  number of buffers in the chain (sizes the niob array)
+ *
+ * Returns -ENOMEM on allocation failure, the error from
+ * ptlrpc_register_bulk(), or otherwise the result of ptlrpc_queue_wait().
+ */
+int ptlbd_read_put_req(struct ptlbd_obd *ptlbd, ptlbd_cmd_t cmd, 
+                struct buffer_head *first_bh, unsigned int page_count)
+{
+        struct obd_import *imp = &ptlbd->bd_import;
+        struct ptlbd_op *op;
+        struct ptlbd_niob *niob, *niobs;
+        struct ptlbd_rsp *rsp;
+        struct ptlrpc_request *req;
+        struct ptlrpc_bulk_desc *desc;
+        struct buffer_head *bh;
+        int rc, rep_size, size[2];
+        struct obd_brw_set *set;
+        __u32 xid;
+        ENTRY;
+
+        size[0] = sizeof(struct ptlbd_op);
+        size[1] = page_count * sizeof(struct ptlbd_niob);
+
+        req = ptlrpc_prep_req(imp, cmd, 2, size, NULL);
+        if (!req)
+                GOTO(out, rc = -ENOMEM);
+        /* XXX might not need these? */
+        req->rq_request_portal = PTLBD_REQUEST_PORTAL;
+        req->rq_reply_portal = PTLBD_REPLY_PORTAL;
+
+        op = lustre_msg_buf(req->rq_reqmsg, 0);
+        niobs = lustre_msg_buf(req->rq_reqmsg, 1);
+
+        /* XXX pack */
+        op->op_cmd = cmd;
+        op->op_lun = 0;
+        op->op_niob_cnt = page_count;
+        op->op__padding = 0;
+        op->op_block_cnt = page_count;
+
+        desc = ptlrpc_prep_bulk(imp->imp_connection);
+        if ( desc == NULL )
+                GOTO(out_req, rc = -ENOMEM);
+        desc->bd_portal = PTLBD_BULK_PORTAL;
+        desc->bd_ptl_ev_hdlr = ptlbd_ptl_ev_hdlr;
+
+        /* XXX someone needs to free this */
+        set = obd_brw_set_new();
+        if (set == NULL)
+                GOTO(out_desc, rc = -ENOMEM);
+
+        set->brw_callback = ptlbd_brw_callback;
+
+        /* one xid shared by every niob and bulk page of this request */
+        xid = get_next_xid(imp);
+
+        /* NOTE(review): walks b_next, while ptlbd_end_request_havelock()
+         * in blk.c unlinks request buffers through b_reqnext -- confirm
+         * which link is intended. */
+        for ( niob = niobs, bh = first_bh ; bh ; bh = bh->b_next, niob++ ) {
+                struct ptlrpc_bulk_page *bulk = ptlrpc_prep_bulk_page(desc);
+                if (bulk == NULL)
+                        GOTO(out_set, rc = -ENOMEM);
+
+                niob->n_xid = xid;
+                niob->n_block_nr = bh->b_blocknr;
+                niob->n_offset = bh_offset(bh);
+                niob->n_length = bh->b_size;
+
+                bulk->bp_xid = xid;
+                bulk->bp_buf = bh->b_data;
+                bulk->bp_page = bh->b_page;
+                bulk->bp_buflen = bh->b_size;
+        }
+
+        /* XXX put in OBD_FAIL_CHECK for ptlbd? */
+        rc = ptlrpc_register_bulk(desc);
+        if (rc)
+                GOTO(out_set, rc);
+
+        obd_brw_set_add(set, desc);
+
+        rep_size = sizeof(struct ptlbd_rsp);
+        req->rq_replen = lustre_msg_size(1, &rep_size);
+
+        /* XXX find out how we're really supposed to manage levels */
+        req->rq_level = imp->imp_level;
+        rc = ptlrpc_queue_wait(req);
+
+        /* NOTE(review): rc is not checked before rq_repmsg is read, and
+         * rsp is never used afterwards -- confirm this is safe when the
+         * request failed. */
+        rsp = lustre_msg_buf(req->rq_repmsg, 0);
+
+        /* if there's an error, no brw_finish called, just like
+         * osc_brw_read */
+
+        /* skips out_set/out_desc: once the descriptor is registered its
+         * reference is dropped from ptlbd_ptl_ev_hdlr() */
+        GOTO(out_req, rc);
+
+out_set:
+        obd_brw_set_free(set);
+out_desc:
+        ptlrpc_bulk_decref(desc);
+out_req:
+        ptlrpc_req_finished(req);
+out:
+        RETURN(rc);
+}
+
+/*
+ * Count the buffers chained from first_bh and dispatch to the read or
+ * write request builder according to cmd.
+ *
+ * Returns the builder's result, or -EINVAL for any other cmd.
+ */
+int ptlbd_send_req(struct ptlbd_obd *ptlbd, ptlbd_cmd_t cmd, 
+                struct buffer_head *first_bh)
+{
+        unsigned int page_count = 0;
+        struct buffer_head *bh;
+        int rc;
+        ENTRY;
+
+        /* walk the chain once to size the request */
+        bh = first_bh;
+        while ( bh ) {
+                page_count++;
+                bh = bh->b_next;
+        }
+
+        if ( cmd == PTLBD_READ )
+                rc = ptlbd_read_put_req(ptlbd, cmd, first_bh, page_count);
+        else if ( cmd == PTLBD_WRITE )
+                rc = ptlbd_write_put_req(ptlbd, cmd, first_bh, page_count);
+        else
+                rc = -EINVAL;
+
+        RETURN(rc);
+}
+
+/*
+ * Timeout callback for a bulk wait: logs the timeout and returns 1.
+ * data is unused (the descriptor cast is commented out).
+ */
+static int ptlbd_bulk_timeout(void *data)
+{
+/*        struct ptlrpc_bulk_desc *desc = data;*/
+        ENTRY;
+
+        CERROR("ugh, timed out\n");
+
+        RETURN(1);
+}
+
+/* Stand-in backing store: one lazily allocated page per block number. */
+#define SILLY_MAX 2048
+static struct page *pages[SILLY_MAX] = {NULL,};
+
+/*
+ * Return the page backing block_nr, allocating it on first use.
+ *
+ * Returns NULL when block_nr is outside [0, SILLY_MAX) or when the page
+ * cannot be allocated; a failed allocation is not cached, so a later call
+ * tries again.
+ */
+static struct page * fake_page(int block_nr)
+{
+        if ( block_nr < 0 || block_nr >= SILLY_MAX )
+                return NULL;
+
+        if (pages[block_nr] == NULL) {
+                void *vaddr = (void *)get_free_page(GFP_KERNEL);
+                /* don't translate (and cache) a failed allocation */
+                if (vaddr == NULL)
+                        return NULL;
+                pages[block_nr] = virt_to_page(vaddr);
+        } 
+        return pages[block_nr];
+}
+
+/*
+ * Service a PTLBD_WRITE request.
+ *
+ * Packs a two-buffer reply (a ptlbd_rsp plus one ptlbd_niob per request
+ * niob), builds a bulk descriptor on PTLBD_BULK_PORTAL with one bulk page
+ * per niob (all tagged with a single xid taken from the service's srv_xid
+ * counter), registers it, sends the reply, and then waits up to obd_timeout
+ * seconds for PTL_BULK_FL_RCVD to be set on the descriptor.
+ *
+ * Returns the result of the final wait, or the error from reply packing,
+ * descriptor/page allocation (-ENOMEM), an unbackable block (-EINVAL) or
+ * bulk registration.
+ */
+static int ptlbd_put_write(struct ptlrpc_request *req)
+{
+        struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg;
+        struct ptlbd_niob *reply_niob, *request_niob;
+        struct ptlbd_rsp *rsp;
+        struct ptlrpc_bulk_desc *desc;
+        struct ptlrpc_service *srv;
+        struct l_wait_info lwi;
+        int size[2];
+        int i, page_count, rc;
+        __u32 xid;
+        ENTRY;
+
+        request_niob = lustre_msg_buf(req->rq_reqmsg, 1);
+        page_count = req->rq_reqmsg->buflens[1] / sizeof(struct ptlbd_niob);
+
+        size[0] = sizeof(struct ptlbd_rsp);
+        size[1] = sizeof(struct ptlbd_niob) * page_count;
+        rc = lustre_pack_msg(2, size, NULL, &req->rq_replen, &req->rq_repmsg);
+        if (rc)
+                GOTO(out, rc);
+        reply_niob = lustre_msg_buf(req->rq_repmsg, 1);
+
+        desc = ptlrpc_prep_bulk(req->rq_connection);
+        if (desc == NULL)
+                GOTO(out, rc = -ENOMEM);
+        desc->bd_ptl_ev_hdlr = NULL;
+        desc->bd_portal = PTLBD_BULK_PORTAL;
+        /* NOTE(review): this copies the value of the local pointer (sizeof
+         * a pointer), not the handle it points at -- confirm the intent */
+        memcpy(&(desc->bd_conn), &conn, sizeof(conn)); /* XXX what? */
+
+        srv = req->rq_obd->u.ptlbd.ptlbd_service;
+        spin_lock(&srv->srv_lock);
+        xid = srv->srv_xid++;                   /* single xid for all pages */
+        spin_unlock(&srv->srv_lock);
+
+        for ( i = 0; i < page_count; i++) {
+                struct ptlrpc_bulk_page *bulk = ptlrpc_prep_bulk_page(desc);
+                if (bulk == NULL)
+                        GOTO(out_desc, rc = -ENOMEM);
+
+                /* echo the request niob back, stamped with the bulk xid */
+                reply_niob[i] = request_niob[i];
+                reply_niob[i].n_xid = xid;
+
+                bulk->bp_xid = xid;
+                bulk->bp_page = fake_page(request_niob[i].n_block_nr);
+                /* fake_page() returns NULL for blocks it cannot back;
+                 * page_address() must never see that */
+                if (bulk->bp_page == NULL)
+                        GOTO(out_desc, rc = -EINVAL);
+                bulk->bp_buf = page_address(bulk->bp_page);
+                bulk->bp_buflen = request_niob[i].n_length;
+        }
+
+        rc = ptlrpc_register_bulk(desc);
+        if ( rc )
+                GOTO(out_desc, rc);
+
+        /* the status belongs in the reply message, whose buffer 0 was sized
+         * for a struct ptlbd_rsp above -- not in the request buffer */
+        rsp = lustre_msg_buf(req->rq_repmsg, 0);
+        rsp->r_status = 42;
+        rsp->r_error_cnt = 13;
+        ptlrpc_reply(req->rq_svc, req);
+
+        /* this synchronization probably isn't good enough */
+        lwi = LWI_TIMEOUT(obd_timeout * HZ, ptlbd_bulk_timeout, desc);
+        rc = l_wait_event(desc->bd_waitq, desc->bd_flags &PTL_BULK_FL_RCVD,
+                        &lwi);
+
+out_desc:
+        ptlrpc_free_bulk(desc);
+out:
+        RETURN(rc);
+}
+
+/*
+ * Service a PTLBD_READ request.
+ *
+ * Builds a bulk descriptor on PTLBD_BULK_PORTAL with one bulk page per
+ * request niob (each tagged with the xid carried in that niob), sends it,
+ * waits up to obd_timeout seconds for PTL_BULK_FL_SENT, then packs and sends
+ * a single-buffer ptlbd_rsp reply.
+ *
+ * The bulk descriptor is released on every path once it has been allocated.
+ * Returns the result of reply packing on the normal path, or the error from
+ * descriptor/page allocation (-ENOMEM), an unbackable block (-EINVAL), the
+ * bulk send, or a missing reply buffer (-EINVAL).
+ */
+static int ptlbd_put_read(struct ptlrpc_request *req)
+{
+        struct ptlbd_niob *niob, *niobs;
+        struct ptlbd_rsp *rsp;
+        struct ptlrpc_bulk_desc *desc;
+        struct l_wait_info lwi;
+        int size[1];
+        int i, page_count, rc;
+        ENTRY;
+
+        niobs = lustre_msg_buf(req->rq_reqmsg, 1);
+        page_count = req->rq_reqmsg->buflens[1] / sizeof(struct ptlbd_niob);
+
+        desc = ptlrpc_prep_bulk(req->rq_connection);
+        if (desc == NULL)
+                GOTO(out, rc = -ENOMEM);
+        desc->bd_portal = PTLBD_BULK_PORTAL;
+
+        for ( i = 0, niob = niobs ; i < page_count; niob++, i++) {
+                struct ptlrpc_bulk_page *bulk = ptlrpc_prep_bulk_page(desc);
+                if (bulk == NULL)
+                        GOTO(out_bulk, rc = -ENOMEM);
+
+                /*
+                 * XXX what about the block number?
+                 */
+                bulk->bp_xid = niob->n_xid;
+                bulk->bp_page = fake_page(niob->n_block_nr);
+                /* fake_page() returns NULL for blocks it cannot back;
+                 * page_address() must never see that */
+                if (bulk->bp_page == NULL)
+                        GOTO(out_bulk, rc = -EINVAL);
+                bulk->bp_buf = page_address(bulk->bp_page);
+                bulk->bp_buflen = niob->n_length;
+        }
+
+        rc = ptlrpc_send_bulk(desc);
+        if ( rc )
+                GOTO(out_bulk, rc);
+
+        /* this synchronization probably isn't good enough */
+        /* NOTE(review): the wait result is overwritten by lustre_pack_msg()
+         * just below, so a bulk timeout is not reflected in the return
+         * value -- confirm whether that is intended */
+        lwi = LWI_TIMEOUT(obd_timeout * HZ, ptlbd_bulk_timeout, desc);
+        rc = l_wait_event(desc->bd_waitq, desc->bd_flags &PTL_BULK_FL_SENT,
+                        &lwi);
+
+        /* from here on desc is still owned by us: every failure must leave
+         * through out_bulk, otherwise the descriptor is leaked */
+        size[0] = sizeof(struct ptlbd_rsp);
+        rc = lustre_pack_msg(1, size, NULL, &req->rq_replen, &req->rq_repmsg);
+        if ( rc )
+                GOTO(out_bulk, rc);
+
+        rsp = lustre_msg_buf(req->rq_repmsg, 0);
+        if ( rsp == NULL )
+                GOTO(out_bulk, rc = -EINVAL);
+
+        rsp->r_error_cnt = 42;
+        rsp->r_status = 69;
+
+        req->rq_status = 0; /* XXX */
+        ptlrpc_reply(req->rq_svc, req);
+
+out_bulk:
+        ptlrpc_free_bulk(desc);
+out:
+        RETURN(rc);
+}
+
+
+/*
+ * Request handler for the ptlbd service: unpack the incoming message and
+ * dispatch on the op_cmd field of its first buffer.
+ *
+ * Returns the unpack error if the message cannot be unpacked, -EINVAL if
+ * the message has no first buffer, and 0 otherwise.  The results of the
+ * read/write handlers are not propagated, and an unknown command is only
+ * logged.
+ */
+int ptlbd_parse_req(struct ptlrpc_request *req)
+{
+        struct ptlbd_op *op;
+        int rc;
+        ENTRY;
+
+        rc = lustre_unpack_msg(req->rq_reqmsg, req->rq_reqlen);
+        if ( rc )
+                RETURN(rc);
+
+        op = lustre_msg_buf(req->rq_reqmsg, 0);
+        /* ptlbd_put_read() already treats lustre_msg_buf() as able to
+         * return NULL; do the same before dereferencing op */
+        if ( op == NULL )
+                RETURN(-EINVAL);
+
+        switch(op->op_cmd) {
+                case PTLBD_READ:
+                        ptlbd_put_read(req);
+                        break;
+                case PTLBD_WRITE:
+                        ptlbd_put_write(req);
+                        break;
+                default:
+                        CERROR("fix this %d\n", op->op_cmd);
+                        break;
+        }
+
+        RETURN(0);
+}
+
+
+#if 0
+/*
+ * Compiled-out alternative request path: builds a brw_page array from a
+ * buffer_head chain linked through b_reqnext, submits it with obd_brw()
+ * and then calls ll_brw_sync_wait() on the set.
+ *
+ * NOTE(review): num_pages is not declared in this function; presumably it
+ * should be the count left in i by the fill loop -- verify before
+ * re-enabling this code.
+ */
+int ptlbd_bh_req(int cmd, struct ptlbd_state *st, struct buffer_head *first_bh)
+{
+        struct obd_brw_set *set = NULL;
+        struct brw_page *pg = NULL;
+        struct buffer_head *bh;
+        int rc, i, pg_bytes = 0;
+        ENTRY;
+
+        /* size the brw_page array: one entry per buffer head */
+        for ( bh = first_bh ; bh ; bh = bh->b_reqnext ) 
+                pg_bytes += sizeof(struct brw_page);
+
+        OBD_ALLOC(pg, pg_bytes);
+        if ( pg == NULL )
+                GOTO(out, rc = -ENOMEM);
+
+        set = obd_brw_set_new();
+        if (set == NULL)
+                GOTO(out, rc = -ENOMEM);
+
+        /* describe each buffer head's page, offset and size */
+        for ( i = 0, bh = first_bh ; bh ; bh = bh->b_reqnext, i++) {
+                pg[i].pg = bh->b_page;
+                pg[i].off = bh_offset(bh);
+                pg[i].count = bh->b_size;
+                pg[i].flag = 0;
+        }
+
+        set->brw_callback = ll_brw_sync_wait;
+        rc = obd_brw(cmd, /* lsm */NULL, num_pages, pg, set);
+        if ( rc )
+                GOTO(out, rc);
+
+        rc = ll_brw_sync_wait(set, CB_PHASE_START);
+        if (rc)
+                CERROR("error from callback: rc = %d\n", rc);
+
+out:
+        /* both resources are released on success and failure alike */
+        if ( pg != NULL )
+                OBD_FREE(pg, pg_bytes);
+        if ( set != NULL )
+                obd_brw_set_free(set);
+
+        RETURN(rc); 
+}
+#endif
diff --git a/lustre/ptlbd/server.c b/lustre/ptlbd/server.c
new file mode 100644 (file)
index 0000000..422f0e1
--- /dev/null
@@ -0,0 +1,154 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+
+#define DEBUG_SUBSYSTEM S_PTLBD
+
+#include <linux/obd_support.h>
+#include <linux/obd_class.h>
+#include <linux/lustre_debug.h>
+#include <linux/lprocfs_status.h>
+#include <linux/obd_ptlbd.h>
+
+#if 0
+/*
+ * Compiled-out request callback.  It calls ptlbd_parse_request() and then
+ * lustre_unpack_msg(); the first result is overwritten by the second
+ * before it is examined.  On successful unpack it prints the request
+ * opcode.
+ */
+static int ptlbd_sv_callback(struct ptlrpc_request *req)
+{
+        int rc;
+        ENTRY;
+
+        rc = ptlbd_parse_request(req);
+
+        rc = lustre_unpack_msg(req->rq_reqmsg, req->rq_reqlen);
+        if ( rc )
+                GOTO(out, rc);
+
+        printk("callback got a friggin opc %d\n", req->rq_reqmsg->opc);
+
+out:
+        RETURN(rc);
+}
+#endif
+
+/* Nonzero while the ptlbd service is set up: set by ptlbd_sv_setup() and
+ * cleared by ptlbd_sv_cleanup().  Nothing has been set up at load time, so
+ * it starts out as 0. */
+static int ptlbd_sv_already_setup = 0;
+
+/*
+ * obd setup method for the ptlbd server: create the ptlrpc service that
+ * dispatches incoming requests to ptlbd_parse_req() and start one service
+ * thread for it.  The len/buf arguments are only used by compiled-out code.
+ *
+ * Returns 0 on success, -ENOMEM if the service cannot be created, or the
+ * error from ptlrpc_start_thread() (after stopping threads and
+ * unregistering the service again).
+ */
+static int ptlbd_sv_setup(struct obd_device *obddev, obd_count len, void *buf)
+{
+#if 0
+        struct obd_ioctl_data* data = buf;
+        obd_uuid_t server_uuid;
+#endif
+        struct ptlbd_obd *ptlbd = &obddev->u.ptlbd;
+        int rc;
+        ENTRY;
+
+#if 0
+        if (data->ioc_inllen1 < 1) {
+                CERROR("requires a PTLBD server UUID\n");
+                RETURN(rc = -EINVAL);
+        }
+
+        if (data->ioc_inllen1 > 37) {
+                CERROR("PTLBD server UUID must be less than 38 characters\n");
+                RETURN(rc = -EINVAL);
+        }
+
+        memcpy(server_uuid, data->ioc_inlbuf1, MIN(data->ioc_inllen1,
+                                                   sizeof(server_uuid)));
+
+#endif
+        ptlbd->ptlbd_service =
+                ptlrpc_init_svc(PTLBD_NEVENTS, PTLBD_NBUFS, PTLBD_BUFSIZE,
+                                PTLBD_MAXREQSIZE, PTLBD_REQUEST_PORTAL,
+                                PTLBD_REPLY_PORTAL, "self",
+                                ptlbd_parse_req, "ptlbd_sv");
+
+        if (!ptlbd->ptlbd_service) {
+                CERROR("failed to start service\n");
+                RETURN(rc = -ENOMEM);
+        }
+
+        /* NOTE(review): the thread name "ptldb" may be a typo for "ptlbd";
+         * left untouched because the name is visible at runtime */
+        rc = ptlrpc_start_thread(obddev, ptlbd->ptlbd_service, "ptldb");
+        if (rc) {
+                CERROR("cannot start PTLBD thread: rc %d\n", rc);
+                LBUG();
+                GOTO(out_thread, rc);
+        }
+
+        ptlbd_sv_already_setup = 1;
+
+        RETURN(0);
+
+ out_thread:
+        ptlrpc_stop_all_threads(ptlbd->ptlbd_service);
+        ptlrpc_unregister_service(ptlbd->ptlbd_service);
+
+        /* leave through RETURN() so the ENTRY above stays balanced */
+        RETURN(rc);
+}
+
+/*
+ * obd cleanup method for the ptlbd server: stop all threads of the device's
+ * ptlrpc service, unregister the service and clear the setup flag.
+ * Always returns 0.
+ */
+static int ptlbd_sv_cleanup(struct obd_device *obddev)
+{
+        struct ptlrpc_service *svc = obddev->u.ptlbd.ptlbd_service;
+        ENTRY;
+
+        /* XXX check for state */
+
+        ptlrpc_stop_all_threads(svc);
+        ptlrpc_unregister_service(svc);
+        ptlbd_sv_already_setup = 0;
+
+        RETURN(0);
+}
+
+#if 0
+/*
+ * Compiled-out connect method: forwards conn, src and cluuid to
+ * class_connect() and ignores the recovd and recover arguments.
+ */
+static int ptlbd_sv_connect(struct lustre_handle *conn, struct obd_device *src,
+                        obd_uuid_t cluuid, struct recovd_obd *recovd,
+                        ptlrpc_recovery_cb_t recover)
+{
+        return class_connect(conn, src, cluuid);
+}
+#endif
+
+/*
+ * Method table for the ptlbd server obd type.  Only the owner, setup and
+ * cleanup entries are populated; iocontrol is commented out and
+ * connect/disconnect are compiled out.
+ */
+static struct obd_ops ptlbd_sv_obd_ops = {
+        o_owner:        THIS_MODULE,
+/*        o_iocontrol:    ptlbd_iocontrol,*/
+        o_setup:        ptlbd_sv_setup,
+        o_cleanup:      ptlbd_sv_cleanup,
+#if 0
+        o_connect:      ptlbd_sv_connect,
+        o_disconnect:   class_disconnect
+#endif
+};
+
+/*
+ * Register the ptlbd server obd type (its method table plus the shared
+ * status_class_var lprocfs variables) under OBD_PTLBD_SV_DEVICENAME.
+ * Returns whatever class_register_type() returns.
+ */
+int ptlbd_sv_init(void)
+{
+        extern struct lprocfs_vars status_class_var[];
+        int rc;
+
+        rc = class_register_type(&ptlbd_sv_obd_ops, status_class_var,
+                                 OBD_PTLBD_SV_DEVICENAME);
+        return rc;
+}
+
+/* Unregister the obd type registered under OBD_PTLBD_SV_DEVICENAME. */
+void ptlbd_sv_exit(void)
+{
+        class_unregister_type(OBD_PTLBD_SV_DEVICENAME);
+}
index ccaa108..1d6c719 100644 (file)
@@ -259,6 +259,7 @@ struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, int opcode,
 {
         struct ptlrpc_connection *conn;
         struct ptlrpc_request *request;
+        unsigned long flags;
         int rc;
         ENTRY;
 
@@ -292,9 +293,9 @@ struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, int opcode,
         INIT_LIST_HEAD(&request->rq_list);
         atomic_set(&request->rq_refcount, 1);
 
-        spin_lock(&imp->imp_lock);
+        spin_lock_irqsave(&imp->imp_lock, flags);
         request->rq_xid = HTON__u32(++imp->imp_last_xid);
-        spin_unlock(&imp->imp_lock);
+        spin_unlock_irqrestore(&imp->imp_lock, flags);
 
         request->rq_reqmsg->magic = PTLRPC_MSG_MAGIC;
         request->rq_reqmsg->version = PTLRPC_MSG_VERSION;
@@ -318,7 +319,7 @@ static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked)
                        request, request->rq_reqmsg->opc,
                        request->rq_connection->c_remote_uuid,
                        request->rq_import->imp_client->cli_request_portal,
-                       request->rq_refcount);
+                       atomic_read (&request->rq_refcount));
                 /* LBUG(); */
         }
 
@@ -333,11 +334,13 @@ static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked)
         }
 
         if (request->rq_import) {
+                unsigned long flags = 0;
                 if (!locked)
-                        spin_lock(&request->rq_import->imp_lock);
+                        spin_lock_irqsave(&request->rq_import->imp_lock, flags);
                 list_del_init(&request->rq_list);
                 if (!locked)
-                        spin_unlock(&request->rq_import->imp_lock);
+                        spin_unlock_irqrestore(&request->rq_import->imp_lock,
+                                               flags);
         }
 
         ptlrpc_put_connection(request->rq_connection);
@@ -356,6 +359,12 @@ static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked)
         if (request == NULL)
                 RETURN(1);
 
+        if (request == (void *)(long)(0x5a5a5a5a5a5a5a5a)) {
+                CERROR("dereferencing freed request (bug 575)\n");
+                LBUG();
+                RETURN(1);
+        }
+
         DEBUG_REQ(D_INFO, request, "refcount now %u",
                   atomic_read(&request->rq_refcount) - 1);
 
@@ -379,6 +388,8 @@ static int ptlrpc_check_reply(struct ptlrpc_request *req)
         ENTRY;
         if (req->rq_repmsg != NULL) {
                 req->rq_transno = NTOH__u64(req->rq_repmsg->transno);
+                /* Store transno in reqmsg for replay. */
+                req->rq_reqmsg->transno = req->rq_repmsg->transno;
                 req->rq_flags |= PTL_RPC_FL_REPLIED;
                 GOTO(out, rc = 1);
         }
@@ -412,7 +423,7 @@ static int ptlrpc_check_status(struct ptlrpc_request *req)
 
         err = req->rq_repmsg->status;
         if (req->rq_repmsg->type == NTOH__u32(PTL_RPC_MSG_ERR)) {
-                DEBUG_REQ(D_ERROR, req, "type == PTL_RPC_MSG_ERR (%d)\n", err);
+                DEBUG_REQ(D_ERROR, req, "type == PTL_RPC_MSG_ERR (%d)", err);
                 RETURN(err ? err : -EINVAL);
         }
 
@@ -438,10 +449,13 @@ static int ptlrpc_abort(struct ptlrpc_request *request)
 {
         /* First remove the ME for the reply; in theory, this means
          * that we can tear down the buffer safely. */
-        PtlMEUnlink(request->rq_reply_me_h);
+        if (PtlMEUnlink(request->rq_reply_me_h) != PTL_OK)
+                RETURN(0);
         OBD_FREE(request->rq_reply_md.start, request->rq_replen);
+
+        memset(&request->rq_reply_me_h, 0, sizeof(request->rq_reply_me_h));
+        request->rq_reply_md.start = NULL;
         request->rq_repmsg = NULL;
-        request->rq_replen = 0;
         return 0;
 }
 
@@ -487,11 +501,12 @@ void ptlrpc_cleanup_client(struct obd_import *imp)
         struct list_head *tmp, *saved;
         struct ptlrpc_request *req;
         struct ptlrpc_connection *conn = imp->imp_connection;
+        unsigned long flags;
         ENTRY;
 
         LASSERT(conn);
 
-        spin_lock(&imp->imp_lock);
+        spin_lock_irqsave(&imp->imp_lock, flags);
         list_for_each_safe(tmp, saved, &imp->imp_replay_list) {
                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
 
@@ -501,7 +516,7 @@ void ptlrpc_cleanup_client(struct obd_import *imp)
                 req->rq_import = NULL;
                 __ptlrpc_req_finished(req, 0);
         }
-        spin_unlock(&imp->imp_lock);
+        spin_unlock_irqrestore(&imp->imp_lock, flags);
         
         EXIT;
         return;
@@ -554,6 +569,7 @@ static int expired_request(void *data)
         }
 
         DEBUG_REQ(D_ERROR, req, "timeout");
+        ptlrpc_abort(req);
         req->rq_flags |= PTL_RPC_FL_TIMEOUT;
 
         if (!req->rq_import) {
@@ -571,16 +587,13 @@ static int expired_request(void *data)
         if (!req->rq_import->imp_connection->c_recovd_data.rd_recovd)
                 RETURN(1);
 
-        req->rq_timeout = 0;
         recovd_conn_fail(req->rq_import->imp_connection);
 
-#if 0
         /* If this request is for recovery or other primordial tasks,
          * don't go back to sleep.
          */
         if (req->rq_level < LUSTRE_CONN_FULL)
                 RETURN(1);
-#endif
         RETURN(0);
 }
 
@@ -592,24 +605,13 @@ static int interrupted_request(void *data)
         RETURN(1); /* ignored, as of this writing */
 }
 
-/* If the import has been invalidated (such as by an OST failure), the
- * request must fail with -EIO.
- *
- * Must be called with imp_lock held, will drop it if it returns -EIO.
- */
-#define EIO_IF_INVALID(req)                                                   \
-if (req->rq_import->imp_flags & IMP_INVALID) {                                \
-        DEBUG_REQ(D_ERROR, req, "IMP_INVALID:");                              \
-        spin_unlock(&imp->imp_lock);                                          \
-        RETURN(-EIO);                                                         \
-}
-
 int ptlrpc_queue_wait(struct ptlrpc_request *req)
 {
         int rc = 0;
         struct l_wait_info lwi;
         struct obd_import *imp = req->rq_import;
         struct ptlrpc_connection *conn = imp->imp_connection;
+        unsigned int flags;
         ENTRY;
 
         init_waitqueue_head(&req->rq_wait_for_rep);
@@ -620,12 +622,22 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req)
                NTOH__u32(req->rq_reqmsg->status), req->rq_xid,
                conn->c_peer.peer_nid, NTOH__u32(req->rq_reqmsg->opc));
 
-        spin_lock(&imp->imp_lock);
-        EIO_IF_INVALID(req);
+        spin_lock_irqsave(&imp->imp_lock, flags);
+
+        /* 
+         * If the import has been invalidated (such as by an OST failure), the
+         * request must fail with -EIO.
+         */
+        if (req->rq_import->imp_flags & IMP_INVALID) {
+                DEBUG_REQ(D_ERROR, req, "IMP_INVALID:");
+                spin_unlock_irqrestore(&imp->imp_lock, flags);
+                RETURN(-EIO);
+        }
+
         if (req->rq_level > imp->imp_level) {
                 list_del(&req->rq_list);
                 list_add_tail(&req->rq_list, &imp->imp_delayed_list);
-                spin_unlock(&imp->imp_lock);
+                spin_unlock_irqrestore(&imp->imp_lock, flags);
 
                 DEBUG_REQ(D_HA, req, "\"%s\" waiting for recovery: (%d < %d)",
                           current->comm, req->rq_level, imp->imp_level);
@@ -634,15 +646,16 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req)
                                   (req->rq_level <= imp->imp_level) ||
                                   (req->rq_flags & PTL_RPC_FL_ERR), &lwi);
 
-                spin_lock(&imp->imp_lock);
+                spin_lock_irqsave(&imp->imp_lock, flags);
                 list_del_init(&req->rq_list);
-                spin_unlock(&imp->imp_lock);
 
                 if (req->rq_flags & PTL_RPC_FL_ERR)
-                        RETURN(-EIO);
+                        rc = -EIO;
 
-                if (rc)
+                if (rc) {
+                        spin_unlock_irqrestore(&imp->imp_lock, flags);
                         RETURN(rc);
+                }
 
                 CERROR("process %d resumed\n", current->pid);
         }
@@ -650,7 +663,7 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req)
 
         LASSERT(list_empty(&req->rq_list));
         list_add_tail(&req->rq_list, &imp->imp_sending_list);
-        spin_unlock(&imp->imp_lock);
+        spin_unlock_irqrestore(&imp->imp_lock, flags);
         rc = ptl_send_rpc(req);
         if (rc) {
                 CDEBUG(D_HA, "error %d, opcode %d, need recovery\n", rc,
@@ -660,15 +673,15 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req)
                                        interrupted_request, req);
         } else {
                 DEBUG_REQ(D_NET, req, "-- sleeping");
-                lwi = LWI_TIMEOUT_INTR(req->rq_timeout * HZ, expired_request,
+                lwi = LWI_TIMEOUT_INTR(obd_timeout * HZ, expired_request,
                                        interrupted_request, req);
         }
         l_wait_event(req->rq_wait_for_rep, ptlrpc_check_reply(req), &lwi);
         DEBUG_REQ(D_NET, req, "-- done sleeping");
 
-        spin_lock(&imp->imp_lock);
+        spin_lock_irqsave(&imp->imp_lock, flags);
         list_del_init(&req->rq_list);
-        spin_unlock(&imp->imp_lock);
+        spin_unlock_irqrestore(&imp->imp_lock, flags);
 
         if (req->rq_flags & PTL_RPC_FL_ERR) {
                 ptlrpc_abort(req);
@@ -681,6 +694,7 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req)
                 req->rq_flags &= ~PTL_RPC_FL_RESEND;
                 lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT);
                 DEBUG_REQ(D_HA, req, "resending: ");
+                spin_lock_irqsave(&imp->imp_lock, flags);
                 goto resend;
         }
 
@@ -713,11 +727,11 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req)
                 GOTO(out, rc = -EINVAL);
         }
 #endif
-        DEBUG_REQ(D_NET, req, "status %d\n", req->rq_repmsg->status);
+        DEBUG_REQ(D_NET, req, "status %d", req->rq_repmsg->status);
 
         /* We're a rejected connection, need to invalidate and rebuild. */
         if (req->rq_repmsg->status == -ENOTCONN) {
-                spin_lock(&imp->imp_lock);
+                spin_lock_irqsave(&imp->imp_lock, flags);
                 /* If someone else is reconnecting us (CONN_RECOVD) or has
                  * already completed it (handle mismatch), then we just need
                  * to get out.
@@ -725,20 +739,23 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req)
                 if (imp->imp_level == LUSTRE_CONN_RECOVD ||
                     imp->imp_handle.addr != req->rq_reqmsg->addr ||
                     imp->imp_handle.cookie != req->rq_reqmsg->cookie) {
-                        spin_unlock(&imp->imp_lock);
+                        spin_unlock_irqrestore(&imp->imp_lock, flags);
                         GOTO(out, rc = -EIO);
                 }
                 imp->imp_level = LUSTRE_CONN_RECOVD;
-                spin_unlock(&imp->imp_lock);
+                spin_unlock_irqrestore(&imp->imp_lock, flags);
                 rc = imp->imp_recover(imp, PTLRPC_RECOVD_PHASE_NOTCONN);
                 if (rc)
                         LBUG();
                 GOTO(out, rc = -EIO);
         }
 
+        rc = ptlrpc_check_status(req);
+
         if (req->rq_import->imp_flags & IMP_REPLAYABLE) {
-                spin_lock(&imp->imp_lock);
-                if (req->rq_flags & PTL_RPC_FL_REPLAY || req->rq_transno != 0) {
+                spin_lock_irqsave(&imp->imp_lock, flags);
+                if ((req->rq_flags & PTL_RPC_FL_REPLAY || req->rq_transno != 0)
+                    && rc >= 0) {
                         /* Balanced in ptlrpc_free_committed, usually. */
                         atomic_inc(&req->rq_refcount);
                         list_add_tail(&req->rq_list, &imp->imp_replay_list);
@@ -758,18 +775,14 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req)
                 imp->imp_peer_committed_transno =
                         req->rq_repmsg->last_committed;
                 ptlrpc_free_committed(imp);
-                spin_unlock(&imp->imp_lock);
+                spin_unlock_irqrestore(&imp->imp_lock, flags);
         }
 
-        rc = ptlrpc_check_status(req);
-
         EXIT;
  out:
         return rc;
 }
 
-#undef EIO_IF_INVALID
-
 int ptlrpc_replay_req(struct ptlrpc_request *req)
 {
         int rc = 0, old_level, old_status = 0;
@@ -780,7 +793,6 @@ int ptlrpc_replay_req(struct ptlrpc_request *req)
         init_waitqueue_head(&req->rq_wait_for_rep);
         DEBUG_REQ(D_NET, req, "");
 
-        req->rq_timeout = obd_timeout;
         req->rq_reqmsg->addr = req->rq_import->imp_handle.addr;
         req->rq_reqmsg->cookie = req->rq_import->imp_handle.cookie;
 
@@ -837,15 +849,16 @@ int ptlrpc_replay_req(struct ptlrpc_request *req)
 /* XXX looks a lot like super.c:invalidate_request_list, don't it? */
 void ptlrpc_abort_inflight(struct obd_import *imp)
 {
+        unsigned long flags;
         struct list_head *tmp, *n;
 
         /* Make sure that no new requests get processed for this import.
          * ptlrpc_queue_wait must (and does) hold imp_lock while testing this
          * flag and then putting requests on sending_list or delayed_list.
          */
-        spin_lock(&imp->imp_lock);
+        spin_lock_irqsave(&imp->imp_lock, flags);
         imp->imp_flags |= IMP_INVALID;
-        spin_unlock(&imp->imp_lock);
+        spin_unlock_irqrestore(&imp->imp_lock, flags);
 
         list_for_each_safe(tmp, n, &imp->imp_sending_list) {
                 struct ptlrpc_request *req =
index 5cbdbc5..1d6284e 100644 (file)
@@ -148,6 +148,14 @@ int ptlrpc_send_bulk(struct ptlrpc_bulk_desc *desc)
 
                 iov[desc->bd_md.niov].iov_base = bulk->bp_buf;
                 iov[desc->bd_md.niov].iov_len = bulk->bp_buflen;
+                if (iov[desc->bd_md.niov].iov_len <= 0) {
+                        CERROR("bad bp_buflen[%d] @ %p: %d\n", desc->bd_md.niov,
+                               bulk->bp_buf, bulk->bp_buflen);
+                        CERROR("desc: xid %u, pages %d, ptl %d, ref %d\n",
+                               xid, desc->bd_page_count, desc->bd_portal,
+                               atomic_read(&desc->bd_refcount));
+                        LBUG();
+                }
                 desc->bd_md.niov++;
                 desc->bd_md.length += bulk->bp_buflen;
         }
@@ -384,22 +392,20 @@ int ptl_send_rpc(struct ptlrpc_request *request)
         /* add a ref, which will be balanced in request_out_callback */
         atomic_inc(&request->rq_refcount);
         if (request->rq_replen != 0) {
-                /* request->rq_repmsg is set only when the reply comes in, in
-                 * client_packet_callback() */
-                if (request->rq_reply_md.start) {
+                if (request->rq_reply_md.start != NULL) {
                         rc = PtlMEUnlink(request->rq_reply_me_h);
-                        LASSERT (rc == PTL_OK);
-                        OBD_FREE(request->rq_reply_md.start,
-                                 request->rq_replen);
-                        /* If we're resending, rq_repmsg needs to be NULLed out
-                         * again so that ptlrpc_check_reply doesn't trip early.
-                         */
+                        if (rc != PTL_OK && rc != PTL_INV_ME) {
+                                CERROR("rc %d\n", rc);
+                                LBUG();
+                        }
+                        repbuf = (char *)request->rq_reply_md.start;
                         request->rq_repmsg = NULL;
-                }
-                OBD_ALLOC(repbuf, request->rq_replen);
-                if (!repbuf) {
-                        LBUG();
-                        RETURN(ENOMEM);
+                } else {
+                        OBD_ALLOC(repbuf, request->rq_replen);
+                        if (!repbuf) {
+                                LBUG();
+                                RETURN(ENOMEM);
+                        }
                 }
 
                 rc = PtlMEAttach(request->rq_connection->c_peer.peer_ni,
index 0bbc4b0..d544a19 100644 (file)
@@ -23,7 +23,6 @@
 /* dump_connection_list, but shorter for nicer debugging logs */
 static void d_c_l(struct list_head *head)
 {
-        int sanity = 0;
         struct list_head *tmp;
 
         list_for_each(tmp, head) {
@@ -33,8 +32,6 @@ static void d_c_l(struct list_head *head)
                 CDEBUG(D_HA, "   %p = %s (%d/%d)\n", conn, conn->c_remote_uuid,
                        conn->c_recovd_data.rd_phase,
                        conn->c_recovd_data.rd_next_phase);
-                if (sanity++ > 1000)
-                        LBUG();
         }
 }
 
@@ -277,7 +274,7 @@ static int recovd_handle_event(struct recovd_obd *recovd)
 static int recovd_main(void *arg)
 {
         struct recovd_obd *recovd = (struct recovd_obd *)arg;
-
+        unsigned long flags;
         ENTRY;
 
         lock_kernel();
@@ -287,10 +284,10 @@ static int recovd_main(void *arg)
         sigfillset(&current->blocked);
         recalc_sigpending();
 #else
-        spin_lock_irq(&current->sigmask_lock);
+        spin_lock_irqsave(&current->sigmask_lock, flags);
         sigfillset(&current->blocked);
         recalc_sigpending(current);
-        spin_unlock_irq(&current->sigmask_lock);
+        spin_unlock_irqrestore(&current->sigmask_lock, flags);
 #endif
 
         sprintf(current->comm, "lustre_recovd");
index acdecf8..b4f3c85 100644 (file)
@@ -24,7 +24,8 @@
 #include <linux/lustre_net.h>
 #include <linux/obd.h>
 
-int ptlrpc_reconnect_import(struct obd_import *imp, int rq_opc)
+int ptlrpc_reconnect_import(struct obd_import *imp, int rq_opc,
+                            struct ptlrpc_request **reqptr)
 {
         struct obd_device *obd = imp->imp_obd;
         struct client_obd *cli = &obd->u.cli;
@@ -37,6 +38,8 @@ int ptlrpc_reconnect_import(struct obd_import *imp, int rq_opc)
         int rc;
 
         request = ptlrpc_prep_req(imp, rq_opc, 2, size, tmp);
+        if (!request)
+                RETURN(-ENOMEM);
         request->rq_level = LUSTRE_CONN_NEW;
         request->rq_replen = lustre_msg_size(0, NULL);
         /*
@@ -60,7 +63,7 @@ int ptlrpc_reconnect_import(struct obd_import *imp, int rq_opc)
                             sizeof (old_hdl.addr)) &&
                     !memcmp(&old_hdl.cookie, &request->rq_repmsg->cookie,
                             sizeof (old_hdl.cookie))) {
-                        CERROR("%s@%s didn't like our handle %Lx/%Lx, failed\n",
+                        CERROR("%s@%s didn't like our handle "LPX64"/"LPX64", failed\n",
                                cli->cl_target_uuid, conn->c_remote_uuid,
                                (__u64)(unsigned long)ldlmexp,
                                ldlmexp->exp_cookie);
@@ -70,7 +73,7 @@ int ptlrpc_reconnect_import(struct obd_import *imp, int rq_opc)
                 old_hdl.addr = request->rq_repmsg->addr;
                 old_hdl.cookie = request->rq_repmsg->cookie;
                 if (memcmp(&imp->imp_handle, &old_hdl, sizeof(old_hdl))) {
-                        CERROR("%s@%s changed handle from %Lx/%Lx to %Lx/%Lx; "
+                        CERROR("%s@%s changed handle from "LPX64"/"LPX64" to "LPX64"/"LPX64"; "
                                "copying, but this may foreshadow disaster\n",
                                cli->cl_target_uuid, conn->c_remote_uuid,
                                old_hdl.addr, old_hdl.cookie,
@@ -87,7 +90,7 @@ int ptlrpc_reconnect_import(struct obd_import *imp, int rq_opc)
                 old_hdl = imp->imp_handle;
                 imp->imp_handle.addr = request->rq_repmsg->addr;
                 imp->imp_handle.cookie = request->rq_repmsg->cookie;
-                CERROR("now connected to %s@%s (%Lx/%Lx, was %Lx/%Lx)!\n",
+                CERROR("now connected to %s@%s ("LPX64"/"LPX64", was "LPX64"/"LPX64")!\n",
                        cli->cl_target_uuid, conn->c_remote_uuid,
                        imp->imp_handle.addr, imp->imp_handle.cookie,
                        old_hdl.addr, old_hdl.cookie);
@@ -99,7 +102,7 @@ int ptlrpc_reconnect_import(struct obd_import *imp, int rq_opc)
         }
 
  out_disc:
-        ptlrpc_req_finished(request);
+        *reqptr = request;
         return rc;
 }
 
@@ -136,18 +139,19 @@ int ptlrpc_run_recovery_upcall(struct ptlrpc_connection *conn)
         RETURN(0);
 }
 
-int ptlrpc_replay(struct obd_import *imp, int send_last_flag)
+int ptlrpc_replay(struct obd_import *imp)
 {
         int rc = 0;
         struct list_head *tmp, *pos;
         struct ptlrpc_request *req;
+        unsigned long flags;
         __u64 committed = imp->imp_peer_committed_transno;
         ENTRY;
 
         /* It might have committed some after we last spoke, so make sure we
          * get rid of them now.
          */
-        spin_lock(&imp->imp_lock);
+        spin_lock_irqsave(&imp->imp_lock, flags);
 
         ptlrpc_free_committed(imp);
 
@@ -162,26 +166,20 @@ int ptlrpc_replay(struct obd_import *imp, int send_last_flag)
         list_for_each_safe(tmp, pos, &imp->imp_replay_list) {
                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
 
-                if (req->rq_transno == imp->imp_max_transno &&
-                    send_last_flag) {
-                        req->rq_reqmsg->flags |= MSG_LAST_REPLAY;
-                        DEBUG_REQ(D_HA, req, "LAST_REPLAY:");
-                } else {
-                        DEBUG_REQ(D_HA, req, "REPLAY:");
-                }
+                DEBUG_REQ(D_HA, req, "REPLAY:");
 
+                /* XXX locking WRT failure during replay? */
                 rc = ptlrpc_replay_req(req);
-                req->rq_reqmsg->flags &= ~MSG_LAST_REPLAY;
 
                 if (rc) {
-                        CERROR("recovery replay error %d for req %Ld\n",
+                        CERROR("recovery replay error %d for req "LPD64"\n",
                                rc, req->rq_xid);
                         GOTO(out, rc);
                 }
         }
 
  out:
-        spin_unlock(&imp->imp_lock);
+        spin_unlock_irqrestore(&imp->imp_lock, flags);
         return rc;
 }
 
@@ -192,7 +190,7 @@ int ptlrpc_replay(struct obd_import *imp, int send_last_flag)
 
 static int resend_type(struct ptlrpc_request *req, __u64 committed)
 {
-        if (req->rq_transno < committed) {
+        if (req->rq_transno && req->rq_transno < committed) {
                 if (req->rq_flags & PTL_RPC_FL_REPLIED) {
                         /* Saw the reply and it was committed, no biggie. */
                         DEBUG_REQ(D_HA, req, "NO_RESEND");
@@ -217,11 +215,12 @@ int ptlrpc_resend(struct obd_import *imp)
         int rc = 0;
         struct list_head *tmp, *pos;
         struct ptlrpc_request *req;
+        unsigned long flags;
         __u64 committed = imp->imp_peer_committed_transno;
 
         ENTRY;
 
-        spin_lock(&imp->imp_lock);
+        spin_lock_irqsave(&imp->imp_lock, flags);
         list_for_each(tmp, &imp->imp_sending_list) {
                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
                 DEBUG_REQ(D_HA, req, "SENDING: ");
@@ -259,19 +258,21 @@ int ptlrpc_resend(struct obd_import *imp)
                 }
         }
 
+        spin_unlock_irqrestore(&imp->imp_lock, flags);
         RETURN(rc);
 }
 
 /* Wake every request queued on imp->imp_delayed_list by signalling its
  * rq_wait_for_rep wait queue.  The list is walked with imp->imp_lock held;
  * entries are only woken here, not removed from the list.
  */
 void ptlrpc_wake_delayed(struct obd_import *imp)
 {
+        unsigned long flags;
         struct list_head *tmp, *pos;
         struct ptlrpc_request *req;
 
-        spin_lock(&imp->imp_lock);
+        spin_lock_irqsave(&imp->imp_lock, flags);
         list_for_each_safe(tmp, pos, &imp->imp_delayed_list) {
                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
                 DEBUG_REQ(D_HA, req, "waking:");
                 wake_up(&req->rq_wait_for_rep);
         }
-        spin_unlock(&imp->imp_lock);
+        spin_unlock_irqrestore(&imp->imp_lock, flags);
 }
index 1384b5d..200c029 100644 (file)
@@ -46,16 +46,10 @@ int connmgr_setup(struct obd_device *obddev, obd_count len, void *buf)
         int err;
         ENTRY;
 
-        MOD_INC_USE_COUNT;
         memset(recovd, 0, sizeof(*recovd));
 
         err = recovd_setup(recovd);
-        if (err) {
-                MOD_DEC_USE_COUNT;
-                RETURN(err);
-        }
-
-        RETURN(0);
+        RETURN(err);
 }
 
 int connmgr_cleanup(struct obd_device *dev)
@@ -64,15 +58,11 @@ int connmgr_cleanup(struct obd_device *dev)
         int err;
 
         err = recovd_cleanup(recovd);
-        if (err)
-                LBUG();
-
-        MOD_DEC_USE_COUNT;
-        RETURN(0);
+        RETURN(err);
 }
 
-int connmgr_iocontrol(unsigned int cmd, struct lustre_handle *hdl, int len, void *karg,
-                      void *uarg)
+int connmgr_iocontrol(unsigned int cmd, struct lustre_handle *hdl, int len,
+                      void *karg, void *uarg)
 {
         struct ptlrpc_connection *conn = NULL;
         struct obd_device *obd = class_conn2obd(hdl);
@@ -85,7 +75,7 @@ int connmgr_iocontrol(unsigned int cmd, struct lustre_handle *hdl, int len, void
 
         if (cmd != OBD_IOC_RECOVD_NEWCONN && cmd != OBD_IOC_RECOVD_FAILCONN)
                 RETURN(-EINVAL); /* XXX ENOSYS? */
-        
+
         /* Find the connection that's been rebuilt or has failed. */
         spin_lock(&recovd->recovd_lock);
         list_for_each(tmp, &recovd->recovd_troubled_items) {
@@ -106,9 +96,9 @@ int connmgr_iocontrol(unsigned int cmd, struct lustre_handle *hdl, int len, void
                 list_for_each(tmp, &recovd->recovd_managed_items) {
                         conn = list_entry(tmp, struct ptlrpc_connection,
                                           c_recovd_data.rd_managed_chain);
-                        
+
                         LASSERT(conn->c_recovd_data.rd_recovd == recovd);
-                        
+
                         if (!strcmp(conn->c_remote_uuid, data->ioc_inlbuf1))
                                 break;
                         conn = NULL;
@@ -152,7 +142,7 @@ int connmgr_iocontrol(unsigned int cmd, struct lustre_handle *hdl, int len, void
         }
         ptlrpc_readdress_connection(conn, conn->c_remote_uuid);
         spin_unlock(&conn->c_lock);
-        
+
         conn->c_recovd_data.rd_phase = RD_PREPARED;
         wake_up(&recovd->recovd_waitq);
  out:
@@ -176,27 +166,29 @@ int conmgr_detach(struct obd_device *dev)
 {
         return lprocfs_dereg_obd(dev);
 }
+
 /* use obd ops to offer management infrastructure.
  * o_owner names this module; NOTE(review): presumably the obd class code
  * uses it for module reference counting now that connmgr_setup() and
  * connmgr_cleanup() no longer touch the use count -- confirm.
  */
 static struct obd_ops recovd_obd_ops = {
-        o_attach:      connmgr_attach,
-        o_detach:      conmgr_detach,
-        o_setup:       connmgr_setup,
-        o_cleanup:     connmgr_cleanup,
-        o_iocontrol:   connmgr_iocontrol,
-        o_connect:     connmgr_connect,
-        o_disconnect:  class_disconnect
+        o_owner:        THIS_MODULE,
+        o_attach:       connmgr_attach,
+        o_detach:       conmgr_detach,
+        o_setup:        connmgr_setup,
+        o_cleanup:      connmgr_cleanup,
+        o_iocontrol:    connmgr_iocontrol,
+        o_connect:      connmgr_connect,
+        o_disconnect:   class_disconnect
 };
 
 static int __init ptlrpc_init(void)
 {
-        int rc; 
+        int rc;
         rc = ptlrpc_init_portals();
-        if (rc) 
+        if (rc)
                 RETURN(rc);
         ptlrpc_init_connection();
         rc = class_register_type(&recovd_obd_ops, status_class_var,
                                  LUSTRE_HA_NAME);
-        if (rc) 
+        if (rc)
                 RETURN(rc);
         ptlrpc_put_connection_superhack = ptlrpc_put_connection;
         return 0;
index d497668..c20fc48 100644 (file)
@@ -128,7 +128,7 @@ ptlrpc_init_svc(__u32 nevents, __u32 nbufs,
                 ptlrpc_link_svc_me(rqbd);
         }
 
-        CDEBUG(D_NET, "Starting service listening on portal %d (eq: %p)\n",
+        CDEBUG(D_NET, "Starting service listening on portal %d (eq: %lu)\n",
                service->srv_req_portal, service->srv_eq_h.handle_idx);
 
         RETURN(service);
@@ -171,8 +171,7 @@ static int handle_incoming_request(struct obd_device *obddev,
                 goto out;
         }
 
-        CDEBUG(D_RPCTRACE, "Handling RPC pid:xid:nid:opc %d:"
-               LPX64":%x:%d\n",
+        CDEBUG(D_RPCTRACE, "Handling RPC pid:xid:nid:opc %d:"LPX64":"LPX64":%d\n",
                NTOH__u32(request->rq_reqmsg->status),
                request->rq_xid,
                event->initiator.nid,
@@ -254,7 +253,7 @@ static int ptlrpc_main(void *arg)
         struct ptlrpc_request *request;
         ptl_event_t *event;
         int rc = 0;
-
+        unsigned long flags;
         ENTRY;
 
         lock_kernel();
@@ -264,10 +263,10 @@ static int ptlrpc_main(void *arg)
         sigfillset(&current->blocked);
         recalc_sigpending();
 #else
-        spin_lock_irq(&current->sigmask_lock);
+        spin_lock_irqsave(&current->sigmask_lock, flags);
         sigfillset(&current->blocked);
         recalc_sigpending(current);
-        spin_unlock_irq(&current->sigmask_lock);
+        spin_unlock_irqrestore(&current->sigmask_lock, flags);
 #endif
 
 #ifdef __arch_um__
@@ -383,6 +382,9 @@ int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc,
         list_add(&thread->t_link, &svc->srv_threads);
         spin_unlock(&svc->srv_lock);
 
+        /* CLONE_VM and CLONE_FILES just avoid a needless copy, because we
+         * just drop the VM and FILES in ptlrpc_daemonize() right away.
+         */
         rc = kernel_thread(ptlrpc_main, (void *) &d, CLONE_VM | CLONE_FILES);
         if (rc < 0) {
                 CERROR("cannot start thread\n");
index a6d15c2..b9e1962 100644 (file)
@@ -22,6 +22,7 @@ newfile
 openclose
 createdestroy
 createmany
+statmany
 mkdirmany
 lovstripe
 *.xml
@@ -29,3 +30,4 @@ stat
 setuid
 multifstat
 checkstat
+wantedi
index 12b7d52..aa00642 100644 (file)
@@ -12,7 +12,7 @@ EXTRA_DIST = $(pkgexample_SCRIPTS) $(noinst_SCRIPTS) $(noinst_DATA) \
        rundbench \
        elan-client.cfg    mds.cfg      trivial.sh
 pkgexampledir = '${exec_prefix}/usr/lib/$(PACKAGE)/examples'
-pkgexample_SCRIPTS = llmount.sh llmountcleanup.sh llecho.sh local.sh uml.sh lov.sh
+pkgexample_SCRIPTS = llmount.sh llmountcleanup.sh llecho.sh llechocleanup.sh local.sh echo.sh uml.sh lov.sh
 noinst_SCRIPTS = llsetup.sh llrsetup.sh llcleanup.sh
 noinst_DATA = lustre.cfg
 noinst_SCRIPTS += fs.sh intent-test.sh intent-test2.sh leak_finder.pl \
@@ -24,9 +24,9 @@ noinst_SCRIPTS += fs.sh intent-test.sh intent-test2.sh leak_finder.pl \
        runtests runvmstat snaprun.sh tbox.sh  common.sh
 noinst_PROGRAMS = openunlink testreq truncate directio openme writeme mcreate
 noinst_PROGRAMS += munlink tchmod toexcl fsx test_brw openclose createdestroy
-noinst_PROGRAMS += lovstripe stat createmany mkdirmany multifstat
+noinst_PROGRAMS += stat createmany statmany mkdirmany multifstat
 # noinst_PROGRAMS += ldaptest 
-noinst_PROGRAMS += checkstat
+noinst_PROGRAMS += checkstat wantedi
 
 # ldaptest_SOURCES = ldaptest.c
 tchmod_SOURCES = tchmod.c
@@ -43,11 +43,12 @@ fsx_SOURCES = fsx.c
 test_brw_SOURCES = test_brw.c
 openclose_SOURCES = openclose.c
 createdestroy_SOURCES = createdestroy.c
-lovstripe_SOURCES = lovstripe.c
 stat_SOURCES = stat.c
 createmany_SOURCES = createmany.c
+statmany_SOURCES = statmany.c
 mkdirmany_SOURCES = mkdirmany.c
 multifstat_SOURCES = multifstat.c
 checkstat_SOURCES = checkstat.c
+wantedi_SOURCES = wantedi.c
 
 include $(top_srcdir)/Rules
index 77015a6..c56eda8 100644 (file)
 
 int main(int argc, char ** argv)
 {
-        int i, rc, count;
+        int i, rc = 0, do_open;
         char filename[4096];
+        long int start, last, end, count;
 
-        if (argc < 3) {
-                printf("Usage %s filenamebase count\n", argv[0]);
+        if (argc != 4) {
+                printf("Usage %s <-o|-m> filenamebase <count|-time>\n",
+                       argv[0]);
                 return 1;
         }
 
-        if (strlen(argv[1]) > 4080) {
+        if (strcmp(argv[1], "-o") == 0) {
+                do_open = 1;
+        } else if (strcmp(argv[1], "-m") == 0) {
+                do_open = 0;
+        } else {
+                printf("Usage %s {-o|-m} filenamebase <count|-time>\n",
+                       argv[0]);
+                return 1;
+        }
+
+        if (strlen(argv[2]) > 4080) {
                 printf("name too long\n");
                 return 1;
         }
 
-        count = strtoul(argv[2], NULL, 0);
+        start = last = time(0);
+
+        end = strtol(argv[3], NULL, 0);
 
-        for (i=0 ; i < count ; i++) {
-                sprintf(filename, "%s-%d", argv[1], i);
-                rc = mknod(filename, S_IFREG| 0444, 0);
-                if (rc) {
-                        printf("mknod(%s) error: %s\n",
-                               filename, strerror(errno));
-                        break;
+        if (end > 0) {
+                count = end;
+                end = -1UL >> 1;
+        } else {
+                end = start - end;
+                count = -1UL >> 1;
+        }
+
+        for (i = 0; i < count && time(0) < end; i++) {
+                sprintf(filename, "%s%d", argv[2], i);
+                if (do_open) {
+                        int fd = open(filename, O_CREAT|O_RDWR, 0644);
+                        if (fd < 0) {
+                                printf("open(%s) error: %s\n", filename,
+                                       strerror(errno));
+                                rc = errno;
+                                break;
+                        }
+                        close(fd);
+                } else {
+                        rc = mknod(filename, S_IFREG| 0444, 0);
+                        if (rc) {
+                                printf("mknod(%s) error: %s\n",
+                                       filename, strerror(errno));
+                                rc = errno;
+                                break;
+                        }
+                }
+                if ((i % 10000) == 0) {
+                        printf(" - created %d (time %ld ; total %ld ; last %ld)\n",
+                               i, time(0), time(0) - start, time(0) - last);
+                        last = time(0);
                 }
-               if ((i % 10000) == 0)
-                   printf(" - created %d (time %ld)\n", i, time(0));
         }
+        printf("total: %d creates in %ld seconds: %f creates/second\n", i,
+               time(0) - start, ((float)i / (time(0) - start)));
+
         return rc;
 }
diff --git a/lustre/tests/echo.sh b/lustre/tests/echo.sh
new file mode 100755 (executable)
index 0000000..f30f056
--- /dev/null
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+config=${1:-$(basename $0 .sh).xml}
+LMC=${LMC:-../utils/lmc -m $config}
+
+SERVER=localhost
+CLIENT=localhost
+
+# FIXME: make LMC not require MDS for obdecho LOV
+MDSDEV=$TMP/mds1
+MDSSIZE=10000
+
+STRIPE_BYTES=65536
+STRIPES_PER_OBJ=2      # 0 means stripe over all OSTs
+
+LOV=0
+while [ "$1" ]; do
+        case $1 in
+        --lov) LOV="1" ;;
+       *) OPTS="$OPTS $1" ;;
+        esac
+        shift
+done
+
+rm -f $config
+# create nodes
+$LMC --add node --node $SERVER  || exit 1
+$LMC --add net --node $SERVER --nid $SERVER --nettype tcp || exit 2
+
+if (($LOV)); then
+    $LMC --add mds --node $SERVER --mds mds1 --dev $MDSDEV --size $MDSSIZE || exit 10
+    $LMC --add lov --lov lov1 --mds mds1 --stripe_sz $STRIPE_BYTES --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0 || exit 11
+    $LMC --add ost --node $SERVER --lov lov1 --obdtype=obdecho || exit 12
+    $LMC --add ost --node $SERVER --lov lov1 --obdtype=obdecho || exit 13
+    OBD_NAME=lov1
+else
+    $LMC --add ost --obd obd1 --node $SERVER --obdtype=obdecho || exit 2
+    OBD_NAME=obd1
+fi
+
+if [ "$SERVER" != "$CLIENT" ]; then
+   $LMC --add node --node $CLIENT  || exit 1
+   $LMC --add net --node $CLIENT --nid $CLIENT --nettype tcp || exit 2
+fi
+
+$LMC --add echo_client --node $CLIENT --obd ${OBD_NAME} || exit 3
+
index e99289c..c077223 100644 (file)
@@ -1,50 +1,12 @@
 #!/bin/sh
 
-config=echo.xml
 LCONF=${LCONF:-../utils/lconf}
-LMC=${LMC:-../utils/lmc}
+NAME=${NAME:-echo}
 
-SERVER=localhost
-CLIENT=cfs4
+config=$NAME.xml
+mkconfig=./$NAME.sh
 
-# FIXME: make LMC not require MDS for obdecho LOV
-MDSDEV=$TMP/mds1
-MDSSIZE=10000
-
-STRIPE_BYTES=65536
-STRIPES_PER_OBJ=2      # 0 means stripe over all OSTs
-
-LOV=0
-while [ "$1" ]; do
-        case $1 in
-        --lov) LOV="1" ;;
-       *) OPTS="$OPTS $1" ;;
-        esac
-        shift
-done
-
-rm -f $config
-# create nodes
-$LMC -o $config --add node --node $SERVER  || exit 1
-$LMC -m $config --add net --node $SERVER --nid $SERVER --nettype tcp || exit 2
-
-if (($LOV)); then
-    $LMC -m $config --add mds --node $SERVER --mds mds1 --dev $MDSDEV --size $MDSSIZE || exit 10
-    $LMC -m $config --add lov --lov lov1 --mds mds1 --stripe_sz $STRIPE_BYTES --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0 || exit 11
-    $LMC -m $config --add ost --node $SERVER --lov lov1 --obdtype=obdecho || exit 12
-    $LMC -m $config --add ost --node $SERVER --lov lov1 --obdtype=obdecho || exit 13
-    OBD_NAME=lov1
-else
-    $LMC -m $config --add ost --obd obd1 --node $SERVER --obdtype=obdecho || exit 2
-    OBD_NAME=obd1
-fi
-
-if [ "$SERVER" != "$CLIENT" ]; then
-   $LMC -m $config --add node --node $CLIENT  || exit 1
-   $LMC -m $config --add net --node $CLIENT --nid $CLIENT --nettype tcp || exit 2
-fi
-
-$LMC -m $config --add echo_client --node $CLIENT --obd ${OBD_NAME} || exit 3
+sh $mkconfig $config || exit 1
 
 $LCONF --reformat --gdb $OPTS $config || exit 4
 
index de4b35d..2d63fa9 100755 (executable)
@@ -1,10 +1,15 @@
 #!/bin/sh
 
 LCONF=../utils/lconf
+NAME=${NAME:-echo}
+TMP=${TMP:-/tmp}
 
-if [ -f echo.xml ]; then
-   ${LCONF} --cleanup echo.xml
-else
-   echo "no echo.xml found"
+config=$NAME.xml
+mkconfig=./$NAME.sh
+
+if [ ! -f $config ]; then
+   sh $mkconfig $config || exit 1
 fi
 
+${LCONF} --cleanup echo.xml
+
index eb4618b..efc7c0c 100755 (executable)
@@ -7,8 +7,6 @@ NAME=${NAME:-local}
 config=$NAME.xml
 mkconfig=./$NAME.sh
 
-if [ ! -f $config -o $mkconfig -nt $config ]; then
-   sh $mkconfig $config || exit 1
-fi
+sh $mkconfig $config || exit 1
 
 ${LCONF} --reformat --gdb $config || exit 2
index 82f2a17..b8b99d9 100755 (executable)
@@ -13,14 +13,17 @@ fi
 
 sync; sleep 2; sync
 ${LCONF} --cleanup --dump $TMP/debug $config
-LEAK=`dmesg | grep -v " 0 bytes" | grep leaked`
-if [ "$LEAK" ]; then
-       echo "$LEAK" 1>&2
-       mv $TMP/debug $TMP/debug.`date +%s`
-       #exit -1
-fi
 BUSY=`dmesg | grep -i destruct`
 if [ "$BUSY" ]; then
        echo "$BUSY" 1>&2
-       #exit -2
+       mv $TMP/debug $TMP/debug-busy.`date +%s`
+       exit -1
+fi
+LEAK_LUSTRE=`dmesg | tail -20 | grep -v "leaked: 0" | grep leaked`
+LEAK_PORTALS=`dmesg | tail -20 | grep "Portals memory leaked"`
+if [ "$LEAK_LUSTRE" -o "$LEAK_PORTALS" ]; then
+       echo "$LEAK_LUSTRE" 1>&2
+       echo "$LEAK_PORTALS" 1>&2
+       mv $TMP/debug $TMP/debug-leak.`date +%s`
+       exit -2
 fi
index f680f4b..d892b58 100755 (executable)
@@ -5,11 +5,11 @@ config=${1:-local.xml}
 LMC="${LMC:-../utils/lmc} -m $config"
 TMP=${TMP:-/tmp}
 
-MDSDEV=$TMP/mds1
-MDSSIZE=50000
+MDSDEV=${MDSDEV:-$TMP/mds1}
+MDSSIZE=${MDSSIZE:-50000}
 
-OSTDEV=$TMP/ost1
-OSTSIZE=200000
+OSTDEV=${OSTDEV:-$TMP/ost1}
+OSTSIZE=${OSTSIZE:-200000}
 
 kver=`uname -r | cut -d "." -f 1,2`
 
index 54d4c66..c0b2839 100755 (executable)
@@ -5,13 +5,13 @@ config=${1:-lov.xml}
 LMC=${LMC:-../utils/lmc}
 TMP=${TMP:-/tmp}
 
-MDSDEV=$TMP/mds1
-MDSSIZE=50000
+MDSDEV=${MDSDEV:-$TMP/mds1}
+MDSSIZE=${MDSSIZE:-50000}
 
-OSTDEV1=$TMP/ost1
-OSTDEV2=$TMP/ost2
-OSTDEV3=$TMP/ost3
-OSTSIZE=100000
+OSTDEV1=${OSTDEV1:-$TMP/ost1}
+OSTDEV2=${OSTDEV2:-$TMP/ost2}
+OSTDEV3=${OSTDEV3:-$TMP/ost3}
+OSTSIZE=${OSTSIZE:-100000}
 
 STRIPE_BYTES=65536
 STRIPES_PER_OBJ=2      # 0 means stripe over all OSTs
diff --git a/lustre/tests/lovstripe.c b/lustre/tests/lovstripe.c
deleted file mode 100644 (file)
index 29769f1..0000000
+++ /dev/null
@@ -1,164 +0,0 @@
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <sys/ioctl.h>
-#include <string.h>
-#include <errno.h>
-
-
-/****************** Custom includes ********************/
-#include <linux/lustre_lite.h>
-#include <linux/lustre_idl.h>
-
-
-/******************  Functions ******************/
-int write_file(char *name, struct lov_mds_md *striping, int bufsize,
-              char *buf1, char *buf2);
-
-
-/************************  Main **********************/
-
-#define STRIPE_SIZE 128 * 1024
-
-int main(int argc, char *argv[])
-{
-       struct lov_mds_md a_striping;
-       long bufsize = sizeof(long) * STRIPE_SIZE;
-       char *rbuf, *wbuf;
-       int data, *dp;
-       int result;
-
-       rbuf = malloc(bufsize);
-       wbuf = malloc(bufsize);
-       if (!rbuf || !wbuf) {
-               fprintf(stderr, "%s: unable to allocate buffers\n", argv[0]);
-               return 1;
-       }
-
-       /* Initialize to an easily-verified pattern */
-       for (data = 0, dp = (int *)wbuf; data < STRIPE_SIZE; data++, dp++)
-               *dp = data;
-
-       /*  Init defaults on striping info  */
-       a_striping.lmm_magic = LOV_MAGIC;
-       a_striping.lmm_stripe_size = STRIPE_SIZE;
-       a_striping.lmm_stripe_pattern = 0;
-
-       /*  Write file for OST1 only  */
-       /*       Start at OST 0, and use only 1 OST  */
-       a_striping.lmm_stripe_offset = 0;
-       a_striping.lmm_stripe_count = 1;
-
-       result = write_file("/mnt/lustre/ost1", &a_striping, bufsize,
-                           wbuf, rbuf);
-
-       if (result < 0)
-               goto out;
-
-       /*  Write file for OST2 only  */
-       /*       Start at OST 1, and use only 1 OST  */
-       a_striping.lmm_stripe_offset = 1;
-       a_striping.lmm_stripe_count = 1;
-
-       result = write_file("/mnt/lustre/ost2", &a_striping, bufsize,
-                           wbuf, rbuf);
-
-       if (result < 0)
-               goto out;
-
-       /*  Write file across both OST1 and OST2  */
-       /*       Start at OST 0, and use only 2 OSTs  */
-       a_striping.lmm_stripe_offset = 0;
-       a_striping.lmm_stripe_count = 2;
-
-       result = write_file("/mnt/lustre/ost1and2", &a_striping, bufsize,
-                           wbuf, rbuf);
-
-       if (result < 0)
-               goto out;
-
-out:
-       free(rbuf);
-       free(wbuf);
-       return result;
-}
-
-
-int write_file(char *name, struct lov_mds_md *striping, int bufsize,
-              char *wbuf, char *rbuf)
-{
-       int fd, result;
-
-       printf("opening %s\n", name);
-       fd = open(name, O_CREAT | O_RDWR | O_LOV_DELAY_CREATE, 0644);
-       if (fd < 0) {
-               fprintf(stderr, "\nUnable to open '%s': %s\n",
-                        name, strerror(errno));
-               return -errno;
-       }
-
-       printf("setting stripe data on %s\n", name);
-       result = ioctl(fd, LL_IOC_LOV_SETSTRIPE, striping);
-       if (result < 0) {
-               fprintf(stderr, "\nError on ioctl for '%s' (%d): %s\n",
-                       name, fd, strerror(errno));
-               close(fd);
-               return -errno;
-       }
-
-       /*  Write bogus data  */
-       printf("writing data to %s\n", name);
-       result = write(fd, wbuf, bufsize);
-       if (result < 0) {
-               fprintf(stderr, "\nerror: writing data to '%s' (%d): %s\n",
-                       name, fd, strerror(errno));
-               close(fd);
-               return -errno;
-       }
-
-       if (result != bufsize) {
-               fprintf(stderr, "\nerror: short write to '%s' (%d): %d != %d\n",
-                       name, fd, result, bufsize);
-               close(fd);
-               return -1;
-       }
-
-       /*  Seek to beginning again */
-       printf("seeking in %s\n", name);
-       result = lseek(fd, 0, SEEK_SET);
-       if (result < 0) {
-               fprintf(stderr, "\nerror: seeking to beginning '%s' (%d): %s\n",
-                       name, fd, strerror(errno));
-               close(fd);
-               return -errno;
-       }
-
-       /*  Read bogus data back  */
-       printf("reading data from %s\n", name);
-       result = read(fd, rbuf, bufsize);
-       if (result < 0) {
-               fprintf(stderr, "\nerror: reading data from '%s' (%d): %s\n",
-                       name, fd, strerror(errno));
-               close(fd);
-               return -errno;
-       }
-
-       if (result != bufsize) {
-               fprintf(stderr,"\nerror: short read from '%s' (%d): %d != %d\n",
-                       name, fd, result, bufsize);
-               close(fd);
-               return -1;
-       }
-
-       if (memcmp(wbuf, rbuf, bufsize)) {
-               fprintf(stderr, "\nerror: comparing data in '%s' (%d): %s\n",
-                       name, fd, strerror(errno));
-               close(fd);
-               return -1;
-       }
-
-       close(fd);
-
-       return 0;
-}
index 324b161..258598b 100644 (file)
@@ -441,7 +441,40 @@ pass
 $CLEAN
 $START
 
-echo '== cleanup ========================================='
+echo '== stripe sanity ================================= test27'
+# sub-test labels are numbered 27.x to match the test27 header and d27
+echo "--test 27.1 create one stripe"
+mkdir $MOUNT/d27
+../utils/lstripe $MOUNT/d27/f0 4096 0 1
+$CHECKSTAT -t file $MOUNT/d27/f0
+echo "--test 27.2 write to one stripe file"
+cp /etc/hosts $MOUNT/d27/f0
+pass
+$CLEAN
+$START
+
+echo "--test 27.3 create two stripes"
+../utils/lstripe $MOUNT/d27/f01 4096 0 2
+echo "--test 27.4 write to two stripe file"
+cp /etc/hosts $MOUNT/d27/f01
+pass
+$CLEAN
+$START
+
+echo "--test 27.5 lstripe existing file (should return error)"
+# the first call creates the file; the second is expected to fail
+../utils/lstripe $MOUNT/d27/f12 4096 1 2
+! ../utils/lstripe $MOUNT/d27/f12 4096 1 2
+pass
+$CLEAN
+$START
+
+echo "--test 27.6 lfind "
+../utils/lfind $MOUNT/d27
+pass
+$CLEAN
+$START
+
+
+echo '== cleanup ============================================='
 rm -r $MOUNT/[Rdfs][1-9]*
 
 echo '======================= finished ======================='
diff --git a/lustre/tests/statmany.c b/lustre/tests/statmany.c
new file mode 100644 (file)
index 0000000..f6370e3
--- /dev/null
@@ -0,0 +1,214 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <unistd.h>
+#include <time.h>
+#include <limits.h>
+#include <sys/ioctl.h>
+
+#if 0
+#include <linux/extN_fs.h>
+#endif
+#include <linux/lustre_lib.h>
+#include <linux/obd.h>
+
+/* Long options; each maps onto the short option given in the last field.
+ * The second field is has_arg: 0 = no argument, 1 = required argument.
+ */
+struct option longopts[] = {
+       {"ea", 0, 0, 'e'},
+       {"lookup", 0, 0, 'l'},
+       /* takes the seed, like "r:" in shortopts; with has_arg 0 optarg
+        * would be NULL when the 'r' case parses it */
+       {"random", 1, 0, 'r'},
+       {"stat", 0, 0, 's'},
+       {NULL, 0, 0, 0},
+};
+/* Digits are accepted as options so a run time can be given as -<seconds>. */
+char *shortopts = "ehlr:s0123456789";
+
+/*
+ * Print the usage message to 'out' and exit: status 1 when 'out' is stderr,
+ * 0 otherwise.  Never returns.
+ */
+static int usage(char *prog, FILE *out)
+{
+        fprintf(out,
+               "Usage: %s [-r rand_seed] {-s|-e|-l} filenamebase total_files {iterations|-seconds}\n"
+               "-r : random seed\n"
+               "-s : regular stat() calls\n"
+               "-e : open then GET_EA ioctl\n"
+               "-l : lookup ioctl only\n"
+               "-h : print this help\n", prog);
+        exit(out == stderr);
+}
+
+#ifndef LONG_MAX
+/* Fallback only: largest positive long, i.e. every bit set except the sign
+ * bit.  Built from an unsigned long so no signed shift can overflow. */
+#define LONG_MAX ((long)(~0UL >> 1))
+#endif
+
+int main(int argc, char ** argv)
+{
+        long i, count, iter = LONG_MAX, mode, offset;
+        long int start, length = LONG_MAX, last, rc = 0;
+        char parent[4096], *t;
+       char c, *prog = argv[0], *base;
+       int seed = 0;
+       int fd = -1;
+
+       while ((c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1) {
+               char *e;
+               switch (c) {
+               case 'r':
+                       seed = strtoul(optarg, &e, 0);
+                       if (*e) {
+                               fprintf(stderr, "bad -r option %s\n", optarg);
+                               usage(prog, stderr);
+                       }
+                       break;
+               case 'e':
+               case 'l':
+               case 's':
+                       mode = c;
+                       break;
+               case '0':
+               case '1':
+               case '2':
+               case '3':
+               case '4':
+               case '5':
+               case '6':
+               case '7':
+               case '8':
+               case '9':
+                       if (length == LONG_MAX)
+                               length = c - '0';
+                       else
+                               length = length * 10 + (c - '0');
+                       break;
+               case 'h':
+                       usage(prog, stdout);
+               case '?':
+                       usage(prog, stderr);
+               }
+       }
+
+       if (optind + 2 + (length == LONG_MAX) != argc) {
+               fprintf(stderr, "missing filenamebase, total_files, or iterations\n");
+               usage(prog, stderr);
+       }
+
+        base = argv[optind];
+        if (strlen(base) > 4080) {
+                fprintf(stderr, "filenamebase too long\n");
+                exit(1);
+        }
+
+       if (seed == 0) {
+               int f = open("/dev/urandom", O_RDONLY);
+
+               if (f < 0 || read(f, &seed, sizeof(seed)) < sizeof(seed))
+                       seed = time(0);
+               if (f > 0)
+                       close(f);
+       }
+
+       printf("using seed %u\n", seed);
+       srand(seed);
+
+        count = strtoul(argv[optind + 1], NULL, 0);
+       if (length == LONG_MAX) {
+               iter = strtoul(argv[optind + 2], NULL, 0);
+               printf("running for %lu iterations\n", iter);
+       } else
+               printf("running for %lu seconds\n", length);
+
+        start = last = time(0);
+
+        t = strrchr(base, '/');
+        if (t == NULL) {
+                strcpy(parent, ".");
+                offset = -1;
+        } else {
+                strncpy(parent, base, t - base);
+                offset = t - base + 1;
+        }
+
+       if (mode == 'l') {
+               fd = open(parent, O_RDONLY);
+               if (fd < 0) {
+                       printf("open(%s) error: %s\n", parent,
+                              strerror(errno));
+                       exit(errno);
+               }
+       }
+
+        for (i = 0; i < iter && time(0) - start < length; i++) {
+                char filename[4096];
+                int tmp;
+
+                tmp = random() % count;
+                sprintf(filename, "%s%d", base, tmp);
+
+                if (mode == 'e') {
+#if 0
+                        fd = open(filename, O_RDWR|O_LARGEFILE);
+                        if (fd < 0) {
+                                printf("open(%s) error: %s\n", filename,
+                                       strerror(errno));
+                                break;
+                        }
+                        rc = ioctl(fd, EXTN_IOC_GETEA, NULL);
+                        if (rc < 0) {
+                                printf("ioctl(%s) error: %s\n", filename,
+                                       strerror(errno));
+                                break;
+                        }
+                        close(fd);
+                        break;
+#endif
+               } else if (mode == 's') {
+                        struct stat buf;
+
+                        rc = stat(filename, &buf);
+                        if (rc < 0) {
+                                printf("stat(%s) error: %s\n", filename,
+                                       strerror(errno));
+                                break;
+                        }
+               } else if (mode == 'l') {
+                        struct obd_ioctl_data data;
+                        char rawbuf[8192];
+                        char *buf = rawbuf;
+                        int max = sizeof(rawbuf);
+
+                        memset(&data, 0, sizeof(data));
+                        data.ioc_version = OBD_IOCTL_VERSION;
+                        data.ioc_len = sizeof(data);
+                        if (offset >= 0)
+                                data.ioc_inlbuf1 = filename + offset;
+                        else
+                                data.ioc_inlbuf1 = filename;
+                        data.ioc_inllen1 = strlen(data.ioc_inlbuf1) + 1;
+
+                        if (obd_ioctl_pack(&data, &buf, max)) {
+                                printf("ioctl_pack failed.\n");
+                                break;
+                        }
+
+                        rc = ioctl(fd, IOC_MDC_LOOKUP, buf);
+                        if (rc < 0) {
+                                printf("ioctl(%s) error: %s\n", filename,
+                                       strerror(errno));
+                                break;
+                        }
+                }
+                if ((i % 10000) == 0) {
+                        printf(" - stat %lu (time %ld ; total %ld ; last %ld)\n",
+                               i, time(0), time(0) - start, time(0) - last);
+                        last = time(0);
+                }
+        }
+
+       if (mode == 'l')
+               close(fd);
+
+        printf("total: %lu stats in %ld seconds: %f stats/second\n", i,
+               time(0) - start, ((float)i / (time(0) - start)));
+
+        exit(rc);
+}
index a8a381b..112a796 100644 (file)
@@ -21,28 +21,28 @@ OSTSIZE=100000
 
 # Three separate systems
 MDSNODE=uml1
-OSTNODE=uml2
+OSTNODES="uml2 uml2"
 CLIENTS="uml3"
 
 # Single system with additional clients
 #MDSNODE=uml1
-#OSTNODE=uml1
+#OSTNODES="uml1 uml1"
 #CLIENTS="$MDSNODE client"
 
 # Two systems with client on MDS, and additional clients (set up OST first)
 #MDSNODE=uml1
-#OSTNODE=uml2
+#OSTNODES="uml2 uml2"
 #CLIENTS="$MDSNODE client"
 
 # Two systems with client on OST, and additional clients (set up MDS first)
 #MDSNODE=uml1
-#OSTNODE=uml2
-#CLIENTS="$OSTNODE client"
+#OSTNODES="uml2 uml2"
+#CLIENTS="$OSTNODES client"
 
 rm -f $config
 
 # create nodes
-for NODE in $MDSNODE $OSTNODE $CLIENTS; do
+for NODE in $MDSNODE $OSTNODES $CLIENTS; do
        eval [ \$$NODE ] && continue
        ${LMC} -m $config --add net --node $NODE --nid $NODE --nettype tcp || exit 1
        eval "$NODE=done"
@@ -53,11 +53,14 @@ ${LMC} -m $config --add mds --format --node $MDSNODE --mds mds1 --dev $MDSDEV --
 
 # configure ost
 ${LMC} -m $config --add lov --lov lov1 --mds mds1 --stripe_sz 65536 --stripe_cnt 0 --stripe_pattern 0 || exit 20
-${LMC} -m $config --add ost --node $OSTNODE --lov lov1 --dev $OSTDEV1 --size $OSTSIZE || exit 21
-${LMC} -m $config --add ost --node $OSTNODE --lov lov1 --dev $OSTDEV2 --size $OSTSIZE || exit 22
+COUNT=1
+for NODE in $OSTNODES; do
+       eval OSTDEV=\$OSTDEV$COUNT
+        ${LMC} -m $config --add ost --node $NODE --lov lov1 --dev $OSTDEV --size $OSTSIZE || exit 21
+       COUNT=`expr $COUNT + 1`
+done
 
 # create client config(s)
 for NODE in $CLIENTS; do
        ${LMC} -m $config --add mtpt --node $NODE --path /mnt/lustre --mds mds1 --lov lov1 || exit 30
 done
-
diff --git a/lustre/tests/wantedi.c b/lustre/tests/wantedi.c
new file mode 100644 (file)
index 0000000..426602f
--- /dev/null
@@ -0,0 +1,48 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <unistd.h>
+#include <time.h>
+#include <limits.h>
+#include <sys/ioctl.h>
+#include <linux/lustre_lib.h>
+#include <linux/obd.h>
+
+static int usage(char *prog, FILE *out)
+{
+        /* Print the synopsis and exit: status 1 when 'out' is stderr, else 0. */
+        fprintf(out, "Usage: %s <dir> <desired child ino>\n", prog);
+        exit(out == stderr ? 1 : 0);
+}
+
+#define EXTN_IOC_CREATE_INUM            _IOW('f', 5, long)
+
+/* Issue EXTN_IOC_CREATE_INUM on <dir>, passing the wanted child inode number.
+ * Exit status: 0 on success, 1 on usage or open() error, 2 on ioctl() error. */
+int main(int argc, char ** argv)
+{
+        int dirfd, wantedi, rc;
+
+        /* Both arguments are mandatory: argv[2] is parsed unconditionally
+         * below, and with argc == 2 it is the NULL terminator of argv. */
+        if (argc != 3)
+                usage(argv[0], stderr);
+        dirfd = open(argv[1], O_RDONLY);
+        if (dirfd < 0) {
+                perror("open");
+                exit(1);
+        }
+        wantedi = atoi(argv[2]);
+        printf("Creating %s/%d with ino %d\n", argv[1], wantedi, wantedi);
+        rc = ioctl(dirfd, EXTN_IOC_CREATE_INUM, wantedi);
+        if (rc < 0) {
+                perror("ioctl(EXTN_IOC_CREATE_INUM)");
+                exit(2);
+        }
+        return 0;
+}
diff --git a/lustre/utils/automatic-reconnect-sample b/lustre/utils/automatic-reconnect-sample
new file mode 100755 (executable)
index 0000000..bf9ecc4
--- /dev/null
@@ -0,0 +1,34 @@
+#!/bin/sh
+# Sample Lustre upcall: "$1" is the UUID that the lctl commands below act on.
+[ -n "$1" ] || {
+  echo "No UUID given to Lustre upcall!" | wall
+  exit 1
+}
+
+# FIXME: OSTHOST can't be hard-coded!
+OST=$1
+OSTHOST=dev7
+LUSTRE=/home/pschwan/lustre/lustre
+# Retry, sleeping 2 seconds between attempts, until a single ping succeeds.
+until ping -c 1 -w 3 $OSTHOST; do
+  sleep 2
+done
+
+echo -n "OST $OSTHOST UUID $OST responding to pings : "
+date
+# lctl: close and delete the UUID, connect to the host on port 988, re-add it.
+$LUSTRE/utils/lctl <<EOF
+network tcp
+close_uuid $OST
+del_uuid $OST
+connect $OSTHOST 988
+add_uuid $OST $OSTHOST
+quit
+EOF
+# The backslash stops shell expansion, so lctl receives the literal $RPCDEV.
+$LUSTRE/utils/lctl <<EOF
+device \$RPCDEV
+probe
+newconn $OST
+quit
+EOF
index 170c5d0..46549cc 100755 (executable)
@@ -24,8 +24,8 @@
 #
 # Based in part on the XML obdctl modifications done by Brian Behlendorf 
 
-import sys, getopt
-import string, os, stat, popen2, socket, time, random
+import sys, getopt, types
+import string, os, stat, popen2, socket, time, random, fcntl, FCNTL, select
 import re, exceptions
 import xml.dom.minidom
 
@@ -50,8 +50,10 @@ def usage():
     print """usage: lconf config.xml
 
 config.xml          Lustre configuration in xml format.
---get <url>         URL to fetch a config file
+--ldapurl           LDAP server URL, eg. ldap://localhost
+--config            Cluster config name used for LDAP query
 --node <nodename>   Load config for <nodename>
+--select service=nodeA,service2=nodeB   U
 -d | --cleanup      Cleans up config. (Shutdown)
 -f | --force        Forced unmounting and/or obd detach during cleanup
 -v | --verbose      Print system commands as they are run
@@ -73,7 +75,7 @@ config.xml          Lustre configuration in xml format.
                             30 - obd, mdd
                             40 - mds, ost
                             50 - mdc, osc
-                            60 - lov, lovconfig
+                            60 - lov
                             70 - mountpoint, echo_client
 --lustre=src_dir    Base directory of lustre sources. This parameter will cause lconf
                     to load modules from a source tree.
@@ -112,8 +114,11 @@ class Config:
         self._portals_dir = ''
        self._minlevel = 0
        self._maxlevel = 100
-        self._timeout = -1
+        self._timeout = 0
         self._recovery_upcall = ''
+        self._ldapurl = ''
+        self._config_name = ''
+        self._select = {}
 
     def verbose(self, flag = None):
         if flag: self._verbose = flag
@@ -151,10 +156,6 @@ class Config:
         if val: self._node = val
         return self._node
 
-    def url(self, val = None):
-        if val: self._url = val
-        return self._url
-
     def gdb_script(self):
         if os.path.isdir('/r'):
             return '/r' + self._gdb_script
@@ -170,7 +171,6 @@ class Config:
     def dump_file(self, val = None):
         if val: self._dump_file = val
         return self._dump_file
-
     def minlevel(self, val = None):
         if val: self._minlevel = int(val)
         return self._minlevel
@@ -195,6 +195,27 @@ class Config:
         if val: self._recovery_upcall = val
         return self._recovery_upcall
 
+    def ldapurl(self, val = None):
+        self._ldapurl = val or self._ldapurl  # getter/setter: only a truthy val replaces the stored URL
+        return self._ldapurl
+
+    def config_name(self, val = None):
+        self._config_name = val or self._config_name  # getter/setter: only a truthy val replaces the stored name
+        return self._config_name
+
+    def init_select(self, arg):
+        """Fill self._select from a string like "service=nodeA,service2=nodeB"."""
+        for pair in string.split(arg, ','):
+            # every comma-separated item must split into exactly two parts at '='
+            service, target = string.split(pair, '=')
+            self._select[service] = target
+        
+    def select(self, srv):
+        """Return the node recorded for service 'srv', or None if there is none."""
+        # dict.get() already yields None for a key that was never recorded
+        return self._select.get(srv)
+
+
 config = Config()
 
 # ============================================================ 
@@ -272,6 +293,10 @@ class LCTLInterface:
             else:
                 raise CommandError('lctl', "unable to find lctl binary.")
 
+    def set_nonblock(self, fd):
+        flags = fcntl.fcntl(fd, FCNTL.F_GETFL) | os.O_NDELAY  # current status flags plus O_NDELAY
+        fcntl.fcntl(fd, FCNTL.F_SETFL, flags)
+
     def run(self, cmds):
         """
         run lctl
@@ -283,19 +308,42 @@ class LCTLInterface:
         """
         debug("+", self.lctl, cmds)
         if config.noexec(): return (0, [])
-        p = popen2.Popen3(self.lctl, 1)
-        p.tochild.write(cmds + "\n")
-        p.tochild.close()
-        out = p.fromchild.readlines()
-        err = p.childerr.readlines()
-        ret = p.wait()
+
+        child = popen2.Popen3(self.lctl, 1) # Capture stdout and stderr from command
+        child.tochild.write(cmds + "\n")
+        child.tochild.close()
+
+        # From "Python Cookbook" from O'Reilly
+        outfile = child.fromchild
+        outfd = outfile.fileno()
+        self.set_nonblock(outfd)
+        errfile = child.childerr
+        errfd = errfile.fileno()
+        self.set_nonblock(errfd)
+
+        outdata = errdata = ''
+        outeof = erreof = 0
+        while 1:
+            ready = select.select([outfd,errfd],[],[]) # Wait for input
+            if outfd in ready[0]:
+                outchunk = outfile.read()
+                if outchunk == '': outeof = 1
+                outdata = outdata + outchunk
+            if errfd in ready[0]:
+                errchunk = errfile.read()
+                if errchunk == '': erreof = 1
+                errdata = errdata + errchunk
+            if outeof and erreof: break
+        # end of "borrowed" code
+
+        ret = child.wait()
         if os.WIFEXITED(ret):
             rc = os.WEXITSTATUS(ret)
         else:
             rc = 0
-        if rc or len(err):
-            raise CommandError(self.lctl, err, rc)
-        return rc, out
+        if rc or len(errdata):
+            raise CommandError(self.lctl, errdata, rc)
+        return rc, outdata
 
     def runcmd(self, *args):
         """
@@ -587,8 +635,12 @@ def init_loop(file, size, fstype):
         return dev
     if config.reformat()  or not os.access(file, os.R_OK | os.W_OK):
         if size < 8000:
-            error(file, "size must be larger than 8MB")
-        run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size,  file))
+            panic(file, "size must be larger than 8MB, currently set to:", size)
+        (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size,
+                                                                         file))
+        if ret:
+            panic("Unable to create backing store:", file)
+
     loop = loop_base()
     # find next free loop
     for n in xrange(0, MAX_LOOP_DEVICES):
@@ -707,11 +759,11 @@ class Module:
     """ Base class for the rest of the modules. The default cleanup method is
     defined here, as well as some utilitiy funcs.
     """
-    def __init__(self, module_name, dom_node):
-        self.dom_node = dom_node
+    def __init__(self, module_name, db):
+        self.db = db
         self.module_name = module_name
-        self.name = get_attr(dom_node, 'name')
-        self.uuid = get_attr(dom_node, 'uuid')
+        self.name = self.db.getName()
+        self.uuid = self.db.getUUID()
         self.kmodule_list = []
         self._server = None
         self._connected = 0
@@ -720,10 +772,9 @@ class Module:
         msg = string.join(map(str,args))
         print self.module_name + ":", self.name, self.uuid, msg
 
-
     def lookup_server(self, srv_uuid):
         """ Lookup a server's network information """
-        net = get_ost_net(self.dom_node.parentNode, srv_uuid)
+        net = self.db.get_ost_net(srv_uuid)
         if not net:
             panic ("Unable to find a server for:", srv_uuid)
         self._server = Network(net)
@@ -806,13 +857,13 @@ class Module:
         
 
 class Network(Module):
-    def __init__(self,dom_node):
-        Module.__init__(self, 'NETWORK', dom_node)
-        self.net_type = get_attr(dom_node,'type')
-        self.nid = get_text(dom_node, 'server', '*')
-        self.port = get_text_int(dom_node, 'port', 0)
-        self.send_mem = get_text_int(dom_node, 'send_mem', DEFAULT_TCPBUF)
-        self.recv_mem = get_text_int(dom_node, 'recv_mem', DEFAULT_TCPBUF)
+    def __init__(self,db):
+        Module.__init__(self, 'NETWORK', db)
+        self.net_type = self.db.get_val('nettype')
+        self.nid = self.db.get_val('nid', '*')
+        self.port = self.db.get_val_int('port', 0)
+        self.send_mem = self.db.get_val_int('send_mem', DEFAULT_TCPBUF)
+        self.recv_mem = self.db.get_val_int('recv_mem', DEFAULT_TCPBUF)
         if '*' in self.nid:
             self.nid = get_local_address(self.net_type, self.nid)
             if not self.nid:
@@ -842,20 +893,15 @@ class Network(Module):
             ret, out = run(TCP_ACCEPTOR, '-s', self.send_mem, '-r', self.recv_mem, nal_id, self.port)
             if ret:
                 raise CommandError(TCP_ACCEPTOR, out, ret)
-        ret = self.dom_node.getElementsByTagName('route_tbl')
-        for a in ret:
-            for r in a.getElementsByTagName('route'):
-                net_type = get_attr(r, 'type')
-                gw = get_attr(r, 'gw')
-                lo = get_attr(r, 'lo')
-                hi = get_attr(r,'hi', '')
-                lctl.add_route(net_type, gw, lo, hi)
-                if net_type in ('tcp', 'toe') and net_type == self.net_type and hi == '':
-                    srv = nid2server(self.dom_node.parentNode.parentNode, lo)
-                    if not srv:
-                        panic("no server for nid", lo)
-                    else:
-                        lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
+        for net_type, gw, lo, hi in self.db.get_route_tbl():  # add every route in the table
+            lctl.add_route(net_type, gw, lo, hi)
+            if net_type in ('tcp', 'toe') and net_type == self.net_type and hi == '':
+                srvdb = self.db.nid2server(lo)  # route with an empty 'hi': look up the server for 'lo'
+                if not srvdb:  # check the lookup result; 'srv' is not assigned until the else branch
+                    panic("no server for nid", lo)
+                else:
+                    srv = Network(srvdb)
+                    lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
 
             
         lctl.network(self.net_type, self.nid)
@@ -863,28 +909,25 @@ class Network(Module):
 
     def cleanup(self):
         self.info(self.net_type, self.nid, self.port)
-        ret = self.dom_node.getElementsByTagName('route_tbl')
-        for a in ret:
-            for r in a.getElementsByTagName('route'):
-                lo = get_attr(r, 'lo')
-                hi = get_attr(r,'hi', '')
-                if self.net_type in ('tcp', 'toe') and hi == '':
-                    srv = nid2server(self.dom_node.parentNode.parentNode, lo)
-                    if not srv:
-                        panic("no server for nid", lo)
-                    else:
-                        try:
-                            lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
-                        except CommandError, e:
-                            print "disconnect failed: ", self.name
-                            e.dump()
-                            cleanup_error(e.rc)
-                try:
-                    lctl.del_route(self.net_type, self.nid, lo, hi)
-                except CommandError, e:
-                    print "del_route failed: ", self.name
-                    e.dump()
-                    cleanup_error(e.rc)
+        for net_type, gw, lo, hi in self.db.get_route_tbl():  # disconnect/delete every route in the table
+            if self.net_type in ('tcp', 'toe') and hi == '':
+                srvdb = self.db.nid2server(lo)  # route with an empty 'hi': look up the server for 'lo'
+                if not srvdb:  # check the lookup result; 'srv' is not assigned until the else branch
+                    panic("no server for nid", lo)
+                else:
+                    srv = Network(srvdb)
+                    try:
+                        lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
+                    except CommandError, e:  # report the failure and keep cleaning up
+                        print "disconnect failed: ", self.name
+                        e.dump()
+                        cleanup_error(e.rc)
+            try:
+                lctl.del_route(self.net_type, self.nid, lo, hi)
+            except CommandError, e:  # report the failure and move on to the next route
+                print "del_route failed: ", self.name
+                e.dump()
+                cleanup_error(e.rc)
               
         try:
             lctl.cleanup("RPCDEV", "RPCDEV_UUID")
@@ -903,8 +946,8 @@ class Network(Module):
             run("killall acceptor")
 
 class LDLM(Module):
-    def __init__(self,dom_node):
-        Module.__init__(self, 'LDLM', dom_node)
+    def __init__(self,db):
+        Module.__init__(self, 'LDLM', db)
         self.add_lustre_module('ldlm', 'ldlm') 
     def prepare(self):
         if is_prepared(self.uuid):
@@ -914,19 +957,16 @@ class LDLM(Module):
                     setup ="")
 
 class LOV(Module):
-    def __init__(self,dom_node):
-        Module.__init__(self, 'LOV', dom_node)
-        self.mds_uuid = get_first_ref(dom_node, 'mds')
-        mds= lookup(dom_node.parentNode, self.mds_uuid)
-        self.mds_name = getName(mds)
-        devs = dom_node.getElementsByTagName('devices')
-        if len(devs) > 0:
-            dev_node = devs[0]
-            self.stripe_sz = get_attr_int(dev_node, 'stripesize', 65536)
-            self.stripe_off = get_attr_int(dev_node, 'stripeoffset', 0)
-            self.pattern = get_attr_int(dev_node, 'pattern', 0)
-            self.devlist = get_all_refs(dev_node, 'obd')
-            self.stripe_cnt = get_attr_int(dev_node, 'stripecount', len(self.devlist))
+    def __init__(self,db):
+        Module.__init__(self, 'LOV', db)
+        self.mds_uuid = self.db.get_first_ref('mds')
+        mds= self.db.lookup(self.mds_uuid)
+        self.mds_name = mds.getName()
+        self.stripe_sz = self.db.get_val_int('stripesize', 65536)
+        self.stripe_off = self.db.get_val_int('stripeoffset', 0)
+        self.pattern = self.db.get_val_int('stripepattern', 0)
+        self.devlist = self.db.get_refs('obd')
+        self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
         self.add_lustre_module('mdc', 'mdc')
         self.add_lustre_module('lov', 'lov')
 
@@ -934,7 +974,7 @@ class LOV(Module):
         if is_prepared(self.uuid):
             return
         for obd_uuid in self.devlist:
-            obd = lookup(self.dom_node.parentNode, obd_uuid)
+            obd = self.db.lookup(obd_uuid)
             osc = get_osc(obd)
             if osc:
                 try:
@@ -945,7 +985,7 @@ class LOV(Module):
                     print "Error preparing OSC %s (inactive)\n" % osc_uuid
             else:
                 panic('osc not found:', osc_uuid)
-        mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
+        mdc_uuid = prepare_mdc(self.db, self.mds_uuid)
         self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
                   self.stripe_off, self.pattern, self.devlist, self.mds_name)
         lctl.newdev(attach="lov %s %s" % (self.name, self.uuid),
@@ -955,19 +995,19 @@ class LOV(Module):
         if not is_prepared(self.uuid):
             return
         for obd_uuid in self.devlist:
-            obd = lookup(self.dom_node.parentNode, obd_uuid)
+            obd = self.db.lookup(obd_uuid)
             osc = get_osc(obd)
             if osc:
                 osc.cleanup()
             else:
                 panic('osc not found:', osc_uuid)
         Module.cleanup(self)
-        cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)
+        cleanup_mdc(self.db, self.mds_uuid)
 
 
     def load_module(self):
         for obd_uuid in self.devlist:
-            obd = lookup(self.dom_node.parentNode, obd_uuid)
+            obd = self.db.lookup(obd_uuid)
             osc = get_osc(obd)
             if osc:
                 osc.load_module()
@@ -980,7 +1020,7 @@ class LOV(Module):
     def cleanup_module(self):
         Module.cleanup_module(self)
         for obd_uuid in self.devlist:
-            obd = lookup(self.dom_node.parentNode, obd_uuid)
+            obd = self.db.lookup(obd_uuid)
             osc = get_osc(obd)
             if osc:
                 osc.cleanup_module()
@@ -989,10 +1029,11 @@ class LOV(Module):
                 panic('osc not found:', osc_uuid)
 
 class LOVConfig(Module):
-    def __init__(self,dom_node):
-        Module.__init__(self, 'LOVConfig', dom_node)
-        self.lov_uuid = get_first_ref(dom_node, 'lov')
-        l = lookup(dom_node.parentNode, self.lov_uuid)
+    def __init__(self,db):
+        Module.__init__(self, 'LOVConfig', db)
+
+        self.lov_uuid = self.db.get_first_ref('lov')
+        l = self.db.lookup(self.lov_uuid)
         self.lov = LOV(l)
         
     def prepare(self):
@@ -1007,18 +1048,24 @@ class LOVConfig(Module):
         #nothing to do here
         pass
 
-
-class MDS(Module):
-    def __init__(self,dom_node):
-        Module.__init__(self, 'MDS', dom_node)
-        self.devname, self.size = get_device(dom_node)
-        self.fstype = get_text(dom_node, 'fstype')
+class MDSDEV(Module):
+    def __init__(self,db):
+        Module.__init__(self, 'MDSDEV', db)
+        self.devname = self.db.get_val('devpath','')
+        self.size = self.db.get_val_int('devsize', 0)
+        self.fstype = self.db.get_val('fstype', '')
+        # overwrite the orignal MDSDEV name and uuid with the MDS name and uuid
+        self.uuid = self.db.get_first_ref('mds')
+        mds = self.db.lookup(self.uuid)
+        self.name = mds.getName()
+        self.lovconfig_uuids = mds.get_refs('lovconfig')
         # FIXME: if fstype not set, then determine based on kernel version
-        self.format = get_text(dom_node, 'autoformat', "no")
+        self.format = self.db.get_val('autoformat', "no")
         if self.fstype == 'extN':
             self.add_lustre_module('extN', 'extN') 
         self.add_lustre_module('mds', 'mds')
-        self.add_lustre_module('obdclass', 'fsfilt_%s'%(self.fstype))
+        if self.fstype:
+            self.add_lustre_module('obdclass', 'fsfilt_%s' % (self.fstype))
             
     def prepare(self):
         if is_prepared(self.uuid):
@@ -1030,6 +1077,11 @@ class MDS(Module):
                         setup ="")
         lctl.newdev(attach="mds %s %s" % (self.name, self.uuid),
                     setup ="%s %s" %(blkdev, self.fstype))
+        for uuid in self.lovconfig_uuids:
+            db = self.db.lookup(uuid)
+            lovconfig = LOVConfig(db)
+            lovconfig.prepare()
+            
     def cleanup(self):
         if is_prepared('MDT_UUID'):
             try:
@@ -1046,40 +1098,49 @@ class MDS(Module):
 # Very unusual case, as there is no MDC element in the XML anymore
 # Builds itself from an MDS node
 class MDC(Module):
-    def __init__(self,dom_node):
-        self.mds = MDS(dom_node)
-        self.dom_node = dom_node
+    def __init__(self,db):
+        self.mds_uuid = db.getUUID()
+        self.mds_name = db.getName()
+        self.db = db
+        node_name =  config.select(self.mds_name)
+        if node_name:
+            self.mdd_uuid = self.db.get_mdd(node_name, self.mds_uuid)
+        else:
+            self.mdd_uuid = db.get_first_ref('active')
+        if not self.mdd_uuid:
+            panic("No MDSDEV found for MDS service:", self.mds_name)
         self.module_name = 'MDC'
         self.kmodule_list = []
         self._server = None
         self._connected = 0
 
         host = socket.gethostname()
-        self.name = 'MDC_%s' % (self.mds.name)
+        self.name = 'MDC_%s' % (self.mds_name)
         self.uuid = '%s_%05x_%05x' % (self.name, int(random.random() * 1048576),
                                       int(random.random() * 1048576))
 
-        self.lookup_server(self.mds.uuid)
+        self.lookup_server(self.mdd_uuid)
         self.add_lustre_module('mdc', 'mdc')
 
     def prepare(self):
         if is_prepared(self.uuid):
             return
-        self.info(self.mds.uuid)
+        self.info(self.mds_uuid)
         srv = self.get_server()
         lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
         lctl.newdev(attach="mdc %s %s" % (self.name, self.uuid),
-                        setup ="%s %s" %(self.mds.uuid, srv.uuid))
+                        setup ="%s %s" %(self.mds_uuid, srv.uuid))
             
 class OBD(Module):
-    def __init__(self, dom_node):
-        Module.__init__(self, 'OBD', dom_node)
-        self.obdtype = get_attr(dom_node, 'type')
-        self.devname, self.size = get_device(dom_node)
-        self.fstype = get_text(dom_node, 'fstype')
-        self.active_target = get_text(dom_node, 'active_target')
+    def __init__(self, db):
+        Module.__init__(self, 'OBD', db)
+        self.obdtype = self.db.get_val('obdtype')
+        self.devname = self.db.get_val('devpath', '')
+        self.size = self.db.get_val_int('devsize', 0)
+        self.fstype = self.db.get_val('fstype', '')
+        self.active_target = self.db.get_first_ref('active')
         # FIXME: if fstype not set, then determine based on kernel version
-        self.format = get_text(dom_node, 'autoformat', 'yes')
+        self.format = self.db.get_val('autoformat', 'yes')
         if self.fstype == 'extN':
             self.add_lustre_module('extN', 'extN') 
         self.add_lustre_module(self.obdtype, self.obdtype)
@@ -1107,10 +1168,10 @@ class OBD(Module):
             clean_loop(self.devname)
 
 class COBD(Module):
-    def __init__(self, dom_node):
-        Module.__init__(self, 'COBD', dom_node)
-        self.real_uuid = get_first_ref(dom_node, 'real_obd')
-        self.cache_uuid = get_first_ref(dom_node, 'cache_obd')
+    def __init__(self, db):
+        Module.__init__(self, 'COBD', db)
+        self.real_uuid = self.db.get_first_ref('realobd')
+        self.cache_uuid = self.db.get_first_ref('cacheobd')
         self.add_lustre_module('cobd' , 'cobd')
 
     # need to check /proc/mounts and /etc/mtab before
@@ -1124,9 +1185,9 @@ class COBD(Module):
                     setup ="%s %s" %(self.real_uuid, self.cache_uuid))
 
 class OST(Module):
-    def __init__(self,dom_node):
-        Module.__init__(self, 'OST', dom_node)
-        self.obd_uuid = get_first_ref(dom_node, 'obd')
+    def __init__(self,db):
+        Module.__init__(self, 'OST', db)
+        self.obd_uuid = self.db.get_first_ref('obd')
         self.add_lustre_module('ost', 'ost')
 
     def prepare(self):
@@ -1139,12 +1200,12 @@ class OST(Module):
 
 # virtual interface for  OSC and LOV
 class VOSC(Module):
-    def __init__(self,dom_node):
-        Module.__init__(self, 'VOSC', dom_node)
-        if dom_node.nodeName == 'lov':
-            self.osc = LOV(dom_node)
+    def __init__(self,db):
+        Module.__init__(self, 'VOSC', db)
+        if db.get_class() == 'lov':
+            self.osc = LOV(db)
         else:
-            self.osc = get_osc(dom_node)
+            self.osc = get_osc(db)
     def get_uuid(self):
         return self.osc.uuid
     def prepare(self):
@@ -1158,8 +1219,8 @@ class VOSC(Module):
         
 
 class OSC(Module):
-    def __init__(self, dom_node, obd_name, obd_uuid, ost_uuid):
-        self.dom_node = dom_node
+    def __init__(self, db, obd_name, obd_uuid, ost_uuid):
+        self.db = db
         self.module_name = 'OSC'
         self.name = 'OSC_%s' % (obd_name)
         self.uuid = '%s_%05x' % (self.name, int(random.random() * 1048576))
@@ -1169,6 +1230,7 @@ class OSC(Module):
 
         self.obd_uuid = obd_uuid
         self.ost_uuid = ost_uuid
+        debug("OSC:", obd_uuid, ost_uuid)
         self.lookup_server(self.ost_uuid)
         self.add_lustre_module('osc', 'osc')
 
@@ -1211,11 +1273,11 @@ class OSC(Module):
             
 
 class ECHO_CLIENT(Module):
-    def __init__(self,dom_node):
-        Module.__init__(self, 'ECHO_CLIENT', dom_node)
+    def __init__(self,db):
+        Module.__init__(self, 'ECHO_CLIENT', db)
         self.add_lustre_module('obdecho', 'obdecho')
-        self.obd_uuid = get_first_ref(dom_node, 'obd')
-        obd = lookup(self.dom_node.parentNode, self.obd_uuid)
+        self.obd_uuid = self.db.get_first_ref('obd')
+        obd = self.db.lookup(self.obd_uuid)
         self.osc = VOSC(obd)
 
     def prepare(self):
@@ -1223,9 +1285,9 @@ class ECHO_CLIENT(Module):
             return
         self.osc.prepare() # XXX This is so cheating. -p
         self.info(self.obd_uuid)
-            
+
         lctl.newdev(attach="echo_client %s %s" % (self.name, self.uuid),
-                    setup = self.obd_uuid)
+                    setup = self.osc.get_uuid())
 
     def cleanup(self):
         if not is_prepared(self.uuid):
@@ -1241,20 +1303,20 @@ class ECHO_CLIENT(Module):
 
 
 class Mountpoint(Module):
-    def __init__(self,dom_node):
-        Module.__init__(self, 'MTPT', dom_node)
-        self.path = get_text(dom_node, 'path')
-        self.mds_uuid = get_first_ref(dom_node, 'mds')
-        self.obd_uuid = get_first_ref(dom_node, 'obd')
+    def __init__(self,db):
+        Module.__init__(self, 'MTPT', db)
+        self.path = self.db.get_val('path')
+        self.mds_uuid = self.db.get_first_ref('mds')
+        self.obd_uuid = self.db.get_first_ref('obd')
         self.add_lustre_module('mdc', 'mdc')
         self.add_lustre_module('llite', 'llite')
-        obd = lookup(self.dom_node.parentNode, self.obd_uuid)
+        obd = self.db.lookup(self.obd_uuid)
         self.osc = VOSC(obd)
 
 
     def prepare(self):
         self.osc.prepare()
-        mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
+        mdc_uuid = prepare_mdc(self.db, self.mds_uuid)
         self.info(self.path, self.mds_uuid, self.obd_uuid)
         cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \
               (self.osc.get_uuid(), mdc_uuid, self.path)
@@ -1277,7 +1339,7 @@ class Mountpoint(Module):
             panic("fs is still mounted:", self.path)
 
         self.osc.cleanup()
-        cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)
+        cleanup_mdc(self.db, self.mds_uuid)
 
     def load_module(self):
         self.osc.load_module()
@@ -1297,195 +1359,416 @@ def get_osc(obd_dom):
     osc = OSC(obd_dom, obd.name, obd.uuid, obd.active_target)
     return osc
 
+class LustreDB:
+    def lookup(self, uuid):
+        """ lookup returns a new LustreDB instance"""
+        return self._lookup_by_uuid(uuid)
+
+    def lookup_name(self, name, class_name = ""):
+        """ lookup returns a new LustreDB instance"""
+        return self._lookup_by_name(name, class_name)
+
+    def lookup_class(self, class_name):
+        """ lookup returns a new LustreDB instance"""
+        return self._lookup_by_class(class_name)
+
+    def get_val(self, tag, default=None):
+        v =  self._get_val(tag)
+        if v:
+            return v
+        if default != None:
+            return default
+        debug("LustreDB", self.getName(), " no value for:", tag)
+        return None
 
-def get_device(obd):
-    list = obd.getElementsByTagName('device')
-    if len(list) > 0:
-        dev = list[0]
-        dev.normalize();
-        size = get_attr_int(dev, 'size', 0)
-        return dev.firstChild.data, size
-    return '', 0
-
-# Get the text content from the first matching child
-# If there is no content (or it is all whitespace), return
-# the default
-def get_text(dom_node, tag, default=""):
-    list = dom_node.getElementsByTagName(tag)
-    if len(list) > 0:
-        dom_node = list[0]
-        dom_node.normalize()
-        if dom_node.firstChild:
-            txt = string.strip(dom_node.firstChild.data)
-            if txt:
-                return txt
-    return default
-
-def get_text_int(dom_node, tag, default=0):
-    list = dom_node.getElementsByTagName(tag)
-    n = default
-    if len(list) > 0:
-        dom_node = list[0]
-        dom_node.normalize()
-        if dom_node.firstChild:
-            txt = string.strip(dom_node.firstChild.data)
-            if txt:
-                try:
-                    n = int(txt)
-                except ValueError:
-                    panic("text value is not integer:", txt)
-    return n
-
-def get_attr(dom_node, attr, default=""):
-    v = dom_node.getAttribute(attr)
-    if v:
-        return v
-    return default
-
-def get_attr_int(dom_node, attr, default=0):
-    n = default
-    v = dom_node.getAttribute(attr)
-    if v:
+    def get_class(self):
+        return self._get_class()
+
+    def get_val_int(self, tag, default=0):
+        str = self._get_val(tag)
         try:
-            n = int(v)
+            if str:
+                return int(str)
+            return default
         except ValueError:
-            panic("attr value is not integer", v)
-    return n
-
-def get_first_ref(dom_node, tag):
-    """ Get the first uuidref of the type TAG. Used one only
-    one is expected.  Returns the uuid."""
-    uuid = None
-    refname = '%s_ref' % tag
-    list = dom_node.getElementsByTagName(refname)
-    if len(list) > 0:
-        uuid = getRef(list[0])
-    return uuid
+            panic("text value is not integer:", str)
+            
+    def get_first_ref(self, tag):
+        """ Get the first uuidref of the type TAG. Only
+        one is expected.  Returns the uuid."""
+        uuids = self._get_refs(tag)
+        if len(uuids) > 0:
+            return  uuids[0]
+        return None
     
-def get_all_refs(dom_node, tag):
-    """ Get all the refs of type TAG.  Returns list of uuids. """
-    uuids = []
-    refname = '%s_ref' % tag
-    list = dom_node.getElementsByTagName(refname)
-    if len(list) > 0:
-        for i in list:
-            uuids.append(getRef(i))
-    return uuids
-
-def get_ost_net(dom_node, uuid):
-    ost = lookup(dom_node, uuid)
-    uuid = get_first_ref(ost, 'network')
-    if not uuid:
+    def get_refs(self, tag):
+        """ Get all the refs of type TAG.  Returns list of uuids. """
+        uuids = self._get_refs(tag)
+        return uuids
+
+    def get_all_refs(self):
+        """ Get all the refs.  Returns list of uuids. """
+        uuids = self._get_all_refs()
+        return uuids
+
+    def get_ost_net(self, uuid):
+        ost = self.lookup(uuid)
+        uuid = ost.get_first_ref('network')
+        if not uuid:
+            return None
+        return ost.lookup(uuid)
+
+    def nid2server(self, nid):
+        """ Return the db entry of the 'network' whose 'nid' value
+        equals NID, or None if there is no such network. """
+        netlist = self.lookup_class('network')
+        for net_db in netlist:
+            if net_db.get_val('nid') == nid: 
+                return net_db
         return None
-    return lookup(dom_node, uuid)
-
-def nid2server(dom_node, nid):
-    netlist = dom_node.getElementsByTagName('network')
-    for net_node in netlist:
-        if get_text(net_node, 'server') == nid:
-            return Network(net_node)
-    return None
     
-def lookup(dom_node, uuid):
-    for n in dom_node.childNodes:
-        if n.nodeType == n.ELEMENT_NODE:
-            if getUUID(n) == uuid:
-                return n
+    # the tag name is the service type
+    # fixme: this should do some checks to make sure the dom_node is a service
+    #
+    # determine what "level" a particular node is at.
+    
+    # the order of initialization is based on level.
+    def getServiceLevel(self):
+        type = self.get_class()
+        ret=0;
+        if type in ('network',):
+            ret = 10
+        elif type in ('device', 'ldlm'):
+            ret = 20
+        elif type in ('obd', 'mdd', 'cobd'):
+            ret = 30
+        elif type in ('mdsdev','ost'):
+            ret = 40
+        elif type in ('mdc','osc'):
+            ret = 50
+        elif type in ('lov',):
+            ret = 60
+        elif type in ('mountpoint', 'echoclient'):
+            ret = 70
+
+        if ret < config.minlevel() or ret > config.maxlevel():
+            ret = 0 
+        return ret
+    
+    #
+    # return list of services in a profile. list is a list of tuples
+    # [(level, db_object),]
+    def getServices(self):
+        list = []
+        for ref_class, ref_uuid in self.get_all_refs(): 
+                servdb = self.lookup(ref_uuid)
+                if  servdb:
+                    level = servdb.getServiceLevel()
+                    if level > 0:
+                        list.append((level, servdb))
+                else:
+                    panic('service not found: ' + ref_uuid)
+                    
+        list.sort()
+        return list
+
+    # Find the mdsdev attached to node_name that points to
+    # mds_uuid
+    # node->profiles->mdsdev_refs->mds
+    def get_mdd(self, node_name, mds_uuid):
+        node_db = self.lookup_name(node_name)
+        if not node_db:
+            return None
+        prof_list = node_db.get_refs('profile')
+        for prof_uuid in prof_list:
+            prof_db = node_db.lookup(prof_uuid)
+            mdd_list = prof_db.get_refs('mdsdev')
+            for mdd_uuid in mdd_list:
+                mdd = self.lookup(mdd_uuid)
+                if mdd.get_first_ref('mds') == mds_uuid:
+                    return mdd_uuid
+        return None
+        
+
+class LustreDB_XML(LustreDB):
+    def __init__(self, dom, root_node):
+        # init xmlfile
+        self.dom_node = dom
+        self.root_node = root_node
+
+    def xmltext(self, dom_node, tag):
+        list = dom_node.getElementsByTagName(tag)
+        if len(list) > 0:
+            dom_node = list[0]
+            dom_node.normalize()
+            if dom_node.firstChild:
+                txt = string.strip(dom_node.firstChild.data)
+                if txt:
+                    return txt
+
+    def xmlattr(self, dom_node, attr):
+        return dom_node.getAttribute(attr)
+
+    def _get_val(self, tag):
+        """a value could be an attribute of the current node
+        or the text value in a child node"""
+        ret  = self.xmlattr(self.dom_node, tag)
+        if not ret:
+            ret = self.xmltext(self.dom_node, tag)
+        return ret
+
+    def _get_class(self):
+        return self.dom_node.nodeName
+
+    #
+    # [(ref_class, ref_uuid),]
+    def _get_all_refs(self):
+        list = []
+        for n in self.dom_node.childNodes: 
+            if n.nodeType == n.ELEMENT_NODE:
+                ref_uuid = self.xml_get_ref(n)
+                ref_class = n.nodeName
+                list.append((ref_class, ref_uuid))
+                    
+        list.sort()
+        return list
+
+    def _get_refs(self, tag):
+        """ Get all the refs of type TAG.  Returns list of uuids. """
+        uuids = []
+        refname = '%s_ref' % tag
+        reflist = self.dom_node.getElementsByTagName(refname)
+        for r in reflist:
+            uuids.append(self.xml_get_ref(r))
+        return uuids
+
+    def xmllookup_by_uuid(self, dom_node, uuid):
+        for n in dom_node.childNodes:
+            if n.nodeType == n.ELEMENT_NODE:
+                if self.xml_get_uuid(n) == uuid:
+                    return n
+                else:
+                    n = self.xmllookup_by_uuid(n, uuid)
+                    if n: return n
+        return None
+
+    def _lookup_by_uuid(self, uuid):
+        dom = self. xmllookup_by_uuid(self.root_node, uuid)
+        if dom:
+            return LustreDB_XML(dom, self.root_node)
+
+    def xmllookup_by_name(self, dom_node, name):
+        for n in dom_node.childNodes:
+            if n.nodeType == n.ELEMENT_NODE:
+                if self.xml_get_name(n) == name:
+                    return n
+                else:
+                    n = self.xmllookup_by_name(n, name)
+                    if n: return n
+        return None
+
+    def _lookup_by_name(self, name, class_name):
+        dom = self.xmllookup_by_name(self.root_node, name)
+        if dom:
+            return LustreDB_XML(dom, self.root_node)
+
+    def xmllookup_by_class(self, dom_node, class_name):
+        return dom_node.getElementsByTagName(class_name)
+
+    def _lookup_by_class(self, class_name):
+        ret = []
+        domlist = self.xmllookup_by_class(self.root_node, class_name)
+        for node in domlist:
+            ret.append(LustreDB_XML(node, self.root_node))
+        return ret
+
+    def xml_get_name(self, n):
+        return n.getAttribute('name')
+        
+    def getName(self):
+        return self.xml_get_name(self.dom_node)
+
+    def xml_get_ref(self, n):
+        return n.getAttribute('uuidref')
+
+    def xml_get_uuid(self, dom_node):
+        return dom_node.getAttribute('uuid')
+
+    def getUUID(self):
+        return self.xml_get_uuid(self.dom_node)
+
+    def get_routes(self, type, gw):
+        """ Return the routes as a list of tuples of the form:
+        [(type, gw, lo, hi),]
+        TYPE and GW are copied unchanged into every tuple; lo and hi
+        are read from each 'route' element under a 'route_tbl'."""
+        res = []
+        tbl = self.dom_node.getElementsByTagName('route_tbl')
+        for t in tbl:
+            routes = t.getElementsByTagName('route')
+            for r in routes:
+                lo = self.xmlattr(r, 'lo')
+                # xmlattr() takes no default argument; getAttribute
+                # already yields '' when 'hi' is not set.
+                hi = self.xmlattr(r, 'hi')
+                res.append((type, gw, lo, hi))
+        return res
+
+    def get_route_tbl(self):
+        """ Return every route under this node's 'route_tbl' elements
+        as a list of tuples of the form:
+        [(net_type, gw, lo, hi),]"""
+        ret = []
+        tbls = self.dom_node.getElementsByTagName('route_tbl')
+        for tbl in tbls:
+            for r in tbl.getElementsByTagName('route'):
+                net_type = self.xmlattr(r, 'type')
+                gw = self.xmlattr(r, 'gw')
+                lo = self.xmlattr(r, 'lo')
+                # xmlattr() takes no default argument; getAttribute
+                # already yields '' when 'hi' is not set.
+                hi = self.xmlattr(r, 'hi')
+                ret.append((net_type, gw, lo, hi))
+        return ret
+
+
+# ================================================================    
+# LDAP Support
+class LustreDB_LDAP(LustreDB):
+    def __init__(self, name, attrs,
+                 base = "fs=lustre",
+                 parent = None,
+                 url  = "ldap://localhost",
+                 user = "cn=Manager, fs=lustre",
+                 pw   = "secret"
+                 ):
+        self._name = name
+        self._attrs = attrs
+        self._base = base
+        self._parent = parent
+        self._url  = url
+        self._user = user
+        self._pw   = pw
+        if parent:
+            self.l = parent.l
+            self._base = parent._base
+        else:
+            self.open()
+
+    def open(self):
+        """ Connect to the LDAP server at self._url and bind with an
+        empty DN and password.  Calls panic() if the ldap library
+        raises an LDAPError. """
+        import ldap
+        try:
+            self.l = ldap.initialize(self._url)
+            # Set LDAP protocol version used
+            self.l.protocol_version=ldap.VERSION3
+            # user and pw only needed if modifying db
+            self.l.bind_s("", "", ldap.AUTH_SIMPLE);
+        except ldap.LDAPError, e:
+            panic(e)
+            # FIXME, do something useful here
+
+    def close(self):
+        self.l.unbind_s()
+
+    def ldap_search(self, filter):
+        """Return list of uuids matching the filter."""
+        import ldap
+        dn = self._base
+        ret = []
+        uuids = []
+        try:
+            for name, attrs in self.l.search_s(dn, ldap.SCOPE_ONELEVEL,
+                                        filter, ["uuid"]):
+                for v in attrs['uuid']:
+                    uuids.append(v)
+        except ldap.NO_SUCH_OBJECT, e:
+            pass
+        except ldap.LDAPError, e:
+            print e                     # FIXME: die here?
+        if len(uuids) > 0:
+            for uuid in uuids:
+                ret.append(self._lookup_by_uuid(uuid))
+        return ret
+
+    def _lookup_by_name(self, name, class_name):
+        list =  self.ldap_search("lustreName=%s" %(name))
+        if len(list) == 1:
+            return list[0]
+        return []
+
+    def _lookup_by_class(self, class_name):
+        return self.ldap_search("objectclass=%s" %(string.upper(class_name)))
+
+    def _lookup_by_uuid(self, uuid):
+        import ldap
+        dn = "uuid=%s,%s" % (uuid, self._base)
+        ret = None
+        try:
+            for name, attrs in self.l.search_s(dn, ldap.SCOPE_BASE,
+                                               "objectclass=*"):
+                ret = LustreDB_LDAP(name, attrs,  parent = self)
+                        
+        except ldap.NO_SUCH_OBJECT, e:
+            debug("NO_SUCH_OBJECT:", uuid)
+            pass                        # just return empty list
+        except ldap.LDAPError, e:
+            print e                     # FIXME: die here?
+        return ret
+
+
+    def _get_val(self, k):
+        ret = None
+        if self._attrs.has_key(k):
+            v = self._attrs[k]
+            if type(v) == types.ListType:
+                ret = str(v[0])
             else:
-                n = lookup(n, uuid)
-                if n: return n
-    return None
-            
-# Get name attribute of dom_node
-def getName(dom_node):
-    return dom_node.getAttribute('name')
+                ret = str(v)
+        return ret
 
-def getRef(dom_node):
-    return dom_node.getAttribute('uuidref')
+    def _get_class(self):
+        return string.lower(self._attrs['objectClass'][0])
 
-# Get name attribute of dom_node
-def getUUID(dom_node):
-    return dom_node.getAttribute('uuid')
+    #
+    # [(ref_class, ref_uuid),]
+    def _get_all_refs(self):
+        list = []
+        for k in self._attrs.keys():
+            if re.search('.*Ref', k):
+                for uuid in self._attrs[k]:
+                    list.append((k, uuid))
+        return list
 
-# the tag name is the service type
-# fixme: this should do some checks to make sure the dom_node is a service
-def getServiceType(dom_node):
-    return dom_node.nodeName
+    def _get_refs(self, tag):
+        """ Get all the refs of type TAG.  Returns list of uuids. """
+        uuids = []
+        refname = '%sRef' % tag
+        if self._attrs.has_key(refname):
+            return self._attrs[refname]
+        return []
 
-#
-# determine what "level" a particular node is at.
-# the order of iniitailization is based on level. 
-def getServiceLevel(dom_node):
-    type = getServiceType(dom_node)
-    ret=0;
-    if type in ('network',):
-        ret = 10
-    elif type in ('device', 'ldlm'):
-        ret = 20
-    elif type in ('obd', 'mdd', 'cobd'):
-        ret = 30
-    elif type in ('mds','ost'):
-        ret = 40
-    elif type in ('mdc','osc'):
-        ret = 50
-    elif type in ('lov', 'lovconfig'):
-        ret = 60
-    elif type in ('mountpoint', 'echo_client'):
-        ret = 70
-
-    if ret < config.minlevel() or ret > config.maxlevel():
-        ret = 0 
-    return ret
+    def getName(self):
+        return self._get_val('lustreName')
 
-#
-# return list of services in a profile. list is a list of tuples
-# [(level, dom_node),]
-def getServices(lustreNode, profileNode):
-    list = []
-    for n in profileNode.childNodes: 
-        if n.nodeType == n.ELEMENT_NODE:
-            servNode = lookup(lustreNode, getRef(n))
-            if not servNode:
-                print n
-                panic('service not found: ' + getRef(n))
-            level = getServiceLevel(servNode)
-           if level > 0:
-                list.append((level, servNode))
-    list.sort()
-    return list
-
-def getByName(lustreNode, name, tag):
-    ndList = lustreNode.getElementsByTagName(tag)
-    for nd in ndList:
-        if getName(nd) == name:
-            return nd
-    return None
-    
+    def getUUID(self):
+        return self._get_val('uuid')
+
+    def get_route_tbl(self):
+        return []
 
 ############################################################
 # MDC UUID hack - 
 # FIXME: clean this mess up!
 #
 saved_mdc = {}
-def prepare_mdc(dom_node, mds_uuid):
+def prepare_mdc(db, mds_uuid):
     global saved_mdc
-    mds_node = lookup(dom_node, mds_uuid);
-    if not mds_node:
+    mds_db = db.lookup(mds_uuid);
+    if not mds_db:
         panic("no mds:", mds_uuid)
     if saved_mdc.has_key(mds_uuid):
         return saved_mdc[mds_uuid]
-    mdc = MDC(mds_node)
+    mdc = MDC(mds_db)
     mdc.prepare()
     saved_mdc[mds_uuid] = mdc.uuid
     return mdc.uuid
 
-def cleanup_mdc(dom_node, mds_uuid):
+def cleanup_mdc(db, mds_uuid):
     global saved_mdc
-    mds_node = lookup(dom_node, mds_uuid);
-    if not mds_node:
+    mds_db = db.lookup(mds_uuid);
+    if not mds_db:
         panic("no mds:", mds_uuid)
     if not saved_mdc.has_key(mds_uuid):
-        mdc = MDC(mds_node)
+        mdc = MDC(mds_db)
         mdc.cleanup()
         saved_mdc[mds_uuid] = mdc.uuid
         
@@ -1497,58 +1780,45 @@ routes = []
 local_node = []
 router_flag = 0
 
-def init_node(dom_node):
+def init_node(node_db):
     global local_node, router_flag
-    netlist = dom_node.getElementsByTagName('network')
-    for dom_net in netlist:
-        type = get_attr(dom_net, 'type')
-        gw = get_text(dom_net, 'server')
+    netlist = node_db.lookup_class('network')
+    for db in netlist:
+        type = db.get_val('nettype')
+        gw = db.get_val('nid')
         local_node.append((type, gw))
 
 def node_needs_router():
     return router_flag
 
-def get_routes(type, gw, dom_net):
-    """ Return the routes as a list of tuples of the form:
-        [(type, gw, lo, hi),]"""
-    res = []
-    tbl = dom_net.getElementsByTagName('route_tbl')
-    for t in tbl:
-        routes = t.getElementsByTagName('route')
-        for r in routes:
-            lo = get_attr(r, 'lo')
-            hi = get_attr(r, 'hi', '')
-            res.append((type, gw, lo, hi))
-    return res
-    
-
 def init_route_config(lustre):
     """ Scan the lustre config looking for routers.  Build list of
     routes. """
     global routes, router_flag
     routes = []
-    list = lustre.getElementsByTagName('node')
-    for node in list:
-        if get_attr(node, 'router'):
+    list = lustre.lookup_class('node')
+    for node_db in list:
+        if node_db.get_val_int('router', 0):
             router_flag = 1
             for (local_type, local_nid) in local_node:
                 gw = None
-                netlist = node.getElementsByTagName('network')
-                for dom_net in netlist:
-                    if local_type == get_attr(dom_net, 'type'):
-                        gw = get_text(dom_net, 'server')
+                netlist = node_db.lookup_class('network')
+                for db in netlist:
+                    if local_type == db.get_val('type'):
+                        gw = db.get_val('server')
                         break
                 if not gw:
                     continue
-                for dom_net in netlist:
-                    if local_type != get_attr(dom_net, 'type'):
-                        for route in get_routes(local_type, gw, dom_net):
+                for db in netlist:
+                    if local_type != db.get_val('type'):
+                        for route in db.get_routes(local_type, gw):
                             routes.append(route)
     
 
 def local_net(net):
     global local_node
     for iface in local_node:
+        #debug("local_net a:", net.net_type, "b:", iface[0])
         if net.net_type == iface[0]:
             return 1
     return 0
@@ -1565,40 +1835,37 @@ def find_route(net):
     return None
            
     
-        
 
 ############################################################
 # lconf level logic
 # Start a service.
-def startService(dom_node, module_flag):
-    type = getServiceType(dom_node)
-    debug('Service:', type, getName(dom_node), getUUID(dom_node))
+def startService(db, module_flag):
+    type = db.get_class()
+    debug('Service:', type, db.getName(), db.getUUID())
     # there must be a more dynamic way of doing this...
     n = None
     if type == 'ldlm':
-        n = LDLM(dom_node)
+        n = LDLM(db)
     elif type == 'lov':
-        n = LOV(dom_node)
-    elif type == 'lovconfig':
-        n = LOVConfig(dom_node)
+        n = LOV(db)
     elif type == 'network':
-        n = Network(dom_node)
+        n = Network(db)
     elif type == 'obd':
-        n = OBD(dom_node)
+        n = OBD(db)
     elif type == 'cobd':
-        n = COBD(dom_node)
+        n = COBD(db)
     elif type == 'ost':
-        n = OST(dom_node)
-    elif type == 'mds':
-        n = MDS(dom_node)
+        n = OST(db)
+    elif type == 'mdsdev':
+        n = MDSDEV(db)
     elif type == 'osc':
-        n = VOSC(dom_node)
+        n = VOSC(db)
     elif type == 'mdc':
-        n = MDC(dom_node)
+        n = MDC(db)
     elif type == 'mountpoint':
-        n = Mountpoint(dom_node)
-    elif type == 'echo_client':
-        n = ECHO_CLIENT(dom_node)
+        n = Mountpoint(db)
+    elif type == 'echoclient':
+        n = ECHO_CLIENT(db)
     else:
         panic ("unknown service type:", type)
 
@@ -1625,10 +1892,10 @@ def startService(dom_node, module_flag):
 #  * make sure partitions are in place and prepared
 #  * initialize devices with lctl
 # Levels is important, and needs to be enforced.
-def startProfile(lustreNode, profileNode, module_flag):
-    if not profileNode:
+def startProfile(prof_db, module_flag):
+    if not prof_db:
         panic("profile:", profile, "not found.")
-    services = getServices(lustreNode, profileNode)
+    services = prof_db.getServices()
     if config.cleanup():
         services.reverse()
     for s in services:
@@ -1637,35 +1904,33 @@ def startProfile(lustreNode, profileNode, module_flag):
 
 #
 # Load profile for 
-def doHost(lustreNode, hosts):
+def doHost(lustreDB, hosts):
     global routes
     global router_flag 
-    dom_node = None
+    node_db = None
     for h in hosts:
-        dom_node = getByName(lustreNode, h, 'node')
-        if dom_node:
+        node_db = lustreDB.lookup_name(h, 'node')
+        if node_db:
             break
-    if not dom_node:
+    if not node_db:
         print 'No host entry found.'
         return
 
-    if get_attr(dom_node, 'router'):
-        router_flag = 1
-    else:
-        router_flag = 0
-    recovery_upcall = get_attr(dom_node, 'recovery_upcall')
-    timeout = get_attr_int(dom_node, 'timeout')
+    router_flag = node_db.get_val_int('router', 0)
+    recovery_upcall = node_db.get_val('recovery_upcall', '')
+    timeout = node_db.get_val_int('timeout', 0)
 
     if not router_flag:
-        init_node(dom_node)
-        init_route_config(lustreNode)
+        init_node(node_db)
+        init_route_config(lustreDB)
 
     # Two step process: (1) load modules, (2) setup lustre
     # if not cleaning, load modules first.
     module_flag = not config.cleanup()
-    reflist = dom_node.getElementsByTagName('profile')
-    for profile in reflist:
-            startProfile(lustreNode,  profile, module_flag)
+    prof_list = node_db.get_refs('profile')
+    for prof_uuid in prof_list:
+        prof_db = node_db.lookup(prof_uuid)
+        startProfile(prof_db, module_flag)
 
     if not config.cleanup():
         sys_set_debug_path()
@@ -1678,10 +1943,10 @@ def doHost(lustreNode, hosts):
         sys_set_timeout(timeout)
         sys_set_recovery_upcall(recovery_upcall)
             
-            
     module_flag = not module_flag
-    for profile in reflist:
-            startProfile(lustreNode,  profile, module_flag)
+    for prof_uuid in prof_list:
+        prof_db = node_db.lookup(prof_uuid)
+        startProfile(prof_db, module_flag)
 
 ############################################################
 # Command line processing
@@ -1692,7 +1957,8 @@ def parse_cmdline(argv):
                  "portals=", "makeldiff", "cleanup", "noexec",
                  "help", "node=", "nomod", "nosetup",
                  "dump=", "force", "minlevel=", "maxlevel=",
-                 "timeout=", "recovery_upcall="]
+                 "timeout=", "recovery_upcall=",
+                 "ldapurl=", "config=", "select="]
     opts = []
     args = []
 
@@ -1730,14 +1996,21 @@ def parse_cmdline(argv):
             config.dump_file(a)
         if o in ("-f", "--force"):
             config.force(1)
-       if o in ("--minlevel",):
+       if o == "--minlevel":
                config.minlevel(a)
-        if o in ("--maxlevel",):
+        if o == "--maxlevel":
                 config.maxlevel(a)
-        if o in ("--timeout",):
+        if o == "--timeout":
                 config.timeout(a)
-        if o in ("--recovery_upcall",):
+        if o == "--recovery_upcall":
                 config.recovery_upcall(a)
+        if o == "--ldapurl":
+                config.ldapurl(a)
+        if o == "--config":
+                config.config_name(a)
+        if o == "--select":
+                config.init_select(a)
+
     return args
 
 def fetch(url):
@@ -1793,9 +2066,9 @@ def sys_set_recovery_upcall(upcall):
 
 def sys_set_timeout(timeout):
     # the command overrides the value in the node config
-    if config.timeout() >= 0:
+    if config.timeout() > 0:
         timeout = config.timeout()
-    if timeout >= 0:
+    if timeout > 0:
         debug("setting timeout:", timeout)
         sysctl('lustre/timeout', timeout)
 
@@ -1867,10 +2140,17 @@ def main():
         if not os.access(args[0], os.R_OK):
             print 'File not found or readable:', args[0]
             sys.exit(1)
-        dom = xml.dom.minidom.parse(args[0])
-    elif config.url():
-        xmldata = fetch(config.url())
-        dom = xml.dom.minidom.parseString(xmldata)
+        try:
+            dom = xml.dom.minidom.parse(args[0])
+        except Exception:
+            panic("%s does not appear to be a config file." % (args[0]))
+            sys.exit(1) # make sure to die here, even in debug mode.
+        db = LustreDB_XML(dom.documentElement, dom.documentElement)
+    elif config.ldapurl():
+        if not config.config_name():
+            panic("--ldapurl requires --config name")
+        dn = "config=%s,fs=lustre" % (config.config_name())
+        db = LustreDB_LDAP('', {}, base=dn, url = config.ldapurl())
     else:
         usage()
 
@@ -1902,7 +2182,8 @@ def main():
     sys_make_devices()
     sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
     sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
-    doHost(dom.documentElement, node_list)
+
+    doHost(db, node_list)
 
 if __name__ == "__main__":
     try:
index 2e6324c..2217058 100644 (file)
@@ -179,6 +179,7 @@ command_t cmdlist[] = {
          "usage: lov_set_osc_active <OSC UUID> <1|0 (active|inactive)>"},
         {"newconn", jt_obd_newconn, 0, "newconn <olduuid> [newuuid]"},
         {"failconn", jt_obd_failconn, 0, "failconn <uuid>"},
+        {"lookup", jt_obd_mdc_lookup, 0, "usage: lookup <directory> <file>"},
 
         /* Debug commands */
         {"======== debug =========", jt_noop, 0, "debug"},
index 26f6a3f..93777d6 100644 (file)
@@ -16,7 +16,7 @@
 #include <linux/lustre_lite.h>
 #include <linux/obd_lov.h>
 
-#warning Max obds per lov currently hardcoded to 1000 in lov/lov_obd.c
+/* XXX Max obds per lov currently hardcoded to 1000 in lov/lov_obd.c */
 #define MAX_LOV_UUID_COUNT     1000
 #define OBD_NOT_FOUND          ((__u32)-1)
 
@@ -128,7 +128,7 @@ init()
        else
                buflen = lmmlen;
 
-#warning max ioctl buffer size currently hardcoded to 8192
+       /* XXX max ioctl buffer size currently hardcoded to 8192 */
        if (buflen > 8192) {
                int nuuids, remaining, nluoinfos;
 
@@ -194,12 +194,12 @@ processFile(const char *path, const struct stat *sp, int flag, struct FTW *ftwp)
        if (flag != FTW_F)
                return 0;
 
-       if ((obdcount == 0) && (getobdindex(path) == OBD_NOT_FOUND)) {
+       if (getobdindex(path) == OBD_NOT_FOUND && obdcount == 0) {
                /* terminate nftw walking this tree */
                return(1);
        }
 
-       if ((fd = open(path, O_RDONLY)) < 0) {
+       if ((fd = open(path, O_RDONLY | O_LOV_DELAY_CREATE)) < 0) {
                errMsg("open \"%.20s\" failed.", path);
                perror("open");
                exit(1);
@@ -212,21 +212,24 @@ processFile(const char *path, const struct stat *sp, int flag, struct FTW *ftwp)
        if ((rc = ioctl(fd, LL_IOC_LOV_GETSTRIPE, (void *)lmm)) < 0) {
                errMsg("LL_IOC_LOV_GETSTRIPE ioctl failed.");
                perror("ioctl");
-               exit(1);
+               return 0;
        }
 
        close(fd);
 
-       if (query || verbose || lmm->lmm_objects[obdindex].l_object_id)
+       if (query || verbose ||
+           (obdindex != OBD_NOT_FOUND &&
+            lmm->lmm_objects[obdindex].l_object_id))
                printf("%s\n", path);
 
        if (verbose) {
                printf("lmm_magic:          0x%x\n", lmm->lmm_magic);
                printf("lmm_object_id:      "LPX64"\n", lmm->lmm_object_id);
-               printf("lmm_stripe_offset:  %d\n", lmm->lmm_stripe_offset);
-               printf("lmm_stripe_count:   %d\n", lmm->lmm_stripe_count);
-               printf("lmm_ost_count:      %d\n", lmm->lmm_ost_count);
-               printf("lmm_stripe_pattern: %d\n", lmm->lmm_stripe_pattern);
+               printf("lmm_stripe_offset:  %u\n", (int)lmm->lmm_stripe_offset);
+               printf("lmm_stripe_count:   %u\n", (int)lmm->lmm_stripe_count);
+               printf("lmm_stripe_size:    %u\n", (int)lmm->lmm_stripe_size);
+               printf("lmm_ost_count:      %u\n", lmm->lmm_ost_count);
+               printf("lmm_stripe_pattern: %d\n", lmm->lmm_magic & 0xf);
        }
 
        count = lmm->lmm_ost_count;
index 3ea5265..4d40a5b 100755 (executable)
@@ -53,6 +53,7 @@ Object creation command summary:
   --node node_name
   --mds mds_name
   --dev path
+  --fstype extN|ext3
   --size size
 
 --add lov
@@ -68,6 +69,7 @@ Object creation command summary:
   --lov lov_name 
   --dev path
   --size size
+  --fstype extN|ext3
   --obduuid uuid
   
 --add mtpt  - Mountpoint
@@ -179,13 +181,13 @@ class GenConfig:
     def network(self, name, uuid, hostname, net, port=0, tcpbuf=0):
         """create <network> node"""
         network = self.newService("network", name, uuid)
-        network.setAttribute("type", net);
-        self.addElement(network, "server", hostname)
+        network.setAttribute("nettype", net);
+        self.addElement(network, "nid", hostname)
         if port:
             self.addElement(network, "port", "%d" %(port))
         if tcpbuf:
-            self.addElement(network, "send_mem", "%d" %(tcpbuf))
-            self.addElement(network, "recv_mem", "%d" %(tcpbuf))
+            self.addElement(network, "sendmem", "%d" %(tcpbuf))
+            self.addElement(network, "recvmem", "%d" %(tcpbuf))
             
         return network
 
@@ -199,10 +201,15 @@ class GenConfig:
             ref.setAttribute("hi", hi)
         return ref
     
-    def node(self, name, uuid):
+    def profile(self, name, uuid):
+        """ create a profile """
+        profile = self.newService("profile", name, uuid)
+        return profile
+
+    def node(self, name, uuid, prof_uuid):
         """ create a host """
         node = self.newService("node", name, uuid)
-        self.addElement(node, 'profile')
+        node.appendChild(self.ref("profile", prof_uuid))
         return node
 
     def ldlm(self, name, uuid):
@@ -212,27 +219,21 @@ class GenConfig:
 
     def obd(self, name, uuid, fs, obdtype, devname, format, ost_uuid, dev_size=0):
         obd = self.newService("obd", name, uuid)
-        obd.setAttribute('type', obdtype)
-        self.addElement(obd, 'active_target', ost_uuid)
+        obd.setAttribute('obdtype', obdtype)
+        obd.appendChild(self.ref("active", ost_uuid))
         if fs:
             self.addElement(obd, "fstype", fs)
         if devname:
-            dev = self.addElement(obd, "device", devname)
-            if (dev_size):
-                dev.setAttribute("size", "%s" % (dev_size))
+            dev = self.addElement(obd, "devpath", devname)
             self.addElement(obd, "autoformat", format)
+            if dev_size:
+                self.addElement(obd, "devsize", "%s" % (dev_size))
         return obd
 
-#    def osc(self, name, uuid, obd_uuid, net_uuid):
-#        osc = self.newService("osc", name, uuid)
-#        osc.appendChild(self.ref("ost", net_uuid))
-#        osc.appendChild(self.ref("obd", obd_uuid))
-#        return osc
-
     def cobd(self, name, uuid, real_uuid, cache_uuid):
         cobd = self.newService("cobd", name, uuid)
-        cobd.appendChild(self.ref("real_obd",real_uuid))
-        cobd.appendChild(self.ref("cache_obd",cache_uuid))
+        cobd.appendChild(self.ref("realobd",real_uuid))
+        cobd.appendChild(self.ref("cacheobd",cache_uuid))
         return cobd
 
     def ost(self, name, uuid, obd_uuid, net_uuid):
@@ -244,10 +245,9 @@ class GenConfig:
     def lov(self, name, uuid, mds_uuid, stripe_sz, stripe_cnt, pattern):
         lov = self.newService("lov", name, uuid)
         lov.appendChild(self.ref("mds", mds_uuid))
-        devs = self.addElement(lov, "devices" )
-        devs.setAttribute("stripesize", stripe_sz)
-        devs.setAttribute("stripecount", stripe_cnt)
-        devs.setAttribute("pattern", pattern)
+        lov.setAttribute("stripesize", stripe_sz)
+        lov.setAttribute("stripecount", stripe_cnt)
+        lov.setAttribute("stripepattern", pattern)
         return lov
 
     def lovconfig(self, name, uuid, lov_uuid):
@@ -255,20 +255,23 @@ class GenConfig:
         lovconfig.appendChild(self.ref("lov", lov_uuid))
         return lovconfig
 
-    def mds(self, name, uuid, fs, devname, format, net_uuid, node_uuid,
-            failover_uuid = "", dev_size=0 ):
+    def mds(self, name, uuid, mdd_uuid):
         mds = self.newService("mds", name, uuid)
-        self.addElement(mds, "fstype", fs)
-        dev = self.addElement(mds, "device", devname)
-        if dev_size:
-            dev.setAttribute("size", "%s" % (dev_size))
-        self.addElement(mds, "autoformat", format)
-        mds.appendChild(self.ref("network", net_uuid))
-        mds.appendChild(self.ref("node", node_uuid))
-        if failover_uuid:
-            mds.appendChild(self.ref("failover", failover_uuid))
+        mds.appendChild(self.ref("active",mdd_uuid))
         return mds
 
+    def mdsdev(self, name, uuid, fs, devname, format, net_uuid, node_uuid,
+            mds_uuid, dev_size=0 ):
+        mdd = self.newService("mdsdev", name, uuid)
+        self.addElement(mdd, "fstype", fs)
+        dev = self.addElement(mdd, "devpath", devname)
+        self.addElement(mdd, "autoformat", format)
+        if dev_size:
+                self.addElement(mdd, "devsize", "%s" % (dev_size))
+        mdd.appendChild(self.ref("network", net_uuid))
+        mdd.appendChild(self.ref("mds", mds_uuid))
+        return mdd
+
     def mountpoint(self, name, uuid, mds_uuid, osc_uuid, path):
         mtpt = self.newService("mountpoint", name, uuid)
         mtpt.appendChild(self.ref("mds", mds_uuid))
@@ -277,7 +280,7 @@ class GenConfig:
         return mtpt
 
     def echo_client(self, name, uuid, osc_uuid):
-        ec = self.newService("echo_client", name, uuid)
+        ec = self.newService("echoclient", name, uuid)
         ec.appendChild(self.ref("obd", osc_uuid))
         return ec
 
@@ -314,19 +317,6 @@ def lookup(node, uuid):
                 n = lookup(n, uuid)
                 if n: return n
     return None
-            
-
-def mds2node(lustre, mds_name):
-    """ Find the node a MDS is configured on """
-    mds = findByName(lustre, mds_name, 'mds')
-    ref = mds.getElementsByTagName('node_ref')
-    if not ref:
-        error("mds2node:", "no node_ref found for", '"'+mds_name+'"')
-    node_uuid = ref[0].getAttribute('uuidref')
-    node = lookup(lustre, node_uuid)
-    if not node:
-        error('mds2node:', "no node found for :", '"'+mds_name+'"')
-    return node
 
 
 def name2uuid(lustre, name, tag="",  fatal=1):
@@ -353,18 +343,16 @@ def get_net_uuid(lustre, node_name):
 
 
 def lov_add_obd(gen, lov, osc_uuid):
-    devs = lov.getElementsByTagName('devices')
-    if len(devs) == 1:
-        devs[0].appendChild(gen.ref("obd", osc_uuid))
-    else:
-        error("No devices element found for LOV:", lov)
-
+    lov.appendChild(gen.ref("obd", osc_uuid))
                             
 def node_add_profile(gen, node, ref, uuid):
-    ret = node.getElementsByTagName('profile')
+    refname = "%s_ref" % "profile"
+    ret = node.getElementsByTagName(refname)
     if not ret:
-        error('node has no profile:', node)
-    ret[0].appendChild(gen.ref(ref, uuid))
+        error('node has no profile ref:', node)
+    prof_uuid = ret[0].getAttribute('uuidref')
+    profile = lookup(node.parentNode, prof_uuid)
+    profile.appendChild(gen.ref(ref, uuid))
     
 def get_attr(dom_node, attr, default=""):
     v = dom_node.getAttribute(attr)
@@ -377,7 +365,13 @@ def get_attr(dom_node, attr, default=""):
 #
 def do_add_node(gen, lustre,  options, node_name):
     uuid = new_uuid(node_name)
-    node = gen.node(node_name, uuid)
+    prof_name = new_name("PROFILE_" + node_name)
+    prof_uuid = new_uuid(prof_name)
+    profile = gen.profile(prof_name, prof_uuid)
+    node = gen.node(node_name, uuid, prof_uuid)
+    lustre.appendChild(node)
+    lustre.appendChild(profile)
+
     node_add_profile(gen, node, 'ldlm', ldlm_uuid)
     if has_option(options, 'router'):
         node.setAttribute('router', '1')
@@ -385,7 +379,6 @@ def do_add_node(gen, lustre,  options, node_name):
         node.setAttribute('timeout', get_option(options, 'timeout'))
     if has_option(options, 'recovery_upcall'):
         node.setAttribute('recovery_upcall', get_option(options, 'recovery_upcall'))
-    lustre.appendChild(node)
     return node
 
     
@@ -393,7 +386,6 @@ def add_node(gen, lustre, options):
     """ create a node with a network config """
 
     node_name = get_option(options, 'node')
-
     ret = findByName(lustre, node_name, "node")
     if ret:
         print "Node:", node_name, "exists."
@@ -444,37 +436,41 @@ def add_route(gen, lustre, options):
     
     netlist = node.getElementsByTagName('network')
     net = netlist[0]
-    rlist = net.getElementsByTagName('route_tbl')
+    rlist = net.getElementsByTagName('routetbl')
     if len(rlist) > 0:
         rtbl = rlist[0]
     else:
-        rtbl = gen.addElement(net, 'route_tbl')
+        rtbl = gen.addElement(net, 'routetbl')
     rtbl.appendChild(gen.route(net_type, gw, lo, hi))
 
 
 def add_mds(gen, lustre, options):
     node_name = get_option(options, 'node')
-    mds_orig = get_option(options, 'mds')
-    mds_name = new_name(mds_orig)
-    if mds_name != mds_orig:
-        warning("name:", mds_orig, "already used. using:", mds_name)
+    mds_name = get_option(options, 'mds')
+    mdd_name = new_name("MDD_" + mds_name +"_" + node_name)
+    mdd_uuid = new_uuid(mdd_name)
+
+    mds_uuid = name2uuid(lustre, mds_name, fatal=0)
+    if not mds_uuid:
+        mds_uuid = new_uuid(mds_name)
+        mds = gen.mds(mds_name, mds_uuid, mdd_uuid)
+        lustre.appendChild(mds)
+        
     devname = get_option(options, 'dev')
     size = get_option(options, 'size', 0)
     fstype = get_option(options, 'fstype', 'extN')
 
-    mds_uuid = new_uuid(mds_name)
-
     node_uuid = name2uuid(lustre, node_name, 'node')
 
     node = findByName(lustre, node_name, "node")
-    node_add_profile(gen, node, "mds", mds_uuid)
+    node_add_profile(gen, node, "mdsdev", mdd_uuid)
     net_uuid = get_net_uuid(lustre, node_name)
     if not net_uuid:
         error("NODE: ", node_name, "not found")
 
-    mds = gen.mds(mds_name, mds_uuid, fstype, devname, get_format_flag(options),
-                  net_uuid, node_uuid, dev_size=size)
-    lustre.appendChild(mds)
+    mdd = gen.mdsdev(mdd_name, mdd_uuid, fstype, devname, get_format_flag(options),
+                  net_uuid, node_uuid, mds_uuid, dev_size=size)
+    lustre.appendChild(mdd)
                    
 
 def add_ost(gen, lustre, options):
@@ -552,7 +548,7 @@ def add_echo_client(gen, lustre, options):
 
     echoname = new_name('ECHO_'+ node_name)
     echo_uuid = new_uuid(echoname)
-    node_add_profile(gen, node, 'echo_client', echo_uuid)
+    node_add_profile(gen, node, 'echoclient', echo_uuid)
 
     lov_uuid = name2uuid(lustre, lov_name, tag='lov', fatal=0)
     if not lov_uuid:
@@ -584,11 +580,11 @@ def add_lov(gen, lustre, options):
     lov = gen.lov(name, uuid, mds_uuid, stripe_sz, stripe_cnt, pattern)
     lustre.appendChild(lov)
     
-    # add an lovconfig entry to the mds profile
+    # add a lovconfig reference to the mds element
     lovconfig_name = new_name('LVCFG_' + name)
     lovconfig_uuid = new_uuid(lovconfig_name)
-    node = mds2node(lustre, mds_name)
-    node_add_profile(gen, node, "lovconfig", lovconfig_uuid)
+    mds = findByName(lustre, mds_name)
+    mds.appendChild(gen.ref("lovconfig", lovconfig_uuid))
     lovconfig = gen.lovconfig(lovconfig_name, lovconfig_uuid, uuid)
     lustre.appendChild(lovconfig)
 
@@ -882,5 +878,3 @@ def main():
 
 if __name__ == "__main__":
     main()
-
-    
index 65055a5..1aa9d91 100644 (file)
 
 void usage(char *pgm)
 {
-       fprintf(stderr, "\nIncorrect parameters!  Correct usage:\n\n" );
-       fprintf(stderr, "%s <output filename> <stripe size> <OST #> <stripe #>\n", pgm);
+       fprintf(stderr, "usage: %s <filename> <stripe size> <start stripe> <stripe count>\n", pgm);
 
-       fprintf(stderr, "\n\nArgument explanations:\n---------------------\n\n");
-       fprintf(stderr, "<output filename> = the full name and path of the output file to create\n");
-       fprintf(stderr, "<stripe size> = the number of bytes to have in each stripe.\n");
-       fprintf(stderr, "<OST #> = the OST number to start the striping on.\n");
-       fprintf(stderr, "<stripe #> = the number of stripes to use.\n");
-
-       fprintf(stderr, "\n\nExamples:\n---------\n\n");
-
-       fprintf(stderr, "%s /mnt/lustre/ost1 131072 0 1\n", pgm);
-       fprintf(stderr, "\t\tcreates a file only on ost1.\n\n");
-
-       fprintf(stderr, "%s /mnt/lustre/ost2 131072 1 1\n", pgm);
-       fprintf(stderr, "\t\tcreates a file only on ost2.\n\n");
-
-       fprintf(stderr, "%s /mnt/lustre/ost1and2 131072 0 2\n", pgm);
-       fprintf(stderr, "\t\tcreates a 128k file with 2 stripes, on ost1 and ost2.\n");
-
-       fprintf(stderr, "%s /mnt/lustre/ost1and2 131072 1 2\n", pgm);
-       fprintf(stderr, "\t\tcreates a 128k file with 2 stripes, on ost2 and ost1.\n");
+       fprintf(stderr, "\tstripe size: number of bytes in each stripe\n");
+       fprintf(stderr, "\tstripe start: OST index which holds first stripe\n");
+       fprintf(stderr, "\tstripe count: number of OSTs to stripe over\n");
 }
 
 int create_file(char *name, long stripe_size, int stripe_offset,
@@ -49,7 +32,6 @@ int create_file(char *name, long stripe_size, int stripe_offset,
 
        /*  Initialize IOCTL striping pattern structure  */
        a_striping.lmm_magic = LOV_MAGIC;
-       a_striping.lmm_stripe_pattern = 0;
        a_striping.lmm_stripe_size = stripe_size;
        a_striping.lmm_stripe_offset = stripe_offset;
        a_striping.lmm_stripe_count = stripe_count;
diff --git a/lustre/utils/lustre.dtd b/lustre/utils/lustre.dtd
deleted file mode 100644 (file)
index 2df183a..0000000
+++ /dev/null
@@ -1,110 +0,0 @@
-<!-- Lustre Management DTD -->\r
-\r
-<!-- basic entities -->\r
-<!ENTITY % tag.content "(#PCDATA)">\r
-<!ENTITY % tag.ref "\r
-  num CDATA #IMPLIED\r
-  name CDATA #IMPLIED\r
-  uuidref CDATA #REQUIRED">\r
-<!ENTITY % tag.attr "\r
-  name CDATA #REQUIRED\r
-  uuid CDATA #REQUIRED">\r
-\r
-<!-- main elements -->\r
-<!ELEMENT lustre (node | mountpoint | ldlm | echo_client |\r
-                  mds | mdc | obd | ost | osc | lov | lovconfig)*>\r
-\r
-<!ELEMENT node (network | profile)*>\r
-<!ATTLIST node router CDATA #IMPLIED\r
-               %tag.attr;>\r
-<!ELEMENT network (server | port | route_tbl | send_mem | recv_mem)*>\r
-<!ATTLIST network type (tcp | elan | gm) 'tcp'\r
-                  %tag.attr;>\r
-\r
-<!ELEMENT route_tbl (route)*>\r
-<!ELEMENT route %tag.content;>\r
-<!ATTLIST route type (elan | tcp | gm) #REQUIRED\r
-                gw CDATA #REQUIRED\r
-                lo CDATA #REQUIRED\r
-                hi CDATA #IMPLIED >\r
-\r
-<!ELEMENT profile (ldlm_ref | network_ref | obd_ref | ost_ref | osc_ref |\r
-                   echo_client_ref | mds_ref | mdc_ref | lov_ref |\r
-                   lovconfig_ref| mountpoint_ref)*>\r
-<!ATTLIST profile >\r
-\r
-<!ELEMENT mountpoint (path | fileset | mds_ref | osc_ref)*>\r
-<!ATTLIST mountpoint %tag.attr;>\r
-<!ELEMENT echo_client (osc_ref)*>\r
-<!ATTLIST echo_client %tag.attr;>\r
-<!ELEMENT ldlm EMPTY>\r
-<!ATTLIST ldlm %tag.attr;>\r
-\r
-<!ELEMENT obd (fstype | device | autoformat)*>\r
-<!ATTLIST obd %tag.attr; type (obdfilter | obdecho) 'obdfilter'>\r
-<!ELEMENT ost (network_ref | obd_ref | failover_ref)*>\r
-<!ATTLIST ost %tag.attr;>\r
-<!ELEMENT mds (network_ref | fstype | device | autoformat | \r
-               server_ref | failover_ref | node_ref )*>\r
-<!ATTLIST mds %tag.attr;>\r
-\r
-<!ELEMENT osc (ost_ref | obd_ref)*>\r
-<!ATTLIST osc %tag.attr;>\r
-<!ELEMENT mdc (network_ref | mds_ref)*>\r
-<!ATTLIST mdc %tag.attr;>\r
-<!ELEMENT lov (devices | mds_ref)*>\r
-<!ATTLIST lov %tag.attr;>\r
-<!ELEMENT lovconfig (lov_ref)>\r
-<!ATTLIST lovconfig %tag.attr;>\r
-<!ELEMENT devices (osc_ref)+>\r
-<!ATTLIST devices stripesize CDATA #REQUIRED\r
-                  stripecount CDATA #REQUIRED\r
-                  stripeoffset CDATA #IMPLIED\r
-                  pattern    CDATA #REQUIRED>\r
-\r
-<!-- basic elements -->\r
-\r
-<!ELEMENT fstype        %tag.content;>\r
-<!ELEMENT device        %tag.content;>\r
-<!ATTLIST device        size CDATA #IMPLIED>\r
-<!ELEMENT server        %tag.content;>\r
-<!ELEMENT port          %tag.content;>\r
-<!ELEMENT send_mem      %tag.content;>\r
-<!ELEMENT recv_mem      %tag.content;>\r
-<!ELEMENT autoformat    %tag.content;>\r
-<!ELEMENT path          %tag.content;>\r
-<!ELEMENT fileset       %tag.content;>\r
-\r
-<!-- id tag elements -->\r
-<!ELEMENT network_ref    %tag.content;>\r
-<!ATTLIST network_ref    %tag.ref;>\r
-<!ELEMENT node_ref       %tag.content;>\r
-<!ATTLIST node_ref       %tag.ref;>\r
-<!ELEMENT profile_ref    %tag.content;>\r
-<!ATTLIST profile_ref    %tag.ref;>\r
-<!ELEMENT obd_ref        %tag.content;>\r
-<!ATTLIST obd_ref        %tag.ref;>\r
-<!ELEMENT mds_ref        %tag.content;>\r
-<!ATTLIST mds_ref        %tag.ref;>\r
-<!ELEMENT osc_ref        %tag.content;>\r
-<!ATTLIST osc_ref        %tag.ref;>\r
-<!ELEMENT ost_ref        %tag.content;>\r
-<!ATTLIST ost_ref        %tag.ref;>\r
-<!ELEMENT lov_ref        %tag.content;>\r
-<!ATTLIST lov_ref        %tag.ref;>\r
-<!ELEMENT lovconfig_ref        %tag.content;>\r
-<!ATTLIST lovconfig_ref        %tag.ref;>\r
-<!ELEMENT mdc_ref        %tag.content;>\r
-<!ATTLIST mdc_ref        %tag.ref;>\r
-<!ELEMENT mountpoint_ref %tag.content;>\r
-<!ATTLIST mountpoint_ref %tag.ref;>\r
-<!ELEMENT echo_client_ref %tag.content;>\r
-<!ATTLIST echo_client_ref %tag.ref;>\r
-<!ELEMENT server_ref     %tag.content;>\r
-<!ATTLIST server_ref     %tag.ref;>\r
-<!ELEMENT failover_ref   %tag.content;>\r
-<!ATTLIST failover_ref   %tag.ref;>\r
-<!ELEMENT ldlm_ref   %tag.content;>\r
-<!ATTLIST ldlm_ref   %tag.ref;>\r
-\r
-\r
index ba22a9e..8c329ff 100644 (file)
@@ -1273,7 +1273,7 @@ int jt_obd_lov_setconfig(int argc, char **argv)
 
         if (strlen(argv[1]) > sizeof(desc.ld_uuid) - 1) {
                 fprintf(stderr,
-                        "error: %s: LOV uuid '%s' longer than %zd characters\n",
+                        "error: %s: LOV uuid '%s' longer than "LPSZ" characters\n",
                         cmdname(argv[0]), argv[1], sizeof(desc.ld_uuid) - 1);
                 return -EINVAL;
         }
@@ -1375,18 +1375,24 @@ int jt_obd_lov_getconfig(int argc, char **argv)
         struct obd_ioctl_data data;
         struct lov_desc desc;
         obd_uuid_t *uuidarray;
-        int rc;
+        char *path;
+        int rc, tmpfd;
 
+        /* FIXME: ugly. IOCINIT checks fd, so set a placeholder around it and restore. */
+        tmpfd = fd;
+        fd = 1;
         IOCINIT(data);
+        fd = tmpfd;        
 
         if (argc != 2)
                 return CMD_HELP;
 
-        if (strlen(argv[1]) > sizeof(desc.ld_uuid) - 1) {
-                fprintf(stderr,
-                        "error: %s: LOV uuid '%s' longer than %zd characters\n",
-                        cmdname(argv[0]), argv[1], sizeof(desc.ld_uuid) - 1);
-                return -EINVAL;
+        path = argv[1];
+        tmpfd = open(path, O_RDONLY);
+        if (tmpfd < 0) {
+                fprintf(stderr, "open \"%s\" failed: %s\n", path,
+                        strerror(errno));
+                return -1;
         }
 
         memset(&desc, 0, sizeof(desc));
@@ -1397,7 +1403,8 @@ repeat:
         if (!uuidarray) {
                 fprintf(stderr, "error: %s: no memory for %d uuid's\n",
                         cmdname(argv[0]), desc.ld_tgt_count);
-                return -ENOMEM;
+                rc = -ENOMEM;
+                goto out;
         }
 
         data.ioc_inllen1 = sizeof(desc);
@@ -1410,7 +1417,7 @@ repeat:
                 rc = -EINVAL;
                 goto out;
         }
-        rc = ioctl(fd, OBD_IOC_LOV_GET_CONFIG, buf);
+        rc = ioctl(tmpfd, OBD_IOC_LOV_GET_CONFIG, buf);
         if (rc == -ENOSPC) {
                 free(uuidarray);
                 goto repeat;
@@ -1440,6 +1447,7 @@ repeat:
         }
 out:
         free(uuidarray);
+        close(tmpfd);
         return rc;
 }
 
@@ -1596,6 +1604,55 @@ int jt_obd_failconn(int argc, char **argv)
         return rc;
 }
 
+int jt_obd_mdc_lookup(int argc, char **argv)
+{
+        struct obd_ioctl_data data;
+        char *parent, *child;
+        int rc, tmpfd, verbose = 1;
+
+        if (argc < 3 || argc > 4)
+                return CMD_HELP;
+
+        parent = argv[1];
+        child = argv[2];
+        if (argc == 4)
+                verbose = get_verbose(argv[0], argv[3]);
+
+        /* FIXME: ugly. IOCINIT checks fd, so set a placeholder around it and restore. */
+        tmpfd = fd;
+        fd = 1;
+        IOCINIT(data);
+        fd = tmpfd;        
+
+        data.ioc_inllen1 = strlen(child) + 1;
+        data.ioc_inlbuf1 = child;
+
+        IOC_PACK(argv[0], data);
+
+        tmpfd = open(parent, O_RDONLY);
+        if (tmpfd < 0) {
+                fprintf(stderr, "open \"%s\" failed: %s\n", parent,
+                        strerror(errno));
+                return -1;
+        }
+
+        rc = ioctl(tmpfd, IOC_MDC_LOOKUP, buf);
+        if (rc < 0) {
+                fprintf(stderr, "error: %s: ioctl error: %s\n",
+                        cmdname(argv[0]), strerror(rc = errno));
+        }
+        close(tmpfd);
+
+        if (verbose) {
+                IOC_UNPACK(argv[0], data);
+                printf("%s: mode %o uid %d gid %d\n", child,
+                       data.ioc_obdo1.o_mode, data.ioc_obdo1.o_uid,
+                       data.ioc_obdo1.o_gid);
+        }
+
+        return rc;
+}
+
 static void signal_server(int sig)
 {
         if (sig == SIGINT) {
index 01ece92..acc5c5f 100644 (file)
@@ -58,6 +58,7 @@ int jt_obd_dump_ldlm(int argc, char **argv);
 int jt_obd_lov_set_osc_active(int argc, char **argv);
 int jt_obd_newconn(int argc, char **argv);
 int jt_obd_failconn(int argc, char **argv);
+int jt_obd_mdc_lookup(int argc, char **argv);
 int jt_get_version(int argc, char **argv);
 
 #endif